Injection script fixes.

Temporary fix for a core dump when injecting a large WARC file.
This commit is contained in:
Zak Betz
2015-07-08 14:03:39 -06:00
parent a7ae510e31
commit 6e21bc7d7c
4 changed files with 28 additions and 19 deletions

@ -776,4 +776,4 @@ install-pkgs-local:
warcinjector:
-rm -r /home/zak/.pex/build/inject-*
-rm -r /home/zak/.pex/install/inject-*
pex -r requests -r sqlite3 -r pyopenssl -r ndg-httpsclient -r pyasn1 -r multiprocessing -e inject.inject:main -o script/warc-inject -s '/home/zak/repos/open-source-search-engine/script/' --no-wheel
pex -r requests -r pyopenssl -r ndg-httpsclient -r pyasn1 -r multiprocessing -e inject.inject:main -o script/warc-inject -s '/home/zak/repos/open-source-search-engine/script/' --no-wheel

@ -3482,6 +3482,11 @@ bool XmlDoc::indexWarcOrArc ( char ctype ) {
if ( ctype == CT_WARC ) {
// find "WARC/1.0" or whatever
char *whp = m_fptr;
if( ! whp ) {
// FIXME: shouldn't get here with a NULL
log("build: No buffer for file=%s", file->getFilename());
goto warcDone;
}
// we do terminate last warc rec with \0 so be aware of that...
int32_t maxCount = 10;
for ( ; *whp && strncmp(whp,"WARC/",5) && --maxCount>0; whp++);

@ -9,6 +9,7 @@ import subprocess
import multiprocessing
import sqlite3
import datetime
import sys
#Generate environment with:
#pex -r requests -r multiprocessing -e inject:main -o warc-inject -s '.' --no-wheel
@ -44,7 +45,7 @@ def injectItem(item, c):
'metadata':json.dumps(itemMetadata),
'c':'ait'}
print "sending", postVars,' to gb'
if False:
if True:
rp = requests.post("http://localhost:8000/admin/inject", postVars)
statusCode = rp.status_code
print postVars['url'], rp.status_code
@ -77,28 +78,31 @@ def getPage(page):
def main():
getPage(4)
# from multiprocessing.pool import ThreadPool
# pool = ThreadPool(processes=5)
# print pool.map(getPage, xrange(1,1200))
print 'arguments were', sys.argv
if len(sys.argv) == 2:
if sys.argv[1] == 'init':
init()
print 'initialized'
return sys.exit(0)
if sys.argv[1] == 'reset':
import os
os.unlink('items.db')
init()
return sys.exit(0)
else:
#getPage(4)
from multiprocessing.pool import ThreadPool
pool = ThreadPool(processes=10)
print pool.map(getPage, xrange(1,1300))
def init():
db = sqlite3.connect('items.db', detect_types=sqlite3.PARSE_DECLTYPES|sqlite3.PARSE_COLNAMES)
c = db.cursor()
db.execute('''CREATE TABLE items
c.execute('''CREATE TABLE items
(item text, file text, updated timestamp, status integer)''')
db.commit()
db.close()
if __name__ == '__main__':
import sys
if len(sys.argv) == 2:
if sys.argv[1] == 'init':
init()
if sys.argv[1] == 'reset':
import os
os.unlink('items.db')
init()
else:
main()
main()

Binary file not shown.