Merge branch 'diffbot-testing' into testing

Conflicts:
	Parms.cpp
	XmlDoc.cpp
mwells
2014-06-19 21:51:44 -07:00
4 changed files with 167 additions and 38 deletions

@@ -220,6 +220,9 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
long dr = 1;
// do not dedup bulk jobs
if ( cr->m_isCustomCrawl == 2 ) dr = 0;
// do not dedup for regular crawls either; it is too confusing!
// people wonder where the missing results went!
dr = 0;
sb2.safePrintf("GET /search.csv?icc=1&format=csv&sc=0&"
// dedup. since stream=1 and pss=0 below
// this will dedup on page content hash only
@@ -254,12 +257,15 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
long dr = 1;
// do not dedup bulk jobs
if ( cr->m_isCustomCrawl == 2 ) dr = 0;
// do not dedup for regular crawls either; it is too confusing!
// people wonder where the missing results went!
dr = 0;
sb2.safePrintf("GET /search.csv?icc=1&format=json&sc=0&"
// dedup. since stream=1 and pss=0 below
// this will dedup on page content hash only
// which is super fast.
"dr=%li&"
"c=%s&n=1000000&"
"c=%s&n=1000000&"
// we can stream this because, unlike csv, it
// has no header row that needs to be
// computed from all results.
@@ -3245,8 +3251,8 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"<a href=/search?icc=1&format=json&"
// disable site clustering
"sc=0&"
// do dup content removal:
"dr=1&"
// do NOT do dup content removal:
"dr=0&"
"c=%s&n=10000000&rand=%llu&scores=0&id=1&"
"stream=1&" // stream results back as we get them
"q="

Parms.cpp

@@ -397,6 +397,44 @@ bool CommandDeleteColl2 ( char *rec , WaitEntry *we ) {
return true;
}
bool CommandForceNextSpiderRound ( char *rec ) {
// caller must specify collnum
collnum_t collnum = getCollnumFromParmRec ( rec );
// need this
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( ! cr ) {
g_errno = ENOCOLLREC;
log("parms: bad collnum %li for restart spider round",
(long)collnum);
return true;
}
// seems like parmlist is an rdblist, so we have a key_t followed
// by 4 bytes of datasize then the data... which is an ascii string
// in our case...
char *data = getDataFromParmRec ( rec );
unsigned long roundStartTime; // the %lu in the sscanf() below expects unsigned
long newRoundNum;
// see the HACK: in Parms::convertHttpRequestToParmList() where we
// construct this data in response to a "roundStart" cmd. we used
// sprintf() so it's natural to use sscanf() to parse it out.
sscanf ( data , "%lu,%li", &roundStartTime,&newRoundNum);
cr->m_spiderRoundStartTime = roundStartTime;
cr->m_spiderRoundNum = newRoundNum;
// reset the round counts. this will log a msg. resetting the
// round counts will prevent maxToProcess/maxToCrawl from holding
// us back...
spiderRoundIncremented ( cr );
// yeah, if we don't nuke doledb (which rebuildUrlFilters() does) then
// the restart does not take effect...
cr->rebuildUrlFilters();
return true;
}
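A rough sketch of the parm rec layout the comment above guesses at (a key, then a 4-byte datasize, then the payload); the type and accessor names below are hypothetical stand-ins, not the real getDataFromParmRec():

    #include <cstring>
    // hypothetical 12-byte key type standing in for Gigablast's key_t
    struct key_t96 { char k[12]; };
    // layout assumption: [ key ][ 4-byte size ][ size bytes of data ]
    static char *dataFromRec ( char *rec ) {
        return rec + sizeof(key_t96) + 4;
    }
    static long dataSizeFromRec ( char *rec ) {
        long size = 0;
        // copy only the 4-byte size field (assumes little-endian, as the code does)
        memcpy ( &size , rec + sizeof(key_t96) , 4 );
        return size;
    }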
// . returns true and sets g_errno on error
// . returns false if would block
bool CommandRestartColl ( char *rec , WaitEntry *we ) {
@@ -9429,6 +9467,67 @@ void Parms::init ( ) {
m->m_def = "unspecified";
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_title = "spider round start time";
m->m_desc = "When the next spider round starts. If you force this to "
"zero it sets it to the current time. That way you can "
"respider all the urls that were already spidered, and urls "
"that were not yet spidered in the round will still be "
"spidered.";
m->m_cgi = "spiderRoundStart";
m->m_size = 0;
m->m_off = (char *)&cr.m_spiderRoundStartTime - x;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_HIDDEN | PF_REBUILDURLFILTERS ;
m++;
// DIFFBOT:
// this http parm actually adds the "forceround" parm to the parmlist
// below with the appropriate args.
m->m_title = "manually restart a spider round";
m->m_desc = "Updates round number and resets local processed "
"and crawled counts to 0.";
m->m_cgi = "roundStart";
m->m_type = TYPE_CMD;
m->m_func = NULL;
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_HIDDEN;
m++;
// DIFFBOT:
// . this is sent to each shard by issuing a "roundStart=" cmd
// . similar to the "addcoll" cmd, we add args to it, make it
// the "forceround" cmd parm, and add THAT to the parmlist.
// so "roundStart=1" is really an alias for "forceround".
m->m_title = "manually restart a spider round on shard";
m->m_desc = "Updates round number and resets local processed "
"and crawled counts to 0.";
m->m_cgi = "forceround";
//m->m_off = (char *)&cr.m_spiderRoundStartTime - x;
m->m_type = TYPE_CMD;
m->m_func = CommandForceNextSpiderRound;
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_HIDDEN | PF_REBUILDURLFILTERS ;
m++;
m->m_title = "spider round num";
m->m_desc = "The spider round number.";
m->m_cgi = "spiderRoundNum";
m->m_off = (char *)&cr.m_spiderRoundNum - x;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_HIDDEN ;
m++;
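These entries rely on Parms' offset-based table: m_off records where the member lives inside CollectionRec so generic code can read or write it by type. A minimal sketch of that mechanism, with hypothetical names rather than the real Parms machinery:

    // hypothetical: poke a TYPE_LONG parm into a rec via its recorded offset
    struct Rec { long m_spiderRoundNum; };
    static void setLongParm ( Rec *rec , long off , long value ) {
        // off was computed like: (char *)&cr.m_spiderRoundNum - (char *)&cr
        *(long *)((char *)rec + off) = value;
    }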
m->m_title = "send email alerts to sysadmin";
@@ -15204,30 +15303,6 @@ void Parms::init ( ) {
m->m_group = 0;
m++;*/
m->m_title = "spider round start time";
m->m_desc = "When the spider round started";
m->m_cgi = "roundStart";
m->m_off = (char *)&cr.m_spiderRoundStartTime - x;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_REBUILDURLFILTERS ;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "spider round num";
m->m_desc = "The spider round number.";
m->m_cgi = "spiderRoundNum";
m->m_off = (char *)&cr.m_spiderRoundNum - x;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN ;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "scraping enabled procog";
m->m_desc = "Do searches for queries in this hosts part of the "
"query log.";
@@ -18937,6 +19012,38 @@ bool Parms::convertHttpRequestToParmList (HttpRequest *hr, SafeBuf *parmList,
}
}
// . DIFFBOT HACK: so people can manually restart a spider round
// . val can be 0 or 1 or anything. i.e. roundStart=0 works.
// . map this parm to another parm with the round start
// time (current time) and the new round # as the args.
// . this will call CommandForceNextSpiderRound() function
// on every shard with these args, "tmpVal".
if ( strcmp(m->m_cgi,"roundStart") == 0 ) {
// use the current time so anything spidered before
// this time (the round start time) will be respidered
//sprintf(tmp,"%lu",getTimeGlobalNoCore());
//val = tmp;
char tmpVal[64];
// use the same round start time for all shards
sprintf(tmpVal,
"%lu,%li"
,getTimeGlobalNoCore()
,cr->m_spiderRoundNum+1
);
// . also add command to reset crawl/process counts
// so if you hit maxToProcess/maxToCrawl it will
// not stop the round from restarting
// . CommandResetCrawlCounts()
if ( ! addNewParmToList1 ( parmList ,
parmCollnum ,
tmpVal, // a string
0 , // occNum (for arrays)
"forceround" ) )
return false;
// don't bother going below
continue;
}
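The "%lu,%li" payload built here has to stay in sync with the sscanf() in CommandForceNextSpiderRound() above, since that is its only consumer. A self-contained sketch of the round trip (the two values are hypothetical):

    #include <cstdio>
    int main () {
        char tmpVal[64];
        unsigned long startTime = 1403238704UL; // stands in for getTimeGlobalNoCore()
        long nextRound = 5;                     // stands in for cr->m_spiderRoundNum+1
        sprintf ( tmpVal , "%lu,%li" , startTime , nextRound );
        // each shard parses the same string back out:
        unsigned long roundStartTime = 0; long newRoundNum = 0;
        sscanf ( tmpVal , "%lu,%li" , &roundStartTime , &newRoundNum );
        printf ( "%lu %li\n" , roundStartTime , newRoundNum );
        return 0;
    }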
// if a collection name was also provided, assume that is
// the target of the reset/delete/restart. we still
// need PageAddDelete.cpp to work...
@@ -19023,16 +19130,18 @@ bool Parms::convertHttpRequestToParmList (HttpRequest *hr, SafeBuf *parmList,
if ( m->m_obj == OBJ_NONE ) continue;
if ( m->m_obj == OBJ_SI ) continue;
// convert spiderRoundStartTime=0 to
// spiderRoundStartTime=<currenttime>+30secs
// convert spiderRoundStartTime=0 (roundStart=0 roundStart=1)
// to spiderRoundStartTime=<currenttime>+30secs
// so that will force the next spider round to kick in
/*
bool restartRound = false;
char tmp[24];
if ( strcmp(field,"roundStart")==0 &&
val && (val[0]=='0'||val[0]=='1') && val[1]==0 ) {
val && (val[0]=='0'||val[0]=='1') && val[1]==0 )
sprintf(tmp,"%lu",(long)getTimeGlobalNoCore()+0);
val = tmp;
}
*/
// add it to a list now
if ( ! addNewParmToList2 ( parmList ,

@@ -5377,7 +5377,7 @@ void doneSendingNotification ( void *state ) {
// waiting tree will usually be empty for this coll since no
// spider requests had a valid spider priority, so let's rebuild!
// this is not necessary because PF_REBUILD is set for the
// "roundStart" parm in Parms.cpp so it will rebuild if that parm
// "spiderRoundStart" parm in Parms.cpp so it will rebuild if that parm
// changes already.
//if ( cr->m_spiderColl )
// cr->m_spiderColl->m_waitingTreeNeedsRebuild = true;
@@ -5389,10 +5389,10 @@ void doneSendingNotification ( void *state ) {
g_parms.addNewParmToList1 ( &parmList,cr->m_collnum,roundStr,-1 ,
"spiderRoundNum");
g_parms.addNewParmToList1 ( &parmList,cr->m_collnum,roundTime, -1 ,
"roundStart");
"spiderRoundStart");
//g_parms.addParmToList1 ( &parmList , cr , "spiderRoundNum" );
//g_parms.addParmToList1 ( &parmList , cr , "roundStart" );
//g_parms.addParmToList1 ( &parmList , cr , "spiderRoundStart" );
// this uses msg4 so parm ordering is guaranteed
g_parms.broadcastParmList ( &parmList , NULL , NULL );

@@ -1299,6 +1299,12 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
m_conceptWeightValid = true;
*/
// fix some corruption i've seen
if ( m_sreq.m_urlIsDocId && ! is_digit(m_sreq.m_url[0]) ) {
log("xmldoc: fixing sreq %s to non docid",m_sreq.m_url);
m_sreq.m_urlIsDocId = 0;
}
// if url is a docid... we are from pagereindex.cpp
//if ( sreq->m_isPageReindex ) {
// now we can have url-based page reindex requests because
@@ -1306,8 +1312,8 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
// we add a spider request of the PARENT url for it as page reindex
//if ( is_digit ( sreq->m_url[0] ) ) {
// watch out for 0.r.msn.com!!
if ( sreq->m_urlIsDocId ) {
m_docId = atoll(sreq->m_url);
if ( m_sreq.m_urlIsDocId ) {
m_docId = atoll(m_sreq.m_url);
// assume its good
m_docIdValid = true;
// similar to set3() above
@@ -1321,7 +1327,7 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
// add www is now REQUIRED for all!
// crap, injection of tmblr.co/ZHw5yo1E5TAaW fails because
// www.tmblr.co has no IP
setFirstUrl ( sreq->m_url , false );//true ); // false );
setFirstUrl ( m_sreq.m_url , false );//true ); // false );
// you can't call this from a docid based url until you
// know the uh48
//setSpideredTime();
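These set4() hunks lean on the convention that a docid-based "url" is just the decimal docid in m_url, which is why a leading non-digit means the flag is corrupt. A standalone sketch of that disambiguation (helper names are hypothetical):

    #include <cstdlib>
    #include <cctype>
    // a docid-based request stores only digits in its url buffer
    static bool looksLikeDocId ( const char *url ) {
        return isdigit ( (unsigned char)url[0] );
    }
    static long long docIdFromUrl ( const char *url ) {
        return atoll ( url ); // same parse the code above uses
    }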
@@ -13754,6 +13760,12 @@ SafeBuf *XmlDoc::getTokenizedDiffbotReply ( ) {
bool inQuotes = false;
// scan now
for ( ; *x ; x++ ) {
// escaping a backslash?
if ( *x == '\\' && x[1] == '\\' ) {
// skip two bytes then..
x++;
continue;
}
// escaping a quote? ignore quote then.
if ( *x == '\\' && x[1] == '\"' ) {
// skip two bytes then..
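Ordering matters in this loop: the new backslash-backslash case must be checked before the backslash-quote case, otherwise a JSON sequence like \\" (an escaped backslash followed by a real, string-closing quote) would wrongly be read as an escaped quote. A simplified standalone version of the corrected scan (json is a hypothetical input pointer):

    bool inQuotes = false;
    for ( char *x = json ; *x ; x++ ) {
        // escaped backslash: consume both bytes first
        if ( *x == '\\' && x[1] == '\\' ) { x++; continue; }
        // escaped quote: does not open or close a string
        if ( *x == '\\' && x[1] == '\"' ) { x++; continue; }
        // a bare quote toggles string state
        if ( *x == '\"' ) inQuotes = ! inQuotes;
    }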
@@ -16125,7 +16137,7 @@ void XmlDoc::filterStart_r ( bool amThread ) {
snprintf(cmd,2047 ,"ulimit -v 25000 ; ulimit -t 30 ; nice -n 19 %s/pdftohtml -q -i -noframes -stdout %s > %s", wdir , in ,out );
else if ( ctype == CT_DOC )
// "wdir" include trailing '/'? not sure
snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30 ; ANTIWORDHOME=%s/antiword-dir ; nice -n 19 %s/antiword %s> %s" , wdir , wdir , in , out );
snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30 ; export ANTIWORDHOME=%s/antiword-dir ; nice -n 19 %s/antiword %s> %s" , wdir , wdir , in , out );
else if ( ctype == CT_XLS )
snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30 ; timeout 10s nice -n 19 %s/xlhtml %s > %s" , wdir , in , out );
// this is too buggy for now... causes hanging threads because it
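The antiword fix works because with "VAR=val ; cmd" the variable is only a shell variable, while "export VAR=val ; cmd" places it in the environment that cmd inherits. A quick way to see the difference (illustrative only):

    #include <cstdlib>
    int main () {
        system ( "FOO=1 ; env | grep FOO" );        // prints nothing
        system ( "export FOO=1 ; env | grep FOO" ); // prints FOO=1
        return 0;
    }

("VAR=val cmd" on one line, with no semicolon, would also work and avoids mutating the shell's own state.)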
@@ -20356,6 +20368,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
ksr.m_avoidSpiderLinks = 1;
// avoid EDOCUNCHANGED
ksr.m_ignoreDocUnchangedError = 1;
// no longer docid based; we set it to parentUrl below
ksr.m_urlIsDocId = 0;
// but it is not docid based, so overwrite the docid
// in ksr.m_url with the parent multidoc url. strcpy() \0-terminates it.
strcpy(ksr.m_url , parentUrl );//, MAX_URL_LEN-1);
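Note that strcpy() trusts parentUrl to fit in ksr.m_url; the commented-out MAX_URL_LEN-1 suggests a bounded copy was considered. A hedged sketch of the bounded form, assuming m_url really is a MAX_URL_LEN-sized buffer:

    // assumption: ksr.m_url is a char[MAX_URL_LEN] buffer
    strncpy ( ksr.m_url , parentUrl , MAX_URL_LEN - 1 );
    ksr.m_url [ MAX_URL_LEN - 1 ] = '\0'; // strncpy may not terminate on its own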