forked from Mirrors/privacore-open-source-search-engine
Merge branch 'diffbot-testing' into testing
Conflicts:
	Parms.cpp
	XmlDoc.cpp
@@ -220,6 +220,9 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
 	long dr = 1;
 	// do not dedup bulk jobs
 	if ( cr->m_isCustomCrawl == 2 ) dr = 0;
+	// do not dedup for crawls either it is too confusing!!!!
+	// ppl wonder where the results are!
+	dr = 0;
 	sb2.safePrintf("GET /search.csv?icc=1&format=csv&sc=0&"
 		       // dedup. since stream=1 and pss=0 below
 		       // this will dedup on page content hash only
@@ -254,12 +257,15 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
 	long dr = 1;
 	// do not dedup bulk jobs
 	if ( cr->m_isCustomCrawl == 2 ) dr = 0;
+	// do not dedup for crawls either it is too confusing!!!!
+	// ppl wonder where the results are!
+	dr = 0;
 	sb2.safePrintf("GET /search.csv?icc=1&format=json&sc=0&"
 		       // dedup. since stream=1 and pss=0 below
 		       // this will dedup on page content hash only
 		       // which is super fast.
 		       "dr=%li&"
 		       "c=%s&n=1000000&"
 		       // we can stream this because unlink csv it
 		       // has no header row that needs to be
 		       // computed from all results.
@@ -3245,8 +3251,8 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
 		      "<a href=/search?icc=1&format=json&"
 		      // disable site clustering
 		      "sc=0&"
-		      // dodupcontentremoval:
-		      "dr=1&"
+		      // doNOTdupcontentremoval:
+		      "dr=0&"
 		      "c=%s&n=10000000&rand=%llu&scores=0&id=1&"
 		      "stream=1&" // stream results back as we get them
 		      "q="
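
These three hunks force the dedup flag off (dr = 0) so crawl and bulk dump results are never deduped away. A rough sketch of the request line the dump code ends up assembling, with plain snprintf standing in for SafeBuf::safePrintf() and "mycoll" as a hypothetical collection name; only the dr=0 part is what this patch changes:

    #include <cstdio>

    int main ( ) {
    	long dr = 0; // forced off for crawls and bulk jobs by this patch
    	char req[256];
    	snprintf ( req , sizeof(req) ,
    	           "GET /search.csv?icc=1&format=csv&sc=0&dr=%li&c=%s&n=1000000&" ,
    	           dr , "mycoll" );
    	printf ( "%s\n" , req ); // ...&dr=0&c=mycoll&...
    	return 0;
    }
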
165	Parms.cpp
@@ -397,6 +397,44 @@ bool CommandDeleteColl2 ( char *rec , WaitEntry *we ) {
 	return true;
 }
 
+bool CommandForceNextSpiderRound ( char *rec ) {
+
+	// caller must specify collnum
+	collnum_t collnum = getCollnumFromParmRec ( rec );
+	// need this
+	CollectionRec *cr = g_collectiondb.getRec ( collnum );
+	if ( ! cr ) {
+		g_errno = ENOCOLLREC;
+		log("parms: bad collnum %li for restart spider round",
+		    (long)collnum);
+		return true;
+	}
+
+	// seems like parmlist is an rdblist, so we have a key_t followed
+	// by 4 bytes of datasize then the data... which is an ascii string
+	// in our case...
+	char *data = getDataFromParmRec ( rec );
+	long roundStartTime;
+	long newRoundNum;
+	// see the HACK: in Parms::convertHttpRequestToParmList() where we
+	// construct this data in response to a "roundStart" cmd. we used
+	// sprintf() so it's natural to use sscanf() to parse it out.
+	sscanf ( data , "%lu,%li", &roundStartTime,&newRoundNum);
+
+	cr->m_spiderRoundStartTime = roundStartTime;
+	cr->m_spiderRoundNum = newRoundNum;
+
+	// reset the round counts. this will log a msg. resetting the
+	// round counts will prevent maxToProcess/maxToCrawl from holding
+	// us back...
+	spiderRoundIncremented ( cr );
+
+	// yeah, if we don't nuke doledb then it doesn't work...
+	cr->rebuildUrlFilters();
+
+	return true;
+}
+
 // . returns true and sets g_errno on error
 // . returns false if would block
 bool CommandRestartColl ( char *rec , WaitEntry *we ) {
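
The comment in the new function describes the parm-rec layout: a fixed-size rdb key, then 4 bytes of datasize, then the ascii payload. A minimal sketch of that layout; the 12-byte key width and the helper name are assumptions for illustration, not taken from Parms.cpp:

    #include <cstdio>
    #include <cstring>

    enum { KEY_SIZE = 12 }; // assumed rdb key width

    static char *getData ( char *rec ) {
    	// skip the key and the 4-byte datasize to reach the payload
    	return rec + KEY_SIZE + 4;
    }

    int main ( ) {
    	char rec[64] = {0};
    	const char *payload = "1371234567,4"; // "<roundStartTime>,<roundNum>"
    	int size = (int)strlen(payload) + 1;
    	memcpy ( rec + KEY_SIZE , &size , 4 );   // datasize
    	strcpy ( rec + KEY_SIZE + 4 , payload ); // data
    	printf ( "data = %s\n" , getData(rec) ); // 1371234567,4
    	return 0;
    }
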
@@ -9429,6 +9467,67 @@ void Parms::init ( ) {
 	m->m_def = "unspecified";
 	m->m_page = PAGE_MASTER;
 	m->m_obj = OBJ_CONF;
 
+	m->m_title = "spider round start time";
+	m->m_desc = "When the next spider round starts. If you force this to "
+		"zero it sets it to the current time. That way you can "
+		"respider all the urls that were already spidered, and urls "
+		"that were not yet spidered in the round will still be "
+		"spidered.";
+	m->m_cgi = "spiderRoundStart";
+	m->m_size = 0;
+	m->m_off = (char *)&cr.m_spiderRoundStartTime - x;
+	m->m_type = TYPE_LONG;
+	m->m_def = "0";
+	m->m_group = 0;
+	m->m_page = PAGE_SPIDER;
+	m->m_obj = OBJ_COLL;
+	m->m_flags = PF_HIDDEN | PF_REBUILDURLFILTERS ;
+	m++;
+
+	// DIFFBOT:
+	// this http parm actually ads the "forceround" parm to the parmlist
+	// below with the appropriate args.
+	m->m_title = "manually restart a spider round";
+	m->m_desc = "Updates round number and resets local processed "
+		"and crawled counts to 0.";
+	m->m_cgi = "roundStart";
+	m->m_type = TYPE_CMD;
+	m->m_func = NULL;
+	m->m_group = 0;
+	m->m_page = PAGE_SPIDER;
+	m->m_obj = OBJ_COLL;
+	m->m_flags = PF_HIDDEN;
+	m++;
+
+	// DIFFBOT:
+	// . this is sent to each shard by issuing a "restartRound=" cmd
+	// . similar to the "addcoll" cmd we add args to it and make it
+	// the "forceround" cmd parm and add THAT to the parmlist.
+	// so "roundStart=1" is really an alias for us.
+	m->m_title = "manually restart a spider round on shard";
+	m->m_desc = "Updates round number and resets local processed "
+		"and crawled counts to 0.";
+	m->m_cgi = "forceround";
+	//m->m_off = (char *)&cr.m_spiderRoundStartTime - x;
+	m->m_type = TYPE_CMD;
+	m->m_func = CommandForceNextSpiderRound;
+	m->m_group = 0;
+	m->m_page = PAGE_SPIDER;
+	m->m_obj = OBJ_COLL;
+	m->m_flags = PF_HIDDEN | PF_REBUILDURLFILTERS ;
+	m++;
+
+	m->m_title = "spider round num";
+	m->m_desc = "The spider round number.";
+	m->m_cgi = "spiderRoundNum";
+	m->m_off = (char *)&cr.m_spiderRoundNum - x;
+	m->m_type = TYPE_LONG;
+	m->m_def = "0";
+	m->m_group = 0;
+	m->m_page = PAGE_SPIDER;
+	m->m_obj = OBJ_COLL;
+	m->m_flags = PF_HIDDEN ;
+	m++;
 
 	m->m_title = "send email alerts to sysadmin";
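
These parm entries use the table's m_off idiom: `(char *)&cr.m_spiderRoundStartTime - x` records the member's byte offset inside CollectionRec so generic code can later read or write that member on any collection instance. A minimal sketch of the pattern, with a two-member struct as a stand-in for CollectionRec:

    #include <cstdio>

    struct Rec { long m_spiderRoundStartTime; long m_spiderRoundNum; };

    int main ( ) {
    	Rec cr;
    	char *x = (char *)&cr;
    	long off = (char *)&cr.m_spiderRoundNum - x; // what m->m_off stores

    	Rec target = { 0 , 0 };
    	*(long *)((char *)&target + off) = 7;        // generic write via offset
    	printf ( "round num = %li\n" , target.m_spiderRoundNum ); // 7
    	return 0;
    }
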
@@ -15204,30 +15303,6 @@ void Parms::init ( ) {
 	m->m_group = 0;
 	m++;*/
 
-	m->m_title = "spider round start time";
-	m->m_desc = "When the spider round started";
-	m->m_cgi = "roundStart";
-	m->m_off = (char *)&cr.m_spiderRoundStartTime - x;
-	m->m_type = TYPE_LONG;
-	m->m_def = "0";
-	m->m_group = 0;
-	m->m_flags = PF_HIDDEN | PF_REBUILDURLFILTERS ;
-	m->m_page = PAGE_SPIDER;
-	m->m_obj = OBJ_COLL;
-	m++;
-
-	m->m_title = "spider round num";
-	m->m_desc = "The spider round number.";
-	m->m_cgi = "spiderRoundNum";
-	m->m_off = (char *)&cr.m_spiderRoundNum - x;
-	m->m_type = TYPE_LONG;
-	m->m_def = "0";
-	m->m_group = 0;
-	m->m_flags = PF_HIDDEN ;
-	m->m_page = PAGE_SPIDER;
-	m->m_obj = OBJ_COLL;
-	m++;
-
 	m->m_title = "scraping enabled procog";
 	m->m_desc = "Do searches for queries in this hosts part of the "
 		"query log.";
@@ -18937,6 +19012,38 @@ bool Parms::convertHttpRequestToParmList (HttpRequest *hr, SafeBuf *parmList,
 		}
 	}
 
+	// . DIFFBOT HACK: so ppl can manually restart a spider round
+	// . val can be 0 or 1 or anything. i.e. roundStart=0 works.
+	// . map this parm to another parm with the round start
+	// time (current time) and the new round # as the args.
+	// . this will call CommandForceNextSpiderRound() function
+	// on every shard with these args, "tmpVal".
+	if ( strcmp(m->m_cgi,"roundStart") == 0 ) {
+		// use the current time so anything spidered before
+		// this time (the round start time) will be respidered
+		//sprintf(tmp,"%lu",getTimeGlobalNoCore());
+		//val = tmp;
+		char tmpVal[64];
+		// use the same round start time for all shards
+		sprintf(tmpVal,
+			"%lu,%li"
+			,getTimeGlobalNoCore()
+			,cr->m_spiderRoundNum+1
+			);
+		// . also add command to reset crawl/process counts
+		// so if you hit maxToProcess/maxToCrawl it will
+		// not stop the round from restarting
+		// . CommandResetCrawlCounts()
+		if ( ! addNewParmToList1 ( parmList ,
+					   parmCollnum ,
+					   tmpVal, // a string
+					   0 , // occNum (for arrays)
+					   "forceround" ) )
+			return false;
+		// don't bother going below
+		continue;
+	}
+
 	// if a collection name was also provided, assume that is
 	// the target of the reset/delete/restart. we still
 	// need PageAddDelete.cpp to work...
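
Both halves of the HACK are now visible: this hunk sprintf()s the round start time and next round number into the "forceround" parm data, and CommandForceNextSpiderRound() above sscanf()s them back out on each shard. A standalone round-trip of that "%lu,%li" encoding, with stand-in values in place of getTimeGlobalNoCore() and cr->m_spiderRoundNum + 1:

    #include <cstdio>

    int main ( ) {
    	char tmpVal[64];
    	unsigned long now = 1371234567UL; // stand-in for getTimeGlobalNoCore()
    	long nextRound = 4;               // stand-in for cr->m_spiderRoundNum + 1
    	sprintf ( tmpVal , "%lu,%li" , now , nextRound );

    	// ...tmpVal is broadcast to every shard as the "forceround" parm data...

    	unsigned long roundStartTime; long newRoundNum;
    	sscanf ( tmpVal , "%lu,%li" , &roundStartTime , &newRoundNum );
    	printf ( "start=%lu round=%li\n" , roundStartTime , newRoundNum );
    	return 0;
    }
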
@@ -19023,16 +19130,18 @@ bool Parms::convertHttpRequestToParmList (HttpRequest *hr, SafeBuf *parmList,
 	if ( m->m_obj == OBJ_NONE ) continue;
 	if ( m->m_obj == OBJ_SI ) continue;
 
-	// convert spiderRoundStartTime=0 to
-	// spiderRoundStartTime=<currenttime>+30secs
+	// convert spiderRoundStartTime=0 (roundStart=0 roundStart=1)
+	// to spiderRoundStartTime=<currenttime>+30secs
 	// so that will force the next spider round to kick in
+	/*
 	bool restartRound = false;
 	char tmp[24];
 	if ( strcmp(field,"roundStart")==0 &&
-	     val && (val[0]=='0'||val[0]=='1') && val[1]==0 ) {
+	     val && (val[0]=='0'||val[0]=='1') && val[1]==0 )
 	sprintf(tmp,"%lu",(long)getTimeGlobalNoCore()+0);
 	val = tmp;
 	}
-
+	*/
+
 	// add it to a list now
 	if ( ! addNewParmToList2 ( parmList ,
@@ -5377,7 +5377,7 @@ void doneSendingNotification ( void *state ) {
 	// waiting tree will usually be empty for this coll since no
 	// spider requests had a valid spider priority, so let's rebuild!
 	// this is not necessary because PF_REBUILD is set for the
-	// "roundStart" parm in Parms.cpp so it will rebuild if that parm
+	// "spiderRoundStart" parm in Parms.cpp so it will rebuild if that parm
 	// changes already.
 	//if ( cr->m_spiderColl )
 	//	cr->m_spiderColl->m_waitingTreeNeedsRebuild = true;
@@ -5389,10 +5389,10 @@ void doneSendingNotification ( void *state ) {
 	g_parms.addNewParmToList1 ( &parmList,cr->m_collnum,roundStr,-1 ,
 				    "spiderRoundNum");
 	g_parms.addNewParmToList1 ( &parmList,cr->m_collnum,roundTime, -1 ,
-				    "roundStart");
+				    "spiderRoundStart");
 
 	//g_parms.addParmToList1 ( &parmList , cr , "spiderRoundNum" );
-	//g_parms.addParmToList1 ( &parmList , cr , "roundStart" );
+	//g_parms.addParmToList1 ( &parmList , cr , "spiderRoundStart" );
 	// this uses msg4 so parm ordering is guaranteed
 	g_parms.broadcastParmList ( &parmList , NULL , NULL );
22	XmlDoc.cpp
@@ -1299,6 +1299,12 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
 	m_conceptWeightValid = true;
 	*/
 
+	// fix some corruption i've seen
+	if ( m_sreq.m_urlIsDocId && ! is_digit(m_sreq.m_url[0]) ) {
+		log("xmldoc: fixing sreq %s to non docid",m_sreq.m_url);
+		m_sreq.m_urlIsDocId = 0;
+	}
+
 	// if url is a docid... we are from pagereindex.cpp
 	//if ( sreq->m_isPageReindex ) {
 	// now we can have url-based page reindex requests because
@@ -1306,8 +1312,8 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
 	// we add a spider request of the PARENT url for it as page reindex
 	//if ( is_digit ( sreq->m_url[0] ) ) {
 	// watch out for 0.r.msn.com!!
-	if ( sreq->m_urlIsDocId ) {
-		m_docId = atoll(sreq->m_url);
+	if ( m_sreq.m_urlIsDocId ) {
+		m_docId = atoll(m_sreq.m_url);
 		// assume its good
 		m_docIdValid = true;
 		// similar to set3() above
@@ -1321,7 +1327,7 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
 	// add www is now REQUIRED for all!
 	// crap, injection of tmblr.co/ZHw5yo1E5TAaW fails because
 	// www.tmblr.co has no IP
-	setFirstUrl ( sreq->m_url , false );//true ); // false );
+	setFirstUrl ( m_sreq.m_url , false );//true ); // false );
 	// you can't call this from a docid based url until you
 	// know the uh48
 	//setSpideredTime();
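
These two hunks switch the reads from the caller's `sreq` pointer to the member copy `m_sreq` because the corruption fix above mutates only the copy: if later code kept reading `sreq`, it would still see the stale m_urlIsDocId flag. A minimal sketch of that mutate-the-copy-then-read-the-copy pattern, with a stripped-down stand-in for SpiderRequest:

    #include <cstdio>

    struct Req { bool m_urlIsDocId; const char *m_url; };

    int main ( ) {
    	Req sreq = { true , "http://example.com/" }; // corrupt: docid flag, non-digit url
    	Req m_sreq = sreq;                           // the member copy

    	// the fix clears the flag on the copy only
    	if ( m_sreq.m_urlIsDocId &&
    	     ! ( m_sreq.m_url[0] >= '0' && m_sreq.m_url[0] <= '9' ) )
    		m_sreq.m_urlIsDocId = false;

    	printf ( "sreq=%d m_sreq=%d\n" , sreq.m_urlIsDocId , m_sreq.m_urlIsDocId ); // 1 0
    	return 0;
    }
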
@@ -13754,6 +13760,12 @@ SafeBuf *XmlDoc::getTokenizedDiffbotReply ( ) {
 	bool inQuotes = false;
 	// scan now
 	for ( ; *x ; x++ ) {
+		// escaping a backslash?
+		if ( *x == '\\' && x[1] == '\\' ) {
+			// skip two bytes then..
+			x++;
+			continue;
+		}
 		// escaping a quote? ignore quote then.
 		if ( *x == '\\' && x[1] == '\"' ) {
 			// skip two bytes then..
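
The added case matters because it runs before the escaped-quote check: without it, the byte sequence \\" (an escaped backslash followed by a real closing quote) would be misread as an escaped quote and the inQuotes state would drift. A standalone version of the loop demonstrating that:

    #include <cstdio>

    int main ( ) {
    	const char *json = "{\"path\":\"C:\\\\\"}"; // bytes: {"path":"C:\\"}
    	bool inQuotes = false;
    	for ( const char *x = json ; *x ; x++ ) {
    		if ( *x == '\\' && x[1] == '\\' ) { x++; continue; } // escaped backslash
    		if ( *x == '\\' && x[1] == '\"' ) { x++; continue; } // escaped quote
    		if ( *x == '\"' ) inQuotes = ! inQuotes;             // real quote
    	}
    	// without the first case, the quote after C:\\ is swallowed and this drifts
    	printf ( "quotes balanced: %s\n" , inQuotes ? "no" : "yes" ); // yes
    	return 0;
    }
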
@@ -16125,7 +16137,7 @@ void XmlDoc::filterStart_r ( bool amThread ) {
 		snprintf(cmd,2047 ,"ulimit -v 25000 ; ulimit -t 30 ; nice -n 19 %s/pdftohtml -q -i -noframes -stdout %s > %s", wdir , in ,out );
 	else if ( ctype == CT_DOC )
 		// "wdir" include trailing '/'? not sure
-		snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30 ; ANTIWORDHOME=%s/antiword-dir ; nice -n 19 %s/antiword %s> %s" , wdir , wdir , in , out );
+		snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30 ; export ANTIWORDHOME=%s/antiword-dir ; nice -n 19 %s/antiword %s> %s" , wdir , wdir , in , out );
 	else if ( ctype == CT_XLS )
 		snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30 ; timeout 10s nice -n 19 %s/xlhtml %s > %s" , wdir , in , out );
 	// this is too buggy for now... causes hanging threads because it
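
The one-word fix here is the `export`: a plain `ANTIWORDHOME=... ;` assignment stays local to the wrapper shell, so the antiword child process never sees the variable; exporting it puts it in the environment the child inherits. A tiny demonstration via system(), with a hypothetical path:

    #include <cstdlib>

    int main ( ) {
    	// child prints an empty value: AW was assigned but never exported
    	system ( "AW=/tmp/antiword-dir ; sh -c 'echo unexported: $AW'" );
    	// child prints /tmp/antiword-dir: export puts AW in its environment
    	system ( "export AW=/tmp/antiword-dir ; sh -c 'echo exported: $AW'" );
    	return 0;
    }
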
@@ -20356,6 +20368,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 	ksr.m_avoidSpiderLinks = 1;
 	// avoid EDOCUNCHANGED
 	ksr.m_ignoreDocUnchangedError = 1;
+	// no longer docid based we set it to parentUrl
+	ksr.m_urlIsDocId = 0;
 	// but it is not docid based, so overwrite the docid
 	// in ksr.m_url with the parent multidoc url. it \0 terms it.
 	strcpy(ksr.m_url , parentUrl );//, MAX_URL_LEN-1);