Merge branch 'diffbot-testing' into testing

Conflicts:
	Parms.cpp
	XmlDoc.cpp
mwells
2014-06-19 21:51:44 -07:00
4 changed files with 167 additions and 38 deletions

@@ -220,6 +220,9 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
long dr = 1;
// do not dedup bulk jobs
if ( cr->m_isCustomCrawl == 2 ) dr = 0;
// do not dedup for regular crawls either; it is too confusing!
// people wonder where the missing results went!
dr = 0;
sb2.safePrintf("GET /search.csv?icc=1&format=csv&sc=0&"
// dedup. since stream=1 and pss=0 below
// this will dedup on page content hash only
@@ -254,12 +257,15 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
long dr = 1;
// do not dedup bulk jobs
if ( cr->m_isCustomCrawl == 2 ) dr = 0;
// do not dedup for regular crawls either; it is too confusing!
// people wonder where the missing results went!
dr = 0;
sb2.safePrintf("GET /search.csv?icc=1&format=json&sc=0&"
// dedup. since stream=1 and pss=0 below
// this will dedup on page content hash only
// which is super fast.
"dr=%li&"
"c=%s&n=1000000&"
"c=%s&n=1000000&"
// we can stream this because, unlike csv, it
// has no header row that needs to be
// computed from all results.
@@ -3245,8 +3251,8 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"<a href=/search?icc=1&format=json&"
// disable site clustering
"sc=0&"
// do dup content removal:
"dr=1&"
// do NOT do dup content removal:
"dr=0&"
"c=%s&n=10000000&rand=%llu&scores=0&id=1&"
"stream=1&" // stream results back as we get them
"q="

Parms.cpp

@@ -397,6 +397,44 @@ bool CommandDeleteColl2 ( char *rec , WaitEntry *we ) {
return true;
}
bool CommandForceNextSpiderRound ( char *rec ) {
// caller must specify collnum
collnum_t collnum = getCollnumFromParmRec ( rec );
// need this
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( ! cr ) {
g_errno = ENOCOLLREC;
log("parms: bad collnum %li for restart spider round",
(long)collnum);
return true;
}
// seems like parmlist is an rdblist, so we have a key_t followed
// by 4 bytes of datasize then the data... which is an ascii string
// in our case...
char *data = getDataFromParmRec ( rec );
unsigned long roundStartTime; // the %lu in the sscanf() below expects unsigned
long newRoundNum;
// see the HACK: in Parms::convertHttpRequestToParmList() where we
// construct this data in response to a "roundStart" cmd. we used
// sprintf() so it's natural to use sscanf() to parse it out.
sscanf ( data , "%lu,%li", &roundStartTime,&newRoundNum);
cr->m_spiderRoundStartTime = roundStartTime;
cr->m_spiderRoundNum = newRoundNum;
// reset the round counts. this will log a msg. resetting the
// round counts will prevent maxToProcess/maxToCrawl from holding
// us back...
spiderRoundIncremented ( cr );
// yeah, if we don't nuke doledb (which rebuildUrlFilters() does) then
// the restart does not take effect...
cr->rebuildUrlFilters();
return true;
}
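A rough sketch of the parm rec layout the comment above guesses at (a key, then a 4-byte datasize, then the payload); the type and accessor names below are hypothetical stand-ins, not the real getDataFromParmRec():

    #include <cstring>
    // hypothetical 12-byte key type standing in for Gigablast's key_t
    struct key_t96 { char k[12]; };
    // layout assumption: [ key ][ 4-byte size ][ size bytes of data ]
    static char *dataFromRec ( char *rec ) {
        return rec + sizeof(key_t96) + 4;
    }
    static long dataSizeFromRec ( char *rec ) {
        long size = 0;
        // copy only the 4-byte size field (assumes little-endian, as the code does)
        memcpy ( &size , rec + sizeof(key_t96) , 4 );
        return size;
    }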
// . returns true and sets g_errno on error
// . returns false if would block
bool CommandRestartColl ( char *rec , WaitEntry *we ) {
@@ -9429,6 +9467,67 @@ void Parms::init ( ) {
m->m_def = "unspecified";
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_title = "spider round start time";
m->m_desc = "When the next spider round starts. If you force this to "
"zero it sets it to the current time. That way you can "
"respider all the urls that were already spidered, and urls "
"that were not yet spidered in the round will still be "
"spidered.";
m->m_cgi = "spiderRoundStart";
m->m_size = 0;
m->m_off = (char *)&cr.m_spiderRoundStartTime - x;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_HIDDEN | PF_REBUILDURLFILTERS ;
m++;
// DIFFBOT:
// this http parm actually adds the "forceround" parm to the parmlist
// below with the appropriate args.
m->m_title = "manually restart a spider round";
m->m_desc = "Updates round number and resets local processed "
"and crawled counts to 0.";
m->m_cgi = "roundStart";
m->m_type = TYPE_CMD;
m->m_func = NULL;
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_HIDDEN;
m++;
// DIFFBOT:
// . this is sent to each shard by issuing a "roundStart=" cmd
// . similar to the "addcoll" cmd, we add args to it, make it
// the "forceround" cmd parm, and add THAT to the parmlist.
// so "roundStart=1" is really an alias for "forceround".
m->m_title = "manually restart a spider round on shard";
m->m_desc = "Updates round number and resets local processed "
"and crawled counts to 0.";
m->m_cgi = "forceround";
//m->m_off = (char *)&cr.m_spiderRoundStartTime - x;
m->m_type = TYPE_CMD;
m->m_func = CommandForceNextSpiderRound;
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_HIDDEN | PF_REBUILDURLFILTERS ;
m++;
m->m_title = "spider round num";
m->m_desc = "The spider round number.";
m->m_cgi = "spiderRoundNum";
m->m_off = (char *)&cr.m_spiderRoundNum - x;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_HIDDEN ;
m++;
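These entries rely on Parms' offset-based table: m_off records where the member lives inside CollectionRec so generic code can read or write it by type. A minimal sketch of that mechanism, with hypothetical names rather than the real Parms machinery:

    // hypothetical: poke a TYPE_LONG parm into a rec via its recorded offset
    struct Rec { long m_spiderRoundNum; };
    static void setLongParm ( Rec *rec , long off , long value ) {
        // off was computed like: (char *)&cr.m_spiderRoundNum - (char *)&cr
        *(long *)((char *)rec + off) = value;
    }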
m->m_title = "send email alerts to sysadmin";
@@ -15204,30 +15303,6 @@ void Parms::init ( ) {
m->m_group = 0;
m++;*/
m->m_title = "spider round start time";
m->m_desc = "When the spider round started";
m->m_cgi = "roundStart";
m->m_off = (char *)&cr.m_spiderRoundStartTime - x;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_REBUILDURLFILTERS ;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "spider round num";
m->m_desc = "The spider round number.";
m->m_cgi = "spiderRoundNum";
m->m_off = (char *)&cr.m_spiderRoundNum - x;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN ;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "scraping enabled procog";
m->m_desc = "Do searches for queries in this hosts part of the "
"query log.";
@@ -18937,6 +19012,38 @@ bool Parms::convertHttpRequestToParmList (HttpRequest *hr, SafeBuf *parmList,
}
}
// . DIFFBOT HACK: so people can manually restart a spider round
// . val can be 0 or 1 or anything. i.e. roundStart=0 works.
// . map this parm to another parm with the round start
// time (current time) and the new round # as the args.
// . this will call CommandForceNextSpiderRound() function
// on every shard with these args, "tmpVal".
if ( strcmp(m->m_cgi,"roundStart") == 0 ) {
// use the current time so anything spidered before
// this time (the round start time) will be respidered
//sprintf(tmp,"%lu",getTimeGlobalNoCore());
//val = tmp;
char tmpVal[64];
// use the same round start time for all shards
sprintf(tmpVal,
"%lu,%li"
,getTimeGlobalNoCore()
,cr->m_spiderRoundNum+1
);
// . also add command to reset crawl/process counts
// so if you hit maxToProcess/maxToCrawl it will
// not stop the round from restarting
// . CommandResetCrawlCounts()
if ( ! addNewParmToList1 ( parmList ,
parmCollnum ,
tmpVal, // a string
0 , // occNum (for arrays)
"forceround" ) )
return false;
// don't bother going below
continue;
}
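The "%lu,%li" payload built here has to stay in sync with the sscanf() in CommandForceNextSpiderRound() above, since that is its only consumer. A self-contained sketch of the round trip (the two values are hypothetical):

    #include <cstdio>
    int main () {
        char tmpVal[64];
        unsigned long startTime = 1403238704UL; // stands in for getTimeGlobalNoCore()
        long nextRound = 5;                     // stands in for cr->m_spiderRoundNum+1
        sprintf ( tmpVal , "%lu,%li" , startTime , nextRound );
        // each shard parses the same string back out:
        unsigned long roundStartTime = 0; long newRoundNum = 0;
        sscanf ( tmpVal , "%lu,%li" , &roundStartTime , &newRoundNum );
        printf ( "%lu %li\n" , roundStartTime , newRoundNum );
        return 0;
    }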
// if a collection name was also provided, assume that is
// the target of the reset/delete/restart. we still
// need PageAddDelete.cpp to work...
@@ -19023,16 +19130,18 @@ bool Parms::convertHttpRequestToParmList (HttpRequest *hr, SafeBuf *parmList,
if ( m->m_obj == OBJ_NONE ) continue;
if ( m->m_obj == OBJ_SI ) continue;
// convert spiderRoundStartTime=0 to
// spiderRoundStartTime=<currenttime>+30secs
// convert spiderRoundStartTime=0 (roundStart=0 roundStart=1)
// to spiderRoundStartTime=<currenttime>+30secs
// so that will force the next spider round to kick in
/*
bool restartRound = false;
char tmp[24];
if ( strcmp(field,"roundStart")==0 &&
val && (val[0]=='0'||val[0]=='1') && val[1]==0 ) {
val && (val[0]=='0'||val[0]=='1') && val[1]==0 )
sprintf(tmp,"%lu",(long)getTimeGlobalNoCore()+0);
val = tmp;
}
*/
// add it to a list now
if ( ! addNewParmToList2 ( parmList ,

@@ -5377,7 +5377,7 @@ void doneSendingNotification ( void *state ) {
// waiting tree will usually be empty for this coll since no
// spider requests had a valid spider priority, so let's rebuild!
// this is not necessary because PF_REBUILD is set for the
// "roundStart" parm in Parms.cpp so it will rebuild if that parm
// "spiderRoundStart" parm in Parms.cpp so it will rebuild if that parm
// changes already.
//if ( cr->m_spiderColl )
// cr->m_spiderColl->m_waitingTreeNeedsRebuild = true;
@@ -5389,10 +5389,10 @@ void doneSendingNotification ( void *state ) {
g_parms.addNewParmToList1 ( &parmList,cr->m_collnum,roundStr,-1 ,
"spiderRoundNum");
g_parms.addNewParmToList1 ( &parmList,cr->m_collnum,roundTime, -1 ,
"roundStart");
"spiderRoundStart");
//g_parms.addParmToList1 ( &parmList , cr , "spiderRoundNum" );
//g_parms.addParmToList1 ( &parmList , cr , "roundStart" );
//g_parms.addParmToList1 ( &parmList , cr , "spiderRoundStart" );
// this uses msg4 so parm ordering is guaranteed
g_parms.broadcastParmList ( &parmList , NULL , NULL );

@@ -1299,6 +1299,12 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
m_conceptWeightValid = true;
*/
// fix some corruption i've seen
if ( m_sreq.m_urlIsDocId && ! is_digit(m_sreq.m_url[0]) ) {
log("xmldoc: fixing sreq %s to non docid",m_sreq.m_url);
m_sreq.m_urlIsDocId = 0;
}
// if url is a docid... we are from pagereindex.cpp
//if ( sreq->m_isPageReindex ) {
// now we can have url-based page reindex requests because
@@ -1306,8 +1312,8 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
// we add a spider request of the PARENT url for it as page reindex
//if ( is_digit ( sreq->m_url[0] ) ) {
// watch out for 0.r.msn.com!!
if ( sreq->m_urlIsDocId ) {
m_docId = atoll(sreq->m_url);
if ( m_sreq.m_urlIsDocId ) {
m_docId = atoll(m_sreq.m_url);
// assume its good
m_docIdValid = true;
// similar to set3() above
@@ -1321,7 +1327,7 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
// add www is now REQUIRED for all!
// crap, injection of tmblr.co/ZHw5yo1E5TAaW fails because
// www.tmblr.co has no IP
setFirstUrl ( sreq->m_url , false );//true ); // false );
setFirstUrl ( m_sreq.m_url , false );//true ); // false );
// you can't call this from a docid based url until you
// know the uh48
//setSpideredTime();
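These set4() hunks lean on the convention that a docid-based "url" is just the decimal docid in m_url, which is why a leading non-digit means the flag is corrupt. A standalone sketch of that disambiguation (helper names are hypothetical):

    #include <cstdlib>
    #include <cctype>
    // a docid-based request stores only digits in its url buffer
    static bool looksLikeDocId ( const char *url ) {
        return isdigit ( (unsigned char)url[0] );
    }
    static long long docIdFromUrl ( const char *url ) {
        return atoll ( url ); // same parse the code above uses
    }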
@@ -13754,6 +13760,12 @@ SafeBuf *XmlDoc::getTokenizedDiffbotReply ( ) {
bool inQuotes = false;
// scan now
for ( ; *x ; x++ ) {
// escaping a backslash?
if ( *x == '\\' && x[1] == '\\' ) {
// skip two bytes then..
x++;
continue;
}
// escaping a quote? ignore quote then.
if ( *x == '\\' && x[1] == '\"' ) {
// skip two bytes then..
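Ordering matters in this loop: the new backslash-backslash case must be checked before the backslash-quote case, otherwise a JSON sequence like \\" (an escaped backslash followed by a real, string-closing quote) would wrongly be read as an escaped quote. A simplified standalone version of the corrected scan (json is a hypothetical input pointer):

    bool inQuotes = false;
    for ( char *x = json ; *x ; x++ ) {
        // escaped backslash: consume both bytes first
        if ( *x == '\\' && x[1] == '\\' ) { x++; continue; }
        // escaped quote: does not open or close a string
        if ( *x == '\\' && x[1] == '\"' ) { x++; continue; }
        // a bare quote toggles string state
        if ( *x == '\"' ) inQuotes = ! inQuotes;
    }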
@@ -16125,7 +16137,7 @@ void XmlDoc::filterStart_r ( bool amThread ) {
snprintf(cmd,2047 ,"ulimit -v 25000 ; ulimit -t 30 ; nice -n 19 %s/pdftohtml -q -i -noframes -stdout %s > %s", wdir , in ,out );
else if ( ctype == CT_DOC )
// "wdir" include trailing '/'? not sure
snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30 ; ANTIWORDHOME=%s/antiword-dir ; nice -n 19 %s/antiword %s> %s" , wdir , wdir , in , out );
snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30 ; export ANTIWORDHOME=%s/antiword-dir ; nice -n 19 %s/antiword %s> %s" , wdir , wdir , in , out );
else if ( ctype == CT_XLS )
snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30 ; timeout 10s nice -n 19 %s/xlhtml %s > %s" , wdir , in , out );
// this is too buggy for now... causes hanging threads because it
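The antiword fix works because with "VAR=val ; cmd" the variable is only a shell variable, while "export VAR=val ; cmd" places it in the environment that cmd inherits. A quick way to see the difference (illustrative only):

    #include <cstdlib>
    int main () {
        system ( "FOO=1 ; env | grep FOO" );        // prints nothing
        system ( "export FOO=1 ; env | grep FOO" ); // prints FOO=1
        return 0;
    }

("VAR=val cmd" on one line, with no semicolon, would also work and avoids mutating the shell's own state.)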
@@ -20356,6 +20368,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
ksr.m_avoidSpiderLinks = 1;
// avoid EDOCUNCHANGED
ksr.m_ignoreDocUnchangedError = 1;
// no longer docid based; we set it to parentUrl below
ksr.m_urlIsDocId = 0;
// but it is not docid based, so overwrite the docid
// in ksr.m_url with the parent multidoc url. strcpy() \0-terminates it.
strcpy(ksr.m_url , parentUrl );//, MAX_URL_LEN-1);
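Note that strcpy() trusts parentUrl to fit in ksr.m_url; the commented-out MAX_URL_LEN-1 suggests a bounded copy was considered. A hedged sketch of the bounded form, assuming m_url really is a MAX_URL_LEN-sized buffer:

    // assumption: ksr.m_url is a char[MAX_URL_LEN] buffer
    strncpy ( ksr.m_url , parentUrl , MAX_URL_LEN - 1 );
    ksr.m_url [ MAX_URL_LEN - 1 ] = '\0'; // strncpy may not terminate on its own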