Merge branch 'diffbot-testing' into diffbot-sam

This commit is contained in:
Matt 2015-05-19 19:39:32 -07:00
commit 145c125abd
9 changed files with 205 additions and 41 deletions

@@ -463,12 +463,24 @@ bool Collectiondb::addNewColl ( char *coll ,
if ( ! h ) {
log("crawlbot: bad custom collname");
g_errno = EBADENGINEER;
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
delete ( cr );
return true;
}
*h = '\0';
crawl = h + 1;
if ( ! crawl[0] ) {
log("crawlbot: bad custom crawl name");
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
delete ( cr );
g_errno = EBADENGINEER;
return true;
}
// or if too big!
if ( gbstrlen(crawl) > 30 ) {
log("crawlbot: crawlbot crawl NAME is over 30 chars");
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
delete ( cr );
g_errno = EBADENGINEER;
return true;
}
@@ -1939,12 +1951,13 @@ bool CollectionRec::load ( char *coll , int32_t i ) {
// the list of ip addresses that we have detected as being throttled
// and therefore backoff and use proxies for
sb.reset();
sb.safePrintf("%scoll.%s.%"INT32"/",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
m_twitchyTable.m_allocName = "twittbl";
m_twitchyTable.load ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
if ( ! g_conf.m_doingCommandLine ) {
sb.reset();
sb.safePrintf("%scoll.%s.%"INT32"/",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
m_twitchyTable.m_allocName = "twittbl";
m_twitchyTable.load ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
}
@@ -3472,6 +3485,70 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
char *ppp = m_diffbotPageProcessPattern.getBufStart();
if ( ppp && ! ppp[0] ) ppp = NULL;
///////
//
// recompile regular expressions
//
///////
if ( m_hasucr ) {
regfree ( &m_ucr );
m_hasucr = false;
}
if ( m_hasupr ) {
regfree ( &m_upr );
m_hasupr = false;
}
// copy into tmpbuf
SafeBuf tmp;
char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasucr = true;
}
int32_t err;
if ( rx && ( err = regcomp ( &m_ucr , tmp.getBufStart() ,
REG_EXTENDED| //REG_ICASE|
REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
// error!
char errbuf[1024];
regerror(err,&m_ucr,errbuf,1000);
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,errbuf);
regfree ( &m_ucr );
m_hasucr = false;
}
rx = m_diffbotUrlProcessRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) m_hasupr = true;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasupr = true;
}
if ( rx && ( err = regcomp ( &m_upr , tmp.getBufStart() ,
REG_EXTENDED| // REG_ICASE|
REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
char errbuf[1024];
regerror(err,&m_upr,errbuf,1000);
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,errbuf);
regfree ( &m_upr );
m_hasupr = false;
}
// what diffbot url to use for processing
char *api = m_diffbotApiUrl.getBufStart();
@@ -3913,17 +3990,20 @@ void testRegex ( ) {
rx = ".*?article[0-9]*?.html";
regex_t ucr;
int32_t err;
if ( regcomp ( &ucr , rx ,
REG_ICASE
|REG_EXTENDED
//|REG_NEWLINE
//|REG_NOSUB
) ) {
if ( ( err = regcomp ( &ucr , rx ,
REG_ICASE
|REG_EXTENDED
//|REG_NEWLINE
//|REG_NOSUB
) ) ) {
// error!
char errbuf[1024];
regerror(err,&ucr,errbuf,1000);
log("xmldoc: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
rx,errbuf);
}
logf(LOG_DEBUG,"db: compiled '%s' for crawl pattern",rx);

@@ -108,6 +108,7 @@ case EDNSBAD : return "DNS sent an unknown response code";
case EDNSREFUSED : return "DNS refused to talk";
case EDNSDEAD : return "DNS hostname does not exist";
case EDNSTIMEDOUT : return "DNS timed out";
case EDNSERROR : return "DNS lookup error";
case ECOLLTOOBIG : return "Collection is too long";
case ESTRIKEOUT : return "Retried enough times, deleting doc";
case ENOPERM : return "Permission Denied";

@@ -112,6 +112,7 @@ enum {
EDNSREFUSED , //dns refused to talk to us
EDNSDEAD , //dns is dead
EDNSTIMEDOUT , //was just EUDPTIMEDOUT
EDNSERROR ,
ECOLLTOOBIG , //collection is too long
ESTRIKEOUT , //retried enough times; deleting doc & giving up
ENOPERM , //permission denied

@@ -700,7 +700,7 @@ void Mem::addMem ( void *mem , int32_t size , const char *note , char isnew ) {
//(int32_t)mem,size,h,s_n,note);
s_n++;
// debug
if ( size > MINMEM && g_conf.m_logDebugMemUsage )
if ( (size > MINMEM && g_conf.m_logDebugMemUsage) || size>=100000000 )
log(LOG_INFO,"mem: addMem(%"INT32"): %s. ptr=0x%"PTRFMT" "
"used=%"INT64"",
size,note,(PTRTYPE)mem,m_used);
@@ -1023,7 +1023,7 @@ bool Mem::rmMem ( void *mem , int32_t size , const char *note ) {
keepgoing:
// debug
if ( size > MINMEM && g_conf.m_logDebugMemUsage )
if ( (size > MINMEM && g_conf.m_logDebugMemUsage) || size>=100000000 )
log(LOG_INFO,"mem: rmMem (%"INT32"): "
"ptr=0x%"PTRFMT" %s.",size,(PTRTYPE)mem,note);

@@ -194,11 +194,11 @@ bool Msg2::getLists ( ) {
int32_t minRecSize = m_minRecSizes[m_i];
// sanity check
if ( ( minRecSize > ( 500 * 1024 * 1024 ) ||
minRecSize < 0) ){
log( "minRecSize = %"INT32"", minRecSize );
char *xx=NULL; *xx=0;
}
// if ( ( minRecSize > ( 500 * 1024 * 1024 ) ||
// minRecSize < 0) ){
// log( "minRecSize = %"INT32"", minRecSize );
// char *xx=NULL; *xx=0;
// }
//bool forceLocalIndexdb = true;
// if it is a no-split term, we may gotta get it over the net
@@ -407,7 +407,13 @@ bool Msg2::getLists ( ) {
// like 90MB last time i checked. so it won't read more
// than that...
int32_t minRecSizes = DEFAULT_POSDB_READSIZE;
// MDW: no, it's better to print oom than not give all the
// results leaving users scratching their heads. besides,
// we should do docid range splitting before we go out of
// mem. we should also report the size of each termlist
// in bytes in the query info header.
//int32_t minRecSizes = DEFAULT_POSDB_READSIZE;
int32_t minRecSizes = -1;
// start up the read. thread will wait in thread queue to
// launch if too many threads are out.
@@ -596,12 +602,13 @@ bool Msg2::gotList ( RdbList *list ) {
for ( int32_t i = 0 ; i < m_numLists ; i++ ) {
if ( m_lists[i].m_listSize < m_minRecSizes[i] ) continue;
if ( m_minRecSizes[i] == 0 ) continue;
if ( m_minRecSizes[i] == -1 ) continue;
// do not print this if compiling section xpathsitehash stats
// because we only need like 10k of list to get a decent
// reading
if ( m_req->m_forSectionStats ) break;
log("msg2: read termlist #%"INT32" size=%"INT32" maxSize=%"INT32". losing "
"docIds!",
log("msg2: read termlist #%"INT32" size=%"INT32" "
"maxSize=%"INT32". losing docIds!",
i,m_lists[i].m_listSize,m_minRecSizes[i]);
}

@@ -377,6 +377,9 @@ bool Msg3a::gotCacheReply ( ) {
// 'time enough for love' query was hitting 30MB termlists.
//rs = 50000000;
rs = DEFAULT_POSDB_READSIZE;//90000000; // 90MB!
// it is better to go oom than leave users scratching their
// heads as to why some results are not being returned.
rs = -1;
// if section stats, limit to 1MB
//if ( m_r->m_getSectionStats ) rs = 1000000;
// get the jth query term

@@ -182,9 +182,10 @@ bool Msg5::getList ( char rdbId ,
// log("Msg5::readList: startKey > endKey warning");
// we no longer allow negative minRecSizes
if ( minRecSizes < 0 ) {
log(LOG_LOGIC,"net: msg5: MinRecSizes < 0, using 1.");
minRecSizes = 1;
char *xx = NULL; *xx = 0;
if ( g_conf.m_logDebugDb )
log(LOG_LOGIC,"net: msg5: MinRecSizes < 0, using 2GB.");
minRecSizes = 0x7fffffff;
//char *xx = NULL; *xx = 0;
}
// ensure startKey last bit clear, endKey last bit set
//if ( (startKey.n0 & 0x01) == 0x01 )

@@ -438,6 +438,25 @@ bool Msg1c::gotList ( ) {
// use only 64k values so we don't stress doledb/waittrees/etc.
// for large #'s of docids
int32_t firstIp = (docId & 0x0000ffff);
// bits 6-13 of the docid are the domain hash so use those
// when doing a REINDEX (not delete!) to ensure that requests
// on the same domain go to the same shard, at least when
// we have up to 256 shards. if we have more than 256 shards
// at this point some shards will not participate in the
// query reindex/delete process because of this, so
// we'll want to allow more bits in, in that case, perhaps.
// check out Hostdb::getShardNum(RDB_SPIDERDB) in Hostdb.cpp
// to see what shard is responsible for storing and indexing
// this SpiderRequest based on the firstIp.
if ( ! m_forceDel ) {
// if we are a REINDEX not a delete because
// deletes don't need to spider/redownload the doc
// so the distribution can be more random
firstIp >>= 6;
firstIp &= 0xff;
}
// 0 is not a legit val. it'll core below.
if ( firstIp == 0 ) firstIp = 1;
// use a fake ip

@@ -1800,7 +1800,10 @@ bool XmlDoc::set2 ( char *titleRec ,
//m_hasContactInfoValid = true;
// sanity check. if m_siteValid is true, this must be there
if ( ! ptr_site ) { char *xx=NULL;*xx=0; }
if ( ! ptr_site ) {
log("set4: ptr_site is null for docid %"INT64"",m_docId);
//char *xx=NULL;*xx=0; }
}
// lookup the tagdb rec fresh if setting for a summary. that way we
// can see if it is banned or not
@@ -2469,16 +2472,50 @@ bool XmlDoc::indexDoc ( ) {
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
// sanity log
if ( *fip == 0 || *fip == -1 ) {
//
// now add a spider status doc for this so we know
// why a crawl might have failed to start
//
SafeBuf *ssDocMetaList = NULL;
// save this
int32_t saved = m_indexCode;
// and make it the real reason for the spider status doc
m_indexCode = EDNSERROR;
// get the spiderreply ready to be added
ssDocMetaList = getSpiderStatusDocMetaList(NULL ,false);//del
// revert
m_indexCode = saved;
// error?
if ( ! ssDocMetaList ) return true;
// blocked?
if ( ssDocMetaList == (void *)-1 ) return false;
// need to alloc space for it too
char *list = ssDocMetaList->getBufStart();
int32_t len = ssDocMetaList->length();
//needx += len;
// this too
m_addedStatusDocSize = len;
m_addedStatusDocSizeValid = true;
char *url = "unknown";
if ( m_sreqValid ) url = m_sreq.m_url;
log("build: error2 getting real firstip of %"INT32" for "
"%s. Not adding new spider req", (int32_t)*fip,url);
// also count it as a crawl attempt
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
if ( ! m_metaList2.safeMemcpy ( list , len ) )
return true;
goto skipNewAdd1;
}
// store the new request (store reply for this below)
char rd = RDB_SPIDERDB;
if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
m_metaList2.pushChar(rd);
if ( ! m_metaList2.pushChar(rd) )
return true;
// store it here
SpiderRequest revisedReq;
// this fills it in
@@ -27312,7 +27349,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList ( SpiderReply *reply ,
// . TODO:
// usedProxy:1
// proxyIp:1.2.3.4
SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
setStatus ( "making spider reply meta list");
@@ -27333,8 +27370,8 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
int64_t *uqd = getAvailDocIdOnly ( d );
if ( ! uqd || uqd == (void *)-1 ) return (SafeBuf *)uqd;
unsigned char *hc = (unsigned char *)getHopCount();
if ( ! hc || hc == (void *)-1 ) return (SafeBuf *)hc;
// unsigned char *hc = (unsigned char *)getHopCount();
// if ( ! hc || hc == (void *)-1 ) return (SafeBuf *)hc;
int32_t tmpVal = -1;
int32_t *priority = &tmpVal;
@@ -27370,7 +27407,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; }
// why isn't gbhopcount: being indexed consistently?
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
// reset just in case
m_spiderStatusDocMetaList.reset();
@@ -27405,12 +27442,17 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
jd.safePrintf("\"gbssFinalRedirectUrl\":\"%s\",\n",
ptr_redirUrl);
if ( m_indexCodeValid ) {
jd.safePrintf("\"gbssStatusCode\":%i,\n",(int)m_indexCode);
jd.safePrintf("\"gbssStatusMsg\":\"");
jd.jsonEncode (mstrerror(m_indexCode));
jd.safePrintf("\",\n");
}
else {
jd.safePrintf("\"gbssStatusCode\":-1,\n");
jd.safePrintf("\"gbssStatusMsg\":\"???\",\n");
}
jd.safePrintf("\"gbssStatusCode\":%i,\n",(int)m_indexCode);
jd.safePrintf("\"gbssStatusMsg\":\"");
jd.jsonEncode (mstrerror(m_indexCode));
jd.safePrintf("\",\n");
if ( m_httpStatusValid )
jd.safePrintf("\"gbssHttpStatus\":%"INT32",\n",
@@ -27454,12 +27496,15 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
jd.safePrintf("\",\n");
//if ( m_redirUrlPtr && m_redirUrlValid )
jd.safePrintf("\"gbssNumRedirects\":%"INT32",\n",
m_numRedirects);
//if ( m_numRedirectsValid )
jd.safePrintf("\"gbssNumRedirects\":%"INT32",\n",m_numRedirects);
jd.safePrintf("\"gbssDocId\":%"INT64",\n", m_docId);//*uqd);
if ( m_docIdValid )
jd.safePrintf("\"gbssDocId\":%"INT64",\n", m_docId);//*uqd);
jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)*hc);
if ( m_hopCountValid )
//jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)*hc);
jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)m_hopCount);
// crawlbot round
if ( cr->m_isCustomCrawl )
@@ -27886,6 +27931,13 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
xd->ptr_site = ptr_site;
xd->size_site = size_site;
// if this is null then ip lookup failed i guess so just use
// the subdomain
if ( ! ptr_site && m_firstUrlValid ) {
xd->ptr_site = m_firstUrl.getHost();
xd->size_site = m_firstUrl.getHostLen();
}
// use the same uh48 of our parent
int64_t uh48 = m_firstUrl.getUrlHash48();
// then make into a titlerec but store in metalistbuf, not m_titleRec