Merge branch 'diffbot-testing' of github.com:gigablast/open-source-search-engine into diffbot-testing

Conflicts:
	Spider.cpp
Matt Wells 2015-06-18 08:40:53 -07:00
commit 18dbaf89c9
10 changed files with 234 additions and 109 deletions

@@ -3573,6 +3573,8 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
float respiderFreq = m_collectiveRespiderFrequency;
if ( respiderFreq <= 0.0 ) respiderFreq = 3652.5;
// lower from 7 to 1 since we have so many collections now
int32_t diffbotipms = 1; // 7
// make the gigablast regex table just "default" so it does no
// filtering, but accepts all urls. we will add code to pass the urls
@@ -3587,10 +3589,10 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
// domains it slows diffbot back-end down, so change this
// from 100 to 7 if doing a bulk job
if ( m_isCustomCrawl == 2 )
m_maxSpidersPerRule[i] = 7;
m_maxSpidersPerRule[i] = 2;// try 2 not 1 to be faster
m_spiderIpWaits [i] = wait;
m_spiderIpMaxSpiders[i] = 7; // keep it respectful
m_spiderIpMaxSpiders[i] = diffbotipms; // keep it respectful
// ethan wants some speed
if ( isEthan )
m_spiderIpMaxSpiders[i] = 30;

@@ -537,7 +537,7 @@ void Mem::addMem ( void *mem , int32_t size , const char *note , char isnew ) {
if ( ! s_initialized ) {
//m_memtablesize = m_maxMem / 6510;
// support 1.2M ptrs for now. good for about 8GB
m_memtablesize = 1200*1024;//m_maxMem / 6510;
m_memtablesize = 3000*1024;//m_maxMem / 6510;
//if ( m_maxMem < 8000000000 ) { char *xx=NULL;*xx=0; }
}

@@ -785,7 +785,7 @@ bool Msg3::doneScanning ( ) {
// if shutting down gb then limit to 20 so we can shut down because
// it can't shut down until all threads are out of the queue i think
if ( g_process.m_mode == EXIT_MODE && max < 0 ) {
log("msg3: forcing retries to 0 because shutting down");
//log("msg3: forcing retries to 0 because shutting down");
max = 0;
}

@@ -17851,7 +17851,8 @@ void Parms::init ( ) {
// can use those to sort regular docs and not have spider reply
// status docs in the serps.
// back on 4/21/2015 seems pretty stable.
m->m_def = "1";
// but it uses disk space so turn off for now again. 6/16/2015
m->m_def = "0";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;

Rdb.cpp

@@ -1833,6 +1833,14 @@ bool Rdb::addList ( collnum_t collnum , RdbList *list,
m_rdbId == RDB_DOLEDB ||
m_rdbId == RDB_SPIDERDB ||
m_rdbId == RDB_REVDB ) ) {
// exception, spider status docs can be deleted from titledb
// if user turns off 'index spider replies' before doing
// the rebuild, when not rebuilding titledb.
if ( m_rdbId == RDB_TITLEDB &&
list->m_listSize == 12 )
goto exception;
// allow banning of sites still
//m_rdbId == RDB_TAGDB ) ) {
log("db: How did an add come in while in repair mode?"
@@ -1840,6 +1848,9 @@ bool Rdb::addList ( collnum_t collnum , RdbList *list,
g_errno = EREPAIRING;
return false;
}
exception:
/*
if ( g_repair.isRepairActive() &&
g_repair.m_fullRebuild &&
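
A hedged aside on the listSize == 12 check in the hunk above (my reading, not repo code): a titledb key is 96 bits and a delete op is serialized as the bare key with no payload, so a single-delete list is exactly 12 bytes long. That is how this change lets "delete the stale spider-status titlerec" through while repair mode still rejects ordinary adds. A minimal sketch, assuming gb's 12-byte key layout and not the real RdbList API:

#include <cstdint>
#include <cstdio>

static const int32_t GB_KEY96_SIZE = 12;   // assumed sizeof(key_t) for titledb keys

// a delete record carries only the key itself -- no dataSize, no data --
// so a list holding exactly one delete is GB_KEY96_SIZE bytes long
bool isLoneTitledbDelete ( int32_t listSize ) {
	return listSize == GB_KEY96_SIZE;
}

int main () {
	printf ( "12-byte list is a lone delete: %d\n" , (int)isLoneTitledbDelete(12) );
	return 0;
}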

@@ -839,6 +839,22 @@ int32_t RdbBase::addFile ( int32_t id , bool isNew , int32_t mergeNum , int32_t
// this returns false and sets g_errno on error
if ( ! m->generateMap ( f ) ) {
log("db: Map generation failed.");
log("db: Moving .dat and .map file to trash dir");
SafeBuf tmp;
tmp.safePrintf("%s",f->getFilename());
// take off .dat and make it * so we can move map file
int32_t len = tmp.getLength();
char *str = tmp.getBufStart();
str[len-3] = '*';
str[len-2] = '\0';
SafeBuf cmd;
cmd.safePrintf("mv %s/%s %s/trash/",
m_dir.getDir(),
str,
g_hostdb.m_dir);
log("db: %s",cmd.getBufStart() );
gbsystem ( cmd.getBufStart() );
exit(0);
mdelete ( f , sizeof(BigFile),"RdbBase");
delete (f);
mdelete ( m , sizeof(RdbMap),"RdbBase");
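
A hedged illustration of the filename rewrite in the hunk above (made-up file and directory names, not the RdbBase code): overwriting the last three characters of "posdb0001.dat" turns it into "posdb0001.*", so a single mv can sweep both the .dat file and its .map into the trash dir.

// sketch only: shows the len-3 / len-2 overwrite on a hypothetical name
#include <cstdio>
#include <cstring>

int main () {
	char name[64] = "posdb0001.dat";      // hypothetical data file
	size_t len = strlen ( name );
	name[len-3] = '*';                    // "dat" -> "*"
	name[len-2] = '\0';                   // terminate right after the '*'
	printf ( "mv ./coll.main.0/%s ./trash/\n" , name );  // -> mv ... posdb0001.*
	return 0;
}
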
@@ -1359,6 +1375,9 @@ void RdbBase::attemptMerge ( int32_t niceness, bool forceMergeAll, bool doLog ,
if ( g_merge.m_isSuspended ) return;
if ( g_merge2.m_isSuspended ) return;
// shutting down? do not start another merge then
if ( g_process.m_mode == EXIT_MODE ) return;
// sanity checks
if ( g_loop.m_inQuickPoll ) {
log("rdb: cant attempt merge in quickpoll");

@@ -323,10 +323,11 @@ bool RdbMap::verifyMap2 ( ) {
KEYSET(lastKey,k,m_ks); continue; }
// just bitch for now
log(
"db: Key out of order in map file %s. "
"db: Key out of order in map file %s%s. "
"page = %"INT32". key offset = %"INT64". Map or data file is "
"corrupt, but it is probably the data file.",
m_file.getFilename() ,
"corrupt, but it is probably the data file. Please "
"delete the map file and restart.",
m_file.m_dir,m_file.getFilename() ,
i,(int64_t)m_pageSize*(int64_t)i+getOffset(i));
//log("db: oldk.n1=%08"XINT32" n0=%016"XINT64"",
@@ -336,6 +337,7 @@ bool RdbMap::verifyMap2 ( ) {
KEY1(lastKey,m_ks),KEY0(lastKey));
log("db: k.n1=%016"XINT64" n0=%016"XINT64"",KEY1(k,m_ks),KEY0(k));
log("db: m_numPages = %"INT32"",m_numPages);
exit(0);
//char *xx=NULL;*xx=0;
// was k too small?
//if ( i + 1 < m_numPages && lastKey <= getKey(i+1) ) {
@@ -1371,6 +1373,9 @@ bool RdbMap::chopHead ( int32_t fileHeadSize ) {
bool RdbMap::generateMap ( BigFile *f ) {
reset();
if ( g_conf.m_readOnlyMode ) return false;
log("db: Generating map for %s/%s",f->m_dir,f->getFilename());
// we don't support headless datafiles right now
if ( ! f->doesPartExist(0) ) {
g_errno = EBADENGINEER;

@@ -20354,6 +20354,17 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
sb->safePrintf("addstatusdocsize=%05"INT32" ",0);
if ( m_useSecondaryRdbs ) {
sb->safePrintf("useposdb=%i ",(int)m_usePosdb);
sb->safePrintf("usetitledb=%i ",(int)m_useTitledb);
sb->safePrintf("useclusterdb=%i ",(int)m_useClusterdb);
sb->safePrintf("usespiderdb=%i ",(int)m_useSpiderdb);
sb->safePrintf("uselinkdb=%i ",(int)m_useLinkdb);
if ( cr )
sb->safePrintf("indexspiderreplies=%i ",(int)
cr->m_indexSpiderReplies);
}
if ( size_imageData && m_imageDataValid ) {
// url is in data now
ThumbnailArray *ta = (ThumbnailArray *)ptr_imageData;
@@ -21913,6 +21924,58 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// returning from a handler that had an error?
if ( g_errno ) return NULL;
// if we are a spider status doc/titlerec and we are doing a rebuild
// operation, then keep it simple
if ( m_setFromTitleRec &&
m_useSecondaryRdbs &&
m_contentTypeValid &&
m_contentType == CT_STATUS ) {
// if not rebuilding posdb then done, list is empty since
// spider status docs do not contribute to linkdb, clusterdb,..
if ( ! m_usePosdb && ! m_useTitledb ) {
m_metaListValid = true;
return m_metaList;
}
/////////////
//
// if user disabled spider status docs then delete the titlerec
// AND the posdb index list from our dbs for this ss doc
//
/////////////
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
if ( ! cr->m_indexSpiderReplies ) {
int64_t uh48 = m_firstUrl.getUrlHash48();
// delete title rec. true = delete?
key_t tkey = g_titledb.makeKey (m_docId,uh48,true);
// shortcut
SafeBuf *ssb = &m_spiderStatusDocMetaList;
// add to list. and we do not add the spider status
// doc to posdb since we deleted its titlerec.
ssb->pushChar(RDB_TITLEDB); // RDB2_TITLEDB2
ssb->safeMemcpy ( &tkey , sizeof(key_t) );
m_metaList = ssb->getBufStart();
m_metaListSize = ssb->getLength ();
m_metaListValid = true;
return m_metaList;
}
// set safebuf to the json of the spider status doc
SafeBuf jd;
if ( ! jd.safeMemcpy ( ptr_utf8Content , size_utf8Content ) )
return NULL;
// set m_spiderStatusDocMetaList from the json
if ( ! setSpiderStatusDocMetaList ( &jd , m_docId ) )
return NULL;
// TODO: support titledb rebuild as well
m_metaList = m_spiderStatusDocMetaList.getBufStart();
m_metaListSize = m_spiderStatusDocMetaList.getLength();
m_metaListValid = true;
return m_metaList;
}
// any other indexing issue? hey! g_errno might not be set here
//if ( m_indexCode ) { g_errno = m_indexCode; return NULL; }
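
A hedged sketch of what the pushChar(RDB_TITLEDB) + safeMemcpy(&tkey,...) pair above serializes (layout inferred; the enum value below is a placeholder, not gb's real one): each meta-list entry is a one-byte rdbId tag followed by the raw key, and a delete carries no payload, so this entry is 1 + 12 = 13 bytes.

// illustrative builder only; not the SafeBuf-based code from the hunk
#include <cstdint>
#include <vector>

enum { FAKE_RDB_TITLEDB = 3 };   // placeholder tag value

void appendTitledbDelete ( std::vector<char> &metaList , const char key96[12] ) {
	metaList.push_back ( (char)FAKE_RDB_TITLEDB );            // 1-byte rdbId tag
	metaList.insert ( metaList.end() , key96 , key96 + 12 );  // bare delete key
}

int main () {
	std::vector<char> ml;
	char tkey[12] = {0};
	appendTitledbDelete ( ml , tkey );
	return (int)( ml.size() != 13 );   // 13 bytes: tag + 96-bit key
}
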
@@ -22937,11 +23000,20 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// i guess it is safe to do this after getting the spiderreply
SafeBuf *spiderStatusDocMetaList = NULL;
//if ( indexReply ) {
// get the spiderreply ready to be added to the rdbs w/ msg4
spiderStatusDocMetaList = getSpiderStatusDocMetaList (newsr,forDelete);
// block?
if ( ! spiderStatusDocMetaList ||
spiderStatusDocMetaList == (void *)-1)
// but if doing a rebuild operation then do not get it, we'll rebuild
// it since it will have its own titlerec
if ( ! m_useSecondaryRdbs ) {
spiderStatusDocMetaList =
getSpiderStatusDocMetaList (newsr,forDelete);
if ( ! spiderStatusDocMetaList ) {
log("build: ss doc metalist null. bad!");
return NULL;
}
}
if ( spiderStatusDocMetaList == (void *)-1)
return (char *)spiderStatusDocMetaList;
//}
@@ -24070,6 +24142,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
spiderStatusDocMetaList->getBufStart() ,
spiderStatusDocMetaList->length() );
m_p += spiderStatusDocMetaList->length();
m_addedStatusDocSize = spiderStatusDocMetaList->length();
m_addedStatusDocSizeValid = true;
}
/*
@@ -27739,21 +27813,121 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
// end the json spider status doc
jd.safePrintf("\n}\n");
// BEFORE ANY HASHING
int32_t savedDist = m_dist;
// add the index list for it. it returns false and sets g_errno on err
// otherwise it sets m_spiderStatusDocMetaList
if ( ! setSpiderStatusDocMetaList ( &jd , *uqd ) )
return NULL;
// now make the titlerec
char xdhead[2048];
// just the head of it. this is the hacky part.
XmlDoc *xd = (XmlDoc *)xdhead;
// clear it out
memset ( xdhead, 0 , 2048);
// copy stuff from THIS so the spider reply "document" has the same
// header info stuff
int32_t hsize = (char *)&ptr_firstUrl - (char *)this;
if ( hsize > 2048 ) { char *xx=NULL;*xx=0; }
gbmemcpy ( xdhead , (char *)this , hsize );
// override spider time in case we had error to be consistent
// with the actual SpiderReply record
//xd->m_spideredTime = reply->m_spideredTime;
//xd->m_spideredTimeValid = true;
// sanity
//if ( reply->m_spideredTime != m_spideredTime ) {char *xx=NULL;*xx=0;}
// this will cause the maroon box next to the search result to
// say "STATUS" similar to "PDF" "DOC" etc.
xd->m_contentType = CT_STATUS;
int32_t fullsize = &m_dummyEnd - (char *)this;
if ( fullsize > 2048 ) { char *xx=NULL;*xx=0; }
/*
// the ptr_* were all zero'd out, put the ones we want to keep back in
SafeBuf tmp;
// was "Spider Status: %s" but that is unnecessary
tmp.safePrintf("<title>%s</title>",
mstrerror(m_indexCode));
// if we are a dup...
if ( m_indexCode == EDOCDUP )
tmp.safePrintf("Dup of docid %"INT64"<br>", m_docIdWeAreADupOf );
if ( m_redirUrlPtr && m_redirUrlValid )
tmp.safePrintf("Redirected to %s<br>",m_redirUrlPtr->getUrl());
*/
// put stats like we log out from logIt
//tmp.safePrintf("<div style=max-width:800px;>\n");
// store log output into doc
//logIt(&tmp);
//tmp.safePrintf("\n</div>");
// the content is just the title tag above
// xd->ptr_utf8Content = tmp.getBufStart();
// xd->size_utf8Content = tmp.length()+1;
xd->ptr_utf8Content = jd.getBufStart();
xd->size_utf8Content = jd.length()+1;
// keep the same url as the doc we are the spider reply for
xd->ptr_firstUrl = ptr_firstUrl;
xd->size_firstUrl = size_firstUrl;
// serps need site, otherwise search results core
xd->ptr_site = ptr_site;
xd->size_site = size_site;
// if this is null then ip lookup failed i guess so just use
// the subdomain
if ( ! ptr_site && m_firstUrlValid ) {
xd->ptr_site = m_firstUrl.getHost();
xd->size_site = m_firstUrl.getHostLen();
}
// use the same uh48 of our parent
int64_t uh48 = m_firstUrl.getUrlHash48();
// then make into a titlerec but store in metalistbuf, not m_titleRec
SafeBuf titleRecBuf;
// this should not include ptrs that are NULL when compressing
// using its m_internalFlags1
if ( ! xd->setTitleRecBuf( &titleRecBuf,*uqd,uh48 ) )
return NULL;
// concat titleRec to our posdb key records
if ( ! m_spiderStatusDocMetaList.pushChar((char)RDB_TITLEDB) )
return NULL;
if ( ! m_spiderStatusDocMetaList.cat(titleRecBuf) )
return NULL;
// return the right val
m_dist = savedDist;
// ok, good to go, ready to add to posdb and titledb
m_spiderStatusDocMetaListValid = true;
return &m_spiderStatusDocMetaList;
}
bool XmlDoc::setSpiderStatusDocMetaList ( SafeBuf *jd , int64_t uqd ) {
// the posdb table
HashTableX tt4;
if ( !tt4.set(18,4,256,NULL,0,false,m_niceness,"posdb-spindx"))
return NULL;
return false;
Json jp2;
if (! jp2.parseJsonStringIntoJsonItems ( jd.getBufStart(),m_niceness)){
if (! jp2.parseJsonStringIntoJsonItems (jd->getBufStart(),m_niceness)){
g_errno = EBADJSONPARSER;
return NULL;
return false;
}
// BEFORE ANY HASHING
int32_t savedDist = m_dist;
// re-set to 0
m_dist = 0;
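
A hedged illustration of the xdhead trick used in getSpiderStatusDocMetaList2 above (toy struct and sizes, not the real XmlDoc layout): copy only the flat members that sit before the first ptr_ field into a zeroed stack buffer, then re-point the handful of ptr_/size_ fields the fake status doc actually needs.

// sketch under assumed layout; mirrors the buffer-cast hack, not production code
#include <cstddef>
#include <cstring>

struct Doc {
	long  m_docId;
	char  m_contentType;
	// ---- pointer section starts here; everything above is the "head" ----
	const char *ptr_firstUrl;
	int         size_firstUrl;
};

int main () {
	Doc src = { 1234 , 'S' , "http://example.com/" , 20 };
	alignas(Doc) char head[64];
	memset ( head , 0 , sizeof(head) );
	size_t hsize = offsetof ( Doc , ptr_firstUrl );  // bytes before first ptr_
	memcpy ( head , &src , hsize );                  // clone just the header part
	Doc *fake = (Doc *)head;
	fake->ptr_firstUrl  = src.ptr_firstUrl;          // keep the parent's url
	fake->size_firstUrl = src.size_firstUrl;
	return fake->m_docId == 1234 ? 0 : 1;
}
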
@@ -27859,7 +28033,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
*/
// store keys in safebuf then to make our own meta list
addTable144 ( &tt4 , *uqd , &m_spiderStatusDocMetaList );
addTable144 ( &tt4 , uqd , &m_spiderStatusDocMetaList );
// debug this shit
//SafeBuf tmpsb;
@@ -27868,97 +28042,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
// &tmpsb );
//logf(LOG_DEBUG,"%s\n",tmpsb.getBufStart());
// now make the titlerec
char xdhead[2048];
// just the head of it. this is the hacky part.
XmlDoc *xd = (XmlDoc *)xdhead;
// clear it out
memset ( xdhead, 0 , 2048);
// copy stuff from THIS so the spider reply "document" has the same
// header info stuff
int32_t hsize = (char *)&ptr_firstUrl - (char *)this;
if ( hsize > 2048 ) { char *xx=NULL;*xx=0; }
gbmemcpy ( xdhead , (char *)this , hsize );
// override spider time in case we had error to be consistent
// with the actual SpiderReply record
//xd->m_spideredTime = reply->m_spideredTime;
//xd->m_spideredTimeValid = true;
// sanity
//if ( reply->m_spideredTime != m_spideredTime ) {char *xx=NULL;*xx=0;}
// this will cause the maroon box next to the search result to
// say "STATUS" similar to "PDF" "DOC" etc.
xd->m_contentType = CT_STATUS;
int32_t fullsize = &m_dummyEnd - (char *)this;
if ( fullsize > 2048 ) { char *xx=NULL;*xx=0; }
/*
// the ptr_* were all zero'd out, put the ones we want to keep back in
SafeBuf tmp;
// was "Spider Status: %s" but that is unnecessary
tmp.safePrintf("<title>%s</title>",
mstrerror(m_indexCode));
// if we are a dup...
if ( m_indexCode == EDOCDUP )
tmp.safePrintf("Dup of docid %"INT64"<br>", m_docIdWeAreADupOf );
if ( m_redirUrlPtr && m_redirUrlValid )
tmp.safePrintf("Redirected to %s<br>",m_redirUrlPtr->getUrl());
*/
// put stats like we log out from logIt
//tmp.safePrintf("<div style=max-width:800px;>\n");
// store log output into doc
//logIt(&tmp);
//tmp.safePrintf("\n</div>");
// the content is just the title tag above
// xd->ptr_utf8Content = tmp.getBufStart();
// xd->size_utf8Content = tmp.length()+1;
xd->ptr_utf8Content = jd.getBufStart();
xd->size_utf8Content = jd.length()+1;
// keep the same url as the doc we are the spider reply for
xd->ptr_firstUrl = ptr_firstUrl;
xd->size_firstUrl = size_firstUrl;
// serps need site, otherwise search results core
xd->ptr_site = ptr_site;
xd->size_site = size_site;
// if this is null then ip lookup failed i guess so just use
// the subdomain
if ( ! ptr_site && m_firstUrlValid ) {
xd->ptr_site = m_firstUrl.getHost();
xd->size_site = m_firstUrl.getHostLen();
}
// use the same uh48 of our parent
int64_t uh48 = m_firstUrl.getUrlHash48();
// then make into a titlerec but store in metalistbuf, not m_titleRec
SafeBuf titleRecBuf;
// this should not include ptrs that are NULL when compressing
// using its m_internalFlags1
if ( ! xd->setTitleRecBuf( &titleRecBuf,*uqd,uh48 ) )
return NULL;
// concat titleRec to our posdb key records
if ( ! m_spiderStatusDocMetaList.pushChar((char)RDB_TITLEDB) )
return NULL;
if ( ! m_spiderStatusDocMetaList.cat(titleRecBuf) )
return NULL;
// return the right val
m_dist = savedDist;
// ok, good to go, ready to add to posdb and titledb
m_spiderStatusDocMetaListValid = true;
return &m_spiderStatusDocMetaList;
return true;
}
// returns false and sets g_errno on error

@@ -509,6 +509,7 @@ class XmlDoc {
SafeBuf *getSpiderStatusDocMetaList ( class SpiderReply *reply ,
bool forDelete ) ;
SafeBuf *getSpiderStatusDocMetaList2 ( class SpiderReply *reply ) ;
bool setSpiderStatusDocMetaList ( SafeBuf *jd , int64_t ssDocId ) ;
SafeBuf m_spiderStatusDocMetaList;
char *getIsAdult ( ) ;
int32_t **getIndCatIds ( ) ;

@@ -6618,6 +6618,7 @@ void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeT
"ch32=%010"UINT32" "
"clen=%07"INT32" "
"cs=%04d "
"ctype=%s "
"lang=%02d "
"sni=%03"INT32" "
//"cats=%"INT32" "
@@ -6642,6 +6643,7 @@ void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeT
xd->m_contentHash32,
xd->size_utf8Content,//tr.getContentLen() ,
xd->m_charset,//tr.getCharset(),
g_contentTypeStrings[xd->m_contentType],
xd->m_langId,//tr.getLanguage(),
(int32_t)xd->m_siteNumInlinks,//tr.getDocQuality(),
//nc,