fix for sitenuminlinks logic so we

do not overwrite global-index imported site pop info in tagdb
2025-07-15 02:36:08 -04:00 · 2014-09-26 13:36:12 -07:00
parent 89fb0a9866
commit c85df203a0
1 changed files with 8 additions and 1 deletions
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -12478,6 +12478,8 @@ long *XmlDoc::getSiteNumInlinks ( ) {
 		maxAge *= 3600*24;
 		// so youtube which has 2997 links will add an extra 29 days
 		maxAge += (sni / 100) * 86400;
+		// hack for global index: force age to 0 so imported siteinlinks are never invalidated
+		if ( strcmp(cr->m_coll,"GLOBAL-INDEX") == 0 ) age = 0;
 		// invalidate for that as wel
 		if ( age > maxAge ) valid = false;
 	}
@ -36501,7 +36503,12 @@ SafeBuf *XmlDoc::getNewTagBuf ( ) {
 	// if running for diffbot crawlbot then isCustomCrawl is true
 	// so do not update the siteinlink info already in tagdb since i 
 	// imported it from my main collection. we do not want to overwrite it.
-	if ( cr->m_isCustomCrawl ) goto skipSiteInlinks;
+	// NO, because for single site crawls we bottleneck on msg25
+	// when there are millions of urls. we only skip this
+	// for the global-index and if already in tagdb!
+	// No, let's just not invalidate the sitenuminlinks* tags
+	// in XmlDoc::getSiteNumInlinks()
+	//if ( strcmp(cr->m_coll,"GLOBAL-INDEX") == 0 ) goto skipSiteInlinks;

 	// sitenuminlinksfresh
 	old2 = gr->getLong("sitenuminlinksuniqueip",-1,NULL,&timestamp);