Merge branch 'diffbot-testing' into testing

This commit is contained in:
Matt 2015-01-31 15:35:02 -07:00
commit e8948cea65
9 changed files with 2678052 additions and 23 deletions

@@ -183,6 +183,9 @@ char *g_files[] = {
"wiktionary-lang.txt",
"wiktionary-syns.dat",
// gives us siteranks for the most popular sites:
"sitelinks.txt",
"unifiedDict.txt",
//"unifiedDict-buf.txt",
//"unifiedDict-map.dat",

Tagdb.cpp (231 changed lines)

@@ -1776,6 +1776,8 @@ Tagdb g_tagdb2;
// reset rdb and Xmls
void Tagdb::reset() {
m_rdb.reset();
m_siteBuf1.purge();
m_siteBuf2.purge();
//s_lockTable2.reset();
}
@@ -4945,3 +4947,232 @@ int32_t Tag::getDedupHash ( ) {
return dh;
}
// make sure sizeof(Entry2)=5 not 8!
#pragma pack(1)
class Entry1 {
public:
uint32_t m_hostHash32;
uint32_t m_siteNumInlinksUniqueCBlock;
};
class Entry2 {
public:
uint32_t m_hostHash32;
uint8_t m_siteNumInlinksUniqueCBlock;
};
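// a sketch, not part of this commit: a compile-time guard would catch
// the padding regression that the "sizeof(Entry2)=5 not 8" comment above
// warns about. static_assert is C++11; older toolchains can get the same
// effect from a negative-size-array trick.
static_assert ( sizeof(Entry1) == 8 , "Entry1 must stay 8 bytes" );
static_assert ( sizeof(Entry2) == 5 , "Entry2 must stay 5 bytes" );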
static int linkSort1Cmp ( const void *a, const void *b ) {
Entry1 *ea = (Entry1 *)a;
Entry1 *eb = (Entry1 *)b;
if ( ea->m_hostHash32 > eb->m_hostHash32 ) return 1;
if ( ea->m_hostHash32 < eb->m_hostHash32 ) return -1;
return 0;
}
static int linkSort2Cmp ( const void *a, const void *b ) {
Entry2 *ea = (Entry2 *)a;
Entry2 *eb = (Entry2 *)b;
if ( ea->m_hostHash32 > eb->m_hostHash32 ) return 1;
if ( ea->m_hostHash32 < eb->m_hostHash32 ) return -1;
return 0;
}
bool Tagdb::loadMinSiteInlinksBuffer ( ) {
if ( ! loadMinSiteInlinksBuffer2() ) return false;
// sanity testing
uint32_t hostHash32 = hash32n("www.imdb.com");
int32_t msi = getMinSiteInlinks ( hostHash32 );
if ( msi < 10 ) {
log("tagdb: bad siteinlinks. linkedin.com not found.");
//return false;
}
hostHash32 = hash32n("0009.org" );
msi = getMinSiteInlinks ( hostHash32 );
if ( msi < 0 ) {
log("tagdb: bad siteinlinks. 0009.org not found.");
//return false;
}
Url tmp;
tmp.set("gnu.org");
hostHash32 = tmp.getHash32WithWWW();
msi = getMinSiteInlinks ( hostHash32 );
if ( msi < 0 ) {
log("tagdb: bad siteinlinks. www.gnu.org not found.");
//return false;
}
return true;
}
bool Tagdb::loadMinSiteInlinksBuffer2 ( ) {
// use 4 bytes for the first 130,000 entries or so to hold the
// # of site inlinks. then we only need 1 byte each since the remaining
// 25M entries are <256 sitenuminlinksuniquecblocks
m_siteBuf1.load("sitelinks1.dat");
m_siteBuf2.load("sitelinks2.dat");
m_siteBuf1.setLabel("sitelnks");
m_siteBuf2.setLabel("sitelnks");
if ( m_siteBuf1.length() > 0 &&
m_siteBuf2.length() > 0 )
return true;
log("gb: loading ./sitelinks.txt");
// ok, make it
SafeBuf tmp;
tmp.load("./sitelinks.txt");
if ( tmp.length() <= 0 ) {
log("gb: fatal error. could not find required file "
"./sitelinks.txt");
return false;
}
log("gb: starting initial creation of sitelinks1.dat and "
"sitelinks2.dat files");
// now parse each line in that
char *p = tmp.getBufStart();
char *pend = p + tmp.length();
char *newp = NULL;
SafeBuf buf1;
SafeBuf buf2;
int32_t count = 0;
for ( ; p < pend ; p = newp ) {
if ( ++count % 1000000 == 0 )
log("gb: parsing line # %"INT32,count);
// advance to next line
newp = p;
for ( ; newp < pend && *newp != '\n' ; newp++ );
if ( newp < pend ) newp++;
// parse this line
int32_t numLinks = atoi(p);
// skip number
for ( ; *p && *p != ' ' && *p != '\n' ; p++ );
// strange
if ( ! *p || *p == '\n' ) continue;
// skip spaces
for ( ; *p == ' ' ; p++ );
// get hostname
char *host = p;
// find end of it
for ( ; *p && *p != '\n' && *p != ' ' && *p != '\t' ; p++ );
// hash it
uint32_t hostHash32 = hash32 ( host , p - host );
// store in buffer
if ( numLinks >= 256 ) {
Entry1 e1;
e1.m_siteNumInlinksUniqueCBlock = numLinks;
e1.m_hostHash32 = hostHash32;
buf1.safeMemcpy ( &e1 , sizeof(Entry1) );
}
else {
Entry2 e2;
e2.m_siteNumInlinksUniqueCBlock = numLinks;
e2.m_hostHash32 = hostHash32;
buf2.safeMemcpy ( &e2 , sizeof(Entry2) );
}
}
log("gb: sorting sitelink data");
// now sort each one
qsort ( buf1.getBufStart() ,
buf1.length()/sizeof(Entry1),
sizeof(Entry1),
linkSort1Cmp );
qsort ( buf2.getBufStart() ,
buf2.length()/sizeof(Entry2),
sizeof(Entry2),
linkSort2Cmp );
// now copy to the official buffer so we only alloc what we need
m_siteBuf1.safeMemcpy ( &buf1 );
m_siteBuf2.safeMemcpy ( &buf2 );
log("gb: saving sitelinks1.dat and sitelinks2.dat");
m_siteBuf1.save("./sitelinks1.dat");
m_siteBuf2.save("./sitelinks2.dat");
return true;
}
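// a note on the input the parser above assumes: each sitelinks.txt line
// leads with the inlink count, then whitespace, then the hostname, e.g.
// an illustrative line (not taken from the shipped file):
//
//   523 www.gnu.org
//
// the two-buffer split pays off because only ~130,000 hosts have >=256
// inlinks: ~25M 5-byte Entry2 records cost ~125MB, versus ~200MB if
// every record carried a 4-byte count like Entry1.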
int32_t Tagdb::getMinSiteInlinks ( uint32_t hostHash32 ) {
if ( m_siteBuf1.length() <= 0 ) {
log("tagdb: load not called");
char *xx=NULL;*xx=0;
}
// first check buf1 doing bstep
int32_t ne = m_siteBuf1.length() / sizeof(Entry1);
Entry1 *ep = (Entry1 *)m_siteBuf1.getBufStart();
Entry2 *fp = NULL;
int32_t i = ne / 2;
int32_t step = ne / 2;
int32_t count = 0;
loop1:
if ( i < 0 ) i = 0;
if ( i >= ne ) i = ne-1;
// after 3 single steps if no hit, try next hosthash buf
if ( count == 3 ) goto tryNextBuf;
step /= 2;
if ( step == 0 ) {
step = 1;
count++;
}
if ( hostHash32 < ep[i].m_hostHash32 ) {
i -= step;
goto loop1;
}
if ( hostHash32 > ep[i].m_hostHash32 ) {
i += step;
goto loop1;
}
return ep[i].m_siteNumInlinksUniqueCBlock;
tryNextBuf:
// reset parms
ne = m_siteBuf2.length() / sizeof(Entry2);
fp = (Entry2 *)m_siteBuf2.getBufStart();
i = ne / 2;
step = ne / 2;
count = 0;
loop2:
if ( i < 0 ) i = 0;
if ( i >= ne ) i = ne-1;
// after 3 single steps if no hit, that's it...
if ( count == 3 ) return -1;
step /= 2;
if ( step == 0 ) {
step = 1;
count++;
}
if ( hostHash32 < fp[i].m_hostHash32 ) {
i -= step;
goto loop2;
}
if ( hostHash32 > fp[i].m_hostHash32 ) {
i += step;
goto loop2;
}
return fp[i].m_siteNumInlinksUniqueCBlock;
}
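Since both buffers are qsort'ed on m_hostHash32, the hand-rolled stepping search above can also be expressed with the libc binary search, reusing the sort comparators as-is. A minimal sketch for the 4-byte buffer, assuming the buffers have been loaded (stdlib.h provides bsearch; the Entry2 buffer would be probed the same way with linkSort2Cmp):

Entry1 key;
key.m_hostHash32 = hostHash32;
key.m_siteNumInlinksUniqueCBlock = 0; // ignored by the comparator
Entry1 *hit = (Entry1 *)bsearch ( &key ,
                                  m_siteBuf1.getBufStart() ,
                                  m_siteBuf1.length() / sizeof(Entry1) ,
                                  sizeof(Entry1) ,
                                  linkSort1Cmp );
if ( hit ) return hit->m_siteNumInlinksUniqueCBlock;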

@@ -373,6 +373,12 @@ class Tagdb {
DiskPageCache m_pc;
bool loadMinSiteInlinksBuffer ( );
bool loadMinSiteInlinksBuffer2 ( );
int32_t getMinSiteInlinks ( uint32_t hostHash32 ) ;
SafeBuf m_siteBuf1;
SafeBuf m_siteBuf2;
};
// derive this from tagdb

@@ -1740,6 +1740,12 @@ int32_t Url::getSiteHash32 ( char *coll ) {
}
*/
int32_t Url::getHash32WithWWW ( ) {
uint32_t hh = hash32n ( "www." );
int32_t conti = 4;
hh = hash32_cont ( m_domain , m_dlen , hh , &conti );
return hh;
}
int32_t Url::getHostHash32 ( ) {
return hash32 ( m_host , m_hlen );
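getHash32WithWWW() seeds a rolling hash with "www." and continues it over the bare domain, so hashing a Url set to "xyz.com" should match hash32n() applied to the full string "www.xyz.com" — this is what the gnu.org sanity check in loadMinSiteInlinksBuffer() relies on. A usage sketch, assuming hash32_cont() continues the same hash that hash32n() computes in one shot:

Url tmp;
tmp.set ( "gnu.org" );
uint32_t withWWW = (uint32_t)tmp.getHash32WithWWW();
uint32_t direct  = hash32n ( "www.gnu.org" );
// expect withWWW == direct, so a sitelinks.txt entry for
// www.gnu.org can be found from the bare domain gnu.org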

Url.h (3 changed lines)

@@ -184,6 +184,9 @@ public:
int32_t getHostHash32 ( ) ;
int32_t getDomainHash32 ( ) ;
// if url is xyz.com then get hash of www.xyz.com
int32_t getHash32WithWWW ( );
int64_t getUrlHash64 ( ) ;
int64_t getHostHash64 ( ) ;
int64_t getDomainHash64 ( ) ;

@@ -12576,6 +12576,7 @@ Addresses *XmlDoc::getAddresses ( ) {
return &m_addresses;
}
/*
int32_t *XmlDoc::getSiteNumInlinksUniqueIp ( ) {
if ( m_siteNumInlinksUniqueIpValid )
return &m_siteNumInlinksUniqueIp;
@@ -12611,6 +12612,7 @@ int32_t *XmlDoc::getSiteNumInlinksTotal ( ) {
// ok we must be valid
return &m_siteNumInlinksTotal;
}
*/
// we need this for setting SpiderRequest::m_parentFirstIp of each outlink
int32_t *XmlDoc::getFirstIp ( ) {
@@ -12652,6 +12654,9 @@ uint8_t *XmlDoc::getSiteNumInlinks8 () {
return &m_siteNumInlinks8;
}
// this is the # of GOOD INLINKS to the site. so it is no more than
// 1 per c block, and it has to pass link spam detection. this is the
// highest-level count of inlinks to the site. use it a lot.
int32_t *XmlDoc::getSiteNumInlinks ( ) {
if ( m_siteNumInlinksValid ) return &m_siteNumInlinks;
@@ -12798,10 +12803,15 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
(PTRTYPE)tag3,
m_firstUrl.m_url);
LinkInfo *sinfo = NULL;
char *mysite = NULL;
// if we are good return it
if ( tag && valid ) {
// set it
m_siteNumInlinks = atol(tag->getTagData());
m_siteNumInlinksValid = true;
// companion tags
if ( tag2 ) {
m_siteNumInlinksUniqueIp = atol(tag2->getTagData());
@@ -12815,9 +12825,10 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
m_siteNumInlinksTotal =atol(tag4->getTagData());
m_siteNumInlinksTotalValid = true;
}
// it is good to go now
m_siteNumInlinksValid = true;
return &m_siteNumInlinks;
// . consult our sitelinks.txt file
// . returns -1 if not found
goto updateToMin;
}
// set status. we can time status changes with this routine!
@@ -12845,7 +12856,7 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
m_updatingSiteLinkInfoTags = true;
// we need to re-get both if either is NULL
LinkInfo *sinfo = getSiteLinkInfo();
sinfo = getSiteLinkInfo();
// block or error?
if ( ! sinfo || sinfo == (LinkInfo *)-1) return (int32_t *)sinfo;
@@ -12859,7 +12870,7 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
//Links *links = getLinks ();
//if ( ! links || links == (Links *)-1 ) return (int32_t *)links;
char *mysite = getSite();
mysite = getSite();
if ( ! mysite || mysite == (void *)-1 ) return (int32_t *)mysite;
setStatus ( "adding site info tags to tagdb 1");
@@ -12881,6 +12892,45 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
m_siteNumInlinksUniqueCBlockValid = true;
m_siteNumInlinksTotalValid = true;
updateToMin:
// . consult our sitelinks.txt file
// . returns -1 if not found
int32_t hostHash32 = getHostHash32a();
int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
// try with www if not there
if ( min < 0 && ! m_firstUrl.hasSubdomain() ) {
int32_t wwwHash32 = m_firstUrl.getHash32WithWWW();
min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
}
if ( min >= 0 ) {
if ( m_siteNumInlinks < min ||
! m_siteNumInlinksValid ) {
m_siteNumInlinks = min;
m_siteNumInlinksValid = true;
}
// if ( ! m_siteNumInlinksUniqueIpValid ||
// m_siteNumInlinksUniqueIp < min ) {
// m_siteNumInlinksUniqueIp = min;
// m_siteNumInlinksUniqueIpValid = true;
// }
// if ( ! m_siteNumInlinksUniqueCBlockValid ||
// m_siteNumInlinksUniqueCBlock < min ) {
// m_siteNumInlinksUniqueCBlock = min;
// m_siteNumInlinksUniqueCBlockValid = true;
// }
// if ( ! m_siteNumInlinksTotalValid ||
// m_siteNumInlinksTotal < min ) {
// m_siteNumInlinksTotal = min;
// m_siteNumInlinksTotalValid = true;
// }
}
// deal with it
return &m_siteNumInlinks;
}
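The consult-sitelinks.txt-then-retry-with-www pattern above is repeated nearly verbatim in addOutlinkSpiderRecsToMetaList() further down. A hypothetical helper, not in this commit, that both call sites could share:

// look up the minimum site inlinks for a host hash, falling back to
// the www-prefixed hash when the url carries no subdomain.
// returns -1 if the host is not in sitelinks.txt at all.
static int32_t getMinSiteInlinksWithWWW ( Url *u , int32_t hostHash32 ) {
	int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
	if ( min < 0 && ! u->hasSubdomain() )
		min = g_tagdb.getMinSiteInlinks ( u->getHash32WithWWW() );
	return min;
}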
@@ -19868,10 +19918,10 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
if ( m_siteNumInlinksValid ) {
sb->safePrintf("siteinlinks=%04"INT32" ",m_siteNumInlinks );
sb->safePrintf("siteipinlinks=%"INT32" ",
m_siteNumInlinksUniqueIp);
sb->safePrintf("sitecblockinlinks=%"INT32" ",
m_siteNumInlinksUniqueCBlock);
// sb->safePrintf("siteipinlinks=%"INT32" ",
// m_siteNumInlinksUniqueIp);
// sb->safePrintf("sitecblockinlinks=%"INT32" ",
// m_siteNumInlinksUniqueCBlock);
int32_t sr = ::getSiteRank ( m_siteNumInlinks );
sb->safePrintf("siterank=%"INT32" ", sr );
}
@@ -25171,6 +25221,21 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
Tag *st = NULL;
if ( gr ) st = gr->getTag ("sitenuminlinks");
if ( st ) ksni = atol(st->getTagData());
int32_t hostHash32 = url.getHostHash32();
// . consult our sitelinks.txt file
// . returns -1 if not found
int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
// try with www if not there
if ( min < 0 && ! url.hasSubdomain() ) {
int32_t wwwHash32 = url.getHash32WithWWW();
min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
}
if ( min >= 0 && ksni < min )
ksni = min;
//if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
//int32_t ksni = m_siteNumInlinks;
@@ -25188,7 +25253,6 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
// get it quick
bool ispingserver = url.isPingServer();
int32_t hostHash32 = url.getHostHash32();
int32_t domHash32 = url.getDomainHash32();
// is link rss?

@@ -641,9 +641,9 @@ class XmlDoc {
//class Url *getAboutUsLink ( ) ;
int32_t *getFirstIp ( ) ;
bool *updateFirstIp ( ) ;
int32_t *getSiteNumInlinksUniqueIp ( ) ;
int32_t *getSiteNumInlinksUniqueCBlock ( ) ;
int32_t *getSiteNumInlinksTotal ( );
//int32_t *getSiteNumInlinksUniqueIp ( ) ;
//int32_t *getSiteNumInlinksUniqueCBlock ( ) ;
//int32_t *getSiteNumInlinksTotal ( );
//int32_t *getSiteNumInlinksFresh ( ) ;
//int32_t *getSitePop ( ) ;
uint8_t *getSiteNumInlinks8 () ;

@@ -3609,6 +3609,10 @@ int main2 ( int argc , char *argv[] ) {
return 1;
}
*/
// init minsitenuminlinks buffer
g_tagdb.loadMinSiteInlinksBuffer();
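// a sketch, not in the commit: the return value is discarded above, so a
// missing sitelinks.txt only surfaces later when getMinSiteInlinks() hits
// its null-dereference guard. startup could fail fast instead:
//
//   if ( ! g_tagdb.loadMinSiteInlinksBuffer() )
//       return 1; // same bail-out convention as the surrounding code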
// . then our main udp server
// . must pass defaults since g_dns uses it's own port/instance of it
// . server should listen to a socket and register with g_loop
@@ -11962,7 +11966,7 @@ void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,
char sbuf[1024*2];
int32_t siteNumInlinks = -1;
int32_t typeSite = hash64Lower_a("site",4);
int32_t typeInlinks = hash64Lower_a("sitenuminlinksuniquecblock",26);
int32_t typeInlinks = hash64Lower_a("sitenuminlinks",14);
loop:
// use msg5 to get the list, should ALWAYS block since no threads
@@ -12035,25 +12039,71 @@ void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,
char tmpBuf[1024];
SafeBuf sb(tmpBuf, 1024);
bool match = false;
hostHash = tag->m_key.n1;
if ( hostHash == lastHostHash ) {
match = true;
}
else {
site = NULL;
siteNumInlinks = -1;
}
lastHostHash = hostHash;
// if ( hostHash == 3079740012919792457LL )
// log("hey");
// making sitelist.txt?
if ( tag->m_type == typeSite ) {
hostHash = tag->m_key.n1;
if ( tag->m_type == typeSite && req == 'z' ) {
site = tag->getTagData();
// make it null if too many .'s
if ( site ) {
char *p = site;
int count = 0;
int alpha = 0;
int colons = 0;
// foo.bar.baz.com is ok
for ( ; *p ; p++ )
for ( ; *p ; p++ ) {
if ( *p == '.' ) count++;
if ( *p == ':' ) colons++;
if ( is_alpha_a(*p) || *p=='-' )
alpha++;
}
if ( count >= 4 )
site = NULL;
if ( colons > 1 )
site = NULL;
// no ip addresses allowed, need an alpha char
if ( alpha == 0 )
site = NULL;
}
// ends in :?
int slen = 0;
if ( site ) slen = gbstrlen(site);
if ( site && site[slen-1] == ':' )
site = NULL;
// port bug
if ( site && site[slen-2] == ':' && site[slen-1]=='/')
site = NULL;
// remove heavy spammers to save space
if ( site && strstr(site,"daily-camshow-report") )
site = NULL;
if ( site && strstr(site,".livejasminhd.") )
site = NULL;
if ( site && strstr(site,".pornlivenews.") )
site = NULL;
if ( site && strstr(site,".isapornblog.") )
site = NULL;
if ( site && strstr(site,".teen-model-24.") )
site = NULL;
if ( site && ! is_ascii2_a ( site, gbstrlen(site) ) ) {
site = NULL;
continue;
}
if ( lastHostHash == hostHash && siteNumInlinks>=0) {
if ( match && siteNumInlinks>=0) {
// if we ask for 1 or 2 we end up with 100M
// entries, but with 3+ we get 27M
if ( siteNumInlinks > 2 && site )
@@ -12063,14 +12113,12 @@ void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,
}
// save it
if ( site ) strcpy ( sbuf , site );
lastHostHash = hostHash;
continue;
}
if ( tag->m_type == typeInlinks ) {
hostHash = tag->m_key.n1;
if ( tag->m_type == typeInlinks && req == 'z' ) {
siteNumInlinks = atoi(tag->getTagData());
if ( lastHostHash == hostHash && site ) {
if ( match && site ) {
// if we ask for 1 or 2 we end up with 100M
// entries, but with 3+ we get 27M
if ( siteNumInlinks > 2 )
@@ -12078,7 +12126,6 @@ void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,
siteNumInlinks = -1;
site = NULL;
}
lastHostHash = hostHash;
continue;
}

sitelinks.txt (new file, 2677669 lines)

File diff suppressed because it is too large