Support for gbcontenthash:xxxxx queries, used for

exact-match deduping. The page with the highest site rank
wins; on ties, the lowest docid wins.
This commit is contained in:
Matt Wells
2013-11-04 13:47:13 -08:00
parent d78413a6c0
commit 8c9d5d824b
5 changed files with 168 additions and 53 deletions

@ -3031,6 +3031,7 @@ struct QueryField g_fields[] = {
{"gbgigabitvector", FIELD_GBGIGABITVECTOR, false,""},
{"gbsamplevector", FIELD_GBSAMPLEVECTOR, false,""},
{"gbcontenthash", FIELD_GBCONTENTHASH, false,""},
{"gbcountry",FIELD_GBCOUNTRY,false,""},
{"gbad",FIELD_GBAD,false,""},
@ -3676,6 +3677,7 @@ bool QueryTerm::isSplit() {
if(m_fieldCode == FIELD_GBGIGABITVECTOR) return false;
if(m_fieldCode == FIELD_GBSAMPLEVECTOR) return false;
if(m_fieldCode == FIELD_GBSECTIONHASH) return false;
if(m_fieldCode == FIELD_GBCONTENTHASH) return false;
return true;
}

@ -103,6 +103,7 @@ typedef unsigned long long qvec_t;
#define FIELD_GBCSENUM 50
#define FIELD_GBSECTIONHASH 51
#define FIELD_GBDOCID 52
#define FIELD_GBCONTENTHASH 53
#define FIELD_GBOTHER 92

@ -10100,6 +10100,8 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
sendNotificationForCollRec ( cr );
}
#define SPIDER_DONE_TIMER 10
void handleRequestc1 ( UdpSlot *slot , long niceness ) {
char *request = slot->m_readBuf;
// just a single collnum
@ -10165,7 +10167,8 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
ci->m_lastSpiderCouldLaunch &&
//cr->m_spideringEnabled &&
//g_conf.m_spideringEnabled &&
ci->m_lastSpiderAttempt - ci->m_lastSpiderCouldLaunch > 60 )
ci->m_lastSpiderAttempt - ci->m_lastSpiderCouldLaunch >
(long)SPIDER_DONE_TIMER )
// assume our crawl on this host is completed i guess
ci->m_hasUrlsReadyToSpider = 0;

@ -7609,8 +7609,7 @@ bool isSimilar_sorted ( long *vec0 ,
goto mergeLoop;
}
uint64_t *XmlDoc::getDupHash ( ) {
uint64_t *XmlDoc::getFuzzyDupHash ( ) {
if ( m_dupHashValid ) return &m_dupHash;
uint32_t *h1 = getTagPairHash32();
@ -7626,30 +7625,66 @@ uint64_t *XmlDoc::getDupHash ( ) {
return &m_dupHash;
}
// . compute a 64-bit hash over the page's entire utf-8 content
// . used for exact-match content deduping (the "gbcontenthash" term)
// . caches the result in m_exactContentHash64; returns a pointer to it
// . returns NULL on error, -1 if it would block (standard XmlDoc idiom)
// NOTE(review): diff residue — the line immediately below the new
// signature is the removed old getDupList signature from this hunk.
long long *XmlDoc::getExactContentHash64 ( ) {
IndexList *XmlDoc::getDupList ( ) {
// return the cached hash if we already computed it
if ( m_exactContentHash64Valid )
return &m_exactContentHash64;
// get the document content as utf-8; may be NULL (error) or -1 (blocked)
char **u8 = getUtf8Content();
if ( ! u8 || u8 == (char **)-1) return (long long *)u8;
unsigned char *p = (unsigned char *)*u8;
long plen = size_utf8Content;
// size_utf8Content counts the terminating \0; exclude it from the hash
if ( plen > 0 ) plen--;
// sanity
// no content buffer: 0LL converts to a NULL return here
if ( ! p ) return 0LL;
// content must be NUL-terminated; crash deliberately if not
if ( p[plen] != '\0' ) { char *xx=NULL;*xx=0; }
unsigned char *pend = (unsigned char *)p + plen;
unsigned long long h64 = 0LL;
// pos is an unsigned char so it wraps at 256; g_hashtab is indexed
// by (byte position mod 256, byte value)
unsigned char pos = 0;
for ( ; p < pend ; p++ ) {
// breathe
QUICKPOLL ( m_niceness );
// xor this in right
h64 ^= g_hashtab[pos][p[0]];
pos++;
}
// cache the result and mark it valid
m_exactContentHash64Valid = true;
m_exactContentHash64 = h64;
return &m_exactContentHash64;
}
RdbList *XmlDoc::getDupList ( ) {
if ( m_dupListValid ) return &m_dupList;
// until we start using posdb and not indexdb, just return an
// empty list.
// TODO: MDW fix the deduping.
m_dupList.reset();
m_dupListValid = true;
return &m_dupList;
//m_dupList.reset();
//m_dupListValid = true;
//return &m_dupList;
//
// end temp hack
//
uint64_t *dh = getDupHash ( );
if ( ! dh || dh == (uint64_t *)-1 ) return (IndexList *)dh;
//uint64_t *dh = getDupHash ( );
//if ( ! dh || dh == (uint64_t *)-1 ) return (IndexList *)dh;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
long long *ph64 = getExactContentHash64();
if ( ! ph64 || ph64 == (void *)-1 ) return (RdbList *)ph64;
// must match term in XmlDoc::hashVectors()
char qbuf[256];
snprintf(qbuf, 256, "%llu",*dh);
uint64_t pre = hash64b ( "gbduphash" , 0LL );
snprintf(qbuf, 256, "%llu",*ph64);
uint64_t pre = hash64b ( "gbcontenthash" , 0LL );
uint64_t termId = hash64b ( qbuf , pre );
// get the startkey, endkey for termlist
key144_t sk ;
@ -7664,17 +7699,32 @@ IndexList *XmlDoc::getDupList ( ) {
0 , // port
0 , // maxCacheAge
false , // add to cache?
RDB_INDEXDB ,
RDB_POSDB, // INDEXDB ,
cr->m_coll ,
&m_dupList ,
(char *)&sk ,
(char *)&ek ,
306 , // minRecSizes in bytes
606006 , // minRecSizes in bytes
m_masterState , // state
m_masterLoop ,
m_niceness ))
m_niceness ,
true , // error correction?
true , // include tree?
true , // domerge?
-1 , // firsthosti
0 , // startfilenum
-1, // # files
30 , // timeout
-1 , // syncpoint
-1 , // preferlocal reads
NULL, // msg5
NULL, // msg5b
false , // isRealMerge
true , // allow page cache
false , // forcelocalindexdb
true ) ) // NOSPLIT! THIS IS DIFFERENT
// return -1 if this blocks
return (IndexList *)-1;
return (RdbList *)-1;
// assume valid!
m_dupListValid = true;
return &m_dupList;
@ -7701,47 +7751,83 @@ char *XmlDoc::getIsDup ( ) {
long long *mydocid = getDocId();
if ( ! mydocid || mydocid == (long long *)-1) return (char *)mydocid;
// get the duplist!
IndexList *list = getDupList();
if ( ! list || list == (IndexList *)list ) return (char *)list;
RdbList *list = getDupList();
if ( ! list || list == (RdbList *)-1 ) return (char *)list;
// sanity. must be posdb list.
if ( ! list->isEmpty() && list->m_ks != 18 ) { char *xx=NULL;*xx=0;}
setStatus ( "checking for dups" );
// . see if there are any pages that seem like they are dups of us
// . they must also have a HIGHER score than us, for us to be
// considered the dup
if ( ! m_didQuickDupCheck ) {
// do not repeat
m_didQuickDupCheck = true;
// init
uint8_t maxScore = 0;
uint8_t myScore = 0;
// assume not a dup
m_isDup = false;
// get the docid that we are a dup of
for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
// get the docid
long long d = list->getCurrentDocId();
// get the score
uint8_t score = list->getCurrentScore();
// skip if us!
if ( d == *getDocId() ) {
// record our score
myScore = score;
continue;
}
// get the winner
if ( score > maxScore ) maxScore = score;
//if ( ! m_didQuickDupCheck ) {
// // do not repeat
// m_didQuickDupCheck = true;
// init
//uint8_t maxScore = 0;
//uint8_t myScore = 0;
char maxSiteRank = -1;
long long maxDocId = -1LL;
// assume not a dup
m_isDup = false;
// get the docid that we are a dup of
for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
// breathe
QUICKPOLL(m_niceness);
//long long d = list->getCurrentDocId();
char *rec = list->getCurrentRec();
// get the docid
long long d = g_posdb.getDocId ( rec );
// get the score
//uint8_t score = list->getCurrentScore();
// just let the best site rank win i guess?
// even though one page may have more inlinks???
char sr = (char )g_posdb.getSiteRank ( rec );
// skip if us!
//if ( d == *getDocId() ) {
// // record our score
// //myScore = score;
// mySiteRank = sr;
// continue;
//}
// get the winner
//if ( score > maxScore ) maxScore = score;
if ( sr > maxSiteRank ) {
maxSiteRank = sr;
maxDocId = d;
continue;
}
// reset its ptr for stuff below
list->resetListPtr();
// are we the highest scoring doc with this template?
// corollary: if all dups have equal scores they will be
// removed until there is only one doc that matches the pattern
if ( myScore >= maxScore ) {
m_isDupValid = true;
return &m_isDup;
if ( sr < maxSiteRank ) continue;
// fallback to docid?
if ( d < maxDocId ) {
maxDocId = d;
continue;
}
}
// are we the highest scoring doc with this template?
// corollary: if all dups have equal scores they will be
// removed until there is only one doc that matches the pattern
//if ( myScore >= maxScore ) {
if ( maxDocId >= 0 && maxDocId != *mydocid ) {
m_isDup = true;
m_isDupValid = true;
return &m_isDup;
}
m_isDup = false;
m_isDupValid = true;
return &m_isDup;
/*
we now temporarily at least, do exact dup checking...
later we will bring in the fuzzy code...
// reset its ptr for stuff below
list->resetListPtr();
loop:
// . get a title rec for the current docid
// . but if exhausted, we are not a dup!
@ -7759,6 +7845,7 @@ char *XmlDoc::getIsDup ( ) {
list->skipCurrentRecord();
// loop up
goto loop;
*/
}
char *XmlDoc::isDupOfUs ( long long d ) {
@ -15773,8 +15860,6 @@ char **XmlDoc::getUtf8Content ( ) {
return &ptr_utf8Content;
}
// *pend should be \0
long getContentHash32Fast ( unsigned char *p ,
long plen ,
@ -22377,6 +22462,10 @@ bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
//if ( m_skipIndexing ) return true;
// this should be ready to go and not block!
long long *pch64 = getExactContentHash64();
if ( ! pch64 || pch64 == (void *)-1 ) { char *xx=NULL;*xx=0; }
// shortcut
Url *fu = getFirstUrl();
@ -22397,6 +22486,19 @@ bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
// desc is NULL, prefix will be used as desc
if ( ! hashString ( dom,dlen,&hi ) ) return false;
// for exact content deduping
setStatus ( "hashing gbcontenthash (deduping) no-split keys" );
char hbuf[64];
sprintf(hbuf,"%llu",*pch64);
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbcontenthash";
hi.m_tt = tt;
hi.m_noSplit = true;
if ( ! hashString ( dom,dlen,&hi ) ) return false;
setStatus ( "hashing no-split qhost keys" );
char *host = fu->getHost ();
@ -24627,6 +24729,9 @@ bool XmlDoc::hashVectors ( HashTableX *tt ) {
//uint64_t h1 = m_tagVector.getVectorHash();
//uint64_t h2 = getGigabitVectorScorelessHash(gigabitVec);
//uint64_t h64 = hash64 ( h1 , h2 );
// take this out for now
/*
uint64_t *dh = getDupHash ( );
blen = sprintf(buf,"%llu", *dh );//h64);
//field = "gbduphash";
@ -24636,6 +24741,7 @@ bool XmlDoc::hashVectors ( HashTableX *tt ) {
hi.m_desc = "dup vector hash";
// this returns false on failure
if ( ! hashString ( buf,blen,&hi ) ) return false;
*/
// hash the wikipedia docids we match
if ( ! m_wikiDocIdsValid ) { char *xx=NULL;*xx=0; }

@ -40,7 +40,7 @@
#include "SearchInput.h"
#include "Msg40.h"
#include "Dates.h"
#include "IndexList.h"
//#include "IndexList.h"
#include "Msg0.h"
#include "Msg22.h"
#include "Tagdb.h"
@ -550,8 +550,9 @@ class XmlDoc {
float *getGigabitSimilarity ( class XmlDoc *xd2 ) ;
float *getPageSimilarity ( class XmlDoc *xd2 ) ;
float *getPercentChanged ( );
uint64_t *getDupHash ( );
class IndexList *getDupList ( ) ;
uint64_t *getFuzzyDupHash ( );
long long *getExactContentHash64();
class RdbList *getDupList ( ) ;
class RdbList *getLikedbListForReq ( );
class RdbList *getLikedbListForIndexing ( );
long addLikedbRecords ( bool justGetSize ) ;
@ -1308,6 +1309,7 @@ class XmlDoc {
bool m_isErrorPageValid;
bool m_isHijackedValid;
bool m_dupHashValid;
bool m_exactContentHash64Valid;
// shadows
char m_isRSS2;
@ -1358,9 +1360,10 @@ class XmlDoc {
float m_percentChanged;
bool m_unchanged;
// what docids are similar to us? docids are in this list
IndexList m_dupList;
RdbList m_dupList;
RdbList m_likedbList;
uint64_t m_dupHash;
long long m_exactContentHash64;
Msg0 m_msg0;
Msg5 m_msg5;
char m_isDup;