Add support for gbcontenthash:xxxxx for
exact-match deduping. The page with the highest site rank wins; on ties, the lowest docid wins.
This commit is contained in:
@ -3031,6 +3031,7 @@ struct QueryField g_fields[] = {
|
||||
|
||||
{"gbgigabitvector", FIELD_GBGIGABITVECTOR, false,""},
|
||||
{"gbsamplevector", FIELD_GBSAMPLEVECTOR, false,""},
|
||||
{"gbcontenthash", FIELD_GBCONTENTHASH, false,""},
|
||||
{"gbcountry",FIELD_GBCOUNTRY,false,""},
|
||||
{"gbad",FIELD_GBAD,false,""},
|
||||
|
||||
@ -3676,6 +3677,7 @@ bool QueryTerm::isSplit() {
|
||||
if(m_fieldCode == FIELD_GBGIGABITVECTOR) return false;
|
||||
if(m_fieldCode == FIELD_GBSAMPLEVECTOR) return false;
|
||||
if(m_fieldCode == FIELD_GBSECTIONHASH) return false;
|
||||
if(m_fieldCode == FIELD_GBCONTENTHASH) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
1
Query.h
1
Query.h
@ -103,6 +103,7 @@ typedef unsigned long long qvec_t;
|
||||
#define FIELD_GBCSENUM 50
|
||||
#define FIELD_GBSECTIONHASH 51
|
||||
#define FIELD_GBDOCID 52
|
||||
#define FIELD_GBCONTENTHASH 53
|
||||
|
||||
#define FIELD_GBOTHER 92
|
||||
|
||||
|
@ -10100,6 +10100,8 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
|
||||
sendNotificationForCollRec ( cr );
|
||||
}
|
||||
|
||||
#define SPIDER_DONE_TIMER 10
|
||||
|
||||
void handleRequestc1 ( UdpSlot *slot , long niceness ) {
|
||||
char *request = slot->m_readBuf;
|
||||
// just a single collnum
|
||||
@ -10165,7 +10167,8 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
|
||||
ci->m_lastSpiderCouldLaunch &&
|
||||
//cr->m_spideringEnabled &&
|
||||
//g_conf.m_spideringEnabled &&
|
||||
ci->m_lastSpiderAttempt - ci->m_lastSpiderCouldLaunch > 60 )
|
||||
ci->m_lastSpiderAttempt - ci->m_lastSpiderCouldLaunch >
|
||||
(long)SPIDER_DONE_TIMER )
|
||||
// assume our crawl on this host is completed i guess
|
||||
ci->m_hasUrlsReadyToSpider = 0;
|
||||
|
||||
|
202
XmlDoc.cpp
202
XmlDoc.cpp
@ -7609,8 +7609,7 @@ bool isSimilar_sorted ( long *vec0 ,
|
||||
goto mergeLoop;
|
||||
}
|
||||
|
||||
|
||||
uint64_t *XmlDoc::getDupHash ( ) {
|
||||
uint64_t *XmlDoc::getFuzzyDupHash ( ) {
|
||||
|
||||
if ( m_dupHashValid ) return &m_dupHash;
|
||||
uint32_t *h1 = getTagPairHash32();
|
||||
@ -7626,30 +7625,66 @@ uint64_t *XmlDoc::getDupHash ( ) {
|
||||
return &m_dupHash;
|
||||
}
|
||||
|
||||
long long *XmlDoc::getExactContentHash64 ( ) {
|
||||
|
||||
IndexList *XmlDoc::getDupList ( ) {
|
||||
if ( m_exactContentHash64Valid )
|
||||
return &m_exactContentHash64;
|
||||
|
||||
char **u8 = getUtf8Content();
|
||||
if ( ! u8 || u8 == (char **)-1) return (long long *)u8;
|
||||
|
||||
unsigned char *p = (unsigned char *)*u8;
|
||||
|
||||
long plen = size_utf8Content;
|
||||
if ( plen > 0 ) plen--;
|
||||
|
||||
// sanity
|
||||
if ( ! p ) return 0LL;
|
||||
if ( p[plen] != '\0' ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
unsigned char *pend = (unsigned char *)p + plen;
|
||||
unsigned long long h64 = 0LL;
|
||||
unsigned char pos = 0;
|
||||
for ( ; p < pend ; p++ ) {
|
||||
// breathe
|
||||
QUICKPOLL ( m_niceness );
|
||||
// xor this in right
|
||||
h64 ^= g_hashtab[pos][p[0]];
|
||||
pos++;
|
||||
}
|
||||
|
||||
m_exactContentHash64Valid = true;
|
||||
m_exactContentHash64 = h64;
|
||||
return &m_exactContentHash64;
|
||||
}
|
||||
|
||||
|
||||
RdbList *XmlDoc::getDupList ( ) {
|
||||
if ( m_dupListValid ) return &m_dupList;
|
||||
|
||||
// until we start using posdb and not indexdb, just return an
|
||||
// empty list.
|
||||
// TODO: MDW fix the deduping.
|
||||
m_dupList.reset();
|
||||
m_dupListValid = true;
|
||||
return &m_dupList;
|
||||
//m_dupList.reset();
|
||||
//m_dupListValid = true;
|
||||
//return &m_dupList;
|
||||
//
|
||||
// end temp hack
|
||||
//
|
||||
|
||||
uint64_t *dh = getDupHash ( );
|
||||
if ( ! dh || dh == (uint64_t *)-1 ) return (IndexList *)dh;
|
||||
//uint64_t *dh = getDupHash ( );
|
||||
//if ( ! dh || dh == (uint64_t *)-1 ) return (IndexList *)dh;
|
||||
|
||||
CollectionRec *cr = getCollRec();
|
||||
if ( ! cr ) return NULL;
|
||||
|
||||
long long *ph64 = getExactContentHash64();
|
||||
if ( ! ph64 || ph64 == (void *)-1 ) return (RdbList *)ph64;
|
||||
|
||||
// must match term in XmlDoc::hashVectors()
|
||||
char qbuf[256];
|
||||
snprintf(qbuf, 256, "%llu",*dh);
|
||||
uint64_t pre = hash64b ( "gbduphash" , 0LL );
|
||||
snprintf(qbuf, 256, "%llu",*ph64);
|
||||
uint64_t pre = hash64b ( "gbcontenthash" , 0LL );
|
||||
uint64_t termId = hash64b ( qbuf , pre );
|
||||
// get the startkey, endkey for termlist
|
||||
key144_t sk ;
|
||||
@ -7664,17 +7699,32 @@ IndexList *XmlDoc::getDupList ( ) {
|
||||
0 , // port
|
||||
0 , // maxCacheAge
|
||||
false , // add to cache?
|
||||
RDB_INDEXDB ,
|
||||
RDB_POSDB, // INDEXDB ,
|
||||
cr->m_coll ,
|
||||
&m_dupList ,
|
||||
(char *)&sk ,
|
||||
(char *)&ek ,
|
||||
306 , // minRecSizes in bytes
|
||||
606006 , // minRecSizes in bytes
|
||||
m_masterState , // state
|
||||
m_masterLoop ,
|
||||
m_niceness ))
|
||||
m_niceness ,
|
||||
true , // error correction?
|
||||
true , // include tree?
|
||||
true , // domerge?
|
||||
-1 , // firsthosti
|
||||
0 , // startfilenum
|
||||
-1, // # files
|
||||
30 , // timeout
|
||||
-1 , // syncpoint
|
||||
-1 , // preferlocal reads
|
||||
NULL, // msg5
|
||||
NULL, // msg5b
|
||||
false , // isRealMerge
|
||||
true , // allow page cache
|
||||
false , // forcelocalindexdb
|
||||
true ) ) // NOSPLIT! THIS IS DIFFERENT
|
||||
// return -1 if this blocks
|
||||
return (IndexList *)-1;
|
||||
return (RdbList *)-1;
|
||||
// assume valid!
|
||||
m_dupListValid = true;
|
||||
return &m_dupList;
|
||||
@ -7701,47 +7751,83 @@ char *XmlDoc::getIsDup ( ) {
|
||||
long long *mydocid = getDocId();
|
||||
if ( ! mydocid || mydocid == (long long *)-1) return (char *)mydocid;
|
||||
// get the duplist!
|
||||
IndexList *list = getDupList();
|
||||
if ( ! list || list == (IndexList *)list ) return (char *)list;
|
||||
RdbList *list = getDupList();
|
||||
if ( ! list || list == (RdbList *)-1 ) return (char *)list;
|
||||
|
||||
// sanity. must be posdb list.
|
||||
if ( ! list->isEmpty() && list->m_ks != 18 ) { char *xx=NULL;*xx=0;}
|
||||
|
||||
setStatus ( "checking for dups" );
|
||||
|
||||
// . see if there are any pages that seem like they are dups of us
|
||||
// . they must also have a HIGHER score than us, for us to be
|
||||
// considered the dup
|
||||
if ( ! m_didQuickDupCheck ) {
|
||||
// do not repeat
|
||||
m_didQuickDupCheck = true;
|
||||
// init
|
||||
uint8_t maxScore = 0;
|
||||
uint8_t myScore = 0;
|
||||
// assume not a dup
|
||||
m_isDup = false;
|
||||
// get the docid that we are a dup of
|
||||
for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
|
||||
// get the docid
|
||||
long long d = list->getCurrentDocId();
|
||||
// get the score
|
||||
uint8_t score = list->getCurrentScore();
|
||||
// skip if us!
|
||||
if ( d == *getDocId() ) {
|
||||
// record our score
|
||||
myScore = score;
|
||||
continue;
|
||||
}
|
||||
// get the winner
|
||||
if ( score > maxScore ) maxScore = score;
|
||||
//if ( ! m_didQuickDupCheck ) {
|
||||
// // do not repeat
|
||||
// m_didQuickDupCheck = true;
|
||||
|
||||
// init
|
||||
//uint8_t maxScore = 0;
|
||||
//uint8_t myScore = 0;
|
||||
char maxSiteRank = -1;
|
||||
long long maxDocId = -1LL;
|
||||
// assume not a dup
|
||||
m_isDup = false;
|
||||
// get the docid that we are a dup of
|
||||
for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
|
||||
// breathe
|
||||
QUICKPOLL(m_niceness);
|
||||
//long long d = list->getCurrentDocId();
|
||||
char *rec = list->getCurrentRec();
|
||||
// get the docid
|
||||
long long d = g_posdb.getDocId ( rec );
|
||||
// get the score
|
||||
//uint8_t score = list->getCurrentScore();
|
||||
// just let the best site rank win i guess?
|
||||
// even though one page may have more inlinks???
|
||||
char sr = (char )g_posdb.getSiteRank ( rec );
|
||||
// skip if us!
|
||||
//if ( d == *getDocId() ) {
|
||||
// // record our score
|
||||
// //myScore = score;
|
||||
// mySiteRank = sr;
|
||||
// continue;
|
||||
//}
|
||||
// get the winner
|
||||
//if ( score > maxScore ) maxScore = score;
|
||||
if ( sr > maxSiteRank ) {
|
||||
maxSiteRank = sr;
|
||||
maxDocId = d;
|
||||
continue;
|
||||
}
|
||||
// reset its ptr for stuff below
|
||||
list->resetListPtr();
|
||||
// are we the highest scoring doc with this template?
|
||||
// corollary: if all dups have equal scores they will be
|
||||
// removed until there is only one doc that matches the pattern
|
||||
if ( myScore >= maxScore ) {
|
||||
m_isDupValid = true;
|
||||
return &m_isDup;
|
||||
if ( sr < maxSiteRank ) continue;
|
||||
// fallback to docid?
|
||||
if ( d < maxDocId ) {
|
||||
maxDocId = d;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// are we the highest scoring doc with this template?
|
||||
// corollary: if all dups have equal scores they will be
|
||||
// removed until there is only one doc that matches the pattern
|
||||
//if ( myScore >= maxScore ) {
|
||||
if ( maxDocId >= 0 && maxDocId != *mydocid ) {
|
||||
m_isDup = true;
|
||||
m_isDupValid = true;
|
||||
return &m_isDup;
|
||||
}
|
||||
|
||||
m_isDup = false;
|
||||
m_isDupValid = true;
|
||||
return &m_isDup;
|
||||
|
||||
/*
|
||||
we now temporarily at least, do exact dup checking...
|
||||
later we will bring in the fuzzy code...
|
||||
|
||||
// reset its ptr for stuff below
|
||||
list->resetListPtr();
|
||||
|
||||
loop:
|
||||
// . get a title rec for the current docid
|
||||
// . but if exhausted, we are not a dup!
|
||||
@ -7759,6 +7845,7 @@ char *XmlDoc::getIsDup ( ) {
|
||||
list->skipCurrentRecord();
|
||||
// loop up
|
||||
goto loop;
|
||||
*/
|
||||
}
|
||||
|
||||
char *XmlDoc::isDupOfUs ( long long d ) {
|
||||
@ -15773,8 +15860,6 @@ char **XmlDoc::getUtf8Content ( ) {
|
||||
return &ptr_utf8Content;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// *pend should be \0
|
||||
long getContentHash32Fast ( unsigned char *p ,
|
||||
long plen ,
|
||||
@ -22377,6 +22462,10 @@ bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
|
||||
|
||||
//if ( m_skipIndexing ) return true;
|
||||
|
||||
// this should be ready to go and not block!
|
||||
long long *pch64 = getExactContentHash64();
|
||||
if ( ! pch64 || pch64 == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// shortcut
|
||||
Url *fu = getFirstUrl();
|
||||
|
||||
@ -22397,6 +22486,19 @@ bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
|
||||
// desc is NULL, prefix will be used as desc
|
||||
if ( ! hashString ( dom,dlen,&hi ) ) return false;
|
||||
|
||||
|
||||
// for exact content deduping
|
||||
setStatus ( "hashing gbcontenthash (deduping) no-split keys" );
|
||||
char hbuf[64];
|
||||
sprintf(hbuf,"%llu",*pch64);
|
||||
hi.m_hashGroup = HASHGROUP_INTAG;
|
||||
hi.m_prefix = "gbcontenthash";
|
||||
hi.m_tt = tt;
|
||||
hi.m_noSplit = true;
|
||||
if ( ! hashString ( dom,dlen,&hi ) ) return false;
|
||||
|
||||
|
||||
|
||||
setStatus ( "hashing no-split qhost keys" );
|
||||
|
||||
char *host = fu->getHost ();
|
||||
@ -24627,6 +24729,9 @@ bool XmlDoc::hashVectors ( HashTableX *tt ) {
|
||||
//uint64_t h1 = m_tagVector.getVectorHash();
|
||||
//uint64_t h2 = getGigabitVectorScorelessHash(gigabitVec);
|
||||
//uint64_t h64 = hash64 ( h1 , h2 );
|
||||
|
||||
// take this out for now
|
||||
/*
|
||||
uint64_t *dh = getDupHash ( );
|
||||
blen = sprintf(buf,"%llu", *dh );//h64);
|
||||
//field = "gbduphash";
|
||||
@ -24636,6 +24741,7 @@ bool XmlDoc::hashVectors ( HashTableX *tt ) {
|
||||
hi.m_desc = "dup vector hash";
|
||||
// this returns false on failure
|
||||
if ( ! hashString ( buf,blen,&hi ) ) return false;
|
||||
*/
|
||||
|
||||
// hash the wikipedia docids we match
|
||||
if ( ! m_wikiDocIdsValid ) { char *xx=NULL;*xx=0; }
|
||||
|
11
XmlDoc.h
11
XmlDoc.h
@ -40,7 +40,7 @@
|
||||
#include "SearchInput.h"
|
||||
#include "Msg40.h"
|
||||
#include "Dates.h"
|
||||
#include "IndexList.h"
|
||||
//#include "IndexList.h"
|
||||
#include "Msg0.h"
|
||||
#include "Msg22.h"
|
||||
#include "Tagdb.h"
|
||||
@ -550,8 +550,9 @@ class XmlDoc {
|
||||
float *getGigabitSimilarity ( class XmlDoc *xd2 ) ;
|
||||
float *getPageSimilarity ( class XmlDoc *xd2 ) ;
|
||||
float *getPercentChanged ( );
|
||||
uint64_t *getDupHash ( );
|
||||
class IndexList *getDupList ( ) ;
|
||||
uint64_t *getFuzzyDupHash ( );
|
||||
long long *getExactContentHash64();
|
||||
class RdbList *getDupList ( ) ;
|
||||
class RdbList *getLikedbListForReq ( );
|
||||
class RdbList *getLikedbListForIndexing ( );
|
||||
long addLikedbRecords ( bool justGetSize ) ;
|
||||
@ -1308,6 +1309,7 @@ class XmlDoc {
|
||||
bool m_isErrorPageValid;
|
||||
bool m_isHijackedValid;
|
||||
bool m_dupHashValid;
|
||||
bool m_exactContentHash64Valid;
|
||||
|
||||
// shadows
|
||||
char m_isRSS2;
|
||||
@ -1358,9 +1360,10 @@ class XmlDoc {
|
||||
float m_percentChanged;
|
||||
bool m_unchanged;
|
||||
// what docids are similar to us? docids are in this list
|
||||
IndexList m_dupList;
|
||||
RdbList m_dupList;
|
||||
RdbList m_likedbList;
|
||||
uint64_t m_dupHash;
|
||||
long long m_exactContentHash64;
|
||||
Msg0 m_msg0;
|
||||
Msg5 m_msg5;
|
||||
char m_isDup;
|
||||
|
Reference in New Issue
Block a user