Merge branch 'diffbot-testing' into testing

This commit is contained in:
Matt 2015-01-31 15:35:02 -07:00
commit e8948cea65
9 changed files with 2678052 additions and 23 deletions

@@ -183,6 +183,9 @@ char *g_files[] = {
"wiktionary-lang.txt",
"wiktionary-syns.dat",
// gives us siteranks for the most popular sites:
"sitelinks.txt",
"unifiedDict.txt",
//"unifiedDict-buf.txt",
//"unifiedDict-map.dat",

Tagdb.cpp (231 changed lines)

@@ -1776,6 +1776,8 @@ Tagdb g_tagdb2;
// reset rdb and Xmls
void Tagdb::reset() {
m_rdb.reset();
m_siteBuf1.purge();
m_siteBuf2.purge();
//s_lockTable2.reset();
}
@@ -4945,3 +4947,232 @@ int32_t Tag::getDedupHash ( ) {
return dh;
}
// make sure sizeof(Entry2)=5 not 8!
#pragma pack(1)
class Entry1 {
public:
uint32_t m_hostHash32;
uint32_t m_siteNumInlinksUniqueCBlock;
};
class Entry2 {
public:
uint32_t m_hostHash32;
uint8_t m_siteNumInlinksUniqueCBlock;
};
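// a sketch, not part of this commit: a compile-time guard would catch
// the padding regression that the "sizeof(Entry2)=5 not 8" comment above
// warns about. static_assert is C++11; older toolchains can get the same
// effect from a negative-size-array trick.
static_assert ( sizeof(Entry1) == 8 , "Entry1 must stay 8 bytes" );
static_assert ( sizeof(Entry2) == 5 , "Entry2 must stay 5 bytes" );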
static int linkSort1Cmp ( const void *a, const void *b ) {
Entry1 *ea = (Entry1 *)a;
Entry1 *eb = (Entry1 *)b;
if ( ea->m_hostHash32 > eb->m_hostHash32 ) return 1;
if ( ea->m_hostHash32 < eb->m_hostHash32 ) return -1;
return 0;
}
static int linkSort2Cmp ( const void *a, const void *b ) {
Entry2 *ea = (Entry2 *)a;
Entry2 *eb = (Entry2 *)b;
if ( ea->m_hostHash32 > eb->m_hostHash32 ) return 1;
if ( ea->m_hostHash32 < eb->m_hostHash32 ) return -1;
return 0;
}
bool Tagdb::loadMinSiteInlinksBuffer ( ) {
if ( ! loadMinSiteInlinksBuffer2() ) return false;
// sanity testing
uint32_t hostHash32 = hash32n("www.imdb.com");
int32_t msi = getMinSiteInlinks ( hostHash32 );
if ( msi < 10 ) {
log("tagdb: bad siteinlinks. linkedin.com not found.");
//return false;
}
hostHash32 = hash32n("0009.org" );
msi = getMinSiteInlinks ( hostHash32 );
if ( msi < 0 ) {
log("tagdb: bad siteinlinks. 0009.org not found.");
//return false;
}
Url tmp;
tmp.set("gnu.org");
hostHash32 = tmp.getHash32WithWWW();
msi = getMinSiteInlinks ( hostHash32 );
if ( msi < 0 ) {
log("tagdb: bad siteinlinks. www.gnu.org not found.");
//return false;
}
return true;
}
bool Tagdb::loadMinSiteInlinksBuffer2 ( ) {
// use 4 bytes for the first 130,000 entries or so to hold the
// # of site inlinks. then we only need 1 byte each since the remaining
// 25M entries are <256 sitenuminlinksuniquecblocks
m_siteBuf1.load("sitelinks1.dat");
m_siteBuf2.load("sitelinks2.dat");
m_siteBuf1.setLabel("sitelnks");
m_siteBuf2.setLabel("sitelnks");
if ( m_siteBuf1.length() > 0 &&
m_siteBuf2.length() > 0 )
return true;
log("gb: loading ./sitelinks.txt");
// ok, make it
SafeBuf tmp;
tmp.load("./sitelinks.txt");
if ( tmp.length() <= 0 ) {
log("gb: fatal error. could not find required file "
"./sitelinks.txt");
return false;
}
log("gb: starting initial creation of sitelinks1.dat and "
"sitelinks2.dat files");
// now parse each line in that
char *p = tmp.getBufStart();
char *pend = p + tmp.length();
char *newp = NULL;
SafeBuf buf1;
SafeBuf buf2;
int32_t count = 0;
for ( ; p < pend ; p = newp ) {
if ( ++count % 1000000 == 0 )
log("gb: parsing line # %"INT32,count);
// advance to next line
newp = p;
for ( ; newp < pend && *newp != '\n' ; newp++ );
if ( newp < pend ) newp++;
// parse this line
int32_t numLinks = atoi(p);
// skip number
for ( ; *p && *p != ' ' && *p != '\n' ; p++ );
// strange
if ( ! *p || *p == '\n' ) continue;
// skip spaces
for ( ; *p == ' ' ; p++ );
// get hostname
char *host = p;
// find end of it
for ( ; *p && *p != '\n' && *p != ' ' && *p != '\t' ; p++ );
// hash it
uint32_t hostHash32 = hash32 ( host , p - host );
// store in buffer
if ( numLinks >= 256 ) {
Entry1 e1;
e1.m_siteNumInlinksUniqueCBlock = numLinks;
e1.m_hostHash32 = hostHash32;
buf1.safeMemcpy ( &e1 , sizeof(Entry1) );
}
else {
Entry2 e2;
e2.m_siteNumInlinksUniqueCBlock = numLinks;
e2.m_hostHash32 = hostHash32;
buf2.safeMemcpy ( &e2 , sizeof(Entry2) );
}
}
log("gb: sorting sitelink data");
// now sort each one
qsort ( buf1.getBufStart() ,
buf1.length()/sizeof(Entry1),
sizeof(Entry1),
linkSort1Cmp );
qsort ( buf2.getBufStart() ,
buf2.length()/sizeof(Entry2),
sizeof(Entry2),
linkSort2Cmp );
// now copy to the official buffer so we only alloc what we need
m_siteBuf1.safeMemcpy ( &buf1 );
m_siteBuf2.safeMemcpy ( &buf2 );
log("gb: saving sitelinks1.dat and sitelinks2.dat");
m_siteBuf1.save("./sitelinks1.dat");
m_siteBuf2.save("./sitelinks2.dat");
return true;
}
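// a note on the input the parser above assumes: each sitelinks.txt line
// leads with the inlink count, then whitespace, then the hostname, e.g.
// an illustrative line (not taken from the shipped file):
//
//   523 www.gnu.org
//
// the two-buffer split pays off because only ~130,000 hosts have >=256
// inlinks: ~25M 5-byte Entry2 records cost ~125MB, versus ~200MB if
// every record carried a 4-byte count like Entry1.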
int32_t Tagdb::getMinSiteInlinks ( uint32_t hostHash32 ) {
if ( m_siteBuf1.length() <= 0 ) {
log("tagdb: load not called");
char *xx=NULL;*xx=0;
}
// first check buf1 doing bstep
int32_t ne = m_siteBuf1.length() / sizeof(Entry1);
Entry1 *ep = (Entry1 *)m_siteBuf1.getBufStart();
Entry2 *fp = NULL;
int32_t i = ne / 2;
int32_t step = ne / 2;
int32_t count = 0;
loop1:
if ( i < 0 ) i = 0;
if ( i >= ne ) i = ne-1;
// after 3 single steps if no hit, try next hosthash buf
if ( count == 3 ) goto tryNextBuf;
step /= 2;
if ( step == 0 ) {
step = 1;
count++;
}
if ( hostHash32 < ep[i].m_hostHash32 ) {
i -= step;
goto loop1;
}
if ( hostHash32 > ep[i].m_hostHash32 ) {
i += step;
goto loop1;
}
return ep[i].m_siteNumInlinksUniqueCBlock;
tryNextBuf:
// reset parms
ne = m_siteBuf2.length() / sizeof(Entry2);
fp = (Entry2 *)m_siteBuf2.getBufStart();
i = ne / 2;
step = ne / 2;
count = 0;
loop2:
if ( i < 0 ) i = 0;
if ( i >= ne ) i = ne-1;
// after 3 single steps if no hit, that's it...
if ( count == 3 ) return -1;
step /= 2;
if ( step == 0 ) {
step = 1;
count++;
}
if ( hostHash32 < fp[i].m_hostHash32 ) {
i -= step;
goto loop2;
}
if ( hostHash32 > fp[i].m_hostHash32 ) {
i += step;
goto loop2;
}
return fp[i].m_siteNumInlinksUniqueCBlock;
}
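Since both buffers are qsort'ed on m_hostHash32, the hand-rolled stepping search above can also be expressed with the libc binary search, reusing the sort comparators as-is. A minimal sketch for the 4-byte buffer, assuming the buffers have been loaded (stdlib.h provides bsearch; the Entry2 buffer would be probed the same way with linkSort2Cmp):

Entry1 key;
key.m_hostHash32 = hostHash32;
key.m_siteNumInlinksUniqueCBlock = 0; // ignored by the comparator
Entry1 *hit = (Entry1 *)bsearch ( &key ,
                                  m_siteBuf1.getBufStart() ,
                                  m_siteBuf1.length() / sizeof(Entry1) ,
                                  sizeof(Entry1) ,
                                  linkSort1Cmp );
if ( hit ) return hit->m_siteNumInlinksUniqueCBlock;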

@@ -373,6 +373,12 @@ class Tagdb {
DiskPageCache m_pc;
bool loadMinSiteInlinksBuffer ( );
bool loadMinSiteInlinksBuffer2 ( );
int32_t getMinSiteInlinks ( uint32_t hostHash32 ) ;
SafeBuf m_siteBuf1;
SafeBuf m_siteBuf2;
};
// derive this from tagdb

@@ -1740,6 +1740,12 @@ int32_t Url::getSiteHash32 ( char *coll ) {
}
*/
int32_t Url::getHash32WithWWW ( ) {
uint32_t hh = hash32n ( "www." );
int32_t conti = 4;
hh = hash32_cont ( m_domain , m_dlen , hh , &conti );
return hh;
}
int32_t Url::getHostHash32 ( ) {
return hash32 ( m_host , m_hlen );
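getHash32WithWWW() seeds a rolling hash with "www." and continues it over the bare domain, so hashing a Url set to "xyz.com" should match hash32n() applied to the full string "www.xyz.com" — this is what the gnu.org sanity check in loadMinSiteInlinksBuffer() relies on. A usage sketch, assuming hash32_cont() continues the same hash that hash32n() computes in one shot:

Url tmp;
tmp.set ( "gnu.org" );
uint32_t withWWW = (uint32_t)tmp.getHash32WithWWW();
uint32_t direct  = hash32n ( "www.gnu.org" );
// expect withWWW == direct, so a sitelinks.txt entry for
// www.gnu.org can be found from the bare domain gnu.org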

Url.h (3 changed lines)

@@ -184,6 +184,9 @@ public:
int32_t getHostHash32 ( ) ;
int32_t getDomainHash32 ( ) ;
// if url is xyz.com then get hash of www.xyz.com
int32_t getHash32WithWWW ( );
int64_t getUrlHash64 ( ) ;
int64_t getHostHash64 ( ) ;
int64_t getDomainHash64 ( ) ;

@@ -12576,6 +12576,7 @@ Addresses *XmlDoc::getAddresses ( ) {
return &m_addresses;
}
/*
int32_t *XmlDoc::getSiteNumInlinksUniqueIp ( ) {
if ( m_siteNumInlinksUniqueIpValid )
return &m_siteNumInlinksUniqueIp;
@@ -12611,6 +12612,7 @@ int32_t *XmlDoc::getSiteNumInlinksTotal ( ) {
// ok we must be valid
return &m_siteNumInlinksTotal;
}
*/
// we need this for setting SpiderRequest::m_parentFirstIp of each outlink
int32_t *XmlDoc::getFirstIp ( ) {
@@ -12652,6 +12654,9 @@ uint8_t *XmlDoc::getSiteNumInlinks8 () {
return &m_siteNumInlinks8;
}
// this is the # of GOOD INLINKS to the site. so it is no more than
// 1 per c block, and it has to pass link spam detection. this is the
// highest-level count of inlinks to the site. use it a lot.
int32_t *XmlDoc::getSiteNumInlinks ( ) {
if ( m_siteNumInlinksValid ) return &m_siteNumInlinks;
@@ -12798,10 +12803,15 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
(PTRTYPE)tag3,
m_firstUrl.m_url);
LinkInfo *sinfo = NULL;
char *mysite = NULL;
// if we are good return it
if ( tag && valid ) {
// set it
m_siteNumInlinks = atol(tag->getTagData());
m_siteNumInlinksValid = true;
// companion tags
if ( tag2 ) {
m_siteNumInlinksUniqueIp = atol(tag2->getTagData());
@@ -12815,9 +12825,10 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
m_siteNumInlinksTotal =atol(tag4->getTagData());
m_siteNumInlinksTotalValid = true;
}
// it is good to go now
m_siteNumInlinksValid = true;
return &m_siteNumInlinks;
// . consult our sitelinks.txt file
// . returns -1 if not found
goto updateToMin;
}
// set status. we can time status changes with this routine!
@@ -12845,7 +12856,7 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
m_updatingSiteLinkInfoTags = true;
// we need to re-get both if either is NULL
LinkInfo *sinfo = getSiteLinkInfo();
sinfo = getSiteLinkInfo();
// block or error?
if ( ! sinfo || sinfo == (LinkInfo *)-1) return (int32_t *)sinfo;
@@ -12859,7 +12870,7 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
//Links *links = getLinks ();
//if ( ! links || links == (Links *)-1 ) return (int32_t *)links;
char *mysite = getSite();
mysite = getSite();
if ( ! mysite || mysite == (void *)-1 ) return (int32_t *)mysite;
setStatus ( "adding site info tags to tagdb 1");
@@ -12881,6 +12892,45 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
m_siteNumInlinksUniqueCBlockValid = true;
m_siteNumInlinksTotalValid = true;
updateToMin:
// . consult our sitelinks.txt file
// . returns -1 if not found
int32_t hostHash32 = getHostHash32a();
int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
// try with www if not there
if ( min < 0 && ! m_firstUrl.hasSubdomain() ) {
int32_t wwwHash32 = m_firstUrl.getHash32WithWWW();
min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
}
if ( min >= 0 ) {
if ( m_siteNumInlinks < min ||
! m_siteNumInlinksValid ) {
m_siteNumInlinks = min;
m_siteNumInlinksValid = true;
}
// if ( ! m_siteNumInlinksUniqueIpValid ||
// m_siteNumInlinksUniqueIp < min ) {
// m_siteNumInlinksUniqueIp = min;
// m_siteNumInlinksUniqueIpValid = true;
// }
// if ( ! m_siteNumInlinksUniqueCBlockValid ||
// m_siteNumInlinksUniqueCBlock < min ) {
// m_siteNumInlinksUniqueCBlock = min;
// m_siteNumInlinksUniqueCBlockValid = true;
// }
// if ( ! m_siteNumInlinksTotalValid ||
// m_siteNumInlinksTotal < min ) {
// m_siteNumInlinksTotal = min;
// m_siteNumInlinksTotalValid = true;
// }
}
// deal with it
return &m_siteNumInlinks;
}
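The consult-sitelinks.txt-then-retry-with-www pattern above is repeated nearly verbatim in addOutlinkSpiderRecsToMetaList() further down. A hypothetical helper, not in this commit, that both call sites could share:

// look up the minimum site inlinks for a host hash, falling back to
// the www-prefixed hash when the url carries no subdomain.
// returns -1 if the host is not in sitelinks.txt at all.
static int32_t getMinSiteInlinksWithWWW ( Url *u , int32_t hostHash32 ) {
	int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
	if ( min < 0 && ! u->hasSubdomain() )
		min = g_tagdb.getMinSiteInlinks ( u->getHash32WithWWW() );
	return min;
}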
@@ -19868,10 +19918,10 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
if ( m_siteNumInlinksValid ) {
sb->safePrintf("siteinlinks=%04"INT32" ",m_siteNumInlinks );
sb->safePrintf("siteipinlinks=%"INT32" ",
m_siteNumInlinksUniqueIp);
sb->safePrintf("sitecblockinlinks=%"INT32" ",
m_siteNumInlinksUniqueCBlock);
// sb->safePrintf("siteipinlinks=%"INT32" ",
// m_siteNumInlinksUniqueIp);
// sb->safePrintf("sitecblockinlinks=%"INT32" ",
// m_siteNumInlinksUniqueCBlock);
int32_t sr = ::getSiteRank ( m_siteNumInlinks );
sb->safePrintf("siterank=%"INT32" ", sr );
}
@@ -25171,6 +25221,21 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
Tag *st = NULL;
if ( gr ) st = gr->getTag ("sitenuminlinks");
if ( st ) ksni = atol(st->getTagData());
int32_t hostHash32 = url.getHostHash32();
// . consult our sitelinks.txt file
// . returns -1 if not found
int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
// try with www if not there
if ( min < 0 && ! url.hasSubdomain() ) {
int32_t wwwHash32 = url.getHash32WithWWW();
min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
}
if ( min >= 0 && ksni < min )
ksni = min;
//if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
//int32_t ksni = m_siteNumInlinks;
@@ -25188,7 +25253,6 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
// get it quick
bool ispingserver = url.isPingServer();
int32_t hostHash32 = url.getHostHash32();
int32_t domHash32 = url.getDomainHash32();
// is link rss?

@@ -641,9 +641,9 @@ class XmlDoc {
//class Url *getAboutUsLink ( ) ;
int32_t *getFirstIp ( ) ;
bool *updateFirstIp ( ) ;
int32_t *getSiteNumInlinksUniqueIp ( ) ;
int32_t *getSiteNumInlinksUniqueCBlock ( ) ;
int32_t *getSiteNumInlinksTotal ( );
//int32_t *getSiteNumInlinksUniqueIp ( ) ;
//int32_t *getSiteNumInlinksUniqueCBlock ( ) ;
//int32_t *getSiteNumInlinksTotal ( );
//int32_t *getSiteNumInlinksFresh ( ) ;
//int32_t *getSitePop ( ) ;
uint8_t *getSiteNumInlinks8 () ;

@@ -3609,6 +3609,10 @@ int main2 ( int argc , char *argv[] ) {
return 1;
}
*/
// init minsitenuminlinks buffer
g_tagdb.loadMinSiteInlinksBuffer();
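// a sketch, not in the commit: the return value is discarded above, so a
// missing sitelinks.txt only surfaces later when getMinSiteInlinks() hits
// its null-dereference guard. startup could fail fast instead:
//
//   if ( ! g_tagdb.loadMinSiteInlinksBuffer() )
//       return 1; // same bail-out convention as the surrounding code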
// . then our main udp server
// . must pass defaults since g_dns uses it's own port/instance of it
// . server should listen to a socket and register with g_loop
@@ -11962,7 +11966,7 @@ void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,
char sbuf[1024*2];
int32_t siteNumInlinks = -1;
int32_t typeSite = hash64Lower_a("site",4);
int32_t typeInlinks = hash64Lower_a("sitenuminlinksuniquecblock",26);
int32_t typeInlinks = hash64Lower_a("sitenuminlinks",14);
loop:
// use msg5 to get the list, should ALWAYS block since no threads
@@ -12035,25 +12039,71 @@ void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,
char tmpBuf[1024];
SafeBuf sb(tmpBuf, 1024);
bool match = false;
hostHash = tag->m_key.n1;
if ( hostHash == lastHostHash ) {
match = true;
}
else {
site = NULL;
siteNumInlinks = -1;
}
lastHostHash = hostHash;
// if ( hostHash == 3079740012919792457LL )
// log("hey");
// making sitelist.txt?
if ( tag->m_type == typeSite ) {
hostHash = tag->m_key.n1;
if ( tag->m_type == typeSite && req == 'z' ) {
site = tag->getTagData();
// make it null if too many .'s
if ( site ) {
char *p = site;
int count = 0;
int alpha = 0;
int colons = 0;
// foo.bar.baz.com is ok
for ( ; *p ; p++ )
for ( ; *p ; p++ ) {
if ( *p == '.' ) count++;
if ( *p == ':' ) colons++;
if ( is_alpha_a(*p) || *p=='-' )
alpha++;
}
if ( count >= 4 )
site = NULL;
if ( colons > 1 )
site = NULL;
// no ip addresses allowed, need an alpha char
if ( alpha == 0 )
site = NULL;
}
// ends in :?
int slen = 0;
if ( site ) slen = gbstrlen(site);
if ( site && site[slen-1] == ':' )
site = NULL;
// port bug
if ( site && site[slen-2] == ':' && site[slen-1]=='/')
site = NULL;
// remove heavy spammers to save space
if ( site && strstr(site,"daily-camshow-report") )
site = NULL;
if ( site && strstr(site,".livejasminhd.") )
site = NULL;
if ( site && strstr(site,".pornlivenews.") )
site = NULL;
if ( site && strstr(site,".isapornblog.") )
site = NULL;
if ( site && strstr(site,".teen-model-24.") )
site = NULL;
if ( site && ! is_ascii2_a ( site, gbstrlen(site) ) ) {
site = NULL;
continue;
}
if ( lastHostHash == hostHash && siteNumInlinks>=0) {
if ( match && siteNumInlinks>=0) {
// if we ask for 1 or 2 we end up with 100M
// entries, but with 3+ we get 27M
if ( siteNumInlinks > 2 && site )
@@ -12063,14 +12113,12 @@ void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,
}
// save it
if ( site ) strcpy ( sbuf , site );
lastHostHash = hostHash;
continue;
}
if ( tag->m_type == typeInlinks ) {
hostHash = tag->m_key.n1;
if ( tag->m_type == typeInlinks && req == 'z' ) {
siteNumInlinks = atoi(tag->getTagData());
if ( lastHostHash == hostHash && site ) {
if ( match && site ) {
// if we ask for 1 or 2 we end up with 100M
// entries, but with 3+ we get 27M
if ( siteNumInlinks > 2 )
@@ -12078,7 +12126,6 @@ void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,
siteNumInlinks = -1;
site = NULL;
}
lastHostHash = hostHash;
continue;
}

sitelinks.txt (new file, 2677669 lines)

File diff suppressed because it is too large