Merge branch 'diffbot-testing' into testing
This commit is contained in:
commit
e8948cea65
@ -183,6 +183,9 @@ char *g_files[] = {
|
||||
"wiktionary-lang.txt",
|
||||
"wiktionary-syns.dat",
|
||||
|
||||
// gives us siteranks for the most popular sites:
|
||||
"sitelinks.txt",
|
||||
|
||||
"unifiedDict.txt",
|
||||
//"unifiedDict-buf.txt",
|
||||
//"unifiedDict-map.dat",
|
||||
|
231
Tagdb.cpp
231
Tagdb.cpp
@ -1776,6 +1776,8 @@ Tagdb g_tagdb2;
|
||||
// reset rdb and Xmls
|
||||
// Release everything this Tagdb instance holds: the underlying rdb
// plus the two in-memory sitelinks lookup tables.
void Tagdb::reset() {
	// drop the rdb state first
	m_rdb.reset();
	// the two sitelinks tables are independent; free both
	m_siteBuf2.purge();
	m_siteBuf1.purge();
	//s_lockTable2.reset();
}
|
||||
|
||||
@ -4945,3 +4947,232 @@ int32_t Tag::getDedupHash ( ) {
|
||||
|
||||
return dh;
|
||||
}
|
||||
|
||||
// Fixed-layout records for the sitelinks lookup tables. They are written
// to / read from disk (sitelinks1.dat, sitelinks2.dat) as raw bytes and
// binary-searched in memory, so size and field order must not change.
//
// make sure sizeof(Entry2)=5 not 8!
// FIX: use pack(push,1)/pack(pop) so the 1-byte packing does not leak
// into every struct declared after this point in the translation unit.
#pragma pack(push, 1)

// hosts with >= 256 inlinks: 4-byte count, 8 bytes total
class Entry1 {
public:
	uint32_t m_hostHash32;
	uint32_t m_siteNumInlinksUniqueCBlock;
};

// hosts with < 256 inlinks: 1-byte count, 5 bytes total
class Entry2 {
public:
	uint32_t m_hostHash32;
	uint8_t m_siteNumInlinksUniqueCBlock;
};

#pragma pack(pop)
|
||||
|
||||
static int linkSort1Cmp ( const void *a, const void *b ) {
|
||||
Entry1 *ea = (Entry1 *)a;
|
||||
Entry1 *eb = (Entry1 *)b;
|
||||
if ( ea->m_hostHash32 > eb->m_hostHash32 ) return 1;
|
||||
if ( ea->m_hostHash32 < eb->m_hostHash32 ) return -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int linkSort2Cmp ( const void *a, const void *b ) {
|
||||
Entry2 *ea = (Entry2 *)a;
|
||||
Entry2 *eb = (Entry2 *)b;
|
||||
if ( ea->m_hostHash32 > eb->m_hostHash32 ) return 1;
|
||||
if ( ea->m_hostHash32 < eb->m_hostHash32 ) return -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool Tagdb::loadMinSiteInlinksBuffer ( ) {
|
||||
|
||||
if ( ! loadMinSiteInlinksBuffer2() ) return false;
|
||||
|
||||
// sanity testing
|
||||
uint32_t hostHash32 = hash32n("www.imdb.com");
|
||||
int32_t msi = getMinSiteInlinks ( hostHash32 );
|
||||
if ( msi < 10 ) {
|
||||
log("tagdb: bad siteinlinks. linkedin.com not found.");
|
||||
//return false;
|
||||
}
|
||||
hostHash32 = hash32n("0009.org" );
|
||||
msi = getMinSiteInlinks ( hostHash32 );
|
||||
if ( msi < 0 ) {
|
||||
log("tagdb: bad siteinlinks. 0009.org not found.");
|
||||
//return false;
|
||||
}
|
||||
Url tmp;
|
||||
tmp.set("gnu.org");
|
||||
hostHash32 = tmp.getHash32WithWWW();
|
||||
msi = getMinSiteInlinks ( hostHash32 );
|
||||
if ( msi < 0 ) {
|
||||
log("tagdb: bad siteinlinks. www.gnu.org not found.");
|
||||
//return false;
|
||||
}
|
||||
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Tagdb::loadMinSiteInlinksBuffer2 ( ) {
|
||||
|
||||
// use 4 bytes for the first 130,000 entries or so to hold
|
||||
// # of site inlinks. then we only need 1 byte since the remaining
|
||||
// 25M are <256 sitenuminlinksunqiecblocks
|
||||
m_siteBuf1.load("sitelinks1.dat");
|
||||
m_siteBuf2.load("sitelinks2.dat");
|
||||
|
||||
m_siteBuf1.setLabel("sitelnks");
|
||||
m_siteBuf2.setLabel("sitelnks");
|
||||
|
||||
if ( m_siteBuf1.length() > 0 &&
|
||||
m_siteBuf2.length() > 0 )
|
||||
return true;
|
||||
|
||||
log("gb: loading ./sitelinks.txt");
|
||||
|
||||
// ok, make it
|
||||
SafeBuf tmp;
|
||||
tmp.load("./sitelinks.txt");
|
||||
if ( tmp.length() <= 0 ) {
|
||||
log("gb: fatal error. could not find required file "
|
||||
"./sitelinks.txt");
|
||||
return false;
|
||||
}
|
||||
|
||||
log("gb: starting initial creation of sitelinks1.dat and "
|
||||
"sitelinks2.dat files");
|
||||
|
||||
// now parse each line in that
|
||||
char *p = tmp.getBufStart();
|
||||
char *pend = p + tmp.length();
|
||||
char *newp = NULL;
|
||||
SafeBuf buf1;
|
||||
SafeBuf buf2;
|
||||
int32_t count = 0;
|
||||
for ( ; p < pend ; p = newp ) {
|
||||
|
||||
if ( ++count % 1000000 == 0 )
|
||||
log("gb: parsing line # %"INT32,count);
|
||||
|
||||
// advance to next line
|
||||
newp = p;
|
||||
for ( ; newp < pend && *newp != '\n' ; newp++ );
|
||||
if ( newp < pend ) newp++;
|
||||
// parse this line
|
||||
int32_t numLinks = atoi(p);
|
||||
// skip number
|
||||
for ( ; *p && *p != ' ' && *p != '\n' ; p++ );
|
||||
// strange
|
||||
if ( ! *p || *p == '\n' ) continue;
|
||||
// skip spaces
|
||||
for ( ; *p == ' ' ; p++ );
|
||||
// get hostname
|
||||
char *host = p;
|
||||
// find end of it
|
||||
for ( ; *p && *p != '\n' && *p != ' ' && *p != '\t' ; p++ );
|
||||
// hash it
|
||||
uint32_t hostHash32 = hash32 ( host , p - host );
|
||||
// store in buffer
|
||||
if ( numLinks >= 256 ) {
|
||||
Entry1 e1;
|
||||
e1.m_siteNumInlinksUniqueCBlock = numLinks;
|
||||
e1.m_hostHash32 = hostHash32;
|
||||
buf1.safeMemcpy ( &e1 , sizeof(Entry1) );
|
||||
}
|
||||
else {
|
||||
Entry2 e2;
|
||||
e2.m_siteNumInlinksUniqueCBlock = numLinks;
|
||||
e2.m_hostHash32 = hostHash32;
|
||||
buf2.safeMemcpy ( &e2 , sizeof(Entry2) );
|
||||
}
|
||||
}
|
||||
|
||||
log("gb: sorting sitelink data");
|
||||
|
||||
// now sort each one
|
||||
qsort ( buf1.getBufStart() ,
|
||||
buf1.length()/sizeof(Entry1),
|
||||
sizeof(Entry1),
|
||||
linkSort1Cmp );
|
||||
|
||||
qsort ( buf2.getBufStart() ,
|
||||
buf2.length()/sizeof(Entry2),
|
||||
sizeof(Entry2),
|
||||
linkSort2Cmp );
|
||||
|
||||
|
||||
// now copy to the official buffer so we only alloc what we need
|
||||
m_siteBuf1.safeMemcpy ( &buf1 );
|
||||
m_siteBuf2.safeMemcpy ( &buf2 );
|
||||
|
||||
log("gb: saving sitelinks1.dat and sitelinks2.dat");
|
||||
|
||||
m_siteBuf1.save("./sitelinks1.dat");
|
||||
m_siteBuf2.save("./sitelinks2.dat");
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int32_t Tagdb::getMinSiteInlinks ( uint32_t hostHash32 ) {
|
||||
|
||||
if ( m_siteBuf1.length() <= 0 ) {
|
||||
log("tagdb: load not called");
|
||||
char *xx=NULL;*xx=0;
|
||||
}
|
||||
|
||||
// first check buf1 doing bstep
|
||||
int32_t ne = m_siteBuf1.length() / sizeof(Entry1);
|
||||
Entry1 *ep = (Entry1 *)m_siteBuf1.getBufStart();
|
||||
Entry2 *fp = NULL;
|
||||
int32_t i = ne / 2;
|
||||
int32_t step = ne / 2;
|
||||
int32_t count = 0;
|
||||
|
||||
loop1:
|
||||
|
||||
if ( i < 0 ) i = 0;
|
||||
if ( i >= ne ) i = ne-1;
|
||||
// after 3 single steps if no hit, try next hosthash buf
|
||||
if ( count == 3 ) goto tryNextBuf;
|
||||
step /= 2;
|
||||
if ( step == 0 ) {
|
||||
step = 1;
|
||||
count++;
|
||||
}
|
||||
if ( hostHash32 < ep[i].m_hostHash32 ) {
|
||||
i -= step;
|
||||
goto loop1;
|
||||
}
|
||||
if ( hostHash32 > ep[i].m_hostHash32 ) {
|
||||
i += step;
|
||||
goto loop1;
|
||||
}
|
||||
return ep[i].m_siteNumInlinksUniqueCBlock;
|
||||
|
||||
tryNextBuf:
|
||||
|
||||
// reset parms
|
||||
ne = m_siteBuf2.length() / sizeof(Entry2);
|
||||
fp = (Entry2 *)m_siteBuf2.getBufStart();
|
||||
i = ne / 2;
|
||||
step = ne / 2;
|
||||
count = 0;
|
||||
|
||||
loop2:
|
||||
|
||||
if ( i < 0 ) i = 0;
|
||||
if ( i >= ne ) i = ne-1;
|
||||
// after 3 single steps if no hit, that's it...
|
||||
if ( count == 3 ) return -1;
|
||||
step /= 2;
|
||||
if ( step == 0 ) {
|
||||
step = 1;
|
||||
count++;
|
||||
}
|
||||
if ( hostHash32 < fp[i].m_hostHash32 ) {
|
||||
i -= step;
|
||||
goto loop2;
|
||||
}
|
||||
if ( hostHash32 > fp[i].m_hostHash32 ) {
|
||||
i += step;
|
||||
goto loop2;
|
||||
}
|
||||
return fp[i].m_siteNumInlinksUniqueCBlock;
|
||||
|
||||
}
|
||||
|
6
Tagdb.h
6
Tagdb.h
@ -373,6 +373,12 @@ class Tagdb {
|
||||
|
||||
DiskPageCache m_pc;
|
||||
|
||||
bool loadMinSiteInlinksBuffer ( );
|
||||
bool loadMinSiteInlinksBuffer2 ( );
|
||||
int32_t getMinSiteInlinks ( uint32_t hostHash32 ) ;
|
||||
SafeBuf m_siteBuf1;
|
||||
SafeBuf m_siteBuf2;
|
||||
|
||||
};
|
||||
|
||||
// derive this from tagdb
|
||||
|
6
Url.cpp
6
Url.cpp
@ -1740,6 +1740,12 @@ int32_t Url::getSiteHash32 ( char *coll ) {
|
||||
}
|
||||
*/
|
||||
|
||||
int32_t Url::getHash32WithWWW ( ) {
|
||||
uint32_t hh = hash32n ( "www." );
|
||||
int32_t conti = 4;
|
||||
hh = hash32_cont ( m_domain , m_dlen , hh , &conti );
|
||||
return hh;
|
||||
}
|
||||
|
||||
int32_t Url::getHostHash32 ( ) {
|
||||
return hash32 ( m_host , m_hlen );
|
||||
|
3
Url.h
3
Url.h
@ -184,6 +184,9 @@ public:
|
||||
int32_t getHostHash32 ( ) ;
|
||||
int32_t getDomainHash32 ( ) ;
|
||||
|
||||
// if url is xyz.com then get hash of www.xyz.com
|
||||
int32_t getHash32WithWWW ( );
|
||||
|
||||
int64_t getUrlHash64 ( ) ;
|
||||
int64_t getHostHash64 ( ) ;
|
||||
int64_t getDomainHash64 ( ) ;
|
||||
|
84
XmlDoc.cpp
84
XmlDoc.cpp
@ -12576,6 +12576,7 @@ Addresses *XmlDoc::getAddresses ( ) {
|
||||
return &m_addresses;
|
||||
}
|
||||
|
||||
/*
|
||||
int32_t *XmlDoc::getSiteNumInlinksUniqueIp ( ) {
|
||||
if ( m_siteNumInlinksUniqueIpValid )
|
||||
return &m_siteNumInlinksUniqueIp;
|
||||
@ -12611,6 +12612,7 @@ int32_t *XmlDoc::getSiteNumInlinksTotal ( ) {
|
||||
// ok we must be valid
|
||||
return &m_siteNumInlinksTotal;
|
||||
}
|
||||
*/
|
||||
|
||||
// we need this for setting SpiderRequest::m_parentFirstIp of each outlink
|
||||
int32_t *XmlDoc::getFirstIp ( ) {
|
||||
@ -12652,6 +12654,9 @@ uint8_t *XmlDoc::getSiteNumInlinks8 () {
|
||||
return &m_siteNumInlinks8;
|
||||
}
|
||||
|
||||
// this is the # of GOOD INLINKS to the site. so it is no more than
|
||||
// 1 per c block, and it has to pass link spam detection. this is the
|
||||
// highest-level count of inlinks to the site. use it a lot.
|
||||
int32_t *XmlDoc::getSiteNumInlinks ( ) {
|
||||
|
||||
if ( m_siteNumInlinksValid ) return &m_siteNumInlinks;
|
||||
@ -12798,10 +12803,15 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
|
||||
(PTRTYPE)tag3,
|
||||
m_firstUrl.m_url);
|
||||
|
||||
LinkInfo *sinfo = NULL;
|
||||
char *mysite = NULL;
|
||||
|
||||
// if we are good return it
|
||||
if ( tag && valid ) {
|
||||
// set it
|
||||
m_siteNumInlinks = atol(tag->getTagData());
|
||||
m_siteNumInlinksValid = true;
|
||||
|
||||
// companion tags
|
||||
if ( tag2 ) {
|
||||
m_siteNumInlinksUniqueIp = atol(tag2->getTagData());
|
||||
@ -12815,9 +12825,10 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
|
||||
m_siteNumInlinksTotal =atol(tag4->getTagData());
|
||||
m_siteNumInlinksTotalValid = true;
|
||||
}
|
||||
// it is good to go now
|
||||
m_siteNumInlinksValid = true;
|
||||
return &m_siteNumInlinks;
|
||||
|
||||
// . consult our sitelinks.txt file
|
||||
// . returns -1 if not found
|
||||
goto updateToMin;
|
||||
}
|
||||
|
||||
// set status. we can time status changes with this routine!
|
||||
@ -12845,7 +12856,7 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
|
||||
m_updatingSiteLinkInfoTags = true;
|
||||
|
||||
// we need to re-get both if either is NULL
|
||||
LinkInfo *sinfo = getSiteLinkInfo();
|
||||
sinfo = getSiteLinkInfo();
|
||||
// block or error?
|
||||
if ( ! sinfo || sinfo == (LinkInfo *)-1) return (int32_t *)sinfo;
|
||||
|
||||
@ -12859,7 +12870,7 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
|
||||
//Links *links = getLinks ();
|
||||
//if ( ! links || links == (Links *)-1 ) return (int32_t *)links;
|
||||
|
||||
char *mysite = getSite();
|
||||
mysite = getSite();
|
||||
if ( ! mysite || mysite == (void *)-1 ) return (int32_t *)mysite;
|
||||
|
||||
setStatus ( "adding site info tags to tagdb 1");
|
||||
@ -12881,6 +12892,45 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
|
||||
m_siteNumInlinksUniqueCBlockValid = true;
|
||||
m_siteNumInlinksTotalValid = true;
|
||||
|
||||
|
||||
updateToMin:
|
||||
|
||||
// . consult our sitelinks.txt file
|
||||
// . returns -1 if not found
|
||||
int32_t hostHash32 = getHostHash32a();
|
||||
int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
|
||||
|
||||
// try with www if not there
|
||||
if ( min < 0 && ! m_firstUrl.hasSubdomain() ) {
|
||||
int32_t wwwHash32 = m_firstUrl.getHash32WithWWW();
|
||||
min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
|
||||
}
|
||||
|
||||
if ( min >= 0 ) {
|
||||
if ( m_siteNumInlinks < min ||
|
||||
! m_siteNumInlinksValid ) {
|
||||
m_siteNumInlinks = min;
|
||||
m_siteNumInlinksValid = true;
|
||||
}
|
||||
// if ( ! m_siteNumInlinksUniqueIpValid ||
|
||||
// m_siteNumInlinksUniqueIp < min ) {
|
||||
// m_siteNumInlinksUniqueIp = min;
|
||||
// m_siteNumInlinksUniqueIpValid = true;
|
||||
// }
|
||||
// if ( ! m_siteNumInlinksUniqueCBlockValid ||
|
||||
// m_siteNumInlinksUniqueCBlock < min ) {
|
||||
// m_siteNumInlinksUniqueCBlock = min;
|
||||
// m_siteNumInlinksUniqueCBlockValid = true;
|
||||
// }
|
||||
// if ( ! m_siteNumInlinksTotalValid ||
|
||||
// m_siteNumInlinksTotal < min ) {
|
||||
// m_siteNumInlinksTotal = min;
|
||||
// m_siteNumInlinksTotalValid = true;
|
||||
// }
|
||||
}
|
||||
|
||||
|
||||
|
||||
// deal with it
|
||||
return &m_siteNumInlinks;
|
||||
}
|
||||
@ -19868,10 +19918,10 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
|
||||
|
||||
if ( m_siteNumInlinksValid ) {
|
||||
sb->safePrintf("siteinlinks=%04"INT32" ",m_siteNumInlinks );
|
||||
sb->safePrintf("siteipinlinks=%"INT32" ",
|
||||
m_siteNumInlinksUniqueIp);
|
||||
sb->safePrintf("sitecblockinlinks=%"INT32" ",
|
||||
m_siteNumInlinksUniqueCBlock);
|
||||
// sb->safePrintf("siteipinlinks=%"INT32" ",
|
||||
// m_siteNumInlinksUniqueIp);
|
||||
// sb->safePrintf("sitecblockinlinks=%"INT32" ",
|
||||
// m_siteNumInlinksUniqueCBlock);
|
||||
int32_t sr = ::getSiteRank ( m_siteNumInlinks );
|
||||
sb->safePrintf("siterank=%"INT32" ", sr );
|
||||
}
|
||||
@ -25171,6 +25221,21 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
|
||||
Tag *st = NULL;
|
||||
if ( gr ) st = gr->getTag ("sitenuminlinks");
|
||||
if ( st ) ksni = atol(st->getTagData());
|
||||
|
||||
int32_t hostHash32 = url.getHostHash32();
|
||||
// . consult our sitelinks.txt file
|
||||
// . returns -1 if not found
|
||||
int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
|
||||
|
||||
// try with www if not there
|
||||
if ( min < 0 && ! url.hasSubdomain() ) {
|
||||
int32_t wwwHash32 = url.getHash32WithWWW();
|
||||
min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
|
||||
}
|
||||
|
||||
if ( min >= 0 && ksni < min )
|
||||
ksni = min;
|
||||
|
||||
//if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
||||
//int32_t ksni = m_siteNumInlinks;
|
||||
|
||||
@ -25188,7 +25253,6 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
|
||||
|
||||
// get it quick
|
||||
bool ispingserver = url.isPingServer();
|
||||
int32_t hostHash32 = url.getHostHash32();
|
||||
int32_t domHash32 = url.getDomainHash32();
|
||||
|
||||
// is link rss?
|
||||
|
6
XmlDoc.h
6
XmlDoc.h
@ -641,9 +641,9 @@ class XmlDoc {
|
||||
//class Url *getAboutUsLink ( ) ;
|
||||
int32_t *getFirstIp ( ) ;
|
||||
bool *updateFirstIp ( ) ;
|
||||
int32_t *getSiteNumInlinksUniqueIp ( ) ;
|
||||
int32_t *getSiteNumInlinksUniqueCBlock ( ) ;
|
||||
int32_t *getSiteNumInlinksTotal ( );
|
||||
//int32_t *getSiteNumInlinksUniqueIp ( ) ;
|
||||
//int32_t *getSiteNumInlinksUniqueCBlock ( ) ;
|
||||
//int32_t *getSiteNumInlinksTotal ( );
|
||||
//int32_t *getSiteNumInlinksFresh ( ) ;
|
||||
//int32_t *getSitePop ( ) ;
|
||||
uint8_t *getSiteNumInlinks8 () ;
|
||||
|
67
main.cpp
67
main.cpp
@ -3609,6 +3609,10 @@ int main2 ( int argc , char *argv[] ) {
|
||||
return 1;
|
||||
}
|
||||
*/
|
||||
|
||||
// init minsitenuminlinks buffer
|
||||
g_tagdb.loadMinSiteInlinksBuffer();
|
||||
|
||||
// . then our main udp server
|
||||
// . must pass defaults since g_dns uses it's own port/instance of it
|
||||
// . server should listen to a socket and register with g_loop
|
||||
@ -11962,7 +11966,7 @@ void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,
|
||||
char sbuf[1024*2];
|
||||
int32_t siteNumInlinks = -1;
|
||||
int32_t typeSite = hash64Lower_a("site",4);
|
||||
int32_t typeInlinks = hash64Lower_a("sitenuminlinksuniquecblock",26);
|
||||
int32_t typeInlinks = hash64Lower_a("sitenuminlinks",14);
|
||||
|
||||
loop:
|
||||
// use msg5 to get the list, should ALWAYS block since no threads
|
||||
@ -12035,25 +12039,71 @@ void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,
|
||||
char tmpBuf[1024];
|
||||
SafeBuf sb(tmpBuf, 1024);
|
||||
|
||||
bool match = false;
|
||||
|
||||
hostHash = tag->m_key.n1;
|
||||
|
||||
if ( hostHash == lastHostHash ) {
|
||||
match = true;
|
||||
}
|
||||
else {
|
||||
site = NULL;
|
||||
siteNumInlinks = -1;
|
||||
}
|
||||
|
||||
lastHostHash = hostHash;
|
||||
|
||||
// if ( hostHash == 3079740012919792457LL )
|
||||
// log("hey");
|
||||
|
||||
// making sitelist.txt?
|
||||
if ( tag->m_type == typeSite ) {
|
||||
hostHash = tag->m_key.n1;
|
||||
if ( tag->m_type == typeSite && req == 'z' ) {
|
||||
site = tag->getTagData();
|
||||
// make it null if too many .'s
|
||||
if ( site ) {
|
||||
char *p = site;
|
||||
int count = 0;
|
||||
int alpha = 0;
|
||||
int colons = 0;
|
||||
// foo.bar.baz.com is ok
|
||||
for ( ; *p ; p++ )
|
||||
for ( ; *p ; p++ ) {
|
||||
if ( *p == '.' ) count++;
|
||||
if ( *p == ':' ) colons++;
|
||||
if ( is_alpha_a(*p) || *p=='-' )
|
||||
alpha++;
|
||||
}
|
||||
if ( count >= 4 )
|
||||
site = NULL;
|
||||
if ( colons > 1 )
|
||||
site = NULL;
|
||||
// no ip addresses allowed, need an alpha char
|
||||
if ( alpha == 0 )
|
||||
site = NULL;
|
||||
}
|
||||
// ends in :?
|
||||
int slen = 0;
|
||||
if ( site ) slen = gbstrlen(site);
|
||||
if ( site && site[slen-1] == ':' )
|
||||
site = NULL;
|
||||
// port bug
|
||||
if ( site && site[slen-2] == ':' && site[slen-1]=='/')
|
||||
site = NULL;
|
||||
// remove heavy spammers to save space
|
||||
if ( site && strstr(site,"daily-camshow-report") )
|
||||
site = NULL;
|
||||
if ( site && strstr(site,".livejasminhd.") )
|
||||
site = NULL;
|
||||
if ( site && strstr(site,".pornlivenews.") )
|
||||
site = NULL;
|
||||
if ( site && strstr(site,".isapornblog.") )
|
||||
site = NULL;
|
||||
if ( site && strstr(site,".teen-model-24.") )
|
||||
site = NULL;
|
||||
if ( site && ! is_ascii2_a ( site, gbstrlen(site) ) ) {
|
||||
site = NULL;
|
||||
continue;
|
||||
}
|
||||
if ( lastHostHash == hostHash && siteNumInlinks>=0) {
|
||||
if ( match && siteNumInlinks>=0) {
|
||||
// if we ask for 1 or 2 we end up with 100M
|
||||
// entries, but with 3+ we get 27M
|
||||
if ( siteNumInlinks > 2 && site )
|
||||
@ -12063,14 +12113,12 @@ void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,
|
||||
}
|
||||
// save it
|
||||
if ( site ) strcpy ( sbuf , site );
|
||||
lastHostHash = hostHash;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( tag->m_type == typeInlinks ) {
|
||||
hostHash = tag->m_key.n1;
|
||||
if ( tag->m_type == typeInlinks && req == 'z' ) {
|
||||
siteNumInlinks = atoi(tag->getTagData());
|
||||
if ( lastHostHash == hostHash && site ) {
|
||||
if ( match && site ) {
|
||||
// if we ask for 1 or 2 we end up with 100M
|
||||
// entries, but with 3+ we get 27M
|
||||
if ( siteNumInlinks > 2 )
|
||||
@ -12078,7 +12126,6 @@ void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,
|
||||
siteNumInlinks = -1;
|
||||
site = NULL;
|
||||
}
|
||||
lastHostHash = hostHash;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
2677669
sitelinks.txt
Normal file
2677669
sitelinks.txt
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user