Trash files of length 0 that are holding up a merge.
If we can't merge files we end up stockpiling them and things get slow fast.
This commit is contained in:
parent dfca68ec46
commit 39e621f655

RdbBase.cpp: 25 lines changed
@@ -762,15 +762,30 @@ int32_t RdbBase::addFile ( int32_t id , bool isNew , int32_t mergeNum ,
 	f->set ( getDir() , name , NULL ); // getStripeDir() );
-	// if new insure does not exist
+	// if new ensure does not exist
 	if ( isNew && f->doesExist() ) {
 		log("rdb: creating NEW file %s/%s which already exists!",
 		    f->getDir(),
 		    f->getFilename());
-		mdelete ( f , sizeof(BigFile),"RdbBFile");
-		delete (f);
-		return -1;
-		char *xx=NULL;*xx=0;
+		// if length is NOT 0 then that sucks, just return
+		if ( f->getFileSize() != 0 ) {
+			mdelete ( f , sizeof(BigFile),"RdbBFile");
+			delete (f);
+			return -1;
+			char *xx=NULL;*xx=0;
+		}
+		// otherwise, move it to the trash
+		SafeBuf cmd;
+		cmd.safePrintf("mv %s/%s %s/trash/",
+			       f->getDir(),
+			       f->getFilename(),
+			       g_hostdb.m_dir);
+		log("rdb: %s",cmd.getBufStart() );
+		gbsystem ( cmd.getBufStart() );
+		// ok, now re-set it since it is no longer there
+		f->reset();
+		// and set it again
+		f->set ( getDir() , name , NULL );
 	}
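In short: addFile() used to bail out whenever a supposedly new file already existed on disk; now a zero-length leftover (typically debris from an interrupted merge) is moved into the trash directory and the BigFile is re-set, so the merge can proceed. Below is a minimal standalone sketch of the same quarantine idea using std::filesystem in place of SafeBuf/gbsystem("mv ..."); the name quarantineIfEmpty and the trashDir argument are illustrative, not part of the commit.

#include <filesystem>
#include <iostream>
#include <system_error>

namespace fs = std::filesystem;

// Hypothetical helper: if "path" exists and is empty, move it into
// "trashDir" so a merge can proceed; returns false if the file exists
// with real data (the caller should then bail, as addFile() does).
static bool quarantineIfEmpty ( const fs::path &path ,
                                const fs::path &trashDir ) {
	std::error_code ec;
	if ( ! fs::exists ( path , ec ) ) return true; // nothing to do
	// a non-empty file is real data; do not touch it
	if ( fs::file_size ( path , ec ) != 0 ) return false;
	fs::create_directories ( trashDir , ec );
	// rename() is atomic on the same filesystem, unlike "mv" run
	// through a shell
	fs::rename ( path , trashDir / path.filename() , ec );
	if ( ec ) std::cerr << "rdb: " << ec.message() << "\n";
	return ! ec;
}

int main () {
	if ( ! quarantineIfEmpty ( "coll.main.0/spiderdb0001.dat" ,
	                           "trash" ) )
		std::cerr << "rdb: file exists and is non-empty\n";
	return 0;
}

Using rename() rather than shelling out also sidesteps the quoting and error-reporting weaknesses of building an "mv" command line from file names.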
@@ -770,8 +770,8 @@ bool RdbList::checkList_r ( bool removeNegRecs , bool sleepOnProblem ,
 	// bad url in spider request?
 	if ( g_spiderdb.isSpiderRequest ( (key128_t *)rec ) ){
 		SpiderRequest *sr = (SpiderRequest *)rec;
 		if ( strncmp(sr->m_url,"http",4) != 0 ) {
 			log("db: spider req url");
 			if ( sr->isCorrupt() ) {
 				log("db: spider req corrupt");
 				char *xx=NULL;*xx=0;
 			}
 		}
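For context, the check above only hard-asserts on a non-"http" URL when isCorrupt() agrees, because a docid-based request legitimately stores a bare docid (all digits) instead of a URL; see the SpiderRequest::isCorrupt() hunk at the bottom of this commit. A standalone sketch of that classification with hypothetical names (looksCorrupt; the two flags mirror m_isPageReindex/m_urlIsDocId):

#include <cctype>
#include <cstdio>
#include <cstring>

// Hypothetical mirror of SpiderRequest::isCorrupt()'s URL test: a url
// is acceptable if it starts with "http", or if it is in docid form
// (all digits) and the request is flagged as a page reindex or a
// docid-based url.
static bool looksCorrupt ( const char *url ,
                           bool isPageReindex ,
                           bool urlIsDocId ) {
	if ( strncmp ( url , "http" , 4 ) == 0 ) return false;
	// to be a docid-as-url one of these flags must be set
	if ( ! isPageReindex && ! urlIsDocId ) return true;
	// might be a docid from a page reindex
	if ( ! isdigit ( (unsigned char)url[0] ) ) return true;
	return false;
}

int main ( ) {
	printf ( "%d %d\n" ,
	         looksCorrupt ( "http://example.com/" , false , false ) ,
	         looksCorrupt ( "12345" , true , false ) ); // 0 0
	return 0;
}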
Spider.cpp: 266 lines changed
@@ -13599,9 +13599,31 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
 	int64_t reqUh48 = 0LL;
 	int64_t repUh48 = 0LL;
 	SpiderReply *oldRep = NULL;
-	SpiderRequest *oldReq = NULL;
+	//SpiderRequest *oldReq = NULL;
 	char *lastKey = NULL;
 	char *prevLastKey = NULL;

 	int32_t oldSize = list->m_listSize;
+	int32_t corrupt = 0;
 	// debug
 	// static int32_t s_count = 0;
 	// s_count++;
 	// if ( s_count == 2524 )
 	//	log("gotit");

+	int32_t numToFilter = 0;
+
+	class Link {
+	public:
+		uint32_t m_srh;
+		SpiderRequest *m_sreq;
+		class Link *m_prev;
+		class Link *m_next;
+	};
+#define MAXLINKS 30
+	Link *headLink = NULL;
+	Link *tailLink = NULL;
+	Link links[MAXLINKS];
+	int32_t numLinks = 0;

 	// save list ptr in case of re-read?
 	//char *saved = list->m_listPtr;
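The Link machinery above replaces the single-request lookback (the now-commented oldReq) with a bounded most-recently-used window: up to MAXLINKS recent requests for the current URL, keyed by a hash of their routing-relevant parameters (m_srh) and threaded through an intrusive doubly linked list over a fixed array. A self-contained sketch of that structure under illustrative names (MruWindow, Node, promote):

#include <cstdint>
#include <cstdio>

// Nodes live in a flat array, like the Link array in
// dedupSpiderdbList(); an intrusive doubly linked list keeps MRU
// order, and once the array is full the tail node is recycled.
struct Node {
	uint32_t key;   // like Link::m_srh, a hash of the request's flags
	Node    *prev;
	Node    *next;
};

template < int CAP >
class MruWindow {
	Node  m_slab [ CAP ];
	Node *m_head = nullptr;
	Node *m_tail = nullptr;
	int   m_used = 0;
public:
	// linear scan, same as the for(;link;link=link->m_next) loop
	Node *find ( uint32_t key ) {
		for ( Node *n = m_head ; n ; n = n->next )
			if ( n->key == key ) return n;
		return nullptr;
	}
	// take a free slot, or recycle the tail when full
	Node *getSlot ( ) {
		if ( m_used < CAP ) {
			Node *n = &m_slab[m_used++];
			n->prev = n->next = nullptr;
			if ( m_used == 1 ) { m_head = m_tail = n; }
			return n;
		}
		return m_tail;
	}
	// splice a node out and re-link it at the head, like the
	// promoteLinkToHead block further down in this commit
	void promote ( Node *n ) {
		if ( n == m_head ) return;
		if ( n == m_tail ) m_tail = n->prev;
		if ( n->prev ) n->prev->next = n->next;
		if ( n->next ) n->next->prev = n->prev;
		m_head->prev = n;
		n->next = m_head;
		n->prev = nullptr;
		m_head  = n;
	}
};

int main ( ) {
	MruWindow<30> w;
	Node *n = w.getSlot();
	n->key = 0xb714d3a3;
	w.promote ( n );
	printf ( "cached: %p\n" , (void *)w.find ( 0xb714d3a3 ) );
	return 0;
}

Allocating nodes in a flat array keeps the hot dedup path free of malloc, and promoting on every hit keeps frequently seen flag combinations near the head of the linear scan.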
@@ -13624,10 +13646,8 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
 			log("spider: filter got negative key");
 			char *xx=NULL;*xx=0;
 		}
-		// save this
-		prevLastKey = lastKey;
-		lastKey = dst;
+		// otherwise, keep it
+		lastKey = dst;
 		memmove ( dst , rec , sizeof(key128_t) );
 		dst += sizeof(key128_t);
 		continue;
@@ -13655,15 +13675,13 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
 			continue;
 			// otherwise, erase him
 			dst = restorePoint;
 			lastKey = prevLastKey;
 		}
 		// save in case we get erased
 		restorePoint = dst;
 		prevLastKey = lastKey;
 		lastKey = dst;
 		// get our size
 		int32_t recSize = srep->getRecSize();
 		// and add us
 		lastKey = dst;
 		memmove ( dst , rec , recSize );
 		// advance
 		dst += recSize;
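The restorePoint/prevLastKey pair above implements one level of undo for the in-place compaction: before each record is appended at dst, the cursor and last-key pointers are snapshotted so the append can be retracted if the next record supersedes it. A compact sketch of the pattern, with illustrative names:

#include <cstdio>
#include <cstring>

// Records are compacted forward into the same buffer; before each
// append we snapshot the write cursor so the append can be undone if
// a better record for the same key shows up next.
struct Writer {
	char *dst;           // current write cursor
	char *restorePoint;  // dst before the last append
	char *lastKey;       // start of the last record appended
	char *prevLastKey;   // lastKey before the last append

	void append ( const char *rec , int size ) {
		restorePoint = dst;      // save in case we get erased
		prevLastKey  = lastKey;
		lastKey      = dst;
		memmove ( dst , rec , size );
		dst += size;
	}
	void undoLastAppend ( ) {
		dst     = restorePoint;  // "erase him"
		lastKey = prevLastKey;
	}
};

int main ( ) {
	char buf[64];
	Writer w = { buf , buf , nullptr , nullptr };
	w.append ( "AAAA" , 4 );
	w.undoLastAppend ( );    // a better record superseded it
	w.append ( "BBBB" , 4 );
	printf ( "%.*s\n" , (int)( w.dst - buf ) , buf ); // prints BBBB
	return 0;
}

Note the single level of history: like the original, this can only retract the most recent append, which is all the dedup scan needs since records for one key arrive adjacently.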
@@ -13677,6 +13695,12 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
 		// shortcut
 		SpiderRequest *sreq = (SpiderRequest *)rec;

+		// might as well filter out corruption
+		if ( sreq->isCorrupt() ) {
+			corrupt += sreq->getRecSize();
+			continue;
+		}
+
 		// shortcut
 		int64_t uh48 = sreq->getUrlHash48();
@@ -13714,26 +13738,15 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
 			//sreq->m_hasSiteVenue = old->m_hasSiteVenue;
 		}

-		// if we are not the same url as last request, add it
+		// if we are not the same url as last request, then
+		// we will not need to dedup, but should add ourselves to
+		// the linked list, which we also reset here.
 		if ( uh48 != reqUh48 ) {
-			// a nice hook in
-		addIt:
-			// save in case we get erased
-			restorePoint = dst;
-			prevLastKey = lastKey;
-			// get our size
-			int32_t recSize = sreq->getRecSize();
-			// save this
-			lastKey = dst;
-			// and add us
-			memmove ( dst , rec , recSize );
-			// advance
-			dst += recSize;
-			// update this crap for comparing to next reply
-			reqUh48 = uh48;
-			oldReq = sreq;
-			// get next spiderdb record
-			continue;
+			numLinks = 0;
+			headLink = NULL;
+			tailLink = NULL;
+			// we are the new banner carrier
+			reqUh48 = uh48;
 		}

 		// try to kinda grab the min hop count as well
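Spiderdb lists are sorted, so all records sharing a 48-bit URL hash are adjacent; the hunk above exploits that by resetting the per-URL window (numLinks, headLink, tailLink) whenever uh48 changes. A toy sketch of this group-by-adjacency pattern, with a plain counter standing in for the window state:

#include <cstdint>
#include <cstdio>

// Records for one url (same 48-bit url hash) arrive adjacently, so
// per-url dedup state (here just a counter; in the commit the Link
// window) can be reset whenever the hash changes.
int main ( ) {
	int64_t uh48s[]  = { 7 , 7 , 7 , 9 , 9 , 12 };
	int64_t reqUh48  = 0;   // hash of the current group
	int     groupCnt = 0;   // stand-in for numLinks/headLink/tailLink
	for ( int64_t uh48 : uh48s ) {
		if ( uh48 != reqUh48 ) {
			groupCnt = 0;     // reset per-url dedup state
			reqUh48  = uh48;  // "the new banner carrier"
		}
		groupCnt++;
		printf ( "uh48=%lld member#%d\n" ,
		         (long long)uh48 , groupCnt );
	}
	return 0;
}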
@@ -13745,9 +13758,140 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
 		//	oldReq->m_hopCount = sreq->m_hopCount;
 		// }

+		// why does sitehash32 matter really?
+		uint32_t srh = sreq->m_siteHash32;
+		if ( sreq->m_isNewOutlink       ) srh ^= 0xb714d3a3;
+		if ( sreq->m_isInjecting        ) srh ^= 0x42538909;
+		if ( sreq->m_hasContent         ) srh ^= 0xbbbefd59;
+		if ( sreq->m_isAddUrl           ) srh ^= 0x587c5a0b;
+		if ( sreq->m_isPageReindex      ) srh ^= 0x70fb3911;
+		if ( sreq->m_forceDelete        ) srh ^= 0x4e6e9aee;
+
+		if ( sreq->m_parentIsSiteMap    ) srh ^= 0xe0c20e3f;
+		if ( sreq->m_urlIsDocId         ) srh ^= 0xee015b07;
+		if ( sreq->m_fakeFirstIp        ) srh ^= 0x95b8d376;
+		if ( sreq->m_parentIsRSS        ) srh ^= 0xb08c7545;
+		if ( sreq->m_parentIsPermalink  ) srh ^= 0xbd688268;
+		if ( sreq->m_parentIsPingServer ) srh ^= 0xb4c8a811;
+		if ( sreq->m_isMenuOutlink      ) srh ^= 0xd97bb80b;
+
+		// we may assign url filter priority based on parent langid
+		srh ^= (uint32_t)g_hashtab[0][(uint8_t)sreq->m_parentLangId];
+
+		// if he's essentially different input parms but for the
+		// same url, we want to keep him because he might map the
+		// url to a different url priority!
+		bool skipUs = false;
+		Link *myLink = NULL;
+		Link *link = headLink;
+
+		// debug point. should be deduped by
+		// if ( sreq->m_key.n0==7199823231990374913LL &&
+		//      sreq->m_key.n1==6511615362168588088 )
+		//	log("hey1");
+		// if ( sreq->m_key.n0==7199823542662487041LL &&
+		//      sreq->m_key.n1==6511615362168588088 )
+		//	log("hey2");
+
+		// now we keep a list of the last ten
+		for ( ; link ; link = link->m_next ) {
+			if ( srh != link->m_srh ) continue;
+			SpiderRequest *prevReq = link->m_sreq;
+			// if we are better, replace him and stop
+			if ( sreq->m_hopCount < prevReq->m_hopCount )
+				goto replacePrevReq;
+			// skip us if previous guy is better
+			if ( sreq->m_hopCount > prevReq->m_hopCount ) {
+				skipUs = true;
+				break;
+			}
+
+			// TODO: for pro, base on parentSiteNumInlinks here,
+			// and hash hopcounts, but only 0,1,2,3. use 3
+			// for all that are >=3. we can also have two hashes,
+			// m_srh and m_srh2 in the Link class, and if your
+			// new secondary hash is unique we can let you in
+			// if your parentpageinlinks is the highest of all.
+
+			// resort to added time if hopcount is tied
+			// . if the same check who has the most recent addedtime
+			// . if we are not the most recent, just do not add us
+			// . no, now i want the oldest so we can do
+			//   gbssDiscoveryTime and set sreq->m_discoveryTime
+			//   accurately, above
+			if ( sreq->m_addedTime >= prevReq->m_addedTime ) {
+				skipUs = true;
+				break;
+			}
+			// otherwise, replace him
+		replacePrevReq:
+			if ( prevReq->m_url[0] != 'h' ) { char *xx=NULL;*xx=0;}
+			prevReq->m_url[0] = 'x'; // mark for removal. xttp://
+			myLink = link;
+			// make a note of this so we physically remove these
+			// entries after we are done with this scan.
+			numToFilter++;
+			goto promoteLinkToHead;
+		}
+		// if we were not as good as someone that was basically the
+		// same SpiderRequest before us, keep going
+		if ( skipUs )
+			continue;
+
+		// add to linked list
+		if ( numLinks < MAXLINKS ) {
+			myLink = &links[numLinks++];
+			myLink->m_prev = NULL;
+			myLink->m_next = NULL;
+			// if first one, we are head and tail
+			if ( numLinks == 1 ) {
+				headLink = myLink;
+				tailLink = myLink;
+			}
+		}
+		// if full, just supplant the tail link
+		else
+			myLink = tailLink;
+
+	promoteLinkToHead:
+
+		myLink->m_srh = srh;
+		myLink->m_sreq = (SpiderRequest *)dst;//sreq;
+
+		// move link to head if not already
+		if ( myLink != headLink ) {
+			// if we are the tail, there will be a new tail
+			if ( myLink == tailLink ) tailLink = myLink->m_prev;
+			// make previous link point over us
+			if ( myLink->m_prev )
+				myLink->m_prev->m_next = myLink->m_next;
+			// make next link ptr point backward over us
+			if ( myLink->m_next )
+				myLink->m_next->m_prev = myLink->m_prev;
+			// make current head point backward to us
+			headLink->m_prev = myLink;
+			// and we point forward to him
+			myLink->m_next = headLink;
+			// and backward to nobody
+			myLink->m_prev = NULL;
+			// and we are the head now
+			headLink = myLink;
+		}
+
+		// get our size
+		int32_t recSize = sreq->getRecSize();
+
+		// and add us
+		lastKey = dst;
+		memmove ( dst , rec , recSize );
+		// advance
+		dst += recSize;
+
+		// get next spiderdb record
+		continue;
+
+		/*
 		if ( oldReq->m_siteHash32 != sreq->m_siteHash32 ||
 		     oldReq->m_isNewOutlink != sreq->m_isNewOutlink ||
 		// use hopcount now too!
@@ -13771,11 +13915,14 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
 		// . no, now i want the oldest so we can do gbssDiscoveryTime
 		//   and set sreq->m_discoveryTime accurately, above
 		if ( sreq->m_addedTime >= oldReq->m_addedTime ) continue;

 		// otherwise, erase over him
 		dst = restorePoint;
 		lastKey = prevLastKey;
 		// and add us over top of him
 		goto addIt;
+		*/

 	}

 	// free the old list
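The srh hash computed in the @@ -13745 hunk above folds every parameter that can influence url-filter classification into one 32-bit value by XORing a distinct constant per set flag, so two requests for the same URL dedup against each other only when all those inputs match. A small sketch of the technique; the struct, the flag subset, and the langid mixing (a multiplicative scramble standing in for g_hashtab) are illustrative:

#include <cstdint>
#include <cstdio>

// Start from the site hash and XOR in a distinct constant for every
// flag that can change how the url-filters classify the request.
struct Req {
	uint32_t siteHash32;
	bool     isNewOutlink;
	bool     isInjecting;
	bool     hasContent;
	uint8_t  parentLangId;
};

static uint32_t paramHash ( const Req &r ) {
	uint32_t h = r.siteHash32;
	if ( r.isNewOutlink ) h ^= 0xb714d3a3;
	if ( r.isInjecting  ) h ^= 0x42538909;
	if ( r.hasContent   ) h ^= 0xbbbefd59;
	// mix in parent language id so priority-by-langid still works;
	// a multiplicative scramble stands in for g_hashtab here
	h ^= (uint32_t)( r.parentLangId * 2654435761u );
	return h;
}

int main ( ) {
	Req a = { 12345u , false , false , false , 1 };
	Req b = a;
	b.isInjecting = true; // differs in one parm -> different hash
	printf ( "%08x vs %08x\n" , paramHash ( a ) , paramHash ( b ) );
	return 0;
}

XOR-folding is order-independent and cheap, at the cost that two *different* flag combinations could in principle collide; the constants are chosen to make that unlikely within one URL's small window.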
@@ -13786,6 +13933,57 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
 	if ( dst < list->m_list || dst > list->m_list + list->m_listSize ) {
 		char *xx=NULL;*xx=0; }

+	/////////
+	//
+	// now remove xttp:// urls if we had some
+	//
+	/////////
+	if ( numToFilter > 0 ) {
+		// update list so for-loop below works
+		list->m_listSize = dst - newList;
+		list->m_listPtr = newList;//dst;
+		list->m_listEnd = list->m_list + list->m_listSize;
+		list->m_listPtrHi = NULL;
+		// and we'll re-write everything back into itself at "dst"
+		dst = newList;
+	}
+	for ( ; ! list->isExhausted() ; ) {
+		// breathe. NO! assume in thread!!
+		//QUICKPOLL(niceness);
+		// get rec
+		char *rec = list->getCurrentRec();
+		// pre-skip it
+		list->skipCurrentRec();
+		// skip if negative, just copy over
+		if ( ( rec[0] & 0x01 ) == 0x00 ) {
+			lastKey = dst;
+			memmove ( dst , rec , sizeof(key128_t) );
+			dst += sizeof(key128_t);
+			continue;
+		}
+		// is it a reply?
+		if ( g_spiderdb.isSpiderReply ( (key128_t *)rec ) ) {
+			SpiderReply *srep = (SpiderReply *)rec;
+			int32_t recSize = srep->getRecSize();
+			lastKey = dst;
+			memmove ( dst , rec , recSize );
+			dst += recSize;
+			continue;
+		}
+		SpiderRequest *sreq = (SpiderRequest *)rec;
+		// skip if filtered out
+		if ( sreq->m_url[0] == 'x' )
+			continue;
+		int32_t recSize = sreq->getRecSize();
+		lastKey = dst;
+		memmove ( dst , rec , recSize );
+		dst += recSize;
+		// if ( sreq->getUrlHash48() == 49553538838LL )
+		//	log("napkins");
+	}

 	// and stick our newly filtered list in there
 	//list->m_list = newList;
 	list->m_listSize = dst - newList;
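The pass above is the sweep half of a mark-and-sweep: replacePrevReq marked doomed requests by overwriting m_url[0] with 'x' ("xttp://"), and this second walk compacts the survivors to the front of the same buffer. A minimal sketch using fixed-size records (spiderdb records are variable-size):

#include <cstdio>
#include <cstring>

// Pass one mutated doomed records in place (first byte -> 'x');
// pass two re-walks the buffer and compacts survivors forward.
const int RECSIZE = 4;

static int compact ( char *buf , int nrecs ) {
	char *dst = buf;
	for ( int i = 0 ; i < nrecs ; i++ ) {
		char *rec = buf + i * RECSIZE;
		if ( rec[0] == 'x' ) continue;      // marked: drop it
		memmove ( dst , rec , RECSIZE );    // survivor: keep it
		dst += RECSIZE;
	}
	return (int)( dst - buf ) / RECSIZE;
}

int main ( ) {
	char buf[] = "aaaabbbbcccc";
	buf[4] = 'x';                    // mark the middle record
	int n = compact ( buf , 3 );
	printf ( "%d recs: %.*s\n" , n , n * RECSIZE , buf ); // 2 recs
	return 0;
}

Marking in place is what makes the earlier replacement cheap: a superseded request may already have been copied past the restore point, so rather than tracking multiple undo levels the scan just poisons its URL and defers the physical removal to this single extra pass.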
@@ -13797,6 +13995,15 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
 	list->m_listPtrHi = NULL;
 	//KEYSET(list->m_lastKey,lastKey,list->m_ks);

+	// log("spiderdb: remove ME!!!");
+	// // check it
+	// list->checkList_r(false,false,RDB_SPIDERDB);
+	// list->resetListPtr();
+
+	int32_t delta = oldSize - list->m_listSize;
+	log("spider: deduped %i bytes (of which %i were corrupted) "
+	    "out of %i",(int)delta,(int)corrupt,(int)oldSize);
+
+	if ( lastKey ) KEYSET(list->m_lastKey,lastKey,list->m_ks);

 	//mfree ( oldbuf , oldSize, "oldspbuf");
@@ -15019,6 +15226,11 @@ bool SpiderRequest::isCorrupt ( ) {
 	if ( m_url[0] == 'h' && m_url[1]=='t' && m_url[2]=='t' &&
 	     m_url[3] == 'p' )
 		return false;
+	// to be a docid as url must have this set
+	if ( ! m_isPageReindex && ! m_urlIsDocId ) {
+		log("spider: got corrupt 3 spiderRequest");
+		return true;
+	}
 	// might be a docid from a pagereindex.cpp
 	if ( ! is_digit(m_url[0]) ) {
 		log("spider: got corrupt 1 spiderRequest");