Merge branch 'diffbot-testing' into testing

This commit is contained in:
mwells
2014-07-01 08:46:04 -06:00
7 changed files with 122 additions and 43 deletions

@ -829,7 +829,7 @@ bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
sc->m_cr = NULL;
// this will put it on "death row" so it will be deleted
// once Msg5::m_waitingForList/Merge is NULL
tryToDeleteSpiderColl ( sc );
tryToDeleteSpiderColl ( sc ,"10");
//mdelete ( sc, sizeof(SpiderColl),"nukecr2");
//delete ( sc );
cr->m_spiderColl = NULL;
@ -837,7 +837,7 @@ bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
// the bulk urls file too i guess
if ( cr->m_isCustomCrawl == 2 ) {
if ( cr->m_isCustomCrawl == 2 && g_hostdb.m_hostId == 0 ) {
SafeBuf bu;
bu.safePrintf("%sbulkurls-%s.txt",
g_hostdb.m_dir , cr->m_coll );
@ -1123,7 +1123,7 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
//sc->reset();
// this will put it on "death row" so it will be deleted
// once Msg5::m_waitingForList/Merge is NULL
tryToDeleteSpiderColl ( sc );
tryToDeleteSpiderColl ( sc,"11" );
//mdelete ( sc, sizeof(SpiderColl),"nukecr2");
//delete ( sc );
cr->m_spiderColl = NULL;
@ -1526,6 +1526,7 @@ static CollectionRec g_default;
CollectionRec::CollectionRec() {
m_collnum = -1;
m_coll[0] = '\0';
m_updateRoundNum = 0;
//m_numSearchPwds = 0;
//m_numBanIps = 0;
//m_numSearchIps = 0;
@ -1655,7 +1656,7 @@ void CollectionRec::reset() {
sc->m_deleteMyself = true;
// if not currently being accessed nuke it now
tryToDeleteSpiderColl ( sc );
tryToDeleteSpiderColl ( sc ,"12");
// if ( ! sc->m_msg5.m_waitingForList &&
// ! sc->m_msg5b.m_waitingForList &&

@ -78,6 +78,8 @@ ifeq ("titan","$(HOST)")
# the way it works is not even possible on newer kernels because they no longer
# allow you to override the _errno_location() function. -- matt
# -DMATTWELLS
# turn off stack smash detection because it won't save and dump core when
# stack gets smashed like it normally would when it gets a seg fault signal.
CPPFLAGS = -m32 -g -Wall -pipe -Wno-write-strings -Wstrict-aliasing=0 -Wno-uninitialized -static -DTITAN
LIBS = ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a
else
@ -90,7 +92,7 @@ else
# trying to use good ole' clone() again because it seems the errno location
# thing is fixed by just ignoring it.
#
CPPFLAGS = -m32 -g -Wall -pipe -Wno-write-strings -Wstrict-aliasing=0 -Wno-uninitialized -static -DPTHREADS -Wno-unused-but-set-variable
CPPFLAGS = -m32 -g -Wall -pipe -fno-stack-protector -Wno-write-strings -Wstrict-aliasing=0 -Wno-uninitialized -static -DPTHREADS -Wno-unused-but-set-variable
LIBS= -L. ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a ./libstdc++.a -lpthread
# use this for compiling on CYGWIN: (only for 32bit cygwin right now and
# you have to install the packages that have these libs.

@ -442,10 +442,11 @@ bool gotReplyWrapperxd ( void *state ) {
// and can take a long time.
if ( (req->m_isDebug || took > 100) && req->m_niceness == 0 )
log("query: Took %lli ms to compute summary for d=%lli u=%s "
"niceness=%li",
"niceness=%li status=%s",
took,
xd->m_docId,xd->m_firstUrl.m_url,
xd->m_niceness );
xd->m_niceness ,
mstrerror(g_errno));
if ( (req->m_isDebug || took2 > 100) &&
xd->m_cpuSummaryStartTime &&
req->m_niceness == 0 )

@ -208,7 +208,7 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
// set this to the collrec of the first valid collnum we encounter
CollectionRec *cr = NULL;
// now convert list of space-separated coll names into list of collnums
char *p = coll;
char *p = r->getString("c",NULL);
// if no collection list was specified look for "token=" and
// use those to make collections. hack for diffbot.
char *token = r->getString("token",NULL);
@ -269,6 +269,19 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
if ( *p ) goto loop;
}
// use default collection if none provided
if ( ! p && ! token && m_collnumBuf.length() <= 0 ) {
// get default collection rec
CollectionRec *dr = g_collectiondb.getRec (coll);
// add to our list
if ( dr &&
!m_collnumBuf.safeMemcpy(&dr->m_collnum,
sizeof(collnum_t)))
return false;
}
/////
//
// END BUILDING m_collnumBuf

@ -1011,7 +1011,7 @@ SpiderColl *SpiderCache::getSpiderCollIffNonNull ( collnum_t collnum ) {
return cr->m_spiderColl;
}
bool tryToDeleteSpiderColl ( SpiderColl *sc ) {
bool tryToDeleteSpiderColl ( SpiderColl *sc , char *msg ) {
// if not being deleted return false
if ( ! sc->m_deleteMyself ) return false;
// otherwise always return true
@ -1036,11 +1036,21 @@ bool tryToDeleteSpiderColl ( SpiderColl *sc ) {
(long)sc,(long)sc->m_collnum);
return true;
}
// if ( sc->m_gettingList1 ) {
// log("spider: deleting sc=0x%lx for collnum=%li waiting5",
// (long)sc,(long)sc->m_collnum);
// return true;
// }
// if ( sc->m_gettingList2 ) {
// log("spider: deleting sc=0x%lx for collnum=%li waiting6",
// (long)sc,(long)sc->m_collnum);
// return true;
// }
// there's still a core of someone trying to write to something
// in "sc" so we have to try to fix that. somewhere in xmldoc.cpp
// or spider.cpp. everyone should get sc from cr every time, i'd think
log("spider: deleting sc=0x%lx for collnum=%li",
(long)sc,(long)sc->m_collnum);
log("spider: deleting sc=0x%lx for collnum=%li (msg=%s)",
(long)sc,(long)sc->m_collnum,msg);
// . make sure nobody has it
// . cr might be NULL because Collectiondb.cpp::deleteRec2() might
// have nuked it
@ -1110,7 +1120,7 @@ SpiderColl *SpiderCache::getSpiderColl ( collnum_t collnum ) {
// set this
sc->m_cr = cr;
// did crawlbottesting delete it right away?
if ( tryToDeleteSpiderColl( sc ) ) return NULL;
if ( tryToDeleteSpiderColl( sc ,"1") ) return NULL;
// sanity check
if ( ! cr ) { char *xx=NULL;*xx=0; }
// note it!
@ -2667,7 +2677,7 @@ static void gotSpiderdbListWrapper2( void *state , RdbList *list,Msg5 *msg5) {
// did our collection rec get deleted? since we were doing a read
// the SpiderColl will have been preserved in that case but its
// m_deleteMyself flag will have been set.
if ( tryToDeleteSpiderColl ( THIS ) ) return;
if ( tryToDeleteSpiderColl ( THIS ,"2") ) return;
THIS->populateWaitingTreeFromSpiderdb ( true );
}
@ -3172,7 +3182,7 @@ static void doledWrapper ( void *state ) {
THIS->m_isPopulating = false;
// did collection get nuked while we were waiting for msg1 reply?
if ( tryToDeleteSpiderColl ( THIS ) ) return;
if ( tryToDeleteSpiderColl ( THIS ,"3") ) return;
// . we added a rec to doledb for the firstIp in m_waitingTreeKey, so
// now go to the next node in the wait tree.
@ -3323,7 +3333,7 @@ bool SpiderColl::evalIpLoop ( ) {
// did our collection rec get deleted? since we were doing a read
// the SpiderColl will have been preserved in that case but its
// m_deleteMyself flag will have been set.
if ( tryToDeleteSpiderColl ( this ) ) return false;
if ( tryToDeleteSpiderColl ( this ,"4") ) return false;
// if first time here, let's do a read first
if ( ! m_didRead ) {
@ -3342,7 +3352,7 @@ bool SpiderColl::evalIpLoop ( ) {
// did our collection rec get deleted? since we were doing a read
// the SpiderColl will have been preserved in that case but its
// m_deleteMyself flag will have been set.
if ( tryToDeleteSpiderColl ( this ) )
if ( tryToDeleteSpiderColl ( this ,"5") )
// pretend to block since we got deleted!!!
return false;
@ -11913,7 +11923,9 @@ static long s_requests = 0;
static long s_replies = 0;
static long s_validReplies = 0;
static bool s_inUse = false;
static long s_updateRoundNum = 0;
// we initialize CollectionRec::m_updateRoundNum to 0 so make this 1
static long s_updateRoundNum = 1;
// . just call this once per second for all collections
// . figure out how to backoff on collections that don't need it so much
// . ask every host for their crawl infos for each collection rec
@ -11932,8 +11944,16 @@ void updateAllCrawlInfosSleepWrapper ( int fd , void *state ) {
if ( s_inUse ) return;
char *request = "";
long requestSize = 0;
// "i" means to get incremental updates since last round
// "f" means to get all stats
char *request = "i";
long requestSize = 1;
static bool s_firstCall = true;
if ( s_firstCall ) {
s_firstCall = false;
request = "f";
}
s_inUse = true;
@ -12040,8 +12060,9 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// reply is error? then use the last known good reply we had from him
// assuming udp reply timed out. empty buf just means no update now!
if ( ! slot->m_readBuf && g_errno ) {
log("spider: got crawlinfo reply error: %s",
mstrerror(g_errno));
log("spider: got crawlinfo reply error from host %li: %s. "
"spidering will be paused.",
h->m_hostId,mstrerror(g_errno));
// just clear it
g_errno = 0;
// if never had any reply... can't be valid then
@ -12090,6 +12111,7 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// just not allow spidering if a host is dead
// the sendbuf should never be freed! it points into collrec
// it is 'i' or 'f' right now
slot->m_sendBufAlloc = NULL;
/////
@ -12118,6 +12140,8 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
if ( ! cr->m_crawlInfoBuf.getBufStart() ) {
long need = sizeof(CrawlInfo) * g_hostdb.m_numHosts;
cr->m_crawlInfoBuf.reserve(need);
// in case one was udp server timed out or something
cr->m_crawlInfoBuf.zeroOut();
}
CrawlInfo *cia = (CrawlInfo *)cr->m_crawlInfoBuf.getBufStart();
@ -12153,6 +12177,12 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// not updating our crawlinfo states.
//break;
}
else {
if ( ! s_countsAreValid )
log("spider: got all crawlinfo replies. all shards "
"up. spidering back on.");
s_countsAreValid = true;
}
// loop over
@ -12188,6 +12218,12 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// add each hosts counts into the global accumulators
for ( long j = 0 ; j < NUMCRAWLSTATS ; j++ ) {
*gs = *gs + *ss;
// crazy stat?
if ( *ss > 1000000000LL ||
*ss < -1000000000LL )
log("spider: crazy stats %lli "
"from host #%li coll=%s",
*ss,k,cr->m_coll);
gs++;
ss++;
}
@ -12364,7 +12400,9 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
void handleRequestc1 ( UdpSlot *slot , long niceness ) {
//char *request = slot->m_readBuf;
// just a single collnum
if ( slot->m_readBufSize != 0 ) { char *xx=NULL;*xx=0;}
if ( slot->m_readBufSize != 1 ) { char *xx=NULL;*xx=0;}
char *req = slot->m_readBuf;
//if ( ! isClockSynced() ) {
//}
@ -12477,12 +12515,26 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
long hostId = slot->m_host->m_hostId;
bool sendIt = false;
// . if not sent to host yet, send
// . this will be true on startup
// . this will be true when WE startup, not them...
// . but once we send it we set flag to false
// . and if we update anything we send we set flag to true
// again for all hosts
if ( ! cr->shouldSendLocalCrawlInfoToHost(hostId) ) continue;
if ( cr->shouldSendLocalCrawlInfoToHost(hostId) )
sendIt = true;
// they can override. if host crashed and came back up
// it might not have saved the global crawl info for a coll
// perhaps, at the very least it does not have
// the correct CollectionRec::m_crawlInfoBuf because we do
// not save the array of crawlinfos for each host for
// all collections.
if ( req && req[0] == 'f' )
sendIt = true;
if ( ! sendIt ) continue;
// note it
// log("spider: sending ci for coll %s to host %li",

@ -40,7 +40,7 @@
#define SP_ADMIN_PAUSED 8 // g_conf.m_spideringEnabled = false
#define SP_COMPLETED 9 // crawl is done, and no repeatCrawl is scheduled
bool tryToDeleteSpiderColl ( SpiderColl *sc ) ;
bool tryToDeleteSpiderColl ( SpiderColl *sc , char *msg ) ;
void spiderRoundIncremented ( class CollectionRec *cr ) ;
bool testPatterns ( ) ;
bool doesStringContainPattern ( char *content , char *pattern ) ;

@ -7506,28 +7506,39 @@ long *XmlDoc::getSummaryVector ( ) {
Title *ti = getTitle();
if ( ! ti || ti == (Title *)-1 ) return (long *)ti;
// store title and summary into "buf" so we can call words.set()
char buf[5000];
char *p = buf;
long avail = 5000;
long len;
//char buf[5000];
SafeBuf sb;
//char *p = buf;
//long avail = 5000;
//long len;
// put title into there
len = ti->m_titleBytes - 1;
if ( len > avail ) len = avail - 10;
if ( len < 0 ) len = 0;
memcpy ( p , ti->m_title , len );
p += len;
// space separating the title from summary
if ( len > 0 ) *p++ = ' ';
long tlen = ti->m_titleBytes - 1;
//if ( len > avail ) len = avail - 10;
if ( tlen < 0 ) tlen = 0;
// put summary into there
len = s->m_summaryLen;
if ( len > avail ) len = avail - 10;
memcpy ( p , s->m_summary , len );
p += len;
long slen = s->m_summaryLen;
// allocate space
long need = tlen + 1 + slen + 1;
if ( ! sb.reserve ( need ) ) return NULL;
//memcpy ( p , ti->m_title , len );
//p += len;
sb.safeMemcpy ( ti->m_title , tlen );
// space separating the title from summary
if ( tlen > 0 ) sb.pushChar(' ');
//if ( len > avail ) len = avail - 10;
//memcpy ( p , s->m_summary , len );
//p += len;
sb.safeMemcpy ( s->m_summary , slen );
// null terminate it
*p = '\0';
//*p = '\0';
sb.nullTerm();
// word-ify it
Words words;
if ( ! words.set9 ( buf , m_niceness ) ) return NULL;
if ( ! words.set9 ( sb.getBufStart() , m_niceness ) ) return NULL;
// . now set the dedup vector from big summary and title
// . store sample vector in here
// . returns size in bytes including null terminating long
@ -28509,7 +28520,6 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
long nowUTC2 = m_req->m_nowUTC;
if ( m_req->m_clockSet ) nowUTC2 = m_req->m_clockSet;
// . summary vector for deduping
// . does not compute anything if we should not! (svSize will be 0)
if ( ! reply->ptr_vbuf &&