Merge branch 'diffbot-testing' into testing
This commit is contained in:
@ -829,7 +829,7 @@ bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
|
||||
sc->m_cr = NULL;
|
||||
// this will put it on "death row" so it will be deleted
|
||||
// once Msg5::m_waitingForList/Merge is NULL
|
||||
tryToDeleteSpiderColl ( sc );
|
||||
tryToDeleteSpiderColl ( sc ,"10");
|
||||
//mdelete ( sc, sizeof(SpiderColl),"nukecr2");
|
||||
//delete ( sc );
|
||||
cr->m_spiderColl = NULL;
|
||||
@ -837,7 +837,7 @@ bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
|
||||
|
||||
|
||||
// the bulk urls file too i guess
|
||||
if ( cr->m_isCustomCrawl == 2 ) {
|
||||
if ( cr->m_isCustomCrawl == 2 && g_hostdb.m_hostId == 0 ) {
|
||||
SafeBuf bu;
|
||||
bu.safePrintf("%sbulkurls-%s.txt",
|
||||
g_hostdb.m_dir , cr->m_coll );
|
||||
@ -1123,7 +1123,7 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
|
||||
//sc->reset();
|
||||
// this will put it on "death row" so it will be deleted
|
||||
// once Msg5::m_waitingForList/Merge is NULL
|
||||
tryToDeleteSpiderColl ( sc );
|
||||
tryToDeleteSpiderColl ( sc,"11" );
|
||||
//mdelete ( sc, sizeof(SpiderColl),"nukecr2");
|
||||
//delete ( sc );
|
||||
cr->m_spiderColl = NULL;
|
||||
@ -1526,6 +1526,7 @@ static CollectionRec g_default;
|
||||
CollectionRec::CollectionRec() {
|
||||
m_collnum = -1;
|
||||
m_coll[0] = '\0';
|
||||
m_updateRoundNum = 0;
|
||||
//m_numSearchPwds = 0;
|
||||
//m_numBanIps = 0;
|
||||
//m_numSearchIps = 0;
|
||||
@ -1655,7 +1656,7 @@ void CollectionRec::reset() {
|
||||
sc->m_deleteMyself = true;
|
||||
|
||||
// if not currently being accessed nuke it now
|
||||
tryToDeleteSpiderColl ( sc );
|
||||
tryToDeleteSpiderColl ( sc ,"12");
|
||||
|
||||
// if ( ! sc->m_msg5.m_waitingForList &&
|
||||
// ! sc->m_msg5b.m_waitingForList &&
|
||||
|
4
Makefile
4
Makefile
@ -78,6 +78,8 @@ ifeq ("titan","$(HOST)")
|
||||
# the way it works is not even possible on newer kernels because they no longer
|
||||
# allow you to override the _errno_location() function. -- matt
|
||||
# -DMATTWELLS
|
||||
# turn off stack smash detection because it won't save and dump core when
|
||||
# stack gets smashed like it normally would when it gets a seg fault signal.
|
||||
CPPFLAGS = -m32 -g -Wall -pipe -Wno-write-strings -Wstrict-aliasing=0 -Wno-uninitialized -static -DTITAN
|
||||
LIBS = ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a
|
||||
else
|
||||
@ -90,7 +92,7 @@ else
|
||||
# trying to use good ole' clone() again because it seems the errno location
|
||||
# thing is fixed by just ignoring it.
|
||||
#
|
||||
CPPFLAGS = -m32 -g -Wall -pipe -Wno-write-strings -Wstrict-aliasing=0 -Wno-uninitialized -static -DPTHREADS -Wno-unused-but-set-variable
|
||||
CPPFLAGS = -m32 -g -Wall -pipe -fno-stack-protector -Wno-write-strings -Wstrict-aliasing=0 -Wno-uninitialized -static -DPTHREADS -Wno-unused-but-set-variable
|
||||
LIBS= -L. ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a ./libstdc++.a -lpthread
|
||||
# use this for compiling on CYGWIN: (only for 32bit cygwin right now and
|
||||
# you have to install the packages that have these libs.
|
||||
|
@ -442,10 +442,11 @@ bool gotReplyWrapperxd ( void *state ) {
|
||||
// and can take a long time.
|
||||
if ( (req->m_isDebug || took > 100) && req->m_niceness == 0 )
|
||||
log("query: Took %lli ms to compute summary for d=%lli u=%s "
|
||||
"niceness=%li",
|
||||
"niceness=%li status=%s",
|
||||
took,
|
||||
xd->m_docId,xd->m_firstUrl.m_url,
|
||||
xd->m_niceness );
|
||||
xd->m_niceness ,
|
||||
mstrerror(g_errno));
|
||||
if ( (req->m_isDebug || took2 > 100) &&
|
||||
xd->m_cpuSummaryStartTime &&
|
||||
req->m_niceness == 0 )
|
||||
|
@ -208,7 +208,7 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
|
||||
// set this to the collrec of the first valid collnum we encounter
|
||||
CollectionRec *cr = NULL;
|
||||
// now convert list of space-separated coll names into list of collnums
|
||||
char *p = coll;
|
||||
char *p = r->getString("c",NULL);
|
||||
// if no collection list was specified look for "token=" and
|
||||
// use those to make collections. hack for diffbot.
|
||||
char *token = r->getString("token",NULL);
|
||||
@ -269,6 +269,19 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
|
||||
if ( *p ) goto loop;
|
||||
}
|
||||
|
||||
// use default collection if none provided
|
||||
if ( ! p && ! token && m_collnumBuf.length() <= 0 ) {
|
||||
// get default collection rec
|
||||
CollectionRec *dr = g_collectiondb.getRec (coll);
|
||||
// add to our list
|
||||
if ( dr &&
|
||||
!m_collnumBuf.safeMemcpy(&dr->m_collnum,
|
||||
sizeof(collnum_t)))
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/////
|
||||
//
|
||||
// END BUILDING m_collnumBuf
|
||||
|
84
Spider.cpp
84
Spider.cpp
@ -1011,7 +1011,7 @@ SpiderColl *SpiderCache::getSpiderCollIffNonNull ( collnum_t collnum ) {
|
||||
return cr->m_spiderColl;
|
||||
}
|
||||
|
||||
bool tryToDeleteSpiderColl ( SpiderColl *sc ) {
|
||||
bool tryToDeleteSpiderColl ( SpiderColl *sc , char *msg ) {
|
||||
// if not being deleted return false
|
||||
if ( ! sc->m_deleteMyself ) return false;
|
||||
// otherwise always return true
|
||||
@ -1036,11 +1036,21 @@ bool tryToDeleteSpiderColl ( SpiderColl *sc ) {
|
||||
(long)sc,(long)sc->m_collnum);
|
||||
return true;
|
||||
}
|
||||
// if ( sc->m_gettingList1 ) {
|
||||
// log("spider: deleting sc=0x%lx for collnum=%li waiting5",
|
||||
// (long)sc,(long)sc->m_collnum);
|
||||
// return true;
|
||||
// }
|
||||
// if ( sc->m_gettingList2 ) {
|
||||
// log("spider: deleting sc=0x%lx for collnum=%li waiting6",
|
||||
// (long)sc,(long)sc->m_collnum);
|
||||
// return true;
|
||||
// }
|
||||
// there's still a core of someone trying to write to something
|
||||
// in "sc" so we have to try to fix that. somewhere in xmldoc.cpp
|
||||
// or spider.cpp. everyone should get sc from cr every time, I'd think
|
||||
log("spider: deleting sc=0x%lx for collnum=%li",
|
||||
(long)sc,(long)sc->m_collnum);
|
||||
log("spider: deleting sc=0x%lx for collnum=%li (msg=%s)",
|
||||
(long)sc,(long)sc->m_collnum,msg);
|
||||
// . make sure nobody has it
|
||||
// . cr might be NULL because Collectiondb.cpp::deleteRec2() might
|
||||
// have nuked it
|
||||
@ -1110,7 +1120,7 @@ SpiderColl *SpiderCache::getSpiderColl ( collnum_t collnum ) {
|
||||
// set this
|
||||
sc->m_cr = cr;
|
||||
// did crawlbottesting delete it right away?
|
||||
if ( tryToDeleteSpiderColl( sc ) ) return NULL;
|
||||
if ( tryToDeleteSpiderColl( sc ,"1") ) return NULL;
|
||||
// sanity check
|
||||
if ( ! cr ) { char *xx=NULL;*xx=0; }
|
||||
// note it!
|
||||
@ -2667,7 +2677,7 @@ static void gotSpiderdbListWrapper2( void *state , RdbList *list,Msg5 *msg5) {
|
||||
// did our collection rec get deleted? since we were doing a read
|
||||
// the SpiderColl will have been preserved in that case but its
|
||||
// m_deleteMyself flag will have been set.
|
||||
if ( tryToDeleteSpiderColl ( THIS ) ) return;
|
||||
if ( tryToDeleteSpiderColl ( THIS ,"2") ) return;
|
||||
|
||||
THIS->populateWaitingTreeFromSpiderdb ( true );
|
||||
}
|
||||
@ -3172,7 +3182,7 @@ static void doledWrapper ( void *state ) {
|
||||
THIS->m_isPopulating = false;
|
||||
|
||||
// did collection get nuked while we were waiting for msg1 reply?
|
||||
if ( tryToDeleteSpiderColl ( THIS ) ) return;
|
||||
if ( tryToDeleteSpiderColl ( THIS ,"3") ) return;
|
||||
|
||||
// . we added a rec to doledb for the firstIp in m_waitingTreeKey, so
|
||||
// now go to the next node in the wait tree.
|
||||
@ -3323,7 +3333,7 @@ bool SpiderColl::evalIpLoop ( ) {
|
||||
// did our collection rec get deleted? since we were doing a read
|
||||
// the SpiderColl will have been preserved in that case but its
|
||||
// m_deleteMyself flag will have been set.
|
||||
if ( tryToDeleteSpiderColl ( this ) ) return false;
|
||||
if ( tryToDeleteSpiderColl ( this ,"4") ) return false;
|
||||
|
||||
// if first time here, let's do a read first
|
||||
if ( ! m_didRead ) {
|
||||
@ -3342,7 +3352,7 @@ bool SpiderColl::evalIpLoop ( ) {
|
||||
// did our collection rec get deleted? since we were doing a read
|
||||
// the SpiderColl will have been preserved in that case but its
|
||||
// m_deleteMyself flag will have been set.
|
||||
if ( tryToDeleteSpiderColl ( this ) )
|
||||
if ( tryToDeleteSpiderColl ( this ,"5") )
|
||||
// pretend to block since we got deleted!!!
|
||||
return false;
|
||||
|
||||
@ -11913,7 +11923,9 @@ static long s_requests = 0;
|
||||
static long s_replies = 0;
|
||||
static long s_validReplies = 0;
|
||||
static bool s_inUse = false;
|
||||
static long s_updateRoundNum = 0;
|
||||
// we initialize CollectionRec::m_updateRoundNum to 0 so make this 1
|
||||
static long s_updateRoundNum = 1;
|
||||
|
||||
// . just call this once per second for all collections
|
||||
// . figure out how to backoff on collections that don't need it so much
|
||||
// . ask every host for their crawl infos for each collection rec
|
||||
@ -11932,8 +11944,16 @@ void updateAllCrawlInfosSleepWrapper ( int fd , void *state ) {
|
||||
|
||||
if ( s_inUse ) return;
|
||||
|
||||
char *request = "";
|
||||
long requestSize = 0;
|
||||
// "i" means to get incremental updates since last round
|
||||
// "f" means to get all stats
|
||||
char *request = "i";
|
||||
long requestSize = 1;
|
||||
|
||||
static bool s_firstCall = true;
|
||||
if ( s_firstCall ) {
|
||||
s_firstCall = false;
|
||||
request = "f";
|
||||
}
|
||||
|
||||
s_inUse = true;
|
||||
|
||||
@ -12040,8 +12060,9 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
|
||||
// reply is error? then use the last known good reply we had from him
|
||||
// assuming udp reply timed out. empty buf just means no update now!
|
||||
if ( ! slot->m_readBuf && g_errno ) {
|
||||
log("spider: got crawlinfo reply error: %s",
|
||||
mstrerror(g_errno));
|
||||
log("spider: got crawlinfo reply error from host %li: %s. "
|
||||
"spidering will be paused.",
|
||||
h->m_hostId,mstrerror(g_errno));
|
||||
// just clear it
|
||||
g_errno = 0;
|
||||
// if never had any reply... can't be valid then
|
||||
@ -12090,6 +12111,7 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
|
||||
// just not allow spidering if a host is dead
|
||||
|
||||
// the sendbuf should never be freed! it points into collrec
|
||||
// it is 'i' or 'f' right now
|
||||
slot->m_sendBufAlloc = NULL;
|
||||
|
||||
/////
|
||||
@ -12118,6 +12140,8 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
|
||||
if ( ! cr->m_crawlInfoBuf.getBufStart() ) {
|
||||
long need = sizeof(CrawlInfo) * g_hostdb.m_numHosts;
|
||||
cr->m_crawlInfoBuf.reserve(need);
|
||||
// in case one was udp server timed out or something
|
||||
cr->m_crawlInfoBuf.zeroOut();
|
||||
}
|
||||
|
||||
CrawlInfo *cia = (CrawlInfo *)cr->m_crawlInfoBuf.getBufStart();
|
||||
@ -12153,6 +12177,12 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
|
||||
// not updating our crawlinfo states.
|
||||
//break;
|
||||
}
|
||||
else {
|
||||
if ( ! s_countsAreValid )
|
||||
log("spider: got all crawlinfo replies. all shards "
|
||||
"up. spidering back on.");
|
||||
s_countsAreValid = true;
|
||||
}
|
||||
|
||||
|
||||
// loop over
|
||||
@ -12188,6 +12218,12 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
|
||||
// add each hosts counts into the global accumulators
|
||||
for ( long j = 0 ; j < NUMCRAWLSTATS ; j++ ) {
|
||||
*gs = *gs + *ss;
|
||||
// crazy stat?
|
||||
if ( *ss > 1000000000LL ||
|
||||
*ss < -1000000000LL )
|
||||
log("spider: crazy stats %lli "
|
||||
"from host #%li coll=%s",
|
||||
*ss,k,cr->m_coll);
|
||||
gs++;
|
||||
ss++;
|
||||
}
|
||||
@ -12364,7 +12400,9 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
|
||||
void handleRequestc1 ( UdpSlot *slot , long niceness ) {
|
||||
//char *request = slot->m_readBuf;
|
||||
// just a single collnum
|
||||
if ( slot->m_readBufSize != 0 ) { char *xx=NULL;*xx=0;}
|
||||
if ( slot->m_readBufSize != 1 ) { char *xx=NULL;*xx=0;}
|
||||
|
||||
char *req = slot->m_readBuf;
|
||||
|
||||
//if ( ! isClockSynced() ) {
|
||||
//}
|
||||
@ -12477,12 +12515,26 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
|
||||
|
||||
long hostId = slot->m_host->m_hostId;
|
||||
|
||||
bool sendIt = false;
|
||||
|
||||
// . if not sent to host yet, send
|
||||
// . this will be true on startup
|
||||
// . this will be true when WE startup, not them...
|
||||
// . but once we send it we set flag to false
|
||||
// . and if we update anything we send we set flag to true
|
||||
// again for all hosts
|
||||
if ( ! cr->shouldSendLocalCrawlInfoToHost(hostId) ) continue;
|
||||
if ( cr->shouldSendLocalCrawlInfoToHost(hostId) )
|
||||
sendIt = true;
|
||||
|
||||
// they can override. if host crashed and came back up
|
||||
// it might not have saved the global crawl info for a coll
|
||||
// perhaps, at the very least it does not have
|
||||
// the correct CollectionRec::m_crawlInfoBuf because we do
|
||||
// not save the array of crawlinfos for each host for
|
||||
// all collections.
|
||||
if ( req && req[0] == 'f' )
|
||||
sendIt = true;
|
||||
|
||||
if ( ! sendIt ) continue;
|
||||
|
||||
// note it
|
||||
// log("spider: sending ci for coll %s to host %li",
|
||||
|
2
Spider.h
2
Spider.h
@ -40,7 +40,7 @@
|
||||
#define SP_ADMIN_PAUSED 8 // g_conf.m_spideringEnabled = false
|
||||
#define SP_COMPLETED 9 // crawl is done, and no repeatCrawl is scheduled
|
||||
|
||||
bool tryToDeleteSpiderColl ( SpiderColl *sc ) ;
|
||||
bool tryToDeleteSpiderColl ( SpiderColl *sc , char *msg ) ;
|
||||
void spiderRoundIncremented ( class CollectionRec *cr ) ;
|
||||
bool testPatterns ( ) ;
|
||||
bool doesStringContainPattern ( char *content , char *pattern ) ;
|
||||
|
46
XmlDoc.cpp
46
XmlDoc.cpp
@ -7506,28 +7506,39 @@ long *XmlDoc::getSummaryVector ( ) {
|
||||
Title *ti = getTitle();
|
||||
if ( ! ti || ti == (Title *)-1 ) return (long *)ti;
|
||||
// store title and summary into "buf" so we can call words.set()
|
||||
char buf[5000];
|
||||
char *p = buf;
|
||||
long avail = 5000;
|
||||
long len;
|
||||
//char buf[5000];
|
||||
SafeBuf sb;
|
||||
//char *p = buf;
|
||||
//long avail = 5000;
|
||||
//long len;
|
||||
// put title into there
|
||||
len = ti->m_titleBytes - 1;
|
||||
if ( len > avail ) len = avail - 10;
|
||||
if ( len < 0 ) len = 0;
|
||||
memcpy ( p , ti->m_title , len );
|
||||
p += len;
|
||||
// space separating the title from summary
|
||||
if ( len > 0 ) *p++ = ' ';
|
||||
long tlen = ti->m_titleBytes - 1;
|
||||
//if ( len > avail ) len = avail - 10;
|
||||
if ( tlen < 0 ) tlen = 0;
|
||||
|
||||
// put summary into there
|
||||
len = s->m_summaryLen;
|
||||
if ( len > avail ) len = avail - 10;
|
||||
memcpy ( p , s->m_summary , len );
|
||||
p += len;
|
||||
long slen = s->m_summaryLen;
|
||||
|
||||
// allocate space
|
||||
long need = tlen + 1 + slen + 1;
|
||||
if ( ! sb.reserve ( need ) ) return NULL;
|
||||
|
||||
//memcpy ( p , ti->m_title , len );
|
||||
//p += len;
|
||||
sb.safeMemcpy ( ti->m_title , tlen );
|
||||
// space separating the title from summary
|
||||
if ( tlen > 0 ) sb.pushChar(' ');
|
||||
|
||||
//if ( len > avail ) len = avail - 10;
|
||||
//memcpy ( p , s->m_summary , len );
|
||||
//p += len;
|
||||
sb.safeMemcpy ( s->m_summary , slen );
|
||||
// null terminate it
|
||||
*p = '\0';
|
||||
//*p = '\0';
|
||||
sb.nullTerm();
|
||||
// word-ify it
|
||||
Words words;
|
||||
if ( ! words.set9 ( buf , m_niceness ) ) return NULL;
|
||||
if ( ! words.set9 ( sb.getBufStart() , m_niceness ) ) return NULL;
|
||||
// . now set the dedup vector from big summary and title
|
||||
// . store sample vector in here
|
||||
// . returns size in bytes including null terminating long
|
||||
@ -28509,7 +28520,6 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
|
||||
long nowUTC2 = m_req->m_nowUTC;
|
||||
if ( m_req->m_clockSet ) nowUTC2 = m_req->m_clockSet;
|
||||
|
||||
|
||||
// . summary vector for deduping
|
||||
// . does not compute anything if we should not! (svSize will be 0)
|
||||
if ( ! reply->ptr_vbuf &&
|
||||
|
Reference in New Issue
Block a user