Merge branch 'diffbot-testing' into diffbot-matt

Matt Wells
2014-04-28 14:15:02 -07:00
12 changed files with 82 additions and 28 deletions

@@ -467,13 +467,13 @@ bool Collectiondb::addNewColl ( char *coll ,
cr->m_collectiveRespiderFrequency = 0.0;
//cr->m_restrictDomain = true;
// reset the crawl stats
// . this will core if a host was dead and then, when it came
// back up, host #0's parms.cpp told it to add a new coll
cr->m_diffbotCrawlStartTime=
gettimeofdayInMillisecondsGlobalNoCore();
cr->m_diffbotCrawlEndTime = 0LL;
}
// . this will core if a host was dead and then, when it came
// back up, host #0's parms.cpp told it to add a new coll
cr->m_diffbotCrawlStartTime = getTimeGlobalNoCore();
cr->m_diffbotCrawlEndTime = 0;
// . just the basics on these for now
// . if certain parms are changed then the url filters
// must be rebuilt, as well as possibly the waiting tree!!!
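
A note for context: the crawl start/end stamps change units here, from 64-bit milliseconds to plain-long seconds (see the matching CollectionRec header hunk below). A minimal sketch, assuming POSIX gettimeofday(), of what the two helpers plausibly compute; the real functions presumably also sync against the global clock and avoid coring, so this is illustrative only:

#include <sys/time.h>

// old style: gettimeofdayInMillisecondsGlobalNoCore() works in
// milliseconds, which need a long long
long long nowMilliseconds ( ) {
	struct timeval tv;
	gettimeofday ( &tv , NULL );
	return (long long)tv.tv_sec * 1000LL + tv.tv_usec / 1000LL;
}

// new style: getTimeGlobalNoCore() works in whole seconds, which fit
// in a plain long
long nowSeconds ( ) {
	struct timeval tv;
	gettimeofday ( &tv , NULL );
	return (long)tv.tv_sec;
}
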
@@ -807,6 +807,11 @@ bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
sc->clearLocks();
//sc->m_collnum = newCollnum;
//sc->reset();
// you have to set this for tryToDeleteSpiderColl to
// actually have a shot at deleting it
sc->m_deleteMyself = true;
// cr will be invalid shortly after this
sc->m_cr = NULL;
// this will put it on "death row" so it will be deleted
// once Msg5::m_waitingForList/Merge is NULL
tryToDeleteSpiderColl ( sc );
@@ -1611,12 +1616,14 @@ void CollectionRec::reset() {
sc->m_deleteMyself = true;
// if not currently being accessed nuke it now
if ( ! sc->m_msg5.m_waitingForList &&
! sc->m_msg5b.m_waitingForList &&
! sc->m_msg1.m_mcast.m_inUse ) {
mdelete ( sc, sizeof(SpiderColl),"nukecr2");
delete ( sc );
}
tryToDeleteSpiderColl ( sc );
// if ( ! sc->m_msg5.m_waitingForList &&
// ! sc->m_msg5b.m_waitingForList &&
// ! sc->m_msg1.m_mcast.m_inUse ) {
// mdelete ( sc, sizeof(SpiderColl),"nukecr2");
// delete ( sc );
// }
}
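
The refactor above replaces the open-coded in-use checks with a single call to tryToDeleteSpiderColl(), whose full logic is in the hunk near the end of this diff. The intended pattern, sketched with names from the diff (bodies condensed, not the real implementation):

void nukeSpiderColl ( SpiderColl *sc ) {
	// flag it so the helper knows deletion is wanted
	sc->m_deleteMyself = true;
	// one place decides: free it now if nothing is using it,
	// otherwise leave it on "death row" and let the last
	// in-flight msg5/msg1 user re-invoke the helper when done
	tryToDeleteSpiderColl ( sc );
}
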
CollectionRec *g_cr = NULL;

@@ -665,8 +665,9 @@ class CollectionRec {
long long m_maxToProcess;
long m_maxCrawlRounds;
long long m_diffbotCrawlStartTime;
long long m_diffbotCrawlEndTime;
// in seconds now
long m_diffbotCrawlStartTime;
long m_diffbotCrawlEndTime;
// for testing their regexes etc...
//char m_isDiffbotTestCrawl;

@@ -930,7 +930,7 @@ bool Msg3::doneScanning ( ) {
ff->getFilename() ,
m_niceness ) ) {
log("net: Had error while constraining list read from "
"%s: %s%s. vfd=%li parts=%li. "
"%s: %s/%s. vfd=%li parts=%li. "
"This is likely caused by corrupted "
"data on disk.",
mstrerror(g_errno), ff->m_dir ,

@@ -100,6 +100,7 @@ Msg40::Msg40() {
m_sendsIn = 0;
m_printi = 0;
m_numDisplayed = 0;
m_numPrintedSoFar = 0;
m_lastChunk = false;
//m_numGigabitInfos = 0;
}
@@ -1683,6 +1684,7 @@ bool Msg40::gotSummary ( ) {
if ( m_si && m_numDisplayed <= m_si->m_firstResultNum ){
log("msg40: hiding #%li (%lu)",
m_printi,mr->m_contentHash32);
m20->reset();
continue;
}
@@ -1690,7 +1692,9 @@
// . ok, we got it, so print it and stream it
// . this might set m_hadPrintError to true
printSearchResult9 ( m_printi );
printSearchResult9 ( m_printi , m_numPrintedSoFar );
m_numPrintedSoFar++;
// now free the reply to save memory since we could be
// streaming back 1M+. we call reset below, no need for this.
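
Two fixes land in this loop. First, a hidden result now calls m20->reset() so its Msg20 reply buffer is freed immediately instead of living for the rest of the stream. Second, the new m_numPrintedSoFar counts only results actually emitted, unlike m_printi, which also advances past hidden ones. A condensed sketch of the resulting control flow, helper names hypothetical:

for ( ; m_printi < numReplies ; m_printi++ ) {
	Msg20 *m20 = getCompletedSummary ( m_printi ); // hypothetical
	// paging/dedup can hide a result...
	if ( isHidden ( m20 ) ) {
		m20->reset();  // ...but free its reply right away
		continue;
	}
	// only printed results bump the printed counter
	printSearchResult9 ( m_printi , m_numPrintedSoFar );
	m_numPrintedSoFar++;
	m20->reset();          // freed after printing, too
}
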
@@ -5175,7 +5179,7 @@ bool Msg40::addFacts ( HashTableX *queryTable,
// . printSearchResult into "sb"
bool Msg40::printSearchResult9 ( long ix ) {
bool Msg40::printSearchResult9 ( long ix , long numPrintedSoFar ) {
// . we stream results right onto the socket
// . useful for thousands of results... and saving mem
@@ -5202,7 +5206,7 @@ bool Msg40::printSearchResult9 ( long ix ) {
}
// print that out into st->m_sb safebuf
else if ( ! printResult ( st , ix ) ) {
else if ( ! printResult ( st , ix , numPrintedSoFar ) ) {
// oom?
if ( ! g_errno ) g_errno = EBADENGINEER;
log("query: had error: %s",mstrerror(g_errno));

@@ -208,7 +208,7 @@ class Msg40 {
long m_lastHeartbeat;
bool printSearchResult9 ( long ix ) ;
bool printSearchResult9 ( long ix , long numPrintedSoFar ) ;
HashTableX m_columnTable;
bool printCSVHeaderRow ( class SafeBuf *sb );
bool printJsonItemInCSV ( class State0 *st , long ix );
@@ -265,6 +265,7 @@ class Msg40 {
long m_sendsIn ;
long m_printi ;
long m_numDisplayed ;
long m_numPrintedSoFar;
long m_socketHadError;

@@ -802,7 +802,9 @@ bool Msg5::needsRecall ( ) {
RdbBase *base = getRdbBase ( m_rdbId , m_collnum );
// if collection was deleted from under us, base will be NULL
if ( ! base && ! g_errno ) {
log("msg5: base lost for collnum %li",(long)m_collnum);
log("msg5: base lost for rdbid=%li collnum %li",
(long)m_rdbId,(long)m_collnum);
g_errno = ENOCOLLREC;
return false;
}
// sanity check

@@ -2355,10 +2355,13 @@ bool printCrawlDetailsInJson ( SafeBuf *sb , CollectionRec *cx ) {
//nomen = "job";
}
sb->safePrintf("\n\n{"
"\"name\":\"%s\",\n"
"\"type\":\"%s\",\n"
"\"jobCreationTimeUTC\":%li,\n"
"\"jobCompletionTimeUTC\":%li,\n"
//"\"alias\":\"%s\",\n"
//"\"crawlingEnabled\":%li,\n"
"\"jobStatus\":{" // nomen = jobStatus / crawlStatus
@@ -2384,6 +2387,11 @@ bool printCrawlDetailsInJson ( SafeBuf *sb , CollectionRec *cx ) {
//,cx->m_coll
, cx->m_diffbotCrawlName.getBufStart()
, crawlTypeStr
, cx->m_diffbotCrawlStartTime
// this is 0 if not over yet
, cx->m_diffbotCrawlEndTime
//, alias
//, (long)cx->m_spideringEnabled
, crawlStatus
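
With the two new safePrintf() fields wired to the arguments just above, the head of the emitted job object now carries both timestamps in epoch seconds, completion reported as 0 while the crawl is still running. Roughly, with made-up values:

{
"name":"mycrawl",
"type":"crawl",
"jobCreationTimeUTC":1398719702,
"jobCompletionTimeUTC":0,
"jobStatus":{ ... }
}
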

@@ -1001,6 +1001,7 @@ bool gotResults ( void *state ) {
// don't display more than docsWanted results
long count = msg40->getDocsWanted();
bool hadPrintError = false;
long numPrintedSoFar = 0;
//long widgetHeight = hr->getLong("widgetheight",400);
//long widgetwidth = hr->getLong("widgetwidth",250);
@@ -1044,7 +1045,7 @@
// prints in xml or html
//
//////////
if ( ! printResult ( st , i ) ) {
if ( ! printResult ( st , i , numPrintedSoFar++ ) ) {
hadPrintError = true;
break;
}
@@ -2359,7 +2360,7 @@ static bool printDMOZCategoryUnderResult ( SafeBuf *sb ,
// use this for xml as well as html
bool printResult ( State0 *st, long ix ) {
bool printResult ( State0 *st, long ix , long numPrintedSoFar ) {
SafeBuf *sb = &st->m_sb;
@@ -2440,7 +2441,7 @@ bool printResult ( State0 *st, long ix ) {
if ( mr->ptr_content ) {
// for json items separate with \n,\n
if ( si->m_format != FORMAT_HTML && ix>0 )
if ( si->m_format != FORMAT_HTML && numPrintedSoFar > 0 )
sb->safePrintf(",\n");
sb->safeStrcpy ( mr->ptr_content );
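
This is the bug the new counter exists to fix: JSON items are separated with ",\n", and the old ix > 0 test prepended a comma whenever an earlier result had been hidden (the first item actually printed then has ix >= 1), yielding a stream that starts with a stray comma. Testing the printed count instead keeps the separators valid. Self-contained illustration:

#include <stdio.h>

// emit items as a comma-separated stream, skipping hidden ones;
// the separator test must use the printed count, not the index
void emitItems ( const char **items , long n , const char *hidden ) {
	long numPrintedSoFar = 0;
	for ( long ix = 0 ; ix < n ; ix++ ) {
		if ( hidden[ix] ) continue;   // skipped, never printed
		if ( numPrintedSoFar > 0 )    // NOT "ix > 0"
			printf ( ",\n" );
		printf ( "%s" , items[ix] );
		numPrintedSoFar++;
	}
}
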

@@ -50,7 +50,7 @@ public:
bool printSearchResultsHeader ( class State0 *st ) ;
bool printResult ( class State0 *st, long ix );
bool printResult ( class State0 *st, long ix , long numPrintedSoFar );
bool printSearchResultsTail ( class State0 *st ) ;

@@ -8522,6 +8522,26 @@ void Parms::init ( ) {
m->m_flags = PF_DIFFBOT;
m++;
m->m_cgi = "dbcrawlstarttime";
m->m_xml = "diffbotCrawlStartTime";
m->m_off = (char *)&cr.m_diffbotCrawlStartTime - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "0";
m->m_flags = PF_DIFFBOT;
m++;
m->m_cgi = "dbcrawlendtime";
m->m_xml = "diffbotCrawlEndTime";
m->m_off = (char *)&cr.m_diffbotCrawlEndTime - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "0";
m->m_flags = PF_DIFFBOT;
m++;
m->m_cgi = "dbcrawlname";
m->m_xml = "diffbotCrawlName";
m->m_off = (char *)&cr.m_diffbotCrawlName - x;
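
For readers outside Parms.cpp: each entry binds a cgi name and an xml tag to a raw byte offset into CollectionRec, with x being (char *)&cr, so m_off is effectively offsetof(CollectionRec, member); registering the two timestamps here is presumably what lets them persist and reload with the collection's parms. A minimal sketch of the offset arithmetic with a stand-in struct:

#include <stddef.h>
#include <stdio.h>

struct Rec {
	long m_diffbotCrawlStartTime;
	long m_diffbotCrawlEndTime;
};

int main ( ) {
	Rec cr;
	char *x = (char *)&cr;
	long off = (char *)&cr.m_diffbotCrawlEndTime - x;
	// same value as offsetof(Rec, m_diffbotCrawlEndTime)
	printf ( "off=%li offsetof=%li\n" ,
	         off , (long)offsetof ( Rec , m_diffbotCrawlEndTime ) );
	return 0;
}
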

@@ -1026,14 +1026,22 @@ bool tryToDeleteSpiderColl ( SpiderColl *sc ) {
(long)sc,(long)sc->m_collnum);
return true;
}
// this means a msg5 read is still outstanding
if ( sc->m_msg5.m_waitingForList ) {
log("spider: deleting sc=0x%lx for collnum=%li waiting4",
(long)sc,(long)sc->m_collnum);
return true;
}
// there's still a core from someone trying to write to something
// in "sc", so we have to try to fix that somewhere in xmldoc.cpp
// or spider.cpp. everyone should get sc from cr every time, I'd think
log("spider: deleting sc=0x%lx for collnum=%li",
(long)sc,(long)sc->m_collnum);
// . make sure nobody has it
// . cr might be NULL because Collectiondb.cpp::deleteRec2() might
// have nuked it
CollectionRec *cr = sc->m_cr;
// make sure nobody has it
cr->m_spiderColl = NULL;
if ( cr ) cr->m_spiderColl = NULL;
mdelete ( sc , sizeof(SpiderColl),"postdel1");
delete ( sc );
return true;
@@ -12244,6 +12252,8 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
ci->m_hasUrlsReadyToSpider = 0;
// save that!
cr->m_needsSave = true;
// set the time that this happens
cr->m_diffbotCrawlEndTime = getTimeGlobalNoCore();
}
// save it

@@ -2113,8 +2113,8 @@ bool XmlDoc::indexDoc ( ) {
// need to save collection rec now during auto save
cr->m_needsSave = true;
// update this just in case we are the last url crawled
long long now = gettimeofdayInMillisecondsGlobal();
cr->m_diffbotCrawlEndTime = now;
//long long now = gettimeofdayInMillisecondsGlobal();
//cr->m_diffbotCrawlEndTime = now;
}
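
Net effect of the last two hunks: m_diffbotCrawlEndTime is no longer re-stamped (in milliseconds) after every URL indexed; it is set once, in seconds, when the crawl-info handler sees the collection run out of URLs ready to spider. The lifecycle in this diff boils down to:

// set once when the collection is created (Collectiondb::addNewColl)
cr->m_diffbotCrawlStartTime = getTimeGlobalNoCore();
cr->m_diffbotCrawlEndTime   = 0;   // 0 == "not over yet"

// set once when the crawl goes idle (handleRequestc1),
// not after every document
if ( ci->m_hasUrlsReadyToSpider == 0 )
	cr->m_diffbotCrawlEndTime = getTimeGlobalNoCore();
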