Mirror of https://github.com/privacore/open-source-search-engine.git
(synced 2025-07-11 02:16:07 -04:00)
Merge branch 'diffbot-testing' into diffbot-matt

Collectiondb.cpp

@@ -467,13 +467,13 @@ bool Collectiondb::addNewColl ( char *coll ,
         cr->m_collectiveRespiderFrequency = 0.0;
         //cr->m_restrictDomain = true;
         // reset the crawl stats
-        // . this will core if a host was dead and then when it came
-        //   back up host #0's parms.cpp told it to add a new coll
-        cr->m_diffbotCrawlStartTime=
-            gettimeofdayInMillisecondsGlobalNoCore();
-        cr->m_diffbotCrawlEndTime = 0LL;
     }
+
+    // . this will core if a host was dead and then when it came
+    //   back up host #0's parms.cpp told it to add a new coll
+    cr->m_diffbotCrawlStartTime = getTimeGlobalNoCore();
+    cr->m_diffbotCrawlEndTime = 0;
 
     // . just the basics on these for now
     // . if certain parms are changed then the url filters
     //   must be rebuilt, as well as possibly the waiting tree!!!

@@ -807,6 +807,11 @@ bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
     sc->clearLocks();
     //sc->m_collnum = newCollnum;
     //sc->reset();
+    // you have to set this for tryToDeleteSpiderColl to
+    // actually have a shot at deleting it
+    sc->m_deleteMyself = true;
+    // cr will be invalid shortly after this
+    sc->m_cr = NULL;
     // this will put it on "death row" so it will be deleted
     // once Msg5::m_waitingForList/Merge is NULL
     tryToDeleteSpiderColl ( sc );

@@ -1611,12 +1616,14 @@ void CollectionRec::reset() {
         sc->m_deleteMyself = true;
 
         // if not currently being accessed nuke it now
-        if ( ! sc->m_msg5.m_waitingForList &&
-             ! sc->m_msg5b.m_waitingForList &&
-             ! sc->m_msg1.m_mcast.m_inUse ) {
-            mdelete ( sc, sizeof(SpiderColl),"nukecr2");
-            delete ( sc );
-        }
+        tryToDeleteSpiderColl ( sc );
+
+        // if ( ! sc->m_msg5.m_waitingForList &&
+        //      ! sc->m_msg5b.m_waitingForList &&
+        //      ! sc->m_msg1.m_mcast.m_inUse ) {
+        //     mdelete ( sc, sizeof(SpiderColl),"nukecr2");
+        //     delete ( sc );
+        // }
     }
 
 CollectionRec *g_cr = NULL;
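
The reset() hunk above replaces the inline conditional delete with a call to tryToDeleteSpiderColl(), so a SpiderColl that is still mid-read or mid-multicast only gets flagged (m_deleteMyself) and is reaped later, once nothing references it — the "death row" mentioned in the deleteRec2() hunk. A minimal standalone sketch of that deferred-deletion pattern (toy names, not the engine's types):

#include <cstdio>

// Toy stand-in for SpiderColl: deletion is deferred while async work
// is outstanding, instead of deleting the object out from under it.
struct Worker {
    bool busyReading  = false; // stand-in for Msg5::m_waitingForList
    bool busyCasting  = false; // stand-in for the multicast m_inUse flag
    bool deleteMyself = false; // set once the owner wants this gone
};

// Returns true if the worker was freed; false means it stays on
// "death row" and a later completion callback must retry.
bool tryToDelete(Worker *w) {
    if (!w->deleteMyself) return false;     // nobody asked for deletion
    if (w->busyReading || w->busyCasting) { // still in use: defer
        std::printf("delete deferred, worker busy\n");
        return false;
    }
    std::printf("worker deleted\n");
    delete w;
    return true;
}

int main() {
    Worker *w = new Worker;
    w->busyReading  = true;  // a read is in flight
    w->deleteMyself = true;  // owner detaches and marks for deletion
    tryToDelete(w);          // deferred: the read still holds the object
    w->busyReading  = false; // the read completes...
    tryToDelete(w);          // ...and its callback retries; now it frees
    return 0;
}

The important property is that every async completion path retries the delete, so the object is freed exactly once and never while a read still holds it.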

Collectiondb.h

@@ -665,8 +665,9 @@ class CollectionRec {
     long long m_maxToProcess;
     long m_maxCrawlRounds;
 
-    long long m_diffbotCrawlStartTime;
-    long long m_diffbotCrawlEndTime;
+    // in seconds now
+    long m_diffbotCrawlStartTime;
+    long m_diffbotCrawlEndTime;
 
     // for testing their regexes etc...
     //char m_isDiffbotTestCrawl;
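
The header change shrinks both timestamps from long long milliseconds to long seconds ("in seconds now"), matching the switch from gettimeofdayInMillisecondsGlobalNoCore() to getTimeGlobalNoCore() in Collectiondb.cpp. The width matters on the 32-bit targets this code still supported: a Unix time in seconds fits a 32-bit signed long until 2038, while a milliseconds epoch overflows it immediately. A quick self-contained check, assuming nothing from the engine:

#include <cstdio>
#include <ctime>
#include <limits>

int main() {
    long long nowSec = (long long)std::time(nullptr); // Unix time, seconds
    long long nowMs  = nowSec * 1000LL;               // same instant, millis
    long long max32  = std::numeric_limits<int>::max(); // 32-bit ceiling

    // Seconds fit a 32-bit signed value until January 2038;
    // a milliseconds epoch blew past it back in 1970.
    std::printf("seconds %lld fits in 32 bits: %s\n", nowSec,
                nowSec <= max32 ? "yes" : "no");
    std::printf("millis  %lld fits in 32 bits: %s\n", nowMs,
                nowMs <= max32 ? "yes" : "no");
    return 0;
}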

Msg3.cpp (2 changed lines)

@@ -930,7 +930,7 @@ bool Msg3::doneScanning ( ) {
                    ff->getFilename() ,
                    m_niceness ) ) {
         log("net: Had error while constraining list read from "
-            "%s: %s%s. vfd=%li parts=%li. "
+            "%s: %s/%s. vfd=%li parts=%li. "
             "This is likely caused by corrupted "
             "data on disk.",
             mstrerror(g_errno), ff->m_dir ,

Msg40.cpp (10 changed lines)

@@ -100,6 +100,7 @@ Msg40::Msg40() {
     m_sendsIn = 0;
     m_printi = 0;
     m_numDisplayed = 0;
+    m_numPrintedSoFar = 0;
     m_lastChunk = false;
     //m_numGigabitInfos = 0;
 }

@@ -1683,6 +1684,7 @@ bool Msg40::gotSummary ( ) {
     if ( m_si && m_numDisplayed <= m_si->m_firstResultNum ){
         log("msg40: hiding #%li (%lu)",
             m_printi,mr->m_contentHash32);
         m20->reset();
         continue;
     }
+

@@ -1690,7 +1692,9 @@ bool Msg40::gotSummary ( ) {
 
     // . ok, we got it, so print it and stream it
     // . this might set m_hadPrintError to true
-    printSearchResult9 ( m_printi );
+    printSearchResult9 ( m_printi , m_numPrintedSoFar );
+
+    m_numPrintedSoFar++;
 
     // now free the reply to save memory since we could be
     // streaming back 1M+. we call reset below, no need for this.

@@ -5175,7 +5179,7 @@ bool Msg40::addFacts ( HashTableX *queryTable,
 
 
 // . printSearchResult into "sb"
-bool Msg40::printSearchResult9 ( long ix ) {
+bool Msg40::printSearchResult9 ( long ix , long numPrintedSoFar ) {
 
     // . we stream results right onto the socket
     // . useful for thousands of results... and saving mem

@@ -5202,7 +5206,7 @@ bool Msg40::printSearchResult9 ( long ix ) {
     }
 
     // print that out into st->m_sb safebuf
-    else if ( ! printResult ( st , ix ) ) {
+    else if ( ! printResult ( st , ix , numPrintedSoFar ) ) {
         // oom?
         if ( ! g_errno ) g_errno = EBADENGINEER;
         log("query: had error: %s",mstrerror(g_errno));
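
The reason numPrintedSoFar is threaded through printSearchResult9() — and on into printResult(), per the PageResults.cpp hunks below — is JSON comma placement: the old separator test keyed off the result index ix, but when result #0 is hidden (deduped, or before the &s= start offset as in the gotSummary() hunk above), the first item actually printed has ix > 0, so a leading comma was emitted and the JSON went invalid. Counting printed items fixes that. A minimal standalone repro of the two policies (hypothetical helper, not the engine's API):

#include <cstdio>
#include <string>
#include <vector>

// Join only the visible items. 'useIndex' mimics the old ix>0 test;
// otherwise the separator keys off how many items were really printed.
std::string joinVisible(const std::vector<bool> &visible, bool useIndex) {
    std::string out;
    long numPrintedSoFar = 0;
    for (size_t ix = 0; ix < visible.size(); ix++) {
        if (!visible[ix]) continue; // hidden result: print nothing
        bool needComma = useIndex ? (ix > 0) : (numPrintedSoFar > 0);
        if (needComma) out += ",";
        out += "item" + std::to_string(ix);
        numPrintedSoFar++;
    }
    return out;
}

int main() {
    std::vector<bool> visible = {false, true, true}; // result #0 hidden
    // old policy: stray leading comma -> ",item1,item2" (invalid JSON)
    std::printf("ix-based:    %s\n", joinVisible(visible, true).c_str());
    // new policy: well-formed       -> "item1,item2"
    std::printf("count-based: %s\n", joinVisible(visible, false).c_str());
    return 0;
}

Note the call site in PageResults.cpp passes numPrintedSoFar++, so the count only advances when a result is actually handed to printResult().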

Msg40.h (3 changed lines)

@@ -208,7 +208,7 @@ class Msg40 {
 
     long m_lastHeartbeat;
 
-    bool printSearchResult9 ( long ix ) ;
+    bool printSearchResult9 ( long ix , long numPrintedSoFar ) ;
     HashTableX m_columnTable;
     bool printCSVHeaderRow ( class SafeBuf *sb );
     bool printJsonItemInCSV ( class State0 *st , long ix );

@@ -265,6 +265,7 @@ class Msg40 {
     long m_sendsIn ;
     long m_printi ;
     long m_numDisplayed ;
+    long m_numPrintedSoFar;
     long m_socketHadError;

Msg5.cpp (4 changed lines)

@@ -802,7 +802,9 @@ bool Msg5::needsRecall ( ) {
     RdbBase *base = getRdbBase ( m_rdbId , m_collnum );
     // if collection was deleted from under us, base will be NULL
     if ( ! base && ! g_errno ) {
-        log("msg5: base lost for collnum %li",(long)m_collnum);
+        log("msg5: base lost for rdbid=%li collnum %li",
+            (long)m_rdbId,(long)m_collnum);
         g_errno = ENOCOLLREC;
         return false;
     }
     // sanity check

PageCrawlBot.cpp

@@ -2355,10 +2355,13 @@ bool printCrawlDetailsInJson ( SafeBuf *sb , CollectionRec *cx ) {
         //nomen = "job";
     }
 
     sb->safePrintf("\n\n{"
                    "\"name\":\"%s\",\n"
                    "\"type\":\"%s\",\n"
+
+                   "\"jobCreationTimeUTC\":%li,\n"
+                   "\"jobCompletionTimeUTC\":%li,\n"
 
                    //"\"alias\":\"%s\",\n"
                    //"\"crawlingEnabled\":%li,\n"
                    "\"jobStatus\":{" // nomen = jobStatus / crawlStatus

@@ -2384,6 +2387,11 @@ bool printCrawlDetailsInJson ( SafeBuf *sb , CollectionRec *cx ) {
                    //,cx->m_coll
                    , cx->m_diffbotCrawlName.getBufStart()
                    , crawlTypeStr
+
+                   , cx->m_diffbotCrawlStartTime
+                   // this is 0 if not over yet
+                   , cx->m_diffbotCrawlEndTime
+
                    //, alias
                    //, (long)cx->m_spideringEnabled
                    , crawlStatus
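
The two new fields expose the CollectionRec timestamps added above, and the comment in the hunk pins down the convention: m_diffbotCrawlEndTime stays 0 until the crawl finishes, so jobCompletionTimeUTC == 0 means "still running", not the Unix epoch. A hedged sketch of a consumer honoring that sentinel (field name from the hunk above; everything else assumed):

#include <cstdio>
#include <ctime>

// Render a completion time the way the API above implies: 0 is a
// sentinel for "not finished yet", anything else is Unix seconds.
void printCompletionTime(long jobCompletionTimeUTC) {
    if (jobCompletionTimeUTC == 0) {
        std::printf("job still running\n");
        return;
    }
    time_t t = (time_t)jobCompletionTimeUTC;
    char buf[64];
    std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S UTC",
                  std::gmtime(&t));
    std::printf("job finished at %s\n", buf);
}

int main() {
    printCompletionTime(0);           // an in-progress crawl
    printCompletionTime(1385000000L); // a finished crawl (Nov 2013)
    return 0;
}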

PageResults.cpp

@@ -1001,6 +1001,7 @@ bool gotResults ( void *state ) {
     // don't display more than docsWanted results
     long count = msg40->getDocsWanted();
     bool hadPrintError = false;
+    long numPrintedSoFar = 0;
     //long widgetHeight = hr->getLong("widgetheight",400);
     //long widgetwidth = hr->getLong("widgetwidth",250);

@@ -1044,7 +1045,7 @@ bool gotResults ( void *state ) {
     // prints in xml or html
     //
     //////////
-    if ( ! printResult ( st , i ) ) {
+    if ( ! printResult ( st , i , numPrintedSoFar++ ) ) {
         hadPrintError = true;
         break;
     }

@@ -2359,7 +2360,7 @@ static bool printDMOZCategoryUnderResult ( SafeBuf *sb ,
 
 
 // use this for xml as well as html
-bool printResult ( State0 *st, long ix ) {
+bool printResult ( State0 *st, long ix , long numPrintedSoFar ) {
 
     SafeBuf *sb = &st->m_sb;

@@ -2440,7 +2441,7 @@ bool printResult ( State0 *st, long ix ) {
     if ( mr->ptr_content ) {
 
         // for json items separate with \n,\n
-        if ( si->m_format != FORMAT_HTML && ix>0 )
+        if ( si->m_format != FORMAT_HTML && numPrintedSoFar > 0 )
             sb->safePrintf(",\n");
 
         sb->safeStrcpy ( mr->ptr_content );

PageResults.h

@@ -50,7 +50,7 @@ public:
 
 
 bool printSearchResultsHeader ( class State0 *st ) ;
-bool printResult ( class State0 *st, long ix );
+bool printResult ( class State0 *st, long ix , long numPrintedSoFar );
 bool printSearchResultsTail ( class State0 *st ) ;

Parms.cpp (20 changed lines)

@@ -8522,6 +8522,26 @@ void Parms::init ( ) {
     m->m_flags = PF_DIFFBOT;
     m++;
 
+    m->m_cgi = "dbcrawlstarttime";
+    m->m_xml = "diffbotCrawlStartTime";
+    m->m_off = (char *)&cr.m_diffbotCrawlStartTime - x;
+    m->m_type = TYPE_LONG;
+    m->m_page = PAGE_NONE;
+    m->m_obj = OBJ_COLL;
+    m->m_def = "0";
+    m->m_flags = PF_DIFFBOT;
+    m++;
+
+    m->m_cgi = "dbcrawlendtime";
+    m->m_xml = "diffbotCrawlEndTime";
+    m->m_off = (char *)&cr.m_diffbotCrawlEndTime - x;
+    m->m_type = TYPE_LONG;
+    m->m_page = PAGE_NONE;
+    m->m_obj = OBJ_COLL;
+    m->m_def = "0";
+    m->m_flags = PF_DIFFBOT;
+    m++;
+
     m->m_cgi = "dbcrawlname";
     m->m_xml = "diffbotCrawlName";
     m->m_off = (char *)&cr.m_diffbotCrawlName - x;
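
These Parms entries use the table's offset trick: m_off stores the member's byte offset inside CollectionRec, computed against a sample object cr with x = (char *)&cr, which lets one generic engine load, save, and apply any registered setting — here persisting the new crawl start/end times as TYPE_LONG under the CGI names dbcrawlstarttime and dbcrawlendtime. A stripped-down, self-contained sketch of the same technique (Rec and Parm are toy types, not the engine's):

#include <cstdio>
#include <cstring>

struct Rec {                // toy CollectionRec
    long crawlStartTime;
    long crawlEndTime;
};

struct Parm {               // toy Parms entry: CGI name + member offset
    const char *cgi;
    long        off;        // byte offset of the member inside Rec
};

int main() {
    Rec cr;                 // sample object, used only to take offsets
    char *x = (char *)&cr;
    Parm parms[] = {
        { "dbcrawlstarttime", (long)((char *)&cr.crawlStartTime - x) },
        { "dbcrawlendtime",   (long)((char *)&cr.crawlEndTime   - x) },
    };

    // generic setter: look the parm up by CGI name, then poke the
    // member on any Rec instance through its stored offset
    Rec rec;
    std::memset(&rec, 0, sizeof(rec));
    for (Parm &p : parms)
        if (std::strcmp(p.cgi, "dbcrawlendtime") == 0)
            *(long *)((char *)&rec + p.off) = 1385000000L;

    std::printf("start=%ld end=%ld\n", rec.crawlStartTime, rec.crawlEndTime);
    return 0;
}

A plausible reason the fields became plain long in Collectiondb.h: a TYPE_LONG parm is presumably read and written as a long at that offset, so the member's width has to match the declared type.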

Spider.cpp (14 changed lines)

@@ -1026,14 +1026,22 @@ bool tryToDeleteSpiderColl ( SpiderColl *sc ) {
             (long)sc,(long)sc->m_collnum);
         return true;
     }
+    // this means msg5 is out
+    if ( sc->m_msg5.m_waitingForList ) {
+        log("spider: deleting sc=0x%lx for collnum=%li waiting4",
+            (long)sc,(long)sc->m_collnum);
+        return true;
+    }
     // there's still a core of someone trying to write to someting
     // in "sc" so we have to try to fix that. somewhere in xmldoc.cpp
     // or spider.cpp. everyone should get sc from cr everytime i'd think
     log("spider: deleting sc=0x%lx for collnum=%li",
         (long)sc,(long)sc->m_collnum);
+    // . make sure nobody has it
+    // . cr might be NULL because Collectiondb.cpp::deleteRec2() might
+    //   have nuked it
     CollectionRec *cr = sc->m_cr;
-    // make sure nobody has it
-    cr->m_spiderColl = NULL;
+    if ( cr ) cr->m_spiderColl = NULL;
     mdelete ( sc , sizeof(SpiderColl),"postdel1");
     delete ( sc );
     return true;

@@ -12244,6 +12252,8 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
         ci->m_hasUrlsReadyToSpider = 0;
         // save that!
         cr->m_needsSave = true;
+        // set the time that this happens
+        cr->m_diffbotCrawlEndTime = getTimeGlobalNoCore();
     }
 
     // save it
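
The crux of the tryToDeleteSpiderColl() fix is the if ( cr ) guard: deleteRec2() in the Collectiondb.cpp hunk above now nulls sc->m_cr before the collection rec goes away, so the old unconditional cr->m_spiderColl = NULL would dereference a NULL (or soon-dangling) pointer on exactly that path. A tiny sketch of NULL-safe back-pointer teardown (toy types, not the engine's):

#include <cstdio>

struct SpiderState;             // forward declaration

struct CollRec {                // toy CollectionRec
    SpiderState *spider = nullptr;
};

struct SpiderState {            // toy SpiderColl
    CollRec *cr = nullptr;      // back-pointer the owner may null out
};

// Teardown must tolerate the owner having detached first: unlink the
// back-reference only if it still exists, then free the state.
void destroySpider(SpiderState *s) {
    if (s->cr) s->cr->spider = nullptr; // guard: owner may be gone
    delete s;
    std::printf("spider state freed\n");
}

int main() {
    // normal path: both sides still linked at teardown
    CollRec cr;
    SpiderState *s1 = new SpiderState;
    cr.spider = s1;
    s1->cr = &cr;
    destroySpider(s1);

    // deleteRec2-style path: the owner already nulled the link
    SpiderState *s2 = new SpiderState; // s2->cr is already nullptr
    destroySpider(s2);                 // unguarded code would crash here
    return 0;
}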

XmlDoc.cpp

@@ -2113,8 +2113,8 @@ bool XmlDoc::indexDoc ( ) {
         // need to save collection rec now during auto save
         cr->m_needsSave = true;
         // update this just in case we are the last url crawled
-        long long now = gettimeofdayInMillisecondsGlobal();
-        cr->m_diffbotCrawlEndTime = now;
+        //long long now = gettimeofdayInMillisecondsGlobal();
+        //cr->m_diffbotCrawlEndTime = now;
     }