Merge branch 'diffbot-testing' into diffbot-matt

Matt Wells
2014-04-28 14:15:02 -07:00
12 changed files with 82 additions and 28 deletions

@@ -467,13 +467,13 @@ bool Collectiondb::addNewColl ( char *coll ,
cr->m_collectiveRespiderFrequency = 0.0;
//cr->m_restrictDomain = true;
// reset the crawl stats
// . this will core if a host was dead and then, when it came
// back up, host #0's parms.cpp told it to add a new coll
cr->m_diffbotCrawlStartTime=
gettimeofdayInMillisecondsGlobalNoCore();
cr->m_diffbotCrawlEndTime = 0LL;
}
// . this will core if a host was dead and then, when it came
// back up, host #0's parms.cpp told it to add a new coll
cr->m_diffbotCrawlStartTime = getTimeGlobalNoCore();
cr->m_diffbotCrawlEndTime = 0;
// . just the basics on these for now
// . if certain parms are changed then the url filters
// must be rebuilt, as well as possibly the waiting tree!!!
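
A note for context: the crawl start/end stamps change units here, from 64-bit milliseconds to plain-long seconds (see the matching CollectionRec header hunk below). A minimal sketch, assuming POSIX gettimeofday(), of what the two helpers plausibly compute; the real functions presumably also sync against the global clock and avoid coring, so this is illustrative only:

#include <sys/time.h>

// old style: gettimeofdayInMillisecondsGlobalNoCore() works in
// milliseconds, which need a long long
long long nowMilliseconds ( ) {
	struct timeval tv;
	gettimeofday ( &tv , NULL );
	return (long long)tv.tv_sec * 1000LL + tv.tv_usec / 1000LL;
}

// new style: getTimeGlobalNoCore() works in whole seconds, which fit
// in a plain long
long nowSeconds ( ) {
	struct timeval tv;
	gettimeofday ( &tv , NULL );
	return (long)tv.tv_sec;
}
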
@@ -807,6 +807,11 @@ bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
sc->clearLocks();
//sc->m_collnum = newCollnum;
//sc->reset();
// you have to set this for tryToDeleteSpiderColl to
// actually have a shot at deleting it
sc->m_deleteMyself = true;
// cr will be invalid shortly after this
sc->m_cr = NULL;
// this will put it on "death row" so it will be deleted
// once Msg5::m_waitingForList/Merge is NULL
tryToDeleteSpiderColl ( sc );
@@ -1611,12 +1616,14 @@ void CollectionRec::reset() {
sc->m_deleteMyself = true;
// if not currently being accessed nuke it now
if ( ! sc->m_msg5.m_waitingForList &&
! sc->m_msg5b.m_waitingForList &&
! sc->m_msg1.m_mcast.m_inUse ) {
mdelete ( sc, sizeof(SpiderColl),"nukecr2");
delete ( sc );
}
tryToDeleteSpiderColl ( sc );
// if ( ! sc->m_msg5.m_waitingForList &&
// ! sc->m_msg5b.m_waitingForList &&
// ! sc->m_msg1.m_mcast.m_inUse ) {
// mdelete ( sc, sizeof(SpiderColl),"nukecr2");
// delete ( sc );
// }
}
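
The refactor above replaces the open-coded in-use checks with a single call to tryToDeleteSpiderColl(), whose full logic is in the hunk near the end of this diff. The intended pattern, sketched with names from the diff (bodies condensed, not the real implementation):

void nukeSpiderColl ( SpiderColl *sc ) {
	// flag it so the helper knows deletion is wanted
	sc->m_deleteMyself = true;
	// one place decides: free it now if nothing is using it,
	// otherwise leave it on "death row" and let the last
	// in-flight msg5/msg1 user re-invoke the helper when done
	tryToDeleteSpiderColl ( sc );
}
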
CollectionRec *g_cr = NULL;

@@ -665,8 +665,9 @@ class CollectionRec {
long long m_maxToProcess;
long m_maxCrawlRounds;
long long m_diffbotCrawlStartTime;
long long m_diffbotCrawlEndTime;
// in seconds now
long m_diffbotCrawlStartTime;
long m_diffbotCrawlEndTime;
// for testing their regexes etc...
//char m_isDiffbotTestCrawl;

@@ -930,7 +930,7 @@ bool Msg3::doneScanning ( ) {
ff->getFilename() ,
m_niceness ) ) {
log("net: Had error while constraining list read from "
"%s: %s%s. vfd=%li parts=%li. "
"%s: %s/%s. vfd=%li parts=%li. "
"This is likely caused by corrupted "
"data on disk.",
mstrerror(g_errno), ff->m_dir ,

@@ -100,6 +100,7 @@ Msg40::Msg40() {
m_sendsIn = 0;
m_printi = 0;
m_numDisplayed = 0;
m_numPrintedSoFar = 0;
m_lastChunk = false;
//m_numGigabitInfos = 0;
}
@@ -1683,6 +1684,7 @@ bool Msg40::gotSummary ( ) {
if ( m_si && m_numDisplayed <= m_si->m_firstResultNum ){
log("msg40: hiding #%li (%lu)",
m_printi,mr->m_contentHash32);
m20->reset();
continue;
}
@@ -1690,7 +1692,9 @@
// . ok, we got it, so print it and stream it
// . this might set m_hadPrintError to true
printSearchResult9 ( m_printi );
printSearchResult9 ( m_printi , m_numPrintedSoFar );
m_numPrintedSoFar++;
// now free the reply to save memory since we could be
// streaming back 1M+. we call reset below, no need for this.
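
Two fixes land in this loop. First, a hidden result now calls m20->reset() so its Msg20 reply buffer is freed immediately instead of living for the rest of the stream. Second, the new m_numPrintedSoFar counts only results actually emitted, unlike m_printi, which also advances past hidden ones. A condensed sketch of the resulting control flow, helper names hypothetical:

for ( ; m_printi < numReplies ; m_printi++ ) {
	Msg20 *m20 = getCompletedSummary ( m_printi ); // hypothetical
	// paging/dedup can hide a result...
	if ( isHidden ( m20 ) ) {
		m20->reset();  // ...but free its reply right away
		continue;
	}
	// only printed results bump the printed counter
	printSearchResult9 ( m_printi , m_numPrintedSoFar );
	m_numPrintedSoFar++;
	m20->reset();          // freed after printing, too
}
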
@@ -5175,7 +5179,7 @@ bool Msg40::addFacts ( HashTableX *queryTable,
// . printSearchResult into "sb"
bool Msg40::printSearchResult9 ( long ix ) {
bool Msg40::printSearchResult9 ( long ix , long numPrintedSoFar ) {
// . we stream results right onto the socket
// . useful for thousands of results... and saving mem
@@ -5202,7 +5206,7 @@ bool Msg40::printSearchResult9 ( long ix ) {
}
// print that out into st->m_sb safebuf
else if ( ! printResult ( st , ix ) ) {
else if ( ! printResult ( st , ix , numPrintedSoFar ) ) {
// oom?
if ( ! g_errno ) g_errno = EBADENGINEER;
log("query: had error: %s",mstrerror(g_errno));

@@ -208,7 +208,7 @@ class Msg40 {
long m_lastHeartbeat;
bool printSearchResult9 ( long ix ) ;
bool printSearchResult9 ( long ix , long numPrintedSoFar ) ;
HashTableX m_columnTable;
bool printCSVHeaderRow ( class SafeBuf *sb );
bool printJsonItemInCSV ( class State0 *st , long ix );
@@ -265,6 +265,7 @@ class Msg40 {
long m_sendsIn ;
long m_printi ;
long m_numDisplayed ;
long m_numPrintedSoFar;
long m_socketHadError;

@@ -802,7 +802,9 @@ bool Msg5::needsRecall ( ) {
RdbBase *base = getRdbBase ( m_rdbId , m_collnum );
// if collection was deleted from under us, base will be NULL
if ( ! base && ! g_errno ) {
log("msg5: base lost for collnum %li",(long)m_collnum);
log("msg5: base lost for rdbid=%li collnum %li",
(long)m_rdbId,(long)m_collnum);
g_errno = ENOCOLLREC;
return false;
}
// sanity check

@@ -2355,10 +2355,13 @@ bool printCrawlDetailsInJson ( SafeBuf *sb , CollectionRec *cx ) {
//nomen = "job";
}
sb->safePrintf("\n\n{"
"\"name\":\"%s\",\n"
"\"type\":\"%s\",\n"
"\"jobCreationTimeUTC\":%li,\n"
"\"jobCompletionTimeUTC\":%li,\n"
//"\"alias\":\"%s\",\n"
//"\"crawlingEnabled\":%li,\n"
"\"jobStatus\":{" // nomen = jobStatus / crawlStatus
@@ -2384,6 +2387,11 @@ bool printCrawlDetailsInJson ( SafeBuf *sb , CollectionRec *cx ) {
//,cx->m_coll
, cx->m_diffbotCrawlName.getBufStart()
, crawlTypeStr
, cx->m_diffbotCrawlStartTime
// this is 0 if not over yet
, cx->m_diffbotCrawlEndTime
//, alias
//, (long)cx->m_spideringEnabled
, crawlStatus
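
With the two new safePrintf() fields wired to the arguments just above, the head of the emitted job object now carries both timestamps in epoch seconds, completion reported as 0 while the crawl is still running. Roughly, with made-up values:

{
"name":"mycrawl",
"type":"crawl",
"jobCreationTimeUTC":1398719702,
"jobCompletionTimeUTC":0,
"jobStatus":{ ... }
}
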

@@ -1001,6 +1001,7 @@ bool gotResults ( void *state ) {
// don't display more than docsWanted results
long count = msg40->getDocsWanted();
bool hadPrintError = false;
long numPrintedSoFar = 0;
//long widgetHeight = hr->getLong("widgetheight",400);
//long widgetwidth = hr->getLong("widgetwidth",250);
@@ -1044,7 +1045,7 @@
// prints in xml or html
//
//////////
if ( ! printResult ( st , i ) ) {
if ( ! printResult ( st , i , numPrintedSoFar++ ) ) {
hadPrintError = true;
break;
}
@@ -2359,7 +2360,7 @@ static bool printDMOZCategoryUnderResult ( SafeBuf *sb ,
// use this for xml as well as html
bool printResult ( State0 *st, long ix ) {
bool printResult ( State0 *st, long ix , long numPrintedSoFar ) {
SafeBuf *sb = &st->m_sb;
@@ -2440,7 +2441,7 @@ bool printResult ( State0 *st, long ix ) {
if ( mr->ptr_content ) {
// for json items separate with \n,\n
if ( si->m_format != FORMAT_HTML && ix>0 )
if ( si->m_format != FORMAT_HTML && numPrintedSoFar > 0 )
sb->safePrintf(",\n");
sb->safeStrcpy ( mr->ptr_content );
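
This is the bug the new counter exists to fix: JSON items are separated with ",\n", and the old ix > 0 test prepended a comma whenever an earlier result had been hidden (the first item actually printed then has ix >= 1), yielding a stream that starts with a stray comma. Testing the printed count instead keeps the separators valid. Self-contained illustration:

#include <stdio.h>

// emit items as a comma-separated stream, skipping hidden ones;
// the separator test must use the printed count, not the index
void emitItems ( const char **items , long n , const char *hidden ) {
	long numPrintedSoFar = 0;
	for ( long ix = 0 ; ix < n ; ix++ ) {
		if ( hidden[ix] ) continue;   // skipped, never printed
		if ( numPrintedSoFar > 0 )    // NOT "ix > 0"
			printf ( ",\n" );
		printf ( "%s" , items[ix] );
		numPrintedSoFar++;
	}
}
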

@@ -50,7 +50,7 @@ public:
bool printSearchResultsHeader ( class State0 *st ) ;
bool printResult ( class State0 *st, long ix );
bool printResult ( class State0 *st, long ix , long numPrintedSoFar );
bool printSearchResultsTail ( class State0 *st ) ;

@@ -8522,6 +8522,26 @@ void Parms::init ( ) {
m->m_flags = PF_DIFFBOT;
m++;
m->m_cgi = "dbcrawlstarttime";
m->m_xml = "diffbotCrawlStartTime";
m->m_off = (char *)&cr.m_diffbotCrawlStartTime - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "0";
m->m_flags = PF_DIFFBOT;
m++;
m->m_cgi = "dbcrawlendtime";
m->m_xml = "diffbotCrawlEndTime";
m->m_off = (char *)&cr.m_diffbotCrawlEndTime - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "0";
m->m_flags = PF_DIFFBOT;
m++;
m->m_cgi = "dbcrawlname";
m->m_xml = "diffbotCrawlName";
m->m_off = (char *)&cr.m_diffbotCrawlName - x;
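
For readers outside Parms.cpp: each entry binds a cgi name and an xml tag to a raw byte offset into CollectionRec, with x being (char *)&cr, so m_off is effectively offsetof(CollectionRec, member); registering the two timestamps here is presumably what lets them persist and reload with the collection's parms. A minimal sketch of the offset arithmetic with a stand-in struct:

#include <stddef.h>
#include <stdio.h>

struct Rec {
	long m_diffbotCrawlStartTime;
	long m_diffbotCrawlEndTime;
};

int main ( ) {
	Rec cr;
	char *x = (char *)&cr;
	long off = (char *)&cr.m_diffbotCrawlEndTime - x;
	// same value as offsetof(Rec, m_diffbotCrawlEndTime)
	printf ( "off=%li offsetof=%li\n" ,
	         off , (long)offsetof ( Rec , m_diffbotCrawlEndTime ) );
	return 0;
}
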

@@ -1026,14 +1026,22 @@ bool tryToDeleteSpiderColl ( SpiderColl *sc ) {
(long)sc,(long)sc->m_collnum);
return true;
}
// this means a msg5 read is still outstanding
if ( sc->m_msg5.m_waitingForList ) {
log("spider: deleting sc=0x%lx for collnum=%li waiting4",
(long)sc,(long)sc->m_collnum);
return true;
}
// there's still a core from someone trying to write to something
// in "sc", so we have to try to fix that somewhere in xmldoc.cpp
// or spider.cpp. everyone should get sc from cr every time, I'd think
log("spider: deleting sc=0x%lx for collnum=%li",
(long)sc,(long)sc->m_collnum);
// . make sure nobody has it
// . cr might be NULL because Collectiondb.cpp::deleteRec2() might
// have nuked it
CollectionRec *cr = sc->m_cr;
// make sure nobody has it
cr->m_spiderColl = NULL;
if ( cr ) cr->m_spiderColl = NULL;
mdelete ( sc , sizeof(SpiderColl),"postdel1");
delete ( sc );
return true;
@@ -12244,6 +12252,8 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
ci->m_hasUrlsReadyToSpider = 0;
// save that!
cr->m_needsSave = true;
// set the time that this happens
cr->m_diffbotCrawlEndTime = getTimeGlobalNoCore();
}
// save it

@@ -2113,8 +2113,8 @@ bool XmlDoc::indexDoc ( ) {
// need to save collection rec now during auto save
cr->m_needsSave = true;
// update this just in case we are the last url crawled
long long now = gettimeofdayInMillisecondsGlobal();
cr->m_diffbotCrawlEndTime = now;
//long long now = gettimeofdayInMillisecondsGlobal();
//cr->m_diffbotCrawlEndTime = now;
}
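
Net effect of the last two hunks: m_diffbotCrawlEndTime is no longer re-stamped (in milliseconds) after every URL indexed; it is set once, in seconds, when the crawl-info handler sees the collection run out of URLs ready to spider. The lifecycle in this diff boils down to:

// set once when the collection is created (Collectiondb::addNewColl)
cr->m_diffbotCrawlStartTime = getTimeGlobalNoCore();
cr->m_diffbotCrawlEndTime   = 0;   // 0 == "not over yet"

// set once when the crawl goes idle (handleRequestc1),
// not after every document
if ( ci->m_hasUrlsReadyToSpider == 0 )
	cr->m_diffbotCrawlEndTime = getTimeGlobalNoCore();
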