New urls.csv polish: moved columns around and added some new gbss fields, like spidered time.
This commit is contained in:
Matt
2015-04-15 17:42:56 -06:00
parent fec347a7df
commit ef42a9cf28
4 changed files with 61 additions and 11 deletions

@ -2340,7 +2340,7 @@ int32_t getMsgSize ( char *buf, int32_t bufSize, TcpSocket *s ) {
// /admin/basic etc
if ( pp + 7 < ppend && strncmp ( pp ,"/admin/",7)==0)
max = 0x7fffffff;
// bulk job. /v2/bulk
// bulk job. /v2/bulk or /v3/crawl/download/token-name...
if ( pp + 4 < ppend && strncmp ( pp ,"/v",2)==0 &&
// /v2/bulk
( ( pp[4] == 'b' && pp[5] == 'u' ) ||

@ -288,6 +288,47 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
return sendPageResults ( sock , &hr2 );
}
// . now the urls.csv is also a query on gbss files
// . make an httprequest on stack and call it
if ( fmt == FORMAT_CSV && rdbId == RDB_SPIDERDB ) {
char tmp2[5000];
SafeBuf sb2(tmp2,5000);
// never dedup
int32_t dr = 0;
// do not dedup for crawls either it is too confusing!!!!
// ppl wonder where the results are!
dr = 0;
sb2.safePrintf("GET /search?"
// this is not necessary
//"icc=1&"
"format=csv&"
// no site clustering
"sc=0&"
// never dedup.
"dr=0&"
"c=%s&"
"n=10000000&"
// stream it now
"stream=1&"
// no summary similarity dedup, only exact
// doc content hash. otherwise too slow!!
"pss=0&"
// no gigabits
"dsrt=0&"
// do not compute summary. 0 lines.
//"ns=0&"
"q=gbrevsortbyint%%3AgbssSpiderTime+"
"gbssIsDiffbotObject%%3A0"
"&"
//"prepend=type%%3Ajson"
"\r\n\r\n"
, cr->m_coll
);
HttpRequest hr2;
hr2.set ( sb2.getBufStart() , sb2.length() , sock );
return sendPageResults ( sock , &hr2 );
}
//if ( strncmp ( path ,"/crawlbot/downloadurls",22 ) == 0 )

@ -7975,20 +7975,21 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
"00gbssUrl",
"01gbssDocId",
"02gbssDiscoveredTime",
"03gbssDownloadStartTime",
"03gbssSpiderTime",
"06gbssContentLen",
"07gbssDupOfDocId" ,
"08gbssNumRedirects",
"09gbssFinalRedirectUrl",
"10gbssPercentContentChanged",
"10gbssCrawlDelayMS",
"11gbssCrawlRound",
"12gbssHopCount",
"13gbssIp",
"13gbssStatusMsg",
"14gbssSentToDiffbotThisTime",
"15gbssDiffbotReplyMsg",
"16gbssStatusMsg",
"gbssIp",
"gbssPercentContentChanged",
"gbssDownloadStartTime",
"gbssDownloadEndTime",
"gbssContentType",
"gbssHttpStatus",
@ -8004,7 +8005,6 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
"gbssSiteNumInlinks",
"gbssSiteRank",
"gbssLanguage",
"gbssCrawlDelayMS",
"gbssDiffbotReplyCode",
"gbssDiffbotLen",
"gbssDiffbotReplyResponseTimeMS",
@ -8169,8 +8169,8 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
if ( ! strcmp(hdr,"gbssDiscoveredTime") ) // need this!
hdr = "Url Discovered Time";
// when it was crawled this time
if ( ! strcmp(hdr,"gbssDownloadStartTime") )
hdr = "Download Time";
if ( ! strcmp(hdr,"gbssSpiderTime" ) )
hdr = "Crawled Time";
if ( ! strcmp(hdr,"gbssContentLen") )
hdr = "Content Length";
if ( ! strcmp(hdr,"gbssDupOfDocId") )
@ -8183,6 +8183,8 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
hdr = "Percent Changed";
if ( ! strcmp(hdr,"gbssCrawlRound") )
hdr = "Crawl Round";
if ( ! strcmp(hdr,"gbssCrawlDelay") )
hdr = "Robots.txt Crawl Delay (ms)";
if ( ! strcmp(hdr,"gbssHopCount") )
hdr = "Hop Count";
if ( ! strcmp(hdr,"gbssIp") )
@ -8192,7 +8194,7 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
if ( ! strcmp(hdr,"gbssDiffbotReplyMsg") )
hdr = "Process Response";
if ( ! strcmp(hdr,"gbssStatusMsg") )
hdr = "Status";
hdr = "Crawl Status";
//if ( ! strcmp(hdr,"gbssMatchingUrlFilter") )
// hdr = "Matching Expression";

@ -24584,7 +24584,7 @@ SpiderReply *XmlDoc::getFakeSpiderReply ( ) {
//if ( ! cr ) return true;
}
// getNewSpiderReply()
SpiderReply *XmlDoc::getNewSpiderReply ( ) {
if ( m_srepValid ) return &m_srep;
@ -27304,6 +27304,13 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
m_sreq.m_reservedc2);
}
if ( m_spideredTimeValid )
jd.safePrintf("\"gbssSpiderTime\":%"INT32",\n",
m_spideredTime);
else
jd.safePrintf("\"gbssSpiderTime\":%"INT32",\n",0);
if ( m_firstIndexedDateValid )
jd.safePrintf("\"gbssFirstIndexed\":%"UINT32",\n",
m_firstIndexedDate);