New urls.csv polish: moved columns around and added some new gbss fields, like spidered time.
This commit is contained in:
Matt
2015-04-15 17:42:56 -06:00
parent fec347a7df
commit ef42a9cf28
4 changed files with 61 additions and 11 deletions

@ -2340,7 +2340,7 @@ int32_t getMsgSize ( char *buf, int32_t bufSize, TcpSocket *s ) {
// /admin/basic etc
if ( pp + 7 < ppend && strncmp ( pp ,"/admin/",7)==0)
max = 0x7fffffff;
// bulk job. /v2/bulk
// bulk job. /v2/bulk or /v3/crawl/download/token-name...
if ( pp + 4 < ppend && strncmp ( pp ,"/v",2)==0 &&
// /v2/bulk
( ( pp[4] == 'b' && pp[5] == 'u' ) ||

@ -288,6 +288,47 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
return sendPageResults ( sock , &hr2 );
}
// . now the urls.csv is also a query on gbss files
// . make an httprequest on stack and call it
if ( fmt == FORMAT_CSV && rdbId == RDB_SPIDERDB ) {
char tmp2[5000];
SafeBuf sb2(tmp2,5000);
// never dedup
int32_t dr = 0;
// do not dedup for crawls either it is too confusing!!!!
// ppl wonder where the results are!
dr = 0;
sb2.safePrintf("GET /search?"
// this is not necessary
//"icc=1&"
"format=csv&"
// no site clustering
"sc=0&"
// never dedup.
"dr=0&"
"c=%s&"
"n=10000000&"
// stream it now
"stream=1&"
// no summary similarity dedup, only exact
// doc content hash. otherwise too slow!!
"pss=0&"
// no gigabits
"dsrt=0&"
// do not compute summary. 0 lines.
//"ns=0&"
"q=gbrevsortbyint%%3AgbssSpiderTime+"
"gbssIsDiffbotObject%%3A0"
"&"
//"prepend=type%%3Ajson"
"\r\n\r\n"
, cr->m_coll
);
HttpRequest hr2;
hr2.set ( sb2.getBufStart() , sb2.length() , sock );
return sendPageResults ( sock , &hr2 );
}
//if ( strncmp ( path ,"/crawlbot/downloadurls",22 ) == 0 )

@ -7975,20 +7975,21 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
"00gbssUrl",
"01gbssDocId",
"02gbssDiscoveredTime",
"03gbssDownloadStartTime",
"03gbssSpiderTime",
"06gbssContentLen",
"07gbssDupOfDocId" ,
"08gbssNumRedirects",
"09gbssFinalRedirectUrl",
"10gbssPercentContentChanged",
"10gbssCrawlDelayMS",
"11gbssCrawlRound",
"12gbssHopCount",
"13gbssIp",
"13gbssStatusMsg",
"14gbssSentToDiffbotThisTime",
"15gbssDiffbotReplyMsg",
"16gbssStatusMsg",
"gbssIp",
"gbssPercentContentChanged",
"gbssDownloadStartTime",
"gbssDownloadEndTime",
"gbssContentType",
"gbssHttpStatus",
@ -8004,7 +8005,6 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
"gbssSiteNumInlinks",
"gbssSiteRank",
"gbssLanguage",
"gbssCrawlDelayMS",
"gbssDiffbotReplyCode",
"gbssDiffbotLen",
"gbssDiffbotReplyResponseTimeMS",
@ -8169,8 +8169,8 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
if ( ! strcmp(hdr,"gbssDiscoveredTime") ) // need this!
hdr = "Url Discovered Time";
// when it was crawled this time
if ( ! strcmp(hdr,"gbssDownloadStartTime") )
hdr = "Download Time";
if ( ! strcmp(hdr,"gbssSpiderTime" ) )
hdr = "Crawled Time";
if ( ! strcmp(hdr,"gbssContentLen") )
hdr = "Content Length";
if ( ! strcmp(hdr,"gbssDupOfDocId") )
@ -8183,6 +8183,8 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
hdr = "Percent Changed";
if ( ! strcmp(hdr,"gbssCrawlRound") )
hdr = "Crawl Round";
if ( ! strcmp(hdr,"gbssCrawlDelay") )
hdr = "Robots.txt Crawl Delay (ms)";
if ( ! strcmp(hdr,"gbssHopCount") )
hdr = "Hop Count";
if ( ! strcmp(hdr,"gbssIp") )
@ -8192,7 +8194,7 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
if ( ! strcmp(hdr,"gbssDiffbotReplyMsg") )
hdr = "Process Response";
if ( ! strcmp(hdr,"gbssStatusMsg") )
hdr = "Status";
hdr = "Crawl Status";
//if ( ! strcmp(hdr,"gbssMatchingUrlFilter") )
// hdr = "Matching Expression";

@ -24584,7 +24584,7 @@ SpiderReply *XmlDoc::getFakeSpiderReply ( ) {
//if ( ! cr ) return true;
}
// getNewSpiderReply()
SpiderReply *XmlDoc::getNewSpiderReply ( ) {
if ( m_srepValid ) return &m_srep;
@ -27304,6 +27304,13 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
m_sreq.m_reservedc2);
}
if ( m_spideredTimeValid )
jd.safePrintf("\"gbssSpiderTime\":%"INT32",\n",
m_spideredTime);
else
jd.safePrintf("\"gbssSpiderTime\":%"INT32",\n",0);
if ( m_firstIndexedDateValid )
jd.safePrintf("\"gbssFirstIndexed\":%"UINT32",\n",
m_firstIndexedDate);