Polish the urls.csv output: reorder columns and add
some new gbss fields, such as the spidered time.
This commit is contained in:
@ -2340,7 +2340,7 @@ int32_t getMsgSize ( char *buf, int32_t bufSize, TcpSocket *s ) {
|
||||
// /admin/basic etc
|
||||
if ( pp + 7 < ppend && strncmp ( pp ,"/admin/",7)==0)
|
||||
max = 0x7fffffff;
|
||||
// bulk job. /v2/bulk
|
||||
// bulk job. /v2/bulk or /v3/crawl/download/token-name...
|
||||
if ( pp + 4 < ppend && strncmp ( pp ,"/v",2)==0 &&
|
||||
// /v2/bulk
|
||||
( ( pp[4] == 'b' && pp[5] == 'u' ) ||
|
||||
|
@ -288,6 +288,47 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
|
||||
return sendPageResults ( sock , &hr2 );
|
||||
}
|
||||
|
||||
// . now the urls.csv is also a query on gbss files
|
||||
// . make an httprequest on stack and call it
|
||||
if ( fmt == FORMAT_CSV && rdbId == RDB_SPIDERDB ) {
|
||||
char tmp2[5000];
|
||||
SafeBuf sb2(tmp2,5000);
|
||||
// never dedup
|
||||
int32_t dr = 0;
|
||||
// do not dedup for crawls either it is too confusing!!!!
|
||||
// ppl wonder where the results are!
|
||||
dr = 0;
|
||||
sb2.safePrintf("GET /search?"
|
||||
// this is not necessary
|
||||
//"icc=1&"
|
||||
"format=csv&"
|
||||
// no site clustering
|
||||
"sc=0&"
|
||||
// never dedup.
|
||||
"dr=0&"
|
||||
"c=%s&"
|
||||
"n=10000000&"
|
||||
// stream it now
|
||||
"stream=1&"
|
||||
// no summary similarity dedup, only exact
|
||||
// doc content hash. otherwise too slow!!
|
||||
"pss=0&"
|
||||
// no gigabits
|
||||
"dsrt=0&"
|
||||
// do not compute summary. 0 lines.
|
||||
//"ns=0&"
|
||||
"q=gbrevsortbyint%%3AgbssSpiderTime+"
|
||||
"gbssIsDiffbotObject%%3A0"
|
||||
"&"
|
||||
//"prepend=type%%3Ajson"
|
||||
"\r\n\r\n"
|
||||
, cr->m_coll
|
||||
);
|
||||
HttpRequest hr2;
|
||||
hr2.set ( sb2.getBufStart() , sb2.length() , sock );
|
||||
return sendPageResults ( sock , &hr2 );
|
||||
}
|
||||
|
||||
|
||||
|
||||
//if ( strncmp ( path ,"/crawlbot/downloadurls",22 ) == 0 )
|
||||
|
@ -7975,20 +7975,21 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
|
||||
"00gbssUrl",
|
||||
"01gbssDocId",
|
||||
"02gbssDiscoveredTime",
|
||||
"03gbssDownloadStartTime",
|
||||
"03gbssSpiderTime",
|
||||
"06gbssContentLen",
|
||||
"07gbssDupOfDocId" ,
|
||||
"08gbssNumRedirects",
|
||||
"09gbssFinalRedirectUrl",
|
||||
"10gbssPercentContentChanged",
|
||||
"10gbssCrawlDelayMS",
|
||||
"11gbssCrawlRound",
|
||||
"12gbssHopCount",
|
||||
"13gbssIp",
|
||||
"13gbssStatusMsg",
|
||||
"14gbssSentToDiffbotThisTime",
|
||||
"15gbssDiffbotReplyMsg",
|
||||
"16gbssStatusMsg",
|
||||
|
||||
|
||||
"gbssIp",
|
||||
"gbssPercentContentChanged",
|
||||
"gbssDownloadStartTime",
|
||||
"gbssDownloadEndTime",
|
||||
"gbssContentType",
|
||||
"gbssHttpStatus",
|
||||
@ -8004,7 +8005,6 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
|
||||
"gbssSiteNumInlinks",
|
||||
"gbssSiteRank",
|
||||
"gbssLanguage",
|
||||
"gbssCrawlDelayMS",
|
||||
"gbssDiffbotReplyCode",
|
||||
"gbssDiffbotLen",
|
||||
"gbssDiffbotReplyResponseTimeMS",
|
||||
@ -8169,8 +8169,8 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
|
||||
if ( ! strcmp(hdr,"gbssDiscoveredTime") ) // need this!
|
||||
hdr = "Url Discovered Time";
|
||||
// when it was crawled this time
|
||||
if ( ! strcmp(hdr,"gbssDownloadStartTime") )
|
||||
hdr = "Download Time";
|
||||
if ( ! strcmp(hdr,"gbssSpiderTime" ) )
|
||||
hdr = "Crawled Time";
|
||||
if ( ! strcmp(hdr,"gbssContentLen") )
|
||||
hdr = "Content Length";
|
||||
if ( ! strcmp(hdr,"gbssDupOfDocId") )
|
||||
@ -8183,6 +8183,8 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
|
||||
hdr = "Percent Changed";
|
||||
if ( ! strcmp(hdr,"gbssCrawlRound") )
|
||||
hdr = "Crawl Round";
|
||||
if ( ! strcmp(hdr,"gbssCrawlDelay") )
|
||||
hdr = "Robots.txt Crawl Delay (ms)";
|
||||
if ( ! strcmp(hdr,"gbssHopCount") )
|
||||
hdr = "Hop Count";
|
||||
if ( ! strcmp(hdr,"gbssIp") )
|
||||
@ -8192,7 +8194,7 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
|
||||
if ( ! strcmp(hdr,"gbssDiffbotReplyMsg") )
|
||||
hdr = "Process Response";
|
||||
if ( ! strcmp(hdr,"gbssStatusMsg") )
|
||||
hdr = "Status";
|
||||
hdr = "Crawl Status";
|
||||
|
||||
//if ( ! strcmp(hdr,"gbssMatchingUrlFilter") )
|
||||
// hdr = "Matching Expression";
|
||||
|
@ -24584,7 +24584,7 @@ SpiderReply *XmlDoc::getFakeSpiderReply ( ) {
|
||||
//if ( ! cr ) return true;
|
||||
}
|
||||
|
||||
|
||||
// getSpiderReply()
|
||||
SpiderReply *XmlDoc::getNewSpiderReply ( ) {
|
||||
|
||||
if ( m_srepValid ) return &m_srep;
|
||||
@ -27304,6 +27304,13 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
|
||||
m_sreq.m_reservedc2);
|
||||
}
|
||||
|
||||
if ( m_spideredTimeValid )
|
||||
jd.safePrintf("\"gbssSpiderTime\":%"INT32",\n",
|
||||
m_spideredTime);
|
||||
else
|
||||
jd.safePrintf("\"gbssSpiderTime\":%"INT32",\n",0);
|
||||
|
||||
|
||||
if ( m_firstIndexedDateValid )
|
||||
jd.safePrintf("\"gbssFirstIndexed\":%"UINT32",\n",
|
||||
m_firstIndexedDate);
|
||||
|
Reference in New Issue
Block a user