Merge branch 'ia' into ia-zak
commit 29212bbe9c

Collectiondb.cpp (106 changed lines)
@@ -463,12 +463,24 @@ bool Collectiondb::addNewColl ( char *coll ,
if ( ! h ) {
log("crawlbot: bad custom collname");
g_errno = EBADENGINEER;
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
delete ( cr );
return true;
}
*h = '\0';
crawl = h + 1;
if ( ! crawl[0] ) {
log("crawlbot: bad custom crawl name");
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
delete ( cr );
g_errno = EBADENGINEER;
return true;
}
// or if too big!
if ( gbstrlen(crawl) > 30 ) {
log("crawlbot: crawlbot crawl NAME is over 30 chars");
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
delete ( cr );
g_errno = EBADENGINEER;
return true;
}
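For context, the checks above validate crawlbot-style collection names. A minimal standalone sketch of the same rules, assuming the name has the form token-crawlname and that h points at the '-' separator (validateCustomCollName is a hypothetical helper, not part of this commit):

#include <cstring>
#include <cstdio>

// Hypothetical helper mirroring the checks above: the name must contain
// a '-' separator and the crawl part must be non-empty and at most 30
// characters long.
static bool validateCustomCollName ( const char *coll ) {
        const char *h = strchr ( coll , '-' );
        if ( ! h ) {
                fprintf ( stderr , "crawlbot: bad custom collname\n" );
                return false;
        }
        const char *crawl = h + 1;
        if ( ! crawl[0] ) {
                fprintf ( stderr , "crawlbot: bad custom crawl name\n" );
                return false;
        }
        if ( strlen ( crawl ) > 30 ) {
                fprintf ( stderr , "crawlbot: crawl NAME is over 30 chars\n" );
                return false;
        }
        return true;
}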
@@ -1939,12 +1951,13 @@ bool CollectionRec::load ( char *coll , int32_t i ) {

// the list of ip addresses that we have detected as being throttled
// and therefore backoff and use proxies for
sb.reset();
sb.safePrintf("%scoll.%s.%"INT32"/",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
m_twitchyTable.m_allocName = "twittbl";
m_twitchyTable.load ( sb.getBufStart() , "ipstouseproxiesfor.dat" );

if ( ! g_conf.m_doingCommandLine ) {
sb.reset();
sb.safePrintf("%scoll.%s.%"INT32"/",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
m_twitchyTable.m_allocName = "twittbl";
m_twitchyTable.load ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
}
@@ -3472,6 +3485,70 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
char *ppp = m_diffbotPageProcessPattern.getBufStart();
if ( ppp && ! ppp[0] ) ppp = NULL;

///////
//
// recompile regular expressions
//
///////

if ( m_hasucr ) {
regfree ( &m_ucr );
m_hasucr = false;
}

if ( m_hasupr ) {
regfree ( &m_upr );
m_hasupr = false;
}

// copy into tmpbuf
SafeBuf tmp;

char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasucr = true;
}
int32_t err;
if ( rx && ( err = regcomp ( &m_ucr , tmp.getBufStart() ,
REG_EXTENDED| //REG_ICASE|
REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
// error!
char errbuf[1024];
regerror(err,&m_ucr,errbuf,1000);
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,errbuf);
regfree ( &m_ucr );
m_hasucr = false;
}

rx = m_diffbotUrlProcessRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) m_hasupr = true;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasupr = true;
}
if ( rx && ( err = regcomp ( &m_upr , tmp.getBufStart() ,
REG_EXTENDED| // REG_ICASE|
REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
char errbuf[1024];
regerror(err,&m_upr,errbuf,1000);
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,errbuf);
regfree ( &m_upr );
m_hasupr = false;
}

// what diffbot url to use for processing
char *api = m_diffbotApiUrl.getBufStart();
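Once m_ucr and m_upr are compiled as above, urls are tested against them elsewhere with the POSIX regexec() call. A minimal sketch of that usage pattern, assuming the same REG_EXTENDED|REG_NEWLINE flags (urlMatchesPattern is a hypothetical name, not part of this commit):

#include <regex.h>
#include <stdio.h>

// Hypothetical illustration: compile an extended regex the way the code
// above does, test one url against it, then free it.
static bool urlMatchesPattern ( const char *pattern , const char *url ) {
        regex_t re;
        int err = regcomp ( &re , pattern , REG_EXTENDED | REG_NEWLINE );
        if ( err ) {
                char errbuf[1024];
                regerror ( err , &re , errbuf , sizeof(errbuf) );
                fprintf ( stderr , "regcomp %s failed: %s\n" , pattern , errbuf );
                return false;
        }
        // regexec() returns 0 on a match and REG_NOMATCH otherwise
        bool match = ( regexec ( &re , url , 0 , NULL , 0 ) == 0 );
        regfree ( &re );
        return match;
}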
@@ -3913,17 +3990,20 @@ void testRegex ( ) {
rx = ".*?article[0-9]*?.html";

regex_t ucr;
int32_t err;

if ( regcomp ( &ucr , rx ,
REG_ICASE
|REG_EXTENDED
//|REG_NEWLINE
//|REG_NOSUB
) ) {
if ( ( err = regcomp ( &ucr , rx ,
REG_ICASE
|REG_EXTENDED
//|REG_NEWLINE
//|REG_NOSUB
) ) ) {
// error!
char errbuf[1024];
regerror(err,&ucr,errbuf,1000);
log("xmldoc: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
rx,errbuf);
}

logf(LOG_DEBUG,"db: compiled '%s' for crawl pattern",rx);
Errno.cpp

@@ -108,6 +108,7 @@ case EDNSBAD : return "DNS sent an unknown response code";
case EDNSREFUSED : return "DNS refused to talk";
case EDNSDEAD : return "DNS hostname does not exist";
case EDNSTIMEDOUT : return "DNS timed out";
case EDNSERROR : return "DNS lookup error";
case ECOLLTOOBIG : return "Collection is too long";
case ESTRIKEOUT : return "Retried enough times, deleting doc";
case ENOPERM : return "Permission Denied";
Errno.h (1 changed line)

@@ -112,6 +112,7 @@ enum {
EDNSREFUSED , //dns refused to talk to us
EDNSDEAD , //dns is dead
EDNSTIMEDOUT , //was just EUDPTIMEDOUT
EDNSERROR ,
ECOLLTOOBIG , //collection is too long
ESTRIKEOUT , //retried enough times; deleting doc & giving up
ENOPERM , //permission denied
Mem.cpp (4 changed lines)

@@ -700,7 +700,7 @@ void Mem::addMem ( void *mem , int32_t size , const char *note , char isnew ) {
//(int32_t)mem,size,h,s_n,note);
s_n++;
// debug
if ( size > MINMEM && g_conf.m_logDebugMemUsage )
if ( (size > MINMEM && g_conf.m_logDebugMemUsage) || size>=100000000 )
log(LOG_INFO,"mem: addMem(%"INT32"): %s. ptr=0x%"PTRFMT" "
"used=%"INT64"",
size,note,(PTRTYPE)mem,m_used);

@@ -1023,7 +1023,7 @@ bool Mem::rmMem ( void *mem , int32_t size , const char *note ) {

keepgoing:
// debug
if ( size > MINMEM && g_conf.m_logDebugMemUsage )
if ( (size > MINMEM && g_conf.m_logDebugMemUsage) || size>=100000000 )
log(LOG_INFO,"mem: rmMem (%"INT32"): "
"ptr=0x%"PTRFMT" %s.",size,(PTRTYPE)mem,note);
Msg2.cpp (23 changed lines)

@@ -194,11 +194,11 @@ bool Msg2::getLists ( ) {
int32_t minRecSize = m_minRecSizes[m_i];

// sanity check
if ( ( minRecSize > ( 500 * 1024 * 1024 ) ||
minRecSize < 0) ){
log( "minRecSize = %"INT32"", minRecSize );
char *xx=NULL; *xx=0;
}
// if ( ( minRecSize > ( 500 * 1024 * 1024 ) ||
// minRecSize < 0) ){
// log( "minRecSize = %"INT32"", minRecSize );
// char *xx=NULL; *xx=0;
// }

//bool forceLocalIndexdb = true;
// if it is a no-split term, we may gotta get it over the net

@@ -407,7 +407,13 @@ bool Msg2::getLists ( ) {

// like 90MB last time i checked. so it won't read more
// than that...
int32_t minRecSizes = DEFAULT_POSDB_READSIZE;
// MDW: no, it's better to print oom then not give all the
// results leaving users scratching their heads. besides,
// we should do docid range splitting before we go out of
// mem. we should also report the size of each termlist
// in bytes in the query info header.
//int32_t minRecSizes = DEFAULT_POSDB_READSIZE;
int32_t minRecSizes = -1;

// start up the read. thread will wait in thread queue to
// launch if too many threads are out.

@@ -596,12 +602,13 @@ bool Msg2::gotList ( RdbList *list ) {
for ( int32_t i = 0 ; i < m_numLists ; i++ ) {
if ( m_lists[i].m_listSize < m_minRecSizes[i] ) continue;
if ( m_minRecSizes[i] == 0 ) continue;
if ( m_minRecSizes[i] == -1 ) continue;
// do not print this if compiling section xpathsitehash stats
// because we only need like 10k of list to get a decent
// reading
if ( m_req->m_forSectionStats ) break;
log("msg2: read termlist #%"INT32" size=%"INT32" maxSize=%"INT32". losing "
"docIds!",
log("msg2: read termlist #%"INT32" size=%"INT32" "
"maxSize=%"INT32". losing docIds!",
i,m_lists[i].m_listSize,m_minRecSizes[i]);
}
Msg3a.cpp

@@ -377,6 +377,9 @@ bool Msg3a::gotCacheReply ( ) {
// 'time enough for love' query was hitting 30MB termlists.
//rs = 50000000;
rs = DEFAULT_POSDB_READSIZE;//90000000; // 90MB!
// it is better to go oom then leave users scratching their
// heads as to why some results are not being returned.
rs = -1;
// if section stats, limit to 1MB
//if ( m_r->m_getSectionStats ) rs = 1000000;
// get the jth query term
Msg5.cpp (7 changed lines)

@@ -182,9 +182,10 @@ bool Msg5::getList ( char rdbId ,
// log("Msg5::readList: startKey > endKey warning");
// we no longer allow negative minRecSizes
if ( minRecSizes < 0 ) {
log(LOG_LOGIC,"net: msg5: MinRecSizes < 0, using 1.");
minRecSizes = 1;
char *xx = NULL; *xx = 0;
if ( g_conf.m_logDebugDb )
log(LOG_LOGIC,"net: msg5: MinRecSizes < 0, using 2GB.");
minRecSizes = 0x7fffffff;
//char *xx = NULL; *xx = 0;
}
// ensure startKey last bit clear, endKey last bit set
//if ( (startKey.n0 & 0x01) == 0x01 )
@@ -438,6 +438,25 @@ bool Msg1c::gotList ( ) {
// use only 64k values so we don't stress doledb/waittrees/etc.
// for large #'s of docids
int32_t firstIp = (docId & 0x0000ffff);

// bits 6-13 of the docid are the domain hash so use those
// when doing a REINDEX (not delete!) to ensure that requests
// on the same domain go to the same shard, at least when
// we have up to 256 shards. if we have more than 256 shards
// at this point some shards will not participate in the
// query reindex/delete process because of this, so
// we'll want to allow more bits in in that case perhaps.
// check out Hostdb::getShardNum(RDB_SPIDERDB) in Hostdb.cpp
// to see what shard is responsible for storing and indexing
// this SpiderRequest based on the firstIp.
if ( ! m_forceDel ) {
// if we are a REINDEX not a delete because
// deletes don't need to spider/redownload the doc
// so the distribution can be more random
firstIp >>= 6;
firstIp &= 0xff;
}

// 0 is not a legit val. it'll core below.
if ( firstIp == 0 ) firstIp = 1;
// use a fake ip
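The net effect of the block above is to derive a small fake firstIp from the docid: the low 16 bits for deletes, or docid bits 6-13 (the domain hash) for reindexes so same-domain requests land on the same shard. A standalone sketch of that mapping (fakeFirstIpFromDocId is a hypothetical name, not part of this commit):

#include <stdint.h>

// Hypothetical helper mirroring the logic above: keep at most 64k
// distinct values for deletes, and use the domain-hash bits for
// reindexes so requests for one domain go to one shard.
static int32_t fakeFirstIpFromDocId ( int64_t docId , bool forceDel ) {
        int32_t firstIp = (int32_t)( docId & 0x0000ffff );
        if ( ! forceDel ) {
                // bits 6-13 of the docid are the domain hash
                firstIp >>= 6;
                firstIp &= 0xff;
        }
        // 0 is not a legit value downstream
        if ( firstIp == 0 ) firstIp = 1;
        return firstIp;
}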
Parms.cpp (14 changed lines)

@@ -22598,15 +22598,25 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
"to adjust how often you want things respidered."
"</td></tr>"

"<tr class=poo><td>indexage</td>"
"<td>"
"How long has it been since the url was last "
"successfully indexed? In seconds. "
"Can use <, >, <=, >=, ==, != comparison operators."
"</td></tr>"

"<tr class=poo><td>urlage</td>"
"<td>"
"This is the time, in seconds, since a url was first "
"added to spiderdb to be spidered. This is "
"This uses the time, in seconds, since a url was "
"first added to spiderdb to be spidered, aka "
"its discovery date. "
"Can use <, >, <=, >=, ==, != comparison operators."
"</td></tr>"

//"<tr class=poo><td>!newoutlink</td>"
//"<td>Matches if document is NOT a new outlink."
//"</td></tr>"
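As a usage illustration (not part of this commit's text), the two new selectors can be combined with the comparison operators documented above in a url filter expression, for example:

urlage>86400 && indexage>604800

which would match urls discovered more than a day ago whose last successful indexing was more than a week ago.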
Spider.cpp (57 changed lines)

@@ -207,9 +207,21 @@ int32_t SpiderRequest::print ( SafeBuf *sbarg ) {
}

void SpiderReply::setKey (int32_t firstIp,
int64_t parentDocId,
// no need for parentdocid in this any more.
//int64_t parentDocId,
int64_t uh48,
bool isDel) {
// now we use a 1 parentdocid for replies that were successful
int64_t parentDocId = 1;
// or 0 if had error. this way we only keep at most 2 SpiderReplies
// for each url in spiderdb. we need to keep the last successful
// spiderreply in spiderdb so
// SpiderRequest::m_lastSuccessfulSpideredTime will be valid.
// this way the reply that was successful will occur after the
// one that had an error, so we can just check the last spider reply
// when doing our scan in scanListForWinners().
if ( m_errCode ) parentDocId = 0;

m_key = g_spiderdb.makeKey ( firstIp,uh48,false,parentDocId , isDel );
// set dataSize too!
m_dataSize = sizeof(SpiderReply) - sizeof(key128_t) - 4;
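A hypothetical restatement of the convention the comment above describes: the parentDocId slot in a SpiderReply key now only records success (1) or error (0), so spiderdb keeps at most two replies per url and the successful one sorts after the errored one.

// Hypothetical sketch, not part of this commit.
static int64_t replyParentDocId ( int32_t errCode ) {
        return errCode ? 0LL : 1LL;
}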
@@ -4565,6 +4577,13 @@ bool SpiderColl::scanListForWinners ( ) {
// assume our added time is the first time this url was added
sreq->m_discoveryTime = sreq->m_addedTime;

// record the last time we successfully indexed this doc, ifany
if ( srep && ! srep->m_errCode )
sreq->m_lastSuccessfulSpideredTime =
srep->m_spideredTime;
else
sreq->m_lastSuccessfulSpideredTime = 0;

// if ( uh48 == 110582802025376LL )
// log("hey");
@@ -4594,10 +4613,12 @@ bool SpiderColl::scanListForWinners ( ) {
// and the min added time as well!
// get the oldest timestamp so
// gbssDiscoveryTime will be accurate.
if ( sreq->m_discoveryTime < wsreq->m_discoveryTime )
if ( sreq->m_discoveryTime <
wsreq->m_discoveryTime )
wsreq->m_discoveryTime =
sreq->m_discoveryTime;
if ( wsreq->m_discoveryTime < sreq->m_discoveryTime )
if ( wsreq->m_discoveryTime <
sreq->m_discoveryTime )
sreq->m_discoveryTime =
wsreq->m_discoveryTime;
}
@@ -11313,6 +11334,7 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,

if ( *p != 'i' ) goto skipi;

if ( strncmp(p,"isinjected",10) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
@@ -11923,6 +11945,7 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
goto checkNextRule;
}

// non-boolen junk
skipi:
@@ -12407,6 +12430,32 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
goto checkNextRule;
}

// constraint for last time url was successfully indexed
if ( *p=='i' && strncmp(p,"indexage",8) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if never successfully indexed, skip this one
if ( sreq->m_lastSuccessfulSpideredTime == 0) continue;
int32_t age;
age = nowGlobal - sreq->m_lastSuccessfulSpideredTime;
// the argument entered by user
int32_t uage = atoi(s) ;
if ( sign == SIGN_EQ && age != uage ) continue;
if ( sign == SIGN_NE && age == uage ) continue;
if ( sign == SIGN_GT && age <= uage ) continue;
if ( sign == SIGN_LT && age >= uage ) continue;
if ( sign == SIGN_GE && age < uage ) continue;
if ( sign == SIGN_LE && age > uage ) continue;
// skip over 'indexage'
p += 8;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}

// selector using the first time it was added to the Spiderdb
// added by Sam, May 5th 2015
if ( *p=='u' && strncmp(p,"urlage",6) == 0 ) {
@@ -12430,6 +12479,8 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
if ( sign == SIGN_LT && sreq_age >= argument_age ) continue;
if ( sign == SIGN_GE && sreq_age < argument_age ) continue;
if ( sign == SIGN_LE && sreq_age > argument_age ) continue;
// skip over 'urlage'
p += 6;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
Spider.h (8 changed lines)

@@ -532,7 +532,11 @@ class SpiderRequest {
// then we increment the last 8 bits or so. see Msg22.cpp.
//int64_t m_probDocId;
//int32_t m_reservedc1;
int32_t m_reservedc2;
//int32_t m_reservedc2;

// if there is a 'successful' SpiderReply for this url then this is
// the SpiderReply::m_spideredTime of the most recent one.
int32_t m_lastSuccessfulSpideredTime;

//int32_t m_parentPubDate;

@@ -955,7 +959,7 @@ class SpiderReply {
void reset() { memset ( this , 0 , sizeof(SpiderReply) ); };

void setKey ( int32_t firstIp,
int64_t parentDocId ,
//int64_t parentDocId ,
int64_t uh48 ,
bool isDel ) ;
XmlDoc.cpp (88 changed lines)

@@ -1859,7 +1859,10 @@ bool XmlDoc::set2 ( char *titleRec ,
//m_hasContactInfoValid = true;

// sanity check. if m_siteValid is true, this must be there
if ( ! ptr_site ) { char *xx=NULL;*xx=0; }
if ( ! ptr_site ) {
log("set4: ptr_site is null for docid %"INT64"",m_docId);
//char *xx=NULL;*xx=0; }
}

// lookup the tagdb rec fresh if setting for a summary. that way we
// can see if it is banned or not
@@ -2534,16 +2537,50 @@ bool XmlDoc::indexDoc ( ) {
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
// sanity log
if ( *fip == 0 || *fip == -1 ) {
//
// now add a spider status doc for this so we know
// why a crawl might have failed to start
//
SafeBuf *ssDocMetaList = NULL;
// save this
int32_t saved = m_indexCode;
// and make it the real reason for the spider status doc
m_indexCode = EDNSERROR;
// get the spiderreply ready to be added

ssDocMetaList = getSpiderStatusDocMetaList(NULL ,false);//del
// revert
m_indexCode = saved;
// error?
if ( ! ssDocMetaList ) return true;
// blocked?
if ( ssDocMetaList == (void *)-1 ) return false;
// need to alloc space for it too
char *list = ssDocMetaList->getBufStart();
int32_t len = ssDocMetaList->length();
//needx += len;
// this too
m_addedStatusDocSize = len;
m_addedStatusDocSizeValid = true;

char *url = "unknown";
if ( m_sreqValid ) url = m_sreq.m_url;
log("build: error2 getting real firstip of %"INT32" for "
"%s. Not adding new spider req", (int32_t)*fip,url);
// also count it as a crawl attempt
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;

if ( ! m_metaList2.safeMemcpy ( list , len ) )
return true;

goto skipNewAdd1;
}
// store the new request (store reply for this below)
char rd = RDB_SPIDERDB;
if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
m_metaList2.pushChar(rd);
if ( ! m_metaList2.pushChar(rd) )
return true;
// store it here
SpiderRequest revisedReq;
// this fills it in
@@ -23044,7 +23081,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
srep.m_domHash32 = m_sreq.m_domHash32;
srep.m_spideredTime = getTimeGlobal();
int64_t uh48 = m_sreq.getUrlHash48();
int64_t parentDocId = 0LL;
//int64_t parentDocId = 0LL;
srep.m_contentHash32 = 0;
// were we already in titledb before we started spidering?
// yes otherwise we would have called "goto skip9" above

@@ -23054,7 +23091,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
srep.m_isIndexedINValid = false;
srep.m_errCode = EREINDEXREDIR; // indexCode
srep.m_downloadEndTime = 0;
srep.setKey ( srep.m_firstIp, parentDocId , uh48 , false );
srep.setKey ( srep.m_firstIp, /*parentDocId ,*/uh48 , false );
// lock of request needs to match that of reply so the
// reply, when recevied by Rdb.cpp which calls addSpiderReply()
// can unlock this url so it can be spidered again.
@@ -25945,7 +25982,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
log("xmldoc: uh48=%"UINT64" parentdocid=%"UINT64"",uh48,parentDocId);

// set the key, m_srep.m_key
m_srep.setKey ( firstIp, parentDocId , uh48 , false );
m_srep.setKey ( firstIp, /*parentDocId ,*/ uh48 , false );

// . did we download a page? even if indexcode is set we might have
// . if this is non-zero that means its valid
@@ -28372,7 +28409,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList ( SpiderReply *reply ,
// . TODO:
// usedProxy:1
// proxyIp:1.2.3.4
SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {

setStatus ( "making spider reply meta list");
@@ -28393,8 +28430,8 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
int64_t *uqd = getAvailDocIdOnly ( d );
if ( ! uqd || uqd == (void *)-1 ) return (SafeBuf *)uqd;

unsigned char *hc = (unsigned char *)getHopCount();
if ( ! hc || hc == (void *)-1 ) return (SafeBuf *)hc;
// unsigned char *hc = (unsigned char *)getHopCount();
// if ( ! hc || hc == (void *)-1 ) return (SafeBuf *)hc;

int32_t tmpVal = -1;
int32_t *priority = &tmpVal;
@@ -28430,7 +28467,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; }

// why isn't gbhopcount: being indexed consistently?
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }

// reset just in case
m_spiderStatusDocMetaList.reset();
@@ -28465,12 +28502,17 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
jd.safePrintf("\"gbssFinalRedirectUrl\":\"%s\",\n",
ptr_redirUrl);

if ( m_indexCodeValid ) {
jd.safePrintf("\"gbssStatusCode\":%i,\n",(int)m_indexCode);
jd.safePrintf("\"gbssStatusMsg\":\"");
jd.jsonEncode (mstrerror(m_indexCode));
jd.safePrintf("\",\n");
}
else {
jd.safePrintf("\"gbssStatusCode\":-1,\n");
jd.safePrintf("\"gbssStatusMsg\":\"???\",\n");
}

jd.safePrintf("\"gbssStatusCode\":%i,\n",(int)m_indexCode);

jd.safePrintf("\"gbssStatusMsg\":\"");
jd.jsonEncode (mstrerror(m_indexCode));
jd.safePrintf("\",\n");

if ( m_httpStatusValid )
jd.safePrintf("\"gbssHttpStatus\":%"INT32",\n",
@@ -28514,12 +28556,15 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
jd.safePrintf("\",\n");

//if ( m_redirUrlPtr && m_redirUrlValid )
jd.safePrintf("\"gbssNumRedirects\":%"INT32",\n",
m_numRedirects);
//if ( m_numRedirectsValid )
jd.safePrintf("\"gbssNumRedirects\":%"INT32",\n",m_numRedirects);

jd.safePrintf("\"gbssDocId\":%"INT64",\n", m_docId);//*uqd);
if ( m_docIdValid )
jd.safePrintf("\"gbssDocId\":%"INT64",\n", m_docId);//*uqd);

jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)*hc);
if ( m_hopCountValid )
//jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)*hc);
jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)m_hopCount);

// crawlbot round
if ( cr->m_isCustomCrawl )
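For reference, the spider status doc assembled above ends up containing JSON fields like the following; this is an illustrative, abbreviated fragment with made-up values, not output captured from this commit:

"gbssStatusCode":0,
"gbssStatusMsg":"Success",
"gbssHttpStatus":200,
"gbssNumRedirects":1,
"gbssDocId":123456789,
"gbssHopCount":2,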
@@ -28946,6 +28991,13 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
xd->ptr_site = ptr_site;
xd->size_site = size_site;

// if this is null then ip lookup failed i guess so just use
// the subdomain
if ( ! ptr_site && m_firstUrlValid ) {
xd->ptr_site = m_firstUrl.getHost();
xd->size_site = m_firstUrl.getHostLen();
}

// use the same uh48 of our parent
int64_t uh48 = m_firstUrl.getUrlHash48();
// then make into a titlerec but store in metalistbuf, not m_titleRec