Merge branch 'diffbot-testing' into diffbot-sam
commit 145c125abd
Collectiondb.cpp (106 changed lines)
@@ -463,12 +463,24 @@ bool Collectiondb::addNewColl ( char *coll ,
	if ( ! h ) {
		log("crawlbot: bad custom collname");
		g_errno = EBADENGINEER;
		mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
		delete ( cr );
		return true;
	}
	*h = '\0';
	crawl = h + 1;
	if ( ! crawl[0] ) {
		log("crawlbot: bad custom crawl name");
		mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
		delete ( cr );
		g_errno = EBADENGINEER;
		return true;
	}
	// or if too big!
	if ( gbstrlen(crawl) > 30 ) {
		log("crawlbot: crawlbot crawl NAME is over 30 chars");
		mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
		delete ( cr );
		g_errno = EBADENGINEER;
		return true;
	}
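The new checks reject a malformed custom collection name at three points: a missing separator, an empty crawl name after it, and a crawl name over 30 characters. A minimal standalone sketch of the same checks, assuming the token-crawlname convention with a hyphen separator (the helper function is illustrative; the commit does these checks inline in addNewColl()):

#include <cstdio>
#include <cstring>

// hypothetical helper mirroring the three checks in the hunk
bool validateCustomName(const char *collName) {
    const char *h = strchr(collName, '-');
    if (!h) {                   // no separator at all
        fprintf(stderr, "crawlbot: bad custom collname\n");
        return false;
    }
    const char *crawl = h + 1;
    if (!crawl[0]) {            // nothing after the separator
        fprintf(stderr, "crawlbot: bad custom crawl name\n");
        return false;
    }
    if (strlen(crawl) > 30) {   // enforce the 30-char cap from the hunk
        fprintf(stderr, "crawlbot: crawl name is over 30 chars\n");
        return false;
    }
    return true;
}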
@@ -1939,12 +1951,13 @@ bool CollectionRec::load ( char *coll , int32_t i ) {

	// the list of ip addresses that we have detected as being throttled
	// and therefore backoff and use proxies for
	sb.reset();
	sb.safePrintf("%scoll.%s.%"INT32"/",
		      g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
	m_twitchyTable.m_allocName = "twittbl";
	m_twitchyTable.load ( sb.getBufStart() , "ipstouseproxiesfor.dat" );

	if ( ! g_conf.m_doingCommandLine ) {
		sb.reset();
		sb.safePrintf("%scoll.%s.%"INT32"/",
			      g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
		m_twitchyTable.m_allocName = "twittbl";
		m_twitchyTable.load ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
	}
@@ -3472,6 +3485,70 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
	char *ppp = m_diffbotPageProcessPattern.getBufStart();
	if ( ppp && ! ppp[0] ) ppp = NULL;

	///////
	//
	// recompile regular expressions
	//
	///////

	if ( m_hasucr ) {
		regfree ( &m_ucr );
		m_hasucr = false;
	}

	if ( m_hasupr ) {
		regfree ( &m_upr );
		m_hasupr = false;
	}

	// copy into tmpbuf
	SafeBuf tmp;

	char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
	if ( rx && ! rx[0] ) rx = NULL;
	if ( rx ) {
		tmp.reset();
		tmp.safeStrcpy ( rx );
		expandRegExShortcuts ( &tmp );
		m_hasucr = true;
	}
	int32_t err;
	if ( rx && ( err = regcomp ( &m_ucr , tmp.getBufStart() ,
				     REG_EXTENDED| //REG_ICASE|
				     REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
		// error!
		char errbuf[1024];
		regerror(err,&m_ucr,errbuf,1000);
		log("coll: regcomp %s failed: %s. "
		    "Ignoring.",
		    rx,errbuf);
		regfree ( &m_ucr );
		m_hasucr = false;
	}

	rx = m_diffbotUrlProcessRegEx.getBufStart();
	if ( rx && ! rx[0] ) rx = NULL;
	if ( rx ) m_hasupr = true;
	if ( rx ) {
		tmp.reset();
		tmp.safeStrcpy ( rx );
		expandRegExShortcuts ( &tmp );
		m_hasupr = true;
	}
	if ( rx && ( err = regcomp ( &m_upr , tmp.getBufStart() ,
				     REG_EXTENDED| // REG_ICASE|
				     REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
		char errbuf[1024];
		regerror(err,&m_upr,errbuf,1000);
		// error!
		log("coll: regcomp %s failed: %s. "
		    "Ignoring.",
		    rx,errbuf);
		regfree ( &m_upr );
		m_hasupr = false;
	}

	// what diffbot url to use for processing
	char *api = m_diffbotApiUrl.getBufStart();
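The recompile path follows the standard POSIX regex lifecycle: regfree() any previously compiled pattern, regcomp() the new one with REG_EXTENDED|REG_NEWLINE, and on failure report the error text via regerror() and leave the has-flag false. A self-contained sketch of that lifecycle, using a hypothetical recompile() helper (the real code works directly on m_ucr/m_upr and runs expandRegExShortcuts() on the pattern first):

#include <regex.h>
#include <cstdio>

// hypothetical helper; not part of the source
bool recompile(regex_t *re, bool *hasRe, const char *pattern) {
    if (*hasRe) { regfree(re); *hasRe = false; } // drop any old compile
    if (!pattern || !pattern[0]) return true;    // empty means "no filter"
    int err = regcomp(re, pattern, REG_EXTENDED | REG_NEWLINE);
    if (err) {
        char errbuf[1024];
        regerror(err, re, errbuf, sizeof(errbuf));
        fprintf(stderr, "regcomp %s failed: %s. Ignoring.\n", pattern, errbuf);
        return false;                            // *hasRe stays false
    }
    *hasRe = true;
    return true;
}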
@@ -3913,17 +3990,20 @@ void testRegex ( ) {
	rx = ".*?article[0-9]*?.html";

	regex_t ucr;
	int32_t err;

	if ( regcomp ( &ucr , rx ,
		       REG_ICASE
		       |REG_EXTENDED
		       //|REG_NEWLINE
		       //|REG_NOSUB
		       ) ) {
	if ( ( err = regcomp ( &ucr , rx ,
			       REG_ICASE
			       |REG_EXTENDED
			       //|REG_NEWLINE
			       //|REG_NOSUB
			       ) ) ) {
		// error!
		char errbuf[1024];
		regerror(err,&ucr,errbuf,1000);
		log("xmldoc: regcomp %s failed: %s. "
		    "Ignoring.",
		    rx,mstrerror(errno));
		    rx,errbuf);
	}

	logf(LOG_DEBUG,"db: compiled '%s' for crawl pattern",rx);
Errno.cpp
@@ -108,6 +108,7 @@ case EDNSBAD : return "DNS sent an unknown response code";
	case EDNSREFUSED : return "DNS refused to talk";
	case EDNSDEAD : return "DNS hostname does not exist";
	case EDNSTIMEDOUT : return "DNS timed out";
	case EDNSERROR : return "DNS lookup error";
	case ECOLLTOOBIG : return "Collection is too long";
	case ESTRIKEOUT : return "Retried enough times, deleting doc";
	case ENOPERM : return "Permission Denied";
Errno.h (1 changed line)
@@ -112,6 +112,7 @@ enum {
	EDNSREFUSED , //dns refused to talk to us
	EDNSDEAD , //dns is dead
	EDNSTIMEDOUT , //was just EUDPTIMEDOUT
	EDNSERROR ,
	ECOLLTOOBIG , //collection is too long
	ESTRIKEOUT , //retried enough times; deleting doc & giving up
	ENOPERM , //permission denied
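ECOLLTOOBIG is added in two places that must stay in sync: the enum in Errno.h and the message switch in Errno.cpp. A hypothetical X-macro sketch of one way to define each code and its message exactly once (this is not how the source is organized; codes and messages copied from the hunks):

// define each (code, message) pair in one table
#define ERROR_CODES \
    X(EDNSTIMEDOUT, "DNS timed out")            \
    X(EDNSERROR,    "DNS lookup error")         \
    X(ECOLLTOOBIG,  "Collection is too long")   \
    X(ESTRIKEOUT,   "Retried enough times, deleting doc")

enum {
#define X(code, msg) code,
    ERROR_CODES
#undef X
};

static const char *errMsg(int code) {
    switch (code) {
#define X(code, msg) case code: return msg;
        ERROR_CODES
#undef X
    }
    return "unknown error";
}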
Mem.cpp (4 changed lines)
@@ -700,7 +700,7 @@ void Mem::addMem ( void *mem , int32_t size , const char *note , char isnew ) {
	//(int32_t)mem,size,h,s_n,note);
	s_n++;
	// debug
	if ( size > MINMEM && g_conf.m_logDebugMemUsage )
	if ( (size > MINMEM && g_conf.m_logDebugMemUsage) || size>=100000000 )
		log(LOG_INFO,"mem: addMem(%"INT32"): %s. ptr=0x%"PTRFMT" "
		    "used=%"INT64"",
		    size,note,(PTRTYPE)mem,m_used);
@@ -1023,7 +1023,7 @@ bool Mem::rmMem ( void *mem , int32_t size , const char *note ) {

 keepgoing:
	// debug
	if ( size > MINMEM && g_conf.m_logDebugMemUsage )
	if ( (size > MINMEM && g_conf.m_logDebugMemUsage) || size>=100000000 )
		log(LOG_INFO,"mem: rmMem (%"INT32"): "
		    "ptr=0x%"PTRFMT" %s.",size,(PTRTYPE)mem,note);
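Both Mem.cpp hunks make the same change to the logging predicate: allocations of 100MB or more are now logged even when the mem-usage debug flag is off. The new condition in isolation (MINMEM's real value comes from Mem.h; the constant below is a stand-in, while 100000000 is taken from the hunk itself):

#include <cstdint>

bool shouldLogAlloc(int32_t size, bool logDebugMemUsage) {
    const int32_t MINMEM_STANDIN = 8 * 1024;   // stand-in for MINMEM
    return (size > MINMEM_STANDIN && logDebugMemUsage) || size >= 100000000;
}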
Msg2.cpp (23 changed lines)
@@ -194,11 +194,11 @@ bool Msg2::getLists ( ) {
	int32_t minRecSize = m_minRecSizes[m_i];

	// sanity check
	if ( ( minRecSize > ( 500 * 1024 * 1024 ) ||
	       minRecSize < 0) ){
		log( "minRecSize = %"INT32"", minRecSize );
		char *xx=NULL; *xx=0;
	}
	// if ( ( minRecSize > ( 500 * 1024 * 1024 ) ||
	//        minRecSize < 0) ){
	// 	log( "minRecSize = %"INT32"", minRecSize );
	// 	char *xx=NULL; *xx=0;
	// }

	//bool forceLocalIndexdb = true;
	// if it is a no-split term, we may gotta get it over the net
@@ -407,7 +407,13 @@ bool Msg2::getLists ( ) {

	// like 90MB last time i checked. so it won't read more
	// than that...
	int32_t minRecSizes = DEFAULT_POSDB_READSIZE;
	// MDW: no, it's better to print oom then not give all the
	// results leaving users scratching their heads. besides,
	// we should do docid range splitting before we go out of
	// mem. we should also report the size of each termlist
	// in bytes in the query info header.
	//int32_t minRecSizes = DEFAULT_POSDB_READSIZE;
	int32_t minRecSizes = -1;

	// start up the read. thread will wait in thread queue to
	// launch if too many threads are out.
@@ -596,12 +602,13 @@ bool Msg2::gotList ( RdbList *list ) {
	for ( int32_t i = 0 ; i < m_numLists ; i++ ) {
		if ( m_lists[i].m_listSize < m_minRecSizes[i] ) continue;
		if ( m_minRecSizes[i] == 0 ) continue;
		if ( m_minRecSizes[i] == -1 ) continue;
		// do not print this if compiling section xpathsitehash stats
		// because we only need like 10k of list to get a decent
		// reading
		if ( m_req->m_forSectionStats ) break;
		log("msg2: read termlist #%"INT32" size=%"INT32" maxSize=%"INT32". losing "
		    "docIds!",
		log("msg2: read termlist #%"INT32" size=%"INT32" "
		    "maxSize=%"INT32". losing docIds!",
		    i,m_lists[i].m_listSize,m_minRecSizes[i]);
	}
Msg3a.cpp
@@ -377,6 +377,9 @@ bool Msg3a::gotCacheReply ( ) {
	// 'time enough for love' query was hitting 30MB termlists.
	//rs = 50000000;
	rs = DEFAULT_POSDB_READSIZE;//90000000; // 90MB!
	// it is better to go oom then leave users scratching their
	// heads as to why some results are not being returned.
	rs = -1;
	// if section stats, limit to 1MB
	//if ( m_r->m_getSectionStats ) rs = 1000000;
	// get the jth query term
Msg5.cpp (7 changed lines)
@@ -182,9 +182,10 @@ bool Msg5::getList ( char rdbId ,
	// log("Msg5::readList: startKey > endKey warning");
	// we no longer allow negative minRecSizes
	if ( minRecSizes < 0 ) {
		log(LOG_LOGIC,"net: msg5: MinRecSizes < 0, using 1.");
		minRecSizes = 1;
		char *xx = NULL; *xx = 0;
		if ( g_conf.m_logDebugDb )
			log(LOG_LOGIC,"net: msg5: MinRecSizes < 0, using 2GB.");
		minRecSizes = 0x7fffffff;
		//char *xx = NULL; *xx = 0;
	}
	// ensure startKey last bit clear, endKey last bit set
	//if ( (startKey.n0 & 0x01) == 0x01 )
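Together with the Msg2 and Msg3a hunks above, which now pass minRecSizes = -1 to mean "no read cap", Msg5 maps any negative size to 0x7fffffff instead of asserting. A sketch of that mapping, using a hypothetical effectiveReadSize() helper (the real code mutates minRecSizes in place):

#include <cstdint>

int32_t effectiveReadSize(int32_t minRecSizes) {
    if (minRecSizes < 0)
        return 0x7fffffff;   // ~2GB: effectively unlimited
    return minRecSizes;
}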
PageReindex.cpp
@@ -438,6 +438,25 @@ bool Msg1c::gotList ( ) {
	// use only 64k values so we don't stress doledb/waittrees/etc.
	// for large #'s of docids
	int32_t firstIp = (docId & 0x0000ffff);

	// bits 6-13 of the docid are the domain hash so use those
	// when doing a REINDEX (not delete!) to ensure that requests
	// on the same domain go to the same shard, at least when
	// we have up to 256 shards. if we have more than 256 shards
	// at this point some shards will not participate in the
	// query reindex/delete process because of this, so
	// we'll want to allow more bits in in that case perhaps.
	// check out Hostdb::getShardNum(RDB_SPIDERDB) in Hostdb.cpp
	// to see what shard is responsible for storing and indexing
	// this SpiderRequest based on the firstIp.
	if ( ! m_forceDel ) {
		// if we are a REINDEX not a delete because
		// deletes don't need to spider/redownload the doc
		// so the distribution can be more random
		firstIp >>= 6;
		firstIp &= 0xff;
	}

	// 0 is not a legit val. it'll core below.
	if ( firstIp == 0 ) firstIp = 1;
	// use a fake ip
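The comment block describes deriving a fake firstIp from the docid: the low 16 bits normally, but bits 6-13 (the domain-hash bits) for a reindex, so that requests for the same domain land on the same shard for up to 256 shards. A standalone sketch of that derivation (hypothetical free function; the real logic sits inline in Msg1c::gotList):

#include <cstdint>

int32_t fakeFirstIp(int64_t docId, bool forceDel) {
    // low 16 bits: only ~64k distinct values, easy on doledb/waittrees
    int32_t firstIp = (int32_t)(docId & 0x0000ffff);
    if (!forceDel) {
        // reindex: keep bits 6-13, the domain-hash bits, so the same
        // domain always maps to the same shard (up to 256 shards)
        firstIp >>= 6;
        firstIp &= 0xff;
    }
    if (firstIp == 0) firstIp = 1;  // 0 is not a legit value downstream
    return firstIp;
}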
XmlDoc.cpp (82 changed lines)
@@ -1800,7 +1800,10 @@ bool XmlDoc::set2 ( char *titleRec ,
	//m_hasContactInfoValid = true;

	// sanity check. if m_siteValid is true, this must be there
	if ( ! ptr_site ) { char *xx=NULL;*xx=0; }
	if ( ! ptr_site ) {
		log("set4: ptr_site is null for docid %"INT64"",m_docId);
		//char *xx=NULL;*xx=0; }
	}

	// lookup the tagdb rec fresh if setting for a summary. that way we
	// can see if it is banned or not
@@ -2469,16 +2472,50 @@ bool XmlDoc::indexDoc ( ) {
	if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
	// sanity log
	if ( *fip == 0 || *fip == -1 ) {
		//
		// now add a spider status doc for this so we know
		// why a crawl might have failed to start
		//
		SafeBuf *ssDocMetaList = NULL;
		// save this
		int32_t saved = m_indexCode;
		// and make it the real reason for the spider status doc
		m_indexCode = EDNSERROR;
		// get the spiderreply ready to be added
		ssDocMetaList = getSpiderStatusDocMetaList(NULL ,false);//del
		// revert
		m_indexCode = saved;
		// error?
		if ( ! ssDocMetaList ) return true;
		// blocked?
		if ( ssDocMetaList == (void *)-1 ) return false;
		// need to alloc space for it too
		char *list = ssDocMetaList->getBufStart();
		int32_t len = ssDocMetaList->length();
		//needx += len;
		// this too
		m_addedStatusDocSize = len;
		m_addedStatusDocSizeValid = true;

		char *url = "unknown";
		if ( m_sreqValid ) url = m_sreq.m_url;
		log("build: error2 getting real firstip of %"INT32" for "
		    "%s. Not adding new spider req", (int32_t)*fip,url);
		// also count it as a crawl attempt
		cr->m_localCrawlInfo.m_pageDownloadAttempts++;
		cr->m_globalCrawlInfo.m_pageDownloadAttempts++;

		if ( ! m_metaList2.safeMemcpy ( list , len ) )
			return true;

		goto skipNewAdd1;
	}
	// store the new request (store reply for this below)
	char rd = RDB_SPIDERDB;
	if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
	m_metaList2.pushChar(rd);
	if ( ! m_metaList2.pushChar(rd) )
		return true;
	// store it here
	SpiderRequest revisedReq;
	// this fills it in
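The error path temporarily overrides m_indexCode with EDNSERROR so the generated status doc records why the crawl failed to start, then restores the saved value. A generic sketch of that save/override/restore pattern, using a hypothetical RAII guard rather than the manual 'saved' local the hunk uses:

#include <cstdint>
#include <utility>

// hypothetical guard; not part of the source
template <typename T>
struct ScopedOverride {
    T &ref;
    T saved;
    ScopedOverride(T &r, T v) : ref(r), saved(r) { ref = std::move(v); }
    ~ScopedOverride() { ref = std::move(saved); }
};

static int32_t g_indexCode = 0;

int main() {
    {
        ScopedOverride<int32_t> guard(g_indexCode, 123); // pretend EDNSERROR
        // ... build the spider status doc while the override is live ...
    } // g_indexCode restored to 0 here
    return 0;
}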
@@ -27312,7 +27349,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList ( SpiderReply *reply ,
	// . TODO:
	//   usedProxy:1
	//   proxyIp:1.2.3.4
SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {

	setStatus ( "making spider reply meta list");
@@ -27333,8 +27370,8 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
	int64_t *uqd = getAvailDocIdOnly ( d );
	if ( ! uqd || uqd == (void *)-1 ) return (SafeBuf *)uqd;

	unsigned char *hc = (unsigned char *)getHopCount();
	if ( ! hc || hc == (void *)-1 ) return (SafeBuf *)hc;
	// unsigned char *hc = (unsigned char *)getHopCount();
	// if ( ! hc || hc == (void *)-1 ) return (SafeBuf *)hc;

	int32_t tmpVal = -1;
	int32_t *priority = &tmpVal;
@@ -27370,7 +27407,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
	if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; }

	// why isn't gbhopcount: being indexed consistently?
	if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
	//if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }

	// reset just in case
	m_spiderStatusDocMetaList.reset();
@@ -27405,12 +27442,17 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
		jd.safePrintf("\"gbssFinalRedirectUrl\":\"%s\",\n",
			      ptr_redirUrl);

	if ( m_indexCodeValid ) {
		jd.safePrintf("\"gbssStatusCode\":%i,\n",(int)m_indexCode);
		jd.safePrintf("\"gbssStatusMsg\":\"");
		jd.jsonEncode (mstrerror(m_indexCode));
		jd.safePrintf("\",\n");
	}
	else {
		jd.safePrintf("\"gbssStatusCode\":-1,\n");
		jd.safePrintf("\"gbssStatusMsg\":\"???\",\n");
	}

	jd.safePrintf("\"gbssStatusCode\":%i,\n",(int)m_indexCode);

	jd.safePrintf("\"gbssStatusMsg\":\"");
	jd.jsonEncode (mstrerror(m_indexCode));
	jd.safePrintf("\",\n");

	if ( m_httpStatusValid )
		jd.safePrintf("\"gbssHttpStatus\":%"INT32",\n",
@@ -27454,12 +27496,15 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
	jd.safePrintf("\",\n");

	//if ( m_redirUrlPtr && m_redirUrlValid )
	jd.safePrintf("\"gbssNumRedirects\":%"INT32",\n",
		      m_numRedirects);
	//if ( m_numRedirectsValid )
	jd.safePrintf("\"gbssNumRedirects\":%"INT32",\n",m_numRedirects);

	jd.safePrintf("\"gbssDocId\":%"INT64",\n", m_docId);//*uqd);
	if ( m_docIdValid )
		jd.safePrintf("\"gbssDocId\":%"INT64",\n", m_docId);//*uqd);

	jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)*hc);
	if ( m_hopCountValid )
		//jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)*hc);
		jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)m_hopCount);

	// crawlbot round
	if ( cr->m_isCustomCrawl )
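These two hunks switch the gbss* JSON fields from unconditional prints to validity-guarded ones, with an explicit placeholder when the status code is not yet known. A minimal sketch of the guarded-emit pattern (printf stands in for the SafeBuf writer, and the real code jsonEncodes the message; names are illustrative):

#include <cstdio>

void printStatusJson(bool indexCodeValid, int indexCode, const char *msg) {
    if (indexCodeValid) {
        std::printf("\"gbssStatusCode\":%i,\n", indexCode);
        std::printf("\"gbssStatusMsg\":\"%s\",\n", msg);
    } else {
        // no valid code yet: emit an explicit placeholder
        std::printf("\"gbssStatusCode\":-1,\n");
        std::printf("\"gbssStatusMsg\":\"???\",\n");
    }
}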
@@ -27886,6 +27931,13 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
	xd->ptr_site = ptr_site;
	xd->size_site = size_site;

	// if this is null then ip lookup failed i guess so just use
	// the subdomain
	if ( ! ptr_site && m_firstUrlValid ) {
		xd->ptr_site = m_firstUrl.getHost();
		xd->size_site = m_firstUrl.getHostLen();
	}

	// use the same uh48 of our parent
	int64_t uh48 = m_firstUrl.getUrlHash48();
	// then make into a titlerec but store in metalistbuf, not m_titleRec