Merge branch 'diffbot-testing' into diffbot-sam

This commit is contained in:
Matt 2015-05-19 19:39:32 -07:00
commit 145c125abd
9 changed files with 205 additions and 41 deletions

@@ -463,12 +463,24 @@ bool Collectiondb::addNewColl ( char *coll ,
if ( ! h ) {
log("crawlbot: bad custom collname");
g_errno = EBADENGINEER;
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
delete ( cr );
return true;
}
*h = '\0';
crawl = h + 1;
if ( ! crawl[0] ) {
log("crawlbot: bad custom crawl name");
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
delete ( cr );
g_errno = EBADENGINEER;
return true;
}
// or if too big!
if ( gbstrlen(crawl) > 30 ) {
log("crawlbot: crawlbot crawl NAME is over 30 chars");
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
delete ( cr );
g_errno = EBADENGINEER;
return true;
}
@@ -1939,12 +1951,13 @@ bool CollectionRec::load ( char *coll , int32_t i ) {
// the list of ip addresses that we have detected as being throttled
// and therefore backoff and use proxies for
sb.reset();
sb.safePrintf("%scoll.%s.%"INT32"/",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
m_twitchyTable.m_allocName = "twittbl";
m_twitchyTable.load ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
if ( ! g_conf.m_doingCommandLine ) {
sb.reset();
sb.safePrintf("%scoll.%s.%"INT32"/",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
m_twitchyTable.m_allocName = "twittbl";
m_twitchyTable.load ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
}
@@ -3472,6 +3485,70 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
char *ppp = m_diffbotPageProcessPattern.getBufStart();
if ( ppp && ! ppp[0] ) ppp = NULL;
///////
//
// recompile regular expressions
//
///////
if ( m_hasucr ) {
regfree ( &m_ucr );
m_hasucr = false;
}
if ( m_hasupr ) {
regfree ( &m_upr );
m_hasupr = false;
}
// copy into tmpbuf
SafeBuf tmp;
char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasucr = true;
}
int32_t err;
if ( rx && ( err = regcomp ( &m_ucr , tmp.getBufStart() ,
REG_EXTENDED| //REG_ICASE|
REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
// error!
char errbuf[1024];
regerror(err,&m_ucr,errbuf,1000);
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,errbuf);
regfree ( &m_ucr );
m_hasucr = false;
}
rx = m_diffbotUrlProcessRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) m_hasupr = true;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasupr = true;
}
if ( rx && ( err = regcomp ( &m_upr , tmp.getBufStart() ,
REG_EXTENDED| // REG_ICASE|
REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
char errbuf[1024];
regerror(err,&m_upr,errbuf,1000);
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,errbuf);
regfree ( &m_upr );
m_hasupr = false;
}
// what diffbot url to use for processing
char *api = m_diffbotApiUrl.getBufStart();
@@ -3913,17 +3990,20 @@ void testRegex ( ) {
rx = ".*?article[0-9]*?.html";
regex_t ucr;
int32_t err;
if ( regcomp ( &ucr , rx ,
REG_ICASE
|REG_EXTENDED
//|REG_NEWLINE
//|REG_NOSUB
) ) {
if ( ( err = regcomp ( &ucr , rx ,
REG_ICASE
|REG_EXTENDED
//|REG_NEWLINE
//|REG_NOSUB
) ) ) {
// error!
char errbuf[1024];
regerror(err,&ucr,errbuf,1000);
log("xmldoc: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
rx,errbuf);
}
logf(LOG_DEBUG,"db: compiled '%s' for crawl pattern",rx);

@@ -108,6 +108,7 @@ case EDNSBAD : return "DNS sent an unknown response code";
case EDNSREFUSED : return "DNS refused to talk";
case EDNSDEAD : return "DNS hostname does not exist";
case EDNSTIMEDOUT : return "DNS timed out";
case EDNSERROR : return "DNS lookup error";
case ECOLLTOOBIG : return "Collection is too long";
case ESTRIKEOUT : return "Retried enough times, deleting doc";
case ENOPERM : return "Permission Denied";

@@ -112,6 +112,7 @@ enum {
EDNSREFUSED , //dns refused to talk to us
EDNSDEAD , //dns is dead
EDNSTIMEDOUT , //was just EUDPTIMEDOUT
EDNSERROR ,
ECOLLTOOBIG , //collection is too long
ESTRIKEOUT , //retried enough times; deleting doc & giving up
ENOPERM , //permission denied

@@ -700,7 +700,7 @@ void Mem::addMem ( void *mem , int32_t size , const char *note , char isnew ) {
//(int32_t)mem,size,h,s_n,note);
s_n++;
// debug
if ( size > MINMEM && g_conf.m_logDebugMemUsage )
if ( (size > MINMEM && g_conf.m_logDebugMemUsage) || size>=100000000 )
log(LOG_INFO,"mem: addMem(%"INT32"): %s. ptr=0x%"PTRFMT" "
"used=%"INT64"",
size,note,(PTRTYPE)mem,m_used);
@@ -1023,7 +1023,7 @@ bool Mem::rmMem ( void *mem , int32_t size , const char *note ) {
keepgoing:
// debug
if ( size > MINMEM && g_conf.m_logDebugMemUsage )
if ( (size > MINMEM && g_conf.m_logDebugMemUsage) || size>=100000000 )
log(LOG_INFO,"mem: rmMem (%"INT32"): "
"ptr=0x%"PTRFMT" %s.",size,(PTRTYPE)mem,note);

@@ -194,11 +194,11 @@ bool Msg2::getLists ( ) {
int32_t minRecSize = m_minRecSizes[m_i];
// sanity check
if ( ( minRecSize > ( 500 * 1024 * 1024 ) ||
minRecSize < 0) ){
log( "minRecSize = %"INT32"", minRecSize );
char *xx=NULL; *xx=0;
}
// if ( ( minRecSize > ( 500 * 1024 * 1024 ) ||
// minRecSize < 0) ){
// log( "minRecSize = %"INT32"", minRecSize );
// char *xx=NULL; *xx=0;
// }
//bool forceLocalIndexdb = true;
// if it is a no-split term, we may gotta get it over the net
@@ -407,7 +407,13 @@ bool Msg2::getLists ( ) {
// like 90MB last time i checked. so it won't read more
// than that...
int32_t minRecSizes = DEFAULT_POSDB_READSIZE;
// MDW: no, it's better to print oom than not give all the
// results leaving users scratching their heads. besides,
// we should do docid range splitting before we go out of
// mem. we should also report the size of each termlist
// in bytes in the query info header.
//int32_t minRecSizes = DEFAULT_POSDB_READSIZE;
int32_t minRecSizes = -1;
// start up the read. thread will wait in thread queue to
// launch if too many threads are out.
@@ -596,12 +602,13 @@ bool Msg2::gotList ( RdbList *list ) {
for ( int32_t i = 0 ; i < m_numLists ; i++ ) {
if ( m_lists[i].m_listSize < m_minRecSizes[i] ) continue;
if ( m_minRecSizes[i] == 0 ) continue;
if ( m_minRecSizes[i] == -1 ) continue;
// do not print this if compiling section xpathsitehash stats
// because we only need like 10k of list to get a decent
// reading
if ( m_req->m_forSectionStats ) break;
log("msg2: read termlist #%"INT32" size=%"INT32" maxSize=%"INT32". losing "
"docIds!",
log("msg2: read termlist #%"INT32" size=%"INT32" "
"maxSize=%"INT32". losing docIds!",
i,m_lists[i].m_listSize,m_minRecSizes[i]);
}

@@ -377,6 +377,9 @@ bool Msg3a::gotCacheReply ( ) {
// 'time enough for love' query was hitting 30MB termlists.
//rs = 50000000;
rs = DEFAULT_POSDB_READSIZE;//90000000; // 90MB!
// it is better to go oom than leave users scratching their
// heads as to why some results are not being returned.
rs = -1;
// if section stats, limit to 1MB
//if ( m_r->m_getSectionStats ) rs = 1000000;
// get the jth query term

@@ -182,9 +182,10 @@ bool Msg5::getList ( char rdbId ,
// log("Msg5::readList: startKey > endKey warning");
// we no longer allow negative minRecSizes
if ( minRecSizes < 0 ) {
log(LOG_LOGIC,"net: msg5: MinRecSizes < 0, using 1.");
minRecSizes = 1;
char *xx = NULL; *xx = 0;
if ( g_conf.m_logDebugDb )
log(LOG_LOGIC,"net: msg5: MinRecSizes < 0, using 2GB.");
minRecSizes = 0x7fffffff;
//char *xx = NULL; *xx = 0;
}
// ensure startKey last bit clear, endKey last bit set
//if ( (startKey.n0 & 0x01) == 0x01 )

@@ -438,6 +438,25 @@ bool Msg1c::gotList ( ) {
// use only 64k values so we don't stress doledb/waittrees/etc.
// for large #'s of docids
int32_t firstIp = (docId & 0x0000ffff);
// bits 6-13 of the docid are the domain hash so use those
// when doing a REINDEX (not delete!) to ensure that requests
// on the same domain go to the same shard, at least when
// we have up to 256 shards. if we have more than 256 shards
// at this point some shards will not participate in the
// query reindex/delete process because of this, so
// we'll want to allow more bits in, in that case, perhaps.
// check out Hostdb::getShardNum(RDB_SPIDERDB) in Hostdb.cpp
// to see what shard is responsible for storing and indexing
// this SpiderRequest based on the firstIp.
if ( ! m_forceDel ) {
// if we are a REINDEX not a delete because
// deletes don't need to spider/redownload the doc
// so the distribution can be more random
firstIp >>= 6;
firstIp &= 0xff;
}
// 0 is not a legit val. it'll core below.
if ( firstIp == 0 ) firstIp = 1;
// use a fake ip

@@ -1800,7 +1800,10 @@ bool XmlDoc::set2 ( char *titleRec ,
//m_hasContactInfoValid = true;
// sanity check. if m_siteValid is true, this must be there
if ( ! ptr_site ) { char *xx=NULL;*xx=0; }
if ( ! ptr_site ) {
log("set4: ptr_site is null for docid %"INT64"",m_docId);
//char *xx=NULL;*xx=0; }
}
// lookup the tagdb rec fresh if setting for a summary. that way we
// can see if it is banned or not
@@ -2469,16 +2472,50 @@ bool XmlDoc::indexDoc ( ) {
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
// sanity log
if ( *fip == 0 || *fip == -1 ) {
//
// now add a spider status doc for this so we know
// why a crawl might have failed to start
//
SafeBuf *ssDocMetaList = NULL;
// save this
int32_t saved = m_indexCode;
// and make it the real reason for the spider status doc
m_indexCode = EDNSERROR;
// get the spiderreply ready to be added
ssDocMetaList = getSpiderStatusDocMetaList(NULL ,false);//del
// revert
m_indexCode = saved;
// error?
if ( ! ssDocMetaList ) return true;
// blocked?
if ( ssDocMetaList == (void *)-1 ) return false;
// need to alloc space for it too
char *list = ssDocMetaList->getBufStart();
int32_t len = ssDocMetaList->length();
//needx += len;
// this too
m_addedStatusDocSize = len;
m_addedStatusDocSizeValid = true;
char *url = "unknown";
if ( m_sreqValid ) url = m_sreq.m_url;
log("build: error2 getting real firstip of %"INT32" for "
"%s. Not adding new spider req", (int32_t)*fip,url);
// also count it as a crawl attempt
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
if ( ! m_metaList2.safeMemcpy ( list , len ) )
return true;
goto skipNewAdd1;
}
// store the new request (store reply for this below)
char rd = RDB_SPIDERDB;
if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
m_metaList2.pushChar(rd);
if ( ! m_metaList2.pushChar(rd) )
return true;
// store it here
SpiderRequest revisedReq;
// this fills it in
@@ -27312,7 +27349,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList ( SpiderReply *reply ,
// . TODO:
// usedProxy:1
// proxyIp:1.2.3.4
SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
setStatus ( "making spider reply meta list");
@@ -27333,8 +27370,8 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
int64_t *uqd = getAvailDocIdOnly ( d );
if ( ! uqd || uqd == (void *)-1 ) return (SafeBuf *)uqd;
unsigned char *hc = (unsigned char *)getHopCount();
if ( ! hc || hc == (void *)-1 ) return (SafeBuf *)hc;
// unsigned char *hc = (unsigned char *)getHopCount();
// if ( ! hc || hc == (void *)-1 ) return (SafeBuf *)hc;
int32_t tmpVal = -1;
int32_t *priority = &tmpVal;
@@ -27370,7 +27407,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; }
// why isn't gbhopcount: being indexed consistently?
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
// reset just in case
m_spiderStatusDocMetaList.reset();
@@ -27405,12 +27442,17 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
jd.safePrintf("\"gbssFinalRedirectUrl\":\"%s\",\n",
ptr_redirUrl);
if ( m_indexCodeValid ) {
jd.safePrintf("\"gbssStatusCode\":%i,\n",(int)m_indexCode);
jd.safePrintf("\"gbssStatusMsg\":\"");
jd.jsonEncode (mstrerror(m_indexCode));
jd.safePrintf("\",\n");
}
else {
jd.safePrintf("\"gbssStatusCode\":-1,\n");
jd.safePrintf("\"gbssStatusMsg\":\"???\",\n");
}
jd.safePrintf("\"gbssStatusCode\":%i,\n",(int)m_indexCode);
jd.safePrintf("\"gbssStatusMsg\":\"");
jd.jsonEncode (mstrerror(m_indexCode));
jd.safePrintf("\",\n");
if ( m_httpStatusValid )
jd.safePrintf("\"gbssHttpStatus\":%"INT32",\n",
@@ -27454,12 +27496,15 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
jd.safePrintf("\",\n");
//if ( m_redirUrlPtr && m_redirUrlValid )
jd.safePrintf("\"gbssNumRedirects\":%"INT32",\n",
m_numRedirects);
//if ( m_numRedirectsValid )
jd.safePrintf("\"gbssNumRedirects\":%"INT32",\n",m_numRedirects);
jd.safePrintf("\"gbssDocId\":%"INT64",\n", m_docId);//*uqd);
if ( m_docIdValid )
jd.safePrintf("\"gbssDocId\":%"INT64",\n", m_docId);//*uqd);
jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)*hc);
if ( m_hopCountValid )
//jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)*hc);
jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)m_hopCount);
// crawlbot round
if ( cr->m_isCustomCrawl )
@@ -27886,6 +27931,13 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
xd->ptr_site = ptr_site;
xd->size_site = size_site;
// if this is null then ip lookup failed i guess so just use
// the subdomain
if ( ! ptr_site && m_firstUrlValid ) {
xd->ptr_site = m_firstUrl.getHost();
xd->size_site = m_firstUrl.getHostLen();
}
// use the same uh48 of our parent
int64_t uh48 = m_firstUrl.getUrlHash48();
// then make into a titlerec but store in metalistbuf, not m_titleRec