Merge branch 'diffbot-testing' of github.com:gigablast/open-source-search-engine into diffbot-testing

This commit is contained in:
Matt Wells
2015-03-12 13:19:16 -07:00
11 changed files with 171 additions and 59 deletions

@ -4955,7 +4955,9 @@ bool Links::set ( bool useRelNoFollow ,
//char *coll ,
bool parentIsPermalink ,
Links *oldLinks ,
bool doQuickSet ) {
bool doQuickSet ,
// some json from diffbot:
SafeBuf *diffbotReply ) {
reset();
@ -5026,6 +5028,39 @@ bool Links::set ( bool useRelNoFollow ,
// break;
//}
// get list of links from diffbot json reply
char *p = NULL;
if ( diffbotReply && diffbotReply->length() > 10 )
p = strstr ( diffbotReply->getBufStart() , "\"links\":[\"" );
// skip over the heading stuff
if ( p ) p += 10;
// parse out the links from diffbot reply
for ( ; p ; ) {
// must not be json mark up
if ( ! *p || *p == ']' || *p == '\"' ) break;
// save p
char *start = p;
// get length of the link
for ( ; *p && *p != '\"' ; p++ );
// set end of link
char *end = p;
// add the link
if ( ! addLink ( start , // linkStr
end - start , // linkStrLen
-1, // i
setLinkHash ,
TITLEREC_CURRENT_VERSION ,
niceness ,
false , // isRSS?
TAG_LINK , // node id -> LF_LINKTAG flag
0 )) // flags
return false;
// now advance to next link if any.
for ( ; *p == '\"' || *p == ',' || is_wspace_a(*p) ; p++ );
}
// visit each node in the xml tree. a node can be a tag or a non-tag.
char *urlattr = NULL;
for ( int32_t i=0; i < m_numNodes ; i++ ) {

@ -1177,7 +1177,8 @@ public:
Links *oldLinks , // for LF_OLDLINKS flag
// this is used by Msg13.cpp to quickly get ptrs
// to the links in the document, no normalization!
bool doQuickSet = false );
bool doQuickSet = false ,
class SafeBuf *diffbotReply = NULL );
// set from a simple text buffer
bool set ( char *buf , int32_t niceness ) ;

@ -849,7 +849,7 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
// overflows. when we have too many unindexed
// spiderrequests for a particular firstip, we
// start dropping so we don't spam spiderdb
"<tr class=poo><td><b>Dropped Outlinks</b></td><td>%"INT32"</td>\n"
"<tr class=poo><td><b>Dropped Spider Requests</b></td><td>%"INT32"</td>\n"
"<tr class=poo><td><b>Index Shards</b></td><td>%"INT32"</td>\n"
"<tr class=poo><td><b>Hosts per Shard</b></td><td>%"INT32"</td>\n"

@ -2672,9 +2672,17 @@ bool Pages::printCollectionNavBar ( SafeBuf *sb ,
int32_t numPrinted = 0;
bool printMsg = false;
// if doing qa test don't print out collection names because
// they are somewhat random and throw off the diff in qa.cpp
int32_t qa = hr->getLong("qa",0);
//if ( ! strcmp(coll,"qatest123") ) qa = 1;
//for ( int32_t i = a ; i < b ; i++ ) {
for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
if ( qa )
break;
CollectionRec *cc = g_collectiondb.m_recs[i];
if ( ! cc ) continue;

13
Rdb.cpp

@ -2571,6 +2571,19 @@ bool Rdb::addRecord ( collnum_t collnum,
"skipping.",sreq->m_url);
return true;
}
// if we are overflowing...
if ( isReq &&
! sreq->m_isAddUrl &&
! sreq->m_isPageReindex &&
! sreq->m_urlIsDocId &&
! sreq->m_forceDelete &&
sc->isFirstIpInOverflowList ( sreq->m_firstIp ) ) {
if ( g_conf.m_logDebugSpider )
log("spider: skipping for overflow url %s ",
sreq->m_url);
g_stats.m_totalOverflows++;
return true;
}
}
if ( m_useTree && (tn=m_tree.addNode (collnum,key,data,dataSize))>=0) {

@ -3791,6 +3791,27 @@ bool SpiderColl::readListFromSpiderdb ( ) {
//if ( m_isReadDone ) return true;
}
static int32_t s_lastIn = 0;
static int32_t s_lastOut = 0;
bool SpiderColl::isFirstIpInOverflowList ( int32_t firstIp ) {
if ( ! m_overflowList ) return false;
if ( firstIp == 0 || firstIp == -1 ) return false;
if ( firstIp == s_lastIn ) return true;
if ( firstIp == s_lastOut ) return false;
for ( int32_t oi = 0 ; ; oi++ ) {
// stop at end
if ( ! m_overflowList[oi] ) break;
// an ip of zero is end of the list
if ( m_overflowList[oi] == firstIp ) {
s_lastIn = firstIp;
return true;
}
}
s_lastOut = firstIp;
return false;
}
// . ADDS top X winners to m_winnerTree
// . this is ONLY CALLED from evalIpLoop() above
// . scan m_list that we read from spiderdb for m_scanningIp IP
@ -4787,7 +4808,7 @@ bool SpiderColl::scanListForWinners ( ) {
// don't add any more outlinks to this firstip after we
// have 10M spider requests for it.
// lower for testing
//if ( m_totalNewSpiderRequests > 100 )
//if ( m_totalNewSpiderRequests > 1 )
if ( m_totalNewSpiderRequests > 10000000 )
overflow = true;
@ -4825,6 +4846,8 @@ bool SpiderColl::scanListForWinners ( ) {
// if we need to add it...
if ( overflow && ! found && m_overflowList ) {
log("spider: adding %s to overflow list",iptoa(firstIp));
// reset this little cache thingy
s_lastOut = 0;
// take the empty slot if there is one
if ( emptySlot >= 0 )
m_overflowList[emptySlot] = firstIp;
@ -4848,6 +4871,8 @@ bool SpiderColl::scanListForWinners ( ) {
// take it out of list
m_overflowList[oi2] = -1;
log("spider: removing %s from overflow list",iptoa(firstIp));
// reset this little cache thingy
s_lastIn = 0;
break;
}
/////
@ -13954,15 +13979,3 @@ void SpiderLoop::buildActiveList ( ) {
tail = cr;
}
}
// . returns true if "firstIp" appears in this collection's m_overflowList
// . returns false if there is no list, or if firstIp is 0 or -1
// . the list is walked from the front until an entry of zero is reached
bool SpiderColl::isFirstIpInOverflowList ( int32_t firstIp ) {
	// nothing to search, or an ip value that is never reported as present
	if ( ! m_overflowList || firstIp == 0 || firstIp == -1 )
		return false;
	int32_t idx = 0;
	// an entry of zero is the end of the list
	while ( m_overflowList[idx] ) {
		if ( m_overflowList[idx] == firstIp )
			return true;
		idx++;
	}
	// hit the end without a match
	return false;
}

@ -2834,7 +2834,7 @@ int TcpServer::sslHandshake ( TcpSocket *s ) {
}
// if the connection happened return r, should be 1
if ( r > 0 ) {
//if ( g_conf.m_logDebugTcp )
if ( g_conf.m_logDebugTcp )
log("tcp: ssl handshake done. entering writing mode "
"sd=%i",s->m_sd);
// ok, it completed, go into writing mode
@ -2882,7 +2882,9 @@ int TcpServer::sslHandshake ( TcpSocket *s ) {
// read callbacks are always registered and if we need a read
// hopefully it will be called. TODO: verify this...
if ( sslError == SSL_ERROR_WANT_READ ) {
log("tcp: ssl handshake is not want write sd=%i",s->m_sd);
if ( g_conf.m_logDebugTcp )
log("tcp: ssl handshake is not want write sd=%i",
s->m_sd);
//logSSLError(s->m_ssl, r);
return 0;
}

@ -369,10 +369,16 @@ bool Xml::set ( char *s ,
// set his parent xml node if is xml
xi->m_parent = parent;
bool endsInSlash = false;
if ( xi->m_node[xi->m_nodeLen-2] == '/' ) endsInSlash = true;
if ( xi->m_node[xi->m_nodeLen-2] == '?' ) endsInSlash = true;
// if not text node then he's the new parent
if ( pureXml &&
xi->m_nodeId &&
xi->m_nodeId != TAG_COMMENT ) {
xi->m_nodeId != TAG_COMMENT &&
xi->m_nodeId != TAG_CDATA &&
! endsInSlash ) {
// if we are a back tag pop the stack
if ( ! xi->isFrontTag() ) {

@ -189,8 +189,6 @@ static int64_t s_lastTimeStart = 0LL;
void XmlDoc::reset ( ) {
m_linkOverflows = 0;
m_isImporting = false;
m_printedMenu = false;
@ -7676,6 +7674,16 @@ Links *XmlDoc::getLinks ( bool doQuickSet ) {
if ( m_linksValid ) return &m_links;
// set status
setStatus ( "getting outlinks");
// . add links from diffbot reply
// . get the reply of json objects from diffbot
// . this will be empty if we are a json object!
// . will also be empty if not meant to be sent to diffbot
// . the TOKENIZED reply consists of \0 separated json objects that
// we create from the original diffbot reply
SafeBuf *dbr = getDiffbotReply();
if ( ! dbr || dbr == (void *)-1 ) return (Links *)dbr;
// this will set it if necessary
Xml *xml = getXml();
// bail on error
@ -7739,7 +7747,8 @@ Links *XmlDoc::getLinks ( bool doQuickSet ) {
m_niceness ,
*pp , // parent url in permalink format?
oldLinks ,// oldLinks, might be NULL!
doQuickSet ))
doQuickSet ,
dbr ) )
return NULL;
m_linksValid = true;
@ -14431,18 +14440,40 @@ void gotDiffbotReplyWrapper ( void *state , TcpSocket *s ) {
if ( THIS->m_diffbotReplyError ) countIt = false;
/*
// solution for bug #2092 but probably not really needed so
// commented out.
// if doing /vxxx/analyze?mode=xxxx then ensure matches
bool isAnalyze = false;
if ( countIt &&
m_diffbotApiUrlValid &&
strstr ( m_diffbotApiUrl.getBufStart(), "/analyze?") )
THIS->m_diffbotApiUrlValid &&
strstr ( THIS->m_diffbotApiUrl.getBufStart(), "/analyze?") )
isAnalyze = true;
char *mode = NULL;
if ( isAnalyze ) {
mode = strstr (m_diffbotApiUrl.getBufStart(), "mode=");
mode = strstr (THIS->m_diffbotApiUrl.getBufStart(), "mode=");
if ( mode ) mode += 5;
// find end of it
}
char *pageType = NULL;
int32_t pageTypeLen;
if ( mode &&
THIS->m_diffbotReplyValid &&
THIS->m_diffbotReply.length() > 5 ) {
char *reply = THIS->m_diffbotReply.getBufStart();
pageType = strstr ( reply , "\"type\":\"" );
if ( pageType ) pageType += 8;
char *e = pageType;
for ( ; *e && *e != '\"' ; e++ );
pageTypeLen = e - pageType;
}
// if it does not match, do not count it
if ( mode && pageType && strncmp ( mode , pageType , pageTypeLen ) )
countIt = false;
*/
// increment this counter on a successful reply from diffbot
@ -20096,10 +20127,6 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
sb->safePrintf("outlinksadded=%04"INT32" ",
(int32_t)m_numOutlinksAdded);
if ( m_linkOverflows )
sb->safePrintf("linkoverflows=%04"INT32" ",
(int32_t)m_linkOverflows);
if ( m_metaListValid )
sb->safePrintf("addlistsize=%05"INT32" ",
(int32_t)m_metaListSize);
@ -25287,7 +25314,7 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
bool ignore = false;
if ( mbuf[0] == '1' ) ignore = true;
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull ( m_collnum );
//SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull ( m_collnum );
//
// serialize each link into the metalist now
@ -25306,11 +25333,12 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
// if firstIp is in the SpiderColl::m_overflowFirstIps list
// then do not add any more links to it. it already has
// more than 500MB worth.
if ( sc && sc->isFirstIpInOverflowList ( firstIp ) ) {
m_linkOverflows++;
g_stats.m_totalOverflows++;
continue;
}
// this was moved to Rdb.cpp's addRecord()
// if ( sc && sc->isFirstIpInOverflowList ( firstIp ) ) {
// m_linkOverflows++;
// g_stats.m_totalOverflows++;
// continue;
// }
// sanity check
//if ( firstIp == 0x03 ) {char *xx=NULL;*xx=0; }
@ -29326,7 +29354,8 @@ bool XmlDoc::hashAds ( HashTableX *tt ) {
char *descr;
//buflen = snprintf(buf,128,"%s-%s",
// m_adProvider[i],m_adClient[i]);
int32_t buflen = snprintf(buf,128,"%"UINT64"",ptr_adVector[i] );
snprintf(buf,128,"%"UINT64"",ptr_adVector[i] );
int32_t bufLen = gbstrlen(buf);
field = "gbad";
descr = "ad provider and id";
// update hash parms
@ -29338,7 +29367,7 @@ bool XmlDoc::hashAds ( HashTableX *tt ) {
//log(LOG_WARN, "build: url %s indexing ad termid %s:%s",
// getFirstUrl()->getUrl(), field, buf);
//this returns false on failure
if ( ! hashString ( buf,buflen,&hi ) ) return false;
if ( ! hashString ( buf,bufLen,&hi ) ) return false;
}
return true;
}
@ -33758,20 +33787,20 @@ bool XmlDoc::hashFacet2 ( char *prefix,
if ( strcmp(prefix,"gbfacetfloat")==0 ) isFloat = true;
// store in buffer for display on pageparser.cpp output
char buf[128];
int32_t bufLen;
char buf[130];
if ( isFloat )
bufLen=sprintf(buf,"facetField=%s facetVal32=%f",term,
*(float *)&val32);
snprintf(buf,128,"facetField=%s facetVal32=%f",term,
*(float *)&val32);
else
bufLen=sprintf(buf,"facetField=%s facetVal32=%"UINT32"",
term,(uint32_t)val32);
snprintf(buf,128,"facetField=%s facetVal32=%"UINT32"",
term,(uint32_t)val32);
int32_t bufLen = gbstrlen(buf);
// make a special hashinfo for this facet
HashInfo hi;
hi.m_tt = tt;
// the full prefix
char fullPrefix[64];
char fullPrefix[66];
snprintf(fullPrefix,64,"%s:%s",prefix,term);
hi.m_prefix = fullPrefix;//"gbfacet";
@ -33865,7 +33894,7 @@ bool XmlDoc::hashFieldMatchTerm ( char *val , int32_t vlen , HashInfo *hi ) {
hi2.m_tt = tt;
// the full prefix
char fullPrefix[64];
snprintf(fullPrefix,64,"%s:%s",prefix,hi->m_prefix);
snprintf(fullPrefix,62,"%s:%s",prefix,hi->m_prefix);
hi2.m_prefix = fullPrefix;//"gbfacet";
// add to wts for PageParser.cpp display
@ -34143,7 +34172,8 @@ bool XmlDoc::hashNumber2 ( float f , HashInfo *hi , char *sortByStr ) {
// store in buffer
char buf[128];
int32_t bufLen = sprintf(buf,"%s:%s float32=%f",sortByStr,hi->m_prefix,f);
snprintf(buf,126,"%s:%s float32=%f",sortByStr,hi->m_prefix,f);
int32_t bufLen = gbstrlen(buf);
// add to wts for PageParser.cpp display
// store it
@ -34251,7 +34281,8 @@ bool XmlDoc::hashNumber3 ( int32_t n , HashInfo *hi , char *sortByStr ) {
// store in buffer
char buf[128];
int32_t bufLen = sprintf(buf,"%s:%s int32=%"INT32"",sortByStr,hi->m_prefix,n);
snprintf(buf,126,"%s:%s int32=%"INT32"",sortByStr, hi->m_prefix,n);
int32_t bufLen = gbstrlen(buf);
// add to wts for PageParser.cpp display
// store it
@ -49843,6 +49874,9 @@ char *XmlDoc::hashXMLFields ( HashTableX *table ) {
// . we just want the "text" nodes
if ( nodes[i].isTag() ) continue;
//if(!strncmp(nodes[i].m_node,"Congress%20Presses%20Uber",20))
// log("hey:hy");
// assemble the full parent name
// like "tag1.tag2.tag3"
nameBuf.reset();

@ -2032,7 +2032,6 @@ class XmlDoc {
char m_isFiltered;
int32_t m_urlFilterNum;
int32_t m_numOutlinksAdded;
int32_t m_linkOverflows;
int32_t m_numOutlinksAddedFromSameDomain;
int32_t m_numOutlinksFiltered;
int32_t m_numOutlinksBanned;

23
qa.cpp

@ -182,6 +182,7 @@ void processReply ( char *reply , int32_t replyLen ) {
markOut ( content , "spider is done (");
markOut ( content , "spider is paused (");
markOut ( content , "spider queue empty (");
markOut ( content , "<totalShards>");
@ -545,7 +546,7 @@ bool qainject1 ( ) {
// turn off images thumbnails
if ( ! s_flags[17] ) {
s_flags[17] = true;
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1"
if ( ! getUrl ( "/admin/spider?c=qatest123&qa=1&mit=0&mns=1"
// no spider replies because it messes
// up our last test to make sure posdb
// is 100% empty.
@ -851,7 +852,7 @@ bool qainject2 ( ) {
if ( ! s_flags[17] ) {
s_flags[17] = true;
// can't turn off spiders because we need for query reindex
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1"
if ( ! getUrl ( "/admin/spider?c=qatest123&qa=1&mit=0&mns=1"
// turn off use robots to avoid that
// xyz.com/robots.txt redir to seekseek.com
"&obeyRobots=0"
@ -1041,7 +1042,7 @@ bool qaSyntax ( ) {
if ( ! s_flags[2] ) {
s_flags[2] = true;
// can't turn off spiders because we need for query reindex
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1"
if ( ! getUrl ( "/admin/spider?c=qatest123&qa=1&mit=0&mns=1"
// index spider reply status docs
"&isr=1"
// turn off use robots to avoid that
@ -1249,7 +1250,7 @@ bool qaimport () {
// turn spiders off so it doesn't spider while we are importing
if ( ! s_flags[18] ) {
s_flags[18] = true;
if ( ! getUrl ( "/admin/spider?cse=0&c=qatest123",
if ( ! getUrl ( "/admin/spider?cse=0&qa=1&c=qatest123",
// checksum of reply expected
238170006 ) )
return false;
@ -1353,7 +1354,7 @@ bool qainlinks() {
// turn spiders off so it doesn't spider while we are importing
if ( ! s_flags[18] ) {
s_flags[18] = true;
if ( ! getUrl ( "/admin/spider?cse=0&c=qatest123",
if ( ! getUrl ( "/admin/spider?cse=0&qa=1&c=qatest123",
// checksum of reply expected
238170006 ) )
return false;
@ -1485,7 +1486,7 @@ bool qareindex() {
// turn off images thumbnails
if ( ! s_flags[17] ) {
s_flags[17] = true;
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1",
if ( ! getUrl ( "/admin/spider?c=qatest123&qa=1&mit=0&mns=1",
// checksum of reply expected
238170006 ) )
return false;
@ -1755,7 +1756,7 @@ bool qaspider1 ( ) {
// set max spiders to 1 for consistency!
if ( ! s_flags[24] ) {
s_flags[24] = true;
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1"
if ( ! getUrl ( "/admin/spider?c=qatest123&qa=1&mit=0&mns=1"
// so site2:www.walmart.com works
"&isr=1"
,
@ -2022,7 +2023,7 @@ bool qaspider2 ( ) {
// turn off images thumbnails
if ( ! s_flags[24] ) {
s_flags[24] = true;
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1",
if ( ! getUrl ( "/admin/spider?c=qatest123&qa=1&mit=0&mns=1",
// checksum of reply expected
238170006 ) )
return false;
@ -2227,7 +2228,7 @@ bool qascrape ( ) {
// turn off images thumbnails
if ( ! s_flags[24] ) {
s_flags[24] = true;
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1",
if ( ! getUrl ( "/admin/spider?c=qatest123&qa=1&mit=0&mns=1",
// checksum of reply expected
238170006 ) )
return false;
@ -2346,7 +2347,7 @@ bool qajson ( ) {
// turn off images thumbnails
if ( ! s_flags[24] ) {
s_flags[24] = true;
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1"
if ( ! getUrl ( "/admin/spider?c=qatest123&qa=1&mit=0&mns=1"
// index spider replies status docs
"&isr=1"
,
@ -2570,7 +2571,7 @@ bool qaxml ( ) {
// turn off images thumbnails
if ( ! s_flags[24] ) {
s_flags[24] = true;
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1",
if ( ! getUrl ( "/admin/spider?c=qatest123&qa=1&mit=0&mns=1",
// checksum of reply expected
238170006 ) )
return false;