Merge pull request from emmanuelcharon/diffbot-testing

Modified hop count computation for custom crawls: site roots and ping servers are no longer forced to hop count 0.
This commit is contained in:
Gigablast
2014-12-05 08:53:22 -07:00

@ -19114,6 +19114,38 @@ int8_t *XmlDoc::getHopCount ( ) {
setStatus ( "getting hop count" );
CollectionRec *cr = this->getCollRec();
// Custom-crawl fast path: when the collection is flagged m_isCustomCrawl,
// compute the hop count here and return early, bypassing the general logic
// that follows this block.
if(cr) {
if(cr->m_isCustomCrawl){
// for diffbot collections, compute hopcount without casting
// site/rss to 0 hopcount -- copied from below
LinkInfo *info1 = getLinkInfo1();
// NULL and -1 are passed straight back to the caller, reinterpreted as
// int8_t* (presumably the usual error / would-block sentinels -- confirm).
// NOTE(review): info1 is not used past this check; the call presumably
// matters because it populates m_minInlinkerHopCount -- confirm.
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (int8_t *)info1;
// hop count carried by the spider request, or -1 if no valid request
long origHopCount = -1;
if ( m_sreqValid ) {
origHopCount = m_sreq.m_hopCount;
}
// hc == -1 means "not determined yet"
long hc = -1;
// NOTE(review): hc is still -1 here, so this condition would need
// m_minInlinkerHopCount + 1 < -1 together with m_minInlinkerHopCount >= 0,
// which cannot both hold; the branch never fires. It looks like a leftover
// from the code this was copied from.
if ( m_minInlinkerHopCount + 1 < hc && m_minInlinkerHopCount >= 0 )
hc = m_minInlinkerHopCount + 1;
// first candidate: one hop beyond the closest known inlinker
if ( hc == -1 && m_minInlinkerHopCount >= 0 )
hc = m_minInlinkerHopCount + 1;
// prefer the spider request's hop count when it is valid and smaller
if ( origHopCount < hc && origHopCount >= 0 )
hc = origHopCount;
// no inlinker-based value: fall back to the spider request's hop count
if ( hc == -1 && origHopCount >= 0 )
hc = origHopCount;
// nothing known at all: default to 1
if ( hc == -1 )
hc = 1;
// cap at 0x7f (127) before storing into m_hopCount, whose address is
// returned as int8_t*
if ( hc > 0x7f ) hc = 0x7f;
m_hopCountValid = true;
m_hopCount = hc;
//printf("Custom hopcount: %d for url: %s", m_hopCount, this->ptr_firstUrl);
return &m_hopCount;
}
}
// the unredirected url
Url *f = getFirstUrl();
// get url as string, skip "http://" or "https://"
@ -24984,9 +25016,12 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
m_indexCode == EDOCNONCANONICAL ) )
ksr.m_hopCount = m_hopCount;
if ( issiteroot ) ksr.m_hopCount = 0;
if ( ispingserver ) ksr.m_hopCount = 0;
//if ( isrss ) ksr.m_hopCount = 0;
// for diffbot custom crawls we keep the computed hopcount
if(!cr->m_isCustomCrawl) {
if ( issiteroot ) ksr.m_hopCount = 0;
if ( ispingserver ) ksr.m_hopCount = 0;
//if ( isrss ) ksr.m_hopCount = 0;
}
// validate it
ksr.m_hopCountValid = true;