do not dedup if &links is in diffbot api url

(or ?links)
commit 4934afd3a7
parent 04e67aeb56
Author:    Matt Wells
Date:      2015-12-09 16:52:11 -08:00
Committer: Brian Rasmusson
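The rule this commit implements: the diffbot api url counts as requesting link extraction only when it contains a bare "&links" or "?links" parameter, i.e. "links" followed by end-of-string or another '&' (so "&links=0" or "&linksFoo" do not match), and in that case spider-time deduping is skipped. Below is a minimal standalone sketch of that matching rule; the function name and test urls are hypothetical illustrations, not part of the commit:

#include <cstring>
#include <cstdio>

// hypothetical helper mirroring the check this commit adds to
// XmlDoc::getIsDup(): "&links" is tried first, then "?links"
static bool hasBareLinksParm ( const char *apiUrl ) {
	const char *p = strstr ( apiUrl , "&links" );
	if ( ! p ) p = strstr ( apiUrl , "?links" );
	if ( ! p ) return false;
	// both needles are 6 chars, so p[6] is the char after "links";
	// anything other than end-of-string or '&' means a different parm
	if ( p[6] && p[6] != '&' ) return false;
	return true;
}

int main ( ) {
	// hypothetical diffbot api urls for illustration
	printf ( "%d\n" , hasBareLinksParm ( "http://api.diffbot.com/v3/article?token=X&links" ) );   // 1
	printf ( "%d\n" , hasBareLinksParm ( "http://api.diffbot.com/v3/article?links&token=X" ) );   // 1
	printf ( "%d\n" , hasBareLinksParm ( "http://api.diffbot.com/v3/article?token=X&links=1" ) ); // 0
	printf ( "%d\n" , hasBareLinksParm ( "http://api.diffbot.com/v3/article?token=X" ) );         // 0
	return 0;
}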

@@ -4421,22 +4421,6 @@ int32_t *XmlDoc::getIndexCode2 ( ) {
 	if ( m_recycleContent )
 		check = false;
-	// if &links was given in the diffbot api url then do not do
-	// spider time deduping because the pages are likely rendered using
-	// javascript, so they'd all seem to be dups of one another.
-	if ( cr->m_isCustomCrawl && check ) {
-		SafeBuf *au = getDiffbotApiUrl();
-		if ( ! au || au == (void *)-1 ) return (int32_t *)au;
-		char *linksParm = NULL;
-		if ( au->length() > 0 )
-			linksParm = strstr ( au->getBufStart() , "&links");
-		if ( linksParm && linksParm[6] && linksParm[6] != '&' )
-			linksParm = NULL;
-		if ( linksParm )
-			check = false;
-	}
 	if ( check ) {
 		// check inlinks now too!
 		LinkInfo *info1 = getLinkInfo1 ();
@@ -8937,6 +8921,26 @@ char *XmlDoc::getIsDup ( ) {
 		return &m_isDup;
 	}
+	// if &links was given in the diffbot api url then do not do
+	// spider time deduping because the pages are likely rendered using
+	// javascript, so they'd all seem to be dups of one another.
+	if ( cr->m_isCustomCrawl ) {
+		SafeBuf *au = getDiffbotApiUrl();
+		if ( ! au || au == (void *)-1 ) return (char *)au;
+		char *linksParm = NULL;
+		if ( au->length() > 0 )
+			linksParm = strstr ( au->getBufStart() , "&links");
+		if ( ! linksParm && au->length() > 0 )
+			linksParm = strstr ( au->getBufStart() , "?links");
+		if ( linksParm && linksParm[6] && linksParm[6] != '&' )
+			linksParm = NULL;
+		if ( linksParm ) {
+			m_isDupValid = true;
+			m_isDup = false;
+			return &m_isDup;
+		}
+	}
 	setStatus ( "checking for dups" );
 	// BUT if we are already indexed and a crawlbot/bulk diffbot job
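A side note on the guard "if ( ! au || au == (void *)-1 )" at both call sites: in this codebase a getter conventionally returns NULL on error (with g_errno set), (void *)-1 when the operation blocked and will complete later via callback, or a valid pointer when the result is ready, and callers propagate the NULL/-1 value unchanged, cast to their own return type (hence "return (char *)au;"). A minimal sketch of that convention, with hypothetical stand-ins for SafeBuf and the getter:

#include <cstdio>

// hypothetical stand-in for the real SafeBuf class
struct SafeBuf { const char *m_buf; int m_len; };

// hypothetical getter demonstrating the three possible returns
static SafeBuf *getDiffbotApiUrlSketch ( bool blocked , bool error ) {
	static SafeBuf s_buf = { "http://api.diffbot.com/v3/article?links" , 39 };
	if ( error   ) return NULL;           // error; real code sets g_errno
	if ( blocked ) return (SafeBuf *)-1;  // blocked; a callback re-enters later
	return &s_buf;                        // result is ready
}

int main ( ) {
	SafeBuf *au = getDiffbotApiUrlSketch ( false , false );
	// same guard as in the diff; real callers return the value unchanged
	if ( ! au || au == (SafeBuf *)-1 ) { printf ( "not ready\n" ); return 1; }
	printf ( "api url: %.*s\n" , au->m_len , au->m_buf );
	return 0;
}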