mirror of https://github.com/privacore/open-source-search-engine.git
synced 2025-07-15 02:36:08 -04:00
do not dedup if &links is in diffbot api url
(or ?links)
committed by Brian Rasmusson
parent 04e67aeb56
commit 4934afd3a7
XmlDoc.cpp (36 lines changed)
@@ -4421,22 +4421,6 @@ int32_t *XmlDoc::getIndexCode2 ( ) {
 	if ( m_recycleContent )
 		check = false;
 
-	// if &links was given in the diffbot api url then do not do
-	// spider time deduping because the pages are likely rendered using
-	// javascript, so they'd all seem to be dups of one another.
-	if ( cr->m_isCustomCrawl && check ) {
-		SafeBuf *au = getDiffbotApiUrl();
-		if ( ! au || au == (void *)-1 ) return (int32_t *)au;
-		char *linksParm = NULL;
-		if ( au->length() > 0 )
-			linksParm = strstr ( au->getBufStart() , "&links");
-		if ( linksParm && linksParm[6] && linksParm[6] != '&' )
-			linksParm = NULL;
-		if ( linksParm )
-			check = false;
-	}
-
-
 	if ( check ) {
 		// check inlinks now too!
 		LinkInfo *info1 = getLinkInfo1 ();
@@ -8937,6 +8921,26 @@ char *XmlDoc::getIsDup ( ) {
 		return &m_isDup;
 	}
 
+	// if &links was given in the diffbot api url then do not do
+	// spider time deduping because the pages are likely rendered using
+	// javascript, so they'd all seem to be dups of one another.
+	if ( cr->m_isCustomCrawl ) {
+		SafeBuf *au = getDiffbotApiUrl();
+		if ( ! au || au == (void *)-1 ) return (char *)au;
+		char *linksParm = NULL;
+		if ( au->length() > 0 )
+			linksParm = strstr ( au->getBufStart() , "&links");
+		if ( ! linksParm && au->length() > 0 )
+			linksParm = strstr ( au->getBufStart() , "?links");
+		if ( linksParm && linksParm[6] && linksParm[6] != '&' )
+			linksParm = NULL;
+		if ( linksParm ) {
+			m_isDupValid = true;
+			m_isDup = false;
+			return &m_isDup;
+		}
+	}
+
 	setStatus ( "checking for dups" );
 
 	// BUT if we are already indexed and a a crawlbot/bulk diffbot job
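Below is a minimal, self-contained sketch (not part of the commit) of the parameter test the new getIsDup() code performs. The helper name hasLinksParm and the example URLs are made up for illustration; in the patch this logic is inlined and the API URL comes from getDiffbotApiUrl().

#include <cstring>
#include <cstdio>

// Hypothetical helper mirroring the inlined check in XmlDoc::getIsDup():
// true only when "&links" or "?links" appears as a complete parameter,
// i.e. the 6-char match is followed by end-of-string or another '&'.
static bool hasLinksParm ( const char *apiUrl ) {
	if ( ! apiUrl || ! apiUrl[0] ) return false;
	// try "&links" first, then fall back to "?links", same order as the patch
	const char *p = strstr ( apiUrl , "&links" );
	if ( ! p ) p = strstr ( apiUrl , "?links" );
	// both needles are 6 chars, so p[6] is the character right after the match
	if ( p && p[6] && p[6] != '&' ) p = NULL;
	return p != NULL;
}

int main ( ) {
	printf ( "%d\n" , hasLinksParm ( "http://api.example.com/v3/article?links" ) );             // 1
	printf ( "%d\n" , hasLinksParm ( "http://api.example.com/v3/article?token=x&links" ) );     // 1
	printf ( "%d\n" , hasLinksParm ( "http://api.example.com/v3/article?token=x&links&f=y" ) ); // 1
	printf ( "%d\n" , hasLinksParm ( "http://api.example.com/v3/article?linksOnly=1" ) );       // 0
	return 0;
}

Note that the boundary test also rejects value-carrying forms such as "?links=1": only a bare links parameter, terminated by end-of-string or another '&', disables spider-time deduping, which matches the checks in the patch.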