Merge branch 'diffbot' of github.com:gigablast/open-source-search-engine into diffbot
@@ -2081,6 +2081,8 @@ void testRegex ( ) {
 	rx = "(http://)?(www.)?vault.com/rankings-reviews/company-rankings/law/vault-law-100/\\.aspx\\?pg=[0-9]";

+	rx = ".*?article[0-9]*?.html";
+
 	regex_t ucr;
 	if ( regcomp ( &ucr , rx ,
@@ -2097,7 +2099,8 @@ void testRegex ( ) {

 	logf(LOG_DEBUG,"db: compiled '%s' for crawl pattern",rx);

-	char *url = "http://www.vault.com/rankings-reviews/company-rankings/law/vault-law-100/.aspx?pg=2";
+	//char *url = "http://www.vault.com/rankings-reviews/company-rankings/law/vault-law-100/.aspx?pg=2";
+	char *url = "http://staticpages.diffbot.com/testCrawl/regex/article1.html";

 	if ( regexec(&ucr,url,0,NULL,0) )
 		logf(LOG_DEBUG,"db: failed to match %s on %s",
@@ -69,7 +69,9 @@ public:
 	bool printJsonItemInCsv ( char *json , SafeBuf *sb ) ;

 	long long m_lastUh48;
+	long m_lastFirstIp;
 	long long m_prevReplyUh48;
+	long m_prevReplyFirstIp;
 	long m_prevReplyError;
 	time_t m_prevReplyDownloadTime;

@@ -247,7 +249,9 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
 	st->m_needHeaderRow = true;

 	st->m_lastUh48 = 0LL;
+	st->m_lastFirstIp = 0;
 	st->m_prevReplyUh48 = 0LL;
+	st->m_prevReplyFirstIp = 0;
 	st->m_prevReplyError = 0;
 	st->m_prevReplyDownloadTime = 0LL;

@@ -714,6 +718,7 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
 		else if ( srep->m_spideredTime > lastSpidered )
 			lastSpidered = srep->m_spideredTime;
 		m_prevReplyUh48 = srep->getUrlHash48();
+		m_prevReplyFirstIp = srep->m_firstIp;
 		// 0 means indexed successfully. not sure if
 		// this includes http status codes like 404 etc.
 		// i don't think it includes those types of errors!
@@ -734,11 +739,17 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){

 		// print the url if not yet printed
 		long long uh48 = sreq->getUrlHash48 ();
+		long firstIp = sreq->m_firstIp;
 		bool printIt = false;
 		// there can be multiple spiderrequests for the same url!
 		if ( m_lastUh48 != uh48 ) printIt = true;
+		// sometimes the same url has different firstips now that
+		// we have the EFAKEFIRSTIP spider error to avoid spidering
+		// seeds twice...
+		if ( m_lastFirstIp != firstIp ) printIt = true;
 		if ( ! printIt ) continue;
 		m_lastUh48 = uh48;
+		m_lastFirstIp = firstIp;

 		// make sure spiderreply is for the same url!
 		if ( srep && srep->getUrlHash48() != sreq->getUrlHash48() )
@@ -762,6 +773,7 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
 		// so set "status" to 0 to indicate hasn't been
 		// downloaded yet.
 		if ( m_lastUh48 != m_prevReplyUh48 ) status = 0;
+		if ( m_lastFirstIp != m_prevReplyFirstIp ) status = 0;
 		// if it matches, perhaps an error spidering it?
 		if ( status && m_prevReplyError ) status = -1;

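Note: the hunks above widen the CSV dump's dedup key from uh48 alone to the (uh48, firstIp) pair, since one url can now appear once under a fake firstip and again under the real one. A minimal standalone sketch of the last-seen-key pattern follows; the types and values are illustrative stand-ins, and like spiderdb it assumes records for the same url arrive adjacently.

#include <stdint.h>
#include <stdio.h>

struct Rec { int64_t uh48; int32_t firstIp; const char *url; };

int main ( ) {
	Rec recs[] = {
		{ 0x1111 , 0x7f000001 , "http://a.com/" } ,
		{ 0x1111 , 0x7f000001 , "http://a.com/" } , // same key: skipped
		{ 0x1111 , 0x0a000001 , "http://a.com/" } , // new firstip: printed
	};
	int64_t lastUh48 = 0LL;
	int32_t lastFirstIp = 0;
	for ( int i = 0 ; i < 3 ; i++ ) {
		bool printIt = false;
		if ( lastUh48    != recs[i].uh48    ) printIt = true;
		if ( lastFirstIp != recs[i].firstIp ) printIt = true;
		if ( ! printIt ) continue;
		lastUh48    = recs[i].uh48;
		lastFirstIp = recs[i].firstIp;
		printf ( "%s\n" , recs[i].url );
	}
	return 0;
}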
Spider.cpp (23 changes)
@@ -3169,6 +3169,21 @@ bool SpiderColl::scanSpiderdb ( bool needList ) {
 			continue;
 		}

+		// if the spiderrequest has a fake firstip that means it
+		// was injected without doing a proper ip lookup for speed.
+		// xmldoc.cpp will check for m_fakeFirstIp and if that is
+		// set in the spiderrequest it will simply add a new request
+		// with the correct firstip. it will be a completely different
+		// spiderrequest key then. so no need to keep the "fakes".
+		// it will log the EFAKEFIRSTIP error msg.
+		if ( sreq->m_fakeFirstIp &&
+		     srep &&
+		     srep->m_spideredTime > sreq->m_addedTime ) {
+			if ( g_conf.m_logDebugSpider )
+				log("spider: skipping6 %s", sreq->m_url);
+			continue;
+		}
+
 		// once we have a spiderreply, even i guess if its an error,
 		// for a url, then bail if respidering is disabled
 		if ( m_cr->m_isCustomCrawl &&
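Note: the skip rule added above, isolated as a predicate. The structs here are stand-ins carrying only the fields the hunk reads; the real SpiderRequest/SpiderReply are the repo's spiderdb record types.

#include <stdint.h>

struct SpiderRequest { bool m_fakeFirstIp; int32_t m_addedTime; };
struct SpiderReply   { int32_t m_spideredTime; };

// a fake-firstip request is obsolete once any reply newer than the
// request exists: by then xmldoc.cpp has re-added the url under its
// real firstip, so the scan can drop the fake without losing the url
static bool skipFakeFirstIp ( const SpiderRequest *sreq ,
                              const SpiderReply   *srep ) {
	return sreq->m_fakeFirstIp &&
	       srep &&
	       srep->m_spideredTime > sreq->m_addedTime;
}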
@@ -10315,6 +10330,14 @@ void dedupSpiderdbList ( RdbList *list , long niceness , bool removeNegRecs ) {
 		// and url has since been spidered, nuke it!
 		if ( sreq->m_urlIsDocId ) continue;

+		// same if indexcode was EFAKEFIRSTIP which XmlDoc.cpp
+		// re-adds to spiderdb with the right firstip. once
+		// those guys have a reply we can ignore them.
+		// TODO: what about diffbotxyz spider requests? those
+		// have a fakefirstip... they should not have requests
+		// though, since their parent url has that.
+		if ( sreq->m_fakeFirstIp ) continue;
+
 		SpiderReply *old = oldRep;
 		sreq->m_inGoogle = old->m_inGoogle;
 		sreq->m_hasAuthorityInlink = old->m_hasAuthorityInlink;
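Note: shape of the request-side filtering implied above. The surrounding list iteration is elided in the diff, so this loop is an assumed reconstruction over stand-in types, not the repo's code; here continue means the request is dropped from the deduped list.

#include <vector>

struct SpiderReply   { bool m_inGoogle; bool m_hasAuthorityInlink; };
struct SpiderRequest {
	bool m_urlIsDocId;
	bool m_fakeFirstIp;
	bool m_inGoogle;
	bool m_hasAuthorityInlink;
};

static void dedupRequests ( std::vector<SpiderRequest> &reqs ,
                            const SpiderReply *oldRep ) {
	std::vector<SpiderRequest> kept;
	for ( size_t i = 0 ; i < reqs.size() ; i++ ) {
		SpiderRequest *sreq = &reqs[i];
		// docid-based requests are one-shot: drop once spidered
		if ( sreq->m_urlIsDocId ) continue;
		// fake-firstip requests were re-added with the real ip
		if ( sreq->m_fakeFirstIp ) continue;
		// kept requests inherit what the newest reply learned
		sreq->m_inGoogle           = oldRep->m_inGoogle;
		sreq->m_hasAuthorityInlink = oldRep->m_hasAuthorityInlink;
		kept.push_back ( *sreq );
	}
	reqs.swap ( kept );
}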
XmlDoc.cpp (22 changes)
@@ -2022,6 +2022,9 @@ bool XmlDoc::indexDoc ( ) {

 	m_msg4Launched = true;

+	// display the url that had the error
+	logIt();
+
 	// log this for debug now
 	SafeBuf tmp;
 	nsr->print(&tmp);
@@ -2037,13 +2040,15 @@ bool XmlDoc::indexDoc ( ) {
 			    m_masterLoop ,
 			    m_niceness ) ) {
 		// spider hang bug
-		if ( g_conf.m_testSpiderEnabled )
-			logf(LOG_DEBUG,"build: msg4 meta add3 blocked"
-			     "msg4=0x%lx" ,(long)&m_msg4);
+		//if ( g_conf.m_testSpiderEnabled )
+		//	logf(LOG_DEBUG,"build: msg4 meta add3 blocked"
+		//	     "msg4=0x%lx" ,(long)&m_msg4);
 		m_msg4Waiting = true;
 		return false;
 	}

 	//logf(LOG_DEBUG,"build: msg4 meta add3 did NOT block" );

+	m_msg4Launched = false;
+
 	// all done
@@ -2068,7 +2073,9 @@ bool XmlDoc::indexDoc2 ( ) {

 	// do this before we increment pageDownloadAttempts below so that
 	// john's smoke tests, which use those counts, are not affected
-	if ( m_oldsrValid && m_oldsr.m_fakeFirstIp ) {
+	if ( m_oldsrValid && m_oldsr.m_fakeFirstIp &&
+	     // diffbot requests are ok though!
+	     ! strstr(m_oldsr.m_url,"-diffbotxyz") ) {
 		m_indexCodeValid = true;
 		m_indexCode = EFAKEFIRSTIP;
 		return true;
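Note: the widened guard above, isolated with stand-in fields. Urls carrying the "-diffbotxyz" marker are diffbot-generated subitems whose parent request already has the real firstip, so only non-diffbot fake-firstip requests short-circuit to EFAKEFIRSTIP.

#include <string.h>

struct OldSr { bool m_fakeFirstIp; const char *m_url; };

static bool shortCircuitFakeFirstIp ( bool oldsrValid , const OldSr &oldsr ) {
	return oldsrValid &&
	       oldsr.m_fakeFirstIp &&
	       // diffbot requests are ok though!
	       ! strstr ( oldsr.m_url , "-diffbotxyz" );
}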
@@ -7910,8 +7917,9 @@ char *XmlDoc::getIsDup ( ) {
 		//	continue;
 		//}
 		// for debug
-		log("build: doc %s is dup of doid %lli",
-		    m_firstUrl.m_url,d);
+		if ( d != m_docId )
+			log("build: doc %s is dup of doid %lli",
+			    m_firstUrl.m_url,d);
 		// get the winner
 		//if ( score > maxScore ) maxScore = score;
 		if ( sr > maxSiteRank || maxSiteRank == -1 ) {
@@ -17523,7 +17531,7 @@ bool XmlDoc::logIt ( ) {
 	// make queues in the case of hammering an ip, which i think
 	// it already does...
 	if ( m_oldsrValid && m_oldsr.m_firstIp != m_firstIp )
-		sb.safePrintf("fakesreqfirstip=%s ",iptoa(m_firstIp) );
+		sb.safePrintf("fakesreqfirstip=%s ",iptoa(m_oldsr.m_firstIp) );

 	//
 	// print when this spider request was added