Merge branch 'diffbot-testing' of github.com:gigablast/open-source-search-engine into diffbot-testing
commit 11efab9862

Collectiondb.cpp (139 changed lines)
@@ -452,7 +452,7 @@ bool Collectiondb::addNewColl ( char *coll ,
cr->m_diffbotOnlyProcessIfNewUrl = true;
// default respider to off
cr->m_collectiveRespiderFrequency = 0.0;
cr->m_restrictDomain = true;
//cr->m_restrictDomain = true;
// reset the crawl stats
// . this will core if a host was dead and then when it came
// back up host #0's parms.cpp told it to add a new coll
@@ -2091,6 +2091,66 @@ bool CollectionRec::rebuildUrlFilters ( ) {
if ( ! upp ) upp = m_diffbotUrlProcessRegEx.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;

///////
//
// recompile regular expressions
//
///////

if ( m_hasucr ) {
regfree ( &m_ucr );
m_hasucr = false;
}

if ( m_hasupr ) {
regfree ( &m_upr );
m_hasupr = false;
}

// copy into tmpbuf
SafeBuf tmp;

char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasucr = true;
}
if ( rx && regcomp ( &m_ucr , tmp.getBufStart() ,
REG_EXTENDED| //REG_ICASE|
REG_NEWLINE ) ) { // |REG_NOSUB) ) {
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
regfree ( &m_ucr );
m_hasucr = false;
}

rx = m_diffbotUrlProcessRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) m_hasupr = true;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasupr = true;
}
if ( rx && regcomp ( &m_upr , tmp.getBufStart() ,
REG_EXTENDED| // REG_ICASE|
REG_NEWLINE ) ) { // |REG_NOSUB) ) {
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
regfree ( &m_upr );
m_hasupr = false;
}

// what diffbot url to use for processing
char *api = m_diffbotApiUrl.getBufStart();
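The block added to rebuildUrlFilters() above frees any previously compiled Diffbot URL regexes, copies the raw pattern into a temporary SafeBuf, expands shortcuts, and recompiles with regcomp() using REG_EXTENDED|REG_NEWLINE, logging and clearing the flag on failure. Below is a minimal standalone sketch of that POSIX compile/free lifecycle; CompiledPattern, recompile() and the sample pattern are illustrative stand-ins rather than the CollectionRec API, and std::string replaces SafeBuf and expandRegExShortcuts().

#include <regex.h>
#include <stdio.h>
#include <string>

// Hypothetical holder mirroring the m_ucr/m_hasucr pair in CollectionRec.
struct CompiledPattern {
    regex_t re;
    bool    valid = false;
};

// Recompile 'pattern' into 'cp', freeing any previous compilation first.
// Returns true on success; on failure the old state is fully released.
static bool recompile(CompiledPattern &cp, const char *pattern) {
    if (cp.valid) { regfree(&cp.re); cp.valid = false; }
    if (!pattern || !pattern[0]) return false;        // empty means "no regex"
    std::string expanded = pattern;                   // stand-in for expandRegExShortcuts()
    int err = regcomp(&cp.re, expanded.c_str(), REG_EXTENDED | REG_NEWLINE);
    if (err) {
        char buf[256];
        regerror(err, &cp.re, buf, sizeof(buf));
        fprintf(stderr, "coll: regcomp %s failed: %s. Ignoring.\n", pattern, buf);
        return false;
    }
    cp.valid = true;
    return true;
}

int main() {
    CompiledPattern ucr;
    if (recompile(ucr, "^https?://[^/]+/article/"))
        printf("match: %d\n",
               regexec(&ucr.re, "http://example.com/article/1", 0, NULL, 0) == 0);
    if (ucr.valid) regfree(&ucr.re);
    return 0;
}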
@@ -2139,11 +2199,18 @@ bool CollectionRec::rebuildUrlFilters ( ) {
// 2nd default filter
// always turn this on for now. they need to add domains they want
// to crawl as seeds so they do not spider the web.
//if ( m_restrictDomain ) {
m_regExs[i].set("!isonsamedomain && !ismanualadd");
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
i++;
//}
// no because FTB seeds with link pages that link to another
// domain. they just need to be sure to supply a crawl pattern
// to avoid spidering the whole web.
//
// if they did not EXPLICITLY provide a url crawl pattern or
// url crawl regex then restrict to seeds to prevent from spidering
// the entire internet
if ( ! ucp && ! m_hasucr ) { // m_restrictDomain ) {
m_regExs[i].set("!isonsamedomain && !ismanualadd");
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
i++;
}

m_regExs[i].set("errorcount>=1 && !hastmperror");
m_spiderPriorities [i] = 15;
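The replacement rule above only appends the "!isonsamedomain && !ismanualadd" filter when the user supplied neither a url crawl pattern (ucp) nor a url crawl regex (m_hasucr), so an explicit pattern can still take the crawl off the seed domains. A small sketch of that conditional default rule; the Rule struct is hypothetical and -3 is only a stand-in for SPIDER_PRIORITY_FILTERED.

#include <cstdio>
#include <string>
#include <vector>

// Hypothetical mini version of the url-filter table built in rebuildUrlFilters().
struct Rule { std::string expr; int priority; };

static void buildDefaultRules(const char *ucp, bool hasUcr,
                              std::vector<Rule> &rules) {
    // Only lock the crawl to seed domains when the user gave neither a
    // url crawl pattern nor a url crawl regex.
    if ((!ucp || !ucp[0]) && !hasUcr)
        rules.push_back({ "!isonsamedomain && !ismanualadd",
                          -3 /* stand-in for SPIDER_PRIORITY_FILTERED */ });
    rules.push_back({ "errorcount>=1 && !hastmperror", 15 });
}

int main() {
    std::vector<Rule> rules;
    buildDefaultRules(nullptr, false, rules);   // no pattern, no regex supplied
    for (const Rule &r : rules)
        printf("%-40s priority %d\n", r.expr.c_str(), r.priority);
    return 0;
}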
@@ -2268,66 +2335,6 @@ bool CollectionRec::rebuildUrlFilters ( ) {
m_numRegExs8 = i;
//m_numRegExs11 = i;

///////
//
// recompile regular expressions
//
///////

if ( m_hasucr ) {
regfree ( &m_ucr );
m_hasucr = false;
}

if ( m_hasupr ) {
regfree ( &m_upr );
m_hasupr = false;
}

// copy into tmpbuf
SafeBuf tmp;

char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasucr = true;
}
if ( rx && regcomp ( &m_ucr , tmp.getBufStart() ,
REG_EXTENDED| //REG_ICASE|
REG_NEWLINE ) ) { // |REG_NOSUB) ) {
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
regfree ( &m_ucr );
m_hasucr = false;
}

rx = m_diffbotUrlProcessRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) m_hasupr = true;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasupr = true;
}
if ( rx && regcomp ( &m_upr , tmp.getBufStart() ,
REG_EXTENDED| // REG_ICASE|
REG_NEWLINE ) ) { // |REG_NOSUB) ) {
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
regfree ( &m_upr );
m_hasupr = false;
}

//char *x = "http://staticpages.diffbot.com/testCrawl/article1.html";
//if(m_hasupr && regexec(&m_upr,x,0,NULL,0) ) { char *xx=NULL;*xx=0; }

@@ -458,7 +458,7 @@ class CollectionRec {
char m_enforceNewQuotas ;
char m_doIpLookups ; // considered iff using proxy
char m_useRobotsTxt ;
char m_restrictDomain ; // say on same domain as seeds?
//char m_restrictDomain ; // say on same domain as seeds?
char m_doTuringTest ; // for addurl
char m_applyFilterToText ; // speeds us up
char m_allowHttps ; // read HTTPS using SSL

@@ -2315,10 +2315,10 @@ uint32_t Hostdb::getShardNum ( char rdbId,void *k ) { // ,bool split ) {
else if ( rdbId == RDB_LINKDB || rdbId == RDB2_LINKDB2 ) {
return m_map [(*(uint16_t *)((char *)k + 26))>>3];
}
else if ( rdbId == RDB_TFNDB || rdbId == RDB2_TFNDB2 ) {
unsigned long long d = g_tfndb.getDocId ( (key_t *)k );
return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
}
//else if ( rdbId == RDB_TFNDB || rdbId == RDB2_TFNDB2 ) {
// unsigned long long d = g_tfndb.getDocId ( (key_t *)k );
// return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
//}
else if ( rdbId == RDB_TITLEDB || rdbId == RDB2_TITLEDB2 ) {
unsigned long long d = g_titledb.getDocId ( (key_t *)k );
return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
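The Hostdb::getShardNum() hunk above drops the tfndb branch but keeps the same key-to-shard idea: titledb (and formerly tfndb) keys fold the docid with ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) and use that slot to index m_map. A self-contained sketch of that folding, with a made-up MAX_KSLOTS value and a simple modulo standing in for the real m_map table.

#include <stdint.h>
#include <stdio.h>

// Hypothetical stand-ins: the real values live in Hostdb.h.
#define MAX_KSLOTS 8192              // must be a power of two for the & mask
static const uint32_t NUM_SHARDS = 64;

// Fold a docid into a slot index, then into a shard number.
static uint32_t shardForDocId(uint64_t d) {
    uint32_t slot = (uint32_t)(((d >> 14) ^ (d >> 7)) & (MAX_KSLOTS - 1));
    return slot % NUM_SHARDS;        // the real code indexes a precomputed m_map[] here
}

int main(void) {
    uint64_t docIds[] = { 123456789ULL, 123456790ULL, 9999999999ULL };
    for (int i = 0; i < 3; i++)
        printf("docid %llu -> shard %u\n",
               (unsigned long long)docIds[i], shardForDocId(docIds[i]));
    return 0;
}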
Linkdb.cpp (18 changed lines)
@@ -633,8 +633,8 @@ static void sendReplyWrapper ( void *state ) {
// steal this buffer
char *reply1 = info->getBufStart();
long replySize = info->length();
// sanity
if ( replySize <= 0 ) { char *xx=NULL;*xx=0; }
// sanity. no if collrec not found its 0!
if ( ! saved && replySize <= 0 ) { char *xx=NULL;*xx=0; }
// get original request
Msg25Request *req = (Msg25Request *)slot2->m_readBuf;
// sanity
@@ -645,7 +645,10 @@ static void sendReplyWrapper ( void *state ) {
nextLink:

UdpSlot *udpSlot = req->m_udpSlot;

// update for next udpSlot
req = req->m_next;

// just dup the reply for each one
char *reply2 = (char *)mdup(reply1,replySize,"m25repd");

@@ -666,7 +669,6 @@ static void sendReplyWrapper ( void *state ) {
}

// if we had a link
req = req->m_next;
if ( req ) goto nextLink;

// the destructor
@@ -684,6 +686,10 @@ void handleRequest25 ( UdpSlot *slot , long netnice ) {
// make sure this always NULL for our linked list logic
req->m_next = NULL;

// udp socket for sending back the final linkInfo in m_linkInfoBuf
// used by sendReply()
req->m_udpSlot = slot;

// set up the hashtable if our first time
if ( ! g_lineTable.isInitialized() )
g_lineTable.set ( 8,4,256,NULL,0,false,MAX_NICENESS,"lht25");
@@ -735,10 +741,6 @@ void handleRequest25 ( UdpSlot *slot , long netnice ) {
// point to a real safebuf here for populating with data
m25->m_linkInfoBuf = &m25->m_realBuf;

// udp socket for sending back the final linkInfo in m_linkInfoBuf
// used by sendReply()
req->m_udpSlot = slot;

// set some new stuff. should probably be set in getLinkInfo2()
// but we are trying to leave that as unaltered as possible to
// try to reduce debugging.
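sendReplyWrapper() in Linkdb.cpp now walks a linked list of Msg25Request objects that were waiting on the same link-info result and sends each one its own copy of the reply, duplicated with mdup(reply1,replySize,"m25repd") so every UdpSlot owns its buffer. A rough sketch of that fan-out under simplified assumptions: malloc/memcpy in place of mdup, an int in place of UdpSlot*, and a hypothetical sendReply().

#include <cstdio>
#include <cstdlib>
#include <cstring>

// Hypothetical, simplified stand-in for Msg25Request: just the linked-list
// hook and the slot the reply must go back to.
struct Request {
    int      udpSlot;     // stands in for UdpSlot*
    Request *next;        // other requests queued on the same result
};

static void sendReply(int udpSlot, char *reply, long size) {
    printf("slot %d gets %ld bytes: %s\n", udpSlot, size, reply);
    free(reply);          // each slot owns its private copy
}

// Give every queued request its own duplicate of reply1, like the
// mdup() loop in sendReplyWrapper().
static void fanOut(Request *head, const char *reply1, long replySize) {
    for (Request *req = head; req; req = req->next) {
        char *reply2 = (char *)malloc(replySize);
        if (!reply2) continue;                // real code would log the OOM
        memcpy(reply2, reply1, replySize);
        sendReply(req->udpSlot, reply2, replySize);
    }
}

int main() {
    Request c = {3, nullptr}, b = {2, &c}, a = {1, &b};
    const char msg[] = "linkinfo";
    fanOut(&a, msg, sizeof(msg));
    return 0;
}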
Makefile (2 changed lines)
@@ -2,7 +2,7 @@ SHELL = /bin/bash

CC=g++

OBJS = Tfndb.o UdpSlot.o Rebalance.o \
OBJS = UdpSlot.o Rebalance.o \
Msg13.o Mime.o IndexReadInfo.o \
PageGet.o PageHosts.o PageIndexdb.o PageLogin.o \
PageParser.o PageInject.o PagePerf.o PageReindex.o PageResults.o \

@@ -14,6 +14,7 @@ void Msg20::constructor () {
m_r = NULL;
m_inProgress = false;
m_launched = false;
m_i = -1;
reset();
m_mcast.constructor();
}
Msg40.cpp (49 changed lines)
@@ -881,19 +881,20 @@ bool Msg40::reallocMsg20Buf ( ) {
return true;
}

m_buf2 = NULL;
m_bufMaxSize2 = need;
m_numMsg20s = m_msg3a.m_numDocIds;

// when streaming because we can have hundreds of thousands of
// search results we recycle a few msg20s to save mem
if ( m_si->m_streamResults ) {
long max = MAX_OUTSTANDING_MSG20S;
long max = MAX_OUTSTANDING_MSG20S * 2;
if ( m_msg3a.m_numDocIds < max ) max = m_msg3a.m_numDocIds;
need = max * (4+sizeof(Msg20));
m_numMsg20s = max;
}

m_buf2 = NULL;
m_bufMaxSize2 = need;

// do the alloc
if ( need ) m_buf2 = (char *)mmalloc ( need ,"Msg40msg20");
if ( need && ! m_buf2 ) { m_errno = g_errno; return false; }
@@ -1033,6 +1034,11 @@ bool Msg40::launchMsg20s ( bool recalled ) {
//if ( m_numRequests-m_numReplies >= need ) break;
// hard limit
if ( m_numRequests-m_numReplies >= maxOut ) break;
// do not launch another until m_printi comes back because
// all summaries are bottlenecked on printing him out now
if ( m_si->m_streamResults &&
i >= m_printi + MAX_OUTSTANDING_MSG20S - 1 )
break;
// do not double count!
//if ( i <= m_lastProcessedi ) continue;
// do not repeat for this i
@@ -1238,7 +1244,8 @@ Msg20 *Msg40::getAvailMsg20 ( ) {
// m_inProgress is set to false right before it
// calls Msg20::m_callback which is gotSummaryWrapper()
// so we should be ok with this
if ( ! m_msg20[i]->m_inProgress ) return m_msg20[i];
if ( m_msg20[i]->m_launched ) continue;
return m_msg20[i];
}
// how can this happen???
char *xx=NULL;*xx=0;
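For streaming queries, reallocMsg20Buf() now allocates roughly 2 * MAX_OUTSTANDING_MSG20S Msg20 slots instead of one per docid, and launchMsg20s() will not request a summary more than MAX_OUTSTANDING_MSG20S ahead of m_printi, the next result to print. A toy sketch of that bounded launch window; the constants and loop structure are illustrative, not the real Msg40 control flow.

#include <algorithm>
#include <cstdio>

// Illustrative limit; the real one is MAX_OUTSTANDING_MSG20S in Msg40.
static const long kMaxOutstanding = 4;

int main() {
    long numDocIds = 20;       // docids whose summaries must be printed in order
    long poolSize  = std::min(numDocIds, kMaxOutstanding * 2);
    long printi    = 0;        // next summary to print
    long launched  = 0;        // summaries requested so far

    printf("pool of %ld Msg20-like slots for %ld results\n", poolSize, numDocIds);

    // Launch loop: never run more than kMaxOutstanding ahead of printi,
    // mirroring "i >= m_printi + MAX_OUTSTANDING_MSG20S - 1" in launchMsg20s().
    while (printi < numDocIds) {
        while (launched < numDocIds && launched < printi + kMaxOutstanding) {
            printf("launch summary #%ld\n", launched);
            launched++;
        }
        // Pretend the summary for printi arrived; print it and free its slot.
        printf("print summary #%ld\n", printi);
        printi++;
    }
    return 0;
}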
@@ -1377,27 +1384,42 @@ bool Msg40::gotSummary ( ) {
// otherwise, get the summary for result #m_printi
//Msg20 *m20 = m_msg20[m_printi];

if ( ! m20 ) {
log("msg40: m20 NULL #%li",m_printi);
continue;
}
//if ( ! m20 ) {
// log("msg40: m20 NULL #%li",m_printi);
// continue;
//}

// if result summary #i not yet in, wait...
if ( ! m20 )
break;

// wait if no reply for it yet
//if ( m20->m_inProgress )
// break;

if ( m20->m_errno ) {
log("msg40: sum #%li error: %s",
m_printi,mstrerror(m20->m_errno));
// make it available to be reused
m20->reset();
continue;
}

// get the next reply we are waiting on to print results order
Msg20Reply *mr = m20->m_r;
if ( ! mr ) break;
//if ( ! mr ) { char *xx=NULL;*xx=0; }

// primitive deduping. for diffbot json exclude url's from the
// XmlDoc::m_contentHash32.. it will be zero if invalid i guess
if ( m_si && m_si->m_doDupContentRemoval && // &dr=1
mr->m_contentHash32 &&
m_dedupTable.isInTable ( &mr->m_contentHash32 ) ) {
//if ( g_conf.m_logDebugQuery )
log("msg40: dup sum #%li (%lu)",m_printi,
mr->m_contentHash32);
// make it available to be reused
m20->reset();
continue;
}

@@ -1418,8 +1440,12 @@ bool Msg40::gotSummary ( ) {
printSearchResult9 ( m_printi );

// now free the reply to save memory since we could be
// streaming back 1M+
m20->freeReply();
// streaming back 1M+. we call reset below, no need for this.
//m20->freeReply();

// return it so getAvailMsg20() can use it again
// this will set m_launched to false
m20->reset();
}

// set it to true on all but the last thing we send!
@@ -1477,6 +1503,9 @@ bool Msg40::gotSummary ( ) {
// do a recursive stack explosion
// . this returns false if still waiting on more to come back
if ( ! launchMsg20s ( true ) ) return false;
// it won't launch now if we are bottlnecked waiting for
// m_printi's summary to come in
if ( m_si->m_streamResults ) return false;
// maybe some were cached?
//goto refilter;
// it returned true, so m_numRequests == m_numReplies and
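In gotSummary() the streaming path now waits for the reply of result #m_printi, drops any result whose 32-bit content hash was already seen (the primitive dedup for Diffbot JSON), prints the survivor, and calls reset() so the Msg20 slot can be reused. A minimal sketch of just the hash-based dedup step, with std::unordered_set standing in for m_dedupTable.

#include <cstdint>
#include <cstdio>
#include <unordered_set>
#include <vector>

int main() {
    // Pretend these are Msg20Reply::m_contentHash32 values arriving in print
    // order; 0 means "hash invalid", which is never deduped on.
    std::vector<uint32_t> hashes = { 111, 222, 111, 0, 0, 333, 222 };
    std::unordered_set<uint32_t> seen;   // stands in for m_dedupTable

    for (size_t i = 0; i < hashes.size(); i++) {
        uint32_t h = hashes[i];
        if (h && !seen.insert(h).second) {        // hash already in the table
            printf("msg40: dup sum #%zu (%u)\n", i, h);
            continue;                             // reset() and reuse the slot
        }
        printf("print result #%zu\n", i);
    }
    return 0;
}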
@@ -784,6 +784,12 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
// do not print "Fake First Ip"...
if ( m_prevReplyError == EFAKEFIRSTIP )
msg = "Initial crawl request";
// if the initial crawl request got a reply then that
// means the spiderrequest was added under the correct
// firstip... so skip it. i am assuming that the
// correct spidrerequest got added ok here...
if ( m_prevReplyError == EFAKEFIRSTIP )
continue;
}

if ( srep && srep->m_hadDiffbotError )
@@ -1533,7 +1539,7 @@ static class HelpItem s_his[] = {
"the maxtocrawl or maxtoprocess limit, or when the crawl "
"completes."},
{"obeyRobots","Obey robots.txt files?"},
{"restrictDomain","Restrict downloaded urls to domains of seeds?"},
//{"restrictDomain","Restrict downloaded urls to domains of seeds?"},

{"urlCrawlPattern","List of || separated strings. If the url "
"contains any of these then we crawl the url, otherwise, we do not. "
@@ -2365,11 +2371,11 @@ bool printCrawlDetailsInJson ( SafeBuf &sb , CollectionRec *cx ) {
// settable parms
"\"maxToCrawl\":%lli,\n"
"\"maxToProcess\":%lli,\n"
"\"restrictDomain\":%li,\n"
//"\"restrictDomain\":%li,\n"
"\"onlyProcessIfNew\":%li,\n"
, cx->m_maxToCrawl
, cx->m_maxToProcess
, (long)cx->m_restrictDomain
//, (long)cx->m_restrictDomain
, (long)cx->m_diffbotOnlyProcessIfNewUrl
);
sb.safePrintf("\"seeds\":\"");
@@ -3344,13 +3350,15 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
urtYes = "";
urtNo = " checked";
}

/*
char *rdomYes = " checked";
char *rdomNo = "";
if ( ! cr->m_restrictDomain ) {
rdomYes = "";
rdomNo = " checked";
}
*/

char *isNewYes = "";
char *isNewNo = " checked";
@@ -3541,15 +3549,15 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"</td>"
"</tr>"

"<tr><td>"
"<b>Restrict domain to seeds?</b> "
"</td><td>"
"<input type=radio name=restrictDomain "
"value=1%s> yes "
"<input type=radio name=restrictDomain "
"value=0%s> no "
"</td>"
"</tr>"
//"<tr><td>"
//"<b>Restrict domain to seeds?</b> "
//"</td><td>"
//"<input type=radio name=restrictDomain "
//"value=1%s> yes "
//"<input type=radio name=restrictDomain "
//"value=0%s> no "
//"</td>"
//"</tr>"

//"<tr><td>"
//"Use spider proxies on AWS? "
@@ -3592,8 +3600,8 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
, urtYes
, urtNo

, rdomYes
, rdomNo
//, rdomYes
//, rdomNo

);
}

@@ -2057,8 +2057,15 @@ bool printResult ( State0 *st, long ix ) {
}

Msg20 *m20 = msg40->m_msg20[ix];
Msg20Reply *mr = m20->m_r;
Msg20 *m20 ;
if ( si->m_streamResults )
m20 = msg40->getCompletedSummary(ix);
else
m20 = msg40->m_msg20[ix];

// get the reply
Msg20Reply *mr = m20->m_r;

// . sometimes the msg20reply is NULL so prevent it coring
// . i think this happens if all hosts in a shard are down or timeout
@@ -5302,6 +5309,14 @@ bool printJsonItemInCSV ( char *json , SafeBuf *sb , State0 *st ) {
ji = ptrs[i];
// skip if none
if ( ! ji ) continue;

// skip "html" field... too spammy for csv and > 32k causes
// libreoffice calc to truncate it and break its parsing
if ( ji->m_name &&
//! ji->m_parent &&
strcmp(ji->m_name,"html")==0)
continue;

//
// get value and print otherwise
//
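printJsonItemInCSV() now skips the "html" field because its value is frequently over 32 KB, which LibreOffice Calc truncates, breaking the CSV parse. A small sketch of that filter over a flat list of name/value pairs; Field and printCsvRow() are simplified stand-ins for JsonItem and the SafeBuf output.

#include <cstdio>
#include <cstring>
#include <vector>

// Simplified stand-in for the parsed JSON items printJsonItemInCSV() walks.
struct Field { const char *name; const char *value; };

static void printCsvRow(const std::vector<Field> &fields) {
    bool first = true;
    for (const Field &f : fields) {
        // Skip the "html" field: too large for spreadsheet apps, spammy in CSV.
        if (f.name && strcmp(f.name, "html") == 0) continue;
        printf("%s\"%s\"", first ? "" : ",", f.value);
        first = false;
    }
    printf("\n");
}

int main() {
    printCsvRow({ {"title", "Example"},
                  {"html",  "<html>...32k of markup...</html>"},
                  {"url",   "http://example.com/"} });
    return 0;
}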
@@ -9963,6 +9963,7 @@ void Parms::init ( ) {
m++;

// use url filters for this. this is a crawlbot parm really.
/*
m->m_title = "restrict domain";
m->m_desc = "Keep crawler on same domain as seed urls?";
m->m_cgi = "restrictDomain";
@@ -9972,6 +9973,7 @@ void Parms::init ( ) {
// we need to save this it is a diffbot parm
m->m_flags = PF_HIDDEN | PF_DIFFBOT;// | PF_NOSAVE;
m++;
*/

m->m_title = "do url sporn checking";
m->m_desc = "If this is true and the spider finds "
Rdb.cpp (10 changed lines)
@@ -13,7 +13,7 @@
#include "Datedb.h"
#include "Titledb.h"
#include "Spider.h"
#include "Tfndb.h"
//#include "Tfndb.h"
//#include "Sync.h"
#include "Spider.h"
#include "Repair.h"
@@ -2648,7 +2648,7 @@ Rdb *getRdbFromId ( uint8_t rdbId ) {
s_table9 [ RDB_SYNCDB ] = g_syncdb.getRdb();
s_table9 [ RDB_SPIDERDB ] = g_spiderdb.getRdb();
s_table9 [ RDB_DOLEDB ] = g_doledb.getRdb();
s_table9 [ RDB_TFNDB ] = g_tfndb.getRdb();
//s_table9 [ RDB_TFNDB ] = g_tfndb.getRdb();
s_table9 [ RDB_CLUSTERDB ] = g_clusterdb.getRdb();
s_table9 [ RDB_CATDB ] = g_catdb.getRdb();
s_table9 [ RDB_DATEDB ] = g_datedb.getRdb();
@@ -2667,7 +2667,7 @@ Rdb *getRdbFromId ( uint8_t rdbId ) {
s_table9 [ RDB2_SECTIONDB2 ] = g_sectiondb2.getRdb();
s_table9 [ RDB2_PLACEDB2 ] = g_placedb2.getRdb();
s_table9 [ RDB2_SPIDERDB2 ] = g_spiderdb2.getRdb();
s_table9 [ RDB2_TFNDB2 ] = g_tfndb2.getRdb();
//s_table9 [ RDB2_TFNDB2 ] = g_tfndb2.getRdb();
s_table9 [ RDB2_CLUSTERDB2 ] = g_clusterdb2.getRdb();
s_table9 [ RDB2_DATEDB2 ] = g_datedb2.getRdb();
s_table9 [ RDB2_LINKDB2 ] = g_linkdb2.getRdb();
@@ -2691,7 +2691,7 @@ char getIdFromRdb ( Rdb *rdb ) {
//if ( rdb == g_checksumdb.getRdb() ) return RDB_CHECKSUMDB;
if ( rdb == g_spiderdb.getRdb () ) return RDB_SPIDERDB;
if ( rdb == g_doledb.getRdb () ) return RDB_DOLEDB;
if ( rdb == g_tfndb.getRdb () ) return RDB_TFNDB;
//if ( rdb == g_tfndb.getRdb () ) return RDB_TFNDB;
if ( rdb == g_clusterdb.getRdb () ) return RDB_CLUSTERDB;
if ( rdb == g_statsdb.getRdb () ) return RDB_STATSDB;
if ( rdb == g_linkdb.getRdb () ) return RDB_LINKDB;
@@ -2712,7 +2712,7 @@ char getIdFromRdb ( Rdb *rdb ) {
if ( rdb == g_placedb2.getRdb () ) return RDB2_PLACEDB2;
//if ( rdb == g_checksumdb2.getRdb() ) return RDB2_CHECKSUMDB2;
if ( rdb == g_spiderdb2.getRdb () ) return RDB2_SPIDERDB2;
if ( rdb == g_tfndb2.getRdb () ) return RDB2_TFNDB2;
//if ( rdb == g_tfndb2.getRdb () ) return RDB2_TFNDB2;
if ( rdb == g_clusterdb2.getRdb () ) return RDB2_CLUSTERDB2;
//if ( rdb == g_statsdb2.getRdb () ) return RDB2_STATSDB2;
if ( rdb == g_linkdb2.getRdb () ) return RDB2_LINKDB2;
RdbBase.cpp (18 changed lines)
@@ -2,7 +2,7 @@

#include "Rdb.h"
#include "Msg35.h"
#include "Tfndb.h"
//#include "Tfndb.h"
//#include "Checksumdb.h"
#include "Clusterdb.h"
#include "Hostdb.h"
@@ -966,7 +966,7 @@ bool RdbBase::incorporateMerge ( ) {

// tfndb has his own merge class since titledb merges write tfndb recs
RdbMerge *m = &g_merge;
if ( m_rdb == g_tfndb.getRdb() ) m = &g_merge2;
//if ( m_rdb == g_tfndb.getRdb() ) m = &g_merge2;

// print out info of newly merged file
long long tp = m_maps[x]->getNumPositiveRecs();
@@ -974,7 +974,7 @@ bool RdbBase::incorporateMerge ( ) {
log(LOG_INFO,
"merge: Merge succeeded. %s (#%li) has %lli positive "
"and %lli negative recs.", m_files[x]->getFilename(), x, tp, tn);
if ( m_rdb == g_posdb.getRdb() || m_rdb == g_tfndb.getRdb() )
if ( m_rdb == g_posdb.getRdb() ) // || m_rdb == g_tfndb.getRdb() )
log(LOG_INFO,"merge: Removed %lli dup keys.",
m->getDupsRemoved() );
// . bitch if bad news
@@ -1470,8 +1470,8 @@ void RdbBase::attemptMerge ( long niceness, bool forceMergeAll, bool doLog ,
// if we are tfndb and someone else is merging, do not merge unless
// we have 3 or more files
long minToMerge = m_minToMerge;
if (g_tfndb.getRdb()==m_rdb&& g_merge.isMerging() && minToMerge <=2 )
minToMerge = 3;
//if (g_tfndb.getRdb()==m_rdb&& g_merge.isMerging() && minToMerge <=2 )
// minToMerge = 3;
// do not start a tfndb merge while someone is dumping because the
// dump starves the tfndb merge and we clog up adding links. i think
// this is mainly just indexdb dumps, but we'll see.
@@ -1565,7 +1565,7 @@ void RdbBase::attemptMerge ( long niceness, bool forceMergeAll, bool doLog ,
//if ( m_mergeUrgent ) priority = 2;
//else priority = 0;
// tfndb doesn't need token, since titledb merge writes tfndb recs
if ( m_rdb != g_tfndb.getRdb() &&
if ( //m_rdb != g_tfndb.getRdb() &&
! g_msg35.getToken ( this , gotTokenForMergeWrapper, priority ) )
return ;
// bitch if we got token because there was an error somewhere
@@ -1616,7 +1616,7 @@ void RdbBase::gotTokenForMerge ( ) {
}
// tfndb has his own merge class since titledb merges write tfndb recs
RdbMerge *m = &g_merge;
if ( m_rdb == g_tfndb.getRdb() ) m = &g_merge2;
//if ( m_rdb == g_tfndb.getRdb() ) m = &g_merge2;
// sanity check
if ( m_isMerging || m->isMerging() ) {
//if ( m_doLog )
@@ -1724,8 +1724,8 @@ void RdbBase::gotTokenForMerge ( ) {
}

minToMerge = m_minToMerge;
if (m_rdb==g_tfndb.getRdb()&& g_merge.isMerging() && minToMerge <=2 )
minToMerge = 3;
//if (m_rdb==g_tfndb.getRdb()&& g_merge.isMerging() && minToMerge <=2 )
// minToMerge = 3;

// look at this merge:
// indexdb0003.dat.part1
XmlDoc.cpp (30 changed lines)
@@ -2101,8 +2101,15 @@ bool XmlDoc::indexDoc ( ) {
// cr->m_localCrawlInfo.m_pageDownloadAttempts);
// this is just how many urls we tried to index
//cr->m_localCrawlInfo.m_urlsConsidered++;
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
// avoid counting if it is a fake first ip
bool countIt = true;
// pagereindex.cpp sets this as does any add url (bulk job)
if ( m_sreqValid && m_sreq.m_fakeFirstIp )
countIt = false;
if ( countIt ) {
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
}
// need to save collection rec now during auto save
cr->m_needsSave = true;
// update this just in case we are the last url crawled
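The indexDoc() hunk above stops counting download attempts for spider requests that carry a fake first IP (set by pagereindex.cpp and bulk add-url jobs), so those requests no longer inflate the crawl statistics. A compact sketch of that guard, with SpiderRequest and CrawlInfo reduced to the fields involved.

#include <cstdio>

// Stand-ins for the real structures in Spider.h / Collectiondb.h.
struct SpiderRequest { bool fakeFirstIp; };
struct CrawlInfo     { long pageDownloadAttempts; };

static void countAttempt(const SpiderRequest *sreq,
                         CrawlInfo &local, CrawlInfo &global) {
    // pagereindex and bulk add-url jobs use a fake first IP; do not let
    // those requests inflate the download-attempt statistics.
    bool countIt = true;
    if (sreq && sreq->fakeFirstIp) countIt = false;
    if (countIt) {
        local.pageDownloadAttempts++;
        global.pageDownloadAttempts++;
    }
}

int main() {
    CrawlInfo local = {0}, global = {0};
    SpiderRequest real = {false}, fake = {true};
    countAttempt(&real, local, global);   // counted
    countAttempt(&fake, local, global);   // skipped
    printf("local=%ld global=%ld\n",
           local.pageDownloadAttempts, global.pageDownloadAttempts);
    return 0;
}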
@@ -2358,7 +2365,8 @@ bool XmlDoc::indexDoc2 ( ) {
// return false;

// MDW: we do this in indexDoc() above why do we need it here?
/*
// even if not using diffbot, keep track of these counts
if ( ! m_isDiffbotJSONObject &&
! m_incrementedAttemptsCount ) {
@@ -2374,7 +2382,7 @@ bool XmlDoc::indexDoc2 ( ) {
long long now = gettimeofdayInMillisecondsGlobal();
cr->m_diffbotCrawlEndTime = now;
}

*/
/*
// if we are being called from Spider.cpp and we met our max
// to crawl requirement, then bail out on this. this might
@@ -12973,11 +12981,13 @@ LinkInfo *XmlDoc::getLinkInfo1 ( ) {
// because we need the anchor text to pass in to diffbot
bool doLinkSpamCheck = cr->m_doLinkSpamCheck;
bool oneVotePerIpDom = cr->m_oneVotePerIpDom;
if ( cr->m_isCustomCrawl && cr->m_restrictDomain ) {
doLinkSpamCheck = false;
oneVotePerIpDom = false;
onlyNeedGoodInlinks = false;
}
// this seems to overdo it when we have a ton of linktext
// perhaps, so take this out...
//if ( cr->m_isCustomCrawl && cr->m_restrictDomain ) {
// doLinkSpamCheck = false;
// oneVotePerIpDom = false;
// onlyNeedGoodInlinks = false;
//}

// call it
char *url = getFirstUrl()->getUrl();
@@ -13764,7 +13774,7 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {

// we make a "fake" url for the diffbot reply when indexing it
// by appending -diffbotxyz%lu. see "fakeUrl" below.
if ( m_firstUrl.getUrlLen() + 15 >= MAX_URL_LEN ) {
if ( m_firstUrl.getUrlLen() + 24 >= MAX_URL_LEN ) {
if ( m_firstUrlValid )
log("build: diffbot url would be too long for "
"%s", m_firstUrl.getUrl() );
main.cpp (43 changed lines)
@@ -25,7 +25,7 @@
#include "Tagdb.h"
#include "Catdb.h"
#include "Users.h"
#include "Tfndb.h"
//#include "Tfndb.h"
#include "Spider.h"
//#include "Doledb.h"
//#include "Checksumdb.h"
@@ -150,8 +150,8 @@ static void dumpTitledb ( char *coll,long sfn,long numFiles,bool includeTree,
long long docId , char justPrintDups ,
bool dumpSentences ,
bool dumpWords );
static void dumpTfndb ( char *coll,long sfn,long numFiles,bool includeTree,
bool verify);
//static void dumpTfndb (char *coll,long sfn,long numFiles,bool includeTree,
// bool verify);
static long dumpSpiderdb ( char *coll,long sfn,long numFiles,bool includeTree,
char printStats , long firstIp );
static void dumpSectiondb( char *coll,long sfn,long numFiles,bool includeTree);
@@ -773,8 +773,8 @@ int main ( int argc , char *argv[] ) {
"\tV is z to dump statsdb all keys.\n"
"\tV is Z to dump statsdb all keys and data samples.\n"
"\tV is L to dump linkdb.\n"
"\tV is u to dump tfndb.\n"
"\tV is vu to verify tfndb.\n"
//"\tV is u to dump tfndb.\n"
//"\tV is vu to verify tfndb.\n"
"\tC is the name of the collection.\n"
"\tX is start file num. (default 0)\n"
"\tY is num files. (default -1)\n"
@@ -2420,10 +2420,10 @@ int main ( int argc , char *argv[] ) {
dumpTitledb(coll,startFileNum,numFiles,includeTree,
docId,1,false,false);
}
else if ( argv[cmdarg+1][0] == 'v' && argv[cmdarg+1][1] =='u' )
dumpTfndb (coll,startFileNum,numFiles,includeTree,1);
else if ( argv[cmdarg+1][0] == 'u' )
dumpTfndb (coll,startFileNum,numFiles,includeTree,0);
//else if(argv[cmdarg+1][0] == 'v' && argv[cmdarg+1][1] =='u' )
// dumpTfndb (coll,startFileNum,numFiles,includeTree,1);
//else if ( argv[cmdarg+1][0] == 'u' )
// dumpTfndb (coll,startFileNum,numFiles,includeTree,0);
else if ( argv[cmdarg+1][0] == 'w' )
dumpWaitingTree(coll);
else if ( argv[cmdarg+1][0] == 'x' )
@@ -5652,7 +5652,7 @@ void zlibtest() {

#include "Rdb.h"
#include "Xml.h"
#include "Tfndb.h"
//#include "Tfndb.h"
//#include "Checksumdb.h"
#include "Threads.h"

@@ -5988,7 +5988,7 @@ void dumpTitledb (char *coll,long startFileNum,long numFiles,bool includeTree,
if ( startKey < *(key_t *)list.getLastKey() ) return;
goto loop;
}

/*
void dumpTfndb (char *coll,long startFileNum,long numFiles,bool includeTree ,
bool verify) {
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
@@ -6060,7 +6060,7 @@ void dumpTfndb (char *coll,long startFileNum,long numFiles,bool includeTree ,
if ( startKey < *(key_t *)list.getLastKey() ) return;
goto loop;
}

*/
void dumpWaitingTree (char *coll ) {
RdbTree wt;
if (!wt.set(0,-1,true,20000000,true,"waittree2",
@@ -7895,9 +7895,9 @@ void dumpMissing ( char *coll ) {
g_conf.m_indexdbMaxCacheMem = 0;
//g_conf.m_clusterdbMaxDiskPageCacheMem = 0;

g_tfndb.init ();
//g_tfndb.init ();
//g_collectiondb.init(true); // isDump?
g_tfndb.getRdb()->addRdbBase1 ( coll );
//g_tfndb.getRdb()->addRdbBase1 ( coll );
g_titledb.init();
g_titledb.getRdb()->addRdbBase1 ( coll );
// if titledb has stuff in memory, do not do this, it needs to
@@ -7911,7 +7911,8 @@ void dumpMissing ( char *coll ) {
}
// . just get the docids from tfndb...
// . this tfndb rec count is for ALL colls!! DOH!
long long numRecs = g_tfndb.getRdb()->getNumTotalRecs();
// MDW FIX THIS RIGHT!
long long numRecs = 12345;//g_tfndb.getRdb()->getNumTotalRecs();
long long oldNumSlots = (numRecs * 100) / 80;
// make a power of 2
// make it a power of 2
@@ -7980,10 +7981,10 @@ void dumpMissing ( char *coll ) {
if ( (k.n0 & 0x01LL) == 0x00 ) continue;
// titledb tree is empty, so this must indicate it is in
// spiderdb only
long tfn = g_tfndb.getTfn(&k);
long tfn = 0;//g_tfndb.getTfn(&k);
if ( tfn == 255 ) continue;
// get docid
unsigned long long d = g_tfndb.getDocId ( &k );
unsigned long long d = 0LL;//g_tfndb.getDocId ( &k );
// add to hash table
//long n = (unsigned long)d & mask;
long n = (unsigned long)d % numSlots;
@@ -8664,12 +8665,12 @@ void removeDocIds ( char *coll , char *filename ) {
//g_conf.m_checksumdbMaxCacheMem = 0;
//g_conf.m_clusterdbMaxCacheMem = 0;

g_tfndb.init();
//g_tfndb.init();
g_indexdb.init ();
//g_checksumdb.init();
g_clusterdb.init();
//g_collectiondb.init(true);
g_tfndb.getRdb()->addRdbBase1 ( coll );
//g_tfndb.getRdb()->addRdbBase1 ( coll );
g_indexdb.getRdb()->addRdbBase1 ( coll );
//g_checksumdb.getRdb()->addRdbBase1 ( coll );
g_clusterdb.getRdb()->addRdbBase1 ( coll );
@@ -9044,7 +9045,7 @@ void removeDocIds ( char *coll , char *filename ) {
//

logf(LOG_INFO,"db: Scanning tfndb and removing recs.");
r = g_tfndb.getRdb();
r = 0;//g_tfndb.getRdb();
count = 0;
scanned = 0;
recs = 0;
@@ -9089,7 +9090,7 @@ void removeDocIds ( char *coll , char *filename ) {
key_t k = list.getCurrentKey();
// skip deletes
if ( (k.n0 & 0x01) == 0x00 ) continue;
unsigned long long d = g_tfndb.getDocId(&k);
unsigned long long d = 0;//g_tfndb.getDocId(&k);
// see if docid is in delete list
long n = (unsigned long)d & mask;
while ( slots[n] && slots[n] != d )