Merge branch 'diffbot-testing' into testing

This commit is contained in:
mwells
2014-08-15 17:05:22 -07:00
12 changed files with 146 additions and 46 deletions

@ -3359,12 +3359,12 @@ bool CollectionRec::rebuildUrlFilters ( ) {
i++;
// if just matches ucp, just crawl it, do not process
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 54;
m_spiderPriorities [i] = 53;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
i++;
// just process, do not spider links if does not match ucp
m_regExs[i].set("matchesupp");
m_spiderPriorities [i] = 53;
m_spiderPriorities [i] = 54;
m_harvestLinks [i] = false;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
//m_spiderDiffbotApiUrl[i].set ( api );
@ -3384,7 +3384,7 @@ bool CollectionRec::rebuildUrlFilters ( ) {
// harvest links if we should crawl it
if ( ucp && ! upp ) {
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 54;
m_spiderPriorities [i] = 53;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// process everything since upp is empty
//m_spiderDiffbotApiUrl[i].set ( api );
@ -3404,7 +3404,7 @@ bool CollectionRec::rebuildUrlFilters ( ) {
// just process
if ( upp && ! ucp ) {
m_regExs[i].set("matchesupp");
m_spiderPriorities [i] = 53;
m_spiderPriorities [i] = 54;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
//m_harvestLinks [i] = false;
//m_spiderDiffbotApiUrl[i].set ( api );

2
Conf.h

@ -281,7 +281,7 @@ class Conf {
//bool m_stubHubSpideringEnabled;
//bool m_eventBriteSpideringEnabled;
//bool m_refreshFacebookUsersEnabled;
//bool m_injectionEnabled ;
bool m_injectionEnabled ;
// qa testing loop going on? uses "test" subdir
bool m_testParserEnabled ;
bool m_testSpiderEnabled ;

@ -129,7 +129,9 @@ bool Linkdb::init ( ) {
"linkdb" ,
true , // dedup
0 , // fixeddatasize is 0 since no data
3,//g_conf.m_linkdbMinFilesToMerge ,
// keep it high since we are mostly ssds now and
// the reads are small...
6,//g_conf.m_linkdbMinFilesToMerge ,
// fix this to 15 and rely on the page cache of
// just the satellite files and the daily merge to
// keep things fast.

@ -31,6 +31,16 @@ static void sendReplyWrapper ( void *state ) {
// HttpServer::sendReply() so we gotta copy it here
bool sendPageInject ( TcpSocket *sock , HttpRequest *hr ) {
if ( ! g_conf.m_injectionEnabled ) {
g_errno = EBADENGINEER;
log("inject: injection disabled");
return g_httpServer.sendErrorReply(sock,500,"injection is disabled by "
"the administrator in the master "
"controls");
}
// get the collection
// make a new state
Msg7 *msg7;

@ -9555,15 +9555,16 @@ void Parms::init ( ) {
m->m_obj = OBJ_CONF;
m++;
/*
m->m_title = "url injection enabled";
m->m_desc = "If enabled you can directly inject URLs into the index.";
m->m_cgi = "ie";
m->m_off = (char *)&g_conf.m_injectionEnabled - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_def = "1";
m++;
*/
m->m_title = "init QA tests";
m->m_desc = "If initiated gb performs some integrity tests "

@ -3411,6 +3411,8 @@ float PosdbTable::getTermPairScoreForAny ( long i, long j,
m_quotedStartIds[i] >= 0 )
inSameQuotedPhrase = true;
// oops.. this was counting non-space punct as 1 unit
// instead of 2
if ( inSameQuotedPhrase )
qdist = m_qpos[j] - m_qpos[i];
@ -3503,10 +3505,12 @@ float PosdbTable::getTermPairScoreForAny ( long i, long j,
// "posj=%li",
// i,j,dist,qdist,p1,p2);
// TODO: allow for off by 1
// if it has punct in it then dist will be 3,
// just a space or similar then dist should be 2.
if ( dist > qdist && dist - qdist >= 2 )
goto skip2;
goto skip1;
if ( dist < qdist && qdist - dist >= 2 )
goto skip2;
goto skip1;
}
// are either synonyms
@ -4126,13 +4130,18 @@ bool PosdbTable::setQueryTermInfo ( ) {
if ( ! qt->m_isRequired ) continue;
// set this stuff
QueryWord *qw = qt->m_qword;
long wordNum = qw - &m_q->m_qwords[0];
//long wordNum = qw - &m_q->m_qwords[0];
// get one
QueryTermInfo *qti = &qip[nrg];
// and set it
qti->m_qt = qt;
qti->m_qtermNum = i;
qti->m_qpos = wordNum;
// this is not good enough, we need to count
// non-whitespace punct as 2 units not 1 unit
// otherwise qdist gets thrown off and our phrasing fails.
// so use QueryWord::m_posNum just for this.
//qti->m_qpos = wordNum;
qti->m_qpos = qw->m_posNum;
qti->m_wikiPhraseId = qw->m_wikiPhraseId;
qti->m_quotedStartId = qw->m_quoteStart;
// is it gbsortby:?

115
Query.cpp

@ -1950,6 +1950,8 @@ bool Query::setQWords ( char boolFlag ,
// assume we contain no pipe operator
long pi = -1;
long posNum = 0;
// loop over all words, these QueryWords are 1-1 with "words"
for ( long i = 0 ; i < numWords && i < MAX_QUERY_WORDS ; i++ ) {
// convenience var, these are 1-1 with "words"
@ -1970,6 +1972,31 @@ bool Query::setQWords ( char boolFlag ,
qw->m_word = words.getWord(i);
qw->m_wordLen = words.getWordLen(i);
qw->m_isPunct = words.isPunct(i);
qw->m_posNum = posNum;
// count 1 unit for it
posNum++;
// . we duplicated this code from XmlDoc.cpp's
// getWordPosVec() function
if ( qw->m_isPunct ) { // ! wids[i] ) {
char *wp = qw->m_word;
long wplen = qw->m_wordLen;
// simple space or sequence of just white space
if ( words.isSpaces(i) )
posNum += 0;
// 'cd-rom'
else if ( wp[0]=='-' && wplen==1 )
posNum += 0;
// 'mr. x'
else if ( wp[0]=='.' && words.isSpaces2(i,1))
posNum += 0;
// animal (dog)
else
posNum++;
}
char *w = words.getWord(i);
long wlen = words.getWordLen(i);
// assume it is a query weight operator
@ -2088,6 +2115,10 @@ bool Query::setQWords ( char boolFlag ,
// TODO: fix title:" hey there" (space in quotes is ok)
bool cancelField = false;
if ( words.hasSpace(i) && ! inQuotes ) cancelField = true;
// fix title:"foo bar" "another quote" so "another quote"
// is not in the title: field
if ( words.hasSpace(i) && inQuotes && nq>= 2 )
cancelField = true;
// BUT if we have a quote, and they just got turned off,
// and the space is not after the quote, do not cancel field!
if ( nq == 1 && cancelField ) {
@ -2681,6 +2712,63 @@ bool Query::setQWords ( char boolFlag ,
//Spam spam;
//spam.reset ( words.getNumWords() );
// treat strongly connected phrases like cd-rom and 3.2.0.3 as being
// in quotes for the most part, therefore, set m_quoteStart for them
long j;
long qs = -1;
for ( j = 0 ; j < numWords ; j++ ) {
// skip all but strongly connected words
if ( m_qwords[j].m_ignoreWord != IGNORE_CONNECTED &&
// must also be non punct word OR a space
( !words.isPunct(j) || words.m_words[j][0] == ' ' ) ) {
// break the "quote", if any
qs = -1; continue; }
// if he is punctuation and qs is -1, skip him,
// punctuation words can no longer start a quote
if ( words.isPunct(j) && qs == -1 ) continue;
// unignore him if we should
if ( keepAllSingles ) m_qwords[j].m_ignoreWord = 0;
// if already in quotes, don't bother!
if ( m_qwords[j].m_quoteStart >= 0 ) continue;
// remember him
if ( qs == -1 ) qs = j;
// he starts the phrase
m_qwords[j].m_quoteStart = qs;
// force him into a quoted phrase
m_qwords[j].m_inQuotes = true;
//m_qwords[j].m_inQuotedPhrase = true;
}
// fix for tags.uri:http://foo.com/bar so it works like
// tags.uri:"http://foo.com/bar" like it should
long first = -1;
for ( j = 0 ; j < numWords ; j++ ) {
// stop when we hit spaces
if ( words.hasSpace(j) ) {
first = -1;
continue;
}
// skip if not in field
if ( ! m_qwords[j].m_fieldCode ) continue;
// first alnumword in field?
if ( first == -1 ) {
// must be alnum
if ( m_qwords[j].m_isPunct ) continue;
// must have punct then another alnum word
if ( j+2 >= numWords ) break;
// spaces screw it up
if ( words.hasSpace(j+1) ) continue;
// then an alnum word after
first = j;
}
// we are in fake quoted phrase
m_qwords[j].m_inQuotes = true;
m_qwords[j].m_quoteStart = first;
}
// make the phrases from the words and the tweaked Bits class
//Phrases phrases;
if ( ! phrases.set ( &words ,
@ -2930,33 +3018,6 @@ bool Query::setQWords ( char boolFlag ,
}
}
// treat strongly connected phrases like cd-rom and 3.2.0.3 as being
// in quotes for the most part, therefore, set m_quoteStart for them
long j;
long qs = -1;
for ( j = 0 ; j < numWords ; j++ ) {
// skip all but strongly connected words
if ( m_qwords[j].m_ignoreWord != IGNORE_CONNECTED &&
// must also be non punct word OR a space
( !words.isPunct(j) || words.m_words[j][0] == ' ' ) ) {
// break the "quote", if any
qs = -1; continue; }
// if he is punctuation and qs is -1, skip him,
// punctuation words can no longer start a quote
if ( words.isPunct(j) && qs == -1 ) continue;
// unignore him if we should
if ( keepAllSingles ) m_qwords[j].m_ignoreWord = 0;
// if already in quotes, don't bother!
if ( m_qwords[j].m_quoteStart >= 0 ) continue;
// remember him
if ( qs == -1 ) qs = j;
// he starts the phrase
m_qwords[j].m_quoteStart = qs;
// force him into a quoted phrase
m_qwords[j].m_inQuotes = true;
//m_qwords[j].m_inQuotedPhrase = true;
}
// . if we only have one quoted query then force its sign to be '+'
// . '"get the phrase" the' --> +"get the phrase" (last the is ignored)
// . "time enough for love" --> +"time enough" +"enough for love"

@ -284,6 +284,7 @@ class QueryWord {
// hash of field name then collection, used to hash termId
long long m_prefixHash;
long m_wordNum;
long m_posNum;
// are we in a phrase in a wikipedia title?
long m_wikiPhraseId;
long m_wikiPhraseStart;

@ -1616,6 +1616,7 @@ void Rdb::doneDumping ( ) {
// this should be called every few seconds by the sleep callback, too
void attemptMergeAll ( int fd , void *state ) {
if ( state && g_conf.m_logDebugDb ) state = NULL;
//g_checksumdb.getRdb()->attemptMerge ( 1 , false , !state);
g_linkdb.getRdb()->attemptMerge ( 1 , false , !state);
@ -2611,7 +2612,11 @@ long long Rdb::getNumGlobalRecs ( ) {
// . return number of positive records - negative records
long long Rdb::getNumTotalRecs ( ) {
long long total = 0;
for ( long i = 0 ; i < getNumBases() ; i++ ) {
long nb = getNumBases();
// don't slam the cpu on this if too many collections
if ( nb > 10 ) return 0;
//return 0; // too many collections!!
for ( long i = 0 ; i < nb ; i++ ) {
RdbBase *base = getBase(i);
if ( ! base ) continue;
total += base->getNumTotalRecs();

@ -1564,8 +1564,10 @@ void RdbBase::attemptMerge ( long niceness, bool forceMergeAll, bool doLog ,
// the rdbmaps hold this info
long long totalRecs = 0LL;
float percentNegativeRecs = getPercentNegativeRecsOnDisk ( &totalRecs);
bool doNegCheck = false;
// 1. if disk space is tight and >20% negative recs, force it
if ( g_process.m_diskAvail >= 0 &&
if ( doNegCheck &&
g_process.m_diskAvail >= 0 &&
g_process.m_diskAvail < 10000000000LL && // 10GB
percentNegativeRecs > .20 ) {
m_nextMergeForced = true;
@ -1577,7 +1579,8 @@ void RdbBase::attemptMerge ( long niceness, bool forceMergeAll, bool doLog ,
m_rdb->m_dbname,g_process.m_diskAvail);
}
// 2. if >40% negative recs force it
if ( percentNegativeRecs > .40 ) {
if ( doNegCheck &&
percentNegativeRecs > .40 ) {
m_nextMergeForced = true;
forceMergeAll = true;
log("rdb: hit negative rec concentration of %f "

@ -5248,7 +5248,7 @@ void SpiderLoop::startLoop ( ) {
// in case host went dead.
// now that we only send the info on startup and if changed,
// let's move back down to 1 second
if ( !g_loop.registerSleepCallback(1000,
if ( !g_loop.registerSleepCallback(5000,
this,
updateAllCrawlInfosSleepWrapper))
log("build: failed to register updatecrawlinfowrapper");
@ -12607,6 +12607,8 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
QUICKPOLL(MAX_NICENESS);
CollectionRec *cr = g_collectiondb.m_recs[i];
if ( ! cr ) continue;

@ -10050,7 +10050,11 @@ Url **XmlDoc::getRedirUrl() {
// AND for custom crawl it was messing up the processing
// url format for a nytimes blog subsite which was redirecting
// to the proper nytimes.com site...
! cr->m_isCustomCrawl ) {
// ! cr->m_isCustomCrawl ) {
// no, we need this for custom crawls because otherwise we
// get too many dups in the index. so for nyt we need something
// else
cr->m_isCustomCrawl != 2 ) {
// returns false if blocked, true otherwise
//return addSimplifiedRedirect();
m_redirError = EDOCSIMPLIFIEDREDIR;
@ -37112,7 +37116,9 @@ bool getWordPosVec ( Words *words ,
dist++;
continue;
}
// and so do sequences of punct
// . and so do sequences of punct
// . must duplicate this code in Query.cpp for setting
// QueryWord::m_posNum
if ( ! wids[i] ) {
// simple space or sequence of just white space
if ( words->isSpaces(i) )