mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-07-15 02:36:08 -04:00
Merge branch 'diffbot-testing' into testing
This commit is contained in:
@ -3359,12 +3359,12 @@ bool CollectionRec::rebuildUrlFilters ( ) {
|
||||
i++;
|
||||
// if just matches ucp, just crawl it, do not process
|
||||
m_regExs[i].set("matchesucp");
|
||||
m_spiderPriorities [i] = 54;
|
||||
m_spiderPriorities [i] = 53;
|
||||
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
|
||||
i++;
|
||||
// just process, do not spider links if does not match ucp
|
||||
m_regExs[i].set("matchesupp");
|
||||
m_spiderPriorities [i] = 53;
|
||||
m_spiderPriorities [i] = 54;
|
||||
m_harvestLinks [i] = false;
|
||||
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
|
||||
//m_spiderDiffbotApiUrl[i].set ( api );
|
||||
@ -3384,7 +3384,7 @@ bool CollectionRec::rebuildUrlFilters ( ) {
|
||||
// harvest links if we should crawl it
|
||||
if ( ucp && ! upp ) {
|
||||
m_regExs[i].set("matchesucp");
|
||||
m_spiderPriorities [i] = 54;
|
||||
m_spiderPriorities [i] = 53;
|
||||
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
|
||||
// process everything since upp is empty
|
||||
//m_spiderDiffbotApiUrl[i].set ( api );
|
||||
@ -3404,7 +3404,7 @@ bool CollectionRec::rebuildUrlFilters ( ) {
|
||||
// just process
|
||||
if ( upp && ! ucp ) {
|
||||
m_regExs[i].set("matchesupp");
|
||||
m_spiderPriorities [i] = 53;
|
||||
m_spiderPriorities [i] = 54;
|
||||
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
|
||||
//m_harvestLinks [i] = false;
|
||||
//m_spiderDiffbotApiUrl[i].set ( api );
|
||||
|
2
Conf.h
2
Conf.h
@ -281,7 +281,7 @@ class Conf {
|
||||
//bool m_stubHubSpideringEnabled;
|
||||
//bool m_eventBriteSpideringEnabled;
|
||||
//bool m_refreshFacebookUsersEnabled;
|
||||
//bool m_injectionEnabled ;
|
||||
bool m_injectionEnabled ;
|
||||
// qa testing loop going on? uses "test" subdir
|
||||
bool m_testParserEnabled ;
|
||||
bool m_testSpiderEnabled ;
|
||||
|
@ -129,7 +129,9 @@ bool Linkdb::init ( ) {
|
||||
"linkdb" ,
|
||||
true , // dedup
|
||||
0 , // fixeddatasize is 0 since no data
|
||||
3,//g_conf.m_linkdbMinFilesToMerge ,
|
||||
// keep it high since we are mostly ssds now and
|
||||
// the reads are small...
|
||||
6,//g_conf.m_linkdbMinFilesToMerge ,
|
||||
// fix this to 15 and rely on the page cache of
|
||||
// just the satellite files and the daily merge to
|
||||
// keep things fast.
|
||||
|
@ -31,6 +31,16 @@ static void sendReplyWrapper ( void *state ) {
|
||||
// HttpServer::sendReply() so we gotta copy it here
|
||||
bool sendPageInject ( TcpSocket *sock , HttpRequest *hr ) {
|
||||
|
||||
if ( ! g_conf.m_injectionEnabled ) {
|
||||
g_errno = EBADENGINEER;
|
||||
log("inject: injection disabled");
|
||||
return g_httpServer.sendErrorReply(sock,500,"injection is disabled by "
|
||||
"the administrator in the master "
|
||||
"controls");
|
||||
}
|
||||
|
||||
|
||||
|
||||
// get the collection
|
||||
// make a new state
|
||||
Msg7 *msg7;
|
||||
|
@ -9555,15 +9555,16 @@ void Parms::init ( ) {
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
/*
|
||||
m->m_title = "url injection enabled";
|
||||
m->m_desc = "If enabled you can directly inject URLs into the index.";
|
||||
m->m_cgi = "ie";
|
||||
m->m_off = (char *)&g_conf.m_injectionEnabled - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m->m_def = "1";
|
||||
m++;
|
||||
*/
|
||||
|
||||
m->m_title = "init QA tests";
|
||||
m->m_desc = "If initiated gb performs some integrity tests "
|
||||
|
17
Posdb.cpp
17
Posdb.cpp
@ -3411,6 +3411,8 @@ float PosdbTable::getTermPairScoreForAny ( long i, long j,
|
||||
m_quotedStartIds[i] >= 0 )
|
||||
inSameQuotedPhrase = true;
|
||||
|
||||
// oops.. this was not counting non-space punct for 2 units
|
||||
// instead of 1
|
||||
if ( inSameQuotedPhrase )
|
||||
qdist = m_qpos[j] - m_qpos[i];
|
||||
|
||||
@ -3503,10 +3505,12 @@ float PosdbTable::getTermPairScoreForAny ( long i, long j,
|
||||
// "posj=%li",
|
||||
// i,j,dist,qdist,p1,p2);
|
||||
// TODO: allow for off by 1
|
||||
// if it has punct in it then dist will be 3,
|
||||
// just a space or similar then dist should be 2.
|
||||
if ( dist > qdist && dist - qdist >= 2 )
|
||||
goto skip2;
|
||||
goto skip1;
|
||||
if ( dist < qdist && qdist - dist >= 2 )
|
||||
goto skip2;
|
||||
goto skip1;
|
||||
}
|
||||
|
||||
// are either synonyms
|
||||
@ -4126,13 +4130,18 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
if ( ! qt->m_isRequired ) continue;
|
||||
// set this stuff
|
||||
QueryWord *qw = qt->m_qword;
|
||||
long wordNum = qw - &m_q->m_qwords[0];
|
||||
//long wordNum = qw - &m_q->m_qwords[0];
|
||||
// get one
|
||||
QueryTermInfo *qti = &qip[nrg];
|
||||
// and set it
|
||||
qti->m_qt = qt;
|
||||
qti->m_qtermNum = i;
|
||||
qti->m_qpos = wordNum;
|
||||
// this is not good enough, we need to count
|
||||
// non-whitespace punct as 2 units not 1 unit
|
||||
// otherwise qdist gets thrown off and our phrasing fails.
|
||||
// so use QueryTerm::m_qpos just for this.
|
||||
//qti->m_qpos = wordNum;
|
||||
qti->m_qpos = qw->m_posNum;
|
||||
qti->m_wikiPhraseId = qw->m_wikiPhraseId;
|
||||
qti->m_quotedStartId = qw->m_quoteStart;
|
||||
// is it gbsortby:?
|
||||
|
115
Query.cpp
115
Query.cpp
@ -1950,6 +1950,8 @@ bool Query::setQWords ( char boolFlag ,
|
||||
// assume we contain no pipe operator
|
||||
long pi = -1;
|
||||
|
||||
long posNum = 0;
|
||||
|
||||
// loop over all words, these QueryWords are 1-1 with "words"
|
||||
for ( long i = 0 ; i < numWords && i < MAX_QUERY_WORDS ; i++ ) {
|
||||
// convenience var, these are 1-1 with "words"
|
||||
@ -1970,6 +1972,31 @@ bool Query::setQWords ( char boolFlag ,
|
||||
qw->m_word = words.getWord(i);
|
||||
qw->m_wordLen = words.getWordLen(i);
|
||||
qw->m_isPunct = words.isPunct(i);
|
||||
|
||||
qw->m_posNum = posNum;
|
||||
|
||||
// count 1 unit for it
|
||||
posNum++;
|
||||
|
||||
// . we duplicated this code from XmlDoc.cpp's
|
||||
// getWordPosVec() function
|
||||
if ( qw->m_isPunct ) { // ! wids[i] ) {
|
||||
char *wp = qw->m_word;
|
||||
long wplen = qw->m_wordLen;
|
||||
// simple space or sequence of just white space
|
||||
if ( words.isSpaces(i) )
|
||||
posNum += 0;
|
||||
// 'cd-rom'
|
||||
else if ( wp[0]=='-' && wplen==1 )
|
||||
posNum += 0;
|
||||
// 'mr. x'
|
||||
else if ( wp[0]=='.' && words.isSpaces2(i,1))
|
||||
posNum += 0;
|
||||
// animal (dog)
|
||||
else
|
||||
posNum++;
|
||||
}
|
||||
|
||||
char *w = words.getWord(i);
|
||||
long wlen = words.getWordLen(i);
|
||||
// assume it is a query weight operator
|
||||
@ -2088,6 +2115,10 @@ bool Query::setQWords ( char boolFlag ,
|
||||
// TODO: fix title:" hey there" (space in quotes is ok)
|
||||
bool cancelField = false;
|
||||
if ( words.hasSpace(i) && ! inQuotes ) cancelField = true;
|
||||
// fix title:"foo bar" "another quote" so "another quote"
|
||||
// is not in the title: field
|
||||
if ( words.hasSpace(i) && inQuotes && nq>= 2 )
|
||||
cancelField = true;
|
||||
// BUT if we have a quote, and they just got turned off,
|
||||
// and the space is not after the quote, do not cancel field!
|
||||
if ( nq == 1 && cancelField ) {
|
||||
@ -2681,6 +2712,63 @@ bool Query::setQWords ( char boolFlag ,
|
||||
//Spam spam;
|
||||
//spam.reset ( words.getNumWords() );
|
||||
|
||||
|
||||
// treat strongly connected phrases like cd-rom and 3.2.0.3 as being
|
||||
// in quotes for the most part, therefore, set m_quoteStart for them
|
||||
long j;
|
||||
long qs = -1;
|
||||
for ( j = 0 ; j < numWords ; j++ ) {
|
||||
// skip all but strongly connected words
|
||||
if ( m_qwords[j].m_ignoreWord != IGNORE_CONNECTED &&
|
||||
// must also be non punct word OR a space
|
||||
( !words.isPunct(j) || words.m_words[j][0] == ' ' ) ) {
|
||||
// break the "quote", if any
|
||||
qs = -1; continue; }
|
||||
// if he is punctuation and qs is -1, skip him,
|
||||
// punctuation words can no longer start a quote
|
||||
if ( words.isPunct(j) && qs == -1 ) continue;
|
||||
// unignore him if we should
|
||||
if ( keepAllSingles ) m_qwords[j].m_ignoreWord = 0;
|
||||
// if already in quotes, don't bother!
|
||||
if ( m_qwords[j].m_quoteStart >= 0 ) continue;
|
||||
// remember him
|
||||
if ( qs == -1 ) qs = j;
|
||||
// he starts the phrase
|
||||
m_qwords[j].m_quoteStart = qs;
|
||||
// force him into a quoted phrase
|
||||
m_qwords[j].m_inQuotes = true;
|
||||
//m_qwords[j].m_inQuotedPhrase = true;
|
||||
}
|
||||
|
||||
// fix for tags.uri:http://foo.com/bar so it works like
|
||||
// tags.uri:"http://foo.com/bar" like it should
|
||||
long first = -1;
|
||||
for ( j = 0 ; j < numWords ; j++ ) {
|
||||
// stop when we hit spaces
|
||||
if ( words.hasSpace(j) ) {
|
||||
first = -1;
|
||||
continue;
|
||||
}
|
||||
// skip if not in field
|
||||
if ( ! m_qwords[j].m_fieldCode ) continue;
|
||||
// first alnumword in field?
|
||||
if ( first == -1 ) {
|
||||
// must be alnum
|
||||
if ( m_qwords[j].m_isPunct ) continue;
|
||||
// must have punct then another alnum word
|
||||
if ( j+2 >= numWords ) break;
|
||||
// spaces screw it up
|
||||
if ( words.hasSpace(j+1) ) continue;
|
||||
// then an alnum word after
|
||||
first = j;
|
||||
}
|
||||
// we are in fake quoted phrase
|
||||
m_qwords[j].m_inQuotes = true;
|
||||
m_qwords[j].m_quoteStart = first;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// make the phrases from the words and the tweaked Bits class
|
||||
//Phrases phrases;
|
||||
if ( ! phrases.set ( &words ,
|
||||
@ -2930,33 +3018,6 @@ bool Query::setQWords ( char boolFlag ,
|
||||
}
|
||||
}
|
||||
|
||||
// treat strongly connected phrases like cd-rom and 3.2.0.3 as being
|
||||
// in quotes for the most part, therefore, set m_quoteStart for them
|
||||
long j;
|
||||
long qs = -1;
|
||||
for ( j = 0 ; j < numWords ; j++ ) {
|
||||
// skip all but strongly connected words
|
||||
if ( m_qwords[j].m_ignoreWord != IGNORE_CONNECTED &&
|
||||
// must also be non punct word OR a space
|
||||
( !words.isPunct(j) || words.m_words[j][0] == ' ' ) ) {
|
||||
// break the "quote", if any
|
||||
qs = -1; continue; }
|
||||
// if he is punctuation and qs is -1, skip him,
|
||||
// punctuation words can no longer start a quote
|
||||
if ( words.isPunct(j) && qs == -1 ) continue;
|
||||
// unignore him if we should
|
||||
if ( keepAllSingles ) m_qwords[j].m_ignoreWord = 0;
|
||||
// if already in quotes, don't bother!
|
||||
if ( m_qwords[j].m_quoteStart >= 0 ) continue;
|
||||
// remember him
|
||||
if ( qs == -1 ) qs = j;
|
||||
// he starts the phrase
|
||||
m_qwords[j].m_quoteStart = qs;
|
||||
// force him into a quoted phrase
|
||||
m_qwords[j].m_inQuotes = true;
|
||||
//m_qwords[j].m_inQuotedPhrase = true;
|
||||
}
|
||||
|
||||
// . if we only have one quoted query then force its sign to be '+'
|
||||
// . '"get the phrase" the' --> +"get the phrase" (last the is ignored)
|
||||
// . "time enough for love" --> +"time enough" +"enough for love"
|
||||
|
1
Query.h
1
Query.h
@ -284,6 +284,7 @@ class QueryWord {
|
||||
// hash of field name then collection, used to hash termId
|
||||
long long m_prefixHash;
|
||||
long m_wordNum;
|
||||
long m_posNum;
|
||||
// are we in a phrase in a wikipedia title?
|
||||
long m_wikiPhraseId;
|
||||
long m_wikiPhraseStart;
|
||||
|
7
Rdb.cpp
7
Rdb.cpp
@ -1616,6 +1616,7 @@ void Rdb::doneDumping ( ) {
|
||||
|
||||
// this should be called every few seconds by the sleep callback, too
|
||||
void attemptMergeAll ( int fd , void *state ) {
|
||||
|
||||
if ( state && g_conf.m_logDebugDb ) state = NULL;
|
||||
//g_checksumdb.getRdb()->attemptMerge ( 1 , false , !state);
|
||||
g_linkdb.getRdb()->attemptMerge ( 1 , false , !state);
|
||||
@ -2611,7 +2612,11 @@ long long Rdb::getNumGlobalRecs ( ) {
|
||||
// . return number of positive records - negative records
|
||||
long long Rdb::getNumTotalRecs ( ) {
|
||||
long long total = 0;
|
||||
for ( long i = 0 ; i < getNumBases() ; i++ ) {
|
||||
long nb = getNumBases();
|
||||
// don't slam the cpu on this if too many collections
|
||||
if ( nb > 10 ) return 0;
|
||||
//return 0; // too many collections!!
|
||||
for ( long i = 0 ; i < nb ; i++ ) {
|
||||
RdbBase *base = getBase(i);
|
||||
if ( ! base ) continue;
|
||||
total += base->getNumTotalRecs();
|
||||
|
@ -1564,8 +1564,10 @@ void RdbBase::attemptMerge ( long niceness, bool forceMergeAll, bool doLog ,
|
||||
// the rdbmaps hold this info
|
||||
long long totalRecs = 0LL;
|
||||
float percentNegativeRecs = getPercentNegativeRecsOnDisk ( &totalRecs);
|
||||
bool doNegCheck = false;
|
||||
// 1. if disk space is tight and >20% negative recs, force it
|
||||
if ( g_process.m_diskAvail >= 0 &&
|
||||
if ( doNegCheck &&
|
||||
g_process.m_diskAvail >= 0 &&
|
||||
g_process.m_diskAvail < 10000000000LL && // 10GB
|
||||
percentNegativeRecs > .20 ) {
|
||||
m_nextMergeForced = true;
|
||||
@ -1577,7 +1579,8 @@ void RdbBase::attemptMerge ( long niceness, bool forceMergeAll, bool doLog ,
|
||||
m_rdb->m_dbname,g_process.m_diskAvail);
|
||||
}
|
||||
// 2. if >40% negative recs force it
|
||||
if ( percentNegativeRecs > .40 ) {
|
||||
if ( doNegCheck &&
|
||||
percentNegativeRecs > .40 ) {
|
||||
m_nextMergeForced = true;
|
||||
forceMergeAll = true;
|
||||
log("rdb: hit negative rec concentration of %f "
|
||||
|
@ -5248,7 +5248,7 @@ void SpiderLoop::startLoop ( ) {
|
||||
// in case host went dead.
|
||||
// now that we only send the info on startup and if changed,
|
||||
// let's move back down to 1 second
|
||||
if ( !g_loop.registerSleepCallback(1000,
|
||||
if ( !g_loop.registerSleepCallback(5000,
|
||||
this,
|
||||
updateAllCrawlInfosSleepWrapper))
|
||||
log("build: failed to register updatecrawlinfowrapper");
|
||||
@ -12607,6 +12607,8 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
|
||||
|
||||
for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
|
||||
|
||||
QUICKPOLL(MAX_NICENESS);
|
||||
|
||||
CollectionRec *cr = g_collectiondb.m_recs[i];
|
||||
if ( ! cr ) continue;
|
||||
|
||||
|
10
XmlDoc.cpp
10
XmlDoc.cpp
@ -10050,7 +10050,11 @@ Url **XmlDoc::getRedirUrl() {
|
||||
// AND for custom crawl it was messing up the processing
|
||||
// url format for a nytimes blog subsite which was redirecting
|
||||
// to the proper nytimes.com site...
|
||||
! cr->m_isCustomCrawl ) {
|
||||
// ! cr->m_isCustomCrawl ) {
|
||||
// no, we need this for custom crawls because otherwise we
|
||||
// get too many dups in the index. so for nyt we need something
|
||||
// else
|
||||
cr->m_isCustomCrawl != 2 ) {
|
||||
// returns false if blocked, true otherwise
|
||||
//return addSimplifiedRedirect();
|
||||
m_redirError = EDOCSIMPLIFIEDREDIR;
|
||||
@ -37112,7 +37116,9 @@ bool getWordPosVec ( Words *words ,
|
||||
dist++;
|
||||
continue;
|
||||
}
|
||||
// and so do sequences of punct
|
||||
// . and so do sequences of punct
|
||||
// . must duplicate this code in Query.cpp for setting
|
||||
// QueryWord::m_posNum
|
||||
if ( ! wids[i] ) {
|
||||
// simple space or sequence of just white space
|
||||
if ( words->isSpaces(i) )
|
||||
|
Reference in New Issue
Block a user