#include "PosdbTable.h" #include "Posdb.h" #include "gb-include.h" #include "PageTemperatureRegistry.h" #include "Docid2Siteflags.h" #include "ScalingFunctions.h" #include "ScoringWeights.h" #include "BitOperations.h" #include "Msg2.h" #include "Msg39.h" #include "Sanity.h" #include "Stats.h" #include "Conf.h" #include "TopTree.h" #include "DocumentIndexChecker.h" #include "Lang.h" #include "GbMutex.h" #include "ScopedLock.h" #include <math.h> #include <valarray> #ifdef _VALGRIND_ #include <valgrind/memcheck.h> #include <valgrind/helgrind.h> #endif #define BF_HALFSTOPWIKIBIGRAM 0x01 // "to be" in "to be or not to be" #define BF_PIPED 0x02 // before a query pipe operator #define BF_SYNONYM 0x04 #define BF_NEGATIVE 0x08 // query word has a negative sign before it #define BF_BIGRAM 0x10 #define BF_NUMBER 0x20 // is it like gbsortby:price? numeric? static const int INTERSECT_SCORING = 0; static const int INTERSECT_DEBUG_INFO = 1; static bool s_init = false; static GbMutex s_mtx_weights; static ScoringWeights s_scoringWeights; static bool s_isCompatible [HASHGROUP_END][HASHGROUP_END]; static bool s_inBody [HASHGROUP_END]; #define gbmin(a,b) ((a)<(b) ? (a) : (b)) #define gbmax(a,b) ((a)>(b) ? (a) : (b)) static inline bool isTermValueInRange( const char *p, const QueryTerm *qt); static inline bool isTermValueInRange2 ( const char *recPtr, const char *subListEnd, const QueryTerm *qt); static inline const char *getWordPosList(uint64_t docId, const char *list, int32_t listSize); static int docIdVoteBufKeyCompare_desc ( const void *h1, const void *h2 ); static void initWeights(); ////////////////// // // THE NEW INTERSECTION LOGIC // ////////////////// PosdbTable::PosdbTable() { // top docid info m_q = NULL; m_msg39req = NULL; reset(); } PosdbTable::~PosdbTable() { reset(); } void PosdbTable::reset() { // has init() been called? m_initialized = false; m_estimatedTotalHits = -1; //freeMem(); // not implemented // does not free the mem of this safebuf, only resets length m_docIdVoteBuf.reset(); m_filtered = 0; m_qiBuf.reset(); // assume no-op m_t1 = 0LL; m_whiteListTable.reset(); m_addedSites = false; // Coverity m_docId = 0; m_hasMaxSerpScore = false; m_siteRankMultiplier = 0.0; m_addListsTime = 0; m_t2 = 0; m_numSlots = 0; m_maxScores = 0; m_collnum = 0; m_qpos = NULL; m_wikiPhraseIds = NULL; m_quotedStartIds = NULL; m_freqWeights = NULL; m_bflags = NULL; m_qtermNums = NULL; m_bestMinTermPairWindowScore = 0.0; m_bestMinTermPairWindowPtrs = NULL; m_docsInColl = 0; m_msg2 = NULL; m_topTree = NULL; m_nqt = 0; m_debug = false; m_logstate = NULL; m_sortByTermNum = -1; m_sortByTermNumInt = -1; m_sortByTermInfoNum = 0; m_sortByTermInfoNumInt = 0; m_minScoreTermNum = 0; m_maxScoreTermNum = 0; m_minScoreVal = 0.0; m_maxScoreVal = 0.0; m_minScoreTermNumInt = 0; m_maxScoreTermNumInt = 0; m_minScoreValInt = 0; m_maxScoreValInt = 0; m_useWhiteTable = false; m_numQueryTermInfos = 0; m_minTermListSize = 0; m_minTermListIdx = 0; m_vecSize = 0; m_allInSameWikiPhrase = 0; m_realMaxTop = 0; } // realloc to save mem if we're rat void PosdbTable::freeMem ( ) { //@todo: ? } // . returns false on error and sets g_errno // . NOTE: termFreqs is just referenced by us, not copied // . sets m_startKeys, m_endKeys and m_minNumRecs for each termId // . TODO: ensure that m_termFreqs[] are all UPPER BOUNDS on the actual #!! // we should be able to get an upper bound estimate from the b-tree // quickly using Msg36! // . we now support multiple plus signs before the query term // . 

// . NOTE: termFreqs is just referenced by us, not copied
// . sets m_startKeys, m_endKeys and m_minNumRecs for each termId
// . TODO: ensure that m_termFreqs[] are all UPPER BOUNDS on the actual #!!
//   we should be able to get an upper bound estimate from the b-tree
//   quickly using Msg36!
// . we now support multiple plus signs before the query term
// . lists[] and termFreqs[] must be 1-1 with q->m_qterms[]
void PosdbTable::init(Query *q, bool debug, void *logstate, TopTree *topTree,
                      const DocumentIndexChecker &documentIndexChecker, Msg2 *msg2, Msg39Request *r) {
	// sanity check -- watch out for double calls
	if ( m_initialized )
		gbshutdownAbort(true);

	// clear everything
	reset();

	// we are now
	m_initialized = true;

	// set debug flag
	m_debug = (debug || g_conf.m_logDebugQuery);

	// we should save the lists!
	//m_lists = msg2->m_lists;//lists;
	//m_numLists = q->m_numTerms;

	// seo.cpp supplies a NULL msg2 because it already sets
	// QueryTerm::m_posdbListPtrs
	if ( ! msg2 )
		return;

	m_msg2 = msg2;

	// save this
	m_collnum = r->m_collnum;

	// save the request
	m_msg39req = r;

	// save this
	//m_coll = coll;
	// get the rec for it
	// CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
	// if ( ! cr )
	//	gbshutdownAbort(true);
	// set this now
	//m_collnum = cr->m_collnum;

	m_documentIndexChecker = &documentIndexChecker;
	m_topTree = topTree;

	// remember the query class, it has all the info about the termIds
	m_q = q;
	m_nqt = q->getNumTerms();

	// for debug msgs
	m_logstate = logstate;

	m_realMaxTop = r->m_realMaxTop;
	if ( m_realMaxTop > MAX_TOP )
		m_realMaxTop = MAX_TOP;

	m_siteRankMultiplier = SITERANKMULTIPLIER;
	if ( m_q->m_isBoolean )
		m_siteRankMultiplier = 0.0;

	// sanity
	if ( msg2->getNumLists() != m_q->getNumTerms() )
		gbshutdownAbort(true);

	// copy the list ptrs to the QueryTerm::m_posdbListPtr
	for ( int32_t i = 0 ; i < m_q->m_numTerms ; i++ )
		m_q->m_qterms[i].m_posdbListPtr = msg2->getList(i);

	// we always use it now
	if ( ! topTree )
		gbshutdownAbort(true);
}
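
//
// Single-term score used below: each weight is applied twice,
// presumably to keep these scores on the same scale as the pair
// scores, which multiply in one weight per term:
//
//   score = 100 * divW^2 * hgW^2 * densW^2 * spamW^2 [ * synW^2 ]
//
// Illustrative numbers (not the actual configured weights): divW=1.0,
// hgW=1.5 (title), densW=0.9, spamW=1.0 gives
// 100 * 2.25 * 0.81 = 182.25. The best score per modified hash group
// (INLINK_TEXT may occupy several slots) is kept in bestScores[],
// summed, and scaled by m_freqWeights[i].
//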

//
// Find the top score for the term for each hash group.
// INLINK_TEXT may have more than one entry in the top-scorer list,
// other hash groups only 1.
// INLINK_TEXT may override other lower scoring hashgroups' scores.
// Sum up the best scores, and return that result.
// Sets highestScoringNonBodyPos to the highest scoring position.
//
float PosdbTable::getBestScoreSumForSingleTerm(int32_t i, const char *wpi, const char *endi,
                                               DocIdScore *pdcs, const char **highestScoringNonBodyPos) {
	float nonBodyMax = -1.0;
	int32_t lowestScoreTermIdx = 0;
	float bestScores[MAX_TOP] = {0}; // make Coverity happy
	const char *bestwpi[MAX_TOP];
	int32_t numTop = 0;

	logTrace(g_conf.m_logTracePosdb, "BEGIN.");

	// assume no terms!
	*highestScoringNonBodyPos = NULL;

	if ( wpi ) {
		// Sanity check
		if ( wpi >= endi ) {
			logTrace(g_conf.m_logTracePosdb, "END, wpi %p >= %p", wpi, endi);
			return -1.0;
		}

#ifdef _VALGRIND_
		VALGRIND_CHECK_MEM_IS_DEFINED(wpi,endi-wpi);
#endif

		bool first = true;
		char bestmhg[MAX_TOP];

		do {
			float score = 100.0;

			// good diversity?
			unsigned char div = Posdb::getDiversityRank ( wpi );
			score *= m_msg39req->m_scoringWeights.m_diversityWeights[div];
			score *= m_msg39req->m_scoringWeights.m_diversityWeights[div];

			// hash group? title? body? heading? etc.
			unsigned char hg = Posdb::getHashGroup ( wpi );
			unsigned char mhg = hg;
			if ( s_inBody[mhg] )
				mhg = HASHGROUP_BODY;
			score *= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg];
			score *= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg];

			// good density?
			unsigned char dens = Posdb::getDensityRank ( wpi );
			score *= m_msg39req->m_scoringWeights.m_densityWeights[dens];
			score *= m_msg39req->m_scoringWeights.m_densityWeights[dens];

			// to make more compatible with pair scores divide by distance of 2
			//score /= 2.0;

			// word spam?
			unsigned char wspam = Posdb::getWordSpamRank ( wpi );

			// word spam weight update
			if ( hg == HASHGROUP_INLINKTEXT ) {
				score *= m_msg39req->m_scoringWeights.m_linkerWeights[wspam];
				score *= m_msg39req->m_scoringWeights.m_linkerWeights[wspam];
			}
			else {
				score *= m_msg39req->m_scoringWeights.m_wordSpamWeights[wspam];
				score *= m_msg39req->m_scoringWeights.m_wordSpamWeights[wspam];
			}

			// synonym
			if ( Posdb::getIsSynonym(wpi) ) {
				score *= m_msg39req->m_synonymWeight;
				score *= m_msg39req->m_synonymWeight;
			}

			// do not allow duplicate hashgroups!
			int32_t bro = -1;
			for ( int32_t k = 0 ; k < numTop ; k++ ) {
				if ( bestmhg[k] == mhg && hg != HASHGROUP_INLINKTEXT ) {
					bro = k;
					break;
				}
			}

			if ( bro >= 0 ) {
				//
				// We already have a score for this hash group, update if
				// current score is higher
				//
				if ( score > bestScores[bro] ) {
					bestScores[bro] = score;
					bestwpi   [bro] = wpi;
					bestmhg   [bro] = mhg;
				}
			}
			else if ( numTop < m_realMaxTop ) { // MAX_TOP ) {
				//
				// New hash group (or INLINKTEXT).
				// We have free slots in the top-list.
				// Store found score.
				//
				bestScores[numTop] = score;
				bestwpi   [numTop] = wpi;
				bestmhg   [numTop] = mhg;
				numTop++;
			}
			else if ( score > bestScores[lowestScoreTermIdx] ) {
				//
				// New hash group (or INLINKTEXT).
				// We have NO free slots in the top-list.
				// Replace lowest score in top-list with current higher score.
				//
				bestScores[lowestScoreTermIdx] = score;
				bestwpi   [lowestScoreTermIdx] = wpi;
				bestmhg   [lowestScoreTermIdx] = mhg;
			}

			// If top-list is full, make lowestScoreTermIdx point to the lowest score
			// in the top-list.
			if ( numTop >= m_realMaxTop ) { // MAX_TOP ) {
				lowestScoreTermIdx = 0;
				for ( int32_t k = 1 ; k < m_realMaxTop ; k++ ) { //MAX_TOP ; k++ ) {
					if ( bestScores[k] > bestScores[lowestScoreTermIdx] ) {
						continue;
					}
					lowestScoreTermIdx = k;
				}
			}

			// For findMinTermPairScoreInWindow() sub-out algo.
			// If the term is not in the body, and the score is the
			// highest non-body term score, return the position index.
			if ( score > nonBodyMax && ! s_inBody[hg] ) {
				nonBodyMax = score;
				*highestScoringNonBodyPos = wpi;
			}

			// first key is 12 bytes
			if ( first ) {
				wpi += 6;
				first = false;
			}

			// advance
			wpi += 6;
		} while ( wpi < endi && Posdb::getKeySize(wpi) == 6 );
	}

	// add up the top scores
	float sum = 0.0;
	for ( int32_t k = 0 ; k < numTop ; k++ ) {
		// if it is something like "enough for" in a wikipedia
		// phrase like "time enough for love" give it a boost!
		// now we set a special bit in the keys since we do a mini
		// merge, we do the same thing for the syn bits
		if ( Posdb::getIsHalfStopWikiBigram(bestwpi[k]) ) {
			sum += (bestScores[k] * WIKI_BIGRAM_WEIGHT * WIKI_BIGRAM_WEIGHT);
		}
		else {
			// otherwise just add it up
			sum += bestScores[k];
		}
	}

	// wiki weight
	//sum *= ts;

	sum *= m_freqWeights[i];
	sum *= m_freqWeights[i];

	// shortcut
	//char *maxp = bestwpi[k];

	// if term is a special wiki half stop bigram
	//if ( m_bflags[i] & BF_HALFSTOPWIKIBIGRAM ) {
	//	sum *= WIKI_BIGRAM_WEIGHT;
	//	sum *= WIKI_BIGRAM_WEIGHT;
	//}

	// empty list?
	//if ( ! wpi ) sum = -2.0;

	//
	// end the loop. return now if not collecting scoring info.
	//
	if ( ! pdcs ) {
		logTrace(g_conf.m_logTracePosdb, "END.");
		return sum;
	}

	//#
	//# The below is for visual presentation of the scoring ONLY
	//#

	// none? wtf?
	if ( numTop <= 0 ) {
		logTrace(g_conf.m_logTracePosdb, "END.");
		return sum;
	}

	// point into buf
	SingleScore *sx = (SingleScore *)m_singleScoreBuf.getBufPtr();
	int32_t need = sizeof(SingleScore) * numTop;

	// point to that
	if ( pdcs->m_singlesOffset < 0 ) {
		pdcs->m_singlesOffset = m_singleScoreBuf.length();
	}

	// reset this i guess
	pdcs->m_singleScores = NULL;

	// sanity
	if ( m_singleScoreBuf.getAvail() < need ) {
		static bool s_first = true;
		if ( s_first ) {
			log("posdb: CRITICAL single buf overflow");
		}
		s_first = false;
		logTrace(g_conf.m_logTracePosdb, "END.");
		return sum;
		//gbshutdownAbort(true);
	}

	// increase buf ptr over this then
	m_singleScoreBuf.incrementLength(need);

	// set each of the top scoring terms individually
	for ( int32_t k = 0 ; k < numTop ; k++, sx++ ) {
		// update count
		pdcs->m_numSingles++;
		const char *maxp = bestwpi[k];
		memset(sx,0,sizeof(*sx));
		sx->m_isSynonym = Posdb::getIsSynonym(maxp);
		sx->m_isHalfStopWikiBigram = Posdb::getIsHalfStopWikiBigram(maxp);
		//sx->m_isSynonym = (m_bflags[i] & BF_SYNONYM) ;
		sx->m_diversityRank = Posdb::getDiversityRank(maxp);
		sx->m_wordSpamRank = Posdb::getWordSpamRank(maxp);
		sx->m_hashGroup = Posdb::getHashGroup(maxp);
		sx->m_wordPos = Posdb::getWordPos(maxp);
		sx->m_densityRank = Posdb::getDensityRank(maxp);

		float score = bestScores[k];
		//score *= ts;
		score *= m_freqWeights[i];
		score *= m_freqWeights[i];

		// if term is a special wiki half stop bigram
		if ( sx->m_isHalfStopWikiBigram ) {
			score *= WIKI_BIGRAM_WEIGHT;
			score *= WIKI_BIGRAM_WEIGHT;
		}

		sx->m_finalScore = score;
		sx->m_tfWeight = m_freqWeights[i];
		sx->m_qtermNum = m_qtermNums[i];
		//int64_t *termFreqs = (int64_t *)m_msg39req->ptr_termFreqs;
		//sx->m_termFreq = termFreqs[sx->m_qtermNum];
		sx->m_bflags = m_bflags[i];
	}

	logTrace(g_conf.m_logTracePosdb, "END. sum=%f", sum);
	return sum;
}
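
//
// Pair scoring in the next functions uses one weight per term and
// damps by distance:
//
//   score = 100 * denw1*denw2 * hgw1*hgw2 * spamw1*spamw2
//           [ * synonymWeight per synonym ] / (dist + 1)
//
// Worked example with illustrative weights of 1.0: p1=5, p2=20,
// qdist=2 gives dist = 15 - 2 = 13, so score = 100/14 ≈ 7.1. Pairs
// more than 50 positions apart in different non-body hash groups get
// dist = FIXED_DISTANCE instead.
//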

// . advance two ptrs at the same time so it's just a linear scan
// . TODO: add all up, then basically taking a weight of the top 6 or so...
float PosdbTable::getMaxScoreForNonBodyTermPair(const char *wpi, const char *wpj,
                                                const char *endi, const char *endj, int32_t qdist) {
	// Sanity check
	if ( wpi >= endi || wpj >= endj ) {
		return -1.0;
	}

#ifdef _VALGRIND_
	VALGRIND_CHECK_MEM_IS_DEFINED(wpi,endi-wpi);
	VALGRIND_CHECK_MEM_IS_DEFINED(wpj,endj-wpj);
#endif

	logTrace(g_conf.m_logTracePosdb, "BEGIN.");

	int32_t p1 = Posdb::getWordPos ( wpi );
	int32_t p2 = Posdb::getWordPos ( wpj );

	unsigned char hg1 = Posdb::getHashGroup(wpi);
	unsigned char hg2 = Posdb::getHashGroup(wpj);
	//temporary fix: posdb can have junk in it so clamp the hashgroup to the limit
	if ( hg1 >= HASHGROUP_END ) hg1 = HASHGROUP_END-1;
	if ( hg2 >= HASHGROUP_END ) hg2 = HASHGROUP_END-1;

	unsigned char wsr1 = Posdb::getWordSpamRank(wpi);
	unsigned char wsr2 = Posdb::getWordSpamRank(wpj);

	float spamw1;
	float spamw2;
	if ( hg1 == HASHGROUP_INLINKTEXT ) {
		spamw1 = m_msg39req->m_scoringWeights.m_linkerWeights[wsr1];
	}
	else {
		spamw1 = m_msg39req->m_scoringWeights.m_wordSpamWeights[wsr1];
	}

	if ( hg2 == HASHGROUP_INLINKTEXT ) {
		spamw2 = m_msg39req->m_scoringWeights.m_linkerWeights[wsr2];
	}
	else {
		spamw2 = m_msg39req->m_scoringWeights.m_wordSpamWeights[wsr2];
	}

	// density weight
	//float denw ;
	//if ( hg1 == HASHGROUP_BODY ) denw = 1.0;
	float denw1 = m_msg39req->m_scoringWeights.m_densityWeights[Posdb::getDensityRank(wpi)];
	float denw2 = m_msg39req->m_scoringWeights.m_densityWeights[Posdb::getDensityRank(wpj)];

	bool firsti = true;
	bool firstj = true;
	float score;
	float max = -1.0;
	int32_t dist;

	for(;;) {
		if ( p1 <= p2 ) {
			// . skip the pair if they are in different hashgroups
			// . we no longer allow either to be in the body in this
			//   algo because we handle those cases in the sliding window
			//   algo!
			if ( s_isCompatible[hg1][hg2] ) {
				// get distance
				dist = p2 - p1;

				// if zero, make sure its 2. this happens when the same bigram
				// is used by both terms. i.e. street uses the bigram
				// 'street light' and so does 'light'. so the wordpositions
				// are exactly the same!
				if ( dist < 2 ) {
					dist = 2;
				}

				// fix distance if in different non-body hashgroups
				if ( dist > 50 ) {
					dist = FIXED_DISTANCE;
				}

				// subtract from the dist the terms are apart in the query
				if ( dist >= qdist ) {
					dist = dist - qdist;
				}

				// good density?
				score = 100 * denw1 * denw2;

				// hashgroup modifier
				score *= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg1];
				score *= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg2];

				// if synonym or alternate word form
				if ( Posdb::getIsSynonym(wpi) ) {
					score *= m_msg39req->m_synonymWeight;
				}
				if ( Posdb::getIsSynonym(wpj) ) {
					score *= m_msg39req->m_synonymWeight;
				}

				// word spam weights
				score *= spamw1 * spamw2;

				// huge title? do not allow 11th+ word to be weighted high
				//if ( hg1 == HASHGROUP_TITLE && dist > 20 )
				//	score /= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg1];

				// mod by distance
				score /= (dist + 1.0);

				// tmp hack
				//score *= (dist+1.0);

				// best?
				if ( score > max ) {
					max = score;
				}
			}

			// first key is 12 bytes
			if ( firsti ) {
				wpi += 6;
				firsti = false;
			}

			// advance
			wpi += 6;

			// end of list?
			if ( wpi >= endi ) {
				break; // exit for(;;) loop
			}

			// exhausted?
			if ( Posdb::getKeySize ( wpi ) != 6 ) {
				break; // exit for(;;) loop
			}

			// update. include G-bits?
			p1 = Posdb::getWordPos ( wpi );

			// hash group update
			hg1 = Posdb::getHashGroup ( wpi );

			// update density weight in case hash group changed
			denw1 = m_msg39req->m_scoringWeights.m_densityWeights[Posdb::getDensityRank(wpi)];

			// word spam weight update
			if ( hg1 == HASHGROUP_INLINKTEXT ) {
				spamw1 = m_msg39req->m_scoringWeights.m_linkerWeights[Posdb::getWordSpamRank(wpi)];
			}
			else {
				spamw1 = m_msg39req->m_scoringWeights.m_wordSpamWeights[Posdb::getWordSpamRank(wpi)];
			}
		}
		else {
			// . skip the pair if they are in different hashgroups
			// . we no longer allow either to be in the body in this
			//   algo because we handle those cases in the sliding window
			//   algo!
			if ( s_isCompatible[hg1][hg2] ) {
				// get distance
				dist = p1 - p2;

				// if zero, make sure its 2. this happens when the same bigram
				// is used by both terms. i.e. street uses the bigram
				// 'street light' and so does 'light'. so the wordpositions
				// are exactly the same!
				if ( dist < 2 ) dist = 2;

				// fix distance if in different non-body hashgroups
				if ( dist > 50 ) {
					dist = FIXED_DISTANCE;
				}

				// subtract from the dist the terms are apart in the query
				if ( dist >= qdist ) {
					dist = dist - qdist;
					// add 1 for being out of order
					dist += qdist - 1;
				}
				else {
					//dist = dist - qdist;
					// add 1 for being out of order
					dist += 1; // qdist - 1;
				}

				// compute score based on that junk
				//score = (MAXWORDPOS+1) - dist;

				// good diversity? unneeded for pair algo
				//score *= m_msg39req->m_scoringWeights.m_diversityWeights[div1];
				//score *= m_msg39req->m_scoringWeights.m_diversityWeights[div2];
				// good density?
				score = 100 * denw1 * denw2;

				// hashgroup modifier
				score *= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg1];
				score *= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg2];

				// if synonym or alternate word form
				if ( Posdb::getIsSynonym(wpi) ) score *= m_msg39req->m_synonymWeight;
				if ( Posdb::getIsSynonym(wpj) ) score *= m_msg39req->m_synonymWeight;
				//if ( m_bflags[i] & BF_SYNONYM ) score *= m_msg39req->m_synonymWeight;
				//if ( m_bflags[j] & BF_SYNONYM ) score *= m_msg39req->m_synonymWeight;

				// word spam weights
				score *= spamw1 * spamw2;

				// huge title? do not allow 11th+ word to be weighted high
				//if ( hg1 == HASHGROUP_TITLE && dist > 20 )
				//	score /= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg1];

				// mod by distance
				score /= (dist + 1.0);

				// tmp hack
				//score *= (dist+1.0);

				// best?
				if ( score > max ) {
					max = score;
				}
			}

			// first key is 12 bytes
			if ( firstj ) {
				wpj += 6;
				firstj = false;
			}

			// advance
			wpj += 6;

			// end of list?
			if ( wpj >= endj ) {
				break; // exit for(;;) loop
			}

			// exhausted?
			if ( Posdb::getKeySize(wpj) != 6 ) {
				break; // exit for(;;) loop
			}

			// update
			p2 = Posdb::getWordPos(wpj);

			// hash group update
			hg2 = Posdb::getHashGroup(wpj);

			// update density weight in case hash group changed
			denw2 = m_msg39req->m_scoringWeights.m_densityWeights[Posdb::getDensityRank(wpj)];

			// word spam weight update
			if ( hg2 == HASHGROUP_INLINKTEXT ) {
				spamw2 = m_msg39req->m_scoringWeights.m_linkerWeights[Posdb::getWordSpamRank(wpj)];
			}
			else {
				spamw2 = m_msg39req->m_scoringWeights.m_wordSpamWeights[Posdb::getWordSpamRank(wpj)];
			}
		}
	}

	logTrace(g_conf.m_logTracePosdb, "END.");
	return max;
}
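
//
// Unlike the scan above, getScoreForTermPair() scores exactly one
// position pair, optionally with a caller-supplied fixedDistance.
// Worked example with illustrative weights of 1.0: p1=30, p2=10,
// qdist=2 gives dist = 20 - 2 = 18, plus 1 for being out of order
// = 19, so score = 100/20 = 5.0.
//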

float PosdbTable::getScoreForTermPair(const char *wpi, const char *wpj, int32_t fixedDistance, int32_t qdist) {
	logTrace(g_conf.m_logTracePosdb, "BEGIN.");

	if ( ! wpi ) {
		logTrace(g_conf.m_logTracePosdb, "END.");
		return -1.00;
	}
	if ( ! wpj ) {
		logTrace(g_conf.m_logTracePosdb, "END.");
		return -1.00;
	}

#ifdef _VALGRIND_
	VALGRIND_CHECK_MEM_IS_DEFINED(wpi,6);
	VALGRIND_CHECK_MEM_IS_DEFINED(wpj,6);
#endif

	int32_t p1 = Posdb::getWordPos ( wpi );
	int32_t p2 = Posdb::getWordPos ( wpj );
	unsigned char hg1 = Posdb::getHashGroup ( wpi );
	unsigned char hg2 = Posdb::getHashGroup ( wpj );
	unsigned char wsr1 = Posdb::getWordSpamRank(wpi);
	unsigned char wsr2 = Posdb::getWordSpamRank(wpj);
	float spamw1;
	float spamw2;
	float denw1;
	float denw2;
	float dist;
	float score;

	if ( hg1 == HASHGROUP_INLINKTEXT ) spamw1 = m_msg39req->m_scoringWeights.m_linkerWeights[wsr1];
	else                               spamw1 = m_msg39req->m_scoringWeights.m_wordSpamWeights[wsr1];
	if ( hg2 == HASHGROUP_INLINKTEXT ) spamw2 = m_msg39req->m_scoringWeights.m_linkerWeights[wsr2];
	else                               spamw2 = m_msg39req->m_scoringWeights.m_wordSpamWeights[wsr2];

	denw1 = m_msg39req->m_scoringWeights.m_densityWeights[Posdb::getDensityRank(wpi)];
	denw2 = m_msg39req->m_scoringWeights.m_densityWeights[Posdb::getDensityRank(wpj)];

	// set this
	if ( fixedDistance != 0 ) {
		dist = fixedDistance;
	}
	else {
		// do the math now
		if ( p2 < p1 ) dist = p1 - p2;
		else           dist = p2 - p1;

		// if zero, make sure its 2. this happens when the same bigram
		// is used by both terms. i.e. street uses the bigram
		// 'street light' and so does 'light'. so the wordpositions
		// are exactly the same!
		if ( dist < 2 ) dist = 2;

		// subtract from the dist the terms are apart in the query
		if ( dist >= qdist ) dist = dist - qdist;

		// out of order? penalize by 1 unit
		if ( p2 < p1 ) dist += 1;
	}

	// TODO: use left and right diversity if no matching query term
	// is on the left or right
	//score *= m_msg39req->m_scoringWeights.m_diversityWeights[div1];
	//score *= m_msg39req->m_scoringWeights.m_diversityWeights[div2];

	// good density?
	score = 100 * denw1 * denw2;

	// wikipedia phrase weight
	//score *= ts;

	// hashgroup modifier
	score *= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg1];
	score *= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg2];

	// if synonym or alternate word form
	if ( Posdb::getIsSynonym(wpi) ) score *= m_msg39req->m_synonymWeight;
	if ( Posdb::getIsSynonym(wpj) ) score *= m_msg39req->m_synonymWeight;
	//if ( m_bflags[i] & BF_SYNONYM ) score *= m_msg39req->m_synonymWeight;
	//if ( m_bflags[j] & BF_SYNONYM ) score *= m_msg39req->m_synonymWeight;

	// word spam weights
	score *= spamw1 * spamw2;

	// mod by distance
	score /= (dist + 1.0);

	// tmp hack
	//score *= (dist+1.0);

	logTrace(g_conf.m_logTracePosdb, "END. score=%f", score);
	return score;
}
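
//
// In getTermPairScoreForAny() below, qdist is the gap the two terms
// have in the query itself. Terms inside the same Wikipedia phrase
// keep their real query gap and earn the WIKI_WEIGHT boost; for
// 'time enough for love' the comments below want 'time' and 'love'
// 6 units apart (i.e. roughly 2 position units per query word, an
// assumption based on the qpos remarks in setQueryTermInfo()).
// Everything else defaults to qdist=2: right next to each other.
//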

// . advance two ptrs at the same time so it's just a linear scan
// . TODO: add all up, then basically taking a weight of the top 6 or so...
// . skip body terms not in the sliding window as defined by m_bestMinTermPairWindowPtrs[]
float PosdbTable::getTermPairScoreForAny ( int32_t i, int32_t j,
                                           const char *wpi, const char *wpj,
                                           const char *endi, const char *endj,
                                           DocIdScore *pdcs ) {
	// wiki phrase weight?
	float wts;

	logTrace(g_conf.m_logTracePosdb, "BEGIN.");

	int32_t qdist;

	// but if they are in the same wikipedia phrase
	// then try to keep their positions as in the query.
	// so for 'time enough for love' ideally we want
	// 'time' to be 6 units apart from 'love'
	if ( m_wikiPhraseIds[j] == m_wikiPhraseIds[i] &&
	     m_wikiPhraseIds[j] ) { // zero means not in a phrase
		qdist = m_qpos[j] - m_qpos[i];
		// wiki weight
		wts = (float)WIKI_WEIGHT;
	}
	else {
		// basically try to get query words as close
		// together as possible
		qdist = 2;
		// this should help fix
		// 'what is an unsecured loan' so we are more likely
		// to get the page that has that exact phrase in it.
		// yes, but hurts how to make a lock pick set.
		//qdist = qpos[j] - qpos[i];
		// wiki weight
		wts = 1.0;
	}

	bool inSameQuotedPhrase = false;
	if ( m_quotedStartIds[i] == m_quotedStartIds[j] && m_quotedStartIds[i] >= 0 ) {
		inSameQuotedPhrase = true;
	}

	// oops.. this was not counting non-space punct for 2 units
	// instead of 1
	if ( inSameQuotedPhrase ) {
		qdist = m_qpos[j] - m_qpos[i];
	}

	int32_t p1 = Posdb::getWordPos(wpi);
	int32_t p2 = Posdb::getWordPos(wpj);

	unsigned char hg1 = Posdb::getHashGroup ( wpi );
	unsigned char hg2 = Posdb::getHashGroup ( wpj );

	// reduce to either HASHGROUP_BODY/TITLE/INLINK/META
	unsigned char mhg1 = hg1;
	unsigned char mhg2 = hg2;
	if ( s_inBody[mhg1] ) {
		mhg1 = HASHGROUP_BODY;
	}
	if ( s_inBody[mhg2] ) {
		mhg2 = HASHGROUP_BODY;
	}

	unsigned char wsr1 = Posdb::getWordSpamRank(wpi);
	unsigned char wsr2 = Posdb::getWordSpamRank(wpj);

	float spamw1;
	float spamw2;
	if ( hg1 == HASHGROUP_INLINKTEXT ) {
		spamw1 = m_msg39req->m_scoringWeights.m_linkerWeights[wsr1];
	}
	else {
		spamw1 = m_msg39req->m_scoringWeights.m_wordSpamWeights[wsr1];
	}
	if ( hg2 == HASHGROUP_INLINKTEXT ) {
		spamw2 = m_msg39req->m_scoringWeights.m_linkerWeights[wsr2];
	}
	else {
		spamw2 = m_msg39req->m_scoringWeights.m_wordSpamWeights[wsr2];
	}

	// density weight
	float denw1 = m_msg39req->m_scoringWeights.m_densityWeights[Posdb::getDensityRank(wpi)];
	float denw2 = m_msg39req->m_scoringWeights.m_densityWeights[Posdb::getDensityRank(wpj)];

	bool firsti = true;
	bool firstj = true;
	float score;
	int32_t lowestScoreTermIdx = -1;
	float bestScores[MAX_TOP] = {0}; // make Coverity happy
	const char *bestwpi  [MAX_TOP];
	const char *bestwpj  [MAX_TOP];
	char bestmhg1 [MAX_TOP];
	char bestmhg2 [MAX_TOP];
	char bestFixed[MAX_TOP];
	int32_t numTop = 0;
	int32_t dist;
	bool fixedDistance;
	int32_t bro;
	char syn1;
	char syn2;

	for(;;) {
		// . if p1/p2 is in body and not in window, skip
		// . this is how we restrict all body terms to the winning
		//   sliding window
		if ( s_inBody[hg1] && wpi != m_bestMinTermPairWindowPtrs[i] ) {
			goto skip1;
		}
		if ( s_inBody[hg2] && wpj != m_bestMinTermPairWindowPtrs[j] ) {
			goto skip2;
		}

		// make this strictly < now and not <= because in the event
		// of bigram terms, where p1==p2 we'd like to advance p2/wj to
		// point to the non-syn single term in order to get a better score
		// to fix the 'search engine' query on gigablast.com
		if ( p1 <= p2 ) {
			// get distance
			dist = p2 - p1;

			// if in the same quoted phrase, order is bad!
			if ( inSameQuotedPhrase ) {
				// debug
				//log("dddx: i=%" PRId32" j=%" PRId32" dist=%" PRId32" qdist=%" PRId32" posi=%" PRId32" "
				//    "posj=%" PRId32, i,j,dist,qdist,p1,p2);

				// TODO: allow for off by 1
				// if it has punct in it then dist will be 3,
				// just a space or similar then dist should be 2.
				if ( dist > qdist && dist - qdist >= 2 ) {
					goto skip1;
				}
				if ( dist < qdist && qdist - dist >= 2 ) {
					goto skip1;
				}
			}

			// are either synonyms
			syn1 = Posdb::getIsSynonym(wpi);
			syn2 = Posdb::getIsSynonym(wpj);

			// if zero, make sure its 2. this happens when the same bigram
			// is used by both terms. i.e. street uses the bigram
			// 'street light' and so does 'light'. so the wordpositions
			// are exactly the same!
			if ( dist < 2 ) {
				dist = 2;
			}

			// fix distance if in different non-body hashgroups
			if ( dist < 50 ) {
				fixedDistance = false;
			}
			// body vs title, linktext vs title, linktext vs body
			else if ( mhg1 != mhg2 ) {
				dist = FIXED_DISTANCE;
				fixedDistance = true;
			}
			// link text to other link text
			else if ( mhg1 == HASHGROUP_INLINKTEXT ) {
				dist = FIXED_DISTANCE;
				fixedDistance = true;
			}
			else {
				fixedDistance = false;
			}

			// if both are link text and > 50 units apart that means
			// they are from different link texts
			//if ( hg1 == HASHGROUP_INLINKTEXT && dist > 50 ) goto skip1;

			// subtract from the dist the terms are apart in the query
			if ( dist >= qdist ) {
				dist = dist - qdist;
			}

			// good density?
			score = 100 * denw1 * denw2;

			// hashgroup modifier
			score *= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg1];
			score *= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg2];

			// if synonym or alternate word form
			if ( syn1 ) {
				score *= m_msg39req->m_synonymWeight;
			}
			if ( syn2 ) {
				score *= m_msg39req->m_synonymWeight;
			}

			// the new logic
			if ( Posdb::getIsHalfStopWikiBigram(wpi) ) {
				score *= WIKI_BIGRAM_WEIGHT;
			}
			if ( Posdb::getIsHalfStopWikiBigram(wpj) ) {
				score *= WIKI_BIGRAM_WEIGHT;
			}

			// word spam weights
			score *= spamw1 * spamw2;

			// huge title? do not allow 11th+ word to be weighted high
			//if ( hg1 == HASHGROUP_TITLE && dist > 20 )
			//	score /= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg1];

			// mod by distance
			score /= (dist + 1.0);

			// tmp hack
			//score *= (dist+1.0);

			// if our hg1/hg2 hashgroup pairing already exists
			// in the bestScores array we have to beat it and then
			// we have to replace that. we can only have one per,
			// except for linktext!
			bro = -1;
			for ( int32_t k = 0 ; k < numTop ; k++ ) {
				if ( bestmhg1[k] == mhg1 && hg1 != HASHGROUP_INLINKTEXT ) {
					bro = k;
					break;
				}
				if ( bestmhg2[k] == mhg2 && hg2 != HASHGROUP_INLINKTEXT ) {
					bro = k;
					break;
				}
			}

			if ( bro >= 0 ) {
				if ( score > bestScores[bro] ) {
					bestScores[bro] = score;
					bestwpi   [bro] = wpi;
					bestwpj   [bro] = wpj;
					bestmhg1  [bro] = mhg1;
					bestmhg2  [bro] = mhg2;
					bestFixed [bro] = fixedDistance;
				}
			}
			else if ( numTop < m_realMaxTop ) { // MAX_TOP ) {
				bestScores[numTop] = score;
				bestwpi   [numTop] = wpi;
				bestwpj   [numTop] = wpj;
				bestmhg1  [numTop] = mhg1;
				bestmhg2  [numTop] = mhg2;
				bestFixed [numTop] = fixedDistance;
				numTop++;
			}
			else if ( lowestScoreTermIdx >= 0 && score > bestScores[lowestScoreTermIdx] ) {
				bestScores[lowestScoreTermIdx] = score;
				bestwpi   [lowestScoreTermIdx] = wpi;
				bestwpj   [lowestScoreTermIdx] = wpj;
				bestmhg1  [lowestScoreTermIdx] = mhg1;
				bestmhg2  [lowestScoreTermIdx] = mhg2;
				bestFixed [lowestScoreTermIdx] = fixedDistance;
			}

			// set "lowestScoreTermIdx" to the lowest score out of the top scores
			if ( numTop >= m_realMaxTop ) { // MAX_TOP ) {
				lowestScoreTermIdx = 0;
				for ( int32_t k = 1 ; k < m_realMaxTop ; k++ ) { //MAX_TOP;k++
					if ( bestScores[k] > bestScores[lowestScoreTermIdx] ) {
						continue;
					}
					lowestScoreTermIdx = k;
				}
			}

		skip1:
			// first key is 12 bytes
			if ( firsti ) {
				wpi += 6;
				firsti = false;
			}

			// advance
			wpi += 6;

			// end of list?
			if ( wpi >= endi ) {
				break; // exit for(;;) loop
			}

			// exhausted?
			if ( Posdb::getKeySize ( wpi ) != 6 ) {
				// sometimes there is posdb index corruption and
				// we have a 12 byte key with the same docid but
				// different siterank or langid because it was
				// not deleted right!
				if ( (uint64_t)Posdb::getDocId(wpi) != m_docId ) {
					gbshutdownAbort(true);
				}
				// re-set this i guess
				firsti = true;
			}

			// update. include G-bits?
			p1 = Posdb::getWordPos ( wpi );

			// hash group update
			hg1 = Posdb::getHashGroup ( wpi );

			// the "modified" hash group
			mhg1 = hg1;
			if ( s_inBody[mhg1] ) mhg1 = HASHGROUP_BODY;

			// update density weight in case hash group changed
			denw1 = m_msg39req->m_scoringWeights.m_densityWeights[Posdb::getDensityRank(wpi)];

			// word spam weight update
			if ( hg1 == HASHGROUP_INLINKTEXT ) {
				spamw1 = m_msg39req->m_scoringWeights.m_linkerWeights[Posdb::getWordSpamRank(wpi)];
			}
			else {
				spamw1 = m_msg39req->m_scoringWeights.m_wordSpamWeights[Posdb::getWordSpamRank(wpi)];
			}
		}
		else {
			// get distance
			dist = p1 - p2;

			// if in the same quoted phrase, order is bad!
			if ( inSameQuotedPhrase ) {
				// debug
				//log("dddy: i=%" PRId32" j=%" PRId32" dist=%" PRId32" qdist=%" PRId32" posi=%" PRId32" "
				//    "posj=%" PRId32, i,j,dist,qdist,p1,p2);
				goto skip2;
			}

			// if zero, make sure its 2. this happens when the same bigram
			// is used by both terms. i.e. street uses the bigram
			// 'street light' and so does 'light'. so the wordpositions
			// are exactly the same!
			if ( dist < 2 ) {
				dist = 2;
			}

			// fix distance if in different non-body hashgroups
			if ( dist < 50 ) {
				fixedDistance = false;
			}
			// body vs title, linktext vs title, linktext vs body
			else if ( mhg1 != mhg2 ) {
				dist = FIXED_DISTANCE;
				fixedDistance = true;
			}
			// link text to other link text
			else if ( mhg1 == HASHGROUP_INLINKTEXT ) {
				dist = FIXED_DISTANCE;
				fixedDistance = true;
			}
			else fixedDistance = false;

			// if both are link text and > 50 units apart that means
			// they are from different link texts
			//if ( hg1 == HASHGROUP_INLINKTEXT && dist > 50 ) goto skip2;

			// subtract from the dist the terms are apart in the query
			if ( dist >= qdist ) {
				dist = dist - qdist;
				// add 1 for being out of order
				dist += qdist - 1;
			}
			else {
				//dist = dist - qdist;
				// add 1 for being out of order
				dist += 1; // qdist - 1;
			}

			// compute score based on that junk
			//score = (MAXWORDPOS+1) - dist;

			// good diversity? unneeded for pair algo
			//score *= m_msg39req->m_scoringWeights.m_diversityWeights[div1];
			//score *= m_msg39req->m_scoringWeights.m_diversityWeights[div2];

			// good density?
			score = 100 * denw1 * denw2;

			// hashgroup modifier
			score *= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg1];
			score *= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg2];

			// if synonym or alternate word form
			if ( Posdb::getIsSynonym(wpi) ) {
				score *= m_msg39req->m_synonymWeight;
			}
			if ( Posdb::getIsSynonym(wpj) ) {
				score *= m_msg39req->m_synonymWeight;
			}
			//if ( m_bflags[i] & BF_SYNONYM ) score *= m_msg39req->m_synonymWeight;
			//if ( m_bflags[j] & BF_SYNONYM ) score *= m_msg39req->m_synonymWeight;

			// word spam weights
			score *= spamw1 * spamw2;

			// huge title? do not allow 11th+ word to be weighted high
			//if ( hg1 == HASHGROUP_TITLE && dist > 20 )
			//	score /= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg1];

			// mod by distance
			score /= (dist + 1.0);

			// tmp hack
			//score *= (dist+1.0);

			// if our hg1/hg2 hashgroup pairing already exists
			// in the bestScores array we have to beat it and then
			// we have to replace that. we can only have one per,
			// except for linktext!
			bro = -1;
			for ( int32_t k = 0 ; k < numTop ; k++ ) {
				if ( bestmhg1[k] == mhg1 && hg1 != HASHGROUP_INLINKTEXT ) {
					bro = k;
					break;
				}
				if ( bestmhg2[k] == mhg2 && hg2 != HASHGROUP_INLINKTEXT ) {
					bro = k;
					break;
				}
			}
			if ( bro >= 0 ) {
				if ( score > bestScores[bro] ) {
					bestScores[bro] = score;
					bestwpi   [bro] = wpi;
					bestwpj   [bro] = wpj;
					bestmhg1  [bro] = mhg1;
					bestmhg2  [bro] = mhg2;
					bestFixed [bro] = fixedDistance;
				}
			}
			// best?
			else if ( numTop < m_realMaxTop ) { // MAX_TOP ) {
				bestScores[numTop] = score;
				bestwpi   [numTop] = wpi;
				bestwpj   [numTop] = wpj;
				bestmhg1  [numTop] = mhg1;
				bestmhg2  [numTop] = mhg2;
				bestFixed [numTop] = fixedDistance;
				numTop++;
			}
			else if ( score > bestScores[lowestScoreTermIdx] ) {
				bestScores[lowestScoreTermIdx] = score;
				bestwpi   [lowestScoreTermIdx] = wpi;
				bestwpj   [lowestScoreTermIdx] = wpj;
				bestmhg1  [lowestScoreTermIdx] = mhg1;
				bestmhg2  [lowestScoreTermIdx] = mhg2;
				bestFixed [lowestScoreTermIdx] = fixedDistance;
			}

			// set "lowestScoreTermIdx" to the lowest score out of the top scores
			if ( numTop >= m_realMaxTop ) { // MAX_TOP ) {
				lowestScoreTermIdx = 0;
				for ( int32_t k = 1 ; k < m_realMaxTop ; k++ ) { //MAX_TOP;k++
					if ( bestScores[k] > bestScores[lowestScoreTermIdx] ) {
						continue;
					}
					lowestScoreTermIdx = k;
				}
			}

		skip2:
			// first key is 12 bytes
			if ( firstj ) {
				wpj += 6;
				firstj = false;
			}

			// advance
			wpj += 6;

			// end of list?
			if ( wpj >= endj ) {
				break; // exit for(;;) loop
			}

			// exhausted?
			if ( Posdb::getKeySize ( wpj ) != 6 ) {
				// sometimes there is posdb index corruption and
				// we have a 12 byte key with the same docid but
				// different siterank or langid because it was
				// not deleted right!
				if ( (uint64_t)Posdb::getDocId(wpj) != m_docId ) {
					gbshutdownAbort(true);
				}
				// re-set this i guess
				firstj = true;
			}

			// update
			p2 = Posdb::getWordPos ( wpj );

			// hash group update
			hg2 = Posdb::getHashGroup ( wpj );

			// the "modified" hash group
			mhg2 = hg2;
			if ( s_inBody[mhg2] ) {
				mhg2 = HASHGROUP_BODY;
			}

			// update density weight in case hash group changed
			denw2 = m_msg39req->m_scoringWeights.m_densityWeights[Posdb::getDensityRank(wpj)];

			// word spam weight update
			if ( hg2 == HASHGROUP_INLINKTEXT ) {
				spamw2 = m_msg39req->m_scoringWeights.m_linkerWeights[Posdb::getWordSpamRank(wpj)];
			}
			else {
				spamw2 = m_msg39req->m_scoringWeights.m_wordSpamWeights[Posdb::getWordSpamRank(wpj)];
			}
		}
	} // for(;;)

	// add up the top scores
	float sum = 0.0;
	for ( int32_t k = 0 ; k < numTop ; k++ ) {
		sum += bestScores[k];
	}

	if ( m_debug ) {
		for ( int32_t k = 0 ; k < numTop ; k++ )
			log(LOG_INFO, "posdb: best score #%" PRId32" = %f", k, bestScores[k]);
		log(LOG_INFO, "posdb: best score sum = %f", sum);
	}

	// wiki phrase weight
	sum *= wts;

	// mod by freq weight
	sum *= m_freqWeights[i];
	sum *= m_freqWeights[j];

	if ( m_debug ) {
		log(LOG_INFO, "posdb: best score final = %f", sum);
	}

	//
	// end the loop. return now if not collecting scoring info.
	//
	if ( ! pdcs ) {
		logTrace(g_conf.m_logTracePosdb, "END.");
		return sum;
	}

	// none? wtf?
	if ( numTop <= 0 ) {
		logTrace(g_conf.m_logTracePosdb, "END.");
		return sum;
	}

	//
	// now store the PairScores into the m_pairScoreBuf for this
	// top docid.
	//

	// point into buf
	PairScore *px = (PairScore *)m_pairScoreBuf.getBufPtr();
	int32_t need = sizeof(PairScore) * numTop;

	// point to that
	if ( pdcs->m_pairsOffset < 0 ) {
		pdcs->m_pairsOffset = m_pairScoreBuf.length();
	}

	// reset this i guess
	pdcs->m_pairScores = NULL;

	// sanity
	if ( m_pairScoreBuf.getAvail() < need ) {
		// m_pairScores will be NULL
		static bool s_first = true;
		if ( s_first ) {
			log("posdb: CRITICAL pair buf overflow");
		}
		s_first = false;
		logTrace(g_conf.m_logTracePosdb, "END.");
		return sum;
	}

	// increase buf ptr over this then
	m_pairScoreBuf.incrementLength(need);

	// set each of the top scoring terms individually
	for ( int32_t k = 0 ; k < numTop ; k++, px++ ) {
		pdcs->m_numPairs++;
		memset(px,0,sizeof(*px));
		const char *maxp1 = bestwpi[k];
		const char *maxp2 = bestwpj[k];
		score = bestScores[k];
		bool fixedDist = bestFixed[k];
		score *= wts;
		score *= m_freqWeights[i];
		score *= m_freqWeights[j];

		// we have to encode these bits into the mini merge now
		if ( Posdb::getIsHalfStopWikiBigram(maxp1) ) {
			score *= WIKI_BIGRAM_WEIGHT;
		}
		if ( Posdb::getIsHalfStopWikiBigram(maxp2) ) {
			score *= WIKI_BIGRAM_WEIGHT;
		}
		//if ( m_bflags[i] & BF_HALFSTOPWIKIBIGRAM )
		//if ( m_bflags[j] & BF_HALFSTOPWIKIBIGRAM )

		// wiki phrase weight
		px->m_finalScore = score;
		px->m_wordPos1 = Posdb::getWordPos(maxp1);
		px->m_wordPos2 = Posdb::getWordPos(maxp2);
		syn1 = Posdb::getIsSynonym(maxp1);
		syn2 = Posdb::getIsSynonym(maxp2);
		px->m_isSynonym1 = syn1;
		px->m_isSynonym2 = syn2;
		px->m_isHalfStopWikiBigram1 = Posdb::getIsHalfStopWikiBigram(maxp1);
		px->m_isHalfStopWikiBigram2 = Posdb::getIsHalfStopWikiBigram(maxp2);
		//px->m_isSynonym1 = ( m_bflags[i] & BF_SYNONYM );
		//px->m_isSynonym2 = ( m_bflags[j] & BF_SYNONYM );
		px->m_diversityRank1 = Posdb::getDiversityRank(maxp1);
		px->m_diversityRank2 = Posdb::getDiversityRank(maxp2);
		px->m_wordSpamRank1 = Posdb::getWordSpamRank(maxp1);
		px->m_wordSpamRank2 = Posdb::getWordSpamRank(maxp2);
		px->m_hashGroup1 = Posdb::getHashGroup(maxp1);
		px->m_hashGroup2 = Posdb::getHashGroup(maxp2);
		px->m_qdist = qdist;
		// bigram algorithm fix
		//if ( px->m_wordPos1 == px->m_wordPos2 )
		//	px->m_wordPos2 += 2;
		px->m_densityRank1 = Posdb::getDensityRank(maxp1);
		px->m_densityRank2 = Posdb::getDensityRank(maxp2);
		px->m_fixedDistance = fixedDist;
		px->m_qtermNum1 = m_qtermNums[i];
		px->m_qtermNum2 = m_qtermNums[j];
		//int64_t *termFreqs = (int64_t *)m_msg39req->ptr_termFreqs;
		//px->m_termFreq1 = termFreqs[px->m_qtermNum1];
		//px->m_termFreq2 = termFreqs[px->m_qtermNum2];
		px->m_tfWeight1 = m_freqWeights[i];
		px->m_tfWeight2 = m_freqWeights[j];
		px->m_bflags1 = m_bflags[i];
		px->m_bflags2 = m_bflags[j];

		// flag it as in same wiki phrase
		if ( almostEqualFloat(wts, (float)WIKI_WEIGHT) ) {
			px->m_inSameWikiPhrase = 1;
		}
		else {
			px->m_inSameWikiPhrase = 0;
		}

#ifdef _VALGRIND_
		VALGRIND_CHECK_MEM_IS_DEFINED(px,sizeof(*px));
#endif

		// only log for debug if it is one result
		if ( m_debug ) {
			// log each one for debug
			log(LOG_INFO, "posdb: result #%" PRId32" "
			    "i=%" PRId32" "
			    "j=%" PRId32" "
			    "termNum0=%" PRId32" "
			    "termNum1=%" PRId32" "
			    "finalscore=%f "
			    "tfw0=%f "
			    "tfw1=%f "
			    "fixeddist=%" PRId32" " // bool
			    "wts=%f "
			    "bflags0=%" PRId32" "
			    "bflags1=%" PRId32" "
			    "syn0=%" PRId32" "
			    "syn1=%" PRId32" "
			    "div0=%" PRId32" "
			    "div1=%" PRId32" "
			    "wspam0=%" PRId32" "
			    "wspam1=%" PRId32" "
			    "hgrp0=%s "
			    "hgrp1=%s "
			    "qdist=%" PRId32" "
			    "wpos0=%" PRId32" "
			    "wpos1=%" PRId32" "
			    "dens0=%" PRId32" "
			    "dens1=%" PRId32" ",
			    k, i, j,
			    px->m_qtermNum1, px->m_qtermNum2,
			    score,
			    m_freqWeights[i], m_freqWeights[j],
			    (int32_t)bestFixed[k],
			    wts,
			    (int32_t)m_bflags[i], (int32_t)m_bflags[j],
			    (int32_t)px->m_isSynonym1, (int32_t)px->m_isSynonym2,
			    (int32_t)px->m_diversityRank1, (int32_t)px->m_diversityRank2,
			    (int32_t)px->m_wordSpamRank1, (int32_t)px->m_wordSpamRank2,
			    getHashGroupString(px->m_hashGroup1), getHashGroupString(px->m_hashGroup2),
			    (int32_t)px->m_qdist,
			    (int32_t)px->m_wordPos1, (int32_t)px->m_wordPos2,
			    (int32_t)px->m_densityRank1, (int32_t)px->m_densityRank2 );
		}
	}

	// do the same but for second bests! so seo.cpp's top term pairs
	// algo can do a term insertion, and if that hurts the best pair
	// the 2nd best might cover for it. ideally, we'd have all the term
	// pairs for this algo, but i think we'll have to get by on just this.

	logTrace(g_conf.m_logTracePosdb, "END. sum=%f", sum);
	return sum;
}
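
//
// Final pair sum computed above: sum = (Σ kept pair scores) * wts
// * m_freqWeights[i] * m_freqWeights[j]. Illustrative numbers: two
// kept pairs scoring 7.1 and 5.0 inside one wiki phrase, with term
// frequency weights of 0.5 each, give 12.1 * WIKI_WEIGHT * 0.25.
//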

//
//
// TRY TO SPEED IT UP!!!
//
//


// returns false and sets g_errno on error
bool PosdbTable::setQueryTermInfo ( ) {
	logTrace(g_conf.m_logTracePosdb, "BEGIN.");

	// alloc space. assume max
	int32_t qneed = sizeof(QueryTermInfo) * m_q->m_numTerms;
	if ( ! m_qiBuf.reserve(qneed,"qibuf") ) {
		return false; // label it too!
	}

	// point to those
	QueryTermInfo *qtibuf = (QueryTermInfo *)m_qiBuf.getBufStart();

	int32_t nrg = 0;

	// assume not sorting by a numeric termlist
	m_sortByTermNum = -1;
	m_sortByTermNumInt = -1;

	// now we have score ranges for gbmin:price:1.99 etc.
	m_minScoreTermNum = -1;
	m_maxScoreTermNum = -1;

	// for gbminint:count:99 etc.
	m_minScoreTermNumInt = -1;
	m_maxScoreTermNumInt = -1;

	m_hasMaxSerpScore = false;
	if ( m_msg39req->m_minSerpDocId ) {
		m_hasMaxSerpScore = true;
	}

	for ( int32_t i = 0 ; i < m_q->m_numTerms ; i++ ) {
		QueryTerm *qt = &m_q->m_qterms[i];

		logTrace(g_conf.m_logTracePosdb, "i=%" PRId32 ", term=[%.*s], required=%s",
		         i, qt->m_termLen, qt->m_term, qt->m_isRequired ? "true" : "false");

		if ( ! qt->m_isRequired ) {
			continue;
		}

		// set this stuff
		const QueryWord *qw = qt->m_qword;

		// get one
		QueryTermInfo *qti = &qtibuf[nrg];

		// and set it
		qti->m_qt = qt;
		qti->m_qtermNum = i;

		// this is not good enough, we need to count
		// non-whitespace punct as 2 units not 1 unit
		// otherwise qdist gets thrown off and our phrasing fails.
		// so use QueryTerm::m_qpos just for this.
		qti->m_qpos = qw->m_posNum;
		qti->m_wikiPhraseId = qw->m_wikiPhraseId;
		qti->m_quotedStartId = qw->m_quoteStart;

		// is it gbsortby:?
		if ( qt->m_fieldCode == FIELD_GBSORTBYFLOAT ||
		     qt->m_fieldCode == FIELD_GBREVSORTBYFLOAT ) {
			m_sortByTermNum = i;
			m_sortByTermInfoNum = nrg;
		}

		if ( qt->m_fieldCode == FIELD_GBSORTBYINT ||
		     qt->m_fieldCode == FIELD_GBREVSORTBYINT ) {
			m_sortByTermNumInt = i;
			m_sortByTermInfoNumInt = nrg;
			// tell topTree to use int scores
			m_topTree->m_useIntScores = true;
		}

		// is it gbmin:price:1.99?
		if ( qt->m_fieldCode == FIELD_GBNUMBERMIN ) {
			m_minScoreTermNum = i;
			m_minScoreVal = qt->m_qword->m_float;
		}
		if ( qt->m_fieldCode == FIELD_GBNUMBERMAX ) {
			m_maxScoreTermNum = i;
			m_maxScoreVal = qt->m_qword->m_float;
		}
		if ( qt->m_fieldCode == FIELD_GBNUMBERMININT ) {
			m_minScoreTermNumInt = i;
			m_minScoreValInt = qt->m_qword->m_int;
		}
		if ( qt->m_fieldCode == FIELD_GBNUMBERMAXINT ) {
			m_maxScoreTermNumInt = i;
			m_maxScoreValInt = qt->m_qword->m_int;
		}

		// count
		int32_t nn = 0;

		// also add in bigram lists
		int32_t left  = qt->m_leftPhraseTermNum;
		int32_t right = qt->m_rightPhraseTermNum;

		// terms
		const QueryTerm *leftTerm  = qt->m_leftPhraseTerm;
		const QueryTerm *rightTerm = qt->m_rightPhraseTerm;

		bool leftAlreadyAdded  = false;
		bool rightAlreadyAdded = false;

		//
		// add the non-bigram list AFTER the
		// bigrams, which we like to do when we PREFER the bigram
		// terms because they are scored higher, specifically, as
		// in the case of being half stop wikipedia phrases like
		// "the tigers" for the query 'the tigers' we want to give
		// a slight bonus, 1.20x, for having that bigram since its
		// in wikipedia
		//

		//
		// add left bigram lists. BACKWARDS.
		//
		if ( left >= 0 && leftTerm && leftTerm->m_isWikiHalfStopBigram ) {
			// assume added
			leftAlreadyAdded = true;

			// get list
			RdbList *list = m_q->m_qterms[left].m_posdbListPtr;

			// add list ptr into our required group
			qti->m_subLists[nn] = list;

			// special flags
			qti->m_bigramFlags[nn] = BF_HALFSTOPWIKIBIGRAM;

			// before a pipe operator?
			if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED;

			// add list of member terms as well
			m_q->m_qterms[left].m_bitNum = nrg;

			// only really add if useful
			if ( list && !list->isEmpty() ) {
				nn++;
			}

			// add bigram synonyms! like "new jersey" bigram
			// has the synonym "nj"
			for ( int32_t k = 0 ; k < m_q->m_numTerms ; k++ ) {
				QueryTerm *bt = &m_q->m_qterms[k];
				if ( bt->m_synonymOf != leftTerm ) {
					continue;
				}
				list = m_q->m_qterms[k].m_posdbListPtr;
				qti->m_subLists[nn] = list;
				qti->m_bigramFlags[nn] = 0;
				qti->m_bigramFlags[nn] |= BF_HALFSTOPWIKIBIGRAM;
				qti->m_bigramFlags[nn] |= BF_SYNONYM;
				if ( qt->m_piped ) {
					qti->m_bigramFlags[nn] |= BF_PIPED;
				}
				// add list of member terms as well
				bt->m_bitNum = nrg;
				if ( list && !list->isEmpty() ) {
					nn++;
				}
			}
		}

		//
		// then the right bigram if also in a wiki half stop bigram
		//
		if ( right >= 0 && rightTerm && rightTerm->m_isWikiHalfStopBigram ) {
			// assume added
			rightAlreadyAdded = true;

			// get list
			RdbList *list = m_q->m_qterms[right].m_posdbListPtr;

			// add list ptr into our required group
			qti->m_subLists[nn] = list;

			// special flags
			qti->m_bigramFlags[nn] = BF_HALFSTOPWIKIBIGRAM;

			// before a pipe operator?
			if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED;

			// add list of member terms as well
			m_q->m_qterms[right].m_bitNum = nrg;

			// only really add if useful
			if ( list && !list->isEmpty() ) {
				nn++;
			}

			// add bigram synonyms! like "new jersey" bigram
			// has the synonym "nj"
			for ( int32_t k = 0 ; k < m_q->m_numTerms ; k++ ) {
				QueryTerm *bt = &m_q->m_qterms[k];
				if ( bt->m_synonymOf != rightTerm ) {
					continue;
				}
				list = m_q->m_qterms[k].m_posdbListPtr;
				qti->m_subLists[nn] = list;
				qti->m_bigramFlags[nn] = 0;
				qti->m_bigramFlags[nn] |= BF_HALFSTOPWIKIBIGRAM;
				qti->m_bigramFlags[nn] |= BF_SYNONYM;
				if ( qt->m_piped ) {
					qti->m_bigramFlags[nn] |= BF_PIPED;
				}
				// add list of member terms as well
				bt->m_bitNum = nrg;
				if ( list && !list->isEmpty() ) {
					nn++;
				}
			}
		}

		//
		// then the non-bigram termlist
		//
		// add to it. add backwards since we give precedence to
		// the first list and we want that to be the NEWEST list!
		RdbList *list = m_q->m_qterms[i].m_posdbListPtr;

		// add list ptr into our required group
		qti->m_subLists[nn] = list;

		// special flags
		qti->m_bigramFlags[nn] = 0;

		// before a pipe operator?
		if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED;

		// is it a negative term?
		if ( qt->m_termSign == '-' ) qti->m_bigramFlags[nn] |= BF_NEGATIVE;

		// numeric posdb termlist flags. instead of word position
		// they have a float stored there for sorting etc.
		if ( qt->m_fieldCode == FIELD_GBSORTBYFLOAT )      qti->m_bigramFlags[nn] |= BF_NUMBER;
		if ( qt->m_fieldCode == FIELD_GBREVSORTBYFLOAT )   qti->m_bigramFlags[nn] |= BF_NUMBER;
		if ( qt->m_fieldCode == FIELD_GBNUMBERMIN )        qti->m_bigramFlags[nn] |= BF_NUMBER;
		if ( qt->m_fieldCode == FIELD_GBNUMBERMAX )        qti->m_bigramFlags[nn] |= BF_NUMBER;
		if ( qt->m_fieldCode == FIELD_GBNUMBEREQUALFLOAT ) qti->m_bigramFlags[nn] |= BF_NUMBER;
		if ( qt->m_fieldCode == FIELD_GBSORTBYINT )        qti->m_bigramFlags[nn] |= BF_NUMBER;
		if ( qt->m_fieldCode == FIELD_GBREVSORTBYINT )     qti->m_bigramFlags[nn] |= BF_NUMBER;
		if ( qt->m_fieldCode == FIELD_GBNUMBERMININT )     qti->m_bigramFlags[nn] |= BF_NUMBER;
		if ( qt->m_fieldCode == FIELD_GBNUMBERMAXINT )     qti->m_bigramFlags[nn] |= BF_NUMBER;
		if ( qt->m_fieldCode == FIELD_GBNUMBEREQUALINT )   qti->m_bigramFlags[nn] |= BF_NUMBER;

		// add list of member terms
		qt->m_bitNum = nrg;

		// only really add if useful
		// no, because when inserting NEW (related) terms that are
		// not currently in the document, this list may initially
		// be empty.
		if ( list && !list->isEmpty() ) {
			nn++;
		}

		//
		// add left bigram now if not added above
		//
		if ( left >= 0 && ! leftAlreadyAdded ) {
			// get list
			list = m_q->m_qterms[left].m_posdbListPtr;

			// add list ptr into our required group
			qti->m_subLists[nn] = list;

			// special flags
			qti->m_bigramFlags[nn] = BF_BIGRAM;

			// before a pipe operator?
			if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED;

			// add list of member terms as well
			m_q->m_qterms[left].m_bitNum = nrg;

			// only really add if useful
			if ( list && !list->isEmpty() ) {
				nn++;
			}

			// add bigram synonyms! like "new jersey" bigram
			// has the synonym "nj"
			for ( int32_t k = 0 ; k < m_q->m_numTerms ; k++ ) {
				QueryTerm *bt = &m_q->m_qterms[k];
				if ( bt->m_synonymOf != leftTerm ) {
					continue;
				}
				list = m_q->m_qterms[k].m_posdbListPtr;
				qti->m_subLists[nn] = list;
				qti->m_bigramFlags[nn] = 0;
				qti->m_bigramFlags[nn] |= BF_SYNONYM;
				if ( qt->m_piped ) {
					qti->m_bigramFlags[nn] |= BF_PIPED;
				}
				// add list of member terms as well
				bt->m_bitNum = nrg;
				if ( list && !list->isEmpty() ) {
					nn++;
				}
			}
		}

		//
		// add right bigram now if not added above
		//
		if ( right >= 0 && ! rightAlreadyAdded ) {
			// get list
			list = m_q->m_qterms[right].m_posdbListPtr;

			// add list ptr into our required group
			qti->m_subLists[nn] = list;

			// special flags
			qti->m_bigramFlags[nn] = BF_BIGRAM;

			// before a pipe operator?
			if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED;

			// add list of member terms as well
			m_q->m_qterms[right].m_bitNum = nrg;

			// only really add if useful
			if ( list && !list->isEmpty() ) {
				nn++;
			}
like "new jersey" bigram // has the synonym "nj" for ( int32_t k = 0 ; k < m_q->m_numTerms ; k++ ) { QueryTerm *bt = &m_q->m_qterms[k]; if ( bt->m_synonymOf != rightTerm ) { continue; } list = m_q->m_qterms[k].m_posdbListPtr; qti->m_subLists[nn] = list; qti->m_bigramFlags[nn] = 0; qti->m_bigramFlags[nn] |= BF_SYNONYM; if (qt->m_piped) { qti->m_bigramFlags[nn]|=BF_PIPED; } // add list of member terms as well //qti->m_qtermList[nn] = bt; bt->m_bitNum = nrg; if ( list && !list->isEmpty() ) { nn++; } } } // // ADD SYNONYM TERMS // for ( int32_t k = 0 ; k < m_q->m_numTerms ; k++ ) { QueryTerm *qt2 = &m_q->m_qterms[k]; const QueryTerm *st = qt2->m_synonymOf; // skip if not a synonym of this term if ( st != qt ) { continue; } // its a synonym, add it! list = m_q->m_qterms[k].m_posdbListPtr; // add list ptr into our required group qti->m_subLists[nn] = list; // special flags qti->m_bigramFlags[nn] = BF_SYNONYM; // before a pipe operator? if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED; // set bitnum here i guess qt2->m_bitNum = nrg; // only really add if useful if ( list && !list->isEmpty() ) { nn++; } } // store # lists in required group. nn might be zero! qti->m_numSubLists = nn; // set the term freqs for this list group/set qti->m_termFreqWeight =((float *)m_msg39req->ptr_termFreqWeights)[i]; // crazy? if ( nn >= MAX_SUBLISTS ) { log("query: too many sublists. %" PRId32" >= %" PRId32, nn,(int32_t)MAX_SUBLISTS); logTrace(g_conf.m_logTracePosdb, "END."); return false; } // compute m_totalSubListsSize qti->m_totalSubListsSize = 0LL; for ( int32_t q = 0 ; q < qti->m_numSubLists ; q++ ) { // add list ptr into our required group RdbList *l = qti->m_subLists[q]; // get it int64_t listSize = l->getListSize(); // add it up qti->m_totalSubListsSize += listSize; } // count # required groups nrg++; } // // get the query term with the least data in posdb including syns // m_minTermListSize = 0; m_minTermListIdx = -1; int64_t grand = 0LL; for ( int32_t i = 0 ; i < nrg ; i++ ) { // compute total sizes int64_t total = 0LL; // get it QueryTermInfo *qti = &qtibuf[i]; // do not consider for first termlist if negative if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) { continue; } // add to it total = qti->m_totalSubListsSize; // add up this now grand += total; // get min if ( total < m_minTermListSize || m_minTermListIdx == -1 ) { m_minTermListSize = total; m_minTermListIdx = i; } } // bad! ringbuf[] was not designed for this nonsense! if ( m_minTermListIdx >= 255 ) { gbshutdownAbort(true); } // set this for caller to use to loop over the queryterminfos m_numQueryTermInfos = nrg; if(g_conf.m_logTracePosdb) { logTrace(g_conf.m_logTracePosdb, "m_numQueryTermInfos=%d", m_numQueryTermInfos); for(int i=0; i<m_numQueryTermInfos; i++) { const QueryTermInfo *qti = qtibuf + i; logTrace(g_conf.m_logTracePosdb, " qti[%d]: m_numSubLists=%d m_qtermNum=%d m_qpos=%d", i, qti->m_numSubLists, qti->m_qtermNum, qti->m_qpos); for(int j=0; j<qti->m_numSubLists; j++) logTrace(g_conf.m_logTracePosdb, " sublist %d = %p", j, qti->m_subLists[j]); } } // . m_minTermListSize is set in setQueryTermInfo() // . how many docids do we have at most in the intersection? // . all keys are of same termid, so they are 12 or 6 bytes compressed // . assume 12 if each is a different docid int32_t maxDocIds = m_minTermListSize / 12; // store all interesected docids in here for new algo plus 1 byte vote int32_t need = maxDocIds * 6; // they could all be OR'd together! 

bool PosdbTable::findCandidateDocIds() {
	int64_t lastTime = gettimeofdayInMilliseconds();
	int64_t now;
	int64_t took;
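	//
	// Posdb key sizes assumed by the swap below: the first key of a
	// termlist is 18 bytes (termid + docid + position data), each new
	// docid within the list starts a 12-byte key, and additional
	// positions for the same docid are 6-byte keys. Moving the leading
	// 6 termid bytes out of the way turns the head of the list into
	// the same 12-byte docid records as the rest of it.
	//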
	//
	// Swap the top 6 bytes of each list.
	//
	// This gives a contiguous list of 12-byte docid/score entries
	// because the first record is always 18 bytes (termid, docid, score)
	// and the rest are 6 bytes due to our termid compression.
	//
	// This makes the lists much much easier to work with, but we have
	// to remember to swap back when done! (but we dont...)
	//
	for ( int32_t k = 0 ; k < m_q->m_numTerms ; k++ ) {
		// count
		int64_t total = 0LL;
		// get the list
		RdbList *list = m_q->m_qterms[k].m_posdbListPtr;
		// skip if null
		if ( ! list ) {
			continue;
		}
		// skip if list is empty, too
		if ( list->isEmpty() ) {
			continue;
		}
		// tally
		total += list->getListSize();
		// point to start
		char *p = list->getList();

		// remember to swap back when done!!
		char ttt[12];
		memcpy ( ttt   , p      , 12 );
		memcpy ( p     , p + 12 ,  6 );
		memcpy ( p + 6 , ttt    , 12 );

		// point to the low "hks" bytes now, skipping the termid
		p += 6;

		// turn half bit on. first key is now 12 bytes!!
		*p |= 0x02;

		// MANGLE the list
		list->setListSize(list->getListSize() - 6);
		list->setList(p);

		logTrace(g_conf.m_logTracePosdb, "termList #%" PRId32" totalSize=%" PRId64, k, total);

		// print total list sizes
		if ( m_debug ) {
			log(LOG_INFO, "query: termlist #%" PRId32" totalSize=%" PRId64, k, total);
		}
	}

	// point to our array of query term infos set in setQueryTermInfos()
	QueryTermInfo *qtibuf = (QueryTermInfo *)m_qiBuf.getBufStart();

	// setQueryTermInfos() should have set how many we have
	if ( m_numQueryTermInfos == 0 ) {
		log(LOG_DEBUG, "query: NO REQUIRED TERMS IN QUERY!");
		logTrace(g_conf.m_logTracePosdb, "END, m_numQueryTermInfos = 0");
		return false;
	}

	// . if smallest required list is empty, 0 results
	// . also set in setQueryTermInfo
	if ( m_minTermListSize == 0 && ! m_q->m_isBoolean ) {
		logTrace(g_conf.m_logTracePosdb, "END, m_minTermListSize = 0 and not boolean");
		return false;
	}

	int32_t listGroupNum = 0;

	// if all non-negative query terms are in the same wikiphrase then
	// we can apply the WIKI_WEIGHT in getMaxPossibleScore() which
	// should help us speed things up!
	// @@@ BR: Why should this give a boost at all.. ?
	m_allInSameWikiPhrase = true;
	for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
		// get it
		const QueryTermInfo *qti = &qtibuf[i];

		// skip if negative query term
		if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) {
			continue;
		}

		// skip if numeric field like gbsortby:price gbmin:price:1.23
		if ( qti->m_bigramFlags[0] & BF_NUMBER ) {
			continue;
		}

		// set it
		if ( qti->m_wikiPhraseId == 1 ) {
			continue; //@@@ BR: Why only check for id = 1 ??
		}

		// stop
		m_allInSameWikiPhrase = false;
		break;
	}
	logTrace(g_conf.m_logTracePosdb, "m_allInSameWikiPhrase: %s", (m_allInSameWikiPhrase?"true":"false") );
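
	//
	// m_docIdVoteBuf layout, per the sizing in setQueryTermInfo(): one
	// 6-byte entry per candidate docid, apparently 5 docid bytes plus
	// 1 "vote" byte holding the listGroupNum of the last term group
	// that matched it. A docid survives the intersection only if every
	// required (non-negative) term group votes for it.
	//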
// i would think scanning and docid voting for 200MB of termlists // should take like 50-100ms for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) { // skip if we did it above when allocating the vote buffer if ( i == m_minTermListIdx ) { continue; } // get it const QueryTermInfo *qti = &qtibuf[i]; // do not consider for adding if negative ('my house -home') if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) { continue; } // inc the group number ("score"), which will be set on the docids // in addDocIdVotes if they match. listGroupNum++; // if it hits 256 then wrap back down to 1 if ( listGroupNum >= 256 ) { listGroupNum = 1; } // add it addDocIdVotes ( qti, listGroupNum ); } logTrace(g_conf.m_logTracePosdb, "Added DocIdVotes"); // // remove the negative query term docids from our docid vote buf // for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) { // skip if we did it above if ( i == m_minTermListIdx ) { continue; } // get it const QueryTermInfo *qti = &qtibuf[i]; // do not consider for adding if negative ('my house -home') if ( ! (qti->m_bigramFlags[0] & BF_NEGATIVE) ) { continue; } // delete the docid from the vote buffer delDocIdVotes ( qti ); } logTrace(g_conf.m_logTracePosdb, "Removed DocIdVotes for negative query terms"); } if ( m_debug ) { now = gettimeofdayInMilliseconds(); took = now - lastTime; log(LOG_INFO, "posdb: new algo phase (find matching docids) took %" PRId64" ms", took); lastTime = now; } // // NOW FILTER EVERY SUBLIST to just the docids in m_docIdVoteBuf. // Filter in place so it is destructive. i would think // that doing a filter on 200MB of termlists wouldn't be more than // 50-100ms since we can read 4GB/s from main memory. // delNonMatchingDocIdsFromSubLists(); logTrace(g_conf.m_logTracePosdb, "Shrunk SubLists"); if(g_conf.m_logTracePosdb) { log(LOG_TRACE,"Shrunk sublists, m_numQueryTermInfos=%d", m_numQueryTermInfos); for(int i=0; i<m_numQueryTermInfos; i++) { log(LOG_TRACE," qti #%d: m_numSubLists=%d m_numMatchingSubLists=%d", i, qtibuf[i].m_numSubLists, qtibuf[i].m_numMatchingSubLists); for(int j=0; j<qtibuf[i].m_numMatchingSubLists; j++) log(LOG_TRACE," matchlist #%d: %d bytes %p - %p", j, qtibuf[i].m_matchingSubListSize[j], qtibuf[i].m_matchingSubListStart[j], qtibuf[i].m_matchingSubListEnd[j]); } } if ( m_debug ) { now = gettimeofdayInMilliseconds(); took = now - lastTime; log(LOG_INFO, "posdb: new algo phase (shrink sublists) took %" PRId64" ms", took); } return true; } bool PosdbTable::genDebugScoreInfo1(int32_t *numProcessed, int32_t *topCursor, bool *docInThisFile, QueryTermInfo *qtibuf) { *docInThisFile = false; // did we get enough score info? if ( *numProcessed >= m_msg39req->m_docsToGet ) { logTrace(g_conf.m_logTracePosdb, "Too many docs processed. m_docId=%" PRId64 ". Reached msg39 m_docsToGet: %" PRId32 ", SKIPPED", m_docId, *numProcessed); return true; } // loop back up here if the docid is from a previous range nextNode: // this means the top tree is basically empty if ( *topCursor == -1 ) { logTrace(g_conf.m_logTracePosdb, "topCursor is -1, SKIPPED"); return true; } // get the #1 docid/score node # if ( *topCursor == -9 ) { // if our query had a quoted phrase, might have had no // docids in the top tree!
getHighNode() can't handle // that so handle it here if ( m_topTree->getNumUsedNodes() == 0 ) { logTrace(g_conf.m_logTracePosdb, "Num used nodes is 0, SKIPPED"); return true; } // otherwise, initialize topCursor *topCursor = m_topTree->getHighNode(); } // get current node TopNode *tn = m_topTree->getNode ( *topCursor ); // advance *topCursor = m_topTree->getPrev ( *topCursor ); // shortcut m_docId = tn->m_docId; // skip if not in our range! the top tree now holds // all the winners from previous docid ranges. msg39 // now processes the search results in docid range chunks to avoid // OOM conditions. if ( m_msg39req->m_minDocId != -1 && m_msg39req->m_maxDocId != -1 && ( m_docId < (uint64_t)m_msg39req->m_minDocId || m_docId >= (uint64_t)m_msg39req->m_maxDocId ) ) { logTrace(g_conf.m_logTracePosdb, "DocId %" PRIu64 " does not match docId range %" PRIu64 " - %" PRIu64 ", SKIPPED", m_docId, (uint64_t)m_msg39req->m_minDocId, (uint64_t)m_msg39req->m_maxDocId); goto nextNode; } *docInThisFile = m_documentIndexChecker->exists(m_docId); if( !(*docInThisFile) ) { logTrace(g_conf.m_logTracePosdb, "DocId %" PRId64 " is not in this file, SKIPPED", m_docId); return false; } logTrace(g_conf.m_logTracePosdb, "DocId %" PRId64 " - setting up score buffer", m_docId); (*numProcessed)++; // set query termlists in all sublists for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) { // get it QueryTermInfo *qti = &qtibuf[i]; // do not advance negative termlist cursor if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) { continue; } // do each sublist for ( int32_t j = 0 ; j < qti->m_numMatchingSubLists ; j++ ) { // get termlist for that docid const char *xlist = qti->m_matchingSubListStart[j]; const char *xlistEnd = qti->m_matchingSubListEnd[j]; const char *xp = getWordPosList ( m_docId, xlist, xlistEnd - xlist); // not there? xp will be NULL qti->m_matchingSubListSavedCursor[j] = xp; // if not there make cursor NULL as well if ( ! xp ) { qti->m_matchingSubListCursor[j] = NULL; continue; } // skip over docid list xp += 12; for ( ; ; ) { // do not breach sublist if ( xp >= xlistEnd ) { break; } // break if 12 byte key: another docid! if ( !(xp[0] & 0x04) ) { break; } // skip over 6 byte key xp += 6; } // point to docid sublist end qti->m_matchingSubListCursor[j] = xp; } } return false; } bool PosdbTable::genDebugScoreInfo2(DocIdScore *dcs, int32_t *lastLen, uint64_t *lastDocId, char siteRank, float score, int32_t intScore, char docLang) { char *sx; char *sxEnd; int32_t pairOffset; int32_t pairSize; int32_t singleOffset; int32_t singleSize; dcs->m_siteRank = siteRank; dcs->m_finalScore = score; // a double can capture an int without dropping any bits, // unlike a mere float if ( m_sortByTermNumInt >= 0 ) { dcs->m_finalScore = (double)intScore; } dcs->m_docId = m_docId; dcs->m_numRequiredTerms = m_numQueryTermInfos; dcs->m_docLang = docLang; logTrace(g_conf.m_logTracePosdb, "m_docId=%" PRId64 ", *lastDocId=%" PRId64 "", m_docId, *lastDocId); // ensure enough room; we can't allocate in a thread!
if ( m_scoreInfoBuf.getAvail()<(int32_t)sizeof(DocIdScore)+1) { logTrace(g_conf.m_logTracePosdb, "END, NO ROOM"); return true; } // if same as last docid, overwrite it since we have a higher // siterank or langid i guess if ( m_docId == *lastDocId ) { m_scoreInfoBuf.m_length = *lastLen; } // save that int32_t len = m_scoreInfoBuf.m_length; // copy into the safebuf for holding the scoring info #ifdef _VALGRIND_ VALGRIND_CHECK_MEM_IS_DEFINED(dcs,sizeof(DocIdScore)); #endif m_scoreInfoBuf.safeMemcpy ( (char *)dcs, sizeof(DocIdScore) ); // save that *lastLen = len; // save it *lastDocId = m_docId; // try to fix dup docid problem! it was hitting the // overflow check right above here... strange!!! //m_docIdTable.removeKey ( &docId ); ///////////////////////////// // // . docid range split HACK... // . if we are doing docid range splitting, we process // the search results separately in disjoint docid ranges. // . so because we still use the same m_scoreInfoBuf etc. // for each split we process, we must remove info from // a top docid of a previous split that got supplanted by // a docid of this docid-range split, so we do not breach // the allocated buffer size. // . so find out which docid we replaced // in the top tree, and replace his entry in scoreinfobuf // as well! // . his info was already added to m_pairScoreBuf in the // getTermPairScoreForAny() function // ////////////////////////////// // the top tree remains persistent between docid ranges. // and so does the score buf. so we must replace scores // if the docid gets replaced by a better scoring docid // from a following docid range of different docids. // However, scanning the docid score buffer each time is // really slow, so we need to get the kicked out docid // from the top tree and use that to store its offset // into m_scoreInfoBuf so we can remove it. DocIdScore *si; // only kick out docids from the score buffer when there // is no room left... if ( m_scoreInfoBuf.getAvail() >= (int)sizeof(DocIdScore ) ) { logTrace(g_conf.m_logTracePosdb, "END, OK. m_docId=%" PRId64 ", *lastDocId=%" PRId64 "", m_docId, *lastDocId); return true; } sx = m_scoreInfoBuf.getBufStart(); sxEnd = sx + m_scoreInfoBuf.length(); // if we have not supplanted anyone yet, be on our way for ( ; sx < sxEnd ; sx += sizeof(DocIdScore) ) { si = (DocIdScore *)sx; // if top tree no longer has this docid, we must // remove its associated scoring info so we do not // breach our scoring info bufs if ( ! m_topTree->hasDocId( si->m_docId ) ) { break; } } // might not be full yet if ( sx >= sxEnd ) { logTrace(g_conf.m_logTracePosdb, "END, OK 2"); return true; } // must be there! if ( !
si ) { gbshutdownAbort(true); } // note it because it is slow // this is only used if getting score info, which is // not the default when getting an xml or json feed logTrace(g_conf.m_logTracePosdb, "Kicking out docid %" PRId64" from score buf", si->m_docId); // get his single and pair offsets pairOffset = si->m_pairsOffset; pairSize = si->m_numPairs * sizeof(PairScore); singleOffset = si->m_singlesOffset; singleSize = si->m_numSingles * sizeof(SingleScore); // nuke him m_scoreInfoBuf .removeChunk1 ( sx, sizeof(DocIdScore) ); // and his related info m_pairScoreBuf .removeChunk2 ( pairOffset , pairSize ); m_singleScoreBuf.removeChunk2 ( singleOffset , singleSize ); // adjust offsets of remaining scores sx = m_scoreInfoBuf.getBufStart(); for ( ; sx < sxEnd ; sx += sizeof(DocIdScore) ) { si = (DocIdScore *)sx; if ( si->m_pairsOffset > pairOffset ) { si->m_pairsOffset -= pairSize; } if ( si->m_singlesOffset > singleOffset ) { si->m_singlesOffset -= singleSize; } } // adjust this too! *lastLen -= sizeof(DocIdScore); logTrace(g_conf.m_logTracePosdb, "Returning false"); return false; } void PosdbTable::logDebugScoreInfo(int32_t loglevel) { DocIdScore *si; char *sx; char *sxEnd; logTrace(g_conf.m_logTracePosdb, "BEGIN"); sx = m_scoreInfoBuf.getBufStart(); sxEnd = sx + m_scoreInfoBuf.length(); log(loglevel, "DocId scores in m_scoreInfoBuf:"); for ( ; sx < sxEnd ; sx += sizeof(DocIdScore) ) { si = (DocIdScore *)sx; log(loglevel, " docId: %14" PRIu64 ", score: %f", si->m_docId, si->m_finalScore); // if top tree no longer has this docid, we must // remove its associated scoring info so we do not // breach our scoring info bufs if ( ! m_topTree->hasDocId( si->m_docId ) ) { log(loglevel, " ^ NOT in topTree anymore!"); } } logTrace(g_conf.m_logTracePosdb, "END"); } void PosdbTable::removeScoreInfoForDeletedDocIds() { DocIdScore *si; char *sx; char *sxEnd; int32_t pairOffset; int32_t pairSize; int32_t singleOffset; int32_t singleSize; logTrace(g_conf.m_logTracePosdb, "BEGIN"); sx = m_scoreInfoBuf.getBufStart(); sxEnd = sx + m_scoreInfoBuf.length(); for ( ; sx < sxEnd ; sx += sizeof(DocIdScore) ) { si = (DocIdScore *)sx; // if top tree no longer has this docid, we must // remove its associated scoring info so we do not // breach our scoring info bufs if ( m_topTree->hasDocId( si->m_docId ) ) { continue; } logTrace(g_conf.m_logTracePosdb, "Removing old score info for docId %" PRId64 "", si->m_docId); // get his single and pair offsets pairOffset = si->m_pairsOffset; pairSize = si->m_numPairs * sizeof(PairScore); singleOffset = si->m_singlesOffset; singleSize = si->m_numSingles * sizeof(SingleScore); // nuke him m_scoreInfoBuf .removeChunk1 ( sx, sizeof(DocIdScore) ); // and his related info m_pairScoreBuf .removeChunk2 ( pairOffset , pairSize ); m_singleScoreBuf.removeChunk2 ( singleOffset , singleSize ); // adjust offsets of remaining scores sx = m_scoreInfoBuf.getBufStart(); for ( ; sx < sxEnd ; sx += sizeof(DocIdScore) ) { si = (DocIdScore *)sx; if ( si->m_pairsOffset > pairOffset ) { si->m_pairsOffset -= pairSize; } if ( si->m_singlesOffset > singleOffset ) { si->m_singlesOffset -= singleSize; } } sxEnd -= sizeof(DocIdScore); } logTrace(g_conf.m_logTracePosdb, "END"); } // Pre-advance each termlist's cursor to skip to next docid. // // Set QueryTermInfo::m_matchingSubListCursor to NEXT docid // Set QueryTermInfo::m_matchingSubListSavedCursor to CURRENT docid // of each termlist so we are ready for a quick skip over this docid.
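//
// Key-size convention relied on by the cursor advancing below (and by
// the mini-merge code): the two low "compression" bits of a key's first
// byte tell its size:
//   (key[0] & 0x06) == 0x06  ->  6-byte key, same docid as previous key
//   (key[0] & 0x06) == 0x02  -> 12-byte key, starts a new docid
//   (key[0] & 0x06) == 0x00  -> 18-byte key, starts a new termid (should
//                               not appear here since the lists were
//                               mangled in findCandidateDocIds())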
// // TODO: use just a single array of termlist ptrs perhaps, // then we can remove them when they go NULL. and we'd save a little // time not having a nested loop. bool PosdbTable::advanceTermListCursors(const char *docIdPtr, QueryTermInfo *qtibuf) { logTrace(g_conf.m_logTracePosdb, "BEGIN"); for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) { // get it QueryTermInfo *qti = &qtibuf[i]; // do not advance negative termlist cursor if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) { continue; } // // In the first pass, sublist data is initialized by delNonMatchingDocIdsFromSubLists. // In the second pass (to get detailed scoring info for UI output), they are initialized above // for ( int32_t j = 0 ; j < qti->m_numMatchingSubLists ; j++ ) { // shortcuts const char *xc = qti->m_matchingSubListCursor[j]; const char *xcEnd = qti->m_matchingSubListEnd[j]; // exhausted? (we can't make cursor NULL because // getMaxPossibleScore() needs the last ptr) // must match docid if ( xc >= xcEnd || *(int32_t *)(xc+8) != *(int32_t *)(docIdPtr+1) || (*(char *)(xc+7)&0xfc) != (*(char *)(docIdPtr)&0xfc) ) { // flag it as not having the docid qti->m_matchingSubListSavedCursor[j] = NULL; // skip this sublist if it does not have our docid continue; } // save it qti->m_matchingSubListSavedCursor[j] = xc; // get new docid //log("new docid %" PRId64,Posdb::getDocId(xc) ); // advance the cursors. skip our 12 xc += 12; // then skip any following 6 byte keys because they // share the same docid for ( ; ; xc += 6 ) { // end of whole termlist? if ( xc >= xcEnd ) { break; } // sanity. no 18 byte keys allowed if ( (*xc & 0x06) == 0x00 ) { // i've seen this triggered on gk28. // a dump of posdb for the termlist // for 'post' had corruption in it, // yet its twin, gk92 did not. the // corruption could have occurred // anywhere from nov 2012 to may 2013, // and the posdb file was never // re-merged! must have been blatant // disk malfunction? log("posdb: encountered corrupt posdb list. bailing."); logTrace(g_conf.m_logTracePosdb, "END."); return false; //gbshutdownAbort(true); } // the next docid? it will be a 12 byte key. if ( ! (*xc & 0x04) ) { break; } } // assign to next docid word position list qti->m_matchingSubListCursor[j] = xc; } } logTrace(g_conf.m_logTracePosdb, "END"); return true; } #define RINGBUFSIZE 4096 // // TODO: consider skipping this pre-filter if it sucks, as it does // for 'search engine'. it might save time! // // Returns: // false - docid does not meet minimum score requirement // true - docid can potentially be a top scoring docid // bool PosdbTable::prefilterMaxPossibleScoreByDistance(const QueryTermInfo *qtibuf, const int32_t *qpos, float minWinningScore) { unsigned char ringBuf[RINGBUFSIZE+10]; // reset ring buf. make all slots 0xff. should be 1000 cycles or so. memset ( ringBuf, 0xff, sizeof(ringBuf) ); unsigned char qt; uint32_t wx; int32_t ourFirstPos = -1; int32_t qdist; logTrace(g_conf.m_logTracePosdb, "BEGIN"); // now to speed up 'time enough for love' query which does not // have many super high scoring guys on top we need a more restrictive // filter than getMaxPossibleScore() so let's pick one query term, // the one with the shortest termlist, and see how close it gets to // each of the other query terms. then score each of those pairs. // so quickly record the word positions of each query term into // a ring buffer of 4096 slots where each slot contains the // query term # (empty slots hold 0xff).
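// Illustration (not executed): with RINGBUFSIZE = 4096, a term at word
// position 70000 lands in slot 70000 & 4095 = 368. If the shortest-list
// term last occupied slot 360, the pair's apparent distance is 8 slots.
// Positions more than 4096 apart can alias into nearby slots, but since
// this is only a pre-filter an occasionally underestimated distance just
// lets a docid through to the real scoring below.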
logTrace(g_conf.m_logTracePosdb, "Ring buffer generation"); const QueryTermInfo *qtx = &qtibuf[m_minTermListIdx]; // populate ring buf just for this query term for ( int32_t k = 0 ; k < qtx->m_numMatchingSubLists ; k++ ) { // scan that sublist and add word positions const char *sub = qtx->m_matchingSubListSavedCursor[k]; // skip sublist if it's cursor is exhausted if ( ! sub ) { continue; } const char *end = qtx->m_matchingSubListCursor[k]; // add first key //int32_t wx = Posdb::getWordPos(sub); wx = (*((uint32_t *)(sub+3))) >> 6; // mod with 4096 wx &= (RINGBUFSIZE-1); // store it. 0 is legit. ringBuf[wx] = m_minTermListIdx; // set this ourFirstPos = wx; // skip first key sub += 12; // then 6 byte keys for ( ; sub < end ; sub += 6 ) { // get word position //wx = Posdb::getWordPos(sub); wx = (*((uint32_t *)(sub+3))) >> 6; // mod with 4096 wx &= (RINGBUFSIZE-1); // store it. 0 is legit. ringBuf[wx] = m_minTermListIdx; } } // now get query term closest to query term # m_minTermListIdx which // is the query term # with the shortest termlist // get closest term to m_minTermListIdx and the distance logTrace(g_conf.m_logTracePosdb, "Ring buffer generation 2"); for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) { if ( i == m_minTermListIdx ) { continue; } // get the query term info const QueryTermInfo *qti = &qtibuf[i]; // if we have a negative term, skip it if ( qti->m_bigramFlags[0] & (BF_NEGATIVE) ) { continue; } // store all his word positions into ring buffer AS WELL for ( int32_t k = 0 ; k < qti->m_numMatchingSubLists ; k++ ) { // scan that sublist and add word positions const char *sub = qti->m_matchingSubListSavedCursor[k]; // skip sublist if it's cursor is exhausted if ( ! sub ) { continue; } const char *end = qti->m_matchingSubListCursor[k]; // add first key //int32_t wx = Posdb::getWordPos(sub); wx = (*((uint32_t *)(sub+3))) >> 6; // mod with 4096 wx &= (RINGBUFSIZE-1); // store it. 0 is legit. ringBuf[wx] = i; // skip first key sub += 12; // then 6 byte keys for ( ; sub < end ; sub += 6 ) { // get word position //wx = Posdb::getWordPos(sub); wx = (*((uint32_t *)(sub+3))) >> 6; // mod with 4096 wx &= (RINGBUFSIZE-1); // store it. 0 is legit. ringBuf[wx] = i; } } // reset int32_t ourLastPos = -1; int32_t hisLastPos = -1; int32_t bestDist = 0x7fffffff; // how far is this guy from the man? for ( int32_t x = 0 ; x < (int32_t)RINGBUFSIZE ; ) { // skip next 4 slots if all empty. fast? if (*(uint32_t *)(ringBuf+x) == 0xffffffff) { x+=4; continue; } // skip if nobody if ( ringBuf[x] == 0xff ) { x++; continue; } // get query term # qt = ringBuf[x]; // if it's the man if ( qt == m_minTermListIdx ) { // record hisLastPos = x; // skip if we are not there yet if ( ourLastPos == -1 ) { x++; continue; } // try distance fix if ( x - ourLastPos < bestDist ) { bestDist = x - ourLastPos; } } // if us else if ( qt == i ) { // record ourLastPos = x; // skip if he's not recorded yet if ( hisLastPos == -1 ) { x++; continue; } // check dist if ( x - hisLastPos < bestDist ) { bestDist = x - hisLastPos; } } x++; } // compare last occurence of query term #x with our first occ. 
// since this is a RING buffer int32_t wrapDist = ourFirstPos + ((int32_t)RINGBUFSIZE-hisLastPos); if ( wrapDist < bestDist ) { bestDist = wrapDist; } // query distance qdist = qpos[m_minTermListIdx] - qpos[i]; // compute it float maxScore2 = getMaxPossibleScore(&qtibuf[i], bestDist, qdist, &qtibuf[m_minTermListIdx]); // -1 means it has inlink text so do not apply this constraint // to this docid because it is too difficult, since we // sum up the inlink text if ( maxScore2 < 0.0 ) { continue; } // if any one of these terms has a max score below the // worst score of the 10th result, then it cannot win. // @todo: BR. Really? ANY of them? if ( maxScore2 <= minWinningScore ) { logTrace(g_conf.m_logTracePosdb, "END - docid score too low"); return false; } } logTrace(g_conf.m_logTracePosdb, "END - docid score high enough"); return true; } // // Data for the current DocID found in sublists of each query term // is merged into a single list, so we end up with one list per query // term. // void PosdbTable::mergeTermSubListsForDocId(QueryTermInfo *qtibuf, char *miniMergeBuf, const char **miniMergedList, const char **miniMergedEnd, int *highestInlinkSiteRank) { logTrace(g_conf.m_logTracePosdb, "BEGIN."); // we got a docid that has all the query terms, so merge // each term's sublists into a single list just for this docid. // // all posdb keys for this docid should fit in here, the // mini merge buf: char *mptr = miniMergeBuf; char *mptrEnd = miniMergeBuf + 299000; char *lastMptr = NULL; // Merge each set of sublists, like we merge a term's list with // its two associated bigram lists, if present, the left bigram and // right bigram list. Merge all the synonym lists for that term // together as well, so if the term is 'run' we merge it with the // lists for 'running' 'ran' etc. logTrace(g_conf.m_logTracePosdb, "Merge sublists into a single list per query term"); for ( int32_t j = 0 ; j < m_numQueryTermInfos ; j++ ) { // get the query term info QueryTermInfo *qti = &qtibuf[j]; // just use the flags from first term i guess // NO! this loses the wikihalfstopbigram bit! so we gotta // add that in for the key i guess the same way we add in // the syn bits below!!!!! m_bflags [j] = qti->m_bigramFlags[0]; // if we have a negative term, skip it if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) { // need to make this NULL for getSiteRank() call below miniMergedList[j] = NULL; // if it's empty, that's good! continue; } // the merged list for term #j is here: miniMergedList[j] = mptr; bool isFirstKey = true; const char *nwp[MAX_SUBLISTS]; const char *nwpEnd[MAX_SUBLISTS]; char nwpFlags[MAX_SUBLISTS]; // populate the nwp[] arrays for merging int32_t nsub = 0; for ( int32_t k = 0 ; k < qti->m_numMatchingSubLists ; k++ ) { // NULL means does not have that docid if ( ! qti->m_matchingSubListSavedCursor[k] ) { continue; } // advanceTermListCursors() incremented m_matchingSubListCursor to // the next docid so use m_matchingSubListSavedCursor. nwp[nsub] = qti->m_matchingSubListSavedCursor[k]; nwpEnd[nsub] = qti->m_matchingSubListCursor[k]; nwpFlags[nsub] = qti->m_bigramFlags[k]; nsub++; } // if only one sublist had this docid, no need to merge // UNLESS it's a synonym list then we gotta set the // synbits on it, below!!! or a half stop wiki bigram like // the term "enough for" in the wiki phrase // "time enough for love" because we wanna reward that more! // this halfstopwikibigram bit is set in the individual keys // so we'd have to at least do a key cleansing, so we can't // do this shortcut right now...
mdw oct 10 2015 if ( nsub == 1 && (nwpFlags[0] & BF_NUMBER) && !(nwpFlags[0] & BF_SYNONYM) && !(nwpFlags[0] & BF_HALFSTOPWIKIBIGRAM) ) { miniMergedList[j] = nwp [0]; miniMergedEnd[j] = nwpEnd [0]; m_bflags[j] = nwpFlags[0]; continue; } // Merge the lists into a list in miniMergeBuf. // Get the min of each list bool currTermDone = false; char ks; do { int32_t mink = -1; ks = 0; for ( int32_t k = 0 ; k < nsub ; k++ ) { // skip if list is exhausted if ( ! nwp[k] ) { continue; } // auto winner? if ( mink == -1 ) { mink = k; continue; } if ( KEYCMP(nwp[k], nwp[mink], 6) < 0 ) { mink = k; // a new min... } } // all exhausted? merge next set of sublists then for term #j if ( mink == -1 ) { // continue outer "j < m_numQueryTermInfos" loop. currTermDone = true; } else { // get keysize ks = Posdb::getKeySize(nwp[mink]); // HACK OF CONFUSION: // // skip it if it's a query phrase term, like // "searchengine" is for the 'search engine' query // AND it has the synbit which means it was a bigram // in the doc (i.e. occurred as two separate terms) // // second check means it occurred as two separate terms // or could be like bob and occurred as "bob's". // see XmlDoc::hashWords3(). // nwp[mink][2] & 0x03 is the posdb entry original/synonym/hyponym/.. flags if ( ! ((nwpFlags[mink] & BF_BIGRAM) && (nwp[mink][2] & 0x03)) ) { // if it's the first key in our merged list, store the docid crap if ( isFirstKey ) { // store a 12 byte key in the merged list buffer memcpy ( mptr, nwp[mink], 12 ); // Detect highest siterank of inlinkers if ( Posdb::getHashGroup(mptr+6) == HASHGROUP_INLINKTEXT) { char inlinkerSiteRank = Posdb::getWordSpamRank(mptr+6); if(inlinkerSiteRank > *highestInlinkSiteRank) { *highestInlinkSiteRank = inlinkerSiteRank; } } // wipe out its syn bits and re-use our way mptr[2] &= 0xfc; // set the synbit so we know if it's a synonym of the term if ( nwpFlags[mink] & (BF_BIGRAM|BF_SYNONYM)) { mptr[2] |= 0x02; } // wiki half stop bigram? so for the query // 'time enough for love' the phrase term "enough for" // is a half stopword wiki bigram, because it is in // a phrase in wikipedia ("time enough for love") and // one of the two words in the phrase term is a // stop word. therefore we give it more weight than // just 'enough' by itself. if ( nwpFlags[mink] & BF_HALFSTOPWIKIBIGRAM ) { mptr[2] |= 0x01; } // make sure it's 12 bytes! it might have been // the first key for the termid, and 18 bytes. mptr[0] &= 0xf9; mptr[0] |= 0x02; // save it lastMptr = mptr; mptr += 12; isFirstKey = false; } else { // if matches last key word position, do not add! // we should add the bigram first if more important // since it should be added before the actual term // above in the sublist array. so if they are // wikihalfstop bigrams they will be added first, // otherwise, they are added after the regular term. // should fix double scoring bug for 'cheat codes' // query!
if ( Posdb::getWordPos(lastMptr) != Posdb::getWordPos(nwp[mink]) ) { memcpy ( mptr, nwp[mink], 6 ); // Detect highest siterank of inlinkers if ( Posdb::getHashGroup(mptr) == HASHGROUP_INLINKTEXT) { char inlinkerSiteRank = Posdb::getWordSpamRank(mptr); if(inlinkerSiteRank > *highestInlinkSiteRank) { *highestInlinkSiteRank = inlinkerSiteRank; } } // wipe out its syn bits and re-use our way mptr[2] &= 0xfc; // set the synbit so we know if it's a synonym of the term if ( nwpFlags[mink] & (BF_BIGRAM|BF_SYNONYM)) { mptr[2] |= 0x02; } if ( nwpFlags[mink] & BF_HALFSTOPWIKIBIGRAM ) { mptr[2] |= 0x01; } // if it was the first key of its list it may not // have its bit set for being 6 bytes now! so turn // on the 2 compression bits mptr[0] &= 0xf9; mptr[0] |= 0x06; // save it lastMptr = mptr; mptr += 6; } } } // advance the cursor over the key we used. nwp[mink] += ks; // Posdb::getKeySize(nwp[mink]); // exhausted? if ( nwp[mink] >= nwpEnd[mink] ) { nwp[mink] = NULL; } else if ( Posdb::getKeySize(nwp[mink]) != 6 ) { // or hit a different docid nwp[mink] = NULL; } } // mink != -1 //log("skipping ks=%" PRId32,(int32_t)ks); } while( !currTermDone && mptr < mptrEnd ); // merge more ... // wrap it up here since done merging miniMergedEnd[j] = mptr; //log(LOG_ERROR,"%s:%d: j=%" PRId32 ": miniMergedList[%" PRId32 "]=%p, miniMergedEnd[%" PRId32 "]=%p, mptr=%p, mptrEnd=%p, term=[%.*s] - TERM DONE", __func__, __LINE__, j, j, miniMergedList[j], j, miniMergedEnd[j], mptr, mptrEnd, qti->m_qt->m_termLen, qti->m_qt->m_term); } // breach? if ( mptr > miniMergeBuf + 300000 ) { gbshutdownAbort(true); } // Make sure miniMergedList[x] is NULL if it contains no values for(int32_t i=0; i < m_numQueryTermInfos; i++) { // skip if not part of score if ( m_bflags[i] & (BF_PIPED|BF_NEGATIVE) ) { continue; } // get list const char *plist = miniMergedList[i]; const char *plistEnd= miniMergedEnd[i]; int32_t psize = plistEnd - plist; if( !psize ) { // BR fix 20160918: Avoid working on empty lists where start and // end pointers are the same. This can happen if no positions are // copied to the merged list because they are all synonyms or // phrase terms. See "HACK OF CONFUSION" above. miniMergedList[i] = NULL; } // // sanity check for all // // test it. first key is 12 bytes. if ( psize && Posdb::getKeySize(plist) != 12 ) { log(LOG_ERROR,"%s:%s:%d: psize=%" PRId32 "", __FILE__, __func__, __LINE__, psize); gbshutdownAbort(true); } // next key is 6 if ( psize > 12 && Posdb::getKeySize(plist+12) != 6) { log(LOG_ERROR,"%s:%s:%d: next key size=%" PRId32 "", __FILE__, __func__, __LINE__, Posdb::getKeySize(plist+12)); gbshutdownAbort(true); } } logTrace(g_conf.m_logTracePosdb, "END."); } // // Nested for loops to score the term pairs.
// // Store best scores into the scoreMatrix so the sliding window // algorithm can use them from there to do sub-outs // void PosdbTable::createNonBodyTermPairScoreMatrix(const char **miniMergedList, const char **miniMergedEnd, float *scoreMatrix) { int32_t qdist; logTrace(g_conf.m_logTracePosdb, "BEGIN"); // scan over each query term (its synonyms are part of the // QueryTermInfo) for(int32_t i=0; i < m_numQueryTermInfos; i++) { // skip if not part of score if ( m_bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) { continue; } // and pair it with each other possible query term for ( int32_t j = i+1 ; j < m_numQueryTermInfos ; j++ ) { // skip if not part of score if ( m_bflags[j] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) { continue; } // but if they are in the same wikipedia phrase // then try to keep their positions as in the query. // so for 'time enough for love' ideally we want // 'time' to be 6 units apart from 'love' float wts; // zero means not in a phrase if ( m_wikiPhraseIds[j] == m_wikiPhraseIds[i] && m_wikiPhraseIds[j] ) { // . the distance between the terms in the query // . ideally we'd like this distance to be reflected // in the matched terms in the body qdist = m_qpos[j] - m_qpos[i]; // wiki weight wts = (float)WIKI_WEIGHT; // .50; } else { // basically try to get query words as close // together as possible qdist = 2; // this should help fix // 'what is an unsecured loan' so we are more likely // to get the page that has that exact phrase in it. // yes, but hurts 'how to make a lock pick set'. //qdist = qpos[j] - qpos[i]; // wiki weight wts = 1.0; } float maxnbtp; // // get score for term pair from non-body occurring terms // if ( miniMergedList[i] && miniMergedList[j] ) { maxnbtp = getMaxScoreForNonBodyTermPair( miniMergedList[i], miniMergedList[j], miniMergedEnd[i], miniMergedEnd[j], qdist); } else { maxnbtp = -1; } // it's -1 if one term is in the body/header/menu/etc. if ( maxnbtp < 0 ) { wts = -1.00; } else { wts *= maxnbtp; wts *= m_freqWeights[i]; wts *= m_freqWeights[j]; } // store in matrix for "sub out" algo below // when doing sliding window scoreMatrix[i*m_nqt+j] = wts; } } logTrace(g_conf.m_logTracePosdb, "END"); } // // Finds the lowest of the per-term best score sums. // Creates array of highest scoring non-body positions // float PosdbTable::getMinSingleTermScoreSum(const char **miniMergedList, const char **miniMergedEnd, const char **highestScoringNonBodyPos, DocIdScore *pdcs) { float minSingleScore = 999999999.0; bool scoredTerm = false; logTrace(g_conf.m_logTracePosdb, "BEGIN"); // Now add single word scores. // // Having inlink text from siterank 15 of max // diversity/density will result in the largest score, // but we add them all up... // // This should be highly negative if singles[i] has a '-' // termsign... for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) { if ( ! miniMergedList[i] ) { continue; } // skip if to the left of a pipe operator if( m_bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) { continue; } // sometimes there is no wordpos subtermlist for this docid // because it just has the bigram, like "streetlight" and not // the word "light" by itself for the query 'street light' //if ( miniMergedList[i] ) { // assume all word positions are in body //highestScoringNonBodyPos[i] = NULL; // This scans all word positions for this term. // // This should ignore occurrences in the body and only // focus on inlink text, etc. // // Sets "highestScoringNonBodyPos" to point to the winning word // position which does NOT occur in the body.
// // Adds up MAX_TOP top scores and returns that sum. // // pdcs is NULL unless currPassNum == INTERSECT_DEBUG_INFO float sts = getBestScoreSumForSingleTerm(i, miniMergedList[i], miniMergedEnd[i], pdcs, &highestScoringNonBodyPos[i]); scoredTerm = true; // sanity check if ( highestScoringNonBodyPos[i] && s_inBody[Posdb::getHashGroup(highestScoringNonBodyPos[i])] ) { gbshutdownAbort(true); } //sts /= 3.0; if ( sts < minSingleScore ) { minSingleScore = sts; } } if( !scoredTerm ) { minSingleScore = -1; } logTrace(g_conf.m_logTracePosdb, "END. minSingleScore=%f", minSingleScore); return minSingleScore; } // Like getTermPairScore, but uses the word positions currently pointed to by ptrs[i]. // Does NOT scan the word position lists. // Also tries to sub-out each term with the title or linktext wordpos term // pointed to by "highestScoringNonBodyPos[i]". // // OUTPUT: // m_bestMinTermPairWindowScore: The best minimum window score // m_bestMinTermPairWindowPtrs : Pointers to query term positions giving the best minimum score // void PosdbTable::findMinTermPairScoreInWindow(const char **ptrs, const char **highestScoringNonBodyPos, float *scoreMatrix) { int32_t qdist = 0; float minTermPairScoreInWindow = 999999999.0; bool scoredTerms = false; logTrace(g_conf.m_logTracePosdb, "BEGIN."); // TODO: only do this loop on the (i,j) pairs where i or j // is the term whose position got advanced in the sliding window. for ( int32_t i = 0 ; i < m_numQueryTermInfos; i++ ) { // skip if to the left of a pipe operator if ( m_bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue; // skip empty list if( !ptrs[i] ) { continue; } //if ( ptrs[i] ) wpi = ptrs[i]; // if term does not occur in body, sub-in the best term // from the title/linktext/etc. //else wpi = highestScoringNonBodyPos[i]; const char *wpi = ptrs[i]; // loop over other terms for(int32_t j = i + 1; j < m_numQueryTermInfos; j++) { // skip if to the left of a pipe operator if ( m_bflags[j] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue; // skip empty list if( !ptrs[j] ) { continue; } // TODO: use a cache using wpi/wpj as the key. //if ( ptrs[j] ) wpj = ptrs[j]; // if term does not occur in body, sub-in the best term // from the title/linktext/etc. //else wpj = highestScoringNonBodyPos[j]; const char *wpj = ptrs[j]; float wikiWeight; // in same wikipedia phrase? if ( m_wikiPhraseIds[j] == m_wikiPhraseIds[i] && // zero means not in a phrase m_wikiPhraseIds[j] ) { // try to get dist that matches qdist exactly qdist = m_qpos[j] - m_qpos[i]; // wiki weight wikiWeight = WIKI_WEIGHT; // .50; } else { // basically try to get query words as close // together as possible qdist = 2; // fix 'what is an unsecured loan' to get the // exact phrase with higher score //m_qdist = m_qpos[j] - m_qpos[i]; // wiki weight wikiWeight = 1.0; } // this will be -1 if wpi or wpj is NULL float max = getScoreForTermPair(wpi, wpj, 0, qdist); scoredTerms = true; // try sub-ing in the best title occurrence or best // inlink text occurrence.
cuz if the term is in the title // but these two terms are really far apart, we should // get a better score float score = getScoreForTermPair(highestScoringNonBodyPos[i], wpj, FIXED_DISTANCE, qdist); if ( score > max ) { max = score; } // a double pair sub should be covered in the // getMaxScoreForNonBodyTermPair() function score = getScoreForTermPair(highestScoringNonBodyPos[i], highestScoringNonBodyPos[j], FIXED_DISTANCE, qdist); if ( score > max ) { max = score; } score = getScoreForTermPair(wpi, highestScoringNonBodyPos[j], FIXED_DISTANCE, qdist); if ( score > max ) { max = score; } // wikipedia phrase weight if ( !almostEqualFloat(wikiWeight, 1.0) ) { max *= wikiWeight; } // term freqweight here max *= m_freqWeights[i] * m_freqWeights[j]; // use score from scoreMatrix if bigger if ( scoreMatrix[m_nqt*i+j] > max ) { max = scoreMatrix[m_nqt*i+j]; } // in same quoted phrase? if ( m_quotedStartIds[j] >= 0 && m_quotedStartIds[j] == m_quotedStartIds[i] ) { // no subouts allowed i guess if( !wpi ) { max = -1.0; } else if( !wpj ) { max = -1.0; } else { int32_t qdist = m_qpos[j] - m_qpos[i]; int32_t p1 = Posdb::getWordPos ( wpi ); int32_t p2 = Posdb::getWordPos ( wpj ); int32_t dist = p2 - p1; // must be in right order! if(dist < 0) { max = -1.0; } // allow for a discrepancy of 1 unit in case // there is a comma? i think we add an extra // unit else if(dist > qdist && dist - qdist > 1) { max = -1.0; //log("ddd1: i=%" PRId32" j=%" PRId32" " // "dist=%" PRId32" qdist=%" PRId32, // i,j,dist,qdist); } else if(dist < qdist && qdist - dist > 1) { max = -1.0; } } } // now we want the sliding window with the largest min // term pair score! if(max < minTermPairScoreInWindow) { minTermPairScoreInWindow = max; } } } // Is our best minimum score better than the current best minimum score? if ( minTermPairScoreInWindow <= m_bestMinTermPairWindowScore || !scoredTerms ) { logTrace(g_conf.m_logTracePosdb, "END."); return; } // Yep, our best minimum score is the highest so far m_bestMinTermPairWindowScore = minTermPairScoreInWindow; // Record term positions in winning window for(int32_t i=0; i < m_numQueryTermInfos; i++) { m_bestMinTermPairWindowPtrs[i] = ptrs[i]; } logTrace(g_conf.m_logTracePosdb, "END."); } float PosdbTable::getMinTermPairScoreSlidingWindow(const char **miniMergedList, const char **miniMergedEnd, const char **highestScoringNonBodyPos, const char **winnerStack, const char **xpos, float *scoreMatrix, DocIdScore *pdcs) { bool allNull; int32_t minPos = 0; logTrace(g_conf.m_logTracePosdb, "Sliding Window algorithm begins"); m_bestMinTermPairWindowPtrs = winnerStack; // Scan the terms that are in the body in a sliding window // // Compute the term pair score on just the terms in that // sliding window. that way, the term for a word like 'dog' // keeps the same word position when it is paired up with the // other terms. // // Compute the score the same way getTermPairScore() works so // we are on the same playing field // // Sub-out each term with its best scoring occurrence in the title // or link text or meta tag, etc. but it gets a distance penalty // of like 100 units or so. // // If term does not occur in the body, the sub-out approach should // fix that. // // Keep a matrix of the best scores between two terms from the // above double nested loop algo, and replace that pair if we // got a better score in the sliding window. // use special ptrs for the windows so we do not mangle // miniMergedList[] array because we use that below!
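// Sketch of the window loop that follows (comments only):
//   1. init xpos[i] to the first in-body position of term i for this docid
//   2. score the current window via findMinTermPairScoreInWindow()
//   3. advance the xpos[] cursor with the smallest word position to its
//      next in-body position; NULL it when its list is exhausted
//   4. repeat from 2 until every xpos[] is exhausted
// This way every pairwise distance is measured between term positions
// that can actually co-occur in the body, rather than between each
// term's globally best position.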
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) { xpos[i] = miniMergedList[i]; } // // init each list ptr to the first wordpos rec in the body // for THIS DOCID and if no such rec, make it NULL // allNull = true; for(int32_t i = 0; i < m_numQueryTermInfos; i++) { // skip if to the left of a pipe operator if( m_bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) { continue; } // skip word position until it is in the body while( xpos[i] && !s_inBody[Posdb::getHashGroup(xpos[i])]) { // advance if ( ! (xpos[i][0] & 0x04) ) { xpos[i] += 12; } else { xpos[i] += 6; } // NULLify list if no more for this docid if( xpos[i] < miniMergedEnd[i] && (xpos[i][0] & 0x04)) { continue; } // ok, no more! null means empty list xpos[i] = NULL; // must be in title or something else then if ( ! highestScoringNonBodyPos[i] ) { gbshutdownAbort(true); } } // if all xpos are NULL, then no terms are in body... if ( xpos[i] ) { allNull = false; } } // if no terms in body, no need to do sliding window bool doneSliding = allNull ? true : false; while( !doneSliding ) { // // Now all xpos point to positions in the document body. Calc the "window" score (score // for current term positions). // // If window score beats m_bestMinTermPairWindowScore we store the term xpos pointers // that define this window in the m_bestMinTermPairWindowPtrs[] array. // // Will try to substitute either of the two term positions with highestScoringNonBodyPos[i] // if better, but will fix the distance to FIXED_DISTANCE to give a distance penalty. // // "scoreMatrix" contains the highest scoring non-body term pair score, which will // be used if higher than the calculated score for the terms. // // Sets m_bestMinTermPairWindowScore and m_bestMinTermPairWindowPtrs if this window score beats it. // findMinTermPairScoreInWindow(xpos, highestScoringNonBodyPos, scoreMatrix); bool advanceMin; do { advanceMin = false; // // Find the minimum word position in the document body for ANY of the query terms. // minPosTermIdx will contain the term index, minPos the position. // int32_t minPosTermIdx = -1; for ( int32_t x = 0 ; x < m_numQueryTermInfos ; x++ ) { // skip if to the left of a pipe operator // and numeric posdb termlists do not have word positions, // they store a float there. if ( m_bflags[x] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) { continue; } if ( ! xpos[x] ) { continue; } if ( xpos[x] && minPosTermIdx == -1 ) { minPosTermIdx = x; //minRec = xpos[x]; minPos = Posdb::getWordPos(xpos[x]); continue; } if ( Posdb::getWordPos(xpos[x]) >= minPos ) { continue; } minPosTermIdx = x; //minRec = xpos[x]; minPos = Posdb::getWordPos(xpos[x]); } // sanity if ( minPosTermIdx < 0 ) { gbshutdownAbort(true); } do { // // Advance the list pointer of the list containing the current // minimum position (minPosTermIdx). If no more positions, set list to NULL. // If all lists are NULL, we are done sliding. // if ( ! (xpos[minPosTermIdx][0] & 0x04) ) { xpos[minPosTermIdx] += 12; } else { xpos[minPosTermIdx] += 6; } // NULLify list if no more positions for this docid for that term. if ( xpos[minPosTermIdx] >= miniMergedEnd[minPosTermIdx] || ! (xpos[minPosTermIdx][0] & 0x04) ) { // exhausted list now xpos[minPosTermIdx] = NULL; // are all null now? int32_t k; for ( k = 0 ; k < m_numQueryTermInfos ; k++ ) { // skip if to the left of a pipe operator if( m_bflags[k] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER)) { continue; } if ( xpos[k] ) { break; } } // all lists are now exhausted if ( k >= m_numQueryTermInfos ) { doneSliding = true; } // No more positions in current term list. 
Find new term list with lowest position. advanceMin = true; } // if current term position is not in the document body, advance the pointer and look again. } while( !advanceMin && !doneSliding && !s_inBody[Posdb::getHashGroup(xpos[minPosTermIdx])] ); // if current list is exhausted, find new term list with lowest position. } while( advanceMin && !doneSliding ); } // end of while( !doneSliding ) float minPairScore = -1.0; logTrace(g_conf.m_logTracePosdb, "Zak algorithm begins"); for(int32_t i=0; i < m_numQueryTermInfos; i++) { // skip if to the left of a pipe operator if ( m_bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) { continue; } if ( ! miniMergedList[i] ) { continue; } for ( int32_t j = i+1 ; j < m_numQueryTermInfos ; j++ ) { // skip if to the left of a pipe operator if ( m_bflags[j] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) { continue; } if ( ! miniMergedList[j] ) { continue; } // . this limits its scoring to the winning sliding window // as far as the in-body terms are concerned // . it will do sub-outs using the score matrix // . this will skip over body terms that are not // in the winning window defined by m_bestMinTermPairWindowPtrs[] // that we set in findMinTermPairScoreInWindow() // . returns the best score for this term float tpscore = getTermPairScoreForAny(i, j, miniMergedList[i], miniMergedList[j], miniMergedEnd[i], miniMergedEnd[j], pdcs); // get min of all term pair scores if ( tpscore >= minPairScore && minPairScore >= 0.0 ) { continue; } // got a new min minPairScore = tpscore; } } logTrace(g_conf.m_logTracePosdb, "Zak algorithm ends. minPairScore=%f", minPairScore); return minPairScore; } // . compare the output of this to intersectLists9_r() // . hopefully this will be easier to understand and faster // . IDEAS: // we could also note that if a term was not in the title or // inlink text it could never beat the 10th score. void PosdbTable::intersectLists10_r ( ) { logTrace(g_conf.m_logTracePosdb, "BEGIN. numTerms: %" PRId32, m_q->m_numTerms); prepareWhiteListTable(); initWeights(); // assume no-op m_t1 = 0LL; // set start time int64_t t1 = gettimeofdayInMilliseconds(); // assume we return early m_addListsTime = 0; if( !findCandidateDocIds() ) { logTrace(g_conf.m_logTracePosdb, "END. Found no candidate docids"); return; } // // The vote buffer now contains the matching docids and each term sublist // has been adjusted to only contain these docids as well. Let the fun begin. 
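// Per-docid work done by the main loop below, in order:
//   1. advanceTermListCursors()       - position every sublist on the docid
//   2. getMaxPossibleScore() and prefilterMaxPossibleScoreByDistance()
//                                     - cheap upper bounds; skip hopeless docids
//   3. mergeTermSubListsForDocId()    - one merged position list per term
//   4. createNonBodyTermPairScoreMatrix(), getMinSingleTermScoreSum() and
//      getMinTermPairScoreSlidingWindow() - the actual scoring
//   5. siterank, language and page-temperature multipliers, then insertion
//      into m_topTree
// The second pass (INTERSECT_DEBUG_INFO) repeats this for the winners only,
// to generate the per-docid scoring breakdown.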
// // // TRANSFORM QueryTermInfo::m_* vars into old style arrays // // MUST MATCH allocation in allocTopScoringDocIdsData // char *pp = m_topScoringDocIdsBuf.getBufStart(); int32_t nqt = m_q->m_numTerms; int32_t *wikiPhraseIds = (int32_t *)pp; pp += 4 * nqt; // from QueryTermInfo int32_t *quotedStartIds = (int32_t *)pp; pp += 4 * nqt; // from QueryTermInfo int32_t *qpos = (int32_t *)pp; pp += 4 * nqt; // from QueryTermInfo int32_t *qtermNums = (int32_t *)pp; pp += 4 * nqt; // from QueryTermInfo float *freqWeights = (float *)pp; pp += sizeof(float) * nqt; // from QueryTermInfo const char **miniMergedList = (const char **)pp; pp += sizeof(const char *) * nqt; const char **miniMergedEnd = (const char **)pp; pp += sizeof(const char *) * nqt; const char **highestScoringNonBodyPos = (const char **)pp; pp += sizeof(const char *) * nqt; const char **winnerStack = (const char **)pp; pp += sizeof(const char *) * nqt; const char **xpos = (const char **)pp; pp += sizeof(const char *) * nqt; char *bflags = (char *)pp; pp += sizeof(char) * nqt; float *scoreMatrix = (float *)pp; pp += sizeof(float) *nqt*nqt; if ( pp > m_topScoringDocIdsBuf.getBufEnd() ) gbshutdownAbort(true); int64_t lastTime = gettimeofdayInMilliseconds(); int64_t now; int64_t took; int32_t phase = 3; // 2 first in findCandidateDocIds // point to our array of query term infos set in setQueryTermInfos() QueryTermInfo *qtibuf = (QueryTermInfo *)m_qiBuf.getBufStart(); for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) { // get it QueryTermInfo *qti = &qtibuf[i]; // set it wikiPhraseIds [i] = qti->m_wikiPhraseId; quotedStartIds[i] = qti->m_quotedStartId; // query term position qpos [i] = qti->m_qpos; qtermNums [i] = qti->m_qtermNum; freqWeights [i] = qti->m_termFreqWeight; } // for findMinTermPairScoreInWindow() function m_freqWeights = freqWeights; m_qtermNums = qtermNums; m_bflags = bflags; ////////// // // OLD MAIN INTERSECTION LOGIC // ///////// DocIdScore dcs; DocIdScore *pdcs = NULL; uint64_t lastDocId = 0LL; int32_t lastLen = 0; char siteRank; int highestInlinkSiteRank; char docLang; float score; int32_t intScore = 0; float minScore = 0.0; float minSingleScore; // scan the posdb keys in the smallest list // raised from 200 to 300,000 for 'da da da' query char miniMergeBuf[300000]; const char *docIdPtr; char *docIdEnd = m_docIdVoteBuf.getBufStart()+m_docIdVoteBuf.length(); float minWinningScore = -1.0; int32_t topCursor = -9; int32_t numProcessed = 0; int32_t prefiltMaxPossScoreFail = 0; int32_t prefiltMaxPossScorePass = 0; int32_t prefiltBestDistMaxPossScoreFail = 0; int32_t prefiltBestDistMaxPossScorePass = 0; // populate the cursors for each sublist int32_t numQueryTermsToHandle = m_numQueryTermInfos; if ( ! m_msg39req->m_doMaxScoreAlgo ) { numQueryTermsToHandle = 0; } // do not do it if we got a gbsortby: field if ( m_sortByTermNum >= 0 ) { numQueryTermsToHandle = 0; } if ( m_sortByTermNumInt >= 0 ) { numQueryTermsToHandle = 0; } // // Run through the scoring logic once or twice. Two passes needed ONLY if we // need to create additional (debug) scoring info for visual display. // int numPassesNeeded = m_msg39req->m_getDocIdScoringInfo ? 
2 : 1; for(int currPassNum=0; currPassNum < numPassesNeeded; currPassNum++) { // // Pass 0: Find candidate docids and calculate scores // Pass 1: Only for creating debug scoring info for visual display // switch( currPassNum ) { case INTERSECT_SCORING: logTrace(g_conf.m_logTracePosdb, "Loop 0: Real scoring"); break; case INTERSECT_DEBUG_INFO: logTrace(g_conf.m_logTracePosdb, "Loop 1: Data for visual scoring info"); removeScoreInfoForDeletedDocIds(); break; default: log(LOG_LOGIC,"%s:%d: Illegal pass number %d", __FILE__, __LINE__, currPassNum); gbshutdownLogicError(); } // reset docid to start! docIdPtr = m_docIdVoteBuf.getBufStart(); if( currPassNum == INTERSECT_DEBUG_INFO ) { // // reset QueryTermInfo::m_matchingSubListCursor[] to point to start of // term lists for second pass that gets printable scoring info // for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) { // get it QueryTermInfo *qti = &qtibuf[i]; // skip negative termlists if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) { continue; } // do each sublist for ( int32_t j = 0 ; j < qti->m_numMatchingSubLists ; j++ ) { qti->m_matchingSubListCursor [j] = qti->m_matchingSubListStart[j]; qti->m_matchingSubListSavedCursor [j] = qti->m_matchingSubListStart[j]; } } } //# //# MAIN LOOP for looping over each docid //# bool allDone = false; while( !allDone && docIdPtr < docIdEnd ) { // logTrace(g_conf.m_logTracePosdb, "Handling next docId"); bool skipToNextDocId = false; siteRank = 0; docLang = langUnknown; highestInlinkSiteRank = -1; bool docInThisFile; if ( currPassNum == INTERSECT_SCORING ) { m_docId = *(uint32_t *)(docIdPtr+1); m_docId <<= 8; m_docId |= (unsigned char)docIdPtr[0]; m_docId >>= 2; docInThisFile = m_documentIndexChecker->exists(m_docId); } else { // // second pass? for printing out transparency info. // genDebugScoreInfo1 sets m_docId from the top scorer tree // if( genDebugScoreInfo1(&numProcessed, &topCursor, &docInThisFile, qtibuf) ) { logTrace(g_conf.m_logTracePosdb, "Pass #%d for file %" PRId32 " done", currPassNum, m_documentIndexChecker->getFileNum()); // returns true if no more docids to handle allDone = true; break; // break out of docIdPtr < docIdEnd loop } } //bool docInThisFile = m_documentIndexChecker->exists(m_docId); logTrace(g_conf.m_logTracePosdb, "Handling next docId: %" PRId64 " - pass #%d - %sfound in this file (%" PRId32 ")", m_docId, currPassNum, docInThisFile?"":"SKIPPING, not ", m_documentIndexChecker->getFileNum()); if(!docInThisFile) { // Only advance cursors in first pass if( currPassNum == INTERSECT_SCORING ) { if( !advanceTermListCursors(docIdPtr, qtibuf) ) { logTrace(g_conf.m_logTracePosdb, "END. advanceTermListCursors failed"); return; } docIdPtr += 6; } continue; } //calculate complete score multiplier float completeScoreMultiplier = 1.0; unsigned flags = 0; if(g_d2fasm.lookupFlags(m_docId,&flags) && flags) { for(int i=0; i<26; i++) { if(flags&(1<<i)) completeScoreMultiplier *= m_msg39req->m_flagScoreMultiplier[i]; } } if( currPassNum == INTERSECT_SCORING ) { // // Pre-advance each termlist's cursor to skip to next docid. // if( !advanceTermListCursors(docIdPtr, qtibuf) ) { logTrace(g_conf.m_logTracePosdb, "END. advanceTermListCursors failed"); return; } if( !m_q->m_isBoolean ) { //## //## PRE-FILTERS. Discard DocIDs that cannot meet the minimum required //## score, before entering the main scoring loop below //## // // TODO: consider skipping this pre-filter if it sucks, as it does // for 'time enough for love'. it might save time! // // Calculate maximum possible score for a document. 
If the max score // is lower than the current minimum winning score, give up right // away and skip to the next docid. // // minWinningScore is the score of the lowest scoring doc in m_topTree IF // the topTree contains the number of docs requested. Otherwise it is -1 // and the doc is inserted no matter the score. // // Only go through this if we actually have a minimum score to compare with ... // No need if minWinningScore is still -1. // if ( minWinningScore >= 0.0 ) { logTrace(g_conf.m_logTracePosdb, "Compute 'upper bound' for each query term"); // If there's no way we can break into the winner's circle, give up! // This computes an upper bound for each query term for ( int32_t i = 0 ; i < numQueryTermsToHandle ; i++ ) { // skip negative termlists. if ( qtibuf[i].m_bigramFlags[0] & (BF_NEGATIVE) ) { continue; } // an upper bound on the score we could get float maxScore = getMaxPossibleScore ( &qtibuf[i], 0, 0, NULL ); // -1 means it has inlink text so do not apply this constraint // to this docid because it is too difficult, since we // sum up the inlink text if ( maxScore < 0.0 ) { continue; } maxScore *= completeScoreMultiplier; // logTrace(g_conf.m_logTracePosdb, "maxScore=%f minWinningScore=%f", maxScore, minWinningScore); // if any one of these terms has a max score below the // worst score of the 10th result, then it cannot win. if ( maxScore <= minWinningScore ) { docIdPtr += 6; prefiltMaxPossScoreFail++; skipToNextDocId = true; break; // break out of numQueryTermsToHandle loop } } } if( skipToNextDocId ) { // continue docIdPtr < docIdEnd loop logTrace(g_conf.m_logTracePosdb, "Max possible score for docId %" PRId64 " too low, skipping to next", m_docId); continue; } prefiltMaxPossScorePass++; if ( minWinningScore >= 0.0 && m_sortByTermNum < 0 && m_sortByTermNumInt < 0 ) { if( !prefilterMaxPossibleScoreByDistance(qtibuf, qpos, minWinningScore*completeScoreMultiplier) ) { docIdPtr += 6; prefiltBestDistMaxPossScoreFail++; skipToNextDocId = true; } } // not m_sortByTermNum or m_sortByTermNumInt if( skipToNextDocId ) { // Continue docIdPtr < docIdEnd loop logTrace(g_conf.m_logTracePosdb, "Max possible score by distance for docId %" PRId64 " too low, skipping to next", m_docId); continue; } prefiltBestDistMaxPossScorePass++; } // !m_q->m_isBoolean } // currPassNum == INTERSECT_SCORING if ( m_q->m_isBoolean ) { // add one point for each term matched in the bool query // this is really just for when the terms are from different // fields. if we have unfielded boolean terms we should // do proximity matching. int32_t slot = m_bt.getSlot ( &m_docId ); if ( slot >= 0 ) { uint8_t *bv = (uint8_t *)m_bt.getValueFromSlot(slot); // then a score based on the # of terms that matched int16_t bitsOn = getNumBitsOnX ( bv, m_vecSize ); // but store in hashtable now minScore = (float)bitsOn; } else { minScore = 1.0; } } //## //## PERFORMANCE HACK: ON-DEMAND MINI MERGES. //## //## Data for the current DocID found in sublists of each query term //## is merged into a single list, so we end up with one list per query //## term. They are all stored in a single buffer (miniMergeBuf), which //## the miniMerged* pointers point into.
//## mergeTermSubListsForDocId(qtibuf, miniMergeBuf, miniMergedList, miniMergedEnd, &highestInlinkSiteRank); // clear the counts on this DocIdScore class for this new docid pdcs = NULL; if ( currPassNum == INTERSECT_DEBUG_INFO ) { dcs.reset(); pdcs = &dcs; } //## //## ACTUAL SCORING BEGINS //## if ( !m_q->m_isBoolean ) { // Used by the various scoring functions called below m_qpos = qpos; m_wikiPhraseIds = wikiPhraseIds; m_quotedStartIds = quotedStartIds; m_bestMinTermPairWindowScore = -2.0; //# //# NON-BODY TERM PAIR SCORING LOOP //# createNonBodyTermPairScoreMatrix(miniMergedList, miniMergedEnd, scoreMatrix); //# //# SINGLE TERM SCORE LOOP //# minSingleScore = getMinSingleTermScoreSum(miniMergedList, miniMergedEnd, highestScoringNonBodyPos, pdcs); logTrace(g_conf.m_logTracePosdb, "minSingleScore=%f before multiplication for docId %" PRIu64 "", minSingleScore, m_docId); minSingleScore *= completeScoreMultiplier; //# //# DOCID / SITERANK DETECTION //# for(int32_t k=0; k < m_numQueryTermInfos; k++) { if ( ! miniMergedList[k] ) { continue; } // siterank/langid is always 0 in numeric // termlists so they sort by their number correctly if ( qtibuf[k].m_bigramFlags[0] & (BF_NUMBER) ) { continue; } siteRank = Posdb::getSiteRank ( miniMergedList[k] ); docLang = Posdb::getLangId ( miniMergedList[k] ); break; } logTrace(g_conf.m_logTracePosdb, "Got siteRank %d and docLang %d", (int)siteRank, (int)docLang); //# //# SLIDING WINDOW SCORING ALGORITHM //# // After calling this, m_bestMinTermPairWindowPtrs will point to the // term positions set ("window") that has the highest minimum score. These // pointers are used when determining the minimum term pair score returned // by the function. float minPairScore = getMinTermPairScoreSlidingWindow(miniMergedList, miniMergedEnd, highestScoringNonBodyPos, winnerStack, xpos, scoreMatrix, pdcs); logTrace(g_conf.m_logTracePosdb, "minPairScore=%f before multiplication for docId %" PRIu64 "", minPairScore, m_docId); minPairScore *= completeScoreMultiplier; //# //# Find minimum score - either single term or term pair //# minScore = 999999999.0; // get a min score from all the term pairs if ( minPairScore < minScore && minPairScore >= 0.0 ) { minScore = minPairScore; } // if we only had one query term if ( minSingleScore < minScore ) { minScore = minSingleScore; } logTrace(g_conf.m_logTracePosdb, "minPairScore=%f, minScore=%f for docId %" PRIu64 "", minPairScore, minScore, m_docId); // No positive score? Then skip the doc if ( minScore <= 0.0 ) { if( currPassNum == INTERSECT_SCORING ) { // advance to next docid docIdPtr += 6; } logTrace(g_conf.m_logTracePosdb, "Skipping docid %" PRIu64 " - no positive score", m_docId); // Continue docid loop continue; } } // !m_q->m_isBoolean //# //# Calculate score and give boost based on siterank and highest inlinking siterank //# float adjustedSiteRank = siteRank; if( highestInlinkSiteRank > siteRank ) { //adjust effective siterank because a high-rank site linked to it. Don't adjust it too much though. adjustedSiteRank = siteRank + (highestInlinkSiteRank-siteRank) / 3.0; logTrace(g_conf.m_logTracePosdb, "Highest inlink siterank %d > siterank %d. 
Adjusting to %f for docId %" PRIu64 "", highestInlinkSiteRank, (int)siteRank, adjustedSiteRank, m_docId); } score = minScore * (adjustedSiteRank*m_siteRankMultiplier+1.0); logTrace(g_conf.m_logTracePosdb, "Score %f for docId %" PRIu64 "", score, m_docId); //# //# Give score boost if query and doc language is the same, //# and optionally a different boost if the language of the //# page is unknown. //# //# Use "qlang" parm to set the language. i.e. "&qlang=fr" //# if ( m_msg39req->m_language != 0 ) { if( m_msg39req->m_language == docLang) { score *= (m_msg39req->m_sameLangWeight); logTrace(g_conf.m_logTracePosdb, "Giving score a matching language boost of x%f: %f for docId %" PRIu64 "", m_msg39req->m_sameLangWeight, score, m_docId); } else if( docLang == 0 ) { score *= (m_msg39req->m_unknownLangWeight); logTrace(g_conf.m_logTracePosdb, "Giving score an unknown language boost of x%f: %f for docId %" PRIu64 "", m_msg39req->m_unknownLangWeight, score, m_docId); } } double page_temperature = 0; bool use_page_temperature = false; float score_before_page_temp = score; if(m_msg39req->m_usePageTemperatureForRanking) { use_page_temperature = true; page_temperature = g_pageTemperatureRegistry.query_page_temperature(m_docId, m_msg39req->m_pageTemperatureWeightMin, m_msg39req->m_pageTemperatureWeightMax); score *= page_temperature; logTrace(g_conf.m_logTracePosdb, "Page temperature for docId %" PRIu64 " is %.14f, score %f -> %f", m_docId, page_temperature, score_before_page_temp, score); } //# //# Handle sortby int/float and minimum docid/score pairs //# if( m_sortByTermNum >= 0 || m_sortByTermNumInt >= 0 || m_hasMaxSerpScore ) { // assume filtered out if ( currPassNum == INTERSECT_SCORING ) { m_filtered++; } // // if we have a gbsortby:price term then score exclusively on that // if ( m_sortByTermNum >= 0 ) { // no term? if ( ! miniMergedList[m_sortByTermInfoNum] ) { // advance to next docid if( currPassNum == INTERSECT_SCORING ) { docIdPtr += 6; } // Continue docIdPtr < docIdEnd loop continue; } score = Posdb::getFloat(miniMergedList[m_sortByTermInfoNum]); } if ( m_sortByTermNumInt >= 0 ) { // no term? if ( ! miniMergedList[m_sortByTermInfoNumInt] ) { // advance to next docid if( currPassNum == INTERSECT_SCORING ) { docIdPtr += 6; } // Continue docIdPtr < docIdEnd loop continue; } intScore = Posdb::getInt(miniMergedList[m_sortByTermInfoNumInt]); // do this so hasMaxSerpScore below works, although // because of roundoff errors we might lose a docid // through the cracks in the widget. //score = (float)intScore; } // now we have a maxscore/maxdocid upper range so the widget // can append only new results to an older result set. if ( m_hasMaxSerpScore ) { bool skipToNext = false; // if dealing with an "int" score use the extra precision // of the double that m_maxSerpScore is! if ( m_sortByTermNumInt >= 0 ) { if ( intScore > (int32_t)m_msg39req->m_maxSerpScore ) { skipToNext = true; } else if ( intScore == (int32_t)m_msg39req->m_maxSerpScore && (int64_t)m_docId <= m_msg39req->m_minSerpDocId ) { skipToNext = true; } } else { if ( score > (float)m_msg39req->m_maxSerpScore ) { skipToNext = true; } else if ( almostEqualFloat(score, (float)m_msg39req->m_maxSerpScore) && (int64_t)m_docId <= m_msg39req->m_minSerpDocId ) { //@todo: Why is m_msg39req->m_maxSerpScore double and score float? 
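// Note: the float branch below uses almostEqualFloat() because "score" (a
// float) is compared against m_maxSerpScore (a double) truncated to float,
// and an exact == would be unreliable after the narrowing conversion and
// the scoring arithmetic above.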
skipToNext = true; } } if( skipToNext ) { // advance to next docid if( currPassNum == INTERSECT_SCORING ) { docIdPtr += 6; } // Continue docIdPtr < docIdEnd loop continue; } } // we did not get filtered out if ( currPassNum == INTERSECT_SCORING ) { m_filtered--; } } // We only come here if we actually made it into m_topTree - second round only loops through // TopTree entries. if ( currPassNum == INTERSECT_DEBUG_INFO ) { // NEW 20170423 dcs.m_usePageTemperature = use_page_temperature; dcs.m_pageTemperature = page_temperature; dcs.m_adjustedSiteRank = adjustedSiteRank; if( genDebugScoreInfo2(&dcs, &lastLen, &lastDocId, siteRank, score, intScore, docLang) ) { // advance to next docid if( currPassNum == INTERSECT_SCORING ) { docIdPtr += 6; } // Continue docIdPtr < docIdEnd loop continue; } } if( currPassNum == INTERSECT_SCORING ) { //# //# SCORING DONE! Add to top-scorer tree. //# int32_t tn = m_topTree->getEmptyNode(); // Sanity check if( tn < 0 ) { log(LOG_LOGIC,"%s:%s:%d: No space left in m_topTree", __FILE__, __func__, __LINE__); gbshutdownLogicError(); } TopNode *t = m_topTree->getNode(tn); // set the score and docid ptr t->m_score = score; t->m_docId = m_docId; t->m_flags = flags; // use an integer score like lastSpidered timestamp? if ( m_sortByTermNumInt >= 0 ) { t->m_intScore = intScore; t->m_score = 0.0; if ( ! m_topTree->m_useIntScores) { log(LOG_LOGIC,"%s:%s:%d: Got int score, but m_topTree not setup for int scores!", __FILE__, __func__, __LINE__); gbshutdownLogicError(); } } // // This will not add if tree is full and it is less than the m_lowNode in score. // // If it does get added to a full tree, lowNode will be removed. // m_topTree->addNode(t, tn); // top tree only holds enough docids to satisfy the // m_msg39request::m_docsToGet (m_msg39req->m_docsToGet) request // from the searcher. It basically stores m_docsToGet // into TopTree::m_docsWanted. TopTree::m_docsWanted is often // double m_docsToGet to compensate for site clustering, and // it can be even more than that in order to ensure we get // enough domains represented in the search results. // See TopTree::addNode(). it will not add the "t" node if // its score is not high enough when the top tree is full. if ( m_topTree->getNumUsedNodes() > m_topTree->getNumDocsWanted() ) { // get the lowest scoring node int32_t lowNode = m_topTree->getLowNode(); // and record its score in "minWinningScore" minWinningScore = m_topTree->getNode(lowNode)->m_score; } } // advance to next docid if( currPassNum == INTERSECT_SCORING ) { docIdPtr += 6; } } // docIdPtr < docIdEnd loop if ( m_debug ) { now = gettimeofdayInMilliseconds(); took = now - lastTime; log(LOG_INFO, "posdb: new algo phase %" PRId32" took %" PRId64" ms", phase,took); lastTime = now; phase++; } } // for ... currPassNum < numPassesNeeded if ( m_debug ) { log(LOG_INFO, "posdb: # prefiltMaxPossScoreFail........: %" PRId32" ", prefiltMaxPossScoreFail ); log(LOG_INFO, "posdb: # prefiltMaxPossScorePass........: %" PRId32" ", prefiltMaxPossScorePass ); log(LOG_INFO, "posdb: # prefiltBestDistMaxPossScoreFail: %" PRId32" ", prefiltBestDistMaxPossScoreFail ); log(LOG_INFO, "posdb: # prefiltBestDistMaxPossScorePass: %" PRId32" ", prefiltBestDistMaxPossScorePass ); } if( g_conf.m_logTracePosdb ) { m_topTree->logTreeData(LOG_TRACE); logDebugScoreInfo(LOG_TRACE); } // get time now now = gettimeofdayInMilliseconds(); // store the addLists time m_addListsTime = now - t1; m_t1 = t1; m_t2 = now; logTrace(g_conf.m_logTracePosdb, "END. Took %" PRId64" msec", m_addListsTime); } // . 
"bestDist" is closest distance to query term # m_minTermListIdx // . set "bestDist" to 1 to ignore it float PosdbTable::getMaxPossibleScore ( const QueryTermInfo *qti, int32_t bestDist, int32_t qdist, const QueryTermInfo *qtm ) { logTrace(g_conf.m_logTracePosdb, "BEGIN."); // get max score of all sublists float bestHashGroupWeight = -1.0; unsigned char bestDensityRank = 0; char siteRank = -1; char docLang = -1; unsigned char hgrp; bool hadHalfStopWikiBigram = false; // scan those sublists to set m_ptrs[] and to get the // max possible score of each one for ( int32_t j = 0 ; j < qti->m_numMatchingSubLists ; j++ ) { // scan backwards up to this const char *start = qti->m_matchingSubListSavedCursor[j]; // skip if does not have our docid if ( ! start ) { continue; } // note it if any is a wiki bigram if ( qti->m_bigramFlags[0] & BF_HALFSTOPWIKIBIGRAM ) { hadHalfStopWikiBigram = true; } // skip if entire sublist/termlist is exhausted if ( start >= qti->m_matchingSubListEnd[j] ) { continue; } if ( siteRank == -1 ) { siteRank = Posdb::getSiteRank(start); } if ( docLang == -1 ) { docLang = Posdb::getLangId(start); } // skip first key because it is 12 bytes, go right to the // 6 byte keys. we deal with it below. start += 12; // get cursor. this is actually pointing to the next docid const char *dc = qti->m_matchingSubListCursor[j]; // back up into our list dc -= 6; // reset this bool retried = false; // do not include the beginning 12 byte key in this loop! for ( ; dc >= start ; dc -= 6 ) { // loop back here for the 12 byte key retry: // get the best hash group hgrp = Posdb::getHashGroup(dc); // if not body, do not apply this algo because // we end up adding term pairs from each hash group if ( hgrp == HASHGROUP_INLINKTEXT ) { return -1.0; } //if ( hgrp == HASHGROUP_TITLE ) return -1.0; // loser? if ( m_msg39req->m_scoringWeights.m_hashGroupWeights[hgrp] < bestHashGroupWeight ) { // if in body, it's over for this termlist // because this is the last hash group // we will encounter. if ( hgrp == HASHGROUP_BODY ) { // @@@ BR: Dangerous assumption if we change indexing order in XmlDoc_Indexing ! @@@ goto nextTermList; } // otherwise, keep chugging continue; } unsigned char dr = Posdb::getDensityRank(dc); // a clean win? if ( m_msg39req->m_scoringWeights.m_hashGroupWeights[hgrp] > bestHashGroupWeight ) { bestHashGroupWeight = m_msg39req->m_scoringWeights.m_hashGroupWeights[hgrp]; bestDensityRank = dr; continue; } // but worst density rank? if ( dr < bestDensityRank ) { continue; } // better? if ( dr > bestDensityRank ) { bestDensityRank = dr; } // another tie, oh well... just ignore it } // handle the beginning 12 byte key if ( ! retried ) { retried = true; dc = qti->m_matchingSubListSavedCursor[j]; goto retry; } nextTermList: continue; } // if nothing, then maybe all sublists were empty? if ( bestHashGroupWeight < 0 ) { return 0.0; } // assume perfect adjacency and that the other term is perfect float score = 100.0; score *= bestHashGroupWeight; score *= bestHashGroupWeight; // since adjacent, 2nd term in pair will be in same sentence // TODO: fix this for 'qtm' it might have a better density rank and // better hashgroup weight, like being in title! score *= m_msg39req->m_scoringWeights.m_densityWeights[bestDensityRank]; score *= m_msg39req->m_scoringWeights.m_densityWeights[bestDensityRank]; // wiki bigram? 
if ( hadHalfStopWikiBigram ) {
		score *= WIKI_BIGRAM_WEIGHT;
		score *= WIKI_BIGRAM_WEIGHT;
	}
	//score *= perfectWordSpamWeight * perfectWordSpamWeight;
	score *= (((float)siteRank)*m_siteRankMultiplier+1.0);

	// language boost if language specified and if page is same language, or unknown language
	if ( m_msg39req->m_language != 0 ) {
		if( m_msg39req->m_language == docLang) {
			score *= (m_msg39req->m_sameLangWeight);
		}
		else if( docLang == 0 ) {
			score *= (m_msg39req->m_unknownLangWeight);
		}
	}

	// assume the other term we pair with will be 1.0
	score *= qti->m_termFreqWeight;

	// the new logic to fix 'time enough for love' slowness
	if ( qdist ) {
		// no, use the other term's actual freq weight
		score *= qtm->m_termFreqWeight;
		// subtract qdist
		bestDist -= qdist;
		// make qdist positive in case the terms were in reverse order
		if ( qdist < 0 ) {
			qdist *= -1;
		}
		// make it positive
		if ( bestDist < 0 ) {
			bestDist *= -1;
		}
		// avoid division by 0
		if ( bestDist > 1 ) {
			score /= (float)bestDist;
		}
	}

	// terms in same wikipedia phrase?
	//if ( wikiWeight != 1.0 )
	//	score *= WIKI_WEIGHT;

	// if query is 'time enough for love' it is kinda slow because
	// we were never multiplying by WIKI_WEIGHT, even though all
	// query terms were in the same wikipedia phrase. so see if this
	// speeds it up.
	if ( m_allInSameWikiPhrase ) {
		score *= WIKI_WEIGHT;
	}

	logTrace(g_conf.m_logTracePosdb, "END. score=%f", score);
	return score;
}


////////////////////
//
// "White list" functions used to find docids from only specific sites
//
////////////////////

// this is separate from the allocTopTree() function below because we must
// call it for each iteration in Msg39::doDocIdSplitLoop() which is used
// to avoid reading huge termlists into memory. it breaks the huge lists
// up into smaller docid ranges and gets the search results for each docid
// range separately.
bool PosdbTable::allocWhiteListTable ( ) {
	//
	// the whitetable is for the docids in the whitelist. we have
	// to only show results whose docid is in the whitetable, which
	// is from the "&sites=abc.com+xyz.com..." custom search site list
	// provided by the user.
	//
	// size includes the terminating \0
	if ( m_msg39req->size_whiteList <= 1 ) m_useWhiteTable = false;
	else                                   m_useWhiteTable = true;

	int32_t sum = 0;
	for ( int32_t i = 0 ; i < m_msg2->getNumWhiteLists() ; i++ ) {
		RdbList *list = m_msg2->getWhiteList(i);
		if ( list->isEmpty() ) continue;

		// assume 12 bytes for all keys but first which is 18
		int32_t size = list->getListSize();
		sum += size / 12 + 1;
	}

	if ( sum ) {
		// making this sum * 3 does not show a speedup... hmmm...
		int32_t numSlots = sum * 2;

		// keep it restricted to 5 byte keys so we do not have to
		// extract the docid, we can just hash the ptr to those
		// 5 bytes (which includes 1 siterank bit as the lowbit,
		// but should be ok since it should be set the same in
		// all termlists that have that docid)
		if (!m_whiteListTable.set(5, 0, numSlots, NULL, 0, false, "wtall", true)) {
			return false;
		}
	}

	return true;
}


void PosdbTable::prepareWhiteListTable() {
	// hash the docids in the whitelist termlists into a hashtable.
	// every docid in the search results must be in there. the
	// whitelist termlists are from a provided "&sites=abc.com+xyz.com+.."
	// cgi parm. the user only wants search results returned from the
	// specified subdomains. there can be up to MAX_WHITELISTS (500)
	// sites right now. this hash table must have been pre-allocated
	// (see allocWhiteListTable() above) since we might be in a thread.
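	// Keying scheme used here (as set up in allocWhiteListTable() with
	// m_whiteListTable.set(5, ...)): bytes 7..11 of a posdb record hold the
	// docid shifted left 2 bits, with the two low bits of byte 7 being
	// non-docid control/siterank bits, so the table hashes those 5 raw
	// bytes directly instead of first extracting the docid:
	//
	//	char *rec = list->getCurrentRec();
	//	m_whiteListTable.addKey ( rec + 7 );	// 5-byte key
	//
	// The low siterank bit is hashed along with the docid, but it is
	// expected to be identical in every termlist entry for a given docid,
	// so that is harmless.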
if ( m_addedSites ) return; for ( int32_t i = 0 ; i < m_msg2->getNumWhiteLists() ; i++ ) { RdbList *list = m_msg2->getWhiteList(i); if ( list->isEmpty() ) continue; // sanity test int64_t d1 = Posdb::getDocId(list->getList()); if ( d1 > m_msg2->docIdEnd() ) { log("posdb: d1=%" PRId64" > %" PRId64, d1,m_msg2->docIdEnd()); //gbshutdownAbort(true); } if ( d1 < m_msg2->docIdStart() ) { log("posdb: d1=%" PRId64" < %" PRId64, d1,m_msg2->docIdStart()); //gbshutdownAbort(true); } // first key is always 18 bytes cuz it has the termid // scan recs in the list for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) { char *rec = list->getCurrentRec(); // point to the 5 bytes of docid m_whiteListTable.addKey ( rec + 7 ); } } m_addedSites = true; } bool PosdbTable::allocTopScoringDocIdsData() { int64_t nn1 = m_msg39req->m_docsToGet; int64_t nn2 = 0; // just add all up in case doing boolean OR or something for ( int32_t k = 0 ; k < m_msg2->getNumLists(); k++) { // count RdbList *list = m_msg2->getList(k); // skip if null if ( ! list ) { continue; } // skip if list is empty, too if ( list->isEmpty() ) { continue; } // show if debug if ( m_debug ) { log(LOG_INFO, "toptree: adding listsize %" PRId32" to nn2", list->getListSize()); } // tally. each new docid in this termlist will compress // the 6 byte termid out, so reduce by 6. nn2 += list->getListSize() / ( sizeof(posdbkey_t) -6 ); } // if doing docid range phases where we compute the winning docids // for a range of docids to save memory, then we need to amp this up if ( m_msg39req->m_numDocIdSplits > 1 ) { // if 1 split has only 1 docid the other splits // might have 10 then this doesn't work, so make it // a min of 100. if ( nn2 < 100 ) { nn2 = 100; } // how many docid range splits are we doing? nn2 *= m_msg39req->m_numDocIdSplits; // just in case one split is not as big nn2 *= 2; // boost this guy too since we compare it to nn2 if ( nn1 < 100 ) { nn1 = 100; } nn1 *= m_msg39req->m_numDocIdSplits; nn1 *= 2; } // do not go OOM just because client asked for 10B results and we // only have like 100 results. int64_t nn = gbmin(nn1,nn2); // . do not alloc space for anything if all termlists are empty // . before, even if nn was 0, top tree would alloc a bunch of nodes // and we don't want to do that now to save mem and so // Msg39 can check // if ( m_posdbTable.m_topTree->m_numNodes == 0 ) // to see if it should // advance to the next docid range or not. if ( nn == 0 ) { return true; } // always at least 100 i guess. why? it messes up the // m_scoreInfoBuf capacity and it cores //if ( nn < 100 ) nn = 100; // but 30 is ok since m_scoreInfo buf uses 32 nn = gbmax(nn,30); if ( m_msg39req->m_doSiteClustering ) { nn *= 2; } // limit to 2B docids i guess nn = gbmin(nn,2000000000); if ( m_debug ) { log(LOG_INFO, "toptree: toptree: initializing %" PRId64" nodes",nn); } if ( nn < m_msg39req->m_docsToGet ) { log("query: warning only getting up to %" PRId64" docids " "even though %" PRId32" requested because termlist " "sizes are so small!! splits=%" PRId32 , nn , m_msg39req->m_docsToGet , (int32_t)m_msg39req->m_numDocIdSplits ); } // keep it sane if ( nn > (int64_t)m_msg39req->m_docsToGet * 2 && nn > 60 ) { nn = (int64_t)m_msg39req->m_docsToGet * 2; } // this actually sets the # of nodes to MORE than nn!!! if ( ! m_topTree->setNumNodes(nn,m_msg39req->m_doSiteClustering)) { log("toptree: toptree: error allocating nodes: %s", mstrerror(g_errno)); return false; } // let's use nn*4 to try to get as many score as possible, although // it may still not work! 
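	// Worked example with hypothetical numbers: m_docsToGet = 10, three
	// docid range splits, and termlists totaling 40 docids. Then
	// nn2 = max(40,100)*3*2 = 600 and nn1 = max(10,100)*3*2 = 600, so
	// nn = min(600,600) = 600; site clustering doubles it to 1200, and the
	// "keep it sane" clamp above then cuts it back to m_docsToGet * 2 = 20
	// (the clamp only fires when nn > 60), so xx below becomes
	// gbmax(20,32) = 32.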
int32_t xx = nn;//m_msg39req->m_docsToGet ; // try to fix a core of growing this table in a thread when xx == 1 xx = gbmax(xx,32); //if ( m_msg39req->m_doSiteClustering ) xx *= 4; m_maxScores = xx; // for seeing if a docid is in toptree. niceness=0. //if ( ! m_docIdTable.set(8,0,xx*4,NULL,0,false,0,"dotb") ) // return false; if ( m_msg39req->m_getDocIdScoringInfo ) { m_scoreInfoBuf.setLabel ("scinfobuf" ); // . for holding the scoring info // . add 1 for the \0 safeMemcpy() likes to put at the end so // it will not realloc on us if ( ! m_scoreInfoBuf.reserve ( xx * sizeof(DocIdScore) +100) ) { return false; } // likewise how many query term pair scores should we get? int32_t numTerms = m_q->m_numTerms; // limit numTerms = gbmin(numTerms,10); // the pairs. divide by 2 since (x,y) is same as (y,x) int32_t numPairs = (numTerms * numTerms) / 2; // then for each pair assume no more than MAX_TOP reps, usually // it's just 1, but be on the safe side numPairs *= m_realMaxTop;//MAX_TOP; // now that is how many pairs per docid and can be 500! but we // still have to multiply by how many docids we want to // compute. so this could easily get into the megabytes, most // of the time we will not need nearly that much however. numPairs *= xx; m_pairScoreBuf.setLabel ( "pairbuf" ); m_singleScoreBuf.setLabel ("snglbuf" ); // but alloc it just in case if ( ! m_pairScoreBuf.reserve (numPairs * sizeof(PairScore) ) ) { return false; } // and for singles int32_t numSingles = numTerms * m_realMaxTop * xx; // MAX_TOP *xx; if ( !m_singleScoreBuf.reserve(numSingles*sizeof(SingleScore))) { return false; } } // m_topScoringDocIdsBuf. MUST MATCH use in intersectLists10_r ! int32_t nqt = m_q->m_numTerms; int32_t need = 0; need += 4 * nqt; // wikiPhraseIds need += 4 * nqt; // quotedStartIds need += 4 * nqt; // qpos need += 4 * nqt; // qtermNums need += sizeof(float ) * nqt; // freqWeights need += sizeof(char *) * nqt; // miniMergedList need += sizeof(char *) * nqt; // miniMergedEnd need += sizeof(char *) * nqt; // highestScoringNonBodyPos need += sizeof(char *) * nqt; // winnerStack need += sizeof(char *) * nqt; // xpos need += sizeof(char ) * nqt; // bflags need += sizeof(float ) * nqt * nqt; // scoreMatrix m_topScoringDocIdsBuf.setLabel("stkbuf1"); if ( ! 
m_topScoringDocIdsBuf.reserve( need ) ) {
		return false;
	}

	return true;
}


////////////////////
//
// Functions used to find candidate docids, including "Voting" functions
// used to find docids with all required term ids and functions to remove
// docids that do not match all required terms
//
////////////////////


//
// Run through each term sublist and remove all docids not
// found in the docid vote buffer
//
void PosdbTable::delNonMatchingDocIdsFromSubLists() {
	logTrace(g_conf.m_logTracePosdb, "BEGIN.");

	//phase 1: shrink the rdblists for all queryterms (except those with a minus sign)
	std::valarray<char *> newEndPtr(m_q->m_numTerms);
	for(int i=0; i<m_q->m_numTerms; i++) {
		newEndPtr[i] = NULL;

		if(m_q->m_qterms[i].m_termSign=='-')
			continue;

		RdbList *list = m_q->m_qterms[i].m_posdbListPtr;
		if(!list || list->isEmpty())
			continue;

		// get that sublist
		char *subListPtr = list->getList();
		char *subListEnd = list->getListEnd();
		char *dst = subListPtr;

		// reset docid list ptrs
		const char *dp = m_docIdVoteBuf.getBufStart();
		const char *dpEnd = dp + m_docIdVoteBuf.length();

		//log(LOG_INFO,"@@@@ i#%d subListPtr=%p subListEnd=%p", i, subListPtr, subListEnd);
		for(;;) {
			// scan the docid list for the current docid in this termlist
			for ( ; dp < dpEnd; dp += 6 ) {
				// if current docid in docid list is >= the docid
				// in the sublist, stop. docid in list is 6 bytes and
				// subListPtr must be pointing to a 12 byte posdb rec.
				if ( *(uint32_t *)(dp+1) > *(uint32_t *)(subListPtr+8) ) {
					break;
				}

				// try to catch up docid if it is behind
				if ( *(uint32_t *)(dp+1) < *(uint32_t *)(subListPtr+8) ) {
					continue;
				}

				// check lower byte if equal
				if ( *(unsigned char *)(dp) > (*(unsigned char *)(subListPtr+7) & 0xfc ) ) {
					break;
				}

				if ( *(unsigned char *)(dp) < (*(unsigned char *)(subListPtr+7) & 0xfc ) ) {
					continue;
				}

				// copy over the 12 byte key
				*(int64_t *)dst = *(int64_t *)subListPtr;
				*(int32_t *)(dst+8) = *(int32_t *)(subListPtr+8);

				// skip that
				dst += 12;
				subListPtr += 12;

				// copy over any 6 byte keys following
				for ( ; ; ) {
					if ( subListPtr >= subListEnd ) {
						// give up on this exhausted term list!
						goto doneWithSubList;
					}

					// the next docid will be the next 12 byte key
					if ( ! ( subListPtr[0] & 0x04 ) ) {
						break;
					}

					// otherwise it's 6 bytes
					*(int32_t *)dst = *(int32_t *)subListPtr;
					*(int16_t *)(dst+4) = *(int16_t *)(subListPtr+4);
					dst += 6;
					subListPtr += 6;
				}
			}

			// skip that docid record in our termlist. it MUST have been
			// 12 bytes, a docid heading record.
			subListPtr += 12;

			// skip any following keys that are 6 bytes, that means they
			// share the same docid
			for ( ; ; ) {
				// list exhausted?
				if ( subListPtr >= subListEnd ) {
					goto doneWithSubList;
				}

				// stop if next key is 12 bytes, that is a new docid
				if ( ! 
(subListPtr[0] & 0x04) ) { break; } // skip it subListPtr += 6; } } doneWithSubList: //log(LOG_INFO,"@@@ shrunk #%d to %ld (%p-%p)", i, dst - list->getList(), list->getList(), dst); newEndPtr[i] = dst; } //phase 2: set the matchingsublist pointers in qti for(int i=0; i<m_numQueryTermInfos; i++) { QueryTermInfo *qti = ((QueryTermInfo*)m_qiBuf.getBufStart()) + i; if(qti->m_bigramFlags[0]&BF_NEGATIVE) continue; //don't modify sublist for negative terms qti->m_numMatchingSubLists = 0; for(int j=0; j<qti->m_numSubLists; j++) { for(int k=0; k<m_q->m_numTerms; k++) { if(qti->m_subLists[j] == m_q->m_qterms[k].m_posdbListPtr) { char *newStartPtr = m_q->m_qterms[k].m_posdbListPtr->getList(); //same as always int32_t x = qti->m_numMatchingSubLists; qti->m_matchingSubListSize [x] = newEndPtr[k] - newStartPtr; qti->m_matchingSubListStart [x] = newStartPtr; qti->m_matchingSubListEnd [x] = newEndPtr[k]; qti->m_matchingSubListCursor [x] = newStartPtr; qti->m_matchingSubListSavedCursor [x] = newStartPtr; qti->m_numMatchingSubLists++; break; } } } } logTrace(g_conf.m_logTracePosdb, "END."); } // // Removes docids with vote -1, which is set for docids matching // negative query terms (e.g. -rock) // void PosdbTable::delDocIdVotes ( const QueryTermInfo *qti ) { char *bufStart = m_docIdVoteBuf.getBufStart(); char *voteBufPtr = NULL; char *voteBufEnd; char *subListPtr; char *subListEnd; logTrace(g_conf.m_logTracePosdb, "BEGIN."); // just scan each sublist vs. the docid list for ( int32_t i = 0 ; i < qti->m_numSubLists ; i++ ) { // get that sublist subListPtr = qti->m_subLists[i]->getList(); subListEnd = qti->m_subLists[i]->getListEnd(); // reset docid list ptrs voteBufPtr = m_docIdVoteBuf.getBufStart(); voteBufEnd = voteBufPtr + m_docIdVoteBuf.length(); // loop it while ( subListPtr < subListEnd ) { // scan for his docids and inc the vote for ( ;voteBufPtr <voteBufEnd ;voteBufPtr += 6 ) { // if current docid in docid list is >= the docid // in the sublist, stop. docid in list is 6 bytes and // subListPtr must be pointing to a 12 byte posdb rec. if ( *(uint32_t *)(voteBufPtr+1) > *(uint32_t *)(subListPtr+8) ) { break; } // less than? keep going if ( *(uint32_t *)(voteBufPtr+1) < *(uint32_t *)(subListPtr+8) ) { continue; } // top 4 bytes are equal. check lower single byte then. if ( *(unsigned char *)(voteBufPtr) > (*(unsigned char *)(subListPtr+7) & 0xfc ) ) { break; } if ( *(unsigned char *)(voteBufPtr) < (*(unsigned char *)(subListPtr+7) & 0xfc ) ) { continue; } // . equal! mark it as nuked! voteBufPtr[5] = -1; // skip it voteBufPtr += 6; // advance subListPtr now break; } // if we've exhausted this docid list go to next sublist if ( voteBufPtr >= voteBufEnd ) { goto endloop2; } // skip that docid record in our termlist. it MUST have been // 12 bytes, a docid heading record. subListPtr += 12; // skip any following keys that are 6 bytes, that means they // share the same docid for ( ; subListPtr < subListEnd && ((*subListPtr)&0x04); ) { subListPtr += 6; } // if we have more posdb recs in this sublist, then keep // adding our docid votes into the docid list } endloop2: ; // otherwise, advance to next sublist } // now remove docids with a -1 vote, they are nuked voteBufPtr = m_docIdVoteBuf.getBufStart(); voteBufEnd = voteBufPtr + m_docIdVoteBuf.length(); char *dst = voteBufPtr; for ( ; voteBufPtr < voteBufEnd ; voteBufPtr += 6 ) { // do not re-copy it if it was in this negative termlist if ( voteBufPtr[5] == -1 ) { continue; } // copy it over. might be the same address! 
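	// (the compaction below walks dst and voteBufPtr over the same buffer;
	// dst never gets ahead of voteBufPtr and entries only move toward the
	// front, so the overlapping copy is safe)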
*(int32_t *) dst = *(int32_t *) voteBufPtr;
		*(int16_t *)(dst+4) = *(int16_t *)(voteBufPtr+4);
		dst += 6;
	}

	// shrink the buffer size now
	m_docIdVoteBuf.setLength ( dst - bufStart );

	logTrace(g_conf.m_logTracePosdb, "END.");
	return;
}


//
// First call will allocate the m_docIdVoteBuf and add docids from the shortest
// term list to the buffer.
//
// Subsequent calls will run through all term sublists (synonyms, term
// variations) and increase the score in m_docIdVoteBuf for matching docids.
// Docids that do not match a term are removed, so we end up with a list of
// docids matching all query terms.
//
void PosdbTable::addDocIdVotes( const QueryTermInfo *qti, int32_t listGroupNum) {
	logTrace(g_conf.m_logTracePosdb, "BEGIN.");

	// sanity check, we store this in a single byte below for voting
	if ( listGroupNum >= 256 ) {
		gbshutdownAbort(true);
	}

	// range terms tend to disappear if the docid's value falls outside
	// of the specified range... gbmin:offerprice:190
	bool isRangeTerm = false;
	const QueryTerm *qt = qti->m_qt;

	if ( qt->m_fieldCode == FIELD_GBNUMBERMIN )
		isRangeTerm = true;
	if ( qt->m_fieldCode == FIELD_GBNUMBERMAX )
		isRangeTerm = true;
	if ( qt->m_fieldCode == FIELD_GBNUMBEREQUALFLOAT )
		isRangeTerm = true;
	if ( qt->m_fieldCode == FIELD_GBNUMBERMININT )
		isRangeTerm = true;
	if ( qt->m_fieldCode == FIELD_GBNUMBERMAXINT )
		isRangeTerm = true;
	if ( qt->m_fieldCode == FIELD_GBNUMBEREQUALINT )
		isRangeTerm = true;
	// if ( qt->m_fieldCode == FIELD_GBFIELDMATCH )
	//	isRangeTerm = true;

	//
	// add the first sublist's docids into the docid buf
	//
	if( listGroupNum == 0 ) {
		// the first listGroup is not intersecting, just adding to
		// the docid vote buf. that is, if the query is "jump car" we
		// just add all the docids for "jump". Subsequent calls to
		// this function will intersect those docids with the docids
		// for "car", resulting in a buffer with docids that contain
		// both terms.
		makeDocIdVoteBufForRarestTerm( qti, isRangeTerm);
		logTrace(g_conf.m_logTracePosdb, "END.");
		return;
	}

	char *bufStart = m_docIdVoteBuf.getBufStart();
	char *voteBufPtr = NULL;
	char *voteBufEnd;
	char *subListPtr;
	char *subListEnd;

	//
	// For each sublist (term variation) loop through all sublist records
	// and compare with docids in the vote buffer. If a match is found, the
	// score in the vote buffer is set to the current listGroupNum.
	//
	// A sublist is a termlist for a particular query term, for instance
	// the query term "jump" will have sublists for "jump" "jumps"
	// "jumping" "jumped" and maybe even "jumpy", so that could be
	// 5 sublists, and their QueryTermInfo::m_qtermNum should be the
	// same for all 5.
	//
	for ( int32_t i = 0 ; i < qti->m_numSubLists; i++) {
		// get that sublist
		subListPtr = qti->m_subLists[i]->getList();
		subListEnd = qti->m_subLists[i]->getListEnd();

		// reset docid list ptrs
		voteBufPtr = m_docIdVoteBuf.getBufStart();
		voteBufEnd = voteBufPtr + m_docIdVoteBuf.length();

		// loop it
	handleNextSubListRecord:

		// scan for its docids and increment the vote
		for ( ;voteBufPtr < voteBufEnd ; voteBufPtr += 6 ) {
			// if current docid in docid list is >= the docid
			// in the sublist, stop. docid in list is 6 bytes and
			// subListPtr must be pointing to a 12 byte posdb rec.
			if ( *(uint32_t *)(voteBufPtr+1) > *(uint32_t *)(subListPtr+8) ) {
				break;
			}

			// less than? keep going
			if ( *(uint32_t *)(voteBufPtr+1) < *(uint32_t *)(subListPtr+8) ) {
				continue;
			}

			// top 4 bytes are equal. check lower single byte then.
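			// Vote-buffer entry layout (6 bytes): byte 0 holds the low
			// docid byte (posdb byte 7 masked with 0xfc to strip the two
			// non-docid control bits), bytes 1-4 hold the upper four docid
			// bytes (posdb bytes 8-11), and byte 5 is the vote/listGroup
			// byte. That is why the comparisons here read
			// *(uint32_t *)(voteBufPtr+1) against *(uint32_t *)(subListPtr+8)
			// and mask the low byte with 0xfc.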
if ( *(unsigned char *)(voteBufPtr) > (*(unsigned char *)(subListPtr+7) & 0xfc ) ) { break; } if ( *(unsigned char *)(voteBufPtr) < (*(unsigned char *)(subListPtr+7) & 0xfc ) ) { continue; } // if we are a range term, does this subtermlist // for this docid meet the min/max requirements // of the range term, i.e. gbmin:offprice:190. // if it doesn't then do not add this docid to the // docidVoteBuf, "voteBufPtr" if ( isRangeTerm && ! isTermValueInRange2(subListPtr, subListEnd, qt)) { break; } // . equal! record our vote! // . we start at zero for the // first termlist, and go to 1, etc. voteBufPtr[5] = listGroupNum; // skip it voteBufPtr += 6; // break out to advance subListPtr break; } // if we've exhausted this docid list go to next sublist // since this docid is NOT in the current/ongoing intersection // of the docids for each queryterm if ( voteBufPtr >= voteBufEnd ) { continue; } // skip that docid record in our termlist. it MUST have been // 12 bytes, a docid heading record. subListPtr += 12; // skip any following keys that are 6 bytes, that means they // share the same docid for ( ; subListPtr < subListEnd && ((*subListPtr)&0x04); ) { subListPtr += 6; } // if we have more posdb recs in this sublist, then keep // adding our docid votes into the docid list if ( subListPtr < subListEnd ) { goto handleNextSubListRecord; } // otherwise, advance to next sublist } // // Shrink the docidbuf by removing docids with not enough // votes which means they are missing a query term // voteBufPtr = m_docIdVoteBuf.getBufStart(); voteBufEnd = voteBufPtr + m_docIdVoteBuf.length(); char *dst = voteBufPtr; for ( ; voteBufPtr < voteBufEnd ; voteBufPtr += 6 ) { // skip if it has enough votes to be in search // results so far if ( voteBufPtr[5] != listGroupNum ) { continue; } // copy it over. might be the same address! *(int32_t *) dst = *(int32_t *) voteBufPtr; *(int16_t *)(dst+4) = *(int16_t *)(voteBufPtr+4); dst += 6; } // shrink the buffer size m_docIdVoteBuf.setLength ( dst - bufStart ); logTrace(g_conf.m_logTracePosdb, "END."); } // // Initialize the vote buffer with docids from the shortest // term (rarest required term) list and initialize the scores to 0. // Called by addDocIdVotes // // The buffer consists of 5-byte docids and 1-byte scores. The score // is incremented for each term that matches the docid, and after // each run, the list is "compacted" and shortened so only the // matching docids are left. // void PosdbTable::makeDocIdVoteBufForRarestTerm(const QueryTermInfo *qti, bool isRangeTerm) { char *cursor[MAX_SUBLISTS]; char *cursorEnd[MAX_SUBLISTS]; logTrace(g_conf.m_logTracePosdb, "term id [%" PRId64 "] [%.*s]", qti->m_qt->m_termId, qti->m_qt->m_termLen, qti->m_qt->m_term); for ( int32_t i = 0 ; i < qti->m_numSubLists ; i++ ) { // get that sublist cursor [i] = qti->m_subLists[i]->getList(); cursorEnd [i] = qti->m_subLists[i]->getListEnd(); } char *bufStart = m_docIdVoteBuf.getBufStart(); char *voteBufPtr = m_docIdVoteBuf.getBufStart(); char *recPtr; char *minRecPtr; char *lastMinRecPtr = NULL; int32_t mini = -1; const QueryTerm *qt = qti->m_qt; // get the next min from all the termlists for(;;) { // reset this minRecPtr = NULL; // just scan each sublist vs. the docid list for ( int32_t i = 0 ; i < qti->m_numSubLists ; i++ ) { // skip if exhausted if ( ! cursor[i] ) { continue; } // shortcut recPtr = cursor[i]; // get the min docid if ( ! minRecPtr ) { minRecPtr = recPtr; mini = i; continue; } // compare! 
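			// (this is a k-way merge: each pass finds the smallest current
			// docid across all sublist cursors by comparing the upper four
			// docid bytes first and then the 0xfc-masked low byte, emits
			// one vote-buffer entry for it, and advances that cursor past
			// all of that docid's keys)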
if ( *(uint32_t *)(recPtr +8) > *(uint32_t *)(minRecPtr+8) ) {
				continue;
			}

			// a new min
			if ( *(uint32_t *)(recPtr +8) < *(uint32_t *)(minRecPtr+8) ) {
				minRecPtr = recPtr;
				mini = i;
				continue;
			}

			// check lowest byte
			if ( (*(unsigned char *)(recPtr +7) & 0xfc ) > (*(unsigned char *)(minRecPtr+7) & 0xfc ) ) {
				continue;
			}

			// a new min
			if ( (*(unsigned char *)(recPtr +7) & 0xfc ) < (*(unsigned char *)(minRecPtr+7) & 0xfc ) ) {
				minRecPtr = recPtr;
				mini = i;
				continue;
			}
		}

		// if no min then all lists exhausted!
		if ( ! minRecPtr ) {
			// update length
			m_docIdVoteBuf.setLength ( voteBufPtr - bufStart );
			// all done!
			logTrace(g_conf.m_logTracePosdb, "END.");
			return;
		}

		bool inRange=false;

		// if we are a range term, does this subtermlist
		// for this docid meet the min/max requirements
		// of the range term, i.e. gbmin:offprice:190.
		// if it doesn't then do not add this docid to the
		// docidVoteBuf, "voteBufPtr"
		if ( isRangeTerm ) {
			// no longer in range
			if ( isTermValueInRange2(cursor[mini],cursorEnd[mini],qt)) {
				inRange = true;
			}
		}

		// advance that guy over that docid
		cursor[mini] += 12;

		// 6 byte keys follow?
		for ( ; ; ) {
			// end of list?
			if ( cursor[mini] >= cursorEnd[mini] ) {
				// use NULL to indicate list is exhausted
				cursor[mini] = NULL;
				break;
			}

			// if we hit a new 12 byte key for a new docid, stop
			if ( ! ( cursor[mini][0] & 0x04 ) ) {
				break;
			}

			// check range again
			if (isRangeTerm && isTermValueInRange2(cursor[mini],cursorEnd[mini],qt)) {
				inRange = true;
			}

			// otherwise, skip this 6 byte key
			cursor[mini] += 6;
		}

		// is it a docid dup?
		if(lastMinRecPtr &&
		   *(uint32_t *)(lastMinRecPtr+8) == *(uint32_t *)(minRecPtr+8) &&
		   (*(unsigned char *)(lastMinRecPtr+7)&0xfc) == (*(unsigned char *)(minRecPtr+7)&0xfc)) {
			continue;
		}

		// . do not store the docid if not in the whitelist
		// . FIX: two lower bits, what are they? at minRecPtr[7].
		// . well the lowest bit is the siterank upper bit and the
		//   other bit is always 0. we should be ok with just using
		//   the 6 bytes of the docid ptr as is though since the siterank
		//   should be the same for the site: terms we indexed for the same
		//   docid!!
		if ( m_useWhiteTable && ! m_whiteListTable.isInTable(minRecPtr+7) ) {
			continue;
		}

		if ( isRangeTerm && ! inRange ) {
			continue;
		}

		// only update this if we add the docid... that way there can be
		// a winning "inRange" term in another sublist and the docid will
		// get added.
		lastMinRecPtr = minRecPtr;

		// store our docid. actually it contains two lower bits not
		// part of the docid, so we'll have to shift and mask to get
		// the actual docid!
		// docid is only 5 bytes for now
		*(int32_t *)(voteBufPtr+1) = *(int32_t *)(minRecPtr+8);
		// the single lower byte
		voteBufPtr[0] = minRecPtr[7] & 0xfc;
		// 0 vote count
		voteBufPtr[5] = 0;

		// debug
		// int64_t dd = Posdb::getDocId(minRecPtr);
		// log(LOG_ERROR, "%s:%s: adding docid %" PRId64 "", __FILE__, __func__, dd);

		// advance
		voteBufPtr += 6;
	}
}


// TODO: do this in docid range phases to save memory and be much faster
// since we could stay within the L1 cache for hashing
bool PosdbTable::makeDocIdVoteBufForBoolQuery( ) {
	logTrace(g_conf.m_logTracePosdb, "BEGIN.");

	// . make a hashtable of all the docids from all the termlists
	// . the value slot will be the operand bit vector i guess
	// . the size of the vector needs one bit per query operand
	// . if the vector is only 1-2 bytes we can just evaluate each
	//   combination we encounter and store it into an array, otherwise,
	//   we can use another hashtable in order to avoid re-evaluating
	//   whether it passes the boolean query.
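	// Sketch of the scheme implemented below: every docid maps to a bit
	// vector with one bit per query operand, kept in the m_bt hashtable.
	// A docid seen in the termlist of operand #bitNum gets that bit set:
	//
	//	int32_t byte = bitNum / 8;
	//	unsigned char mask = 1<<(bitNum % 8);
	//	bitVec[byte] |= mask;
	//
	// Each distinct vector is then hashed to a 64-bit key and the result of
	// m_q->matchesBoolQuery() is cached in m_ct, so a given vector is
	// evaluated only once no matter how many docids share it.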
char bitVec[MAX_OVEC_SIZE]; if ( m_vecSize > MAX_OVEC_SIZE ) { m_vecSize = MAX_OVEC_SIZE; } QueryTermInfo *qtibuf = (QueryTermInfo *)m_qiBuf.getBufStart(); // . scan each list of docids to a get a new docid, keep a dedup // table to avoid re-processing the same docid. // . each posdb list we read corresponds to a query term, // or a synonym of a query term, or bigram of a query term, etc. // but we really want to know what operand, so we associate an // operand bit with each query term, and each list can map to // the base query term so we can get the operand # from that. for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) { // get it QueryTermInfo *qti = &qtibuf[i]; QueryTerm *qt = &m_q->m_qterms[qti->m_qtermNum]; // get the query word //QueryWord *qw = qt->m_qword; // just use the word # now //int32_t opNum = qw->m_wordNum;//opNum; // if this query term # is a gbmin:offprice:190 type // of thing, then we may end up ignoring it based on the // score contained within! bool isRangeTerm = false; if ( qt->m_fieldCode == FIELD_GBNUMBERMIN ) isRangeTerm = true; if ( qt->m_fieldCode == FIELD_GBNUMBERMAX ) isRangeTerm = true; if ( qt->m_fieldCode == FIELD_GBNUMBEREQUALFLOAT ) isRangeTerm = true; if ( qt->m_fieldCode == FIELD_GBNUMBERMININT ) isRangeTerm = true; if ( qt->m_fieldCode == FIELD_GBNUMBERMAXINT ) isRangeTerm = true; if ( qt->m_fieldCode == FIELD_GBNUMBEREQUALINT ) isRangeTerm = true; // if ( qt->m_fieldCode == FIELD_GBFIELDMATCH ) // isRangeTerm = true; // . make it consistent with Query::isTruth() // . m_bitNum is set above to the QueryTermInfo # int32_t bitNum = qt->m_bitNum; // do not consider for adding if negative ('my house -home') //if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) continue; // set all to zeroes memset ( bitVec, 0, m_vecSize ); // set bitvec for this query term # int32_t byte = bitNum / 8; unsigned char mask = 1<<(bitNum % 8); bitVec[byte] |= mask; // each query term can have synonym lists etc. scan those. // this includes the original query termlist as well. for ( int32_t j = 0 ; j < qti->m_numSubLists ; j++ ) { // scan all docids in this list char *p = qti->m_subLists[j]->getList(); char *pend = qti->m_subLists[j]->getListEnd(); //int64_t lastDocId = 0LL; // scan the sub termlist #j for ( ; p < pend ; ) { // place holder int64_t docId = Posdb::getDocId(p); // assume this docid is not in range if we // had a range term like gbmin:offerprice:190 bool inRange = false; // sanity //if ( d < lastDocId ) // gbshutdownAbort(true); //lastDocId = d; // point to it //char *voteBufPtr = p + 8; // check each posdb key for compliance // for gbmin:offprice:190 bool terms if ( isRangeTerm && isTermValueInRange(p,qt) ) { inRange = true; } // this was the first key for this docid for // this termid and possible the first key for // this termid, so skip it, either 12 or 18 // bytes if ( (((char *)p)[0])&0x02 ) { p += 12; } // the first key for this termid? else { p += 18; } // then only 6 byte keys would follow from the // same docid, so skip those as well subloop: if( p < pend && (((char *)p)[0]) & 0x04 ) { // check each posdb key for compliance // for gbmin:offprice:190 bool terms if ( isRangeTerm && isTermValueInRange(p,qt) ) { inRange = true; } p += 6; goto subloop; } // if we had gbmin:offprice:190 and it // was not satisfied, then do not OR in this // bit in the bitvector for the docid if ( isRangeTerm && ! 
inRange ) {
					continue;
				}

				// convert docid into hash key
				//int64_t docId = *(int64_t *)voteBufPtr;
				// shift down 2 bits
				//docId >>= 2;
				// and mask
				//docId &= DOCID_MASK;
				// test it
				//int64_t docId = Posdb::getDocId(voteBufPtr-8);
				//if ( d2 != docId )
				//	gbshutdownAbort(true);

				// store this docid though. treat as int64_t
				// but we mask with keymask
				int32_t slot = m_bt.getSlot ( &docId );
				if ( slot < 0 ) {
					// we can't alloc in a thread, careful
					if ( ! m_bt.addKey(&docId,bitVec) ) {
						gbshutdownAbort(true);
					}
					continue;
				}

				// otherwise, OR the bit in
				char *bv = (char *)m_bt.getValueFromSlot(slot);
				bv[byte] |= mask;
			}
		}
	}

	// debug info
	// int32_t nc = m_bt.getLongestString();
	// log("posdb: string of %" PRId32" filled slots!",nc);

	char *dst = m_docIdVoteBuf.getBufStart();

	// . now our hash table is filled with all the docids
	// . evaluate each bit vector
	for ( int32_t i = 0 ; i < m_bt.getNumSlots() ; i++ ) {
		// skip if empty
		if ( ! m_bt.m_flags[i] ) {
			continue;
		}

		// get the bit vector
		unsigned char *vec = (unsigned char *)m_bt.getValueFromSlot(i);

		// hash the vector
		int64_t h64 = 0LL;
		for ( int32_t k = 0 ; k < m_vecSize ; k++ )
			h64 ^= g_hashtab[(unsigned char)vec[k]][(unsigned char)k];

		// check if we have evaluated this bit vector before
		char *val = (char *)m_ct.getValue ( &h64 );

		if ( m_debug ) {
			int64_t docId =*(int64_t *)m_bt.getKeyFromSlot(i);
			log(LOG_INFO, "query: eval d=%" PRIu64" vec[0]=%" PRIx32" h64=%" PRId64, docId,(int32_t)vec[0],h64);
		}

		// add him to the good table
		if ( val && *val ) {
			// it passes, add the docid
			int64_t docId =*(int64_t *)m_bt.getKeyFromSlot(i);
			// fix it up
			if ( m_debug ) {
				log(LOG_INFO, "query: adding d=%" PRIu64" bitVecSize=%" PRId32" bitvec[0]=0x%" PRIx32" (TRUE)", docId,m_vecSize,(int32_t)vec[0]);
			}
			// shift up
			docId <<= 2;
			// a 6 byte key means you pass
			memcpy ( dst, &docId, 6 );
			dst += 6;
			continue;
		}

		// evaluate the vector
		char include = m_q->matchesBoolQuery ( (unsigned char *)vec, m_vecSize );
		if ( include ) {
			// it passes, add the docid
			int64_t docId =*(int64_t *)m_bt.getKeyFromSlot(i);
			// fix it up
			if ( m_debug ) {
				log(LOG_INFO, "query: adding d=%" PRIu64" vec[0]=0x%" PRIx32, docId,(int32_t)vec[0]);
			}
			// shift up
			docId <<= 2;
			// a 6 byte key means you pass
			memcpy ( dst, &docId, 6 );

			// test it
			if ( m_debug ) {
				int64_t d2;
				d2 = *(uint32_t *)(dst+1);
				d2 <<= 8;
				d2 |= (unsigned char)dst[0];
				d2 >>= 2;
				docId >>= 2;
				if ( d2 != docId )
					gbshutdownAbort(true);
			}
			// end test

			dst += 6;
		}

		// store in hash table
		m_ct.addKey ( &h64, &include );
	}

	// update SafeBuf::m_length
	m_docIdVoteBuf.setLength ( dst - m_docIdVoteBuf.getBufStart() );

	// now sort the docids. TODO: break makeDocIdVoteBufForBoolQuery()
	// up into docid ranges so we have like 1/100th the # of docids to
	// sort. that should make this part a lot faster.
	// i.e. 1000*log(1000) vs 100*(10*log(10)) --> 3000 vs 1000, so
	// breaking it into 100 pieces is roughly 3x faster (the ratio,
	// log(1000)/log(10), is the same for any log base).
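	// (the vote buffer is a packed array of fixed-size 6-byte entries, so a
	// plain qsort() with the bytewise comparator docIdVoteBufKeyCompare_desc
	// defined below is sufficient)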
qsort ( m_docIdVoteBuf.getBufStart(), m_docIdVoteBuf.length() / 6, 6, docIdVoteBufKeyCompare_desc ); logTrace(g_conf.m_logTracePosdb, "END."); return true; } //////////////////// // // Global functions // //////////////////// // sort vote buf entries in descending order static int docIdVoteBufKeyCompare_desc ( const void *h1, const void *h2 ) { return KEYCMP((const char*)h1, (const char*)h2, 6); } // for boolean queries containing terms like gbmin:offerprice:190 static inline bool isTermValueInRange( const char *p, const QueryTerm *qt ) { // return false if outside of range if ( qt->m_fieldCode == FIELD_GBNUMBERMIN ) { float score2 = Posdb::getFloat ( p ); return ( score2 >= qt->m_qword->m_float ); } if ( qt->m_fieldCode == FIELD_GBNUMBERMAX ) { float score2 = Posdb::getFloat ( p ); return ( score2 <= qt->m_qword->m_float ); } if ( qt->m_fieldCode == FIELD_GBNUMBEREQUALFLOAT ) { float score2 = Posdb::getFloat ( p ); return ( almostEqualFloat(score2, qt->m_qword->m_float) ); } if ( qt->m_fieldCode == FIELD_GBNUMBERMININT ) { int32_t score2 = Posdb::getInt ( p ); return ( score2 >= qt->m_qword->m_int ); } if ( qt->m_fieldCode == FIELD_GBNUMBERMAXINT ) { int32_t score2 = Posdb::getInt ( p ); return ( score2 <= qt->m_qword->m_int ); } if ( qt->m_fieldCode == FIELD_GBNUMBEREQUALINT ) { int32_t score2 = Posdb::getInt ( p ); return ( score2 == qt->m_qword->m_int ); } // if ( qt->m_fieldCode == FIELD_GBFIELDMATCH ) { // int32_t score2 = Posdb::getInt ( p ); // return ( score2 == qt->m_qword->m_int ); // } // how did this happen? gbshutdownAbort(true); } static inline bool isTermValueInRange2 ( const char *recPtr, const char *subListEnd, const QueryTerm *qt ) { // if we got a range term see if in range. if ( isTermValueInRange(recPtr,qt) ) { return true; } recPtr += 12; for(;recPtr<subListEnd&&((*recPtr)&0x04);recPtr +=6) { if ( isTermValueInRange(recPtr,qt) ) { return true; } } return false; } // . b-step into list looking for docid "docId" // . assume p is start of list, excluding 6 byte of termid static inline const char *getWordPosList(uint64_t docId, const char *list, int32_t listSize) { // make step divisible by 6 initially int32_t step = (listSize / 12) * 6; // shortcut const char *listEnd = list + listSize; // divide in half const char *p = list + step; // for detecting not founds char count = 0; for(;;) { // save it const char *origp = p; // scan up to docid. we use this special bit to distinguish between // 6-byte and 12-byte posdb keys for ( ; p > list && (p[1] & 0x02); ) { p -= 6; } // ok, we hit a 12 byte key i guess, so backup 6 more p -= 6; // ok, we got a 12-byte key then i guess uint64_t d = Posdb::getDocId ( p ); // we got a match, but it might be a NEGATIVE key so // we have to try to find the positive keys in that case if ( d == docId ) { // if its positive, no need to do anything else if ( (p[0] & 0x01) == 0x01 ) return p; // ok, it's negative, try to see if the positive is // in here, if not then return NULL. // save current pos const char *current = p; // back up to 6 byte key before this 12 byte key p -= 6; // now go backwards to previous 12 byte key for ( ; p > list && (p[1] & 0x02); ) { p -= 6; } // ok, we hit a 12 byte key i guess, so backup 6 more p -= 6; // is it there? if ( p >= list && Posdb::getDocId(p) == docId ) { // sanity. return NULL if its negative! wtf???? 
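			// (bit 0x01 of a key's first byte distinguishes positive
			// keys from negative "delete" keys: set means positive,
			// clear means the docid was deleted)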
if ( (p[0] & 0x01) == 0x00 ) return NULL;
				// got it
				return p;
			}

			// ok, no positive before us, try after us
			p = current;
			// advance over current 12 byte key
			p += 12;
			// now go forwards to next 12 byte key
			for ( ; p < listEnd && (p[1] & 0x02); ) {
				p += 6;
			}
			// is it there?
			if ( p + 12 < listEnd && Posdb::getDocId(p) == docId ) {
				// sanity. return NULL if it's negative! wtf????
				if ( (p[0] & 0x01) == 0x00 ) {
					return NULL;
				}
				// got it
				return p;
			}

			// . crap, i guess we just had a single negative docid then
			// . return that and the caller will see it's negative
			return current;
		}

		// reduce step
		//step /= 2;
		step >>= 1;
		// . make divisible by 6!
		// . TODO: speed this up!!!
		step = step - (step % 6);

		// sanity
		if ( step % 6 ) {
			gbshutdownAbort(true);
		}

		// ensure never 0
		if ( step <= 0 ) {
			step = 6;
			// return NULL if not found
			if ( count++ >= 2 ) {
				return NULL;
			}
		}

		// go up or down then
		if ( d < docId ) {
			p = origp + step;
			if ( p >= listEnd ) {
				p = listEnd - 6;
			}
		}
		else {
			p = origp - step;
			if ( p < list ) {
				p = list;
			}
		}
	}
}


// initialize the weights table
static void initWeights ( ) {
	ScopedLock sl(s_mtx_weights);
	if ( s_init ) {
		return;
	}

	logTrace(g_conf.m_logTracePosdb, "BEGIN.");

	s_scoringWeights.init(g_conf.m_diversityWeightMin, g_conf.m_diversityWeightMax,
	                      g_conf.m_densityWeightMin, g_conf.m_densityWeightMax,
	                      g_conf.m_hashGroupWeightBody, g_conf.m_hashGroupWeightTitle,
	                      g_conf.m_hashGroupWeightHeading, g_conf.m_hashGroupWeightInlist,
	                      g_conf.m_hashGroupWeightInMetaTag, g_conf.m_hashGroupWeightInLinkText,
	                      g_conf.m_hashGroupWeightInTag, g_conf.m_hashGroupWeightNeighborhood,
	                      g_conf.m_hashGroupWeightInternalLinkText, g_conf.m_hashGroupWeightInUrl,
	                      g_conf.m_hashGroupWeightInMenu);

	// if two hashgroups are compatible they can be paired
	for ( int32_t i = 0 ; i < HASHGROUP_END ; i++ ) {
		// set this
		s_inBody[i] = false;
		// is it body?
		if ( i == HASHGROUP_BODY ||
		     i == HASHGROUP_HEADING ||
		     i == HASHGROUP_INLIST ||
		     i == HASHGROUP_INMENU )
			s_inBody[i] = true;

		for ( int32_t j = 0 ; j < HASHGROUP_END ; j++ ) {
			// assume not compatible
			s_isCompatible[i][j] = false;

			// is either one in the body (and not the title)?
			bool inBody1 = true;
			if ( i != HASHGROUP_BODY &&
			     i != HASHGROUP_HEADING &&
			     i != HASHGROUP_INLIST &&
			     //i != HASHGROUP_INURL &&
			     i != HASHGROUP_INMENU )
				inBody1 = false;

			bool inBody2 = true;
			if ( j != HASHGROUP_BODY &&
			     j != HASHGROUP_HEADING &&
			     j != HASHGROUP_INLIST &&
			     //j != HASHGROUP_INURL &&
			     j != HASHGROUP_INMENU )
				inBody2 = false;

			// no body allowed now!
			if ( inBody1 || inBody2 ) {
				continue;
			}
			//if ( ! inBody ) continue;

			// now neither can be in the body, because we handle
			// those cases in the new sliding window algo.
			// if one term is only in the link text and the other
			// term is only in the title, ... what then? i guess
			// allow those here, but they will be penalized
			// some with the fixed distance of like 64 units or
			// something...
			s_isCompatible[i][j] = true;
			// if either is in the body then do not allow now
			// and handle in the sliding window algo
			//s_isCompatible[i][j] = 1;
		}
	}

	s_init = true;

#ifdef _VALGRIND_
	//we read from the weight tables without locking. tell helgrind to ignore that
	VALGRIND_HG_DISABLE_CHECKING(&s_scoringWeights,sizeof(s_scoringWeights));
	VALGRIND_HG_DISABLE_CHECKING(s_isCompatible,sizeof(s_isCompatible));
	VALGRIND_HG_DISABLE_CHECKING(s_inBody,sizeof(s_inBody));
#endif

	logTrace(g_conf.m_logTracePosdb, "END.");
}


// Called when ranking settings are changed. Normally called from
// update-parameter broadcast handling (see handleRequest3fLoop() )
void reinitializeRankingSettings() {
	s_init = false;
	initWeights();
}


float getHashGroupWeight ( unsigned char hg ) {
	initWeights();
	return s_scoringWeights.m_hashGroupWeights[hg];
}

float getDiversityWeight ( unsigned char diversityRank ) {
	initWeights();
	return s_scoringWeights.m_diversityWeights[diversityRank];
}

float getDensityWeight ( unsigned char densityRank ) {
	initWeights();
	return s_scoringWeights.m_densityWeights[densityRank];
}

float getWordSpamWeight ( unsigned char wordSpamRank ) {
	initWeights();
	return s_scoringWeights.m_wordSpamWeights[wordSpamRank];
}

float getLinkerWeight ( unsigned char wordSpamRank ) {
	initWeights();
	return s_scoringWeights.m_linkerWeights[wordSpamRank];
}
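// Note on the getters above: the weight tables are initialized lazily, so
// after the first call initWeights() just acquires s_mtx_weights, sees
// s_init is set and returns; the steady-state cost per call is one
// lock/unlock of that mutex.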