2016-07-04 15:06:15 +02:00
# include "PosdbTable.h"
# include "Posdb.h"
# include "gb-include.h"
2016-12-19 12:58:33 +01:00
# include "PageTemperatureRegistry.h"
2017-01-06 17:40:50 +01:00
# include "Docid2Siteflags.h"
2016-07-04 15:06:15 +02:00
# include "ScalingFunctions.h"
2017-01-20 16:27:16 +01:00
# include "ScoringWeights.h"
2016-07-04 15:06:15 +02:00
# include "BitOperations.h"
2016-07-05 12:11:14 +02:00
# include "Msg2.h"
# include "Msg39.h"
# include "Sanity.h"
2016-07-04 15:06:15 +02:00
# include "Stats.h"
# include "Conf.h"
# include "TopTree.h"
2016-11-08 12:14:32 +01:00
# include "DocumentIndexChecker.h"
2017-01-10 14:29:16 +01:00
# include "Lang.h"
2016-09-16 12:14:51 +02:00
# include "GbMutex.h"
# include "ScopedLock.h"
2016-07-04 15:06:15 +02:00
# include <math.h>
2017-06-01 16:06:45 +02:00
# include <valarray>
2016-07-04 15:06:15 +02:00
# ifdef _VALGRIND_
# include <valgrind/memcheck.h>
2016-09-16 12:14:51 +02:00
# include <valgrind/helgrind.h>
2016-07-04 15:06:15 +02:00
# endif
# define BF_HALFSTOPWIKIBIGRAM 0x01 // "to be" in "to be or not to be"
# define BF_PIPED 0x02 // before a query pipe operator
# define BF_SYNONYM 0x04
# define BF_NEGATIVE 0x08 // query word has a negative sign before it
# define BF_BIGRAM 0x10
# define BF_NUMBER 0x20 // is it like gbsortby:price? numeric?
2016-11-07 16:33:55 +01:00
static const int INTERSECT_SCORING = 0 ;
static const int INTERSECT_DEBUG_INFO = 1 ;
2016-07-04 15:06:15 +02:00
2016-08-29 17:05:34 +02:00
static bool s_init = false ;
2016-09-16 12:14:51 +02:00
static GbMutex s_mtx_weights ;
2017-01-20 16:27:16 +01:00
static ScoringWeights s_scoringWeights ;
2016-08-29 17:05:34 +02:00
static bool s_isCompatible [ HASHGROUP_END ] [ HASHGROUP_END ] ;
static bool s_inBody [ HASHGROUP_END ] ;
2016-07-04 15:06:15 +02:00
2016-08-29 17:05:34 +02:00
# define gbmin(a,b) ((a)<(b) ? (a) : (b))
# define gbmax(a,b) ((a)>(b) ? (a) : (b))
2016-07-04 15:06:15 +02:00
2016-08-29 17:05:34 +02:00
static inline bool isTermValueInRange ( const char * p , const QueryTerm * qt ) ;
static inline bool isTermValueInRange2 ( const char * recPtr , const char * subListEnd , const QueryTerm * qt ) ;
2017-03-23 12:08:06 +01:00
static inline const char * getWordPosList ( uint64_t docId , const char * list , int32_t listSize ) ;
2016-08-29 17:05:34 +02:00
static int docIdVoteBufKeyCompare_desc ( const void * h1 , const void * h2 ) ;
static void initWeights ( ) ;
2016-07-04 15:06:15 +02:00
//////////////////
//
// THE NEW INTERSECTION LOGIC
//
//////////////////
// Constructor. Null the query/request pointers up front so that the
// initial reset() never touches stale values.
PosdbTable::PosdbTable() {
	m_q = NULL;
	m_msg39req = NULL;
	reset();
}
// Destructor: release per-query state via reset().
PosdbTable::~PosdbTable() {
	reset();
}
void PosdbTable : : reset ( ) {
// has init() been called?
m_initialized = false ;
m_estimatedTotalHits = - 1 ;
2016-09-06 12:23:24 +02:00
//freeMem(); // not implemented
2016-07-04 15:06:15 +02:00
// does not free the mem of this safebuf, only resets length
m_docIdVoteBuf . reset ( ) ;
m_filtered = 0 ;
m_qiBuf . reset ( ) ;
// assume no-op
m_t1 = 0LL ;
m_whiteListTable . reset ( ) ;
m_addedSites = false ;
2016-09-26 17:40:48 +02:00
// Coverity
2016-09-26 20:21:19 +02:00
m_docId = 0 ;
m_hasMaxSerpScore = false ;
m_siteRankMultiplier = 0.0 ;
m_addListsTime = 0 ;
m_t2 = 0 ;
m_numSlots = 0 ;
m_maxScores = 0 ;
m_collnum = 0 ;
m_qpos = NULL ;
m_wikiPhraseIds = NULL ;
m_quotedStartIds = NULL ;
m_freqWeights = NULL ;
m_bflags = NULL ;
m_qtermNums = NULL ;
2016-10-10 16:02:50 +02:00
m_bestMinTermPairWindowScore = 0.0 ;
m_bestMinTermPairWindowPtrs = NULL ;
2016-09-26 17:40:48 +02:00
m_docsInColl = 0 ;
m_msg2 = NULL ;
m_topTree = NULL ;
m_nqt = 0 ;
m_debug = false ;
m_logstate = NULL ;
2016-10-18 14:17:39 +02:00
m_sortByTermNum = - 1 ;
m_sortByTermNumInt = - 1 ;
2016-09-26 17:40:48 +02:00
m_sortByTermInfoNum = 0 ;
m_sortByTermInfoNumInt = 0 ;
m_minScoreTermNum = 0 ;
m_maxScoreTermNum = 0 ;
m_minScoreVal = 0.0 ;
m_maxScoreVal = 0.0 ;
m_minScoreTermNumInt = 0 ;
m_maxScoreTermNumInt = 0 ;
m_minScoreValInt = 0 ;
m_maxScoreValInt = 0 ;
m_useWhiteTable = false ;
m_numQueryTermInfos = 0 ;
m_minTermListSize = 0 ;
m_minTermListIdx = 0 ;
m_vecSize = 0 ;
m_allInSameWikiPhrase = 0 ;
m_realMaxTop = 0 ;
2016-07-04 15:06:15 +02:00
}
// realloc to save mem if we're rat
void PosdbTable : : freeMem ( ) {
2016-08-29 17:05:34 +02:00
//@todo: ?
2016-07-04 15:06:15 +02:00
}
2016-08-29 17:05:34 +02:00
2016-07-04 15:06:15 +02:00
// . returns false on error and sets g_errno
// . NOTE: termFreqs is just referenced by us, not copied
// . sets m_startKeys, m_endKeys and m_minNumRecs for each termId
// . TODO: ensure that m_termFreqs[] are all UPPER BOUNDS on the actual #!!
// we should be able to get an upper bound estimate from the b-tree
// quickly using Msg36!
// . we now support multiple plus signs before the query term
// . lists[] and termFreqs[] must be 1-1 with q->m_qterms[]
2016-11-08 12:14:32 +01:00
void PosdbTable : : init ( Query * q , bool debug , void * logstate , TopTree * topTree , const DocumentIndexChecker & documentIndexChecker , Msg2 * msg2 , Msg39Request * r ) {
2016-07-04 15:06:15 +02:00
// sanity check -- watch out for double calls
2016-07-05 12:11:14 +02:00
if ( m_initialized )
gbshutdownAbort ( true ) ;
2016-07-04 15:06:15 +02:00
// clear everything
reset ( ) ;
// we are now
m_initialized = true ;
// set debug flag
2016-07-27 16:15:20 +02:00
m_debug = ( debug | | g_conf . m_logDebugQuery ) ;
2016-07-04 15:06:15 +02:00
// we should save the lists!
//m_lists = msg2->m_lists;//lists;
//m_numLists = q->m_numTerms;
// seo.cpp supplies a NULL msg2 because it already sets
// QueryTerm::m_posdbListPtrs
if ( ! msg2 ) return ;
m_msg2 = msg2 ;
// save this
m_collnum = r - > m_collnum ;
// save the request
2016-09-04 23:01:01 +02:00
m_msg39req = r ;
2016-07-04 15:06:15 +02:00
// save this
//m_coll = coll;
// get the rec for it
// CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
2016-07-05 12:11:14 +02:00
// if ( ! cr )
// gbshutdownAbort(true);
2016-07-04 15:06:15 +02:00
// set this now
//m_collnum = cr->m_collnum;
2016-11-08 12:14:32 +01:00
m_documentIndexChecker = & documentIndexChecker ;
2016-07-04 15:06:15 +02:00
m_topTree = topTree ;
// remember the query class, it has all the info about the termIds
m_q = q ;
m_nqt = q - > getNumTerms ( ) ;
// for debug msgs
m_logstate = logstate ;
m_realMaxTop = r - > m_realMaxTop ;
if ( m_realMaxTop > MAX_TOP ) m_realMaxTop = MAX_TOP ;
m_siteRankMultiplier = SITERANKMULTIPLIER ;
if ( m_q - > m_isBoolean ) m_siteRankMultiplier = 0.0 ;
// sanity
2016-07-05 12:11:14 +02:00
if ( msg2 - > getNumLists ( ) ! = m_q - > getNumTerms ( ) )
gbshutdownAbort ( true ) ;
2016-07-04 15:06:15 +02:00
// copy the list ptrs to the QueryTerm::m_posdbListPtr
for ( int32_t i = 0 ; i < m_q - > m_numTerms ; i + + )
m_q - > m_qterms [ i ] . m_posdbListPtr = msg2 - > getList ( i ) ;
// we always use it now
2016-07-05 12:11:14 +02:00
if ( ! topTree )
gbshutdownAbort ( true ) ;
2016-07-04 15:06:15 +02:00
}
2016-08-29 17:05:34 +02:00
2016-10-10 16:02:50 +02:00
//
// Find the top score for the term for each hash group.
// INLINK_TEXT may have more than one entry in the top-scorer list, other hash groups only 1.
// INLINK_TEXT may override other lower scoring hashgroups' scores
// Sum up the best scores, and return that result.
// Sets highestScoringNonBodyPos to the highest scoring position.
//
float PosdbTable : : getBestScoreSumForSingleTerm ( int32_t i , const char * wpi , const char * endi , DocIdScore * pdcs , const char * * highestScoringNonBodyPos ) {
2016-07-04 15:06:15 +02:00
float nonBodyMax = - 1.0 ;
2016-10-13 14:49:49 +02:00
int32_t lowestScoreTermIdx = 0 ;
2016-09-21 22:26:13 +02:00
float bestScores [ MAX_TOP ] = { 0 } ; // make Coverity happy
2016-09-16 14:49:36 +02:00
const char * bestwpi [ MAX_TOP ] ;
2016-07-04 15:06:15 +02:00
int32_t numTop = 0 ;
2016-09-06 12:30:26 +02:00
logTrace ( g_conf . m_logTracePosdb , " BEGIN. " ) ;
2016-08-25 17:27:37 +02:00
2016-07-04 15:06:15 +02:00
// assume no terms!
2016-10-10 16:02:50 +02:00
* highestScoringNonBodyPos = NULL ;
2016-07-04 15:06:15 +02:00
2016-09-20 14:17:20 +02:00
if ( wpi ) {
2017-06-01 16:56:20 +02:00
// Sanity check
if ( wpi > = endi ) {
logTrace ( g_conf . m_logTracePosdb , " END, wpi %p >= %p " , wpi , endi ) ;
return - 1.0 ;
}
2016-09-18 17:00:46 +02:00
# ifdef _VALGRIND_
2016-09-20 14:17:20 +02:00
VALGRIND_CHECK_MEM_IS_DEFINED ( wpi , endi - wpi ) ;
2016-09-18 17:00:46 +02:00
# endif
2016-07-04 15:06:15 +02:00
bool first = true ;
2016-10-10 16:02:50 +02:00
char bestmhg [ MAX_TOP ] ;
2016-07-04 15:06:15 +02:00
do {
float score = 100.0 ;
// good diversity?
2016-09-06 12:46:08 +02:00
unsigned char div = Posdb : : getDiversityRank ( wpi ) ;
2017-01-26 15:24:44 +01:00
score * = m_msg39req - > m_scoringWeights . m_diversityWeights [ div ] ;
score * = m_msg39req - > m_scoringWeights . m_diversityWeights [ div ] ;
2016-07-04 15:06:15 +02:00
// hash group? title? body? heading? etc.
2016-09-06 12:46:08 +02:00
unsigned char hg = Posdb : : getHashGroup ( wpi ) ;
2016-07-04 15:06:15 +02:00
unsigned char mhg = hg ;
if ( s_inBody [ mhg ] ) mhg = HASHGROUP_BODY ;
2017-01-26 15:24:44 +01:00
score * = m_msg39req - > m_scoringWeights . m_hashGroupWeights [ hg ] ;
score * = m_msg39req - > m_scoringWeights . m_hashGroupWeights [ hg ] ;
2016-07-04 15:06:15 +02:00
// good density?
2016-09-06 12:46:08 +02:00
unsigned char dens = Posdb : : getDensityRank ( wpi ) ;
2017-01-26 15:24:44 +01:00
score * = m_msg39req - > m_scoringWeights . m_densityWeights [ dens ] ;
score * = m_msg39req - > m_scoringWeights . m_densityWeights [ dens ] ;
2016-07-04 15:06:15 +02:00
// to make more compatible with pair scores divide by distance of 2
//score /= 2.0;
// word spam?
2016-09-06 12:46:08 +02:00
unsigned char wspam = Posdb : : getWordSpamRank ( wpi ) ;
2016-07-04 15:06:15 +02:00
// word spam weight update
if ( hg = = HASHGROUP_INLINKTEXT ) {
2017-01-26 15:24:44 +01:00
score * = m_msg39req - > m_scoringWeights . m_linkerWeights [ wspam ] ;
score * = m_msg39req - > m_scoringWeights . m_linkerWeights [ wspam ] ;
2016-07-04 15:06:15 +02:00
}
else {
2017-01-26 15:24:44 +01:00
score * = m_msg39req - > m_scoringWeights . m_wordSpamWeights [ wspam ] ;
score * = m_msg39req - > m_scoringWeights . m_wordSpamWeights [ wspam ] ;
2016-07-04 15:06:15 +02:00
}
// synonym
2016-09-06 12:46:08 +02:00
if ( Posdb : : getIsSynonym ( wpi ) ) {
2017-01-20 14:52:54 +01:00
score * = m_msg39req - > m_synonymWeight ;
score * = m_msg39req - > m_synonymWeight ;
2016-07-04 15:06:15 +02:00
}
// do not allow duplicate hashgroups!
int32_t bro = - 1 ;
2016-09-21 22:26:13 +02:00
for ( int32_t k = 0 ; k < numTop ; k + + ) {
if ( bestmhg [ k ] = = mhg & & hg ! = HASHGROUP_INLINKTEXT ) {
2016-07-04 15:06:15 +02:00
bro = k ;
break ;
}
}
2016-09-21 22:26:13 +02:00
2016-07-04 15:06:15 +02:00
if ( bro > = 0 ) {
2016-10-10 16:02:50 +02:00
//
// We already have a score for this hash group, update if
// current score is higher
//
2016-07-04 15:06:15 +02:00
if ( score > bestScores [ bro ] ) {
bestScores [ bro ] = score ;
bestwpi [ bro ] = wpi ;
bestmhg [ bro ] = mhg ;
}
}
2016-09-21 22:26:13 +02:00
else
if ( numTop < m_realMaxTop ) { // MAX_TOP ) {
2016-10-10 16:02:50 +02:00
//
// New hash group (or INLINKTEXT).
// We have free slots in the top-list.
// Store found score.
//
2016-07-04 15:06:15 +02:00
bestScores [ numTop ] = score ;
bestwpi [ numTop ] = wpi ;
bestmhg [ numTop ] = mhg ;
numTop + + ;
}
2016-09-21 22:26:13 +02:00
else
2016-10-13 14:49:49 +02:00
if ( score > bestScores [ lowestScoreTermIdx ] ) {
2016-10-10 16:02:50 +02:00
//
// New hash group (or INLINKTEXT).
// We have NO free slots in the top-list.
// Replace lowest score in top-list with current higher score.
//
2016-10-13 14:49:49 +02:00
bestScores [ lowestScoreTermIdx ] = score ;
bestwpi [ lowestScoreTermIdx ] = wpi ;
bestmhg [ lowestScoreTermIdx ] = mhg ;
2016-07-04 15:06:15 +02:00
}
2016-10-13 14:49:49 +02:00
// If top-list is full, make lowestScoreTermIdx point to the lowest score
2016-10-10 16:02:50 +02:00
// in the top-list.
2016-07-04 15:06:15 +02:00
if ( numTop > = m_realMaxTop ) { // MAX_TOP ) {
2016-10-13 14:49:49 +02:00
lowestScoreTermIdx = 0 ;
2016-09-21 22:26:13 +02:00
for ( int32_t k = 1 ; k < m_realMaxTop ; k + + ) { //MAX_TOP ; k++ ) {
2016-10-13 14:49:49 +02:00
if ( bestScores [ k ] > bestScores [ lowestScoreTermIdx ] ) {
2016-09-21 22:26:13 +02:00
continue ;
}
2016-10-13 14:49:49 +02:00
lowestScoreTermIdx = k ;
2016-07-04 15:06:15 +02:00
}
}
2016-10-10 16:02:50 +02:00
// For findMinTermPairScoreInWindow() sub-out algo.
// If the term is not in the body, and the score is the
// highest non-body term score, return the position index.
2016-07-04 15:06:15 +02:00
if ( score > nonBodyMax & & ! s_inBody [ hg ] ) {
nonBodyMax = score ;
2016-10-10 16:02:50 +02:00
* highestScoringNonBodyPos = wpi ;
2016-07-04 15:06:15 +02:00
}
// first key is 12 bytes
2016-09-21 22:26:13 +02:00
if ( first ) {
wpi + = 6 ;
first = false ;
}
2016-07-04 15:06:15 +02:00
// advance
wpi + = 6 ;
2016-10-10 16:02:50 +02:00
2016-09-06 12:46:08 +02:00
} while ( wpi < endi & & Posdb : : getKeySize ( wpi ) = = 6 ) ;
2016-07-04 15:06:15 +02:00
}
// add up the top scores
float sum = 0.0 ;
for ( int32_t k = 0 ; k < numTop ; k + + ) {
// if it is something like "enough for" in a wikipedia
// phrase like "time enough for love" give it a boost!
// now we set a special bit in the keys since we do a mini
// merge, we do the same thing for the syn bits
2016-09-21 22:26:13 +02:00
if ( Posdb : : getIsHalfStopWikiBigram ( bestwpi [ k ] ) ) {
sum + = ( bestScores [ k ] * WIKI_BIGRAM_WEIGHT * WIKI_BIGRAM_WEIGHT ) ;
}
else {
// otherwise just add it up
2016-07-04 15:06:15 +02:00
sum + = bestScores [ k ] ;
2016-09-21 22:26:13 +02:00
}
2016-07-04 15:06:15 +02:00
}
// wiki weight
//sum *= ts;
sum * = m_freqWeights [ i ] ;
sum * = m_freqWeights [ i ] ;
// shortcut
//char *maxp = bestwpi[k];
// if terms is a special wiki half stop bigram
//if ( m_bflags[i] & BF_HALFSTOPWIKIBIGRAM ) {
// sum *= WIKI_BIGRAM_WEIGHT;
// sum *= WIKI_BIGRAM_WEIGHT;
//}
// empty list?
//if ( ! wpi ) sum = -2.0;
//
// end the loop. return now if not collecting scoring info.
//
2016-08-25 17:27:37 +02:00
if ( ! pdcs ) {
logTrace ( g_conf . m_logTracePosdb , " END. " ) ;
return sum ;
}
2016-10-10 16:02:50 +02:00
2016-10-18 14:17:39 +02:00
//#
//# The below is for visual presentation of the scoring ONLY
//#
2016-08-25 17:27:37 +02:00
2016-07-04 15:06:15 +02:00
// none? wtf?
2016-08-25 17:27:37 +02:00
if ( numTop < = 0 ) {
logTrace ( g_conf . m_logTracePosdb , " END. " ) ;
return sum ;
}
2016-07-04 15:06:15 +02:00
// point into buf
2016-10-18 21:22:29 +02:00
SingleScore * sx = ( SingleScore * ) m_singleScoreBuf . getBufPtr ( ) ;
2016-07-04 15:06:15 +02:00
int32_t need = sizeof ( SingleScore ) * numTop ;
2016-09-21 22:26:13 +02:00
2016-07-04 15:06:15 +02:00
// point to that
2016-09-21 22:26:13 +02:00
if ( pdcs - > m_singlesOffset < 0 ) {
2016-07-04 15:06:15 +02:00
pdcs - > m_singlesOffset = m_singleScoreBuf . length ( ) ;
2016-09-21 22:26:13 +02:00
}
2016-07-04 15:06:15 +02:00
// reset this i guess
pdcs - > m_singleScores = NULL ;
2016-09-21 22:26:13 +02:00
2016-07-04 15:06:15 +02:00
// sanity
if ( m_singleScoreBuf . getAvail ( ) < need ) {
static bool s_first = true ;
2016-09-21 22:26:13 +02:00
if ( s_first ) {
log ( " posdb: CRITICAL single buf overflow " ) ;
}
2016-07-04 15:06:15 +02:00
s_first = false ;
2016-08-25 17:27:37 +02:00
logTrace ( g_conf . m_logTracePosdb , " END. " ) ;
2016-07-04 15:06:15 +02:00
return sum ;
2016-07-05 12:11:14 +02:00
//gbshutdownAbort(true); }
2016-07-04 15:06:15 +02:00
}
2016-09-21 22:26:13 +02:00
2016-07-04 15:06:15 +02:00
// increase buf ptr over this then
m_singleScoreBuf . incrementLength ( need ) ;
// set each of the top scoring terms individiually
for ( int32_t k = 0 ; k < numTop ; k + + , sx + + ) {
// udpate count
pdcs - > m_numSingles + + ;
2016-09-16 14:49:36 +02:00
const char * maxp = bestwpi [ k ] ;
2016-07-04 15:06:15 +02:00
memset ( sx , 0 , sizeof ( * sx ) ) ;
2016-09-21 22:26:13 +02:00
sx - > m_isSynonym = Posdb : : getIsSynonym ( maxp ) ;
sx - > m_isHalfStopWikiBigram = Posdb : : getIsHalfStopWikiBigram ( maxp ) ;
2016-07-04 15:06:15 +02:00
//sx->m_isSynonym = (m_bflags[i] & BF_SYNONYM) ;
2016-09-06 12:46:08 +02:00
sx - > m_diversityRank = Posdb : : getDiversityRank ( maxp ) ;
sx - > m_wordSpamRank = Posdb : : getWordSpamRank ( maxp ) ;
sx - > m_hashGroup = Posdb : : getHashGroup ( maxp ) ;
sx - > m_wordPos = Posdb : : getWordPos ( maxp ) ;
sx - > m_densityRank = Posdb : : getDensityRank ( maxp ) ;
2016-09-21 22:26:13 +02:00
2016-07-04 15:06:15 +02:00
float score = bestScores [ k ] ;
2017-04-30 20:23:09 +02:00
2016-07-04 15:06:15 +02:00
//score *= ts;
score * = m_freqWeights [ i ] ;
score * = m_freqWeights [ i ] ;
// if terms is a special wiki half stop bigram
if ( sx - > m_isHalfStopWikiBigram ) {
score * = WIKI_BIGRAM_WEIGHT ;
score * = WIKI_BIGRAM_WEIGHT ;
}
sx - > m_finalScore = score ;
sx - > m_tfWeight = m_freqWeights [ i ] ;
sx - > m_qtermNum = m_qtermNums [ i ] ;
2016-09-04 23:01:01 +02:00
//int64_t *termFreqs = (int64_t *)m_msg39req->ptr_termFreqs;
2016-07-04 15:06:15 +02:00
//sx->m_termFreq = termFreqs[sx->m_qtermNum];
sx - > m_bflags = m_bflags [ i ] ;
}
2016-08-25 17:27:37 +02:00
logTrace ( g_conf . m_logTracePosdb , " END. sum=%f " , sum ) ;
2016-07-04 15:06:15 +02:00
return sum ;
}
2016-08-25 17:27:37 +02:00
2016-07-04 15:06:15 +02:00
// . advace two ptrs at the same time so it's just a linear scan
// . TODO: add all up, then basically taking a weight of the top 6 or so...
2016-10-10 16:02:50 +02:00
float PosdbTable : : getMaxScoreForNonBodyTermPair ( const char * wpi , const char * wpj , const char * endi ,
const char * endj , int32_t qdist ) {
2016-09-16 14:36:19 +02:00
// Sanity check
if ( wpi > = endi | | wpj > = endj ) {
2016-10-10 16:02:50 +02:00
return - 1.0 ;
2016-09-16 14:36:19 +02:00
}
2016-09-16 12:42:14 +02:00
# ifdef _VALGRIND_
VALGRIND_CHECK_MEM_IS_DEFINED ( wpi , endi - wpi ) ;
VALGRIND_CHECK_MEM_IS_DEFINED ( wpj , endj - wpj ) ;
# endif
2016-08-25 17:27:37 +02:00
logTrace ( g_conf . m_logTracePosdb , " BEGIN. " ) ;
2016-09-06 12:46:08 +02:00
int32_t p1 = Posdb : : getWordPos ( wpi ) ;
int32_t p2 = Posdb : : getWordPos ( wpj ) ;
2016-07-04 15:06:15 +02:00
2016-09-06 15:57:55 +02:00
unsigned char hg1 = Posdb : : getHashGroup ( wpi ) ;
unsigned char hg2 = Posdb : : getHashGroup ( wpj ) ;
2016-07-04 15:06:15 +02:00
2016-09-19 15:03:10 +02:00
//temporary fix: posdb can have junk in it so clamp the hashgroup to the limit
if ( hg1 > = HASHGROUP_END ) hg1 = HASHGROUP_END - 1 ;
if ( hg2 > = HASHGROUP_END ) hg2 = HASHGROUP_END - 1 ;
2016-09-06 12:46:08 +02:00
unsigned char wsr1 = Posdb : : getWordSpamRank ( wpi ) ;
unsigned char wsr2 = Posdb : : getWordSpamRank ( wpj ) ;
2016-07-04 15:06:15 +02:00
float spamw1 ;
float spamw2 ;
2016-09-06 15:57:55 +02:00
if ( hg1 = = HASHGROUP_INLINKTEXT ) {
2017-01-26 15:24:44 +01:00
spamw1 = m_msg39req - > m_scoringWeights . m_linkerWeights [ wsr1 ] ;
2016-09-06 15:57:55 +02:00
}
else {
2017-01-26 15:24:44 +01:00
spamw1 = m_msg39req - > m_scoringWeights . m_wordSpamWeights [ wsr1 ] ;
2016-09-06 15:57:55 +02:00
}
if ( hg2 = = HASHGROUP_INLINKTEXT ) {
2017-01-26 15:24:44 +01:00
spamw2 = m_msg39req - > m_scoringWeights . m_linkerWeights [ wsr2 ] ;
2016-09-06 15:57:55 +02:00
}
else {
2017-01-26 15:24:44 +01:00
spamw2 = m_msg39req - > m_scoringWeights . m_wordSpamWeights [ wsr2 ] ;
2016-09-06 15:57:55 +02:00
}
2016-07-04 15:06:15 +02:00
// density weight
//float denw ;
//if ( hg1 == HASHGROUP_BODY ) denw = 1.0;
2017-01-26 15:24:44 +01:00
float denw1 = m_msg39req - > m_scoringWeights . m_densityWeights [ Posdb : : getDensityRank ( wpi ) ] ;
float denw2 = m_msg39req - > m_scoringWeights . m_densityWeights [ Posdb : : getDensityRank ( wpj ) ] ;
2016-07-04 15:06:15 +02:00
bool firsti = true ;
bool firstj = true ;
float score ;
float max = - 1.0 ;
int32_t dist ;
2016-09-06 15:57:55 +02:00
for ( ; ; ) {
if ( p1 < = p2 ) {
// . skip the pair if they are in different hashgroups
// . we no longer allow either to be in the body in this
// algo because we handle those cases in the sliding window
// algo!
if ( s_isCompatible [ hg1 ] [ hg2 ] ) {
// git distance
dist = p2 - p1 ;
// if zero, make sure its 2. this happens when the same bigram
// is used by both terms. i.e. street uses the bigram
// 'street light' and so does 'light'. so the wordpositions
// are exactly the same!
if ( dist < 2 ) {
dist = 2 ;
}
// fix distance if in different non-body hashgroups
if ( dist > 50 ) {
dist = FIXED_DISTANCE ;
}
2016-07-04 15:06:15 +02:00
2016-09-06 15:57:55 +02:00
// subtract from the dist the terms are apart in the query
if ( dist > = qdist ) {
dist = dist - qdist ;
}
// good density?
score = 100 * denw1 * denw2 ;
// hashgroup modifier
2017-01-26 15:24:44 +01:00
score * = m_msg39req - > m_scoringWeights . m_hashGroupWeights [ hg1 ] ;
score * = m_msg39req - > m_scoringWeights . m_hashGroupWeights [ hg2 ] ;
2016-09-06 15:57:55 +02:00
// if synonym or alternate word form
if ( Posdb : : getIsSynonym ( wpi ) ) {
2017-01-20 14:56:49 +01:00
score * = m_msg39req - > m_synonymWeight ;
2016-09-06 15:57:55 +02:00
}
if ( Posdb : : getIsSynonym ( wpj ) ) {
2017-01-20 14:56:49 +01:00
score * = m_msg39req - > m_synonymWeight ;
2016-09-06 15:57:55 +02:00
}
// word spam weights
score * = spamw1 * spamw2 ;
// huge title? do not allow 11th+ word to be weighted high
//if ( hg1 == HASHGROUP_TITLE && dist > 20 )
2017-01-26 15:24:44 +01:00
// score /= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg1];
2016-09-06 15:57:55 +02:00
// mod by distance
score / = ( dist + 1.0 ) ;
// tmp hack
//score *= (dist+1.0);
// best?
if ( score > max ) {
max = score ;
}
}
// first key is 12 bytes
if ( firsti ) {
wpi + = 6 ;
firsti = false ;
}
// advance
wpi + = 6 ;
// end of list?
if ( wpi > = endi ) {
break ; // exit for(;;) loop
}
// exhausted?
if ( Posdb : : getKeySize ( wpi ) ! = 6 ) {
break ; // exit for(;;) loop
}
// update. include G-bits?
p1 = Posdb : : getWordPos ( wpi ) ;
// hash group update
hg1 = Posdb : : getHashGroup ( wpi ) ;
// update density weight in case hash group changed
2017-01-26 15:24:44 +01:00
denw1 = m_msg39req - > m_scoringWeights . m_densityWeights [ Posdb : : getDensityRank ( wpi ) ] ;
2016-09-06 15:57:55 +02:00
// word spam weight update
if ( hg1 = = HASHGROUP_INLINKTEXT ) {
2017-01-26 15:24:44 +01:00
spamw1 = m_msg39req - > m_scoringWeights . m_linkerWeights [ Posdb : : getWordSpamRank ( wpi ) ] ;
2016-09-06 15:57:55 +02:00
}
else {
2017-01-26 15:24:44 +01:00
spamw1 = m_msg39req - > m_scoringWeights . m_wordSpamWeights [ Posdb : : getWordSpamRank ( wpi ) ] ;
2016-09-06 15:57:55 +02:00
}
2016-07-04 15:06:15 +02:00
}
else {
2016-09-06 15:57:55 +02:00
// . skip the pair if they are in different hashgroups
// . we no longer allow either to be in the body in this
// algo because we handle those cases in the sliding window
// algo!
if ( s_isCompatible [ hg1 ] [ hg2 ] ) {
// get distance
dist = p1 - p2 ;
// if zero, make sure its 2. this happens when the same bigram
// is used by both terms. i.e. street uses the bigram
// 'street light' and so does 'light'. so the wordpositions
// are exactly the same!
if ( dist < 2 ) dist = 2 ;
// fix distance if in different non-body hashgroups
if ( dist > 50 ) {
dist = FIXED_DISTANCE ;
}
2016-07-04 15:06:15 +02:00
2016-09-06 15:57:55 +02:00
// subtract from the dist the terms are apart in the query
if ( dist > = qdist ) {
dist = dist - qdist ;
// add 1 for being out of order
dist + = qdist - 1 ;
}
else {
//dist = dist - qdist;
// add 1 for being out of order
dist + = 1 ; // qdist - 1;
}
// compute score based on that junk
//score = (MAXWORDPOS+1) - dist;
// good diversity? uneeded for pair algo
2017-01-26 15:24:44 +01:00
//score *= m_msg39req->m_scoringWeights.m_diversityWeights[div1];
//score *= m_msg39req->m_scoringWeights.m_diversityWeights[div2];
2016-09-06 15:57:55 +02:00
// good density?
score = 100 * denw1 * denw2 ;
// hashgroup modifier
2017-01-26 15:24:44 +01:00
score * = m_msg39req - > m_scoringWeights . m_hashGroupWeights [ hg1 ] ;
score * = m_msg39req - > m_scoringWeights . m_hashGroupWeights [ hg2 ] ;
2016-09-06 15:57:55 +02:00
// if synonym or alternate word form
2017-01-20 14:56:49 +01:00
if ( Posdb : : getIsSynonym ( wpi ) ) score * = m_msg39req - > m_synonymWeight ;
if ( Posdb : : getIsSynonym ( wpj ) ) score * = m_msg39req - > m_synonymWeight ;
//if ( m_bflags[i] & BF_SYNONYM ) score *= m_msg39req->m_synonymWeight;
//if ( m_bflags[j] & BF_SYNONYM ) score *= m_msg39req->m_synonymWeight;
2016-09-06 15:57:55 +02:00
// word spam weights
score * = spamw1 * spamw2 ;
// huge title? do not allow 11th+ word to be weighted high
//if ( hg1 == HASHGROUP_TITLE && dist > 20 )
2017-01-26 15:24:44 +01:00
// score /= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg1];
2016-09-06 15:57:55 +02:00
// mod by distance
score / = ( dist + 1.0 ) ;
// tmp hack
//score *= (dist+1.0);
// best?
if ( score > max ) {
max = score ;
}
}
// first key is 12 bytes
if ( firstj ) {
wpj + = 6 ;
firstj = false ;
}
// advance
wpj + = 6 ;
// end of list?
if ( wpj > = endj ) {
break ; // exit for(;;) loop
}
// exhausted?
if ( Posdb : : getKeySize ( wpj ) ! = 6 ) {
break ; // exit for(;;) loop
}
// update
p2 = Posdb : : getWordPos ( wpj ) ;
// hash group update
hg2 = Posdb : : getHashGroup ( wpj ) ;
// update density weight in case hash group changed
2017-01-26 15:24:44 +01:00
denw2 = m_msg39req - > m_scoringWeights . m_densityWeights [ Posdb : : getDensityRank ( wpj ) ] ;
2016-09-06 15:57:55 +02:00
// word spam weight update
if ( hg2 = = HASHGROUP_INLINKTEXT ) {
2017-01-26 15:24:44 +01:00
spamw2 = m_msg39req - > m_scoringWeights . m_linkerWeights [ Posdb : : getWordSpamRank ( wpj ) ] ;
2016-09-06 15:57:55 +02:00
}
else {
2017-01-26 15:24:44 +01:00
spamw2 = m_msg39req - > m_scoringWeights . m_wordSpamWeights [ Posdb : : getWordSpamRank ( wpj ) ] ;
2016-09-06 15:57:55 +02:00
}
2016-08-25 17:27:37 +02:00
}
2016-07-04 15:06:15 +02:00
}
2016-08-25 17:27:37 +02:00
logTrace ( g_conf . m_logTracePosdb , " END. " ) ;
2016-10-10 16:02:50 +02:00
return max ;
2016-07-04 15:06:15 +02:00
}
2016-08-29 17:05:34 +02:00
2017-06-01 16:46:14 +02:00
float PosdbTable : : getScoreForTermPair ( const char * wpi , const char * wpj , int32_t fixedDistance , int32_t qdist ) {
2016-08-25 17:27:37 +02:00
logTrace ( g_conf . m_logTracePosdb , " BEGIN. " ) ;
if ( ! wpi ) {
logTrace ( g_conf . m_logTracePosdb , " END. " ) ;
return - 1.00 ;
}
if ( ! wpj ) {
logTrace ( g_conf . m_logTracePosdb , " END. " ) ;
return - 1.00 ;
}
2016-07-04 15:06:15 +02:00
2016-09-16 13:58:44 +02:00
# ifdef _VALGRIND_
VALGRIND_CHECK_MEM_IS_DEFINED ( wpi , 6 ) ;
VALGRIND_CHECK_MEM_IS_DEFINED ( wpj , 6 ) ;
# endif
2016-09-06 12:46:08 +02:00
int32_t p1 = Posdb : : getWordPos ( wpi ) ;
int32_t p2 = Posdb : : getWordPos ( wpj ) ;
unsigned char hg1 = Posdb : : getHashGroup ( wpi ) ;
unsigned char hg2 = Posdb : : getHashGroup ( wpj ) ;
unsigned char wsr1 = Posdb : : getWordSpamRank ( wpi ) ;
unsigned char wsr2 = Posdb : : getWordSpamRank ( wpj ) ;
2016-07-04 15:06:15 +02:00
float spamw1 ;
float spamw2 ;
float denw1 ;
float denw2 ;
float dist ;
float score ;
2017-01-26 15:24:44 +01:00
if ( hg1 = = HASHGROUP_INLINKTEXT ) spamw1 = m_msg39req - > m_scoringWeights . m_linkerWeights [ wsr1 ] ;
else spamw1 = m_msg39req - > m_scoringWeights . m_wordSpamWeights [ wsr1 ] ;
if ( hg2 = = HASHGROUP_INLINKTEXT ) spamw2 = m_msg39req - > m_scoringWeights . m_linkerWeights [ wsr2 ] ;
else spamw2 = m_msg39req - > m_scoringWeights . m_wordSpamWeights [ wsr2 ] ;
denw1 = m_msg39req - > m_scoringWeights . m_densityWeights [ Posdb : : getDensityRank ( wpi ) ] ;
denw2 = m_msg39req - > m_scoringWeights . m_densityWeights [ Posdb : : getDensityRank ( wpj ) ] ;
2016-07-04 15:06:15 +02:00
// set this
if ( fixedDistance ! = 0 ) {
dist = fixedDistance ;
}
else {
// do the math now
if ( p2 < p1 ) dist = p1 - p2 ;
else dist = p2 - p1 ;
// if zero, make sure its 2. this happens when the same bigram
// is used by both terms. i.e. street uses the bigram
// 'street light' and so does 'light'. so the wordpositions
// are exactly the same!
if ( dist < 2 ) dist = 2 ;
// subtract from the dist the terms are apart in the query
2017-06-01 16:46:14 +02:00
if ( dist > = qdist ) dist = dist - qdist ;
2016-07-04 15:06:15 +02:00
// out of order? penalize by 1 unit
if ( p2 < p1 ) dist + = 1 ;
}
// TODO: use left and right diversity if no matching query term
// is on the left or right
2017-01-26 15:24:44 +01:00
//score *= m_msg39req->m_scoringWeights.m_diversityWeights[div1];
//score *= m_msg39req->m_scoringWeights.m_diversityWeights[div2];
2016-07-04 15:06:15 +02:00
// good density?
score = 100 * denw1 * denw2 ;
// wikipedia phrase weight
//score *= ts;
// hashgroup modifier
2017-01-26 15:24:44 +01:00
score * = m_msg39req - > m_scoringWeights . m_hashGroupWeights [ hg1 ] ;
score * = m_msg39req - > m_scoringWeights . m_hashGroupWeights [ hg2 ] ;
2016-07-04 15:06:15 +02:00
// if synonym or alternate word form
2017-01-20 14:56:49 +01:00
if ( Posdb : : getIsSynonym ( wpi ) ) score * = m_msg39req - > m_synonymWeight ;
if ( Posdb : : getIsSynonym ( wpj ) ) score * = m_msg39req - > m_synonymWeight ;
//if ( m_bflags[i] & BF_SYNONYM ) score *= m_msg39req->m_synonymWeight;
//if ( m_bflags[j] & BF_SYNONYM ) score *= m_msg39req->m_synonymWeight;
2016-07-04 15:06:15 +02:00
// word spam weights
score * = spamw1 * spamw2 ;
// mod by distance
score / = ( dist + 1.0 ) ;
// tmp hack
//score *= (dist+1.0);
2016-08-25 17:27:37 +02:00
logTrace ( g_conf . m_logTracePosdb , " END. score=%f " , score ) ;
2016-07-04 15:06:15 +02:00
return score ;
}
2016-08-25 17:27:37 +02:00
2016-07-04 15:06:15 +02:00
// . advance two ptrs at the same time so it's just a linear scan
// . TODO: add all up, then basically taking a weight of the top 6 or so...
2016-10-10 16:02:50 +02:00
// . skip body terms not in the sliding window as defined by m_bestMinTermPairWindowPtrs[]
2016-07-04 15:06:15 +02:00
float PosdbTable : : getTermPairScoreForAny ( int32_t i , int32_t j ,
const char * wpi , const char * wpj ,
const char * endi , const char * endj ,
DocIdScore * pdcs ) {
// wiki phrase weight?
float wts ;
2016-08-25 17:27:37 +02:00
logTrace ( g_conf . m_logTracePosdb , " BEGIN. " ) ;
2016-07-04 15:06:15 +02:00
int32_t qdist ;
// but if they are in the same wikipedia phrase
// then try to keep their positions as in the query.
// so for 'time enough for love' ideally we want
// 'time' to be 6 units apart from 'love'
2016-09-06 16:12:40 +02:00
if ( m_wikiPhraseIds [ j ] = = m_wikiPhraseIds [ i ] & & m_wikiPhraseIds [ j ] ) { // zero means not in a phrase
2016-07-04 15:06:15 +02:00
qdist = m_qpos [ j ] - m_qpos [ i ] ;
// wiki weight
2016-09-06 16:12:40 +02:00
wts = ( float ) WIKI_WEIGHT ;
2016-07-04 15:06:15 +02:00
}
else {
// basically try to get query words as close
// together as possible
qdist = 2 ;
// this should help fix
// 'what is an unsecured loan' so we are more likely
// to get the page that has that exact phrase in it.
// yes, but hurts how to make a lock pick set.
//qdist = qpos[j] - qpos[i];
// wiki weight
wts = 1.0 ;
}
bool inSameQuotedPhrase = false ;
2016-09-06 16:12:40 +02:00
if ( m_quotedStartIds [ i ] = = m_quotedStartIds [ j ] & & m_quotedStartIds [ i ] > = 0 ) {
2016-07-04 15:06:15 +02:00
inSameQuotedPhrase = true ;
2016-09-06 16:12:40 +02:00
}
2016-07-04 15:06:15 +02:00
// oops.. this was not counting non-space punct for 2 units
// instead of 1
2016-09-06 16:12:40 +02:00
if ( inSameQuotedPhrase ) {
2016-07-04 15:06:15 +02:00
qdist = m_qpos [ j ] - m_qpos [ i ] ;
2016-09-06 16:12:40 +02:00
}
2016-07-04 15:06:15 +02:00
2016-09-06 16:12:40 +02:00
int32_t p1 = Posdb : : getWordPos ( wpi ) ;
int32_t p2 = Posdb : : getWordPos ( wpj ) ;
2016-07-04 15:06:15 +02:00
2016-09-06 12:46:08 +02:00
unsigned char hg1 = Posdb : : getHashGroup ( wpi ) ;
unsigned char hg2 = Posdb : : getHashGroup ( wpj ) ;
2016-07-04 15:06:15 +02:00
// reduce to either HASHGROUP_BODY/TITLE/INLINK/META
unsigned char mhg1 = hg1 ;
unsigned char mhg2 = hg2 ;
2016-09-06 16:12:40 +02:00
if ( s_inBody [ mhg1 ] ) {
mhg1 = HASHGROUP_BODY ;
}
if ( s_inBody [ mhg2 ] ) {
mhg2 = HASHGROUP_BODY ;
}
2016-07-04 15:06:15 +02:00
2016-09-06 12:46:08 +02:00
unsigned char wsr1 = Posdb : : getWordSpamRank ( wpi ) ;
unsigned char wsr2 = Posdb : : getWordSpamRank ( wpj ) ;
2016-07-04 15:06:15 +02:00
float spamw1 ;
float spamw2 ;
2016-09-06 16:12:40 +02:00
if ( hg1 = = HASHGROUP_INLINKTEXT ) {
2017-01-26 15:24:44 +01:00
spamw1 = m_msg39req - > m_scoringWeights . m_linkerWeights [ wsr1 ] ;
2016-09-06 16:12:40 +02:00
}
else {
2017-01-26 15:24:44 +01:00
spamw1 = m_msg39req - > m_scoringWeights . m_wordSpamWeights [ wsr1 ] ;
2016-09-06 16:12:40 +02:00
}
2016-07-04 15:06:15 +02:00
2016-09-06 16:12:40 +02:00
if ( hg2 = = HASHGROUP_INLINKTEXT ) {
2017-01-26 15:24:44 +01:00
spamw2 = m_msg39req - > m_scoringWeights . m_linkerWeights [ wsr2 ] ;
2016-09-06 16:12:40 +02:00
}
else {
2017-01-26 15:24:44 +01:00
spamw2 = m_msg39req - > m_scoringWeights . m_wordSpamWeights [ wsr2 ] ;
2016-09-06 16:12:40 +02:00
}
2016-07-04 15:06:15 +02:00
// density weight
2017-01-26 15:24:44 +01:00
float denw1 = m_msg39req - > m_scoringWeights . m_densityWeights [ Posdb : : getDensityRank ( wpi ) ] ;
float denw2 = m_msg39req - > m_scoringWeights . m_densityWeights [ Posdb : : getDensityRank ( wpj ) ] ;
2016-07-04 15:06:15 +02:00
bool firsti = true ;
bool firstj = true ;
float score ;
2016-10-13 14:49:49 +02:00
int32_t lowestScoreTermIdx = - 1 ;
2016-09-21 22:26:13 +02:00
float bestScores [ MAX_TOP ] = { 0 } ; // make Coverity happy
2016-07-04 15:06:15 +02:00
const char * bestwpi [ MAX_TOP ] ;
const char * bestwpj [ MAX_TOP ] ;
char bestmhg1 [ MAX_TOP ] ;
char bestmhg2 [ MAX_TOP ] ;
char bestFixed [ MAX_TOP ] ;
int32_t numTop = 0 ;
int32_t dist ;
bool fixedDistance ;
int32_t bro ;
char syn1 ;
char syn2 ;
2016-09-06 16:12:40 +02:00
for ( ; ; ) {
// . if p1/p2 is in body and not in window, skip
// . this is how we restrict all body terms to the winning
// sliding window
2016-10-10 16:02:50 +02:00
if ( s_inBody [ hg1 ] & & wpi ! = m_bestMinTermPairWindowPtrs [ i ] ) {
2016-09-06 16:12:40 +02:00
goto skip1 ;
}
2016-10-10 16:02:50 +02:00
if ( s_inBody [ hg2 ] & & wpj ! = m_bestMinTermPairWindowPtrs [ j ] ) {
2016-09-06 16:12:40 +02:00
goto skip2 ;
}
2016-07-04 15:06:15 +02:00
2016-09-06 16:12:40 +02:00
// make this strictly < now and not <= because in the event
// of bigram terms, where p1==p2 we'd like to advance p2/wj to
// point to the non-syn single term in order to get a better score
// to fix the 'search engine' query on gigablast.com
if ( p1 < = p2 ) {
// git distance
dist = p2 - p1 ;
// if in the same quoted phrase, order is bad!
if ( inSameQuotedPhrase ) {
// debug
//log("dddx: i=%" PRId32" j=%" PRId32" dist=%" PRId32" qdist=%" PRId32" posi=%" PRId32" "
// "posj=%" PRId32,
// i,j,dist,qdist,p1,p2);
// TODO: allow for off by 1
// if it has punct in it then dist will be 3,
// just a space or similar then dist should be 2.
if ( dist > qdist & & dist - qdist > = 2 ) {
goto skip1 ;
}
if ( dist < qdist & & qdist - dist > = 2 ) {
goto skip1 ;
}
}
2016-07-04 15:06:15 +02:00
2016-09-06 16:12:40 +02:00
// are either synonyms
syn1 = Posdb : : getIsSynonym ( wpi ) ;
syn2 = Posdb : : getIsSynonym ( wpj ) ;
// if zero, make sure its 2. this happens when the same bigram
// is used by both terms. i.e. street uses the bigram
// 'street light' and so does 'light'. so the wordpositions
// are exactly the same!
if ( dist < 2 ) {
dist = 2 ;
}
// fix distance if in different non-body hashgroups
if ( dist < 50 ) {
fixedDistance = false ;
}
// body vs title, linktext vs title, linktext vs body
else if ( mhg1 ! = mhg2 ) {
dist = FIXED_DISTANCE ;
fixedDistance = true ;
}
// link text to other link text
else if ( mhg1 = = HASHGROUP_INLINKTEXT ) {
dist = FIXED_DISTANCE ;
fixedDistance = true ;
}
else {
fixedDistance = false ;
}
// if both are link text and > 50 units apart that means
// they are from different link texts
//if ( hg1 == HASHGROUP_INLINKTEXT && dist > 50 ) goto skip1;
// subtract from the dist the terms are apart in the query
if ( dist > = qdist ) {
dist = dist - qdist ;
}
// good density?
score = 100 * denw1 * denw2 ;
// hashgroup modifier
2017-01-26 15:24:44 +01:00
score * = m_msg39req - > m_scoringWeights . m_hashGroupWeights [ hg1 ] ;
score * = m_msg39req - > m_scoringWeights . m_hashGroupWeights [ hg2 ] ;
2016-07-04 15:06:15 +02:00
2016-09-06 16:12:40 +02:00
// if synonym or alternate word form
if ( syn1 ) {
2017-01-20 14:56:49 +01:00
score * = m_msg39req - > m_synonymWeight ;
2016-09-06 16:12:40 +02:00
}
if ( syn2 ) {
2017-01-20 14:56:49 +01:00
score * = m_msg39req - > m_synonymWeight ;
2016-08-25 17:27:37 +02:00
}
2016-09-06 16:12:40 +02:00
// the new logic
if ( Posdb : : getIsHalfStopWikiBigram ( wpi ) ) {
score * = WIKI_BIGRAM_WEIGHT ;
}
if ( Posdb : : getIsHalfStopWikiBigram ( wpj ) ) {
score * = WIKI_BIGRAM_WEIGHT ;
}
// word spam weights
score * = spamw1 * spamw2 ;
// huge title? do not allow 11th+ word to be weighted high
//if ( hg1 == HASHGROUP_TITLE && dist > 20 )
2017-01-26 15:24:44 +01:00
// score /= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg1];
2016-09-06 16:12:40 +02:00
// mod by distance
score / = ( dist + 1.0 ) ;
// tmp hack
//score *= (dist+1.0);
// if our hg1/hg2 hashgroup pairing already exists
// in the bestScores array we have to beat it and then
// we have to replace that. we can only have one per,
// except for linktext!
bro = - 1 ;
for ( int32_t k = 0 ; k < numTop ; k + + ) {
if ( bestmhg1 [ k ] = = mhg1 & & hg1 ! = HASHGROUP_INLINKTEXT ) {
bro = k ;
break ;
}
if ( bestmhg2 [ k ] = = mhg2 & & hg2 ! = HASHGROUP_INLINKTEXT ) {
bro = k ;
break ;
}
2016-08-25 17:27:37 +02:00
}
2016-07-04 15:06:15 +02:00
2016-09-06 16:12:40 +02:00
if ( bro > = 0 ) {
if ( score > bestScores [ bro ] ) {
bestScores [ bro ] = score ;
bestwpi [ bro ] = wpi ;
bestwpj [ bro ] = wpj ;
bestmhg1 [ bro ] = mhg1 ;
bestmhg2 [ bro ] = mhg2 ;
bestFixed [ bro ] = fixedDistance ;
}
2016-07-04 15:06:15 +02:00
}
2016-09-21 21:27:44 +02:00
else
if ( numTop < m_realMaxTop ) { // MAX_TOP ) {
2016-09-06 16:12:40 +02:00
bestScores [ numTop ] = score ;
bestwpi [ numTop ] = wpi ;
bestwpj [ numTop ] = wpj ;
bestmhg1 [ numTop ] = mhg1 ;
bestmhg2 [ numTop ] = mhg2 ;
bestFixed [ numTop ] = fixedDistance ;
numTop + + ;
2016-07-04 15:06:15 +02:00
}
2016-09-21 21:27:44 +02:00
else
2016-10-13 14:49:49 +02:00
if ( lowestScoreTermIdx > = 0 & & score > bestScores [ lowestScoreTermIdx ] ) {
bestScores [ lowestScoreTermIdx ] = score ;
bestwpi [ lowestScoreTermIdx ] = wpi ;
bestwpj [ lowestScoreTermIdx ] = wpj ;
bestmhg1 [ lowestScoreTermIdx ] = mhg1 ;
bestmhg2 [ lowestScoreTermIdx ] = mhg2 ;
bestFixed [ lowestScoreTermIdx ] = fixedDistance ;
2016-09-06 16:12:40 +02:00
}
2016-10-13 14:49:49 +02:00
// set "lowestScoreTermIdx" to the lowest score out of the top scores
2016-09-06 16:12:40 +02:00
if ( numTop > = m_realMaxTop ) { // MAX_TOP ) {
2016-10-13 14:49:49 +02:00
lowestScoreTermIdx = 0 ;
2016-09-06 16:12:40 +02:00
for ( int32_t k = 1 ; k < m_realMaxTop ; k + + ) { //MAX_TOP;k++
2016-10-13 14:49:49 +02:00
if ( bestScores [ k ] > bestScores [ lowestScoreTermIdx ] ) {
2016-09-06 16:12:40 +02:00
continue ;
}
2016-10-13 14:49:49 +02:00
lowestScoreTermIdx = k ;
2016-08-25 17:27:37 +02:00
}
2016-07-04 15:06:15 +02:00
}
2016-09-06 16:12:40 +02:00
skip1 :
// first key is 12 bytes
if ( firsti ) {
wpi + = 6 ;
firsti = false ;
}
2016-07-04 15:06:15 +02:00
2016-09-06 16:12:40 +02:00
// advance
wpi + = 6 ;
// end of list?
if ( wpi > = endi ) {
break ; // exit for(;;) loop
}
// exhausted?
if ( Posdb : : getKeySize ( wpi ) ! = 6 ) {
// sometimes there is posdb index corruption and
// we have a 12 byte key with the same docid but
// different siterank or langid because it was
// not deleted right!
if ( ( uint64_t ) Posdb : : getDocId ( wpi ) ! = m_docId ) {
gbshutdownAbort ( true ) ;
}
// re-set this i guess
firsti = true ;
}
// update. include G-bits?
p1 = Posdb : : getWordPos ( wpi ) ;
// hash group update
hg1 = Posdb : : getHashGroup ( wpi ) ;
// the "modified" hash group
mhg1 = hg1 ;
if ( s_inBody [ mhg1 ] ) mhg1 = HASHGROUP_BODY ;
// update density weight in case hash group changed
2017-01-26 15:24:44 +01:00
denw1 = m_msg39req - > m_scoringWeights . m_densityWeights [ Posdb : : getDensityRank ( wpi ) ] ;
2016-09-06 16:12:40 +02:00
// word spam weight update
if ( hg1 = = HASHGROUP_INLINKTEXT ) {
2017-01-26 15:24:44 +01:00
spamw1 = m_msg39req - > m_scoringWeights . m_linkerWeights [ Posdb : : getWordSpamRank ( wpi ) ] ;
2016-09-06 16:12:40 +02:00
}
else {
2017-01-26 15:24:44 +01:00
spamw1 = m_msg39req - > m_scoringWeights . m_wordSpamWeights [ Posdb : : getWordSpamRank ( wpi ) ] ;
2016-09-06 16:12:40 +02:00
}
2016-07-04 15:06:15 +02:00
}
else {
2016-09-06 16:12:40 +02:00
// get distance
dist = p1 - p2 ;
// if in the same quoted phrase, order is bad!
if ( inSameQuotedPhrase ) {
// debug
//log("dddy: i=%" PRId32" j=%" PRId32" dist=%" PRId32" qdist=%" PRId32" posi=%" PRId32" "
// "posj=%" PRId32,
// i,j,dist,qdist,p1,p2);
goto skip2 ;
}
2016-07-04 15:06:15 +02:00
2016-09-06 16:12:40 +02:00
// if zero, make sure its 2. this happens when the same bigram
// is used by both terms. i.e. street uses the bigram
// 'street light' and so does 'light'. so the wordpositions
// are exactly the same!
if ( dist < 2 ) {
dist = 2 ;
2016-07-04 15:06:15 +02:00
}
2016-09-06 16:12:40 +02:00
// fix distance if in different non-body hashgroups
if ( dist < 50 ) {
fixedDistance = false ;
2016-07-04 15:06:15 +02:00
}
2016-09-06 16:12:40 +02:00
// body vs title, linktext vs title, linktext vs body
else if ( mhg1 ! = mhg2 ) {
dist = FIXED_DISTANCE ;
fixedDistance = true ;
}
// link text to other link text
else if ( mhg1 = = HASHGROUP_INLINKTEXT ) {
dist = FIXED_DISTANCE ;
fixedDistance = true ;
}
else
fixedDistance = false ;
// if both are link text and > 50 units apart that means
// they are from different link texts
//if ( hg1 == HASHGROUP_INLINKTEXT && dist > 50 ) goto skip2;
// subtract from the dist the terms are apart in the query
if ( dist > = qdist ) {
dist = dist - qdist ;
// add 1 for being out of order
dist + = qdist - 1 ;
}
else {
//dist = dist - qdist;
// add 1 for being out of order
dist + = 1 ; // qdist - 1;
}
// compute score based on that junk
//score = (MAXWORDPOS+1) - dist;
// good diversity? uneeded for pair algo
2017-01-26 15:24:44 +01:00
//score *= m_msg39req->m_scoringWeights.m_diversityWeights[div1];
//score *= m_msg39req->m_scoringWeights.m_diversityWeights[div2];
2016-09-06 16:12:40 +02:00
// good density?
score = 100 * denw1 * denw2 ;
// hashgroup modifier
2017-01-26 15:24:44 +01:00
score * = m_msg39req - > m_scoringWeights . m_hashGroupWeights [ hg1 ] ;
score * = m_msg39req - > m_scoringWeights . m_hashGroupWeights [ hg2 ] ;
2016-09-06 16:12:40 +02:00
// if synonym or alternate word form
if ( Posdb : : getIsSynonym ( wpi ) ) {
2017-01-20 14:56:49 +01:00
score * = m_msg39req - > m_synonymWeight ;
2016-09-06 16:12:40 +02:00
}
if ( Posdb : : getIsSynonym ( wpj ) ) {
2017-01-20 14:56:49 +01:00
score * = m_msg39req - > m_synonymWeight ;
2016-09-06 16:12:40 +02:00
}
2017-01-20 14:56:49 +01:00
//if ( m_bflags[i] & BF_SYNONYM ) score *= m_msg39req->m_synonymWeight;
//if ( m_bflags[j] & BF_SYNONYM ) score *= m_msg39req->m_synonymWeight;
2016-09-06 16:12:40 +02:00
// word spam weights
score * = spamw1 * spamw2 ;
// huge title? do not allow 11th+ word to be weighted high
//if ( hg1 == HASHGROUP_TITLE && dist > 20 )
2017-01-26 15:24:44 +01:00
// score /= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg1];
2016-09-06 16:12:40 +02:00
// mod by distance
score / = ( dist + 1.0 ) ;
// tmp hack
//score *= (dist+1.0);
// if our hg1/hg2 hashgroup pairing already exists
// in the bestScores array we have to beat it and then
// we have to replace that. we can only have one per,
// except for linktext!
bro = - 1 ;
for ( int32_t k = 0 ; k < numTop ; k + + ) {
if ( bestmhg1 [ k ] = = mhg1 & & hg1 ! = HASHGROUP_INLINKTEXT ) {
bro = k ;
break ;
}
if ( bestmhg2 [ k ] = = mhg2 & & hg2 ! = HASHGROUP_INLINKTEXT ) {
bro = k ;
break ;
}
}
if ( bro > = 0 ) {
if ( score > bestScores [ bro ] ) {
bestScores [ bro ] = score ;
bestwpi [ bro ] = wpi ;
bestwpj [ bro ] = wpj ;
bestmhg1 [ bro ] = mhg1 ;
bestmhg2 [ bro ] = mhg2 ;
bestFixed [ bro ] = fixedDistance ;
}
}
// best?
else if ( numTop < m_realMaxTop ) { // MAX_TOP ) {
bestScores [ numTop ] = score ;
bestwpi [ numTop ] = wpi ;
bestwpj [ numTop ] = wpj ;
bestmhg1 [ numTop ] = mhg1 ;
bestmhg2 [ numTop ] = mhg2 ;
bestFixed [ numTop ] = fixedDistance ;
numTop + + ;
}
2016-10-13 14:49:49 +02:00
else if ( score > bestScores [ lowestScoreTermIdx ] ) {
bestScores [ lowestScoreTermIdx ] = score ;
bestwpi [ lowestScoreTermIdx ] = wpi ;
bestwpj [ lowestScoreTermIdx ] = wpj ;
bestmhg1 [ lowestScoreTermIdx ] = mhg1 ;
bestmhg2 [ lowestScoreTermIdx ] = mhg2 ;
bestFixed [ lowestScoreTermIdx ] = fixedDistance ;
2016-09-06 16:12:40 +02:00
}
2016-10-13 14:49:49 +02:00
// set "lowestScoreTermIdx" to the lowest score out of the top scores
2016-09-06 16:12:40 +02:00
if ( numTop > = m_realMaxTop ) { // MAX_TOP ) {
2016-10-13 14:49:49 +02:00
lowestScoreTermIdx = 0 ;
2016-09-06 16:12:40 +02:00
for ( int32_t k = 1 ; k < m_realMaxTop ; k + + ) { //MAX_TOP;k++
2016-10-13 14:49:49 +02:00
if ( bestScores [ k ] > bestScores [ lowestScoreTermIdx ] ) {
2016-09-06 16:12:40 +02:00
continue ;
}
2016-10-13 14:49:49 +02:00
lowestScoreTermIdx = k ;
2016-08-25 17:27:37 +02:00
}
2016-07-04 15:06:15 +02:00
}
2016-09-06 16:12:40 +02:00
skip2 :
// first key is 12 bytes
if ( firstj ) {
wpj + = 6 ;
firstj = false ;
}
// advance
wpj + = 6 ;
// end of list?
if ( wpj > = endj ) {
break ; // exit for(;;) loop
}
// exhausted?
if ( Posdb : : getKeySize ( wpj ) ! = 6 ) {
// sometimes there is posdb index corruption and
// we have a 12 byte key with the same docid but
// different siterank or langid because it was
// not deleted right!
if ( ( uint64_t ) Posdb : : getDocId ( wpj ) ! = m_docId ) {
gbshutdownAbort ( true ) ;
}
// re-set this i guess
firstj = true ;
}
// update
p2 = Posdb : : getWordPos ( wpj ) ;
// hash group update
hg2 = Posdb : : getHashGroup ( wpj ) ;
// the "modified" hash group
mhg2 = hg2 ;
if ( s_inBody [ mhg2 ] ) {
mhg2 = HASHGROUP_BODY ;
}
// update density weight in case hash group changed
2017-01-26 15:24:44 +01:00
denw2 = m_msg39req - > m_scoringWeights . m_densityWeights [ Posdb : : getDensityRank ( wpj ) ] ;
2016-09-06 16:12:40 +02:00
// word spam weight update
if ( hg2 = = HASHGROUP_INLINKTEXT ) {
2017-01-26 15:24:44 +01:00
spamw2 = m_msg39req - > m_scoringWeights . m_linkerWeights [ Posdb : : getWordSpamRank ( wpj ) ] ;
2016-09-06 16:12:40 +02:00
}
else {
2017-01-26 15:24:44 +01:00
spamw2 = m_msg39req - > m_scoringWeights . m_wordSpamWeights [ Posdb : : getWordSpamRank ( wpj ) ] ;
2016-09-06 16:12:40 +02:00
}
2016-07-04 15:06:15 +02:00
}
2016-09-06 16:12:40 +02:00
} // for(;;)
2016-07-04 15:06:15 +02:00
// add up the top scores
float sum = 0.0 ;
2016-08-25 17:27:37 +02:00
for ( int32_t k = 0 ; k < numTop ; k + + ) {
2016-07-04 15:06:15 +02:00
sum + = bestScores [ k ] ;
2016-08-25 17:27:37 +02:00
}
2016-07-04 15:06:15 +02:00
2016-07-27 16:15:20 +02:00
if ( m_debug ) {
2016-07-04 15:06:15 +02:00
for ( int32_t k = 0 ; k < numTop ; k + + )
2016-07-27 16:06:29 +02:00
log ( LOG_INFO , " posdb: best score #% " PRId32 " = %f " , k , bestScores [ k ] ) ;
log ( LOG_INFO , " posdb: best score sum = %f " , sum ) ;
2016-07-04 15:06:15 +02:00
}
// wiki phrase weight
sum * = wts ;
// mod by freq weight
sum * = m_freqWeights [ i ] ;
sum * = m_freqWeights [ j ] ;
2016-09-06 16:12:40 +02:00
if ( m_debug ) {
2016-07-27 16:06:29 +02:00
log ( LOG_INFO , " posdb: best score final = %f " , sum ) ;
2016-09-06 16:12:40 +02:00
}
2016-07-04 15:06:15 +02:00
//
// end the loop. return now if not collecting scoring info.
//
2016-08-25 17:27:37 +02:00
if ( ! pdcs ) {
logTrace ( g_conf . m_logTracePosdb , " END. " ) ;
return sum ;
}
2016-07-04 15:06:15 +02:00
// none? wtf?
2016-08-25 17:27:37 +02:00
if ( numTop < = 0 ) {
logTrace ( g_conf . m_logTracePosdb , " END. " ) ;
return sum ;
}
2016-07-04 15:06:15 +02:00
//
// now store the PairScores into the m_pairScoreBuf for this
// top docid.
//
// point into buf
2016-10-18 21:22:29 +02:00
PairScore * px = ( PairScore * ) m_pairScoreBuf . getBufPtr ( ) ;
2016-07-04 15:06:15 +02:00
int32_t need = sizeof ( PairScore ) * numTop ;
2016-09-06 16:12:40 +02:00
2016-07-04 15:06:15 +02:00
// point to that
2016-09-06 16:12:40 +02:00
if ( pdcs - > m_pairsOffset < 0 ) {
2016-07-04 15:06:15 +02:00
pdcs - > m_pairsOffset = m_pairScoreBuf . length ( ) ;
2016-09-06 16:12:40 +02:00
}
2016-07-04 15:06:15 +02:00
// reset this i guess
pdcs - > m_pairScores = NULL ;
2016-09-06 16:12:40 +02:00
2016-07-04 15:06:15 +02:00
// sanity
if ( m_pairScoreBuf . getAvail ( ) < need ) {
// m_pairScores will be NULL
static bool s_first = true ;
2016-08-25 17:27:37 +02:00
if ( s_first ) {
log ( " posdb: CRITICAL pair buf overflow " ) ;
}
2016-07-04 15:06:15 +02:00
s_first = false ;
2016-08-25 17:27:37 +02:00
logTrace ( g_conf . m_logTracePosdb , " END. " ) ;
2016-07-04 15:06:15 +02:00
return sum ;
}
// increase buf ptr over this then
m_pairScoreBuf . incrementLength ( need ) ;
// set each of the top scoring terms individiually
for ( int32_t k = 0 ; k < numTop ; k + + , px + + ) {
pdcs - > m_numPairs + + ;
memset ( px , 0 , sizeof ( * px ) ) ;
const char * maxp1 = bestwpi [ k ] ;
const char * maxp2 = bestwpj [ k ] ;
2016-09-06 12:09:11 +02:00
score = bestScores [ k ] ;
2016-07-04 15:06:15 +02:00
bool fixedDist = bestFixed [ k ] ;
score * = wts ;
score * = m_freqWeights [ i ] ;
score * = m_freqWeights [ j ] ;
2016-09-06 16:12:40 +02:00
2016-07-04 15:06:15 +02:00
// we have to encode these bits into the mini merge now
2016-09-06 12:46:08 +02:00
if ( Posdb : : getIsHalfStopWikiBigram ( maxp1 ) ) {
2016-07-04 15:06:15 +02:00
score * = WIKI_BIGRAM_WEIGHT ;
2016-08-25 17:27:37 +02:00
}
2016-09-06 12:46:08 +02:00
if ( Posdb : : getIsHalfStopWikiBigram ( maxp2 ) ) {
2016-07-04 15:06:15 +02:00
score * = WIKI_BIGRAM_WEIGHT ;
2016-08-25 17:27:37 +02:00
}
2016-07-04 15:06:15 +02:00
//if ( m_bflags[i] & BF_HALFSTOPWIKIBIGRAM )
//if ( m_bflags[j] & BF_HALFSTOPWIKIBIGRAM )
// wiki phrase weight
px - > m_finalScore = score ;
2016-09-06 12:46:08 +02:00
px - > m_wordPos1 = Posdb : : getWordPos ( maxp1 ) ;
px - > m_wordPos2 = Posdb : : getWordPos ( maxp2 ) ;
syn1 = Posdb : : getIsSynonym ( maxp1 ) ;
syn2 = Posdb : : getIsSynonym ( maxp2 ) ;
2016-07-04 15:06:15 +02:00
px - > m_isSynonym1 = syn1 ;
px - > m_isSynonym2 = syn2 ;
2016-09-06 12:46:08 +02:00
px - > m_isHalfStopWikiBigram1 = Posdb : : getIsHalfStopWikiBigram ( maxp1 ) ;
px - > m_isHalfStopWikiBigram2 = Posdb : : getIsHalfStopWikiBigram ( maxp2 ) ;
2016-07-04 15:06:15 +02:00
//px->m_isSynonym1 = ( m_bflags[i] & BF_SYNONYM );
//px->m_isSynonym2 = ( m_bflags[j] & BF_SYNONYM );
2016-09-06 12:46:08 +02:00
px - > m_diversityRank1 = Posdb : : getDiversityRank ( maxp1 ) ;
px - > m_diversityRank2 = Posdb : : getDiversityRank ( maxp2 ) ;
px - > m_wordSpamRank1 = Posdb : : getWordSpamRank ( maxp1 ) ;
px - > m_wordSpamRank2 = Posdb : : getWordSpamRank ( maxp2 ) ;
px - > m_hashGroup1 = Posdb : : getHashGroup ( maxp1 ) ;
px - > m_hashGroup2 = Posdb : : getHashGroup ( maxp2 ) ;
2016-07-04 15:06:15 +02:00
px - > m_qdist = qdist ;
// bigram algorithm fix
//if ( px->m_wordPos1 == px->m_wordPos2 )
// px->m_wordPos2 += 2;
2016-09-06 12:46:08 +02:00
px - > m_densityRank1 = Posdb : : getDensityRank ( maxp1 ) ;
px - > m_densityRank2 = Posdb : : getDensityRank ( maxp2 ) ;
2016-07-04 15:06:15 +02:00
px - > m_fixedDistance = fixedDist ;
px - > m_qtermNum1 = m_qtermNums [ i ] ;
px - > m_qtermNum2 = m_qtermNums [ j ] ;
2016-09-04 23:01:01 +02:00
//int64_t *termFreqs = (int64_t *)m_msg39req->ptr_termFreqs;
2016-07-04 15:06:15 +02:00
//px->m_termFreq1 = termFreqs[px->m_qtermNum1];
//px->m_termFreq2 = termFreqs[px->m_qtermNum2];
2016-09-16 11:24:23 +02:00
px - > m_tfWeight1 = m_freqWeights [ i ] ;
px - > m_tfWeight2 = m_freqWeights [ j ] ;
2016-07-04 15:06:15 +02:00
px - > m_bflags1 = m_bflags [ i ] ;
px - > m_bflags2 = m_bflags [ j ] ;
2016-09-06 16:12:40 +02:00
2016-07-04 15:06:15 +02:00
// flag it as in same wiki phrase
2016-10-23 22:52:51 +02:00
if ( almostEqualFloat ( wts , ( float ) WIKI_WEIGHT ) ) {
2016-10-21 23:13:32 +02:00
px - > m_inSameWikiPhrase = 1 ;
2016-09-06 16:12:40 +02:00
}
else {
2016-10-21 23:13:32 +02:00
px - > m_inSameWikiPhrase = 0 ;
2016-09-06 16:12:40 +02:00
}
2016-07-04 15:06:15 +02:00
# ifdef _VALGRIND_
VALGRIND_CHECK_MEM_IS_DEFINED ( px , sizeof ( * px ) ) ;
# endif
// only log for debug if it is one result
2016-07-27 16:15:20 +02:00
if ( m_debug ) {
// log each one for debug
log ( LOG_INFO , " posdb: result #% " PRId32 " "
" i=% " PRId32 " "
" j=% " PRId32 " "
" termNum0=% " PRId32 " "
" termNum1=% " PRId32 " "
" finalscore=%f "
" tfw0=%f "
" tfw1=%f "
" fixeddist=% " PRId32 " " // bool
" wts=%f "
" bflags0=% " PRId32 " "
" bflags1=% " PRId32 " "
" syn0=% " PRId32 " "
" syn1=% " PRId32 " "
" div0=% " PRId32 " "
" div1=% " PRId32 " "
" wspam0=% " PRId32 " "
" wspam1=% " PRId32 " "
" hgrp0=%s "
" hgrp1=%s "
" qdist=% " PRId32 " "
" wpos0=% " PRId32 " "
" wpos1=% " PRId32 " "
" dens0=% " PRId32 " "
" dens1=% " PRId32 " " , k , i , j , px - > m_qtermNum1 , px - > m_qtermNum2 , score , m_freqWeights [ i ] ,
m_freqWeights [ j ] , ( int32_t ) bestFixed [ k ] , wts , ( int32_t ) m_bflags [ i ] , ( int32_t ) m_bflags [ j ] ,
( int32_t ) px - > m_isSynonym1 , ( int32_t ) px - > m_isSynonym2 , ( int32_t ) px - > m_diversityRank1 ,
( int32_t ) px - > m_diversityRank2 , ( int32_t ) px - > m_wordSpamRank1 , ( int32_t ) px - > m_wordSpamRank2 ,
getHashGroupString ( px - > m_hashGroup1 ) , getHashGroupString ( px - > m_hashGroup2 ) , ( int32_t ) px - > m_qdist ,
( int32_t ) px - > m_wordPos1 , ( int32_t ) px - > m_wordPos2 , ( int32_t ) px - > m_densityRank1 ,
( int32_t ) px - > m_densityRank2
) ;
}
2016-07-04 15:06:15 +02:00
}
// do the same but for second bests! so seo.cpp's top term pairs
// algo can do a term insertion, and if that hurts the best pair
// the 2nd best might cover for it. ideally, we'd have all the term
// pairs for this algo, but i think we'll have to get by on just this.
2016-08-25 17:27:37 +02:00
logTrace ( g_conf . m_logTracePosdb , " END. sum=%f " , sum ) ;
2016-07-04 15:06:15 +02:00
return sum ;
}
//
//
// TRY TO SPEED IT UP!!!
//
//
// returns false and sets g_errno on error
bool PosdbTable : : setQueryTermInfo ( ) {
2016-08-25 17:27:37 +02:00
logTrace ( g_conf . m_logTracePosdb , " BEGIN. " ) ;
2016-07-04 15:06:15 +02:00
// alloc space. assume max
int32_t qneed = sizeof ( QueryTermInfo ) * m_q - > m_numTerms ;
2016-08-25 17:27:37 +02:00
if ( ! m_qiBuf . reserve ( qneed , " qibuf " ) ) {
return false ; // label it too!
}
2016-07-04 15:06:15 +02:00
// point to those
2016-09-02 14:46:57 +02:00
QueryTermInfo * qtibuf = ( QueryTermInfo * ) m_qiBuf . getBufStart ( ) ;
2016-07-04 15:06:15 +02:00
int32_t nrg = 0 ;
// assume not sorting by a numeric termlist
m_sortByTermNum = - 1 ;
m_sortByTermNumInt = - 1 ;
// now we have score ranges for gbmin:price:1.99 etc.
m_minScoreTermNum = - 1 ;
m_maxScoreTermNum = - 1 ;
// for gbminint:count:99 etc.
m_minScoreTermNumInt = - 1 ;
m_maxScoreTermNumInt = - 1 ;
m_hasMaxSerpScore = false ;
2016-09-04 23:01:01 +02:00
if ( m_msg39req - > m_minSerpDocId ) {
2016-07-04 15:06:15 +02:00
m_hasMaxSerpScore = true ;
2016-08-25 17:27:37 +02:00
}
2016-07-04 15:06:15 +02:00
for ( int32_t i = 0 ; i < m_q - > m_numTerms ; i + + ) {
QueryTerm * qt = & m_q - > m_qterms [ i ] ;
2016-08-25 17:27:37 +02:00
logTrace ( g_conf . m_logTracePosdb , " i=% " PRId32 " , term=[%.*s], required=%s " , i , qt - > m_termLen , qt - > m_term , qt - > m_isRequired ? " true " : " false " ) ;
if ( ! qt - > m_isRequired ) {
continue ;
}
2016-07-04 15:06:15 +02:00
// set this stff
2017-02-20 15:12:17 +01:00
const QueryWord * qw = qt - > m_qword ;
2016-07-04 15:06:15 +02:00
// get one
2016-09-02 14:46:57 +02:00
QueryTermInfo * qti = & qtibuf [ nrg ] ;
2016-07-04 15:06:15 +02:00
// and set it
qti - > m_qt = qt ;
qti - > m_qtermNum = i ;
// this is not good enough, we need to count
// non-whitespace punct as 2 units not 1 unit
// otherwise qdist gets thrown off and our phrasing fails.
// so use QueryTerm::m_qpos just for this.
qti - > m_qpos = qw - > m_posNum ;
qti - > m_wikiPhraseId = qw - > m_wikiPhraseId ;
qti - > m_quotedStartId = qw - > m_quoteStart ;
// is it gbsortby:?
if ( qt - > m_fieldCode = = FIELD_GBSORTBYFLOAT | |
qt - > m_fieldCode = = FIELD_GBREVSORTBYFLOAT ) {
m_sortByTermNum = i ;
m_sortByTermInfoNum = nrg ;
}
if ( qt - > m_fieldCode = = FIELD_GBSORTBYINT | |
qt - > m_fieldCode = = FIELD_GBREVSORTBYINT ) {
m_sortByTermNumInt = i ;
m_sortByTermInfoNumInt = nrg ;
// tell topTree to use int scores
m_topTree - > m_useIntScores = true ;
}
// is it gbmin:price:1.99?
if ( qt - > m_fieldCode = = FIELD_GBNUMBERMIN ) {
m_minScoreTermNum = i ;
m_minScoreVal = qt - > m_qword - > m_float ;
}
if ( qt - > m_fieldCode = = FIELD_GBNUMBERMAX ) {
m_maxScoreTermNum = i ;
m_maxScoreVal = qt - > m_qword - > m_float ;
}
if ( qt - > m_fieldCode = = FIELD_GBNUMBERMININT ) {
m_minScoreTermNumInt = i ;
m_minScoreValInt = qt - > m_qword - > m_int ;
}
if ( qt - > m_fieldCode = = FIELD_GBNUMBERMAXINT ) {
m_maxScoreTermNumInt = i ;
m_maxScoreValInt = qt - > m_qword - > m_int ;
}
// count
int32_t nn = 0 ;
// also add in bigram lists
int32_t left = qt - > m_leftPhraseTermNum ;
int32_t right = qt - > m_rightPhraseTermNum ;
// terms
2017-05-22 16:46:18 +02:00
const QueryTerm * leftTerm = qt - > m_leftPhraseTerm ;
const QueryTerm * rightTerm = qt - > m_rightPhraseTerm ;
2016-07-04 15:06:15 +02:00
bool leftAlreadyAdded = false ;
bool rightAlreadyAdded = false ;
//
// add the non-bigram list AFTER the
// bigrams, which we like to do when we PREFER the bigram
// terms because they are scored higher, specifically, as
// in the case of being half stop wikipedia phrases like
// "the tigers" for the query 'the tigers' we want to give
// a slight bonus, 1.20x, for having that bigram since its
// in wikipedia
//
//
// add left bigram lists. BACKWARDS.
//
if ( left > = 0 & & leftTerm & & leftTerm - > m_isWikiHalfStopBigram ) {
// assume added
leftAlreadyAdded = true ;
// get list
2017-02-20 14:25:06 +01:00
RdbList * list = m_q - > m_qterms [ left ] . m_posdbListPtr ;
2016-07-04 15:06:15 +02:00
// add list ptr into our required group
qti - > m_subLists [ nn ] = list ;
// special flags
qti - > m_bigramFlags [ nn ] = BF_HALFSTOPWIKIBIGRAM ;
// before a pipe operator?
if ( qt - > m_piped ) qti - > m_bigramFlags [ nn ] | = BF_PIPED ;
// add list of member terms as well
m_q - > m_qterms [ left ] . m_bitNum = nrg ;
// only really add if useful
2017-05-23 15:29:45 +02:00
if ( list & & ! list - > isEmpty ( ) ) {
2017-05-23 15:22:05 +02:00
nn + + ;
}
2016-07-04 15:06:15 +02:00
// add bigram synonyms! like "new jersey" bigram
// has the synonym "nj"
for ( int32_t k = 0 ; k < m_q - > m_numTerms ; k + + ) {
QueryTerm * bt = & m_q - > m_qterms [ k ] ;
2016-08-25 17:27:37 +02:00
if ( bt - > m_synonymOf ! = leftTerm ) {
continue ;
}
2016-07-04 15:06:15 +02:00
list = m_q - > m_qterms [ k ] . m_posdbListPtr ;
qti - > m_subLists [ nn ] = list ;
2017-05-23 15:22:05 +02:00
qti - > m_bigramFlags [ nn ] = 0 ;
qti - > m_bigramFlags [ nn ] | = BF_HALFSTOPWIKIBIGRAM ;
2016-07-04 15:06:15 +02:00
qti - > m_bigramFlags [ nn ] | = BF_SYNONYM ;
2016-08-25 17:27:37 +02:00
if ( qt - > m_piped ) {
2016-07-04 15:06:15 +02:00
qti - > m_bigramFlags [ nn ] | = BF_PIPED ;
2016-08-25 17:27:37 +02:00
}
2016-07-04 15:06:15 +02:00
// add list of member terms as well
bt - > m_bitNum = nrg ;
2017-05-23 15:29:45 +02:00
if ( list & & ! list - > isEmpty ( ) ) {
2016-08-25 17:27:37 +02:00
nn + + ;
}
2016-07-04 15:06:15 +02:00
}
}
//
// then the right bigram if also in a wiki half stop bigram
//
2016-08-25 17:27:37 +02:00
if ( right > = 0 & & rightTerm & & rightTerm - > m_isWikiHalfStopBigram ) {
2016-07-04 15:06:15 +02:00
// assume added
rightAlreadyAdded = true ;
// get list
2017-02-20 14:25:06 +01:00
RdbList * list = m_q - > m_qterms [ right ] . m_posdbListPtr ;
2016-07-04 15:06:15 +02:00
// add list ptr into our required group
qti - > m_subLists [ nn ] = list ;
// special flags
qti - > m_bigramFlags [ nn ] = BF_HALFSTOPWIKIBIGRAM ;
// before a pipe operator?
if ( qt - > m_piped ) qti - > m_bigramFlags [ nn ] | = BF_PIPED ;
// add list of member terms as well
m_q - > m_qterms [ right ] . m_bitNum = nrg ;
// only really add if useful
2017-05-23 15:29:45 +02:00
if ( list & & ! list - > isEmpty ( ) ) {
2016-08-25 17:27:37 +02:00
nn + + ;
}
2016-07-04 15:06:15 +02:00
// add bigram synonyms! like "new jersey" bigram
// has the synonym "nj"
for ( int32_t k = 0 ; k < m_q - > m_numTerms ; k + + ) {
QueryTerm * bt = & m_q - > m_qterms [ k ] ;
2016-08-25 17:27:37 +02:00
if ( bt - > m_synonymOf ! = rightTerm ) {
continue ;
}
2016-07-04 15:06:15 +02:00
list = m_q - > m_qterms [ k ] . m_posdbListPtr ;
qti - > m_subLists [ nn ] = list ;
2017-05-23 15:22:05 +02:00
qti - > m_bigramFlags [ nn ] = 0 ;
qti - > m_bigramFlags [ nn ] | = BF_HALFSTOPWIKIBIGRAM ;
2016-07-04 15:06:15 +02:00
qti - > m_bigramFlags [ nn ] | = BF_SYNONYM ;
2016-08-25 17:27:37 +02:00
if ( qt - > m_piped ) {
2016-07-04 15:06:15 +02:00
qti - > m_bigramFlags [ nn ] | = BF_PIPED ;
2016-08-25 17:27:37 +02:00
}
2016-07-04 15:06:15 +02:00
// add list of member terms as well
bt - > m_bitNum = nrg ;
2017-05-23 15:29:45 +02:00
if ( list & & ! list - > isEmpty ( ) ) {
2016-08-25 17:27:37 +02:00
nn + + ;
}
2016-07-04 15:06:15 +02:00
}
}
2016-08-25 17:27:37 +02:00
2016-07-04 15:06:15 +02:00
//
// then the non-bigram termlist
//
// add to it. add backwards since we give precedence to
// the first list and we want that to be the NEWEST list!
2017-02-20 14:25:06 +01:00
RdbList * list = m_q - > m_qterms [ i ] . m_posdbListPtr ;
2016-07-04 15:06:15 +02:00
// add list ptr into our required group
qti - > m_subLists [ nn ] = list ;
// special flags
qti - > m_bigramFlags [ nn ] = 0 ;
// before a pipe operator?
2017-05-30 12:28:53 +02:00
if ( qt - > m_piped )
qti - > m_bigramFlags [ nn ] | = BF_PIPED ;
2016-07-04 15:06:15 +02:00
// is it a negative term?
2017-05-30 12:28:53 +02:00
if ( qt - > m_termSign = = ' - ' )
qti - > m_bigramFlags [ nn ] | = BF_NEGATIVE ;
2016-07-04 15:06:15 +02:00
// numeric posdb termlist flags. instead of word position
// they have a float stored there for sorting etc.
if ( qt - > m_fieldCode = = FIELD_GBSORTBYFLOAT )
qti - > m_bigramFlags [ nn ] | = BF_NUMBER ;
if ( qt - > m_fieldCode = = FIELD_GBREVSORTBYFLOAT )
qti - > m_bigramFlags [ nn ] | = BF_NUMBER ;
if ( qt - > m_fieldCode = = FIELD_GBNUMBERMIN )
qti - > m_bigramFlags [ nn ] | = BF_NUMBER ;
if ( qt - > m_fieldCode = = FIELD_GBNUMBERMAX )
qti - > m_bigramFlags [ nn ] | = BF_NUMBER ;
if ( qt - > m_fieldCode = = FIELD_GBNUMBEREQUALFLOAT )
qti - > m_bigramFlags [ nn ] | = BF_NUMBER ;
if ( qt - > m_fieldCode = = FIELD_GBSORTBYINT )
qti - > m_bigramFlags [ nn ] | = BF_NUMBER ;
if ( qt - > m_fieldCode = = FIELD_GBREVSORTBYINT )
qti - > m_bigramFlags [ nn ] | = BF_NUMBER ;
if ( qt - > m_fieldCode = = FIELD_GBNUMBERMININT )
qti - > m_bigramFlags [ nn ] | = BF_NUMBER ;
if ( qt - > m_fieldCode = = FIELD_GBNUMBERMAXINT )
qti - > m_bigramFlags [ nn ] | = BF_NUMBER ;
if ( qt - > m_fieldCode = = FIELD_GBNUMBEREQUALINT )
qti - > m_bigramFlags [ nn ] | = BF_NUMBER ;
// add list of member terms
qt - > m_bitNum = nrg ;
// only really add if useful
// no, because when inserting NEW (related) terms that are
// not currently in the document, this list may initially
// be empty.
2017-05-23 15:29:45 +02:00
if ( list & & ! list - > isEmpty ( ) ) {
2016-08-25 17:27:37 +02:00
nn + + ;
}
2016-07-04 15:06:15 +02:00
//
// add left bigram now if not added above
//
2016-08-25 17:27:37 +02:00
if ( left > = 0 & & ! leftAlreadyAdded ) {
2016-07-04 15:06:15 +02:00
// get list
list = m_q - > m_qterms [ left ] . m_posdbListPtr ;
// add list ptr into our required group
qti - > m_subLists [ nn ] = list ;
// special flags
2017-05-23 15:22:05 +02:00
qti - > m_bigramFlags [ nn ] = BF_BIGRAM ;
2016-07-04 15:06:15 +02:00
// before a pipe operator?
if ( qt - > m_piped ) qti - > m_bigramFlags [ nn ] | = BF_PIPED ;
2017-05-23 15:22:05 +02:00
// add list of member terms as well
2016-07-04 15:06:15 +02:00
m_q - > m_qterms [ left ] . m_bitNum = nrg ;
// only really add if useful
2017-05-23 15:29:45 +02:00
if ( list & & ! list - > isEmpty ( ) ) {
2016-08-25 17:27:37 +02:00
nn + + ;
}
2016-07-04 15:06:15 +02:00
// add bigram synonyms! like "new jersey" bigram
// has the synonym "nj"
for ( int32_t k = 0 ; k < m_q - > m_numTerms ; k + + ) {
QueryTerm * bt = & m_q - > m_qterms [ k ] ;
2016-08-25 17:27:37 +02:00
if ( bt - > m_synonymOf ! = leftTerm ) {
continue ;
}
2016-07-04 15:06:15 +02:00
list = m_q - > m_qterms [ k ] . m_posdbListPtr ;
qti - > m_subLists [ nn ] = list ;
2017-05-23 15:22:05 +02:00
qti - > m_bigramFlags [ nn ] = 0 ;
qti - > m_bigramFlags [ nn ] | = BF_SYNONYM ;
if ( qt - > m_piped ) {
2016-07-04 15:06:15 +02:00
qti - > m_bigramFlags [ nn ] | = BF_PIPED ;
2017-05-23 15:22:05 +02:00
}
// add list of member terms as well
2016-07-04 15:06:15 +02:00
bt - > m_bitNum = nrg ;
2017-05-23 15:29:45 +02:00
if ( list & & ! list - > isEmpty ( ) ) {
2016-08-25 17:27:37 +02:00
nn + + ;
}
2016-07-04 15:06:15 +02:00
}
}
2016-08-25 17:27:37 +02:00
2016-07-04 15:06:15 +02:00
//
// add right bigram now if not added above
//
2016-08-25 17:27:37 +02:00
if ( right > = 0 & & ! rightAlreadyAdded ) {
2016-07-04 15:06:15 +02:00
// get list
list = m_q - > m_qterms [ right ] . m_posdbListPtr ;
// add list ptr into our required group
qti - > m_subLists [ nn ] = list ;
// special flags
2017-05-23 15:22:05 +02:00
qti - > m_bigramFlags [ nn ] = BF_BIGRAM ;
2016-07-04 15:06:15 +02:00
// before a pipe operator?
if ( qt - > m_piped ) qti - > m_bigramFlags [ nn ] | = BF_PIPED ;
2017-05-23 15:22:05 +02:00
// add list of member terms as well
2016-07-04 15:06:15 +02:00
m_q - > m_qterms [ right ] . m_bitNum = nrg ;
// only really add if useful
2017-05-23 15:29:45 +02:00
if ( list & & ! list - > isEmpty ( ) ) {
2016-08-25 17:27:37 +02:00
nn + + ;
}
2016-07-04 15:06:15 +02:00
// add bigram synonyms! like "new jersey" bigram
// has the synonym "nj"
for ( int32_t k = 0 ; k < m_q - > m_numTerms ; k + + ) {
QueryTerm * bt = & m_q - > m_qterms [ k ] ;
2016-08-25 17:27:37 +02:00
if ( bt - > m_synonymOf ! = rightTerm ) {
continue ;
}
2016-07-04 15:06:15 +02:00
list = m_q - > m_qterms [ k ] . m_posdbListPtr ;
qti - > m_subLists [ nn ] = list ;
2017-05-23 15:22:05 +02:00
qti - > m_bigramFlags [ nn ] = 0 ;
qti - > m_bigramFlags [ nn ] | = BF_SYNONYM ;
if ( qt - > m_piped ) {
2016-07-04 15:06:15 +02:00
qti - > m_bigramFlags [ nn ] | = BF_PIPED ;
2017-05-23 15:22:05 +02:00
}
// add list of member terms as well
2016-07-04 15:06:15 +02:00
//qti->m_qtermList[nn] = bt;
bt - > m_bitNum = nrg ;
2017-05-23 15:29:45 +02:00
if ( list & & ! list - > isEmpty ( ) ) {
2016-08-25 17:27:37 +02:00
nn + + ;
}
2016-07-04 15:06:15 +02:00
}
}
//
// ADD SYNONYM TERMS
//
for ( int32_t k = 0 ; k < m_q - > m_numTerms ; k + + ) {
QueryTerm * qt2 = & m_q - > m_qterms [ k ] ;
2017-05-22 16:46:18 +02:00
const QueryTerm * st = qt2 - > m_synonymOf ;
2016-07-04 15:06:15 +02:00
// skip if not a synonym of this term
2016-08-25 17:27:37 +02:00
if ( st ! = qt ) {
continue ;
}
2016-07-04 15:06:15 +02:00
// its a synonym, add it!
list = m_q - > m_qterms [ k ] . m_posdbListPtr ;
// add list ptr into our required group
qti - > m_subLists [ nn ] = list ;
// special flags
qti - > m_bigramFlags [ nn ] = BF_SYNONYM ;
// before a pipe operator?
if ( qt - > m_piped ) qti - > m_bigramFlags [ nn ] | = BF_PIPED ;
// set bitnum here i guess
qt2 - > m_bitNum = nrg ;
// only really add if useful
2017-05-23 15:29:45 +02:00
if ( list & & ! list - > isEmpty ( ) ) {
2016-08-25 17:27:37 +02:00
nn + + ;
}
2016-07-04 15:06:15 +02:00
}
// store # lists in required group. nn might be zero!
qti - > m_numSubLists = nn ;
// set the term freqs for this list group/set
2016-09-04 23:01:01 +02:00
qti - > m_termFreqWeight = ( ( float * ) m_msg39req - > ptr_termFreqWeights ) [ i ] ;
2016-07-04 15:06:15 +02:00
// crazy?
if ( nn > = MAX_SUBLISTS ) {
log ( " query: too many sublists. % " PRId32 " >= % " PRId32 ,
nn , ( int32_t ) MAX_SUBLISTS ) ;
2016-08-25 17:27:37 +02:00
logTrace ( g_conf . m_logTracePosdb , " END. " ) ;
2016-07-04 15:06:15 +02:00
return false ;
}
// compute m_totalSubListsSize
qti - > m_totalSubListsSize = 0LL ;
for ( int32_t q = 0 ; q < qti - > m_numSubLists ; q + + ) {
// add list ptr into our required group
2016-09-06 12:09:11 +02:00
RdbList * l = qti - > m_subLists [ q ] ;
2016-07-04 15:06:15 +02:00
// get it
2016-09-06 12:09:11 +02:00
int64_t listSize = l - > getListSize ( ) ;
2016-07-04 15:06:15 +02:00
// add it up
qti - > m_totalSubListsSize + = listSize ;
}
// count # required groups
nrg + + ;
}
//
// get the query term with the least data in posdb including syns
//
2016-08-30 15:18:21 +02:00
m_minTermListSize = 0 ;
m_minTermListIdx = - 1 ;
int64_t grand = 0LL ;
2016-07-04 15:06:15 +02:00
for ( int32_t i = 0 ; i < nrg ; i + + ) {
// compute total sizes
int64_t total = 0LL ;
// get it
2016-09-02 14:46:57 +02:00
QueryTermInfo * qti = & qtibuf [ i ] ;
2016-07-04 15:06:15 +02:00
// do not consider for first termlist if negative
2016-08-25 17:27:37 +02:00
if ( qti - > m_bigramFlags [ 0 ] & BF_NEGATIVE ) {
continue ;
}
2016-07-04 15:06:15 +02:00
// add to it
total = qti - > m_totalSubListsSize ;
// add up this now
grand + = total ;
2016-08-30 15:18:21 +02:00
2016-07-04 15:06:15 +02:00
// get min
2016-08-30 15:18:21 +02:00
if ( total < m_minTermListSize | | m_minTermListIdx = = - 1 ) {
m_minTermListSize = total ;
m_minTermListIdx = i ;
2016-07-04 15:06:15 +02:00
}
}
// bad! ringbuf[] was not designed for this nonsense!
2016-08-30 15:18:21 +02:00
if ( m_minTermListIdx > = 255 ) {
2016-07-05 12:11:14 +02:00
gbshutdownAbort ( true ) ;
2016-08-25 17:27:37 +02:00
}
2016-07-04 15:06:15 +02:00
// set this for caller to use to loop over the queryterminfos
m_numQueryTermInfos = nrg ;
2017-05-23 13:47:11 +02:00
if ( g_conf . m_logTracePosdb ) {
logTrace ( g_conf . m_logTracePosdb , " m_numQueryTermInfos=%d " , m_numQueryTermInfos ) ;
for ( int i = 0 ; i < m_numQueryTermInfos ; i + + ) {
const QueryTermInfo * qti = qtibuf + i ;
logTrace ( g_conf . m_logTracePosdb , " qti[%d]: m_numSubLists=%d m_qtermNum=%d m_qpos=%d " , i , qti - > m_numSubLists , qti - > m_qtermNum , qti - > m_qpos ) ;
for ( int j = 0 ; j < qti - > m_numSubLists ; j + + )
logTrace ( g_conf . m_logTracePosdb , " sublist %d = %p " , j , qti - > m_subLists [ j ] ) ;
}
}
2016-08-30 15:18:21 +02:00
// . m_minTermListSize is set in setQueryTermInfo()
2016-07-04 15:06:15 +02:00
// . how many docids do we have at most in the intersection?
// . all keys are of same termid, so they are 12 or 6 bytes compressed
// . assume 12 if each is a different docid
2016-08-30 15:18:21 +02:00
int32_t maxDocIds = m_minTermListSize / 12 ;
2016-07-04 15:06:15 +02:00
// store all interesected docids in here for new algo plus 1 byte vote
int32_t need = maxDocIds * 6 ;
// they could all be OR'd together!
if ( m_q - > m_isBoolean ) need = grand ;
// so we can always cast a int64_t from a ptr in there
// for setting m_docId when m_booleanQuery is true below
need + = 8 ;
// get max # of docids we got in an intersection from all the lists
2016-08-25 17:27:37 +02:00
if ( ! m_docIdVoteBuf . reserve ( need , " divbuf " ) ) {
logTrace ( g_conf . m_logTracePosdb , " END. " ) ;
return false ;
}
2016-07-04 15:06:15 +02:00
// i'm feeling if a boolean query put this in there too, the
// hashtable that maps each docid to its boolean bit vector
// where each bit stands for an operand so we can quickly evaluate
// the bit vector in a truth table.
// CRAP, can't use min list size because it might be behind a
// NOT operator!!! then we core trying to realloc m_bt in a thread
// below when trying to grow it. they could all be OR'd together
// so alloc the most!
int32_t maxSlots = ( grand / 12 ) * 2 ;
// try to speed up. this doesn't *seem* to matter, so i took out:
//maxSlots *= 2;
// get total operands we used
//int32_t numOperands = m_q->m_numWords;//Operands;
// a quoted phrase counts as a single operand
// . QueryTerm::m_bitNum <== m_numQueryTermInfos
// . each queryTermInfo class corresponds to one bit in our bit vec
// . essentially each queryTermInfo is a query term, but it has
// all the synonym and word forms for that query, etc.
m_vecSize = m_numQueryTermInfos / 8 ; //numOperands / 8 ;
// allow an extra byte for remainders
if ( m_numQueryTermInfos % 8 ) m_vecSize + + ;
// now preallocate the hashtable. 0 niceness.
if ( m_q - > m_isBoolean & & // true = useKeyMagic
2016-09-01 18:18:30 +02:00
! m_bt . set ( 8 , m_vecSize , maxSlots , NULL , 0 , false , " booltbl " , true ) ) {
2016-08-25 17:27:37 +02:00
logTrace ( g_conf . m_logTracePosdb , " END. " ) ;
2016-07-04 15:06:15 +02:00
return false ;
2016-08-25 17:27:37 +02:00
}
2016-07-04 15:06:15 +02:00
// . m_ct maps a boolean "bit vector" to a true/false value
// . each "bit" in the "bit vector" indicates if docid has that
// particular query term
if ( m_q - > m_isBoolean & & // true = useKeyMagic
2016-09-01 18:18:30 +02:00
! m_ct . set ( 8 , 1 , maxSlots , NULL , 0 , false , " booltbl " , true ) ) {
2016-08-25 17:27:37 +02:00
logTrace ( g_conf . m_logTracePosdb , " END. " ) ;
2016-07-04 15:06:15 +02:00
return false ;
2016-08-25 17:27:37 +02:00
}
2016-07-04 15:06:15 +02:00
2016-08-25 17:27:37 +02:00
logTrace ( g_conf . m_logTracePosdb , " END. " ) ;
2016-07-04 15:06:15 +02:00
return true ;
}
2016-08-25 17:27:37 +02:00
2016-08-30 15:18:21 +02:00
// . Phase 1 of the intersection: determine which docids are candidates.
// . 1) Mangles each term's posdb termlist in place so it becomes a
//      contiguous run of 12-byte docid/score entries.
//   2) Builds m_docIdVoteBuf: for boolean queries every docid is kept
//      (with a term bit vector), otherwise only docids present in every
//      required (non-negative) term group survive.
//   3) Filters every sublist down to just the surviving docids.
// . Returns false when there can be no results (no required terms, or
//   the smallest required termlist is empty on a non-boolean query) or
//   a buffer could not be set up; true otherwise.
// . NOTE(review): the list mangling is destructive and is never swapped
//   back (see comment below) — callers must not reuse the raw lists.
bool PosdbTable::findCandidateDocIds ( ) {
	int64_t lastTime = gettimeofdayInMilliseconds();
	int64_t now;
	int64_t took;

	//
	// Swap the top 6 bytes of each list.
	//
	// This gives a contiguous list of 12-byte docid/score entries
	// because the first record is always 18 bytes (termid, docid, score)
	// and the rest are 6 bytes due to our termid compression.
	//
	// This makes the lists much much easier to work with, but we have
	// to remember to swap back when done! (but we dont...)
	//
	for ( int32_t k = 0 ; k < m_q->m_numTerms ; k++ ) {
		// count
		int64_t total = 0LL;
		// get the list
		RdbList *list = m_q->m_qterms[k].m_posdbListPtr;
		// skip if null
		if ( ! list ) {
			continue;
		}
		// skip if list is empty, too
		if ( list->isEmpty() ) {
			continue;
		}
		// tally
		total += list->getListSize();
		// point to start
		char *p = list->getList();
		// remember to swap back when done!!
		// rotate the first 18-byte key: move its low 6 bytes to the
		// front, then restore the original first 12 bytes after them.
		// After the p += 6 below the list starts on a 12-byte key.
		char ttt[12];
		memcpy ( ttt   , p      , 12 );
		memcpy ( p     , p + 12 ,  6 );
		memcpy ( p + 6 , ttt    , 12 );

		// point to the low "hks" bytes now, skipping the termid
		p += 6;
		// turn half bit on. first key is now 12 bytes!!
		*p |= 0x02;
		// MANGLE the list: shrink by the 6 termid bytes we skipped
		list->setListSize ( list->getListSize() - 6 );
		list->setList ( p );

		logTrace(g_conf.m_logTracePosdb, "termList #%" PRId32 " totalSize=%" PRId64, k, total);

		// print total list sizes
		if ( m_debug ) {
			log(LOG_INFO, "query: termlist #%" PRId32 " totalSize=%" PRId64, k, total);
		}
	}

	// point to our array of query term infos set in setQueryTermInfos()
	QueryTermInfo *qtibuf = (QueryTermInfo *)m_qiBuf.getBufStart();

	// setQueryTermInfos() should have set how many we have
	if ( m_numQueryTermInfos == 0 ) {
		log(LOG_DEBUG, "query: NO REQUIRED TERMS IN QUERY!");
		logTrace(g_conf.m_logTracePosdb, "END, m_numQueryTermInfos = 0");
		return false;
	}

	// . if smallest required list is empty, 0 results
	// . also set in setQueryTermInfo
	if ( m_minTermListSize == 0 && ! m_q->m_isBoolean ) {
		logTrace(g_conf.m_logTracePosdb, "END, m_minTermListSize = 0 and not boolean");
		return false;
	}

	// "score" tag stamped on docids in m_docIdVoteBuf per term group
	int32_t listGroupNum = 0;

	// if all non-negative query terms are in the same wikiphrase then
	// we can apply the WIKI_WEIGHT in getMaxPossibleScore() which
	// should help us speed things up!
	// @@@ BR: Why should this give a boost at all.. ?
	m_allInSameWikiPhrase = true;
	for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
		// get it
		const QueryTermInfo *qti = &qtibuf[i];

		// skip if negative query term
		if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) {
			continue;
		}
		// skip if numeric field like gbsortby:price gbmin:price:1.23
		if ( qti->m_bigramFlags[0] & BF_NUMBER ) {
			continue;
		}
		// set it
		if ( qti->m_wikiPhraseId == 1 ) {
			continue; //@@@ BR: Why only check for id = 1 ??
		}
		// stop
		m_allInSameWikiPhrase = false;
		break;
	}
	logTrace(g_conf.m_logTracePosdb, "m_allInSameWikiPhrase: %s", (m_allInSameWikiPhrase ? "true" : "false"));


	// for boolean queries we scan every docid in all termlists,
	// then we see what query terms it has, and make a bit vector for it.
	// then use a hashtable to map that bit vector to a true or false
	// as to whether we should include it in the results or not.
	// we use Query::getBitScore(qvec_t ebits) to evaluate a docid's
	// query term explicit term bit vector.
	if ( m_q->m_isBoolean ) {
		logTrace(g_conf.m_logTracePosdb, "makeDocIdVoteBufForBoolQuery");
		// keeping the docids sorted is the challenge here...
		makeDocIdVoteBufForBoolQuery();
	}
	else {
		// create "m_docIdVoteBuf" filled with just the docids from the
		// smallest group of sublists (one term, all variations).
		//
		// m_minTermListIdx is the queryterminfo that had the smallest total
		// sublist sizes of m_minTermListSize. this was set in
		// setQueryTermInfos()
		//
		// if all these sublist termlists were 50MB i'd day 10-25ms to
		// add their docid votes.
		logTrace(g_conf.m_logTracePosdb, "addDocIdVotes");
		addDocIdVotes( &qtibuf[m_minTermListIdx], listGroupNum );


		// now repeat the docid scan for successive lists but only
		// inc the docid count for docids we match. docids that are
		// in m_docIdVoteBuf but not in sublist group #i will be removed
		// from m_docIdVoteBuf.
		//
		// worst case scenario with termlists limited
		// to 30MB will be about 10MB of docid vote buf, but it should
		// shrink quite a bit every time we call addDocIdVotes() with a
		// new group of sublists for a query term. but scanning 10MB should
		// be pretty fast since gk0 does like 4GB/s of main memory reads.
		// i would think scanning and docid voting for 200MB of termlists
		// should take like 50-100ms
		for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
			// skip if we did it above when allocating the vote buffer
			if ( i == m_minTermListIdx ) {
				continue;
			}
			// get it
			const QueryTermInfo *qti = &qtibuf[i];
			// do not consider for adding if negative ('my house -home')
			if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) {
				continue;
			}
			// inc the group number ("score"), which will be set on the docids
			// in addDocIdVotes if they match.
			listGroupNum++;
			// if it hits 256 then wrap back down to 1
			if ( listGroupNum >= 256 ) {
				listGroupNum = 1;
			}
			// add it
			addDocIdVotes( qti, listGroupNum );
		}
		logTrace(g_conf.m_logTracePosdb, "Added DocIdVotes");


		//
		// remove the negative query term docids from our docid vote buf
		//
		for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
			// skip if we did it above
			if ( i == m_minTermListIdx ) {
				continue;
			}
			// get it
			const QueryTermInfo *qti = &qtibuf[i];
			// do not consider for adding if negative ('my house -home')
			if ( ! (qti->m_bigramFlags[0] & BF_NEGATIVE) ) {
				continue;
			}
			// delete the docid from the vote buffer
			delDocIdVotes( qti );
		}
		logTrace(g_conf.m_logTracePosdb, "Removed DocIdVotes for negative query terms");
	}

	if ( m_debug ) {
		now = gettimeofdayInMilliseconds();
		took = now - lastTime;
		log(LOG_INFO, "posdb: new algo phase (find matching docids) took %" PRId64 " ms", took);
		lastTime = now;
	}

	//
	// NOW FILTER EVERY SUBLIST to just the docids in m_docIdVoteBuf.
	// Filter in place so it is destructive. i would think
	// that doing a filter on 200MB of termlists wouldn't be more than
	// 50-100ms since we can read 4GB/s from main memory.
	//
	delNonMatchingDocIdsFromSubLists();
	logTrace(g_conf.m_logTracePosdb, "Shrunk SubLists");

	if ( g_conf.m_logTracePosdb ) {
		log(LOG_TRACE, "Shrunk sublists, m_numQueryTermInfos=%d", m_numQueryTermInfos);
		for ( int i = 0 ; i < m_numQueryTermInfos ; i++ ) {
			log(LOG_TRACE, "  qti #%d: m_numSubLists=%d m_numMatchingSubLists=%d", i, qtibuf[i].m_numSubLists, qtibuf[i].m_numMatchingSubLists);
			for ( int j = 0 ; j < qtibuf[i].m_numMatchingSubLists ; j++ )
				log(LOG_TRACE, "    matchlist #%d: %d bytes %p - %p", j, qtibuf[i].m_matchingSubListSize[j], qtibuf[i].m_matchingSubListStart[j], qtibuf[i].m_matchingSubListEnd[j]);
		}
	}

	if ( m_debug ) {
		now = gettimeofdayInMilliseconds();
		took = now - lastTime;
		log(LOG_INFO, "posdb: new algo phase (shrink sublists) took %" PRId64 " ms", took);
	}
	return true;
}
2017-04-21 17:25:21 +02:00
// . Scoring-info pass helper: instead of scanning termlists, pull the
//   next docid to re-score from the persistent top tree (the winners of
//   all docid ranges so far) via *topCursor.
// . On exit *docInThisFile says whether the docid exists in the posdb
//   file currently being processed; if it does, each matching sublist's
//   saved/current cursors are positioned on that docid's entries.
// . Returns true when the caller should STOP iterating (enough docs
//   processed, or top tree exhausted/empty); returns false when the
//   caller should continue (checking *docInThisFile first).
// . *topCursor == -9 means "not yet initialized"; -1 means exhausted.
bool PosdbTable::genDebugScoreInfo1 ( int32_t *numProcessed, int32_t *topCursor, bool *docInThisFile, QueryTermInfo *qtibuf ) {
	*docInThisFile = false;

	// did we get enough score info?
	if ( *numProcessed >= m_msg39req->m_docsToGet ) {
		logTrace(g_conf.m_logTracePosdb, "Too many docs processed. m_docId=%" PRId64 ". Reached msg39 m_docsToGet: %" PRId32 ", SKIPPED", m_docId, *numProcessed);
		return true;
	}

	// loop back up here if the docid is from a previous range
 nextNode:
	// this mean top tree empty basically
	if ( *topCursor == -1 ) {
		logTrace(g_conf.m_logTracePosdb, "topCursor is -1, SKIPPED");
		return true;
	}

	// get the #1 docid/score node #
	if ( *topCursor == -9 ) {
		// if our query had a quoted phrase, might have had no
		// docids in the top tree! getHighNode() can't handle
		// that so handle it here
		if ( m_topTree->getNumUsedNodes() == 0 ) {
			logTrace(g_conf.m_logTracePosdb, "Num used nodes is 0, SKIPPED");
			return true;
		}
		// otherwise, initialize topCursor
		*topCursor = m_topTree->getHighNode();
	}

	// get current node
	TopNode *tn = m_topTree->getNode ( *topCursor );
	// advance
	*topCursor = m_topTree->getPrev ( *topCursor );

	// shortcut
	m_docId = tn->m_docId;

	// skip if not in our range! the top tree now holds
	// all the winners from previous docid ranges. msg39
	// now does the search result in docid range chunks to avoid
	// OOM conditions.
	if ( m_msg39req->m_minDocId != -1 &&
	     m_msg39req->m_maxDocId != -1 &&
	     ( m_docId <  (uint64_t)m_msg39req->m_minDocId ||
	       m_docId >= (uint64_t)m_msg39req->m_maxDocId ) ) {
		logTrace(g_conf.m_logTracePosdb, "DocId %" PRIu64 " does not match docId range %" PRIu64 " - %" PRIu64 ", SKIPPED", m_docId, (uint64_t)m_msg39req->m_minDocId, (uint64_t)m_msg39req->m_maxDocId);
		goto nextNode;
	}

	*docInThisFile = m_documentIndexChecker->exists(m_docId);
	if ( ! (*docInThisFile) ) {
		logTrace(g_conf.m_logTracePosdb, "DocId %" PRId64 " is not in this file, SKIPPED", m_docId);
		return false;
	}

	logTrace(g_conf.m_logTracePosdb, "DocId %" PRId64 " - setting up score buffer", m_docId);

	(*numProcessed)++;

	// set query termlists in all sublists
	for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
		// get it
		QueryTermInfo *qti = &qtibuf[i];
		// do not advance negative termlist cursor
		if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) {
			continue;
		}
		// do each sublist
		for ( int32_t j = 0 ; j < qti->m_numMatchingSubLists ; j++ ) {
			// get termlist for that docid
			const char *xlist    = qti->m_matchingSubListStart[j];
			const char *xlistEnd = qti->m_matchingSubListEnd[j];
			const char *xp = getWordPosList ( m_docId, xlist, xlistEnd - xlist );

			// not there? xlist will be NULL
			qti->m_matchingSubListSavedCursor[j] = xp;
			// if not there make cursor NULL as well
			if ( ! xp ) {
				qti->m_matchingSubListCursor[j] = NULL;
				continue;
			}
			// skip over docid list (the leading 12-byte key)
			xp += 12;
			for ( ; ; ) {
				// do not breach sublist
				if ( xp >= xlistEnd ) {
					break;
				}
				// break if 12 byte key: another docid!
				// (0x04 clear marks the start of a new docid's key)
				if ( ! (xp[0] & 0x04) ) {
					break;
				}
				// skip over 6 byte key
				xp += 6;
			}
			// point to docid sublist end
			qti->m_matchingSubListCursor[j] = xp;
		}
	}
	return false;
}
2016-11-08 14:26:10 +01:00
// . Scoring-info pass helper: fill in the DocIdScore record *dcs for the
//   current m_docId and append it to m_scoreInfoBuf.
// . If the docid equals *lastDocId the previous record is overwritten
//   (the new pass has better siterank/langid info).
// . When m_scoreInfoBuf is full, evicts ONE record whose docid is no
//   longer in the top tree (supplanted by a later docid range), together
//   with its pair/single score chunks, and rebases the offsets of the
//   remaining records.
// . Returns true when the caller can proceed normally; false after an
//   eviction was performed (caller-specific meaning).
// . *lastLen / *lastDocId are in-out bookkeeping for the overwrite case.
bool PosdbTable::genDebugScoreInfo2 ( DocIdScore *dcs, int32_t *lastLen, uint64_t *lastDocId, char siteRank, float score, int32_t intScore, char docLang ) {
	char *sx;
	char *sxEnd;
	int32_t pairOffset;
	int32_t pairSize;
	int32_t singleOffset;
	int32_t singleSize;

	dcs->m_siteRank   = siteRank;
	dcs->m_finalScore = score;

	// a double can capture an int without dropping any bits,
	// inlike a mere float
	if ( m_sortByTermNumInt >= 0 ) {
		dcs->m_finalScore = (double)intScore;
	}

	dcs->m_docId            = m_docId;
	dcs->m_numRequiredTerms = m_numQueryTermInfos;
	dcs->m_docLang          = docLang;

	logTrace(g_conf.m_logTracePosdb, "m_docId=%" PRId64 ", *lastDocId=%" PRId64 "", m_docId, *lastDocId);

	// ensure enough room we can't allocate in a thread!
	if ( m_scoreInfoBuf.getAvail() < (int32_t)sizeof(DocIdScore) + 1 ) {
		logTrace(g_conf.m_logTracePosdb, "END, NO ROOM");
		return true;
	}

	// if same as last docid, overwrite it since we have a higher
	// siterank or langid i guess
	if ( m_docId == *lastDocId ) {
		m_scoreInfoBuf.m_length = *lastLen;
	}

	// save that
	int32_t len = m_scoreInfoBuf.m_length;

	// copy into the safebuf for holding the scoring info
#ifdef _VALGRIND_
	// NOTE(review): this checks the pointer variable, not the pointed-to
	// DocIdScore — possibly intended to be (dcs, sizeof(*dcs)); confirm.
	VALGRIND_CHECK_MEM_IS_DEFINED(&dcs, sizeof(dcs));
#endif
	m_scoreInfoBuf.safeMemcpy ( (char *)dcs, sizeof(DocIdScore) );

	// save that
	*lastLen = len;
	// save it
	*lastDocId = m_docId;

	// try to fix dup docid problem! it was hitting the
	// overflow check right above here... strange!!!
	//m_docIdTable.removeKey ( &docId );

	/////////////////////////////
	//
	// . docid range split HACK...
	// . if we are doing docid range splitting, we process
	//   the search results separately in disjoint docid ranges.
	// . so because we still use the same m_scoreInfoBuf etc.
	//   for each split we process, we must remove info from
	//   a top docid of a previous split that got supplanted by
	//   a docid of this docid-range split, so we do not breach
	//   the allocated buffer size.
	// . so find out which docid we replaced
	//   in the top tree, and replace his entry in scoreinfobuf
	//   as well!
	// . his info was already added to m_pairScoreBuf in the
	//   getTermPairScoreForAny() function
	//
	//////////////////////////////

	// the top tree remains persistent between docid ranges.
	// and so does the score buf. so we must replace scores
	// if the docid gets replaced by a better scoring docid
	// from a following docid range of different docids.
	// However, scanning the docid scor buffer each time is
	// really slow, so we need to get the kicked out docid
	// from the top tree and use that to store its offset
	// into m_scoreInfoBuf so we can remove it.
	DocIdScore *si;

	// only kick out docids from the score buffer when there
	// is no room left...
	if ( m_scoreInfoBuf.getAvail() >= (int)sizeof(DocIdScore) ) {
		logTrace(g_conf.m_logTracePosdb, "END, OK. m_docId=%" PRId64 ", *lastDocId=%" PRId64 "", m_docId, *lastDocId);
		return true;
	}

	sx    = m_scoreInfoBuf.getBufStart();
	sxEnd = sx + m_scoreInfoBuf.length();

	// if we have not supplanted anyone yet, be on our way
	for ( ; sx < sxEnd ; sx += sizeof(DocIdScore) ) {
		si = (DocIdScore *)sx;
		// if top tree no longer has this docid, we must
		// remove its associated scoring info so we do not
		// breach our scoring info bufs
		if ( ! m_topTree->hasDocId( si->m_docId ) ) {
			break;
		}
	}

	// might not be full yet
	if ( sx >= sxEnd ) {
		logTrace(g_conf.m_logTracePosdb, "END, OK 2");
		return true;
	}

	// must be there!
	if ( ! si ) {
		gbshutdownAbort(true);
	}

	// note it because it is slow
	// this is only used if getting score info, which is
	// not default when getting an xml or json feed
	logTrace(g_conf.m_logTracePosdb, "Kicking out docid %" PRId64 " from score buf", si->m_docId);

	// get his single and pair offsets
	pairOffset   = si->m_pairsOffset;
	pairSize     = si->m_numPairs * sizeof(PairScore);
	singleOffset = si->m_singlesOffset;
	singleSize   = si->m_numSingles * sizeof(SingleScore);
	// nuke him
	m_scoreInfoBuf.removeChunk1 ( sx, sizeof(DocIdScore) );
	// and his related info
	m_pairScoreBuf.removeChunk2 ( pairOffset, pairSize );
	m_singleScoreBuf.removeChunk2 ( singleOffset, singleSize );

	// adjust offsets of remaining single scores
	sx = m_scoreInfoBuf.getBufStart();
	for ( ; sx < sxEnd ; sx += sizeof(DocIdScore) ) {
		si = (DocIdScore *)sx;
		if ( si->m_pairsOffset > pairOffset ) {
			si->m_pairsOffset -= pairSize;
		}
		if ( si->m_singlesOffset > singleOffset ) {
			si->m_singlesOffset -= singleSize;
		}
	}

	// adjust this too!
	*lastLen -= sizeof(DocIdScore);

	logTrace(g_conf.m_logTracePosdb, "Returning false");
	return false;
}
2017-04-21 17:25:21 +02:00
void PosdbTable : : logDebugScoreInfo ( int32_t loglevel ) {
DocIdScore * si ;
char * sx ;
char * sxEnd ;
logTrace ( g_conf . m_logTracePosdb , " BEGIN " ) ;
sx = m_scoreInfoBuf . getBufStart ( ) ;
sxEnd = sx + m_scoreInfoBuf . length ( ) ;
log ( loglevel , " DocId scores in m_scoreInfoBuf: " ) ;
for ( ; sx < sxEnd ; sx + = sizeof ( DocIdScore ) ) {
si = ( DocIdScore * ) sx ;
log ( loglevel , " docId: %14 " PRIu64 " , score: %f " , si - > m_docId , si - > m_finalScore ) ;
// if top tree no longer has this docid, we must
// remove its associated scoring info so we do not
// breach our scoring info bufs
if ( ! m_topTree - > hasDocId ( si - > m_docId ) ) {
log ( loglevel , " ^ NOT in topTree anymore! " ) ;
}
}
logTrace ( g_conf . m_logTracePosdb , " END " ) ;
}
void PosdbTable : : removeScoreInfoForDeletedDocIds ( ) {
DocIdScore * si ;
char * sx ;
char * sxEnd ;
int32_t pairOffset ;
int32_t pairSize ;
int32_t singleOffset ;
int32_t singleSize ;
logTrace ( g_conf . m_logTracePosdb , " BEGIN " ) ;
sx = m_scoreInfoBuf . getBufStart ( ) ;
sxEnd = sx + m_scoreInfoBuf . length ( ) ;
for ( ; sx < sxEnd ; sx + = sizeof ( DocIdScore ) ) {
si = ( DocIdScore * ) sx ;
// if top tree no longer has this docid, we must
// remove its associated scoring info so we do not
// breach our scoring info bufs
if ( m_topTree - > hasDocId ( si - > m_docId ) ) {
continue ;
}
logTrace ( g_conf . m_logTracePosdb , " Removing old score info for docId % " PRId64 " " , si - > m_docId ) ;
// get his single and pair offsets
pairOffset = si - > m_pairsOffset ;
pairSize = si - > m_numPairs * sizeof ( PairScore ) ;
singleOffset = si - > m_singlesOffset ;
singleSize = si - > m_numSingles * sizeof ( SingleScore ) ;
// nuke him
m_scoreInfoBuf . removeChunk1 ( sx , sizeof ( DocIdScore ) ) ;
// and his related info
m_pairScoreBuf . removeChunk2 ( pairOffset , pairSize ) ;
m_singleScoreBuf . removeChunk2 ( singleOffset , singleSize ) ;
// adjust offsets of remaining single scores
sx = m_scoreInfoBuf . getBufStart ( ) ;
for ( ; sx < sxEnd ; sx + = sizeof ( DocIdScore ) ) {
si = ( DocIdScore * ) sx ;
if ( si - > m_pairsOffset > pairOffset ) {
si - > m_pairsOffset - = pairSize ;
}
if ( si - > m_singlesOffset > singleOffset ) {
si - > m_singlesOffset - = singleSize ;
}
}
sxEnd - = sizeof ( DocIdScore ) ;
}
logTrace ( g_conf . m_logTracePosdb , " END " ) ;
}
2016-09-13 12:32:01 +02:00
// Pre-advance each termlist's cursor to skip to next docid.
//
// Set QueryTermInfo::m_matchingSubListCursor to NEXT docid
// Set QueryTermInfo::m_matchingSubListSavedCursor to CURRENT docid
// of each termlist so we are ready for a quick skip over this docid.
//
// Returns false only if a corrupt posdb key (an unexpected 18-byte key
// in the middle of a docid's entries) is encountered; true otherwise.
//
// TODO: use just a single array of termlist ptrs perhaps,
// then we can remove them when they go NULL. and we'd save a little
// time not having a nested loop.
bool PosdbTable::advanceTermListCursors(const char *docIdPtr, QueryTermInfo *qtibuf) {
	logTrace(g_conf.m_logTracePosdb, "BEGIN");

	for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
		// get it
		QueryTermInfo *qti = &qtibuf[i];

		// do not advance negative termlist cursor
		if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) {
			continue;
		}

		//
		// In first pass, sublists data is initialized by delNonMatchingDocIdsFromSubLists.
		// In second pass (to get detailed scoring info for UI output), they are initialized above
		//
		for ( int32_t j = 0 ; j < qti->m_numMatchingSubLists ; j++ ) {
			// shortcuts
			const char *xc    = qti->m_matchingSubListCursor[j];
			const char *xcEnd = qti->m_matchingSubListEnd[j];

			// exhausted? (we can't make cursor NULL because
			// getMaxPossibleScore() needs the last ptr)
			// must match docid: the key's docid bytes live at
			// xc+8..xc+11 plus the top 6 bits of xc[7]; compare
			// them with docIdPtr+1..docIdPtr+4 and the top 6 bits
			// of docIdPtr[0].
			if ( xc >= xcEnd ||
			     *(int32_t *)(xc+8) != *(int32_t *)(docIdPtr+1) ||
			     (*(char *)(xc+7) & 0xfc) != (*(char *)(docIdPtr) & 0xfc) ) {
				// flag it as not having the docid
				qti->m_matchingSubListSavedCursor[j] = NULL;
				// skip this sublist if does not have our docid
				continue;
			}

			// save it
			qti->m_matchingSubListSavedCursor[j] = xc;
			// get new docid
			//log("new docid %" PRId64,Posdb::getDocId(xc) );
			// advance the cursors. skip our 12
			xc += 12;

			// then skip any following 6 byte keys because they
			// share the same docid
			for ( ; ; xc += 6 ) {
				// end of whole termlist?
				if ( xc >= xcEnd ) {
					break;
				}

				// sanity. no 18 byte keys allowed: the low
				// compression bits of the first byte must not
				// both be zero past the first key of a docid.
				if ( (*xc & 0x06) == 0x00 ) {
					// i've seen this triggered on gk28.
					// a dump of posdb for the termlist
					// for 'post' had corruption in it,
					// yet its twin, gk92 did not. the
					// corruption could have occurred
					// anywhere from nov 2012 to may 2013,
					// and the posdb file was never
					// re-merged! must have been blatant
					// disk malfunction?
					log("posdb: encountered corrupt posdb list. bailing.");
					logTrace(g_conf.m_logTracePosdb, "END.");
					return false;
					//gbshutdownAbort(true);
				}

				// the next docid? it will be a 12 byte key.
				if ( ! (*xc & 0x04) ) {
					break;
				}
			}

			// assign to next docid word position list
			qti->m_matchingSubListCursor[j] = xc;
		}
	}

	logTrace(g_conf.m_logTracePosdb, "END");
	return true;
}
#define RINGBUFSIZE 4096

//
// Cheap distance-based pre-filter: estimate, for each query term, the best
// possible pair score it could achieve with the shortest-termlist query term
// (m_minTermListIdx) for the current docid, using only word-position
// proximity. If any term's maximum possible score cannot beat
// minWinningScore, the docid cannot make the top results and is rejected.
//
// TODO: consider skipping this pre-filter if it sucks, as it does
// for 'search engine'. it might save time!
//
// Returns:
//	false - docid does not meet minimum score requirement
//	true - docid can potentially be a top scoring docid
//
bool PosdbTable::prefilterMaxPossibleScoreByDistance(const QueryTermInfo *qtibuf, const int32_t *qpos, float minWinningScore) {
	// Slot value = query term index that last occupied that word position
	// (mod RINGBUFSIZE); 0xff = empty. +10 slack so the 4-slots-at-a-time
	// emptiness scan below cannot read past the logical end.
	// NOTE(review): term indexes are truncated to unsigned char here, so
	// this presumably assumes fewer than 255 query terms (0xff is the
	// empty sentinel) — confirm upstream query-term limits.
	unsigned char ringBuf[RINGBUFSIZE+10];

	// reset ring buf. make all slots 0xff. should be 1000 cycles or so.
	memset(ringBuf, 0xff, sizeof(ringBuf));

	unsigned char qt;
	uint32_t wx;
	int32_t ourFirstPos = -1;
	int32_t qdist;

	logTrace(g_conf.m_logTracePosdb, "BEGIN");

	// now to speed up 'time enough for love' query which does not
	// have many super high scoring guys on top we need a more restrictive
	// filter than getMaxPossibleScore() so let's pick one query term,
	// the one with the shortest termlist, and see how close it gets to
	// each of the other query terms. then score each of those pairs.
	// so quickly record the word positions of each query term into
	// a ring buffer of 4096 slots where each slot contains the
	// query term # plus 1.
	logTrace(g_conf.m_logTracePosdb, "Ring buffer generation");

	const QueryTermInfo *qtx = &qtibuf[m_minTermListIdx];

	// populate ring buf just for this query term
	for ( int32_t k = 0 ; k < qtx->m_numMatchingSubLists ; k++ ) {
		// scan that sublist and add word positions
		const char *sub = qtx->m_matchingSubListSavedCursor[k];

		// skip sublist if it's cursor is exhausted
		if ( ! sub ) {
			continue;
		}

		const char *end = qtx->m_matchingSubListCursor[k];

		// add first key. word position = bits 6.. of the 32 bits at
		// key offset 3 (unaligned load, same as Posdb::getWordPos).
		//int32_t wx = Posdb::getWordPos(sub);
		wx = (*((uint32_t *)(sub+3))) >> 6;
		// mod with 4096
		wx &= (RINGBUFSIZE-1);
		// store it. 0 is legit.
		ringBuf[wx] = m_minTermListIdx;
		// set this
		ourFirstPos = wx;
		// skip first key
		sub += 12;
		// then 6 byte keys
		for ( ; sub < end ; sub += 6 ) {
			// get word position
			//wx = Posdb::getWordPos(sub);
			wx = (*((uint32_t *)(sub+3))) >> 6;
			// mod with 4096
			wx &= (RINGBUFSIZE-1);
			// store it. 0 is legit.
			ringBuf[wx] = m_minTermListIdx;
		}
	}

	// now get query term closest to query term # m_minTermListIdx which
	// is the query term # with the shortest termlist
	// get closest term to m_minTermListIdx and the distance
	logTrace(g_conf.m_logTracePosdb, "Ring buffer generation 2");

	for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
		if ( i == m_minTermListIdx ) {
			continue;
		}

		// get the query term info
		const QueryTermInfo *qti = &qtibuf[i];

		// if we have a negative term, skip it
		if ( qti->m_bigramFlags[0] & (BF_NEGATIVE) ) {
			continue;
		}

		// store all his word positions into ring buffer AS WELL
		for ( int32_t k = 0 ; k < qti->m_numMatchingSubLists ; k++ ) {
			// scan that sublist and add word positions
			const char *sub = qti->m_matchingSubListSavedCursor[k];

			// skip sublist if it's cursor is exhausted
			if ( ! sub ) {
				continue;
			}

			const char *end = qti->m_matchingSubListCursor[k];

			// add first key
			//int32_t wx = Posdb::getWordPos(sub);
			wx = (*((uint32_t *)(sub+3))) >> 6;
			// mod with 4096
			wx &= (RINGBUFSIZE-1);
			// store it. 0 is legit.
			ringBuf[wx] = i;
			// skip first key
			sub += 12;
			// then 6 byte keys
			for ( ; sub < end ; sub += 6 ) {
				// get word position
				//wx = Posdb::getWordPos(sub);
				wx = (*((uint32_t *)(sub+3))) >> 6;
				// mod with 4096
				wx &= (RINGBUFSIZE-1);
				// store it. 0 is legit.
				ringBuf[wx] = i;
			}
		}

		// reset. "hisLastPos" tracks the min-termlist term
		// (m_minTermListIdx), "ourLastPos" tracks term #i.
		int32_t ourLastPos = -1;
		int32_t hisLastPos = -1;
		int32_t bestDist = 0x7fffffff;

		// how far is this guy from the man?
		for ( int32_t x = 0 ; x < (int32_t)RINGBUFSIZE ; ) {
			// skip next 4 slots if all empty. fast?
			if ( *(uint32_t *)(ringBuf+x) == 0xffffffff ) {
				x += 4;
				continue;
			}
			// skip if nobody
			if ( ringBuf[x] == 0xff ) {
				x++;
				continue;
			}
			// get query term #
			qt = ringBuf[x];
			// if it's the man
			if ( qt == m_minTermListIdx ) {
				// record
				hisLastPos = x;
				// skip if we are not there yet
				if ( ourLastPos == -1 ) {
					x++;
					continue;
				}
				// try distance fix
				if ( x - ourLastPos < bestDist ) {
					bestDist = x - ourLastPos;
				}
			}
			// if us
			else
			if ( qt == i ) {
				// record
				ourLastPos = x;
				// skip if he's not recorded yet
				if ( hisLastPos == -1 ) {
					x++;
					continue;
				}
				// check dist
				if ( x - hisLastPos < bestDist ) {
					bestDist = x - hisLastPos;
				}
			}

			x++;
		}

		// compare last occurence of query term #x with our first occ.
		// since this is a RING buffer
		// NOTE(review): if hisLastPos is still -1 here (all of the min
		// term's saved cursors were NULL) wrapDist is computed from -1 —
		// presumably that cannot happen on this code path; confirm.
		int32_t wrapDist = ourFirstPos + ((int32_t)RINGBUFSIZE - hisLastPos);
		if ( wrapDist < bestDist ) {
			bestDist = wrapDist;
		}

		// query distance
		qdist = qpos[m_minTermListIdx] - qpos[i];
		// compute it
		float maxScore2 = getMaxPossibleScore(&qtibuf[i],
		                                      bestDist,
		                                      qdist,
		                                      &qtibuf[m_minTermListIdx]);

		// -1 means it has inlink text so do not apply this constraint
		// to this docid because it is too difficult because we
		// sum up the inlink text
		if ( maxScore2 < 0.0 ) {
			continue;
		}

		// if any one of these terms have a max score below the
		// worst score of the 10th result, then it can not win.
		// @todo: BR. Really? ANY of them?
		if ( maxScore2 <= minWinningScore ) {
			logTrace(g_conf.m_logTracePosdb, "END - docid score too low");
			return false;
		}
	}

	logTrace(g_conf.m_logTracePosdb, "END - docid score high enough");
	return true;
}
2016-08-30 15:18:21 +02:00
2016-09-21 14:04:54 +02:00
//
// Data for the current DocID found in sublists of each query term
// is merged into a single list, so we end up with one list per query
// term.
//
// Outputs, per query term #j:
//   miniMergedList[j]/miniMergedEnd[j] - merged posdb keys for this docid
//     (NULL list for negative terms or terms with no surviving positions)
//   m_bflags[j]                        - bigram flags used by the scorers
//   *highestInlinkSiteRank             - max word-spam-rank seen among
//                                        HASHGROUP_INLINKTEXT keys
//
void PosdbTable::mergeTermSubListsForDocId(QueryTermInfo *qtibuf, char *miniMergeBuf, const char **miniMergedList, const char **miniMergedEnd, int *highestInlinkSiteRank) {
	logTrace(g_conf.m_logTracePosdb, "BEGIN.");

	// we got a docid that has all the query terms, so merge
	// each term's sublists into a single list just for this docid.
	//
	// all posdb keys for this docid should fit in here, the
	// mini merge buf:
	// NOTE(review): the merge loop stops at +299000 but the breach check
	// at the bottom uses +300000 — presumably the caller's buffer is
	// 300000 bytes and the 1000-byte margin absorbs the last key written;
	// confirm against the caller's buffer size.
	char *mptr     = miniMergeBuf;
	char *mptrEnd  = miniMergeBuf + 299000;
	char *lastMptr = NULL;

	// Merge each set of sublists, like we merge a term's list with
	// its two associated bigram lists, if there, the left bigram and
	// right bigram list. Merge all the synonym lists for that term
	// together as well, so if the term is 'run' we merge it with the
	// lists for 'running' 'ran' etc.
	logTrace(g_conf.m_logTracePosdb, "Merge sublists into a single list per query term");

	for ( int32_t j = 0 ; j < m_numQueryTermInfos ; j++ ) {
		// get the query term info
		QueryTermInfo *qti = &qtibuf[j];

		// just use the flags from first term i guess
		// NO! this loses the wikihalfstopbigram bit! so we gotta
		// add that in for the key i guess the same way we add in
		// the syn bits below!!!!!
		m_bflags[j] = qti->m_bigramFlags[0];

		// if we have a negative term, skip it
		if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) {
			// need to make this NULL for getSiteRank() call below
			miniMergedList[j] = NULL;
			// if its empty, that's good!
			continue;
		}

		// the merged list for term #j is here:
		miniMergedList[j] = mptr;
		bool isFirstKey = true;

		const char *nwp     [MAX_SUBLISTS];
		const char *nwpEnd  [MAX_SUBLISTS];
		char        nwpFlags[MAX_SUBLISTS];

		// populate the nwp[] arrays for merging
		int32_t nsub = 0;
		for ( int32_t k = 0 ; k < qti->m_numMatchingSubLists ; k++ ) {
			// NULL means does not have that docid
			if ( ! qti->m_matchingSubListSavedCursor[k] ) {
				continue;
			}
			// getMaxPossibleScore() incremented m_matchingSubListCursor to
			// the next docid so use m_matchingSubListSavedCursor.
			nwp     [nsub] = qti->m_matchingSubListSavedCursor[k];
			nwpEnd  [nsub] = qti->m_matchingSubListCursor[k];
			nwpFlags[nsub] = qti->m_bigramFlags[k];
			nsub++;
		}

		// if only one sublist had this docid, no need to merge
		// UNLESS it's a synonym list then we gotta set the
		// synbits on it, below!!! or a half stop wiki bigram like
		// the term "enough for" in the wiki phrase
		// "time enough for love" because we wanna reward that more!
		// this halfstopwikibigram bit is set in the indivial keys
		// so we'd have to at least do a key cleansing, so we can't
		// do this shortcut right now... mdw oct 10 2015
		if ( nsub == 1 &&
		     (nwpFlags[0] & BF_NUMBER) &&
		     ! (nwpFlags[0] & BF_SYNONYM) &&
		     ! (nwpFlags[0] & BF_HALFSTOPWIKIBIGRAM) ) {
			miniMergedList[j] = nwp[0];
			miniMergedEnd[j]  = nwpEnd[0];
			m_bflags[j]       = nwpFlags[0];
			continue;
		}

		// Merge the lists into a list in miniMergeBuf.
		// Get the min of each list
		bool currTermDone = false;
		char ks;

		do {
			// find the sublist with the smallest current key
			int32_t mink = -1;
			ks = 0;

			for ( int32_t k = 0 ; k < nsub ; k++ ) {
				// skip if list is exhausted
				if ( ! nwp[k] ) {
					continue;
				}
				// auto winner?
				if ( mink == -1 ) {
					mink = k;
					continue;
				}
				if ( KEYCMP(nwp[k], nwp[mink], 6) < 0 ) {
					mink = k; // a new min...
				}
			}

			// all exhausted? merge next set of sublists then for term #j
			if ( mink == -1 ) {
				// continue outer "j < m_numQueryTermInfos" loop.
				currTermDone = true;
			}
			else {
				// get keysize
				ks = Posdb::getKeySize(nwp[mink]);

				// HACK OF CONFUSION:
				//
				// skip it if its a query phrase term, like
				// "searchengine" is for the 'search engine' query
				// AND it has the synbit which means it was a bigram
				// in the doc (i.e. occurred as two separate terms)
				//
				// second check means it occurred as two separate terms
				// or could be like bob and occurred as "bob's".
				// see XmlDoc::hashWords3().
				// nwp[mink][2] & 0x03 is the posdb entry original/synonym/hyponym/.. flags
				if ( ! ( (nwpFlags[mink] & BF_BIGRAM) && (nwp[mink][2] & 0x03) ) ) {
					// if the first key in our merged list store the docid crap
					if ( isFirstKey ) {
						// store a 12 byte key in the merged list buffer
						memcpy(mptr, nwp[mink], 12);

						// Detect highest siterank of inlinkers
						if ( Posdb::getHashGroup(mptr+6) == HASHGROUP_INLINKTEXT ) {
							char inlinkerSiteRank = Posdb::getWordSpamRank(mptr+6);
							if ( inlinkerSiteRank > *highestInlinkSiteRank ) {
								*highestInlinkSiteRank = inlinkerSiteRank;
							}
						}

						// wipe out its syn bits and re-use our way
						mptr[2] &= 0xfc;
						// set the synbit so we know if its a synonym of term
						if ( nwpFlags[mink] & (BF_BIGRAM|BF_SYNONYM) ) {
							mptr[2] |= 0x02;
						}
						// wiki half stop bigram? so for the query
						// 'time enough for love' the phrase term "enough for"
						// is a half stopword wiki bigram, because it is in
						// a phrase in wikipedia ("time enough for love") and
						// one of the two words in the phrase term is a
						// stop word. therefore we give it more weight than
						// just 'enough' by itself.
						if ( nwpFlags[mink] & BF_HALFSTOPWIKIBIGRAM ) {
							mptr[2] |= 0x01;
						}
						// make sure its 12 bytes! it might have been
						// the first key for the termid, and 18 bytes.
						mptr[0] &= 0xf9;
						mptr[0] |= 0x02;
						// save it
						lastMptr = mptr;
						mptr += 12;
						isFirstKey = false;
					}
					else {
						// if matches last key word position, do not add!
						// we should add the bigram first if more important
						// since it should be added before the actual term
						// above in the sublist array. so if they are
						// wikihalfstop bigrams they will be added first,
						// otherwise, they are added after the regular term.
						// should fix double scoring bug for 'cheat codes'
						// query!
						if ( Posdb::getWordPos(lastMptr) != Posdb::getWordPos(nwp[mink]) ) {
							memcpy(mptr, nwp[mink], 6);

							// Detect highest siterank of inlinkers
							if ( Posdb::getHashGroup(mptr) == HASHGROUP_INLINKTEXT ) {
								char inlinkerSiteRank = Posdb::getWordSpamRank(mptr);
								if ( inlinkerSiteRank > *highestInlinkSiteRank ) {
									*highestInlinkSiteRank = inlinkerSiteRank;
								}
							}

							// wipe out its syn bits and re-use our way
							mptr[2] &= 0xfc;
							// set the synbit so we know if its a synonym of term
							if ( nwpFlags[mink] & (BF_BIGRAM|BF_SYNONYM) ) {
								mptr[2] |= 0x02;
							}
							if ( nwpFlags[mink] & BF_HALFSTOPWIKIBIGRAM ) {
								mptr[2] |= 0x01;
							}
							// if it was the first key of its list it may not
							// have its bit set for being 6 bytes now! so turn
							// on the 2 compression bits
							mptr[0] &= 0xf9;
							mptr[0] |= 0x06;
							// save it
							lastMptr = mptr;
							mptr += 6;
						}
					}
				}

				// advance the cursor over the key we used.
				nwp[mink] += ks; // Posdb::getKeySize(nwp[mink]);

				// exhausted?
				if ( nwp[mink] >= nwpEnd[mink] ) {
					nwp[mink] = NULL;
				}
				else
				if ( Posdb::getKeySize(nwp[mink]) != 6 ) {
					// or hit a different docid
					nwp[mink] = NULL;
				}
			} // mink != -1

			//log("skipping ks=%" PRId32,(int32_t)ks);
		} while ( ! currTermDone && mptr < mptrEnd ); // merge more ...

		// wrap it up here since done merging
		miniMergedEnd[j] = mptr;
		//log(LOG_ERROR,"%s:%d: j=%" PRId32 ": miniMergedList[%" PRId32 "]=%p, miniMergedEnd[%" PRId32 "]=%p, mptr=%p, mptrEnd=%p, term=[%.*s] - TERM DONE", __func__, __LINE__, j, j, miniMergedList[j], j, miniMergedEnd[j], mptr, mptrEnd, qti->m_qt->m_termLen, qti->m_qt->m_term);
	}

	// breach?
	if ( mptr > miniMergeBuf + 300000 ) {
		gbshutdownAbort(true);
	}

	// Make sure miniMergeList[x] is NULL if it contains no values
	for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
		// skip if not part of score
		if ( m_bflags[i] & (BF_PIPED|BF_NEGATIVE) ) {
			continue;
		}

		// get list
		const char *plist    = miniMergedList[i];
		const char *plistEnd = miniMergedEnd[i];
		int32_t     psize    = plistEnd - plist;

		if ( ! psize ) {
			// BR fix 20160918: Avoid working on empty lists where start and
			// end pointers are the same. This can happen if no positions are
			// copied to the merged list because they are all synonyms or
			// phrase terms. See "HACK OF CONFUSION" above..
			miniMergedList[i] = NULL;
		}

		//
		// sanity check for all
		//
		// test it. first key is 12 bytes.
		if ( psize && Posdb::getKeySize(plist) != 12 ) {
			log(LOG_ERROR, "%s:%s:%d: psize=%" PRId32 "", __FILE__, __func__, __LINE__, psize);
			gbshutdownAbort(true);
		}

		// next key is 6
		if ( psize > 12 && Posdb::getKeySize(plist+12) != 6 ) {
			log(LOG_ERROR, "%s:%s:%d: next key size=%" PRId32 "", __FILE__, __func__, __LINE__, Posdb::getKeySize(plist+12));
			gbshutdownAbort(true);
		}
	}

	logTrace(g_conf.m_logTracePosdb, "END.");
}
2016-10-03 16:25:38 +02:00
2016-09-21 14:04:54 +02:00
2016-10-14 11:57:15 +02:00
//
// Nested for loops to score the term pairs.
//
// Store best scores into the scoreMatrix so the sliding window
// algorithm can use them from there to do sub-outs
//
void PosdbTable : : createNonBodyTermPairScoreMatrix ( const char * * miniMergedList , const char * * miniMergedEnd , float * scoreMatrix ) {
int32_t qdist ;
logTrace ( g_conf . m_logTracePosdb , " BEGIN " ) ;
// scan over each query term (its synonyms are part of the
// QueryTermInfo)
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i + + ) {
// skip if not part of score
if ( m_bflags [ i ] & ( BF_PIPED | BF_NEGATIVE | BF_NUMBER ) ) {
2016-10-03 16:25:38 +02:00
continue ;
}
2016-10-14 11:57:15 +02:00
// and pair it with each other possible query term
for ( int32_t j = i + 1 ; j < m_numQueryTermInfos ; j + + ) {
// skip if not part of score
if ( m_bflags [ j ] & ( BF_PIPED | BF_NEGATIVE | BF_NUMBER ) ) {
continue ;
}
// but if they are in the same wikipedia phrase
// then try to keep their positions as in the query.
// so for 'time enough for love' ideally we want
// 'time' to be 6 units apart from 'love'
float wts ;
// zero means not in a phrase
if ( m_wikiPhraseIds [ j ] = = m_wikiPhraseIds [ i ] & & m_wikiPhraseIds [ j ] ) {
// . the distance between the terms in the query
// . ideally we'd like this distance to be reflected
// in the matched terms in the body
qdist = m_qpos [ j ] - m_qpos [ i ] ;
// wiki weight
wts = ( float ) WIKI_WEIGHT ; // .50;
2016-10-03 16:25:38 +02:00
}
else {
2016-10-14 11:57:15 +02:00
// basically try to get query words as close
// together as possible
qdist = 2 ;
// this should help fix
// 'what is an unsecured loan' so we are more likely
// to get the page that has that exact phrase in it.
// yes, but hurts how to make a lock pick set.
//qdist = qpos[j] - qpos[i];
// wiki weight
wts = 1.0 ;
2016-10-03 16:25:38 +02:00
}
2016-10-14 11:57:15 +02:00
float maxnbtp ;
//
// get score for term pair from non-body occuring terms
//
if ( miniMergedList [ i ] & & miniMergedList [ j ] ) {
maxnbtp = getMaxScoreForNonBodyTermPair ( miniMergedList [ i ] , miniMergedList [ j ] ,
miniMergedEnd [ i ] , miniMergedEnd [ j ] , qdist ) ;
}
else {
maxnbtp = - 1 ;
2016-10-03 16:25:38 +02:00
}
2016-10-14 11:57:15 +02:00
// it's -1 if one term is in the body/header/menu/etc.
if ( maxnbtp < 0 ) {
wts = - 1.00 ;
}
else {
wts * = maxnbtp ;
wts * = m_freqWeights [ i ] ;
wts * = m_freqWeights [ j ] ;
2016-10-03 16:25:38 +02:00
}
2016-10-14 11:57:15 +02:00
// store in matrix for "sub out" algo below
// when doing sliding window
scoreMatrix [ i * m_nqt + j ] = wts ;
2016-10-03 16:25:38 +02:00
}
}
2016-10-14 11:57:15 +02:00
logTrace ( g_conf . m_logTracePosdb , " END " ) ;
}
2016-10-03 16:25:38 +02:00
2016-10-18 14:17:39 +02:00
//
// Finds the highest single term score sum.
// Creates array of highest scoring non-body positions
//
float PosdbTable : : getMinSingleTermScoreSum ( const char * * miniMergedList , const char * * miniMergedEnd , const char * * highestScoringNonBodyPos , DocIdScore * pdcs ) {
2016-10-14 11:57:15 +02:00
float minSingleScore = 999999999.0 ;
2017-04-10 20:55:07 +02:00
bool scoredTerm = false ;
2016-10-03 16:25:38 +02:00
2016-10-14 11:57:15 +02:00
logTrace ( g_conf . m_logTracePosdb , " BEGIN " ) ;
// Now add single word scores.
//
// Having inlink text from siterank 15 of max
// diversity/density will result in the largest score,
// but we add them all up...
//
// This should be highly negative if singles[i] has a '-'
// termsign...
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i + + ) {
if ( ! miniMergedList [ i ] ) {
continue ;
}
// skip if to the left of a pipe operator
if ( m_bflags [ i ] & ( BF_PIPED | BF_NEGATIVE | BF_NUMBER ) ) {
continue ;
}
// sometimes there is no wordpos subtermlist for this docid
// because it just has the bigram, like "streetlight" and not
// the word "light" by itself for the query 'street light'
//if ( miniMergedList[i] ) {
// assume all word positions are in body
//highestScoringNonBodyPos[i] = NULL;
// This scans all word positions for this term.
2016-10-03 16:25:38 +02:00
//
2016-10-14 11:57:15 +02:00
// This should ignore occurences in the body and only
// focus on inlink text, etc.
2016-10-03 16:25:38 +02:00
//
2016-10-14 11:57:15 +02:00
// Sets "highestScoringNonBodyPos" to point to the winning word
// position which does NOT occur in the body.
2016-10-03 16:25:38 +02:00
//
2016-10-14 11:57:15 +02:00
// Adds up MAX_TOP top scores and returns that sum.
2016-10-03 16:25:38 +02:00
//
2016-10-14 11:57:15 +02:00
// pdcs is NULL if not currPassNum == INTERSECT_DEBUG_INFO
float sts = getBestScoreSumForSingleTerm ( i , miniMergedList [ i ] , miniMergedEnd [ i ] , pdcs , & highestScoringNonBodyPos [ i ] ) ;
2017-04-10 20:55:07 +02:00
scoredTerm = true ;
2016-10-14 11:57:15 +02:00
// sanity check
if ( highestScoringNonBodyPos [ i ] & & s_inBody [ Posdb : : getHashGroup ( highestScoringNonBodyPos [ i ] ) ] ) {
gbshutdownAbort ( true ) ;
}
//sts /= 3.0;
if ( sts < minSingleScore ) {
minSingleScore = sts ;
}
}
2017-04-10 20:55:07 +02:00
if ( ! scoredTerm ) {
minSingleScore = - 1 ;
}
2016-10-14 11:57:15 +02:00
logTrace ( g_conf . m_logTracePosdb , " END. minSingleScore=%f " , minSingleScore ) ;
return minSingleScore ;
}
2016-10-10 16:02:50 +02:00
// Like getTermPairScore, but uses the word positions currently pointed to by ptrs[i].
// Does NOT scan the word position lists.
// Also tries to sub-out each term with the title or linktext wordpos term
// pointed to by "highestScoringNonBodyPos[i]".
//
// OUTPUT:
//   m_bestMinTermPairWindowScore: The best minimum window score
//   m_bestMinTermPairWindowPtrs : Pointers to query term positions giving the best minimum score
//
void PosdbTable::findMinTermPairScoreInWindow(const char **ptrs, const char **highestScoringNonBodyPos, float *scoreMatrix) {
	int32_t qdist = 0;
	// Running minimum over all term-pair scores in this window.
	float minTermPairScoreInWindow = 999999999.0;
	bool scoredTerms = false;

	logTrace(g_conf.m_logTracePosdb, "BEGIN.");

	// TODO: only do this loop on the (i,j) pairs where i or j
	// is the term whose position got advanced in the sliding window.
	for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
		// skip if to the left of a pipe operator
		if ( m_bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) )
			continue;

		// skip empty list
		if ( ! ptrs[i] ) {
			continue;
		}

		//if ( ptrs[i] ) wpi = ptrs[i];
		// if term does not occur in body, sub-in the best term
		// from the title/linktext/etc.
		//else wpi = highestScoringNonBodyPos[i];
		const char *wpi = ptrs[i];

		// loop over other terms
		for ( int32_t j = i + 1 ; j < m_numQueryTermInfos ; j++ ) {
			// skip if to the left of a pipe operator
			if ( m_bflags[j] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) )
				continue;

			// skip empty list
			if ( ! ptrs[j] ) {
				continue;
			}

			// TODO: use a cache using wpi/wpj as the key.
			//if ( ptrs[j] ) wpj = ptrs[j];
			// if term does not occur in body, sub-in the best term
			// from the title/linktext/etc.
			//else wpj = highestScoringNonBodyPos[j];
			const char *wpj = ptrs[j];
			float wikiWeight;

			// in same wikipedia phrase?
			if ( m_wikiPhraseIds[j] == m_wikiPhraseIds[i] &&
			     // zero means not in a phrase
			     m_wikiPhraseIds[j] ) {
				// try to get dist that matches qdist exactly
				qdist = m_qpos[j] - m_qpos[i];
				// wiki weight
				wikiWeight = WIKI_WEIGHT; // .50;
			}
			else {
				// basically try to get query words as close
				// together as possible
				qdist = 2;
				// fix 'what is an unsecured loan' to get the
				// exact phrase with higher score
				//m_qdist = m_qpos[j] - m_qpos[i];
				// wiki weight
				wikiWeight = 1.0;
			}

			// this will be -1 if wpi or wpj is NULL
			float max = getScoreForTermPair(wpi, wpj, 0, qdist);
			scoredTerms = true;

			// try sub-ing in the best title occurence or best
			// inlink text occurence. cuz if the term is in the title
			// but these two terms are really far apart, we should
			// get a better score. FIXED_DISTANCE applies the
			// substitution's distance penalty.
			float score = getScoreForTermPair(highestScoringNonBodyPos[i], wpj, FIXED_DISTANCE, qdist);
			if ( score > max ) {
				max = score;
			}

			// a double pair sub should be covered in the
			// getMaxScoreForNonBodyTermPair() function
			score = getScoreForTermPair(highestScoringNonBodyPos[i], highestScoringNonBodyPos[j], FIXED_DISTANCE, qdist);
			if ( score > max ) {
				max = score;
			}

			score = getScoreForTermPair(wpi, highestScoringNonBodyPos[j], FIXED_DISTANCE, qdist);
			if ( score > max ) {
				max = score;
			}

			// wikipedia phrase weight
			if ( ! almostEqualFloat(wikiWeight, 1.0) ) {
				max *= wikiWeight;
			}

			// term freqweight here
			max *= m_freqWeights[i] * m_freqWeights[j];

			// use score from scoreMatrix if bigger
			if ( scoreMatrix[m_nqt*i+j] > max ) {
				max = scoreMatrix[m_nqt*i+j];
			}

			// in same quoted phrase?
			if ( m_quotedStartIds[j] >= 0 && m_quotedStartIds[j] == m_quotedStartIds[i] ) {
				// no subouts allowed i guess
				// NOTE(review): wpi/wpj were assigned from ptrs[i]/ptrs[j],
				// which are checked non-NULL above, so these two guards
				// look unreachable — confirm before removing.
				if ( ! wpi ) {
					max = -1.0;
				}
				else
				if ( ! wpj ) {
					max = -1.0;
				}
				else {
					// NOTE(review): this inner qdist shadows the outer
					// qdist declared at the top of the function.
					int32_t qdist = m_qpos[j] - m_qpos[i];
					int32_t p1 = Posdb::getWordPos(wpi);
					int32_t p2 = Posdb::getWordPos(wpj);
					int32_t dist = p2 - p1;
					// must be in right order!
					if ( dist < 0 ) {
						max = -1.0;
					}
					// allow for a discrepancy of 1 unit in case
					// there is a comma? i think we add an extra
					// unit
					else
					if ( dist > qdist && dist - qdist > 1 ) {
						max = -1.0;
						//log("ddd1: i=%" PRId32" j=%" PRId32" "
						//    "dist=%" PRId32" qdist=%" PRId32,
						//    i,j,dist,qdist);
					}
					else
					if ( dist < qdist && qdist - dist > 1 ) {
						max = -1.0;
					}
				}
			}

			// now we want the sliding window with the largest min
			// term pair score!
			if ( max < minTermPairScoreInWindow ) {
				minTermPairScoreInWindow = max;
			}
		}
	}

	// Our best minimum score better than current best minimum score?
	if ( minTermPairScoreInWindow <= m_bestMinTermPairWindowScore || ! scoredTerms ) {
		logTrace(g_conf.m_logTracePosdb, "END.");
		return;
	}

	// Yep, our best minimum score is the highest so far
	m_bestMinTermPairWindowScore = minTermPairScoreInWindow;

	// Record term positions in winning window
	for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
		m_bestMinTermPairWindowPtrs[i] = ptrs[i];
	}

	logTrace(g_conf.m_logTracePosdb, "END.");
}
2016-10-18 14:17:39 +02:00
float PosdbTable : : getMinTermPairScoreSlidingWindow ( const char * * miniMergedList , const char * * miniMergedEnd , const char * * highestScoringNonBodyPos , const char * * winnerStack , const char * * xpos , float * scoreMatrix , DocIdScore * pdcs ) {
2016-10-03 16:25:38 +02:00
bool allNull ;
int32_t minPos = 0 ;
logTrace ( g_conf . m_logTracePosdb , " Sliding Window algorithm begins " ) ;
2016-10-10 16:02:50 +02:00
m_bestMinTermPairWindowPtrs = winnerStack ;
2016-10-03 16:25:38 +02:00
// Scan the terms that are in the body in a sliding window
//
// Compute the term pair score on just the terms in that
// sliding window. that way, the term for a word like 'dog'
// keeps the same word position when it is paired up with the
// other terms.
//
// Compute the score the same way getTermPairScore() works so
// we are on the same playing field
//
// Sub-out each term with its best scoring occurence in the title
// or link text or meta tag, etc. but it gets a distance penalty
// of like 100 units or so.
//
// If term does not occur in the body, the sub-out approach should
// fix that.
//
// Keep a matrix of the best scores between two terms from the
// above double nested loop algo, and replace that pair if we
// got a better score in the sliding window.
// use special ptrs for the windows so we do not mangle
// miniMergedList[] array because we use that below!
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i + + ) {
xpos [ i ] = miniMergedList [ i ] ;
}
//
// init each list ptr to the first wordpos rec in the body
// for THIS DOCID and if no such rec, make it NULL
//
allNull = true ;
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i + + ) {
// skip if to the left of a pipe operator
if ( m_bflags [ i ] & ( BF_PIPED | BF_NEGATIVE | BF_NUMBER ) ) {
continue ;
}
2016-10-10 16:02:50 +02:00
// skip word position until it is in the body
2016-10-03 16:25:38 +02:00
while ( xpos [ i ] & & ! s_inBody [ Posdb : : getHashGroup ( xpos [ i ] ) ] ) {
// advance
if ( ! ( xpos [ i ] [ 0 ] & 0x04 ) ) {
xpos [ i ] + = 12 ;
}
else {
xpos [ i ] + = 6 ;
}
// NULLify list if no more for this docid
if ( xpos [ i ] < miniMergedEnd [ i ] & & ( xpos [ i ] [ 0 ] & 0x04 ) ) {
continue ;
}
// ok, no more! null means empty list
xpos [ i ] = NULL ;
// must be in title or something else then
2016-10-10 16:02:50 +02:00
if ( ! highestScoringNonBodyPos [ i ] ) {
2016-10-03 16:25:38 +02:00
gbshutdownAbort ( true ) ;
}
}
// if all xpos are NULL, then no terms are in body...
if ( xpos [ i ] ) {
allNull = false ;
}
}
// if no terms in body, no need to do sliding window
2016-10-14 09:52:10 +02:00
bool doneSliding = allNull ? true : false ;
2016-10-03 16:25:38 +02:00
while ( ! doneSliding ) {
//
2016-10-14 09:52:10 +02:00
// Now all xpos point to positions in the document body. Calc the "window" score (score
// for current term positions).
2016-10-03 16:25:38 +02:00
//
2016-10-14 09:52:10 +02:00
// If window score beats m_bestMinTermPairWindowScore we store the term xpos pointers
// that define this window in the m_bestMinTermPairWindowPtrs[] array.
2016-10-03 16:25:38 +02:00
//
2016-10-14 09:52:10 +02:00
// Will try to substitute either of the two term positions with highestScoringNonBodyPos[i]
// if better, but will fix the distance to FIXED_DISTANCE to give a distance penalty.
2016-10-03 16:25:38 +02:00
//
2016-10-14 09:52:10 +02:00
// "scoreMatrix" contains the highest scoring non-body term pair score, which will
// be used if higher than the calculated score for the terms.
2016-10-03 16:25:38 +02:00
//
2016-10-18 14:17:39 +02:00
// Sets m_bestMinTermPairWindowScore and m_bestMinTermPairWindowPtrs if this window score beats it.
2016-10-03 16:25:38 +02:00
//
2016-10-10 16:02:50 +02:00
findMinTermPairScoreInWindow ( xpos , highestScoringNonBodyPos , scoreMatrix ) ;
2016-10-03 16:25:38 +02:00
2016-10-13 16:01:15 +02:00
bool advanceMin ;
2016-10-03 16:25:38 +02:00
2016-10-10 16:02:50 +02:00
do {
2016-10-13 16:01:15 +02:00
advanceMin = false ;
2016-10-10 16:02:50 +02:00
//
2016-10-13 16:01:15 +02:00
// Find the minimum word position in the document body for ANY of the query terms.
2016-10-13 14:49:49 +02:00
// minPosTermIdx will contain the term index, minPos the position.
2016-10-10 16:02:50 +02:00
//
2016-10-14 09:52:10 +02:00
int32_t minPosTermIdx = - 1 ;
2016-10-03 16:25:38 +02:00
for ( int32_t x = 0 ; x < m_numQueryTermInfos ; x + + ) {
// skip if to the left of a pipe operator
// and numeric posdb termlists do not have word positions,
// they store a float there.
if ( m_bflags [ x ] & ( BF_PIPED | BF_NEGATIVE | BF_NUMBER ) ) {
continue ;
}
if ( ! xpos [ x ] ) {
continue ;
}
2016-10-13 14:49:49 +02:00
if ( xpos [ x ] & & minPosTermIdx = = - 1 ) {
minPosTermIdx = x ;
2016-10-03 16:25:38 +02:00
//minRec = xpos[x];
minPos = Posdb : : getWordPos ( xpos [ x ] ) ;
continue ;
}
if ( Posdb : : getWordPos ( xpos [ x ] ) > = minPos ) {
continue ;
}
2016-10-13 14:49:49 +02:00
minPosTermIdx = x ;
2016-10-03 16:25:38 +02:00
//minRec = xpos[x];
minPos = Posdb : : getWordPos ( xpos [ x ] ) ;
}
// sanity
2016-10-13 14:49:49 +02:00
if ( minPosTermIdx < 0 ) {
2016-10-03 16:25:38 +02:00
gbshutdownAbort ( true ) ;
}
2016-10-10 16:02:50 +02:00
do {
//
// Advance the list pointer of the list containing the current
2016-10-13 14:49:49 +02:00
// minimum position (minPosTermIdx). If no more positions, set list to NULL.
2016-10-10 16:02:50 +02:00
// If all lists are NULL, we are done sliding.
//
2016-10-13 14:49:49 +02:00
if ( ! ( xpos [ minPosTermIdx ] [ 0 ] & 0x04 ) ) {
xpos [ minPosTermIdx ] + = 12 ;
2016-10-03 16:25:38 +02:00
}
else {
2016-10-13 14:49:49 +02:00
xpos [ minPosTermIdx ] + = 6 ;
2016-10-03 16:25:38 +02:00
}
2016-10-10 16:02:50 +02:00
// NULLify list if no more positions for this docid for that term.
2016-10-13 14:49:49 +02:00
if ( xpos [ minPosTermIdx ] > = miniMergedEnd [ minPosTermIdx ] | | ! ( xpos [ minPosTermIdx ] [ 0 ] & 0x04 ) ) {
2016-10-03 16:25:38 +02:00
// exhausted list now
2016-10-13 14:49:49 +02:00
xpos [ minPosTermIdx ] = NULL ;
2016-10-10 16:02:50 +02:00
2016-10-03 16:25:38 +02:00
// are all null now?
int32_t k ;
for ( k = 0 ; k < m_numQueryTermInfos ; k + + ) {
// skip if to the left of a pipe operator
if ( m_bflags [ k ] & ( BF_PIPED | BF_NEGATIVE | BF_NUMBER ) ) {
continue ;
}
if ( xpos [ k ] ) {
break ;
}
}
// all lists are now exhausted
if ( k > = m_numQueryTermInfos ) {
doneSliding = true ;
}
2016-10-13 16:01:15 +02:00
// No more positions in current term list. Find new term list with lowest position.
2016-10-03 16:25:38 +02:00
advanceMin = true ;
}
2016-10-13 16:01:15 +02:00
// if current term position is not in the document body, advance the pointer and look again.
2016-10-13 14:49:49 +02:00
} while ( ! advanceMin & & ! doneSliding & & ! s_inBody [ Posdb : : getHashGroup ( xpos [ minPosTermIdx ] ) ] ) ;
2016-10-03 16:25:38 +02:00
2016-10-13 16:01:15 +02:00
// if current list is exhausted, find new term list with lowest position.
2016-10-03 16:25:38 +02:00
} while ( advanceMin & & ! doneSliding ) ;
2016-10-14 09:52:10 +02:00
} // end of while( !doneSliding )
2016-10-03 16:25:38 +02:00
2016-10-14 11:57:15 +02:00
float minPairScore = - 1.0 ;
logTrace ( g_conf . m_logTracePosdb , " Zak algorithm begins " ) ;
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i + + ) {
// skip if to the left of a pipe operator
if ( m_bflags [ i ] & ( BF_PIPED | BF_NEGATIVE | BF_NUMBER ) ) {
continue ;
}
if ( ! miniMergedList [ i ] ) {
continue ;
}
for ( int32_t j = i + 1 ; j < m_numQueryTermInfos ; j + + ) {
// skip if to the left of a pipe operator
if ( m_bflags [ j ] & ( BF_PIPED | BF_NEGATIVE | BF_NUMBER ) ) {
continue ;
}
if ( ! miniMergedList [ j ] ) {
continue ;
}
// . this limits its scoring to the winning sliding window
// as far as the in-body terms are concerned
// . it will do sub-outs using the score matrix
// . this will skip over body terms that are not
// in the winning window defined by m_bestMinTermPairWindowPtrs[]
// that we set in findMinTermPairScoreInWindow()
// . returns the best score for this term
float tpscore = getTermPairScoreForAny ( i , j , miniMergedList [ i ] , miniMergedList [ j ] ,
miniMergedEnd [ i ] , miniMergedEnd [ j ] , pdcs ) ;
// get min of all term pair scores
if ( tpscore > = minPairScore & & minPairScore > = 0.0 ) {
continue ;
}
// got a new min
minPairScore = tpscore ;
}
}
logTrace ( g_conf . m_logTracePosdb , " Zak algorithm ends. minPairScore=%f " , minPairScore ) ;
return minPairScore ;
}
2016-10-03 16:25:38 +02:00
2016-10-18 14:17:39 +02:00
2016-08-30 15:18:21 +02:00
// . compare the output of this to intersectLists9_r()
// . hopefully this will be easier to understand and faster
// . IDEAS:
// we could also note that if a term was not in the title or
// inlink text it could never beat the 10th score.
void PosdbTable : : intersectLists10_r ( ) {
logTrace ( g_conf . m_logTracePosdb , " BEGIN. numTerms: % " PRId32 , m_q - > m_numTerms ) ;
prepareWhiteListTable ( ) ;
initWeights ( ) ;
// assume no-op
m_t1 = 0LL ;
// set start time
int64_t t1 = gettimeofdayInMilliseconds ( ) ;
// assume we return early
m_addListsTime = 0 ;
if ( ! findCandidateDocIds ( ) ) {
logTrace ( g_conf . m_logTracePosdb , " END. Found no candidate docids " ) ;
return ;
2016-08-29 17:05:34 +02:00
}
2016-08-25 17:27:37 +02:00
2016-08-30 15:18:21 +02:00
//
// The vote buffer now contains the matching docids and each term sublist
// has been adjusted to only contain these docids as well. Let the fun begin.
//
2016-08-29 17:05:34 +02:00
//
// TRANSFORM QueryTermInfo::m_* vars into old style arrays
//
2016-08-30 15:18:21 +02:00
// MUST MATCH allocation in allocTopScoringDocIdsData
//
char * pp = m_topScoringDocIdsBuf . getBufStart ( ) ;
2016-08-29 17:05:34 +02:00
int32_t nqt = m_q - > m_numTerms ;
2016-09-02 14:46:57 +02:00
int32_t * wikiPhraseIds = ( int32_t * ) pp ; pp + = 4 * nqt ; // from QueryTermInfo
int32_t * quotedStartIds = ( int32_t * ) pp ; pp + = 4 * nqt ; // from QueryTermInfo
int32_t * qpos = ( int32_t * ) pp ; pp + = 4 * nqt ; // from QueryTermInfo
int32_t * qtermNums = ( int32_t * ) pp ; pp + = 4 * nqt ; // from QueryTermInfo
float * freqWeights = ( float * ) pp ; pp + = sizeof ( float ) * nqt ; // from QueryTermInfo
2016-09-16 14:49:36 +02:00
const char * * miniMergedList = ( const char * * ) pp ; pp + = sizeof ( const char * ) * nqt ;
const char * * miniMergedEnd = ( const char * * ) pp ; pp + = sizeof ( const char * ) * nqt ;
2016-10-10 16:02:50 +02:00
const char * * highestScoringNonBodyPos = ( const char * * ) pp ; pp + = sizeof ( const char * ) * nqt ;
2016-09-16 14:49:36 +02:00
const char * * winnerStack = ( const char * * ) pp ; pp + = sizeof ( const char * ) * nqt ;
const char * * xpos = ( const char * * ) pp ; pp + = sizeof ( const char * ) * nqt ;
2016-08-29 17:05:34 +02:00
char * bflags = ( char * ) pp ; pp + = sizeof ( char ) * nqt ;
float * scoreMatrix = ( float * ) pp ; pp + = sizeof ( float ) * nqt * nqt ;
2016-08-30 15:18:21 +02:00
if ( pp > m_topScoringDocIdsBuf . getBufEnd ( ) )
2016-08-29 17:05:34 +02:00
gbshutdownAbort ( true ) ;
2016-08-25 17:27:37 +02:00
2016-08-30 15:18:21 +02:00
int64_t lastTime = gettimeofdayInMilliseconds ( ) ;
int64_t now ;
int64_t took ;
int32_t phase = 3 ; // 2 first in findCandidateDocIds
2016-09-02 14:46:57 +02:00
// point to our array of query term infos set in setQueryTermInfos()
QueryTermInfo * qtibuf = ( QueryTermInfo * ) m_qiBuf . getBufStart ( ) ;
2016-08-29 17:05:34 +02:00
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i + + ) {
// get it
2016-09-02 14:46:57 +02:00
QueryTermInfo * qti = & qtibuf [ i ] ;
2016-08-29 17:05:34 +02:00
// set it
wikiPhraseIds [ i ] = qti - > m_wikiPhraseId ;
quotedStartIds [ i ] = qti - > m_quotedStartId ;
// query term position
qpos [ i ] = qti - > m_qpos ;
qtermNums [ i ] = qti - > m_qtermNum ;
freqWeights [ i ] = qti - > m_termFreqWeight ;
2016-08-25 17:27:37 +02:00
}
2016-10-10 16:02:50 +02:00
// for findMinTermPairScoreInWindow() function
2016-08-29 17:05:34 +02:00
m_freqWeights = freqWeights ;
m_qtermNums = qtermNums ;
2016-08-25 17:27:37 +02:00
2016-10-14 11:57:15 +02:00
m_bflags = bflags ;
2016-09-02 14:46:57 +02:00
2016-08-29 17:05:34 +02:00
//////////
//
// OLD MAIN INTERSECTION LOGIC
//
/////////
2016-08-25 17:27:37 +02:00
2016-08-29 17:05:34 +02:00
DocIdScore dcs ;
DocIdScore * pdcs = NULL ;
uint64_t lastDocId = 0LL ;
int32_t lastLen = 0 ;
2016-09-20 19:18:49 +02:00
char siteRank ;
int highestInlinkSiteRank ;
char docLang ;
2016-08-29 17:05:34 +02:00
float score ;
2016-09-06 15:26:40 +02:00
int32_t intScore = 0 ;
float minScore = 0.0 ;
2016-08-29 17:05:34 +02:00
float minSingleScore ;
// scan the posdb keys in the smallest list
// raised from 200 to 300,000 for 'da da da' query
2016-09-13 14:38:09 +02:00
char miniMergeBuf [ 300000 ] ;
2016-09-16 14:49:36 +02:00
const char * docIdPtr ;
2016-08-29 17:05:34 +02:00
char * docIdEnd = m_docIdVoteBuf . getBufStart ( ) + m_docIdVoteBuf . length ( ) ;
float minWinningScore = - 1.0 ;
int32_t topCursor = - 9 ;
int32_t numProcessed = 0 ;
2016-09-13 14:38:09 +02:00
int32_t prefiltMaxPossScoreFail = 0 ;
int32_t prefiltMaxPossScorePass = 0 ;
int32_t prefiltBestDistMaxPossScoreFail = 0 ;
int32_t prefiltBestDistMaxPossScorePass = 0 ;
2016-07-04 15:06:15 +02:00
2016-08-25 17:27:37 +02:00
2016-08-29 17:05:34 +02:00
// populate the cursors for each sublist
2016-07-04 15:06:15 +02:00
2016-09-02 14:46:57 +02:00
int32_t numQueryTermsToHandle = m_numQueryTermInfos ;
2016-09-04 23:01:01 +02:00
if ( ! m_msg39req - > m_doMaxScoreAlgo ) {
2016-09-02 14:46:57 +02:00
numQueryTermsToHandle = 0 ;
}
2016-08-29 17:05:34 +02:00
// do not do it if we got a gbsortby: field
2016-09-02 14:46:57 +02:00
if ( m_sortByTermNum > = 0 ) {
numQueryTermsToHandle = 0 ;
}
if ( m_sortByTermNumInt > = 0 ) {
numQueryTermsToHandle = 0 ;
}
2016-07-04 15:06:15 +02:00
2016-09-07 21:56:57 +02:00
//
// Run through the scoring logic once or twice. Two passes needed ONLY if we
// need to create additional (debug) scoring info for visual display.
//
int numPassesNeeded = m_msg39req - > m_getDocIdScoringInfo ? 2 : 1 ;
2016-08-29 17:05:34 +02:00
2016-09-07 21:56:57 +02:00
for ( int currPassNum = 0 ; currPassNum < numPassesNeeded ; currPassNum + + ) {
2016-09-02 14:46:57 +02:00
//
2016-09-07 21:56:57 +02:00
// Pass 0: Find candidate docids and calculate scores
// Pass 1: Only for creating debug scoring info for visual display
2016-09-02 14:46:57 +02:00
//
2016-09-07 21:56:57 +02:00
switch ( currPassNum ) {
case INTERSECT_SCORING :
logTrace ( g_conf . m_logTracePosdb , " Loop 0: Real scoring " ) ;
break ;
case INTERSECT_DEBUG_INFO :
logTrace ( g_conf . m_logTracePosdb , " Loop 1: Data for visual scoring info " ) ;
2017-04-21 17:25:21 +02:00
removeScoreInfoForDeletedDocIds ( ) ;
2016-09-07 21:56:57 +02:00
break ;
default :
log ( LOG_LOGIC , " %s:%d: Illegal pass number %d " , __FILE__ , __LINE__ , currPassNum ) ;
gbshutdownLogicError ( ) ;
2016-08-29 17:05:34 +02:00
}
2016-09-07 21:56:57 +02:00
// reset docid to start!
docIdPtr = m_docIdVoteBuf . getBufStart ( ) ;
2016-08-29 17:05:34 +02:00
2016-09-07 21:56:57 +02:00
if ( currPassNum = = INTERSECT_DEBUG_INFO ) {
//
// reset QueryTermInfo::m_matchingSubListCursor[] to point to start of
// term lists for second pass that gets printable scoring info
//
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i + + ) {
// get it
QueryTermInfo * qti = & qtibuf [ i ] ;
// skip negative termlists
if ( qti - > m_bigramFlags [ 0 ] & BF_NEGATIVE ) {
continue ;
}
// do each sublist
for ( int32_t j = 0 ; j < qti - > m_numMatchingSubLists ; j + + ) {
qti - > m_matchingSubListCursor [ j ] = qti - > m_matchingSubListStart [ j ] ;
qti - > m_matchingSubListSavedCursor [ j ] = qti - > m_matchingSubListStart [ j ] ;
}
}
}
2016-08-29 17:05:34 +02:00
2016-09-02 14:46:57 +02:00
2016-10-15 21:19:18 +02:00
//#
//# MAIN LOOP for looping over each docid
//#
2016-09-09 16:29:08 +02:00
bool allDone = false ;
while ( ! allDone & & docIdPtr < docIdEnd ) {
2017-04-21 17:25:21 +02:00
// logTrace(g_conf.m_logTracePosdb, "Handling next docId");
2016-09-13 12:32:01 +02:00
2016-09-09 16:29:08 +02:00
bool skipToNextDocId = false ;
2016-09-20 19:18:49 +02:00
siteRank = 0 ;
docLang = langUnknown ;
highestInlinkSiteRank = - 1 ;
2017-04-21 17:25:21 +02:00
bool docInThisFile ;
if ( currPassNum = = INTERSECT_SCORING ) {
m_docId = * ( uint32_t * ) ( docIdPtr + 1 ) ;
m_docId < < = 8 ;
m_docId | = ( unsigned char ) docIdPtr [ 0 ] ;
m_docId > > = 2 ;
docInThisFile = m_documentIndexChecker - > exists ( m_docId ) ;
}
else {
//
// second pass? for printing out transparency info.
// genDebugScoreInfo1 sets m_docId from the top scorer tree
//
if ( genDebugScoreInfo1 ( & numProcessed , & topCursor , & docInThisFile , qtibuf ) ) {
logTrace ( g_conf . m_logTracePosdb , " Pass #%d for file % " PRId32 " done " , currPassNum , m_documentIndexChecker - > getFileNum ( ) ) ;
2016-09-07 21:56:57 +02:00
2017-04-21 17:25:21 +02:00
// returns true if no more docids to handle
allDone = true ;
break ; // break out of docIdPtr < docIdEnd loop
}
}
2016-11-08 12:14:32 +01:00
2017-04-21 17:25:21 +02:00
//bool docInThisFile = m_documentIndexChecker->exists(m_docId);
logTrace ( g_conf . m_logTracePosdb , " Handling next docId: % " PRId64 " - pass #%d - %sfound in this file (% " PRId32 " ) " , m_docId , currPassNum , docInThisFile ? " " : " SKIPPING, not " , m_documentIndexChecker - > getFileNum ( ) ) ;
2016-11-25 16:27:33 +01:00
2017-04-21 17:25:21 +02:00
if ( ! docInThisFile ) {
2016-11-25 17:11:06 +01:00
// Only advance cursors in first pass
2016-11-25 16:27:33 +01:00
if ( currPassNum = = INTERSECT_SCORING ) {
if ( ! advanceTermListCursors ( docIdPtr , qtibuf ) ) {
logTrace ( g_conf . m_logTracePosdb , " END. advanceTermListCursors failed " ) ;
return ;
}
2017-04-21 17:25:21 +02:00
docIdPtr + = 6 ;
2016-11-08 12:14:32 +01:00
}
2016-11-25 16:27:33 +01:00
2017-04-21 17:25:21 +02:00
continue ;
2016-09-09 16:29:08 +02:00
}
2016-09-02 14:46:57 +02:00
2017-01-06 17:40:50 +01:00
//calculate complete score multiplier
float completeScoreMultiplier = 1.0 ;
unsigned flags = 0 ;
if ( g_d2fasm . lookupFlags ( m_docId , & flags ) & & flags ) {
for ( int i = 0 ; i < 26 ; i + + ) {
if ( flags & ( 1 < < i ) )
2017-01-20 14:22:44 +01:00
completeScoreMultiplier * = m_msg39req - > m_flagScoreMultiplier [ i ] ;
2017-01-06 17:40:50 +01:00
}
}
2017-04-30 20:23:09 +02:00
2016-09-09 16:29:08 +02:00
if ( currPassNum = = INTERSECT_SCORING ) {
2016-09-07 21:56:57 +02:00
//
2016-09-13 12:32:01 +02:00
// Pre-advance each termlist's cursor to skip to next docid.
2016-09-07 21:56:57 +02:00
//
2016-09-13 12:32:01 +02:00
if ( ! advanceTermListCursors ( docIdPtr , qtibuf ) ) {
logTrace ( g_conf . m_logTracePosdb , " END. advanceTermListCursors failed " ) ;
return ;
2016-08-25 17:27:37 +02:00
}
2016-07-04 15:06:15 +02:00
2016-09-09 16:29:08 +02:00
if ( ! m_q - > m_isBoolean ) {
2016-09-13 12:32:01 +02:00
//##
//## PRE-FILTERS. Discard DocIDs that cannot meet the minimum required
//## score, before entering the main scoring loop below
//##
2016-10-18 14:17:39 +02:00
//
2016-09-09 16:29:08 +02:00
// TODO: consider skipping this pre-filter if it sucks, as it does
// for 'time enough for love'. it might save time!
//
// Calculate maximum possible score for a document. If the max score
// is lower than the current minimum winning score, give up already
2016-10-18 14:17:39 +02:00
// now and skip to the next docid.
//
// minWinningScore is the score of the lowest scoring doc in m_topTree IF
// the topTree contains the number of docs requested. Otherwise it is -1
// and the doc is inserted no matter the score.
2016-09-09 16:29:08 +02:00
//
// Only go through this if we actually have a minimum score to compare with ...
2016-10-18 14:17:39 +02:00
// No need if minWinningScore is still -1.
//
2016-09-13 14:38:09 +02:00
if ( minWinningScore > = 0.0 ) {
2016-09-09 16:29:08 +02:00
logTrace ( g_conf . m_logTracePosdb , " Compute 'upper bound' for each query term " ) ;
2016-09-07 21:56:57 +02:00
2016-09-09 16:29:08 +02:00
// If there's no way we can break into the winner's circle, give up!
// This computes an upper bound for each query term
for ( int32_t i = 0 ; i < numQueryTermsToHandle ; i + + ) {
// skip negative termlists.
if ( qtibuf [ i ] . m_bigramFlags [ 0 ] & ( BF_NEGATIVE ) ) {
continue ;
}
2016-09-02 14:46:57 +02:00
2016-09-09 16:29:08 +02:00
// an upper bound on the score we could get
float maxScore = getMaxPossibleScore ( & qtibuf [ i ] , 0 , 0 , NULL ) ;
// -1 means it has inlink text so do not apply this constraint
// to this docid because it is too difficult because we
// sum up the inlink text
if ( maxScore < 0.0 ) {
continue ;
}
2017-01-06 17:40:50 +01:00
maxScore * = completeScoreMultiplier ;
2016-09-09 16:29:08 +02:00
// logTrace(g_conf.m_logTracePosdb, "maxScore=%f minWinningScore=%f", maxScore, minWinningScore);
// if any one of these terms have a max score below the
// worst score of the 10th result, then it can not win.
if ( maxScore < = minWinningScore ) {
docIdPtr + = 6 ;
2016-09-13 14:38:09 +02:00
prefiltMaxPossScoreFail + + ;
2016-09-09 16:29:08 +02:00
skipToNextDocId = true ;
break ; // break out of numQueryTermsToHandle loop
}
2016-09-07 21:56:57 +02:00
}
2016-09-04 23:01:01 +02:00
}
2016-07-04 15:06:15 +02:00
2016-09-09 16:29:08 +02:00
if ( skipToNextDocId ) {
// continue docIdPtr < docIdEnd loop
2017-04-21 17:25:21 +02:00
logTrace ( g_conf . m_logTracePosdb , " Max possible score for docId % " PRId64 " too low, skipping to next " , m_docId ) ;
2016-09-09 16:29:08 +02:00
continue ;
2016-09-04 23:01:01 +02:00
}
2016-09-09 16:29:08 +02:00
2016-09-13 14:38:09 +02:00
prefiltMaxPossScorePass + + ;
2016-09-09 16:29:08 +02:00
2016-09-13 14:38:09 +02:00
if ( minWinningScore > = 0.0 & & m_sortByTermNum < 0 & & m_sortByTermNumInt < 0 ) {
2016-09-09 16:29:08 +02:00
2017-01-06 17:40:50 +01:00
if ( ! prefilterMaxPossibleScoreByDistance ( qtibuf , qpos , minWinningScore * completeScoreMultiplier ) ) {
2016-09-13 12:32:01 +02:00
docIdPtr + = 6 ;
2016-09-13 14:38:09 +02:00
prefiltBestDistMaxPossScoreFail + + ;
2016-09-13 12:32:01 +02:00
skipToNextDocId = true ;
2016-09-09 16:29:08 +02:00
}
} // not m_sortByTermNum or m_sortByTermNumInt
2016-09-09 12:57:46 +02:00
2016-09-09 16:29:08 +02:00
if ( skipToNextDocId ) {
// Continue docIdPtr < docIdEnd loop
2017-04-21 17:25:21 +02:00
logTrace ( g_conf . m_logTracePosdb , " Max possible score by distance for docId % " PRId64 " too low, skipping to next " , m_docId ) ;
2016-09-09 16:29:08 +02:00
continue ;
2016-09-09 12:03:22 +02:00
}
2016-09-13 14:38:09 +02:00
prefiltBestDistMaxPossScorePass + + ;
2016-09-09 16:29:08 +02:00
} // !m_q->m_isBoolean
} // currPassNum == INTERSECT_SCORING
if ( m_q - > m_isBoolean ) {
// add one point for each term matched in the bool query
// this is really just for when the terms are from different
// fields. if we have unfielded boolean terms we should
// do proximity matching.
int32_t slot = m_bt . getSlot ( & m_docId ) ;
if ( slot > = 0 ) {
uint8_t * bv = ( uint8_t * ) m_bt . getValueFromSlot ( slot ) ;
// then a score based on the # of terms that matched
int16_t bitsOn = getNumBitsOnX ( bv , m_vecSize ) ;
// but store in hashtable now
minScore = ( float ) bitsOn ;
}
else {
minScore = 1.0 ;
}
}
2016-09-09 12:03:22 +02:00
2016-09-13 14:38:09 +02:00
2016-09-16 11:24:23 +02:00
//##
//## PERFORMANCE HACK: ON-DEMAND MINI MERGES.
//##
//## Data for the current DocID found in sublists of each query term
//## is merged into a single list, so we end up with one list per query
//## term. They are all stored in a single buffer (miniMergeBuf), which
//## the miniMerged* pointers point into..
//##
2016-09-21 14:06:51 +02:00
mergeTermSubListsForDocId ( qtibuf , miniMergeBuf , miniMergedList , miniMergedEnd , & highestInlinkSiteRank ) ;
2016-09-09 12:03:22 +02:00
2016-09-09 16:29:08 +02:00
// clear the counts on this DocIdScore class for this new docid
pdcs = NULL ;
if ( currPassNum = = INTERSECT_DEBUG_INFO ) {
dcs . reset ( ) ;
pdcs = & dcs ;
}
2016-09-07 21:56:57 +02:00
2016-09-16 11:24:23 +02:00
//##
//## ACTUAL SCORING BEGINS
//##
2016-09-09 16:29:08 +02:00
if ( ! m_q - > m_isBoolean ) {
2016-10-14 11:57:15 +02:00
// Used by the various scoring functions called below
m_qpos = qpos ;
m_wikiPhraseIds = wikiPhraseIds ;
m_quotedStartIds = quotedStartIds ;
m_bestMinTermPairWindowScore = - 2.0 ;
2016-10-10 16:02:50 +02:00
2016-09-21 21:20:16 +02:00
//#
//# NON-BODY TERM PAIR SCORING LOOP
//#
2016-10-14 11:57:15 +02:00
createNonBodyTermPairScoreMatrix ( miniMergedList , miniMergedEnd , scoreMatrix ) ;
2016-09-09 16:29:08 +02:00
2016-09-21 21:20:16 +02:00
//#
//# SINGLE TERM SCORE LOOP
//#
2016-10-18 14:17:39 +02:00
minSingleScore = getMinSingleTermScoreSum ( miniMergedList , miniMergedEnd , highestScoringNonBodyPos , pdcs ) ;
2017-04-23 22:16:28 +02:00
logTrace ( g_conf . m_logTracePosdb , " minSingleScore=%f before multiplication for docId % " PRIu64 " " , minSingleScore , m_docId ) ;
2017-01-06 17:40:50 +01:00
minSingleScore * = completeScoreMultiplier ;
2016-09-18 17:00:46 +02:00
2016-10-10 16:02:50 +02:00
//#
//# DOCID / SITERANK DETECTION
//#
2016-10-14 11:57:15 +02:00
for ( int32_t k = 0 ; k < m_numQueryTermInfos ; k + + ) {
2016-09-20 19:18:49 +02:00
if ( ! miniMergedList [ k ] ) {
continue ;
}
2016-09-07 21:56:57 +02:00
2016-09-20 19:18:49 +02:00
// siterank/langid is always 0 in numeric
// termlists so they sort by their number correctly
if ( qtibuf [ k ] . m_bigramFlags [ 0 ] & ( BF_NUMBER ) ) {
continue ;
2016-09-07 21:56:57 +02:00
}
2016-09-20 19:18:49 +02:00
siteRank = Posdb : : getSiteRank ( miniMergedList [ k ] ) ;
docLang = Posdb : : getLangId ( miniMergedList [ k ] ) ;
break ;
2016-09-07 21:56:57 +02:00
}
2016-09-20 19:18:49 +02:00
logTrace ( g_conf . m_logTracePosdb , " Got siteRank %d and docLang %d " , ( int ) siteRank , ( int ) docLang ) ;
2016-10-03 16:25:38 +02:00
//#
//# SLIDING WINDOW SCORING ALGORITHM
//#
2016-10-13 14:49:49 +02:00
// After calling this, m_bestMinTermPairWindowPtrs will point to the
// term positions set ("window") that has the highest minimum score. These
2016-10-18 14:17:39 +02:00
// pointers are used when determining the minimum term pair score returned
// by the function.
float minPairScore = getMinTermPairScoreSlidingWindow ( miniMergedList , miniMergedEnd , highestScoringNonBodyPos , winnerStack , xpos , scoreMatrix , pdcs ) ;
2017-04-23 22:16:28 +02:00
logTrace ( g_conf . m_logTracePosdb , " minPairScore=%f before multiplication for docId % " PRIu64 " " , minPairScore , m_docId ) ;
2017-01-06 17:40:50 +01:00
minPairScore * = completeScoreMultiplier ;
2016-08-29 17:05:34 +02:00
2016-10-14 11:57:15 +02:00
//#
2016-10-18 14:17:39 +02:00
//# Find minimum score - either single term or term pair
2016-10-14 11:57:15 +02:00
//#
2016-09-09 16:29:08 +02:00
minScore = 999999999.0 ;
// get a min score from all the term pairs
if ( minPairScore < minScore & & minPairScore > = 0.0 ) {
minScore = minPairScore ;
}
2017-04-21 17:25:21 +02:00
2016-09-09 16:29:08 +02:00
// if we only had one query term
if ( minSingleScore < minScore ) {
minScore = minSingleScore ;
}
2016-10-18 14:17:39 +02:00
logTrace ( g_conf . m_logTracePosdb , " minPairScore=%f, minScore=%f for docId % " PRIu64 " " , minPairScore , minScore , m_docId ) ;
2016-07-04 15:06:15 +02:00
2016-09-09 16:29:08 +02:00
2016-10-15 21:19:18 +02:00
// No positive score? Then skip the doc
2016-09-09 16:29:08 +02:00
if ( minScore < = 0.0 ) {
2017-04-21 17:25:21 +02:00
if ( currPassNum = = INTERSECT_SCORING ) {
// advance to next docid
docIdPtr + = 6 ;
}
2016-10-14 11:57:15 +02:00
logTrace ( g_conf . m_logTracePosdb , " Skipping docid % " PRIu64 " - no positive score " , m_docId ) ;
2016-09-09 16:29:08 +02:00
// Continue docid loop
continue ;
}
} // !m_q->m_isBoolean
2016-07-04 15:06:15 +02:00
2016-10-14 11:57:15 +02:00
//#
//# Calculate score and give boost based on siterank and highest inlinking siterank
//#
2017-04-23 22:16:28 +02:00
float adjustedSiteRank = siteRank ;
2016-08-29 17:05:34 +02:00
2016-09-09 16:29:08 +02:00
if ( highestInlinkSiteRank > siteRank ) {
//adjust effective siterank because a high-rank site linked to it. Don't adjust it too much though.
2017-04-23 22:16:28 +02:00
adjustedSiteRank = siteRank + ( highestInlinkSiteRank - siteRank ) / 3.0 ;
logTrace ( g_conf . m_logTracePosdb , " Highest inlink siterank %d > siterank %d. Adjusting to %f for docId % " PRIu64 " " , highestInlinkSiteRank , ( int ) siteRank , adjustedSiteRank , m_docId ) ;
2016-08-25 17:27:37 +02:00
}
2017-04-23 22:16:28 +02:00
score = minScore * ( adjustedSiteRank * m_siteRankMultiplier + 1.0 ) ;
2016-10-18 14:17:39 +02:00
logTrace ( g_conf . m_logTracePosdb , " Score %f for docId % " PRIu64 " " , score , m_docId ) ;
2016-09-07 21:56:57 +02:00
2016-10-14 11:57:15 +02:00
//#
2017-04-29 20:24:00 +02:00
//# Give score boost if query and doc language is the same,
//# and optionally a different boost if the language of the
//# page is unknown.
//#
2016-10-14 11:57:15 +02:00
//# Use "qlang" parm to set the language. i.e. "&qlang=fr"
//#
2017-04-29 20:24:00 +02:00
if ( m_msg39req - > m_language ! = 0 ) {
if ( m_msg39req - > m_language = = docLang ) {
score * = ( m_msg39req - > m_sameLangWeight ) ;
logTrace ( g_conf . m_logTracePosdb , " Giving score a matching language boost of x%f: %f for docId % " PRIu64 " " , m_msg39req - > m_sameLangWeight , score , m_docId ) ;
}
else
if ( docLang = = 0 ) {
score * = ( m_msg39req - > m_unknownLangWeight ) ;
logTrace ( g_conf . m_logTracePosdb , " Giving score an unknown language boost of x%f: %f for docId % " PRIu64 " " , m_msg39req - > m_unknownLangWeight , score , m_docId ) ;
}
2016-08-25 17:27:37 +02:00
}
2016-09-07 21:56:57 +02:00
2017-04-23 22:16:28 +02:00
double page_temperature = 0 ;
bool use_page_temperature = false ;
float score_before_page_temp = score ;
2016-10-14 11:57:15 +02:00
2017-01-20 14:43:55 +01:00
if ( m_msg39req - > m_usePageTemperatureForRanking ) {
2017-04-23 22:16:28 +02:00
use_page_temperature = true ;
2017-04-28 16:43:14 +02:00
page_temperature = g_pageTemperatureRegistry . query_page_temperature ( m_docId , m_msg39req - > m_pageTemperatureWeightMin , m_msg39req - > m_pageTemperatureWeightMax ) ;
2016-12-19 12:58:33 +01:00
score * = page_temperature ;
2017-04-28 16:23:19 +02:00
logTrace ( g_conf . m_logTracePosdb , " Page temperature for docId % " PRIu64 " is %.14f, score %f -> %f " , m_docId , page_temperature , score_before_page_temp , score ) ;
2016-12-19 12:58:33 +01:00
}
2016-09-07 21:56:57 +02:00
2017-04-23 22:16:28 +02:00
2016-10-18 14:17:39 +02:00
//#
//# Handle sortby int/float and minimum docid/score pairs
//#
2016-09-07 21:56:57 +02:00
2016-10-18 14:17:39 +02:00
if ( m_sortByTermNum > = 0 | | m_sortByTermNumInt > = 0 | | m_hasMaxSerpScore ) {
// assume filtered out
if ( currPassNum = = INTERSECT_SCORING ) {
m_filtered + + ;
2016-09-07 21:56:57 +02:00
}
2016-09-09 16:29:08 +02:00
2016-10-18 14:17:39 +02:00
//
// if we have a gbsortby:price term then score exclusively on that
//
if ( m_sortByTermNum > = 0 ) {
// no term?
if ( ! miniMergedList [ m_sortByTermInfoNum ] ) {
// advance to next docid
2017-04-21 17:25:21 +02:00
if ( currPassNum = = INTERSECT_SCORING ) {
docIdPtr + = 6 ;
}
2016-10-18 14:17:39 +02:00
// Continue docIdPtr < docIdEnd loop
continue ;
}
score = Posdb : : getFloat ( miniMergedList [ m_sortByTermInfoNum ] ) ;
}
2016-09-09 16:29:08 +02:00
if ( m_sortByTermNumInt > = 0 ) {
2016-10-18 14:17:39 +02:00
// no term?
if ( ! miniMergedList [ m_sortByTermInfoNumInt ] ) {
// advance to next docid
2017-04-21 17:25:21 +02:00
if ( currPassNum = = INTERSECT_SCORING ) {
docIdPtr + = 6 ;
}
2016-10-18 14:17:39 +02:00
// Continue docIdPtr < docIdEnd loop
continue ;
2016-09-09 16:29:08 +02:00
}
2016-10-18 14:17:39 +02:00
intScore = Posdb : : getInt ( miniMergedList [ m_sortByTermInfoNumInt ] ) ;
// do this so hasMaxSerpScore below works, although
// because of roundoff errors we might lose a docid
// through the cracks in the widget.
//score = (float)intScore;
2016-09-07 21:56:57 +02:00
}
2016-10-18 14:17:39 +02:00
// now we have a maxscore/maxdocid upper range so the widget
// can append only new results to an older result set.
if ( m_hasMaxSerpScore ) {
bool skipToNext = false ;
// if dealing with an "int" score use the extra precision
// of the double that m_maxSerpScore is!
if ( m_sortByTermNumInt > = 0 ) {
if ( intScore > ( int32_t ) m_msg39req - > m_maxSerpScore ) {
skipToNext = true ;
}
else
if ( intScore = = ( int32_t ) m_msg39req - > m_maxSerpScore & & ( int64_t ) m_docId < = m_msg39req - > m_minSerpDocId ) {
skipToNext = true ;
}
2016-09-09 16:29:08 +02:00
}
2016-10-18 14:17:39 +02:00
else {
if ( score > ( float ) m_msg39req - > m_maxSerpScore ) {
skipToNext = true ;
}
else
2016-10-23 22:52:51 +02:00
if ( almostEqualFloat ( score , ( float ) m_msg39req - > m_maxSerpScore ) & & ( int64_t ) m_docId < = m_msg39req - > m_minSerpDocId ) {
//@todo: Why is m_msg39req->m_maxSerpScore double and score float?
2016-10-18 14:17:39 +02:00
skipToNext = true ;
}
2016-09-09 16:29:08 +02:00
}
2016-10-18 14:17:39 +02:00
if ( skipToNext ) {
// advance to next docid
2017-04-21 17:25:21 +02:00
if ( currPassNum = = INTERSECT_SCORING ) {
docIdPtr + = 6 ;
}
2016-10-18 14:17:39 +02:00
// Continue docIdPtr < docIdEnd loop
continue ;
2016-09-09 16:29:08 +02:00
}
}
2016-10-18 14:17:39 +02:00
// we did not get filtered out
if ( currPassNum = = INTERSECT_SCORING ) {
m_filtered - - ;
2016-09-07 21:56:57 +02:00
}
2016-07-04 15:06:15 +02:00
}
2016-08-29 17:05:34 +02:00
2017-04-23 22:16:28 +02:00
// We only come here if we actually made it into m_topTree - second round only loops through
// TopTree entries.
2016-09-09 16:29:08 +02:00
if ( currPassNum = = INTERSECT_DEBUG_INFO ) {
2017-04-23 22:16:28 +02:00
// NEW 20170423
dcs . m_usePageTemperature = use_page_temperature ;
dcs . m_pageTemperature = page_temperature ;
dcs . m_adjustedSiteRank = adjustedSiteRank ;
2016-11-08 14:26:10 +01:00
if ( genDebugScoreInfo2 ( & dcs , & lastLen , & lastDocId , siteRank , score , intScore , docLang ) ) {
2016-09-09 16:29:08 +02:00
// advance to next docid
2017-04-21 17:25:21 +02:00
if ( currPassNum = = INTERSECT_SCORING ) {
docIdPtr + = 6 ;
}
2016-09-09 16:29:08 +02:00
// Continue docIdPtr < docIdEnd loop
continue ;
}
2016-09-07 21:56:57 +02:00
}
2016-10-14 11:57:15 +02:00
if ( currPassNum = = INTERSECT_SCORING ) {
2016-10-18 14:17:39 +02:00
//#
//# SCORING DONE! Add to top-scorer tree.
//#
2016-09-09 16:29:08 +02:00
int32_t tn = m_topTree - > getEmptyNode ( ) ;
2016-10-18 14:17:39 +02:00
// Sanity check
if ( tn < 0 ) {
log ( LOG_LOGIC , " %s:%s:%d: No space left in m_topTree " , __FILE__ , __func__ , __LINE__ ) ;
gbshutdownLogicError ( ) ;
}
2016-10-17 16:05:48 +02:00
TopNode * t = m_topTree - > getNode ( tn ) ;
2016-10-18 14:17:39 +02:00
2016-09-09 16:29:08 +02:00
// set the score and docid ptr
t - > m_score = score ;
t - > m_docId = m_docId ;
2017-01-09 16:11:44 +01:00
t - > m_flags = flags ;
2016-10-18 14:17:39 +02:00
2016-09-09 16:29:08 +02:00
// use an integer score like lastSpidered timestamp?
if ( m_sortByTermNumInt > = 0 ) {
t - > m_intScore = intScore ;
t - > m_score = 0.0 ;
if ( ! m_topTree - > m_useIntScores ) {
2016-10-18 14:17:39 +02:00
log ( LOG_LOGIC , " %s:%s:%d: Got int score, but m_topTree not setup for int scores! " , __FILE__ , __func__ , __LINE__ ) ;
gbshutdownLogicError ( ) ;
2016-09-09 16:29:08 +02:00
}
}
2016-10-18 14:17:39 +02:00
//
// This will not add if tree is full and it is less than the m_lowNode in score.
//
// If it does get added to a full tree, lowNode will be removed.
//
m_topTree - > addNode ( t , tn ) ;
2016-09-09 16:29:08 +02:00
// top tree only holds enough docids to satisfy the
// m_msg39request::m_docsToGet (m_msg39req->m_docsToGet) request
// from the searcher. It basically stores m_docsToGet
// into TopTree::m_docsWanted. TopTree::m_docsWanted is often
// double m_docsToGet to compensate for site clustering, and
// it can be even more than that in order to ensure we get
// enough domains represented in the search results.
// See TopTree::addNode(). it will not add the "t" node if
// its score is not high enough when the top tree is full.
2017-04-21 17:12:42 +02:00
if ( m_topTree - > getNumUsedNodes ( ) > m_topTree - > getNumDocsWanted ( ) ) {
2016-09-09 16:29:08 +02:00
// get the lowest scoring node
int32_t lowNode = m_topTree - > getLowNode ( ) ;
// and record its score in "minWinningScore"
2016-10-17 16:05:48 +02:00
minWinningScore = m_topTree - > getNode ( lowNode ) - > m_score ;
2016-09-07 21:56:57 +02:00
}
}
2016-08-29 17:05:34 +02:00
2016-09-09 16:29:08 +02:00
// advance to next docid
2017-04-21 17:25:21 +02:00
if ( currPassNum = = INTERSECT_SCORING ) {
docIdPtr + = 6 ;
}
2016-09-13 12:32:01 +02:00
} // docIdPtr < docIdEnd loop
2016-09-04 23:01:01 +02:00
2016-09-07 21:56:57 +02:00
if ( m_debug ) {
now = gettimeofdayInMilliseconds ( ) ;
took = now - lastTime ;
log ( LOG_INFO , " posdb: new algo phase % " PRId32 " took % " PRId64 " ms " , phase , took ) ;
lastTime = now ;
phase + + ;
}
2016-08-29 17:05:34 +02:00
2016-09-09 16:29:08 +02:00
} // for ... currPassNum < numPassesNeeded
2016-08-29 17:05:34 +02:00
2016-09-07 21:56:57 +02:00
2016-08-29 17:05:34 +02:00
if ( m_debug ) {
2016-09-13 14:38:09 +02:00
log ( LOG_INFO , " posdb: # prefiltMaxPossScoreFail........: % " PRId32 " " , prefiltMaxPossScoreFail ) ;
log ( LOG_INFO , " posdb: # prefiltMaxPossScorePass........: % " PRId32 " " , prefiltMaxPossScorePass ) ;
log ( LOG_INFO , " posdb: # prefiltBestDistMaxPossScoreFail: % " PRId32 " " , prefiltBestDistMaxPossScoreFail ) ;
log ( LOG_INFO , " posdb: # prefiltBestDistMaxPossScorePass: % " PRId32 " " , prefiltBestDistMaxPossScorePass ) ;
2016-08-29 17:05:34 +02:00
}
2017-04-21 17:25:21 +02:00
if ( g_conf . m_logTracePosdb ) {
m_topTree - > logTreeData ( LOG_TRACE ) ;
logDebugScoreInfo ( LOG_TRACE ) ;
}
2016-08-29 17:05:34 +02:00
// get time now
now = gettimeofdayInMilliseconds ( ) ;
// store the addLists time
m_addListsTime = now - t1 ;
m_t1 = t1 ;
m_t2 = now ;
logTrace ( g_conf . m_logTracePosdb , " END. Took % " PRId64 " msec " , m_addListsTime ) ;
}
2016-08-30 15:18:21 +02:00
// . "bestDist" is closest distance to query term # m_minTermListIdx
2016-08-29 17:05:34 +02:00
// . set "bestDist" to 1 to ignore it
// . Compute an upper bound on the pair-score contribution of query term "qti",
//   assuming perfect adjacency with a perfectly-scoring partner term.
// . Used as a pre-filter: if even this optimistic bound cannot beat the
//   current minimum winning score, the docid can be skipped cheaply.
// . "bestDist" is the closest distance to query term # m_minTermListIdx;
//   pass 1 to ignore the distance division.
// . "qdist" is the query-position distance to the partner term "qtm";
//   pass 0 to skip the partner-term/distance adjustment entirely.
// . Returns -1.0 when the bound cannot be computed (inlink text present),
//   0.0 when all matching sublists were empty/exhausted.
float PosdbTable::getMaxPossibleScore ( const QueryTermInfo *qti ,
				        int32_t bestDist ,
				        int32_t qdist ,
				        const QueryTermInfo *qtm ) {

	logTrace(g_conf.m_logTracePosdb, "BEGIN.");

	// get max score of all sublists
	float bestHashGroupWeight = -1.0;
	unsigned char bestDensityRank = 0;
	char siteRank = -1;
	char docLang = -1;
	unsigned char hgrp;
	bool hadHalfStopWikiBigram = false;

	// scan those sublists to set m_ptrs[] and to get the
	// max possible score of each one
	for ( int32_t j = 0 ; j < qti->m_numMatchingSubLists ; j++ ) {
		// scan backwards up to this
		const char *start = qti->m_matchingSubListSavedCursor[j];

		// skip if does not have our docid
		if ( ! start ) {
			continue;
		}

		// note it if any is a wiki bigram
		// NOTE(review): this tests m_bigramFlags[0] for every j, not
		// m_bigramFlags[j] — presumably the flag applies to the whole
		// term group, but confirm against how m_bigramFlags is filled.
		if ( qti->m_bigramFlags[0] & BF_HALFSTOPWIKIBIGRAM ) {
			hadHalfStopWikiBigram = true;
		}

		// skip if entire sublist/termlist is exhausted
		if ( start >= qti->m_matchingSubListEnd[j] ) {
			continue;
		}

		// siterank/langid are the same in every posdb key for a given
		// docid, so read them once from the first non-exhausted sublist
		if ( siteRank == -1 ) {
			siteRank = Posdb::getSiteRank ( start );
		}
		if ( docLang == -1 ) {
			docLang = Posdb::getLangId ( start );
		}

		// skip first key because it is 12 bytes, go right to the
		// 6 byte keys. we deal with it below.
		start += 12;
		// get cursor. this is actually pointing to the next docid
		const char *dc = qti->m_matchingSubListCursor[j];
		// back up into our list
		dc -= 6;
		// reset this
		bool retried = false;

		// scan this docid's positions backwards, 6 bytes at a time.
		// do not include the beginning 12 byte key in this loop!
		for ( ; dc >= start ; dc -= 6 ) {
			// loop back here for the 12 byte key
		retry:
			// get the best hash group
			hgrp = Posdb::getHashGroup ( dc );

			// if not body, do not apply this algo because
			// we end up adding term pairs from each hash group
			if ( hgrp == HASHGROUP_INLINKTEXT ) {
				return -1.0;
			}
			//if ( hgrp == HASHGROUP_TITLE ) return -1.0;
			// loser?
			if ( m_msg39req->m_scoringWeights.m_hashGroupWeights[hgrp] < bestHashGroupWeight ) {
				// if in body, it's over for this termlist
				// because this is the last hash group
				// we will encounter.
				if ( hgrp == HASHGROUP_BODY ) {
					// @@@ BR: Dangerous assumption if we change indexing order in XmlDoc_Indexing ! @@@
					goto nextTermList;
				}
				// otherwise, keep chugging
				continue;
			}

			unsigned char dr = Posdb::getDensityRank ( dc );

			// a clean win?
			if ( m_msg39req->m_scoringWeights.m_hashGroupWeights[hgrp] > bestHashGroupWeight ) {
				bestHashGroupWeight = m_msg39req->m_scoringWeights.m_hashGroupWeights[hgrp];
				bestDensityRank = dr;
				continue;
			}

			// same hash group weight, but worse density rank?
			if ( dr < bestDensityRank ) {
				continue;
			}
			// better?
			if ( dr > bestDensityRank ) {
				bestDensityRank = dr;
			}
			// another tie, oh well... just ignore it
		}

		// handle the beginning 12 byte key: jump back into the loop
		// body once with dc at the saved cursor (the 12-byte docid key)
		if ( ! retried ) {
			retried = true;
			dc = qti->m_matchingSubListSavedCursor[j];
			goto retry;
		}

	nextTermList:
		continue;
	}

	// if nothing, then maybe all sublists were empty?
	if ( bestHashGroupWeight < 0 ) {
		return 0.0;
	}

	// assume perfect adjacency and that the other term is perfect;
	// weights are squared because pair scores multiply both terms' weights
	float score = 100.0;
	score *= bestHashGroupWeight;
	score *= bestHashGroupWeight;

	// since adjacent, 2nd term in pair will be in same sentence
	// TODO: fix this for 'qtm' it might have a better density rank and
	// better hashgroup weight, like being in title!
	score *= m_msg39req->m_scoringWeights.m_densityWeights[bestDensityRank];
	score *= m_msg39req->m_scoringWeights.m_densityWeights[bestDensityRank];

	// wiki bigram?
	if ( hadHalfStopWikiBigram ) {
		score *= WIKI_BIGRAM_WEIGHT;
		score *= WIKI_BIGRAM_WEIGHT;
	}

	//score *= perfectWordSpamWeight * perfectWordSpamWeight;
	score *= ( ((float)siteRank) * m_siteRankMultiplier + 1.0 );

	// language boost if language specified and if page is same language, or unknown language
	if ( m_msg39req->m_language != 0 ) {
		if ( m_msg39req->m_language == docLang ) {
			score *= ( m_msg39req->m_sameLangWeight );
		}
		else
		if ( docLang == 0 ) {
			score *= ( m_msg39req->m_unknownLangWeight );
		}
	}

	// assume the other term we pair with will be 1.0
	score *= qti->m_termFreqWeight;

	// the new logic to fix 'time enough for love' slowness
	if ( qdist ) {
		// no use it
		score *= qtm->m_termFreqWeight;

		// subtract qdist
		bestDist -= qdist;

		// assume in correct order
		if ( qdist < 0 ) {
			qdist *= -1;
		}
		// make it positive
		if ( bestDist < 0 ) {
			bestDist *= -1;
		}
		// avoid 0 division
		if ( bestDist > 1 ) {
			score /= (float)bestDist;
		}
	}

	// terms in same wikipedia phrase?
	//if ( wikiWeight != 1.0 )
	//	score *= WIKI_WEIGHT;
	// if query is 'time enough for love' it is kinda slow because
	// we were never multiplying by WIKI_WEIGHT, even though all
	// query terms were in the same wikipedia phrase. so see if this
	// speeds it up.
	if ( m_allInSameWikiPhrase ) {
		score *= WIKI_WEIGHT;
	}

	logTrace(g_conf.m_logTracePosdb, "END. score=%f", score);
	return score;
}
2016-07-04 15:06:15 +02:00
2016-08-30 15:18:21 +02:00
2016-08-29 17:05:34 +02:00
////////////////////
//
2016-08-30 15:18:21 +02:00
// "White list" functions used to find docids from only specific sites
2016-08-29 17:05:34 +02:00
//
////////////////////
2016-07-04 15:06:15 +02:00
2016-08-30 15:18:21 +02:00
// this is separate from allocTopTree() function below because we must
// call it for each iteration in Msg39::doDocIdSplitLoop() which is used
// to avoid reading huge termlists into memory. it breaks the huge lists
// up by smaller docid ranges and gets the search results for each docid
// range separately.
bool PosdbTable : : allocWhiteListTable ( ) {
//
// the whitetable is for the docids in the whitelist. we have
// to only show results whose docid is in the whitetable, which
// is from the "&sites=abc.com+xyz.com..." custom search site list
// provided by the user.
//
2016-09-04 23:01:01 +02:00
if ( m_msg39req - > size_whiteList < = 1 ) m_useWhiteTable = false ; // inclds \0
2016-08-30 15:18:21 +02:00
else m_useWhiteTable = true ;
int32_t sum = 0 ;
for ( int32_t i = 0 ; i < m_msg2 - > getNumWhiteLists ( ) ; i + + ) {
RdbList * list = m_msg2 - > getWhiteList ( i ) ;
if ( list - > isEmpty ( ) ) continue ;
// assume 12 bytes for all keys but first which is 18
int32_t size = list - > getListSize ( ) ;
sum + = size / 12 + 1 ;
}
if ( sum ) {
// making this sum * 3 does not show a speedup... hmmm...
int32_t numSlots = sum * 2 ;
// keep it restricted to 5 byte keys so we do not have to
// extract the docid, we can just hash the ptr to those
// 5 bytes (which includes 1 siterank bit as the lowbit,
// but should be ok since it should be set the same in
// all termlists that have that docid)
2017-04-07 13:06:56 +02:00
if ( ! m_whiteListTable . set ( 5 , 0 , numSlots , NULL , 0 , false , " wtall " , true ) ) {
2016-08-30 15:18:21 +02:00
return false ;
2017-04-07 13:06:56 +02:00
}
2016-08-30 15:18:21 +02:00
}
return true ;
}
2016-08-29 17:05:34 +02:00
2016-08-30 15:18:21 +02:00
void PosdbTable : : prepareWhiteListTable ( )
{
// hash the docids in the whitelist termlists into a hashtable.
// every docid in the search results must be in there. the
// whitelist termlists are from a provided "&sites=abc.com+xyz.com+.."
// cgi parm. the user only wants search results returned from the
// specified subdomains. there can be up to MAX_WHITELISTS (500)
// sites right now. this hash table must have been pre-allocated
// in Posdb::allocTopTree() above since we might be in a thread.
if ( m_addedSites )
return ;
for ( int32_t i = 0 ; i < m_msg2 - > getNumWhiteLists ( ) ; i + + ) {
RdbList * list = m_msg2 - > getWhiteList ( i ) ;
if ( list - > isEmpty ( ) ) continue ;
// sanity test
2016-09-06 12:46:08 +02:00
int64_t d1 = Posdb : : getDocId ( list - > getList ( ) ) ;
2016-08-30 15:18:21 +02:00
if ( d1 > m_msg2 - > docIdEnd ( ) ) {
log ( " posdb: d1=% " PRId64 " > % " PRId64 ,
d1 , m_msg2 - > docIdEnd ( ) ) ;
//gbshutdownAbort(true);
}
if ( d1 < m_msg2 - > docIdStart ( ) ) {
log ( " posdb: d1=% " PRId64 " < % " PRId64 ,
d1 , m_msg2 - > docIdStart ( ) ) ;
//gbshutdownAbort(true);
}
// first key is always 18 bytes cuz it has the termid
// scan recs in the list
for ( ; ! list - > isExhausted ( ) ; list - > skipCurrentRecord ( ) ) {
char * rec = list - > getCurrentRec ( ) ;
// point to the 5 bytes of docid
m_whiteListTable . addKey ( rec + 7 ) ;
}
}
m_addedSites = true ;
}
// Allocate the top-tree nodes and the scoring-info buffers sized from the
// termlist sizes and the number of docids requested (m_docsToGet).
// Returns false on allocation failure; returns true without allocating
// anything when all termlists are empty (so Msg39 can detect an empty
// docid range via m_topTree->m_numNodes == 0).
bool PosdbTable::allocTopScoringDocIdsData ( ) {
	// nn1: upper bound from what the caller asked for
	int64_t nn1 = m_msg39req->m_docsToGet;
	// nn2: upper bound from how many docids the termlists can hold
	int64_t nn2 = 0;
	// just add all up in case doing boolean OR or something
	for ( int32_t k = 0 ; k < m_msg2->getNumLists() ; k++ ) {
		// count
		RdbList *list = m_msg2->getList(k);
		// skip if null
		if ( ! list ) {
			continue;
		}
		// skip if list is empty, too
		if ( list->isEmpty() ) {
			continue;
		}
		// show if debug
		if ( m_debug ) {
			log(LOG_INFO, "toptree: adding listsize %" PRId32 " to nn2", list->getListSize());
		}
		// tally. each new docid in this termlist will compress
		// the 6 byte termid out, so reduce by 6.
		nn2 += list->getListSize() / ( sizeof(posdbkey_t) - 6 );
	}
	// if doing docid range phases where we compute the winning docids
	// for a range of docids to save memory, then we need to amp this up
	if ( m_msg39req->m_numDocIdSplits > 1 ) {
		// if 1 split has only 1 docid the other splits
		// might have 10 then this doesn't work, so make it
		// a min of 100.
		if ( nn2 < 100 ) {
			nn2 = 100;
		}
		// how many docid range splits are we doing?
		nn2 *= m_msg39req->m_numDocIdSplits;
		// just in case one split is not as big
		nn2 *= 2;
		// boost this guy too since we compare it to nn2
		if ( nn1 < 100 ) {
			nn1 = 100;
		}
		nn1 *= m_msg39req->m_numDocIdSplits;
		nn1 *= 2;
	}
	// do not go OOM just because client asked for 10B results and we
	// only have like 100 results.
	int64_t nn = gbmin( nn1, nn2 );
	// . do not alloc space for anything if all termlists are empty
	// . before, even if nn was 0, top tree would alloc a bunch of nodes
	//   and we don't want to do that now to save mem and so
	//   Msg39 can check
	//   if ( m_posdbTable.m_topTree->m_numNodes == 0 )
	//   to see if it should
	//   advance to the next docid range or not.
	if ( nn == 0 ) {
		return true;
	}
	// always at least 100 i guess. why? it messes up the
	// m_scoreInfoBuf capacity and it cores
	//if ( nn < 100 ) nn = 100;
	// but 30 is ok since m_scoreInfo buf uses 32
	nn = gbmax( nn, 30 );
	// double the node count so site clustering can discard half the tree
	if ( m_msg39req->m_doSiteClustering ) {
		nn *= 2;
	}
	// limit to 2B docids i guess
	nn = gbmin( nn, 2000000000 );

	if ( m_debug ) {
		log(LOG_INFO, "toptree: toptree: initializing %" PRId64 " nodes", nn);
	}

	if ( nn < m_msg39req->m_docsToGet ) {
		log("query: warning only getting up to %" PRId64 " docids "
		    "even though %" PRId32 " requested because termlist "
		    "sizes are so small!! splits=%" PRId32
		    , nn
		    , m_msg39req->m_docsToGet
		    , (int32_t)m_msg39req->m_numDocIdSplits
		    );
	}

	// keep it sane
	if ( nn > (int64_t)m_msg39req->m_docsToGet * 2 && nn > 60 ) {
		nn = (int64_t)m_msg39req->m_docsToGet * 2;
	}

	// this actually sets the # of nodes to MORE than nn!!!
	if ( ! m_topTree->setNumNodes(nn, m_msg39req->m_doSiteClustering) ) {
		log("toptree: toptree: error allocating nodes: %s",
		    mstrerror(g_errno));
		return false;
	}

	// let's use nn*4 to try to get as many score as possible, although
	// it may still not work!
	int32_t xx = nn; //m_msg39req->m_docsToGet ;
	// try to fix a core of growing this table in a thread when xx == 1
	xx = gbmax( xx, 32 );
	//if ( m_msg39req->m_doSiteClustering ) xx *= 4;
	m_maxScores = xx;
	// for seeing if a docid is in toptree. niceness=0.
	//if ( ! m_docIdTable.set(8,0,xx*4,NULL,0,false,0,"dotb") )
	//	return false;

	// per-docid scoring breakdown buffers, only if the caller wants them
	if ( m_msg39req->m_getDocIdScoringInfo ) {

		m_scoreInfoBuf.setLabel ("scinfobuf" );

		// . for holding the scoring info
		// . add 1 for the \0 safeMemcpy() likes to put at the end so
		//   it will not realloc on us
		if ( ! m_scoreInfoBuf.reserve ( xx * sizeof(DocIdScore) + 100 ) ) {
			return false;
		}

		// likewise how many query term pair scores should we get?
		int32_t numTerms = m_q->m_numTerms;

		// limit
		numTerms = gbmin( numTerms, 10 );

		// the pairs. divide by 2 since (x,y) is same as (y,x)
		int32_t numPairs = (numTerms * numTerms) / 2;

		// then for each pair assume no more than MAX_TOP reps, usually
		// it's just 1, but be on the safe side
		numPairs *= m_realMaxTop; //MAX_TOP;

		// now that is how many pairs per docid and can be 500! but we
		// still have to multiply by how many docids we want to
		// compute. so this could easily get into the megabytes, most
		// of the time we will not need nearly that much however.
		numPairs *= xx;

		m_pairScoreBuf.setLabel  ( "pairbuf" );
		m_singleScoreBuf.setLabel ("snglbuf" );

		// but alloc it just in case
		if ( ! m_pairScoreBuf.reserve ( numPairs * sizeof(PairScore) ) ) {
			return false;
		}

		// and for singles
		int32_t numSingles = numTerms * m_realMaxTop * xx; // MAX_TOP *xx;
		if ( ! m_singleScoreBuf.reserve ( numSingles * sizeof(SingleScore) ) ) {
			return false;
		}
	}

	// m_topScoringDocIdsBuf. MUST MATCH use in intersectLists10_r !
	int32_t nqt = m_q->m_numTerms;
	int32_t need = 0;
	need += 4 * nqt;              // wikiPhraseIds
	need += 4 * nqt;              // quotedStartIds
	need += 4 * nqt;              // qpos
	need += 4 * nqt;              // qtermNums
	need += sizeof(float) * nqt;  // freqWeights
	need += sizeof(char *) * nqt; // miniMergedList
	need += sizeof(char *) * nqt; // miniMergedEnd
	need += sizeof(char *) * nqt; // highestScoringNonBodyPos
	need += sizeof(char *) * nqt; // winnerStack
	need += sizeof(char *) * nqt; // xpos
	need += sizeof(char) * nqt;   // bflags
	need += sizeof(float) * nqt * nqt; // scoreMatrix
	m_topScoringDocIdsBuf.setLabel("stkbuf1");
	if ( ! m_topScoringDocIdsBuf.reserve( need ) ) {
		return false;
	}

	return true;
}
////////////////////
//
// Functions used to find candidate docids, including "Voting" functions
// used to find docids with all required term ids and functions to remove
// docids that do not match all required terms
//
////////////////////
//
// Run through each term sublist and remove all docids not
// found in the docid vote buffer
//
2017-06-01 16:06:45 +02:00
// Run through each query term's posdb termlist and compact it in place,
// keeping only the records whose docid appears in m_docIdVoteBuf (the
// intersection of all required terms). Negative ("-term") termlists are
// left untouched. Afterwards the QueryTermInfo matching-sublist pointers
// are re-pointed at the shrunken lists.
void PosdbTable::delNonMatchingDocIdsFromSubLists() {

	logTrace(g_conf.m_logTracePosdb, "BEGIN.");

	//phase 1: shrink the rdblists for all queryterms (except those with a minus sign)
	// newEndPtr[i] is the new end of query term i's list, or NULL if untouched
	std::valarray<char *> newEndPtr(m_q->m_numTerms);
	for ( int i = 0 ; i < m_q->m_numTerms ; i++ ) {
		newEndPtr[i] = NULL;
		if ( m_q->m_qterms[i].m_termSign == '-' )
			continue;
		RdbList *list = m_q->m_qterms[i].m_posdbListPtr;
		if ( ! list || list->isEmpty() )
			continue;
		// get that sublist
		char *subListPtr = list->getList();
		char *subListEnd = list->getListEnd();
		// compaction destination; copies happen within the same buffer,
		// dst always trails subListPtr so no overlap issues
		char *dst = subListPtr;
		// reset docid list ptrs
		const char *dp    = m_docIdVoteBuf.getBufStart();
		const char *dpEnd = dp + m_docIdVoteBuf.length();

		//log(LOG_INFO,"@@@@ i#%d subListPtr=%p subListEnd=%p", i, subListPtr, subListEnd);
		for(;;) {
			// scan the docid list for the current docid in this termlist
			for ( ; dp < dpEnd ; dp += 6 ) {
				// if current docid in docid list is >= the docid
				// in the sublist, stop. docid in list is 6 bytes and
				// subListPtr must be pointing to a 12 byte posdb rec.
				if ( *(const uint32_t *)(dp+1) > *(uint32_t *)(subListPtr+8) ) {
					break;
				}
				// try to catch up docid if it is behind
				if ( *(const uint32_t *)(dp+1) < *(uint32_t *)(subListPtr+8) ) {
					continue;
				}

				// check lower byte if equal
				if ( *(const unsigned char *)(dp) > (*(unsigned char *)(subListPtr+7) & 0xfc) ) {
					break;
				}

				if ( *(const unsigned char *)(dp) < (*(unsigned char *)(subListPtr+7) & 0xfc) ) {
					continue;
				}
				// docid matches: keep this record.
				// copy over the 12 byte key
				*(int64_t *)dst = *(int64_t *)subListPtr;
				*(int32_t *)(dst+8) = *(int32_t *)(subListPtr+8);
				// skip that
				dst += 12;
				subListPtr += 12;
				// copy over any 6 bytes keys following
				for(;;) {
					if ( subListPtr >= subListEnd ) {
						// give up on this exhausted term list!
						goto doneWithSubList;
					}
					// next docid will be next 12 byte key
					if ( ! (subListPtr[0] & 0x04) ) {
						break;
					}
					// otherwise it's 6 bytes
					*(int32_t *)dst = *(int32_t *)subListPtr;
					*(int16_t *)(dst+4) = *(int16_t *)(subListPtr+4);
					dst += 6;
					subListPtr += 6;
				}

			}

			// no vote-buf match: drop this docid's records.
			// skip that docid record in our termlist. it MUST have been
			// 12 bytes, a docid heading record.
			subListPtr += 12;

			// skip any following keys that are 6 bytes, that means they
			// share the same docid
			for(;;) {
				// list exhausted?
				if ( subListPtr >= subListEnd ) {
					goto doneWithSubList;
				}
				// stop if next key is 12 bytes, that is a new docid
				if ( ! (subListPtr[0] & 0x04) ) {
					break;
				}
				// skip it
				subListPtr += 6;
			}
		}
	doneWithSubList:
		//log(LOG_INFO,"@@@ shrunk #%d to %ld (%p-%p)", i, dst - list->getList(), list->getList(), dst);
		newEndPtr[i] = dst;
	}

	//phase 2: set the matchingsublist pointers in qti
	for ( int i = 0 ; i < m_numQueryTermInfos ; i++ ) {
		QueryTermInfo *qti = ((QueryTermInfo *)m_qiBuf.getBufStart()) + i;
		if ( qti->m_bigramFlags[0] & BF_NEGATIVE )
			continue; //don't modify sublist for negative terms
		qti->m_numMatchingSubLists = 0;
		for ( int j = 0 ; j < qti->m_numSubLists ; j++ ) {
			// find the query term owning this sublist to get its new end ptr
			for ( int k = 0 ; k < m_q->m_numTerms ; k++ ) {
				if ( qti->m_subLists[j] == m_q->m_qterms[k].m_posdbListPtr ) {
					char *newStartPtr = m_q->m_qterms[k].m_posdbListPtr->getList(); //same as always
					int32_t x = qti->m_numMatchingSubLists;
					qti->m_matchingSubListSize[x]        = newEndPtr[k] - newStartPtr;
					qti->m_matchingSubListStart[x]       = newStartPtr;
					qti->m_matchingSubListEnd[x]         = newEndPtr[k];
					qti->m_matchingSubListCursor[x]      = newStartPtr;
					qti->m_matchingSubListSavedCursor[x] = newStartPtr;
					qti->m_numMatchingSubLists++;
					break;
				}
			}
		}
	}

	logTrace(g_conf.m_logTracePosdb, "END.");
}
//
// Removes docids with vote -1, which is set for docids matching
// negative query terms (e.g. -rock)
//
// Removes docids from m_docIdVoteBuf that match a negative query term
// (e.g. -rock). Each matching docid's vote byte is first marked -1, then
// a compaction pass removes all marked 6-byte entries from the buffer.
// "qti" is the QueryTermInfo of the negative term whose sublists we scan.
void PosdbTable::delDocIdVotes ( const QueryTermInfo *qti ) {
	char *bufStart = m_docIdVoteBuf.getBufStart();
	char *voteBufPtr = NULL;
	char *voteBufEnd;
	char *subListPtr;
	char *subListEnd;

	logTrace(g_conf.m_logTracePosdb, "BEGIN.");

	// just scan each sublist vs. the docid list
	for ( int32_t i = 0 ; i < qti->m_numSubLists ; i++ ) {
		// get that sublist
		subListPtr = qti->m_subLists[i]->getList();
		subListEnd = qti->m_subLists[i]->getListEnd();
		// reset docid list ptrs
		voteBufPtr = m_docIdVoteBuf.getBufStart();
		voteBufEnd = voteBufPtr + m_docIdVoteBuf.length();

		// loop it
		while ( subListPtr < subListEnd ) {
			// scan for his docids and inc the vote
			for ( ; voteBufPtr < voteBufEnd ; voteBufPtr += 6 ) {
				// if current docid in docid list is >= the docid
				// in the sublist, stop. docid in list is 6 bytes and
				// subListPtr must be pointing to a 12 byte posdb rec.
				if ( *(uint32_t *)(voteBufPtr+1) > *(uint32_t *)(subListPtr+8) ) {
					break;
				}
				// less than? keep going
				if ( *(uint32_t *)(voteBufPtr+1) < *(uint32_t *)(subListPtr+8) ) {
					continue;
				}
				// top 4 bytes are equal. check lower single byte then.
				if ( *(unsigned char *)(voteBufPtr) > (*(unsigned char *)(subListPtr+7) & 0xfc) ) {
					break;
				}
				if ( *(unsigned char *)(voteBufPtr) < (*(unsigned char *)(subListPtr+7) & 0xfc) ) {
					continue;
				}
				// . equal! mark it as nuked!
				// (byte 5 is the vote byte of the 6-byte entry)
				voteBufPtr[5] = -1;
				// skip it
				voteBufPtr += 6;
				// advance subListPtr now
				break;
			}

			// if we've exhausted this docid list go to next sublist
			if ( voteBufPtr >= voteBufEnd ) {
				goto endloop2;
			}

			// skip that docid record in our termlist. it MUST have been
			// 12 bytes, a docid heading record.
			subListPtr += 12;

			// skip any following keys that are 6 bytes, that means they
			// share the same docid
			for ( ; subListPtr < subListEnd && ((*subListPtr) & 0x04) ; ) {
				subListPtr += 6;
			}

			// if we have more posdb recs in this sublist, then keep
			// adding our docid votes into the docid list
		}
	endloop2: ;
		// otherwise, advance to next sublist
	}

	// now remove docids with a -1 vote, they are nuked
	voteBufPtr = m_docIdVoteBuf.getBufStart();
	voteBufEnd = voteBufPtr + m_docIdVoteBuf.length();
	char *dst  = voteBufPtr;
	for ( ; voteBufPtr < voteBufEnd ; voteBufPtr += 6 ) {
		// do not re-copy it if it was in this negative termlist
		if ( voteBufPtr[5] == -1 ) {
			continue;
		}
		// copy it over. might be the same address!
		*(int32_t *)dst     = *(int32_t *)voteBufPtr;
		*(int16_t *)(dst+4) = *(int16_t *)(voteBufPtr+4);
		dst += 6;
	}

	// shrink the buffer size now
	m_docIdVoteBuf.setLength ( dst - bufStart );

	logTrace(g_conf.m_logTracePosdb, "END.");
	return;
}
2016-07-04 15:06:15 +02:00
2016-08-25 17:27:37 +02:00
2016-08-29 17:05:34 +02:00
//
// First call will allocate the m_docIdVoteBuf and add docids from the shortest
// term list to the buffer.
//
// Next calls will run through all term sublists (synonyms, term variations) and
// increase the score in the m_docIdVoteBuf for matching docids. Docids that do
// not match a term is removed, so we end up with list of docids matching all
// query terms.
//
void PosdbTable : : addDocIdVotes ( const QueryTermInfo * qti , int32_t listGroupNum ) {
logTrace ( g_conf . m_logTracePosdb , " BEGIN. " ) ;
// sanity check, we store this in a single byte below for voting
if ( listGroupNum > = 256 ) {
gbshutdownAbort ( true ) ;
2016-07-04 15:06:15 +02:00
}
2016-08-29 17:05:34 +02:00
// range terms tend to disappear if the docid's value falls outside
// of the specified range... gbmin:offerprice:190
bool isRangeTerm = false ;
2017-05-22 16:15:48 +02:00
const QueryTerm * qt = qti - > m_qt ;
2016-08-29 17:05:34 +02:00
if ( qt - > m_fieldCode = = FIELD_GBNUMBERMIN )
isRangeTerm = true ;
if ( qt - > m_fieldCode = = FIELD_GBNUMBERMAX )
isRangeTerm = true ;
if ( qt - > m_fieldCode = = FIELD_GBNUMBEREQUALFLOAT )
isRangeTerm = true ;
if ( qt - > m_fieldCode = = FIELD_GBNUMBERMININT )
isRangeTerm = true ;
if ( qt - > m_fieldCode = = FIELD_GBNUMBERMAXINT )
isRangeTerm = true ;
if ( qt - > m_fieldCode = = FIELD_GBNUMBEREQUALINT )
isRangeTerm = true ;
// if ( qt->m_fieldCode == FIELD_GBFIELDMATCH )
// isRangeTerm = true;
2016-07-04 15:06:15 +02:00
//
2016-08-29 17:05:34 +02:00
// add the first sublist's docids into the docid buf
2016-07-04 15:06:15 +02:00
//
2016-08-29 17:05:34 +02:00
if ( listGroupNum = = 0 ) {
// the first listGroup is not intersecting, just adding to
// the docid vote buf. that is, if the query is "jump car" we
// just add all the docids for "jump". Subsequent calls to
// this function will intersect those docids with the docids
// for "car", resulting in a buffer with docids that contain
// both terms.
2016-07-04 15:06:15 +02:00
2016-08-29 17:05:34 +02:00
makeDocIdVoteBufForRarestTerm ( qti , isRangeTerm ) ;
logTrace ( g_conf . m_logTracePosdb , " END. " ) ;
return ;
}
2016-07-04 15:06:15 +02:00
2016-08-29 17:05:34 +02:00
char * bufStart = m_docIdVoteBuf . getBufStart ( ) ;
char * voteBufPtr = NULL ;
char * voteBufEnd ;
char * subListPtr ;
char * subListEnd ;
2016-07-04 15:06:15 +02:00
2016-08-29 17:05:34 +02:00
//
// For each sublist (term variation) loop through all sublist records
// and compare with docids in the vote buffer. If a match is found, the
// score in the vote buffer is set to the current ListGroupNum.
2016-07-04 15:06:15 +02:00
//
2016-08-29 17:05:34 +02:00
// A sublist is a termlist for a particular query term, for instance
// the query term "jump" will have sublists for "jump" "jumps"
// "jumping" "jumped" and maybe even "jumpy", so that could be
// 5 sublists, and their QueryTermInfo::m_qtermNum should be the
// same for all 5.
2016-07-04 15:06:15 +02:00
//
2016-08-29 17:05:34 +02:00
for ( int32_t i = 0 ; i < qti - > m_numSubLists ; i + + ) {
// get that sublist
subListPtr = qti - > m_subLists [ i ] - > getList ( ) ;
subListEnd = qti - > m_subLists [ i ] - > getListEnd ( ) ;
// reset docid list ptrs
voteBufPtr = m_docIdVoteBuf . getBufStart ( ) ;
voteBufEnd = voteBufPtr + m_docIdVoteBuf . length ( ) ;
2016-08-25 17:27:37 +02:00
2016-08-29 17:05:34 +02:00
// loop it
handleNextSubListRecord :
// scan for his docids and inc the vote
for ( ; voteBufPtr < voteBufEnd ; voteBufPtr + = 6 ) {
// if current docid in docid list is >= the docid
// in the sublist, stop. docid in list is 6 bytes and
// subListPtr must be pointing to a 12 byte posdb rec.
if ( * ( uint32_t * ) ( voteBufPtr + 1 ) > * ( uint32_t * ) ( subListPtr + 8 ) ) {
break ;
}
2016-08-25 17:27:37 +02:00
2016-08-29 17:05:34 +02:00
// less than? keep going
if ( * ( uint32_t * ) ( voteBufPtr + 1 ) < * ( uint32_t * ) ( subListPtr + 8 ) ) {
2016-07-04 15:06:15 +02:00
continue ;
2016-08-25 17:27:37 +02:00
}
2016-08-29 17:05:34 +02:00
// top 4 bytes are equal. check lower single byte then.
if ( * ( unsigned char * ) ( voteBufPtr ) > ( * ( unsigned char * ) ( subListPtr + 7 ) & 0xfc ) ) {
break ;
}
if ( * ( unsigned char * ) ( voteBufPtr ) < ( * ( unsigned char * ) ( subListPtr + 7 ) & 0xfc ) ) {
continue ;
}
// if we are a range term, does this subtermlist
// for this docid meet the min/max requirements
// of the range term, i.e. gbmin:offprice:190.
// if it doesn't then do not add this docid to the
// docidVoteBuf, "voteBufPtr"
if ( isRangeTerm & & ! isTermValueInRange2 ( subListPtr , subListEnd , qt ) ) {
break ;
2016-08-25 17:27:37 +02:00
}
2016-08-29 17:05:34 +02:00
// . equal! record our vote!
// . we start at zero for the
// first termlist, and go to 1, etc.
voteBufPtr [ 5 ] = listGroupNum ;
// skip it
voteBufPtr + = 6 ;
// break out to advance subListPtr
break ;
}
// if we've exhausted this docid list go to next sublist
// since this docid is NOT in the current/ongoing intersection
// of the docids for each queryterm
if ( voteBufPtr > = voteBufEnd ) {
continue ;
2016-07-04 15:06:15 +02:00
}
2016-08-29 17:05:34 +02:00
// skip that docid record in our termlist. it MUST have been
// 12 bytes, a docid heading record.
subListPtr + = 12 ;
// skip any following keys that are 6 bytes, that means they
// share the same docid
2016-09-06 14:22:45 +02:00
for ( ; subListPtr < subListEnd & & ( ( * subListPtr ) & 0x04 ) ; ) {
subListPtr + = 6 ;
}
2016-08-25 17:27:37 +02:00
2016-08-29 17:05:34 +02:00
// if we have more posdb recs in this sublist, then keep
// adding our docid votes into the docid list
if ( subListPtr < subListEnd ) {
goto handleNextSubListRecord ;
2016-08-25 17:27:37 +02:00
}
2016-08-29 17:05:34 +02:00
// otherwise, advance to next sublist
2016-07-04 15:06:15 +02:00
}
2016-08-29 17:05:34 +02:00
//
// Shrink the docidbuf by removing docids with not enough
// votes which means they are missing a query term
//
voteBufPtr = m_docIdVoteBuf . getBufStart ( ) ;
voteBufEnd = voteBufPtr + m_docIdVoteBuf . length ( ) ;
char * dst = voteBufPtr ;
for ( ; voteBufPtr < voteBufEnd ; voteBufPtr + = 6 ) {
// skip if it has enough votes to be in search
// results so far
if ( voteBufPtr [ 5 ] ! = listGroupNum ) {
continue ;
}
// copy it over. might be the same address!
* ( int32_t * ) dst = * ( int32_t * ) voteBufPtr ;
* ( int16_t * ) ( dst + 4 ) = * ( int16_t * ) ( voteBufPtr + 4 ) ;
dst + = 6 ;
2016-08-25 17:27:37 +02:00
}
2016-07-04 15:06:15 +02:00
2016-08-29 17:05:34 +02:00
// shrink the buffer size
m_docIdVoteBuf . setLength ( dst - bufStart ) ;
logTrace ( g_conf . m_logTracePosdb , " END. " ) ;
}
2016-07-04 15:06:15 +02:00
2016-08-29 17:05:34 +02:00
//
// Initialize the vote buffer with docids from the shortest
// term (rarest required term) list and initialize the scores to 0.
// Called by addDocIdVotes
//
// The buffer consists of 6-byte entries: 1 low docid byte, 4 more docid
// bytes, and a 1-byte vote count. The count is incremented for each term
// that matches the docid, and after each run the list is "compacted" and
// shortened so only the matching docids are left.
//
void PosdbTable::makeDocIdVoteBufForRarestTerm(const QueryTermInfo *qti, bool isRangeTerm) {
	// per-sublist read cursors; a NULL cursor marks an exhausted sublist
	char *cursor[MAX_SUBLISTS];
	char *cursorEnd[MAX_SUBLISTS];

	logTrace(g_conf.m_logTracePosdb, "term id [%" PRId64 "] [%.*s]", qti->m_qt->m_termId, qti->m_qt->m_termLen, qti->m_qt->m_term);
	for (int32_t i = 0; i < qti->m_numSubLists; i++) {
		// get that sublist
		cursor[i] = qti->m_subLists[i]->getList();
		cursorEnd[i] = qti->m_subLists[i]->getListEnd();
	}

	char *bufStart = m_docIdVoteBuf.getBufStart();
	char *voteBufPtr = m_docIdVoteBuf.getBufStart();
	char *recPtr;
	char *minRecPtr;            // sublist record with the smallest docid this round
	char *lastMinRecPtr = NULL; // previous round's winner, for duplicate suppression
	int32_t mini = -1;          // index of the sublist that produced minRecPtr

	const QueryTerm *qt = qti->m_qt;

	// merge all sublists in docid order: repeatedly pick the smallest
	// docid among the sublist cursors and emit it once into the vote buf
	for (;;) {
		// reset this
		minRecPtr = NULL;

		// find the sublist whose current record has the smallest docid.
		// a 12-byte posdb key stores the upper 32 docid bits at offset +8
		// and the low docid bits in the top of byte +7 (hence the 0xfc mask).
		for (int32_t i = 0; i < qti->m_numSubLists; i++) {
			// skip if exhausted
			if (!cursor[i]) {
				continue;
			}
			// shortcut
			recPtr = cursor[i];
			// first live candidate becomes the provisional min
			if (!minRecPtr) {
				minRecPtr = recPtr;
				mini = i;
				continue;
			}
			// compare the upper 4 docid bytes
			if (*(uint32_t *)(recPtr + 8) > *(uint32_t *)(minRecPtr + 8)) {
				continue;
			}
			// a new min
			if (*(uint32_t *)(recPtr + 8) < *(uint32_t *)(minRecPtr + 8)) {
				minRecPtr = recPtr;
				mini = i;
				continue;
			}
			// upper bytes equal: compare the low docid bits in byte +7
			if ((*(unsigned char *)(recPtr + 7) & 0xfc) > (*(unsigned char *)(minRecPtr + 7) & 0xfc)) {
				continue;
			}
			// a new min
			if ((*(unsigned char *)(recPtr + 7) & 0xfc) < (*(unsigned char *)(minRecPtr + 7) & 0xfc)) {
				minRecPtr = recPtr;
				mini = i;
				continue;
			}
		}

		// if no min then all lists exhausted!
		if (!minRecPtr) {
			// update length
			m_docIdVoteBuf.setLength(voteBufPtr - bufStart);
			// all done!
			logTrace(g_conf.m_logTracePosdb, "END.");
			return;
		}

		bool inRange = false;

		// if we are a range term, does this subtermlist
		// for this docid meet the min/max requirements
		// of the range term, i.e. gbmin:offprice:190.
		// if it doesn't then do not add this docid to the
		// docidVoteBuf, "voteBufPtr"
		if (isRangeTerm) {
			if (isTermValueInRange2(cursor[mini], cursorEnd[mini], qt)) {
				inRange = true;
			}
		}

		// advance the winning cursor over its 12-byte docid key
		cursor[mini] += 12;
		// ...and over any trailing 6-byte keys sharing the same docid
		for (;;) {
			// end of list?
			if (cursor[mini] >= cursorEnd[mini]) {
				// use NULL to indicate list is exhausted
				cursor[mini] = NULL;
				break;
			}

			// if we hit a new 12 byte key for a new docid, stop
			if (!(cursor[mini][0] & 0x04)) {
				break;
			}

			// check range again
			// NOTE(review): cursor[mini] points at a 6-byte key here, but
			// isTermValueInRange2() skips 12 bytes after its first check as
			// if given a 12-byte key — looks redundant/misaligned; confirm
			// against the posdb key layout.
			if (isRangeTerm && isTermValueInRange2(cursor[mini], cursorEnd[mini], qt)) {
				inRange = true;
			}

			// otherwise, skip this 6 byte key
			cursor[mini] += 6;
		}

		// is it a docid dup? several sublists can carry the same docid;
		// emit each docid only once.
		if (lastMinRecPtr &&
		    *(uint32_t *)(lastMinRecPtr + 8) == *(uint32_t *)(minRecPtr + 8) &&
		    (*(unsigned char *)(lastMinRecPtr + 7) & 0xfc) == (*(unsigned char *)(minRecPtr + 7) & 0xfc)) {
			continue;
		}

		// . do not store the docid if not in the whitelist
		// . FIX: two lower bits, what are they? at minRecPtrs[7].
		// . well the lowest bit is the siterank upper bit and the
		//   other bit is always 0. we should be ok with just using
		//   the 6 bytes of the docid ptr as is though since the siterank
		//   should be the same for the site: terms we indexed for the same
		//   docid!!
		if (m_useWhiteTable && !m_whiteListTable.isInTable(minRecPtr + 7)) {
			continue;
		}

		// drop the docid when a range term never matched in any of its keys
		if (isRangeTerm && !inRange) {
			continue;
		}

		// only update this if we add the docid... that way there can be
		// a winning "inRange" term in another sublist and the docid will
		// get added.
		lastMinRecPtr = minRecPtr;

		// store our docid. actually it contains two lower bits not
		// part of the docid, so we'll have to shift and mask to get
		// the actual docid!
		// docid is only 5 bytes for now
		*(int32_t *)(voteBufPtr + 1) = *(int32_t *)(minRecPtr + 8);
		// the single lower byte
		voteBufPtr[0] = minRecPtr[7] & 0xfc;
		// 0 vote count
		voteBufPtr[5] = 0;

		// advance
		voteBufPtr += 6;
	}
}
2016-07-04 15:06:15 +02:00
2016-08-29 17:05:34 +02:00
// Build the docid vote buffer for a boolean query by evaluating each
// docid's operand bit vector against the boolean expression.
// TODO: do this in docid range phases to save memory and be much faster
// since we could contain to the L1 cache for hashing
bool PosdbTable::makeDocIdVoteBufForBoolQuery() {

	logTrace(g_conf.m_logTracePosdb, "BEGIN.");

	// . make a hashtable of all the docids from all the termlists
	// . the value slot will be the operand bit vector i guess
	// . the size of the vector needs one bit per query operand
	// . if the vector is only 1-2 bytes we can just evaluate each
	//   combination we encounter and store it into an array, otherwise,
	//   we can use another hashtable in order to avoid re-evaluation
	//   on if it passes the boolean query.
	char bitVec[MAX_OVEC_SIZE];
	if (m_vecSize > MAX_OVEC_SIZE) {
		m_vecSize = MAX_OVEC_SIZE;
	}

	QueryTermInfo *qtibuf = (QueryTermInfo *)m_qiBuf.getBufStart();

	// . scan each list of docids to a get a new docid, keep a dedup
	//   table to avoid re-processing the same docid.
	// . each posdb list we read corresponds to a query term,
	//   or a synonym of a query term, or bigram of a query term, etc.
	//   but we really want to know what operand, so we associate an
	//   operand bit with each query term, and each list can map to
	//   the base query term so we can get the operand # from that.
	for (int32_t i = 0; i < m_numQueryTermInfos; i++) {

		// get it
		QueryTermInfo *qti = &qtibuf[i];

		QueryTerm *qt = &m_q->m_qterms[qti->m_qtermNum];

		// if this query term # is a gbmin:offprice:190 type
		// of thing, then we may end up ignoring it based on the
		// score contained within!
		bool isRangeTerm = false;
		if (qt->m_fieldCode == FIELD_GBNUMBERMIN)
			isRangeTerm = true;
		if (qt->m_fieldCode == FIELD_GBNUMBERMAX)
			isRangeTerm = true;
		if (qt->m_fieldCode == FIELD_GBNUMBEREQUALFLOAT)
			isRangeTerm = true;
		if (qt->m_fieldCode == FIELD_GBNUMBERMININT)
			isRangeTerm = true;
		if (qt->m_fieldCode == FIELD_GBNUMBERMAXINT)
			isRangeTerm = true;
		if (qt->m_fieldCode == FIELD_GBNUMBEREQUALINT)
			isRangeTerm = true;

		// . make it consistent with Query::isTruth()
		// . m_bitNum is set above to the QueryTermInfo #
		int32_t bitNum = qt->m_bitNum;

		// set all to zeroes
		memset(bitVec, 0, m_vecSize);

		// set bitvec for this query term #
		// NOTE(review): "byte" is not checked against m_vecSize here;
		// presumably bitNum/8 < m_vecSize is guaranteed by the caller
		// that set m_vecSize — confirm.
		int32_t byte = bitNum / 8;
		unsigned char mask = 1 << (bitNum % 8);
		bitVec[byte] |= mask;

		// each query term can have synonym lists etc. scan those.
		// this includes the original query termlist as well.
		for (int32_t j = 0; j < qti->m_numSubLists; j++) {

			// scan all docids in this list
			char *p = qti->m_subLists[j]->getList();
			char *pend = qti->m_subLists[j]->getListEnd();

			// scan the sub termlist #j
			for (; p < pend; ) {
				int64_t docId = Posdb::getDocId(p);

				// assume this docid is not in range if we
				// had a range term like gbmin:offerprice:190
				bool inRange = false;

				// check each posdb key for compliance
				// for gbmin:offprice:190 bool terms
				if (isRangeTerm && isTermValueInRange(p, qt)) {
					inRange = true;
				}

				// this was the first key for this docid for
				// this termid and possibly the first key for
				// this termid, so skip it, either 12 or 18
				// bytes
				if ((((char *)p)[0]) & 0x02) {
					p += 12;
				}
				// the first key for this termid?
				else {
					p += 18;
				}
				// then only 6 byte keys would follow from the
				// same docid, so skip those as well
			subloop:
				if (p < pend && (((char *)p)[0]) & 0x04) {
					// check each posdb key for compliance
					// for gbmin:offprice:190 bool terms
					if (isRangeTerm && isTermValueInRange(p, qt)) {
						inRange = true;
					}
					p += 6;
					goto subloop;
				}

				// if we had gbmin:offprice:190 and it
				// was not satisfied, then do not OR in this
				// bit in the bitvector for the docid
				if (isRangeTerm && !inRange) {
					continue;
				}

				// store this docid though. treat as int64_t
				// but we mask with keymask
				int32_t slot = m_bt.getSlot(&docId);
				if (slot < 0) {
					// we can't alloc in a thread, careful
					if (!m_bt.addKey(&docId, bitVec)) {
						gbshutdownAbort(true);
					}
					continue;
				}
				// or the bit in otherwise
				char *bv = (char *)m_bt.getValueFromSlot(slot);
				bv[byte] |= mask;
			}
		}
	}

	char *dst = m_docIdVoteBuf.getBufStart();

	// . now our hash table is filled with all the docids
	// . evaluate each bit vector
	for (int32_t i = 0; i < m_bt.getNumSlots(); i++) {
		// skip if empty
		if (!m_bt.m_flags[i]) {
			continue;
		}

		// get the bit vector
		unsigned char *vec = (unsigned char *)m_bt.getValueFromSlot(i);

		// hash the vector so identical vectors share one cached verdict
		int64_t h64 = 0LL;
		for (int32_t k = 0; k < m_vecSize; k++)
			h64 ^= g_hashtab[(unsigned char)vec[k]][(unsigned char)k];

		// look up a cached boolean-query verdict for this vector hash
		char *val = (char *)m_ct.getValue(&h64);

		if (m_debug) {
			int64_t docId = *(int64_t *)m_bt.getKeyFromSlot(i);
			log(LOG_INFO, "query: eval d=%" PRIu64 " vec[0]=%" PRIx32 " h64=%" PRId64, docId, (int32_t)vec[0], h64);
		}

		// cached verdict says the vector passes: add the docid
		if (val && *val) {
			// it passes, add the docid
			int64_t docId = *(int64_t *)m_bt.getKeyFromSlot(i);

			if (m_debug) {
				log(LOG_INFO, "query: adding d=%" PRIu64 " bitVecSize=%" PRId32 " bitvec[0]=0x%" PRIx32 " (TRUE)",
				    docId, m_vecSize, (int32_t)vec[0]);
			}

			// shift up: the low 2 bits of a vote-buf key are not docid bits
			docId <<= 2;
			// a 6 byte key means you pass
			memcpy(dst, &docId, 6);
			dst += 6;
			continue;
		}

		// not cached: evaluate the vector against the boolean expression
		char include = m_q->matchesBoolQuery((unsigned char *)vec, m_vecSize);
		if (include) {
			// it passes, add the docid
			int64_t docId = *(int64_t *)m_bt.getKeyFromSlot(i);
			// fix it up
			if (m_debug) {
				log(LOG_INFO, "query: adding d=%" PRIu64 " vec[0]=0x%" PRIx32, docId, (int32_t)vec[0]);
			}
			// shift up
			docId <<= 2;
			// a 6 byte key means you pass
			memcpy(dst, &docId, 6);
			// test it: decode the 6 bytes back into a docid and verify
			if (m_debug) {
				int64_t d2;
				d2 = *(uint32_t *)(dst + 1);
				d2 <<= 8;
				d2 |= (unsigned char)dst[0];
				d2 >>= 2;
				docId >>= 2;
				if (d2 != docId)
					gbshutdownAbort(true);
			}
			// end test
			dst += 6;
		}
		// cache the verdict for this bit vector
		m_ct.addKey(&h64, &include);
	}

	// update SafeBuf::m_length
	m_docIdVoteBuf.setLength(dst - m_docIdVoteBuf.getBufStart());

	// now sort the docids. TODO: break makeDocIdVoteBufForBoolQuery()
	// up into docid ranges so we have like 1/100th the # of docids to
	// sort. that should make this part a lot faster.
	// i.e. 1000*log(1000) > 1000*(10*log(10))) --> 3000 > 1000
	// i.e. it's faster to break it down into 1000 pieces
	// i.e. for log base 2 maybe it's like 10x faster...
	qsort(m_docIdVoteBuf.getBufStart(),
	      m_docIdVoteBuf.length() / 6,
	      6,
	      docIdVoteBufKeyCompare_desc);

	logTrace(g_conf.m_logTracePosdb, "END.");
	return true;
}
2016-07-04 15:06:15 +02:00
2016-07-27 16:06:29 +02:00
2016-08-29 17:05:34 +02:00
////////////////////
//
// Global functions
//
////////////////////
2016-07-04 15:06:15 +02:00
2016-07-27 16:06:29 +02:00
2016-08-29 17:05:34 +02:00
// qsort() comparator for the 6-byte entries of the docid vote buffer;
// sorts the entries in descending order.
static int docIdVoteBufKeyCompare_desc(const void *h1, const void *h2) {
	const char *key1 = (const char *)h1;
	const char *key2 = (const char *)h2;
	return KEYCMP(key1, key2, 6);
}
2016-08-29 17:05:34 +02:00
// for boolean queries containing terms like gbmin:offerprice:190
static inline bool isTermValueInRange ( const char * p , const QueryTerm * qt ) {
2016-07-04 15:06:15 +02:00
2016-08-29 17:05:34 +02:00
// return false if outside of range
if ( qt - > m_fieldCode = = FIELD_GBNUMBERMIN ) {
2016-09-06 12:46:08 +02:00
float score2 = Posdb : : getFloat ( p ) ;
2016-08-29 17:05:34 +02:00
return ( score2 > = qt - > m_qword - > m_float ) ;
2016-07-04 15:06:15 +02:00
}
2016-08-29 17:05:34 +02:00
if ( qt - > m_fieldCode = = FIELD_GBNUMBERMAX ) {
2016-09-06 12:46:08 +02:00
float score2 = Posdb : : getFloat ( p ) ;
2016-08-29 17:05:34 +02:00
return ( score2 < = qt - > m_qword - > m_float ) ;
2016-07-04 15:06:15 +02:00
}
2016-08-29 17:05:34 +02:00
if ( qt - > m_fieldCode = = FIELD_GBNUMBEREQUALFLOAT ) {
2016-09-06 12:46:08 +02:00
float score2 = Posdb : : getFloat ( p ) ;
2016-10-23 22:52:51 +02:00
return ( almostEqualFloat ( score2 , qt - > m_qword - > m_float ) ) ;
2016-08-29 17:05:34 +02:00
}
2016-07-04 15:06:15 +02:00
2016-08-29 17:05:34 +02:00
if ( qt - > m_fieldCode = = FIELD_GBNUMBERMININT ) {
2016-09-06 12:46:08 +02:00
int32_t score2 = Posdb : : getInt ( p ) ;
2016-08-29 17:05:34 +02:00
return ( score2 > = qt - > m_qword - > m_int ) ;
2016-07-04 15:06:15 +02:00
}
2016-08-29 17:05:34 +02:00
if ( qt - > m_fieldCode = = FIELD_GBNUMBERMAXINT ) {
2016-09-06 12:46:08 +02:00
int32_t score2 = Posdb : : getInt ( p ) ;
2016-08-29 17:05:34 +02:00
return ( score2 < = qt - > m_qword - > m_int ) ;
}
2016-07-04 15:06:15 +02:00
2016-08-29 17:05:34 +02:00
if ( qt - > m_fieldCode = = FIELD_GBNUMBEREQUALINT ) {
2016-09-06 12:46:08 +02:00
int32_t score2 = Posdb : : getInt ( p ) ;
2016-08-29 17:05:34 +02:00
return ( score2 = = qt - > m_qword - > m_int ) ;
}
2016-07-04 15:06:15 +02:00
2016-08-29 17:05:34 +02:00
// if ( qt->m_fieldCode == FIELD_GBFIELDMATCH ) {
2016-09-06 12:46:08 +02:00
// int32_t score2 = Posdb::getInt ( p );
2016-08-29 17:05:34 +02:00
// return ( score2 == qt->m_qword->m_int );
// }
2016-07-04 15:06:15 +02:00
2016-08-29 17:05:34 +02:00
// how did this happen?
gbshutdownAbort ( true ) ;
2016-07-04 15:06:15 +02:00
}
2016-08-29 17:05:34 +02:00
// Range check across ALL posdb keys of one docid: the leading 12-byte
// key at "recPtr" plus every trailing 6-byte key (bit 0x04 set) up to
// "subListEnd". True as soon as any key's value satisfies "qt".
static inline bool isTermValueInRange2(const char *recPtr, const char *subListEnd, const QueryTerm *qt) {
	// the 12-byte docid key itself
	if (isTermValueInRange(recPtr, qt)) {
		return true;
	}
	// then each following 6-byte key belonging to the same docid
	for (const char *key = recPtr + 12; key < subListEnd && ((*key) & 0x04); key += 6) {
		if (isTermValueInRange(key, qt)) {
			return true;
		}
	}
	return false;
}
2016-07-04 15:06:15 +02:00
2016-08-29 17:05:34 +02:00
// . b-step into list looking for docid "docId"
// . assume p is start of list, excluding 6 byte of termid
// . returns a pointer to the 12-byte posdb key for "docId", NULL when
//   the docid is not in the list. If only a NEGATIVE (delete) key exists
//   for the docid, that key is returned and the caller must check its
//   sign bit.
static inline const char *getWordPosList(uint64_t docId, const char *list, int32_t listSize) {
	// make step divisible by 6 initially
	int32_t step = (listSize / 12) * 6;
	// shortcut
	const char *listEnd = list + listSize;
	// divide in half
	const char *p = list + step;
	// for detecting not founds: counts minimal-step probes
	char count = 0;
	for (;;) {
		// save the probe position so we can step relative to it later
		const char *origp = p;
		// scan backwards to the docid's leading 12-byte key. we use this
		// special bit to distinguish between 6-byte and 12-byte posdb keys
		for (; p > list && (p[1] & 0x02); ) {
			p -= 6;
		}
		// ok, we hit a 12 byte key i guess, so backup 6 more
		p -= 6;

		// ok, we got a 12-byte key then i guess
		uint64_t d = Posdb::getDocId(p);
		// we got a match, but it might be a NEGATIVE key so
		// we have to try to find the positive keys in that case
		if (d == docId) {
			// if its positive, no need to do anything else
			if ((p[0] & 0x01) == 0x01) return p;
			// ok, it's negative, try to see if the positive is
			// in here, if not then return NULL.
			// save current pos
			const char *current = p;
			// back up to 6 byte key before this 12 byte key
			p -= 6;
			// now go backwards to previous 12 byte key
			for (; p > list && (p[1] & 0x02); ) {
				p -= 6;
			}
			// ok, we hit a 12 byte key i guess, so backup 6 more
			p -= 6;
			// is it there?
			if (p >= list && Posdb::getDocId(p) == docId) {
				// sanity. return NULL if its negative! wtf????
				if ((p[0] & 0x01) == 0x00) return NULL;
				// got it
				return p;
			}
			// ok, no positive before us, try after us
			p = current;
			// advance over current 12 byte key
			p += 12;
			// now go forwards to next 12 byte key
			for (; p < listEnd && (p[1] & 0x02); ) {
				p += 6;
			}
			// is it there?
			// NOTE(review): "p + 12 < listEnd" rejects a key that ends
			// exactly at listEnd; presumably a guard for the getDocId()
			// read — confirm it cannot miss the final key of the list.
			if (p + 12 < listEnd && Posdb::getDocId(p) == docId) {
				// sanity. return NULL if its negative! wtf????
				if ((p[0] & 0x01) == 0x00) {
					return NULL;
				}
				// got it
				return p;
			}
			// . crap, i guess just had a single negative docid then
			// . return that and the caller will see its negative
			return current;
		}
		// halve the step for the next probe
		step >>= 1;
		// . make divisible by 6!
		// . TODO: speed this up!!!
		step = step - (step % 6);
		// sanity
		if (step % 6) {
			gbshutdownAbort(true);
		}
		// ensure never 0
		if (step <= 0) {
			step = 6;
			// give up (not found) after a few minimal-step probes
			if (count++ >= 2) {
				return NULL;
			}
		}
		// go up or down then, clamped to the list bounds
		if (d < docId) {
			p = origp + step;
			if (p >= listEnd) {
				p = listEnd - 6;
			}
		}
		else {
			p = origp - step;
			if (p < list) {
				p = list;
			}
		}
	}
}
2016-07-04 15:06:15 +02:00
2016-08-29 17:05:34 +02:00
// initialize the weights table
static void initWeights ( ) {
2016-09-16 12:14:51 +02:00
ScopedLock sl ( s_mtx_weights ) ;
2016-08-29 17:05:34 +02:00
if ( s_init ) {
return ;
}
2016-10-14 11:57:15 +02:00
logTrace ( g_conf . m_logTracePosdb , " BEGIN. " ) ;
2016-08-29 17:05:34 +02:00
2017-01-20 16:27:16 +01:00
s_scoringWeights . init ( g_conf . m_diversityWeightMin , g_conf . m_diversityWeightMax ,
g_conf . m_densityWeightMin , g_conf . m_densityWeightMax ,
g_conf . m_hashGroupWeightBody ,
g_conf . m_hashGroupWeightTitle ,
g_conf . m_hashGroupWeightHeading ,
g_conf . m_hashGroupWeightInlist ,
g_conf . m_hashGroupWeightInMetaTag ,
g_conf . m_hashGroupWeightInLinkText ,
g_conf . m_hashGroupWeightInTag ,
g_conf . m_hashGroupWeightNeighborhood ,
g_conf . m_hashGroupWeightInternalLinkText ,
g_conf . m_hashGroupWeightInUrl ,
g_conf . m_hashGroupWeightInMenu ) ;
2016-08-29 17:05:34 +02:00
// if two hashgroups are comaptible they can be paired
for ( int32_t i = 0 ; i < HASHGROUP_END ; i + + ) {
// set this
s_inBody [ i ] = false ;
// is it body?
if ( i = = HASHGROUP_BODY | |
i = = HASHGROUP_HEADING | |
i = = HASHGROUP_INLIST | |
i = = HASHGROUP_INMENU )
s_inBody [ i ] = true ;
2016-10-10 16:02:50 +02:00
2016-08-29 17:05:34 +02:00
for ( int32_t j = 0 ; j < HASHGROUP_END ; j + + ) {
// assume not
s_isCompatible [ i ] [ j ] = false ;
// or both in body (and not title)
bool inBody1 = true ;
if ( i ! = HASHGROUP_BODY & &
i ! = HASHGROUP_HEADING & &
i ! = HASHGROUP_INLIST & &
//i != HASHGROUP_INURL &&
i ! = HASHGROUP_INMENU )
inBody1 = false ;
bool inBody2 = true ;
if ( j ! = HASHGROUP_BODY & &
j ! = HASHGROUP_HEADING & &
j ! = HASHGROUP_INLIST & &
//j != HASHGROUP_INURL &&
j ! = HASHGROUP_INMENU )
inBody2 = false ;
// no body allowed now!
if ( inBody1 | | inBody2 ) {
continue ;
}
2016-07-04 15:06:15 +02:00
2016-08-29 17:05:34 +02:00
//if ( ! inBody ) continue;
// now neither can be in the body, because we handle
// those cases in the new sliding window algo.
// if one term is only in the link text and the other
// term is only in the title, ... what then? i guess
// allow those here, but they will be penalized
// some with the fixed distance of like 64 units or
// something...
s_isCompatible [ i ] [ j ] = true ;
// if either is in the body then do not allow now
// and handle in the sliding window algo
//s_isCompatible[i][j] = 1;
}
}
2016-07-04 15:06:15 +02:00
2016-09-16 12:14:51 +02:00
s_init = true ;
# ifdef _VALGRIND_
//we read from the weight tables without locking. tell helgrind to ignore that
2017-01-20 16:27:16 +01:00
VALGRIND_HG_DISABLE_CHECKING ( & s_scoringWeights , sizeof ( s_scoringWeights ) ) ;
2016-09-16 12:14:51 +02:00
VALGRIND_HG_DISABLE_CHECKING ( s_isCompatible , sizeof ( s_isCompatible ) ) ;
VALGRIND_HG_DISABLE_CHECKING ( s_inBody , sizeof ( s_inBody ) ) ;
# endif
2016-08-29 17:05:34 +02:00
logTrace ( g_conf . m_logTracePosdb , " END. " ) ;
}
2016-07-04 15:06:15 +02:00
2016-08-29 17:05:34 +02:00
// Force a rebuild of the scoring weight tables from current g_conf values.
// Called when ranking settings are changed. Normally called from update-parameter
// broadcast handling (see handleRequest3fLoop() )
// NOTE(review): s_init is cleared here WITHOUT holding s_mtx_weights while
// other threads may be reading the tables lock-free — presumably tolerated
// (helgrind checking is disabled on the tables), but confirm.
void reinitializeRankingSettings()
{
	s_init = false;
	initWeights();
}
2016-07-04 15:06:15 +02:00
2016-08-29 17:05:34 +02:00
float getHashGroupWeight ( unsigned char hg ) {
2016-09-16 12:14:51 +02:00
initWeights ( ) ;
2016-07-04 15:06:15 +02:00
2017-01-20 16:27:16 +01:00
return s_scoringWeights . m_hashGroupWeights [ hg ] ;
2016-08-29 17:05:34 +02:00
}
2016-07-04 15:06:15 +02:00
2016-08-29 17:05:34 +02:00
float getDiversityWeight ( unsigned char diversityRank ) {
2016-09-16 12:14:51 +02:00
initWeights ( ) ;
2016-07-04 15:06:15 +02:00
2017-01-20 16:27:16 +01:00
return s_scoringWeights . m_diversityWeights [ diversityRank ] ;
2016-08-29 17:05:34 +02:00
}
2016-07-04 15:06:15 +02:00
2016-08-29 17:05:34 +02:00
float getDensityWeight ( unsigned char densityRank ) {
2016-09-16 12:14:51 +02:00
initWeights ( ) ;
2016-07-04 15:06:15 +02:00
2017-01-20 16:27:16 +01:00
return s_scoringWeights . m_densityWeights [ densityRank ] ;
2016-08-29 17:05:34 +02:00
}
2016-07-04 15:06:15 +02:00
2016-08-29 17:05:34 +02:00
float getWordSpamWeight ( unsigned char wordSpamRank ) {
2016-09-16 12:14:51 +02:00
initWeights ( ) ;
2016-07-04 15:06:15 +02:00
2017-01-20 16:27:16 +01:00
return s_scoringWeights . m_wordSpamWeights [ wordSpamRank ] ;
2016-08-29 17:05:34 +02:00
}
2016-07-04 15:06:15 +02:00
2016-08-25 17:27:37 +02:00
2016-08-29 17:05:34 +02:00
float getLinkerWeight ( unsigned char wordSpamRank ) {
2016-09-16 12:14:51 +02:00
initWeights ( ) ;
2016-07-04 15:06:15 +02:00
2017-01-20 16:27:16 +01:00
return s_scoringWeights . m_linkerWeights [ wordSpamRank ] ;
2016-07-04 15:06:15 +02:00
}