// Matt Wells, Copyright May 2012

// . format of an 18-byte posdb key
// tttttttt tttttttt tttttttt tttttttt  t = termId (48bits)
// tttttttt tttttttt dddddddd dddddddd  d = docId (38 bits)
// dddddddd dddddddd dddddd0r rrrggggg  r = siterank, g = langid
// wwwwwwww wwwwwwww wwGGGGss ssvvvvFF  w = word position, s = wordspamrank
// pppppb1N MMMMLZZD                    v = diversityrank, p = densityrank
//                                      M = multiplier, b = in outlink text
//                                      L = langIdShiftBit (upper bit)
// G: 0 = body
//    1 = intitletag
//    2 = inheading
//    3 = inlist
//    4 = inmetatag
//    5 = inlinktext
//    6 = tag
//    7 = inneighborhood
//    8 = internalinlinktext
//    9 = inurl
//
// F: 0 = original term
//    1 = conjugate/sing/plural
//    2 = synonym
//    3 = hyponym

// NOTE: N bit is 1 if the shard of the record is determined by the
// termid (t bits) and NOT the docid (d bits). N stands for "nosplit"
// and you can find that logic in XmlDoc.cpp and Msg4.cpp. We store
// the hash of the content like this so we can see if it is a dup.

// NOTE: M bits hold scaling factor (logarithmic) for link text voting
// so we do not need to repeat the same link text over and over again.
// Use M bits to hold # of inlinks the page has for other terms.

// NOTE: for inlinktext terms the spam rank is the siterank of the
// inlinker!

// NOTE: densityrank for title is based on # of title words only. same goes
// for incoming inlink text.

// NOTE: now we can b-step into the termlist looking for a docid match
// and not worry about misalignment from the double compression scheme
// because if the 6th byte's low bit is clear that means it's a docid
// 12-byte key, otherwise it's the word position 6-byte key since the delbit
// can't be clear for those!

// THEN we can play with a tuner for how these various things affect
// the search results ranking.

#ifndef _POSDB_H_
#define _POSDB_H_

#include "Rdb.h"
#include "Conf.h"
//#include "Indexdb.h"
#include "Titledb.h" // DOCID_MASK
#include "HashTableX.h"
#include "Sections.h"

#define MAXSITERANK      0x0f       // 4 bits
#define MAXLANGID        0x3f       // 6 bits (5 bits go in 'g' the other in 'L')
#define MAXWORDPOS       0x0003ffff // 18 bits
#define MAXDENSITYRANK   0x1f       // 5 bits
#define MAXWORDSPAMRANK  0x0f       // 4 bits
#define MAXDIVERSITYRANK 0x0f       // 4 bits
#define MAXHASHGROUP     0x0f       // 4 bits
#define MAXMULTIPLIER    0x0f       // 4 bits
#define MAXISSYNONYM     0x03       // 2 bits

// values for G bits in the posdb key
#define HASHGROUP_BODY               0 // body implied
#define HASHGROUP_TITLE              1
#define HASHGROUP_HEADING            2 // body implied
#define HASHGROUP_INLIST             3 // body implied
#define HASHGROUP_INMETATAG          4
#define HASHGROUP_INLINKTEXT         5
#define HASHGROUP_INTAG              6
#define HASHGROUP_NEIGHBORHOOD       7
#define HASHGROUP_INTERNALINLINKTEXT 8
#define HASHGROUP_INURL              9
#define HASHGROUP_INMENU             10 // body implied
#define HASHGROUP_END                11

float getDiversityWeight ( unsigned char diversityRank );
float getDensityWeight   ( unsigned char densityRank );
float getWordSpamWeight  ( unsigned char wordSpamRank );
float getLinkerWeight    ( unsigned char wordSpamRank );
char *getHashGroupString ( unsigned char hg );
float getHashGroupWeight ( unsigned char hg );
float getTermFreqWeight  ( int64_t termFreq , int64_t numDocsInColl );

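// Illustrative sketch, not taken from Posdb.cpp: these helpers map the
// small rank values packed into a posdb key (see the accessors in the
// Posdb class below) to float weights. A per-occurrence weight could be
// composed roughly like this; the real combination lives in the scoring
// code, so treat this as an assumption:
//
//   float w = getHashGroupWeight ( g_posdb.getHashGroup     ( k ) )
//           * getDensityWeight   ( g_posdb.getDensityRank   ( k ) )
//           * getDiversityWeight ( g_posdb.getDiversityRank ( k ) )
//           * getWordSpamWeight  ( g_posdb.getWordSpamRank  ( k ) );
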
#define SYNONYM_WEIGHT     0.90
#define WIKI_WEIGHT        0.10 // was 0.20
#define SITERANKDIVISOR    3.0
#define SITERANKMULTIPLIER 0.33333333
//#define SAMELANGMULT 20.0 // FOREIGNLANGDIVISOR 2.0

#define POSDBKEY key144_t

#define BF_HALFSTOPWIKIBIGRAM 0x01 // "to be" in "to be or not to be"
#define BF_PIPED              0x02 // before a query pipe operator
#define BF_SYNONYM            0x04
#define BF_NEGATIVE           0x08 // query word has a negative sign before it
#define BF_BIGRAM             0x10 // query term is a bigram
#define BF_NUMBER             0x20 // is it like gbsortby:price? numeric?
#define BF_FACET              0x40 // gbfacet:price

void printTermList ( int32_t i, char *list, int32_t listSize ) ;

// if query is 'the tigers' we weight bigram "the tigers" x 1.20 because
// it's in wikipedia.
// up this to 1.40 for 'the time machine' query
#define WIKI_BIGRAM_WEIGHT 1.40

class Posdb {

 public:

    // resets rdb
    void reset();

    // sets up our m_rdb from g_conf (global conf class)
    bool init ( );

    // init the rebuild/secondary rdb, used by PageRepair.cpp
    bool init2 ( int32_t treeMem );

    bool verify ( char *coll );

    bool addColl ( char *coll, bool doVerify = true );

    // . xmldoc.cpp should call this
    // . store all posdb keys from revdbList into one hashtable
    //   and only add to new list if not in there
    //bool makeList ( class RdbList *revdbList ,
    //                int64_t docId ,
    //                class Words *words );

    // . make an 18-byte key from all these components
    // . since it is 18 bytes, the big bit will be set
    void makeKey ( void    *kp ,
                   int64_t  termId ,
                   uint64_t docId ,
                   int32_t  wordPos ,
                   char     densityRank ,
                   char     diversityRank ,
                   char     wordSpamRank ,
                   char     siteRank ,
                   char     hashGroup ,
                   char     langId ,
                   // multiplier: we convert into 7 bits in this function
                   int32_t  multiplier ,
                   bool     isSynonym ,
                   bool     isDelKey ,
                   bool     shardByTermId );

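    // Illustrative sketch (hypothetical values, not from this file):
    // packing one word occurrence into a full key, then reading the
    // fields back with the accessors further below.
    //
    //   POSDBKEY k;
    //   g_posdb.makeKey ( &k , termId , docId , wordPos ,
    //                     densityRank , diversityRank , wordSpamRank ,
    //                     siteRank , HASHGROUP_BODY , langId ,
    //                     0 ,      // multiplier
    //                     false ,  // isSynonym
    //                     false ,  // isDelKey
    //                     false ); // shardByTermId
    //   // g_posdb.getTermId(&k), getDocId(&k), getWordPos(&k), etc. then
    //   // recover the packed fields.
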
    // make just the 6 byte key
    void makeKey48 ( char *kp ,
                     int32_t wordPos ,
                     char densityRank ,
                     char diversityRank ,
                     char wordSpamRank ,
                     char hashGroup ,
                     char langId ,
                     bool isSynonym ,
                     bool isDelKey );

    int printList ( RdbList &list ) ;

    // we map the 32bit score to like 7 bits here
    void setMultiplierBits ( void *vkp , unsigned char mbits ) {
        key144_t *kp = (key144_t *)vkp;
        if ( mbits > MAXMULTIPLIER ) { char *xx=NULL;*xx=0; }
        kp->n0 &= 0xfc0f;
        // map score to bits
        kp->n0 |= ((uint16_t)mbits) << 4;
    }

    void setDocIdBits ( void *vkp , uint64_t docId ) {
        key144_t *kp = (key144_t *)vkp;
        kp->n1 &= 0x000003ffffffffffLL;
        kp->n1 |= (docId<<(32+10));
        kp->n2 &= 0xffffffffffff0000LL;
        kp->n2 |= docId>>22;
    }

    void setSiteRankBits ( void *vkp , char siteRank ) {
        key144_t *kp = (key144_t *)vkp;
        if ( siteRank > MAXSITERANK ) { char *xx=NULL;*xx=0; }
        kp->n1 &= 0xfffffe1fffffffffLL;
        kp->n1 |= ((uint64_t)siteRank)<<(32+5);
    }

    void setLangIdBits ( void *vkp , char langId ) {
        key144_t *kp = (key144_t *)vkp;
        if ( langId > MAXLANGID ) { char *xx=NULL;*xx=0; }
        kp->n1 &= 0xffffffe0ffffffffLL;
        // put the lower 5 bits here
        kp->n1 |= ((uint64_t)(langId&0x1f))<<(32);
        // and the upper 6th bit here. n0 is an int16_t.
        // 0011 1111
        if ( langId & 0x20 ) kp->n0 |= 0x08;
    }

    // set the word position bits et al to this float
    void setFloat ( void *vkp , float f ) {
        *(float *)(((char *)vkp) + 2) = f; };

    void setInt ( void *vkp , int32_t x ) {
        *(int32_t *)(((char *)vkp) + 2) = x; };

    // and read the float as well
    float getFloat ( void *vkp ) {
        return *(float *)(((char *)vkp) + 2); };

    int32_t getInt ( void *vkp ) {
        return *(int32_t *)(((char *)vkp) + 2); };

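    // Illustrative sketch (assumption: these float/int accessors are only
    // used for numeric gbsortby:/gbmin:-style term keys, where the
    // word-position bits carry a value instead of a position):
    //
    //   g_posdb.setFloat ( &k , 1.99 );
    //   float price = g_posdb.getFloat ( &k );
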
    void setAlignmentBit ( void *vkp , char val ) {
        char *p = (char *)vkp;
        if ( val ) p[1] = p[1] | 0x02;
        else       p[1] = p[1] & 0xfd;
    };

    bool isAlignmentBitClear ( void *vkp ) {
        return ( ( ((char *)vkp)[1] & 0x02 ) == 0x00 );
    };

    void makeStartKey ( void *kp, int64_t termId ,
                        int64_t docId=0LL ) {
        return makeKey ( kp,
                         termId ,
                         docId,
                         0, // wordpos
                         0, // density
                         0, // diversity
                         0, // wordspam
                         0, // siterank
                         0, // hashgroup
                         0, // langid
                         0, // multiplier
                         0, // issynonym/etc.
                         true ,   // isdelkey
                         false ); // shardbytermid?
    };

    void makeEndKey ( void *kp, int64_t termId,
                      int64_t docId = MAX_DOCID ) {
        return makeKey ( kp,
                         termId ,
                         docId,
                         MAXWORDPOS,
                         MAXDENSITYRANK,
                         MAXDIVERSITYRANK,
                         MAXWORDSPAMRANK,
                         MAXSITERANK,
                         MAXHASHGROUP,
                         MAXLANGID,
                         MAXMULTIPLIER,
                         MAXISSYNONYM, // issynonym/etc.
                         false,   // isdelkey
                         true );  // shard by termid?
    };

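    // Illustrative sketch (hypothetical variables): building the key range
    // that spans every posdb record for one termId, i.e. a whole termlist.
    //
    //   key144_t startKey , endKey;
    //   g_posdb.makeStartKey ( &startKey , termId );
    //   g_posdb.makeEndKey   ( &endKey   , termId );
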
    // we got two compression bits!
    unsigned char getKeySize ( void *key ) {
        if ( (((char *)key)[0])&0x04 ) return 6;
        if ( (((char *)key)[0])&0x02 ) return 12;
        return 18;
    };

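    // Illustrative sketch (hypothetical "list"/"listEnd" pointers into a
    // raw posdb termlist): stepping over keys using the compression bits.
    // 18- and 12-byte keys start a new docid; the 6-byte keys that follow
    // are additional word positions for that same docid.
    //
    //   for ( char *p = list ; p < listEnd ; p += g_posdb.getKeySize(p) ) {
    //           ...
    //   }
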
    // PosdbTable uses this to skip from one docid to the next docid
    // in a posdblist
    char *getNextDocIdSublist ( char *p , char *listEnd ) {
        // key must be 12
        //if ( getKeySize(p) != 12 ) { char *xx=NULL;*xx=0; }
        // skip that first key
        p += 12;
        // skip the 6 byte keys
        for ( ; p < listEnd && getKeySize(p) == 6 ; p += 6 );
        // done
        return p;
    }

    int64_t getTermId ( void *key ) {
        return ((key144_t *)key)->n2 >> 16;
    };

    int64_t getDocId ( void *key ) {
        uint64_t d = 0LL;
        d = ((unsigned char *)key)[11];
        d <<= 32;
        d |= *(uint32_t *)(((unsigned char *)key)+7);
        d >>= 2;
        return d;
        //int64_t d = ((key144_t *)key)->n2 & 0xffff;
        //d <<= 22;
        //d |= ((key144_t *)key)->n1 >> (32+8+2);
        //return d;
    };

    unsigned char getSiteRank ( void *key ) {
        return (((key144_t *)key)->n1 >> 37) & MAXSITERANK;
    };

    unsigned char getLangId ( void *key ) {
        if ( ((char *)key)[0] & 0x08 )
            return ((((key144_t *)key)->n1 >> 32) & 0x1f) | 0x20;
        else
            return ((((key144_t *)key)->n1 >> 32) & 0x1f) ;
    };

    unsigned char getHashGroup ( void *key ) {
        //return (((key144_t *)key)->n1 >> 10) & MAXHASHGROUP;
        return ((((unsigned char *)key)[3]) >>2) & MAXHASHGROUP;
    };

    int32_t getWordPos ( void *key ) {
        //return (((key144_t *)key)->n1 >> 14) & MAXWORDPOS;
        return (*((uint32_t *)((unsigned char *)key+2))) >> (8+6);
    };

    inline void setWordPos ( char *key , uint32_t wpos ) {
        // truncate
        wpos &= MAXWORDPOS;
        if ( wpos & 0x01 ) key[3] |= 0x40;
        else               key[3] &= ~((unsigned char)0x40);
        if ( wpos & 0x02 ) key[3] |= 0x80;
        else               key[3] &= ~((unsigned char)0x80);
        wpos >>= 2;
        key[4] = ((char *)&wpos)[0];
        key[5] = ((char *)&wpos)[1];
    };

    unsigned char getWordSpamRank ( void *key ) {
        //return (((key144_t *)key)->n1 >> 6) & MAXWORDSPAMRANK;
        return ((((uint16_t *)key)[1]) >>6) & MAXWORDSPAMRANK;
    };

    unsigned char getDiversityRank ( void *key ) {
        //return (((key144_t *)key)->n1 >> 2) & MAXDIVERSITYRANK;
        return ((((unsigned char *)key)[2]) >>2) & MAXDIVERSITYRANK;
    };

    unsigned char getIsSynonym ( void *key ) {
        return (((key144_t *)key)->n1 ) & 0x03;
    };

    unsigned char getIsHalfStopWikiBigram ( void *key ) {
        return ((char *)key)[2] & 0x01;
    };

    unsigned char getDensityRank ( void *key ) {
        return ((*(uint16_t *)key) >> 11) & MAXDENSITYRANK;
    };

    inline void setDensityRank ( char *key , unsigned char dr ) {
        // shift up
        dr <<= 3;
        // clear out
        key[1] &= 0x07;
        // or in
        key[1] |= dr;
    };

    char isShardedByTermId ( void *key ) { return ((char *)key)[1] & 0x01; };

    void setShardedByTermIdBit ( void *key ) {
        char *k = (char *)key;
        k[1] |= 0x01;
    };

    unsigned char getMultiplier ( void *key ) {
        return ((*(uint16_t *)key) >> 4) & MAXMULTIPLIER; };

    // . HACK: for sectionhash:xxxxx posdb keys
    // . we use the w,G,s,v and F bits
    uint32_t getFacetVal32 ( void *key ) {
        return *(uint32_t *)(((char *)key)+2); };
    void setFacetVal32 ( void *key , int32_t facetVal32 ) {
        *(uint32_t *)(((char *)key)+2) = facetVal32; };

    int64_t getTermFreq ( collnum_t collnum, int64_t termId ) ;

    //RdbCache *getCache ( ) { return &m_rdb.m_cache; };
    Rdb *getRdb ( ) { return &m_rdb; };

    Rdb m_rdb;

    //DiskPageCache *getDiskPageCache ( ) { return &m_pc; };
    //DiskPageCache m_pc;
};

class FacetEntry {
 public:
    // # of search results that have this value:
    int32_t m_count;
    // # of docs that have this value:
    int32_t m_outsideSearchResultsCount;
    int64_t m_docId;

    // cast as double/floats for floats:
    int64_t m_sum;
    int32_t m_max;
    int32_t m_min;
};

#define MAX_SUBLISTS 50

// . each QueryTerm has this attached additional info now:
// . these should be 1-1 with query terms, Query::m_qterms[]
class QueryTermInfo {
 public:
    class QueryTerm *m_qt;
    // the required lists for this query term, synonym lists, etc.
    RdbList  *m_subLists        [MAX_SUBLISTS];
    // flags to indicate if bigram list should be scored higher
    char      m_bigramFlags     [MAX_SUBLISTS];
    // shrinkSubLists() sets these:
    int32_t   m_newSubListSize  [MAX_SUBLISTS];
    char     *m_newSubListStart [MAX_SUBLISTS];
    char     *m_newSubListEnd   [MAX_SUBLISTS];
    char     *m_cursor          [MAX_SUBLISTS];
    char     *m_savedCursor     [MAX_SUBLISTS];
    // the corresponding QueryTerm for this sublist
    //class QueryTerm *m_qtermList [MAX_SUBLISTS];
    int32_t   m_numNewSubLists;
    // how many are valid?
    int32_t   m_numSubLists;
    // size of all m_subLists in bytes
    int64_t   m_totalSubListsSize;
    // the term freq weight for this term
    float     m_termFreqWeight;
    // what query term # do we correspond to in Query.h
    int32_t   m_qtermNum;
    // the word position of this query term in the Words.h class
    int32_t   m_qpos;
    // the wikipedia phrase id if we start one
    int32_t   m_wikiPhraseId;
    // the quoted phrase id this term or bigram is in
    int32_t   m_quotedStartId;
};

/*
#include "RdbList.h"

class PosdbList : public RdbList {

 public:

    // why do i have to repeat this for LinkInfo::set() calling our set()??
    void set ( char *list , int32_t listSize , bool ownData ) {
        RdbList::set ( list ,
                       listSize ,
                       list ,          // alloc
                       listSize ,      // alloc size
                       0 ,             // fixed data size
                       ownData ,
                       true ,          // use half keys?
                       sizeof(key_t)); // 12 bytes per key
    };

    // clear the low bits on the keys so terms are DELETED
    void clearDelBits ( );

    void print();

    // . these are made for special IndexLists, too
    // . getTermId() assumes a 12 byte key
    int64_t getCurrentTermId12 ( ) {
        return getTermId12 ( m_listPtr ); };
    int64_t getTermId12 ( char *rec ) {
        return (*(uint64_t *)(&rec[4])) >> 16 ;
    };
    int64_t getTermId16 ( char *rec ) {
        return (*(uint64_t *)(&rec[8])) >> 16 ;
    };
    // these 2 assume 12 and 6 byte keys respectively
    int64_t getCurrentDocId () {
        if ( isHalfBitOn ( m_listPtr ) ) return getDocId6 (m_listPtr);
        else                             return getDocId12(m_listPtr);
    };
    int64_t getDocId ( char *rec ) {
        if ( isHalfBitOn ( rec ) ) return getDocId6 (rec);
        else                       return getDocId12(rec);
    };
    int64_t getCurrentDocId12 ( ) {
        return getDocId12 ( m_listPtr ); };
    int64_t getDocId12 ( char *rec ) {
        return ((*(uint64_t *)(rec)) >> 2) & DOCID_MASK; };
    int64_t getDocId6 ( char *rec ) {
        int64_t docid;
        *(int32_t *)(&docid) = *(int32_t *)rec;
        ((char *)&docid)[4] = rec[4];
        docid >>= 2;
        return docid & DOCID_MASK;
    };
    // this works with either 12 or 6 byte keys
    unsigned char getCurrentScore ( ) {
        return getScore(m_listPtr); };
    unsigned char getScore ( char *rec ) { return ~rec[5]; };

    // uncomplemented...
    void setScore ( char *rec , char score ) { rec[5] = score; };

    // for date lists only...
    int32_t getCurrentDate ( ) { return ~*(int32_t *)(m_listPtr+6); };
};
*/

#include "Query.h" // MAX_QUERY_TERMS, qvec_t
|
|
|
|
// max # search results that can be viewed without using TopTree
|
|
//#define MAX_RESULTS 1000
|
|
|
|
class PosdbTable {
|
|
|
|
 public:

    // . returns false on error and sets errno
    // . "termFreqs" are 1-1 with q->m_qterms[]
    // . sets m_q to point to q
    void init ( Query *q ,
                char debug ,
                void *logstate ,
                class TopTree *topTree ,
                //char *coll ,
                collnum_t collnum ,
                //IndexList *lists ,
                //int32_t numLists ,
                class Msg2 *msg2 ,
                class Msg39Request *r );

    // pre-allocate m_whiteListTable
    bool allocWhiteListTable ( ) ;

    // pre-allocate memory since intersection runs in a thread
    bool allocTopTree ( );

    // . returns false on error and sets errno
    // . we assume there are "m_numTerms" lists passed in (see set() above)
    //void intersectLists_r ( );

    //void intersectLists9_r ( );

    void getTermPairScoreForNonBody ( int32_t i, int32_t j,
                                      char *wpi, char *wpj,
                                      char *endi, char *endj,
                                      int32_t qdist ,
                                      float *retMax );
    float getSingleTermScore ( int32_t i, char *wpi , char *endi,
                               class DocIdScore *pdcs,
                               char **bestPos );

    void evalSlidingWindow ( char **ptrs ,
                             int32_t nr ,
                             char **bestPos ,
                             float *scoreMatrix ,
                             int32_t advancedTermNum );
    float getTermPairScoreForWindow ( int32_t i, int32_t j,
                                      char *wpi,
                                      char *wpj,
                                      int32_t fixedDistance );

    float getTermPairScoreForAny ( int32_t i, int32_t j,
                                   char *wpi, char *wpj,
                                   char *endi, char *endj,
                                   class DocIdScore *pdcs );

    bool makeDocIdVoteBufForBoolQuery_r ( ) ;

    // some generic stuff
    PosdbTable();
    ~PosdbTable();
    void reset();

    // Msg39 needs to call these
    void freeMem ( ) ;

    // has init already been called?
    bool isInitialized ( ) { return m_initialized; };

    uint64_t m_docId;

    uint64_t m_docIdHack;

    bool m_hasFacetTerm;

    bool m_hasMaxSerpScore;

    // hack for seo.cpp:
    float m_finalScore;
    float m_preFinalScore;

    float m_siteRankMultiplier;

    // how long to add the last batch of lists
    int64_t m_addListsTime;
    int64_t m_t1 ;
    int64_t m_t2 ;

    int64_t m_estimatedTotalHits;

    int32_t m_errno;

    int32_t m_numSlots;

    int32_t m_maxScores;

    //char *m_coll;
    collnum_t m_collnum;

    int32_t *m_qpos;
    int32_t *m_wikiPhraseIds;
    int32_t *m_quotedStartIds;
    //class DocIdScore *m_ds;
    int32_t m_qdist;
    float *m_freqWeights;
    //int64_t *m_freqs;
    char *m_bflags;
    int32_t *m_qtermNums;
    float m_bestWindowScore;
    //char **m_finalWinners1;
    //char **m_finalWinners2;
    //float *m_finalScores;
    char **m_windowTermPtrs;

    // how many docs in the collection?
    int64_t m_docsInColl;

    //SectionStats m_sectionStats;
    //SafeBuf m_facetHashList;
    //HashTableX m_dt;

    class Msg2 *m_msg2;

    // if getting more than MAX_RESULTS results, use this top tree to hold
    // them rather than the m_top*[] arrays above
    class TopTree *m_topTree;

    //HashTableX m_docIdTable;

    SafeBuf m_scoreInfoBuf;
    SafeBuf m_pairScoreBuf;
    SafeBuf m_singleScoreBuf;

    SafeBuf m_stackBuf;

    //SafeBuf m_mergeBuf;

    // a reference to the query
    Query *m_q;
    int32_t m_nqt;

    // these are NOT in imap space, but in query term space, 1-1 with
    // Query::m_qterms[]
    //IndexList *m_lists;
    //int32_t m_numLists;

    // has init() been called?
    bool m_initialized;

    // are we in debug mode?
    char m_debug;

    // for debug msgs
    void *m_logstate;

    //int64_t m_numDocsInColl;

    class Msg39Request *m_r;

    // for gbsortby:item.price ...
    int32_t m_sortByTermNum;
    int32_t m_sortByTermNumInt;

    // fix core with these two
    int32_t m_sortByTermInfoNum;
    int32_t m_sortByTermInfoNumInt;

    // for gbmin:price:1.99
    int32_t m_minScoreTermNum;
    int32_t m_maxScoreTermNum;

    // for gbmin:price:1.99
    float m_minScoreVal;
    float m_maxScoreVal;

    // for gbmin:count:99
    int32_t m_minScoreTermNumInt;
    int32_t m_maxScoreTermNumInt;

    // for gbmin:count:99
    int32_t m_minScoreValInt;
    int32_t m_maxScoreValInt;

    // the new intersection/scoring algo
    void intersectLists10_r ( );

    HashTableX m_whiteListTable;
    bool m_useWhiteTable;
    bool m_addedSites;

    // sets stuff used by intersect10_r()
    bool setQueryTermInfo ( );

    void shrinkSubLists ( class QueryTermInfo *qti );

    int64_t countUniqueDocids( QueryTermInfo *qti ) ;

    // for intersecting docids
    void addDocIdVotes ( class QueryTermInfo *qti , int32_t listGroupNum );

    // for negative query terms...
    void rmDocIdVotes ( class QueryTermInfo *qti );

    // upper score bound
    float getMaxPossibleScore ( class QueryTermInfo *qti ,
                                int32_t bestDist ,
                                int32_t qdist ,
                                class QueryTermInfo *qtm ) ;

    // stuff set in the setQueryTermInfo() function:
    SafeBuf m_qiBuf;
    int32_t m_numQueryTermInfos;
    // the size of the smallest set of sublists. each sublist is
    // the main term or a synonym, etc. of the main term.
    int32_t m_minListSize;
    // which query term info has the smallest set of sublists
    int32_t m_minListi;
    // intersect docids from each QueryTermInfo into here
    SafeBuf m_docIdVoteBuf;

    int32_t m_filtered;

    // boolean truth table for boolean queries
    HashTableX m_bt;
    HashTableX m_ct;
    // size of the data slot in m_bt
    int32_t m_vecSize;

    // are all positive query terms in same wikipedia phrase like
    // 'time enough for love'?
    bool m_allInSameWikiPhrase;

    int32_t m_realMaxTop;
};

#define MAXDST 10

// distance used when measuring word from title/linktext/etc to word in body
#define FIXED_DISTANCE 400

class PairScore {
 public:
    float   m_finalScore;
    char    m_isSynonym1;
    char    m_isSynonym2;
    char    m_isHalfStopWikiBigram1;
    char    m_isHalfStopWikiBigram2;
    char    m_diversityRank1;
    char    m_diversityRank2;
    char    m_densityRank1;
    char    m_densityRank2;
    char    m_wordSpamRank1;
    char    m_wordSpamRank2;
    char    m_hashGroup1;
    char    m_hashGroup2;
    char    m_inSameWikiPhrase;
    char    m_fixedDistance;
    int32_t m_wordPos1;
    int32_t m_wordPos2;
    int64_t m_termFreq1;
    int64_t m_termFreq2;
    float   m_tfWeight1;
    float   m_tfWeight2;
    int32_t m_qtermNum1;
    int32_t m_qtermNum2;
    char    m_bflags1;
    char    m_bflags2;
    int32_t m_qdist;
};

class SingleScore {
 public:
    float   m_finalScore;
    char    m_isSynonym;
    char    m_isHalfStopWikiBigram;
    char    m_diversityRank;
    char    m_densityRank;
    char    m_wordSpamRank;
    char    m_hashGroup;
    int32_t m_wordPos;
    int64_t m_termFreq; // float m_termFreqWeight;
    float   m_tfWeight;
    int32_t m_qtermNum;
    char    m_bflags;
};

// we add up the pair scores of this many of the top-scoring pairs
// for inlink text only, so it is cumulative. but now we also
// have a parm "m_realMaxTop" which is <= MAX_TOP and can be used to
// tune this down.
#define MAX_TOP 10

// transparent query scoring info per docid
class DocIdScore {
 public:
    DocIdScore ( ) { reset(); }

    void reset ( ) {
        m_numPairs = m_numSingles = 0;
        m_pairsOffset = m_singlesOffset = -1;
        m_pairScores = NULL;
        m_singleScores = NULL;
    };

    // we use QueryChange::getDebugDocIdScore() to "deserialize" per se
    bool serialize ( class SafeBuf *sb );

    int64_t m_docId;
    // made this a double because of intScores which can't be captured
    // fully with a float. intScores are used to sort by spidered time
    // for example. see Posdb.cpp "intScore".
    double m_finalScore;
    char m_siteRank;
    int32_t m_docLang; // langId
    int32_t m_numRequiredTerms;

    int32_t m_numPairs;
    int32_t m_numSingles;

    // . m_pairScores is just all the term pairs serialized
    // . they contain the query term # of each term in the pair and
    //   they have the match number for each pair, since now each
    //   pair of query terms can have up to MAX_TOP associated pairs
    //   whose scores we add together to get the final score for that pair
    // . record offset into PosdbTable::m_pairScoreBuf
    // . Msg39Reply::ptr_pairScoreBuf will be this
    int32_t m_pairsOffset;
    // . record offset into PosdbTable.m_singleScoreBuf
    // . Msg39Reply::ptr_singleScoreBuf will be this
    int32_t m_singlesOffset;
    //PairScore m_pairScores [MAXDST][MAXDST][MAX_TOP];
    //SingleScore m_singleScores[MAXDST] [MAX_TOP];

    // Msg3a.cpp::mergeLists() should set these ptrs after it
    // copies over a top DocIdScore for storing the final results array
    class PairScore   *m_pairScores;
    class SingleScore *m_singleScores;
};

extern Posdb g_posdb;
extern Posdb g_posdb2;
extern RdbCache g_termFreqCache;

// . b-step into list looking for docid "docId"
// . assume p is start of list, excluding 6 byte of termid
inline char *getWordPosList ( int64_t docId , char *list , int32_t listSize ) {
    // make step divisible by 6 initially
    int32_t step = (listSize / 12) * 6;
    // shortcut
    char *listEnd = list + listSize;
    // divide in half
    char *p = list + step;
    // for detecting not founds
    char count = 0;
 loop:
    // save it
    char *origp = p;
    // scan up to docid. we use this special bit to distinguish between
    // 6-byte and 12-byte posdb keys
    for ( ; p > list && (p[1] & 0x02) ; p -= 6 );
    // ok, we hit a 12 byte key i guess, so backup 6 more
    p -= 6;
    // ok, we got a 12-byte key then i guess
    int64_t d = g_posdb.getDocId ( p );
    // we got a match, but it might be a NEGATIVE key so
    // we have to try to find the positive keys in that case
    if ( d == docId ) {
        // if it's positive, no need to do anything else
        if ( (p[0] & 0x01) == 0x01 ) return p;
        // ok, it's negative, try to see if the positive is
        // in here, if not then return NULL.
        // save current pos
        char *current = p;
        // back up to 6 byte key before this 12 byte key
        p -= 6;
        // now go backwards to previous 12 byte key
        for ( ; p > list && (p[1] & 0x02) ; p -= 6 );
        // ok, we hit a 12 byte key i guess, so backup 6 more
        p -= 6;
        // is it there?
        if ( p >= list && g_posdb.getDocId(p) == docId ) {
            // sanity. return NULL if it's negative! wtf????
            if ( (p[0] & 0x01) == 0x00 ) return NULL;
            // got it
            return p;
        }
        // ok, no positive before us, try after us
        p = current;
        // advance over current 12 byte key
        p += 12;
        // now go forwards to next 12 byte key
        for ( ; p < listEnd && (p[1] & 0x02) ; p += 6 );
        // is it there?
        if ( p + 12 < listEnd && g_posdb.getDocId(p) == docId ) {
            // sanity. return NULL if it's negative! wtf????
            if ( (p[0] & 0x01) == 0x00 ) return NULL;
            // got it
            return p;
        }
        // . crap, i guess we just had a single negative docid then
        // . return that and the caller will see it's negative
        return current;
    }
    // reduce step
    //step /= 2;
    step >>= 1;
    // . make divisible by 6!
    // . TODO: speed this up!!!
    step = step - (step % 6);
    // sanity
    if ( step % 6 ) { char *xx=NULL;*xx=0; }
    // ensure never 0
    if ( step <= 0 ) {
        step = 6;
        // return NULL if not found
        if ( count++ >= 2 ) return NULL;
    }
    // go up or down then
    if ( d < docId ) {
        p = origp + step;
        if ( p > listEnd ) p = listEnd - 6;
    }
    else {
        p = origp - step;
        if ( p < list ) p = list;
    }
    // and repeat
    goto loop;
}

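// Illustrative usage sketch (hypothetical "list"/"listSize" holding one
// raw posdb termlist): locate the keys for a given docid.
//
//   char *p = getWordPosList ( docId , list , listSize );
//   // p is NULL if the docid was not found; otherwise it points at that
//   // docid's docid-bearing key (check p[0] & 0x01 for positive vs.
//   // negative), and the 6-byte word-position keys for it follow.
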
#endif