facet updates

This commit is contained in:
mwells
2014-06-28 10:26:08 -06:00
parent 222a454d67
commit 7bd37dfaa2
8 changed files with 114 additions and 33 deletions

@ -1400,7 +1400,10 @@ void Msg39::estimateHitsAndSendReply ( ) {
for ( long i = 0 ; i < m_tmpq.m_numTerms; i++ ) {
QueryTerm *qt = &m_tmpq.m_qterms[i];
// skip if not facet
if ( qt->m_fieldCode != FIELD_GBFACET ) continue;
if ( qt->m_fieldCode != FIELD_GBFACETSTR &&
qt->m_fieldCode != FIELD_GBFACETINT &&
qt->m_fieldCode != FIELD_GBFACETFLOAT )
continue;
HashTableX *ft = &qt->m_facetHashTable;
if ( ft->m_numSlotsUsed == 0 ) continue;
long used = ft->m_numSlotsUsed;
@ -1430,7 +1433,10 @@ void Msg39::estimateHitsAndSendReply ( ) {
for ( long i = 0 ; i < m_tmpq.m_numTerms ; i++ ) {
QueryTerm *qt = &m_tmpq.m_qterms[i];
// skip if not facet
if ( qt->m_fieldCode != FIELD_GBFACET ) continue;
if ( qt->m_fieldCode != FIELD_GBFACETSTR &&
qt->m_fieldCode != FIELD_GBFACETINT &&
qt->m_fieldCode != FIELD_GBFACETFLOAT )
continue;
// get all the facet hashes and their counts
HashTableX *ft = &qt->m_facetHashTable;
// skip if none
@ -1440,6 +1446,9 @@ void Msg39::estimateHitsAndSendReply ( ) {
p += 8;
long used = ft->getNumSlotsUsed();
if ( used > (long)MAX_FACETS ) used = (long)MAX_FACETS;
// store count
*(long *)p = used;
p += 4;
long count = 0;
// for sanity check
char *pend = p + (used * 8);
@ -1460,7 +1469,7 @@ void Msg39::estimateHitsAndSendReply ( ) {
}
// now point to that so it can be serialized below
mr.ptr_facetHashList = tmp.getBufStart();
mr.size_facetHashList = tmp.length();
mr.size_facetHashList = p - tmp.getBufStart();//tmp.length();
/////////////
//

@ -963,9 +963,14 @@ bool Msg3a::mergeLists ( ) {
// facethashlists from each shard into
//long long tid = m_q->m_qterms[i].m_termId;
// we hold all the facet values
if ( ! qt->m_facetHashTable.set(4,0,128,NULL,0,false,
m_r->m_niceness,"fhtqt"))
HashTableX *ht = &qt->m_facetHashTable;
// we have to manually call this
ht->constructor();
// 4 byte key, 4 byte score for counting facet values
if ( ! ht->set(4,4,128,NULL,0,false,m_r->m_niceness,"fhtqt"))
return true;
// sanity
if ( ! ht->m_isWritable ) {char *xx=NULL;*xx=0;}
}
// now scan each facethashlist from each shard and compile into
@ -996,6 +1001,12 @@ bool Msg3a::mergeLists ( ) {
p += 4;
// get that query term
QueryTerm *qt = m_q->getQueryTermByTermId64 ( termId );
// sanity
if ( ! qt ) {
log("msg3a: query: could not find query term with "
"termid %llu for facet",termId);
break;
}
// the end point
char *pend = p + 4 * nh;
// now compile the facet hash list into there
@ -1003,7 +1014,7 @@ bool Msg3a::mergeLists ( ) {
// debug
//log("msg3a: got facethash %li) %lu",k,p[k]);
// hash it up, no dups!
if ( ! qt->m_facetHashTable.addScore((long *)p) )
if ( ! qt->m_facetHashTable.addScore((long *)p,1) )
return true;
}
// now get the next gbfacet: term if there was one

@ -2131,7 +2131,11 @@ bool printSearchResultsHeader ( State0 *st ) {
// only for html for now i guess
if ( si->m_format != FORMAT_HTML ) break;
QueryTerm *qt = &si->m_q.m_qterms[i];
if ( qt->m_fieldCode != FIELD_GBFACET ) continue;
// skip if not facet
if ( qt->m_fieldCode != FIELD_GBFACETSTR &&
qt->m_fieldCode != FIELD_GBFACETINT &&
qt->m_fieldCode != FIELD_GBFACETFLOAT )
continue;
HashTableX *fht = &qt->m_facetHashTable;
// a new table for each facet query term
bool needTable = true;

@ -796,7 +796,10 @@ bool PosdbTable::allocTopTree ( ) {
for ( long i = 0 ; i < m_q->m_numTerms ; i++ ) {
QueryTerm *qt = &m_q->m_qterms[i];
// skip if not facet
if ( qt->m_fieldCode != FIELD_GBFACET ) continue;
if ( qt->m_fieldCode != FIELD_GBFACETSTR &&
qt->m_fieldCode != FIELD_GBFACETINT &&
qt->m_fieldCode != FIELD_GBFACETFLOAT )
continue;
// how big?
long long total = m_msg2->m_lists[i].getListSize();
// skip if empty
@ -4309,7 +4312,11 @@ bool PosdbTable::setQueryTermInfo ( ) {
qti->m_bigramFlags[nn]|=BF_NUMBER;
if (qt->m_fieldCode == FIELD_GBFACET )
if (qt->m_fieldCode == FIELD_GBFACETSTR )
qti->m_bigramFlags[nn]|=BF_FACET;
if (qt->m_fieldCode == FIELD_GBFACETINT )
qti->m_bigramFlags[nn]|=BF_FACET;
if (qt->m_fieldCode == FIELD_GBFACETFLOAT )
qti->m_bigramFlags[nn]|=BF_FACET;
// add list of member terms
@ -6867,7 +6874,10 @@ void PosdbTable::intersectLists10_r ( ) {
// QueryTerm::m_facetHashTable/m_dt
for ( long i = 0 ; i < m_q->m_numTerms ; i++ ) {
QueryTerm *qt = &m_q->m_qterms[i];
if ( qt->m_fieldCode != FIELD_GBFACET ) continue;
if ( qt->m_fieldCode != FIELD_GBFACETSTR &&
qt->m_fieldCode != FIELD_GBFACETINT &&
qt->m_fieldCode != FIELD_GBFACETFLOAT )
continue;
char *p = miniMergedList[i];
//char *pend = miniMergedEnd [i];
//

@ -2280,7 +2280,9 @@ bool Query::setQWords ( char boolFlag ,
fieldCode == FIELD_GBREVSORTBYINT ||
fieldCode == FIELD_GBNUMBERMININT ||
fieldCode == FIELD_GBNUMBERMAXINT ||
fieldCode == FIELD_GBFACET ||
fieldCode == FIELD_GBFACETSTR ||
fieldCode == FIELD_GBFACETINT ||
fieldCode == FIELD_GBFACETFLOAT ||
fieldCode == FIELD_GBAD ) {
// . find 1st space -- that terminates the field value
@ -3232,8 +3234,22 @@ struct QueryField g_fields[] = {
"spidered in seconds since the epoch in UTC."
},
{"gbfacet", FIELD_GBFACET, false,
"Example: 'gbfacet:price' will return facets in the search results "
// Help entries for the three typed facet operators. Each example and each
// closing sentence names the operator that the entry itself registers, so
// the help text matches what the user actually has to type.
{"gbfacetstr", FIELD_GBFACETSTR, false,
"Example: 'gbfacetstr:color' will return facets in the search results "
"by their color field. Any other "
"field name can follow the gbfacetstr: operator."
},
{"gbfacetint", FIELD_GBFACETINT, false,
"Example: 'gbfacetint:numReviews' will return "
"facets in the search results "
"with the # of documents for each number of reviews. Any other "
"field name can follow the gbfacetint: operator."
},
{"gbfacetfloat", FIELD_GBFACETFLOAT, false,
"Example: 'gbfacetfloat:price' will return facets in the "
"search results "
"with the # of documents that have certain price ranges. Any other "
"field name can follow the gbfacetfloat: operator."
},

@ -121,7 +121,9 @@ typedef unsigned long long qvec_t;
// Numeric field codes compared against QueryTerm::m_fieldCode / fieldCode.
#define FIELD_GBREVSORTBYINT 60
#define FIELD_GBNUMBERMININT 61
#define FIELD_GBNUMBERMAXINT 62
// NOTE(review): FIELD_GBFACET and FIELD_GBFACETSTR both expand to 63 here.
#define FIELD_GBFACET 63
// typed facet field codes: one each for string, int and float facets
#define FIELD_GBFACETSTR 63
#define FIELD_GBFACETINT 64
#define FIELD_GBFACETFLOAT 65
#define FIELD_GBOTHER 92

@ -6620,7 +6620,7 @@ SectionStats *XmlDoc::getSectionStats ( long long secHash64 , long sentHash32){
// the statistics for all the values in the posdb keys of this
// termlist, which happen to be innerHTML hashes for all pages
// with this same xpath and on this same site.
sprintf(qbuf,"gbfacet:gbxpathsitehash%llu",secHash64);
sprintf(qbuf,"gbfacetstr:gbxpathsitehash%llu",secHash64);
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
@ -26707,7 +26707,9 @@ bool XmlDoc::hashSections ( HashTableX *tt ) {
hi.m_prefix = prefix;
// we already have the hash of the inner html of the section
hashFacet2 ( prefix, (long)(unsigned long)ih64 , hi.m_tt );
hashFacet2 ( "gbfacetstr",
prefix,
(long)(unsigned long)ih64 , hi.m_tt );
}
return true;
@ -31645,10 +31647,27 @@ bool XmlDoc::hashFacet1 ( char *term ,
// hash the whole string as one value, the value of the facet
long val32 = hash32 ( a , b - a );
return hashFacet2 ( term, val32 , tt );
if ( ! hashFacet2 ( "gbfacetstr",term, val32 , tt ) ) return false;
// if it's a number hash as float and int
if ( nw != 1 ) return true;
char **wptrs = words->m_words;
if ( ! is_digit ( wptrs[0][0] ) ) return true;
// hash with a float val
float f = atof(wptrs[0]);
long vf32 = *(long *)&f;
if ( ! hashFacet2 ( "gbfacetfloat",term, vf32 , tt ) ) return false;
// and an int val
long vi32 = atoi(wptrs[0]);
if ( ! hashFacet2 ( "gbfacetint",term, vi32 , tt ) ) return false;
return true;
}
bool XmlDoc::hashFacet2 ( char *term ,
bool XmlDoc::hashFacet2 ( char *prefix,
char *term ,
long val32 ,
HashTableX *tt ) {
@ -31664,15 +31683,16 @@ bool XmlDoc::hashFacet2 ( char *term ,
// now any field has to support gbfacet:thatfield
// and store the 32-bit termid into where we normally put
// the word position bits, etc.
static long long s_facetPrefixHash = 0LL;
if ( ! s_facetPrefixHash )
s_facetPrefixHash = hash64n ( "gbfacet" );
//static long long s_facetPrefixHash = 0LL;
//if ( ! s_facetPrefixHash )
// s_facetPrefixHash = hash64n ( "gbfacet" );
long long prefixHash = hash64n ( prefix );
long long termId64 = hash64n ( term );
// combine with the "gbfacet" prefix. old prefix hash on right.
// like "price" on right.
long long ph2 = hash64 ( s_facetPrefixHash , termId64);//prefixHash );
// like "price" on left and "gbfacetfloat" on right... see Query.cpp
long long ph2 = hash64 ( termId64, prefixHash );
// . now store it
// . use field hash as the termid. normally this would just be
@ -31730,21 +31750,28 @@ bool XmlDoc::hashFacet2 ( char *term ,
if ( ! m_wts )
return true;
// store in buffer
bool isFloat = false;
if ( strcmp(prefix,"gbfacetfloat")==0 ) isFloat = true;
// store in buffer for display on pageparser.cpp output
char buf[128];
long bufLen = sprintf(buf,"facetField=%s facetVal32=%lu",
term,val32);
long bufLen;
if ( isFloat )
bufLen=sprintf(buf,"facetField=%s facetVal32=%f",term,
*(float *)&val32);
else
bufLen=sprintf(buf,"facetField=%s facetVal32=%lu",term,val32);
// make a special hashinfo for this facet
HashInfo hi;
hi.m_tt = tt;
hi.m_prefix = "gbfacet";
hi.m_prefix = prefix;//"gbfacet";
// add to wts for PageParser.cpp display
// store it
if ( ! storeTerm ( buf,
bufLen,
s_facetPrefixHash,
prefixHash, // s_facetPrefixHash,
&hi,
0, // word#, i,
0, // wordPos
@ -31836,7 +31863,7 @@ bool XmlDoc::hashNumber ( char *beginBuf ,
// . CHROME DETECTION
// . hash a special "gbxpathsitehash12345678" term which has the hash of the
// innerHTML content embedded in it.
// . we do this for doing gbfacet:gbxpathsitehash12345678 etc. on every
// . we do this for doing gbfacetstr:gbxpathsitehash12345678 etc. on every
// section with innerHTML so we can figure out the histogram of each
// section on this page relative to its subdomain. like the distribution
// of the innerHTML for this section as it appears on other pages from
@ -33369,7 +33396,9 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
prefix = start + tp[i]->m_prefixOff;
bool isFacet = false;
if ( prefix && prefix[0]=='g' && strcmp(prefix,"gbfacet")== 0 )
if ( prefix &&
prefix[0]=='g' &&
strncmp(prefix,"gbfacet",7)== 0 )
isFacet = true;
sb->safePrintf ( "<tr>"
@ -33501,7 +33530,7 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
// there is no prefix for such terms now
// TODO: store actual key in there i guess?? or just this bit.
long val32 = 0;
if ( strcmp(prefix,"gbfacet") == 0 )
if ( strncmp(prefix,"gbfacet",7) == 0 )
val32 = g_posdb.getInt(&tp[i]->m_key);
// . this is like gbxpathsitehash1234567

@ -890,9 +890,9 @@ class XmlDoc {
// class HashInfo *hi ,
// long sentHash32 ) ;
bool hashFacet1 ( char *prefix, class Words *words , HashTableX *dt) ;
bool hashFacet1 ( char *term, class Words *words , HashTableX *dt) ;
bool hashFacet2 ( char *prefix, long val32 , HashTableX *dt) ;
bool hashFacet2 ( char *prefix,char *term,long val32, HashTableX *dt) ;
bool hashNumber ( char *beginBuf ,