Merge branch 'diffbot-testing' into diffbot-sam
This commit is contained in:
commit
da41d53575
Collectiondb.cppHashTableX.cppHighlight.cppHighlight.hHttpServer.cppMakefileMatches.cppMatches.hMem.cppMem.hMsg2.cppMsg2.hMsg39.cppMsg39.hMsg3a.cppMsg3a.hMsg40.cppMsg40.hPageGet.cppPageResults.cppPageResults.hPageRoot.cppPageStats.cppPages.cppParms.cppPosdb.cppPosdb.hPostQueryRerank.hQuery.cppQuery.hRdbDump.cppRdbList.cppRdbMap.cppRdbTree.cppSafeBuf.cppSafeBuf.hSearchInput.cppSections.cppSpider.cppSpider.hStats.cppStats.hSummary.cppSummary.hSynonyms.cppTagdb.cppXmlDoc.cpphash.cpphash.hmain.cpp
@ -3579,7 +3579,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
|
||||
// lower from 7 to 1 since we have so many collections now
|
||||
// ok, now we have much less colls so raise back to 7
|
||||
int32_t diffbotipms = 7;// 1; // 7
|
||||
int32_t diffbotipms = 7;//1; // 7
|
||||
|
||||
// make the gigablast regex table just "default" so it does not
|
||||
// filtering, but accepts all urls. we will add code to pass the urls
|
||||
@ -3599,8 +3599,8 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
m_spiderIpWaits [i] = wait;
|
||||
m_spiderIpMaxSpiders[i] = diffbotipms; // keep it respectful
|
||||
// ethan wants some speed
|
||||
if ( isEthan )
|
||||
m_spiderIpMaxSpiders[i] = 30;
|
||||
// if ( isEthan )
|
||||
// m_spiderIpMaxSpiders[i] = 30;
|
||||
//m_spidersEnabled [i] = 1;
|
||||
m_spiderFreqs [i] = respiderFreq;
|
||||
//m_spiderDiffbotApiUrl[i].purge();
|
||||
@ -3623,6 +3623,18 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
m_forceDelete [i] = 1;
|
||||
i++;
|
||||
|
||||
// de-prioritize fakefirstip urls so we don't give the impression our
|
||||
// spiders are slow. like if someone adds a bulk job with 100,000 urls
|
||||
// then we sit there and process to lookup their ips and add a real
|
||||
// spider request (if it falls onto the same shard) before we actually
|
||||
// do any real spidering. so keep the priority here low.
|
||||
m_regExs[i].set("isfakeip");
|
||||
m_maxSpidersPerRule [i] = 7;
|
||||
m_spiderIpMaxSpiders [i] = 7;
|
||||
m_spiderPriorities [i] = 20;
|
||||
m_spiderIpWaits [i] = 0;
|
||||
i++;
|
||||
|
||||
// hopcount filter if asked for
|
||||
if( m_diffbotMaxHops >= 0 ) {
|
||||
|
||||
|
@ -18,6 +18,8 @@ void HashTableX::constructor() {
|
||||
m_useKeyMagic = false;
|
||||
m_ks = 0;
|
||||
m_allowGrowth = true;
|
||||
m_numSlots = 0;
|
||||
m_numSlotsUsed = 0;
|
||||
}
|
||||
|
||||
void HashTableX::destructor() {
|
||||
|
@ -160,8 +160,8 @@ int32_t Highlight::set ( SafeBuf *sb ,
|
||||
// . set the anchor counts to 1000*i+1 for each possible query term num
|
||||
// . yes, i know, why +1? because we're assuming the query terms
|
||||
// have been highlighted before us
|
||||
for ( int32_t i = 0 ; i < MAX_QUERY_TERMS ; i++ )
|
||||
m_anchorCounts[i] = 1000*i + 1;
|
||||
//for ( int32_t i = 0 ; i < MAX_QUERY_TERMS ; i++ )
|
||||
// m_anchorCounts[i] = 1000*i + 1;
|
||||
// set lengths of provided front/back highlight tags
|
||||
if ( m_frontTag ) m_frontTagLen = gbstrlen ( frontTag );
|
||||
if ( m_backTag ) m_backTagLen = gbstrlen ( backTag );
|
||||
@ -170,6 +170,10 @@ int32_t Highlight::set ( SafeBuf *sb ,
|
||||
//m_bufLen = bufLen;
|
||||
//m_bufPtr = buf;
|
||||
m_sb = sb;
|
||||
|
||||
// label it
|
||||
m_sb->setLabel ("highw");
|
||||
|
||||
// save room for terminating \0
|
||||
//m_bufEnd = m_buf + m_bufLen - 1;
|
||||
|
||||
|
@ -70,7 +70,7 @@ class Highlight {
|
||||
bool m_doStemming;
|
||||
|
||||
bool m_useAnchors; // click and scroll technology for cached pages
|
||||
int32_t m_anchorCounts [ MAX_QUERY_TERMS ];
|
||||
//int32_t m_anchorCounts [ MAX_QUERY_TERMS ];
|
||||
const char *m_baseUrl;
|
||||
|
||||
int32_t m_numMatches;
|
||||
|
@ -1514,6 +1514,10 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
|
||||
// is recycled/destroyed
|
||||
// . this will call getMsgPiece() to fill up sendBuf from file
|
||||
int32_t totalToSend = mimeLen + bytesToSend;
|
||||
|
||||
//s->m_state = NULL; // do we need this? yes, cuz s is NULL for cleanUp
|
||||
if ( s && s->m_state == f ) s->m_state = NULL;
|
||||
|
||||
//if ( ! m_tcp.sendMsg ( s ,
|
||||
if ( ! tcp->sendMsg ( s ,
|
||||
sendBuf ,
|
||||
@ -1542,7 +1546,6 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
|
||||
if ( ! f->isOpen() ) f->open( O_RDONLY );
|
||||
int fd = f->getfd();
|
||||
cleanUp ( f , NULL/*TcpSocket */ );
|
||||
s->m_state = NULL; // do we need this? yes, cuz s is NULL for cleanUp
|
||||
// . AND we need to do this ourselves here
|
||||
// . do it SILENTLY so no message is logged if the fd is not registered
|
||||
if (tcp->m_useSSL)
|
||||
|
2
Makefile
2
Makefile
@ -10,7 +10,7 @@ CC=g++
|
||||
|
||||
# remove dlstubs.o for CYGWIN
|
||||
OBJS = UdpSlot.o Rebalance.o \
|
||||
Msg13.o Mime.o IndexReadInfo.o \
|
||||
Msg13.o Mime.o \
|
||||
PageGet.o PageHosts.o \
|
||||
PageParser.o PageInject.o PagePerf.o PageReindex.o PageResults.o \
|
||||
PageAddUrl.o PageRoot.o PageSockets.o PageStats.o \
|
||||
|
34
Matches.cpp
34
Matches.cpp
@ -24,10 +24,24 @@
|
||||
// . constructor
// . the query word flag buffer starts out unallocated; it must be NULL
//   before the reset() call below because reset() inspects it
Matches::Matches ( ) {
	// no per-query flag buffer yet
	m_qwordAllocSize   = 0;
	m_qwordFlags       = NULL;
	// nothing matched, sub-phrase detection off by default
	m_numMatchGroups   = 0;
	m_detectSubPhrases = false;
	// clear out the rest of the state
	reset();
}
|
||||
// destructor: all cleanup is delegated to reset()
Matches::~Matches ( ) {
	reset();
}
|
||||
void Matches::reset ( ) {
|
||||
reset2();
|
||||
if ( m_qwordFlags && m_qwordFlags != (mf_t *)m_tmpBuf ) {
|
||||
mfree ( m_qwordFlags , m_qwordAllocSize , "mmqw" );
|
||||
m_qwordFlags = NULL;
|
||||
}
|
||||
//m_explicitsMatched = 0;
|
||||
//m_matchableRequiredBits = 0;
|
||||
//m_hasAllQueryTerms = false;
|
||||
//m_matchesQuery = false;
|
||||
}
|
||||
|
||||
void Matches::reset2() {
|
||||
m_numMatches = 0;
|
||||
//m_maxNQT = -1;
|
||||
m_numAlnums = 0;
|
||||
@ -39,10 +53,6 @@ void Matches::reset ( ) {
|
||||
m_bitsArray [i].reset();
|
||||
}
|
||||
m_numMatchGroups = 0;
|
||||
//m_explicitsMatched = 0;
|
||||
//m_matchableRequiredBits = 0;
|
||||
//m_hasAllQueryTerms = false;
|
||||
//m_matchesQuery = false;
|
||||
}
|
||||
|
||||
bool Matches::isMatchableTerm ( QueryTerm *qt ) { // , int32_t i ) {
|
||||
@ -103,6 +113,20 @@ void Matches::setQuery ( Query *q ) {
|
||||
|
||||
//memset ( m_foundNegTermVector, 0, m_q->getNumTerms() );
|
||||
|
||||
if ( m_qwordFlags ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
int32_t need = m_q->m_numWords * sizeof(mf_t) ;
|
||||
m_qwordAllocSize = need;
|
||||
if ( need < 128 )
|
||||
m_qwordFlags = (mf_t *)m_tmpBuf;
|
||||
else
|
||||
m_qwordFlags = (mf_t *)mmalloc ( need , "mmqf" );
|
||||
|
||||
if ( ! m_qwordFlags ) {
|
||||
log("matches: alloc failed for query %s",q->m_orig);
|
||||
return;
|
||||
}
|
||||
|
||||
// this is word based. these are each 1 byte
|
||||
memset ( m_qwordFlags , 0 , m_q->m_numWords * sizeof(mf_t));
|
||||
|
||||
@ -278,7 +302,7 @@ bool Matches::set ( XmlDoc *xd ,
|
||||
int32_t niceness ) {
|
||||
|
||||
// don't reset query info!
|
||||
reset();
|
||||
reset2();
|
||||
|
||||
// sanity check
|
||||
if ( ! xd->m_docIdValid ) { char *xx=NULL;*xx=0; }
|
||||
|
@ -142,6 +142,7 @@ class Matches {
|
||||
Matches ( ) ;
|
||||
~Matches( ) ;
|
||||
void reset ( ) ;
|
||||
void reset2 ( ) ;
|
||||
|
||||
// BIG HACK support
|
||||
//int32_t getTermsFound ( bool *hadPhrases , bool *hadWords );
|
||||
@ -183,7 +184,10 @@ class Matches {
|
||||
|
||||
// . 1-1 with Query::m_qwords[] array of QWords
|
||||
// . shows the match flags for that query word
|
||||
mf_t m_qwordFlags[MAX_QUERY_WORDS];
|
||||
//mf_t m_qwordFlags[MAX_QUERY_WORDS];
|
||||
mf_t *m_qwordFlags;
|
||||
int32_t m_qwordAllocSize;
|
||||
char m_tmpBuf[128];
|
||||
|
||||
//stuff for detecting whether a match is part of a larger phrase
|
||||
void setSubPhraseDetection();
|
||||
|
5
Mem.cpp
5
Mem.cpp
@ -530,6 +530,11 @@ void Mem::addMem ( void *mem , int32_t size , const char *note , char isnew ) {
|
||||
|
||||
//validate();
|
||||
|
||||
// if ( note && note[0] == 'S' && note[1] == 'a' &&
|
||||
// note[2] == 'f' && size == 13371521 )
|
||||
// log("mem: got mystery safebuf");
|
||||
|
||||
|
||||
//m_memtablesize = 0;//DMEMTABLESIZE;
|
||||
// 4G/x = 600*1024 -> x = 4000000000.0/(600*1024) = 6510
|
||||
// crap, g_hostdb.init() is called in main.cpp before
|
||||
|
14
Mem.h
14
Mem.h
@ -280,6 +280,20 @@ inline int32_t getNumBitsOn64 ( uint64_t bits ) {
|
||||
g_a [ *((unsigned char *)(&bits) + 7) ] ;
|
||||
}
|
||||
|
||||
inline int32_t getNumBitsOnX ( unsigned char *s , int32_t slen ) {
|
||||
if ( slen == 1 ) return getNumBitsOn8 ( *s );
|
||||
if ( slen == 2 ) return getNumBitsOn16 ( *(uint16_t *)s );
|
||||
if ( slen == 4 ) return getNumBitsOn32 ( *(uint32_t *)s );
|
||||
if ( slen == 3 )
|
||||
return getNumBitsOn8 ( s[0] ) +
|
||||
getNumBitsOn8 ( s[1] ) +
|
||||
getNumBitsOn8 ( s[2] ) ;
|
||||
int32_t total = 0;
|
||||
for ( int32_t i = 0 ; i < slen ; i++ )
|
||||
total += getNumBitsOn8 ( s[i] );
|
||||
return total;
|
||||
}
|
||||
|
||||
// assume only one bit is set for this (used by Address.cpp)
|
||||
inline int32_t getBitPosLL ( uint8_t *bit ) {
|
||||
// which int32_t is it in?
|
||||
|
6
Msg2.cpp
6
Msg2.cpp
@ -98,7 +98,7 @@ bool Msg2::getLists ( int32_t rdbId ,
|
||||
// set this
|
||||
m_numLists = m_query->m_numTerms;
|
||||
// make sure not too many lists being requested
|
||||
if ( m_numLists > MAX_NUM_LISTS ) {g_errno=ETOOMANYLISTS; return true;}
|
||||
//if(m_numLists > MAX_NUM_LISTS ) {g_errno=ETOOMANYLISTS; return true;}
|
||||
// clear them all
|
||||
//for ( int32_t i = 0 ; i < m_numLists ; i++ ) {
|
||||
// m_inProgress[i] = true;
|
||||
@ -133,7 +133,7 @@ bool Msg2::getLists ( ) {
|
||||
// . make slots for all
|
||||
for ( ; m_i < m_numLists ; m_i++ ) {
|
||||
// sanity for Msg39's sake. do not breach m_lists[].
|
||||
if ( m_i >= MAX_QUERY_TERMS ) { char *xx=NULL;*xx=0; }
|
||||
if ( m_i >= ABS_MAX_QUERY_TERMS ) { char *xx=NULL;*xx=0; }
|
||||
// if any had error, forget the rest. do not launch any more
|
||||
if ( m_errno ) break;
|
||||
// skip if already did it
|
||||
@ -413,6 +413,8 @@ bool Msg2::getLists ( ) {
|
||||
// mem. we should also report the size of each termlist
|
||||
// in bytes in the query info header.
|
||||
//int32_t minRecSizes = DEFAULT_POSDB_READSIZE;
|
||||
// MDW TODO fix this later we go oom too easily for queries
|
||||
// like 'www.disney.nl'
|
||||
int32_t minRecSizes = -1;
|
||||
|
||||
// start up the read. thread will wait in thread queue to
|
||||
|
5
Msg2.h
5
Msg2.h
@ -7,9 +7,10 @@
|
||||
#include "Msg0.h"
|
||||
|
||||
/** define the max # of lists you can get as the max # of query terms for now */
|
||||
#define MAX_NUM_LISTS MAX_QUERY_TERMS
|
||||
//#define MAX_NUM_LISTS MAX_QUERY_TERMS
|
||||
/** how many outstanding msg5 requests at one time? */
|
||||
#define MSG2_MAX_REQUESTS MAX_QUERY_TERMS
|
||||
//#define MSG2_MAX_REQUESTS MAX_QUERY_TERMS
|
||||
#define MSG2_MAX_REQUESTS 32
|
||||
/** support the &sites=xyz.com+abc.com+... to restrict search results to provided sites.*/
|
||||
#define MAX_WHITELISTS 500
|
||||
|
||||
|
37
Msg39.cpp
37
Msg39.cpp
@ -34,6 +34,10 @@ Msg39::Msg39 () {
|
||||
reset();
|
||||
}
|
||||
|
||||
// destructor: all cleanup is delegated to reset()
Msg39::~Msg39 () { reset(); }
|
||||
|
||||
void Msg39::reset() {
|
||||
if ( m_inUse ) { char *xx=NULL;*xx=0; }
|
||||
m_allocedTree = false;
|
||||
@ -46,8 +50,16 @@ void Msg39::reset() {
|
||||
|
||||
void Msg39::reset2() {
|
||||
// reset lists
|
||||
for ( int32_t j = 0 ; j < m_msg2.m_numLists ; j++ )
|
||||
m_lists[j].freeList();
|
||||
int32_t nqt = m_stackBuf.getLength() / sizeof(RdbList);
|
||||
//for ( int32_t j = 0 ; j < m_msg2.m_numLists && m_lists ; j++ ) {
|
||||
for ( int32_t j = 0 ; j < nqt && m_lists ; j++ ) {
|
||||
//m_lists[j].freeList();
|
||||
//log("msg39: destroy list @ 0x%"PTRFMT,(PTRTYPE)&m_lists[j]);
|
||||
// same thing but more generic
|
||||
m_lists[j].destructor();
|
||||
}
|
||||
m_stackBuf.purge();
|
||||
m_lists = NULL;
|
||||
m_msg2.reset();
|
||||
m_posdbTable.reset();
|
||||
m_callback = NULL;
|
||||
@ -205,7 +217,8 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
|
||||
if ( ! m_tmpq.set2 ( m_r->ptr_query ,
|
||||
m_r->m_language ,
|
||||
m_r->m_queryExpansion ,
|
||||
m_r->m_useQueryStopWords ) ) {
|
||||
m_r->m_useQueryStopWords ,
|
||||
m_r->m_maxQueryTerms ) ) {
|
||||
log("query: msg39: setQuery: %s." ,
|
||||
mstrerror(g_errno) );
|
||||
sendReply ( m_slot , this , NULL , 0 , 0 , true );
|
||||
@ -223,11 +236,14 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
|
||||
if ( m_tmpq.getNumTerms() != m_r->m_nqt ) {
|
||||
g_errno = EBADENGINEER;
|
||||
log("query: Query parsing inconsistency for q=%s. "
|
||||
"%i != %i. "
|
||||
"langid=%"INT32". Check langids and m_queryExpansion parms "
|
||||
"which are the only parms that could be different in "
|
||||
"Query::set2(). You probably have different mysynoyms.txt "
|
||||
"files on two different hosts! check that!!"
|
||||
,m_tmpq.m_orig
|
||||
,(int)m_tmpq.getNumTerms()
|
||||
,(int)m_r->m_nqt
|
||||
,(int32_t)m_r->m_language
|
||||
);
|
||||
sendReply ( m_slot , this , NULL , 0 , 0 , true );
|
||||
@ -713,7 +729,7 @@ bool Msg39::getLists () {
|
||||
//(int64_t)m_tmpq.m_qterms[i].m_explicitBit ,
|
||||
//(int64_t)m_tmpq.m_qterms[i].m_implicitBits ,
|
||||
(int32_t)m_tmpq.m_qterms[i].m_hardCount ,
|
||||
(int32_t)m_tmpq.m_componentCodes[i],
|
||||
(int32_t)m_tmpq.m_qterms[i].m_componentCode,
|
||||
(int32_t)m_tmpq.getTermLen(i) ,
|
||||
isSynonym,
|
||||
(int32_t)m_tmpq.m_langId ); // ,tt
|
||||
@ -762,6 +778,19 @@ bool Msg39::getLists () {
|
||||
// split is us????
|
||||
//int32_t split = g_hostdb.m_myHost->m_group;
|
||||
int32_t split = g_hostdb.m_myHost->m_shardNum;
|
||||
|
||||
|
||||
int32_t nqt = m_tmpq.getNumTerms();
|
||||
int32_t need = sizeof(RdbList) * nqt ;
|
||||
m_stackBuf.setLabel("stkbuf2");
|
||||
if ( ! m_stackBuf.reserve ( need ) ) return true;
|
||||
m_lists = (IndexList *)m_stackBuf.getBufStart();
|
||||
m_stackBuf.setLength ( need );
|
||||
for ( int32_t i = 0 ; i < nqt ; i++ ) {
|
||||
m_lists[i].constructor();
|
||||
//log("msg39: constructlist @ 0x%"PTRFMT,(PTRTYPE)&m_lists[i]);
|
||||
}
|
||||
|
||||
// call msg2
|
||||
if ( ! m_msg2.getLists ( rdbId ,
|
||||
m_r->m_collnum,//m_r->ptr_coll ,
|
||||
|
5
Msg39.h
5
Msg39.h
@ -216,6 +216,7 @@ class Msg39 {
|
||||
public:
|
||||
|
||||
Msg39();
|
||||
~Msg39();
|
||||
void reset();
|
||||
void reset2();
|
||||
// register our request handler for Msg39's
|
||||
@ -266,7 +267,9 @@ class Msg39 {
|
||||
|
||||
// . we hold our IndexLists here for passing to PosdbTable
|
||||
// . one array for each of the tiers
|
||||
IndexList m_lists [ MAX_QUERY_TERMS ];
|
||||
//IndexList m_lists [ MAX_QUERY_TERMS ];
|
||||
IndexList *m_lists;
|
||||
SafeBuf m_stackBuf;
|
||||
|
||||
// used for timing
|
||||
int64_t m_startTime;
|
||||
|
42
Msg3a.cpp
42
Msg3a.cpp
@ -317,8 +317,8 @@ bool Msg3a::gotCacheReply ( ) {
|
||||
|
||||
//CollectionRec *cr;
|
||||
//cr = g_collectiondb.getRec(m_r->ptr_coll,m_r->size_coll-1);
|
||||
|
||||
setTermFreqWeights ( m_r->m_collnum,m_q,m_termFreqs,m_termFreqWeights);
|
||||
//setTermFreqWeights(m_r->m_collnum,m_q,m_termFreqs,m_termFreqWeights);
|
||||
setTermFreqWeights ( m_r->m_collnum,m_q );
|
||||
|
||||
if ( m_debug ) {
|
||||
//int64_t *termIds = m_q->getTermIds();
|
||||
@ -338,8 +338,8 @@ bool Msg3a::gotCacheReply ( ) {
|
||||
i,
|
||||
qt->m_term,
|
||||
qt->m_termId,
|
||||
m_termFreqs[i],
|
||||
m_termFreqWeights[i]);
|
||||
qt->m_termFreq,//m_termFreqs[i],
|
||||
qt->m_termFreqWeight);//m_termFreqWeights[i]);
|
||||
// put it back
|
||||
*tpc = c;
|
||||
}
|
||||
@ -368,7 +368,8 @@ bool Msg3a::gotCacheReply ( ) {
|
||||
}
|
||||
|
||||
// a tmp buf
|
||||
int32_t readSizes[MAX_QUERY_TERMS];
|
||||
int32_t readSizes[ABS_MAX_QUERY_TERMS];
|
||||
float tfw [ABS_MAX_QUERY_TERMS];
|
||||
// update our read info
|
||||
for ( int32_t j = 0; j < n ; j++ ) {
|
||||
// the read size for THIS query term
|
||||
@ -379,7 +380,9 @@ bool Msg3a::gotCacheReply ( ) {
|
||||
rs = DEFAULT_POSDB_READSIZE;//90000000; // 90MB!
|
||||
// it is better to go oom than leave users scratching their
|
||||
// heads as to why some results are not being returned.
|
||||
rs = -1;
|
||||
// no, because we are going out of mem for queries like
|
||||
// 'www.disney.nl' etc.
|
||||
//rs = -1;
|
||||
// if section stats, limit to 1MB
|
||||
//if ( m_r->m_getSectionStats ) rs = 1000000;
|
||||
// get the jth query term
|
||||
@ -388,13 +391,14 @@ bool Msg3a::gotCacheReply ( ) {
|
||||
if ( qt->m_ignored ) rs = 0;
|
||||
// set it
|
||||
readSizes[j] = rs;
|
||||
// serialize these too
|
||||
tfw[j] = qt->m_termFreqWeight;
|
||||
}
|
||||
|
||||
// serialize this
|
||||
m_r->ptr_readSizes = (char *)readSizes;
|
||||
m_r->size_readSizes = 4 * n;
|
||||
// and this
|
||||
m_r->ptr_termFreqWeights = (char *)m_termFreqWeights;
|
||||
m_r->ptr_termFreqWeights = (char *)tfw;//m_termFreqWeights;
|
||||
m_r->size_termFreqWeights = 4 * n;
|
||||
// store query into request, might have changed since we called
|
||||
// Query::expandQuery() above
|
||||
@ -1095,7 +1099,10 @@ bool Msg3a::mergeLists ( ) {
|
||||
// log("results: alloc fhtqt of %"PTRFMT" for st0=%"PTRFMT,
|
||||
// (PTRTYPE)ht->m_buf,(PTRTYPE)m_q->m_st0Ptr);
|
||||
// sanity
|
||||
if ( ! ht->m_isWritable ) {char *xx=NULL;*xx=0;}
|
||||
if ( ! ht->m_isWritable ) {
|
||||
log("msg3a: queryterm::constructor not called?");
|
||||
char *xx=NULL;*xx=0;
|
||||
}
|
||||
}
|
||||
|
||||
// now scan each facethashlist from each shard and compile into
|
||||
@ -1548,9 +1555,9 @@ void Msg3a::printTerms ( ) {
|
||||
}
|
||||
|
||||
void setTermFreqWeights ( collnum_t collnum , // char *coll,
|
||||
Query *q ,
|
||||
int64_t *termFreqs,
|
||||
float *termFreqWeights ) {
|
||||
Query *q ) {
|
||||
// int64_t *termFreqs,
|
||||
// float *termFreqWeights ) {
|
||||
|
||||
int64_t numDocsInColl = 0;
|
||||
RdbBase *base = getRdbBase ( RDB_CLUSTERDB , collnum );
|
||||
@ -1562,13 +1569,16 @@ void setTermFreqWeights ( collnum_t collnum , // char *coll,
|
||||
numDocsInColl = 1;
|
||||
}
|
||||
// now get term freqs again, like the good old days
|
||||
int64_t *termIds = q->getTermIds();
|
||||
//int64_t *termIds = q->getTermIds();
|
||||
// just use rdbmap to estimate!
|
||||
for ( int32_t i = 0 ; i < q->getNumTerms(); i++ ) {
|
||||
QueryTerm *qt = &q->m_qterms[i];
|
||||
// GET THE TERMFREQ for setting weights
|
||||
int64_t tf = g_posdb.getTermFreq ( collnum ,termIds[i]);
|
||||
if ( termFreqs ) termFreqs[i] = tf;
|
||||
int64_t tf = g_posdb.getTermFreq ( collnum ,qt->m_termId);
|
||||
//if ( termFreqs ) termFreqs[i] = tf;
|
||||
qt->m_termFreq = tf;
|
||||
float tfw = getTermFreqWeight(tf,numDocsInColl);
|
||||
termFreqWeights[i] = tfw;
|
||||
//termFreqWeights[i] = tfw;
|
||||
qt->m_termFreqWeight = tfw;
|
||||
}
|
||||
}
|
||||
|
10
Msg3a.h
10
Msg3a.h
@ -12,9 +12,9 @@
|
||||
#define DEFAULT_POSDB_READSIZE 90000000
|
||||
|
||||
void setTermFreqWeights ( collnum_t collnum, // char *coll,
|
||||
class Query *q ,
|
||||
int64_t *termFreqs,
|
||||
float *termFreqWeights ) ;
|
||||
class Query *q );
|
||||
//int64_t *termFreqs,
|
||||
//float *termFreqWeights ) ;
|
||||
|
||||
//#define MSG3A_TMP_BUFSIZE (MAX_RESULTS*18)
|
||||
|
||||
@ -131,8 +131,8 @@ public:
|
||||
|
||||
// use msg37 to get TermFreqs
|
||||
//Msg37 m_msg37;
|
||||
int64_t m_termFreqs [MAX_QUERY_TERMS];
|
||||
float m_termFreqWeights[MAX_QUERY_TERMS];
|
||||
//int64_t m_termFreqs [MAX_QUERY_TERMS];
|
||||
//float m_termFreqWeights[MAX_QUERY_TERMS];
|
||||
|
||||
// a multicast class to send the request, one for each split
|
||||
Multicast m_mcast[MAX_SHARDS];
|
||||
|
13
Msg40.cpp
13
Msg40.cpp
@ -666,7 +666,7 @@ bool Msg40::federatedLoop ( ) {
|
||||
mr.size_whiteList = slen;
|
||||
mr.m_timeout = -1; // auto-determine based on #terms
|
||||
// make sure query term counts match in msg39
|
||||
mr.m_maxQueryTerms = m_si->m_maxQueryTerms;
|
||||
//mr.m_maxQueryTerms = m_si->m_maxQueryTerms;
|
||||
mr.m_realMaxTop = m_si->m_realMaxTop;
|
||||
|
||||
mr.m_minSerpDocId = m_si->m_minSerpDocId;
|
||||
@ -699,6 +699,9 @@ bool Msg40::federatedLoop ( ) {
|
||||
//if ( numDocIdSplits < 5 ) numDocIdSplits = 5;
|
||||
//}
|
||||
|
||||
if ( cr ) mr.m_maxQueryTerms = cr->m_maxQueryTerms;
|
||||
else mr.m_maxQueryTerms = 100;
|
||||
|
||||
// special oom hack fix
|
||||
if ( cr && cr->m_isCustomCrawl && numDocIdSplits < 4 )
|
||||
numDocIdSplits = 4;
|
||||
@ -3496,7 +3499,10 @@ bool Msg40::computeGigabits( TopicGroup *tg ) {
|
||||
log("gbits: too many words in samples. "
|
||||
"Discarding the remaining samples "
|
||||
"(maxWords=%"INT32")", maxWords);
|
||||
char *xx=NULL;*xx=0;
|
||||
// return -1 with g_errno set on error
|
||||
g_errno = EBUFTOOSMALL;
|
||||
return -1;
|
||||
//char *xx=NULL;*xx=0;
|
||||
}
|
||||
// the thing we are counting!!!!
|
||||
maxWords += sampleWords;
|
||||
@ -4330,7 +4336,8 @@ void hashExcerpt ( Query *q ,
|
||||
int32_t m_posPtr;
|
||||
};
|
||||
SafeBuf posBuf;
|
||||
int32_t need2 = MAX_QUERY_TERMS * sizeof(PosInfo);
|
||||
//int32_t need2 = MAX_QUERY_TERMS * sizeof(PosInfo);
|
||||
int32_t need2 = q->m_numTerms * sizeof(PosInfo);
|
||||
posBuf.setLabel("m40posbuf");
|
||||
if ( ! posBuf.reserve ( need2 ) ) {
|
||||
log("gigabits: could not allocate 2 local buffer "
|
||||
|
2
Msg40.h
2
Msg40.h
@ -15,7 +15,7 @@
|
||||
#include "Msg20.h" // for getting summary from docId
|
||||
#include "Msg17.h" // a distributed cache of serialized/compressed Msg40s
|
||||
//#include "Msg2b.h" // for generating directories
|
||||
#include "IndexReadInfo.h" // STAGE0,...
|
||||
//#include "IndexReadInfo.h" // STAGE0,...
|
||||
#include "Msg3a.h"
|
||||
#include "PostQueryRerank.h"
|
||||
|
||||
|
22
PageGet.cpp
22
PageGet.cpp
@ -40,7 +40,9 @@ public:
|
||||
bool m_isLocal;
|
||||
//bool m_seq;
|
||||
bool m_rtq;
|
||||
char m_q[MAX_QUERY_LEN+1];
|
||||
//char m_q[MAX_QUERY_LEN+1];
|
||||
SafeBuf m_qsb;
|
||||
char m_qtmpBuf[128];
|
||||
int32_t m_qlen;
|
||||
char m_boolFlag;
|
||||
bool m_printed;
|
||||
@ -98,7 +100,7 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
|
||||
int32_t qlen = 0;
|
||||
char *q = r->getString ( "q" , &qlen , NULL /*default*/);
|
||||
// ensure query not too big
|
||||
if ( qlen >= MAX_QUERY_LEN-1 ) {
|
||||
if ( qlen >= ABS_MAX_QUERY_LEN-1 ) {
|
||||
g_errno=EQUERYTOOBIG;
|
||||
return g_httpServer.sendErrorReply (s,500 ,mstrerror(g_errno));
|
||||
}
|
||||
@ -156,8 +158,16 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
|
||||
// delete ( st );
|
||||
// return sendPageNetResult( s );
|
||||
//}
|
||||
if ( q && qlen > 0 ) strcpy ( st->m_q , q );
|
||||
else st->m_q[0] = '\0';
|
||||
//if ( q && qlen > 0 ) strcpy ( st->m_q , q );
|
||||
//else st->m_q[0] = '\0';
|
||||
|
||||
st->m_qsb.setBuf ( st->m_qtmpBuf,128,0,false );
|
||||
st->m_qsb.setLabel ( "qsbpg" );
|
||||
|
||||
// save the query
|
||||
if ( q && qlen > 0 )
|
||||
st->m_qsb.safeStrcpy ( q );
|
||||
|
||||
st->m_qlen = qlen;
|
||||
//st->m_seq = seq;
|
||||
st->m_rtq = rtq;
|
||||
@ -415,8 +425,8 @@ bool processLoop ( void *state ) {
|
||||
int32_t startLen2 = sb->length();//p;
|
||||
|
||||
// query should be NULL terminated
|
||||
char *q = st->m_q;
|
||||
int32_t qlen = st->m_qlen;
|
||||
char *q = st->m_qsb.getBufStart();
|
||||
int32_t qlen = st->m_qsb.getLength(); // m_qlen;
|
||||
|
||||
char styleTitle[128] = "font-size:14px;font-weight:600;"
|
||||
"color:#000000;";
|
||||
|
@ -2529,6 +2529,22 @@ bool printSearchResultsHeader ( State0 *st ) {
|
||||
, getLanguageString(si->m_queryLangId) );
|
||||
// print query words we ignored, like stop words
|
||||
printIgnoredWords ( sb , si );
|
||||
|
||||
sb->safePrintf("\t\t<queryNumTermsTotal>"
|
||||
"%"INT32
|
||||
"</queryNumTermsTotal>\n"
|
||||
, q->m_numTermsUntruncated );
|
||||
sb->safePrintf("\t\t<queryNumTermsUsed>"
|
||||
"%"INT32
|
||||
"</queryNumTermsUsed>\n"
|
||||
, q->m_numTerms );
|
||||
int32_t tval = 0;
|
||||
if ( q->m_numTerms < q->m_numTermsUntruncated ) tval = 1;
|
||||
sb->safePrintf("\t\t<queryWasTruncated>"
|
||||
"%"INT32
|
||||
"</queryWasTruncated>\n"
|
||||
, tval );
|
||||
|
||||
for ( int i = 0 ; i < q->m_numTerms ; i++ ) {
|
||||
sb->safePrintf("\t\t<term>\n");
|
||||
QueryTerm *qt = &q->m_qterms[i];
|
||||
@ -2574,7 +2590,8 @@ bool printSearchResultsHeader ( State0 *st ) {
|
||||
,printTerm);
|
||||
term[sq->m_termLen] = c;
|
||||
}
|
||||
int64_t tf = msg40->m_msg3a.m_termFreqs[i];
|
||||
//int64_t tf = msg40->m_msg3a.m_termFreqs[i];
|
||||
int64_t tf = qt->m_termFreq;
|
||||
sb->safePrintf("\t\t\t<termFreq>%"INT64"</termFreq>\n"
|
||||
,tf);
|
||||
sb->safePrintf("\t\t\t<termHash48>%"INT64"</termHash48>\n"
|
||||
@ -2604,6 +2621,19 @@ bool printSearchResultsHeader ( State0 *st ) {
|
||||
sb->safePrintf("\",\n");
|
||||
// print query words we ignored, like stop words
|
||||
printIgnoredWords ( sb , si );
|
||||
|
||||
sb->safePrintf("\t\"queryNumTermsTotal\":"
|
||||
"%"INT32",\n"
|
||||
, q->m_numTermsUntruncated );
|
||||
sb->safePrintf("\t\"queryNumTermsUsed\":"
|
||||
"%"INT32",\n"
|
||||
, q->m_numTerms );
|
||||
int32_t tval = 0;
|
||||
if ( q->m_numTerms < q->m_numTermsUntruncated ) tval = 1;
|
||||
sb->safePrintf("\t\"queryWasTruncated\":"
|
||||
"%"INT32",\n"
|
||||
, tval );
|
||||
|
||||
sb->safePrintf("\t\"terms\":[\n");
|
||||
for ( int i = 0 ; i < q->m_numTerms ; i++ ) {
|
||||
sb->safePrintf("\t\t{\n");
|
||||
@ -2643,7 +2673,8 @@ bool printSearchResultsHeader ( State0 *st ) {
|
||||
sb->safePrintf("\",\n");
|
||||
term[sq->m_termLen] = c;
|
||||
}
|
||||
int64_t tf = msg40->m_msg3a.m_termFreqs[i];
|
||||
//int64_t tf = msg40->m_msg3a.m_termFreqs[i];
|
||||
int64_t tf = qt->m_termFreq;
|
||||
sb->safePrintf("\t\t\"termFreq\":%"INT64",\n"
|
||||
,tf);
|
||||
|
||||
@ -2793,13 +2824,14 @@ bool printSearchResultsHeader ( State0 *st ) {
|
||||
|
||||
//Highlight h;
|
||||
|
||||
st->m_qe[0] = '\0';
|
||||
//st->m_qe[0] = '\0';
|
||||
st->m_qesb.nullTerm();
|
||||
|
||||
// encode query buf
|
||||
//char qe[MAX_QUERY_LEN+1];
|
||||
char *dq = si->m_displayQuery;
|
||||
//int32_t dqlen = si->m_displayQueryLen;
|
||||
if ( dq ) urlEncode(st->m_qe,MAX_QUERY_LEN*2,dq,gbstrlen(dq));
|
||||
if ( dq ) st->m_qesb.urlEncode(dq);
|
||||
|
||||
// how many results were requested?
|
||||
//int32_t docsWanted = msg40->getDocsWanted();
|
||||
@ -5185,7 +5217,7 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
"get?"
|
||||
"q=%s&c=%s&d=%"INT64">"
|
||||
"cached</a>\n",
|
||||
st->m_qe , coll ,
|
||||
st->m_qesb.getBufStart() , coll ,
|
||||
mr->m_docId );
|
||||
else if ( printCached )
|
||||
sb->safePrintf ( "<a href=\""
|
||||
@ -5194,7 +5226,7 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
"qlang=%s&"
|
||||
"c=%s&d=%"INT64"&cnsp=0\">"
|
||||
"cached</a>\n",
|
||||
st->m_qe ,
|
||||
st->m_qesb.getBufStart() ,
|
||||
// "qlang" parm
|
||||
si->m_defaultSortLang,
|
||||
coll ,
|
||||
@ -5334,7 +5366,7 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
"d=%"INT64"&"
|
||||
"cnsp=0\">"
|
||||
"sections</a>\n",
|
||||
st->m_qe ,
|
||||
st->m_qesb.getBufStart() ,
|
||||
// "qlang" parm
|
||||
si->m_defaultSortLang,
|
||||
coll ,
|
||||
@ -5447,7 +5479,7 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
qq.urlEncode("site:");
|
||||
qq.urlEncode (hbuf);
|
||||
qq.urlEncode(" | ");
|
||||
qq.safeStrcpy(st->m_qe);
|
||||
qq.safeStrcpy(st->m_qesb.getBufStart());
|
||||
qq.nullTerm();
|
||||
// get the original url and add/replace in query
|
||||
char tmp2[512];
|
||||
@ -6176,8 +6208,14 @@ bool printPairScore ( SafeBuf *sb , SearchInput *si , PairScore *ps ,
|
||||
//int64_t sz2 = ps->m_listSize2;
|
||||
//int64_t tf1 = ps->m_termFreq1;//sz1 / 10;
|
||||
//int64_t tf2 = ps->m_termFreq2;//sz2 / 10;
|
||||
int64_t tf1 = msg40->m_msg3a.m_termFreqs[qtn1];
|
||||
int64_t tf2 = msg40->m_msg3a.m_termFreqs[qtn2];
|
||||
|
||||
QueryTerm *qt1 = &msg40->m_msg3a.m_q->m_qterms[qtn1];
|
||||
QueryTerm *qt2 = &msg40->m_msg3a.m_q->m_qterms[qtn2];
|
||||
|
||||
//int64_t tf1 = msg40->m_msg3a.m_termFreqs[qtn1];
|
||||
//int64_t tf2 = msg40->m_msg3a.m_termFreqs[qtn2];
|
||||
int64_t tf1 = qt1->m_termFreq;
|
||||
int64_t tf2 = qt2->m_termFreq;
|
||||
float tfw1 = ps->m_tfWeight1;
|
||||
float tfw2 = ps->m_tfWeight2;
|
||||
|
||||
@ -6893,7 +6931,9 @@ bool printSingleScore ( SafeBuf *sb ,
|
||||
|
||||
//int64_t tf = ss->m_termFreq;//ss->m_listSize;
|
||||
int32_t qtn = ss->m_qtermNum;
|
||||
int64_t tf = msg40->m_msg3a.m_termFreqs[qtn];
|
||||
//int64_t tf = msg40->m_msg3a.m_termFreqs[qtn];
|
||||
QueryTerm *qt = &msg40->m_msg3a.m_q->m_qterms[qtn];
|
||||
int64_t tf = qt->m_termFreq;
|
||||
float tfw = ss->m_tfWeight;
|
||||
|
||||
if ( si->m_format == FORMAT_XML ) {
|
||||
@ -8252,8 +8292,11 @@ bool printCSVHeaderRow2 ( SafeBuf *sb ,
|
||||
hdr = "Hop Count";
|
||||
if ( ! strcmp(hdr,"gbssIp") )
|
||||
hdr = "IP";
|
||||
if ( ! strcmp(hdr,"gbssDiffbotUri" ) )
|
||||
hdr = "Diffbot URI";
|
||||
// csv report is regular urls not diffbot object urls so
|
||||
// regular urls do not have just a single diffboturi,
|
||||
// they could have 0 or multiple diffboturis
|
||||
//if ( ! strcmp(hdr,"gbssDiffbotUri" ) )
|
||||
// hdr = "Diffbot URI";
|
||||
if ( ! strcmp(hdr,"gbssSentToDiffbotThisTime") )
|
||||
hdr = "Process Attempted";
|
||||
if ( ! strcmp(hdr,"gbssDiffbotReplyMsg") )
|
||||
|
@ -52,7 +52,8 @@ public:
|
||||
int64_t m_took; // how long it took to get the results
|
||||
HttpRequest m_hr;
|
||||
bool m_printedHeaderRow;
|
||||
char m_qe[MAX_QUERY_LEN+1];
|
||||
//char m_qe[MAX_QUERY_LEN+1];
|
||||
SafeBuf m_qesb;
|
||||
|
||||
// for printing our search result json items in csv:
|
||||
HashTableX m_columnTable;
|
||||
|
@ -1858,11 +1858,11 @@ bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) {
|
||||
// . call g_httpServer.sendDynamicPage() to send it
|
||||
bool sendPageRoot ( TcpSocket *s , HttpRequest *r, char *cookie ) {
|
||||
// don't allow pages bigger than 128k in cache
|
||||
char buf [ 10*1024 + MAX_QUERY_LEN ];
|
||||
char buf [ 10*1024 ];//+ MAX_QUERY_LEN ];
|
||||
// a ptr into "buf"
|
||||
//char *p = buf;
|
||||
//char *pend = buf + 10*1024 + MAX_QUERY_LEN - 100 ;
|
||||
SafeBuf sb(buf, 10*1024 + MAX_QUERY_LEN);
|
||||
SafeBuf sb(buf, 10*1024 );//+ MAX_QUERY_LEN);
|
||||
// print bgcolors, set focus, set font style
|
||||
//p = g_httpServer.printFocus ( p , pend );
|
||||
//p = g_httpServer.printColors ( p , pend );
|
||||
|
@ -252,18 +252,18 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
|
||||
&secs,
|
||||
&msecs);
|
||||
|
||||
int64_t avgTier0Time = 0;
|
||||
int64_t avgTier1Time = 0;
|
||||
int64_t avgTier2Time = 0;
|
||||
if ( g_stats.m_tierHits[0] > 0 )
|
||||
avgTier0Time = g_stats.m_tierTimes[0] /
|
||||
(int64_t)g_stats.m_tierHits[0];
|
||||
if ( g_stats.m_tierHits[1] > 0 )
|
||||
avgTier1Time = g_stats.m_tierTimes[1] /
|
||||
(int64_t)g_stats.m_tierHits[1];
|
||||
if ( g_stats.m_tierHits[2] > 0 )
|
||||
avgTier2Time = g_stats.m_tierTimes[2] /
|
||||
(int64_t)g_stats.m_tierHits[2];
|
||||
// int64_t avgTier0Time = 0;
|
||||
// int64_t avgTier1Time = 0;
|
||||
// int64_t avgTier2Time = 0;
|
||||
// if ( g_stats.m_tierHits[0] > 0 )
|
||||
// avgTier0Time = g_stats.m_tierTimes[0] /
|
||||
// (int64_t)g_stats.m_tierHits[0];
|
||||
// if ( g_stats.m_tierHits[1] > 0 )
|
||||
// avgTier1Time = g_stats.m_tierTimes[1] /
|
||||
// (int64_t)g_stats.m_tierHits[1];
|
||||
// if ( g_stats.m_tierHits[2] > 0 )
|
||||
// avgTier2Time = g_stats.m_tierTimes[2] /
|
||||
// (int64_t)g_stats.m_tierHits[2];
|
||||
|
||||
if ( format == FORMAT_HTML )
|
||||
p.safePrintf (
|
||||
|
78
Pages.cpp
78
Pages.cpp
@ -3519,6 +3519,7 @@ bool printApiForPage ( SafeBuf *sb , int32_t PAGENUM , CollectionRec *cr ) {
|
||||
if ( pageNum != PAGENUM ) continue;
|
||||
|
||||
SafeBuf tmp;
|
||||
tmp.setLabel("apisb");
|
||||
char diff = 0;
|
||||
bool printVal = false;
|
||||
if ( parm->m_type != TYPE_CMD &&
|
||||
@ -3856,6 +3857,25 @@ bool printApiForPage ( SafeBuf *sb , int32_t PAGENUM , CollectionRec *cr ) {
|
||||
"</b>");
|
||||
|
||||
|
||||
sb->brify2 (
|
||||
"\t\t# List of space separated words in the "
|
||||
"query that were ignored for the most part. "
|
||||
"Because they were common words for the "
|
||||
"query language they are in.\n"
|
||||
, cols , "\n\t\t# " , false );
|
||||
sb->safePrintf("<b>\t\t\"ignoredWords\":\"to the\",\n\n"
|
||||
"</b>");
|
||||
|
||||
sb->brify2 (
|
||||
"\t\t# There is a maximum limit placed on the "
|
||||
"number of query terms we search on to keep things "
|
||||
"fast. This can "
|
||||
"be changed in the search controls.\n"
|
||||
, cols , "\n\t\t# " , false );
|
||||
sb->safePrintf("<b>\t\t\"queryNumTermsTotal\":52,\n</b>");
|
||||
sb->safePrintf("<b>\t\t\"queryNumTermsUsed\":20,\n</b>");
|
||||
sb->safePrintf("<b>\t\t\"queryWasTruncated\":1,\n\n</b>");
|
||||
|
||||
sb->brify2 (
|
||||
"\t\t# The start of the terms array. Each query "
|
||||
"is broken down into a list of terms. Each "
|
||||
@ -4037,7 +4057,8 @@ bool printApiForPage ( SafeBuf *sb , int32_t PAGENUM , CollectionRec *cr ) {
|
||||
// end instance
|
||||
sb->safePrintf("<b>\t\t}\n\n</b>");
|
||||
// end gigabit
|
||||
sb->safePrintf("<b>\t\t},\n\n</b>");
|
||||
sb->safePrintf("\t\t# End of the first gigabit\n"
|
||||
"<b>\t\t},\n\n</b>");
|
||||
|
||||
sb->safePrintf("\t\t...\n\n");
|
||||
|
||||
@ -4047,6 +4068,59 @@ bool printApiForPage ( SafeBuf *sb , int32_t PAGENUM , CollectionRec *cr ) {
|
||||
sb->safePrintf("<b>\t],\n\n</b>");
|
||||
|
||||
|
||||
// BEGIN FACETS
|
||||
sb->safePrintf( "\t# Start of the facets array, if any.\n");
|
||||
sb->safePrintf("<b>\t\"facets\":[\n</b>\n");
|
||||
|
||||
sb->safePrintf("\t\t# The first facet in the array.\n");
|
||||
sb->safePrintf("<b>\t\t{\n</b>");
|
||||
|
||||
sb->brify2 ( "\t\t\t"
|
||||
"# The field you are faceting over\n"
|
||||
, cols , "\n\t\t\t# " , false );
|
||||
sb->safePrintf ( "<b>\t\t\t\"field\":\"Company\",\n\n</b>");
|
||||
sb->brify2 ( "\t\t\t"
|
||||
"# How many documents in the collection had "
|
||||
"this particular field? 64-bit integer.\n"
|
||||
, cols , "\n\t\t\t# " , false );
|
||||
sb->safePrintf ( "<b>\t\t\t\"totalDocsWithField\":148553,"
|
||||
"\n\n</b>");
|
||||
|
||||
sb->brify2 ( "\t\t\t"
|
||||
"# How many documents in the collection had "
|
||||
"this particular field with the same value "
|
||||
"as the value line directly below? This should "
|
||||
"always be less than or equal to the "
|
||||
"totalDocsWithField count. 64-bit integer.\n"
|
||||
, cols , "\n\t\t\t# " , false );
|
||||
sb->safePrintf ( "<b>\t\t\t\"totalDocsWithFieldAndValue\":"
|
||||
"44184,\n\n</b>");
|
||||
|
||||
sb->brify2 ( "\t\t\t"
|
||||
"# The value of the field in the case of "
|
||||
"this facet. Can be a string or an integer or "
|
||||
"a float, depending on the type described in "
|
||||
"the gbfacet query term. i.e. gbfacetstr, "
|
||||
"gbfacetint or gbfacetfloat.\n"
|
||||
, cols , "\n\t\t\t# " , false );
|
||||
sb->safePrintf ( "<b>\t\t\t\"value\":"
|
||||
"\"Widgets, Inc.\",\n\n</b>");
|
||||
|
||||
|
||||
sb->brify2 ( "\t\t\t"
|
||||
"# Should be the same as totalDocsWith"
|
||||
"FieldAndValue, "
|
||||
"above. 64-bit integer.\n"
|
||||
, cols , "\n\t\t\t# " , false );
|
||||
sb->safePrintf ( "<b>\t\t\t\"docCount\":"
|
||||
"44184\n\n</b>");
|
||||
|
||||
sb->safePrintf("\t\t# End of the first facet in the array.\n");
|
||||
sb->safePrintf("<b>\t\t}\n\n</b>");
|
||||
|
||||
sb->safePrintf( "\t# End of the facets array.\n");
|
||||
sb->safePrintf("<b>\t],\n\n</b>");
|
||||
// END FACETS
|
||||
|
||||
|
||||
|
||||
@ -4670,7 +4744,7 @@ bool printRedBox ( SafeBuf *mb , TcpSocket *sock , HttpRequest *hr ) {
|
||||
mb->safePrintf("%s",box);
|
||||
mb->safePrintf("%"INT32" %s dead and not responding to "
|
||||
"pings. See the "
|
||||
"<a href=/admin/host?c=%s>hosts table</a>.",
|
||||
"<a href=/admin/hosts?c=%s>hosts table</a>.",
|
||||
ps->m_numHostsDead ,s ,coll);
|
||||
mb->safePrintf("%s",boxEnd);
|
||||
}
|
||||
|
24
Parms.cpp
24
Parms.cpp
@ -7879,17 +7879,19 @@ void Parms::init ( ) {
|
||||
m->m_obj = OBJ_COLL;
|
||||
m++;
|
||||
|
||||
//m->m_title = "max query terms";
|
||||
//m->m_desc = "Do not allow more than this many query terms. Will "
|
||||
// "return error in XML feed error tag if breeched.";
|
||||
//m->m_cgi = "mqt";
|
||||
//m->m_off = (char *)&cr.m_maxQueryTerms - x;
|
||||
m->m_title = "max query terms";
|
||||
m->m_desc = "Do not allow more than this many query terms. Helps "
|
||||
"prevent big queries from resource hogging.";
|
||||
m->m_cgi = "mqt";
|
||||
m->m_off = (char *)&cr.m_maxQueryTerms - x;
|
||||
//m->m_soff = (char *)&si.m_maxQueryTerms - y;
|
||||
//m->m_type = TYPE_LONG;
|
||||
//m->m_def = "20"; // 20 for testing, normally 16
|
||||
//m->m_sparm = 1;
|
||||
//m->m_spriv = 1;
|
||||
//m++;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "999999"; // now we got synonyms... etc
|
||||
m->m_group = 0;
|
||||
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_page = PAGE_SEARCH;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m++;
|
||||
|
||||
/*
|
||||
m->m_title = "dictionary site";
|
||||
@ -15283,7 +15285,7 @@ void Parms::init ( ) {
|
||||
m->m_type = TYPE_CHARPTR;
|
||||
m->m_page = PAGE_REINDEX;
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_def = "xx";
|
||||
m->m_def = "en";
|
||||
m->m_flags = PF_API ;
|
||||
m++;
|
||||
|
||||
|
149
Posdb.cpp
149
Posdb.cpp
@ -759,19 +759,22 @@ void PosdbTable::init ( Query *q ,
|
||||
// set this now
|
||||
//m_collnum = cr->m_collnum;
|
||||
|
||||
|
||||
// save it
|
||||
m_topTree = topTree;
|
||||
// a ptr for debugging i guess
|
||||
g_topTree = topTree;
|
||||
// remember the query class, it has all the info about the termIds
|
||||
m_q = q;
|
||||
m_nqt = q->getNumTerms();
|
||||
// for debug msgs
|
||||
m_logstate = logstate;
|
||||
|
||||
m_realMaxTop = r->m_realMaxTop;
|
||||
if ( m_realMaxTop > MAX_TOP ) m_realMaxTop = MAX_TOP;
|
||||
|
||||
m_siteRankMultiplier = SITERANKMULTIPLIER;
|
||||
if ( m_q->m_isBoolean ) m_siteRankMultiplier = 0.0;
|
||||
|
||||
// seo.cpp supplies a NULL msg2 because it already sets
|
||||
// QueryTerm::m_posdbListPtrs
|
||||
if ( ! msg2 ) return;
|
||||
@ -1060,6 +1063,26 @@ bool PosdbTable::allocTopTree ( ) {
|
||||
// make it nongrowable because we'll be in a thread
|
||||
qt->m_facetHashTable.setNonGrow();
|
||||
}
|
||||
|
||||
// m_stackBuf
|
||||
int32_t nqt = m_q->m_numTerms;
|
||||
int32_t need = 0;
|
||||
need += 4 * nqt;
|
||||
need += 4 * nqt;
|
||||
need += 4 * nqt;
|
||||
need += 4 * nqt;
|
||||
need += sizeof(float ) * nqt;
|
||||
need += sizeof(char *) * nqt;
|
||||
need += sizeof(char *) * nqt;
|
||||
need += sizeof(char *) * nqt;
|
||||
need += sizeof(char *) * nqt;
|
||||
need += sizeof(char *) * nqt;
|
||||
need += sizeof(char ) * nqt;
|
||||
need += sizeof(float ) * nqt * nqt; // square matrix
|
||||
m_stackBuf.setLabel("stkbuf1");
|
||||
if ( ! m_stackBuf.reserve( need ) )
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -1378,8 +1401,8 @@ void PosdbTable::evalSlidingWindow ( char **ptrs ,
|
||||
max *= m_freqWeights[i] * m_freqWeights[j];
|
||||
|
||||
// use score from scoreMatrix if bigger
|
||||
if ( scoreMatrix[MAX_QUERY_TERMS*i+j] > max ) {
|
||||
max = scoreMatrix[MAX_QUERY_TERMS*i+j];
|
||||
if ( scoreMatrix[m_nqt*i+j] > max ) {
|
||||
max = scoreMatrix[m_nqt*i+j];
|
||||
//if ( m_ds ) {
|
||||
// winners1[i*MAX_QUERY_TERMS+j] = NULL;
|
||||
// winners2[i*MAX_QUERY_TERMS+j] = NULL;
|
||||
@ -4815,6 +4838,8 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
// below when trying to grow it. they could all be OR'd together
|
||||
// so alloc the most!
|
||||
int32_t maxSlots = (grand/12) * 2;
|
||||
// try to speed up. this doesn't *seem* to matter, so i took out:
|
||||
//maxSlots *= 2;
|
||||
// get total operands we used
|
||||
//int32_t numOperands = m_q->m_numWords;//Operands;
|
||||
// a quoted phrase counts as a single operand
|
||||
@ -4826,15 +4851,15 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
// allow an extra byte for remainders
|
||||
if ( m_numQueryTermInfos % 8 ) m_vecSize++;
|
||||
// now preallocate the hashtable. 0 niceness.
|
||||
if ( m_q->m_isBoolean &&
|
||||
! m_bt.set (8,m_vecSize,maxSlots,NULL,0,false,0,"booltbl"))
|
||||
if ( m_q->m_isBoolean && // true = useKeyMagic
|
||||
! m_bt.set (8,m_vecSize,maxSlots,NULL,0,false,0,"booltbl",true))
|
||||
return false;
|
||||
// . m_ct maps a boolean "bit vector" to a true/false value
|
||||
// . each "bit" in the "bit vector" indicates if docid has that
|
||||
// particular query term
|
||||
if ( m_q->m_isBoolean &&
|
||||
if ( m_q->m_isBoolean && // true = useKeyMagic
|
||||
! m_ct.set (8,1,maxSlots,NULL,0,false,0,
|
||||
"booltbl"))
|
||||
"booltbl",true))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
@ -4999,13 +5024,13 @@ int64_t PosdbTable::countUniqueDocids( QueryTermInfo *qti ) {
|
||||
// inc the TOTAL val count
|
||||
if ( fe ) fe->m_outsideSearchResultsCount++;
|
||||
|
||||
// skip that docid record in our termlist. it MUST have been
|
||||
// 12 bytes, a docid heading record.
|
||||
recPtr += 12;
|
||||
count++;
|
||||
// skip any following keys that are 6 bytes, that means they
|
||||
// share the same docid
|
||||
for ( ; recPtr < subListEnd && ((*recPtr)&0x04); recPtr += 6 );
|
||||
// Increment ptr to the next record
|
||||
int32_t recSize = qti->m_subLists[0]->getRecSize(recPtr);
|
||||
recPtr += recSize;
|
||||
|
||||
// Records that are 6 bytes share the same doc id, so only increment
|
||||
// 'count' if it refers to a record with a new (unique) docId
|
||||
if (recSize > 6) count++;
|
||||
goto loop;
|
||||
}
|
||||
|
||||
@ -5882,6 +5907,8 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) continue;
|
||||
// inc this
|
||||
listGroupNum++;
|
||||
// if it hits 256 then wrap back down to 1
|
||||
if ( listGroupNum >= 256 ) listGroupNum = 1;
|
||||
// add it
|
||||
addDocIdVotes ( qti , listGroupNum );
|
||||
}
|
||||
@ -5966,11 +5993,28 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
//
|
||||
// TRANSFORM QueryTermInfo::m_* vars into old style arrays
|
||||
//
|
||||
int32_t wikiPhraseIds [MAX_QUERY_TERMS];
|
||||
int32_t quotedStartIds[MAX_QUERY_TERMS];
|
||||
int32_t qpos [MAX_QUERY_TERMS];
|
||||
int32_t qtermNums [MAX_QUERY_TERMS];
|
||||
float freqWeights [MAX_QUERY_TERMS];
|
||||
// int32_t wikiPhraseIds [MAX_QUERY_TERMS];
|
||||
// int32_t quotedStartIds[MAX_QUERY_TERMS];
|
||||
// int32_t qpos [MAX_QUERY_TERMS];
|
||||
// int32_t qtermNums [MAX_QUERY_TERMS];
|
||||
// float freqWeights [MAX_QUERY_TERMS];
|
||||
// now dynamically allocate to avoid stack smashing
|
||||
char *pp = m_stackBuf.getBufStart();
|
||||
int32_t nqt = m_q->m_numTerms;
|
||||
int32_t *wikiPhraseIds = (int32_t *)pp; pp += 4 * nqt;
|
||||
int32_t *quotedStartIds = (int32_t *)pp; pp += 4 * nqt;
|
||||
int32_t *qpos = (int32_t *)pp; pp += 4 * nqt;
|
||||
int32_t *qtermNums = (int32_t *)pp; pp += 4 * nqt;
|
||||
float *freqWeights = (float *)pp; pp += sizeof(float) * nqt;
|
||||
char **miniMergedList = (char **)pp; pp += sizeof(char *) * nqt;
|
||||
char **miniMergedEnd = (char **)pp; pp += sizeof(char *) * nqt;
|
||||
char **bestPos = (char **)pp; pp += sizeof(char *) * nqt;
|
||||
char **winnerStack = (char **)pp; pp += sizeof(char *) * nqt;
|
||||
char **xpos = (char **)pp; pp += sizeof(char *) * nqt;
|
||||
char *bflags = (char *)pp; pp += sizeof(char) * nqt;
|
||||
float *scoreMatrix = (float *)pp; pp += sizeof(float) *nqt*nqt;
|
||||
if ( pp > m_stackBuf.getBufEnd() ) {char *xx=NULL;*xx=0; }
|
||||
|
||||
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
|
||||
// get it
|
||||
QueryTermInfo *qti = &qip[i];
|
||||
@ -6012,17 +6056,11 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
float minPairScore;
|
||||
float minSingleScore;
|
||||
//int64_t docId;
|
||||
char *miniMergedList [MAX_QUERY_TERMS];
|
||||
char *miniMergedEnd [MAX_QUERY_TERMS];
|
||||
char bflags [MAX_QUERY_TERMS];
|
||||
m_bflags = bflags;
|
||||
int32_t qdist;
|
||||
float wts;
|
||||
float pss;
|
||||
float scoreMatrix[MAX_QUERY_TERMS*MAX_QUERY_TERMS];
|
||||
char *bestPos[MAX_QUERY_TERMS];
|
||||
float maxNonBodyScore;
|
||||
char *winnerStack[MAX_QUERY_TERMS];
|
||||
// new vars for removing supplanted docid score infos and
|
||||
// corresponding pair and single score infos
|
||||
char *sx;
|
||||
@ -6340,12 +6378,7 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
}
|
||||
|
||||
if ( m_q->m_isBoolean ) {
|
||||
minScore = 1.0;
|
||||
// since we are jumping, we need to set m_docId here
|
||||
//m_docId = *(uint32_t *)(docIdPtr+1);
|
||||
//m_docId <<= 8;
|
||||
//m_docId |= (unsigned char)docIdPtr[0];
|
||||
//m_docId >>= 2;
|
||||
//minScore = 1.0;
|
||||
// we can't jump over setting of miniMergeList. do that.
|
||||
goto boolJump1;
|
||||
}
|
||||
@ -6557,6 +6590,30 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
|
||||
boolJump1:
|
||||
|
||||
if ( m_q->m_isBoolean ) {
|
||||
//minScore = 1.0;
|
||||
// this is somewhat wasteful since it is set below again
|
||||
m_docId = *(uint32_t *)(docIdPtr+1);
|
||||
m_docId <<= 8;
|
||||
m_docId |= (unsigned char)docIdPtr[0];
|
||||
m_docId >>= 2;
|
||||
// add one point for each term matched in the bool query
|
||||
// this is really just for when the terms are from different
|
||||
// fields. if we have unfielded boolean terms we should
|
||||
// do proximity matching.
|
||||
int32_t slot = m_bt.getSlot ( &m_docId );
|
||||
if ( slot >= 0 ) {
|
||||
uint8_t *bv = (uint8_t *)m_bt.getValueFromSlot(slot);
|
||||
// then a score based on the # of terms that matched
|
||||
int16_t bitsOn = getNumBitsOnX ( bv , m_vecSize );
|
||||
// but store in hashtable now
|
||||
minScore = (float)bitsOn;
|
||||
}
|
||||
else {
|
||||
minScore = 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
// we need to do this for seo hacks to merge the synonyms together
|
||||
// into one list
|
||||
seoHackSkip2:
|
||||
@ -6922,7 +6979,7 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
&pss);
|
||||
// it's -1 if one term is in the body/header/menu/etc.
|
||||
if ( pss < 0 ) {
|
||||
scoreMatrix[i*MAX_QUERY_TERMS+j] = -1.00;
|
||||
scoreMatrix[i*nqt+j] = -1.00;
|
||||
wts = -1.0;
|
||||
}
|
||||
else {
|
||||
@ -6931,7 +6988,7 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
wts *= m_freqWeights[j];//sfw[j];
|
||||
// store in matrix for "sub out" algo below
|
||||
// when doing sliding window
|
||||
scoreMatrix[i*MAX_QUERY_TERMS+j] = wts;
|
||||
scoreMatrix[i*nqt+j] = wts;
|
||||
// if terms is a special wiki half stop bigram
|
||||
//if ( bflags[i] == 1 ) wts *= WIKI_BIGRAM_WEIGHT;
|
||||
//if ( bflags[j] == 1 ) wts *= WIKI_BIGRAM_WEIGHT;
|
||||
@ -7053,7 +7110,7 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
|
||||
// use special ptrs for the windows so we do not mangle
|
||||
// miniMergedList[] array because we use that below!
|
||||
char *xpos[MAX_QUERY_TERMS];
|
||||
//char *xpos[MAX_QUERY_TERMS];
|
||||
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ )
|
||||
xpos[i] = miniMergedList[i];
|
||||
|
||||
@ -7262,7 +7319,7 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
boolJump2:
|
||||
|
||||
// try dividing it by 3! (or multiply by .33333 faster)
|
||||
score = minScore * (((float)siteRank)*SITERANKMULTIPLIER+1.0);
|
||||
score = minScore * (((float)siteRank)*m_siteRankMultiplier+1.0);
|
||||
|
||||
// . not foreign language? give a huge boost
|
||||
// . use "qlang" parm to set the language. i.e. "&qlang=fr"
|
||||
@ -7932,7 +7989,7 @@ float PosdbTable::getMaxPossibleScore ( QueryTermInfo *qti ,
|
||||
score *= WIKI_BIGRAM_WEIGHT;
|
||||
}
|
||||
//score *= perfectWordSpamWeight * perfectWordSpamWeight;
|
||||
score *= (((float)siteRank)*SITERANKMULTIPLIER+1.0);
|
||||
score *= (((float)siteRank)*m_siteRankMultiplier+1.0);
|
||||
|
||||
// language boost if same language (or no lang specified)
|
||||
if ( m_r->m_language == docLang ||
|
||||
@ -8165,6 +8222,10 @@ bool PosdbTable::makeDocIdVoteBufForBoolQuery_r ( ) {
|
||||
}
|
||||
|
||||
|
||||
// debug info
|
||||
// int32_t nc = m_bt.getLongestString();
|
||||
// log("posdb: string of %"INT32" filled slots!",nc);
|
||||
|
||||
char *dst = m_docIdVoteBuf.getBufStart();
|
||||
|
||||
// . now our hash table is filled with all the docids
|
||||
@ -8223,13 +8284,15 @@ bool PosdbTable::makeDocIdVoteBufForBoolQuery_r ( ) {
|
||||
// a 6 byte key means you pass
|
||||
gbmemcpy ( dst , &docId , 6 );
|
||||
// test it
|
||||
int64_t d2;
|
||||
d2 = *(uint32_t *)(dst+1);
|
||||
d2 <<= 8;
|
||||
d2 |= (unsigned char)dst[0];
|
||||
d2 >>= 2;
|
||||
docId >>= 2;
|
||||
if ( d2 != docId ) { char *xx=NULL;*xx=0; }
|
||||
if ( m_debug ) {
|
||||
int64_t d2;
|
||||
d2 = *(uint32_t *)(dst+1);
|
||||
d2 <<= 8;
|
||||
d2 |= (unsigned char)dst[0];
|
||||
d2 >>= 2;
|
||||
docId >>= 2;
|
||||
if ( d2 != docId ) { char *xx=NULL;*xx=0; }
|
||||
}
|
||||
// end test
|
||||
dst += 6;
|
||||
}
|
||||
|
5
Posdb.h
5
Posdb.h
@ -604,6 +604,8 @@ class PosdbTable {
|
||||
float m_finalScore;
|
||||
float m_preFinalScore;
|
||||
|
||||
float m_siteRankMultiplier;
|
||||
|
||||
// how long to add the last batch of lists
|
||||
int64_t m_addListsTime;
|
||||
int64_t m_t1 ;
|
||||
@ -654,10 +656,13 @@ class PosdbTable {
|
||||
SafeBuf m_pairScoreBuf;
|
||||
SafeBuf m_singleScoreBuf;
|
||||
|
||||
SafeBuf m_stackBuf;
|
||||
|
||||
//SafeBuf m_mergeBuf;
|
||||
|
||||
// a reference to the query
|
||||
Query *m_q;
|
||||
int32_t m_nqt;
|
||||
|
||||
// these are NOT in imap space, but in query term space, 1-1 with
|
||||
// Query::m_qterms[]
|
||||
|
@ -29,7 +29,9 @@ typedef float rscore_t;
|
||||
|
||||
#define MINSCORE 1
|
||||
#define MIN_SAVE_SIZE 100
|
||||
#define PQR_BUF_SIZE MAX_QUERY_LEN
|
||||
// we don't use this any more so make it compile
|
||||
//#define PQR_BUF_SIZE MAX_QUERY_LEN
|
||||
#define PQR_BUF_SIZE 64
|
||||
|
||||
class PostQueryRerank {
|
||||
public:
|
||||
|
355
Query.cpp
355
Query.cpp
@ -28,6 +28,7 @@ void Query::constructor ( ) {
|
||||
//m_bmap = NULL;
|
||||
m_bitScores = NULL;
|
||||
m_qwords = NULL;
|
||||
m_numWords = 0;
|
||||
//m_expressions = NULL;
|
||||
m_qwordsAllocSize = 0;
|
||||
//m_expressionsAllocSize = 0;
|
||||
@ -37,8 +38,8 @@ void Query::constructor ( ) {
|
||||
m_st0Ptr = NULL;
|
||||
// we have to manually call this because Query::constructor()
|
||||
// might have been called explicitly
|
||||
for ( int32_t i = 0 ; i < MAX_QUERY_TERMS ; i++ )
|
||||
m_qterms[i].constructor();
|
||||
//for ( int32_t i = 0 ; i < MAX_QUERY_TERMS ; i++ )
|
||||
// m_qterms[i].constructor();
|
||||
//m_expressions = NULL;
|
||||
reset ( );
|
||||
}
|
||||
@ -68,9 +69,19 @@ void Query::reset ( ) {
|
||||
qt->m_facetIndexBuf.purge();
|
||||
}
|
||||
|
||||
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
||||
QueryWord *qw = &m_qwords[i];
|
||||
qw->destructor();
|
||||
}
|
||||
|
||||
m_stackBuf.purge();
|
||||
m_qterms = NULL;
|
||||
|
||||
m_sb.purge();
|
||||
m_osb.purge();
|
||||
m_docIdRestriction = 0LL;
|
||||
m_groupThatHasDocId = NULL;
|
||||
m_bufLen = 0;
|
||||
//m_bufLen = 0;
|
||||
m_origLen = 0;
|
||||
m_numWords = 0;
|
||||
//m_numOperands = 0;
|
||||
@ -84,6 +95,7 @@ void Query::reset ( ) {
|
||||
//if ( m_bitScores && m_bitScoresSize ) // != m_bsbuf )
|
||||
// mfree ( m_bitScores , m_bitScoresSize , "Query2" );
|
||||
//m_bmap = NULL;
|
||||
|
||||
m_bitScores = NULL;
|
||||
//m_bmapSize = 0;
|
||||
m_bitScoresSize = 0;
|
||||
@ -131,14 +143,16 @@ bool Query::set2 ( char *query ,
|
||||
// need language for doing synonyms
|
||||
uint8_t langId ,
|
||||
char queryExpansion ,
|
||||
bool useQueryStopWords ) {
|
||||
//int32_t maxQueryTerms ) {
|
||||
bool useQueryStopWords ,
|
||||
int32_t maxQueryTerms ) {
|
||||
|
||||
m_langId = langId;
|
||||
m_useQueryStopWords = useQueryStopWords;
|
||||
// fix summary rerank and highlighting.
|
||||
bool keepAllSingles = true;
|
||||
|
||||
m_maxQueryTerms = maxQueryTerms;
|
||||
|
||||
// assume boolean auto-detect.
|
||||
char boolFlag = 2;
|
||||
|
||||
@ -150,7 +164,7 @@ bool Query::set2 ( char *query ,
|
||||
if ( ! query ) return true;
|
||||
|
||||
// set to 256 for synonyms?
|
||||
m_maxQueryTerms = 256;
|
||||
//m_maxQueryTerms = 256;
|
||||
m_queryExpansion = queryExpansion;
|
||||
|
||||
int32_t queryLen = gbstrlen(query);
|
||||
@ -160,17 +174,26 @@ bool Query::set2 ( char *query ,
|
||||
//m_coll = coll;
|
||||
//m_collLen = collLen;
|
||||
// truncate query if too big
|
||||
if ( queryLen >= MAX_QUERY_LEN ) {
|
||||
log("query: Query length of %"INT32" must be less than %"INT32". "
|
||||
"Truncating.",queryLen,(int32_t)MAX_QUERY_LEN);
|
||||
queryLen = MAX_QUERY_LEN - 1;
|
||||
if ( queryLen >= ABS_MAX_QUERY_LEN ) {
|
||||
log("query: Query length of %"INT32" must be "
|
||||
"less than %"INT32". "
|
||||
"Truncating.",queryLen,(int32_t)ABS_MAX_QUERY_LEN);
|
||||
queryLen = ABS_MAX_QUERY_LEN - 1;
|
||||
m_truncated = true;
|
||||
}
|
||||
// save original query
|
||||
m_osb.setBuf ( m_otmpBuf , 128 , 0 , false );
|
||||
m_osb.setLabel ("oqbuf" );
|
||||
m_osb.reserve ( queryLen + 1 );
|
||||
m_osb.safeMemcpy ( query , queryLen );
|
||||
m_osb.nullTerm ();
|
||||
|
||||
m_origLen = queryLen;
|
||||
gbmemcpy ( m_orig , query , queryLen );
|
||||
m_orig [ m_origLen ] = '\0';
|
||||
//m_origLen = queryLen;
|
||||
//gbmemcpy ( m_orig , query , queryLen );
|
||||
//m_orig [ m_origLen ] = '\0';
|
||||
|
||||
m_orig = m_osb.getBufStart();
|
||||
m_origLen = m_osb.getLength();
|
||||
|
||||
log(LOG_DEBUG, "query: set called = %s", m_orig);
|
||||
|
||||
@ -204,9 +227,16 @@ bool Query::set2 ( char *query ,
|
||||
// that were set somewhere above!!! i moved top: label above!
|
||||
//reset();
|
||||
|
||||
// reserve some space, guessing how much we'd need
|
||||
m_sb.setBuf(m_tmpBuf3,128,0,false);
|
||||
m_sb.setLabel("qrystk");
|
||||
int32_t need = queryLen * 2 + 32;
|
||||
if ( ! m_sb.reserve ( need ) )
|
||||
return false;
|
||||
|
||||
// convenience ptr
|
||||
char *p = m_buf;
|
||||
char *pend = m_buf + MAX_QUERY_LEN;
|
||||
//char *p = m_buf;
|
||||
//char *pend = m_buf + MAX_QUERY_LEN;
|
||||
bool inQuotesFlag = false;
|
||||
// . copy query into m_buf
|
||||
// . translate ( and ) to special query operators so Words class
|
||||
@ -219,27 +249,31 @@ bool Query::set2 ( char *query ,
|
||||
if ( query[i] == '\"' ) inQuotesFlag = !inQuotesFlag;
|
||||
|
||||
if ( inQuotesFlag ) {
|
||||
*p = query [i];
|
||||
p++;
|
||||
//*p = query [i];
|
||||
//p++;
|
||||
m_sb.pushChar(query[i]);
|
||||
continue;
|
||||
}
|
||||
|
||||
// dst buf must be big enough
|
||||
if ( p + 8 >= pend ) {
|
||||
g_errno = EBUFTOOSMALL;
|
||||
return log(LOG_LOGIC,"query: query: query too big.");
|
||||
}
|
||||
// if ( p + 8 >= pend ) {
|
||||
// g_errno = EBUFTOOSMALL;
|
||||
// return log(LOG_LOGIC,"query: query: query too big.");
|
||||
// }
|
||||
// translate ( and )
|
||||
if ( boolFlag == 1 && query[i] == '(' ) {
|
||||
gbmemcpy ( p , " LeFtP " , 7 ); p += 7;
|
||||
//gbmemcpy ( p , " LeFtP " , 7 ); p += 7;
|
||||
m_sb.safeMemcpy ( " LeFtP " , 7 );
|
||||
continue;
|
||||
}
|
||||
if ( boolFlag == 1 && query[i] == ')' ) {
|
||||
gbmemcpy ( p , " RiGhP " , 7 ); p += 7;
|
||||
//gbmemcpy ( p , " RiGhP " , 7 ); p += 7;
|
||||
m_sb.safeMemcpy ( " RiGhP " , 7 );
|
||||
continue;
|
||||
}
|
||||
if ( query[i] == '|' ) {
|
||||
gbmemcpy ( p , " PiiPE " , 7 ); p += 7;
|
||||
//gbmemcpy ( p , " PiiPE " , 7 ); p += 7;
|
||||
m_sb.safeMemcpy ( " PiiPE " , 7 );
|
||||
continue;
|
||||
}
|
||||
// translate [#a] [#r] [#ap] [#rp] [] [p] to operators
|
||||
@ -249,28 +283,34 @@ bool Query::set2 ( char *query ,
|
||||
while ( is_digit(query[j]) ) j++;
|
||||
char c = query[j];
|
||||
if ( (c == 'a' || c == 'r') && query[j+1]==']' ) {
|
||||
sprintf ( p , " LeFtB %"INT32" %c RiGhB ",val,c);
|
||||
p += gbstrlen(p);
|
||||
//sprintf ( p , " LeFtB %"INT32" %c RiGhB ",
|
||||
m_sb.safePrintf(" LeFtB %"INT32" %c RiGhB ",
|
||||
val,c);
|
||||
//p += gbstrlen(p);
|
||||
i = j + 1;
|
||||
continue;
|
||||
}
|
||||
else if ( (c == 'a' || c == 'r') &&
|
||||
query[j+1]=='p' && query[j+2]==']') {
|
||||
sprintf ( p , " LeFtB %"INT32" %cp RiGhB ",val,c);
|
||||
p += gbstrlen(p);
|
||||
//sprintf ( p , " LeFtB %"INT32" %cp RiGhB ",
|
||||
m_sb.safePrintf(" LeFtB %"INT32" %cp RiGhB ",
|
||||
val,c);
|
||||
//p += gbstrlen(p);
|
||||
i = j + 2;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if ( query[i] == '[' && query[i+1] == ']' ) {
|
||||
sprintf ( p , " LeFtB RiGhB ");
|
||||
p += gbstrlen(p);
|
||||
//sprintf ( p , " LeFtB RiGhB ");
|
||||
//p += gbstrlen(p);
|
||||
m_sb.safePrintf ( " LeFtB RiGhB ");
|
||||
i = i + 1;
|
||||
continue;
|
||||
}
|
||||
if ( query[i] == '[' && query[i+1] == 'p' && query[i+2]==']') {
|
||||
sprintf ( p , " LeFtB RiGhB ");
|
||||
p += gbstrlen(p);
|
||||
//sprintf ( p , " LeFtB RiGhB ");
|
||||
//p += gbstrlen(p);
|
||||
m_sb.safePrintf ( " LeFtB RiGhB ");
|
||||
i = i + 2;
|
||||
continue;
|
||||
}
|
||||
@ -306,17 +346,22 @@ bool Query::set2 ( char *query ,
|
||||
|
||||
// TODO: copy altavista's operators here? & | !
|
||||
// otherwise, just a plain copy
|
||||
*p = query [i];
|
||||
p++;
|
||||
// *p = query [i];
|
||||
// p++;
|
||||
m_sb.pushChar ( query[i] );
|
||||
}
|
||||
// NULL terminate
|
||||
*p = '\0';
|
||||
//*p = '\0';
|
||||
m_sb.nullTerm();
|
||||
// debug statement
|
||||
//log(LOG_DEBUG,"Query: Got new query=%s",tempBuf);
|
||||
//printf("query: query: Got new query=%s\n",tempBuf);
|
||||
|
||||
// set length
|
||||
m_bufLen = p - m_buf;
|
||||
//m_bufLen = p - m_buf;
|
||||
|
||||
//m_buf = m_sb.getBufStart();
|
||||
//m_bufLen = m_sb.length();
|
||||
|
||||
Words words;
|
||||
Phrases phrases;
|
||||
@ -560,8 +605,108 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
// what is the max value for "shift"?
|
||||
int32_t max = (int32_t)MAX_EXPLICIT_BITS;
|
||||
if ( max > m_maxQueryTerms ) max = m_maxQueryTerms;
|
||||
|
||||
// count phrases first for allocating
|
||||
int32_t nqt = 0;
|
||||
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
||||
QueryWord *qw = &m_qwords[i];
|
||||
// skip if ignored... mdw...
|
||||
if ( ! qw->m_phraseId ) continue;
|
||||
if ( qw->m_ignorePhrase ) continue; // could be a repeat
|
||||
// none if weight is absolute zero
|
||||
if ( qw->m_userWeightPhrase == 0 &&
|
||||
qw->m_userTypePhrase == 'a' ) continue;
|
||||
nqt++;
|
||||
}
|
||||
	// secondly, count the single word terms too!!!
|
||||
for ( int32_t i = 0 ; i < m_numWords; i++ ) {
|
||||
QueryWord *qw = &m_qwords[i];
|
||||
if ( qw->m_ignoreWord &&
|
||||
qw->m_ignoreWord != IGNORE_QSTOP) continue;
|
||||
// ignore if in quotes and part of phrase, watch out
|
||||
// for things like "word", a single word in quotes.
|
||||
if ( qw->m_quoteStart >= 0 && qw->m_phraseId ) continue;
|
||||
// if we are not start of quote and NOT in a phrase we
|
||||
		// must be the trailing word i guess.
|
||||
// fixes '"john smith" -"bob dole"' from having
|
||||
// smith and dole as query terms.
|
||||
if ( qw->m_quoteStart >= 0 && qw->m_quoteStart != i )
|
||||
continue;
|
||||
// ignore if weight is absolute zero
|
||||
if ( qw->m_userWeight == 0 &&
|
||||
qw->m_userType == 'a' ) continue;
|
||||
nqt++;
|
||||
}
|
||||
// thirdly, count synonyms
|
||||
Synonyms syn;
|
||||
int32_t sn = 0;
|
||||
if ( m_queryExpansion ) sn = m_numWords;
|
||||
int64_t to = hash64n("to",0LL);
|
||||
for ( int32_t i = 0 ; i < sn ; i++ ) {
|
||||
// get query word
|
||||
QueryWord *qw = &m_qwords[i];
|
||||
// skip if in quotes, we will not get synonyms for it
|
||||
if ( qw->m_inQuotes ) continue;
|
||||
// skip if has plus sign in front
|
||||
if ( qw->m_wordSign == '+' ) continue;
|
||||
// not '-' either i guess
|
||||
if ( qw->m_wordSign == '-' ) continue;
|
||||
// no url: stuff, maybe only title
|
||||
if ( qw->m_fieldCode &&
|
||||
qw->m_fieldCode != FIELD_TITLE &&
|
||||
qw->m_fieldCode != FIELD_GENERIC )
|
||||
continue;
|
||||
// skip if ignored like a stopword (stop to->too)
|
||||
//if ( qw->m_ignoreWord ) continue;
|
||||
// ignore title: etc. words, they are field names
|
||||
if ( qw->m_ignoreWord == IGNORE_FIELDNAME ) continue;
|
||||
// ignore boolean operators
|
||||
if ( qw->m_ignoreWord ) continue;// IGNORE_BOOLOP
|
||||
// no, hurts 'Greencastle IN economic development'
|
||||
if ( qw->m_wordId == to ) continue;
|
||||
// single letters...
|
||||
if ( qw->m_wordLen == 1 ) continue;
|
||||
// set the synonyms for this word
|
||||
char tmpBuf [ TMPSYNBUFSIZE ];
|
||||
int32_t naids = syn.getSynonyms ( &words ,
|
||||
i ,
|
||||
// language of the query.
|
||||
// 0 means unknown. if this
|
||||
// is 0 we sample synonyms
|
||||
// from all languages.
|
||||
m_langId ,
|
||||
tmpBuf ,
|
||||
0 ); // m_niceness );
|
||||
// if no synonyms, all done
|
||||
if ( naids <= 0 ) continue;
|
||||
nqt += naids;
|
||||
}
|
||||
|
||||
m_numTermsUntruncated = nqt;
|
||||
|
||||
if ( nqt > m_maxQueryTerms ) nqt = m_maxQueryTerms;
|
||||
|
||||
// allocate the stack buf
|
||||
if ( nqt ) {
|
||||
int32_t need = nqt * sizeof(QueryTerm) ;
|
||||
if ( ! m_stackBuf.reserve ( need ) )
|
||||
return false;
|
||||
m_stackBuf.setLabel("stkbuf3");
|
||||
char *pp = m_stackBuf.getBufStart();
|
||||
m_qterms = (QueryTerm *)pp;
|
||||
pp += sizeof(QueryTerm);
|
||||
if ( pp > m_stackBuf.getBufEnd() ) { char *xx=NULL;*xx=0; }
|
||||
}
|
||||
|
||||
// call constructor on each one here
|
||||
for ( int32_t i = 0 ; i < nqt ; i++ ) {
|
||||
QueryTerm *qt = &m_qterms[i];
|
||||
qt->constructor();
|
||||
}
|
||||
|
||||
|
||||
//char u8Buf[256];
|
||||
for ( int32_t i = 0 ; i < m_numWords && n < MAX_QUERY_TERMS ; i++ ) {
|
||||
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
||||
// break out if no more explicit bits!
|
||||
/*
|
||||
if ( shift >= max ) {
|
||||
@ -580,9 +725,14 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
qw->m_userTypePhrase == 'a' ) continue;
|
||||
|
||||
// stop breach
|
||||
if ( n >= MAX_QUERY_TERMS ) {
|
||||
if ( n >= ABS_MAX_QUERY_TERMS ) {
|
||||
log("query: lost query phrase terms to max term "
|
||||
"limit of %"INT32"",(int32_t)MAX_QUERY_TERMS );
|
||||
"limit of %"INT32"",(int32_t)ABS_MAX_QUERY_TERMS );
|
||||
break;
|
||||
}
|
||||
if ( n >= m_maxQueryTerms ) {
|
||||
log("query: lost query phrase terms to max term cr "
|
||||
"limit of %"INT32"",(int32_t)m_maxQueryTerms);
|
||||
break;
|
||||
}
|
||||
|
||||
@ -604,7 +754,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
qt->m_isQueryStopWord = false;
|
||||
// change in both places
|
||||
qt->m_termId = qw->m_phraseId & TERMID_MASK;
|
||||
m_termIds[n] = qw->m_phraseId & TERMID_MASK;
|
||||
//m_termIds[n] = qw->m_phraseId & TERMID_MASK;
|
||||
//log(LOG_DEBUG, "Setting query phrase term id %d: %lld", n, m_termIds[n]);
|
||||
qt->m_rawTermId = qw->m_rawPhraseId;
|
||||
// assume explicit bit is 0
|
||||
@ -615,12 +765,12 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
// phrases like: "cat dog" AND pig
|
||||
if ( m_isBoolean && qw->m_phraseSign != '*' ) {
|
||||
qt->m_termSign = '\0';
|
||||
m_termSigns[n] = '\0';
|
||||
//m_termSigns[n] = '\0';
|
||||
}
|
||||
// if not boolean, ensure to change signs in both places
|
||||
else {
|
||||
qt->m_termSign = qw->m_phraseSign;
|
||||
m_termSigns[n] = qw->m_phraseSign;
|
||||
//m_termSigns[n] = qw->m_phraseSign;
|
||||
}
|
||||
//
|
||||
// INSERT UOR LOGIC HERE
|
||||
@ -703,7 +853,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
}
|
||||
|
||||
// now if we have enough room, do the singles
|
||||
for ( int32_t i = 0 ; i < m_numWords && n < MAX_QUERY_TERMS ; i++ ) {
|
||||
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
||||
// break out if no more explicit bits!
|
||||
/*
|
||||
if ( shift >= max ) {
|
||||
@ -738,9 +888,14 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
qw->m_userType == 'a' ) continue;
|
||||
|
||||
// stop breach
|
||||
if ( n >= MAX_QUERY_TERMS ) {
|
||||
if ( n >= ABS_MAX_QUERY_TERMS ) {
|
||||
log("query: lost query terms to max term "
|
||||
"limit of %"INT32"",(int32_t)MAX_QUERY_TERMS );
|
||||
"limit of %"INT32"",(int32_t)ABS_MAX_QUERY_TERMS );
|
||||
break;
|
||||
}
|
||||
if ( n >= m_maxQueryTerms ) {
|
||||
log("query: lost query terms to max term cr "
|
||||
"limit of %"INT32"",(int32_t)m_maxQueryTerms);
|
||||
break;
|
||||
}
|
||||
|
||||
@ -760,7 +915,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
qt->m_isQueryStopWord = qw->m_isQueryStopWord;
|
||||
// change in both places
|
||||
qt->m_termId = qw->m_wordId & TERMID_MASK;
|
||||
m_termIds[n] = qw->m_wordId & TERMID_MASK;
|
||||
//m_termIds[n] = qw->m_wordId & TERMID_MASK;
|
||||
qt->m_rawTermId = qw->m_rawWordId;
|
||||
// assume explicit bit is 0
|
||||
qt->m_explicitBit = 0;
|
||||
@ -769,18 +924,18 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
// boolean queries are not allowed term signs
|
||||
if ( m_isBoolean ) {
|
||||
qt->m_termSign = '\0';
|
||||
m_termSigns[n] = '\0';
|
||||
//m_termSigns[n] = '\0';
|
||||
// boolean fix for "health OR +sports" because
|
||||
// the + there means exact word match, no synonyms.
|
||||
if ( qw->m_wordSign == '+' ) {
|
||||
qt->m_termSign = qw->m_wordSign;
|
||||
m_termSigns[n] = qw->m_wordSign;
|
||||
//m_termSigns[n] = qw->m_wordSign;
|
||||
}
|
||||
}
|
||||
// if not boolean, ensure to change signs in both places
|
||||
else {
|
||||
qt->m_termSign = qw->m_wordSign;
|
||||
m_termSigns[n] = qw->m_wordSign;
|
||||
//m_termSigns[n] = qw->m_wordSign;
|
||||
}
|
||||
// get previous text word
|
||||
//int32_t pw = i - 2;
|
||||
@ -1230,16 +1385,14 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
// . skip this part if language is unknown i guess
|
||||
//
|
||||
////////////
|
||||
int32_t sn = 0;
|
||||
Synonyms syn;
|
||||
// loop over all words in query and process its synonyms list
|
||||
//if ( m_langId != langUnknown && m_queryExpansion )
|
||||
// if lang is "xx" unknown we still do synonyms it just does
|
||||
// a loop over all languages starting with english
|
||||
if ( m_queryExpansion )
|
||||
sn = m_numWords;
|
||||
// if ( m_queryExpansion )
|
||||
// sn = m_numWords;
|
||||
|
||||
int64_t to = hash64n("to",0LL);
|
||||
//int64_t to = hash64n("to",0LL);
|
||||
|
||||
for ( int32_t i = 0 ; i < sn ; i++ ) {
|
||||
// get query word
|
||||
@ -1257,6 +1410,10 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
continue;
|
||||
// skip if ignored like a stopword (stop to->too)
|
||||
//if ( qw->m_ignoreWord ) continue;
|
||||
// ignore title: etc. words, they are field names
|
||||
if ( qw->m_ignoreWord == IGNORE_FIELDNAME ) continue;
|
||||
// ignore boolean operators
|
||||
if ( qw->m_ignoreWord ) continue;// IGNORE_BOOLOP
|
||||
// no, hurts 'Greencastle IN economic development'
|
||||
if ( qw->m_wordId == to ) continue;
|
||||
// single letters...
|
||||
@ -1277,19 +1434,29 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
// sanity
|
||||
if ( naids > MAX_SYNS ) { char *xx=NULL;*xx=0; }
|
||||
// now make the buffer to hold them for us
|
||||
qw->m_synWordBuf.setLabel("qswbuf");
|
||||
qw->m_synWordBuf.safeMemcpy ( &syn.m_synWordBuf );
|
||||
// get the term for this word
|
||||
QueryTerm *origTerm = qw->m_queryWordTerm;
|
||||
// loop over synonyms for word #i now
|
||||
for ( int32_t j = 0 ; j < naids ; j++ ) {
|
||||
// stop breach
|
||||
if ( n >= MAX_QUERY_TERMS ) {
|
||||
if ( n >= ABS_MAX_QUERY_TERMS ) {
|
||||
log("query: lost synonyms due to max term "
|
||||
"limit of %"INT32"",(int32_t)MAX_QUERY_TERMS );
|
||||
"limit of %"INT32"",
|
||||
(int32_t)ABS_MAX_QUERY_TERMS );
|
||||
break;
|
||||
}
|
||||
// this happens for 'da da da'
|
||||
if ( ! origTerm ) continue;
|
||||
|
||||
if ( n >= m_maxQueryTerms ) {
|
||||
log("query: lost synonyms due to max cr term "
|
||||
"limit of %"INT32"",
|
||||
(int32_t)m_maxQueryTerms);
|
||||
break;
|
||||
}
|
||||
|
||||
// add that query term
|
||||
QueryTerm *qt = &m_qterms[n];
|
||||
qt->m_qword = qw; // NULL;
|
||||
@ -1346,7 +1513,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
wid= hash64h(wid,ph);
|
||||
}
|
||||
qt->m_termId = wid & TERMID_MASK;
|
||||
m_termIds[n] = wid & TERMID_MASK;
|
||||
//m_termIds[n] = wid & TERMID_MASK;
|
||||
qt->m_rawTermId = syn.m_aids[j];
|
||||
// assume explicit bit is 0
|
||||
qt->m_explicitBit = 0;
|
||||
@ -1354,18 +1521,18 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
// boolean queries are not allowed term signs
|
||||
if ( m_isBoolean ) {
|
||||
qt->m_termSign = '\0';
|
||||
m_termSigns[n] = '\0';
|
||||
//m_termSigns[n] = '\0';
|
||||
// boolean fix for "health OR +sports" because
|
||||
// the + there means exact word match, no syns
|
||||
if ( qw->m_wordSign == '+' ) {
|
||||
qt->m_termSign = qw->m_wordSign;
|
||||
m_termSigns[n] = qw->m_wordSign;
|
||||
//m_termSigns[n] = qw->m_wordSign;
|
||||
}
|
||||
}
|
||||
// if not bool, ensure to change signs in both places
|
||||
else {
|
||||
qt->m_termSign = qw->m_wordSign;
|
||||
m_termSigns[n] = qw->m_wordSign;
|
||||
//m_termSigns[n] = qw->m_wordSign;
|
||||
}
|
||||
// do not use an explicit bit up if we got a hard count
|
||||
qt->m_hardCount = qw->m_hardCount;
|
||||
@ -1413,7 +1580,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
|
||||
m_numTerms = n;
|
||||
|
||||
if ( n > MAX_QUERY_TERMS ) { char *xx=NULL;*xx=0; }
|
||||
if ( n > ABS_MAX_QUERY_TERMS ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
|
||||
// count them for doing number of combos
|
||||
@ -1493,7 +1660,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
// . don't forget to set m_termSigns too!
|
||||
if ( n == 1 && m_qterms[0].m_isPhrase && ! m_qterms[0].m_termSign ) {
|
||||
m_qterms[0].m_termSign = '*';
|
||||
m_termSigns[0] = '*';
|
||||
//m_termSigns[0] = '*';
|
||||
}
|
||||
|
||||
// . or bits into the m_implicitBits member of phrase QueryTerms that
|
||||
@ -1524,7 +1691,11 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
// . see Msg2.cpp for more info on componentCodes
|
||||
// . -2 means unset, neither a compound term nor a component term at
|
||||
// this time
|
||||
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) m_componentCodes[i] = -2;
|
||||
//for( int32_t i = 0 ; i < m_numTerms ; i++ ) m_componentCodes[i] = -2;
|
||||
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
|
||||
QueryTerm *qt = &m_qterms[i];
|
||||
qt->m_componentCode = -2;
|
||||
}
|
||||
m_numComponents = 0;
|
||||
|
||||
// . now set m_phrasePart for Summary.cpp's hackfix filter
|
||||
@ -1879,7 +2050,10 @@ void Query::addCompoundTerms ( ) {
|
||||
|
||||
// -1 means compound, -2 means unset, >= 0 means component
|
||||
bool Query::isCompoundTerm ( int32_t i ) {
	// Returns true only when query term #i is flagged as a compound term,
	// i.e. its component code is -1.
	// The component code is read from the QueryTerm itself; the stale
	// early return through the parallel m_componentCodes[] array is
	// dropped because it made the per-term check below unreachable.
	// Reject indexes on either side of the m_numTerms entries in use so a
	// bad caller cannot read outside m_qterms.
	if ( i < 0 || i >= m_numTerms ) return false;
	QueryTerm *qt = &m_qterms[i];
	return ( qt->m_componentCode == -1 );
}
|
||||
|
||||
bool Query::setQWords ( char boolFlag ,
|
||||
@ -1891,16 +2065,17 @@ bool Query::setQWords ( char boolFlag ,
|
||||
// . because we now deal with boolean queries, we make parentheses
|
||||
// their own separate Word, so tell "words" we're setting a query
|
||||
//Words words;
|
||||
if ( ! words.set ( m_buf , m_bufLen,
|
||||
if ( ! words.set ( m_sb.getBufStart() , m_sb.length() ,
|
||||
//buf , m_bufLen,
|
||||
TITLEREC_CURRENT_VERSION, true, true ) )
|
||||
return log("query: Had error parsing query: %s.",
|
||||
mstrerror(g_errno));
|
||||
int32_t numWords = words.getNumWords();
|
||||
// truncate it
|
||||
if ( numWords > MAX_QUERY_WORDS ) {
|
||||
if ( numWords > ABS_MAX_QUERY_WORDS ) {
|
||||
log("query: Had %"INT32" words. Max is %"INT32". Truncating.",
|
||||
numWords,(int32_t)MAX_QUERY_WORDS);
|
||||
numWords = MAX_QUERY_WORDS;
|
||||
numWords,(int32_t)ABS_MAX_QUERY_WORDS);
|
||||
numWords = ABS_MAX_QUERY_WORDS;
|
||||
m_truncated = true;
|
||||
}
|
||||
m_numWords = numWords;
|
||||
@ -1923,11 +2098,14 @@ bool Query::setQWords ( char boolFlag ,
|
||||
return log("query: Could not allocate mem for query.");
|
||||
m_qwordsAllocSize = need;
|
||||
}
|
||||
// reset safebuf in there
|
||||
for ( int32_t i = 0 ; i < m_numWords ; i++ )
|
||||
m_qwords[i].constructor();
|
||||
|
||||
// is all alpha chars in query in upper case? caps lock on?
|
||||
bool allUpper = true;
|
||||
char *p = m_buf;
|
||||
char *pend = m_buf + m_bufLen;
|
||||
char *p = m_sb.getBufStart();//m_buf;
|
||||
char *pend = m_sb.getBuf(); // m_buf + m_bufLen;
|
||||
for ( ; p < pend ; p += getUtf8CharSize(p) )
|
||||
if ( is_alpha_utf8 ( p ) && ! is_upper_utf8 ( p ) ) {
|
||||
allUpper = false; break; }
|
||||
@ -2027,7 +2205,7 @@ bool Query::setQWords ( char boolFlag ,
|
||||
char *ignoreTill = NULL;
|
||||
|
||||
// loop over all words, these QueryWords are 1-1 with "words"
|
||||
for ( int32_t i = 0 ; i < numWords && i < MAX_QUERY_WORDS ; i++ ) {
|
||||
for ( int32_t i = 0 ; i < numWords && i < ABS_MAX_QUERY_WORDS ; i++ ) {
|
||||
// convenience var, these are 1-1 with "words"
|
||||
QueryWord *qw = &m_qwords[i];
|
||||
// set to defaults?
|
||||
@ -2338,12 +2516,14 @@ bool Query::setQWords ( char boolFlag ,
|
||||
// in quotes which is silly, so undo it. But we should
|
||||
// still inherit any quoteSign, however. Be sure to also
|
||||
// set m_inQuotes to false so Matches.cpp::matchWord() works.
|
||||
if ( i == quoteStart ) { // + 1 ) {
|
||||
if ( i + 1 >= numWords || words.getNumQuotes(i+1)>0 ) {
|
||||
qw->m_quoteStart = -1;
|
||||
qw->m_inQuotes = false;
|
||||
}
|
||||
}
|
||||
// MDW: don't undo it because we do not want to get synonyms
|
||||
// of terms in quotes. 7/15/2015
|
||||
// if ( i == quoteStart ) { // + 1 ) {
|
||||
// if ( i + 1 >= numWords || words.getNumQuotes(i+1)>0 ) {
|
||||
// qw->m_quoteStart = -1;
|
||||
// qw->m_inQuotes = false;
|
||||
// }
|
||||
// }
|
||||
// . get prefix hash of collection name and field
|
||||
// . but first convert field to lower case
|
||||
uint64_t ph;
|
||||
@ -3228,7 +3408,8 @@ bool Query::setQWords ( char boolFlag ,
|
||||
// search up to this far
|
||||
int32_t maxj = i + nw;
|
||||
// but not past our truncated limit
|
||||
if ( maxj > MAX_QUERY_WORDS ) maxj = MAX_QUERY_WORDS;
|
||||
if ( maxj > ABS_MAX_QUERY_WORDS )
|
||||
maxj = ABS_MAX_QUERY_WORDS;
|
||||
|
||||
for ( j = i ; j < maxj ; j++ ) {
|
||||
// skip punct
|
||||
@ -3385,7 +3566,7 @@ bool Query::setQWords ( char boolFlag ,
|
||||
// count non-ignored words
|
||||
if ( qw->m_ignoreWord ) continue;
|
||||
// if under limit, continue
|
||||
if ( count++ < MAX_QUERY_TERMS ) continue;
|
||||
if ( count++ < ABS_MAX_QUERY_TERMS ) continue;
|
||||
// . otherwise, ignore
|
||||
// . if we set this for our UOR'ed terms from SearchInput.cpp's
|
||||
// UOR'ed facebook interests then it causes us to get no results!
|
||||
@ -4968,7 +5149,7 @@ void Query::printQueryTerms(){
|
||||
(int64_t)m_qterms[i].m_explicitBit ,
|
||||
(int64_t)m_qterms[i].m_implicitBits ,
|
||||
(int32_t) m_qterms[i].m_hardCount ,
|
||||
m_componentCodes[i],
|
||||
m_qterms[i].m_componentCode,
|
||||
getTermLen(i),
|
||||
tt );
|
||||
}
|
||||
@ -5514,7 +5695,17 @@ bool QueryTerm::isSplit() {
|
||||
// hash of all the query terms
|
||||
int64_t Query::getQueryHash() {
	// Folds the term id of every query term, in term order, into a single
	// 64-bit value and returns it. Returns 0 when there are no terms.
	// The ids are read from the QueryTerm entries only; the stale loop
	// over the parallel m_termIds[] array is dropped so each id is folded
	// in exactly once.
	int64_t qh = 0LL;
	for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
		QueryTerm *qt = &m_qterms[i];
		// pass the running value back in as hash64()'s second argument
		qh = hash64 ( qt->m_termId , qh );
	}
	return qh;
}
|
||||
|
||||
// Explicit initializer for a QueryWord: forwards to the constructor() method
// of the embedded m_synWordBuf. Query::setQWords() calls this on each
// m_qwords[] entry ("reset safebuf in there") once the array is in place.
void QueryWord::constructor () {
|
||||
m_synWordBuf.constructor();
|
||||
}
|
||||
|
||||
// Explicit teardown for a QueryWord: calls purge() on the embedded
// m_synWordBuf.
// NOTE(review): presumably this releases the synonym buffer's memory and is
// meant to be paired with QueryWord::constructor() -- confirm against SafeBuf.
void QueryWord::destructor () {
|
||||
m_synWordBuf.purge();
|
||||
}
|
||||
|
71
Query.h
71
Query.h
@ -10,7 +10,9 @@
|
||||
|
||||
// keep these down to save memory
|
||||
//#define MAX_QUERY_LEN 8000 // url:XXX can be quite long! (MAX_URL_LEN)
|
||||
#define MAX_QUERY_LEN 3200
|
||||
//#define MAX_QUERY_LEN 3200
|
||||
// support big OR queries for image shingles
|
||||
#define ABS_MAX_QUERY_LEN 62000
|
||||
// . words need to deal with a long list of sites!
|
||||
// . remember, words can be string of punctuation, too
|
||||
//#define MAX_QUERY_WORDS 5000
|
||||
@ -21,7 +23,8 @@
|
||||
// seems like we alloc just enough to hold our words now so that this
|
||||
// is really a performance capper but it is used in Summary.cpp
|
||||
// and Matches.h so don't go too big just yet
|
||||
#define MAX_QUERY_WORDS 800
|
||||
//#define MAX_QUERY_WORDS 800
|
||||
#define ABS_MAX_QUERY_WORDS 99000
|
||||
|
||||
// . how many IndexLists might we get/intersect
|
||||
// . we now use a int64_t to hold the query term bits for non-boolean queries
|
||||
@ -36,7 +39,8 @@
|
||||
//#define MAX_QUERY_TERMS 40
|
||||
// how to make a lock pick set loses synonyms from 40!
|
||||
//#define MAX_QUERY_TERMS 80
|
||||
#define MAX_QUERY_TERMS 160
|
||||
//#define MAX_QUERY_TERMS 160
|
||||
#define ABS_MAX_QUERY_TERMS 9000
|
||||
|
||||
// only allow up to 200 interests from facebook plus manually entered
|
||||
// because we are limited by the query terms above so we can only
|
||||
@ -270,6 +274,9 @@ class QueryWord {
|
||||
if ( is_wspace_utf8 ( p ) ) return true;
|
||||
return false;
|
||||
};
|
||||
void constructor ();
|
||||
void destructor ();
|
||||
|
||||
//UCScript wordScript() {
|
||||
// UChar*foo;
|
||||
// return ucGetScript(utf16Decode((UChar*)(m_word),&foo));
|
||||
@ -463,6 +470,10 @@ class QueryTerm {
|
||||
char *m_parenList;
|
||||
int32_t m_parenListLen;
|
||||
|
||||
int32_t m_componentCode;
|
||||
int64_t m_termFreq;
|
||||
float m_termFreqWeight;
|
||||
|
||||
// . our representative bits
|
||||
// . the bits in this bit vector is 1-1 with the QueryTerms
|
||||
// . if a doc has query term #i then bit #i will be set
|
||||
@ -624,10 +635,10 @@ class Query {
|
||||
//int32_t collLen ,
|
||||
uint8_t langId ,
|
||||
char queryExpansion ,
|
||||
bool useQueryStopWords = true );
|
||||
//char boolFlag = 2 , // auto-detect if boolean query
|
||||
//bool keepAllSingles = false ,
|
||||
//int32_t maxQueryTerms = 0x7fffffff );
|
||||
bool useQueryStopWords = true ,
|
||||
//char boolFlag = 2 , // auto-detect if boolean query
|
||||
//bool keepAllSingles = false ,
|
||||
int32_t maxQueryTerms = 0x7fffffff );
|
||||
|
||||
// serialize/deserialize ourselves so we don't have to pass the
|
||||
// unmodified string around and reparse it every time
|
||||
@ -680,9 +691,9 @@ class Query {
|
||||
// . the signs and ids are dupped in the QueryTerm classes, too
|
||||
//int64_t *getTermFreqs ( ) { return m_termFreqs ; };
|
||||
//int64_t getTermFreq ( int32_t i ) { return m_termFreqs[i]; };
|
||||
int64_t *getTermIds ( ) { return m_termIds ; };
|
||||
char *getTermSigns ( ) { return m_termSigns ; };
|
||||
int32_t *getComponentCodes ( ) { return m_componentCodes; };
|
||||
//int64_t *getTermIds ( ) { return m_termIds ; };
|
||||
//char *getTermSigns ( ) { return m_termSigns ; };
|
||||
//int32_t *getComponentCodes ( ) { return m_componentCodes; };
|
||||
int64_t getRawWordId ( int32_t i ) { return m_qwords[i].m_rawWordId;};
|
||||
|
||||
int32_t getNumComponentTerms ( ) { return m_numComponents; };
|
||||
@ -926,17 +937,26 @@ class Query {
|
||||
int32_t m_qwordsAllocSize;
|
||||
|
||||
// QueryWords are converted to QueryTerms
|
||||
QueryTerm m_qterms [ MAX_QUERY_TERMS ];
|
||||
//QueryTerm m_qterms [ MAX_QUERY_TERMS ];
|
||||
int32_t m_numTerms;
|
||||
int32_t m_numTermsSpecial;
|
||||
|
||||
int32_t m_numTermsUntruncated;
|
||||
|
||||
// separate vectors for easier interfacing, 1-1 with m_qterms
|
||||
//int64_t m_termFreqs [ MAX_QUERY_TERMS ];
|
||||
int64_t m_termIds [ MAX_QUERY_TERMS ];
|
||||
char m_termSigns [ MAX_QUERY_TERMS ];
|
||||
int32_t m_componentCodes [ MAX_QUERY_TERMS ];
|
||||
char m_ignore [ MAX_QUERY_TERMS ]; // is term ignored?
|
||||
int32_t m_numComponents;
|
||||
//int64_t m_termIds [ MAX_QUERY_TERMS ];
|
||||
//char m_termSigns [ MAX_QUERY_TERMS ];
|
||||
//int32_t m_componentCodes [ MAX_QUERY_TERMS ];
|
||||
//char m_ignore [ MAX_QUERY_TERMS ]; // is term ignored?
|
||||
SafeBuf m_stackBuf;
|
||||
QueryTerm *m_qterms ;
|
||||
//int64_t *m_termIds ;
|
||||
//char *m_termSigns ;
|
||||
//int32_t *m_componentCodes ;
|
||||
//char *m_ignore ; // is term ignored?
|
||||
|
||||
int32_t m_numComponents;
|
||||
|
||||
// how many bits in the full vector?
|
||||
//int32_t m_numExplicitBits;
|
||||
@ -974,18 +994,27 @@ class Query {
|
||||
class Host *m_groupThatHasDocId;
|
||||
|
||||
// for holding the filtered query, in utf8
|
||||
char m_buf [ MAX_QUERY_LEN ];
|
||||
int32_t m_bufLen;
|
||||
//char m_buf [ MAX_QUERY_LEN ];
|
||||
//int32_t m_bufLen;
|
||||
|
||||
// for holding the filtered query, in utf8
|
||||
SafeBuf m_sb;
|
||||
char m_tmpBuf3[128];
|
||||
|
||||
// for holding the filtered/NULL-terminated query for doing
|
||||
// matching. basically store phrases in here without punct
|
||||
// so we can point a needle to them for matching in XmlDoc.cpp.
|
||||
char m_needleBuf [ MAX_QUERY_LEN + 1 ];
|
||||
int32_t m_needleBufLen;
|
||||
//char m_needleBuf [ MAX_QUERY_LEN + 1 ];
|
||||
//int32_t m_needleBufLen;
|
||||
|
||||
// the original query
|
||||
char m_orig [ MAX_QUERY_LEN ];
|
||||
//char m_orig [ MAX_QUERY_LEN ];
|
||||
//int32_t m_origLen;
|
||||
|
||||
char *m_orig;
|
||||
int32_t m_origLen;
|
||||
SafeBuf m_osb;
|
||||
char m_otmpBuf[128];
|
||||
|
||||
// we just have a ptr to this so don't pull the rug out
|
||||
//char *m_coll;
|
||||
|
26
RdbDump.cpp
26
RdbDump.cpp
@ -373,12 +373,12 @@ bool RdbDump::dumpTree ( bool recall ) {
|
||||
//if ( removeNegRecs )
|
||||
// m_list.removeNegRecs();
|
||||
|
||||
// if(!m_list->checkList_r ( false , // removeNegRecs?
|
||||
// false , // sleep on problem?
|
||||
// m_rdb->m_rdbId )) {
|
||||
// log("db: list to dump is not sane!");
|
||||
// char *xx=NULL;*xx=0;
|
||||
// }
|
||||
// if(!m_list->checkList_r ( false , // removeNegRecs?
|
||||
// false , // sleep on problem?
|
||||
// m_rdb->m_rdbId )) {
|
||||
// log("db: list to dump is not sane!");
|
||||
// char *xx=NULL;*xx=0;
|
||||
// }
|
||||
|
||||
|
||||
skip:
|
||||
@ -781,6 +781,10 @@ bool RdbDump::doneReadingForVerify ( ) {
|
||||
if ( m_addToMap ) t = gettimeofdayInMilliseconds();
|
||||
// sanity check
|
||||
if ( m_list->m_ks != m_ks ) { char *xx = NULL; *xx = 0; }
|
||||
|
||||
bool triedToFix = false;
|
||||
|
||||
tryAgain:
|
||||
// . register this with the map now
|
||||
// . only register AFTER it's ALL on disk so we don't get partial
|
||||
// record reads and we don't read stuff on disk that's also in tree
|
||||
@ -788,6 +792,16 @@ bool RdbDump::doneReadingForVerify ( ) {
|
||||
// . we don't have maps when we do unordered dumps
|
||||
// . careful, map is NULL if we're doing unordered dump
|
||||
if ( m_addToMap && m_map && ! m_map->addList ( m_list ) ) {
|
||||
// keys out of order in list from tree?
|
||||
if ( g_errno == ECORRUPTDATA ) {
|
||||
log("db: trying to fix tree or buckets");
|
||||
if ( m_tree ) m_tree->fixTree();
|
||||
//if ( m_buckets ) m_buckets->fixBuckets();
|
||||
if ( m_buckets ) { char *xx=NULL;*xx=0; }
|
||||
if ( triedToFix ) { char *xx=NULL;*xx=0; }
|
||||
triedToFix = true;
|
||||
goto tryAgain;
|
||||
}
|
||||
g_errno = ENOMEM;
|
||||
log("db: Failed to add data to map.");
|
||||
// undo the offset update, the write failed, the parent
|
||||
|
@ -624,7 +624,8 @@ bool RdbList::growList ( int32_t newSize ) {
|
||||
// don't shrink list
|
||||
if ( newSize <= m_allocSize ) return true;
|
||||
// debug msg
|
||||
//log("RdbList::growList from %"INT32" to %"INT32"",m_allocSize , newSize );
|
||||
// log("RdbList::growList 0x%"PTRFMT "from %"INT32" to %"INT32"",
|
||||
// (PTRTYPE)this,m_allocSize , newSize );
|
||||
// make a new buffer
|
||||
char *tmp =(char *) mrealloc ( m_alloc,m_allocSize,newSize,"RdbList");
|
||||
//if ( (int32_t)tmp == 0x904dbd0 )
|
||||
|
27
RdbMap.cpp
27
RdbMap.cpp
@ -323,8 +323,9 @@ bool RdbMap::verifyMap2 ( ) {
|
||||
KEYSET(lastKey,k,m_ks); continue; }
|
||||
// just bitch for now
|
||||
log(
|
||||
"db: Key out of order in map file %s%s. "
|
||||
"page = %"INT32". key offset = %"INT64". Map or data file is "
|
||||
"db: Key out of order in map file %s/%s. "
|
||||
"page = %"INT32". key offset = %"INT64". "
|
||||
"Map or data file is "
|
||||
"corrupt, but it is probably the data file. Please "
|
||||
"delete the map file and restart.",
|
||||
m_file.m_dir,m_file.getFilename() ,
|
||||
@ -337,6 +338,15 @@ bool RdbMap::verifyMap2 ( ) {
|
||||
KEY1(lastKey,m_ks),KEY0(lastKey));
|
||||
log("db: k.n1=%016"XINT64" n0=%016"XINT64"",KEY1(k,m_ks),KEY0(k));
|
||||
log("db: m_numPages = %"INT32"",m_numPages);
|
||||
|
||||
SafeBuf cmd;
|
||||
cmd.safePrintf("mv %s/%s %s/trash/",
|
||||
m_file.m_dir,
|
||||
m_file.getFilename(),
|
||||
g_hostdb.m_dir);
|
||||
log("db: %s",cmd.getBufStart() );
|
||||
gbsystem ( cmd.getBufStart() );
|
||||
|
||||
exit(0);
|
||||
//char *xx=NULL;*xx=0;
|
||||
// was k too small?
|
||||
@ -543,7 +553,8 @@ bool RdbMap::addRecord ( char *key, char *rec , int32_t recSize ) {
|
||||
m_lastLogTime = getTime();
|
||||
//pageNum > 0 && getKey(pageNum-1) > getKey(pageNum) ) {
|
||||
log(LOG_LOGIC,"build: RdbMap: added key out of order. "
|
||||
"count=%"INT64".",m_badKeys);
|
||||
"count=%"INT64" file=%s/%s.",m_badKeys,
|
||||
m_file.m_dir,m_file.getFilename());
|
||||
//log(LOG_LOGIC,"build: k.n1=%"XINT32" %"XINT64" lastKey.n1=%"XINT32" %"XINT64"",
|
||||
// key.n1,key.n0,m_lastKey.n1,m_lastKey.n0 );
|
||||
log(LOG_LOGIC,"build: offset=%"INT64"",
|
||||
@ -556,7 +567,10 @@ bool RdbMap::addRecord ( char *key, char *rec , int32_t recSize ) {
|
||||
g_errno = ECORRUPTDATA;
|
||||
return false;
|
||||
}
|
||||
char *xx=NULL;*xx=0;
|
||||
// if being called from RdbDump.cpp...
|
||||
g_errno = ECORRUPTDATA;
|
||||
return false;
|
||||
//char *xx=NULL;*xx=0;
|
||||
// . during a merge, corruption can happen, so let's core
|
||||
// here until we figure out how to fix it.
|
||||
// . any why wasn't the corruption discovered and patched
|
||||
@ -719,7 +733,10 @@ bool RdbMap::addList ( RdbList *list ) {
|
||||
if ( ! addRecord ( key , rec , recSize ) ) {
|
||||
log("db: Failed to add record to map: %s.",
|
||||
mstrerror(g_errno));
|
||||
char *xx = NULL; *xx = 0;
|
||||
// allow caller to try to fix the tree in the case of dumping
|
||||
// a tree to a file on disk
|
||||
return false;
|
||||
//char *xx = NULL; *xx = 0;
|
||||
}
|
||||
if ( list->skipCurrentRecord() ) goto top2;
|
||||
|
||||
|
19
RdbTree.cpp
19
RdbTree.cpp
@ -1283,19 +1283,26 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
|
||||
if ( m_right[i] >= 0 && m_parents[m_right[i]] != i )
|
||||
return log(
|
||||
"db: Tree right kid and parent disagree.");
|
||||
/*
|
||||
// MDW: why did i comment out the order checking?
|
||||
// check order
|
||||
if ( m_left[i] >= 0 ) {
|
||||
if ( m_left[i] >= 0 &&
|
||||
m_collnums[i] == m_collnums[m_left[i]] ) {
|
||||
char *key = &m_keys[i*m_ks];
|
||||
char *left = &m_keys[m_left[i]*m_ks];
|
||||
if ( KEYCMP(key,left,m_ks)<0) {char *xx=NULL;*xx=0;}
|
||||
if ( KEYCMP(key,left,m_ks)<0)
|
||||
return log("db: Tree left kid > parent %i",i);
|
||||
|
||||
}
|
||||
if ( m_right[i] >= 0 ) {
|
||||
if ( m_right[i] >= 0 &&
|
||||
m_collnums[i] == m_collnums[m_right[i]] ) {
|
||||
char *key = &m_keys[i*m_ks];
|
||||
char *right = &m_keys[m_right[i]*m_ks];
|
||||
if ( KEYCMP(key,right,m_ks)>0) {char *xx=NULL;*xx=0;}
|
||||
if ( KEYCMP(key,right,m_ks)>0)
|
||||
return log("db: Tree right kid < parent %i "
|
||||
"%s < %s",i,
|
||||
KEYSTR(right,m_ks),
|
||||
KEYSTR(key,m_ks) );
|
||||
}
|
||||
*/
|
||||
//g_loop.quickPoll(1, __PRETTY_FUNCTION__, __LINE__);
|
||||
}
|
||||
if ( hkp > 0 )
|
||||
|
@ -522,7 +522,8 @@ int32_t SafeBuf::safeSave (char *filename ) {
|
||||
}
|
||||
|
||||
|
||||
int32_t SafeBuf::fillFromFile(char *dir,char *filename) {
|
||||
int32_t SafeBuf::fillFromFile(char *dir,char *filename,char *label) {
|
||||
m_label = label;
|
||||
char buf[1024];
|
||||
if ( dir ) snprintf(buf,1024,"%s/%s",dir,filename);
|
||||
else snprintf(buf,1024,"%s",filename);
|
||||
|
15
SafeBuf.h
15
SafeBuf.h
@ -10,6 +10,9 @@
|
||||
* (for java programmers, very similar to the StringBuffer class, with all the speed that c++ allows).
|
||||
* Most of strings in Gigablast are handled by those.
|
||||
*/
|
||||
|
||||
#include "iana_charset.h"
|
||||
|
||||
class SafeBuf {
|
||||
public:
|
||||
//*TRUCTORS
|
||||
@ -33,8 +36,11 @@ public:
|
||||
// want SafeBuf to free the data for you. Keep in mind, all
|
||||
// previous content in SafeBuf will be cleared when you pass it
|
||||
// a new buffer.
|
||||
bool setBuf(char *newBuf, int32_t bufMax, int32_t bytesInUse, bool ownData,
|
||||
int16_t encoding );
|
||||
bool setBuf(char *newBuf,
|
||||
int32_t bufMax,
|
||||
int32_t bytesInUse,
|
||||
bool ownData,
|
||||
int16_t encoding = csUTF8 );
|
||||
// yieldBuf() allows you to take over the buffer in SafeBuf.
|
||||
// You may only free the data if it was originally owned by
|
||||
// the SafeBuf.
|
||||
@ -67,8 +73,9 @@ public:
|
||||
int32_t safeSave (char *filename );
|
||||
|
||||
int32_t fillFromFile(char *filename);
|
||||
int32_t fillFromFile(char *dir,char *filename);
|
||||
int32_t load(char *dir,char *fname) { return fillFromFile(dir,fname);};
|
||||
int32_t fillFromFile(char *dir,char *filename, char *label=NULL);
|
||||
int32_t load(char *dir,char *fname,char *label = NULL) {
|
||||
return fillFromFile(dir,fname,label);};
|
||||
int32_t load(char *fname) { return fillFromFile(fname);};
|
||||
|
||||
void filterTags();
|
||||
|
@ -50,14 +50,16 @@ void SearchInput::clear ( int32_t niceness ) {
|
||||
key_t SearchInput::makeKey ( ) {
|
||||
// hash the query
|
||||
int32_t n = m_q.getNumTerms ();
|
||||
int64_t *termIds = m_q.getTermIds ();
|
||||
char *signs = m_q.getTermSigns ();
|
||||
//int64_t *termIds = m_q.getTermIds ();
|
||||
//char *signs = m_q.getTermSigns ();
|
||||
key_t k;
|
||||
k.n1 = 0;
|
||||
k.n0 = hash64 ( (char *)termIds , n * sizeof(int64_t) );
|
||||
k.n0 = hash64 ( (char *)signs , n , k.n0 );
|
||||
//k.n0 = hash64 ( (char *)termIds , n * sizeof(int64_t) );
|
||||
//k.n0 = hash64 ( (char *)signs , n , k.n0 );
|
||||
// user defined weights, for weighting each query term separately
|
||||
for ( int32_t i = 0 ; i < n ; i++ ) {
|
||||
k.n0 = hash64 ((char *)&m_q.m_qterms[i].m_termId ,4, k.n0);
|
||||
k.n0 = hash64 ((char *)&m_q.m_qterms[i].m_termSign ,1, k.n0);
|
||||
k.n0 = hash64 ((char *)&m_q.m_qterms[i].m_userWeight,4, k.n0);
|
||||
k.n0 = hash64 ((char *)&m_q.m_qterms[i].m_userType ,1, k.n0);
|
||||
}
|
||||
@ -468,14 +470,16 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
|
||||
log("query: qlang of \"%s\" is NOT SUPPORTED. using "
|
||||
"langUnknown, \"xx\".",langAbbr);
|
||||
|
||||
int32_t maxQueryTerms = cr->m_maxQueryTerms;
|
||||
|
||||
// . the query to use for highlighting... can be overriden with "hq"
|
||||
// . we need the language id for doing synonyms
|
||||
if ( m_prepend && m_prepend[0] )
|
||||
m_hqq.set2 ( m_prepend , m_queryLangId , true );
|
||||
m_hqq.set2 ( m_prepend , m_queryLangId , true ,maxQueryTerms);
|
||||
else if ( m_highlightQuery && m_highlightQuery[0] )
|
||||
m_hqq.set2 ( m_highlightQuery , m_queryLangId , true );
|
||||
m_hqq.set2 (m_highlightQuery,m_queryLangId,true,maxQueryTerms);
|
||||
else if ( m_query && m_query[0] )
|
||||
m_hqq.set2 ( m_query , m_queryLangId , true );
|
||||
m_hqq.set2 ( m_query , m_queryLangId , true,maxQueryTerms);
|
||||
|
||||
// log it here
|
||||
log(LOG_INFO,
|
||||
@ -487,7 +491,9 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
|
||||
// . returns false and sets g_errno on error (ETOOMANYOPERANDS)
|
||||
if ( ! m_q.set2 ( m_sbuf1.getBufStart(),
|
||||
m_queryLangId ,
|
||||
m_queryExpansion ) ) {
|
||||
m_queryExpansion ,
|
||||
true , // use QUERY stopwords?
|
||||
maxQueryTerms ) ) {
|
||||
g_msg = " (error: query has too many operands)";
|
||||
return false;
|
||||
}
|
||||
@ -823,6 +829,9 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
|
||||
m_sbuf2.safeStrcpy(" AND ");
|
||||
}
|
||||
}
|
||||
m_sbuf1.setLabel("sisbuf1");
|
||||
m_sbuf2.setLabel("sisbuf2");
|
||||
m_sbuf3.setLabel("sisbuf3");
|
||||
// append the natural query
|
||||
if ( m_query && m_query[0] ) {
|
||||
//if ( p > pstart ) *p++ = ' ';
|
||||
|
@ -1164,7 +1164,9 @@ bool Sections::set ( Words *w ,
|
||||
xh ^= g_hashtab[cnt++][(unsigned char )*p];
|
||||
}
|
||||
// sanity check
|
||||
if ( ! xh ) { char *xx=NULL;*xx=0; }
|
||||
//if ( ! xh ) { char *xx=NULL;*xx=0; }
|
||||
// if it is a string of the same chars it can be 0
|
||||
if ( ! xh ) xh = 1;
|
||||
// store that
|
||||
sn->m_xmlNameHash = (int32_t)xh;
|
||||
}
|
||||
|
23
Spider.cpp
23
Spider.cpp
@ -11641,6 +11641,18 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
if ( strncmp ( p , "isfakeip",8 ) == 0 ) {
|
||||
// skip for msg20
|
||||
if ( isForMsg20 ) continue;
|
||||
// if no match continue
|
||||
if ( (bool)sreq->m_fakeFirstIp == val ) continue;
|
||||
p += 8;
|
||||
p = strstr(p, "&&");
|
||||
if ( ! p ) return i;
|
||||
p += 2;
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
if ( strncmp ( p , "isonsamedomain",14 ) == 0 ) {
|
||||
// skip for msg20
|
||||
if ( isForMsg20 ) continue;
|
||||
@ -13993,6 +14005,17 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
|
||||
return msg->safePrintf("Job is initializing.");
|
||||
}
|
||||
|
||||
// if we had seeds and none were successfully crawled, do not just
|
||||
// print that the crawl completed.
|
||||
if ( cx->m_collectiveRespiderFrequency <= 0.0 &&
|
||||
cx->m_isCustomCrawl &&
|
||||
! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider &&
|
||||
cx->m_globalCrawlInfo.m_pageDownloadAttempts > 0 &&
|
||||
cx->m_globalCrawlInfo.m_pageDownloadSuccesses == 0 ) {
|
||||
*status = SP_SEEDSERROR;
|
||||
return msg->safePrintf("Failed to crawl any seed.");
|
||||
}
|
||||
|
||||
// if we sent an email simply because no urls
|
||||
// were left and we are not recrawling!
|
||||
if ( cx->m_collectiveRespiderFrequency <= 0.0 &&
|
||||
|
1
Spider.h
1
Spider.h
@ -39,6 +39,7 @@
|
||||
#define SP_INPROGRESS 7 // it is going on!
|
||||
#define SP_ADMIN_PAUSED 8 // g_conf.m_spideringEnabled = false
|
||||
#define SP_COMPLETED 9 // crawl is done, and no repeatCrawl is scheduled
|
||||
#define SP_SEEDSERROR 10 // all seeds had an error preventing crawling
|
||||
|
||||
bool tryToDeleteSpiderColl ( SpiderColl *sc , char *msg ) ;
|
||||
void spiderRoundIncremented ( class CollectionRec *cr ) ;
|
||||
|
14
Stats.cpp
14
Stats.cpp
@ -44,13 +44,13 @@ Stats::Stats ( ) {
|
||||
m_totalSpiderSuccessOld = 0;
|
||||
m_totalSpiderErrorsOld = 0;
|
||||
m_msg3aRecallCnt = 0;
|
||||
m_tierHits[0] = 0;
|
||||
m_tierHits[1] = 0;
|
||||
m_tierHits[2] = 0;
|
||||
m_tier2Misses = 0;
|
||||
m_tierTimes[0] = 0;
|
||||
m_tierTimes[1] = 0;
|
||||
m_tierTimes[2] = 0;
|
||||
// m_tierHits[0] = 0;
|
||||
// m_tierHits[1] = 0;
|
||||
// m_tierHits[2] = 0;
|
||||
// m_tier2Misses = 0;
|
||||
// m_tierTimes[0] = 0;
|
||||
// m_tierTimes[1] = 0;
|
||||
// m_tierTimes[2] = 0;
|
||||
//m_totalDedupCand = 0;
|
||||
//m_dedupedCand = 0;
|
||||
//m_bannedDups = 0;
|
||||
|
10
Stats.h
10
Stats.h
@ -11,7 +11,7 @@
|
||||
|
||||
#include "SafeBuf.h"
|
||||
#include "UdpProtocol.h" // MAX_MSG_TYPES
|
||||
#include "IndexReadInfo.h"
|
||||
//#include "IndexReadInfo.h"
|
||||
|
||||
class StatPoint {
|
||||
public:
|
||||
@ -143,8 +143,8 @@ class Stats {
|
||||
// when we just request more docids from the same tier
|
||||
int32_t m_msg3aFastRecalls;
|
||||
// how many resolutions did we get on each tier
|
||||
int32_t m_tierHits [MAX_TIERS];
|
||||
int64_t m_tierTimes[MAX_TIERS];
|
||||
//int32_t m_tierHits [MAX_TIERS];
|
||||
//int64_t m_tierTimes[MAX_TIERS];
|
||||
// how many searches did not get enough results?
|
||||
int32_t m_tier2Misses;
|
||||
// one count for each CR_* defined in Msg51.h
|
||||
@ -160,8 +160,8 @@ class Stats {
|
||||
//int32_t m_errored;
|
||||
int32_t m_msg3aRecalls[6];
|
||||
SafeBuf m_keyCols;
|
||||
int32_t m_numTermsVsTier[14][MAX_TIERS];
|
||||
int32_t m_termsVsTierExp[14][MAX_TIERS][7];
|
||||
//int32_t m_numTermsVsTier[14][MAX_TIERS];
|
||||
//int32_t m_termsVsTierExp[14][MAX_TIERS][7];
|
||||
|
||||
// use m_start so we know what msg stats to clear with memset
|
||||
char m_start;
|
||||
|
53
Summary.cpp
53
Summary.cpp
@ -12,6 +12,8 @@ Summary::Summary()
|
||||
//m_buf = NULL;
|
||||
m_bitScoresBuf = NULL;
|
||||
m_bitScoresBufSize = 0;
|
||||
m_wordWeights = NULL;
|
||||
m_buf4 = NULL;
|
||||
reset();
|
||||
}
|
||||
|
||||
@ -36,6 +38,15 @@ void Summary::reset() {
|
||||
m_numExcerpts = 0;
|
||||
m_summaryLocs.reset();
|
||||
m_summaryLocsPops.reset();
|
||||
if ( m_wordWeights && m_wordWeights != (float *)m_tmpBuf ) {
|
||||
mfree ( m_wordWeights , m_wordWeightSize , "sumww");
|
||||
m_wordWeights = NULL;
|
||||
}
|
||||
m_wordWeights = NULL;
|
||||
if ( m_buf4 && m_buf4 != m_tmpBuf4 ) {
|
||||
mfree ( m_buf4 , m_buf4Size , "ssstkb" );
|
||||
m_buf4 = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -151,6 +162,15 @@ bool Summary::set2 ( Xml *xml ,
|
||||
end - start );
|
||||
start = gettimeofdayInMilliseconds();*/
|
||||
//
|
||||
int32_t need1 = q->m_numWords * sizeof(float);
|
||||
m_wordWeightSize = need1;
|
||||
if ( need1 < 128 )
|
||||
m_wordWeights = (float *)m_tmpBuf;
|
||||
else
|
||||
m_wordWeights = (float *)mmalloc ( need1 , "wwsum" );
|
||||
if ( ! m_wordWeights ) return false;
|
||||
|
||||
|
||||
|
||||
// zero out all word weights
|
||||
for ( int32_t i = 0 ; i < q->m_numWords; i++ )
|
||||
@ -229,11 +249,25 @@ bool Summary::set2 ( Xml *xml ,
|
||||
pend = m_summary + maxSummaryLen;
|
||||
m_numExcerpts = 0;
|
||||
|
||||
int32_t need2 = (1+1+1) * m_q->m_numWords;
|
||||
m_buf4Size = need2;
|
||||
if ( need2 < 128 )
|
||||
m_buf4 = m_tmpBuf4;
|
||||
else
|
||||
m_buf4 = (char *)mmalloc ( need2 , "stkbuf" );
|
||||
if ( ! m_buf4 ) return false;
|
||||
char *x = m_buf4;
|
||||
char *retired = x;
|
||||
x += m_q->m_numWords;
|
||||
char *maxGotIt = x;
|
||||
x += m_q->m_numWords;
|
||||
char *gotIt = x;
|
||||
|
||||
// . the "maxGotIt" count vector accumulates into "retired"
|
||||
// . that is how we keep track of what query words we used for previous
|
||||
// summary excerpts so we try to get diversified excerpts with
|
||||
// different query terms/words in them
|
||||
char retired [ MAX_QUERY_WORDS ];
|
||||
//char retired [ MAX_QUERY_WORDS ];
|
||||
memset ( retired, 0, m_q->m_numWords * sizeof(char) );
|
||||
|
||||
// some query words are already matched in the title
|
||||
@ -260,7 +294,7 @@ bool Summary::set2 ( Xml *xml ,
|
||||
int32_t maxb = 0;
|
||||
int32_t maxi = -1;
|
||||
int32_t lasta = -1;
|
||||
char maxGotIt [ MAX_QUERY_WORDS ];
|
||||
//char maxGotIt [ MAX_QUERY_WORDS ];
|
||||
|
||||
if(lastNumFinal == numFinal) {
|
||||
if(maxLoops-- <= 0) {
|
||||
@ -296,7 +330,7 @@ bool Summary::set2 ( Xml *xml ,
|
||||
if ( skip ) continue;
|
||||
|
||||
// ask him for the query words he matched
|
||||
char gotIt [ MAX_QUERY_WORDS ];
|
||||
//char gotIt [ MAX_QUERY_WORDS ];
|
||||
// clear it for him
|
||||
memset ( gotIt, 0, m_q->m_numWords * sizeof(char) );
|
||||
|
||||
@ -558,6 +592,12 @@ bool Summary::set2 ( Xml *xml ,
|
||||
m_displayLen = p - m_summary;
|
||||
}
|
||||
|
||||
// free the mem we used if we allocated it
|
||||
if ( m_buf4 && m_buf4 != m_tmpBuf4 ) {
|
||||
mfree ( m_buf4 , m_buf4Size , "ssstkb" );
|
||||
m_buf4 = NULL;
|
||||
}
|
||||
|
||||
|
||||
// If we still didn't find a summary, get the default summary
|
||||
if ( p == m_summary ) {
|
||||
@ -570,6 +610,7 @@ bool Summary::set2 ( Xml *xml ,
|
||||
maxSummaryLen );
|
||||
if ( m_numDisplayLines > 0 )
|
||||
m_displayLen = m_summaryLen;
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
@ -1211,7 +1252,7 @@ bool Summary::set1 ( char *doc ,
|
||||
int32_t numTerms = q->getNumTerms();
|
||||
// . now assign scores based on term frequencies
|
||||
// . highest score is 10000, then 9900, 9800, 9700, ...
|
||||
int32_t ptrs [ MAX_QUERY_TERMS ];
|
||||
int32_t ptrs [ ABS_MAX_QUERY_TERMS ];
|
||||
for ( int32_t i = 0 ; i < numTerms ; i++ ) ptrs[i] = i;
|
||||
// convenience var
|
||||
int64_t *freqs = termFreqs; // q->getTermFreqs();
|
||||
@ -1232,7 +1273,7 @@ bool Summary::set1 ( char *doc ,
|
||||
}
|
||||
}
|
||||
// assign scores, give rarest terms highest score
|
||||
int32_t scores [ MAX_QUERY_TERMS ];
|
||||
int32_t scores [ ABS_MAX_QUERY_TERMS ];
|
||||
for ( int32_t i = 0 ; i < numTerms ; i++ )
|
||||
scores[ptrs[i]] = 10000000 - (i*100);
|
||||
// force QUERY stop words to have much lower scores at most 10000
|
||||
@ -1441,7 +1482,7 @@ bool Summary::set1 ( char *doc ,
|
||||
int32_t maxi = -1;
|
||||
int32_t maxa = 0;
|
||||
int32_t maxb = 0;
|
||||
char gotIt [ MAX_QUERY_TERMS ];
|
||||
char gotIt [ ABS_MAX_QUERY_TERMS ];
|
||||
char *maxleft = NULL;
|
||||
char *maxright = NULL;
|
||||
for ( int32_t i = 0 ; i < numMatches ; i++ ) {
|
||||
|
@ -266,7 +266,14 @@ class Summary {
|
||||
|
||||
char *m_bitScoresBuf;
|
||||
int32_t m_bitScoresBufSize;
|
||||
float m_wordWeights[MAX_QUERY_WORDS];
|
||||
//float m_wordWeights[MAX_QUERY_WORDS];
|
||||
float *m_wordWeights;
|
||||
int32_t m_wordWeightSize;
|
||||
char m_tmpBuf[128];
|
||||
|
||||
char *m_buf4;
|
||||
int32_t m_buf4Size;
|
||||
char m_tmpBuf4[128];
|
||||
|
||||
char m_summaryLocBuf[MAX_SUMMARY_LOCS*sizeof(uint64_t)];
|
||||
SafeBuf m_summaryLocs;
|
||||
|
@ -12,6 +12,7 @@
|
||||
#include "Wiktionary.h"
|
||||
|
||||
Synonyms::Synonyms() {
|
||||
m_synWordBuf.setLabel("syswbuf");
|
||||
}
|
||||
|
||||
Synonyms::~Synonyms() {
|
||||
|
@ -5049,8 +5049,8 @@ bool Tagdb::loadMinSiteInlinksBuffer2 ( ) {
|
||||
// use 4 bytes for the first 130,000 entries or so to hold
|
||||
// # of site inlinks. then we only need 1 byte since the remaining
|
||||
// 25M are <256 sitenuminlinksunqiecblocks
|
||||
m_siteBuf1.load(g_hostdb.m_dir,"sitelinks1.dat");
|
||||
m_siteBuf2.load(g_hostdb.m_dir,"sitelinks2.dat");
|
||||
m_siteBuf1.load(g_hostdb.m_dir,"sitelinks1.dat","stelnks1");
|
||||
m_siteBuf2.load(g_hostdb.m_dir,"sitelinks2.dat","stelnks2");
|
||||
|
||||
m_siteBuf1.setLabel("sitelnks");
|
||||
m_siteBuf2.setLabel("sitelnks");
|
||||
|
46
XmlDoc.cpp
46
XmlDoc.cpp
@ -2565,11 +2565,10 @@ bool XmlDoc::indexDoc ( ) {
|
||||
SafeBuf *ssDocMetaList = NULL;
|
||||
// save this
|
||||
int32_t saved = m_indexCode;
|
||||
// and make it the real reason for the spider status doc
|
||||
// make it the real reason for the spider status doc
|
||||
m_indexCode = EDNSERROR;
|
||||
// get the spiderreply ready to be added
|
||||
|
||||
ssDocMetaList = getSpiderStatusDocMetaList(NULL ,false);//del
|
||||
// get the spiderreply ready to be added. false=del
|
||||
ssDocMetaList =getSpiderStatusDocMetaList(NULL ,false);
|
||||
// revert
|
||||
m_indexCode = saved;
|
||||
// error?
|
||||
@ -2586,8 +2585,11 @@ bool XmlDoc::indexDoc ( ) {
|
||||
|
||||
char *url = "unknown";
|
||||
if ( m_sreqValid ) url = m_sreq.m_url;
|
||||
log("build: error2 getting real firstip of %"INT32" for "
|
||||
"%s. Not adding new spider req", (int32_t)*fip,url);
|
||||
log("build: error2 getting real firstip of "
|
||||
"%"INT32" for "
|
||||
"%s. Not adding new spider req. "
|
||||
"spiderstatusdocsize=%"INT32, (int32_t)*fip,url,
|
||||
m_addedStatusDocSize);
|
||||
// also count it as a crawl attempt
|
||||
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
|
||||
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
|
||||
@ -3130,8 +3132,9 @@ bool isRobotsTxtFile ( char *u , int32_t ulen ) {
|
||||
bool XmlDoc::isContainerDoc ( ) {
|
||||
if ( m_firstUrlValid && m_firstUrl.isWarc() ) return true;
|
||||
if ( m_firstUrlValid && m_firstUrl.isArc () ) return true;
|
||||
if ( ! m_contentDelimValid ) { char *xx=NULL;*xx=0; }
|
||||
if ( m_contentDelim ) return true;
|
||||
//if ( ! m_contentDelimValid ) { char *xx=NULL;*xx=0; }
|
||||
//if ( m_contentDelim ) return true;
|
||||
if ( m_contentDelimValid && m_contentDelim ) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -9617,11 +9620,15 @@ float computeSimilarity ( int32_t *vec0 ,
|
||||
// . stock the query term hash table
|
||||
// . use the lower 32 bits of the termids to make compatible
|
||||
// with the other vectors we use
|
||||
int64_t *qtids = q->getTermIds ();
|
||||
//int64_t *qtids = q->getTermIds ();
|
||||
int32_t nt = q->getNumTerms();
|
||||
for ( int32_t i = 0 ; i < nt ; i++ ) {
|
||||
// get query term
|
||||
QueryTerm *QT = &q->m_qterms[i];
|
||||
// get the termid
|
||||
int64_t termId = QT->m_termId;
|
||||
// get it
|
||||
uint32_t h = (uint32_t)(qtids[i] & 0xffffffff);
|
||||
uint32_t h = (uint32_t)(termId & 0xffffffff);
|
||||
// hash it
|
||||
if ( ! qt.addKey ( &h ) ) return -1;
|
||||
}
|
||||
@ -28672,6 +28679,11 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
|
||||
jd.safePrintf("\"gbssHttpStatus\":%"INT32",\n",
|
||||
(int32_t)m_httpStatus);
|
||||
|
||||
// do not index gbssIsSeedUrl:0 because there will be too many usually
|
||||
bool isSeed = ( m_sreqValid && m_sreq.m_isAddUrl );
|
||||
if ( isSeed )
|
||||
jd.safePrintf("\"gbssIsSeedUrl\":1,\n");
|
||||
|
||||
if ( od )
|
||||
jd.safePrintf("\"gbssWasIndexed\":1,\n");
|
||||
else
|
||||
@ -28696,6 +28708,18 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
|
||||
else
|
||||
jd.safePrintf("\"gbssDiffbotUri\":"
|
||||
"\"none\",\n");
|
||||
// show the type as gbssDiffbotType:"article" etc.
|
||||
JsonItem *dti = NULL;
|
||||
if ( jp1 )
|
||||
dti = jp1->getItem("type");
|
||||
if ( dti ) {
|
||||
jd.safePrintf("\"gbssDiffbotType\":\"");
|
||||
int32_t vlen;
|
||||
char *val = dti->getValueAsString( &vlen );
|
||||
if ( val ) jd.jsonEncode ( val , vlen );
|
||||
jd.safePrintf("\",\n");
|
||||
}
|
||||
|
||||
}
|
||||
else { // if ( cr->m_isCustomCrawl ) {
|
||||
jd.safePrintf("\"gbssIsDiffbotObject\":0,\n");
|
||||
@ -45262,7 +45286,7 @@ SafeBuf *XmlDoc::getMatchingQueriesScoredForFullQuery ( ) {
|
||||
// prepend to the query?
|
||||
int32_t ulen = m_firstUrl.m_ulen;
|
||||
// go to next guy if this query is too big already
|
||||
if ( ulen + qlen + 10 > MAX_QUERY_LEN ) {
|
||||
if ( ulen + qlen + 10 > ABS_MAX_QUERY_LEN ) {
|
||||
m_queryNum++;
|
||||
goto loop;
|
||||
}
|
||||
|
11
hash.cpp
11
hash.cpp
@ -232,6 +232,17 @@ uint64_t hash64d ( char *p, int32_t plen ) {
|
||||
char ncs = utf8Encode ( x , (char *)tmp );
|
||||
// sanity check
|
||||
if ( ncs > 4 ) { char *xx=NULL;*xx=0; }
|
||||
// i've seen this happen for 4 byte char =
|
||||
// -16,-112,-51,-125 which has x=66371 and y=66371
|
||||
// but utf8Encode() returned 0!
|
||||
if ( ncs == 0 ) {
|
||||
// let's just hash it as-is then
|
||||
tmp[0] = p[0];
|
||||
if ( cs >= 1 ) tmp[1] = p[1];
|
||||
if ( cs >= 2 ) tmp[2] = p[2];
|
||||
if ( cs >= 3 ) tmp[3] = p[3];
|
||||
ncs = cs;
|
||||
}
|
||||
// hash it up
|
||||
h ^= g_hashtab [i++][tmp[0]];
|
||||
if ( ncs == 1 ) continue;
|
||||
|
33
hash.h
33
hash.h
@ -250,6 +250,17 @@ inline uint64_t hash64Lower_utf8_nospaces ( char *p, int32_t len ) {
|
||||
char ncs = utf8Encode ( y , tmp );
|
||||
// sanity check
|
||||
if ( ncs > 4 ) { char *xx=NULL;*xx=0; }
|
||||
// i've seen this happen for 4 byte char =
|
||||
// -16,-112,-51,-125 which has x=66371 and y=66371
|
||||
// but utf8Encode() returned 0!
|
||||
if ( ncs == 0 ) {
|
||||
// let's just hash it as-is then
|
||||
tmp[0] = p[0];
|
||||
if ( cs >= 1 ) tmp[1] = p[1];
|
||||
if ( cs >= 2 ) tmp[2] = p[2];
|
||||
if ( cs >= 3 ) tmp[3] = p[3];
|
||||
ncs = cs;
|
||||
}
|
||||
// hash it up
|
||||
h ^= g_hashtab [i++][(uint8_t)tmp[0]];
|
||||
if ( ncs == 1 ) continue;
|
||||
@ -301,6 +312,17 @@ inline uint64_t hash64Lower_utf8_cont ( char *p,
|
||||
char ncs = utf8Encode ( y , tmp );
|
||||
// sanity check
|
||||
if ( ncs > 4 ) { char *xx=NULL;*xx=0; }
|
||||
// i've seen this happen for 4 byte char =
|
||||
// -16,-112,-51,-125 which has x=66371 and y=66371
|
||||
// but utf8Encode() returned 0!
|
||||
if ( ncs == 0 ) {
|
||||
// let's just hash it as-is then
|
||||
tmp[0] = p[0];
|
||||
if ( cs >= 1 ) tmp[1] = p[1];
|
||||
if ( cs >= 2 ) tmp[2] = p[2];
|
||||
if ( cs >= 3 ) tmp[3] = p[3];
|
||||
ncs = cs;
|
||||
}
|
||||
// hash it up
|
||||
h ^= g_hashtab [i++][(uint8_t)tmp[0]];
|
||||
if ( ncs == 1 ) continue;
|
||||
@ -376,6 +398,17 @@ inline uint64_t hash64Lower_utf8 ( char *p ) {
|
||||
char ncs = utf8Encode ( y , (char *)tmp );
|
||||
// sanity check
|
||||
if ( ncs > 4 ) { char *xx=NULL;*xx=0; }
|
||||
// i've seen this happen for 4 byte char =
|
||||
// -16,-112,-51,-125 which has x=66371 and y=66371
|
||||
// but utf8Encode() returned 0!
|
||||
if ( ncs == 0 ) {
|
||||
// let's just hash it as-is then
|
||||
tmp[0] = p[0];
|
||||
if ( cs >= 1 ) tmp[1] = p[1];
|
||||
if ( cs >= 2 ) tmp[2] = p[2];
|
||||
if ( cs >= 3 ) tmp[3] = p[3];
|
||||
ncs = cs;
|
||||
}
|
||||
// hash it up
|
||||
h ^= g_hashtab [i++][(uint8_t)tmp[0]];
|
||||
if ( ncs == 1 ) continue;
|
||||
|
2
main.cpp
2
main.cpp
@ -4998,7 +4998,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
|
||||
if ( ! f.doesExist() ) target = "gb";
|
||||
|
||||
sprintf(tmp,
|
||||
"scp -c blowfish " // blowfish is faster
|
||||
"scp -c arcfour " // blowfish is faster
|
||||
"%s%s "
|
||||
"%s:%s/gb.installed%s",
|
||||
dir,
|
||||
|
Loading…
x
Reference in New Issue
Block a user