Merge branch 'diffbot-testing' into diffbot-sam

This commit is contained in:
Matt 2015-07-23 09:27:00 -06:00
commit da41d53575
50 changed files with 1046 additions and 307 deletions

@ -3579,7 +3579,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
// lower from 7 to 1 since we have so many collections now
// ok, now we have many fewer colls so raise back to 7
int32_t diffbotipms = 7;// 1; // 7
int32_t diffbotipms = 7;//1; // 7
// make the gigablast regex table just "default" so it does no
// filtering, but accepts all urls. we will add code to pass the urls
@ -3599,8 +3599,8 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_spiderIpWaits [i] = wait;
m_spiderIpMaxSpiders[i] = diffbotipms; // keep it respectful
// ethan wants some speed
if ( isEthan )
m_spiderIpMaxSpiders[i] = 30;
// if ( isEthan )
// m_spiderIpMaxSpiders[i] = 30;
//m_spidersEnabled [i] = 1;
m_spiderFreqs [i] = respiderFreq;
//m_spiderDiffbotApiUrl[i].purge();
@ -3623,6 +3623,18 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_forceDelete [i] = 1;
i++;
// de-prioritize fakefirstip urls so we don't give the impression our
// spiders are slow. like if someone adds a bulk job with 100,000 urls
// then we sit there and process to lookup their ips and add a real
// spider request (if it falls onto the same shard) before we actually
// do any real spidering. so keep the priority here low.
m_regExs[i].set("isfakeip");
m_maxSpidersPerRule [i] = 7;
m_spiderIpMaxSpiders [i] = 7;
m_spiderPriorities [i] = 20;
m_spiderIpWaits [i] = 0;
i++;
// hopcount filter if asked for
if( m_diffbotMaxHops >= 0 ) {

@ -18,6 +18,8 @@ void HashTableX::constructor() {
m_useKeyMagic = false;
m_ks = 0;
m_allowGrowth = true;
m_numSlots = 0;
m_numSlotsUsed = 0;
}
void HashTableX::destructor() {

@ -160,8 +160,8 @@ int32_t Highlight::set ( SafeBuf *sb ,
// . set the anchor counts to 1000*i+1 for each possible query term num
// . yes, i know, why +1? because we're assuming the query terms
// have been highlighted before us
for ( int32_t i = 0 ; i < MAX_QUERY_TERMS ; i++ )
m_anchorCounts[i] = 1000*i + 1;
//for ( int32_t i = 0 ; i < MAX_QUERY_TERMS ; i++ )
// m_anchorCounts[i] = 1000*i + 1;
// set lengths of provided front/back highlight tags
if ( m_frontTag ) m_frontTagLen = gbstrlen ( frontTag );
if ( m_backTag ) m_backTagLen = gbstrlen ( backTag );
@ -170,6 +170,10 @@ int32_t Highlight::set ( SafeBuf *sb ,
//m_bufLen = bufLen;
//m_bufPtr = buf;
m_sb = sb;
// label it
m_sb->setLabel ("highw");
// save room for terminating \0
//m_bufEnd = m_buf + m_bufLen - 1;

@ -70,7 +70,7 @@ class Highlight {
bool m_doStemming;
bool m_useAnchors; // click and scroll technology for cached pages
int32_t m_anchorCounts [ MAX_QUERY_TERMS ];
//int32_t m_anchorCounts [ MAX_QUERY_TERMS ];
const char *m_baseUrl;
int32_t m_numMatches;

@ -1514,6 +1514,10 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
// is recycled/destroyed
// . this will call getMsgPiece() to fill up sendBuf from file
int32_t totalToSend = mimeLen + bytesToSend;
//s->m_state = NULL; // do we need this? yes, cuz s is NULL for cleanUp
if ( s && s->m_state == f ) s->m_state = NULL;
//if ( ! m_tcp.sendMsg ( s ,
if ( ! tcp->sendMsg ( s ,
sendBuf ,
@ -1542,7 +1546,6 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
if ( ! f->isOpen() ) f->open( O_RDONLY );
int fd = f->getfd();
cleanUp ( f , NULL/*TcpSocket */ );
s->m_state = NULL; // do we need this? yes, cuz s is NULL for cleanUp
// . AND we need to do this ourselves here
// . do it SILENTLY so no message is logged if fd not registered
if (tcp->m_useSSL)

@ -10,7 +10,7 @@ CC=g++
# remove dlstubs.o for CYGWIN
OBJS = UdpSlot.o Rebalance.o \
Msg13.o Mime.o IndexReadInfo.o \
Msg13.o Mime.o \
PageGet.o PageHosts.o \
PageParser.o PageInject.o PagePerf.o PageReindex.o PageResults.o \
PageAddUrl.o PageRoot.o PageSockets.o PageStats.o \

@ -24,10 +24,24 @@
Matches::Matches ( ) {
m_detectSubPhrases = false;
m_numMatchGroups = 0;
m_qwordFlags = NULL;
m_qwordAllocSize = 0;
reset();
}
Matches::~Matches( ) { reset(); }
void Matches::reset ( ) {
reset2();
if ( m_qwordFlags && m_qwordFlags != (mf_t *)m_tmpBuf ) {
mfree ( m_qwordFlags , m_qwordAllocSize , "mmqw" );
m_qwordFlags = NULL;
}
//m_explicitsMatched = 0;
//m_matchableRequiredBits = 0;
//m_hasAllQueryTerms = false;
//m_matchesQuery = false;
}
void Matches::reset2() {
m_numMatches = 0;
//m_maxNQT = -1;
m_numAlnums = 0;
@ -39,10 +53,6 @@ void Matches::reset ( ) {
m_bitsArray [i].reset();
}
m_numMatchGroups = 0;
//m_explicitsMatched = 0;
//m_matchableRequiredBits = 0;
//m_hasAllQueryTerms = false;
//m_matchesQuery = false;
}
bool Matches::isMatchableTerm ( QueryTerm *qt ) { // , int32_t i ) {
@ -103,6 +113,20 @@ void Matches::setQuery ( Query *q ) {
//memset ( m_foundNegTermVector, 0, m_q->getNumTerms() );
if ( m_qwordFlags ) { char *xx=NULL;*xx=0; }
int32_t need = m_q->m_numWords * sizeof(mf_t) ;
m_qwordAllocSize = need;
if ( need < 128 )
m_qwordFlags = (mf_t *)m_tmpBuf;
else
m_qwordFlags = (mf_t *)mmalloc ( need , "mmqf" );
if ( ! m_qwordFlags ) {
log("matches: alloc failed for query %s",q->m_orig);
return;
}
// this is word based. these are each 1 byte
memset ( m_qwordFlags , 0 , m_q->m_numWords * sizeof(mf_t));
@ -278,7 +302,7 @@ bool Matches::set ( XmlDoc *xd ,
int32_t niceness ) {
// don't reset query info!
reset();
reset2();
// sanity check
if ( ! xd->m_docIdValid ) { char *xx=NULL;*xx=0; }

@ -142,6 +142,7 @@ class Matches {
Matches ( ) ;
~Matches( ) ;
void reset ( ) ;
void reset2 ( ) ;
// BIG HACK support
//int32_t getTermsFound ( bool *hadPhrases , bool *hadWords );
@ -183,7 +184,10 @@ class Matches {
// . 1-1 with Query::m_qwords[] array of QWords
// . shows the match flags for that query word
mf_t m_qwordFlags[MAX_QUERY_WORDS];
//mf_t m_qwordFlags[MAX_QUERY_WORDS];
mf_t *m_qwordFlags;
int32_t m_qwordAllocSize;
char m_tmpBuf[128];
//stuff for detecting whether a match is part of a larger phrase
void setSubPhraseDetection();

@ -530,6 +530,11 @@ void Mem::addMem ( void *mem , int32_t size , const char *note , char isnew ) {
//validate();
// if ( note && note[0] == 'S' && note[1] == 'a' &&
// note[2] == 'f' && size == 13371521 )
// log("mem: got mystery safebuf");
//m_memtablesize = 0;//DMEMTABLESIZE;
// 4G/x = 600*1024 -> x = 4000000000.0/(600*1024) = 6510
// crap, g_hostdb.init() is called in main.cpp before

14
Mem.h

@ -280,6 +280,20 @@ inline int32_t getNumBitsOn64 ( uint64_t bits ) {
g_a [ *((unsigned char *)(&bits) + 7) ] ;
}
inline int32_t getNumBitsOnX ( unsigned char *s , int32_t slen ) {
if ( slen == 1 ) return getNumBitsOn8 ( *s );
if ( slen == 2 ) return getNumBitsOn16 ( *(uint16_t *)s );
if ( slen == 4 ) return getNumBitsOn32 ( *(uint32_t *)s );
if ( slen == 3 )
return getNumBitsOn8 ( s[0] ) +
getNumBitsOn8 ( s[1] ) +
getNumBitsOn8 ( s[2] ) ;
int32_t total = 0;
for ( int32_t i = 0 ; i < slen ; i++ )
total += getNumBitsOn8 ( s[i] );
return total;
}
// assume only one bit is set for this (used by Address.cpp)
inline int32_t getBitPosLL ( uint8_t *bit ) {
// which int32_t is it in?

@ -98,7 +98,7 @@ bool Msg2::getLists ( int32_t rdbId ,
// set this
m_numLists = m_query->m_numTerms;
// make sure not too many lists being requested
if ( m_numLists > MAX_NUM_LISTS ) {g_errno=ETOOMANYLISTS; return true;}
//if(m_numLists > MAX_NUM_LISTS ) {g_errno=ETOOMANYLISTS; return true;}
// clear them all
//for ( int32_t i = 0 ; i < m_numLists ; i++ ) {
// m_inProgress[i] = true;
@ -133,7 +133,7 @@ bool Msg2::getLists ( ) {
// . make slots for all
for ( ; m_i < m_numLists ; m_i++ ) {
// sanity for Msg39's sake. do not breach m_lists[].
if ( m_i >= MAX_QUERY_TERMS ) { char *xx=NULL;*xx=0; }
if ( m_i >= ABS_MAX_QUERY_TERMS ) { char *xx=NULL;*xx=0; }
// if any had error, forget the rest. do not launch any more
if ( m_errno ) break;
// skip if already did it
@ -413,6 +413,8 @@ bool Msg2::getLists ( ) {
// mem. we should also report the size of each termlist
// in bytes in the query info header.
//int32_t minRecSizes = DEFAULT_POSDB_READSIZE;
// MDW TODO fix this later we go oom too easily for queries
// like 'www.disney.nl'
int32_t minRecSizes = -1;
// start up the read. thread will wait in thread queue to

5
Msg2.h

@ -7,9 +7,10 @@
#include "Msg0.h"
/** define the max # of lists you can get as the max # of query terms for now */
#define MAX_NUM_LISTS MAX_QUERY_TERMS
//#define MAX_NUM_LISTS MAX_QUERY_TERMS
/** how many outstanding msg5 requests at one time? */
#define MSG2_MAX_REQUESTS MAX_QUERY_TERMS
//#define MSG2_MAX_REQUESTS MAX_QUERY_TERMS
#define MSG2_MAX_REQUESTS 32
/** support the &sites=xyz.com+abc.com+... to restrict search results to provided sites.*/
#define MAX_WHITELISTS 500

@ -34,6 +34,10 @@ Msg39::Msg39 () {
reset();
}
Msg39::~Msg39 () {
reset();
}
void Msg39::reset() {
if ( m_inUse ) { char *xx=NULL;*xx=0; }
m_allocedTree = false;
@ -46,8 +50,16 @@ void Msg39::reset() {
void Msg39::reset2() {
// reset lists
for ( int32_t j = 0 ; j < m_msg2.m_numLists ; j++ )
m_lists[j].freeList();
int32_t nqt = m_stackBuf.getLength() / sizeof(RdbList);
//for ( int32_t j = 0 ; j < m_msg2.m_numLists && m_lists ; j++ ) {
for ( int32_t j = 0 ; j < nqt && m_lists ; j++ ) {
//m_lists[j].freeList();
//log("msg39: destroy list @ 0x%"PTRFMT,(PTRTYPE)&m_lists[j]);
// same thing but more generic
m_lists[j].destructor();
}
m_stackBuf.purge();
m_lists = NULL;
m_msg2.reset();
m_posdbTable.reset();
m_callback = NULL;
@ -205,7 +217,8 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
if ( ! m_tmpq.set2 ( m_r->ptr_query ,
m_r->m_language ,
m_r->m_queryExpansion ,
m_r->m_useQueryStopWords ) ) {
m_r->m_useQueryStopWords ,
m_r->m_maxQueryTerms ) ) {
log("query: msg39: setQuery: %s." ,
mstrerror(g_errno) );
sendReply ( m_slot , this , NULL , 0 , 0 , true );
@ -223,11 +236,14 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
if ( m_tmpq.getNumTerms() != m_r->m_nqt ) {
g_errno = EBADENGINEER;
log("query: Query parsing inconsistency for q=%s. "
"%i != %i. "
"langid=%"INT32". Check langids and m_queryExpansion parms "
"which are the only parms that could be different in "
"Query::set2(). You probably have different mysynoyms.txt "
"files on two different hosts! check that!!"
,m_tmpq.m_orig
,(int)m_tmpq.getNumTerms()
,(int)m_r->m_nqt
,(int32_t)m_r->m_language
);
sendReply ( m_slot , this , NULL , 0 , 0 , true );
@ -713,7 +729,7 @@ bool Msg39::getLists () {
//(int64_t)m_tmpq.m_qterms[i].m_explicitBit ,
//(int64_t)m_tmpq.m_qterms[i].m_implicitBits ,
(int32_t)m_tmpq.m_qterms[i].m_hardCount ,
(int32_t)m_tmpq.m_componentCodes[i],
(int32_t)m_tmpq.m_qterms[i].m_componentCode,
(int32_t)m_tmpq.getTermLen(i) ,
isSynonym,
(int32_t)m_tmpq.m_langId ); // ,tt
@ -762,6 +778,19 @@ bool Msg39::getLists () {
// split is us????
//int32_t split = g_hostdb.m_myHost->m_group;
int32_t split = g_hostdb.m_myHost->m_shardNum;
int32_t nqt = m_tmpq.getNumTerms();
int32_t need = sizeof(RdbList) * nqt ;
m_stackBuf.setLabel("stkbuf2");
if ( ! m_stackBuf.reserve ( need ) ) return true;
m_lists = (IndexList *)m_stackBuf.getBufStart();
m_stackBuf.setLength ( need );
for ( int32_t i = 0 ; i < nqt ; i++ ) {
m_lists[i].constructor();
//log("msg39: constructlist @ 0x%"PTRFMT,(PTRTYPE)&m_lists[i]);
}
// call msg2
if ( ! m_msg2.getLists ( rdbId ,
m_r->m_collnum,//m_r->ptr_coll ,

@ -216,6 +216,7 @@ class Msg39 {
public:
Msg39();
~Msg39();
void reset();
void reset2();
// register our request handler for Msg39's
@ -266,7 +267,9 @@ class Msg39 {
// . we hold our IndexLists here for passing to PosdbTable
// . one array for each of the tiers
IndexList m_lists [ MAX_QUERY_TERMS ];
//IndexList m_lists [ MAX_QUERY_TERMS ];
IndexList *m_lists;
SafeBuf m_stackBuf;
// used for timing
int64_t m_startTime;

@ -317,8 +317,8 @@ bool Msg3a::gotCacheReply ( ) {
//CollectionRec *cr;
//cr = g_collectiondb.getRec(m_r->ptr_coll,m_r->size_coll-1);
setTermFreqWeights ( m_r->m_collnum,m_q,m_termFreqs,m_termFreqWeights);
//setTermFreqWeights(m_r->m_collnum,m_q,m_termFreqs,m_termFreqWeights);
setTermFreqWeights ( m_r->m_collnum,m_q );
if ( m_debug ) {
//int64_t *termIds = m_q->getTermIds();
@ -338,8 +338,8 @@ bool Msg3a::gotCacheReply ( ) {
i,
qt->m_term,
qt->m_termId,
m_termFreqs[i],
m_termFreqWeights[i]);
qt->m_termFreq,//m_termFreqs[i],
qt->m_termFreqWeight);//m_termFreqWeights[i]);
// put it back
*tpc = c;
}
@ -368,7 +368,8 @@ bool Msg3a::gotCacheReply ( ) {
}
// a tmp buf
int32_t readSizes[MAX_QUERY_TERMS];
int32_t readSizes[ABS_MAX_QUERY_TERMS];
float tfw [ABS_MAX_QUERY_TERMS];
// update our read info
for ( int32_t j = 0; j < n ; j++ ) {
// the read size for THIS query term
@ -379,7 +380,9 @@ bool Msg3a::gotCacheReply ( ) {
rs = DEFAULT_POSDB_READSIZE;//90000000; // 90MB!
// it is better to go oom than leave users scratching their
// heads as to why some results are not being returned.
rs = -1;
// no, because we are going out of mem for queries like
// 'www.disney.nl' etc.
//rs = -1;
// if section stats, limit to 1MB
//if ( m_r->m_getSectionStats ) rs = 1000000;
// get the jth query term
@ -388,13 +391,14 @@ bool Msg3a::gotCacheReply ( ) {
if ( qt->m_ignored ) rs = 0;
// set it
readSizes[j] = rs;
// serialize these too
tfw[j] = qt->m_termFreqWeight;
}
// serialize this
m_r->ptr_readSizes = (char *)readSizes;
m_r->size_readSizes = 4 * n;
// and this
m_r->ptr_termFreqWeights = (char *)m_termFreqWeights;
m_r->ptr_termFreqWeights = (char *)tfw;//m_termFreqWeights;
m_r->size_termFreqWeights = 4 * n;
// store query into request, might have changed since we called
// Query::expandQuery() above
@ -1095,7 +1099,10 @@ bool Msg3a::mergeLists ( ) {
// log("results: alloc fhtqt of %"PTRFMT" for st0=%"PTRFMT,
// (PTRTYPE)ht->m_buf,(PTRTYPE)m_q->m_st0Ptr);
// sanity
if ( ! ht->m_isWritable ) {char *xx=NULL;*xx=0;}
if ( ! ht->m_isWritable ) {
log("msg3a: queryterm::constructor not called?");
char *xx=NULL;*xx=0;
}
}
// now scan each facethashlist from each shard and compile into
@ -1548,9 +1555,9 @@ void Msg3a::printTerms ( ) {
}
void setTermFreqWeights ( collnum_t collnum , // char *coll,
Query *q ,
int64_t *termFreqs,
float *termFreqWeights ) {
Query *q ) {
// int64_t *termFreqs,
// float *termFreqWeights ) {
int64_t numDocsInColl = 0;
RdbBase *base = getRdbBase ( RDB_CLUSTERDB , collnum );
@ -1562,13 +1569,16 @@ void setTermFreqWeights ( collnum_t collnum , // char *coll,
numDocsInColl = 1;
}
// now get term freqs again, like the good old days
int64_t *termIds = q->getTermIds();
//int64_t *termIds = q->getTermIds();
// just use rdbmap to estimate!
for ( int32_t i = 0 ; i < q->getNumTerms(); i++ ) {
QueryTerm *qt = &q->m_qterms[i];
// GET THE TERMFREQ for setting weights
int64_t tf = g_posdb.getTermFreq ( collnum ,termIds[i]);
if ( termFreqs ) termFreqs[i] = tf;
int64_t tf = g_posdb.getTermFreq ( collnum ,qt->m_termId);
//if ( termFreqs ) termFreqs[i] = tf;
qt->m_termFreq = tf;
float tfw = getTermFreqWeight(tf,numDocsInColl);
termFreqWeights[i] = tfw;
//termFreqWeights[i] = tfw;
qt->m_termFreqWeight = tfw;
}
}

10
Msg3a.h

@ -12,9 +12,9 @@
#define DEFAULT_POSDB_READSIZE 90000000
void setTermFreqWeights ( collnum_t collnum, // char *coll,
class Query *q ,
int64_t *termFreqs,
float *termFreqWeights ) ;
class Query *q );
//int64_t *termFreqs,
//float *termFreqWeights ) ;
//#define MSG3A_TMP_BUFSIZE (MAX_RESULTS*18)
@ -131,8 +131,8 @@ public:
// use msg37 to get TermFreqs
//Msg37 m_msg37;
int64_t m_termFreqs [MAX_QUERY_TERMS];
float m_termFreqWeights[MAX_QUERY_TERMS];
//int64_t m_termFreqs [MAX_QUERY_TERMS];
//float m_termFreqWeights[MAX_QUERY_TERMS];
// a multicast class to send the request, one for each split
Multicast m_mcast[MAX_SHARDS];

@ -666,7 +666,7 @@ bool Msg40::federatedLoop ( ) {
mr.size_whiteList = slen;
mr.m_timeout = -1; // auto-determine based on #terms
// make sure query term counts match in msg39
mr.m_maxQueryTerms = m_si->m_maxQueryTerms;
//mr.m_maxQueryTerms = m_si->m_maxQueryTerms;
mr.m_realMaxTop = m_si->m_realMaxTop;
mr.m_minSerpDocId = m_si->m_minSerpDocId;
@ -699,6 +699,9 @@ bool Msg40::federatedLoop ( ) {
//if ( numDocIdSplits < 5 ) numDocIdSplits = 5;
//}
if ( cr ) mr.m_maxQueryTerms = cr->m_maxQueryTerms;
else mr.m_maxQueryTerms = 100;
// special oom hack fix
if ( cr && cr->m_isCustomCrawl && numDocIdSplits < 4 )
numDocIdSplits = 4;
@ -3496,7 +3499,10 @@ bool Msg40::computeGigabits( TopicGroup *tg ) {
log("gbits: too many words in samples. "
"Discarding the remaining samples "
"(maxWords=%"INT32")", maxWords);
char *xx=NULL;*xx=0;
// return -1 with g_errno set on error
g_errno = EBUFTOOSMALL;
return -1;
//char *xx=NULL;*xx=0;
}
// the thing we are counting!!!!
maxWords += sampleWords;
@ -4330,7 +4336,8 @@ void hashExcerpt ( Query *q ,
int32_t m_posPtr;
};
SafeBuf posBuf;
int32_t need2 = MAX_QUERY_TERMS * sizeof(PosInfo);
//int32_t need2 = MAX_QUERY_TERMS * sizeof(PosInfo);
int32_t need2 = q->m_numTerms * sizeof(PosInfo);
posBuf.setLabel("m40posbuf");
if ( ! posBuf.reserve ( need2 ) ) {
log("gigabits: could not allocate 2 local buffer "

@ -15,7 +15,7 @@
#include "Msg20.h" // for getting summary from docId
#include "Msg17.h" // a distributed cache of serialized/compressed Msg40s
//#include "Msg2b.h" // for generating directories
#include "IndexReadInfo.h" // STAGE0,...
//#include "IndexReadInfo.h" // STAGE0,...
#include "Msg3a.h"
#include "PostQueryRerank.h"

@ -40,7 +40,9 @@ public:
bool m_isLocal;
//bool m_seq;
bool m_rtq;
char m_q[MAX_QUERY_LEN+1];
//char m_q[MAX_QUERY_LEN+1];
SafeBuf m_qsb;
char m_qtmpBuf[128];
int32_t m_qlen;
char m_boolFlag;
bool m_printed;
@ -98,7 +100,7 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
int32_t qlen = 0;
char *q = r->getString ( "q" , &qlen , NULL /*default*/);
// ensure query not too big
if ( qlen >= MAX_QUERY_LEN-1 ) {
if ( qlen >= ABS_MAX_QUERY_LEN-1 ) {
g_errno=EQUERYTOOBIG;
return g_httpServer.sendErrorReply (s,500 ,mstrerror(g_errno));
}
@ -156,8 +158,16 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
// delete ( st );
// return sendPageNetResult( s );
//}
if ( q && qlen > 0 ) strcpy ( st->m_q , q );
else st->m_q[0] = '\0';
//if ( q && qlen > 0 ) strcpy ( st->m_q , q );
//else st->m_q[0] = '\0';
st->m_qsb.setBuf ( st->m_qtmpBuf,128,0,false );
st->m_qsb.setLabel ( "qsbpg" );
// save the query
if ( q && qlen > 0 )
st->m_qsb.safeStrcpy ( q );
st->m_qlen = qlen;
//st->m_seq = seq;
st->m_rtq = rtq;
@ -415,8 +425,8 @@ bool processLoop ( void *state ) {
int32_t startLen2 = sb->length();//p;
// query should be NULL terminated
char *q = st->m_q;
int32_t qlen = st->m_qlen;
char *q = st->m_qsb.getBufStart();
int32_t qlen = st->m_qsb.getLength(); // m_qlen;
char styleTitle[128] = "font-size:14px;font-weight:600;"
"color:#000000;";

@ -2529,6 +2529,22 @@ bool printSearchResultsHeader ( State0 *st ) {
, getLanguageString(si->m_queryLangId) );
// print query words we ignored, like stop words
printIgnoredWords ( sb , si );
sb->safePrintf("\t\t<queryNumTermsTotal>"
"%"INT32
"</queryNumTermsTotal>\n"
, q->m_numTermsUntruncated );
sb->safePrintf("\t\t<queryNumTermsUsed>"
"%"INT32
"</queryNumTermsUsed>\n"
, q->m_numTerms );
int32_t tval = 0;
if ( q->m_numTerms < q->m_numTermsUntruncated ) tval = 1;
sb->safePrintf("\t\t<queryWasTruncated>"
"%"INT32
"</queryWasTruncated>\n"
, tval );
for ( int i = 0 ; i < q->m_numTerms ; i++ ) {
sb->safePrintf("\t\t<term>\n");
QueryTerm *qt = &q->m_qterms[i];
@ -2574,7 +2590,8 @@ bool printSearchResultsHeader ( State0 *st ) {
,printTerm);
term[sq->m_termLen] = c;
}
int64_t tf = msg40->m_msg3a.m_termFreqs[i];
//int64_t tf = msg40->m_msg3a.m_termFreqs[i];
int64_t tf = qt->m_termFreq;
sb->safePrintf("\t\t\t<termFreq>%"INT64"</termFreq>\n"
,tf);
sb->safePrintf("\t\t\t<termHash48>%"INT64"</termHash48>\n"
@ -2604,6 +2621,19 @@ bool printSearchResultsHeader ( State0 *st ) {
sb->safePrintf("\",\n");
// print query words we ignored, like stop words
printIgnoredWords ( sb , si );
sb->safePrintf("\t\"queryNumTermsTotal\":"
"%"INT32",\n"
, q->m_numTermsUntruncated );
sb->safePrintf("\t\"queryNumTermsUsed\":"
"%"INT32",\n"
, q->m_numTerms );
int32_t tval = 0;
if ( q->m_numTerms < q->m_numTermsUntruncated ) tval = 1;
sb->safePrintf("\t\"queryWasTruncated\":"
"%"INT32",\n"
, tval );
sb->safePrintf("\t\"terms\":[\n");
for ( int i = 0 ; i < q->m_numTerms ; i++ ) {
sb->safePrintf("\t\t{\n");
@ -2643,7 +2673,8 @@ bool printSearchResultsHeader ( State0 *st ) {
sb->safePrintf("\",\n");
term[sq->m_termLen] = c;
}
int64_t tf = msg40->m_msg3a.m_termFreqs[i];
//int64_t tf = msg40->m_msg3a.m_termFreqs[i];
int64_t tf = qt->m_termFreq;
sb->safePrintf("\t\t\"termFreq\":%"INT64",\n"
,tf);
@ -2793,13 +2824,14 @@ bool printSearchResultsHeader ( State0 *st ) {
//Highlight h;
st->m_qe[0] = '\0';
//st->m_qe[0] = '\0';
st->m_qesb.nullTerm();
// encode query buf
//char qe[MAX_QUERY_LEN+1];
char *dq = si->m_displayQuery;
//int32_t dqlen = si->m_displayQueryLen;
if ( dq ) urlEncode(st->m_qe,MAX_QUERY_LEN*2,dq,gbstrlen(dq));
if ( dq ) st->m_qesb.urlEncode(dq);
// how many results were requested?
//int32_t docsWanted = msg40->getDocsWanted();
@ -5185,7 +5217,7 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
"get?"
"q=%s&c=%s&d=%"INT64">"
"cached</a>\n",
st->m_qe , coll ,
st->m_qesb.getBufStart() , coll ,
mr->m_docId );
else if ( printCached )
sb->safePrintf ( "<a href=\""
@ -5194,7 +5226,7 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
"qlang=%s&"
"c=%s&d=%"INT64"&cnsp=0\">"
"cached</a>\n",
st->m_qe ,
st->m_qesb.getBufStart() ,
// "qlang" parm
si->m_defaultSortLang,
coll ,
@ -5334,7 +5366,7 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
"d=%"INT64"&"
"cnsp=0\">"
"sections</a>\n",
st->m_qe ,
st->m_qesb.getBufStart() ,
// "qlang" parm
si->m_defaultSortLang,
coll ,
@ -5447,7 +5479,7 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
qq.urlEncode("site:");
qq.urlEncode (hbuf);
qq.urlEncode(" | ");
qq.safeStrcpy(st->m_qe);
qq.safeStrcpy(st->m_qesb.getBufStart());
qq.nullTerm();
// get the original url and add/replace in query
char tmp2[512];
@ -6176,8 +6208,14 @@ bool printPairScore ( SafeBuf *sb , SearchInput *si , PairScore *ps ,
//int64_t sz2 = ps->m_listSize2;
//int64_t tf1 = ps->m_termFreq1;//sz1 / 10;
//int64_t tf2 = ps->m_termFreq2;//sz2 / 10;
int64_t tf1 = msg40->m_msg3a.m_termFreqs[qtn1];
int64_t tf2 = msg40->m_msg3a.m_termFreqs[qtn2];
QueryTerm *qt1 = &msg40->m_msg3a.m_q->m_qterms[qtn1];
QueryTerm *qt2 = &msg40->m_msg3a.m_q->m_qterms[qtn2];
//int64_t tf1 = msg40->m_msg3a.m_termFreqs[qtn1];
//int64_t tf2 = msg40->m_msg3a.m_termFreqs[qtn2];
int64_t tf1 = qt1->m_termFreq;
int64_t tf2 = qt2->m_termFreq;
float tfw1 = ps->m_tfWeight1;
float tfw2 = ps->m_tfWeight2;
@ -6893,7 +6931,9 @@ bool printSingleScore ( SafeBuf *sb ,
//int64_t tf = ss->m_termFreq;//ss->m_listSize;
int32_t qtn = ss->m_qtermNum;
int64_t tf = msg40->m_msg3a.m_termFreqs[qtn];
//int64_t tf = msg40->m_msg3a.m_termFreqs[qtn];
QueryTerm *qt = &msg40->m_msg3a.m_q->m_qterms[qtn];
int64_t tf = qt->m_termFreq;
float tfw = ss->m_tfWeight;
if ( si->m_format == FORMAT_XML ) {
@ -8252,8 +8292,11 @@ bool printCSVHeaderRow2 ( SafeBuf *sb ,
hdr = "Hop Count";
if ( ! strcmp(hdr,"gbssIp") )
hdr = "IP";
if ( ! strcmp(hdr,"gbssDiffbotUri" ) )
hdr = "Diffbot URI";
// csv report is regular urls not diffbot object urls so
// regular urls do not have just a single diffboturi,
// they could have 0 or multiple diffboturis
//if ( ! strcmp(hdr,"gbssDiffbotUri" ) )
// hdr = "Diffbot URI";
if ( ! strcmp(hdr,"gbssSentToDiffbotThisTime") )
hdr = "Process Attempted";
if ( ! strcmp(hdr,"gbssDiffbotReplyMsg") )

@ -52,7 +52,8 @@ public:
int64_t m_took; // how long it took to get the results
HttpRequest m_hr;
bool m_printedHeaderRow;
char m_qe[MAX_QUERY_LEN+1];
//char m_qe[MAX_QUERY_LEN+1];
SafeBuf m_qesb;
// for printing our search result json items in csv:
HashTableX m_columnTable;

@ -1858,11 +1858,11 @@ bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) {
// . call g_httpServer.sendDynamicPage() to send it
bool sendPageRoot ( TcpSocket *s , HttpRequest *r, char *cookie ) {
// don't allow pages bigger than 128k in cache
char buf [ 10*1024 + MAX_QUERY_LEN ];
char buf [ 10*1024 ];//+ MAX_QUERY_LEN ];
// a ptr into "buf"
//char *p = buf;
//char *pend = buf + 10*1024 + MAX_QUERY_LEN - 100 ;
SafeBuf sb(buf, 10*1024 + MAX_QUERY_LEN);
SafeBuf sb(buf, 10*1024 );//+ MAX_QUERY_LEN);
// print bgcolors, set focus, set font style
//p = g_httpServer.printFocus ( p , pend );
//p = g_httpServer.printColors ( p , pend );

@ -252,18 +252,18 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
&secs,
&msecs);
int64_t avgTier0Time = 0;
int64_t avgTier1Time = 0;
int64_t avgTier2Time = 0;
if ( g_stats.m_tierHits[0] > 0 )
avgTier0Time = g_stats.m_tierTimes[0] /
(int64_t)g_stats.m_tierHits[0];
if ( g_stats.m_tierHits[1] > 0 )
avgTier1Time = g_stats.m_tierTimes[1] /
(int64_t)g_stats.m_tierHits[1];
if ( g_stats.m_tierHits[2] > 0 )
avgTier2Time = g_stats.m_tierTimes[2] /
(int64_t)g_stats.m_tierHits[2];
// int64_t avgTier0Time = 0;
// int64_t avgTier1Time = 0;
// int64_t avgTier2Time = 0;
// if ( g_stats.m_tierHits[0] > 0 )
// avgTier0Time = g_stats.m_tierTimes[0] /
// (int64_t)g_stats.m_tierHits[0];
// if ( g_stats.m_tierHits[1] > 0 )
// avgTier1Time = g_stats.m_tierTimes[1] /
// (int64_t)g_stats.m_tierHits[1];
// if ( g_stats.m_tierHits[2] > 0 )
// avgTier2Time = g_stats.m_tierTimes[2] /
// (int64_t)g_stats.m_tierHits[2];
if ( format == FORMAT_HTML )
p.safePrintf (

@ -3519,6 +3519,7 @@ bool printApiForPage ( SafeBuf *sb , int32_t PAGENUM , CollectionRec *cr ) {
if ( pageNum != PAGENUM ) continue;
SafeBuf tmp;
tmp.setLabel("apisb");
char diff = 0;
bool printVal = false;
if ( parm->m_type != TYPE_CMD &&
@ -3856,6 +3857,25 @@ bool printApiForPage ( SafeBuf *sb , int32_t PAGENUM , CollectionRec *cr ) {
"</b>");
sb->brify2 (
"\t\t# List of space separated words in the "
"query that were ignored for the most part. "
"Because they were common words for the "
"query language they are in.\n"
, cols , "\n\t\t# " , false );
sb->safePrintf("<b>\t\t\"ignoredWords\":\"to the\",\n\n"
"</b>");
sb->brify2 (
"\t\t# There is a maximum limit placed on the "
"number of query terms we search on to keep things "
"fast. This can "
"be changed in the search controls.\n"
, cols , "\n\t\t# " , false );
sb->safePrintf("<b>\t\t\"queryNumTermsTotal\":52,\n</b>");
sb->safePrintf("<b>\t\t\"queryNumTermsUsed\":20,\n</b>");
sb->safePrintf("<b>\t\t\"queryWasTruncated\":1,\n\n</b>");
sb->brify2 (
"\t\t# The start of the terms array. Each query "
"is broken down into a list of terms. Each "
@ -4037,7 +4057,8 @@ bool printApiForPage ( SafeBuf *sb , int32_t PAGENUM , CollectionRec *cr ) {
// end instance
sb->safePrintf("<b>\t\t}\n\n</b>");
// end gigabit
sb->safePrintf("<b>\t\t},\n\n</b>");
sb->safePrintf("\t\t# End of the first gigabit\n"
"<b>\t\t},\n\n</b>");
sb->safePrintf("\t\t...\n\n");
@ -4047,6 +4068,59 @@ bool printApiForPage ( SafeBuf *sb , int32_t PAGENUM , CollectionRec *cr ) {
sb->safePrintf("<b>\t],\n\n</b>");
// BEGIN FACETS
sb->safePrintf( "\t# Start of the facets array, if any.\n");
sb->safePrintf("<b>\t\"facets\":[\n</b>\n");
sb->safePrintf("\t\t# The first facet in the array.\n");
sb->safePrintf("<b>\t\t{\n</b>");
sb->brify2 ( "\t\t\t"
"# The field you are faceting over\n"
, cols , "\n\t\t\t# " , false );
sb->safePrintf ( "<b>\t\t\t\"field\":\"Company\",\n\n</b>");
sb->brify2 ( "\t\t\t"
"# How many documents in the collection had "
"this particular field? 64-bit integer.\n"
, cols , "\n\t\t\t# " , false );
sb->safePrintf ( "<b>\t\t\t\"totalDocsWithField\":148553,"
"\n\n</b>");
sb->brify2 ( "\t\t\t"
"# How many documents in the collection had "
"this particular field with the same value "
"as the value line directly below? This should "
"always be less than or equal to the "
"totalDocsWithField count. 64-bit integer.\n"
, cols , "\n\t\t\t# " , false );
sb->safePrintf ( "<b>\t\t\t\"totalDocsWithFieldAndValue\":"
"44184,\n\n</b>");
sb->brify2 ( "\t\t\t"
"# The value of the field in the case of "
"this facet. Can be a string or an integer or "
"a float, depending on the type described in "
"the gbfacet query term. i.e. gbfacetstr, "
"gbfacetint or gbfacetfloat.\n"
, cols , "\n\t\t\t# " , false );
sb->safePrintf ( "<b>\t\t\t\"value\":"
"\"Widgets, Inc.\",\n\n</b>");
sb->brify2 ( "\t\t\t"
"# Should be the same as totalDocsWith"
"FieldAndValue, "
"above. 64-bit integer.\n"
, cols , "\n\t\t\t# " , false );
sb->safePrintf ( "<b>\t\t\t\"docCount\":"
"44184\n\n</b>");
sb->safePrintf("\t\t# End of the first facet in the array.\n");
sb->safePrintf("<b>\t\t}\n\n</b>");
sb->safePrintf( "\t# End of the facets array.\n");
sb->safePrintf("<b>\t],\n\n</b>");
// END FACETS
@ -4670,7 +4744,7 @@ bool printRedBox ( SafeBuf *mb , TcpSocket *sock , HttpRequest *hr ) {
mb->safePrintf("%s",box);
mb->safePrintf("%"INT32" %s dead and not responding to "
"pings. See the "
"<a href=/admin/host?c=%s>hosts table</a>.",
"<a href=/admin/hosts?c=%s>hosts table</a>.",
ps->m_numHostsDead ,s ,coll);
mb->safePrintf("%s",boxEnd);
}

@ -7879,17 +7879,19 @@ void Parms::init ( ) {
m->m_obj = OBJ_COLL;
m++;
//m->m_title = "max query terms";
//m->m_desc = "Do not allow more than this many query terms. Will "
// "return error in XML feed error tag if breeched.";
//m->m_cgi = "mqt";
//m->m_off = (char *)&cr.m_maxQueryTerms - x;
m->m_title = "max query terms";
m->m_desc = "Do not allow more than this many query terms. Helps "
"prevent big queries from resource hogging.";
m->m_cgi = "mqt";
m->m_off = (char *)&cr.m_maxQueryTerms - x;
//m->m_soff = (char *)&si.m_maxQueryTerms - y;
//m->m_type = TYPE_LONG;
//m->m_def = "20"; // 20 for testing, normally 16
//m->m_sparm = 1;
//m->m_spriv = 1;
//m++;
m->m_type = TYPE_LONG;
m->m_def = "999999"; // now we got synonyms... etc
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
/*
m->m_title = "dictionary site";
@ -15283,7 +15285,7 @@ void Parms::init ( ) {
m->m_type = TYPE_CHARPTR;
m->m_page = PAGE_REINDEX;
m->m_obj = OBJ_GBREQUEST;
m->m_def = "xx";
m->m_def = "en";
m->m_flags = PF_API ;
m++;

149
Posdb.cpp

@ -759,19 +759,22 @@ void PosdbTable::init ( Query *q ,
// set this now
//m_collnum = cr->m_collnum;
// save it
m_topTree = topTree;
// a ptr for debugging i guess
g_topTree = topTree;
// remember the query class, it has all the info about the termIds
m_q = q;
m_nqt = q->getNumTerms();
// for debug msgs
m_logstate = logstate;
m_realMaxTop = r->m_realMaxTop;
if ( m_realMaxTop > MAX_TOP ) m_realMaxTop = MAX_TOP;
m_siteRankMultiplier = SITERANKMULTIPLIER;
if ( m_q->m_isBoolean ) m_siteRankMultiplier = 0.0;
// seo.cpp supplies a NULL msg2 because it already sets
// QueryTerm::m_posdbListPtrs
if ( ! msg2 ) return;
@ -1060,6 +1063,26 @@ bool PosdbTable::allocTopTree ( ) {
// make it nongrowable because we'll be in a thread
qt->m_facetHashTable.setNonGrow();
}
// m_stackBuf
int32_t nqt = m_q->m_numTerms;
int32_t need = 0;
need += 4 * nqt;
need += 4 * nqt;
need += 4 * nqt;
need += 4 * nqt;
need += sizeof(float ) * nqt;
need += sizeof(char *) * nqt;
need += sizeof(char *) * nqt;
need += sizeof(char *) * nqt;
need += sizeof(char *) * nqt;
need += sizeof(char *) * nqt;
need += sizeof(char ) * nqt;
need += sizeof(float ) * nqt * nqt; // square matrix
m_stackBuf.setLabel("stkbuf1");
if ( ! m_stackBuf.reserve( need ) )
return false;
return true;
}
@ -1378,8 +1401,8 @@ void PosdbTable::evalSlidingWindow ( char **ptrs ,
max *= m_freqWeights[i] * m_freqWeights[j];
// use score from scoreMatrix if bigger
if ( scoreMatrix[MAX_QUERY_TERMS*i+j] > max ) {
max = scoreMatrix[MAX_QUERY_TERMS*i+j];
if ( scoreMatrix[m_nqt*i+j] > max ) {
max = scoreMatrix[m_nqt*i+j];
//if ( m_ds ) {
// winners1[i*MAX_QUERY_TERMS+j] = NULL;
// winners2[i*MAX_QUERY_TERMS+j] = NULL;
@ -4815,6 +4838,8 @@ bool PosdbTable::setQueryTermInfo ( ) {
// below when trying to grow it. they could all be OR'd together
// so alloc the most!
int32_t maxSlots = (grand/12) * 2;
// try to speed up. this doesn't *seem* to matter, so i took out:
//maxSlots *= 2;
// get total operands we used
//int32_t numOperands = m_q->m_numWords;//Operands;
// a quoted phrase counts as a single operand
@ -4826,15 +4851,15 @@ bool PosdbTable::setQueryTermInfo ( ) {
// allow an extra byte for remainders
if ( m_numQueryTermInfos % 8 ) m_vecSize++;
// now preallocate the hashtable. 0 niceness.
if ( m_q->m_isBoolean &&
! m_bt.set (8,m_vecSize,maxSlots,NULL,0,false,0,"booltbl"))
if ( m_q->m_isBoolean && // true = useKeyMagic
! m_bt.set (8,m_vecSize,maxSlots,NULL,0,false,0,"booltbl",true))
return false;
// . m_ct maps a boolean "bit vector" to a true/false value
// . each "bit" in the "bit vector" indicates if docid has that
// particular query term
if ( m_q->m_isBoolean &&
if ( m_q->m_isBoolean && // true = useKeyMagic
! m_ct.set (8,1,maxSlots,NULL,0,false,0,
"booltbl"))
"booltbl",true))
return false;
return true;
@ -4999,13 +5024,13 @@ int64_t PosdbTable::countUniqueDocids( QueryTermInfo *qti ) {
// inc the TOTAL val count
if ( fe ) fe->m_outsideSearchResultsCount++;
// skip that docid record in our termlist. it MUST have been
// 12 bytes, a docid heading record.
recPtr += 12;
count++;
// skip any following keys that are 6 bytes, that means they
// share the same docid
for ( ; recPtr < subListEnd && ((*recPtr)&0x04); recPtr += 6 );
// Increment ptr to the next record
int32_t recSize = qti->m_subLists[0]->getRecSize(recPtr);
recPtr += recSize;
// Records that are 6 bytes share the same doc id, so only increment
// 'count' if it refers to a record with a new (unique) docId
if (recSize > 6) count++;
goto loop;
}
@ -5882,6 +5907,8 @@ void PosdbTable::intersectLists10_r ( ) {
if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) continue;
// inc this
listGroupNum++;
// if it hits 256 then wrap back down to 1
if ( listGroupNum >= 256 ) listGroupNum = 1;
// add it
addDocIdVotes ( qti , listGroupNum );
}
@ -5966,11 +5993,28 @@ void PosdbTable::intersectLists10_r ( ) {
//
// TRANSFORM QueryTermInfo::m_* vars into old style arrays
//
int32_t wikiPhraseIds [MAX_QUERY_TERMS];
int32_t quotedStartIds[MAX_QUERY_TERMS];
int32_t qpos [MAX_QUERY_TERMS];
int32_t qtermNums [MAX_QUERY_TERMS];
float freqWeights [MAX_QUERY_TERMS];
// int32_t wikiPhraseIds [MAX_QUERY_TERMS];
// int32_t quotedStartIds[MAX_QUERY_TERMS];
// int32_t qpos [MAX_QUERY_TERMS];
// int32_t qtermNums [MAX_QUERY_TERMS];
// float freqWeights [MAX_QUERY_TERMS];
// now dynamically allocate to avoid stack smashing
char *pp = m_stackBuf.getBufStart();
int32_t nqt = m_q->m_numTerms;
int32_t *wikiPhraseIds = (int32_t *)pp; pp += 4 * nqt;
int32_t *quotedStartIds = (int32_t *)pp; pp += 4 * nqt;
int32_t *qpos = (int32_t *)pp; pp += 4 * nqt;
int32_t *qtermNums = (int32_t *)pp; pp += 4 * nqt;
float *freqWeights = (float *)pp; pp += sizeof(float) * nqt;
char **miniMergedList = (char **)pp; pp += sizeof(char *) * nqt;
char **miniMergedEnd = (char **)pp; pp += sizeof(char *) * nqt;
char **bestPos = (char **)pp; pp += sizeof(char *) * nqt;
char **winnerStack = (char **)pp; pp += sizeof(char *) * nqt;
char **xpos = (char **)pp; pp += sizeof(char *) * nqt;
char *bflags = (char *)pp; pp += sizeof(char) * nqt;
float *scoreMatrix = (float *)pp; pp += sizeof(float) *nqt*nqt;
if ( pp > m_stackBuf.getBufEnd() ) {char *xx=NULL;*xx=0; }
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
// get it
QueryTermInfo *qti = &qip[i];
@ -6012,17 +6056,11 @@ void PosdbTable::intersectLists10_r ( ) {
float minPairScore;
float minSingleScore;
//int64_t docId;
char *miniMergedList [MAX_QUERY_TERMS];
char *miniMergedEnd [MAX_QUERY_TERMS];
char bflags [MAX_QUERY_TERMS];
m_bflags = bflags;
int32_t qdist;
float wts;
float pss;
float scoreMatrix[MAX_QUERY_TERMS*MAX_QUERY_TERMS];
char *bestPos[MAX_QUERY_TERMS];
float maxNonBodyScore;
char *winnerStack[MAX_QUERY_TERMS];
// new vars for removing supplanted docid score infos and
// corresponding pair and single score infos
char *sx;
@ -6340,12 +6378,7 @@ void PosdbTable::intersectLists10_r ( ) {
}
if ( m_q->m_isBoolean ) {
minScore = 1.0;
// since we are jumping, we need to set m_docId here
//m_docId = *(uint32_t *)(docIdPtr+1);
//m_docId <<= 8;
//m_docId |= (unsigned char)docIdPtr[0];
//m_docId >>= 2;
//minScore = 1.0;
// we can't jump over setting of miniMergeList. do that.
goto boolJump1;
}
@ -6557,6 +6590,30 @@ void PosdbTable::intersectLists10_r ( ) {
boolJump1:
if ( m_q->m_isBoolean ) {
//minScore = 1.0;
// this is somewhat wasteful since it is set below again
m_docId = *(uint32_t *)(docIdPtr+1);
m_docId <<= 8;
m_docId |= (unsigned char)docIdPtr[0];
m_docId >>= 2;
// add one point for each term matched in the bool query
// this is really just for when the terms are from different
// fields. if we have unfielded boolean terms we should
// do proximity matching.
int32_t slot = m_bt.getSlot ( &m_docId );
if ( slot >= 0 ) {
uint8_t *bv = (uint8_t *)m_bt.getValueFromSlot(slot);
// then a score based on the # of terms that matched
int16_t bitsOn = getNumBitsOnX ( bv , m_vecSize );
// but store in hashtable now
minScore = (float)bitsOn;
}
else {
minScore = 1.0;
}
}
// we need to do this for seo hacks to merge the synonyms together
// into one list
seoHackSkip2:
@ -6922,7 +6979,7 @@ void PosdbTable::intersectLists10_r ( ) {
&pss);
// it's -1 if one term is in the body/header/menu/etc.
if ( pss < 0 ) {
scoreMatrix[i*MAX_QUERY_TERMS+j] = -1.00;
scoreMatrix[i*nqt+j] = -1.00;
wts = -1.0;
}
else {
@ -6931,7 +6988,7 @@ void PosdbTable::intersectLists10_r ( ) {
wts *= m_freqWeights[j];//sfw[j];
// store in matrix for "sub out" algo below
// when doing sliding window
scoreMatrix[i*MAX_QUERY_TERMS+j] = wts;
scoreMatrix[i*nqt+j] = wts;
// if terms is a special wiki half stop bigram
//if ( bflags[i] == 1 ) wts *= WIKI_BIGRAM_WEIGHT;
//if ( bflags[j] == 1 ) wts *= WIKI_BIGRAM_WEIGHT;
@ -7053,7 +7110,7 @@ void PosdbTable::intersectLists10_r ( ) {
// use special ptrs for the windows so we do not mangle
// miniMergedList[] array because we use that below!
char *xpos[MAX_QUERY_TERMS];
//char *xpos[MAX_QUERY_TERMS];
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ )
xpos[i] = miniMergedList[i];
@ -7262,7 +7319,7 @@ void PosdbTable::intersectLists10_r ( ) {
boolJump2:
// try dividing it by 3! (or multiply by .33333 faster)
score = minScore * (((float)siteRank)*SITERANKMULTIPLIER+1.0);
score = minScore * (((float)siteRank)*m_siteRankMultiplier+1.0);
// . not foreign language? give a huge boost
// . use "qlang" parm to set the language. i.e. "&qlang=fr"
@ -7932,7 +7989,7 @@ float PosdbTable::getMaxPossibleScore ( QueryTermInfo *qti ,
score *= WIKI_BIGRAM_WEIGHT;
}
//score *= perfectWordSpamWeight * perfectWordSpamWeight;
score *= (((float)siteRank)*SITERANKMULTIPLIER+1.0);
score *= (((float)siteRank)*m_siteRankMultiplier+1.0);
// language boost if same language (or no lang specified)
if ( m_r->m_language == docLang ||
@ -8165,6 +8222,10 @@ bool PosdbTable::makeDocIdVoteBufForBoolQuery_r ( ) {
}
// debug info
// int32_t nc = m_bt.getLongestString();
// log("posdb: string of %"INT32" filled slots!",nc);
char *dst = m_docIdVoteBuf.getBufStart();
// . now our hash table is filled with all the docids
@ -8223,13 +8284,15 @@ bool PosdbTable::makeDocIdVoteBufForBoolQuery_r ( ) {
// a 6 byte key means you pass
gbmemcpy ( dst , &docId , 6 );
// test it
int64_t d2;
d2 = *(uint32_t *)(dst+1);
d2 <<= 8;
d2 |= (unsigned char)dst[0];
d2 >>= 2;
docId >>= 2;
if ( d2 != docId ) { char *xx=NULL;*xx=0; }
if ( m_debug ) {
int64_t d2;
d2 = *(uint32_t *)(dst+1);
d2 <<= 8;
d2 |= (unsigned char)dst[0];
d2 >>= 2;
docId >>= 2;
if ( d2 != docId ) { char *xx=NULL;*xx=0; }
}
// end test
dst += 6;
}

@ -604,6 +604,8 @@ class PosdbTable {
float m_finalScore;
float m_preFinalScore;
float m_siteRankMultiplier;
// how long to add the last batch of lists
int64_t m_addListsTime;
int64_t m_t1 ;
@ -654,10 +656,13 @@ class PosdbTable {
SafeBuf m_pairScoreBuf;
SafeBuf m_singleScoreBuf;
SafeBuf m_stackBuf;
//SafeBuf m_mergeBuf;
// a reference to the query
Query *m_q;
int32_t m_nqt;
// these are NOT in imap space, but in query term space, 1-1 with
// Query::m_qterms[]

@ -29,7 +29,9 @@ typedef float rscore_t;
#define MINSCORE 1
#define MIN_SAVE_SIZE 100
#define PQR_BUF_SIZE MAX_QUERY_LEN
// we don't use this any more so make it compile
//#define PQR_BUF_SIZE MAX_QUERY_LEN
#define PQR_BUF_SIZE 64
class PostQueryRerank {
public:

355
Query.cpp

@ -28,6 +28,7 @@ void Query::constructor ( ) {
//m_bmap = NULL;
m_bitScores = NULL;
m_qwords = NULL;
m_numWords = 0;
//m_expressions = NULL;
m_qwordsAllocSize = 0;
//m_expressionsAllocSize = 0;
@ -37,8 +38,8 @@ void Query::constructor ( ) {
m_st0Ptr = NULL;
// we have to manually call this because Query::constructor()
// might have been called explicitly
for ( int32_t i = 0 ; i < MAX_QUERY_TERMS ; i++ )
m_qterms[i].constructor();
//for ( int32_t i = 0 ; i < MAX_QUERY_TERMS ; i++ )
// m_qterms[i].constructor();
//m_expressions = NULL;
reset ( );
}
@ -68,9 +69,19 @@ void Query::reset ( ) {
qt->m_facetIndexBuf.purge();
}
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
qw->destructor();
}
m_stackBuf.purge();
m_qterms = NULL;
m_sb.purge();
m_osb.purge();
m_docIdRestriction = 0LL;
m_groupThatHasDocId = NULL;
m_bufLen = 0;
//m_bufLen = 0;
m_origLen = 0;
m_numWords = 0;
//m_numOperands = 0;
@ -84,6 +95,7 @@ void Query::reset ( ) {
//if ( m_bitScores && m_bitScoresSize ) // != m_bsbuf )
// mfree ( m_bitScores , m_bitScoresSize , "Query2" );
//m_bmap = NULL;
m_bitScores = NULL;
//m_bmapSize = 0;
m_bitScoresSize = 0;
@ -131,14 +143,16 @@ bool Query::set2 ( char *query ,
// need language for doing synonyms
uint8_t langId ,
char queryExpansion ,
bool useQueryStopWords ) {
//int32_t maxQueryTerms ) {
bool useQueryStopWords ,
int32_t maxQueryTerms ) {
m_langId = langId;
m_useQueryStopWords = useQueryStopWords;
// fix summary rerank and highlighting.
bool keepAllSingles = true;
m_maxQueryTerms = maxQueryTerms;
// assume boolean auto-detect.
char boolFlag = 2;
@ -150,7 +164,7 @@ bool Query::set2 ( char *query ,
if ( ! query ) return true;
// set to 256 for synonyms?
m_maxQueryTerms = 256;
//m_maxQueryTerms = 256;
m_queryExpansion = queryExpansion;
int32_t queryLen = gbstrlen(query);
@ -160,17 +174,26 @@ bool Query::set2 ( char *query ,
//m_coll = coll;
//m_collLen = collLen;
// truncate query if too big
if ( queryLen >= MAX_QUERY_LEN ) {
log("query: Query length of %"INT32" must be less than %"INT32". "
"Truncating.",queryLen,(int32_t)MAX_QUERY_LEN);
queryLen = MAX_QUERY_LEN - 1;
if ( queryLen >= ABS_MAX_QUERY_LEN ) {
log("query: Query length of %"INT32" must be "
"less than %"INT32". "
"Truncating.",queryLen,(int32_t)ABS_MAX_QUERY_LEN);
queryLen = ABS_MAX_QUERY_LEN - 1;
m_truncated = true;
}
// save original query
m_osb.setBuf ( m_otmpBuf , 128 , 0 , false );
m_osb.setLabel ("oqbuf" );
m_osb.reserve ( queryLen + 1 );
m_osb.safeMemcpy ( query , queryLen );
m_osb.nullTerm ();
m_origLen = queryLen;
gbmemcpy ( m_orig , query , queryLen );
m_orig [ m_origLen ] = '\0';
//m_origLen = queryLen;
//gbmemcpy ( m_orig , query , queryLen );
//m_orig [ m_origLen ] = '\0';
m_orig = m_osb.getBufStart();
m_origLen = m_osb.getLength();
log(LOG_DEBUG, "query: set called = %s", m_orig);
@ -204,9 +227,16 @@ bool Query::set2 ( char *query ,
// that were set somewhere above!!! i moved top: label above!
//reset();
// reserve some space, guessing how much we'd need
m_sb.setBuf(m_tmpBuf3,128,0,false);
m_sb.setLabel("qrystk");
int32_t need = queryLen * 2 + 32;
if ( ! m_sb.reserve ( need ) )
return false;
// convenience ptr
char *p = m_buf;
char *pend = m_buf + MAX_QUERY_LEN;
//char *p = m_buf;
//char *pend = m_buf + MAX_QUERY_LEN;
bool inQuotesFlag = false;
// . copy query into m_buf
// . translate ( and ) to special query operators so Words class
@ -219,27 +249,31 @@ bool Query::set2 ( char *query ,
if ( query[i] == '\"' ) inQuotesFlag = !inQuotesFlag;
if ( inQuotesFlag ) {
*p = query [i];
p++;
//*p = query [i];
//p++;
m_sb.pushChar(query[i]);
continue;
}
// dst buf must be big enough
if ( p + 8 >= pend ) {
g_errno = EBUFTOOSMALL;
return log(LOG_LOGIC,"query: query: query too big.");
}
// if ( p + 8 >= pend ) {
// g_errno = EBUFTOOSMALL;
// return log(LOG_LOGIC,"query: query: query too big.");
// }
// translate ( and )
if ( boolFlag == 1 && query[i] == '(' ) {
gbmemcpy ( p , " LeFtP " , 7 ); p += 7;
//gbmemcpy ( p , " LeFtP " , 7 ); p += 7;
m_sb.safeMemcpy ( " LeFtP " , 7 );
continue;
}
if ( boolFlag == 1 && query[i] == ')' ) {
gbmemcpy ( p , " RiGhP " , 7 ); p += 7;
//gbmemcpy ( p , " RiGhP " , 7 ); p += 7;
m_sb.safeMemcpy ( " RiGhP " , 7 );
continue;
}
if ( query[i] == '|' ) {
gbmemcpy ( p , " PiiPE " , 7 ); p += 7;
//gbmemcpy ( p , " PiiPE " , 7 ); p += 7;
m_sb.safeMemcpy ( " PiiPE " , 7 );
continue;
}
// translate [#a] [#r] [#ap] [#rp] [] [p] to operators
@ -249,28 +283,34 @@ bool Query::set2 ( char *query ,
while ( is_digit(query[j]) ) j++;
char c = query[j];
if ( (c == 'a' || c == 'r') && query[j+1]==']' ) {
sprintf ( p , " LeFtB %"INT32" %c RiGhB ",val,c);
p += gbstrlen(p);
//sprintf ( p , " LeFtB %"INT32" %c RiGhB ",
m_sb.safePrintf(" LeFtB %"INT32" %c RiGhB ",
val,c);
//p += gbstrlen(p);
i = j + 1;
continue;
}
else if ( (c == 'a' || c == 'r') &&
query[j+1]=='p' && query[j+2]==']') {
sprintf ( p , " LeFtB %"INT32" %cp RiGhB ",val,c);
p += gbstrlen(p);
//sprintf ( p , " LeFtB %"INT32" %cp RiGhB ",
m_sb.safePrintf(" LeFtB %"INT32" %cp RiGhB ",
val,c);
//p += gbstrlen(p);
i = j + 2;
continue;
}
}
if ( query[i] == '[' && query[i+1] == ']' ) {
sprintf ( p , " LeFtB RiGhB ");
p += gbstrlen(p);
//sprintf ( p , " LeFtB RiGhB ");
//p += gbstrlen(p);
m_sb.safePrintf ( " LeFtB RiGhB ");
i = i + 1;
continue;
}
if ( query[i] == '[' && query[i+1] == 'p' && query[i+2]==']') {
sprintf ( p , " LeFtB RiGhB ");
p += gbstrlen(p);
//sprintf ( p , " LeFtB RiGhB ");
//p += gbstrlen(p);
m_sb.safePrintf ( " LeFtB RiGhB ");
i = i + 2;
continue;
}
@ -306,17 +346,22 @@ bool Query::set2 ( char *query ,
// TODO: copy altavista's operators here? & | !
// otherwise, just a plain copy
*p = query [i];
p++;
// *p = query [i];
// p++;
m_sb.pushChar ( query[i] );
}
// NULL terminate
*p = '\0';
//*p = '\0';
m_sb.nullTerm();
// debug statement
//log(LOG_DEBUG,"Query: Got new query=%s",tempBuf);
//printf("query: query: Got new query=%s\n",tempBuf);
// set length
m_bufLen = p - m_buf;
//m_bufLen = p - m_buf;
//m_buf = m_sb.getBufStart();
//m_bufLen = m_sb.length();
Words words;
Phrases phrases;
@ -560,8 +605,108 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
// what is the max value for "shift"?
int32_t max = (int32_t)MAX_EXPLICIT_BITS;
if ( max > m_maxQueryTerms ) max = m_maxQueryTerms;
// count phrases first for allocating
int32_t nqt = 0;
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
// skip if ignored... mdw...
if ( ! qw->m_phraseId ) continue;
if ( qw->m_ignorePhrase ) continue; // could be a repeat
// none if weight is absolute zero
if ( qw->m_userWeightPhrase == 0 &&
qw->m_userTypePhrase == 'a' ) continue;
nqt++;
}
// count phrase terms too!!!
for ( int32_t i = 0 ; i < m_numWords; i++ ) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord &&
qw->m_ignoreWord != IGNORE_QSTOP) continue;
// ignore if in quotes and part of phrase, watch out
// for things like "word", a single word in quotes.
if ( qw->m_quoteStart >= 0 && qw->m_phraseId ) continue;
// if we are not start of quote and NOT in a phrase we
// must be the tailing word i guess.
// fixes '"john smith" -"bob dole"' from having
// smith and dole as query terms.
if ( qw->m_quoteStart >= 0 && qw->m_quoteStart != i )
continue;
// ignore if weight is absolute zero
if ( qw->m_userWeight == 0 &&
qw->m_userType == 'a' ) continue;
nqt++;
}
// thirdly, count synonyms
Synonyms syn;
int32_t sn = 0;
if ( m_queryExpansion ) sn = m_numWords;
int64_t to = hash64n("to",0LL);
for ( int32_t i = 0 ; i < sn ; i++ ) {
// get query word
QueryWord *qw = &m_qwords[i];
// skip if in quotes, we will not get synonyms for it
if ( qw->m_inQuotes ) continue;
// skip if has plus sign in front
if ( qw->m_wordSign == '+' ) continue;
// not '-' either i guess
if ( qw->m_wordSign == '-' ) continue;
// no url: stuff, maybe only title
if ( qw->m_fieldCode &&
qw->m_fieldCode != FIELD_TITLE &&
qw->m_fieldCode != FIELD_GENERIC )
continue;
// skip if ignored like a stopword (stop to->too)
//if ( qw->m_ignoreWord ) continue;
// ignore title: etc. words, they are field names
if ( qw->m_ignoreWord == IGNORE_FIELDNAME ) continue;
// ignore boolean operators
if ( qw->m_ignoreWord ) continue;// IGNORE_BOOLOP
// no, hurts 'Greencastle IN economic development'
if ( qw->m_wordId == to ) continue;
// single letters...
if ( qw->m_wordLen == 1 ) continue;
// set the synonyms for this word
char tmpBuf [ TMPSYNBUFSIZE ];
int32_t naids = syn.getSynonyms ( &words ,
i ,
// language of the query.
// 0 means unknown. if this
// is 0 we sample synonyms
// from all languages.
m_langId ,
tmpBuf ,
0 ); // m_niceness );
// if no synonyms, all done
if ( naids <= 0 ) continue;
nqt += naids;
}
m_numTermsUntruncated = nqt;
if ( nqt > m_maxQueryTerms ) nqt = m_maxQueryTerms;
// allocate the stack buf
if ( nqt ) {
int32_t need = nqt * sizeof(QueryTerm) ;
if ( ! m_stackBuf.reserve ( need ) )
return false;
m_stackBuf.setLabel("stkbuf3");
char *pp = m_stackBuf.getBufStart();
m_qterms = (QueryTerm *)pp;
pp += sizeof(QueryTerm);
if ( pp > m_stackBuf.getBufEnd() ) { char *xx=NULL;*xx=0; }
}
// call constructor on each one here
for ( int32_t i = 0 ; i < nqt ; i++ ) {
QueryTerm *qt = &m_qterms[i];
qt->constructor();
}
//char u8Buf[256];
for ( int32_t i = 0 ; i < m_numWords && n < MAX_QUERY_TERMS ; i++ ) {
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
// break out if no more explicit bits!
/*
if ( shift >= max ) {
@ -580,9 +725,14 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
qw->m_userTypePhrase == 'a' ) continue;
// stop breach
if ( n >= MAX_QUERY_TERMS ) {
if ( n >= ABS_MAX_QUERY_TERMS ) {
log("query: lost query phrase terms to max term "
"limit of %"INT32"",(int32_t)MAX_QUERY_TERMS );
"limit of %"INT32"",(int32_t)ABS_MAX_QUERY_TERMS );
break;
}
if ( n >= m_maxQueryTerms ) {
log("query: lost query phrase terms to max term cr "
"limit of %"INT32"",(int32_t)m_maxQueryTerms);
break;
}
@ -604,7 +754,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
qt->m_isQueryStopWord = false;
// change in both places
qt->m_termId = qw->m_phraseId & TERMID_MASK;
m_termIds[n] = qw->m_phraseId & TERMID_MASK;
//m_termIds[n] = qw->m_phraseId & TERMID_MASK;
//log(LOG_DEBUG, "Setting query phrase term id %d: %lld", n, m_termIds[n]);
qt->m_rawTermId = qw->m_rawPhraseId;
// assume explicit bit is 0
@ -615,12 +765,12 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
// phrases like: "cat dog" AND pig
if ( m_isBoolean && qw->m_phraseSign != '*' ) {
qt->m_termSign = '\0';
m_termSigns[n] = '\0';
//m_termSigns[n] = '\0';
}
// if not boolean, ensure to change signs in both places
else {
qt->m_termSign = qw->m_phraseSign;
m_termSigns[n] = qw->m_phraseSign;
//m_termSigns[n] = qw->m_phraseSign;
}
//
// INSERT UOR LOGIC HERE
@ -703,7 +853,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
}
// now if we have enough room, do the singles
for ( int32_t i = 0 ; i < m_numWords && n < MAX_QUERY_TERMS ; i++ ) {
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
// break out if no more explicit bits!
/*
if ( shift >= max ) {
@ -738,9 +888,14 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
qw->m_userType == 'a' ) continue;
// stop breach
if ( n >= MAX_QUERY_TERMS ) {
if ( n >= ABS_MAX_QUERY_TERMS ) {
log("query: lost query terms to max term "
"limit of %"INT32"",(int32_t)MAX_QUERY_TERMS );
"limit of %"INT32"",(int32_t)ABS_MAX_QUERY_TERMS );
break;
}
if ( n >= m_maxQueryTerms ) {
log("query: lost query terms to max term cr "
"limit of %"INT32"",(int32_t)m_maxQueryTerms);
break;
}
@ -760,7 +915,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
qt->m_isQueryStopWord = qw->m_isQueryStopWord;
// change in both places
qt->m_termId = qw->m_wordId & TERMID_MASK;
m_termIds[n] = qw->m_wordId & TERMID_MASK;
//m_termIds[n] = qw->m_wordId & TERMID_MASK;
qt->m_rawTermId = qw->m_rawWordId;
// assume explicit bit is 0
qt->m_explicitBit = 0;
@ -769,18 +924,18 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
// boolean queries are not allowed term signs
if ( m_isBoolean ) {
qt->m_termSign = '\0';
m_termSigns[n] = '\0';
//m_termSigns[n] = '\0';
// boolean fix for "health OR +sports" because
// the + there means exact word match, no synonyms.
if ( qw->m_wordSign == '+' ) {
qt->m_termSign = qw->m_wordSign;
m_termSigns[n] = qw->m_wordSign;
//m_termSigns[n] = qw->m_wordSign;
}
}
// if not boolean, ensure to change signs in both places
else {
qt->m_termSign = qw->m_wordSign;
m_termSigns[n] = qw->m_wordSign;
//m_termSigns[n] = qw->m_wordSign;
}
// get previous text word
//int32_t pw = i - 2;
@ -1230,16 +1385,14 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
// . skip this part if language is unknown i guess
//
////////////
int32_t sn = 0;
Synonyms syn;
// loop over all words in query and process its synonyms list
//if ( m_langId != langUnknown && m_queryExpansion )
// if lang is "xx" unknown we still do synonyms it just does
// a loop over all languages starting with english
if ( m_queryExpansion )
sn = m_numWords;
// if ( m_queryExpansion )
// sn = m_numWords;
int64_t to = hash64n("to",0LL);
//int64_t to = hash64n("to",0LL);
for ( int32_t i = 0 ; i < sn ; i++ ) {
// get query word
@ -1257,6 +1410,10 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
continue;
// skip if ignored like a stopword (stop to->too)
//if ( qw->m_ignoreWord ) continue;
// ignore title: etc. words, they are field names
if ( qw->m_ignoreWord == IGNORE_FIELDNAME ) continue;
// ignore boolean operators
if ( qw->m_ignoreWord ) continue;// IGNORE_BOOLOP
// no, hurts 'Greencastle IN economic development'
if ( qw->m_wordId == to ) continue;
// single letters...
@ -1277,19 +1434,29 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
// sanity
if ( naids > MAX_SYNS ) { char *xx=NULL;*xx=0; }
// now make the buffer to hold them for us
qw->m_synWordBuf.setLabel("qswbuf");
qw->m_synWordBuf.safeMemcpy ( &syn.m_synWordBuf );
// get the term for this word
QueryTerm *origTerm = qw->m_queryWordTerm;
// loop over synonyms for word #i now
for ( int32_t j = 0 ; j < naids ; j++ ) {
// stop breach
if ( n >= MAX_QUERY_TERMS ) {
if ( n >= ABS_MAX_QUERY_TERMS ) {
log("query: lost synonyms due to max term "
"limit of %"INT32"",(int32_t)MAX_QUERY_TERMS );
"limit of %"INT32"",
(int32_t)ABS_MAX_QUERY_TERMS );
break;
}
// this happens for 'da da da'
if ( ! origTerm ) continue;
if ( n >= m_maxQueryTerms ) {
log("query: lost synonyms due to max cr term "
"limit of %"INT32"",
(int32_t)m_maxQueryTerms);
break;
}
// add that query term
QueryTerm *qt = &m_qterms[n];
qt->m_qword = qw; // NULL;
@ -1346,7 +1513,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
wid= hash64h(wid,ph);
}
qt->m_termId = wid & TERMID_MASK;
m_termIds[n] = wid & TERMID_MASK;
//m_termIds[n] = wid & TERMID_MASK;
qt->m_rawTermId = syn.m_aids[j];
// assume explicit bit is 0
qt->m_explicitBit = 0;
@ -1354,18 +1521,18 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
// boolean queries are not allowed term signs
if ( m_isBoolean ) {
qt->m_termSign = '\0';
m_termSigns[n] = '\0';
//m_termSigns[n] = '\0';
// boolean fix for "health OR +sports" because
// the + there means exact word match, no syns
if ( qw->m_wordSign == '+' ) {
qt->m_termSign = qw->m_wordSign;
m_termSigns[n] = qw->m_wordSign;
//m_termSigns[n] = qw->m_wordSign;
}
}
// if not bool, ensure to change signs in both places
else {
qt->m_termSign = qw->m_wordSign;
m_termSigns[n] = qw->m_wordSign;
//m_termSigns[n] = qw->m_wordSign;
}
// do not use an explicit bit up if we got a hard count
qt->m_hardCount = qw->m_hardCount;
@ -1413,7 +1580,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
m_numTerms = n;
if ( n > MAX_QUERY_TERMS ) { char *xx=NULL;*xx=0; }
if ( n > ABS_MAX_QUERY_TERMS ) { char *xx=NULL;*xx=0; }
// count them for doing number of combos
@ -1493,7 +1660,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
// . don't forget to set m_termSigns too!
if ( n == 1 && m_qterms[0].m_isPhrase && ! m_qterms[0].m_termSign ) {
m_qterms[0].m_termSign = '*';
m_termSigns[0] = '*';
//m_termSigns[0] = '*';
}
// . or bits into the m_implicitBits member of phrase QueryTerms that
@ -1524,7 +1691,11 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
// . see Msg2.cpp for more info on componentCodes
// . -2 means unset, neither a compound term nor a component term at
// this time
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) m_componentCodes[i] = -2;
//for( int32_t i = 0 ; i < m_numTerms ; i++ ) m_componentCodes[i] = -2;
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
QueryTerm *qt = &m_qterms[i];
qt->m_componentCode = -2;
}
m_numComponents = 0;
// . now set m_phrasePart for Summary.cpp's hackfix filter
@ -1879,7 +2050,10 @@ void Query::addCompoundTerms ( ) {
// -1 means compound, -2 means unset, >= 0 means component
bool Query::isCompoundTerm ( int32_t i ) {
return ( m_componentCodes[i] == -1 );
//return ( m_componentCodes[i] == -1 );
if ( i >= m_numTerms ) return false;
QueryTerm *qt = &m_qterms[i];
return ( qt->m_componentCode == -1 );
}
bool Query::setQWords ( char boolFlag ,
@ -1891,16 +2065,17 @@ bool Query::setQWords ( char boolFlag ,
// . because we now deal with boolean queries, we make parentheses
// their own separate Word, so tell "words" we're setting a query
//Words words;
if ( ! words.set ( m_buf , m_bufLen,
if ( ! words.set ( m_sb.getBufStart() , m_sb.length() ,
//buf , m_bufLen,
TITLEREC_CURRENT_VERSION, true, true ) )
return log("query: Had error parsing query: %s.",
mstrerror(g_errno));
int32_t numWords = words.getNumWords();
// truncate it
if ( numWords > MAX_QUERY_WORDS ) {
if ( numWords > ABS_MAX_QUERY_WORDS ) {
log("query: Had %"INT32" words. Max is %"INT32". Truncating.",
numWords,(int32_t)MAX_QUERY_WORDS);
numWords = MAX_QUERY_WORDS;
numWords,(int32_t)ABS_MAX_QUERY_WORDS);
numWords = ABS_MAX_QUERY_WORDS;
m_truncated = true;
}
m_numWords = numWords;
@ -1923,11 +2098,14 @@ bool Query::setQWords ( char boolFlag ,
return log("query: Could not allocate mem for query.");
m_qwordsAllocSize = need;
}
// reset safebuf in there
for ( int32_t i = 0 ; i < m_numWords ; i++ )
m_qwords[i].constructor();
// is all alpha chars in query in upper case? caps lock on?
bool allUpper = true;
char *p = m_buf;
char *pend = m_buf + m_bufLen;
char *p = m_sb.getBufStart();//m_buf;
char *pend = m_sb.getBuf(); // m_buf + m_bufLen;
for ( ; p < pend ; p += getUtf8CharSize(p) )
if ( is_alpha_utf8 ( p ) && ! is_upper_utf8 ( p ) ) {
allUpper = false; break; }
@ -2027,7 +2205,7 @@ bool Query::setQWords ( char boolFlag ,
char *ignoreTill = NULL;
// loop over all words, these QueryWords are 1-1 with "words"
for ( int32_t i = 0 ; i < numWords && i < MAX_QUERY_WORDS ; i++ ) {
for ( int32_t i = 0 ; i < numWords && i < ABS_MAX_QUERY_WORDS ; i++ ) {
// convenience var, these are 1-1 with "words"
QueryWord *qw = &m_qwords[i];
// set to defaults?
@ -2338,12 +2516,14 @@ bool Query::setQWords ( char boolFlag ,
// in quotes which is silly, so undo it. But we should
// still inherit any quoteSign, however. Be sure to also
// set m_inQuotes to false so Matches.cpp::matchWord() works.
if ( i == quoteStart ) { // + 1 ) {
if ( i + 1 >= numWords || words.getNumQuotes(i+1)>0 ) {
qw->m_quoteStart = -1;
qw->m_inQuotes = false;
}
}
// MDW: don't undo it because we do not want to get synonyms
// of terms in quotes. 7/15/2015
// if ( i == quoteStart ) { // + 1 ) {
// if ( i + 1 >= numWords || words.getNumQuotes(i+1)>0 ) {
// qw->m_quoteStart = -1;
// qw->m_inQuotes = false;
// }
// }
// . get prefix hash of collection name and field
// . but first convert field to lower case
uint64_t ph;
@ -3228,7 +3408,8 @@ bool Query::setQWords ( char boolFlag ,
// search up to this far
int32_t maxj = i + nw;
// but not past our truncated limit
if ( maxj > MAX_QUERY_WORDS ) maxj = MAX_QUERY_WORDS;
if ( maxj > ABS_MAX_QUERY_WORDS )
maxj = ABS_MAX_QUERY_WORDS;
for ( j = i ; j < maxj ; j++ ) {
// skip punct
@ -3385,7 +3566,7 @@ bool Query::setQWords ( char boolFlag ,
// count non-ignored words
if ( qw->m_ignoreWord ) continue;
// if under limit, continue
if ( count++ < MAX_QUERY_TERMS ) continue;
if ( count++ < ABS_MAX_QUERY_TERMS ) continue;
// . otherwise, ignore
// . if we set this for our UOR'ed terms from SearchInput.cpp's
// UOR'ed facebook interests then it causes us to get no results!
@ -4968,7 +5149,7 @@ void Query::printQueryTerms(){
(int64_t)m_qterms[i].m_explicitBit ,
(int64_t)m_qterms[i].m_implicitBits ,
(int32_t) m_qterms[i].m_hardCount ,
m_componentCodes[i],
m_qterms[i].m_componentCode,
getTermLen(i),
tt );
}
@ -5514,7 +5695,17 @@ bool QueryTerm::isSplit() {
// hash of all the query terms
int64_t Query::getQueryHash() {
int64_t qh = 0LL;
for ( int32_t i = 0 ; i < m_numTerms ; i++ )
qh = hash64 ( m_termIds[i] , qh );
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
QueryTerm *qt = &m_qterms[i];
qh = hash64 ( qt->m_termId , qh );
}
return qh;
}
void QueryWord::constructor () {
m_synWordBuf.constructor();
}
void QueryWord::destructor () {
m_synWordBuf.purge();
}

71
Query.h

@ -10,7 +10,9 @@
// keep these down to save memory
//#define MAX_QUERY_LEN 8000 // url:XXX can be quite long! (MAX_URL_LEN)
#define MAX_QUERY_LEN 3200
//#define MAX_QUERY_LEN 3200
// support big OR queries for image shingles
#define ABS_MAX_QUERY_LEN 62000
// . words need to deal with int32_t list of sites!
// . remember, words can be string of punctuation, too
//#define MAX_QUERY_WORDS 5000
@ -21,7 +23,8 @@
// seems like we alloc just enough to hold our words now so that this
// is really a performance capper but it is used in Summary.cpp
// and Matches.h so don't go too big just yet
#define MAX_QUERY_WORDS 800
//#define MAX_QUERY_WORDS 800
#define ABS_MAX_QUERY_WORDS 99000
// . how many IndexLists might we get/intersect
// . we now use a int64_t to hold the query term bits for non-boolean queries
@ -36,7 +39,8 @@
//#define MAX_QUERY_TERMS 40
// how to make a lock pick set loses synonyms from 40!
//#define MAX_QUERY_TERMS 80
#define MAX_QUERY_TERMS 160
//#define MAX_QUERY_TERMS 160
#define ABS_MAX_QUERY_TERMS 9000
// only allow up to 200 interests from facebook plus manually entered
// because we are limited by the query terms above so we can only
@ -270,6 +274,9 @@ class QueryWord {
if ( is_wspace_utf8 ( p ) ) return true;
return false;
};
void constructor ();
void destructor ();
//UCScript wordScript() {
// UChar*foo;
// return ucGetScript(utf16Decode((UChar*)(m_word),&foo));
@ -463,6 +470,10 @@ class QueryTerm {
char *m_parenList;
int32_t m_parenListLen;
int32_t m_componentCode;
int64_t m_termFreq;
float m_termFreqWeight;
// . our representative bits
// . the bits in this bit vector is 1-1 with the QueryTerms
// . if a doc has query term #i then bit #i will be set
@ -624,10 +635,10 @@ class Query {
//int32_t collLen ,
uint8_t langId ,
char queryExpansion ,
bool useQueryStopWords = true );
//char boolFlag = 2 , // auto-detect if boolean query
//bool keepAllSingles = false ,
//int32_t maxQueryTerms = 0x7fffffff );
bool useQueryStopWords = true ,
//char boolFlag = 2 , // auto-detect if boolean query
//bool keepAllSingles = false ,
int32_t maxQueryTerms = 0x7fffffff );
// serialize/deserialize ourselves so we don't have to pass the
// unmodified string around and reparse it every time
@ -680,9 +691,9 @@ class Query {
// . the signs and ids are dupped in the QueryTerm classes, too
//int64_t *getTermFreqs ( ) { return m_termFreqs ; };
//int64_t getTermFreq ( int32_t i ) { return m_termFreqs[i]; };
int64_t *getTermIds ( ) { return m_termIds ; };
char *getTermSigns ( ) { return m_termSigns ; };
int32_t *getComponentCodes ( ) { return m_componentCodes; };
//int64_t *getTermIds ( ) { return m_termIds ; };
//char *getTermSigns ( ) { return m_termSigns ; };
//int32_t *getComponentCodes ( ) { return m_componentCodes; };
int64_t getRawWordId ( int32_t i ) { return m_qwords[i].m_rawWordId;};
int32_t getNumComponentTerms ( ) { return m_numComponents; };
@ -926,17 +937,26 @@ class Query {
int32_t m_qwordsAllocSize;
// QueryWords are converted to QueryTerms
QueryTerm m_qterms [ MAX_QUERY_TERMS ];
//QueryTerm m_qterms [ MAX_QUERY_TERMS ];
int32_t m_numTerms;
int32_t m_numTermsSpecial;
int32_t m_numTermsUntruncated;
// separate vectors for easier interfacing, 1-1 with m_qterms
//int64_t m_termFreqs [ MAX_QUERY_TERMS ];
int64_t m_termIds [ MAX_QUERY_TERMS ];
char m_termSigns [ MAX_QUERY_TERMS ];
int32_t m_componentCodes [ MAX_QUERY_TERMS ];
char m_ignore [ MAX_QUERY_TERMS ]; // is term ignored?
int32_t m_numComponents;
//int64_t m_termIds [ MAX_QUERY_TERMS ];
//char m_termSigns [ MAX_QUERY_TERMS ];
//int32_t m_componentCodes [ MAX_QUERY_TERMS ];
//char m_ignore [ MAX_QUERY_TERMS ]; // is term ignored?
SafeBuf m_stackBuf;
QueryTerm *m_qterms ;
//int64_t *m_termIds ;
//char *m_termSigns ;
//int32_t *m_componentCodes ;
//char *m_ignore ; // is term ignored?
int32_t m_numComponents;
// how many bits in the full vector?
//int32_t m_numExplicitBits;
@ -974,18 +994,27 @@ class Query {
class Host *m_groupThatHasDocId;
// for holding the filtered query, in utf8
char m_buf [ MAX_QUERY_LEN ];
int32_t m_bufLen;
//char m_buf [ MAX_QUERY_LEN ];
//int32_t m_bufLen;
// for holding the filtered query, in utf8
SafeBuf m_sb;
char m_tmpBuf3[128];
// for holding the filtered/NULL-terminated query for doing
// matching. basically store phrases in here without punct
// so we can point a needle to them for matching in XmlDoc.cpp.
char m_needleBuf [ MAX_QUERY_LEN + 1 ];
int32_t m_needleBufLen;
//char m_needleBuf [ MAX_QUERY_LEN + 1 ];
//int32_t m_needleBufLen;
// the original query
char m_orig [ MAX_QUERY_LEN ];
//char m_orig [ MAX_QUERY_LEN ];
//int32_t m_origLen;
char *m_orig;
int32_t m_origLen;
SafeBuf m_osb;
char m_otmpBuf[128];
// we just have a ptr to this so don't pull the rug out
//char *m_coll;

@ -373,12 +373,12 @@ bool RdbDump::dumpTree ( bool recall ) {
//if ( removeNegRecs )
// m_list.removeNegRecs();
// if(!m_list->checkList_r ( false , // removeNegRecs?
// false , // sleep on problem?
// m_rdb->m_rdbId )) {
// log("db: list to dump is not sane!");
// char *xx=NULL;*xx=0;
// }
// if(!m_list->checkList_r ( false , // removeNegRecs?
// false , // sleep on problem?
// m_rdb->m_rdbId )) {
// log("db: list to dump is not sane!");
// char *xx=NULL;*xx=0;
// }
skip:
@ -781,6 +781,10 @@ bool RdbDump::doneReadingForVerify ( ) {
if ( m_addToMap ) t = gettimeofdayInMilliseconds();
// sanity check
if ( m_list->m_ks != m_ks ) { char *xx = NULL; *xx = 0; }
bool triedToFix = false;
tryAgain:
// . register this with the map now
// . only register AFTER it's ALL on disk so we don't get partial
// record reads and we don't read stuff on disk that's also in tree
@ -788,6 +792,16 @@ bool RdbDump::doneReadingForVerify ( ) {
// . we don't have maps when we do unordered dumps
// . careful, map is NULL if we're doing unordered dump
if ( m_addToMap && m_map && ! m_map->addList ( m_list ) ) {
// keys out of order in list from tree?
if ( g_errno == ECORRUPTDATA ) {
log("db: trying to fix tree or buckets");
if ( m_tree ) m_tree->fixTree();
//if ( m_buckets ) m_buckets->fixBuckets();
if ( m_buckets ) { char *xx=NULL;*xx=0; }
if ( triedToFix ) { char *xx=NULL;*xx=0; }
triedToFix = true;
goto tryAgain;
}
g_errno = ENOMEM;
log("db: Failed to add data to map.");
// undo the offset update, the write failed, the parent

@ -624,7 +624,8 @@ bool RdbList::growList ( int32_t newSize ) {
// don't shrink list
if ( newSize <= m_allocSize ) return true;
// debug msg
//log("RdbList::growList from %"INT32" to %"INT32"",m_allocSize , newSize );
// log("RdbList::growList 0x%"PTRFMT "from %"INT32" to %"INT32"",
// (PTRTYPE)this,m_allocSize , newSize );
// make a new buffer
char *tmp =(char *) mrealloc ( m_alloc,m_allocSize,newSize,"RdbList");
//if ( (int32_t)tmp == 0x904dbd0 )

@ -323,8 +323,9 @@ bool RdbMap::verifyMap2 ( ) {
KEYSET(lastKey,k,m_ks); continue; }
// just bitch for now
log(
"db: Key out of order in map file %s%s. "
"page = %"INT32". key offset = %"INT64". Map or data file is "
"db: Key out of order in map file %s/%s. "
"page = %"INT32". key offset = %"INT64". "
"Map or data file is "
"corrupt, but it is probably the data file. Please "
"delete the map file and restart.",
m_file.m_dir,m_file.getFilename() ,
@ -337,6 +338,15 @@ bool RdbMap::verifyMap2 ( ) {
KEY1(lastKey,m_ks),KEY0(lastKey));
log("db: k.n1=%016"XINT64" n0=%016"XINT64"",KEY1(k,m_ks),KEY0(k));
log("db: m_numPages = %"INT32"",m_numPages);
SafeBuf cmd;
cmd.safePrintf("mv %s/%s %s/trash/",
m_file.m_dir,
m_file.getFilename(),
g_hostdb.m_dir);
log("db: %s",cmd.getBufStart() );
gbsystem ( cmd.getBufStart() );
exit(0);
//char *xx=NULL;*xx=0;
// was k too small?
@ -543,7 +553,8 @@ bool RdbMap::addRecord ( char *key, char *rec , int32_t recSize ) {
m_lastLogTime = getTime();
//pageNum > 0 && getKey(pageNum-1) > getKey(pageNum) ) {
log(LOG_LOGIC,"build: RdbMap: added key out of order. "
"count=%"INT64".",m_badKeys);
"count=%"INT64" file=%s/%s.",m_badKeys,
m_file.m_dir,m_file.getFilename());
//log(LOG_LOGIC,"build: k.n1=%"XINT32" %"XINT64" lastKey.n1=%"XINT32" %"XINT64"",
// key.n1,key.n0,m_lastKey.n1,m_lastKey.n0 );
log(LOG_LOGIC,"build: offset=%"INT64"",
@ -556,7 +567,10 @@ bool RdbMap::addRecord ( char *key, char *rec , int32_t recSize ) {
g_errno = ECORRUPTDATA;
return false;
}
char *xx=NULL;*xx=0;
// if being called from RdbDump.cpp...
g_errno = ECORRUPTDATA;
return false;
//char *xx=NULL;*xx=0;
// . during a merge, corruption can happen, so let's core
// here until we figure out how to fix it.
// . any why wasn't the corruption discovered and patched
@ -719,7 +733,10 @@ bool RdbMap::addList ( RdbList *list ) {
if ( ! addRecord ( key , rec , recSize ) ) {
log("db: Failed to add record to map: %s.",
mstrerror(g_errno));
char *xx = NULL; *xx = 0;
// allow caller to try to fix the tree in the case of dumping
// a tree to a file on disk
return false;
//char *xx = NULL; *xx = 0;
}
if ( list->skipCurrentRecord() ) goto top2;

@ -1283,19 +1283,26 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
if ( m_right[i] >= 0 && m_parents[m_right[i]] != i )
return log(
"db: Tree right kid and parent disagree.");
/*
// MDW: why did i comment out the order checking?
// check order
if ( m_left[i] >= 0 ) {
if ( m_left[i] >= 0 &&
m_collnums[i] == m_collnums[m_left[i]] ) {
char *key = &m_keys[i*m_ks];
char *left = &m_keys[m_left[i]*m_ks];
if ( KEYCMP(key,left,m_ks)<0) {char *xx=NULL;*xx=0;}
if ( KEYCMP(key,left,m_ks)<0)
return log("db: Tree left kid > parent %i",i);
}
if ( m_right[i] >= 0 ) {
if ( m_right[i] >= 0 &&
m_collnums[i] == m_collnums[m_right[i]] ) {
char *key = &m_keys[i*m_ks];
char *right = &m_keys[m_right[i]*m_ks];
if ( KEYCMP(key,right,m_ks)>0) {char *xx=NULL;*xx=0;}
if ( KEYCMP(key,right,m_ks)>0)
return log("db: Tree right kid < parent %i "
"%s < %s",i,
KEYSTR(right,m_ks),
KEYSTR(key,m_ks) );
}
*/
//g_loop.quickPoll(1, __PRETTY_FUNCTION__, __LINE__);
}
if ( hkp > 0 )

@ -522,7 +522,8 @@ int32_t SafeBuf::safeSave (char *filename ) {
}
int32_t SafeBuf::fillFromFile(char *dir,char *filename) {
int32_t SafeBuf::fillFromFile(char *dir,char *filename,char *label) {
m_label = label;
char buf[1024];
if ( dir ) snprintf(buf,1024,"%s/%s",dir,filename);
else snprintf(buf,1024,"%s",filename);

@ -10,6 +10,9 @@
* (for java programmers, very similar to the StringBuffer class, with all the speed that c++ allows).
* Most of strings in Gigablast are handled by those.
*/
#include "iana_charset.h"
class SafeBuf {
public:
//*TRUCTORS
@ -33,8 +36,11 @@ public:
// want SafeBuf to free the data for you. Keep in mind, all
// previous content in SafeBuf will be cleared when you pass it
// a new buffer.
bool setBuf(char *newBuf, int32_t bufMax, int32_t bytesInUse, bool ownData,
int16_t encoding );
bool setBuf(char *newBuf,
int32_t bufMax,
int32_t bytesInUse,
bool ownData,
int16_t encoding = csUTF8 );
// yieldBuf() allows you to take over the buffer in SafeBuf.
// You may only free the data if it was originally owned by
// the SafeBuf.
@ -67,8 +73,9 @@ public:
int32_t safeSave (char *filename );
int32_t fillFromFile(char *filename);
int32_t fillFromFile(char *dir,char *filename);
int32_t load(char *dir,char *fname) { return fillFromFile(dir,fname);};
int32_t fillFromFile(char *dir,char *filename, char *label=NULL);
int32_t load(char *dir,char *fname,char *label = NULL) {
return fillFromFile(dir,fname,label);};
int32_t load(char *fname) { return fillFromFile(fname);};
void filterTags();

@ -50,14 +50,16 @@ void SearchInput::clear ( int32_t niceness ) {
key_t SearchInput::makeKey ( ) {
// hash the query
int32_t n = m_q.getNumTerms ();
int64_t *termIds = m_q.getTermIds ();
char *signs = m_q.getTermSigns ();
//int64_t *termIds = m_q.getTermIds ();
//char *signs = m_q.getTermSigns ();
key_t k;
k.n1 = 0;
k.n0 = hash64 ( (char *)termIds , n * sizeof(int64_t) );
k.n0 = hash64 ( (char *)signs , n , k.n0 );
//k.n0 = hash64 ( (char *)termIds , n * sizeof(int64_t) );
//k.n0 = hash64 ( (char *)signs , n , k.n0 );
// user defined weights, for weighting each query term separately
for ( int32_t i = 0 ; i < n ; i++ ) {
k.n0 = hash64 ((char *)&m_q.m_qterms[i].m_termId ,4, k.n0);
k.n0 = hash64 ((char *)&m_q.m_qterms[i].m_termSign ,1, k.n0);
k.n0 = hash64 ((char *)&m_q.m_qterms[i].m_userWeight,4, k.n0);
k.n0 = hash64 ((char *)&m_q.m_qterms[i].m_userType ,1, k.n0);
}
@ -468,14 +470,16 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
log("query: qlang of \"%s\" is NOT SUPPORTED. using "
"langUnknown, \"xx\".",langAbbr);
int32_t maxQueryTerms = cr->m_maxQueryTerms;
// . the query to use for highlighting... can be overriden with "hq"
// . we need the language id for doing synonyms
if ( m_prepend && m_prepend[0] )
m_hqq.set2 ( m_prepend , m_queryLangId , true );
m_hqq.set2 ( m_prepend , m_queryLangId , true ,maxQueryTerms);
else if ( m_highlightQuery && m_highlightQuery[0] )
m_hqq.set2 ( m_highlightQuery , m_queryLangId , true );
m_hqq.set2 (m_highlightQuery,m_queryLangId,true,maxQueryTerms);
else if ( m_query && m_query[0] )
m_hqq.set2 ( m_query , m_queryLangId , true );
m_hqq.set2 ( m_query , m_queryLangId , true,maxQueryTerms);
// log it here
log(LOG_INFO,
@ -487,7 +491,9 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
// . returns false and sets g_errno on error (ETOOMANYOPERANDS)
if ( ! m_q.set2 ( m_sbuf1.getBufStart(),
m_queryLangId ,
m_queryExpansion ) ) {
m_queryExpansion ,
true , // use QUERY stopwords?
maxQueryTerms ) ) {
g_msg = " (error: query has too many operands)";
return false;
}
@ -823,6 +829,9 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
m_sbuf2.safeStrcpy(" AND ");
}
}
m_sbuf1.setLabel("sisbuf1");
m_sbuf2.setLabel("sisbuf2");
m_sbuf3.setLabel("sisbuf3");
// append the natural query
if ( m_query && m_query[0] ) {
//if ( p > pstart ) *p++ = ' ';

@ -1164,7 +1164,9 @@ bool Sections::set ( Words *w ,
xh ^= g_hashtab[cnt++][(unsigned char )*p];
}
// sanity check
if ( ! xh ) { char *xx=NULL;*xx=0; }
//if ( ! xh ) { char *xx=NULL;*xx=0; }
// if it is a string of the same chars it can be 0
if ( ! xh ) xh = 1;
// store that
sn->m_xmlNameHash = (int32_t)xh;
}

@ -11641,6 +11641,18 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
goto checkNextRule;
}
if ( strncmp ( p , "isfakeip",8 ) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if no match continue
if ( (bool)sreq->m_fakeFirstIp == val ) continue;
p += 8;
p = strstr(p, "&&");
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( strncmp ( p , "isonsamedomain",14 ) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
@ -13993,6 +14005,17 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
return msg->safePrintf("Job is initializing.");
}
// if we had seeds and none were successfully crawled, do not just
// print that the crawl completed.
if ( cx->m_collectiveRespiderFrequency <= 0.0 &&
cx->m_isCustomCrawl &&
! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider &&
cx->m_globalCrawlInfo.m_pageDownloadAttempts > 0 &&
cx->m_globalCrawlInfo.m_pageDownloadSuccesses == 0 ) {
*status = SP_SEEDSERROR;
return msg->safePrintf("Failed to crawl any seed.");
}
// if we sent an email simply because no urls
// were left and we are not recrawling!
if ( cx->m_collectiveRespiderFrequency <= 0.0 &&

@ -39,6 +39,7 @@
#define SP_INPROGRESS 7 // it is going on!
#define SP_ADMIN_PAUSED 8 // g_conf.m_spideringEnabled = false
#define SP_COMPLETED 9 // crawl is done, and no repeatCrawl is scheduled
#define SP_SEEDSERROR 10 // all seeds had an error preventing crawling
bool tryToDeleteSpiderColl ( SpiderColl *sc , char *msg ) ;
void spiderRoundIncremented ( class CollectionRec *cr ) ;

@ -44,13 +44,13 @@ Stats::Stats ( ) {
m_totalSpiderSuccessOld = 0;
m_totalSpiderErrorsOld = 0;
m_msg3aRecallCnt = 0;
m_tierHits[0] = 0;
m_tierHits[1] = 0;
m_tierHits[2] = 0;
m_tier2Misses = 0;
m_tierTimes[0] = 0;
m_tierTimes[1] = 0;
m_tierTimes[2] = 0;
// m_tierHits[0] = 0;
// m_tierHits[1] = 0;
// m_tierHits[2] = 0;
// m_tier2Misses = 0;
// m_tierTimes[0] = 0;
// m_tierTimes[1] = 0;
// m_tierTimes[2] = 0;
//m_totalDedupCand = 0;
//m_dedupedCand = 0;
//m_bannedDups = 0;

10
Stats.h

@ -11,7 +11,7 @@
#include "SafeBuf.h"
#include "UdpProtocol.h" // MAX_MSG_TYPES
#include "IndexReadInfo.h"
//#include "IndexReadInfo.h"
class StatPoint {
public:
@ -143,8 +143,8 @@ class Stats {
// when we just request more docids from the same tier
int32_t m_msg3aFastRecalls;
// how many resolutions did we get on each tier
int32_t m_tierHits [MAX_TIERS];
int64_t m_tierTimes[MAX_TIERS];
//int32_t m_tierHits [MAX_TIERS];
//int64_t m_tierTimes[MAX_TIERS];
// how many searches did not get enough results?
int32_t m_tier2Misses;
// one count for each CR_* defined in Msg51.h
@ -160,8 +160,8 @@ class Stats {
//int32_t m_errored;
int32_t m_msg3aRecalls[6];
SafeBuf m_keyCols;
int32_t m_numTermsVsTier[14][MAX_TIERS];
int32_t m_termsVsTierExp[14][MAX_TIERS][7];
//int32_t m_numTermsVsTier[14][MAX_TIERS];
//int32_t m_termsVsTierExp[14][MAX_TIERS][7];
// use m_start so we know what msg stats to clear with memset
char m_start;

@ -12,6 +12,8 @@ Summary::Summary()
//m_buf = NULL;
m_bitScoresBuf = NULL;
m_bitScoresBufSize = 0;
m_wordWeights = NULL;
m_buf4 = NULL;
reset();
}
@ -36,6 +38,15 @@ void Summary::reset() {
m_numExcerpts = 0;
m_summaryLocs.reset();
m_summaryLocsPops.reset();
if ( m_wordWeights && m_wordWeights != (float *)m_tmpBuf ) {
mfree ( m_wordWeights , m_wordWeightSize , "sumww");
m_wordWeights = NULL;
}
m_wordWeights = NULL;
if ( m_buf4 && m_buf4 != m_tmpBuf4 ) {
mfree ( m_buf4 , m_buf4Size , "ssstkb" );
m_buf4 = NULL;
}
}
@ -151,6 +162,15 @@ bool Summary::set2 ( Xml *xml ,
end - start );
start = gettimeofdayInMilliseconds();*/
//
int32_t need1 = q->m_numWords * sizeof(float);
m_wordWeightSize = need1;
if ( need1 < 128 )
m_wordWeights = (float *)m_tmpBuf;
else
m_wordWeights = (float *)mmalloc ( need1 , "wwsum" );
if ( ! m_wordWeights ) return false;
// zero out all word weights
for ( int32_t i = 0 ; i < q->m_numWords; i++ )
@ -229,11 +249,25 @@ bool Summary::set2 ( Xml *xml ,
pend = m_summary + maxSummaryLen;
m_numExcerpts = 0;
int32_t need2 = (1+1+1) * m_q->m_numWords;
m_buf4Size = need2;
if ( need2 < 128 )
m_buf4 = m_tmpBuf4;
else
m_buf4 = (char *)mmalloc ( need2 , "stkbuf" );
if ( ! m_buf4 ) return false;
char *x = m_buf4;
char *retired = x;
x += m_q->m_numWords;
char *maxGotIt = x;
x += m_q->m_numWords;
char *gotIt = x;
// . the "maxGotIt" count vector accumulates into "retired"
// . that is how we keep track of what query words we used for previous
// summary excerpts so we try to get diversified excerpts with
// different query terms/words in them
char retired [ MAX_QUERY_WORDS ];
//char retired [ MAX_QUERY_WORDS ];
memset ( retired, 0, m_q->m_numWords * sizeof(char) );
// some query words are already matched in the title
@ -260,7 +294,7 @@ bool Summary::set2 ( Xml *xml ,
int32_t maxb = 0;
int32_t maxi = -1;
int32_t lasta = -1;
char maxGotIt [ MAX_QUERY_WORDS ];
//char maxGotIt [ MAX_QUERY_WORDS ];
if(lastNumFinal == numFinal) {
if(maxLoops-- <= 0) {
@ -296,7 +330,7 @@ bool Summary::set2 ( Xml *xml ,
if ( skip ) continue;
// ask him for the query words he matched
char gotIt [ MAX_QUERY_WORDS ];
//char gotIt [ MAX_QUERY_WORDS ];
// clear it for him
memset ( gotIt, 0, m_q->m_numWords * sizeof(char) );
@ -558,6 +592,12 @@ bool Summary::set2 ( Xml *xml ,
m_displayLen = p - m_summary;
}
// free the mem we used if we allocated it
if ( m_buf4 && m_buf4 != m_tmpBuf4 ) {
mfree ( m_buf4 , m_buf4Size , "ssstkb" );
m_buf4 = NULL;
}
// If we still didn't find a summary, get the default summary
if ( p == m_summary ) {
@ -570,6 +610,7 @@ bool Summary::set2 ( Xml *xml ,
maxSummaryLen );
if ( m_numDisplayLines > 0 )
m_displayLen = m_summaryLen;
return status;
}
@ -1211,7 +1252,7 @@ bool Summary::set1 ( char *doc ,
int32_t numTerms = q->getNumTerms();
// . now assign scores based on term frequencies
// . highest score is 10000, then 9900, 9800, 9700, ...
int32_t ptrs [ MAX_QUERY_TERMS ];
int32_t ptrs [ ABS_MAX_QUERY_TERMS ];
for ( int32_t i = 0 ; i < numTerms ; i++ ) ptrs[i] = i;
// convenience var
int64_t *freqs = termFreqs; // q->getTermFreqs();
@ -1232,7 +1273,7 @@ bool Summary::set1 ( char *doc ,
}
}
// assign scores, give rarest terms highest score
int32_t scores [ MAX_QUERY_TERMS ];
int32_t scores [ ABS_MAX_QUERY_TERMS ];
for ( int32_t i = 0 ; i < numTerms ; i++ )
scores[ptrs[i]] = 10000000 - (i*100);
// force QUERY stop words to have much lower scores at most 10000
@ -1441,7 +1482,7 @@ bool Summary::set1 ( char *doc ,
int32_t maxi = -1;
int32_t maxa = 0;
int32_t maxb = 0;
char gotIt [ MAX_QUERY_TERMS ];
char gotIt [ ABS_MAX_QUERY_TERMS ];
char *maxleft = NULL;
char *maxright = NULL;
for ( int32_t i = 0 ; i < numMatches ; i++ ) {

@ -266,7 +266,14 @@ class Summary {
char *m_bitScoresBuf;
int32_t m_bitScoresBufSize;
float m_wordWeights[MAX_QUERY_WORDS];
//float m_wordWeights[MAX_QUERY_WORDS];
float *m_wordWeights;
int32_t m_wordWeightSize;
char m_tmpBuf[128];
char *m_buf4;
int32_t m_buf4Size;
char m_tmpBuf4[128];
char m_summaryLocBuf[MAX_SUMMARY_LOCS*sizeof(uint64_t)];
SafeBuf m_summaryLocs;

@ -12,6 +12,7 @@
#include "Wiktionary.h"
Synonyms::Synonyms() {
m_synWordBuf.setLabel("syswbuf");
}
Synonyms::~Synonyms() {

@ -5049,8 +5049,8 @@ bool Tagdb::loadMinSiteInlinksBuffer2 ( ) {
// use 4 bytes for the first 130,000 entries or so to hold
// # of site inlinks. then we only need 1 byte since the remaining
// 25M are <256 sitenuminlinksunqiecblocks
m_siteBuf1.load(g_hostdb.m_dir,"sitelinks1.dat");
m_siteBuf2.load(g_hostdb.m_dir,"sitelinks2.dat");
m_siteBuf1.load(g_hostdb.m_dir,"sitelinks1.dat","stelnks1");
m_siteBuf2.load(g_hostdb.m_dir,"sitelinks2.dat","stelnks2");
m_siteBuf1.setLabel("sitelnks");
m_siteBuf2.setLabel("sitelnks");

@ -2565,11 +2565,10 @@ bool XmlDoc::indexDoc ( ) {
SafeBuf *ssDocMetaList = NULL;
// save this
int32_t saved = m_indexCode;
// and make it the real reason for the spider status doc
// make it the real reason for the spider status doc
m_indexCode = EDNSERROR;
// get the spiderreply ready to be added
ssDocMetaList = getSpiderStatusDocMetaList(NULL ,false);//del
// get the spiderreply ready to be added. false=del
ssDocMetaList =getSpiderStatusDocMetaList(NULL ,false);
// revert
m_indexCode = saved;
// error?
@ -2586,8 +2585,11 @@ bool XmlDoc::indexDoc ( ) {
char *url = "unknown";
if ( m_sreqValid ) url = m_sreq.m_url;
log("build: error2 getting real firstip of %"INT32" for "
"%s. Not adding new spider req", (int32_t)*fip,url);
log("build: error2 getting real firstip of "
"%"INT32" for "
"%s. Not adding new spider req. "
"spiderstatusdocsize=%"INT32, (int32_t)*fip,url,
m_addedStatusDocSize);
// also count it as a crawl attempt
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
@ -3130,8 +3132,9 @@ bool isRobotsTxtFile ( char *u , int32_t ulen ) {
bool XmlDoc::isContainerDoc ( ) {
if ( m_firstUrlValid && m_firstUrl.isWarc() ) return true;
if ( m_firstUrlValid && m_firstUrl.isArc () ) return true;
if ( ! m_contentDelimValid ) { char *xx=NULL;*xx=0; }
if ( m_contentDelim ) return true;
//if ( ! m_contentDelimValid ) { char *xx=NULL;*xx=0; }
//if ( m_contentDelim ) return true;
if ( m_contentDelimValid && m_contentDelim ) return true;
return false;
}
@ -9617,11 +9620,15 @@ float computeSimilarity ( int32_t *vec0 ,
// . stock the query term hash table
// . use the lower 32 bits of the termids to make compatible
// with the other vectors we use
int64_t *qtids = q->getTermIds ();
//int64_t *qtids = q->getTermIds ();
int32_t nt = q->getNumTerms();
for ( int32_t i = 0 ; i < nt ; i++ ) {
// get query term
QueryTerm *QT = &q->m_qterms[i];
// get the termid
int64_t termId = QT->m_termId;
// get it
uint32_t h = (uint32_t)(qtids[i] & 0xffffffff);
uint32_t h = (uint32_t)(termId & 0xffffffff);
// hash it
if ( ! qt.addKey ( &h ) ) return -1;
}
@ -28672,6 +28679,11 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
jd.safePrintf("\"gbssHttpStatus\":%"INT32",\n",
(int32_t)m_httpStatus);
// do not index gbssIsSeedUrl:0 because there will be too many usually
bool isSeed = ( m_sreqValid && m_sreq.m_isAddUrl );
if ( isSeed )
jd.safePrintf("\"gbssIsSeedUrl\":1,\n");
if ( od )
jd.safePrintf("\"gbssWasIndexed\":1,\n");
else
@ -28696,6 +28708,18 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
else
jd.safePrintf("\"gbssDiffbotUri\":"
"\"none\",\n");
// show the type as gbssDiffbotType:"article" etc.
JsonItem *dti = NULL;
if ( jp1 )
dti = jp1->getItem("type");
if ( dti ) {
jd.safePrintf("\"gbssDiffbotType\":\"");
int32_t vlen;
char *val = dti->getValueAsString( &vlen );
if ( val ) jd.jsonEncode ( val , vlen );
jd.safePrintf("\",\n");
}
}
else { // if ( cr->m_isCustomCrawl ) {
jd.safePrintf("\"gbssIsDiffbotObject\":0,\n");
@ -45262,7 +45286,7 @@ SafeBuf *XmlDoc::getMatchingQueriesScoredForFullQuery ( ) {
// prepend to the query?
int32_t ulen = m_firstUrl.m_ulen;
// go to next guy if this query is too big already
if ( ulen + qlen + 10 > MAX_QUERY_LEN ) {
if ( ulen + qlen + 10 > ABS_MAX_QUERY_LEN ) {
m_queryNum++;
goto loop;
}

@ -232,6 +232,17 @@ uint64_t hash64d ( char *p, int32_t plen ) {
char ncs = utf8Encode ( x , (char *)tmp );
// sanity check
if ( ncs > 4 ) { char *xx=NULL;*xx=0; }
// i've seen this happen for 4 byte char =
// -16,-112,-51,-125 which has x=66371 and y=66371
// but utf8Encode() returned 0!
if ( ncs == 0 ) {
// let's just hash it as-is then
tmp[0] = p[0];
if ( cs >= 1 ) tmp[1] = p[1];
if ( cs >= 2 ) tmp[2] = p[2];
if ( cs >= 3 ) tmp[3] = p[3];
ncs = cs;
}
// hash it up
h ^= g_hashtab [i++][tmp[0]];
if ( ncs == 1 ) continue;

33
hash.h

@ -250,6 +250,17 @@ inline uint64_t hash64Lower_utf8_nospaces ( char *p, int32_t len ) {
char ncs = utf8Encode ( y , tmp );
// sanity check
if ( ncs > 4 ) { char *xx=NULL;*xx=0; }
// i've seen this happen for 4 byte char =
// -16,-112,-51,-125 which has x=66371 and y=66371
// but utf8Encode() returned 0!
if ( ncs == 0 ) {
// let's just hash it as-is then
tmp[0] = p[0];
if ( cs >= 1 ) tmp[1] = p[1];
if ( cs >= 2 ) tmp[2] = p[2];
if ( cs >= 3 ) tmp[3] = p[3];
ncs = cs;
}
// hash it up
h ^= g_hashtab [i++][(uint8_t)tmp[0]];
if ( ncs == 1 ) continue;
@ -301,6 +312,17 @@ inline uint64_t hash64Lower_utf8_cont ( char *p,
char ncs = utf8Encode ( y , tmp );
// sanity check
if ( ncs > 4 ) { char *xx=NULL;*xx=0; }
// i've seen this happen for 4 byte char =
// -16,-112,-51,-125 which has x=66371 and y=66371
// but utf8Encode() returned 0!
if ( ncs == 0 ) {
// let's just hash it as-is then
tmp[0] = p[0];
if ( cs >= 1 ) tmp[1] = p[1];
if ( cs >= 2 ) tmp[2] = p[2];
if ( cs >= 3 ) tmp[3] = p[3];
ncs = cs;
}
// hash it up
h ^= g_hashtab [i++][(uint8_t)tmp[0]];
if ( ncs == 1 ) continue;
@ -376,6 +398,17 @@ inline uint64_t hash64Lower_utf8 ( char *p ) {
char ncs = utf8Encode ( y , (char *)tmp );
// sanity check
if ( ncs > 4 ) { char *xx=NULL;*xx=0; }
// i've seen this happen for 4 byte char =
// -16,-112,-51,-125 which has x=66371 and y=66371
// but utf8Encode() returned 0!
if ( ncs == 0 ) {
// let's just hash it as-is then
tmp[0] = p[0];
if ( cs >= 1 ) tmp[1] = p[1];
if ( cs >= 2 ) tmp[2] = p[2];
if ( cs >= 3 ) tmp[3] = p[3];
ncs = cs;
}
// hash it up
h ^= g_hashtab [i++][(uint8_t)tmp[0]];
if ( ncs == 1 ) continue;

@ -4998,7 +4998,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
if ( ! f.doesExist() ) target = "gb";
sprintf(tmp,
"scp -c blowfish " // blowfish is faster
"scp -c arcfour " // blowfish is faster
"%s%s "
"%s:%s/gb.installed%s",
dir,