Merge branch 'master' into nomerge2

This commit is contained in:
Ivan Skytte Jørgensen
2017-02-20 16:09:13 +01:00
26 changed files with 103 additions and 215 deletions

1
.gitignore vendored

@ -44,3 +44,4 @@ CMakeLists.txt
*.gcno
coverage*.html
vgcore.*
Make.depend

@ -1150,7 +1150,8 @@ bool HttpServer::sendReply2 ( const char *mime,
// if we are a proxy, and not a compression proxy, then just forward
// the blob as-is if it is a "ZET" (GET-compressed=ZET)
else if ( (myHostType & HT_PROXY) && (*rb == 'Z') ) {
gbmemcpy ( sendBuf , content, contentLen );
if(content)
gbmemcpy ( sendBuf , content, contentLen );
// sanity check
if ( sendBufSize != contentLen ) { g_process.shutdownAbort(true); }
// note it
@ -1161,7 +1162,8 @@ bool HttpServer::sendReply2 ( const char *mime,
gbmemcpy ( p , mime , mimeLen );
p += mimeLen;
// then the page
gbmemcpy ( p , content, contentLen );
if(content)
gbmemcpy ( p , content, contentLen );
p += contentLen;
// sanity check
if ( sendBufSize != contentLen+mimeLen) { g_process.shutdownAbort(true);}

@ -352,8 +352,6 @@ bool Images::getThumbnail ( const char *pageSite,
// just use msg0 and limit to like 1k or something
if ( ! m_msg0.getList ( -1 , // hostid
-1 , // ip
-1 , // port
0 , // maxAge
false , // addToCache?
RDB_POSDB ,
@ -367,7 +365,6 @@ bool Images::getThumbnail ( const char *pageSite,
MAX_NICENESS ,
false , // err correction?
true , // inc tree?
true , // domergeobsolete
-1 , // firstHostId
0 , // start filenum
-1 , // numFiles
@ -440,8 +437,6 @@ bool Images::launchRequests ( ) {
// get the termlist
if ( ! m_msg0.getList ( -1 , // hostid
-1 , // ip
-1 , // port
0 , // maxAge
false , // addToCache?
RDB_POSDB,
@ -455,7 +450,6 @@ bool Images::launchRequests ( ) {
MAX_NICENESS ,
false , // err correction?
true , // inc tree?
true , // domergeobsolete
-1 , // firstHostId
0 , // start filenum
-1 , // numFiles

@ -62,7 +62,7 @@ void Matches::reset2() {
}
bool Matches::isMatchableTerm(const QueryTerm *qt) const {
QueryWord *qw = qt->m_qword;
const QueryWord *qw = qt->m_qword;
// not derived from a query word? how?
if ( ! qw ) return false;
if ( qw->m_ignoreWord == IGNORE_DEFAULT ) return false;
@ -160,7 +160,7 @@ void Matches::setQuery ( Query *q ) {
}
// get the word it is from
QueryWord *qw = qt->m_qword;
const QueryWord *qw = qt->m_qword;
// get word #
int32_t qwn = qw - q->m_qwords;

71
Mem.cpp

@ -35,7 +35,6 @@ static const char MAGICCHAR = (char)0xda;
class Mem g_mem;
static bool freeCacheMem();
@ -59,7 +58,7 @@ static bool s_initialized = 0;
//note: the ScopedMemoryLimitBypass is not thread-safe. The "bypass" flag should really
//be per-thread. Or RdbBase should be reworked to use another technique than artificially
//raising the memory limit while adding a file. Eg. make freeCacheMem() work again?
//raising the memory limit while adding a file.
ScopedMemoryLimitBypass::ScopedMemoryLimitBypass()
: oldMaxMem(g_conf.m_maxMem)
{
@ -125,8 +124,9 @@ void Mem::delnew ( void *ptr , size_t size , const char *note ) {
void * operator new (size_t size) throw (std::bad_alloc) {
logTrace( g_conf.m_logTraceMem, "size=%zu", size );
// don't let electric fence zap us
if ( size == 0 ) return (void *)0x7fffffff;
//new operator is required to return a unique pointer even for zero-byte allocations
if(size==0)
size = 1;
if ( allocationShouldFailRandomly() ) {
g_errno = ENOMEM;
@ -163,8 +163,9 @@ void * operator new (size_t size) throw (std::bad_alloc) {
void * operator new [] (size_t size) throw (std::bad_alloc) {
logTrace( g_conf.m_logTraceMem, "size=%zu", size );
// don't let electric fence zap us
if ( size == 0 ) return (void *)0x7fffffff;
//new operator is required to return a unique pointer even for zero-byte allocations
if(size==0)
size = 1;
size_t max = g_conf.m_maxMem;
@ -899,8 +900,9 @@ int Mem::printMem ( ) {
void *Mem::gbmalloc ( size_t size , const char *note ) {
logTrace( g_conf.m_logTraceMem, "size=%zu note='%s'", size, note );
// don't let electric fence zap us
if ( size == 0 ) return (void *)0x7fffffff;
//malloc() can return a NULL pointer if the size is zero
if(size==0)
return NULL;
if ( allocationShouldFailRandomly() ) {
g_errno = ENOMEM;
@ -908,13 +910,10 @@ void *Mem::gbmalloc ( size_t size , const char *note ) {
return NULL;
}
retry:
size_t max = g_conf.m_maxMem;
// don't go over max
if ( g_mem.getUsedMem() + size + UNDERPAD + OVERPAD >= max ) {
// try to free temp mem. returns true if it freed some.
if ( freeCacheMem() ) goto retry;
g_errno = ENOMEM;
log( LOG_WARN, "mem: malloc(%zu): Out of memory", size );
return NULL;
@ -924,12 +923,8 @@ retry:
mem = (void *)sysmalloc ( size + UNDERPAD + OVERPAD );
int32_t memLoop = 0;
mallocmemloop:
if ( ! mem && size > 0 ) {
g_mem.m_outOfMems++;
// try to free temp mem. returns true if it freed some.
if ( freeCacheMem() ) goto retry;
g_errno = errno;
static int64_t s_lastTime;
static int32_t s_missed = 0;
@ -952,24 +947,6 @@ mallocmemloop:
return NULL;
}
if ( (PTRTYPE)mem < 0x00010000 ) {
void *remem = sysmalloc(size);
log( LOG_WARN, "mem: Caught low memory allocation "
"at %08" PTRFMT", "
"reallocated to %08" PTRFMT"",
(PTRTYPE)mem, (PTRTYPE)remem );
sysfree(mem);
mem = remem;
memLoop++;
if ( memLoop > 100 ) {
log( LOG_WARN, "mem: Attempted to reallocate low "
"memory allocation 100 times, "
"aborting and returning NOMEM." );
g_errno = ENOMEM;
return NULL;
}
goto mallocmemloop;
}
logTrace( g_conf.m_logTraceMem, "mem=%p size=%zu note='%s'", mem, size, note );
@ -991,27 +968,21 @@ void *Mem::gbcalloc ( size_t size , const char *note ) {
void *Mem::gbrealloc ( void *ptr , size_t oldSize , size_t newSize , const char *note ) {
logTrace( g_conf.m_logTraceMem, "ptr=%p oldSize=%zu newSize=%zu note='%s'", ptr, oldSize, newSize, note );
// return dummy values since realloc() returns NULL if failed
if ( oldSize == 0 && newSize == 0 ) return (void *)0x7fffffff;
// do nothing if size is same
if ( oldSize == newSize ) return ptr;
// if newSize is 0...
if ( newSize == 0 ) {
gbfree(ptr, note, oldSize, true);
return (void *)0x7fffffff;
return NULL;
}
retry:
// hack so hostid #0 can use more mem
size_t max = g_conf.m_maxMem;
//if ( g_hostdb.m_hostId == 0 ) max += 2000000000;
// don't go over max
if ( g_mem.getUsedMem() + newSize - oldSize >= max ) {
// try to free temp mem. returns true if it freed some.
if ( freeCacheMem() ) goto retry;
g_errno = ENOMEM;
log( LOG_WARN, "mem: realloc(%zu,%zu): Out of memory.",oldSize,newSize);
return NULL;
@ -1026,8 +997,6 @@ retry:
rmMem(ptr, oldSize, note, true);
// . do the actual realloc
// . CAUTION: don't pass in 0x7fffffff in as "ptr"
// . this was causing problems
char *mem = (char *)sysrealloc ( (char *)ptr - UNDERPAD , newSize + UNDERPAD + OVERPAD );
// remove old guy on success
@ -1063,7 +1032,7 @@ retry:
return mem;
}
char *Mem::dup ( const void *data , size_t dataSize , const char *note ) {
void *Mem::dup ( const void *data , size_t dataSize , const char *note ) {
logTrace( g_conf.m_logTraceMem, "data=%p dataSize=%zu note='%s'", data, dataSize, note );
// keep it simple
@ -1074,10 +1043,6 @@ char *Mem::dup ( const void *data , size_t dataSize , const char *note ) {
return mem;
}
char *Mem::strdup( const char *string, const char *note ) {
return dup(string, strlen(string) + 1, note);
}
void Mem::gbfree ( void *ptr , const char *note, size_t size , bool checksize ) {
if(!s_lock.working) return;
@ -1108,15 +1073,3 @@ void Mem::gbfree ( void *ptr , const char *note, size_t size , bool checksize )
if ( isnew ) sysfree ( (char *)ptr );
else sysfree ( (char *)ptr - UNDERPAD );
}
//#include "Msg20.h"
static bool freeCacheMem() {
// returns true if it did free some stuff
//if ( resetMsg20Cache() ) {
// log("mem: freed cache mem.");
// return true;
//}
return false;
}

9
Mem.h

@ -30,10 +30,7 @@ class Mem {
void *gbcalloc ( size_t size , const char *note);
void *gbrealloc ( void *oldPtr, size_t oldSize, size_t newSize, const char *note);
void gbfree(void *ptr, const char *note, size_t size, bool checksize);
char *dup ( const void *data , size_t dataSize , const char *note);
char *strdup ( const char *string, const char *note );
int32_t validate();
void *dup ( const void *data , size_t dataSize , const char *note);
// this one does not include new/delete mem, only *alloc()/free() mem
size_t getUsedMem() const;
@ -74,6 +71,8 @@ class Mem {
const char *m_maxAllocBy; // the biggest single alloc ever done
private:
int32_t validate();
int32_t getMemSlot(void *mem);
// currently used mem (estimate)
@ -108,7 +107,7 @@ static inline void mfree(void *ptr, size_t size, const char *note) {
return g_mem.gbfree(ptr, note, size, true);
}
static inline char *mdup(const void *data, size_t dataSize, const char *note) {
static inline void *mdup(const void *data, size_t dataSize, const char *note) {
return g_mem.dup(data, dataSize, note);
}

@ -99,8 +99,6 @@ bool Msg0::registerHandler ( ) {
// the list updates it on disk it can't flush our cache... so use a small
// maxCacheAge of like , 30 seconds or so...
bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none)
int32_t ip , // info on hostId
int16_t port ,
int32_t maxCacheAge , // max cached age in seconds
bool addToCache , // add net recv'd list to cache?
rdbid_t rdbId , // specifies the rdb
@ -114,7 +112,6 @@ bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none)
int32_t niceness ,
bool doErrorCorrection ,
bool includeTree ,
bool doMerge ,
int32_t firstHostId ,
int32_t startFileNum ,
int32_t numFiles ,
@ -228,32 +225,6 @@ bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none)
// it it stored locally?
bool isLocal = ( m_hostId == -1 && m_shardNum == getMyShardNum() );
/*
int64_t singleDocIdQuery = 0LL;
if ( rdbId == RDB_POSDB ) {
int64_t d1 = g_posdb.getDocId(m_startKey);
int64_t d2 = g_posdb.getDocId(m_endKey);
if ( d1+1 == d2 ) singleDocIdQuery = d1;
}
// . try the LOCAL termlist cache
// . so when msg2 is evaluating a gbdocid:| query and it has to
// use msg0 to go across the network to get the same damn termlist
// over and over again for the same docid, this will help alot.
// . ideally it'd be nice if the seo pipe in xmldoc.cpp can try to
// send the same gbdocid:xxxx docids to the same hosts. maybe hash
// based on docid into the list of hosts and if that host is busy
// just chain until we find someone not busy.
if ( singleDocIdQuery &&
getListFromTermListCache ( coll,
m_startKey,
m_endKey,
maxCacheAge,
list ) )
// found!
return true;
*/
// but always local if only one host
if ( g_hostdb.getNumHosts() == 1 ) isLocal = true;
@ -272,7 +243,7 @@ bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none)
try { m_msg5 = new ( Msg5 ); }
catch ( ... ) {
g_errno = ENOMEM;
log("net: Local alloc for disk read failed "
log(LOG_WARN, "net: Local alloc for disk read failed "
"while tring to read data for %s. "
"Trying remote request.",
getDbnameFromId(m_rdbId));
@ -282,7 +253,7 @@ bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none)
m_deleteMsg5 = true;
}
if ( ! m_msg5->getList ( (rdbid_t)rdbId,
if ( ! m_msg5->getList ( rdbId,
m_collnum ,
m_list ,
m_startKey ,
@ -317,13 +288,10 @@ skip:
log(LOG_DEBUG,"net: msg0: Sending request for data to "
"shard=%" PRIu32" "
"listPtr=%" PTRFMT" minRecSizes=%" PRId32" termId=%" PRIu64" "
//"startKey.n1=%" PRIx32",n0=%" PRIx64" (niceness=%" PRId32")",
"startKey.n1=%" PRIx64",n0=%" PRIx64" (niceness=%" PRId32")",
//g_hostdb.makeHostId ( m_groupId ) ,
m_shardNum,
(PTRTYPE)m_list,
m_minRecSizes, Posdb::getTermId(m_startKey) ,
//m_startKey.n1,m_startKey.n0 , (int32_t)m_niceness);
KEY1(m_startKey,m_ks),KEY0(m_startKey),
(int32_t)m_niceness);
@ -346,8 +314,6 @@ skip:
*p = (char)m_allowPageCache; p++;
KEYSET(p,m_startKey,m_ks); ; p+=m_ks;
KEYSET(p,m_endKey,m_ks); ; p+=m_ks;
// NULL terminated collection name
//strcpy ( p , coll ); p += strlen ( coll ); *p++ = '\0';
*(collnum_t *)p = m_collnum; p += sizeof(collnum_t);
m_requestSize = p - m_request;
// ask an individual host for this list if hostId is NOT -1
@ -392,7 +358,6 @@ skip:
// . need to send out to all the indexdb split hosts
m_numRequests = 0;
m_numReplies = 0;
//for ( int32_t i = 0; i < m_numSplit; i++ ) {
// get the multicast
Multicast *m = &m_mcast;

3
Msg0.h

@ -43,8 +43,6 @@ class Msg0 {
// out of sync with the data
// . a maxCacheAge of 0 (or negative) means not to check the cache
bool getList ( int64_t hostId , // -1 if unspecified
int32_t ip , // info on hostId
int16_t port ,
int32_t maxCacheAge , // max cached age in seconds
bool addToCache , // add net recv'd list to cache?
rdbid_t rdbId , // specifies the rdb
@ -58,7 +56,6 @@ class Msg0 {
int32_t niceness ,
bool doErrorCorrection = true ,
bool includeTree = true ,
bool doMerge = true ,
int32_t firstHostId = -1 ,
int32_t startFileNum = 0 ,
int32_t numFiles = -1 ,

@ -226,6 +226,13 @@ bool Msg13::forwardRequest ( ) {
if ( ++hostId >= nh ) hostId = 0;
}
if(!h) {
//all spider hosts dead (or misconfiguration)
if(!g_errno)
g_errno = ENOHOSTS;
log("spider: msg13 request: %s",mstrerror(g_errno));
return true;
}
hostId = 0; // HACK!!

@ -610,7 +610,7 @@ void Msg39::getLists(int fileNum, int64_t docIdStart, int64_t docIdEnd) {
//char *tpc = qt->m_term + qt->m_termLen;
char sign = qt->m_termSign;
if ( sign == 0 ) sign = '0';
QueryWord *qw = qt->m_qword;
const QueryWord *qw = qt->m_qword;
int32_t wikiPhrId = qw->m_wikiPhraseId;
if ( m_query.isPhrase(i) ) wikiPhrId = 0;
char leftwikibigram = 0;

@ -1529,8 +1529,6 @@ bool Msg5::getRemoteList ( ) {
// data
verify_signature();
if ( ! m_msg0->getList ( h->m_hostId ,
h->m_ip ,
h->m_port ,
0 , // max cached age
false , // add to cache?
m_rdbId , // rdbId
@ -1544,7 +1542,6 @@ bool Msg5::getRemoteList ( ) {
m_niceness ,
false , // do error correction?
true , // include tree?
true , // do merge? (obsolete)
-1 , // first hostid
0 , // startFileNum
-1 , // numFiles (-1=all)

@ -314,8 +314,6 @@ bool Msg51::sendRequest ( int32_t i ) {
// . returns false and sets g_errno on error
// . otherwise, it blocks and returns true
bool s = m_slot[i].m_msg0.getList( -1 , // hostid
-1 , // ip
-1 , // port
m_maxCacheAge ,
m_addToCache ,
RDB_CLUSTERDB ,
@ -329,7 +327,6 @@ bool Msg51::sendRequest ( int32_t i ) {
m_niceness ,
true , // doErrorCorrection
true , // includeTree
true , // doMerge?
firstHostId ,
0 , // startFileNum
-1 , // numFiles

@ -58,7 +58,6 @@ Multicast::Multicast()
m_lastLaunch(0),
m_freeReadBuf(false),
m_key(0),
m_sendToSelf(false),
m_sentToTwin(false)
{
constructor();
@ -102,16 +101,13 @@ void Multicast::reset ( ) {
bool Multicast::send(char *msg, int32_t msgSize, msg_type_t msgType, bool ownMsg, uint32_t shardNum, bool sendToWholeGroup_,
int32_t key, void *state, void *state2, void (*callback)(void *state, void *state2),
int64_t totalTimeout, int32_t niceness, int32_t firstHostId, bool freeReplyBuf) {
bool sendToSelf = true;
// make sure not being re-used!
if ( m_inUse ) {
log( LOG_ERROR, "net: Attempt to re-use active multicast");
g_process.shutdownAbort(true);
}
// reset to free "m_msg" in case we are being re-used (like by Msg14)
//log(LOG_DEBUG, "Multicast: send() 0x%02x",msgType);
reset();
// it is now in use
m_inUse = true;
// set the parameters in this class
@ -121,7 +117,6 @@ bool Multicast::send(char *msg, int32_t msgSize, msg_type_t msgType, bool ownMsg
m_freeReadBuf = freeReplyBuf;
m_msgSize = msgSize;
m_msgType = msgType;
//m_groupId = groupId;
m_state = state;
m_state2 = state2;
m_callback = callback;
@ -135,7 +130,6 @@ bool Multicast::send(char *msg, int32_t msgSize, msg_type_t msgType, bool ownMsg
m_readBufSize = 0;
m_readBufMaxSize = 0;
m_registeredSleep = false;
m_sendToSelf = sendToSelf;
m_sentToTwin = false;
m_key = key;
@ -198,16 +192,7 @@ void Multicast::sendToWholeGroup() {
// if we got a nice reply from him skip him
//slots[i] && m_host[i].m_slot->doneReading() ) continue;
if ( m_host[i].m_retired ) continue;
// sometimes msg1.cpp is able to add the data to the tree
// without problems and will save us a network trans here
if ( ! m_sendToSelf &&
h->m_hostId == g_hostdb.m_hostId &&
! g_conf.m_interfaceMachine ) {
m_host[i].m_retired = true;
m_host[i].m_errno = 0;
m_numReplies++;
continue;
}
// . timeout is in seconds
// . timeout is just the time remaining for the whole groupcast
// int32_t timeout = m_startTime + m_totalTimeout - getTime();
@ -580,7 +565,6 @@ bool Multicast::sendToHost ( int32_t i ) {
int16_t destPort = h->m_port;
// if from hosts2.conf pick the best ip!
//int32_t bestIp = h->m_ip;
int32_t bestIp = g_hostdb.getBestHosts2IP ( h );
// sanity check

@ -194,9 +194,6 @@ private:
int32_t m_key;
// Msg1 might be able to add data to our tree to save a net trans.
bool m_sendToSelf;
bool m_sentToTwin;
void destroySlotsInProgress ( UdpSlot *slot );

@ -651,7 +651,7 @@ static bool printIgnoredWords ( SafeBuf *sb , const SearchInput *si ) {
bool firstIgnored = true;
for ( int32_t i = 0 ; i < qq2->m_numWords ; i++ ) {
//if ( si->m_xml ) break;
QueryWord *qw = &qq2->m_qwords[i];
const QueryWord *qw = &qq2->m_qwords[i];
// only print out words ignored cuz they were stop words
if ( qw->m_ignoreWord != IGNORE_QSTOP ) continue;
// print header -- we got one
@ -1096,7 +1096,7 @@ bool printSearchResultsHeader ( State0 *st ) {
,qt->m_termId);
sb->safePrintf("\t\t\t<termHash64>%" PRIu64"</termHash64>\n"
,qt->m_rawTermId);
QueryWord *qw = qt->m_qword;
const QueryWord *qw = qt->m_qword;
sb->safePrintf("\t\t\t<prefixHash64>%" PRIu64"</prefixHash64>\n"
,qw->m_prefixHash);
sb->safePrintf("\t\t</term>\n");
@ -1174,7 +1174,7 @@ bool printSearchResultsHeader ( State0 *st ) {
,qt->m_rawTermId);
// don't end last query term attr on a comma
QueryWord *qw = qt->m_qword;
const QueryWord *qw = qt->m_qword;
sb->safePrintf("\t\t\"prefixHash64\":%" PRIu64"\n"
,qw->m_prefixHash);

@ -4881,7 +4881,7 @@ void Parms::init ( ) {
"an error message if a shard was down and did not return "
"results for a query. The XML and JSON feed let's you know "
"when a shard is down and will give you the results back "
"any way, but if you would rather have just and error message "
"any way, but if you would rather have just an error message "
"and no results, then set then set this to 'NO'.";
m->m_cgi = "rra";
simple_m_set(Conf,m_returnResultsAnyway);

@ -1521,8 +1521,6 @@ bool PosdbTable::setQueryTermInfo ( ) {
// point to those
QueryTermInfo *qtibuf = (QueryTermInfo *)m_qiBuf.getBufStart();
RdbList *list = NULL;
int32_t nrg = 0;
// assume not sorting by a numeric termlist
@ -1553,7 +1551,7 @@ bool PosdbTable::setQueryTermInfo ( ) {
}
// set this stuff
QueryWord *qw = qt->m_qword;
const QueryWord *qw = qt->m_qword;
//int32_t wordNum = qw - &m_q->m_qwords[0];
// get one
QueryTermInfo *qti = &qtibuf[nrg];
@ -1630,7 +1628,7 @@ bool PosdbTable::setQueryTermInfo ( ) {
leftAlreadyAdded = true;
// get list
//list = m_msg2->getList(left);
list = m_q->m_qterms[left].m_posdbListPtr;
RdbList *list = m_q->m_qterms[left].m_posdbListPtr;
// add list ptr into our required group
qti->m_subLists[nn] = list;
// left bigram is #2
@ -1680,7 +1678,7 @@ bool PosdbTable::setQueryTermInfo ( ) {
rightAlreadyAdded = true;
// get list
//list = m_msg2->getList(right);
list = m_q->m_qterms[right].m_posdbListPtr;
RdbList *list = m_q->m_qterms[right].m_posdbListPtr;
// add list ptr into our required group
qti->m_subLists[nn] = list;
// right bigram is #3
@ -1730,7 +1728,7 @@ bool PosdbTable::setQueryTermInfo ( ) {
// add to it. add backwards since we give precedence to
// the first list and we want that to be the NEWEST list!
//list = m_msg2->getList(i);
list = m_q->m_qterms[i].m_posdbListPtr;
RdbList *list = m_q->m_qterms[i].m_posdbListPtr;
// add list ptr into our required group
qti->m_subLists[nn] = list;
// how many in there?

@ -133,9 +133,8 @@ bool Query::set2 ( const char *query ,
// truncate query if too big
if ( queryLen >= ABS_MAX_QUERY_LEN ) {
log("query: Query length of %" PRId32" must be "
"less than %" PRId32". "
"Truncating.",queryLen,(int32_t)ABS_MAX_QUERY_LEN);
log("query: Query length of %" PRId32" must be less than %" PRId32". Truncating.",
queryLen,(int32_t)ABS_MAX_QUERY_LEN);
queryLen = ABS_MAX_QUERY_LEN - 1;
m_truncated = true;
}
@ -256,7 +255,7 @@ bool Query::set2 ( const char *query ,
// disable stuff for site:, ip: and url: queries
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
const QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord ) continue;
if ( qw->m_fieldCode == FIELD_SITE &&
qw->m_wordSign != '-' )
@ -379,7 +378,7 @@ bool Query::setQTerms ( const Words &words ) {
// count phrases first for allocating
int32_t nqt = 0;
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
const QueryWord *qw = &m_qwords[i];
// skip if ignored... mdw...
if ( ! qw->m_phraseId ) continue;
if ( qw->m_ignorePhrase ) continue; // could be a repeat
@ -390,7 +389,7 @@ bool Query::setQTerms ( const Words &words ) {
}
// count single terms
for ( int32_t i = 0 ; i < m_numWords; i++ ) {
QueryWord *qw = &m_qwords[i];
const QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord &&
qw->m_ignoreWord != IGNORE_QSTOP) continue;
// ignore if in quotes and part of phrase, watch out
@ -413,7 +412,7 @@ bool Query::setQTerms ( const Words &words ) {
int64_t to = hash64n("to");
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
// get query word
QueryWord *qw = &m_qwords[i];
const QueryWord *qw = &m_qwords[i];
// skip if in quotes, we will not get synonyms for it
if ( qw->m_inQuotes ) continue;
// skip if has plus sign in front
@ -484,13 +483,13 @@ bool Query::setQTerms ( const Words &words ) {
// stop breach
if ( n >= ABS_MAX_QUERY_TERMS ) {
log("query: lost query phrase terms to max term "
"limit of %" PRId32,(int32_t)ABS_MAX_QUERY_TERMS );
log("query: lost query phrase terms to max term limit of %" PRId32,
(int32_t)ABS_MAX_QUERY_TERMS);
break;
}
if ( n >= m_maxQueryTerms ) {
log("query: lost query phrase terms to max term cr "
"limit of %" PRId32,(int32_t)m_maxQueryTerms);
log("query: lost query phrase terms to max term cr limit of %" PRId32,
(int32_t)m_maxQueryTerms);
break;
}
@ -579,13 +578,13 @@ bool Query::setQTerms ( const Words &words ) {
// stop breach
if ( n >= ABS_MAX_QUERY_TERMS ) {
log("query: lost query terms to max term "
"limit of %" PRId32,(int32_t)ABS_MAX_QUERY_TERMS );
log("query: lost query terms to max term limit of %" PRId32,
(int32_t)ABS_MAX_QUERY_TERMS);
break;
}
if ( n >= m_maxQueryTerms ) {
log("query: lost query terms to max term cr "
"limit of %" PRId32,(int32_t)m_maxQueryTerms);
log("query: lost query terms to max term cr limit of %" PRId32,
(int32_t)m_maxQueryTerms);
break;
}
@ -722,7 +721,7 @@ bool Query::setQTerms ( const Words &words ) {
// . set implicit bits, m_implicitBits
// . set m_inPhrase
for (int32_t i = 0; i < m_numWords ; i++ ){
QueryWord *qw = &m_qwords[i];
const QueryWord *qw = &m_qwords[i];
QueryTerm *qt = qw->m_queryWordTerm;
if (!qt) continue;
if ( qw->m_queryPhraseTerm )
@ -752,7 +751,7 @@ bool Query::setQTerms ( const Words &words ) {
// was working.
for ( int32_t j = 0 ; j < m_numWords ; j++ ) {
// must be our same wordId (same word, different occ.)
QueryWord *qw2 = &m_qwords[j];
const QueryWord *qw2 = &m_qwords[j];
if ( qw2->m_wordId != qw->m_wordId ) continue;
// get first word in the phrase that jth word is in
int32_t pn2 = qw2->m_leftPhraseStart;
@ -827,18 +826,16 @@ bool Query::setQTerms ( const Words &words ) {
for ( int32_t j = 0 ; j < naids ; j++ ) {
// stop breach
if ( n >= ABS_MAX_QUERY_TERMS ) {
log("query: lost synonyms due to max term "
"limit of %" PRId32,
(int32_t)ABS_MAX_QUERY_TERMS );
log("query: lost synonyms due to max term limit of %" PRId32,
(int32_t)ABS_MAX_QUERY_TERMS);
break;
}
// this happens for 'da da da'
if ( ! origTerm ) continue;
if ( n >= m_maxQueryTerms ) {
log("query: lost synonyms due to max cr term "
"limit of %" PRId32,
(int32_t)m_maxQueryTerms);
log("query: lost synonyms due to max cr term limit of %" PRId32,
(int32_t)m_maxQueryTerms);
break;
}
@ -1014,8 +1011,7 @@ bool Query::setQTerms ( const Words &words ) {
m_forcedBits = 0; // terms with + signs
m_synonymBits = 0;
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
// QueryTerms are derived from QueryWords
QueryTerm *qt = &m_qterms[i];
const QueryTerm *qt = &m_qterms[i];
// don't require if negative
if ( qt->m_termSign == '-' ) {
m_negativeBits |= qt->m_explicitBit; // (1 << i );
@ -1040,8 +1036,7 @@ bool Query::setQTerms ( const Words &words ) {
// set m_matchRequiredBits which we use for Matches.cpp
m_matchRequiredBits = 0;
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
// QueryTerms are derived from QueryWords
QueryTerm *qt = &m_qterms[i];
const QueryTerm *qt = &m_qterms[i];
// don't require if negative
if ( qt->m_termSign == '-' ) continue;
// skip all phrase terms
@ -1070,7 +1065,6 @@ bool Query::setQTerms ( const Words &words ) {
m_numRequired = 0;
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
// QueryTerms are derived from QueryWords
QueryTerm *qt = &m_qterms[i];
// assume not required
qt->m_isRequired = false;
@ -1087,9 +1081,22 @@ bool Query::setQTerms ( const Words &words ) {
}
//workaround/hack for double-highfreqterm searches, such as "of a" or "the the" or "the who"
if(m_numWords==3 &&
m_qwords[0].m_ignoreWord==IGNORE_HIGHFREMTERM &&
m_qwords[2].m_ignoreWord==IGNORE_HIGHFREMTERM &&
m_numTerms==1 &&
!m_qterms[0].m_isRequired)
{
log(LOG_DEBUG, "query: Looks like a highfreqterm-highfreqterm query type. Requiring one-and-only QueryTerm/bigram");
m_qterms[0].m_isRequired = true;
//todo: we should investigate if QueryTerm::m_isRequired actually has any effect. It is used
//in a single place in PosdbTable for not generating a QueryTermInfo, but it appears it works
//fine even with the QTI.
}
// required quoted phrase terms
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
// QueryTerms are derived from QueryWords
QueryTerm *qt = &m_qterms[i];
// quoted phrase?
if ( ! qt->m_isPhrase ) continue;
@ -1111,21 +1118,20 @@ bool Query::setQTerms ( const Words &words ) {
// . for 'in the nick' , a wiki phrase, make "in the" required
// and give a big bonus for "the nick" below.
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
// QueryTerms are derived from QueryWords
QueryTerm *qt = &m_qterms[i];
// don't require if negative
if ( qt->m_termSign == '-' ) continue;
// only check bigrams here
if ( ! qt->m_isPhrase ) continue;
// get the query word that starts this phrase
QueryWord *qw1 = qt->m_qword;
const QueryWord *qw1 = qt->m_qword;
// must be in a wikiphrase
if ( qw1->m_wikiPhraseId <= 0 ) continue;
// what query word # is that?
int32_t qwn = qw1 - m_qwords;
// get the next alnum word after that
// assume it's the last word in our bigram phrase
QueryWord *qw2 = &m_qwords[qwn+2];
const QueryWord *qw2 = &m_qwords[qwn+2];
// must be in same wikiphrase
if ( qw2->m_wikiPhraseId != qw1->m_wikiPhraseId ) continue;
// must be two stop words
@ -1191,7 +1197,6 @@ bool Query::setQTerms ( const Words &words ) {
// is a synonym term of the single word term "enough" and is treated
// as such in the Posdb.cpp logic.
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
// QueryTerms are derived from QueryWords
QueryTerm *qt = &m_qterms[i];
// assume not!
qt->m_isWikiHalfStopBigram = 0;
@ -1200,14 +1205,14 @@ bool Query::setQTerms ( const Words &words ) {
// only check bigrams here
if ( ! qt->m_isPhrase ) continue;
// get the query word that starts this phrase
QueryWord *qw1 = qt->m_qword;
const QueryWord *qw1 = qt->m_qword;
// must be in a wikiphrase
if ( qw1->m_wikiPhraseId <= 0 ) continue;
// what query word # is that?
int32_t qwn = qw1 - m_qwords;
// get the next alnum word after that
// assume it's the last word in our bigram phrase
QueryWord *qw2 = &m_qwords[qwn+2];
const QueryWord *qw2 = &m_qwords[qwn+2];
// must be in same wikiphrase
if ( qw2->m_wikiPhraseId != qw1->m_wikiPhraseId ) continue;
// if both query stop words, should have been handled above
@ -2201,7 +2206,7 @@ bool Query::setQWords ( char boolFlag ,
if ( !phrases.set( &words, &bits ) )
return false;
int64_t *wids = words.getWordIds();
const int64_t *wids = words.getWordIds();
// do phrases stuff
for ( int32_t i = 0 ; i < numWords ; i++ ) {
@ -2474,13 +2479,13 @@ bool Query::setQWords ( char boolFlag ,
// . how many non-negative, non-ignored words/phrases do we have?
count = 0;
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
const QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord ) continue;
if ( qw->m_wordSign == '-' ) continue;
count++;
}
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
const QueryWord *qw = &m_qwords[i];
if ( qw->m_ignorePhrase ) continue;
if ( qw->m_phraseSign == '-' ) continue;
if ( qw->m_phraseId == 0LL ) continue;
@ -2592,7 +2597,7 @@ int32_t Query::getWordNum(int64_t wordId) const {
// skip if punct or whatever
if ( wordId == 0LL || wordId == -1LL ) return -1;
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
const QueryWord *qw = &m_qwords[i];
// the non-raw word id includes a hash with "0", which
// signifies an empty field term
if ( qw->m_rawWordId == wordId ) return i;
@ -3343,7 +3348,7 @@ bool Expression::isTruth(const unsigned char *bitVec, int32_t vecSize) const {
// so operands are expressions as well
Expression *e = (Expression *)qw->m_expressionPtr;
const Expression *e = (const Expression *)qw->m_expressionPtr;
if ( e ) {
// save prev one. -1 means no prev.
prevResult = opResult;
@ -3375,7 +3380,7 @@ bool Expression::isTruth(const unsigned char *bitVec, int32_t vecSize) const {
// save old one
prevResult = opResult;
// convert word to term #
QueryTerm *qt = qw->m_queryWordTerm;
const QueryTerm *qt = qw->m_queryWordTerm;
// fix title:"notre dame" AND NOT irish
if ( ! qt ) qt = qw->m_queryPhraseTerm;
if ( ! qt ) continue;

@ -265,7 +265,7 @@ class QueryTerm {
void constructor ( ) ;
// the query word we were derived from
QueryWord *m_qword;
const QueryWord *m_qword;
// . are we a phrase termid or single word termid from that QueryWord?
// . the QueryWord instance represents both, so we must choose

@ -514,7 +514,7 @@ bool RdbCache::getRecord ( collnum_t collnum ,
*rec = p;
// copy the data and set "list" with it iff "doCopy" is true
if ( doCopy && *recSize > 0 ) {
*rec = mdup ( p , *recSize , "RdbCache3" );
*rec = (char*)mdup ( p , *recSize , "RdbCache3" );
if ( ! *rec ) {
log(LOG_WARN, "db: Could not allocate space for cached record for %s of %" PRId32" bytes.",
m_dbname,*recSize);

@ -298,8 +298,6 @@ bool SiteGetter::getSiteList ( ) {
// get the list. returns false if blocked.
if (!m_msg0.getList( -1, // hostId
0, // ip
0, // port
0, // maxCacheAge
false, // addToCache
RDB_POSDB,
@ -314,7 +312,6 @@ bool SiteGetter::getSiteList ( ) {
// default parms follow
true, // doErrorCorrection?
true, // includeTree?
true, // doMerge?
-1, // firstHostId
0, // startFileNum
-1, // numFiles

@ -1466,8 +1466,6 @@ bool Msg8a::launchGetRequests ( ) {
// . launch this request, even if to ourselves
// . TODO: just use msg0!!
bool status = m->getList ( firstHostId , // hostId
0 , // ip
0 , // port
0 , // maxCacheAge
false , // addToCache
RDB_TAGDB ,
@ -1481,7 +1479,6 @@ bool Msg8a::launchGetRequests ( ) {
m_niceness ,
true , // error correction?
true , // include tree?
true , // doMerge?
firstHostId , // firstHostId
0 , // startFileNum
-1 , // numFiles

@ -1524,13 +1524,15 @@ bool UdpSlot::makeReadBuf ( int32_t msgSize , int32_t numDgrams ) {
// if msgSize is -1 then it is under 1 dgram, but assume the worst
if ( msgSize == -1 ) msgSize = m_maxDgramSize;
// . create a msg buf to hold msg, zero out everything...
// . label it "umsg" so we can grep the *.cpp files for it
m_readBuf = (char *) mmalloc ( msgSize, umsg_label[(uint8_t)m_msgType] );
if ( ! m_readBuf ) {
m_readBufSize = 0;
log(LOG_WARN, "udp: Failed to allocate %" PRId32" bytes to read request or reply on udp socket.", msgSize);
return false;
if(msgSize!=0) {
// . create a msg buf to hold msg, zero out everything...
// . label it "umsg" so we can grep the *.cpp files for it
m_readBuf = (char *) mmalloc ( msgSize, umsg_label[(uint8_t)m_msgType] );
if ( ! m_readBuf ) {
m_readBufSize = 0;
log(LOG_WARN, "udp: Failed to allocate %" PRId32" bytes to read request or reply on udp socket.", msgSize);
return false;
}
}
m_readBufMaxSize = msgSize;
// let the caller know we're good

@ -4855,8 +4855,6 @@ RdbList *XmlDoc::getDupList ( ) {
m_dupListValid = true;
// this is a no-split lookup by default now
if ( ! m_msg0.getList ( -1 , // hostId
0 , // ip
0 , // port
0 , // maxCacheAge
false , // add to cache?
RDB_POSDB, // INDEXDB ,
@ -4870,7 +4868,6 @@ RdbList *XmlDoc::getDupList ( ) {
m_niceness ,
true , // error correction?
true , // include tree?
true , // domerge?
-1 , // firsthosti
0 , // startfilenum
-1, // # files

@ -26,7 +26,7 @@ CPPFLAGS += -std=c++11
# exported in parent make
CPPFLAGS += $(CONFIG_CPPFLAGS)
LIBS += $(BASE_DIR)/libgb.a -lz -lpthread -lssl -lcrypto
LIBS += $(BASE_DIR)/libgb.a -lz -lpthread -lssl -lcrypto -lpcre
LIBS += -L$(BASE_DIR) -lcld2_full
%: libgb.a $(BASE_DIR)/libcld2_full.so %.cpp

@ -58,13 +58,12 @@ int main(int argc, char **argv) {
g_conf.init(NULL);
BigFile bigFile;
bigFile.set(dir, filename);
strcpy(g_hostdb.m_dir, dir);
RdbBuckets buckets;
if (starts_with(filename, "posdb")) {
buckets.set(Posdb::getFixedDataSize(), g_conf.m_posdbMaxTreeMem, "buckets-posdb", RDB_POSDB, "posdb", Posdb::getKeySize());
if (!buckets.fastLoad(&bigFile, "posdb")) {
if (!buckets.loadBuckets("posdb")) {
fprintf(stdout, "Unable to load bucket\n");
return 1;
}