Merge branch 'ia' into ia-zak

This commit is contained in:
Matt
2015-07-22 12:02:19 -06:00
29 changed files with 414 additions and 107 deletions

@ -3579,7 +3579,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
// lower from 7 to 1 since we have so many collections now
// ok, now we have much less colls so raise back to 7
int32_t diffbotipms = 7;// 1; // 7
int32_t diffbotipms = 7;//1; // 7
// make the gigablast regex table just "default" so it does no
// filtering, but accepts all urls. we will add code to pass the urls
@ -3599,8 +3599,8 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_spiderIpWaits [i] = wait;
m_spiderIpMaxSpiders[i] = diffbotipms; // keep it respectful
// ethan wants some speed
if ( isEthan )
m_spiderIpMaxSpiders[i] = 30;
// if ( isEthan )
// m_spiderIpMaxSpiders[i] = 30;
//m_spidersEnabled [i] = 1;
m_spiderFreqs [i] = respiderFreq;
//m_spiderDiffbotApiUrl[i].purge();
@ -3623,6 +3623,18 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_forceDelete [i] = 1;
i++;
// de-prioritize fakefirstip urls so we don't give the impression our
// spiders are slow. like if someone adds a bulk job with 100,000 urls
// then we sit there and process to lookup their ips and add a real
// spider request (if it falls onto the same shard) before we actually
// do any real spidering. so keep the priority here low.
m_regExs[i].set("isfakeip");
m_maxSpidersPerRule [i] = 7;
m_spiderIpMaxSpiders [i] = 7;
m_spiderPriorities [i] = 20;
m_spiderIpWaits [i] = 0;
i++;
// hopcount filter if asked for
if( m_diffbotMaxHops >= 0 ) {

@ -18,6 +18,8 @@ void HashTableX::constructor() {
m_useKeyMagic = false;
m_ks = 0;
m_allowGrowth = true;
m_numSlots = 0;
m_numSlotsUsed = 0;
}
void HashTableX::destructor() {

@ -1514,6 +1514,10 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
// is recycled/destroyed
// . this will call getMsgPiece() to fill up sendBuf from file
int32_t totalToSend = mimeLen + bytesToSend;
//s->m_state = NULL; // do we need this? yes, cuz s is NULL for cleanUp
if ( s && s->m_state == f ) s->m_state = NULL;
//if ( ! m_tcp.sendMsg ( s ,
if ( ! tcp->sendMsg ( s ,
sendBuf ,
@ -1542,7 +1546,6 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
if ( ! f->isOpen() ) f->open( O_RDONLY );
int fd = f->getfd();
cleanUp ( f , NULL/*TcpSocket */ );
s->m_state = NULL; // do we need this? yes, cuz s is NULL for cleanUp
// . AND we need to do this ourselves here
// . do it SILENTLY so no message is logged if the fd is not registered
if (tcp->m_useSSL)

@ -30,6 +30,18 @@ Matches::Matches ( ) {
}
Matches::~Matches( ) { reset(); }
// . full reset: clears the per-document match state via reset2() and
//   then releases the query word flags buffer if we allocated it
// . m_qwordFlags may point at our own m_tmpBuf, in which case there is
//   nothing to free and the pointer is left untouched
void Matches::reset ( ) {
	// wipe the per-document state first
	reset2();
	// only a separately allocated flags buffer needs freeing
	bool allocated = ( m_qwordFlags != NULL &&
			   m_qwordFlags != (mf_t *)m_tmpBuf );
	if ( ! allocated ) return;
	mfree ( m_qwordFlags , m_qwordAllocSize , "mmqw" );
	m_qwordFlags = NULL;
}
void Matches::reset2() {
m_numMatches = 0;
//m_maxNQT = -1;
m_numAlnums = 0;
@ -41,14 +53,6 @@ void Matches::reset ( ) {
m_bitsArray [i].reset();
}
m_numMatchGroups = 0;
if ( m_qwordFlags && m_qwordFlags != (mf_t *)m_tmpBuf ) {
mfree ( m_qwordFlags , m_qwordAllocSize , "mmqw" );
m_qwordFlags = NULL;
}
//m_explicitsMatched = 0;
//m_matchableRequiredBits = 0;
//m_hasAllQueryTerms = false;
//m_matchesQuery = false;
}
bool Matches::isMatchableTerm ( QueryTerm *qt ) { // , int32_t i ) {
@ -298,7 +302,7 @@ bool Matches::set ( XmlDoc *xd ,
int32_t niceness ) {
// don't reset query info!
reset();
reset2();
// sanity check
if ( ! xd->m_docIdValid ) { char *xx=NULL;*xx=0; }

@ -142,6 +142,7 @@ class Matches {
Matches ( ) ;
~Matches( ) ;
void reset ( ) ;
void reset2 ( ) ;
// BIG HACK support
//int32_t getTermsFound ( bool *hadPhrases , bool *hadWords );

14
Mem.h

@ -280,6 +280,20 @@ inline int32_t getNumBitsOn64 ( uint64_t bits ) {
g_a [ *((unsigned char *)(&bits) + 7) ] ;
}
// . return the number of bits set to 1 in the "slen" bytes starting at "s"
// . returns 0 if slen is 0 or negative
// . the buffer is deliberately read one byte at a time: "s" can have any
//   alignment, so casting it to uint16_t* or uint32_t* would be an
//   unaligned read through an incompatible pointer type
inline int32_t getNumBitsOnX ( unsigned char *s , int32_t slen ) {
	int32_t total = 0;
	// sum the per-byte bit counts
	for ( int32_t i = 0 ; i < slen ; i++ )
		total += getNumBitsOn8 ( s[i] );
	return total;
}
// assume only one bit is set for this (used by Address.cpp)
inline int32_t getBitPosLL ( uint8_t *bit ) {
// which int32_t is it in?

@ -34,6 +34,10 @@ Msg39::Msg39 () {
reset();
}
// . destructor: all cleanup is delegated to reset()
// . reset() intentionally cores if m_inUse is set, so a Msg39 must not
//   be destroyed while it is still in use
Msg39::~Msg39 () {
reset();
}
void Msg39::reset() {
if ( m_inUse ) { char *xx=NULL;*xx=0; }
m_allocedTree = false;
@ -46,8 +50,14 @@ void Msg39::reset() {
void Msg39::reset2() {
// reset lists
for ( int32_t j = 0 ; j < m_msg2.m_numLists && m_lists ; j++ )
m_lists[j].freeList();
int32_t nqt = m_stackBuf.getLength() / sizeof(RdbList);
//for ( int32_t j = 0 ; j < m_msg2.m_numLists && m_lists ; j++ ) {
for ( int32_t j = 0 ; j < nqt && m_lists ; j++ ) {
//m_lists[j].freeList();
//log("msg39: destroy list @ 0x%"PTRFMT,(PTRTYPE)&m_lists[j]);
// same thing but more generic
m_lists[j].destructor();
}
m_stackBuf.purge();
m_lists = NULL;
m_msg2.reset();
@ -207,7 +217,8 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
if ( ! m_tmpq.set2 ( m_r->ptr_query ,
m_r->m_language ,
m_r->m_queryExpansion ,
m_r->m_useQueryStopWords ) ) {
m_r->m_useQueryStopWords ,
m_r->m_maxQueryTerms ) ) {
log("query: msg39: setQuery: %s." ,
mstrerror(g_errno) );
sendReply ( m_slot , this , NULL , 0 , 0 , true );
@ -225,11 +236,14 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
if ( m_tmpq.getNumTerms() != m_r->m_nqt ) {
g_errno = EBADENGINEER;
log("query: Query parsing inconsistency for q=%s. "
"%i != %i. "
"langid=%"INT32". Check langids and m_queryExpansion parms "
"which are the only parms that could be different in "
"Query::set2(). You probably have different mysynoyms.txt "
"files on two different hosts! check that!!"
,m_tmpq.m_orig
,(int)m_tmpq.getNumTerms()
,(int)m_r->m_nqt
,(int32_t)m_r->m_language
);
sendReply ( m_slot , this , NULL , 0 , 0 , true );
@ -767,11 +781,15 @@ bool Msg39::getLists () {
int32_t nqt = m_tmpq.getNumTerms();
if ( ! m_stackBuf.reserve ( sizeof(RdbList) * nqt ) ) return true;
int32_t need = sizeof(RdbList) * nqt ;
m_stackBuf.setLabel("stkbuf2");
if ( ! m_stackBuf.reserve ( need ) ) return true;
m_lists = (IndexList *)m_stackBuf.getBufStart();
for ( int32_t i = 0 ; i < nqt ; i++ )
m_stackBuf.setLength ( need );
for ( int32_t i = 0 ; i < nqt ; i++ ) {
m_lists[i].constructor();
//log("msg39: constructlist @ 0x%"PTRFMT,(PTRTYPE)&m_lists[i]);
}
// call msg2
if ( ! m_msg2.getLists ( rdbId ,

@ -216,6 +216,7 @@ class Msg39 {
public:
Msg39();
~Msg39();
void reset();
void reset2();
// register our request handler for Msg39's

@ -666,7 +666,7 @@ bool Msg40::federatedLoop ( ) {
mr.size_whiteList = slen;
mr.m_timeout = -1; // auto-determine based on #terms
// make sure query term counts match in msg39
mr.m_maxQueryTerms = m_si->m_maxQueryTerms;
//mr.m_maxQueryTerms = m_si->m_maxQueryTerms;
mr.m_realMaxTop = m_si->m_realMaxTop;
mr.m_minSerpDocId = m_si->m_minSerpDocId;
@ -699,6 +699,9 @@ bool Msg40::federatedLoop ( ) {
//if ( numDocIdSplits < 5 ) numDocIdSplits = 5;
//}
if ( cr ) mr.m_maxQueryTerms = cr->m_maxQueryTerms;
else mr.m_maxQueryTerms = 100;
// special oom hack fix
if ( cr && cr->m_isCustomCrawl && numDocIdSplits < 4 )
numDocIdSplits = 4;

@ -2529,6 +2529,22 @@ bool printSearchResultsHeader ( State0 *st ) {
, getLanguageString(si->m_queryLangId) );
// print query words we ignored, like stop words
printIgnoredWords ( sb , si );
sb->safePrintf("\t\t<queryNumTermsTotal>"
"%"INT32
"</queryNumTermsTotal>\n"
, q->m_numTermsUntruncated );
sb->safePrintf("\t\t<queryNumTermsUsed>"
"%"INT32
"</queryNumTermsUsed>\n"
, q->m_numTerms );
int32_t tval = 0;
if ( q->m_numTerms < q->m_numTermsUntruncated ) tval = 1;
sb->safePrintf("\t\t<queryWasTruncated>"
"%"INT32
"</queryWasTruncated>\n"
, tval );
for ( int i = 0 ; i < q->m_numTerms ; i++ ) {
sb->safePrintf("\t\t<term>\n");
QueryTerm *qt = &q->m_qterms[i];
@ -2605,6 +2621,19 @@ bool printSearchResultsHeader ( State0 *st ) {
sb->safePrintf("\",\n");
// print query words we ignored, like stop words
printIgnoredWords ( sb , si );
sb->safePrintf("\t\"queryNumTermsTotal\":"
"%"INT32",\n"
, q->m_numTermsUntruncated );
sb->safePrintf("\t\"queryNumTermsUsed\":"
"%"INT32",\n"
, q->m_numTerms );
int32_t tval = 0;
if ( q->m_numTerms < q->m_numTermsUntruncated ) tval = 1;
sb->safePrintf("\t\"queryWasTruncated\":"
"%"INT32",\n"
, tval );
sb->safePrintf("\t\"terms\":[\n");
for ( int i = 0 ; i < q->m_numTerms ; i++ ) {
sb->safePrintf("\t\t{\n");
@ -8263,8 +8292,11 @@ bool printCSVHeaderRow2 ( SafeBuf *sb ,
hdr = "Hop Count";
if ( ! strcmp(hdr,"gbssIp") )
hdr = "IP";
if ( ! strcmp(hdr,"gbssDiffbotUri" ) )
hdr = "Diffbot URI";
// the csv report lists regular urls, not diffbot object urls, and
// a regular url does not have just a single diffboturi;
// it could have 0 or multiple diffboturis
//if ( ! strcmp(hdr,"gbssDiffbotUri" ) )
// hdr = "Diffbot URI";
if ( ! strcmp(hdr,"gbssSentToDiffbotThisTime") )
hdr = "Process Attempted";
if ( ! strcmp(hdr,"gbssDiffbotReplyMsg") )

@ -3857,6 +3857,25 @@ bool printApiForPage ( SafeBuf *sb , int32_t PAGENUM , CollectionRec *cr ) {
"</b>");
sb->brify2 (
"\t\t# List of space separated words in the "
"query that were ignored for the most part. "
"Because they were common words for the "
"query language they are in.\n"
, cols , "\n\t\t# " , false );
sb->safePrintf("<b>\t\t\"ignoredWords\":\"to the\",\n\n"
"</b>");
sb->brify2 (
"\t\t# There is a maximum limit placed on the "
"number of query terms we search on to keep things "
"fast. This can "
"be changed in the search controls.\n"
, cols , "\n\t\t# " , false );
sb->safePrintf("<b>\t\t\"queryNumTermsTotal\":52,\n</b>");
sb->safePrintf("<b>\t\t\"queryNumTermsUsed\":20,\n</b>");
sb->safePrintf("<b>\t\t\"queryWasTruncated\":1,\n\n</b>");
sb->brify2 (
"\t\t# The start of the terms array. Each query "
"is broken down into a list of terms. Each "

@ -7879,17 +7879,19 @@ void Parms::init ( ) {
m->m_obj = OBJ_COLL;
m++;
//m->m_title = "max query terms";
//m->m_desc = "Do not allow more than this many query terms. Will "
// "return error in XML feed error tag if breeched.";
//m->m_cgi = "mqt";
//m->m_off = (char *)&cr.m_maxQueryTerms - x;
m->m_title = "max query terms";
m->m_desc = "Do not allow more than this many query terms. Helps "
"prevent big queries from resource hogging.";
m->m_cgi = "mqt";
m->m_off = (char *)&cr.m_maxQueryTerms - x;
//m->m_soff = (char *)&si.m_maxQueryTerms - y;
//m->m_type = TYPE_LONG;
//m->m_def = "20"; // 20 for testing, normally 16
//m->m_sparm = 1;
//m->m_spriv = 1;
//m++;
m->m_type = TYPE_LONG;
m->m_def = "999999"; // now we got synonyms... etc
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
/*
m->m_title = "dictionary site";
@ -15283,7 +15285,7 @@ void Parms::init ( ) {
m->m_type = TYPE_CHARPTR;
m->m_page = PAGE_REINDEX;
m->m_obj = OBJ_GBREQUEST;
m->m_def = "xx";
m->m_def = "en";
m->m_flags = PF_API ;
m++;

@ -759,7 +759,6 @@ void PosdbTable::init ( Query *q ,
// set this now
//m_collnum = cr->m_collnum;
// save it
m_topTree = topTree;
// a ptr for debugging i guess
@ -773,6 +772,9 @@ void PosdbTable::init ( Query *q ,
m_realMaxTop = r->m_realMaxTop;
if ( m_realMaxTop > MAX_TOP ) m_realMaxTop = MAX_TOP;
m_siteRankMultiplier = SITERANKMULTIPLIER;
if ( m_q->m_isBoolean ) m_siteRankMultiplier = 0.0;
// seo.cpp supplies a NULL msg2 because it already sets
// QueryTerm::m_posdbListPtrs
if ( ! msg2 ) return;
@ -6304,12 +6306,7 @@ void PosdbTable::intersectLists10_r ( ) {
}
if ( m_q->m_isBoolean ) {
minScore = 1.0;
// since we are jumping, we need to set m_docId here
//m_docId = *(uint32_t *)(docIdPtr+1);
//m_docId <<= 8;
//m_docId |= (unsigned char)docIdPtr[0];
//m_docId >>= 2;
//minScore = 1.0;
// we can't jump over setting of miniMergeList. do that.
goto boolJump1;
}
@ -6521,6 +6518,30 @@ void PosdbTable::intersectLists10_r ( ) {
boolJump1:
if ( m_q->m_isBoolean ) {
//minScore = 1.0;
// this is somewhat wasteful since it is set below again
m_docId = *(uint32_t *)(docIdPtr+1);
m_docId <<= 8;
m_docId |= (unsigned char)docIdPtr[0];
m_docId >>= 2;
// add one point for each term matched in the bool query
// this is really just for when the terms are from different
// fields. if we have unfielded boolean terms we should
// do proximity matching.
int32_t slot = m_bt.getSlot ( &m_docId );
if ( slot >= 0 ) {
uint8_t *bv = (uint8_t *)m_bt.getValueFromSlot(slot);
// then a score based on the # of terms that matched
int16_t bitsOn = getNumBitsOnX ( bv , m_vecSize );
// but store in hashtable now
minScore = (float)bitsOn;
}
else {
minScore = 1.0;
}
}
// we need to do this for seo hacks to merge the synonyms together
// into one list
seoHackSkip2:
@ -7226,7 +7247,7 @@ void PosdbTable::intersectLists10_r ( ) {
boolJump2:
// try dividing it by 3! (or multiply by .33333 faster)
score = minScore * (((float)siteRank)*SITERANKMULTIPLIER+1.0);
score = minScore * (((float)siteRank)*m_siteRankMultiplier+1.0);
// . not foreign language? give a huge boost
// . use "qlang" parm to set the language. i.e. "&qlang=fr"
@ -7896,7 +7917,7 @@ float PosdbTable::getMaxPossibleScore ( QueryTermInfo *qti ,
score *= WIKI_BIGRAM_WEIGHT;
}
//score *= perfectWordSpamWeight * perfectWordSpamWeight;
score *= (((float)siteRank)*SITERANKMULTIPLIER+1.0);
score *= (((float)siteRank)*m_siteRankMultiplier+1.0);
// language boost if same language (or no lang specified)
if ( m_r->m_language == docLang ||
@ -8187,13 +8208,15 @@ bool PosdbTable::makeDocIdVoteBufForBoolQuery_r ( ) {
// a 6 byte key means you pass
gbmemcpy ( dst , &docId , 6 );
// test it
int64_t d2;
d2 = *(uint32_t *)(dst+1);
d2 <<= 8;
d2 |= (unsigned char)dst[0];
d2 >>= 2;
docId >>= 2;
if ( d2 != docId ) { char *xx=NULL;*xx=0; }
if ( m_debug ) {
int64_t d2;
d2 = *(uint32_t *)(dst+1);
d2 <<= 8;
d2 |= (unsigned char)dst[0];
d2 >>= 2;
docId >>= 2;
if ( d2 != docId ) { char *xx=NULL;*xx=0; }
}
// end test
dst += 6;
}

@ -604,6 +604,8 @@ class PosdbTable {
float m_finalScore;
float m_preFinalScore;
float m_siteRankMultiplier;
// how long to add the last batch of lists
int64_t m_addListsTime;
int64_t m_t1 ;

@ -74,6 +74,9 @@ void Query::reset ( ) {
qw->destructor();
}
m_stackBuf.purge();
m_qterms = NULL;
m_sb.purge();
m_osb.purge();
m_docIdRestriction = 0LL;
@ -140,14 +143,16 @@ bool Query::set2 ( char *query ,
// need language for doing synonyms
uint8_t langId ,
char queryExpansion ,
bool useQueryStopWords ) {
//int32_t maxQueryTerms ) {
bool useQueryStopWords ,
int32_t maxQueryTerms ) {
m_langId = langId;
m_useQueryStopWords = useQueryStopWords;
// fix summary rerank and highlighting.
bool keepAllSingles = true;
m_maxQueryTerms = maxQueryTerms;
// assume boolean auto-detect.
char boolFlag = 2;
@ -159,7 +164,7 @@ bool Query::set2 ( char *query ,
if ( ! query ) return true;
// set to 256 for synonyms?
m_maxQueryTerms = 256;
//m_maxQueryTerms = 256;
m_queryExpansion = queryExpansion;
int32_t queryLen = gbstrlen(query);
@ -601,7 +606,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
int32_t max = (int32_t)MAX_EXPLICIT_BITS;
if ( max > m_maxQueryTerms ) max = m_maxQueryTerms;
// count them first for allocating
// count phrases first for allocating
int32_t nqt = 0;
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
@ -653,6 +658,10 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
continue;
// skip if ignored like a stopword (stop to->too)
//if ( qw->m_ignoreWord ) continue;
// ignore title: etc. words, they are field names
if ( qw->m_ignoreWord == IGNORE_FIELDNAME ) continue;
// ignore boolean operators
if ( qw->m_ignoreWord ) continue;// IGNORE_BOOLOP
// no, hurts 'Greencastle IN economic development'
if ( qw->m_wordId == to ) continue;
// single letters...
@ -673,7 +682,9 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
nqt += naids;
}
m_numTermsUntruncated = nqt;
if ( nqt > m_maxQueryTerms ) nqt = m_maxQueryTerms;
// allocate the stack buf
if ( nqt ) {
@ -719,6 +730,11 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
"limit of %"INT32"",(int32_t)ABS_MAX_QUERY_TERMS );
break;
}
if ( n >= m_maxQueryTerms ) {
log("query: lost query phrase terms to max term cr "
"limit of %"INT32"",(int32_t)m_maxQueryTerms);
break;
}
QueryTerm *qt = &m_qterms[n];
qt->m_qword = qw ;
@ -877,6 +893,11 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
"limit of %"INT32"",(int32_t)ABS_MAX_QUERY_TERMS );
break;
}
if ( n >= m_maxQueryTerms ) {
log("query: lost query terms to max term cr "
"limit of %"INT32"",(int32_t)m_maxQueryTerms);
break;
}
QueryTerm *qt = &m_qterms[n];
qt->m_qword = qw ;
@ -1389,6 +1410,10 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
continue;
// skip if ignored like a stopword (stop to->too)
//if ( qw->m_ignoreWord ) continue;
// ignore title: etc. words, they are field names
if ( qw->m_ignoreWord == IGNORE_FIELDNAME ) continue;
// ignore boolean operators
if ( qw->m_ignoreWord ) continue;// IGNORE_BOOLOP
// no, hurts 'Greencastle IN economic development'
if ( qw->m_wordId == to ) continue;
// single letters...
@ -1424,6 +1449,14 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
}
// this happens for 'da da da'
if ( ! origTerm ) continue;
if ( n >= m_maxQueryTerms ) {
log("query: lost synonyms due to max cr term "
"limit of %"INT32"",
(int32_t)m_maxQueryTerms);
break;
}
// add that query term
QueryTerm *qt = &m_qterms[n];
qt->m_qword = qw; // NULL;
@ -2483,12 +2516,14 @@ bool Query::setQWords ( char boolFlag ,
// in quotes which is silly, so undo it. But we should
// still inherit any quoteSign, however. Be sure to also
// set m_inQuotes to false so Matches.cpp::matchWord() works.
if ( i == quoteStart ) { // + 1 ) {
if ( i + 1 >= numWords || words.getNumQuotes(i+1)>0 ) {
qw->m_quoteStart = -1;
qw->m_inQuotes = false;
}
}
// MDW: don't undo it because we do not want to get synonyms
// of terms in quotes. 7/15/2015
// if ( i == quoteStart ) { // + 1 ) {
// if ( i + 1 >= numWords || words.getNumQuotes(i+1)>0 ) {
// qw->m_quoteStart = -1;
// qw->m_inQuotes = false;
// }
// }
// . get prefix hash of collection name and field
// . but first convert field to lower case
uint64_t ph;

10
Query.h

@ -635,10 +635,10 @@ class Query {
//int32_t collLen ,
uint8_t langId ,
char queryExpansion ,
bool useQueryStopWords = true );
//char boolFlag = 2 , // auto-detect if boolean query
//bool keepAllSingles = false ,
//int32_t maxQueryTerms = 0x7fffffff );
bool useQueryStopWords = true ,
//char boolFlag = 2 , // auto-detect if boolean query
//bool keepAllSingles = false ,
int32_t maxQueryTerms = 0x7fffffff );
// serialize/deserialize ourselves so we don't have to pass the
// unmodified string around and reparse it every time
@ -941,6 +941,8 @@ class Query {
int32_t m_numTerms;
int32_t m_numTermsSpecial;
int32_t m_numTermsUntruncated;
// separate vectors for easier interfacing, 1-1 with m_qterms
//int64_t m_termFreqs [ MAX_QUERY_TERMS ];
//int64_t m_termIds [ MAX_QUERY_TERMS ];

@ -373,12 +373,12 @@ bool RdbDump::dumpTree ( bool recall ) {
//if ( removeNegRecs )
// m_list.removeNegRecs();
// if(!m_list->checkList_r ( false , // removeNegRecs?
// false , // sleep on problem?
// m_rdb->m_rdbId )) {
// log("db: list to dump is not sane!");
// char *xx=NULL;*xx=0;
// }
// if(!m_list->checkList_r ( false , // removeNegRecs?
// false , // sleep on problem?
// m_rdb->m_rdbId )) {
// log("db: list to dump is not sane!");
// char *xx=NULL;*xx=0;
// }
skip:
@ -781,6 +781,10 @@ bool RdbDump::doneReadingForVerify ( ) {
if ( m_addToMap ) t = gettimeofdayInMilliseconds();
// sanity check
if ( m_list->m_ks != m_ks ) { char *xx = NULL; *xx = 0; }
bool triedToFix = false;
tryAgain:
// . register this with the map now
// . only register AFTER it's ALL on disk so we don't get partial
// record reads and we don't read stuff on disk that's also in tree
@ -788,6 +792,16 @@ bool RdbDump::doneReadingForVerify ( ) {
// . we don't have maps when we do unordered dumps
// . careful, map is NULL if we're doing unordered dump
if ( m_addToMap && m_map && ! m_map->addList ( m_list ) ) {
// keys out of order in list from tree?
if ( g_errno == ECORRUPTDATA ) {
log("db: trying to fix tree or buckets");
if ( m_tree ) m_tree->fixTree();
//if ( m_buckets ) m_buckets->fixBuckets();
if ( m_buckets ) { char *xx=NULL;*xx=0; }
if ( triedToFix ) { char *xx=NULL;*xx=0; }
triedToFix = true;
goto tryAgain;
}
g_errno = ENOMEM;
log("db: Failed to add data to map.");
// undo the offset update, the write failed, the parent

@ -624,7 +624,8 @@ bool RdbList::growList ( int32_t newSize ) {
// don't shrink list
if ( newSize <= m_allocSize ) return true;
// debug msg
//log("RdbList::growList from %"INT32" to %"INT32"",m_allocSize , newSize );
// log("RdbList::growList 0x%"PTRFMT "from %"INT32" to %"INT32"",
// (PTRTYPE)this,m_allocSize , newSize );
// make a new buffer
char *tmp =(char *) mrealloc ( m_alloc,m_allocSize,newSize,"RdbList");
//if ( (int32_t)tmp == 0x904dbd0 )

@ -323,8 +323,9 @@ bool RdbMap::verifyMap2 ( ) {
KEYSET(lastKey,k,m_ks); continue; }
// just bitch for now
log(
"db: Key out of order in map file %s%s. "
"page = %"INT32". key offset = %"INT64". Map or data file is "
"db: Key out of order in map file %s/%s. "
"page = %"INT32". key offset = %"INT64". "
"Map or data file is "
"corrupt, but it is probably the data file. Please "
"delete the map file and restart.",
m_file.m_dir,m_file.getFilename() ,
@ -337,6 +338,15 @@ bool RdbMap::verifyMap2 ( ) {
KEY1(lastKey,m_ks),KEY0(lastKey));
log("db: k.n1=%016"XINT64" n0=%016"XINT64"",KEY1(k,m_ks),KEY0(k));
log("db: m_numPages = %"INT32"",m_numPages);
SafeBuf cmd;
cmd.safePrintf("mv %s/%s %s/trash/",
m_file.m_dir,
m_file.getFilename(),
g_hostdb.m_dir);
log("db: %s",cmd.getBufStart() );
gbsystem ( cmd.getBufStart() );
exit(0);
//char *xx=NULL;*xx=0;
// was k too small?
@ -543,7 +553,8 @@ bool RdbMap::addRecord ( char *key, char *rec , int32_t recSize ) {
m_lastLogTime = getTime();
//pageNum > 0 && getKey(pageNum-1) > getKey(pageNum) ) {
log(LOG_LOGIC,"build: RdbMap: added key out of order. "
"count=%"INT64".",m_badKeys);
"count=%"INT64" file=%s/%s.",m_badKeys,
m_file.m_dir,m_file.getFilename());
//log(LOG_LOGIC,"build: k.n1=%"XINT32" %"XINT64" lastKey.n1=%"XINT32" %"XINT64"",
// key.n1,key.n0,m_lastKey.n1,m_lastKey.n0 );
log(LOG_LOGIC,"build: offset=%"INT64"",
@ -556,7 +567,10 @@ bool RdbMap::addRecord ( char *key, char *rec , int32_t recSize ) {
g_errno = ECORRUPTDATA;
return false;
}
char *xx=NULL;*xx=0;
// if being called from RdbDump.cpp...
g_errno = ECORRUPTDATA;
return false;
//char *xx=NULL;*xx=0;
// . during a merge, corruption can happen, so let's core
// here until we figure out how to fix it.
// . any why wasn't the corruption discovered and patched
@ -719,7 +733,10 @@ bool RdbMap::addList ( RdbList *list ) {
if ( ! addRecord ( key , rec , recSize ) ) {
log("db: Failed to add record to map: %s.",
mstrerror(g_errno));
char *xx = NULL; *xx = 0;
// allow caller to try to fix the tree in the case of dumping
// a tree to a file on disk
return false;
//char *xx = NULL; *xx = 0;
}
if ( list->skipCurrentRecord() ) goto top2;

@ -1283,19 +1283,26 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
if ( m_right[i] >= 0 && m_parents[m_right[i]] != i )
return log(
"db: Tree right kid and parent disagree.");
/*
// MDW: why did i comment out the order checking?
// check order
if ( m_left[i] >= 0 ) {
if ( m_left[i] >= 0 &&
m_collnums[i] == m_collnums[m_left[i]] ) {
char *key = &m_keys[i*m_ks];
char *left = &m_keys[m_left[i]*m_ks];
if ( KEYCMP(key,left,m_ks)<0) {char *xx=NULL;*xx=0;}
if ( KEYCMP(key,left,m_ks)<0)
return log("db: Tree left kid > parent %i",i);
}
if ( m_right[i] >= 0 ) {
if ( m_right[i] >= 0 &&
m_collnums[i] == m_collnums[m_right[i]] ) {
char *key = &m_keys[i*m_ks];
char *right = &m_keys[m_right[i]*m_ks];
if ( KEYCMP(key,right,m_ks)>0) {char *xx=NULL;*xx=0;}
if ( KEYCMP(key,right,m_ks)>0)
return log("db: Tree right kid < parent %i "
"%s < %s",i,
KEYSTR(right,m_ks),
KEYSTR(key,m_ks) );
}
*/
//g_loop.quickPoll(1, __PRETTY_FUNCTION__, __LINE__);
}
if ( hkp > 0 )

@ -470,14 +470,16 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
log("query: qlang of \"%s\" is NOT SUPPORTED. using "
"langUnknown, \"xx\".",langAbbr);
int32_t maxQueryTerms = cr->m_maxQueryTerms;
// . the query to use for highlighting... can be overriden with "hq"
// . we need the language id for doing synonyms
if ( m_prepend && m_prepend[0] )
m_hqq.set2 ( m_prepend , m_queryLangId , true );
m_hqq.set2 ( m_prepend , m_queryLangId , true ,maxQueryTerms);
else if ( m_highlightQuery && m_highlightQuery[0] )
m_hqq.set2 ( m_highlightQuery , m_queryLangId , true );
m_hqq.set2 (m_highlightQuery,m_queryLangId,true,maxQueryTerms);
else if ( m_query && m_query[0] )
m_hqq.set2 ( m_query , m_queryLangId , true );
m_hqq.set2 ( m_query , m_queryLangId , true,maxQueryTerms);
// log it here
log(LOG_INFO,
@ -489,7 +491,9 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
// . returns false and sets g_errno on error (ETOOMANYOPERANDS)
if ( ! m_q.set2 ( m_sbuf1.getBufStart(),
m_queryLangId ,
m_queryExpansion ) ) {
m_queryExpansion ,
true , // use QUERY stopwords?
maxQueryTerms ) ) {
g_msg = " (error: query has too many operands)";
return false;
}

@ -11641,6 +11641,18 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
goto checkNextRule;
}
if ( strncmp ( p , "isfakeip",8 ) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if no match continue
if ( (bool)sreq->m_fakeFirstIp == val ) continue;
p += 8;
p = strstr(p, "&&");
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( strncmp ( p , "isonsamedomain",14 ) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
@ -13993,6 +14005,17 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
return msg->safePrintf("Job is initializing.");
}
// if we had seeds and none were successfully crawled, do not just
// print that the crawl completed.
if ( cx->m_collectiveRespiderFrequency <= 0.0 &&
cx->m_isCustomCrawl &&
! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider &&
cx->m_globalCrawlInfo.m_pageDownloadAttempts > 0 &&
cx->m_globalCrawlInfo.m_pageDownloadSuccesses == 0 ) {
*status = SP_SEEDSERROR;
return msg->safePrintf("Failed to crawl any seed.");
}
// if we sent an email simply because no urls
// were left and we are not recrawling!
if ( cx->m_collectiveRespiderFrequency <= 0.0 &&

@ -39,6 +39,7 @@
#define SP_INPROGRESS 7 // it is going on!
#define SP_ADMIN_PAUSED 8 // g_conf.m_spideringEnabled = false
#define SP_COMPLETED 9 // crawl is done, and no repeatCrawl is scheduled
#define SP_SEEDSERROR 10 // all seeds had an error preventing crawling
bool tryToDeleteSpiderColl ( SpiderColl *sc , char *msg ) ;
void spiderRoundIncremented ( class CollectionRec *cr ) ;

@ -13,6 +13,7 @@ Summary::Summary()
m_bitScoresBuf = NULL;
m_bitScoresBufSize = 0;
m_wordWeights = NULL;
m_buf4 = NULL;
reset();
}
@ -42,9 +43,10 @@ void Summary::reset() {
m_wordWeights = NULL;
}
m_wordWeights = NULL;
if ( m_buf && m_buf != m_tmpBuf2 )
mfree ( m_buf , m_bufSize , "ssstkb" );
m_buf = NULL;
if ( m_buf4 && m_buf4 != m_tmpBuf4 ) {
mfree ( m_buf4 , m_buf4Size , "ssstkb" );
m_buf4 = NULL;
}
}
@ -248,13 +250,13 @@ bool Summary::set2 ( Xml *xml ,
m_numExcerpts = 0;
int32_t need2 = (1+1+1) * m_q->m_numWords;
m_bufSize = need2;
m_buf4Size = need2;
if ( need2 < 128 )
m_buf = m_tmpBuf2;
m_buf4 = m_tmpBuf4;
else
m_buf = (char *)mmalloc ( need2 , "stkbuf" );
if ( ! m_buf ) return false;
char *x = m_buf;
m_buf4 = (char *)mmalloc ( need2 , "stkbuf" );
if ( ! m_buf4 ) return false;
char *x = m_buf4;
char *retired = x;
x += m_q->m_numWords;
char *maxGotIt = x;
@ -591,9 +593,10 @@ bool Summary::set2 ( Xml *xml ,
}
// free the mem we used if we allocated it
if ( m_buf && m_buf != m_tmpBuf2 )
mfree ( m_buf , m_bufSize , "ssstkb" );
m_buf = NULL;
if ( m_buf4 && m_buf4 != m_tmpBuf4 ) {
mfree ( m_buf4 , m_buf4Size , "ssstkb" );
m_buf4 = NULL;
}
// If we still didn't find a summary, get the default summary

@ -271,9 +271,9 @@ class Summary {
int32_t m_wordWeightSize;
char m_tmpBuf[128];
char *m_buf;
int32_t m_bufSize;
char m_tmpBuf2[128];
char *m_buf4;
int32_t m_buf4Size;
char m_tmpBuf4[128];
char m_summaryLocBuf[MAX_SUMMARY_LOCS*sizeof(uint64_t)];
SafeBuf m_summaryLocs;

@ -2569,11 +2569,10 @@ bool XmlDoc::indexDoc ( ) {
SafeBuf *ssDocMetaList = NULL;
// save this
int32_t saved = m_indexCode;
// and make it the real reason for the spider status doc
// make it the real reason for the spider status doc
m_indexCode = EDNSERROR;
// get the spiderreply ready to be added
ssDocMetaList = getSpiderStatusDocMetaList(NULL ,false);//del
// get the spiderreply ready to be added. false=del
ssDocMetaList =getSpiderStatusDocMetaList(NULL ,false);
// revert
m_indexCode = saved;
// error?
@ -2590,8 +2589,11 @@ bool XmlDoc::indexDoc ( ) {
char *url = "unknown";
if ( m_sreqValid ) url = m_sreq.m_url;
log("build: error2 getting real firstip of %"INT32" for "
"%s. Not adding new spider req", (int32_t)*fip,url);
log("build: error2 getting real firstip of "
"%"INT32" for "
"%s. Not adding new spider req. "
"spiderstatusdocsize=%"INT32, (int32_t)*fip,url,
m_addedStatusDocSize);
// also count it as a crawl attempt
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
@ -3134,8 +3136,9 @@ bool isRobotsTxtFile ( char *u , int32_t ulen ) {
bool XmlDoc::isContainerDoc ( ) {
if ( m_firstUrlValid && m_firstUrl.isWarc() ) return true;
if ( m_firstUrlValid && m_firstUrl.isArc () ) return true;
if ( ! m_contentDelimValid ) { char *xx=NULL;*xx=0; }
if ( m_contentDelim ) return true;
//if ( ! m_contentDelimValid ) { char *xx=NULL;*xx=0; }
//if ( m_contentDelim ) return true;
if ( m_contentDelimValid && m_contentDelim ) return true;
return false;
}
@ -28695,6 +28698,11 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
jd.safePrintf("\"gbssHttpStatus\":%"INT32",\n",
(int32_t)m_httpStatus);
// do not index gbssIsSeedUrl:0 because there will be too many usually
bool isSeed = ( m_sreqValid && m_sreq.m_isAddUrl );
if ( isSeed )
jd.safePrintf("\"gbssIsSeedUrl\":1,\n");
if ( od )
jd.safePrintf("\"gbssWasIndexed\":1,\n");
else
@ -28719,6 +28727,18 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
else
jd.safePrintf("\"gbssDiffbotUri\":"
"\"none\",\n");
// show the type as gbssDiffbotType:"article" etc.
JsonItem *dti = NULL;
if ( jp1 )
dti = jp1->getItem("type");
if ( dti ) {
jd.safePrintf("\"gbssDiffbotType\":\"");
int32_t vlen;
char *val = dti->getValueAsString( &vlen );
if ( val ) jd.jsonEncode ( val , vlen );
jd.safePrintf("\",\n");
}
}
else { // if ( cr->m_isCustomCrawl ) {
jd.safePrintf("\"gbssIsDiffbotObject\":0,\n");

@ -232,6 +232,17 @@ uint64_t hash64d ( char *p, int32_t plen ) {
char ncs = utf8Encode ( x , (char *)tmp );
// sanity check
if ( ncs > 4 ) { char *xx=NULL;*xx=0; }
// i've seen this happen for 4 byte char =
// -16,-112,-51,-125 which has x=66371 and y=66371
// but utf8Encode() returned 0!
if ( ncs == 0 ) {
// let's just hash it as-is then
tmp[0] = p[0];
if ( cs >= 1 ) tmp[1] = p[1];
if ( cs >= 2 ) tmp[2] = p[2];
if ( cs >= 3 ) tmp[3] = p[3];
ncs = cs;
}
// hash it up
h ^= g_hashtab [i++][tmp[0]];
if ( ncs == 1 ) continue;

33
hash.h

@ -250,6 +250,17 @@ inline uint64_t hash64Lower_utf8_nospaces ( char *p, int32_t len ) {
char ncs = utf8Encode ( y , tmp );
// sanity check
if ( ncs > 4 ) { char *xx=NULL;*xx=0; }
// i've seen this happen for 4 byte char =
// -16,-112,-51,-125 which has x=66371 and y=66371
// but utf8Encode() returned 0!
if ( ncs == 0 ) {
// let's just hash it as-is then
tmp[0] = p[0];
if ( cs >= 1 ) tmp[1] = p[1];
if ( cs >= 2 ) tmp[2] = p[2];
if ( cs >= 3 ) tmp[3] = p[3];
ncs = cs;
}
// hash it up
h ^= g_hashtab [i++][(uint8_t)tmp[0]];
if ( ncs == 1 ) continue;
@ -301,6 +312,17 @@ inline uint64_t hash64Lower_utf8_cont ( char *p,
char ncs = utf8Encode ( y , tmp );
// sanity check
if ( ncs > 4 ) { char *xx=NULL;*xx=0; }
// i've seen this happen for 4 byte char =
// -16,-112,-51,-125 which has x=66371 and y=66371
// but utf8Encode() returned 0!
if ( ncs == 0 ) {
// let's just hash it as-is then
tmp[0] = p[0];
if ( cs >= 1 ) tmp[1] = p[1];
if ( cs >= 2 ) tmp[2] = p[2];
if ( cs >= 3 ) tmp[3] = p[3];
ncs = cs;
}
// hash it up
h ^= g_hashtab [i++][(uint8_t)tmp[0]];
if ( ncs == 1 ) continue;
@ -376,6 +398,17 @@ inline uint64_t hash64Lower_utf8 ( char *p ) {
char ncs = utf8Encode ( y , (char *)tmp );
// sanity check
if ( ncs > 4 ) { char *xx=NULL;*xx=0; }
// i've seen this happen for 4 byte char =
// -16,-112,-51,-125 which has x=66371 and y=66371
// but utf8Encode() returned 0!
if ( ncs == 0 ) {
// let's just hash it as-is then
tmp[0] = p[0];
if ( cs >= 1 ) tmp[1] = p[1];
if ( cs >= 2 ) tmp[2] = p[2];
if ( cs >= 3 ) tmp[3] = p[3];
ncs = cs;
}
// hash it up
h ^= g_hashtab [i++][(uint8_t)tmp[0]];
if ( ncs == 1 ) continue;

@ -4998,7 +4998,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
if ( ! f.doesExist() ) target = "gb";
sprintf(tmp,
"scp -c blowfish " // blowfish is faster
"scp -c arcfour " // blowfish is faster
"%s%s "
"%s:%s/gb.installed%s",
dir,