forked from Mirrors/privacore-open-source-search-engine
Merge branch 'ia' into ia-zak
This commit is contained in:
@ -3579,7 +3579,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
|
||||
// lower from 7 to 1 since we have so many collections now
|
||||
// ok, now we have much less colls so raise back to 7
|
||||
int32_t diffbotipms = 7;// 1; // 7
|
||||
int32_t diffbotipms = 7;//1; // 7
|
||||
|
||||
// make the gigablast regex table just "default" so it does not
|
||||
// do any filtering, but accepts all urls. we will add code to pass the urls
|
||||
@ -3599,8 +3599,8 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
m_spiderIpWaits [i] = wait;
|
||||
m_spiderIpMaxSpiders[i] = diffbotipms; // keep it respectful
|
||||
// ethan wants some speed
|
||||
if ( isEthan )
|
||||
m_spiderIpMaxSpiders[i] = 30;
|
||||
// if ( isEthan )
|
||||
// m_spiderIpMaxSpiders[i] = 30;
|
||||
//m_spidersEnabled [i] = 1;
|
||||
m_spiderFreqs [i] = respiderFreq;
|
||||
//m_spiderDiffbotApiUrl[i].purge();
|
||||
@ -3623,6 +3623,18 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
m_forceDelete [i] = 1;
|
||||
i++;
|
||||
|
||||
// de-prioritize fakefirstip urls so we don't give the impression our
|
||||
// spiders are slow. like if someone adds a bulk job with 100,000 urls
|
||||
// then we sit there and process to lookup their ips and add a real
|
||||
// spider request (if it falls onto the same shard) before we actually
|
||||
// do any real spidering. so keep the priority here low.
|
||||
m_regExs[i].set("isfakeip");
|
||||
m_maxSpidersPerRule [i] = 7;
|
||||
m_spiderIpMaxSpiders [i] = 7;
|
||||
m_spiderPriorities [i] = 20;
|
||||
m_spiderIpWaits [i] = 0;
|
||||
i++;
|
||||
|
||||
// hopcount filter if asked for
|
||||
if( m_diffbotMaxHops >= 0 ) {
|
||||
|
||||
|
@ -18,6 +18,8 @@ void HashTableX::constructor() {
|
||||
m_useKeyMagic = false;
|
||||
m_ks = 0;
|
||||
m_allowGrowth = true;
|
||||
m_numSlots = 0;
|
||||
m_numSlotsUsed = 0;
|
||||
}
|
||||
|
||||
void HashTableX::destructor() {
|
||||
|
@ -1514,6 +1514,10 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
|
||||
// is recycled/destroyed
|
||||
// . this will call getMsgPiece() to fill up sendBuf from file
|
||||
int32_t totalToSend = mimeLen + bytesToSend;
|
||||
|
||||
//s->m_state = NULL; // do we need this? yes, cuz s is NULL for cleanUp
|
||||
if ( s && s->m_state == f ) s->m_state = NULL;
|
||||
|
||||
//if ( ! m_tcp.sendMsg ( s ,
|
||||
if ( ! tcp->sendMsg ( s ,
|
||||
sendBuf ,
|
||||
@ -1542,7 +1546,6 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
|
||||
if ( ! f->isOpen() ) f->open( O_RDONLY );
|
||||
int fd = f->getfd();
|
||||
cleanUp ( f , NULL/*TcpSocket */ );
|
||||
s->m_state = NULL; // do we need this? yes, cuz s is NULL for cleanUp
|
||||
// . AND we need to do this ourselves here
|
||||
// . do it SILENTLY so no message is logged if fd not registered
|
||||
if (tcp->m_useSSL)
|
||||
|
22
Matches.cpp
22
Matches.cpp
@ -30,6 +30,18 @@ Matches::Matches ( ) {
|
||||
}
|
||||
// Destructor: all cleanup is delegated to reset().
Matches::~Matches( ) { reset(); }
|
||||
void Matches::reset ( ) {
|
||||
reset2();
|
||||
if ( m_qwordFlags && m_qwordFlags != (mf_t *)m_tmpBuf ) {
|
||||
mfree ( m_qwordFlags , m_qwordAllocSize , "mmqw" );
|
||||
m_qwordFlags = NULL;
|
||||
}
|
||||
//m_explicitsMatched = 0;
|
||||
//m_matchableRequiredBits = 0;
|
||||
//m_hasAllQueryTerms = false;
|
||||
//m_matchesQuery = false;
|
||||
}
|
||||
|
||||
void Matches::reset2() {
|
||||
m_numMatches = 0;
|
||||
//m_maxNQT = -1;
|
||||
m_numAlnums = 0;
|
||||
@ -41,14 +53,6 @@ void Matches::reset ( ) {
|
||||
m_bitsArray [i].reset();
|
||||
}
|
||||
m_numMatchGroups = 0;
|
||||
if ( m_qwordFlags && m_qwordFlags != (mf_t *)m_tmpBuf ) {
|
||||
mfree ( m_qwordFlags , m_qwordAllocSize , "mmqw" );
|
||||
m_qwordFlags = NULL;
|
||||
}
|
||||
//m_explicitsMatched = 0;
|
||||
//m_matchableRequiredBits = 0;
|
||||
//m_hasAllQueryTerms = false;
|
||||
//m_matchesQuery = false;
|
||||
}
|
||||
|
||||
bool Matches::isMatchableTerm ( QueryTerm *qt ) { // , int32_t i ) {
|
||||
@ -298,7 +302,7 @@ bool Matches::set ( XmlDoc *xd ,
|
||||
int32_t niceness ) {
|
||||
|
||||
// don't reset query info!
|
||||
reset();
|
||||
reset2();
|
||||
|
||||
// sanity check
|
||||
if ( ! xd->m_docIdValid ) { char *xx=NULL;*xx=0; }
|
||||
|
@ -142,6 +142,7 @@ class Matches {
|
||||
Matches ( ) ;
|
||||
~Matches( ) ;
|
||||
void reset ( ) ;
|
||||
void reset2 ( ) ;
|
||||
|
||||
// BIG HACK support
|
||||
//int32_t getTermsFound ( bool *hadPhrases , bool *hadWords );
|
||||
|
14
Mem.h
14
Mem.h
@ -280,6 +280,20 @@ inline int32_t getNumBitsOn64 ( uint64_t bits ) {
|
||||
g_a [ *((unsigned char *)(&bits) + 7) ] ;
|
||||
}
|
||||
|
||||
inline int32_t getNumBitsOnX ( unsigned char *s , int32_t slen ) {
|
||||
if ( slen == 1 ) return getNumBitsOn8 ( *s );
|
||||
if ( slen == 2 ) return getNumBitsOn16 ( *(uint16_t *)s );
|
||||
if ( slen == 4 ) return getNumBitsOn32 ( *(uint32_t *)s );
|
||||
if ( slen == 3 )
|
||||
return getNumBitsOn8 ( s[0] ) +
|
||||
getNumBitsOn8 ( s[1] ) +
|
||||
getNumBitsOn8 ( s[2] ) ;
|
||||
int32_t total = 0;
|
||||
for ( int32_t i = 0 ; i < slen ; i++ )
|
||||
total += getNumBitsOn8 ( s[i] );
|
||||
return total;
|
||||
}
|
||||
|
||||
// assume only one bit is set for this (used by Address.cpp)
|
||||
inline int32_t getBitPosLL ( uint8_t *bit ) {
|
||||
// which int32_t is it in?
|
||||
|
28
Msg39.cpp
28
Msg39.cpp
@ -34,6 +34,10 @@ Msg39::Msg39 () {
|
||||
reset();
|
||||
}
|
||||
|
||||
// Destructor: all cleanup is delegated to reset().
// NOTE(review): the reset() shown in this diff deliberately cores when
// m_inUse is set, so destroying an in-use Msg39 presumably cores too --
// confirm against the full reset() body.
Msg39::~Msg39 () {
|
||||
reset();
|
||||
}
|
||||
|
||||
void Msg39::reset() {
|
||||
if ( m_inUse ) { char *xx=NULL;*xx=0; }
|
||||
m_allocedTree = false;
|
||||
@ -46,8 +50,14 @@ void Msg39::reset() {
|
||||
|
||||
void Msg39::reset2() {
|
||||
// reset lists
|
||||
for ( int32_t j = 0 ; j < m_msg2.m_numLists && m_lists ; j++ )
|
||||
m_lists[j].freeList();
|
||||
int32_t nqt = m_stackBuf.getLength() / sizeof(RdbList);
|
||||
//for ( int32_t j = 0 ; j < m_msg2.m_numLists && m_lists ; j++ ) {
|
||||
for ( int32_t j = 0 ; j < nqt && m_lists ; j++ ) {
|
||||
//m_lists[j].freeList();
|
||||
//log("msg39: destroy list @ 0x%"PTRFMT,(PTRTYPE)&m_lists[j]);
|
||||
// same thing but more generic
|
||||
m_lists[j].destructor();
|
||||
}
|
||||
m_stackBuf.purge();
|
||||
m_lists = NULL;
|
||||
m_msg2.reset();
|
||||
@ -207,7 +217,8 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
|
||||
if ( ! m_tmpq.set2 ( m_r->ptr_query ,
|
||||
m_r->m_language ,
|
||||
m_r->m_queryExpansion ,
|
||||
m_r->m_useQueryStopWords ) ) {
|
||||
m_r->m_useQueryStopWords ,
|
||||
m_r->m_maxQueryTerms ) ) {
|
||||
log("query: msg39: setQuery: %s." ,
|
||||
mstrerror(g_errno) );
|
||||
sendReply ( m_slot , this , NULL , 0 , 0 , true );
|
||||
@ -225,11 +236,14 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
|
||||
if ( m_tmpq.getNumTerms() != m_r->m_nqt ) {
|
||||
g_errno = EBADENGINEER;
|
||||
log("query: Query parsing inconsistency for q=%s. "
|
||||
"%i != %i. "
|
||||
"langid=%"INT32". Check langids and m_queryExpansion parms "
|
||||
"which are the only parms that could be different in "
|
||||
"Query::set2(). You probably have different mysynoyms.txt "
|
||||
"files on two different hosts! check that!!"
|
||||
,m_tmpq.m_orig
|
||||
,(int)m_tmpq.getNumTerms()
|
||||
,(int)m_r->m_nqt
|
||||
,(int32_t)m_r->m_language
|
||||
);
|
||||
sendReply ( m_slot , this , NULL , 0 , 0 , true );
|
||||
@ -767,11 +781,15 @@ bool Msg39::getLists () {
|
||||
|
||||
|
||||
int32_t nqt = m_tmpq.getNumTerms();
|
||||
if ( ! m_stackBuf.reserve ( sizeof(RdbList) * nqt ) ) return true;
|
||||
int32_t need = sizeof(RdbList) * nqt ;
|
||||
m_stackBuf.setLabel("stkbuf2");
|
||||
if ( ! m_stackBuf.reserve ( need ) ) return true;
|
||||
m_lists = (IndexList *)m_stackBuf.getBufStart();
|
||||
for ( int32_t i = 0 ; i < nqt ; i++ )
|
||||
m_stackBuf.setLength ( need );
|
||||
for ( int32_t i = 0 ; i < nqt ; i++ ) {
|
||||
m_lists[i].constructor();
|
||||
//log("msg39: constructlist @ 0x%"PTRFMT,(PTRTYPE)&m_lists[i]);
|
||||
}
|
||||
|
||||
// call msg2
|
||||
if ( ! m_msg2.getLists ( rdbId ,
|
||||
|
1
Msg39.h
1
Msg39.h
@ -216,6 +216,7 @@ class Msg39 {
|
||||
public:
|
||||
|
||||
Msg39();
|
||||
~Msg39();
|
||||
void reset();
|
||||
void reset2();
|
||||
// register our request handler for Msg39's
|
||||
|
@ -666,7 +666,7 @@ bool Msg40::federatedLoop ( ) {
|
||||
mr.size_whiteList = slen;
|
||||
mr.m_timeout = -1; // auto-determine based on #terms
|
||||
// make sure query term counts match in msg39
|
||||
mr.m_maxQueryTerms = m_si->m_maxQueryTerms;
|
||||
//mr.m_maxQueryTerms = m_si->m_maxQueryTerms;
|
||||
mr.m_realMaxTop = m_si->m_realMaxTop;
|
||||
|
||||
mr.m_minSerpDocId = m_si->m_minSerpDocId;
|
||||
@ -699,6 +699,9 @@ bool Msg40::federatedLoop ( ) {
|
||||
//if ( numDocIdSplits < 5 ) numDocIdSplits = 5;
|
||||
//}
|
||||
|
||||
if ( cr ) mr.m_maxQueryTerms = cr->m_maxQueryTerms;
|
||||
else mr.m_maxQueryTerms = 100;
|
||||
|
||||
// special oom hack fix
|
||||
if ( cr && cr->m_isCustomCrawl && numDocIdSplits < 4 )
|
||||
numDocIdSplits = 4;
|
||||
|
@ -2529,6 +2529,22 @@ bool printSearchResultsHeader ( State0 *st ) {
|
||||
, getLanguageString(si->m_queryLangId) );
|
||||
// print query words we ignored, like stop words
|
||||
printIgnoredWords ( sb , si );
|
||||
|
||||
sb->safePrintf("\t\t<queryNumTermsTotal>"
|
||||
"%"INT32
|
||||
"</queryNumTermsTotal>\n"
|
||||
, q->m_numTermsUntruncated );
|
||||
sb->safePrintf("\t\t<queryNumTermsUsed>"
|
||||
"%"INT32
|
||||
"</queryNumTermsUsed>\n"
|
||||
, q->m_numTerms );
|
||||
int32_t tval = 0;
|
||||
if ( q->m_numTerms < q->m_numTermsUntruncated ) tval = 1;
|
||||
sb->safePrintf("\t\t<queryWasTruncated>"
|
||||
"%"INT32
|
||||
"</queryWasTruncated>\n"
|
||||
, tval );
|
||||
|
||||
for ( int i = 0 ; i < q->m_numTerms ; i++ ) {
|
||||
sb->safePrintf("\t\t<term>\n");
|
||||
QueryTerm *qt = &q->m_qterms[i];
|
||||
@ -2605,6 +2621,19 @@ bool printSearchResultsHeader ( State0 *st ) {
|
||||
sb->safePrintf("\",\n");
|
||||
// print query words we ignored, like stop words
|
||||
printIgnoredWords ( sb , si );
|
||||
|
||||
sb->safePrintf("\t\"queryNumTermsTotal\":"
|
||||
"%"INT32",\n"
|
||||
, q->m_numTermsUntruncated );
|
||||
sb->safePrintf("\t\"queryNumTermsUsed\":"
|
||||
"%"INT32",\n"
|
||||
, q->m_numTerms );
|
||||
int32_t tval = 0;
|
||||
if ( q->m_numTerms < q->m_numTermsUntruncated ) tval = 1;
|
||||
sb->safePrintf("\t\"queryWasTruncated\":"
|
||||
"%"INT32",\n"
|
||||
, tval );
|
||||
|
||||
sb->safePrintf("\t\"terms\":[\n");
|
||||
for ( int i = 0 ; i < q->m_numTerms ; i++ ) {
|
||||
sb->safePrintf("\t\t{\n");
|
||||
@ -8263,8 +8292,11 @@ bool printCSVHeaderRow2 ( SafeBuf *sb ,
|
||||
hdr = "Hop Count";
|
||||
if ( ! strcmp(hdr,"gbssIp") )
|
||||
hdr = "IP";
|
||||
if ( ! strcmp(hdr,"gbssDiffbotUri" ) )
|
||||
hdr = "Diffbot URI";
|
||||
// csv report is regular urls not diffbot object urls so
|
||||
// regular urls do not have just a single diffboturi,
|
||||
// they could have 0 or multiple diffboturis
|
||||
//if ( ! strcmp(hdr,"gbssDiffbotUri" ) )
|
||||
// hdr = "Diffbot URI";
|
||||
if ( ! strcmp(hdr,"gbssSentToDiffbotThisTime") )
|
||||
hdr = "Process Attempted";
|
||||
if ( ! strcmp(hdr,"gbssDiffbotReplyMsg") )
|
||||
|
19
Pages.cpp
19
Pages.cpp
@ -3857,6 +3857,25 @@ bool printApiForPage ( SafeBuf *sb , int32_t PAGENUM , CollectionRec *cr ) {
|
||||
"</b>");
|
||||
|
||||
|
||||
sb->brify2 (
|
||||
"\t\t# List of space separated words in the "
|
||||
"query that were ignored for the most part. "
|
||||
"Because they were common words for the "
|
||||
"query language they are in.\n"
|
||||
, cols , "\n\t\t# " , false );
|
||||
sb->safePrintf("<b>\t\t\"ignoredWords\":\"to the\",\n\n"
|
||||
"</b>");
|
||||
|
||||
sb->brify2 (
|
||||
"\t\t# There is a maximum limit placed on the "
|
||||
"number of query terms we search on to keep things "
|
||||
"fast. This can "
|
||||
"be changed in the search controls.\n"
|
||||
, cols , "\n\t\t# " , false );
|
||||
sb->safePrintf("<b>\t\t\"queryNumTermsTotal\":52,\n</b>");
|
||||
sb->safePrintf("<b>\t\t\"queryNumTermsUsed\":20,\n</b>");
|
||||
sb->safePrintf("<b>\t\t\"queryWasTruncated\":1,\n\n</b>");
|
||||
|
||||
sb->brify2 (
|
||||
"\t\t# The start of the terms array. Each query "
|
||||
"is broken down into a list of terms. Each "
|
||||
|
24
Parms.cpp
24
Parms.cpp
@ -7879,17 +7879,19 @@ void Parms::init ( ) {
|
||||
m->m_obj = OBJ_COLL;
|
||||
m++;
|
||||
|
||||
//m->m_title = "max query terms";
|
||||
//m->m_desc = "Do not allow more than this many query terms. Will "
|
||||
// "return error in XML feed error tag if breeched.";
|
||||
//m->m_cgi = "mqt";
|
||||
//m->m_off = (char *)&cr.m_maxQueryTerms - x;
|
||||
m->m_title = "max query terms";
|
||||
m->m_desc = "Do not allow more than this many query terms. Helps "
|
||||
"prevent big queries from resource hogging.";
|
||||
m->m_cgi = "mqt";
|
||||
m->m_off = (char *)&cr.m_maxQueryTerms - x;
|
||||
//m->m_soff = (char *)&si.m_maxQueryTerms - y;
|
||||
//m->m_type = TYPE_LONG;
|
||||
//m->m_def = "20"; // 20 for testing, normally 16
|
||||
//m->m_sparm = 1;
|
||||
//m->m_spriv = 1;
|
||||
//m++;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "999999"; // now we got synonyms... etc
|
||||
m->m_group = 0;
|
||||
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_page = PAGE_SEARCH;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m++;
|
||||
|
||||
/*
|
||||
m->m_title = "dictionary site";
|
||||
@ -15283,7 +15285,7 @@ void Parms::init ( ) {
|
||||
m->m_type = TYPE_CHARPTR;
|
||||
m->m_page = PAGE_REINDEX;
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_def = "xx";
|
||||
m->m_def = "en";
|
||||
m->m_flags = PF_API ;
|
||||
m++;
|
||||
|
||||
|
55
Posdb.cpp
55
Posdb.cpp
@ -759,7 +759,6 @@ void PosdbTable::init ( Query *q ,
|
||||
// set this now
|
||||
//m_collnum = cr->m_collnum;
|
||||
|
||||
|
||||
// save it
|
||||
m_topTree = topTree;
|
||||
// a ptr for debugging i guess
|
||||
@ -773,6 +772,9 @@ void PosdbTable::init ( Query *q ,
|
||||
m_realMaxTop = r->m_realMaxTop;
|
||||
if ( m_realMaxTop > MAX_TOP ) m_realMaxTop = MAX_TOP;
|
||||
|
||||
m_siteRankMultiplier = SITERANKMULTIPLIER;
|
||||
if ( m_q->m_isBoolean ) m_siteRankMultiplier = 0.0;
|
||||
|
||||
// seo.cpp supplies a NULL msg2 because it already sets
|
||||
// QueryTerm::m_posdbListPtrs
|
||||
if ( ! msg2 ) return;
|
||||
@ -6304,12 +6306,7 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
}
|
||||
|
||||
if ( m_q->m_isBoolean ) {
|
||||
minScore = 1.0;
|
||||
// since we are jumping, we need to set m_docId here
|
||||
//m_docId = *(uint32_t *)(docIdPtr+1);
|
||||
//m_docId <<= 8;
|
||||
//m_docId |= (unsigned char)docIdPtr[0];
|
||||
//m_docId >>= 2;
|
||||
//minScore = 1.0;
|
||||
// we can't jump over setting of miniMergeList. do that.
|
||||
goto boolJump1;
|
||||
}
|
||||
@ -6521,6 +6518,30 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
|
||||
boolJump1:
|
||||
|
||||
if ( m_q->m_isBoolean ) {
|
||||
//minScore = 1.0;
|
||||
// this is somewhat wasteful since it is set below again
|
||||
m_docId = *(uint32_t *)(docIdPtr+1);
|
||||
m_docId <<= 8;
|
||||
m_docId |= (unsigned char)docIdPtr[0];
|
||||
m_docId >>= 2;
|
||||
// add one point for each term matched in the bool query
|
||||
// this is really just for when the terms are from different
|
||||
// fields. if we have unfielded boolean terms we should
|
||||
// do proximity matching.
|
||||
int32_t slot = m_bt.getSlot ( &m_docId );
|
||||
if ( slot >= 0 ) {
|
||||
uint8_t *bv = (uint8_t *)m_bt.getValueFromSlot(slot);
|
||||
// then a score based on the # of terms that matched
|
||||
int16_t bitsOn = getNumBitsOnX ( bv , m_vecSize );
|
||||
// but store in hashtable now
|
||||
minScore = (float)bitsOn;
|
||||
}
|
||||
else {
|
||||
minScore = 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
// we need to do this for seo hacks to merge the synonyms together
|
||||
// into one list
|
||||
seoHackSkip2:
|
||||
@ -7226,7 +7247,7 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
boolJump2:
|
||||
|
||||
// try dividing it by 3! (or multiply by .33333 faster)
|
||||
score = minScore * (((float)siteRank)*SITERANKMULTIPLIER+1.0);
|
||||
score = minScore * (((float)siteRank)*m_siteRankMultiplier+1.0);
|
||||
|
||||
// . not foreign language? give a huge boost
|
||||
// . use "qlang" parm to set the language. i.e. "&qlang=fr"
|
||||
@ -7896,7 +7917,7 @@ float PosdbTable::getMaxPossibleScore ( QueryTermInfo *qti ,
|
||||
score *= WIKI_BIGRAM_WEIGHT;
|
||||
}
|
||||
//score *= perfectWordSpamWeight * perfectWordSpamWeight;
|
||||
score *= (((float)siteRank)*SITERANKMULTIPLIER+1.0);
|
||||
score *= (((float)siteRank)*m_siteRankMultiplier+1.0);
|
||||
|
||||
// language boost if same language (or no lang specified)
|
||||
if ( m_r->m_language == docLang ||
|
||||
@ -8187,13 +8208,15 @@ bool PosdbTable::makeDocIdVoteBufForBoolQuery_r ( ) {
|
||||
// a 6 byte key means you pass
|
||||
gbmemcpy ( dst , &docId , 6 );
|
||||
// test it
|
||||
int64_t d2;
|
||||
d2 = *(uint32_t *)(dst+1);
|
||||
d2 <<= 8;
|
||||
d2 |= (unsigned char)dst[0];
|
||||
d2 >>= 2;
|
||||
docId >>= 2;
|
||||
if ( d2 != docId ) { char *xx=NULL;*xx=0; }
|
||||
if ( m_debug ) {
|
||||
int64_t d2;
|
||||
d2 = *(uint32_t *)(dst+1);
|
||||
d2 <<= 8;
|
||||
d2 |= (unsigned char)dst[0];
|
||||
d2 >>= 2;
|
||||
docId >>= 2;
|
||||
if ( d2 != docId ) { char *xx=NULL;*xx=0; }
|
||||
}
|
||||
// end test
|
||||
dst += 6;
|
||||
}
|
||||
|
2
Posdb.h
2
Posdb.h
@ -604,6 +604,8 @@ class PosdbTable {
|
||||
float m_finalScore;
|
||||
float m_preFinalScore;
|
||||
|
||||
float m_siteRankMultiplier;
|
||||
|
||||
// how long to add the last batch of lists
|
||||
int64_t m_addListsTime;
|
||||
int64_t m_t1 ;
|
||||
|
55
Query.cpp
55
Query.cpp
@ -74,6 +74,9 @@ void Query::reset ( ) {
|
||||
qw->destructor();
|
||||
}
|
||||
|
||||
m_stackBuf.purge();
|
||||
m_qterms = NULL;
|
||||
|
||||
m_sb.purge();
|
||||
m_osb.purge();
|
||||
m_docIdRestriction = 0LL;
|
||||
@ -140,14 +143,16 @@ bool Query::set2 ( char *query ,
|
||||
// need language for doing synonyms
|
||||
uint8_t langId ,
|
||||
char queryExpansion ,
|
||||
bool useQueryStopWords ) {
|
||||
//int32_t maxQueryTerms ) {
|
||||
bool useQueryStopWords ,
|
||||
int32_t maxQueryTerms ) {
|
||||
|
||||
m_langId = langId;
|
||||
m_useQueryStopWords = useQueryStopWords;
|
||||
// fix summary rerank and highlighting.
|
||||
bool keepAllSingles = true;
|
||||
|
||||
m_maxQueryTerms = maxQueryTerms;
|
||||
|
||||
// assume boolean auto-detect.
|
||||
char boolFlag = 2;
|
||||
|
||||
@ -159,7 +164,7 @@ bool Query::set2 ( char *query ,
|
||||
if ( ! query ) return true;
|
||||
|
||||
// set to 256 for synonyms?
|
||||
m_maxQueryTerms = 256;
|
||||
//m_maxQueryTerms = 256;
|
||||
m_queryExpansion = queryExpansion;
|
||||
|
||||
int32_t queryLen = gbstrlen(query);
|
||||
@ -601,7 +606,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
int32_t max = (int32_t)MAX_EXPLICIT_BITS;
|
||||
if ( max > m_maxQueryTerms ) max = m_maxQueryTerms;
|
||||
|
||||
// count them first for allocating
|
||||
// count phrases first for allocating
|
||||
int32_t nqt = 0;
|
||||
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
||||
QueryWord *qw = &m_qwords[i];
|
||||
@ -653,6 +658,10 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
continue;
|
||||
// skip if ignored like a stopword (stop to->too)
|
||||
//if ( qw->m_ignoreWord ) continue;
|
||||
// ignore title: etc. words, they are field names
|
||||
if ( qw->m_ignoreWord == IGNORE_FIELDNAME ) continue;
|
||||
// ignore boolean operators
|
||||
if ( qw->m_ignoreWord ) continue;// IGNORE_BOOLOP
|
||||
// no, hurts 'Greencastle IN economic development'
|
||||
if ( qw->m_wordId == to ) continue;
|
||||
// single letters...
|
||||
@ -673,7 +682,9 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
nqt += naids;
|
||||
}
|
||||
|
||||
m_numTermsUntruncated = nqt;
|
||||
|
||||
if ( nqt > m_maxQueryTerms ) nqt = m_maxQueryTerms;
|
||||
|
||||
// allocate the stack buf
|
||||
if ( nqt ) {
|
||||
@ -719,6 +730,11 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
"limit of %"INT32"",(int32_t)ABS_MAX_QUERY_TERMS );
|
||||
break;
|
||||
}
|
||||
if ( n >= m_maxQueryTerms ) {
|
||||
log("query: lost query phrase terms to max term cr "
|
||||
"limit of %"INT32"",(int32_t)m_maxQueryTerms);
|
||||
break;
|
||||
}
|
||||
|
||||
QueryTerm *qt = &m_qterms[n];
|
||||
qt->m_qword = qw ;
|
||||
@ -877,6 +893,11 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
"limit of %"INT32"",(int32_t)ABS_MAX_QUERY_TERMS );
|
||||
break;
|
||||
}
|
||||
if ( n >= m_maxQueryTerms ) {
|
||||
log("query: lost query terms to max term cr "
|
||||
"limit of %"INT32"",(int32_t)m_maxQueryTerms);
|
||||
break;
|
||||
}
|
||||
|
||||
QueryTerm *qt = &m_qterms[n];
|
||||
qt->m_qword = qw ;
|
||||
@ -1389,6 +1410,10 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
continue;
|
||||
// skip if ignored like a stopword (stop to->too)
|
||||
//if ( qw->m_ignoreWord ) continue;
|
||||
// ignore title: etc. words, they are field names
|
||||
if ( qw->m_ignoreWord == IGNORE_FIELDNAME ) continue;
|
||||
// ignore boolean operators
|
||||
if ( qw->m_ignoreWord ) continue;// IGNORE_BOOLOP
|
||||
// no, hurts 'Greencastle IN economic development'
|
||||
if ( qw->m_wordId == to ) continue;
|
||||
// single letters...
|
||||
@ -1424,6 +1449,14 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
}
|
||||
// this happens for 'da da da'
|
||||
if ( ! origTerm ) continue;
|
||||
|
||||
if ( n >= m_maxQueryTerms ) {
|
||||
log("query: lost synonyms due to max cr term "
|
||||
"limit of %"INT32"",
|
||||
(int32_t)m_maxQueryTerms);
|
||||
break;
|
||||
}
|
||||
|
||||
// add that query term
|
||||
QueryTerm *qt = &m_qterms[n];
|
||||
qt->m_qword = qw; // NULL;
|
||||
@ -2483,12 +2516,14 @@ bool Query::setQWords ( char boolFlag ,
|
||||
// in quotes which is silly, so undo it. But we should
|
||||
// still inherit any quoteSign, however. Be sure to also
|
||||
// set m_inQuotes to false so Matches.cpp::matchWord() works.
|
||||
if ( i == quoteStart ) { // + 1 ) {
|
||||
if ( i + 1 >= numWords || words.getNumQuotes(i+1)>0 ) {
|
||||
qw->m_quoteStart = -1;
|
||||
qw->m_inQuotes = false;
|
||||
}
|
||||
}
|
||||
// MDW: don't undo it because we do not want to get synonyms
|
||||
// of terms in quotes. 7/15/2015
|
||||
// if ( i == quoteStart ) { // + 1 ) {
|
||||
// if ( i + 1 >= numWords || words.getNumQuotes(i+1)>0 ) {
|
||||
// qw->m_quoteStart = -1;
|
||||
// qw->m_inQuotes = false;
|
||||
// }
|
||||
// }
|
||||
// . get prefix hash of collection name and field
|
||||
// . but first convert field to lower case
|
||||
uint64_t ph;
|
||||
|
10
Query.h
10
Query.h
@ -635,10 +635,10 @@ class Query {
|
||||
//int32_t collLen ,
|
||||
uint8_t langId ,
|
||||
char queryExpansion ,
|
||||
bool useQueryStopWords = true );
|
||||
//char boolFlag = 2 , // auto-detect if boolean query
|
||||
//bool keepAllSingles = false ,
|
||||
//int32_t maxQueryTerms = 0x7fffffff );
|
||||
bool useQueryStopWords = true ,
|
||||
//char boolFlag = 2 , // auto-detect if boolean query
|
||||
//bool keepAllSingles = false ,
|
||||
int32_t maxQueryTerms = 0x7fffffff );
|
||||
|
||||
// serialize/deserialize ourselves so we don't have to pass the
|
||||
// unmodified string around and reparse it every time
|
||||
@ -941,6 +941,8 @@ class Query {
|
||||
int32_t m_numTerms;
|
||||
int32_t m_numTermsSpecial;
|
||||
|
||||
int32_t m_numTermsUntruncated;
|
||||
|
||||
// separate vectors for easier interfacing, 1-1 with m_qterms
|
||||
//int64_t m_termFreqs [ MAX_QUERY_TERMS ];
|
||||
//int64_t m_termIds [ MAX_QUERY_TERMS ];
|
||||
|
26
RdbDump.cpp
26
RdbDump.cpp
@ -373,12 +373,12 @@ bool RdbDump::dumpTree ( bool recall ) {
|
||||
//if ( removeNegRecs )
|
||||
// m_list.removeNegRecs();
|
||||
|
||||
// if(!m_list->checkList_r ( false , // removeNegRecs?
|
||||
// false , // sleep on problem?
|
||||
// m_rdb->m_rdbId )) {
|
||||
// log("db: list to dump is not sane!");
|
||||
// char *xx=NULL;*xx=0;
|
||||
// }
|
||||
// if(!m_list->checkList_r ( false , // removeNegRecs?
|
||||
// false , // sleep on problem?
|
||||
// m_rdb->m_rdbId )) {
|
||||
// log("db: list to dump is not sane!");
|
||||
// char *xx=NULL;*xx=0;
|
||||
// }
|
||||
|
||||
|
||||
skip:
|
||||
@ -781,6 +781,10 @@ bool RdbDump::doneReadingForVerify ( ) {
|
||||
if ( m_addToMap ) t = gettimeofdayInMilliseconds();
|
||||
// sanity check
|
||||
if ( m_list->m_ks != m_ks ) { char *xx = NULL; *xx = 0; }
|
||||
|
||||
bool triedToFix = false;
|
||||
|
||||
tryAgain:
|
||||
// . register this with the map now
|
||||
// . only register AFTER it's ALL on disk so we don't get partial
|
||||
// record reads and we don't read stuff on disk that's also in tree
|
||||
@ -788,6 +792,16 @@ bool RdbDump::doneReadingForVerify ( ) {
|
||||
// . we don't have maps when we do unordered dumps
|
||||
// . careful, map is NULL if we're doing unordered dump
|
||||
if ( m_addToMap && m_map && ! m_map->addList ( m_list ) ) {
|
||||
// keys out of order in list from tree?
|
||||
if ( g_errno == ECORRUPTDATA ) {
|
||||
log("db: trying to fix tree or buckets");
|
||||
if ( m_tree ) m_tree->fixTree();
|
||||
//if ( m_buckets ) m_buckets->fixBuckets();
|
||||
if ( m_buckets ) { char *xx=NULL;*xx=0; }
|
||||
if ( triedToFix ) { char *xx=NULL;*xx=0; }
|
||||
triedToFix = true;
|
||||
goto tryAgain;
|
||||
}
|
||||
g_errno = ENOMEM;
|
||||
log("db: Failed to add data to map.");
|
||||
// undo the offset update, the write failed, the parent
|
||||
|
@ -624,7 +624,8 @@ bool RdbList::growList ( int32_t newSize ) {
|
||||
// don't shrink list
|
||||
if ( newSize <= m_allocSize ) return true;
|
||||
// debug msg
|
||||
//log("RdbList::growList from %"INT32" to %"INT32"",m_allocSize , newSize );
|
||||
// log("RdbList::growList 0x%"PTRFMT "from %"INT32" to %"INT32"",
|
||||
// (PTRTYPE)this,m_allocSize , newSize );
|
||||
// make a new buffer
|
||||
char *tmp =(char *) mrealloc ( m_alloc,m_allocSize,newSize,"RdbList");
|
||||
//if ( (int32_t)tmp == 0x904dbd0 )
|
||||
|
27
RdbMap.cpp
27
RdbMap.cpp
@ -323,8 +323,9 @@ bool RdbMap::verifyMap2 ( ) {
|
||||
KEYSET(lastKey,k,m_ks); continue; }
|
||||
// just bitch for now
|
||||
log(
|
||||
"db: Key out of order in map file %s%s. "
|
||||
"page = %"INT32". key offset = %"INT64". Map or data file is "
|
||||
"db: Key out of order in map file %s/%s. "
|
||||
"page = %"INT32". key offset = %"INT64". "
|
||||
"Map or data file is "
|
||||
"corrupt, but it is probably the data file. Please "
|
||||
"delete the map file and restart.",
|
||||
m_file.m_dir,m_file.getFilename() ,
|
||||
@ -337,6 +338,15 @@ bool RdbMap::verifyMap2 ( ) {
|
||||
KEY1(lastKey,m_ks),KEY0(lastKey));
|
||||
log("db: k.n1=%016"XINT64" n0=%016"XINT64"",KEY1(k,m_ks),KEY0(k));
|
||||
log("db: m_numPages = %"INT32"",m_numPages);
|
||||
|
||||
SafeBuf cmd;
|
||||
cmd.safePrintf("mv %s/%s %s/trash/",
|
||||
m_file.m_dir,
|
||||
m_file.getFilename(),
|
||||
g_hostdb.m_dir);
|
||||
log("db: %s",cmd.getBufStart() );
|
||||
gbsystem ( cmd.getBufStart() );
|
||||
|
||||
exit(0);
|
||||
//char *xx=NULL;*xx=0;
|
||||
// was k too small?
|
||||
@ -543,7 +553,8 @@ bool RdbMap::addRecord ( char *key, char *rec , int32_t recSize ) {
|
||||
m_lastLogTime = getTime();
|
||||
//pageNum > 0 && getKey(pageNum-1) > getKey(pageNum) ) {
|
||||
log(LOG_LOGIC,"build: RdbMap: added key out of order. "
|
||||
"count=%"INT64".",m_badKeys);
|
||||
"count=%"INT64" file=%s/%s.",m_badKeys,
|
||||
m_file.m_dir,m_file.getFilename());
|
||||
//log(LOG_LOGIC,"build: k.n1=%"XINT32" %"XINT64" lastKey.n1=%"XINT32" %"XINT64"",
|
||||
// key.n1,key.n0,m_lastKey.n1,m_lastKey.n0 );
|
||||
log(LOG_LOGIC,"build: offset=%"INT64"",
|
||||
@ -556,7 +567,10 @@ bool RdbMap::addRecord ( char *key, char *rec , int32_t recSize ) {
|
||||
g_errno = ECORRUPTDATA;
|
||||
return false;
|
||||
}
|
||||
char *xx=NULL;*xx=0;
|
||||
// if being called from RdbDump.cpp...
|
||||
g_errno = ECORRUPTDATA;
|
||||
return false;
|
||||
//char *xx=NULL;*xx=0;
|
||||
// . during a merge, corruption can happen, so let's core
|
||||
// here until we figure out how to fix it.
|
||||
// . and why wasn't the corruption discovered and patched
|
||||
@ -719,7 +733,10 @@ bool RdbMap::addList ( RdbList *list ) {
|
||||
if ( ! addRecord ( key , rec , recSize ) ) {
|
||||
log("db: Failed to add record to map: %s.",
|
||||
mstrerror(g_errno));
|
||||
char *xx = NULL; *xx = 0;
|
||||
// allow caller to try to fix the tree in the case of dumping
|
||||
// a tree to a file on disk
|
||||
return false;
|
||||
//char *xx = NULL; *xx = 0;
|
||||
}
|
||||
if ( list->skipCurrentRecord() ) goto top2;
|
||||
|
||||
|
19
RdbTree.cpp
19
RdbTree.cpp
@ -1283,19 +1283,26 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
|
||||
if ( m_right[i] >= 0 && m_parents[m_right[i]] != i )
|
||||
return log(
|
||||
"db: Tree right kid and parent disagree.");
|
||||
/*
|
||||
// MDW: why did i comment out the order checking?
|
||||
// check order
|
||||
if ( m_left[i] >= 0 ) {
|
||||
if ( m_left[i] >= 0 &&
|
||||
m_collnums[i] == m_collnums[m_left[i]] ) {
|
||||
char *key = &m_keys[i*m_ks];
|
||||
char *left = &m_keys[m_left[i]*m_ks];
|
||||
if ( KEYCMP(key,left,m_ks)<0) {char *xx=NULL;*xx=0;}
|
||||
if ( KEYCMP(key,left,m_ks)<0)
|
||||
return log("db: Tree left kid > parent %i",i);
|
||||
|
||||
}
|
||||
if ( m_right[i] >= 0 ) {
|
||||
if ( m_right[i] >= 0 &&
|
||||
m_collnums[i] == m_collnums[m_right[i]] ) {
|
||||
char *key = &m_keys[i*m_ks];
|
||||
char *right = &m_keys[m_right[i]*m_ks];
|
||||
if ( KEYCMP(key,right,m_ks)>0) {char *xx=NULL;*xx=0;}
|
||||
if ( KEYCMP(key,right,m_ks)>0)
|
||||
return log("db: Tree right kid < parent %i "
|
||||
"%s < %s",i,
|
||||
KEYSTR(right,m_ks),
|
||||
KEYSTR(key,m_ks) );
|
||||
}
|
||||
*/
|
||||
//g_loop.quickPoll(1, __PRETTY_FUNCTION__, __LINE__);
|
||||
}
|
||||
if ( hkp > 0 )
|
||||
|
@ -470,14 +470,16 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
|
||||
log("query: qlang of \"%s\" is NOT SUPPORTED. using "
|
||||
"langUnknown, \"xx\".",langAbbr);
|
||||
|
||||
int32_t maxQueryTerms = cr->m_maxQueryTerms;
|
||||
|
||||
// . the query to use for highlighting... can be overridden with "hq"
|
||||
// . we need the language id for doing synonyms
|
||||
if ( m_prepend && m_prepend[0] )
|
||||
m_hqq.set2 ( m_prepend , m_queryLangId , true );
|
||||
m_hqq.set2 ( m_prepend , m_queryLangId , true ,maxQueryTerms);
|
||||
else if ( m_highlightQuery && m_highlightQuery[0] )
|
||||
m_hqq.set2 ( m_highlightQuery , m_queryLangId , true );
|
||||
m_hqq.set2 (m_highlightQuery,m_queryLangId,true,maxQueryTerms);
|
||||
else if ( m_query && m_query[0] )
|
||||
m_hqq.set2 ( m_query , m_queryLangId , true );
|
||||
m_hqq.set2 ( m_query , m_queryLangId , true,maxQueryTerms);
|
||||
|
||||
// log it here
|
||||
log(LOG_INFO,
|
||||
@ -489,7 +491,9 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
|
||||
// . returns false and sets g_errno on error (ETOOMANYOPERANDS)
|
||||
if ( ! m_q.set2 ( m_sbuf1.getBufStart(),
|
||||
m_queryLangId ,
|
||||
m_queryExpansion ) ) {
|
||||
m_queryExpansion ,
|
||||
true , // use QUERY stopwords?
|
||||
maxQueryTerms ) ) {
|
||||
g_msg = " (error: query has too many operands)";
|
||||
return false;
|
||||
}
|
||||
|
23
Spider.cpp
23
Spider.cpp
@ -11641,6 +11641,18 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
if ( strncmp ( p , "isfakeip",8 ) == 0 ) {
|
||||
// skip for msg20
|
||||
if ( isForMsg20 ) continue;
|
||||
// if no match continue
|
||||
if ( (bool)sreq->m_fakeFirstIp == val ) continue;
|
||||
p += 8;
|
||||
p = strstr(p, "&&");
|
||||
if ( ! p ) return i;
|
||||
p += 2;
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
if ( strncmp ( p , "isonsamedomain",14 ) == 0 ) {
|
||||
// skip for msg20
|
||||
if ( isForMsg20 ) continue;
|
||||
@ -13993,6 +14005,17 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
|
||||
return msg->safePrintf("Job is initializing.");
|
||||
}
|
||||
|
||||
// if we had seeds and none were successfully crawled, do not just
|
||||
// print that the crawl completed.
|
||||
if ( cx->m_collectiveRespiderFrequency <= 0.0 &&
|
||||
cx->m_isCustomCrawl &&
|
||||
! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider &&
|
||||
cx->m_globalCrawlInfo.m_pageDownloadAttempts > 0 &&
|
||||
cx->m_globalCrawlInfo.m_pageDownloadSuccesses == 0 ) {
|
||||
*status = SP_SEEDSERROR;
|
||||
return msg->safePrintf("Failed to crawl any seed.");
|
||||
}
|
||||
|
||||
// if we sent an email simply because no urls
|
||||
// were left and we are not recrawling!
|
||||
if ( cx->m_collectiveRespiderFrequency <= 0.0 &&
|
||||
|
1
Spider.h
1
Spider.h
@ -39,6 +39,7 @@
|
||||
#define SP_INPROGRESS 7 // it is going on!
|
||||
#define SP_ADMIN_PAUSED 8 // g_conf.m_spideringEnabled = false
|
||||
#define SP_COMPLETED 9 // crawl is done, and no repeatCrawl is scheduled
|
||||
#define SP_SEEDSERROR 10 // all seeds had an error preventing crawling
|
||||
|
||||
bool tryToDeleteSpiderColl ( SpiderColl *sc , char *msg ) ;
|
||||
void spiderRoundIncremented ( class CollectionRec *cr ) ;
|
||||
|
25
Summary.cpp
25
Summary.cpp
@ -13,6 +13,7 @@ Summary::Summary()
|
||||
m_bitScoresBuf = NULL;
|
||||
m_bitScoresBufSize = 0;
|
||||
m_wordWeights = NULL;
|
||||
m_buf4 = NULL;
|
||||
reset();
|
||||
}
|
||||
|
||||
@ -42,9 +43,10 @@ void Summary::reset() {
|
||||
m_wordWeights = NULL;
|
||||
}
|
||||
m_wordWeights = NULL;
|
||||
if ( m_buf && m_buf != m_tmpBuf2 )
|
||||
mfree ( m_buf , m_bufSize , "ssstkb" );
|
||||
m_buf = NULL;
|
||||
if ( m_buf4 && m_buf4 != m_tmpBuf4 ) {
|
||||
mfree ( m_buf4 , m_buf4Size , "ssstkb" );
|
||||
m_buf4 = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -248,13 +250,13 @@ bool Summary::set2 ( Xml *xml ,
|
||||
m_numExcerpts = 0;
|
||||
|
||||
int32_t need2 = (1+1+1) * m_q->m_numWords;
|
||||
m_bufSize = need2;
|
||||
m_buf4Size = need2;
|
||||
if ( need2 < 128 )
|
||||
m_buf = m_tmpBuf2;
|
||||
m_buf4 = m_tmpBuf4;
|
||||
else
|
||||
m_buf = (char *)mmalloc ( need2 , "stkbuf" );
|
||||
if ( ! m_buf ) return false;
|
||||
char *x = m_buf;
|
||||
m_buf4 = (char *)mmalloc ( need2 , "stkbuf" );
|
||||
if ( ! m_buf4 ) return false;
|
||||
char *x = m_buf4;
|
||||
char *retired = x;
|
||||
x += m_q->m_numWords;
|
||||
char *maxGotIt = x;
|
||||
@ -591,9 +593,10 @@ bool Summary::set2 ( Xml *xml ,
|
||||
}
|
||||
|
||||
// free the mem we used if we allocated it
|
||||
if ( m_buf && m_buf != m_tmpBuf2 )
|
||||
mfree ( m_buf , m_bufSize , "ssstkb" );
|
||||
m_buf = NULL;
|
||||
if ( m_buf4 && m_buf4 != m_tmpBuf4 ) {
|
||||
mfree ( m_buf4 , m_buf4Size , "ssstkb" );
|
||||
m_buf4 = NULL;
|
||||
}
|
||||
|
||||
|
||||
// If we still didn't find a summary, get the default summary
|
||||
|
@ -271,9 +271,9 @@ class Summary {
|
||||
int32_t m_wordWeightSize;
|
||||
char m_tmpBuf[128];
|
||||
|
||||
char *m_buf;
|
||||
int32_t m_bufSize;
|
||||
char m_tmpBuf2[128];
|
||||
char *m_buf4;
|
||||
int32_t m_buf4Size;
|
||||
char m_tmpBuf4[128];
|
||||
|
||||
char m_summaryLocBuf[MAX_SUMMARY_LOCS*sizeof(uint64_t)];
|
||||
SafeBuf m_summaryLocs;
|
||||
|
36
XmlDoc.cpp
36
XmlDoc.cpp
@ -2569,11 +2569,10 @@ bool XmlDoc::indexDoc ( ) {
|
||||
SafeBuf *ssDocMetaList = NULL;
|
||||
// save this
|
||||
int32_t saved = m_indexCode;
|
||||
// and make it the real reason for the spider status doc
|
||||
// make it the real reason for the spider status doc
|
||||
m_indexCode = EDNSERROR;
|
||||
// get the spiderreply ready to be added
|
||||
|
||||
ssDocMetaList = getSpiderStatusDocMetaList(NULL ,false);//del
|
||||
// get the spiderreply ready to be added. false=del
|
||||
ssDocMetaList =getSpiderStatusDocMetaList(NULL ,false);
|
||||
// revert
|
||||
m_indexCode = saved;
|
||||
// error?
|
||||
@ -2590,8 +2589,11 @@ bool XmlDoc::indexDoc ( ) {
|
||||
|
||||
char *url = "unknown";
|
||||
if ( m_sreqValid ) url = m_sreq.m_url;
|
||||
log("build: error2 getting real firstip of %"INT32" for "
|
||||
"%s. Not adding new spider req", (int32_t)*fip,url);
|
||||
log("build: error2 getting real firstip of "
|
||||
"%"INT32" for "
|
||||
"%s. Not adding new spider req. "
|
||||
"spiderstatusdocsize=%"INT32, (int32_t)*fip,url,
|
||||
m_addedStatusDocSize);
|
||||
// also count it as a crawl attempt
|
||||
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
|
||||
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
|
||||
@ -3134,8 +3136,9 @@ bool isRobotsTxtFile ( char *u , int32_t ulen ) {
|
||||
bool XmlDoc::isContainerDoc ( ) {
|
||||
if ( m_firstUrlValid && m_firstUrl.isWarc() ) return true;
|
||||
if ( m_firstUrlValid && m_firstUrl.isArc () ) return true;
|
||||
if ( ! m_contentDelimValid ) { char *xx=NULL;*xx=0; }
|
||||
if ( m_contentDelim ) return true;
|
||||
//if ( ! m_contentDelimValid ) { char *xx=NULL;*xx=0; }
|
||||
//if ( m_contentDelim ) return true;
|
||||
if ( m_contentDelimValid && m_contentDelim ) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -28695,6 +28698,11 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
|
||||
jd.safePrintf("\"gbssHttpStatus\":%"INT32",\n",
|
||||
(int32_t)m_httpStatus);
|
||||
|
||||
// do not index gbssIsSeedUrl:0 because there will be too many usually
|
||||
bool isSeed = ( m_sreqValid && m_sreq.m_isAddUrl );
|
||||
if ( isSeed )
|
||||
jd.safePrintf("\"gbssIsSeedUrl\":1,\n");
|
||||
|
||||
if ( od )
|
||||
jd.safePrintf("\"gbssWasIndexed\":1,\n");
|
||||
else
|
||||
@ -28719,6 +28727,18 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
|
||||
else
|
||||
jd.safePrintf("\"gbssDiffbotUri\":"
|
||||
"\"none\",\n");
|
||||
// show the type as gbssDiffbotType:"article" etc.
|
||||
JsonItem *dti = NULL;
|
||||
if ( jp1 )
|
||||
dti = jp1->getItem("type");
|
||||
if ( dti ) {
|
||||
jd.safePrintf("\"gbssDiffbotType\":\"");
|
||||
int32_t vlen;
|
||||
char *val = dti->getValueAsString( &vlen );
|
||||
if ( val ) jd.jsonEncode ( val , vlen );
|
||||
jd.safePrintf("\",\n");
|
||||
}
|
||||
|
||||
}
|
||||
else { // if ( cr->m_isCustomCrawl ) {
|
||||
jd.safePrintf("\"gbssIsDiffbotObject\":0,\n");
|
||||
|
11
hash.cpp
11
hash.cpp
@ -232,6 +232,17 @@ uint64_t hash64d ( char *p, int32_t plen ) {
|
||||
char ncs = utf8Encode ( x , (char *)tmp );
|
||||
// sanity check
|
||||
if ( ncs > 4 ) { char *xx=NULL;*xx=0; }
|
||||
// i've seen this happen for 4 byte char =
|
||||
// -16,-112,-51,-125 which has x=66371 and y=66371
|
||||
// but utf8Encode() returned 0!
|
||||
if ( ncs == 0 ) {
|
||||
// let's just hash it as-is then
|
||||
tmp[0] = p[0];
|
||||
if ( cs >= 1 ) tmp[1] = p[1];
|
||||
if ( cs >= 2 ) tmp[2] = p[2];
|
||||
if ( cs >= 3 ) tmp[3] = p[3];
|
||||
ncs = cs;
|
||||
}
|
||||
// hash it up
|
||||
h ^= g_hashtab [i++][tmp[0]];
|
||||
if ( ncs == 1 ) continue;
|
||||
|
33
hash.h
33
hash.h
@ -250,6 +250,17 @@ inline uint64_t hash64Lower_utf8_nospaces ( char *p, int32_t len ) {
|
||||
char ncs = utf8Encode ( y , tmp );
|
||||
// sanity check
|
||||
if ( ncs > 4 ) { char *xx=NULL;*xx=0; }
|
||||
// i've seen this happen for 4 byte char =
|
||||
// -16,-112,-51,-125 which has x=66371 and y=66371
|
||||
// but utf8Encode() returned 0!
|
||||
if ( ncs == 0 ) {
|
||||
// let's just hash it as-is then
|
||||
tmp[0] = p[0];
|
||||
if ( cs >= 1 ) tmp[1] = p[1];
|
||||
if ( cs >= 2 ) tmp[2] = p[2];
|
||||
if ( cs >= 3 ) tmp[3] = p[3];
|
||||
ncs = cs;
|
||||
}
|
||||
// hash it up
|
||||
h ^= g_hashtab [i++][(uint8_t)tmp[0]];
|
||||
if ( ncs == 1 ) continue;
|
||||
@ -301,6 +312,17 @@ inline uint64_t hash64Lower_utf8_cont ( char *p,
|
||||
char ncs = utf8Encode ( y , tmp );
|
||||
// sanity check
|
||||
if ( ncs > 4 ) { char *xx=NULL;*xx=0; }
|
||||
// i've seen this happen for 4 byte char =
|
||||
// -16,-112,-51,-125 which has x=66371 and y=66371
|
||||
// but utf8Encode() returned 0!
|
||||
if ( ncs == 0 ) {
|
||||
// let's just hash it as-is then
|
||||
tmp[0] = p[0];
|
||||
if ( cs >= 1 ) tmp[1] = p[1];
|
||||
if ( cs >= 2 ) tmp[2] = p[2];
|
||||
if ( cs >= 3 ) tmp[3] = p[3];
|
||||
ncs = cs;
|
||||
}
|
||||
// hash it up
|
||||
h ^= g_hashtab [i++][(uint8_t)tmp[0]];
|
||||
if ( ncs == 1 ) continue;
|
||||
@ -376,6 +398,17 @@ inline uint64_t hash64Lower_utf8 ( char *p ) {
|
||||
char ncs = utf8Encode ( y , (char *)tmp );
|
||||
// sanity check
|
||||
if ( ncs > 4 ) { char *xx=NULL;*xx=0; }
|
||||
// i've seen this happen for 4 byte char =
|
||||
// -16,-112,-51,-125 which has x=66371 and y=66371
|
||||
// but utf8Encode() returned 0!
|
||||
if ( ncs == 0 ) {
|
||||
// let's just hash it as-is then
|
||||
tmp[0] = p[0];
|
||||
if ( cs >= 1 ) tmp[1] = p[1];
|
||||
if ( cs >= 2 ) tmp[2] = p[2];
|
||||
if ( cs >= 3 ) tmp[3] = p[3];
|
||||
ncs = cs;
|
||||
}
|
||||
// hash it up
|
||||
h ^= g_hashtab [i++][(uint8_t)tmp[0]];
|
||||
if ( ncs == 1 ) continue;
|
||||
|
2
main.cpp
2
main.cpp
@ -4998,7 +4998,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
|
||||
if ( ! f.doesExist() ) target = "gb";
|
||||
|
||||
sprintf(tmp,
|
||||
"scp -c blowfish " // blowfish is faster
|
||||
"scp -c arcfour " // blowfish is faster
|
||||
"%s%s "
|
||||
"%s:%s/gb.installed%s",
|
||||
dir,
|
||||
|
Reference in New Issue
Block a user