Merge branch 'diffbot-testing' into ia

Conflicts:
	Parms.cpp
This commit is contained in:
Matt
2015-10-10 14:05:27 -06:00
27 changed files with 537 additions and 163 deletions

@ -532,6 +532,10 @@ Timedb.o:
HashTableX.o:
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
# getUrlFilterNum2()
Spider.o:
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
SpiderCache.o:
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp

@ -21,7 +21,7 @@
// uncomment this for EFENCE to do underflow checks instead of the
// default overflow checks
//#define _CHECKUNDERFLOW_
//#define CHECKUNDERFLOW
// only Mem.cpp can call ::malloc, everyone else must call mmalloc() so
// we can keep tabs on memory usage. in Mem.h we #define this to be coreme()
@ -2168,7 +2168,7 @@ void *getElecMem ( int32_t size ) {
// a page above OR a page below
// let's go below this time since that seems to be the problem
#ifdef _CHECKUNDERFLOW_
#ifdef CHECKUNDERFLOW
// how much to alloc
// . assume sysmalloc returns one byte above a page, so we need
// MEMPAGESIZE-1 bytes to move p up to page boundary, another
@ -2302,7 +2302,7 @@ void freeElecMem ( void *fakeMem ) {
char *label = &s_labels[((uint32_t)h)*16];
int32_t fakeSize = s_sizes[h];
#ifdef _CHECKUNDERFLOW_
#ifdef CHECKUNDERFLOW
char *oldProtMem = cp - MEMPAGESIZE;
#else
char *oldProtMem = cp + fakeSize;

@ -1222,13 +1222,16 @@ bool ipWasBanned ( TcpSocket *ts , const char **msg , Msg13Request *r ) {
// if it is a seed url and there are no links, then perhaps we
// are in a blacklist somewhere already from triggering a spider trap
if ( //isInSeedBuf ( cr , r->ptr_url ) &&
// this is set in XmlDoc.cpp based on hopcount really
r->m_isRootSeedUrl &&
! strstr ( ts->m_readBuf, "<a href" ) ) {
*msg = "root/seed url with no outlinks";
return true;
}
// i've seen this flub on a site where they just return a script
// and it is not banned, so let's remove this until we think
// of something better.
// if ( //isInSeedBuf ( cr , r->ptr_url ) &&
// // this is set in XmlDoc.cpp based on hopcount really
// r->m_isRootSeedUrl &&
// ! strstr ( ts->m_readBuf, "<a href" ) ) {
// *msg = "root/seed url with no outlinks";
// return true;
// }
// TODO: compare a simple checksum of the page content to what

@ -157,6 +157,12 @@ bool Msg20::getSummary ( Msg20Request *req ) {
// do not re-route to twins if accessing an external network
if ( hostdb != &g_hostdb ) req->m_expected = false;
if ( req->m_docId < 0 && ! req->ptr_ubuf ) {
log("msg20: docid<0 and no url for msg20::getsummary");
g_errno = EBADREQUEST;
return true;
}
// get groupId from docId, if positive
uint32_t shardNum;
if ( req->m_docId >= 0 )
@ -398,7 +404,8 @@ void handleRequest20 ( UdpSlot *slot , int32_t netnice ) {
// sanity check, the size include the \0
if ( req->m_collnum < 0 ) {
log("query: Got empty collection in msg20 handler. FIX!");
log("query: Got empty collection in msg20 handler. FIX! "
"from ip=%s port=%i",iptoa(slot->m_ip),(int)slot->m_port);
g_udpServer.sendErrorReply ( slot , ENOTFOUND );
return;
//char *xx =NULL; *xx = 0;

@ -76,7 +76,7 @@ class RdbCache *getDiskPageCache ( char rdbId ) {
rpc = &g_rdbCaches[2];
maxSizePtr = &g_conf.m_clusterdbFileCacheSize;
maxMem = *maxSizePtr;
maxRecs = maxMem / 16;
maxRecs = maxMem / 32;
dbname = "clustcache";
}
if ( rdbId == RDB_TITLEDB ) {

@ -154,6 +154,7 @@ void Msg39::getDocIds ( UdpSlot *slot ) {
int32_t requestSize = m_slot->m_readBufSize;
// ensure it's size is ok
if ( requestSize < 8 ) {
BadReq:
g_errno = EBADREQUESTSIZE;
log(LOG_LOGIC,"query: msg39: getDocIds: %s." ,
mstrerror(g_errno) );
@ -169,7 +170,11 @@ void Msg39::getDocIds ( UdpSlot *slot ) {
m_r->m_buf );
// sanity check
if ( finalSize != requestSize ) {char *xx=NULL;*xx=0; }
if ( finalSize != requestSize ) {
log("msg39: sending bad request.");
goto BadReq;
//char *xx=NULL;*xx=0; }
}
getDocIds2 ( m_r );
}

@ -736,14 +736,6 @@ bool Msg3a::gotAllShardReplies ( ) {
// cast it and set it
m_reply [i] = mr;
m_replyMaxSize[i] = replyMaxSize;
// deserialize it (just sets the ptr_ and size_ member vars)
//mr->deserialize ( );
deserializeMsg ( sizeof(Msg39Reply) ,
&mr->size_docIds,
&mr->size_clusterRecs,
&mr->ptr_docIds,
mr->m_buf );
// sanity check
if ( mr->m_nqt != m_q->getNumTerms() ) {
g_errno = EBADREPLY;
@ -761,6 +753,20 @@ bool Msg3a::gotAllShardReplies ( ) {
mstrerror(g_errno));
return true;
}
// deserialize it (just sets the ptr_ and size_ member vars)
//mr->deserialize ( );
if ( ! deserializeMsg ( sizeof(Msg39Reply) ,
&mr->size_docIds,
&mr->size_clusterRecs,
&mr->ptr_docIds,
mr->m_buf ) ) {
g_errno = ECORRUPTDATA;
m_errno = ECORRUPTDATA;
log("query: msg3a: Shard had error: %s",
mstrerror(g_errno));
return true;
}
// skip down here if reply was already set
//skip:
// add of the total hits from each shard, this is how many
@ -1171,18 +1177,6 @@ bool Msg3a::mergeLists ( ) {
continue;
}
fe2->m_count += fe->m_count;
// also accumulate count of total docs, not just in
// the search results, that have this value for this
// facet
fe2->m_outsideSearchResultsCount +=
fe->m_outsideSearchResultsCount;
// prefer docid kinda randomly to balance
// lookupFacets() load in Msg40.cpp
if ( rand() % 2 )
fe2->m_docId = fe->m_docId;
if ( isFloat ) {
@ -1192,23 +1186,38 @@ bool Msg3a::mergeLists ( ) {
sum2 += sum1;
*((double *)&fe2->m_sum) = sum2;
// and min/max as floats
float min1 = *((float *)&fe ->m_min);
float min2 = *((float *)&fe2->m_min);
if ( min1 < min2 ) min2 = min1;
if ( fe2->m_count==0 || (fe->m_count!=0 && min1 < min2 )) min2 = min1;
*((float *)&fe2->m_min) = min2;
float max1 = *((float *)&fe ->m_max);
float max2 = *((float *)&fe2->m_max);
if ( max1 > max2 ) max2 = max1;
if ( fe2->m_count==0 || (fe->m_count!=0 && max1 > max2 )) max2 = max1;
*((float *)&fe2->m_max) = max2;
}
if ( isInt ) {
fe2->m_sum += fe->m_sum;
if ( fe->m_min < fe2->m_min )
if ( fe2->m_count==0 || (fe->m_count!=0 && fe->m_min < fe2->m_min ))
fe2->m_min = fe->m_min;
if ( fe->m_max > fe2->m_max )
if ( fe2->m_count==0 || (fe->m_count!=0 && fe->m_max > fe2->m_max ))
fe2->m_max = fe->m_max;
}
fe2->m_count += fe->m_count;
// also accumulate count of total docs, not just in
// the search results, that have this value for this
// facet
fe2->m_outsideSearchResultsCount +=
fe->m_outsideSearchResultsCount;
// prefer docid kinda randomly to balance
// lookupFacets() load in Msg40.cpp
if ( rand() % 2 )
fe2->m_docId = fe->m_docId;
}
// now get the next gbfacet: term if there was one

@ -1071,7 +1071,7 @@ bool Msg40::reallocMsg20Buf ( ) {
// . allocate m_buf2 to hold all our Msg20 pointers and Msg20 classes
// . how much mem do we need?
// . need space for the msg20 ptrs
int32_t need = m_msg3a.m_numDocIds * sizeof(Msg20 *);
int64_t need = m_msg3a.m_numDocIds * sizeof(Msg20 *);
// need space for the classes themselves, only if "visible" though
for ( int32_t i = 0 ; i < m_msg3a.m_numDocIds ; i++ )
if ( m_msg3a.m_clusterLevels[i] == CR_OK )
@ -1243,6 +1243,13 @@ bool Msg40::reallocMsg20Buf ( ) {
m_buf2 = NULL;
m_bufMaxSize2 = need;
// if ( need > 2000000000 ) {
// log("msg40: need too much mem=%"INT64,need);
// m_errno = ENOMEM;
// g_errno = ENOMEM;
// return false;
// }
// do the alloc
if ( need ) m_buf2 = (char *)mmalloc ( need ,"Msg40msg20");
if ( need && ! m_buf2 ) { m_errno = g_errno; return false; }

@ -1256,10 +1256,21 @@ bool gotResults ( void *state ) {
// into it, and it must be the SAME ptr too!
CollectionRec *cr = si->m_cr;//g_collectiondb.getRec ( collnum );
if ( ! cr ) { // || cr != si->m_cr ) {
g_errno = ENOCOLLREC;
return sendReply(st,NULL);
g_errno = ENOCOLLREC;
return sendReply(st,NULL);
}
// this causes ooms everywhere, not a good fix
if ( ! msg40->m_msg20 && ! si->m_docIdsOnly && msg40->m_errno ) {
log("msg40: failed to get results q=%s",si->m_q.m_orig);
//g_errno = ENOMEM;
g_errno = msg40->m_errno;
return sendReply(st,NULL);
}
//char *coll = cr->m_coll;
/*
@ -3962,6 +3973,8 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
ix, (int32_t)msg40->getClusterLevel(ix));
int64_t d = msg40->getDocId(ix);
// this is normally a double, but cast to float
float docScore = (float)msg40->getScore(ix);
// do not print if it is a summary dup or had some error
// int32_t level = (int32_t)msg40->getClusterLevel(ix);
@ -5083,6 +5096,7 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
// . docId for possible cached link
// . might have merged a bunch together
sb->safePrintf("\t\t<docId>%"INT64"</docId>\n",mr->m_docId );
sb->safePrintf("\t\t<docScore>%f</docScore>\n",docScore);
}
if ( si->m_format == FORMAT_XML && mr->m_contentType != CT_STATUS ) {
@ -5133,6 +5147,7 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
// . docId for possible cached link
// . might have merged a bunch together
sb->safePrintf("\t\t\"docId\":%"INT64",\n",mr->m_docId );
sb->safePrintf("\t\t\"docScore\":%f,\n",docScore);
}
if ( si->m_format == FORMAT_JSON && mr->m_contentType != CT_STATUS ) {

@ -734,6 +734,18 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
p.safePrintf("<td>%"INT64"</td>",a);
}
p.safePrintf ("</tr>\n<tr class=poo><td><b><nobr>dropped recs</td>" );
for ( int32_t i = 0 ; i < numCaches ; i++ ) {
int64_t a = caches[i]->m_deletes;
p.safePrintf("<td>%"INT64"</td>",a);
}
p.safePrintf ("</tr>\n<tr class=poo><td><b><nobr>added recs</td>" );
for ( int32_t i = 0 ; i < numCaches ; i++ ) {
int64_t a = caches[i]->m_adds;
p.safePrintf("<td>%"INT64"</td>",a);
}
//p.safePrintf ("</tr>\n<tr class=poo><td><b><nobr>max age</td>" );
//for ( int32_t i = 0 ; i < numCaches ; i++ ) {
// int64_t a = caches[i]->getMaxMem();
@ -2150,6 +2162,34 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
p.safePrintf("<td>%"UINT64"</td></tr>\n",total);
p.safePrintf("<tr class=poo><td><b>file cache adds</b></td>");
total = 0;
for ( int32_t i = 0 ; i < nr ; i++ ) {
Rdb *rdb = rdbs[i];
RdbCache *rpc = getDiskPageCache ( rdb->m_rdbId );
if ( ! rpc ) {
p.safePrintf("<td>--</td>");
continue;
}
p.safePrintf("<td>%"UINT64"</td>",rpc->m_adds);
}
p.safePrintf("<td>%"UINT64"</td></tr>\n",total);
p.safePrintf("<tr class=poo><td><b>file cache drops</b></td>");
total = 0;
for ( int32_t i = 0 ; i < nr ; i++ ) {
Rdb *rdb = rdbs[i];
RdbCache *rpc = getDiskPageCache ( rdb->m_rdbId );
if ( ! rpc ) {
p.safePrintf("<td>--</td>");
continue;
}
p.safePrintf("<td>%"UINT64"</td>",rpc->m_deletes);
}
p.safePrintf("<td>%"UINT64"</td></tr>\n",total);
p.safePrintf("<tr class=poo><td><b>file cache used</b></td>");
total = 0;
for ( int32_t i = 0 ; i < nr ; i++ ) {

@ -11288,20 +11288,6 @@ void Parms::init ( ) {
m++;
*/
m->m_title = "verify disk writes";
m->m_desc = "Read what was written in a verification step. Decreases "
"performance, but may help fight disk corruption mostly on "
"Maxtors and Western Digitals.";
m->m_cgi = "vdw";
m->m_off = (char *)&g_conf.m_verifyWrites - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
// this is ifdef'd out in Msg3.cpp for performance reasons,
// so do it here, too
#ifdef GBSANITYCHECK
@ -12434,12 +12420,22 @@ void Parms::init ( ) {
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_API;//PF_HIDDEN | PF_NOSAVE;
*/
m->m_title = "verify disk writes";
m->m_desc = "Read what was written in a verification step. Decreases "
"performance, but may help fight disk corruption mostly on "
"Maxtors and Western Digitals.";
m->m_cgi = "vdw";
m->m_off = (char *)&g_conf.m_verifyWrites - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
*/
m->m_title = "max spider read threads";
m->m_desc = "Maximum number of threads to use per Gigablast process "

@ -918,6 +918,10 @@ bool PosdbTable::allocTopTree ( ) {
, (int32_t)m_r->m_numDocIdSplits
);
// keep it sane
if ( nn > m_r->m_docsToGet * 2 && nn > 60 )
nn = m_r->m_docsToGet * 2;
// this actually sets the # of nodes to MORE than nn!!!
if ( ! m_topTree->setNumNodes(nn,m_r->m_doSiteClustering)) {
log("toptree: toptree: error allocating nodes: %s",
@ -1007,8 +1011,9 @@ bool PosdbTable::allocTopTree ( ) {
continue;
// how big?
int64_t total = m_msg2->m_lists[i].getListSize();
// skip if empty
if ( total == 0 ) {
// skip if empty. no we could be doing a split that is
// empty but other splits are full
if ( total == 0 && m_r->m_numDocIdSplits <= 1 ) {
log("query: empty facets for term #%i",i);
continue;
}
@ -6639,7 +6644,12 @@ void PosdbTable::intersectLists10_r ( ) {
// synbits on it, below!!! or a half stop wiki bigram like
// the term "enough for" in the wiki phrase
// "time enough for love" because we wanna reward that more!
// this halfstopwikibigram bit is set in the individual keys
// so we'd have to at least do a key cleansing, so we can't
// do this shortcut right now... mdw oct 10 2015
if ( nsub == 1 &&
// need it for gbfacet termlists though it seems
(nwpFlags[0] & BF_FACET) &&
!(nwpFlags[0] & BF_SYNONYM) &&
!(nwpFlags[0] & BF_HALFSTOPWIKIBIGRAM) ) {
miniMergedList [j] = nwp [0];
@ -7565,6 +7575,7 @@ void PosdbTable::intersectLists10_r ( ) {
dcs.m_docLang = docLang;
// ensure enough room we can't allocate in a thread!
if ( m_scoreInfoBuf.getAvail()<(int32_t)sizeof(DocIdScore)+1){
goto advance;
char *xx=NULL;*xx=0; }
// if same as last docid, overwrite it since we have a higher
// siterank or langid i guess

@ -1515,15 +1515,16 @@ bool Process::shutdown2 ( ) {
static bool s_printed = false;
// wait for all threads to return
int32_t n = g_threads.getNumThreadsOutOrQueued() ;
//int32_t n = g_threads.getNumThreadsOutOrQueued() ;
int32_t n = g_threads.getNumWriteThreadsOut();
if ( n != 0 && ! m_urgent ) {
log(LOG_INFO,"gb: Has %"INT32" threads out. Waiting for "
log(LOG_INFO,"gb: Has %"INT32" write threads out. Waiting for "
"them to finish.",n);
return false;
}
else if ( ! s_printed && ! m_urgent ) {
s_printed = true;
log(LOG_INFO,"gb: No threads out.");
log(LOG_INFO,"gb: No write threads out.");
}
@ -1687,6 +1688,9 @@ bool Process::shutdown2 ( ) {
if ( g_process.m_threadOut )
log(LOG_INFO,"gb: still has hdtemp thread");
log("gb. EXITING.");
// exit abruptly
exit(0);
@ -1764,7 +1768,7 @@ bool Process::saveRdbTrees ( bool useThread , bool shuttingDown ) {
// no thread if shutting down
if ( shuttingDown ) useThread = false;
// debug note
log("gb: shuttingdown=%i",(int)shuttingDown);
if ( shuttingDown ) log("gb: trying to shutdown");
// turn off statsdb until everyone is done
//g_statsdb.m_disabled = true;
// loop over all Rdbs and save them

@ -543,7 +543,7 @@ bool RdbCache::getRecord ( collnum_t collnum ,
// of the delete head's space i guess.
// i do this for all caches now... what are the downsides? i forget.
//
bool check = false;
bool check = true;//false;
//if ( this == &g_genericCache[SITEQUALITY_CACHEID] ) check = true;
if ( this == g_dns.getCache () ) check = true;
if ( this == g_dns.getCacheLocal () ) check = true;
@ -558,11 +558,11 @@ bool RdbCache::getRecord ( collnum_t collnum ,
//if ( this == &g_tagdb.m_listCache ) check = true;
// the exact count cache...
//if ( this == &g_qtable ) check = true;
if ( m_totalBufSize < 20000 ) check = false;
//if ( m_totalBufSize < 20000 ) check = false;
if ( check ) promoteRecord = false;
// sanity check, do not allow the site quality cache or dns cache to
// be > 128MB, that just does not make sense and it complicates things
if ( check && m_totalBufSize > BUFSIZE ) { char *xx = NULL; *xx = 0; }
//if(check && m_totalBufSize > BUFSIZE ) { char *xx = NULL; *xx = 0; }
// sanity check
if ( m_tail < 0 || m_tail > m_totalBufSize ) {
char *xx = NULL; *xx = 0; }
@ -957,11 +957,13 @@ bool RdbCache::addRecord ( collnum_t collnum ,
m_memOccupied += ( p - start );
// debug msg (MDW)
//log("cache: adding rec @ %"UINT32" size=%"INT32" tail=%"UINT32"",
// i1c,p-start,m_tail);
//log("cache: stored k.n1=%"UINT32" k.n0=%"UINT64" %"INT32" bytes @ %"UINT32" tail=%"UINT32"",
// ((key_t *)cacheKey)->n1,
// ((key_t *)cacheKey)->n0,p-start,i1c,m_tail);
// if ( this == &g_spiderLoop.m_winnerListCache ) {
// log("cache: adding rec @ %"UINT32" size=%i tail=%"INT32"",
// i1c,(int)(p-start),m_tail);
// log("cache: stored k.n1=%"UINT32" k.n0=%"UINT64" %"INT32" bytes @ %"UINT32" tail=%"UINT32"",
// ((key_t *)cacheKey)->n1,
// ((key_t *)cacheKey)->n0,(int)(p-start),i1c,m_tail);
// }
//if ( m_cks == 4 )
// log("stored k=%"XINT32" %"INT32" bytes @ %"UINT32"",
// *(int32_t *)cacheKey,p-start,i);//(uint32_t)start);
@ -1113,8 +1115,10 @@ bool RdbCache::deleteRec ( ) {
//int32_t saved = m_tail;
// debug msg (MDW)
//log("cache: deleting rec @ %"INT32" size=%"INT32"",m_tail,
// dataSize+2+12+4+4);
// if ( this == &g_spiderLoop.m_winnerListCache ) {
// log("cache: deleting rec @ %"INT32" size=%"INT32"",m_tail,
// dataSize+2+12+4+4);
// }
// skip over rest of rec
p += dataSize;
@ -1128,6 +1132,10 @@ bool RdbCache::deleteRec ( ) {
m_tail +(int32_t)sizeof(collnum_t)+m_cks+4>m_totalBufSize){
char *xx = NULL; *xx = 0;}
// if ( this == &g_spiderLoop.m_winnerListCache )
// log("spider: rdbcache: removing tail rec collnum=%i",
// (int)collnum);
// delete key from hash table, iff is for THIS record
// but if it has not already been voided.
// we set key to KEYMAX() in markDeletedRecord()
@ -1167,8 +1175,10 @@ bool RdbCache::deleteRec ( ) {
void RdbCache::markDeletedRecord(char *ptr){
int32_t dataSize = sizeof(collnum_t)+m_cks+sizeof(int32_t);
// debug it
//logf(LOG_DEBUG,"cache: makeDeleteRecord ptr=0x%"XINT32" off=%"INT32"",
// (int32_t)ptr,ptr-m_bufs[0]);
// if ( this == &g_spiderLoop.m_winnerListCache ) {
//logf(LOG_DEBUG,"cache: makeDeleteRec ptr=0x%"PTRFMT" off=%"INT32"",
// (PTRTYPE)ptr,(int32_t)(ptr-m_bufs[0]));
// }
// get dataSize and data
if ( m_fixedDataSize == -1 || m_supportLists ) {
dataSize += 4 + // size

@ -779,7 +779,8 @@ bool RdbDump::doneReadingForVerify ( ) {
// see if what we wrote is the same as what we read back
if ( m_verifyBuf && memcmp(m_verifyBuf,m_buf,m_bytesToWrite) != 0 &&
if ( m_verifyBuf && g_conf.m_verifyWrites &&
memcmp(m_verifyBuf,m_buf,m_bytesToWrite) != 0 &&
! g_errno ) {
log("disk: Write verification of %"INT32" bytes to file %s "
"failed at offset=%"INT64". Retrying.",

@ -3566,4 +3566,32 @@ void RdbList::setFromSafeBuf ( SafeBuf *sb , char rdbId ) {
}
// . point this RdbList directly at an existing serialized record buffer
//   WITHOUT copying it (m_ownData is set false below, so freeList() on
//   this list will not free "p")
// . "p"/"psize" is the raw record buffer; "rdbId" selects that rdb's key
//   size and fixed data size conventions
// . NOTE(review): caller must keep "p" alive for the lifetime of this
//   list since we only borrow the pointer — confirm at call sites
void RdbList::setFromPtr ( char *p , int32_t psize , char rdbId ) {
// free and NULLify any old m_list we had to make room for our new list
freeList();
// set this first since others depend on it
m_ks = getKeySizeFromRdbId ( rdbId );
// set our list parms
m_list = p;
m_listSize = psize;
// alloc ptrs mirror the borrowed buffer; nothing was actually allocated
m_alloc = p;
m_allocSize = psize;
m_listEnd = m_list + m_listSize;
// span the full key range so every rec in "p" is considered in-range
KEYMIN(m_startKey,m_ks);
KEYMAX(m_endKey ,m_ks);
m_fixedDataSize = getDataSizeFromRdbId ( rdbId );
// we do not own the data, so it will not be freed by us
m_ownData = false;//ownData;
m_useHalfKeys = false;//useHalfKeys;
// use this call now to set m_listPtr and m_listPtrHi based on m_list
resetListPtr();
}

@ -107,6 +107,7 @@ class RdbList {
char keySize = sizeof(key_t) );
void setFromSafeBuf ( class SafeBuf *sb , char rdbId );
void setFromPtr ( char *p , int32_t psize , char rdbId ) ;
// just set the start and end keys
//void set ( key_t startKey , key_t endKey );

@ -236,7 +236,7 @@ bool SafeBuf::pushFloat ( float i) {
// hack off trailing 0's
bool SafeBuf::printFloatPretty ( float f ) {
if ( m_length + 20 > m_capacity && ! reserve(20) )
if ( m_length + 40 > m_capacity && ! reserve(40) )
return false;
char *p = m_buf + m_length;

@ -3576,7 +3576,7 @@ bool SpiderColl::evalIpLoop ( ) {
&doleBuf,
&doleBufSize ,
false, // doCopy?
300, // maxAge, 300 seconds
600, // maxAge, 600 seconds
true ,// incCounts
&cachedTimestamp , // rec timestamp
true ); // promote rec?
@ -3584,25 +3584,47 @@ bool SpiderColl::evalIpLoop ( ) {
}
// if ( m_collnum == 18752 ) {
// int32_t coff = 0;
// if ( inCache && doleBufSize >= 4 ) coff = *(int32_t *)doleBuf;
// log("spider: usecache=%i incache=%i dbufsize=%i currentoff=%i "
// "ctime=%i ip=%s"
// ,(int)useCache
// ,(int)inCache
// ,(int)doleBufSize
// ,(int)coff
// ,(int)cachedTimestamp
// ,iptoa(m_scanningIp));
// }
// doleBuf could be NULL i guess...
if ( inCache ) { // && doleBufSize > 0 ) {
if ( g_conf.m_logDebugSpider )
int32_t crc = hash32 ( doleBuf + 4 , doleBufSize - 4 );
if ( g_conf.m_logDebugSpider ) // || m_collnum == 18752 )
log("spider: GOT %"INT32" bytes of SpiderRequests "
"from winnerlistcache for ip %s",doleBufSize,
iptoa(m_scanningIp));
"from winnerlistcache for ip %s ptr=0x%"PTRFMT
" crc=%"UINT32
,doleBufSize,
iptoa(m_scanningIp),
(PTRTYPE)doleBuf,
crc);
// set own to false so it doesn't get freed
// m_doleBuf.setBuf ( doleBuf ,
// doleBufSize ,
// doleBufSize ,
// false , // ownData?
// 0 ); // encoding. doesn't matter.
m_doleBuf.reset();
//m_doleBuf.reset();
// gotta copy it because we end up re-adding part of it
// to rdbcache below
m_doleBuf.safeMemcpy ( doleBuf , doleBufSize );
//m_doleBuf.safeMemcpy ( doleBuf , doleBufSize );
// we no longer re-add to avoid churn. but do not free it
// so do not 'own' it.
SafeBuf sb;
sb.setBuf ( doleBuf, doleBufSize, doleBufSize, false );
// now add the first rec m_doleBuf into doledb's tree
// and re-add the rest back to the cache with the same key.
return addDoleBufIntoDoledb ( true , cachedTimestamp );
return addDoleBufIntoDoledb(&sb,true);//,cachedTimestamp)
}
top:
@ -4718,6 +4740,9 @@ bool SpiderColl::scanListForWinners ( ) {
int32_t maxWinners = (int32_t)MAX_WINNER_NODES; // 40
//if ( ! m_cr->m_isCustomCrawl ) maxWinners = 1;
// if less than 10MB of spiderdb requests limit to 400
if ( m_totalBytesScanned < 10000000 ) maxWinners = 400;
// only put one doledb record into winner tree if
// the list is pretty short. otherwise, we end up caching
// too much. granted, we only cache for about 2 mins.
@ -5225,16 +5250,23 @@ bool SpiderColl::addWinnersIntoDoledb ( ) {
}
// i've seen this happen, wtf?
if ( m_winnerTree.isEmpty() && m_minFutureTimeMS ) {
// this will update the waiting tree key with minFutureTimeMS
addDoleBufIntoDoledb ( NULL , false );
return true;
}
// i am seeing dup uh48's in the m_winnerTree
int32_t firstIp = m_waitingTreeKey.n0 & 0xffffffff;
char dbuf[3*MAX_WINNER_NODES*(8+1)];
char dbuf[147456];//3*MAX_WINNER_NODES*(8+1)];
HashTableX dedup;
int32_t ntn = m_winnerTree.getNumNodes();
dedup.set ( 8,
0,
(int32_t)2*ntn, // # slots to initialize to
dbuf,
(int32_t)(3*MAX_WINNER_NODES*(8+1)),
147456,//(int32_t)(3*MAX_WINNER_NODES*(8+1)),
false,
MAX_NICENESS,
"windt");
@ -5244,7 +5276,14 @@ bool SpiderColl::addWinnersIntoDoledb ( ) {
// make winner tree into doledb list to add
//
///////////
m_doleBuf.reset();
//m_doleBuf.reset();
//m_doleBuf.setLabel("dolbuf");
// first 4 bytes is offset of next doledb record to add to doledb
// so we do not have to re-add the dolebuf to the cache and make it
// churn. it is really inefficient.
SafeBuf doleBuf;
doleBuf.pushLong(4);
int32_t added = 0;
for ( int32_t node = m_winnerTree.getFirstNode() ;
node >= 0 ;
node = m_winnerTree.getNextNode ( node ) ) {
@ -5294,16 +5333,18 @@ bool SpiderColl::addWinnersIntoDoledb ( ) {
log("spider: got dup uh48=%"UINT64" dammit", winUh48);
continue;
}
// count it
added++;
// do not allow dups
dedup.addKey ( &winUh48 );
// store doledb key first
if ( ! m_doleBuf.safeMemcpy ( &doleKey, sizeof(key_t) ) )
if ( ! doleBuf.safeMemcpy ( &doleKey, sizeof(key_t) ) )
hadError = true;
// then size of spiderrequest
if ( ! m_doleBuf.pushLong ( sreq2->getRecSize() ) )
if ( ! doleBuf.pushLong ( sreq2->getRecSize() ) )
hadError = true;
// then the spiderrequest encapsulated
if ( ! m_doleBuf.safeMemcpy ( sreq2 , sreq2->getRecSize() ))
if ( ! doleBuf.safeMemcpy ( sreq2 , sreq2->getRecSize() ))
hadError=true;
// note and error
if ( hadError ) {
@ -5313,11 +5354,52 @@ bool SpiderColl::addWinnersIntoDoledb ( ) {
}
}
return addDoleBufIntoDoledb ( false , 0 );
// log("spider: added %"INT32" doledb recs to cache for cn=%i "
// "dolebufsize=%i",
// added,
// (int)m_collnum,
// (int)doleBuf.length());
return addDoleBufIntoDoledb ( &doleBuf , false );//, 0 );
}
bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
uint32_t cachedTimestamp ) {
// . debug sanity-check of a serialized dole buffer's layout
// . expected format (as read below): first 4 bytes = "jump" offset of the
//   next doledb rec to process, followed by a sequence of
//   [doledb key_t][4-byte spider request size][SpiderRequest bytes]
// . intentionally crashes (null write) on any inconsistency so corruption
//   is caught immediately rather than propagated
// . returns true if the buffer scanned cleanly
bool SpiderColl::validateDoleBuf ( SafeBuf *doleBuf ) {
char *doleBufEnd = doleBuf->getBuf();
// get offset
char *pstart = doleBuf->getBufStart();
char *p = pstart;
int32_t jump = *(int32_t *)p;
p += 4;
// sanity: the jump offset includes its own 4 bytes and must lie in buf
if ( jump < 4 || jump > doleBuf->getLength() ) {
char *xx=NULL;*xx=0; }
// the stored jump must land exactly on one of the record boundaries
bool gotIt = false;
for ( ; p < doleBuf->getBuf() ; ) {
if ( p == pstart + jump )
gotIt = true;
// first is doledbkey
p += sizeof(key_t);
// then size of spider request
int32_t recSize = *(int32_t *)p;
p += 4;
// the spider request encapsulated
SpiderRequest *sreq3;
sreq3 = (SpiderRequest *)p;
// stored size must agree with the request's own notion of its size
if ( recSize != sreq3->getRecSize() ) { char *xx=NULL;*xx=0;}
// point "p" to next spiderrequest
p += recSize;//sreq3->getRecSize();
// sanity: never walk past the end or before the start of the buffer
if ( p > doleBufEnd ) { char *xx=NULL;*xx=0; }
if ( p < pstart ) { char *xx=NULL;*xx=0; }
}
if ( ! gotIt ) { char *xx=NULL;*xx=0; }
return true;
}
bool SpiderColl::addDoleBufIntoDoledb ( SafeBuf *doleBuf, bool isFromCache ) {
// uint32_t cachedTimestamp ) {
//validateDoleBuf ( doleBuf );
////////////////////
//
@ -5387,6 +5469,10 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
// right now.
if ( m_winnerTree.isEmpty() && m_minFutureTimeMS && ! isFromCache ) {
// save memory
m_winnerTree.reset();
m_winnerTable.reset();
// if in the process of being added to doledb or in doledb...
if ( m_doleIpTable.isInTable ( &firstIp ) ) {
// sanity i guess. remove this line if it hits this!
@ -5497,6 +5583,8 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
// how did this happen?
//if ( ! m_msg1Avail ) { char *xx=NULL;*xx=0; }
char *doleBufEnd = doleBuf->getBuf();
// add it to doledb ip table now so that waiting tree does not
// immediately get another spider request from this same ip added
// to it while the msg4 is out. but if add fails we totally bail
@ -5507,36 +5595,50 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
//if ( ! addToDoleTable ( m_bestRequest ) ) return true;
// . MDW: now we have a list of doledb records in a SafeBuf:
// . scan the requests in safebuf
int32_t skipSize = 0;
for ( char *p = m_doleBuf.getBufStart() ; p < m_doleBuf.getBuf() ; ) {
// first is doledbkey
p += sizeof(key_t);
// then size of spider request
p += 4;
// the spider request encapsulated
SpiderRequest *sreq3;
sreq3 = (SpiderRequest *)p;
// point "p" to next spiderrequest
p += sreq3->getRecSize();
// for caching logic below, set this
skipSize = sizeof(key_t) + 4 + sreq3->getRecSize();
// process sreq3 by incrementing the firstip count in
// m_doleIpTable
if ( ! addToDoleTable ( sreq3 ) ) return true;
// only add the top key for now!
break;
// get offset
char *p = doleBuf->getBufStart();
int32_t jump = *(int32_t *)p;
// sanity
if ( jump < 4 || jump > doleBuf->getLength() ) {
char *xx=NULL;*xx=0; }
// the jump includes itself
p += jump;
//for ( ; p < m_doleBuf.getBuf() ; ) {
// save it
char *doledbRec = p;
// first is doledbkey
p += sizeof(key_t);
// then size of spider request
p += 4;
// the spider request encapsulated
SpiderRequest *sreq3;
sreq3 = (SpiderRequest *)p;
// point "p" to next spiderrequest
p += sreq3->getRecSize();
// this logic is now in addToDoleTable()
// . if it was empty it is no longer
// . we have this flag here to avoid scanning empty doledb
// priorities because it saves us a msg5 call to doledb in
// the scanning loop
//int32_t bp = sreq3->m_priority;//m_bestRequest->m_priority;
//if ( bp < 0 ) { char *xx=NULL;*xx=0; }
//if ( bp >= MAX_SPIDER_PRIORITIES ) { char *xx=NULL;*xx=0; }
//m_isDoledbEmpty [ bp ] = 0;
}
// sanity
if ( p > doleBufEnd ) { char *xx=NULL;*xx=0; }
// for caching logic below, set this
int32_t doledbRecSize = sizeof(key_t) + 4 + sreq3->getRecSize();
// process sreq3 by incrementing the firstip count in
// m_doleIpTable
if ( ! addToDoleTable ( sreq3 ) ) return true;
// only add the top key for now!
//break;
// // this logic is now in addToDoleTable()
// // . if it was empty it is no longer
// // . we have this flag here to avoid scanning empty doledb
// // priorities because it saves us a msg5 call to doledb in
// // the scanning loop
// //int32_t bp = sreq3->m_priority;//m_bestRequest->m_priority;
// //if ( bp < 0 ) { char *xx=NULL;*xx=0; }
// //if ( bp >= MAX_SPIDER_PRIORITIES ) { char *xx=NULL;*xx=0; }
// //m_isDoledbEmpty [ bp ] = 0;
// }
// now cache the REST of the spider requests to speed up scanning.
// better than adding 400 recs per firstip to doledb because
@ -5545,20 +5647,25 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
// top rec.
// allow this to add a 0 length record otherwise we keep the same
// old url in here and keep spidering it over and over again!
bool addToCache = false;
if ( skipSize && m_doleBuf.length() - skipSize > 0 ) addToCache =true;
//bool addToCache = false;
//if( skipSize && m_doleBuf.length() - skipSize > 0 ) addToCache =true;
// if winnertree was empty, then we might have scanned like 10M
// twitter.com urls and not wanted any of them, so we don't want to
// have to keep redoing that!
if ( m_doleBuf.length() == 0 && ! isFromCache ) addToCache = true;
//if ( m_doleBuf.length() == 0 && ! isFromCache ) addToCache = true;
RdbCache *wc = &g_spiderLoop.m_winnerListCache;
// remove from cache? if we added the last spider request in the
// cached dolebuf to doledb then remove it from cache so it's not
// a cached empty dolebuf and we recompute it not using the cache.
if ( isFromCache && skipSize && m_doleBuf.length() - skipSize == 0 ) {
if ( addToCache ) { char *xx=NULL;*xx=0; }
if ( isFromCache && p >= doleBufEnd ) {
//if ( addToCache ) { char *xx=NULL;*xx=0; }
// debug note
// if ( m_collnum == 18752 )
// log("spider: rdbcache: adding single byte. skipsize=%i"
// ,doledbRecSize);
// let's get this working right...
//wc->removeKey ( collnum , k , start );
//wc->markDeletedRecord(start);
@ -5579,21 +5686,67 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
//wc->verify();
}
if ( addToCache ) {
// if it wasn't in the cache and it was only one record we
// obviously do not want to add it to the cache.
else if ( p < doleBufEnd ) { // if ( addToCache ) {
key_t cacheKey;
cacheKey.n0 = firstIp;
cacheKey.n1 = 0;
if ( g_conf.m_logDebugSpider )
log("spider: adding %"INT32" bytes of SpiderRequests "
"to winnerlistcache for ip %s",
m_doleBuf.length()-skipSize,iptoa(firstIp));
char *x = doleBuf->getBufStart();
// the new offset is the next record after the one we
// just added to doledb
int32_t newJump = (int32_t)(p - x);
int32_t oldJump = *(int32_t *)x;
// NO! we do a copy in rdbcache and copy the thing over
// since we promote it. so this won't work...
*(int32_t *)x = newJump;
if ( newJump >= doleBuf->getLength() ) { char *xx=NULL;*xx=0;}
if ( newJump < 4 ) { char *xx=NULL;*xx=0;}
if ( g_conf.m_logDebugSpider ) // || m_collnum == 18752 )
log("spider: rdbcache: updating "
"%"INT32" bytes of SpiderRequests "
"to winnerlistcache for ip %s oldjump=%"INT32
" newJump=%"INT32" ptr=0x%"PTRFMT,
doleBuf->length(),iptoa(firstIp),oldJump,
newJump,
(PTRTYPE)x);
//validateDoleBuf ( doleBuf );
//wc->verify();
// inherit timestamp. if 0, RdbCache will set to current time
wc->addRecord ( m_collnum,
(char *)&cacheKey,
m_doleBuf.getBufStart() + skipSize ,
m_doleBuf.length() - skipSize ,
cachedTimestamp );
// don't re-add just use the same modified buffer so we
// don't churn the cache.
// but do add it to cache if not already in there yet.
if ( ! isFromCache ) {
// if ( m_collnum == 18752 )
// log("spider: rdbcache: adding record a new "
// "dbufsize=%i",(int)doleBuf->length());
wc->addRecord ( m_collnum,
(char *)&cacheKey,
doleBuf->getBufStart(),//+ skipSize ,
doleBuf->length() ,//- skipSize ,
0);//cachedTimestamp );
}
//validateDoleBuf( doleBuf );
/*
// test it
char *testPtr;
int32_t testLen;
bool inCache2 = wc->getRecord ( m_collnum ,
(char *)&cacheKey ,
&testPtr,
&testLen,
false, // doCopy?
600, // maxAge,600 secs
true ,// incCounts
NULL , // rec timestamp
true ); // promote?
if ( ! inCache2 ) { char *xx=NULL;*xx=0; }
if ( testLen != m_doleBuf.length() ) {char *xx=NULL;*xx=0; }
if ( *(int32_t *)testPtr != newJump ){char *xx=NULL;*xx=0; }
SafeBuf tmp;
tmp.setBuf ( testPtr , testLen , testLen , false );
validateDoleBuf ( &tmp );
*/
//wc->verify();
}
@ -5631,16 +5784,18 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
// only add one doledb record at a time now since we
// have the winnerListCache
m_doleBuf.setLength ( skipSize );
//m_doleBuf.setLength ( skipSize );
tmpList.setFromSafeBuf ( &m_doleBuf , RDB_DOLEDB );
//tmpList.setFromSafeBuf ( &m_doleBuf , RDB_DOLEDB );
tmpList.setFromPtr ( doledbRec , doledbRecSize , RDB_DOLEDB );
// now that doledb is tree-only and never dumps to disk, just
// add it directly
g_doledb.m_rdb.addList ( m_collnum , &tmpList , MAX_NICENESS );
if ( g_conf.m_logDebugSpider )
log("spider: adding doledb tree node size=%"INT32"",skipSize);
log("spider: adding doledb tree node size=%"INT32"",
doledbRecSize);
// and it happens right away. just add it locally.
@ -5700,6 +5855,12 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
"removed from waiting table",
iptoa(firstIp));
// save memory
m_winnerTree.reset();
m_winnerTable.reset();
//validateDoleBuf( doleBuf );
// add did not block
return status;
}
@ -9989,10 +10150,23 @@ bool sendPage ( State11 *st ) {
// print time format: 7/23/1971 10:45:32
int64_t timems = gettimeofdayInMillisecondsGlobal();
sb.safePrintf("</b> (current time = %"UINT64")(totalcount=%"INT32")"
"(waittablecount=%"INT32")</td></tr>\n",
"(waittablecount=%"INT32")",
timems,
sc->m_waitingTree.getNumUsedNodes(),
sc->m_waitingTable.getNumUsedSlots());
double a = (double)g_spiderdb.getUrlHash48 ( &sc->m_firstKey );
double b = (double)g_spiderdb.getUrlHash48 ( &sc->m_endKey );
double c = (double)g_spiderdb.getUrlHash48 ( &sc->m_nextKey );
double percent = (100.0 * (c-a)) ;
if ( b-a > 0 ) percent /= (b-a);
if ( percent > 100.0 ) percent = 100.0;
if ( percent < 0.0 ) percent = 0.0;
sb.safePrintf("(spiderdb scan for ip %s is %.2f%% complete)",
iptoa(sc->m_scanningIp),
(float)percent );
sb.safePrintf("</td></tr>\n");
sb.safePrintf("<tr bgcolor=#%s>",DARK_BLUE);
sb.safePrintf("<td><b>spidertime (MS)</b></td>\n");
sb.safePrintf("<td><b>firstip</b></td>\n");

@ -1131,7 +1131,7 @@ class SpiderColl {
// doledbkey + dataSize + bestRequestRec
//char m_doleBuf[MAX_DOLEREC_SIZE];
SafeBuf m_doleBuf;
//SafeBuf m_doleBuf;
bool m_isLoading;
@ -1192,7 +1192,9 @@ class SpiderColl {
bool addToDoleTable ( SpiderRequest *sreq ) ;
bool addDoleBufIntoDoledb (bool isFromCache,uint32_t cachedTimestamp);
bool validateDoleBuf ( SafeBuf *doleBuf ) ;
bool addDoleBufIntoDoledb ( SafeBuf *doleBuf , bool isFromCache);
//,uint32_t cachedTimestamp);
bool updateSiteNumInlinksTable ( int32_t siteHash32,int32_t sni,
time_t tstamp); // time_t

@ -431,6 +431,10 @@ int32_t Threads::getNumThreadsOutOrQueued() {
return n;
}
// . convenience wrapper: report how many disk WRITE threads are
//   currently out (launched but not yet done)
// . delegates to the DISK_THREAD queue, the only queue that can
//   have write threads (see ThreadQueue::getNumWriteThreadsOut)
int32_t Threads::getNumWriteThreadsOut() {
	return m_threadQueues[DISK_THREAD].getNumWriteThreadsOut();
}
// . returns false (and may set errno) if failed to launch a thread
// . returns true if thread added to queue successfully
// . may be launched instantly or later depending on # of threads in the queue
@ -554,10 +558,18 @@ static void killStalledFiltersWrapper ( int fd , void *state ) {
// . we put that signal there using sigqeueue() in Threads::exit()
// . this way another thread can be launched right away
int32_t Threads::launchThreads ( ) {
// stop launching threads if trying to exit.
// only launch save tree threads. so if in the middle of saving
// we allow it to complete?
if ( g_process.m_mode == EXIT_MODE )
return 0;
// try launching from each queue
int32_t numLaunched = 0;
// try to launch DISK threads last so cpu-based threads get precedence
for ( int32_t i = m_numQueues - 1 ; i >= 0 ; i-- ) {
// clear g_errno
g_errno = 0;
// launch as many threads as we can from queue #i
@ -870,6 +882,25 @@ int32_t ThreadQueue::getNumThreadsOutOrQueued() {
*/
}
// . count the disk WRITE threads that are in flight right now:
//   occupied slots whose thread has been launched but has not finished
// . a non-disk queue can never have write threads, so it reports zero
int32_t ThreadQueue::getNumWriteThreadsOut () {
	// writes only happen on the disk queue
	if ( m_threadType != DISK_THREAD ) return 0;
	int32_t outCount = 0;
	for ( int32_t j = 0 ; j < m_maxEntries ; j++ ) {
		ThreadEntry *te = &m_entries[j];
		// skip free slots and threads still waiting to launch
		if ( ! te->m_isOccupied || ! te->m_isLaunched ) continue;
		// skip threads that already completed their work
		if ( te->m_isDone ) continue;
		// a disk thread's state is a FileState; count it only
		// if it is present and doing a write (not a read)
		FileState *state = (FileState *)te->m_state;
		if ( state && state->m_doWrite ) outCount++;
	}
	return outCount;
}
// return NULL and set g_errno on error
ThreadEntry *ThreadQueue::addEntry ( int32_t niceness ,
void *state ,
@ -1072,6 +1103,10 @@ ThreadEntry *ThreadQueue::addEntry ( int32_t niceness ,
int32_t Threads::timedCleanUp (int32_t maxTime, int32_t niceness) {
// skip it if exiting
if ( g_process.m_mode == EXIT_MODE )
return 0;
if ( ! m_needsCleanup ) return 0;
//if ( g_inSigHandler ) return 0;
int64_t startTime = gettimeofdayInMillisecondsLocal();
@ -1277,12 +1312,12 @@ bool ThreadQueue::timedCleanUp ( int32_t maxNiceness ) {
"jointid=0x%"XINT64".",
(PTRTYPE)t,(int64_t)t->m_joinTid);
g_threads.returnStack ( t->m_si );
t->m_stack = NULL;
// re-protect this stack
mprotect ( t->m_stack + GUARDSIZE ,
STACK_SIZE - GUARDSIZE,
PROT_NONE );
g_threads.returnStack ( t->m_si );
t->m_stack = NULL;
}
@ -1317,11 +1352,11 @@ bool ThreadQueue::timedCleanUp ( int32_t maxNiceness ) {
"for unknown reason." , pid );
}
//mfree ( t->m_stack , STACK_SIZE , "Threads" );
g_threads.returnStack ( t->m_si );
t->m_stack = NULL;
// re-protect this stack
mprotect ( t->m_stack + GUARDSIZE , STACK_SIZE - GUARDSIZE,
PROT_NONE );
g_threads.returnStack ( t->m_si );
t->m_stack = NULL;
// debug msg
if ( g_conf.m_logDebugThread )
log(LOG_DEBUG,"thread: joined with pid=%"INT32" pid=%"INT32".",
@ -1642,12 +1677,12 @@ bool ThreadQueue::cleanUp ( ThreadEntry *tt , int32_t maxNiceness ) {
"jointid=0x%"XINT64".",
(PTRTYPE)t,(int64_t)t->m_joinTid);
g_threads.returnStack ( t->m_si );
t->m_stack = NULL;
// re-protect this stack
mprotect ( t->m_stack + GUARDSIZE ,
STACK_SIZE - GUARDSIZE,
PROT_NONE );
g_threads.returnStack ( t->m_si );
t->m_stack = NULL;
}
#else
@ -1681,11 +1716,11 @@ bool ThreadQueue::cleanUp ( ThreadEntry *tt , int32_t maxNiceness ) {
"for unknown reason." , pid );
}
//mfree ( t->m_stack , STACK_SIZE , "Threads" );
g_threads.returnStack ( t->m_si );
t->m_stack = NULL;
// re-protect this stack
mprotect ( t->m_stack + GUARDSIZE , STACK_SIZE - GUARDSIZE,
PROT_NONE );
g_threads.returnStack ( t->m_si );
t->m_stack = NULL;
#endif
@ -2738,6 +2773,13 @@ bool ThreadQueue::launchThreadForReals ( ThreadEntry **headPtr ,
// it didn't launch, did it? dec the count.
m_launched--;
// re-protect this stack
mprotect ( t->m_stack + GUARDSIZE , STACK_SIZE - GUARDSIZE,
PROT_NONE );
// RETURN THE STACK
g_threads.returnStack ( t->m_si );
t->m_stack = NULL;
/*
// priority-based LOCAL & GLOBAL launch counts
if ( realNiceness <= 0 ) m_hiLaunched--;

@ -160,6 +160,8 @@ class ThreadQueue {
void reset();
int32_t getNumThreadsOutOrQueued();
int32_t getNumWriteThreadsOut() ;
// . for adding an entry
// . returns false and sets errno on error
@ -297,6 +299,7 @@ class Threads {
bool hasHighPriorityCpuThreads() ;
int32_t getNumThreadsOutOrQueued();
int32_t getNumWriteThreadsOut() ;
// counts the high/low priority (niceness <= 0) threads
//int64_t m_hiLaunched;

@ -16965,9 +16965,8 @@ char **XmlDoc::getHttpReply2 ( ) {
bool isInjecting = getIsInjecting();
if ( ! isInjecting && m_sreqValid && m_sreq.m_hopCount == 0 )
r->m_isRootSeedUrl = 1;
// only if it was a seed for now... so comment out
// if ( ! isInjecting && m_hopCountValid && m_hopCount == 0 )
// r->m_isRootSeedUrl = 1;
if ( ! isInjecting && m_hopCountValid && m_hopCount == 0 )
r->m_isRootSeedUrl = 1;
// sanity check
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
@ -19221,6 +19220,9 @@ char **XmlDoc::getExpandedUtf8Content ( ) {
// <iframe src=""> which ends up embedding the root url.
if ( urlLen == 0 )
continue;
// skip if "about:blank"
if ( urlLen==11 && strncmp(url,"about:blank",11) == 0 )
continue;
// get our current url
//cu = getCurrentUrl();
// set our frame url
@ -21585,12 +21587,13 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
//
// print # of link texts from 2nd coll
//
if ( m_linkInfo2Valid ) {
LinkInfo *info = ptr_linkInfo2;
int32_t nt = 0;
if ( info ) nt = info->getNumLinkTexts();
if ( nt ) sb->safePrintf("goodinlinks2=%"INT32" ",nt );
}
// this is not used for what it was used for.
// if ( m_linkInfo2Valid && size_linkInfo2 > 4 ) {
// LinkInfo *info = ptr_linkInfo2;
// int32_t nt = 0;
// if ( info ) nt = info->getNumLinkTexts();
// if ( nt ) sb->safePrintf("goodinlinks2=%"INT32" ",nt );
// }
if ( m_docIdValid )
sb->safePrintf("docid=%"UINT64" ",m_docId);

@ -18,6 +18,8 @@ static void sleepWrapper ( int fd , void *state ) ;
bool sendPageSEO(TcpSocket *s, HttpRequest *hr) {return true;}
bool g_recoveryMode;
int g_inMemcpy;
int32_t g_recoveryLevel;
static int32_t s_maxNumThreads = 1 ;
static int32_t s_launched = 0;
@ -48,7 +50,7 @@ int main ( int argc , char *argv[] ) {
if ( setrlimit(RLIMIT_CORE,&lim) )
log("blaster::setrlimit: %s", mstrerror(errno) );
g_conf.m_maxMem = 500000000;
//g_conf.m_maxMem = 500000000;
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
@ -57,7 +59,7 @@ int main ( int argc , char *argv[] ) {
// init the memory class after conf since it gets maxMem from Conf
//if ( ! g_mem.init ( 20000000 ) ) {
// log("blaster::Mem init failed" ); return 1; }
g_mem.m_maxMem = 200000000;
//g_mem.m_maxMem = 200000000;
// start up log file
if ( ! g_log.init( "/tmp/blasterLog" ) ) {
log("blaster::Log open /tmp/blasterLog failed" ); return 1; }
@ -449,7 +451,9 @@ bool getWords() {
s_words += '\0';
}
fclose ( fd );
log("blaster: read %"INT32" words, %"INT32" bytes in from dictionary.",
s_windices.length() / sizeof(int32_t), s_words.length());
log("blaster: read %"INT32" words, "
"%"INT32" bytes in from dictionary.",
(int32_t)(s_windices.length() / sizeof(int32_t)),
(int32_t)s_words.length());
return true;
}

@ -2504,7 +2504,7 @@ int32_t deserializeMsg ( int32_t baseSize ,
// make it NULL if size is 0 though
if ( *sizePtr == 0 ) *strPtr = NULL;
// sanity check
if ( *sizePtr < 0 ) { char *xx = NULL; *xx =0; }
if ( *sizePtr < 0 ) { g_errno = ECORRUPTDATA; return -1;}
// advance our destination ptr
p += *sizePtr;
// advance both ptrs to next string

@ -620,6 +620,7 @@ char *serializeMsg2 ( void *thisPtr ,
int32_t *retSize );
// convert offsets back into ptrs
// returns -1 on error
int32_t deserializeMsg ( int32_t baseSize ,
int32_t *firstSizeParm ,
int32_t *lastSizeParm ,

4
qa.cpp

@ -1349,6 +1349,10 @@ bool qaSyntax ( ) {
"format=json&"
"q=");
tmp.urlEncode ( s_q[s_i] );
// get back 100 for debugging better
if ( strcmp(s_q[s_i],"gbssStatusCode:0") == 0 ) {
tmp.safePrintf("&n=100");
}
tmp.nullTerm();
// point to next query
s_i++;