Merge branch 'diffbot-testing' into testing

This commit is contained in:
Matt
2015-10-02 19:26:15 -06:00
19 changed files with 412 additions and 111 deletions

@ -532,6 +532,10 @@ Timedb.o:
HashTableX.o:
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
# Explicit per-object rules: Spider.o and SpiderCache.o are compiled with
# -O2 added after $(CPPFLAGS), using the same recipe as HashTableX.o above.
# NOTE(review): the original note only names getUrlFilterNum2(); presumably
# that is the hot function motivating the optimization -- confirm.
# getUrlFilterNum2()
Spider.o:
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
SpiderCache.o:
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp

@ -21,7 +21,7 @@
// uncomment this for EFENCE to do underflow checks instead of the
// default overflow checks
//#define _CHECKUNDERFLOW_
//#define CHECKUNDERFLOW
// only Mem.cpp can call ::malloc, everyone else must call mmalloc() so
// we can keep tabs on memory usage. in Mem.h we #define this to be coreme()
@ -2168,7 +2168,7 @@ void *getElecMem ( int32_t size ) {
// a page above OR a page below
// let's go below this time since that seems to be the problem
#ifdef _CHECKUNDERFLOW_
#ifdef CHECKUNDERFLOW
// how much to alloc
// . assume sysmalloc returns one byte above a page, so we need
// MEMPAGESIZE-1 bytes to move p up to page boundary, another
@ -2302,7 +2302,7 @@ void freeElecMem ( void *fakeMem ) {
char *label = &s_labels[((uint32_t)h)*16];
int32_t fakeSize = s_sizes[h];
#ifdef _CHECKUNDERFLOW_
#ifdef CHECKUNDERFLOW
char *oldProtMem = cp - MEMPAGESIZE;
#else
char *oldProtMem = cp + fakeSize;

@ -1222,13 +1222,16 @@ bool ipWasBanned ( TcpSocket *ts , const char **msg , Msg13Request *r ) {
// if it is a seed url and there are no links, then perhaps we
// are in a blacklist somewhere already from triggering a spider trap
if ( //isInSeedBuf ( cr , r->ptr_url ) &&
// this is set in XmlDoc.cpp based on hopcount really
r->m_isRootSeedUrl &&
! strstr ( ts->m_readBuf, "<a href" ) ) {
*msg = "root/seed url with no outlinks";
return true;
}
// i've seen this flub on a site where they just return a script
// and it is not banned, so let's remove this until we think
// of something better.
// if ( //isInSeedBuf ( cr , r->ptr_url ) &&
// // this is set in XmlDoc.cpp based on hopcount really
// r->m_isRootSeedUrl &&
// ! strstr ( ts->m_readBuf, "<a href" ) ) {
// *msg = "root/seed url with no outlinks";
// return true;
// }
// TODO: compare a simple checksum of the page content to what

@ -157,6 +157,12 @@ bool Msg20::getSummary ( Msg20Request *req ) {
// do not re-route to twins if accessing an external network
if ( hostdb != &g_hostdb ) req->m_expected = false;
if ( req->m_docId < 0 && ! req->ptr_ubuf ) {
log("msg20: docid<0 and no url for msg20::getsummary");
g_errno = EBADREQUEST;
return true;
}
// get groupId from docId, if positive
uint32_t shardNum;
if ( req->m_docId >= 0 )
@ -398,7 +404,8 @@ void handleRequest20 ( UdpSlot *slot , int32_t netnice ) {
// sanity check, the size include the \0
if ( req->m_collnum < 0 ) {
log("query: Got empty collection in msg20 handler. FIX!");
log("query: Got empty collection in msg20 handler. FIX! "
"from ip=%s port=%i",iptoa(slot->m_ip),(int)slot->m_port);
g_udpServer.sendErrorReply ( slot , ENOTFOUND );
return;
//char *xx =NULL; *xx = 0;

@ -76,7 +76,7 @@ class RdbCache *getDiskPageCache ( char rdbId ) {
rpc = &g_rdbCaches[2];
maxSizePtr = &g_conf.m_clusterdbFileCacheSize;
maxMem = *maxSizePtr;
maxRecs = maxMem / 16;
maxRecs = maxMem / 32;
dbname = "clustcache";
}
if ( rdbId == RDB_TITLEDB ) {

@ -154,6 +154,7 @@ void Msg39::getDocIds ( UdpSlot *slot ) {
int32_t requestSize = m_slot->m_readBufSize;
// ensure its size is ok
if ( requestSize < 8 ) {
BadReq:
g_errno = EBADREQUESTSIZE;
log(LOG_LOGIC,"query: msg39: getDocIds: %s." ,
mstrerror(g_errno) );
@ -169,7 +170,11 @@ void Msg39::getDocIds ( UdpSlot *slot ) {
m_r->m_buf );
// sanity check
if ( finalSize != requestSize ) {char *xx=NULL;*xx=0; }
if ( finalSize != requestSize ) {
log("msg39: sending bad request.");
goto BadReq;
//char *xx=NULL;*xx=0; }
}
getDocIds2 ( m_r );
}

@ -742,14 +742,6 @@ bool Msg3a::gotAllShardReplies ( ) {
// cast it and set it
m_reply [i] = mr;
m_replyMaxSize[i] = replyMaxSize;
// deserialize it (just sets the ptr_ and size_ member vars)
//mr->deserialize ( );
deserializeMsg ( sizeof(Msg39Reply) ,
&mr->size_docIds,
&mr->size_clusterRecs,
&mr->ptr_docIds,
mr->m_buf );
// sanity check
if ( mr->m_nqt != m_q->getNumTerms() ) {
g_errno = EBADREPLY;
@ -767,6 +759,20 @@ bool Msg3a::gotAllShardReplies ( ) {
mstrerror(g_errno));
return true;
}
// deserialize it (just sets the ptr_ and size_ member vars).
// deserializeMsg() reports corrupt data by returning -1, never 0, so
// the failure test has to be "< 0". testing with "!" would only fire on
// a return value of 0 and would let a corrupt reply through.
//mr->deserialize ( );
if ( deserializeMsg ( sizeof(Msg39Reply) ,
		      &mr->size_docIds,
		      &mr->size_clusterRecs,
		      &mr->ptr_docIds,
		      mr->m_buf ) < 0 ) {
	// record the error both globally and on this Msg3a
	g_errno = ECORRUPTDATA;
	m_errno = ECORRUPTDATA;
	log("query: msg3a: Shard had error: %s",
	    mstrerror(g_errno));
	return true;
}
// skip down here if reply was already set
//skip:
// add of the total hits from each shard, this is how many

@ -1071,7 +1071,7 @@ bool Msg40::reallocMsg20Buf ( ) {
// . allocate m_buf2 to hold all our Msg20 pointers and Msg20 classes
// . how much mem do we need?
// . need space for the msg20 ptrs
int32_t need = m_msg3a.m_numDocIds * sizeof(Msg20 *);
int64_t need = m_msg3a.m_numDocIds * sizeof(Msg20 *);
// need space for the classes themselves, only if "visible" though
for ( int32_t i = 0 ; i < m_msg3a.m_numDocIds ; i++ )
if ( m_msg3a.m_clusterLevels[i] == CR_OK )
@ -1243,6 +1243,12 @@ bool Msg40::reallocMsg20Buf ( ) {
m_buf2 = NULL;
m_bufMaxSize2 = need;
// refuse absurdly large allocations up front. nothing in this branch
// has set g_errno, so set it explicitly before copying it to m_errno;
// otherwise we could return false (failure) with an error code of 0.
if ( need > 2000000000 ) {
	log("msg40: need too much mem=%"INT64,need);
	g_errno = ENOMEM;
	m_errno = g_errno;
	return false;
}
// do the alloc
if ( need ) m_buf2 = (char *)mmalloc ( need ,"Msg40msg20");
if ( need && ! m_buf2 ) { m_errno = g_errno; return false; }

@ -1256,10 +1256,18 @@ bool gotResults ( void *state ) {
// into it, and it must be the SAME ptr too!
CollectionRec *cr = si->m_cr;//g_collectiondb.getRec ( collnum );
if ( ! cr ) { // || cr != si->m_cr ) {
g_errno = ENOCOLLREC;
return sendReply(st,NULL);
g_errno = ENOCOLLREC;
return sendReply(st,NULL);
}
if ( ! msg40->m_msg20 && ! si->m_docIdsOnly ) {
log("msg40: failed to get results q=%s",si->m_q.m_orig);
g_errno = ENOMEM;
return sendReply(st,NULL);
}
//char *coll = cr->m_coll;
/*

@ -734,6 +734,18 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
p.safePrintf("<td>%"INT64"</td>",a);
}
p.safePrintf ("</tr>\n<tr class=poo><td><b><nobr>dropped recs</td>" );
for ( int32_t i = 0 ; i < numCaches ; i++ ) {
int64_t a = caches[i]->m_deletes;
p.safePrintf("<td>%"INT64"</td>",a);
}
p.safePrintf ("</tr>\n<tr class=poo><td><b><nobr>added recs</td>" );
for ( int32_t i = 0 ; i < numCaches ; i++ ) {
int64_t a = caches[i]->m_adds;
p.safePrintf("<td>%"INT64"</td>",a);
}
//p.safePrintf ("</tr>\n<tr class=poo><td><b><nobr>max age</td>" );
//for ( int32_t i = 0 ; i < numCaches ; i++ ) {
// int64_t a = caches[i]->getMaxMem();
@ -2150,6 +2162,34 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
p.safePrintf("<td>%"UINT64"</td></tr>\n",total);
// one table row: number of records added to each rdb's disk page
// cache, followed by the sum over all rdbs in the last column
p.safePrintf("<tr class=poo><td><b>file cache adds</b></td>");
total = 0;
for ( int32_t i = 0 ; i < nr ; i++ ) {
	Rdb *rdb = rdbs[i];
	RdbCache *rpc = getDiskPageCache ( rdb->m_rdbId );
	// this rdb has no disk page cache, print a placeholder cell
	if ( ! rpc ) {
		p.safePrintf("<td>--</td>");
		continue;
	}
	// widen explicitly so the vararg matches the UINT64 format
	uint64_t adds = (uint64_t)rpc->m_adds;
	// accumulate, otherwise the total column always shows 0
	total += adds;
	p.safePrintf("<td>%"UINT64"</td>",adds);
}
p.safePrintf("<td>%"UINT64"</td></tr>\n",total);
// one table row: number of records dropped from each rdb's disk page
// cache, followed by the sum over all rdbs in the last column
p.safePrintf("<tr class=poo><td><b>file cache drops</b></td>");
total = 0;
for ( int32_t i = 0 ; i < nr ; i++ ) {
	Rdb *rdb = rdbs[i];
	RdbCache *rpc = getDiskPageCache ( rdb->m_rdbId );
	// this rdb has no disk page cache, print a placeholder cell
	if ( ! rpc ) {
		p.safePrintf("<td>--</td>");
		continue;
	}
	// widen explicitly so the vararg matches the UINT64 format
	uint64_t drops = (uint64_t)rpc->m_deletes;
	// accumulate, otherwise the total column always shows 0
	total += drops;
	p.safePrintf("<td>%"UINT64"</td>",drops);
}
p.safePrintf("<td>%"UINT64"</td></tr>\n",total);
p.safePrintf("<tr class=poo><td><b>file cache used</b></td>");
total = 0;
for ( int32_t i = 0 ; i < nr ; i++ ) {

@ -1687,6 +1687,9 @@ bool Process::shutdown2 ( ) {
if ( g_process.m_threadOut )
log(LOG_INFO,"gb: still has hdtemp thread");
log("gb. EXITING.");
// exit abruptly
exit(0);

@ -543,7 +543,7 @@ bool RdbCache::getRecord ( collnum_t collnum ,
// of the delete head's space i guess.
// i do this for all caches now... what are the downsides? i forget.
//
bool check = false;
bool check = true;//false;
//if ( this == &g_genericCache[SITEQUALITY_CACHEID] ) check = true;
if ( this == g_dns.getCache () ) check = true;
if ( this == g_dns.getCacheLocal () ) check = true;
@ -558,11 +558,11 @@ bool RdbCache::getRecord ( collnum_t collnum ,
//if ( this == &g_tagdb.m_listCache ) check = true;
// the exact count cache...
//if ( this == &g_qtable ) check = true;
if ( m_totalBufSize < 20000 ) check = false;
//if ( m_totalBufSize < 20000 ) check = false;
if ( check ) promoteRecord = false;
// sanity check, do not allow the site quality cache or dns cache to
// be > 128MB, that just does not make sense and it complicates things
if ( check && m_totalBufSize > BUFSIZE ) { char *xx = NULL; *xx = 0; }
//if(check && m_totalBufSize > BUFSIZE ) { char *xx = NULL; *xx = 0; }
// sanity check
if ( m_tail < 0 || m_tail > m_totalBufSize ) {
char *xx = NULL; *xx = 0; }
@ -957,11 +957,13 @@ bool RdbCache::addRecord ( collnum_t collnum ,
m_memOccupied += ( p - start );
// debug msg (MDW)
//log("cache: adding rec @ %"UINT32" size=%"INT32" tail=%"UINT32"",
// i1c,p-start,m_tail);
//log("cache: stored k.n1=%"UINT32" k.n0=%"UINT64" %"INT32" bytes @ %"UINT32" tail=%"UINT32"",
// ((key_t *)cacheKey)->n1,
// ((key_t *)cacheKey)->n0,p-start,i1c,m_tail);
// if ( this == &g_spiderLoop.m_winnerListCache ) {
// log("cache: adding rec @ %"UINT32" size=%i tail=%"INT32"",
// i1c,(int)(p-start),m_tail);
// log("cache: stored k.n1=%"UINT32" k.n0=%"UINT64" %"INT32" bytes @ %"UINT32" tail=%"UINT32"",
// ((key_t *)cacheKey)->n1,
// ((key_t *)cacheKey)->n0,(int)(p-start),i1c,m_tail);
// }
//if ( m_cks == 4 )
// log("stored k=%"XINT32" %"INT32" bytes @ %"UINT32"",
// *(int32_t *)cacheKey,p-start,i);//(uint32_t)start);
@ -1113,8 +1115,10 @@ bool RdbCache::deleteRec ( ) {
//int32_t saved = m_tail;
// debug msg (MDW)
//log("cache: deleting rec @ %"INT32" size=%"INT32"",m_tail,
// dataSize+2+12+4+4);
// if ( this == &g_spiderLoop.m_winnerListCache ) {
// log("cache: deleting rec @ %"INT32" size=%"INT32"",m_tail,
// dataSize+2+12+4+4);
// }
// skip over rest of rec
p += dataSize;
@ -1128,6 +1132,10 @@ bool RdbCache::deleteRec ( ) {
m_tail +(int32_t)sizeof(collnum_t)+m_cks+4>m_totalBufSize){
char *xx = NULL; *xx = 0;}
// if ( this == &g_spiderLoop.m_winnerListCache )
// log("spider: rdbcache: removing tail rec collnum=%i",
// (int)collnum);
// delete key from hash table, iff is for THIS record
// but if it has not already been voided.
// we set key to KEYMAX() in markDeletedRecord()
@ -1167,8 +1175,10 @@ bool RdbCache::deleteRec ( ) {
void RdbCache::markDeletedRecord(char *ptr){
int32_t dataSize = sizeof(collnum_t)+m_cks+sizeof(int32_t);
// debug it
//logf(LOG_DEBUG,"cache: makeDeleteRecord ptr=0x%"XINT32" off=%"INT32"",
// (int32_t)ptr,ptr-m_bufs[0]);
// if ( this == &g_spiderLoop.m_winnerListCache ) {
//logf(LOG_DEBUG,"cache: makeDeleteRec ptr=0x%"PTRFMT" off=%"INT32"",
// (PTRTYPE)ptr,(int32_t)(ptr-m_bufs[0]));
// }
// get dataSize and data
if ( m_fixedDataSize == -1 || m_supportLists ) {
dataSize += 4 + // size

@ -3566,4 +3566,32 @@ void RdbList::setFromSafeBuf ( SafeBuf *sb , char rdbId ) {
}
// . point this list at an existing buffer "p" of "psize" bytes
// . key size and fixed data size are looked up from "rdbId"
// . the buffer is marked as not owned by this list and half keys are off
// . the key range is opened up to [KEYMIN,KEYMAX]
void RdbList::setFromPtr ( char *p , int32_t psize , char rdbId ) {
	// release any list we were holding before adopting the new one
	freeList();

	// per-rdb layout. m_ks must be known before KEYMIN/KEYMAX below
	m_ks            = getKeySizeFromRdbId  ( rdbId );
	m_fixedDataSize = getDataSizeFromRdbId ( rdbId );

	// the list and its "allocation" both span the given buffer exactly
	m_list      = p;
	m_alloc     = p;
	m_listSize  = psize;
	m_allocSize = psize;
	m_listEnd   = m_list + m_listSize;

	// flags: buffer is not ours, and no half-key compression
	m_ownData     = false;
	m_useHalfKeys = false;

	// widest possible key range for this key size
	KEYMIN(m_startKey,m_ks);
	KEYMAX(m_endKey  ,m_ks);

	// lastly, derive m_listPtr and m_listPtrHi from m_list
	resetListPtr();
}

@ -107,6 +107,7 @@ class RdbList {
char keySize = sizeof(key_t) );
void setFromSafeBuf ( class SafeBuf *sb , char rdbId );
void setFromPtr ( char *p , int32_t psize , char rdbId ) ;
// just set the start and end keys
//void set ( key_t startKey , key_t endKey );

@ -3579,7 +3579,7 @@ bool SpiderColl::evalIpLoop ( ) {
&doleBuf,
&doleBufSize ,
false, // doCopy?
300, // maxAge, 300 seconds
600, // maxAge, 600 seconds
true ,// incCounts
&cachedTimestamp , // rec timestamp
true ); // promote rec?
@ -3587,25 +3587,47 @@ bool SpiderColl::evalIpLoop ( ) {
}
// if ( m_collnum == 18752 ) {
// int32_t coff = 0;
// if ( inCache && doleBufSize >= 4 ) coff = *(int32_t *)doleBuf;
// log("spider: usecache=%i incache=%i dbufsize=%i currentoff=%i "
// "ctime=%i ip=%s"
// ,(int)useCache
// ,(int)inCache
// ,(int)doleBufSize
// ,(int)coff
// ,(int)cachedTimestamp
// ,iptoa(m_scanningIp));
// }
// doleBuf could be NULL i guess...
if ( inCache ) { // && doleBufSize > 0 ) {
if ( g_conf.m_logDebugSpider )
int32_t crc = hash32 ( doleBuf + 4 , doleBufSize - 4 );
if ( g_conf.m_logDebugSpider ) // || m_collnum == 18752 )
log("spider: GOT %"INT32" bytes of SpiderRequests "
"from winnerlistcache for ip %s",doleBufSize,
iptoa(m_scanningIp));
"from winnerlistcache for ip %s ptr=0x%"PTRFMT
" crc=%"UINT32
,doleBufSize,
iptoa(m_scanningIp),
(PTRTYPE)doleBuf,
crc);
// set own to false so it doesn't get freed
// m_doleBuf.setBuf ( doleBuf ,
// doleBufSize ,
// doleBufSize ,
// false , // ownData?
// 0 ); // encoding. doesn't matter.
m_doleBuf.reset();
//m_doleBuf.reset();
// gotta copy it because we end up re-adding part of it
// to rdbcache below
m_doleBuf.safeMemcpy ( doleBuf , doleBufSize );
//m_doleBuf.safeMemcpy ( doleBuf , doleBufSize );
// we no longer re-add to avoid churn. but do not free it
// so do not 'own' it.
SafeBuf sb;
sb.setBuf ( doleBuf, doleBufSize, doleBufSize, false );
// now add the first rec m_doleBuf into doledb's tree
// and re-add the rest back to the cache with the same key.
return addDoleBufIntoDoledb ( true , cachedTimestamp );
return addDoleBufIntoDoledb(&sb,true);//,cachedTimestamp)
}
top:
@ -4721,6 +4743,9 @@ bool SpiderColl::scanListForWinners ( ) {
int32_t maxWinners = (int32_t)MAX_WINNER_NODES; // 40
//if ( ! m_cr->m_isCustomCrawl ) maxWinners = 1;
// if less than 10MB of spiderdb requests limit to 400
if ( m_totalBytesScanned < 10000000 ) maxWinners = 400;
// only put one doledb record into winner tree if
// the list is pretty short. otherwise, we end up caching
// too much. granted, we only cache for about 2 mins.
@ -5228,16 +5253,23 @@ bool SpiderColl::addWinnersIntoDoledb ( ) {
}
// i've seen this happen, wtf?
if ( m_winnerTree.isEmpty() && m_minFutureTimeMS ) {
// this will update the waiting tree key with minFutureTimeMS
addDoleBufIntoDoledb ( NULL , false );
return true;
}
// i am seeing dup uh48's in the m_winnerTree
int32_t firstIp = m_waitingTreeKey.n0 & 0xffffffff;
char dbuf[3*MAX_WINNER_NODES*(8+1)];
char dbuf[147456];//3*MAX_WINNER_NODES*(8+1)];
HashTableX dedup;
int32_t ntn = m_winnerTree.getNumNodes();
dedup.set ( 8,
0,
(int32_t)2*ntn, // # slots to initialize to
dbuf,
(int32_t)(3*MAX_WINNER_NODES*(8+1)),
147456,//(int32_t)(3*MAX_WINNER_NODES*(8+1)),
false,
MAX_NICENESS,
"windt");
@ -5247,7 +5279,14 @@ bool SpiderColl::addWinnersIntoDoledb ( ) {
// make winner tree into doledb list to add
//
///////////
m_doleBuf.reset();
//m_doleBuf.reset();
//m_doleBuf.setLabel("dolbuf");
// first 4 bytes is offset of next doledb record to add to doledb
// so we do not have to re-add the dolebuf to the cache and make it
// churn. it is really inefficient.
SafeBuf doleBuf;
doleBuf.pushLong(4);
int32_t added = 0;
for ( int32_t node = m_winnerTree.getFirstNode() ;
node >= 0 ;
node = m_winnerTree.getNextNode ( node ) ) {
@ -5297,16 +5336,18 @@ bool SpiderColl::addWinnersIntoDoledb ( ) {
log("spider: got dup uh48=%"UINT64" dammit", winUh48);
continue;
}
// count it
added++;
// do not allow dups
dedup.addKey ( &winUh48 );
// store doledb key first
if ( ! m_doleBuf.safeMemcpy ( &doleKey, sizeof(key_t) ) )
if ( ! doleBuf.safeMemcpy ( &doleKey, sizeof(key_t) ) )
hadError = true;
// then size of spiderrequest
if ( ! m_doleBuf.pushLong ( sreq2->getRecSize() ) )
if ( ! doleBuf.pushLong ( sreq2->getRecSize() ) )
hadError = true;
// then the spiderrequest encapsulated
if ( ! m_doleBuf.safeMemcpy ( sreq2 , sreq2->getRecSize() ))
if ( ! doleBuf.safeMemcpy ( sreq2 , sreq2->getRecSize() ))
hadError=true;
// note and error
if ( hadError ) {
@ -5316,11 +5357,52 @@ bool SpiderColl::addWinnersIntoDoledb ( ) {
}
}
return addDoleBufIntoDoledb ( false , 0 );
// log("spider: added %"INT32" doledb recs to cache for cn=%i "
// "dolebufsize=%i",
// added,
// (int)m_collnum,
// (int)doleBuf.length());
return addDoleBufIntoDoledb ( &doleBuf , false );//, 0 );
}
bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
uint32_t cachedTimestamp ) {
// . debug helper: walk a "dolebuf" and core dump if its layout is off
// . layout as parsed below: a 4-byte offset ("jump") at the start, then
//   back-to-back records, each one being a doledb key (key_t), a 4-byte
//   record size and then a SpiderRequest of that size
// . the jump must be >= 4, must not exceed the buffer length, and must
//   point exactly at the start of one of the records
// . returns true if everything checks out. any inconsistency triggers the
//   deliberate NULL-write crash this code uses for sanity checks
// . every read is bounds-checked BEFORE it happens so that a truncated
//   buffer dies on a sanity check rather than reading past the end
bool SpiderColl::validateDoleBuf ( SafeBuf *doleBuf ) {
	char *pstart     = doleBuf->getBufStart();
	char *doleBufEnd = doleBuf->getBuf();
	// the 4-byte jump header itself must be present
	if ( doleBuf->getLength() < 4 ) { char *xx=NULL;*xx=0; }
	// get offset
	char *p = pstart;
	int32_t jump = *(int32_t *)p;
	p += 4;
	// sanity
	if ( jump < 4 || jump > doleBuf->getLength() ) {
		char *xx=NULL;*xx=0; }
	// set once we see a record starting exactly at pstart + jump
	bool gotIt = false;
	while ( p < doleBufEnd ) {
		if ( p == pstart + jump )
			gotIt = true;
		// the doledb key plus the size field must fit in what is left
		int64_t left = (int64_t)(doleBufEnd - p);
		if ( left < (int64_t)(sizeof(key_t) + 4) ) {
			char *xx=NULL;*xx=0; }
		// first is doledbkey
		p += sizeof(key_t);
		// then size of spider request
		int32_t recSize = *(int32_t *)p;
		p += 4;
		// the advertised record must lie inside the buffer before we
		// look at it. NOTE(review): this cannot prove the
		// SpiderRequest fields read by getRecSize() fit in recSize
		left = (int64_t)(doleBufEnd - p);
		if ( recSize < 0 || (int64_t)recSize > left ) {
			char *xx=NULL;*xx=0; }
		// the spider request encapsulated
		SpiderRequest *sreq3 = (SpiderRequest *)p;
		// stored size must agree with what the request itself says
		if ( recSize != sreq3->getRecSize() ) { char *xx=NULL;*xx=0;}
		// point "p" to next spiderrequest. given the checks above
		// this can neither pass doleBufEnd nor fall before pstart
		p += recSize;
	}
	// the jump did not land on a record boundary
	if ( ! gotIt ) { char *xx=NULL;*xx=0; }
	return true;
}
bool SpiderColl::addDoleBufIntoDoledb ( SafeBuf *doleBuf, bool isFromCache ) {
// uint32_t cachedTimestamp ) {
//validateDoleBuf ( doleBuf );
////////////////////
//
@ -5390,6 +5472,10 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
// right now.
if ( m_winnerTree.isEmpty() && m_minFutureTimeMS && ! isFromCache ) {
// save memory
m_winnerTree.reset();
m_winnerTable.reset();
// if in the process of being added to doledb or in doledb...
if ( m_doleIpTable.isInTable ( &firstIp ) ) {
// sanity i guess. remove this line if it hits this!
@ -5500,6 +5586,8 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
// how did this happen?
//if ( ! m_msg1Avail ) { char *xx=NULL;*xx=0; }
char *doleBufEnd = doleBuf->getBuf();
// add it to doledb ip table now so that waiting tree does not
// immediately get another spider request from this same ip added
// to it while the msg4 is out. but if add fails we totally bail
@ -5510,36 +5598,50 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
//if ( ! addToDoleTable ( m_bestRequest ) ) return true;
// . MDW: now we have a list of doledb records in a SafeBuf:
// . scan the requests in safebuf
int32_t skipSize = 0;
for ( char *p = m_doleBuf.getBufStart() ; p < m_doleBuf.getBuf() ; ) {
// first is doledbkey
p += sizeof(key_t);
// then size of spider request
p += 4;
// the spider request encapsulated
SpiderRequest *sreq3;
sreq3 = (SpiderRequest *)p;
// point "p" to next spiderrequest
p += sreq3->getRecSize();
// for caching logic below, set this
skipSize = sizeof(key_t) + 4 + sreq3->getRecSize();
// process sreq3 my incrementing the firstip count in
// m_doleIpTable
if ( ! addToDoleTable ( sreq3 ) ) return true;
// only add the top key for now!
break;
// get offset
char *p = doleBuf->getBufStart();
int32_t jump = *(int32_t *)p;
// sanity
if ( jump < 4 || jump > doleBuf->getLength() ) {
char *xx=NULL;*xx=0; }
// the jump includes itself
p += jump;
//for ( ; p < m_doleBuf.getBuf() ; ) {
// save it
char *doledbRec = p;
// first is doledbkey
p += sizeof(key_t);
// then size of spider request
p += 4;
// the spider request encapsulated
SpiderRequest *sreq3;
sreq3 = (SpiderRequest *)p;
// point "p" to next spiderrequest
p += sreq3->getRecSize();
// this logic is now in addToDoleTable()
// . if it was empty it is no longer
// . we have this flag here to avoid scanning empty doledb
// priorities because it saves us a msg5 call to doledb in
// the scanning loop
//int32_t bp = sreq3->m_priority;//m_bestRequest->m_priority;
//if ( bp < 0 ) { char *xx=NULL;*xx=0; }
//if ( bp >= MAX_SPIDER_PRIORITIES ) { char *xx=NULL;*xx=0; }
//m_isDoledbEmpty [ bp ] = 0;
}
// sanity
if ( p > doleBufEnd ) { char *xx=NULL;*xx=0; }
// for caching logic below, set this
int32_t doledbRecSize = sizeof(key_t) + 4 + sreq3->getRecSize();
// process sreq3 by incrementing the firstip count in
// m_doleIpTable
if ( ! addToDoleTable ( sreq3 ) ) return true;
// only add the top key for now!
//break;
// // this logic is now in addToDoleTable()
// // . if it was empty it is no longer
// // . we have this flag here to avoid scanning empty doledb
// // priorities because it saves us a msg5 call to doledb in
// // the scanning loop
// //int32_t bp = sreq3->m_priority;//m_bestRequest->m_priority;
// //if ( bp < 0 ) { char *xx=NULL;*xx=0; }
// //if ( bp >= MAX_SPIDER_PRIORITIES ) { char *xx=NULL;*xx=0; }
// //m_isDoledbEmpty [ bp ] = 0;
// }
// now cache the REST of the spider requests to speed up scanning.
// better than adding 400 recs per firstip to doledb because
@ -5548,20 +5650,25 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
// top rec.
// allow this to add a 0 length record otherwise we keep the same
// old url in here and keep spidering it over and over again!
bool addToCache = false;
if ( skipSize && m_doleBuf.length() - skipSize > 0 ) addToCache =true;
//bool addToCache = false;
//if( skipSize && m_doleBuf.length() - skipSize > 0 ) addToCache =true;
// if winnertree was empty, then we might have scanned like 10M
// twitter.com urls and not wanted any of them, so we don't want to
// have to keep redoing that!
if ( m_doleBuf.length() == 0 && ! isFromCache ) addToCache = true;
//if ( m_doleBuf.length() == 0 && ! isFromCache ) addToCache = true;
RdbCache *wc = &g_spiderLoop.m_winnerListCache;
// remove from cache? if we added the last spider request in the
// cached dolebuf to doledb then remove it from cache so it's not
// a cached empty dolebuf and we recompute it not using the cache.
if ( isFromCache && skipSize && m_doleBuf.length() - skipSize == 0 ) {
if ( addToCache ) { char *xx=NULL;*xx=0; }
if ( isFromCache && p >= doleBufEnd ) {
//if ( addToCache ) { char *xx=NULL;*xx=0; }
// debug note
// if ( m_collnum == 18752 )
// log("spider: rdbcache: adding single byte. skipsize=%i"
// ,doledbRecSize);
// let's get this working right...
//wc->removeKey ( collnum , k , start );
//wc->markDeletedRecord(start);
@ -5582,21 +5689,67 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
//wc->verify();
}
if ( addToCache ) {
// if it wasn't in the cache and it was only one record we
// obviously do not want to add it to the cache.
else if ( p < doleBufEnd ) { // if ( addToCache ) {
key_t cacheKey;
cacheKey.n0 = firstIp;
cacheKey.n1 = 0;
if ( g_conf.m_logDebugSpider )
log("spider: adding %"INT32" bytes of SpiderRequests "
"to winnerlistcache for ip %s",
m_doleBuf.length()-skipSize,iptoa(firstIp));
char *x = doleBuf->getBufStart();
// the new offset is the next record after the one we
// just added to doledb
int32_t newJump = (int32_t)(p - x);
int32_t oldJump = *(int32_t *)x;
// NO! we do a copy in rdbcache and copy the thing over
// since we promote it. so this won't work...
*(int32_t *)x = newJump;
if ( newJump >= doleBuf->getLength() ) { char *xx=NULL;*xx=0;}
if ( newJump < 4 ) { char *xx=NULL;*xx=0;}
if ( g_conf.m_logDebugSpider ) // || m_collnum == 18752 )
log("spider: rdbcache: updating "
"%"INT32" bytes of SpiderRequests "
"to winnerlistcache for ip %s oldjump=%"INT32
" newJump=%"INT32" ptr=0x%"PTRFMT,
doleBuf->length(),iptoa(firstIp),oldJump,
newJump,
(PTRTYPE)x);
//validateDoleBuf ( doleBuf );
//wc->verify();
// inherit timestamp. if 0, RdbCache will set to current time
wc->addRecord ( m_collnum,
(char *)&cacheKey,
m_doleBuf.getBufStart() + skipSize ,
m_doleBuf.length() - skipSize ,
cachedTimestamp );
// don't re-add just use the same modified buffer so we
// don't churn the cache.
// but do add it to cache if not already in there yet.
if ( ! isFromCache ) {
// if ( m_collnum == 18752 )
// log("spider: rdbcache: adding record a new "
// "dbufsize=%i",(int)doleBuf->length());
wc->addRecord ( m_collnum,
(char *)&cacheKey,
doleBuf->getBufStart(),//+ skipSize ,
doleBuf->length() ,//- skipSize ,
0);//cachedTimestamp );
}
//validateDoleBuf( doleBuf );
/*
// test it
char *testPtr;
int32_t testLen;
bool inCache2 = wc->getRecord ( m_collnum ,
(char *)&cacheKey ,
&testPtr,
&testLen,
false, // doCopy?
600, // maxAge,600 secs
true ,// incCounts
NULL , // rec timestamp
true ); // promote?
if ( ! inCache2 ) { char *xx=NULL;*xx=0; }
if ( testLen != m_doleBuf.length() ) {char *xx=NULL;*xx=0; }
if ( *(int32_t *)testPtr != newJump ){char *xx=NULL;*xx=0; }
SafeBuf tmp;
tmp.setBuf ( testPtr , testLen , testLen , false );
validateDoleBuf ( &tmp );
*/
//wc->verify();
}
@ -5634,16 +5787,18 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
// only add one doledb record at a time now since we
// have the winnerListCache
m_doleBuf.setLength ( skipSize );
//m_doleBuf.setLength ( skipSize );
tmpList.setFromSafeBuf ( &m_doleBuf , RDB_DOLEDB );
//tmpList.setFromSafeBuf ( &m_doleBuf , RDB_DOLEDB );
tmpList.setFromPtr ( doledbRec , doledbRecSize , RDB_DOLEDB );
// now that doledb is tree-only and never dumps to disk, just
// add it directly
g_doledb.m_rdb.addList ( m_collnum , &tmpList , MAX_NICENESS );
if ( g_conf.m_logDebugSpider )
log("spider: adding doledb tree node size=%"INT32"",skipSize);
log("spider: adding doledb tree node size=%"INT32"",
doledbRecSize);
// and it happens right away. just add it locally.
@ -5703,6 +5858,12 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
"removed from waiting table",
iptoa(firstIp));
// save memory
m_winnerTree.reset();
m_winnerTable.reset();
//validateDoleBuf( doleBuf );
// add did not block
return status;
}
@ -10011,10 +10172,23 @@ bool sendPage ( State11 *st ) {
// print time format: 7/23/1971 10:45:32
int64_t timems = gettimeofdayInMillisecondsGlobal();
sb.safePrintf("</b> (current time = %"UINT64")(totalcount=%"INT32")"
"(waittablecount=%"INT32")</td></tr>\n",
"(waittablecount=%"INT32")",
timems,
sc->m_waitingTree.getNumUsedNodes(),
sc->m_waitingTable.getNumUsedSlots());
double a = (double)g_spiderdb.getUrlHash48 ( &sc->m_firstKey );
double b = (double)g_spiderdb.getUrlHash48 ( &sc->m_endKey );
double c = (double)g_spiderdb.getUrlHash48 ( &sc->m_nextKey );
double percent = (100.0 * (c-a)) ;
if ( b-a > 0 ) percent /= (b-a);
if ( percent > 100.0 ) percent = 100.0;
if ( percent < 0.0 ) percent = 0.0;
sb.safePrintf("(spiderdb scan for ip %s is %.2f%% complete)",
iptoa(sc->m_scanningIp),
(float)percent );
sb.safePrintf("</td></tr>\n");
sb.safePrintf("<tr bgcolor=#%s>",DARK_BLUE);
sb.safePrintf("<td><b>spidertime (MS)</b></td>\n");
sb.safePrintf("<td><b>firstip</b></td>\n");

@ -1131,7 +1131,7 @@ class SpiderColl {
// doledbkey + dataSize + bestRequestRec
//char m_doleBuf[MAX_DOLEREC_SIZE];
SafeBuf m_doleBuf;
//SafeBuf m_doleBuf;
bool m_isLoading;
@ -1192,7 +1192,9 @@ class SpiderColl {
bool addToDoleTable ( SpiderRequest *sreq ) ;
bool addDoleBufIntoDoledb (bool isFromCache,uint32_t cachedTimestamp);
bool validateDoleBuf ( SafeBuf *doleBuf ) ;
bool addDoleBufIntoDoledb ( SafeBuf *doleBuf , bool isFromCache);
//,uint32_t cachedTimestamp);
bool updateSiteNumInlinksTable ( int32_t siteHash32,int32_t sni,
time_t tstamp); // time_t

@ -16963,9 +16963,8 @@ char **XmlDoc::getHttpReply2 ( ) {
bool isInjecting = getIsInjecting();
if ( ! isInjecting && m_sreqValid && m_sreq.m_hopCount == 0 )
r->m_isRootSeedUrl = 1;
// only if it was a seed for now... so comment out
// if ( ! isInjecting && m_hopCountValid && m_hopCount == 0 )
// r->m_isRootSeedUrl = 1;
if ( ! isInjecting && m_hopCountValid && m_hopCount == 0 )
r->m_isRootSeedUrl = 1;
// sanity check
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
@ -19219,6 +19218,9 @@ char **XmlDoc::getExpandedUtf8Content ( ) {
// <iframe src=""> which ends up embedding the root url.
if ( urlLen == 0 )
continue;
// skip if "about:blank"
if ( urlLen==11 && strncmp(url,"about:blank",11) == 0 )
continue;
// get our current url
//cu = getCurrentUrl();
// set our frame url
@ -21580,12 +21582,13 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
//
// print # of link texts from 2nd coll
//
if ( m_linkInfo2Valid ) {
LinkInfo *info = ptr_linkInfo2;
int32_t nt = 0;
if ( info ) nt = info->getNumLinkTexts();
if ( nt ) sb->safePrintf("goodinlinks2=%"INT32" ",nt );
}
// this is not used for what it was used for.
// if ( m_linkInfo2Valid && size_linkInfo2 > 4 ) {
// LinkInfo *info = ptr_linkInfo2;
// int32_t nt = 0;
// if ( info ) nt = info->getNumLinkTexts();
// if ( nt ) sb->safePrintf("goodinlinks2=%"INT32" ",nt );
// }
if ( m_docIdValid )
sb->safePrintf("docid=%"UINT64" ",m_docId);

@ -2504,7 +2504,7 @@ int32_t deserializeMsg ( int32_t baseSize ,
// make it NULL if size is 0 though
if ( *sizePtr == 0 ) *strPtr = NULL;
// sanity check
if ( *sizePtr < 0 ) { char *xx = NULL; *xx =0; }
if ( *sizePtr < 0 ) { g_errno = ECORRUPTDATA; return -1;}
// advance our destination ptr
p += *sizePtr;
// advance both ptrs to next string

@ -620,6 +620,7 @@ char *serializeMsg2 ( void *thisPtr ,
int32_t *retSize );
// convert offsets back into ptrs
// returns -1 on error
int32_t deserializeMsg ( int32_t baseSize ,
int32_t *firstSizeParm ,
int32_t *lastSizeParm ,