Merge branch 'diffbot-testing'

Matt Wells 2015-02-20 08:18:30 -07:00
commit cc98589da3
8 changed files with 90 additions and 40 deletions

@@ -845,6 +845,9 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
//"<tr class=poo><td><b>Gigablast Version</b></td><td>%s %s</td></tr>\n"
"<tr class=poo><td><b>Parsing Inconsistencies</b></td><td>%"INT32"</td>\n"
"<tr class=poo><td><b>Spiderdb Overflows</b></td><td>%"INT32"</td>\n"
"<tr class=poo><td><b>Index Shards</b></td><td>%"INT32"</td>\n"
"<tr class=poo><td><b>Hosts per Shard</b></td><td>%"INT32"</td>\n"
//"<tr class=poo><td><b>Fully Split</b></td><td>%"INT32"</td>\n"
@@ -874,6 +877,7 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
//GBPROJECTNAME,
//GBVersion ,
g_stats.m_parsingInconsistencies ,
g_stats.m_totalOverflows,
(int32_t)g_hostdb.getNumShards(),//g_hostdb.m_indexSplits,
(int32_t)g_hostdb.getNumHostsPerShard(),
g_spiderLoop.m_lockTable.m_numSlotsUsed,
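
A note on the %"INT32" tokens in these format strings: they are printf format macros in the spirit of <cinttypes>'s PRId32, spliced into the string through literal concatenation. A minimal sketch of the pattern, assuming the project's macro expands to a 32-bit signed conversion spec (the #define below is a stand-in, not the project's actual definition):

#include <cinttypes>
#include <cstdio>

// Stand-in for the project's INT32 printf macro (assumption: it expands to
// the 32-bit signed conversion spec, like PRId32 from <cinttypes>).
#define INT32 PRId32

int main() {
    int32_t numShards     = 8;
    int32_t hostsPerShard = 2;
    // Adjacent string literals concatenate, so the macro splices the
    // conversion spec into the middle of the HTML row.
    printf("<tr><td><b>Index Shards</b></td><td>%" INT32 "</td>\n", numShards);
    printf("<tr><td><b>Hosts per Shard</b></td><td>%" INT32 "</td>\n", hostsPerShard);
    return 0;
}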

@@ -1471,8 +1471,24 @@ bool Process::shutdown2 ( ) {
// turn off statsdb so it does not try to add records for these writes
g_statsdb.m_disabled = true;
if ( g_threads.areThreadsEnabled () ) {
log("gb: disabling threads");
// now disable threads so we don't exit while threads are
// outstanding
g_threads.disableThreads();
}
// wait for all threads to return
int32_t n = g_threads.getNumThreadsOutOrQueued() ;
if ( n != 0 ) {
log(LOG_INFO,"gb: Has %"INT32" threads out. Waiting for "
"them to finish.",n);
return false;
}
// assume we will use threads
bool useThreads = true;
// no, not now that we disabled them
bool useThreads = false;//true;
// if urgent do not allow any further threads to be spawned unless
// they were already queued
@@ -1621,12 +1637,12 @@ bool Process::shutdown2 ( ) {
g_threads.timedCleanUp(0x7fffffff,MAX_NICENESS);
// wait for all threads to complete...
int32_t n = g_threads.getNumThreadsOutOrQueued() ;
//int32_t n = g_threads.getNumThreadsOutOrQueued() ;
//if ( n > 0 )
// return log(LOG_INFO,
// "gb: Waiting for %"INT32" threads to complete.",n);
log(LOG_INFO,"gb: Has %"INT32" threads out.",n);
//log(LOG_INFO,"gb: Has %"INT32" threads out.",n);
//ok, resetAll will close httpServer's socket so now is the time to
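
The net effect of the two shutdown2() hunks above: thread spawning is disabled up front, and instead of merely logging how many threads are still out at the end, shutdown2() now returns false early so the caller retries until the outstanding count drains to zero; useThreads is forced to false because threads have just been disabled. A minimal, self-contained sketch of that disable-then-drain pattern (ThreadPool and shutdownStep are illustrative names, not Gigablast's g_threads API):

#include <atomic>
#include <cstdint>
#include <cstdio>

// Illustrative thread registry, not Gigablast's g_threads.
struct ThreadPool {
    std::atomic<bool>    spawningDisabled{false};
    std::atomic<int32_t> threadsOutOrQueued{0};
    void    disableThreads()       { spawningDisabled = true; }
    int32_t numOutOrQueued() const { return threadsOutOrQueued; }
};

// Returns true when teardown may proceed; false means "call again later",
// mirroring how shutdown2() returns false while threads are still out.
bool shutdownStep(ThreadPool &pool) {
    if (!pool.spawningDisabled) {
        std::printf("gb: disabling threads\n");
        pool.disableThreads();
    }
    int32_t n = pool.numOutOrQueued();
    if (n != 0) {
        std::printf("gb: Has %d threads out. Waiting for them to finish.\n", (int)n);
        return false;   // caller re-enters shutdown on its next pass
    }
    return true;        // no threads outstanding, safe to tear down
}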

Rdb.cpp (58 changed lines)

@@ -3268,25 +3268,7 @@ int32_t Rdb::reclaimMemFromDeletedTreeNodes( int32_t niceness ) {
char *pstart = p;
int32_t marked = 0;
// mark the data of unoccupied nodes somehow
int32_t nn = m_tree.m_minUnusedNode;
for ( int i = 0 ; i < nn ; i++ ) {
QUICKPOLL ( niceness );
// count occupied skip empty nodes in tree
if ( m_tree.m_parents[i] != -2 ) continue;
// get the data
char *data = m_tree.m_data[i];
// skip if somehow null already
if ( ! data ) continue;
// sanity, ensure legit
if ( data < pstart ) { char *xx=NULL;*xx=0; }
// now mark the spiderrequest key as 00000's
memset ( data , 0 , sizeof(SPIDERDBKEY) );
// make it NULL
m_tree.m_data[i] = NULL;
marked++;
}
int32_t occupied = 0;
HashTableX ht;
if (!ht.set ( 4,
@@ -3299,7 +3281,27 @@ int32_t Rdb::reclaimMemFromDeletedTreeNodes( int32_t niceness ) {
true )) // useMagic? yes..
return -1;
int32_t noticed = 0;
// mark the data of unoccupied nodes somehow
int32_t nn = m_tree.m_minUnusedNode;
for ( int i = 0 ; i < nn ; i++ ) {
QUICKPOLL ( niceness );
// count occupied skip empty nodes in tree
if ( m_tree.m_parents[i] == -2 ) {marked++; continue; }
// get data ptr
char *data = m_tree.m_data[i];
// sanity, ensure legit
if ( data < pstart ) { char *xx=NULL;*xx=0; }
// offset
int32_t doff = (int32_t)(data - pstart);
// indicate it is legit
int32_t val = 1;
ht.addKey ( &doff , &val );
occupied++;
}
if ( occupied != m_tree.getNumUsedNodes() ) { char *xx=NULL;*xx=0;}
int32_t skipped = 0;
// the spider requests should be linear in there. so we can scan
// them. then put their offset into a map that maps it to the new
@@ -3309,23 +3311,25 @@ int32_t Rdb::reclaimMemFromDeletedTreeNodes( int32_t niceness ) {
SpiderRequest *sreq = (SpiderRequest *)p;
int32_t oldOffset = p - pstart;
int32_t recSize = sreq->getRecSize();
// if it has been expunged, skip the copy of it
if ( sreq->m_key.n0 == 0LL &&
sreq->m_key.n1 == 0LL ) {
// if not in hash table it was a delete
if ( ! ht.isInTable ( &oldOffset ) ) {
p += recSize;
noticed++;
skipped++;
continue;
}
//
//// re-add with the proper value now
//
// otherwise, copy it over if still in tree
gbmemcpy ( dst , p , recSize );
int32_t newOffset = dst - pstart;
// store in map
// store in map, overwrite old value of 1
ht.addKey ( &oldOffset , &newOffset );
dst += recSize;
p += recSize;
}
if ( noticed != marked ) { char *xx=NULL;*xx=0; }
//if ( skipped != marked ) { char *xx=NULL;*xx=0; }
// sanity
if(ht.getNumSlotsUsed()!=m_tree.m_numUsedNodes){char *xx=NULL;*xx=0;}
@@ -3340,7 +3344,7 @@ int32_t Rdb::reclaimMemFromDeletedTreeNodes( int32_t niceness ) {
if ( reclaimed < 0 ) { char *xx=NULL;*xx=0; }
if ( reclaimed == 0 && marked ) { char *xx=NULL;*xx=0;}
//if ( reclaimed == 0 && marked ) { char *xx=NULL;*xx=0;}
// now update data ptrs in the tree, m_data[]
for ( int i = 0 ; i < nn ; i++ ) {
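
Taken together, the Rdb.cpp hunks above change reclaimMemFromDeletedTreeNodes() from marking deleted SpiderRequests by zeroing their keys to a two-pass scheme: first record the buffer offsets the tree still points at, then compact the buffer by copying only those records forward while building an old-offset to new-offset map that the final loop uses to patch m_data[]. A minimal sketch of that offset-remapping compaction with standard containers (SimpleRec, its fixed size, and the function name are assumptions for illustration, not Gigablast's SpiderRequest layout):

#include <cstdint>
#include <cstring>
#include <unordered_map>
#include <vector>

// Illustrative fixed-size record; Gigablast's SpiderRequest is variable-size.
struct SimpleRec { int64_t key; char payload[24]; };

// Compact 'buf' in place, keeping only records whose offsets appear in
// 'liveOffsets' (offsets the tree still points at). Returns an old->new
// offset map the caller uses to re-point its data pointers.
std::unordered_map<int32_t,int32_t>
compactRecords(std::vector<char> &buf,
               const std::unordered_map<int32_t,bool> &liveOffsets) {
    std::unordered_map<int32_t,int32_t> remap;
    char *pstart = buf.data();
    char *p      = pstart;
    char *pend   = pstart + buf.size();
    char *dst    = pstart;
    while (p < pend) {
        int32_t oldOffset = (int32_t)(p - pstart);
        int32_t recSize   = (int32_t)sizeof(SimpleRec);
        // not referenced by the tree -> reclaim it by skipping the copy
        if (liveOffsets.find(oldOffset) == liveOffsets.end()) {
            p += recSize;
            continue;
        }
        std::memmove(dst, p, recSize);               // copy survivor forward
        remap[oldOffset] = (int32_t)(dst - pstart);  // old -> new offset
        dst += recSize;
        p   += recSize;
    }
    buf.resize((size_t)(dst - pstart));              // reclaimed tail space
    return remap;
}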

@@ -4368,10 +4368,12 @@ bool SpiderColl::scanListForWinners ( ) {
int32_t maxWinners = (int32_t)MAX_WINNER_NODES; // 40
//if ( ! m_cr->m_isCustomCrawl ) maxWinners = 1;
// only put 40 urls from the same firstIp into doledb if
// we have a lot of urls in our spiderdb already.
// only put one doledb record into winner tree if
// the list is pretty short. otherwise, we end up caching
// too much. granted, we only cache for about 2 mins.
// mdw: for testing take this out!
//if ( m_totalBytesScanned < 200000 ) maxWinners = 1;
if ( m_totalBytesScanned < 25000 ) maxWinners = 1;
// sanity. make sure read is somewhat hefty for our
// maxWinners=1 thing
if ( (int32_t)SR_READ_SIZE < 500000 ) { char *xx=NULL;*xx=0; }
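
The intent of the hunk above is a throttle: when only a small slice of spiderdb has been scanned for this firstIp, cap the winner tree at a single entry so doledb is not filled with candidates that will be re-evaluated shortly anyway; a hefty read (hence the SR_READ_SIZE sanity check) justifies the full MAX_WINNER_NODES. A small sketch of that decision, using the 25000-byte threshold from the diff (maxWinnersCap is an illustrative name):

#include <cstdint>

// Small scans dole out a single winner per firstIp; hefty scans may use the
// full cap (MAX_WINNER_NODES, 40 per the comment in the diff).
int32_t maxWinnersCap(int64_t totalBytesScanned, int32_t maxWinnerNodes) {
    if (totalBytesScanned < 25000)
        return 1;
    return maxWinnerNodes;
}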
@@ -8880,10 +8882,12 @@ bool sendPage ( State11 *st ) {
"<tr>"
//"<td bgcolor=#ff6666>"
"<td>"
"For collection <i>%s</i>: "
"<b><font color=red>%s</font></b>"
"</td>"
"</tr>"
"</table>\n"
, cr->m_coll
, mb.getBufStart() );
@@ -11267,6 +11271,7 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
goto gotOne;
}
// check for ".css?" substring
// these two suck up a lot of time:
special = strstr(url,".css?");
if ( special ) goto gotOne;
special = strstr(url,"/print/");

@@ -238,6 +238,7 @@ bool buildProxyTable ( ) {
}
redo:
int32_t removed = 0;
// scan all SpiderProxies in tmptab
for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
// skip empty buckets in hashtable s_iptab
@@ -246,12 +247,18 @@ bool buildProxyTable ( ) {
int64_t key = *(int64_t *)s_iptab.getKey(i);
// must also exist in tmptab, otherwise it got removed by user
if ( tmptab.isInTable ( &key ) ) continue;
// skip if not in table
if ( s_iptab.getSlot ( &key ) < 0 ) {
log("sproxy: iptable hashing messed up");
continue;
}
// shoot, it got removed. not in the new list of ip:ports
s_iptab.removeKey ( &key );
removed++;
// hashtable is messed up now, start over
goto redo;
//goto redo;
}
if ( removed ) goto redo;
return true;
}
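
The buildProxyTable() change above addresses a remove-while-iterating hazard: removeKey() can shuffle slots in the open-addressing table, so the old code restarted the whole scan after every removal (goto redo inside the loop), which with duplicate keys could spin forever. The new code counts removals during the pass and restarts once afterward. A minimal sketch of the same defer-the-restart idea on standard containers (pruneMissing and the key types are illustrative):

#include <cstdint>
#include <unordered_map>
#include <unordered_set>

// Remove every entry of 'current' whose key is absent from 'wanted'.
// The scan restarts only after a full pass that removed something,
// mirroring the diff's "removed++ ... if (removed) goto redo" structure.
void pruneMissing(std::unordered_map<int64_t, int32_t> &current,
                  const std::unordered_set<int64_t> &wanted) {
redo:
    int32_t removed = 0;
    for (auto it = current.begin(); it != current.end(); ) {
        if (wanted.count(it->first)) { ++it; continue; }
        it = current.erase(it);   // erase() returns the next valid iterator
        removed++;
    }
    if (removed)
        goto redo;                // one restart per pass, as in the new code
}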
@@ -296,14 +303,18 @@ bool loadSpiderProxyStats ( ) {
initProxyTables();
// save hashtable
s_proxyBannedTable.load(g_hostdb.m_dir,"proxybantable.dat");
// take this out for now since i was seeing dups in s_iptab for
// some reason. was causing an infinite loop bug calling goto redo:
// all the time above.
s_banCountTable.load(g_hostdb.m_dir,"proxybancounttable.dat");
// save hashtable
//s_proxyBannedTable.load(g_hostdb.m_dir,"proxybantable.dat");
//s_banCountTable.load(g_hostdb.m_dir,"proxybancounttable.dat");
// save hash table. this also returns false if does not exist.
if ( ! s_iptab.load(g_hostdb.m_dir,"spiderproxystats.dat") )
return false;
//if ( ! s_iptab.load(g_hostdb.m_dir,"spiderproxystats.dat") )
// return false;
// unset some flags
for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {

@@ -196,6 +196,8 @@ class Stats {
int32_t m_parsingInconsistencies;
int32_t m_totalOverflows;
// count ip and domain hammer for Msg13.cpp here
//int32_t m_numBackoffs;

@@ -2370,6 +2370,10 @@ bool XmlDoc::indexDoc ( ) {
if ( g_errno == ENOMEM )
return true;
// and do not add spider reply if shutting down the server
if ( g_errno == ESHUTTINGDOWN )
return true;
// if docid not found when trying to do a query reindex...
// this really shouldn't happen but i think we were adding
// additional SpiderRequests since we were using a fake first ip.
@@ -25191,6 +25195,7 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
// more than 500MB worth.
if ( sc && sc->isFirstIpInOverflowList ( firstIp ) ) {
m_linkOverflows++;
g_stats.m_totalOverflows++;
continue;
}

qa.cpp (3 changed lines)

@@ -183,6 +183,9 @@ void processReply ( char *reply , int32_t replyLen ) {
markOut ( content , "spider is done (");
markOut ( content , "spider is paused (");
// 3 Collections etc.
markOut ( content , "/rocket.jpg></div></a></center><br><br><div style=\"width:190px;padding:4px;margin-left:10px;background-color:white;border-top-left-radius:10px;border-bottom-left-radius:10px;border-color:blue;border-width:3px;border-style:solid;margin-right:-3px;border-right-color:white;overflow-y:auto;overflow-x:hidden;line-height:23px;color:black;\"><center><nobr><b>" );
// until i figure this one out, take it out
markOut ( content , "<hits>");