Merge branch 'diffbot-testing'
commit cc98589da3
@@ -845,6 +845,9 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
//"<tr class=poo><td><b>Gigablast Version</b></td><td>%s %s</td></tr>\n"
"<tr class=poo><td><b>Parsing Inconsistencies</b></td><td>%"INT32"</td>\n"

"<tr class=poo><td><b>Spiderdb Overflows</b></td><td>%"INT32"</td>\n"

"<tr class=poo><td><b>Index Shards</b></td><td>%"INT32"</td>\n"
"<tr class=poo><td><b>Hosts per Shard</b></td><td>%"INT32"</td>\n"
//"<tr class=poo><td><b>Fully Split</b></td><td>%"INT32"</td>\n"
@@ -874,6 +877,7 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
//GBPROJECTNAME,
//GBVersion ,
g_stats.m_parsingInconsistencies ,
g_stats.m_totalOverflows,
(int32_t)g_hostdb.getNumShards(),//g_hostdb.m_indexSplits,
(int32_t)g_hostdb.getNumHostsPerShard(),
g_spiderLoop.m_lockTable.m_numSlotsUsed,
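The %"INT32" tokens above are Gigablast's portable printf format macros: INT32 expands to a conversion suffix and the surrounding string literals concatenate around it, the same trick C99 standardizes as PRId32. A minimal sketch of the technique using the standard macro (the HTML row here is illustrative, not Gigablast's):

    #include <cinttypes>
    #include <cstdio>

    int main() {
        int32_t shards = 8;
        // adjacent string literals concatenate around the macro,
        // exactly like "...%"INT32"..." in the diff above
        printf("<tr><td><b>Index Shards</b></td><td>%" PRId32 "</td></tr>\n",
               shards);
        return 0;
    }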
Process.cpp (22 lines changed)
@@ -1471,8 +1471,24 @@ bool Process::shutdown2 ( ) {
// turn off statsdb so it does not try to add records for these writes
g_statsdb.m_disabled = true;

if ( g_threads.areThreadsEnabled () ) {
log("gb: disabling threads");
// now disable threads so we don't exit while threads are
// outstanding
g_threads.disableThreads();
}

// wait for all threads to return
int32_t n = g_threads.getNumThreadsOutOrQueued() ;
if ( n != 0 ) {
log(LOG_INFO,"gb: Has %"INT32" threads out. Waiting for "
"them to finish.",n);
return false;
}

// assume we will use threads
bool useThreads = true;
// no, not now that we disabled them
bool useThreads = false;//true;

// if urgent do not allow any further threads to be spawned unless
// they were already queued
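The added block turns shutdown2() into a re-entrant drain loop: thread spawning is disabled so nothing new starts, and the function returns false while workers are still outstanding so the caller can simply invoke it again until it reports done. A self-contained sketch of that pattern, with a hypothetical ThreadPool standing in for g_threads (names illustrative only):

    #include <atomic>
    #include <cstdio>

    // Hypothetical stand-in for g_threads.
    struct ThreadPool {
        std::atomic<bool> enabled{true};
        std::atomic<int>  outstanding{0};
        void disable() { enabled = false; } // no new threads may spawn
    };

    ThreadPool g_pool;

    // Returns false while threads are still draining; the main loop keeps
    // calling shutdown2() until it finally returns true.
    bool shutdown2() {
        if (g_pool.enabled) {
            printf("gb: disabling threads\n");
            g_pool.disable();
        }
        int n = g_pool.outstanding.load();
        if (n != 0) {
            printf("gb: Has %d threads out. Waiting for them to finish.\n", n);
            return false; // try again on the next pass
        }
        return true; // fully drained; safe to finish shutting down
    }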
@@ -1621,12 +1637,12 @@ bool Process::shutdown2 ( ) {
g_threads.timedCleanUp(0x7fffffff,MAX_NICENESS);

// wait for all threads to complete...
int32_t n = g_threads.getNumThreadsOutOrQueued() ;
//int32_t n = g_threads.getNumThreadsOutOrQueued() ;
//if ( n > 0 )
// return log(LOG_INFO,
// "gb: Waiting for %"INT32" threads to complete.",n);

log(LOG_INFO,"gb: Has %"INT32" threads out.",n);
//log(LOG_INFO,"gb: Has %"INT32" threads out.",n);


//ok, resetAll will close httpServer's socket so now is the time to
Rdb.cpp (58 lines changed)
@@ -3268,25 +3268,7 @@ int32_t Rdb::reclaimMemFromDeletedTreeNodes( int32_t niceness ) {
char *pstart = p;

int32_t marked = 0;

// mark the data of unoccupied nodes somehow
int32_t nn = m_tree.m_minUnusedNode;
for ( int i = 0 ; i < nn ; i++ ) {
QUICKPOLL ( niceness );
// count occupied skip empty nodes in tree
if ( m_tree.m_parents[i] != -2 ) continue;
// get the data
char *data = m_tree.m_data[i];
// skip if somehow null already
if ( ! data ) continue;
// sanity, ensure legit
if ( data < pstart ) { char *xx=NULL;*xx=0; }
// now mark the spiderrequest key as 00000's
memset ( data , 0 , sizeof(SPIDERDBKEY) );
// make it NULL
m_tree.m_data[i] = NULL;
marked++;
}
int32_t occupied = 0;

HashTableX ht;
if (!ht.set ( 4,
@@ -3299,7 +3281,27 @@ int32_t Rdb::reclaimMemFromDeletedTreeNodes( int32_t niceness ) {
true )) // useMagic? yes..
return -1;

int32_t noticed = 0;
// mark the data of unoccupied nodes somehow
int32_t nn = m_tree.m_minUnusedNode;
for ( int i = 0 ; i < nn ; i++ ) {
QUICKPOLL ( niceness );
// count occupied skip empty nodes in tree
if ( m_tree.m_parents[i] == -2 ) {marked++; continue; }
// get data ptr
char *data = m_tree.m_data[i];
// sanity, ensure legit
if ( data < pstart ) { char *xx=NULL;*xx=0; }
// offset
int32_t doff = (int32_t)(data - pstart);
// indicate it is legit
int32_t val = 1;
ht.addKey ( &doff , &val );
occupied++;
}

if ( occupied != m_tree.getNumUsedNodes() ) { char *xx=NULL;*xx=0;}

int32_t skipped = 0;

// the spider requests should be linear in there. so we can scan
// them. then put their offset into a map that maps it to the new
@@ -3309,23 +3311,25 @@ int32_t Rdb::reclaimMemFromDeletedTreeNodes( int32_t niceness ) {
SpiderRequest *sreq = (SpiderRequest *)p;
int32_t oldOffset = p - pstart;
int32_t recSize = sreq->getRecSize();
// if it has been expunged, skip the copy of it
if ( sreq->m_key.n0 == 0LL &&
sreq->m_key.n1 == 0LL ) {
// if not in hash table it was a delete
if ( ! ht.isInTable ( &oldOffset ) ) {
p += recSize;
noticed++;
skipped++;
continue;
}
//
//// re -add with the proper value now
//
// otherwise, copy it over if still in tree
gbmemcpy ( dst , p , recSize );
int32_t newOffset = dst - pstart;
// store in map
// store in map, overwrite old value of 1
ht.addKey ( &oldOffset , &newOffset );
dst += recSize;
p += recSize;
}

if ( noticed != marked ) { char *xx=NULL;*xx=0; }
//if ( skipped != marked ) { char *xx=NULL;*xx=0; }

// sanity
if(ht.getNumSlotsUsed()!=m_tree.m_numUsedNodes){char *xx=NULL;*xx=0;}
@@ -3340,7 +3344,7 @@ int32_t Rdb::reclaimMemFromDeletedTreeNodes( int32_t niceness ) {
if ( reclaimed < 0 ) { char *xx=NULL;*xx=0; }

if ( reclaimed == 0 && marked ) { char *xx=NULL;*xx=0;}
//if ( reclaimed == 0 && marked ) { char *xx=NULL;*xx=0;}

// now update data ptrs in the tree, m_data[]
for ( int i = 0 ; i < nn ; i++ ) {
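Taken together, the Rdb.cpp hunks are a mark-and-compact pass: deleted nodes get their SPIDERDBKEY memset to zero, live nodes record their buffer offsets in a hash table, surviving records are slid down over the holes, and the same table then maps each old offset to its new one so m_data[] can be patched. A compact sketch of the offset-remapping idea, using std::unordered_map in place of HashTableX (all names here are illustrative, not Gigablast's):

    #include <cstring>
    #include <unordered_map>
    #include <vector>

    // Compact variable-size records in buf, dropping records whose first
    // byte is 0 (the "expunged" mark), and return oldOffset -> newOffset
    // for the survivors so external pointers can be rebased.
    std::unordered_map<int, int> compact(std::vector<char> &buf,
                                         const std::vector<int> &offsets,
                                         const std::vector<int> &sizes) {
        std::unordered_map<int, int> remap;
        int dst = 0;
        for (size_t i = 0; i < offsets.size(); i++) {
            int src = offsets[i], sz = sizes[i];
            if (buf[src] == 0) continue;       // marked deleted: reclaim it
            std::memmove(&buf[dst], &buf[src], sz); // slide record down
            remap[src] = dst;                  // remember where it moved
            dst += sz;
        }
        buf.resize(dst);                       // reclaimed tail memory
        return remap;
    }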
Spider.cpp (11 lines changed)
@@ -4368,10 +4368,12 @@ bool SpiderColl::scanListForWinners ( ) {
int32_t maxWinners = (int32_t)MAX_WINNER_NODES; // 40
//if ( ! m_cr->m_isCustomCrawl ) maxWinners = 1;

// only put 40 urls from the same firstIp into doledb if
// we have a lot of urls in our spiderdb already.
// only put one doledb record into winner tree if
// the list is pretty short. otherwise, we end up caching
// too much. granted, we only cache for about 2 mins.
// mdw: for testing take this out!
//if ( m_totalBytesScanned < 200000 ) maxWinners = 1;
if ( m_totalBytesScanned < 25000 ) maxWinners = 1;

// sanity. make sure read is somewhat hefty for our
// maxWinners=1 thing
if ( (int32_t)SR_READ_SIZE < 500000 ) { char *xx=NULL;*xx=0; }
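The heuristic above scales the winner count to the size of the spiderdb read: a scan under 25000 bytes implies few candidate urls, so keeping a single winner avoids caching up to 40 doledb records for the roughly two-minute cache window. A toy restatement of that thresholding (the constants come from the diff; the function name is hypothetical):

    #include <cstdint>

    enum { MAX_WINNER_NODES = 40 }; // cap noted in the diff's comment

    // Decide how many doledb "winner" records to keep for one firstIp.
    int32_t maxWinnersFor(int32_t totalBytesScanned) {
        // a short spiderdb read implies few candidates; keep one winner
        // so we don't cache 40 entries for ~2 minutes on thin data
        if (totalBytesScanned < 25000) return 1;
        return MAX_WINNER_NODES;
    }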
@@ -8880,10 +8882,12 @@ bool sendPage ( State11 *st ) {
"<tr>"
//"<td bgcolor=#ff6666>"
"<td>"
"For collection <i>%s</i>: "
"<b><font color=red>%s</font></b>"
"</td>"
"</tr>"
"</table>\n"
, cr->m_coll
, mb.getBufStart() );
@@ -11267,6 +11271,7 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
goto gotOne;
}
// check for ".css?" substring
// these two suck up a lot of time:
special = strstr(url,".css?");
if ( special ) goto gotOne;
special = strstr(url,"/print/");
@@ -238,6 +238,7 @@ bool buildProxyTable ( ) {
}

redo:
int32_t removed = 0;
// scan all SpiderProxies in tmptab
for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
// skip empty buckets in hashtable s_iptab
@@ -246,12 +247,18 @@ bool buildProxyTable ( ) {
int64_t key = *(int64_t *)s_iptab.getKey(i);
// must also exist in tmptab, otherwise it got removed by user
if ( tmptab.isInTable ( &key ) ) continue;
// skip if not in table
if ( s_iptab.getSlot ( &key ) < 0 ) {
log("sproxy: iptable hashing messed up");
continue;
}
// shoot, it got removed. not in the new list of ip:ports
s_iptab.removeKey ( &key );
removed++;
// hashtable is messed up now, start over
goto redo;
//goto redo;
}

if ( removed ) goto redo;
return true;
}
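The buildProxyTable() change addresses delete-during-scan: removing a key can reshuffle an open-addressing table and invalidate the loop, and the old per-removal restart also fed the infinite-loop bug mentioned in the next hunk's comments. The new shape finishes the pass, then jumps back to redo: once if anything was removed. A sketch of that pass-then-retry control flow over a toy slot array (the real relocation hazard lives in Gigablast's HashTableX; here the retry just mirrors the structure):

    #include <cstdint>
    #include <vector>

    // Toy slot table: 0 marks an empty bucket. In a real open-addressing
    // table, removing a key may relocate others, so the diff completes the
    // pass and retries rather than restarting on every removeKey.
    bool pruneSlots(std::vector<int64_t> &slots,
                    const std::vector<int64_t> &wanted) {
    redo:
        int removed = 0;
        for (size_t i = 0; i < slots.size(); i++) {
            if (slots[i] == 0) continue;        // skip empty buckets
            bool keep = false;
            for (int64_t w : wanted) if (w == slots[i]) keep = true;
            if (keep) continue;
            slots[i] = 0;                       // not in the new ip:port list
            removed++;
            // no per-removal restart here (the commented-out goto redo)
        }
        if (removed) goto redo;                 // one retry per full pass
        return true;
    }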
@@ -296,14 +303,18 @@ bool loadSpiderProxyStats ( ) {
initProxyTables();

// save hashtable
s_proxyBannedTable.load(g_hostdb.m_dir,"proxybantable.dat");
// take this out for now since i was seeing dups in s_iptab for
// some reason. was causing an infinite loop bug calling goto redo:
// all the time above.

s_banCountTable.load(g_hostdb.m_dir,"proxybancounttable.dat");
// save hashtable
//s_proxyBannedTable.load(g_hostdb.m_dir,"proxybantable.dat");

//s_banCountTable.load(g_hostdb.m_dir,"proxybancounttable.dat");

// save hash table. this also returns false if does not exist.
if ( ! s_iptab.load(g_hostdb.m_dir,"spiderproxystats.dat") )
return false;
//if ( ! s_iptab.load(g_hostdb.m_dir,"spiderproxystats.dat") )
// return false;

// unset some flags
for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
Stats.h (2 lines changed)
@@ -196,6 +196,8 @@ class Stats {
int32_t m_parsingInconsistencies;

int32_t m_totalOverflows;

// count ip and domain hammer for Msg13.cpp here
//int32_t m_numBackoffs;
@@ -2370,6 +2370,10 @@ bool XmlDoc::indexDoc ( ) {
if ( g_errno == ENOMEM )
return true;

// and do not add spider reply if shutting down the server
if ( g_errno == ESHUTTINGDOWN )
return true;

// if docid not found when trying to do a query reindex...
// this really shouldn't happen but i think we were adding
// additional SpiderRequests since we were using a fake first ip.
@@ -25191,6 +25195,7 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
// more than 500MB worth.
if ( sc && sc->isFirstIpInOverflowList ( firstIp ) ) {
m_linkOverflows++;
g_stats.m_totalOverflows++;
continue;
}
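This increment is the producer side of the new Spiderdb Overflows row on the stats page: the counter added to class Stats above is bumped once per outlink whose firstIp is already on the overflow list. A minimal sketch of the wiring, simplified from the diff (a process-global Stats object plus the per-document counter):

    #include <cstdint>

    // Process-wide counters, mirroring class Stats in Stats.h.
    struct Stats { int32_t m_totalOverflows = 0; };
    Stats g_stats;

    struct XmlDoc {
        int32_t m_linkOverflows = 0;     // per-document overflow count
        void onOverflowedOutlink() {
            m_linkOverflows++;           // this doc's own accounting
            g_stats.m_totalOverflows++;  // aggregated for the stats page
        }
    };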
qa.cpp (3 lines changed)
@@ -183,6 +183,9 @@ void processReply ( char *reply , int32_t replyLen ) {
markOut ( content , "spider is done (");
markOut ( content , "spider is paused (");

// 3 Collections etc.
markOut ( content , "/rocket.jpg></div></a></center><br><br><div style=\"width:190px;padding:4px;margin-left:10px;background-color:white;border-top-left-radius:10px;border-bottom-left-radius:10px;border-color:blue;border-width:3px;border-style:solid;margin-right:-3px;border-right-color:white;overflow-y:auto;overflow-x:hidden;line-height:23px;color:black;\"><center><nobr><b>" );

// until i figure this one out, take it out
markOut ( content , "<hits>");
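markOut() lets the QA harness compare page snapshots across runs by blanking fragments that legitimately vary, like spider progress counts, the collection sidebar, and hit counts. The sketch below is only a guess at simplified semantics, assuming markOut() blanks the text that follows the marker up to the next tag or newline; the real qa.cpp routine may differ:

    #include <cstring>

    // Simplified stand-in for qa.cpp's markOut(): find `needle` in
    // `content` and overwrite the characters right after it (a volatile
    // count, date, etc.) with spaces so page checksums ignore them.
    void markOut(char *content, const char *needle) {
        char *p = strstr(content, needle);
        if (!p) return;
        p += strlen(needle);
        while (*p && *p != '<' && *p != '\n') // blank until next tag/newline
            *p++ = ' ';
    }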