Merge branch 'diffbot' into diffbot-testing

Conflicts:
	iana_charset.cpp
	iana_charset.h
This commit is contained in:
Matt Wells 2013-12-12 13:01:49 -08:00
commit 7b768d4b86
6 changed files with 34 additions and 10 deletions

@ -1572,6 +1572,7 @@ void RdbBase::gotTokenForMerge ( ) {
bool minOld ;
long id2 = -1;
long minToMerge;
bool overide = false;
//long smini = - 1;
//long sn ;
//long long tfndbSize = 0;
@ -1884,7 +1885,7 @@ void RdbBase::gotTokenForMerge ( ) {
startMerge:
// sanity check
if ( n <= 1 ) {
if ( n <= 1 && ! overide ) {
log(LOG_LOGIC,"merge: gotTokenForMerge: Not merging %li files.",
n);
g_msg35.releaseToken();

@ -249,6 +249,7 @@ long SpiderRequest::printToTable ( SafeBuf *sb , char *status ,
long long now = gettimeofdayInMilliseconds();
long long elapsed = now - xd->m_startTime;
sb->safePrintf(" <td>%llims</td>\n",elapsed);
sb->safePrintf(" <td>%li</td>\n",(long)xd->m_collnum);
}
sb->safePrintf(" <td><nobr>%s</nobr></td>\n",m_url);
@ -346,8 +347,10 @@ long SpiderRequest::printTableHeaderSimple ( SafeBuf *sb ,
sb->safePrintf("<tr>\n");
// how long it's been being spidered
if ( currentlySpidering )
if ( currentlySpidering ) {
sb->safePrintf(" <td><b>elapsed</b></td>\n");
sb->safePrintf(" <td><b>coll</b></td>\n");
}
sb->safePrintf(" <td><b>url</b></td>\n");
sb->safePrintf(" <td><b>status</b></td>\n");
@ -448,8 +451,10 @@ long SpiderRequest::printTableHeader ( SafeBuf *sb , bool currentlySpidering) {
sb->safePrintf("<tr>\n");
// how long it's been being spidered
if ( currentlySpidering )
if ( currentlySpidering ) {
sb->safePrintf(" <td><b>elapsed</b></td>\n");
sb->safePrintf(" <td><b>coll</b></td>\n");
}
sb->safePrintf(" <td><b>url</b></td>\n");
sb->safePrintf(" <td><b>status</b></td>\n");
@ -4466,7 +4471,7 @@ void SpiderLoop::spiderDoledUrls ( ) {
if ( g_dailyMerge.m_mergeMode ) return;
// skip if too many udp slots being used
if ( g_udpServer.getNumUsedSlots() >= 1300 ) return;
// stop if too many out
// stop if too many out. MAX_SPIDERS is now 50, down from 500.
if ( m_numSpidersOut >= MAX_SPIDERS ) return;
// bail if no collections
if ( g_collectiondb.m_numRecs <= 0 ) return;
@ -6248,8 +6253,14 @@ bool Msg12::gotLockReply ( UdpSlot *slot ) {
60*60*24*365 ) )
return false;
// error?
log("spider: error re-sending confirm request: %s",
mstrerror(g_errno));
// don't spam the log!
static long s_last = 0;
long now = getTimeLocal();
if ( now - s_last >= 1 ) {
s_last = now;
log("spider: error re-sending confirm "
"request: %s", mstrerror(g_errno));
}
}
// only log every 10 seconds for ETRYAGAIN
if ( g_errno == ETRYAGAIN ) {

@ -1274,8 +1274,9 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) ;
// . supports <META NAME="ROBOTS" CONTENT="NOFOLLOW"> (no links)
// . supports limiting spiders per domain
// max spiders we can have going at once for this process
#define MAX_SPIDERS 500
// . max spiders we can have going at once for this process
// . limit to 50 to prevent OOM conditions
#define MAX_SPIDERS 50
class SpiderLoop {

@ -7681,9 +7681,20 @@ long long *XmlDoc::getExactContentHash64 ( ) {
unsigned char *pend = (unsigned char *)p + plen;
unsigned long long h64 = 0LL;
unsigned char pos = 0;
bool lastWasSpace = true;
for ( ; p < pend ; p++ ) {
// breathe
QUICKPOLL ( m_niceness );
// treat sequences of white space as a single ' ' (space)
if ( is_wspace_a(*p) ) {
if ( lastWasSpace ) continue;
lastWasSpace = true;
// treat all white space as a space
h64 ^= g_hashtab[pos][' '];
pos++;
continue;
}
lastWasSpace = false;
// xor this in right
h64 ^= g_hashtab[pos][p[0]];
pos++;

@ -1,5 +1,5 @@
// iana_charset.h
// Generated automatically by parse_iana_charsets.pl Tue Dec 10 22:42:31 2013
// Generated automatically by parse_iana_charsets.pl Thu Dec 12 20:57:59 2013
// DO NOT EDIT!!!
#include "gb-include.h"

@ -1,5 +1,5 @@
// iana_charset.h
// Generated automatically by parse_iana_charsets.pl Tue Dec 10 22:42:31 2013
// Generated automatically by parse_iana_charsets.pl Thu Dec 12 20:57:59 2013
// DO NOT EDIT!!!
#ifndef IANA_CHARSET_H__