Merge branch 'diffbot-testing' into diffbot

Conflicts:
	Spider.cpp
Matt Wells
2015-10-20 11:48:44 -07:00
93 changed files with 3300 additions and 1174 deletions

@ -9,7 +9,7 @@
#include "Threads.h"
#include "Stats.h"
#include "Statsdb.h"
#include "DiskPageCache.h"
//#include "DiskPageCache.h"
#ifdef ASYNCIO
#include <aio.h>
@ -35,11 +35,12 @@ BigFile::~BigFile () {
BigFile::BigFile () {
m_permissions = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH ;
m_flags = O_RDWR ; // | O_DIRECT;
m_usePartFiles = true;
// NULLify all ptrs to files
//for ( int32_t i = 0 ; i < MAX_PART_FILES ; i++ ) m_files[i] = NULL;
m_maxParts = 0;
m_numParts = 0;
m_pc = NULL;
//m_pc = NULL;
m_vfd = -1;
//m_vfdAllowed = false;
m_fileSize = -1;
@ -74,6 +75,8 @@ bool BigFile::set ( char *dir , char *baseFilename , char *stripeDir ) {
m_dir .setLabel("bfd");
m_baseFilename.setLabel("bfbf");
m_usePartFiles = true;
// use this 32 byte char buf to avoid a malloc if possible
m_baseFilename.setBuf (m_tmpBaseBuf,32,0,false);
@ -265,23 +268,36 @@ bool BigFile::doesPartExist ( int32_t n ) {
static int64_t s_vfd = 0;
// do not use part files for this open so we can open regular,
// really big (>2GB) files with it
// bool BigFile::open2 ( int flags ,
// void *pc ,
// int64_t maxFileSize ,
// int permissions ) {
// return open ( flags , pc , maxFileSize , permissions , false );
// }
// . override File::open so we can set m_numParts
// . set maxFileSize when opening a new file for writing and using
// DiskPageCache
// . use maxFileSize of -1 for us to use getFileSize() to set it
bool BigFile::open ( int flags , class DiskPageCache *pc ,
bool BigFile::open ( int flags ,
//class DiskPageCache *pc ,
void *pc ,
int64_t maxFileSize ,
int permissions ) {
m_flags = flags;
m_pc = pc;
//m_pc = pc;
m_permissions = permissions;
m_isClosing = false;
// this is true except when parsing big warc files
m_usePartFiles = true;//usePartFiles;
// . init the page cache for this vfd
// . this returns our "virtual fd", not the same as File::m_vfd
// . returns -1 and sets g_errno on failure
// . we pass m_vfd to getPages() and addPages()
if ( m_pc && m_vfd == -1 ) {
if ( m_vfd == -1 ) {
//if ( maxFileSize == -1 ) maxFileSize = getFileSize();
m_vfd = ++s_vfd;
//g_errno = 0;
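(For context, a hypothetical caller of the reworked open(), not part of the patch; the directory and filename here are made up. The page-cache argument is now an opaque, unused void*, and the first open draws a fresh virtual fd from the static s_vfd counter above.)

BigFile bf;
bf.set ( "/somedir" , "somefile.warc" );      // stripeDir defaults
if ( ! bf.open ( O_RDONLY ) )                 // pc defaults to NULL now
	log ( "file: open failed: %s" , mstrerror ( g_errno ) );
// m_usePartFiles stays true by default; the warc/arc path is expected
// to clear it so reads are not split across part files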
@ -527,6 +543,7 @@ bool BigFile::readwrite ( void *buf ,
fstate->m_inPageCache = false;
// . try to get as much as we can from page cache first
// . the vfd of the big file will be the vfd of its last File class
/*
if ( ! doWrite && m_pc && allowPageCache ) {
//int32_t oldOff = offset;
// we have to set these so RdbScan doesn't freak out if we
@ -559,6 +576,7 @@ bool BigFile::readwrite ( void *buf ,
// return true;
//}
}
*/
// sanity check. if you set hitDisk to false, you must allow
// us to check the page cache! silly bean!
if ( ! allowPageCache && ! hitDisk ) { char*xx=NULL;*xx=0; }
@ -591,6 +609,7 @@ bool BigFile::readwrite ( void *buf ,
fstate->m_callback = callback;
fstate->m_niceness = niceness;
fstate->m_flags = m_flags;
fstate->m_usePartFiles = m_usePartFiles;
// sanity
if ( fstate->m_bytesToGo > 150000000 )
log("file: huge read of %"INT64" bytes",(int64_t)size);
@ -603,6 +622,13 @@ bool BigFile::readwrite ( void *buf ,
// situation occurs and pass a g_errno back to the caller.
fstate->m_filenum1 = offset / MAX_PART_SIZE;
fstate->m_filenum2 = (offset + size ) / MAX_PART_SIZE;
// if not really a big file. we use this for parsing huge warc files
if ( ! m_usePartFiles ) {
fstate->m_filenum1 = 0;
fstate->m_filenum2 = 0;
}
// . save the open count for this fd
// . if it changes when we're done with the read we do a re-read
// . it gets incremented once every time File calls ::open and gets
@ -643,9 +669,9 @@ bool BigFile::readwrite ( void *buf ,
fstate->m_errno = 0;
fstate->m_errno2 = 0;
fstate->m_startTime = gettimeofdayInMilliseconds();
fstate->m_pc = m_pc;
if ( ! allowPageCache )
fstate->m_pc = NULL;
//fstate->m_pc = NULL;//m_pc;
// if ( ! allowPageCache )
// fstate->m_pc = NULL;
fstate->m_vfd = m_vfd;
// if hitDisk was false we only check the page cache!
if ( ! hitDisk ) return true;
@ -765,7 +791,7 @@ bool BigFile::readwrite ( void *buf ,
// how many bytes to read from each file?
int64_t readSize1 = size;
int64_t readSize2 = 0;
if ( off1 + readSize1 > MAX_PART_SIZE ) {
if ( off1 + readSize1 > MAX_PART_SIZE && m_usePartFiles ) {
readSize1 = ((int64_t)MAX_PART_SIZE) - off1;
readSize2 = size - readSize1;
}
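(Illustrative consolidation of the part-file math in this hunk and the one below, reusing the same variable names; not literal patch text.)

int32_t filenum     = offset / MAX_PART_SIZE;   // which part file
int64_t localOffset = offset % MAX_PART_SIZE;   // offset within it
// a read that crosses into the next part file is split in two
int64_t readSize1 = size;
int64_t readSize2 = 0;
if ( localOffset + readSize1 > MAX_PART_SIZE ) {
	readSize1 = ((int64_t)MAX_PART_SIZE) - localOffset;
	readSize2 = size - readSize1;
}
// with m_usePartFiles false (one huge warc/arc file) the mapping
// collapses: filenum = 0, localOffset = offset, and no split is done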
@ -784,6 +810,10 @@ bool BigFile::readwrite ( void *buf ,
int32_t filenum = offset / MAX_PART_SIZE;
int32_t localOffset = offset % MAX_PART_SIZE;
if ( ! m_usePartFiles ) {
filenum = 0;
localOffset = offset;
}
// read or write?
if ( doWrite ) a0->aio_lio_opcode = LIO_WRITE;
@ -852,7 +882,8 @@ bool BigFile::readwrite ( void *buf ,
int32_t rate = 100000;
if ( took > 500 ) rate = fstate->m_bytesDone / took ;
if ( rate < 8000 && fstate->m_niceness <= 0 ) {
log(LOG_INFO,"disk: Read %"INT32" bytes in %"INT64" ms (%"INT32"MB/s).",
log(LOG_INFO,"disk: Read %"INT64" bytes in %"INT64" "
"ms (%"INT32"KB/s).",
fstate->m_bytesDone,took,rate);
g_stats.m_slowDiskReads++;
}
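(Note on units: rate is m_bytesDone / took with took in milliseconds, and 1 byte/ms = 1 KB/s, so the relabel from MB/s to KB/s matches the arithmetic; e.g. 4,000,000 bytes in 1,000 ms gives rate = 4000, i.e. 4000 KB/s, making the rate < 8000 "slow read" threshold roughly 8 MB/s.)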
@ -880,12 +911,12 @@ bool BigFile::readwrite ( void *buf ,
// fstate->m_bytesDone);
// store read/written pages into page cache
if ( ! g_errno && fstate->m_pc )
fstate->m_pc->addPages ( fstate->m_vfd ,
fstate->m_offset ,
fstate->m_bytesDone ,
fstate->m_buf ,
fstate->m_niceness );
// if ( ! g_errno && fstate->m_pc )
// fstate->m_pc->addPages ( fstate->m_vfd ,
// fstate->m_offset ,
// fstate->m_bytesDone ,
// fstate->m_buf ,
// fstate->m_niceness );
// now log our stuff here
if ( g_errno && g_errno != EBADENGINEER )
log("disk: readwrite: %s", mstrerror(g_errno));
@ -952,7 +983,8 @@ void doneWrapper ( void *state , ThreadEntry *t ) {
if ( fstate->m_errno == EDISKSTUCK ) slow = true;
if ( slow && fstate->m_niceness <= 0 ) {
if ( fstate->m_errno != EDISKSTUCK )
log(LOG_INFO, "disk: Read %"INT32" bytes in %"INT64" ms (%"INT32"MB/s).",
log(LOG_INFO, "disk: Read %"INT64" bytes in %"INT64" "
"ms (%"INT32"KB/s).",
fstate->m_bytesDone,took,rate);
g_stats.m_slowDiskReads++;
}
@ -964,12 +996,12 @@ void doneWrapper ( void *state , ThreadEntry *t ) {
if ( ! g_errno ) g_errno = fstate->m_errno2;
// fstate has his own m_pc in case BigFile got deleted, we cannot
// reference it...
if ( ! g_errno && fstate->m_pc )
fstate->m_pc->addPages ( fstate->m_vfd ,
fstate->m_offset ,
fstate->m_bytesDone ,
fstate->m_buf ,
fstate->m_niceness );
// if ( ! g_errno && fstate->m_pc )
// fstate->m_pc->addPages ( fstate->m_vfd ,
// fstate->m_offset ,
// fstate->m_bytesDone ,
// fstate->m_buf ,
// fstate->m_niceness );
// add the stat
if ( ! g_errno ) {
@ -1015,12 +1047,14 @@ void doneWrapper ( void *state , ThreadEntry *t ) {
int32_t tt = LOG_WARN;
if ( g_errno == EFILECLOSED ) tt = LOG_INFO;
if ( g_errno && g_errno != EDISKSTUCK )
log (tt,"disk: %s. fd1=%"INT32" vfd=%"INT32" "
"off=%"INT64" toread=%"INT32".",
mstrerror(g_errno),
(int32_t)fstate->m_fd1,(int32_t)fstate->m_vfd,
(int64_t)fstate->m_offset ,
(int32_t)fstate->m_bytesToGo );
log (tt,"disk: %s. fd1=%"INT32" fd2=%"INT32" "
"off=%"INT64" toread=%"INT32,
mstrerror(g_errno),
(int32_t)fstate->m_fd1,
(int32_t)fstate->m_fd2,
(int64_t)fstate->m_offset ,
(int32_t)fstate->m_bytesToGo
);
// someone is closing our fd without setting File::s_vfds[fd] to -1
if ( g_errno && g_errno != EDISKSTUCK ) {
//int fd1 = fstate->m_fd1;
@ -1256,6 +1290,12 @@ bool readwrite_r ( FileState *fstate , ThreadEntry *t ) {
int32_t len = bytesToGo - bytesDone;
// how many bytes can we write to it now
if ( len > avail ) len = avail;
// hack for reading warc files
if ( ! fstate->m_usePartFiles ) {
filenum = 0;
localOffset = offset;
len = bytesToGo - bytesDone;
}
// get the fd for this filenum
int fd = -1;
if ( filenum == fstate->m_filenum1 ) fd = fstate->m_fd1;
@ -1273,9 +1313,9 @@ bool readwrite_r ( FileState *fstate , ThreadEntry *t ) {
if ( t && t->m_callback == ohcrap ) return false;
// only set this now if we are the first one
if ( g_threads.m_threadQueues[DISK_THREAD].m_hiReturned ==
g_threads.m_threadQueues[DISK_THREAD].m_hiLaunched )
g_lastDiskReadStarted = fstate->m_startTime;
// if ( g_threads.m_threadQueues[DISK_THREAD].m_hiReturned ==
// g_threads.m_threadQueues[DISK_THREAD].m_hiLaunched )
// g_lastDiskReadStarted = fstate->m_startTime;
// fake it out
//static int32_t s_poo = 0;
@ -1340,10 +1380,17 @@ bool readwrite_r ( FileState *fstate , ThreadEntry *t ) {
log("disk: Read of %"INT32" bytes at offset %"INT64" "
" failed because file is too short for that "
"offset? Our fd was probably stolen from us by another "
"thread. Will retry. error=%s.",
"thread. fd1=%i fd2=%i len=%i filenum=%i "
"localoffset=%i. usepart=%i error=%s.",
(int32_t)len,fstate->m_offset,
//fstate->m_this->getDir(),
//fstate->m_this->getFilename(),
fstate->m_fd1,
fstate->m_fd2,
len,
filenum,
localOffset,
fstate->m_usePartFiles,
mstrerror(errno));
errno = EBADENGINEER;
return false; // log("disk::read/write: offset too big");

@ -47,14 +47,14 @@ public:
class BigFile *m_this;
//struct aiocb m_aiostate;
char *m_buf;
int32_t m_bytesToGo;
int64_t m_bytesToGo;
int64_t m_offset;
// . the original offset, because we set m_offset to m_currentOffset
// if the original offset specified is -1
// . we also advance BigFile::m_currentOffset when done w/ read/write
//int64_t m_origOffset;
bool m_doWrite;
int32_t m_bytesDone;
int64_t m_bytesDone;
void *m_state ;
void (*m_callback) ( void *state ) ;
// goes from 0 to 1, the lower the niceness, the higher the priority
@ -79,9 +79,10 @@ public:
// when we started for graphing purposes (in milliseconds)
int64_t m_startTime;
int64_t m_doneTime;
char m_usePartFiles;
// this is used for calling DiskPageCache::addPages() when done
// with the read/write
class DiskPageCache *m_pc;
//class DiskPageCache *m_pc;
// this is just used for accessing the DiskPageCache, m_pc, it is
// a "virtual fd" for this whole file
int64_t m_vfd;
@ -102,10 +103,10 @@ public:
// threads each hogging up 32KB of memory waiting to read tfndb.
// m_allocBuf points to what we allocated.
char *m_allocBuf;
int32_t m_allocSize;
int64_t m_allocSize;
// m_allocOff is offset into m_allocBuf where we start reading into
// from the file
int32_t m_allocOff;
int64_t m_allocOff;
// do not call pthread_create() for every read we do. use async io
// because it should be much much faster
#ifdef ASYNCIO
@ -138,10 +139,23 @@ class BigFile {
// . if you are opening a new file for writing, you need to provide it
// if you pass in a DiskPageCache ptr
bool open ( int flags ,
class DiskPageCache *pc = NULL ,
//class DiskPageCache *pc = NULL ,
void *pc = NULL ,
int64_t maxFileSize = -1 ,
int permissions =
S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
//bool usePartFiles = true );
// this will set usepartfiles to false! so use this to open large
// warc or arc files
//bool open2 ( int flags ,
// //class DiskPageCache *pc = NULL ,
// void *pc = NULL ,
// int64_t maxFileSize = -1 ,
// int permissions =
// S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
int getFlags() { return m_flags; };
@ -234,7 +248,7 @@ class BigFile {
//int64_t m_currentOffset;
DiskPageCache *getDiskPageCache ( ) { return m_pc; };
//DiskPageCache *getDiskPageCache ( ) { return m_pc; };
int32_t getVfd ( ) { return m_vfd; };
// WARNING: some may have been unlinked from call to chopHead()
@ -347,13 +361,15 @@ class BigFile {
// maximum part #
int32_t m_maxParts;
class DiskPageCache *m_pc;
//class DiskPageCache *m_pc;
int32_t m_vfd;
//bool m_vfdAllowed;
// prevent circular calls to BigFile::close() with this
char m_isClosing;
char m_usePartFiles;
int64_t m_fileSize;
// oldest of the last modified dates of all the part files

@ -13,7 +13,7 @@ void Cachedb::reset() {
bool Cachedb::init ( ) {
// we use the same disk page size as indexdb (for rdbmap.cpp)
int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
//int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
// set this for debugging
//int64_t maxTreeMem = 1000000;
// i've seen some debug entries like 33MB because of
@ -26,7 +26,7 @@ bool Cachedb::init ( ) {
// . >1000 bytes of data per rec
int32_t maxTreeNodes = maxTreeMem /(sizeof(key96_t)+16+1000);
// disk page cache mem, 100MB on gk0 now
int32_t pcmem = 0; // g_conf.m_cachedbMaxDiskPageCacheMem;
//int32_t pcmem = 0; // g_conf.m_cachedbMaxDiskPageCacheMem;
// keep this low if we are the tmp cluster
//if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
// TODO: would be nice to just do page caching on the satellite files;
@ -38,11 +38,11 @@ bool Cachedb::init ( ) {
m_rdbId = RDB_SERPDB;
}
if ( ! m_pc.init ( m_name ,
m_rdbId, // RDB_CACHEDB,
pcmem ,
pageSize ))
return log("db: %s init failed.",m_name);
// if ( ! m_pc.init ( m_name ,
// m_rdbId, // RDB_CACHEDB,
// pcmem ,
// pageSize ))
// return log("db: %s init failed.",m_name);
// init the rdb
if ( ! m_rdb.init ( g_hostdb.m_dir ,
m_name ,
@ -60,7 +60,7 @@ bool Cachedb::init ( ) {
0 , // cache nodes
false, // true , // use half keys
false , // load cache from disk
&m_pc ,
NULL,//&m_pc ,
false , // false
false , // preload page cache
sizeof(key96_t) ,

@ -18,7 +18,7 @@
#define CACHEDBKS sizeof(key96_t)
#include "Rdb.h"
#include "DiskPageCache.h"
//#include "DiskPageCache.h"
// do not change these numbers, they are permanent and stored in cachedb
// that way... just add new numbers to the end.
@ -109,8 +109,8 @@ class Cachedb {
Rdb *getRdb() { return &m_rdb; };
DiskPageCache *getDiskPageCache () { return &m_pc; };
DiskPageCache m_pc;
//DiskPageCache *getDiskPageCache () { return &m_pc; };
//DiskPageCache m_pc;
private:
Rdb m_rdb;

@ -39,18 +39,18 @@ bool Catdb::init ( ) {
int32_t maxTreeNodes = treeMem / 82;
// do not use any page cache if doing tmp cluster in order to
// prevent swapping
int32_t pcmem = g_conf.m_catdbMaxDiskPageCacheMem;
if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
// int32_t pcmem = g_conf.m_catdbMaxDiskPageCacheMem;
// if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
pcmem = 0;
// pcmem = 0;
// each entry in the cache is usually just a single record, no lists,
// unless a hostname has multiple sites in it. has 24 bytes more
// overhead in cache.
//int32_t maxCacheNodes = g_conf.m_tagdbMaxCacheMem / 106;
// we now use a page cache
if ( ! m_pc.init ("catdb",RDB_CATDB,pcmem,
GB_TFNDB_PAGE_SIZE) )
return log("db: Catdb init failed.");
// if ( ! m_pc.init ("catdb",RDB_CATDB,pcmem,
// GB_TFNDB_PAGE_SIZE) )
// return log("db: Catdb init failed.");
// . initialize our own internal rdb
// . i no longer use cache so changes to tagdb are instant
@ -71,7 +71,7 @@ bool Catdb::init ( ) {
0 , //maxCacheNodes ,
false , // half keys?
false , //m_tagdbSaveCache
&m_pc ,
NULL, // &m_pc ,
false,
false,
12, // keysize

@ -21,7 +21,7 @@
#include "Rdb.h"
#include "Url.h"
#include "Loop.h"
#include "DiskPageCache.h"
//#include "DiskPageCache.h"
//#include "CollectionRec.h"
class Catdb {
@ -74,7 +74,7 @@ class Catdb {
void getKeyRange ( bool useIp , Url *url ,
key_t *startKey , key_t *endKey );
DiskPageCache *getDiskPageCache() { return &m_pc; };
//DiskPageCache *getDiskPageCache() { return &m_pc; };
// normalize a url, no www.
void normalizeUrl ( Url *srcUrl, Url *dstUrl );
@ -93,7 +93,7 @@ class Catdb {
// and "not-founds" stored remotely (net cache)
Rdb m_rdb;
DiskPageCache m_pc;
//DiskPageCache m_pc;
};

@ -274,20 +274,20 @@ bool Clusterdb::init ( ) {
// RdbCache has a 4 byte ptr to each rec in the cache
//int32_t maxCacheNodes = maxCacheMem / ( 4 + CLUSTER_REC_SIZE );
//int32_t nodeSize = sizeof(key_t) + sizeof(collnum_t);
int32_t pageSize = GB_TFNDB_PAGE_SIZE;
//int32_t pageSize = GB_TFNDB_PAGE_SIZE;
//int32_t nodeSize = (pageSize + 12) + sizeof(collnum_t) + 20;
//int32_t maxCacheNodes = maxCacheMem / nodeSize ;
// init the page cache
if ( ! m_pc.init ( "clusterdb",
RDB_CLUSTERDB,
pcmem ,
pageSize ) )
//g_conf.m_clusterdbMaxDiskPageCacheMem,
//clusterGetPages,
//clusterAddPages,
//clusterGetVfd,
//clusterRmVfd ))
return log("db: Clusterdb init failed.");
// if ( ! m_pc.init ( "clusterdb",
// RDB_CLUSTERDB,
// pcmem ,
// pageSize ) )
// //g_conf.m_clusterdbMaxDiskPageCacheMem,
// //clusterGetPages,
// //clusterAddPages,
// //clusterGetVfd,
// //clusterRmVfd ))
// return log("db: Clusterdb init failed.");
//bool bias = true;
//if ( g_conf.m_fullSplit ) bias = false;
bool bias = false;
@ -305,7 +305,7 @@ bool Clusterdb::init ( ) {
0,//maxCacheNodes ,
true , // half keys?
g_conf.m_clusterdbSaveCache,
&m_pc ,
NULL,//&m_pc ,
false, // is titledb
true , // preload disk page cache
12, // key size

@ -32,7 +32,7 @@
#include "Url.h"
#include "Conf.h"
#include "Titledb.h"
#include "DiskPageCache.h"
//#include "DiskPageCache.h"
// these are now just TitleRec keys
#define CLUSTER_REC_SIZE (sizeof(key_t))
@ -162,14 +162,14 @@ class Clusterdb {
//char getGigabitSimilarity ( char *vec0 , char *vec1 ,
// int32_t *qtable , int32_t numSlots ) ;
DiskPageCache *getDiskPageCache() { return &m_pc; };
//DiskPageCache *getDiskPageCache() { return &m_pc; };
private:
// this rdb holds urls waiting to be spidered or being spidered
Rdb m_rdb;
DiskPageCache m_pc;
//DiskPageCache m_pc;
};
extern class Clusterdb g_clusterdb;

@ -329,6 +329,12 @@ bool Collectiondb::addExistingColl ( char *coll, collnum_t collnum ) {
if ( cr->m_isCustomCrawl )
cr->m_indexSpiderReplies = true;
// and don't do link voting, will help speed up
if ( cr->m_isCustomCrawl ) {
cr->m_getLinkInfo = false;
cr->m_computeSiteNumInlinks = false;
}
// we need to compile the regular expressions or update the url
// filters with new logic that maps crawlbot parms to url filters
return cr->rebuildUrlFilters ( );
@ -1694,13 +1700,24 @@ collnum_t Collectiondb::reserveCollNum ( ) {
return next;
}
// collnum_t is signed right now because we use -1 to indicate a
// bad collnum.
int32_t scanned = 0;
// search for an empty slot
for ( int32_t i = m_wrapped ; i < m_numRecs ; i++ ) {
for ( int32_t i = m_wrapped ; ; i++ ) {
// because collnum_t is 2 bytes, signed, limit this here
if ( i > 0x7fff ) i = 0;
// how can this happen?
if ( i < 0 ) i = 0;
// if we scanned the max # of recs we could have, we are done
if ( ++scanned >= m_numRecs ) break;
// skip if this is in use
if ( m_recs[i] ) continue;
// start after this one next time
m_wrapped = i+1;
// note it
log("colldb: returning wrapped collnum of %"INT32"",(int32_t)i);
log("colldb: returning wrapped collnum "
"of %"INT32"",(int32_t)i);
return (collnum_t)i;
}
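(A self-contained sketch of the wrap-around scan above, with a plain bool array standing in for m_recs and collnum_t; illustrative only.)

#include <cstdint>
static const int32_t kMaxSlot = 0x7fff; // signed 16-bit collnum_t ceiling
// used[] must span kMaxSlot+1 entries; wrapped persists across calls
int32_t reserveSlot ( bool *used , int32_t numRecs , int32_t &wrapped ) {
	int32_t scanned = 0;
	for ( int32_t i = wrapped ; ; i++ ) {
		if ( i > kMaxSlot ) i = 0;         // wrap at the ceiling
		if ( i < 0        ) i = 0;         // defensive
		if ( ++scanned >= numRecs ) break; // visited every slot
		if ( used[i] ) continue;           // slot in use
		wrapped = i + 1;                   // resume here next time
		return i;
	}
	return -1;                                 // nothing free
}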
@ -1841,6 +1858,8 @@ void CollectionRec::reset() {
m_hasucr = false;
m_hasupr = false;
m_sendingAlertInProgress = false;
// make sure we do not leave spiders "hanging" waiting for their
// callback to be called... and it never gets called
//if ( m_callbackQueue.length() > 0 ) { char *xx=NULL;*xx=0; }
@ -2313,6 +2332,17 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
m_spiderFreqs [n] = .00347; // 5 mins
n++;
// a non temporary error, like a 404? retry once per 3 months i guess
m_regExs[n].set("errorcount>=1");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 90; // 90 day retry
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 2;
m_forceDelete [n] = 1;
n++;
m_regExs[n].set("isaddurl");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default

@ -525,6 +525,7 @@ class CollectionRec {
char m_enforceNewQuotas ;
char m_doIpLookups ; // considered iff using proxy
char m_useRobotsTxt ;
char m_obeyRelNoFollowLinks ;
char m_forceUseFloaters ;
char m_automaticallyUseProxies ;
char m_automaticallyBackOff ;
@ -626,6 +627,7 @@ class CollectionRec {
int32_t m_adWidth; // how wide the ad Column is in pixels
char m_dedupResultsByDefault ;
char m_doTagdbLookups ;
char m_clusterByTopicDefault ;
char m_restrictTitledbForQuery ; // move this down here
char m_useOldIps ;
@ -766,7 +768,7 @@ class CollectionRec {
// last time we computed global crawl info
//time_t m_globalCrawlInfoUpdateTime;
EmailInfo m_emailInfo;
//EmailInfo m_emailInfo;
// for counting replies
//int32_t m_replies;
//int32_t m_requests;
@ -974,6 +976,8 @@ class CollectionRec {
// NARROW SEARCH
char m_doNarrowSearch;
char m_sendingAlertInProgress;
// Allow Links: searches on the collection
//char m_allowLinksSearch;
// . reference pages parameters

@ -369,7 +369,7 @@ bool Conf::init ( char *dir ) { // , int32_t hostId ) {
g_conf.m_forceIt = false;
// always turn on threads if live
if ( g_conf.m_isLive ) g_conf.m_useThreads = true;
//if ( g_conf.m_isLive ) g_conf.m_useThreads = true;
// disable this at startup always... no since might have crashed
// in the middle of a test. and we just turn on spiders again when
// already in test mode otherwise hostid #0 will erase all the files.

Conf.h (54 changed lines)

@ -175,7 +175,7 @@ class Conf {
// tagdb parameters
int32_t m_tagdbMaxTreeMem;
int32_t m_tagdbMaxDiskPageCacheMem;
//int32_t m_tagdbMaxDiskPageCacheMem;
//int32_t m_tagdbMaxCacheMem;
//bool m_tagdbUseSeals;
//int32_t m_tagdbMinFilesToMerge;
@ -183,7 +183,7 @@ class Conf {
// catdb parameters
int32_t m_catdbMaxTreeMem;
int32_t m_catdbMaxDiskPageCacheMem;
//int32_t m_catdbMaxDiskPageCacheMem;
int32_t m_catdbMaxCacheMem;
//int32_t m_catdbMinFilesToMerge;
@ -216,7 +216,7 @@ class Conf {
// linkdb for storing linking relations
int32_t m_linkdbMaxTreeMem;
// int32_t m_linkdbMaxCacheMem;
int32_t m_linkdbMaxDiskPageCacheMem;
//int32_t m_linkdbMaxDiskPageCacheMem;
int32_t m_linkdbMinFilesToMerge;
// bool m_linkdbSaveCache;
@ -234,7 +234,7 @@ class Conf {
// for holding urls that have been entered into the spider queue
//int32_t m_tfndbMaxTreeMem ;
int32_t m_tfndbMaxDiskPageCacheMem ; // for the DiskPageCache class only
//int32_t m_tfndbMaxDiskPageCacheMem ; // for the DiskPageCache class only
//int32_t m_tfndbMinFilesToMerge;
//bool m_tfndbSaveCache;
//int64_t m_tfndbMaxUrls;
@ -253,21 +253,23 @@ class Conf {
//int32_t m_spiderdbMaxDiskPageCacheMem ;
//int32_t m_spiderdbMinFilesToMerge;
int32_t m_spiderMaxDiskThreads ;
int32_t m_spiderMaxBigDiskThreads ; // > 1M read
int32_t m_spiderMaxMedDiskThreads ; // 100k - 1M read
int32_t m_spiderMaxSmaDiskThreads ; // < 100k read
int32_t m_queryMaxDiskThreads ;
int32_t m_queryMaxBigDiskThreads ; // > 1M read
int32_t m_queryMaxMedDiskThreads ; // 100k - 1M read
int32_t m_queryMaxSmaDiskThreads ; // < 100k per read
//int32_t m_spiderMaxBigDiskThreads ; // > 1M read
//int32_t m_spiderMaxMedDiskThreads ; // 100k - 1M read
//int32_t m_spiderMaxSmaDiskThreads ; // < 100k read
//int32_t m_queryMaxDiskThreads ;
//int32_t m_queryMaxBigDiskThreads ; // > 1M read
//int32_t m_queryMaxMedDiskThreads ; // 100k - 1M read
//int32_t m_queryMaxSmaDiskThreads ; // < 100k per read
// categorize the disk read sizes by these here
int32_t m_bigReadSize;
int32_t m_medReadSize;
int32_t m_smaReadSize;
//int32_t m_bigReadSize;
//int32_t m_medReadSize;
//int32_t m_smaReadSize;
char m_separateDiskReads;
int32_t m_statsdbMaxTreeMem;
int32_t m_statsdbMaxCacheMem;
int32_t m_statsdbMaxDiskPageCacheMem;
//int32_t m_statsdbMaxDiskPageCacheMem;
//int32_t m_statsdbMinFilesToMerge;
bool m_useStatsdb;
//bool m_statsdbSnapshots;
@ -290,6 +292,7 @@ class Conf {
//bool m_refreshFacebookUsersEnabled;
bool m_injectionsEnabled ;
bool m_queryingEnabled ;
bool m_returnResultsAnyway;
// qa testing loop going on? uses "test" subdir
bool m_testParserEnabled ;
bool m_testSpiderEnabled ;
@ -331,7 +334,7 @@ class Conf {
// indexdb has a max cached age for getting IndexLists (10 mins deflt)
int32_t m_indexdbMaxTreeMem ;
int32_t m_indexdbMaxCacheMem;
int32_t m_indexdbMaxDiskPageCacheMem; // for DiskPageCache class only
//int32_t m_indexdbMaxDiskPageCacheMem; // for DiskPageCache class only
int32_t m_indexdbMaxIndexListAge;
int32_t m_indexdbTruncationLimit;
int32_t m_indexdbMinFilesToMerge;
@ -339,7 +342,7 @@ class Conf {
int32_t m_datedbMaxTreeMem ;
int32_t m_datedbMaxCacheMem;
int32_t m_datedbMaxDiskPageCacheMem; // for DiskPageCache class only
//int32_t m_datedbMaxDiskPageCacheMem; // for DiskPageCache class only
int32_t m_datedbMaxIndexListAge;
int32_t m_datedbTruncationLimit;
int32_t m_datedbMinFilesToMerge;
@ -568,17 +571,11 @@ class Conf {
bool m_useSHM;
bool m_useQuickpoll;
bool m_useDiskPageCacheIndexdb;
bool m_useDiskPageCachePosdb;
bool m_useDiskPageCacheDatedb;
bool m_useDiskPageCacheTitledb;
bool m_useDiskPageCacheSpiderdb;
bool m_useDiskPageCacheTfndb;
bool m_useDiskPageCacheTagdb;
bool m_useDiskPageCacheChecksumdb;
bool m_useDiskPageCacheClusterdb;
bool m_useDiskPageCacheCatdb;
bool m_useDiskPageCacheLinkdb;
int64_t m_posdbFileCacheSize;
int64_t m_tagdbFileCacheSize;
int64_t m_clusterdbFileCacheSize;
int64_t m_titledbFileCacheSize;
int64_t m_spiderdbFileCacheSize;
//bool m_quickpollCoreOnError;
bool m_useShotgun;
@ -685,6 +682,7 @@ class Conf {
bool m_diffbotMsg13Hack ;
bool m_logDebugUrlAttempts ;
bool m_logDebugTcp ;
bool m_logDebugTcpBuf ;
bool m_logDebugThread ;
bool m_logDebugTimedb ;
bool m_logDebugTitle ;

@ -47,14 +47,14 @@ bool Datedb::init ( ) {
// old rec cache. i am trying to do away with the Rdb::m_cache rec
// cache in favor of cleverly used disk page caches, because
// the rec caches are not real-time and get stale.
int32_t pcmem = g_conf.m_datedbMaxDiskPageCacheMem;
//int32_t pcmem = g_conf.m_datedbMaxDiskPageCacheMem;
// make sure at least 30MB
//if ( pcmem < 30000000 ) pcmem = 30000000;
// keep this low if we are the tmp cluster, 20MB
if ( g_hostdb.m_useTmpCluster && pcmem > 20000000 ) pcmem = 20000000;
//if ( g_hostdb.m_useTmpCluster && pcmem > 20000000 ) pcmem = 20000000;
// do not use any page cache if doing tmp cluster in order to
// prevent swapping
if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
//if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
// . init the page cache
// . MDW: "minimize disk seeks" not working otherwise i'd enable it!
// if ( ! m_pc.init ( "datedb",

@ -195,6 +195,7 @@ case EADMININTERFERENCE: return "Administrative interference";
case EDNSERROR : return "DNS lookup error";
case ETHREADSDISABLED:return "Threads Disabled";
case EMALFORMEDQUERY: return "Malformed query";
case ESHARDDOWN: return "One or more shards are down";
}
// if the remote error bit is clear it must be a regular errno
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );

@ -199,6 +199,7 @@ enum {
EADMININTERFERENCE,
EDNSERROR ,
ETHREADSDISABLED,
EMALFORMEDQUERY
EMALFORMEDQUERY,
ESHARDDOWN
};
#endif

@ -455,6 +455,11 @@ bool HashTableX::load ( char *dir, char *filename, char **tbuf, int32_t *tsize )
if ( ! f.read ( &ds , 4 , off ) ) return false;
off += 4;
if ( numSlots < 0 || numSlotsUsed < 0 ) {
log("htable: bogus saved hashtable file %s%s.",dir,filename);
return false;
}
// bogus key size?
if ( ks <= 0 ) {
// is very common for this file so skip it
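(A hedged, stdio-based sketch of the defensive pattern added here: reject a saved table whose header counts are implausible instead of trusting them; the real code reads through the File class.)

#include <cstdio>
#include <cstdint>
static bool loadCounts ( FILE *f ,
			 int32_t &numSlots , int32_t &numSlotsUsed ) {
	if ( fread ( &numSlots     , 4 , 1 , f ) != 1 ) return false;
	if ( fread ( &numSlotsUsed , 4 , 1 , f ) != 1 ) return false;
	// a corrupt save would otherwise drive a bogus allocation later
	if ( numSlots < 0 || numSlotsUsed < 0 || numSlotsUsed > numSlots )
		return false; // bogus saved hashtable file
	return true;
}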

@ -92,28 +92,28 @@ bool Indexdb::init ( ) {
// enough nodes!!
int32_t maxCacheNodes = g_conf.m_indexdbMaxCacheMem / 600;
int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
//int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
// we now use a disk page cache as opposed to the
// old rec cache. i am trying to do away with the Rdb::m_cache rec
// cache in favor of cleverly used disk page caches, because
// the rec caches are not real-time and get stale.
int32_t pcmem = g_conf.m_indexdbMaxDiskPageCacheMem;
//int32_t pcmem = g_conf.m_indexdbMaxDiskPageCacheMem;
pcmem = 0;
//pcmem = 0;
// make sure at least 30MB
//if ( pcmem < 30000000 ) pcmem = 30000000;
// keep this low if we are the tmp cluster, 30MB
if ( g_hostdb.m_useTmpCluster && pcmem > 30000000 ) pcmem = 30000000;
//if ( g_hostdb.m_useTmpCluster && pcmem > 30000000 ) pcmem = 30000000;
// do not use any page cache if doing tmp cluster in order to
// prevent swapping
if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
//if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
// . init the page cache
// . MDW: "minimize disk seeks" not working otherwise i'd enable it!
if ( ! m_pc.init ( "indexdb",
RDB_INDEXDB,
pcmem ,
pageSize ))
return log("db: Indexdb init failed.");
// if ( ! m_pc.init ( "indexdb",
// RDB_INDEXDB,
// pcmem ,
// pageSize ))
// return log("db: Indexdb init failed.");
// . set our own internal rdb
// . max disk space for bin tree is same as maxTreeMem so that we
@ -133,7 +133,7 @@ bool Indexdb::init ( ) {
maxCacheNodes ,
true , // use half keys?
false , // g_conf.m_indexdbSav
&m_pc ) )
NULL))//&m_pc ) )
return false;
return true;
// validate indexdb

@ -14,7 +14,7 @@
#include "Rdb.h"
#include "Conf.h"
#include "DiskPageCache.h"
//#include "DiskPageCache.h"
// we define these here, NUMDOCIDBITS is in ../titledb/Titledb.h
#define NUMTERMIDBITS 48
@ -173,9 +173,9 @@ class Indexdb {
Rdb m_rdb;
DiskPageCache *getDiskPageCache ( ) { return &m_pc; };
//DiskPageCache *getDiskPageCache ( ) { return &m_pc; };
DiskPageCache m_pc;
//DiskPageCache m_pc;
//#ifdef SPLIT_INDEXDB
// . groupId Table, for getting the correct group id based

@ -5,6 +5,7 @@
#include "sort.h"
#include "XmlDoc.h" // score32to8()
#include "Rebalance.h"
#include "Process.h"
Linkdb g_linkdb;
Linkdb g_linkdb2;
@ -101,7 +102,7 @@ bool Linkdb::init ( ) {
*/
// we use the same disk page size as indexdb (for rdbmap.cpp)
int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
//int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
// set this for debugging
//int64_t maxTreeMem = 1000000;
int64_t maxTreeMem = 40000000; // 40MB
@ -110,18 +111,18 @@ bool Linkdb::init ( ) {
// . 32 bytes per record when in the tree
int32_t maxTreeNodes = maxTreeMem /(sizeof(key224_t)+16);
// disk page cache mem, 100MB on gk0 now
int32_t pcmem = 0; // g_conf.m_linkdbMaxDiskPageCacheMem;
//int32_t pcmem = 0; // g_conf.m_linkdbMaxDiskPageCacheMem;
// give it a little
pcmem = 10000000; // 10MB
//pcmem = 10000000; // 10MB
// keep this low if we are the tmp cluster
//if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
// TODO: would be nice to just do page caching on the satellite files;
// look into "minimizeDiskSeeks" at some point...
if ( ! m_pc.init ( "linkdb" ,
RDB_LINKDB,
pcmem ,
pageSize ))
return log("db: Linkdb init failed.");
// if ( ! m_pc.init ( "linkdb" ,
// RDB_LINKDB,
// pcmem ,
// pageSize ))
// return log("db: Linkdb init failed.");
// init the rdb
return m_rdb.init ( g_hostdb.m_dir ,
"linkdb" ,
@ -141,7 +142,7 @@ bool Linkdb::init ( ) {
0 , // cache nodes
false, // true , // use half keys
false , // load cache from disk
&m_pc ,
NULL,//&m_pc ,
false , // false
false , // preload page cache
sizeof(key224_t) ,
@ -1130,6 +1131,12 @@ bool Msg25::doReadLoop ( ) {
ms,m_site,m_url,m_docId,KEYSTR(&startKey,LDBKS));
}
if ( g_process.m_mode == EXIT_MODE ) {
log("linkdb: shutting down. exiting link text loop.");
g_errno = ESHUTTINGDOWN;
return false;
}
m_gettingList = true;
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );

@ -32,7 +32,7 @@
#include "Conf.h"
#include "Rdb.h"
#include "DiskPageCache.h"
//#include "DiskPageCache.h"
#include "Titledb.h"
void handleRequest25 ( UdpSlot *slot , int32_t netnice ) ;
@ -358,8 +358,8 @@ class Linkdb {
Rdb *getRdb() { return &m_rdb; };
DiskPageCache *getDiskPageCache () { return &m_pc; };
DiskPageCache m_pc;
//DiskPageCache *getDiskPageCache () { return &m_pc; };
//DiskPageCache m_pc;
private:
Rdb m_rdb;

@ -1386,9 +1386,9 @@ bool Loop::runLoop ( ) {
if ( m_shutdown == 2 ) {
//log(0,"Thread is saving & shutting down urgently.");
//while ( 1 == 1 ) sleep (50000);
log("loop: Resuming despite thread crash.");
m_shutdown = 0;
goto BIGLOOP;
//log("loop: Resuming despite thread crash.");
//m_shutdown = 0;
//goto BIGLOOP;
}
// otherwise, thread did not save, so we must do it
log ( LOG_INIT ,"loop: Saving and shutting down urgently.");
@ -2017,12 +2017,12 @@ void Loop::doPoll ( ) {
// if shutting down was it a sigterm ?
if ( m_shutdown ) goto again;
// handle returned threads for niceness 0
if ( g_threads.m_needsCleanup )
g_threads.timedCleanUp(-3,0); // 3 ms
//if ( g_threads.m_needsCleanup )
g_threads.timedCleanUp(-3,0); // 3 ms
if ( m_inQuickPoll ) goto again;
// high niceness threads
if ( g_threads.m_needsCleanup )
g_threads.timedCleanUp(-4,MAX_NICENESS); //3 ms
//if ( g_threads.m_needsCleanup )
g_threads.timedCleanUp(-4,MAX_NICENESS); //3 ms
goto again;
}

@ -41,7 +41,7 @@ OBJS = UdpSlot.o Rebalance.o \
Highlight.o File.o Errno.o Entities.o \
Dns.o Dir.o Conf.o Bits.o \
Stats.o BigFile.o Msg17.o \
Speller.o DiskPageCache.o \
Speller.o \
PingServer.o StopWords.o TopTree.o \
Parms.o Pages.o \
Unicode.o iana_charset.o Iso8859.o \
@ -532,6 +532,10 @@ Timedb.o:
HashTableX.o:
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
# getUrlFilterNum2()
Spider.o:
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
SpiderCache.o:
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp

Mem.cpp (10 changed lines)

@ -21,7 +21,7 @@
// uncomment this for EFENCE to do underflow checks instead of the
// default overflow checks
//#define _CHECKUNDERFLOW_
//#define CHECKUNDERFLOW
// only Mem.cpp can call ::malloc, everyone else must call mmalloc() so
// we can keep tabs on memory usage. in Mem.h we #define this to be coreme()
@ -2168,7 +2168,7 @@ void *getElecMem ( int32_t size ) {
// a page above OR a page below
// let's go below this time since that seems to be the problem
#ifdef _CHECKUNDERFLOW_
#ifdef CHECKUNDERFLOW
// how much to alloc
// . assume sysmalloc returs one byte above a page, so we need
// MEMPAGESIZE-1 bytes to move p up to page boundary, another
@ -2189,7 +2189,7 @@ void *getElecMem ( int32_t size ) {
// parser
char *p = realMem;
// align p DOWN to nearest 8k boundary
int32_t remainder = (uint32_t)realMem % MEMPAGESIZE;
int32_t remainder = (uint64_t)realMem % MEMPAGESIZE;
// complement
remainder = MEMPAGESIZE - remainder;
// and add to ptr to be aligned on 8k boundary
@ -2211,7 +2211,7 @@ void *getElecMem ( int32_t size ) {
p += size;
// now when we free this it should all be protected, so make sure
// we have enough room on top
int32_t leftover = MEMPAGESIZE - ((uint32_t)p % MEMPAGESIZE);
int32_t leftover = MEMPAGESIZE - ((uint64_t)p % MEMPAGESIZE);
// skip that
p += leftover;
// inefficient?
@ -2302,7 +2302,7 @@ void freeElecMem ( void *fakeMem ) {
char *label = &s_labels[((uint32_t)h)*16];
int32_t fakeSize = s_sizes[h];
#ifdef _CHECKUNDERFLOW_
#ifdef CHECKUNDERFLOW
char *oldProtMem = cp - MEMPAGESIZE;
#else
char *oldProtMem = cp + fakeSize;
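(Background sketch of the electric-fence idea behind getElecMem(), simplified and not the Mem.cpp implementation: put the buffer flush against a PROT_NONE page so an overflow faults instantly; with CHECKUNDERFLOW the guard page sits below instead. The (uint64_t) casts above matter because truncating a 64-bit pointer to uint32_t breaks this alignment math.)

#include <sys/mman.h>
#include <unistd.h>
#include <cstddef>
void *guardAlloc ( size_t size ) {
	long page = sysconf ( _SC_PAGESIZE );
	size_t rounded = ( size + page - 1 ) / page * page;
	// data pages plus one trailing guard page
	char *base = (char *)mmap ( NULL , rounded + page ,
				    PROT_READ | PROT_WRITE ,
				    MAP_PRIVATE | MAP_ANONYMOUS , -1 , 0 );
	if ( base == MAP_FAILED ) return NULL;
	// protect the last page; any write past the buffer faults
	mprotect ( base + rounded , page , PROT_NONE );
	// align the user buffer so buf+size ends exactly at the guard
	return base + ( rounded - size );
}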

@ -9,7 +9,7 @@ void Monitordb::reset() {
bool Monitordb::init ( ) {
// we use the same disk page size as indexdb (for rdbmap.cpp)
int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
//int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
// set this for debugging
//int64_t maxTreeMem = 1000000;
int64_t maxTreeMem = 10000000; // 10MB
@ -18,16 +18,16 @@ bool Monitordb::init ( ) {
// . 32 bytes per record when in the tree
int32_t maxTreeNodes = maxTreeMem /(sizeof(key96_t)+16);
// disk page cache mem, 100MB on gk0 now
int32_t pcmem = 0; // g_conf.m_monitordbMaxDiskPageCacheMem;
//int32_t pcmem = 0; // g_conf.m_monitordbMaxDiskPageCacheMem;
// keep this low if we are the tmp cluster
//if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
// TODO: would be nice to just do page caching on the satellite files;
// look into "minimizeDiskSeeks" at some point...
if ( ! m_pc.init ( "monitordb" ,
RDB_MONITORDB,
pcmem ,
pageSize ))
return log("db: Monitordb init failed.");
// if ( ! m_pc.init ( "monitordb" ,
// RDB_MONITORDB,
// pcmem ,
// pageSize ))
// return log("db: Monitordb init failed.");
// init the rdb
return m_rdb.init ( g_hostdb.m_dir ,
"monitordb" ,
@ -45,7 +45,7 @@ bool Monitordb::init ( ) {
0 , // cache nodes
false, // true , // use half keys
false , // load cache from disk
&m_pc ,
NULL,//&m_pc ,
false , // false
false , // preload page cache
sizeof(key96_t) ,

@ -15,7 +15,7 @@
#define MONITORDBKS sizeof(key96_t)
#include "Rdb.h"
#include "DiskPageCache.h"
//#include "DiskPageCache.h"
class Monitordb {
public:
@ -29,8 +29,8 @@ class Monitordb {
Rdb *getRdb() { return &m_rdb; };
DiskPageCache *getDiskPageMonitor () { return &m_pc; };
DiskPageCache m_pc;
//DiskPageCache *getDiskPageMonitor () { return &m_pc; };
//DiskPageCache m_pc;
private:
Rdb m_rdb;

@ -1222,13 +1222,16 @@ bool ipWasBanned ( TcpSocket *ts , const char **msg , Msg13Request *r ) {
// if it is a seed url and there are no links, then perhaps we
// are in a blacklist somewhere already from triggering a spider trap
if ( //isInSeedBuf ( cr , r->ptr_url ) &&
// this is set in XmlDoc.cpp based on hopcount really
r->m_isRootSeedUrl &&
! strstr ( ts->m_readBuf, "<a href" ) ) {
*msg = "root/seed url with no outlinks";
return true;
}
// i've seen this flub on a site where they just return a script
// and it is not banned, so let's remove this until we think
// of something better.
// if ( //isInSeedBuf ( cr , r->ptr_url ) &&
// // this is set in XmlDoc.cpp based on hopcount really
// r->m_isRootSeedUrl &&
// ! strstr ( ts->m_readBuf, "<a href" ) ) {
// *msg = "root/seed url with no outlinks";
// return true;
// }
// TODO: compare a simple checksum of the page content to what

@ -157,6 +157,12 @@ bool Msg20::getSummary ( Msg20Request *req ) {
// do not re-route to twins if accessing an external network
if ( hostdb != &g_hostdb ) req->m_expected = false;
if ( req->m_docId < 0 && ! req->ptr_ubuf ) {
log("msg20: docid<0 and no url for msg20::getsummary");
g_errno = EBADREQUEST;
return true;
}
// get groupId from docId, if positive
uint32_t shardNum;
if ( req->m_docId >= 0 )
@ -398,8 +404,11 @@ void handleRequest20 ( UdpSlot *slot , int32_t netnice ) {
// sanity check, the size include the \0
if ( req->m_collnum < 0 ) {
log("query: Got empty collection in msg20 handler. FIX!");
char *xx =NULL; *xx = 0;
log("query: Got empty collection in msg20 handler. FIX! "
"from ip=%s port=%i",iptoa(slot->m_ip),(int)slot->m_port);
g_udpServer.sendErrorReply ( slot , ENOTFOUND );
return;
//char *xx =NULL; *xx = 0;
}
// if it's not stored locally that's an error
if ( req->m_docId >= 0 && ! g_titledb.isLocal ( req->m_docId ) ) {

@ -354,9 +354,9 @@ public:
int32_t m_pageInlinksLastUpdated;
int32_t m_siteNumInlinks ; // GOOD inlinks!
int32_t m_siteNumInlinksTotal ; // TOTAL inlinks
int32_t m_siteNumUniqueIps ;
int32_t m_siteNumUniqueCBlocks;
//int32_t m_siteNumInlinksTotal ; // TOTAL inlinks
//int32_t m_siteNumUniqueIps ;
//int32_t m_siteNumUniqueCBlocks;
int32_t m_numOutlinks ; // replaced m_linkCount
int32_t m_tmp ; // used by Msg40.cpp for event merge
@ -406,6 +406,7 @@ public:
int32_t m_timeLinkSpam ; // set for m_getLinkText
void *m_parentOwner;
char m_constructorId;
char m_inlinkWeight ; // set for m_getLinkText
char m_isLinkSpam ; // set for m_getLinkText
char m_isAnomaly ; // set for m_getLinkText

@ -937,8 +937,10 @@ void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) {
else if ( r->m_url[0] ) {
// get it
int64_t uh48 = g_titledb.getUrlHash48(k);
// sanity check
if ( st->m_uh48 == 0 ) { char *xx=NULL;*xx=0; }
// sanity check. MDW: looks like we allow 0 to
// be a valid hash. so let this through. i've seen
// it core here before.
//if ( st->m_uh48 == 0 ) { char *xx=NULL;*xx=0; }
// make sure our available docids are available!
if ( dd == ad1 ) ad1++;
if ( dd == ad2 ) ad2++;

Msg3.cpp (185 changed lines)

@ -39,6 +39,86 @@ void Msg3::reset() {
m_alloc = NULL;
}
key192_t makeCacheKey ( int64_t vfd ,
int64_t offset ,
int64_t readSize ) {
key192_t k;
k.n2 = vfd;
k.n1 = readSize;
k.n0 = offset;
return k;
}
RdbCache g_rdbCaches[5];
class RdbCache *getDiskPageCache ( char rdbId ) {
RdbCache *rpc = NULL;
int64_t *maxSizePtr = NULL;
int64_t maxMem;
int64_t maxRecs;
char *dbname;
if ( rdbId == RDB_POSDB ) {
rpc = &g_rdbCaches[0];
maxSizePtr = &g_conf.m_posdbFileCacheSize;
maxMem = *maxSizePtr;
maxRecs = maxMem / 5000;
dbname = "posdbcache";
}
if ( rdbId == RDB_TAGDB ) {
rpc = &g_rdbCaches[1];
maxSizePtr = &g_conf.m_tagdbFileCacheSize;
maxMem = *maxSizePtr;
maxRecs = maxMem / 200;
dbname = "tagdbcache";
}
if ( rdbId == RDB_CLUSTERDB ) {
rpc = &g_rdbCaches[2];
maxSizePtr = &g_conf.m_clusterdbFileCacheSize;
maxMem = *maxSizePtr;
maxRecs = maxMem / 32;
dbname = "clustcache";
}
if ( rdbId == RDB_TITLEDB ) {
rpc = &g_rdbCaches[3];
maxSizePtr = &g_conf.m_titledbFileCacheSize;
maxMem = *maxSizePtr;
maxRecs = maxMem / 3000;
dbname = "titdbcache";
}
if ( rdbId == RDB_SPIDERDB ) {
rpc = &g_rdbCaches[4];
maxSizePtr = &g_conf.m_spiderdbFileCacheSize;
maxMem = *maxSizePtr;
maxRecs = maxMem / 3000;
dbname = "spdbcache";
}
if ( ! rpc )
return NULL;
if ( maxMem < 0 ) maxMem = 0;
// did size change? if not, return it
if ( rpc->m_maxMem == maxMem )
return rpc;
// re-init or init for the first time here
if ( ! rpc->init ( maxMem ,
-1 , // fixedDataSize. -1 since we are lists
false , // support lists?
maxRecs ,
false , // use half keys?
dbname ,
false , // loadfromdisk
sizeof(key192_t), // cache key size
0 , // data key size
-1 ) ) // numptrsmax
return NULL;
return rpc;
}
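(Condensed view of the round trip through this cache, stitched together from the readList() and doneScanning() hunks below; variable names come from those hunks.)

RdbCache *rpc = getDiskPageCache ( m_rdbId );
int64_t   vfd = ff->getVfd();   // -1 until the BigFile is opened
key192_t  ck  = makeCacheKey ( vfd , offset , bytesToRead );
char *rec; int32_t recSize;
if ( rpc && vfd != -1 &&
     rpc->getRecord ( (collnum_t)0 , (char *)&ck , &rec , &recSize ,
		      true , -1 , true ) ) {
	// hit: rec[0] is RdbScan::m_shifted, rec+1 is the raw list
}
else {
	// miss: read from disk, then store [m_shifted][list] under ck
	rpc->addRecord ( (collnum_t)0 , (char *)&ck ,
			 &m_scans[i].m_shifted , 1 ,
			 m_lists[i].getList() , m_lists[i].getListSize() ,
			 0 ); // timestamp, 0 = now
}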
// . return false if blocked, true otherwise
// . set g_errno on error
// . read list of keys in [startKey,endKey] range
@ -81,6 +161,10 @@ bool Msg3::readList ( char rdbId ,
bool justGetEndKey ,
bool allowPageCache ,
bool hitDisk ) {
// set this to true to validate
m_validateCache = false;//true;
// clear, this MUST be done so if we return true g_errno is correct
g_errno = 0;
// assume lists are not checked for corruption
@ -599,6 +683,48 @@ bool Msg3::readList ( char rdbId ,
break;
}
////////
//
// try to get from PAGE CACHE
//
////////
BigFile *ff = base->getFile(m_fileNums[i]);
RdbCache *rpc = getDiskPageCache ( m_rdbId );
if ( ! m_allowPageCache ) rpc = NULL;
// . vfd is unique 64 bit file id
// . if file is opened vfd is -1, only set in call to open()
int64_t vfd = ff->getVfd();
key192_t ck = makeCacheKey ( vfd , offset, bytesToRead);
char *rec; int32_t recSize;
bool inCache = false;
if ( rpc && vfd != -1 && ! m_validateCache )
inCache = rpc->getRecord ( (collnum_t)0 , // collnum
(char *)&ck ,
&rec ,
&recSize ,
true , // copy?
-1 , // maxAge, none
true ); // inccounts?
m_scans[i].m_inPageCache = false;
if ( inCache ) {
m_scans[i].m_inPageCache = true;
m_numScansCompleted++;
// now we have to store this value, 6 or 12 so
// we can modify the hint appropriately
m_scans[i].m_shifted = *rec;
m_lists[i].set ( rec +1,
recSize-1 ,
rec , // alloc
recSize , // allocSize
startKey2 ,
endKey2 ,
base->m_fixedDataSize ,
true , // owndata
base->useHalfKeys() ,
getKeySizeFromRdbId ( m_rdbId ) );
continue;
}
// . do the scan/read of file #i
// . this returns false if blocked, true otherwise
// . this will set g_errno on error
@ -812,6 +938,10 @@ bool Msg3::doneScanning ( ) {
}
#endif
// try to fix this error i've seen
if ( g_errno == EBADENGINEER && max == -1 )
max = 100;
// . if we had a ETRYAGAIN error, then try again now
// . it usually means the whole file or a part of it was deleted
// before we could finish reading it, so we should re-read all now
@ -932,6 +1062,60 @@ bool Msg3::doneScanning ( ) {
// files we were reading, i've seen 'ff' be NULL
char *filename = "lostfilename";
if ( ff ) filename = ff->getFilename();
// compute cache info
RdbCache *rpc = getDiskPageCache ( m_rdbId );
if ( ! m_allowPageCache ) rpc = NULL;
int64_t vfd ;
if ( ff ) vfd = ff->getVfd();
key192_t ck ;
if ( ff )
ck = makeCacheKey ( vfd ,
m_scans[i].m_offset ,
m_scans[i].m_bytesToRead );
if ( m_validateCache && ff && rpc && vfd != -1 ) {
bool inCache;
char *rec; int32_t recSize;
inCache = rpc->getRecord ( (collnum_t)0 , // collnum
(char *)&ck ,
&rec ,
&recSize ,
true , // copy?
-1 , // maxAge, none
true ); // inccounts?
if ( inCache &&
// 1st byte is RdbScan::m_shifted
( m_lists[i].m_listSize != recSize-1 ||
memcmp ( m_lists[i].m_list , rec+1,recSize-1) ||
*rec != m_scans[i].m_shifted ) ) {
log("msg3: cache did not validate");
char *xx=NULL;*xx=0;
}
mfree ( rec , recSize , "vca" );
}
///////
//
// STORE IN PAGE CACHE
//
///////
// store what we read in the cache. don't bother storing
// if it was a retry, just in case something strange happened.
// storing before the constrain call is more efficient.
if ( m_retryNum<=0 && ff && rpc && vfd != -1 &&
! m_scans[i].m_inPageCache )
rpc->addRecord ( (collnum_t)0 , // collnum
(char *)&ck ,
// rec1 is this little thingy
&m_scans[i].m_shifted,
1,
// rec2
m_lists[i].getList() ,
m_lists[i].getListSize() ,
0 ); // timestamp. 0 = now
// if from our 'page' cache, no need to constrain
if ( ! m_lists[i].constrain ( m_startKey ,
m_constrainKey , // m_endKey
mrs , // m_minRecSizes
@ -947,6 +1131,7 @@ bool Msg3::doneScanning ( ) {
mstrerror(g_errno), ff->getDir(),
ff->getFilename(), ff->m_vfd ,
(int32_t)ff->m_numParts );
continue;
}
}

Msg3.h (8 changed lines)

@ -3,8 +3,10 @@
// . gets an RdbList from disk
// . reads from N specified files and stores results in N RdbLists
#ifndef _MSG3_H_
#define _MSG3_H_
#ifndef MSG3_H
#define MSG3_H
class RdbCache *getDiskPageCache ( char rdbId ) ;
// . max # of rdb files an rdb can have w/o merging
// . merge your files to keep the number of them low to cut down # of seeks
@ -114,6 +116,8 @@ class Msg3 {
//char *m_coll;
collnum_t m_collnum;
bool m_validateCache;
// the scan classes, 1 per file, used to read from that file
RdbScan *m_scans ; // [ MAX_RDB_FILES ];

@ -154,6 +154,7 @@ void Msg39::getDocIds ( UdpSlot *slot ) {
int32_t requestSize = m_slot->m_readBufSize;
// ensure it's size is ok
if ( requestSize < 8 ) {
BadReq:
g_errno = EBADREQUESTSIZE;
log(LOG_LOGIC,"query: msg39: getDocIds: %s." ,
mstrerror(g_errno) );
@ -169,7 +170,11 @@ void Msg39::getDocIds ( UdpSlot *slot ) {
m_r->m_buf );
// sanity check
if ( finalSize != requestSize ) {char *xx=NULL;*xx=0; }
if ( finalSize != requestSize ) {
log("msg39: sending bad request.");
goto BadReq;
//char *xx=NULL;*xx=0; }
}
getDocIds2 ( m_r );
}

@ -736,14 +736,6 @@ bool Msg3a::gotAllShardReplies ( ) {
// cast it and set it
m_reply [i] = mr;
m_replyMaxSize[i] = replyMaxSize;
// deserialize it (just sets the ptr_ and size_ member vars)
//mr->deserialize ( );
deserializeMsg ( sizeof(Msg39Reply) ,
&mr->size_docIds,
&mr->size_clusterRecs,
&mr->ptr_docIds,
mr->m_buf );
// sanity check
if ( mr->m_nqt != m_q->getNumTerms() ) {
g_errno = EBADREPLY;
@ -761,6 +753,20 @@ bool Msg3a::gotAllShardReplies ( ) {
mstrerror(g_errno));
return true;
}
// deserialize it (just sets the ptr_ and size_ member vars)
//mr->deserialize ( );
if ( ! deserializeMsg ( sizeof(Msg39Reply) ,
&mr->size_docIds,
&mr->size_clusterRecs,
&mr->ptr_docIds,
mr->m_buf ) ) {
g_errno = ECORRUPTDATA;
m_errno = ECORRUPTDATA;
log("query: msg3a: Shard had error: %s",
mstrerror(g_errno));
return true;
}
// skip down here if reply was already set
//skip:
// add of the total hits from each shard, this is how many
@ -1171,18 +1177,6 @@ bool Msg3a::mergeLists ( ) {
continue;
}
fe2->m_count += fe->m_count;
// also accumulate count of total docs, not just in
// the search results, that have this value for this
// facet
fe2->m_outsideSearchResultsCount +=
fe->m_outsideSearchResultsCount;
// prefer docid kinda randomly to balance
// lookupFacets() load in Msg40.cpp
if ( rand() % 2 )
fe2->m_docId = fe->m_docId;
if ( isFloat ) {
@ -1192,23 +1186,38 @@ bool Msg3a::mergeLists ( ) {
sum2 += sum1;
*((double *)&fe2->m_sum) = sum2;
// and min/max as floats
float min1 = *((float *)&fe ->m_min);
float min2 = *((float *)&fe2->m_min);
if ( min1 < min2 ) min2 = min1;
if ( fe2->m_count==0 || (fe->m_count!=0 && min1 < min2 )) min2 = min1;
*((float *)&fe2->m_min) = min2;
float max1 = *((float *)&fe ->m_max);
float max2 = *((float *)&fe2->m_max);
if ( max1 > max2 ) max2 = max1;
if ( fe2->m_count==0 || (fe->m_count!=0 && max1 > max2 )) max2 = max1;
*((float *)&fe2->m_max) = max2;
}
if ( isInt ) {
fe2->m_sum += fe->m_sum;
if ( fe->m_min < fe2->m_min )
if ( fe2->m_count==0 || (fe->m_count!=0 && fe->m_min < fe2->m_min ))
fe2->m_min = fe->m_min;
if ( fe->m_max > fe2->m_max )
if ( fe2->m_count==0 || (fe->m_count!=0 && fe->m_max > fe2->m_max ))
fe2->m_max = fe->m_max;
}
fe2->m_count += fe->m_count;
// also accumulate count of total docs, not just in
// the search results, that have this value for this
// facet
fe2->m_outsideSearchResultsCount +=
fe->m_outsideSearchResultsCount;
// prefer docid kinda randomly to balance
// lookupFacets() load in Msg40.cpp
if ( rand() % 2 )
fe2->m_docId = fe->m_docId;
}
// now get the next gbfacet: term if there was one

@ -696,7 +696,7 @@ bool Msg40::federatedLoop ( ) {
// and mult based on index size
numDocIdSplits *= mult;
// prevent going OOM for type:article AND html
//if ( numDocIdSplits < 5 ) numDocIdSplits = 5;
if ( numDocIdSplits < 5 ) numDocIdSplits = 5;
//}
if ( cr ) mr.m_maxQueryTerms = cr->m_maxQueryTerms;
@ -1071,7 +1071,7 @@ bool Msg40::reallocMsg20Buf ( ) {
// . allocate m_buf2 to hold all our Msg20 pointers and Msg20 classes
// . how much mem do we need?
// . need space for the msg20 ptrs
int32_t need = m_msg3a.m_numDocIds * sizeof(Msg20 *);
int64_t need = m_msg3a.m_numDocIds * sizeof(Msg20 *);
// need space for the classes themselves, only if "visible" though
for ( int32_t i = 0 ; i < m_msg3a.m_numDocIds ; i++ )
if ( m_msg3a.m_clusterLevels[i] == CR_OK )
@ -1243,6 +1243,13 @@ bool Msg40::reallocMsg20Buf ( ) {
m_buf2 = NULL;
m_bufMaxSize2 = need;
// if ( need > 2000000000 ) {
// log("msg40: need too much mem=%"INT64,need);
// m_errno = ENOMEM;
// g_errno = ENOMEM;
// return false;
// }
// do the alloc
if ( need ) m_buf2 = (char *)mmalloc ( need ,"Msg40msg20");
if ( need && ! m_buf2 ) { m_errno = g_errno; return false; }
@ -2030,7 +2037,7 @@ bool Msg40::gotSummary ( ) {
// . set it to true on all but the last thing we send!
// . after each chunk of data we send out, TcpServer::sendChunk
// will call our callback, doneSendingWrapper9
if ( m_si->m_streamResults )
if ( m_si->m_streamResults && st->m_socket )
st->m_socket->m_streamingMode = true;
@ -2112,7 +2119,7 @@ bool Msg40::gotSummary ( ) {
if ( g_conf.m_logDebugTcp )
log("tcp: disabling streamingMode now");
// this will be our final send
st->m_socket->m_streamingMode = false;
if ( st->m_socket ) st->m_socket->m_streamingMode = false;
}
@ -2120,6 +2127,24 @@ bool Msg40::gotSummary ( ) {
//g_conf.m_logDebugTcp = 1;
// do we still own this socket? i am thinking it got closed somewhere
// and the socket descriptor was re-assigned to another socket
// getting a diffbot reply from XmLDoc::getDiffbotReply()
if ( st->m_socket &&
st->m_socket->m_startTime != st->m_socketStartTimeHack ) {
log("msg40: lost control of socket. sd=%i. the socket "
"descriptor closed on us and got re-used by someone else.",
(int)st->m_socket->m_sd);
// if there wasn't already an error like 'broken pipe' then
// set it here so we stop getting summaries if streaming.
if ( ! m_socketHadError ) m_socketHadError = EBADENGINEER;
// make it NULL so we do not do anything to it
// since someone else is using it now.
st->m_socket = NULL;
//g_errno = EBADENGINEER;
}
// . transmit the chunk in sb if non-zero length
// . steals the allocated buffer from sb and stores in the
// TcpSocket::m_sendBuf, which it frees when socket is
@ -2133,6 +2158,7 @@ bool Msg40::gotSummary ( ) {
if ( sb->length() &&
// did client browser close the socket on us midstream?
! m_socketHadError &&
st->m_socket &&
! tcp->sendChunk ( st->m_socket ,
sb ,
this ,
@ -2145,8 +2171,11 @@ bool Msg40::gotSummary ( ) {
// writing on closed socket?
if ( g_errno ) {
m_socketHadError = g_errno;
if ( ! m_socketHadError ) m_socketHadError = g_errno;
log("msg40: got tcp error : %s",mstrerror(g_errno));
// disown it here so we do not damage in case it gets
// reopened by someone else
st->m_socket = NULL;
}
// do we need to launch another batch of summary requests?
@ -2200,8 +2229,9 @@ bool Msg40::gotSummary ( ) {
//mdelete(st, sizeof(State0), "msg40st0");
//delete st;
// otherwise, all done!
log("msg40: did not send stuff from last summary. BUG "
"this=0x%"PTRFMT"",(PTRTYPE)this);
log("msg40: did not send last search result summary. "
"this=0x%"PTRFMT" because had error: %s",(PTRTYPE)this,
mstrerror(m_socketHadError));
return true;
}

@ -531,6 +531,10 @@ bool Msg5::readList ( ) {
int32_t niceness = m_niceness;
if ( niceness > 0 ) niceness = 2;
if ( m_isRealMerge ) niceness = 1;
bool allowPageCache = true;
// just in case cache is corrupted, do not use it for doing real
// merges, also it would kick out good lists we have in there already
if ( m_isRealMerge ) allowPageCache = false;
if ( compute ) {
m_msg3.readList ( m_rdbId ,
m_collnum ,
@ -547,7 +551,7 @@ bool Msg5::readList ( ) {
m_compensateForMerge ,
-1,//m_syncPoint ,
true , // just get endKey?
m_allowPageCache );
allowPageCache );
if ( g_errno ) {
log("db: Msg5: getting endKey: %s",mstrerrno(g_errno));
return true;

@ -3347,7 +3347,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"</tr>"
"<tr>"
"<td><b>Crawl Completion Time:</td>"
"<td><b>Last Crawl Completion Time:</td>"
"<td>%"UINT32"</td>"
"</tr>"
@ -3362,6 +3362,46 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"<td>%"INT32"</td>"
"</tr>"
, cr->m_diffbotCrawlName.getBufStart()
, (int32_t)cr->m_isCustomCrawl
, cr->m_diffbotToken.getBufStart()
, seedStr
, crawlStatus
, tmp.getBufStart()
, cr->m_diffbotCrawlStartTime
// this is 0 if not over yet
, cr->m_diffbotCrawlEndTime
, cr->m_spiderRoundNum
, cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider
);
// show crawlinfo crap
CrawlInfo *cis = (CrawlInfo *)cr->m_crawlInfoBuf.getBufStart();
sb.safePrintf("<tr><td><b>Ready Hosts</b></td><td>");
for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) {
CrawlInfo *ci = &cis[i];
if ( ! ci->m_hasUrlsReadyToSpider ) continue;
Host *h = g_hostdb.getHost ( i );
if ( ! h ) continue;
sb.safePrintf("<a href=http://%s:%i/crawlbot?c=%s>"
"%i</a> "
, iptoa(h->m_ip)
, (int)h->m_httpPort
, cr->m_coll
, (int)i
);
}
sb.safePrintf("</tr>\n");
sb.safePrintf(
// this will have to be in crawlinfo too!
//"<tr>"
@ -3416,24 +3456,6 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"</tr>"
, cr->m_diffbotCrawlName.getBufStart()
, (int32_t)cr->m_isCustomCrawl
, cr->m_diffbotToken.getBufStart()
, seedStr
, crawlStatus
, tmp.getBufStart()
, cr->m_diffbotCrawlStartTime
// this is 0 if not over yet
, cr->m_diffbotCrawlEndTime
, cr->m_spiderRoundNum
, cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider
, cr->m_globalCrawlInfo.m_objectsAdded -
cr->m_globalCrawlInfo.m_objectsDeleted
, cr->m_globalCrawlInfo.m_urlsHarvested

@ -75,6 +75,13 @@ void setInjectionRequestFromParms ( TcpSocket *sock ,
int32_t def = atoll(m->m_def);
*ii = (char)hr->getLong(m->m_cgi,def);
}
else if ( m->m_type == TYPE_IP ) {
char *ii = (char *)((char *)ir + m->m_off);
char *is = hr->getString(m->m_cgi,NULL);
*(int32_t *)ii = 0; // default ip to 0
// otherwise, set the ip
if ( is ) *(int32_t *)ii = atoip(is);
}
// if unsupported let developer know
else { char *xx=NULL;*xx=0; }
}
@ -581,11 +588,29 @@ bool sendHttpReply ( void *state ) {
//
////////////
XmlDoc *s_injectHead = NULL;
XmlDoc *s_injectTail = NULL;
XmlDoc *getInjectHead ( ) { return s_injectHead; }
// send back a reply to the originator of the msg7 injection request
void sendUdpReply7 ( void *state ) {
XmlDoc *xd = (XmlDoc *)state;
// remove from linked list
if ( xd->m_nextInject )
xd->m_nextInject->m_prevInject = xd->m_prevInject;
if ( xd->m_prevInject )
xd->m_prevInject->m_nextInject = xd->m_nextInject;
if ( s_injectHead == xd )
s_injectHead = xd->m_nextInject;
if ( s_injectTail == xd )
s_injectTail = xd->m_prevInject;
xd->m_nextInject = NULL;
xd->m_prevInject = NULL;
UdpSlot *slot = xd->m_injectionSlot;
uint32_t statColor = 0xccffcc;
@ -655,6 +680,19 @@ void handleRequest7 ( UdpSlot *slot , int32_t netnice ) {
xd->m_injectionSlot = slot;
xd->m_injectStartTime = gettimeofdayInMilliseconds();
// add to linked list
xd->m_nextInject = NULL;
xd->m_prevInject = NULL;
if ( s_injectTail ) {
s_injectTail->m_nextInject = xd;
xd->m_prevInject = s_injectTail;
s_injectTail = xd;
}
else {
s_injectHead = xd;
s_injectTail = xd;
}
if ( ! xd->injectDoc ( ir->ptr_url , // m_injectUrlBuf.getBufStart() ,
cr ,
ir->ptr_content , // start , // content ,

@ -1,6 +1,10 @@
#ifndef GBINJECT_H
#define GBINJECT_H
// for getting list of injections currently being processed on this host
// for printing in the Spider Queue table in Spider.cpp
class XmlDoc *getInjectHead ( ) ;
void handleRequest7Import ( class UdpSlot *slot , int32_t netnice ) ;
void handleRequest7 ( class UdpSlot *slot , int32_t netnice ) ;
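(Hypothetical consumer of the new hooks, e.g. the Spider Queue table mentioned above: walk the in-flight injections through the list links set up in PageInject.cpp.)

for ( XmlDoc *xd = getInjectHead() ; xd ; xd = xd->m_nextInject ) {
	// m_injectStartTime was stamped when the injection arrived
	int64_t ageMs = gettimeofdayInMilliseconds() -
			xd->m_injectStartTime;
	log ( "inject: injection in flight for %"INT64" ms" , ageMs );
}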

@ -68,8 +68,12 @@ bool sendReply ( State0 *st , char *reply ) {
int32_t savedErr = g_errno;
TcpSocket *s = st->m_socket;
if ( ! s ) { char *xx=NULL;*xx=0; }
TcpSocket *sock = st->m_socket;
if ( ! sock ) {
log("results: not sending back results on an empty socket."
"socket must have closed on us abruptly.");
//char *xx=NULL;*xx=0; }
}
SearchInput *si = &st->m_si;
char *ct = "text/html";
if ( si && si->m_format == FORMAT_XML ) ct = "text/xml";
@ -143,7 +147,8 @@ bool sendReply ( State0 *st , char *reply ) {
//
// send back the actual search results
//
g_httpServer.sendDynamicPage(s,
if ( sock )
g_httpServer.sendDynamicPage(sock,
reply,
rlen,//gbstrlen(reply),
// don't let the ajax re-gen
@ -199,9 +204,9 @@ bool sendReply ( State0 *st , char *reply ) {
// if we had a broken pipe from the browser while sending
// them the search results, then we end up closing the socket fd
// in TcpServer::sendChunk() > sendMsg() > destroySocket()
if ( s->m_numDestroys ) {
if ( sock && sock->m_numDestroys ) {
log("results: not sending back error on destroyed socket "
"sd=%"INT32"",s->m_sd);
"sd=%"INT32"",sock->m_sd);
return true;
}
@ -212,7 +217,8 @@ bool sendReply ( State0 *st , char *reply ) {
savedErr == ENOCOLLREC)
status = 400;
g_httpServer.sendQueryErrorReply(s,
if ( sock )
g_httpServer.sendQueryErrorReply(sock,
status,
mstrerror(savedErr),
format,//xml,
@ -542,6 +548,9 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
// set this in case SearchInput::set fails!
st->m_socket = s;
// record timestamp so we know if we got our socket closed and swapped
st->m_socketStartTimeHack = s->m_startTime;
// save this count so we know if TcpServer.cpp calls destroySocket(s)
st->m_numDestroys = s->m_numDestroys;
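The two saved values above make stale-socket detection possible later on: if TcpServer destroys or recycles the fd, either the destroy count or the start time will no longer match. A sketch of the check, as a hypothetical helper (not part of this patch):
	// return true only if st->m_socket is still the same live
	// connection we started with
	static bool socketStillOurs ( State0 *st ) {
		TcpSocket *sock = st->m_socket;
		if ( ! sock ) return false;
		// destroySocket() was called on it at least once since
		if ( sock->m_numDestroys != st->m_numDestroys ) return false;
		// fd was recycled for a brand new connection
		if ( sock->m_startTime != st->m_socketStartTimeHack ) return false;
		return true;
	}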
@ -1154,6 +1163,16 @@ bool gotResults ( void *state ) {
SearchInput *si = &st->m_si;
// if we lost the socket because we were streaming and it
// got closed from a broken pipe or something, then Msg40.cpp
// will set st->m_socket to NULL if the fd ends up getting closed,
// because someone else might be using it and we do not want to
// mess with their TcpSocket settings.
if ( ! st->m_socket ) {
log("results: socket is NULL. sending failed.");
return sendReply(st,NULL);
}
// if in streaming mode and we never sent anything and we had
// an error, then send that back. we never really entered streaming
// mode in that case. this happens when someone deletes a coll
@ -1164,6 +1183,23 @@ bool gotResults ( void *state ) {
st->m_socket->m_totalSent == 0 )
return sendReply(st,NULL);
// if we skipped a shard because it was dead, usually we provide
// the results anyway, but if 'return results anyway' is disabled
// then return an error code instead. this is the 'all or nothing'
// switch.
if ( msg40->m_msg3a.m_skippedShards > 0 &&
! g_conf.m_returnResultsAnyway ) {
char reply[256];
sprintf ( reply ,
"%"INT32" shard(s) out of %"INT32" did not "
"respond to query."
, msg40->m_msg3a.m_skippedShards
, g_hostdb.m_numShards );
g_errno = ESHARDDOWN;
return sendReply(st,reply);
}
// if already printed from Msg40.cpp, bail out now
if ( si->m_streamResults ) {
// this will be our final send
@ -1220,10 +1256,21 @@ bool gotResults ( void *state ) {
// into it, and it must be the SAME ptr too!
CollectionRec *cr = si->m_cr;//g_collectiondb.getRec ( collnum );
if ( ! cr ) { // || cr != si->m_cr ) {
g_errno = ENOCOLLREC;
return sendReply(st,NULL);
g_errno = ENOCOLLREC;
return sendReply(st,NULL);
}
// this causes ooms everywhere, not a good fix
if ( ! msg40->m_msg20 && ! si->m_docIdsOnly && msg40->m_errno ) {
log("msg40: failed to get results q=%s",si->m_q.m_orig);
//g_errno = ENOMEM;
g_errno = msg40->m_errno;
return sendReply(st,NULL);
}
//char *coll = cr->m_coll;
/*
@ -3926,6 +3973,8 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
ix, (int32_t)msg40->getClusterLevel(ix));
int64_t d = msg40->getDocId(ix);
// this is normally a double, but cast to float
float docScore = (float)msg40->getScore(ix);
// do not print if it is a summary dup or had some error
// int32_t level = (int32_t)msg40->getClusterLevel(ix);
@ -5047,6 +5096,7 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
// . docId for possible cached link
// . might have merged a bunch together
sb->safePrintf("\t\t<docId>%"INT64"</docId>\n",mr->m_docId );
sb->safePrintf("\t\t<docScore>%f</docScore>\n",docScore);
}
if ( si->m_format == FORMAT_XML && mr->m_contentType != CT_STATUS ) {
@ -5097,6 +5147,7 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
// . docId for possible cached link
// . might have merged a bunch together
sb->safePrintf("\t\t\"docId\":%"INT64",\n",mr->m_docId );
sb->safePrintf("\t\t\"docScore\":%f,\n",docScore);
}
if ( si->m_format == FORMAT_JSON && mr->m_contentType != CT_STATUS ) {
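For illustration, with the two added safePrintf() calls a result entry now carries the float-cast score next to its docId, roughly like this (values hypothetical):
	XML:   <docId>1234567890</docId>
	       <docScore>1.234567</docScore>
	JSON:  "docId":1234567890,
	       "docScore":1.234567,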
@ -5943,15 +5994,15 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
"</numGoodSiteInlinks>\n",
(int32_t)mr->m_siteNumInlinks );
sb->safePrintf ("\t\t<numTotalSiteInlinks>%"INT32""
"</numTotalSiteInlinks>\n",
(int32_t)mr->m_siteNumInlinksTotal );
sb->safePrintf ("\t\t<numUniqueIpsLinkingToSite>%"INT32""
"</numUniqueIpsLinkingToSite>\n",
(int32_t)mr->m_siteNumUniqueIps );
sb->safePrintf ("\t\t<numUniqueCBlocksLinkingToSite>%"INT32""
"</numUniqueCBlocksLinkingToSite>\n",
(int32_t)mr->m_siteNumUniqueCBlocks );
// sb->safePrintf ("\t\t<numTotalSiteInlinks>%"INT32""
// "</numTotalSiteInlinks>\n",
// (int32_t)mr->m_siteNumInlinksTotal );
// sb->safePrintf ("\t\t<numUniqueIpsLinkingToSite>%"INT32""
// "</numUniqueIpsLinkingToSite>\n",
// (int32_t)mr->m_siteNumUniqueIps );
// sb->safePrintf("\t\t<numUniqueCBlocksLinkingToSite>%"INT32""
// "</numUniqueCBlocksLinkingToSite>\n",
// (int32_t)mr->m_siteNumUniqueCBlocks );
struct tm *timeStruct3;

@ -63,6 +63,7 @@ public:
bool m_didRedownload;
XmlDoc *m_xd;
int32_t m_oldContentHash32;
int64_t m_socketStartTimeHack;
};

@ -383,16 +383,17 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
"<td><b>hostname</b></td>";
}
UdpSlot *slot = server->m_head3;
int32_t callbackReadyCount = 0;
for ( ; slot ; slot = slot->m_next3 , callbackReadyCount++ );
//UdpSlot *slot = server->m_head3;
//int32_t callbackReadyCount = 0;
//for ( ; slot ; slot = slot->m_next3 , callbackReadyCount++ );
p->safePrintf ( "<table %s>"
"<tr class=hdrow><td colspan=19>"
"<center>"
//"<font size=+1>"
"<b>%s</b> (%"INT32" transactions)"
"(%"INT32" requests waiting to processed)"
//"(%"INT32" requests waiting to processed)"
"(%"INT32" incoming)"
//"</font>"
"</td></tr>"
"<tr bgcolor=#%s>"
@ -419,7 +420,8 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
"</tr>\n" ,
TABLE_STYLE,
title , server->getNumUsedSlots() ,
callbackReadyCount ,
//callbackReadyCount ,
server->getNumUsedSlotsIncoming() ,
DARK_BLUE ,
dd );

@ -30,6 +30,7 @@
#include "Sections.h"
//#include "Msg0.h" // g_termlistCache
#include "Msg13.h"
#include "Msg3.h"
bool printNumAbbr ( SafeBuf &p, int64_t vvv ) {
float val = (float)vvv;
@ -733,6 +734,18 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
p.safePrintf("<td>%"INT64"</td>",a);
}
p.safePrintf ("</tr>\n<tr class=poo><td><b><nobr>dropped recs</td>" );
for ( int32_t i = 0 ; i < numCaches ; i++ ) {
int64_t a = caches[i]->m_deletes;
p.safePrintf("<td>%"INT64"</td>",a);
}
p.safePrintf ("</tr>\n<tr class=poo><td><b><nobr>added recs</td>" );
for ( int32_t i = 0 ; i < numCaches ; i++ ) {
int64_t a = caches[i]->m_adds;
p.safePrintf("<td>%"INT64"</td>",a);
}
//p.safePrintf ("</tr>\n<tr class=poo><td><b><nobr>max age</td>" );
//for ( int32_t i = 0 ; i < numCaches ; i++ ) {
// int64_t a = caches[i]->getMaxMem();
@ -2076,64 +2089,72 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
*/
p.safePrintf("<tr class=poo><td><b>page cache hits %%</b></td>");
p.safePrintf("<tr class=poo><td><b>file cache hits %%</b></td>");
totalf = 0.0;
for ( int32_t i = 0 ; i < nr ; i++ ) {
if ( ! rdbs[i]->m_pc ) {
Rdb *rdb = rdbs[i];
RdbCache *rpc = getDiskPageCache ( rdb->m_rdbId );
if ( ! rpc ) {
p.safePrintf("<td>--</td>");
continue;
}
int64_t hits = rdbs[i]->m_pc->getNumHits();
int64_t misses = rdbs[i]->m_pc->getNumMisses();
int64_t hits = rpc->getNumHits();
int64_t misses = rpc->getNumMisses();
int64_t sum = hits + misses;
float val = 0.0;
if ( sum > 0.0 ) val = ((float)hits * 100.0) / (float)sum;
totalf += val;
p.safePrintf("<td>%.1f</td>",val);
//totalf += val;
p.safePrintf("<td>%.1f%%</td>",val);
}
p.safePrintf("<td>%.1f</td></tr>\n",totalf);
p.safePrintf("<td>--</td></tr>\n");
p.safePrintf("<tr class=poo><td><b>page cache hits</b></td>");
p.safePrintf("<tr class=poo><td><b>file cache hits</b></td>");
total = 0;
for ( int32_t i = 0 ; i < nr ; i++ ) {
if ( ! rdbs[i]->m_pc ) {
Rdb *rdb = rdbs[i];
RdbCache *rpc = getDiskPageCache ( rdb->m_rdbId );
if ( ! rpc ) {
p.safePrintf("<td>--</td>");
continue;
}
int64_t val = rdbs[i]->m_pc->getNumHits();
int64_t val = rpc->getNumHits();
total += val;
p.safePrintf("<td>%"UINT64"</td>",val);
}
p.safePrintf("<td>%"UINT64"</td></tr>\n",total);
p.safePrintf("<tr class=poo><td><b>page cache misses</b></td>");
p.safePrintf("<tr class=poo><td><b>file cache misses</b></td>");
total = 0;
for ( int32_t i = 0 ; i < nr ; i++ ) {
if ( ! rdbs[i]->m_pc ) {
Rdb *rdb = rdbs[i];
RdbCache *rpc = getDiskPageCache ( rdb->m_rdbId );
if ( ! rpc ) {
p.safePrintf("<td>--</td>");
continue;
}
int64_t val = rdbs[i]->m_pc->getNumMisses();
int64_t val = rpc->getNumMisses();
total += val;
p.safePrintf("<td>%"UINT64"</td>",val);
}
p.safePrintf("<td>%"UINT64"</td></tr>\n",total);
p.safePrintf("<tr class=poo><td><b>page cache tries</b></td>");
p.safePrintf("<tr class=poo><td><b>file cache tries</b></td>");
total = 0;
for ( int32_t i = 0 ; i < nr ; i++ ) {
if ( ! rdbs[i]->m_pc ) {
Rdb *rdb = rdbs[i];
RdbCache *rpc = getDiskPageCache ( rdb->m_rdbId );
if ( ! rpc ) {
p.safePrintf("<td>--</td>");
continue;
}
int64_t hits = rdbs[i]->m_pc->getNumHits();
int64_t misses = rdbs[i]->m_pc->getNumMisses();
int64_t hits = rpc->getNumHits();
int64_t misses = rpc->getNumMisses();
int64_t val = hits + misses;
total += val;
p.safePrintf("<td>%"UINT64"</td>",val);
@ -2141,28 +2162,60 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
p.safePrintf("<td>%"UINT64"</td></tr>\n",total);
p.safePrintf("<tr class=poo><td><b>page cache used</b></td>");
p.safePrintf("<tr class=poo><td><b>file cache adds</b></td>");
total = 0;
for ( int32_t i = 0 ; i < nr ; i++ ) {
if ( ! rdbs[i]->m_pc ) {
Rdb *rdb = rdbs[i];
RdbCache *rpc = getDiskPageCache ( rdb->m_rdbId );
if ( ! rpc ) {
p.safePrintf("<td>--</td>");
continue;
}
int64_t val = rdbs[i]->m_pc->getMemUsed();
p.safePrintf("<td>%"UINT64"</td>",rpc->m_adds);
}
p.safePrintf("<td>%"UINT64"</td></tr>\n",total);
p.safePrintf("<tr class=poo><td><b>file cache drops</b></td>");
total = 0;
for ( int32_t i = 0 ; i < nr ; i++ ) {
Rdb *rdb = rdbs[i];
RdbCache *rpc = getDiskPageCache ( rdb->m_rdbId );
if ( ! rpc ) {
p.safePrintf("<td>--</td>");
continue;
}
p.safePrintf("<td>%"UINT64"</td>",rpc->m_deletes);
}
p.safePrintf("<td>%"UINT64"</td></tr>\n",total);
p.safePrintf("<tr class=poo><td><b>file cache used</b></td>");
total = 0;
for ( int32_t i = 0 ; i < nr ; i++ ) {
Rdb *rdb = rdbs[i];
RdbCache *rpc = getDiskPageCache ( rdb->m_rdbId );
if ( ! rpc ) {
p.safePrintf("<td>--</td>");
continue;
}
int64_t val = rpc->getMemOccupied();
total += val;
printNumAbbr ( p , val );
}
p.safePrintf("<td>%"UINT64"</td></tr>\n",total);
p.safePrintf("<tr class=poo><td><b><nobr>page cache allocated</nobr></b></td>");
p.safePrintf("<tr class=poo><td><b><nobr>file cache allocated</nobr></b></td>");
total = 0;
for ( int32_t i = 0 ; i < nr ; i++ ) {
if ( ! rdbs[i]->m_pc ) {
Rdb *rdb = rdbs[i];
RdbCache *rpc = getDiskPageCache ( rdb->m_rdbId );
if ( ! rpc ) {
p.safePrintf("<td>--</td>");
continue;
}
int64_t val = rdbs[i]->m_pc->getMemAlloced();
int64_t val = rpc->getMemAlloced();
total += val;
printNumAbbr ( p , val );
}

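Each stats row above repeats the same lookup-and-divide pattern against the shared caches. A sketch of that pattern, assuming getDiskPageCache(rdbId) returns the per-rdb RdbCache or NULL exactly as in the hunks (the helper name is hypothetical):
	// hit rate for one rdb's file cache, in percent; -1 if no cache
	static float fileCacheHitPct ( char rdbId ) {
		RdbCache *rpc = getDiskPageCache ( rdbId );
		if ( ! rpc ) return -1.0;
		int64_t hits   = rpc->getNumHits();
		int64_t misses = rpc->getNumMisses();
		int64_t sum    = hits + misses;
		if ( sum <= 0 ) return 0.0;
		return ((float)hits * 100.0) / (float)sum;
	}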
@ -29,24 +29,32 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
int32_t loActive = q->m_loLaunched - q->m_loReturned;
int32_t mdActive = q->m_mdLaunched - q->m_mdReturned;
int32_t hiActive = q->m_hiLaunched - q->m_hiReturned;
int32_t total = loActive + mdActive + hiActive;
// int32_t loActive = q->m_loLaunched - q->m_loReturned;
// int32_t mdActive = q->m_mdLaunched - q->m_mdReturned;
// int32_t hiActive = q->m_hiLaunched - q->m_hiReturned;
// int32_t total = loActive + mdActive + hiActive;
int32_t total = q->m_launched - q->m_returned;
p.safePrintf ( "<table %s>"
"<tr class=hdrow><td colspan=\"11\">"
//"<center>"
//"<font size=+1>"
"<b>Thread Type: %s"
" (low: %"INT32""
" med: %"INT32""
" high: %"INT32""
" total: %"INT32")</td></tr>",
// " (low: %"INT32""
// " med: %"INT32""
// " high: %"INT32""
" (launched: %"INT32" "
"returned: %"INT32" "
"total: %"INT32" maxpossibleout: %i)</td></tr>",
TABLE_STYLE,
q->getThreadType(),
loActive, mdActive,
hiActive, total);
// loActive, mdActive,
// hiActive,
(int32_t)q->m_launched,
(int32_t)q->m_returned,
total,
(int)MAX_STACKS);
p.safePrintf ("<tr bgcolor=#%s>"
@ -59,19 +67,20 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
"<td><b>Callback</b></td>"
"<td><b>Routine</b></td>"
"<td><b>Bytes Done</b></td>"
"<td><b>KBytes/Sec</b></td>"
"<td><b>Megabytes/Sec</b></td>"
"<td><b>Read|Write</b></td>"
"</tr>"
, LIGHT_BLUE
);
for ( int32_t j = 0 ; j < q->m_top ; j++ ) {
for ( int32_t j = 0 ; j < q->m_maxEntries ; j++ ) {
ThreadEntry *t = &q->m_entries[j];
if(!t->m_isOccupied) continue;
FileState *fs = (FileState *)t->m_state;
bool diskThread = false;
if(q->m_threadType == DISK_THREAD && fs) diskThread = true;
if(q->m_threadType == DISK_THREAD && fs)
diskThread = true;
// might have got pre-called from EDISKSTUCK
if ( ! t->m_callback ) fs = NULL;
@ -81,18 +90,29 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
if(t->m_isDone) {
p.safePrintf("<td><font color='red'><b>done</b></font></td>");
p.safePrintf("<td>%"INT32"</td>", t->m_niceness);
p.safePrintf("<td>%"INT64"</td>", t->m_launchedTime - t->m_queuedTime); //queued
p.safePrintf("<td>%"INT64"</td>", t->m_exitTime - t->m_launchedTime); //run time
p.safePrintf("<td>%"INT64"</td>", now - t->m_exitTime); //cleanup
p.safePrintf("<td>%"INT64"</td>", now - t->m_queuedTime); //total
p.safePrintf("<td>%"INT64"ms</td>", t->m_launchedTime - t->m_queuedTime); //queued
p.safePrintf("<td>%"INT64"ms</td>", t->m_exitTime - t->m_launchedTime); //run time
p.safePrintf("<td>%"INT64"ms</td>", now - t->m_exitTime); //cleanup
p.safePrintf("<td>%"INT64"ms</td>", now - t->m_queuedTime); //total
p.safePrintf("<td>%s</td>", g_profiler.getFnName((PTRTYPE)t->m_callback));
p.safePrintf("<td>%s</td>", g_profiler.getFnName((PTRTYPE)t->m_startRoutine));
if(diskThread && fs) {
int64_t took = (t->m_exitTime - t->m_launchedTime);
if(took <= 0) took = 1;
p.safePrintf("<td>%"INT32"/%"INT32"</td>", t->m_bytesToGo, t->m_bytesToGo);
p.safePrintf("<td>%.2f kbps</td>", (float)t->m_bytesToGo/took);
p.safePrintf("<td>%s</td>",t->m_doWrite? "Write":"Read");
char *sign = "";
if(took <= 0) {sign=">";took = 1;}
p.safePrintf("<td>%"INT32"/%"INT32""
"</td>",
t->m_bytesToGo,
t->m_bytesToGo);
p.safePrintf("<td>%s%.2f MB/s</td>",
sign,
(float)t->m_bytesToGo/
(1024.0*1024.0)/
((float)took/1000.0));
p.safePrintf("<td>%s</td>",
t->m_doWrite?
"<font color=red>"
"Write</font>":"Read");
}
else {
p.safePrintf("<td>--</td>");
@ -113,7 +133,7 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
int64_t took = (now - t->m_launchedTime);
if(took <= 0) took = 1;
p.safePrintf("<td>%c%c%c/%"INT32"</td>", '?','?','?',t->m_bytesToGo);
p.safePrintf("<td>%.2f kbps</td>", 0.0);//(float)fs->m_bytesDone/took);
p.safePrintf("<td>%.2f MB/s</td>", 0.0);//(float)fs->m_bytesDone/took);
p.safePrintf("<td>%s</td>",t->m_doWrite? "Write":"Read");
}
else {
@ -151,7 +171,7 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
}
/*
int32_t loActiveBig = disk->m_loLaunchedBig - disk->m_loReturnedBig;
int32_t loActiveMed = disk->m_loLaunchedMed - disk->m_loReturnedMed;
int32_t loActiveSma = disk->m_loLaunchedSma - disk->m_loReturnedSma;
@ -208,7 +228,7 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
"<td><b>Active Write Threads</b></td><td>%"INT32"</td>"
"</tr></table>",
activeWrites);
*/
return g_httpServer.sendDynamicPage ( s , (char*) p.getBufStart() ,
p.length() );

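The done-thread rows above convert a byte count and a millisecond duration into MB/s, clamping the duration to 1ms so instantaneous reads print as an upper bound (the ">" sign). The same arithmetic as a tiny hypothetical helper:
	// bytes over elapsed milliseconds -> megabytes per second
	static float mbPerSec ( int64_t bytes , int64_t tookMs ) {
		if ( tookMs <= 0 ) tookMs = 1; // avoid divide by zero
		return ((float)bytes/(1024.0*1024.0)) / ((float)tookMs/1000.0);
	}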
@ -11,6 +11,7 @@
#include "PageParser.h" // g_inPageParser
#include "Users.h"
#include "Rebalance.h"
#include "Profiler.h"
// a global class extern'd in Pages.h
Pages g_pages;
@ -4664,9 +4665,15 @@ bool printRedBox ( SafeBuf *mb , TcpSocket *sock , HttpRequest *hr ) {
mb->safePrintf("%s",boxEnd);
}
if ( g_profiler.m_realTimeProfilerRunning ) {
if ( adds ) mb->safePrintf("<br>");
adds++;
mb->safePrintf("%s",box);
mb->safePrintf("Profiler is running. Performance is "
"somewhat compromised. Disable on the "
"profiler page.");
mb->safePrintf("%s",boxEnd);
}
if ( g_pingServer.m_hostsConfInDisagreement ) {
if ( adds ) mb->safePrintf("<br>");

Parms.cpp (384 changed lines)

@ -5207,15 +5207,15 @@ void Parms::init ( ) {
m->m_obj = OBJ_CONF;
m++;
m->m_title = "tagdb max page cache mem";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_tagdbMaxDiskPageCacheMem - g;
m->m_def = "200000";
m->m_type = TYPE_LONG;
m->m_flags = PF_NOSYNC|PF_NOAPI;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m++;
// m->m_title = "tagdb max page cache mem";
// m->m_desc = "";
// m->m_off = (char *)&g_conf.m_tagdbMaxDiskPageCacheMem - g;
// m->m_def = "200000";
// m->m_type = TYPE_LONG;
// m->m_flags = PF_NOSYNC|PF_NOAPI;
// m->m_page = PAGE_NONE;
// m->m_obj = OBJ_CONF;
// m++;
//m->m_title = "tagdb max cache mem";
//m->m_desc = "";
@ -5244,15 +5244,15 @@ void Parms::init ( ) {
m->m_obj = OBJ_CONF;
m++;
m->m_title = "catdb max page cache mem";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_catdbMaxDiskPageCacheMem - g;
m->m_def = "25000000";
m->m_type = TYPE_LONG;
m->m_flags = PF_NOSYNC|PF_NOAPI;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m++;
// m->m_title = "catdb max page cache mem";
// m->m_desc = "";
// m->m_off = (char *)&g_conf.m_catdbMaxDiskPageCacheMem - g;
// m->m_def = "25000000";
// m->m_type = TYPE_LONG;
// m->m_flags = PF_NOSYNC|PF_NOAPI;
// m->m_page = PAGE_NONE;
// m->m_obj = OBJ_CONF;
// m++;
m->m_title = "catdb max cache mem";
m->m_desc = "";
@ -5523,15 +5523,15 @@ void Parms::init ( ) {
m++;
*/
m->m_title = "linkdb max page cache mem";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_linkdbMaxDiskPageCacheMem - g;
m->m_def = "0";
m->m_type = TYPE_LONG;
m->m_flags = PF_NOSYNC|PF_NOAPI;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m++;
// m->m_title = "linkdb max page cache mem";
// m->m_desc = "";
// m->m_off = (char *)&g_conf.m_linkdbMaxDiskPageCacheMem - g;
// m->m_def = "0";
// m->m_type = TYPE_LONG;
// m->m_flags = PF_NOSYNC|PF_NOAPI;
// m->m_page = PAGE_NONE;
// m->m_obj = OBJ_CONF;
// m++;
/*
// this is overridden by collection
@ -5657,15 +5657,15 @@ void Parms::init ( ) {
m->m_obj = OBJ_CONF;
m++;
m->m_title = "statsdb max disk page cache mem";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_statsdbMaxDiskPageCacheMem - g;
m->m_def = "1000000";
m->m_type = TYPE_LONG;
m->m_flags = PF_NOSYNC|PF_NOAPI;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_CONF;
m++;
// m->m_title = "statsdb max disk page cache mem";
// m->m_desc = "";
// m->m_off = (char *)&g_conf.m_statsdbMaxDiskPageCacheMem - g;
// m->m_def = "1000000";
// m->m_type = TYPE_LONG;
// m->m_flags = PF_NOSYNC|PF_NOAPI;
// m->m_page = PAGE_NONE;
// m->m_obj = OBJ_CONF;
// m++;
//m->m_title = "statsdb min files to merge";
//m->m_desc = "";
@ -9939,6 +9939,21 @@ void Parms::init ( ) {
m->m_obj = OBJ_CONF;
m++;
m->m_title = "return results even if a shard is down";
m->m_desc = "If you turn this off then Gigablast will return "
"an error message if a shard was down and did not return "
"results for a query. The XML and JSON feed let's you know "
"when a shard is down and will give you the results back "
"any way, but if you would rather have just and error message "
"and no results, then set then set this to 'NO'.";
m->m_cgi = "rra";
m->m_off = (char *)&g_conf.m_returnResultsAnyway - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "max mem";
m->m_desc = "Mem available to this process. May be exceeded due "
"to fragmentation.";
@ -11273,20 +11288,6 @@ void Parms::init ( ) {
m++;
*/
m->m_title = "verify disk writes";
m->m_desc = "Read what was written in a verification step. Decreases "
"performance, but may help fight disk corruption mostly on "
"Maxtors and Western Digitals.";
m->m_cgi = "vdw";
m->m_off = (char *)&g_conf.m_verifyWrites - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
// this is ifdef'd out in Msg3.cpp for performance reasons,
// so do it here, too
#ifdef GBSANITYCHECK
@ -11457,122 +11458,80 @@ void Parms::init ( ) {
m++;
*/
m->m_title = "use disk page cache for posdb";
m->m_desc = "Use disk page cache?";
m->m_cgi = "udpci";
m->m_off = (char *)&g_conf.m_useDiskPageCachePosdb - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_title = "posdb disk cache size";
m->m_desc = "How much file cache size to use in bytes? Posdb is "
"the index.";
m->m_cgi = "dpcsp";
m->m_off = (char *)&g_conf.m_posdbFileCacheSize - g;
m->m_type = TYPE_LONG_LONG;
m->m_def = "30000000";
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "use disk page cache for datedb";
m->m_desc = "Use disk page cache?";
m->m_cgi = "udpcd";
m->m_off = (char *)&g_conf.m_useDiskPageCacheDatedb - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_title = "tagdb disk cache size";
m->m_desc = "How much file cache size to use in bytes? Tagdb is "
"consulted at spider time and query time to determine "
"if a url or outlink is banned or what its siterank is, etc.";
m->m_cgi = "dpcst";
m->m_off = (char *)&g_conf.m_tagdbFileCacheSize - g;
m->m_type = TYPE_LONG_LONG;
m->m_def = "30000000";
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "use disk page cache for titledb";
m->m_desc = "Use disk page cache?";
m->m_cgi = "udpct";
m->m_off = (char *)&g_conf.m_useDiskPageCacheTitledb - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "use disk page cache for spiderdb";
m->m_desc = "Use disk page cache?";
m->m_cgi = "udpcs";
m->m_off = (char *)&g_conf.m_useDiskPageCacheSpiderdb - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
/*
m->m_title = "use disk page cache for urldb";
m->m_desc = "Use disk page cache?";
m->m_cgi = "udpcu";
m->m_off = (char *)&g_conf.m_useDiskPageCacheTfndb - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m++;
*/
m->m_title = "use disk page cache for tagdb";
m->m_desc = "Use disk page cache?";
m->m_cgi = "udpcg";
m->m_off = (char *)&g_conf.m_useDiskPageCacheTagdb - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_title = "clusterdb disk cache size";
m->m_desc = "How much file cache size to use in bytes? "
"Gigablast does a "
"lookup in clusterdb for each search result at query time to "
"get its site information for site clustering. If you "
"disable site clustering in the search controls then "
"clusterdb will not be consulted.";
m->m_cgi = "dpcsc";
m->m_off = (char *)&g_conf.m_clusterdbFileCacheSize - g;
m->m_type = TYPE_LONG_LONG;
m->m_def = "30000000";
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
m->m_title = "use disk page cache for checksumdb";
m->m_desc = "Use disk page cache?";
m->m_cgi = "udpck";
m->m_off = (char *)&g_conf.m_useDiskPageCacheChecksumdb - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_title = "titledb disk cache size";
m->m_desc = "How much file cache size to use in bytes? Titledb "
"holds the cached web pages, compressed. Gigablast consults "
"it to generate a summary for a search result, or to see if "
"a url Gigablast is spidering is already in the index.";
m->m_cgi = "dpcsx";
m->m_off = (char *)&g_conf.m_titledbFileCacheSize - g;
m->m_type = TYPE_LONG_LONG;
m->m_def = "30000000";
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
m->m_title = "use disk page cache for clusterdb";
m->m_desc = "Use disk page cache?";
m->m_cgi = "udpcl";
m->m_off = (char *)&g_conf.m_useDiskPageCacheClusterdb - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_title = "spiderdb disk cache size";
m->m_desc = "How much file cache size to use in bytes? Titledb "
"holds the cached web pages, compressed. Gigablast consults "
"it to generate a summary for a search result, or to see if "
"a url Gigablast is spidering is already in the index.";
m->m_cgi = "dpcsy";
m->m_off = (char *)&g_conf.m_spiderdbFileCacheSize - g;
m->m_type = TYPE_LONG_LONG;
m->m_def = "30000000";
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
m->m_title = "use disk page cache for catdb";
m->m_desc = "Use disk page cache?";
m->m_cgi = "udpca";
m->m_off = (char *)&g_conf.m_useDiskPageCacheCatdb - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "use disk page cache for linkdb";
m->m_desc = "Use disk page cache?";
m->m_cgi = "udpcnk";
m->m_off = (char *)&g_conf.m_useDiskPageCacheLinkdb - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
/*
m->m_title = "exclude link text";
@ -12448,8 +12407,20 @@ void Parms::init ( ) {
m->m_group = 0;
m++;
m->m_title = "verify disk writes";
m->m_desc = "Read what was written in a verification step. Decreases "
"performance, but may help fight disk corruption mostly on "
"Maxtors and Western Digitals.";
m->m_cgi = "vdw";
m->m_off = (char *)&g_conf.m_verifyWrites - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
m->m_title = "max spider read threads";
m->m_desc = "Maximum number of threads to use per Gigablast process "
@ -12460,7 +12431,7 @@ void Parms::init ( ) {
m->m_cgi = "smdt";
m->m_off = (char *)&g_conf.m_spiderMaxDiskThreads - g;
m->m_type = TYPE_LONG;
m->m_def = "30";
m->m_def = "20";
m->m_units = "threads";
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
@ -12468,13 +12439,16 @@ void Parms::init ( ) {
m->m_group = 0;
m++;
/*
m->m_title = "max spider big read threads";
m->m_desc = "This particular number applies to all disk "
"reads above 1MB.";
"reads above 1MB. "
"The number of total threads is also "
"limited to MAX_STACKS which is currently 20.";
m->m_cgi = "smbdt";
m->m_off = (char *)&g_conf.m_spiderMaxBigDiskThreads - g;
m->m_type = TYPE_LONG;
m->m_def = "8"; // 1
m->m_def = "2";
m->m_units = "threads";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
@ -12484,11 +12458,13 @@ void Parms::init ( ) {
m->m_title = "max spider medium read threads";
m->m_desc = "This particular number applies to all disk "
"reads above 100K.";
"reads above 100K. "
"The number of total threads is also "
"limited to MAX_STACKS which is currently 20.";
m->m_cgi = "smmdt";
m->m_off = (char *)&g_conf.m_spiderMaxMedDiskThreads - g;
m->m_type = TYPE_LONG;
m->m_def = "19"; // 3
m->m_def = "4";
m->m_units = "threads";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
@ -12498,18 +12474,37 @@ void Parms::init ( ) {
m->m_title = "max spider small read threads";
m->m_desc = "This particular number applies to all disk "
"reads above 1MB.";
"reads above 1MB. "
"The number of total threads is also "
"limited to MAX_STACKS which is currently 20.";
m->m_cgi = "smsdt";
m->m_off = (char *)&g_conf.m_spiderMaxSmaDiskThreads - g;
m->m_type = TYPE_LONG;
m->m_def = "20";
m->m_def = "15";
m->m_units = "threads";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
*/
m->m_title = "separate disk reads";
m->m_desc = "If enabled then we will not launch a low priority "
"disk read or write while a high priority is outstanding. "
"Help improve query response time at the expense of "
"spider performance.";
m->m_cgi = "sdt";
m->m_off = (char *)&g_conf.m_separateDiskReads - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = 0;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
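A sketch of what the new "separate disk reads" switch gates, under the assumption that the thread queue tracks outstanding high-priority disk ops (the member name below is an assumption, not from this patch):
	// hold back low priority disk ops while any high priority op
	// is still out, if g_conf.m_separateDiskReads is enabled
	bool canLaunchDiskOp ( ThreadQueue *q , int32_t niceness ) {
		if ( ! g_conf.m_separateDiskReads ) return true;
		if ( niceness > 0 && q->m_hiPriOut > 0 ) // assumed member
			return false;
		return true;
	}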
/*
m->m_title = "max query read threads";
m->m_desc = "Maximum number of threads to use per Gigablast process "
"for accessing the disk "
@ -12527,13 +12522,17 @@ void Parms::init ( ) {
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
*/
/*
m->m_title = "max query big read threads";
m->m_desc = "This particular number applies to all reads above 1MB.";
m->m_desc = "This particular number applies to all reads above 1MB. "
"The number of total threads is also "
"limited to MAX_STACKS which is currently 20.";
m->m_cgi = "qmbdt";
m->m_off = (char *)&g_conf.m_queryMaxBigDiskThreads - g;
m->m_type = TYPE_LONG;
m->m_def = "60"; // 1
m->m_def = "20"; // 1
m->m_units = "threads";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
@ -12543,11 +12542,13 @@ void Parms::init ( ) {
m->m_title = "max query medium read threads";
m->m_desc = "This particular number applies to all disk "
"reads above 100K.";
"reads above 100K. "
"The number of total threads is also "
"limited to MAX_STACKS which is currently 20.";
m->m_cgi = "qmmdt";
m->m_off = (char *)&g_conf.m_queryMaxMedDiskThreads - g;
m->m_type = TYPE_LONG;
m->m_def = "80"; // 3
m->m_def = "20"; // 3
m->m_units = "threads";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
@ -12557,17 +12558,20 @@ void Parms::init ( ) {
m->m_title = "max query small read threads";
m->m_desc = "This particular number applies to all disk "
"reads above 1MB.";
"reads above 1MB. "
"The number of total threads is also "
"limited to MAX_STACKS which is currently 20.";
m->m_cgi = "qmsdt";
m->m_off = (char *)&g_conf.m_queryMaxSmaDiskThreads - g;
m->m_type = TYPE_LONG;
m->m_def = "80";
m->m_def = "20";
m->m_units = "threads";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
*/
m->m_title = "min popularity for speller";
m->m_desc = "Word or phrase must be present in this percent "
@ -15102,6 +15106,19 @@ void Parms::init ( ) {
m->m_off = (char *)&ir.m_hopCount - (char *)&ir;
m++;
m->m_title = "url IP";
m->m_desc = "Use this IP when injecting the document. Do not use or "
"set to 0.0.0.0, if unknown. If provided, it will save an IP "
"lookup.";
m->m_cgi = "urlip";
m->m_obj = OBJ_IR;
m->m_type = TYPE_IP;
m->m_def = "0.0.0.0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&ir.m_injectDocIp - (char *)&ir;
m++;
m->m_title = "last spider time";
m->m_desc = "Override last time spidered";
m->m_cgi = "lastspidered";
@ -15208,7 +15225,10 @@ void Parms::init ( ) {
"Separate MIME from actual content with two returns. "
"At least put a single space in here if you want to "
"inject empty content, otherwise the content will "
"be downloaded from the url.";
"be downloaded from the url. This is because the "
"page injection form always submits the content text area "
"even if it is empty, which should signify that the "
"content should be downloaded.";
m->m_cgi = "content";
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
@ -15489,6 +15509,22 @@ void Parms::init ( ) {
m->m_obj = OBJ_COLL;
m++;
m->m_title = "do tagdb lookups for queries";
m->m_desc = "For each search result a tagdb lookup is made, "
"usually across the network on distributed clusters, to "
"see if the URL's site has been manually banned in tagdb. "
"If you don't manually ban sites then turn this off for "
"extra speed.";
m->m_cgi = "stgdbl";
m->m_off = (char *)&cr.m_doTagdbLookups - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 1;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "percent similar dedup summary default value";
m->m_desc = "If document summary (and title) are "
"this percent similar "
@ -16505,9 +16541,10 @@ void Parms::init ( ) {
m->m_flags = PF_CLONE;
m++;
m->m_title = "use robots.txt";
m->m_title = "obey robots.txt";
m->m_xml = "useRobotstxt";
m->m_desc = "If this is true Gigablast will respect "
"the robots.txt convention.";
"the robots.txt convention and rel no follow meta tags.";
m->m_cgi = "obeyRobots";
m->m_off = (char *)&cr.m_useRobotsTxt - x;
m->m_type = TYPE_BOOL;
@ -16517,6 +16554,18 @@ void Parms::init ( ) {
m->m_flags = PF_CLONE;
m++;
m->m_title = "obey rel no follow links";
m->m_desc = "If this is true Gigablast will respect "
"the rel no follow link attribute.";
m->m_cgi = "obeyRelNoFollow";
m->m_off = (char *)&cr.m_obeyRelNoFollowLinks - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
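A sketch of how the new per-collection flag would gate link harvesting, assuming the caller already parsed the link's rel attribute (the function is hypothetical; the XmlDoc/Links integration is not shown in this patch):
	// honor rel=nofollow only when the collection asks for it
	bool shouldFollowLink ( CollectionRec *cr , bool relNoFollow ) {
		if ( relNoFollow && cr->m_obeyRelNoFollowLinks ) return false;
		return true;
	}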
m->m_title = "max robots.txt cache age";
m->m_desc = "How many seconds to cache a robots.txt file for. "
"86400 is 1 day. 0 means Gigablast will not read from the "
@ -19729,6 +19778,16 @@ void Parms::init ( ) {
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug tcp buffer messages";
m->m_cgi = "ldtb";
m->m_off = (char *)&g_conf.m_logDebugTcpBuf - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug thread messages";
m->m_cgi = "ldth";
m->m_off = (char *)&g_conf.m_logDebugThread - g;
@ -22459,6 +22518,7 @@ bool Parm::printVal ( SafeBuf *sb , collnum_t collnum , int32_t occNum ) {
return sb->safePrintf("CMD");
if ( m_type == TYPE_IP )
// may print 0.0.0.0
return sb->safePrintf("%s",iptoa(*(int32_t *)val) );
log("parms: missing parm type!!");

@ -3228,9 +3228,11 @@ void doneSendingNotifyEmailWrapper ( void *state ) {
// wait for post url to get done
if ( ei->m_notifyBlocked > 0 ) return;
// unmark it
ei->m_inUse = false;
//ei->m_inUse = false;
// all done
ei->m_finalCallback ( ei->m_finalState );
// nuke it
mfree ( ei , sizeof(EmailInfo) ,"eialrt" );
}
void doneGettingNotifyUrlWrapper ( void *state , TcpSocket *sock ) {
@ -3242,9 +3244,11 @@ void doneGettingNotifyUrlWrapper ( void *state , TcpSocket *sock ) {
// wait for email to get done
if ( ei->m_notifyBlocked > 0 ) return;
// unmark it
ei->m_inUse = false;
//ei->m_inUse = false;
// all done
ei->m_finalCallback ( ei->m_finalState );
// nuke it
mfree ( ei , sizeof(EmailInfo) ,"eialrt" );
}
// for printCrawlDetailsInJson()
@ -3259,7 +3263,7 @@ bool sendNotification ( EmailInfo *ei ) {
//log("ping: NOT SENDING NOTIFICATION -- DEBUG!!");
//return true;
if ( ei->m_inUse ) { char *xx=NULL;*xx=0; }
//if ( ei->m_inUse ) { char *xx=NULL;*xx=0; }
// caller must set this, as well as m_finalCallback/m_finalState
CollectionRec *cr = g_collectiondb.m_recs[ei->m_collnum];
@ -3275,7 +3279,7 @@ bool sendNotification ( EmailInfo *ei ) {
// sanity check, can only call once
if ( ei->m_notifyBlocked != 0 ) { char *xx=NULL;*xx=0; }
ei->m_inUse = true;
//ei->m_inUse = true;
if ( email && email[0] ) {
@ -3371,7 +3375,9 @@ bool sendNotification ( EmailInfo *ei ) {
}
if ( ei->m_notifyBlocked == 0 ) {
ei->m_inUse = false;
//ei->m_inUse = false;
// nuke it
mfree ( ei , sizeof(EmailInfo) ,"eialrt" );
return true;
}

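The hunks above retire EmailInfo's m_inUse flag in favor of heap ownership: whoever observes m_notifyBlocked hit zero fires the final callback and then mfree()s the struct. A sketch of the resulting caller contract, assuming the usual true-means-done return convention and an mmalloc() allocation site (assumed, not shown in this patch):
	EmailInfo *ei = (EmailInfo *)mmalloc(sizeof(EmailInfo),"eialrt");
	// ... set m_collnum, m_finalCallback, m_finalState, etc. ...
	if ( sendNotification ( ei ) ) {
		// completed inline: ei has already been freed above
	}
	else {
		// blocked: the completion wrappers decrement
		// m_notifyBlocked and the last one to see it hit 0
		// calls m_finalCallback and frees ei. do not touch
		// ei after this point.
	}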
@ -30,16 +30,17 @@ public:
// ip address of MX record for this domain
int32_t m_mxIp;
int32_t m_notifyBlocked;
bool m_inUse;
class CollectionRec *m_collRec;
//bool m_inUse;
EmailInfo() {
memset ( this,0,sizeof(EmailInfo) );
};
void reset() {
if ( m_inUse ) { char *xx=NULL;*xx=0; }
if ( m_notifyBlocked ) { char *xx=NULL;*xx=0; }
memset ( this,0,sizeof(EmailInfo) );
};
//EmailInfo() {
// memset ( this,0,sizeof(EmailInfo) );
//};
//void reset() {
// if ( m_inUse ) { char *xx=NULL;*xx=0; }
// if ( m_notifyBlocked ) { char *xx=NULL;*xx=0; }
// memset ( this,0,sizeof(EmailInfo) );
//};
};
class PingServer {

@ -35,8 +35,8 @@ bool Placedb::init ( ) {
// . 25(treeoverhead) + 24(cacheoverhead) = 49
//int32_t maxCacheNodes = g_conf.m_placedbMaxCacheMem / 49;
// we now use a page cache
if ( ! m_pc.init ( "placedb",RDB_PLACEDB,pcmem,GB_INDEXDB_PAGE_SIZE ) )
return log("db: Placedb page cache init failed.");
// if (!m_pc.init("placedb",RDB_PLACEDB,pcmem,GB_INDEXDB_PAGE_SIZE ) )
// return log("db: Placedb page cache init failed.");
// initialize our own internal rdb
if ( ! m_rdb.init ( g_hostdb.m_dir,
"placedb" ,
@ -50,7 +50,7 @@ bool Placedb::init ( ) {
0 , // maxCacheNodes
false , // half keys?
false , // g_conf.m_placedbSaveCache
&m_pc ,
NULL,//&m_pc ,
false , // is titledb?
false , // preload page cache?
16 , // keysize

@ -47,9 +47,9 @@ class Placedb {
// this rdb holds urls waiting to be spidered or being spidered
Rdb m_rdb;
DiskPageCache *getDiskPageCache() { return &m_pc; };
//DiskPageCache *getDiskPageCache() { return &m_pc; };
DiskPageCache m_pc;
//DiskPageCache m_pc;
};
extern class Placedb g_placedb;

@ -125,19 +125,19 @@ bool Posdb::init ( ) {
int32_t nodeSize = (sizeof(key144_t)+12+4) + sizeof(collnum_t);
int32_t maxTreeNodes = maxTreeMem / nodeSize ;
int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
//int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
// we now use a disk page cache as opposed to the
// old rec cache. i am trying to do away with the Rdb::m_cache rec
// cache in favor of cleverly used disk page caches, because
// the rec caches are not real-time and get stale.
int32_t pcmem = 30000000; // 30MB
//int32_t pcmem = 30000000; // 30MB
// make sure at least 30MB
//if ( pcmem < 30000000 ) pcmem = 30000000;
// keep this low if we are the tmp cluster, 30MB
if ( g_hostdb.m_useTmpCluster && pcmem > 30000000 ) pcmem = 30000000;
//if ( g_hostdb.m_useTmpCluster && pcmem > 30000000 ) pcmem = 30000000;
// do not use any page cache if doing tmp cluster in order to
// prevent swapping
if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
//if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
// save more mem!!! allow os to cache it i guess...
// let's go back to using it
//pcmem = 0;
@ -145,11 +145,11 @@ bool Posdb::init ( ) {
//pcmem = 0;
// . init the page cache
// . MDW: "minimize disk seeks" not working otherwise i'd enable it!
if ( ! m_pc.init ( "posdb",
RDB_POSDB,
pcmem ,
pageSize ))
return log("db: Posdb init failed.");
// if ( ! m_pc.init ( "posdb",
// RDB_POSDB,
// pcmem ,
// pageSize ))
// return log("db: Posdb init failed.");
// . set our own internal rdb
// . max disk space for bin tree is same as maxTreeMem so that we
@ -174,7 +174,7 @@ bool Posdb::init ( ) {
// newer systems have tons of ram to use
// for their disk page cache. it is slower than
// ours but the new engine has much slower things
&m_pc ,
NULL,//&m_pc ,
false , // istitledb?
false , // preloaddiskpagecache?
sizeof(key144_t)
@ -918,6 +918,10 @@ bool PosdbTable::allocTopTree ( ) {
, (int32_t)m_r->m_numDocIdSplits
);
// keep it sane
if ( nn > m_r->m_docsToGet * 2 && nn > 60 )
nn = m_r->m_docsToGet * 2;
// this actually sets the # of nodes to MORE than nn!!!
if ( ! m_topTree->setNumNodes(nn,m_r->m_doSiteClustering)) {
log("toptree: toptree: error allocating nodes: %s",
@ -1007,8 +1011,9 @@ bool PosdbTable::allocTopTree ( ) {
continue;
// how big?
int64_t total = m_msg2->m_lists[i].getListSize();
// skip if empty
if ( total == 0 ) {
// skip if empty. no -- we could be doing a docid split that is
// empty while other splits are full
if ( total == 0 && m_r->m_numDocIdSplits <= 1 ) {
log("query: empty facets for term #%i",i);
continue;
}
@ -6639,7 +6644,12 @@ void PosdbTable::intersectLists10_r ( ) {
// synbits on it, below!!! or a half stop wiki bigram like
// the term "enough for" in the wiki phrase
// "time enough for love" because we wanna reward that more!
// this halfstopwikibigram bit is set in the individual keys
// so we'd have to at least do a key cleansing, so we can't
// do this shortcut right now... mdw oct 10 2015
if ( nsub == 1 &&
// need it for gbfacet termlists though it seems
(nwpFlags[0] & (BF_FACET|BF_NUMBER)) &&
!(nwpFlags[0] & BF_SYNONYM) &&
!(nwpFlags[0] & BF_HALFSTOPWIKIBIGRAM) ) {
miniMergedList [j] = nwp [0];
@ -6775,6 +6785,8 @@ void PosdbTable::intersectLists10_r ( ) {
nwp[mink] = NULL;
// avoid breach of core below now
if ( mptr < mptrEnd ) goto mergeMore;
// wrap it up here since done merging
miniMergedEnd[j] = mptr;
}
// breach?
@ -7563,6 +7575,7 @@ void PosdbTable::intersectLists10_r ( ) {
dcs.m_docLang = docLang;
// ensure enough room we can't allocate in a thread!
if ( m_scoreInfoBuf.getAvail()<(int32_t)sizeof(DocIdScore)+1){
goto advance;
char *xx=NULL;*xx=0; }
// if same as last docid, overwrite it since we have a higher
// siterank or langid i guess

@ -393,9 +393,9 @@ class Posdb {
Rdb m_rdb;
DiskPageCache *getDiskPageCache ( ) { return &m_pc; };
//DiskPageCache *getDiskPageCache ( ) { return &m_pc; };
DiskPageCache m_pc;
//DiskPageCache m_pc;
};
class FacetEntry {

@ -1515,15 +1515,16 @@ bool Process::shutdown2 ( ) {
static bool s_printed = false;
// wait for all threads to return
int32_t n = g_threads.getNumThreadsOutOrQueued() ;
//int32_t n = g_threads.getNumThreadsOutOrQueued() ;
int32_t n = g_threads.getNumWriteThreadsOut();
if ( n != 0 && ! m_urgent ) {
log(LOG_INFO,"gb: Has %"INT32" threads out. Waiting for "
log(LOG_INFO,"gb: Has %"INT32" write threads out. Waiting for "
"them to finish.",n);
return false;
}
else if ( ! s_printed && ! m_urgent ) {
s_printed = true;
log(LOG_INFO,"gb: No threads out.");
log(LOG_INFO,"gb: No write threads out.");
}
@ -1687,6 +1688,9 @@ bool Process::shutdown2 ( ) {
if ( g_process.m_threadOut )
log(LOG_INFO,"gb: still has hdtemp thread");
log("gb. EXITING.");
// exit abruptly
exit(0);
@ -1764,7 +1768,7 @@ bool Process::saveRdbTrees ( bool useThread , bool shuttingDown ) {
// no thread if shutting down
if ( shuttingDown ) useThread = false;
// debug note
log("gb: shuttingdown=%i",(int)shuttingDown);
if ( shuttingDown ) log("gb: trying to shutdown");
// turn off statsdb until everyone is done
//g_statsdb.m_disabled = true;
// loop over all Rdbs and save them
@ -2088,22 +2092,30 @@ void Process::resetAll ( ) {
resetTestIpTable();
}
#include "Msg3.h"
void Process::resetPageCaches ( ) {
log("gb: Resetting page caches.");
g_posdb .getDiskPageCache()->reset();
//g_datedb .getDiskPageCache()->reset();
g_linkdb .getDiskPageCache()->reset();
g_titledb .getDiskPageCache()->reset();
g_sectiondb .getDiskPageCache()->reset();
g_tagdb .getDiskPageCache()->reset();
g_spiderdb .getDiskPageCache()->reset();
//g_tfndb .getDiskPageCache()->reset();
//g_checksumdb .getDiskPageCache()->reset();
g_clusterdb .getDiskPageCache()->reset();
g_catdb .getDiskPageCache()->reset();
//g_placedb .getDiskPageCache()->reset();
g_doledb .getDiskPageCache()->reset();
//g_statsdb .getDiskPageCache()->reset();
for ( int32_t i = 0 ; i < RDB_END ; i++ ) {
RdbCache *rpc = getDiskPageCache ( i ); // rdbid = i
if ( ! rpc ) continue;
rpc->reset();
}
// g_posdb .getDiskPageCache()->reset();
// //g_datedb .getDiskPageCache()->reset();
// g_linkdb .getDiskPageCache()->reset();
// g_titledb .getDiskPageCache()->reset();
// g_sectiondb .getDiskPageCache()->reset();
// g_tagdb .getDiskPageCache()->reset();
// g_spiderdb .getDiskPageCache()->reset();
// //g_tfndb .getDiskPageCache()->reset();
// //g_checksumdb .getDiskPageCache()->reset();
// g_clusterdb .getDiskPageCache()->reset();
// g_catdb .getDiskPageCache()->reset();
// //g_placedb .getDiskPageCache()->reset();
// g_doledb .getDiskPageCache()->reset();
// //g_statsdb .getDiskPageCache()->reset();
}
// ============================================================================

@ -1451,10 +1451,17 @@ Profiler::getStackFrame(int sig) {
// profile once every 5ms, not every 1ms
static int32_t s_count = 0;
// turn off after 60 seconds of profiling
if ( m_totalFrames++ >= 60000 ) {
stopRealTimeProfiler(false);
return;
}
if ( ++s_count != 5 ) return;
s_count = 0;
// prevent cores.
// TODO: hack this to a function somehow...
// we set this to positive values when calling library functions like
@ -1463,6 +1470,9 @@ Profiler::getStackFrame(int sig) {
// somewhere. but for now just ignore.
if ( g_inMemcpy ) return;
// likewise, not if in system malloc since backtrace() mallocs
if ( g_inMemFunction ) return;
//void *trace[32];
// the innermost line number
@ -1584,6 +1594,7 @@ Profiler::startRealTimeProfiler() {
// }
init();
m_realTimeProfilerRunning = true;
m_totalFrames = 0;
// now Loop.cpp will call g_profiler.getStackFrame()
return;
@ -1854,7 +1865,7 @@ Profiler::printRealTimeInfo(SafeBuf *sb,
int fd = open ( filename , O_RDWR | O_CREAT , S_IRWXU );
if ( fd < 0 ) {
sb->safePrintf("FAILED TO OPEN %s for writing: %s"
,ff.getBufStart(),strerror(errno));
,ff.getBufStart(),mstrerror(errno));
return false;
}
for ( ; ip < ipEnd ; ip += sizeof(uint64_t) ) {
@ -1881,6 +1892,13 @@ Profiler::printRealTimeInfo(SafeBuf *sb,
// restrict to top 100 lines
char *x = out.getBufStart();
if ( ! x ) {
sb->safePrintf("FAILED TO READ trash/output.txt: %s"
,mstrerror(g_errno));
return false;
}
int lineCount = 0;
for ( ; *x ; x++ ) {
if ( *x != '\n' ) continue;

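The profiler hunks above implement a simple sampling discipline: keep one 1ms tick in five (a 5ms effective period) and auto-stop after 60000 frames, i.e. roughly 60 seconds of profiling. Condensed, the gate looks like this (a sketch, not the literal function):
	if ( m_totalFrames++ >= 60000 ) {      // ~60s of 1ms ticks
		stopRealTimeProfiler(false);   // shut ourselves off
		return;
	}
	static int32_t s_count = 0;
	if ( ++s_count != 5 ) return;          // sample every 5th tick
	s_count = 0;
	// ... capture the backtrace ...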
@ -263,6 +263,8 @@ protected:
HashTableX m_activeFns;
HashTableX m_quickpolls;
int32_t m_totalFrames;
const char* m_lastQpoll;
int32_t m_lastQpollLine;
QuickPollInfo m_quickPollInfos[512];

Rdb.cpp (11 changed lines)

@ -135,7 +135,8 @@ bool Rdb::init ( char *dir ,
int32_t maxCacheNodes ,
bool useHalfKeys ,
bool loadCacheFromDisk ,
DiskPageCache *pc ,
//DiskPageCache *pc ,
void *pc ,
bool isTitledb ,
bool preloadDiskPageCache ,
char keySize ,
@ -158,7 +159,7 @@ bool Rdb::init ( char *dir ,
m_fixedDataSize = fixedDataSize;
m_maxTreeMem = maxTreeMem;
m_useHalfKeys = useHalfKeys;
m_pc = pc;
//m_pc = pc;
m_isTitledb = isTitledb;
m_preloadCache = preloadDiskPageCache;
m_biasDiskPageCache = biasDiskPageCache;
@ -571,7 +572,7 @@ bool Rdb::addRdbBase2 ( collnum_t collnum ) { // addColl2()
buckets ,
&m_dump ,
this ,
m_pc ,
NULL ,
m_isTitledb ,
m_preloadCache ,
m_biasDiskPageCache ) ) {
@ -1643,7 +1644,7 @@ bool Rdb::dumpCollLoop ( ) {
//0 , // prev last key
KEYMIN() , // prev last key
m_ks , // keySize
m_pc , // DiskPageCache ptr
NULL,//m_pc , // DiskPageCache ptr
maxFileSize ,
this )) {// for setting m_needsToSave
return false;
@ -1791,7 +1792,7 @@ void attemptMergeAll2 ( ) {
if ( g_merge.isMerging() ) return;
int32_t niceness = MAX_NICENESS;
collnum_t s_lastCollnum = 0;
static collnum_t s_lastCollnum = 0;
int32_t count = 0;
tryLoop:

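The one-word fix above matters more than it looks: without "static" the round-robin cursor resets to collection 0 on every call, so later collections can be starved of merge attempts. The intended pattern, sketched (advance logic assumed):
	static collnum_t s_lastCollnum = 0;    // survives across calls
	collnum_t i = s_lastCollnum;
	// ... attempt a merge for collection i ...
	s_lastCollnum = (i + 1) % g_collectiondb.m_numRecs; // wrap around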
Rdb.h (5 changed lines)

@ -113,7 +113,8 @@ class Rdb {
int32_t maxCacheNodes ,
bool useHalfKeys ,
bool loadCacheFromDisk ,
class DiskPageCache *pc = NULL ,
//class DiskPageCache *pc = NULL ,
void *pc = NULL,
bool isTitledb = false , // use fileIds2[]?
bool preloadDiskPageCache = false ,
char keySize = 12 ,
@ -485,7 +486,7 @@ class Rdb {
// so only one save thread launches at a time
bool m_isSaving;
class DiskPageCache *m_pc;
//class DiskPageCache *m_pc;
bool m_isTitledb;

@ -128,7 +128,7 @@ bool RdbBase::init ( char *dir ,
RdbBuckets *buckets ,
RdbDump *dump ,
class Rdb *rdb ,
DiskPageCache *pc ,
void *pc , // DiskPageCache *pc ,
bool isTitledb ,
bool preloadDiskPageCache ,
bool biasDiskPageCache ) {
@ -266,7 +266,7 @@ bool RdbBase::init ( char *dir ,
m_useHalfKeys = useHalfKeys;
m_ks = keySize;
m_pageSize = pageSize;
m_pc = pc;
//m_pc = pc;
m_isTitledb = isTitledb;
// we haven't done a dump yet
//m_lastWrite = gettimeofdayInMilliseconds();
@ -900,11 +900,11 @@ int32_t RdbBase::addFile ( int32_t id , bool isNew , int32_t mergeNum ,
// open this big data file for reading only
if ( ! isNew ) {
if ( mergeNum < 0 )
f->open ( O_RDONLY | O_NONBLOCK | O_ASYNC , m_pc );
f->open ( O_RDONLY | O_NONBLOCK | O_ASYNC , NULL );
// otherwise, merge will have to be resumed so this file
// should be writable
else
f->open ( O_RDWR | O_NONBLOCK | O_ASYNC , m_pc );
f->open ( O_RDWR | O_NONBLOCK | O_ASYNC , NULL );//pc
}
skip:
// find the position to add so we maintain order by fileId
@ -1132,6 +1132,8 @@ bool RdbBase::incorporateMerge ( ) {
if ( ! m_files[i]->unlink ( doneWrapper , this ) ) {
m_numThreads++; g_numThreads++; }
// debug msg
// MDW this cores if the file is bad... if the collection
// got deleted from under us i guess!!
else log(LOG_INFO,"merge: Unlinked %s (#%"INT32").",
m_files[i]->getFilename(),i);
// debug msg
@ -1421,6 +1423,10 @@ bool RdbBase::attemptMerge ( int32_t niceness, bool forceMergeAll, bool doLog ,
if ( m_nextMergeForced ) forceMergeAll = true;
if ( forceMergeAll )
log(LOG_INFO,"merge: forcing merge for "
"for %s. (collnum=%"INT32")",m_dbname,(int32_t)m_collnum);
// if we are trying to merge titledb but a titledb dump is going on
// then do not do the merge, we do not want to overwrite tfndb via
// RdbDump::updateTfndbLoop()
@ -1468,11 +1474,16 @@ bool RdbBase::attemptMerge ( int32_t niceness, bool forceMergeAll, bool doLog ,
}
if ( g_numThreads > 0 ) {
if ( doLog )
// prevent log spam
static int32_t s_lastTime = 0;
int32_t now = getTimeLocal();
if ( now - s_lastTime > 0 && doLog )
log(LOG_INFO,"merge: Waiting for another "
"collection's unlink/rename "
"operations to finish before attempting merge "
"for %s (collnum=%"INT32").",m_dbname,(int32_t)m_collnum);
"for %s (collnum=%"INT32").",
m_dbname,(int32_t)m_collnum);
s_lastTime = now;
return false;
}
@ -1629,7 +1640,10 @@ bool RdbBase::attemptMerge ( int32_t niceness, bool forceMergeAll, bool doLog ,
// this triggers the negative rec concentration msg below and
// tries to merge on one file...
if ( ! resuming && m_numFiles <= 1 ) return false;
if ( ! resuming && m_numFiles <= 1 ) {
m_nextMergeForced = false;
return false;
}
// what percent of recs in the collections' rdb are negative?
// the rdbmaps hold this info
@ -2263,7 +2277,7 @@ void RdbBase::gotTokenForMerge ( ) {
m_mergeStartFileNum ,
m_numFilesToMerge ,
m_niceness ,
m_pc ,
NULL,//m_pc ,
mint /*maxTargetFileSize*/ ,
m_ks ) )
// we started the merge so return true here
@ -2531,7 +2545,7 @@ void RdbBase::saveMaps ( bool useThread ) {
}
void RdbBase::verifyDiskPageCache ( ) {
if ( !m_pc ) return;
//if ( !m_pc ) return;
// disable for now
return;
// for ( int32_t i = 0; i < m_numFiles; i++ ){

@ -83,7 +83,8 @@ class RdbBase {
RdbBuckets *buckets ,
RdbDump *dump ,
class Rdb *rdb ,
class DiskPageCache *pc = NULL ,
//class DiskPageCache *pc = NULL ,
void *pc = NULL,
bool isTitledb = false , // use fileIds2[]?
bool preloadDiskPageCache = false ,
bool biasDiskPageCache = false );
@ -458,7 +459,7 @@ class RdbBase {
// so only one save thread launches at a time
//bool m_isSaving;
class DiskPageCache *m_pc;
//class DiskPageCache *m_pc;
bool m_isTitledb;

@ -23,6 +23,7 @@ RdbCache::RdbCache () {
m_totalBufSize = 0;
m_numBufs = 0;
m_ptrs = NULL;
m_maxMem = 0;
m_numPtrsMax = 0;
reset();
m_needsSave = false;
@ -156,6 +157,7 @@ bool RdbCache::init ( int32_t maxMem ,
if( bufMem <= 0 ) {
log("rdbcache: cache for %s does not have enough mem. fix "
"by increasing maxmem or number of recs, etc.",m_dbname);
return false;
char *xx=NULL;*xx=0;
}
if ( bufMem && m_fixedDataSize > 0 &&
@ -440,7 +442,8 @@ bool RdbCache::getRecord ( collnum_t collnum ,
if ( m_numPtrsMax <= 0 ) return false;
// if init() called failed because of oom...
if ( ! m_ptrs )
return log("cache: getRecord: failed because oom");
//return log("cache: getRecord: failed because oom");
return false;
// time it -- debug
int64_t t = 0LL ;
if ( g_conf.m_logTimingDb ) t = gettimeofdayInMillisecondsLocal();
@ -540,7 +543,7 @@ bool RdbCache::getRecord ( collnum_t collnum ,
// of the delete head's space i guess.
// i do this for all caches now... what are the downsides? i forget.
//
bool check = false;
bool check = true;//false;
//if ( this == &g_genericCache[SITEQUALITY_CACHEID] ) check = true;
if ( this == g_dns.getCache () ) check = true;
if ( this == g_dns.getCacheLocal () ) check = true;
@ -555,11 +558,11 @@ bool RdbCache::getRecord ( collnum_t collnum ,
//if ( this == &g_tagdb.m_listCache ) check = true;
// the exact count cache...
//if ( this == &g_qtable ) check = true;
if ( m_totalBufSize < 20000 ) check = false;
//if ( m_totalBufSize < 20000 ) check = false;
if ( check ) promoteRecord = false;
// sanity check, do not allow the site quality cache or dns cache to
// be > 128MB, that just does not make sense and it complicates things
if ( check && m_totalBufSize > BUFSIZE ) { char *xx = NULL; *xx = 0; }
//if(check && m_totalBufSize > BUFSIZE ) { char *xx = NULL; *xx = 0; }
// sanity check
if ( m_tail < 0 || m_tail > m_totalBufSize ) {
char *xx = NULL; *xx = 0; }
@ -777,14 +780,15 @@ bool RdbCache::addRecord ( collnum_t collnum ,
int32_t timestamp ,
char **retRecPtr ) {
// bail if cache empty. maybe m_maxMem is 0.
if ( m_totalBufSize <= 0 ) return true;
//int64_t startTime = gettimeofdayInMillisecondsLocal();
if ( collnum < (collnum_t)0) {char *xx=NULL;*xx=0; }
if ( collnum >= m_maxColls ) {char *xx=NULL;*xx=0; }
// full key not allowed because we use that in markDeletedRecord()
if ( KEYCMP(cacheKey,KEYMAX(),m_cks) == 0 ) { char *xx=NULL;*xx=0; }
// bail if cache empty
if ( m_totalBufSize <= 0 ) return true;
// debug msg
int64_t t = 0LL ;
if ( g_conf.m_logTimingDb ) t = gettimeofdayInMillisecondsLocal();
@ -953,11 +957,13 @@ bool RdbCache::addRecord ( collnum_t collnum ,
m_memOccupied += ( p - start );
// debug msg (MDW)
//log("cache: adding rec @ %"UINT32" size=%"INT32" tail=%"UINT32"",
// i1c,p-start,m_tail);
//log("cache: stored k.n1=%"UINT32" k.n0=%"UINT64" %"INT32" bytes @ %"UINT32" tail=%"UINT32"",
// ((key_t *)cacheKey)->n1,
// ((key_t *)cacheKey)->n0,p-start,i1c,m_tail);
// if ( this == &g_spiderLoop.m_winnerListCache ) {
// log("cache: adding rec @ %"UINT32" size=%i tail=%"INT32"",
// i1c,(int)(p-start),m_tail);
// log("cache: stored k.n1=%"UINT32" k.n0=%"UINT64" %"INT32" bytes @ %"UINT32" tail=%"UINT32"",
// ((key_t *)cacheKey)->n1,
// ((key_t *)cacheKey)->n0,(int)(p-start),i1c,m_tail);
// }
//if ( m_cks == 4 )
// log("stored k=%"XINT32" %"INT32" bytes @ %"UINT32"",
// *(int32_t *)cacheKey,p-start,i);//(uint32_t)start);
@ -1109,8 +1115,10 @@ bool RdbCache::deleteRec ( ) {
//int32_t saved = m_tail;
// debug msg (MDW)
//log("cache: deleting rec @ %"INT32" size=%"INT32"",m_tail,
// dataSize+2+12+4+4);
// if ( this == &g_spiderLoop.m_winnerListCache ) {
// log("cache: deleting rec @ %"INT32" size=%"INT32"",m_tail,
// dataSize+2+12+4+4);
// }
// skip over rest of rec
p += dataSize;
@ -1124,6 +1132,10 @@ bool RdbCache::deleteRec ( ) {
m_tail +(int32_t)sizeof(collnum_t)+m_cks+4>m_totalBufSize){
char *xx = NULL; *xx = 0;}
// if ( this == &g_spiderLoop.m_winnerListCache )
// log("spider: rdbcache: removing tail rec collnum=%i",
// (int)collnum);
// delete key from hash table, iff is for THIS record
// but if it has not already been voided.
// we set key to KEYMAX() in markDeletedRecord()
@ -1163,8 +1175,10 @@ bool RdbCache::deleteRec ( ) {
void RdbCache::markDeletedRecord(char *ptr){
int32_t dataSize = sizeof(collnum_t)+m_cks+sizeof(int32_t);
// debug it
//logf(LOG_DEBUG,"cache: makeDeleteRecord ptr=0x%"XINT32" off=%"INT32"",
// (int32_t)ptr,ptr-m_bufs[0]);
// if ( this == &g_spiderLoop.m_winnerListCache ) {
//logf(LOG_DEBUG,"cache: makeDeleteRec ptr=0x%"PTRFMT" off=%"INT32"",
// (PTRTYPE)ptr,(int32_t)(ptr-m_bufs[0]));
// }
// get dataSize and data
if ( m_fixedDataSize == -1 || m_supportLists ) {
dataSize += 4 + // size

@ -120,6 +120,7 @@ class RdbCache {
// . returns true if found, false if not found in cache
// . sets *rec and *recSize iff found
// . sets *cachedTime to time the rec was cached
// . use maxAge of -1 to have no limit to the age of cached rec
bool getRecord ( collnum_t collnum ,
//key_t cacheKey ,
char *cacheKey ,

@ -41,7 +41,8 @@ bool RdbDump::set ( //char *coll ,
//key_t prevLastKey ,
char *prevLastKey ,
char keySize ,
class DiskPageCache *pc ,
//class DiskPageCache *pc ,
void *pc ,
int64_t maxFileSize ,
Rdb *rdb ) {
@ -404,12 +405,15 @@ bool RdbDump::dumpTree ( bool recall ) {
m_totalNegDumped += m_numNegRecs;
// . check the list we got from the tree for problems
// . ensures keys are ordered from lowest to highest as well
#ifdef GBSANITYCHECK
log("dump: verifying list before dumping");
m_list->checkList_r ( false , // removeNegRecs?
false , // sleep on problem?
m_rdb->m_rdbId );
#endif
//#ifdef GBSANITYCHECK
if ( g_conf.m_verifyWrites ) {
char *s = "none";
if ( m_rdb ) s = getDbnameFromId(m_rdb->m_rdbId);
log("dump: verifying list before dumping (rdb=%s)",s);
m_list->checkList_r ( false , // removeNegRecs?
false , // sleep on problem?
m_rdb->m_rdbId );
}
// if list is empty, we're done!
if ( status && m_list->isEmpty() ) {
// consider that a rollover?
@ -485,15 +489,15 @@ bool RdbDump::dumpList ( RdbList *list , int32_t niceness , bool recall ) {
if ( m_list->isEmpty() ) return true;
// we're now in dump mode again
m_isDumping = true;
#ifdef GBSANITYCHECK
//#ifdef GBSANITYCHECK
// don't check list if we're dumping an unordered list from tree!
if ( m_orderedDump ) {
if ( g_conf.m_verifyWrites && m_orderedDump ) {
m_list->checkList_r ( false /*removedNegRecs?*/ );
// print list stats
log("dump: sk=%s ",KEYSTR(m_list->m_startKey,m_ks));
log("dump: ek=%s ",KEYSTR(m_list->m_endKey,m_ks));
// log("dump: sk=%s ",KEYSTR(m_list->m_startKey,m_ks));
// log("dump: ek=%s ",KEYSTR(m_list->m_endKey,m_ks));
}
#endif
//#endif
// before calling RdbMap::addList(), always reset list ptr
// since we no longer call this in RdbMap::addList() so we don't
@ -524,8 +528,10 @@ bool RdbDump::dumpList ( RdbList *list , int32_t niceness , bool recall ) {
}
}
if ( m_ks==18 ) {
m_list->checkList_r(false,false,RDB_POSDB);
if ( g_conf.m_verifyWrites ) {
char rdbId = 0;
if ( m_rdb ) rdbId = m_rdb->m_rdbId;
m_list->checkList_r(false,false,rdbId);//RDB_POSDB);
m_list->resetListPtr();
}
@ -773,7 +779,8 @@ bool RdbDump::doneReadingForVerify ( ) {
// see if what we wrote is the same as what we read back
if ( m_verifyBuf && memcmp(m_verifyBuf,m_buf,m_bytesToWrite) != 0 &&
if ( m_verifyBuf && g_conf.m_verifyWrites &&
memcmp(m_verifyBuf,m_buf,m_bytesToWrite) != 0 &&
! g_errno ) {
log("disk: Write verification of %"INT32" bytes to file %s "
"failed at offset=%"INT64". Retrying.",

@ -50,7 +50,8 @@ class RdbDump {
//key_t prevLastKey ,
char *prevLastKey ,
char keySize ,
class DiskPageCache *pc ,
//class DiskPageCache *pc ,
void *pc ,
int64_t maxFileSize ,
class Rdb *rdb );
@ -75,7 +76,7 @@ class RdbDump {
// . this override makes the file's getSlot() return LdbSlots
// which can be appropriately added to an RdbTable or LdbTable
bool load ( class Rdb *rdb , int32_t fixedDataSize , BigFile *file ,
class DiskPageCache *pc );
void *pc ); // class DiskPageCache *pc );
// . calls the callback specified in set() when done
// . errno set to indicate error #, if any

@ -693,9 +693,9 @@ bool RdbList::checkList_r ( bool removeNegRecs , bool sleepOnProblem ,
return false;
}
if ( m_useHalfKeys && m_ks == 12 ) // m_ks != 18 && m_ks != 24 )
return checkIndexList_r ( removeNegRecs ,
sleepOnProblem );
// if ( m_useHalfKeys && m_ks == 12 ) // m_ks != 18 && m_ks != 24 )
// return checkIndexList_r ( removeNegRecs ,
// sleepOnProblem );
//log("m_list=%"INT32"",(int32_t)m_list);
//key_t oldk;
@ -721,6 +721,10 @@ bool RdbList::checkList_r ( bool removeNegRecs , bool sleepOnProblem ,
if ( KEYCMP(acceptable,KEYMIN(),m_ks)==0 )
KEYSET ( acceptable , m_endKey , m_ks );
char k[MAX_KEY_BYTES];
static int32_t th = 0;
if ( ! th ) th = hash64Lower_a ( "roottitles" , 10 );
while ( ! isExhausted() ) {
//key_t k = getCurrentKey();
getCurrentKey( k );
@ -734,6 +738,43 @@ bool RdbList::checkList_r ( bool removeNegRecs , bool sleepOnProblem ,
*(int32_t *)data > 100000000 ) ) {
char *xx = NULL; *xx = 0; }
}
// tagrec?
if ( rdbId == RDB_TAGDB && ! KEYNEG(k) ) {
//TagRec *gr = (TagRec *)getCurrentRec();
//Tag *tag = gr->getFirstTag ( );
//for ( ; tag ; tag = gr->getNextTag ( tag ) ) {
Tag *tag = (Tag *)getCurrentRec();
if ( tag->m_type == th ) {
char *tdata = tag->getTagData();
int32_t tsize = tag->getTagDataSize();
// core if tag val is not \0 terminated
if ( tsize > 0 && tdata[tsize-1]!='\0' ) {
log("db: bad root title tag");
char *xx=NULL;*xx=0; }
}
}
if ( rdbId == RDB_SPIDERDB && ! KEYNEG(k) &&
getCurrentDataSize() > 0 ) {
//char *data = getCurrentData();
char *rec = getCurrentRec();
// bad url in spider request?
if ( g_spiderdb.isSpiderRequest ( (key128_t *)rec ) ){
SpiderRequest *sr = (SpiderRequest *)rec;
if ( strncmp(sr->m_url,"http",4) != 0 ) {
log("db: spider req url");
char *xx=NULL;*xx=0;
}
}
}
// title bad uncompress size?
if ( rdbId == RDB_TITLEDB && ! KEYNEG(k) ) {
char *rec = getCurrentRec();
int32_t usize = *(int32_t *)(rec+12+4);
if ( usize <= 0 ) {
log("db: bad titlerec uncompress size");
char *xx=NULL;*xx=0;
}
}
// debug msg
// pause if it's google
//if ((((k.n0) >> 1) & 0x0000003fffffffffLL) == 70166155664)
@ -3525,4 +3566,32 @@ void RdbList::setFromSafeBuf ( SafeBuf *sb , char rdbId ) {
}
void RdbList::setFromPtr ( char *p , int32_t psize , char rdbId ) {
// free and NULLify any old m_list we had to make room for our new list
freeList();
// set this first since others depend on it
m_ks = getKeySizeFromRdbId ( rdbId );
// set our list parms
m_list = p;
m_listSize = psize;
m_alloc = p;
m_allocSize = psize;
m_listEnd = m_list + m_listSize;
KEYMIN(m_startKey,m_ks);
KEYMAX(m_endKey ,m_ks);
m_fixedDataSize = getDataSizeFromRdbId ( rdbId );
m_ownData = false;//ownData;
m_useHalfKeys = false;//useHalfKeys;
// use this call now to set m_listPtr and m_listPtrHi based on m_list
resetListPtr();
}
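setFromPtr() is a zero-copy wrap: the list borrows the caller's buffer (m_ownData stays false) and pins the start/end keys to the full key range. The same idea in miniature (ListView is invented for illustration):

#include <cstdint>

// sketch: borrow, don't own. the caller must keep p alive while in use.
struct ListView {
    const char *m_list;
    const char *m_listEnd;
    bool        m_ownData; // false: freeing the view must not free p
};

static ListView wrap(const char *p, int32_t psize) {
    return ListView{ p, p + psize, false };
}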

@ -107,6 +107,7 @@ class RdbList {
char keySize = sizeof(key_t) );
void setFromSafeBuf ( class SafeBuf *sb , char rdbId );
void setFromPtr ( char *p , int32_t psize , char rdbId ) ;
// just set the start and end keys
//void set ( key_t startKey , key_t endKey );

@ -1295,12 +1295,13 @@ void RdbMap::reduceMemFootPrint () {
for ( ; s && *s && ! is_digit(*s) ; s++ );
int id = 0;
if ( s ) id = atoi(s);
if ( id && (id % 2) == 0 ) return;
// id can be zero like for spiderdb0000.map
if ( (id % 2) == 0 ) return;
// log("map: reducing mem footprint for %s/%s",
// m_file.getDir(),
// m_file.getFilename());
// log("map: reducing mem footprint for %s/%s",
// m_file.getDir(),
// m_file.getFilename());
// seems kinda buggy now..
m_reducedMem = true;
//return;
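The fix above matters because the parsed id can legitimately be zero. A standalone sketch of the scan-for-digits logic (helper name made up):

#include <cctype>
#include <cstdlib>

// sketch: extract the numeric id from names like "spiderdb0000.map".
// returns -1 if no digits; 0 is a valid id and must not be special-cased.
static int fileIdOf(const char *s) {
    while (*s && !isdigit((unsigned char)*s)) s++;
    return *s ? atoi(s) : -1;
}
// usage, per the hunk: only odd ids get the reduced footprint:
//   int id = fileIdOf(name); if (id < 0 || (id % 2) == 0) return;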

@ -38,7 +38,8 @@ bool RdbMerge::merge ( char rdbId ,
int32_t startFileNum ,
int32_t numFiles ,
int32_t niceness ,
class DiskPageCache *pc ,
//class DiskPageCache *pc ,
void *pc ,
int64_t maxTargetFileSize ,
char keySize ) {
// reset ourselves
@ -69,7 +70,7 @@ bool RdbMerge::merge ( char rdbId ,
m_dedup = base->m_dedup;
m_fixedDataSize = base->m_fixedDataSize;
m_niceness = niceness;
m_pc = pc;
//m_pc = pc;
m_maxTargetFileSize = maxTargetFileSize;
m_doneMerging = false;
m_ks = keySize;
@ -209,7 +210,7 @@ bool RdbMerge::gotLock ( ) {
startOffset ,
prevLastKey ,
m_ks ,
m_pc ,
NULL,//m_pc ,
m_maxTargetFileSize ,
NULL ); // set m_base::m_needsToSave? no.
// what kind of error?

@ -66,7 +66,8 @@ class RdbMerge {
int32_t startFileNum ,
int32_t numFiles ,
int32_t niceness ,
class DiskPageCache *pc ,
//class DiskPageCache *pc ,
void *pc ,
int64_t maxTargetFileSize ,
char keySize );
@ -150,7 +151,7 @@ class RdbMerge {
// count for indexdb
int64_t m_dupsRemoved;
class DiskPageCache *m_pc;
//class DiskPageCache *m_pc;
int64_t m_maxTargetFileSize;
int32_t m_id2;

@ -148,6 +148,7 @@ bool RdbScan::setRead ( BigFile *file ,
// ensure we don't mess around
m_fstate.m_allocBuf = NULL;
m_fstate.m_buf = NULL;
//m_fstate.m_usePartFiles = true;
// debug msg
//log("diskOff=%"INT64" nb=%"INT32"",offset,bytesToRead);
//if ( offset == 16386 && bytesToRead == 16386 )
@ -253,6 +254,7 @@ void RdbScan::gotList ( ) {
// so i effectively disabled it by changing to _GBSANITYCHECK2_
//#ifdef GBSANITYCHECK2
// this first test, tests to make sure the read from cache worked
/*
DiskPageCache *pc = m_file->getDiskPageCache();
if ( pc &&
! g_errno &&
@ -307,7 +309,8 @@ void RdbScan::gotList ( ) {
// . go through each page in page cache and verify on disk
//pc->verifyData ( m_file );
}
skip:
*/
// skip:
//#endif
// assume we did not shift it
m_shifted = 0;//false;
@ -319,7 +322,7 @@ void RdbScan::gotList ( ) {
// . i think a read overflow might be causing a segv in malloc
// . NOTE: BigFile's call to DiskPageCache alters these values
if ( m_fstate.m_bytesDone != m_fstate.m_bytesToGo && m_hitDisk )
log(LOG_INFO,"disk: Read %"INT32" bytes but needed %"INT32".",
log(LOG_INFO,"disk: Read %"INT64" bytes but needed %"INT64".",
m_fstate.m_bytesDone , m_fstate.m_bytesToGo );
// adjust the list size for biased page cache if necessary
//if ( m_file->m_pc && m_allowPageCache &&

@ -87,6 +87,9 @@ class RdbScan {
bool m_allowPageCache;
bool m_hitDisk;
// this is set by Msg3.cpp
char m_inPageCache;
};
#endif

@ -2467,8 +2467,8 @@ void threadDoneWrapper ( void *state , ThreadEntry *t ) {
THIS->m_dbname,mstrerror(g_errno));
else
// log it
log("db: Done saving %s/%s-saved.dat",
THIS->m_dir,THIS->m_dbname);
log("db: Done saving %s/%s-saved.dat (wrote %"INT64" bytes)",
THIS->m_dir,THIS->m_dbname,THIS->m_bytesWritten);
// . call callback
if ( THIS->m_callback ) THIS->m_callback ( THIS->m_state );
}
@ -2495,6 +2495,20 @@ bool RdbTree::fastSave_r() {
return log("db: Could not open %s for writing: %s.",
s,mstrerror(errno));
}
redo:
// verify the tree
if ( g_conf.m_verifyWrites ) {
log("db: verify writes is enabled, checking tree before "
"saving.");
if ( ! checkTree( false , true ) ) {
log("db: fixing tree and re-checking");
fixTree ( );
goto redo;
}
}
// clear our own errno
errno = 0;
// . save the header
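The redo loop above is a verify-fix-reverify gate in front of the save. Sketched generically (a sorted vector stands in for the tree; checkTree/fixTree behavior assumed from the hunk):

#include <algorithm>
#include <cstdio>
#include <vector>

// sketch: never persist a structure we cannot verify; repair and
// re-check until it passes, mirroring the redo/goto in fastSave_r.
static bool verifyBeforeSave(std::vector<int> &tree) {
    while (!std::is_sorted(tree.begin(), tree.end())) {
        puts("db: fixing tree and re-checking");
        std::sort(tree.begin(), tree.end()); // stand-in for fixTree()
    }
    return true; // safe to write to disk now
}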

@ -236,7 +236,7 @@ bool SafeBuf::pushFloat ( float i) {
// hack off trailing 0's
bool SafeBuf::printFloatPretty ( float f ) {
if ( m_length + 20 > m_capacity && ! reserve(20) )
if ( m_length + 40 > m_capacity && ! reserve(40) )
return false;
char *p = m_buf + m_length;
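The bump from 20 to 40 bytes of headroom is needed because "%f" can print far more than 20 characters for a large float. A quick standalone check:

#include <cfloat>
#include <cstdio>

int main() {
    char tmp[64];
    int n = snprintf(tmp, sizeof(tmp), "%f", FLT_MAX);
    // prints 46 on typical platforms: 39 integer digits + '.' + 6
    // decimals, comfortably over the old 20-byte reserve
    printf("%d\n", n);
    return 0;
}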

@ -17294,11 +17294,11 @@ bool Sectiondb::init ( ) {
// cache in favor of cleverly used disk page caches, because
// the rec caches are not real-time and get stale.
// . just hard-code 5MB for now
int32_t pcmem = 5000000; // = g_conf.m_sectiondbMaxDiskPageCacheMem;
//int32_t pcmem = 5000000; // = g_conf.m_sectiondbMaxDiskPageCacheMem;
// do not use for now i think we use posdb and store the 32bit
// val in the key for facet type stuff
pcmem = 0;
//pcmem = 0;
maxTreeMem = 100000;
maxTreeNodes = 1000;
@ -17322,14 +17322,14 @@ bool Sectiondb::init ( ) {
// do not use any page cache if doing tmp cluster in order to
// prevent swapping
if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
// init the page cache
if ( ! m_pc.init ( "sectiondb",
RDB_SECTIONDB,
pcmem ,
pageSize ) )
return log("db: Sectiondb init failed.");
// if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
// int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
// // init the page cache
// if ( ! m_pc.init ( "sectiondb",
// RDB_SECTIONDB,
// pcmem ,
// pageSize ) )
// return log("db: Sectiondb init failed.");
// initialize our own internal rdb
if ( ! m_rdb.init ( g_hostdb.m_dir ,
@ -17348,7 +17348,7 @@ bool Sectiondb::init ( ) {
0 , // maxCacheNodes
false , // half keys?
false , // saveCache?
&m_pc , // page cache ptr
NULL,//&m_pc , // page cache ptr
false , // is titledb?
false , // preloadcache?
16 ))// keySize

@ -7,7 +7,7 @@
#include "Dates.h" // datetype_t
#include "Words.h"
#include "Rdb.h"
#include "DiskPageCache.h"
//#include "DiskPageCache.h"
// KEY:
@ -287,9 +287,9 @@ class Sectiondb {
// holds binary format title entries
Rdb m_rdb;
DiskPageCache *getDiskPageCache ( ) { return &m_pc; };
//DiskPageCache *getDiskPageCache ( ) { return &m_pc; };
DiskPageCache m_pc;
//DiskPageCache m_pc;
};
extern class Sectiondb g_sectiondb;

File diff suppressed because it is too large

@ -404,7 +404,7 @@ class Spiderdb {
Rdb *getRdb ( ) { return &m_rdb; };
DiskPageCache *getDiskPageCache() { return &m_pc; };
//DiskPageCache *getDiskPageCache() { return &m_pc; };
// this rdb holds urls waiting to be spidered or being spidered
Rdb m_rdb;
@ -453,11 +453,11 @@ class Spiderdb {
*/
// print the spider rec
int32_t print( char *srec );
int32_t print( char *srec , SafeBuf *sb = NULL );
private:
DiskPageCache m_pc;
//DiskPageCache m_pc;
};
void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs );
@ -989,7 +989,7 @@ class Doledb {
bool addColl ( char *coll, bool doVerify = true );
DiskPageCache *getDiskPageCache() { return &m_pc; };
//DiskPageCache *getDiskPageCache() { return &m_pc; };
// . see "overview of spidercache" below for key definition
// . these keys when hashed are clogging up the hash table
@ -1072,7 +1072,7 @@ class Doledb {
Rdb m_rdb;
DiskPageCache m_pc;
//DiskPageCache m_pc;
};
@ -1131,12 +1131,12 @@ class SpiderColl {
// doledbkey + dataSize + bestRequestRec
//char m_doleBuf[MAX_DOLEREC_SIZE];
SafeBuf m_doleBuf;
//SafeBuf m_doleBuf;
bool m_isLoading;
// for scanning the wait tree...
bool m_isPopulating;
bool m_isPopulatingDoledb;
// for reading from spiderdb
//bool m_isReadDone;
bool m_didRead;
@ -1192,7 +1192,9 @@ class SpiderColl {
bool addToDoleTable ( SpiderRequest *sreq ) ;
bool addDoleBufIntoDoledb (bool isFromCache,uint32_t cachedTimestamp);
bool validateDoleBuf ( SafeBuf *doleBuf ) ;
bool addDoleBufIntoDoledb ( SafeBuf *doleBuf , bool isFromCache);
//,uint32_t cachedTimestamp);
bool updateSiteNumInlinksTable ( int32_t siteHash32,int32_t sni,
time_t tstamp); // time_t

@ -5,7 +5,10 @@
#include "HttpServer.h"
#include "SpiderProxy.h"
#define LOADPOINT_EXPIRE_MS (10*60*1000)
//#define LOADPOINT_EXPIRE_MS (10*60*1000)
// make it 15 seconds not 10 minutes otherwise it gets too full with dup
// keys and really clogs things up
#define LOADPOINT_EXPIRE_MS (15*1000)
//
// BASIC DETAILS
@ -927,6 +930,9 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
// and the loadbucket id
//*(int32_t *)p = bb.m_id; p += 4;
//int32_t sanityCount = 0;//s_loadTable.getNumSlots();
// top:
// now remove old entries from the load table. entries that
// have completed and have a download end time more than 10 mins ago
for ( int32_t i = 0 ; i < s_loadTable.getNumSlots() ; i++ ) {
@ -938,8 +944,12 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
if ( pp->m_downloadEndTimeMS == 0LL ) continue;
// delta t
int64_t took = nowms - pp->m_downloadEndTimeMS;
// < 10 mins?
// < 10 mins? now it's < 15 seconds to prevent clogging.
if ( took < LOADPOINT_EXPIRE_MS ) continue;
// 100 at a time
//if ( sanityCount++ > 100 ) break;
// ok, its too old, nuke it to save memory
s_loadTable.removeSlot(i);
// the keys might have buried us but we really should not
@ -947,6 +957,7 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
// should we? TODO: figure it out. if we miss a few it's not
// a big deal.
i--;
//goto top;
}
// send the proxy ip/port/LBid back to user
@ -1038,6 +1049,7 @@ bool initSpiderProxyStuff() {
128,
NULL,
0,
// this slows us down
true, // allow dups?
MAX_NICENESS,
"lbtab",

@ -247,6 +247,9 @@ void Statsdb::addDocsIndexed ( ) {
if ( ! isClockInSync() ) return;
// only host #0 needs this
if ( g_hostdb.m_hostId != 0 ) return;
// only once per five seconds
int32_t now = getTimeLocal();
static int32_t s_lastTime = 0;
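The hunk is cut off above, but the static timestamp it introduces is the usual once-per-N-seconds gate; a sketch of how it presumably completes (time() stands in for getTimeLocal()):

#include <ctime>

// sketch: return true at most once every five seconds per process.
static bool onceEveryFiveSeconds() {
    static time_t s_lastTime = 0;
    time_t now = time(NULL);
    if (now - s_lastTime < 5) return false; // too soon, skip this call
    s_lastTime = now;
    return true;
}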

@ -1835,8 +1835,8 @@ bool Tagdb::init ( ) {
// overhead in cache.
//int32_t maxCacheNodes = g_conf.m_tagdbMaxCacheMem / 106;
// we now use a page cache
if ( ! m_pc.init ("tagdb",RDB_TAGDB,pcmem,GB_TFNDB_PAGE_SIZE))
return log("tagdb: Tagdb init failed.");
// if ( ! m_pc.init ("tagdb",RDB_TAGDB,pcmem,GB_TFNDB_PAGE_SIZE))
// return log("tagdb: Tagdb init failed.");
// init this
//if ( ! s_lockTable2.set(8,4,32,NULL,0,false,0,"taglocktbl") )
@ -1858,7 +1858,7 @@ bool Tagdb::init ( ) {
0 , //maxCacheNodes ,
false , // half keys?
false , //m_tagdbSaveCache
&m_pc ,
NULL,//&m_pc ,
false, // is titledb
true , // preload disk page cache
sizeof(key128_t), // key size

@ -8,7 +8,7 @@
#include "Xml.h"
#include "Url.h"
#include "Loop.h"
#include "DiskPageCache.h"
//#include "DiskPageCache.h"
//#include "CollectionRec.h"
#include "SafeBuf.h"
#include "Msg0.h"
@ -353,7 +353,7 @@ class Tagdb {
char *getRec ( RdbList *list , Url *url , int32_t *recSize ,char* coll,
int32_t collLen, RdbList *retList) ;
DiskPageCache *getDiskPageCache() { return &m_pc; };
//DiskPageCache *getDiskPageCache() { return &m_pc; };
//int32_t getGroupId (key_t *key) {return key->n1 & g_hostdb.m_groupMask;}
@ -374,7 +374,7 @@ class Tagdb {
// and "not-founds" stored remotely (net cache)
Rdb m_rdb;
DiskPageCache m_pc;
//DiskPageCache m_pc;
bool loadMinSiteInlinksBuffer ( );
bool loadMinSiteInlinksBuffer2 ( );
@ -392,7 +392,7 @@ class Turkdb {
bool addColl ( char *coll, bool doVerify = true );
Rdb *getRdb ( ) { return &m_rdb; };
Rdb m_rdb;
DiskPageCache m_pc;
//DiskPageCache m_pc;
};
extern class Tagdb g_tagdb;

@ -5,6 +5,7 @@
#include "Profiler.h"
#include "PingServer.h"
//#include "AutoBan.h"
#include "Hostdb.h"
// . TODO: deleting nodes from under Loop::callCallbacks is dangerous!!
@ -593,6 +594,17 @@ bool TcpServer::sendMsg ( int32_t ip ,
// return true if s is NULL and g_errno was set by getNewSocket()
// might set g_errno to EOUTOFSOCKETS
if ( ! s ) { mfree ( sendBuf , sendBufSize,"TcpServer"); return true; }
// debug to find why sockets getting diffbot replies get commandeered.
// we think they are given an sd that a streaming socket used and
// closed, but the old owner then proceeds to use the TcpSocket
// class as if it had not closed it.
if ( g_conf.m_logDebugTcpBuf ) {
SafeBuf sb;
sb.safePrintf("tcp: open newsd=%i sendbuf=",s->m_sd);
sb.safeTruncateEllipsis (sendBuf,sendBufSize,200);
log("%s",sb.getBufStart());
}
// set up the new TcpSocket for connecting
s->m_state = state;
s->m_callback = callback;
@ -846,6 +858,7 @@ TcpSocket *TcpServer::getNewSocket ( ) {
// . TODO: ensure this blocks even if sd was set nonblock by wrapSock()
if ( ! s ) {
if ( sd == 0 ) log("tcp: closing1 sd of 0");
log("tcp: wrapsocket2 returned null for sd=%i",(int)sd);
if ( ::close(sd) == -1 )
log("tcp: close2(%"INT32") = %s",(int32_t)sd,mstrerror(errno));
else {
@ -1732,6 +1745,8 @@ void writeSocketWrapper ( int sd , void *state ) {
bool wasStreaming = s->m_streamingMode;
// otherwise, call callback on done writing or error
// MDW: if we close the socket descriptor, then a getdiffbotreply
// gets it, we have to know.
THIS->makeCallback ( s );
// if callback changed socket status to ST_SEND_AGAIN
@ -1921,7 +1936,7 @@ int32_t TcpServer::writeSocket ( TcpSocket *s ) {
// another debug
//if ( g_conf.m_logDebugTcp )
log("tcp: only wrote %"INT32" of %"INT32" bytes "
"tried.",n,toSend);
"tried. sd=%i",n,toSend,s->m_sd);
// need to listen for writability now since our write
// failed to write everythin gout
if ( ! s->m_writeRegistered &&
@ -2260,6 +2275,32 @@ void TcpServer::destroySocket ( TcpSocket *s ) {
// if sd is 0 do not really close it. seems to fix that bug.
// 0 is the FD for stdin so i don't know how that is happening.
if ( sd != 0 ) cret = ::close ( sd );
if ( g_conf.m_logDebugTcpBuf ) {
SafeBuf sb;
sb.safePrintf("tcp: closing sd=%i bytessent=%i "
"sendbufused=%i streaming=%i "
"sendbuf=",
s->m_sd,
s->m_sendOffset,
s->m_sendBufUsed,
(int)s->m_streamingMode);
if ( s->m_sendBuf )
sb.safeTruncateEllipsis(s->m_sendBuf,
s->m_sendBufSize,
200);
sb.safePrintf(" bytesread=%i readbuf=",(int)s->m_readOffset);
if ( s->m_readBuf )
sb.safeTruncateEllipsis(s->m_readBuf,
s->m_readOffset,
2000);
log("%s",sb.getBufStart());
}
// force it out of streaming mode since we closed it. then we
// should avoid the "not timing out streaming socket fd=123" msgs.
s->m_streamingMode = false;
if ( cret != 0 ) { // == -1 )
log("tcp: s=%"PTRFMT" close(%"INT32") = %"INT32" = %s",
(PTRTYPE)s,(int32_t)sd,cret,mstrerror(errno));
@ -2272,7 +2313,7 @@ void TcpServer::destroySocket ( TcpSocket *s ) {
// log("tcp: closing sock %i (open=%"INT32")",sd,
// m_numOpen-m_numClosed);
// set it negative to try to fix the double close while
// streaming bug.
if ( s->m_sd > 0 ) s->m_sd *= -1;
}
// a 2nd close? it should return -1 with errno set!
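The negation trick noted above, isolated (Sock is a stand-in for TcpSocket):

#include <unistd.h>

struct Sock { int m_sd; };

// sketch: after the real close, poison the stored descriptor so a
// buggy second destroy cannot close an fd the kernel has recycled.
static void destroyOnce(Sock *s) {
    if (s->m_sd <= 0) return;   // already closed (or stdin); do nothing
    close(s->m_sd);
    s->m_sd *= -1;              // any later close attempt fails fast
}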
@ -2574,6 +2615,18 @@ TcpSocket *TcpServer::acceptSocket ( ) {
if ( g_conf.m_logDebugTcp )
logf(LOG_DEBUG,"tcp: ...... accepted sd=%"INT32"",(int32_t)newsd);
// debug to find why sockets getting diffbot replies get commandeered.
// we think they are given an sd that a streaming socket used and
// closed, but the old owner then proceeds to use the TcpSocket
// class as if it had not closed it.
if ( g_conf.m_logDebugTcpBuf ) {
SafeBuf sb;
sb.safePrintf("tcp: accept newsd=%i incoming req",newsd);
//sb.safeTruncateEllipsis (sendBuf,sendBufSize,200);
log("%s",sb.getBufStart());
}
// ssl debug!
//log("tcp: accept returned fd=%i",newsd);
@ -2621,6 +2674,7 @@ TcpSocket *TcpServer::acceptSocket ( ) {
if ( ! s ) {
//log("tcp: wrapsocket returned null fd=%i",newsd);
if ( newsd == 0 ) log("tcp: closing sd of 0");
log("tcp: wrapsocket1 returned null for sd=%i",(int)newsd);
if ( ::close(newsd)== -1 )
log("tcp: close2(%"INT32") = %s",
(int32_t)newsd,mstrerror(errno));
@ -2726,7 +2780,8 @@ bool TcpServer::sslAccept ( TcpSocket *s ) {
void TcpServer::makeCallback ( TcpSocket * s ) {
if ( ! s->m_callback ) {
// note it
log("tcp: null callback for s=0x%"PTRFMT"",(PTRTYPE)s);
if ( g_conf.m_logDebugTcp )
log("tcp: null callback for s=0x%"PTRFMT"",(PTRTYPE)s);
return;
}
// record times for profiler
@ -2777,7 +2832,8 @@ bool TcpServer::sendChunk ( TcpSocket *s ,
// sendChunk() again.
void (* doneSendingWrapper)( void *,TcpSocket *)){
log("tcp: sending chunk of %"INT32" bytes", sb->length() );
log("tcp: sending chunk of %"INT32" bytes sd=%i", sb->length() ,
s->m_sd );
// if socket had shit on there already, free that memory
// just like TcpServer::destroySocket would
@ -2818,6 +2874,11 @@ bool TcpServer::sendChunk ( TcpSocket *s ,
log("tcp: chunkend=%s",sb->getBuf() - minus);
*/
// char *p = sb->getBufStart();
// char *pend = p + sb->length();
// for ( ; p < pend ; p++ ) {
// if ( *p == '\0' ) { char *xx=NULL;*xx=0; }
// }
// . start the send process
// . returns false if send did not complete

File diff suppressed because it is too large

@ -10,6 +10,11 @@
#include <sys/types.h> // pid_t
// this also limits the maximum number of outstanding (live) threads
#define MAX_STACKS 20
// try going up to 40, we use about 2MB per stack... so this is 80MB
//#define MAX_STACKS 40
// if we are a thread this gets the threadid, otherwise, the main process id
//pid_t getpidtid();
// on 64-bit architectures pthread_t is 64 bits and pid_t is still 32 bits
@ -59,6 +64,13 @@ class ThreadEntry {
bool m_needsJoin;
pthread_t m_joinTid;
class ThreadEntry *m_nextLink;
class ThreadEntry *m_prevLink;
// the waiting linked list we came from
ThreadEntry **m_bestHeadPtr;
ThreadEntry **m_bestTailPtr;
};
//#define MAX_THREAD_ENTRIES 1024
@ -85,6 +97,31 @@ class ThreadQueue {
int32_t m_entriesSize;
int32_t m_maxEntries;
// linked list head for launched thread entries
ThreadEntry *m_launchedHead;
// linked list head for empty thread entries
ThreadEntry *m_emptyHead;
// 7 heads/tails for linked lists of thread entries waiting to launch
ThreadEntry *m_waitHead0;
ThreadEntry *m_waitHead1;
ThreadEntry *m_waitHead2;
ThreadEntry *m_waitHead3;
ThreadEntry *m_waitHead4;
ThreadEntry *m_waitHead5;
ThreadEntry *m_waitHead6;
ThreadEntry *m_waitTail0;
ThreadEntry *m_waitTail1;
ThreadEntry *m_waitTail2;
ThreadEntry *m_waitTail3;
ThreadEntry *m_waitTail4;
ThreadEntry *m_waitTail5;
ThreadEntry *m_waitTail6;
/*
// counts the high/low priority (niceness <= 0) threads
int64_t m_hiLaunched;
int64_t m_hiReturned;
@ -114,6 +151,7 @@ class ThreadQueue {
int64_t m_mdReturnedSma;
int64_t m_loLaunchedSma;
int64_t m_loReturnedSma;
*/
// init
bool init (char threadType, int32_t maxThreads, int32_t maxEntries);
@ -122,6 +160,8 @@ class ThreadQueue {
void reset();
int32_t getNumThreadsOutOrQueued();
int32_t getNumWriteThreadsOut() ;
// . for adding an entry
// . returns false and sets errno on error
@ -141,7 +181,14 @@ class ThreadQueue {
// . launch a thread from our queue
// . returns false and sets errno on error
bool launchThread2 ( ThreadEntry *te );
bool launchThread2 ( );
bool launchThreadForReals ( ThreadEntry **headPtr ,
ThreadEntry **tailPtr ) ;
void removeThreads2 ( ThreadEntry **headPtr ,
ThreadEntry **tailPtr ,
class BigFile *bf ) ;
void print ( ) ;
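The head/tail pairs and the m_bestHeadPtr/m_bestTailPtr back-pointers above sketch out like this (Entry stands in for ThreadEntry; the list discipline is assumed from the member names):

// sketch: intrusive doubly-linked wait lists. an entry remembers which
// head/tail pair it sits on so unlinking never searches all the lists.
struct Entry {
    Entry  *m_nextLink    = 0;
    Entry  *m_prevLink    = 0;
    Entry **m_bestHeadPtr = 0;
    Entry **m_bestTailPtr = 0;
};

static void enqueue(Entry *e, Entry **head, Entry **tail) {
    e->m_bestHeadPtr = head;
    e->m_bestTailPtr = tail;
    e->m_prevLink = *tail;
    e->m_nextLink = 0;
    if (*tail) (*tail)->m_nextLink = e;
    else       *head = e;
    *tail = e;
}

static void unlinkEntry(Entry *e) {
    if (e->m_prevLink) e->m_prevLink->m_nextLink = e->m_nextLink;
    else               *e->m_bestHeadPtr = e->m_nextLink;
    if (e->m_nextLink) e->m_nextLink->m_prevLink = e->m_prevLink;
    else               *e->m_bestTailPtr = e->m_prevLink;
}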
@ -245,11 +292,14 @@ class Threads {
int32_t getNumThreadQueues() { return m_numQueues; }
// used by UdpServer to see if it should call a low priority callback
int32_t getNumActiveHighPriorityCpuThreads() ;
//int32_t getNumActiveHighPriorityCpuThreads() ;
// all high priority threads...
int32_t getNumActiveHighPriorityThreads() ;
bool hasHighPriorityCpuThreads() ;
int32_t getNumThreadsOutOrQueued();
int32_t getNumWriteThreadsOut() ;
// counts the high/low priority (niceness <= 0) threads
//int64_t m_hiLaunched;

@ -56,14 +56,14 @@ bool Titledb::init ( ) {
// do not use any page cache if doing tmp cluster in order to
// prevent swapping
if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
//int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
// init the page cache
// . MDW: "minimize disk seeks" not working otherwise i'd enable it!
if ( ! m_pc.init ( "titledb",
RDB_TITLEDB,
pcmem ,
pageSize ) )
return log("db: Titledb init failed.");
// if ( ! m_pc.init ( "titledb",
// RDB_TITLEDB,
// pcmem ,
// pageSize ) )
// return log("db: Titledb init failed.");
// each entry in the cache is usually just a single record, no lists
//int32_t maxCacheNodes = g_conf.m_titledbMaxCacheMem / (10*1024);
@ -89,7 +89,7 @@ bool Titledb::init ( ) {
0,//maxCacheNodes ,
false ,// half keys?
false ,// g_conf.m_titledbSav
&m_pc , // page cache ptr
NULL,//&m_pc , // page cache ptr
true ) )// is titledb?
return false;
return true;

@ -160,9 +160,9 @@ class Titledb {
// holds binary format title entries
Rdb m_rdb;
DiskPageCache *getDiskPageCache ( ) { return &m_pc; };
//DiskPageCache *getDiskPageCache ( ) { return &m_pc; };
DiskPageCache m_pc;
//DiskPageCache m_pc;
};
extern class Titledb g_titledb;

@ -1533,7 +1533,9 @@ int32_t UdpServer::readSock_ass ( UdpSlot **slotPtr , int64_t now ) {
// . msg13 is clogging things up when we synchost a host
// and it comes back up
// . allow spider compression proxy to have a bunch
if ( msgType == 0x13 && m_numUsedSlots > 500 && ! isProxy )
// . MDW: do we need this one anymore? relax it a little.
if ( msgType == 0x13 && m_numUsedSlotsIncoming>400 &&
m_numUsedSlots>800 && !isProxy)
getSlot = false;
// 2c is clogging crap up
if ( msgType == 0x2c && m_msg2csInWaiting >= 100 && niceness )
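The relaxed msg13 gate, pulled out as a sketch (counter names taken from the hunk, the function signature is invented):

#include <cstdint>

// sketch: refuse a slot for msg13 only when BOTH incoming and total
// usage are high; the old rule tripped on total usage alone.
static bool allowSlot(int msgType, int32_t usedIncoming,
                      int32_t usedTotal, bool isProxy) {
    if (msgType == 0x13 && usedIncoming > 400 &&
        usedTotal > 800 && !isProxy)
        return false; // shed load while a resynced host floods us
    return true;
}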

@ -109,6 +109,7 @@ char *getFirstJSONObject ( char *p ,
char *getJSONObjectEnd ( char *p , int32_t niceness ) ;
XmlDoc::XmlDoc() {
m_readThreadOut = false;
for ( int32_t i = 0 ; i < MAXMSG7S ; i++ ) m_msg7s[i] = NULL;
m_esbuf.setLabel("exputfbuf");
for ( int32_t i = 0 ; i < MAX_XML_DOCS ; i++ ) m_xmlDocs[i] = NULL;
@ -208,6 +209,10 @@ class XmlDoc *g_xd;
void XmlDoc::reset ( ) {
if ( m_readThreadOut )
log("build: deleting xmldoc class that has a read thread out "
"on a warc file");
if ( m_fileValid ) {
m_file.close();
m_file.unlink();
@ -221,7 +226,7 @@ void XmlDoc::reset ( ) {
if ( ! msg7 ) continue;
if(msg7->m_inUse) {
log("build: archive: reseting xmldoc when msg7s are outstanding");
}
mdelete ( msg7 , sizeof(Msg7) , "xdmsg7" );
delete ( msg7 );
@ -1253,6 +1258,12 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
utf8Content = m_mime.getContent();
}
// use this to avoid ip lookup if it is not zero
if ( forcedIp ) {
m_ip = forcedIp;
m_ipValid = true;
}
// sometimes they supply the content they want! like when zaks'
// injects pages from PageInject.cpp
if ( utf8Content ) {
@ -1285,11 +1296,6 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
// use this ip as well for now to avoid ip lookup
//m_ip = atoip("127.0.0.1");
//m_ipValid = true;
// use this to avoid ip lookup if it is not zero
if ( forcedIp ) {
m_ip = forcedIp;
m_ipValid = true;
}
// do not need robots.txt then
m_isAllowed = true;
m_isAllowedValid = true;
@ -1783,9 +1789,9 @@ bool XmlDoc::set2 ( char *titleRec ,
// new stuff
m_siteNumInlinksValid = true;
m_siteNumInlinksUniqueIpValid = true;
m_siteNumInlinksUniqueCBlockValid = true;
m_siteNumInlinksTotalValid = true;
// m_siteNumInlinksUniqueIpValid = true;
// m_siteNumInlinksUniqueCBlockValid = true;
// m_siteNumInlinksTotalValid = true;
//m_sitePopValid = true;
m_rootLangIdValid = true;
m_hasContactInfoValid = true;
@ -3348,9 +3354,21 @@ void doneInjectingArchiveRec ( void *state ) {
xd->m_numInjectionsOut--;
log("build: archive: injection thread returned. %"INT32" out now.",
xd->m_numInjectionsOut);
// reset g_errno so ::indexDoc() doesn't error out when we are
// injecting a ton of these msg7s; otherwise the xmldoc ends up
// getting reset and we core when a msg7 reply comes back in
g_errno = 0;
xd->m_masterLoop ( xd );
}
void doneReadingArchiveFileWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
// . go back to the main entry function
// . make sure g_errno is clear from a msg3a g_errno before calling
// this lest it abandon the loop
THIS->m_masterLoop ( THIS->m_masterState );
}
#define MAXWARCRECSIZE 1000000
@ -3368,7 +3386,7 @@ bool XmlDoc::indexWarcOrArc ( char ctype ) {
// so big we can fit it in memory. just do a wget then gunzip
// then open it. use a system call in a thread.
int64_t fileSize = -1;
File *file = getUtf8ContentInFile( &fileSize );
BigFile *file = getUtf8ContentInFile( &fileSize );
// return true with g_errno set on error
if ( ! file ) {
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
@ -3444,7 +3462,37 @@ bool XmlDoc::indexWarcOrArc ( char ctype ) {
toRead = fileSize - m_fileOff;
m_hasMoreToRead = false;
}
int32_t bytesRead = file->read (m_fileBuf, toRead, m_fileOff);
bool status;
if ( m_readThreadOut ) {
m_readThreadOut = false;
status = false;
goto skipRead;
}
// make a thread to read now
status = file->read (m_fileBuf,
toRead,
m_fileOff,
&m_fileState,
this,
doneReadingArchiveFileWrapper,
MAX_NICENESS );
// if the thread was queued or launched, wait for it to come back
if ( ! status ) {
// set a flag so we do not re-launch the thread
// when the callback brings us back here
m_readThreadOut = true;
// wait for callback
return false;
}
skipRead:
int64_t bytesRead = m_fileState.m_bytesDone;
if ( bytesRead != toRead ) {
log("build: read of %s failed at offset "
"%"INT64"", file->getFilename(), m_fileOff);
@ -5215,9 +5263,9 @@ SafeBuf *XmlDoc::getTitleRecBuf ( ) {
*/
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
if ( ! m_siteNumInlinksUniqueIpValid ) { char *xx=NULL;*xx=0; }
if ( ! m_siteNumInlinksUniqueCBlockValid ) { char *xx=NULL;*xx=0; }
if ( ! m_siteNumInlinksTotalValid ) { char *xx=NULL;*xx=0; }
// if ( ! m_siteNumInlinksUniqueIpValid ) { char *xx=NULL;*xx=0; }
// if ( ! m_siteNumInlinksUniqueCBlockValid ) { char *xx=NULL;*xx=0; }
// if ( ! m_siteNumInlinksTotalValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_sitePopValid ) { char *xx=NULL;*xx=0; }
if ( ! m_rootLangIdValid ) { char *xx=NULL;*xx=0; }
@ -8762,10 +8810,19 @@ Links *XmlDoc::getLinks ( bool doQuickSet ) {
return &m_links;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
bool useRelNoFollow = true;
if ( ! cr->m_obeyRelNoFollowLinks ) useRelNoFollow = false;
// to keep things simple, for diffbot custom crawls, if robots.txt
// is not used then do not use rel no follow
if ( ! cr->m_useRobotsTxt && cr->m_isCustomCrawl )
useRelNoFollow = false;
// . set it
// . if parent is a permalink we can avoid its suburl outlinks
// containing "comment" from being classified as permalinks
if ( ! m_links.set ( true , // useRelNoFollow?
if ( ! m_links.set ( useRelNoFollow ,
xml ,
u ,
true , // setLinkHashes?
@ -10056,6 +10113,10 @@ char *XmlDoc::getIsDup ( ) {
// sanity. must be posdb list.
if ( ! list->isEmpty() && list->m_ks != 18 ) { char *xx=NULL;*xx=0;}
// so getSiteRank() does not core
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
// . see if there are any pages that seem like they are dups of us
// . they must also have a HIGHER score than us, for us to be
// considered the dup
@ -12026,11 +12087,25 @@ XmlDoc **XmlDoc::getRootXmlDoc ( int32_t maxCacheAge ) {
mnew ( m_rootDoc , sizeof(XmlDoc),"xmldoc3");
// if we had the title rec, set from that
if ( *rtr ) {
m_rootDoc->set2 ( m_rootTitleRec ,
m_rootTitleRecSize , // maxSize ,
cr->m_coll ,
NULL , // pbuf
m_niceness );
if ( ! m_rootDoc->set2 ( m_rootTitleRec ,
m_rootTitleRecSize , // maxSize ,
cr->m_coll ,
NULL , // pbuf
m_niceness ) ) {
// it was corrupted... delete this. it possibly printed
// " uncompress uncompressed size=..." from a bad uncompress
log("build: rootdoc set2 failed");
mdelete ( m_rootDoc , sizeof(XmlDoc) , "xdnuke");
delete ( m_rootDoc );
// call it empty for now, we don't want to return
// NULL with g_errno set because it could stop
// the whole indexing pipeline
m_rootDoc = NULL;
m_rootDocValid = true;
return &m_rootDoc;
//return NULL;
}
}
// . otherwise, set the url and download it on demand
// . this junk copied from the contactDoc->* stuff below
@ -13806,7 +13881,7 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
// hacks of speed. computeSiteNumInlinks is true by default
// but if the user turns it off the just use sitelinks.txt
if ( ! cr->m_computeSiteNumInlinks ) {
if ( cr && ! cr->m_computeSiteNumInlinks ) {
int32_t hostHash32 = getHostHash32a();
int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
// try with www if not there
@ -13815,12 +13890,12 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
}
// fix core by setting these
m_siteNumInlinksUniqueIp = 0;
m_siteNumInlinksUniqueCBlock = 0;
m_siteNumInlinksTotal = 0;
m_siteNumInlinksUniqueIpValid = true;
m_siteNumInlinksUniqueCBlockValid = true;
m_siteNumInlinksTotalValid = true;
// m_siteNumInlinksUniqueIp = 0;
// m_siteNumInlinksUniqueCBlock = 0;
// m_siteNumInlinksTotal = 0;
// m_siteNumInlinksUniqueIpValid = true;
// m_siteNumInlinksUniqueCBlockValid = true;
// m_siteNumInlinksTotalValid = true;
// and this
m_siteNumInlinksValid = true;
m_siteNumInlinks = 0;
@ -13847,13 +13922,13 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
// no site inlinks
if ( *ip == 0 ) {
m_siteNumInlinks = 0;
m_siteNumInlinksUniqueIp = 0;
m_siteNumInlinksUniqueCBlock = 0;
m_siteNumInlinksTotal = 0;
// m_siteNumInlinksUniqueIp = 0;
// m_siteNumInlinksUniqueCBlock = 0;
// m_siteNumInlinksTotal = 0;
m_siteNumInlinksValid = true;
m_siteNumInlinksUniqueIpValid = true;
m_siteNumInlinksUniqueCBlockValid = true;
m_siteNumInlinksTotalValid = true;
// m_siteNumInlinksUniqueIpValid = true;
// m_siteNumInlinksUniqueCBlockValid = true;
// m_siteNumInlinksTotalValid = true;
return &m_siteNumInlinks;
}
@ -13940,13 +14015,13 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
if ( age > maxAge ) valid = false;
}
// our companion tags, sitePop and fresh inlinks
Tag *tag2 = gr->getTag ( "sitenuminlinksuniqueip" );
Tag *tag3 = gr->getTag ( "sitenuminlinksuniquecblock");
Tag *tag4 = gr->getTag ( "sitenuminlinkstotal");
// Tag *tag2 = gr->getTag ( "sitenuminlinksuniqueip" );
// Tag *tag3 = gr->getTag ( "sitenuminlinksuniquecblock");
// Tag *tag4 = gr->getTag ( "sitenuminlinkstotal");
// if we are missing either of those, invalidate as well
if ( ! tag2 ) valid = false;
if ( ! tag3 ) valid = false;
if ( ! tag4 ) valid = false;
// if ( ! tag2 ) valid = false;
// if ( ! tag3 ) valid = false;
// if ( ! tag4 ) valid = false;
// if we have already been through this
if ( m_updatingSiteLinkInfoTags ) valid = false;
// if rebuilding linkdb assume we have no links to sample from!
@ -13959,14 +14034,14 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
"age=%"INT32" ns=%"INT32" sni=%"INT32" "
"maxage=%"INT32" "
"tag=%"PTRFMT" "
"tag2=%"PTRFMT" "
"tag3=%"PTRFMT" "
// "tag2=%"PTRFMT" "
// "tag3=%"PTRFMT" "
"url=%s",
(int32_t)valid,age,ns,sni,
maxAge,
(PTRTYPE)tag,
(PTRTYPE)tag2,
(PTRTYPE)tag3,
// (PTRTYPE)tag2,
// (PTRTYPE)tag3,
m_firstUrl.m_url);
LinkInfo *sinfo = NULL;
@ -13979,18 +14054,18 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
m_siteNumInlinksValid = true;
// companion tags
if ( tag2 ) {
m_siteNumInlinksUniqueIp = atol(tag2->getTagData());
m_siteNumInlinksUniqueIpValid = true;
}
if ( tag3 ) {
m_siteNumInlinksUniqueCBlock =atol(tag3->getTagData());
m_siteNumInlinksUniqueCBlockValid = true;
}
if ( tag4 ) {
m_siteNumInlinksTotal =atol(tag4->getTagData());
m_siteNumInlinksTotalValid = true;
}
// if ( tag2 ) {
// m_siteNumInlinksUniqueIp = atol(tag2->getTagData());
// m_siteNumInlinksUniqueIpValid = true;
// }
// if ( tag3 ) {
// m_siteNumInlinksUniqueCBlock =atol(tag3->getTagData());
// m_siteNumInlinksUniqueCBlockValid = true;
// }
// if ( tag4 ) {
// m_siteNumInlinksTotal =atol(tag4->getTagData());
// m_siteNumInlinksTotalValid = true;
// }
// . consult our sitelinks.txt file
// . returns -1 if not found
@ -14049,14 +14124,14 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
m_siteNumInlinks = (int32_t)sinfo->m_numGoodInlinks;
//m_siteNumInlinksFresh = sinfo->m_numInlinksFresh;
//m_sitePop = sinfo->m_pagePop;
m_siteNumInlinksUniqueIp = sinfo->m_numUniqueIps;
m_siteNumInlinksUniqueCBlock = sinfo->m_numUniqueCBlocks;
m_siteNumInlinksTotal = sinfo->m_totalInlinkingDocIds;
// m_siteNumInlinksUniqueIp = sinfo->m_numUniqueIps;
// m_siteNumInlinksUniqueCBlock = sinfo->m_numUniqueCBlocks;
// m_siteNumInlinksTotal = sinfo->m_totalInlinkingDocIds;
m_siteNumInlinksValid = true;
m_siteNumInlinksUniqueIpValid = true;
m_siteNumInlinksUniqueCBlockValid = true;
m_siteNumInlinksTotalValid = true;
// m_siteNumInlinksUniqueIpValid = true;
// m_siteNumInlinksUniqueCBlockValid = true;
// m_siteNumInlinksTotalValid = true;
updateToMin:
@ -15563,10 +15638,11 @@ void gotDiffbotReplyWrapper ( void *state , TcpSocket *s ) {
THIS->m_diffbotReplyError = code;
}
// a hack for detecting if token is expired
if ( ! ttt && cr && strstr ( page , ":429}" ) ) {
if ( THIS->m_diffbotReplyError == EDIFFBOTTOKENEXPIRED ) {
// note it
log("xmldoc: pausing crawl %s (%"INT32") because "
"token is expired",cr->m_coll,(int32_t)cr->m_collnum);
"token is expired",cr->m_coll,
(int32_t)cr->m_collnum);
// pause the crawl
SafeBuf parmList;
// spidering enabled is the "cse" cgi parm in Parms.cpp
@ -16488,7 +16564,13 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
// go through gb. we should fix that by downloading the whole page
// ourselves and sending it back, and telling diffbot's phantomjs not
// to do the certificate check.
useProxies = false;
//
// for now, allow http and NOT https urls through though.
// TODO: if the url redirects to an https url will this mess us up?
if ( ! m_firstUrlValid )
useProxies = false;
if ( m_firstUrlValid && m_firstUrl.isHttps() )
useProxies = false;
// if we used a proxy to download the doc, then diffbot should too
// BUT tell diffbot to go through host #0 so we can send it to the
@ -17763,6 +17845,91 @@ Url **XmlDoc::getCanonicalRedirUrl ( ) {
return &m_canonicalRedirUrlPtr;
}
// returns false if none found
bool setMetaRedirUrlFromTag ( char *p , Url *metaRedirUrl , char niceness ,
Url *cu ) {
// limit scan
char *limit = p + 30;
// skip whitespace
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// must be a num
if ( ! is_digit(*p) ) return false;
// init delay
int32_t delay = atol ( p );
// ignore long delays
if ( delay >= 10 ) return false;
// now find the semicolon, if any
for ( ; *p && p < limit && *p != ';' ; p++ );
// must have semicolon
if ( *p != ';' ) return false;
// skip it
p++;
// skip whitespace some more
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// must have URL
if ( strncasecmp(p,"URL",3) ) return false;
// skip that
p += 3;
// skip white space
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// then an equal sign
if ( *p != '=' ) return false;
// skip equal sign
p++;
// then maybe more whitespace
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// an optional quote
if ( *p == '\"' ) p++;
// can also be a single quote!
if ( *p == '\'' ) p++;
// set the url start
char *url = p;
// now advance to next quote or space or >
for ( ; *p && !is_wspace_a(*p) &&
*p !='\'' &&
*p !='\"' &&
*p !='>' ;
p++);
// that is the end
char *urlEnd = p;
// get size
int32_t usize = urlEnd - url;
// skip if too big
if ( usize > 1024 ) {
log("build: meta redirurl of %"INT32" bytes too big",usize);
return false;
}
// get our current url
//Url *cu = getCurrentUrl();
// decode what we got
char decoded[MAX_URL_LEN];
// convert &amp; to "&"
int32_t decBytes = htmlDecode(decoded,url,usize,false,niceness);
decoded[decBytes]='\0';
// . then the url
// . set the url to the one in the redirect tag
// . but if the http-equiv meta redirect url starts with a '?'
// then just replace our cgi with that one
if ( *url == '?' ) {
char foob[MAX_URL_LEN*2];
char *pf = foob;
int32_t cuBytes = cu->getPathEnd() - cu->getUrl();
gbmemcpy(foob,cu->getUrl(),cuBytes);
pf += cuBytes;
gbmemcpy ( pf , decoded , decBytes );
pf += decBytes;
*pf = '\0';
metaRedirUrl->set(foob);
}
// . otherwise, append it right on
// . use "url" as the base Url
// . it may be the original url or the one we redirected to
// . redirUrl is set to the original at the top
else
// addWWW = false, stripSessId=true
metaRedirUrl->set(cu,decoded,decBytes,false,true);
return true;
}
// scan document for <meta http-equiv="refresh" content="0;URL=xxx">
@ -17789,6 +17956,14 @@ Url **XmlDoc::getMetaRedirUrl ( ) {
if ( cr->m_recycleContent || m_recycleContent )
return &m_metaRedirUrlPtr;
// will this work in here?
//uint8_t *ct = getContentType();
//if ( ! ct ) return NULL;
Url *cu = getCurrentUrl();
bool gotOne = false;
// advance a bit, we are initially looking for the 'v' char
p += 10;
// begin the string matching loop
@ -17828,91 +18003,64 @@ Url **XmlDoc::getMetaRedirUrl ( ) {
p += 8;
// skip possible quote
if ( *p == '\"' ) p++;
// limit scan
limit = p + 30;
// skip whitespace
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// must be a num
if ( ! is_digit(*p) ) continue;
// init delay
int32_t delay = atol ( p );
// ignore long delays
if ( delay >= 10 ) continue;
// now find the semicolon, if any
for ( ; *p && p < limit && *p != ';' ; p++ );
// must have semicolon
if ( *p != ';' ) continue;
// skip it
p++;
// skip whitespace some more
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// must have URL
if ( strncasecmp(p,"URL",3) ) continue;
// skip that
p += 3;
// skip white space
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// then an equal sign
if ( *p != '=' ) continue;
// skip equal sign
p++;
// then maybe more whitespace
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// an optional quote
if ( *p == '\"' ) p++;
// can also be a single quote!
if ( *p == '\'' ) p++;
// set the url start
char *url = p;
// now advance to next quote or space or >
for ( ; *p && !is_wspace_a(*p) &&
*p !='\'' &&
*p !='\"' &&
*p !='>' ;
p++);
// that is the end
char *urlEnd = p;
// get size
int32_t usize = urlEnd - url;
// skip if too big
if ( usize > 1024 ) {
log("build: meta redirurl of %"INT32" bytes too big",usize);
// PARSE OUT THE URL
Url dummy;
if ( ! setMetaRedirUrlFromTag ( p , &dummy , m_niceness ,cu))
continue;
gotOne = true;
break;
}
if ( ! gotOne )
return &m_metaRedirUrlPtr;
// to fix issue with scripts containing
// document.write('<meta http-equiv="Refresh" content="0;URL=http://ww
// we have to get the Xml. we can't call getXml() because of
// recursion bugs so just do it directly here
Xml xml;
if ( ! xml.set ( m_httpReply ,
m_httpReplySize - 1, // make it a length
false , // ownData?
0 , // allocSize
false , // pure xml?
m_version ,
false , // setParentsArg?
m_niceness ,
// assume html since getContentType() is recursive
// on us.
CT_HTML ) ) // *ct ) )
// return NULL on error with g_errno set
return NULL;
XmlNode *nodes = xml.getNodes();
int32_t n = xml.getNumNodes();
// find the first meta summary node
for ( int32_t i = 0 ; i < n ; i++ ) {
// continue if not a meta tag
if ( nodes[i].m_nodeId != 68 ) continue;
// only get content for <meta http-equiv=..>
int32_t tagLen;
char *tag ;
tag = xml.getString ( i , "http-equiv" , &tagLen );
// skip if empty
if ( ! tag || tagLen <= 0 ) continue;
// if not a refresh, skip it
if ( strncasecmp ( tag , "refresh", 7 ) ) continue;
// get the content
tag = xml.getString ( i ,"content", &tagLen );
// skip if empty
if ( ! tag || tagLen <= 0 ) continue;
// PARSE OUT THE URL
if (!setMetaRedirUrlFromTag(p,&m_metaRedirUrl,m_niceness,cu) )
continue;
}
// get our current url
Url *cu = getCurrentUrl();
// decode what we got
char decoded[MAX_URL_LEN];
// convert &amp; to "&"
int32_t decBytes = htmlDecode(decoded,url,usize,false,m_niceness);
decoded[decBytes]='\0';
// . then the url
// . set the url to the one in the redirect tag
// . but if the http-equiv meta redirect url starts with a '?'
// then just replace our cgi with that one
if ( *url == '?' ) {
char foob[MAX_URL_LEN*2];
char *pf = foob;
int32_t cuBytes = cu->getPathEnd() - cu->getUrl();
gbmemcpy(foob,cu->getUrl(),cuBytes);
pf += cuBytes;
gbmemcpy ( pf , decoded , decBytes );
pf += decBytes;
*pf = '\0';
m_metaRedirUrl.set(foob);
}
// . otherwise, append it right on
// . use "url" as the base Url
// . it may be the original url or the one we redirected to
// . redirUrl is set to the original at the top
else
// addWWW = false, stripSessId=true
m_metaRedirUrl.set(cu,decoded,decBytes,false,true);
// set it
m_metaRedirUrlPtr = &m_metaRedirUrl;
// return it
break;
return &m_metaRedirUrlPtr;
}
// nothing found
return &m_metaRedirUrlPtr;
}
@ -19086,6 +19234,9 @@ char **XmlDoc::getExpandedUtf8Content ( ) {
// <iframe src=""> which ends up embedding the root url.
if ( urlLen == 0 )
continue;
// skip if "about:blank"
if ( urlLen==11 && strncmp(url,"about:blank",11) == 0 )
continue;
// get our current url
//cu = getCurrentUrl();
// set our frame url
@ -19291,7 +19442,7 @@ void systemDoneWrapper ( void *state , ThreadEntry *t ) {
}
// we download large files to a file on disk, like warcs and arcs
File *XmlDoc::getUtf8ContentInFile ( int64_t *fileSizeArg ) {
BigFile *XmlDoc::getUtf8ContentInFile ( int64_t *fileSizeArg ) {
if ( m_fileValid ) {
*fileSizeArg = m_fileSize;
@ -19305,15 +19456,17 @@ File *XmlDoc::getUtf8ContentInFile ( int64_t *fileSizeArg ) {
char filename[2048];
snprintf ( filename,
2048,
"%sgbarchivefile%"UINT32"",
g_hostdb.m_dir,
"gbarchivefile%"UINT32"",
(int32_t)(int64_t)this);
m_file.set ( filename );
m_file.set ( g_hostdb.m_dir , filename );
m_fileSize = m_file.getFileSize();
m_fileValid = true;
*fileSizeArg = m_fileSize;
m_file.open(O_RDONLY);
// explicitly set it to false now to make sure it is not true,
// because that messes things up
m_file.m_usePartFiles = false;
return &m_file;
}
@ -19401,7 +19554,7 @@ File *XmlDoc::getUtf8ContentInFile ( int64_t *fileSizeArg ) {
systemDoneWrapper ,
systemStartWrapper_r ) )
// would block, wait for thread
return (File *)-1;
return (BigFile *)-1;
// failed?
log("build: failed to launch wget thread");
// If we run it in this thread then if we are fetching
@ -21445,12 +21598,13 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
//
// print # of link texts from 2nd coll
//
if ( m_linkInfo2Valid ) {
LinkInfo *info = ptr_linkInfo2;
int32_t nt = 0;
if ( info ) nt = info->getNumLinkTexts();
if ( nt ) sb->safePrintf("goodinlinks2=%"INT32" ",nt );
}
// this is no longer used for its original purpose.
// if ( m_linkInfo2Valid && size_linkInfo2 > 4 ) {
// LinkInfo *info = ptr_linkInfo2;
// int32_t nt = 0;
// if ( info ) nt = info->getNumLinkTexts();
// if ( nt ) sb->safePrintf("goodinlinks2=%"INT32" ",nt );
// }
if ( m_docIdValid )
sb->safePrintf("docid=%"UINT64" ",m_docId);
@ -25928,18 +26082,18 @@ void XmlDoc::copyFromOldDoc ( XmlDoc *od ) {
m_ip = od->m_ip;
m_ipValid = true;
m_siteNumInlinks = od->m_siteNumInlinks;
m_siteNumInlinksUniqueIp = od->m_siteNumInlinksUniqueIp;
m_siteNumInlinksUniqueCBlock= od->m_siteNumInlinksUniqueCBlock;
m_siteNumInlinksTotal = od->m_siteNumInlinksTotal;
// m_siteNumInlinksUniqueIp = od->m_siteNumInlinksUniqueIp;
// m_siteNumInlinksUniqueCBlock= od->m_siteNumInlinksUniqueCBlo
// m_siteNumInlinksTotal = od->m_siteNumInlinksTotal;
m_siteNumInlinksValid =
od->m_siteNumInlinksValid;
m_siteNumInlinksUniqueIpValid =
od->m_siteNumInlinksUniqueIpValid;
m_siteNumInlinksUniqueCBlockValid =
od->m_siteNumInlinksUniqueCBlockValid;
m_siteNumInlinksTotal =
od->m_siteNumInlinksTotalValid;
// m_siteNumInlinksUniqueIpValid =
// od->m_siteNumInlinksUniqueIpValid;
// m_siteNumInlinksUniqueCBlockValid =
// od->m_siteNumInlinksUniqueCBlockValid;
// m_siteNumInlinksTotal =
// od->m_siteNumInlinksTotalValid;
}
m_indexCode = 0;//od->m_indexCode;
@ -32238,8 +32392,13 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
}
TagRec *gr = getTagRec();
if ( ! gr || gr == (void *)-1 ) return (Msg20Reply *)gr;
// if we are showing sites that have been banned in tagdb, we don't
// have to do a tagdb lookup. that should speed things up.
TagRec *gr = NULL;
if ( cr && cr->m_doTagdbLookups ) {
gr = getTagRec();
if ( ! gr || gr == (void *)-1 ) return (Msg20Reply *)gr;
}
//reply-> ptr_tagRec = (char *)gr;
//reply->size_tagRec = gr->getSize();
@ -32319,7 +32478,8 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
if ( cr->m_forceDelete[ufn] ) pr = -3;
// this is an automatic ban!
if ( gr->getLong("manualban",0))pr=-3;//SPIDER_PRIORITY_BANNED;
if ( gr && gr->getLong("manualban",0))
pr=-3;//SPIDER_PRIORITY_BANNED;
// is it banned
if ( pr == -3 ) { // SPIDER_PRIORITY_BANNED ) { // -2
@ -32754,9 +32914,9 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
//if ( tag1 ) sni = atol(tag1->m_data);
//if ( tag2 ) spop = atol(tag2->m_data);
reply->m_siteNumInlinks = m_siteNumInlinks;
reply->m_siteNumInlinksTotal = m_siteNumInlinksTotal;
reply->m_siteNumUniqueIps = m_siteNumInlinksUniqueIp;
reply->m_siteNumUniqueCBlocks = m_siteNumInlinksUniqueCBlock;
//reply->m_siteNumInlinksTotal = m_siteNumInlinksTotal;
//reply->m_siteNumUniqueIps = m_siteNumInlinksUniqueIp;
//reply->m_siteNumUniqueCBlocks = m_siteNumInlinksUniqueCBlock;
//reply->m_sitePop = m_sitePop;
// . get stuff from link info
@ -38205,25 +38365,25 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
"<tr><td><b>good inlinks to site</b>"
"</td><td>%"INT32"</td></tr>\n"
"<tr><td>unique IP inlinks to site"
"</td><td>%"INT32"</td></tr>\n"
// "<tr><td>unique IP inlinks to site"
// "</td><td>%"INT32"</td></tr>\n"
"<tr><td>unique CBlock inlinks to site"
"</td><td>%"INT32"</td></tr>\n"
// "<tr><td>unique CBlock inlinks to site"
// "</td><td>%"INT32"</td></tr>\n"
"<tr><td><b>site rank</b></td><td>%"INT32"</td></tr>\n"
"<tr><td>good inlinks to page"
"</td><td>%"INT32"</td></tr>\n"
"<tr><td>unique IP inlinks to page"
"</td><td>%"INT32"</td></tr>\n"
// "<tr><td>unique IP inlinks to page"
// "</td><td>%"INT32"</td></tr>\n"
"<tr><td>unique CBlock inlinks to page"
"</td><td>%"INT32"</td></tr>\n"
// "<tr><td>unique CBlock inlinks to page"
// "</td><td>%"INT32"</td></tr>\n"
"<tr><td>total inlinks to page"
"</td><td>%"INT32"</td></tr>\n"
// "<tr><td>total inlinks to page"
// "</td><td>%"INT32"</td></tr>\n"
"<tr><td><nobr>page inlinks last computed</nobr></td>"
"<td>%s</td></tr>\n"
@ -38243,14 +38403,14 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
strLanguage,
g_countryCode.getName(m_countryId) ,
sni,
m_siteNumInlinksUniqueIp,
m_siteNumInlinksUniqueCBlock,
//m_siteNumInlinksUniqueIp,
//m_siteNumInlinksUniqueCBlock,
::getSiteRank(sni),
//info1->getNumTotalInlinks(),
info1->getNumGoodInlinks(),
info1->m_numUniqueIps,
info1->m_numUniqueCBlocks,
info1->m_totalInlinkingDocIds,
// info1->m_numUniqueIps,
// info1->m_numUniqueCBlocks,
// info1->m_totalInlinkingDocIds,
tmp3
);
@ -38262,26 +38422,26 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
"\t<siteRank>%"INT32"</siteRank>\n"
"\t<numGoodSiteInlinks>%"INT32"</numGoodSiteInlinks>\n"
"\t<numTotalSiteInlinks>%"INT32"</numTotalSiteInlinks>\n"
"\t<numUniqueIpsLinkingToSite>%"INT32""
"</numUniqueIpsLinkingToSite>\n"
"\t<numUniqueCBlocksLinkingToSite>%"INT32""
"</numUniqueCBlocksLinkingToSite>\n"
//"\t<numTotalSiteInlinks>%"INT32"</numTotalSiteInlinks>\n"
// "\t<numUniqueIpsLinkingToSite>%"INT32""
// "</numUniqueIpsLinkingToSite>\n"
// "\t<numUniqueCBlocksLinkingToSite>%"INT32""
// "</numUniqueCBlocksLinkingToSite>\n"
// how many inlinks, external and internal, we have
// to this page not filtered in any way!!!
"\t<numTotalPageInlinks>%"INT32"</numTotalPageInlinks>\n"
//"\t<numTotalPageInlinks>%"INT32"</numTotalPageInlinks>\n"
// how many inlinking ips we got, including our own if
// we link to ourself
"\t<numUniqueIpsLinkingToPage>%"INT32""
"</numUniqueIpsLinkingToPage>\n"
// "\t<numUniqueIpsLinkingToPage>%"INT32""
// "</numUniqueIpsLinkingToPage>\n"
// how many inlinking cblocks we got, including our own
// if we link to ourself
"\t<numUniqueCBlocksLinkingToPage>%"INT32""
"</numUniqueCBlocksLinkingToPage>\n"
// "\t<numUniqueCBlocksLinkingToPage>%"INT32""
// "</numUniqueCBlocksLinkingToPage>\n"
"\t<numGoodPageInlinks>%"INT32"</numGoodPageInlinks>\n"
@ -38293,13 +38453,13 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
,(int32_t)m_isLinkSpam
,::getSiteRank(sni)
,sni
,m_siteNumInlinksTotal
,m_siteNumInlinksUniqueIp
,m_siteNumInlinksUniqueCBlock
// ,m_siteNumInlinksTotal
// ,m_siteNumInlinksUniqueIp
// ,m_siteNumInlinksUniqueCBlock
,info1->m_totalInlinkingDocIds
,info1->m_numUniqueIps
,info1->m_numUniqueCBlocks
//,info1->m_totalInlinkingDocIds
//,info1->m_numUniqueIps
//,info1->m_numUniqueCBlocks
,info1->getNumGoodInlinks()
//,tmp3
@ -39312,6 +39472,12 @@ char **XmlDoc::getRootTitleBuf ( ) {
// sanity check, must include the null in the size
if ( m_rootTitleBufSize > 0 &&
m_rootTitleBuf [ m_rootTitleBufSize - 1 ] ) {
log("build: bad root titlebuf size not end in null char for "
"collnum=%i",(int)m_collnum);
ptr_rootTitleBuf = NULL;
size_rootTitleBuf = 0;
m_rootTitleBufValid = true;
return (char **)&m_rootTitleBuf;
char *xx=NULL;*xx=0;
//m_rootTitleBuf [ m_rootTitleBufSize - 1 ] = '\0';
//m_rootTitleBufSize++;
@ -39931,7 +40097,7 @@ SafeBuf *XmlDoc::getNewTagBuf ( ) {
return NULL;
}
int32_t old2, old3, old4;
//int32_t old2, old3, old4;
// if running for diffbot crawlbot then isCustomCrawl is true
// so do not update the siteinlink info already in tagdb since i
@ -39944,31 +40110,31 @@ SafeBuf *XmlDoc::getNewTagBuf ( ) {
//if ( strcmp(cr->m_coll,"GLOBAL-INDEX") == 0 ) ) goto skipSiteInlinks;
// sitenuminlinksfresh
old2 = gr->getLong("sitenuminlinksuniqueip",-1,NULL,&timestamp);
if ( old2 == -1 || old2 != m_siteNumInlinksUniqueIp ||
m_updatingSiteLinkInfoTags )
if ( ! tbuf->addTag2(mysite,"sitenuminlinksuniqueip",
now,"xmldoc",
*ip,m_siteNumInlinksUniqueIp,rdbId))
return NULL;
// sitepop
old3 = gr->getLong("sitenuminlinksuniquecblock",-1,NULL,
&timestamp);
if ( old3 == -1 || old3 != m_siteNumInlinksUniqueCBlock ||
m_updatingSiteLinkInfoTags )
if ( ! tbuf->addTag2(mysite,"sitenuminlinksuniquecblock",
now,"xmldoc",
*ip,m_siteNumInlinksUniqueCBlock,rdbId))
return NULL;
// total site inlinks
old4 = gr->getLong("sitenuminlinkstotal",-1,NULL,
&timestamp);
if ( old4 == -1 || old4 != m_siteNumInlinksTotal ||
m_updatingSiteLinkInfoTags )
if ( ! tbuf->addTag2(mysite,"sitenuminlinkstotal",
now,"xmldoc",
*ip,m_siteNumInlinksTotal,rdbId))
return NULL;
// old2 = gr->getLong("sitenuminlinksuniqueip",-1,NULL,&timestamp);
// if ( old2 == -1 || old2 != m_siteNumInlinksUniqueIp ||
// m_updatingSiteLinkInfoTags )
// if ( ! tbuf->addTag2(mysite,"sitenuminlinksuniqueip",
// now,"xmldoc",
// *ip,m_siteNumInlinksUniqueIp,rdbId))
// return NULL;
// // sitepop
// old3 = gr->getLong("sitenuminlinksuniquecblock",-1,NULL,
// &timestamp);
// if ( old3 == -1 || old3 != m_siteNumInlinksUniqueCBlock ||
// m_updatingSiteLinkInfoTags )
// if ( ! tbuf->addTag2(mysite,"sitenuminlinksuniquecblock",
// now,"xmldoc",
// *ip,m_siteNumInlinksUniqueCBlock,rdbId))
// return NULL;
// // total site inlinks
// old4 = gr->getLong("sitenuminlinkstotal",-1,NULL,
// &timestamp);
// if ( old4 == -1 || old4 != m_siteNumInlinksTotal ||
// m_updatingSiteLinkInfoTags )
// if ( ! tbuf->addTag2(mysite,"sitenuminlinkstotal",
// now,"xmldoc",
// *ip,m_siteNumInlinksTotal,rdbId))
// return NULL;
// skipSiteInlinks:

@ -280,8 +280,10 @@ class XmlDoc {
// this is a hash of all adjacent tag pairs for templated identificatn
uint32_t m_tagPairHash32;
int32_t m_siteNumInlinks;
int32_t m_siteNumInlinksUniqueIp; // m_siteNumInlinksFresh
int32_t m_siteNumInlinksUniqueCBlock; // m_sitePop;
//int32_t m_siteNumInlinksUniqueIp; // m_siteNumInlinksFresh
//int32_t m_siteNumInlinksUniqueCBlock; // m_sitePop;
int32_t m_reserved1;
int32_t m_reserved2;
uint32_t m_spideredTime; // time_t
// just don't throw away any relevant SpiderRequests and we have
// the data that m_minPubDate and m_maxPubDate provided
@ -297,7 +299,8 @@ class XmlDoc {
uint16_t m_countryId;
//uint16_t m_reserved1;//titleWeight;
//uint16_t m_reserved2;//headerWeight;
int32_t m_siteNumInlinksTotal;
//int32_t m_siteNumInlinksTotal;
int32_t m_reserved3;
//uint16_t m_reserved3;//urlPathWeight;
uint8_t m_metaListCheckSum8; // bring it back!!
char m_reserved3b;
@ -702,7 +705,7 @@ class XmlDoc {
char **getExpandedUtf8Content ( ) ;
char **getUtf8Content ( ) ;
// we download large files to a file on disk, like warcs and arcs
File *getUtf8ContentInFile ( int64_t *fileSizeArg );
BigFile *getUtf8ContentInFile ( int64_t *fileSizeArg );
int32_t *getContentHash32 ( ) ;
int32_t *getContentHashJson32 ( ) ;
//int32_t *getTagHash32 ( ) ;
@ -1008,6 +1011,9 @@ class XmlDoc {
int64_t m_startTime;
int64_t m_injectStartTime;
class XmlDoc *m_prevInject;
class XmlDoc *m_nextInject;
// when set() was called by Msg20.cpp so we can time how long it took
// to generate the summary
int64_t m_setTime;
@ -1084,8 +1090,10 @@ class XmlDoc {
int32_t m_fileBufAllocSize;
char *m_fptr ;
char *m_fptrEnd ;
File m_file;
BigFile m_file;
int64_t m_fileSize;
FileState m_fileState;
bool m_readThreadOut;
bool m_hasMoreToRead;
int32_t m_numInjectionsOut;
bool m_calledWgetThread;
@ -1432,9 +1440,9 @@ class XmlDoc {
//bool m_aboutUsLinkValid;
//bool m_contactLinksValid;
bool m_siteNumInlinksValid;
bool m_siteNumInlinksUniqueIpValid;//FreshValid;
bool m_siteNumInlinksUniqueCBlockValid;//sitePopValid
bool m_siteNumInlinksTotalValid;
//bool m_siteNumInlinksUniqueIpValid;//FreshValid;
//bool m_siteNumInlinksUniqueCBlockValid;//sitePopValid
//bool m_siteNumInlinksTotalValid;
bool m_siteNumInlinks8Valid;
bool m_siteLinkInfoValid;
bool m_isWWWDupValid;

@ -18,6 +18,8 @@ static void sleepWrapper ( int fd , void *state ) ;
bool sendPageSEO(TcpSocket *s, HttpRequest *hr) {return true;}
bool g_recoveryMode;
int g_inMemcpy;
int32_t g_recoveryLevel;
static int32_t s_maxNumThreads = 1 ;
static int32_t s_launched = 0;
@ -48,7 +50,7 @@ int main ( int argc , char *argv[] ) {
if ( setrlimit(RLIMIT_CORE,&lim) )
log("blaster::setrlimit: %s", mstrerror(errno) );
g_conf.m_maxMem = 500000000;
//g_conf.m_maxMem = 500000000;
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
@ -57,7 +59,7 @@ int main ( int argc , char *argv[] ) {
// init the memory class after conf since it gets maxMem from Conf
//if ( ! g_mem.init ( 20000000 ) ) {
// log("blaster::Mem init failed" ); return 1; }
g_mem.m_maxMem = 200000000;
//g_mem.m_maxMem = 200000000;
// start up log file
if ( ! g_log.init( "/tmp/blasterLog" ) ) {
log("blaster::Log open /tmp/blasterLog failed" ); return 1; }
@ -449,7 +451,9 @@ bool getWords() {
s_words += '\0';
}
fclose ( fd );
log("blaster: read %"INT32" words, %"INT32" bytes in from dictionary.",
s_windices.length() / sizeof(int32_t), s_words.length());
log("blaster: read %"INT32" words, "
"%"INT32" bytes in from dictionary.",
(int32_t)(s_windices.length() / sizeof(int32_t)),
(int32_t)s_words.length());
return true;
}
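The reworked log call fixes a real varargs bug, not just long lines: dividing by sizeof(int32_t) promotes the expression to a 64-bit size_t, while the %"INT32" conversion only consumes 32 bits, so the format and the pushed arguments disagreed and later arguments could be read from the wrong spot. Compressed:

    // BUG: 64-bit value pushed, 32 bits consumed by the format
    log("... %"INT32" ...", s_windices.length() / sizeof(int32_t));
    // FIX: cast so the pushed argument matches the format width
    log("... %"INT32" ...",
        (int32_t)(s_windices.length() / sizeof(int32_t)));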

@ -27,6 +27,7 @@ bool sendPageSEO(TcpSocket *s, HttpRequest *hr) {return true;}
//SafeBuf g_qbuf;
bool g_recoveryMode;
int32_t g_recoveryLevel;
int g_inMemcpy;
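Both blaster above and this tool link against the full set of gb objects, so any global that gb's own main.cpp defines and other modules reference has to get a real definition in each standalone tool, or the link fails with undefined symbols. The pattern in miniature:

    // stub globals: referenced by the linked-in gb objects, defined
    // for real only in gb's main.cpp; tools just supply the symbols
    bool    g_recoveryMode;
    int32_t g_recoveryLevel;
    int     g_inMemcpy;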

@ -2504,7 +2504,7 @@ int32_t deserializeMsg ( int32_t baseSize ,
// make it NULL if size is 0 though
if ( *sizePtr == 0 ) *strPtr = NULL;
// sanity check
if ( *sizePtr < 0 ) { char *xx = NULL; *xx =0; }
if ( *sizePtr < 0 ) { g_errno = ECORRUPTDATA; return -1;}
// advance our destination ptr
p += *sizePtr;
// advance both ptrs to next string
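With the fatal sanity check replaced by an error return, callers of deserializeMsg() now have to test for -1 and pass the ECORRUPTDATA g_errno back up rather than letting corrupt input crash the process. A hypothetical caller sketch (names are illustrative, and the trailing parameters are assumed to follow the declaration below):

    int32_t used = deserializeMsg ( (int32_t)sizeof(MyMsg) ,
                                    &firstSize , &lastSize ,
                                    &firstStrPtr , serializedBuf );
    if ( used == -1 ) {
        log("net: corrupt msg: %s", mstrerror(g_errno));
        return false; // propagate instead of crashing
    }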

@ -620,6 +620,7 @@ char *serializeMsg2 ( void *thisPtr ,
int32_t *retSize );
// convert offsets back into ptrs
// returns -1 on error
int32_t deserializeMsg ( int32_t baseSize ,
int32_t *firstSizeParm ,
int32_t *lastSizeParm ,

@ -6394,7 +6394,7 @@ void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeT
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
//g_conf.m_checksumdbMaxDiskPageCacheMem = 0;
//g_conf.m_spiderdbMaxDiskPageCacheMem = 0;
g_conf.m_tfndbMaxDiskPageCacheMem = 0;
//g_conf.m_tfndbMaxDiskPageCacheMem = 0;
g_titledb.init ();
//g_collectiondb.init(true);
g_titledb.getRdb()->addRdbBase1(coll);
@ -7028,7 +7028,7 @@ int32_t dumpSpiderdb ( char *coll,
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
//g_conf.m_checksumdbMaxDiskPageCacheMem = 0;
//g_conf.m_spiderdbMaxDiskPageCacheMem = 0;
g_conf.m_tfndbMaxDiskPageCacheMem = 0;
//g_conf.m_tfndbMaxDiskPageCacheMem = 0;
g_spiderdb.init ();
//g_collectiondb.init(true);
g_spiderdb.getRdb()->addRdbBase1(coll );
@ -8637,8 +8637,8 @@ if ( ! tr.set ( rec , listSize , false ) ) { // own data?
void dumpMissing ( char *coll ) {
// load tfndb, assume it is a perfect reflection of titledb
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
g_conf.m_tfndbMaxDiskPageCacheMem = 0;
g_conf.m_indexdbMaxCacheMem = 0;
//g_conf.m_tfndbMaxDiskPageCacheMem = 0;
//g_conf.m_indexdbMaxCacheMem = 0;
//g_conf.m_clusterdbMaxDiskPageCacheMem = 0;
//g_tfndb.init ();
@ -8855,7 +8855,7 @@ void dumpMissing ( char *coll ) {
void dumpDups ( char *coll ) {
// load tfndb, assume it is a perfect reflection of titledb
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
g_conf.m_indexdbMaxCacheMem = 0;
//g_conf.m_indexdbMaxCacheMem = 0;
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
g_indexdb.init ();
@ -9407,10 +9407,10 @@ void removeDocIds ( char *coll , char *filename ) {
// g_conf.m_checksumdbMinFilesToMerge = 100;
if ( g_conf.m_clusterdbMinFilesToMerge < 100 )
g_conf.m_clusterdbMinFilesToMerge = 100;
g_conf.m_tfndbMaxDiskPageCacheMem = 0;
//g_conf.m_tfndbMaxDiskPageCacheMem = 0;
//g_conf.m_checksumdbMaxDiskPageCacheMem = 0;
//g_conf.m_clusterdbMaxDiskPageCacheMem = 0;
g_conf.m_indexdbMaxCacheMem = 0;
//g_conf.m_indexdbMaxCacheMem = 0;
//g_conf.m_checksumdbMaxCacheMem = 0;
//g_conf.m_clusterdbMaxCacheMem = 0;
@ -12272,7 +12272,7 @@ bool parseTest ( char *coll , int64_t docId , char *query ) {
//g_mem.m_maxMem = 2000000000LL; // 2G
//g_conf.m_checksumdbMaxDiskPageCacheMem = 0;
//g_conf.m_spiderdbMaxDiskPageCacheMem = 0;
g_conf.m_tfndbMaxDiskPageCacheMem = 0;
//g_conf.m_tfndbMaxDiskPageCacheMem = 0;
//g_conf.m_titledbMaxTreeMem = 1024*1024*10;
g_titledb.init ();
//g_collectiondb.init(true);
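All of the m_tfndbMaxDiskPageCacheMem and m_indexdbMaxCacheMem assignments commented out across these dump/parse utilities go together: with the per-BigFile DiskPageCache taken out of the I/O path (see the BigFile changes), these Conf knobs evidently no longer exist or no longer do anything, so each standalone tool that still touched them had to drop the references.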

qa.cpp

@ -1349,6 +1349,10 @@ bool qaSyntax ( ) {
"format=json&"
"q=");
tmp.urlEncode ( s_q[s_i] );
// get back 100 results so this one query is easier to debug
if ( strcmp(s_q[s_i],"gbssStatusCode:0") == 0 ) {
tmp.safePrintf("&n=100");
}
tmp.nullTerm();
// point to next query
s_i++;
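For reference, what this branch of the QA loop ends up building, using only the SafeBuf calls visible above:

    SafeBuf tmp;
    tmp.safePrintf ( "format=json&"
                     "q=" );
    tmp.urlEncode  ( "gbssStatusCode:0" );
    tmp.safePrintf ( "&n=100" ); // widen only this query's result set
    tmp.nullTerm   ( );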