Merge branch 'diffbot-testing' into diffbot
Conflicts: Spider.cpp
This commit is contained in:
BigFile.cppBigFile.hCachedb.cppCachedb.hCatdb.cppCatdb.hClusterdb.cppClusterdb.hCollectiondb.cppCollectiondb.hConf.cppConf.hDatedb.cppErrno.cppErrno.hHashTableX.cppIndexdb.cppIndexdb.hLinkdb.cppLinkdb.hLoop.cppMakefileMem.cppMonitordb.cppMonitordb.hMsg13.cppMsg20.cppMsg20.hMsg22.cppMsg3.cppMsg3.hMsg39.cppMsg3a.cppMsg40.cppMsg5.cppPageCrawlBot.cppPageInject.cppPageInject.hPageResults.cppPageResults.hPageSockets.cppPageStats.cppPageThreads.cppPages.cppParms.cppPingServer.cppPingServer.hPlacedb.cppPlacedb.hPosdb.cppPosdb.hProcess.cppProfiler.cppProfiler.hRdb.cppRdb.hRdbBase.cppRdbBase.hRdbCache.cppRdbCache.hRdbDump.cppRdbDump.hRdbList.cppRdbList.hRdbMap.cppRdbMerge.cppRdbMerge.hRdbScan.cppRdbScan.hRdbTree.cppSafeBuf.cppSections.cppSections.hSpider.cppSpider.hSpiderProxy.cppStatsdb.cppTagdb.cppTagdb.hTcpServer.cppThreads.cppThreads.hTitledb.cppTitledb.hUdpServer.cppXmlDoc.cppXmlDoc.hblaster2.cppdmozparse.cppfctypes.cppfctypes.hmain.cppqa.cpp
113
BigFile.cpp
113
BigFile.cpp
@ -9,7 +9,7 @@
|
||||
#include "Threads.h"
|
||||
#include "Stats.h"
|
||||
#include "Statsdb.h"
|
||||
#include "DiskPageCache.h"
|
||||
//#include "DiskPageCache.h"
|
||||
|
||||
#ifdef ASYNCIO
|
||||
#include <aio.h>
|
||||
@ -35,11 +35,12 @@ BigFile::~BigFile () {
|
||||
BigFile::BigFile () {
|
||||
m_permissions = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH ;
|
||||
m_flags = O_RDWR ; // | O_DIRECT;
|
||||
m_usePartFiles = true;
|
||||
// NULLify all ptrs to files
|
||||
//for ( int32_t i = 0 ; i < MAX_PART_FILES ; i++ ) m_files[i] = NULL;
|
||||
m_maxParts = 0;
|
||||
m_numParts = 0;
|
||||
m_pc = NULL;
|
||||
//m_pc = NULL;
|
||||
m_vfd = -1;
|
||||
//m_vfdAllowed = false;
|
||||
m_fileSize = -1;
|
||||
@ -74,6 +75,8 @@ bool BigFile::set ( char *dir , char *baseFilename , char *stripeDir ) {
|
||||
m_dir .setLabel("bfd");
|
||||
m_baseFilename.setLabel("bfbf");
|
||||
|
||||
m_usePartFiles = true;
|
||||
|
||||
// use this 32 byte char buf to avoid a malloc if possible
|
||||
m_baseFilename.setBuf (m_tmpBaseBuf,32,0,false);
|
||||
|
||||
@ -265,23 +268,36 @@ bool BigFile::doesPartExist ( int32_t n ) {
|
||||
|
||||
static int64_t s_vfd = 0;
|
||||
|
||||
// do not use part files for this open so we can open regular really >2GB
|
||||
// sized files with it
|
||||
// bool BigFile::open2 ( int flags ,
|
||||
// void *pc ,
|
||||
// int64_t maxFileSize ,
|
||||
// int permissions ) {
|
||||
// return open ( flags , pc , maxFileSize , permissions , false );
|
||||
// }
|
||||
|
||||
// . overide File::open so we can set m_numParts
|
||||
// . set maxFileSize when opening a new file for writing and using
|
||||
// DiskPageCache
|
||||
// . use maxFileSize of -1 for us to use getFileSize() to set it
|
||||
bool BigFile::open ( int flags , class DiskPageCache *pc ,
|
||||
bool BigFile::open ( int flags ,
|
||||
//class DiskPageCache *pc ,
|
||||
void *pc ,
|
||||
int64_t maxFileSize ,
|
||||
int permissions ) {
|
||||
|
||||
m_flags = flags;
|
||||
m_pc = pc;
|
||||
//m_pc = pc;
|
||||
m_permissions = permissions;
|
||||
m_isClosing = false;
|
||||
// this is true except when parsing big warc files
|
||||
m_usePartFiles = true;//usePartFiles;
|
||||
// . init the page cache for this vfd
|
||||
// . this returns our "virtual fd", not the same as File::m_vfd
|
||||
// . returns -1 and sets g_errno on failure
|
||||
// . we pass m_vfd to getPages() and addPages()
|
||||
if ( m_pc && m_vfd == -1 ) {
|
||||
if ( m_vfd == -1 ) {
|
||||
//if ( maxFileSize == -1 ) maxFileSize = getFileSize();
|
||||
m_vfd = ++s_vfd;
|
||||
//g_errno = 0;
|
||||
@ -527,6 +543,7 @@ bool BigFile::readwrite ( void *buf ,
|
||||
fstate->m_inPageCache = false;
|
||||
// . try to get as much as we can from page cache first
|
||||
// . the vfd of the big file will be the vfd of its last File class
|
||||
/*
|
||||
if ( ! doWrite && m_pc && allowPageCache ) {
|
||||
//int32_t oldOff = offset;
|
||||
// we have to set these so RdbScan doesn't freak out if we
|
||||
@ -559,6 +576,7 @@ bool BigFile::readwrite ( void *buf ,
|
||||
// return true;
|
||||
//}
|
||||
}
|
||||
*/
|
||||
// sanity check. if you set hitDisk to false, you must allow
|
||||
// us to check the page cache! silly bean!
|
||||
if ( ! allowPageCache && ! hitDisk ) { char*xx=NULL;*xx=0; }
|
||||
@ -591,6 +609,7 @@ bool BigFile::readwrite ( void *buf ,
|
||||
fstate->m_callback = callback;
|
||||
fstate->m_niceness = niceness;
|
||||
fstate->m_flags = m_flags;
|
||||
fstate->m_usePartFiles = m_usePartFiles;
|
||||
// sanity
|
||||
if ( fstate->m_bytesToGo > 150000000 )
|
||||
log("file: huge read of %"INT64" bytes",(int64_t)size);
|
||||
@ -603,6 +622,13 @@ bool BigFile::readwrite ( void *buf ,
|
||||
// situation occurs and pass a g_errno back to the caller.
|
||||
fstate->m_filenum1 = offset / MAX_PART_SIZE;
|
||||
fstate->m_filenum2 = (offset + size ) / MAX_PART_SIZE;
|
||||
|
||||
// if not really a big file. we use this for parsing huge warc files
|
||||
if ( ! m_usePartFiles ) {
|
||||
fstate->m_filenum1 = 0;
|
||||
fstate->m_filenum2 = 0;
|
||||
}
|
||||
|
||||
// . save the open count for this fd
|
||||
// . if it changes when we're done with the read we do a re-read
|
||||
// . it gets incremented once every time File calls ::open and gets
|
||||
@ -643,9 +669,9 @@ bool BigFile::readwrite ( void *buf ,
|
||||
fstate->m_errno = 0;
|
||||
fstate->m_errno2 = 0;
|
||||
fstate->m_startTime = gettimeofdayInMilliseconds();
|
||||
fstate->m_pc = m_pc;
|
||||
if ( ! allowPageCache )
|
||||
fstate->m_pc = NULL;
|
||||
//fstate->m_pc = NULL;//m_pc;
|
||||
// if ( ! allowPageCache )
|
||||
// fstate->m_pc = NULL;
|
||||
fstate->m_vfd = m_vfd;
|
||||
// if hitDisk was false we only check the page cache!
|
||||
if ( ! hitDisk ) return true;
|
||||
@ -765,7 +791,7 @@ bool BigFile::readwrite ( void *buf ,
|
||||
// how many bytes to read from each file?
|
||||
int64_t readSize1 = size;
|
||||
int64_t readSize2 = 0;
|
||||
if ( off1 + readSize1 > MAX_PART_SIZE ) {
|
||||
if ( off1 + readSize1 > MAX_PART_SIZE && m_usePartFiles ) {
|
||||
readSize1 = ((int64_t)MAX_PART_SIZE) - off1;
|
||||
readSize2 = size - readSize1;
|
||||
}
|
||||
@ -784,6 +810,10 @@ bool BigFile::readwrite ( void *buf ,
|
||||
int32_t filenum = offset / MAX_PART_SIZE;
|
||||
int32_t localOffset = offset % MAX_PART_SIZE;
|
||||
|
||||
if ( ! m_usePartFiles ) {
|
||||
filenum = 0;
|
||||
localOffset = offset;
|
||||
}
|
||||
|
||||
// read or write?
|
||||
if ( doWrite ) a0->aio_lio_opcode = LIO_WRITE;
|
||||
@ -852,7 +882,8 @@ bool BigFile::readwrite ( void *buf ,
|
||||
int32_t rate = 100000;
|
||||
if ( took > 500 ) rate = fstate->m_bytesDone / took ;
|
||||
if ( rate < 8000 && fstate->m_niceness <= 0 ) {
|
||||
log(LOG_INFO,"disk: Read %"INT32" bytes in %"INT64" ms (%"INT32"MB/s).",
|
||||
log(LOG_INFO,"disk: Read %"INT64" bytes in %"INT64" "
|
||||
"ms (%"INT32"KB/s).",
|
||||
fstate->m_bytesDone,took,rate);
|
||||
g_stats.m_slowDiskReads++;
|
||||
}
|
||||
@ -880,12 +911,12 @@ bool BigFile::readwrite ( void *buf ,
|
||||
// fstate->m_bytesDone);
|
||||
|
||||
// store read/written pages into page cache
|
||||
if ( ! g_errno && fstate->m_pc )
|
||||
fstate->m_pc->addPages ( fstate->m_vfd ,
|
||||
fstate->m_offset ,
|
||||
fstate->m_bytesDone ,
|
||||
fstate->m_buf ,
|
||||
fstate->m_niceness );
|
||||
// if ( ! g_errno && fstate->m_pc )
|
||||
// fstate->m_pc->addPages ( fstate->m_vfd ,
|
||||
// fstate->m_offset ,
|
||||
// fstate->m_bytesDone ,
|
||||
// fstate->m_buf ,
|
||||
// fstate->m_niceness );
|
||||
// now log our stuff here
|
||||
if ( g_errno && g_errno != EBADENGINEER )
|
||||
log("disk: readwrite: %s", mstrerror(g_errno));
|
||||
@ -952,7 +983,8 @@ void doneWrapper ( void *state , ThreadEntry *t ) {
|
||||
if ( fstate->m_errno == EDISKSTUCK ) slow = true;
|
||||
if ( slow && fstate->m_niceness <= 0 ) {
|
||||
if ( fstate->m_errno != EDISKSTUCK )
|
||||
log(LOG_INFO, "disk: Read %"INT32" bytes in %"INT64" ms (%"INT32"MB/s).",
|
||||
log(LOG_INFO, "disk: Read %"INT64" bytes in %"INT64" "
|
||||
"ms (%"INT32"KB/s).",
|
||||
fstate->m_bytesDone,took,rate);
|
||||
g_stats.m_slowDiskReads++;
|
||||
}
|
||||
@ -964,12 +996,12 @@ void doneWrapper ( void *state , ThreadEntry *t ) {
|
||||
if ( ! g_errno ) g_errno = fstate->m_errno2;
|
||||
// fstate has his own m_pc in case BigFile got deleted, we cannot
|
||||
// reference it...
|
||||
if ( ! g_errno && fstate->m_pc )
|
||||
fstate->m_pc->addPages ( fstate->m_vfd ,
|
||||
fstate->m_offset ,
|
||||
fstate->m_bytesDone ,
|
||||
fstate->m_buf ,
|
||||
fstate->m_niceness );
|
||||
// if ( ! g_errno && fstate->m_pc )
|
||||
// fstate->m_pc->addPages ( fstate->m_vfd ,
|
||||
// fstate->m_offset ,
|
||||
// fstate->m_bytesDone ,
|
||||
// fstate->m_buf ,
|
||||
// fstate->m_niceness );
|
||||
|
||||
// add the stat
|
||||
if ( ! g_errno ) {
|
||||
@ -1015,12 +1047,14 @@ void doneWrapper ( void *state , ThreadEntry *t ) {
|
||||
int32_t tt = LOG_WARN;
|
||||
if ( g_errno == EFILECLOSED ) tt = LOG_INFO;
|
||||
if ( g_errno && g_errno != EDISKSTUCK )
|
||||
log (tt,"disk: %s. fd1=%"INT32" vfd=%"INT32" "
|
||||
"off=%"INT64" toread=%"INT32".",
|
||||
mstrerror(g_errno),
|
||||
(int32_t)fstate->m_fd1,(int32_t)fstate->m_vfd,
|
||||
(int64_t)fstate->m_offset ,
|
||||
(int32_t)fstate->m_bytesToGo );
|
||||
log (tt,"disk: %s. fd1=%"INT32" fd2=%"INT32" "
|
||||
"off=%"INT64" toread=%"INT32,
|
||||
mstrerror(g_errno),
|
||||
(int32_t)fstate->m_fd1,
|
||||
(int32_t)fstate->m_fd2,
|
||||
(int64_t)fstate->m_offset ,
|
||||
(int32_t)fstate->m_bytesToGo
|
||||
);
|
||||
// someone is closing our fd without setting File::s_vfds[fd] to -1
|
||||
if ( g_errno && g_errno != EDISKSTUCK ) {
|
||||
//int fd1 = fstate->m_fd1;
|
||||
@ -1256,6 +1290,12 @@ bool readwrite_r ( FileState *fstate , ThreadEntry *t ) {
|
||||
int32_t len = bytesToGo - bytesDone;
|
||||
// how many bytes can we write to it now
|
||||
if ( len > avail ) len = avail;
|
||||
// hack for reading warc files
|
||||
if ( ! fstate->m_usePartFiles ) {
|
||||
filenum = 0;
|
||||
localOffset = offset;
|
||||
len = bytesToGo - bytesDone;
|
||||
}
|
||||
// get the fd for this filenum
|
||||
int fd = -1;
|
||||
if ( filenum == fstate->m_filenum1 ) fd = fstate->m_fd1;
|
||||
@ -1273,9 +1313,9 @@ bool readwrite_r ( FileState *fstate , ThreadEntry *t ) {
|
||||
if ( t && t->m_callback == ohcrap ) return false;
|
||||
|
||||
// only set this now if we are the first one
|
||||
if ( g_threads.m_threadQueues[DISK_THREAD].m_hiReturned ==
|
||||
g_threads.m_threadQueues[DISK_THREAD].m_hiLaunched )
|
||||
g_lastDiskReadStarted = fstate->m_startTime;
|
||||
// if ( g_threads.m_threadQueues[DISK_THREAD].m_hiReturned ==
|
||||
// g_threads.m_threadQueues[DISK_THREAD].m_hiLaunched )
|
||||
// g_lastDiskReadStarted = fstate->m_startTime;
|
||||
|
||||
// fake it out
|
||||
//static int32_t s_poo = 0;
|
||||
@ -1340,10 +1380,17 @@ bool readwrite_r ( FileState *fstate , ThreadEntry *t ) {
|
||||
log("disk: Read of %"INT32" bytes at offset %"INT64" "
|
||||
" failed because file is too short for that "
|
||||
"offset? Our fd was probably stolen from us by another "
|
||||
"thread. Will retry. error=%s.",
|
||||
"thread. fd1=%i fd2=%i len=%i filenum=%i "
|
||||
"localoffset=%i. usepart=%i error=%s.",
|
||||
(int32_t)len,fstate->m_offset,
|
||||
//fstate->m_this->getDir(),
|
||||
//fstate->m_this->getFilename(),
|
||||
fstate->m_fd1,
|
||||
fstate->m_fd2,
|
||||
len,
|
||||
filenum,
|
||||
localOffset,
|
||||
fstate->m_usePartFiles,
|
||||
mstrerror(errno));
|
||||
errno = EBADENGINEER;
|
||||
return false; // log("disk::read/write: offset too big");
|
||||
|
32
BigFile.h
32
BigFile.h
@ -47,14 +47,14 @@ public:
|
||||
class BigFile *m_this;
|
||||
//struct aiocb m_aiostate;
|
||||
char *m_buf;
|
||||
int32_t m_bytesToGo;
|
||||
int64_t m_bytesToGo;
|
||||
int64_t m_offset;
|
||||
// . the original offset, because we set m_offset to m_currentOffset
|
||||
// if the original offset specified is -1
|
||||
// . we also advance BigFile::m_currentOffset when done w/ read/write
|
||||
//int64_t m_origOffset;
|
||||
bool m_doWrite;
|
||||
int32_t m_bytesDone;
|
||||
int64_t m_bytesDone;
|
||||
void *m_state ;
|
||||
void (*m_callback) ( void *state ) ;
|
||||
// goes from 0 to 1, the lower the niceness, the higher the priority
|
||||
@ -79,9 +79,10 @@ public:
|
||||
// when we started for graphing purposes (in milliseconds)
|
||||
int64_t m_startTime;
|
||||
int64_t m_doneTime;
|
||||
char m_usePartFiles;
|
||||
// this is used for calling DiskPageCache::addPages() when done
|
||||
// with the read/write
|
||||
class DiskPageCache *m_pc;
|
||||
//class DiskPageCache *m_pc;
|
||||
// this is just used for accessing the DiskPageCache, m_pc, it is
|
||||
// a "virtual fd" for this whole file
|
||||
int64_t m_vfd;
|
||||
@ -102,10 +103,10 @@ public:
|
||||
// threads each hogging up 32KB of memory waiting to read tfndb.
|
||||
// m_allocBuf points to what we allocated.
|
||||
char *m_allocBuf;
|
||||
int32_t m_allocSize;
|
||||
int64_t m_allocSize;
|
||||
// m_allocOff is offset into m_allocBuf where we start reading into
|
||||
// from the file
|
||||
int32_t m_allocOff;
|
||||
int64_t m_allocOff;
|
||||
// do not call pthread_create() for every read we do. use async io
|
||||
// because it should be much much faster
|
||||
#ifdef ASYNCIO
|
||||
@ -138,10 +139,23 @@ class BigFile {
|
||||
// . if you are opening a new file for writing, you need to provide it
|
||||
// if you pass in a DiskPageCache ptr
|
||||
bool open ( int flags ,
|
||||
class DiskPageCache *pc = NULL ,
|
||||
//class DiskPageCache *pc = NULL ,
|
||||
void *pc = NULL ,
|
||||
int64_t maxFileSize = -1 ,
|
||||
int permissions =
|
||||
S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
|
||||
//bool usePartFiles = true );
|
||||
|
||||
// this will set usepartfiles to false! so use this to open large
|
||||
// warc or arc files
|
||||
//bool open2 ( int flags ,
|
||||
// //class DiskPageCache *pc = NULL ,
|
||||
// void *pc = NULL ,
|
||||
// int64_t maxFileSize = -1 ,
|
||||
// int permissions =
|
||||
// S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
|
||||
|
||||
|
||||
|
||||
int getFlags() { return m_flags; };
|
||||
|
||||
@ -234,7 +248,7 @@ class BigFile {
|
||||
|
||||
//int64_t m_currentOffset;
|
||||
|
||||
DiskPageCache *getDiskPageCache ( ) { return m_pc; };
|
||||
//DiskPageCache *getDiskPageCache ( ) { return m_pc; };
|
||||
int32_t getVfd ( ) { return m_vfd; };
|
||||
|
||||
// WARNING: some may have been unlinked from call to chopHead()
|
||||
@ -347,13 +361,15 @@ class BigFile {
|
||||
// maximum part #
|
||||
int32_t m_maxParts;
|
||||
|
||||
class DiskPageCache *m_pc;
|
||||
//class DiskPageCache *m_pc;
|
||||
int32_t m_vfd;
|
||||
//bool m_vfdAllowed;
|
||||
|
||||
// prevent circular calls to BigFile::close() with this
|
||||
char m_isClosing;
|
||||
|
||||
char m_usePartFiles;
|
||||
|
||||
int64_t m_fileSize;
|
||||
|
||||
// oldest of the last modified dates of all the part files
|
||||
|
16
Cachedb.cpp
16
Cachedb.cpp
@ -13,7 +13,7 @@ void Cachedb::reset() {
|
||||
|
||||
bool Cachedb::init ( ) {
|
||||
// we use the same disk page size as indexdb (for rdbmap.cpp)
|
||||
int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
|
||||
//int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
|
||||
// set this for debugging
|
||||
//int64_t maxTreeMem = 1000000;
|
||||
// i've seen some debug entries like 33MB because of
|
||||
@ -26,7 +26,7 @@ bool Cachedb::init ( ) {
|
||||
// . >1000 bytes of data per rec
|
||||
int32_t maxTreeNodes = maxTreeMem /(sizeof(key96_t)+16+1000);
|
||||
// disk page cache mem, 100MB on gk0 now
|
||||
int32_t pcmem = 0; // g_conf.m_cachedbMaxDiskPageCacheMem;
|
||||
//int32_t pcmem = 0; // g_conf.m_cachedbMaxDiskPageCacheMem;
|
||||
// keep this low if we are the tmp cluster
|
||||
//if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
|
||||
// TODO: would be nice to just do page caching on the satellite files;
|
||||
@ -38,11 +38,11 @@ bool Cachedb::init ( ) {
|
||||
m_rdbId = RDB_SERPDB;
|
||||
}
|
||||
|
||||
if ( ! m_pc.init ( m_name ,
|
||||
m_rdbId, // RDB_CACHEDB,
|
||||
pcmem ,
|
||||
pageSize ))
|
||||
return log("db: %s init failed.",m_name);
|
||||
// if ( ! m_pc.init ( m_name ,
|
||||
// m_rdbId, // RDB_CACHEDB,
|
||||
// pcmem ,
|
||||
// pageSize ))
|
||||
// return log("db: %s init failed.",m_name);
|
||||
// init the rdb
|
||||
if ( ! m_rdb.init ( g_hostdb.m_dir ,
|
||||
m_name ,
|
||||
@ -60,7 +60,7 @@ bool Cachedb::init ( ) {
|
||||
0 , // cache nodes
|
||||
false, // true , // use half keys
|
||||
false , // load cache from disk
|
||||
&m_pc ,
|
||||
NULL,//&m_pc ,
|
||||
false , // false
|
||||
false , // preload page cache
|
||||
sizeof(key96_t) ,
|
||||
|
@ -18,7 +18,7 @@
|
||||
#define CACHEDBKS sizeof(key96_t)
|
||||
|
||||
#include "Rdb.h"
|
||||
#include "DiskPageCache.h"
|
||||
//#include "DiskPageCache.h"
|
||||
|
||||
// do not change these numbers, they are permanent and stored in cachedb
|
||||
// that way... just add new numbers to the end.
|
||||
@ -109,8 +109,8 @@ class Cachedb {
|
||||
|
||||
Rdb *getRdb() { return &m_rdb; };
|
||||
|
||||
DiskPageCache *getDiskPageCache () { return &m_pc; };
|
||||
DiskPageCache m_pc;
|
||||
//DiskPageCache *getDiskPageCache () { return &m_pc; };
|
||||
//DiskPageCache m_pc;
|
||||
|
||||
private:
|
||||
Rdb m_rdb;
|
||||
|
14
Catdb.cpp
14
Catdb.cpp
@ -39,18 +39,18 @@ bool Catdb::init ( ) {
|
||||
int32_t maxTreeNodes = treeMem / 82;
|
||||
// do not use any page cache if doing tmp cluster in order to
|
||||
// prevent swapping
|
||||
int32_t pcmem = g_conf.m_catdbMaxDiskPageCacheMem;
|
||||
if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
|
||||
// int32_t pcmem = g_conf.m_catdbMaxDiskPageCacheMem;
|
||||
// if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
|
||||
|
||||
pcmem = 0;
|
||||
// pcmem = 0;
|
||||
// each entry in the cache is usually just a single record, no lists,
|
||||
// unless a hostname has multiple sites in it. has 24 bytes more
|
||||
// overhead in cache.
|
||||
//int32_t maxCacheNodes = g_conf.m_tagdbMaxCacheMem / 106;
|
||||
// we now use a page cache
|
||||
if ( ! m_pc.init ("catdb",RDB_CATDB,pcmem,
|
||||
GB_TFNDB_PAGE_SIZE) )
|
||||
return log("db: Catdb init failed.");
|
||||
// if ( ! m_pc.init ("catdb",RDB_CATDB,pcmem,
|
||||
// GB_TFNDB_PAGE_SIZE) )
|
||||
// return log("db: Catdb init failed.");
|
||||
|
||||
// . initialize our own internal rdb
|
||||
// . i no longer use cache so changes to tagdb are instant
|
||||
@ -71,7 +71,7 @@ bool Catdb::init ( ) {
|
||||
0 , //maxCacheNodes ,
|
||||
false , // half keys?
|
||||
false , //m_tagdbSaveCache
|
||||
&m_pc ,
|
||||
NULL, // &m_pc ,
|
||||
false,
|
||||
false,
|
||||
12, // keysize
|
||||
|
6
Catdb.h
6
Catdb.h
@ -21,7 +21,7 @@
|
||||
#include "Rdb.h"
|
||||
#include "Url.h"
|
||||
#include "Loop.h"
|
||||
#include "DiskPageCache.h"
|
||||
//#include "DiskPageCache.h"
|
||||
//#include "CollectionRec.h"
|
||||
|
||||
class Catdb {
|
||||
@ -74,7 +74,7 @@ class Catdb {
|
||||
void getKeyRange ( bool useIp , Url *url ,
|
||||
key_t *startKey , key_t *endKey );
|
||||
|
||||
DiskPageCache *getDiskPageCache() { return &m_pc; };
|
||||
//DiskPageCache *getDiskPageCache() { return &m_pc; };
|
||||
|
||||
// normalize a url, no www.
|
||||
void normalizeUrl ( Url *srcUrl, Url *dstUrl );
|
||||
@ -93,7 +93,7 @@ class Catdb {
|
||||
// and "not-founds" stored remotely (net cache)
|
||||
Rdb m_rdb;
|
||||
|
||||
DiskPageCache m_pc;
|
||||
//DiskPageCache m_pc;
|
||||
|
||||
};
|
||||
|
||||
|
@ -274,20 +274,20 @@ bool Clusterdb::init ( ) {
|
||||
// RdbCache has a 4 byte ptr to each rec in the cache
|
||||
//int32_t maxCacheNodes = maxCacheMem / ( 4 + CLUSTER_REC_SIZE );
|
||||
//int32_t nodeSize = sizeof(key_t) + sizeof(collnum_t);
|
||||
int32_t pageSize = GB_TFNDB_PAGE_SIZE;
|
||||
//int32_t pageSize = GB_TFNDB_PAGE_SIZE;
|
||||
//int32_t nodeSize = (pageSize + 12) + sizeof(collnum_t) + 20;
|
||||
//int32_t maxCacheNodes = maxCacheMem / nodeSize ;
|
||||
// init the page cache
|
||||
if ( ! m_pc.init ( "clusterdb",
|
||||
RDB_CLUSTERDB,
|
||||
pcmem ,
|
||||
pageSize ) )
|
||||
//g_conf.m_clusterdbMaxDiskPageCacheMem,
|
||||
//clusterGetPages,
|
||||
//clusterAddPages,
|
||||
//clusterGetVfd,
|
||||
//clusterRmVfd ))
|
||||
return log("db: Clusterdb init failed.");
|
||||
// if ( ! m_pc.init ( "clusterdb",
|
||||
// RDB_CLUSTERDB,
|
||||
// pcmem ,
|
||||
// pageSize ) )
|
||||
// //g_conf.m_clusterdbMaxDiskPageCacheMem,
|
||||
// //clusterGetPages,
|
||||
// //clusterAddPages,
|
||||
// //clusterGetVfd,
|
||||
// //clusterRmVfd ))
|
||||
// return log("db: Clusterdb init failed.");
|
||||
//bool bias = true;
|
||||
//if ( g_conf.m_fullSplit ) bias = false;
|
||||
bool bias = false;
|
||||
@ -305,7 +305,7 @@ bool Clusterdb::init ( ) {
|
||||
0,//maxCacheNodes ,
|
||||
true , // half keys?
|
||||
g_conf.m_clusterdbSaveCache,
|
||||
&m_pc ,
|
||||
NULL,//&m_pc ,
|
||||
false, // is titledb
|
||||
true , // preload disk page cache
|
||||
12, // key size
|
||||
|
@ -32,7 +32,7 @@
|
||||
#include "Url.h"
|
||||
#include "Conf.h"
|
||||
#include "Titledb.h"
|
||||
#include "DiskPageCache.h"
|
||||
//#include "DiskPageCache.h"
|
||||
|
||||
// these are now just TitleRec keys
|
||||
#define CLUSTER_REC_SIZE (sizeof(key_t))
|
||||
@ -162,14 +162,14 @@ class Clusterdb {
|
||||
//char getGigabitSimilarity ( char *vec0 , char *vec1 ,
|
||||
// int32_t *qtable , int32_t numSlots ) ;
|
||||
|
||||
DiskPageCache *getDiskPageCache() { return &m_pc; };
|
||||
//DiskPageCache *getDiskPageCache() { return &m_pc; };
|
||||
|
||||
private:
|
||||
|
||||
// this rdb holds urls waiting to be spidered or being spidered
|
||||
Rdb m_rdb;
|
||||
|
||||
DiskPageCache m_pc;
|
||||
//DiskPageCache m_pc;
|
||||
};
|
||||
|
||||
extern class Clusterdb g_clusterdb;
|
||||
|
@ -329,6 +329,12 @@ bool Collectiondb::addExistingColl ( char *coll, collnum_t collnum ) {
|
||||
if ( cr->m_isCustomCrawl )
|
||||
cr->m_indexSpiderReplies = true;
|
||||
|
||||
// and don't do link voting, will help speed up
|
||||
if ( cr->m_isCustomCrawl ) {
|
||||
cr->m_getLinkInfo = false;
|
||||
cr->m_computeSiteNumInlinks = false;
|
||||
}
|
||||
|
||||
// we need to compile the regular expressions or update the url
|
||||
// filters with new logic that maps crawlbot parms to url filters
|
||||
return cr->rebuildUrlFilters ( );
|
||||
@ -1694,13 +1700,24 @@ collnum_t Collectiondb::reserveCollNum ( ) {
|
||||
return next;
|
||||
}
|
||||
|
||||
// collnum_t is signed right now because we use -1 to indicate a
|
||||
// bad collnum.
|
||||
int32_t scanned = 0;
|
||||
// search for an empty slot
|
||||
for ( int32_t i = m_wrapped ; i < m_numRecs ; i++ ) {
|
||||
for ( int32_t i = m_wrapped ; ; i++ ) {
|
||||
// because collnum_t is 2 bytes, signed, limit this here
|
||||
if ( i > 0x7fff ) i = 0;
|
||||
// how can this happen?
|
||||
if ( i < 0 ) i = 0;
|
||||
// if we scanned the max # of recs we could have, we are done
|
||||
if ( ++scanned >= m_numRecs ) break;
|
||||
// skip if this is in use
|
||||
if ( m_recs[i] ) continue;
|
||||
// start after this one next time
|
||||
m_wrapped = i+1;
|
||||
// note it
|
||||
log("colldb: returning wrapped collnum of %"INT32"",(int32_t)i);
|
||||
log("colldb: returning wrapped collnum "
|
||||
"of %"INT32"",(int32_t)i);
|
||||
return (collnum_t)i;
|
||||
}
|
||||
|
||||
@ -1841,6 +1858,8 @@ void CollectionRec::reset() {
|
||||
m_hasucr = false;
|
||||
m_hasupr = false;
|
||||
|
||||
m_sendingAlertInProgress = false;
|
||||
|
||||
// make sure we do not leave spiders "hanging" waiting for their
|
||||
// callback to be called... and it never gets called
|
||||
//if ( m_callbackQueue.length() > 0 ) { char *xx=NULL;*xx=0; }
|
||||
@ -2313,6 +2332,17 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_spiderFreqs [n] = .00347; // 5 mins
|
||||
n++;
|
||||
|
||||
// a non temporary error, like a 404? retry once per 3 months i guess
|
||||
m_regExs[n].set("errorcount>=1");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 90; // 90 day retry
|
||||
m_maxSpidersPerRule [n] = 1; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 2;
|
||||
m_forceDelete [n] = 1;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("isaddurl");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
|
@ -525,6 +525,7 @@ class CollectionRec {
|
||||
char m_enforceNewQuotas ;
|
||||
char m_doIpLookups ; // considered iff using proxy
|
||||
char m_useRobotsTxt ;
|
||||
char m_obeyRelNoFollowLinks ;
|
||||
char m_forceUseFloaters ;
|
||||
char m_automaticallyUseProxies ;
|
||||
char m_automaticallyBackOff ;
|
||||
@ -626,6 +627,7 @@ class CollectionRec {
|
||||
int32_t m_adWidth; // how wide the ad Column is in pixels
|
||||
|
||||
char m_dedupResultsByDefault ;
|
||||
char m_doTagdbLookups ;
|
||||
char m_clusterByTopicDefault ;
|
||||
char m_restrictTitledbForQuery ; // move this down here
|
||||
char m_useOldIps ;
|
||||
@ -766,7 +768,7 @@ class CollectionRec {
|
||||
|
||||
// last time we computed global crawl info
|
||||
//time_t m_globalCrawlInfoUpdateTime;
|
||||
EmailInfo m_emailInfo;
|
||||
//EmailInfo m_emailInfo;
|
||||
// for counting replies
|
||||
//int32_t m_replies;
|
||||
//int32_t m_requests;
|
||||
@ -974,6 +976,8 @@ class CollectionRec {
|
||||
// NARROW SEARCH
|
||||
char m_doNarrowSearch;
|
||||
|
||||
char m_sendingAlertInProgress;
|
||||
|
||||
// Allow Links: searches on the collection
|
||||
//char m_allowLinksSearch;
|
||||
// . reference pages parameters
|
||||
|
2
Conf.cpp
2
Conf.cpp
@ -369,7 +369,7 @@ bool Conf::init ( char *dir ) { // , int32_t hostId ) {
|
||||
g_conf.m_forceIt = false;
|
||||
|
||||
// always turn on threads if live
|
||||
if ( g_conf.m_isLive ) g_conf.m_useThreads = true;
|
||||
//if ( g_conf.m_isLive ) g_conf.m_useThreads = true;
|
||||
// disable this at startup always... no since might have crashed
|
||||
// in the middle of a test. and we just turn on spiders again when
|
||||
// already in test mode otherwise hostid #0 will erase all the files.
|
||||
|
54
Conf.h
54
Conf.h
@ -175,7 +175,7 @@ class Conf {
|
||||
|
||||
// tagdb parameters
|
||||
int32_t m_tagdbMaxTreeMem;
|
||||
int32_t m_tagdbMaxDiskPageCacheMem;
|
||||
//int32_t m_tagdbMaxDiskPageCacheMem;
|
||||
//int32_t m_tagdbMaxCacheMem;
|
||||
//bool m_tagdbUseSeals;
|
||||
//int32_t m_tagdbMinFilesToMerge;
|
||||
@ -183,7 +183,7 @@ class Conf {
|
||||
|
||||
// catdb parameters
|
||||
int32_t m_catdbMaxTreeMem;
|
||||
int32_t m_catdbMaxDiskPageCacheMem;
|
||||
//int32_t m_catdbMaxDiskPageCacheMem;
|
||||
int32_t m_catdbMaxCacheMem;
|
||||
//int32_t m_catdbMinFilesToMerge;
|
||||
|
||||
@ -216,7 +216,7 @@ class Conf {
|
||||
// linkdb for storing linking relations
|
||||
int32_t m_linkdbMaxTreeMem;
|
||||
// int32_t m_linkdbMaxCacheMem;
|
||||
int32_t m_linkdbMaxDiskPageCacheMem;
|
||||
//int32_t m_linkdbMaxDiskPageCacheMem;
|
||||
int32_t m_linkdbMinFilesToMerge;
|
||||
// bool m_linkdbSaveCache;
|
||||
|
||||
@ -234,7 +234,7 @@ class Conf {
|
||||
|
||||
// for holding urls that have been entered into the spider queue
|
||||
//int32_t m_tfndbMaxTreeMem ;
|
||||
int32_t m_tfndbMaxDiskPageCacheMem ; // for the DiskPageCache class only
|
||||
//int32_t m_tfndbMaxDiskPageCacheMem ; // for the DiskPageCache class only
|
||||
//int32_t m_tfndbMinFilesToMerge;
|
||||
//bool m_tfndbSaveCache;
|
||||
//int64_t m_tfndbMaxUrls;
|
||||
@ -253,21 +253,23 @@ class Conf {
|
||||
//int32_t m_spiderdbMaxDiskPageCacheMem ;
|
||||
//int32_t m_spiderdbMinFilesToMerge;
|
||||
int32_t m_spiderMaxDiskThreads ;
|
||||
int32_t m_spiderMaxBigDiskThreads ; // > 1M read
|
||||
int32_t m_spiderMaxMedDiskThreads ; // 100k - 1M read
|
||||
int32_t m_spiderMaxSmaDiskThreads ; // < 100k read
|
||||
int32_t m_queryMaxDiskThreads ;
|
||||
int32_t m_queryMaxBigDiskThreads ; // > 1M read
|
||||
int32_t m_queryMaxMedDiskThreads ; // 100k - 1M read
|
||||
int32_t m_queryMaxSmaDiskThreads ; // < 100k per read
|
||||
//int32_t m_spiderMaxBigDiskThreads ; // > 1M read
|
||||
//int32_t m_spiderMaxMedDiskThreads ; // 100k - 1M read
|
||||
//int32_t m_spiderMaxSmaDiskThreads ; // < 100k read
|
||||
//int32_t m_queryMaxDiskThreads ;
|
||||
//int32_t m_queryMaxBigDiskThreads ; // > 1M read
|
||||
//int32_t m_queryMaxMedDiskThreads ; // 100k - 1M read
|
||||
//int32_t m_queryMaxSmaDiskThreads ; // < 100k per read
|
||||
// categorize the disk read sizes by these here
|
||||
int32_t m_bigReadSize;
|
||||
int32_t m_medReadSize;
|
||||
int32_t m_smaReadSize;
|
||||
//int32_t m_bigReadSize;
|
||||
//int32_t m_medReadSize;
|
||||
//int32_t m_smaReadSize;
|
||||
|
||||
char m_separateDiskReads;
|
||||
|
||||
int32_t m_statsdbMaxTreeMem;
|
||||
int32_t m_statsdbMaxCacheMem;
|
||||
int32_t m_statsdbMaxDiskPageCacheMem;
|
||||
//int32_t m_statsdbMaxDiskPageCacheMem;
|
||||
//int32_t m_statsdbMinFilesToMerge;
|
||||
bool m_useStatsdb;
|
||||
//bool m_statsdbSnapshots;
|
||||
@ -290,6 +292,7 @@ class Conf {
|
||||
//bool m_refreshFacebookUsersEnabled;
|
||||
bool m_injectionsEnabled ;
|
||||
bool m_queryingEnabled ;
|
||||
bool m_returnResultsAnyway;
|
||||
// qa testing loop going on? uses "test" subdir
|
||||
bool m_testParserEnabled ;
|
||||
bool m_testSpiderEnabled ;
|
||||
@ -331,7 +334,7 @@ class Conf {
|
||||
// indexdb has a max cached age for getting IndexLists (10 mins deflt)
|
||||
int32_t m_indexdbMaxTreeMem ;
|
||||
int32_t m_indexdbMaxCacheMem;
|
||||
int32_t m_indexdbMaxDiskPageCacheMem; // for DiskPageCache class only
|
||||
//int32_t m_indexdbMaxDiskPageCacheMem; // for DiskPageCache class only
|
||||
int32_t m_indexdbMaxIndexListAge;
|
||||
int32_t m_indexdbTruncationLimit;
|
||||
int32_t m_indexdbMinFilesToMerge;
|
||||
@ -339,7 +342,7 @@ class Conf {
|
||||
|
||||
int32_t m_datedbMaxTreeMem ;
|
||||
int32_t m_datedbMaxCacheMem;
|
||||
int32_t m_datedbMaxDiskPageCacheMem; // for DiskPageCache class only
|
||||
//int32_t m_datedbMaxDiskPageCacheMem; // for DiskPageCache class only
|
||||
int32_t m_datedbMaxIndexListAge;
|
||||
int32_t m_datedbTruncationLimit;
|
||||
int32_t m_datedbMinFilesToMerge;
|
||||
@ -568,17 +571,11 @@ class Conf {
|
||||
bool m_useSHM;
|
||||
bool m_useQuickpoll;
|
||||
|
||||
bool m_useDiskPageCacheIndexdb;
|
||||
bool m_useDiskPageCachePosdb;
|
||||
bool m_useDiskPageCacheDatedb;
|
||||
bool m_useDiskPageCacheTitledb;
|
||||
bool m_useDiskPageCacheSpiderdb;
|
||||
bool m_useDiskPageCacheTfndb;
|
||||
bool m_useDiskPageCacheTagdb;
|
||||
bool m_useDiskPageCacheChecksumdb;
|
||||
bool m_useDiskPageCacheClusterdb;
|
||||
bool m_useDiskPageCacheCatdb;
|
||||
bool m_useDiskPageCacheLinkdb;
|
||||
int64_t m_posdbFileCacheSize;
|
||||
int64_t m_tagdbFileCacheSize;
|
||||
int64_t m_clusterdbFileCacheSize;
|
||||
int64_t m_titledbFileCacheSize;
|
||||
int64_t m_spiderdbFileCacheSize;
|
||||
|
||||
//bool m_quickpollCoreOnError;
|
||||
bool m_useShotgun;
|
||||
@ -685,6 +682,7 @@ class Conf {
|
||||
bool m_diffbotMsg13Hack ;
|
||||
bool m_logDebugUrlAttempts ;
|
||||
bool m_logDebugTcp ;
|
||||
bool m_logDebugTcpBuf ;
|
||||
bool m_logDebugThread ;
|
||||
bool m_logDebugTimedb ;
|
||||
bool m_logDebugTitle ;
|
||||
|
@ -47,14 +47,14 @@ bool Datedb::init ( ) {
|
||||
// old rec cache. i am trying to do away with the Rdb::m_cache rec
|
||||
// cache in favor of cleverly used disk page caches, because
|
||||
// the rec caches are not real-time and get stale.
|
||||
int32_t pcmem = g_conf.m_datedbMaxDiskPageCacheMem;
|
||||
//int32_t pcmem = g_conf.m_datedbMaxDiskPageCacheMem;
|
||||
// make sure at least 30MB
|
||||
//if ( pcmem < 30000000 ) pcmem = 30000000;
|
||||
// keep this low if we are the tmp cluster, 20MB
|
||||
if ( g_hostdb.m_useTmpCluster && pcmem > 20000000 ) pcmem = 20000000;
|
||||
//if ( g_hostdb.m_useTmpCluster && pcmem > 20000000 ) pcmem = 20000000;
|
||||
// do not use any page cache if doing tmp cluster in order to
|
||||
// prevent swapping
|
||||
if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
|
||||
//if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
|
||||
// . init the page cache
|
||||
// . MDW: "minimize disk seeks" not working otherwise i'd enable it!
|
||||
// if ( ! m_pc.init ( "datedb",
|
||||
|
@ -195,6 +195,7 @@ case EADMININTERFERENCE: return "Adminstrative interference";
|
||||
case EDNSERROR : return "DNS lookup error";
|
||||
case ETHREADSDISABLED:return "Threads Disabled";
|
||||
case EMALFORMEDQUERY: return "Malformed query";
|
||||
case ESHARDDOWN: return "One or more shards are down";
|
||||
}
|
||||
// if the remote error bit is clear it must be a regular errno
|
||||
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );
|
||||
|
3
Errno.h
3
Errno.h
@ -199,6 +199,7 @@ enum {
|
||||
EADMININTERFERENCE,
|
||||
EDNSERROR ,
|
||||
ETHREADSDISABLED,
|
||||
EMALFORMEDQUERY
|
||||
EMALFORMEDQUERY,
|
||||
ESHARDDOWN
|
||||
};
|
||||
#endif
|
||||
|
@ -455,6 +455,11 @@ bool HashTableX::load ( char *dir, char *filename, char **tbuf, int32_t *tsize )
|
||||
if ( ! f.read ( &ds , 4 , off ) ) return false;
|
||||
off += 4;
|
||||
|
||||
if ( numSlots < 0 || numSlotsUsed < 0 ) {
|
||||
log("htable: bogus saved hashtable file %s%s.",dir,filename);
|
||||
return false;
|
||||
}
|
||||
|
||||
// bogus key size?
|
||||
if ( ks <= 0 ) {
|
||||
// is very common for this file so skip it
|
||||
|
22
Indexdb.cpp
22
Indexdb.cpp
@ -92,28 +92,28 @@ bool Indexdb::init ( ) {
|
||||
// enough nodes!!
|
||||
int32_t maxCacheNodes = g_conf.m_indexdbMaxCacheMem / 600;
|
||||
|
||||
int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
|
||||
//int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
|
||||
// we now use a disk page cache as opposed to the
|
||||
// old rec cache. i am trying to do away with the Rdb::m_cache rec
|
||||
// cache in favor of cleverly used disk page caches, because
|
||||
// the rec caches are not real-time and get stale.
|
||||
int32_t pcmem = g_conf.m_indexdbMaxDiskPageCacheMem;
|
||||
//int32_t pcmem = g_conf.m_indexdbMaxDiskPageCacheMem;
|
||||
|
||||
pcmem = 0;
|
||||
//pcmem = 0;
|
||||
// make sure at least 30MB
|
||||
//if ( pcmem < 30000000 ) pcmem = 30000000;
|
||||
// keep this low if we are the tmp cluster, 30MB
|
||||
if ( g_hostdb.m_useTmpCluster && pcmem > 30000000 ) pcmem = 30000000;
|
||||
//if ( g_hostdb.m_useTmpCluster && pcmem > 30000000 ) pcmem = 30000000;
|
||||
// do not use any page cache if doing tmp cluster in order to
|
||||
// prevent swapping
|
||||
if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
|
||||
//if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
|
||||
// . init the page cache
|
||||
// . MDW: "minimize disk seeks" not working otherwise i'd enable it!
|
||||
if ( ! m_pc.init ( "indexdb",
|
||||
RDB_INDEXDB,
|
||||
pcmem ,
|
||||
pageSize ))
|
||||
return log("db: Indexdb init failed.");
|
||||
// if ( ! m_pc.init ( "indexdb",
|
||||
// RDB_INDEXDB,
|
||||
// pcmem ,
|
||||
// pageSize ))
|
||||
// return log("db: Indexdb init failed.");
|
||||
|
||||
// . set our own internal rdb
|
||||
// . max disk space for bin tree is same as maxTreeMem so that we
|
||||
@ -133,7 +133,7 @@ bool Indexdb::init ( ) {
|
||||
maxCacheNodes ,
|
||||
true , // use half keys?
|
||||
false , // g_conf.m_indexdbSav
|
||||
&m_pc ) )
|
||||
NULL))//&m_pc ) )
|
||||
return false;
|
||||
return true;
|
||||
// validate indexdb
|
||||
|
@ -14,7 +14,7 @@
|
||||
|
||||
#include "Rdb.h"
|
||||
#include "Conf.h"
|
||||
#include "DiskPageCache.h"
|
||||
//#include "DiskPageCache.h"
|
||||
|
||||
// we define these here, NUMDOCIDBITS is in ../titledb/Titledb.h
|
||||
#define NUMTERMIDBITS 48
|
||||
@ -173,9 +173,9 @@ class Indexdb {
|
||||
|
||||
Rdb m_rdb;
|
||||
|
||||
DiskPageCache *getDiskPageCache ( ) { return &m_pc; };
|
||||
//DiskPageCache *getDiskPageCache ( ) { return &m_pc; };
|
||||
|
||||
DiskPageCache m_pc;
|
||||
//DiskPageCache m_pc;
|
||||
|
||||
//#ifdef SPLIT_INDEXDB
|
||||
// . groupId Table, for getting the correct group id based
|
||||
|
25
Linkdb.cpp
25
Linkdb.cpp
@ -5,6 +5,7 @@
|
||||
#include "sort.h"
|
||||
#include "XmlDoc.h" // score32to8()
|
||||
#include "Rebalance.h"
|
||||
#include "Process.h"
|
||||
|
||||
Linkdb g_linkdb;
|
||||
Linkdb g_linkdb2;
|
||||
@ -101,7 +102,7 @@ bool Linkdb::init ( ) {
|
||||
*/
|
||||
|
||||
// we use the same disk page size as indexdb (for rdbmap.cpp)
|
||||
int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
|
||||
//int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
|
||||
// set this for debugging
|
||||
//int64_t maxTreeMem = 1000000;
|
||||
int64_t maxTreeMem = 40000000; // 40MB
|
||||
@ -110,18 +111,18 @@ bool Linkdb::init ( ) {
|
||||
// . 32 bytes per record when in the tree
|
||||
int32_t maxTreeNodes = maxTreeMem /(sizeof(key224_t)+16);
|
||||
// disk page cache mem, 100MB on gk0 now
|
||||
int32_t pcmem = 0; // g_conf.m_linkdbMaxDiskPageCacheMem;
|
||||
//int32_t pcmem = 0; // g_conf.m_linkdbMaxDiskPageCacheMem;
|
||||
// give it a little
|
||||
pcmem = 10000000; // 10MB
|
||||
//pcmem = 10000000; // 10MB
|
||||
// keep this low if we are the tmp cluster
|
||||
//if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
|
||||
// TODO: would be nice to just do page caching on the satellite files;
|
||||
// look into "minimizeDiskSeeks" at some point...
|
||||
if ( ! m_pc.init ( "linkdb" ,
|
||||
RDB_LINKDB,
|
||||
pcmem ,
|
||||
pageSize ))
|
||||
return log("db: Linkdb init failed.");
|
||||
// if ( ! m_pc.init ( "linkdb" ,
|
||||
// RDB_LINKDB,
|
||||
// pcmem ,
|
||||
// pageSize ))
|
||||
// return log("db: Linkdb init failed.");
|
||||
// init the rdb
|
||||
return m_rdb.init ( g_hostdb.m_dir ,
|
||||
"linkdb" ,
|
||||
@ -141,7 +142,7 @@ bool Linkdb::init ( ) {
|
||||
0 , // cache nodes
|
||||
false, // true , // use half keys
|
||||
false , // load cache from disk
|
||||
&m_pc ,
|
||||
NULL,//&m_pc ,
|
||||
false , // false
|
||||
false , // preload page cache
|
||||
sizeof(key224_t) ,
|
||||
@ -1130,6 +1131,12 @@ bool Msg25::doReadLoop ( ) {
|
||||
ms,m_site,m_url,m_docId,KEYSTR(&startKey,LDBKS));
|
||||
}
|
||||
|
||||
if ( g_process.m_mode == EXIT_MODE ) {
|
||||
log("linkdb: shutting down. exiting link text loop.");
|
||||
g_errno = ESHUTTINGDOWN;
|
||||
return false;
|
||||
}
|
||||
|
||||
m_gettingList = true;
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
|
||||
|
6
Linkdb.h
6
Linkdb.h
@ -32,7 +32,7 @@
|
||||
|
||||
#include "Conf.h"
|
||||
#include "Rdb.h"
|
||||
#include "DiskPageCache.h"
|
||||
//#include "DiskPageCache.h"
|
||||
#include "Titledb.h"
|
||||
|
||||
void handleRequest25 ( UdpSlot *slot , int32_t netnice ) ;
|
||||
@ -358,8 +358,8 @@ class Linkdb {
|
||||
|
||||
Rdb *getRdb() { return &m_rdb; };
|
||||
|
||||
DiskPageCache *getDiskPageCache () { return &m_pc; };
|
||||
DiskPageCache m_pc;
|
||||
//DiskPageCache *getDiskPageCache () { return &m_pc; };
|
||||
//DiskPageCache m_pc;
|
||||
|
||||
private:
|
||||
Rdb m_rdb;
|
||||
|
14
Loop.cpp
14
Loop.cpp
@ -1386,9 +1386,9 @@ bool Loop::runLoop ( ) {
|
||||
if ( m_shutdown == 2 ) {
|
||||
//log(0,"Thread is saving & shutting down urgently.");
|
||||
//while ( 1 == 1 ) sleep (50000);
|
||||
log("loop: Resuming despite thread crash.");
|
||||
m_shutdown = 0;
|
||||
goto BIGLOOP;
|
||||
//log("loop: Resuming despite thread crash.");
|
||||
//m_shutdown = 0;
|
||||
//goto BIGLOOP;
|
||||
}
|
||||
// otherwise, thread did not save, so we must do it
|
||||
log ( LOG_INIT ,"loop: Saving and shutting down urgently.");
|
||||
@ -2017,12 +2017,12 @@ void Loop::doPoll ( ) {
|
||||
// if shutting down was it a sigterm ?
|
||||
if ( m_shutdown ) goto again;
|
||||
// handle returned threads for niceness 0
|
||||
if ( g_threads.m_needsCleanup )
|
||||
g_threads.timedCleanUp(-3,0); // 3 ms
|
||||
//if ( g_threads.m_needsCleanup )
|
||||
g_threads.timedCleanUp(-3,0); // 3 ms
|
||||
if ( m_inQuickPoll ) goto again;
|
||||
// high niceness threads
|
||||
if ( g_threads.m_needsCleanup )
|
||||
g_threads.timedCleanUp(-4,MAX_NICENESS); //3 ms
|
||||
//if ( g_threads.m_needsCleanup )
|
||||
g_threads.timedCleanUp(-4,MAX_NICENESS); //3 ms
|
||||
|
||||
goto again;
|
||||
}
|
||||
|
6
Makefile
6
Makefile
@ -41,7 +41,7 @@ OBJS = UdpSlot.o Rebalance.o \
|
||||
Highlight.o File.o Errno.o Entities.o \
|
||||
Dns.o Dir.o Conf.o Bits.o \
|
||||
Stats.o BigFile.o Msg17.o \
|
||||
Speller.o DiskPageCache.o \
|
||||
Speller.o \
|
||||
PingServer.o StopWords.o TopTree.o \
|
||||
Parms.o Pages.o \
|
||||
Unicode.o iana_charset.o Iso8859.o \
|
||||
@ -532,6 +532,10 @@ Timedb.o:
|
||||
HashTableX.o:
|
||||
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
|
||||
|
||||
# getUrlFilterNum2()
|
||||
Spider.o:
|
||||
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
|
||||
|
||||
SpiderCache.o:
|
||||
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
|
||||
|
||||
|
10
Mem.cpp
10
Mem.cpp
@ -21,7 +21,7 @@
|
||||
|
||||
// uncomment this for EFENCE to do underflow checks instead of the
|
||||
// default overflow checks
|
||||
//#define _CHECKUNDERFLOW_
|
||||
//#define CHECKUNDERFLOW
|
||||
|
||||
// only Mem.cpp can call ::malloc, everyone else must call mmalloc() so
|
||||
// we can keep tabs on memory usage. in Mem.h we #define this to be coreme()
|
||||
@ -2168,7 +2168,7 @@ void *getElecMem ( int32_t size ) {
|
||||
// a page above OR a page below
|
||||
// let's go below this time since that seems to be the problem
|
||||
|
||||
#ifdef _CHECKUNDERFLOW_
|
||||
#ifdef CHECKUNDERFLOW
|
||||
// how much to alloc
|
||||
// . assume sysmalloc returns one byte above a page, so we need
|
||||
// MEMPAGESIZE-1 bytes to move p up to page boundary, another
|
||||
@ -2189,7 +2189,7 @@ void *getElecMem ( int32_t size ) {
|
||||
// parser
|
||||
char *p = realMem;
|
||||
// align p DOWN to nearest 8k boundary
|
||||
int32_t remainder = (uint32_t)realMem % MEMPAGESIZE;
|
||||
int32_t remainder = (uint64_t)realMem % MEMPAGESIZE;
|
||||
// complement
|
||||
remainder = MEMPAGESIZE - remainder;
|
||||
// and add to ptr to be aligned on 8k boundary
|
||||
@ -2211,7 +2211,7 @@ void *getElecMem ( int32_t size ) {
|
||||
p += size;
|
||||
// now when we free this it should all be protected, so make sure
|
||||
// we have enough room on top
|
||||
int32_t leftover = MEMPAGESIZE - ((uint32_t)p % MEMPAGESIZE);
|
||||
int32_t leftover = MEMPAGESIZE - ((uint64_t)p % MEMPAGESIZE);
|
||||
// skip that
|
||||
p += leftover;
|
||||
// inefficient?
|
||||
@ -2302,7 +2302,7 @@ void freeElecMem ( void *fakeMem ) {
|
||||
char *label = &s_labels[((uint32_t)h)*16];
|
||||
int32_t fakeSize = s_sizes[h];
|
||||
|
||||
#ifdef _CHECKUNDERFLOW_
|
||||
#ifdef CHECKUNDERFLOW
|
||||
char *oldProtMem = cp - MEMPAGESIZE;
|
||||
#else
|
||||
char *oldProtMem = cp + fakeSize;
|
||||
|
@ -9,7 +9,7 @@ void Monitordb::reset() {
|
||||
|
||||
bool Monitordb::init ( ) {
|
||||
// we use the same disk page size as indexdb (for rdbmap.cpp)
|
||||
int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
|
||||
//int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
|
||||
// set this for debugging
|
||||
//int64_t maxTreeMem = 1000000;
|
||||
int64_t maxTreeMem = 10000000; // 10MB
|
||||
@ -18,16 +18,16 @@ bool Monitordb::init ( ) {
|
||||
// . 32 bytes per record when in the tree
|
||||
int32_t maxTreeNodes = maxTreeMem /(sizeof(key96_t)+16);
|
||||
// disk page cache mem, 100MB on gk0 now
|
||||
int32_t pcmem = 0; // g_conf.m_monitordbMaxDiskPageCacheMem;
|
||||
//int32_t pcmem = 0; // g_conf.m_monitordbMaxDiskPageCacheMem;
|
||||
// keep this low if we are the tmp cluster
|
||||
//if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
|
||||
// TODO: would be nice to just do page caching on the satellite files;
|
||||
// look into "minimizeDiskSeeks" at some point...
|
||||
if ( ! m_pc.init ( "monitordb" ,
|
||||
RDB_MONITORDB,
|
||||
pcmem ,
|
||||
pageSize ))
|
||||
return log("db: Monitordb init failed.");
|
||||
// if ( ! m_pc.init ( "monitordb" ,
|
||||
// RDB_MONITORDB,
|
||||
// pcmem ,
|
||||
// pageSize ))
|
||||
// return log("db: Monitordb init failed.");
|
||||
// init the rdb
|
||||
return m_rdb.init ( g_hostdb.m_dir ,
|
||||
"monitordb" ,
|
||||
@ -45,7 +45,7 @@ bool Monitordb::init ( ) {
|
||||
0 , // cache nodes
|
||||
false, // true , // use half keys
|
||||
false , // load cache from disk
|
||||
&m_pc ,
|
||||
NULL,//&m_pc ,
|
||||
false , // false
|
||||
false , // preload page cache
|
||||
sizeof(key96_t) ,
|
||||
|
@ -15,7 +15,7 @@
|
||||
#define MONITORDBKS sizeof(key96_t)
|
||||
|
||||
#include "Rdb.h"
|
||||
#include "DiskPageCache.h"
|
||||
//#include "DiskPageCache.h"
|
||||
|
||||
class Monitordb {
|
||||
public:
|
||||
@ -29,8 +29,8 @@ class Monitordb {
|
||||
|
||||
Rdb *getRdb() { return &m_rdb; };
|
||||
|
||||
DiskPageCache *getDiskPageMonitor () { return &m_pc; };
|
||||
DiskPageCache m_pc;
|
||||
//DiskPageCache *getDiskPageMonitor () { return &m_pc; };
|
||||
//DiskPageCache m_pc;
|
||||
|
||||
private:
|
||||
Rdb m_rdb;
|
||||
|
17
Msg13.cpp
17
Msg13.cpp
@ -1222,13 +1222,16 @@ bool ipWasBanned ( TcpSocket *ts , const char **msg , Msg13Request *r ) {
|
||||
|
||||
// if it is a seed url and there are no links, then perhaps we
|
||||
// are in a blacklist somewhere already from triggering a spider trap
|
||||
if ( //isInSeedBuf ( cr , r->ptr_url ) &&
|
||||
// this is set in XmlDoc.cpp based on hopcount really
|
||||
r->m_isRootSeedUrl &&
|
||||
! strstr ( ts->m_readBuf, "<a href" ) ) {
|
||||
*msg = "root/seed url with no outlinks";
|
||||
return true;
|
||||
}
|
||||
// i've seen this flub on a site where they just return a script
|
||||
// and it is not banned, so let's remove this until we think
|
||||
// of something better.
|
||||
// if ( //isInSeedBuf ( cr , r->ptr_url ) &&
|
||||
// // this is set in XmlDoc.cpp based on hopcount really
|
||||
// r->m_isRootSeedUrl &&
|
||||
// ! strstr ( ts->m_readBuf, "<a href" ) ) {
|
||||
// *msg = "root/seed url with no outlinks";
|
||||
// return true;
|
||||
// }
|
||||
|
||||
|
||||
// TODO: compare a simple checksum of the page content to what
|
||||
|
13
Msg20.cpp
13
Msg20.cpp
@ -157,6 +157,12 @@ bool Msg20::getSummary ( Msg20Request *req ) {
|
||||
// do not re-route to twins if accessing an external network
|
||||
if ( hostdb != &g_hostdb ) req->m_expected = false;
|
||||
|
||||
if ( req->m_docId < 0 && ! req->ptr_ubuf ) {
|
||||
log("msg20: docid<0 and no url for msg20::getsummary");
|
||||
g_errno = EBADREQUEST;
|
||||
return true;
|
||||
}
|
||||
|
||||
// get groupId from docId, if positive
|
||||
uint32_t shardNum;
|
||||
if ( req->m_docId >= 0 )
|
||||
@ -398,8 +404,11 @@ void handleRequest20 ( UdpSlot *slot , int32_t netnice ) {
|
||||
|
||||
// sanity check, the size include the \0
|
||||
if ( req->m_collnum < 0 ) {
|
||||
log("query: Got empty collection in msg20 handler. FIX!");
|
||||
char *xx =NULL; *xx = 0;
|
||||
log("query: Got empty collection in msg20 handler. FIX! "
|
||||
"from ip=%s port=%i",iptoa(slot->m_ip),(int)slot->m_port);
|
||||
g_udpServer.sendErrorReply ( slot , ENOTFOUND );
|
||||
return;
|
||||
//char *xx =NULL; *xx = 0;
|
||||
}
|
||||
// if it's not stored locally that's an error
|
||||
if ( req->m_docId >= 0 && ! g_titledb.isLocal ( req->m_docId ) ) {
|
||||
|
7
Msg20.h
7
Msg20.h
@ -354,9 +354,9 @@ public:
|
||||
int32_t m_pageInlinksLastUpdated;
|
||||
|
||||
int32_t m_siteNumInlinks ; // GOOD inlinks!
|
||||
int32_t m_siteNumInlinksTotal ; // TOTAL inlinks
|
||||
int32_t m_siteNumUniqueIps ;
|
||||
int32_t m_siteNumUniqueCBlocks;
|
||||
//int32_t m_siteNumInlinksTotal ; // TOTAL inlinks
|
||||
//int32_t m_siteNumUniqueIps ;
|
||||
//int32_t m_siteNumUniqueCBlocks;
|
||||
|
||||
int32_t m_numOutlinks ; // replaced m_linkCount
|
||||
int32_t m_tmp ; // used by Msg40.cpp for event merge
|
||||
@ -406,6 +406,7 @@ public:
|
||||
int32_t m_timeLinkSpam ; // set for m_getLinkText
|
||||
void *m_parentOwner;
|
||||
char m_constructorId;
|
||||
|
||||
char m_inlinkWeight ; // set for m_getLinkText
|
||||
char m_isLinkSpam ; // set for m_getLinkText
|
||||
char m_isAnomaly ; // set for m_getLinkText
|
||||
|
@ -937,8 +937,10 @@ void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) {
|
||||
else if ( r->m_url[0] ) {
|
||||
// get it
|
||||
int64_t uh48 = g_titledb.getUrlHash48(k);
|
||||
// sanity check
|
||||
if ( st->m_uh48 == 0 ) { char *xx=NULL;*xx=0; }
|
||||
// sanity check. MDW: looks like we allow 0 to
|
||||
// be a valid hash. so let this through. i've seen
|
||||
// it core here before.
|
||||
//if ( st->m_uh48 == 0 ) { char *xx=NULL;*xx=0; }
|
||||
// make sure our available docids are available!
|
||||
if ( dd == ad1 ) ad1++;
|
||||
if ( dd == ad2 ) ad2++;
|
||||
|
185
Msg3.cpp
185
Msg3.cpp
@ -39,6 +39,86 @@ void Msg3::reset() {
|
||||
m_alloc = NULL;
|
||||
}
|
||||
|
||||
// . build the 192-bit key used to look up a disk read in the page cache
// . the three 64-bit words of the key identify the read:
//   n2 = virtual file descriptor, n1 = size of the read, n0 = file offset
key192_t makeCacheKey ( int64_t vfd , int64_t offset , int64_t readSize ) {
	key192_t cacheKey;
	// low word first, high word last
	cacheKey.n0 = offset;
	cacheKey.n1 = readSize;
	cacheKey.n2 = vfd;
	return cacheKey;
}
|
||||
|
||||
RdbCache g_rdbCaches[5];
|
||||
|
||||
class RdbCache *getDiskPageCache ( char rdbId ) {
|
||||
|
||||
RdbCache *rpc = NULL;
|
||||
int64_t *maxSizePtr = NULL;
|
||||
int64_t maxMem;
|
||||
int64_t maxRecs;
|
||||
char *dbname;
|
||||
if ( rdbId == RDB_POSDB ) {
|
||||
rpc = &g_rdbCaches[0];
|
||||
maxSizePtr = &g_conf.m_posdbFileCacheSize;
|
||||
maxMem = *maxSizePtr;
|
||||
maxRecs = maxMem / 5000;
|
||||
dbname = "posdbcache";
|
||||
}
|
||||
if ( rdbId == RDB_TAGDB ) {
|
||||
rpc = &g_rdbCaches[1];
|
||||
maxSizePtr = &g_conf.m_tagdbFileCacheSize;
|
||||
maxMem = *maxSizePtr;
|
||||
maxRecs = maxMem / 200;
|
||||
dbname = "tagdbcache";
|
||||
}
|
||||
if ( rdbId == RDB_CLUSTERDB ) {
|
||||
rpc = &g_rdbCaches[2];
|
||||
maxSizePtr = &g_conf.m_clusterdbFileCacheSize;
|
||||
maxMem = *maxSizePtr;
|
||||
maxRecs = maxMem / 32;
|
||||
dbname = "clustcache";
|
||||
}
|
||||
if ( rdbId == RDB_TITLEDB ) {
|
||||
rpc = &g_rdbCaches[3];
|
||||
maxSizePtr = &g_conf.m_titledbFileCacheSize;
|
||||
maxMem = *maxSizePtr;
|
||||
maxRecs = maxMem / 3000;
|
||||
dbname = "titdbcache";
|
||||
}
|
||||
if ( rdbId == RDB_SPIDERDB ) {
|
||||
rpc = &g_rdbCaches[4];
|
||||
maxSizePtr = &g_conf.m_spiderdbFileCacheSize;
|
||||
maxMem = *maxSizePtr;
|
||||
maxRecs = maxMem / 3000;
|
||||
dbname = "spdbcache";
|
||||
}
|
||||
|
||||
if ( ! rpc )
|
||||
return NULL;
|
||||
|
||||
if ( maxMem < 0 ) maxMem = 0;
|
||||
|
||||
// did size change? if not, return it
|
||||
if ( rpc->m_maxMem == maxMem )
|
||||
return rpc;
|
||||
|
||||
// re-init or init for the first time here
|
||||
if ( ! rpc->init ( maxMem ,
|
||||
-1 , // fixedDataSize. -1 since we are lists
|
||||
false , // support lists?
|
||||
maxRecs ,
|
||||
false , // use half keys?
|
||||
dbname ,
|
||||
false , // loadfromdisk
|
||||
sizeof(key192_t), // cache key size
|
||||
0 , // data key size
|
||||
-1 ) ) // numptrsmax
|
||||
return NULL;
|
||||
|
||||
return rpc;
|
||||
}
|
||||
|
||||
// . return false if blocked, true otherwise
|
||||
// . set g_errno on error
|
||||
// . read list of keys in [startKey,endKey] range
|
||||
@ -81,6 +161,10 @@ bool Msg3::readList ( char rdbId ,
|
||||
bool justGetEndKey ,
|
||||
bool allowPageCache ,
|
||||
bool hitDisk ) {
|
||||
|
||||
// set this to true to validate
|
||||
m_validateCache = false;//true;
|
||||
|
||||
// clear, this MUST be done so if we return true g_errno is correct
|
||||
g_errno = 0;
|
||||
// assume lists are not checked for corruption
|
||||
@ -599,6 +683,48 @@ bool Msg3::readList ( char rdbId ,
|
||||
break;
|
||||
}
|
||||
|
||||
////////
|
||||
//
|
||||
// try to get from PAGE CACHE
|
||||
//
|
||||
////////
|
||||
BigFile *ff = base->getFile(m_fileNums[i]);
|
||||
RdbCache *rpc = getDiskPageCache ( m_rdbId );
|
||||
if ( ! m_allowPageCache ) rpc = NULL;
|
||||
// . vfd is unique 64 bit file id
|
||||
// . if file is opened vfd is -1, only set in call to open()
|
||||
int64_t vfd = ff->getVfd();
|
||||
key192_t ck = makeCacheKey ( vfd , offset, bytesToRead);
|
||||
char *rec; int32_t recSize;
|
||||
bool inCache = false;
|
||||
if ( rpc && vfd != -1 && ! m_validateCache )
|
||||
inCache = rpc->getRecord ( (collnum_t)0 , // collnum
|
||||
(char *)&ck ,
|
||||
&rec ,
|
||||
&recSize ,
|
||||
true , // copy?
|
||||
-1 , // maxAge, none
|
||||
true ); // inccounts?
|
||||
m_scans[i].m_inPageCache = false;
|
||||
if ( inCache ) {
|
||||
m_scans[i].m_inPageCache = true;
|
||||
m_numScansCompleted++;
|
||||
// now we have to store this value, 6 or 12 so
|
||||
// we can modify the hint appropriately
|
||||
m_scans[i].m_shifted = *rec;
|
||||
m_lists[i].set ( rec +1,
|
||||
recSize-1 ,
|
||||
rec , // alloc
|
||||
recSize , // allocSize
|
||||
startKey2 ,
|
||||
endKey2 ,
|
||||
base->m_fixedDataSize ,
|
||||
true , // owndata
|
||||
base->useHalfKeys() ,
|
||||
getKeySizeFromRdbId ( m_rdbId ) );
|
||||
continue;
|
||||
}
|
||||
|
||||
// . do the scan/read of file #i
|
||||
// . this returns false if blocked, true otherwise
|
||||
// . this will set g_errno on error
|
||||
@ -812,6 +938,10 @@ bool Msg3::doneScanning ( ) {
|
||||
}
|
||||
#endif
|
||||
|
||||
// try to fix this error i've seen
|
||||
if ( g_errno == EBADENGINEER && max == -1 )
|
||||
max = 100;
|
||||
|
||||
// . if we had a ETRYAGAIN error, then try again now
|
||||
// . it usually means the whole file or a part of it was deleted
|
||||
// before we could finish reading it, so we should re-read all now
|
||||
@ -932,6 +1062,60 @@ bool Msg3::doneScanning ( ) {
|
||||
// files we were reading, i've seen 'ff' be NULL
|
||||
char *filename = "lostfilename";
|
||||
if ( ff ) filename = ff->getFilename();
|
||||
|
||||
// compute cache info
|
||||
RdbCache *rpc = getDiskPageCache ( m_rdbId );
|
||||
if ( ! m_allowPageCache ) rpc = NULL;
|
||||
int64_t vfd ;
|
||||
if ( ff ) vfd = ff->getVfd();
|
||||
key192_t ck ;
|
||||
if ( ff )
|
||||
ck = makeCacheKey ( vfd ,
|
||||
m_scans[i].m_offset ,
|
||||
m_scans[i].m_bytesToRead );
|
||||
if ( m_validateCache && ff && rpc && vfd != -1 ) {
|
||||
bool inCache;
|
||||
char *rec; int32_t recSize;
|
||||
inCache = rpc->getRecord ( (collnum_t)0 , // collnum
|
||||
(char *)&ck ,
|
||||
&rec ,
|
||||
&recSize ,
|
||||
true , // copy?
|
||||
-1 , // maxAge, none
|
||||
true ); // inccounts?
|
||||
if ( inCache &&
|
||||
// 1st byte is RdbScan::m_shifted
|
||||
( m_lists[i].m_listSize != recSize-1 ||
|
||||
memcmp ( m_lists[i].m_list , rec+1,recSize-1) ||
|
||||
*rec != m_scans[i].m_shifted ) ) {
|
||||
log("msg3: cache did not validate");
|
||||
char *xx=NULL;*xx=0;
|
||||
}
|
||||
mfree ( rec , recSize , "vca" );
|
||||
}
|
||||
|
||||
|
||||
///////
|
||||
//
|
||||
// STORE IN PAGE CACHE
|
||||
//
|
||||
///////
|
||||
// store what we read in the cache. don't bother storing
|
||||
// if it was a retry, just in case something strange happened.
|
||||
// store pre-constrain call is more efficient.
|
||||
if ( m_retryNum<=0 && ff && rpc && vfd != -1 &&
|
||||
! m_scans[i].m_inPageCache )
|
||||
rpc->addRecord ( (collnum_t)0 , // collnum
|
||||
(char *)&ck ,
|
||||
// rec1 is this little thingy
|
||||
&m_scans[i].m_shifted,
|
||||
1,
|
||||
// rec2
|
||||
m_lists[i].getList() ,
|
||||
m_lists[i].getListSize() ,
|
||||
0 ); // timestamp. 0 = now
|
||||
|
||||
// if from our 'page' cache, no need to constrain
|
||||
if ( ! m_lists[i].constrain ( m_startKey ,
|
||||
m_constrainKey , // m_endKey
|
||||
mrs , // m_minRecSizes
|
||||
@ -947,6 +1131,7 @@ bool Msg3::doneScanning ( ) {
|
||||
mstrerror(g_errno), ff->getDir(),
|
||||
ff->getFilename(), ff->m_vfd ,
|
||||
(int32_t)ff->m_numParts );
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
|
8
Msg3.h
8
Msg3.h
@ -3,8 +3,10 @@
|
||||
// . gets an RdbList from disk
|
||||
// . reads from N specified files and stores results in N RdbLists
|
||||
|
||||
#ifndef _MSG3_H_
|
||||
#define _MSG3_H_
|
||||
#ifndef MSG3_H
|
||||
#define MSG3_H
|
||||
|
||||
class RdbCache *getDiskPageCache ( char rdbId ) ;
|
||||
|
||||
// . max # of rdb files an rdb can have w/o merging
|
||||
// . merge your files to keep the number of them low to cut down # of seeks
|
||||
@ -114,6 +116,8 @@ class Msg3 {
|
||||
//char *m_coll;
|
||||
collnum_t m_collnum;
|
||||
|
||||
bool m_validateCache;
|
||||
|
||||
// the scan classes, 1 per file, used to read from that file
|
||||
RdbScan *m_scans ; // [ MAX_RDB_FILES ];
|
||||
|
||||
|
@ -154,6 +154,7 @@ void Msg39::getDocIds ( UdpSlot *slot ) {
|
||||
int32_t requestSize = m_slot->m_readBufSize;
|
||||
// ensure its size is ok
|
||||
if ( requestSize < 8 ) {
|
||||
BadReq:
|
||||
g_errno = EBADREQUESTSIZE;
|
||||
log(LOG_LOGIC,"query: msg39: getDocIds: %s." ,
|
||||
mstrerror(g_errno) );
|
||||
@ -169,7 +170,11 @@ void Msg39::getDocIds ( UdpSlot *slot ) {
|
||||
m_r->m_buf );
|
||||
|
||||
// sanity check
|
||||
if ( finalSize != requestSize ) {char *xx=NULL;*xx=0; }
|
||||
if ( finalSize != requestSize ) {
|
||||
log("msg39: sending bad request.");
|
||||
goto BadReq;
|
||||
//char *xx=NULL;*xx=0; }
|
||||
}
|
||||
|
||||
getDocIds2 ( m_r );
|
||||
}
|
||||
|
57
Msg3a.cpp
57
Msg3a.cpp
@ -736,14 +736,6 @@ bool Msg3a::gotAllShardReplies ( ) {
|
||||
// cast it and set it
|
||||
m_reply [i] = mr;
|
||||
m_replyMaxSize[i] = replyMaxSize;
|
||||
// deserialize it (just sets the ptr_ and size_ member vars)
|
||||
//mr->deserialize ( );
|
||||
deserializeMsg ( sizeof(Msg39Reply) ,
|
||||
&mr->size_docIds,
|
||||
&mr->size_clusterRecs,
|
||||
&mr->ptr_docIds,
|
||||
mr->m_buf );
|
||||
|
||||
// sanity check
|
||||
if ( mr->m_nqt != m_q->getNumTerms() ) {
|
||||
g_errno = EBADREPLY;
|
||||
@ -761,6 +753,20 @@ bool Msg3a::gotAllShardReplies ( ) {
|
||||
mstrerror(g_errno));
|
||||
return true;
|
||||
}
|
||||
// deserialize it (just sets the ptr_ and size_ member vars)
|
||||
//mr->deserialize ( );
|
||||
if ( ! deserializeMsg ( sizeof(Msg39Reply) ,
|
||||
&mr->size_docIds,
|
||||
&mr->size_clusterRecs,
|
||||
&mr->ptr_docIds,
|
||||
mr->m_buf ) ) {
|
||||
g_errno = ECORRUPTDATA;
|
||||
m_errno = ECORRUPTDATA;
|
||||
log("query: msg3a: Shard had error: %s",
|
||||
mstrerror(g_errno));
|
||||
return true;
|
||||
|
||||
}
|
||||
// skip down here if reply was already set
|
||||
//skip:
|
||||
// add of the total hits from each shard, this is how many
|
||||
@ -1171,18 +1177,6 @@ bool Msg3a::mergeLists ( ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
fe2->m_count += fe->m_count;
|
||||
|
||||
// also accumulate count of total docs, not just in
|
||||
// the search results, that have this value for this
|
||||
// facet
|
||||
fe2->m_outsideSearchResultsCount +=
|
||||
fe->m_outsideSearchResultsCount;
|
||||
|
||||
// prefer docid kinda randomly to balance
|
||||
// lookupFacets() load in Msg40.cpp
|
||||
if ( rand() % 2 )
|
||||
fe2->m_docId = fe->m_docId;
|
||||
|
||||
|
||||
if ( isFloat ) {
|
||||
@ -1192,23 +1186,38 @@ bool Msg3a::mergeLists ( ) {
|
||||
sum2 += sum1;
|
||||
*((double *)&fe2->m_sum) = sum2;
|
||||
// and min/max as floats
|
||||
|
||||
float min1 = *((float *)&fe ->m_min);
|
||||
float min2 = *((float *)&fe2->m_min);
|
||||
if ( min1 < min2 ) min2 = min1;
|
||||
if ( fe2->m_count==0 || (fe->m_count!=0 && min1 < min2 )) min2 = min1;
|
||||
*((float *)&fe2->m_min) = min2;
|
||||
float max1 = *((float *)&fe ->m_max);
|
||||
float max2 = *((float *)&fe2->m_max);
|
||||
if ( max1 > max2 ) max2 = max1;
|
||||
if ( fe2->m_count==0 || (fe->m_count!=0 && max1 > max2 )) max2 = max1;
|
||||
*((float *)&fe2->m_max) = max2;
|
||||
}
|
||||
if ( isInt ) {
|
||||
fe2->m_sum += fe->m_sum;
|
||||
if ( fe->m_min < fe2->m_min )
|
||||
if ( fe2->m_count==0 || (fe->m_count!=0 && fe->m_min < fe2->m_min ))
|
||||
fe2->m_min = fe->m_min;
|
||||
if ( fe->m_max > fe2->m_max )
|
||||
if ( fe2->m_count==0 || (fe->m_count!=0 && fe->m_max > fe2->m_max ))
|
||||
fe2->m_max = fe->m_max;
|
||||
}
|
||||
|
||||
fe2->m_count += fe->m_count;
|
||||
|
||||
// also accumulate count of total docs, not just in
|
||||
// the search results, that have this value for this
|
||||
// facet
|
||||
fe2->m_outsideSearchResultsCount +=
|
||||
fe->m_outsideSearchResultsCount;
|
||||
|
||||
// prefer docid kinda randomly to balance
|
||||
// lookupFacets() load in Msg40.cpp
|
||||
if ( rand() % 2 )
|
||||
fe2->m_docId = fe->m_docId;
|
||||
|
||||
|
||||
}
|
||||
|
||||
// now get the next gbfacet: term if there was one
|
||||
|
44
Msg40.cpp
44
Msg40.cpp
@ -696,7 +696,7 @@ bool Msg40::federatedLoop ( ) {
|
||||
// and mult based on index size
|
||||
numDocIdSplits *= mult;
|
||||
// prevent going OOM for type:article AND html
|
||||
//if ( numDocIdSplits < 5 ) numDocIdSplits = 5;
|
||||
if ( numDocIdSplits < 5 ) numDocIdSplits = 5;
|
||||
//}
|
||||
|
||||
if ( cr ) mr.m_maxQueryTerms = cr->m_maxQueryTerms;
|
||||
@ -1071,7 +1071,7 @@ bool Msg40::reallocMsg20Buf ( ) {
|
||||
// . allocate m_buf2 to hold all our Msg20 pointers and Msg20 classes
|
||||
// . how much mem do we need?
|
||||
// . need space for the msg20 ptrs
|
||||
int32_t need = m_msg3a.m_numDocIds * sizeof(Msg20 *);
|
||||
int64_t need = m_msg3a.m_numDocIds * sizeof(Msg20 *);
|
||||
// need space for the classes themselves, only if "visible" though
|
||||
for ( int32_t i = 0 ; i < m_msg3a.m_numDocIds ; i++ )
|
||||
if ( m_msg3a.m_clusterLevels[i] == CR_OK )
|
||||
@ -1243,6 +1243,13 @@ bool Msg40::reallocMsg20Buf ( ) {
|
||||
m_buf2 = NULL;
|
||||
m_bufMaxSize2 = need;
|
||||
|
||||
// if ( need > 2000000000 ) {
|
||||
// log("msg40: need too much mem=%"INT64,need);
|
||||
// m_errno = ENOMEM;
|
||||
// g_errno = ENOMEM;
|
||||
// return false;
|
||||
// }
|
||||
|
||||
// do the alloc
|
||||
if ( need ) m_buf2 = (char *)mmalloc ( need ,"Msg40msg20");
|
||||
if ( need && ! m_buf2 ) { m_errno = g_errno; return false; }
|
||||
@ -2030,7 +2037,7 @@ bool Msg40::gotSummary ( ) {
|
||||
// . set it to true on all but the last thing we send!
|
||||
// . after each chunk of data we send out, TcpServer::sendChunk
|
||||
// will call our callback, doneSendingWrapper9
|
||||
if ( m_si->m_streamResults )
|
||||
if ( m_si->m_streamResults && st->m_socket )
|
||||
st->m_socket->m_streamingMode = true;
|
||||
|
||||
|
||||
@ -2112,7 +2119,7 @@ bool Msg40::gotSummary ( ) {
|
||||
if ( g_conf.m_logDebugTcp )
|
||||
log("tcp: disabling streamingMode now");
|
||||
// this will be our final send
|
||||
st->m_socket->m_streamingMode = false;
|
||||
if ( st->m_socket ) st->m_socket->m_streamingMode = false;
|
||||
}
|
||||
|
||||
|
||||
@ -2120,6 +2127,24 @@ bool Msg40::gotSummary ( ) {
|
||||
|
||||
//g_conf.m_logDebugTcp = 1;
|
||||
|
||||
// do we still own this socket? i am thinking it got closed somewhere
|
||||
// and the socket descriptor was re-assigned to another socket
|
||||
// getting a diffbot reply from XmlDoc::getDiffbotReply()
|
||||
if ( st->m_socket &&
|
||||
st->m_socket->m_startTime != st->m_socketStartTimeHack ) {
|
||||
log("msg40: lost control of socket. sd=%i. the socket "
|
||||
"descriptor closed on us and got re-used by someone else.",
|
||||
(int)st->m_socket->m_sd);
|
||||
// if there wasn't already an error like 'broken pipe' then
|
||||
// set it here so we stop getting summaries if streaming.
|
||||
if ( ! m_socketHadError ) m_socketHadError = EBADENGINEER;
|
||||
// make it NULL to avoid us from doing anything to it
|
||||
// since someone else is using it now.
|
||||
st->m_socket = NULL;
|
||||
//g_errno = EBADENGINEER;
|
||||
}
|
||||
|
||||
|
||||
// . transmit the chunk in sb if non-zero length
|
||||
// . steals the allocated buffer from sb and stores in the
|
||||
// TcpSocket::m_sendBuf, which it frees when socket is
|
||||
@ -2133,6 +2158,7 @@ bool Msg40::gotSummary ( ) {
|
||||
if ( sb->length() &&
|
||||
// did client browser close the socket on us midstream?
|
||||
! m_socketHadError &&
|
||||
st->m_socket &&
|
||||
! tcp->sendChunk ( st->m_socket ,
|
||||
sb ,
|
||||
this ,
|
||||
@ -2145,8 +2171,11 @@ bool Msg40::gotSummary ( ) {
|
||||
|
||||
// writing on closed socket?
|
||||
if ( g_errno ) {
|
||||
m_socketHadError = g_errno;
|
||||
if ( ! m_socketHadError ) m_socketHadError = g_errno;
|
||||
log("msg40: got tcp error : %s",mstrerror(g_errno));
|
||||
// disown it here so we do not damage in case it gets
|
||||
// reopened by someone else
|
||||
st->m_socket = NULL;
|
||||
}
|
||||
|
||||
// do we need to launch another batch of summary requests?
|
||||
@ -2200,8 +2229,9 @@ bool Msg40::gotSummary ( ) {
|
||||
//mdelete(st, sizeof(State0), "msg40st0");
|
||||
//delete st;
|
||||
// otherwise, all done!
|
||||
log("msg40: did not send stuff from last summary. BUG "
|
||||
"this=0x%"PTRFMT"",(PTRTYPE)this);
|
||||
log("msg40: did not send last search result summary. "
|
||||
"this=0x%"PTRFMT" because had error: %s",(PTRTYPE)this,
|
||||
mstrerror(m_socketHadError));
|
||||
return true;
|
||||
}
|
||||
|
||||
|
6
Msg5.cpp
6
Msg5.cpp
@ -531,6 +531,10 @@ bool Msg5::readList ( ) {
|
||||
int32_t niceness = m_niceness;
|
||||
if ( niceness > 0 ) niceness = 2;
|
||||
if ( m_isRealMerge ) niceness = 1;
|
||||
bool allowPageCache = true;
|
||||
// just in case cache is corrupted, do not use it for doing real
|
||||
// merges, also it would kick out good lists we have in there already
|
||||
if ( m_isRealMerge ) allowPageCache = false;
|
||||
if ( compute ) {
|
||||
m_msg3.readList ( m_rdbId ,
|
||||
m_collnum ,
|
||||
@ -547,7 +551,7 @@ bool Msg5::readList ( ) {
|
||||
m_compensateForMerge ,
|
||||
-1,//m_syncPoint ,
|
||||
true , // just get endKey?
|
||||
m_allowPageCache );
|
||||
allowPageCache );
|
||||
if ( g_errno ) {
|
||||
log("db: Msg5: getting endKey: %s",mstrerrno(g_errno));
|
||||
return true;
|
||||
|
@ -3347,7 +3347,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Crawl Completion Time:</td>"
|
||||
"<td><b>Last Crawl Completion Time:</td>"
|
||||
"<td>%"UINT32"</td>"
|
||||
"</tr>"
|
||||
|
||||
@ -3362,6 +3362,46 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
"<td>%"INT32"</td>"
|
||||
"</tr>"
|
||||
|
||||
, cr->m_diffbotCrawlName.getBufStart()
|
||||
|
||||
, (int32_t)cr->m_isCustomCrawl
|
||||
|
||||
, cr->m_diffbotToken.getBufStart()
|
||||
|
||||
, seedStr
|
||||
|
||||
, crawlStatus
|
||||
, tmp.getBufStart()
|
||||
|
||||
, cr->m_diffbotCrawlStartTime
|
||||
// this is 0 if not over yet
|
||||
, cr->m_diffbotCrawlEndTime
|
||||
|
||||
, cr->m_spiderRoundNum
|
||||
, cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider
|
||||
|
||||
);
|
||||
|
||||
// show crawlinfo crap
|
||||
CrawlInfo *cis = (CrawlInfo *)cr->m_crawlInfoBuf.getBufStart();
|
||||
sb.safePrintf("<tr><td><b>Ready Hosts</b></td><td>");
|
||||
for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) {
|
||||
CrawlInfo *ci = &cis[i];
|
||||
if ( ! ci->m_hasUrlsReadyToSpider ) continue;
|
||||
Host *h = g_hostdb.getHost ( i );
|
||||
if ( ! h ) continue;
|
||||
sb.safePrintf("<a href=http://%s:%i/crawlbot?c=%s>"
|
||||
"%i</a> "
|
||||
, iptoa(h->m_ip)
|
||||
, (int)h->m_httpPort
|
||||
, cr->m_coll
|
||||
, (int)i
|
||||
);
|
||||
}
|
||||
sb.safePrintf("</tr>\n");
|
||||
|
||||
|
||||
sb.safePrintf(
|
||||
|
||||
// this will have to be in crawlinfo too!
|
||||
//"<tr>"
|
||||
@ -3416,24 +3456,6 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
"</tr>"
|
||||
|
||||
|
||||
, cr->m_diffbotCrawlName.getBufStart()
|
||||
|
||||
, (int32_t)cr->m_isCustomCrawl
|
||||
|
||||
, cr->m_diffbotToken.getBufStart()
|
||||
|
||||
, seedStr
|
||||
|
||||
, crawlStatus
|
||||
, tmp.getBufStart()
|
||||
|
||||
, cr->m_diffbotCrawlStartTime
|
||||
// this is 0 if not over yet
|
||||
, cr->m_diffbotCrawlEndTime
|
||||
|
||||
, cr->m_spiderRoundNum
|
||||
, cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider
|
||||
|
||||
, cr->m_globalCrawlInfo.m_objectsAdded -
|
||||
cr->m_globalCrawlInfo.m_objectsDeleted
|
||||
, cr->m_globalCrawlInfo.m_urlsHarvested
|
||||
|
@ -75,6 +75,13 @@ void setInjectionRequestFromParms ( TcpSocket *sock ,
|
||||
int32_t def = atoll(m->m_def);
|
||||
*ii = (char)hr->getLong(m->m_cgi,def);
|
||||
}
|
||||
else if ( m->m_type == TYPE_IP ) {
|
||||
char *ii = (char *)((char *)ir + m->m_off);
|
||||
char *is = hr->getString(m->m_cgi,NULL);
|
||||
*(int32_t *)ii = 0; // default ip to 0
|
||||
// otherwise, set the ip
|
||||
if ( is ) *(int32_t *)ii = atoip(is);
|
||||
}
|
||||
// if unsupported let developer know
|
||||
else { char *xx=NULL;*xx=0; }
|
||||
}
|
||||
@ -581,11 +588,29 @@ bool sendHttpReply ( void *state ) {
|
||||
//
|
||||
////////////
|
||||
|
||||
XmlDoc *s_injectHead = NULL;
|
||||
XmlDoc *s_injectTail = NULL;
|
||||
|
||||
// Accessor for s_injectHead, the head of the doubly linked list of XmlDocs
// that handleRequest7() appends to and sendUdpReply7() unlinks from.
// Returns NULL when no XmlDoc is currently on that list.
XmlDoc *getInjectHead ( ) { return s_injectHead; }
|
||||
|
||||
// send back a reply to the originator of the msg7 injection request
|
||||
void sendUdpReply7 ( void *state ) {
|
||||
|
||||
XmlDoc *xd = (XmlDoc *)state;
|
||||
|
||||
// remove from linked list
|
||||
if ( xd->m_nextInject )
|
||||
xd->m_nextInject->m_prevInject = xd->m_prevInject;
|
||||
if ( xd->m_prevInject )
|
||||
xd->m_prevInject->m_nextInject = xd->m_nextInject;
|
||||
if ( s_injectHead == xd )
|
||||
s_injectHead = xd->m_nextInject;
|
||||
if ( s_injectTail == xd )
|
||||
s_injectTail = xd->m_prevInject;
|
||||
xd->m_nextInject = NULL;
|
||||
xd->m_prevInject = NULL;
|
||||
|
||||
|
||||
UdpSlot *slot = xd->m_injectionSlot;
|
||||
|
||||
uint32_t statColor = 0xccffcc;
|
||||
@ -655,6 +680,19 @@ void handleRequest7 ( UdpSlot *slot , int32_t netnice ) {
|
||||
xd->m_injectionSlot = slot;
|
||||
xd->m_injectStartTime = gettimeofdayInMilliseconds();
|
||||
|
||||
// add to linked list
|
||||
xd->m_nextInject = NULL;
|
||||
xd->m_prevInject = NULL;
|
||||
if ( s_injectTail ) {
|
||||
s_injectTail->m_nextInject = xd;
|
||||
xd->m_prevInject = s_injectTail;
|
||||
s_injectTail = xd;
|
||||
}
|
||||
else {
|
||||
s_injectHead = xd;
|
||||
s_injectTail = xd;
|
||||
}
|
||||
|
||||
if ( ! xd->injectDoc ( ir->ptr_url , // m_injectUrlBuf.getBufStart() ,
|
||||
cr ,
|
||||
ir->ptr_content , // start , // content ,
|
||||
|
@ -1,6 +1,10 @@
|
||||
#ifndef GBINJECT_H
|
||||
#define GBINJECT_H
|
||||
|
||||
// for getting list of injections currently being processed on this host
|
||||
// for printing in the Spider Queue table in Spider.cpp
|
||||
class XmlDoc *getInjectHead ( ) ;
|
||||
|
||||
void handleRequest7Import ( class UdpSlot *slot , int32_t netnice ) ;
|
||||
|
||||
void handleRequest7 ( class UdpSlot *slot , int32_t netnice ) ;
|
||||
|
@ -68,8 +68,12 @@ bool sendReply ( State0 *st , char *reply ) {
|
||||
|
||||
int32_t savedErr = g_errno;
|
||||
|
||||
TcpSocket *s = st->m_socket;
|
||||
if ( ! s ) { char *xx=NULL;*xx=0; }
|
||||
TcpSocket *sock = st->m_socket;
|
||||
if ( ! sock ) {
|
||||
log("results: not sending back results on an empty socket."
|
||||
"socket must have closed on us abruptly.");
|
||||
//char *xx=NULL;*xx=0; }
|
||||
}
|
||||
SearchInput *si = &st->m_si;
|
||||
char *ct = "text/html";
|
||||
if ( si && si->m_format == FORMAT_XML ) ct = "text/xml";
|
||||
@ -143,7 +147,8 @@ bool sendReply ( State0 *st , char *reply ) {
|
||||
//
|
||||
// send back the actual search results
|
||||
//
|
||||
g_httpServer.sendDynamicPage(s,
|
||||
if ( sock )
|
||||
g_httpServer.sendDynamicPage(sock,
|
||||
reply,
|
||||
rlen,//gbstrlen(reply),
|
||||
// don't let the ajax re-gen
|
||||
@ -199,9 +204,9 @@ bool sendReply ( State0 *st , char *reply ) {
|
||||
// if we had a broken pipe from the browser while sending
|
||||
// them the search results, then we end up closing the socket fd
|
||||
// in TcpServer::sendChunk() > sendMsg() > destroySocket()
|
||||
if ( s->m_numDestroys ) {
|
||||
if ( sock && sock->m_numDestroys ) {
|
||||
log("results: not sending back error on destroyed socket "
|
||||
"sd=%"INT32"",s->m_sd);
|
||||
"sd=%"INT32"",sock->m_sd);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -212,7 +217,8 @@ bool sendReply ( State0 *st , char *reply ) {
|
||||
savedErr == ENOCOLLREC)
|
||||
status = 400;
|
||||
|
||||
g_httpServer.sendQueryErrorReply(s,
|
||||
if ( sock )
|
||||
g_httpServer.sendQueryErrorReply(sock,
|
||||
status,
|
||||
mstrerror(savedErr),
|
||||
format,//xml,
|
||||
@ -542,6 +548,9 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
|
||||
// set this in case SearchInput::set fails!
|
||||
st->m_socket = s;
|
||||
|
||||
// record timestamp so we know if we got our socket closed and swapped
|
||||
st->m_socketStartTimeHack = s->m_startTime;
|
||||
|
||||
// save this count so we know if TcpServer.cpp calls destroySocket(s)
|
||||
st->m_numDestroys = s->m_numDestroys;
|
||||
|
||||
@ -1154,6 +1163,16 @@ bool gotResults ( void *state ) {
|
||||
|
||||
SearchInput *si = &st->m_si;
|
||||
|
||||
// if we lost the socket because we were streaming and it
|
||||
// got closed from a broken pipe or something, then Msg40.cpp
|
||||
// will set st->m_socket to NULL if the fd ends up being closed
|
||||
// because someone else might be using it and we do not want to
|
||||
// mess with their TcpSocket settings.
|
||||
if ( ! st->m_socket ) {
|
||||
log("results: socket is NULL. sending failed.");
|
||||
return sendReply(st,NULL);
|
||||
}
|
||||
|
||||
// if in streaming mode and we never sent anything and we had
|
||||
// an error, then send that back. we never really entered streaming
|
||||
// mode in that case. this happens when someone deletes a coll
|
||||
@ -1164,6 +1183,23 @@ bool gotResults ( void *state ) {
|
||||
st->m_socket->m_totalSent == 0 )
|
||||
return sendReply(st,NULL);
|
||||
|
||||
|
||||
// if we skipped a shard because it was dead, usually we provide
|
||||
// the results anyway, but if this switch is true then return an
|
||||
// error code instead. this is the 'all or nothing' switch.
|
||||
if ( msg40->m_msg3a.m_skippedShards > 0 &&
|
||||
! g_conf.m_returnResultsAnyway ) {
|
||||
char reply[256];
|
||||
sprintf ( reply ,
|
||||
"%"INT32" shard(s) out of %"INT32" did not "
|
||||
"respond to query."
|
||||
, msg40->m_msg3a.m_skippedShards
|
||||
, g_hostdb.m_numShards );
|
||||
g_errno = ESHARDDOWN;
|
||||
return sendReply(st,reply);
|
||||
}
|
||||
|
||||
|
||||
// if already printed from Msg40.cpp, bail out now
|
||||
if ( si->m_streamResults ) {
|
||||
// this will be our final send
|
||||
@ -1220,10 +1256,21 @@ bool gotResults ( void *state ) {
|
||||
// into it, and it must be the SAME ptr too!
|
||||
CollectionRec *cr = si->m_cr;//g_collectiondb.getRec ( collnum );
|
||||
if ( ! cr ) { // || cr != si->m_cr ) {
|
||||
g_errno = ENOCOLLREC;
|
||||
return sendReply(st,NULL);
|
||||
g_errno = ENOCOLLREC;
|
||||
return sendReply(st,NULL);
|
||||
}
|
||||
|
||||
// this causes ooms everywhere, not a good fix
|
||||
if ( ! msg40->m_msg20 && ! si->m_docIdsOnly && msg40->m_errno ) {
|
||||
log("msg40: failed to get results q=%s",si->m_q.m_orig);
|
||||
//g_errno = ENOMEM;
|
||||
g_errno = msg40->m_errno;
|
||||
return sendReply(st,NULL);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//char *coll = cr->m_coll;
|
||||
|
||||
/*
|
||||
@ -3926,6 +3973,8 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
ix, (int32_t)msg40->getClusterLevel(ix));
|
||||
|
||||
int64_t d = msg40->getDocId(ix);
|
||||
// this is normally a double, but cast to float
|
||||
float docScore = (float)msg40->getScore(ix);
|
||||
|
||||
// do not print if it is a summary dup or had some error
|
||||
// int32_t level = (int32_t)msg40->getClusterLevel(ix);
|
||||
@ -5047,6 +5096,7 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
// . docId for possible cached link
|
||||
// . might have merged a bunch together
|
||||
sb->safePrintf("\t\t<docId>%"INT64"</docId>\n",mr->m_docId );
|
||||
sb->safePrintf("\t\t<docScore>%f</docScore>\n",docScore);
|
||||
}
|
||||
|
||||
if ( si->m_format == FORMAT_XML && mr->m_contentType != CT_STATUS ) {
|
||||
@ -5097,6 +5147,7 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
// . docId for possible cached link
|
||||
// . might have merged a bunch together
|
||||
sb->safePrintf("\t\t\"docId\":%"INT64",\n",mr->m_docId );
|
||||
sb->safePrintf("\t\t\"docScore\":%f,\n",docScore);
|
||||
}
|
||||
|
||||
if ( si->m_format == FORMAT_JSON && mr->m_contentType != CT_STATUS ) {
|
||||
@ -5943,15 +5994,15 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
"</numGoodSiteInlinks>\n",
|
||||
(int32_t)mr->m_siteNumInlinks );
|
||||
|
||||
sb->safePrintf ("\t\t<numTotalSiteInlinks>%"INT32""
|
||||
"</numTotalSiteInlinks>\n",
|
||||
(int32_t)mr->m_siteNumInlinksTotal );
|
||||
sb->safePrintf ("\t\t<numUniqueIpsLinkingToSite>%"INT32""
|
||||
"</numUniqueIpsLinkingToSite>\n",
|
||||
(int32_t)mr->m_siteNumUniqueIps );
|
||||
sb->safePrintf ("\t\t<numUniqueCBlocksLinkingToSite>%"INT32""
|
||||
"</numUniqueCBlocksLinkingToSite>\n",
|
||||
(int32_t)mr->m_siteNumUniqueCBlocks );
|
||||
// sb->safePrintf ("\t\t<numTotalSiteInlinks>%"INT32""
|
||||
// "</numTotalSiteInlinks>\n",
|
||||
// (int32_t)mr->m_siteNumInlinksTotal );
|
||||
// sb->safePrintf ("\t\t<numUniqueIpsLinkingToSite>%"INT32""
|
||||
// "</numUniqueIpsLinkingToSite>\n",
|
||||
// (int32_t)mr->m_siteNumUniqueIps );
|
||||
// sb->safePrintf("\t\t<numUniqueCBlocksLinkingToSite>%"INT32""
|
||||
// "</numUniqueCBlocksLinkingToSite>\n",
|
||||
// (int32_t)mr->m_siteNumUniqueCBlocks );
|
||||
|
||||
|
||||
struct tm *timeStruct3;
|
||||
|
@ -63,6 +63,7 @@ public:
|
||||
bool m_didRedownload;
|
||||
XmlDoc *m_xd;
|
||||
int32_t m_oldContentHash32;
|
||||
int64_t m_socketStartTimeHack;
|
||||
};
|
||||
|
||||
|
||||
|
@ -383,16 +383,17 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
|
||||
"<td><b>hostname</b></td>";
|
||||
}
|
||||
|
||||
UdpSlot *slot = server->m_head3;
|
||||
int32_t callbackReadyCount = 0;
|
||||
for ( ; slot ; slot = slot->m_next3 , callbackReadyCount++ );
|
||||
//UdpSlot *slot = server->m_head3;
|
||||
//int32_t callbackReadyCount = 0;
|
||||
//for ( ; slot ; slot = slot->m_next3 , callbackReadyCount++ );
|
||||
|
||||
p->safePrintf ( "<table %s>"
|
||||
"<tr class=hdrow><td colspan=19>"
|
||||
"<center>"
|
||||
//"<font size=+1>"
|
||||
"<b>%s</b> (%"INT32" transactions)"
|
||||
"(%"INT32" requests waiting to processed)"
|
||||
//"(%"INT32" requests waiting to processed)"
|
||||
"(%"INT32" incoming)"
|
||||
//"</font>"
|
||||
"</td></tr>"
|
||||
"<tr bgcolor=#%s>"
|
||||
@ -419,7 +420,8 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
|
||||
"</tr>\n" ,
|
||||
TABLE_STYLE,
|
||||
title , server->getNumUsedSlots() ,
|
||||
callbackReadyCount ,
|
||||
//callbackReadyCount ,
|
||||
server->getNumUsedSlotsIncoming() ,
|
||||
DARK_BLUE ,
|
||||
dd );
|
||||
|
||||
|
@ -30,6 +30,7 @@
|
||||
#include "Sections.h"
|
||||
//#include "Msg0.h" // g_termlistCache
|
||||
#include "Msg13.h"
|
||||
#include "Msg3.h"
|
||||
|
||||
bool printNumAbbr ( SafeBuf &p, int64_t vvv ) {
|
||||
float val = (float)vvv;
|
||||
@ -733,6 +734,18 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
|
||||
p.safePrintf("<td>%"INT64"</td>",a);
|
||||
}
|
||||
|
||||
p.safePrintf ("</tr>\n<tr class=poo><td><b><nobr>dropped recs</td>" );
|
||||
for ( int32_t i = 0 ; i < numCaches ; i++ ) {
|
||||
int64_t a = caches[i]->m_deletes;
|
||||
p.safePrintf("<td>%"INT64"</td>",a);
|
||||
}
|
||||
|
||||
p.safePrintf ("</tr>\n<tr class=poo><td><b><nobr>added recs</td>" );
|
||||
for ( int32_t i = 0 ; i < numCaches ; i++ ) {
|
||||
int64_t a = caches[i]->m_adds;
|
||||
p.safePrintf("<td>%"INT64"</td>",a);
|
||||
}
|
||||
|
||||
//p.safePrintf ("</tr>\n<tr class=poo><td><b><nobr>max age</td>" );
|
||||
//for ( int32_t i = 0 ; i < numCaches ; i++ ) {
|
||||
// int64_t a = caches[i]->getMaxMem();
|
||||
@ -2076,64 +2089,72 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
|
||||
*/
|
||||
|
||||
|
||||
p.safePrintf("<tr class=poo><td><b>page cache hits %%</b></td>");
|
||||
p.safePrintf("<tr class=poo><td><b>file cache hits %%</b></td>");
|
||||
totalf = 0.0;
|
||||
for ( int32_t i = 0 ; i < nr ; i++ ) {
|
||||
if ( ! rdbs[i]->m_pc ) {
|
||||
Rdb *rdb = rdbs[i];
|
||||
RdbCache *rpc = getDiskPageCache ( rdb->m_rdbId );
|
||||
if ( ! rpc ) {
|
||||
p.safePrintf("<td>--</td>");
|
||||
continue;
|
||||
}
|
||||
int64_t hits = rdbs[i]->m_pc->getNumHits();
|
||||
int64_t misses = rdbs[i]->m_pc->getNumMisses();
|
||||
int64_t hits = rpc->getNumHits();
|
||||
int64_t misses = rpc->getNumMisses();
|
||||
int64_t sum = hits + misses;
|
||||
float val = 0.0;
|
||||
if ( sum > 0.0 ) val = ((float)hits * 100.0) / (float)sum;
|
||||
totalf += val;
|
||||
p.safePrintf("<td>%.1f</td>",val);
|
||||
//totalf += val;
|
||||
p.safePrintf("<td>%.1f%%</td>",val);
|
||||
}
|
||||
p.safePrintf("<td>%.1f</td></tr>\n",totalf);
|
||||
p.safePrintf("<td>--</td></tr>\n");
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
p.safePrintf("<tr class=poo><td><b>page cache hits</b></td>");
|
||||
p.safePrintf("<tr class=poo><td><b>file cache hits</b></td>");
|
||||
total = 0;
|
||||
for ( int32_t i = 0 ; i < nr ; i++ ) {
|
||||
if ( ! rdbs[i]->m_pc ) {
|
||||
Rdb *rdb = rdbs[i];
|
||||
RdbCache *rpc = getDiskPageCache ( rdb->m_rdbId );
|
||||
if ( ! rpc ) {
|
||||
p.safePrintf("<td>--</td>");
|
||||
continue;
|
||||
}
|
||||
int64_t val = rdbs[i]->m_pc->getNumHits();
|
||||
int64_t val = rpc->getNumHits();
|
||||
total += val;
|
||||
p.safePrintf("<td>%"UINT64"</td>",val);
|
||||
}
|
||||
p.safePrintf("<td>%"UINT64"</td></tr>\n",total);
|
||||
|
||||
|
||||
p.safePrintf("<tr class=poo><td><b>page cache misses</b></td>");
|
||||
p.safePrintf("<tr class=poo><td><b>file cache misses</b></td>");
|
||||
total = 0;
|
||||
for ( int32_t i = 0 ; i < nr ; i++ ) {
|
||||
if ( ! rdbs[i]->m_pc ) {
|
||||
Rdb *rdb = rdbs[i];
|
||||
RdbCache *rpc = getDiskPageCache ( rdb->m_rdbId );
|
||||
if ( ! rpc ) {
|
||||
p.safePrintf("<td>--</td>");
|
||||
continue;
|
||||
}
|
||||
int64_t val = rdbs[i]->m_pc->getNumMisses();
|
||||
int64_t val = rpc->getNumMisses();
|
||||
total += val;
|
||||
p.safePrintf("<td>%"UINT64"</td>",val);
|
||||
}
|
||||
p.safePrintf("<td>%"UINT64"</td></tr>\n",total);
|
||||
|
||||
|
||||
p.safePrintf("<tr class=poo><td><b>page cache tries</b></td>");
|
||||
p.safePrintf("<tr class=poo><td><b>file cache tries</b></td>");
|
||||
total = 0;
|
||||
for ( int32_t i = 0 ; i < nr ; i++ ) {
|
||||
if ( ! rdbs[i]->m_pc ) {
|
||||
Rdb *rdb = rdbs[i];
|
||||
RdbCache *rpc = getDiskPageCache ( rdb->m_rdbId );
|
||||
if ( ! rpc ) {
|
||||
p.safePrintf("<td>--</td>");
|
||||
continue;
|
||||
}
|
||||
int64_t hits = rdbs[i]->m_pc->getNumHits();
|
||||
int64_t misses = rdbs[i]->m_pc->getNumMisses();
|
||||
int64_t hits = rpc->getNumHits();
|
||||
int64_t misses = rpc->getNumMisses();
|
||||
int64_t val = hits + misses;
|
||||
total += val;
|
||||
p.safePrintf("<td>%"UINT64"</td>",val);
|
||||
@ -2141,28 +2162,60 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
|
||||
p.safePrintf("<td>%"UINT64"</td></tr>\n",total);
|
||||
|
||||
|
||||
p.safePrintf("<tr class=poo><td><b>page cache used</b></td>");
|
||||
p.safePrintf("<tr class=poo><td><b>file cache adds</b></td>");
|
||||
total = 0;
|
||||
for ( int32_t i = 0 ; i < nr ; i++ ) {
|
||||
if ( ! rdbs[i]->m_pc ) {
|
||||
Rdb *rdb = rdbs[i];
|
||||
RdbCache *rpc = getDiskPageCache ( rdb->m_rdbId );
|
||||
if ( ! rpc ) {
|
||||
p.safePrintf("<td>--</td>");
|
||||
continue;
|
||||
}
|
||||
int64_t val = rdbs[i]->m_pc->getMemUsed();
|
||||
p.safePrintf("<td>%"UINT64"</td>",rpc->m_adds);
|
||||
}
|
||||
p.safePrintf("<td>%"UINT64"</td></tr>\n",total);
|
||||
|
||||
|
||||
p.safePrintf("<tr class=poo><td><b>file cache drops</b></td>");
|
||||
total = 0;
|
||||
for ( int32_t i = 0 ; i < nr ; i++ ) {
|
||||
Rdb *rdb = rdbs[i];
|
||||
RdbCache *rpc = getDiskPageCache ( rdb->m_rdbId );
|
||||
if ( ! rpc ) {
|
||||
p.safePrintf("<td>--</td>");
|
||||
continue;
|
||||
}
|
||||
p.safePrintf("<td>%"UINT64"</td>",rpc->m_deletes);
|
||||
}
|
||||
p.safePrintf("<td>%"UINT64"</td></tr>\n",total);
|
||||
|
||||
|
||||
p.safePrintf("<tr class=poo><td><b>file cache used</b></td>");
|
||||
total = 0;
|
||||
for ( int32_t i = 0 ; i < nr ; i++ ) {
|
||||
Rdb *rdb = rdbs[i];
|
||||
RdbCache *rpc = getDiskPageCache ( rdb->m_rdbId );
|
||||
if ( ! rpc ) {
|
||||
p.safePrintf("<td>--</td>");
|
||||
continue;
|
||||
}
|
||||
int64_t val = rpc->getMemOccupied();
|
||||
total += val;
|
||||
printNumAbbr ( p , val );
|
||||
}
|
||||
p.safePrintf("<td>%"UINT64"</td></tr>\n",total);
|
||||
|
||||
|
||||
p.safePrintf("<tr class=poo><td><b><nobr>page cache allocated</nobr></b></td>");
|
||||
p.safePrintf("<tr class=poo><td><b><nobr>file cache allocated</nobr></b></td>");
|
||||
total = 0;
|
||||
for ( int32_t i = 0 ; i < nr ; i++ ) {
|
||||
if ( ! rdbs[i]->m_pc ) {
|
||||
Rdb *rdb = rdbs[i];
|
||||
RdbCache *rpc = getDiskPageCache ( rdb->m_rdbId );
|
||||
if ( ! rpc ) {
|
||||
p.safePrintf("<td>--</td>");
|
||||
continue;
|
||||
}
|
||||
int64_t val = rdbs[i]->m_pc->getMemAlloced();
|
||||
int64_t val = rpc->getMemAlloced();
|
||||
total += val;
|
||||
printNumAbbr ( p , val );
|
||||
}
|
||||
|
@ -29,24 +29,32 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
|
||||
|
||||
|
||||
|
||||
int32_t loActive = q->m_loLaunched - q->m_loReturned;
|
||||
int32_t mdActive = q->m_mdLaunched - q->m_mdReturned;
|
||||
int32_t hiActive = q->m_hiLaunched - q->m_hiReturned;
|
||||
int32_t total = loActive + mdActive + hiActive;
|
||||
// int32_t loActive = q->m_loLaunched - q->m_loReturned;
|
||||
// int32_t mdActive = q->m_mdLaunched - q->m_mdReturned;
|
||||
// int32_t hiActive = q->m_hiLaunched - q->m_hiReturned;
|
||||
// int32_t total = loActive + mdActive + hiActive;
|
||||
|
||||
int32_t total = q->m_launched - q->m_returned;
|
||||
|
||||
p.safePrintf ( "<table %s>"
|
||||
"<tr class=hdrow><td colspan=\"11\">"
|
||||
//"<center>"
|
||||
//"<font size=+1>"
|
||||
"<b>Thread Type: %s"
|
||||
" (low: %"INT32""
|
||||
" med: %"INT32""
|
||||
" high: %"INT32""
|
||||
" total: %"INT32")</td></tr>",
|
||||
// " (low: %"INT32""
|
||||
// " med: %"INT32""
|
||||
// " high: %"INT32""
|
||||
" (launched: %"INT32" "
|
||||
"returned: %"INT32" "
|
||||
"total: %"INT32" maxpossibleout: %i)</td></tr>",
|
||||
TABLE_STYLE,
|
||||
q->getThreadType(),
|
||||
loActive, mdActive,
|
||||
hiActive, total);
|
||||
// loActive, mdActive,
|
||||
// hiActive,
|
||||
(int32_t)q->m_launched,
|
||||
(int32_t)q->m_returned,
|
||||
total,
|
||||
(int)MAX_STACKS);
|
||||
|
||||
|
||||
p.safePrintf ("<tr bgcolor=#%s>"
|
||||
@ -59,19 +67,20 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
|
||||
"<td><b>Callback</b></td>"
|
||||
"<td><b>Routine</b></td>"
|
||||
"<td><b>Bytes Done</b></td>"
|
||||
"<td><b>KBytes/Sec</b></td>"
|
||||
"<td><b>Megabytes/Sec</b></td>"
|
||||
"<td><b>Read|Write</b></td>"
|
||||
"</tr>"
|
||||
, LIGHT_BLUE
|
||||
);
|
||||
|
||||
for ( int32_t j = 0 ; j < q->m_top ; j++ ) {
|
||||
for ( int32_t j = 0 ; j < q->m_maxEntries ; j++ ) {
|
||||
ThreadEntry *t = &q->m_entries[j];
|
||||
if(!t->m_isOccupied) continue;
|
||||
|
||||
FileState *fs = (FileState *)t->m_state;
|
||||
bool diskThread = false;
|
||||
if(q->m_threadType == DISK_THREAD && fs) diskThread = true;
|
||||
if(q->m_threadType == DISK_THREAD && fs)
|
||||
diskThread = true;
|
||||
|
||||
// might have got pre-called from EDISKSTUCK
|
||||
if ( ! t->m_callback ) fs = NULL;
|
||||
@ -81,18 +90,29 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
|
||||
if(t->m_isDone) {
|
||||
p.safePrintf("<td><font color='red'><b>done</b></font></td>");
|
||||
p.safePrintf("<td>%"INT32"</td>", t->m_niceness);
|
||||
p.safePrintf("<td>%"INT64"</td>", t->m_launchedTime - t->m_queuedTime); //queued
|
||||
p.safePrintf("<td>%"INT64"</td>", t->m_exitTime - t->m_launchedTime); //run time
|
||||
p.safePrintf("<td>%"INT64"</td>", now - t->m_exitTime); //cleanup
|
||||
p.safePrintf("<td>%"INT64"</td>", now - t->m_queuedTime); //total
|
||||
p.safePrintf("<td>%"INT64"ms</td>", t->m_launchedTime - t->m_queuedTime); //queued
|
||||
p.safePrintf("<td>%"INT64"ms</td>", t->m_exitTime - t->m_launchedTime); //run time
|
||||
p.safePrintf("<td>%"INT64"ms</td>", now - t->m_exitTime); //cleanup
|
||||
p.safePrintf("<td>%"INT64"ms</td>", now - t->m_queuedTime); //total
|
||||
p.safePrintf("<td>%s</td>", g_profiler.getFnName((PTRTYPE)t->m_callback));
|
||||
p.safePrintf("<td>%s</td>", g_profiler.getFnName((PTRTYPE)t->m_startRoutine));
|
||||
if(diskThread && fs) {
|
||||
int64_t took = (t->m_exitTime - t->m_launchedTime);
|
||||
if(took <= 0) took = 1;
|
||||
p.safePrintf("<td>%"INT32"/%"INT32"</td>", t->m_bytesToGo, t->m_bytesToGo);
|
||||
p.safePrintf("<td>%.2f kbps</td>", (float)t->m_bytesToGo/took);
|
||||
p.safePrintf("<td>%s</td>",t->m_doWrite? "Write":"Read");
|
||||
char *sign = "";
|
||||
if(took <= 0) {sign=">";took = 1;}
|
||||
p.safePrintf("<td>%"INT32"/%"INT32""
|
||||
"</td>",
|
||||
t->m_bytesToGo,
|
||||
t->m_bytesToGo);
|
||||
p.safePrintf("<td>%s%.2f MB/s</td>",
|
||||
sign,
|
||||
(float)t->m_bytesToGo/
|
||||
(1024.0*1024.0)/
|
||||
((float)took/1000.0));
|
||||
p.safePrintf("<td>%s</td>",
|
||||
t->m_doWrite?
|
||||
"<font color=red>"
|
||||
"Write</font>":"Read");
|
||||
}
|
||||
else {
|
||||
p.safePrintf("<td>--</td>");
|
||||
@ -113,7 +133,7 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
|
||||
int64_t took = (now - t->m_launchedTime);
|
||||
if(took <= 0) took = 1;
|
||||
p.safePrintf("<td>%c%c%c/%"INT32"</td>", '?','?','?',t->m_bytesToGo);
|
||||
p.safePrintf("<td>%.2f kbps</td>", 0.0);//(float)fs->m_bytesDone/took);
|
||||
p.safePrintf("<td>%.2f MB/s</td>", 0.0);//(float)fs->m_bytesDone/took);
|
||||
p.safePrintf("<td>%s</td>",t->m_doWrite? "Write":"Read");
|
||||
}
|
||||
else {
|
||||
@ -151,7 +171,7 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
int32_t loActiveBig = disk->m_loLaunchedBig - disk->m_loReturnedBig;
|
||||
int32_t loActiveMed = disk->m_loLaunchedMed - disk->m_loReturnedMed;
|
||||
int32_t loActiveSma = disk->m_loLaunchedSma - disk->m_loReturnedSma;
|
||||
@ -208,7 +228,7 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
|
||||
"<td><b>Active Write Threads</b></td><td>%"INT32"</td>"
|
||||
"</tr></table>",
|
||||
activeWrites);
|
||||
|
||||
*/
|
||||
|
||||
return g_httpServer.sendDynamicPage ( s , (char*) p.getBufStart() ,
|
||||
p.length() );
|
||||
|
13
Pages.cpp
13
Pages.cpp
@ -11,6 +11,7 @@
|
||||
#include "PageParser.h" // g_inPageParser
|
||||
#include "Users.h"
|
||||
#include "Rebalance.h"
|
||||
#include "Profiler.h"
|
||||
|
||||
// a global class extern'd in Pages.h
|
||||
Pages g_pages;
|
||||
@ -4664,9 +4665,15 @@ bool printRedBox ( SafeBuf *mb , TcpSocket *sock , HttpRequest *hr ) {
|
||||
mb->safePrintf("%s",boxEnd);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
if ( g_profiler.m_realTimeProfilerRunning ) {
|
||||
if ( adds ) mb->safePrintf("<br>");
|
||||
adds++;
|
||||
mb->safePrintf("%s",box);
|
||||
mb->safePrintf("Profiler is running. Performance is "
|
||||
"somewhat compromised. Disable on the "
|
||||
"profiler page.");
|
||||
mb->safePrintf("%s",boxEnd);
|
||||
}
|
||||
|
||||
if ( g_pingServer.m_hostsConfInDisagreement ) {
|
||||
if ( adds ) mb->safePrintf("<br>");
|
||||
|
384
Parms.cpp
384
Parms.cpp
@ -5207,15 +5207,15 @@ void Parms::init ( ) {
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
m->m_title = "tagdb max page cache mem";
|
||||
m->m_desc = "";
|
||||
m->m_off = (char *)&g_conf.m_tagdbMaxDiskPageCacheMem - g;
|
||||
m->m_def = "200000";
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_flags = PF_NOSYNC|PF_NOAPI;
|
||||
m->m_page = PAGE_NONE;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
// m->m_title = "tagdb max page cache mem";
|
||||
// m->m_desc = "";
|
||||
// m->m_off = (char *)&g_conf.m_tagdbMaxDiskPageCacheMem - g;
|
||||
// m->m_def = "200000";
|
||||
// m->m_type = TYPE_LONG;
|
||||
// m->m_flags = PF_NOSYNC|PF_NOAPI;
|
||||
// m->m_page = PAGE_NONE;
|
||||
// m->m_obj = OBJ_CONF;
|
||||
// m++;
|
||||
|
||||
//m->m_title = "tagdb max cache mem";
|
||||
//m->m_desc = "";
|
||||
@ -5244,15 +5244,15 @@ void Parms::init ( ) {
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
m->m_title = "catdb max page cache mem";
|
||||
m->m_desc = "";
|
||||
m->m_off = (char *)&g_conf.m_catdbMaxDiskPageCacheMem - g;
|
||||
m->m_def = "25000000";
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_flags = PF_NOSYNC|PF_NOAPI;
|
||||
m->m_page = PAGE_NONE;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
// m->m_title = "catdb max page cache mem";
|
||||
// m->m_desc = "";
|
||||
// m->m_off = (char *)&g_conf.m_catdbMaxDiskPageCacheMem - g;
|
||||
// m->m_def = "25000000";
|
||||
// m->m_type = TYPE_LONG;
|
||||
// m->m_flags = PF_NOSYNC|PF_NOAPI;
|
||||
// m->m_page = PAGE_NONE;
|
||||
// m->m_obj = OBJ_CONF;
|
||||
// m++;
|
||||
|
||||
m->m_title = "catdb max cache mem";
|
||||
m->m_desc = "";
|
||||
@ -5523,15 +5523,15 @@ void Parms::init ( ) {
|
||||
m++;
|
||||
*/
|
||||
|
||||
m->m_title = "linkdb max page cache mem";
|
||||
m->m_desc = "";
|
||||
m->m_off = (char *)&g_conf.m_linkdbMaxDiskPageCacheMem - g;
|
||||
m->m_def = "0";
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_flags = PF_NOSYNC|PF_NOAPI;
|
||||
m->m_page = PAGE_NONE;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
// m->m_title = "linkdb max page cache mem";
|
||||
// m->m_desc = "";
|
||||
// m->m_off = (char *)&g_conf.m_linkdbMaxDiskPageCacheMem - g;
|
||||
// m->m_def = "0";
|
||||
// m->m_type = TYPE_LONG;
|
||||
// m->m_flags = PF_NOSYNC|PF_NOAPI;
|
||||
// m->m_page = PAGE_NONE;
|
||||
// m->m_obj = OBJ_CONF;
|
||||
// m++;
|
||||
|
||||
/*
|
||||
// this is overridden by collection
|
||||
@ -5657,15 +5657,15 @@ void Parms::init ( ) {
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
m->m_title = "statsdb max disk page cache mem";
|
||||
m->m_desc = "";
|
||||
m->m_off = (char *)&g_conf.m_statsdbMaxDiskPageCacheMem - g;
|
||||
m->m_def = "1000000";
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_flags = PF_NOSYNC|PF_NOAPI;
|
||||
m->m_page = PAGE_NONE;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
// m->m_title = "statsdb max disk page cache mem";
|
||||
// m->m_desc = "";
|
||||
// m->m_off = (char *)&g_conf.m_statsdbMaxDiskPageCacheMem - g;
|
||||
// m->m_def = "1000000";
|
||||
// m->m_type = TYPE_LONG;
|
||||
// m->m_flags = PF_NOSYNC|PF_NOAPI;
|
||||
// m->m_page = PAGE_NONE;
|
||||
// m->m_obj = OBJ_CONF;
|
||||
// m++;
|
||||
|
||||
//m->m_title = "statsdb min files to merge";
|
||||
//m->m_desc = "";
|
||||
@ -9939,6 +9939,21 @@ void Parms::init ( ) {
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
m->m_title = "return results even if a shard is down";
|
||||
m->m_desc = "If you turn this off then Gigablast will return "
|
||||
"an error message if a shard was down and did not return "
|
||||
"results for a query. The XML and JSON feed let's you know "
|
||||
"when a shard is down and will give you the results back "
|
||||
"any way, but if you would rather have just and error message "
|
||||
"and no results, then set then set this to 'NO'.";
|
||||
m->m_cgi = "rra";
|
||||
m->m_off = (char *)&g_conf.m_returnResultsAnyway - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "1";
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
m->m_title = "max mem";
|
||||
m->m_desc = "Mem available to this process. May be exceeded due "
|
||||
"to fragmentation.";
|
||||
@ -11273,20 +11288,6 @@ void Parms::init ( ) {
|
||||
m++;
|
||||
*/
|
||||
|
||||
m->m_title = "verify disk writes";
|
||||
m->m_desc = "Read what was written in a verification step. Decreases "
|
||||
"performance, but may help fight disk corruption mostly on "
|
||||
"Maxtors and Western Digitals.";
|
||||
m->m_cgi = "vdw";
|
||||
m->m_off = (char *)&g_conf.m_verifyWrites - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "0";
|
||||
m->m_group = 0;
|
||||
m->m_flags = PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
// this is ifdef'd out in Msg3.cpp for performance reasons,
|
||||
// so do it here, too
|
||||
#ifdef GBSANITYCHECK
|
||||
@ -11457,122 +11458,80 @@ void Parms::init ( ) {
|
||||
m++;
|
||||
*/
|
||||
|
||||
m->m_title = "use disk page cache for posdb";
|
||||
m->m_desc = "Use disk page cache?";
|
||||
m->m_cgi = "udpci";
|
||||
m->m_off = (char *)&g_conf.m_useDiskPageCachePosdb - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "1";
|
||||
m->m_flags = PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_title = "posdb disk cache size";
|
||||
m->m_desc = "How much file cache size to use in bytes? Posdb is "
|
||||
"the index.";
|
||||
m->m_cgi = "dpcsp";
|
||||
m->m_off = (char *)&g_conf.m_posdbFileCacheSize - g;
|
||||
m->m_type = TYPE_LONG_LONG;
|
||||
m->m_def = "30000000";
|
||||
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
m->m_title = "use disk page cache for datedb";
|
||||
m->m_desc = "Use disk page cache?";
|
||||
m->m_cgi = "udpcd";
|
||||
m->m_off = (char *)&g_conf.m_useDiskPageCacheDatedb - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "1";
|
||||
m->m_flags = PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_title = "tagdb disk cache size";
|
||||
m->m_desc = "How much file cache size to use in bytes? Tagdb is "
|
||||
"consulted at spider time and query time to determine "
|
||||
"if a url or outlink is banned or what its siterank is, etc.";
|
||||
m->m_cgi = "dpcst";
|
||||
m->m_off = (char *)&g_conf.m_tagdbFileCacheSize - g;
|
||||
m->m_type = TYPE_LONG_LONG;
|
||||
m->m_def = "30000000";
|
||||
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
m->m_title = "use disk page cache for titledb";
|
||||
m->m_desc = "Use disk page cache?";
|
||||
m->m_cgi = "udpct";
|
||||
m->m_off = (char *)&g_conf.m_useDiskPageCacheTitledb - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "1";
|
||||
m->m_group = 0;
|
||||
m->m_flags = PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
m->m_title = "use disk page cache for spiderdb";
|
||||
m->m_desc = "Use disk page cache?";
|
||||
m->m_cgi = "udpcs";
|
||||
m->m_off = (char *)&g_conf.m_useDiskPageCacheSpiderdb - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "1";
|
||||
m->m_group = 0;
|
||||
m->m_flags = PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
/*
|
||||
m->m_title = "use disk page cache for urldb";
|
||||
m->m_desc = "Use disk page cache?";
|
||||
m->m_cgi = "udpcu";
|
||||
m->m_off = (char *)&g_conf.m_useDiskPageCacheTfndb - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "1";
|
||||
m->m_group = 0;
|
||||
m++;
|
||||
*/
|
||||
|
||||
m->m_title = "use disk page cache for tagdb";
|
||||
m->m_desc = "Use disk page cache?";
|
||||
m->m_cgi = "udpcg";
|
||||
m->m_off = (char *)&g_conf.m_useDiskPageCacheTagdb - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "1";
|
||||
m->m_group = 0;
|
||||
m->m_flags = PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_title = "clusterdb disk cache size";
|
||||
m->m_desc = "How much file cache size to use in bytes? "
|
||||
"Gigablast does a "
|
||||
"lookup in clusterdb for each search result at query time to "
|
||||
"get its site information for site clustering. If you "
|
||||
"disable site clustering in the search controls then "
|
||||
"clusterdb will not be consulted.";
|
||||
m->m_cgi = "dpcsc";
|
||||
m->m_off = (char *)&g_conf.m_clusterdbFileCacheSize - g;
|
||||
m->m_type = TYPE_LONG_LONG;
|
||||
m->m_def = "30000000";
|
||||
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m->m_group = 0;
|
||||
m++;
|
||||
|
||||
m->m_title = "use disk page cache for checksumdb";
|
||||
m->m_desc = "Use disk page cache?";
|
||||
m->m_cgi = "udpck";
|
||||
m->m_off = (char *)&g_conf.m_useDiskPageCacheChecksumdb - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "1";
|
||||
m->m_group = 0;
|
||||
m->m_flags = PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_title = "titledb disk cache size";
|
||||
m->m_desc = "How much file cache size to use in bytes? Titledb "
|
||||
"holds the cached web pages, compressed. Gigablast consults "
|
||||
"it to generate a summary for a search result, or to see if "
|
||||
"a url Gigablast is spidering is already in the index.";
|
||||
m->m_cgi = "dpcsx";
|
||||
m->m_off = (char *)&g_conf.m_titledbFileCacheSize - g;
|
||||
m->m_type = TYPE_LONG_LONG;
|
||||
m->m_def = "30000000";
|
||||
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m->m_group = 0;
|
||||
m++;
|
||||
|
||||
m->m_title = "use disk page cache for clusterdb";
|
||||
m->m_desc = "Use disk page cache?";
|
||||
m->m_cgi = "udpcl";
|
||||
m->m_off = (char *)&g_conf.m_useDiskPageCacheClusterdb - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "1";
|
||||
m->m_group = 0;
|
||||
m->m_flags = PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_title = "spiderdb disk cache size";
|
||||
m->m_desc = "How much file cache size to use in bytes? Titledb "
|
||||
"holds the cached web pages, compressed. Gigablast consults "
|
||||
"it to generate a summary for a search result, or to see if "
|
||||
"a url Gigablast is spidering is already in the index.";
|
||||
m->m_cgi = "dpcsy";
|
||||
m->m_off = (char *)&g_conf.m_spiderdbFileCacheSize - g;
|
||||
m->m_type = TYPE_LONG_LONG;
|
||||
m->m_def = "30000000";
|
||||
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m->m_group = 0;
|
||||
m++;
|
||||
|
||||
m->m_title = "use disk page cache for catdb";
|
||||
m->m_desc = "Use disk page cache?";
|
||||
m->m_cgi = "udpca";
|
||||
m->m_off = (char *)&g_conf.m_useDiskPageCacheCatdb - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "1";
|
||||
m->m_group = 0;
|
||||
m->m_flags = PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
m->m_title = "use disk page cache for linkdb";
|
||||
m->m_desc = "Use disk page cache?";
|
||||
m->m_cgi = "udpcnk";
|
||||
m->m_off = (char *)&g_conf.m_useDiskPageCacheLinkdb - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "1";
|
||||
m->m_group = 0;
|
||||
m->m_flags = PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
/*
|
||||
m->m_title = "exclude link text";
|
||||
@ -12448,8 +12407,20 @@ void Parms::init ( ) {
|
||||
m->m_group = 0;
|
||||
m++;
|
||||
|
||||
|
||||
|
||||
m->m_title = "verify disk writes";
|
||||
m->m_desc = "Read what was written in a verification step. Decreases "
|
||||
"performance, but may help fight disk corruption mostly on "
|
||||
"Maxtors and Western Digitals.";
|
||||
m->m_cgi = "vdw";
|
||||
m->m_off = (char *)&g_conf.m_verifyWrites - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "0";
|
||||
m->m_group = 0;
|
||||
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m->m_group = 0;
|
||||
m++;
|
||||
|
||||
m->m_title = "max spider read threads";
|
||||
m->m_desc = "Maximum number of threads to use per Gigablast process "
|
||||
@ -12460,7 +12431,7 @@ void Parms::init ( ) {
|
||||
m->m_cgi = "smdt";
|
||||
m->m_off = (char *)&g_conf.m_spiderMaxDiskThreads - g;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "30";
|
||||
m->m_def = "20";
|
||||
m->m_units = "threads";
|
||||
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_page = PAGE_MASTER;
|
||||
@ -12468,13 +12439,16 @@ void Parms::init ( ) {
|
||||
m->m_group = 0;
|
||||
m++;
|
||||
|
||||
/*
|
||||
m->m_title = "max spider big read threads";
|
||||
m->m_desc = "This particular number applies to all disk "
|
||||
"reads above 1MB.";
|
||||
"reads above 1MB. "
|
||||
"The number of total threads is also "
|
||||
"limited to MAX_STACKS which is currently 20.";
|
||||
m->m_cgi = "smbdt";
|
||||
m->m_off = (char *)&g_conf.m_spiderMaxBigDiskThreads - g;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "8"; // 1
|
||||
m->m_def = "2";
|
||||
m->m_units = "threads";
|
||||
m->m_group = 0;
|
||||
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
|
||||
@ -12484,11 +12458,13 @@ void Parms::init ( ) {
|
||||
|
||||
m->m_title = "max spider medium read threads";
|
||||
m->m_desc = "This particular number applies to all disk "
|
||||
"reads above 100K.";
|
||||
"reads above 100K. "
|
||||
"The number of total threads is also "
|
||||
"limited to MAX_STACKS which is currently 20.";
|
||||
m->m_cgi = "smmdt";
|
||||
m->m_off = (char *)&g_conf.m_spiderMaxMedDiskThreads - g;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "19"; // 3
|
||||
m->m_def = "4";
|
||||
m->m_units = "threads";
|
||||
m->m_group = 0;
|
||||
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
|
||||
@ -12498,18 +12474,37 @@ void Parms::init ( ) {
|
||||
|
||||
m->m_title = "max spider small read threads";
|
||||
m->m_desc = "This particular number applies to all disk "
|
||||
"reads above 1MB.";
|
||||
"reads above 1MB. "
|
||||
"The number of total threads is also "
|
||||
"limited to MAX_STACKS which is currently 20.";
|
||||
m->m_cgi = "smsdt";
|
||||
m->m_off = (char *)&g_conf.m_spiderMaxSmaDiskThreads - g;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "20";
|
||||
m->m_def = "15";
|
||||
m->m_units = "threads";
|
||||
m->m_group = 0;
|
||||
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
*/
|
||||
|
||||
m->m_title = "separate disk reads";
|
||||
m->m_desc = "If enabled then we will not launch a low priority "
|
||||
"disk read or write while a high priority is outstanding. "
|
||||
"Help improve query response time at the expense of "
|
||||
"spider performance.";
|
||||
m->m_cgi = "sdt";
|
||||
m->m_off = (char *)&g_conf.m_separateDiskReads - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "1";
|
||||
m->m_flags = 0;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
|
||||
/*
|
||||
m->m_title = "max query read threads";
|
||||
m->m_desc = "Maximum number of threads to use per Gigablast process "
|
||||
"for accessing the disk "
|
||||
@ -12527,13 +12522,17 @@ void Parms::init ( ) {
|
||||
m->m_obj = OBJ_CONF;
|
||||
m->m_group = 0;
|
||||
m++;
|
||||
*/
|
||||
|
||||
/*
|
||||
m->m_title = "max query big read threads";
|
||||
m->m_desc = "This particular number applies to all reads above 1MB.";
|
||||
m->m_desc = "This particular number applies to all reads above 1MB. "
|
||||
"The number of total threads is also "
|
||||
"limited to MAX_STACKS which is currently 20.";
|
||||
m->m_cgi = "qmbdt";
|
||||
m->m_off = (char *)&g_conf.m_queryMaxBigDiskThreads - g;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "60"; // 1
|
||||
m->m_def = "20"; // 1
|
||||
m->m_units = "threads";
|
||||
m->m_group = 0;
|
||||
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
|
||||
@ -12543,11 +12542,13 @@ void Parms::init ( ) {
|
||||
|
||||
m->m_title = "max query medium read threads";
|
||||
m->m_desc = "This particular number applies to all disk "
|
||||
"reads above 100K.";
|
||||
"reads above 100K. "
|
||||
"The number of total threads is also "
|
||||
"limited to MAX_STACKS which is currently 20.";
|
||||
m->m_cgi = "qmmdt";
|
||||
m->m_off = (char *)&g_conf.m_queryMaxMedDiskThreads - g;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "80"; // 3
|
||||
m->m_def = "20"; // 3
|
||||
m->m_units = "threads";
|
||||
m->m_group = 0;
|
||||
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
|
||||
@ -12557,17 +12558,20 @@ void Parms::init ( ) {
|
||||
|
||||
m->m_title = "max query small read threads";
|
||||
m->m_desc = "This particular number applies to all disk "
|
||||
"reads above 1MB.";
|
||||
"reads above 1MB. "
|
||||
"The number of total threads is also "
|
||||
"limited to MAX_STACKS which is currently 20.";
|
||||
m->m_cgi = "qmsdt";
|
||||
m->m_off = (char *)&g_conf.m_queryMaxSmaDiskThreads - g;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "80";
|
||||
m->m_def = "20";
|
||||
m->m_units = "threads";
|
||||
m->m_group = 0;
|
||||
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
*/
|
||||
|
||||
m->m_title = "min popularity for speller";
|
||||
m->m_desc = "Word or phrase must be present in this percent "
|
||||
@ -15102,6 +15106,19 @@ void Parms::init ( ) {
|
||||
m->m_off = (char *)&ir.m_hopCount - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "url IP";
|
||||
m->m_desc = "Use this IP when injecting the document. Do not use or "
|
||||
"set to 0.0.0.0, if unknown. If provided, it will save an IP "
|
||||
"lookup.";
|
||||
m->m_cgi = "urlip";
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_IP;
|
||||
m->m_def = "0.0.0.0";
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&ir.m_injectDocIp - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "last spider time";
|
||||
m->m_desc = "Override last time spidered";
|
||||
m->m_cgi = "lastspidered";
|
||||
@ -15208,7 +15225,10 @@ void Parms::init ( ) {
|
||||
"Separate MIME from actual content with two returns. "
|
||||
"At least put a single space in here if you want to "
|
||||
"inject empty content, otherwise the content will "
|
||||
"be downloaded from the url.";
|
||||
"be downloaded from the url. This is because the "
|
||||
"page injection form always submits the content text area "
|
||||
"even if it is empty, which should signify that the "
|
||||
"content should be downloaded.";
|
||||
m->m_cgi = "content";
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHARPTR;
|
||||
@ -15489,6 +15509,22 @@ void Parms::init ( ) {
|
||||
m->m_obj = OBJ_COLL;
|
||||
m++;
|
||||
|
||||
m->m_title = "do tagdb lookups for queries";
|
||||
m->m_desc = "For each search result a tagdb lookup is made, "
|
||||
"usually across the network on distributed clusters, to "
|
||||
"see if the URL's site has been manually banned in tagdb. "
|
||||
"If you don't manually ban sites then turn this off for "
|
||||
"extra speed.";
|
||||
m->m_cgi = "stgdbl";
|
||||
m->m_off = (char *)&cr.m_doTagdbLookups - x;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "1";
|
||||
m->m_group = 1;
|
||||
m->m_flags = PF_API | PF_CLONE;
|
||||
m->m_page = PAGE_SEARCH;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m++;
|
||||
|
||||
m->m_title = "percent similar dedup summary default value";
|
||||
m->m_desc = "If document summary (and title) are "
|
||||
"this percent similar "
|
||||
@ -16505,9 +16541,10 @@ void Parms::init ( ) {
|
||||
m->m_flags = PF_CLONE;
|
||||
m++;
|
||||
|
||||
m->m_title = "use robots.txt";
|
||||
m->m_title = "obey robots.txt";
|
||||
m->m_xml = "useRobotstxt";
|
||||
m->m_desc = "If this is true Gigablast will respect "
|
||||
"the robots.txt convention.";
|
||||
"the robots.txt convention and rel no follow meta tags.";
|
||||
m->m_cgi = "obeyRobots";
|
||||
m->m_off = (char *)&cr.m_useRobotsTxt - x;
|
||||
m->m_type = TYPE_BOOL;
|
||||
@ -16517,6 +16554,18 @@ void Parms::init ( ) {
|
||||
m->m_flags = PF_CLONE;
|
||||
m++;
|
||||
|
||||
m->m_title = "obey rel no follow links";
|
||||
m->m_desc = "If this is true Gigablast will respect "
|
||||
"the rel no follow link attribute.";
|
||||
m->m_cgi = "obeyRelNoFollow";
|
||||
m->m_off = (char *)&cr.m_obeyRelNoFollowLinks - x;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "1";
|
||||
m->m_page = PAGE_SPIDER;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m->m_flags = PF_CLONE;
|
||||
m++;
|
||||
|
||||
m->m_title = "max robots.txt cache age";
|
||||
m->m_desc = "How many seconds to cache a robots.txt file for. "
|
||||
"86400 is 1 day. 0 means Gigablast will not read from the "
|
||||
@ -19729,6 +19778,16 @@ void Parms::init ( ) {
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
m->m_title = "log debug tcp buffer messages";
|
||||
m->m_cgi = "ldtb";
|
||||
m->m_off = (char *)&g_conf.m_logDebugTcpBuf - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "0";
|
||||
m->m_priv = 1;
|
||||
m->m_page = PAGE_LOG;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
m->m_title = "log debug thread messages";
|
||||
m->m_cgi = "ldth";
|
||||
m->m_off = (char *)&g_conf.m_logDebugThread - g;
|
||||
@ -22459,6 +22518,7 @@ bool Parm::printVal ( SafeBuf *sb , collnum_t collnum , int32_t occNum ) {
|
||||
return sb->safePrintf("CMD");
|
||||
|
||||
if ( m_type == TYPE_IP )
|
||||
// may print 0.0.0.0
|
||||
return sb->safePrintf("%s",iptoa(*(int32_t *)val) );
|
||||
|
||||
log("parms: missing parm type!!");
|
||||
|
@ -3228,9 +3228,11 @@ void doneSendingNotifyEmailWrapper ( void *state ) {
|
||||
// wait for post url to get done
|
||||
if ( ei->m_notifyBlocked > 0 ) return;
|
||||
// unmark it
|
||||
ei->m_inUse = false;
|
||||
//ei->m_inUse = false;
|
||||
// all done
|
||||
ei->m_finalCallback ( ei->m_finalState );
|
||||
// nuke it
|
||||
mfree ( ei , sizeof(EmailInfo) ,"eialrt" );
|
||||
}
|
||||
|
||||
void doneGettingNotifyUrlWrapper ( void *state , TcpSocket *sock ) {
|
||||
@ -3242,9 +3244,11 @@ void doneGettingNotifyUrlWrapper ( void *state , TcpSocket *sock ) {
|
||||
// wait for email to get done
|
||||
if ( ei->m_notifyBlocked > 0 ) return;
|
||||
// unmark it
|
||||
ei->m_inUse = false;
|
||||
//ei->m_inUse = false;
|
||||
// all done
|
||||
ei->m_finalCallback ( ei->m_finalState );
|
||||
// nuke it
|
||||
mfree ( ei , sizeof(EmailInfo) ,"eialrt" );
|
||||
}
|
||||
|
||||
// for printCrawlDetailsInJson()
|
||||
@ -3259,7 +3263,7 @@ bool sendNotification ( EmailInfo *ei ) {
|
||||
//log("ping: NOT SENDING NOTIFICATION -- DEBUG!!");
|
||||
//return true;
|
||||
|
||||
if ( ei->m_inUse ) { char *xx=NULL;*xx=0; }
|
||||
//if ( ei->m_inUse ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// caller must set this, as well as m_finalCallback/m_finalState
|
||||
CollectionRec *cr = g_collectiondb.m_recs[ei->m_collnum];
|
||||
@ -3275,7 +3279,7 @@ bool sendNotification ( EmailInfo *ei ) {
|
||||
// sanity check, can only call once
|
||||
if ( ei->m_notifyBlocked != 0 ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
ei->m_inUse = true;
|
||||
//ei->m_inUse = true;
|
||||
|
||||
|
||||
if ( email && email[0] ) {
|
||||
@ -3371,7 +3375,9 @@ bool sendNotification ( EmailInfo *ei ) {
|
||||
}
|
||||
|
||||
if ( ei->m_notifyBlocked == 0 ) {
|
||||
ei->m_inUse = false;
|
||||
//ei->m_inUse = false;
|
||||
// nuke it
|
||||
mfree ( ei , sizeof(EmailInfo) ,"eialrt" );
|
||||
return true;
|
||||
}
|
||||
|
||||
|
19
PingServer.h
19
PingServer.h
@ -30,16 +30,17 @@ public:
|
||||
// ip address of MX record for this domain
|
||||
int32_t m_mxIp;
|
||||
int32_t m_notifyBlocked;
|
||||
bool m_inUse;
|
||||
class CollectionRec *m_collRec;
|
||||
//bool m_inUse;
|
||||
|
||||
EmailInfo() {
|
||||
memset ( this,0,sizeof(EmailInfo) );
|
||||
};
|
||||
void reset() {
|
||||
if ( m_inUse ) { char *xx=NULL;*xx=0; }
|
||||
if ( m_notifyBlocked ) { char *xx=NULL;*xx=0; }
|
||||
memset ( this,0,sizeof(EmailInfo) );
|
||||
};
|
||||
//EmailInfo() {
|
||||
// memset ( this,0,sizeof(EmailInfo) );
|
||||
//};
|
||||
//void reset() {
|
||||
// if ( m_inUse ) { char *xx=NULL;*xx=0; }
|
||||
// if ( m_notifyBlocked ) { char *xx=NULL;*xx=0; }
|
||||
// memset ( this,0,sizeof(EmailInfo) );
|
||||
//};
|
||||
};
|
||||
|
||||
class PingServer {
|
||||
|
@ -35,8 +35,8 @@ bool Placedb::init ( ) {
|
||||
// . 25(treeoverhead) + 24(cacheoverhead) = 49
|
||||
//int32_t maxCacheNodes = g_conf.m_placedbMaxCacheMem / 49;
|
||||
// we now use a page cache
|
||||
if ( ! m_pc.init ( "placedb",RDB_PLACEDB,pcmem,GB_INDEXDB_PAGE_SIZE ) )
|
||||
return log("db: Placedb page cache init failed.");
|
||||
// if (!m_pc.init("placedb",RDB_PLACEDB,pcmem,GB_INDEXDB_PAGE_SIZE ) )
|
||||
// return log("db: Placedb page cache init failed.");
|
||||
// initialize our own internal rdb
|
||||
if ( ! m_rdb.init ( g_hostdb.m_dir,
|
||||
"placedb" ,
|
||||
@ -50,7 +50,7 @@ bool Placedb::init ( ) {
|
||||
0 , // maxCacheNodes
|
||||
false , // half keys?
|
||||
false , // g_conf.m_placedbSaveCache
|
||||
&m_pc ,
|
||||
NULL,//&m_pc ,
|
||||
false , // is titledb?
|
||||
false , // preload page cache?
|
||||
16 , // keysize
|
||||
|
@ -47,9 +47,9 @@ class Placedb {
|
||||
// this rdb holds urls waiting to be spidered or being spidered
|
||||
Rdb m_rdb;
|
||||
|
||||
DiskPageCache *getDiskPageCache() { return &m_pc; };
|
||||
//DiskPageCache *getDiskPageCache() { return &m_pc; };
|
||||
|
||||
DiskPageCache m_pc;
|
||||
//DiskPageCache m_pc;
|
||||
};
|
||||
|
||||
extern class Placedb g_placedb;
|
||||
|
37
Posdb.cpp
37
Posdb.cpp
@ -125,19 +125,19 @@ bool Posdb::init ( ) {
|
||||
int32_t nodeSize = (sizeof(key144_t)+12+4) + sizeof(collnum_t);
|
||||
int32_t maxTreeNodes = maxTreeMem / nodeSize ;
|
||||
|
||||
int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
|
||||
//int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
|
||||
// we now use a disk page cache as opposed to the
|
||||
// old rec cache. i am trying to do away with the Rdb::m_cache rec
|
||||
// cache in favor of cleverly used disk page caches, because
|
||||
// the rec caches are not real-time and get stale.
|
||||
int32_t pcmem = 30000000; // 30MB
|
||||
//int32_t pcmem = 30000000; // 30MB
|
||||
// make sure at least 30MB
|
||||
//if ( pcmem < 30000000 ) pcmem = 30000000;
|
||||
// keep this low if we are the tmp cluster, 30MB
|
||||
if ( g_hostdb.m_useTmpCluster && pcmem > 30000000 ) pcmem = 30000000;
|
||||
//if ( g_hostdb.m_useTmpCluster && pcmem > 30000000 ) pcmem = 30000000;
|
||||
// do not use any page cache if doing tmp cluster in order to
|
||||
// prevent swapping
|
||||
if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
|
||||
//if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
|
||||
// save more mem!!! allow os to cache it i guess...
|
||||
// let's go back to using it
|
||||
//pcmem = 0;
|
||||
@ -145,11 +145,11 @@ bool Posdb::init ( ) {
|
||||
//pcmem = 0;
|
||||
// . init the page cache
|
||||
// . MDW: "minimize disk seeks" not working otherwise i'd enable it!
|
||||
if ( ! m_pc.init ( "posdb",
|
||||
RDB_POSDB,
|
||||
pcmem ,
|
||||
pageSize ))
|
||||
return log("db: Posdb init failed.");
|
||||
// if ( ! m_pc.init ( "posdb",
|
||||
// RDB_POSDB,
|
||||
// pcmem ,
|
||||
// pageSize ))
|
||||
// return log("db: Posdb init failed.");
|
||||
|
||||
// . set our own internal rdb
|
||||
// . max disk space for bin tree is same as maxTreeMem so that we
|
||||
@ -174,7 +174,7 @@ bool Posdb::init ( ) {
|
||||
// newer systems have tons of ram to use
|
||||
// for their disk page cache. it is slower than
|
||||
// ours but the new engine has much slower things
|
||||
&m_pc ,
|
||||
NULL,//&m_pc ,
|
||||
false , // istitledb?
|
||||
false , // preloaddiskpagecache?
|
||||
sizeof(key144_t)
|
||||
@ -918,6 +918,10 @@ bool PosdbTable::allocTopTree ( ) {
|
||||
, (int32_t)m_r->m_numDocIdSplits
|
||||
);
|
||||
|
||||
// keep it sane
|
||||
if ( nn > m_r->m_docsToGet * 2 && nn > 60 )
|
||||
nn = m_r->m_docsToGet * 2;
|
||||
|
||||
// this actually sets the # of nodes to MORE than nn!!!
|
||||
if ( ! m_topTree->setNumNodes(nn,m_r->m_doSiteClustering)) {
|
||||
log("toptree: toptree: error allocating nodes: %s",
|
||||
@ -1007,8 +1011,9 @@ bool PosdbTable::allocTopTree ( ) {
|
||||
continue;
|
||||
// how big?
|
||||
int64_t total = m_msg2->m_lists[i].getListSize();
|
||||
// skip if empty
|
||||
if ( total == 0 ) {
|
||||
// skip if empty. no we could be doing a split that is
|
||||
// empty but other splits are full
|
||||
if ( total == 0 && m_r->m_numDocIdSplits <= 1 ) {
|
||||
log("query: empty facets for term #%i",i);
|
||||
continue;
|
||||
}
|
||||
@ -6639,7 +6644,12 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
// synbits on it, below!!! or a half stop wiki bigram like
|
||||
// the term "enough for" in the wiki phrase
|
||||
// "time enough for love" because we wanna reward that more!
|
||||
// this halfstopwikibigram bit is set in the individual keys
|
||||
// so we'd have to at least do a key cleansing, so we can't
|
||||
// do this shortcut right now... mdw oct 10 2015
|
||||
if ( nsub == 1 &&
|
||||
// need it for gbfacet termlists though it seems
|
||||
(nwpFlags[0] & (BF_FACET|BF_NUMBER)) &&
|
||||
!(nwpFlags[0] & BF_SYNONYM) &&
|
||||
!(nwpFlags[0] & BF_HALFSTOPWIKIBIGRAM) ) {
|
||||
miniMergedList [j] = nwp [0];
|
||||
@ -6775,6 +6785,8 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
nwp[mink] = NULL;
|
||||
// avoid breach of core below now
|
||||
if ( mptr < mptrEnd ) goto mergeMore;
|
||||
// wrap it up here since done merging
|
||||
miniMergedEnd[j] = mptr;
|
||||
}
|
||||
|
||||
// breach?
|
||||
@ -7563,6 +7575,7 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
dcs.m_docLang = docLang;
|
||||
// ensure enough room we can't allocate in a thread!
|
||||
if ( m_scoreInfoBuf.getAvail()<(int32_t)sizeof(DocIdScore)+1){
|
||||
goto advance;
|
||||
char *xx=NULL;*xx=0; }
|
||||
// if same as last docid, overwrite it since we have a higher
|
||||
// siterank or langid i guess
|
||||
|
4
Posdb.h
4
Posdb.h
@ -393,9 +393,9 @@ class Posdb {
|
||||
|
||||
Rdb m_rdb;
|
||||
|
||||
DiskPageCache *getDiskPageCache ( ) { return &m_pc; };
|
||||
//DiskPageCache *getDiskPageCache ( ) { return &m_pc; };
|
||||
|
||||
DiskPageCache m_pc;
|
||||
//DiskPageCache m_pc;
|
||||
};
|
||||
|
||||
class FacetEntry {
|
||||
|
48
Process.cpp
48
Process.cpp
@ -1515,15 +1515,16 @@ bool Process::shutdown2 ( ) {
|
||||
static bool s_printed = false;
|
||||
|
||||
// wait for all threads to return
|
||||
int32_t n = g_threads.getNumThreadsOutOrQueued() ;
|
||||
//int32_t n = g_threads.getNumThreadsOutOrQueued() ;
|
||||
int32_t n = g_threads.getNumWriteThreadsOut();
|
||||
if ( n != 0 && ! m_urgent ) {
|
||||
log(LOG_INFO,"gb: Has %"INT32" threads out. Waiting for "
|
||||
log(LOG_INFO,"gb: Has %"INT32" write threads out. Waiting for "
|
||||
"them to finish.",n);
|
||||
return false;
|
||||
}
|
||||
else if ( ! s_printed && ! m_urgent ) {
|
||||
s_printed = true;
|
||||
log(LOG_INFO,"gb: No threads out.");
|
||||
log(LOG_INFO,"gb: No write threads out.");
|
||||
}
|
||||
|
||||
|
||||
@ -1687,6 +1688,9 @@ bool Process::shutdown2 ( ) {
|
||||
if ( g_process.m_threadOut )
|
||||
log(LOG_INFO,"gb: still has hdtemp thread");
|
||||
|
||||
|
||||
log("gb. EXITING.");
|
||||
|
||||
// exit abruptly
|
||||
exit(0);
|
||||
|
||||
@ -1764,7 +1768,7 @@ bool Process::saveRdbTrees ( bool useThread , bool shuttingDown ) {
|
||||
// no thread if shutting down
|
||||
if ( shuttingDown ) useThread = false;
|
||||
// debug note
|
||||
log("gb: shuttingdown=%i",(int)shuttingDown);
|
||||
if ( shuttingDown ) log("gb: trying to shutdown");
|
||||
// turn off statsdb until everyone is done
|
||||
//g_statsdb.m_disabled = true;
|
||||
// loop over all Rdbs and save them
|
||||
@ -2088,22 +2092,30 @@ void Process::resetAll ( ) {
|
||||
resetTestIpTable();
|
||||
}
|
||||
|
||||
#include "Msg3.h"
|
||||
|
||||
void Process::resetPageCaches ( ) {
|
||||
log("gb: Resetting page caches.");
|
||||
g_posdb .getDiskPageCache()->reset();
|
||||
//g_datedb .getDiskPageCache()->reset();
|
||||
g_linkdb .getDiskPageCache()->reset();
|
||||
g_titledb .getDiskPageCache()->reset();
|
||||
g_sectiondb .getDiskPageCache()->reset();
|
||||
g_tagdb .getDiskPageCache()->reset();
|
||||
g_spiderdb .getDiskPageCache()->reset();
|
||||
//g_tfndb .getDiskPageCache()->reset();
|
||||
//g_checksumdb .getDiskPageCache()->reset();
|
||||
g_clusterdb .getDiskPageCache()->reset();
|
||||
g_catdb .getDiskPageCache()->reset();
|
||||
//g_placedb .getDiskPageCache()->reset();
|
||||
g_doledb .getDiskPageCache()->reset();
|
||||
//g_statsdb .getDiskPageCache()->reset();
|
||||
for ( int32_t i = 0 ; i < RDB_END ; i++ ) {
|
||||
RdbCache *rpc = getDiskPageCache ( i ); // rdbid = i
|
||||
if ( ! rpc ) continue;
|
||||
rpc->reset();
|
||||
}
|
||||
|
||||
// g_posdb .getDiskPageCache()->reset();
|
||||
// //g_datedb .getDiskPageCache()->reset();
|
||||
// g_linkdb .getDiskPageCache()->reset();
|
||||
// g_titledb .getDiskPageCache()->reset();
|
||||
// g_sectiondb .getDiskPageCache()->reset();
|
||||
// g_tagdb .getDiskPageCache()->reset();
|
||||
// g_spiderdb .getDiskPageCache()->reset();
|
||||
// //g_tfndb .getDiskPageCache()->reset();
|
||||
// //g_checksumdb .getDiskPageCache()->reset();
|
||||
// g_clusterdb .getDiskPageCache()->reset();
|
||||
// g_catdb .getDiskPageCache()->reset();
|
||||
// //g_placedb .getDiskPageCache()->reset();
|
||||
// g_doledb .getDiskPageCache()->reset();
|
||||
// //g_statsdb .getDiskPageCache()->reset();
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
|
20
Profiler.cpp
20
Profiler.cpp
@ -1451,10 +1451,17 @@ Profiler::getStackFrame(int sig) {
|
||||
// profile once every 5ms, not every 1ms
|
||||
static int32_t s_count = 0;
|
||||
|
||||
// turn off after 60 seconds of profiling
|
||||
if ( m_totalFrames++ >= 60000 ) {
|
||||
stopRealTimeProfiler(false);
|
||||
return;
|
||||
}
|
||||
|
||||
if ( ++s_count != 5 ) return;
|
||||
|
||||
s_count = 0;
|
||||
|
||||
|
||||
// prevent cores.
|
||||
// TODO: hack this to a function somehow...
|
||||
// we set this to positive values when calling library functions like
|
||||
@ -1463,6 +1470,9 @@ Profiler::getStackFrame(int sig) {
|
||||
// somewhere. but for now just ignore.
|
||||
if ( g_inMemcpy ) return;
|
||||
|
||||
// likewise, not if in system malloc since backtrace() mallocs
|
||||
if ( g_inMemFunction ) return;
|
||||
|
||||
//void *trace[32];
|
||||
|
||||
// the innermost line number
|
||||
@ -1584,6 +1594,7 @@ Profiler::startRealTimeProfiler() {
|
||||
// }
|
||||
init();
|
||||
m_realTimeProfilerRunning = true;
|
||||
m_totalFrames = 0;
|
||||
// now Loop.cpp will call g_profiler.getStackFrame()
|
||||
return;
|
||||
|
||||
@ -1854,7 +1865,7 @@ Profiler::printRealTimeInfo(SafeBuf *sb,
|
||||
int fd = open ( filename , O_RDWR | O_CREAT , S_IRWXU );
|
||||
if ( fd < 0 ) {
|
||||
sb->safePrintf("FAILED TO OPEN %s for writing: %s"
|
||||
,ff.getBufStart(),strerror(errno));
|
||||
,ff.getBufStart(),mstrerror(errno));
|
||||
return false;
|
||||
}
|
||||
for ( ; ip < ipEnd ; ip += sizeof(uint64_t) ) {
|
||||
@ -1881,6 +1892,13 @@ Profiler::printRealTimeInfo(SafeBuf *sb,
|
||||
|
||||
// restrict to top 100 lines
|
||||
char *x = out.getBufStart();
|
||||
|
||||
if ( ! x ) {
|
||||
sb->safePrintf("FAILED TO READ trash/output.txt: %s"
|
||||
,mstrerror(g_errno));
|
||||
return false;
|
||||
}
|
||||
|
||||
int lineCount = 0;
|
||||
for ( ; *x ; x++ ) {
|
||||
if ( *x != '\n' ) continue;
|
||||
|
@ -263,6 +263,8 @@ protected:
|
||||
HashTableX m_activeFns;
|
||||
HashTableX m_quickpolls;
|
||||
|
||||
int32_t m_totalFrames;
|
||||
|
||||
const char* m_lastQpoll;
|
||||
int32_t m_lastQpollLine;
|
||||
QuickPollInfo m_quickPollInfos[512];
|
||||
|
11
Rdb.cpp
11
Rdb.cpp
@ -135,7 +135,8 @@ bool Rdb::init ( char *dir ,
|
||||
int32_t maxCacheNodes ,
|
||||
bool useHalfKeys ,
|
||||
bool loadCacheFromDisk ,
|
||||
DiskPageCache *pc ,
|
||||
//DiskPageCache *pc ,
|
||||
void *pc ,
|
||||
bool isTitledb ,
|
||||
bool preloadDiskPageCache ,
|
||||
char keySize ,
|
||||
@ -158,7 +159,7 @@ bool Rdb::init ( char *dir ,
|
||||
m_fixedDataSize = fixedDataSize;
|
||||
m_maxTreeMem = maxTreeMem;
|
||||
m_useHalfKeys = useHalfKeys;
|
||||
m_pc = pc;
|
||||
//m_pc = pc;
|
||||
m_isTitledb = isTitledb;
|
||||
m_preloadCache = preloadDiskPageCache;
|
||||
m_biasDiskPageCache = biasDiskPageCache;
|
||||
@ -571,7 +572,7 @@ bool Rdb::addRdbBase2 ( collnum_t collnum ) { // addColl2()
|
||||
buckets ,
|
||||
&m_dump ,
|
||||
this ,
|
||||
m_pc ,
|
||||
NULL ,
|
||||
m_isTitledb ,
|
||||
m_preloadCache ,
|
||||
m_biasDiskPageCache ) ) {
|
||||
@ -1643,7 +1644,7 @@ bool Rdb::dumpCollLoop ( ) {
|
||||
//0 , // prev last key
|
||||
KEYMIN() , // prev last key
|
||||
m_ks , // keySize
|
||||
m_pc , // DiskPageCache ptr
|
||||
NULL,//m_pc , // DiskPageCache ptr
|
||||
maxFileSize ,
|
||||
this )) {// for setting m_needsToSave
|
||||
return false;
|
||||
@ -1791,7 +1792,7 @@ void attemptMergeAll2 ( ) {
|
||||
if ( g_merge.isMerging() ) return;
|
||||
|
||||
int32_t niceness = MAX_NICENESS;
|
||||
collnum_t s_lastCollnum = 0;
|
||||
static collnum_t s_lastCollnum = 0;
|
||||
int32_t count = 0;
|
||||
|
||||
tryLoop:
|
||||
|
5
Rdb.h
5
Rdb.h
@ -113,7 +113,8 @@ class Rdb {
|
||||
int32_t maxCacheNodes ,
|
||||
bool useHalfKeys ,
|
||||
bool loadCacheFromDisk ,
|
||||
class DiskPageCache *pc = NULL ,
|
||||
//class DiskPageCache *pc = NULL ,
|
||||
void *pc = NULL,
|
||||
bool isTitledb = false , // use fileIds2[]?
|
||||
bool preloadDiskPageCache = false ,
|
||||
char keySize = 12 ,
|
||||
@ -485,7 +486,7 @@ class Rdb {
|
||||
// so only one save thread launches at a time
|
||||
bool m_isSaving;
|
||||
|
||||
class DiskPageCache *m_pc;
|
||||
//class DiskPageCache *m_pc;
|
||||
|
||||
bool m_isTitledb;
|
||||
|
||||
|
32
RdbBase.cpp
32
RdbBase.cpp
@ -128,7 +128,7 @@ bool RdbBase::init ( char *dir ,
|
||||
RdbBuckets *buckets ,
|
||||
RdbDump *dump ,
|
||||
class Rdb *rdb ,
|
||||
DiskPageCache *pc ,
|
||||
void *pc , // DiskPageCache *pc ,
|
||||
bool isTitledb ,
|
||||
bool preloadDiskPageCache ,
|
||||
bool biasDiskPageCache ) {
|
||||
@ -266,7 +266,7 @@ bool RdbBase::init ( char *dir ,
|
||||
m_useHalfKeys = useHalfKeys;
|
||||
m_ks = keySize;
|
||||
m_pageSize = pageSize;
|
||||
m_pc = pc;
|
||||
//m_pc = pc;
|
||||
m_isTitledb = isTitledb;
|
||||
// we haven't done a dump yet
|
||||
//m_lastWrite = gettimeofdayInMilliseconds();
|
||||
@ -900,11 +900,11 @@ int32_t RdbBase::addFile ( int32_t id , bool isNew , int32_t mergeNum ,
|
||||
// open this big data file for reading only
|
||||
if ( ! isNew ) {
|
||||
if ( mergeNum < 0 )
|
||||
f->open ( O_RDONLY | O_NONBLOCK | O_ASYNC , m_pc );
|
||||
f->open ( O_RDONLY | O_NONBLOCK | O_ASYNC , NULL );
|
||||
// otherwise, merge will have to be resumed so this file
|
||||
// should be writable
|
||||
else
|
||||
f->open ( O_RDWR | O_NONBLOCK | O_ASYNC , m_pc );
|
||||
f->open ( O_RDWR | O_NONBLOCK | O_ASYNC , NULL );//pc
|
||||
}
|
||||
skip:
|
||||
// find the position to add so we maintain order by fileId
|
||||
@ -1132,6 +1132,8 @@ bool RdbBase::incorporateMerge ( ) {
|
||||
if ( ! m_files[i]->unlink ( doneWrapper , this ) ) {
|
||||
m_numThreads++; g_numThreads++; }
|
||||
// debug msg
|
||||
// MDW this cores if file is bad... if collection
|
||||
// got deleted from under us i guess!!
|
||||
else log(LOG_INFO,"merge: Unlinked %s (#%"INT32").",
|
||||
m_files[i]->getFilename(),i);
|
||||
// debug msg
|
||||
@ -1421,6 +1423,10 @@ bool RdbBase::attemptMerge ( int32_t niceness, bool forceMergeAll, bool doLog ,
|
||||
|
||||
if ( m_nextMergeForced ) forceMergeAll = true;
|
||||
|
||||
if ( forceMergeAll )
|
||||
log(LOG_INFO,"merge: forcing merge for "
|
||||
"for %s. (collnum=%"INT32")",m_dbname,(int32_t)m_collnum);
|
||||
|
||||
// if we are trying to merge titledb but a titledb dump is going on
|
||||
// then do not do the merge, we do not want to overwrite tfndb via
|
||||
// RdbDump::updateTfndbLoop()
|
||||
@ -1468,11 +1474,16 @@ bool RdbBase::attemptMerge ( int32_t niceness, bool forceMergeAll, bool doLog ,
|
||||
}
|
||||
|
||||
if ( g_numThreads > 0 ) {
|
||||
if ( doLog )
|
||||
// prevent log spam
|
||||
static int32_t s_lastTime = 0;
|
||||
int32_t now = getTimeLocal();
|
||||
if ( now - s_lastTime > 0 && doLog )
|
||||
log(LOG_INFO,"merge: Waiting for another "
|
||||
"collection's unlink/rename "
|
||||
"operations to finish before attempting merge "
|
||||
"for %s (collnum=%"INT32").",m_dbname,(int32_t)m_collnum);
|
||||
"for %s (collnum=%"INT32").",
|
||||
m_dbname,(int32_t)m_collnum);
|
||||
s_lastTime = now;
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -1629,7 +1640,10 @@ bool RdbBase::attemptMerge ( int32_t niceness, bool forceMergeAll, bool doLog ,
|
||||
|
||||
// this triggers the negative rec concentration msg below and
|
||||
// tries to merge on one file...
|
||||
if ( ! resuming && m_numFiles <= 1 ) return false;
|
||||
if ( ! resuming && m_numFiles <= 1 ) {
|
||||
m_nextMergeForced = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
// what percent of recs in the collections' rdb are negative?
|
||||
// the rdbmaps hold this info
|
||||
@ -2263,7 +2277,7 @@ void RdbBase::gotTokenForMerge ( ) {
|
||||
m_mergeStartFileNum ,
|
||||
m_numFilesToMerge ,
|
||||
m_niceness ,
|
||||
m_pc ,
|
||||
NULL,//m_pc ,
|
||||
mint /*maxTargetFileSize*/ ,
|
||||
m_ks ) )
|
||||
// we started the merge so return true here
|
||||
@ -2531,7 +2545,7 @@ void RdbBase::saveMaps ( bool useThread ) {
|
||||
}
|
||||
|
||||
void RdbBase::verifyDiskPageCache ( ) {
|
||||
if ( !m_pc ) return;
|
||||
//if ( !m_pc ) return;
|
||||
// disable for now
|
||||
return;
|
||||
// for ( int32_t i = 0; i < m_numFiles; i++ ){
|
||||
|
@ -83,7 +83,8 @@ class RdbBase {
|
||||
RdbBuckets *buckets ,
|
||||
RdbDump *dump ,
|
||||
class Rdb *rdb ,
|
||||
class DiskPageCache *pc = NULL ,
|
||||
//class DiskPageCache *pc = NULL ,
|
||||
void *pc = NULL,
|
||||
bool isTitledb = false , // use fileIds2[]?
|
||||
bool preloadDiskPageCache = false ,
|
||||
bool biasDiskPageCache = false );
|
||||
@ -458,7 +459,7 @@ class RdbBase {
|
||||
// so only one save thread launches at a time
|
||||
//bool m_isSaving;
|
||||
|
||||
class DiskPageCache *m_pc;
|
||||
//class DiskPageCache *m_pc;
|
||||
|
||||
bool m_isTitledb;
|
||||
|
||||
|
44
RdbCache.cpp
44
RdbCache.cpp
@ -23,6 +23,7 @@ RdbCache::RdbCache () {
|
||||
m_totalBufSize = 0;
|
||||
m_numBufs = 0;
|
||||
m_ptrs = NULL;
|
||||
m_maxMem = 0;
|
||||
m_numPtrsMax = 0;
|
||||
reset();
|
||||
m_needsSave = false;
|
||||
@ -156,6 +157,7 @@ bool RdbCache::init ( int32_t maxMem ,
|
||||
if( bufMem <= 0 ) {
|
||||
log("rdbcache: cache for %s does not have enough mem. fix "
|
||||
"by increasing maxmem or number of recs, etc.",m_dbname);
|
||||
return false;
|
||||
char *xx=NULL;*xx=0;
|
||||
}
|
||||
if ( bufMem && m_fixedDataSize > 0 &&
|
||||
@ -440,7 +442,8 @@ bool RdbCache::getRecord ( collnum_t collnum ,
|
||||
if ( m_numPtrsMax <= 0 ) return false;
|
||||
// if the init() call failed because of oom...
|
||||
if ( ! m_ptrs )
|
||||
return log("cache: getRecord: failed because oom");
|
||||
//return log("cache: getRecord: failed because oom");
|
||||
return false;
|
||||
// time it -- debug
|
||||
int64_t t = 0LL ;
|
||||
if ( g_conf.m_logTimingDb ) t = gettimeofdayInMillisecondsLocal();
|
||||
@ -540,7 +543,7 @@ bool RdbCache::getRecord ( collnum_t collnum ,
|
||||
// of the delete head's space i guess.
|
||||
// i do this for all caches now... what are the downsides? i forget.
|
||||
//
|
||||
bool check = false;
|
||||
bool check = true;//false;
|
||||
//if ( this == &g_genericCache[SITEQUALITY_CACHEID] ) check = true;
|
||||
if ( this == g_dns.getCache () ) check = true;
|
||||
if ( this == g_dns.getCacheLocal () ) check = true;
|
||||
@ -555,11 +558,11 @@ bool RdbCache::getRecord ( collnum_t collnum ,
|
||||
//if ( this == &g_tagdb.m_listCache ) check = true;
|
||||
// the exact count cache...
|
||||
//if ( this == &g_qtable ) check = true;
|
||||
if ( m_totalBufSize < 20000 ) check = false;
|
||||
//if ( m_totalBufSize < 20000 ) check = false;
|
||||
if ( check ) promoteRecord = false;
|
||||
// sanity check, do not allow the site quality cache or dns cache to
|
||||
// be > 128MB, that just does not make sense and it complicates things
|
||||
if ( check && m_totalBufSize > BUFSIZE ) { char *xx = NULL; *xx = 0; }
|
||||
//if(check && m_totalBufSize > BUFSIZE ) { char *xx = NULL; *xx = 0; }
|
||||
// sanity check
|
||||
if ( m_tail < 0 || m_tail > m_totalBufSize ) {
|
||||
char *xx = NULL; *xx = 0; }
|
||||
@ -777,14 +780,15 @@ bool RdbCache::addRecord ( collnum_t collnum ,
|
||||
int32_t timestamp ,
|
||||
char **retRecPtr ) {
|
||||
|
||||
// bail if cache empty. maybe m_maxMem is 0.
|
||||
if ( m_totalBufSize <= 0 ) return true;
|
||||
|
||||
//int64_t startTime = gettimeofdayInMillisecondsLocal();
|
||||
if ( collnum < (collnum_t)0) {char *xx=NULL;*xx=0; }
|
||||
if ( collnum >= m_maxColls ) {char *xx=NULL;*xx=0; }
|
||||
// full key not allowed because we use that in markDeletedRecord()
|
||||
if ( KEYCMP(cacheKey,KEYMAX(),m_cks) == 0 ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// bail if cache empty
|
||||
if ( m_totalBufSize <= 0 ) return true;
|
||||
// debug msg
|
||||
int64_t t = 0LL ;
|
||||
if ( g_conf.m_logTimingDb ) t = gettimeofdayInMillisecondsLocal();
|
||||
@ -953,11 +957,13 @@ bool RdbCache::addRecord ( collnum_t collnum ,
|
||||
m_memOccupied += ( p - start );
|
||||
|
||||
// debug msg (MDW)
|
||||
//log("cache: adding rec @ %"UINT32" size=%"INT32" tail=%"UINT32"",
|
||||
// i1c,p-start,m_tail);
|
||||
//log("cache: stored k.n1=%"UINT32" k.n0=%"UINT64" %"INT32" bytes @ %"UINT32" tail=%"UINT32"",
|
||||
// ((key_t *)cacheKey)->n1,
|
||||
// ((key_t *)cacheKey)->n0,p-start,i1c,m_tail);
|
||||
// if ( this == &g_spiderLoop.m_winnerListCache ) {
|
||||
// log("cache: adding rec @ %"UINT32" size=%i tail=%"INT32"",
|
||||
// i1c,(int)(p-start),m_tail);
|
||||
// log("cache: stored k.n1=%"UINT32" k.n0=%"UINT64" %"INT32" bytes @ %"UINT32" tail=%"UINT32"",
|
||||
// ((key_t *)cacheKey)->n1,
|
||||
// ((key_t *)cacheKey)->n0,(int)(p-start),i1c,m_tail);
|
||||
// }
|
||||
//if ( m_cks == 4 )
|
||||
// log("stored k=%"XINT32" %"INT32" bytes @ %"UINT32"",
|
||||
// *(int32_t *)cacheKey,p-start,i);//(uint32_t)start);
|
||||
@ -1109,8 +1115,10 @@ bool RdbCache::deleteRec ( ) {
|
||||
//int32_t saved = m_tail;
|
||||
|
||||
// debug msg (MDW)
|
||||
//log("cache: deleting rec @ %"INT32" size=%"INT32"",m_tail,
|
||||
// dataSize+2+12+4+4);
|
||||
// if ( this == &g_spiderLoop.m_winnerListCache ) {
|
||||
// log("cache: deleting rec @ %"INT32" size=%"INT32"",m_tail,
|
||||
// dataSize+2+12+4+4);
|
||||
// }
|
||||
|
||||
// skip over rest of rec
|
||||
p += dataSize;
|
||||
@ -1124,6 +1132,10 @@ bool RdbCache::deleteRec ( ) {
|
||||
m_tail +(int32_t)sizeof(collnum_t)+m_cks+4>m_totalBufSize){
|
||||
char *xx = NULL; *xx = 0;}
|
||||
|
||||
// if ( this == &g_spiderLoop.m_winnerListCache )
|
||||
// log("spider: rdbcache: removing tail rec collnum=%i",
|
||||
// (int)collnum);
|
||||
|
||||
// delete key from hash table, iff is for THIS record
|
||||
// but if it has not already been voided.
|
||||
// we set key to KEYMAX() in markDeletedRecord()
|
||||
@ -1163,8 +1175,10 @@ bool RdbCache::deleteRec ( ) {
|
||||
void RdbCache::markDeletedRecord(char *ptr){
|
||||
int32_t dataSize = sizeof(collnum_t)+m_cks+sizeof(int32_t);
|
||||
// debug it
|
||||
//logf(LOG_DEBUG,"cache: makeDeleteRecord ptr=0x%"XINT32" off=%"INT32"",
|
||||
// (int32_t)ptr,ptr-m_bufs[0]);
|
||||
// if ( this == &g_spiderLoop.m_winnerListCache ) {
|
||||
//logf(LOG_DEBUG,"cache: makeDeleteRec ptr=0x%"PTRFMT" off=%"INT32"",
|
||||
// (PTRTYPE)ptr,(int32_t)(ptr-m_bufs[0]));
|
||||
// }
|
||||
// get dataSize and data
|
||||
if ( m_fixedDataSize == -1 || m_supportLists ) {
|
||||
dataSize += 4 + // size
|
||||
|
@ -120,6 +120,7 @@ class RdbCache {
|
||||
// . returns true if found, false if not found in cache
|
||||
// . sets *rec and *recSize iff found
|
||||
// . sets *cachedTime to time the rec was cached
|
||||
// . use maxAge of -1 to have no limit to the age of cached rec
|
||||
bool getRecord ( collnum_t collnum ,
|
||||
//key_t cacheKey ,
|
||||
char *cacheKey ,
|
||||
|
37
RdbDump.cpp
37
RdbDump.cpp
@ -41,7 +41,8 @@ bool RdbDump::set ( //char *coll ,
|
||||
//key_t prevLastKey ,
|
||||
char *prevLastKey ,
|
||||
char keySize ,
|
||||
class DiskPageCache *pc ,
|
||||
//class DiskPageCache *pc ,
|
||||
void *pc ,
|
||||
int64_t maxFileSize ,
|
||||
Rdb *rdb ) {
|
||||
|
||||
@ -404,12 +405,15 @@ bool RdbDump::dumpTree ( bool recall ) {
|
||||
m_totalNegDumped += m_numNegRecs;
|
||||
// . check the list we got from the tree for problems
|
||||
// . ensures keys are ordered from lowest to highest as well
|
||||
#ifdef GBSANITYCHECK
|
||||
log("dump: verifying list before dumping");
|
||||
m_list->checkList_r ( false , // removeNegRecs?
|
||||
false , // sleep on problem?
|
||||
m_rdb->m_rdbId );
|
||||
#endif
|
||||
//#ifdef GBSANITYCHECK
|
||||
if ( g_conf.m_verifyWrites ) {
|
||||
char *s = "none";
|
||||
if ( m_rdb ) s = getDbnameFromId(m_rdb->m_rdbId);
|
||||
log("dump: verifying list before dumping (rdb=%s)",s);
|
||||
m_list->checkList_r ( false , // removeNegRecs?
|
||||
false , // sleep on problem?
|
||||
m_rdb->m_rdbId );
|
||||
}
|
||||
// if list is empty, we're done!
|
||||
if ( status && m_list->isEmpty() ) {
|
||||
// consider that a rollover?
|
||||
@ -485,15 +489,15 @@ bool RdbDump::dumpList ( RdbList *list , int32_t niceness , bool recall ) {
|
||||
if ( m_list->isEmpty() ) return true;
|
||||
// we're now in dump mode again
|
||||
m_isDumping = true;
|
||||
#ifdef GBSANITYCHECK
|
||||
//#ifdef GBSANITYCHECK
|
||||
// don't check list if we're dumping an unordered list from tree!
|
||||
if ( m_orderedDump ) {
|
||||
if ( g_conf.m_verifyWrites && m_orderedDump ) {
|
||||
m_list->checkList_r ( false /*removedNegRecs?*/ );
|
||||
// print list stats
|
||||
log("dump: sk=%s ",KEYSTR(m_list->m_startKey,m_ks));
|
||||
log("dump: ek=%s ",KEYSTR(m_list->m_endKey,m_ks));
|
||||
// log("dump: sk=%s ",KEYSTR(m_list->m_startKey,m_ks));
|
||||
// log("dump: ek=%s ",KEYSTR(m_list->m_endKey,m_ks));
|
||||
}
|
||||
#endif
|
||||
//#endif
|
||||
|
||||
// before calling RdbMap::addList(), always reset list ptr
|
||||
// since we no longer call this in RdbMap::addList() so we don't
|
||||
@ -524,8 +528,10 @@ bool RdbDump::dumpList ( RdbList *list , int32_t niceness , bool recall ) {
|
||||
}
|
||||
}
|
||||
|
||||
if ( m_ks==18 ) {
|
||||
m_list->checkList_r(false,false,RDB_POSDB);
|
||||
if ( g_conf.m_verifyWrites ) {
|
||||
char rdbId = 0;
|
||||
if ( m_rdb ) rdbId = m_rdb->m_rdbId;
|
||||
m_list->checkList_r(false,false,rdbId);//RDB_POSDB);
|
||||
m_list->resetListPtr();
|
||||
}
|
||||
|
||||
@ -773,7 +779,8 @@ bool RdbDump::doneReadingForVerify ( ) {
|
||||
|
||||
|
||||
// see if what we wrote is the same as what we read back
|
||||
if ( m_verifyBuf && memcmp(m_verifyBuf,m_buf,m_bytesToWrite) != 0 &&
|
||||
if ( m_verifyBuf && g_conf.m_verifyWrites &&
|
||||
memcmp(m_verifyBuf,m_buf,m_bytesToWrite) != 0 &&
|
||||
! g_errno ) {
|
||||
log("disk: Write verification of %"INT32" bytes to file %s "
|
||||
"failed at offset=%"INT64". Retrying.",
|
||||
|
@ -50,7 +50,8 @@ class RdbDump {
|
||||
//key_t prevLastKey ,
|
||||
char *prevLastKey ,
|
||||
char keySize ,
|
||||
class DiskPageCache *pc ,
|
||||
//class DiskPageCache *pc ,
|
||||
void *pc ,
|
||||
int64_t maxFileSize ,
|
||||
class Rdb *rdb );
|
||||
|
||||
@ -75,7 +76,7 @@ class RdbDump {
|
||||
// . this override makes the file's getSlot() return LdbSlots
|
||||
// which can be appropriately added to an RdbTable or LdbTable
|
||||
bool load ( class Rdb *rdb , int32_t fixedDataSize , BigFile *file ,
|
||||
class DiskPageCache *pc );
|
||||
void *pc ); // class DiskPageCache *pc );
|
||||
|
||||
// . calls the callback specified in set() when done
|
||||
// . errno set to indicate error #, if any
|
||||
|
75
RdbList.cpp
75
RdbList.cpp
@ -693,9 +693,9 @@ bool RdbList::checkList_r ( bool removeNegRecs , bool sleepOnProblem ,
|
||||
return false;
|
||||
}
|
||||
|
||||
if ( m_useHalfKeys && m_ks == 12 ) // m_ks != 18 && m_ks != 24 )
|
||||
return checkIndexList_r ( removeNegRecs ,
|
||||
sleepOnProblem );
|
||||
// if ( m_useHalfKeys && m_ks == 12 ) // m_ks != 18 && m_ks != 24 )
|
||||
// return checkIndexList_r ( removeNegRecs ,
|
||||
// sleepOnProblem );
|
||||
|
||||
//log("m_list=%"INT32"",(int32_t)m_list);
|
||||
//key_t oldk;
|
||||
@ -721,6 +721,10 @@ bool RdbList::checkList_r ( bool removeNegRecs , bool sleepOnProblem ,
|
||||
if ( KEYCMP(acceptable,KEYMIN(),m_ks)==0 )
|
||||
KEYSET ( acceptable , m_endKey , m_ks );
|
||||
char k[MAX_KEY_BYTES];
|
||||
|
||||
static int32_t th = 0;
|
||||
if ( ! th ) th = hash64Lower_a ( "roottitles" , 10 );
|
||||
|
||||
while ( ! isExhausted() ) {
|
||||
//key_t k = getCurrentKey();
|
||||
getCurrentKey( k );
|
||||
@ -734,6 +738,43 @@ bool RdbList::checkList_r ( bool removeNegRecs , bool sleepOnProblem ,
|
||||
*(int32_t *)data > 100000000 ) ) {
|
||||
char *xx = NULL; *xx = 0; }
|
||||
}
|
||||
// tagrec?
|
||||
if ( rdbId == RDB_TAGDB && ! KEYNEG(k) ) {
|
||||
//TagRec *gr = (TagRec *)getCurrentRec();
|
||||
//Tag *tag = gr->getFirstTag ( );
|
||||
//for ( ; tag ; tag = gr->getNextTag ( tag ) ) {
|
||||
Tag *tag = (Tag *)getCurrentRec();
|
||||
if ( tag->m_type == th ) {
|
||||
char *tdata = tag->getTagData();
|
||||
int32_t tsize = tag->getTagDataSize();
|
||||
// core if tag val is not \0 terminated
|
||||
if ( tsize > 0 && tdata[tsize-1]!='\0' ) {
|
||||
log("db: bad root title tag");
|
||||
char *xx=NULL;*xx=0; }
|
||||
}
|
||||
}
|
||||
if ( rdbId == RDB_SPIDERDB && ! KEYNEG(k) &&
|
||||
getCurrentDataSize() > 0 ) {
|
||||
//char *data = getCurrentData();
|
||||
char *rec = getCurrentRec();
|
||||
// bad url in spider request?
|
||||
if ( g_spiderdb.isSpiderRequest ( (key128_t *)rec ) ){
|
||||
SpiderRequest *sr = (SpiderRequest *)rec;
|
||||
if ( strncmp(sr->m_url,"http",4) != 0 ) {
|
||||
log("db: spider req url");
|
||||
char *xx=NULL;*xx=0;
|
||||
}
|
||||
}
|
||||
}
|
||||
// title bad uncompress size?
|
||||
if ( rdbId == RDB_TITLEDB && ! KEYNEG(k) ) {
|
||||
char *rec = getCurrentRec();
|
||||
int32_t usize = *(int32_t *)(rec+12+4);
|
||||
if ( usize <= 0 ) {
|
||||
log("db: bad titlerec uncompress size");
|
||||
char *xx=NULL;*xx=0;
|
||||
}
|
||||
}
|
||||
// debug msg
|
||||
// pause if it's google
|
||||
//if ((((k.n0) >> 1) & 0x0000003fffffffffLL) == 70166155664)
|
||||
@ -3525,4 +3566,32 @@ void RdbList::setFromSafeBuf ( SafeBuf *sb , char rdbId ) {
|
||||
|
||||
}
|
||||
|
||||
// . point this list at the caller-supplied buffer "p" of "psize" bytes
// . the buffer is not copied: m_list and m_alloc both alias "p" and
//   m_ownData is set to false
// . NOTE(review): presumably the caller must keep "p" alive and free it
//   itself -- confirm against how freeList() treats m_ownData
// . key size and fixed data size are looked up from "rdbId"
void RdbList::setFromPtr ( char *p , int32_t psize , char rdbId ) {
|
||||
|
||||
// free and NULLify any old m_list we had to make room for our new list
|
||||
freeList();
|
||||
|
||||
// set this first since others depend on it
|
||||
m_ks = getKeySizeFromRdbId ( rdbId );
|
||||
|
||||
// set our list parms
// list and alloc describe the same region, [p, p+psize)
|
||||
m_list = p;
|
||||
m_listSize = psize;
|
||||
m_alloc = p;
|
||||
m_allocSize = psize;
|
||||
m_listEnd = m_list + m_listSize;
|
||||
|
||||
// start/end keys are filled in for key size m_ks
// NOTE(review): presumably this spans the whole key range -- confirm
// the two-argument KEYMIN/KEYMAX semantics
KEYMIN(m_startKey,m_ks);
|
||||
KEYMAX(m_endKey ,m_ks);
|
||||
|
||||
m_fixedDataSize = getDataSizeFromRdbId ( rdbId );
|
||||
|
||||
// we never own the buffer and never use half keys for lists set this way
m_ownData = false;//ownData;
|
||||
m_useHalfKeys = false;//useHalfKeys;
|
||||
|
||||
// use this call now to set m_listPtr and m_listPtrHi based on m_list
|
||||
resetListPtr();
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -107,6 +107,7 @@ class RdbList {
|
||||
char keySize = sizeof(key_t) );
|
||||
|
||||
void setFromSafeBuf ( class SafeBuf *sb , char rdbId );
|
||||
void setFromPtr ( char *p , int32_t psize , char rdbId ) ;
|
||||
|
||||
// just set the start and end keys
|
||||
//void set ( key_t startKey , key_t endKey );
|
||||
|
11
RdbMap.cpp
11
RdbMap.cpp
@ -1295,12 +1295,13 @@ void RdbMap::reduceMemFootPrint () {
|
||||
for ( ; s && *s && ! is_digit(*s) ; s++ );
|
||||
int id = 0;
|
||||
if ( s ) id = atoi(s);
|
||||
if ( id && (id % 2) == 0 ) return;
|
||||
// id can be zero like for spiderdb0000.map
|
||||
if ( (id % 2) == 0 ) return;
|
||||
|
||||
// log("map: reducing mem footprint for %s/%s",
|
||||
// m_file.getDir(),
|
||||
// m_file.getFilename());
|
||||
|
||||
// log("map: reducing mem footprint for %s/%s",
|
||||
// m_file.getDir(),
|
||||
// m_file.getFilename());
|
||||
|
||||
// seems kinda buggy now..
|
||||
m_reducedMem = true;
|
||||
//return;
|
||||
|
@ -38,7 +38,8 @@ bool RdbMerge::merge ( char rdbId ,
|
||||
int32_t startFileNum ,
|
||||
int32_t numFiles ,
|
||||
int32_t niceness ,
|
||||
class DiskPageCache *pc ,
|
||||
//class DiskPageCache *pc ,
|
||||
void *pc ,
|
||||
int64_t maxTargetFileSize ,
|
||||
char keySize ) {
|
||||
// reset ourselves
|
||||
@ -69,7 +70,7 @@ bool RdbMerge::merge ( char rdbId ,
|
||||
m_dedup = base->m_dedup;
|
||||
m_fixedDataSize = base->m_fixedDataSize;
|
||||
m_niceness = niceness;
|
||||
m_pc = pc;
|
||||
//m_pc = pc;
|
||||
m_maxTargetFileSize = maxTargetFileSize;
|
||||
m_doneMerging = false;
|
||||
m_ks = keySize;
|
||||
@ -209,7 +210,7 @@ bool RdbMerge::gotLock ( ) {
|
||||
startOffset ,
|
||||
prevLastKey ,
|
||||
m_ks ,
|
||||
m_pc ,
|
||||
NULL,//m_pc ,
|
||||
m_maxTargetFileSize ,
|
||||
NULL ); // set m_base::m_needsToSave? no.
|
||||
// what kind of error?
|
||||
|
@ -66,7 +66,8 @@ class RdbMerge {
|
||||
int32_t startFileNum ,
|
||||
int32_t numFiles ,
|
||||
int32_t niceness ,
|
||||
class DiskPageCache *pc ,
|
||||
//class DiskPageCache *pc ,
|
||||
void *pc ,
|
||||
int64_t maxTargetFileSize ,
|
||||
char keySize );
|
||||
|
||||
@ -150,7 +151,7 @@ class RdbMerge {
|
||||
// count for indexdb
|
||||
int64_t m_dupsRemoved;
|
||||
|
||||
class DiskPageCache *m_pc;
|
||||
//class DiskPageCache *m_pc;
|
||||
int64_t m_maxTargetFileSize;
|
||||
|
||||
int32_t m_id2;
|
||||
|
@ -148,6 +148,7 @@ bool RdbScan::setRead ( BigFile *file ,
|
||||
// ensure we don't mess around
|
||||
m_fstate.m_allocBuf = NULL;
|
||||
m_fstate.m_buf = NULL;
|
||||
//m_fstate.m_usePartFiles = true;
|
||||
// debug msg
|
||||
//log("diskOff=%"INT64" nb=%"INT32"",offset,bytesToRead);
|
||||
//if ( offset == 16386 && bytesToRead == 16386 )
|
||||
@ -253,6 +254,7 @@ void RdbScan::gotList ( ) {
|
||||
// so i effectively disabled it by changing to _GBSANITYCHECK2_
|
||||
//#ifdef GBSANITYCHECK2
|
||||
// this first test, tests to make sure the read from cache worked
|
||||
/*
|
||||
DiskPageCache *pc = m_file->getDiskPageCache();
|
||||
if ( pc &&
|
||||
! g_errno &&
|
||||
@ -307,7 +309,8 @@ void RdbScan::gotList ( ) {
|
||||
// . go through each page in page cache and verify on disk
|
||||
//pc->verifyData ( m_file );
|
||||
}
|
||||
skip:
|
||||
*/
|
||||
// skip:
|
||||
//#endif
|
||||
// assume we did not shift it
|
||||
m_shifted = 0;//false;
|
||||
@ -319,7 +322,7 @@ void RdbScan::gotList ( ) {
|
||||
// . i think a read overflow might be causing a segv in malloc
|
||||
// . NOTE: BigFile's call to DiskPageCache alters these values
|
||||
if ( m_fstate.m_bytesDone != m_fstate.m_bytesToGo && m_hitDisk )
|
||||
log(LOG_INFO,"disk: Read %"INT32" bytes but needed %"INT32".",
|
||||
log(LOG_INFO,"disk: Read %"INT64" bytes but needed %"INT64".",
|
||||
m_fstate.m_bytesDone , m_fstate.m_bytesToGo );
|
||||
// adjust the list size for biased page cache if necessary
|
||||
//if ( m_file->m_pc && m_allowPageCache &&
|
||||
|
@ -87,6 +87,9 @@ class RdbScan {
|
||||
bool m_allowPageCache;
|
||||
|
||||
bool m_hitDisk;
|
||||
|
||||
// this is set by Msg3.cpp
|
||||
char m_inPageCache;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
18
RdbTree.cpp
18
RdbTree.cpp
@ -2467,8 +2467,8 @@ void threadDoneWrapper ( void *state , ThreadEntry *t ) {
|
||||
THIS->m_dbname,mstrerror(g_errno));
|
||||
else
|
||||
// log it
|
||||
log("db: Done saving %s/%s-saved.dat",
|
||||
THIS->m_dir,THIS->m_dbname);
|
||||
log("db: Done saving %s/%s-saved.dat (wrote %"INT64" bytes)",
|
||||
THIS->m_dir,THIS->m_dbname,THIS->m_bytesWritten);
|
||||
// . call callback
|
||||
if ( THIS->m_callback ) THIS->m_callback ( THIS->m_state );
|
||||
}
|
||||
@ -2495,6 +2495,20 @@ bool RdbTree::fastSave_r() {
|
||||
return log("db: Could not open %s for writing: %s.",
|
||||
s,mstrerror(errno));
|
||||
}
|
||||
|
||||
redo:
|
||||
// verify the tree
|
||||
if ( g_conf.m_verifyWrites ) {
|
||||
log("db: verify writes is enabled, checking tree before "
|
||||
"saving.");
|
||||
if ( ! checkTree( false , true ) ) {
|
||||
log("db: fixing tree and re-checking");
|
||||
fixTree ( );
|
||||
goto redo;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// clear our own errno
|
||||
errno = 0;
|
||||
// . save the header
|
||||
|
@ -236,7 +236,7 @@ bool SafeBuf::pushFloat ( float i) {
|
||||
// hack off trailing 0's
|
||||
bool SafeBuf::printFloatPretty ( float f ) {
|
||||
|
||||
if ( m_length + 20 > m_capacity && ! reserve(20) )
|
||||
if ( m_length + 40 > m_capacity && ! reserve(40) )
|
||||
return false;
|
||||
|
||||
char *p = m_buf + m_length;
|
||||
|
22
Sections.cpp
22
Sections.cpp
@ -17294,11 +17294,11 @@ bool Sectiondb::init ( ) {
|
||||
// cache in favor of cleverly used disk page caches, because
|
||||
// the rec caches are not real-time and get stale.
|
||||
// . just hard-code 5MB for now
|
||||
int32_t pcmem = 5000000; // = g_conf.m_sectiondbMaxDiskPageCacheMem;
|
||||
//int32_t pcmem = 5000000; // = g_conf.m_sectiondbMaxDiskPageCacheMem;
|
||||
|
||||
// do not use for now i think we use posdb and store the 32bit
|
||||
// val in the key for facet type stuff
|
||||
pcmem = 0;
|
||||
//pcmem = 0;
|
||||
maxTreeMem = 100000;
|
||||
maxTreeNodes = 1000;
|
||||
|
||||
@ -17322,14 +17322,14 @@ bool Sectiondb::init ( ) {
|
||||
|
||||
// do not use any page cache if doing tmp cluster in order to
|
||||
// prevent swapping
|
||||
if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
|
||||
int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
|
||||
// init the page cache
|
||||
if ( ! m_pc.init ( "sectiondb",
|
||||
RDB_SECTIONDB,
|
||||
pcmem ,
|
||||
pageSize ) )
|
||||
return log("db: Sectiondb init failed.");
|
||||
// if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
|
||||
// int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
|
||||
// // init the page cache
|
||||
// if ( ! m_pc.init ( "sectiondb",
|
||||
// RDB_SECTIONDB,
|
||||
// pcmem ,
|
||||
// pageSize ) )
|
||||
// return log("db: Sectiondb init failed.");
|
||||
|
||||
// initialize our own internal rdb
|
||||
if ( ! m_rdb.init ( g_hostdb.m_dir ,
|
||||
@ -17348,7 +17348,7 @@ bool Sectiondb::init ( ) {
|
||||
0 , // maxCacheNodes
|
||||
false , // half keys?
|
||||
false , // saveCache?
|
||||
&m_pc , // page cache ptr
|
||||
NULL,//&m_pc , // page cache ptr
|
||||
false , // is titledb?
|
||||
false , // preloadcache?
|
||||
16 ))// keySize
|
||||
|
@ -7,7 +7,7 @@
|
||||
#include "Dates.h" // datetype_t
|
||||
#include "Words.h"
|
||||
#include "Rdb.h"
|
||||
#include "DiskPageCache.h"
|
||||
//#include "DiskPageCache.h"
|
||||
|
||||
|
||||
// KEY:
|
||||
@ -287,9 +287,9 @@ class Sectiondb {
|
||||
// holds binary format title entries
|
||||
Rdb m_rdb;
|
||||
|
||||
DiskPageCache *getDiskPageCache ( ) { return &m_pc; };
|
||||
//DiskPageCache *getDiskPageCache ( ) { return &m_pc; };
|
||||
|
||||
DiskPageCache m_pc;
|
||||
//DiskPageCache m_pc;
|
||||
};
|
||||
|
||||
extern class Sectiondb g_sectiondb;
|
||||
|
714
Spider.cpp
714
Spider.cpp
File diff suppressed because it is too large
Load Diff
18
Spider.h
18
Spider.h
@ -404,7 +404,7 @@ class Spiderdb {
|
||||
|
||||
Rdb *getRdb ( ) { return &m_rdb; };
|
||||
|
||||
DiskPageCache *getDiskPageCache() { return &m_pc; };
|
||||
//DiskPageCache *getDiskPageCache() { return &m_pc; };
|
||||
|
||||
// this rdb holds urls waiting to be spidered or being spidered
|
||||
Rdb m_rdb;
|
||||
@ -453,11 +453,11 @@ class Spiderdb {
|
||||
*/
|
||||
|
||||
// print the spider rec
|
||||
int32_t print( char *srec );
|
||||
int32_t print( char *srec , SafeBuf *sb = NULL );
|
||||
|
||||
private:
|
||||
|
||||
DiskPageCache m_pc;
|
||||
//DiskPageCache m_pc;
|
||||
};
|
||||
|
||||
void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs );
|
||||
@ -989,7 +989,7 @@ class Doledb {
|
||||
|
||||
bool addColl ( char *coll, bool doVerify = true );
|
||||
|
||||
DiskPageCache *getDiskPageCache() { return &m_pc; };
|
||||
//DiskPageCache *getDiskPageCache() { return &m_pc; };
|
||||
|
||||
// . see "overview of spidercache" below for key definition
|
||||
// . these keys when hashed are clogging up the hash table
|
||||
@ -1072,7 +1072,7 @@ class Doledb {
|
||||
|
||||
Rdb m_rdb;
|
||||
|
||||
DiskPageCache m_pc;
|
||||
//DiskPageCache m_pc;
|
||||
};
|
||||
|
||||
|
||||
@ -1131,12 +1131,12 @@ class SpiderColl {
|
||||
|
||||
// doledbkey + dataSize + bestRequestRec
|
||||
//char m_doleBuf[MAX_DOLEREC_SIZE];
|
||||
SafeBuf m_doleBuf;
|
||||
//SafeBuf m_doleBuf;
|
||||
|
||||
bool m_isLoading;
|
||||
|
||||
// for scanning the wait tree...
|
||||
bool m_isPopulating;
|
||||
bool m_isPopulatingDoledb;
|
||||
// for reading from spiderdb
|
||||
//bool m_isReadDone;
|
||||
bool m_didRead;
|
||||
@ -1192,7 +1192,9 @@ class SpiderColl {
|
||||
|
||||
bool addToDoleTable ( SpiderRequest *sreq ) ;
|
||||
|
||||
bool addDoleBufIntoDoledb (bool isFromCache,uint32_t cachedTimestamp);
|
||||
bool validateDoleBuf ( SafeBuf *doleBuf ) ;
|
||||
bool addDoleBufIntoDoledb ( SafeBuf *doleBuf , bool isFromCache);
|
||||
//,uint32_t cachedTimestamp);
|
||||
|
||||
bool updateSiteNumInlinksTable ( int32_t siteHash32,int32_t sni,
|
||||
time_t tstamp); // time_t
|
||||
|
@ -5,7 +5,10 @@
|
||||
#include "HttpServer.h"
|
||||
#include "SpiderProxy.h"
|
||||
|
||||
#define LOADPOINT_EXPIRE_MS (10*60*1000)
|
||||
//#define LOADPOINT_EXPIRE_MS (10*60*1000)
|
||||
// make it 15 seconds not 10 minutes otherwise it gets too full with dup
|
||||
// keys and really clogs things up
|
||||
#define LOADPOINT_EXPIRE_MS (15*1000)
|
||||
|
||||
//
|
||||
// BASIC DETAILS
|
||||
@ -927,6 +930,9 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
|
||||
// and the loadbucket id
|
||||
//*(int32_t *)p = bb.m_id; p += 4;
|
||||
|
||||
//int32_t sanityCount = 0;//s_loadTable.getNumSlots();
|
||||
// top:
|
||||
|
||||
// now remove old entries from the load table. entries that
|
||||
// have completed and have a download end time more than LOADPOINT_EXPIRE_MS ago
|
||||
for ( int32_t i = 0 ; i < s_loadTable.getNumSlots() ; i++ ) {
|
||||
@ -938,8 +944,12 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
|
||||
if ( pp->m_downloadEndTimeMS == 0LL ) continue;
|
||||
// delta t
|
||||
int64_t took = nowms - pp->m_downloadEndTimeMS;
|
||||
// < 10 mins?
|
||||
// < 10 mins? now it's < 15 seconds to prevent clogging.
|
||||
if ( took < LOADPOINT_EXPIRE_MS ) continue;
|
||||
|
||||
// 100 at a time
|
||||
//if ( sanityCount++ > 100 ) break;
|
||||
|
||||
// ok, it's too old, nuke it to save memory
|
||||
s_loadTable.removeSlot(i);
|
||||
// the keys might have buried us but we really should not
|
||||
@ -947,6 +957,7 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
|
||||
// should we? TODO: figure it out. if we miss a few it's not
|
||||
// a big deal.
|
||||
i--;
|
||||
//goto top;
|
||||
}
|
||||
|
||||
// send the proxy ip/port/LBid back to user
|
||||
@ -1038,6 +1049,7 @@ bool initSpiderProxyStuff() {
|
||||
128,
|
||||
NULL,
|
||||
0,
|
||||
// this slows us down
|
||||
true, // allow dups?
|
||||
MAX_NICENESS,
|
||||
"lbtab",
|
||||
|
@ -247,6 +247,9 @@ void Statsdb::addDocsIndexed ( ) {
|
||||
|
||||
if ( ! isClockInSync() ) return;
|
||||
|
||||
// only host #0 needs this
|
||||
if ( g_hostdb.m_hostId != 0 ) return;
|
||||
|
||||
// only once per five seconds
|
||||
int32_t now = getTimeLocal();
|
||||
static int32_t s_lastTime = 0;
|
||||
|
@ -1835,8 +1835,8 @@ bool Tagdb::init ( ) {
|
||||
// overhead in cache.
|
||||
//int32_t maxCacheNodes = g_conf.m_tagdbMaxCacheMem / 106;
|
||||
// we now use a page cache
|
||||
if ( ! m_pc.init ("tagdb",RDB_TAGDB,pcmem,GB_TFNDB_PAGE_SIZE))
|
||||
return log("tagdb: Tagdb init failed.");
|
||||
// if ( ! m_pc.init ("tagdb",RDB_TAGDB,pcmem,GB_TFNDB_PAGE_SIZE))
|
||||
// return log("tagdb: Tagdb init failed.");
|
||||
|
||||
// init this
|
||||
//if ( ! s_lockTable2.set(8,4,32,NULL,0,false,0,"taglocktbl") )
|
||||
@ -1858,7 +1858,7 @@ bool Tagdb::init ( ) {
|
||||
0 , //maxCacheNodes ,
|
||||
false , // half keys?
|
||||
false , //m_tagdbSaveCache
|
||||
&m_pc ,
|
||||
NULL,//&m_pc ,
|
||||
false, // is titledb
|
||||
true , // preload disk page cache
|
||||
sizeof(key128_t), // key size
|
||||
|
8
Tagdb.h
8
Tagdb.h
@ -8,7 +8,7 @@
|
||||
#include "Xml.h"
|
||||
#include "Url.h"
|
||||
#include "Loop.h"
|
||||
#include "DiskPageCache.h"
|
||||
//#include "DiskPageCache.h"
|
||||
//#include "CollectionRec.h"
|
||||
#include "SafeBuf.h"
|
||||
#include "Msg0.h"
|
||||
@ -353,7 +353,7 @@ class Tagdb {
|
||||
char *getRec ( RdbList *list , Url *url , int32_t *recSize ,char* coll,
|
||||
int32_t collLen, RdbList *retList) ;
|
||||
|
||||
DiskPageCache *getDiskPageCache() { return &m_pc; };
|
||||
//DiskPageCache *getDiskPageCache() { return &m_pc; };
|
||||
|
||||
//int32_t getGroupId (key_t *key) {return key->n1 & g_hostdb.m_groupMask;}
|
||||
|
||||
@ -374,7 +374,7 @@ class Tagdb {
|
||||
// and "not-founds" stored remotely (net cache)
|
||||
Rdb m_rdb;
|
||||
|
||||
DiskPageCache m_pc;
|
||||
//DiskPageCache m_pc;
|
||||
|
||||
bool loadMinSiteInlinksBuffer ( );
|
||||
bool loadMinSiteInlinksBuffer2 ( );
|
||||
@ -392,7 +392,7 @@ class Turkdb {
|
||||
bool addColl ( char *coll, bool doVerify = true );
|
||||
Rdb *getRdb ( ) { return &m_rdb; };
|
||||
Rdb m_rdb;
|
||||
DiskPageCache m_pc;
|
||||
//DiskPageCache m_pc;
|
||||
};
|
||||
|
||||
extern class Tagdb g_tagdb;
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include "Profiler.h"
|
||||
#include "PingServer.h"
|
||||
//#include "AutoBan.h"
|
||||
#include "Hostdb.h"
|
||||
|
||||
// . TODO: deleting nodes from under Loop::callCallbacks is dangerous!!
|
||||
|
||||
@ -593,6 +594,17 @@ bool TcpServer::sendMsg ( int32_t ip ,
|
||||
// return true if s is NULL and g_errno was set by getNewSocket()
|
||||
// might set g_errno to EOUTOFSOCKETS
|
||||
if ( ! s ) { mfree ( sendBuf , sendBufSize,"TcpServer"); return true; }
|
||||
// debug to find why sockets getting diffbot replies get commandeered.
|
||||
// we think that they are using an sd used by a streaming socket,
|
||||
// who closed, but then proceed to use TcpSocket class as if he
|
||||
// had not closed it.
|
||||
if ( g_conf.m_logDebugTcpBuf ) {
|
||||
SafeBuf sb;
|
||||
sb.safePrintf("tcp: open newsd=%i sendbuf=",s->m_sd);
|
||||
sb.safeTruncateEllipsis (sendBuf,sendBufSize,200);
|
||||
log("%s",sb.getBufStart());
|
||||
}
|
||||
|
||||
// set up the new TcpSocket for connecting
|
||||
s->m_state = state;
|
||||
s->m_callback = callback;
|
||||
@ -846,6 +858,7 @@ TcpSocket *TcpServer::getNewSocket ( ) {
|
||||
// . TODO: ensure this blocks even if sd was set nonblock by wrapSock()
|
||||
if ( ! s ) {
|
||||
if ( sd == 0 ) log("tcp: closing1 sd of 0");
|
||||
log("tcp: wrapsocket2 returned null for sd=%i",(int)sd);
|
||||
if ( ::close(sd) == -1 )
|
||||
log("tcp: close2(%"INT32") = %s",(int32_t)sd,mstrerror(errno));
|
||||
else {
|
||||
@ -1732,6 +1745,8 @@ void writeSocketWrapper ( int sd , void *state ) {
|
||||
bool wasStreaming = s->m_streamingMode;
|
||||
|
||||
// otherwise, call callback on done writing or error
|
||||
// MDW: if we close the socket descriptor, then a getdiffbotreply
|
||||
// gets it, we have to know.
|
||||
THIS->makeCallback ( s );
|
||||
|
||||
// if callback changed socket status to ST_SEND_AGAIN
|
||||
@ -1921,7 +1936,7 @@ int32_t TcpServer::writeSocket ( TcpSocket *s ) {
|
||||
// another debug
|
||||
//if ( g_conf.m_logDebugTcp )
|
||||
log("tcp: only wrote %"INT32" of %"INT32" bytes "
|
||||
"tried.",n,toSend);
|
||||
"tried. sd=%i",n,toSend,s->m_sd);
|
||||
// need to listen for writability now since our write
|
||||
// failed to write everything out
|
||||
if ( ! s->m_writeRegistered &&
|
||||
@ -2260,6 +2275,32 @@ void TcpServer::destroySocket ( TcpSocket *s ) {
|
||||
// if sd is 0 do not really close it. seems to fix that bug.
|
||||
// 0 is the FD for stdin so i don't know how that is happening.
|
||||
if ( sd != 0 ) cret = ::close ( sd );
|
||||
|
||||
if ( g_conf.m_logDebugTcpBuf ) {
|
||||
SafeBuf sb;
|
||||
sb.safePrintf("tcp: closing sd=%i bytessent=%i "
|
||||
"sendbufused=%i streaming=%i "
|
||||
"sendbuf=",
|
||||
s->m_sd,
|
||||
s->m_sendOffset,
|
||||
s->m_sendBufUsed,
|
||||
(int)s->m_streamingMode);
|
||||
if ( s->m_sendBuf )
|
||||
sb.safeTruncateEllipsis(s->m_sendBuf,
|
||||
s->m_sendBufSize,
|
||||
200);
|
||||
sb.safePrintf(" bytesread=%i readbuf=",(int)s->m_readOffset);
|
||||
if ( s->m_readBuf )
|
||||
sb.safeTruncateEllipsis(s->m_readBuf,
|
||||
s->m_readOffset,
|
||||
2000);
|
||||
log("%s",sb.getBufStart());
|
||||
}
|
||||
|
||||
// force it out of streaming mode since we closed it. then we
|
||||
// should avoid the "not timing out streaming socket fd=123" msgs.
|
||||
s->m_streamingMode = false;
|
||||
|
||||
if ( cret != 0 ) { // == -1 )
|
||||
log("tcp: s=%"PTRFMT" close(%"INT32") = %"INT32" = %s",
|
||||
(PTRTYPE)s,(int32_t)sd,cret,mstrerror(errno));
|
||||
@ -2272,7 +2313,7 @@ void TcpServer::destroySocket ( TcpSocket *s ) {
|
||||
// log("tcp: closing sock %i (open=%"INT32")",sd,
|
||||
// m_numOpen-m_numClosed);
|
||||
// set it negative to try to fix the double close while
|
||||
// streaming bug.
|
||||
// streaming bug. -sd -m_sd m_sd = m_sd=
|
||||
if ( s->m_sd > 0 ) s->m_sd *= -1;
|
||||
}
|
||||
// a 2nd close? it should return -1 with errno set!
|
||||
@ -2574,6 +2615,18 @@ TcpSocket *TcpServer::acceptSocket ( ) {
|
||||
if ( g_conf.m_logDebugTcp )
|
||||
logf(LOG_DEBUG,"tcp: ...... accepted sd=%"INT32"",(int32_t)newsd);
|
||||
|
||||
// debug to find why sockets getting diffbot replies get commandeered.
|
||||
// we think that they are using an sd used by a streaming socket,
|
||||
// who closed, but then proceed to use TcpSocket class as if he
|
||||
// had not closed it.
|
||||
if ( g_conf.m_logDebugTcpBuf ) {
|
||||
SafeBuf sb;
|
||||
sb.safePrintf("tcp: accept newsd=%i incoming req",newsd);
|
||||
//sb.safeTruncateEllipsis (sendBuf,sendBufSize,200);
|
||||
log("%s",sb.getBufStart());
|
||||
}
|
||||
|
||||
|
||||
// ssl debug!
|
||||
//log("tcp: accept returned fd=%i",newsd);
|
||||
|
||||
@ -2621,6 +2674,7 @@ TcpSocket *TcpServer::acceptSocket ( ) {
|
||||
if ( ! s ) {
|
||||
//log("tcp: wrapsocket returned null fd=%i",newsd);
|
||||
if ( newsd == 0 ) log("tcp: closing sd of 0");
|
||||
log("tcp: wrapsocket1 returned null for sd=%i",(int)newsd);
|
||||
if ( ::close(newsd)== -1 )
|
||||
log("tcp: close2(%"INT32") = %s",
|
||||
(int32_t)newsd,mstrerror(errno));
|
||||
@ -2726,7 +2780,8 @@ bool TcpServer::sslAccept ( TcpSocket *s ) {
|
||||
void TcpServer::makeCallback ( TcpSocket * s ) {
|
||||
if ( ! s->m_callback ) {
|
||||
// note it
|
||||
log("tcp: null callback for s=0x%"PTRFMT"",(PTRTYPE)s);
|
||||
if ( g_conf.m_logDebugTcp )
|
||||
log("tcp: null callback for s=0x%"PTRFMT"",(PTRTYPE)s);
|
||||
return;
|
||||
}
|
||||
// record times for profiler
|
||||
@ -2777,7 +2832,8 @@ bool TcpServer::sendChunk ( TcpSocket *s ,
|
||||
// sendChunk() again.
|
||||
void (* doneSendingWrapper)( void *,TcpSocket *)){
|
||||
|
||||
log("tcp: sending chunk of %"INT32" bytes", sb->length() );
|
||||
log("tcp: sending chunk of %"INT32" bytes sd=%i", sb->length() ,
|
||||
s->m_sd );
|
||||
|
||||
// if socket had shit on there already, free that memory
|
||||
// just like TcpServer::destroySocket would
|
||||
@ -2818,6 +2874,11 @@ bool TcpServer::sendChunk ( TcpSocket *s ,
|
||||
log("tcp: chunkend=%s",sb->getBuf() - minus);
|
||||
*/
|
||||
|
||||
// char *p = sb->getBufStart();
|
||||
// char *pend = p + sb->length();
|
||||
// for ( ; p < pend ; p++ ) {
|
||||
// if ( *p == '\0' ) { char *xx=NULL;*xx=0; }
|
||||
// }
|
||||
|
||||
// . start the send process
|
||||
// . returns false if send did not complete
|
||||
|
878
Threads.cpp
878
Threads.cpp
File diff suppressed because it is too large
Load Diff
54
Threads.h
54
Threads.h
@ -10,6 +10,11 @@
|
||||
|
||||
#include <sys/types.h> // pid_t
|
||||
|
||||
// this also limits the maximum number of outstanding (live) threads
|
||||
#define MAX_STACKS 20
|
||||
// try going up to 40, we use about 2MB per stack... so this is 80MB
|
||||
//#define MAX_STACKS 40
|
||||
|
||||
// if we are a thread this gets the threadid, otherwise, the main process id
|
||||
//pid_t getpidtid();
|
||||
// on 64-bit architectures pthread_t is 64 bits and pid_t is still 32 bits
|
||||
@ -59,6 +64,13 @@ class ThreadEntry {
|
||||
|
||||
bool m_needsJoin;
|
||||
pthread_t m_joinTid;
|
||||
|
||||
class ThreadEntry *m_nextLink;
|
||||
class ThreadEntry *m_prevLink;
|
||||
|
||||
// the waiting linked list we came from
|
||||
ThreadEntry **m_bestHeadPtr;
|
||||
ThreadEntry **m_bestTailPtr;
|
||||
};
|
||||
|
||||
//#define MAX_THREAD_ENTRIES 1024
|
||||
@ -85,6 +97,31 @@ class ThreadQueue {
|
||||
int32_t m_entriesSize;
|
||||
int32_t m_maxEntries;
|
||||
|
||||
// linked list head for launched thread entries
|
||||
ThreadEntry *m_launchedHead;
|
||||
|
||||
// linked list head for empty thread entries
|
||||
ThreadEntry *m_emptyHead;
|
||||
|
||||
// 8 heads/tails for linked lists of thread entries waiting to launch
|
||||
ThreadEntry *m_waitHead0;
|
||||
ThreadEntry *m_waitHead1;
|
||||
ThreadEntry *m_waitHead2;
|
||||
ThreadEntry *m_waitHead3;
|
||||
ThreadEntry *m_waitHead4;
|
||||
ThreadEntry *m_waitHead5;
|
||||
ThreadEntry *m_waitHead6;
|
||||
|
||||
ThreadEntry *m_waitTail0;
|
||||
ThreadEntry *m_waitTail1;
|
||||
ThreadEntry *m_waitTail2;
|
||||
ThreadEntry *m_waitTail3;
|
||||
ThreadEntry *m_waitTail4;
|
||||
ThreadEntry *m_waitTail5;
|
||||
ThreadEntry *m_waitTail6;
|
||||
|
||||
|
||||
/*
|
||||
// counts the high/low priority (niceness <= 0) threads
|
||||
int64_t m_hiLaunched;
|
||||
int64_t m_hiReturned;
|
||||
@ -114,6 +151,7 @@ class ThreadQueue {
|
||||
int64_t m_mdReturnedSma;
|
||||
int64_t m_loLaunchedSma;
|
||||
int64_t m_loReturnedSma;
|
||||
*/
|
||||
|
||||
// init
|
||||
bool init (char threadType, int32_t maxThreads, int32_t maxEntries);
|
||||
@ -122,6 +160,8 @@ class ThreadQueue {
|
||||
void reset();
|
||||
|
||||
int32_t getNumThreadsOutOrQueued();
|
||||
int32_t getNumWriteThreadsOut() ;
|
||||
|
||||
|
||||
// . for adding an entry
|
||||
// . returns false and sets errno on error
|
||||
@ -141,7 +181,14 @@ class ThreadQueue {
|
||||
|
||||
// . launch a thread from our queue
|
||||
// . returns false and sets errno on error
|
||||
bool launchThread2 ( ThreadEntry *te );
|
||||
bool launchThread2 ( );
|
||||
|
||||
bool launchThreadForReals ( ThreadEntry **headPtr ,
|
||||
ThreadEntry **tailPtr ) ;
|
||||
|
||||
void removeThreads2 ( ThreadEntry **headPtr ,
|
||||
ThreadEntry **tailPtr ,
|
||||
class BigFile *bf ) ;
|
||||
|
||||
void print ( ) ;
|
||||
|
||||
@ -245,11 +292,14 @@ class Threads {
|
||||
int32_t getNumThreadQueues() { return m_numQueues; }
|
||||
|
||||
// used by UdpServer to see if it should call a low priority callback
|
||||
int32_t getNumActiveHighPriorityCpuThreads() ;
|
||||
//int32_t getNumActiveHighPriorityCpuThreads() ;
|
||||
// all high priority threads...
|
||||
int32_t getNumActiveHighPriorityThreads() ;
|
||||
|
||||
bool hasHighPriorityCpuThreads() ;
|
||||
|
||||
int32_t getNumThreadsOutOrQueued();
|
||||
int32_t getNumWriteThreadsOut() ;
|
||||
|
||||
// counts the high/low priority (niceness <= 0) threads
|
||||
//int64_t m_hiLaunched;
|
||||
|
14
Titledb.cpp
14
Titledb.cpp
@ -56,14 +56,14 @@ bool Titledb::init ( ) {
|
||||
// do not use any page cache if doing tmp cluster in order to
|
||||
// prevent swapping
|
||||
if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
|
||||
int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
|
||||
//int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
|
||||
// init the page cache
|
||||
// . MDW: "minimize disk seeks" not working otherwise i'd enable it!
|
||||
if ( ! m_pc.init ( "titledb",
|
||||
RDB_TITLEDB,
|
||||
pcmem ,
|
||||
pageSize ) )
|
||||
return log("db: Titledb init failed.");
|
||||
// if ( ! m_pc.init ( "titledb",
|
||||
// RDB_TITLEDB,
|
||||
// pcmem ,
|
||||
// pageSize ) )
|
||||
// return log("db: Titledb init failed.");
|
||||
|
||||
// each entry in the cache is usually just a single record, no lists
|
||||
//int32_t maxCacheNodes = g_conf.m_titledbMaxCacheMem / (10*1024);
|
||||
@ -89,7 +89,7 @@ bool Titledb::init ( ) {
|
||||
0,//maxCacheNodes ,
|
||||
false ,// half keys?
|
||||
false ,// g_conf.m_titledbSav
|
||||
&m_pc , // page cache ptr
|
||||
NULL,//&m_pc , // page cache ptr
|
||||
true ) )// is titledb?
|
||||
return false;
|
||||
return true;
|
||||
|
@ -160,9 +160,9 @@ class Titledb {
|
||||
// holds binary format title entries
|
||||
Rdb m_rdb;
|
||||
|
||||
DiskPageCache *getDiskPageCache ( ) { return &m_pc; };
|
||||
//DiskPageCache *getDiskPageCache ( ) { return &m_pc; };
|
||||
|
||||
DiskPageCache m_pc;
|
||||
//DiskPageCache m_pc;
|
||||
};
|
||||
|
||||
extern class Titledb g_titledb;
|
||||
|
@ -1533,7 +1533,9 @@ int32_t UdpServer::readSock_ass ( UdpSlot **slotPtr , int64_t now ) {
|
||||
// . msg13 is clogging things up when we synchost a host
|
||||
// and it comes back up
|
||||
// . allow spider compression proxy to have a bunch
|
||||
if ( msgType == 0x13 && m_numUsedSlots > 500 && ! isProxy )
|
||||
// . MDW: do we need this one anymore? relax it a little.
|
||||
if ( msgType == 0x13 && m_numUsedSlotsIncoming>400 &&
|
||||
m_numUsedSlots>800 && !isProxy)
|
||||
getSlot = false;
|
||||
// 2c is clogging crap up
|
||||
if ( msgType == 0x2c && m_msg2csInWaiting >= 100 && niceness )
|
||||
|
620
XmlDoc.cpp
620
XmlDoc.cpp
@ -109,6 +109,7 @@ char *getFirstJSONObject ( char *p ,
|
||||
char *getJSONObjectEnd ( char *p , int32_t niceness ) ;
|
||||
|
||||
XmlDoc::XmlDoc() {
|
||||
m_readThreadOut = false;
|
||||
for ( int32_t i = 0 ; i < MAXMSG7S ; i++ ) m_msg7s[i] = NULL;
|
||||
m_esbuf.setLabel("exputfbuf");
|
||||
for ( int32_t i = 0 ; i < MAX_XML_DOCS ; i++ ) m_xmlDocs[i] = NULL;
|
||||
@ -208,6 +209,10 @@ class XmlDoc *g_xd;
|
||||
|
||||
void XmlDoc::reset ( ) {
|
||||
|
||||
if ( m_readThreadOut )
|
||||
log("build: deleting xmldoc class that has a read thread out "
|
||||
"on a warc file");
|
||||
|
||||
if ( m_fileValid ) {
|
||||
m_file.close();
|
||||
m_file.unlink();
|
||||
@ -221,7 +226,7 @@ void XmlDoc::reset ( ) {
|
||||
if ( ! msg7 ) continue;
|
||||
if(msg7->m_inUse) {
|
||||
log("build: archive: reseting xmldoc when msg7s are outstanding");
|
||||
|
||||
|
||||
}
|
||||
mdelete ( msg7 , sizeof(Msg7) , "xdmsg7" );
|
||||
delete ( msg7 );
|
||||
@ -1253,6 +1258,12 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
|
||||
utf8Content = m_mime.getContent();
|
||||
}
|
||||
|
||||
// use this to avoid ip lookup if it is not zero
|
||||
if ( forcedIp ) {
|
||||
m_ip = forcedIp;
|
||||
m_ipValid = true;
|
||||
}
|
||||
|
||||
// sometimes they supply the content they want! like when zaks'
|
||||
// injects pages from PageInject.cpp
|
||||
if ( utf8Content ) {
|
||||
@ -1285,11 +1296,6 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
|
||||
// use this ip as well for now to avoid ip lookup
|
||||
//m_ip = atoip("127.0.0.1");
|
||||
//m_ipValid = true;
|
||||
// use this to avoid ip lookup if it is not zero
|
||||
if ( forcedIp ) {
|
||||
m_ip = forcedIp;
|
||||
m_ipValid = true;
|
||||
}
|
||||
// do not need robots.txt then
|
||||
m_isAllowed = true;
|
||||
m_isAllowedValid = true;
|
||||
@ -1783,9 +1789,9 @@ bool XmlDoc::set2 ( char *titleRec ,
|
||||
|
||||
// new stuff
|
||||
m_siteNumInlinksValid = true;
|
||||
m_siteNumInlinksUniqueIpValid = true;
|
||||
m_siteNumInlinksUniqueCBlockValid = true;
|
||||
m_siteNumInlinksTotalValid = true;
|
||||
// m_siteNumInlinksUniqueIpValid = true;
|
||||
// m_siteNumInlinksUniqueCBlockValid = true;
|
||||
// m_siteNumInlinksTotalValid = true;
|
||||
//m_sitePopValid = true;
|
||||
m_rootLangIdValid = true;
|
||||
m_hasContactInfoValid = true;
|
||||
@ -3348,9 +3354,21 @@ void doneInjectingArchiveRec ( void *state ) {
|
||||
xd->m_numInjectionsOut--;
|
||||
log("build: archive: injection thread returned. %"INT32" out now.",
|
||||
xd->m_numInjectionsOut);
|
||||
// reset g_errno so it doesn't error out in ::indexDoc() when
|
||||
// we are injecting a ton of these msg7s and then xmldoc ends up
|
||||
// getting reset and when a msg7 reply comes back in, we core
|
||||
g_errno = 0;
|
||||
xd->m_masterLoop ( xd );
|
||||
}
|
||||
|
||||
void doneReadingArchiveFileWrapper ( void *state ) {
|
||||
XmlDoc *THIS = (XmlDoc *)state;
|
||||
// . go back to the main entry function
|
||||
// . make sure g_errno is clear from a msg3a g_errno before calling
|
||||
// this lest it abandon the loop
|
||||
THIS->m_masterLoop ( THIS->m_masterState );
|
||||
}
|
||||
|
||||
|
||||
#define MAXWARCRECSIZE 1000000
|
||||
|
||||
@ -3368,7 +3386,7 @@ bool XmlDoc::indexWarcOrArc ( char ctype ) {
|
||||
// so big we can fit it in memory. just do a wget then gunzip
|
||||
// then open it. use a system call in a thread.
|
||||
int64_t fileSize = -1;
|
||||
File *file = getUtf8ContentInFile( &fileSize );
|
||||
BigFile *file = getUtf8ContentInFile( &fileSize );
|
||||
// return true with g_errno set on error
|
||||
if ( ! file ) {
|
||||
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
||||
@ -3444,7 +3462,37 @@ bool XmlDoc::indexWarcOrArc ( char ctype ) {
|
||||
toRead = fileSize - m_fileOff;
|
||||
m_hasMoreToRead = false;
|
||||
}
|
||||
int32_t bytesRead = file->read (m_fileBuf, toRead, m_fileOff);
|
||||
|
||||
bool status;
|
||||
|
||||
if ( m_readThreadOut ) {
|
||||
m_readThreadOut = false;
|
||||
status = false;
|
||||
goto skipRead;
|
||||
}
|
||||
|
||||
// make a thread to read now
|
||||
status = file->read (m_fileBuf,
|
||||
toRead,
|
||||
m_fileOff,
|
||||
&m_fileState,
|
||||
this,
|
||||
doneReadingArchiveFileWrapper,
|
||||
MAX_NICENESS );
|
||||
|
||||
// if thread was queued or launched, wait for it to come back
|
||||
if ( ! status ) {
|
||||
// set a signal so we do not recall thread
|
||||
// when callback brings us back here
|
||||
m_readThreadOut = true;
|
||||
// wait for callback
|
||||
return false;
|
||||
}
|
||||
|
||||
skipRead:
|
||||
|
||||
int64_t bytesRead = m_fileState.m_bytesDone;
|
||||
|
||||
if ( bytesRead != toRead ) {
|
||||
log("build: read of %s failed at offset "
|
||||
"%"INT64"", file->getFilename(), m_fileOff);
|
||||
@ -5215,9 +5263,9 @@ SafeBuf *XmlDoc::getTitleRecBuf ( ) {
|
||||
*/
|
||||
|
||||
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
||||
if ( ! m_siteNumInlinksUniqueIpValid ) { char *xx=NULL;*xx=0; }
|
||||
if ( ! m_siteNumInlinksUniqueCBlockValid ) { char *xx=NULL;*xx=0; }
|
||||
if ( ! m_siteNumInlinksTotalValid ) { char *xx=NULL;*xx=0; }
|
||||
// if ( ! m_siteNumInlinksUniqueIpValid ) { char *xx=NULL;*xx=0; }
|
||||
// if ( ! m_siteNumInlinksUniqueCBlockValid ) { char *xx=NULL;*xx=0; }
|
||||
// if ( ! m_siteNumInlinksTotalValid ) { char *xx=NULL;*xx=0; }
|
||||
//if ( ! m_sitePopValid ) { char *xx=NULL;*xx=0; }
|
||||
if ( ! m_rootLangIdValid ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
@ -8762,10 +8810,19 @@ Links *XmlDoc::getLinks ( bool doQuickSet ) {
|
||||
return &m_links;
|
||||
}
|
||||
|
||||
CollectionRec *cr = getCollRec();
|
||||
if ( ! cr ) return NULL;
|
||||
bool useRelNoFollow = true;
|
||||
if ( ! cr->m_obeyRelNoFollowLinks ) useRelNoFollow = false;
|
||||
// to keep things simple, for diffbot custom crawls, if robots.txt
|
||||
// is not used then do not use rel no follow
|
||||
if ( ! cr->m_useRobotsTxt && cr->m_isCustomCrawl )
|
||||
useRelNoFollow = false;
|
||||
|
||||
// . set it
|
||||
// . if parent is a permalink we can avoid its suburl outlinks
|
||||
// containing "comment" from being classified as permalinks
|
||||
if ( ! m_links.set ( true , // useRelNoFollow?
|
||||
if ( ! m_links.set ( useRelNoFollow ,
|
||||
xml ,
|
||||
u ,
|
||||
true , // setLinkHashes?
|
||||
@ -10056,6 +10113,10 @@ char *XmlDoc::getIsDup ( ) {
|
||||
// sanity. must be posdb list.
|
||||
if ( ! list->isEmpty() && list->m_ks != 18 ) { char *xx=NULL;*xx=0;}
|
||||
|
||||
// so getSiteRank() does not core
|
||||
int32_t *sni = getSiteNumInlinks();
|
||||
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
|
||||
|
||||
// . see if there are any pages that seem like they are dups of us
|
||||
// . they must also have a HIGHER score than us, for us to be
|
||||
// considered the dup
|
||||
@ -12026,11 +12087,25 @@ XmlDoc **XmlDoc::getRootXmlDoc ( int32_t maxCacheAge ) {
|
||||
mnew ( m_rootDoc , sizeof(XmlDoc),"xmldoc3");
|
||||
// if we had the title rec, set from that
|
||||
if ( *rtr ) {
|
||||
m_rootDoc->set2 ( m_rootTitleRec ,
|
||||
m_rootTitleRecSize , // maxSize ,
|
||||
cr->m_coll ,
|
||||
NULL , // pbuf
|
||||
m_niceness );
|
||||
if ( ! m_rootDoc->set2 ( m_rootTitleRec ,
|
||||
m_rootTitleRecSize , // maxSize ,
|
||||
cr->m_coll ,
|
||||
NULL , // pbuf
|
||||
m_niceness ) ) {
|
||||
// it was corrupted... delete this
|
||||
// possibly printed
|
||||
// " uncompress uncompressed size=..." bad uncompress
|
||||
log("build: rootdoc set2 failed");
|
||||
mdelete ( m_rootDoc , sizeof(XmlDoc) , "xdnuke");
|
||||
delete ( m_rootDoc );
|
||||
// call it empty for now, we don't want to return
|
||||
// NULL with g_errno set because it could stop
|
||||
// the whole indexing pipeline
|
||||
m_rootDoc = NULL;
|
||||
m_rootDocValid = true;
|
||||
return &m_rootDoc;
|
||||
//return NULL;
|
||||
}
|
||||
}
|
||||
// . otherwise, set the url and download it on demand
|
||||
// . this junk copied from the contactDoc->* stuff below
|
||||
@ -13806,7 +13881,7 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
|
||||
|
||||
// hacks of speed. computeSiteNumInlinks is true by default
|
||||
// but if the user turns it off then just use sitelinks.txt
|
||||
if ( ! cr->m_computeSiteNumInlinks ) {
|
||||
if ( cr && ! cr->m_computeSiteNumInlinks ) {
|
||||
int32_t hostHash32 = getHostHash32a();
|
||||
int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
|
||||
// try with www if not there
|
||||
@ -13815,12 +13890,12 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
|
||||
min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
|
||||
}
|
||||
// fix core by setting these
|
||||
m_siteNumInlinksUniqueIp = 0;
|
||||
m_siteNumInlinksUniqueCBlock = 0;
|
||||
m_siteNumInlinksTotal = 0;
|
||||
m_siteNumInlinksUniqueIpValid = true;
|
||||
m_siteNumInlinksUniqueCBlockValid = true;
|
||||
m_siteNumInlinksTotalValid = true;
|
||||
// m_siteNumInlinksUniqueIp = 0;
|
||||
// m_siteNumInlinksUniqueCBlock = 0;
|
||||
// m_siteNumInlinksTotal = 0;
|
||||
// m_siteNumInlinksUniqueIpValid = true;
|
||||
// m_siteNumInlinksUniqueCBlockValid = true;
|
||||
// m_siteNumInlinksTotalValid = true;
|
||||
// and this
|
||||
m_siteNumInlinksValid = true;
|
||||
m_siteNumInlinks = 0;
|
||||
@ -13847,13 +13922,13 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
|
||||
// no site inlinks
|
||||
if ( *ip == 0 ) {
|
||||
m_siteNumInlinks = 0;
|
||||
m_siteNumInlinksUniqueIp = 0;
|
||||
m_siteNumInlinksUniqueCBlock = 0;
|
||||
m_siteNumInlinksTotal = 0;
|
||||
// m_siteNumInlinksUniqueIp = 0;
|
||||
// m_siteNumInlinksUniqueCBlock = 0;
|
||||
// m_siteNumInlinksTotal = 0;
|
||||
m_siteNumInlinksValid = true;
|
||||
m_siteNumInlinksUniqueIpValid = true;
|
||||
m_siteNumInlinksUniqueCBlockValid = true;
|
||||
m_siteNumInlinksTotalValid = true;
|
||||
// m_siteNumInlinksUniqueIpValid = true;
|
||||
// m_siteNumInlinksUniqueCBlockValid = true;
|
||||
// m_siteNumInlinksTotalValid = true;
|
||||
return &m_siteNumInlinks;
|
||||
}
|
||||
|
||||
@ -13940,13 +14015,13 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
|
||||
if ( age > maxAge ) valid = false;
|
||||
}
|
||||
// our companion tags, sitePop and fresh inlinks
|
||||
Tag *tag2 = gr->getTag ( "sitenuminlinksuniqueip" );
|
||||
Tag *tag3 = gr->getTag ( "sitenuminlinksuniquecblock");
|
||||
Tag *tag4 = gr->getTag ( "sitenuminlinkstotal");
|
||||
// Tag *tag2 = gr->getTag ( "sitenuminlinksuniqueip" );
|
||||
// Tag *tag3 = gr->getTag ( "sitenuminlinksuniquecblock");
|
||||
// Tag *tag4 = gr->getTag ( "sitenuminlinkstotal");
|
||||
// if we are missing either of those, invalidate as well
|
||||
if ( ! tag2 ) valid = false;
|
||||
if ( ! tag3 ) valid = false;
|
||||
if ( ! tag4 ) valid = false;
|
||||
// if ( ! tag2 ) valid = false;
|
||||
// if ( ! tag3 ) valid = false;
|
||||
// if ( ! tag4 ) valid = false;
|
||||
// if we have already been through this
|
||||
if ( m_updatingSiteLinkInfoTags ) valid = false;
|
||||
// if rebuilding linkdb assume we have no links to sample from!
|
||||
@ -13959,14 +14034,14 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
|
||||
"age=%"INT32" ns=%"INT32" sni=%"INT32" "
|
||||
"maxage=%"INT32" "
|
||||
"tag=%"PTRFMT" "
|
||||
"tag2=%"PTRFMT" "
|
||||
"tag3=%"PTRFMT" "
|
||||
// "tag2=%"PTRFMT" "
|
||||
// "tag3=%"PTRFMT" "
|
||||
"url=%s",
|
||||
(int32_t)valid,age,ns,sni,
|
||||
maxAge,
|
||||
(PTRTYPE)tag,
|
||||
(PTRTYPE)tag2,
|
||||
(PTRTYPE)tag3,
|
||||
// (PTRTYPE)tag2,
|
||||
// (PTRTYPE)tag3,
|
||||
m_firstUrl.m_url);
|
||||
|
||||
LinkInfo *sinfo = NULL;
|
||||
@ -13979,18 +14054,18 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
|
||||
m_siteNumInlinksValid = true;
|
||||
|
||||
// companion tags
|
||||
if ( tag2 ) {
|
||||
m_siteNumInlinksUniqueIp = atol(tag2->getTagData());
|
||||
m_siteNumInlinksUniqueIpValid = true;
|
||||
}
|
||||
if ( tag3 ) {
|
||||
m_siteNumInlinksUniqueCBlock =atol(tag3->getTagData());
|
||||
m_siteNumInlinksUniqueCBlockValid = true;
|
||||
}
|
||||
if ( tag4 ) {
|
||||
m_siteNumInlinksTotal =atol(tag4->getTagData());
|
||||
m_siteNumInlinksTotalValid = true;
|
||||
}
|
||||
// if ( tag2 ) {
|
||||
// m_siteNumInlinksUniqueIp = atol(tag2->getTagData());
|
||||
// m_siteNumInlinksUniqueIpValid = true;
|
||||
// }
|
||||
// if ( tag3 ) {
|
||||
// m_siteNumInlinksUniqueCBlock =atol(tag3->getTagData());
|
||||
// m_siteNumInlinksUniqueCBlockValid = true;
|
||||
// }
|
||||
// if ( tag4 ) {
|
||||
// m_siteNumInlinksTotal =atol(tag4->getTagData());
|
||||
// m_siteNumInlinksTotalValid = true;
|
||||
// }
|
||||
|
||||
// . consult our sitelinks.txt file
|
||||
// . returns -1 if not found
|
||||
@ -14049,14 +14124,14 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
|
||||
m_siteNumInlinks = (int32_t)sinfo->m_numGoodInlinks;
|
||||
//m_siteNumInlinksFresh = sinfo->m_numInlinksFresh;
|
||||
//m_sitePop = sinfo->m_pagePop;
|
||||
m_siteNumInlinksUniqueIp = sinfo->m_numUniqueIps;
|
||||
m_siteNumInlinksUniqueCBlock = sinfo->m_numUniqueCBlocks;
|
||||
m_siteNumInlinksTotal = sinfo->m_totalInlinkingDocIds;
|
||||
// m_siteNumInlinksUniqueIp = sinfo->m_numUniqueIps;
|
||||
// m_siteNumInlinksUniqueCBlock = sinfo->m_numUniqueCBlocks;
|
||||
// m_siteNumInlinksTotal = sinfo->m_totalInlinkingDocIds;
|
||||
|
||||
m_siteNumInlinksValid = true;
|
||||
m_siteNumInlinksUniqueIpValid = true;
|
||||
m_siteNumInlinksUniqueCBlockValid = true;
|
||||
m_siteNumInlinksTotalValid = true;
|
||||
// m_siteNumInlinksUniqueIpValid = true;
|
||||
// m_siteNumInlinksUniqueCBlockValid = true;
|
||||
// m_siteNumInlinksTotalValid = true;
|
||||
|
||||
|
||||
updateToMin:
|
||||
@ -15563,10 +15638,11 @@ void gotDiffbotReplyWrapper ( void *state , TcpSocket *s ) {
|
||||
THIS->m_diffbotReplyError = code;
|
||||
}
|
||||
// a hack for detecting if token is expired
|
||||
if ( ! ttt && cr && strstr ( page , ":429}" ) ) {
|
||||
if ( THIS->m_diffbotReplyError == EDIFFBOTTOKENEXPIRED ) {
|
||||
// note it
|
||||
log("xmldoc: pausing crawl %s (%"INT32") because "
|
||||
"token is expired",cr->m_coll,(int32_t)cr->m_collnum);
|
||||
"token is expired",cr->m_coll,
|
||||
(int32_t)cr->m_collnum);
|
||||
// pause the crawl
|
||||
SafeBuf parmList;
|
||||
// spidering enabled is the "cse" cgi parm in Parms.cpp
|
||||
@ -16488,7 +16564,13 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
|
||||
// go through gb. we should fix that by downloading the whole page
|
||||
// ourselves and sending it back, and tell diffbot's phantomjs not
|
||||
// to do the certificate check.
|
||||
useProxies = false;
|
||||
//
|
||||
// for now, allow http and NOT https urls through though.
|
||||
// TODO: if the url redirects to an https url will this mess us up?
|
||||
if ( ! m_firstUrlValid )
|
||||
useProxies = false;
|
||||
if ( m_firstUrlValid && m_firstUrl.isHttps() )
|
||||
useProxies = false;
|
||||
|
||||
// if we used a proxy to download the doc, then diffbot should too
|
||||
// BUT tell diffbot to go through host #0 so we can send it to the
|
||||
@ -17763,6 +17845,91 @@ Url **XmlDoc::getCanonicalRedirUrl ( ) {
|
||||
return &m_canonicalRedirUrlPtr;
|
||||
}
|
||||
|
||||
// returns false if none found
|
||||
bool setMetaRedirUrlFromTag ( char *p , Url *metaRedirUrl , char niceness ,
|
||||
Url *cu ) {
|
||||
// limit scan
|
||||
char *limit = p + 30;
|
||||
// skip whitespace
|
||||
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
|
||||
// must be a num
|
||||
if ( ! is_digit(*p) ) return false;
|
||||
// init delay
|
||||
int32_t delay = atol ( p );
|
||||
// ignore long delays
|
||||
if ( delay >= 10 ) return false;
|
||||
// now find the semicolon, if any
|
||||
for ( ; *p && p < limit && *p != ';' ; p++ );
|
||||
// must have semicolon
|
||||
if ( *p != ';' ) return false;
|
||||
// skip it
|
||||
p++;
|
||||
// skip whitespace some more
|
||||
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
|
||||
// must have URL
|
||||
if ( strncasecmp(p,"URL",3) ) return false;
|
||||
// skip that
|
||||
p += 3;
|
||||
// skip white space
|
||||
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
|
||||
// then an equal sign
|
||||
if ( *p != '=' ) return false;
|
||||
// skip equal sign
|
||||
p++;
|
||||
// them maybe more whitespace
|
||||
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
|
||||
// an optional quote
|
||||
if ( *p == '\"' ) p++;
|
||||
// can also be a single quote!
|
||||
if ( *p == '\'' ) p++;
|
||||
// set the url start
|
||||
char *url = p;
|
||||
// now advance to next quote or space or >
|
||||
for ( ; *p && !is_wspace_a(*p) &&
|
||||
*p !='\'' &&
|
||||
*p !='\"' &&
|
||||
*p !='>' ;
|
||||
p++);
|
||||
// that is the end
|
||||
char *urlEnd = p;
|
||||
// get size
|
||||
int32_t usize = urlEnd - url;
|
||||
// skip if too big
|
||||
if ( usize > 1024 ) {
|
||||
log("build: meta redirurl of %"INT32" bytes too big",usize);
|
||||
return false;
|
||||
}
|
||||
// get our current utl
|
||||
//Url *cu = getCurrentUrl();
|
||||
// decode what we got
|
||||
char decoded[MAX_URL_LEN];
|
||||
// convert & to "&"
|
||||
int32_t decBytes = htmlDecode(decoded,url,usize,false,niceness);
|
||||
decoded[decBytes]='\0';
|
||||
// . then the url
|
||||
// . set the url to the one in the redirect tag
|
||||
// . but if the http-equiv meta redirect url starts with a '?'
|
||||
// then just replace our cgi with that one
|
||||
if ( *url == '?' ) {
|
||||
char foob[MAX_URL_LEN*2];
|
||||
char *pf = foob;
|
||||
int32_t cuBytes = cu->getPathEnd() - cu->getUrl();
|
||||
gbmemcpy(foob,cu->getUrl(),cuBytes);
|
||||
pf += cuBytes;
|
||||
gbmemcpy ( pf , decoded , decBytes );
|
||||
pf += decBytes;
|
||||
*pf = '\0';
|
||||
metaRedirUrl->set(foob);
|
||||
}
|
||||
// . otherwise, append it right on
|
||||
// . use "url" as the base Url
|
||||
// . it may be the original url or the one we redirected to
|
||||
// . redirUrl is set to the original at the top
|
||||
else
|
||||
// addWWW = false, stripSessId=true
|
||||
metaRedirUrl->set(cu,decoded,decBytes,false,true);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// scan document for <meta http-equiv="refresh" content="0;URL=xxx">
|
||||
@ -17789,6 +17956,14 @@ Url **XmlDoc::getMetaRedirUrl ( ) {
|
||||
if ( cr->m_recycleContent || m_recycleContent )
|
||||
return &m_metaRedirUrlPtr;
|
||||
|
||||
// will this work in here?
|
||||
//uint8_t *ct = getContentType();
|
||||
//if ( ! ct ) return NULL;
|
||||
|
||||
Url *cu = getCurrentUrl();
|
||||
|
||||
bool gotOne = false;
|
||||
|
||||
// advance a bit, we are initially looking for the 'v' char
|
||||
p += 10;
|
||||
// begin the string matching loop
|
||||
@ -17828,91 +18003,64 @@ Url **XmlDoc::getMetaRedirUrl ( ) {
|
||||
p += 8;
|
||||
// skip possible quote
|
||||
if ( *p == '\"' ) p++;
|
||||
// limit scan
|
||||
limit = p + 30;
|
||||
// skip whitespace
|
||||
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
|
||||
// must be a num
|
||||
if ( ! is_digit(*p) ) continue;
|
||||
// init delay
|
||||
int32_t delay = atol ( p );
|
||||
// ignore long delays
|
||||
if ( delay >= 10 ) continue;
|
||||
// now find the semicolon, if any
|
||||
for ( ; *p && p < limit && *p != ';' ; p++ );
|
||||
// must have semicolon
|
||||
if ( *p != ';' ) continue;
|
||||
// skip it
|
||||
p++;
|
||||
// skip whitespace some more
|
||||
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
|
||||
// must have URL
|
||||
if ( strncasecmp(p,"URL",3) ) continue;
|
||||
// skip that
|
||||
p += 3;
|
||||
// skip white space
|
||||
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
|
||||
// then an equal sign
|
||||
if ( *p != '=' ) continue;
|
||||
// skip equal sign
|
||||
p++;
|
||||
// then maybe more whitespace
|
||||
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
|
||||
// an optional quote
|
||||
if ( *p == '\"' ) p++;
|
||||
// can also be a single quote!
|
||||
if ( *p == '\'' ) p++;
|
||||
// set the url start
|
||||
char *url = p;
|
||||
// now advance to next quote or space or >
|
||||
for ( ; *p && !is_wspace_a(*p) &&
|
||||
*p !='\'' &&
|
||||
*p !='\"' &&
|
||||
*p !='>' ;
|
||||
p++);
|
||||
// that is the end
|
||||
char *urlEnd = p;
|
||||
// get size
|
||||
int32_t usize = urlEnd - url;
|
||||
// skip if too big
|
||||
if ( usize > 1024 ) {
|
||||
log("build: meta redirurl of %"INT32" bytes too big",usize);
|
||||
// PARSE OUT THE URL
|
||||
Url dummy;
|
||||
if ( ! setMetaRedirUrlFromTag ( p , &dummy , m_niceness ,cu))
|
||||
continue;
|
||||
gotOne = true;
|
||||
break;
|
||||
}
|
||||
|
||||
if ( ! gotOne )
|
||||
return &m_metaRedirUrlPtr;
|
||||
|
||||
// to fix issue with scripts containing
|
||||
// document.write('<meta http-equiv="Refresh" content="0;URL=http://ww
|
||||
// we have to get the Xml. we can't call getXml() because of
|
||||
// recursion bugs so just do it directly here
|
||||
|
||||
Xml xml;
|
||||
if ( ! xml.set ( m_httpReply ,
|
||||
m_httpReplySize - 1, // make it a length
|
||||
false , // ownData?
|
||||
0 , // allocSize
|
||||
false , // pure xml?
|
||||
m_version ,
|
||||
false , // setParentsArg?
|
||||
m_niceness ,
|
||||
// assume html since getContentType() is recursive
|
||||
// on us.
|
||||
CT_HTML ) ) // *ct ) )
|
||||
// return NULL on error with g_errno set
|
||||
return NULL;
|
||||
|
||||
XmlNode *nodes = xml.getNodes();
|
||||
int32_t n = xml.getNumNodes();
|
||||
// find the first meta summary node
|
||||
for ( int32_t i = 0 ; i < n ; i++ ) {
|
||||
// continue if not a meta tag
|
||||
if ( nodes[i].m_nodeId != 68 ) continue;
|
||||
// only get content for <meta http-equiv=..>
|
||||
int32_t tagLen;
|
||||
char *tag ;
|
||||
tag = xml.getString ( i , "http-equiv" , &tagLen );
|
||||
// skip if empty
|
||||
if ( ! tag || tagLen <= 0 ) continue;
|
||||
// if not a refresh, skip it
|
||||
if ( strncasecmp ( tag , "refresh", 7 ) ) continue;
|
||||
// get the content
|
||||
tag = xml.getString ( i ,"content", &tagLen );
|
||||
// skip if empty
|
||||
if ( ! tag || tagLen <= 0 ) continue;
|
||||
// PARSE OUT THE URL
|
||||
if (!setMetaRedirUrlFromTag(p,&m_metaRedirUrl,m_niceness,cu) )
|
||||
continue;
|
||||
}
|
||||
// get our current url
|
||||
Url *cu = getCurrentUrl();
|
||||
// decode what we got
|
||||
char decoded[MAX_URL_LEN];
|
||||
// convert "&amp;" to "&"
|
||||
int32_t decBytes = htmlDecode(decoded,url,usize,false,m_niceness);
|
||||
decoded[decBytes]='\0';
|
||||
// . then the url
|
||||
// . set the url to the one in the redirect tag
|
||||
// . but if the http-equiv meta redirect url starts with a '?'
|
||||
// then just replace our cgi with that one
|
||||
if ( *url == '?' ) {
|
||||
char foob[MAX_URL_LEN*2];
|
||||
char *pf = foob;
|
||||
int32_t cuBytes = cu->getPathEnd() - cu->getUrl();
|
||||
gbmemcpy(foob,cu->getUrl(),cuBytes);
|
||||
pf += cuBytes;
|
||||
gbmemcpy ( pf , decoded , decBytes );
|
||||
pf += decBytes;
|
||||
*pf = '\0';
|
||||
m_metaRedirUrl.set(foob);
|
||||
}
|
||||
// . otherwise, append it right on
|
||||
// . use "url" as the base Url
|
||||
// . it may be the original url or the one we redirected to
|
||||
// . redirUrl is set to the original at the top
|
||||
else
|
||||
// addWWW = false, stripSessId=true
|
||||
m_metaRedirUrl.set(cu,decoded,decBytes,false,true);
|
||||
// set it
|
||||
m_metaRedirUrlPtr = &m_metaRedirUrl;
|
||||
// return it
|
||||
break;
|
||||
return &m_metaRedirUrlPtr;
|
||||
}
|
||||
|
||||
// nothing found
|
||||
return &m_metaRedirUrlPtr;
|
||||
}
|
||||
@ -19086,6 +19234,9 @@ char **XmlDoc::getExpandedUtf8Content ( ) {
|
||||
// <iframe src=""> which ends up embedding the root url.
|
||||
if ( urlLen == 0 )
|
||||
continue;
|
||||
// skip if "about:blank"
|
||||
if ( urlLen==11 && strncmp(url,"about:blank",11) == 0 )
|
||||
continue;
|
||||
// get our current url
|
||||
//cu = getCurrentUrl();
|
||||
// set our frame url
|
||||
@ -19291,7 +19442,7 @@ void systemDoneWrapper ( void *state , ThreadEntry *t ) {
|
||||
}
|
||||
|
||||
// we download large files to a file on disk, like warcs and arcs
|
||||
File *XmlDoc::getUtf8ContentInFile ( int64_t *fileSizeArg ) {
|
||||
BigFile *XmlDoc::getUtf8ContentInFile ( int64_t *fileSizeArg ) {
|
||||
|
||||
if ( m_fileValid ) {
|
||||
*fileSizeArg = m_fileSize;
|
||||
@ -19305,15 +19456,17 @@ File *XmlDoc::getUtf8ContentInFile ( int64_t *fileSizeArg ) {
|
||||
char filename[2048];
|
||||
snprintf ( filename,
|
||||
2048,
|
||||
"%sgbarchivefile%"UINT32"",
|
||||
g_hostdb.m_dir,
|
||||
"gbarchivefile%"UINT32"",
|
||||
(int32_t)(int64_t)this);
|
||||
|
||||
m_file.set ( filename );
|
||||
m_file.set ( g_hostdb.m_dir , filename );
|
||||
m_fileSize = m_file.getFileSize();
|
||||
m_fileValid = true;
|
||||
*fileSizeArg = m_fileSize;
|
||||
m_file.open(O_RDONLY);
|
||||
// explicitly set it to false now to make it harder for
|
||||
// it not to be true because that messes things up
|
||||
m_file.m_usePartFiles = false;
|
||||
return &m_file;
|
||||
}
|
||||
|
||||
@ -19401,7 +19554,7 @@ File *XmlDoc::getUtf8ContentInFile ( int64_t *fileSizeArg ) {
|
||||
systemDoneWrapper ,
|
||||
systemStartWrapper_r ) )
|
||||
// would block, wait for thread
|
||||
return (File *)-1;
|
||||
return (BigFile *)-1;
|
||||
// failed?
|
||||
log("build: failed to launch wget thread");
|
||||
// If we run it in this thread then if we are fetching
|
||||
@ -21445,12 +21598,13 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
|
||||
//
|
||||
// print # of link texts from 2nd coll
|
||||
//
|
||||
if ( m_linkInfo2Valid ) {
|
||||
LinkInfo *info = ptr_linkInfo2;
|
||||
int32_t nt = 0;
|
||||
if ( info ) nt = info->getNumLinkTexts();
|
||||
if ( nt ) sb->safePrintf("goodinlinks2=%"INT32" ",nt );
|
||||
}
|
||||
// this is not used for what it was used for.
|
||||
// if ( m_linkInfo2Valid && size_linkInfo2 > 4 ) {
|
||||
// LinkInfo *info = ptr_linkInfo2;
|
||||
// int32_t nt = 0;
|
||||
// if ( info ) nt = info->getNumLinkTexts();
|
||||
// if ( nt ) sb->safePrintf("goodinlinks2=%"INT32" ",nt );
|
||||
// }
|
||||
|
||||
if ( m_docIdValid )
|
||||
sb->safePrintf("docid=%"UINT64" ",m_docId);
|
||||
@ -25928,18 +26082,18 @@ void XmlDoc::copyFromOldDoc ( XmlDoc *od ) {
|
||||
m_ip = od->m_ip;
|
||||
m_ipValid = true;
|
||||
m_siteNumInlinks = od->m_siteNumInlinks;
|
||||
m_siteNumInlinksUniqueIp = od->m_siteNumInlinksUniqueIp;
|
||||
m_siteNumInlinksUniqueCBlock= od->m_siteNumInlinksUniqueCBlock;
|
||||
m_siteNumInlinksTotal = od->m_siteNumInlinksTotal;
|
||||
// m_siteNumInlinksUniqueIp = od->m_siteNumInlinksUniqueIp;
|
||||
// m_siteNumInlinksUniqueCBlock= od->m_siteNumInlinksUniqueCBlo
|
||||
// m_siteNumInlinksTotal = od->m_siteNumInlinksTotal;
|
||||
|
||||
m_siteNumInlinksValid =
|
||||
od->m_siteNumInlinksValid;
|
||||
m_siteNumInlinksUniqueIpValid =
|
||||
od->m_siteNumInlinksUniqueIpValid;
|
||||
m_siteNumInlinksUniqueCBlockValid =
|
||||
od->m_siteNumInlinksUniqueCBlockValid;
|
||||
m_siteNumInlinksTotal =
|
||||
od->m_siteNumInlinksTotalValid;
|
||||
// m_siteNumInlinksUniqueIpValid =
|
||||
// od->m_siteNumInlinksUniqueIpValid;
|
||||
// m_siteNumInlinksUniqueCBlockValid =
|
||||
// od->m_siteNumInlinksUniqueCBlockValid;
|
||||
// m_siteNumInlinksTotal =
|
||||
// od->m_siteNumInlinksTotalValid;
|
||||
}
|
||||
|
||||
m_indexCode = 0;//od->m_indexCode;
|
||||
@ -32238,8 +32392,13 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
|
||||
}
|
||||
|
||||
|
||||
TagRec *gr = getTagRec();
|
||||
if ( ! gr || gr == (void *)-1 ) return (Msg20Reply *)gr;
|
||||
// if we are showing sites that have been banned in tagdb, we dont
|
||||
// have to do a tagdb lookup. that should speed things up.
|
||||
TagRec *gr = NULL;
|
||||
if ( cr && cr->m_doTagdbLookups ) {
|
||||
gr = getTagRec();
|
||||
if ( ! gr || gr == (void *)-1 ) return (Msg20Reply *)gr;
|
||||
}
|
||||
|
||||
//reply-> ptr_tagRec = (char *)gr;
|
||||
//reply->size_tagRec = gr->getSize();
|
||||
@ -32319,7 +32478,8 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
|
||||
if ( cr->m_forceDelete[ufn] ) pr = -3;
|
||||
|
||||
// this is an automatic ban!
|
||||
if ( gr->getLong("manualban",0))pr=-3;//SPIDER_PRIORITY_BANNED;
|
||||
if ( gr && gr->getLong("manualban",0))
|
||||
pr=-3;//SPIDER_PRIORITY_BANNED;
|
||||
|
||||
// is it banned
|
||||
if ( pr == -3 ) { // SPIDER_PRIORITY_BANNED ) { // -2
|
||||
@ -32754,9 +32914,9 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
|
||||
//if ( tag1 ) sni = atol(tag1->m_data);
|
||||
//if ( tag2 ) spop = atol(tag2->m_data);
|
||||
reply->m_siteNumInlinks = m_siteNumInlinks;
|
||||
reply->m_siteNumInlinksTotal = m_siteNumInlinksTotal;
|
||||
reply->m_siteNumUniqueIps = m_siteNumInlinksUniqueIp;
|
||||
reply->m_siteNumUniqueCBlocks = m_siteNumInlinksUniqueCBlock;
|
||||
//reply->m_siteNumInlinksTotal = m_siteNumInlinksTotal;
|
||||
//reply->m_siteNumUniqueIps = m_siteNumInlinksUniqueIp;
|
||||
//reply->m_siteNumUniqueCBlocks = m_siteNumInlinksUniqueCBlock;
|
||||
//reply->m_sitePop = m_sitePop;
|
||||
|
||||
// . get stuff from link info
|
||||
@ -38205,25 +38365,25 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
"<tr><td><b>good inlinks to site</b>"
|
||||
"</td><td>%"INT32"</td></tr>\n"
|
||||
|
||||
"<tr><td>unique IP inlinks to site"
|
||||
"</td><td>%"INT32"</td></tr>\n"
|
||||
// "<tr><td>unique IP inlinks to site"
|
||||
// "</td><td>%"INT32"</td></tr>\n"
|
||||
|
||||
"<tr><td>unique CBlock inlinks to site"
|
||||
"</td><td>%"INT32"</td></tr>\n"
|
||||
// "<tr><td>unique CBlock inlinks to site"
|
||||
// "</td><td>%"INT32"</td></tr>\n"
|
||||
|
||||
"<tr><td><b>site rank</b></td><td>%"INT32"</td></tr>\n"
|
||||
|
||||
"<tr><td>good inlinks to page"
|
||||
"</td><td>%"INT32"</td></tr>\n"
|
||||
|
||||
"<tr><td>unique IP inlinks to page"
|
||||
"</td><td>%"INT32"</td></tr>\n"
|
||||
// "<tr><td>unique IP inlinks to page"
|
||||
// "</td><td>%"INT32"</td></tr>\n"
|
||||
|
||||
"<tr><td>unique CBlock inlinks to page"
|
||||
"</td><td>%"INT32"</td></tr>\n"
|
||||
// "<tr><td>unique CBlock inlinks to page"
|
||||
// "</td><td>%"INT32"</td></tr>\n"
|
||||
|
||||
"<tr><td>total inlinks to page"
|
||||
"</td><td>%"INT32"</td></tr>\n"
|
||||
// "<tr><td>total inlinks to page"
|
||||
// "</td><td>%"INT32"</td></tr>\n"
|
||||
|
||||
"<tr><td><nobr>page inlinks last computed</nobr></td>"
|
||||
"<td>%s</td></tr>\n"
|
||||
@ -38243,14 +38403,14 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
strLanguage,
|
||||
g_countryCode.getName(m_countryId) ,
|
||||
sni,
|
||||
m_siteNumInlinksUniqueIp,
|
||||
m_siteNumInlinksUniqueCBlock,
|
||||
//m_siteNumInlinksUniqueIp,
|
||||
//m_siteNumInlinksUniqueCBlock,
|
||||
::getSiteRank(sni),
|
||||
//info1->getNumTotalInlinks(),
|
||||
info1->getNumGoodInlinks(),
|
||||
info1->m_numUniqueIps,
|
||||
info1->m_numUniqueCBlocks,
|
||||
info1->m_totalInlinkingDocIds,
|
||||
// info1->m_numUniqueIps,
|
||||
// info1->m_numUniqueCBlocks,
|
||||
// info1->m_totalInlinkingDocIds,
|
||||
|
||||
tmp3
|
||||
);
|
||||
@ -38262,26 +38422,26 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
"\t<siteRank>%"INT32"</siteRank>\n"
|
||||
|
||||
"\t<numGoodSiteInlinks>%"INT32"</numGoodSiteInlinks>\n"
|
||||
"\t<numTotalSiteInlinks>%"INT32"</numTotalSiteInlinks>\n"
|
||||
"\t<numUniqueIpsLinkingToSite>%"INT32""
|
||||
"</numUniqueIpsLinkingToSite>\n"
|
||||
"\t<numUniqueCBlocksLinkingToSite>%"INT32""
|
||||
"</numUniqueCBlocksLinkingToSite>\n"
|
||||
//"\t<numTotalSiteInlinks>%"INT32"</numTotalSiteInlinks>\n"
|
||||
// "\t<numUniqueIpsLinkingToSite>%"INT32""
|
||||
// "</numUniqueIpsLinkingToSite>\n"
|
||||
// "\t<numUniqueCBlocksLinkingToSite>%"INT32""
|
||||
// "</numUniqueCBlocksLinkingToSite>\n"
|
||||
|
||||
|
||||
|
||||
|
||||
// how many inlinks, external and internal, we have
|
||||
// to this page not filtered in any way!!!
|
||||
"\t<numTotalPageInlinks>%"INT32"</numTotalPageInlinks>\n"
|
||||
//"\t<numTotalPageInlinks>%"INT32"</numTotalPageInlinks>\n"
|
||||
// how many inlinking ips we got, including our own if
|
||||
// we link to ourself
|
||||
"\t<numUniqueIpsLinkingToPage>%"INT32""
|
||||
"</numUniqueIpsLinkingToPage>\n"
|
||||
// "\t<numUniqueIpsLinkingToPage>%"INT32""
|
||||
// "</numUniqueIpsLinkingToPage>\n"
|
||||
// how many inlinking cblocks we got, including our own
|
||||
// if we link to ourself
|
||||
"\t<numUniqueCBlocksLinkingToPage>%"INT32""
|
||||
"</numUniqueCBlocksLinkingToPage>\n"
|
||||
// "\t<numUniqueCBlocksLinkingToPage>%"INT32""
|
||||
// "</numUniqueCBlocksLinkingToPage>\n"
|
||||
|
||||
|
||||
"\t<numGoodPageInlinks>%"INT32"</numGoodPageInlinks>\n"
|
||||
@ -38293,13 +38453,13 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
,(int32_t)m_isLinkSpam
|
||||
,::getSiteRank(sni)
|
||||
,sni
|
||||
,m_siteNumInlinksTotal
|
||||
,m_siteNumInlinksUniqueIp
|
||||
,m_siteNumInlinksUniqueCBlock
|
||||
// ,m_siteNumInlinksTotal
|
||||
// ,m_siteNumInlinksUniqueIp
|
||||
// ,m_siteNumInlinksUniqueCBlock
|
||||
|
||||
,info1->m_totalInlinkingDocIds
|
||||
,info1->m_numUniqueIps
|
||||
,info1->m_numUniqueCBlocks
|
||||
//,info1->m_totalInlinkingDocIds
|
||||
//,info1->m_numUniqueIps
|
||||
//,info1->m_numUniqueCBlocks
|
||||
|
||||
,info1->getNumGoodInlinks()
|
||||
//,tmp3
|
||||
@ -39312,6 +39472,12 @@ char **XmlDoc::getRootTitleBuf ( ) {
|
||||
// sanity check, must include the null in the size
|
||||
if ( m_rootTitleBufSize > 0 &&
|
||||
m_rootTitleBuf [ m_rootTitleBufSize - 1 ] ) {
|
||||
log("build: bad root titlebuf size not end in null char for "
|
||||
"collnum=%i",(int)m_collnum);
|
||||
ptr_rootTitleBuf = NULL;
|
||||
size_rootTitleBuf = 0;
|
||||
m_rootTitleBufValid = true;
|
||||
return (char **)&m_rootTitleBuf;
|
||||
char *xx=NULL;*xx=0;
|
||||
//m_rootTitleBuf [ m_rootTitleBufSize - 1 ] = '\0';
|
||||
//m_rootTitleBufSize++;
|
||||
@ -39931,7 +40097,7 @@ SafeBuf *XmlDoc::getNewTagBuf ( ) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
int32_t old2, old3, old4;
|
||||
//int32_t old2, old3, old4;
|
||||
|
||||
// if running for diffbot crawlbot then isCustomCrawl is true
|
||||
// so do not update the siteinlink info already in tagdb since i
|
||||
@ -39944,31 +40110,31 @@ SafeBuf *XmlDoc::getNewTagBuf ( ) {
|
||||
//if ( strcmp(cr->m_coll,"GLOBAL-INDEX") == 0 ) ) goto skipSiteInlinks;
|
||||
|
||||
// sitenuminlinksfresh
|
||||
old2 = gr->getLong("sitenuminlinksuniqueip",-1,NULL,×tamp);
|
||||
if ( old2 == -1 || old2 != m_siteNumInlinksUniqueIp ||
|
||||
m_updatingSiteLinkInfoTags )
|
||||
if ( ! tbuf->addTag2(mysite,"sitenuminlinksuniqueip",
|
||||
now,"xmldoc",
|
||||
*ip,m_siteNumInlinksUniqueIp,rdbId))
|
||||
return NULL;
|
||||
// sitepop
|
||||
old3 = gr->getLong("sitenuminlinksuniquecblock",-1,NULL,
|
||||
×tamp);
|
||||
if ( old3 == -1 || old3 != m_siteNumInlinksUniqueCBlock ||
|
||||
m_updatingSiteLinkInfoTags )
|
||||
if ( ! tbuf->addTag2(mysite,"sitenuminlinksuniquecblock",
|
||||
now,"xmldoc",
|
||||
*ip,m_siteNumInlinksUniqueCBlock,rdbId))
|
||||
return NULL;
|
||||
// total site inlinks
|
||||
old4 = gr->getLong("sitenuminlinkstotal",-1,NULL,
|
||||
×tamp);
|
||||
if ( old4 == -1 || old4 != m_siteNumInlinksTotal ||
|
||||
m_updatingSiteLinkInfoTags )
|
||||
if ( ! tbuf->addTag2(mysite,"sitenuminlinkstotal",
|
||||
now,"xmldoc",
|
||||
*ip,m_siteNumInlinksTotal,rdbId))
|
||||
return NULL;
|
||||
// old2 = gr->getLong("sitenuminlinksuniqueip",-1,NULL,&timestamp);
|
||||
// if ( old2 == -1 || old2 != m_siteNumInlinksUniqueIp ||
|
||||
// m_updatingSiteLinkInfoTags )
|
||||
// if ( ! tbuf->addTag2(mysite,"sitenuminlinksuniqueip",
|
||||
// now,"xmldoc",
|
||||
// *ip,m_siteNumInlinksUniqueIp,rdbId))
|
||||
// return NULL;
|
||||
// // sitepop
|
||||
// old3 = gr->getLong("sitenuminlinksuniquecblock",-1,NULL,
|
||||
// &timestamp);
|
||||
// if ( old3 == -1 || old3 != m_siteNumInlinksUniqueCBlock ||
|
||||
// m_updatingSiteLinkInfoTags )
|
||||
// if ( ! tbuf->addTag2(mysite,"sitenuminlinksuniquecblock",
|
||||
// now,"xmldoc",
|
||||
// *ip,m_siteNumInlinksUniqueCBlock,rdbId))
|
||||
// return NULL;
|
||||
// // total site inlinks
|
||||
// old4 = gr->getLong("sitenuminlinkstotal",-1,NULL,
|
||||
// &timestamp);
|
||||
// if ( old4 == -1 || old4 != m_siteNumInlinksTotal ||
|
||||
// m_updatingSiteLinkInfoTags )
|
||||
// if ( ! tbuf->addTag2(mysite,"sitenuminlinkstotal",
|
||||
// now,"xmldoc",
|
||||
// *ip,m_siteNumInlinksTotal,rdbId))
|
||||
// return NULL;
|
||||
|
||||
// skipSiteInlinks:
|
||||
|
||||
|
24
XmlDoc.h
24
XmlDoc.h
@ -280,8 +280,10 @@ class XmlDoc {
|
||||
// this is a hash of all adjacent tag pairs for templated identification
|
||||
uint32_t m_tagPairHash32;
|
||||
int32_t m_siteNumInlinks;
|
||||
int32_t m_siteNumInlinksUniqueIp; // m_siteNumInlinksFresh
|
||||
int32_t m_siteNumInlinksUniqueCBlock; // m_sitePop;
|
||||
//int32_t m_siteNumInlinksUniqueIp; // m_siteNumInlinksFresh
|
||||
//int32_t m_siteNumInlinksUniqueCBlock; // m_sitePop;
|
||||
int32_t m_reserved1;
|
||||
int32_t m_reserved2;
|
||||
uint32_t m_spideredTime; // time_t
|
||||
// just don't throw away any relevant SpiderRequests and we have
|
||||
// the data that m_minPubDate and m_maxPubDate provided
|
||||
@ -297,7 +299,8 @@ class XmlDoc {
|
||||
uint16_t m_countryId;
|
||||
//uint16_t m_reserved1;//titleWeight;
|
||||
//uint16_t m_reserved2;//headerWeight;
|
||||
int32_t m_siteNumInlinksTotal;
|
||||
//int32_t m_siteNumInlinksTotal;
|
||||
int32_t m_reserved3;
|
||||
//uint16_t m_reserved3;//urlPathWeight;
|
||||
uint8_t m_metaListCheckSum8; // bring it back!!
|
||||
char m_reserved3b;
|
||||
@ -702,7 +705,7 @@ class XmlDoc {
|
||||
char **getExpandedUtf8Content ( ) ;
|
||||
char **getUtf8Content ( ) ;
|
||||
// we download large files to a file on disk, like warcs and arcs
|
||||
File *getUtf8ContentInFile ( int64_t *fileSizeArg );
|
||||
BigFile *getUtf8ContentInFile ( int64_t *fileSizeArg );
|
||||
int32_t *getContentHash32 ( ) ;
|
||||
int32_t *getContentHashJson32 ( ) ;
|
||||
//int32_t *getTagHash32 ( ) ;
|
||||
@ -1008,6 +1011,9 @@ class XmlDoc {
|
||||
int64_t m_startTime;
|
||||
int64_t m_injectStartTime;
|
||||
|
||||
class XmlDoc *m_prevInject;
|
||||
class XmlDoc *m_nextInject;
|
||||
|
||||
// when set() was called by Msg20.cpp so we can time how long it took
|
||||
// to generate the summary
|
||||
int64_t m_setTime;
|
||||
@ -1084,8 +1090,10 @@ class XmlDoc {
|
||||
int32_t m_fileBufAllocSize;
|
||||
char *m_fptr ;
|
||||
char *m_fptrEnd ;
|
||||
File m_file;
|
||||
BigFile m_file;
|
||||
int64_t m_fileSize;
|
||||
FileState m_fileState;
|
||||
bool m_readThreadOut;
|
||||
bool m_hasMoreToRead;
|
||||
int32_t m_numInjectionsOut;
|
||||
bool m_calledWgetThread;
|
||||
@ -1432,9 +1440,9 @@ class XmlDoc {
|
||||
//bool m_aboutUsLinkValid;
|
||||
//bool m_contactLinksValid;
|
||||
bool m_siteNumInlinksValid;
|
||||
bool m_siteNumInlinksUniqueIpValid;//FreshValid;
|
||||
bool m_siteNumInlinksUniqueCBlockValid;//sitePopValid
|
||||
bool m_siteNumInlinksTotalValid;
|
||||
//bool m_siteNumInlinksUniqueIpValid;//FreshValid;
|
||||
//bool m_siteNumInlinksUniqueCBlockValid;//sitePopValid
|
||||
//bool m_siteNumInlinksTotalValid;
|
||||
bool m_siteNumInlinks8Valid;
|
||||
bool m_siteLinkInfoValid;
|
||||
bool m_isWWWDupValid;
|
||||
|
12
blaster2.cpp
12
blaster2.cpp
@ -18,6 +18,8 @@ static void sleepWrapper ( int fd , void *state ) ;
|
||||
|
||||
// Stub definition of sendPageSEO: ignores both arguments and always
// returns true.
// NOTE(review): presumably present only so this standalone tool links
// without the real SEO page handler -- confirm against the build.
bool sendPageSEO(TcpSocket *s, HttpRequest *hr) {return true;}
|
||||
bool g_recoveryMode;
|
||||
int g_inMemcpy;
|
||||
int32_t g_recoveryLevel;
|
||||
|
||||
static int32_t s_maxNumThreads = 1 ;
|
||||
static int32_t s_launched = 0;
|
||||
@ -48,7 +50,7 @@ int main ( int argc , char *argv[] ) {
|
||||
if ( setrlimit(RLIMIT_CORE,&lim) )
|
||||
log("blaster::setrlimit: %s", mstrerror(errno) );
|
||||
|
||||
g_conf.m_maxMem = 500000000;
|
||||
//g_conf.m_maxMem = 500000000;
|
||||
|
||||
// init our table for doing zobrist hashing
|
||||
if ( ! hashinit() ) {
|
||||
@ -57,7 +59,7 @@ int main ( int argc , char *argv[] ) {
|
||||
// init the memory class after conf since it gets maxMem from Conf
|
||||
//if ( ! g_mem.init ( 20000000 ) ) {
|
||||
// log("blaster::Mem init failed" ); return 1; }
|
||||
g_mem.m_maxMem = 200000000;
|
||||
//g_mem.m_maxMem = 200000000;
|
||||
// start up log file
|
||||
if ( ! g_log.init( "/tmp/blasterLog" ) ) {
|
||||
log("blaster::Log open /tmp/blasterLog failed" ); return 1; }
|
||||
@ -449,7 +451,9 @@ bool getWords() {
|
||||
s_words += '\0';
|
||||
}
|
||||
fclose ( fd );
|
||||
log("blaster: read %"INT32" words, %"INT32" bytes in from dictionary.",
|
||||
s_windices.length() / sizeof(int32_t), s_words.length());
|
||||
log("blaster: read %"INT32" words, "
|
||||
"%"INT32" bytes in from dictionary.",
|
||||
(int32_t)(s_windices.length() / sizeof(int32_t)),
|
||||
(int32_t)s_words.length());
|
||||
return true;
|
||||
}
|
||||
|
@ -27,6 +27,7 @@ bool sendPageSEO(TcpSocket *s, HttpRequest *hr) {return true;}
|
||||
//SafeBuf g_qbuf;
|
||||
|
||||
bool g_recoveryMode;
|
||||
int32_t g_recoveryLevel;
|
||||
|
||||
int g_inMemcpy;
|
||||
|
||||
|
@ -2504,7 +2504,7 @@ int32_t deserializeMsg ( int32_t baseSize ,
|
||||
// make it NULL if size is 0 though
|
||||
if ( *sizePtr == 0 ) *strPtr = NULL;
|
||||
// sanity check
|
||||
if ( *sizePtr < 0 ) { char *xx = NULL; *xx =0; }
|
||||
if ( *sizePtr < 0 ) { g_errno = ECORRUPTDATA; return -1;}
|
||||
// advance our destination ptr
|
||||
p += *sizePtr;
|
||||
// advance both ptrs to next string
|
||||
|
@ -620,6 +620,7 @@ char *serializeMsg2 ( void *thisPtr ,
|
||||
int32_t *retSize );
|
||||
|
||||
// convert offsets back into ptrs
|
||||
// returns -1 on error
|
||||
int32_t deserializeMsg ( int32_t baseSize ,
|
||||
int32_t *firstSizeParm ,
|
||||
int32_t *lastSizeParm ,
|
||||
|
16
main.cpp
16
main.cpp
@ -6394,7 +6394,7 @@ void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeT
|
||||
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
|
||||
//g_conf.m_checksumdbMaxDiskPageCacheMem = 0;
|
||||
//g_conf.m_spiderdbMaxDiskPageCacheMem = 0;
|
||||
g_conf.m_tfndbMaxDiskPageCacheMem = 0;
|
||||
//g_conf.m_tfndbMaxDiskPageCacheMem = 0;
|
||||
g_titledb.init ();
|
||||
//g_collectiondb.init(true);
|
||||
g_titledb.getRdb()->addRdbBase1(coll);
|
||||
@ -7028,7 +7028,7 @@ int32_t dumpSpiderdb ( char *coll,
|
||||
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
|
||||
//g_conf.m_checksumdbMaxDiskPageCacheMem = 0;
|
||||
//g_conf.m_spiderdbMaxDiskPageCacheMem = 0;
|
||||
g_conf.m_tfndbMaxDiskPageCacheMem = 0;
|
||||
//g_conf.m_tfndbMaxDiskPageCacheMem = 0;
|
||||
g_spiderdb.init ();
|
||||
//g_collectiondb.init(true);
|
||||
g_spiderdb.getRdb()->addRdbBase1(coll );
|
||||
@ -8637,8 +8637,8 @@ if ( ! tr.set ( rec , listSize , false ) ) { // own data?
|
||||
void dumpMissing ( char *coll ) {
|
||||
// load tfndb, assume it is a perfect reflection of titledb
|
||||
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
|
||||
g_conf.m_tfndbMaxDiskPageCacheMem = 0;
|
||||
g_conf.m_indexdbMaxCacheMem = 0;
|
||||
//g_conf.m_tfndbMaxDiskPageCacheMem = 0;
|
||||
//g_conf.m_indexdbMaxCacheMem = 0;
|
||||
//g_conf.m_clusterdbMaxDiskPageCacheMem = 0;
|
||||
|
||||
//g_tfndb.init ();
|
||||
@ -8855,7 +8855,7 @@ void dumpMissing ( char *coll ) {
|
||||
void dumpDups ( char *coll ) {
|
||||
// load tfndb, assume it is a perfect reflection of titledb
|
||||
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
|
||||
g_conf.m_indexdbMaxCacheMem = 0;
|
||||
//g_conf.m_indexdbMaxCacheMem = 0;
|
||||
|
||||
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
|
||||
g_indexdb.init ();
|
||||
@ -9407,10 +9407,10 @@ void removeDocIds ( char *coll , char *filename ) {
|
||||
// g_conf.m_checksumdbMinFilesToMerge = 100;
|
||||
if ( g_conf.m_clusterdbMinFilesToMerge < 100 )
|
||||
g_conf.m_clusterdbMinFilesToMerge = 100;
|
||||
g_conf.m_tfndbMaxDiskPageCacheMem = 0;
|
||||
//g_conf.m_tfndbMaxDiskPageCacheMem = 0;
|
||||
//g_conf.m_checksumdbMaxDiskPageCacheMem = 0;
|
||||
//g_conf.m_clusterdbMaxDiskPageCacheMem = 0;
|
||||
g_conf.m_indexdbMaxCacheMem = 0;
|
||||
//g_conf.m_indexdbMaxCacheMem = 0;
|
||||
//g_conf.m_checksumdbMaxCacheMem = 0;
|
||||
//g_conf.m_clusterdbMaxCacheMem = 0;
|
||||
|
||||
@ -12272,7 +12272,7 @@ bool parseTest ( char *coll , int64_t docId , char *query ) {
|
||||
//g_mem.m_maxMem = 2000000000LL; // 2G
|
||||
//g_conf.m_checksumdbMaxDiskPageCacheMem = 0;
|
||||
//g_conf.m_spiderdbMaxDiskPageCacheMem = 0;
|
||||
g_conf.m_tfndbMaxDiskPageCacheMem = 0;
|
||||
//g_conf.m_tfndbMaxDiskPageCacheMem = 0;
|
||||
//g_conf.m_titledbMaxTreeMem = 1024*1024*10;
|
||||
g_titledb.init ();
|
||||
//g_collectiondb.init(true);
|
||||
|
4
qa.cpp
4
qa.cpp
@ -1349,6 +1349,10 @@ bool qaSyntax ( ) {
|
||||
"format=json&"
|
||||
"q=");
|
||||
tmp.urlEncode ( s_q[s_i] );
|
||||
// get back 100 for debugging better
|
||||
if ( strcmp(s_q[s_i],"gbssStatusCode:0") == 0 ) {
|
||||
tmp.safePrintf("&n=100");
|
||||
}
|
||||
tmp.nullTerm();
|
||||
// point to next query
|
||||
s_i++;
|
||||
|
Reference in New Issue
Block a user