Merge branch 'diffbot' of github.com:gigablast/open-source-search-engine into diffbot
commit 40f373c9e0
Changed files:
AutoBan.cpp BigFile.cpp Blaster.cpp Cachedb.cpp Catdb.cpp Clusterdb.cpp Collectiondb.cpp Collectiondb.h Conf.h CountryCode.cpp CountryCode.h Datedb.cpp DiskPageCache.cpp Errno.cpp Errno.h HashTableX.cpp Hostdb.cpp Hostdb.h HttpServer.cpp Indexdb.cpp Json.cpp LangList.cpp Linkdb.cpp Loop.cpp Make.depend Mem.cpp Monitordb.cpp Msg13.cpp Msg20.cpp Msg22.cpp Msg39.cpp Msg4.cpp Msg40.cpp Msg5.cpp Msg5.h Msge0.cpp Multicast.cpp PageAddColl.cpp PageAddUrl.cpp PageCatdb.cpp PageCrawlBot.cpp PageEvents.cpp PageGet.cpp PageHosts.cpp PageInject.cpp PageInject.h PageLogView.cpp PageParser.cpp PagePerf.cpp PageReindex.cpp PageResults.cpp PageRoot.cpp PageSockets.cpp PageStats.cpp PageStatsdb.cpp PageThreads.cpp Pages.cpp Pages.h Parms.cpp Parms.h PingServer.cpp Placedb.cpp Posdb.cpp Process.cpp Profiler.cpp Proxy.cpp Query.cpp Query.h Rdb.cpp Rdb.h RdbBase.cpp RdbBase.h RdbCache.cpp RdbDump.cpp RdbList.cpp RdbTree.cpp Rebalance.cpp Repair.cpp Revdb.cpp SafeBuf.cpp SafeBuf.h SearchInput.h Sections.cpp Spider.cpp Spider.h Stats.cpp Stats.h Statsdb.cpp Syncdb.cpp Tagdb.cpp Tagdb.h TcpServer.cpp TcpServer.h TcpSocket.h Tfndb.cpp Tfndb.h Threads.cpp Titledb.cpp Titledb.h UCNormalizer.cpp
AutoBan.cpp (21 changed lines)
@@ -849,9 +849,8 @@ bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
setCodesFromConf();
}
sb.safePrintf("\n<br><br><table width=100%% bgcolor=#%s "
"cellpadding=4 border=1>\n",
BABY_BLUE);
sb.safePrintf("\n<br><br><table %s>\n",TABLE_STYLE);
getCalendarFromMs((now - m_codeResetTime) * 1000,
&days,
&hours,

@@ -1134,9 +1133,7 @@ bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
sb.safePrintf("\n<table width=100%% bgcolor=#%s "
"cellpadding=4 border=1>\n",
BABY_BLUE);
sb.safePrintf("\n<table %s>\n",TABLE_STYLE);
sb.safePrintf("<tr><td colspan=2 bgcolor=#%s>"
"<center><b>Add IPs</b></center></td></tr>",
DARK_BLUE);

@@ -1174,9 +1171,7 @@ bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
/////////////////////////////////////////////////////////////////////
sb.safePrintf("\n<table width=100%% bgcolor=#%s "
"cellpadding=4 border=1>\n",
BABY_BLUE);
sb.safePrintf("\n<table %s>\n",TABLE_STYLE);
sb.safePrintf("<tr><td colspan=3 bgcolor=#%s>"
"<center><b>Watched Ips</b></center></td></tr>",

@@ -1315,9 +1310,7 @@ bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
// MDW moved from here
sb.safePrintf("\n<br><br><table width=100%% bgcolor=#%s "
"cellpadding=4 border=1>\n",
BABY_BLUE);
sb.safePrintf("\n<br><br><table %s>\n",TABLE_STYLE);
sb.safePrintf("<tr><td colspan=5 bgcolor=#%s>"
"<center><b>Control Panel</b></center></td></tr>",

@@ -1362,9 +1355,7 @@ bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
}
sb.safePrintf("\n<br><br><table width=100%% bgcolor=#%s "
"cellpadding=4 border=1>\n",
BABY_BLUE);
sb.safePrintf("\n<br><br><table %s>\n",TABLE_STYLE);
sb.safePrintf("<tr><td colspan=6 bgcolor=#%s>"
"<center><b>Queries Today</b></center></td></tr>",
@@ -569,7 +569,9 @@ bool BigFile::readwrite ( void *buf ,
}
// otherwise, thread spawn failed, do it blocking then
g_errno = 0;
if ( ! g_threads.m_disabled ) {
// if threads are manually disabled don't print these msgs because
// we redbox the fact above the controls in Pages.cpp
if ( g_conf.m_useThreads && ! g_threads.m_disabled ) {
static long s_lastTime = 0;
long now = getTime();
if ( now - s_lastTime >= 1 ) {
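The hunk above makes BigFile::readwrite() fall back to a blocking read/write when a thread could not be spawned, and only logs the condition when threads were not deliberately disabled, throttled to roughly once per second via a static timestamp. A minimal standalone sketch of that throttling idiom (hypothetical names, not code from this commit):

    #include <ctime>
    #include <cstdio>

    // Warn at most once per second, the same idea as the s_lastTime /
    // getTime() check in BigFile::readwrite() above.
    static void warnThreadSpawnFailed() {
        static time_t s_lastTime = 0;          // persists across calls
        time_t now = time(NULL);
        if (now - s_lastTime < 1) return;      // suppress repeats within one second
        s_lastTime = now;
        fprintf(stderr, "disk: thread spawn failed, doing blocking read/write\n");
    }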
Blaster.cpp (15 changed lines)
@@ -651,7 +651,10 @@ void Blaster::gotDoc2 ( void *state, TcpSocket *s){
false,
0,
false,
TITLEREC_CURRENT_VERSION)){
TITLEREC_CURRENT_VERSION ,
true , // set parents
0 , // niceness
CT_XML )){ // content type
log(LOG_WARN,"blaster: Couldn't set XML1 Class in gotDoc2");
}
Links links1;

@@ -679,7 +682,10 @@ void Blaster::gotDoc2 ( void *state, TcpSocket *s){
false,
0,
false,
TITLEREC_CURRENT_VERSION)){
TITLEREC_CURRENT_VERSION,
true , // setparents
0 , // niceness
CT_XML )){
log(LOG_WARN,"blaster: Couldn't set XML2 Class in gotDoc2");
}
Links links2;

@@ -1170,7 +1176,10 @@ void Blaster::gotDoc4 ( void *state, TcpSocket *s){
false,
0,
false,
TITLEREC_CURRENT_VERSION)){
TITLEREC_CURRENT_VERSION,
true, // setparents
0, // niceness
CT_XML )){
log(LOG_WARN,"blaster: Couldn't set XML Class in gotDoc4");
}
Links links;
@@ -71,9 +71,9 @@ bool Cachedb::init ( ) {
return false;
// add the base since it is a collectionless rdb
return m_rdb.addColl ( NULL );
return m_rdb.addRdbBase1 ( NULL );
}
/*
bool Cachedb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;

@@ -85,7 +85,7 @@ bool Cachedb::addColl ( char *coll, bool doVerify ) {
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}
*/
bool Cachedb::verify ( char *coll ) {
// coll is NULL here methinks
log ( LOG_DEBUG, "db: Verifying %s...",m_name );
@@ -84,7 +84,7 @@ bool Catdb::init ( ) {
// Rdb::getBase(collnum_t) will return. however, for collectionless
// rdb databases we set Rdb::m_collectionlessBase special here.
// This was in Rdb.cpp::init().
return m_rdb.addColl ( NULL );
return m_rdb.addRdbBase1 ( NULL );
}
bool Catdb::init2 ( long treeMem ) {

@@ -112,6 +112,7 @@ bool Catdb::init2 ( long treeMem ) {
// end support for "cache recs"
//
/*
bool Catdb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
// verify

@@ -123,6 +124,7 @@ bool Catdb::addColl ( char *coll, bool doVerify ) {
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}
*/
bool Catdb::verify ( char *coll ) {
char *rdbName = "Catdb";
@@ -337,7 +337,7 @@ bool Clusterdb::init2 ( long treeMem ) {
12 , // key size
true ); // bias disk page cache
}
/*
bool Clusterdb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;

@@ -349,7 +349,7 @@ bool Clusterdb::addColl ( char *coll, bool doVerify ) {
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}
*/
bool Clusterdb::verify ( char *coll ) {
log ( LOG_DEBUG, "db: Verifying Clusterdb for coll %s...", coll );
g_threads.disableThreads();
Collectiondb.cpp (347 changed lines)
@@ -34,7 +34,13 @@ Collectiondb g_collectiondb;
Collectiondb::Collectiondb ( ) {
m_numRecs = 0;
m_numRecsUsed = 0;
m_lastUpdateTime = 0LL;
//m_lastUpdateTime = 0LL;
m_needsSave = false;
// sanity
if ( RDB_END2 >= RDB_END ) return;
log("db: increase RDB_END2 to at least %li in "
"Collectiondb.h",(long)RDB_END);
char *xx=NULL;*xx=0;
}
// reset rdb

@@ -51,6 +57,7 @@ void Collectiondb::reset() {
g_collTable.reset();
}
/*
bool Collectiondb::init ( bool isDump ) {
reset();
if ( g_isYippy ) return true;

@@ -77,6 +84,7 @@ bool Collectiondb::init ( bool isDump ) {
// otherwise, true, even if reloadList() blocked
return true;
}
*/
// . save to disk
// . returns false if blocked, true otherwise

@@ -95,7 +103,12 @@ bool Collectiondb::save ( ) {
return true;
}
bool Collectiondb::load ( bool isDump ) {
///////////
//
// fill up our m_recs[] array based on the coll.*.*/coll.conf files
//
///////////
bool Collectiondb::loadAllCollRecs ( ) {
char dname[1024];
// MDW: sprintf ( dname , "%s/collections/" , g_hostdb.m_dir );
sprintf ( dname , "%s" , g_hostdb.m_dir );

@@ -104,7 +117,7 @@ bool Collectiondb::load ( bool isDump ) {
if ( ! d.open ()) return log("admin: Could not load collection config "
"files.");
// note it
log(LOG_INFO,"db: Loading collection config files.");
//log(LOG_INFO,"db: loading collection config files.");
// . scan through all subdirs in the collections dir
// . they should be like, "coll.main/" and "coll.mycollection/"
char *f;

@@ -122,16 +135,23 @@ bool Collectiondb::load ( bool isDump ) {
// get collnum
collnum_t collnum = atol ( pp + 1 );
// add it
if ( ! addExistingColl ( coll , collnum ,isDump ) )
if ( ! addExistingColl ( coll , collnum ) )
return false;
}
// note it
log(LOG_INFO,"db: Loaded data for %li collections. Ranging from "
"collection #0 to #%li.",m_numRecsUsed,m_numRecs-1);
//log(LOG_INFO,"db: Loaded data for %li collections. Ranging from "
// "collection #0 to #%li.",m_numRecsUsed,m_numRecs-1);
// update the time
updateTime();
//updateTime();
// don't clean the tree if just dumpin
if ( isDump ) return true;
//if ( isDump ) return true;
return true;
}
// after we've initialized all rdbs in main.cpp call this to clean out
// our rdb trees
bool Collectiondb::cleanTrees ( ) {
// remove any nodes with illegal collnums
Rdb *r;
//r = g_indexdb.getRdb();

@@ -158,7 +178,7 @@ bool Collectiondb::load ( bool isDump ) {
// success
return true;
}
/*
void Collectiondb::updateTime() {
// get time now in milliseconds
long long newTime = gettimeofdayInMilliseconds();

@@ -169,14 +189,13 @@ void Collectiondb::updateTime() {
// we need a save
m_needsSave = true;
}
*/
#include "Statsdb.h"
#include "Cachedb.h"
#include "Syncdb.h"
bool Collectiondb::addExistingColl ( char *coll,
collnum_t collnum ,
bool isDump ) {
bool Collectiondb::addExistingColl ( char *coll, collnum_t collnum ) {
long i = collnum;

@@ -221,7 +240,7 @@ bool Collectiondb::addExistingColl ( char *coll,
"\"%s\".",coll);
}
if ( ! registerCollRec ( cr , isDump , false ) ) return false;
if ( ! registerCollRec ( cr , false ) ) return false;
// we need to compile the regular expressions or update the url
// filters with new logic that maps crawlbot parms to url filters
@@ -454,6 +473,16 @@ bool Collectiondb::addNewColl ( char *coll ,
memset ( &cr->m_localCrawlInfo , 0 , sizeof(CrawlInfo) );
memset ( &cr->m_globalCrawlInfo , 0 , sizeof(CrawlInfo) );
// note that
log("colldb: initial revival for %s",cr->m_coll);
// . assume we got some urls ready to spider
// . Spider.cpp will wait SPIDER_DONE_TIME seconds and if it has no
// urls it spidered in that time these will get set to 0 and it
// will send out an email alert if m_sentCrawlDoneAlert is not true.
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 1;
cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = 1;
// set some defaults. max spiders for all priorities in this
// collection. NO, default is in Parms.cpp.
//cr->m_maxNumSpiders = 10;

@@ -496,46 +525,66 @@ bool Collectiondb::addNewColl ( char *coll ,
}
return registerCollRec ( cr , false , true );
if ( ! registerCollRec ( cr , true ) )
return false;
// add the rdbbases for this coll, CollectionRec::m_bases[]
if ( ! addRdbBasesForCollRec ( cr ) )
return false;
return true;
}
// . called only by addNewColl() and by addExistingColl()
bool Collectiondb::registerCollRec ( CollectionRec *cr ,
bool isDump ,
bool isNew ) {
bool Collectiondb::registerCollRec ( CollectionRec *cr , bool isNew ) {
// add m_recs[] and to hashtable
if ( ! setRecPtr ( cr->m_collnum , cr ) )
return false;
bool verify = true;
return true;
}
bool Collectiondb::addRdbBaseToAllRdbsForEachCollRec ( ) {
for ( long i = 0 ; i < m_numRecs ; i++ ) {
CollectionRec *cr = m_recs[i];
if ( ! cr ) continue;
// add rdb base files etc. for it
addRdbBasesForCollRec ( cr );
}
return true;
}
bool Collectiondb::addRdbBasesForCollRec ( CollectionRec *cr ) {
char *coll = cr->m_coll;
//////
//
// if we are doing a dump from the command line, skip this stuff
if ( isDump ) return true;
if ( isNew ) verify = false;
//
//////
if ( g_dumpMode ) return true;
// tell rdbs to add one, too
//if ( ! g_indexdb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_posdb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_datedb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_indexdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_posdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_datedb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_titledb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_revdb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_sectiondb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_tagdb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_catdb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_checksumdb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_tfndb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_clusterdb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_linkdb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_spiderdb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_doledb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_titledb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_revdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_sectiondb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_tagdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_catdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_checksumdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_tfndb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_clusterdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_linkdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_spiderdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_doledb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
// now clean the trees
cleanTrees();
// debug message
//log ( LOG_INFO, "db: verified collection \"%s\" (%li).",
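The refactor above splits what used to be one step into two: registerCollRec() only puts the CollectionRec into m_recs[] and the hash table, while addRdbBasesForCollRec() (and the addRdbBaseToAllRdbsForEachCollRec() loop) creates the per-collection RdbBase objects once the Rdbs exist. A rough sketch of the startup order this implies (the call sites are assumptions; main.cpp is not part of this diff):

    // Hypothetical startup sequence implied by the new Collectiondb API.
    bool startupSketch ( ) {
        // 1. read the coll.*.*/coll.conf files into g_collectiondb.m_recs[]
        if ( ! g_collectiondb.loadAllCollRecs() ) return false;
        // 2. ... initialize each Rdb (posdb, titledb, spiderdb, ...) here ...
        // 3. now that the Rdbs exist, hang an RdbBase off every CollectionRec
        if ( ! g_collectiondb.addRdbBaseToAllRdbsForEachCollRec() ) return false;
        // 4. finally drop tree nodes whose collnum no longer exists
        if ( ! g_collectiondb.cleanTrees() ) return false;
        return true;
    }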
@@ -637,6 +686,22 @@ bool Collectiondb::deleteRec ( char *coll , WaitEntry *we ) {
}
*/
// if there is an outstanding disk read thread or merge thread then
// Spider.cpp will handle the delete in the callback.
void Collectiondb::deleteSpiderColl ( SpiderColl *sc ) {
sc->m_deleteMyself = true;
// if not currently being accessed nuke it now
if ( ! sc->m_msg5.m_waitingForList &&
! sc->m_msg5b.m_waitingForList &&
! sc->m_msg1.m_mcast.m_inUse ) {
mdelete ( sc, sizeof(SpiderColl),"nukecr2");
delete ( sc );
return;
}
}
bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
// do not allow this if in repair mode
if ( g_repairMode > 0 ) {
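deleteSpiderColl() above frees the SpiderColl immediately only when no Msg5 read or Msg1 multicast is in flight; otherwise it just sets m_deleteMyself and leaves the delete to the callback. A hedged sketch of what that callback side looks like (assumed shape; the real check lives in Spider.cpp and is not shown in this hunk):

    // Hypothetical consumer side of the "death row" pattern.
    void gotSpiderListSketch ( SpiderColl *sc ) {
        sc->m_msg5.m_waitingForList = false;     // the outstanding read is done
        if ( sc->m_deleteMyself ) {
            // the collection was deleted while we were waiting; finish the job
            mdelete ( sc , sizeof(SpiderColl) , "nukecr2" );
            delete sc;
            return;
        }
        // ... otherwise keep processing the list as usual ...
    }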
@@ -724,10 +789,14 @@ bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(collnum);
if ( sc ) {
// remove locks from lock table:
sc->clear();
sc->clearLocks();
//sc->m_collnum = newCollnum;
sc->reset();
mdelete ( sc, sizeof(SpiderColl),"nukecr2");
//sc->reset();
// this will put it on "death row" so it will be deleted
// once Msg5::m_waitingForList/Merge is NULL
deleteSpiderColl ( sc );
//mdelete ( sc, sizeof(SpiderColl),"nukecr2");
//delete ( sc );
cr->m_spiderColl = NULL;
}

@@ -872,7 +941,7 @@ bool Collectiondb::setRecPtr ( collnum_t collnum , CollectionRec *cr ) {
}
// update the time
updateTime();
//updateTime();
return true;
}

@@ -926,8 +995,19 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
// reset spider info
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(oldCollnum);
if ( sc ) {
sc->clear();
sc->m_collnum = newCollnum;
// remove locks from lock table:
sc->clearLocks();
// don't do this anymore, just nuke it in case
// m_populatingDoledb was true etc. there are too many
// flags to worry about
//sc->m_collnum = newCollnum;
//sc->reset();
// this will put it on "death row" so it will be deleted
// once Msg5::m_waitingForList/Merge is NULL
deleteSpiderColl ( sc );
//mdelete ( sc, sizeof(SpiderColl),"nukecr2");
//delete ( sc );
cr->m_spiderColl = NULL;
}
// reset spider round

@@ -1052,7 +1132,7 @@ bool addCollToTable ( char *coll , collnum_t collnum ) {
// get coll rec specified in the HTTP request
CollectionRec *Collectiondb::getRec ( HttpRequest *r ) {
CollectionRec *Collectiondb::getRec ( HttpRequest *r , bool useDefaultRec ) {
char *coll = r->getString ( "c" );
if ( coll && ! coll[0] ) coll = NULL;
// maybe it is crawlbot?

@@ -1067,6 +1147,18 @@ CollectionRec *Collectiondb::getRec ( HttpRequest *r ) {
snprintf(tmp,MAX_COLL_LEN,"%s-%s",token,name);
coll = tmp;
}
// default to main first
if ( ! coll && useDefaultRec ) {
CollectionRec *cr = g_collectiondb.getRec("main");
if ( cr ) return cr;
}
// try next in line
if ( ! coll && useDefaultRec ) {
return getFirstRec ();
}
// give up?
if ( ! coll ) return NULL;
//if ( ! coll || ! coll[0] ) coll = g_conf.m_defaultColl;
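getRec() now takes a useDefaultRec flag (default true): when the request carries no "c" parameter it first tries the "main" collection, then falls back to the first loaded record, and only then gives up. A hypothetical caller-side sketch (handler name and error handling are assumptions, not taken from this commit):

    // Admin-style pages can let getRec() fall back to a default collection;
    // API handlers that must not guess would pass useDefaultRec = false.
    bool sendSomeAdminPage ( TcpSocket *s , HttpRequest *r ) {
        CollectionRec *cr = g_collectiondb.getRec ( r );   // useDefaultRec = true
        //CollectionRec *exact = g_collectiondb.getRec ( r , false );
        if ( ! cr ) return false;                           // nothing to show
        // ... render the page for cr->m_coll ...
        return true;
    }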
@ -1296,7 +1388,7 @@ CollectionRec::CollectionRec() {
|
||||
//m_spiderStatusMsg = NULL;
|
||||
// for Url::getSite()
|
||||
m_updateSiteRulesTable = 1;
|
||||
m_lastUpdateTime = 0LL;
|
||||
//m_lastUpdateTime = 0LL;
|
||||
m_clickNScrollEnabled = false;
|
||||
// inits for sortbydatetable
|
||||
m_inProgress = false;
|
||||
@ -1359,6 +1451,10 @@ void CollectionRec::setToDefaults ( ) {
|
||||
|
||||
void CollectionRec::reset() {
|
||||
|
||||
// . grows dynamically
|
||||
// . setting to 0 buckets should never have error
|
||||
//m_pageCountTable.set ( 4,4,0,NULL,0,false,MAX_NICENESS,"pctbl" );
|
||||
|
||||
// regex_t types
|
||||
if ( m_hasucr ) regfree ( &m_ucr );
|
||||
if ( m_hasupr ) regfree ( &m_upr );
|
||||
@ -1378,6 +1474,27 @@ void CollectionRec::reset() {
|
||||
rdb->resetBase ( m_collnum );
|
||||
}
|
||||
|
||||
for ( long i = 0 ; i < g_process.m_numRdbs ; i++ ) {
|
||||
RdbBase *base = m_bases[i];
|
||||
if ( ! base ) continue;
|
||||
mdelete (base, sizeof(RdbBase), "Rdb Coll");
|
||||
delete (base);
|
||||
}
|
||||
|
||||
SpiderColl *sc = m_spiderColl;
|
||||
// if never made one, we are done
|
||||
if ( ! sc ) return;
|
||||
|
||||
// spider coll also!
|
||||
sc->m_deleteMyself = true;
|
||||
|
||||
// if not currently being accessed nuke it now
|
||||
if ( ! sc->m_msg5.m_waitingForList &&
|
||||
! sc->m_msg5b.m_waitingForList &&
|
||||
! sc->m_msg1.m_mcast.m_inUse ) {
|
||||
mdelete ( sc, sizeof(SpiderColl),"nukecr2");
|
||||
delete ( sc );
|
||||
}
|
||||
}
|
||||
|
||||
CollectionRec *g_cr = NULL;
|
||||
@ -1404,7 +1521,8 @@ bool CollectionRec::load ( char *coll , long i ) {
|
||||
m_collLen = gbstrlen ( coll );
|
||||
strcpy ( m_coll , coll );
|
||||
|
||||
log(LOG_INFO,"db: loading data for %s",coll);
|
||||
log(LOG_INFO,"db: loading conf for collection %s (%li)",coll,
|
||||
(long)m_collnum);
|
||||
|
||||
// collection name HACK for backwards compatibility
|
||||
//if ( strcmp ( coll , "main" ) == 0 ) {
|
||||
@ -1440,6 +1558,43 @@ bool CollectionRec::load ( char *coll , long i ) {
|
||||
//m_localCrawlInfo.setFromSafeBuf(&sb);
|
||||
// it is binary now
|
||||
memcpy ( &m_localCrawlInfo , sb.getBufStart(),sb.length() );
|
||||
|
||||
|
||||
log("coll: loaded %s (%li) local hasurlsready=%li",
|
||||
m_coll,
|
||||
(long)m_collnum,
|
||||
(long)m_localCrawlInfo.m_hasUrlsReadyToSpider);
|
||||
|
||||
|
||||
// we introduced the this round counts, so don't start them at 0!!
|
||||
if ( m_spiderRoundNum == 0 &&
|
||||
m_localCrawlInfo.m_pageDownloadSuccessesThisRound <
|
||||
m_localCrawlInfo.m_pageDownloadSuccesses ) {
|
||||
log("coll: fixing process count this round for %s",m_coll);
|
||||
m_localCrawlInfo.m_pageDownloadSuccessesThisRound =
|
||||
m_localCrawlInfo.m_pageDownloadSuccesses;
|
||||
}
|
||||
|
||||
// we introduced the this round counts, so don't start them at 0!!
|
||||
if ( m_spiderRoundNum == 0 &&
|
||||
m_localCrawlInfo.m_pageProcessSuccessesThisRound <
|
||||
m_localCrawlInfo.m_pageProcessSuccesses ) {
|
||||
log("coll: fixing process count this round for %s",m_coll);
|
||||
m_localCrawlInfo.m_pageProcessSuccessesThisRound =
|
||||
m_localCrawlInfo.m_pageProcessSuccesses;
|
||||
}
|
||||
|
||||
// fix from old bug that was fixed
|
||||
//if ( m_spiderRoundNum == 0 &&
|
||||
// m_collectiveRespiderFrequency > 0.0 &&
|
||||
// m_localCrawlInfo.m_sentCrawlDoneAlert ) {
|
||||
// log("coll: bug fix: resending email alert for coll %s (%li) "
|
||||
// "of respider freq %f",m_coll,(long)m_collnum,
|
||||
// m_collectiveRespiderFrequency);
|
||||
// m_localCrawlInfo.m_sentCrawlDoneAlert = false;
|
||||
//}
|
||||
|
||||
|
||||
// LOAD GLOBAL
|
||||
snprintf ( tmp1 , 1023, "%scoll.%s.%li/globalcrawlinfo.dat",
|
||||
g_hostdb.m_dir , m_coll , (long)m_collnum );
|
||||
@ -1451,20 +1606,23 @@ bool CollectionRec::load ( char *coll , long i ) {
|
||||
// it is binary now
|
||||
memcpy ( &m_globalCrawlInfo , sb.getBufStart(),sb.length() );
|
||||
|
||||
log("coll: loaded %s (%li) global hasurlsready=%li",
|
||||
m_coll,
|
||||
(long)m_collnum,
|
||||
(long)m_globalCrawlInfo.m_hasUrlsReadyToSpider);
|
||||
|
||||
|
||||
////////////
|
||||
//
|
||||
// PAGE COUNT TABLE for doing quotas in url filters
|
||||
//
|
||||
/////////////
|
||||
// . grows dynamically
|
||||
// . setting to 0 buckets should never have error
|
||||
m_pageCountTable.set ( 4,4,0,NULL,0,false,MAX_NICENESS,"pctbl" );
|
||||
// log it up if there on disk
|
||||
snprintf ( tmp1 , 1023, "/coll.%s.%li/pagecounts.dat",
|
||||
m_coll , (long)m_collnum );
|
||||
if ( ! m_pageCountTable.load ( g_hostdb.m_dir , tmp1 ) && g_errno )
|
||||
log("db: failed to load page count table: %s",
|
||||
mstrerror(g_errno));
|
||||
//snprintf ( tmp1 , 1023, "/coll.%s.%li/pagecounts.dat",
|
||||
// m_coll , (long)m_collnum );
|
||||
//if ( ! m_pageCountTable.load ( g_hostdb.m_dir , tmp1 ) && g_errno )
|
||||
// log("db: failed to load page count table: %s",
|
||||
// mstrerror(g_errno));
|
||||
|
||||
// ignore errors i guess
|
||||
g_errno = 0;
|
||||
@ -1619,11 +1777,11 @@ void CollectionRec::setUrlFiltersToDefaults ( ) {
|
||||
m_spiderIpWaits[n] = 1000;
|
||||
m_numRegExs5++;
|
||||
|
||||
m_spiderIpMaxSpiders[n] = 1;
|
||||
m_spiderIpMaxSpiders[n] = 7;
|
||||
m_numRegExs6++;
|
||||
|
||||
m_spidersEnabled[n] = 1;
|
||||
m_numRegExs7++;
|
||||
//m_spidersEnabled[n] = 1;
|
||||
//m_numRegExs7++;
|
||||
|
||||
m_harvestLinks[n] = 1;
|
||||
m_numRegExs8++;
|
||||
@ -1724,19 +1882,24 @@ bool CollectionRec::save ( ) {
|
||||
tmp,mstrerror(g_errno));
|
||||
g_errno = 0;
|
||||
}
|
||||
|
||||
// save page count table which has # of pages indexed per
|
||||
// subdomain/site and firstip for doing quotas in url filters table
|
||||
snprintf ( tmp , 1023, "coll.%s.%li/pagecounts.dat",
|
||||
m_coll , (long)m_collnum );
|
||||
if ( ! m_pageCountTable.save ( g_hostdb.m_dir , tmp ) ) {
|
||||
log("db: failed to save file %s : %s",tmp,mstrerror(g_errno));
|
||||
g_errno = 0;
|
||||
}
|
||||
|
||||
|
||||
// do not need a save now
|
||||
m_needsSave = false;
|
||||
|
||||
// waiting tree is saved in SpiderCache::save() called by Process.cpp
|
||||
//SpiderColl *sc = m_spiderColl;
|
||||
//if ( ! sc ) return true;
|
||||
|
||||
// save page count table which has # of pages indexed per
|
||||
// subdomain/site and firstip for doing quotas in url filters table
|
||||
//snprintf ( tmp , 1023, "coll.%s.%li/pagecounts.dat",
|
||||
// m_coll , (long)m_collnum );
|
||||
//if ( ! m_pageCountTable.save ( g_hostdb.m_dir , tmp ) ) {
|
||||
// log("db: failed to save file %s : %s",tmp,mstrerror(g_errno));
|
||||
// g_errno = 0;
|
||||
//}
|
||||
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -1937,10 +2100,10 @@ bool CollectionRec::rebuildUrlFilters ( ) {
|
||||
for ( long i = 0 ; i < MAX_FILTERS ; i++ ) {
|
||||
m_regExs[i].purge();
|
||||
m_spiderPriorities[i] = 0;
|
||||
m_maxSpidersPerRule [i] = 10;
|
||||
m_maxSpidersPerRule [i] = 100;
|
||||
m_spiderIpWaits [i] = wait;
|
||||
m_spiderIpMaxSpiders[i] = 7; // keep it respectful
|
||||
m_spidersEnabled [i] = 1;
|
||||
//m_spidersEnabled [i] = 1;
|
||||
m_spiderFreqs [i] =m_collectiveRespiderFrequency;
|
||||
//m_spiderDiffbotApiUrl[i].purge();
|
||||
m_harvestLinks[i] = true;
|
||||
@ -1961,6 +2124,24 @@ bool CollectionRec::rebuildUrlFilters ( ) {
|
||||
i++;
|
||||
}
|
||||
|
||||
// and for docs that have errors respider once every 5 hours
|
||||
m_regExs[i].set("errorcount==1");
|
||||
m_spiderPriorities [i] = 40;
|
||||
m_spiderFreqs [i] = 0.001; // 86 seconds
|
||||
i++;
|
||||
|
||||
// and for docs that have errors respider once every 5 hours
|
||||
m_regExs[i].set("errorcount==2");
|
||||
m_spiderPriorities [i] = 40;
|
||||
m_spiderFreqs [i] = 0.1; // 2.4 hrs
|
||||
i++;
|
||||
|
||||
// excessive errors? (tcp/dns timed out, etc.) retry once per month?
|
||||
m_regExs[i].set("errorcount>=3");
|
||||
m_spiderPriorities [i] = 30;
|
||||
m_spiderFreqs [i] = 30; // 30 days
|
||||
i++;
|
||||
|
||||
// 3rd rule for respidering
|
||||
if ( m_collectiveRespiderFrequency > 0.0 ) {
|
||||
m_regExs[i].set("lastspidertime>={roundstart}");
|
||||
@ -1968,7 +2149,11 @@ bool CollectionRec::rebuildUrlFilters ( ) {
|
||||
m_spiderPriorities [i] = 10;
|
||||
// just turn off spidering. if we were to set priority to
|
||||
// filtered it would be removed from index!
|
||||
m_spidersEnabled [i] = 0;
|
||||
//m_spidersEnabled [i] = 0;
|
||||
m_maxSpidersPerRule[i] = 0;
|
||||
// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
|
||||
// which has been obsoleted, but we are running old code now!
|
||||
//m_spiderDiffbotApiUrl[i].set ( api );
|
||||
i++;
|
||||
}
|
||||
// if collectiverespiderfreq is 0 or less then do not RE-spider
|
||||
@ -1981,22 +2166,14 @@ bool CollectionRec::rebuildUrlFilters ( ) {
|
||||
m_spiderPriorities [i] = 10;
|
||||
// just turn off spidering. if we were to set priority to
|
||||
// filtered it would be removed from index!
|
||||
m_spidersEnabled [i] = 0;
|
||||
//m_spidersEnabled [i] = 0;
|
||||
m_maxSpidersPerRule[i] = 0;
|
||||
// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
|
||||
// which has been obsoleted, but we are running old code now!
|
||||
//m_spiderDiffbotApiUrl[i].set ( api );
|
||||
i++;
|
||||
}
|
||||
|
||||
// and for docs that have errors respider once every 5 hours
|
||||
m_regExs[i].set("errorcount>0 && errcount<3");
|
||||
m_spiderPriorities [i] = 40;
|
||||
m_spiderFreqs [i] = 0.2; // half a day
|
||||
i++;
|
||||
|
||||
// excessive errors? (tcp/dns timed out, etc.) retry once per month?
|
||||
m_regExs[i].set("errorcount>=3");
|
||||
m_spiderPriorities [i] = 30;
|
||||
m_spiderFreqs [i] = 30; // 30 days
|
||||
i++;
|
||||
|
||||
// url crawl and process pattern
|
||||
if ( ucp && upp ) {
|
||||
m_regExs[i].set("matchesucp && matchesupp");
|
||||
@ -2060,7 +2237,7 @@ bool CollectionRec::rebuildUrlFilters ( ) {
|
||||
m_numRegExs10 = i;
|
||||
m_numRegExs5 = i;
|
||||
m_numRegExs6 = i;
|
||||
m_numRegExs7 = i;
|
||||
//m_numRegExs7 = i;
|
||||
m_numRegExs8 = i;
|
||||
//m_numRegExs11 = i;
|
||||
|
||||
|
@ -27,6 +27,7 @@ public:
|
||||
char *m_parmEnd;
|
||||
class UdpSlot *m_slot;
|
||||
bool m_doRebuilds;
|
||||
bool m_updatedRound;
|
||||
collnum_t m_collnum;
|
||||
bool m_registered;
|
||||
long m_errno;
|
||||
@ -45,10 +46,18 @@ class Collectiondb {
|
||||
// . returns false and sets errno on error
|
||||
// . each collection as a CollectionRec class for it and
|
||||
// is loaded up from the appropriate config file
|
||||
bool init ( bool isDump = false );
|
||||
bool init ( );
|
||||
|
||||
// this loads all the recs from host #0
|
||||
bool load ( bool isDump = false );
|
||||
//bool load ( bool isDump = false );
|
||||
|
||||
// called by main.cpp to fill in our m_recs[] array with
|
||||
// all the coll.*.*/coll.conf info
|
||||
bool loadAllCollRecs ( );
|
||||
|
||||
// after main.cpp loads all rdb trees it calls this to remove
|
||||
// bogus collnums from the trees i guess
|
||||
bool cleanTrees ( ) ;
|
||||
|
||||
// . this will save all conf files back to disk that need it
|
||||
// . returns false and sets g_errno on error, true on success
|
||||
@ -63,7 +72,8 @@ class Collectiondb {
|
||||
char *getColl ( collnum_t collnum ) {return getCollName(collnum);};
|
||||
|
||||
// get coll rec specified in the HTTP request
|
||||
class CollectionRec *getRec ( class HttpRequest *r );
|
||||
class CollectionRec *getRec ( class HttpRequest *r ,
|
||||
bool useDefaultRec = true );
|
||||
|
||||
// . get collectionRec from name
|
||||
// returns NULL if not available
|
||||
@ -81,7 +91,7 @@ class Collectiondb {
|
||||
|
||||
// . how many collections we have in here
|
||||
// . only counts valid existing collections
|
||||
long getNumRecs() { return m_numRecsUsed; };
|
||||
long getNumRecsUsed() { return m_numRecsUsed; };
|
||||
|
||||
// . does this requester have root admin privledges???
|
||||
// . uses the root collection record!
|
||||
@ -92,9 +102,9 @@ class Collectiondb {
|
||||
// what collnum will be used the next time a coll is added?
|
||||
collnum_t reserveCollNum ( ) ;
|
||||
|
||||
long long getLastUpdateTime () { return m_lastUpdateTime; };
|
||||
//long long getLastUpdateTime () { return m_lastUpdateTime; };
|
||||
// updates m_lastUpdateTime so g_spiderCache know when to reload
|
||||
void updateTime ();
|
||||
//void updateTime ();
|
||||
|
||||
// private:
|
||||
|
||||
@ -105,9 +115,8 @@ class Collectiondb {
|
||||
// bool saveRec ); // = true
|
||||
|
||||
|
||||
bool addExistingColl ( char *coll,
|
||||
collnum_t collnum ,
|
||||
bool isDump ) ;
|
||||
bool addExistingColl ( char *coll, collnum_t collnum );
|
||||
|
||||
bool addNewColl ( char *coll ,
|
||||
char customCrawl ,
|
||||
char *cpc ,
|
||||
@ -115,9 +124,10 @@ class Collectiondb {
|
||||
bool saveIt ,
|
||||
collnum_t newCollnum ) ;
|
||||
|
||||
bool registerCollRec ( CollectionRec *cr ,
|
||||
bool isDump ,
|
||||
bool isNew ) ;
|
||||
bool registerCollRec ( CollectionRec *cr , bool isNew ) ;
|
||||
|
||||
bool addRdbBaseToAllRdbsForEachCollRec ( ) ;
|
||||
bool addRdbBasesForCollRec ( CollectionRec *cr ) ;
|
||||
|
||||
bool setRecPtr ( collnum_t collnum , CollectionRec *cr ) ;
|
||||
|
||||
@ -128,6 +138,8 @@ class Collectiondb {
|
||||
//bool updateRec ( CollectionRec *newrec );
|
||||
bool deleteRecs ( class HttpRequest *r ) ;
|
||||
|
||||
void deleteSpiderColl ( class SpiderColl *sc );
|
||||
|
||||
// returns false if blocked, true otherwise.
|
||||
//bool resetColl ( char *coll , WaitEntry *we , bool purgeSeeds );
|
||||
bool resetColl2 ( collnum_t oldCollnum,
|
||||
@ -149,7 +161,7 @@ class Collectiondb {
|
||||
long m_numRecs;
|
||||
long m_numRecsUsed;
|
||||
|
||||
long long m_lastUpdateTime;
|
||||
//long long m_lastUpdateTime;
|
||||
};
|
||||
|
||||
extern class Collectiondb g_collectiondb;
|
||||
@ -249,6 +261,7 @@ class CrawlInfo {
|
||||
long long m_pageProcessSuccesses; // 7
|
||||
long long m_urlsHarvested; // 8
|
||||
|
||||
|
||||
long m_lastUpdateTime;
|
||||
|
||||
// this is non-zero if urls are available to be spidered right now.
|
||||
@ -268,6 +281,12 @@ class CrawlInfo {
|
||||
//long m_numUrlsLaunched;
|
||||
long m_dummy1;
|
||||
|
||||
// keep separate because when we receive a crawlinfo struct from
|
||||
// a host we only add these in if it matches our round #
|
||||
long long m_pageDownloadSuccessesThisRound;
|
||||
long long m_pageProcessSuccessesThisRound;
|
||||
|
||||
|
||||
void reset() { memset ( this , 0 , sizeof(CrawlInfo) ); };
|
||||
//bool print (class SafeBuf *sb ) ;
|
||||
//bool setFromSafeBuf (class SafeBuf *sb ) ;
|
||||
@ -348,7 +367,7 @@ class CollectionRec {
|
||||
bool m_urlFiltersHavePageCounts;
|
||||
|
||||
// moved from SpiderColl so we can load up at startup
|
||||
HashTableX m_pageCountTable;
|
||||
//HashTableX m_pageCountTable;
|
||||
|
||||
// . when was the last time we changed?
|
||||
//long long m_lastUpdateTime;
|
||||
@ -385,7 +404,9 @@ class CollectionRec {
|
||||
// spidered and begin the next round
|
||||
long m_spiderRoundNum;
|
||||
|
||||
char m_useDatedb ;
|
||||
char m_indexBody;
|
||||
|
||||
//char m_useDatedb ;
|
||||
//char m_addUrlEnabled ; // TODO: use at http interface lvl
|
||||
//char m_spiderLinks ; use url filters now!
|
||||
char m_sameHostLinks ; // spider links from same host only?
|
||||
@ -691,8 +712,8 @@ class CollectionRec {
|
||||
//long m_respiderWaits [ MAX_FILTERS ];
|
||||
//long m_numRegExs8;
|
||||
// spidering on or off?
|
||||
long m_numRegExs7;
|
||||
char m_spidersEnabled [ MAX_FILTERS ];
|
||||
//long m_numRegExs7;
|
||||
//char m_spidersEnabled [ MAX_FILTERS ];
|
||||
|
||||
// should urls in this queue be sent to diffbot for processing
|
||||
// when we are trying to index them?
|
||||
|
Conf.h (3 changed lines)
@ -216,6 +216,7 @@ class Conf {
|
||||
//long long m_tfndbMaxUrls;
|
||||
|
||||
long m_maxCpuThreads;
|
||||
long m_maxCpuMergeThreads;
|
||||
|
||||
long m_deadHostTimeout;
|
||||
long m_sendEmailTimeout;
|
||||
@ -300,6 +301,8 @@ class Conf {
|
||||
long m_robotdbMaxCacheMem ;
|
||||
bool m_robotdbSaveCache;
|
||||
|
||||
long m_maxTotalSpiders;
|
||||
|
||||
// indexdb has a max cached age for getting IndexLists (10 mins deflt)
|
||||
long m_indexdbMaxTreeMem ;
|
||||
long m_indexdbMaxCacheMem;
|
||||
|
@ -1285,6 +1285,10 @@ bool CountryCode::loadHashTable(void) {
|
||||
return(s_catToCountry.load(g_hostdb.m_dir, "catcountry.dat"));
|
||||
}
|
||||
|
||||
void CountryCode::reset ( ) {
|
||||
s_catToCountry.reset();
|
||||
}
|
||||
|
||||
int CountryCode::getNumCodes(void) {
|
||||
return(s_numCountryCodes);
|
||||
}
|
||||
|
@ -25,6 +25,7 @@ class CountryCode {
|
||||
uint8_t getLanguageFromDMOZ(long catid);
|
||||
int createHashTable(void);
|
||||
bool loadHashTable(void);
|
||||
void reset();
|
||||
long getNumEntries(void);
|
||||
void debugDumpNumbers(void);
|
||||
uint64_t getLanguagesWritten(int index);
|
||||
|
@ -121,7 +121,7 @@ bool Datedb::init2 ( long treeMem ) {
|
||||
false , // preload dskpagecache
|
||||
16 );// key size
|
||||
}
|
||||
|
||||
/*
|
||||
bool Datedb::addColl ( char *coll, bool doVerify ) {
|
||||
if ( ! m_rdb.addColl ( coll ) ) return false;
|
||||
if ( ! doVerify ) return true;
|
||||
@ -133,7 +133,7 @@ bool Datedb::addColl ( char *coll, bool doVerify ) {
|
||||
log ( "db: Verify failed, but scaling is allowed, passing." );
|
||||
return true;
|
||||
}
|
||||
|
||||
*/
|
||||
bool Datedb::verify ( char *coll ) {
|
||||
log ( LOG_INFO, "db: Verifying Datedb for coll %s...", coll );
|
||||
g_threads.disableThreads();
|
||||
|
@ -108,6 +108,9 @@ bool DiskPageCache::init ( const char *dbname ,
|
||||
// void (*rmVfd2)(DiskPageCache*, long) ) {
|
||||
reset();
|
||||
|
||||
// fix cores while rebalancing
|
||||
//maxMem = 0;
|
||||
|
||||
m_rdbId = rdbId;
|
||||
|
||||
bool *tog = NULL;
|
||||
|
@ -166,6 +166,7 @@ case EBADJSONPARSER: return "Bad JSON parser";
|
||||
case EFAKEFIRSTIP: return "Fake firstIp";
|
||||
case EBADHOSTSCONF: return "A hosts.conf is out of sync";
|
||||
case EWAITINGTOSYNCHOSTSCONF: return "Wait to ensure hosts.conf in sync";
|
||||
case EDOCNONCANONICAL: return "Url was dup of canonical page";
|
||||
}
|
||||
// if the remote error bit is clear it must be a regulare errno
|
||||
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );
|
||||
|
Errno.h (3 changed lines)
@ -169,6 +169,7 @@ enum {
|
||||
EBADJSONPARSER,
|
||||
EFAKEFIRSTIP,
|
||||
EBADHOSTSCONF,
|
||||
EWAITINGTOSYNCHOSTSCONF
|
||||
EWAITINGTOSYNCHOSTSCONF,
|
||||
EDOCNONCANONICAL
|
||||
};
|
||||
#endif
|
||||
|
@ -187,6 +187,8 @@ bool HashTableX::addKey ( void *key , void *val , long *slot ) {
|
||||
g_errno = ETRYAGAIN;
|
||||
return false;
|
||||
}
|
||||
// never got initialized? call HashTableX::init()
|
||||
if ( m_ks <= 0 ){ char *xx=NULL; *xx=0; }
|
||||
// check to see if we should grow the table. now we grow
|
||||
// when 25% full to make operations faster so getLongestString()
|
||||
// doesn't return such big numbers!
|
||||
|
Hostdb.cpp (13 changed lines)
@ -68,6 +68,16 @@ Hostdb::~Hostdb () {
|
||||
}
|
||||
|
||||
void Hostdb::reset ( ) {
|
||||
|
||||
for ( long i = 0 ; m_hosts && i < m_numHosts ; i++ ) {
|
||||
Host *h = &m_hosts[i];
|
||||
if ( ! h->m_lastKnownGoodCrawlInfoReply ) continue;
|
||||
mfree ( h->m_lastKnownGoodCrawlInfoReply ,
|
||||
h->m_lastKnownGoodCrawlInfoReplyEnd -
|
||||
h->m_lastKnownGoodCrawlInfoReply , "lknown" );
|
||||
h->m_lastKnownGoodCrawlInfoReply = NULL;
|
||||
}
|
||||
|
||||
if ( m_hosts )
|
||||
mfree ( m_hosts, m_allocSize,"Hostdb" );
|
||||
if ( m_ips ) mfree ( m_ips , m_numIps * 4, "Hostdb" );
|
||||
@ -121,8 +131,7 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
|
||||
// make sure our hostId is in our conf file
|
||||
if ( hostId < 0 )
|
||||
return log(
|
||||
"conf: Negative hostId %li supplied in "
|
||||
"hosts.conf.",hostId);
|
||||
"conf: Negative hostId %li supplied",hostId);
|
||||
// set early for calling log()
|
||||
m_hostId = hostId;
|
||||
// set clock in sync in fctypes.cpp
|
||||
|
Hostdb.h (3 changed lines)
@ -273,6 +273,9 @@ class Host {
|
||||
char m_inSync ;
|
||||
char m_isPermanentOutOfSync ;
|
||||
|
||||
char *m_lastKnownGoodCrawlInfoReply;
|
||||
char *m_lastKnownGoodCrawlInfoReplyEnd;
|
||||
|
||||
// . used by Parms.cpp for broadcasting parm change requests
|
||||
// . each parm change request has an id
|
||||
// . this let's us know which id is in progress and what the last
|
||||
|
@ -925,6 +925,10 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
|
||||
strncmp ( path , "/v2/bulk/download/" ,18 ) == 0 )
|
||||
return sendBackDump ( s , r );
|
||||
|
||||
// "GET /download/mycoll_urls.csv"
|
||||
if ( strncmp ( path , "/download/", 10 ) == 0 )
|
||||
return sendBackDump ( s , r );
|
||||
|
||||
// . is it a diffbot api request, like "GET /api/*"
|
||||
// . ie "/api/startcrawl" or "/api/stopcrawl" etc.?
|
||||
//if ( strncmp ( path , "/api/" , 5 ) == 0 )
|
||||
|
@ -173,7 +173,7 @@ bool Indexdb::init2 ( long treeMem ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
bool Indexdb::addColl ( char *coll, bool doVerify ) {
|
||||
if ( ! m_rdb.addColl ( coll ) ) return false;
|
||||
if ( ! doVerify ) return true;
|
||||
@ -187,6 +187,7 @@ bool Indexdb::addColl ( char *coll, bool doVerify ) {
|
||||
log ( "db: Verify failed, but scaling is allowed, passing." );
|
||||
return true;
|
||||
}
|
||||
*/
|
||||
|
||||
bool Indexdb::verify ( char *coll ) {
|
||||
return true;
|
||||
|
Json.cpp (8 changed lines)
@ -96,7 +96,13 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , long niceness ) {
|
||||
// plus a \0 for the value and a \0 for the name of each jsonitem
|
||||
need += 2;
|
||||
// prevent cores for now
|
||||
need += 10000;
|
||||
need += 10;
|
||||
// . to prevent safebuf from reallocating do this
|
||||
// . safeMemcpy() calls reserve(m_length+len) and reserves
|
||||
// tries to alloc m_length + (m_length+len) so since,
|
||||
// m_length+len should never be more than "need" we need to
|
||||
// double up here
|
||||
need *= 2;
|
||||
// this should be enough
|
||||
if ( ! m_sb.reserve ( need ) ) return NULL;
|
||||
// for testing if we realloc
|
||||
|
@ -378,7 +378,7 @@ void LangList::reset ( ) {
|
||||
// . looks under the langlist/ directory for langlist.# files
|
||||
// each number corrisponds to a language
|
||||
bool LangList::loadLists ( ) {
|
||||
log ( LOG_INIT, "lang: Loading Language Lists.");
|
||||
//log ( LOG_INIT, "lang: Loading Language Lists.");
|
||||
// init the term table
|
||||
m_langTable.set(8,4,100000*MAX_LANGUAGES,NULL,0,false,0,"tbl-lang");
|
||||
// loop over the languages and load the files
|
||||
@ -476,6 +476,7 @@ bool LangList::loadLists ( ) {
|
||||
// count the list
|
||||
listCount++;
|
||||
|
||||
if ( wordsInList > 0 )
|
||||
log ( LOG_DEBUG,
|
||||
"lang: Successfully Loaded %li out of %li (%li bytes) "
|
||||
"words from %s dictionary.",
|
||||
|
@ -173,7 +173,7 @@ bool Linkdb::init2 ( long treeMem ) {
|
||||
sizeof(key224_t), // key size
|
||||
true );// bias disk page cache
|
||||
}
|
||||
|
||||
/*
|
||||
bool Linkdb::addColl ( char *coll, bool doVerify ) {
|
||||
if ( ! m_rdb.addColl ( coll ) ) return false;
|
||||
if ( ! doVerify ) return true;
|
||||
@ -185,7 +185,7 @@ bool Linkdb::addColl ( char *coll, bool doVerify ) {
|
||||
log ( "db: Verify failed, but scaling is allowed, passing." );
|
||||
return true;
|
||||
}
|
||||
|
||||
*/
|
||||
bool Linkdb::verify ( char *coll ) {
|
||||
log ( LOG_DEBUG, "db: Verifying Linkdb for coll %s...", coll );
|
||||
g_threads.disableThreads();
|
||||
@ -3648,7 +3648,8 @@ bool Inlink::setXmlFromRSS ( Xml *xml , long niceness ) {
|
||||
true , // pure xml?
|
||||
TITLEREC_CURRENT_VERSION ,
|
||||
false , // no need to now
|
||||
niceness );
|
||||
niceness ,
|
||||
CT_XML );
|
||||
}
|
||||
|
||||
// only Title.cpp uses this right now
|
||||
|
Loop.cpp (5 changed lines)
@ -1791,6 +1791,11 @@ void Loop::quickPoll(long niceness, const char* caller, long lineno) {
|
||||
if(m_inQuickPoll) {
|
||||
log(LOG_WARN,
|
||||
"admin: tried to quickpoll from inside quickpoll");
|
||||
// this happens when handleRequest3f is called from
|
||||
// a quickpoll and it deletes a collection and BigFile::close
|
||||
// calls ThreadQueue::removeThreads and Msg3::doneScanning()
|
||||
// has niceness 2 and calls quickpoll again!
|
||||
return;
|
||||
//if(g_conf.m_quickpollCoreOnError) {
|
||||
char*xx=NULL;*xx=0;
|
||||
// }
|
||||
|
Make.depend (33 changed lines)
@ -662,7 +662,8 @@ fctypes.o: fctypes.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
openssl/ssl23.h openssl/srtp.h Collectiondb.h HashTableX.h PingServer.h \
|
||||
Entities.h UCWordIterator.h Timedb.h Rdb.h RdbBase.h RdbScan.h BigFile.h \
|
||||
RdbMap.h RdbList.h RdbDump.h RdbTree.h RdbMem.h RdbBuckets.h RdbCache.h \
|
||||
Msg5.h Msg3.h RdbMerge.h Dir.h Titledb.h DiskPageCache.h Threads.h
|
||||
Msg5.h Msg3.h RdbMerge.h Dir.h Titledb.h DiskPageCache.h Threads.h \
|
||||
HttpMime.h
|
||||
File.o: File.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
UnicodeProperties.h UCPropTable.h iconv.h hash.h Errno.h Log.h File.h \
|
||||
Mem.h Conf.h Xml.h XmlNode.h Lang.h Iso8859.h iana_charset.h ip.h \
|
||||
@ -1325,13 +1326,14 @@ Mem.o: Mem.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
openssl/pem2.h openssl/hmac.h openssl/kssl.h openssl/ssl2.h \
|
||||
openssl/ssl3.h openssl/tls1.h openssl/dtls1.h openssl/pqueue.h \
|
||||
openssl/ssl23.h openssl/srtp.h Collectiondb.h HashTableX.h PingServer.h \
|
||||
Threads.h malloc.c Msg20.h UdpServer.h UdpSlot.h UdpProtocol.h \
|
||||
Multicast.h Summary.h matches2.h Query.h Words.h StopWords.h Titledb.h \
|
||||
Rdb.h RdbBase.h RdbScan.h BigFile.h RdbMap.h RdbList.h RdbDump.h \
|
||||
RdbTree.h RdbMem.h RdbBuckets.h RdbCache.h Msg5.h Msg3.h RdbMerge.h \
|
||||
Dir.h DiskPageCache.h Bits.h Pos.h Matches.h HashTableT.h Domains.h \
|
||||
CountryCode.h Tagdb.h Msg0.h Indexdb.h Events.h Sections.h IndexList.h \
|
||||
Dates.h
|
||||
Threads.h Pages.h HttpServer.h TcpServer.h openssl/err.h MsgC.h \
|
||||
UdpServer.h UdpSlot.h UdpProtocol.h Dns.h DnsProtocol.h RdbCache.h \
|
||||
RdbList.h Multicast.h Rdb.h RdbBase.h RdbScan.h BigFile.h RdbMap.h \
|
||||
RdbDump.h RdbTree.h RdbMem.h RdbBuckets.h Msg5.h Msg3.h RdbMerge.h Dir.h \
|
||||
HttpMime.h PageCrawlBot.h malloc.c Msg20.h Summary.h matches2.h Query.h \
|
||||
Words.h StopWords.h Titledb.h DiskPageCache.h Bits.h Pos.h Matches.h \
|
||||
HashTableT.h Domains.h CountryCode.h Tagdb.h Msg0.h Indexdb.h Events.h \
|
||||
Sections.h IndexList.h Dates.h
|
||||
MemPool.o: MemPool.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
UnicodeProperties.h UCPropTable.h iconv.h hash.h Errno.h Log.h MemPool.h \
|
||||
MemPoolTree.h Mem.h Conf.h Xml.h XmlNode.h Lang.h Iso8859.h \
|
||||
@ -1930,7 +1932,7 @@ Msg4.o: Msg4.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
Msg13.h Msge0.h Msge1.h Msg8b.h SearchInput.h Msg40.h Msg39.h Msg37.h \
|
||||
Posdb.h TopTree.h IndexTable2.h Msg51.h Msg17.h IndexReadInfo.h Msg3a.h \
|
||||
Stats.h PostQueryRerank.h Sanity.h SiteGetter.h Title.h Address.h zlib.h \
|
||||
zconf.h Syncdb.h
|
||||
zconf.h Syncdb.h Process.h
|
||||
Msg51.o: Msg51.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
UnicodeProperties.h UCPropTable.h iconv.h hash.h Errno.h Log.h Msg51.h \
|
||||
Msg0.h UdpServer.h Mem.h Conf.h Xml.h XmlNode.h Lang.h Iso8859.h \
|
||||
@ -3178,7 +3180,8 @@ RdbBase.o: RdbBase.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
Repair.h XmlDoc.h Phrases.h LangList.h Images.h Msg36.h Msg13.h Msge0.h \
|
||||
Msge1.h MsgC.h Dns.h DnsProtocol.h Msg8b.h SearchInput.h Msg40.h Msg39.h \
|
||||
Msg37.h TopTree.h IndexTable2.h Msg51.h Msg17.h Msg3a.h \
|
||||
PostQueryRerank.h Sanity.h SiteGetter.h Title.h Address.h HttpMime.h
|
||||
PostQueryRerank.h Sanity.h SiteGetter.h Title.h Address.h HttpMime.h \
|
||||
Rebalance.h
|
||||
RdbBuckets.o: RdbBuckets.cpp RdbBuckets.h Mem.h Conf.h Xml.h XmlNode.h \
|
||||
gb-include.h types.h fctypes.h Unicode.h UnicodeProperties.h \
|
||||
UCPropTable.h iconv.h hash.h Errno.h Log.h Lang.h Iso8859.h \
|
||||
@ -3334,7 +3337,12 @@ RdbMerge.o: RdbMerge.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
openssl/ssl23.h openssl/srtp.h Collectiondb.h HashTableX.h PingServer.h \
|
||||
RdbScan.h BigFile.h RdbMap.h RdbList.h RdbDump.h RdbTree.h RdbMem.h \
|
||||
RdbBuckets.h RdbCache.h Msg5.h Msg3.h RdbMerge.h Dir.h Indexdb.h \
|
||||
DiskPageCache.h Titledb.h Process.h
|
||||
DiskPageCache.h Titledb.h Process.h Spider.h Msg4.h Msg1.h UdpServer.h \
|
||||
UdpSlot.h UdpProtocol.h Multicast.h Threads.h Msg0.h Clusterdb.h \
|
||||
Linkdb.h Msg2.h Query.h Msg20.h Summary.h matches2.h Words.h StopWords.h \
|
||||
Bits.h Pos.h Matches.h HashTableT.h Domains.h CountryCode.h Tagdb.h \
|
||||
Events.h Sections.h IndexList.h Dates.h Msg22.h CatRec.h Categories.h \
|
||||
HashTable.h Catdb.h Datedb.h
|
||||
RdbScan.o: RdbScan.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
UnicodeProperties.h UCPropTable.h iconv.h hash.h Errno.h Log.h RdbScan.h \
|
||||
BigFile.h File.h Mem.h Conf.h Xml.h XmlNode.h Lang.h Iso8859.h \
|
||||
@ -4508,7 +4516,8 @@ Xml.o: Xml.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
openssl/ssl23.h openssl/srtp.h Collectiondb.h HashTableX.h PingServer.h \
|
||||
Titledb.h Rdb.h RdbBase.h RdbScan.h BigFile.h RdbMap.h RdbList.h \
|
||||
RdbDump.h RdbTree.h RdbMem.h RdbBuckets.h RdbCache.h Msg5.h Msg3.h \
|
||||
RdbMerge.h Dir.h DiskPageCache.h Words.h StopWords.h Entities.h
|
||||
RdbMerge.h Dir.h DiskPageCache.h Words.h StopWords.h HttpMime.h \
|
||||
Entities.h
|
||||
XmlDoc.o: XmlDoc.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
UnicodeProperties.h UCPropTable.h iconv.h hash.h Errno.h Log.h XmlDoc.h \
|
||||
Lang.h Iso8859.h iana_charset.h Words.h Xml.h XmlNode.h SafeBuf.h \
|
||||
|
Mem.cpp (107 changed lines)
@ -10,11 +10,13 @@
|
||||
//#include "MemPoolVar.h"
|
||||
//#include "malloc.h"
|
||||
//#include "Stats.h"
|
||||
#include "Pages.h"
|
||||
|
||||
// put me back
|
||||
//#define _EFENCE_
|
||||
//#define EFENCE
|
||||
#define EFENCE_SIZE 100000
|
||||
|
||||
// uncomment this for _EFENCE_ to do underflow checks instead of the
|
||||
// uncomment this for EFENCE to do underflow checks instead of the
|
||||
// default overflow checks
|
||||
//#define _CHECKUNDERFLOW_
|
||||
|
||||
@ -50,7 +52,7 @@
|
||||
// there because it will hit a different PAGE, to be more sure we could
|
||||
// make UNDERPAD and OVERPAD PAGE bytes, although the overrun could still write
|
||||
// to another allocated area of memory and we can never catch it.
|
||||
#ifdef _EFENCE_
|
||||
#ifdef EFENCE
|
||||
#define UNDERPAD 0
|
||||
#define OVERPAD 0
|
||||
#else
|
||||
@ -66,7 +68,7 @@ extern bool g_isYippy;
|
||||
|
||||
bool freeCacheMem();
|
||||
|
||||
#ifdef _EFENCE_
|
||||
#ifdef EFENCE
|
||||
static void *getElecMem ( long size ) ;
|
||||
static void freeElecMem ( void *p ) ;
|
||||
#endif
|
||||
@ -148,7 +150,9 @@ void mutexUnlock ( ) {
|
||||
// make it big for production machines
|
||||
//#define DMEMTABLESIZE (1024*602)
|
||||
// there should not be too many mallocs any more
|
||||
#define DMEMTABLESIZE (1024*302)
|
||||
// i boosted from 300k to 600k so we can get summaries for 150k results
|
||||
// for the csv download...
|
||||
#define DMEMTABLESIZE (1024*602)
|
||||
//#define DMEMTABLESIZE (1024*202)
|
||||
// and small for local machine
|
||||
//#define DMEMTABLESIZE (1024*50)
|
||||
@ -248,7 +252,7 @@ void * operator new (size_t size) throw (std::bad_alloc) {
|
||||
throw std::bad_alloc();
|
||||
//throw 1;
|
||||
}
|
||||
#ifdef _EFENCE_
|
||||
#ifdef EFENCE
|
||||
void *mem = getElecMem(size);
|
||||
#else
|
||||
//void *mem = dlmalloc ( size );
|
||||
@ -266,7 +270,7 @@ newmemloop:
|
||||
//return NULL;
|
||||
}
|
||||
if ( (unsigned long)mem < 0x00010000 ) {
|
||||
#ifdef _EFENCE_
|
||||
#ifdef EFENCE
|
||||
void *remem = getElecMem(size);
|
||||
#else
|
||||
void *remem = sysmalloc(size);
|
||||
@ -274,7 +278,7 @@ newmemloop:
|
||||
log ( LOG_WARN, "mem: Caught low memory allocation at %08lx, "
|
||||
"reallocated to %08lx", (unsigned long)mem,
|
||||
(unsigned long)remem );
|
||||
#ifdef _EFENCE_
|
||||
#ifdef EFENCE
|
||||
freeElecMem (mem);
|
||||
#else
|
||||
sysfree(mem);
|
||||
@ -326,7 +330,7 @@ void * operator new [] (size_t size) throw (std::bad_alloc) {
|
||||
throw std::bad_alloc();
|
||||
//throw 1;
|
||||
}
|
||||
#ifdef _EFENCE_
|
||||
#ifdef EFENCE
|
||||
void *mem = getElecMem(size);
|
||||
#else
|
||||
//void *mem = dlmalloc ( size );
|
||||
@ -345,7 +349,7 @@ newmemloop:
|
||||
//return NULL;
|
||||
}
|
||||
if ( (unsigned long)mem < 0x00010000 ) {
|
||||
#ifdef _EFENCE_
|
||||
#ifdef EFENCE
|
||||
void *remem = getElecMem(size);
|
||||
#else
|
||||
void *remem = sysmalloc(size);
|
||||
@ -353,7 +357,7 @@ newmemloop:
|
||||
log ( LOG_WARN, "mem: Caught low memory allocation at %08lx, "
|
||||
"reallocated to %08lx",
|
||||
(long)mem, (long)remem );
|
||||
#ifdef _EFENCE_
|
||||
#ifdef EFENCE
|
||||
freeElecMem (mem);
|
||||
#else
|
||||
sysfree(mem);
|
||||
@ -423,6 +427,7 @@ pid_t Mem::getPid() {
|
||||
bool Mem::init ( long long maxMem ) {
|
||||
// set main process pid
|
||||
s_pid = getpid();
|
||||
|
||||
// . don't swap our memory out, man...
|
||||
// . damn, linux 2.4.17 seems to crash the kernel sometimes w/ this
|
||||
//if ( mlockall( MCL_CURRENT | MCL_FUTURE ) == -1 ) {
|
||||
@ -440,10 +445,37 @@ bool Mem::init ( long long maxMem ) {
|
||||
if ( g_conf.m_detectMemLeaks )
|
||||
log(LOG_INIT,"mem: Memory leak checking is enabled.");
|
||||
|
||||
#ifdef _EFENCE_
|
||||
#ifdef EFENCE
|
||||
log(LOG_INIT,"mem: using electric fence!!!!!!!");
|
||||
#endif
|
||||
|
||||
// if we can't alloc 3gb exit and retry
|
||||
long long start = gettimeofdayInMilliseconds();
|
||||
char *pools[30];
|
||||
long long count = 0LL;
|
||||
long long chunk = 100000000LL; // 100MB chunks
|
||||
long long need = 3000000000LL; // 3GB
|
||||
long i = 0; for ( i = 0 ; i < 30 ; i++ ) {
|
||||
pools[i] = (char *)mmalloc(chunk,"testmem");
|
||||
count += chunk;
|
||||
if ( pools[i] ) continue;
|
||||
count -= chunk;
|
||||
log("mem: could only alloc %lli bytes of the "
|
||||
"%lli required to run gigablast. exiting.",
|
||||
count , need );
|
||||
}
|
||||
for ( long j = 0 ; j < i ; j++ )
|
||||
mfree ( pools[j] , chunk , "testmem" );
|
||||
long long now = gettimeofdayInMilliseconds();
|
||||
long long took = now - start;
|
||||
if ( took > 20 ) log("mem: took %lli ms to check memory ceiling",took);
|
||||
// return if could not alloc the full 3GB
|
||||
if ( i < 30 ) return false;
|
||||
|
||||
// reset this, our max mem used over time ever because we don't
|
||||
// want the mem test we did above to count towards it
|
||||
m_maxAlloced = 0;
|
||||
|
||||
// init or own malloc stuff in malloc.c (from doug leay)
|
||||
//if ( mdw_init_sbrk ( maxMem ) ) return true;
|
||||
// bitch
|
||||
@ -652,24 +684,24 @@ bool Mem::printMemBreakdownTable ( SafeBuf* sb,
|
||||
|
||||
// make sure the admin viewing this table knows that there will be
|
||||
// frees in here that are delayed if electric fence is enabled.
|
||||
#ifdef _EFENCE_
|
||||
#ifdef EFENCE
|
||||
ss = " <font color=red>*DELAYED FREES ENABLED*</font>";
|
||||
#endif
|
||||
|
||||
sb->safePrintf (
|
||||
"<table>"
|
||||
|
||||
"<table cellpadding=4 width=100%% bgcolor=#%s border=1>"
|
||||
"<table %s>"
|
||||
"<tr>"
|
||||
"<td colspan=3 bgcolor=#%s>"
|
||||
"<center><b>Mem Breakdown%s</b></td></tr>\n"
|
||||
|
||||
"<tr>"
|
||||
"<tr bgcolor=#%s>"
|
||||
"<td><b>allocator</b></td>"
|
||||
"<td><b>num allocs</b></td>"
|
||||
"<td><b>allocated</b></td>"
|
||||
"</tr>" ,
|
||||
lightblue, darkblue , ss );
|
||||
TABLE_STYLE, darkblue , ss , darkblue );
|
||||
|
||||
long n = m_numAllocated * 2;
|
||||
MemEntry *e = (MemEntry *)mcalloc ( sizeof(MemEntry) * n , "Mem" );
|
||||
@ -756,11 +788,12 @@ bool Mem::printMemBreakdownTable ( SafeBuf* sb,
|
||||
// now print into buffer
|
||||
for ( long i = 0 ; i < count ; i++ )
|
||||
sb->safePrintf (
|
||||
"<tr>"
|
||||
"<tr bgcolor=%s>"
|
||||
"<td>%s</td>"
|
||||
"<td>%li</td>"
|
||||
"<td>%li</td>"
|
||||
"</tr>\n",
|
||||
LIGHT_BLUE,
|
||||
winners[i]->m_label,
|
||||
winners[i]->m_numAllocs,
|
||||
winners[i]->m_allocated);
|
||||
@ -1242,14 +1275,24 @@ void *Mem::gbmalloc ( int size , const char *note ) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void *mem;
|
||||
|
||||
// to find bug that cores on malloc do this
|
||||
//printBreeches(true);
|
||||
//g_errno=ENOMEM;return (void *)log("Mem::malloc: reached mem limit");}
|
||||
#ifdef _EFENCE_
|
||||
void *mem = getElecMem(size+UNDERPAD+OVERPAD);
|
||||
#else
|
||||
#ifdef EFENCE
|
||||
mem = getElecMem(size+UNDERPAD+OVERPAD);
|
||||
|
||||
// conditional electric fence?
|
||||
#elif EFENCE_BIG
|
||||
if ( size >= EFENCE_SIZE )
|
||||
mem = getElecMem(size+0+0);
|
||||
else
|
||||
mem = (void *)sysmalloc ( size + UNDERPAD + OVERPAD );
|
||||
#else
|
||||
|
||||
//void *mem = dlmalloc ( size );
|
||||
void *mem = (void *)sysmalloc ( size + UNDERPAD + OVERPAD );
|
||||
mem = (void *)sysmalloc ( size + UNDERPAD + OVERPAD );
|
||||
#endif
|
||||
// initialization debug
|
||||
//char *pend = (char *)mem + UNDERPAD + size;
|
||||
@ -1321,7 +1364,7 @@ mallocmemloop:
|
||||
return NULL;
|
||||
}
|
||||
if ( (unsigned long)mem < 0x00010000 ) {
|
||||
#ifdef _EFENCE_
|
||||
#ifdef EFENCE
|
||||
void *remem = getElecMem(size);
|
||||
#else
|
||||
void *remem = sysmalloc(size);
|
||||
@ -1329,7 +1372,7 @@ mallocmemloop:
|
||||
log ( LOG_WARN, "mem: Caught low memory allocation at %08lx, "
|
||||
"reallocated to %08lx",
|
||||
(unsigned long)mem, (unsigned long)remem );
|
||||
#ifdef _EFENCE_
|
||||
#ifdef EFENCE
|
||||
freeElecMem (mem);
|
||||
#else
|
||||
sysfree(mem);
|
||||
@ -1392,7 +1435,9 @@ void *Mem::gbrealloc ( void *ptr , int oldSize , int newSize ,
|
||||
|
||||
char *mem;
|
||||
|
||||
#ifdef _EFENCE_
|
||||
// even though size may be < 100k for EFENCE_BIG, do it this way
|
||||
// for simplicity...
|
||||
#if defined(EFENCE) || defined(EFENCE_BIG)
|
||||
mem = (char *)mmalloc ( newSize , note );
|
||||
if ( ! mem ) return NULL;
|
||||
// copy over to it
|
||||
@ -1471,10 +1516,19 @@ void Mem::gbfree ( void *ptr , int size , const char *note ) {
|
||||
char *xx = NULL; *xx = 0;
|
||||
}
|
||||
|
||||
#ifdef _EFENCE_
|
||||
#ifdef EFENCE
|
||||
// this does a delayed free so do not call rmMem() just yet
|
||||
freeElecMem ((char *)ptr - UNDERPAD );
|
||||
#else
|
||||
return;
|
||||
#endif
|
||||
|
||||
#ifdef EFENCE_BIG
|
||||
if ( size >= EFENCE_SIZE ) {
|
||||
freeElecMem ((char *)ptr - 0 );
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
bool isnew = s_isnew[slot];
|
||||
|
||||
// if this returns false it was an unbalanced free
|
||||
@ -1482,7 +1536,6 @@ void Mem::gbfree ( void *ptr , int size , const char *note ) {
|
||||
|
||||
if ( isnew ) sysfree ( (char *)ptr );
|
||||
else sysfree ( (char *)ptr - UNDERPAD );
|
||||
#endif
|
||||
}
|
||||
|
||||
long getLowestLitBitLL ( unsigned long long bits ) {
|
||||
|
@ -53,7 +53,7 @@ bool Monitordb::init ( ) {
|
||||
sizeof(key96_t) ,
|
||||
true ); // bias page cache? (true!)
|
||||
}
|
||||
|
||||
/*
|
||||
bool Monitordb::addColl ( char *coll, bool doVerify ) {
|
||||
if ( ! m_rdb.addColl ( coll ) ) return false;
|
||||
if ( ! doVerify ) return true;
|
||||
@ -65,7 +65,7 @@ bool Monitordb::addColl ( char *coll, bool doVerify ) {
|
||||
log ( "db: Verify failed, but scaling is allowed, passing." );
|
||||
return true;
|
||||
}
|
||||
|
||||
*/
|
||||
bool Monitordb::verify ( char *coll ) {
|
||||
log ( LOG_INFO, "db: Verifying Monitordb for coll %s...", coll );
|
||||
g_threads.disableThreads();
|
||||
|
12
Msg13.cpp
@ -1156,10 +1156,12 @@ void gotHttpReply2 ( void *state ,
|
||||
// . if no user-agent line matches * or gigabot/flurbot we
|
||||
// will get just a \0 for the reply, replySize=1!
|
||||
//char *ua = "ProCogBot";//"EventGuruBot";//r->m_userAgent;
|
||||
char *ua = "Gigabot";
|
||||
long uaLen = gbstrlen(ua);
|
||||
replySize = filterRobotsTxt (reply,replySize,&mime,niceness,
|
||||
ua,uaLen);
|
||||
// take this out until it works for
|
||||
// user-agent: *\ndisallow: blah
|
||||
//char *ua = "Gigabot";
|
||||
//long uaLen = gbstrlen(ua);
|
||||
//replySize = filterRobotsTxt (reply,replySize,&mime,niceness,
|
||||
// ua,uaLen);
|
||||
// record in the stats
|
||||
docsPtr = &g_stats.m_compressRobotsTxtDocs;
|
||||
bytesInPtr = &g_stats.m_compressRobotsTxtBytesIn;
|
||||
@ -2020,7 +2022,7 @@ bool getIframeExpandedContent ( Msg13Request *r , TcpSocket *ts ) {
|
||||
xd->m_r = r;
|
||||
|
||||
// so XmlDoc::getExtraDoc doesn't have any issues
|
||||
xd->m_firstIp = 0;
|
||||
xd->m_firstIp = 123456;
|
||||
xd->m_firstIpValid = true;
|
||||
|
||||
// try using xmldoc to do it
|
||||
|
@ -20,9 +20,16 @@ void Msg20::constructor () {
|
||||
|
||||
void Msg20::destructor () { reset(); m_mcast.destructor(); }
|
||||
|
||||
#include "Process.h"
|
||||
|
||||
void Msg20::reset() {
|
||||
// not allowed to reset one in progress
|
||||
if ( m_inProgress ) { char *xx=NULL;*xx=0; }
|
||||
if ( m_inProgress ) {
|
||||
// do not core on abrupt exits!
|
||||
if (g_process.m_mode == EXIT_MODE ) return;
|
||||
// otherwise core
|
||||
char *xx=NULL;*xx=0;
|
||||
}
|
||||
m_launched = false;
|
||||
if ( m_request && m_request != m_requestBuf )
|
||||
mfree ( m_request , m_requestSize , "Msg20rb" );
|
||||
|
@ -334,7 +334,8 @@ void handleRequest22 ( UdpSlot *slot , long netnice ) {
|
||||
// get base, returns NULL and sets g_errno to ENOCOLLREC on error
|
||||
RdbBase *tbase;
|
||||
if ( ! (tbase=getRdbBase(RDB_TITLEDB,coll) ) ) {
|
||||
log("db: Could not get title rec in collection \"%s\".",
|
||||
log("db: Could not get title rec in collection \"%s\" "
|
||||
"because rdbbase is null.",
|
||||
coll);
|
||||
g_errno = EBADENGINEER;
|
||||
us->sendErrorReply ( slot , g_errno );
|
||||
|
24
Msg39.cpp
@ -427,16 +427,24 @@ bool Msg39::getLists () {

// if we have twins, then make sure the twins read different
// pieces of the same docid range to make things 2x faster
bool useTwins = false;
if ( g_hostdb.getNumStripes() == 2 ) useTwins = true;
if ( useTwins ) {
long long delta2 = ( docIdEnd - docIdStart ) / 2;
if ( m_r->m_stripe == 0 ) docIdEnd = docIdStart + delta2;
else docIdStart = docIdStart + delta2;
}
//bool useTwins = false;
//if ( g_hostdb.getNumStripes() == 2 ) useTwins = true;
//if ( useTwins ) {
// long long delta2 = ( docIdEnd - docIdStart ) / 2;
// if ( m_r->m_stripe == 0 ) docIdEnd = docIdStart + delta2;
// else docIdStart = docIdStart + delta2;
//}
// new striping logic:
long numStripes = g_hostdb.getNumStripes();
long long delta2 = ( docIdEnd - docIdStart ) / numStripes;
long stripe = g_hostdb.getMyHost()->m_stripe;
docIdStart += delta2 * stripe; // is this right?
docIdEnd = docIdStart + delta2;
// add 1 to be safe so we don't lose a docid
docIdEnd++;
// TODO: add triplet support later for this to split the
// read 3 ways. 4 ways for quads, etc.
if ( g_hostdb.getNumStripes() >= 3 ) { char *xx=NULL;*xx=0;}
//if ( g_hostdb.getNumStripes() >= 3 ) { char *xx=NULL;*xx=0;}
// do not go over MAX_DOCID because it gets masked and
// ends up being 0!!! and we get empty lists
if ( docIdEnd > MAX_DOCID ) docIdEnd = MAX_DOCID;
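A quick worked example of the striping math above (the numbers are illustrative, not from the commit): with docIdStart=0, docIdEnd=10 and numStripes=3, delta2 truncates to 3, so stripe 0 reads [0,4), stripe 1 reads [3,7) and stripe 2 reads [6,10). The trailing docIdEnd++ is what keeps the integer truncation of delta2 from dropping the tail of the range, at the cost of a one-docid overlap between neighboring stripes; for some range sizes the last stripe can still stop short of the original docIdEnd, which may be what the "is this right?" comment is flagging. Written without mutating the shared variables, the per-stripe slice is roughly:

// sketch only, not part of the commit
long long delta2  = ( docIdEnd - docIdStart ) / numStripes; // truncates
long long myStart = docIdStart + delta2 * stripe;
long long myEnd   = myStart + delta2 + 1; // +1 so truncation can't drop a docid
if ( myEnd > MAX_DOCID ) myEnd = MAX_DOCID; // docids past MAX_DOCID get masked to 0
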
19
Msg4.cpp
@ -541,8 +541,9 @@ bool Msg4::addMetaList ( char *metaList ,
|
||||
s_msg4Tail->m_next = this;
|
||||
// we are the new tail
|
||||
s_msg4Tail = this;
|
||||
// debug log
|
||||
log("msg4: queueing body msg4=0x%lx",(long)this);
|
||||
// debug log. seems to happen a lot if not using threads..
|
||||
if ( g_conf.m_useThreads )
|
||||
log("msg4: queueing body msg4=0x%lx",(long)this);
|
||||
// mark it
|
||||
m_inUse = true;
|
||||
// all done then, but return false so caller does not free
|
||||
@ -556,8 +557,10 @@ bool Msg4::addMetaList ( char *metaList ,
|
||||
// sanity check
|
||||
if ( s_msg4Head || s_msg4Tail ) { char *xx=NULL; *xx=0; }
|
||||
|
||||
// spider hang bug
|
||||
logf(LOG_DEBUG,"msg4: queueing head msg4=0x%lx",(long)this);
|
||||
// . spider hang bug
|
||||
// . debug log. seems to happen a lot if not using threads..
|
||||
if ( g_conf.m_useThreads )
|
||||
logf(LOG_DEBUG,"msg4: queueing head msg4=0x%lx",(long)this);
|
||||
|
||||
// mark it
|
||||
m_inUse = true;
|
||||
@ -1062,8 +1065,10 @@ void storeLineWaiters ( ) {
|
||||
// . if his callback was NULL, then was loaded in loadAddsInProgress()
|
||||
// . we no longer do that so callback should never be null now
|
||||
if ( ! msg4->m_callback ) { char *xx=NULL;*xx=0; }
|
||||
// log this now i guess
|
||||
logf(LOG_DEBUG,"msg4: calling callback for msg4=0x%lx",(long)msg4);
|
||||
// log this now i guess. seems to happen a lot if not using threads
|
||||
if ( g_conf.m_useThreads )
|
||||
logf(LOG_DEBUG,"msg4: calling callback for msg4=0x%lx",
|
||||
(long)msg4);
|
||||
// release it
|
||||
msg4->m_inUse = false;
|
||||
// call his callback
|
||||
@ -1074,7 +1079,7 @@ void storeLineWaiters ( ) {
|
||||
goto loop;
|
||||
}
|
||||
|
||||
|
||||
#include "Process.h"
|
||||
|
||||
// . destroys the slot if false is returned
|
||||
// . this is registered in Msg4::set() to handle add rdb record msgs
|
||||
|
32
Msg40.cpp
@ -17,7 +17,7 @@
|
||||
|
||||
// increasing this doesn't seem to improve performance any on a single
|
||||
// node cluster....
|
||||
#define MAX_OUTSTANDING_MSG20S 50
|
||||
#define MAX_OUTSTANDING_MSG20S 200
|
||||
|
||||
//static void handleRequest40 ( UdpSlot *slot , long netnice );
|
||||
//static void gotExternalReplyWrapper ( void *state , void *state2 ) ;
|
||||
@ -1184,6 +1184,10 @@ bool gotSummaryWrapper ( void *state ) {
|
||||
Msg40 *THIS = (Msg40 *)state;
|
||||
// inc it here
|
||||
THIS->m_numReplies++;
|
||||
// log every 1000 i guess
|
||||
if ( (THIS->m_numReplies % 1000) == 0 )
|
||||
log("msg40: got %li summaries out of %li",THIS->m_numReplies,
|
||||
THIS->m_msg3a.m_numDocIds);
|
||||
// it returns false if we're still awaiting replies
|
||||
if ( ! THIS->gotSummary ( ) ) return false;
|
||||
// now call callback, we're done
|
||||
@ -1217,11 +1221,24 @@ bool Msg40::gotSummary ( ) {
|
||||
// reset g_errno
|
||||
g_errno = 0;
|
||||
}
|
||||
/*
|
||||
// sanity check
|
||||
for ( long i = 0 ; i < m_msg3a.m_numDocIds ; i++ ) {
|
||||
// stop as soon as we hit a gap breaking our contiguity...
|
||||
Msg20 *m = m_msg20[i];
|
||||
if ( ! m ) continue;
|
||||
Msg20Reply *mr = m->m_r;
|
||||
if ( ! mr ) continue;
|
||||
char *cc = mr->ptr_content;
|
||||
if ( ! cc ) continue;
|
||||
//if ( ! strstr(cc,"Modern Marketing KF400032MA") ) continue;
|
||||
//log("hey");
|
||||
//fprintf(stderr,"msg %li = %s\n",i,cc );
|
||||
if ( i == 48329 ) { char *xx=NULL;*xx=0; }
|
||||
mr->ptr_content = NULL;
|
||||
}
|
||||
*/
|
||||
|
||||
// . ok, now i wait for everybody.
|
||||
// . TODO: evaluate if this hurts us
|
||||
if ( m_numReplies < m_numRequests )
|
||||
return false;
|
||||
|
||||
doAgain:
|
||||
|
||||
@ -1245,6 +1262,11 @@ bool Msg40::gotSummary ( ) {
|
||||
//char *xx=NULL; *xx=0;
|
||||
}
|
||||
|
||||
// . ok, now i wait for everybody.
|
||||
// . TODO: evaluate if this hurts us
|
||||
if ( m_numReplies < m_numRequests )
|
||||
return false;
|
||||
|
||||
|
||||
// save this before we increment m_numContiguous
|
||||
//long oldNumContiguous = m_numContiguous;
|
||||
|
48
Msg5.cpp
@ -22,6 +22,7 @@ long g_numCorrupt = 0;
|
||||
|
||||
Msg5::Msg5() {
|
||||
m_waitingForList = false;
|
||||
//m_waitingForMerge = false;
|
||||
m_numListPtrs = 0;
|
||||
m_mergeLists = true;
|
||||
reset();
|
||||
@ -33,7 +34,7 @@ Msg5::~Msg5() {
|
||||
|
||||
// frees m_treeList
|
||||
void Msg5::reset() {
|
||||
if ( m_waitingForList ) {
|
||||
if ( m_waitingForList ) { // || m_waitingForMerge ) {
|
||||
log("disk: Trying to reset a class waiting for a reply.");
|
||||
// might being doing an urgent exit (mainShutdown(1)) or
|
||||
// g_process.shutdown(), so do not core here
|
||||
@ -45,7 +46,6 @@ void Msg5::reset() {
|
||||
m_prevCount = 0;
|
||||
//m_prevKey.setMin();
|
||||
KEYMIN(m_prevKey,MAX_KEY_BYTES);// m_ks); m_ks is invalid
|
||||
m_waitingForList = false;
|
||||
// free lists if m_mergeLists was false
|
||||
for ( long i = 0 ; ! m_mergeLists && i < m_numListPtrs ; i++ )
|
||||
m_listPtrs[i]->freeList();
|
||||
@ -203,6 +203,13 @@ bool Msg5::getList ( char rdbId ,
|
||||
// remember stuff
|
||||
m_rdbId = rdbId;
|
||||
m_coll = coll;
|
||||
|
||||
m_collnum = g_collectiondb.getCollnum ( coll );
|
||||
if ( m_collnum < 0 ) {
|
||||
g_errno = ENOCOLLREC;
|
||||
return true;
|
||||
}
|
||||
|
||||
m_list = list;
|
||||
//m_startKey = startKey;
|
||||
//m_endKey = endKey;
|
||||
@ -466,7 +473,12 @@ bool Msg5::getList ( char rdbId ,
// timing debug
//log("Msg5:getting list startKey.n1=%lu",m_startKey.n1);
// start the read loop - hopefully, will only loop once
return readList ( );
if ( readList ( ) ) return true;

// tell Spider.cpp not to nuke us until we get back!!!
m_waitingForList = true;
// we blocked!!! must call m_callback
return false;
}
// . returns false if blocked, true otherwise
// . sets g_errno on error
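The return-value convention here is the one stated in the comments: getList() returns true when the list is ready (or g_errno is set) and false when the read blocked, in which case m_waitingForList stays set so Spider.cpp does not destroy the Msg5 until gotListWrapper() invokes m_callback ( m_state , m_list , this ). A caller-side sketch of that convention (arguments elided; processList is a hypothetical helper, not from this commit):

// fires only if getList() returned false, i.e. the read blocked
void gotMyListWrapper ( void *state , RdbList *list , Msg5 *msg5 ) {
	processList ( list );
}
...
if ( ! msg5->getList ( /* rdbId , coll , &list , keys , ... , state , gotMyListWrapper , niceness */ ) )
	return;            // blocked; gotMyListWrapper() is called when the read completes
processList ( &list );     // did not block; the list is ready now (or g_errno is set)
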
@ -725,7 +737,7 @@ bool Msg5::readList ( ) {
|
||||
if ( m_treeList.m_ks != m_ks ) { char *xx = NULL; *xx = 0; }
|
||||
|
||||
// we are waiting for the list
|
||||
m_waitingForList = true;
|
||||
//m_waitingForList = true;
|
||||
|
||||
// clear just in case
|
||||
g_errno = 0;
|
||||
@ -915,6 +927,8 @@ void gotListWrapper ( void *state ) {
|
||||
if ( THIS->m_calledCallback ) { char *xx=NULL;*xx=0; }
|
||||
// set it now
|
||||
THIS->m_calledCallback = 1;
|
||||
// we are no longer waiting for the list
|
||||
THIS->m_waitingForList = false;
|
||||
// when completely done call the callback
|
||||
THIS->m_callback ( THIS->m_state , THIS->m_list , THIS );
|
||||
}
|
||||
@ -931,7 +945,7 @@ static void *mergeListsWrapper_r ( void *state , ThreadEntry *t ) ;
|
||||
bool Msg5::gotList ( ) {
|
||||
|
||||
// we are no longer waiting for the list
|
||||
m_waitingForList = false;
|
||||
//m_waitingForList = false;
|
||||
|
||||
// debug msg
|
||||
//log("msg5 got lists from msg3 (msg5=%lu)",(long)this);
|
||||
@ -1064,8 +1078,15 @@ bool Msg5::gotList2 ( ) {
|
||||
// sanity check
|
||||
//if ( KEYNEG(m_listPtrs[i]->getEndKey()) ) {
|
||||
// char *xx=NULL;*xx=0; }
|
||||
if ( KEYCMP(m_listPtrs[i]->getEndKey(),m_minEndKey,m_ks)<0 )
|
||||
if ( KEYCMP(m_listPtrs[i]->getEndKey(),m_minEndKey,m_ks)<0 ) {
|
||||
KEYSET(m_minEndKey,m_listPtrs[i]->getEndKey(),m_ks);
|
||||
// crap, if list is all negative keys, then the
|
||||
// end key seems negative too! however in this
|
||||
// case RdbScan::m_endKey seems positive so
|
||||
// maybe we got a negative endkey in constrain?
|
||||
//if (! (m_minEndKey[0] & 0x01) )
|
||||
// log("msg5: list had bad endkey");
|
||||
}
|
||||
}
|
||||
// sanity check
|
||||
//if ( KEYNEG( m_minEndKey) ) {char *xx=NULL;*xx=0; }
|
||||
@ -1152,7 +1173,7 @@ bool Msg5::gotList2 ( ) {
|
||||
// filter happens and we have a chance to weed out old titleRecs
|
||||
if ( m_rdbId == RDB_TITLEDB && m_numFiles != 1 && n == 1 &&
|
||||
m_isRealMerge ) {
|
||||
log(LOG_LOGIC,"db: Adding dummy list.");
|
||||
//log(LOG_LOGIC,"db: Adding dummy list.");
|
||||
//m_tfns [n] = 255;
|
||||
m_dummy.set ( NULL , // list data
|
||||
0 , // list data size
|
||||
@ -1377,6 +1398,8 @@ bool Msg5::gotList2 ( ) {
|
||||
// skip it for now
|
||||
//goto skipThread;
|
||||
|
||||
//m_waitingForMerge = true;
|
||||
|
||||
// . if size is big, make a thread
|
||||
// . let's always make niceness 0 since it wasn't being very
|
||||
// aggressive before
|
||||
@ -1386,8 +1409,11 @@ bool Msg5::gotList2 ( ) {
|
||||
threadDoneWrapper ,
|
||||
mergeListsWrapper_r ) )
|
||||
return false;
|
||||
|
||||
//m_waitingForMerge = false;
|
||||
|
||||
// thread creation failed
|
||||
if ( ! g_threads.areThreadsDisabled() )
|
||||
if ( g_conf.m_useThreads && ! g_threads.m_disabled )
|
||||
log(LOG_INFO,
|
||||
"net: Failed to create thread to merge lists. Doing "
|
||||
"blocking merge. Hurts performance.");
|
||||
@ -1441,6 +1467,8 @@ void threadDoneWrapper ( void *state , ThreadEntry *t ) {
|
||||
if ( THIS->needsRecall() && ! THIS->readList() ) return;
|
||||
// sanity check
|
||||
if ( THIS->m_calledCallback ) { char *xx=NULL;*xx=0; }
|
||||
// we are no longer waiting for the list
|
||||
THIS->m_waitingForList = false;
|
||||
// set it now
|
||||
THIS->m_calledCallback = 3;
|
||||
// when completely done call the callback
|
||||
@ -1716,6 +1744,8 @@ void Msg5::mergeLists_r ( ) {
|
||||
// . we are left with an empty list
|
||||
bool Msg5::doneMerging ( ) {
|
||||
|
||||
//m_waitingForMerge = false;
|
||||
|
||||
// get base, returns NULL and sets g_errno to ENOCOLLREC on error
|
||||
RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_coll))) return true;
|
||||
|
||||
@ -2017,6 +2047,8 @@ void gotRemoteListWrapper( void *state ) { // , RdbList *list ) {
|
||||
if ( ! THIS->gotRemoteList() ) return;
|
||||
// sanity check
|
||||
if ( THIS->m_calledCallback ) { char *xx=NULL;*xx=0; }
|
||||
// we are no longer waiting for the list
|
||||
THIS->m_waitingForList = false;
|
||||
// set it now
|
||||
THIS->m_calledCallback = 4;
|
||||
// if it doesn't block call the callback, g_errno may be set
|
||||
|
2
Msg5.h
@ -293,6 +293,8 @@ class Msg5 {
|
||||
bool m_mergeLists;
|
||||
|
||||
char m_waitingForList;
|
||||
//char m_waitingForMerge;
|
||||
collnum_t m_collnum;
|
||||
|
||||
// actually part of a different algo than m_waitingForList!
|
||||
unsigned long long m_waitingKey;
|
||||
|
11
Msge0.cpp
@ -118,8 +118,14 @@ bool Msge0::launchRequests ( long starti ) {
|
||||
loop:
|
||||
// stop if no more urls. return true if we got all replies! no block.
|
||||
if ( m_n >= m_numUrls ) return (m_numRequests == m_numReplies);
|
||||
// if all hosts are getting a diffbot reply with 50 spiders and they
|
||||
// all timeout at the same time we can very easily clog up the
|
||||
// udp sockets, so use this to limit... i've seen the whole
|
||||
// spider tables stuck with "getting outlink tag rec vector"statuses
|
||||
long maxOut = MAX_OUTSTANDING_MSGE0;
|
||||
if ( g_udpServer.m_numUsedSlots > 500 ) maxOut = 1;
|
||||
// if we are maxed out, we basically blocked!
|
||||
if (m_numRequests - m_numReplies >= MAX_OUTSTANDING_MSGE0)return false;
|
||||
if (m_numRequests - m_numReplies >= maxOut ) return false;
|
||||
// . skip if "old"
|
||||
// . we are not planning on adding this to spiderdb, so Msg16
|
||||
// want to skip the ip lookup, etc.
|
||||
@ -145,7 +151,8 @@ bool Msge0::launchRequests ( long starti ) {
|
||||
// . grab a slot
|
||||
// . m_msg8as[i], m_msgCs[i], m_msg50s[i], m_msg20s[i]
|
||||
long i;
|
||||
for ( i = starti ; i < MAX_OUTSTANDING_MSGE0 ; i++ )
|
||||
// make this 0 since "maxOut" now changes!!
|
||||
for ( i = 0 /*starti*/ ; i < MAX_OUTSTANDING_MSGE0 ; i++ )
|
||||
if ( ! m_used[i] ) break;
|
||||
// sanity check
|
||||
if ( i >= MAX_OUTSTANDING_MSGE0 ) { char *xx = NULL; *xx = 0; }
|
||||
|
@ -443,6 +443,8 @@ void Multicast::gotReply2 ( UdpSlot *slot ) {
|
||||
long now = getTime();
|
||||
if (now - s_elastTime > 10) {s_elastTime = now; logIt=true;}
|
||||
}
|
||||
// don't log ETRYAGAIN, may come across as bad when it is normal
|
||||
if ( m_errnos[i] == ETRYAGAIN ) logIt = false;
|
||||
// log a failure msg
|
||||
if ( logIt ) { // m_errnos[i] != ETRYAGAIN ) {
|
||||
Host *h = m_hostdb->getHost ( slot->m_ip ,slot->m_port );
|
||||
|
@ -31,7 +31,7 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
|
||||
char *msg = NULL;
|
||||
|
||||
// if any host in network is dead, do not do this
|
||||
if ( g_hostdb.hasDeadHost() ) msg = "A host in the network is dead.";
|
||||
//if ( g_hostdb.hasDeadHost() ) msg = "A host in the network is dead.";
|
||||
|
||||
// . are we adding a collection?
|
||||
// . return if error adding, might already exist!
|
||||
@ -85,15 +85,18 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
|
||||
// print the add collection box
|
||||
if ( add /*&& (! nc[0] || g_errno ) */ ) {
|
||||
p.safePrintf (
|
||||
"<center>\n<table border=1 cellpadding=4 "
|
||||
"width=100%% bgcolor=#%s>\n"
|
||||
"<tr><td colspan=2 bgcolor=#%s>"
|
||||
"<center>\n<table %s>\n"
|
||||
"<tr class=hdrow><td colspan=2>"
|
||||
"<center><b>Add Collection</b></center>"
|
||||
"</td></tr>\n",LIGHT_BLUE,DARK_BLUE);
|
||||
"</td></tr>\n",
|
||||
TABLE_STYLE);
|
||||
p.safePrintf (
|
||||
"<tr><td><b>name of new collection to add</td>\n"
|
||||
"<tr bgcolor=#%s>"
|
||||
"<td><b>name of new collection to add</td>\n"
|
||||
"<td><input type=text name=addColl size=30>"
|
||||
"</td></tr>\n");
|
||||
"</td></tr>\n"
|
||||
, LIGHT_BLUE
|
||||
);
|
||||
// now list collections from which to copy the config
|
||||
//p.safePrintf (
|
||||
// "<tr><td><b>copy configuration from this "
|
||||
@ -118,27 +121,31 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
|
||||
// print all collections out in a checklist so you can check the
|
||||
// ones you want to delete, the values will be the id of that collectn
|
||||
p.safePrintf (
|
||||
"<center>\n<table border=1 cellpadding=4 "
|
||||
"width=100%% bgcolor=#%s>\n"
|
||||
"<tr><td bgcolor=#%s><center><b>Delete Collections"
|
||||
"<center>\n<table %s>\n"
|
||||
"<tr class=hdrow><td><center><b>Delete Collections"
|
||||
"</b></center></td></tr>\n"
|
||||
"<tr><td>"
|
||||
"<tr bgcolor=#%s><td>"
|
||||
"<center><b>Select the collections you wish to delete. "
|
||||
//"<font color=red>This feature is currently under "
|
||||
//"development.</font>"
|
||||
"</b></center></td></tr>\n"
|
||||
"<tr><td>"
|
||||
"<tr bgcolor=#%s><td>"
|
||||
// table within a table
|
||||
"<center><table width=20%%>\n",
|
||||
LIGHT_BLUE,DARK_BLUE);
|
||||
TABLE_STYLE,
|
||||
LIGHT_BLUE,
|
||||
DARK_BLUE
|
||||
);
|
||||
|
||||
for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
|
||||
CollectionRec *cr = g_collectiondb.m_recs[i];
|
||||
if ( ! cr ) continue;
|
||||
p.safePrintf (
|
||||
"<tr><td>"
|
||||
"<input type=checkbox name=delete value=\"%s\"> "
|
||||
"%s</td></tr>\n",cr->m_coll,cr->m_coll);
|
||||
"<tr bgcolor=#%s><td>"
|
||||
"<input type=checkbox name=delColl value=\"%s\"> "
|
||||
"%s</td></tr>\n",
|
||||
DARK_BLUE,
|
||||
cr->m_coll,cr->m_coll);
|
||||
}
|
||||
p.safePrintf( "</table></center></td></tr></table><br>\n" );
|
||||
skip:
|
||||
|
@ -89,7 +89,7 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
|
||||
collLen = gbstrlen(coll);
|
||||
}
|
||||
// get collection rec
|
||||
CollectionRec *cr = g_collectiondb.getRec ( coll );
|
||||
CollectionRec *cr = g_collectiondb.getRec ( r ); // coll );
|
||||
// bitch if no collection rec found
|
||||
if ( ! cr ) {
|
||||
g_errno = ENOCOLLREC;
|
||||
@ -248,8 +248,6 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
|
||||
//
|
||||
|
||||
SpiderRequest *sreq = &st1->m_sreq;
|
||||
|
||||
|
||||
// set the SpiderRequest from this add url
|
||||
if ( ! sreq->setFromAddUrl ( st1->m_url ) ) {
|
||||
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
||||
|
@ -149,30 +149,54 @@ bool sendReply ( void *state ) {
|
||||
// . do not print big links if only an assassin, just print host ids
|
||||
g_pages.printAdminTop ( &sb, st->m_socket , &st->m_r );
|
||||
|
||||
sb.safePrintf ( "<table width=100%% bgcolor=#%s border=1 cellpadding=4>"
|
||||
"<tr><td bgcolor=#%s colspan=2>"
|
||||
sb.safePrintf(
|
||||
"<style>"
|
||||
".poo { background-color:#%s;}\n"
|
||||
"</style>\n" ,
|
||||
LIGHT_BLUE );
|
||||
|
||||
|
||||
sb.safePrintf ( "<table %s>"
|
||||
"<tr><td colspan=2>"
|
||||
"<center><font size=+1><b>Catdb</b></font></center>"
|
||||
"</td></tr>", LIGHT_BLUE , DARK_BLUE );
|
||||
"</td></tr>", TABLE_STYLE );
|
||||
|
||||
// instructions
|
||||
sb.safePrintf("<tr bgcolor=#%s>"
|
||||
"<td colspan=3>"
|
||||
"<font size=-2>"
|
||||
"<center>"
|
||||
"Don't just start using this, you need to follow the "
|
||||
"instructions in the <i>admin guide</i> for adding "
|
||||
"DMOZ support."
|
||||
"</center>"
|
||||
"</font>"
|
||||
"</td>"
|
||||
"</tr>"
|
||||
,DARK_BLUE
|
||||
);
|
||||
|
||||
// print the generate Catdb link
|
||||
sb.safePrintf ( "<tr><td>Update Catdb from DMOZ data.</td>"
|
||||
sb.safePrintf ( "<tr class=poo><td>Update Catdb from DMOZ data.</td>"
|
||||
"<td><center>"
|
||||
"<a href=\"/master/catdb?c=%s&gencatdb=2\">"
|
||||
"Update Catdb</a> "
|
||||
"</center></td></tr>",
|
||||
st->m_coll );
|
||||
sb.safePrintf ( "<tr><td>Generate New Catdb from DMOZ data.</td>"
|
||||
sb.safePrintf ( "<tr class=poo>"
|
||||
"<td>Generate New Catdb from DMOZ data.</td>"
|
||||
"<td><center>"
|
||||
"<a href=\"/master/catdb?c=%s&gencatdb=1\">"
|
||||
"Generate Catdb</a> "
|
||||
"</center></td></tr>",
|
||||
st->m_coll );
|
||||
if (st->m_genCatdb)
|
||||
sb.safePrintf ( "<tr><td> Catdb Generation took %lli ms."
|
||||
sb.safePrintf ( "<tr class=poo>"
|
||||
"<td> Catdb Generation took %lli ms."
|
||||
"</td></tr>",
|
||||
endTime - st->m_startTime );
|
||||
// print Url Catgory Lookup
|
||||
sb.safePrintf ( "<tr><td>Lookup Category of Url.</td>"
|
||||
sb.safePrintf ( "<tr class=poo><td>Lookup Category of Url.</td>"
|
||||
"<td><input type=text name=caturl size=80"
|
||||
" value=\"");
|
||||
if (st->m_catLookup) {
|
||||
|
447
PageCrawlBot.cpp
@ -160,6 +160,10 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
|
||||
rdbId = RDB_SPIDERDB;
|
||||
fmt = FMT_CSV;
|
||||
}
|
||||
else if ( ( xx = strstr ( path , "_urls.txt" ) ) ) {
|
||||
rdbId = RDB_SPIDERDB;
|
||||
fmt = FMT_TXT;
|
||||
}
|
||||
else if ( ( xx = strstr ( path , "_pages.txt" ) ) ) {
|
||||
rdbId = RDB_TITLEDB;
|
||||
fmt = FMT_TXT;
|
||||
@ -204,6 +208,10 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
|
||||
SafeBuf sb2(tmp2,5000);
|
||||
sb2.safePrintf("GET /search.csv?icc=1&format=csv&sc=0&dr=0&"
|
||||
"c=%s&n=1000000&"
|
||||
// no gigabits
|
||||
"dsrt=0&"
|
||||
// do not compute summary. 0 lines.
|
||||
"ns=0&"
|
||||
"q=gbsortby%%3Agbspiderdate&"
|
||||
"prepend=type%%3Ajson"
|
||||
"\r\n\r\n"
|
||||
@ -231,6 +239,7 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
|
||||
return g_httpServer.sendErrorReply(sock,500,mstrerror(g_errno));
|
||||
}
|
||||
mnew ( st , sizeof(StateCD), "statecd");
|
||||
|
||||
// initialize the new state
|
||||
st->m_rdbId = rdbId;
|
||||
st->m_downloadJSON = downloadJSON;
|
||||
@ -266,13 +275,60 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// . all wrappers call this
|
||||
// . returns false if would block, true otherwise
|
||||
bool readAndSendLoop ( StateCD *st , bool readFirst ) {
|
||||
|
||||
subloop:
|
||||
|
||||
// if we had a broken pipe on the sendChunk() call then hopefully
|
||||
// this will kick in...
|
||||
if ( g_errno ) {
|
||||
log("crawlbot: readAndSendLoop: %s",mstrerror(g_errno));
|
||||
readFirst = true;
|
||||
st->m_someoneNeedsMore = false;
|
||||
}
|
||||
|
||||
// wait if some are outstanding. how can this happen?
|
||||
if ( st->m_numRequests > st->m_numReplies ) {
|
||||
log("crawlbot: only got %li of %li replies. waiting for "
|
||||
"all to come back in.",
|
||||
st->m_numReplies,st->m_numRequests);
|
||||
return false;
|
||||
}
|
||||
|
||||
// are we all done?
|
||||
if ( readFirst && ! st->m_someoneNeedsMore ) {
|
||||
log("crawlbot: done sending for download request");
|
||||
mdelete ( st , sizeof(StateCD) , "stcd" );
|
||||
delete st;
|
||||
return true;
|
||||
}
|
||||
|
||||
// begin reading from each shard and sending the spiderdb records
|
||||
// over the network. return if that blocked
|
||||
if ( readFirst && ! st->readDataFromRdb ( ) ) return false;
|
||||
|
||||
// send it to the browser socket. returns false if blocks.
|
||||
if ( ! st->sendList() ) return false;
|
||||
|
||||
// read again i guess
|
||||
readFirst = true;
|
||||
|
||||
// hey, it did not block... tcpserver caches writes...
|
||||
goto subloop;
|
||||
}
|
||||
|
||||
void StateCD::sendBackDump2 ( ) {
|
||||
|
||||
m_numRequests = 0;
|
||||
m_numReplies = 0;
|
||||
|
||||
// read 10MB from each shard's spiderdb at a time
|
||||
m_minRecSizes = 9999999;
|
||||
//m_minRecSizes = 9999999;
|
||||
// 1ook to be more fluid
|
||||
m_minRecSizes = 99999;
|
||||
|
||||
// we stop reading from all shards when this becomes false
|
||||
m_someoneNeedsMore = true;
|
||||
@ -284,20 +340,22 @@ void StateCD::sendBackDump2 ( ) {
|
||||
KEYMIN((char *)&m_titledbStartKeys[i],sizeof(key_t));
|
||||
}
|
||||
|
||||
subloop:
|
||||
// begin reading from each shard and sending the spiderdb records
|
||||
// over the network. return if that blocked
|
||||
if ( ! readDataFromRdb ( ) ) return;
|
||||
// send it to the browser socket
|
||||
if ( ! sendList() ) return;
|
||||
// . hey, it did not block... i guess no data to send out
|
||||
// . but if all shards are exhausted from the dump, just return
|
||||
if ( m_someoneNeedsMore ) goto subloop;
|
||||
// note it
|
||||
log("crawlbot: nobody needs more 1");
|
||||
// begin reading from the shards and trasmitting back on m_socket
|
||||
readAndSendLoop ( this , true );
|
||||
}
|
||||
|
||||
void sendListWrapper ( void *state ) ;
|
||||
|
||||
static void gotListWrapper7 ( void *state ) {
|
||||
// get the Crawler dump State
|
||||
StateCD *st = (StateCD *)state;
|
||||
// inc it up here
|
||||
st->m_numReplies++;
|
||||
// wait for all
|
||||
if ( st->m_numReplies < st->m_numRequests ) return;
|
||||
// read and send loop
|
||||
readAndSendLoop( st , false );
|
||||
}
|
||||
|
||||
|
||||
bool StateCD::readDataFromRdb ( ) {
|
||||
|
||||
@ -341,7 +399,7 @@ bool StateCD::readDataFromRdb ( ) {
|
||||
// records
|
||||
m_minRecSizes,
|
||||
this,
|
||||
sendListWrapper ,
|
||||
gotListWrapper7 ,
|
||||
niceness ) ) {
|
||||
log("crawlbot: blocked getting list from shard");
|
||||
// continue if it blocked
|
||||
@ -360,22 +418,6 @@ bool StateCD::readDataFromRdb ( ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
void sendListWrapper ( void *state ) {
|
||||
// get the Crawler dump State
|
||||
StateCD *st = (StateCD *)state;
|
||||
// inc it up here
|
||||
st->m_numReplies++;
|
||||
subloop:
|
||||
// if this blocked sending back some data, return
|
||||
if ( ! st->sendList() ) return;
|
||||
// otherwise, read more, maybe had no data to send from list
|
||||
if ( ! st->readDataFromRdb () ) return;
|
||||
// send and read more
|
||||
if ( st->m_someoneNeedsMore ) goto subloop;
|
||||
// note it
|
||||
log("crawlbot: nobody needs more 2");
|
||||
}
|
||||
|
||||
bool StateCD::sendList ( ) {
|
||||
// get the Crawler dump State
|
||||
// inc it
|
||||
@ -403,6 +445,7 @@ bool StateCD::sendList ( ) {
|
||||
// then do so here, the content-length will not be in there
|
||||
// because we might have to call for more spiderdb data
|
||||
if ( m_needsMime ) {
|
||||
m_needsMime = false;
|
||||
HttpMime mime;
|
||||
mime.makeMime ( -1, // totel content-lenght is unknown!
|
||||
0 , // do not cache (cacheTime)
|
||||
@ -496,6 +539,13 @@ bool StateCD::sendList ( ) {
|
||||
list->freeList();
|
||||
}
|
||||
|
||||
//log("rdbid=%li fmt=%li some=%li printed=%li",
|
||||
// (long)m_rdbId,(long)m_fmt,(long)m_someoneNeedsMore,
|
||||
// (long)m_printedEndingBracket);
|
||||
|
||||
bool lastChunk = false;
|
||||
if ( ! m_someoneNeedsMore )
|
||||
lastChunk = true;
|
||||
|
||||
// if nobody needs to read more...
|
||||
if ( m_rdbId == RDB_TITLEDB &&
|
||||
@ -504,113 +554,31 @@ bool StateCD::sendList ( ) {
|
||||
! m_printedEndingBracket ) {
|
||||
m_printedEndingBracket = true;
|
||||
// end array of json objects. might be empty!
|
||||
sb.safePrintf("\n]");
|
||||
sb.safePrintf("\n]\n");
|
||||
//log("adding ]. len=%li",sb.length());
|
||||
}
|
||||
|
||||
// if first time, send it back
|
||||
if ( m_needsMime ) {
|
||||
// only do once
|
||||
m_needsMime = false;
|
||||
TcpServer *tcp = &g_httpServer.m_tcp;
|
||||
|
||||
sendLoop:
|
||||
// start the send process
|
||||
TcpServer *tcp = &g_httpServer.m_tcp;
|
||||
if ( ! tcp->sendMsg ( m_socket ,
|
||||
sb.getBufStart(), // sendBuf ,
|
||||
sb.getCapacity(),//sendBufSize ,
|
||||
sb.length(),//sendBufSize ,
|
||||
sb.length(), // msgtotalsize
|
||||
this , // data for callback
|
||||
doneSendingWrapper ) ) { // callback
|
||||
// do not free sendbuf we are transmitting it
|
||||
sb.detachBuf();
|
||||
return false;
|
||||
}
|
||||
// error?
|
||||
//TcpSocket *s = m_socket;
|
||||
// sometimes it does not block and is successful because
|
||||
// it just writes its buffer out in one write call.
|
||||
//if ( ! g_errno )
|
||||
sb.detachBuf();
|
||||
// . transmit the chunk in sb
|
||||
// . steals the allocated buffer from sb and stores in the
|
||||
// TcpSocket::m_sendBuf, which it frees when socket is
|
||||
// ultimately destroyed or we call sendChunk() again.
|
||||
// . when TcpServer is done transmitting, it does not close the
|
||||
// socket but rather calls doneSendingWrapper() which can call
|
||||
// this function again to send another chunk
|
||||
// . when we are truly done sending all the data, then we set lastChunk
|
||||
// to true and TcpServer.cpp will destroy m_socket when done
|
||||
if ( ! tcp->sendChunk ( m_socket ,
|
||||
&sb ,
|
||||
this ,
|
||||
doneSendingWrapper ,
|
||||
lastChunk ) )
|
||||
return false;
|
||||
|
||||
// log it
|
||||
//log("crawlbot: nuking state. strange");
|
||||
|
||||
// nuke state
|
||||
//delete this;
|
||||
//mdelete ( this , sizeof(StateCD) , "stcd" );
|
||||
//if ( g_errno )
|
||||
log("diffbot: tcp sendmsg did not block: %s",
|
||||
mstrerror(g_errno));
|
||||
//g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
|
||||
// wait for doneSendingWrapper to be called.
|
||||
//return false;
|
||||
//
|
||||
// it did not block... so just keep going. that just
|
||||
// means the socket sent the data. it's probably buffered.
|
||||
//
|
||||
// but we DO have to free the sendbuffer here since
|
||||
// we did not block
|
||||
mfree ( m_socket->m_sendBuf ,
|
||||
m_socket->m_sendBufSize ,
|
||||
"dbsbuf");
|
||||
m_socket->m_sendBuf = NULL;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// if nothing to send back we are done. return true since we
|
||||
// did not block sending back.
|
||||
if ( sb.length() == 0 ) {
|
||||
//log("crawlbot: nuking state.");
|
||||
//delete this;
|
||||
//mdelete ( this , sizeof(StateCD) , "stcd" );
|
||||
return true;
|
||||
}
|
||||
|
||||
// how can this be?
|
||||
if ( m_socket->m_sendBuf ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// put socket in sending-again mode
|
||||
m_socket->m_sendBuf = sb.getBufStart();
|
||||
m_socket->m_sendBufSize = sb.getCapacity();
|
||||
m_socket->m_sendBufUsed = sb.length();
|
||||
m_socket->m_sendOffset = 0;
|
||||
m_socket->m_totalSent = 0;
|
||||
m_socket->m_totalToSend = sb.length();
|
||||
|
||||
// tell TcpServer.cpp to send this latest buffer! HACK!
|
||||
//m_socket->m_sockState = ST_SEND_AGAIN;//ST_WRITING;//SEND_AGAIN;
|
||||
|
||||
// this does nothing if we were not called indirectly by
|
||||
// TcpServer::writeSocketWrapper_r(). so if we should call
|
||||
// sendMsg() ourselves in such a situation.
|
||||
// so if the sendMsg() did not block, the first time, and we came
|
||||
// here empty except for the ending ']' the 2nd time, then
|
||||
// write it out this way... calling sendMsg() directly
|
||||
if ( m_socket->m_sockState == ST_NEEDS_CLOSE ) {
|
||||
//m_socket->m_sockState = ST_SEND_AGAIN;
|
||||
goto sendLoop;
|
||||
}
|
||||
|
||||
// do not let safebuf free this, we will take care of it
|
||||
sb.detachBuf();
|
||||
|
||||
// . when it is done sending call this callback, don't hang up!
|
||||
// . if m_someoneNeedsMore is false then this callback should just
|
||||
// destroy the socket and delete "this"
|
||||
m_socket->m_callback = doneSendingWrapper;
|
||||
m_socket->m_state = this;
|
||||
|
||||
//if ( m_socket->m_sendBufUsed == 79 )
|
||||
// log("hey");
|
||||
|
||||
// log it
|
||||
log("crawlbot: resending %li bytes on socket",m_socket->m_sendBufUsed);
|
||||
|
||||
// we blocked sending back
|
||||
return false;
|
||||
// we are done sending this chunk, i guess tcp write was cached
|
||||
// in the network card buffer or something
|
||||
return true;
|
||||
}
|
||||
|
||||
// TcpServer.cpp calls this when done sending TcpSocket's m_sendBuf
|
||||
@ -618,83 +586,16 @@ void doneSendingWrapper ( void *state , TcpSocket *sock ) {
|
||||
|
||||
StateCD *st = (StateCD *)state;
|
||||
|
||||
TcpSocket *socket = st->m_socket;
|
||||
//TcpSocket *socket = st->m_socket;
|
||||
|
||||
log("crawlbot: done sending on socket %li/%li bytes",
|
||||
sock->m_totalSent,
|
||||
sock->m_sendBufUsed);
|
||||
|
||||
// . if the final callback
|
||||
// . sometimes m_sendBuf is NULL if we freed it below and tried to
|
||||
// read more, only to read 0 bytes
|
||||
// . but it will be non-null if we read 0 bytes the first time
|
||||
// and just have a mime to send. because sendReply() above
|
||||
// returned true, and then doneSendingWrapper() got called.
|
||||
if ( //! socket->m_sendBuf &&
|
||||
st->m_numRequests <= st->m_numReplies &&
|
||||
! st->m_someoneNeedsMore ) {
|
||||
log("crawlbot: done sending for download request");
|
||||
delete st;
|
||||
mdelete ( st , sizeof(StateCD) , "stcd" );
|
||||
//log("mdel1: st=%lx",(long)st);
|
||||
return;
|
||||
}
|
||||
|
||||
// if the timer called us, just return
|
||||
if ( ! socket->m_sendBuf ) {
|
||||
log("crawlbot: timer callback");
|
||||
socket->m_sockState = ST_SEND_AGAIN;
|
||||
return;
|
||||
}
|
||||
readAndSendLoop ( st , true );
|
||||
|
||||
|
||||
// free the old sendbuf then i guess since we might replace it
|
||||
// in the above function.
|
||||
mfree ( socket->m_sendBuf ,
|
||||
socket->m_sendBufSize ,
|
||||
"dbsbuf");
|
||||
|
||||
// in case we have nothing to send back do not let socket free
|
||||
// what we just freed above. it'll core.
|
||||
socket->m_sendBuf = NULL;
|
||||
|
||||
// sometimes this wrapper is called just from the timer...
|
||||
// so if we have outstanding msg0s then we gotta wait
|
||||
if ( st->m_numRequests > st->m_numReplies ) {
|
||||
char *xx=NULL;*xx=0;
|
||||
socket->m_sockState = ST_SEND_AGAIN;
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// all done?
|
||||
if ( st->m_someoneNeedsMore ) {
|
||||
// make sure socket doesn't close up on us!
|
||||
socket->m_sockState = ST_SEND_AGAIN;
|
||||
log("crawlbot: reading more download data");
|
||||
// just enter the little loop here
|
||||
subloop:
|
||||
// otherwise, read more, maybe had no data to send from list
|
||||
if ( ! st->readDataFromRdb () ) return;
|
||||
// if this blocked sending back some data, return
|
||||
if ( ! st->sendList() ) return;
|
||||
// note that
|
||||
log("crawlbot: sendList did not block");
|
||||
// send and read more
|
||||
if ( st->m_someoneNeedsMore ) goto subloop;
|
||||
// note it
|
||||
log("crawlbot: nobody needs more 3");
|
||||
// sanity
|
||||
if ( st->m_numRequests>st->m_numReplies){char *xx=NULL;*xx=0;}
|
||||
}
|
||||
|
||||
|
||||
log("crawlbot: no more data available");
|
||||
|
||||
// it's possible that readDataFromRdb() did not block and called
|
||||
// sendList which set the socket m_sendBuf again... so check
|
||||
// for that... it needs to be sent yet before we delete this state
|
||||
//if ( st->m_socket->m_sendBuf ) return;
|
||||
return;
|
||||
}
|
||||
|
||||
void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
|
||||
@ -804,7 +705,9 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
|
||||
nowGlobalMS,
|
||||
false,
|
||||
MAX_NICENESS,
|
||||
cr);
|
||||
cr,
|
||||
false, // isoutlink?
|
||||
NULL);
|
||||
char *expression = NULL;
|
||||
long priority = -4;
|
||||
// sanity check
|
||||
@ -821,7 +724,9 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
|
||||
// when spidering rounds we use the
|
||||
// lastspidertime>={roundstart} --> spiders disabled rule
|
||||
// so that we do not spider a url twice in the same round
|
||||
if ( ufn >= 0 && ! cr->m_spidersEnabled[ufn] ) {
|
||||
if ( ufn >= 0 && //! cr->m_spidersEnabled[ufn] ) {
|
||||
// we set this to 0 instead of using the checkbox
|
||||
cr->m_maxSpidersPerRule[ufn] <= 0 ) {
|
||||
priority = -5;
|
||||
}
|
||||
|
||||
@ -837,7 +742,10 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
|
||||
m_isFirstTime = false;
|
||||
sb->safePrintf("\"Url\","
|
||||
"\"Entry Method\","
|
||||
"\"Processed?\","
|
||||
);
|
||||
if ( cr->m_isCustomCrawl )
|
||||
sb->safePrintf("\"Processed?\",");
|
||||
sb->safePrintf(
|
||||
"\"Add Time\","
|
||||
"\"Last Crawled\","
|
||||
"\"Last Status\","
|
||||
@ -869,12 +777,15 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
|
||||
// but default to csv
|
||||
else {
|
||||
sb->safePrintf("\"%s\",\"%s\","
|
||||
"%li,%lu,%lu,\"%s\",\"%s\",\""
|
||||
//",%s"
|
||||
//"\n"
|
||||
, sreq->m_url
|
||||
, as
|
||||
, (long)isProcessed
|
||||
);
|
||||
if ( cr->m_isCustomCrawl )
|
||||
sb->safePrintf("%li,",(long)isProcessed);
|
||||
sb->safePrintf(
|
||||
"%lu,%lu,\"%s\",\"%s\",\""
|
||||
//",%s"
|
||||
//"\n"
|
||||
// when was it first added to spiderdb?
|
||||
, sreq->m_addedTime
|
||||
// last time spidered, 0 if none
|
||||
@ -991,8 +902,11 @@ void StateCD::printTitledbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
|
||||
|
||||
m_printedItem = true;
|
||||
|
||||
if ( ! sb->safeStrcpyPrettyJSON ( json ) )
|
||||
log("diffbot: error printing json in dump");
|
||||
//if ( ! sb->safeStrcpyPrettyJSON ( json ) )
|
||||
// log("diffbot: error printing json in dump");
|
||||
sb->safeStrcpy ( json );
|
||||
|
||||
sb->nullTerm();
|
||||
|
||||
// separate each JSON object with \n i guess
|
||||
//sb->pushChar('\n');
|
||||
@ -1132,8 +1046,8 @@ void printCrawlStatsWrapper ( void *state ) {
|
||||
// save before nuking state
|
||||
TcpSocket *sock = sxx->m_socket;
|
||||
// nuke the state
|
||||
delete sxx;
|
||||
mdelete ( sxx , sizeof(StateXX) , "stxx" );
|
||||
delete sxx;
|
||||
// and send back now
|
||||
g_httpServer.sendDynamicPage ( sock ,
|
||||
sb.getBufStart(),
|
||||
@ -1383,8 +1297,8 @@ void addedUrlsToSpiderdbWrapper ( void *state ) {
|
||||
NULL ,
|
||||
&rr ,
|
||||
st->m_collnum );
|
||||
delete st;
|
||||
mdelete ( st , sizeof(StateCD) , "stcd" );
|
||||
delete st;
|
||||
//log("mdel2: st=%lx",(long)st);
|
||||
}
|
||||
/*
|
||||
@ -1460,8 +1374,8 @@ void injectedUrlWrapper ( void *state ) {
|
||||
response,
|
||||
NULL ,
|
||||
st->m_collnum );
|
||||
delete st;
|
||||
mdelete ( st , sizeof(StateCD) , "stcd" );
|
||||
delete st;
|
||||
}
|
||||
*/
|
||||
|
||||
@ -1587,8 +1501,8 @@ void collOpDoneWrapper ( void *state ) {
|
||||
StateCD *st = (StateCD *)state;
|
||||
TcpSocket *socket = st->m_socket;
|
||||
log("crawlbot: done with blocked op.");
|
||||
delete st;
|
||||
mdelete ( st , sizeof(StateCD) , "stcd" );
|
||||
delete st;
|
||||
//log("mdel3: st=%lx",(long)st);
|
||||
g_httpServer.sendDynamicPage (socket,"OK",2);
|
||||
}
|
||||
@ -1648,6 +1562,29 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
// . put in xml or json if format=xml or format=json or
|
||||
// xml=1 or json=1 ...
|
||||
char fmt = FMT_JSON;
|
||||
|
||||
// token is always required. get from json or html form input
|
||||
//char *token = getInputString ( "token" );
|
||||
char *token = hr->getString("token");
|
||||
char *name = hr->getString("name");
|
||||
|
||||
// . try getting token-name from ?c=
|
||||
// . the name of the collection is encoded as <token>-<crawlname>
|
||||
char *c = hr->getString("c");
|
||||
char tmp[MAX_COLL_LEN+100];
|
||||
if ( ! token && c ) {
|
||||
strncpy ( tmp , c , MAX_COLL_LEN );
|
||||
token = tmp;
|
||||
name = strstr(tmp,"-");
|
||||
if ( name ) {
|
||||
*name = '\0';
|
||||
name++;
|
||||
}
|
||||
// change default formatting to html
|
||||
fmt = FMT_HTML;
|
||||
}
|
||||
|
||||
|
||||
char *fs = hr->getString("format",NULL,NULL);
|
||||
// give john a json api
|
||||
if ( fs && strcmp(fs,"html") == 0 ) fmt = FMT_HTML;
|
||||
@ -1656,9 +1593,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
// if we got json as input, give it as output
|
||||
//if ( JS.getFirstItem() ) fmt = FMT_JSON;
|
||||
|
||||
// token is always required. get from json or html form input
|
||||
//char *token = getInputString ( "token" );
|
||||
char *token = hr->getString("token");
|
||||
|
||||
|
||||
if ( ! token && fmt == FMT_JSON ) { // (cast==0|| fmt == FMT_JSON ) ) {
|
||||
char *msg = "invalid token";
|
||||
@ -1718,8 +1653,6 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
bool restartColl = hr->hasField("restart");
|
||||
|
||||
|
||||
char *name = hr->getString("name");
|
||||
|
||||
//if ( delColl && ! && cast == 0 ) {
|
||||
// log("crawlbot: no collection found to delete.");
|
||||
// char *msg = "Could not find crawl to delete.";
|
||||
@ -1906,8 +1839,8 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
em.safePrintf("Invalid regular expresion: %s",rx2);
|
||||
}
|
||||
if ( status1 || status2 ) {
|
||||
delete st;
|
||||
mdelete ( st , sizeof(StateCD) , "stcd" );
|
||||
delete st;
|
||||
char *msg = em.getBufStart();
|
||||
return sendErrorReply2(socket,fmt,msg);
|
||||
}
|
||||
@ -1965,8 +1898,8 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
if ( resetColl ) msg = "No such collection";
|
||||
if ( restartColl ) msg = "No such collection";
|
||||
// nuke it
|
||||
delete st;
|
||||
mdelete ( st , sizeof(StateCD) , "stcd" );
|
||||
delete st;
|
||||
// log it
|
||||
log("crawlbot: cr is null. %s",msg);
|
||||
// make sure this returns in json if required
|
||||
@ -1992,8 +1925,8 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
if ( ! g_collectiondb.deleteRec ( collName , we ) )
|
||||
return false;
|
||||
// nuke it
|
||||
delete st;
|
||||
mdelete ( st , sizeof(StateCD) , "stcd" );
|
||||
delete st;
|
||||
// all done
|
||||
return g_httpServer.sendDynamicPage (socket,"OK",2);
|
||||
}
|
||||
@ -2017,14 +1950,14 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
// to avoid user confusion
|
||||
if ( cr ) cr->m_spideringEnabled = 1;
|
||||
// nuke it
|
||||
delete st;
|
||||
mdelete ( st , sizeof(StateCD) , "stcd" );
|
||||
delete st;
|
||||
// all done
|
||||
return g_httpServer.sendDynamicPage (socket,"OK",2);
|
||||
}
|
||||
// nuke it
|
||||
delete st;
|
||||
mdelete ( st , sizeof(StateCD) , "stcd" );
|
||||
delete st;
|
||||
// this will set the the collection parms from json
|
||||
//setSpiderParmsFromJSONPost ( socket , hr , cr , &JS );
|
||||
// this is a cast, so just return simple response
|
||||
@ -2050,8 +1983,8 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
if ( name && name[0] )
|
||||
msg = "Failed to add crawl. Crawl name is illegal.";
|
||||
// nuke it
|
||||
delete st;
|
||||
mdelete ( st , sizeof(StateCD) , "stcd" );
|
||||
delete st;
|
||||
//log("crawlbot: no collection found. need to add a crawl");
|
||||
return sendErrorReply2(socket,fmt, msg);
|
||||
}
|
||||
@ -2101,15 +2034,15 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
// error?
|
||||
if ( ! status ) {
|
||||
// nuke it
|
||||
delete st;
|
||||
mdelete ( st , sizeof(StateCD) , "stcd" );
|
||||
delete st;
|
||||
return sendErrorReply2(socket,fmt,mstrerror(g_errno));
|
||||
}
|
||||
// if not list
|
||||
if ( ! size ) {
|
||||
// nuke it
|
||||
delete st;
|
||||
mdelete ( st , sizeof(StateCD) , "stcd" );
|
||||
delete st;
|
||||
return sendErrorReply2(socket,fmt,"no urls found");
|
||||
}
|
||||
// add to spiderdb
|
||||
@ -2163,8 +2096,8 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
printCrawlBotPage2 ( socket,hr,fmt,NULL,NULL,cr->m_collnum);
|
||||
|
||||
// get rid of that state
|
||||
delete st;
|
||||
mdelete ( st , sizeof(StateCD) , "stcd" );
|
||||
delete st;
|
||||
//log("mdel4: st=%lx",(long)st);
|
||||
return true;
|
||||
}
|
||||
@ -2281,8 +2214,11 @@ bool printCrawlDetailsInJson ( SafeBuf &sb , CollectionRec *cx ) {
|
||||
//"\"urlsExamined\":%lli,\n"
|
||||
"\"pageCrawlAttempts\":%lli,\n"
|
||||
"\"pageCrawlSuccesses\":%lli,\n"
|
||||
"\"pageCrawlSuccessesThisRound\":%lli,\n"
|
||||
|
||||
"\"pageProcessAttempts\":%lli,\n"
|
||||
"\"pageProcessSuccesses\":%lli,\n"
|
||||
"\"pageProcessSuccessesThisRound\":%lli,\n"
|
||||
|
||||
"\"maxRounds\":%li,\n"
|
||||
"\"repeat\":%f,\n"
|
||||
@ -2303,8 +2239,11 @@ bool printCrawlDetailsInJson ( SafeBuf &sb , CollectionRec *cx ) {
|
||||
//,cx->m_globalCrawlInfo.m_urlsConsidered
|
||||
, cx->m_globalCrawlInfo.m_pageDownloadAttempts
|
||||
, cx->m_globalCrawlInfo.m_pageDownloadSuccesses
|
||||
, cx->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound
|
||||
|
||||
, cx->m_globalCrawlInfo.m_pageProcessAttempts
|
||||
, cx->m_globalCrawlInfo.m_pageProcessSuccesses
|
||||
, cx->m_globalCrawlInfo.m_pageProcessSuccessesThisRound
|
||||
|
||||
, (long)cx->m_maxCrawlRounds
|
||||
, cx->m_collectiveRespiderFrequency
|
||||
@ -2619,8 +2558,12 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
"<td><b>URLs Examined</b></td>"
|
||||
"<td><b>Page Download Attempts</b></td>"
|
||||
"<td><b>Page Download Successes</b></td>"
|
||||
"<td><b>Page Download Successes This Round"
|
||||
"</b></td>"
|
||||
"<td><b>Page Process Attempts</b></td>"
|
||||
"<td><b>Page Process Successes</b></td>"
|
||||
"<td><b>Page Process Successes This Round"
|
||||
"</b></td>"
|
||||
"</tr>"
|
||||
);
|
||||
}
|
||||
@ -2667,6 +2610,8 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
"<td>%lli</td>"
|
||||
"<td>%lli</td>"
|
||||
"<td>%lli</td>"
|
||||
"<td>%lli</td>"
|
||||
"<td>%lli</td>"
|
||||
"</tr>"
|
||||
, cx->m_coll
|
||||
, cx->m_globalCrawlInfo.m_objectsAdded -
|
||||
@ -2675,8 +2620,10 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
//, cx->m_globalCrawlInfo.m_urlsConsidered
|
||||
, cx->m_globalCrawlInfo.m_pageDownloadAttempts
|
||||
, cx->m_globalCrawlInfo.m_pageDownloadSuccesses
|
||||
, cx->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound
|
||||
, cx->m_globalCrawlInfo.m_pageProcessAttempts
|
||||
, cx->m_globalCrawlInfo.m_pageProcessSuccesses
|
||||
, cx->m_globalCrawlInfo.m_pageProcessSuccessesThisRound
|
||||
);
|
||||
}
|
||||
if ( summary && fmt == FMT_HTML ) {
|
||||
@ -2732,6 +2679,8 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
return false;
|
||||
// shortcut
|
||||
XmlDoc **docs = g_spiderLoop.m_docs;
|
||||
// row count
|
||||
long j = 0;
|
||||
// first print the spider recs we are spidering
|
||||
for ( long i = 0 ; i < (long)MAX_SPIDERS ; i++ ) {
|
||||
// get it
|
||||
@ -2739,17 +2688,18 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
// skip if empty
|
||||
if ( ! xd ) continue;
|
||||
// sanity check
|
||||
if ( ! xd->m_oldsrValid ) { char *xx=NULL;*xx=0; }
|
||||
if ( ! xd->m_sreqValid ) { char *xx=NULL;*xx=0; }
|
||||
// skip if not our coll rec!
|
||||
//if ( xd->m_cr != cr ) continue;
|
||||
if ( xd->m_collnum != cr->m_collnum ) continue;
|
||||
// grab it
|
||||
SpiderRequest *oldsr = &xd->m_oldsr;
|
||||
SpiderRequest *oldsr = &xd->m_sreq;
|
||||
// get status
|
||||
char *status = xd->m_statusMsg;
|
||||
// show that
|
||||
if ( ! oldsr->printToTableSimple ( &sb , status,xd) )
|
||||
if ( ! oldsr->printToTableSimple ( &sb , status,xd,j))
|
||||
return false;
|
||||
j++;
|
||||
}
|
||||
|
||||
// end the table
|
||||
@ -2888,6 +2838,9 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
//
|
||||
if ( fmt == FMT_HTML ) {
|
||||
|
||||
char *seedStr = cr->m_diffbotSeeds.getBufStart();
|
||||
if ( ! seedStr ) seedStr = "";
|
||||
|
||||
SafeBuf tmp;
|
||||
long crawlStatus = -1;
|
||||
getSpiderStatusMsg ( cr , &tmp , &crawlStatus );
|
||||
@ -2927,6 +2880,11 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
"<td>%s</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Seeds:</td>"
|
||||
"<td>%s</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Crawl Status:</td>"
|
||||
"<td>%li</td>"
|
||||
@ -2942,6 +2900,11 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
"<td>%li</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Has Urls Ready to Spider:</td>"
|
||||
"<td>%li</td>"
|
||||
"</tr>"
|
||||
|
||||
|
||||
// this will have to be in crawlinfo too!
|
||||
//"<tr>"
|
||||
@ -2975,6 +2938,11 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
"<td>%lli</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Page Crawl Successes This Round</b></td>"
|
||||
"<td>%lli</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Page Process Attempts</b></td>"
|
||||
"<td>%lli</td>"
|
||||
@ -2985,6 +2953,11 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
"<td>%lli</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Page Process Successes This Round</b></td>"
|
||||
"<td>%lli</td>"
|
||||
"</tr>"
|
||||
|
||||
|
||||
, cr->m_diffbotCrawlName.getBufStart()
|
||||
|
||||
@ -2992,9 +2965,12 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
|
||||
, cr->m_diffbotToken.getBufStart()
|
||||
|
||||
, seedStr
|
||||
|
||||
, crawlStatus
|
||||
, tmp.getBufStart()
|
||||
, cr->m_spiderRoundNum
|
||||
, cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider
|
||||
|
||||
, cr->m_globalCrawlInfo.m_objectsAdded -
|
||||
cr->m_globalCrawlInfo.m_objectsDeleted
|
||||
@ -3003,9 +2979,11 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
|
||||
, cr->m_globalCrawlInfo.m_pageDownloadAttempts
|
||||
, cr->m_globalCrawlInfo.m_pageDownloadSuccesses
|
||||
, cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound
|
||||
|
||||
, cr->m_globalCrawlInfo.m_pageProcessAttempts
|
||||
, cr->m_globalCrawlInfo.m_pageProcessSuccesses
|
||||
, cr->m_globalCrawlInfo.m_pageProcessSuccessesThisRound
|
||||
);
|
||||
|
||||
|
||||
@ -3841,6 +3819,9 @@ bool getSpiderRequestMetaList ( char *doc ,
|
||||
SpiderRequest sreq;
|
||||
sreq.reset();
|
||||
sreq.m_firstIp = url.getHostHash32(); // fakeip!
|
||||
// avoid ips of 0 or -1
|
||||
if ( sreq.m_firstIp == 0 || sreq.m_firstIp == -1 )
|
||||
sreq.m_firstIp = 1;
|
||||
sreq.m_hostHash32 = url.getHostHash32();
|
||||
sreq.m_domHash32 = url.getDomainHash32();
|
||||
sreq.m_siteHash32 = url.getHostHash32();
|
||||
|
@ -7527,7 +7527,7 @@ bool printTopBarNav ( SafeBuf &sb , State7 *st ) {
|
||||
"</tr>"
|
||||
// - shadow row
|
||||
//"<tr cellspacing=5 height=5px><td colspan=9 "
|
||||
//"bgcolor=%s></td></tr>"
|
||||
//"bgcolor=#%s></td></tr>"
|
||||
// END TOP TABLE
|
||||
"</table>"
|
||||
//, GRAD2
|
||||
@ -12671,7 +12671,7 @@ bool gotResults ( void *state ) {
|
||||
">"
|
||||
"<tr>"
|
||||
"<td valign=top>"
|
||||
// bgcolor=%s
|
||||
// bgcolor=#%s
|
||||
//, GRAD1
|
||||
//, bg
|
||||
);
|
||||
|
@ -712,7 +712,7 @@ bool processLoop ( void *state ) {
|
||||
//Words *ww = xd->getWords();
|
||||
if ( ! xml.set ( content , contentLen , false ,
|
||||
0 , false , TITLEREC_CURRENT_VERSION ,
|
||||
false , 0 ) ) { // niceness is 0
|
||||
false , 0 , CT_HTML ) ) { // niceness is 0
|
||||
//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
|
||||
return sendErrorReply ( st , g_errno );
|
||||
}
|
||||
|
203
PageHosts.cpp
@ -108,13 +108,13 @@ skipReplaceHost:
refreshRate);

// ignore
char *username = g_users.getUsername ( r );
char *password = NULL;
User *user = NULL;
if ( username ) user = g_users.getUser (username );
if ( user ) password = user->m_password;
if ( ! password ) password = "";
if ( ! username ) username = "";
//char *username = g_users.getUsername ( r );
//char *password = NULL;
//User *user = NULL;
//if ( username ) user = g_users.getUser (username );
//if ( user ) password = user->m_password;
//if ( ! password ) password = "";
//if ( ! username ) username = "";

// print standard header
// char *pp = sb.getBuf();
@ -131,26 +131,26 @@ skipReplaceHost:
colspan = "31";
//shotcol = "<td><b>ip2</b></td>";
sprintf ( shotcol, "<td><a href=\"/master/hosts?c=%s"
"&sort=2&username=%s&pwd=%s\">"
"&sort=2\">"
"<b>ping2</b></td></a>",
coll,username,password);
coll);
}

// print host table
sb.safePrintf (
"<table cellpadding=4 border=1 width=100%% bgcolor=#%s>"
"<tr><td colspan=%s bgcolor=#%s><center>"
"<table %s>"
"<tr><td colspan=%s><center>"
//"<font size=+1>"
"<b>Hosts "
"(<a href=\"/master/hosts?c=%s&sort=%li&reset=1\">"
"reset)</b>"
//"</font>"
"</td></tr>"
"<tr>"
"<td><a href=\"/master/hosts?c=%s&sort=0&username=%s&"
"password=%s\">"
"<tr bgcolor=#%s>"
"<td><a href=\"/master/hosts?c=%s&sort=0\">"

"<b>hostId</b></td>"
"<td><b>host name</b></td>"
"<td><b>host ip</b></td>"
"<td><b>shard</b></td>" // mirror group
"<td><b>stripe</b></td>"

@ -187,49 +187,49 @@ skipReplaceHost:
//"<td><b>resends sent</td>"
//"<td><b>errors recvd</td>"
//"<td><b>ETRYAGAINS recvd</td>"
"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=3\">"
"<td><a href=\"/master/hosts?c=%s&sort=3\">"
"<b>dgrams resent</a></td>"
"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=4\">"
"<td><a href=\"/master/hosts?c=%s&sort=4\">"
"<b>errors recvd</a></td>"
"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=5\">"
"<td><a href=\"/master/hosts?c=%s&sort=5\">"
"<b>ETRY AGAINS recvd</a></td>"

"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=6\">"
"<td><a href=\"/master/hosts?c=%s&sort=6\">"
"<b>dgrams to</a></td>"
"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=7\">"
"<td><a href=\"/master/hosts?c=%s&sort=7\">"
"<b>dgrams from</a></td>"

//"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=8\">"
//"<td><a href=\"/master/hosts?c=%s&sort=8\">"
//"<b>loadavg</a></td>"


"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=13\">"
"<td><a href=\"/master/hosts?c=%s&sort=13\">"
"<b>avg split time</a></td>"

"<td><b>splits done</a></td>"

"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=12\">"
"<td><a href=\"/master/hosts?c=%s&sort=12\">"
"<b>status</a></td>"

"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=15\">"
"<td><a href=\"/master/hosts?c=%s&sort=15\">"
"<b>slow reads</a></td>"

"<td><b>docs indexed</a></td>"

"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=9\">"
"<td><a href=\"/master/hosts?c=%s&sort=9\">"
"<b>mem used</a></td>"

"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=10\">"
"<td><a href=\"/master/hosts?c=%s&sort=10\">"
"<b>cpu</a></td>"

"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=14\">"
"<td><a href=\"/master/hosts?c=%s&sort=14\">"
"<b>max ping1</a></td>"

"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=11\">"
"<td><a href=\"/master/hosts?c=%s&sort=11\">"
"<b>ping1 age</a></td>"

//"<td><b>ip1</td>"
"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=1\">"
"<td><a href=\"/master/hosts?c=%s&sort=1\">"
"<b>ping1</a></td>"

"%s"// "<td><b>ip2</td>"
@ -237,25 +237,26 @@ skipReplaceHost:
//"<td>avg roundtrip</td>"
//"<td>std. dev.</td></tr>"
"<td><b>note</td>",
LIGHT_BLUE ,
TABLE_STYLE ,
colspan ,
DARK_BLUE ,

coll, sort,
coll, username, password,
coll, username, password,
coll, username, password,
coll, username, password,
coll, username, password,
coll, username, password,
coll, username, password,
coll, username, password,
coll, username, password,
coll, username, password,
//coll,username, password,
coll, username, password,
coll, username, password,
coll, username, password,
coll, username, password,
DARK_BLUE ,

coll,
coll,
coll,
coll,
coll,
coll,
coll,
coll,
coll,
coll,
coll,
coll,
coll,
coll,
shotcol );

// loop through each host we know and print it's stats
@ -396,13 +397,14 @@ skipReplaceHost:
"in disagreement with ours.\">H</b></font>");
// rebalancing?
if ( h->m_flags & PFLAG_REBALANCING )
fb.safePrintf("<b title=\"Current rebalancing\">R</b>");
fb.safePrintf("<b title=\"Currently "
"rebalancing\">R</b>");
// has recs that should be in another shard? indicates
// we need to rebalance or there is a bad hosts.conf
if ( h->m_flags & PFLAG_FOREIGNRECS )
fb.safePrintf("<font color=red><b title=\"Foreign data "
"detected. Needs rebalance.\">F"
"</b></font");
"</b></font>");
// if it has spiders going on say "S"
if ( h->m_flags & PFLAG_HASSPIDERS )
fb.safePrintf ( "<span title=\"Spidering\">S</span>");
@ -423,11 +425,15 @@ skipReplaceHost:
if ( fb.length() == 0 )
fb.safePrintf(" ");

char *bg = LIGHT_BLUE;
if ( h->m_ping >= g_conf.m_deadHostTimeout )
bg = "ffa6a6";

// print it
sb.safePrintf (
"<tr>"
"<tr bgcolor=#%s>"
"<td><a href=\"http://%s:%hi/master/hosts?"
"username=%s&pwd=%s&"
""
"c=%s"
"&sort=%li\">%li</a></td>"

@ -496,8 +502,8 @@ skipReplaceHost:
//"<td>%lims</td>"
"<td nowrap=1>%s</td>"
"</tr>" ,
bg,//LIGHT_BLUE ,
ipbuf3, h->m_httpPort,
username, password,
coll, sort,
i ,
h->m_hostname,
@ -552,15 +558,16 @@ skipReplaceHost:
|
||||
// end the table now
|
||||
sb.safePrintf ( "</table><br>\n" );
|
||||
|
||||
|
||||
// print spare hosts table
|
||||
sb.safePrintf (
|
||||
"<table cellpadding=4 border=1 width=100%% bgcolor=#%s>"
|
||||
"<tr><td colspan=10 bgcolor=#%s><center>"
|
||||
"<table %s>"
|
||||
"<tr class=hdrow><td colspan=10><center>"
|
||||
//"<font size=+1>"
|
||||
"<b>Spares</b>"
|
||||
//"</font>"
|
||||
"</td></tr>"
|
||||
"<tr>"
|
||||
"<tr bgcolor=#%s>"
|
||||
"<td><b>spareId</td>"
|
||||
"<td><b>host name</td>"
|
||||
"<td><b>ip1</td>"
|
||||
@ -575,7 +582,7 @@ skipReplaceHost:
|
||||
//"<td><b>ide channel</td>"
|
||||
|
||||
"<td><b>note</td>",
|
||||
LIGHT_BLUE ,
|
||||
TABLE_STYLE,
|
||||
DARK_BLUE );
|
||||
|
||||
for ( long i = 0; i < g_hostdb.m_numSpareHosts; i++ ) {
|
||||
@ -589,7 +596,7 @@ skipReplaceHost:
|
||||
|
||||
// print it
|
||||
sb.safePrintf (
|
||||
"<tr>"
|
||||
"<tr bgcolor=#%s>"
|
||||
"<td>%li</td>"
|
||||
"<td>%s</td>"
|
||||
"<td>%s</td>"
|
||||
@ -602,6 +609,7 @@ skipReplaceHost:
|
||||
//"<td>%li</td>" // ide channel
|
||||
"<td>%s</td>"
|
||||
"</tr>" ,
|
||||
LIGHT_BLUE,
|
||||
i ,
|
||||
h->m_hostname,
|
||||
ipbuf1,
|
||||
@ -618,13 +626,13 @@ skipReplaceHost:
|
||||
|
||||
// print proxy hosts table
|
||||
sb.safePrintf (
|
||||
"<table cellpadding=4 border=1 width=100%% bgcolor=#%s>"
|
||||
"<tr><td colspan=12 bgcolor=#%s><center>"
|
||||
"<table %s>"
|
||||
"<tr class=hdrow><td colspan=12><center>"
|
||||
//"<font size=+1>"
|
||||
"<b>Proxies</b>"
|
||||
//"</font>"
|
||||
"</td></tr>"
|
||||
"<tr>"
|
||||
"<tr bgcolor=#%s>"
|
||||
"<td><b>proxyId</b></td>"
|
||||
"<td><b>type</b></td>"
|
||||
"<td><b>host name</b></td>"
|
||||
@ -645,8 +653,9 @@ skipReplaceHost:
|
||||
//"<td><b>ide channel</td>"
|
||||
|
||||
"<td><b>note</td>",
|
||||
LIGHT_BLUE ,
|
||||
DARK_BLUE );
|
||||
TABLE_STYLE,
|
||||
DARK_BLUE
|
||||
);
|
||||
for ( long i = 0; i < g_hostdb.m_numProxyHosts; i++ ) {
|
||||
// get the ith host (hostId)
|
||||
Host *h = g_hostdb.getProxy ( i );
|
||||
@ -677,10 +686,10 @@ skipReplaceHost:
|
||||
|
||||
// print it
|
||||
sb.safePrintf (
|
||||
"<tr>"
|
||||
"<tr bgcolor=#%s>"
|
||||
|
||||
"<td><a href=\"http://%s:%hi/master/hosts?"
|
||||
"username=%s&pwd=%s&"
|
||||
""
|
||||
"c=%s\">"
|
||||
"%li</a></td>"
|
||||
|
||||
@ -700,10 +709,9 @@ skipReplaceHost:
|
||||
"<td>%s </td>"
|
||||
"</tr>" ,
|
||||
|
||||
LIGHT_BLUE,
|
||||
ipbuf3,
|
||||
h->m_httpPort,
|
||||
username,
|
||||
password,
|
||||
coll,
|
||||
i ,
|
||||
|
||||
@ -724,24 +732,31 @@ skipReplaceHost:
|
||||
}
|
||||
sb.safePrintf ( "</table><br><br>" );
|
||||
|
||||
sb.safePrintf(
|
||||
"<style>"
|
||||
".poo { background-color:#%s;}\n"
|
||||
"</style>\n" ,
|
||||
LIGHT_BLUE );
|
||||
|
||||
|
||||
// print help table
|
||||
sb.safePrintf (
|
||||
"<table cellpadding=4 border=1 width=100%% bgcolor=#%s>"
|
||||
"<tr><td colspan=10 bgcolor=#%s><center>"
|
||||
"<table %s>"
|
||||
"<tr class=hdrow><td colspan=10><center>"
|
||||
//"<font size=+1>"
|
||||
"<b>Key</b>"
|
||||
//"</font>"
|
||||
"</td></tr>"
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>shard</td>"
|
||||
"<td>"
|
||||
"The index is split into shards. Which shard does this "
|
||||
"host server?"
|
||||
"host serve?"
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>stripe</td>"
|
||||
"<td>"
|
||||
"Hosts with the same stripe serve the same shard "
|
||||
@ -749,41 +764,41 @@ skipReplaceHost:
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>ip1</td>"
|
||||
"<td>The primary IP address of the host."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>ip2</td>"
|
||||
"<td>The secondary IP address of the host."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
/*
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>udp port</td>"
|
||||
"<td>The UDP port the host uses to send and recieve "
|
||||
"datagrams."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>dns client port</td>"
|
||||
"<td>The UDP port used to send and receive dns traffic with."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
*/
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>http port</td>"
|
||||
"<td>The port you can connect a browser to."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
/*
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>best switch id</td>"
|
||||
"<td>The host prefers to be on this switch because it "
|
||||
"needs to send a lot of data to other hosts on this swtich. "
|
||||
@ -794,7 +809,7 @@ skipReplaceHost:
|
||||
*/
|
||||
|
||||
/*
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>switch id</td>"
|
||||
"<td>Hosts that share the same switch id are "
|
||||
"physically on the same switch."
|
||||
@ -802,7 +817,7 @@ skipReplaceHost:
|
||||
"</tr>\n"
|
||||
*/
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>dgrams resent</td>"
|
||||
"<td>How many datagrams have had to be resent to a host "
|
||||
"because it was not ACKed quick enough or because it was "
|
||||
@ -811,7 +826,7 @@ skipReplaceHost:
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>errors recvd</td>"
|
||||
"<td>How many errors were received from a host in response "
|
||||
"to a request to retrieve or insert data."
|
||||
@ -819,7 +834,7 @@ skipReplaceHost:
|
||||
"</tr>\n"
|
||||
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>ETRYAGAINS recvd</td>"
|
||||
"<td>How many ETRYAGAIN were received in response to a "
|
||||
"request to add data. Usually because the host's memory "
|
||||
@ -830,7 +845,7 @@ skipReplaceHost:
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>dgrams to</td>"
|
||||
"<td>How many datagrams were sent to the host from the "
|
||||
"selected host since startup. Includes ACK datagrams. This "
|
||||
@ -841,46 +856,46 @@ skipReplaceHost:
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>dgrams from</td>"
|
||||
"<td>How many datagrams were received from the host by the "
|
||||
"selected host since startup. Includes ACK datagrams."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
//"<tr>"
|
||||
//"<tr class=poo>"
|
||||
//"<td>loadavg</td>"
|
||||
//"<td>1-minute sliding-window load average from "
|
||||
//"/proc/loadavg."
|
||||
//"</td>"
|
||||
//"</tr>\n"
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>mem used</td>"
|
||||
"<td>percentage of memory currently used."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>cpu usage</td>"
|
||||
"<td>percentage of cpu resources in use by the gb process."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>ping1 age</td>"
|
||||
"<td>How long ago the last ping request was sent to "
|
||||
"this host. Let's us know how fresh the ping time is."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>ping1</td>"
|
||||
"<td>Ping time to this host on the primary network."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>ping2</td>"
|
||||
"<td>Ping time to this host on the seconday/shotgun "
|
||||
"network. This column is not visible if the shotgun "
|
||||
@ -888,25 +903,25 @@ skipReplaceHost:
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>M (status flag)</td>"
|
||||
"<td>Indicates host is merging files on disk."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>D (status flag)</td>"
|
||||
"<td>Indicates host is dumping data to disk."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>S (status flag)</td>"
|
||||
"<td>Indicates host has outstanding spiders."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>y (status flag)</td>"
|
||||
"<td>Indicates host is performing the daily merge."
|
||||
"</td>"
|
||||
@ -914,8 +929,8 @@ skipReplaceHost:
|
||||
|
||||
|
||||
,
|
||||
LIGHT_BLUE ,
|
||||
DARK_BLUE );
|
||||
TABLE_STYLE
|
||||
);
|
||||
|
||||
sb.safePrintf ( "</table><br></form><br>" );
|
||||
|
||||
|
304
PageInject.cpp
@ -52,6 +52,8 @@ bool sendPageInject ( TcpSocket *s , HttpRequest *r ) {
|
||||
|
||||
msg7->m_crawlbotAPI = crawlbotAPI;
|
||||
|
||||
strncpy(msg7->m_coll,coll,MAX_COLL_LEN);
|
||||
|
||||
// for diffbot
|
||||
if ( crawlbotAPI )
|
||||
msg7->m_hr.copy ( r );
|
||||
@ -63,7 +65,6 @@ bool sendPageInject ( TcpSocket *s , HttpRequest *r ) {
|
||||
// qts is html encoded? NO! fix that below then...
|
||||
//char *uf="http://www.google.com/search?num=50&"
|
||||
// "q=%s&scoring=d&filter=0";
|
||||
strncpy(msg7->m_coll,coll,MAX_COLL_LEN);
|
||||
msg7->m_isScrape = true;
|
||||
msg7->m_qbuf.safeStrcpy(qts);
|
||||
msg7->m_linkDedupTable.set(4,0,512,NULL,0,false,0,"ldtab");
|
||||
@ -193,6 +194,12 @@ bool sendReply ( void *state ) {
|
||||
// pm = msg;
|
||||
//}
|
||||
|
||||
sb.safePrintf(
|
||||
"<style>"
|
||||
".poo { background-color:#%s;}\n"
|
||||
"</style>\n" ,
|
||||
LIGHT_BLUE );
|
||||
|
||||
//char *c = msg7->m_coll;
|
||||
char bb [ MAX_COLL_LEN + 60 ];
|
||||
bb[0]='\0';
|
||||
@ -204,39 +211,50 @@ bool sendReply ( void *state ) {
|
||||
"<b>%s</b>\n\n" // the url msg
|
||||
//"<FORM method=POST action=/inject>\n\n"
|
||||
|
||||
"<FORM method=GET action=/inject>\n\n"
|
||||
|
||||
//"<input type=hidden name=pwd value=\"%s\">\n"
|
||||
//"<input type=hidden name=username value=\"%s\">\n"
|
||||
"<table width=100%% bgcolor=#%s cellpadding=4 border=1>"
|
||||
"<tr><td bgcolor=#%s colspan=2>"
|
||||
"<table %s>"
|
||||
"<tr class=hdrow><td colspan=2>"
|
||||
"<center>"
|
||||
//"<font size=+1>"
|
||||
"<b>"
|
||||
"Inject URL</b>%s"
|
||||
//"</font>"
|
||||
"<br>"
|
||||
//"Enter the information below to inject "
|
||||
//"a URL. This allows you to specify the URL as well as the "
|
||||
//"content for the URL."
|
||||
"</td></tr>\n\n"
|
||||
|
||||
"<tr><td><b>url</b></td>"
|
||||
"<td>\n"
|
||||
"<tr class=poo><td><b>url</b>"
|
||||
"<br>"
|
||||
"<font size=-2>"
|
||||
"Specify the URL that will be immediately crawled and "
|
||||
"indexed in real time "
|
||||
"while you wait. The browser will return the "
|
||||
"final index status code. Alternatively, "
|
||||
"use the <i>add urls</i> page "
|
||||
"to add URLs in bulk or to just add to the spider queue "
|
||||
"without having to wait for the page or pages to be "
|
||||
"actually indexed in realtime."
|
||||
"</font>"
|
||||
"</td>"
|
||||
|
||||
"<td width=50%%>\n"
|
||||
"<input type=text name=u value=\"\" size=50>"
|
||||
"</td></tr>\n\n"
|
||||
|
||||
"<tr><td><b>query to scrape</b></td>"
|
||||
"<tr class=poo><td><b>query to scrape</b></td>"
|
||||
"<td>\n"
|
||||
"<input type=text name=qts value=\"\" size=50>"
|
||||
"</td></tr>\n\n"
|
||||
|
||||
//"<tr><td><b>use ahrefs.com</b></td>"
|
||||
//"<tr class=poo><td><b>use ahrefs.com</b></td>"
|
||||
//"<td>\n"
|
||||
//"<input type=radio name=useahrefs value=0 checked>no "
|
||||
//"<input type=radio name=useahrefs value=1>yes "
|
||||
//"</td></tr>\n\n"
|
||||
|
||||
|
||||
"<tr><td><b>spider links</b></td>"
|
||||
"<tr class=poo><td><b>spider links</b></td>"
|
||||
"<td>\n"
|
||||
"<input type=radio name=spiderlinks value=0>no "
|
||||
"<input type=radio name=spiderlinks value=1 checked>yes "
|
||||
@ -249,18 +267,18 @@ bool sendReply ( void *state ) {
|
||||
|
||||
|
||||
|
||||
"<tr><td><b>inject scraped links</b></td>"
|
||||
"<tr class=poo><td><b>inject scraped links</b></td>"
|
||||
"<td>\n"
|
||||
"<input type=radio name=injectlinks value=0 checked>no "
|
||||
"<input type=radio name=injectlinks value=1>yes "
|
||||
"</td></tr>\n\n"
|
||||
|
||||
"<tr><td><b>collection</b></td>"
|
||||
"<tr class=poo><td><b>collection</b></td>"
|
||||
"<td>\n"
|
||||
"<input type=text name=c value=\"%s\" size=15>"
|
||||
"</td></tr>\n\n"
|
||||
|
||||
"<tr><td><b>quick reply?</b><br>"
|
||||
"<tr class=poo><td><b>quick reply?</b><br>"
|
||||
"<font size=1>Should reply be short? "
|
||||
"Default: no"
|
||||
"</td>"
|
||||
@ -269,7 +287,7 @@ bool sendReply ( void *state ) {
|
||||
"<input type=radio name=quick value=1>yes "
|
||||
"</td></tr>\n\n"
|
||||
|
||||
"<tr><td><b>only inject new docs?</b><br>"
|
||||
"<tr class=poo><td><b>only inject new docs?</b><br>"
|
||||
"<font size=1>Skips injection if docs already indexed. "
|
||||
"Default: no"
|
||||
"</td>"
|
||||
@ -279,17 +297,17 @@ bool sendReply ( void *state ) {
|
||||
"</td></tr>\n\n"
|
||||
|
||||
|
||||
"<tr><td><b>delete?</b><br>"
|
||||
"<tr class=poo><td><b>delete url?</b><br>"
|
||||
"<font size=1>Should this url be deleted from the index? "
|
||||
"Default: no"
|
||||
"</td>"
|
||||
"<td>\n"
|
||||
"<input type=radio name=delete value=0 checked>no "
|
||||
"<input type=radio name=delete value=1>yes "
|
||||
"<input type=radio name=deleteurl value=0 checked>no "
|
||||
"<input type=radio name=deleteurl value=1>yes "
|
||||
"</td></tr>\n\n"
|
||||
|
||||
|
||||
"<tr><td><b>recycle content?</b><br>"
|
||||
"<tr class=poo><td><b>recycle content?</b><br>"
|
||||
"<font size=1>Should page content be recycled if "
|
||||
"reindexing? "
|
||||
"Default: no"
|
||||
@ -299,16 +317,18 @@ bool sendReply ( void *state ) {
|
||||
"<input type=radio name=recycle value=1>yes "
|
||||
"</td></tr>\n\n"
|
||||
|
||||
"<tr><td><b>ip</b><br>"
|
||||
/*
|
||||
"<tr class=poo><td><b>ip</b><br>"
|
||||
"<font size=1>IP address of the url. If blank then "
|
||||
"Gigablast will look up. "
|
||||
"Default: blank"
|
||||
"</td>"
|
||||
"<td>\n<input type=text name=ip value=\"\" size=15>"
|
||||
"</td></tr>\n\n"
|
||||
*/
|
||||
|
||||
/*
|
||||
"<tr><td><b>do ip lookups?</b><br>"
|
||||
"<tr class=poo><td><b>do ip lookups?</b><br>"
|
||||
"<font size=1>Should Gigablast look up the IP address "
|
||||
"of the url, if it is not provided. "
|
||||
"Default: yes"
|
||||
@ -319,7 +339,7 @@ bool sendReply ( void *state ) {
|
||||
"</td></tr>\n\n"
|
||||
*/
|
||||
|
||||
//"<tr><td><b>is url new?</b><br>"
|
||||
//"<tr class=poo><td><b>is url new?</b><br>"
|
||||
//"<font size=1>Is this url new to the index? If unsure "
|
||||
//"then you should say no here. "
|
||||
//"Default: yes"
|
||||
@ -329,7 +349,7 @@ bool sendReply ( void *state ) {
|
||||
//"<input type=radio name=isnew value=1 checked>yes "
|
||||
//"</td></tr>\n\n"
|
||||
|
||||
"<tr><td><b>dedup?</b><br>"
|
||||
"<tr class=poo><td><b>dedup?</b><br>"
|
||||
"<font size=1>Should this url be skipped if there is "
|
||||
"already a url in the index from this same domain with "
|
||||
"this same content? "
|
||||
@ -339,14 +359,14 @@ bool sendReply ( void *state ) {
|
||||
"<input type=radio name=dedup value=0>no "
|
||||
"<input type=radio name=dedup value=1 checked>yes "
|
||||
"</td></tr>\n\n" ,
|
||||
//"<tr><td><b>ruleset</b><br>"
|
||||
//"<tr class=poo><td><b>ruleset</b><br>"
|
||||
//"<font size=1>Use this ruleset to index the URL. "
|
||||
//"Default: auto"
|
||||
//"</td>"
|
||||
//"<td>\n<select name=rs>" ,
|
||||
pm , // msg7->m_pwd ,
|
||||
//msg7->m_username,
|
||||
LIGHT_BLUE , DARK_BLUE , bb , msg7->m_coll );
|
||||
TABLE_STYLE , bb , msg7->m_coll );
|
||||
|
||||
|
||||
//p += gbstrlen(p);
|
||||
@ -382,7 +402,7 @@ bool sendReply ( void *state ) {
|
||||
|
||||
// make a table, each row will be an injectable parameter
|
||||
sb.safePrintf (
|
||||
"<tr><td><b>content has mime</b><br>"
|
||||
"<tr class=poo><td><b>content has mime</b><br>"
|
||||
"<font size=1>IP address of the url. If blank then "
|
||||
"Gigablast will look up. "
|
||||
"Default: blank"
|
||||
@ -392,10 +412,13 @@ bool sendReply ( void *state ) {
|
||||
"<input type=radio name=hasmime value=1>yes "
|
||||
"</td></tr>\n\n"
|
||||
|
||||
"<tr><td colspan=2>"
|
||||
"<tr class=poo><td colspan=2>"
|
||||
"<center>"
|
||||
"<b>content</b><br>"
|
||||
"<font size=1>Enter the content here. Enter MIME header "
|
||||
"<font size=1>If you want to supply the URL's content "
|
||||
"rather than have Gigablast download it, then "
|
||||
"enter the content here. "
|
||||
"Enter MIME header "
|
||||
"first if \"content has mime\" is set to true above. "
|
||||
"Separate MIME from actual content with two returns."
|
||||
"<br>"
|
||||
@ -404,11 +427,15 @@ bool sendReply ( void *state ) {
|
||||
"\n"
|
||||
"<textarea rows=32 cols=80 name=content>"
|
||||
"</textarea>"
|
||||
"<br>"
|
||||
"<br>\n\n"
|
||||
"<input type=submit value=Submit>"
|
||||
"</center>"
|
||||
"</td></tr></table>\n"
|
||||
|
||||
"<br>"
|
||||
"<br>\n\n"
|
||||
"<center>"
|
||||
"<input type=submit value=Submit>"
|
||||
"</center>"
|
||||
|
||||
"</form>\n"
|
||||
);
|
||||
|
||||
@ -463,34 +490,48 @@ bool Msg7::inject ( TcpSocket *s ,
|
||||
long contentLen;
|
||||
|
||||
// get the junk
|
||||
char *coll = r->getString ( "c" , NULL , NULL /*default*/);
|
||||
//char *coll = r->getString ( "c" , NULL , NULL /*default*/);
|
||||
//if ( ! coll ) coll = "main";
|
||||
// sometimes crawlbot will add or reset a coll and do an inject
|
||||
// in PageCrawlBot.cpp
|
||||
//if ( ! coll ) coll = r->getString("addcoll");
|
||||
//if ( ! coll ) coll = r->getString("resetcoll");
|
||||
if ( ! coll ) coll = collOveride;
|
||||
//if ( ! coll ) coll = collOveride;
|
||||
|
||||
// default to main
|
||||
if ( ! coll || ! coll[0] ) coll = "main";
|
||||
//if ( ! coll || ! coll[0] ) coll = "main";
|
||||
|
||||
if ( collOveride && ! collOveride[0] ) collOveride = NULL;
|
||||
|
||||
CollectionRec *cr = NULL;
|
||||
if ( collOveride ) cr = g_collectiondb.getRec ( collOveride );
|
||||
else cr = g_collectiondb.getRec ( r );
|
||||
|
||||
if ( ! cr ) {
|
||||
g_errno = ENOCOLLREC;
|
||||
return true;
|
||||
}
|
||||
|
||||
char *coll = cr->m_coll;
|
||||
|
||||
bool quickReply = r->getLong ( "quick" , 0 );
|
||||
//char *pwd = r->getString ( "pwd" , NULL );
|
||||
char *url = r->getString ( "u" , NULL , NULL /*default*/);
|
||||
// for diffbot.cpp api
|
||||
if ( ! url ) url = r->getString("injecturl",NULL,NULL);
|
||||
if ( ! url ) url = r->getString("url",NULL,NULL);
|
||||
// PageCrawlBot.cpp uses "seed"
|
||||
if ( ! url ) url = r->getString("seed",NULL,NULL);
|
||||
|
||||
bool recycleContent = r->getLong ( "recycle",0);
|
||||
char *ips = r->getString ( "ip" , NULL , NULL );
|
||||
//char *ips = r->getString ( "ip" , NULL , NULL );
|
||||
//char *username = g_users.getUsername(r);
|
||||
long firstIndexed = r->getLongLong("firstindexed",0LL);
|
||||
long lastSpidered = r->getLongLong("lastspidered",0LL);
|
||||
//long firstIndexed = r->getLongLong("firstindexed",0LL);
|
||||
//long lastSpidered = r->getLongLong("lastspidered",0LL);
|
||||
long hopCount = r->getLong("hopcount",-1);
|
||||
long newOnly = r->getLong("newonly",0);
|
||||
long charset = r->getLong("charset",-1);
|
||||
long deleteIt = r->getLong("delete",0);
|
||||
long deleteUrl = r->getLong("deleteurl",0);
|
||||
char hasMime = r->getLong("hasmime",0);
|
||||
// do consistency testing?
|
||||
bool doConsistencyTesting = r->getLong("dct",0);
|
||||
@ -502,7 +543,7 @@ bool Msg7::inject ( TcpSocket *s ,
|
||||
|
||||
long forcedIp = 0;
|
||||
|
||||
if ( ips ) forcedIp = atoip ( ips , gbstrlen(ips) );
|
||||
//if ( ips ) forcedIp = atoip ( ips , gbstrlen(ips) );
|
||||
|
||||
char *content = r->getString ( "content" , &contentLen , NULL );
|
||||
// mark doesn't like to url-encode his content
|
||||
@ -543,17 +584,20 @@ bool Msg7::inject ( TcpSocket *s ,
|
||||
niceness,
|
||||
state,
|
||||
callback,
|
||||
firstIndexed,
|
||||
lastSpidered,
|
||||
//firstIndexed,
|
||||
//lastSpidered,
|
||||
hopCount,
|
||||
newOnly,
|
||||
charset,
|
||||
spiderLinks,
|
||||
deleteIt,
|
||||
deleteUrl,
|
||||
hasMime,
|
||||
doConsistencyTesting);
|
||||
}
|
||||
|
||||
// . returns false if blocked, true otherwise
|
||||
// . if returns false will call your callback(state) when is done
|
||||
// . returns true and sets g_errno on error
|
||||
bool Msg7::inject ( char *url ,
|
||||
long forcedIp ,
|
||||
char *content ,
|
||||
@ -567,13 +611,13 @@ bool Msg7::inject ( char *url ,
|
||||
long niceness,
|
||||
void *state ,
|
||||
void (*callback)(void *state),
|
||||
long firstIndexed,
|
||||
long lastSpidered,
|
||||
//long firstIndexed,
|
||||
//long lastSpidered,
|
||||
long hopCount,
|
||||
char newOnly,
|
||||
short charset,
|
||||
char spiderLinks,
|
||||
char deleteIt,
|
||||
char deleteUrl,
|
||||
char hasMime,
|
||||
bool doConsistencyTesting
|
||||
) {
|
||||
@ -581,11 +625,14 @@ bool Msg7::inject ( char *url ,
|
||||
m_quickReply = quickReply;
|
||||
|
||||
// store coll
|
||||
if ( ! coll ) { g_errno = ENOCOLLREC; return true; }
|
||||
long collLen = gbstrlen ( coll );
|
||||
if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN;
|
||||
strncpy ( m_coll , coll , collLen );
|
||||
m_coll [ collLen ] = '\0';
|
||||
//if ( ! coll ) { g_errno = ENOCOLLREC; return true; }
|
||||
// long collLen = gbstrlen ( coll );
|
||||
//if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN;
|
||||
//strncpy ( m_coll , coll , collLen );
|
||||
//m_coll [ collLen ] = '\0';
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( coll );
|
||||
if ( ! cr ) { g_errno = ENOCOLLREC; return true; }
|
||||
|
||||
// store user
|
||||
//long ulen = 0;
|
||||
@ -612,149 +659,36 @@ bool Msg7::inject ( char *url ,
|
||||
if ( g_repairMode ) { g_errno = EREPAIRING; return true; }
|
||||
|
||||
// send template reply if no content supplied
|
||||
if ( ! content && ! recycleContent ) {
|
||||
log("inject: no content supplied to inject command and "
|
||||
"recycleContent is false.");
|
||||
//return true;
|
||||
}
|
||||
|
||||
// clean url?
|
||||
// normalize and add www. if it needs it
|
||||
Url uu;
|
||||
uu.set ( url , gbstrlen(url) , true );
|
||||
// remove >'s i guess and store in st1->m_url[] buffer
|
||||
char cleanUrl[MAX_URL_LEN+1];
|
||||
urlLen = cleanInput ( cleanUrl,
|
||||
MAX_URL_LEN,
|
||||
uu.getUrl(),
|
||||
uu.getUrlLen() );
|
||||
|
||||
|
||||
// this can go on the stack since set4() copies it
|
||||
SpiderRequest sreq;
|
||||
sreq.reset();
|
||||
strcpy(sreq.m_url, cleanUrl );
|
||||
// parentdocid of 0
|
||||
long firstIp = hash32n(cleanUrl);
|
||||
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
|
||||
sreq.setKey( firstIp,0LL, false );
|
||||
sreq.m_isInjecting = 1;
|
||||
sreq.m_isPageInject = 1;
|
||||
sreq.m_hopCount = hopCount;
|
||||
sreq.m_hopCountValid = 1;
|
||||
sreq.m_fakeFirstIp = 1;
|
||||
sreq.m_firstIp = firstIp;
|
||||
//if ( ! content && ! recycleContent ) {
|
||||
// log("inject: no content supplied to inject command and "
|
||||
// "recycleContent is false.");
|
||||
// //return true;
|
||||
//}
|
||||
|
||||
// shortcut
|
||||
XmlDoc *xd = &m_xd;
|
||||
|
||||
// log it now
|
||||
//log("inject: injecting doc %s",cleanUrl);
|
||||
if ( ! xd->injectDoc ( url ,
|
||||
cr ,
|
||||
content ,
|
||||
hasMime , // content starts with http mime?
|
||||
hopCount,
|
||||
charset,
|
||||
|
||||
static char s_dummy[3];
|
||||
// sometims the content is indeed NULL...
|
||||
if ( newOnly && ! content ) {
|
||||
// don't let it be NULL because then xmldoc will
|
||||
// try to download the page!
|
||||
s_dummy[0] = '\0';
|
||||
content = s_dummy;
|
||||
//char *xx=NULL;*xx=0; }
|
||||
}
|
||||
deleteUrl,
|
||||
contentType, // CT_HTML, CT_XML
|
||||
spiderLinks ,
|
||||
newOnly, // index iff new
|
||||
|
||||
state,
|
||||
callback ) )
|
||||
// we blocked...
|
||||
return false;
|
||||
|
||||
// . use the enormous power of our new XmlDoc class
|
||||
// . this returns false with g_errno set on error
|
||||
if ( //m_needsSet &&
|
||||
! xd->set4 ( &sreq ,
|
||||
NULL ,
|
||||
m_coll ,
|
||||
NULL , // pbuf
|
||||
// give it a niceness of 1, we have to be
|
||||
// careful since we are a niceness of 0!!!!
|
||||
niceness, // 1 ,
|
||||
// inject this content
|
||||
content ,
|
||||
deleteIt, // false, // deleteFromIndex ,
|
||||
forcedIp ,
|
||||
contentType ,
|
||||
lastSpidered ,
|
||||
hasMime )) {
|
||||
// g_errno should be set if that returned false
|
||||
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
||||
return true;
|
||||
}
|
||||
// do not re-call the set
|
||||
//m_needsSet = false;
|
||||
// make this our callback in case something blocks
|
||||
xd->setCallback ( state , callback );
|
||||
|
||||
xd->m_doConsistencyTesting = doConsistencyTesting;
|
||||
|
||||
// . set xd from the old title rec if recycle is true
|
||||
// . can also use XmlDoc::m_loadFromOldTitleRec flag
|
||||
if ( recycleContent ) xd->m_recycleContent = true;
|
||||
|
||||
// othercrap
|
||||
if ( firstIndexed ) {
|
||||
xd->m_firstIndexedDate = firstIndexed;
|
||||
xd->m_firstIndexedDateValid = true;
|
||||
}
|
||||
|
||||
if ( lastSpidered ) {
|
||||
xd->m_spideredTime = lastSpidered;
|
||||
xd->m_spideredTimeValid = true;
|
||||
}
|
||||
|
||||
if ( hopCount != -1 ) {
|
||||
xd->m_hopCount = hopCount;
|
||||
xd->m_hopCountValid = true;
|
||||
}
|
||||
|
||||
if ( charset != -1 && charset != csUnknown ) {
|
||||
xd->m_charset = charset;
|
||||
xd->m_charsetValid = true;
|
||||
}
|
||||
|
||||
// avoid looking up ip of each outlink to add "firstip" tag to tagdb
|
||||
// because that can be slow!!!!!!!
|
||||
xd->m_spiderLinks = spiderLinks;
|
||||
xd->m_spiderLinks2 = spiderLinks;
|
||||
xd->m_spiderLinksValid = true;
|
||||
|
||||
// . newOnly is true --> do not inject if document is already indexed!
|
||||
// . maybe just set indexCode
|
||||
xd->m_newOnly = newOnly;
|
||||
|
||||
// do not re-lookup the robots.txt
|
||||
xd->m_isAllowed = true;
|
||||
xd->m_isAllowedValid = true;
|
||||
xd->m_crawlDelay = -1; // unknown
|
||||
xd->m_crawlDelayValid = true;
|
||||
|
||||
// set this now
|
||||
g_inPageInject = true;
|
||||
|
||||
// log it now
|
||||
//log("inject: indexing injected doc %s",cleanUrl);
|
||||
|
||||
// . now tell it to index
|
||||
// . this returns false if blocked
|
||||
bool status = xd->indexDoc ( );
|
||||
|
||||
// log it. i guess only for errors when it does not block?
|
||||
// because xmldoc.cpp::indexDoc calls logIt()
|
||||
if ( status ) xd->logIt();
|
||||
|
||||
// undo it
|
||||
g_inPageInject = false;
|
||||
|
||||
// note that it blocked
|
||||
//if ( ! status ) log("inject: blocked for %s",cleanUrl);
|
||||
|
||||
// return false if it blocked
|
||||
return status;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
///////////////
|
||||
//
|
||||
// SCRAPE GOOGLE
|
||||
|
@ -53,8 +53,8 @@ public:
|
||||
long niceness,
|
||||
void *state ,
|
||||
void (*callback)(void *state),
|
||||
long firstIndexedDate = 0,
|
||||
long spiderDate = 0,
|
||||
//long firstIndexedDate = 0,
|
||||
//long spiderDate = 0,
|
||||
long hopCount = -1 ,
|
||||
char newOnly = 0 ,
|
||||
short charset = -1 ,
|
||||
|
@ -28,8 +28,8 @@ struct StateLogView {
|
||||
|
||||
static char *s_magicStr = "4j3.8x*";
|
||||
#define BABY_BLUE "e0e0d0"
|
||||
#define LIGHT_BLUE "d0d0e0"
|
||||
#define DARK_BLUE "c0c0f0"
|
||||
//#define LIGHT_BLUE "d0d0e0"
|
||||
//#define DARK_BLUE "c0c0f0"
|
||||
|
||||
bool sendPageLogView ( TcpSocket *s , HttpRequest *r ) {
|
||||
|
||||
@ -79,15 +79,21 @@ bool sendPageLogView ( TcpSocket *s , HttpRequest *r ) {
|
||||
"</SCRIPT> ");
|
||||
p->safePrintf("<form name=\"fo\">");
|
||||
|
||||
p->safePrintf("\n<table width=100%% bgcolor=#%s "
|
||||
"cellpadding=4 border=1>\n", BABY_BLUE);
|
||||
p->safePrintf("\n<table %s>\n",TABLE_STYLE);
|
||||
p->safePrintf("<tr class=hdrow><td colspan=2>"
|
||||
"<center><b>Log View</b></center>"
|
||||
"</td></tr>");
|
||||
|
||||
p->safePrintf("<tr><td>Refresh Rate:</td><td><input type=\"text\""
|
||||
p->safePrintf("<tr bgcolor=%s>"
|
||||
"<td>Refresh Rate:</td><td><input type=\"text\""
|
||||
" name=\"rr\" value=\"%li\" size=\"4\"></td></tr>",
|
||||
LIGHT_BLUE,
|
||||
refreshRate);
|
||||
|
||||
p->safePrintf("<tr><td>Sample Size:</td><td><input type=\"text\""
|
||||
" name=\"ss\" value=\"%li\" size=\"4\"></td></tr>",
|
||||
p->safePrintf("<tr bgcolor=%s>"
|
||||
"<td>Sample Size:</td><td><input type=\"text\""
|
||||
" name=\"ss\" value=\"%li\" size=\"4\">",
|
||||
LIGHT_BLUE,
|
||||
sampleSize);
|
||||
|
||||
p->safePrintf("<input type=\"hidden\" "
|
||||
@ -96,6 +102,7 @@ bool sendPageLogView ( TcpSocket *s , HttpRequest *r ) {
|
||||
p->safePrintf("<input type=\"hidden\" "
|
||||
"name=\"dontlog\" value=\"1\">");
|
||||
|
||||
p->safePrintf("</td></tr>");
|
||||
|
||||
// . count the number of hosts we are getting logs for:
|
||||
long numOn = 0;
|
||||
@ -134,7 +141,8 @@ bool sendPageLogView ( TcpSocket *s , HttpRequest *r ) {
|
||||
st->m_filterStr[6] = "INFO";
|
||||
st->m_filterStr[7] = "INIT";
|
||||
|
||||
p->safePrintf("<tr><td>Filter Types:</td><td>");
|
||||
p->safePrintf("<tr bgcolor=#%s><td>Filter Types:</td><td>",
|
||||
LIGHT_BLUE);
|
||||
char *checked;
|
||||
st->m_numFilts = 0;
|
||||
for(long i = 7; i >= 0; i--) {
|
||||
@ -183,7 +191,8 @@ bool sendPageLogView ( TcpSocket *s , HttpRequest *r ) {
|
||||
|
||||
|
||||
|
||||
p->safePrintf("<tr><td>Hosts:</td><td>");
|
||||
p->safePrintf("<tr bgcolor=#%s><td>Hosts:</td><td>",
|
||||
LIGHT_BLUE);
|
||||
for ( long i = 0 ; i < nh ; i++ ) {
|
||||
// skip dead hosts, i don't want to wait for them to timeout.
|
||||
if ( g_hostdb.isDead ( i ) ) continue;
|
||||
@ -214,9 +223,9 @@ bool sendPageLogView ( TcpSocket *s , HttpRequest *r ) {
|
||||
|
||||
p->safePrintf("</td></tr>\n");
|
||||
|
||||
p->safePrintf("<tr><td>\n");
|
||||
p->safePrintf("<tr bgcolor=#%s><td>\n",LIGHT_BLUE);
|
||||
p->safePrintf("<input type=\"submit\" value=\"Update\"> ");
|
||||
p->safePrintf("</td></tr></table>\n");
|
||||
p->safePrintf("</td><td></td></tr></table>\n");
|
||||
p->safePrintf("</form>");
|
||||
|
||||
if(!blocked)
|
||||
@ -227,6 +236,14 @@ bool sendPageLogView ( TcpSocket *s , HttpRequest *r ) {
|
||||
}
|
||||
|
||||
|
||||
bool showLine ( SafeBuf *sb , char *s , long len ) {
|
||||
|
||||
return sb->brify ( s , len ,
|
||||
0 , // niceness
|
||||
80 , // cols
|
||||
"<br>",
|
||||
false ); // isHtml?
|
||||
}
|
||||
|
||||
|
||||
void gotRemoteLogWrapper(void *state, UdpSlot *slot) {
|
||||
@ -329,25 +346,25 @@ void gotRemoteLogWrapper(void *state, UdpSlot *slot) {
|
||||
if(matchNum >= 0 || st->m_numFilts == 0) {
|
||||
if(matchNum == 0) {
|
||||
p->safePrintf("<font color=red>");
|
||||
p->safeMemcpy(st->m_readBufPtrs[ndx], lineLen);
|
||||
showLine(p,st->m_readBufPtrs[ndx], lineLen);
|
||||
p->safePrintf("\n");
|
||||
p->safePrintf("</font>");
|
||||
}
|
||||
else if(matchNum == 1) {
|
||||
p->safePrintf("<font color=green>");
|
||||
p->safeMemcpy(st->m_readBufPtrs[ndx], lineLen);
|
||||
showLine(p,st->m_readBufPtrs[ndx], lineLen);
|
||||
p->safePrintf("\n");
|
||||
p->safePrintf("</font>");
|
||||
|
||||
}
|
||||
else if(matchNum == 2) {
|
||||
p->safePrintf("<font color=blue>");
|
||||
p->safeMemcpy(st->m_readBufPtrs[ndx], lineLen);
|
||||
showLine(p,st->m_readBufPtrs[ndx], lineLen);
|
||||
p->safePrintf("\n");
|
||||
p->safePrintf("</font>");
|
||||
}
|
||||
else {
|
||||
p->safeMemcpy(st->m_readBufPtrs[ndx], lineLen);
|
||||
showLine(p,st->m_readBufPtrs[ndx], lineLen);
|
||||
p->safePrintf("\n");
|
||||
}
|
||||
}
|
||||
|
135
PageParser.cpp
@ -211,11 +211,18 @@ bool sendPageParser2 ( TcpSocket *s ,
|
||||
if ( st->m_render ) render = " checked";
|
||||
if ( st->m_oips ) oips = " checked";
|
||||
|
||||
xbuf->safePrintf(
|
||||
"<style>"
|
||||
".poo { background-color:#%s;}\n"
|
||||
"</style>\n" ,
|
||||
LIGHT_BLUE );
|
||||
|
||||
|
||||
long clen;
|
||||
char *contentParm = r->getString("content",&clen,"");
|
||||
|
||||
// print the input form
|
||||
xbuf->safePrintf ("<br>"
|
||||
xbuf->safePrintf (
|
||||
"<style>\n"
|
||||
"h2{font-size: 12px; color: #666666;}\n"
|
||||
|
||||
@ -233,21 +240,30 @@ bool sendPageParser2 ( TcpSocket *s ,
|
||||
".hs {color: #009900;}"
|
||||
"</style>\n"
|
||||
"<center>"
|
||||
"<table cellpadding=3>"
|
||||
"<tr>"
|
||||
|
||||
"<table %s>"
|
||||
|
||||
"<tr><td colspan=5><center><b>"
|
||||
"Parser"
|
||||
"</b></center></td></tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>"
|
||||
"Url:"
|
||||
"<b>url</b>"
|
||||
"<br><font size=-2>"
|
||||
"Type in <b>FULL</b> url to parse."
|
||||
"</font>"
|
||||
"</td>"
|
||||
|
||||
"</td>"
|
||||
"<td>"
|
||||
"<input type=text name=u value=\"%s\" size=\"40\">\n"
|
||||
"</td>"
|
||||
"<td>"
|
||||
"Type in <b>FULL</b> url\n"
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
|
||||
"<tr>"
|
||||
/*
|
||||
"<tr class=poo>"
|
||||
"<td>"
|
||||
"Parser version to use: "
|
||||
"</td>"
|
||||
@ -258,9 +274,10 @@ bool sendPageParser2 ( TcpSocket *s ,
|
||||
"(-1 means to use latest title rec version)<br>"
|
||||
"</td>"
|
||||
"</tr>"
|
||||
*/
|
||||
|
||||
/*
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>"
|
||||
"Hop count to use: "
|
||||
"</td>"
|
||||
@ -273,20 +290,22 @@ bool sendPageParser2 ( TcpSocket *s ,
|
||||
"</tr>"
|
||||
*/
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>"
|
||||
"Use cached:"
|
||||
"<b>use cached</b>"
|
||||
|
||||
"<br><font size=-2>"
|
||||
"Load page from cache (titledb)?"
|
||||
"</font>"
|
||||
|
||||
"</td>"
|
||||
"<td>"
|
||||
"<input type=checkbox name=old value=1%s> "
|
||||
"</td>"
|
||||
"<td>"
|
||||
"Load page from cache (titledb)?"
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
/*
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>"
|
||||
"Reparse root:"
|
||||
"</td>"
|
||||
@ -299,20 +318,23 @@ bool sendPageParser2 ( TcpSocket *s ,
|
||||
"</tr>"
|
||||
*/
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>"
|
||||
"Recycle Link Info:"
|
||||
"<b>recycle link info</b>"
|
||||
|
||||
"<br><font size=-2>"
|
||||
"Recycle the link info from the title rec"
|
||||
"Load page from cache (titledb)?"
|
||||
"</font>"
|
||||
|
||||
"</td>"
|
||||
"<td>"
|
||||
"<input type=checkbox name=recycle value=1%s> "
|
||||
"</td>"
|
||||
"<td>"
|
||||
"Recycle the link info from the title rec"
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
/*
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>"
|
||||
"Recycle Link Info Imported:"
|
||||
"</td>"
|
||||
@ -325,20 +347,22 @@ bool sendPageParser2 ( TcpSocket *s ,
|
||||
"</tr>"
|
||||
*/
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>"
|
||||
"Render HTML:"
|
||||
"<b>render html</b>"
|
||||
|
||||
"<br><font size=-2>"
|
||||
"Render document content as HTML"
|
||||
"</font>"
|
||||
|
||||
"</td>"
|
||||
"<td>"
|
||||
"<input type=checkbox name=render value=1%s> "
|
||||
"</td>"
|
||||
"<td>"
|
||||
"Render document content as HTML"
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
/*
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>"
|
||||
"Lookup outlinks' ruleset, ips, quality:"
|
||||
"</td>"
|
||||
@ -351,7 +375,7 @@ bool sendPageParser2 ( TcpSocket *s ,
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>"
|
||||
"LinkInfo Coll:"
|
||||
"</td>"
|
||||
@ -364,49 +388,59 @@ bool sendPageParser2 ( TcpSocket *s ,
|
||||
"</tr>"
|
||||
*/
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td>"
|
||||
"Optional query:"
|
||||
"<b>optional query</b>"
|
||||
|
||||
"<br><font size=-2>"
|
||||
"Leave empty usually. For title generation only."
|
||||
"</font>"
|
||||
|
||||
"</td>"
|
||||
"<td>"
|
||||
"<input type=text name=\"q\" size=\"20\" value=\"\"> "
|
||||
"</td>"
|
||||
"<td>"
|
||||
"Leave empty usually. For title generation only."
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td>"
|
||||
"Content Below is XML:"
|
||||
"</td>"
|
||||
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>"
|
||||
"<b>content below is xml</b>"
|
||||
"<br><font size=-2>"
|
||||
"Is the content below XML?"
|
||||
"</font>"
|
||||
"</td>"
|
||||
|
||||
"<td>"
|
||||
"<input type=checkbox name=xml value=1> "
|
||||
"</td>"
|
||||
"<td>"
|
||||
//""
|
||||
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
|
||||
|
||||
"<tr>"
|
||||
"<td colspan=3>"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td><b>content</b>"
|
||||
"<br><font size=-2>"
|
||||
"Use this content for the provided <i>url</i> "
|
||||
"rather than downloading it from the web."
|
||||
"</td>"
|
||||
|
||||
"<td>"
|
||||
"<textarea rows=10 cols=80 name=content>"
|
||||
"%s"
|
||||
"</textarea>"
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td colspan=\"3\">"
|
||||
"<input type=submit value=OK>"
|
||||
"</td>"
|
||||
"</tr>"
|
||||
"</table>"
|
||||
"</center>"
|
||||
"</form>"
|
||||
"<br>",
|
||||
|
||||
TABLE_STYLE,
|
||||
us ,
|
||||
//(long)st->m_hopCount,
|
||||
//rtu,
|
||||
@ -420,8 +454,11 @@ bool sendPageParser2 ( TcpSocket *s ,
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
xbuf->safePrintf(
|
||||
"<center>"
|
||||
"<input type=submit value=Submit>"
|
||||
"</center>"
|
||||
);
|
||||
|
||||
|
||||
// just print the page if no url given
|
||||
|
54
PagePerf.cpp
@ -99,14 +99,15 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
|
||||
//skip request path
|
||||
while (!isspace(*rbufEnd)) rbufEnd++;
|
||||
*rbufEnd = '\0';
|
||||
char* refresh = strstr(rbuf, "&rr=");
|
||||
//char* refresh = strstr(rbuf, "&rr=");
|
||||
|
||||
|
||||
// print resource table
|
||||
// columns are the dbs
|
||||
p.safePrintf(
|
||||
//"<center>Disk Statistics<br><br>"
|
||||
"<center><br>"
|
||||
"<center>"
|
||||
//"<br>"
|
||||
//"<img name=\"diskgraph\"
|
||||
//src=/diskGraph%li.gif><br><br>",
|
||||
//g_hostdb.m_hostId );
|
||||
@ -115,12 +116,13 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
|
||||
// now try using absolute divs instead of a GIF
|
||||
g_stats.printGraphInHtml ( p );
|
||||
|
||||
/*
|
||||
if(autoRefresh > 0) {
|
||||
if(refresh) *(refresh+4) = '0';
|
||||
p.safePrintf(
|
||||
"<center><a href=\"%s\">Auto Refresh Off</a>"
|
||||
"</center>",
|
||||
rbuf + 4/*skip over GET*/);
|
||||
rbuf + 4); // skip over GET
|
||||
p.safePrintf( "<input type=\"hidden\" "
|
||||
"name=\"dontlog\" value=\"1\">");
|
||||
|
||||
@ -132,20 +134,26 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
|
||||
p.safePrintf(
|
||||
"<center><a href=\"%s%s\">Auto Refresh</a>"
|
||||
"</center>",
|
||||
rbuf + 4/*skip over GET*/, rr);
|
||||
rbuf + 4, rr); // skip over "GET "
|
||||
}
|
||||
*/
|
||||
|
||||
// print the key
|
||||
p.safePrintf (
|
||||
"<br>"
|
||||
"<center>"
|
||||
"<table border=1 cellpadding=2>"
|
||||
//"<table %s>"
|
||||
//"<tr>%s</tr></table>"
|
||||
|
||||
"<tr>%s</tr></table>"
|
||||
"<style>"
|
||||
".poo { background-color:#%s;}\n"
|
||||
"</style>\n"
|
||||
|
||||
"<table border=1 cellpadding=2>"
|
||||
|
||||
"<table %s>"
|
||||
|
||||
// black
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td bgcolor=#000000> </td>"
|
||||
"<td> High priority disk read. "
|
||||
"Thicker lines for bigger reads.</td>"
|
||||
@ -158,7 +166,7 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
|
||||
|
||||
|
||||
// red
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td bgcolor=#ff0000> </td>"
|
||||
"<td> Disk write. "
|
||||
"Thicker lines for bigger writes.</td>"
|
||||
@ -170,7 +178,7 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
|
||||
|
||||
|
||||
// dark brown
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td bgcolor=#753d30> </td>"
|
||||
"<td> Processing raw query. Has raw= parm.</td>"
|
||||
|
||||
@ -181,7 +189,7 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
|
||||
|
||||
|
||||
// pinkish purple
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td bgcolor=#aa00aa> </td>"
|
||||
"<td> Send data over network. (low priority)"
|
||||
"Thicker lines for bigger sends.</td>"
|
||||
@ -193,7 +201,7 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
|
||||
"</tr>"
|
||||
|
||||
// pinkish purple
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td bgcolor=#ff00ff> </td>"
|
||||
"<td> Send data over network. (high priority)"
|
||||
"Thicker lines for bigger sends.</td>"
|
||||
@ -206,7 +214,7 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
|
||||
|
||||
|
||||
// dark purple
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td bgcolor=#8220ff> </td>"
|
||||
"<td> Get all summaries for results.</td>"
|
||||
|
||||
@ -218,7 +226,7 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
|
||||
|
||||
|
||||
// white
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td bgcolor=#ffffff> </td>"
|
||||
"<td> Uncompress cached document.</td>"
|
||||
|
||||
@ -229,7 +237,7 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
|
||||
|
||||
|
||||
// bright green
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td bgcolor=#00ff00> </td>"
|
||||
"<td> Compute search results. "
|
||||
"All terms required. rat=1.</td>"
|
||||
@ -241,7 +249,7 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
|
||||
"</tr>"
|
||||
|
||||
// bright green
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
"<td bgcolor=#ccffcc> </td>"
|
||||
"<td> Compute reference pages. "
|
||||
"</td>"
|
||||
@ -252,7 +260,7 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
|
||||
"<td bgcolor=#d1e1ff> </td>"
|
||||
"<td> Compute Gigabits. "
|
||||
@ -265,7 +273,7 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
|
||||
"</tr>"
|
||||
|
||||
|
||||
"<tr>"
|
||||
"<tr class=poo>"
|
||||
|
||||
"<td bgcolor=#0000b0> </td>"
|
||||
"<td> \"Summary\" extraction (low priority) "
|
||||
@ -279,10 +287,12 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
|
||||
|
||||
|
||||
"</table>"
|
||||
"</center>",
|
||||
g_stats.m_keyCols.getBufStart() &&
|
||||
g_conf.m_dynamicPerfGraph ?
|
||||
g_stats.m_keyCols.getBufStart() : ""
|
||||
"</center>"
|
||||
, LIGHT_BLUE
|
||||
, TABLE_STYLE
|
||||
//,g_stats.m_keyCols.getBufStart() &&
|
||||
//g_conf.m_dynamicPerfGraph ?
|
||||
//g_stats.m_keyCols.getBufStart() : ""
|
||||
);
|
||||
|
||||
if(autoRefresh > 0) p.safePrintf("</body>");
|
||||
|
@ -108,7 +108,7 @@ bool sendPageReindex ( TcpSocket *s , HttpRequest *r ) {
|
||||
// if they are NOT submitting a request print the interface
|
||||
// and we're not running, just print the interface
|
||||
t = r->getString ("action" , &len );
|
||||
if ( len != 2 ) { // && ! s_isRunning ) {
|
||||
if ( len < 2 ) { // && ! s_isRunning ) {
|
||||
//p = g_pages.printAdminTop ( p , pend , s , r );
|
||||
//p = printInterface ( p , pend,q,username,coll,NULL,qlangStr);
|
||||
g_pages.printAdminTop ( &sb , s , r );
|
||||
@ -315,19 +315,25 @@ bool printInterface (SafeBuf *sb, char *q , //long user ,
|
||||
errmsg );
|
||||
}
|
||||
|
||||
sb->safePrintf(
|
||||
"<style>"
|
||||
".poo { background-color:#%s;}\n"
|
||||
"</style>\n" ,
|
||||
LIGHT_BLUE );
|
||||
|
||||
char bb [ MAX_COLL_LEN + 60 ];
|
||||
bb[0]='\0';
|
||||
//if ( user == USER_MASTER && c && c[0] ) sprintf ( bb , " (%s)", c);
|
||||
|
||||
// print the reindex interface
|
||||
sb->safePrintf (
|
||||
"<table width=100%% bgcolor=#%s cellpadding=4 border=1>"
|
||||
"<tr><td colspan=3 bgcolor=#%s><center>"
|
||||
"<table %s>"
|
||||
"<tr><td colspan=3><center>"
|
||||
//"<font size=+1>"
|
||||
"<b>"
|
||||
"Reindex Urls"
|
||||
"</b>%s</td></tr>"
|
||||
"<tr><td colspan=3>"
|
||||
"<tr bgcolor=#%s><td colspan=3>"
|
||||
"<font size=1>"
|
||||
"Reindex the URLs that match this query. If URLs are "
|
||||
"banned in tagdb they will be removed from the index. "
|
||||
@ -339,7 +345,7 @@ bool printInterface (SafeBuf *sb, char *q , //long user ,
|
||||
"whatever rule they match in the URL Filters table."
|
||||
"</td></tr>"
|
||||
|
||||
"<tr><td><b>query</b>"
|
||||
"<tr class=poo><td><b>query</b>"
|
||||
"<br><font size=1>"
|
||||
"URLs matching this query will be added to the spider "
|
||||
"queue for re-spidering."
|
||||
@ -359,32 +365,32 @@ bool printInterface (SafeBuf *sb, char *q , //long user ,
|
||||
"name=updatetags>"
|
||||
"</td></tr>"
|
||||
*/
|
||||
, LIGHT_BLUE , DARK_BLUE , bb , q );
|
||||
, TABLE_STYLE , bb , DARK_BLUE , q );
|
||||
|
||||
if ( ! qlangStr ) qlangStr = "";
|
||||
|
||||
sb->safePrintf (
|
||||
|
||||
"<tr><td><b>start result number</b>"
|
||||
"<tr class=poo><td><b>start result number</b>"
|
||||
"<font size=1>"
|
||||
"<br>Start at this search result number. Default 0.</td>"
|
||||
"<td><input type=text name=srn value=0 size=10>"
|
||||
"</td></tr>"
|
||||
|
||||
"<tr><td><b>end result number</b>"
|
||||
"<tr class=poo><td><b>end result number</b>"
|
||||
"<font size=1>"
|
||||
"<br>Stop at this search result number. "
|
||||
"Default 2000000. (2M)</td>"
|
||||
"<td><input type=text name=ern size=10 value=2000000>"
|
||||
"</td></tr>"
|
||||
|
||||
"<tr><td><b>query language</b>"
|
||||
"<tr class=poo><td><b>query language</b>"
|
||||
"<font size=1>"
|
||||
"<br>Language that helps determine sort result ranking.</td>"
|
||||
"<td><input type=text name=qlang size=6 value=\"%s\">"
|
||||
"</td></tr>"
|
||||
|
||||
"<tr><td><b>FORCE DELETE</b>"
|
||||
"<tr class=poo><td><b>FORCE DELETE</b>"
|
||||
"<font size=1>"
|
||||
"<br>Check this checkbox to "
|
||||
"delete every search result matching the above "
|
||||
@ -434,7 +440,7 @@ bool printInterface (SafeBuf *sb, char *q , //long user ,
|
||||
// submit button
|
||||
sb->safePrintf(
|
||||
"<center>"
|
||||
"<input type=submit name=action value=OK>"
|
||||
"<input type=submit name=action value=Submit>"
|
||||
"</center>"
|
||||
"</form></html>");
|
||||
|
||||
|
150
PageResults.cpp
@ -57,6 +57,11 @@ public:
|
||||
// for printing our search result json items in csv:
|
||||
HashTableX m_columnTable;
|
||||
long m_numCSVColumns;
|
||||
|
||||
// stuff for doing redownloads
|
||||
bool m_didRedownload;
|
||||
XmlDoc *m_xd;
|
||||
long m_oldContentHash32;
|
||||
};
|
||||
|
||||
static bool printResult ( SafeBuf &sb,
|
||||
@ -467,6 +472,11 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
|
||||
}
|
||||
mnew ( st , sizeof(State0) , "PageResults2" );
|
||||
|
||||
// init some stuff
|
||||
st->m_didRedownload = false;
|
||||
st->m_xd = NULL;
|
||||
st->m_oldContentHash32 = 0;
|
||||
|
||||
// copy yhits
|
||||
if ( ! st->m_hr.copy ( hr ) )
|
||||
return sendReply ( st , NULL );
|
||||
@ -615,6 +625,15 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
|
||||
return status2;
|
||||
}
|
||||
|
||||
// if returned json result is > maxagebeforedownload then we redownload the
|
||||
// page and if its checksum has changed we return empty results
|
||||
void doneRedownloadingWrapper ( void *state ) {
|
||||
// cast our State0 class from this
|
||||
State0 *st = (State0 *) state;
|
||||
// resume
|
||||
gotResults ( st );
|
||||
}
|
||||
|
||||
/*
|
||||
void gotSpellingWrapper( void *state ){
|
||||
// cast our State0 class from this
|
||||
@ -749,6 +768,85 @@ bool gotResults ( void *state ) {
|
||||
return sendReply(st,NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
//
|
||||
// BEGIN REDOWNLOAD LOGIC
|
||||
//
|
||||
|
||||
////////////
|
||||
//
|
||||
// if caller wants a certain freshness we might have to redownload the
|
||||
// parent url to get the new json
|
||||
//
|
||||
////////////
|
||||
// get the first result
|
||||
Msg20 *m20first = msg40->m_msg20[0];
|
||||
long mabr = st->m_hr.getLong("maxagebeforeredownload",-1);
|
||||
if ( mabr >= 0 &&
|
||||
numResults > 0 &&
|
||||
// only do this once
|
||||
! st->m_didRedownload &&
|
||||
// need at least one result
|
||||
m20first &&
|
||||
// get the last spidered time from the msg20 reply of that result
|
||||
m20first->m_r->m_lastSpidered - now > mabr ) {
|
||||
// make a new xmldoc to do the redownload
|
||||
XmlDoc *xd;
|
||||
try { xd = new (XmlDoc); }
|
||||
catch ( ... ) {
|
||||
g_errno = ENOMEM;
|
||||
log("query: Failed to alloc xmldoc.");
|
||||
}
|
||||
if ( g_errno ) return sendReply (st,NULL);
|
||||
mnew ( xd , sizeof(XmlDoc) , "mabrxd");
|
||||
// save it
|
||||
st->m_xd = xd;
|
||||
// get this
|
||||
st->m_oldContentHash32 = m20rep->m_contentHash32;
|
||||
// do not re-do redownload
|
||||
st->m_didRedownload = true;
|
||||
// set it
|
||||
xd->setUrl(parentUrl);
|
||||
xd->setCallback ( st , doneRedownloadingWrapper );
|
||||
// get the checksum
|
||||
if ( xd->getContentChecksum32Fast() == (void *)-1 )
|
||||
// return false if it blocked
|
||||
return false;
|
||||
// error?
|
||||
if ( g_errno ) return sendReply (st,NULL);
|
||||
// how did this not block
|
||||
log("page: redownload did not would block adding parent");
|
||||
}
|
||||
|
||||
// if we did the redownload and checksum changed, return 0 results
|
||||
if ( st->m_didRedownload ) {
|
||||
// get the doc we downloaded
|
||||
XmlDoc *xd = st->m_xd;
|
||||
// get it
|
||||
long newHash32 = xd->getContentHash32();
|
||||
// log it
|
||||
if ( newHash32 != st->m_oldContentHash32 )
|
||||
// note it in logs for now
|
||||
log("results: content changed for %s",xd->m_firstUrl.m_url);
|
||||
// free it
|
||||
mdelete(xd, sizeof(XmlDoc), "mabrxd" );
|
||||
delete xd;
|
||||
// null it out so we don't try to re-free
|
||||
st->m_xd = NULL;
|
||||
// if content is significantly different, return 0 results
|
||||
if ( newHash32 != st->m_oldContentHash32 ) {
|
||||
SafeBuf sb;
|
||||
// empty json i guess
|
||||
sb.safePrintf("[]\n");
|
||||
return sendReply(st,sb.getBufStart());
|
||||
}
|
||||
// otherwise, print the diffbot json results, they are still valid
|
||||
}
|
||||
|
||||
//
|
||||
// END REDOWNLOAD LOGIC
|
||||
//
|
||||
*/
|
||||
|
||||
//
|
||||
// BEGIN ADDING URL
|
||||
@ -1061,7 +1159,8 @@ bool gotResults ( void *state ) {
|
||||
|
||||
// otherwise, we had no error
|
||||
if ( numResults == 0 && si->m_format == FORMAT_HTML ) {
|
||||
sb.safePrintf ( "No results found." );
|
||||
sb.safePrintf ( "No results found in <b>%s</b> collection.",
|
||||
cr->m_coll);
|
||||
}
|
||||
else if ( moreFollow && si->m_format == FORMAT_HTML ) {
|
||||
if ( isAdmin && si->m_docsToScanForReranking > 1 )
|
||||
@ -1128,11 +1227,8 @@ bool gotResults ( void *state ) {
|
||||
if ( collLen == 4 && strncmp ( coll, "main", 4) == 0 ) isMain = true;
|
||||
|
||||
// print "in collection ***" if we had a collection
|
||||
if ( collLen > 0 && ! isMain && isAdmin ) {
|
||||
sb.safePrintf (" in collection '<b>");
|
||||
sb.safeMemcpy ( coll , collLen );
|
||||
sb.safeMemcpy ( "</b>'" , 5 );
|
||||
}
|
||||
if ( collLen > 0 && ! isMain && si->m_format == FORMAT_HTML )
|
||||
sb.safePrintf (" in collection <b>%s</b>",coll);
|
||||
|
||||
|
||||
char *pwd = si->m_pwd;
|
||||
@ -2409,7 +2505,7 @@ static bool printResult ( SafeBuf &sb,
|
||||
mr->m_docId );
|
||||
|
||||
// the new links
|
||||
if ( si->m_format == FORMAT_HTML ) {
|
||||
if ( si->m_format == FORMAT_HTML && g_conf.m_isMattWells ) {
|
||||
//sb.safePrintf(" - <a href=\"/scoring?"
|
||||
// "c=%s&\">scoring</a>",
|
||||
// coll );
|
||||
@ -4724,21 +4820,26 @@ bool printLogoAndSearchBox ( SafeBuf &sb , HttpRequest *hr , long catId ) {
|
||||
else
|
||||
sb.safePrintf("<a title=\"Search the web\" href=/>web</a>");
|
||||
|
||||
|
||||
|
||||
sb.safePrintf(" " );
|
||||
// SEO functionality not included yet - so redir to gigablast.
|
||||
if ( g_conf.m_isMattWells )
|
||||
sb.safePrintf("<a title=\"Rank higher in "
|
||||
"Google\" href='/seo'>");
|
||||
else
|
||||
sb.safePrintf("<a title=\"Rank higher in "
|
||||
"Google\" href='https://www.gigablast."
|
||||
"com/seo'>");
|
||||
|
||||
|
||||
if ( g_conf.m_isMattWells ) {
|
||||
// SEO functionality not included yet - so redir to gigablast.
|
||||
if ( g_conf.m_isMattWells )
|
||||
sb.safePrintf("<a title=\"Rank higher in "
|
||||
"Google\" href='/seo'>");
|
||||
else
|
||||
sb.safePrintf("<a title=\"Rank higher in "
|
||||
"Google\" href='https://www.gigablast."
|
||||
"com/seo'>");
|
||||
|
||||
sb.safePrintf(
|
||||
"seo</a>"
|
||||
" "
|
||||
);
|
||||
sb.safePrintf(
|
||||
"seo</a>"
|
||||
" "
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
if (catId <= 0 )
|
||||
sb.safePrintf("<a title=\"Browse the DMOZ directory\" "
|
||||
@ -4757,12 +4858,12 @@ bool printLogoAndSearchBox ( SafeBuf &sb , HttpRequest *hr , long catId ) {
|
||||
// i'm not sure why this was removed. perhaps
|
||||
// because it is not working yet because of
|
||||
// some bugs...
|
||||
"<!-- <a title=\"Advanced web search\" "
|
||||
"<a title=\"Advanced web search\" "
|
||||
"href=/adv.html>"
|
||||
"advanced"
|
||||
"</a>"
|
||||
|
||||
" -->"
|
||||
" "
|
||||
|
||||
"<a title=\"Add your url to the index\" "
|
||||
"href=/addurl>"
|
||||
@ -4945,6 +5046,11 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) {
|
||||
Msg20 *m20 = msg40->m_msg20[i];
|
||||
Msg20Reply *mr = m20->m_r;
|
||||
|
||||
if ( ! mr ) {
|
||||
log("results: missing msg20 reply for result #%li",i);
|
||||
continue;
|
||||
}
|
||||
|
||||
// get content
|
||||
char *json = mr->ptr_content;
|
||||
// how can it be empty?
|
||||
|
96
PageRoot.cpp
96
PageRoot.cpp
@ -61,20 +61,23 @@ bool printNav ( SafeBuf &sb , HttpRequest *r ) {
|
||||
"<a href=%s/privacy.html>Privacy Policy</a>"
|
||||
" "
|
||||
"<a href=%s/searchfeed.html>Search API</a>"
|
||||
" "
|
||||
"<a href=%s/seoapi.html>SEO API</a>"
|
||||
" "
|
||||
"<a href=%s/account>My Account</a> "
|
||||
, root
|
||||
, root
|
||||
, root
|
||||
, root
|
||||
, root
|
||||
, root
|
||||
, rootSecure
|
||||
|
||||
//" <a href=/logout>Logout</a>"
|
||||
);
|
||||
|
||||
if ( g_conf.m_isMattWells )
|
||||
sb.safePrintf(" "
|
||||
"<a href=%s/seoapi.html>SEO API</a>"
|
||||
" "
|
||||
"<a href=%s/account>My Account</a> "
|
||||
, root
|
||||
, rootSecure
|
||||
//" <a href=/logout>Logout</a>"
|
||||
);
|
||||
|
||||
if ( r->isLocal() )
|
||||
sb.safePrintf(" [<a href=\"/master?\">Admin</a>]");
|
||||
sb.safePrintf("</p></b></center></body></html>");
|
||||
@ -152,7 +155,15 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
|
||||
// submit to https now
|
||||
sb.safePrintf("<form method=get "
|
||||
"action=/search name=f>\n");
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( r );
|
||||
if ( cr )
|
||||
sb.safePrintf("<input type=hidden name=c value=\"%s\">",
|
||||
cr->m_coll);
|
||||
|
||||
sb.safePrintf("<input name=q type=text size=60 value=\"\"> <input type=\"submit\" value=\"Search\">\n");
|
||||
|
||||
|
||||
sb.safePrintf("\n");
|
||||
sb.safePrintf("</form>\n");
|
||||
sb.safePrintf("<br>\n");
|
||||
@ -381,7 +392,12 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
|
||||
|
||||
sb.safePrintf("<br><br>\n");
|
||||
sb.safePrintf("<br><br><br>\n");
|
||||
sb.safePrintf("<a href=/>web</a> <a href=http://www.gigablast.com/seo>seo</a> <a href=\"/Top\">directory</a> \n");
|
||||
sb.safePrintf("<a href=/>web</a> ");
|
||||
if ( g_conf.m_isMattWells )
|
||||
sb.safePrintf("<a href=http://www.gigablast.com/seo>seo"
|
||||
"</a> " );
|
||||
sb.safePrintf("<a href=\"/Top\">directory</a> "
|
||||
" \n");
|
||||
sb.safePrintf("<a href=/adv.html>advanced search</a>");
|
||||
sb.safePrintf(" ");
|
||||
sb.safePrintf("<b title=\"Instantly add your url to Gigablast's "
|
||||
@ -391,8 +407,17 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
|
||||
sb.safePrintf("<br><br>\n");
|
||||
sb.safePrintf("<form method=get action=/addurl name=f>\n");
|
||||
|
||||
//CollectionRec *cr = g_collectiondb.getRec ( "main" );
|
||||
//sb.safePrintf("<input type=hidden name=c value=\"%s\">",cr->m_coll);
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( r );
|
||||
// the collection we want to add the url to
|
||||
char *coll = NULL;
|
||||
if ( cr )
|
||||
coll = cr->m_coll;
|
||||
if ( coll )
|
||||
sb.safePrintf("<input type=hidden name=c value=\"%s\">",coll);
|
||||
if ( ! coll )
|
||||
coll = "";
|
||||
|
||||
sb.safePrintf("<input name=u type=text size=60 value=\"");
|
||||
if ( url ) {
|
||||
SafeBuf tmp;
|
||||
@ -416,6 +441,9 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
|
||||
// or if in read-only mode
|
||||
if ( g_conf.m_readOnlyMode )
|
||||
msg = "Add url is temporarily disabled";
|
||||
|
||||
sb.safePrintf("<br><br>Add a url to the <b>%s</b> collection",coll);
|
||||
|
||||
// if url is non-empty the ajax will receive this identical msg
|
||||
// and display it in the div, so do not duplicate the msg!
|
||||
if ( msg && ! url )
|
||||
@ -453,11 +481,12 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
|
||||
unsigned long long rand64 = gettimeofdayInMillisecondsLocal();
|
||||
// msg7 needs an explicit collection for /addurl for injecting
|
||||
// in PageInject.cpp. it does not use defaults for safety.
|
||||
sb.safePrintf("&id=%lu&c=main&rand=%llu';\n"
|
||||
sb.safePrintf("&id=%lu&c=%s&rand=%llu';\n"
|
||||
"client.open('GET', url );\n"
|
||||
"client.send();\n"
|
||||
"</script>\n"
|
||||
, h32
|
||||
, coll
|
||||
, rand64
|
||||
);
|
||||
sb.safePrintf("</div>\n");
|
||||
@ -526,9 +555,21 @@ bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) {
|
||||
|
||||
sb.safePrintf("<br><br>\n");
|
||||
sb.safePrintf("<br><br><br>\n");
|
||||
sb.safePrintf("<a href=/>web</a> <a href=http://www.gigablast.com/seo>seo</a> <b>directory</b> \n");
|
||||
sb.safePrintf("<a href=http://www.gigablast.com/events>events</a>"
|
||||
" \n");
|
||||
|
||||
sb.safePrintf("<a href=/>web</a> ");
|
||||
|
||||
if ( g_conf.m_isMattWells )
|
||||
sb.safePrintf("<a href=http://www.gigablast.com/seo>seo"
|
||||
"</a> " );
|
||||
|
||||
sb.safePrintf("<a href=\"/Top\"><b>directory</b></a> "
|
||||
" \n");
|
||||
|
||||
if ( g_conf.m_isMattWells )
|
||||
sb.safePrintf("<a href=http://www.gigablast.com/events>"
|
||||
"events</a>"
|
||||
" \n");
|
||||
|
||||
sb.safePrintf("<a href=/adv.html>advanced search</a>");
|
||||
sb.safePrintf(" ");
|
||||
char *root = "";
|
||||
@ -578,18 +619,13 @@ bool sendPageRoot ( TcpSocket *s , HttpRequest *r, char *cookie ) {
|
||||
//long qlen;
|
||||
//char *q = r->getString ( "q" , &qlen , NULL );
|
||||
// insert collection name too
|
||||
long collLen;
|
||||
char *coll = r->getString("c",&collLen);
|
||||
if ( ! coll || ! coll[0] ) {
|
||||
//coll = g_conf.m_defaultColl;
|
||||
coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
|
||||
collLen = gbstrlen(coll);
|
||||
}
|
||||
// ensure collection not too big
|
||||
if ( collLen >= MAX_COLL_LEN ) {
|
||||
g_errno = ECOLLTOOBIG;
|
||||
CollectionRec *cr = g_collectiondb.getRec(r);
|
||||
if ( ! cr ) {
|
||||
g_errno = ENOCOLLREC;
|
||||
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
|
||||
}
|
||||
|
||||
|
||||
// get the collection rec
|
||||
/*
|
||||
CollectionRec *cr = g_collectiondb.getRec ( coll );
|
||||
@ -1271,7 +1307,9 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
|
||||
// collLen = gbstrlen(coll);
|
||||
//}
|
||||
// get collection rec
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( r );
|
||||
|
||||
// bitch if no collection rec found
|
||||
if ( ! cr ) {
|
||||
g_errno = ENOCOLLREC;
|
||||
@ -1552,6 +1590,8 @@ void doneInjectingWrapper3 ( void *st ) {
|
||||
//CollectionRec *cr = g_collectiondb.getRec ( st1->m_coll );
|
||||
|
||||
// collection name
|
||||
char *coll = st1->m_coll;
|
||||
if ( ! coll ) coll = "";
|
||||
|
||||
//char tt [ 128 ];
|
||||
//tt[0] = '\0';
|
||||
@ -1658,8 +1698,10 @@ void doneInjectingWrapper3 ( void *st ) {
|
||||
unsigned long rand32 = rand();
|
||||
// in the mime to 0 seconds!
|
||||
sb.safePrintf("<b>Url successfully added. "
|
||||
"<a href=/search?rand=%lu&q=url%%3A",
|
||||
rand32);
|
||||
"<a href=/search?rand=%lu&"
|
||||
"c=%s&q=url%%3A",
|
||||
rand32,
|
||||
coll);
|
||||
sb.urlEncode(url);
|
||||
sb.safePrintf(">Check it</a> or "
|
||||
"<a href=http://www.gigablast.com/seo?u=");
|
||||
|
@ -131,14 +131,14 @@ bool sendPageSockets ( TcpSocket *s , HttpRequest *r ) {
|
||||
|
||||
void printTcpTable ( SafeBuf* p, char *title, TcpServer *server ) {
|
||||
// table headers for urls current being spiderd
|
||||
p->safePrintf ( "<table width=100%% bgcolor=#d0d0f0 border=1>"
|
||||
"<tr><td bgcolor=#c0c0f0 colspan=19>"
|
||||
p->safePrintf ( "<table %s>"
|
||||
"<tr class=hdrow><td colspan=19>"
|
||||
"<center>"
|
||||
//"<font size=+1>"
|
||||
"<b>%s</b>"
|
||||
//"</font>"
|
||||
"</td></tr>"
|
||||
"<tr>"
|
||||
"<tr bgcolor=#%s>"
|
||||
"<td><b>#</td>"
|
||||
"<td><b>fd</td>"
|
||||
"<td><b>age</td>"
|
||||
@ -151,7 +151,11 @@ void printTcpTable ( SafeBuf* p, char *title, TcpServer *server ) {
|
||||
"<td><b>bytes to read</td>"
|
||||
"<td><b>bytes sent</td>"
|
||||
"<td><b>bytes to send</td>"
|
||||
"</tr>\n" , title );
|
||||
"</tr>\n"
|
||||
, TABLE_STYLE
|
||||
, title
|
||||
, DARK_BLUE
|
||||
);
|
||||
// current time in milliseconds
|
||||
long long now = gettimeofdayInMilliseconds();
|
||||
// store in buffer for sorting
|
||||
@ -202,12 +206,12 @@ void printTcpTable ( SafeBuf* p, char *title, TcpServer *server ) {
|
||||
case ST_CLOSE_CALLED: st="close called"; break;
|
||||
}
|
||||
// bgcolor is lighter for incoming requests
|
||||
char *bg = "#c0c0f0";
|
||||
if ( s->m_isIncoming ) bg = "#e8e8ff";
|
||||
char *bg = "c0c0f0";
|
||||
if ( s->m_isIncoming ) bg = "e8e8ff";
|
||||
// times
|
||||
long elapsed1 = now - s->m_startTime ;
|
||||
long elapsed2 = now - s->m_lastActionTime ;
|
||||
p->safePrintf ("<tr bgcolor=%s>"
|
||||
p->safePrintf ("<tr bgcolor=#%s>"
|
||||
"<td>%li</td>" // i
|
||||
"<td>%i</td>" // fd
|
||||
"<td>%lims</td>" // elapsed seconds since start
|
||||
@ -301,26 +305,30 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
|
||||
msgCount1[s->m_msgType]++;
|
||||
}
|
||||
// print the counts
|
||||
p->safePrintf ( "<table bgcolor=#d0d0f0 border=1>"
|
||||
"<tr><td bgcolor=#c0c0f0 colspan=19>"
|
||||
p->safePrintf ( "<table %s>"
|
||||
"<tr class=hdrow><td colspan=19>"
|
||||
"<center>"
|
||||
"<b>%s Summary</b> (%li transactions)"
|
||||
"</td></tr>"
|
||||
"<tr>"
|
||||
"<tr bgcolor=#%s>"
|
||||
"<td><b>niceness</td>"
|
||||
"<td><b>msg type</td>"
|
||||
"<td><b>total</td>"
|
||||
"</tr>",
|
||||
title , server->getNumUsedSlots() );
|
||||
TABLE_STYLE,
|
||||
title , server->getNumUsedSlots() ,
|
||||
DARK_BLUE );
|
||||
for ( long i = 0; i < 96; i++ ) {
|
||||
if ( msgCount0[i] <= 0 ) continue;
|
||||
p->safePrintf("<tr><td>0</td><td>0x%lx</td><td>%li</td></tr>",
|
||||
i, msgCount0[i]);
|
||||
p->safePrintf("<tr bgcolor=#%s>"
|
||||
"<td>0</td><td>0x%lx</td><td>%li</td></tr>",
|
||||
LIGHT_BLUE,i, msgCount0[i]);
|
||||
}
|
||||
for ( long i = 0; i < 96; i++ ) {
|
||||
if ( msgCount1[i] <= 0 ) continue;
|
||||
p->safePrintf("<tr><td>1</td><td>0x%lx</td><td>%li</td></tr>",
|
||||
i, msgCount1[i]);
|
||||
p->safePrintf("<tr bgcolor=#%s>"
|
||||
"<td>1</td><td>0x%lx</td><td>%li</td></tr>",
|
||||
LIGHT_BLUE,i, msgCount1[i]);
|
||||
}
|
||||
p->safePrintf ( "</table><br>" );
|
||||
|
||||
@ -333,15 +341,15 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
|
||||
dd = //"<td><b>dns ip</b></td>"
|
||||
"<td><b>hostname</b></td>";
|
||||
}
|
||||
// table headers for urls current being spiderd
|
||||
p->safePrintf ( "<table width=100%% bgcolor=#d0d0f0 border=1>"
|
||||
"<tr><td bgcolor=#c0c0f0 colspan=19>"
|
||||
|
||||
p->safePrintf ( "<table %s>"
|
||||
"<tr class=hdrow><td colspan=19>"
|
||||
"<center>"
|
||||
//"<font size=+1>"
|
||||
"<b>%s</b> (%li transactions)"
|
||||
//"</font>"
|
||||
"</td></tr>"
|
||||
"<tr>"
|
||||
"<tr bgcolor=#%s>"
|
||||
"<td><b>age</td>"
|
||||
"<td><b>last read</td>"
|
||||
"<td><b>last send</td>"
|
||||
@ -362,7 +370,11 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
|
||||
"<td><b>dgrams to send</td>"
|
||||
"<td><b>acks read</td>"
|
||||
"<td><b>resends</td>"
|
||||
"</tr>\n" , title , server->getNumUsedSlots() , dd );
|
||||
"</tr>\n" ,
|
||||
TABLE_STYLE,
|
||||
title , server->getNumUsedSlots() ,
|
||||
DARK_BLUE ,
|
||||
dd );
|
||||
|
||||
|
||||
// now fill in the columns
|
||||
@ -385,9 +397,9 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
|
||||
if ( s->m_lastReadTime == 0LL ) strcpy ( e1 , "--" );
|
||||
if ( s->m_lastSendTime == 0LL ) strcpy ( e2 , "--" );
|
||||
// bgcolor is lighter for incoming requests
|
||||
char *bg = "#c0c0f0";
|
||||
char *bg = LIGHT_BLUE;//"c0c0f0";
|
||||
// is it incoming
|
||||
if ( ! s->m_callback ) bg = "#e8e8ff";
|
||||
if ( ! s->m_callback ) bg = LIGHTER_BLUE;//"e8e8ff";
|
||||
Host *h = g_hostdb.getHost ( s->m_ip , s->m_port );
|
||||
char *eip = "??";
|
||||
unsigned short eport = 0 ;
|
||||
@ -494,7 +506,7 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
|
||||
if ( msgType == 0x25 ) desc = "get link info";
|
||||
if ( msgType == 0xfd ) desc = "proxy forward";
|
||||
|
||||
p->safePrintf ( "<tr bgcolor=%s>"
|
||||
p->safePrintf ( "<tr bgcolor=#%s>"
|
||||
"<td>%s</td>" // age
|
||||
"<td>%s</td>" // last read
|
||||
"<td>%s</td>" // last send
|
||||
@ -540,22 +552,25 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
|
||||
cf2);
|
||||
}
|
||||
|
||||
if ( ! isDns )
|
||||
if ( ! isDns ) {
|
||||
//"<td>%s</td>" // ip
|
||||
//"<td>%hu</td>" // port
|
||||
// clickable hostId
|
||||
char *toFrom = "to";
|
||||
if ( ! s->m_callback ) toFrom = "from";
|
||||
//"<td><a href=http://%s:%hu/cgi/15.cgi>%li</a></td>"
|
||||
p->safePrintf ( "<td>0x%hhx</td>" // msgtype
|
||||
"<td><nobr>%s</nobr></td>" // desc
|
||||
"<td><a href=http://%s:%hu/"
|
||||
"<td><nobr>%s <a href=http://%s:%hu/"
|
||||
"master/sockets?"
|
||||
"c=%s>%s</a></td>"
|
||||
"c=%s>%s</a></nobr></td>"
|
||||
"<td>%s%li%s</td>" , // niceness
|
||||
s->m_msgType ,
|
||||
desc,
|
||||
//iptoa(s->m_ip) ,
|
||||
//s->m_port ,
|
||||
// begin clickable hostId
|
||||
toFrom,
|
||||
eip ,
|
||||
eport ,
|
||||
coll ,
|
||||
@ -565,6 +580,7 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
|
||||
cf2
|
||||
// end clickable hostId
|
||||
);
|
||||
}
|
||||
|
||||
p->safePrintf ( "<td>%lu</td>" // transId
|
||||
"<td>%i</td>" // called handler
|
||||
|
408
PageStats.cpp
408
PageStats.cpp
File diff suppressed because it is too large
@ -194,9 +194,6 @@ void sendReply ( void *state ) {
//g_pages.printAdminTop2 ( &buf , st->m_socket , &st->m_request, NULL ,
// tmpBuf.getBufStart(), tmpBuf.length() );

// write the controls section of the page
writeControls( &buf, st );

// Debug print of CGI parameters and errors
char startTimeStr[30];
char endTimeStr[30];
@ -211,10 +208,10 @@ void sendReply ( void *state ) {
"Turn on in the master controls.</b>"
"</font>\n" );

buf.safePrintf("<table cellpadding=10 border=0>\n");
buf.safePrintf("<table %s>\n",TABLE_STYLE);

buf.safePrintf("<tr><td>"
"<center>");
buf.safePrintf("<tr><td bgcolor=#%s>"
"<center>",LIGHT_BLUE);

/////////////////////////
//
@ -246,6 +243,9 @@ void sendReply ( void *state ) {

buf.safePrintf("</center>");

// write the controls section of the page
writeControls( &buf, st );

// print the bottom of the page
g_pages.printAdminBottom2( &buf );
@ -34,8 +34,8 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
|
||||
long hiActive = q->m_hiLaunched - q->m_hiReturned;
|
||||
long total = loActive + mdActive + hiActive;
|
||||
|
||||
p.safePrintf ( "<table width=100%% bgcolor=#d0d0f0 border=1>"
|
||||
"<tr><td bgcolor=#c0c0f0 colspan=\"11\">"
|
||||
p.safePrintf ( "<table %s>"
|
||||
"<tr class=hdrow><td colspan=\"11\">"
|
||||
//"<center>"
|
||||
//"<font size=+1>"
|
||||
"<b>Thread Type: %s"
|
||||
@ -43,12 +43,13 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
|
||||
" med: %li"
|
||||
" high: %li"
|
||||
" total: %li)</td></tr>",
|
||||
TABLE_STYLE,
|
||||
q->getThreadType(),
|
||||
loActive, mdActive,
|
||||
hiActive, total);
|
||||
|
||||
|
||||
p.safePrintf ("<tr>"
|
||||
p.safePrintf ("<tr bgcolor=#%s>"
|
||||
"<td><b>Status</b></td>"
|
||||
"<td><b>Niceness</b></td>"
|
||||
"<td><b>Queued Time</b></td>"
|
||||
@ -60,7 +61,9 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
|
||||
"<td><b>Bytes Done</b></td>"
|
||||
"<td><b>KBytes/Sec</b></td>"
|
||||
"<td><b>Read|Write</b></td>"
|
||||
"</tr>");
|
||||
"</tr>"
|
||||
, LIGHT_BLUE
|
||||
);
|
||||
|
||||
for ( long j = 0 ; j < q->m_top ; j++ ) {
|
||||
ThreadEntry *t = &q->m_entries[j];
|
||||
@ -73,7 +76,7 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
|
||||
// might have got pre-called from EDISKSTUCK
|
||||
if ( ! t->m_callback ) fs = NULL;
|
||||
|
||||
p.safePrintf("<tr>");
|
||||
p.safePrintf("<tr bgcolor=#%s>", DARK_BLUE );
|
||||
|
||||
if(t->m_isDone) {
|
||||
p.safePrintf("<td><font color='red'><b>done</b></font></td>");
|
||||
@ -109,7 +112,7 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
|
||||
if(diskThread && fs ) {
|
||||
long long took = (now - t->m_launchedTime);
|
||||
if(took <= 0) took = 1;
|
||||
p.safePrintf("<td>???/%li</td>", t->m_bytesToGo);
|
||||
p.safePrintf("<td>%c%c%c/%li</td>", '?','?','?',t->m_bytesToGo);
|
||||
p.safePrintf("<td>%.2f kbps</td>", 0.0);//(float)fs->m_bytesDone/took);
|
||||
p.safePrintf("<td>%s</td>",t->m_doWrite? "Write":"Read");
|
||||
}
|
||||
@ -159,41 +162,50 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
|
||||
long hiActiveMed = disk->m_hiLaunchedMed - disk->m_hiReturnedMed;
|
||||
long hiActiveSma = disk->m_hiLaunchedSma - disk->m_hiReturnedSma;
|
||||
long activeWrites = disk->m_writesLaunched - disk->m_writesReturned;
|
||||
p.safePrintf ( "<table width=100%% bgcolor=#d0d0f0 border=1>"
|
||||
"<tr><td bgcolor=#c0c0f0 colspan=\"5\">");
|
||||
p.safePrintf ( "<table %s>"
|
||||
"<tr class=hdrow><td colspan=\"5\">"
|
||||
, TABLE_STYLE );
|
||||
p.safePrintf ( "<center><b>Active Read Threads</b></center></td></tr>"
|
||||
"<tr><td></td><td colspan='3'><center><b>Priority</b></center></td></tr>"
|
||||
"<tr>"
|
||||
"<tr bgcolor=#%s>"
|
||||
"<td></td><td colspan='3'>"
|
||||
"<center><b>Priority</b></center></td></tr>"
|
||||
"<tr bgcolor=#%s>"
|
||||
"<td><b>Size</b></td><td>Low</td><td>Medium</td><td>High</td>"
|
||||
"</tr>"
|
||||
// "<tr>"
|
||||
// "<td>Size</td>"
|
||||
// "</tr>"
|
||||
"<tr>"
|
||||
"<tr bgcolor=#%s>"
|
||||
"<td>Small</td> <td>%li</td><td>%li</td><td>%li</td>"
|
||||
"</tr>"
|
||||
"<tr>"
|
||||
"<tr bgcolor=#%s>"
|
||||
"<td>Medium</td> <td>%li</td><td>%li</td><td>%li</td>"
|
||||
"</tr>"
|
||||
"<tr>"
|
||||
"<tr bgcolor=#%s>"
|
||||
"<td>Large</td> <td>%li</td><td>%li</td><td>%li</td>"
|
||||
"</tr>"
|
||||
"</table><br><br>",
|
||||
LIGHT_BLUE,
|
||||
LIGHT_BLUE,
|
||||
|
||||
DARK_BLUE,
|
||||
loActiveSma,
|
||||
mdActiveSma,
|
||||
hiActiveSma,
|
||||
|
||||
DARK_BLUE,
|
||||
loActiveMed,
|
||||
mdActiveMed,
|
||||
hiActiveMed,
|
||||
|
||||
DARK_BLUE,
|
||||
loActiveBig,
|
||||
mdActiveBig,
|
||||
hiActiveBig);
|
||||
|
||||
p.safePrintf ("<table width=100%% bgcolor=#d0d0f0 border=1>");
|
||||
p.safePrintf ("<tr>"
|
||||
"<td bgcolor=#c0c0f0><b>Active Write Threads</b></td><td>%li</td>"
|
||||
p.safePrintf ("<table %s>",TABLE_STYLE);
|
||||
p.safePrintf ("<tr class=hdrow>"
|
||||
"<td><b>Active Write Threads</b></td><td>%li</td>"
|
||||
"</tr></table>",
|
||||
activeWrites);
|
||||
|
||||
|
278
Pages.cpp
278
Pages.cpp
@ -222,11 +222,11 @@ static WebPage s_pages[] = {
|
||||
//USER_ADMIN | USER_MASTER ,
|
||||
"page filter page",
|
||||
sendPageGeneric , 0 } ,
|
||||
{ PAGE_INJECT , "admin/inject" , 0 , "inject urls" , 0 , 1 ,
|
||||
{ PAGE_INJECT , "admin/inject" , 0 , "inject url" , 0 , 1 ,
|
||||
//USER_ADMIN | USER_MASTER ,
|
||||
"inject url in the index here",
|
||||
sendPageInject , 2 } ,
|
||||
{ PAGE_ADDURL2 , "admin/addurl" , 0 , "add url" , 0 , 0 ,
|
||||
{ PAGE_ADDURL2 , "admin/addurl" , 0 , "add urls" , 0 , 0 ,
|
||||
//USER_ADMIN | USER_MASTER ,
|
||||
"add url page",
|
||||
sendPageAddUrl , 0 } ,
|
||||
@ -913,7 +913,9 @@ bool Pages::printAdminTop ( SafeBuf *sb ,
|
||||
if ( user ) pwd = user->m_password;
|
||||
|
||||
sb->safePrintf(
|
||||
"<html>\n"
|
||||
"<html>\n");
|
||||
|
||||
sb->safePrintf(
|
||||
"<head>\n"
|
||||
"<title>%s | gigablast admin</title>\n"
|
||||
"<meta http-equiv=\"Content-Type\" "
|
||||
@ -961,21 +963,24 @@ bool Pages::printAdminTop ( SafeBuf *sb ,
|
||||
coll, NULL, fromIp, qs );
|
||||
}
|
||||
// end table
|
||||
sb->safePrintf ("</td></tr></table><br/><br/>\n");
|
||||
sb->safePrintf ("</td></tr></table><br/>\n");//<br/>\n");
|
||||
|
||||
SafeBuf mb;
|
||||
long adds = 0;
|
||||
|
||||
PingServer *ps = &g_pingServer;
|
||||
|
||||
mb.safePrintf("<center>"
|
||||
mb.safePrintf(//"<center>"
|
||||
"<table cellpadding=5 "
|
||||
"style=\""
|
||||
//"border:2px solid black;"
|
||||
"max-width:600px\" "
|
||||
"background-color:#ff6666;"
|
||||
"border:2px #8f0000 solid;"
|
||||
"border-radius:5px;"
|
||||
"max-width:600px;"
|
||||
"\" "
|
||||
"border=0"
|
||||
">"
|
||||
"<tr><td bgcolor=#ff6666>");
|
||||
"<tr><td>");
|
||||
|
||||
// emergency message box
|
||||
if ( g_pingServer.m_hostsConfInDisagreement ) {
|
||||
@ -997,8 +1002,9 @@ bool Pages::printAdminTop ( SafeBuf *sb ,
|
||||
*needsRebalance ) {
|
||||
if ( adds ) mb.safePrintf("<br><br>");
|
||||
adds++;
|
||||
mb.safePrintf("This host requires a shard rebalance. "
|
||||
"Click 'rebalance shards' in master controls.");
|
||||
mb.safePrintf("A host requires a shard rebalance. "
|
||||
"Click 'rebalance shards' in master controls to "
|
||||
"rebalance all hosts.");
|
||||
}
|
||||
|
||||
if ( ps->m_numHostsDead ) {
|
||||
@ -1010,39 +1016,22 @@ bool Pages::printAdminTop ( SafeBuf *sb ,
|
||||
"pings.",ps->m_numHostsDead ,s );
|
||||
}
|
||||
|
||||
mb.safePrintf("</td></tr></table></center><br>");
|
||||
if ( ! g_conf.m_useThreads || g_threads.m_disabled ) {
|
||||
if ( adds ) mb.safePrintf("<br><br>");
|
||||
adds++;
|
||||
mb.safePrintf("Threads are disabled. Severely hurts "
|
||||
"performance.");
|
||||
}
|
||||
|
||||
// a new table. on the left is collections, on right is other stuff
|
||||
sb->safePrintf("<TABLE "
|
||||
"cellpadding=5 border=0>"
|
||||
"<TR>"
|
||||
"<TD valign=top>"
|
||||
"<div "
|
||||
"style="
|
||||
"max-height:600px;"
|
||||
//"max-width:225px;"
|
||||
//"min-width:225px;"
|
||||
"overflow-y:auto;"
|
||||
"overflow-x:hidden>"
|
||||
);
|
||||
|
||||
// collection under that
|
||||
status &= printCollectionNavBar ( sb, page , username , coll,pwd, qs );
|
||||
|
||||
// then collection page links and parms
|
||||
sb->safePrintf("</div></TD><TD valign=top><br>");
|
||||
|
||||
// print emergency msg box
|
||||
if ( adds )
|
||||
sb->safePrintf("%s",mb.getBufStart());
|
||||
|
||||
// print the links
|
||||
status &= printAdminLinks ( sb, page , username , coll , pwd, true );
|
||||
|
||||
// print the links
|
||||
status &= printAdminLinks ( sb, page , username , coll ,pwd , false );
|
||||
mb.safePrintf("</td></tr></table>"
|
||||
//"</center>"
|
||||
"<br>");
|
||||
|
||||
////////
|
||||
//
|
||||
// . the form
|
||||
//
|
||||
////////
|
||||
// . we cannot use the GET method if there is more than a few k of
|
||||
// parameters, like in the case of the Search Controls page. The
|
||||
// browser simply will not send the request if it is that big.
|
||||
@ -1054,7 +1043,6 @@ bool Pages::printAdminTop ( SafeBuf *sb ,
|
||||
sb->safePrintf ("<form name=\"SubmitInput\" method=\"get\" "
|
||||
"action=\"/%s\">\n",
|
||||
s_pages[page].m_filename);
|
||||
|
||||
// pass on this stuff
|
||||
//if ( ! pwd ) pwd = "";
|
||||
//sb->safePrintf ( "<input type=hidden name=pwd value=\"%s\">\n",pwd);
|
||||
@ -1065,11 +1053,66 @@ bool Pages::printAdminTop ( SafeBuf *sb ,
|
||||
if ( g_users.hasPermission ( username, PAGE_ADMIN ) ){
|
||||
sb->safePrintf("<input type=hidden name=master value=0>\n");
|
||||
}
|
||||
|
||||
// should any changes be broadcasted to all hosts?
|
||||
sb->safePrintf ("<input type=hidden name=cast value=\"%li\">\n",
|
||||
(long)s_pages[page].m_cast);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// a new table. on the left is collections, on right is other stuff
|
||||
sb->safePrintf("<TABLE "
|
||||
"cellpadding=5 border=0>"
|
||||
"<TR>"
|
||||
"<td></td>"
|
||||
);
|
||||
|
||||
|
||||
// then collection page links and parms
|
||||
sb->safePrintf("<TD valign=top>");
|
||||
|
||||
// print emergency msg box
|
||||
if ( adds )
|
||||
sb->safePrintf("<br>%s",mb.getBufStart());
|
||||
|
||||
// print the links
|
||||
status &= printAdminLinks ( sb, page , username , coll , pwd, true );
|
||||
|
||||
// print the links
|
||||
status &= printAdminLinks ( sb, page , username , coll ,pwd , false );
|
||||
|
||||
// begin 2nd row in big table
|
||||
sb->safePrintf("</td></TR>");
|
||||
|
||||
sb->safePrintf(
|
||||
"<TR>"
|
||||
"<TD valign=top>"
|
||||
"<div "
|
||||
"style=\""
|
||||
"max-height:600px;"
|
||||
"max-width:200px;"
|
||||
"min-width:200px;"
|
||||
"padding:4px;" // same as TABLE_STYLE
|
||||
"background-color:#d0d0d0;"
|
||||
"border-radius:10px;"
|
||||
"border:2px #606060 solid;"
|
||||
//"border-width:2px;"
|
||||
//"border-color:#606060;"
|
||||
"overflow-y:auto;"
|
||||
"overflow-x:hidden;"
|
||||
"line-height:23px;"
|
||||
"\""
|
||||
">"
|
||||
);
|
||||
// collection under that
|
||||
status &= printCollectionNavBar ( sb, page , username , coll,pwd, qs );
|
||||
|
||||
sb->safePrintf("</div></TD>");
|
||||
|
||||
// the controls will go here
|
||||
sb->safePrintf("<TD valign=top>");
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -1783,7 +1826,7 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
|
||||
|
||||
//sprintf(p,"<font size=+1>\n" );
|
||||
//p += gbstrlen(p);
|
||||
sb->safePrintf ("<center>\n" );
|
||||
//sb->safePrintf ("<center>\n" );
|
||||
|
||||
// soemtimes we do not want to be USER_MASTER for testing
|
||||
char buf [ 64 ];
|
||||
@ -1807,6 +1850,10 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
|
||||
if ( ! g_conf.m_isMattWells && i == PAGE_SEO )
|
||||
continue;
|
||||
|
||||
// skip page autoban link
|
||||
if ( ! g_conf.m_isMattWells && i == PAGE_AUTOBAN )
|
||||
continue;
|
||||
|
||||
// ignore these for now
|
||||
if ( i == PAGE_SECURITY ) continue;
|
||||
if ( i == PAGE_ACCESS ) continue;
|
||||
@ -1815,38 +1862,55 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
|
||||
if ( i == PAGE_SEARCHBOX ) continue;
|
||||
if ( i == PAGE_TITLEDB ) continue;
|
||||
|
||||
// print "url download" before "inject url"
|
||||
// GET /mycollname_urls.csv
|
||||
if ( i == PAGE_INJECT ) {
|
||||
sb->safePrintf (
|
||||
"<b>"
|
||||
"<a style=text-decoration:none; "
|
||||
"href=\"/download/%s_urls.txt\">"
|
||||
"url download"
|
||||
"</a>"
|
||||
"</b>"
|
||||
" \n",
|
||||
coll );
|
||||
}
|
||||
|
||||
if ( cr && ! cr->m_isCustomCrawl && i == PAGE_CRAWLBOT )
|
||||
continue;
|
||||
|
||||
// print it out
|
||||
if ( i == PAGE_LOGIN || i == PAGE_LOGIN2 )
|
||||
sb->safePrintf(
|
||||
"<span style=\"white-space:nowrap\">"
|
||||
"<a href=\"/%s?"
|
||||
//"user=%s&pwd=%s&"
|
||||
"c=%s%s\">%s</a>"
|
||||
"</span>"
|
||||
" \n",s_pages[i].m_filename,
|
||||
//username,pwd,
|
||||
coll,
|
||||
buf,s_pages[i].m_name);
|
||||
//"<span style=\"white-space:nowrap\">"
|
||||
"<a href=\"/%s?"
|
||||
//"user=%s&pwd=%s&"
|
||||
"c=%s%s\">%s</a>"
|
||||
//"</span>"
|
||||
" \n",s_pages[i].m_filename,
|
||||
//username,pwd,
|
||||
coll,
|
||||
buf,s_pages[i].m_name);
|
||||
else if ( page == i )
|
||||
sb->safePrintf(
|
||||
"<span style=\"white-space:nowrap\">"
|
||||
"<a href=\"/%s?c=%s%s\"><b>"
|
||||
"<font color=red>%s</font></b></a>"
|
||||
"</span>"
|
||||
" \n",s_pages[i].m_filename,
|
||||
coll,
|
||||
buf,s_pages[i].m_name);
|
||||
//"<span style=\"white-space:nowrap\">"
|
||||
"<a href=\"/%s?c=%s%s\"><b>"
|
||||
"<font color=red>%s</font></b></a>"
|
||||
//"</span>"
|
||||
" \n",s_pages[i].m_filename,
|
||||
coll,
|
||||
buf,s_pages[i].m_name);
|
||||
else
|
||||
sb->safePrintf(
|
||||
"<span style=\"white-space:nowrap\">"
|
||||
"<a href=\"/%s?c=%s%s\">%s</a>"
|
||||
"</span>"
|
||||
" \n",s_pages[i].m_filename,
|
||||
coll,
|
||||
buf,s_pages[i].m_name);
|
||||
//"<span style=\"white-space:nowrap\">"
|
||||
"<b>"
|
||||
"<a style=text-decoration:none; "
|
||||
"href=\"/%s?c=%s%s\">%s</a>"
|
||||
"</b>"
|
||||
//"</span>"
|
||||
" \n",s_pages[i].m_filename,
|
||||
coll,
|
||||
buf,s_pages[i].m_name);
|
||||
// print <br> after the last master admin control
|
||||
/*
|
||||
if ( i == PAGE_DELCOLL && user == USER_MASTER ) {
|
||||
@ -1861,7 +1925,24 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
|
||||
}
|
||||
*/
|
||||
}
|
||||
sb->safePrintf("</center><br/>" );
|
||||
|
||||
// print documentation links
|
||||
if ( top ) {
|
||||
sb->safePrintf(" <a style=text-decoration:none "
|
||||
"href=/admin.html>"
|
||||
"<b>"
|
||||
"admin guide"
|
||||
"</b></a> "
|
||||
" "
|
||||
" <a style=text-decoration:none; "
|
||||
"href=/developer.html>"
|
||||
"<b>dev guide</b></a>" );
|
||||
}
|
||||
|
||||
//sb->safePrintf("</center>" );
|
||||
sb->safePrintf("<br/>" );
|
||||
|
||||
if ( top ) sb->safePrintf("<br/>" );
|
||||
|
||||
if ( top ) return status;
|
||||
|
||||
@ -1956,6 +2037,7 @@ bool Pages::printCollectionNavBar ( SafeBuf *sb ,
|
||||
bool status = true;
|
||||
//if ( ! pwd ) pwd = "";
|
||||
if ( ! qs ) qs = "";
|
||||
|
||||
// if not admin just print collection name
|
||||
if ( g_collectiondb.m_numRecsUsed == 0 ) {
|
||||
sb->safePrintf ( "<center>"
|
||||
@ -1990,26 +2072,42 @@ bool Pages::printCollectionNavBar ( SafeBuf *sb ,
|
||||
|
||||
char *s = "s";
|
||||
if ( g_collectiondb.m_numRecsUsed == 1 ) s = "";
|
||||
sb->safePrintf ( "<center><b>%li Collection%s</b></center><br>\n",
|
||||
sb->safePrintf ( "<center><nobr><b>%li Collection%s</b></nobr>"
|
||||
"</center><br>\n",
|
||||
g_collectiondb.m_numRecsUsed , s );
|
||||
|
||||
char *color = "red";
|
||||
//if ( page >= PAGE_CGIPARMS ) color = "red";
|
||||
//else color = "black";
|
||||
|
||||
// style for printing collection names
|
||||
sb->safePrintf("<style>.x{text-decoration:none;font-weight:bold;}"
|
||||
".e{background-color:#e0e0e0;}"
|
||||
"</style>\n");
|
||||
|
||||
long row = 0;
|
||||
|
||||
//for ( long i = a ; i < b ; i++ ) {
|
||||
for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
|
||||
CollectionRec *cc = g_collectiondb.m_recs[i];
|
||||
if ( ! cc ) continue;
|
||||
char *cname = cc->m_coll;
|
||||
|
||||
row++;
|
||||
|
||||
//if ( p + gbstrlen(cname) + 100 >= pend ) return p;
|
||||
// collection name HACK for backwards compatibility
|
||||
//if ( ! cname[0] ) cname = "main";
|
||||
|
||||
// every other coll in a darker div
|
||||
if ( (row % 2) == 0 )
|
||||
sb->safePrintf("<div class=e>");
|
||||
|
||||
sb->safePrintf("<nobr>");
|
||||
|
||||
if ( i != collnum || ! highlight )// || ! coll || ! coll[0])
|
||||
sb->safePrintf ( "<a title=\"%s\" "
|
||||
"class=x "
|
||||
"href=\"/%s?c=%s%s\">%s"
|
||||
"</a> ",
|
||||
cname,
|
||||
@ -2017,12 +2115,17 @@ bool Pages::printCollectionNavBar ( SafeBuf *sb ,
|
||||
cname ,
|
||||
qs, cname );
|
||||
else
|
||||
sb->safePrintf ( "<b><font title=\"%s\" "
|
||||
"color=%s>%s</font></b> "
|
||||
sb->safePrintf ( "<u><b><font title=\"%s\" "
|
||||
"color=%s>%s</font></b></u> "
|
||||
" ",
|
||||
cname, color , cname );
|
||||
sb->safePrintf("</nobr>");
|
||||
sb->safePrintf("<br>\n");
|
||||
|
||||
// every other coll in a darker div
|
||||
if ( (row % 2) == 0 )
|
||||
sb->safePrintf("</div>");
|
||||
else
|
||||
sb->safePrintf("<br>\n");
|
||||
}
|
||||
|
||||
//sb->safePrintf ( "</center><br/>" );
|
||||
@ -2383,21 +2486,32 @@ bool sendPageCgiParms ( TcpSocket *s , HttpRequest *r ) {
|
||||
// p.incrementLength ( pp - p.getBuf() );
|
||||
// }
|
||||
|
||||
p.safePrintf ( "<table width=100%% cellpadding=2 "
|
||||
"bgcolor=#%s border=1>"
|
||||
"<tr><td colspan=4 bgcolor=#%s>"
|
||||
p.safePrintf ( "<table %s>"
|
||||
"<tr class=hdrow><td colspan=8>"
|
||||
"<center><b>CGI Parameters</b></tr></tr>"
|
||||
"<tr><td><b>CGI</b></td><td><b>Type</b></td>"
|
||||
"<tr bgcolor=#%s><td><b>CGI</b></td>"
|
||||
"<td><b>Page</b></td>"
|
||||
"<td><b>Type</b></td>"
|
||||
"<td><b>Name</b></td><td><b>Description</b></td></tr>\n",
|
||||
LIGHT_BLUE, DARK_BLUE );
|
||||
TABLE_STYLE , DARK_BLUE);
|
||||
for ( long i = 0; i < g_parms.m_numParms; i++ ) {
|
||||
Parm *parm = &g_parms.m_parms[i];
|
||||
if ( !parm->m_sparm ) continue;
|
||||
// use m_cgi if no m_scgi
|
||||
char *cgi = parm->m_cgi;
|
||||
if ( parm->m_scgi ) cgi = parm->m_scgi;
|
||||
|
||||
// skip if hidden
|
||||
if ( parm->m_flags & PF_HIDDEN ) continue;
|
||||
|
||||
char *page = parm->m_scmd;
|
||||
if ( ! page ) page = "";
|
||||
|
||||
// print the parm
|
||||
p.safePrintf ( "<tr><td><b>%s</b></td><td nowrap=1>", cgi );
|
||||
p.safePrintf ( "<tr bgcolor=#%s><td><b>%s</b></td>",
|
||||
LIGHT_BLUE , cgi );
|
||||
p.safePrintf("<td>%s</td>",page);
|
||||
p.safePrintf("<td nowrap=1>");
|
||||
switch ( parm->m_type ) {
|
||||
case TYPE_BOOL: p.safePrintf ( "BOOL" ); break;
|
||||
case TYPE_BOOL2: p.safePrintf ( "BOOL" ); break;
|
||||
@ -2417,14 +2531,13 @@ bool sendPageCgiParms ( TcpSocket *s , HttpRequest *r ) {
|
||||
}
|
||||
p.safePrintf ( "</table><br><br>" );
|
||||
|
||||
p.safePrintf ( "<table width=100%% cellpadding=2 "
|
||||
"bgcolor=#%s border=1>"
|
||||
"<tr><td colspan=2 bgcolor=#%s>"
|
||||
p.safePrintf ( "<table %s>"
|
||||
"<tr class=hdrow><td colspan=2>"
|
||||
"<center><b>Query Operators</b></td></tr>"
|
||||
"<tr><td><b>Operator</b></td>"
|
||||
"<td><b>Description</b>"
|
||||
"</td></tr>\n",
|
||||
LIGHT_BLUE, DARK_BLUE );
|
||||
TABLE_STYLE );
|
||||
// table of the query keywords
|
||||
long n = getNumFieldCodes();
|
||||
for ( long i = 0 ; i < n ; i++ ) {
|
||||
@ -2434,8 +2547,9 @@ bool sendPageCgiParms ( TcpSocket *s , HttpRequest *r ) {
|
||||
char *d = f->desc;
|
||||
// fix table internal cell bordering
|
||||
if ( d[0] == '\0' ) d = " ";
|
||||
p.safePrintf("<tr><td><b>%s</b>:</td><td>%s</td></tr>\n",
|
||||
f->text,d);
|
||||
p.safePrintf("<tr bgcolor=#%s>"
|
||||
"<td><b>%s</b>:</td><td>%s</td></tr>\n",
|
||||
LIGHT_BLUE,f->text,d);
|
||||
}
|
||||
|
||||
p.safePrintf("</body></html>");
|
||||
|
4
Pages.h
4
Pages.h
@ -14,8 +14,12 @@
#include "SafeBuf.h"
#include "PageCrawlBot.h" // sendPageCrawlBot()

#define LIGHTER_BLUE "e8e8ff"
#define LIGHT_BLUE "d0d0e0"
#define DARK_BLUE "c0c0f0"
#define DARKER_BLUE "a0a0f0"
#define DARKEST_BLUE "8080f0"
#define TABLE_STYLE " style=\"border-radius:10px;border:#6060f0 2px solid;\" width=100% bgcolor=#a0a0f0 cellpadding=4 border=0 "

extern char *g_msg;
6
Parms.h
6
Parms.h
@ -93,9 +93,10 @@ class Page {
#define PF_API 0x10
#define PF_REBUILDURLFILTERS 0x20
#define PF_NOSYNC 0x40
#define PF_CUSTOMCRAWLONLY 0x80
#define PF_DIFFBOT 0x80

#define PF_HIDDEN 0x0100
#define PF_NOSAVE 0x0200


class Parm {
@ -342,7 +343,8 @@ class Parms {
bool sendToGrunts = true ,
bool sendToProxies = false ,
// send to this single hostid? -1 means all
long hostId = -1 );
long hostId = -1 ,
long hostId2 = -1 ); // hostid range?
bool doParmSendingLoop ( ) ;
bool syncParmsWithHost0 ( ) ;
bool makeSyncHashList ( SafeBuf *hashList ) ;
@ -2677,8 +2677,14 @@ void checkKernelErrors( int fd, void *state ){
// klogctl reads the last 4k lines of the kernel ring buffer
short bufLen = klogctl(3,buf,4096);
long long took = gettimeofdayInMilliseconds() - st;
if ( took > 1 )
if ( took >= 3 ) {
long len = bufLen;
if ( len > 200 ) len = 200;
char c = buf[len];
buf[len] = '\0';
log("db: klogctl took %lli ms to read %s",took, buf);
buf[len] = c;
}

if ( bufLen < 0 ){
log ("db: klogctl returned error: %s",mstrerror(errno));
@ -89,7 +89,7 @@ bool Placedb::init2 ( long treeMem ) {
return false;
return true;
}

/*
bool Placedb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;
@ -101,7 +101,7 @@ bool Placedb::addColl ( char *coll, bool doVerify ) {
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}

*/
bool Placedb::verify ( char *coll ) {
log ( LOG_INFO, "db: Verifying Placedb for coll %s...", coll );
g_threads.disableThreads();
@ -227,7 +227,7 @@ bool Posdb::init2 ( long treeMem ) {


bool Posdb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! m_rdb.addRdbBase1 ( coll ) ) return false;
if ( ! doVerify ) return true;
// verify
if ( verify(coll) ) return true;
27
Process.cpp
27
Process.cpp
@ -108,6 +108,10 @@ char *g_files[] = {
|
||||
"pdftohtml", // pdf
|
||||
"pstotext" , // postscript
|
||||
//"ppthtml" , // powerpoint
|
||||
|
||||
// required for SSL server support for both getting web pages
|
||||
// on https:// sites and for serving https:// pages
|
||||
"gb.pem",
|
||||
|
||||
//"dict/unifiedDict",
|
||||
//"dict/thesaurus.txt",
|
||||
@ -187,6 +191,7 @@ char *g_files[] = {
|
||||
|
||||
bool Process::checkFiles ( char *dir ) {
|
||||
|
||||
/*
|
||||
// check these by hand since you need one or the other
|
||||
File f1;
|
||||
File f2;
|
||||
@ -199,15 +204,14 @@ bool Process::checkFiles ( char *dir ) {
|
||||
if ( //( ! f3.doesExist() || ! f4.doesExist() ) &&
|
||||
( ! f4.doesExist() ) &&
|
||||
( ! f1.doesExist() || ! f2.doesExist() ) ) {
|
||||
/*
|
||||
log("db: need either (%s and %s) or (%s and %s)",
|
||||
f3.getFilename() ,
|
||||
f4.getFilename() ,
|
||||
f1.getFilename() ,
|
||||
f2.getFilename() );
|
||||
*/
|
||||
//return false;
|
||||
}
|
||||
*/
|
||||
|
||||
// check for email subdir
|
||||
//f1.set ( dir , "/html/email/");
|
||||
@ -410,7 +414,7 @@ bool Process::init ( ) {
|
||||
//m_rdbs[m_numRdbs++] = g_tfndb.getRdb ();
|
||||
m_rdbs[m_numRdbs++] = g_titledb.getRdb ();
|
||||
//m_rdbs[m_numRdbs++] = g_revdb.getRdb ();
|
||||
//m_rdbs[m_numRdbs++] = g_sectiondb.getRdb ();
|
||||
m_rdbs[m_numRdbs++] = g_sectiondb.getRdb ();
|
||||
m_rdbs[m_numRdbs++] = g_posdb.getRdb ();
|
||||
//m_rdbs[m_numRdbs++] = g_datedb.getRdb ();
|
||||
m_rdbs[m_numRdbs++] = g_spiderdb.getRdb ();
|
||||
@ -430,7 +434,7 @@ bool Process::init ( ) {
|
||||
//m_rdbs[m_numRdbs++] = g_tfndb2.getRdb ();
|
||||
m_rdbs[m_numRdbs++] = g_titledb2.getRdb ();
|
||||
//m_rdbs[m_numRdbs++] = g_revdb2.getRdb ();
|
||||
//m_rdbs[m_numRdbs++] = g_sectiondb2.getRdb ();
|
||||
m_rdbs[m_numRdbs++] = g_sectiondb2.getRdb ();
|
||||
m_rdbs[m_numRdbs++] = g_posdb2.getRdb ();
|
||||
//m_rdbs[m_numRdbs++] = g_datedb2.getRdb ();
|
||||
m_rdbs[m_numRdbs++] = g_spiderdb2.getRdb ();
|
||||
@ -1426,6 +1430,13 @@ bool Process::shutdown2 ( ) {
|
||||
// at least destroy the page caches that have shared memory
|
||||
// because they seem to not clean it up
|
||||
resetPageCaches();
|
||||
|
||||
// let's ensure our core file can dump
|
||||
struct rlimit lim;
|
||||
lim.rlim_cur = lim.rlim_max = RLIM_INFINITY;
|
||||
if ( setrlimit(RLIMIT_CORE,&lim) )
|
||||
log("gb: setrlimit: %s.", mstrerror(errno) );
|
||||
|
||||
// . force an abnormal termination which will cause a core dump
|
||||
// . do not dump core on SIGHUP signals any more though
|
||||
abort();
|
||||
@ -1478,7 +1489,7 @@ void Process::disableTreeWrites ( ) {
|
||||
rdb->disableWrites();
|
||||
}
|
||||
// disable all spider trees and tables
|
||||
for ( long i = 0 ; i < g_collectiondb.getNumRecs() ; i++ ) {
|
||||
for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
|
||||
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(i);
|
||||
if ( ! sc ) continue;
|
||||
sc->m_waitingTree .disableWrites();
|
||||
@ -1495,7 +1506,7 @@ void Process::enableTreeWrites ( ) {
|
||||
rdb->enableWrites();
|
||||
}
|
||||
// enable all waiting trees
|
||||
for ( long i = 0 ; i < g_collectiondb.getNumRecs() ; i++ ) {
|
||||
for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
|
||||
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(i);
|
||||
if ( ! sc ) continue;
|
||||
sc->m_waitingTree .enableWrites();
|
||||
@ -1771,6 +1782,8 @@ void Process::resetAll ( ) {
|
||||
|
||||
g_wiktionary.reset();
|
||||
|
||||
g_countryCode.reset();
|
||||
|
||||
s_clusterdbQuickCache.reset();
|
||||
s_hammerCache.reset();
|
||||
s_table32.reset();
|
||||
@ -1824,7 +1837,7 @@ void Process::resetPageCaches ( ) {
|
||||
//g_datedb .getDiskPageCache()->reset();
|
||||
g_linkdb .getDiskPageCache()->reset();
|
||||
g_titledb .getDiskPageCache()->reset();
|
||||
//g_sectiondb .getDiskPageCache()->reset();
|
||||
g_sectiondb .getDiskPageCache()->reset();
|
||||
g_tagdb .getDiskPageCache()->reset();
|
||||
g_spiderdb .getDiskPageCache()->reset();
|
||||
//g_tfndb .getDiskPageCache()->reset();
|
||||
|
63
Profiler.cpp
63
Profiler.cpp
@ -66,7 +66,7 @@ bool Profiler::init() {
|
||||
return false;
|
||||
if ( ! m_activeFns.set(4,4,256,NULL,0,false,0,"activefns") )
|
||||
return false;
|
||||
return m_fn.set(4,sizeof(FnInfo),256,NULL,0,false,0,"fntbl");
|
||||
return m_fn.set(4,sizeof(FnInfo),65536,NULL,0,false,0,"fntbl");
|
||||
}
|
||||
|
||||
|
||||
@ -751,17 +751,19 @@ bool Profiler::printInfo(SafeBuf *sb,char *username, //long user,
|
||||
}
|
||||
|
||||
|
||||
sb->safePrintf( "<center>\n<table border=1 cellpadding=4 "
|
||||
"width=100%% bgcolor=#%s>\n"
|
||||
"<tr><td colspan=9 bgcolor=#%s>"
|
||||
sb->safePrintf( "<center>\n<table %s>\n"
|
||||
"<tr class=hdrow><td colspan=9>"
|
||||
"<center><b>Profiler "//- Since Startup</b></center>"
|
||||
"<a href=\"/admin/profiler?c=%s"//"
|
||||
"&profilerreset=1\">"
|
||||
"(reset)</a></b></center>"
|
||||
"</td></tr>\n",LIGHT_BLUE,DARK_BLUE,
|
||||
"</td></tr>\n",
|
||||
TABLE_STYLE,
|
||||
coll);
|
||||
|
||||
sb->safePrintf("<tr><td><b>Address</b></td><td><b>Function</b></td>");
|
||||
sb->safePrintf("<tr bgcolor=#%s>"
|
||||
"<td><b>Address</b></td><td><b>Function</b></td>"
|
||||
, LIGHT_BLUE);
|
||||
sb->safePrintf("<td><b><a href=/admin/profiler?sorts=3&c=%s>"
|
||||
"Times Called</a></b></td></td>",coll);
|
||||
sb->safePrintf("<td><b><a href=/admin/profiler?sorts=4&c=%s>"
|
||||
@ -858,12 +860,13 @@ bool Profiler::printInfo(SafeBuf *sb,char *username, //long user,
|
||||
|
||||
|
||||
//Now to print the table of functions called in the last 10 seconds
|
||||
sb->safePrintf( "<center>\n<table border=1 cellpadding=4 "
|
||||
"width=100%% bgcolor=#%s>\n"
|
||||
"<tr><td colspan=8 bgcolor=#%s>"
|
||||
sb->safePrintf( "<center>\n<table %s>\n"
|
||||
"<tr class=hdrow><td colspan=8>"
|
||||
"<center><b>Profiler - Last 10 seconds</b></center>"
|
||||
"</td></tr>\n",LIGHT_BLUE,DARK_BLUE);
|
||||
sb->safePrintf("<tr><td><b>Address</b></td><td><b>Function</b></td>");
|
||||
"</td></tr>\n",TABLE_STYLE);
|
||||
sb->safePrintf("<tr bgcolor=#%s>"
|
||||
"<td><b>Address</b></td><td><b>Function</b></td>",
|
||||
LIGHT_BLUE);
|
||||
sb->safePrintf("<td><b><a href=/admin/profiler?sort10=3&c=%s&"
|
||||
">"
|
||||
"Times Called</a></b></td></td>",coll);
|
||||
@ -1003,22 +1006,24 @@ bool Profiler::printInfo(SafeBuf *sb,char *username, //long user,
|
||||
|
||||
numSlots = m_quickpolls.getNumSlots();
|
||||
numSlotsUsed = m_quickpolls.getNumSlotsUsed();
|
||||
sb->safePrintf("<center>\n<table border=1 cellpadding=4 "
|
||||
"width=100%% bgcolor=#%s>\n"
|
||||
"<tr><td colspan=5 bgcolor=#%s>"
|
||||
sb->safePrintf("<center>\n<table %s>\n"
|
||||
"<tr class=hdrow><td colspan=5>"
|
||||
"<center><b>Triggered Quickpolls "
|
||||
"<a href=\"/admin/profiler?c=%s"
|
||||
"&qpreset=1\">"
|
||||
"(reset)</a></b></center>"
|
||||
"</td></tr>\n",LIGHT_BLUE,DARK_BLUE,
|
||||
"</td></tr>\n",
|
||||
TABLE_STYLE,
|
||||
coll);
|
||||
|
||||
sb->safePrintf("<tr><td><b>Between Functions</b></td>"
|
||||
sb->safePrintf("<tr bgcolor=#%s>"
|
||||
"<td><b>Between Functions</b></td>"
|
||||
"<td><b>max blocked(msec)</b></td>"
|
||||
"<td><b>avg time(msec)</b></td>"
|
||||
"<td><b>times triggered</b></td>"
|
||||
"<td><b>total(msec)</b></td>"
|
||||
"</tr>");
|
||||
"</tr>"
|
||||
, LIGHT_BLUE );
|
||||
|
||||
if(numSlotsUsed == 0) {
|
||||
sb->safePrintf("</table>");
|
||||
@ -1539,15 +1544,13 @@ Profiler::printRealTimeInfo(SafeBuf *sb,
|
||||
int realTimeSortMode,
|
||||
int realTimeShowAll) {
|
||||
if(!m_realTimeProfilerRunning) {
|
||||
sb->safePrintf("<table border=1 cellpadding=4 bgcolor=#%s "
|
||||
"width=100%%\n>",
|
||||
LIGHT_BLUE);
|
||||
sb->safePrintf("<tr><td colspan=7 bgcolor=#%s>"
|
||||
sb->safePrintf("<table %s>",TABLE_STYLE);
|
||||
sb->safePrintf("<tr class=hdrow><td colspan=7>"
|
||||
"<center><b>Real Time Profiler "
|
||||
"<a href=\"/admin/profiler?c=%s"
|
||||
"&rtstart=1\">"
|
||||
"(Start)</a></b></center>"
|
||||
"</td></tr>\n",DARK_BLUE,coll);
|
||||
"</td></tr>\n",coll);
|
||||
sb->safePrintf("</table><br><br>\n");
|
||||
return true;
|
||||
}
|
||||
@ -1562,16 +1565,14 @@ Profiler::printRealTimeInfo(SafeBuf *sb,
|
||||
//}
|
||||
rtNumEntries = realTimeProfilerData.getNumUsedSlots();
|
||||
if(!rtNumEntries) {
|
||||
sb->safePrintf("<table border=1 cellpadding=4 bgcolor=#%s "
|
||||
"width=100%%\n>",
|
||||
LIGHT_BLUE);
|
||||
sb->safePrintf("<tr><td colspan=7 bgcolor=#%s>"
|
||||
sb->safePrintf("<table %s>",TABLE_STYLE);
|
||||
sb->safePrintf("<tr class=hdrow><td colspan=7>"
|
||||
"<center><b>Real Time Profiler started, refresh page "
|
||||
"after some time."
|
||||
"<a href=\"/admin/profiler?c=%s"
|
||||
"&rtstop=1\">"
|
||||
"(Stop)</a></b></center>"
|
||||
"</td></tr>\n",DARK_BLUE,coll);
|
||||
"</td></tr>\n",coll);
|
||||
sb->safePrintf("</table><br><br>\n");
|
||||
startRealTimeProfiler();
|
||||
return true;
|
||||
@ -1626,9 +1627,7 @@ Profiler::printRealTimeInfo(SafeBuf *sb,
|
||||
hitEntries[i].missedQuickPollsPerFunc=missedQuickPolls;
|
||||
}
|
||||
}
|
||||
sb->safePrintf("<table border=1 cellpadding=4 bgcolor=#%s "
|
||||
"width=100%%>\n",
|
||||
LIGHT_BLUE);
|
||||
sb->safePrintf("<table %s>",TABLE_STYLE);
|
||||
char *showMessage;
|
||||
int rtall;
|
||||
if(realTimeShowAll) {
|
||||
@ -1638,11 +1637,11 @@ Profiler::printRealTimeInfo(SafeBuf *sb,
|
||||
showMessage = "(show all)";
|
||||
rtall = 1;
|
||||
}
|
||||
sb->safePrintf("<tr><td colspan=7 bgcolor=#%s>"
|
||||
sb->safePrintf("<tr class=hdrow><td colspan=7>"
|
||||
"<center><b>Real Time Profiler "
|
||||
"<a href=\"/admin/profiler?c=%s"
|
||||
"&rtall=%i\">%s</a>"
|
||||
,DARK_BLUE,coll,
|
||||
,coll,
|
||||
rtall, showMessage);
|
||||
sb->safePrintf("<a href=\"/admin/profiler?c=%s&rtstop=1\">"
|
||||
"(Stop)</a></b></center></td></tr>\n",
|
||||
|
@ -280,7 +280,7 @@ bool Proxy::initProxy ( long proxyId, unsigned short udpPort,
|
||||
//need to init collectiondb too because of addurl
|
||||
//set isdump to true because we aren't going to store any data in the
|
||||
//collection
|
||||
if ( !g_collectiondb.init( true ) ){ //isDump
|
||||
if ( !g_collectiondb.loadAllCollRecs( ) ){ //isDump
|
||||
log ("db: collectiondb init failed.");
|
||||
return false;
|
||||
}
|
||||
|
15
Query.cpp
15
Query.cpp
@ -2190,6 +2190,7 @@ bool Query::setQWords ( char boolFlag ,
// if we're hashing a url:, link:, site: or ip: term,
// then we need to hash ALL up to the first space
if ( fieldCode == FIELD_URL ||
fieldCode == FIELD_GBPARENTURL ||
fieldCode == FIELD_EXT ||
fieldCode == FIELD_LINK ||
fieldCode == FIELD_ILINK||
@ -2225,6 +2226,7 @@ bool Query::setQWords ( char boolFlag ,

// should we have normalized before hashing?
if ( fieldCode == FIELD_URL ||
fieldCode == FIELD_GBPARENTURL ||
fieldCode == FIELD_LINK ||
fieldCode == FIELD_ILINK ||
fieldCode == FIELD_SITELINK ||
@ -2237,6 +2239,8 @@ bool Query::setQWords ( char boolFlag ,
if ( fieldCode == FIELD_ILINK) addwww = true;
if ( fieldCode == FIELD_LINKS) addwww = true;
if ( fieldCode == FIELD_URL ) addwww = true;
if ( fieldCode == FIELD_GBPARENTURL )
addwww = true;
if ( fieldCode == FIELD_SITELINK)
addwww = true;
url.set ( w , wlen , addwww );
@ -3006,7 +3010,7 @@ struct QueryField g_fields[] = {
|
||||
{"ilink", FIELD_ILINK, true,"Similar to above."},
|
||||
{"sitelink", FIELD_SITELINK, true,"Matches all pages that link to the given site. Example:sitelink:www.gigablast.com matches all pages that link to some page on the www.gigablast.com site."},
|
||||
{"site", FIELD_SITE, true,"Matches all pages from the given site. Example: site:www.gigablast.com will return all the pages on the gigablast site"},
|
||||
{"coll", FIELD_COLL, true,"Not sure if this works."},
|
||||
//{"coll", FIELD_COLL, true,"Not sure if this works."},
|
||||
{"ip", FIELD_IP, true,"Matches all pages with the given ip. Example:1.2.3.4 will match all pages whose urls have that IP address."},
|
||||
{"inurl", FIELD_SUBURL, true,"Matches all pages that have the given terms in the url. Example inurl:water will match all pages whose url has the word water in it, but the word must be delineated by punctuation."},
|
||||
{"suburl", FIELD_SUBURL, true,"Same as inurl."},
|
||||
@ -3038,8 +3042,8 @@ struct QueryField g_fields[] = {
|
||||
{"gbhasext", FIELD_GBOTHER, false,""},
|
||||
{"gbsubmiturl", FIELD_GBOTHER, false,""},
|
||||
|
||||
{"qdom", FIELD_QUOTA, false,""},
|
||||
{"qhost", FIELD_QUOTA, false,""},
|
||||
//{"qdom", FIELD_QUOTA, false,""},
|
||||
//{"qhost", FIELD_QUOTA, false,""},
|
||||
{"gbtagvector", FIELD_GBTAGVECTOR, false,""},
|
||||
|
||||
{"gbgigabitvector", FIELD_GBGIGABITVECTOR, false,""},
|
||||
@ -3061,7 +3065,7 @@ struct QueryField g_fields[] = {
|
||||
{"gbduphash" ,FIELD_GBOTHER,false,"Internal use only."},
|
||||
{"gbsitetemplate" ,FIELD_GBOTHER,false,"Internal use only."},
|
||||
{"gboutlinkedtitle" ,FIELD_GBOTHER,false,"gboutlinkedtitle:0 and gboutlinkedtitle:1 matches events whose title is not in and in a hyperlink, respectively."},
|
||||
{"gbisaggregator" ,FIELD_GBOTHER,false,"gbisaggregator:0|1 depending on if the event came from an event aggregator website, like eviesays.com."},
|
||||
//{"gbisaggregator" ,FIELD_GBOTHER,false,"gbisaggregator:0|1 depending on if the event came from an event aggregator website, like eviesays.com."},
|
||||
{"gbdeduped" ,FIELD_GBOTHER,false,""},
|
||||
|
||||
{"gbinjected", FIELD_GBOTHER,false,"Was the event injected?."},
|
||||
@ -3070,7 +3074,8 @@ struct QueryField g_fields[] = {
|
||||
//{"gbendrange",FIELD_GBENDRANGE,false,""},
|
||||
|
||||
{"gbpermalink",FIELD_GBPERMALINK,false,""},
|
||||
{"gbcsenum",FIELD_GBCSENUM,false,""},
|
||||
//{"gbcsenum",FIELD_GBCSENUM,false,""},
|
||||
{"gbparenturl", FIELD_GBPARENTURL, true,"Match the json urls that were extract from this parent url. Example: gbparenturl:www.gigablast.com/addurl.htm"},
|
||||
{"gbdocid",FIELD_GBDOCID,false,"restrict results to this docid"}
|
||||
|
||||
};
|
||||
|
1
Query.h
1
Query.h
@ -108,6 +108,7 @@ typedef unsigned long long qvec_t;
#define FIELD_GBREVSORTBY 55 // i.e. sortby:price -> low to high
#define FIELD_GBNUMBERMIN 56
#define FIELD_GBNUMBERMAX 57
#define FIELD_GBPARENTURL 58

#define FIELD_GBOTHER 92
22
Rdb.cpp
22
Rdb.cpp
@ -90,7 +90,7 @@ RdbBase *Rdb::getBase ( collnum_t collnum ) {
return cr->m_bases[(unsigned char)m_rdbId];
}

// used by Rdb::addColl
// used by Rdb::addBase1()
void Rdb::addBase ( collnum_t collnum , RdbBase *base ) {
// if we are collectionless, like g_statsdb.m_rdb or
// g_cachedb.m_rdb, etc.. shared by all collections essentially.
@ -468,12 +468,17 @@ bool Rdb::updateToRebuildFiles ( Rdb *rdb2 , char *coll ) {

// . returns false and sets g_errno on error, returns true on success
// . if this rdb is collectionless we set m_collectionlessBase in addBase()
bool Rdb::addColl ( char *coll ) {
bool Rdb::addRdbBase1 ( char *coll ) { // addColl()
collnum_t collnum = g_collectiondb.getCollnum ( coll );
return addColl2 ( collnum );
return addRdbBase2 ( collnum );
}

bool Rdb::addColl2 ( collnum_t collnum ) {
bool Rdb::addRdbBase2 ( collnum_t collnum ) { // addColl2()

if ( ! m_initialized ) {
g_errno = EBADENGINEER;
return log("db: adding coll to uninitialized rdb!");
}

// catdb,statsbaccessdb,facebookdb,syncdb
if ( m_isCollectionLess )
@ -501,8 +506,9 @@ bool Rdb::addColl2 ( collnum_t collnum ) {
RdbBase *base = getBase ( collnum );
if ( base ) { // m_bases [ collnum ] ) {
g_errno = EBADENGINEER;
return log("db: %s: Rdb for collection \"%s\" exists.",
m_dbname,coll);
return log("db: Rdb for db \"%s\" and "
"collection \"%s\" (collnum %li) exists.",
m_dbname,coll,(long)collnum);
}
// make a new one
RdbBase *newColl = NULL;
@ -616,7 +622,7 @@ bool Rdb::deleteColl ( collnum_t collnum , collnum_t newCollnum ) {
// . TODO: what about outstanding merge or dump operations?
// . it seems like we can't really recycle this too easily
// because reset it not resetting filenames or directory name?
// just nuke it and rebuild using addColl2()...
// just nuke it and rebuild using addRdbBase2()...
RdbBase *oldBase = getBase ( collnum );
mdelete (oldBase, sizeof(RdbBase), "Rdb Coll");
delete (oldBase);
@ -632,7 +638,7 @@ bool Rdb::deleteColl ( collnum_t collnum , collnum_t newCollnum ) {

// if just resetting recycle base
if ( collnum != newCollnum ) {
addColl2 ( newCollnum );
addRdbBase2 ( newCollnum );
// make a new base now
//RdbBase *newBase = mnew
// new cr
4
Rdb.h
4
Rdb.h
@ -86,8 +86,8 @@ class Rdb {
Rdb ( );
~Rdb ( );

bool addColl ( char *coll );
bool addColl2 ( collnum_t collnum );
bool addRdbBase1 ( char *coll );
bool addRdbBase2 ( collnum_t collnum );
bool delColl ( char *coll );

bool resetBase ( collnum_t collnum );
130
RdbBase.cpp
130
RdbBase.cpp
@ -21,6 +21,7 @@
|
||||
#include "Collectiondb.h"
|
||||
//#include "CollectionRec.h"
|
||||
#include "Repair.h"
|
||||
#include "Rebalance.h"
|
||||
//#include "Msg3.h" // debug include
|
||||
|
||||
// how many rdbs are in "urgent merge" mode?
|
||||
@ -613,8 +614,22 @@ bool RdbBase::setFiles ( ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// everyone should start with file 0001.dat or 0000.dat
|
||||
if ( m_numFiles > 0 && m_fileIds[0] > 1 ) {
|
||||
log("db: missing file id 0001.dat for %s in coll %s. "
|
||||
"Fix this or it'll core later. Just rename the next file "
|
||||
"in line to 0001.dat/map. We probably cored at a "
|
||||
"really bad time during the end of a merge process.",
|
||||
m_dbname, m_coll );
|
||||
char *xx=NULL; *xx=0;
|
||||
}
|
||||
|
||||
|
||||
m_dir.close();
|
||||
|
||||
// ensure files are sharded correctly
|
||||
verifyFileSharding();
|
||||
|
||||
if ( ! converting ) return true;
|
||||
|
||||
// now if we are converting old titledb names to new...
|
||||
@ -655,6 +670,7 @@ long RdbBase::addFile ( long id , bool isNew , long mergeNum , long id2 ,
|
||||
(long)MAX_RDB_FILES);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// HACK: skip to avoid a OOM lockup. if RdbBase cannot dump
|
||||
// its data to disk it can backlog everyone and memory will
|
||||
// never get freed up.
|
||||
@ -1558,10 +1574,11 @@ void RdbBase::gotTokenForMerge ( ) {
|
||||
if ( m_rdb == g_tfndb.getRdb() ) m = &g_merge2;
|
||||
// sanity check
|
||||
if ( m_isMerging || m->isMerging() ) {
|
||||
if ( m_doLog )
|
||||
log(LOG_INFO,
|
||||
"merge: Someone already merging. Waiting for merge token "
|
||||
"in order to merge %s.",m_dbname);
|
||||
//if ( m_doLog )
|
||||
//log(LOG_INFO,
|
||||
//"merge: Someone already merging. Waiting for "
|
||||
//"merge token "
|
||||
//"in order to merge %s.",m_dbname);
|
||||
return;
|
||||
}
|
||||
// clear for take-off
|
||||
@ -1959,8 +1976,8 @@ void RdbBase::gotTokenForMerge ( ) {
|
||||
// . if we have no g_errno that is bad!!!
|
||||
// . we should dump core here or something cuz we have to remove the
|
||||
// merge file still to be correct
|
||||
if ( ! g_errno )
|
||||
log(LOG_INFO,"merge: Got token without blocking.");
|
||||
//if ( ! g_errno )
|
||||
// log(LOG_INFO,"merge: Got token without blocking.");
|
||||
// we now set this in init() by calling m_merge.init() so it
|
||||
// can pre-alloc it's lists in it's s_msg3 class
|
||||
// g_conf.m_mergeMaxBufSize ) ) return ;
|
||||
@ -2185,3 +2202,104 @@ void RdbBase::verifyDiskPageCache ( ) {
m_pc->verify(f);
}
}

bool RdbBase::verifyFileSharding ( ) {

if ( m_rdb->m_isCollectionLess ) return true;

//log ( "db: Verifying %s for coll %s (collnum=%li)...",
// m_dbname , m_coll , (long)m_collnum );

g_threads.disableThreads();

Msg5 msg5;
//Msg5 msg5b;
RdbList list;
char startKey[MAX_KEY_BYTES];
char endKey[MAX_KEY_BYTES];
KEYMIN(startKey,MAX_KEY_BYTES);
KEYMAX(endKey,MAX_KEY_BYTES);
long minRecSizes = 64000;
char rdbId = m_rdb->m_rdbId;
if ( rdbId == RDB_TITLEDB ) minRecSizes = 640000;

if ( ! msg5.getList ( m_rdb->m_rdbId, //RDB_POSDB ,
m_coll ,
&list ,
startKey ,
endKey ,
minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
NULL ,
0 ,
-1 ,
true ,
-1LL ,
NULL , // &msg5b ,
true )) {
g_threads.enableThreads();
return log("db: HEY! it did not block");
}

long count = 0;
long got = 0;
long printed = 0;
char k[MAX_KEY_BYTES];

for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
//key144_t k;
list.getCurrentKey(k);
count++;
//unsigned long groupId = k.n1 & g_hostdb.m_groupMask;
//unsigned long groupId = getGroupId ( RDB_POSDB , &k );
//if ( groupId == g_hostdb.m_groupId ) got++;
unsigned long shardNum = getShardNum( rdbId , k );

if ( shardNum == getMyShardNum() ) {
got++;
continue;
}

if ( ++printed > 100 ) continue;

// avoid log spam... comment this out
//log ( "db: Found bad key in list belongs to shard %li",
// shardNum);
}

g_threads.enableThreads();

//if ( got )
// log("db: verified %li recs for %s in coll %s",
// got,m_dbname,m_coll);

if ( got == count ) return true;

// tally it up
g_rebalance.m_numForeignRecs += count - got;
log ("db: Out of first %li records in %s for %s.%li, only %li belong "
"to our group.",count,m_dbname,m_coll,(long)m_collnum,got);
// exit if NONE, we probably got the wrong data
//if ( got == 0 ) log("db: Are you sure you have the "
// "right data in the right directory? ");

//log ( "db: Exiting due to Posdb inconsistency." );
g_threads.enableThreads();
return true;//g_conf.m_bypassValidation;

//log(LOG_DEBUG, "db: Posdb passed verification successfully for %li "
// "recs.", count );
// DONE
//return true;
}
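The new RdbBase::verifyFileSharding() above boils down to one check: map every key in a sample list to a shard and count how many land on a foreign shard, tallying the difference into g_rebalance.m_numForeignRecs without failing hard. A minimal standalone sketch of that tally, with simplified stand-in types (Rec, countForeignRecs and myShardNum are hypothetical and not part of this commit):

#include <cstdio>
#include <vector>

// stand-in for "the shard a parsed rdb key hashes to"
struct Rec { unsigned long shard; };

static long countForeignRecs ( const std::vector<Rec> &recs ,
                               unsigned long myShardNum ) {
	long count = 0;  // total records scanned
	long got   = 0;  // records that hash to our own shard
	for ( unsigned long i = 0 ; i < recs.size() ; i++ ) {
		count++;
		if ( recs[i].shard == myShardNum ) { got++; continue; }
	}
	// like verifyFileSharding(): log the discrepancy but do not fail hard
	if ( got != count )
		fprintf ( stderr , "db: only %li of %li recs belong to "
			  "our shard\n" , got , count );
	return count - got;  // what gets added to m_numForeignRecs
}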
@ -262,6 +262,8 @@ class RdbBase {

void verifyDiskPageCache ( );

bool verifyFileSharding ( );

// . add a (new) file to the m_files/m_maps/m_fileIds arrays
// . both return array position we added it to
// . both return -1 and set errno on error
@ -1503,9 +1503,12 @@ bool RdbCache::load ( char *dbname ) {
// does the file exist?
File f;
f.set ( g_hostdb.m_dir , filename );
// having cache file not existing on disk is not so bad, it's a cache
if ( ! f.doesExist() )
return log("db: Could not load cache from %s: does not exist.",
f.getFilename());
return false;
// return log("db: Could not load cache from %s: does not exist.",
// f.getFilename());

// open the file
if ( ! f.open ( O_RDWR ) )
return log("db: Could not open cache save file for %s: %s.",
26
RdbDump.cpp
@ -204,11 +204,14 @@ void RdbDump::doneDumping ( ) {
m_totalPosDumped , m_totalNegDumped ,
m_totalPosDumped + m_totalNegDumped );

// map verify
log("db: map # pos=%lli neg=%lli",
m_map->getNumPositiveRecs(),
m_map->getNumNegativeRecs()
);
// . map verify
// . if continueDumping called us with no collectionrec, it got
// deleted so RdbBase::m_map is nuked too i guess
if ( saved != ENOCOLLREC )
log("db: map # pos=%lli neg=%lli",
m_map->getNumPositiveRecs(),
m_map->getNumNegativeRecs()
);

// free the list's memory
if ( m_list ) m_list->freeList();
@ -1015,11 +1018,16 @@ void RdbDump::continueDumping() {

// if someone reset/deleted the collection we were dumping...
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
if ( ! cr ) g_errno = ENOCOLLREC;

if ( ! cr ) {
g_errno = ENOCOLLREC;
// m_file is invalid if collrec got nuked because so did
// the Rdbbase which has the files
log("db: continue dumping lost collection");
}
// bitch about errors
if (g_errno)log("db: Dump to %s had error writing: %s.",
m_file->getFilename(),mstrerror(g_errno));
else if (g_errno)log("db: Dump to %s had error writing: %s.",
m_file->getFilename(),mstrerror(g_errno));

// go back now if we were NOT dumping a tree
if ( ! (m_tree || m_buckets) ) {
m_isDumping = false;
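The continueDumping() change above is a defensive null-check: if the collection was deleted mid-dump, its RdbBase (and therefore m_file and m_map) is gone, so the code must bail out instead of dereferencing them. A minimal sketch of the same guard pattern, with a hypothetical lookup in place of g_collectiondb.getRec():

#include <cstdio>

struct CollectionRec { };

// hypothetical stand-in for g_collectiondb.getRec(collnum); returns NULL
// when the collection has been reset or deleted
static CollectionRec *lookupColl ( long collnum ) { (void)collnum; return 0; }

static bool continueDumpingSafely ( long collnum ) {
	CollectionRec *cr = lookupColl ( collnum );
	if ( ! cr ) {
		// the RdbBase and its files/map died with the collection,
		// so do not touch them; just note it and stop the dump
		fprintf ( stderr , "db: continue dumping lost collection\n" );
		return false;
	}
	return true;
}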
@ -1631,9 +1631,12 @@ void RdbList::merge_r ( RdbList **lists ,
// . we don't want any positive recs to go un annhilated
// . but don't worry about this check if start and end keys are equal
//if ( m_startKey != m_endKey && (m_endKey.n0 & 0x01) == 0x00 )
if ( KEYCMP(m_startKey,m_endKey,m_ks)!=0 && KEYNEG(m_endKey) )
if ( KEYCMP(m_startKey,m_endKey,m_ks)!=0 && KEYNEG(m_endKey) ) {
log(LOG_LOGIC,"db: rdblist: merge_r: Illegal endKey for "
"merging.");
"merging. fixing.");
// make it legal so it will be read first NEXT time
KEYSUB(m_endKey,1,m_ks);
}
// do nothing if no lists passed in
if ( numLists <= 0 ) return;
// inherit the key size of what we merge
72
RdbTree.cpp
72
RdbTree.cpp
@ -172,7 +172,11 @@ void RdbTree::reset ( ) {
|
||||
// unprotect it all
|
||||
if ( m_useProtection ) unprotect ( );
|
||||
// make sure string is NULL temrinated. this gbstrlen() should
|
||||
if ( m_numNodes > 0 && m_dbname[0] && gbstrlen(m_dbname) >= 0 )
|
||||
if ( m_numNodes > 0 &&
|
||||
m_dbname[0] &&
|
||||
gbstrlen(m_dbname) >= 0 &&
|
||||
// don't be spammy we can have thousands of these, one per coll
|
||||
strcmp(m_dbname,"waitingtree") )
|
||||
log(LOG_INFO,"db: Resetting tree for %s.",m_dbname);
|
||||
|
||||
// liberate all the nodes
|
||||
@ -279,7 +283,7 @@ long RdbTree::clear ( ) {
|
||||
for ( long i = 0 ; i < nc ; i++ ) {
|
||||
CollectionRec *cr = g_collectiondb.getRec(i);
|
||||
if ( ! cr ) continue;
|
||||
//if ( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
|
||||
//if (((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
|
||||
cr->m_numNegKeysInTree[(unsigned char)m_rdbId] = 0;
|
||||
cr->m_numPosKeysInTree[(unsigned char)m_rdbId] = 0;
|
||||
}
|
||||
@ -633,9 +637,11 @@ long RdbTree::addNode ( collnum_t collnum ,
|
||||
// crap, when fixing a tree this will segfault because
|
||||
// m_recs[collnum] is NULL.
|
||||
if ( m_rdbId >= 0 && g_collectiondb.m_recs[collnum] ) {
|
||||
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
|
||||
g_collectiondb.m_recs[collnum]->
|
||||
m_numNegKeysInTree[(unsigned char)m_rdbId]++;
|
||||
//if( ((unsigned char)m_rdbId)>=RDB_END){
|
||||
//char *xx=NULL;*xx=0; }
|
||||
CollectionRec *cr ;
|
||||
cr = g_collectiondb.m_recs[collnum];
|
||||
if(cr)cr->m_numNegKeysInTree[(unsigned char)m_rdbId]++;
|
||||
}
|
||||
}
|
||||
else {
|
||||
@ -644,9 +650,11 @@ long RdbTree::addNode ( collnum_t collnum ,
|
||||
// crap, when fixing a tree this will segfault because
|
||||
// m_recs[collnum] is NULL.
|
||||
if ( m_rdbId >= 0 && g_collectiondb.m_recs[collnum] ) {
|
||||
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
|
||||
g_collectiondb.m_recs[collnum]->
|
||||
m_numPosKeysInTree[(unsigned char)m_rdbId]++;
|
||||
//if( ((unsigned char)m_rdbId)>=RDB_END){
|
||||
//char *xx=NULL;*xx=0; }
|
||||
CollectionRec *cr ;
|
||||
cr = g_collectiondb.m_recs[collnum];
|
||||
if(cr)cr->m_numPosKeysInTree[(unsigned char)m_rdbId]++;
|
||||
}
|
||||
}
|
||||
// debug2 msg
|
||||
@ -839,16 +847,20 @@ void RdbTree::deleteNode ( long i , bool freeData ) {
|
||||
if ( KEYNEG(m_keys,i,m_ks) ) {
|
||||
m_numNegativeKeys--;
|
||||
//m_numNegKeysPerColl[m_collnums[i]]--;
|
||||
if ( m_rdbId >= 0 )
|
||||
g_collectiondb.m_recs[m_collnums[i]]->
|
||||
m_numPosKeysInTree[(unsigned char)m_rdbId]--;
|
||||
if ( m_rdbId >= 0 ) {
|
||||
CollectionRec *cr;
|
||||
cr = g_collectiondb.m_recs[m_collnums[i]];
|
||||
if(cr)cr->m_numNegKeysInTree[(unsigned char)m_rdbId]--;
|
||||
}
|
||||
}
|
||||
else {
|
||||
m_numPositiveKeys--;
|
||||
//m_numPosKeysPerColl[m_collnums[i]]--;
|
||||
if ( m_rdbId >= 0 )
|
||||
g_collectiondb.m_recs[m_collnums[i]]->
|
||||
m_numPosKeysInTree[(unsigned char)m_rdbId]--;
|
||||
if ( m_rdbId >= 0 ) {
|
||||
CollectionRec *cr;
|
||||
cr = g_collectiondb.m_recs[m_collnums[i]];
|
||||
if(cr)cr->m_numPosKeysInTree[(unsigned char)m_rdbId]--;
|
||||
}
|
||||
}
|
||||
// debug step -- check chain from iparent down making sure that
|
||||
//printTree();
|
||||
@ -874,11 +886,14 @@ void RdbTree::deleteNode ( long i , bool freeData ) {
|
||||
//m_numNegKeysPerColl[m_collnums[i]] = 0;
|
||||
//m_numPosKeysPerColl[m_collnums[i]] = 0;
|
||||
if ( m_rdbId >= 0 ) {
|
||||
//if ( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
|
||||
g_collectiondb.m_recs[m_collnums[i]]->
|
||||
m_numNegKeysInTree[(unsigned char)m_rdbId] = 0;
|
||||
g_collectiondb.m_recs[m_collnums[i]]->
|
||||
m_numPosKeysInTree[(unsigned char)m_rdbId] = 0;
|
||||
//if ( ((unsigned char)m_rdbId)>=RDB_END){
|
||||
//char *xx=NULL;*xx=0; }
|
||||
CollectionRec *cr ;
|
||||
cr = g_collectiondb.m_recs[m_collnums[i]];
|
||||
if(cr){
|
||||
cr->m_numNegKeysInTree[(unsigned char)m_rdbId] = 0;
|
||||
cr->m_numPosKeysInTree[(unsigned char)m_rdbId] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -945,8 +960,9 @@ void RdbTree::deleteNode ( long i , bool freeData ) {
|
||||
//m_numNegKeysPerColl[m_collnums[i]]--;
|
||||
if ( m_rdbId >= 0 ) {
|
||||
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
|
||||
g_collectiondb.m_recs[m_collnums[i]]->
|
||||
m_numNegKeysInTree[(unsigned char)m_rdbId]--;
|
||||
CollectionRec *cr ;
|
||||
cr = g_collectiondb.m_recs[m_collnums[i]];
|
||||
if(cr)cr->m_numNegKeysInTree[(unsigned char)m_rdbId]--;
|
||||
}
|
||||
}
|
||||
else {
|
||||
@ -954,8 +970,9 @@ void RdbTree::deleteNode ( long i , bool freeData ) {
|
||||
//m_numPosKeysPerColl[m_collnums[i]]--;
|
||||
if ( m_rdbId >= 0 ) {
|
||||
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
|
||||
g_collectiondb.m_recs[m_collnums[i]]->
|
||||
m_numPosKeysInTree[(unsigned char)m_rdbId]--;
|
||||
CollectionRec *cr ;
|
||||
cr = g_collectiondb.m_recs[m_collnums[i]];
|
||||
if(cr)cr->m_numPosKeysInTree[(unsigned char)m_rdbId]--;
|
||||
}
|
||||
}
|
||||
// debug step -- check chain from iparent down making sure that
|
||||
@ -3059,8 +3076,9 @@ void RdbTree::cleanTree ( ) { // char **bases ) {
|
||||
deleteNode ( i , true );
|
||||
// remove it otherwise
|
||||
// don't actually remove it!!!! in case collection gets
|
||||
// moved accidentally
|
||||
//deleteNode ( i , true );
|
||||
// moved accidentally.
|
||||
// no... otherwise it can clog up the tree forever!!!!
|
||||
deleteNode ( i , true );
|
||||
count++;
|
||||
// save it
|
||||
collnum = m_collnums[i];
|
||||
@ -3070,8 +3088,8 @@ void RdbTree::cleanTree ( ) { // char **bases ) {
|
||||
if ( count == 0 ) return;
|
||||
log(LOG_LOGIC,"db: Removed %li records from %s tree for invalid "
|
||||
"collection number %i.",count,m_dbname,collnum);
|
||||
log(LOG_LOGIC,"db: Records not actually removed for safety. Except "
|
||||
"for those with negative colnums.");
|
||||
//log(LOG_LOGIC,"db: Records not actually removed for safety. Except "
|
||||
// "for those with negative colnums.");
|
||||
static bool s_print = true;
|
||||
if ( ! s_print ) return;
|
||||
s_print = false;
|
||||
|
@ -127,6 +127,7 @@ char *Rebalance::getNeedsRebalance ( ) {
|
||||
hexToBin(keyStr,gbstrlen(keyStr), (char *)&m_nextKey);
|
||||
|
||||
m_collnum = cn;
|
||||
//m_collnum = 4695; //debug skip
|
||||
// we are valid now either way
|
||||
m_needsRebalanceValid = true;
|
||||
// assume ok
|
||||
@ -217,8 +218,9 @@ void Rebalance::scanLoop ( ) {
|
||||
if ( rdb->m_rdbId == RDB_STATSDB ) continue;
|
||||
// log it as well
|
||||
if ( m_lastRdb != rdb ) {
|
||||
log("rebal: scanning %s [%s]",
|
||||
cr->m_coll,rdb->m_dbname);
|
||||
log("rebal: scanning %s (%li) [%s]",
|
||||
cr->m_coll,(long)cr->m_collnum,
|
||||
rdb->m_dbname);
|
||||
// only do this once per rdb/coll
|
||||
m_lastRdb = rdb;
|
||||
// reset key cursor as well!!!
|
||||
@ -235,8 +237,11 @@ void Rebalance::scanLoop ( ) {
|
||||
// scan it. returns true if done, false if blocked
|
||||
if ( ! scanRdb ( ) ) return;
|
||||
// note it
|
||||
log("rebal: moved %lli of %lli recs scanned",
|
||||
m_rebalanceCount,m_scannedCount);
|
||||
log("rebal: moved %lli of %lli recs scanned in "
|
||||
"%s for coll.%s.%li",
|
||||
m_rebalanceCount,m_scannedCount,
|
||||
rdb->m_dbname,cr->m_coll,(long)cr->m_collnum);
|
||||
//if ( m_rebalanceCount ) goto done;
|
||||
m_rebalanceCount = 0;
|
||||
m_scannedCount = 0;
|
||||
m_lastPercent = -1;
|
||||
@ -245,6 +250,7 @@ void Rebalance::scanLoop ( ) {
|
||||
m_rdbNum = 0;
|
||||
}
|
||||
|
||||
// done:
|
||||
// all done
|
||||
m_isScanning = false;
|
||||
m_needsRebalance = false;
|
||||
@ -318,6 +324,8 @@ bool Rebalance::scanRdb ( ) {
|
||||
|
||||
readAnother:
|
||||
|
||||
if ( g_process.m_mode == EXIT_MODE ) return false;
|
||||
|
||||
//log("rebal: loading list start = %s",KEYSTR(m_nextKey,rdb->m_ks));
|
||||
|
||||
if ( ! m_msg5.getList ( rdb->m_rdbId ,
|
||||
@ -391,22 +399,27 @@ bool Rebalance::gotList ( ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
char *last = NULL;
|
||||
//char *last = NULL;
|
||||
|
||||
for ( ; ! m_list.isExhausted() ; m_list.skipCurrentRec() ) {
|
||||
// get tht rec
|
||||
char *rec = m_list.getCurrentRec();
|
||||
//char *rec = m_list.getCurrentRec();
|
||||
// get it
|
||||
m_list.getCurrentKey ( m_nextKey );
|
||||
// skip if negative... wtf?
|
||||
if ( KEYNEG(m_nextKey) ) continue;
|
||||
// get shard
|
||||
long shard = getShardNum ( rdbId , rec );
|
||||
long shard = getShardNum ( rdbId , m_nextKey );
|
||||
// save last ptr
|
||||
last = rec;
|
||||
//last = rec;
|
||||
// debug!
|
||||
//m_list.getKey ( rec , m_nextKey );
|
||||
//log("rebal: checking key %s",KEYSTR(m_nextKey,ks));
|
||||
// count as scanned
|
||||
m_scannedCount++;
|
||||
// skip it if it belongs with us
|
||||
if ( shard == myShard ) continue;
|
||||
// note it
|
||||
//log("rebal: shard is %li",shard);
|
||||
// count it
|
||||
m_rebalanceCount++;
|
||||
// otherwise, it does not!
|
||||
@ -445,18 +458,21 @@ bool Rebalance::gotList ( ) {
|
||||
//log("rebal: done reading list");
|
||||
|
||||
// update nextkey
|
||||
if ( last ) {
|
||||
//if ( last ) {
|
||||
if ( ! m_list.isEmpty() ) {
|
||||
// get the last key we scanned, all "ks" bytes of it.
|
||||
// because some keys are compressed and we take the
|
||||
// more significant compressed out bytes from m_list.m_*
|
||||
// member vars
|
||||
m_list.getKey ( last , m_nextKey );
|
||||
//m_list.getKey ( last , m_nextKey );
|
||||
// if it is not maxxed out, then incremenet it for the
|
||||
// next scan round
|
||||
if ( KEYCMP ( m_nextKey , KEYMAX() , ks ) != 0 )
|
||||
KEYADD ( m_nextKey , 1 , ks );
|
||||
}
|
||||
|
||||
//else {
|
||||
// log("rebal: got empty list");
|
||||
//}
|
||||
|
||||
if ( ! m_msg4a.addMetaList ( &m_posMetaList ,
|
||||
m_collnum ,
|
||||
|
241
Repair.cpp
241
Repair.cpp
@ -836,7 +836,8 @@ void Repair::getNextCollToRepair ( ) {
|
||||
|
||||
// add collection to secondary rdbs
|
||||
if ( m_rebuildTitledb ) {
|
||||
if ( ! g_titledb2.addColl ( m_coll ) &&
|
||||
if ( //! g_titledb2.addColl ( m_coll ) &&
|
||||
! g_titledb2.getRdb()->addRdbBase1(m_coll) &&
|
||||
g_errno != EEXIST ) goto hadError;
|
||||
}
|
||||
|
||||
@ -851,7 +852,7 @@ void Repair::getNextCollToRepair ( ) {
|
||||
//}
|
||||
|
||||
if ( m_rebuildPosdb ) {
|
||||
if ( ! g_posdb2.addColl ( m_coll ) &&
|
||||
if ( ! g_posdb2.getRdb()->addRdbBase1 ( m_coll ) &&
|
||||
g_errno != EEXIST ) goto hadError;
|
||||
}
|
||||
|
||||
@ -861,7 +862,7 @@ void Repair::getNextCollToRepair ( ) {
|
||||
//}
|
||||
|
||||
if ( m_rebuildClusterdb ) {
|
||||
if ( ! g_clusterdb2.addColl ( m_coll ) &&
|
||||
if ( ! g_clusterdb2.getRdb()->addRdbBase1 ( m_coll ) &&
|
||||
g_errno != EEXIST ) goto hadError;
|
||||
}
|
||||
|
||||
@ -871,7 +872,7 @@ void Repair::getNextCollToRepair ( ) {
|
||||
//}
|
||||
|
||||
if ( m_rebuildSpiderdb ) {
|
||||
if ( ! g_spiderdb2.addColl ( m_coll ) &&
|
||||
if ( ! g_spiderdb2.getRdb()->addRdbBase1 ( m_coll ) &&
|
||||
g_errno != EEXIST ) goto hadError;
|
||||
}
|
||||
|
||||
@ -881,7 +882,7 @@ void Repair::getNextCollToRepair ( ) {
|
||||
//}
|
||||
|
||||
if ( m_rebuildLinkdb ) {
|
||||
if ( ! g_linkdb2.addColl ( m_coll ) &&
|
||||
if ( ! g_linkdb2.getRdb()->addRdbBase1 ( m_coll ) &&
|
||||
g_errno != EEXIST ) goto hadError;
|
||||
}
|
||||
|
||||
@ -2254,80 +2255,116 @@ bool Repair::printRepairStatus ( SafeBuf *sb , long fromIp ) {
|
||||
}
|
||||
|
||||
// now show the rebuild status
|
||||
sb->safePrintf ( "<table>"
|
||||
sb->safePrintf (
|
||||
"<table%s"
|
||||
" id=\"repairstatustable\">"
|
||||
|
||||
"<table width=100%% bgcolor=#%s cellpadding=4 "
|
||||
"border=1 id=\"repairstatustable\">"
|
||||
|
||||
"<tr><td bgcolor=%s colspan=2><b><center>"
|
||||
"<tr class=hdrow><td colspan=2><b><center>"
|
||||
"Repair Status</center></b></td></tr>\n"
|
||||
|
||||
"<tr bgcolor=#%s><td colspan=2>"
|
||||
"<font size=-2>"
|
||||
"Use this to rebuild a database or to reindex "
|
||||
"all pages to pick up new link text."
|
||||
"</font>"
|
||||
"</td></tr>"
|
||||
|
||||
// status (see list of above statuses)
|
||||
"<tr><td width=50%%><b>status</b></td>"
|
||||
"<tr bgcolor=#%s><td width=50%%><b>status</b></td>"
|
||||
"<td>%s</td></tr>\n"
|
||||
|
||||
"<tr><td width=50%%><b>repair mode</b></td>"
|
||||
"<tr bgcolor=#%s><td width=50%%><b>repair mode</b>"
|
||||
"</td>"
|
||||
"<td>%li</td></tr>\n"
|
||||
|
||||
"<tr><td width=50%%><b>min repair mode</b></td>"
|
||||
"<tr bgcolor=#%s>"
|
||||
"<td width=50%%><b>min repair mode</b></td>"
|
||||
"<td>%li</td></tr>\n"
|
||||
|
||||
"<tr><td width=50%%><b>host ID with min repair mode"
|
||||
"<tr bgcolor=#%s>"
|
||||
"<td width=50%%><b>host ID with min repair mode"
|
||||
"</b></td>"
|
||||
"<td><a href=\"http://%s:%hu/master/repair\">"
|
||||
"%li</a></td></tr>\n"
|
||||
|
||||
"<tr><td><b>old collection</b></td>"
|
||||
"<tr bgcolor=#%s><td><b>old collection</b></td>"
|
||||
"<td>%s</td></tr>"
|
||||
|
||||
"<tr><td><b>new collection</b></td>"
|
||||
"<tr bgcolor=#%s><td><b>new collection</b></td>"
|
||||
"<td>%s</td></tr>"
|
||||
|
||||
,
|
||||
TABLE_STYLE ,
|
||||
|
||||
|
||||
LIGHT_BLUE ,
|
||||
LIGHT_BLUE ,
|
||||
status ,
|
||||
|
||||
LIGHT_BLUE ,
|
||||
(long)g_repairMode,
|
||||
|
||||
LIGHT_BLUE ,
|
||||
(long)g_pingServer.m_minRepairMode,
|
||||
|
||||
LIGHT_BLUE ,
|
||||
minIpBuf, // ip string
|
||||
minPort, // port
|
||||
(long)minHostId,
|
||||
|
||||
LIGHT_BLUE ,
|
||||
oldColl ,
|
||||
|
||||
LIGHT_BLUE ,
|
||||
newColl
|
||||
);
|
||||
|
||||
sb->safePrintf (
|
||||
// docs done, includes overwritten title recs
|
||||
"<tr bgcolor=%s><td><b>titledb recs scanned</b></td>"
|
||||
"<tr bgcolor=#%s><td><b>titledb recs scanned</b></td>"
|
||||
"<td>%lli of %lli</td></tr>\n"
|
||||
|
||||
// percent complete
|
||||
"<tr bgcolor=%s><td><b>titledb recs scanned "
|
||||
"<tr bgcolor=#%s><td><b>titledb recs scanned "
|
||||
"progress</b></td>"
|
||||
"<td>%.2f%%</td></tr>\n"
|
||||
|
||||
// title recs set errors, parsing errors, etc.
|
||||
//"<tr bgcolor=%s><td><b>title recs injected</b></td>"
|
||||
//"<tr bgcolor=#%s><td><b>title recs injected</b></td>"
|
||||
//"<td>%lli</td></tr>\n"
|
||||
|
||||
// title recs set errors, parsing errors, etc.
|
||||
"<tr bgcolor=%s><td><b>titledb rec error count</b></td>"
|
||||
"<tr bgcolor=#%s><td><b>titledb rec error count</b></td>"
|
||||
"<td>%lli</td></tr>\n"
|
||||
|
||||
// sub errors
|
||||
"<tr bgcolor=%s><td> key out of order</b></td>"
|
||||
"<tr bgcolor=#%s><td> key out of order</b></td>"
|
||||
"<td>%lli</td></tr>\n"
|
||||
"<tr bgcolor=%s><td> set errors</b></td>"
|
||||
"<tr bgcolor=#%s><td> set errors</b></td>"
|
||||
"<td>%lli</td></tr>\n"
|
||||
"<tr bgcolor=%s><td> corrupt errors</b></td>"
|
||||
"<tr bgcolor=#%s><td> corrupt errors</b></td>"
|
||||
"<td>%lli</td></tr>\n"
|
||||
"<tr bgcolor=%s><td> xml errors</b></td>"
|
||||
"<tr bgcolor=#%s><td> xml errors</b></td>"
|
||||
"<td>%lli</td></tr>\n"
|
||||
"<tr bgcolor=%s><td> dup docid errors</b></td>"
|
||||
"<tr bgcolor=#%s><td> dup docid errors</b></td>"
|
||||
"<td>%lli</td></tr>\n"
|
||||
"<tr bgcolor=%s><td> negative keys</b></td>"
|
||||
"<tr bgcolor=#%s><td> negative keys</b></td>"
|
||||
"<td>%lli</td></tr>\n"
|
||||
//"<tr bgcolor=%s><td> overwritten recs</b></td>"
|
||||
//"<tr bgcolor=#%s><td> overwritten recs</b></td>"
|
||||
//"<td>%lli</td></tr>\n"
|
||||
"<tr bgcolor=%s><td> twin's "
|
||||
"<tr bgcolor=#%s><td> twin's "
|
||||
"respsponsibility</b></td>"
|
||||
"<td>%lli</td></tr>\n"
|
||||
|
||||
"<tr bgcolor=%s><td> wrong shard</b></td>"
|
||||
"<tr bgcolor=#%s><td> wrong shard</b></td>"
|
||||
"<td>%lli</td></tr>\n"
|
||||
|
||||
"<tr bgcolor=%s><td> root urls</b></td>"
|
||||
"<tr bgcolor=#%s><td> root urls</b></td>"
|
||||
"<td>%lli</td></tr>\n"
|
||||
"<tr bgcolor=%s><td> non-root urls</b></td>"
|
||||
"<tr bgcolor=#%s><td> non-root urls</b></td>"
|
||||
"<td>%lli</td></tr>\n"
|
||||
|
||||
"<tr bgcolor=%s><td> no title rec</b></td>"
|
||||
"<tr bgcolor=#%s><td> no title rec</b></td>"
|
||||
"<td>%lli</td></tr>\n"
|
||||
|
||||
//"<tr><td><b> Other errors</b></td>"
|
||||
@ -2337,49 +2374,7 @@ bool Repair::printRepairStatus ( SafeBuf *sb , long fromIp ) {
|
||||
//"<tr><td><b>Time Left in Phase %li</b></td>"
|
||||
//"<td>%.2f hrs</td></tr>\n"
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// spider recs done
|
||||
"<tr><td><b>spider recs scanned</b></td>"
|
||||
"<td>%lli of %lli</td></tr>\n"
|
||||
|
||||
// percent complete
|
||||
"<tr><td><b>spider recs scanned progress</b></td>"
|
||||
"<td>%.2f%%</td></tr>\n"
|
||||
|
||||
// spider recs set errors, parsing errors, etc.
|
||||
"<tr><td><b>spider rec not assigned to us</b></td>"
|
||||
"<td>%li</td></tr>\n"
|
||||
|
||||
// spider recs set errors, parsing errors, etc.
|
||||
"<tr><td><b>spider rec errors</b></td>"
|
||||
"<td>%lli</td></tr>\n"
|
||||
|
||||
// spider recs set errors, parsing errors, etc.
|
||||
"<tr><td><b>spider rec bad tld</b></td>"
|
||||
"<td>%li</td></tr>\n"
|
||||
|
||||
// time left in hours
|
||||
//"<tr><td><b>Time Left in Phase %li</b></td>"
|
||||
//"<td>%.2f hrs</td></tr>\n"
|
||||
|
||||
|
||||
,
|
||||
LIGHT_BLUE ,
|
||||
DARK_BLUE ,
|
||||
status ,
|
||||
|
||||
(long)g_repairMode,
|
||||
(long)g_pingServer.m_minRepairMode,
|
||||
minIpBuf, // ip string
|
||||
minPort, // port
|
||||
(long)minHostId,
|
||||
|
||||
oldColl ,
|
||||
newColl ,
|
||||
|
||||
DARK_BLUE,
|
||||
ns ,
|
||||
nr ,
|
||||
@ -2415,13 +2410,49 @@ bool Repair::printRepairStatus ( SafeBuf *sb , long fromIp ) {
|
||||
m_recsNonRoot ,
|
||||
|
||||
DARK_BLUE,
|
||||
m_noTitleRecs,
|
||||
m_noTitleRecs
|
||||
);
|
||||
|
||||
|
||||
sb->safePrintf(
|
||||
// spider recs done
|
||||
"<tr bgcolor=#%s><td><b>spider recs scanned</b></td>"
|
||||
"<td>%lli of %lli</td></tr>\n"
|
||||
|
||||
// percent complete
|
||||
"<tr bgcolor=#%s><td><b>spider recs scanned "
|
||||
"progress</b></td>"
|
||||
"<td>%.2f%%</td></tr>\n"
|
||||
|
||||
// spider recs set errors, parsing errors, etc.
|
||||
"<tr bgcolor=#%s><td><b>spider rec not "
|
||||
"assigned to us</b></td>"
|
||||
"<td>%li</td></tr>\n"
|
||||
|
||||
// spider recs set errors, parsing errors, etc.
|
||||
"<tr bgcolor=#%s><td><b>spider rec errors</b></td>"
|
||||
"<td>%lli</td></tr>\n"
|
||||
|
||||
// spider recs set errors, parsing errors, etc.
|
||||
"<tr bgcolor=#%s><td><b>spider rec bad tld</b></td>"
|
||||
"<td>%li</td></tr>\n"
|
||||
|
||||
// time left in hours
|
||||
//"<tr bgcolor=#%s><td><b>"
|
||||
//"Time Left in Phase %li</b></td>"
|
||||
//"<td>%.2f hrs</td></tr>\n"
|
||||
|
||||
,
|
||||
LIGHT_BLUE ,
|
||||
ns2 ,
|
||||
nr2 ,
|
||||
LIGHT_BLUE ,
|
||||
ratio2 ,
|
||||
LIGHT_BLUE ,
|
||||
m_spiderRecNotAssigned ,
|
||||
LIGHT_BLUE ,
|
||||
errors2,
|
||||
LIGHT_BLUE ,
|
||||
m_spiderRecBadTLD
|
||||
);
|
||||
|
||||
@ -2439,7 +2470,7 @@ bool Repair::printRepairStatus ( SafeBuf *sb , long fromIp ) {
|
||||
// m_dbname will be 0
|
||||
if ( tr == 0 ) continue;
|
||||
sb->safePrintf(
|
||||
"<tr bgcolor=%s><td><b>%s2 recs</b></td>"
|
||||
"<tr bgcolor=#%s><td><b>%s2 recs</b></td>"
|
||||
"<td>%lli</td></tr>\n" ,
|
||||
bg,
|
||||
rdb->m_dbname,
|
||||
@ -2495,81 +2526,94 @@ bool Repair::printRepairStatus ( SafeBuf *sb , long fromIp ) {
|
||||
|
||||
sb->safePrintf (
|
||||
|
||||
"<table width=100%% bgcolor=#%s cellpadding=4 "
|
||||
"border=1 id=\"repairstatustable2\">"
|
||||
"<table %s "
|
||||
"id=\"repairstatustable2\">"
|
||||
|
||||
// current collection being repaired
|
||||
"<tr><td bgcolor=%s colspan=2><b><center>"
|
||||
"<tr class=hdrow><td colspan=2><b><center>"
|
||||
"Repair Settings In Use</center></b></td></tr>"
|
||||
|
||||
// . print parms for this repair
|
||||
// . they may differ than current controls because
|
||||
// the current controls were changed after the
|
||||
// repair started
|
||||
"<tr><td width=50%%><b>full rebuild</b></td>"
|
||||
"<tr bgcolor=#%s>"
|
||||
"<td width=50%%><b>full rebuild</b></td>"
|
||||
"<td>%s</td></tr>\n"
|
||||
|
||||
//"<tr><td><b>recycle link info</b></td>"
|
||||
//"<tr bgcolor=#%s><td><b>recycle link info</b></td>"
|
||||
//"<td>%s</td></tr>\n"
|
||||
|
||||
"<tr><td><b>rebuild titledb</b></td>"
|
||||
"<tr bgcolor=#%s><td><b>rebuild titledb</b></td>"
|
||||
"<td>%s</td></tr>\n"
|
||||
|
||||
//"<tr><td><b>rebuild tfndb</b></td>"
|
||||
//"<tr bgcolor=#%s><td><b>rebuild tfndb</b></td>"
|
||||
//"<td>%s</td></tr>\n"
|
||||
|
||||
//"<tr><td><b>rebuild indexdb</b></td>"
|
||||
//"<tr bgcolor=#%s><td><b>rebuild indexdb</b></td>"
|
||||
//"<td>%s</td></tr>\n"
|
||||
|
||||
"<tr><td><b>rebuild posdb</b></td>"
|
||||
"<tr bgcolor=#%s><td><b>rebuild posdb</b></td>"
|
||||
"<td>%s</td></tr>\n"
|
||||
|
||||
//"<tr><td><b>rebuild datedb</b></td>"
|
||||
//"<tr bgcolor=#%s><td><b>rebuild datedb</b></td>"
|
||||
//"<td>%s</td></tr>\n"
|
||||
|
||||
"<tr><td><b>rebuild clusterdb</b></td>"
|
||||
"<tr bgcolor=#%s><td><b>rebuild clusterdb</b></td>"
|
||||
"<td>%s</td></tr>\n"
|
||||
|
||||
//"<tr><td><b>rebuild checksumdb</b></td>"
|
||||
//"<tr bgcolor=#%s><td><b>rebuild checksumdb</b></td>"
|
||||
//"<td>%s</td></tr>\n"
|
||||
|
||||
"<tr><td><b>rebuild spiderdb</b></td>"
|
||||
"<tr bgcolor=#%s><td><b>rebuild spiderdb</b></td>"
|
||||
"<td>%s</td></tr>\n"
|
||||
|
||||
"<tr><td><b>rebuild linkdb</b></td>"
|
||||
"<tr bgcolor=#%s><td><b>rebuild linkdb</b></td>"
|
||||
"<td>%s</td></tr>\n"
|
||||
|
||||
//"<tr><td><b>rebuild tagdb</b></td>"
|
||||
//"<tr bgcolor=#%s><td><b>rebuild tagdb</b></td>"
|
||||
//"<td>%s</td></tr>\n"
|
||||
//"<tr><td><b>rebuild placedb</b></td>"
|
||||
//"<tr bgcolor=#%s><td><b>rebuild placedb</b></td>"
|
||||
//"<td>%s</td></tr>\n"
|
||||
//"<tr><td><b>rebuild sectiondb</b></td>"
|
||||
//"<tr bgcolor=#%s><td><b>rebuild sectiondb</b></td>"
|
||||
//"<td>%s</td></tr>\n"
|
||||
//"<tr><td><b>rebuild revdb</b></td>"
|
||||
//"<tr bgcolor=#%s><td><b>rebuild revdb</b></td>"
|
||||
//"<td>%s</td></tr>\n"
|
||||
|
||||
|
||||
"<tr><td><b>rebuild root urls</b></td>"
|
||||
"<tr bgcolor=#%s><td><b>rebuild root urls</b></td>"
|
||||
"<td>%s</td></tr>\n"
|
||||
|
||||
"<tr><td><b>rebuild non-root urls</b></td>"
|
||||
"<tr bgcolor=#%s>"
|
||||
"<td><b>rebuild non-root urls</b></td>"
|
||||
"<td>%s</td></tr>\n"
|
||||
|
||||
"</table>\n"
|
||||
"<br>\n"
|
||||
,
|
||||
TABLE_STYLE,
|
||||
|
||||
LIGHT_BLUE,
|
||||
DARK_BLUE,
|
||||
rr[0],
|
||||
//rr[10],
|
||||
|
||||
LIGHT_BLUE,
|
||||
rr[1],
|
||||
//rr[2],
|
||||
|
||||
LIGHT_BLUE,
|
||||
rr[3],
|
||||
//rr[4],
|
||||
|
||||
LIGHT_BLUE,
|
||||
rr[5],
|
||||
//rr[6],
|
||||
|
||||
LIGHT_BLUE,
|
||||
rr[7],
|
||||
//rr[8],
|
||||
|
||||
LIGHT_BLUE,
|
||||
rr[9],
|
||||
|
||||
//rr[13],
|
||||
@ -2578,7 +2622,10 @@ bool Repair::printRepairStatus ( SafeBuf *sb , long fromIp ) {
|
||||
//rr[16],
|
||||
//rr[17],
|
||||
|
||||
LIGHT_BLUE,
|
||||
rr[11],
|
||||
|
||||
LIGHT_BLUE,
|
||||
rr[12]
|
||||
);
|
||||
return true;
|
||||
|
@ -67,7 +67,7 @@ bool Revdb::init2 ( long treeMem ) {
return false;
return true;
}

/*
bool Revdb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;
@ -79,7 +79,7 @@ bool Revdb::addColl ( char *coll, bool doVerify ) {
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}

*/
bool Revdb::verify ( char *coll ) {
log ( LOG_INFO, "db: Verifying Revdb for coll %s...", coll );
g_threads.disableThreads();
@ -1761,7 +1761,12 @@ Tag *SafeBuf::addTag ( char *mysite ,
bool SafeBuf::addTag ( Tag *tag ) {
long recSize = tag->getSize();
//tag->setDataSize();
if ( tag->m_recDataSize <= 16 ) { char *xx=NULL;*xx=0; }
if ( tag->m_recDataSize <= 16 ) {
// note it
return log("safebuf: encountered corrupted tag datasize=%li.",
tag->m_recDataSize);
//char *xx=NULL;*xx=0; }
}
return safeMemcpy ( (char *)tag , recSize );
}

@ -2703,6 +2708,7 @@ bool SafeBuf::decodeJSONToUtf8 ( long niceness ) {
// diffbot
// . really we could leave the newlines decoded etc, but it is prettier
// for printing
/*
bool SafeBuf::safeStrcpyPrettyJSON ( char *decodedJson ) {
// how much space do we need?
// each single byte \t char for instance will need 2 bytes
@ -2762,6 +2768,7 @@ bool SafeBuf::safeStrcpyPrettyJSON ( char *decodedJson ) {

return true;
}
*/

bool SafeBuf::safeUtf8ToJSON ( char *utf8 ) {
@ -104,7 +104,7 @@ struct SafeBuf {
bool safeMemcpy(SafeBuf *c){return safeMemcpy(c->m_buf,c->m_length);};
bool safeMemcpy ( class Words *w , long a , long b ) ;
bool safeStrcpy ( char *s ) ;
bool safeStrcpyPrettyJSON ( char *decodedJson ) ;
//bool safeStrcpyPrettyJSON ( char *decodedJson ) ;
bool safeUtf8ToJSON ( char *utf8 ) ;

bool csvEncode ( char *s , long len , long niceness = 0 );
@ -224,8 +224,10 @@ class SearchInput {
//long m_formatStrLen;
//char *m_formatStr;

char m_formatTmp[11];

// can be 0 for FORMAT_HTML, 1 = FORMAT_XML, 2=FORMAT_JSON, 3=csv
char m_format;
long m_format;

// this should be part of the key because it will affect the results!
char m_queryExpansion;
@ -17238,7 +17238,7 @@ bool Sectiondb::init2 ( long treeMem ) {
return false;
return true;
}

/*
bool Sectiondb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;
@ -17250,7 +17250,7 @@ bool Sectiondb::addColl ( char *coll, bool doVerify ) {
log ( "db: sectiondb verify failed, but scaling is allowed, passing.");
return true;
}

*/
bool Sectiondb::verify ( char *coll ) {
log ( LOG_INFO, "db: Verifying Sectiondb for coll %s...", coll );
g_threads.disableThreads();

1839
Spider.cpp
File diff suppressed because it is too large

110
Spider.h
@ -1,11 +1,5 @@
// Matt Wells, copyright Nov 2002

//
// . Spider.h/.cpp contains all the code related to spider scheduling
// . Spiderdb holds the SpiderRecs which indicate the time to spider a url
// . there are 2 types of SpiderRecs: SpiderRequest and SpiderReply recs
//

#ifndef _SPIDER_H_
#define _SPIDER_H_

@ -45,6 +39,7 @@
#define SP_ADMIN_PAUSED 8 // g_conf.m_spideringEnabled = false
#define SP_COMPLETED 9 // crawl is done, and no repeatCrawl is scheduled

void spiderRoundIncremented ( class CollectionRec *cr ) ;
bool testPatterns ( ) ;
bool doesStringContainPattern ( char *content , char *pattern ) ;

@ -57,6 +52,29 @@ bool getSpiderStatusMsg ( class CollectionRec *cx ,
// this new spider algorithm ensures that urls get spidered even if a host
// is dead. and even if the url was being spidered by a host that suddenly went
// dead.
//
// . Spider.h/.cpp contains all the code related to spider scheduling
// . Spiderdb holds the SpiderRecs which indicate the time to spider a url
// . there are 2 types of SpiderRecs: SpiderRequest and SpiderReply recs
//
//
// There are 3 main components to the spidering process:
// 1) spiderdb
// 2) the "waiting tree"
// 3) doledb
//
// spiderdb holds all the spiderrequests/spiderreplies sorted by
// their IP
//
// the waiting tree holds at most one entry for an IP indicating that
// we should scan all the spiderrequests/spiderreplies for that IP in
// spiderdb, find the "best" one(s) and add it (them) to doledb.
//
// doledb holds the best spiderrequests from spiderdb sorted by
// "priority". priorities range from 0 to 127, the highest priority.
// basically doledb holds the urls that are ready for spidering now.

// Spiderdb
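The comment block above describes the spiderdb, waiting-tree, doledb pipeline. A simplified, illustrative sketch of that flow (not code from this commit; the containers and types are hypothetical stand-ins) showing how a due waiting-tree entry triggers a per-IP scan whose winner is doled out by priority, 0 to 127 with higher winning:

#include <cstdint>
#include <map>
#include <queue>
#include <utility>
#include <vector>

struct SpiderReq { uint32_t firstIp; int priority; };

struct SpiderPipeline {
	// spiderdb: all requests, grouped here by first IP
	std::map<uint32_t, std::vector<SpiderReq> > spiderdb;
	// waiting tree: at most one "scan this IP at time T" entry per IP
	std::map<uint32_t, uint64_t> waitingTree;
	// doledb: requests ready to spider now, best priority first
	std::priority_queue<std::pair<int, const SpiderReq *> > doledb;

	void populateDoledbFromWaitingTree ( uint64_t nowMs ) {
		std::map<uint32_t, uint64_t>::iterator it;
		for ( it = waitingTree.begin(); it != waitingTree.end(); ++it ) {
			if ( it->second > nowMs ) continue; // not due yet
			// scan this IP's requests and pick the best one
			std::vector<SpiderReq> &reqs = spiderdb[it->first];
			const SpiderReq *best = 0;
			for ( unsigned long i = 0; i < reqs.size(); i++ )
				if ( ! best || reqs[i].priority > best->priority )
					best = &reqs[i];
			if ( best )
				doledb.push(std::make_pair(best->priority, best));
		}
	}
};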
@ -242,10 +260,10 @@ bool getSpiderStatusMsg ( class CollectionRec *cx ,
|
||||
// can spider any request/url in doledb provided they get the lock.
|
||||
|
||||
|
||||
// scanSpiderdb()
|
||||
// evalIpLoop()
|
||||
//
|
||||
// The waiting tree is populated at startup by scanning spiderdb (see
|
||||
// SpiderColl::scanSpiderdb()), which might take a while to complete,
|
||||
// SpiderColl::evalIpLoop()), which might take a while to complete,
|
||||
// so it is running in the background while the gb server is up. it will
|
||||
// log "10836674298 spiderdb bytes scanned for waiting tree re-population"
|
||||
// periodically in the log as it tries to do a complete spiderdb scan
|
||||
@ -255,7 +273,7 @@ bool getSpiderStatusMsg ( class CollectionRec *cx ,
|
||||
// It will also perform a background scan if the admin changes the url
|
||||
// filters table, which dictates that we recompute everything.
|
||||
//
|
||||
// scanSpiderdb() will recompute the "url filter number" (matching row)
|
||||
// evalIpLoop() will recompute the "url filter number" (matching row)
|
||||
// in the url filters table for each url in each SpiderRequest it reads.
|
||||
// it will ignore spider requests whose urls
|
||||
// are "filtered" or "banned". otherwise they will have a spider priority >= 0.
|
||||
@ -270,18 +288,18 @@ bool getSpiderStatusMsg ( class CollectionRec *cx ,
|
||||
// by preferring those with the highest priority. Tied spider priorities
|
||||
// should be resolved by minimum hopCount probably.
|
||||
//
|
||||
// If the spidertime of the URL is overdue then scanSpiderdb() will NOT add
|
||||
// If the spidertime of the URL is overdue then evalIpLoop() will NOT add
|
||||
// it to waiting tree, but will add it to doledb directly to make it available
|
||||
// for spidering immediately. It calls m_msg4.addMetaList() to add it to
|
||||
// doledb on all hosts in its group (shard). It uses s_ufnTree for keeping
|
||||
// track of the best urls to spider for a given IP/spiderPriority.
|
||||
//
|
||||
// scanSpiderdb() can also be called with its m_nextKey/m_endKey limited
|
||||
// evalIpLoop() can also be called with its m_nextKey/m_endKey limited
|
||||
// to just scan the SpiderRequests for a specific IP address. It does
|
||||
// this after adding a SpiderReply. addSpiderReply() calls addToWaitingTree()
|
||||
// with the "0" time entry, and addToWaitingTree() calls
|
||||
// populateDoledbFromWaitingTree() which will see that "0" entry and call
|
||||
// scanSpiderdb(true) after setting m_nextKey/m_endKey for that IP.
|
||||
// evalIpLoop(true) after setting m_nextKey/m_endKey for that IP.
|
||||
|
||||
|
||||
|
||||
@ -289,7 +307,7 @@ bool getSpiderStatusMsg ( class CollectionRec *cx ,
|
||||
//
|
||||
// SpiderColl::populateDoledbFromWaitingTree() scans the waiting tree for
|
||||
// entries whose spider time is due. so it gets the IP address and spider
|
||||
// priority from the waiting tree. but then it calls scanSpiderdb()
|
||||
// priority from the waiting tree. but then it calls evalIpLoop()
|
||||
// restricted to that IP (using m_nextKey,m_endKey) to get the best
|
||||
// SpiderRequest from spiderdb for that IP to add to doledb for immediate
|
||||
// spidering. populateDoledbFromWaitingTree() is called a lot to try to
|
||||
@ -505,8 +523,28 @@ class SpiderRequest {
|
||||
// . this is zero if none or invalid
|
||||
long m_contentHash32;
|
||||
|
||||
/*
|
||||
char m_reserved1;
|
||||
|
||||
// the new add url control will allow user to control link spidering
|
||||
// on each url they add. they can also specify file:// instead of
|
||||
// http:// to index local files. so we have to allow file://
|
||||
char m_onlyAddSameDomainLinks :1;
|
||||
char m_onlyAddSameSubdomainLinks :1;
|
||||
char m_onlyDoNotAddLinksLinks :1; // max hopcount 1
|
||||
char m_onlyDoNotAddLinksLinksLinks :1; // max hopcount 2
|
||||
char m_reserved2d:1;
|
||||
char m_reserved2e:1;
|
||||
char m_reserved2f:1;
|
||||
char m_reserved2g:1;
|
||||
char m_reserved2h:1;
|
||||
|
||||
|
||||
// . each request can have a different hop count
|
||||
// . this is only valid if m_hopCountValid is true!
|
||||
short m_hopCount;
|
||||
*/
|
||||
|
||||
long m_hopCount;
|
||||
|
||||
// . this is now computed dynamically often based on the latest
|
||||
@ -711,16 +749,17 @@ class SpiderRequest {
|
||||
long print( class SafeBuf *sb );
|
||||
|
||||
long printToTable ( SafeBuf *sb , char *status ,
|
||||
class XmlDoc *xd ) ;
|
||||
class XmlDoc *xd , long row ) ;
|
||||
// for diffbot...
|
||||
long printToTableSimple ( SafeBuf *sb , char *status ,
|
||||
class XmlDoc *xd ) ;
|
||||
class XmlDoc *xd , long row ) ;
|
||||
static long printTableHeader ( SafeBuf *sb , bool currentlSpidering ) ;
|
||||
static long printTableHeaderSimple ( SafeBuf *sb ,
|
||||
bool currentlSpidering ) ;
|
||||
|
||||
// returns false and sets g_errno on error
|
||||
bool setFromAddUrl ( char *url ) ;
|
||||
bool setFromInject ( char *url ) ;
|
||||
};
|
||||
|
||||
// . XmlDoc adds this record to spiderdb after attempting to spider a url
|
||||
@ -826,7 +865,11 @@ class SpiderReply {
|
||||
long m_isContacty :1;
|
||||
long m_hasAddress :1;
|
||||
long m_hasTOD :1;
|
||||
long m_hasSiteVenue :1;
|
||||
|
||||
// make this "INvalid" not valid since it was set to 0 before
|
||||
// and we want to be backwards compatible
|
||||
long m_isIndexedINValid :1;
|
||||
//long m_hasSiteVenue :1;
|
||||
|
||||
// expires after a certain time or if ownership changed
|
||||
long m_inGoogleValid :1;
|
||||
@ -835,7 +878,8 @@ class SpiderReply {
|
||||
long m_isContactyValid :1;
|
||||
long m_hasAddressValid :1;
|
||||
long m_hasTODValid :1;
|
||||
long m_hasSiteVenueValid :1;
|
||||
//long m_hasSiteVenueValid :1;
|
||||
long m_reserved2 :1;
|
||||
long m_siteNumInlinksValid :1;
|
||||
// was the request an injection request
|
||||
long m_fromInjectionRequest :1;
|
||||
@ -989,7 +1033,7 @@ class SpiderColl {
|
||||
~SpiderColl ( );
|
||||
SpiderColl ( ) ;
|
||||
|
||||
void clear();
|
||||
void clearLocks();
|
||||
|
||||
// called by main.cpp on exit to free memory
|
||||
void reset();
|
||||
@ -1028,7 +1072,8 @@ class SpiderColl {
|
||||
// for scanning the wait tree...
|
||||
bool m_isPopulating;
|
||||
// for reading from spiderdb
|
||||
bool m_isReadDone;
|
||||
//bool m_isReadDone;
|
||||
bool m_didRead;
|
||||
|
||||
Msg4 m_msg4;
|
||||
Msg1 m_msg1;
|
||||
@ -1111,7 +1156,8 @@ class SpiderColl {
|
||||
|
||||
bool m_countingPagesIndexed;
|
||||
HashTableX m_localTable;
|
||||
long long m_lastReqUh48;
|
||||
long long m_lastReqUh48a;
|
||||
long long m_lastReqUh48b;
|
||||
long long m_lastRepUh48;
|
||||
// move to CollectionRec so it can load at startup and save it
|
||||
//HashTableX m_pageCountTable;
|
||||
@ -1127,8 +1173,17 @@ class SpiderColl {
|
||||
bool addToWaitingTree ( uint64_t spiderTime , long firstIp ,
|
||||
bool callForScan );
|
||||
long getNextIpFromWaitingTree ( );
|
||||
void populateDoledbFromWaitingTree ( bool reentry );
|
||||
bool scanSpiderdb ( bool needList );
|
||||
void populateDoledbFromWaitingTree ( );
|
||||
|
||||
//bool scanSpiderdb ( bool needList );
|
||||
|
||||
|
||||
// broke up scanSpiderdb into simpler functions:
|
||||
bool evalIpLoop ( ) ;
|
||||
bool readListFromSpiderdb ( ) ;
|
||||
bool scanListForWinners ( ) ;
|
||||
bool addWinnerToDoledb ( ) ;
|
||||
|
||||
|
||||
void populateWaitingTreeFromSpiderdb ( bool reentry ) ;
|
||||
|
||||
@ -1138,7 +1193,11 @@ class SpiderColl {
|
||||
key_t m_waitingTreeKey;
|
||||
bool m_waitingTreeKeyValid;
|
||||
long m_scanningIp;
|
||||
bool m_gotNewRequestsForScanningIp;
|
||||
long m_gotNewDataForScanningIp;
|
||||
long m_lastListSize;
|
||||
long m_lastScanningIp;
|
||||
|
||||
char m_deleteMyself;
|
||||
|
||||
// start key for reading doledb
|
||||
key_t m_msg5StartKey;
|
||||
@ -1292,7 +1351,7 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) ;
|
||||
|
||||
// . max spiders we can have going at once for this process
|
||||
// . limit to 70 to preven OOM conditions
|
||||
#define MAX_SPIDERS 70
|
||||
#define MAX_SPIDERS 100
|
||||
|
||||
class SpiderLoop {
|
||||
|
||||
@ -1412,6 +1471,7 @@ long getUrlFilterNum ( class SpiderRequest *sreq ,
|
||||
bool isForMsg20 ,
|
||||
long niceness ,
|
||||
class CollectionRec *cr ,
|
||||
bool isOutlink = false ) ;
|
||||
bool isOutlink , // = false ,
|
||||
HashTableX *quotaTable );//= NULL ) ;
|
||||
|
||||
#endif
|
||||
|
12
Stats.cpp
12
Stats.cpp
@ -557,6 +557,11 @@ void Stats::printGraphInHtml ( SafeBuf &sb ) {
|
||||
//
|
||||
sb.safePrintf("<div style=\"position:relative;"
|
||||
"background-color:#c0c0c0;"
|
||||
|
||||
// match style of tables
|
||||
"border-radius:10px;"
|
||||
"border:#6060f0 2px solid;"
|
||||
|
||||
//"overflow-y:hidden;"
|
||||
"overflow-x:hidden;"
|
||||
"z-index:-10;"
|
||||
@ -567,10 +572,11 @@ void Stats::printGraphInHtml ( SafeBuf &sb ) {
|
||||
"min-height:%lipx;"
|
||||
//"width:100%%;"
|
||||
//"min-height:600px;"
|
||||
"margin-top:10px;"
|
||||
//"margin-top:10px;"
|
||||
"margin-bottom:10px;"
|
||||
"margin-right:10px;"
|
||||
"margin-left:10px;\">"
|
||||
//"margin-right:10px;"
|
||||
//"margin-left:10px;"
|
||||
"\">"
|
||||
,(long)DX
|
||||
,(long)DY +20); // add 10 more for "2s" labels etc.
|
||||
|
||||
|
5
Stats.h
5
Stats.h
@ -25,9 +25,10 @@ class StatPoint {
|
||||
|
||||
#define MAX_POINTS 6000
|
||||
#define MAX_WIDTH 6
|
||||
#define DY 1000 // pixels vertical
|
||||
//#define DY 1000 // pixels vertical
|
||||
#define DY 500 // pixels vertical
|
||||
#define DX 1000 // pixels across
|
||||
#define DT (20*1000) // time window, 20 seconds
|
||||
#define DT (10*1000) // time window, 10 seconds
|
||||
#define MAX_LINES (DY / (MAX_WIDTH+1)) // leave free pixel above each line
|
||||
|
||||
#define STAT_GENERIC 0
|
||||
|
13
Statsdb.cpp
13
Statsdb.cpp
@ -209,17 +209,17 @@ bool Statsdb::init ( ) {
|
||||
// will init the CollectionRec::m_rdbBase, which is what
|
||||
// Rdb::getBase(collnum_t) will return. however, for collectionless
|
||||
// rdb databases we set Rdb::m_collectionlessBase special here.
|
||||
return m_rdb.addColl ( NULL );
|
||||
return m_rdb.addRdbBase1 ( NULL );
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Make sure we need this function.
|
||||
// main.cpp currently uses the addColl from m_rdb
|
||||
bool Statsdb::addColl ( char *coll, bool doVerify ) {
|
||||
if ( ! m_rdb.addColl ( coll ) ) return false;
|
||||
return true;
|
||||
}
|
||||
//bool Statsdb::addColl ( char *coll, bool doVerify ) {
|
||||
// if ( ! m_rdb.addColl ( coll ) ) return false;
|
||||
// return true;
|
||||
//}
|
||||
|
||||
void flushStatsWrapper ( int fd , void *state ) {
|
||||
g_statsdb.addDocsIndexed();
|
||||
@ -532,7 +532,8 @@ bool Statsdb::makeGIF ( long t1Arg ,
|
||||
|
||||
#define MAX_POINTS 6000
|
||||
#define MAX_WIDTH 6
|
||||
#define DY2 600 // pixels vertical
|
||||
//#define DY2 600 // pixels vertical
|
||||
#define DY2 400 // pixels vertical
|
||||
#define DX2 1000 // pixels across
|
||||
#define MAX_LINES2 (DY2 / (MAX_WIDTH+1)) // leave free pixel above each line
|
||||
|
||||
|
@ -992,7 +992,7 @@ bool Syncdb::init ( ) {
|
||||
// clear it all!
|
||||
m_qt.clear();
|
||||
// add the base since it is a collectionless rdb
|
||||
return m_rdb.addColl ( NULL );
|
||||
return m_rdb.addRdbBase1 ( NULL );
|
||||
}
|
||||
|
||||
// . save our crap
|
||||
@ -1432,9 +1432,10 @@ void Syncdb::syncDone ( ) {
|
||||
m_rcpStarted = false;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
// TODO: Provide verification.
|
||||
bool Syncdb::addColl ( char *coll, bool doVerify ) {
|
||||
if ( ! m_rdb.addColl ( coll ) ) return false;
|
||||
return true;
|
||||
}
|
||||
*/
|
||||
|
148
Tagdb.cpp
148
Tagdb.cpp
@ -1854,7 +1854,7 @@ bool Tagdb::init2 ( long treeMem ) {
|
||||
false ); // bias disk page cache?
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
bool Tagdb::addColl ( char *coll, bool doVerify ) {
|
||||
if ( ! m_rdb.addColl ( coll ) ) return false;
|
||||
if ( ! doVerify ) return true;//false;
|
||||
@ -1867,7 +1867,7 @@ bool Tagdb::addColl ( char *coll, bool doVerify ) {
|
||||
//return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
|
||||
bool Tagdb::verify ( char *coll ) {
|
||||
@ -2761,14 +2761,16 @@ bool Msg8a::launchGetRequests ( ) {
|
||||
long shardNum = getShardNum ( m_rdbId , &startKey );//, true );
|
||||
Host *group = g_hostdb.getShard ( shardNum );
|
||||
|
||||
long numTwins = g_hostdb.getNumHostsPerShard();
|
||||
//long numTwins = g_hostdb.getNumHostsPerShard();
|
||||
// use top byte!
|
||||
uint8_t *sks = (uint8_t *)&startKey;
|
||||
uint8_t top = sks[sizeof(TAGDB_KEY)-1];
|
||||
long hostNum = 0;
|
||||
if ( numTwins == 2 && (top & 0x80) ) hostNum = 1;
|
||||
//long hostNum = 0;
|
||||
//if ( numTwins == 2 && (top & 0x80) ) hostNum = 1;
|
||||
// TODO: fix this!
|
||||
if ( numTwins >= 3 ) { char *xx=NULL;*xx=0; }
|
||||
//if ( numTwins >= 3 ) { char *xx=NULL;*xx=0; }
|
||||
// support more than 2 stripes now...
|
||||
long hostNum = top % g_hostdb.getNumHostsPerShard();
|
||||
long hostId = group[hostNum].m_hostId;
|
||||
|
||||
|
||||
@ -4440,27 +4442,62 @@ bool sendReply2 ( void *state ) {
|
||||
char bb [ MAX_COLL_LEN + 60 ];
|
||||
bb[0]='\0';
|
||||
|
||||
sb.safePrintf(
|
||||
"<style>"
|
||||
".poo { background-color:#%s;}\n"
|
||||
"</style>\n" ,
|
||||
LIGHT_BLUE );
|
||||
|
||||
// print interface to add sites
|
||||
sb.safePrintf (
|
||||
"<table width=100%% bgcolor=#%s border=1 cellpadding=4>"
|
||||
"<tr><td bgcolor=#%s colspan=21>"
|
||||
"<center><font size=+1><b>Tagdb</b>%s</font></center>"
|
||||
"</td></tr>", LIGHT_BLUE , DARK_BLUE , bb );
|
||||
"<table %s>"
|
||||
"<tr><td colspan=2>"
|
||||
"<center><b>Tagdb</b>%s</center>"
|
||||
"</td></tr>", TABLE_STYLE , bb );
|
||||
|
||||
// sometimes we add a huge # of urls, so don't display them because
|
||||
// it like freezes the silly browser
|
||||
char *uu = st->m_urls;
|
||||
if ( st->m_urlsLen > 100000 ) uu = "";
|
||||
|
||||
sb.safePrintf ( "<tr><td colspan=21>");
|
||||
|
||||
//sb.safePrintf ( "<tr bgcolor=#%s><td colspan=2>"
|
||||
// "<center>"
|
||||
// "</center>"
|
||||
// "</td></tr>",
|
||||
// DARK_BLUE);
|
||||
|
||||
|
||||
sb.safePrintf ( "<tr class=poo><td>"
|
||||
"<b>urls</b>"
|
||||
"<br>"
|
||||
|
||||
"<font size=-2>"
|
||||
"Enter a single URL and then click <i>Get Tags</i> to "
|
||||
"get back its tags. Enter multiple URLs and select "
|
||||
"the tags names and values in the other table "
|
||||
"below in order to tag "
|
||||
"them all with those tags when you click "
|
||||
"<i>Add Tags</i>. "
|
||||
"On the command line you can also issue a "
|
||||
"<i>./gb 0 dump S main 0 -1 1</i>"
|
||||
"command, for instance, to dump out the tagdb "
|
||||
"contents for the <i>main</i> collection on "
|
||||
"<i>host #0</i>. "
|
||||
"</font>"
|
||||
|
||||
|
||||
"</td>");
|
||||
|
||||
// text area for adding space separated sites/urls
|
||||
//char *pp = "put sites here";
|
||||
//char *pp = "";
|
||||
//if ( st->m_bufLen > 0 ) pp = st->m_buf; // no, print out "urls"
|
||||
sb.safePrintf ("<center>"
|
||||
sb.safePrintf (""
|
||||
"<td width=70%%>"
|
||||
"<br>"
|
||||
"<textarea rows=16 cols=64 name=u>"
|
||||
"%s</textarea><br><br>" , uu );
|
||||
"%s</textarea></td></tr>" , uu );
|
||||
|
||||
// spam assassins should not use this much power, too risky
|
||||
//if ( st->m_isAdmin ) {
|
||||
@ -4470,30 +4507,61 @@ bool sendReply2 ( void *state ) {
|
||||
|
||||
// allow filename to load them from
|
||||
//if ( st->m_isAdmin ) {
|
||||
sb.safePrintf("or specify a file of them: <input name=ufu "
|
||||
"type=text size=40><br>"
|
||||
"<i>file can also be dumped output of "
|
||||
"tagdb from the <b>gb dump S ...</b> "
|
||||
"command.</i>"
|
||||
"<br><br>" );
|
||||
sb.safePrintf("<tr class=poo>"
|
||||
"<td>"
|
||||
"<b>file of urls to tag</b>"
|
||||
"<br>"
|
||||
"<font size=-2>"
|
||||
"If provided, Gigablast will read the URLs from "
|
||||
"this file as if you pasted them into the text "
|
||||
"area above. The text area will also be ignored."
|
||||
"</font>"
|
||||
"</td>"
|
||||
"<td><input name=ufu "
|
||||
"type=text size=40>"//<br>"
|
||||
//"<i>file can also be dumped output of "
|
||||
//"tagdb from the <b>gb dump S ...</b> "
|
||||
//"command.</i>"
|
||||
//"<br><br>" );
|
||||
"</td></tr>"
|
||||
);
|
||||
//}
|
||||
|
||||
// this is applied to every tag that is added for accountability
|
||||
sb.safePrintf("<br>Username: <input name=username type=text size=6 "
|
||||
"value=\"admin\"> " );//,st->m_username);
|
||||
sb.safePrintf("<tr class=poo><td>"
|
||||
"<b>username</b>"
|
||||
"<br><font size=-2>"
|
||||
"Stored with each tag you add for accountability."
|
||||
"</font>"
|
||||
"</td><td>"
|
||||
"<input name=username type=text size=6 "
|
||||
"value=\"admin\"> "
|
||||
"</td></tr>"
|
||||
);//,st->m_username);
|
||||
|
||||
// as a safety, this must be checked for any delete operation
|
||||
sb.safePrintf (" delete operation<input type=\"checkbox\" "
|
||||
"value=\"1\" name=\"delop\"><br>");
|
||||
sb.safePrintf ("<tr class=poo><td><b>delete operation</b>"
|
||||
"<br>"
|
||||
"<font size=-2>"
|
||||
|
||||
"If checked "
|
||||
"then the tag names you specify below will be "
|
||||
"deleted for the URLs you provide in the text area "
|
||||
"when you click <i>Add Tags</i>."
|
||||
"</font>"
|
||||
|
||||
|
||||
"</td><td><input type=\"checkbox\" "
|
||||
"value=\"1\" name=\"delop\"></td></tr>");
|
||||
|
||||
// close up
|
||||
sb.safePrintf ("<br><center>"
|
||||
|
||||
sb.safePrintf ("<tr bgcolor=#%s><td colspan=2>"
|
||||
"<center>"
|
||||
// this is merge all by default right now but since
|
||||
// zak is really only using eventtaghashxxxx.com we
|
||||
// should be ok
|
||||
"<input type=submit name=get "
|
||||
"value=\"get tags\" border=0>"
|
||||
"value=\"Get Tags\" border=0>"
|
||||
|
||||
//"<input type=submit name=get "
|
||||
//"value=\"get best rec\" border=0>"
|
||||
@ -4506,7 +4574,11 @@ bool sendReply2 ( void *state ) {
|
||||
|
||||
// "</form>"
|
||||
"</center>"
|
||||
"</tr>\n");
|
||||
"</td></tr></table>"
|
||||
"<br><br>"
|
||||
, DARK_BLUE
|
||||
);
|
||||
|
||||
|
||||
// . show all tags we got values for
|
||||
// . put a delete checkbox next to each one
|
||||
@ -4515,6 +4587,13 @@ bool sendReply2 ( void *state ) {
|
||||
// for some reason the "selected" option tags do not show up below
|
||||
// on firefox unless i have this line.
|
||||
|
||||
sb.safePrintf (
|
||||
"<table %s>"
|
||||
"<tr><td colspan=20>"
|
||||
"<center><b>Add Tag</b></center>"
|
||||
"</td></tr>", TABLE_STYLE );
|
||||
|
||||
|
||||
// count how many "tagRecs" we are taking tags from
|
||||
Tag *jtag = st->m_tagRec.getFirstTag();
|
||||
long numTagRecs = 0;
|
||||
@ -4532,13 +4611,14 @@ bool sendReply2 ( void *state ) {
|
||||
bool canEdit = (numTagRecs <= 1);
|
||||
|
||||
if ( ! canEdit )
|
||||
sb.safePrintf("<tr><td colspan=20><center><font color=red>"
|
||||
sb.safePrintf("<tr class=poo>"
|
||||
"<td colspan=10><center><font color=red>"
|
||||
"<b>Can not edit because more than one "
|
||||
"TagRecs were merged</b></font></center>"
|
||||
"</td></tr>\n" );
|
||||
|
||||
// headers
|
||||
sb.safePrintf("<tr bgcolor=%s>"
|
||||
sb.safePrintf("<tr bgcolor=#%s>"
|
||||
//"<td><b>delete?</b></td>"
|
||||
"<td><b>del?</b></td>"
|
||||
"<td><b>tag name</b></td>"
|
||||
@ -4574,9 +4654,9 @@ bool sendReply2 ( void *state ) {
|
||||
// if we are NULL, print out 3 empty tags
|
||||
if ( ! ctag ) empty++;
|
||||
// start the section
|
||||
sb.safePrintf("<tr bgcolor=%s>",DARK_BLUE);
|
||||
sb.safePrintf("<tr class=poo>");
|
||||
// the delete tag checkbox
|
||||
//sb.safePrintf("<tr bgcolor=%s><td>",DARK_BLUE);
|
||||
//sb.safePrintf("<tr bgcolor=#%s><td>",DARK_BLUE);
|
||||
sb.safePrintf("<td>");
|
||||
if ( ctag && canEdit ) // && tag->m_type != ST_SITE )
|
||||
sb.safePrintf("<input name=deltag%li "
|
||||
@ -4624,7 +4704,7 @@ bool sendReply2 ( void *state ) {
|
||||
// was selected will have this score
|
||||
if ( canEdit )
|
||||
sb.safePrintf("<input type=text name=tagdata%li "
|
||||
"size=70 value=\"",count);
|
||||
"size=50 value=\"",count);
|
||||
// show the value
|
||||
if ( ctag ) ctag->printDataToBuf ( &sb );
|
||||
// close up the input tag
|
||||
@ -4693,10 +4773,10 @@ bool sendReply2 ( void *state ) {
|
||||
// do not print add or del tags buttons if we got tags from more
|
||||
// than one TagRec!
|
||||
if ( canEdit )
|
||||
sb.safePrintf ("<tr bgcolor=%s><td colspan=21><center>"
|
||||
sb.safePrintf ("<tr bgcolor=#%s><td colspan=10><center>"
|
||||
|
||||
"<input type=submit name=add "
|
||||
"value=\"add tags\" border=0>"
|
||||
"value=\"Add Tags\" border=0>"
|
||||
|
||||
"</center></td>"
|
||||
"</tr>\n",DARK_BLUE);
|
||||
|
2
Tagdb.h
@ -115,7 +115,7 @@ char *getTagStrFromType ( long tagType ) ;
//#define MAX_TAGREC_SIZE 1024

// max "oustanding" msg0 requests sent by TagRec::lookup()
#define MAX_TAGDB_REQUESTS 5
#define MAX_TAGDB_REQUESTS 3

// . the latest version of the TagRec
//#define TAGREC_CURRENT_VERSION 0
@ -604,6 +604,7 @@ bool TcpServer::sendMsg ( long ip ,
|
||||
s->m_maxOtherDocLen = maxOtherDocLen ;
|
||||
s->m_ssl = NULL;
|
||||
s->m_udpSlot = NULL;
|
||||
s->m_streamingMode = false;
|
||||
// . call the connect routine to try to connect it asap
|
||||
// . this does not block however
|
||||
// . this returns false if blocked, true otherwise
|
||||
@ -694,11 +695,17 @@ bool TcpServer::sendMsg ( TcpSocket *s ,
|
||||
// . this will also unregister all our callbacks for the socket
|
||||
// . TODO: deleting nodes from under Loop::callCallbacks is dangerous!!
|
||||
if ( g_errno ) { destroySocket ( s ); return true; }
|
||||
|
||||
// if in streaming mode just return true, do not set sockState
|
||||
// to ST_NEEDS_CLOSE lest it be destroyed. streaming mode needs
|
||||
// to get more data to send on the socket.
|
||||
if ( s->m_streamingMode ) return true;
|
||||
|
||||
// reset the socket iff it was a reply that we finished writing
|
||||
// hmmm else if ( s->m_readBuf ) { recycleSocket ( s ); return true; }
|
||||
// we can't close it here any more for some reason the browser truncates
|
||||
// the content we transmit otherwise... i've tried SO_LINGER and couldnt get
|
||||
// that to work...
|
||||
// we can't close it here any more for some reason the browser truncats
|
||||
// the content we transmit otherwise... i've tried SO_LINGER and
|
||||
// couldnt get that to work...
|
||||
if ( s->m_readBuf ) { s->m_sockState = ST_NEEDS_CLOSE; return true; }
|
||||
// we're blocking on the reply (readBuf is empty)
|
||||
return false;
|
||||
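The new m_streamingMode flag is what lets one socket carry several send buffers: when a write completes and the flag is set, sendMsg() returns without marking the socket ST_NEEDS_CLOSE, so the caller can attach the next piece of data. A minimal self-contained sketch of that decision, using hypothetical stand-in types rather than the real TcpServer/TcpSocket classes:

// Minimal sketch of the streaming-mode decision; Sock and onWriteDone are
// hypothetical stand-ins, not the real gigablast code.
#include <cstdio>

enum SockState { ST_WRITING, ST_NEEDS_CLOSE };

struct Sock {
    SockState state;
    bool      streamingMode;   // set while more chunks are still coming
    Sock() : state(ST_WRITING), streamingMode(false) {}
};

// Called once a write() has drained the current send buffer. With
// streamingMode set the socket is left alone so the caller can attach the
// next chunk; otherwise it is marked for close, the pre-existing behavior.
void onWriteDone(Sock &s) {
    if (s.streamingMode) {
        std::printf("write done, streaming: keep socket open\n");
        return;
    }
    s.state = ST_NEEDS_CLOSE;
    std::printf("write done, reply complete: mark socket for close\n");
}

int main() {
    Sock s;
    s.streamingMode = true;    // more chunks to come
    onWriteDone(s);            // socket survives
    s.streamingMode = false;   // last chunk has been queued
    onWriteDone(s);            // now it gets closed
    return 0;
}

The real code makes the same check here in sendMsg() and again in writeSocketWrapper() before destroying or recycling the socket.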
@ -906,6 +913,8 @@ TcpSocket *TcpServer::wrapSocket ( int sd , long niceness , bool isIncoming ) {
s->m_lastActionTime = s->m_startTime;
// set if it's incoming connection or not
s->m_isIncoming = isIncoming;
// turn this off
s->m_streamingMode = false;
// . a 30 sec timeout, we don't want slow guys using all our sockets
// . they could easily flood us anyway though
// . we need to wait possibly a few minutes for a large inject of
@ -1434,7 +1443,7 @@ void writeSocketWrapper ( int sd , void *state ) {
// if socket has nothing to send yet cuz we're waiting, wait...
if ( s->m_sendBufUsed == 0 ) return;

sendAgain:
// sendAgain:

// . writeSocket returns false if blocked, true otherwise
// . it also sets g_errno on errro
@ -1451,13 +1460,16 @@ void writeSocketWrapper ( int sd , void *state ) {

// if callback changed socket status to ST_SEND_AGAIN
// then let's send the new buffer that it has. Diffbot.cpp uses this.
if ( s->m_sockState == ST_SEND_AGAIN ) {
s->m_sockState = ST_WRITING;
// if nothing left to send just return
if ( ! s->m_sendBuf ) return;
// otherwise send it
goto sendAgain;
}
//if ( s->m_sockState == ST_SEND_AGAIN ) {
// s->m_sockState = ST_WRITING;
// // if nothing left to send just return
// if ( ! s->m_sendBuf ) return;
// // otherwise send it
// goto sendAgain;
//}

// wait for it to exit streaming mode before destroying
if ( s->m_streamingMode ) return;

// . destroy the socket on error, recycle on transaction completion
// . this will also unregister all our callbacks for the socket
@ -1673,6 +1685,14 @@ connected:
// . calls the callback governing "s" if it has one
void TcpServer::destroySocket ( TcpSocket *s ) {
if ( ! s ) return ;

// sanity, must exit streaming mode before destruction
if ( s->m_streamingMode ) {
log("tcp: destroying socket in streaming mode. err=%s",
mstrerror(g_errno));
//char *xx=NULL;*xx=0; }
}

// sanity check
if ( s->m_udpSlot ) {
log("tcp: sending back error on udp slot err=%s",
@ -1864,6 +1884,7 @@ void TcpServer::recycleSocket ( TcpSocket *s ) {
//s->m_timeout = 60*1000;
s->m_timeout = 10*60*1000;
s->m_udpSlot = NULL;
s->m_streamingMode = false;
// keep it alive for other dialogs
s->m_sockState = ST_AVAILABLE;
s->m_startTime = gettimeofdayInMilliseconds();
@ -2097,6 +2118,7 @@ TcpSocket *TcpServer::acceptSocket ( ) {
s->m_sockState = ST_READING;
s->m_this = this;
s->m_udpSlot = NULL;
s->m_streamingMode = false;

if ( ! m_useSSL ) return s;

@ -2214,3 +2236,56 @@ void TcpServer::cancel ( void *state ) {
destroySocket ( s );
}
}

#include "SafeBuf.h"

bool TcpServer::sendChunk ( TcpSocket *s ,
SafeBuf *sb ,
void *state ,
// call this function when done sending this chunk
// so that it can read another chunk and call
// sendChunk() again.
void (* doneSendingWrapper)( void *,TcpSocket *) ,
bool lastChunk ) {

log("tcp: sending chunk of %li bytes", sb->length() );

// if socket had shit on there already, free that memory
// just like TcpServer::destroySocket would
if ( s->m_sendBuf ) {
mfree (s->m_sendBuf, s->m_sendBufSize,"TcpServer");
s->m_sendBuf = NULL;
}

// reset send stats just in case
s->m_sendOffset = 0;
s->m_totalSent = 0;
s->m_totalToSend = 0;

// let it know not to close the socket while this is set
if ( ! lastChunk ) s->m_streamingMode = true;
else s->m_streamingMode = false;

// . start the send process
// . returns false if send did not complete
// . returns true and sets g_errno on error
if ( ! sendMsg ( s ,
sb->getBufStart(), // sendBuf ,
sb->getCapacity(),//sendBufSize ,
sb->length(),//sendBufSize ,
sb->length(), // msgtotalsize
state , // data for callback
doneSendingWrapper ) ) { // callback
// do not free sendbuf we are transmitting it
sb->detachBuf();
return false;
}

// we sent without blocking
sb->detachBuf();

// a problem?
if ( g_errno ) return true;

return true;
}
@ -99,6 +99,15 @@ class TcpServer {
long maxOtherDocLen );


bool sendChunk ( class TcpSocket *s ,
class SafeBuf *sb ,
void *state ,
// call this function when done sending this chunk
// so that it can read another chunk and call
// sendChunk() again.
void (* doneSendingWrapper)( void *state,TcpSocket *),
bool lastChunk );

// . returns false if blocked, true otherwise
// . sets errno on error
// . use this for sending a msg to another host
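The declaration above spells out the contract: doneSendingWrapper is invoked once the current chunk has been written, and it is expected to queue the next chunk with another sendChunk() call, passing lastChunk=true on the final piece so streaming mode is cleared and the socket can be recycled. A minimal self-contained sketch of that loop; all types and names here are hypothetical stand-ins, not the actual gigablast API:

// Hypothetical, self-contained model of the sendChunk()/doneSendingWrapper
// hand-off. Socket and ChunkSource are stand-ins, not the gigablast classes.
#include <cstdio>
#include <string>
#include <vector>

struct Socket {
    bool streamingMode;
    Socket() : streamingMode(false) {}
};

struct ChunkSource {
    std::vector<std::string> chunks;
    size_t next;
    ChunkSource() : next(0) {}
    bool hasMore() const { return next < chunks.size(); }
    std::string take() { return chunks[next++]; }
};

// Models TcpServer::sendChunk(): "send" one buffer, then invoke the
// done-sending callback so the caller can feed the next chunk.
void sendChunk(Socket &s, const std::string &buf, void *state,
               void (*doneSendingWrapper)(void *, Socket *), bool lastChunk) {
    s.streamingMode = !lastChunk;   // keep the socket open between chunks
    std::printf("sent %zu bytes (streamingMode=%d)\n",
                buf.size(), (int)s.streamingMode);
    doneSendingWrapper(state, &s);  // ask the caller for more data
}

// The caller's callback: read the next chunk and send it, or finish up.
void doneSendingWrapper(void *state, Socket *s) {
    ChunkSource *src = static_cast<ChunkSource *>(state);
    if (!src->hasMore()) {
        std::printf("all chunks sent, socket can be recycled\n");
        return;
    }
    std::string buf = src->take();
    sendChunk(*s, buf, src, doneSendingWrapper, /*lastChunk=*/!src->hasMore());
}

int main() {
    ChunkSource src;
    src.chunks = { "chunk-1", "chunk-2", "chunk-3" };
    Socket s;
    doneSendingWrapper(&src, &s);   // kick off the streaming loop
    return 0;
}

In the actual code the callback is driven asynchronously by the TcpServer once the write drains, and the data travels in a SafeBuf that is detached so the socket owns the memory; the sketch only models the hand-off order.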
@ -29,7 +29,7 @@
// hack to repopulate the socket's send buf when its done sending
// it's current sendbuf in order to transmit large amounts of data that
// can't all fit in memory at the same time:
#define ST_SEND_AGAIN 10
//#define ST_SEND_AGAIN 10

#define TCP_READ_BUF_SIZE 1024

@ -117,6 +117,7 @@ class TcpSocket {
long m_maxOtherDocLen; // if reading other doc types

char m_niceness;
char m_streamingMode;

long m_shutdownStart;

@ -105,7 +105,7 @@ bool Tfndb::init2 ( long treeMem ) {
return false;
return true;
}

/*
bool Tfndb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;
@ -117,7 +117,7 @@ bool Tfndb::addColl ( char *coll, bool doVerify ) {
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}

*/
bool Tfndb::verify ( char *coll ) {
log ( LOG_INFO, "db: Verifying Tfndb for coll %s...", coll );
g_threads.disableThreads();
2
Tfndb.h
2
Tfndb.h
@ -27,7 +27,7 @@ class Tfndb {

bool verify ( char *coll );

bool addColl ( char *coll, bool doVerify = true );
//bool addColl ( char *coll, bool doVerify = true );

// set up our private rdb
bool init ( );
15
Threads.cpp
15
Threads.cpp
@ -284,7 +284,12 @@ bool Threads::init ( ) {
// with high niceness cuz it would hold up high priority ones!
// . TODO: is there a better way? cancel it when UdpServer calls
// Threads::suspendLowPriorityThreads() ?
if ( ! g_threads.registerType ( MERGE_THREAD , 2/*maxThreads*/,1000) )
// . this used to be 2 but now defaults to 10 in Parms.cpp. i found
// i have less long gray lines in the performance graph when i
// did that on trinity.
long max2 = g_conf.m_maxCpuMergeThreads;
if ( max2 < 1 ) max2 = 1;
if ( ! g_threads.registerType ( MERGE_THREAD , max2,1000) )
return log("thread: Failed to register thread type." );
// will raising this from 1 to 2 make it faster too?
// i raised since global specs new servers have 2 (hyperthreaded?) cpus
@ -300,7 +305,11 @@ bool Threads::init ( ) {
return log("thread: Failed to register thread type." );
// . File.cpp spawns a rename thread for doing renames and unlinks
// . doing a tight merge on titldb can be ~250 unlinks
if ( ! g_threads.registerType ( UNLINK_THREAD,1/*maxThreads*/,3000) )
// . MDW up from 1 to 30 max, after doing a ddump on 3000+ collections
// it was taking forever to go one at a time through the unlink
// thread queue. seemed like a 1 second space between unlinks.
// 1/23/1014
if ( ! g_threads.registerType ( UNLINK_THREAD,30/*maxThreads*/,3000) )
return log("thread: Failed to register thread type." );
// generic multipurpose
if ( ! g_threads.registerType (GENERIC_THREAD,100/*maxThreads*/,100) )
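The change above swaps the hard-coded merge-thread cap of 2 for g_conf.m_maxCpuMergeThreads clamped to at least 1, and raises the unlink-thread cap from 1 to 30. A rough sketch of that clamp-then-register pattern, with a hypothetical registry standing in for the real Threads class (the third argument is carried over from the diff as-is, its meaning not shown here):

// Hypothetical sketch of config-driven thread-type caps with a minimum clamp.
// ThreadTypeRegistry and the names below are stand-ins, not gigablast's API.
#include <cstdio>
#include <string>
#include <vector>

struct ThreadTypeRegistry {
    struct Entry { std::string name; long maxThreads; long param3; };
    std::vector<Entry> entries;
    bool registerType(const std::string &name, long maxThreads, long param3) {
        entries.push_back({name, maxThreads, param3});
        std::printf("registered %s: maxThreads=%ld\n", name.c_str(), maxThreads);
        return true;
    }
};

int main() {
    long confMaxCpuMergeThreads = 0;     // pretend the config value was 0
    long maxMerge = confMaxCpuMergeThreads;
    if (maxMerge < 1) maxMerge = 1;      // same clamp as in the diff

    ThreadTypeRegistry reg;
    if (!reg.registerType("MERGE_THREAD",   maxMerge, 1000)) return 1;
    if (!reg.registerType("UNLINK_THREAD",  30,       3000)) return 1; // was 1
    if (!reg.registerType("GENERIC_THREAD", 100,      100))  return 1;
    return 0;
}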
@ -1120,7 +1129,7 @@ void makeCallback ( ThreadEntry *t ) {
// then set it
if ( t->m_niceness >= 1 ) g_niceness = 1;
else g_niceness = 0;


t->m_callback ( t->m_state , t );

// time it?
@ -124,7 +124,7 @@ bool Titledb::init2 ( long treeMem ) {
// validate
//return verify ( );
}

/*
bool Titledb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;
@ -136,7 +136,7 @@ bool Titledb::addColl ( char *coll, bool doVerify ) {
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}

*/
bool Titledb::verify ( char *coll ) {
log ( LOG_DEBUG, "db: Verifying Titledb for coll %s...", coll );
g_threads.disableThreads();
@ -39,7 +39,7 @@ class Titledb {

bool verify ( char *coll );

bool addColl ( char *coll, bool doVerify = true );
//bool addColl ( char *coll, bool doVerify = true );

// init m_rdb
bool init ();
@ -131,8 +131,8 @@ static char s_compBuf[COMPBUFSIZE];
// Kompatible Decomposition table must be loaded before calling this
bool initCompositionTable(){
if ( ! s_isInitialized ) {
log(LOG_INFO,"conf: UCNormalizer: "
"initializing Full Composition table");
//log(LOG_INFO,"conf: UCNormalizer: "
// "initializing Full Composition table");
// set up the hash table
//if ( ! s_compositions.set ( 8,4,16384 ) )
if (!s_compositions.set(8,4,65536,s_compBuf,(long)COMPBUFSIZE,
Some files were not shown because too many files have changed in this diff.