Merge branch 'diffbot' of github.com:gigablast/open-source-search-engine into diffbot

Matt Wells 2014-01-30 13:11:48 -08:00
commit 40f373c9e0
113 changed files with 5815 additions and 3112 deletions

@ -849,9 +849,8 @@ bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
setCodesFromConf();
}
sb.safePrintf("\n<br><br><table width=100%% bgcolor=#%s "
"cellpadding=4 border=1>\n",
BABY_BLUE);
sb.safePrintf("\n<br><br><table %s>\n",TABLE_STYLE);
getCalendarFromMs((now - m_codeResetTime) * 1000,
&days,
&hours,
@ -1134,9 +1133,7 @@ bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
sb.safePrintf("\n<table width=100%% bgcolor=#%s "
"cellpadding=4 border=1>\n",
BABY_BLUE);
sb.safePrintf("\n<table %s>\n",TABLE_STYLE);
sb.safePrintf("<tr><td colspan=2 bgcolor=#%s>"
"<center><b>Add IPs</b></center></td></tr>",
DARK_BLUE);
@ -1174,9 +1171,7 @@ bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
/////////////////////////////////////////////////////////////////////
sb.safePrintf("\n<table width=100%% bgcolor=#%s "
"cellpadding=4 border=1>\n",
BABY_BLUE);
sb.safePrintf("\n<table %s>\n",TABLE_STYLE);
sb.safePrintf("<tr><td colspan=3 bgcolor=#%s>"
"<center><b>Watched Ips</b></center></td></tr>",
@ -1315,9 +1310,7 @@ bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
// MDW moved from here
sb.safePrintf("\n<br><br><table width=100%% bgcolor=#%s "
"cellpadding=4 border=1>\n",
BABY_BLUE);
sb.safePrintf("\n<br><br><table %s>\n",TABLE_STYLE);
sb.safePrintf("<tr><td colspan=5 bgcolor=#%s>"
"<center><b>Control Panel</b></center></td></tr>",
@ -1362,9 +1355,7 @@ bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
}
sb.safePrintf("\n<br><br><table width=100%% bgcolor=#%s "
"cellpadding=4 border=1>\n",
BABY_BLUE);
sb.safePrintf("\n<br><br><table %s>\n",TABLE_STYLE);
sb.safePrintf("<tr><td colspan=6 bgcolor=#%s>"
"<center><b>Queries Today</b></center></td></tr>",

@ -569,7 +569,9 @@ bool BigFile::readwrite ( void *buf ,
}
// otherwise, thread spawn failed, do it blocking then
g_errno = 0;
if ( ! g_threads.m_disabled ) {
// if threads are manually disabled don't print these msgs because
// we redbox the fact above the controls in Pages.cpp
if ( g_conf.m_useThreads && ! g_threads.m_disabled ) {
static long s_lastTime = 0;
long now = getTime();
if ( now - s_lastTime >= 1 ) {

@ -651,7 +651,10 @@ void Blaster::gotDoc2 ( void *state, TcpSocket *s){
false,
0,
false,
TITLEREC_CURRENT_VERSION)){
TITLEREC_CURRENT_VERSION ,
true , // set parents
0 , // niceness
CT_XML )){ // content type
log(LOG_WARN,"blaster: Couldn't set XML1 Class in gotDoc2");
}
Links links1;
@ -679,7 +682,10 @@ void Blaster::gotDoc2 ( void *state, TcpSocket *s){
false,
0,
false,
TITLEREC_CURRENT_VERSION)){
TITLEREC_CURRENT_VERSION,
true , // setparents
0 , // niceness
CT_XML )){
log(LOG_WARN,"blaster: Couldn't set XML2 Class in gotDoc2");
}
Links links2;
@ -1170,7 +1176,10 @@ void Blaster::gotDoc4 ( void *state, TcpSocket *s){
false,
0,
false,
TITLEREC_CURRENT_VERSION)){
TITLEREC_CURRENT_VERSION,
true, // setparents
0, // niceness
CT_XML )){
log(LOG_WARN,"blaster: Couldn't set XML Class in gotDoc4");
}
Links links;

@ -71,9 +71,9 @@ bool Cachedb::init ( ) {
return false;
// add the base since it is a collectionless rdb
return m_rdb.addColl ( NULL );
return m_rdb.addRdbBase1 ( NULL );
}
/*
bool Cachedb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;
@ -85,7 +85,7 @@ bool Cachedb::addColl ( char *coll, bool doVerify ) {
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}
*/
bool Cachedb::verify ( char *coll ) {
// coll is NULL here methinks
log ( LOG_DEBUG, "db: Verifying %s...",m_name );

@ -84,7 +84,7 @@ bool Catdb::init ( ) {
// Rdb::getBase(collnum_t) will return. however, for collectionless
// rdb databases we set Rdb::m_collectionlessBase special here.
// This was in Rdb.cpp::init().
return m_rdb.addColl ( NULL );
return m_rdb.addRdbBase1 ( NULL );
}
bool Catdb::init2 ( long treeMem ) {
@ -112,6 +112,7 @@ bool Catdb::init2 ( long treeMem ) {
// end support for "cache recs"
//
/*
bool Catdb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
// verify
@ -123,6 +124,7 @@ bool Catdb::addColl ( char *coll, bool doVerify ) {
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}
*/
bool Catdb::verify ( char *coll ) {
char *rdbName = "Catdb";

@ -337,7 +337,7 @@ bool Clusterdb::init2 ( long treeMem ) {
12 , // key size
true ); // bias disk page cache
}
/*
bool Clusterdb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;
@ -349,7 +349,7 @@ bool Clusterdb::addColl ( char *coll, bool doVerify ) {
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}
*/
bool Clusterdb::verify ( char *coll ) {
log ( LOG_DEBUG, "db: Verifying Clusterdb for coll %s...", coll );
g_threads.disableThreads();

@ -34,7 +34,13 @@ Collectiondb g_collectiondb;
Collectiondb::Collectiondb ( ) {
m_numRecs = 0;
m_numRecsUsed = 0;
m_lastUpdateTime = 0LL;
//m_lastUpdateTime = 0LL;
m_needsSave = false;
// sanity
if ( RDB_END2 >= RDB_END ) return;
log("db: increase RDB_END2 to at least %li in "
"Collectiondb.h",(long)RDB_END);
char *xx=NULL;*xx=0;
}
// reset rdb
@ -51,6 +57,7 @@ void Collectiondb::reset() {
g_collTable.reset();
}
/*
bool Collectiondb::init ( bool isDump ) {
reset();
if ( g_isYippy ) return true;
@ -77,6 +84,7 @@ bool Collectiondb::init ( bool isDump ) {
// otherwise, true, even if reloadList() blocked
return true;
}
*/
// . save to disk
// . returns false if blocked, true otherwise
@ -95,7 +103,12 @@ bool Collectiondb::save ( ) {
return true;
}
bool Collectiondb::load ( bool isDump ) {
///////////
//
// fill up our m_recs[] array based on the coll.*.*/coll.conf files
//
///////////
bool Collectiondb::loadAllCollRecs ( ) {
char dname[1024];
// MDW: sprintf ( dname , "%s/collections/" , g_hostdb.m_dir );
sprintf ( dname , "%s" , g_hostdb.m_dir );
@ -104,7 +117,7 @@ bool Collectiondb::load ( bool isDump ) {
if ( ! d.open ()) return log("admin: Could not load collection config "
"files.");
// note it
log(LOG_INFO,"db: Loading collection config files.");
//log(LOG_INFO,"db: loading collection config files.");
// . scan through all subdirs in the collections dir
// . they should be like, "coll.main/" and "coll.mycollection/"
char *f;
@ -122,16 +135,23 @@ bool Collectiondb::load ( bool isDump ) {
// get collnum
collnum_t collnum = atol ( pp + 1 );
// add it
if ( ! addExistingColl ( coll , collnum ,isDump ) )
if ( ! addExistingColl ( coll , collnum ) )
return false;
}
// note it
log(LOG_INFO,"db: Loaded data for %li collections. Ranging from "
"collection #0 to #%li.",m_numRecsUsed,m_numRecs-1);
//log(LOG_INFO,"db: Loaded data for %li collections. Ranging from "
// "collection #0 to #%li.",m_numRecsUsed,m_numRecs-1);
// update the time
updateTime();
//updateTime();
// don't clean the tree if just dumpin
if ( isDump ) return true;
//if ( isDump ) return true;
return true;
}
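
For reference, a minimal standalone sketch of how a coll.<name>.<collnum> directory entry can be split into its collection name and collnum, mirroring the atol(pp+1) parse in the loop above (the function name and layout are illustrative, not part of this commit):

#include <cstdlib>
#include <cstring>

// parse a "coll.main.0" style directory name into name + collnum;
// returns false if the entry is not a collection directory
static bool parseCollDirName ( const char *f , char *name , long nameSize ,
                               long *collnum ) {
	// must start with "coll."
	if ( strncmp ( f , "coll." , 5 ) != 0 ) return false;
	// the last '.' precedes the collnum
	const char *pp = strrchr ( f , '.' );
	if ( ! pp || pp <= f + 4 ) return false;
	// copy out the name between "coll." and the last '.'
	long len = pp - ( f + 5 );
	if ( len <= 0 || len >= nameSize ) return false;
	memcpy ( name , f + 5 , len );
	name[len] = '\0';
	// the trailing number is the collnum, like atol(pp+1) above
	*collnum = atol ( pp + 1 );
	return true;
}

// e.g. parseCollDirName("coll.main.0",buf,sizeof(buf),&collnum)
// yields name "main" and collnum 0.
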
// after we've initialized all rdbs in main.cpp call this to clean out
// our rdb trees
bool Collectiondb::cleanTrees ( ) {
// remove any nodes with illegal collnums
Rdb *r;
//r = g_indexdb.getRdb();
@ -158,7 +178,7 @@ bool Collectiondb::load ( bool isDump ) {
// success
return true;
}
/*
void Collectiondb::updateTime() {
// get time now in milliseconds
long long newTime = gettimeofdayInMilliseconds();
@ -169,14 +189,13 @@ void Collectiondb::updateTime() {
// we need a save
m_needsSave = true;
}
*/
#include "Statsdb.h"
#include "Cachedb.h"
#include "Syncdb.h"
bool Collectiondb::addExistingColl ( char *coll,
collnum_t collnum ,
bool isDump ) {
bool Collectiondb::addExistingColl ( char *coll, collnum_t collnum ) {
long i = collnum;
@ -221,7 +240,7 @@ bool Collectiondb::addExistingColl ( char *coll,
"\"%s\".",coll);
}
if ( ! registerCollRec ( cr , isDump , false ) ) return false;
if ( ! registerCollRec ( cr , false ) ) return false;
// we need to compile the regular expressions or update the url
// filters with new logic that maps crawlbot parms to url filters
@ -454,6 +473,16 @@ bool Collectiondb::addNewColl ( char *coll ,
memset ( &cr->m_localCrawlInfo , 0 , sizeof(CrawlInfo) );
memset ( &cr->m_globalCrawlInfo , 0 , sizeof(CrawlInfo) );
// note that
log("colldb: initial revival for %s",cr->m_coll);
// . assume we got some urls ready to spider
// . Spider.cpp will wait SPIDER_DONE_TIME seconds and if it has no
// urls it spidered in that time these will get set to 0 and it
// will send out an email alert if m_sentCrawlDoneAlert is not true.
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 1;
cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = 1;
// set some defaults. max spiders for all priorities in this
// collection. NO, default is in Parms.cpp.
//cr->m_maxNumSpiders = 10;
@ -496,46 +525,66 @@ bool Collectiondb::addNewColl ( char *coll ,
}
return registerCollRec ( cr , false , true );
if ( ! registerCollRec ( cr , true ) )
return false;
// add the rdbbases for this coll, CollectionRec::m_bases[]
if ( ! addRdbBasesForCollRec ( cr ) )
return false;
return true;
}
// . called only by addNewColl() and by addExistingColl()
bool Collectiondb::registerCollRec ( CollectionRec *cr ,
bool isDump ,
bool isNew ) {
bool Collectiondb::registerCollRec ( CollectionRec *cr , bool isNew ) {
// add m_recs[] and to hashtable
if ( ! setRecPtr ( cr->m_collnum , cr ) )
return false;
bool verify = true;
return true;
}
bool Collectiondb::addRdbBaseToAllRdbsForEachCollRec ( ) {
for ( long i = 0 ; i < m_numRecs ; i++ ) {
CollectionRec *cr = m_recs[i];
if ( ! cr ) continue;
// add rdb base files etc. for it
addRdbBasesForCollRec ( cr );
}
return true;
}
bool Collectiondb::addRdbBasesForCollRec ( CollectionRec *cr ) {
char *coll = cr->m_coll;
//////
//
// if we are doing a dump from the command line, skip this stuff
if ( isDump ) return true;
if ( isNew ) verify = false;
//
//////
if ( g_dumpMode ) return true;
// tell rdbs to add one, too
//if ( ! g_indexdb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_posdb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_datedb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_indexdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_posdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_datedb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_titledb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_revdb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_sectiondb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_tagdb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_catdb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_checksumdb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_tfndb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_clusterdb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_linkdb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_spiderdb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_doledb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_titledb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_revdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_sectiondb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_tagdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_catdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_checksumdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_tfndb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_clusterdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_linkdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_spiderdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_doledb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
// now clean the trees
cleanTrees();
// debug message
//log ( LOG_INFO, "db: verified collection \"%s\" (%li).",
@ -637,6 +686,22 @@ bool Collectiondb::deleteRec ( char *coll , WaitEntry *we ) {
}
*/
// if there is an outstanding disk read thread or merge thread then
// Spider.cpp will handle the delete in the callback.
void Collectiondb::deleteSpiderColl ( SpiderColl *sc ) {
sc->m_deleteMyself = true;
// if not currently being accessed nuke it now
if ( ! sc->m_msg5.m_waitingForList &&
! sc->m_msg5b.m_waitingForList &&
! sc->m_msg1.m_mcast.m_inUse ) {
mdelete ( sc, sizeof(SpiderColl),"nukecr2");
delete ( sc );
return;
}
}
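
A condensed sketch of the deferred-delete ("death row") pattern used above, with illustrative stand-in fields; the real SpiderColl tracks more in-flight state than this:

// stand-in for the SpiderColl fields involved in the deferred delete
struct SpiderCollLike {
	bool m_deleteMyself;   // set by the owner when it wants us gone
	bool m_readInFlight;   // stands in for m_msg5/m_msg5b.m_waitingForList
	bool m_addInFlight;    // stands in for m_msg1.m_mcast.m_inUse
};

// owner side: mark for deletion, free now only if nothing is outstanding
void requestDelete ( SpiderCollLike *sc ) {
	sc->m_deleteMyself = true;
	if ( ! sc->m_readInFlight && ! sc->m_addInFlight ) {
		delete sc;          // nothing in flight, nuke it right away
		return;
	}
	// otherwise the completion callback below frees it later
}

// I/O side: when the last outstanding operation completes, honor
// the pending delete request
void onOperationDone ( SpiderCollLike *sc ) {
	sc->m_readInFlight = false;
	if ( sc->m_deleteMyself && ! sc->m_addInFlight ) delete sc;
}
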
bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
// do not allow this if in repair mode
if ( g_repairMode > 0 ) {
@ -724,10 +789,14 @@ bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(collnum);
if ( sc ) {
// remove locks from lock table:
sc->clear();
sc->clearLocks();
//sc->m_collnum = newCollnum;
sc->reset();
mdelete ( sc, sizeof(SpiderColl),"nukecr2");
//sc->reset();
// this will put it on "death row" so it will be deleted
// once Msg5::m_waitingForList/Merge is NULL
deleteSpiderColl ( sc );
//mdelete ( sc, sizeof(SpiderColl),"nukecr2");
//delete ( sc );
cr->m_spiderColl = NULL;
}
@ -872,7 +941,7 @@ bool Collectiondb::setRecPtr ( collnum_t collnum , CollectionRec *cr ) {
}
// update the time
updateTime();
//updateTime();
return true;
}
@ -926,8 +995,19 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
// reset spider info
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(oldCollnum);
if ( sc ) {
sc->clear();
sc->m_collnum = newCollnum;
// remove locks from lock table:
sc->clearLocks();
// don't do this anymore, just nuke it in case
// m_populatingDoledb was true etc. there are too many
// flags to worry about
//sc->m_collnum = newCollnum;
//sc->reset();
// this will put it on "death row" so it will be deleted
// once Msg5::m_waitingForList/Merge is NULL
deleteSpiderColl ( sc );
//mdelete ( sc, sizeof(SpiderColl),"nukecr2");
//delete ( sc );
cr->m_spiderColl = NULL;
}
// reset spider round
@ -1052,7 +1132,7 @@ bool addCollToTable ( char *coll , collnum_t collnum ) {
// get coll rec specified in the HTTP request
CollectionRec *Collectiondb::getRec ( HttpRequest *r ) {
CollectionRec *Collectiondb::getRec ( HttpRequest *r , bool useDefaultRec ) {
char *coll = r->getString ( "c" );
if ( coll && ! coll[0] ) coll = NULL;
// maybe it is crawlbot?
@ -1067,6 +1147,18 @@ CollectionRec *Collectiondb::getRec ( HttpRequest *r ) {
snprintf(tmp,MAX_COLL_LEN,"%s-%s",token,name);
coll = tmp;
}
// default to main first
if ( ! coll && useDefaultRec ) {
CollectionRec *cr = g_collectiondb.getRec("main");
if ( cr ) return cr;
}
// try next in line
if ( ! coll && useDefaultRec ) {
return getFirstRec ();
}
// give up?
if ( ! coll ) return NULL;
//if ( ! coll || ! coll[0] ) coll = g_conf.m_defaultColl;
@ -1296,7 +1388,7 @@ CollectionRec::CollectionRec() {
//m_spiderStatusMsg = NULL;
// for Url::getSite()
m_updateSiteRulesTable = 1;
m_lastUpdateTime = 0LL;
//m_lastUpdateTime = 0LL;
m_clickNScrollEnabled = false;
// inits for sortbydatetable
m_inProgress = false;
@ -1359,6 +1451,10 @@ void CollectionRec::setToDefaults ( ) {
void CollectionRec::reset() {
// . grows dynamically
// . setting to 0 buckets should never have error
//m_pageCountTable.set ( 4,4,0,NULL,0,false,MAX_NICENESS,"pctbl" );
// regex_t types
if ( m_hasucr ) regfree ( &m_ucr );
if ( m_hasupr ) regfree ( &m_upr );
@ -1378,6 +1474,27 @@ void CollectionRec::reset() {
rdb->resetBase ( m_collnum );
}
for ( long i = 0 ; i < g_process.m_numRdbs ; i++ ) {
RdbBase *base = m_bases[i];
if ( ! base ) continue;
mdelete (base, sizeof(RdbBase), "Rdb Coll");
delete (base);
}
SpiderColl *sc = m_spiderColl;
// if never made one, we are done
if ( ! sc ) return;
// spider coll also!
sc->m_deleteMyself = true;
// if not currently being accessed nuke it now
if ( ! sc->m_msg5.m_waitingForList &&
! sc->m_msg5b.m_waitingForList &&
! sc->m_msg1.m_mcast.m_inUse ) {
mdelete ( sc, sizeof(SpiderColl),"nukecr2");
delete ( sc );
}
}
CollectionRec *g_cr = NULL;
@ -1404,7 +1521,8 @@ bool CollectionRec::load ( char *coll , long i ) {
m_collLen = gbstrlen ( coll );
strcpy ( m_coll , coll );
log(LOG_INFO,"db: loading data for %s",coll);
log(LOG_INFO,"db: loading conf for collection %s (%li)",coll,
(long)m_collnum);
// collection name HACK for backwards compatibility
//if ( strcmp ( coll , "main" ) == 0 ) {
@ -1440,6 +1558,43 @@ bool CollectionRec::load ( char *coll , long i ) {
//m_localCrawlInfo.setFromSafeBuf(&sb);
// it is binary now
memcpy ( &m_localCrawlInfo , sb.getBufStart(),sb.length() );
log("coll: loaded %s (%li) local hasurlsready=%li",
m_coll,
(long)m_collnum,
(long)m_localCrawlInfo.m_hasUrlsReadyToSpider);
// we introduced the "this round" counts, so don't start them at 0!!
if ( m_spiderRoundNum == 0 &&
m_localCrawlInfo.m_pageDownloadSuccessesThisRound <
m_localCrawlInfo.m_pageDownloadSuccesses ) {
log("coll: fixing process count this round for %s",m_coll);
m_localCrawlInfo.m_pageDownloadSuccessesThisRound =
m_localCrawlInfo.m_pageDownloadSuccesses;
}
// we introduced the "this round" counts, so don't start them at 0!!
if ( m_spiderRoundNum == 0 &&
m_localCrawlInfo.m_pageProcessSuccessesThisRound <
m_localCrawlInfo.m_pageProcessSuccesses ) {
log("coll: fixing process count this round for %s",m_coll);
m_localCrawlInfo.m_pageProcessSuccessesThisRound =
m_localCrawlInfo.m_pageProcessSuccesses;
}
// fix from old bug that was fixed
//if ( m_spiderRoundNum == 0 &&
// m_collectiveRespiderFrequency > 0.0 &&
// m_localCrawlInfo.m_sentCrawlDoneAlert ) {
// log("coll: bug fix: resending email alert for coll %s (%li) "
// "of respider freq %f",m_coll,(long)m_collnum,
// m_collectiveRespiderFrequency);
// m_localCrawlInfo.m_sentCrawlDoneAlert = false;
//}
// LOAD GLOBAL
snprintf ( tmp1 , 1023, "%scoll.%s.%li/globalcrawlinfo.dat",
g_hostdb.m_dir , m_coll , (long)m_collnum );
@ -1451,20 +1606,23 @@ bool CollectionRec::load ( char *coll , long i ) {
// it is binary now
memcpy ( &m_globalCrawlInfo , sb.getBufStart(),sb.length() );
log("coll: loaded %s (%li) global hasurlsready=%li",
m_coll,
(long)m_collnum,
(long)m_globalCrawlInfo.m_hasUrlsReadyToSpider);
////////////
//
// PAGE COUNT TABLE for doing quotas in url filters
//
/////////////
// . grows dynamically
// . setting to 0 buckets should never have error
m_pageCountTable.set ( 4,4,0,NULL,0,false,MAX_NICENESS,"pctbl" );
// log it up if there on disk
snprintf ( tmp1 , 1023, "/coll.%s.%li/pagecounts.dat",
m_coll , (long)m_collnum );
if ( ! m_pageCountTable.load ( g_hostdb.m_dir , tmp1 ) && g_errno )
log("db: failed to load page count table: %s",
mstrerror(g_errno));
//snprintf ( tmp1 , 1023, "/coll.%s.%li/pagecounts.dat",
// m_coll , (long)m_collnum );
//if ( ! m_pageCountTable.load ( g_hostdb.m_dir , tmp1 ) && g_errno )
// log("db: failed to load page count table: %s",
// mstrerror(g_errno));
// ignore errors i guess
g_errno = 0;
@ -1619,11 +1777,11 @@ void CollectionRec::setUrlFiltersToDefaults ( ) {
m_spiderIpWaits[n] = 1000;
m_numRegExs5++;
m_spiderIpMaxSpiders[n] = 1;
m_spiderIpMaxSpiders[n] = 7;
m_numRegExs6++;
m_spidersEnabled[n] = 1;
m_numRegExs7++;
//m_spidersEnabled[n] = 1;
//m_numRegExs7++;
m_harvestLinks[n] = 1;
m_numRegExs8++;
@ -1724,19 +1882,24 @@ bool CollectionRec::save ( ) {
tmp,mstrerror(g_errno));
g_errno = 0;
}
// save page count table which has # of pages indexed per
// subdomain/site and firstip for doing quotas in url filters table
snprintf ( tmp , 1023, "coll.%s.%li/pagecounts.dat",
m_coll , (long)m_collnum );
if ( ! m_pageCountTable.save ( g_hostdb.m_dir , tmp ) ) {
log("db: failed to save file %s : %s",tmp,mstrerror(g_errno));
g_errno = 0;
}
// do not need a save now
m_needsSave = false;
// waiting tree is saved in SpiderCache::save() called by Process.cpp
//SpiderColl *sc = m_spiderColl;
//if ( ! sc ) return true;
// save page count table which has # of pages indexed per
// subdomain/site and firstip for doing quotas in url filters table
//snprintf ( tmp , 1023, "coll.%s.%li/pagecounts.dat",
// m_coll , (long)m_collnum );
//if ( ! m_pageCountTable.save ( g_hostdb.m_dir , tmp ) ) {
// log("db: failed to save file %s : %s",tmp,mstrerror(g_errno));
// g_errno = 0;
//}
return true;
}
@ -1937,10 +2100,10 @@ bool CollectionRec::rebuildUrlFilters ( ) {
for ( long i = 0 ; i < MAX_FILTERS ; i++ ) {
m_regExs[i].purge();
m_spiderPriorities[i] = 0;
m_maxSpidersPerRule [i] = 10;
m_maxSpidersPerRule [i] = 100;
m_spiderIpWaits [i] = wait;
m_spiderIpMaxSpiders[i] = 7; // keep it respectful
m_spidersEnabled [i] = 1;
//m_spidersEnabled [i] = 1;
m_spiderFreqs [i] =m_collectiveRespiderFrequency;
//m_spiderDiffbotApiUrl[i].purge();
m_harvestLinks[i] = true;
@ -1961,6 +2124,24 @@ bool CollectionRec::rebuildUrlFilters ( ) {
i++;
}
// and for docs that have a single error, retry quickly (~86 seconds)
m_regExs[i].set("errorcount==1");
m_spiderPriorities [i] = 40;
m_spiderFreqs [i] = 0.001; // 86 seconds
i++;
// and for docs with two errors, retry every couple of hours
m_regExs[i].set("errorcount==2");
m_spiderPriorities [i] = 40;
m_spiderFreqs [i] = 0.1; // 2.4 hrs
i++;
// excessive errors? (tcp/dns timed out, etc.) retry once per month?
m_regExs[i].set("errorcount>=3");
m_spiderPriorities [i] = 30;
m_spiderFreqs [i] = 30; // 30 days
i++;
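
The frequencies in these rules are in days, consistent with the inline comments; a quick check of the conversions (standalone, illustrative only):

#include <cstdio>

int main ( ) {
	// m_spiderFreqs values from the rules above, expressed in days
	printf ( "errorcount==1: %.0f seconds\n" , 0.001 * 86400.0 ); // ~86 s
	printf ( "errorcount==2: %.1f hours\n"   , 0.1   * 24.0    ); // 2.4 hrs
	printf ( "errorcount>=3: %.0f days\n"    , 30.0             ); // monthly
	return 0;
}
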
// 3rd rule for respidering
if ( m_collectiveRespiderFrequency > 0.0 ) {
m_regExs[i].set("lastspidertime>={roundstart}");
@ -1968,7 +2149,11 @@ bool CollectionRec::rebuildUrlFilters ( ) {
m_spiderPriorities [i] = 10;
// just turn off spidering. if we were to set priority to
// filtered it would be removed from index!
m_spidersEnabled [i] = 0;
//m_spidersEnabled [i] = 0;
m_maxSpidersPerRule[i] = 0;
// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
// which has been obsoleted, but we are running old code now!
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
// if collectiverespiderfreq is 0 or less then do not RE-spider
@ -1981,22 +2166,14 @@ bool CollectionRec::rebuildUrlFilters ( ) {
m_spiderPriorities [i] = 10;
// just turn off spidering. if we were to set priority to
// filtered it would be removed from index!
m_spidersEnabled [i] = 0;
//m_spidersEnabled [i] = 0;
m_maxSpidersPerRule[i] = 0;
// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
// which has been obsoleted, but we are running old code now!
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
// and for docs that have errors respider once every 5 hours
m_regExs[i].set("errorcount>0 && errcount<3");
m_spiderPriorities [i] = 40;
m_spiderFreqs [i] = 0.2; // half a day
i++;
// excessive errors? (tcp/dns timed out, etc.) retry once per month?
m_regExs[i].set("errorcount>=3");
m_spiderPriorities [i] = 30;
m_spiderFreqs [i] = 30; // 30 days
i++;
// url crawl and process pattern
if ( ucp && upp ) {
m_regExs[i].set("matchesucp && matchesupp");
@ -2060,7 +2237,7 @@ bool CollectionRec::rebuildUrlFilters ( ) {
m_numRegExs10 = i;
m_numRegExs5 = i;
m_numRegExs6 = i;
m_numRegExs7 = i;
//m_numRegExs7 = i;
m_numRegExs8 = i;
//m_numRegExs11 = i;

@ -27,6 +27,7 @@ public:
char *m_parmEnd;
class UdpSlot *m_slot;
bool m_doRebuilds;
bool m_updatedRound;
collnum_t m_collnum;
bool m_registered;
long m_errno;
@ -45,10 +46,18 @@ class Collectiondb {
// . returns false and sets errno on error
// . each collection as a CollectionRec class for it and
// is loaded up from the appropriate config file
bool init ( bool isDump = false );
bool init ( );
// this loads all the recs from host #0
bool load ( bool isDump = false );
//bool load ( bool isDump = false );
// called by main.cpp to fill in our m_recs[] array with
// all the coll.*.*/coll.conf info
bool loadAllCollRecs ( );
// after main.cpp loads all rdb trees it calls this to remove
// bogus collnums from the trees i guess
bool cleanTrees ( ) ;
// . this will save all conf files back to disk that need it
// . returns false and sets g_errno on error, true on success
@ -63,7 +72,8 @@ class Collectiondb {
char *getColl ( collnum_t collnum ) {return getCollName(collnum);};
// get coll rec specified in the HTTP request
class CollectionRec *getRec ( class HttpRequest *r );
class CollectionRec *getRec ( class HttpRequest *r ,
bool useDefaultRec = true );
// . get collectionRec from name
// returns NULL if not available
@ -81,7 +91,7 @@ class Collectiondb {
// . how many collections we have in here
// . only counts valid existing collections
long getNumRecs() { return m_numRecsUsed; };
long getNumRecsUsed() { return m_numRecsUsed; };
// . does this requester have root admin privileges???
// . uses the root collection record!
@ -92,9 +102,9 @@ class Collectiondb {
// what collnum will be used the next time a coll is added?
collnum_t reserveCollNum ( ) ;
long long getLastUpdateTime () { return m_lastUpdateTime; };
//long long getLastUpdateTime () { return m_lastUpdateTime; };
// updates m_lastUpdateTime so g_spiderCache know when to reload
void updateTime ();
//void updateTime ();
// private:
@ -105,9 +115,8 @@ class Collectiondb {
// bool saveRec ); // = true
bool addExistingColl ( char *coll,
collnum_t collnum ,
bool isDump ) ;
bool addExistingColl ( char *coll, collnum_t collnum );
bool addNewColl ( char *coll ,
char customCrawl ,
char *cpc ,
@ -115,9 +124,10 @@ class Collectiondb {
bool saveIt ,
collnum_t newCollnum ) ;
bool registerCollRec ( CollectionRec *cr ,
bool isDump ,
bool isNew ) ;
bool registerCollRec ( CollectionRec *cr , bool isNew ) ;
bool addRdbBaseToAllRdbsForEachCollRec ( ) ;
bool addRdbBasesForCollRec ( CollectionRec *cr ) ;
bool setRecPtr ( collnum_t collnum , CollectionRec *cr ) ;
@ -128,6 +138,8 @@ class Collectiondb {
//bool updateRec ( CollectionRec *newrec );
bool deleteRecs ( class HttpRequest *r ) ;
void deleteSpiderColl ( class SpiderColl *sc );
// returns false if blocked, true otherwise.
//bool resetColl ( char *coll , WaitEntry *we , bool purgeSeeds );
bool resetColl2 ( collnum_t oldCollnum,
@ -149,7 +161,7 @@ class Collectiondb {
long m_numRecs;
long m_numRecsUsed;
long long m_lastUpdateTime;
//long long m_lastUpdateTime;
};
extern class Collectiondb g_collectiondb;
@ -249,6 +261,7 @@ class CrawlInfo {
long long m_pageProcessSuccesses; // 7
long long m_urlsHarvested; // 8
long m_lastUpdateTime;
// this is non-zero if urls are available to be spidered right now.
@ -268,6 +281,12 @@ class CrawlInfo {
//long m_numUrlsLaunched;
long m_dummy1;
// keep separate because when we receive a crawlinfo struct from
// a host we only add these in if it matches our round #
long long m_pageDownloadSuccessesThisRound;
long long m_pageProcessSuccessesThisRound;
void reset() { memset ( this , 0 , sizeof(CrawlInfo) ); };
//bool print (class SafeBuf *sb ) ;
//bool setFromSafeBuf (class SafeBuf *sb ) ;
@ -348,7 +367,7 @@ class CollectionRec {
bool m_urlFiltersHavePageCounts;
// moved from SpiderColl so we can load up at startup
HashTableX m_pageCountTable;
//HashTableX m_pageCountTable;
// . when was the last time we changed?
//long long m_lastUpdateTime;
@ -385,7 +404,9 @@ class CollectionRec {
// spidered and begin the next round
long m_spiderRoundNum;
char m_useDatedb ;
char m_indexBody;
//char m_useDatedb ;
//char m_addUrlEnabled ; // TODO: use at http interface lvl
//char m_spiderLinks ; use url filters now!
char m_sameHostLinks ; // spider links from same host only?
@ -691,8 +712,8 @@ class CollectionRec {
//long m_respiderWaits [ MAX_FILTERS ];
//long m_numRegExs8;
// spidering on or off?
long m_numRegExs7;
char m_spidersEnabled [ MAX_FILTERS ];
//long m_numRegExs7;
//char m_spidersEnabled [ MAX_FILTERS ];
// should urls in this queue be sent to diffbot for processing
// when we are trying to index them?

Conf.h (3)

@ -216,6 +216,7 @@ class Conf {
//long long m_tfndbMaxUrls;
long m_maxCpuThreads;
long m_maxCpuMergeThreads;
long m_deadHostTimeout;
long m_sendEmailTimeout;
@ -300,6 +301,8 @@ class Conf {
long m_robotdbMaxCacheMem ;
bool m_robotdbSaveCache;
long m_maxTotalSpiders;
// indexdb has a max cached age for getting IndexLists (10 mins deflt)
long m_indexdbMaxTreeMem ;
long m_indexdbMaxCacheMem;

@ -1285,6 +1285,10 @@ bool CountryCode::loadHashTable(void) {
return(s_catToCountry.load(g_hostdb.m_dir, "catcountry.dat"));
}
void CountryCode::reset ( ) {
s_catToCountry.reset();
}
int CountryCode::getNumCodes(void) {
return(s_numCountryCodes);
}

@ -25,6 +25,7 @@ class CountryCode {
uint8_t getLanguageFromDMOZ(long catid);
int createHashTable(void);
bool loadHashTable(void);
void reset();
long getNumEntries(void);
void debugDumpNumbers(void);
uint64_t getLanguagesWritten(int index);

@ -121,7 +121,7 @@ bool Datedb::init2 ( long treeMem ) {
false , // preload dskpagecache
16 );// key size
}
/*
bool Datedb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;
@ -133,7 +133,7 @@ bool Datedb::addColl ( char *coll, bool doVerify ) {
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}
*/
bool Datedb::verify ( char *coll ) {
log ( LOG_INFO, "db: Verifying Datedb for coll %s...", coll );
g_threads.disableThreads();

@ -108,6 +108,9 @@ bool DiskPageCache::init ( const char *dbname ,
// void (*rmVfd2)(DiskPageCache*, long) ) {
reset();
// fix cores while rebalancing
//maxMem = 0;
m_rdbId = rdbId;
bool *tog = NULL;

@ -166,6 +166,7 @@ case EBADJSONPARSER: return "Bad JSON parser";
case EFAKEFIRSTIP: return "Fake firstIp";
case EBADHOSTSCONF: return "A hosts.conf is out of sync";
case EWAITINGTOSYNCHOSTSCONF: return "Wait to ensure hosts.conf in sync";
case EDOCNONCANONICAL: return "Url was dup of canonical page";
}
// if the remote error bit is clear it must be a regular errno
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );

@ -169,6 +169,7 @@ enum {
EBADJSONPARSER,
EFAKEFIRSTIP,
EBADHOSTSCONF,
EWAITINGTOSYNCHOSTSCONF
EWAITINGTOSYNCHOSTSCONF,
EDOCNONCANONICAL
};
#endif

@ -187,6 +187,8 @@ bool HashTableX::addKey ( void *key , void *val , long *slot ) {
g_errno = ETRYAGAIN;
return false;
}
// never got initialized? call HashTableX::init()
if ( m_ks <= 0 ){ char *xx=NULL; *xx=0; }
// check to see if we should grow the table. now we grow
// when 25% full to make operations faster so getLongestString()
// doesn't return such big numbers!

@ -68,6 +68,16 @@ Hostdb::~Hostdb () {
}
void Hostdb::reset ( ) {
for ( long i = 0 ; m_hosts && i < m_numHosts ; i++ ) {
Host *h = &m_hosts[i];
if ( ! h->m_lastKnownGoodCrawlInfoReply ) continue;
mfree ( h->m_lastKnownGoodCrawlInfoReply ,
h->m_lastKnownGoodCrawlInfoReplyEnd -
h->m_lastKnownGoodCrawlInfoReply , "lknown" );
h->m_lastKnownGoodCrawlInfoReply = NULL;
}
if ( m_hosts )
mfree ( m_hosts, m_allocSize,"Hostdb" );
if ( m_ips ) mfree ( m_ips , m_numIps * 4, "Hostdb" );
@ -121,8 +131,7 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
// make sure our hostId is in our conf file
if ( hostId < 0 )
return log(
"conf: Negative hostId %li supplied in "
"hosts.conf.",hostId);
"conf: Negative hostId %li supplied",hostId);
// set early for calling log()
m_hostId = hostId;
// set clock in sync in fctypes.cpp

@ -273,6 +273,9 @@ class Host {
char m_inSync ;
char m_isPermanentOutOfSync ;
char *m_lastKnownGoodCrawlInfoReply;
char *m_lastKnownGoodCrawlInfoReplyEnd;
// . used by Parms.cpp for broadcasting parm change requests
// . each parm change request has an id
// . this let's us know which id is in progress and what the last

@ -925,6 +925,10 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
strncmp ( path , "/v2/bulk/download/" ,18 ) == 0 )
return sendBackDump ( s , r );
// "GET /download/mycoll_urls.csv"
if ( strncmp ( path , "/download/", 10 ) == 0 )
return sendBackDump ( s , r );
// . is it a diffbot api request, like "GET /api/*"
// . ie "/api/startcrawl" or "/api/stopcrawl" etc.?
//if ( strncmp ( path , "/api/" , 5 ) == 0 )

@ -173,7 +173,7 @@ bool Indexdb::init2 ( long treeMem ) {
return true;
}
/*
bool Indexdb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;
@ -187,6 +187,7 @@ bool Indexdb::addColl ( char *coll, bool doVerify ) {
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}
*/
bool Indexdb::verify ( char *coll ) {
return true;

@ -96,7 +96,13 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , long niceness ) {
// plus a \0 for the value and a \0 for the name of each jsonitem
need += 2;
// prevent cores for now
need += 10000;
need += 10;
// . to prevent safebuf from reallocating do this
// . safeMemcpy() calls reserve(m_length+len), and reserve()
// tries to alloc m_length + (m_length+len), so since
// m_length+len should never be more than "need" we need to
// double up here
need *= 2;
// this should be enough
if ( ! m_sb.reserve ( need ) ) return NULL;
// for testing if we realloc

@ -378,7 +378,7 @@ void LangList::reset ( ) {
// . looks under the langlist/ directory for langlist.# files
// each number corresponds to a language
bool LangList::loadLists ( ) {
log ( LOG_INIT, "lang: Loading Language Lists.");
//log ( LOG_INIT, "lang: Loading Language Lists.");
// init the term table
m_langTable.set(8,4,100000*MAX_LANGUAGES,NULL,0,false,0,"tbl-lang");
// loop over the languages and load the files
@ -476,6 +476,7 @@ bool LangList::loadLists ( ) {
// count the list
listCount++;
if ( wordsInList > 0 )
log ( LOG_DEBUG,
"lang: Successfully Loaded %li out of %li (%li bytes) "
"words from %s dictionary.",

@ -173,7 +173,7 @@ bool Linkdb::init2 ( long treeMem ) {
sizeof(key224_t), // key size
true );// bias disk page cache
}
/*
bool Linkdb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;
@ -185,7 +185,7 @@ bool Linkdb::addColl ( char *coll, bool doVerify ) {
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}
*/
bool Linkdb::verify ( char *coll ) {
log ( LOG_DEBUG, "db: Verifying Linkdb for coll %s...", coll );
g_threads.disableThreads();
@ -3648,7 +3648,8 @@ bool Inlink::setXmlFromRSS ( Xml *xml , long niceness ) {
true , // pure xml?
TITLEREC_CURRENT_VERSION ,
false , // no need to now
niceness );
niceness ,
CT_XML );
}
// only Title.cpp uses this right now

@ -1791,6 +1791,11 @@ void Loop::quickPoll(long niceness, const char* caller, long lineno) {
if(m_inQuickPoll) {
log(LOG_WARN,
"admin: tried to quickpoll from inside quickpoll");
// this happens when handleRequest3f is called from
// a quickpoll and it deletes a collection and BigFile::close
// calls ThreadQueue::removeThreads and Msg3::doneScanning()
// has niceness 2 and calls quickpoll again!
return;
//if(g_conf.m_quickpollCoreOnError) {
char*xx=NULL;*xx=0;
// }

@ -662,7 +662,8 @@ fctypes.o: fctypes.cpp gb-include.h types.h fctypes.h Unicode.h \
openssl/ssl23.h openssl/srtp.h Collectiondb.h HashTableX.h PingServer.h \
Entities.h UCWordIterator.h Timedb.h Rdb.h RdbBase.h RdbScan.h BigFile.h \
RdbMap.h RdbList.h RdbDump.h RdbTree.h RdbMem.h RdbBuckets.h RdbCache.h \
Msg5.h Msg3.h RdbMerge.h Dir.h Titledb.h DiskPageCache.h Threads.h
Msg5.h Msg3.h RdbMerge.h Dir.h Titledb.h DiskPageCache.h Threads.h \
HttpMime.h
File.o: File.cpp gb-include.h types.h fctypes.h Unicode.h \
UnicodeProperties.h UCPropTable.h iconv.h hash.h Errno.h Log.h File.h \
Mem.h Conf.h Xml.h XmlNode.h Lang.h Iso8859.h iana_charset.h ip.h \
@ -1325,13 +1326,14 @@ Mem.o: Mem.cpp gb-include.h types.h fctypes.h Unicode.h \
openssl/pem2.h openssl/hmac.h openssl/kssl.h openssl/ssl2.h \
openssl/ssl3.h openssl/tls1.h openssl/dtls1.h openssl/pqueue.h \
openssl/ssl23.h openssl/srtp.h Collectiondb.h HashTableX.h PingServer.h \
Threads.h malloc.c Msg20.h UdpServer.h UdpSlot.h UdpProtocol.h \
Multicast.h Summary.h matches2.h Query.h Words.h StopWords.h Titledb.h \
Rdb.h RdbBase.h RdbScan.h BigFile.h RdbMap.h RdbList.h RdbDump.h \
RdbTree.h RdbMem.h RdbBuckets.h RdbCache.h Msg5.h Msg3.h RdbMerge.h \
Dir.h DiskPageCache.h Bits.h Pos.h Matches.h HashTableT.h Domains.h \
CountryCode.h Tagdb.h Msg0.h Indexdb.h Events.h Sections.h IndexList.h \
Dates.h
Threads.h Pages.h HttpServer.h TcpServer.h openssl/err.h MsgC.h \
UdpServer.h UdpSlot.h UdpProtocol.h Dns.h DnsProtocol.h RdbCache.h \
RdbList.h Multicast.h Rdb.h RdbBase.h RdbScan.h BigFile.h RdbMap.h \
RdbDump.h RdbTree.h RdbMem.h RdbBuckets.h Msg5.h Msg3.h RdbMerge.h Dir.h \
HttpMime.h PageCrawlBot.h malloc.c Msg20.h Summary.h matches2.h Query.h \
Words.h StopWords.h Titledb.h DiskPageCache.h Bits.h Pos.h Matches.h \
HashTableT.h Domains.h CountryCode.h Tagdb.h Msg0.h Indexdb.h Events.h \
Sections.h IndexList.h Dates.h
MemPool.o: MemPool.cpp gb-include.h types.h fctypes.h Unicode.h \
UnicodeProperties.h UCPropTable.h iconv.h hash.h Errno.h Log.h MemPool.h \
MemPoolTree.h Mem.h Conf.h Xml.h XmlNode.h Lang.h Iso8859.h \
@ -1930,7 +1932,7 @@ Msg4.o: Msg4.cpp gb-include.h types.h fctypes.h Unicode.h \
Msg13.h Msge0.h Msge1.h Msg8b.h SearchInput.h Msg40.h Msg39.h Msg37.h \
Posdb.h TopTree.h IndexTable2.h Msg51.h Msg17.h IndexReadInfo.h Msg3a.h \
Stats.h PostQueryRerank.h Sanity.h SiteGetter.h Title.h Address.h zlib.h \
zconf.h Syncdb.h
zconf.h Syncdb.h Process.h
Msg51.o: Msg51.cpp gb-include.h types.h fctypes.h Unicode.h \
UnicodeProperties.h UCPropTable.h iconv.h hash.h Errno.h Log.h Msg51.h \
Msg0.h UdpServer.h Mem.h Conf.h Xml.h XmlNode.h Lang.h Iso8859.h \
@ -3178,7 +3180,8 @@ RdbBase.o: RdbBase.cpp gb-include.h types.h fctypes.h Unicode.h \
Repair.h XmlDoc.h Phrases.h LangList.h Images.h Msg36.h Msg13.h Msge0.h \
Msge1.h MsgC.h Dns.h DnsProtocol.h Msg8b.h SearchInput.h Msg40.h Msg39.h \
Msg37.h TopTree.h IndexTable2.h Msg51.h Msg17.h Msg3a.h \
PostQueryRerank.h Sanity.h SiteGetter.h Title.h Address.h HttpMime.h
PostQueryRerank.h Sanity.h SiteGetter.h Title.h Address.h HttpMime.h \
Rebalance.h
RdbBuckets.o: RdbBuckets.cpp RdbBuckets.h Mem.h Conf.h Xml.h XmlNode.h \
gb-include.h types.h fctypes.h Unicode.h UnicodeProperties.h \
UCPropTable.h iconv.h hash.h Errno.h Log.h Lang.h Iso8859.h \
@ -3334,7 +3337,12 @@ RdbMerge.o: RdbMerge.cpp gb-include.h types.h fctypes.h Unicode.h \
openssl/ssl23.h openssl/srtp.h Collectiondb.h HashTableX.h PingServer.h \
RdbScan.h BigFile.h RdbMap.h RdbList.h RdbDump.h RdbTree.h RdbMem.h \
RdbBuckets.h RdbCache.h Msg5.h Msg3.h RdbMerge.h Dir.h Indexdb.h \
DiskPageCache.h Titledb.h Process.h
DiskPageCache.h Titledb.h Process.h Spider.h Msg4.h Msg1.h UdpServer.h \
UdpSlot.h UdpProtocol.h Multicast.h Threads.h Msg0.h Clusterdb.h \
Linkdb.h Msg2.h Query.h Msg20.h Summary.h matches2.h Words.h StopWords.h \
Bits.h Pos.h Matches.h HashTableT.h Domains.h CountryCode.h Tagdb.h \
Events.h Sections.h IndexList.h Dates.h Msg22.h CatRec.h Categories.h \
HashTable.h Catdb.h Datedb.h
RdbScan.o: RdbScan.cpp gb-include.h types.h fctypes.h Unicode.h \
UnicodeProperties.h UCPropTable.h iconv.h hash.h Errno.h Log.h RdbScan.h \
BigFile.h File.h Mem.h Conf.h Xml.h XmlNode.h Lang.h Iso8859.h \
@ -4508,7 +4516,8 @@ Xml.o: Xml.cpp gb-include.h types.h fctypes.h Unicode.h \
openssl/ssl23.h openssl/srtp.h Collectiondb.h HashTableX.h PingServer.h \
Titledb.h Rdb.h RdbBase.h RdbScan.h BigFile.h RdbMap.h RdbList.h \
RdbDump.h RdbTree.h RdbMem.h RdbBuckets.h RdbCache.h Msg5.h Msg3.h \
RdbMerge.h Dir.h DiskPageCache.h Words.h StopWords.h Entities.h
RdbMerge.h Dir.h DiskPageCache.h Words.h StopWords.h HttpMime.h \
Entities.h
XmlDoc.o: XmlDoc.cpp gb-include.h types.h fctypes.h Unicode.h \
UnicodeProperties.h UCPropTable.h iconv.h hash.h Errno.h Log.h XmlDoc.h \
Lang.h Iso8859.h iana_charset.h Words.h Xml.h XmlNode.h SafeBuf.h \

Mem.cpp (107)

@ -10,11 +10,13 @@
//#include "MemPoolVar.h"
//#include "malloc.h"
//#include "Stats.h"
#include "Pages.h"
// put me back
//#define _EFENCE_
//#define EFENCE
#define EFENCE_SIZE 100000
// uncomment this for _EFENCE_ to do underflow checks instead of the
// uncomment this for EFENCE to do underflow checks instead of the
// default overflow checks
//#define _CHECKUNDERFLOW_
@ -50,7 +52,7 @@
// there because it will hit a different PAGE, to be more sure we could
// make UNDERPAD and OVERPAD PAGE bytes, although the overrun could still write
// to another allocated area of memory and we can never catch it.
#ifdef _EFENCE_
#ifdef EFENCE
#define UNDERPAD 0
#define OVERPAD 0
#else
@ -66,7 +68,7 @@ extern bool g_isYippy;
bool freeCacheMem();
#ifdef _EFENCE_
#ifdef EFENCE
static void *getElecMem ( long size ) ;
static void freeElecMem ( void *p ) ;
#endif
@ -148,7 +150,9 @@ void mutexUnlock ( ) {
// make it big for production machines
//#define DMEMTABLESIZE (1024*602)
// there should not be too many mallocs any more
#define DMEMTABLESIZE (1024*302)
// i boosted from 300k to 600k so we can get summaries for 150k results
// for the csv download...
#define DMEMTABLESIZE (1024*602)
//#define DMEMTABLESIZE (1024*202)
// and small for local machine
//#define DMEMTABLESIZE (1024*50)
@ -248,7 +252,7 @@ void * operator new (size_t size) throw (std::bad_alloc) {
throw std::bad_alloc();
//throw 1;
}
#ifdef _EFENCE_
#ifdef EFENCE
void *mem = getElecMem(size);
#else
//void *mem = dlmalloc ( size );
@ -266,7 +270,7 @@ newmemloop:
//return NULL;
}
if ( (unsigned long)mem < 0x00010000 ) {
#ifdef _EFENCE_
#ifdef EFENCE
void *remem = getElecMem(size);
#else
void *remem = sysmalloc(size);
@ -274,7 +278,7 @@ newmemloop:
log ( LOG_WARN, "mem: Caught low memory allocation at %08lx, "
"reallocated to %08lx", (unsigned long)mem,
(unsigned long)remem );
#ifdef _EFENCE_
#ifdef EFENCE
freeElecMem (mem);
#else
sysfree(mem);
@ -326,7 +330,7 @@ void * operator new [] (size_t size) throw (std::bad_alloc) {
throw std::bad_alloc();
//throw 1;
}
#ifdef _EFENCE_
#ifdef EFENCE
void *mem = getElecMem(size);
#else
//void *mem = dlmalloc ( size );
@ -345,7 +349,7 @@ newmemloop:
//return NULL;
}
if ( (unsigned long)mem < 0x00010000 ) {
#ifdef _EFENCE_
#ifdef EFENCE
void *remem = getElecMem(size);
#else
void *remem = sysmalloc(size);
@ -353,7 +357,7 @@ newmemloop:
log ( LOG_WARN, "mem: Caught low memory allocation at %08lx, "
"reallocated to %08lx",
(long)mem, (long)remem );
#ifdef _EFENCE_
#ifdef EFENCE
freeElecMem (mem);
#else
sysfree(mem);
@ -423,6 +427,7 @@ pid_t Mem::getPid() {
bool Mem::init ( long long maxMem ) {
// set main process pid
s_pid = getpid();
// . don't swap our memory out, man...
// . damn, linux 2.4.17 seems to crash the kernel sometimes w/ this
//if ( mlockall( MCL_CURRENT | MCL_FUTURE ) == -1 ) {
@ -440,10 +445,37 @@ bool Mem::init ( long long maxMem ) {
if ( g_conf.m_detectMemLeaks )
log(LOG_INIT,"mem: Memory leak checking is enabled.");
#ifdef _EFENCE_
#ifdef EFENCE
log(LOG_INIT,"mem: using electric fence!!!!!!!");
#endif
// if we can't alloc 3gb exit and retry
long long start = gettimeofdayInMilliseconds();
char *pools[30];
long long count = 0LL;
long long chunk = 100000000LL; // 100MB chunks
long long need = 3000000000LL; // 3GB
long i = 0; for ( i = 0 ; i < 30 ; i++ ) {
pools[i] = (char *)mmalloc(chunk,"testmem");
count += chunk;
if ( pools[i] ) continue;
count -= chunk;
log("mem: could only alloc %lli bytes of the "
"%lli required to run gigablast. exiting.",
count , need );
break;
}
for ( long j = 0 ; j < i ; j++ )
mfree ( pools[j] , chunk , "testmem" );
long long now = gettimeofdayInMilliseconds();
long long took = now - start;
if ( took > 20 ) log("mem: took %lli ms to check memory ceiling",took);
// return if could not alloc the full 3GB
if ( i < 30 ) return false;
// reset this, our max mem used over time ever because we don't
// want the mem test we did above to count towards it
m_maxAlloced = 0;
// init or own malloc stuff in malloc.c (from doug leay)
//if ( mdw_init_sbrk ( maxMem ) ) return true;
// bitch
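
The new block above is a preflight check: temporarily allocate the full 3GB ceiling in 100MB chunks, free it all, and refuse to start if the whole amount was not obtainable. A minimal standalone version of the same idea, using plain malloc instead of mmalloc (illustrative only):

#include <cstdlib>

// returns true only if "need" bytes could be allocated in "chunk"-sized
// pieces, i.e. the configured memory ceiling is actually available
static bool memoryCeilingAvailable ( long long need , long long chunk ) {
	long n = (long)( ( need + chunk - 1 ) / chunk );
	char **pools = (char **)calloc ( n , sizeof(char *) );
	if ( ! pools ) return false;
	bool ok = true;
	for ( long i = 0 ; i < n ; i++ ) {
		pools[i] = (char *)malloc ( (size_t)chunk );
		if ( ! pools[i] ) { ok = false; break; }
	}
	// give it all back; this was only a probe
	for ( long i = 0 ; i < n ; i++ )
		if ( pools[i] ) free ( pools[i] );
	free ( pools );
	return ok;
}

// usage mirroring Mem::init(): 3GB in 100MB chunks
// if ( ! memoryCeilingAvailable ( 3000000000LL , 100000000LL ) ) return false;
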
@ -652,24 +684,24 @@ bool Mem::printMemBreakdownTable ( SafeBuf* sb,
// make sure the admin viewing this table knows that there will be
// frees in here that are delayed if electric fence is enabled.
#ifdef _EFENCE_
#ifdef EFENCE
ss = " <font color=red>*DELAYED FREES ENABLED*</font>";
#endif
sb->safePrintf (
"<table>"
"<table cellpadding=4 width=100%% bgcolor=#%s border=1>"
"<table %s>"
"<tr>"
"<td colspan=3 bgcolor=#%s>"
"<center><b>Mem Breakdown%s</b></td></tr>\n"
"<tr>"
"<tr bgcolor=#%s>"
"<td><b>allocator</b></td>"
"<td><b>num allocs</b></td>"
"<td><b>allocated</b></td>"
"</tr>" ,
lightblue, darkblue , ss );
TABLE_STYLE, darkblue , ss , darkblue );
long n = m_numAllocated * 2;
MemEntry *e = (MemEntry *)mcalloc ( sizeof(MemEntry) * n , "Mem" );
@ -756,11 +788,12 @@ bool Mem::printMemBreakdownTable ( SafeBuf* sb,
// now print into buffer
for ( long i = 0 ; i < count ; i++ )
sb->safePrintf (
"<tr>"
"<tr bgcolor=%s>"
"<td>%s</td>"
"<td>%li</td>"
"<td>%li</td>"
"</tr>\n",
LIGHT_BLUE,
winners[i]->m_label,
winners[i]->m_numAllocs,
winners[i]->m_allocated);
@ -1242,14 +1275,24 @@ void *Mem::gbmalloc ( int size , const char *note ) {
return NULL;
}
void *mem;
// to find bug that cores on malloc do this
//printBreeches(true);
//g_errno=ENOMEM;return (void *)log("Mem::malloc: reached mem limit");}
#ifdef _EFENCE_
void *mem = getElecMem(size+UNDERPAD+OVERPAD);
#else
#ifdef EFENCE
mem = getElecMem(size+UNDERPAD+OVERPAD);
// conditional electric fence?
#elif EFENCE_BIG
if ( size >= EFENCE_SIZE )
mem = getElecMem(size+0+0);
else
mem = (void *)sysmalloc ( size + UNDERPAD + OVERPAD );
#else
//void *mem = dlmalloc ( size );
void *mem = (void *)sysmalloc ( size + UNDERPAD + OVERPAD );
mem = (void *)sysmalloc ( size + UNDERPAD + OVERPAD );
#endif
// initialization debug
//char *pend = (char *)mem + UNDERPAD + size;
@ -1321,7 +1364,7 @@ mallocmemloop:
return NULL;
}
if ( (unsigned long)mem < 0x00010000 ) {
#ifdef _EFENCE_
#ifdef EFENCE
void *remem = getElecMem(size);
#else
void *remem = sysmalloc(size);
@ -1329,7 +1372,7 @@ mallocmemloop:
log ( LOG_WARN, "mem: Caught low memory allocation at %08lx, "
"reallocated to %08lx",
(unsigned long)mem, (unsigned long)remem );
#ifdef _EFENCE_
#ifdef EFENCE
freeElecMem (mem);
#else
sysfree(mem);
@ -1392,7 +1435,9 @@ void *Mem::gbrealloc ( void *ptr , int oldSize , int newSize ,
char *mem;
#ifdef _EFENCE_
// even though size may be < 100k for EFENCE_BIG, do it this way
// for simplicity...
#if defined(EFENCE) || defined(EFENCE_BIG)
mem = (char *)mmalloc ( newSize , note );
if ( ! mem ) return NULL;
// copy over to it
@ -1471,10 +1516,19 @@ void Mem::gbfree ( void *ptr , int size , const char *note ) {
char *xx = NULL; *xx = 0;
}
#ifdef _EFENCE_
#ifdef EFENCE
// this does a delayed free so do not call rmMem() just yet
freeElecMem ((char *)ptr - UNDERPAD );
#else
return;
#endif
#ifdef EFENCE_BIG
if ( size >= EFENCE_SIZE ) {
freeElecMem ((char *)ptr - 0 );
return;
}
#endif
bool isnew = s_isnew[slot];
// if this returns false it was an unbalanced free
@ -1482,7 +1536,6 @@ void Mem::gbfree ( void *ptr , int size , const char *note ) {
if ( isnew ) sysfree ( (char *)ptr );
else sysfree ( (char *)ptr - UNDERPAD );
#endif
}
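
Taken together, the EFENCE_BIG hunks gate the electric-fence allocator by size: only allocations of EFENCE_SIZE bytes or more pay the guard-page cost, smaller ones stay on the normal padded malloc path. A self-contained sketch of that gating (getElecMem/freeElecMem are stubbed with plain malloc here; the real ones presumably back the block with protected pages):

#include <cstdlib>

#define EFENCE_SIZE 100000   // same threshold as the diff

// stubs so the sketch links; the real versions use guarded memory
static void *getElecMem ( long size ) { return malloc ( (size_t)size ); }
static void  freeElecMem ( void *p )  { free ( p ); }

static void *gatedMalloc ( long size ) {
	if ( size >= EFENCE_SIZE )
		return getElecMem ( size );   // big buffer: catch overruns hard
	return malloc ( (size_t)size );       // small buffer: stay cheap
}

static void gatedFree ( void *p , long size ) {
	if ( size >= EFENCE_SIZE ) { freeElecMem ( p ); return; }
	free ( p );
}
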
long getLowestLitBitLL ( unsigned long long bits ) {

@ -53,7 +53,7 @@ bool Monitordb::init ( ) {
sizeof(key96_t) ,
true ); // bias page cache? (true!)
}
/*
bool Monitordb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;
@ -65,7 +65,7 @@ bool Monitordb::addColl ( char *coll, bool doVerify ) {
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}
*/
bool Monitordb::verify ( char *coll ) {
log ( LOG_INFO, "db: Verifying Monitordb for coll %s...", coll );
g_threads.disableThreads();

@ -1156,10 +1156,12 @@ void gotHttpReply2 ( void *state ,
// . if no user-agent line matches * or gigabot/flurbot we
// will get just a \0 for the reply, replySize=1!
//char *ua = "ProCogBot";//"EventGuruBot";//r->m_userAgent;
char *ua = "Gigabot";
long uaLen = gbstrlen(ua);
replySize = filterRobotsTxt (reply,replySize,&mime,niceness,
ua,uaLen);
// take this out until it works for
// user-agent: *\ndisallow: blah
//char *ua = "Gigabot";
//long uaLen = gbstrlen(ua);
//replySize = filterRobotsTxt (reply,replySize,&mime,niceness,
// ua,uaLen);
// record in the stats
docsPtr = &g_stats.m_compressRobotsTxtDocs;
bytesInPtr = &g_stats.m_compressRobotsTxtBytesIn;
@ -2020,7 +2022,7 @@ bool getIframeExpandedContent ( Msg13Request *r , TcpSocket *ts ) {
xd->m_r = r;
// so XmlDoc::getExtraDoc doesn't have any issues
xd->m_firstIp = 0;
xd->m_firstIp = 123456;
xd->m_firstIpValid = true;
// try using xmldoc to do it

@ -20,9 +20,16 @@ void Msg20::constructor () {
void Msg20::destructor () { reset(); m_mcast.destructor(); }
#include "Process.h"
void Msg20::reset() {
// not allowed to reset one in progress
if ( m_inProgress ) { char *xx=NULL;*xx=0; }
if ( m_inProgress ) {
// do not core on abrupt exits!
if (g_process.m_mode == EXIT_MODE ) return;
// otherwise core
char *xx=NULL;*xx=0;
}
m_launched = false;
if ( m_request && m_request != m_requestBuf )
mfree ( m_request , m_requestSize , "Msg20rb" );

@ -334,7 +334,8 @@ void handleRequest22 ( UdpSlot *slot , long netnice ) {
// get base, returns NULL and sets g_errno to ENOCOLLREC on error
RdbBase *tbase;
if ( ! (tbase=getRdbBase(RDB_TITLEDB,coll) ) ) {
log("db: Could not get title rec in collection \"%s\".",
log("db: Could not get title rec in collection \"%s\" "
"because rdbbase is null.",
coll);
g_errno = EBADENGINEER;
us->sendErrorReply ( slot , g_errno );

@ -427,16 +427,24 @@ bool Msg39::getLists () {
// if we have twins, then make sure the twins read different
// pieces of the same docid range to make things 2x faster
bool useTwins = false;
if ( g_hostdb.getNumStripes() == 2 ) useTwins = true;
if ( useTwins ) {
long long delta2 = ( docIdEnd - docIdStart ) / 2;
if ( m_r->m_stripe == 0 ) docIdEnd = docIdStart + delta2;
else docIdStart = docIdStart + delta2;
}
//bool useTwins = false;
//if ( g_hostdb.getNumStripes() == 2 ) useTwins = true;
//if ( useTwins ) {
// long long delta2 = ( docIdEnd - docIdStart ) / 2;
// if ( m_r->m_stripe == 0 ) docIdEnd = docIdStart + delta2;
// else docIdStart = docIdStart + delta2;
//}
// new striping logic:
long numStripes = g_hostdb.getNumStripes();
long long delta2 = ( docIdEnd - docIdStart ) / numStripes;
long stripe = g_hostdb.getMyHost()->m_stripe;
docIdStart += delta2 * stripe; // is this right?
docIdEnd = docIdStart + delta2;
// add 1 to be safe so we don't lose a docid
docIdEnd++;
// TODO: add triplet support later for this to split the
// read 3 ways. 4 ways for quads, etc.
if ( g_hostdb.getNumStripes() >= 3 ) { char *xx=NULL;*xx=0;}
//if ( g_hostdb.getNumStripes() >= 3 ) { char *xx=NULL;*xx=0;}
// do not go over MAX_DOCID because it gets masked and
// ends up being 0!!! and we get empty lists
if ( docIdEnd > MAX_DOCID ) docIdEnd = MAX_DOCID;
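
A compact restatement of the new striping math above as a hypothetical helper (not in the commit): every stripe reads an equal slice of the docid range, plus one extra docid so the truncating division cannot drop one at the slice boundary.

// split [docIdStart,docIdEnd] into numStripes contiguous slices and
// return the slice this host's stripe should read
static void getStripeDocIdRange ( long long  docIdStart ,
                                  long long  docIdEnd   ,
                                  long       numStripes ,
                                  long       stripe     ,
                                  long long *outStart   ,
                                  long long *outEnd     ) {
	long long delta = ( docIdEnd - docIdStart ) / numStripes;
	*outStart = docIdStart + delta * stripe;
	*outEnd   = *outStart + delta;
	// add 1 to be safe so we don't lose a docid to truncation
	*outEnd  += 1;
}

// e.g. two stripes over docids [0,1000]: stripe 0 reads [0,501],
// stripe 1 reads [500,1001]; the caller still caps the end at MAX_DOCID.
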

@ -541,8 +541,9 @@ bool Msg4::addMetaList ( char *metaList ,
s_msg4Tail->m_next = this;
// we are the new tail
s_msg4Tail = this;
// debug log
log("msg4: queueing body msg4=0x%lx",(long)this);
// debug log. seems to happen a lot if not using threads..
if ( g_conf.m_useThreads )
log("msg4: queueing body msg4=0x%lx",(long)this);
// mark it
m_inUse = true;
// all done then, but return false so caller does not free
@ -556,8 +557,10 @@ bool Msg4::addMetaList ( char *metaList ,
// sanity check
if ( s_msg4Head || s_msg4Tail ) { char *xx=NULL; *xx=0; }
// spider hang bug
logf(LOG_DEBUG,"msg4: queueing head msg4=0x%lx",(long)this);
// . spider hang bug
// . debug log. seems to happen a lot if not using threads..
if ( g_conf.m_useThreads )
logf(LOG_DEBUG,"msg4: queueing head msg4=0x%lx",(long)this);
// mark it
m_inUse = true;
@ -1062,8 +1065,10 @@ void storeLineWaiters ( ) {
// . if his callback was NULL, then was loaded in loadAddsInProgress()
// . we no longer do that so callback should never be null now
if ( ! msg4->m_callback ) { char *xx=NULL;*xx=0; }
// log this now i guess
logf(LOG_DEBUG,"msg4: calling callback for msg4=0x%lx",(long)msg4);
// log this now i guess. seems to happen a lot if not using threads
if ( g_conf.m_useThreads )
logf(LOG_DEBUG,"msg4: calling callback for msg4=0x%lx",
(long)msg4);
// release it
msg4->m_inUse = false;
// call his callback
@ -1074,7 +1079,7 @@ void storeLineWaiters ( ) {
goto loop;
}
#include "Process.h"
// . destroys the slot if false is returned
// . this is registered in Msg4::set() to handle add rdb record msgs

@ -17,7 +17,7 @@
// increasing this doesn't seem to improve performance any on a single
// node cluster....
#define MAX_OUTSTANDING_MSG20S 50
#define MAX_OUTSTANDING_MSG20S 200
//static void handleRequest40 ( UdpSlot *slot , long netnice );
//static void gotExternalReplyWrapper ( void *state , void *state2 ) ;
@ -1184,6 +1184,10 @@ bool gotSummaryWrapper ( void *state ) {
Msg40 *THIS = (Msg40 *)state;
// inc it here
THIS->m_numReplies++;
// log every 1000 i guess
if ( (THIS->m_numReplies % 1000) == 0 )
log("msg40: got %li summaries out of %li",THIS->m_numReplies,
THIS->m_msg3a.m_numDocIds);
// it returns false if we're still awaiting replies
if ( ! THIS->gotSummary ( ) ) return false;
// now call callback, we're done
@ -1217,11 +1221,24 @@ bool Msg40::gotSummary ( ) {
// reset g_errno
g_errno = 0;
}
/*
// sanity check
for ( long i = 0 ; i < m_msg3a.m_numDocIds ; i++ ) {
// stop as soon as we hit a gap breaking our contiguity...
Msg20 *m = m_msg20[i];
if ( ! m ) continue;
Msg20Reply *mr = m->m_r;
if ( ! mr ) continue;
char *cc = mr->ptr_content;
if ( ! cc ) continue;
//if ( ! strstr(cc,"Modern Marketing KF400032MA") ) continue;
//log("hey");
//fprintf(stderr,"msg %li = %s\n",i,cc );
if ( i == 48329 ) { char *xx=NULL;*xx=0; }
mr->ptr_content = NULL;
}
*/
// . ok, now i wait for everybody.
// . TODO: evaluate if this hurts us
if ( m_numReplies < m_numRequests )
return false;
doAgain:
@ -1245,6 +1262,11 @@ bool Msg40::gotSummary ( ) {
//char *xx=NULL; *xx=0;
}
// . ok, now i wait for everybody.
// . TODO: evaluate if this hurts us
if ( m_numReplies < m_numRequests )
return false;
// save this before we increment m_numContiguous
//long oldNumContiguous = m_numContiguous;

@ -22,6 +22,7 @@ long g_numCorrupt = 0;
Msg5::Msg5() {
m_waitingForList = false;
//m_waitingForMerge = false;
m_numListPtrs = 0;
m_mergeLists = true;
reset();
@ -33,7 +34,7 @@ Msg5::~Msg5() {
// frees m_treeList
void Msg5::reset() {
if ( m_waitingForList ) {
if ( m_waitingForList ) { // || m_waitingForMerge ) {
log("disk: Trying to reset a class waiting for a reply.");
// might being doing an urgent exit (mainShutdown(1)) or
// g_process.shutdown(), so do not core here
@ -45,7 +46,6 @@ void Msg5::reset() {
m_prevCount = 0;
//m_prevKey.setMin();
KEYMIN(m_prevKey,MAX_KEY_BYTES);// m_ks); m_ks is invalid
m_waitingForList = false;
// free lists if m_mergeLists was false
for ( long i = 0 ; ! m_mergeLists && i < m_numListPtrs ; i++ )
m_listPtrs[i]->freeList();
@ -203,6 +203,13 @@ bool Msg5::getList ( char rdbId ,
// remember stuff
m_rdbId = rdbId;
m_coll = coll;
m_collnum = g_collectiondb.getCollnum ( coll );
if ( m_collnum < 0 ) {
g_errno = ENOCOLLREC;
return true;
}
m_list = list;
//m_startKey = startKey;
//m_endKey = endKey;
@ -466,7 +473,12 @@ bool Msg5::getList ( char rdbId ,
// timing debug
//log("Msg5:getting list startKey.n1=%lu",m_startKey.n1);
// start the read loop - hopefully, will only loop once
return readList ( );
if ( readList ( ) ) return true;
// tell Spider.cpp not to nuke us until we get back!!!
m_waitingForList = true;
// we blocked!!! must call m_callback
return false;
}
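
The m_waitingForList changes follow an in-flight-flag pattern: set the flag only when the read actually blocks, and clear it in the completion wrappers, so reset() and Spider.cpp's collection delete can tell whether the Msg5 may be torn down. A generic sketch of that pattern under those assumptions (Msg5 itself carries far more state than this):

struct AsyncReader {
	bool m_waitingForList;               // true while a read is outstanding

	AsyncReader ( ) : m_waitingForList(false) { }

	bool readList ( ) { return true; }    // stand-in for Msg5::readList()

	// returns true if the read completed synchronously, false if it blocked
	bool startRead ( ) {
		if ( readList ( ) ) return true; // done right away, flag stays false
		m_waitingForList = true;          // blocked: do not nuke me yet
		return false;
	}

	// called by the I/O layer when the blocked read finishes
	void onReadDone ( ) {
		m_waitingForList = false;         // safe to reset/delete again
		// ... invoke the user callback here ...
	}
};
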
// . returns false if blocked, true otherwise
// . sets g_errno on error
@ -725,7 +737,7 @@ bool Msg5::readList ( ) {
if ( m_treeList.m_ks != m_ks ) { char *xx = NULL; *xx = 0; }
// we are waiting for the list
m_waitingForList = true;
//m_waitingForList = true;
// clear just in case
g_errno = 0;
@ -915,6 +927,8 @@ void gotListWrapper ( void *state ) {
if ( THIS->m_calledCallback ) { char *xx=NULL;*xx=0; }
// set it now
THIS->m_calledCallback = 1;
// we are no longer waiting for the list
THIS->m_waitingForList = false;
// when completely done call the callback
THIS->m_callback ( THIS->m_state , THIS->m_list , THIS );
}
@ -931,7 +945,7 @@ static void *mergeListsWrapper_r ( void *state , ThreadEntry *t ) ;
bool Msg5::gotList ( ) {
// we are no longer waiting for the list
m_waitingForList = false;
//m_waitingForList = false;
// debug msg
//log("msg5 got lists from msg3 (msg5=%lu)",(long)this);
@ -1064,8 +1078,15 @@ bool Msg5::gotList2 ( ) {
// sanity check
//if ( KEYNEG(m_listPtrs[i]->getEndKey()) ) {
// char *xx=NULL;*xx=0; }
if ( KEYCMP(m_listPtrs[i]->getEndKey(),m_minEndKey,m_ks)<0 )
if ( KEYCMP(m_listPtrs[i]->getEndKey(),m_minEndKey,m_ks)<0 ) {
KEYSET(m_minEndKey,m_listPtrs[i]->getEndKey(),m_ks);
// crap, if list is all negative keys, then the
// end key seems negative too! however in this
// case RdbScan::m_endKey seems positive so
// maybe we got a negative endkey in constrain?
//if (! (m_minEndKey[0] & 0x01) )
// log("msg5: list had bad endkey");
}
}
// sanity check
//if ( KEYNEG( m_minEndKey) ) {char *xx=NULL;*xx=0; }
@ -1152,7 +1173,7 @@ bool Msg5::gotList2 ( ) {
// filter happens and we have a chance to weed out old titleRecs
if ( m_rdbId == RDB_TITLEDB && m_numFiles != 1 && n == 1 &&
m_isRealMerge ) {
log(LOG_LOGIC,"db: Adding dummy list.");
//log(LOG_LOGIC,"db: Adding dummy list.");
//m_tfns [n] = 255;
m_dummy.set ( NULL , // list data
0 , // list data size
@ -1377,6 +1398,8 @@ bool Msg5::gotList2 ( ) {
// skip it for now
//goto skipThread;
//m_waitingForMerge = true;
// . if size is big, make a thread
// . let's always make niceness 0 since it wasn't being very
// aggressive before
@ -1386,8 +1409,11 @@ bool Msg5::gotList2 ( ) {
threadDoneWrapper ,
mergeListsWrapper_r ) )
return false;
//m_waitingForMerge = false;
// thread creation failed
if ( ! g_threads.areThreadsDisabled() )
if ( g_conf.m_useThreads && ! g_threads.m_disabled )
log(LOG_INFO,
"net: Failed to create thread to merge lists. Doing "
"blocking merge. Hurts performance.");
@ -1441,6 +1467,8 @@ void threadDoneWrapper ( void *state , ThreadEntry *t ) {
if ( THIS->needsRecall() && ! THIS->readList() ) return;
// sanity check
if ( THIS->m_calledCallback ) { char *xx=NULL;*xx=0; }
// we are no longer waiting for the list
THIS->m_waitingForList = false;
// set it now
THIS->m_calledCallback = 3;
// when completely done call the callback
@ -1716,6 +1744,8 @@ void Msg5::mergeLists_r ( ) {
// . we are left with an empty list
bool Msg5::doneMerging ( ) {
//m_waitingForMerge = false;
// get base, returns NULL and sets g_errno to ENOCOLLREC on error
RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_coll))) return true;
@ -2017,6 +2047,8 @@ void gotRemoteListWrapper( void *state ) { // , RdbList *list ) {
if ( ! THIS->gotRemoteList() ) return;
// sanity check
if ( THIS->m_calledCallback ) { char *xx=NULL;*xx=0; }
// we are no longer waiting for the list
THIS->m_waitingForList = false;
// set it now
THIS->m_calledCallback = 4;
// if it doesn't block call the callback, g_errno may be set

Msg5.h

@ -293,6 +293,8 @@ class Msg5 {
bool m_mergeLists;
char m_waitingForList;
//char m_waitingForMerge;
collnum_t m_collnum;
// actually part of a different algo than m_waitingForList!
unsigned long long m_waitingKey;
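The Msg5 hunks above change where m_waitingForList is raised and lowered: it is now set only when getList() actually blocks and cleared in the completion wrappers, apparently so reset() (and Spider.cpp) can refuse to tear the object down while a reply is still pending. A minimal sketch of that in-flight guard, assuming hypothetical names:

#include <cstdio>
#include <functional>

// Hypothetical async reader guarded by an "in flight" flag so the owner
// never resets it while a callback is still pending.
struct AsyncReader {
    bool m_waitingForList = false;
    std::function<void()> m_callback;

    // returns false if it blocked (callback will fire later)
    bool getList(bool willBlock, std::function<void()> cb) {
        m_callback = cb;
        if (!willBlock) return true;      // completed inline
        m_waitingForList = true;          // raise the guard only when we block
        return false;
    }

    // completion wrapper: lower the guard before invoking the callback
    void gotListWrapper() {
        m_waitingForList = false;
        m_callback();
    }

    void reset() {
        if (m_waitingForList)
            printf("reset while a reply is pending -- refusing\n");
    }
};

int main() {
    AsyncReader r;
    if (!r.getList(true, [] { printf("list arrived\n"); })) {
        r.reset();            // guard is up, reset is refused
        r.gotListWrapper();   // simulated completion lowers the guard
    }
    r.reset();                // now safe
    return 0;
}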

@ -118,8 +118,14 @@ bool Msge0::launchRequests ( long starti ) {
loop:
// stop if no more urls. return true if we got all replies! no block.
if ( m_n >= m_numUrls ) return (m_numRequests == m_numReplies);
// if all hosts are getting a diffbot reply with 50 spiders and they
// all timeout at the same time we can very easily clog up the
// udp sockets, so use this to limit... i've seen the whole
// spider tables stuck with "getting outlink tag rec vector" statuses
long maxOut = MAX_OUTSTANDING_MSGE0;
if ( g_udpServer.m_numUsedSlots > 500 ) maxOut = 1;
// if we are maxed out, we basically blocked!
if (m_numRequests - m_numReplies >= MAX_OUTSTANDING_MSGE0)return false;
if (m_numRequests - m_numReplies >= maxOut ) return false;
// . skip if "old"
// . we are not planning on adding this to spiderdb, so Msg16
// want to skip the ip lookup, etc.
@ -145,7 +151,8 @@ bool Msge0::launchRequests ( long starti ) {
// . grab a slot
// . m_msg8as[i], m_msgCs[i], m_msg50s[i], m_msg20s[i]
long i;
for ( i = starti ; i < MAX_OUTSTANDING_MSGE0 ; i++ )
// make this 0 since "maxOut" now changes!!
for ( i = 0 /*starti*/ ; i < MAX_OUTSTANDING_MSGE0 ; i++ )
if ( ! m_used[i] ) break;
// sanity check
if ( i >= MAX_OUTSTANDING_MSGE0 ) { char *xx = NULL; *xx = 0; }
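The Msge0 hunks above throttle how many outlink lookups may be outstanding at once: when the UDP server already has more than 500 slots in use, the cap drops from MAX_OUTSTANDING_MSGE0 to 1 so a burst of timed-out diffbot replies cannot clog the sockets. A standalone sketch of that adaptive cap, with the thresholds taken from the diff and everything else hypothetical:

#include <cstdio>

const int MAX_OUTSTANDING_MSGE0 = 30;   // hypothetical compile-time cap

// Pick the effective cap from current UDP slot pressure, mirroring the
// logic in launchRequests(): back off to a single outstanding request
// when the transport layer is already congested.
int effectiveCap(int usedUdpSlots) {
    int maxOut = MAX_OUTSTANDING_MSGE0;
    if (usedUdpSlots > 500) maxOut = 1;
    return maxOut;
}

bool canLaunch(int numRequests, int numReplies, int usedUdpSlots) {
    // if we are maxed out, we basically blocked
    return (numRequests - numReplies) < effectiveCap(usedUdpSlots);
}

int main() {
    printf("idle network: %d\n",      canLaunch(5, 3, 100));  // 1: launch more
    printf("congested network: %d\n", canLaunch(5, 4, 900));  // 0: hold off
    return 0;
}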

@ -443,6 +443,8 @@ void Multicast::gotReply2 ( UdpSlot *slot ) {
long now = getTime();
if (now - s_elastTime > 10) {s_elastTime = now; logIt=true;}
}
// don't log ETRYAGAIN, may come across as bad when it is normal
if ( m_errnos[i] == ETRYAGAIN ) logIt = false;
// log a failure msg
if ( logIt ) { // m_errnos[i] != ETRYAGAIN ) {
Host *h = m_hostdb->getHost ( slot->m_ip ,slot->m_port );

@ -31,7 +31,7 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
char *msg = NULL;
// if any host in network is dead, do not do this
if ( g_hostdb.hasDeadHost() ) msg = "A host in the network is dead.";
//if ( g_hostdb.hasDeadHost() ) msg = "A host in the network is dead.";
// . are we adding a collection?
// . return if error adding, might already exist!
@ -85,15 +85,18 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
// print the add collection box
if ( add /*&& (! nc[0] || g_errno ) */ ) {
p.safePrintf (
"<center>\n<table border=1 cellpadding=4 "
"width=100%% bgcolor=#%s>\n"
"<tr><td colspan=2 bgcolor=#%s>"
"<center>\n<table %s>\n"
"<tr class=hdrow><td colspan=2>"
"<center><b>Add Collection</b></center>"
"</td></tr>\n",LIGHT_BLUE,DARK_BLUE);
"</td></tr>\n",
TABLE_STYLE);
p.safePrintf (
"<tr><td><b>name of new collection to add</td>\n"
"<tr bgcolor=#%s>"
"<td><b>name of new collection to add</td>\n"
"<td><input type=text name=addColl size=30>"
"</td></tr>\n");
"</td></tr>\n"
, LIGHT_BLUE
);
// now list collections from which to copy the config
//p.safePrintf (
// "<tr><td><b>copy configuration from this "
@ -118,27 +121,31 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
// print all collections out in a checklist so you can check the
// ones you want to delete, the values will be the id of that collectn
p.safePrintf (
"<center>\n<table border=1 cellpadding=4 "
"width=100%% bgcolor=#%s>\n"
"<tr><td bgcolor=#%s><center><b>Delete Collections"
"<center>\n<table %s>\n"
"<tr class=hdrow><td><center><b>Delete Collections"
"</b></center></td></tr>\n"
"<tr><td>"
"<tr bgcolor=#%s><td>"
"<center><b>Select the collections you wish to delete. "
//"<font color=red>This feature is currently under "
//"development.</font>"
"</b></center></td></tr>\n"
"<tr><td>"
"<tr bgcolor=#%s><td>"
// table within a table
"<center><table width=20%%>\n",
LIGHT_BLUE,DARK_BLUE);
TABLE_STYLE,
LIGHT_BLUE,
DARK_BLUE
);
for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
CollectionRec *cr = g_collectiondb.m_recs[i];
if ( ! cr ) continue;
p.safePrintf (
"<tr><td>"
"<input type=checkbox name=delete value=\"%s\"> "
"%s</td></tr>\n",cr->m_coll,cr->m_coll);
"<tr bgcolor=#%s><td>"
"<input type=checkbox name=delColl value=\"%s\"> "
"%s</td></tr>\n",
DARK_BLUE,
cr->m_coll,cr->m_coll);
}
p.safePrintf( "</table></center></td></tr></table><br>\n" );
skip:

@ -89,7 +89,7 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
collLen = gbstrlen(coll);
}
// get collection rec
CollectionRec *cr = g_collectiondb.getRec ( coll );
CollectionRec *cr = g_collectiondb.getRec ( r ); // coll );
// bitch if no collection rec found
if ( ! cr ) {
g_errno = ENOCOLLREC;
@ -248,8 +248,6 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
//
SpiderRequest *sreq = &st1->m_sreq;
// set the SpiderRequest from this add url
if ( ! sreq->setFromAddUrl ( st1->m_url ) ) {
if ( ! g_errno ) { char *xx=NULL;*xx=0; }

@ -149,30 +149,54 @@ bool sendReply ( void *state ) {
// . do not print big links if only an assassin, just print host ids
g_pages.printAdminTop ( &sb, st->m_socket , &st->m_r );
sb.safePrintf ( "<table width=100%% bgcolor=#%s border=1 cellpadding=4>"
"<tr><td bgcolor=#%s colspan=2>"
sb.safePrintf(
"<style>"
".poo { background-color:#%s;}\n"
"</style>\n" ,
LIGHT_BLUE );
sb.safePrintf ( "<table %s>"
"<tr><td colspan=2>"
"<center><font size=+1><b>Catdb</b></font></center>"
"</td></tr>", LIGHT_BLUE , DARK_BLUE );
"</td></tr>", TABLE_STYLE );
// instructions
sb.safePrintf("<tr bgcolor=#%s>"
"<td colspan=3>"
"<font size=-2>"
"<center>"
"Don't just start using this, you need to follow the "
"instructions in the <i>admin guide</i> for adding "
"DMOZ support."
"</center>"
"</font>"
"</td>"
"</tr>"
,DARK_BLUE
);
// print the generate Catdb link
sb.safePrintf ( "<tr><td>Update Catdb from DMOZ data.</td>"
sb.safePrintf ( "<tr class=poo><td>Update Catdb from DMOZ data.</td>"
"<td><center>"
"<a href=\"/master/catdb?c=%s&gencatdb=2\">"
"Update Catdb</a> "
"</center></td></tr>",
st->m_coll );
sb.safePrintf ( "<tr><td>Generate New Catdb from DMOZ data.</td>"
sb.safePrintf ( "<tr class=poo>"
"<td>Generate New Catdb from DMOZ data.</td>"
"<td><center>"
"<a href=\"/master/catdb?c=%s&gencatdb=1\">"
"Generate Catdb</a> "
"</center></td></tr>",
st->m_coll );
if (st->m_genCatdb)
sb.safePrintf ( "<tr><td> Catdb Generation took %lli ms."
sb.safePrintf ( "<tr class=poo>"
"<td> Catdb Generation took %lli ms."
"</td></tr>",
endTime - st->m_startTime );
// print Url Category Lookup
sb.safePrintf ( "<tr><td>Lookup Category of Url.</td>"
sb.safePrintf ( "<tr class=poo><td>Lookup Category of Url.</td>"
"<td><input type=text name=caturl size=80"
" value=\"");
if (st->m_catLookup) {

@ -160,6 +160,10 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
rdbId = RDB_SPIDERDB;
fmt = FMT_CSV;
}
else if ( ( xx = strstr ( path , "_urls.txt" ) ) ) {
rdbId = RDB_SPIDERDB;
fmt = FMT_TXT;
}
else if ( ( xx = strstr ( path , "_pages.txt" ) ) ) {
rdbId = RDB_TITLEDB;
fmt = FMT_TXT;
@ -204,6 +208,10 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
SafeBuf sb2(tmp2,5000);
sb2.safePrintf("GET /search.csv?icc=1&format=csv&sc=0&dr=0&"
"c=%s&n=1000000&"
// no gigabits
"dsrt=0&"
// do not compute summary. 0 lines.
"ns=0&"
"q=gbsortby%%3Agbspiderdate&"
"prepend=type%%3Ajson"
"\r\n\r\n"
@ -231,6 +239,7 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
return g_httpServer.sendErrorReply(sock,500,mstrerror(g_errno));
}
mnew ( st , sizeof(StateCD), "statecd");
// initialize the new state
st->m_rdbId = rdbId;
st->m_downloadJSON = downloadJSON;
@ -266,13 +275,60 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
return true;
}
// . all wrappers call this
// . returns false if would block, true otherwise
bool readAndSendLoop ( StateCD *st , bool readFirst ) {
subloop:
// if we had a broken pipe on the sendChunk() call then hopefully
// this will kick in...
if ( g_errno ) {
log("crawlbot: readAndSendLoop: %s",mstrerror(g_errno));
readFirst = true;
st->m_someoneNeedsMore = false;
}
// wait if some are outstanding. how can this happen?
if ( st->m_numRequests > st->m_numReplies ) {
log("crawlbot: only got %li of %li replies. waiting for "
"all to come back in.",
st->m_numReplies,st->m_numRequests);
return false;
}
// are we all done?
if ( readFirst && ! st->m_someoneNeedsMore ) {
log("crawlbot: done sending for download request");
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
return true;
}
// begin reading from each shard and sending the spiderdb records
// over the network. return if that blocked
if ( readFirst && ! st->readDataFromRdb ( ) ) return false;
// send it to the browser socket. returns false if blocks.
if ( ! st->sendList() ) return false;
// read again i guess
readFirst = true;
// hey, it did not block... tcpserver caches writes...
goto subloop;
}
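The new readAndSendLoop() above replaces the old ad-hoc goto loops with one driver that alternates between reading a chunk from the shards and streaming it to the browser, returning whenever either step blocks and cleaning up when nobody needs more data. A minimal sketch of that alternating read/send driver, assuming hypothetical readChunk()/sendChunk() primitives:

#include <cstdio>

// Hypothetical data source/sink used only to illustrate the driver shape.
struct DumpState {
    int  chunksLeft       = 3;
    bool someoneNeedsMore = true;

    bool readChunk() {                 // returns false if it blocked
        if (--chunksLeft <= 0) someoneNeedsMore = false;
        printf("read a chunk (%d left)\n", chunksLeft);
        return true;
    }
    bool sendChunk() {                 // returns false if it blocked
        printf("sent a chunk\n");
        return true;
    }
};

// returns false if it blocked, true when the dump is complete
bool readAndSendLoop(DumpState *st, bool readFirst) {
subloop:
    if (readFirst && !st->someoneNeedsMore) {
        printf("done sending for download request\n");
        return true;                   // the real code frees the state here
    }
    if (readFirst && !st->readChunk()) return false;  // read blocked
    if (!st->sendChunk())              return false;  // send blocked
    readFirst = true;                  // tcp write was buffered, keep going
    goto subloop;
}

int main() { DumpState st; readAndSendLoop(&st, true); return 0; }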
void StateCD::sendBackDump2 ( ) {
m_numRequests = 0;
m_numReplies = 0;
// read 10MB from each shard's spiderdb at a time
m_minRecSizes = 9999999;
//m_minRecSizes = 9999999;
// 100k to be more fluid
m_minRecSizes = 99999;
// we stop reading from all shards when this becomes false
m_someoneNeedsMore = true;
@ -284,20 +340,22 @@ void StateCD::sendBackDump2 ( ) {
KEYMIN((char *)&m_titledbStartKeys[i],sizeof(key_t));
}
subloop:
// begin reading from each shard and sending the spiderdb records
// over the network. return if that blocked
if ( ! readDataFromRdb ( ) ) return;
// send it to the browser socket
if ( ! sendList() ) return;
// . hey, it did not block... i guess no data to send out
// . but if all shards are exhausted from the dump, just return
if ( m_someoneNeedsMore ) goto subloop;
// note it
log("crawlbot: nobody needs more 1");
// begin reading from the shards and transmitting back on m_socket
readAndSendLoop ( this , true );
}
void sendListWrapper ( void *state ) ;
static void gotListWrapper7 ( void *state ) {
// get the Crawler dump State
StateCD *st = (StateCD *)state;
// inc it up here
st->m_numReplies++;
// wait for all
if ( st->m_numReplies < st->m_numRequests ) return;
// read and send loop
readAndSendLoop( st , false );
}
bool StateCD::readDataFromRdb ( ) {
@ -341,7 +399,7 @@ bool StateCD::readDataFromRdb ( ) {
// records
m_minRecSizes,
this,
sendListWrapper ,
gotListWrapper7 ,
niceness ) ) {
log("crawlbot: blocked getting list from shard");
// continue if it blocked
@ -360,22 +418,6 @@ bool StateCD::readDataFromRdb ( ) {
return true;
}
void sendListWrapper ( void *state ) {
// get the Crawler dump State
StateCD *st = (StateCD *)state;
// inc it up here
st->m_numReplies++;
subloop:
// if this blocked sending back some data, return
if ( ! st->sendList() ) return;
// otherwise, read more, maybe had no data to send from list
if ( ! st->readDataFromRdb () ) return;
// send and read more
if ( st->m_someoneNeedsMore ) goto subloop;
// note it
log("crawlbot: nobody needs more 2");
}
bool StateCD::sendList ( ) {
// get the Crawler dump State
// inc it
@ -403,6 +445,7 @@ bool StateCD::sendList ( ) {
// then do so here, the content-length will not be in there
// because we might have to call for more spiderdb data
if ( m_needsMime ) {
m_needsMime = false;
HttpMime mime;
mime.makeMime ( -1, // total content-length is unknown!
0 , // do not cache (cacheTime)
@ -496,6 +539,13 @@ bool StateCD::sendList ( ) {
list->freeList();
}
//log("rdbid=%li fmt=%li some=%li printed=%li",
// (long)m_rdbId,(long)m_fmt,(long)m_someoneNeedsMore,
// (long)m_printedEndingBracket);
bool lastChunk = false;
if ( ! m_someoneNeedsMore )
lastChunk = true;
// if nobody needs to read more...
if ( m_rdbId == RDB_TITLEDB &&
@ -504,113 +554,31 @@ bool StateCD::sendList ( ) {
! m_printedEndingBracket ) {
m_printedEndingBracket = true;
// end array of json objects. might be empty!
sb.safePrintf("\n]");
sb.safePrintf("\n]\n");
//log("adding ]. len=%li",sb.length());
}
// if first time, send it back
if ( m_needsMime ) {
// only do once
m_needsMime = false;
TcpServer *tcp = &g_httpServer.m_tcp;
sendLoop:
// start the send process
TcpServer *tcp = &g_httpServer.m_tcp;
if ( ! tcp->sendMsg ( m_socket ,
sb.getBufStart(), // sendBuf ,
sb.getCapacity(),//sendBufSize ,
sb.length(),//sendBufSize ,
sb.length(), // msgtotalsize
this , // data for callback
doneSendingWrapper ) ) { // callback
// do not free sendbuf we are transmitting it
sb.detachBuf();
return false;
}
// error?
//TcpSocket *s = m_socket;
// sometimes it does not block and is successful because
// it just writes its buffer out in one write call.
//if ( ! g_errno )
sb.detachBuf();
// . transmit the chunk in sb
// . steals the allocated buffer from sb and stores in the
// TcpSocket::m_sendBuf, which it frees when socket is
// ultimately destroyed or we call sendChunk() again.
// . when TcpServer is done transmitting, it does not close the
// socket but rather calls doneSendingWrapper() which can call
// this function again to send another chunk
// . when we are truly done sending all the data, then we set lastChunk
// to true and TcpServer.cpp will destroy m_socket when done
if ( ! tcp->sendChunk ( m_socket ,
&sb ,
this ,
doneSendingWrapper ,
lastChunk ) )
return false;
// log it
//log("crawlbot: nuking state. strange");
// nuke state
//delete this;
//mdelete ( this , sizeof(StateCD) , "stcd" );
//if ( g_errno )
log("diffbot: tcp sendmsg did not block: %s",
mstrerror(g_errno));
//g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
// wait for doneSendingWrapper to be called.
//return false;
//
// it did not block... so just keep going. that just
// means the socket sent the data. it's probably buffered.
//
// but we DO have to free the sendbuffer here since
// we did not block
mfree ( m_socket->m_sendBuf ,
m_socket->m_sendBufSize ,
"dbsbuf");
m_socket->m_sendBuf = NULL;
return true;
}
// if nothing to send back we are done. return true since we
// did not block sending back.
if ( sb.length() == 0 ) {
//log("crawlbot: nuking state.");
//delete this;
//mdelete ( this , sizeof(StateCD) , "stcd" );
return true;
}
// how can this be?
if ( m_socket->m_sendBuf ) { char *xx=NULL;*xx=0; }
// put socket in sending-again mode
m_socket->m_sendBuf = sb.getBufStart();
m_socket->m_sendBufSize = sb.getCapacity();
m_socket->m_sendBufUsed = sb.length();
m_socket->m_sendOffset = 0;
m_socket->m_totalSent = 0;
m_socket->m_totalToSend = sb.length();
// tell TcpServer.cpp to send this latest buffer! HACK!
//m_socket->m_sockState = ST_SEND_AGAIN;//ST_WRITING;//SEND_AGAIN;
// this does nothing if we were not called indirectly by
// TcpServer::writeSocketWrapper_r(). so if we should call
// sendMsg() ourselves in such a situation.
// so if the sendMsg() did not block, the first time, and we came
// here empty except for the ending ']' the 2nd time, then
// write it out this way... calling sendMsg() directly
if ( m_socket->m_sockState == ST_NEEDS_CLOSE ) {
//m_socket->m_sockState = ST_SEND_AGAIN;
goto sendLoop;
}
// do not let safebuf free this, we will take care of it
sb.detachBuf();
// . when it is done sending call this callback, don't hang up!
// . if m_someoneNeedsMore is false then this callback should just
// destroy the socket and delete "this"
m_socket->m_callback = doneSendingWrapper;
m_socket->m_state = this;
//if ( m_socket->m_sendBufUsed == 79 )
// log("hey");
// log it
log("crawlbot: resending %li bytes on socket",m_socket->m_sendBufUsed);
// we blocked sending back
return false;
// we are done sending this chunk, i guess tcp write was cached
// in the network card buffer or something
return true;
}
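Per the comments above, sendList() now hands each buffer to a sendChunk()-style call that takes ownership of the SafeBuf and invokes doneSendingWrapper() once the socket has drained, with a lastChunk flag telling the server to close the socket after the final piece. A minimal sketch of that chunked-transfer handshake, with every name hypothetical:

#include <cstdio>
#include <string>
#include <functional>

// Hypothetical chunked sender: the callback fires when the previous chunk
// has been written out, so the producer can generate the next one.
struct ChunkedSender {
    std::function<void()> onDrained;

    // returns false if the write blocked (callback fires later),
    // true if the kernel buffered it immediately.
    bool sendChunk(const std::string &chunk, bool lastChunk,
                   std::function<void()> cb) {
        onDrained = cb;
        printf("queued %zu bytes%s\n", chunk.size(),
               lastChunk ? " (last chunk, close when drained)" : "");
        return false;                      // pretend the socket blocked
    }
    void socketDrained() { if (onDrained) onDrained(); }
};

int main() {
    ChunkedSender tcp;
    int chunksLeft = 2;
    std::function<void()> produce = [&]() {
        if (chunksLeft == 0) return;       // nothing more to send
        chunksLeft--;
        tcp.sendChunk("data", /*lastChunk=*/chunksLeft == 0, produce);
    };
    produce();              // first chunk
    tcp.socketDrained();    // done-sending callback produces the next one
    tcp.socketDrained();    // final drain, nothing more to produce
    return 0;
}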
// TcpServer.cpp calls this when done sending TcpSocket's m_sendBuf
@ -618,83 +586,16 @@ void doneSendingWrapper ( void *state , TcpSocket *sock ) {
StateCD *st = (StateCD *)state;
TcpSocket *socket = st->m_socket;
//TcpSocket *socket = st->m_socket;
log("crawlbot: done sending on socket %li/%li bytes",
sock->m_totalSent,
sock->m_sendBufUsed);
// . if the final callback
// . sometimes m_sendBuf is NULL if we freed it below and tried to
// read more, only to read 0 bytes
// . but it will be non-null if we read 0 bytes the first time
// and just have a mime to send. because sendReply() above
// returned true, and then doneSendingWrapper() got called.
if ( //! socket->m_sendBuf &&
st->m_numRequests <= st->m_numReplies &&
! st->m_someoneNeedsMore ) {
log("crawlbot: done sending for download request");
delete st;
mdelete ( st , sizeof(StateCD) , "stcd" );
//log("mdel1: st=%lx",(long)st);
return;
}
// if the timer called us, just return
if ( ! socket->m_sendBuf ) {
log("crawlbot: timer callback");
socket->m_sockState = ST_SEND_AGAIN;
return;
}
readAndSendLoop ( st , true );
// free the old sendbuf then i guess since we might replace it
// in the above function.
mfree ( socket->m_sendBuf ,
socket->m_sendBufSize ,
"dbsbuf");
// in case we have nothing to send back do not let socket free
// what we just freed above. it'll core.
socket->m_sendBuf = NULL;
// sometimes this wrapper is called just from the timer...
// so if we have outstanding msg0s then we gotta wait
if ( st->m_numRequests > st->m_numReplies ) {
char *xx=NULL;*xx=0;
socket->m_sockState = ST_SEND_AGAIN;
return;
}
// all done?
if ( st->m_someoneNeedsMore ) {
// make sure socket doesn't close up on us!
socket->m_sockState = ST_SEND_AGAIN;
log("crawlbot: reading more download data");
// just enter the little loop here
subloop:
// otherwise, read more, maybe had no data to send from list
if ( ! st->readDataFromRdb () ) return;
// if this blocked sending back some data, return
if ( ! st->sendList() ) return;
// note that
log("crawlbot: sendList did not block");
// send and read more
if ( st->m_someoneNeedsMore ) goto subloop;
// note it
log("crawlbot: nobody needs more 3");
// sanity
if ( st->m_numRequests>st->m_numReplies){char *xx=NULL;*xx=0;}
}
log("crawlbot: no more data available");
// it's possible that readDataFromRdb() did not block and called
// sendList which set the socket m_sendBuf again... so check
// for that... it needs to be sent yet before we delete this state
//if ( st->m_socket->m_sendBuf ) return;
return;
}
void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
@ -804,7 +705,9 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
nowGlobalMS,
false,
MAX_NICENESS,
cr);
cr,
false, // isoutlink?
NULL);
char *expression = NULL;
long priority = -4;
// sanity check
@ -821,7 +724,9 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
// when spidering rounds we use the
// lastspidertime>={roundstart} --> spiders disabled rule
// so that we do not spider a url twice in the same round
if ( ufn >= 0 && ! cr->m_spidersEnabled[ufn] ) {
if ( ufn >= 0 && //! cr->m_spidersEnabled[ufn] ) {
// we set this to 0 instead of using the checkbox
cr->m_maxSpidersPerRule[ufn] <= 0 ) {
priority = -5;
}
@ -837,7 +742,10 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
m_isFirstTime = false;
sb->safePrintf("\"Url\","
"\"Entry Method\","
"\"Processed?\","
);
if ( cr->m_isCustomCrawl )
sb->safePrintf("\"Processed?\",");
sb->safePrintf(
"\"Add Time\","
"\"Last Crawled\","
"\"Last Status\","
@ -869,12 +777,15 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
// but default to csv
else {
sb->safePrintf("\"%s\",\"%s\","
"%li,%lu,%lu,\"%s\",\"%s\",\""
//",%s"
//"\n"
, sreq->m_url
, as
, (long)isProcessed
);
if ( cr->m_isCustomCrawl )
sb->safePrintf("%li,",(long)isProcessed);
sb->safePrintf(
"%lu,%lu,\"%s\",\"%s\",\""
//",%s"
//"\n"
// when was it first added to spiderdb?
, sreq->m_addedTime
// last time spidered, 0 if none
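The two hunks above make the "Processed?" CSV column conditional: both the header row and each data row emit it only when the collection is a custom (diffbot) crawl, so the column counts stay in sync. A small sketch of keeping a conditional column consistent between header and rows, with hypothetical names and values:

#include <cstdio>
#include <string>

// Hypothetical CSV writer: the same flag gates the optional column in the
// header and in every row, so the column count always matches.
void printCsv(bool isCustomCrawl) {
    std::string header = "\"Url\",\"Entry Method\",";
    if (isCustomCrawl) header += "\"Processed?\",";
    header += "\"Add Time\",\"Last Crawled\"";
    printf("%s\n", header.c_str());

    std::string row = "\"http://example.com/\",\"manual\",";
    if (isCustomCrawl) row += "1,";
    row += "1391111111,1391111222";
    printf("%s\n", row.c_str());
}

int main() {
    printCsv(true);    // custom crawl: extra column present
    printCsv(false);   // regular crawl: column omitted everywhere
    return 0;
}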
@ -991,8 +902,11 @@ void StateCD::printTitledbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
m_printedItem = true;
if ( ! sb->safeStrcpyPrettyJSON ( json ) )
log("diffbot: error printing json in dump");
//if ( ! sb->safeStrcpyPrettyJSON ( json ) )
// log("diffbot: error printing json in dump");
sb->safeStrcpy ( json );
sb->nullTerm();
// separate each JSON object with \n i guess
//sb->pushChar('\n');
@ -1132,8 +1046,8 @@ void printCrawlStatsWrapper ( void *state ) {
// save before nuking state
TcpSocket *sock = sxx->m_socket;
// nuke the state
delete sxx;
mdelete ( sxx , sizeof(StateXX) , "stxx" );
delete sxx;
// and send back now
g_httpServer.sendDynamicPage ( sock ,
sb.getBufStart(),
@ -1383,8 +1297,8 @@ void addedUrlsToSpiderdbWrapper ( void *state ) {
NULL ,
&rr ,
st->m_collnum );
delete st;
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
//log("mdel2: st=%lx",(long)st);
}
/*
@ -1460,8 +1374,8 @@ void injectedUrlWrapper ( void *state ) {
response,
NULL ,
st->m_collnum );
delete st;
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
}
*/
@ -1587,8 +1501,8 @@ void collOpDoneWrapper ( void *state ) {
StateCD *st = (StateCD *)state;
TcpSocket *socket = st->m_socket;
log("crawlbot: done with blocked op.");
delete st;
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
//log("mdel3: st=%lx",(long)st);
g_httpServer.sendDynamicPage (socket,"OK",2);
}
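Several hunks in this file swap `delete st` and `mdelete(...)` so the allocation is unregistered before the object is destroyed and its memory returned; mdelete is presumably the accounting counterpart of the mnew call made when the state was allocated. A rough standalone sketch of why that ordering matters for a tracker keyed by pointer (the mnew/mdelete bodies here are stand-ins, not the real Gigablast implementations):

#include <cstdio>
#include <cstddef>
#include <map>

// Stand-in allocation tracker (NOT the real mnew/mdelete): it records live
// allocations so leaks can be reported by label.
static std::map<void *, const char *> s_live;

void mnew   (void *p, size_t, const char *label) { s_live[p] = label; }
void mdelete(void *p, size_t, const char *)      { s_live.erase(p);   }

struct StateCD { /* request state */ };

int main() {
    StateCD *st = new StateCD;
    mnew(st, sizeof(StateCD), "statecd");

    // Unregister first, then destroy: the tracker never holds a pointer to
    // memory that has already been freed, which is the ordering the hunks
    // above enforce.
    mdelete(st, sizeof(StateCD), "stcd");
    delete st;

    printf("live allocations left: %zu\n", s_live.size());
    return 0;
}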
@ -1648,6 +1562,29 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
// . put in xml or json if format=xml or format=json or
// xml=1 or json=1 ...
char fmt = FMT_JSON;
// token is always required. get from json or html form input
//char *token = getInputString ( "token" );
char *token = hr->getString("token");
char *name = hr->getString("name");
// . try getting token-name from ?c=
// . the name of the collection is encoded as <token>-<crawlname>
char *c = hr->getString("c");
char tmp[MAX_COLL_LEN+100];
if ( ! token && c ) {
strncpy ( tmp , c , MAX_COLL_LEN );
token = tmp;
name = strstr(tmp,"-");
if ( name ) {
*name = '\0';
name++;
}
// change default formatting to html
fmt = FMT_HTML;
}
char *fs = hr->getString("format",NULL,NULL);
// give john a json api
if ( fs && strcmp(fs,"html") == 0 ) fmt = FMT_HTML;
@ -1656,9 +1593,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
// if we got json as input, give it as output
//if ( JS.getFirstItem() ) fmt = FMT_JSON;
// token is always required. get from json or html form input
//char *token = getInputString ( "token" );
char *token = hr->getString("token");
if ( ! token && fmt == FMT_JSON ) { // (cast==0|| fmt == FMT_JSON ) ) {
char *msg = "invalid token";
@ -1718,8 +1653,6 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
bool restartColl = hr->hasField("restart");
char *name = hr->getString("name");
//if ( delColl && ! && cast == 0 ) {
// log("crawlbot: no collection found to delete.");
// char *msg = "Could not find crawl to delete.";
@ -1906,8 +1839,8 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
em.safePrintf("Invalid regular expresion: %s",rx2);
}
if ( status1 || status2 ) {
delete st;
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
char *msg = em.getBufStart();
return sendErrorReply2(socket,fmt,msg);
}
@ -1965,8 +1898,8 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
if ( resetColl ) msg = "No such collection";
if ( restartColl ) msg = "No such collection";
// nuke it
delete st;
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
// log it
log("crawlbot: cr is null. %s",msg);
// make sure this returns in json if required
@ -1992,8 +1925,8 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
if ( ! g_collectiondb.deleteRec ( collName , we ) )
return false;
// nuke it
delete st;
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
// all done
return g_httpServer.sendDynamicPage (socket,"OK",2);
}
@ -2017,14 +1950,14 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
// to avoid user confusion
if ( cr ) cr->m_spideringEnabled = 1;
// nuke it
delete st;
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
// all done
return g_httpServer.sendDynamicPage (socket,"OK",2);
}
// nuke it
delete st;
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
// this will set the the collection parms from json
//setSpiderParmsFromJSONPost ( socket , hr , cr , &JS );
// this is a cast, so just return simple response
@ -2050,8 +1983,8 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
if ( name && name[0] )
msg = "Failed to add crawl. Crawl name is illegal.";
// nuke it
delete st;
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
//log("crawlbot: no collection found. need to add a crawl");
return sendErrorReply2(socket,fmt, msg);
}
@ -2101,15 +2034,15 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
// error?
if ( ! status ) {
// nuke it
delete st;
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
return sendErrorReply2(socket,fmt,mstrerror(g_errno));
}
// if not list
if ( ! size ) {
// nuke it
delete st;
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
return sendErrorReply2(socket,fmt,"no urls found");
}
// add to spiderdb
@ -2163,8 +2096,8 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
printCrawlBotPage2 ( socket,hr,fmt,NULL,NULL,cr->m_collnum);
// get rid of that state
delete st;
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
//log("mdel4: st=%lx",(long)st);
return true;
}
@ -2281,8 +2214,11 @@ bool printCrawlDetailsInJson ( SafeBuf &sb , CollectionRec *cx ) {
//"\"urlsExamined\":%lli,\n"
"\"pageCrawlAttempts\":%lli,\n"
"\"pageCrawlSuccesses\":%lli,\n"
"\"pageCrawlSuccessesThisRound\":%lli,\n"
"\"pageProcessAttempts\":%lli,\n"
"\"pageProcessSuccesses\":%lli,\n"
"\"pageProcessSuccessesThisRound\":%lli,\n"
"\"maxRounds\":%li,\n"
"\"repeat\":%f,\n"
@ -2303,8 +2239,11 @@ bool printCrawlDetailsInJson ( SafeBuf &sb , CollectionRec *cx ) {
//,cx->m_globalCrawlInfo.m_urlsConsidered
, cx->m_globalCrawlInfo.m_pageDownloadAttempts
, cx->m_globalCrawlInfo.m_pageDownloadSuccesses
, cx->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound
, cx->m_globalCrawlInfo.m_pageProcessAttempts
, cx->m_globalCrawlInfo.m_pageProcessSuccesses
, cx->m_globalCrawlInfo.m_pageProcessSuccessesThisRound
, (long)cx->m_maxCrawlRounds
, cx->m_collectiveRespiderFrequency
@ -2619,8 +2558,12 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"<td><b>URLs Examined</b></td>"
"<td><b>Page Download Attempts</b></td>"
"<td><b>Page Download Successes</b></td>"
"<td><b>Page Download Successes This Round"
"</b></td>"
"<td><b>Page Process Attempts</b></td>"
"<td><b>Page Process Successes</b></td>"
"<td><b>Page Process Successes This Round"
"</b></td>"
"</tr>"
);
}
@ -2667,6 +2610,8 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"<td>%lli</td>"
"<td>%lli</td>"
"<td>%lli</td>"
"<td>%lli</td>"
"<td>%lli</td>"
"</tr>"
, cx->m_coll
, cx->m_globalCrawlInfo.m_objectsAdded -
@ -2675,8 +2620,10 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//, cx->m_globalCrawlInfo.m_urlsConsidered
, cx->m_globalCrawlInfo.m_pageDownloadAttempts
, cx->m_globalCrawlInfo.m_pageDownloadSuccesses
, cx->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound
, cx->m_globalCrawlInfo.m_pageProcessAttempts
, cx->m_globalCrawlInfo.m_pageProcessSuccesses
, cx->m_globalCrawlInfo.m_pageProcessSuccessesThisRound
);
}
if ( summary && fmt == FMT_HTML ) {
@ -2732,6 +2679,8 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
return false;
// shortcut
XmlDoc **docs = g_spiderLoop.m_docs;
// row count
long j = 0;
// first print the spider recs we are spidering
for ( long i = 0 ; i < (long)MAX_SPIDERS ; i++ ) {
// get it
@ -2739,17 +2688,18 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// skip if empty
if ( ! xd ) continue;
// sanity check
if ( ! xd->m_oldsrValid ) { char *xx=NULL;*xx=0; }
if ( ! xd->m_sreqValid ) { char *xx=NULL;*xx=0; }
// skip if not our coll rec!
//if ( xd->m_cr != cr ) continue;
if ( xd->m_collnum != cr->m_collnum ) continue;
// grab it
SpiderRequest *oldsr = &xd->m_oldsr;
SpiderRequest *oldsr = &xd->m_sreq;
// get status
char *status = xd->m_statusMsg;
// show that
if ( ! oldsr->printToTableSimple ( &sb , status,xd) )
if ( ! oldsr->printToTableSimple ( &sb , status,xd,j))
return false;
j++;
}
// end the table
@ -2888,6 +2838,9 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
if ( fmt == FMT_HTML ) {
char *seedStr = cr->m_diffbotSeeds.getBufStart();
if ( ! seedStr ) seedStr = "";
SafeBuf tmp;
long crawlStatus = -1;
getSpiderStatusMsg ( cr , &tmp , &crawlStatus );
@ -2927,6 +2880,11 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"<td>%s</td>"
"</tr>"
"<tr>"
"<td><b>Seeds:</td>"
"<td>%s</td>"
"</tr>"
"<tr>"
"<td><b>Crawl Status:</td>"
"<td>%li</td>"
@ -2942,6 +2900,11 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"<td>%li</td>"
"</tr>"
"<tr>"
"<td><b>Has Urls Ready to Spider:</td>"
"<td>%li</td>"
"</tr>"
// this will have to be in crawlinfo too!
//"<tr>"
@ -2975,6 +2938,11 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>Page Crawl Successes This Round</b></td>"
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>Page Process Attempts</b></td>"
"<td>%lli</td>"
@ -2985,6 +2953,11 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>Page Process Successes This Round</b></td>"
"<td>%lli</td>"
"</tr>"
, cr->m_diffbotCrawlName.getBufStart()
@ -2992,9 +2965,12 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
, cr->m_diffbotToken.getBufStart()
, seedStr
, crawlStatus
, tmp.getBufStart()
, cr->m_spiderRoundNum
, cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider
, cr->m_globalCrawlInfo.m_objectsAdded -
cr->m_globalCrawlInfo.m_objectsDeleted
@ -3003,9 +2979,11 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
, cr->m_globalCrawlInfo.m_pageDownloadAttempts
, cr->m_globalCrawlInfo.m_pageDownloadSuccesses
, cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound
, cr->m_globalCrawlInfo.m_pageProcessAttempts
, cr->m_globalCrawlInfo.m_pageProcessSuccesses
, cr->m_globalCrawlInfo.m_pageProcessSuccessesThisRound
);
@ -3841,6 +3819,9 @@ bool getSpiderRequestMetaList ( char *doc ,
SpiderRequest sreq;
sreq.reset();
sreq.m_firstIp = url.getHostHash32(); // fakeip!
// avoid ips of 0 or -1
if ( sreq.m_firstIp == 0 || sreq.m_firstIp == -1 )
sreq.m_firstIp = 1;
sreq.m_hostHash32 = url.getHostHash32();
sreq.m_domHash32 = url.getDomainHash32();
sreq.m_siteHash32 = url.getHostHash32();
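getSpiderRequestMetaList() above derives a fake m_firstIp from the host hash and then nudges it away from 0 and -1, which appear to be the "no IP yet"/"invalid" sentinels elsewhere in the spider code. A tiny sketch of that sentinel-avoiding hash, with the hash function itself a hypothetical stand-in for getHostHash32():

#include <cstdio>
#include <cstdint>

// Hypothetical 32-bit string hash standing in for getHostHash32().
static int32_t hostHash32(const char *s) {
    uint32_t h = 0;
    for (; *s; s++) h = h * 31 + (unsigned char)*s;
    return (int32_t)h;
}

// Fake "first IP" for a URL we have not resolved yet: any value is fine as
// long as it is stable for the host and never collides with the 0 / -1
// sentinels.
int32_t fakeFirstIp(const char *url) {
    int32_t ip = hostHash32(url);
    if (ip == 0 || ip == -1) ip = 1;
    return ip;
}

int main() {
    printf("fake ip for example.com: %ld\n", (long)fakeFirstIp("example.com"));
    return 0;
}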

@ -7527,7 +7527,7 @@ bool printTopBarNav ( SafeBuf &sb , State7 *st ) {
"</tr>"
// - shadow row
//"<tr cellspacing=5 height=5px><td colspan=9 "
//"bgcolor=%s></td></tr>"
//"bgcolor=#%s></td></tr>"
// END TOP TABLE
"</table>"
//, GRAD2
@ -12671,7 +12671,7 @@ bool gotResults ( void *state ) {
">"
"<tr>"
"<td valign=top>"
// bgcolor=%s
// bgcolor=#%s
//, GRAD1
//, bg
);

@ -712,7 +712,7 @@ bool processLoop ( void *state ) {
//Words *ww = xd->getWords();
if ( ! xml.set ( content , contentLen , false ,
0 , false , TITLEREC_CURRENT_VERSION ,
false , 0 ) ) { // niceness is 0
false , 0 , CT_HTML ) ) { // niceness is 0
//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
return sendErrorReply ( st , g_errno );
}

@ -108,13 +108,13 @@ skipReplaceHost:
refreshRate);
// ignore
char *username = g_users.getUsername ( r );
char *password = NULL;
User *user = NULL;
if ( username ) user = g_users.getUser (username );
if ( user ) password = user->m_password;
if ( ! password ) password = "";
if ( ! username ) username = "";
//char *username = g_users.getUsername ( r );
//char *password = NULL;
//User *user = NULL;
//if ( username ) user = g_users.getUser (username );
//if ( user ) password = user->m_password;
//if ( ! password ) password = "";
//if ( ! username ) username = "";
// print standard header
// char *pp = sb.getBuf();
@ -131,26 +131,26 @@ skipReplaceHost:
colspan = "31";
//shotcol = "<td><b>ip2</b></td>";
sprintf ( shotcol, "<td><a href=\"/master/hosts?c=%s"
"&sort=2&username=%s&pwd=%s\">"
"&sort=2\">"
"<b>ping2</b></td></a>",
coll,username,password);
coll);
}
// print host table
sb.safePrintf (
"<table cellpadding=4 border=1 width=100%% bgcolor=#%s>"
"<tr><td colspan=%s bgcolor=#%s><center>"
"<table %s>"
"<tr><td colspan=%s><center>"
//"<font size=+1>"
"<b>Hosts "
"(<a href=\"/master/hosts?c=%s&sort=%li&reset=1\">"
"reset)</b>"
//"</font>"
"</td></tr>"
"<tr>"
"<td><a href=\"/master/hosts?c=%s&sort=0&username=%s&"
"password=%s\">"
"<tr bgcolor=#%s>"
"<td><a href=\"/master/hosts?c=%s&sort=0\">"
"<b>hostId</b></td>"
"<td><b>host name</b></td>"
"<td><b>host ip</b></td>"
"<td><b>shard</b></td>" // mirror group
"<td><b>stripe</b></td>"
@ -187,49 +187,49 @@ skipReplaceHost:
//"<td><b>resends sent</td>"
//"<td><b>errors recvd</td>"
//"<td><b>ETRYAGAINS recvd</td>"
"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=3\">"
"<td><a href=\"/master/hosts?c=%s&sort=3\">"
"<b>dgrams resent</a></td>"
"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=4\">"
"<td><a href=\"/master/hosts?c=%s&sort=4\">"
"<b>errors recvd</a></td>"
"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=5\">"
"<td><a href=\"/master/hosts?c=%s&sort=5\">"
"<b>ETRY AGAINS recvd</a></td>"
"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=6\">"
"<td><a href=\"/master/hosts?c=%s&sort=6\">"
"<b>dgrams to</a></td>"
"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=7\">"
"<td><a href=\"/master/hosts?c=%s&sort=7\">"
"<b>dgrams from</a></td>"
//"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=8\">"
//"<td><a href=\"/master/hosts?c=%s&sort=8\">"
//"<b>loadavg</a></td>"
"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=13\">"
"<td><a href=\"/master/hosts?c=%s&sort=13\">"
"<b>avg split time</a></td>"
"<td><b>splits done</a></td>"
"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=12\">"
"<td><a href=\"/master/hosts?c=%s&sort=12\">"
"<b>status</a></td>"
"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=15\">"
"<td><a href=\"/master/hosts?c=%s&sort=15\">"
"<b>slow reads</a></td>"
"<td><b>docs indexed</a></td>"
"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=9\">"
"<td><a href=\"/master/hosts?c=%s&sort=9\">"
"<b>mem used</a></td>"
"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=10\">"
"<td><a href=\"/master/hosts?c=%s&sort=10\">"
"<b>cpu</a></td>"
"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=14\">"
"<td><a href=\"/master/hosts?c=%s&sort=14\">"
"<b>max ping1</a></td>"
"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=11\">"
"<td><a href=\"/master/hosts?c=%s&sort=11\">"
"<b>ping1 age</a></td>"
//"<td><b>ip1</td>"
"<td><a href=\"/master/hosts?c=%s&username=%s&pwd=%s&sort=1\">"
"<td><a href=\"/master/hosts?c=%s&sort=1\">"
"<b>ping1</a></td>"
"%s"// "<td><b>ip2</td>"
@ -237,25 +237,26 @@ skipReplaceHost:
//"<td>avg roundtrip</td>"
//"<td>std. dev.</td></tr>"
"<td><b>note</td>",
LIGHT_BLUE ,
TABLE_STYLE ,
colspan ,
DARK_BLUE ,
coll, sort,
coll, username, password,
coll, username, password,
coll, username, password,
coll, username, password,
coll, username, password,
coll, username, password,
coll, username, password,
coll, username, password,
coll, username, password,
coll, username, password,
//coll,username, password,
coll, username, password,
coll, username, password,
coll, username, password,
coll, username, password,
DARK_BLUE ,
coll,
coll,
coll,
coll,
coll,
coll,
coll,
coll,
coll,
coll,
coll,
coll,
coll,
coll,
shotcol );
// loop through each host we know and print it's stats
@ -396,13 +397,14 @@ skipReplaceHost:
"in disagreement with ours.\">H</b></font>");
// rebalancing?
if ( h->m_flags & PFLAG_REBALANCING )
fb.safePrintf("<b title=\"Current rebalancing\">R</b>");
fb.safePrintf("<b title=\"Currently "
"rebalancing\">R</b>");
// has recs that should be in another shard? indicates
// we need to rebalance or there is a bad hosts.conf
if ( h->m_flags & PFLAG_FOREIGNRECS )
fb.safePrintf("<font color=red><b title=\"Foreign data "
"detected. Needs rebalance.\">F"
"</b></font");
"</b></font>");
// if it has spiders going on say "S"
if ( h->m_flags & PFLAG_HASSPIDERS )
fb.safePrintf ( "<span title=\"Spidering\">S</span>");
@ -423,11 +425,15 @@ skipReplaceHost:
if ( fb.length() == 0 )
fb.safePrintf("&nbsp;");
char *bg = LIGHT_BLUE;
if ( h->m_ping >= g_conf.m_deadHostTimeout )
bg = "ffa6a6";
// print it
sb.safePrintf (
"<tr>"
"<tr bgcolor=#%s>"
"<td><a href=\"http://%s:%hi/master/hosts?"
"username=%s&pwd=%s&"
""
"c=%s"
"&sort=%li\">%li</a></td>"
@ -496,8 +502,8 @@ skipReplaceHost:
//"<td>%lims</td>"
"<td nowrap=1>%s</td>"
"</tr>" ,
bg,//LIGHT_BLUE ,
ipbuf3, h->m_httpPort,
username, password,
coll, sort,
i ,
h->m_hostname,
@ -552,15 +558,16 @@ skipReplaceHost:
// end the table now
sb.safePrintf ( "</table><br>\n" );
// print spare hosts table
sb.safePrintf (
"<table cellpadding=4 border=1 width=100%% bgcolor=#%s>"
"<tr><td colspan=10 bgcolor=#%s><center>"
"<table %s>"
"<tr class=hdrow><td colspan=10><center>"
//"<font size=+1>"
"<b>Spares</b>"
//"</font>"
"</td></tr>"
"<tr>"
"<tr bgcolor=#%s>"
"<td><b>spareId</td>"
"<td><b>host name</td>"
"<td><b>ip1</td>"
@ -575,7 +582,7 @@ skipReplaceHost:
//"<td><b>ide channel</td>"
"<td><b>note</td>",
LIGHT_BLUE ,
TABLE_STYLE,
DARK_BLUE );
for ( long i = 0; i < g_hostdb.m_numSpareHosts; i++ ) {
@ -589,7 +596,7 @@ skipReplaceHost:
// print it
sb.safePrintf (
"<tr>"
"<tr bgcolor=#%s>"
"<td>%li</td>"
"<td>%s</td>"
"<td>%s</td>"
@ -602,6 +609,7 @@ skipReplaceHost:
//"<td>%li</td>" // ide channel
"<td>%s</td>"
"</tr>" ,
LIGHT_BLUE,
i ,
h->m_hostname,
ipbuf1,
@ -618,13 +626,13 @@ skipReplaceHost:
// print proxy hosts table
sb.safePrintf (
"<table cellpadding=4 border=1 width=100%% bgcolor=#%s>"
"<tr><td colspan=12 bgcolor=#%s><center>"
"<table %s>"
"<tr class=hdrow><td colspan=12><center>"
//"<font size=+1>"
"<b>Proxies</b>"
//"</font>"
"</td></tr>"
"<tr>"
"<tr bgcolor=#%s>"
"<td><b>proxyId</b></td>"
"<td><b>type</b></td>"
"<td><b>host name</b></td>"
@ -645,8 +653,9 @@ skipReplaceHost:
//"<td><b>ide channel</td>"
"<td><b>note</td>",
LIGHT_BLUE ,
DARK_BLUE );
TABLE_STYLE,
DARK_BLUE
);
for ( long i = 0; i < g_hostdb.m_numProxyHosts; i++ ) {
// get the ith host (hostId)
Host *h = g_hostdb.getProxy ( i );
@ -677,10 +686,10 @@ skipReplaceHost:
// print it
sb.safePrintf (
"<tr>"
"<tr bgcolor=#%s>"
"<td><a href=\"http://%s:%hi/master/hosts?"
"username=%s&pwd=%s&"
""
"c=%s\">"
"%li</a></td>"
@ -700,10 +709,9 @@ skipReplaceHost:
"<td>%s </td>"
"</tr>" ,
LIGHT_BLUE,
ipbuf3,
h->m_httpPort,
username,
password,
coll,
i ,
@ -724,24 +732,31 @@ skipReplaceHost:
}
sb.safePrintf ( "</table><br><br>" );
sb.safePrintf(
"<style>"
".poo { background-color:#%s;}\n"
"</style>\n" ,
LIGHT_BLUE );
// print help table
sb.safePrintf (
"<table cellpadding=4 border=1 width=100%% bgcolor=#%s>"
"<tr><td colspan=10 bgcolor=#%s><center>"
"<table %s>"
"<tr class=hdrow><td colspan=10><center>"
//"<font size=+1>"
"<b>Key</b>"
//"</font>"
"</td></tr>"
"<tr>"
"<tr class=poo>"
"<td>shard</td>"
"<td>"
"The index is split into shards. Which shard does this "
"host server?"
"host serve?"
"</td>"
"</tr>\n"
"<tr>"
"<tr class=poo>"
"<td>stripe</td>"
"<td>"
"Hosts with the same stripe serve the same shard "
@ -749,41 +764,41 @@ skipReplaceHost:
"</td>"
"</tr>\n"
"<tr>"
"<tr class=poo>"
"<td>ip1</td>"
"<td>The primary IP address of the host."
"</td>"
"</tr>\n"
"<tr>"
"<tr class=poo>"
"<td>ip2</td>"
"<td>The secondary IP address of the host."
"</td>"
"</tr>\n"
/*
"<tr>"
"<tr class=poo>"
"<td>udp port</td>"
"<td>The UDP port the host uses to send and recieve "
"datagrams."
"</td>"
"</tr>\n"
"<tr>"
"<tr class=poo>"
"<td>dns client port</td>"
"<td>The UDP port used to send and receive dns traffic with."
"</td>"
"</tr>\n"
*/
"<tr>"
"<tr class=poo>"
"<td>http port</td>"
"<td>The port you can connect a browser to."
"</td>"
"</tr>\n"
/*
"<tr>"
"<tr class=poo>"
"<td>best switch id</td>"
"<td>The host prefers to be on this switch because it "
"needs to send a lot of data to other hosts on this swtich. "
@ -794,7 +809,7 @@ skipReplaceHost:
*/
/*
"<tr>"
"<tr class=poo>"
"<td>switch id</td>"
"<td>Hosts that share the same switch id are "
"physically on the same switch."
@ -802,7 +817,7 @@ skipReplaceHost:
"</tr>\n"
*/
"<tr>"
"<tr class=poo>"
"<td>dgrams resent</td>"
"<td>How many datagrams have had to be resent to a host "
"because it was not ACKed quick enough or because it was "
@ -811,7 +826,7 @@ skipReplaceHost:
"</td>"
"</tr>\n"
"<tr>"
"<tr class=poo>"
"<td>errors recvd</td>"
"<td>How many errors were received from a host in response "
"to a request to retrieve or insert data."
@ -819,7 +834,7 @@ skipReplaceHost:
"</tr>\n"
"<tr>"
"<tr class=poo>"
"<td>ETRYAGAINS recvd</td>"
"<td>How many ETRYAGAIN were received in response to a "
"request to add data. Usually because the host's memory "
@ -830,7 +845,7 @@ skipReplaceHost:
"</td>"
"</tr>\n"
"<tr>"
"<tr class=poo>"
"<td>dgrams to</td>"
"<td>How many datagrams were sent to the host from the "
"selected host since startup. Includes ACK datagrams. This "
@ -841,46 +856,46 @@ skipReplaceHost:
"</td>"
"</tr>\n"
"<tr>"
"<tr class=poo>"
"<td>dgrams from</td>"
"<td>How many datagrams were received from the host by the "
"selected host since startup. Includes ACK datagrams."
"</td>"
"</tr>\n"
//"<tr>"
//"<tr class=poo>"
//"<td>loadavg</td>"
//"<td>1-minute sliding-window load average from "
//"/proc/loadavg."
//"</td>"
//"</tr>\n"
"<tr>"
"<tr class=poo>"
"<td>mem used</td>"
"<td>percentage of memory currently used."
"</td>"
"</tr>\n"
"<tr>"
"<tr class=poo>"
"<td>cpu usage</td>"
"<td>percentage of cpu resources in use by the gb process."
"</td>"
"</tr>\n"
"<tr>"
"<tr class=poo>"
"<td>ping1 age</td>"
"<td>How long ago the last ping request was sent to "
"this host. Let's us know how fresh the ping time is."
"</td>"
"</tr>\n"
"<tr>"
"<tr class=poo>"
"<td>ping1</td>"
"<td>Ping time to this host on the primary network."
"</td>"
"</tr>\n"
"<tr>"
"<tr class=poo>"
"<td>ping2</td>"
"<td>Ping time to this host on the seconday/shotgun "
"network. This column is not visible if the shotgun "
@ -888,25 +903,25 @@ skipReplaceHost:
"</td>"
"</tr>\n"
"<tr>"
"<tr class=poo>"
"<td>M (status flag)</td>"
"<td>Indicates host is merging files on disk."
"</td>"
"</tr>\n"
"<tr>"
"<tr class=poo>"
"<td>D (status flag)</td>"
"<td>Indicates host is dumping data to disk."
"</td>"
"</tr>\n"
"<tr>"
"<tr class=poo>"
"<td>S (status flag)</td>"
"<td>Indicates host has outstanding spiders."
"</td>"
"</tr>\n"
"<tr>"
"<tr class=poo>"
"<td>y (status flag)</td>"
"<td>Indicates host is performing the daily merge."
"</td>"
@ -914,8 +929,8 @@ skipReplaceHost:
,
LIGHT_BLUE ,
DARK_BLUE );
TABLE_STYLE
);
sb.safePrintf ( "</table><br></form><br>" );

@ -52,6 +52,8 @@ bool sendPageInject ( TcpSocket *s , HttpRequest *r ) {
msg7->m_crawlbotAPI = crawlbotAPI;
strncpy(msg7->m_coll,coll,MAX_COLL_LEN);
// for diffbot
if ( crawlbotAPI )
msg7->m_hr.copy ( r );
@ -63,7 +65,6 @@ bool sendPageInject ( TcpSocket *s , HttpRequest *r ) {
// qts is html encoded? NO! fix that below then...
//char *uf="http://www.google.com/search?num=50&"
// "q=%s&scoring=d&filter=0";
strncpy(msg7->m_coll,coll,MAX_COLL_LEN);
msg7->m_isScrape = true;
msg7->m_qbuf.safeStrcpy(qts);
msg7->m_linkDedupTable.set(4,0,512,NULL,0,false,0,"ldtab");
@ -193,6 +194,12 @@ bool sendReply ( void *state ) {
// pm = msg;
//}
sb.safePrintf(
"<style>"
".poo { background-color:#%s;}\n"
"</style>\n" ,
LIGHT_BLUE );
//char *c = msg7->m_coll;
char bb [ MAX_COLL_LEN + 60 ];
bb[0]='\0';
@ -204,39 +211,50 @@ bool sendReply ( void *state ) {
"<b>%s</b>\n\n" // the url msg
//"<FORM method=POST action=/inject>\n\n"
"<FORM method=GET action=/inject>\n\n"
//"<input type=hidden name=pwd value=\"%s\">\n"
//"<input type=hidden name=username value=\"%s\">\n"
"<table width=100%% bgcolor=#%s cellpadding=4 border=1>"
"<tr><td bgcolor=#%s colspan=2>"
"<table %s>"
"<tr class=hdrow><td colspan=2>"
"<center>"
//"<font size=+1>"
"<b>"
"Inject URL</b>%s"
//"</font>"
"<br>"
//"Enter the information below to inject "
//"a URL. This allows you to specify the URL as well as the "
//"content for the URL."
"</td></tr>\n\n"
"<tr><td><b>url</b></td>"
"<td>\n"
"<tr class=poo><td><b>url</b>"
"<br>"
"<font size=-2>"
"Specify the URL that will be immediately crawled and "
"indexed in real time "
"while you wait. The browser will return the "
"final index status code. Alternatively, "
"use the <i>add urls</i> page "
"to add URLs in bulk or to just add to the spider queue "
"without having to wait for the page or pages to be "
"actually indexed in realtime."
"</font>"
"</td>"
"<td width=50%%>\n"
"<input type=text name=u value=\"\" size=50>"
"</td></tr>\n\n"
"<tr><td><b>query to scrape</b></td>"
"<tr class=poo><td><b>query to scrape</b></td>"
"<td>\n"
"<input type=text name=qts value=\"\" size=50>"
"</td></tr>\n\n"
//"<tr><td><b>use ahrefs.com</b></td>"
//"<tr class=poo><td><b>use ahrefs.com</b></td>"
//"<td>\n"
//"<input type=radio name=useahrefs value=0 checked>no &nbsp; "
//"<input type=radio name=useahrefs value=1>yes "
//"</td></tr>\n\n"
"<tr><td><b>spider links</b></td>"
"<tr class=poo><td><b>spider links</b></td>"
"<td>\n"
"<input type=radio name=spiderlinks value=0>no &nbsp; "
"<input type=radio name=spiderlinks value=1 checked>yes "
@ -249,18 +267,18 @@ bool sendReply ( void *state ) {
"<tr><td><b>inject scraped links</b></td>"
"<tr class=poo><td><b>inject scraped links</b></td>"
"<td>\n"
"<input type=radio name=injectlinks value=0 checked>no &nbsp; "
"<input type=radio name=injectlinks value=1>yes "
"</td></tr>\n\n"
"<tr><td><b>collection</b></td>"
"<tr class=poo><td><b>collection</b></td>"
"<td>\n"
"<input type=text name=c value=\"%s\" size=15>"
"</td></tr>\n\n"
"<tr><td><b>quick reply?</b><br>"
"<tr class=poo><td><b>quick reply?</b><br>"
"<font size=1>Should reply be short? "
"Default: no"
"</td>"
@ -269,7 +287,7 @@ bool sendReply ( void *state ) {
"<input type=radio name=quick value=1>yes "
"</td></tr>\n\n"
"<tr><td><b>only inject new docs?</b><br>"
"<tr class=poo><td><b>only inject new docs?</b><br>"
"<font size=1>Skips injection if docs already indexed. "
"Default: no"
"</td>"
@ -279,17 +297,17 @@ bool sendReply ( void *state ) {
"</td></tr>\n\n"
"<tr><td><b>delete?</b><br>"
"<tr class=poo><td><b>delete url?</b><br>"
"<font size=1>Should this url be deleted from the index? "
"Default: no"
"</td>"
"<td>\n"
"<input type=radio name=delete value=0 checked>no &nbsp; "
"<input type=radio name=delete value=1>yes "
"<input type=radio name=deleteurl value=0 checked>no &nbsp; "
"<input type=radio name=deleteurl value=1>yes "
"</td></tr>\n\n"
"<tr><td><b>recycle content?</b><br>"
"<tr class=poo><td><b>recycle content?</b><br>"
"<font size=1>Should page content be recycled if "
"reindexing? "
"Default: no"
@ -299,16 +317,18 @@ bool sendReply ( void *state ) {
"<input type=radio name=recycle value=1>yes "
"</td></tr>\n\n"
"<tr><td><b>ip</b><br>"
/*
"<tr class=poo><td><b>ip</b><br>"
"<font size=1>IP address of the url. If blank then "
"Gigablast will look up. "
"Default: blank"
"</td>"
"<td>\n<input type=text name=ip value=\"\" size=15>"
"</td></tr>\n\n"
*/
/*
"<tr><td><b>do ip lookups?</b><br>"
"<tr class=poo><td><b>do ip lookups?</b><br>"
"<font size=1>Should Gigablast look up the IP address "
"of the url, if it is not provided. "
"Default: yes"
@ -319,7 +339,7 @@ bool sendReply ( void *state ) {
"</td></tr>\n\n"
*/
//"<tr><td><b>is url new?</b><br>"
//"<tr class=poo><td><b>is url new?</b><br>"
//"<font size=1>Is this url new to the index? If unsure "
//"then you should say no here. "
//"Default: yes"
@ -329,7 +349,7 @@ bool sendReply ( void *state ) {
//"<input type=radio name=isnew value=1 checked>yes "
//"</td></tr>\n\n"
"<tr><td><b>dedup?</b><br>"
"<tr class=poo><td><b>dedup?</b><br>"
"<font size=1>Should this url be skipped if there is "
"already a url in the index from this same domain with "
"this same content? "
@ -339,14 +359,14 @@ bool sendReply ( void *state ) {
"<input type=radio name=dedup value=0>no &nbsp; "
"<input type=radio name=dedup value=1 checked>yes "
"</td></tr>\n\n" ,
//"<tr><td><b>ruleset</b><br>"
//"<tr class=poo><td><b>ruleset</b><br>"
//"<font size=1>Use this ruleset to index the URL. "
//"Default: auto"
//"</td>"
//"<td>\n<select name=rs>" ,
pm , // msg7->m_pwd ,
//msg7->m_username,
LIGHT_BLUE , DARK_BLUE , bb , msg7->m_coll );
TABLE_STYLE , bb , msg7->m_coll );
//p += gbstrlen(p);
@ -382,7 +402,7 @@ bool sendReply ( void *state ) {
// make a table, each row will be an injectable parameter
sb.safePrintf (
"<tr><td><b>content has mime</b><br>"
"<tr class=poo><td><b>content has mime</b><br>"
"<font size=1>IP address of the url. If blank then "
"Gigablast will look up. "
"Default: blank"
@ -392,10 +412,13 @@ bool sendReply ( void *state ) {
"<input type=radio name=hasmime value=1>yes "
"</td></tr>\n\n"
"<tr><td colspan=2>"
"<tr class=poo><td colspan=2>"
"<center>"
"<b>content</b><br>"
"<font size=1>Enter the content here. Enter MIME header "
"<font size=1>If you want to supply the URL's content "
"rather than have Gigablast download it, then "
"enter the content here. "
"Enter MIME header "
"first if \"content has mime\" is set to true above. "
"Separate MIME from actual content with two returns."
"<br>"
@ -404,11 +427,15 @@ bool sendReply ( void *state ) {
"\n"
"<textarea rows=32 cols=80 name=content>"
"</textarea>"
"<br>"
"<br>\n\n"
"<input type=submit value=Submit>"
"</center>"
"</td></tr></table>\n"
"<br>"
"<br>\n\n"
"<center>"
"<input type=submit value=Submit>"
"</center>"
"</form>\n"
);
@ -463,34 +490,48 @@ bool Msg7::inject ( TcpSocket *s ,
long contentLen;
// get the junk
char *coll = r->getString ( "c" , NULL , NULL /*default*/);
//char *coll = r->getString ( "c" , NULL , NULL /*default*/);
//if ( ! coll ) coll = "main";
// sometimes crawlbot will add or reset a coll and do an inject
// in PageCrawlBot.cpp
//if ( ! coll ) coll = r->getString("addcoll");
//if ( ! coll ) coll = r->getString("resetcoll");
if ( ! coll ) coll = collOveride;
//if ( ! coll ) coll = collOveride;
// default to main
if ( ! coll || ! coll[0] ) coll = "main";
//if ( ! coll || ! coll[0] ) coll = "main";
if ( collOveride && ! collOveride[0] ) collOveride = NULL;
CollectionRec *cr = NULL;
if ( collOveride ) cr = g_collectiondb.getRec ( collOveride );
else cr = g_collectiondb.getRec ( r );
if ( ! cr ) {
g_errno = ENOCOLLREC;
return true;
}
char *coll = cr->m_coll;
bool quickReply = r->getLong ( "quick" , 0 );
//char *pwd = r->getString ( "pwd" , NULL );
char *url = r->getString ( "u" , NULL , NULL /*default*/);
// for diffbot.cpp api
if ( ! url ) url = r->getString("injecturl",NULL,NULL);
if ( ! url ) url = r->getString("url",NULL,NULL);
// PageCrawlBot.cpp uses "seed"
if ( ! url ) url = r->getString("seed",NULL,NULL);
bool recycleContent = r->getLong ( "recycle",0);
char *ips = r->getString ( "ip" , NULL , NULL );
//char *ips = r->getString ( "ip" , NULL , NULL );
//char *username = g_users.getUsername(r);
long firstIndexed = r->getLongLong("firstindexed",0LL);
long lastSpidered = r->getLongLong("lastspidered",0LL);
//long firstIndexed = r->getLongLong("firstindexed",0LL);
//long lastSpidered = r->getLongLong("lastspidered",0LL);
long hopCount = r->getLong("hopcount",-1);
long newOnly = r->getLong("newonly",0);
long charset = r->getLong("charset",-1);
long deleteIt = r->getLong("delete",0);
long deleteUrl = r->getLong("deleteurl",0);
char hasMime = r->getLong("hasmime",0);
// do consistency testing?
bool doConsistencyTesting = r->getLong("dct",0);
@ -502,7 +543,7 @@ bool Msg7::inject ( TcpSocket *s ,
long forcedIp = 0;
if ( ips ) forcedIp = atoip ( ips , gbstrlen(ips) );
//if ( ips ) forcedIp = atoip ( ips , gbstrlen(ips) );
char *content = r->getString ( "content" , &contentLen , NULL );
// mark doesn't like to url-encode his content
@ -543,17 +584,20 @@ bool Msg7::inject ( TcpSocket *s ,
niceness,
state,
callback,
firstIndexed,
lastSpidered,
//firstIndexed,
//lastSpidered,
hopCount,
newOnly,
charset,
spiderLinks,
deleteIt,
deleteUrl,
hasMime,
doConsistencyTesting);
}
// . returns false if blocked, true otherwise
// . if returns false will call your callback(state) when is done
// . returns true and sets g_errno on error
bool Msg7::inject ( char *url ,
long forcedIp ,
char *content ,
@ -567,13 +611,13 @@ bool Msg7::inject ( char *url ,
long niceness,
void *state ,
void (*callback)(void *state),
long firstIndexed,
long lastSpidered,
//long firstIndexed,
//long lastSpidered,
long hopCount,
char newOnly,
short charset,
char spiderLinks,
char deleteIt,
char deleteUrl,
char hasMime,
bool doConsistencyTesting
) {
@ -581,11 +625,14 @@ bool Msg7::inject ( char *url ,
m_quickReply = quickReply;
// store coll
if ( ! coll ) { g_errno = ENOCOLLREC; return true; }
long collLen = gbstrlen ( coll );
if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN;
strncpy ( m_coll , coll , collLen );
m_coll [ collLen ] = '\0';
//if ( ! coll ) { g_errno = ENOCOLLREC; return true; }
// long collLen = gbstrlen ( coll );
//if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN;
//strncpy ( m_coll , coll , collLen );
//m_coll [ collLen ] = '\0';
CollectionRec *cr = g_collectiondb.getRec ( coll );
if ( ! cr ) { g_errno = ENOCOLLREC; return true; }
// store user
//long ulen = 0;
@ -612,149 +659,36 @@ bool Msg7::inject ( char *url ,
if ( g_repairMode ) { g_errno = EREPAIRING; return true; }
// send template reply if no content supplied
if ( ! content && ! recycleContent ) {
log("inject: no content supplied to inject command and "
"recycleContent is false.");
//return true;
}
// clean url?
// normalize and add www. if it needs it
Url uu;
uu.set ( url , gbstrlen(url) , true );
// remove >'s i guess and store in st1->m_url[] buffer
char cleanUrl[MAX_URL_LEN+1];
urlLen = cleanInput ( cleanUrl,
MAX_URL_LEN,
uu.getUrl(),
uu.getUrlLen() );
// this can go on the stack since set4() copies it
SpiderRequest sreq;
sreq.reset();
strcpy(sreq.m_url, cleanUrl );
// parentdocid of 0
long firstIp = hash32n(cleanUrl);
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
sreq.setKey( firstIp,0LL, false );
sreq.m_isInjecting = 1;
sreq.m_isPageInject = 1;
sreq.m_hopCount = hopCount;
sreq.m_hopCountValid = 1;
sreq.m_fakeFirstIp = 1;
sreq.m_firstIp = firstIp;
//if ( ! content && ! recycleContent ) {
// log("inject: no content supplied to inject command and "
// "recycleContent is false.");
// //return true;
//}
// shortcut
XmlDoc *xd = &m_xd;
// log it now
//log("inject: injecting doc %s",cleanUrl);
if ( ! xd->injectDoc ( url ,
cr ,
content ,
hasMime , // content starts with http mime?
hopCount,
charset,
static char s_dummy[3];
// sometimes the content is indeed NULL...
if ( newOnly && ! content ) {
// don't let it be NULL because then xmldoc will
// try to download the page!
s_dummy[0] = '\0';
content = s_dummy;
//char *xx=NULL;*xx=0; }
}
deleteUrl,
contentType, // CT_HTML, CT_XML
spiderLinks ,
newOnly, // index iff new
state,
callback ) )
// we blocked...
return false;
// . use the enormous power of our new XmlDoc class
// . this returns false with g_errno set on error
if ( //m_needsSet &&
! xd->set4 ( &sreq ,
NULL ,
m_coll ,
NULL , // pbuf
// give it a niceness of 1, we have to be
// careful since we are a niceness of 0!!!!
niceness, // 1 ,
// inject this content
content ,
deleteIt, // false, // deleteFromIndex ,
forcedIp ,
contentType ,
lastSpidered ,
hasMime )) {
// g_errno should be set if that returned false
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
return true;
}
// do not re-call the set
//m_needsSet = false;
// make this our callback in case something blocks
xd->setCallback ( state , callback );
xd->m_doConsistencyTesting = doConsistencyTesting;
// . set xd from the old title rec if recycle is true
// . can also use XmlDoc::m_loadFromOldTitleRec flag
if ( recycleContent ) xd->m_recycleContent = true;
// othercrap
if ( firstIndexed ) {
xd->m_firstIndexedDate = firstIndexed;
xd->m_firstIndexedDateValid = true;
}
if ( lastSpidered ) {
xd->m_spideredTime = lastSpidered;
xd->m_spideredTimeValid = true;
}
if ( hopCount != -1 ) {
xd->m_hopCount = hopCount;
xd->m_hopCountValid = true;
}
if ( charset != -1 && charset != csUnknown ) {
xd->m_charset = charset;
xd->m_charsetValid = true;
}
// avoid looking up ip of each outlink to add "firstip" tag to tagdb
// because that can be slow!!!!!!!
xd->m_spiderLinks = spiderLinks;
xd->m_spiderLinks2 = spiderLinks;
xd->m_spiderLinksValid = true;
// . newOnly is true --> do not inject if document is already indexed!
// . maybe just set indexCode
xd->m_newOnly = newOnly;
// do not re-lookup the robots.txt
xd->m_isAllowed = true;
xd->m_isAllowedValid = true;
xd->m_crawlDelay = -1; // unknown
xd->m_crawlDelayValid = true;
// set this now
g_inPageInject = true;
// log it now
//log("inject: indexing injected doc %s",cleanUrl);
// . now tell it to index
// . this returns false if blocked
bool status = xd->indexDoc ( );
// log it. i guess only for errors when it does not block?
// because xmldoc.cpp::indexDoc calls logIt()
if ( status ) xd->logIt();
// undo it
g_inPageInject = false;
// note that it blocked
//if ( ! status ) log("inject: blocked for %s",cleanUrl);
// return false if it blocked
return status;
return true;
}
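For reference, a minimal sketch of the fake-firstIp derivation that the replaced inline code performed before handing the SpiderRequest off: the cleaned URL is hashed to a non-zero 32-bit value so an injected doc gets a deterministic key without any DNS lookup. The hash below is a stand-in for the project's hash32n(), which is assumed rather than reproduced; only the clamping of 0 and -1 to 1 mirrors the code above.

#include <cstdint>
#include <cstdio>

// Stand-in for hash32n(): FNV-1a over the URL bytes, for illustration only.
static int32_t hash32Sketch ( const char *s ) {
	uint32_t h = 2166136261u;
	for ( ; *s ; s++ ) { h ^= (unsigned char)*s; h *= 16777619u; }
	return (int32_t)h;
}

// Derive a deterministic, non-zero "first IP" for an injected URL so the
// SpiderRequest has a usable key even though no real IP was looked up.
static int32_t fakeFirstIp ( const char *cleanUrl ) {
	int32_t firstIp = hash32Sketch ( cleanUrl );
	// 0 and -1 are reserved/error values, so bump them to 1
	if ( firstIp == 0 || firstIp == -1 ) firstIp = 1;
	return firstIp;
}

int main ( ) {
	printf ( "fake firstIp = 0x%x\n" ,
	         (unsigned)fakeFirstIp ( "http://www.example.com/" ) );
	return 0;
}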
///////////////
//
// SCRAPE GOOGLE

@ -53,8 +53,8 @@ public:
long niceness,
void *state ,
void (*callback)(void *state),
long firstIndexedDate = 0,
long spiderDate = 0,
//long firstIndexedDate = 0,
//long spiderDate = 0,
long hopCount = -1 ,
char newOnly = 0 ,
short charset = -1 ,

@ -28,8 +28,8 @@ struct StateLogView {
static char *s_magicStr = "4j3.8x*";
#define BABY_BLUE "e0e0d0"
#define LIGHT_BLUE "d0d0e0"
#define DARK_BLUE "c0c0f0"
//#define LIGHT_BLUE "d0d0e0"
//#define DARK_BLUE "c0c0f0"
bool sendPageLogView ( TcpSocket *s , HttpRequest *r ) {
@ -79,15 +79,21 @@ bool sendPageLogView ( TcpSocket *s , HttpRequest *r ) {
"</SCRIPT> ");
p->safePrintf("<form name=\"fo\">");
p->safePrintf("\n<table width=100%% bgcolor=#%s "
"cellpadding=4 border=1>\n", BABY_BLUE);
p->safePrintf("\n<table %s>\n",TABLE_STYLE);
p->safePrintf("<tr class=hdrow><td colspan=2>"
"<center><b>Log View</b></center>"
"</td></tr>");
p->safePrintf("<tr><td>Refresh Rate:</td><td><input type=\"text\""
p->safePrintf("<tr bgcolor=%s>"
"<td>Refresh Rate:</td><td><input type=\"text\""
" name=\"rr\" value=\"%li\" size=\"4\"></td></tr>",
LIGHT_BLUE,
refreshRate);
p->safePrintf("<tr><td>Sample Size:</td><td><input type=\"text\""
" name=\"ss\" value=\"%li\" size=\"4\"></td></tr>",
p->safePrintf("<tr bgcolor=%s>"
"<td>Sample Size:</td><td><input type=\"text\""
" name=\"ss\" value=\"%li\" size=\"4\">",
LIGHT_BLUE,
sampleSize);
p->safePrintf("<input type=\"hidden\" "
@ -96,6 +102,7 @@ bool sendPageLogView ( TcpSocket *s , HttpRequest *r ) {
p->safePrintf("<input type=\"hidden\" "
"name=\"dontlog\" value=\"1\">");
p->safePrintf("</td></tr>");
// . count the number of hosts we are getting logs for:
long numOn = 0;
@ -134,7 +141,8 @@ bool sendPageLogView ( TcpSocket *s , HttpRequest *r ) {
st->m_filterStr[6] = "INFO";
st->m_filterStr[7] = "INIT";
p->safePrintf("<tr><td>Filter Types:</td><td>");
p->safePrintf("<tr bgcolor=#%s><td>Filter Types:</td><td>",
LIGHT_BLUE);
char *checked;
st->m_numFilts = 0;
for(long i = 7; i >= 0; i--) {
@ -183,7 +191,8 @@ bool sendPageLogView ( TcpSocket *s , HttpRequest *r ) {
p->safePrintf("<tr><td>Hosts:</td><td>");
p->safePrintf("<tr bgcolor=#%s><td>Hosts:</td><td>",
LIGHT_BLUE);
for ( long i = 0 ; i < nh ; i++ ) {
// skip dead hosts, i don't want to wait for them to timeout.
if ( g_hostdb.isDead ( i ) ) continue;
@ -214,9 +223,9 @@ bool sendPageLogView ( TcpSocket *s , HttpRequest *r ) {
p->safePrintf("</td></tr>\n");
p->safePrintf("<tr><td>\n");
p->safePrintf("<tr bgcolor=#%s><td>\n",LIGHT_BLUE);
p->safePrintf("<input type=\"submit\" value=\"Update\"> ");
p->safePrintf("</td></tr></table>\n");
p->safePrintf("</td><td></td></tr></table>\n");
p->safePrintf("</form>");
if(!blocked)
@ -227,6 +236,14 @@ bool sendPageLogView ( TcpSocket *s , HttpRequest *r ) {
}
bool showLine ( SafeBuf *sb , char *s , long len ) {
return sb->brify ( s , len ,
0 , // niceness
80 , // cols
"<br>",
false ); // isHtml?
}
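showLine() above just forwards to SafeBuf::brify() with an 80-column width and a <br> break string so raw log lines wrap in the HTML view. As a rough illustration of the wrapping behavior only (not the project's brify(), which also takes a niceness level and an isHtml flag), a hedged sketch:

#include <string>

// Sketch of brify-style wrapping: append `len` bytes of `s` to `out`,
// inserting the break string at the first space once `cols` characters
// have accumulated on the current line.
static void brifySketch ( std::string &out , const char *s , long len ,
                          long cols , const char *brk ) {
	long lineLen = 0;
	for ( long i = 0 ; i < len ; i++ ) {
		out += s[i];
		lineLen++;
		if ( lineLen >= cols && s[i] == ' ' ) {
			out += brk;   // e.g. "<br>"
			lineLen = 0;
		}
	}
}
// usage: std::string html; brifySketch ( html , line , lineLen , 80 , "<br>" );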
void gotRemoteLogWrapper(void *state, UdpSlot *slot) {
@ -329,25 +346,25 @@ void gotRemoteLogWrapper(void *state, UdpSlot *slot) {
if(matchNum >= 0 || st->m_numFilts == 0) {
if(matchNum == 0) {
p->safePrintf("<font color=red>");
p->safeMemcpy(st->m_readBufPtrs[ndx], lineLen);
showLine(p,st->m_readBufPtrs[ndx], lineLen);
p->safePrintf("\n");
p->safePrintf("</font>");
}
else if(matchNum == 1) {
p->safePrintf("<font color=green>");
p->safeMemcpy(st->m_readBufPtrs[ndx], lineLen);
showLine(p,st->m_readBufPtrs[ndx], lineLen);
p->safePrintf("\n");
p->safePrintf("</font>");
}
else if(matchNum == 2) {
p->safePrintf("<font color=blue>");
p->safeMemcpy(st->m_readBufPtrs[ndx], lineLen);
showLine(p,st->m_readBufPtrs[ndx], lineLen);
p->safePrintf("\n");
p->safePrintf("</font>");
}
else {
p->safeMemcpy(st->m_readBufPtrs[ndx], lineLen);
showLine(p,st->m_readBufPtrs[ndx], lineLen);
p->safePrintf("\n");
}
}

@ -211,11 +211,18 @@ bool sendPageParser2 ( TcpSocket *s ,
if ( st->m_render ) render = " checked";
if ( st->m_oips ) oips = " checked";
xbuf->safePrintf(
"<style>"
".poo { background-color:#%s;}\n"
"</style>\n" ,
LIGHT_BLUE );
long clen;
char *contentParm = r->getString("content",&clen,"");
// print the input form
xbuf->safePrintf ("<br>"
xbuf->safePrintf (
"<style>\n"
"h2{font-size: 12px; color: #666666;}\n"
@ -233,21 +240,30 @@ bool sendPageParser2 ( TcpSocket *s ,
".hs {color: #009900;}"
"</style>\n"
"<center>"
"<table cellpadding=3>"
"<tr>"
"<table %s>"
"<tr><td colspan=5><center><b>"
"Parser"
"</b></center></td></tr>\n"
"<tr class=poo>"
"<td>"
"Url:"
"<b>url</b>"
"<br><font size=-2>"
"Type in <b>FULL</b> url to parse."
"</font>"
"</td>"
"</td>"
"<td>"
"<input type=text name=u value=\"%s\" size=\"40\">\n"
"</td>"
"<td>"
"Type in <b>FULL</b> url\n"
"</td>"
"</tr>"
"<tr>"
/*
"<tr class=poo>"
"<td>"
"Parser version to use: "
"</td>"
@ -258,9 +274,10 @@ bool sendPageParser2 ( TcpSocket *s ,
"(-1 means to use latest title rec version)<br>"
"</td>"
"</tr>"
*/
/*
"<tr>"
"<tr class=poo>"
"<td>"
"Hop count to use: "
"</td>"
@ -273,20 +290,22 @@ bool sendPageParser2 ( TcpSocket *s ,
"</tr>"
*/
"<tr>"
"<tr class=poo>"
"<td>"
"Use cached:"
"<b>use cached</b>"
"<br><font size=-2>"
"Load page from cache (titledb)?"
"</font>"
"</td>"
"<td>"
"<input type=checkbox name=old value=1%s> "
"</td>"
"<td>"
"Load page from cache (titledb)?"
"</td>"
"</tr>"
/*
"<tr>"
"<tr class=poo>"
"<td>"
"Reparse root:"
"</td>"
@ -299,20 +318,23 @@ bool sendPageParser2 ( TcpSocket *s ,
"</tr>"
*/
"<tr>"
"<tr class=poo>"
"<td>"
"Recycle Link Info:"
"<b>recycle link info</b>"
"<br><font size=-2>"
"Recycle the link info from the title rec"
"Load page from cache (titledb)?"
"</font>"
"</td>"
"<td>"
"<input type=checkbox name=recycle value=1%s> "
"</td>"
"<td>"
"Recycle the link info from the title rec"
"</td>"
"</tr>"
/*
"<tr>"
"<tr class=poo>"
"<td>"
"Recycle Link Info Imported:"
"</td>"
@ -325,20 +347,22 @@ bool sendPageParser2 ( TcpSocket *s ,
"</tr>"
*/
"<tr>"
"<tr class=poo>"
"<td>"
"Render HTML:"
"<b>render html</b>"
"<br><font size=-2>"
"Render document content as HTML"
"</font>"
"</td>"
"<td>"
"<input type=checkbox name=render value=1%s> "
"</td>"
"<td>"
"Render document content as HTML"
"</td>"
"</tr>"
/*
"<tr>"
"<tr class=poo>"
"<td>"
"Lookup outlinks' ruleset, ips, quality:"
"</td>"
@ -351,7 +375,7 @@ bool sendPageParser2 ( TcpSocket *s ,
"</td>"
"</tr>"
"<tr>"
"<tr class=poo>"
"<td>"
"LinkInfo Coll:"
"</td>"
@ -364,49 +388,59 @@ bool sendPageParser2 ( TcpSocket *s ,
"</tr>"
*/
"<tr>"
"<tr class=poo>"
"<td>"
"Optional query:"
"<b>optional query</b>"
"<br><font size=-2>"
"Leave empty usually. For title generation only."
"</font>"
"</td>"
"<td>"
"<input type=text name=\"q\" size=\"20\" value=\"\"> "
"</td>"
"<td>"
"Leave empty usually. For title generation only."
"</td>"
"</tr>"
"<tr>"
"<td>"
"Content Below is XML:"
"</td>"
"<tr class=poo>"
"<td>"
"<b>content below is xml</b>"
"<br><font size=-2>"
"Is the content below XML?"
"</font>"
"</td>"
"<td>"
"<input type=checkbox name=xml value=1> "
"</td>"
"<td>"
//""
"</td>"
"</tr>"
"<tr>"
"<td colspan=3>"
"<tr class=poo>"
"<td><b>content</b>"
"<br><font size=-2>"
"Use this content for the provided <i>url</i> "
"rather than downloading it from the web."
"</td>"
"<td>"
"<textarea rows=10 cols=80 name=content>"
"%s"
"</textarea>"
"</td>"
"</tr>"
"<tr>"
"<td colspan=\"3\">"
"<input type=submit value=OK>"
"</td>"
"</tr>"
"</table>"
"</center>"
"</form>"
"<br>",
TABLE_STYLE,
us ,
//(long)st->m_hopCount,
//rtu,
@ -420,8 +454,11 @@ bool sendPageParser2 ( TcpSocket *s ,
xbuf->safePrintf(
"<center>"
"<input type=submit value=Submit>"
"</center>"
);
// just print the page if no url given

@ -99,14 +99,15 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
//skip request path
while (!isspace(*rbufEnd)) rbufEnd++;
*rbufEnd = '\0';
char* refresh = strstr(rbuf, "&rr=");
//char* refresh = strstr(rbuf, "&rr=");
// print resource table
// columns are the dbs
p.safePrintf(
//"<center>Disk Statistics<br><br>"
"<center><br>"
"<center>"
//"<br>"
//"<img name=\"diskgraph\"
//src=/diskGraph%li.gif><br><br>",
//g_hostdb.m_hostId );
@ -115,12 +116,13 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
// now try using absolute divs instead of a GIF
g_stats.printGraphInHtml ( p );
/*
if(autoRefresh > 0) {
if(refresh) *(refresh+4) = '0';
p.safePrintf(
"<center><a href=\"%s\">Auto Refresh Off</a>"
"</center>",
rbuf + 4/*skip over GET*/);
rbuf + 4); // skip over GET
p.safePrintf( "<input type=\"hidden\" "
"name=\"dontlog\" value=\"1\">");
@ -132,20 +134,26 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
p.safePrintf(
"<center><a href=\"%s%s\">Auto Refresh</a>"
"</center>",
rbuf + 4/*skip over GET*/, rr);
rbuf + 4, rr); // skip over "GET "
}
*/
// print the key
p.safePrintf (
"<br>"
"<center>"
"<table border=1 cellpadding=2>"
//"<table %s>"
//"<tr>%s</tr></table>"
"<tr>%s</tr></table>"
"<style>"
".poo { background-color:#%s;}\n"
"</style>\n"
"<table border=1 cellpadding=2>"
"<table %s>"
// black
"<tr>"
"<tr class=poo>"
"<td bgcolor=#000000>&nbsp; &nbsp;</td>"
"<td> High priority disk read. "
"Thicker lines for bigger reads.</td>"
@ -158,7 +166,7 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
// red
"<tr>"
"<tr class=poo>"
"<td bgcolor=#ff0000>&nbsp; &nbsp;</td>"
"<td> Disk write. "
"Thicker lines for bigger writes.</td>"
@ -170,7 +178,7 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
// dark brown
"<tr>"
"<tr class=poo>"
"<td bgcolor=#753d30>&nbsp; &nbsp;</td>"
"<td> Processing raw query. Has raw= parm.</td>"
@ -181,7 +189,7 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
// pinkish purple
"<tr>"
"<tr class=poo>"
"<td bgcolor=#aa00aa>&nbsp; &nbsp;</td>"
"<td> Send data over network. (low priority)"
"Thicker lines for bigger sends.</td>"
@ -193,7 +201,7 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
"</tr>"
// pinkish purple
"<tr>"
"<tr class=poo>"
"<td bgcolor=#ff00ff>&nbsp; &nbsp;</td>"
"<td> Send data over network. (high priority)"
"Thicker lines for bigger sends.</td>"
@ -206,7 +214,7 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
// dark purple
"<tr>"
"<tr class=poo>"
"<td bgcolor=#8220ff>&nbsp; &nbsp;</td>"
"<td> Get all summaries for results.</td>"
@ -218,7 +226,7 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
// white
"<tr>"
"<tr class=poo>"
"<td bgcolor=#ffffff>&nbsp; &nbsp;</td>"
"<td> Uncompress cached document.</td>"
@ -229,7 +237,7 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
// bright green
"<tr>"
"<tr class=poo>"
"<td bgcolor=#00ff00>&nbsp; &nbsp;</td>"
"<td> Compute search results. "
"All terms required. rat=1.</td>"
@ -241,7 +249,7 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
"</tr>"
// bright green
"<tr>"
"<tr class=poo>"
"<td bgcolor=#ccffcc>&nbsp; &nbsp;</td>"
"<td> Compute reference pages. "
"</td>"
@ -252,7 +260,7 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
"</td>"
"</tr>"
"<tr>"
"<tr class=poo>"
"<td bgcolor=#d1e1ff>&nbsp; &nbsp;</td>"
"<td> Compute Gigabits. "
@ -265,7 +273,7 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
"</tr>"
"<tr>"
"<tr class=poo>"
"<td bgcolor=#0000b0>&nbsp; &nbsp;</td>"
"<td> \"Summary\" extraction (low priority) "
@ -279,10 +287,12 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
"</table>"
"</center>",
g_stats.m_keyCols.getBufStart() &&
g_conf.m_dynamicPerfGraph ?
g_stats.m_keyCols.getBufStart() : ""
"</center>"
, LIGHT_BLUE
, TABLE_STYLE
//,g_stats.m_keyCols.getBufStart() &&
//g_conf.m_dynamicPerfGraph ?
//g_stats.m_keyCols.getBufStart() : ""
);
if(autoRefresh > 0) p.safePrintf("</body>");

@ -108,7 +108,7 @@ bool sendPageReindex ( TcpSocket *s , HttpRequest *r ) {
// if they are NOT submitting a request print the interface
// and we're not running, just print the interface
t = r->getString ("action" , &len );
if ( len != 2 ) { // && ! s_isRunning ) {
if ( len < 2 ) { // && ! s_isRunning ) {
//p = g_pages.printAdminTop ( p , pend , s , r );
//p = printInterface ( p , pend,q,username,coll,NULL,qlangStr);
g_pages.printAdminTop ( &sb , s , r );
@ -315,19 +315,25 @@ bool printInterface (SafeBuf *sb, char *q , //long user ,
errmsg );
}
sb->safePrintf(
"<style>"
".poo { background-color:#%s;}\n"
"</style>\n" ,
LIGHT_BLUE );
char bb [ MAX_COLL_LEN + 60 ];
bb[0]='\0';
//if ( user == USER_MASTER && c && c[0] ) sprintf ( bb , " (%s)", c);
// print the reindex interface
sb->safePrintf (
"<table width=100%% bgcolor=#%s cellpadding=4 border=1>"
"<tr><td colspan=3 bgcolor=#%s><center>"
"<table %s>"
"<tr><td colspan=3><center>"
//"<font size=+1>"
"<b>"
"Reindex Urls"
"</b>%s</td></tr>"
"<tr><td colspan=3>"
"<tr bgcolor=#%s><td colspan=3>"
"<font size=1>"
"Reindex the URLs that match this query. If URLs are "
"banned in tagdb they will be removed from the index. "
@ -339,7 +345,7 @@ bool printInterface (SafeBuf *sb, char *q , //long user ,
"whatever rule they match in the URL Filters table."
"</td></tr>"
"<tr><td><b>query</b>"
"<tr class=poo><td><b>query</b>"
"<br><font size=1>"
"URLs matching this query will be added to the spider "
"queue for re-spidering."
@ -359,32 +365,32 @@ bool printInterface (SafeBuf *sb, char *q , //long user ,
"name=updatetags>"
"</td></tr>"
*/
, LIGHT_BLUE , DARK_BLUE , bb , q );
, TABLE_STYLE , bb , DARK_BLUE , q );
if ( ! qlangStr ) qlangStr = "";
sb->safePrintf (
"<tr><td><b>start result number</b>"
"<tr class=poo><td><b>start result number</b>"
"<font size=1>"
"<br>Start at this search result number. Default 0.</td>"
"<td><input type=text name=srn value=0 size=10>"
"</td></tr>"
"<tr><td><b>end result number</b>"
"<tr class=poo><td><b>end result number</b>"
"<font size=1>"
"<br>Stop at this search result number. "
"Default 2000000. (2M)</td>"
"<td><input type=text name=ern size=10 value=2000000>"
"</td></tr>"
"<tr><td><b>query language</b>"
"<tr class=poo><td><b>query language</b>"
"<font size=1>"
"<br>Language that helps determine sort result ranking.</td>"
"<td><input type=text name=qlang size=6 value=\"%s\">"
"</td></tr>"
"<tr><td><b>FORCE DELETE</b>"
"<tr class=poo><td><b>FORCE DELETE</b>"
"<font size=1>"
"<br>Check this checkbox to "
"delete every search result matching the above "
@ -434,7 +440,7 @@ bool printInterface (SafeBuf *sb, char *q , //long user ,
// submit button
sb->safePrintf(
"<center>"
"<input type=submit name=action value=OK>"
"<input type=submit name=action value=Submit>"
"</center>"
"</form></html>");

@ -57,6 +57,11 @@ public:
// for printing our search result json items in csv:
HashTableX m_columnTable;
long m_numCSVColumns;
// stuff for doing redownloads
bool m_didRedownload;
XmlDoc *m_xd;
long m_oldContentHash32;
};
static bool printResult ( SafeBuf &sb,
@ -467,6 +472,11 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
}
mnew ( st , sizeof(State0) , "PageResults2" );
// init some stuff
st->m_didRedownload = false;
st->m_xd = NULL;
st->m_oldContentHash32 = 0;
// copy yhits
if ( ! st->m_hr.copy ( hr ) )
return sendReply ( st , NULL );
@ -615,6 +625,15 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
return status2;
}
// if the returned json result is older than maxagebeforeredownload then we
// redownload the page and, if its checksum has changed, return empty results
void doneRedownloadingWrapper ( void *state ) {
// cast our State0 class from this
State0 *st = (State0 *) state;
// resume
gotResults ( st );
}
/*
void gotSpellingWrapper( void *state ){
// cast our State0 class from this
@ -749,6 +768,85 @@ bool gotResults ( void *state ) {
return sendReply(st,NULL);
}
/*
//
// BEGIN REDOWNLOAD LOGIC
//
////////////
//
// if caller wants a certain freshness we might have to redownload the
// parent url to get the new json
//
////////////
// get the first result
Msg20 *m20first = msg40->m_msg20[0];
long mabr = st->m_hr.getLong("maxagebeforeredownload",-1);
if ( mabr >= 0 &&
numResults > 0 &&
// only do this once
! st->m_didRedownload &&
// need at least one result
m20first &&
// get the last spidered time from the msg20 reply of that result
now - m20first->m_r->m_lastSpidered > mabr ) {
// make a new xmldoc to do the redownload
XmlDoc *xd;
try { xd = new (XmlDoc); }
catch ( ... ) {
g_errno = ENOMEM;
log("query: Failed to alloc xmldoc.");
}
if ( g_errno ) return sendReply (st,NULL);
mnew ( xd , sizeof(XmlDoc) , "mabrxd");
// save it
st->m_xd = xd;
// get this
st->m_oldContentHash32 = m20first->m_r->m_contentHash32;
// do not re-do redownload
st->m_didRedownload = true;
// set it
xd->setUrl(parentUrl);
xd->setCallback ( st , doneRedownloadingWrapper );
// get the checksum
if ( xd->getContentChecksum32Fast() == (void *)-1 )
// return false if it blocked
return false;
// error?
if ( g_errno ) return sendReply (st,NULL);
// how did this not block
log("page: redownload did not would block adding parent");
}
// if we did the redownload and checksum changed, return 0 results
if ( st->m_didRedownload ) {
// get the doc we downloaded
XmlDoc *xd = st->m_xd;
// get it
long newHash32 = xd->getContentHash32();
// log it
if ( newHash32 != st->m_oldContentHash32 )
// note it in logs for now
log("results: content changed for %s",xd->m_firstUrl.m_url);
// free it
mdelete(xd, sizeof(XmlDoc), "mabrxd" );
delete xd;
// null it out so we don't try to re-free
st->m_xd = NULL;
// if content is significantly different, return 0 results
if ( newHash32 != st->m_oldContentHash32 ) {
SafeBuf sb;
// empty json i guess
sb.safePrintf("[]\n");
return sendReply(st,sb.getBufStart());
}
// otherwise, print the diffbot json results, they are still valid
}
//
// END REDOWNLOAD LOGIC
//
*/
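The block above is checked in commented out, but the intended flow is spelled out by its comments: when the caller passes maxagebeforeredownload and the top JSON result is older than that, re-fetch the parent page, compare a 32-bit content checksum against the one stored with the result, and serve an empty JSON array if the page changed. A stand-alone, hedged sketch of that control flow; fetchPage() and contentHash32() here are hypothetical stand-ins, not XmlDoc methods:

#include <cstdint>
#include <string>

// Hypothetical stand-ins so the sketch is self-contained.
static std::string fetchPage ( const std::string &url ) {
	return "<html>" + url + "</html>"; // the real code downloads the page
}
static uint32_t contentHash32 ( const std::string &content ) {
	uint32_t h = 2166136261u;
	for ( unsigned char c : content ) { h ^= c; h *= 16777619u; }
	return h;
}

// Return the JSON to serve: the cached results when the parent page is
// still fresh or unchanged, "[]" when it has changed since it was spidered.
static std::string maybeInvalidate ( const std::string &parentUrl ,
                                     uint32_t oldContentHash32 ,
                                     long lastSpidered ,   // unix time
                                     long now ,
                                     long maxAgeBeforeRedownload ,
                                     const std::string &cachedJson ) {
	// caller did not ask for a freshness check
	if ( maxAgeBeforeRedownload < 0 ) return cachedJson;
	// the result is still young enough, no re-fetch needed
	if ( now - lastSpidered <= maxAgeBeforeRedownload ) return cachedJson;
	// re-download the parent and compare checksums
	uint32_t newHash32 = contentHash32 ( fetchPage ( parentUrl ) );
	if ( newHash32 != oldContentHash32 ) return "[]\n";
	// the page did not change, the cached JSON is still valid
	return cachedJson;
}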
//
// BEGIN ADDING URL
@ -1061,7 +1159,8 @@ bool gotResults ( void *state ) {
// otherwise, we had no error
if ( numResults == 0 && si->m_format == FORMAT_HTML ) {
sb.safePrintf ( "No results found." );
sb.safePrintf ( "No results found in <b>%s</b> collection.",
cr->m_coll);
}
else if ( moreFollow && si->m_format == FORMAT_HTML ) {
if ( isAdmin && si->m_docsToScanForReranking > 1 )
@ -1128,11 +1227,8 @@ bool gotResults ( void *state ) {
if ( collLen == 4 && strncmp ( coll, "main", 4) == 0 ) isMain = true;
// print "in collection ***" if we had a collection
if ( collLen > 0 && ! isMain && isAdmin ) {
sb.safePrintf (" in collection '<b>");
sb.safeMemcpy ( coll , collLen );
sb.safeMemcpy ( "</b>'" , 5 );
}
if ( collLen > 0 && ! isMain && si->m_format == FORMAT_HTML )
sb.safePrintf (" in collection <b>%s</b>",coll);
char *pwd = si->m_pwd;
@ -2409,7 +2505,7 @@ static bool printResult ( SafeBuf &sb,
mr->m_docId );
// the new links
if ( si->m_format == FORMAT_HTML ) {
if ( si->m_format == FORMAT_HTML && g_conf.m_isMattWells ) {
//sb.safePrintf(" - <a href=\"/scoring?"
// "c=%s&\">scoring</a>",
// coll );
@ -4724,21 +4820,26 @@ bool printLogoAndSearchBox ( SafeBuf &sb , HttpRequest *hr , long catId ) {
else
sb.safePrintf("<a title=\"Search the web\" href=/>web</a>");
sb.safePrintf(" &nbsp;&nbsp;&nbsp;&nbsp; " );
// SEO functionality not included yet - so redir to gigablast.
if ( g_conf.m_isMattWells )
sb.safePrintf("<a title=\"Rank higher in "
"Google\" href='/seo'>");
else
sb.safePrintf("<a title=\"Rank higher in "
"Google\" href='https://www.gigablast."
"com/seo'>");
if ( g_conf.m_isMattWells ) {
// SEO functionality not included yet - so redir to gigablast.
if ( g_conf.m_isMattWells )
sb.safePrintf("<a title=\"Rank higher in "
"Google\" href='/seo'>");
else
sb.safePrintf("<a title=\"Rank higher in "
"Google\" href='https://www.gigablast."
"com/seo'>");
sb.safePrintf(
"seo</a>"
" &nbsp;&nbsp;&nbsp;&nbsp; "
);
sb.safePrintf(
"seo</a>"
" &nbsp;&nbsp;&nbsp;&nbsp; "
);
}
if (catId <= 0 )
sb.safePrintf("<a title=\"Browse the DMOZ directory\" "
@ -4757,12 +4858,12 @@ bool printLogoAndSearchBox ( SafeBuf &sb , HttpRequest *hr , long catId ) {
// i'm not sure why this was removed. perhaps
// because it is not working yet because of
// some bugs...
"<!-- <a title=\"Advanced web search\" "
"<a title=\"Advanced web search\" "
"href=/adv.html>"
"advanced"
"</a>"
" &nbsp;&nbsp;&nbsp;&nbsp; -->"
" &nbsp;&nbsp;&nbsp;&nbsp;"
"<a title=\"Add your url to the index\" "
"href=/addurl>"
@ -4945,6 +5046,11 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) {
Msg20 *m20 = msg40->m_msg20[i];
Msg20Reply *mr = m20->m_r;
if ( ! mr ) {
log("results: missing msg20 reply for result #%li",i);
continue;
}
// get content
char *json = mr->ptr_content;
// how can it be empty?

@ -61,20 +61,23 @@ bool printNav ( SafeBuf &sb , HttpRequest *r ) {
"<a href=%s/privacy.html>Privacy Policy</a>"
" &nbsp; &nbsp; "
"<a href=%s/searchfeed.html>Search API</a>"
" &nbsp; &nbsp; "
"<a href=%s/seoapi.html>SEO API</a>"
" &nbsp; &nbsp; "
"<a href=%s/account>My Account</a> "
, root
, root
, root
, root
, root
, root
, rootSecure
//" &nbsp; &nbsp; <a href=/logout>Logout</a>"
);
if ( g_conf.m_isMattWells )
sb.safePrintf(" &nbsp; &nbsp; "
"<a href=%s/seoapi.html>SEO API</a>"
" &nbsp; &nbsp; "
"<a href=%s/account>My Account</a> "
, root
, rootSecure
//" &nbsp; &nbsp; <a href=/logout>Logout</a>"
);
if ( r->isLocal() )
sb.safePrintf("&nbsp; &nbsp;[<a href=\"/master?\">Admin</a>]");
sb.safePrintf("</p></b></center></body></html>");
@ -152,7 +155,15 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
// submit to https now
sb.safePrintf("<form method=get "
"action=/search name=f>\n");
CollectionRec *cr = g_collectiondb.getRec ( r );
if ( cr )
sb.safePrintf("<input type=hidden name=c value=\"%s\">",
cr->m_coll);
sb.safePrintf("<input name=q type=text size=60 value=\"\">&nbsp;<input type=\"submit\" value=\"Search\">\n");
sb.safePrintf("\n");
sb.safePrintf("</form>\n");
sb.safePrintf("<br>\n");
@ -381,7 +392,12 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
sb.safePrintf("<br><br>\n");
sb.safePrintf("<br><br><br>\n");
sb.safePrintf("<a href=/>web</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=http://www.gigablast.com/seo>seo</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=\"/Top\">directory</a> &nbsp;&nbsp;&nbsp;&nbsp; \n");
sb.safePrintf("<a href=/>web</a> &nbsp;&nbsp;&nbsp;&nbsp; ");
if ( g_conf.m_isMattWells )
sb.safePrintf("<a href=http://www.gigablast.com/seo>seo"
"</a> &nbsp;&nbsp;&nbsp;&nbsp; " );
sb.safePrintf("<a href=\"/Top\">directory</a> "
"&nbsp;&nbsp;&nbsp;&nbsp; \n");
sb.safePrintf("<a href=/adv.html>advanced search</a>");
sb.safePrintf(" &nbsp;&nbsp;&nbsp;&nbsp; ");
sb.safePrintf("<b title=\"Instantly add your url to Gigablast's "
@ -391,8 +407,17 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
sb.safePrintf("<br><br>\n");
sb.safePrintf("<form method=get action=/addurl name=f>\n");
//CollectionRec *cr = g_collectiondb.getRec ( "main" );
//sb.safePrintf("<input type=hidden name=c value=\"%s\">",cr->m_coll);
CollectionRec *cr = g_collectiondb.getRec ( r );
// the collection we want to add the url to
char *coll = NULL;
if ( cr )
coll = cr->m_coll;
if ( coll )
sb.safePrintf("<input type=hidden name=c value=\"%s\">",coll);
if ( ! coll )
coll = "";
sb.safePrintf("<input name=u type=text size=60 value=\"");
if ( url ) {
SafeBuf tmp;
@ -416,6 +441,9 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
// or if in read-only mode
if ( g_conf.m_readOnlyMode )
msg = "Add url is temporarily disabled";
sb.safePrintf("<br><br>Add a url to the <b>%s</b> collection",coll);
// if url is non-empty the ajax will receive this identical msg
// and display it in the div, so do not duplicate the msg!
if ( msg && ! url )
@ -453,11 +481,12 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
unsigned long long rand64 = gettimeofdayInMillisecondsLocal();
// msg7 needs an explicit collection for /addurl for injecting
// in PageInject.cpp. it does not use defaults for safety.
sb.safePrintf("&id=%lu&c=main&rand=%llu';\n"
sb.safePrintf("&id=%lu&c=%s&rand=%llu';\n"
"client.open('GET', url );\n"
"client.send();\n"
"</script>\n"
, h32
, coll
, rand64
);
sb.safePrintf("</div>\n");
@ -526,9 +555,21 @@ bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("<br><br>\n");
sb.safePrintf("<br><br><br>\n");
sb.safePrintf("<a href=/>web</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=http://www.gigablast.com/seo>seo</a> &nbsp;&nbsp;&nbsp;&nbsp; <b>directory</b> &nbsp;&nbsp;&nbsp;&nbsp; \n");
sb.safePrintf("<a href=http://www.gigablast.com/events>events</a>"
" &nbsp;&nbsp;&nbsp;&nbsp; \n");
sb.safePrintf("<a href=/>web</a> &nbsp;&nbsp;&nbsp;&nbsp; ");
if ( g_conf.m_isMattWells )
sb.safePrintf("<a href=http://www.gigablast.com/seo>seo"
"</a> &nbsp;&nbsp;&nbsp;&nbsp; " );
sb.safePrintf("<a href=\"/Top\"><b>directory</b></a> "
"&nbsp;&nbsp;&nbsp;&nbsp; \n");
if ( g_conf.m_isMattWells )
sb.safePrintf("<a href=http://www.gigablast.com/events>"
"events</a>"
" &nbsp;&nbsp;&nbsp;&nbsp; \n");
sb.safePrintf("<a href=/adv.html>advanced search</a>");
sb.safePrintf(" &nbsp;&nbsp;&nbsp;&nbsp; ");
char *root = "";
@ -578,18 +619,13 @@ bool sendPageRoot ( TcpSocket *s , HttpRequest *r, char *cookie ) {
//long qlen;
//char *q = r->getString ( "q" , &qlen , NULL );
// insert collection name too
long collLen;
char *coll = r->getString("c",&collLen);
if ( ! coll || ! coll[0] ) {
//coll = g_conf.m_defaultColl;
coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
collLen = gbstrlen(coll);
}
// ensure collection not too big
if ( collLen >= MAX_COLL_LEN ) {
g_errno = ECOLLTOOBIG;
CollectionRec *cr = g_collectiondb.getRec(r);
if ( ! cr ) {
g_errno = ENOCOLLREC;
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
}
// get the collection rec
/*
CollectionRec *cr = g_collectiondb.getRec ( coll );
@ -1271,7 +1307,9 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
// collLen = gbstrlen(coll);
//}
// get collection rec
CollectionRec *cr = g_collectiondb.getRec ( r );
// bitch if no collection rec found
if ( ! cr ) {
g_errno = ENOCOLLREC;
@ -1552,6 +1590,8 @@ void doneInjectingWrapper3 ( void *st ) {
//CollectionRec *cr = g_collectiondb.getRec ( st1->m_coll );
// collection name
char *coll = st1->m_coll;
if ( ! coll ) coll = "";
//char tt [ 128 ];
//tt[0] = '\0';
@ -1658,8 +1698,10 @@ void doneInjectingWrapper3 ( void *st ) {
unsigned long rand32 = rand();
// in the mime to 0 seconds!
sb.safePrintf("<b>Url successfully added. "
"<a href=/search?rand=%lu&q=url%%3A",
rand32);
"<a href=/search?rand=%lu&"
"c=%s&q=url%%3A",
rand32,
coll);
sb.urlEncode(url);
sb.safePrintf(">Check it</a> or "
"<a href=http://www.gigablast.com/seo?u=");

@ -131,14 +131,14 @@ bool sendPageSockets ( TcpSocket *s , HttpRequest *r ) {
void printTcpTable ( SafeBuf* p, char *title, TcpServer *server ) {
// table headers for urls currently being spidered
p->safePrintf ( "<table width=100%% bgcolor=#d0d0f0 border=1>"
"<tr><td bgcolor=#c0c0f0 colspan=19>"
p->safePrintf ( "<table %s>"
"<tr class=hdrow><td colspan=19>"
"<center>"
//"<font size=+1>"
"<b>%s</b>"
//"</font>"
"</td></tr>"
"<tr>"
"<tr bgcolor=#%s>"
"<td><b>#</td>"
"<td><b>fd</td>"
"<td><b>age</td>"
@ -151,7 +151,11 @@ void printTcpTable ( SafeBuf* p, char *title, TcpServer *server ) {
"<td><b>bytes to read</td>"
"<td><b>bytes sent</td>"
"<td><b>bytes to send</td>"
"</tr>\n" , title );
"</tr>\n"
, TABLE_STYLE
, title
, DARK_BLUE
);
// current time in milliseconds
long long now = gettimeofdayInMilliseconds();
// store in buffer for sorting
@ -202,12 +206,12 @@ void printTcpTable ( SafeBuf* p, char *title, TcpServer *server ) {
case ST_CLOSE_CALLED: st="close called"; break;
}
// bgcolor is lighter for incoming requests
char *bg = "#c0c0f0";
if ( s->m_isIncoming ) bg = "#e8e8ff";
char *bg = "c0c0f0";
if ( s->m_isIncoming ) bg = "e8e8ff";
// times
long elapsed1 = now - s->m_startTime ;
long elapsed2 = now - s->m_lastActionTime ;
p->safePrintf ("<tr bgcolor=%s>"
p->safePrintf ("<tr bgcolor=#%s>"
"<td>%li</td>" // i
"<td>%i</td>" // fd
"<td>%lims</td>" // elapsed seconds since start
@ -301,26 +305,30 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
msgCount1[s->m_msgType]++;
}
// print the counts
p->safePrintf ( "<table bgcolor=#d0d0f0 border=1>"
"<tr><td bgcolor=#c0c0f0 colspan=19>"
p->safePrintf ( "<table %s>"
"<tr class=hdrow><td colspan=19>"
"<center>"
"<b>%s Summary</b> (%li transactions)"
"</td></tr>"
"<tr>"
"<tr bgcolor=#%s>"
"<td><b>niceness</td>"
"<td><b>msg type</td>"
"<td><b>total</td>"
"</tr>",
title , server->getNumUsedSlots() );
TABLE_STYLE,
title , server->getNumUsedSlots() ,
DARK_BLUE );
for ( long i = 0; i < 96; i++ ) {
if ( msgCount0[i] <= 0 ) continue;
p->safePrintf("<tr><td>0</td><td>0x%lx</td><td>%li</td></tr>",
i, msgCount0[i]);
p->safePrintf("<tr bgcolor=#%s>"
"<td>0</td><td>0x%lx</td><td>%li</td></tr>",
LIGHT_BLUE,i, msgCount0[i]);
}
for ( long i = 0; i < 96; i++ ) {
if ( msgCount1[i] <= 0 ) continue;
p->safePrintf("<tr><td>1</td><td>0x%lx</td><td>%li</td></tr>",
i, msgCount1[i]);
p->safePrintf("<tr bgcolor=#%s>"
"<td>1</td><td>0x%lx</td><td>%li</td></tr>",
LIGHT_BLUE,i, msgCount1[i]);
}
p->safePrintf ( "</table><br>" );
@ -333,15 +341,15 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
dd = //"<td><b>dns ip</b></td>"
"<td><b>hostname</b></td>";
}
// table headers for urls currently being spidered
p->safePrintf ( "<table width=100%% bgcolor=#d0d0f0 border=1>"
"<tr><td bgcolor=#c0c0f0 colspan=19>"
p->safePrintf ( "<table %s>"
"<tr class=hdrow><td colspan=19>"
"<center>"
//"<font size=+1>"
"<b>%s</b> (%li transactions)"
//"</font>"
"</td></tr>"
"<tr>"
"<tr bgcolor=#%s>"
"<td><b>age</td>"
"<td><b>last read</td>"
"<td><b>last send</td>"
@ -362,7 +370,11 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
"<td><b>dgrams to send</td>"
"<td><b>acks read</td>"
"<td><b>resends</td>"
"</tr>\n" , title , server->getNumUsedSlots() , dd );
"</tr>\n" ,
TABLE_STYLE,
title , server->getNumUsedSlots() ,
DARK_BLUE ,
dd );
// now fill in the columns
@ -385,9 +397,9 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
if ( s->m_lastReadTime == 0LL ) strcpy ( e1 , "--" );
if ( s->m_lastSendTime == 0LL ) strcpy ( e2 , "--" );
// bgcolor is lighter for incoming requests
char *bg = "#c0c0f0";
char *bg = LIGHT_BLUE;//"c0c0f0";
// is it incoming
if ( ! s->m_callback ) bg = "#e8e8ff";
if ( ! s->m_callback ) bg = LIGHTER_BLUE;//"e8e8ff";
Host *h = g_hostdb.getHost ( s->m_ip , s->m_port );
char *eip = "??";
unsigned short eport = 0 ;
@ -494,7 +506,7 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
if ( msgType == 0x25 ) desc = "get link info";
if ( msgType == 0xfd ) desc = "proxy forward";
p->safePrintf ( "<tr bgcolor=%s>"
p->safePrintf ( "<tr bgcolor=#%s>"
"<td>%s</td>" // age
"<td>%s</td>" // last read
"<td>%s</td>" // last send
@ -540,22 +552,25 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
cf2);
}
if ( ! isDns )
if ( ! isDns ) {
//"<td>%s</td>" // ip
//"<td>%hu</td>" // port
// clickable hostId
char *toFrom = "to";
if ( ! s->m_callback ) toFrom = "from";
//"<td><a href=http://%s:%hu/cgi/15.cgi>%li</a></td>"
p->safePrintf ( "<td>0x%hhx</td>" // msgtype
"<td><nobr>%s</nobr></td>" // desc
"<td><a href=http://%s:%hu/"
"<td><nobr>%s <a href=http://%s:%hu/"
"master/sockets?"
"c=%s>%s</a></td>"
"c=%s>%s</a></nobr></td>"
"<td>%s%li%s</td>" , // niceness
s->m_msgType ,
desc,
//iptoa(s->m_ip) ,
//s->m_port ,
// begin clickable hostId
toFrom,
eip ,
eport ,
coll ,
@ -565,6 +580,7 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
cf2
// end clickable hostId
);
}
p->safePrintf ( "<td>%lu</td>" // transId
"<td>%i</td>" // called handler

File diff suppressed because it is too large

@ -194,9 +194,6 @@ void sendReply ( void *state ) {
//g_pages.printAdminTop2 ( &buf , st->m_socket , &st->m_request, NULL ,
// tmpBuf.getBufStart(), tmpBuf.length() );
// write the controls section of the page
writeControls( &buf, st );
// Debug print of CGI parameters and errors
char startTimeStr[30];
char endTimeStr[30];
@ -211,10 +208,10 @@ void sendReply ( void *state ) {
"Turn on in the master controls.</b>"
"</font>\n" );
buf.safePrintf("<table cellpadding=10 border=0>\n");
buf.safePrintf("<table %s>\n",TABLE_STYLE);
buf.safePrintf("<tr><td>"
"<center>");
buf.safePrintf("<tr><td bgcolor=#%s>"
"<center>",LIGHT_BLUE);
/////////////////////////
//
@ -246,6 +243,9 @@ void sendReply ( void *state ) {
buf.safePrintf("</center>");
// write the controls section of the page
writeControls( &buf, st );
// print the bottom of the page
g_pages.printAdminBottom2( &buf );

@ -34,8 +34,8 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
long hiActive = q->m_hiLaunched - q->m_hiReturned;
long total = loActive + mdActive + hiActive;
p.safePrintf ( "<table width=100%% bgcolor=#d0d0f0 border=1>"
"<tr><td bgcolor=#c0c0f0 colspan=\"11\">"
p.safePrintf ( "<table %s>"
"<tr class=hdrow><td colspan=\"11\">"
//"<center>"
//"<font size=+1>"
"<b>Thread Type: %s"
@ -43,12 +43,13 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
" med: %li"
" high: %li"
" total: %li)</td></tr>",
TABLE_STYLE,
q->getThreadType(),
loActive, mdActive,
hiActive, total);
p.safePrintf ("<tr>"
p.safePrintf ("<tr bgcolor=#%s>"
"<td><b>Status</b></td>"
"<td><b>Niceness</b></td>"
"<td><b>Queued Time</b></td>"
@ -60,7 +61,9 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
"<td><b>Bytes Done</b></td>"
"<td><b>KBytes/Sec</b></td>"
"<td><b>Read|Write</b></td>"
"</tr>");
"</tr>"
, LIGHT_BLUE
);
for ( long j = 0 ; j < q->m_top ; j++ ) {
ThreadEntry *t = &q->m_entries[j];
@ -73,7 +76,7 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
// might have got pre-called from EDISKSTUCK
if ( ! t->m_callback ) fs = NULL;
p.safePrintf("<tr>");
p.safePrintf("<tr bgcolor=#%s>", DARK_BLUE );
if(t->m_isDone) {
p.safePrintf("<td><font color='red'><b>done</b></font></td>");
@ -109,7 +112,7 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
if(diskThread && fs ) {
long long took = (now - t->m_launchedTime);
if(took <= 0) took = 1;
p.safePrintf("<td>???/%li</td>", t->m_bytesToGo);
p.safePrintf("<td>%c%c%c/%li</td>", '?','?','?',t->m_bytesToGo);
p.safePrintf("<td>%.2f kbps</td>", 0.0);//(float)fs->m_bytesDone/took);
p.safePrintf("<td>%s</td>",t->m_doWrite? "Write":"Read");
}
@ -159,41 +162,50 @@ bool sendPageThreads ( TcpSocket *s , HttpRequest *r ) {
long hiActiveMed = disk->m_hiLaunchedMed - disk->m_hiReturnedMed;
long hiActiveSma = disk->m_hiLaunchedSma - disk->m_hiReturnedSma;
long activeWrites = disk->m_writesLaunched - disk->m_writesReturned;
p.safePrintf ( "<table width=100%% bgcolor=#d0d0f0 border=1>"
"<tr><td bgcolor=#c0c0f0 colspan=\"5\">");
p.safePrintf ( "<table %s>"
"<tr class=hdrow><td colspan=\"5\">"
, TABLE_STYLE );
p.safePrintf ( "<center><b>Active Read Threads</b></center></td></tr>"
"<tr><td></td><td colspan='3'><center><b>Priority</b></center></td></tr>"
"<tr>"
"<tr bgcolor=#%s>"
"<td></td><td colspan='3'>"
"<center><b>Priority</b></center></td></tr>"
"<tr bgcolor=#%s>"
"<td><b>Size</b></td><td>Low</td><td>Medium</td><td>High</td>"
"</tr>"
// "<tr>"
// "<td>Size</td>"
// "</tr>"
"<tr>"
"<tr bgcolor=#%s>"
"<td>Small</td> <td>%li</td><td>%li</td><td>%li</td>"
"</tr>"
"<tr>"
"<tr bgcolor=#%s>"
"<td>Medium</td> <td>%li</td><td>%li</td><td>%li</td>"
"</tr>"
"<tr>"
"<tr bgcolor=#%s>"
"<td>Large</td> <td>%li</td><td>%li</td><td>%li</td>"
"</tr>"
"</table><br><br>",
LIGHT_BLUE,
LIGHT_BLUE,
DARK_BLUE,
loActiveSma,
mdActiveSma,
hiActiveSma,
DARK_BLUE,
loActiveMed,
mdActiveMed,
hiActiveMed,
DARK_BLUE,
loActiveBig,
mdActiveBig,
hiActiveBig);
p.safePrintf ("<table width=100%% bgcolor=#d0d0f0 border=1>");
p.safePrintf ("<tr>"
"<td bgcolor=#c0c0f0><b>Active Write Threads</b></td><td>%li</td>"
p.safePrintf ("<table %s>",TABLE_STYLE);
p.safePrintf ("<tr class=hdrow>"
"<td><b>Active Write Threads</b></td><td>%li</td>"
"</tr></table>",
activeWrites);

Pages.cpp (278 changed lines)

@ -222,11 +222,11 @@ static WebPage s_pages[] = {
//USER_ADMIN | USER_MASTER ,
"page filter page",
sendPageGeneric , 0 } ,
{ PAGE_INJECT , "admin/inject" , 0 , "inject urls" , 0 , 1 ,
{ PAGE_INJECT , "admin/inject" , 0 , "inject url" , 0 , 1 ,
//USER_ADMIN | USER_MASTER ,
"inject url in the index here",
sendPageInject , 2 } ,
{ PAGE_ADDURL2 , "admin/addurl" , 0 , "add url" , 0 , 0 ,
{ PAGE_ADDURL2 , "admin/addurl" , 0 , "add urls" , 0 , 0 ,
//USER_ADMIN | USER_MASTER ,
"add url page",
sendPageAddUrl , 0 } ,
@ -913,7 +913,9 @@ bool Pages::printAdminTop ( SafeBuf *sb ,
if ( user ) pwd = user->m_password;
sb->safePrintf(
"<html>\n"
"<html>\n");
sb->safePrintf(
"<head>\n"
"<title>%s | gigablast admin</title>\n"
"<meta http-equiv=\"Content-Type\" "
@ -961,21 +963,24 @@ bool Pages::printAdminTop ( SafeBuf *sb ,
coll, NULL, fromIp, qs );
}
// end table
sb->safePrintf ("</td></tr></table><br/><br/>\n");
sb->safePrintf ("</td></tr></table><br/>\n");//<br/>\n");
SafeBuf mb;
long adds = 0;
PingServer *ps = &g_pingServer;
mb.safePrintf("<center>"
mb.safePrintf(//"<center>"
"<table cellpadding=5 "
"style=\""
//"border:2px solid black;"
"max-width:600px\" "
"background-color:#ff6666;"
"border:2px #8f0000 solid;"
"border-radius:5px;"
"max-width:600px;"
"\" "
"border=0"
">"
"<tr><td bgcolor=#ff6666>");
"<tr><td>");
// emergency message box
if ( g_pingServer.m_hostsConfInDisagreement ) {
@ -997,8 +1002,9 @@ bool Pages::printAdminTop ( SafeBuf *sb ,
*needsRebalance ) {
if ( adds ) mb.safePrintf("<br><br>");
adds++;
mb.safePrintf("This host requires a shard rebalance. "
"Click 'rebalance shards' in master controls.");
mb.safePrintf("A host requires a shard rebalance. "
"Click 'rebalance shards' in master controls to "
"rebalance all hosts.");
}
if ( ps->m_numHostsDead ) {
@ -1010,39 +1016,22 @@ bool Pages::printAdminTop ( SafeBuf *sb ,
"pings.",ps->m_numHostsDead ,s );
}
mb.safePrintf("</td></tr></table></center><br>");
if ( ! g_conf.m_useThreads || g_threads.m_disabled ) {
if ( adds ) mb.safePrintf("<br><br>");
adds++;
mb.safePrintf("Threads are disabled. Severely hurts "
"performance.");
}
// a new table. on the left is collections, on right is other stuff
sb->safePrintf("<TABLE "
"cellpadding=5 border=0>"
"<TR>"
"<TD valign=top>"
"<div "
"style="
"max-height:600px;"
//"max-width:225px;"
//"min-width:225px;"
"overflow-y:auto;"
"overflow-x:hidden>"
);
// collection under that
status &= printCollectionNavBar ( sb, page , username , coll,pwd, qs );
// then collection page links and parms
sb->safePrintf("</div></TD><TD valign=top><br>");
// print emergency msg box
if ( adds )
sb->safePrintf("%s",mb.getBufStart());
// print the links
status &= printAdminLinks ( sb, page , username , coll , pwd, true );
// print the links
status &= printAdminLinks ( sb, page , username , coll ,pwd , false );
mb.safePrintf("</td></tr></table>"
//"</center>"
"<br>");
////////
//
// . the form
//
////////
// . we cannot use the GET method if there is more than a few k of
// parameters, like in the case of the Search Controls page. The
// browser simply will not send the request if it is that big.
@ -1054,7 +1043,6 @@ bool Pages::printAdminTop ( SafeBuf *sb ,
sb->safePrintf ("<form name=\"SubmitInput\" method=\"get\" "
"action=\"/%s\">\n",
s_pages[page].m_filename);
// pass on this stuff
//if ( ! pwd ) pwd = "";
//sb->safePrintf ( "<input type=hidden name=pwd value=\"%s\">\n",pwd);
@ -1065,11 +1053,66 @@ bool Pages::printAdminTop ( SafeBuf *sb ,
if ( g_users.hasPermission ( username, PAGE_ADMIN ) ){
sb->safePrintf("<input type=hidden name=master value=0>\n");
}
// should any changes be broadcasted to all hosts?
sb->safePrintf ("<input type=hidden name=cast value=\"%li\">\n",
(long)s_pages[page].m_cast);
// a new table. on the left is collections, on right is other stuff
sb->safePrintf("<TABLE "
"cellpadding=5 border=0>"
"<TR>"
"<td></td>"
);
// then collection page links and parms
sb->safePrintf("<TD valign=top>");
// print emergency msg box
if ( adds )
sb->safePrintf("<br>%s",mb.getBufStart());
// print the links
status &= printAdminLinks ( sb, page , username , coll , pwd, true );
// print the links
status &= printAdminLinks ( sb, page , username , coll ,pwd , false );
// begin 2nd row in big table
sb->safePrintf("</td></TR>");
sb->safePrintf(
"<TR>"
"<TD valign=top>"
"<div "
"style=\""
"max-height:600px;"
"max-width:200px;"
"min-width:200px;"
"padding:4px;" // same as TABLE_STYLE
"background-color:#d0d0d0;"
"border-radius:10px;"
"border:2px #606060 solid;"
//"border-width:2px;"
//"border-color:#606060;"
"overflow-y:auto;"
"overflow-x:hidden;"
"line-height:23px;"
"\""
">"
);
// collection under that
status &= printCollectionNavBar ( sb, page , username , coll,pwd, qs );
sb->safePrintf("</div></TD>");
// the controls will go here
sb->safePrintf("<TD valign=top>");
return true;
}
@ -1783,7 +1826,7 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
//sprintf(p,"<font size=+1>\n" );
//p += gbstrlen(p);
sb->safePrintf ("<center>\n" );
//sb->safePrintf ("<center>\n" );
// sometimes we do not want to be USER_MASTER for testing
char buf [ 64 ];
@ -1807,6 +1850,10 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
if ( ! g_conf.m_isMattWells && i == PAGE_SEO )
continue;
// skip page autoban link
if ( ! g_conf.m_isMattWells && i == PAGE_AUTOBAN )
continue;
// ignore these for now
if ( i == PAGE_SECURITY ) continue;
if ( i == PAGE_ACCESS ) continue;
@ -1815,38 +1862,55 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
if ( i == PAGE_SEARCHBOX ) continue;
if ( i == PAGE_TITLEDB ) continue;
// print "url download" before "inject url"
// GET /mycollname_urls.csv
if ( i == PAGE_INJECT ) {
sb->safePrintf (
"<b>"
"<a style=text-decoration:none; "
"href=\"/download/%s_urls.txt\">"
"url download"
"</a>"
"</b>"
" &nbsp; \n",
coll );
}
if ( cr && ! cr->m_isCustomCrawl && i == PAGE_CRAWLBOT )
continue;
// print it out
if ( i == PAGE_LOGIN || i == PAGE_LOGIN2 )
sb->safePrintf(
"<span style=\"white-space:nowrap\">"
"<a href=\"/%s?"
//"user=%s&pwd=%s&"
"c=%s%s\">%s</a>"
"</span>"
" &nbsp; \n",s_pages[i].m_filename,
//username,pwd,
coll,
buf,s_pages[i].m_name);
//"<span style=\"white-space:nowrap\">"
"<a href=\"/%s?"
//"user=%s&pwd=%s&"
"c=%s%s\">%s</a>"
//"</span>"
" &nbsp; \n",s_pages[i].m_filename,
//username,pwd,
coll,
buf,s_pages[i].m_name);
else if ( page == i )
sb->safePrintf(
"<span style=\"white-space:nowrap\">"
"<a href=\"/%s?c=%s%s\"><b>"
"<font color=red>%s</font></b></a>"
"</span>"
" &nbsp; \n",s_pages[i].m_filename,
coll,
buf,s_pages[i].m_name);
//"<span style=\"white-space:nowrap\">"
"<a href=\"/%s?c=%s%s\"><b>"
"<font color=red>%s</font></b></a>"
//"</span>"
" &nbsp; \n",s_pages[i].m_filename,
coll,
buf,s_pages[i].m_name);
else
sb->safePrintf(
"<span style=\"white-space:nowrap\">"
"<a href=\"/%s?c=%s%s\">%s</a>"
"</span>"
" &nbsp; \n",s_pages[i].m_filename,
coll,
buf,s_pages[i].m_name);
//"<span style=\"white-space:nowrap\">"
"<b>"
"<a style=text-decoration:none; "
"href=\"/%s?c=%s%s\">%s</a>"
"</b>"
//"</span>"
" &nbsp; \n",s_pages[i].m_filename,
coll,
buf,s_pages[i].m_name);
// print <br> after the last master admin control
/*
if ( i == PAGE_DELCOLL && user == USER_MASTER ) {
@ -1861,7 +1925,24 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
}
*/
}
sb->safePrintf("</center><br/>" );
// print documentation links
if ( top ) {
sb->safePrintf(" <a style=text-decoration:none "
"href=/admin.html>"
"<b>"
"admin guide"
"</b></a> "
"&nbsp; "
" <a style=text-decoration:none; "
"href=/developer.html>"
"<b>dev guide</b></a>" );
}
//sb->safePrintf("</center>" );
sb->safePrintf("<br/>" );
if ( top ) sb->safePrintf("<br/>" );
if ( top ) return status;
@ -1956,6 +2037,7 @@ bool Pages::printCollectionNavBar ( SafeBuf *sb ,
bool status = true;
//if ( ! pwd ) pwd = "";
if ( ! qs ) qs = "";
// if not admin just print collection name
if ( g_collectiondb.m_numRecsUsed == 0 ) {
sb->safePrintf ( "<center>"
@ -1990,26 +2072,42 @@ bool Pages::printCollectionNavBar ( SafeBuf *sb ,
char *s = "s";
if ( g_collectiondb.m_numRecsUsed == 1 ) s = "";
sb->safePrintf ( "<center><b>%li Collection%s</b></center><br>\n",
sb->safePrintf ( "<center><nobr><b>%li Collection%s</b></nobr>"
"</center><br>\n",
g_collectiondb.m_numRecsUsed , s );
char *color = "red";
//if ( page >= PAGE_CGIPARMS ) color = "red";
//else color = "black";
// style for printing collection names
sb->safePrintf("<style>.x{text-decoration:none;font-weight:bold;}"
".e{background-color:#e0e0e0;}"
"</style>\n");
long row = 0;
//for ( long i = a ; i < b ; i++ ) {
for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
CollectionRec *cc = g_collectiondb.m_recs[i];
if ( ! cc ) continue;
char *cname = cc->m_coll;
row++;
//if ( p + gbstrlen(cname) + 100 >= pend ) return p;
// collection name HACK for backwards compatibility
//if ( ! cname[0] ) cname = "main";
// every other coll in a darker div
if ( (row % 2) == 0 )
sb->safePrintf("<div class=e>");
sb->safePrintf("<nobr>");
if ( i != collnum || ! highlight )// || ! coll || ! coll[0])
sb->safePrintf ( "<a title=\"%s\" "
"class=x "
"href=\"/%s?c=%s%s\">%s"
"</a> &nbsp;",
cname,
@ -2017,12 +2115,17 @@ bool Pages::printCollectionNavBar ( SafeBuf *sb ,
cname ,
qs, cname );
else
sb->safePrintf ( "<b><font title=\"%s\" "
"color=%s>%s</font></b> "
sb->safePrintf ( "<u><b><font title=\"%s\" "
"color=%s>%s</font></b></u> "
"&nbsp; ",
cname, color , cname );
sb->safePrintf("</nobr>");
sb->safePrintf("<br>\n");
// every other coll in a darker div
if ( (row % 2) == 0 )
sb->safePrintf("</div>");
else
sb->safePrintf("<br>\n");
}
//sb->safePrintf ( "</center><br/>" );
@ -2383,21 +2486,32 @@ bool sendPageCgiParms ( TcpSocket *s , HttpRequest *r ) {
// p.incrementLength ( pp - p.getBuf() );
// }
p.safePrintf ( "<table width=100%% cellpadding=2 "
"bgcolor=#%s border=1>"
"<tr><td colspan=4 bgcolor=#%s>"
p.safePrintf ( "<table %s>"
"<tr class=hdrow><td colspan=8>"
"<center><b>CGI Parameters</b></tr></tr>"
"<tr><td><b>CGI</b></td><td><b>Type</b></td>"
"<tr bgcolor=#%s><td><b>CGI</b></td>"
"<td><b>Page</b></td>"
"<td><b>Type</b></td>"
"<td><b>Name</b></td><td><b>Description</b></td></tr>\n",
LIGHT_BLUE, DARK_BLUE );
TABLE_STYLE , DARK_BLUE);
for ( long i = 0; i < g_parms.m_numParms; i++ ) {
Parm *parm = &g_parms.m_parms[i];
if ( !parm->m_sparm ) continue;
// use m_cgi if no m_scgi
char *cgi = parm->m_cgi;
if ( parm->m_scgi ) cgi = parm->m_scgi;
// skip if hidden
if ( parm->m_flags & PF_HIDDEN ) continue;
char *page = parm->m_scmd;
if ( ! page ) page = "";
// print the parm
p.safePrintf ( "<tr><td><b>%s</b></td><td nowrap=1>", cgi );
p.safePrintf ( "<tr bgcolor=#%s><td><b>%s</b></td>",
LIGHT_BLUE , cgi );
p.safePrintf("<td>%s</td>",page);
p.safePrintf("<td nowrap=1>");
switch ( parm->m_type ) {
case TYPE_BOOL: p.safePrintf ( "BOOL" ); break;
case TYPE_BOOL2: p.safePrintf ( "BOOL" ); break;
@ -2417,14 +2531,13 @@ bool sendPageCgiParms ( TcpSocket *s , HttpRequest *r ) {
}
p.safePrintf ( "</table><br><br>" );
p.safePrintf ( "<table width=100%% cellpadding=2 "
"bgcolor=#%s border=1>"
"<tr><td colspan=2 bgcolor=#%s>"
p.safePrintf ( "<table %s>"
"<tr class=hdrow><td colspan=2>"
"<center><b>Query Operators</b></td></tr>"
"<tr><td><b>Operator</b></td>"
"<td><b>Description</b>"
"</td></tr>\n",
LIGHT_BLUE, DARK_BLUE );
TABLE_STYLE );
// table of the query keywords
long n = getNumFieldCodes();
for ( long i = 0 ; i < n ; i++ ) {
@ -2434,8 +2547,9 @@ bool sendPageCgiParms ( TcpSocket *s , HttpRequest *r ) {
char *d = f->desc;
// fix table internal cell bordering
if ( d[0] == '\0' ) d = "&nbsp;";
p.safePrintf("<tr><td><b>%s</b>:</td><td>%s</td></tr>\n",
f->text,d);
p.safePrintf("<tr bgcolor=#%s>"
"<td><b>%s</b>:</td><td>%s</td></tr>\n",
LIGHT_BLUE,f->text,d);
}
p.safePrintf("</body></html>");

@ -14,8 +14,12 @@
#include "SafeBuf.h"
#include "PageCrawlBot.h" // sendPageCrawlBot()
#define LIGHTER_BLUE "e8e8ff"
#define LIGHT_BLUE "d0d0e0"
#define DARK_BLUE "c0c0f0"
#define DARKER_BLUE "a0a0f0"
#define DARKEST_BLUE "8080f0"
#define TABLE_STYLE " style=\"border-radius:10px;border:#6060f0 2px solid;\" width=100% bgcolor=#a0a0f0 cellpadding=4 border=0 "
extern char *g_msg;
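One detail worth noting about TABLE_STYLE: the macro body contains a bare % (width=100%), so the call sites in this commit pass it through a %s conversion as data, never splice it into the format literal where that % would be misread as a conversion specifier. A tiny sketch of the safe form, using plain printf and a shortened, hypothetical copy of the macro:

#include <cstdio>

// shortened copy of the Pages.h macro for illustration; note the bare '%'
#define TABLE_STYLE_SKETCH " width=100% bgcolor=#a0a0f0 cellpadding=4 border=0 "

int main ( ) {
	// safe: the macro travels through %s, so its '%' is plain data
	printf ( "<table %s>\n" , TABLE_STYLE_SKETCH );
	// unsafe: using "<table " TABLE_STYLE_SKETCH ">\n" as the format
	// string would leave a stray '%' conversion in the format itself
	return 0;
}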

Parms.cpp (1018 changed lines)

File diff suppressed because it is too large

@ -93,9 +93,10 @@ class Page {
#define PF_API 0x10
#define PF_REBUILDURLFILTERS 0x20
#define PF_NOSYNC 0x40
#define PF_CUSTOMCRAWLONLY 0x80
#define PF_DIFFBOT 0x80
#define PF_HIDDEN 0x0100
#define PF_NOSAVE 0x0200
class Parm {
@ -342,7 +343,8 @@ class Parms {
bool sendToGrunts = true ,
bool sendToProxies = false ,
// send to this single hostid? -1 means all
long hostId = -1 );
long hostId = -1 ,
long hostId2 = -1 ); // hostid range?
bool doParmSendingLoop ( ) ;
bool syncParmsWithHost0 ( ) ;
bool makeSyncHashList ( SafeBuf *hashList ) ;

@ -2677,8 +2677,14 @@ void checkKernelErrors( int fd, void *state ){
// klogctl reads the last 4k bytes of the kernel ring buffer
short bufLen = klogctl(3,buf,4096);
long long took = gettimeofdayInMilliseconds() - st;
if ( took > 1 )
if ( took >= 3 ) {
long len = bufLen;
if ( len > 200 ) len = 200;
char c = buf[len];
buf[len] = '\0';
log("db: klogctl took %lli ms to read %s",took, buf);
buf[len] = c;
}
if ( bufLen < 0 ){
log ("db: klogctl returned error: %s",mstrerror(errno));

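The klogctl() change above only logs when the read takes 3 ms or longer, and prints at most 200 bytes of the buffer so a slow read cannot flood the log. A stand-alone sketch of the same pattern (Linux-only; command 3 is SYSLOG_ACTION_READ_ALL and may require root or CAP_SYSLOG):

#include <sys/klog.h>
#include <sys/time.h>
#include <cstdio>

static long long nowMs ( ) {
	struct timeval tv;
	gettimeofday ( &tv , NULL );
	return (long long)tv.tv_sec * 1000 + tv.tv_usec / 1000;
}

int main ( ) {
	char buf[4096 + 1];
	long long start = nowMs ( );
	// 3 == SYSLOG_ACTION_READ_ALL: copy the tail of the kernel ring buffer
	int bufLen = klogctl ( 3 , buf , 4096 );
	long long took = nowMs ( ) - start;
	if ( bufLen < 0 ) { perror ( "klogctl" ); return 1; }
	if ( took >= 3 ) {
		// log only a bounded prefix of what was read
		int len = bufLen < 200 ? bufLen : 200;
		buf[len] = '\0';
		printf ( "klogctl took %lld ms to read: %s\n" , took , buf );
	}
	return 0;
}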
@ -89,7 +89,7 @@ bool Placedb::init2 ( long treeMem ) {
return false;
return true;
}
/*
bool Placedb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;
@ -101,7 +101,7 @@ bool Placedb::addColl ( char *coll, bool doVerify ) {
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}
*/
bool Placedb::verify ( char *coll ) {
log ( LOG_INFO, "db: Verifying Placedb for coll %s...", coll );
g_threads.disableThreads();

@ -227,7 +227,7 @@ bool Posdb::init2 ( long treeMem ) {
bool Posdb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! m_rdb.addRdbBase1 ( coll ) ) return false;
if ( ! doVerify ) return true;
// verify
if ( verify(coll) ) return true;

@ -108,6 +108,10 @@ char *g_files[] = {
"pdftohtml", // pdf
"pstotext" , // postscript
//"ppthtml" , // powerpoint
// required for SSL server support for both getting web pages
// on https:// sites and for serving https:// pages
"gb.pem",
//"dict/unifiedDict",
//"dict/thesaurus.txt",
@ -187,6 +191,7 @@ char *g_files[] = {
bool Process::checkFiles ( char *dir ) {
/*
// check these by hand since you need one or the other
File f1;
File f2;
@ -199,15 +204,14 @@ bool Process::checkFiles ( char *dir ) {
if ( //( ! f3.doesExist() || ! f4.doesExist() ) &&
( ! f4.doesExist() ) &&
( ! f1.doesExist() || ! f2.doesExist() ) ) {
/*
log("db: need either (%s and %s) or (%s and %s)",
f3.getFilename() ,
f4.getFilename() ,
f1.getFilename() ,
f2.getFilename() );
*/
//return false;
}
*/
// check for email subdir
//f1.set ( dir , "/html/email/");
@ -410,7 +414,7 @@ bool Process::init ( ) {
//m_rdbs[m_numRdbs++] = g_tfndb.getRdb ();
m_rdbs[m_numRdbs++] = g_titledb.getRdb ();
//m_rdbs[m_numRdbs++] = g_revdb.getRdb ();
//m_rdbs[m_numRdbs++] = g_sectiondb.getRdb ();
m_rdbs[m_numRdbs++] = g_sectiondb.getRdb ();
m_rdbs[m_numRdbs++] = g_posdb.getRdb ();
//m_rdbs[m_numRdbs++] = g_datedb.getRdb ();
m_rdbs[m_numRdbs++] = g_spiderdb.getRdb ();
@ -430,7 +434,7 @@ bool Process::init ( ) {
//m_rdbs[m_numRdbs++] = g_tfndb2.getRdb ();
m_rdbs[m_numRdbs++] = g_titledb2.getRdb ();
//m_rdbs[m_numRdbs++] = g_revdb2.getRdb ();
//m_rdbs[m_numRdbs++] = g_sectiondb2.getRdb ();
m_rdbs[m_numRdbs++] = g_sectiondb2.getRdb ();
m_rdbs[m_numRdbs++] = g_posdb2.getRdb ();
//m_rdbs[m_numRdbs++] = g_datedb2.getRdb ();
m_rdbs[m_numRdbs++] = g_spiderdb2.getRdb ();
@ -1426,6 +1430,13 @@ bool Process::shutdown2 ( ) {
// at least destroy the page caches that have shared memory
// because they seem to not clean it up
resetPageCaches();
// let's ensure our core file can dump
struct rlimit lim;
lim.rlim_cur = lim.rlim_max = RLIM_INFINITY;
if ( setrlimit(RLIMIT_CORE,&lim) )
log("gb: setrlimit: %s.", mstrerror(errno) );
// . force an abnormal termination which will cause a core dump
// . do not dump core on SIGHUP signals any more though
abort();
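The shutdown path now raises the core-file size limit to unlimited immediately before abort(), so the deliberate crash actually leaves a usable core even when the process inherited a small or zero RLIMIT_CORE. A stand-alone sketch of that idiom (an unprivileged process can only raise the soft limit up to its hard limit, which is why the error is logged rather than treated as fatal):

#include <sys/resource.h>
#include <cstdio>
#include <cstring>
#include <cstdlib>
#include <cerrno>

// Raise the core-file limit as far as the kernel allows, then abort so a
// core dump is written for post-mortem debugging.
static void dumpCoreAndExit ( ) {
	struct rlimit lim;
	lim.rlim_cur = lim.rlim_max = RLIM_INFINITY;
	if ( setrlimit ( RLIMIT_CORE , &lim ) != 0 )
		fprintf ( stderr , "setrlimit: %s\n" , strerror ( errno ) );
	abort ( );
}

int main ( ) { dumpCoreAndExit ( ); }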
@ -1478,7 +1489,7 @@ void Process::disableTreeWrites ( ) {
rdb->disableWrites();
}
// disable all spider trees and tables
for ( long i = 0 ; i < g_collectiondb.getNumRecs() ; i++ ) {
for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(i);
if ( ! sc ) continue;
sc->m_waitingTree .disableWrites();
@ -1495,7 +1506,7 @@ void Process::enableTreeWrites ( ) {
rdb->enableWrites();
}
// enable all waiting trees
for ( long i = 0 ; i < g_collectiondb.getNumRecs() ; i++ ) {
for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(i);
if ( ! sc ) continue;
sc->m_waitingTree .enableWrites();
@ -1771,6 +1782,8 @@ void Process::resetAll ( ) {
g_wiktionary.reset();
g_countryCode.reset();
s_clusterdbQuickCache.reset();
s_hammerCache.reset();
s_table32.reset();
@ -1824,7 +1837,7 @@ void Process::resetPageCaches ( ) {
//g_datedb .getDiskPageCache()->reset();
g_linkdb .getDiskPageCache()->reset();
g_titledb .getDiskPageCache()->reset();
//g_sectiondb .getDiskPageCache()->reset();
g_sectiondb .getDiskPageCache()->reset();
g_tagdb .getDiskPageCache()->reset();
g_spiderdb .getDiskPageCache()->reset();
//g_tfndb .getDiskPageCache()->reset();

@ -66,7 +66,7 @@ bool Profiler::init() {
return false;
if ( ! m_activeFns.set(4,4,256,NULL,0,false,0,"activefns") )
return false;
return m_fn.set(4,sizeof(FnInfo),256,NULL,0,false,0,"fntbl");
return m_fn.set(4,sizeof(FnInfo),65536,NULL,0,false,0,"fntbl");
}
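Bumping the function-info table from 256 to 65536 initial slots means the profiler no longer grows and rehashes the table while it is busy sampling. With a generic hash map the equivalent move is reserving capacity up front; a small sketch, with a simplified stand-in for the project's FnInfo:

#include <unordered_map>
#include <cstdint>

struct FnInfoSketch { long long timesCalled; long long totalTimeMs; };

int main ( ) {
	std::unordered_map<uint32_t, FnInfoSketch> fnTable;
	// pre-size so inserts during profiling do not trigger rehashes
	fnTable.reserve ( 65536 );
	fnTable[0x804a1f0] = FnInfoSketch{ 1 , 0 };
	return 0;
}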
@ -751,17 +751,19 @@ bool Profiler::printInfo(SafeBuf *sb,char *username, //long user,
}
sb->safePrintf( "<center>\n<table border=1 cellpadding=4 "
"width=100%% bgcolor=#%s>\n"
"<tr><td colspan=9 bgcolor=#%s>"
sb->safePrintf( "<center>\n<table %s>\n"
"<tr class=hdrow><td colspan=9>"
"<center><b>Profiler "//- Since Startup</b></center>"
"<a href=\"/admin/profiler?c=%s"//"
"&profilerreset=1\">"
"(reset)</a></b></center>"
"</td></tr>\n",LIGHT_BLUE,DARK_BLUE,
"</td></tr>\n",
TABLE_STYLE,
coll);
sb->safePrintf("<tr><td><b>Address</b></td><td><b>Function</b></td>");
sb->safePrintf("<tr bgcolor=#%s>"
"<td><b>Address</b></td><td><b>Function</b></td>"
, LIGHT_BLUE);
sb->safePrintf("<td><b><a href=/admin/profiler?sorts=3&c=%s>"
"Times Called</a></b></td></td>",coll);
sb->safePrintf("<td><b><a href=/admin/profiler?sorts=4&c=%s>"
@ -858,12 +860,13 @@ bool Profiler::printInfo(SafeBuf *sb,char *username, //long user,
//Now to print the table of functions called in the last 10 seconds
sb->safePrintf( "<center>\n<table border=1 cellpadding=4 "
"width=100%% bgcolor=#%s>\n"
"<tr><td colspan=8 bgcolor=#%s>"
sb->safePrintf( "<center>\n<table %s>\n"
"<tr class=hdrow><td colspan=8>"
"<center><b>Profiler - Last 10 seconds</b></center>"
"</td></tr>\n",LIGHT_BLUE,DARK_BLUE);
sb->safePrintf("<tr><td><b>Address</b></td><td><b>Function</b></td>");
"</td></tr>\n",TABLE_STYLE);
sb->safePrintf("<tr bgcolor=#%s>"
"<td><b>Address</b></td><td><b>Function</b></td>",
LIGHT_BLUE);
sb->safePrintf("<td><b><a href=/admin/profiler?sort10=3&c=%s&"
">"
"Times Called</a></b></td></td>",coll);
@ -1003,22 +1006,24 @@ bool Profiler::printInfo(SafeBuf *sb,char *username, //long user,
numSlots = m_quickpolls.getNumSlots();
numSlotsUsed = m_quickpolls.getNumSlotsUsed();
sb->safePrintf("<center>\n<table border=1 cellpadding=4 "
"width=100%% bgcolor=#%s>\n"
"<tr><td colspan=5 bgcolor=#%s>"
sb->safePrintf("<center>\n<table %s>\n"
"<tr class=hdrow><td colspan=5>"
"<center><b>Triggered Quickpolls "
"<a href=\"/admin/profiler?c=%s"
"&qpreset=1\">"
"(reset)</a></b></center>"
"</td></tr>\n",LIGHT_BLUE,DARK_BLUE,
"</td></tr>\n",
TABLE_STYLE,
coll);
sb->safePrintf("<tr><td><b>Between Functions</b></td>"
sb->safePrintf("<tr bgcolor=#%s>"
"<td><b>Between Functions</b></td>"
"<td><b>max blocked(msec)</b></td>"
"<td><b>avg time(msec)</b></td>"
"<td><b>times triggered</b></td>"
"<td><b>total(msec)</b></td>"
"</tr>");
"</tr>"
, LIGHT_BLUE );
if(numSlotsUsed == 0) {
sb->safePrintf("</table>");
@ -1539,15 +1544,13 @@ Profiler::printRealTimeInfo(SafeBuf *sb,
int realTimeSortMode,
int realTimeShowAll) {
if(!m_realTimeProfilerRunning) {
sb->safePrintf("<table border=1 cellpadding=4 bgcolor=#%s "
"width=100%%\n>",
LIGHT_BLUE);
sb->safePrintf("<tr><td colspan=7 bgcolor=#%s>"
sb->safePrintf("<table %s>",TABLE_STYLE);
sb->safePrintf("<tr class=hdrow><td colspan=7>"
"<center><b>Real Time Profiler "
"<a href=\"/admin/profiler?c=%s"
"&rtstart=1\">"
"(Start)</a></b></center>"
"</td></tr>\n",DARK_BLUE,coll);
"</td></tr>\n",coll);
sb->safePrintf("</table><br><br>\n");
return true;
}
@ -1562,16 +1565,14 @@ Profiler::printRealTimeInfo(SafeBuf *sb,
//}
rtNumEntries = realTimeProfilerData.getNumUsedSlots();
if(!rtNumEntries) {
sb->safePrintf("<table border=1 cellpadding=4 bgcolor=#%s "
"width=100%%\n>",
LIGHT_BLUE);
sb->safePrintf("<tr><td colspan=7 bgcolor=#%s>"
sb->safePrintf("<table %s>",TABLE_STYLE);
sb->safePrintf("<tr class=hdrow><td colspan=7>"
"<center><b>Real Time Profiler started, refresh page "
"after some time."
"<a href=\"/admin/profiler?c=%s"
"&rtstop=1\">"
"(Stop)</a></b></center>"
"</td></tr>\n",DARK_BLUE,coll);
"</td></tr>\n",coll);
sb->safePrintf("</table><br><br>\n");
startRealTimeProfiler();
return true;
@ -1626,9 +1627,7 @@ Profiler::printRealTimeInfo(SafeBuf *sb,
hitEntries[i].missedQuickPollsPerFunc=missedQuickPolls;
}
}
sb->safePrintf("<table border=1 cellpadding=4 bgcolor=#%s "
"width=100%%>\n",
LIGHT_BLUE);
sb->safePrintf("<table %s>",TABLE_STYLE);
char *showMessage;
int rtall;
if(realTimeShowAll) {
@ -1638,11 +1637,11 @@ Profiler::printRealTimeInfo(SafeBuf *sb,
showMessage = "(show all)";
rtall = 1;
}
sb->safePrintf("<tr><td colspan=7 bgcolor=#%s>"
sb->safePrintf("<tr class=hdrow><td colspan=7>"
"<center><b>Real Time Profiler "
"<a href=\"/admin/profiler?c=%s"
"&rtall=%i\">%s</a>"
,DARK_BLUE,coll,
,coll,
rtall, showMessage);
sb->safePrintf("<a href=\"/admin/profiler?c=%s&rtstop=1\">"
"(Stop)</a></b></center></td></tr>\n",

@ -280,7 +280,7 @@ bool Proxy::initProxy ( long proxyId, unsigned short udpPort,
//need to init collectiondb too because of addurl
//set isdump to true because we aren't going to store any data in the
//collection
if ( !g_collectiondb.init( true ) ){ //isDump
if ( !g_collectiondb.loadAllCollRecs( ) ){ //isDump
log ("db: collectiondb init failed.");
return false;
}

@ -2190,6 +2190,7 @@ bool Query::setQWords ( char boolFlag ,
// if we're hashing a url:, link:, site: or ip: term,
// then we need to hash ALL up to the first space
if ( fieldCode == FIELD_URL ||
fieldCode == FIELD_GBPARENTURL ||
fieldCode == FIELD_EXT ||
fieldCode == FIELD_LINK ||
fieldCode == FIELD_ILINK||
@ -2225,6 +2226,7 @@ bool Query::setQWords ( char boolFlag ,
// should we have normalized before hashing?
if ( fieldCode == FIELD_URL ||
fieldCode == FIELD_GBPARENTURL ||
fieldCode == FIELD_LINK ||
fieldCode == FIELD_ILINK ||
fieldCode == FIELD_SITELINK ||
@ -2237,6 +2239,8 @@ bool Query::setQWords ( char boolFlag ,
if ( fieldCode == FIELD_ILINK) addwww = true;
if ( fieldCode == FIELD_LINKS) addwww = true;
if ( fieldCode == FIELD_URL ) addwww = true;
if ( fieldCode == FIELD_GBPARENTURL )
addwww = true;
if ( fieldCode == FIELD_SITELINK)
addwww = true;
url.set ( w , wlen , addwww );
@ -3006,7 +3010,7 @@ struct QueryField g_fields[] = {
{"ilink", FIELD_ILINK, true,"Similar to above."},
{"sitelink", FIELD_SITELINK, true,"Matches all pages that link to the given site. Example:sitelink:www.gigablast.com matches all pages that link to some page on the www.gigablast.com site."},
{"site", FIELD_SITE, true,"Matches all pages from the given site. Example: site:www.gigablast.com will return all the pages on the gigablast site"},
{"coll", FIELD_COLL, true,"Not sure if this works."},
//{"coll", FIELD_COLL, true,"Not sure if this works."},
{"ip", FIELD_IP, true,"Matches all pages with the given ip. Example:1.2.3.4 will match all pages whose urls have that IP address."},
{"inurl", FIELD_SUBURL, true,"Matches all pages that have the given terms in the url. Example inurl:water will match all pages whose url has the word water in it, but the word must be delineated by punctuation."},
{"suburl", FIELD_SUBURL, true,"Same as inurl."},
@ -3038,8 +3042,8 @@ struct QueryField g_fields[] = {
{"gbhasext", FIELD_GBOTHER, false,""},
{"gbsubmiturl", FIELD_GBOTHER, false,""},
{"qdom", FIELD_QUOTA, false,""},
{"qhost", FIELD_QUOTA, false,""},
//{"qdom", FIELD_QUOTA, false,""},
//{"qhost", FIELD_QUOTA, false,""},
{"gbtagvector", FIELD_GBTAGVECTOR, false,""},
{"gbgigabitvector", FIELD_GBGIGABITVECTOR, false,""},
@ -3061,7 +3065,7 @@ struct QueryField g_fields[] = {
{"gbduphash" ,FIELD_GBOTHER,false,"Internal use only."},
{"gbsitetemplate" ,FIELD_GBOTHER,false,"Internal use only."},
{"gboutlinkedtitle" ,FIELD_GBOTHER,false,"gboutlinkedtitle:0 and gboutlinkedtitle:1 matches events whose title is not in and in a hyperlink, respectively."},
{"gbisaggregator" ,FIELD_GBOTHER,false,"gbisaggregator:0|1 depending on if the event came from an event aggregator website, like eviesays.com."},
//{"gbisaggregator" ,FIELD_GBOTHER,false,"gbisaggregator:0|1 depending on if the event came from an event aggregator website, like eviesays.com."},
{"gbdeduped" ,FIELD_GBOTHER,false,""},
{"gbinjected", FIELD_GBOTHER,false,"Was the event injected?."},
@ -3070,7 +3074,8 @@ struct QueryField g_fields[] = {
//{"gbendrange",FIELD_GBENDRANGE,false,""},
{"gbpermalink",FIELD_GBPERMALINK,false,""},
{"gbcsenum",FIELD_GBCSENUM,false,""},
//{"gbcsenum",FIELD_GBCSENUM,false,""},
{"gbparenturl", FIELD_GBPARENTURL, true,"Match the json urls that were extract from this parent url. Example: gbparenturl:www.gigablast.com/addurl.htm"},
{"gbdocid",FIELD_GBDOCID,false,"restrict results to this docid"}
};

@ -108,6 +108,7 @@ typedef unsigned long long qvec_t;
#define FIELD_GBREVSORTBY 55 // i.e. sortby:price -> low to high
#define FIELD_GBNUMBERMIN 56
#define FIELD_GBNUMBERMAX 57
#define FIELD_GBPARENTURL 58
#define FIELD_GBOTHER 92

Rdb.cpp (22 changed lines)

@ -90,7 +90,7 @@ RdbBase *Rdb::getBase ( collnum_t collnum ) {
return cr->m_bases[(unsigned char)m_rdbId];
}
// used by Rdb::addColl
// used by Rdb::addBase1()
void Rdb::addBase ( collnum_t collnum , RdbBase *base ) {
// if we are collectionless, like g_statsdb.m_rdb or
// g_cachedb.m_rdb, etc.. shared by all collections essentially.
@ -468,12 +468,17 @@ bool Rdb::updateToRebuildFiles ( Rdb *rdb2 , char *coll ) {
// . returns false and sets g_errno on error, returns true on success
// . if this rdb is collectionless we set m_collectionlessBase in addBase()
bool Rdb::addColl ( char *coll ) {
bool Rdb::addRdbBase1 ( char *coll ) { // addColl()
collnum_t collnum = g_collectiondb.getCollnum ( coll );
return addColl2 ( collnum );
return addRdbBase2 ( collnum );
}
bool Rdb::addColl2 ( collnum_t collnum ) {
bool Rdb::addRdbBase2 ( collnum_t collnum ) { // addColl2()
if ( ! m_initialized ) {
g_errno = EBADENGINEER;
return log("db: adding coll to uninitialized rdb!");
}
	// catdb,statsdb,accessdb,facebookdb,syncdb
if ( m_isCollectionLess )
@ -501,8 +506,9 @@ bool Rdb::addColl2 ( collnum_t collnum ) {
RdbBase *base = getBase ( collnum );
if ( base ) { // m_bases [ collnum ] ) {
g_errno = EBADENGINEER;
return log("db: %s: Rdb for collection \"%s\" exists.",
m_dbname,coll);
return log("db: Rdb for db \"%s\" and "
"collection \"%s\" (collnum %li) exists.",
m_dbname,coll,(long)collnum);
}
// make a new one
RdbBase *newColl = NULL;
@ -616,7 +622,7 @@ bool Rdb::deleteColl ( collnum_t collnum , collnum_t newCollnum ) {
// . TODO: what about outstanding merge or dump operations?
// . it seems like we can't really recycle this too easily
// because reset it not resetting filenames or directory name?
// just nuke it and rebuild using addColl2()...
// just nuke it and rebuild using addRdbBase2()...
RdbBase *oldBase = getBase ( collnum );
mdelete (oldBase, sizeof(RdbBase), "Rdb Coll");
delete (oldBase);
@ -632,7 +638,7 @@ bool Rdb::deleteColl ( collnum_t collnum , collnum_t newCollnum ) {
// if just resetting recycle base
if ( collnum != newCollnum ) {
addColl2 ( newCollnum );
addRdbBase2 ( newCollnum );
// make a new base now
//RdbBase *newBase = mnew
// new cr

Rdb.h (4 changed lines)

@ -86,8 +86,8 @@ class Rdb {
Rdb ( );
~Rdb ( );
bool addColl ( char *coll );
bool addColl2 ( collnum_t collnum );
bool addRdbBase1 ( char *coll );
bool addRdbBase2 ( collnum_t collnum );
bool delColl ( char *coll );
bool resetBase ( collnum_t collnum );
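
The renamed pair keeps the old addColl()/addColl2() split: addRdbBase1() resolves a collection name to a collnum, and addRdbBase2() is what actually allocates the RdbBase. A minimal sketch of the intended call pattern, following the collectionless NULL usage that Statsdb/Syncdb/Cachedb adopt elsewhere in this commit (the wrapper function name is hypothetical):

#include "Rdb.h"
#include "Collectiondb.h"

// sketch only: how the renamed entry points are meant to be called
bool initBasesFor ( Rdb *rdb , char *coll ) {
	// collectionless rdbs (cachedb, statsdb, syncdb, ...) pass NULL and
	// share the single RdbBase that Rdb::addBase() stores for them
	if ( ! coll ) return rdb->addRdbBase1 ( NULL );
	// addRdbBase1() resolves the name to a collnum and hands off to
	// addRdbBase2(), which builds the RdbBase for that collection
	return rdb->addRdbBase1 ( coll );
}

Repair.cpp below switches its g_*db2.addColl(m_coll) calls to exactly this getRdb()->addRdbBase1(m_coll) form.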

@ -21,6 +21,7 @@
#include "Collectiondb.h"
//#include "CollectionRec.h"
#include "Repair.h"
#include "Rebalance.h"
//#include "Msg3.h" // debug include
// how many rdbs are in "urgent merge" mode?
@ -613,8 +614,22 @@ bool RdbBase::setFiles ( ) {
return false;
}
// everyone should start with file 0001.dat or 0000.dat
if ( m_numFiles > 0 && m_fileIds[0] > 1 ) {
log("db: missing file id 0001.dat for %s in coll %s. "
"Fix this or it'll core later. Just rename the next file "
"in line to 0001.dat/map. We probably cored at a "
"really bad time during the end of a merge process.",
m_dbname, m_coll );
char *xx=NULL; *xx=0;
}
m_dir.close();
// ensure files are sharded correctly
verifyFileSharding();
if ( ! converting ) return true;
// now if we are converting old titledb names to new...
@ -655,6 +670,7 @@ long RdbBase::addFile ( long id , bool isNew , long mergeNum , long id2 ,
(long)MAX_RDB_FILES);
return -1;
}
// HACK: skip to avoid a OOM lockup. if RdbBase cannot dump
// its data to disk it can backlog everyone and memory will
// never get freed up.
@ -1558,10 +1574,11 @@ void RdbBase::gotTokenForMerge ( ) {
if ( m_rdb == g_tfndb.getRdb() ) m = &g_merge2;
// sanity check
if ( m_isMerging || m->isMerging() ) {
if ( m_doLog )
log(LOG_INFO,
"merge: Someone already merging. Waiting for merge token "
"in order to merge %s.",m_dbname);
//if ( m_doLog )
//log(LOG_INFO,
//"merge: Someone already merging. Waiting for "
//"merge token "
//"in order to merge %s.",m_dbname);
return;
}
// clear for take-off
@ -1959,8 +1976,8 @@ void RdbBase::gotTokenForMerge ( ) {
// . if we have no g_errno that is bad!!!
// . we should dump core here or something cuz we have to remove the
// merge file still to be correct
if ( ! g_errno )
log(LOG_INFO,"merge: Got token without blocking.");
//if ( ! g_errno )
// log(LOG_INFO,"merge: Got token without blocking.");
// we now set this in init() by calling m_merge.init() so it
// can pre-alloc it's lists in it's s_msg3 class
// g_conf.m_mergeMaxBufSize ) ) return ;
@ -2185,3 +2202,104 @@ void RdbBase::verifyDiskPageCache ( ) {
m_pc->verify(f);
}
}
bool RdbBase::verifyFileSharding ( ) {
if ( m_rdb->m_isCollectionLess ) return true;
//log ( "db: Verifying %s for coll %s (collnum=%li)...",
// m_dbname , m_coll , (long)m_collnum );
g_threads.disableThreads();
Msg5 msg5;
//Msg5 msg5b;
RdbList list;
char startKey[MAX_KEY_BYTES];
char endKey[MAX_KEY_BYTES];
KEYMIN(startKey,MAX_KEY_BYTES);
KEYMAX(endKey,MAX_KEY_BYTES);
long minRecSizes = 64000;
char rdbId = m_rdb->m_rdbId;
if ( rdbId == RDB_TITLEDB ) minRecSizes = 640000;
if ( ! msg5.getList ( m_rdb->m_rdbId, //RDB_POSDB ,
m_coll ,
&list ,
startKey ,
endKey ,
minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
NULL ,
0 ,
-1 ,
true ,
-1LL ,
NULL , // &msg5b ,
true )) {
g_threads.enableThreads();
return log("db: HEY! it did not block");
}
long count = 0;
long got = 0;
long printed = 0;
char k[MAX_KEY_BYTES];
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
//key144_t k;
list.getCurrentKey(k);
count++;
//unsigned long groupId = k.n1 & g_hostdb.m_groupMask;
//unsigned long groupId = getGroupId ( RDB_POSDB , &k );
//if ( groupId == g_hostdb.m_groupId ) got++;
unsigned long shardNum = getShardNum( rdbId , k );
if ( shardNum == getMyShardNum() ) {
got++;
continue;
}
if ( ++printed > 100 ) continue;
// avoid log spam... comment this out
//log ( "db: Found bad key in list belongs to shard %li",
// shardNum);
}
g_threads.enableThreads();
//if ( got )
// log("db: verified %li recs for %s in coll %s",
// got,m_dbname,m_coll);
if ( got == count ) return true;
// tally it up
g_rebalance.m_numForeignRecs += count - got;
log ("db: Out of first %li records in %s for %s.%li, only %li belong "
"to our group.",count,m_dbname,m_coll,(long)m_collnum,got);
// exit if NONE, we probably got the wrong data
//if ( got == 0 ) log("db: Are you sure you have the "
// "right data in the right directory? ");
//log ( "db: Exiting due to Posdb inconsistency." );
g_threads.enableThreads();
return true;//g_conf.m_bypassValidation;
//log(LOG_DEBUG, "db: Posdb passed verification successfully for %li "
// "recs.", count );
// DONE
//return true;
}
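
Stripped of the Msg5 plumbing, the check verifyFileSharding() performs is one test per record: hash the key to a shard and compare it with this host's shard. A sketch of that core test pulled into a standalone helper (the helper name is hypothetical; the list iteration and shard helpers are the same ones used above):

// sketch: the per-record test at the heart of verifyFileSharding()
static bool listIsShardedForUs ( RdbList *list , char rdbId ) {
	char k[MAX_KEY_BYTES];
	long count = 0;
	long got   = 0;
	for ( list->resetListPtr() ;
	      ! list->isExhausted() ;
	      list->skipCurrentRecord() ) {
		list->getCurrentKey ( k ); // full key, all key bytes
		count++;
		// every key hashes to exactly one shard for its rdb
		if ( getShardNum ( rdbId , k ) == getMyShardNum() ) got++;
	}
	// any shortfall is a foreign record that Rebalance.cpp must move
	return ( got == count );
}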

@ -262,6 +262,8 @@ class RdbBase {
void verifyDiskPageCache ( );
bool verifyFileSharding ( );
// . add a (new) file to the m_files/m_maps/m_fileIds arrays
// . both return array position we added it to
// . both return -1 and set errno on error

@ -1503,9 +1503,12 @@ bool RdbCache::load ( char *dbname ) {
// does the file exist?
File f;
f.set ( g_hostdb.m_dir , filename );
// having cache file not existing on disk is not so bad, it's a cache
if ( ! f.doesExist() )
return log("db: Could not load cache from %s: does not exist.",
f.getFilename());
return false;
// return log("db: Could not load cache from %s: does not exist.",
// f.getFilename());
// open the file
if ( ! f.open ( O_RDWR ) )
return log("db: Could not open cache save file for %s: %s.",

@ -204,11 +204,14 @@ void RdbDump::doneDumping ( ) {
m_totalPosDumped , m_totalNegDumped ,
m_totalPosDumped + m_totalNegDumped );
// map verify
log("db: map # pos=%lli neg=%lli",
m_map->getNumPositiveRecs(),
m_map->getNumNegativeRecs()
);
// . map verify
// . if continueDumping called us with no collectionrec, it got
// deleted so RdbBase::m_map is nuked too i guess
if ( saved != ENOCOLLREC )
log("db: map # pos=%lli neg=%lli",
m_map->getNumPositiveRecs(),
m_map->getNumNegativeRecs()
);
// free the list's memory
if ( m_list ) m_list->freeList();
@ -1015,11 +1018,16 @@ void RdbDump::continueDumping() {
// if someone reset/deleted the collection we were dumping...
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
if ( ! cr ) g_errno = ENOCOLLREC;
if ( ! cr ) {
g_errno = ENOCOLLREC;
// m_file is invalid if collrec got nuked because so did
	// the RdbBase which has the files
log("db: continue dumping lost collection");
}
// bitch about errors
if (g_errno)log("db: Dump to %s had error writing: %s.",
m_file->getFilename(),mstrerror(g_errno));
else if (g_errno)log("db: Dump to %s had error writing: %s.",
m_file->getFilename(),mstrerror(g_errno));
// go back now if we were NOT dumping a tree
if ( ! (m_tree || m_buckets) ) {
m_isDumping = false;

@ -1631,9 +1631,12 @@ void RdbList::merge_r ( RdbList **lists ,
// . we don't want any positive recs to go un annhilated
// . but don't worry about this check if start and end keys are equal
//if ( m_startKey != m_endKey && (m_endKey.n0 & 0x01) == 0x00 )
if ( KEYCMP(m_startKey,m_endKey,m_ks)!=0 && KEYNEG(m_endKey) )
if ( KEYCMP(m_startKey,m_endKey,m_ks)!=0 && KEYNEG(m_endKey) ) {
log(LOG_LOGIC,"db: rdblist: merge_r: Illegal endKey for "
"merging.");
"merging. fixing.");
// make it legal so it will be read first NEXT time
KEYSUB(m_endKey,1,m_ks);
}
// do nothing if no lists passed in
if ( numLists <= 0 ) return;
// inherit the key size of what we merge
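
The fix leans on the key-parity convention spelled out in the old commented-out test above: a key whose low bit is 0 is a negative (delete) key, and a key whose low bit is 1 is positive. A one-byte illustration of what the KEYSUB(m_endKey,1,m_ks) adjustment does (real keys are m_ks bytes, so this shows only the low byte):

	// illustration only, showing just the low byte of the key:
	//   m_endKey = 0x08         -> low bit 0 -> negative key, illegal merge end
	//   KEYSUB(m_endKey,1,m_ks)
	//   m_endKey = 0x07         -> low bit 1 -> positive key, legal merge end
	// the merge now stops just before the negative key, so that key and the
	// positive record it is meant to annihilate are read together on the
	// next pass instead of being split across two merges.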

@ -172,7 +172,11 @@ void RdbTree::reset ( ) {
// unprotect it all
if ( m_useProtection ) unprotect ( );
	// make sure string is NULL terminated. this gbstrlen() should
if ( m_numNodes > 0 && m_dbname[0] && gbstrlen(m_dbname) >= 0 )
if ( m_numNodes > 0 &&
m_dbname[0] &&
gbstrlen(m_dbname) >= 0 &&
	     // don't be spammy, we can have thousands of these, one per coll
strcmp(m_dbname,"waitingtree") )
log(LOG_INFO,"db: Resetting tree for %s.",m_dbname);
// liberate all the nodes
@ -279,7 +283,7 @@ long RdbTree::clear ( ) {
for ( long i = 0 ; i < nc ; i++ ) {
CollectionRec *cr = g_collectiondb.getRec(i);
if ( ! cr ) continue;
//if ( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
//if (((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
cr->m_numNegKeysInTree[(unsigned char)m_rdbId] = 0;
cr->m_numPosKeysInTree[(unsigned char)m_rdbId] = 0;
}
@ -633,9 +637,11 @@ long RdbTree::addNode ( collnum_t collnum ,
// crap, when fixing a tree this will segfault because
// m_recs[collnum] is NULL.
if ( m_rdbId >= 0 && g_collectiondb.m_recs[collnum] ) {
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
g_collectiondb.m_recs[collnum]->
m_numNegKeysInTree[(unsigned char)m_rdbId]++;
//if( ((unsigned char)m_rdbId)>=RDB_END){
//char *xx=NULL;*xx=0; }
CollectionRec *cr ;
cr = g_collectiondb.m_recs[collnum];
if(cr)cr->m_numNegKeysInTree[(unsigned char)m_rdbId]++;
}
}
else {
@ -644,9 +650,11 @@ long RdbTree::addNode ( collnum_t collnum ,
// crap, when fixing a tree this will segfault because
// m_recs[collnum] is NULL.
if ( m_rdbId >= 0 && g_collectiondb.m_recs[collnum] ) {
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
g_collectiondb.m_recs[collnum]->
m_numPosKeysInTree[(unsigned char)m_rdbId]++;
//if( ((unsigned char)m_rdbId)>=RDB_END){
//char *xx=NULL;*xx=0; }
CollectionRec *cr ;
cr = g_collectiondb.m_recs[collnum];
if(cr)cr->m_numPosKeysInTree[(unsigned char)m_rdbId]++;
}
}
// debug2 msg
@ -839,16 +847,20 @@ void RdbTree::deleteNode ( long i , bool freeData ) {
if ( KEYNEG(m_keys,i,m_ks) ) {
m_numNegativeKeys--;
//m_numNegKeysPerColl[m_collnums[i]]--;
if ( m_rdbId >= 0 )
g_collectiondb.m_recs[m_collnums[i]]->
m_numPosKeysInTree[(unsigned char)m_rdbId]--;
if ( m_rdbId >= 0 ) {
CollectionRec *cr;
cr = g_collectiondb.m_recs[m_collnums[i]];
if(cr)cr->m_numNegKeysInTree[(unsigned char)m_rdbId]--;
}
}
else {
m_numPositiveKeys--;
//m_numPosKeysPerColl[m_collnums[i]]--;
if ( m_rdbId >= 0 )
g_collectiondb.m_recs[m_collnums[i]]->
m_numPosKeysInTree[(unsigned char)m_rdbId]--;
if ( m_rdbId >= 0 ) {
CollectionRec *cr;
cr = g_collectiondb.m_recs[m_collnums[i]];
if(cr)cr->m_numPosKeysInTree[(unsigned char)m_rdbId]--;
}
}
// debug step -- check chain from iparent down making sure that
//printTree();
@ -874,11 +886,14 @@ void RdbTree::deleteNode ( long i , bool freeData ) {
//m_numNegKeysPerColl[m_collnums[i]] = 0;
//m_numPosKeysPerColl[m_collnums[i]] = 0;
if ( m_rdbId >= 0 ) {
//if ( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
g_collectiondb.m_recs[m_collnums[i]]->
m_numNegKeysInTree[(unsigned char)m_rdbId] = 0;
g_collectiondb.m_recs[m_collnums[i]]->
m_numPosKeysInTree[(unsigned char)m_rdbId] = 0;
//if ( ((unsigned char)m_rdbId)>=RDB_END){
//char *xx=NULL;*xx=0; }
CollectionRec *cr ;
cr = g_collectiondb.m_recs[m_collnums[i]];
if(cr){
cr->m_numNegKeysInTree[(unsigned char)m_rdbId] = 0;
cr->m_numPosKeysInTree[(unsigned char)m_rdbId] = 0;
}
}
@ -945,8 +960,9 @@ void RdbTree::deleteNode ( long i , bool freeData ) {
//m_numNegKeysPerColl[m_collnums[i]]--;
if ( m_rdbId >= 0 ) {
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
g_collectiondb.m_recs[m_collnums[i]]->
m_numNegKeysInTree[(unsigned char)m_rdbId]--;
CollectionRec *cr ;
cr = g_collectiondb.m_recs[m_collnums[i]];
if(cr)cr->m_numNegKeysInTree[(unsigned char)m_rdbId]--;
}
}
else {
@ -954,8 +970,9 @@ void RdbTree::deleteNode ( long i , bool freeData ) {
//m_numPosKeysPerColl[m_collnums[i]]--;
if ( m_rdbId >= 0 ) {
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
g_collectiondb.m_recs[m_collnums[i]]->
m_numPosKeysInTree[(unsigned char)m_rdbId]--;
CollectionRec *cr ;
cr = g_collectiondb.m_recs[m_collnums[i]];
if(cr)cr->m_numPosKeysInTree[(unsigned char)m_rdbId]--;
}
}
// debug step -- check chain from iparent down making sure that
@ -3059,8 +3076,9 @@ void RdbTree::cleanTree ( ) { // char **bases ) {
deleteNode ( i , true );
// remove it otherwise
// don't actually remove it!!!! in case collection gets
// moved accidentally
//deleteNode ( i , true );
// moved accidentally.
// no... otherwise it can clog up the tree forever!!!!
deleteNode ( i , true );
count++;
// save it
collnum = m_collnums[i];
@ -3070,8 +3088,8 @@ void RdbTree::cleanTree ( ) { // char **bases ) {
if ( count == 0 ) return;
log(LOG_LOGIC,"db: Removed %li records from %s tree for invalid "
"collection number %i.",count,m_dbname,collnum);
log(LOG_LOGIC,"db: Records not actually removed for safety. Except "
"for those with negative colnums.");
//log(LOG_LOGIC,"db: Records not actually removed for safety. Except "
// "for those with negative colnums.");
static bool s_print = true;
if ( ! s_print ) return;
s_print = false;

@ -127,6 +127,7 @@ char *Rebalance::getNeedsRebalance ( ) {
hexToBin(keyStr,gbstrlen(keyStr), (char *)&m_nextKey);
m_collnum = cn;
//m_collnum = 4695; //debug skip
// we are valid now either way
m_needsRebalanceValid = true;
// assume ok
@ -217,8 +218,9 @@ void Rebalance::scanLoop ( ) {
if ( rdb->m_rdbId == RDB_STATSDB ) continue;
// log it as well
if ( m_lastRdb != rdb ) {
log("rebal: scanning %s [%s]",
cr->m_coll,rdb->m_dbname);
log("rebal: scanning %s (%li) [%s]",
cr->m_coll,(long)cr->m_collnum,
rdb->m_dbname);
// only do this once per rdb/coll
m_lastRdb = rdb;
// reset key cursor as well!!!
@ -235,8 +237,11 @@ void Rebalance::scanLoop ( ) {
// scan it. returns true if done, false if blocked
if ( ! scanRdb ( ) ) return;
// note it
log("rebal: moved %lli of %lli recs scanned",
m_rebalanceCount,m_scannedCount);
log("rebal: moved %lli of %lli recs scanned in "
"%s for coll.%s.%li",
m_rebalanceCount,m_scannedCount,
rdb->m_dbname,cr->m_coll,(long)cr->m_collnum);
//if ( m_rebalanceCount ) goto done;
m_rebalanceCount = 0;
m_scannedCount = 0;
m_lastPercent = -1;
@ -245,6 +250,7 @@ void Rebalance::scanLoop ( ) {
m_rdbNum = 0;
}
// done:
// all done
m_isScanning = false;
m_needsRebalance = false;
@ -318,6 +324,8 @@ bool Rebalance::scanRdb ( ) {
readAnother:
if ( g_process.m_mode == EXIT_MODE ) return false;
//log("rebal: loading list start = %s",KEYSTR(m_nextKey,rdb->m_ks));
if ( ! m_msg5.getList ( rdb->m_rdbId ,
@ -391,22 +399,27 @@ bool Rebalance::gotList ( ) {
return true;
}
char *last = NULL;
//char *last = NULL;
for ( ; ! m_list.isExhausted() ; m_list.skipCurrentRec() ) {
		// get that rec
char *rec = m_list.getCurrentRec();
//char *rec = m_list.getCurrentRec();
// get it
m_list.getCurrentKey ( m_nextKey );
// skip if negative... wtf?
if ( KEYNEG(m_nextKey) ) continue;
// get shard
long shard = getShardNum ( rdbId , rec );
long shard = getShardNum ( rdbId , m_nextKey );
// save last ptr
last = rec;
//last = rec;
// debug!
//m_list.getKey ( rec , m_nextKey );
//log("rebal: checking key %s",KEYSTR(m_nextKey,ks));
// count as scanned
m_scannedCount++;
// skip it if it belongs with us
if ( shard == myShard ) continue;
// note it
//log("rebal: shard is %li",shard);
// count it
m_rebalanceCount++;
// otherwise, it does not!
@ -445,18 +458,21 @@ bool Rebalance::gotList ( ) {
//log("rebal: done reading list");
// update nextkey
if ( last ) {
//if ( last ) {
if ( ! m_list.isEmpty() ) {
// get the last key we scanned, all "ks" bytes of it.
// because some keys are compressed and we take the
// more significant compressed out bytes from m_list.m_*
// member vars
m_list.getKey ( last , m_nextKey );
//m_list.getKey ( last , m_nextKey );
		// if it is not maxed out, then increment it for the
// next scan round
if ( KEYCMP ( m_nextKey , KEYMAX() , ks ) != 0 )
KEYADD ( m_nextKey , 1 , ks );
}
//else {
// log("rebal: got empty list");
//}
if ( ! m_msg4a.addMetaList ( &m_posMetaList ,
m_collnum ,

@ -836,7 +836,8 @@ void Repair::getNextCollToRepair ( ) {
// add collection to secondary rdbs
if ( m_rebuildTitledb ) {
if ( ! g_titledb2.addColl ( m_coll ) &&
if ( //! g_titledb2.addColl ( m_coll ) &&
! g_titledb2.getRdb()->addRdbBase1(m_coll) &&
g_errno != EEXIST ) goto hadError;
}
@ -851,7 +852,7 @@ void Repair::getNextCollToRepair ( ) {
//}
if ( m_rebuildPosdb ) {
if ( ! g_posdb2.addColl ( m_coll ) &&
if ( ! g_posdb2.getRdb()->addRdbBase1 ( m_coll ) &&
g_errno != EEXIST ) goto hadError;
}
@ -861,7 +862,7 @@ void Repair::getNextCollToRepair ( ) {
//}
if ( m_rebuildClusterdb ) {
if ( ! g_clusterdb2.addColl ( m_coll ) &&
if ( ! g_clusterdb2.getRdb()->addRdbBase1 ( m_coll ) &&
g_errno != EEXIST ) goto hadError;
}
@ -871,7 +872,7 @@ void Repair::getNextCollToRepair ( ) {
//}
if ( m_rebuildSpiderdb ) {
if ( ! g_spiderdb2.addColl ( m_coll ) &&
if ( ! g_spiderdb2.getRdb()->addRdbBase1 ( m_coll ) &&
g_errno != EEXIST ) goto hadError;
}
@ -881,7 +882,7 @@ void Repair::getNextCollToRepair ( ) {
//}
if ( m_rebuildLinkdb ) {
if ( ! g_linkdb2.addColl ( m_coll ) &&
if ( ! g_linkdb2.getRdb()->addRdbBase1 ( m_coll ) &&
g_errno != EEXIST ) goto hadError;
}
@ -2254,80 +2255,116 @@ bool Repair::printRepairStatus ( SafeBuf *sb , long fromIp ) {
}
// now show the rebuild status
sb->safePrintf ( "<table>"
sb->safePrintf (
"<table%s"
" id=\"repairstatustable\">"
"<table width=100%% bgcolor=#%s cellpadding=4 "
"border=1 id=\"repairstatustable\">"
"<tr><td bgcolor=%s colspan=2><b><center>"
"<tr class=hdrow><td colspan=2><b><center>"
"Repair Status</center></b></td></tr>\n"
"<tr bgcolor=#%s><td colspan=2>"
"<font size=-2>"
"Use this to rebuild a database or to reindex "
"all pages to pick up new link text."
"</font>"
"</td></tr>"
// status (see list of above statuses)
"<tr><td width=50%%><b>status</b></td>"
"<tr bgcolor=#%s><td width=50%%><b>status</b></td>"
"<td>%s</td></tr>\n"
"<tr><td width=50%%><b>repair mode</b></td>"
"<tr bgcolor=#%s><td width=50%%><b>repair mode</b>"
"</td>"
"<td>%li</td></tr>\n"
"<tr><td width=50%%><b>min repair mode</b></td>"
"<tr bgcolor=#%s>"
"<td width=50%%><b>min repair mode</b></td>"
"<td>%li</td></tr>\n"
"<tr><td width=50%%><b>host ID with min repair mode"
"<tr bgcolor=#%s>"
"<td width=50%%><b>host ID with min repair mode"
"</b></td>"
"<td><a href=\"http://%s:%hu/master/repair\">"
"%li</a></td></tr>\n"
"<tr><td><b>old collection</b></td>"
"<tr bgcolor=#%s><td><b>old collection</b></td>"
"<td>%s</td></tr>"
"<tr><td><b>new collection</b></td>"
"<tr bgcolor=#%s><td><b>new collection</b></td>"
"<td>%s</td></tr>"
,
TABLE_STYLE ,
LIGHT_BLUE ,
LIGHT_BLUE ,
status ,
LIGHT_BLUE ,
(long)g_repairMode,
LIGHT_BLUE ,
(long)g_pingServer.m_minRepairMode,
LIGHT_BLUE ,
minIpBuf, // ip string
minPort, // port
(long)minHostId,
LIGHT_BLUE ,
oldColl ,
LIGHT_BLUE ,
newColl
);
sb->safePrintf (
// docs done, includes overwritten title recs
"<tr bgcolor=%s><td><b>titledb recs scanned</b></td>"
"<tr bgcolor=#%s><td><b>titledb recs scanned</b></td>"
"<td>%lli of %lli</td></tr>\n"
// percent complete
"<tr bgcolor=%s><td><b>titledb recs scanned "
"<tr bgcolor=#%s><td><b>titledb recs scanned "
"progress</b></td>"
"<td>%.2f%%</td></tr>\n"
// title recs set errors, parsing errors, etc.
//"<tr bgcolor=%s><td><b>title recs injected</b></td>"
//"<tr bgcolor=#%s><td><b>title recs injected</b></td>"
//"<td>%lli</td></tr>\n"
// title recs set errors, parsing errors, etc.
"<tr bgcolor=%s><td><b>titledb rec error count</b></td>"
"<tr bgcolor=#%s><td><b>titledb rec error count</b></td>"
"<td>%lli</td></tr>\n"
// sub errors
"<tr bgcolor=%s><td> &nbsp; key out of order</b></td>"
"<tr bgcolor=#%s><td> &nbsp; key out of order</b></td>"
"<td>%lli</td></tr>\n"
"<tr bgcolor=%s><td> &nbsp; set errors</b></td>"
"<tr bgcolor=#%s><td> &nbsp; set errors</b></td>"
"<td>%lli</td></tr>\n"
"<tr bgcolor=%s><td> &nbsp; corrupt errors</b></td>"
"<tr bgcolor=#%s><td> &nbsp; corrupt errors</b></td>"
"<td>%lli</td></tr>\n"
"<tr bgcolor=%s><td> &nbsp; xml errors</b></td>"
"<tr bgcolor=#%s><td> &nbsp; xml errors</b></td>"
"<td>%lli</td></tr>\n"
"<tr bgcolor=%s><td> &nbsp; dup docid errors</b></td>"
"<tr bgcolor=#%s><td> &nbsp; dup docid errors</b></td>"
"<td>%lli</td></tr>\n"
"<tr bgcolor=%s><td> &nbsp; negative keys</b></td>"
"<tr bgcolor=#%s><td> &nbsp; negative keys</b></td>"
"<td>%lli</td></tr>\n"
//"<tr bgcolor=%s><td> &nbsp; overwritten recs</b></td>"
//"<tr bgcolor=#%s><td> &nbsp; overwritten recs</b></td>"
//"<td>%lli</td></tr>\n"
"<tr bgcolor=%s><td> &nbsp; twin's "
"<tr bgcolor=#%s><td> &nbsp; twin's "
"respsponsibility</b></td>"
"<td>%lli</td></tr>\n"
"<tr bgcolor=%s><td> &nbsp; wrong shard</b></td>"
"<tr bgcolor=#%s><td> &nbsp; wrong shard</b></td>"
"<td>%lli</td></tr>\n"
"<tr bgcolor=%s><td> &nbsp; root urls</b></td>"
"<tr bgcolor=#%s><td> &nbsp; root urls</b></td>"
"<td>%lli</td></tr>\n"
"<tr bgcolor=%s><td> &nbsp; non-root urls</b></td>"
"<tr bgcolor=#%s><td> &nbsp; non-root urls</b></td>"
"<td>%lli</td></tr>\n"
"<tr bgcolor=%s><td> &nbsp; no title rec</b></td>"
"<tr bgcolor=#%s><td> &nbsp; no title rec</b></td>"
"<td>%lli</td></tr>\n"
//"<tr><td><b> &nbsp; Other errors</b></td>"
@ -2337,49 +2374,7 @@ bool Repair::printRepairStatus ( SafeBuf *sb , long fromIp ) {
//"<tr><td><b>Time Left in Phase %li</b></td>"
//"<td>%.2f hrs</td></tr>\n"
// spider recs done
"<tr><td><b>spider recs scanned</b></td>"
"<td>%lli of %lli</td></tr>\n"
// percent complete
"<tr><td><b>spider recs scanned progress</b></td>"
"<td>%.2f%%</td></tr>\n"
// spider recs set errors, parsing errors, etc.
"<tr><td><b>spider rec not assigned to us</b></td>"
"<td>%li</td></tr>\n"
// spider recs set errors, parsing errors, etc.
"<tr><td><b>spider rec errors</b></td>"
"<td>%lli</td></tr>\n"
// spider recs set errors, parsing errors, etc.
"<tr><td><b>spider rec bad tld</b></td>"
"<td>%li</td></tr>\n"
// time left in hours
//"<tr><td><b>Time Left in Phase %li</b></td>"
//"<td>%.2f hrs</td></tr>\n"
,
LIGHT_BLUE ,
DARK_BLUE ,
status ,
(long)g_repairMode,
(long)g_pingServer.m_minRepairMode,
minIpBuf, // ip string
minPort, // port
(long)minHostId,
oldColl ,
newColl ,
DARK_BLUE,
ns ,
nr ,
@ -2415,13 +2410,49 @@ bool Repair::printRepairStatus ( SafeBuf *sb , long fromIp ) {
m_recsNonRoot ,
DARK_BLUE,
m_noTitleRecs,
m_noTitleRecs
);
sb->safePrintf(
// spider recs done
"<tr bgcolor=#%s><td><b>spider recs scanned</b></td>"
"<td>%lli of %lli</td></tr>\n"
// percent complete
"<tr bgcolor=#%s><td><b>spider recs scanned "
"progress</b></td>"
"<td>%.2f%%</td></tr>\n"
// spider recs set errors, parsing errors, etc.
"<tr bgcolor=#%s><td><b>spider rec not "
"assigned to us</b></td>"
"<td>%li</td></tr>\n"
// spider recs set errors, parsing errors, etc.
"<tr bgcolor=#%s><td><b>spider rec errors</b></td>"
"<td>%lli</td></tr>\n"
// spider recs set errors, parsing errors, etc.
"<tr bgcolor=#%s><td><b>spider rec bad tld</b></td>"
"<td>%li</td></tr>\n"
// time left in hours
//"<tr bgcolor=#%s><td><b>"
//"Time Left in Phase %li</b></td>"
//"<td>%.2f hrs</td></tr>\n"
,
LIGHT_BLUE ,
ns2 ,
nr2 ,
LIGHT_BLUE ,
ratio2 ,
LIGHT_BLUE ,
m_spiderRecNotAssigned ,
LIGHT_BLUE ,
errors2,
LIGHT_BLUE ,
m_spiderRecBadTLD
);
@ -2439,7 +2470,7 @@ bool Repair::printRepairStatus ( SafeBuf *sb , long fromIp ) {
// m_dbname will be 0
if ( tr == 0 ) continue;
sb->safePrintf(
"<tr bgcolor=%s><td><b>%s2 recs</b></td>"
"<tr bgcolor=#%s><td><b>%s2 recs</b></td>"
"<td>%lli</td></tr>\n" ,
bg,
rdb->m_dbname,
@ -2495,81 +2526,94 @@ bool Repair::printRepairStatus ( SafeBuf *sb , long fromIp ) {
sb->safePrintf (
"<table width=100%% bgcolor=#%s cellpadding=4 "
"border=1 id=\"repairstatustable2\">"
"<table %s "
"id=\"repairstatustable2\">"
// current collection being repaired
"<tr><td bgcolor=%s colspan=2><b><center>"
"<tr class=hdrow><td colspan=2><b><center>"
"Repair Settings In Use</center></b></td></tr>"
// . print parms for this repair
// . they may differ than current controls because
// the current controls were changed after the
// repair started
"<tr><td width=50%%><b>full rebuild</b></td>"
"<tr bgcolor=#%s>"
"<td width=50%%><b>full rebuild</b></td>"
"<td>%s</td></tr>\n"
//"<tr><td><b>recycle link info</b></td>"
//"<tr bgcolor=#%s><td><b>recycle link info</b></td>"
//"<td>%s</td></tr>\n"
"<tr><td><b>rebuild titledb</b></td>"
"<tr bgcolor=#%s><td><b>rebuild titledb</b></td>"
"<td>%s</td></tr>\n"
//"<tr><td><b>rebuild tfndb</b></td>"
//"<tr bgcolor=#%s><td><b>rebuild tfndb</b></td>"
//"<td>%s</td></tr>\n"
//"<tr><td><b>rebuild indexdb</b></td>"
//"<tr bgcolor=#%s><td><b>rebuild indexdb</b></td>"
//"<td>%s</td></tr>\n"
"<tr><td><b>rebuild posdb</b></td>"
"<tr bgcolor=#%s><td><b>rebuild posdb</b></td>"
"<td>%s</td></tr>\n"
//"<tr><td><b>rebuild datedb</b></td>"
//"<tr bgcolor=#%s><td><b>rebuild datedb</b></td>"
//"<td>%s</td></tr>\n"
"<tr><td><b>rebuild clusterdb</b></td>"
"<tr bgcolor=#%s><td><b>rebuild clusterdb</b></td>"
"<td>%s</td></tr>\n"
//"<tr><td><b>rebuild checksumdb</b></td>"
//"<tr bgcolor=#%s><td><b>rebuild checksumdb</b></td>"
//"<td>%s</td></tr>\n"
"<tr><td><b>rebuild spiderdb</b></td>"
"<tr bgcolor=#%s><td><b>rebuild spiderdb</b></td>"
"<td>%s</td></tr>\n"
"<tr><td><b>rebuild linkdb</b></td>"
"<tr bgcolor=#%s><td><b>rebuild linkdb</b></td>"
"<td>%s</td></tr>\n"
//"<tr><td><b>rebuild tagdb</b></td>"
//"<tr bgcolor=#%s><td><b>rebuild tagdb</b></td>"
//"<td>%s</td></tr>\n"
//"<tr><td><b>rebuild placedb</b></td>"
//"<tr bgcolor=#%s><td><b>rebuild placedb</b></td>"
//"<td>%s</td></tr>\n"
//"<tr><td><b>rebuild sectiondb</b></td>"
//"<tr bgcolor=#%s><td><b>rebuild sectiondb</b></td>"
//"<td>%s</td></tr>\n"
//"<tr><td><b>rebuild revdb</b></td>"
//"<tr bgcolor=#%s><td><b>rebuild revdb</b></td>"
//"<td>%s</td></tr>\n"
"<tr><td><b>rebuild root urls</b></td>"
"<tr bgcolor=#%s><td><b>rebuild root urls</b></td>"
"<td>%s</td></tr>\n"
"<tr><td><b>rebuild non-root urls</b></td>"
"<tr bgcolor=#%s>"
"<td><b>rebuild non-root urls</b></td>"
"<td>%s</td></tr>\n"
"</table>\n"
"<br>\n"
,
TABLE_STYLE,
LIGHT_BLUE,
DARK_BLUE,
rr[0],
//rr[10],
LIGHT_BLUE,
rr[1],
//rr[2],
LIGHT_BLUE,
rr[3],
//rr[4],
LIGHT_BLUE,
rr[5],
//rr[6],
LIGHT_BLUE,
rr[7],
//rr[8],
LIGHT_BLUE,
rr[9],
//rr[13],
@ -2578,7 +2622,10 @@ bool Repair::printRepairStatus ( SafeBuf *sb , long fromIp ) {
//rr[16],
//rr[17],
LIGHT_BLUE,
rr[11],
LIGHT_BLUE,
rr[12]
);
return true;

@ -67,7 +67,7 @@ bool Revdb::init2 ( long treeMem ) {
return false;
return true;
}
/*
bool Revdb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;
@ -79,7 +79,7 @@ bool Revdb::addColl ( char *coll, bool doVerify ) {
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}
*/
bool Revdb::verify ( char *coll ) {
log ( LOG_INFO, "db: Verifying Revdb for coll %s...", coll );
g_threads.disableThreads();

@ -1761,7 +1761,12 @@ Tag *SafeBuf::addTag ( char *mysite ,
bool SafeBuf::addTag ( Tag *tag ) {
long recSize = tag->getSize();
//tag->setDataSize();
if ( tag->m_recDataSize <= 16 ) { char *xx=NULL;*xx=0; }
if ( tag->m_recDataSize <= 16 ) {
// note it
return log("safebuf: encountered corrupted tag datasize=%li.",
tag->m_recDataSize);
//char *xx=NULL;*xx=0; }
}
return safeMemcpy ( (char *)tag , recSize );
}
@ -2703,6 +2708,7 @@ bool SafeBuf::decodeJSONToUtf8 ( long niceness ) {
// diffbot
// . really we could leave the newlines decoded etc, but it is prettier
// for printing
/*
bool SafeBuf::safeStrcpyPrettyJSON ( char *decodedJson ) {
// how much space do we need?
// each single byte \t char for instance will need 2 bytes
@ -2762,6 +2768,7 @@ bool SafeBuf::safeStrcpyPrettyJSON ( char *decodedJson ) {
return true;
}
*/
bool SafeBuf::safeUtf8ToJSON ( char *utf8 ) {

@ -104,7 +104,7 @@ struct SafeBuf {
bool safeMemcpy(SafeBuf *c){return safeMemcpy(c->m_buf,c->m_length);};
bool safeMemcpy ( class Words *w , long a , long b ) ;
bool safeStrcpy ( char *s ) ;
bool safeStrcpyPrettyJSON ( char *decodedJson ) ;
//bool safeStrcpyPrettyJSON ( char *decodedJson ) ;
bool safeUtf8ToJSON ( char *utf8 ) ;
bool csvEncode ( char *s , long len , long niceness = 0 );

@ -224,8 +224,10 @@ class SearchInput {
//long m_formatStrLen;
//char *m_formatStr;
char m_formatTmp[11];
// can be 0 for FORMAT_HTML, 1 = FORMAT_XML, 2=FORMAT_JSON, 3=csv
char m_format;
long m_format;
// this should be part of the key because it will affect the results!
char m_queryExpansion;

@ -17238,7 +17238,7 @@ bool Sectiondb::init2 ( long treeMem ) {
return false;
return true;
}
/*
bool Sectiondb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;
@ -17250,7 +17250,7 @@ bool Sectiondb::addColl ( char *coll, bool doVerify ) {
log ( "db: sectiondb verify failed, but scaling is allowed, passing.");
return true;
}
*/
bool Sectiondb::verify ( char *coll ) {
log ( LOG_INFO, "db: Verifying Sectiondb for coll %s...", coll );
g_threads.disableThreads();

Spider.cpp (1839 changed lines): file diff suppressed because it is too large

Spider.h (110 changed lines)

@ -1,11 +1,5 @@
// Matt Wells, copyright Nov 2002
//
// . Spider.h/.cpp contains all the code related to spider scheduling
// . Spiderdb holds the SpiderRecs which indicate the time to spider a url
// . there are 2 types of SpiderRecs: SpiderRequest and SpiderReply recs
//
#ifndef _SPIDER_H_
#define _SPIDER_H_
@ -45,6 +39,7 @@
#define SP_ADMIN_PAUSED 8 // g_conf.m_spideringEnabled = false
#define SP_COMPLETED 9 // crawl is done, and no repeatCrawl is scheduled
void spiderRoundIncremented ( class CollectionRec *cr ) ;
bool testPatterns ( ) ;
bool doesStringContainPattern ( char *content , char *pattern ) ;
@ -57,6 +52,29 @@ bool getSpiderStatusMsg ( class CollectionRec *cx ,
// this new spider algorithm ensures that urls get spidered even if a host
// is dead, and even if the url was being spidered by a host that suddenly
// went dead.
//
// . Spider.h/.cpp contains all the code related to spider scheduling
// . Spiderdb holds the SpiderRecs which indicate the time to spider a url
// . there are 2 types of SpiderRecs: SpiderRequest and SpiderReply recs
//
//
// There are 3 main components to the spidering process:
// 1) spiderdb
// 2) the "waiting tree"
// 3) doledb
//
// spiderdb holds all the spiderrequests/spiderreplies sorted by
// their IP
//
// the waiting tree holds at most one entry for an IP indicating that
// we should scan all the spiderrequests/spiderreplies for that IP in
// spiderdb, find the "best" one(s) and add it (them) to doledb.
//
// doledb holds the best spiderrequests from spiderdb sorted by
// "priority". priorities range from 0 to 127, the highest priority.
// basically doledb holds the urls that are ready for spidering now.
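
The three structures behave like a small per-IP scheduler. A toy model of the flow just described, for illustration only (this is not Gigablast code; it uses standard containers, and the real logic lives in SpiderColl::addToWaitingTree(), evalIpLoop(), populateDoledbFromWaitingTree() and addWinnerToDoledb()):

#include <cstdint>
#include <map>
#include <queue>
#include <utility>

struct ToyRequest { uint32_t firstIp; int priority; long long dueMs; };

struct ToyScheduler {
	typedef std::multimap<uint32_t,ToyRequest>::iterator ReqIter;
	// "spiderdb": every request, grouped by firstIp
	std::multimap<uint32_t,ToyRequest>             spiderdb;
	// "waiting tree": at most one due-time entry per IP
	std::map<uint32_t,long long>                   waitingTree;
	// "doledb": (priority, ip) pairs ready to spider now, best first
	std::priority_queue< std::pair<int,uint32_t> > doledb;

	void addRequest ( const ToyRequest &r ) {
		spiderdb.insert ( std::make_pair ( r.firstIp , r ) );
		// keep only the earliest due time for this IP
		std::map<uint32_t,long long>::iterator it =
			waitingTree.find ( r.firstIp );
		if ( it == waitingTree.end() || r.dueMs < it->second )
			waitingTree[r.firstIp] = r.dueMs;
	}

	// roughly populateDoledbFromWaitingTree() + evalIpLoop(): for each IP
	// whose due time has passed, pick its best request and dole it out
	void populateDoledb ( long long nowMs ) {
		std::map<uint32_t,long long>::iterator it = waitingTree.begin();
		while ( it != waitingTree.end() ) {
			if ( it->second > nowMs ) { ++it; continue; }
			int best = -1;
			std::pair<ReqIter,ReqIter> range =
				spiderdb.equal_range ( it->first );
			for ( ReqIter r = range.first ; r != range.second ; ++r )
				if ( r->second.priority > best )
					best = r->second.priority;
			if ( best >= 0 )
				doledb.push ( std::make_pair ( best , it->first ) );
			waitingTree.erase ( it++ ); // one entry per IP, now doled
		}
	}
};
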
// Spiderdb
@ -242,10 +260,10 @@ bool getSpiderStatusMsg ( class CollectionRec *cx ,
// can spider any request/url in doledb provided they get the lock.
// scanSpiderdb()
// evalIpLoop()
//
// The waiting tree is populated at startup by scanning spiderdb (see
// SpiderColl::scanSpiderdb()), which might take a while to complete,
// SpiderColl::evalIpLoop()), which might take a while to complete,
// so it is running in the background while the gb server is up. it will
// log "10836674298 spiderdb bytes scanned for waiting tree re-population"
// periodically in the log as it tries to do a complete spiderdb scan
@ -255,7 +273,7 @@ bool getSpiderStatusMsg ( class CollectionRec *cx ,
// It will also perform a background scan if the admin changes the url
// filters table, which dictates that we recompute everything.
//
// scanSpiderdb() will recompute the "url filter number" (matching row)
// evalIpLoop() will recompute the "url filter number" (matching row)
// in the url filters table for each url in each SpiderRequest it reads.
// it will ignore spider requests whose urls
// are "filtered" or "banned". otherwise they will have a spider priority >= 0.
@ -270,18 +288,18 @@ bool getSpiderStatusMsg ( class CollectionRec *cx ,
// by preferring those with the highest priority. Tied spider priorities
// should be resolved by minimum hopCount probably.
//
// If the spidertime of the URL is overdue then scanSpiderdb() will NOT add
// If the spidertime of the URL is overdue then evalIpLoop() will NOT add
// it to waiting tree, but will add it to doledb directly to make it available
// for spidering immediately. It calls m_msg4.addMetaList() to add it to
// doledb on all hosts in its group (shard). It uses s_ufnTree for keeping
// track of the best urls to spider for a given IP/spiderPriority.
//
// scanSpiderdb() can also be called with its m_nextKey/m_endKey limited
// evalIpLoop() can also be called with its m_nextKey/m_endKey limited
// to just scan the SpiderRequests for a specific IP address. It does
// this after adding a SpiderReply. addSpiderReply() calls addToWaitingTree()
// with the "0" time entry, and addToWaitingTree() calls
// populateDoledbFromWaitingTree() which will see that "0" entry and call
// scanSpiderdb(true) after setting m_nextKey/m_endKey for that IP.
// evalIpLoop(true) after setting m_nextKey/m_endKey for that IP.
@ -289,7 +307,7 @@ bool getSpiderStatusMsg ( class CollectionRec *cx ,
//
// SpiderColl::populateDoledbFromWaitingTree() scans the waiting tree for
// entries whose spider time is due. so it gets the IP address and spider
// priority from the waiting tree. but then it calls scanSpiderdb()
// priority from the waiting tree. but then it calls evalIpLoop()
// restricted to that IP (using m_nextKey,m_endKey) to get the best
// SpiderRequest from spiderdb for that IP to add to doledb for immediate
// spidering. populateDoledbFromWaitingTree() is called a lot to try to
@ -505,8 +523,28 @@ class SpiderRequest {
// . this is zero if none or invalid
long m_contentHash32;
/*
char m_reserved1;
// the new add url control will allow user to control link spidering
// on each url they add. they can also specify file:// instead of
// http:// to index local files. so we have to allow file://
char m_onlyAddSameDomainLinks :1;
char m_onlyAddSameSubdomainLinks :1;
char m_onlyDoNotAddLinksLinks :1; // max hopcount 1
char m_onlyDoNotAddLinksLinksLinks :1; // max hopcount 2
char m_reserved2d:1;
char m_reserved2e:1;
char m_reserved2f:1;
char m_reserved2g:1;
char m_reserved2h:1;
// . each request can have a different hop count
// . this is only valid if m_hopCountValid is true!
short m_hopCount;
*/
long m_hopCount;
// . this is now computed dynamically often based on the latest
@ -711,16 +749,17 @@ class SpiderRequest {
long print( class SafeBuf *sb );
long printToTable ( SafeBuf *sb , char *status ,
class XmlDoc *xd ) ;
class XmlDoc *xd , long row ) ;
// for diffbot...
long printToTableSimple ( SafeBuf *sb , char *status ,
class XmlDoc *xd ) ;
class XmlDoc *xd , long row ) ;
static long printTableHeader ( SafeBuf *sb , bool currentlSpidering ) ;
static long printTableHeaderSimple ( SafeBuf *sb ,
bool currentlSpidering ) ;
// returns false and sets g_errno on error
bool setFromAddUrl ( char *url ) ;
bool setFromInject ( char *url ) ;
};
// . XmlDoc adds this record to spiderdb after attempting to spider a url
@ -826,7 +865,11 @@ class SpiderReply {
long m_isContacty :1;
long m_hasAddress :1;
long m_hasTOD :1;
long m_hasSiteVenue :1;
// make this "INvalid" not valid since it was set to 0 before
// and we want to be backwards compatible
long m_isIndexedINValid :1;
//long m_hasSiteVenue :1;
// expires after a certain time or if ownership changed
long m_inGoogleValid :1;
@ -835,7 +878,8 @@ class SpiderReply {
long m_isContactyValid :1;
long m_hasAddressValid :1;
long m_hasTODValid :1;
long m_hasSiteVenueValid :1;
//long m_hasSiteVenueValid :1;
long m_reserved2 :1;
long m_siteNumInlinksValid :1;
// was the request an injection request
long m_fromInjectionRequest :1;
@ -989,7 +1033,7 @@ class SpiderColl {
~SpiderColl ( );
SpiderColl ( ) ;
void clear();
void clearLocks();
// called by main.cpp on exit to free memory
void reset();
@ -1028,7 +1072,8 @@ class SpiderColl {
// for scanning the wait tree...
bool m_isPopulating;
// for reading from spiderdb
bool m_isReadDone;
//bool m_isReadDone;
bool m_didRead;
Msg4 m_msg4;
Msg1 m_msg1;
@ -1111,7 +1156,8 @@ class SpiderColl {
bool m_countingPagesIndexed;
HashTableX m_localTable;
long long m_lastReqUh48;
long long m_lastReqUh48a;
long long m_lastReqUh48b;
long long m_lastRepUh48;
// move to CollectionRec so it can load at startup and save it
//HashTableX m_pageCountTable;
@ -1127,8 +1173,17 @@ class SpiderColl {
bool addToWaitingTree ( uint64_t spiderTime , long firstIp ,
bool callForScan );
long getNextIpFromWaitingTree ( );
void populateDoledbFromWaitingTree ( bool reentry );
bool scanSpiderdb ( bool needList );
void populateDoledbFromWaitingTree ( );
//bool scanSpiderdb ( bool needList );
// broke up scanSpiderdb into simpler functions:
bool evalIpLoop ( ) ;
bool readListFromSpiderdb ( ) ;
bool scanListForWinners ( ) ;
bool addWinnerToDoledb ( ) ;
void populateWaitingTreeFromSpiderdb ( bool reentry ) ;
@ -1138,7 +1193,11 @@ class SpiderColl {
key_t m_waitingTreeKey;
bool m_waitingTreeKeyValid;
long m_scanningIp;
bool m_gotNewRequestsForScanningIp;
long m_gotNewDataForScanningIp;
long m_lastListSize;
long m_lastScanningIp;
char m_deleteMyself;
// start key for reading doledb
key_t m_msg5StartKey;
@ -1292,7 +1351,7 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) ;
// . max spiders we can have going at once for this process
// . limit to 70 to prevent OOM conditions
#define MAX_SPIDERS 70
#define MAX_SPIDERS 100
class SpiderLoop {
@ -1412,6 +1471,7 @@ long getUrlFilterNum ( class SpiderRequest *sreq ,
bool isForMsg20 ,
long niceness ,
class CollectionRec *cr ,
bool isOutlink = false ) ;
bool isOutlink , // = false ,
HashTableX *quotaTable );//= NULL ) ;
#endif

@ -557,6 +557,11 @@ void Stats::printGraphInHtml ( SafeBuf &sb ) {
//
sb.safePrintf("<div style=\"position:relative;"
"background-color:#c0c0c0;"
// match style of tables
"border-radius:10px;"
"border:#6060f0 2px solid;"
//"overflow-y:hidden;"
"overflow-x:hidden;"
"z-index:-10;"
@ -567,10 +572,11 @@ void Stats::printGraphInHtml ( SafeBuf &sb ) {
"min-height:%lipx;"
//"width:100%%;"
//"min-height:600px;"
"margin-top:10px;"
//"margin-top:10px;"
"margin-bottom:10px;"
"margin-right:10px;"
"margin-left:10px;\">"
//"margin-right:10px;"
//"margin-left:10px;"
"\">"
,(long)DX
,(long)DY +20); // add 10 more for "2s" labels etc.

@ -25,9 +25,10 @@ class StatPoint {
#define MAX_POINTS 6000
#define MAX_WIDTH 6
#define DY 1000 // pixels vertical
//#define DY 1000 // pixels vertical
#define DY 500 // pixels vertical
#define DX 1000 // pixels across
#define DT (20*1000) // time window, 20 seconds
#define DT (10*1000) // time window, 10 seconds
#define MAX_LINES (DY / (MAX_WIDTH+1)) // leave free pixel above each line
#define STAT_GENERIC 0

@ -209,17 +209,17 @@ bool Statsdb::init ( ) {
// will init the CollectionRec::m_rdbBase, which is what
// Rdb::getBase(collnum_t) will return. however, for collectionless
	// rdb databases we set Rdb::m_collectionlessBase specially here.
return m_rdb.addColl ( NULL );
return m_rdb.addRdbBase1 ( NULL );
}
// Make sure we need this function.
// main.cpp currently uses the addColl from m_rdb
bool Statsdb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
return true;
}
//bool Statsdb::addColl ( char *coll, bool doVerify ) {
// if ( ! m_rdb.addColl ( coll ) ) return false;
// return true;
//}
void flushStatsWrapper ( int fd , void *state ) {
g_statsdb.addDocsIndexed();
@ -532,7 +532,8 @@ bool Statsdb::makeGIF ( long t1Arg ,
#define MAX_POINTS 6000
#define MAX_WIDTH 6
#define DY2 600 // pixels vertical
//#define DY2 600 // pixels vertical
#define DY2 400 // pixels vertical
#define DX2 1000 // pixels across
#define MAX_LINES2 (DY2 / (MAX_WIDTH+1)) // leave free pixel above each line

@ -992,7 +992,7 @@ bool Syncdb::init ( ) {
// clear it all!
m_qt.clear();
// add the base since it is a collectionless rdb
return m_rdb.addColl ( NULL );
return m_rdb.addRdbBase1 ( NULL );
}
// . save our crap
@ -1432,9 +1432,10 @@ void Syncdb::syncDone ( ) {
m_rcpStarted = false;
}
/*
// TODO: Provide verification.
bool Syncdb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
return true;
}
*/

Tagdb.cpp (148 changed lines)

@ -1854,7 +1854,7 @@ bool Tagdb::init2 ( long treeMem ) {
false ); // bias disk page cache?
}
/*
bool Tagdb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;//false;
@ -1867,7 +1867,7 @@ bool Tagdb::addColl ( char *coll, bool doVerify ) {
//return true;
return false;
}
*/
bool Tagdb::verify ( char *coll ) {
@ -2761,14 +2761,16 @@ bool Msg8a::launchGetRequests ( ) {
long shardNum = getShardNum ( m_rdbId , &startKey );//, true );
Host *group = g_hostdb.getShard ( shardNum );
long numTwins = g_hostdb.getNumHostsPerShard();
//long numTwins = g_hostdb.getNumHostsPerShard();
// use top byte!
uint8_t *sks = (uint8_t *)&startKey;
uint8_t top = sks[sizeof(TAGDB_KEY)-1];
long hostNum = 0;
if ( numTwins == 2 && (top & 0x80) ) hostNum = 1;
//long hostNum = 0;
//if ( numTwins == 2 && (top & 0x80) ) hostNum = 1;
// TODO: fix this!
if ( numTwins >= 3 ) { char *xx=NULL;*xx=0; }
//if ( numTwins >= 3 ) { char *xx=NULL;*xx=0; }
// support more than 2 stripes now...
long hostNum = top % g_hostdb.getNumHostsPerShard();
long hostId = group[hostNum].m_hostId;
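
The removed lines special-cased exactly two hosts per shard; the replacement spreads tagdb lookups over any number of twins by taking the key's top byte modulo the hosts-per-shard count. Pulled out into a standalone helper (the helper name is hypothetical; the real code does this inline above), the selection is:

// sketch: choose which twin in the shard serves a given tagdb key
long pickTwinForKey ( void *startKey , long numHostsPerShard ) {
	uint8_t *sks = (uint8_t *)startKey;
	uint8_t top  = sks[sizeof(TAGDB_KEY)-1]; // most significant key byte
	// spreads load evenly whether a shard has 1, 2, 3 or more twins
	return top % numHostsPerShard;
}

so the hostNum line above is equivalent to pickTwinForKey ( &startKey , g_hostdb.getNumHostsPerShard() ).
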
@ -4440,27 +4442,62 @@ bool sendReply2 ( void *state ) {
char bb [ MAX_COLL_LEN + 60 ];
bb[0]='\0';
sb.safePrintf(
"<style>"
".poo { background-color:#%s;}\n"
"</style>\n" ,
LIGHT_BLUE );
// print interface to add sites
sb.safePrintf (
"<table width=100%% bgcolor=#%s border=1 cellpadding=4>"
"<tr><td bgcolor=#%s colspan=21>"
"<center><font size=+1><b>Tagdb</b>%s</font></center>"
"</td></tr>", LIGHT_BLUE , DARK_BLUE , bb );
"<table %s>"
"<tr><td colspan=2>"
"<center><b>Tagdb</b>%s</center>"
"</td></tr>", TABLE_STYLE , bb );
// sometimes we add a huge # of urls, so don't display them because
// it like freezes the silly browser
char *uu = st->m_urls;
if ( st->m_urlsLen > 100000 ) uu = "";
sb.safePrintf ( "<tr><td colspan=21>");
//sb.safePrintf ( "<tr bgcolor=#%s><td colspan=2>"
// "<center>"
// "</center>"
// "</td></tr>",
// DARK_BLUE);
sb.safePrintf ( "<tr class=poo><td>"
"<b>urls</b>"
"<br>"
"<font size=-2>"
"Enter a single URL and then click <i>Get Tags</i> to "
"get back its tags. Enter multiple URLs and select "
"the tags names and values in the other table "
"below in order to tag "
"them all with those tags when you click "
"<i>Add Tags</i>. "
"On the command line you can also issue a "
"<i>./gb 0 dump S main 0 -1 1</i>"
"command, for instance, to dump out the tagdb "
"contents for the <i>main</i> collection on "
"<i>host #0</i>. "
"</font>"
"</td>");
// text area for adding space separated sites/urls
//char *pp = "put sites here";
//char *pp = "";
//if ( st->m_bufLen > 0 ) pp = st->m_buf; // no, print out "urls"
sb.safePrintf ("<center>"
sb.safePrintf (""
"<td width=70%%>"
"<br>"
"<textarea rows=16 cols=64 name=u>"
"%s</textarea><br><br>" , uu );
"%s</textarea></td></tr>" , uu );
// spam assassins should not use this much power, too risky
//if ( st->m_isAdmin ) {
@ -4470,30 +4507,61 @@ bool sendReply2 ( void *state ) {
// allow filename to load them from
//if ( st->m_isAdmin ) {
sb.safePrintf("or specify a file of them: <input name=ufu "
"type=text size=40><br>"
"<i>file can also be dumped output of "
"tagdb from the <b>gb dump S ...</b> "
"command.</i>"
"<br><br>" );
sb.safePrintf("<tr class=poo>"
"<td>"
"<b>file of urls to tag</b>"
"<br>"
"<font size=-2>"
"If provided, Gigablast will read the URLs from "
"this file as if you pasted them into the text "
"area above. The text area will also be ignored."
"</font>"
"</td>"
"<td><input name=ufu "
"type=text size=40>"//<br>"
//"<i>file can also be dumped output of "
//"tagdb from the <b>gb dump S ...</b> "
//"command.</i>"
//"<br><br>" );
"</td></tr>"
);
//}
// this is applied to every tag that is added for accountability
sb.safePrintf("<br>Username: <input name=username type=text size=6 "
"value=\"admin\"> " );//,st->m_username);
sb.safePrintf("<tr class=poo><td>"
"<b>username</b>"
"<br><font size=-2>"
"Stored with each tag you add for accountability."
"</font>"
"</td><td>"
"<input name=username type=text size=6 "
"value=\"admin\"> "
"</td></tr>"
);//,st->m_username);
// as a safety, this must be checked for any delete operation
sb.safePrintf ("&nbsp; delete operation<input type=\"checkbox\" "
"value=\"1\" name=\"delop\"><br>");
sb.safePrintf ("<tr class=poo><td><b>delete operation</b>"
"<br>"
"<font size=-2>"
"If checked "
"then the tag names you specify below will be "
"deleted for the URLs you provide in the text area "
"when you click <i>Add Tags</i>."
"</font>"
"</td><td><input type=\"checkbox\" "
"value=\"1\" name=\"delop\"></td></tr>");
// close up
sb.safePrintf ("<br><center>"
sb.safePrintf ("<tr bgcolor=#%s><td colspan=2>"
"<center>"
// this is merge all by default right now but since
// zak is really only using eventtaghashxxxx.com we
// should be ok
"<input type=submit name=get "
"value=\"get tags\" border=0>"
"value=\"Get Tags\" border=0>"
//"<input type=submit name=get "
//"value=\"get best rec\" border=0>"
@ -4506,7 +4574,11 @@ bool sendReply2 ( void *state ) {
// "</form>"
"</center>"
"</tr>\n");
"</td></tr></table>"
"<br><br>"
, DARK_BLUE
);
// . show all tags we got values for
// . put a delete checkbox next to each one
@ -4515,6 +4587,13 @@ bool sendReply2 ( void *state ) {
// for some reason the "selected" option tags do not show up below
// on firefox unless i have this line.
sb.safePrintf (
"<table %s>"
"<tr><td colspan=20>"
"<center><b>Add Tag</b></center>"
"</td></tr>", TABLE_STYLE );
// count how many "tagRecs" we are taking tags from
Tag *jtag = st->m_tagRec.getFirstTag();
long numTagRecs = 0;
@ -4532,13 +4611,14 @@ bool sendReply2 ( void *state ) {
bool canEdit = (numTagRecs <= 1);
if ( ! canEdit )
sb.safePrintf("<tr><td colspan=20><center><font color=red>"
sb.safePrintf("<tr class=poo>"
"<td colspan=10><center><font color=red>"
"<b>Can not edit because more than one "
"TagRecs were merged</b></font></center>"
"</td></tr>\n" );
// headers
sb.safePrintf("<tr bgcolor=%s>"
sb.safePrintf("<tr bgcolor=#%s>"
//"<td><b>delete?</b></td>"
"<td><b>del?</b></td>"
"<td><b>tag name</b></td>"
@ -4574,9 +4654,9 @@ bool sendReply2 ( void *state ) {
// if we are NULL, print out 3 empty tags
if ( ! ctag ) empty++;
// start the section
sb.safePrintf("<tr bgcolor=%s>",DARK_BLUE);
sb.safePrintf("<tr class=poo>");
// the delete tag checkbox
//sb.safePrintf("<tr bgcolor=%s><td>",DARK_BLUE);
//sb.safePrintf("<tr bgcolor=#%s><td>",DARK_BLUE);
sb.safePrintf("<td>");
if ( ctag && canEdit ) // && tag->m_type != ST_SITE )
sb.safePrintf("<input name=deltag%li "
@ -4624,7 +4704,7 @@ bool sendReply2 ( void *state ) {
// was selected will have this score
if ( canEdit )
sb.safePrintf("<input type=text name=tagdata%li "
"size=70 value=\"",count);
"size=50 value=\"",count);
// show the value
if ( ctag ) ctag->printDataToBuf ( &sb );
// close up the input tag
@ -4693,10 +4773,10 @@ bool sendReply2 ( void *state ) {
// do not print add or del tags buttons if we got tags from more
// than one TagRec!
if ( canEdit )
sb.safePrintf ("<tr bgcolor=%s><td colspan=21><center>"
sb.safePrintf ("<tr bgcolor=#%s><td colspan=10><center>"
"<input type=submit name=add "
"value=\"add tags\" border=0>"
"value=\"Add Tags\" border=0>"
"</center></td>"
"</tr>\n",DARK_BLUE);

@ -115,7 +115,7 @@ char *getTagStrFromType ( long tagType ) ;
//#define MAX_TAGREC_SIZE 1024
// max "oustanding" msg0 requests sent by TagRec::lookup()
#define MAX_TAGDB_REQUESTS 5
#define MAX_TAGDB_REQUESTS 3
// . the latest version of the TagRec
//#define TAGREC_CURRENT_VERSION 0

@ -604,6 +604,7 @@ bool TcpServer::sendMsg ( long ip ,
s->m_maxOtherDocLen = maxOtherDocLen ;
s->m_ssl = NULL;
s->m_udpSlot = NULL;
s->m_streamingMode = false;
// . call the connect routine to try to connect it asap
// . this does not block however
// . this returns false if blocked, true otherwise
@ -694,11 +695,17 @@ bool TcpServer::sendMsg ( TcpSocket *s ,
// . this will also unregister all our callbacks for the socket
// . TODO: deleting nodes from under Loop::callCallbacks is dangerous!!
if ( g_errno ) { destroySocket ( s ); return true; }
// if in streaming mode just return true, do not set sockState
// to ST_NEEDS_CLOSE lest it be destroyed. streaming mode needs
// to get more data to send on the socket.
if ( s->m_streamingMode ) return true;
// reset the socket iff it was a reply that we finished writing
// hmmm else if ( s->m_readBuf ) { recycleSocket ( s ); return true; }
// we can't close it here any more for some reason the browser truncates
// the content we transmit otherwise... i've tried SO_LINGER and couldnt get
// that to work...
	// we can't close it here any more; for some reason the browser truncates
	// the content we transmit otherwise... i've tried SO_LINGER and
	// couldn't get that to work...
if ( s->m_readBuf ) { s->m_sockState = ST_NEEDS_CLOSE; return true; }
// we're blocking on the reply (readBuf is empty)
return false;
@ -906,6 +913,8 @@ TcpSocket *TcpServer::wrapSocket ( int sd , long niceness , bool isIncoming ) {
s->m_lastActionTime = s->m_startTime;
// set if it's incoming connection or not
s->m_isIncoming = isIncoming;
// turn this off
s->m_streamingMode = false;
// . a 30 sec timeout, we don't want slow guys using all our sockets
// . they could easily flood us anyway though
// . we need to wait possibly a few minutes for a large inject of
@ -1434,7 +1443,7 @@ void writeSocketWrapper ( int sd , void *state ) {
// if socket has nothing to send yet cuz we're waiting, wait...
if ( s->m_sendBufUsed == 0 ) return;
sendAgain:
// sendAgain:
// . writeSocket returns false if blocked, true otherwise
// . it also sets g_errno on error
@ -1451,13 +1460,16 @@ void writeSocketWrapper ( int sd , void *state ) {
// if callback changed socket status to ST_SEND_AGAIN
// then let's send the new buffer that it has. Diffbot.cpp uses this.
if ( s->m_sockState == ST_SEND_AGAIN ) {
s->m_sockState = ST_WRITING;
// if nothing left to send just return
if ( ! s->m_sendBuf ) return;
// otherwise send it
goto sendAgain;
}
//if ( s->m_sockState == ST_SEND_AGAIN ) {
// s->m_sockState = ST_WRITING;
// // if nothing left to send just return
// if ( ! s->m_sendBuf ) return;
// // otherwise send it
// goto sendAgain;
//}
// wait for it to exit streaming mode before destroying
if ( s->m_streamingMode ) return;
// . destroy the socket on error, recycle on transaction completion
// . this will also unregister all our callbacks for the socket
@ -1673,6 +1685,14 @@ connected:
// . calls the callback governing "s" if it has one
void TcpServer::destroySocket ( TcpSocket *s ) {
if ( ! s ) return ;
// sanity, must exit streaming mode before destruction
if ( s->m_streamingMode ) {
log("tcp: destroying socket in streaming mode. err=%s",
mstrerror(g_errno));
//char *xx=NULL;*xx=0; }
}
// sanity check
if ( s->m_udpSlot ) {
log("tcp: sending back error on udp slot err=%s",
@ -1864,6 +1884,7 @@ void TcpServer::recycleSocket ( TcpSocket *s ) {
//s->m_timeout = 60*1000;
s->m_timeout = 10*60*1000;
s->m_udpSlot = NULL;
s->m_streamingMode = false;
// keep it alive for other dialogs
s->m_sockState = ST_AVAILABLE;
s->m_startTime = gettimeofdayInMilliseconds();
@ -2097,6 +2118,7 @@ TcpSocket *TcpServer::acceptSocket ( ) {
s->m_sockState = ST_READING;
s->m_this = this;
s->m_udpSlot = NULL;
s->m_streamingMode = false;
if ( ! m_useSSL ) return s;
@ -2214,3 +2236,56 @@ void TcpServer::cancel ( void *state ) {
destroySocket ( s );
}
}
#include "SafeBuf.h"
bool TcpServer::sendChunk ( TcpSocket *s ,
SafeBuf *sb ,
void *state ,
// call this function when done sending this chunk
// so that it can read another chunk and call
// sendChunk() again.
void (* doneSendingWrapper)( void *,TcpSocket *) ,
bool lastChunk ) {
log("tcp: sending chunk of %li bytes", sb->length() );
// if socket had shit on there already, free that memory
// just like TcpServer::destroySocket would
if ( s->m_sendBuf ) {
mfree (s->m_sendBuf, s->m_sendBufSize,"TcpServer");
s->m_sendBuf = NULL;
}
// reset send stats just in case
s->m_sendOffset = 0;
s->m_totalSent = 0;
s->m_totalToSend = 0;
// let it know not to close the socket while this is set
if ( ! lastChunk ) s->m_streamingMode = true;
else s->m_streamingMode = false;
// . start the send process
// . returns false if send did not complete
// . returns true and sets g_errno on error
if ( ! sendMsg ( s ,
sb->getBufStart(), // sendBuf ,
sb->getCapacity(),//sendBufSize ,
sb->length(),//sendBufSize ,
sb->length(), // msgtotalsize
state , // data for callback
doneSendingWrapper ) ) { // callback
// do not free the sendbuf; we are transmitting it
sb->detachBuf();
return false;
}
// we sent without blocking
sb->detachBuf();
// a problem?
if ( g_errno ) return true;
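// (true either way: as with sendMsg(), true just means we did not block;
// the caller checks g_errno to tell an error from a clean completion)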
return true;
}
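
A minimal caller sketch (not part of this commit) of how streaming mode is meant to be driven: the done-sending wrapper keeps feeding sendChunk() the next piece until it passes lastChunk=true, which clears m_streamingMode so writeSocketWrapper can finally recycle or close the socket. StreamState, m_chunksLeft and the m_tcp pointer are illustrative assumptions, not names from the codebase; the first chunk would be queued the same way by whatever request handler owns the socket.

#include "TcpServer.h"   // TcpSocket, sendChunk()
#include "SafeBuf.h"

// hypothetical caller state, for illustration only
class StreamState {
public:
        TcpServer *m_tcp;        // server that owns the socket
        long       m_chunksLeft; // stand-in for a real data source
};

// called by TcpServer each time the previous chunk has been written out
static void doneSendingWrapper ( void *state , TcpSocket *s ) {
        StreamState *ss = (StreamState *)state;
        // nothing left? then the final chunk already went out with
        // lastChunk=true and m_streamingMode is already off
        if ( ss->m_chunksLeft <= 0 ) return;
        SafeBuf sb;
        sb.safePrintf ( "chunk #%li\n" , ss->m_chunksLeft );
        ss->m_chunksLeft--;
        bool lastChunk = ( ss->m_chunksLeft <= 0 );
        // sendChunk() detaches sb's buffer so it is not freed when sb
        // goes out of scope; on the last chunk m_streamingMode is cleared
        // and the socket can then be recycled or closed normally
        ss->m_tcp->sendChunk ( s , &sb , ss , doneSendingWrapper , lastChunk );
}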

@ -99,6 +99,15 @@ class TcpServer {
long maxOtherDocLen );
bool sendChunk ( class TcpSocket *s ,
class SafeBuf *sb ,
void *state ,
// call this function when done sending this chunk
// so that it can read another chunk and call
// sendChunk() again.
void (* doneSendingWrapper)( void *state,TcpSocket *),
bool lastChunk );
// . returns false if blocked, true otherwise
// . sets errno on error
// . use this for sending a msg to another host

@ -29,7 +29,7 @@
// hack to repopulate the socket's send buf when it's done sending
// its current sendbuf in order to transmit large amounts of data that
// can't all fit in memory at the same time:
#define ST_SEND_AGAIN 10
//#define ST_SEND_AGAIN 10
#define TCP_READ_BUF_SIZE 1024
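
For contrast, a rough sketch of the pattern this retires, reconstructed from the commented-out ST_SEND_AGAIN block in writeSocketWrapper above; the buffer size, fill step and allocation note string are placeholders. The completion callback swapped a fresh buffer onto the socket and set ST_SEND_AGAIN instead of calling sendChunk().

// old pattern, now removed: repopulate the socket's send buffer inside
// the completion callback, then flag it so writeSocketWrapper loops again
static void oldStyleDoneWrapper ( void *state , TcpSocket *s ) {
        long  size = 16*1024;                            // placeholder size
        char *buf  = (char *)mmalloc ( size , "chunk" ); // placeholder note
        if ( ! buf ) return;
        // ... fill buf with the next piece of the reply ...
        s->m_sendBuf     = buf;
        s->m_sendBufSize = size;
        s->m_sendBufUsed = size;
        s->m_sendOffset  = 0;
        s->m_totalToSend = size;
        s->m_totalSent   = 0;
        s->m_sockState   = ST_SEND_AGAIN; // made writeSocketWrapper loop again
}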
@ -117,6 +117,7 @@ class TcpSocket {
long m_maxOtherDocLen; // if reading other doc types
char m_niceness;
char m_streamingMode;
long m_shutdownStart;

@ -105,7 +105,7 @@ bool Tfndb::init2 ( long treeMem ) {
return false;
return true;
}
/*
bool Tfndb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;
@ -117,7 +117,7 @@ bool Tfndb::addColl ( char *coll, bool doVerify ) {
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}
*/
bool Tfndb::verify ( char *coll ) {
log ( LOG_INFO, "db: Verifying Tfndb for coll %s...", coll );
g_threads.disableThreads();

@ -27,7 +27,7 @@ class Tfndb {
bool verify ( char *coll );
bool addColl ( char *coll, bool doVerify = true );
//bool addColl ( char *coll, bool doVerify = true );
// set up our private rdb
bool init ( );

@ -284,7 +284,12 @@ bool Threads::init ( ) {
// with high niceness cuz it would hold up high priority ones!
// . TODO: is there a better way? cancel it when UdpServer calls
// Threads::suspendLowPriorityThreads() ?
if ( ! g_threads.registerType ( MERGE_THREAD , 2/*maxThreads*/,1000) )
// . this used to be 2 but now defaults to 10 in Parms.cpp. i found
// i have fewer long gray lines in the performance graph when i
// did that on trinity.
long max2 = g_conf.m_maxCpuMergeThreads;
if ( max2 < 1 ) max2 = 1;
if ( ! g_threads.registerType ( MERGE_THREAD , max2,1000) )
return log("thread: Failed to register thread type." );
// will raising this from 1 to 2 make it faster too?
// i raised since global specs new servers have 2 (hyperthreaded?) cpus
@ -300,7 +305,11 @@ bool Threads::init ( ) {
return log("thread: Failed to register thread type." );
// . File.cpp spawns a rename thread for doing renames and unlinks
// . doing a tight merge on titldb can be ~250 unlinks
if ( ! g_threads.registerType ( UNLINK_THREAD,1/*maxThreads*/,3000) )
// . MDW up from 1 to 30 max, after doing a ddump on 3000+ collections
// it was taking forever to go one at a time through the unlink
// thread queue. seemed like a 1 second space between unlinks.
// 1/23/2014
if ( ! g_threads.registerType ( UNLINK_THREAD,30/*maxThreads*/,3000) )
return log("thread: Failed to register thread type." );
// generic multipurpose
if ( ! g_threads.registerType (GENERIC_THREAD,100/*maxThreads*/,100) )
@ -1120,7 +1129,7 @@ void makeCallback ( ThreadEntry *t ) {
// then set it
if ( t->m_niceness >= 1 ) g_niceness = 1;
else g_niceness = 0;
t->m_callback ( t->m_state , t );
// time it?

@ -124,7 +124,7 @@ bool Titledb::init2 ( long treeMem ) {
// validate
//return verify ( );
}
/*
bool Titledb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;
@ -136,7 +136,7 @@ bool Titledb::addColl ( char *coll, bool doVerify ) {
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}
*/
bool Titledb::verify ( char *coll ) {
log ( LOG_DEBUG, "db: Verifying Titledb for coll %s...", coll );
g_threads.disableThreads();

@ -39,7 +39,7 @@ class Titledb {
bool verify ( char *coll );
bool addColl ( char *coll, bool doVerify = true );
//bool addColl ( char *coll, bool doVerify = true );
// init m_rdb
bool init ();

@ -131,8 +131,8 @@ static char s_compBuf[COMPBUFSIZE];
// Kompatible Decomposition table must be loaded before calling this
bool initCompositionTable(){
if ( ! s_isInitialized ) {
log(LOG_INFO,"conf: UCNormalizer: "
"initializing Full Composition table");
//log(LOG_INFO,"conf: UCNormalizer: "
// "initializing Full Composition table");
// set up the hash table
//if ( ! s_compositions.set ( 8,4,16384 ) )
if (!s_compositions.set(8,4,65536,s_compBuf,(long)COMPBUFSIZE,

Some files were not shown because too many files have changed in this diff.