Merge branch 'diffbot-testing' into diffbot-dan

Matt Wells
2014-05-12 15:33:15 -07:00
78 changed files with 4633 additions and 1242 deletions

@ -467,13 +467,13 @@ bool Collectiondb::addNewColl ( char *coll ,
cr->m_collectiveRespiderFrequency = 0.0;
//cr->m_restrictDomain = true;
// reset the crawl stats
// . this will core if a host was dead and then when it came
// back up host #0's parms.cpp told it to add a new coll
cr->m_diffbotCrawlStartTime=
gettimeofdayInMillisecondsGlobalNoCore();
cr->m_diffbotCrawlEndTime = 0LL;
}
// . this will core if a host was dead and then when it came
// back up host #0's parms.cpp told it to add a new coll
cr->m_diffbotCrawlStartTime = getTimeGlobalNoCore();
cr->m_diffbotCrawlEndTime = 0;
// . just the basics on these for now
// . if certain parms are changed then the url filters
// must be rebuilt, as well as possibly the waiting tree!!!
@ -807,6 +807,11 @@ bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
sc->clearLocks();
//sc->m_collnum = newCollnum;
//sc->reset();
// you have to set this for tryToDeleteSpiderColl to
// actually have a shot at deleting it
sc->m_deleteMyself = true;
// cr will be invalid shortly after this
sc->m_cr = NULL;
// this will put it on "death row" so it will be deleted
// once Msg5::m_waitingForList/Merge is NULL
tryToDeleteSpiderColl ( sc );
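The deleteRec2() change above defers destruction rather than freeing the SpiderColl inline: it marks m_deleteMyself, detaches m_cr, and hands the object to tryToDeleteSpiderColl(), which only frees it once no list read or merge still references it. A minimal standalone sketch of that deferred-deletion pattern (the Worker type and tryToDelete() are illustrative stand-ins, not the Gigablast API):
#include <cstdio>
// Illustrative stand-in for SpiderColl: deletion is deferred until no
// in-flight operation still references the object.
struct Worker {
    bool m_deleteMyself = false;   // owner wants it gone
    bool m_busy         = false;   // an async read/merge is in flight
};
// Called when the owner abandons the object and again when an in-flight
// operation finishes; it frees the object only when both are true.
void tryToDelete(Worker *w) {
    if (!w->m_deleteMyself) return;   // owner still wants it
    if (w->m_busy)          return;   // still referenced; try again later
    printf("freeing worker\n");
    delete w;
}
int main() {
    Worker *w = new Worker;
    w->m_busy = true;            // an operation is in flight
    w->m_deleteMyself = true;    // owner puts it on "death row"
    tryToDelete(w);              // no-op: still busy
    w->m_busy = false;           // the operation completes...
    tryToDelete(w);              // ...and now it really gets freed
    return 0;
}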
@ -1125,6 +1130,11 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
// to any rdb...
cr->m_collnum = newCollnum;
// update the timestamps since we are restarting/resetting
cr->m_diffbotCrawlStartTime = getTimeGlobalNoCore();
cr->m_diffbotCrawlEndTime = 0;
////////
//
// ALTER m_recs[] array
@ -1252,6 +1262,33 @@ CollectionRec *Collectiondb::getRec ( HttpRequest *r , bool useDefaultRec ) {
return g_collectiondb.getRec ( coll );
}
char *Collectiondb::getDefaultColl ( HttpRequest *r ) {
char *coll = r->getString ( "c" );
if ( coll && ! coll[0] ) coll = NULL;
if ( coll ) return coll;
CollectionRec *cr = NULL;
// default to main first
if ( ! coll ) {
cr = g_collectiondb.getRec("main");
// CAUTION: cr could be deleted so don't trust this ptr
// if you give up control of the cpu
if ( cr ) return cr->m_coll;
}
// try next in line
if ( ! coll ) {
cr = getFirstRec ();
if ( cr ) return cr->m_coll;
}
// give up?
return NULL;
}
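getDefaultColl() resolves the collection for a request in a fixed order: an explicit, non-empty c= parameter wins, then the "main" collection, then the first collection record, and NULL only if no collections exist; per the CAUTION comment, the returned m_coll pointer should be used before yielding the CPU. A self-contained sketch of the same fallback chain (pickColl() and the sample names are illustrative, not the real API):
#include <cstring>
#include <cstdio>
// Fallback chain mirroring getDefaultColl(): explicit name first,
// then "main", then the first known collection, else NULL.
const char *pickColl(const char *requested, const char **known, int numKnown) {
    if (requested && requested[0]) return requested;       // explicit c=...
    for (int i = 0; i < numKnown; i++)                      // prefer "main"
        if (strcmp(known[i], "main") == 0) return known[i];
    if (numKnown > 0) return known[0];                      // next in line
    return NULL;                                            // give up
}
int main() {
    const char *colls[] = { "crawl7", "main" };
    printf("%s\n", pickColl(NULL, colls, 2));      // "main"
    printf("%s\n", pickColl("", colls, 2));        // empty string -> "main"
    printf("%s\n", pickColl("crawl7", colls, 2));  // explicit name wins
    return 0;
}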
//CollectionRec *Collectiondb::getRec2 ( HttpRequest *r , bool useDefaultRec) {
// char *coll = getDefaultColl();
// return g_collectiondb.getRec(coll);
//}
// . get collectionRec from name
// . returns NULL if not available
CollectionRec *Collectiondb::getRec ( char *coll ) {
@ -1584,12 +1621,14 @@ void CollectionRec::reset() {
sc->m_deleteMyself = true;
// if not currently being accessed nuke it now
if ( ! sc->m_msg5.m_waitingForList &&
! sc->m_msg5b.m_waitingForList &&
! sc->m_msg1.m_mcast.m_inUse ) {
mdelete ( sc, sizeof(SpiderColl),"nukecr2");
delete ( sc );
}
tryToDeleteSpiderColl ( sc );
// if ( ! sc->m_msg5.m_waitingForList &&
// ! sc->m_msg5b.m_waitingForList &&
// ! sc->m_msg1.m_mcast.m_inUse ) {
// mdelete ( sc, sizeof(SpiderColl),"nukecr2");
// delete ( sc );
// }
}
CollectionRec *g_cr = NULL;
@ -1617,7 +1656,7 @@ bool CollectionRec::load ( char *coll , long i ) {
strcpy ( m_coll , coll );
if ( ! g_conf.m_doingCommandLine )
log(LOG_INFO,"db: loading conf for collection %s (%li)",coll,
log(LOG_INFO,"db: Loading conf for collection %s (%li)",coll,
(long)m_collnum);
// collection name HACK for backwards compatibility
@ -1649,7 +1688,7 @@ bool CollectionRec::load ( char *coll , long i ) {
// LOAD LOCAL
snprintf ( tmp1 , 1023, "%scoll.%s.%li/localcrawlinfo.dat",
g_hostdb.m_dir , m_coll , (long)m_collnum );
log(LOG_DEBUG,"db: loading %s",tmp1);
log(LOG_DEBUG,"db: Loading %s",tmp1);
m_localCrawlInfo.reset();
SafeBuf sb;
// fillFromFile returns 0 if the file does not exist, -1 on read error
@ -1660,7 +1699,7 @@ bool CollectionRec::load ( char *coll , long i ) {
if ( ! g_conf.m_doingCommandLine )
log("coll: loaded %s (%li) local hasurlsready=%li",
log("coll: Loaded %s (%li) local hasurlsready=%li",
m_coll,
(long)m_collnum,
(long)m_localCrawlInfo.m_hasUrlsReadyToSpider);
@ -1698,7 +1737,7 @@ bool CollectionRec::load ( char *coll , long i ) {
// LOAD GLOBAL
snprintf ( tmp1 , 1023, "%scoll.%s.%li/globalcrawlinfo.dat",
g_hostdb.m_dir , m_coll , (long)m_collnum );
log(LOG_DEBUG,"db: loading %s",tmp1);
log(LOG_DEBUG,"db: Loading %s",tmp1);
m_globalCrawlInfo.reset();
sb.reset();
if ( sb.fillFromFile ( tmp1 ) > 0 )
@ -1707,7 +1746,7 @@ bool CollectionRec::load ( char *coll , long i ) {
memcpy ( &m_globalCrawlInfo , sb.getBufStart(),sb.length() );
if ( ! g_conf.m_doingCommandLine )
log("coll: loaded %s (%li) global hasurlsready=%li",
log("coll: Loaded %s (%li) global hasurlsready=%li",
m_coll,
(long)m_collnum,
(long)m_globalCrawlInfo.m_hasUrlsReadyToSpider);
@ -1865,6 +1904,9 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
// addDefault = true;
if ( ! rebuild ) return true;
if ( m_urlFiltersProfile == UFP_CHINESE )
return rebuildChineseRules();
long n = 0;
/*
@ -1948,7 +1990,6 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("hopcount==0 && iswww");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
@ -2111,6 +2152,383 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
return true;
}
bool CollectionRec::rebuildChineseRules ( ) {
long n = 0;
m_regExs[n].set("isdocidbased");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 80;
n++;
m_regExs[n].set("ismedia");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = -3; // delete!
n++;
// if not in the site list then nuke it
m_regExs[n].set("!ismanualadd && !insitelist");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = -3; // delete!
n++;
m_regExs[n].set("errorcount>=3 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // 30 days default
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 3;
n++;
m_regExs[n].set("errorcount>=1 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // 30 days default
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 45;
n++;
m_regExs[n].set("isaddurl");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 85;
n++;
m_regExs[n].set("hopcount==0 && iswww && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 50;
n++;
m_regExs[n].set("hopcount==0 && iswww && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 50;
n++;
m_regExs[n].set("hopcount==0 && iswww && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 20;
n++;
m_regExs[n].set("hopcount==0 && iswww && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 48;
n++;
m_regExs[n].set("hopcount==0 && iswww && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 48;
n++;
m_regExs[n].set("hopcount==0 && iswww");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 19;
n++;
m_regExs[n].set("hopcount==0 && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 49;
n++;
m_regExs[n].set("hopcount==0 && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 49;
n++;
m_regExs[n].set("hopcount==0 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 18;
n++;
m_regExs[n].set("hopcount==0 && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 47;
n++;
m_regExs[n].set("hopcount==0 && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 47;
n++;
m_regExs[n].set("hopcount==0");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 17;
n++;
m_regExs[n].set("hopcount==1 && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 40;
n++;
m_regExs[n].set("hopcount==1 && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 40;
n++;
m_regExs[n].set("hopcount==1 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 16;
n++;
m_regExs[n].set("hopcount==1 && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 39;
n++;
m_regExs[n].set("hopcount==1 && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 39;
n++;
m_regExs[n].set("hopcount==1");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 15;
n++;
m_regExs[n].set("hopcount==2 && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
n++;
m_regExs[n].set("hopcount==2 && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
n++;
m_regExs[n].set("hopcount==2 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 14;
n++;
m_regExs[n].set("hopcount==2 && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 29;
n++;
m_regExs[n].set("hopcount==2 && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 29;
n++;
m_regExs[n].set("hopcount==2");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 13;
n++;
m_regExs[n].set("hopcount>=3 && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 22;
n++;
m_regExs[n].set("hopcount>=3 && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 22;
n++;
m_regExs[n].set("hopcount>=3 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 12;
n++;
m_regExs[n].set("hopcount>=3 && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 21;
n++;
m_regExs[n].set("hopcount>=3 && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 21;
n++;
m_regExs[n].set("hopcount>=3");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 11;
n++;
m_regExs[n].set("default");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 1;
n++;
m_numRegExs = n;
m_numRegExs2 = n;
m_numRegExs3 = n;
m_numRegExs10 = n;
m_numRegExs5 = n;
m_numRegExs6 = n;
m_numRegExs8 = n;
// done rebuilding CHINESE rules
return true;
}
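rebuildChineseRules() fills the URL-filter table as parallel arrays: row n pairs the expression in m_regExs[n] with that row's harvest-links flag, respider frequency in days, spider caps, same-IP wait in milliseconds, and priority, with rows checked in order and "default" as the catch-all. A minimal sketch of that parallel-array, first-match-wins layout (the matcher below is a placeholder, not the real URL-filter expression language):
#include <cstdio>
#include <cstring>
const int MAX_RULES = 8;
// Parallel arrays, one row per rule, mirroring m_regExs[], m_spiderFreqs[],
// m_spiderPriorities[] and friends in CollectionRec.
const char *s_expr     [MAX_RULES];
float       s_freqDays [MAX_RULES];
int         s_priority [MAX_RULES];
int         s_numRules = 0;
void addRule(const char *expr, float freqDays, int priority) {
    int n = s_numRules++;
    s_expr[n]     = expr;
    s_freqDays[n] = freqDays;
    s_priority[n] = priority;
}
// Placeholder matcher; the real table evaluates expressions like
// hopcount==0, iswww, tld==cn, parentlang==zh_cn,zh_tw,xx and so on.
bool matches(const char *expr, const char *tag) {
    return strcmp(expr, "default") == 0 || strcmp(expr, tag) == 0;
}
// The first matching row wins, exactly like the URL-filter table.
int pickPriority(const char *tag) {
    for (int n = 0; n < s_numRules; n++)
        if (matches(s_expr[n], tag)) return s_priority[n];
    return -1;
}
int main() {
    addRule("hopcount==0", 10.0f, 17);
    addRule("hopcount==1", 20.0f, 15);
    addRule("default",     60.0f,  1);
    printf("%d\n", pickPriority("hopcount==1"));   // 15
    printf("%d\n", pickPriority("hopcount==5"));   // falls through to 1
    return 0;
}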
/*
bool CrawlInfo::print (SafeBuf *sb ) {
return sb->safePrintf("objectsAdded:%lli\n"
@ -2350,7 +2768,6 @@ bool CollectionRec::hasSearchPermission ( TcpSocket *s , long encapIp ) {
}
bool expandRegExShortcuts ( SafeBuf *sb ) ;
//bool updateSiteListTables(collnum_t collnum,bool addSeeds,char *siteListArg);
void nukeDoledb ( collnum_t collnum );
// . anytime the url filters are updated, this function is called
@ -2358,7 +2775,7 @@ void nukeDoledb ( collnum_t collnum );
bool CollectionRec::rebuildUrlFilters ( ) {
if ( ! g_conf.m_doingCommandLine )
log("coll: rebuilding url filters for %s ufp=%li",m_coll,
log("coll: Rebuilding url filters for %s ufp=%li",m_coll,
(long)m_urlFiltersProfile);
// if not a custom crawl, and no expressions, add a default one
@ -2762,3 +3179,4 @@ void testRegex ( ) {
url,rx);
exit(0);
}

@ -75,6 +75,12 @@ class Collectiondb {
class CollectionRec *getRec ( class HttpRequest *r ,
bool useDefaultRec = true );
// do not support diffbot style token/name style for this one:
char *getDefaultColl ( HttpRequest *r ) ;
//class CollectionRec *getRec2 ( class HttpRequest *r ,
// bool useDefaultRec = true );
// . get collectionRec from name
// returns NULL if not available
class CollectionRec *getRec ( char *coll );
@ -368,6 +374,8 @@ class CollectionRec {
// for regular crawls
bool rebuildUrlFilters2();
bool rebuildChineseRules();
bool m_urlFiltersHavePageCounts;
// moved from SpiderColl so we can load up at startup
@ -408,6 +416,8 @@ class CollectionRec {
// spidered and begin the next round
long m_spiderRoundNum;
char m_makeImageThumbnails;
char m_indexSpiderReplies;
char m_indexBody;
//char m_useDatedb ;
@ -659,8 +669,9 @@ class CollectionRec {
long long m_maxToProcess;
long m_maxCrawlRounds;
long long m_diffbotCrawlStartTime;
long long m_diffbotCrawlEndTime;
// in seconds now
long m_diffbotCrawlStartTime;
long m_diffbotCrawlEndTime;
// for testing their regexes etc...
//char m_isDiffbotTestCrawl;

@ -297,8 +297,8 @@ bool Conf::init ( char *dir ) { // , long hostId ) {
log(LOG_INFO,"db: Split is FULL");
*/
// sanity check
if ( g_hostdb.m_indexSplits > MAX_INDEXDB_SPLIT ) {
log("db: Increase MAX_INDEXDB_SPLIT");
if ( g_hostdb.m_indexSplits > MAX_SHARDS ) {
log("db: Increase MAX_SHARDS");
char *xx = NULL; *xx = 0;
}
// and always keep a decent site quality cache of at least 3M

Conf.h

@ -628,6 +628,7 @@ class Conf {
bool m_logDebugDownloads;
bool m_logDebugFacebook;
bool m_logDebugHttp ;
bool m_logDebugImage ;
bool m_logDebugLoop ;
bool m_logDebugLang ;
bool m_logDebugLinkInfo ;

@ -169,6 +169,7 @@ case EWAITINGTOSYNCHOSTSCONF: return "Wait to ensure hosts.conf in sync";
case EDOCNONCANONICAL: return "Url was dup of canonical page";
case ECUSTOMCRAWLMISMATCH: return "Job name/type mismatch. Job name has already been used for a crawl or bulk job.";
case ENOTOKEN: return "Missing token";
case EBADIMG: return "Bad image";
}
// if the remote error bit is clear it must be a regular errno
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );

@ -172,6 +172,7 @@ enum {
EWAITINGTOSYNCHOSTSCONF,
EDOCNONCANONICAL,
ECUSTOMCRAWLMISMATCH, // a crawl request was made with a name that already existed for bulk request (or the other way around)
ENOTOKEN
ENOTOKEN,
EBADIMG
};
#endif

@ -43,6 +43,7 @@ void Hostdb::resetPortTables () {
}
static int cmp ( const void *h1 , const void *h2 ) ;
//static int cmp2 ( const void *h1 , const void *h2 ) ;
//static void *syncStartWrapper_r ( void *state );
@ -98,7 +99,7 @@ char *Hostdb::getNetName ( ) {
// . gets filename that contains the hosts from the Conf file
// . return false on error
// . g_errno may NOT be set
bool Hostdb::init ( long hostId , char *netName ,
bool Hostdb::init ( long hostIdArg , char *netName ,
bool proxyHost , char useTmpCluster , char *cwd ) {
// reset my ip and port
m_myIp = 0;
@ -118,6 +119,12 @@ bool Hostdb::init ( long hostId , char *netName ,
char *filename = "hosts.conf";
//if ( strcmp ( filename , "hosts.conf" ) == 0 )
// filename = "localhosts.conf";
//bool triedEtc = false;
// for now we autodetermine
if ( hostIdArg != -1 ) { char *xx=NULL;*xx=0; }
// init to -1
m_hostId = -1;
retry:
@ -136,11 +143,11 @@ bool Hostdb::init ( long hostId , char *netName ,
m_netName[0] = '\0';
if ( netName ) strncpy ( m_netName , netName , 31 );
// make sure our hostId is in our conf file
if ( hostId < 0 )
return log(
"conf: Negative hostId %li supplied",hostId);
//if ( hostId < 0 )
// return log(
// "conf: Negative hostId %li supplied",hostId);
// set early for calling log()
m_hostId = hostId;
//m_hostId = hostId;
// set clock in sync in fctypes.cpp
//if ( m_hostId == 0 ) g_clockInSync = true;
// log it
@ -174,15 +181,18 @@ bool Hostdb::init ( long hostId , char *netName ,
if ( this == &g_hostdb2 ) return true;
g_errno = ENOHOSTSFILE;
// if doing localhosts.conf now try hosts.conf
if ( strcmp(filename,"localhosts.conf") == 0 ) {
filename = "hosts.conf";
g_errno = 0;
goto retry;
}
// if ( ! triedEtc ) { //strcmp(filename,"hosts.conf") == 0 ) {
// triedEtc = true;
// dir = "/etc/gigablast/";
// //filename = "hosts.conf";
// g_errno = 0;
// goto retry;
// }
// now we generate one if that is not there
if ( ! m_created ) {
m_created = true;
g_errno = 0;
dir = cwd;
createHostsConf( cwd );
goto retry;
}
@ -200,6 +210,10 @@ bool Hostdb::init ( long hostId , char *netName ,
filename,m_bufSize,
(long)(MAX_HOSTS+MAX_SPARES)*128);
}
// note it
//log("host: reading %s",f.getFilename());
// save it
//m_hostsConfFilename.safePrintf("%s",f.getFilename());
// open the file
if ( ! f.open ( O_RDONLY ) ) return false;
// read in the file
@ -1016,14 +1030,19 @@ bool Hostdb::init ( long hostId , char *netName ,
// set # of machines
m_numMachines = next;
// get IPs of this server. last entry is 0.
long *localIps = getLocalIps();
// now get host based on cwd and ip
Host *host = getHost2 ( cwd , localIps );
// now set m_myIp, m_myPort, m_myPort2 and m_myMachineNum
Host *host = getHost ( hostId );
//Host *host = getHost ( hostId );
if ( proxyHost )
host = getProxy ( hostId );
host = getProxy2 ( cwd , localIps ); //hostId );
if ( ! host )
return log(
"conf: Could not find host with hostId %li in "
"%s.",hostId,filename);
return log("conf: Could not find host with path %s and "
"local ip in %s",cwd,filename);
m_myIp = host->m_ip; // internal IP
m_myIpShotgun = host->m_ipShotgun;
m_myPort = host->m_port; // low priority udp port
@ -1098,7 +1117,7 @@ bool Hostdb::init ( long hostId , char *netName ,
*/
// THIS hostId
m_hostId = hostId;
m_hostId = m_myHost->m_hostId;
// set hosts per shard (mirror group)
m_numHostsPerShard = m_numHosts / m_numShards;
@ -1131,17 +1150,17 @@ bool Hostdb::init ( long hostId , char *netName ,
}
// get THIS host
Host *h = getHost ( hostId );
Host *h = getHost ( m_hostId );
if ( proxyHost )
h = getProxy ( hostId );
h = getProxy ( m_hostId );
if ( ! h ) return log(
"conf: HostId %li not found in %s.",
hostId,filename);
m_hostId,filename);
// set m_dir to THIS host's working dir
strcpy ( m_dir , h->m_dir );
// likewise, set m_htmlDir to this host's html dir
sprintf ( m_httpRootDir , "%shtml/" , m_dir );
sprintf ( m_logFilename , "%slog%03li", m_dir , hostId );
sprintf ( m_logFilename , "%slog%03li", m_dir , m_hostId );
if ( ! g_conf.m_runAsDaemon )
sprintf(m_logFilename,"/dev/stderr");
@ -2297,6 +2316,14 @@ uint32_t Hostdb::getShardNumByTermId ( void *k ) {
return m_map [(*(uint16_t *)((char *)k + 16))>>3];
}
// uint32_t Hostdb::getShardNumFromTermId ( long long termId ) {
// key144_t sk;
// // make fake posdb key
// makeStartKey ( &sk, termId );
// // and use this
// return getShardNumByTermId ( &sk );
// }
// . if false, we don't split index and date lists, other dbs are unaffected
// . this obsoletes the g_*.getGroupId() functions
// . this allows us to have any # of groups in a stripe, not just power of 2
@ -2494,6 +2521,12 @@ bool Hostdb::createHostsConf( char *cwd ) {
sb.safePrintf("# Tells us what hosts are participating in the distributed search engine.\n");
sb.safePrintf("\n");
sb.safePrintf("\n");
// put our cwd here
sb.safePrintf("0 5998 7000 8000 9000 127.0.0.1 127.0.0.1 %s\n",cwd);
sb.safePrintf("\n");
sb.safePrintf("\n");
sb.safePrintf("# How many mirrors do you want? If this is 0 then your data\n");
sb.safePrintf("# will NOT be replicated. If it is 1 then each host listed\n");
sb.safePrintf("# below will have one host that mirrors it, thereby decreasing\n");
@ -2548,10 +2581,6 @@ bool Hostdb::createHostsConf( char *cwd ) {
sb.safePrintf("# 'gb' binary resides.\n");
sb.safePrintf("#\n");
// put our cwd here
sb.safePrintf("0 5998 7000 8000 9000 127.0.0.1 127.0.0.1 %s\n",cwd);
sb.safePrintf("\n");
sb.safePrintf("\n");
sb.safePrintf("#\n");
sb.safePrintf("# Example of a four-node distributed search index running on a single\n");
sb.safePrintf("# server with four cores. The working directories are /home/mwells/hostN/.\n");
@ -2616,3 +2645,74 @@ bool Hostdb::createHostsConf( char *cwd ) {
sb.save ( cwd , "hosts.conf" );
return true;
}
static long s_localIps[20];
#include <sys/types.h>
#include <ifaddrs.h>
long *getLocalIps ( ) {
static bool s_valid = false;
if ( s_valid ) return s_localIps;
s_valid = true;
struct ifaddrs *ifap = NULL;
getifaddrs( &ifap );
ifaddrs *p = ifap;
long ni = 0;
// store loopback just in case
long loopback = atoip("127.0.0.1");
s_localIps[ni++] = loopback;
for ( ; p && ni < 18 ; p = p->ifa_next ) {
// skip entries with no address at all (getifaddrs can return those)
if ( ! p->ifa_addr ) continue;
long ip = ((struct sockaddr_in*)p->ifa_addr)->sin_addr.s_addr;
// skip if loopback we stored above
if ( ip == loopback ) continue;
// skip bogus ones
if ( (unsigned long)ip <= 10 ) continue;
// show it
//log("host: detected local ip %s",iptoa(ip));
// otherwise store it
s_localIps[ni++] = ip;
}
// mark the end of it
s_localIps[ni] = 0;
// free that memory
freeifaddrs ( ifap );
// return the static buffer
return s_localIps;
}
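getLocalIps() walks the getifaddrs() list into a static, zero-terminated array, keeping 127.0.0.1 first and skipping tiny bogus values. It still reads every remaining entry as a sockaddr_in, though; on a host with IPv6 or packet-socket interfaces those entries are not IPv4, so a stricter variant filters on the address family first. A hedged, self-contained sketch of that variant (not the code above):
#include <sys/types.h>
#include <ifaddrs.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <cstdio>
// Collect up to maxIps IPv4 addresses, zero-terminated, skipping
// entries with no address or a non-IPv4 family.
int getIPv4Addrs(unsigned long *out, int maxIps) {
    struct ifaddrs *ifap = NULL;
    if (getifaddrs(&ifap) != 0) { out[0] = 0; return 0; }
    int n = 0;
    for (struct ifaddrs *p = ifap; p && n < maxIps - 1; p = p->ifa_next) {
        if (!p->ifa_addr) continue;                      // no address at all
        if (p->ifa_addr->sa_family != AF_INET) continue; // IPv6, AF_PACKET...
        struct sockaddr_in *sin = (struct sockaddr_in *)p->ifa_addr;
        out[n++] = sin->sin_addr.s_addr;
    }
    out[n] = 0;                                          // mark the end
    freeifaddrs(ifap);
    return n;
}
int main() {
    unsigned long ips[20];
    int n = getIPv4Addrs(ips, 20);
    for (int i = 0; i < n; i++) {
        struct in_addr a; a.s_addr = ips[i];
        printf("%s\n", inet_ntoa(a));
    }
    return 0;
}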
Host *Hostdb::getHost2 ( char *cwd , long *localIps ) {
for ( long i = 0 ; i < m_numHosts ; i++ ) {
Host *h = &m_hosts[i];
// . get the path. guaranteed to end in '/'
// as well as cwd!
// . if the gb binary does not reside in the working dir
// for this host, skip it, it's not our host
if ( strcmp(h->m_dir,cwd) ) continue;
// now it must be our ip as well!
long *ipPtr = localIps;
for ( ; *ipPtr ; ipPtr++ )
// return the host if it also matches the ip!
if ( (long)h->m_ip == *ipPtr ) return h;
}
// what, no host?
return NULL;
}
Host *Hostdb::getProxy2 ( char *cwd , long *localIps ) {
for ( long i = 0 ; i < m_numProxyHosts ; i++ ) {
Host *h = getProxy(i);
if ( ! (h->m_type & HT_PROXY ) ) continue;
// . get the path. guaranteed to end in '/'
// as well as cwd!
// . if the gb binary does not reside in the working dir
// for this host, skip it, it's not our host
if ( strcmp(h->m_dir,cwd) ) continue;
// now it must be our ip as well!
long *ipPtr = localIps;
for ( ; *ipPtr ; ipPtr++ )
// return the host if it also matches the ip!
if ( (long)h->m_ip == *ipPtr ) return h;
}
// what, no host?
return NULL;
}
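getHost2() and getProxy2() identify "this" host by comparing each configured host's working directory against the process cwd and its IP against the machine's local interface addresses, which is what lets init() drop the explicit hostId argument. A self-contained sketch of that matching rule (HostRec and the sample data are made up for illustration):
#include <cstring>
#include <cstdio>
struct HostRec { int id; const char *dir; unsigned ip; };
// Return the host whose working dir matches cwd AND whose ip appears in
// the zero-terminated list of local ips -- the same rule as getHost2().
const HostRec *findSelf(const HostRec *hosts, int n,
                        const char *cwd, const unsigned *localIps) {
    for (int i = 0; i < n; i++) {
        if (strcmp(hosts[i].dir, cwd) != 0) continue;    // wrong working dir
        for (const unsigned *p = localIps; *p; p++)      // list ends at 0
            if (hosts[i].ip == *p) return &hosts[i];     // dir + ip match
    }
    return NULL;                                         // not in hosts.conf
}
int main() {
    HostRec hosts[] = { { 0, "/home/gb/host0/", 0x0100007f },
                        { 1, "/home/gb/host1/", 0x0100007f } };
    unsigned localIps[] = { 0x0100007f, 0 };   // 127.0.0.1 as stored by s_addr
    const HostRec *me = findSelf(hosts, 2, "/home/gb/host1/", localIps);
    printf("my hostId = %d\n", me ? me->id : -1);        // 1
    return 0;
}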

@ -62,6 +62,8 @@ enum {
#define HT_SCPROXY 0x10
#define HT_ALL_PROXIES (HT_PROXY|HT_QCPROXY|HT_SCPROXY)
long *getLocalIps ( ) ;
class EventStats {
public:
long m_expired;
@ -455,9 +457,17 @@ class Hostdb {
// get the host in this group with the smallest avg roundtrip time
//Host *getFastestHostInGroup ( unsigned long groupId );
// get the host that has this path/ip
Host *getHost2 ( char *cwd , long *localIps ) ;
Host *getProxy2 ( char *cwd , long *localIps ) ;
// . like above but just gets one host
// Host *getHost ( long hostId ) { return m_groups[hostId]; };
Host *getHost ( long hostId ) { return m_hostPtrs[hostId]; };
Host *getHost ( long hostId ) {
if ( hostId < 0 ) { char *xx=NULL;*xx=0; }
return m_hostPtrs[hostId];
};
Host *getSpare ( long spareId ) {
return m_spareHosts[spareId]; };
@ -672,6 +682,8 @@ inline uint32_t getShardNumFromDocId ( long long d ) {
return g_hostdb.getShardNumFromDocId ( d );
};
//inline uint32_t getShardNumFromTermId ( long long termId );
//inline uint32_t getGroupId ( char rdbId, void *key,bool split = true) {
// return g_hostdb.getGroupId ( rdbId , key , split );
//};

@ -24,7 +24,9 @@ char *g_contentTypeStrings [] = {
"bmp" , // 13
"javascript" , // 14
"css" , // 15
"json" // 16
"json" , // 16
"image", // 17
"status" // 18
};
HttpMime::HttpMime () { reset(); }

@ -37,6 +37,7 @@ time_t atotime5 ( char *s ) ;
#define CT_CSS 15
#define CT_JSON 16
#define CT_IMAGE 17
#define CT_STATUS 18 // an internal type indicating spider reply
#define ET_IDENTITY 0
#define ET_GZIP 1

@ -36,6 +36,8 @@
#define FORMAT_PROCOG 6
#define FORMAT_WIDGET_IFRAME 7
#define FORMAT_WIDGET_AJAX 8
// used by ajax widget to create search results to APPEND to the end of widget
#define FORMAT_WIDGET_APPEND 9
class HttpRequest {

@ -95,11 +95,11 @@ bool HttpServer::init ( short port,
m_ssltcp.reset();
}
// log an innocent msg
log(LOG_INIT,"http: listening on TCP port %i with sd=%i",
log(LOG_INIT,"http: Listening on TCP port %i with sd=%i",
port, m_tcp.m_sock );
// log for https
if (m_ssltcp.m_ready)
log(LOG_INIT,"https: listening on TCP port %i with sd=%i",
log(LOG_INIT,"https: Listening on TCP port %i with sd=%i",
sslPort, m_ssltcp.m_sock );
return true;

@ -5,16 +5,14 @@
#include "Sections.h"
#include "XmlDoc.h"
#include "Threads.h"
//#include "Msg16.h" // my_system_r()
#include "Hostdb.h"
#include "XmlDoc.h" // my_system_r()
// TODO: image is bad if repeated on same page, check for that
static void gotTermFreqWrapper ( void *state ) ;
//static void gotTermFreqWrapper ( void *state ) ;
static void gotTermListWrapper ( void *state ) ;
static void gotImageWrapper ( void *state ) ;
static void *thumbStartWrapper_r ( void *state , ThreadEntry *te );
static void thumbDoneWrapper ( void *state , ThreadEntry *te );
static void getImageInfo ( char *buf, long size, long *dx, long *dy, long *it);
Images::Images ( ) {
@ -26,10 +24,12 @@ void Images::reset() {
m_imgDataSize = 0;
m_setCalled = false;
m_thumbnailValid = false;
m_imgBuf = NULL;
m_imgBufLen = 0;
m_imgBufMaxLen = 0;
m_imgReply = NULL;
m_imgReplyLen = 0;
m_imgReplyMaxLen = 0;
m_numImages = 0;
m_imageBufValid = false;
m_phase = 0;
}
/*
@ -74,7 +74,7 @@ bool Images::hash ( long titleRecVersion ,
*/
void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml ,
Sections *sections ) {
Sections *sections , XmlDoc *xd ) {
// not valid for now
m_thumbnailValid = false;
// reset our array of image node candidates
@ -82,10 +82,15 @@ void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml ,
// flag it
m_setCalled = true;
// strange...
if ( m_imgBuf ) { char *xx=NULL;*xx=0; }
if ( m_imgReply ) { char *xx=NULL;*xx=0; }
// save this
m_xml = xml;
m_pageUrl = pageUrl;
// if we are a diffbot json reply, trust that diffbot got the
// best candidate, and just use that
if ( xd->m_isDiffbotJSONObject ) return;
//m_pageSite = pageSite;
// scan the words
long nw = words->getNumWords();
@ -156,7 +161,7 @@ void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml ,
if ( height != -1 && height < 50 ) continue;
// get the url of the image
long srcLen;
char *src = xml->getString(nn,nn+1,"src",&srcLen);
char *src = xml->getString(nn,"src",&srcLen);
// skip if none
if ( srcLen <= 2 ) continue;
// set it to the full url
@ -180,6 +185,7 @@ void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml ,
if ( strncasestr(u,ulen,"header" ) ) continue;
if ( strncasestr(u,ulen,"footer" ) ) continue;
if ( strncasestr(u,ulen,"menu" ) ) continue;
if ( strncasestr(u,ulen,"button" ) ) continue;
if ( strncasestr(u,ulen,"banner" ) ) continue;
if ( strncasestr(u,ulen,"ad.doubleclick.") ) continue;
if ( strncasestr(u,ulen,"ads.webfeat." ) ) continue;
@ -222,7 +228,7 @@ bool Images::getThumbnail ( char *pageSite ,
long long docId ,
XmlDoc *xd ,
collnum_t collnum,//char *coll ,
char **statusPtr ,
//char **statusPtr ,
long hopCount,
void *state ,
void (*callback)(void *state) ) {
@ -235,6 +241,7 @@ bool Images::getThumbnail ( char *pageSite ,
// reset here now
m_i = 0;
m_j = 0;
m_phase = 0;
// sanity check
if ( ! m_pageUrl ) { char *xx=NULL;*xx=0; }
@ -244,10 +251,18 @@ bool Images::getThumbnail ( char *pageSite ,
//if ( ! isPermalink ) return true;
// save these
m_statusPtr = statusPtr;
//m_statusPtr = statusPtr;
// save this
m_collnum = collnum;
m_docId = docId;
m_callback = callback;
m_state = state;
// if this doc is a json diffbot reply it already has the primary
// image selected so just use that
m_xd = xd;
if ( m_xd->m_isDiffbotJSONObject )
return downloadImages();
// if no candidates, we are done, no error
if ( m_numImages == 0 ) return true;
@ -280,43 +295,97 @@ bool Images::getThumbnail ( char *pageSite ,
// store the termid
long long termId = q.getTermId(0);
if ( ! m_msg36.getTermFreq ( m_collnum ,
0 , // maxAge
termId ,
this ,
gotTermFreqWrapper ,
MAX_NICENESS ,
true , // exact count?
false , // inc count?
false , // dec count?
false )) // is split?
key144_t startKey ;
key144_t endKey ;
g_posdb.makeStartKey(&startKey,termId);
g_posdb.makeEndKey (&endKey ,termId);
// get shard of that (this termlist is sharded by termid -
// see XmlDoc.cpp::hashNoSplit() where it hashes gbsitetemplate: term)
long shardNum = g_hostdb.getShardNumByTermId ( &startKey );
// if ( ! m_msg36.getTermFreq ( m_collnum ,
// 0 , // maxAge
// termId ,
// this ,
// gotTermFreqWrapper ,
// MAX_NICENESS ,
// true , // exact count?
// false , // inc count?
// false , // dec count?
// false )) // is split?
// return false;
// just use msg0 and limit to like 1k or something
if ( ! m_msg0.getList ( -1 , // hostid
-1 , // ip
-1 , // port
0 , // maxAge
false , // addToCache?
RDB_POSDB ,
m_collnum ,
&m_list , // RdbList ptr
(char *)&startKey ,
(char *)&endKey ,
1024 , // minRecSize
this ,
gotTermListWrapper ,
MAX_NICENESS ,
false , // err correction?
true , // inc tree?
true , // domergeobsolete
-1 , // firstHostId
0 , // start filenum
-1 , // numFiles
30 , // timeout
-1 , // syncpoint
-1 , // preferlocalreads
NULL , // msg5
NULL , // msg5b
false , // isRealMerge?
true , // allow pg cache
false , // forcelocalindexdb
false , // doIndexdbSplit?
shardNum ))// force paritysplit
return false;
// did not block
return gotTermFreq();
}
void gotTermFreqWrapper ( void *state ) {
Images *THIS = (Images *)state;
// process/store the reply
if ( ! THIS->gotTermFreq() ) return;
// all done
THIS->m_callback ( THIS->m_state );
}
// void gotTermFreqWrapper ( void *state ) {
// Images *THIS = (Images *)state;
// // process/store the reply
// if ( ! THIS->gotTermFreq() ) return;
// // all done
// THIS->m_callback ( THIS->m_state );
// }
// returns false if blocked, true otherwise
bool Images::gotTermFreq ( ) {
// error?
if ( g_errno ) return true;
// bail if less than 10
long long nt = m_msg36.getTermFreq();
// return true, without g_errno set, we are done
if ( nt < 10 ) return true;
//long long nt = m_msg36.getTermFreq();
// each key but the first is 12 bytes (compressed)
long long nt = (m_list.getListSize() - 6)/ 12;
// . return true, without g_errno set, we are done
// . if we do not have 10 or more webpages that share this same
// template then do not do image extraction at all, it is too risky
// that we get a bad image
// . MDW: for debugging, do not require 10 pages of same template
//if ( nt < 10 ) return true;
if ( nt < -2 ) return true;
// now see which of the image urls are unique
if ( ! launchRequests () ) return false;
// i guess we did not block
return true;
}
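The rewritten gotTermFreq() estimates how many pages share the gbsitetemplate: term straight from the Msg0 posdb list. Assuming the first key in the list is a full 18-byte posdb key and every later key uses the 12-byte compressed form, as the comment says, a list of n keys occupies 18 + 12*(n-1) = 12n + 6 bytes, which is where (listSize - 6) / 12 comes from. A tiny check of that arithmetic:
#include <cstdio>
// listSize = 18 + 12*(n-1) = 12n + 6  =>  n = (listSize - 6) / 12
long keysInPosdbList(long listSize) {
    if (listSize < 18) return 0;           // not even one full key
    return (listSize - 6) / 12;
}
int main() {
    printf("%ld\n", keysInPosdbList(18));   // 1 key (just the full key)
    printf("%ld\n", keysInPosdbList(54));   // 18 + 3*12 -> 4 keys
    printf("%ld\n", keysInPosdbList(126));  // 18 + 9*12 -> 10 keys
    return 0;
}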
// . returns false if blocked, true otherwise
// . see if other pages we've indexed have this same image url
bool Images::launchRequests ( ) {
// loop over all images
for ( long i = m_i ; i < m_numImages ; i++ ) {
@ -324,26 +393,30 @@ bool Images::launchRequests ( ) {
m_i++;
// assume no error
m_errors[i] = 0;
// make the keys
key_t startKey = g_indexdb.makeStartKey(m_termIds[i]);
key_t endKey = g_indexdb.makeEndKey (m_termIds[i]);
// make the keys. each term is a gbimage:<imageUrl> term
// so we are searching for the image url to see how often
// it is repeated on other pages.
key144_t startKey ;
key144_t endKey ;
g_posdb.makeStartKey(&startKey,m_termIds[i]);
g_posdb.makeEndKey (&endKey ,m_termIds[i]);
// get our residing groupid
//unsigned long gid = g_indexdb.getNoSplitGroupId(&startKey);
// no split is true for this one, so we do not split by docid
//uint32_t gid = getGroupId(RDB_INDEXDB,&startKey,false);
unsigned long shardNum;
shardNum = getShardNum(RDB_INDEXDB,&startKey);
shardNum = getShardNum(RDB_POSDB,&startKey);
// get the termlist
if ( ! m_msg0.getList ( -1 , // hostid
-1 , // ip
-1 , // port
0 , // maxAge
false , // addToCache?
RDB_INDEXDB ,
RDB_POSDB,
m_collnum ,
&m_list , // RdbList ptr
startKey ,
endKey ,
(char *)&startKey ,
(char *)&endKey ,
1024 , // minRecSize
this ,
gotTermListWrapper ,
@ -408,71 +481,235 @@ void Images::gotTermList ( ) {
bool Images::downloadImages () {
// all done if we got a valid thumbnail
if ( m_thumbnailValid ) return true;
// if not valid free old image
if ( m_imgBuf ) {
mfree ( m_imgBuf , m_imgBufMaxLen , "Image" );
m_imgBuf = NULL;
}
//if ( m_thumbnailValid ) return true;
CollectionRec *cr = g_collectiondb.getRec(m_collnum);
long srcLen;
char *src = NULL;
long node;
// downloading an image from diffbot json reply?
if ( m_xd->m_isDiffbotJSONObject ) {
// i guess this better not block cuz we'll core!
char **iup = m_xd->getDiffbotPrimaryImageUrl();
// if no image, nothing to download
if ( ! *iup ) {
//log("no diffbot image url for %s",
// m_xd->m_firstUrl.m_url);
return true;
}
// force image count to one
m_numImages = 1;
// do not error out
m_errors[0] = 0;
// set it to the full url
src = *iup;
srcLen = gbstrlen(src);
// need this
m_imageUrl.set ( src , srcLen );
// jump into the for loop below
//if ( m_phase == 0 ) goto insertionPoint;
}
// . download each leftover image
// . stop as soon as we get one with good dimensions
// . make a thumbnail of that one
for ( long i = m_j ; i < m_numImages ; i++ ) {
// advance now
m_j++;
// if we should stop, stop
if ( m_stopDownloading ) break;
// skip if bad or not unique
if ( m_errors[i] ) continue;
// set status msg
sprintf ( m_statusBuf ,"downloading image %li",i);
// point to it
*m_statusPtr = m_statusBuf;
// get the url of the image
long srcLen;
char *src = m_xml->getString(i,i+1,"src",&srcLen);
// set it to the full url
Url iu;
// use "pageUrl" as the baseUrl
iu.set ( m_pageUrl , src , srcLen );
// assume success
m_httpStatus = 200;
// set the request
Msg13Request *r = &m_msg13Request;
r->reset();
r->m_maxTextDocLen = 200000;
r->m_maxOtherDocLen = 500000;
if ( ! strcmp(cr->m_coll,"qatest123")) {
r->m_useTestCache = 1;
r->m_addToTestCache = 1;
for ( ; m_j < m_numImages ; m_j++ , m_phase = 0 ) {
// did collection get nuked?
CollectionRec *cr = g_collectiondb.getRec(m_collnum);
if ( ! cr ) { g_errno = ENOCOLLREC; return true; }
// clear error
g_errno = 0;
if ( m_phase == 0 ) {
// advance
m_phase++;
// only if not diffbot, we set "src" above for it
if ( ! m_xd->m_isDiffbotJSONObject ) {
// get img tag node
node = m_imageNodes[m_j];
// get the url of the image
src = m_xml->getString(node,"src",&srcLen);
// use "pageUrl" as the baseUrl
m_imageUrl.set ( m_pageUrl , src , srcLen );
}
// if we should stop, stop
if ( m_stopDownloading ) break;
// skip if bad or not unique
if ( m_errors[m_j] ) continue;
// set status msg
sprintf ( m_statusBuf ,"downloading image %li",m_j);
// point to it
if ( m_xd ) m_xd->setStatus ( m_statusBuf );
}
// url is the most important
strcpy(r->m_url,iu.getUrl());
// . try to download it
// . i guess we are ignoring hammers at this point
if ( ! m_msg13.getDoc(r,false,this,gotImageWrapper))
return false;
// handle it
gotImage ( );
// get image ip
if ( m_phase == 1 ) {
// advance
m_phase++;
// this increments phase if it should
if ( ! getImageIp() ) return false;
// error?
if ( g_errno ) continue;
}
// download the actual image
if ( m_phase == 2 ) {
// advance
m_phase++;
// download image data
if ( ! downloadImage() ) return false;
// error downloading?
if ( g_errno ) continue;
}
// get thumbnail using threaded call to netpbm stuff
if ( m_phase == 3 ) {
// advance
m_phase++;
// call pnmscale etc. to make thumbnail
if ( ! makeThumb() ) return false;
// error downloading?
if ( g_errno ) continue;
}
// error making thumb or just not a good thumb size?
if ( ! m_thumbnailValid ) {
// free old image we downloaded, if any
m_msg13.reset();
// i guess do this too, it was pointing at it in msg13
m_imgReply = NULL;
// try the next image candidate
continue;
}
// it's a keeper
long urlSize = m_imageUrl.getUrlLen() + 1; // include \0
// . make our ThumbnailArray out of it
long need = 0;
// the array itself
need += sizeof(ThumbnailArray);
// and each thumbnail it contains
need += urlSize;
need += m_thumbnailSize;
need += sizeof(ThumbnailInfo);
// reserve it
m_imageBuf.reserve ( need );
// point to array
ThumbnailArray *ta =(ThumbnailArray *)m_imageBuf.getBufStart();
// set that as much as possible, version...
ta->m_version = 0;
// and thumb count
ta->m_numThumbnails = 1;
// now store the thumbnail info
ThumbnailInfo *ti = ta->getThumbnailInfo (0);
// and set our one thumbnail
ti->m_origDX = m_dx;
ti->m_origDY = m_dy;
ti->m_dx = m_tdx;
ti->m_dy = m_tdy;
ti->m_urlSize = urlSize;
ti->m_dataSize = m_thumbnailSize;
// now copy the data over sequentially
char *p = ti->m_buf;
// the image url
memcpy(p,m_imageUrl.getUrl(),urlSize);
p += urlSize;
// the image thumbnail data
memcpy(p,m_imgData,m_thumbnailSize);
p += m_thumbnailSize;
// update buf length of course
m_imageBuf.setLength ( p - m_imageBuf.getBufStart() );
// validate the buffer
m_imageBufValid = true;
// save mem. do this after because m_imgData uses m_msg13's
// reply buf to store the thumbnail for now...
m_msg13.reset();
m_imgReply = NULL;
g_errno = 0;
return true;
}
// now get the thumbnail from it
return gotImage ( );
// don't tell the caller EBADIMG; it would make him fail to index the doc
g_errno = 0;
return true;
}
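downloadImages() is now a resumable control loop: m_j tracks the current image candidate and m_phase the step within it (set the URL, look up the IP, fetch the image, make the thumbnail). Whenever a step blocks, the function returns false and its completion callback simply re-enters downloadImages(), which skips the phases already done because m_phase was advanced before the call. A stripped-down sketch of that pattern (the Downloader type and its three steps are illustrative, and blocking is not simulated):
#include <cstdio>
// Generic resumable loop: m_j = which item, m_phase = which step within it.
// Each step returns false if it "blocked"; a callback re-enters run().
struct Downloader {
    int  m_j     = 0;
    int  m_phase = 0;
    int  m_numItems;
    bool (*m_step[3])(Downloader *);   // per-item steps, in order
    // Returns false if blocked (a callback will call run() again later).
    bool run() {
        for (; m_j < m_numItems; m_j++, m_phase = 0) {
            while (m_phase < 3) {
                int phase = m_phase++;                    // advance first
                if (!m_step[phase](this)) return false;   // blocked
            }
        }
        return true;                   // all items, all steps done
    }
};
static bool stepA(Downloader *) { printf("resolve\n"); return true; }
static bool stepB(Downloader *) { printf("fetch\n");   return true; }
static bool stepC(Downloader *) { printf("thumb\n");   return true; }
int main() {
    Downloader d;
    d.m_numItems = 2;
    d.m_step[0] = stepA; d.m_step[1] = stepB; d.m_step[2] = stepC;
    d.run();                           // prints three lines per item
    return 0;
}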
void gotImageWrapper ( void *state ) {
static void gotImgIpWrapper ( void *state , long ip ) {
Images *THIS = (Images *)state;
// process/store the reply
if ( ! THIS->gotImage ( ) ) return;
// download the images. will set m_stopDownloading when we get one
// control loop
if ( ! THIS->downloadImages() ) return;
// call callback at this point, we are done with the download loop
THIS->m_callback ( THIS->m_state );
}
bool Images::getImageIp ( ) {
if ( ! m_msgc.getIp ( m_imageUrl.getHost () ,
m_imageUrl.getHostLen() ,
&m_latestIp ,
this ,
gotImgIpWrapper ))
// we blocked
return false;
return true;
}
static void downloadImageWrapper ( void *state ) {
Images *THIS = (Images *)state;
// control loop
if ( ! THIS->downloadImages() ) return;
// all done
THIS->m_callback ( THIS->m_state );
}
bool Images::gotImage ( ) {
bool Images::downloadImage ( ) {
// error?
if ( m_latestIp == 0 || m_latestIp == -1 ) {
log(LOG_DEBUG,"images: ip of %s is %li (%s)",
m_imageUrl.getUrl(),m_latestIp,mstrerror(g_errno));
// ignore errors
g_errno = 0;
return true;
}
CollectionRec *cr = g_collectiondb.getRec(m_collnum);
if ( ! cr ) { g_errno = ENOCOLLREC; return true; }
// assume success
m_httpStatus = 200;
// set the request
Msg13Request *r = &m_msg13Request;
r->reset();
r->m_maxTextDocLen = 200000;
r->m_maxOtherDocLen = 500000;
r->m_urlIp = m_latestIp;
if ( ! strcmp(cr->m_coll,"qatest123")) {
r->m_useTestCache = 1;
r->m_addToTestCache = 1;
}
// url is the most important
strcpy(r->m_url,m_imageUrl.getUrl());
// . try to download it
// . i guess we are ignoring hammers at this point
if ( ! m_msg13.getDoc(r,false,this,downloadImageWrapper))
return false;
return true;
}
static void makeThumbWrapper ( void *state , ThreadEntry *t ) {
Images *THIS = (Images *)state;
// control loop
if ( ! THIS->downloadImages() ) return;
// all done
THIS->m_callback ( THIS->m_state );
}
bool Images::makeThumb ( ) {
// did it have an error?
if ( g_errno ) {
// just give up on all of them if one has an error
@ -489,12 +726,12 @@ bool Images::gotImage ( ) {
m_imgData = NULL;
m_imgDataSize = 0;
log( LOG_DEBUG, "image: Msg16::gotImage() entered." );
log( LOG_DEBUG, "image: gotImage() entered." );
// . if there was a problem, just ignore, don't let it stop getting
// the real page.
if ( g_errno ) {
log( "ERROR? g_errno puked: %s", mstrerror(g_errno) );
g_errno = 0;
//g_errno = 0;
return true;
}
//if ( ! slot ) return true;
@ -503,12 +740,24 @@ bool Images::gotImage ( ) {
bufLen = m_msg13.m_replyBufSize;
bufMaxLen = m_msg13.m_replyBufAllocSize;
// no image?
if ( ! buf || bufLen <= 0 ) return true;
if ( ! buf || bufLen <= 0 ) {
g_errno = EBADIMG;
return true;
}
// we are image candidate #i
long i = m_j - 1;
//long i = m_j - 1;
// get img tag node
// get the url of the image
long srcLen;
char *src = m_xml->getString(i,i+1,"src",&srcLen);
char *src = NULL;
if ( m_xd->m_isDiffbotJSONObject ) {
src = *m_xd->getDiffbotPrimaryImageUrl();
srcLen = gbstrlen(src);
}
else {
long node = m_imageNodes[m_j];
src = m_xml->getString(node,"src",&srcLen);
}
// set it to the full url
Url iu;
// use "pageUrl" as the baseUrl
@ -518,6 +767,7 @@ bool Images::gotImage ( ) {
log ( "image: MIME.set() failed in gotImage()" );
// give up on the remaining images then
m_stopDownloading = true;
g_errno = EBADIMG;
return true;
}
// set the status so caller can see
@ -528,6 +778,7 @@ bool Images::gotImage ( ) {
m_httpStatus);
// give up on the remaining images then
m_stopDownloading = true;
g_errno = EBADIMG;
return true;
}
// make sure this is an image
@ -536,6 +787,7 @@ bool Images::gotImage ( ) {
log( LOG_DEBUG, "image: gotImage() states that this image is "
"not in a format we currently handle." );
// try the next image if any
g_errno = EBADIMG;
return true;
}
// get the content
@ -543,41 +795,64 @@ bool Images::gotImage ( ) {
m_imgDataSize = bufLen - mime.getMimeLen();
// Reset socket, so socket doesn't free the data, now we own
// We must free the buf after thumbnail is inserted in TitleRec
m_imgBuf = buf;//slot->m_readBuf;
m_imgBufLen = bufLen;//slot->m_readBufSize;
m_imgBufMaxLen = bufMaxLen;//slot->m_readBufMaxSize;
m_imgReply = buf;//slot->m_readBuf;
m_imgReplyLen = bufLen;//slot->m_readBufSize;
m_imgReplyMaxLen = bufMaxLen;//slot->m_readBufMaxSize;
// do not let UdpServer free the reply, we own it now
//slot->m_readBuf = NULL;
if ( ! m_imgBuf || m_imgBufLen == 0 ) {
log( LOG_DEBUG, "image: Returned empty image data!" );
if ( ! m_imgReply || m_imgReplyLen == 0 ) {
log( LOG_DEBUG, "image: Returned empty image reply!" );
g_errno = EBADIMG;
return true;
}
// get next if too small
if ( m_imgDataSize < 20 ) return true;
if ( m_imgDataSize < 20 ) { g_errno = EBADIMG; return true; }
long imageType;
getImageInfo ( m_imgData, m_imgDataSize, &m_dx, &m_dy, &imageType );
// log the image dimensions
log( LOG_DEBUG, "image: Image Link: %s", iu.getUrl() );
log( LOG_DEBUG, "image: Max Buffer Size: %lu bytes.",m_imgBufMaxLen );
log( LOG_DEBUG, "image: Image Original Size: %lu bytes.",m_imgBufLen);
log( LOG_DEBUG, "image: Image Buffer @ 0x%lx - 0x%lx.",(long)m_imgBuf,
(long)m_imgBuf+m_imgBufMaxLen );
log( LOG_DEBUG,"image: Image Link: %s", iu.getUrl() );
log( LOG_DEBUG,"image: Max Buffer Size: %lu bytes.",m_imgReplyMaxLen);
log( LOG_DEBUG,"image: Image Original Size: %lu bytes.",m_imgReplyLen);
log( LOG_DEBUG,"image: Image Buffer @ 0x%lx - 0x%lx",(long)m_imgReply,
(long)m_imgReply+m_imgReplyMaxLen );
log( LOG_DEBUG, "image: Size: %lupx x %lupx", m_dx, m_dy );
// what is this?
if ( m_dx <= 0 || m_dy <= 0 ) {
log(LOG_DEBUG, "image: Image has bad dimensions.");
g_errno = EBADIMG;
return true;
}
// skip if bad dimensions
if( ((m_dx < 50) || (m_dy < 50)) && ((m_dx > 0) && (m_dy > 0)) ) {
log( "image: Image is too small to represent a news article." );
return true;
log(LOG_DEBUG,
"image: Image is too small to represent a news article." );
g_errno = EBADIMG;
return true;
}
// skip if bad aspect ratio. 5x1 or 1x5 is bad i guess
if ( m_dx > 0 && m_dy > 0 ) {
float aspect = (float)m_dx / (float)m_dy;
if ( aspect < .2 || aspect > 5.0 ) {
log(LOG_DEBUG,
"image: Image aspect ratio is worse that 5 to 1");
g_errno = EBADIMG;
return true;
}
}
// update status
*m_statusPtr = "making thumbnail";
if ( m_xd ) m_xd->setStatus ( "making thumbnail" );
// log it
log ( LOG_DEBUG, "image: Msg16::gotImage() thumbnailing image." );
log ( LOG_DEBUG, "image: gotImage() thumbnailing image." );
// create the thumbnail...
// reset this... why?
g_errno = 0;
@ -587,23 +862,14 @@ bool Images::gotImage ( ) {
if ( g_threads.call ( FILTER_THREAD ,
MAX_NICENESS ,
this ,
thumbDoneWrapper ,
makeThumbWrapper ,
thumbStartWrapper_r ) ) return false;
// threads might be off
logf ( LOG_DEBUG, "image: Calling thumbnail gen without thread.");
thumbStartWrapper_r ( NULL , NULL );
thumbStartWrapper_r ( this , NULL );
return true;
}
void thumbDoneWrapper ( void *state , ThreadEntry *t ) {
Images *THIS = (Images *)state;
// . download another image if we ! m_thumbnailValid
// . should also free m_imgBuf if ! m_thumbnailValid
if ( ! THIS->downloadImages() ) return;
// all done
THIS->m_callback ( THIS->m_state );
}
void *thumbStartWrapper_r ( void *state , ThreadEntry *t ) {
Images *THIS = (Images *)state;
THIS->thumbStart_r ( true /* am thread?*/ );
@ -614,30 +880,33 @@ void Images::thumbStart_r ( bool amThread ) {
long long start = gettimeofdayInMilliseconds();
static char scmd[200] = "%stopnm %s | "
"pnmscale -xysize 100 100 - | "
"ppmtojpeg - > %s";
//static char scmd[200] = "%stopnm %s | "
// "pnmscale -xysize 100 100 - | "
// "ppmtojpeg - > %s";
log( LOG_DEBUG, "image: thumbStart_r entered." );
//DIR *d;
char cmd[250];
sprintf( cmd, "%strash", g_hostdb.m_dir );
//char cmd[2500];
//sprintf( cmd, "%strash", g_hostdb.m_dir );
makeTrashDir();
// get thread id
long id = getpid();
// pass the input to the program through this file
// rather than a pipe, since popen() seems broken
char in[64];
sprintf ( in , "%strash/in.%li", g_hostdb.m_dir, id );
char in[364];
snprintf ( in , 363,"%strash/in.%li", g_hostdb.m_dir, id );
unlink ( in );
log( LOG_DEBUG, "image: thumbStart_r create in file." );
// collect the output from the filter from this file
char out[64];
sprintf ( out , "%strash/out.%li", g_hostdb.m_dir, id );
char out[364];
snprintf ( out , 363,"%strash/out.%li", g_hostdb.m_dir, id );
unlink ( out );
log( LOG_DEBUG, "image: thumbStart_r create out file." );
@ -695,11 +964,28 @@ void Images::thumbStart_r ( bool amThread ) {
break;
}
sprintf( cmd, scmd, ext, in, out);
long xysize = 250;//100;
// make thumbnail a little bigger for diffbot for widget
if ( m_xd->m_isDiffbotJSONObject ) xysize = 250;
// i hope 2500 is big enough!
char cmd[2501];
//sprintf( cmd, scmd, ext, in, out);
char *wdir = g_hostdb.m_dir;
snprintf( cmd, 2500 ,
"LD_LIBRARY_PATH=%s %s/%stopnm %s | "
"LD_LIBRARY_PATH=%s %s/pnmscale -xysize %li %li - | "
"LD_LIBRARY_PATH=%s %s/ppmtojpeg - > %s"
, wdir , wdir , ext , in
, wdir , wdir , xysize , xysize
, wdir , wdir , out
);
// Call clone function for the shell to execute command
// This call WILL BLOCK . timeout is 30 seconds.
int err = my_system_r( cmd, 30 ); // m_thmbconvTimeout );
//int err = my_system_r( cmd, 30 ); // m_thmbconvTimeout );
int err = system( cmd ); // m_thmbconvTimeout );
//if( (m_dx != 0) && (m_dy != 0) )
// unlink( in );
@ -736,13 +1022,13 @@ void Images::thumbStart_r ( bool amThread ) {
return;
}
if( m_thumbnailSize > m_imgBufMaxLen ) {
log( "image: Image thumbnail larger than buffer!" );
log( LOG_DEBUG, "\t\t\tFile Read Bytes: %ld", m_thumbnailSize);
log( LOG_DEBUG, "\t\t\tBuf Max Bytes : %ld", m_imgBufMaxLen );
log( LOG_DEBUG, "\t\t\t-----------------------" );
log( LOG_DEBUG, "\t\t\tDiff : %ld",
m_imgBufMaxLen-m_thumbnailSize );
if( m_thumbnailSize > m_imgReplyMaxLen ) {
log(LOG_DEBUG,"image: Image thumbnail larger than buffer!" );
log(LOG_DEBUG,"image: File Read Bytes: %ld", m_thumbnailSize);
log(LOG_DEBUG,"image: Buf Max Bytes : %ld",m_imgReplyMaxLen );
log(LOG_DEBUG,"image: -----------------------" );
log(LOG_DEBUG,"image: Diff : %ld",
m_imgReplyMaxLen-m_thumbnailSize );
return;
}
@ -777,10 +1063,16 @@ void Images::thumbStart_r ( bool amThread ) {
// tell the loop above not to download anymore, we got one
m_thumbnailValid = true;
getImageInfo ( m_imgBuf , m_thumbnailSize , &m_tdx , &m_tdy , NULL );
// MDW: this was m_imgReply
getImageInfo ( m_imgData , m_thumbnailSize , &m_tdx , &m_tdy , NULL );
log( LOG_DEBUG, "image: Thumbnailed size: %li bytes.", m_imgDataSize );
log( LOG_DEBUG, "image: Thumbnaile dx=%li dy=%li.", m_tdx,m_tdy );
// now make the meta data struct
// <imageUrl>\0<width><height><thumbnailData>
log( LOG_DEBUG, "image: Thumbnail size: %li bytes.", m_imgDataSize );
log( LOG_DEBUG, "image: Thumbnail dx=%li dy=%li.", m_tdx,m_tdy );
log( LOG_DEBUG, "image: Thumbnail generated in %lldms.", stop-start );
}
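thumbStart_r() now builds the conversion pipeline with explicit paths and LD_LIBRARY_PATH so the netpbm binaries shipped in the working directory are used, scales into a 250x250 bounding box, and runs the whole thing through system(). The snippet below just reproduces that snprintf to show roughly what the final command looks like; the working directory, pid, and the "jpeg" extension are assumed values, not captured output:
#include <cstdio>
int main() {
    // Illustrative expansion of the command built in thumbStart_r(),
    // assuming a jpeg input, this working dir, and process id 1234.
    const char *wdir = "/var/gigablast/data0/";
    const char *ext  = "jpeg";          // picked from the detected image type
    const char *in   = "/var/gigablast/data0/trash/in.1234";
    const char *out  = "/var/gigablast/data0/trash/out.1234";
    long xysize = 250;                  // thumbnail bounding box
    char cmd[2501];
    snprintf(cmd, 2500,
             "LD_LIBRARY_PATH=%s %s/%stopnm %s | "
             "LD_LIBRARY_PATH=%s %s/pnmscale -xysize %li %li - | "
             "LD_LIBRARY_PATH=%s %s/ppmtojpeg - > %s",
             wdir, wdir, ext, in,
             wdir, wdir, xysize, xysize,
             wdir, wdir, out);
    printf("%s\n", cmd);                // jpegtopnm | pnmscale | ppmtojpeg
    return 0;
}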
@ -841,6 +1133,9 @@ void getImageInfo ( char *buf , long bufSize ,
if( bufSize > 25 ) {
*dx=(unsigned long)(*(unsigned long *)&buf[16]);
*dy=(unsigned long)(*(unsigned long *)&buf[20]);
// these are in network order
*dx = ntohl(*dx);
*dy = ntohl(*dy);
}
}
else if( (strPtr = strncasestr( buf, 20, "MM" )) ) {
@ -886,3 +1181,46 @@ void getImageInfo ( char *buf , long bufSize ,
log( LOG_DEBUG, "image: Image Corrupted? No type found in "
"data." );
}
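The getImageInfo() fix above byte-swaps the PNG dimensions with ntohl(): after the 8-byte PNG signature comes the IHDR chunk (4-byte length plus the 4-byte "IHDR" tag), so the big-endian width and height live at file offsets 16 and 20 and must be converted from network order on little-endian hosts. A small worked example of reading them from a buffer:
#include <arpa/inet.h>   // ntohl
#include <cstdio>
#include <cstring>
// Read PNG width/height: signature is 8 bytes, then the IHDR chunk
// (4-byte length, 4-byte "IHDR" tag), so width is at offset 16 and
// height at offset 20, both big-endian.
bool pngDims(const unsigned char *buf, long size, long *dx, long *dy) {
    if (size < 24) return false;
    unsigned int w, h;
    memcpy(&w, buf + 16, 4);
    memcpy(&h, buf + 20, 4);
    *dx = ntohl(w);
    *dy = ntohl(h);
    return true;
}
int main() {
    // Minimal fake header: 8-byte signature + IHDR length/tag + 640x480.
    unsigned char buf[24] = { 0x89,'P','N','G',0x0D,0x0A,0x1A,0x0A,
                              0,0,0,13, 'I','H','D','R',
                              0,0,0x02,0x80,    // width  = 640
                              0,0,0x01,0xE0 };  // height = 480
    long dx, dy;
    if (pngDims(buf, sizeof(buf), &dx, &dy))
        printf("%ldx%ld\n", dx, dy);            // 640x480
    return 0;
}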
// container is maxWidth X maxHeight, so try to fit the thumbnail in there
bool ThumbnailInfo::printThumbnailInHtml ( SafeBuf *sb ,
long maxWidth ,
long maxHeight,
bool printLink ,
long *retNewdx ,
char *style ) {
if ( ! style ) style = "";
// account for scrollbar on the right
//maxSide -= (long)SCROLLBAR_WIDTH;
// avoid distortion.
// if image is wide, use that to scale
if ( m_dx <= 0 ) return true;
if ( m_dy <= 0 ) return true;
float xscale =
(float)maxWidth/
(float)m_dx;
float yscale =
(float)maxHeight/
(float)m_dy;
float min = xscale;
if ( yscale < min ) min = yscale;
long newdx = (long)((float)m_dx * min);
long newdy = (long)((float)m_dy * min);
if ( printLink ) sb->safePrintf("<a href=%s>", getUrl() );
sb->safePrintf("<img width=%li height=%li align=left "
"%s"
"src=\"data:image/"
"jpg;base64,"
, newdx
, newdy
, style
);
// encode image in base 64
sb->base64Encode ( getData(), m_dataSize , 0 ); // 0 niceness
sb->safePrintf("\">");
if ( printLink ) sb->safePrintf ("</a>");
// widget needs to know the width of the thumb for formatting
// the text either on top of the thumb or to the right of it
if ( retNewdx ) *retNewdx = newdx;
return true;
}
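printThumbnailInHtml() avoids distortion by scaling with the smaller of the two per-axis factors before emitting the JPEG inline as a base64 data: URI. For example, fitting a 250x187 thumbnail into a 100x100 box gives xscale = 0.40 and yscale of roughly 0.53, so 0.40 is used and the image is drawn at 100x74. A standalone version of just the fit arithmetic:
#include <cstdio>
// Fit (dx, dy) into (maxW, maxH) preserving aspect ratio, as in
// printThumbnailInHtml(): use the smaller of the per-axis scale factors.
void fitBox(long dx, long dy, long maxW, long maxH, long *outW, long *outH) {
    float xscale = (float)maxW / (float)dx;
    float yscale = (float)maxH / (float)dy;
    float s = xscale < yscale ? xscale : yscale;
    *outW = (long)((float)dx * s);
    *outH = (long)((float)dy * s);
}
int main() {
    long w, h;
    fitBox(250, 187, 100, 100, &w, &h);
    printf("%ldx%ld\n", w, h);   // 100x74
    return 0;
}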

@ -7,9 +7,59 @@
#include "Msg36.h"
#include "Msg13.h"
#include "IndexList.h"
#include "MsgC.h"
#include "SafeBuf.h"
#define MAX_IMAGES 500
// a single serialized thumbnail:
class ThumbnailInfo {
public:
long m_origDX;
long m_origDY;
long m_dx;
long m_dy;
long m_urlSize;
long m_dataSize;
char m_buf[];
char *getUrl() { return m_buf; };
char *getData() { return m_buf + m_urlSize; };
long getDataSize() { return m_dataSize; };
long getSize () { return sizeof(ThumbnailInfo)+m_urlSize+m_dataSize;};
// make sure neither the x or y side is > maxSize
bool printThumbnailInHtml ( SafeBuf *sb ,
long maxWidth,
long maxHeight,
bool printLink ,
long *newdx ,
char *style = NULL ) ;
};
// XmlDoc::ptr_imgData is a ThumbnailArray
class ThumbnailArray {
public:
// 1st byte is format version
char m_version;
// # of thumbs
long m_numThumbnails;
// list of ThumbnailInfos
char m_buf[];
long getNumThumbnails() { return m_numThumbnails;};
ThumbnailInfo *getThumbnailInfo ( long x ) {
if ( x >= m_numThumbnails ) return NULL;
char *p = m_buf;
for ( long i = 0 ; i < m_numThumbnails ; i++ ) {
if ( i == x ) return (ThumbnailInfo *)p;
ThumbnailInfo *ti = (ThumbnailInfo *)p;
p += ti->getSize();
}
return NULL;
};
};
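XmlDoc::ptr_imgData now points at a ThumbnailArray: a one-byte version, a thumbnail count, then ThumbnailInfo records packed back to back, each holding its dimensions followed by a NUL-terminated URL and the raw JPEG bytes. Because every record is variable length, getThumbnailInfo() walks from the start adding getSize() each time. A simplified, self-contained sketch of walking such a packed buffer (Rec is a stand-in with just a size and payload):
#include <cstdio>
#include <cstring>
// Simplified variable-length record: header gives the payload size,
// payload follows immediately, next record starts right after it.
struct Rec {
    long m_dataSize;
    long getSize() { return (long)sizeof(Rec) + m_dataSize; }
    char m_buf[];                        // flexible array member, as in ThumbnailInfo
};
// Walk a packed buffer of num records and return record x -- the same
// pointer arithmetic ThumbnailArray::getThumbnailInfo() uses.
Rec *getRec(char *buf, int num, int x) {
    char *p = buf;
    for (int i = 0; i < num; i++) {
        if (i == x) return (Rec *)p;
        p += ((Rec *)p)->getSize();
    }
    return NULL;
}
int main() {
    alignas(long) char buf[64];
    char *p = buf;
    ((Rec *)p)->m_dataSize = 8;                  // first payload: 8 bytes
    memcpy(((Rec *)p)->m_buf, "abcdefg", 8);     // includes the NUL
    p += ((Rec *)p)->getSize();                  // jump to the next record
    ((Rec *)p)->m_dataSize = 6;
    memcpy(((Rec *)p)->m_buf, "hello", 6);
    Rec *r = getRec(buf, 2, 1);
    printf("%s\n", r ? r->m_buf : "none");       // hello
    return 0;
}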
class Images {
public:
@ -31,7 +81,8 @@ class Images {
void setCandidates ( class Url *pageUrl ,
class Words *words ,
class Xml *xml ,
class Sections *sections );
class Sections *sections ,
class XmlDoc *xd );
// . returns false if blocked, true otherwise
// . sets errno on error
@ -42,25 +93,38 @@ class Images {
long long docId ,
class XmlDoc *xd ,
collnum_t collnum,
char **statusPtr ,
//char **statusPtr ,
long hopCount,
void *state ,
void (*callback)(void *state) );
char *getImageData () { return m_imgData; };
long getImageDataSize() { return m_imgDataSize; };
//char *getImageData () { return m_imgData; };
//long getImageDataSize() { return m_imgDataSize; };
//long getImageType () { return m_imageType; };
SafeBuf m_imageBuf;
bool m_imageBufValid;
long m_phase;
bool gotTermFreq();
bool launchRequests();
void gotTermList();
bool downloadImages();
bool gotImage ( );
bool getImageIp();
bool downloadImage();
bool makeThumb();
//bool gotImage ( );
void thumbStart_r ( bool amThread );
long m_i;
long m_j;
class XmlDoc *m_xd;
// callback information
void *m_state ;
void (* m_callback)(void *state );
@ -69,17 +133,21 @@ class Images {
long m_errno;
long m_hadError;
bool m_stopDownloading;
char **m_statusPtr;
//char **m_statusPtr;
char m_statusBuf[128];
collnum_t m_collnum;
long long m_docId;
IndexList m_list;
long m_latestIp;
MsgC m_msgc;
Url m_imageUrl;
long m_numImages;
long m_imageNodes[MAX_IMAGES];
// termids for doing gbimage:<url> lookups for uniqueness
long m_termIds [MAX_IMAGES];
long long m_termIds [MAX_IMAGES];
// for the msg0 lookup, did we have an error?
long m_errors [MAX_IMAGES];
@ -106,9 +174,9 @@ class Images {
long m_imgType;
// udp slot buffer
char *m_imgBuf;
long m_imgBufLen; // how many bytes the image is
long m_imgBufMaxLen; // allocated for the image
char *m_imgReply;
long m_imgReplyLen; // how many bytes the image is
long m_imgReplyMaxLen; // allocated for the image
long m_dx; // width of image in pixels
long m_dy; // height of image in pixels
bool m_thumbnailValid; // is it a valid thumbnail image
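
The new members above (m_phase, getImageIp(), downloadImage(), makeThumb(), thumbStart_r()) imply the old gotImage() path has been replaced by a resumable, phase-driven download loop. The following is a hypothetical driver, not the real Images::downloadImages(); it only illustrates the "return false if blocked, resume at m_phase later" pattern the declarations suggest:

// hypothetical sketch, NOT the actual implementation
static bool driveImagePhases ( Images *im ) {
	for ( ; im->m_j < im->m_numImages ; im->m_j++ ) {
		// phase 0: resolve the image host's IP (may block)
		if ( im->m_phase == 0 ) {
			if ( ! im->getImageIp() ) return false;
			im->m_phase++;
		}
		// phase 1: download the raw image bytes (may block)
		if ( im->m_phase == 1 ) {
			if ( ! im->downloadImage() ) return false;
			im->m_phase++;
		}
		// phase 2: scale it into a thumbnail (thread; may block)
		if ( im->m_phase == 2 ) {
			if ( ! im->makeThumb() ) return false;
			im->m_phase = 0;
		}
	}
	return true;
}
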

@ -33,7 +33,7 @@
//#define INDEXDB_SPLIT 8
//#define DOCID_OFFSET_MASK (INDEXDB_SPLIT-1)
#define DOCID_OFFSET_MASK (g_conf.m_indexdbSplit-1)
#define MAX_INDEXDB_SPLIT 128
#define MAX_SHARDS 128
class Indexdb {

@ -170,6 +170,7 @@ bool Log::shouldLog ( long type , char *msg ) {
if (msg[0]=='d'&&msg[1]=='n' ) return g_conf.m_logDebugDns ;
if (msg[0]=='d'&&msg[1]=='o' ) return g_conf.m_logDebugDownloads;
if (msg[0]=='h'&&msg[1]=='t' ) return g_conf.m_logDebugHttp ;
if (msg[0]=='i'&&msg[1]=='m' ) return g_conf.m_logDebugImage ;
if (msg[0]=='l'&&msg[1]=='o' ) return g_conf.m_logDebugLoop ;
if (msg[0]=='l'&&msg[1]=='a' ) return g_conf.m_logDebugLang ;
if (msg[0]=='m'&&msg[2]=='m' ) return g_conf.m_logDebugMem ;
@ -302,8 +303,8 @@ bool Log::logR ( long long now , long type , char *msg , bool asterisk ,
// back up over spaces
while ( p[-1] == ' ' ) p--;
// end in period or ? or !
if ( p[-1] != '?' && p[-1] != '.' && p[-1] != '!' )
*p++ = '.';
//if ( p[-1] != '?' && p[-1] != '.' && p[-1] != '!' )
// *p++ = '.';
*p ='\0';
// the total length, not including the \0
long tlen = p - tt;

@ -466,6 +466,22 @@ Msg6a.o:
geo_ip_table.o: geo_ip_table.cpp geo_ip_table.h
$(CC) $(DEFS) -m32 -Wall -pipe -c $*.cpp
install:
# gigablast will copy over the necessary files. it has a list of the
# necessary files and that list changes over time so it is better to let gb
# deal with it.
mkdir -p /var/gigablast/data0/
./gb copyfiles /var/gigablast/data0/
# if user types 'gb' it will use the binary in /var/gigablast/data0/
rm -f /usr/bin/gb
ln -s /var/gigablast/data0/gb /usr/bin/gb
# restart gb automatically when the machine reboots
# (the new way that does not use run-levels anymore)
rm -f /etc/init.d/gb
ln -s /lib/init/upstart-job /etc/init.d/gb
# initctl upstart-job conf file (gb stop|start|reload)
cp init.gb.conf /etc/init/gb.conf
.cpp.o:
$(CC) $(DEFS) $(CPPFLAGS) -c $*.cpp

@ -462,6 +462,10 @@ bool Mem::init ( long long maxMem ) {
log(LOG_INIT,"mem: using electric fence!!!!!!!");
#endif
/*
take this out for now it seems to hang the OS when running
as root
#ifndef TITAN
// if we can't alloc 3gb exit and retry
long long start = gettimeofdayInMilliseconds();
@ -486,6 +490,7 @@ bool Mem::init ( long long maxMem ) {
// return if could not alloc the full 3GB
if ( i < 30 ) return false;
#endif
*/
// reset this, our max mem used over time ever because we don't
// want the mem test we did above to count towards it

@ -31,7 +31,7 @@ void Msg0::constructor ( ) {
m_msg5b = NULL;
//#ifdef SPLIT_INDEXDB
//for ( long i = 0; i < INDEXDB_SPLIT; i++ )
//for ( long i = 0; i < MAX_INDEXDB_SPLIT; i++ )
//for ( long i = 0; i < MAX_SHARDS; i++ )
// m_mcast[i].constructor();
m_mcast.constructor();
m_mcasts = NULL;
@ -726,8 +726,8 @@ void Msg0::gotSplitReply ( ) {
char *xx=NULL;*xx=0;
// get all the split lists
long totalSize = 0;
RdbList lists[MAX_INDEXDB_SPLIT];
RdbList *listPtrs[MAX_INDEXDB_SPLIT];
RdbList lists[MAX_SHARDS];
RdbList *listPtrs[MAX_SHARDS];
for ( long i = 0; i < m_numSplit; i++ ) {
listPtrs[i] = &lists[i];
long replySize;

2
Msg0.h

@ -216,7 +216,7 @@ class Msg0 {
// used for multicasting the request
//#ifdef SPLIT_INDEXDB
//Multicast m_mcast[INDEXDB_SPLIT];
//Multicast m_mcast[MAX_INDEXDB_SPLIT];
//Multicast m_mcast[MAX_SHARDS];
// casting to multiple splits is obsolete, but for PageIndexdb.cpp
// we still need to do it, but we alloc for it
Multicast m_mcast;

@ -192,7 +192,7 @@ class Msg20Request {
char *ptr_affWeights ;
char *ptr_linkee ; // used by Msg25 for getting link text
//char *ptr_coll ;
char *ptr_imgUrl ;
//char *ptr_imgUrl ;
char *ptr_displayMetas ;
// . from here down: automatically set in Msg20Request::serialize()
@ -209,7 +209,7 @@ class Msg20Request {
long size_affWeights ;
long size_linkee ; // size includes terminating \0
//long size_coll ; // size includes terminating \0
long size_imgUrl ;
//long size_imgUrl ;
long size_displayMetas ; // size includes terminating \0
char m_buf[0] ;
@ -428,6 +428,7 @@ public:
char *ptr_tvbuf ; // title vector
char *ptr_gbvecbuf ; // gigabit vector
char *ptr_imgUrl ; // youtube/metacafe vid thumb
char *ptr_imgData ; // for encoded images
//char *ptr_eventEnglishTime ; // "every saturday [[]] jan"
//char *ptr_eventDateIntervals ;
char *ptr_likedbList ;
@ -523,6 +524,7 @@ public:
long size_tvbuf ;
long size_gbvecbuf ;
long size_imgUrl ; // youtube/metacafe vid thumb
long size_imgData;
//long size_eventEnglishTime ;
//long size_eventDateIntervals ;
long size_likedbList ;

@ -23,6 +23,34 @@ Msg22::~Msg22(){
static void gotReplyWrapper22 ( void *state1 , void *state2 ) ;
// . sets m_availDocId or sets g_errno to ENOTFOUND on error
// . calls callback(state) when done
// . returns false if blocked true otherwise
bool Msg22::getAvailDocIdOnly ( Msg22Request *r ,
long long preferredDocId ,
char *coll ,
void *state ,
void (* callback)(void *state) ,
long niceness ) {
return getTitleRec ( r ,
NULL , // url
preferredDocId ,
coll ,
NULL , // **titleRecPtrPtr
NULL , // *titleRecSizePtr
false , // justCheckTfndb
true , // getAvailDocIdOnly
state ,
callback ,
niceness ,
false , // addToCache
0 , // maxCacheAge
9999999 , // timeout
false ); // doLoadBalancing
}
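
A hedged sketch of how a caller might use the new getAvailDocIdOnly() entry point; the state object and callback below are hypothetical, but the argument order matches the wrapper above, and the comment above says the result lands in m_availDocId (assuming that member is readable on the Msg22):

// somewhere in an indexing path that just needs an unused docid
// near a preferred one (e.g. for a spider reply "document")
static void gotAvailDocIdWrapper ( void *state ) {
	Msg22 *m22 = (Msg22 *)state;
	// m_availDocId should now hold a docid that did not collide
	// with an existing titlerec at lookup time
	log ( "build: got available docid %lli", m22->m_availDocId );
}

bool requestAvailDocId ( Msg22 *m22, Msg22Request *req,
			 long long preferredDocId, char *coll ) {
	// returns false if it blocked; the callback fires later.
	// keep "req" and "coll" alive until the callback is called.
	return m22->getAvailDocIdOnly ( req,
					preferredDocId,
					coll,
					m22,                 // state
					gotAvailDocIdWrapper,
					1 );                 // niceness
}
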
// . if url is NULL use the docId to get the titleRec
// . if titleRec is NULL use our own internal m_myTitleRec
// . sets g_errno to ENOTFOUND if TitleRec does not exist for this url/docId
@ -37,6 +65,10 @@ bool Msg22::getTitleRec ( Msg22Request *r ,
char **titleRecPtrPtr ,
long *titleRecSizePtr,
bool justCheckTfndb ,
// when indexing spider replies we just want
// a unique docid... "docId" should be the desired
// one, but we might have to change it.
bool getAvailDocIdOnly ,
void *state ,
void (* callback) (void *state) ,
long niceness ,
@ -45,6 +77,9 @@ bool Msg22::getTitleRec ( Msg22Request *r ,
long timeout ,
bool doLoadBalancing ) {
// sanity
if ( getAvailDocIdOnly && justCheckTfndb ) { char *xx=NULL;*xx=0; }
if ( getAvailDocIdOnly && url ) { char *xx=NULL;*xx=0; }
//if ( m_url ) log(LOG_DEBUG,"build: getting TitleRec for %s",m_url);
// sanity checks
@ -56,7 +91,7 @@ bool Msg22::getTitleRec ( Msg22Request *r ,
if ( r->m_inUse ) { char *xx=NULL;*xx=0; }
if ( m_outstanding ) { char *xx = NULL;*xx=0; }
// sanity check
if ( ! justCheckTfndb ) {
if ( ! justCheckTfndb && ! getAvailDocIdOnly ) {
if ( ! titleRecPtrPtr ) { char *xx=NULL;*xx=0; }
if ( ! titleRecSizePtr ) { char *xx=NULL;*xx=0; }
}
@ -79,6 +114,7 @@ bool Msg22::getTitleRec ( Msg22Request *r ,
r->m_docId = docId;
r->m_niceness = niceness;
r->m_justCheckTfndb = (bool)justCheckTfndb;
r->m_getAvailDocIdOnly = (bool)getAvailDocIdOnly;
r->m_doLoadBalancing = (bool)doLoadBalancing;
r->m_collnum = g_collectiondb.getCollnum ( coll );
r->m_addToCache = false;
@ -391,6 +427,21 @@ void handleRequest22 ( UdpSlot *slot , long netnice ) {
st->m_docId1 = r->m_docId;
st->m_docId2 = r->m_docId;
}
// but if we are requesting an available docid, it might be taken
// so try the range
if ( r->m_getAvailDocIdOnly ) {
long long pd = r->m_docId;
long long d1 = g_titledb.getFirstProbableDocId ( pd );
long long d2 = g_titledb.getLastProbableDocId ( pd );
// sanity - bad url with bad subdomain?
if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; }
// make sure we get a decent sample in titledb then in
// case the docid we wanted is not available
st->m_docId1 = d1;
st->m_docId2 = d2;
}
// . otherwise, url was given, like from Msg15
// . we may get multiple tfndb recs
if ( r->m_url[0] ) {
@ -827,11 +878,18 @@ void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) {
//if ( pd != st->m_pd ) { char *xx=NULL;*xx=0; }
}
// the probable docid is the PREFERRED docid in this case
if ( r->m_getAvailDocIdOnly ) pd = st->m_r->m_docId;
// . these are both meant to be available docids
// . if ad2 gets exhausted we use ad1
long long ad1 = st->m_docId1;
long long ad2 = pd;
bool docIdWasFound = false;
// scan the titleRecs in the list
for ( ; ! tlist->isExhausted() ; tlist->skipCurrentRecord ( ) ) {
// breathe
@ -844,11 +902,16 @@ void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) {
// skip negative recs, first one should not be negative however
if ( ( k->n0 & 0x01 ) == 0x00 ) continue;
// get docid of that guy
// get docid of that titlerec
long long dd = g_titledb.getDocId(k);
if ( r->m_getAvailDocIdOnly ) {
// make sure our available docids are actually available!
if ( dd == ad1 ) ad1++;
if ( dd == ad2 ) ad2++;
}
// if we had a url make sure uh48 matches
if ( r->m_url[0] ) {
else if ( r->m_url[0] ) {
// get it
long long uh48 = g_titledb.getUrlHash48(k);
// sanity check
@ -865,6 +928,9 @@ void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) {
if ( r->m_docId != dd ) continue;
}
// flag that we matched m_docId
docIdWasFound = true;
// ok, if just "checking tfndb" no need to go further
if ( r->m_justCheckTfndb ) {
// send back a good reply (empty means found!)
@ -907,12 +973,16 @@ void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) {
long long ad = ad2;
// but wrap around if we need to
if ( ad == 0LL ) ad = ad1;
// if "docId" was unmatched that should be the preferred available
// docid then...
if ( ! docIdWasFound && r->m_getAvailDocIdOnly && ad != r->m_docId ) {
char *xx=NULL;*xx=0; }
// remember it
st->m_availDocId = ad;
// . ok, return an available docid
if ( r->m_url[0] || r->m_justCheckTfndb ) {
if ( r->m_url[0] || r->m_justCheckTfndb || r->m_getAvailDocIdOnly ) {
// store docid in reply
char *p = st->m_slot->m_tmpBuf;
// send back the available docid

@ -16,6 +16,7 @@ public:
long m_maxCacheAge;
collnum_t m_collnum;
char m_justCheckTfndb :1;
char m_getAvailDocIdOnly:1;
char m_doLoadBalancing :1;
char m_addToCache :1;
char m_inUse :1;
@ -35,6 +36,13 @@ class Msg22 {
static bool registerHandler ( ) ;
bool getAvailDocIdOnly ( class Msg22Request *r ,
long long preferredDocId ,
char *coll ,
void *state ,
void (* callback)(void *state) ,
long niceness ) ;
// . make sure you keep url/coll on your stack cuz we just point to it
// . see the other getTitleRec() description below for more details
// . use a maxCacheAge of 0 to avoid the cache
@ -45,6 +53,7 @@ class Msg22 {
char **titleRecPtrPtr ,
long *titleRecSizePtr ,
bool justCheckTfndb ,
bool getAvailDocIdOnly ,
void *state ,
void (* callback) (void *state ),
long niceness ,

@ -930,7 +930,7 @@ bool Msg3::doneScanning ( ) {
ff->getFilename() ,
m_niceness ) ) {
log("net: Had error while constraining list read from "
"%s: %s%s. vfd=%li parts=%li. "
"%s: %s/%s. vfd=%li parts=%li. "
"This is likely caused by corrupted "
"data on disk.",
mstrerror(g_errno), ff->m_dir ,

@ -65,12 +65,12 @@ class Msg36 {
//#else
// char m_reply[8];
//#endif
char m_reply[8*MAX_INDEXDB_SPLIT];
char m_reply[8*MAX_SHARDS];
// for sending the request
//#ifdef SPLIT_INDEXDB
//Multicast m_mcast[INDEXDB_SPLIT];
Multicast m_mcast[1];//MAX_INDEXDB_SPLIT];
Multicast m_mcast[1];//MAX_SHARDS];
long m_numRequests;
long m_numReplies;
long m_errno;

@ -219,7 +219,8 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
log("query: Query parsing inconsistency for q=%s. "
"langid=%li. Check langids and m_queryExpansion parms "
"which are the only parms that could be different in "
"Query::set2()."
"Query::set2(). You probably have different mysynoyms.txt "
"files on two different hosts! check that!!"
,m_tmpq.m_orig
,(long)m_r->m_language
);

10
Msg39.h

@ -55,7 +55,7 @@ class Msg39Request {
m_doMaxScoreAlgo = true;
m_seoDebug = false;
m_useSeoResultsCache = false;
ptr_readSizes = NULL;
ptr_query = NULL; // in utf8?
ptr_whiteList = NULL;
@ -72,6 +72,10 @@ class Msg39Request {
m_minDocId = -1;
m_maxDocId = -1;
// for widget, to only get results to append to last docid
m_maxSerpScore = 0.0;
m_minSerpDocId = 0LL;
m_makeReply = true;
// . search results knobs
@ -122,6 +126,10 @@ class Msg39Request {
long long m_maxDocId;
bool m_makeReply;
// for widget, to only get results to append to last docid
double m_maxSerpScore;
long long m_minSerpDocId;
// msg3a stuff
long m_timeout; // in seconds
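
m_maxSerpScore and m_minSerpDocId let the widget's "append" mode ask only for results that rank below the last one already shown, so paging stays stable while new documents are still being indexed. A hedged sketch of filling them in (the request object and the lastSerpScore/lastDocId variables are hypothetical placeholders for the score and docid of the last displayed result):

Msg39Request mr;
// ... normal query setup elided ...
// only return results ranking below the last result the widget
// already displayed, so appended pages do not repeat or shift
mr.m_maxSerpScore = lastSerpScore;
mr.m_minSerpDocId = lastDocId;
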

208
Msg3a.cpp

@ -26,17 +26,17 @@ void Msg3a::constructor ( ) {
m_rbuf2.constructor();
// NULLify all the reply buffer ptrs
for ( long j = 0; j < MAX_INDEXDB_SPLIT; j++ )
for ( long j = 0; j < MAX_SHARDS; j++ )
m_reply[j] = NULL;
m_rbufPtr = NULL;
for ( long j = 0; j < MAX_INDEXDB_SPLIT; j++ )
for ( long j = 0; j < MAX_SHARDS; j++ )
m_mcast[j].constructor();
m_seoCacheList.constructor();
}
Msg3a::~Msg3a ( ) {
reset();
for ( long j = 0; j < MAX_INDEXDB_SPLIT; j++ )
for ( long j = 0; j < MAX_SHARDS; j++ )
m_mcast[j].destructor();
m_seoCacheList.freeList();
}
@ -48,12 +48,12 @@ void Msg3a::reset ( ) {
m_siteHashes26 = NULL;
// . NULLify all the reply buffer ptrs
// . have to count DOWN with "i" because of the m_reply[i-1][j] check
for ( long j = 0; j < MAX_INDEXDB_SPLIT; j++ ) {
for ( long j = 0; j < MAX_SHARDS; j++ ) {
if ( ! m_reply[j] ) continue;
mfree(m_reply[j],m_replyMaxSize[j], "Msg3aR");
m_reply[j] = NULL;
}
for ( long j = 0; j < MAX_INDEXDB_SPLIT; j++ )
for ( long j = 0; j < MAX_SHARDS; j++ )
m_mcast[j].reset();
// and the buffer that holds the final docids, etc.
if ( m_finalBuf )
@ -89,7 +89,7 @@ static void gotCacheReplyWrapper ( void *state ) {
// . sets g_errno on error
// . "query/coll" should NOT be on the stack in case we block
// . uses Msg36 to retrieve term frequencies for each termId in query
// . sends Msg39 request to get docids from each indexdb split
// . sends Msg39 request to get docids from each indexdb shard
// . merges replies together
// . we print out debug info if debug is true
// . "maxAge"/"addToCache" is talking about the clusterdb cache as well
@ -337,7 +337,7 @@ bool Msg3a::gotCacheReply ( ) {
}
}
// time how long to get each split's docids
// time how long to get each shard's docids
if ( m_debug )
m_startTime = gettimeofdayInMilliseconds();
@ -483,7 +483,7 @@ bool Msg3a::gotCacheReply ( ) {
Multicast *m = &m_mcast[i];
// clear it for transmit
m->reset();
// . send out a msg39 request to each split
// . send out a msg39 request to each shard
// . multicasts to a host in group "groupId"
// . we always block waiting for the reply with a multicast
// . returns false and sets g_errno on error
@ -532,10 +532,10 @@ bool Msg3a::gotCacheReply ( ) {
if ( m_numReplies < m_numHosts ) return false;//indexdbSplit )
// . otherwise, we did not block... error?
// . it must have been an error or just no new lists available!!
// . if we call gotAllSplitReplies() here, and we were called by
// . if we call gotAllShardReplies() here, and we were called by
// mergeLists() we end up calling mergeLists() again... bad. so
// just return true in that case.
//return gotAllSplitReplies();
//return gotAllShardReplies();
return true;
}
@ -553,7 +553,7 @@ void gotReplyWrapper3a ( void *state , void *state2 ) {
" err=%s", (long)THIS, THIS->m_numReplies ,
mstrerror(g_errno) );
// if one split times out, ignore it!
// if one shard times out, ignore it!
if ( g_errno == EQUERYTRUNCATED ||
g_errno == EUDPTIMEDOUT )
g_errno = 0;
@ -576,7 +576,7 @@ void gotReplyWrapper3a ( void *state , void *state2 ) {
// . sanity check
// . ntpd can screw with our local time and make this negative
if ( delta >= 0 ) {
// count the split
// count the shards
h->m_splitsDone++;
// accumulate the times so we can do an average display
// in PageHosts.cpp.
@ -587,8 +587,8 @@ void gotReplyWrapper3a ( void *state , void *state2 ) {
THIS->m_numReplies++;
// bail if still awaiting more replies
if ( THIS->m_numReplies < THIS->m_numHosts ) return;
// return if gotAllSplitReplies() blocked
if ( ! THIS->gotAllSplitReplies( ) ) return;
// return if gotAllShardReplies() blocked
if ( ! THIS->gotAllShardReplies( ) ) return;
// set g_errno i guess so parent knows
if ( THIS->m_errno ) g_errno = THIS->m_errno;
// call callback if we did not block, since we're here. all done.
@ -603,9 +603,9 @@ static void gotSerpdbReplyWrapper ( void *state ) {
THIS->m_callback ( THIS->m_state );
}
bool Msg3a::gotAllSplitReplies ( ) {
bool Msg3a::gotAllShardReplies ( ) {
// if any of the split requests had an error, give up and set m_errno
// if any of the shard requests had an error, give up and set m_errno
// but don't set if for non critical errors like query truncation
if ( m_errno ) {
g_errno = m_errno;
@ -705,23 +705,23 @@ bool Msg3a::gotAllSplitReplies ( ) {
if ( mr->m_nqt != m_q->getNumTerms() ) {
g_errno = EBADREPLY;
m_errno = EBADREPLY;
log("query: msg3a: Split reply qterms=%li != %li.",
log("query: msg3a: Shard reply qterms=%li != %li.",
(long)mr->m_nqt,(long)m_q->getNumTerms() );
return true;
}
// return if split had an error, but not for a non-critical
// return if shard had an error, but not for a non-critical
// error like query truncation
if ( mr->m_errno && mr->m_errno != EQUERYTRUNCATED ) {
g_errno = mr->m_errno;
m_errno = mr->m_errno;
log("query: msg3a: Split had error: %s",
log("query: msg3a: Shard had error: %s",
mstrerror(g_errno));
return true;
}
// skip down here if reply was already set
//skip:
// add of the total hits from each split, this is how many
// total results the lastest split is estimated to be able to
// add of the total hits from each shard, this is how many
// total results the lastest shard is estimated to be able to
// return
// . THIS should now be exact since we read all termlists
// of posdb...
@ -732,19 +732,19 @@ bool Msg3a::gotAllSplitReplies ( ) {
// cast these for printing out
long long *docIds = (long long *)mr->ptr_docIds;
double *scores = (double *)mr->ptr_scores;
// print out every docid in this split reply
// print out every docid in this shard reply
for ( long j = 0; j < mr->m_numDocIds ; j++ ) {
// print out score_t
logf( LOG_DEBUG,
"query: msg3a: [%lu] %03li) "
"split=%li docId=%012llu domHash=0x%02lx "
"shard=%li docId=%012llu domHash=0x%02lx "
"score=%f" ,
(unsigned long)this ,
j ,
i ,
docIds [j] ,
(long)g_titledb.getDomHash8FromDocId(docIds[j]),
(float)scores[j] );
scores[j] );
}
}
@ -849,13 +849,13 @@ bool Msg3a::mergeLists ( ) {
// shortcut
//long numSplits = m_numHosts;//indexdbSplit;
// . point to the various docids, etc. in each split reply
// . point to the various docids, etc. in each shard reply
// . tcPtr = term count. how many required query terms does the doc
// have? formerly called topExplicits in IndexTable2.cpp
long long *diPtr [MAX_INDEXDB_SPLIT];
double *rsPtr [MAX_INDEXDB_SPLIT];
key_t *ksPtr [MAX_INDEXDB_SPLIT];
long long *diEnd [MAX_INDEXDB_SPLIT];
long long *diPtr [MAX_SHARDS];
double *rsPtr [MAX_SHARDS];
key_t *ksPtr [MAX_SHARDS];
long long *diEnd [MAX_SHARDS];
for ( long j = 0; j < m_numHosts ; j++ ) {
Msg39Reply *mr =m_reply[j];
// if we have gbdocid:| in query this could be NULL
@ -953,7 +953,7 @@ bool Msg3a::mergeLists ( ) {
return true;
//
// ***MERGE ALL SPLITS INTO m_docIds[], etc.***
// ***MERGE ALL SHARDS INTO m_docIds[], etc.***
//
// . merge all lists in m_replyDocIds[splitNum]
// . we may be re-called later after m_docsToGet is increased
@ -966,7 +966,7 @@ bool Msg3a::mergeLists ( ) {
//Msg39Reply *mr;
long hslot;
// get the next highest-scoring docids from all split lists
// get the next highest-scoring docids from all shard termlists
for ( long j = 0; j < m_numHosts; j++ ) {
// . skip exhausted lists
// . these both should be NULL if reply was skipped because
@ -1026,82 +1026,84 @@ bool Msg3a::mergeLists ( ) {
// . only add it to the final list if the docid is "unique"
// . BUT since different event ids share the same docid, exception!
if ( hslot < 0 ) {
// always inc this
//m_totalDocCount++;
// only do this if we need more
if ( m_numDocIds < m_docsToGet ) {
// get DocIdScore class for this docid
Msg39Reply *mr = m_reply[maxj];
// point to the array of DocIdScores
DocIdScore *ds = (DocIdScore *)mr->ptr_scoreInfo;
long nds = mr->size_scoreInfo/sizeof(DocIdScore);
DocIdScore *dp = NULL;
for ( long i = 0 ; i < nds ; i++ ) {
if ( ds[i].m_docId != *diPtr[maxj] ) continue;
dp = &ds[i];
break;
}
// add the max to the final merged lists
m_docIds [m_numDocIds] = *diPtr[maxj];
if ( hslot >= 0 ) goto skip; // < 0 ) {
// wtf?
if ( ! dp ) {
// this is empty if no scoring info
// supplied!
if ( m_r->m_getDocIdScoringInfo )
log("msg3a: CRAP! got empty score "
"info for "
"d=%lli",
m_docIds[m_numDocIds]);
//char *xx=NULL; *xx=0; 261561804684
// qry = www.yahoo
}
// point to the single DocIdScore for this docid
m_scoreInfos[m_numDocIds] = dp;
// always inc this
//m_totalDocCount++;
// only do this if we need more
if ( m_numDocIds < m_docsToGet ) {
// get DocIdScore class for this docid
Msg39Reply *mr = m_reply[maxj];
// point to the array of DocIdScores
DocIdScore *ds = (DocIdScore *)mr->ptr_scoreInfo;
long nds = mr->size_scoreInfo/sizeof(DocIdScore);
DocIdScore *dp = NULL;
for ( long i = 0 ; i < nds ; i++ ) {
if ( ds[i].m_docId != *diPtr[maxj] ) continue;
dp = &ds[i];
break;
}
// add the max to the final merged lists
m_docIds [m_numDocIds] = *diPtr[maxj];
// reset this just in case
if ( dp ) {
dp->m_singleScores = NULL;
dp->m_pairScores = NULL;
}
// wtf?
if ( ! dp ) {
// this is empty if no scoring info
// supplied!
if ( m_r->m_getDocIdScoringInfo )
log("msg3a: CRAP! got empty score "
"info for "
"d=%lli",
m_docIds[m_numDocIds]);
//char *xx=NULL; *xx=0; 261561804684
// qry = www.yahoo
}
// point to the single DocIdScore for this docid
m_scoreInfos[m_numDocIds] = dp;
// now fix DocIdScore::m_pairScores and m_singleScores
// ptrs so they reference into the
// Msg39Reply::ptr_pairScoreBuf and ptr_singleSingleBuf
// like they should. it seems we do not free the
// Msg39Replies so we should be ok referencing them.
if ( dp && dp->m_singlesOffset >= 0 )
dp->m_singleScores =
(SingleScore *)(mr->ptr_singleScoreBuf+
dp->m_singlesOffset) ;
if ( dp && dp->m_pairsOffset >= 0 )
dp->m_pairScores =
(PairScore *)(mr->ptr_pairScoreBuf +
dp->m_pairsOffset );
// reset this just in case
if ( dp ) {
dp->m_singleScores = NULL;
dp->m_pairScores = NULL;
}
// now fix DocIdScore::m_pairScores and m_singleScores
// ptrs so they reference into the
// Msg39Reply::ptr_pairScoreBuf and ptr_singleSingleBuf
// like they should. it seems we do not free the
// Msg39Replies so we should be ok referencing them.
if ( dp && dp->m_singlesOffset >= 0 )
dp->m_singleScores =
(SingleScore *)(mr->ptr_singleScoreBuf+
dp->m_singlesOffset) ;
if ( dp && dp->m_pairsOffset >= 0 )
dp->m_pairScores =
(PairScore *)(mr->ptr_pairScoreBuf +
dp->m_pairsOffset );
// turn it into a float, that is what rscore_t is.
// we do this to make it easier for PostQueryRerank.cpp
m_scores [m_numDocIds]=(double)*rsPtr[maxj];
if ( m_r->m_doSiteClustering )
m_clusterRecs[m_numDocIds]= *ksPtr[maxj];
// clear this out
//m_eventIdBits[m_numDocIds].clear();
// set this for use below
hslot = m_numDocIds;
// point to next available slot to add to
m_numDocIds++;
}
// if it has ALL the required query terms, count it
//if ( *bsPtr[maxj] & 0x60 ) m_numAbove++;
// . add it, this should be pre-allocated!
// . returns false and sets g_errno on error
if ( ! htable.addKey(*diPtr[maxj],1) ) return true;
// turn it into a float, that is what rscore_t is.
// we do this to make it easier for PostQueryRerank.cpp
m_scores [m_numDocIds]=(double)*rsPtr[maxj];
if ( m_r->m_doSiteClustering )
m_clusterRecs[m_numDocIds]= *ksPtr[maxj];
// clear this out
//m_eventIdBits[m_numDocIds].clear();
// set this for use below
hslot = m_numDocIds;
// point to next available slot to add to
m_numDocIds++;
}
// if it has ALL the required query terms, count it
//if ( *bsPtr[maxj] & 0x60 ) m_numAbove++;
// . add it, this should be pre-allocated!
// . returns false and sets g_errno on error
if ( ! htable.addKey(*diPtr[maxj],1) ) return true;
skip:
// increment the split pointers from which we took the max
// increment the shard pointers from which we took the max
rsPtr[maxj]++;
diPtr[maxj]++;
ksPtr[maxj]++;
@ -1113,7 +1115,7 @@ bool Msg3a::mergeLists ( ) {
if ( m_debug ) {
// show how long it took
logf( LOG_DEBUG,"query: msg3a: [%lu] merged %li docs from %li "
"splits in %llu ms. "
"shards in %llu ms. "
,
(unsigned long)this,
m_numDocIds, (long)m_numHosts,
@ -1128,17 +1130,17 @@ bool Msg3a::mergeLists ( ) {
// print out score_t
logf(LOG_DEBUG,"query: msg3a: [%lu] "
"%03li) merged docId=%012llu "
"score=%.01f hosthash=0x%lx",
"score=%f hosthash=0x%lx",
(unsigned long)this,
i,
m_docIds [i] ,
(float)m_scores [i] ,
(double)m_scores [i] ,
sh );
}
}
// if we had a full split, we should have gotten the cluster recs
// from each split already
// from each shard already
memset ( m_clusterLevels , CR_OK , m_numDocIds );
return true;
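
The merge above repeatedly takes the highest-scoring head element across all shard reply arrays and dedups by docid. A stripped-down, self-contained sketch of that pattern (simplified hypothetical types; no site clustering or scoring info):

#include <vector>
#include <set>
#include <cstdint>

struct ShardReply {
	std::vector<int64_t> docIds;   // sorted best-first
	std::vector<double>  scores;   // parallel to docIds
	size_t               next = 0; // head of the unconsumed portion
};

// merge the shard replies into one ranked, deduped docid list
std::vector<int64_t> mergeShards ( std::vector<ShardReply> &shards,
				   size_t docsToGet ) {
	std::vector<int64_t> merged;
	std::set<int64_t>    seen;
	while ( merged.size() < docsToGet ) {
		long best = -1;
		// find the shard whose head element scores highest
		for ( size_t j = 0 ; j < shards.size() ; j++ ) {
			ShardReply &s = shards[j];
			if ( s.next >= s.docIds.size() ) continue;
			if ( best < 0 ||
			     s.scores[s.next] >
			     shards[best].scores[shards[best].next] )
				best = (long)j;
		}
		if ( best < 0 ) break; // all shard lists exhausted
		ShardReply &s = shards[best];
		int64_t d = s.docIds[s.next++];
		// only keep the first occurrence of a docid
		if ( seen.insert ( d ).second ) merged.push_back ( d );
	}
	return merged;
}
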

10
Msg3a.h

@ -80,7 +80,7 @@ public:
return m_numTotalEstimatedHits; };
// called when we got a reply of docIds
bool gotAllSplitReplies ( );
bool gotAllShardReplies ( );
bool gotCacheReply ( );
@ -135,13 +135,13 @@ public:
float m_termFreqWeights[MAX_QUERY_TERMS];
// a multicast class to send the request, one for each split
Multicast m_mcast[MAX_INDEXDB_SPLIT];
Multicast m_mcast[MAX_SHARDS];
// for timing how long things take
long long m_startTime;
// this buffer should be big enough to hold all requests
//char m_request [MAX_MSG39_REQUEST_SIZE * MAX_INDEXDB_SPLIT];
//char m_request [MAX_MSG39_REQUEST_SIZE * MAX_SHARDS];
long m_numReplies;
// . # estimated total hits
@ -157,8 +157,8 @@ public:
SafeBuf m_rbuf2;
// each split gives us a reply
class Msg39Reply *m_reply [MAX_INDEXDB_SPLIT];
long m_replyMaxSize[MAX_INDEXDB_SPLIT];
class Msg39Reply *m_reply [MAX_SHARDS];
long m_replyMaxSize[MAX_SHARDS];
char m_debug;

155
Msg40.cpp

@ -100,6 +100,7 @@ Msg40::Msg40() {
m_sendsIn = 0;
m_printi = 0;
m_numDisplayed = 0;
m_numPrintedSoFar = 0;
m_lastChunk = false;
//m_numGigabitInfos = 0;
}
@ -555,6 +556,9 @@ bool Msg40::getDocIds ( bool recall ) {
mr.m_maxQueryTerms = m_si->m_maxQueryTerms;
mr.m_realMaxTop = m_si->m_realMaxTop;
mr.m_minSerpDocId = m_si->m_minSerpDocId;
mr.m_maxSerpScore = m_si->m_maxSerpScore;
// . get the docIds
// . this sets m_msg3a.m_clusterLevels[] for us
//if(! m_msg3a.getDocIds ( &m_r, m_si->m_q, this , gotDocIdsWrapper))
@ -721,7 +725,6 @@ bool Msg40::gotDocIds ( ) {
if ( m_needFirstReplies > 100 ) m_needFirstReplies = 100;
}
// we have received m_numGood contiguous Msg20 replies!
//m_numContiguous = 0;
//m_visibleContiguous = 0;
@ -1591,6 +1594,7 @@ bool Msg40::gotSummary ( ) {
for ( ; m_si && m_si->m_streamResults&&m_printi<m_msg3a.m_numDocIds ;
m_printi++){
// if we are waiting on our previous send to complete... wait..
if ( m_sendsOut > m_sendsIn ) break;
@ -1658,18 +1662,38 @@ bool Msg40::gotSummary ( ) {
// XmlDoc::m_contentHash32.. it will be zero if invalid i guess
if ( m_si && m_si->m_doDupContentRemoval && // &dr=1
mr->m_contentHash32 &&
// do not dedup CT_STATUS results, those are
// spider reply "documents" that indicate the last
// time a doc was spidered and the error code or success
// code
mr->m_contentType != CT_STATUS &&
m_dedupTable.isInTable ( &mr->m_contentHash32 ) ) {
//if ( g_conf.m_logDebugQuery )
log("msg40: dup sum #%li (%lu)",m_printi,
mr->m_contentHash32);
log("msg40: dup sum #%li (%lu)(d=%lli)",m_printi,
mr->m_contentHash32,mr->m_docId);
// make it available to be reused
m20->reset();
continue;
}
// static long s_bs = 0;
// if ( (s_bs++ % 5) != 0 ) {
// log("msg40: FAKE dup sum #%li (%lu)(d=%lli)",m_printi,
// mr->m_contentHash32,mr->m_docId);
// // make it available to be reused
// m20->reset();
// continue;
// }
// return true with g_errno set on error
if ( m_si && m_si->m_doDupContentRemoval && // &dr=1
mr->m_contentHash32 &&
// do not dedup CT_STATUS results, those are
// spider reply "documents" that indicate the last
// time a doc was spidered and the error code or success
// code
mr->m_contentType != CT_STATUS &&
! m_dedupTable.addKey ( &mr->m_contentHash32 ) ) {
m_hadPrintError = true;
log("msg40: error adding to dedup table: %s",
@ -1678,19 +1702,25 @@ bool Msg40::gotSummary ( ) {
// assume we show this to the user
m_numDisplayed++;
//log("msg40: numdisplayed=%li",m_numDisplayed);
// do not print it if before the &s=X start position though
if ( m_si && m_numDisplayed <= m_si->m_firstResultNum ){
log("msg40: hiding #%li (%lu)",
m_printi,mr->m_contentHash32);
log("msg40: hiding #%li (%lu)(d=%lli)",
m_printi,mr->m_contentHash32,mr->m_docId);
m20->reset();
continue;
}
log("msg40: printing #%li (%lu)",m_printi,mr->m_contentHash32);
log("msg40: printing #%li (%lu)(d=%lli)",
m_printi,mr->m_contentHash32,mr->m_docId);
// . ok, we got it, so print it and stream it
// . this might set m_hadPrintError to true
printSearchResult9 ( m_printi );
printSearchResult9 ( m_printi , m_numPrintedSoFar );
m_numPrintedSoFar++;
//log("msg40: printedsofar=%li",m_numPrintedSoFar);
// now free the reply to save memory since we could be
// streaming back 1M+. we call reset below, no need for this.
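
The same content-hash dedup now appears in both the streaming and non-streaming paths with the identical CT_STATUS exemption, so it can be read as one predicate. A hedged restatement (the function name is hypothetical; the member and HashTableX calls are the ones used above):

// true if this reply should be dropped as a duplicate of an earlier
// result. spider-status "documents" (CT_STATUS) are never deduped
// because many of them legitimately share a content hash.
bool isDupResult ( Msg20Reply *mr, HashTableX *dedupTable ) {
	if ( ! mr->m_contentHash32 )          return false;
	if ( mr->m_contentType == CT_STATUS ) return false;
	if ( dedupTable->isInTable ( &mr->m_contentHash32 ) )
		return true;
	// first time we have seen this hash: remember it
	dedupTable->addKey ( &mr->m_contentHash32 );
	return false;
}
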
@ -1705,6 +1735,62 @@ bool Msg40::gotSummary ( ) {
if ( m_si->m_streamResults )
st->m_socket->m_streamingMode = true;
// if streaming results, and too many results were clustered or
// deduped then try to get more by merging the docid lists that
// we already have from the shards. if this still does not provide
// enough docids then we will need to issue a new msg39 request to
// each shard to get even more docids from each shard.
if ( m_si && m_si->m_streamResults &&
// this is coring as well on multi collection federated searches
// so disable that for now too. it is because Msg3a::m_r is
// NULL.
m_numCollsToSearch == 1 &&
// must have no streamed chunk sends out
m_sendsOut == m_sendsIn &&
// if we did not ask for enough docids and they were mostly
// dups so they got deduped, then ask for more.
// m_numDisplayed includes results before the &s=X parm.
// and so does m_docsToGetVisible, so we can compare them.
m_numDisplayed < m_docsToGetVisible &&
// wait for us to have exhausted the docids we have merged
m_printi >= m_msg3a.m_numDocIds &&
// wait for us to have available msg20s to get summaries
m_numReplies == m_numRequests &&
// this is true if we can get more docids from merging
// more of the termlists from the shards together.
// otherwise, we will have to ask each shard for a
// higher number of docids.
m_msg3a.m_moreDocIdsAvail &&
// do not do this if client closed connection
! m_socketHadError ) { //&&
// doesn't work on multi-coll just yet, it cores.
// MAKE it.
//m_numCollsToSearch == 1 ) {
// can it cover us?
long need = m_msg3a.m_docsToGet + 20;
// note it
log("msg40: too many summaries deduped. "
"getting more "
"docids from msg3a merge and getting summaries. "
"%li are visible, need %li. "
"changing docsToGet from %li to %li. "
"numReplies=%li numRequests=%li",
m_numDisplayed,
m_docsToGetVisible,
m_msg3a.m_docsToGet,
need,
m_numReplies,
m_numRequests);
// merge more docids from the shards' termlists
m_msg3a.m_docsToGet = need;
// sanity. the original msg39request must be there
if ( ! m_msg3a.m_r ) { char *xx=NULL;*xx=0; }
// this should increase m_msg3a.m_numDocIds
m_msg3a.mergeLists();
}
// . wrap it up with Next 10 etc.
// . this is in PageResults.cpp
if ( m_si && m_si->m_streamResults && ! m_printedTail &&
@ -1987,6 +2073,11 @@ bool Msg40::gotSummary ( ) {
//long m = oldNumContiguous;
// get it
Msg20Reply *mri = m_msg20[i]->m_r;
// do not dedup CT_STATUS results, those are
// spider reply "documents" that indicate the last
// time a doc was spidered and the error code or
// success code
if ( mri->m_contentType == CT_STATUS ) continue;
// never let it be i
//if ( m <= i ) m = i + 1;
// see if any result lower-scoring than #i is a dup of #i
@ -1997,6 +2088,11 @@ bool Msg40::gotSummary ( ) {
if ( *level != CR_OK ) continue;
// get it
Msg20Reply *mrm = m_msg20[m]->m_r;
// do not dedup CT_STATUS results, those are
// spider reply "documents" that indicate the last
// time a doc was spidered and the error code or
// success code
if ( mrm->m_contentType == CT_STATUS ) continue;
// use gigabit vector to do topic clustering, etc.
long *vi = (long *)mri->ptr_vbuf;
long *vm = (long *)mrm->ptr_vbuf;
@ -5175,7 +5271,7 @@ bool Msg40::addFacts ( HashTableX *queryTable,
// . printSearchResult into "sb"
bool Msg40::printSearchResult9 ( long ix ) {
bool Msg40::printSearchResult9 ( long ix , long numPrintedSoFar ) {
// . we stream results right onto the socket
// . useful for thousands of results... and saving mem
@ -5192,27 +5288,23 @@ bool Msg40::printSearchResult9 ( long ix ) {
// then print each result
// don't display more than docsWanted results
if ( m_numPrinted >= msg40->getDocsWanted() ) return true;
// prints in xml or html
if ( m_numPrinted < msg40->getDocsWanted() ) {
if ( m_si->m_format == FORMAT_CSV ) {
printJsonItemInCSV ( st , ix );
//log("print: printing #%li csv",(long)ix);
}
// print that out into st->m_sb safebuf
else if ( ! printResult ( st , ix ) ) {
// oom?
if ( ! g_errno ) g_errno = EBADENGINEER;
log("query: had error: %s",mstrerror(g_errno));
m_hadPrintError = true;
}
// count it
m_numPrinted++;
if ( m_si->m_format == FORMAT_CSV ) {
printJsonItemInCSV ( st , ix );
//log("print: printing #%li csv",(long)ix);
}
// print that out into st->m_sb safebuf
else if ( ! printResult ( st , ix , numPrintedSoFar ) ) {
// oom?
if ( ! g_errno ) g_errno = EBADENGINEER;
log("query: had error: %s",mstrerror(g_errno));
m_hadPrintError = true;
}
// count it
m_numPrinted++;
return true;
}
@ -5241,6 +5333,8 @@ bool printHttpMime ( State0 *st ) {
ct = "application/json";
if ( si->m_format == FORMAT_XML )
ct = "text/xml";
if ( si->m_format == FORMAT_HTML )
ct = "text/html";
//if ( si->m_format == FORMAT_TEXT )
// ct = "text/plain";
if ( si->m_format == FORMAT_CSV )
@ -5360,6 +5454,10 @@ bool Msg40::printCSVHeaderRow ( SafeBuf *sb ) {
if ( ! ji->getCompoundName ( tmpBuf ) )
return false;
// skip the "html" column, strip that out now
if ( strcmp(tmpBuf.getBufStart(),"html") == 0 )
continue;
// is it new?
long long h64 = hash64n ( tmpBuf.getBufStart() );
if ( nameTable.isInTable ( &h64 ) ) continue;
@ -5492,6 +5590,9 @@ bool Msg40::printJsonItemInCSV ( State0 *st , long ix ) {
// is it new?
long long h64 = hash64n ( tmpBuf.getBufStart() );
// ignore the "html" column
if ( strcmp(tmpBuf.getBufStart(),"html") == 0 ) continue;
long slot = columnTable->getSlot ( &h64 ) ;
// MUST be in there
// get col #

@ -171,7 +171,7 @@ class Msg40 {
long long getDocId ( long i ){return m_msg3a.m_docIds[i]; };
long long *getDocIds( ){return m_msg3a.m_docIds; };
float getScore ( long i ){return m_msg3a.m_scores[i]; };
double getScore ( long i ){return m_msg3a.m_scores[i]; };
class DocIdScore *getScoreInfo(long i){
if ( ! m_msg3a.m_scoreInfos ) return NULL;
return m_msg3a.m_scoreInfos[i];
@ -208,7 +208,7 @@ class Msg40 {
long m_lastHeartbeat;
bool printSearchResult9 ( long ix ) ;
bool printSearchResult9 ( long ix , long numPrintedSoFar ) ;
HashTableX m_columnTable;
bool printCSVHeaderRow ( class SafeBuf *sb );
bool printJsonItemInCSV ( class State0 *st , long ix );
@ -265,6 +265,7 @@ class Msg40 {
long m_sendsIn ;
long m_printi ;
long m_numDisplayed ;
long m_numPrintedSoFar;
long m_socketHadError;

@ -802,7 +802,9 @@ bool Msg5::needsRecall ( ) {
RdbBase *base = getRdbBase ( m_rdbId , m_collnum );
// if collection was deleted from under us, base will be NULL
if ( ! base && ! g_errno ) {
log("msg5: base lost for collnum %li",(long)m_collnum);
log("msg5: base lost for rdbid=%li collnum %li",
(long)m_rdbId,(long)m_collnum);
g_errno = ENOCOLLREC;
return false;
}
// sanity check
@ -1535,7 +1537,9 @@ void Msg5::repairLists_r ( ) {
if ( i < nn && base ) {
long fn = m_msg3.m_fileNums[i];
BigFile *bf = base->getFile ( fn );
log("db: Corrupt filename is %s.",bf->getFilename());
log("db: Corrupt filename is %s in collnum %li."
,bf->getFilename()
,(long)m_collnum);
//key_t sk = m_listPtrs[i]->getStartKey();
//key_t ek = m_listPtrs[i]->getEndKey ();
//log("db: "
@ -1551,10 +1555,10 @@ void Msg5::repairLists_r ( ) {
}
// . remove the bad eggs from the list
// . TODO: support non-fixed data sizes
if ( m_listPtrs[i]->getFixedDataSize() >= 0 )
m_listPtrs[i]->removeBadData_r();
else
m_listPtrs[i]->reset();
//if ( m_listPtrs[i]->getFixedDataSize() >= 0 )
m_listPtrs[i]->removeBadData_r();
//else
//m_listPtrs[i]->reset();
// otherwise we have a patchable error
m_hadCorruption = true;
// don't add a list with errors to cache, please

@ -4,6 +4,10 @@
#include "Pages.h"
#include "Parms.h"
#include "Spider.h"
#include "PageResults.h" // for RESULT_HEIGHT
// widget reload interval in milliseconds
#define DEFAULT_WIDGET_RELOAD 1000
//bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ) ;
@ -68,12 +72,18 @@ public:
// . Collectiondb.cpp calls this when any parm flagged with
// PF_REBUILDURLFILTERS is updated
// . it only adds sites via msg4 that are in "siteListArg" but NOT in the
// current CollectionRec::m_siteListBuf
// . updates SpiderColl::m_siteListDomTable to see what doms we can spider
// . updates SpiderColl::m_negSubstringBuf and m_posSubStringBuf to
// see what substrings in urls are disallowed/allowable for spidering
// . this returns false if it blocks
// . returns true and sets g_errno on error
// . uses msg4 to add seeds to spiderdb if necessary
// . uses msg4 to add seeds to spiderdb if necessary if "siteListArg"
// has new urls that are not currently in cr->m_siteListBuf
// . only adds seeds for the shard we are on iff we are responsible for
// the fake firstip!!!
bool updateSiteListTables ( collnum_t collnum ,
// the fake firstip!!! that way only one shard does the add.
bool updateSiteListBuf ( collnum_t collnum ,
bool addSeeds ,
char *siteListArg ) {
@ -402,7 +412,7 @@ char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) {
// need to build dom table for pattern matching?
if ( dt->getNumSlotsUsed() == 0 && cr ) {
// do not add seeds, just make siteListDomTable, etc.
updateSiteListTables ( sc->m_collnum ,
updateSiteListBuf ( sc->m_collnum ,
false , // add seeds?
cr->m_siteListBuf.getBufStart() );
}
@ -771,6 +781,461 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
// this prints the <form tag as well
g_pages.printAdminTop ( &sb , socket , hr );
// table to split between widget and stats in left and right panes
if ( fmt == FORMAT_HTML ) {
sb.safePrintf("<TABLE id=pane>"
"<TR><TD valign=top>");
}
long savedLen1, savedLen2;
//
// widget
//
// put the widget in here, just sort results by spidered date
//
// the scripts do "infinite" scrolling both up and down.
// but if you are at the top then new results will load above
// you and we try to maintain your current visual state even though
// the scrollbar position will change.
//
if ( fmt == FORMAT_HTML ) {
// save position so we can output the widget code
// so user can embed it into their own web page
savedLen1 = sb.length();
sb.safePrintf("<script type=\"text/javascript\">\n\n");
// if user has the scrollbar at the top
// in the widget we do a search every 15 secs
// to try to load more recent results. we should
// return up to 10 results above your last
// top docid and 10 results below it. that way
// no matter which of the 10 results you were
// viewing, your view should remain unchanged.
sb.safePrintf(
// global var
"var forcing;"
"function widget123_handler_reload() {"
// return if reply is not fully ready
"if(this.readyState != 4 )return;"
// if error or empty reply then do nothing
"if(!this.responseText)return;"
// get the widget container
"var w=document.getElementById(\"widget123\");"
// GET DOCID of first div/searchresult
"var sd=document.getElementById("
"\"widget123_scrolldiv\");"
"var cd;"
"if ( sd ) cd=sd.firstChild;"
"var fd=0;"
"if(cd) fd=cd.getAttribute('docid');"
// if the searchbox has the focus then do not
// update the content just yet...
"var qb=document.getElementById(\"qbox\");"
"if(qb&&qb==document.activeElement)"
"return;"
// or if not forced and they scrolled down
// don't jerk them back up again
"if(!forcing&&sd&&sd.scrollTop!=0)return;"
// just set the widget content to the reply
"w.innerHTML=this.responseText;"
//
// find that SAME docid in response and see
// how many new results were added above it
//
"var added=0;"
// did we find the docid?
"var found=0;"
// get div again since we updated innerHTML
"sd=document.getElementById("
"\"widget123_scrolldiv\");"
// scan the kids
"var kid=sd.firstChild;"
// begin the while loop to scan the kids
"while (kid) {"
// if div had no docid it might have been a line
// break div, so ignore
"if (!kid.hasAttribute('docid') ) {"
"kid=kid.nextSibling;"
"continue;"
"}"
// set kd to docid of kid
"var kd=kid.getAttribute('docid');"
// stop if we hit our original top docid
"if(kd==fd) {found=1;break;}"
// otherwise count it as a NEW result we got
"added++;"
// advance kid
"kid=kid.nextSibling;"
// end while loop
"}"
//"alert(\"added=\"+added);"
// how many results did we ADD above the
// reported "topdocid" of the widget?
// it should be in the ajax reply from the
// search engine. how many result were above
// the given "topdocid".
//"var ta=document.getElementById(\"topadd\");"
//"var added=0;"
//"if(ta)added=ta.value;"
// if nothing added do nothing
"if (added==0)return;"
// if original top docid not found, i guess we
// added too many new guys to the top of the
// search results, so don't bother scrolling
// just reset to top
"if (!found) return;"
// show that
//"alert(this.responseText);"
// get the div that has the scrollbar
"var sd=document.getElementById("
"\"widget123_scrolldiv\");"
// save current scroll pos
"var oldpos=parseInt(sd.scrollTop);"
// note it
//"alert (sd.scrollTop);"
// preserve the relative scroll position so we
// do not jerk around since we might have added
// "added" new results to the top.
"sd.scrollTop += added*%li;"
// try to scroll out new results if we are
// still at the top of the scrollbar and
// there are new results to scroll.
"if(oldpos==0)widget123_scroll();}\n\n"
// for preserving scrollbar position
,(long)RESULT_HEIGHT +2*PADDING
);
// scroll the widget up until we hit the 0 position
sb.safePrintf(
"function widget123_scroll() {"
// only scroll if at the top of the widget
// and not scrolled down so we do not
// interrupt
"var sd=document.getElementById("
"\"widget123_scrolldiv\");"
// TODO: need parseInt here?
"var pos=parseInt(sd.scrollTop);"
// note it
//"alert (sd.scrollTop);"
// if already at the top of widget, return
"if(pos==0)return;"
// decrement by 3 pixels
"pos=pos-3;"
// do not go negative
"if(pos<0)pos=0;"
// assign to scroll up. TODO: need +\"px\"; ?
"sd.scrollTop=pos;"
// all done, then return
"if(pos==0) return;"
// otherwise, scroll more in 3ms
// TODO: make this 1000ms on result boundaries
// so it delays on each new result. perhaps make
// it less than 1000ms if we have a lot of
// results above us!
"setTimeout('widget123_scroll()',3);}\n\n"
);
// this function appends the search results to what is
// already in the widget.
sb.safePrintf(
"function widget123_handler_append() {"
// return if reply is not fully ready
"if(this.readyState != 4 )return;"
// i guess we are done... release the lock
"outstanding=0;"
// if error or empty reply then do nothing
"if(!this.responseText)return;"
// if too small
"if(this.responseText.length<=3)return;"
// get the widget container
"var w=document.getElementById("
"\"widget123_scrolldiv\");"
// just set the widget content to the reply
"w.innerHTML+=this.responseText;"
"}\n\n"
);
//sb.safePrintf ( "</script>\n\n" );
long widgetWidth = 300;
long widgetHeight = 500;
// make the ajax url that gets the search results
SafeBuf ub;
ub.safePrintf("/search"
//"format=ajax"
"?c=%s"
//"&prepend=gbsortbyint%%3Agbspiderdate"
"&q=-gbstatus:0+gbsortbyint%%3Agbspiderdate"
"&sc=0" // no site clustering
"&dr=0" // no deduping
// 10 results at a time
"&n=10"
"&widgetheight=%li"
"&widgetwidth=%li"
, cr->m_coll
, widgetHeight
, widgetWidth
);
//ub.safePrintf("&topdocid="
// );
// get the search results from neo as soon as this div is
// being rendered, and set its contents to them
sb.safePrintf(//"<script type=text/javascript>"
"function widget123_reload(force) {"
// when the user submits a new query in the
// query box we set force to false when
// we call this (see PageResults.cpp) so that
// we do not register multiple timeouts
"if ( ! force ) "
"setTimeout('widget123_reload(0)',%li);"
// get the query box
"var qb=document.getElementById(\"qbox\");"
// if forced then turn off focus for searchbox
// since it was either 1) the initial call
// or 2) someone submitted a query and
// we got called from PageResults.cpp
// onsubmit event.
"if (force&&qb) qb.blur();"
// if the searchbox has the focus then do not
// reload!! unless force is true..
"if(qb&&qb==document.activeElement&&!force)"
"return;"
//"var ee=document.getElementById(\"sbox\");"
//"if (ee)alert('reloading '+ee.style.display);"
// do not do timer reload if searchbox is
// visible because we do not want to interrupt
// a possible search
//"if(!force&&ee && ee.style.display=='')return;"
// do not bother timed reloading if scrollbar pos
// not at top or near bottom
"var sd=document.getElementById("
"\"widget123_scrolldiv\");"
"if ( sd && !force ) {"
"var pos=parseInt(sd.scrollTop);"
"if (pos!=0) return;"
"}"
"var client=new XMLHttpRequest();"
"client.onreadystatechange="
"widget123_handler_reload;"
// . this url gets the search results
// . get them in "ajax" format so we can embed
// them into the base html as a widget
"var u='%s&format=ajax';"
// append our query from query box if there
"var qv;"
"if (qb) qv=qb.value;"
"if (qv){"
//"u+='&q=';"
"u+='&prepend=';"
"u+=encodeURI(qv);"
"}"
// set global var so handler knows if we were
// forced or not
"forcing=force;"
// get the docid at the top of the widget
// so we can get SURROUNDING search results,
// like 10 before it and 10 after it for
// our infinite scrolling
//"var td=document.getElementById('topdocid');"
//"if ( td ) u=u+\"&topdocid=\"+td.value;"
//"alert('reloading');"
"client.open('GET',u);"
"client.send();"
"}\n\n"
// when page loads, populate the widget immed.
"widget123_reload(1);\n\n"
// initiate the timer loop since it was
// not initiated on that call since we had to
// set force=1 to load in case the query box
// was currently visible.
"setTimeout('widget123_reload(0)',%li);"
//, widgetHeight
, (long)DEFAULT_WIDGET_RELOAD
, ub.getBufStart()
, (long)DEFAULT_WIDGET_RELOAD
);
//
// . call this when scrollbar gets 5 up from bottom
// . but if < 10 new results are appended, then stop!
//
sb.safePrintf(
"var outstanding=0;\n\n"
"function widget123_append() {"
// bail if already outstanding
"if (outstanding) return;"
// if scrollbar not near bottom, then return
"var sd=document.getElementById("
"\"widget123_scrolldiv\");"
"if ( sd ) {"
"var pos=parseInt(sd.scrollTop);"
"if (pos < (sd.scrollHeight-%li)) "
"return;"
"}"
// . this url gets the search results
// . just get them so we can APPEND them to
// the widget, so it will be just the
// "results" divs
"var u='%s&format=append';"
// . get score of the last docid in our widget
// . it should be persistent.
// . it is like a bookmark for scrolling
// . append results AFTER it into the widget
// . this way we can deal with the fact that
// we may be adding 100s of results to this
// query per second, especially if spidering
// at a high rate. and this will keep the
// results we append persistent.
// . now we scan the children "search result"
// divs of the "widget123_scrolldiv" div
// container to get the last child and get
// its score/docid so we can re-do the search
// and just get the search results with
// a score/docid LESS THAN that. THEN our
// results should be contiguous.
// . get the container div, "cd"
"var cd=document.getElementById("
"'widget123_scrolldiv');"
// must be there
"if(!cd)return;"
// get the last child div in there
"var d=cd.lastChild.previousSibling;"
// must be there
"if(!d)return;"
// get docid/score
"u=u+\"&maxserpscore=\"+d.getAttribute('score');"
"u=u+\"&minserpdocid=\"+d.getAttribute('docid');"
// append our query from query box if there
"var qb=document.getElementById(\"qbox\");"
"var qv;"
"if (qb) qv=qb.value;"
"if (qv){"
//"u+='&q=';"
"u+='&prepend=';"
"u+=encodeURI(qv);"
"}"
// turn on the lock to prevent excessive calls
"outstanding=1;"
//"alert(\"scrolling2 u=\"+u);"
"var client=new XMLHttpRequest();"
"client.onreadystatechange="
"widget123_handler_append;"
//"alert('appending scrollTop='+sd.scrollTop+' scrollHeight='+sd.scrollHeight+' 5results=%li'+u);"
"client.open('GET',u);"
"client.send();"
"}\n\n"
"</script>\n\n"
// if (pos < (sd.scrollHeight-%li)) return...
// once user scrolls down to within last 5
// results then try to append to the results.
, widgetHeight +5*((long)RESULT_HEIGHT+2*PADDING)
, ub.getBufStart()
//,widgetHeight +5*((long)RESULT_HEIGHT+2*PADDING
);
// then the WIDGET MASTER div. set the "id" so that the
// style tag the user sets can control its appearance.
// when the browser loads this the ajax sets the contents
// to the reply from neo.
// on scroll call widget123_append() which will append
// more search results if we are near the bottom of the
// widget.
sb.safePrintf("<div id=widget123 "
"style=\"border:2px solid black;"
"position:relative;border-radius:10px;"
"width:%lipx;height:%lipx;\">"
, widgetWidth
, widgetHeight
);
//sb.safePrintf("<style>"
// "a{color:white;}"
// "</style>");
sb.safePrintf("Waiting for Server...");
// end the containing div
sb.safePrintf("</div>");
savedLen2 = sb.length();
}
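
For reference, with a hypothetical collection named "mycoll" and the 300x500 defaults above, the ajax URL the widget polls would look roughly like:

/search?c=mycoll&q=-gbstatus:0+gbsortbyint%3Agbspiderdate&sc=0&dr=0&n=10&widgetheight=500&widgetwidth=300&format=ajax

with &prepend=<query> appended when the user has typed something into the widget's search box, and &maxserpscore=/&minserpdocid= added by the append path so newly fetched results continue below the last one already shown.
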
// the right table pane is the crawl stats
if ( fmt == FORMAT_HTML ) {
sb.safePrintf("</TD><TD valign=top>");
}
//
// show stats
@ -797,10 +1262,10 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider )
hurts = "Yes";
sb.safePrintf("<TABLE border=0>"
"<TR><TD valign=top>"
sb.safePrintf(//"<TABLE border=0>"
//"<TR><TD valign=top>"
"<table border=0 cellpadding=5>"
"<table id=stats border=0 cellpadding=5>"
"<tr>"
"<td><b>Crawl Status Code:</td>"
@ -830,8 +1295,8 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
//"</tr>"
"<tr>"
"<td><b>URLs Harvested</b> "
"(may include dups)</td>"
"<td><b><nobr>URLs Harvested</b> "
"(may include dups)</nobr></td>"
"<td>%lli</td>"
"</tr>"
@ -863,8 +1328,83 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
, cr->m_globalCrawlInfo.m_pageDownloadSuccesses
);
char tmp3[64];
struct tm *timeStruct;
timeStruct = localtime((time_t *)&cr->m_diffbotCrawlStartTime);
// Jan 01 1970 at 10:30:00
strftime ( tmp3,64 , "%b %d %Y at %H:%M:%S",timeStruct);
sb.safePrintf("<tr><td><b>Collection Created</b></td>"
"<td>%s (local time)</td></tr>",tmp3);
// print link to embed the code in their own site
SafeBuf embed;
embed.htmlEncode(sb.getBufStart()+savedLen1,
savedLen2-savedLen1,
false); // encodePoundSign #?
// convert all ''s to "'s for php's echo ''; cmd
embed.replaceChar('\'','\"');
sb.safePrintf("<tr>"
"<td valign=top>"
"<a onclick=\""
"var dd=document.getElementById('hcode');"
"if ( dd.style.display=='none' ) "
"dd.style.display=''; "
"else "
"dd.style.display='none';"
"\" style=color:blue;>"
"<u>"
"show Widget HTML code"
"</u>"
"</a>"
"</td><td>"
"<div id=hcode style=display:none;"
"max-width:800px;>"
"%s"
"</div>"
"</td></tr>"
, embed.getBufStart() );
sb.safePrintf("<tr>"
"<td valign=top>"
"<a onclick=\""
"var dd=document.getElementById('pcode');"
"if ( dd.style.display=='none' ) "
"dd.style.display=''; "
"else "
"dd.style.display='none';"
"\" style=color:blue;>"
"<u>"
"show Widget PHP code"
"</u>"
"</a>"
"</td>"
"<td>"
"<div id=pcode style=display:none;"
"max-width:800px;>"
"<i>"
"echo '"
"%s"
"';"
"</i>"
"</div>"
"</td></tr>"
, embed.getBufStart() );
sb.safePrintf("</table>\n\n");
}
// end the right table pane
if ( fmt == FORMAT_HTML ) {
sb.safePrintf("</TD></TR></TABLE>");
}
//if ( fmt != FORMAT_JSON )
// // wrap up the form, print a submit button
// g_pages.printAdminBottom ( &sb );

@ -2355,10 +2355,13 @@ bool printCrawlDetailsInJson ( SafeBuf *sb , CollectionRec *cx ) {
//nomen = "job";
}
sb->safePrintf("\n\n{"
"\"name\":\"%s\",\n"
"\"type\":\"%s\",\n"
"\"jobCreationTimeUTC\":%li,\n"
"\"jobCompletionTimeUTC\":%li,\n"
//"\"alias\":\"%s\",\n"
//"\"crawlingEnabled\":%li,\n"
"\"jobStatus\":{" // nomen = jobStatus / crawlStatus
@ -2384,6 +2387,11 @@ bool printCrawlDetailsInJson ( SafeBuf *sb , CollectionRec *cx ) {
//,cx->m_coll
, cx->m_diffbotCrawlName.getBufStart()
, crawlTypeStr
, cx->m_diffbotCrawlStartTime
// this is 0 if not over yet
, cx->m_diffbotCrawlEndTime
//, alias
//, (long)cx->m_spideringEnabled
, crawlStatus
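
With the two new fields above, the opening of the JSON object emitted by printCrawlDetailsInJson() would look roughly like this (values are hypothetical; jobCompletionTimeUTC stays 0 while the crawl is still running):

{
"name":"mycrawl",
"type":"crawl",
"jobCreationTimeUTC":1400000000,
"jobCompletionTimeUTC":0,
"jobStatus":{ ... }
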

@ -245,19 +245,24 @@ bool processLoop ( void *state ) {
//xd->set3 ( st->m_docId , st->m_coll , 0 );
// callback
xd->setCallback ( state , processLoop );
// and tell it to load from the old title rec
// . and tell it to load from the old title rec
// . this sets xd->m_oldTitleRec/m_oldTitleRecSize
// . this sets xd->ptr_* and all other member vars from
// the old title rec if found in titledb.
if ( ! xd->loadFromOldTitleRec ( ) ) return false;
}
if ( g_errno ) return sendErrorReply ( st , g_errno );
// now force it to load old title rec
char **tr = xd->getTitleRec();
//char **tr = xd->getTitleRec();
SafeBuf *tr = xd->getTitleRecBuf();
// blocked? return false if so. it will call processLoop() when it rets
if ( tr == (void *)-1 ) return false;
// we did not block. check for error? this will free "st" too.
if ( ! tr ) return sendErrorReply ( st , g_errno );
// if title rec was empty, that is a problem
if ( xd->m_titleRecSize == 0 ) return sendErrorReply ( st , ENOTFOUND);
if ( xd->m_titleRecBuf.length() == 0 )
return sendErrorReply ( st , ENOTFOUND);
// set callback
char *na = xd->getIsNoArchive();

@ -400,26 +400,40 @@ bool sendPageParser2 ( TcpSocket *s ,
"<td>"
"<input type=text name=\"q\" size=\"20\" value=\"\"> "
"</td>"
"</tr>"
"</tr>",
TABLE_STYLE,
us ,
dd,
rr,
render
);
xbuf->safePrintf(
"<tr class=poo>"
"<td>"
"<b>content below is xml</b>"
"<b>content type below is</b>"
"<br><font size=-2>"
"Is the content below XML?"
"Is the content below HTML? XML? JSON?"
"</font>"
"</td>"
"<td>"
"<input type=checkbox name=xml value=1> "
//"<input type=checkbox name=xml value=1> "
"<select name=ctype>\n"
"<option value=%li selected>HTML</option>\n"
"<option value=%li selected>XML</option>\n"
"<option value=%li selected>JSON</option>\n"
"</select>\n"
"</td>"
"</tr>"
"</tr>",
(long)CT_HTML,
(long)CT_XML,
(long)CT_JSON
);
xbuf->safePrintf(
"<tr class=poo>"
"<td><b>content</b>"
@ -440,15 +454,6 @@ bool sendPageParser2 ( TcpSocket *s ,
"</form>"
"<br>",
TABLE_STYLE,
us ,
//(long)st->m_hopCount,
//rtu,
dd,
//artr ,
rr,
//rr2,
render ,
//oips ,
contentParm );
@ -807,8 +812,9 @@ bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) {
// ensure null
if ( contentLen == 0 ) content = NULL;
uint8_t contentType = CT_HTML;
//uint8_t contentType = CT_HTML;
//if ( isXml ) contentType = CT_XML;
long ctype = r->getLong("ctype",CT_HTML);
// . use the enormous power of our new XmlDoc class
// . this returns false if blocked
@ -821,7 +827,7 @@ bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) {
content ,
false, // deletefromindex
0, // forced ip
contentType ))
ctype ))
// return error reply if g_errno is set
return sendErrorReply ( st , g_errno );
// make this our callback in case something blocks
@ -900,16 +906,15 @@ bool gotXmlDoc ( void *state ) {
}
long isXml = st->m_r.getLong("xml",0);
char ctype = CT_HTML;
if ( isXml ) ctype = CT_XML;
char ctype2 = CT_HTML;
if ( isXml ) ctype2 = CT_XML;
// now encapsulate it in html head/tail and send it off
bool status = g_httpServer.sendDynamicPage( st->m_s ,
xbuf->getBufStart(),
xbuf->length() ,
-1, //cachtime
false ,//postreply?
&ctype,
&ctype2,
-1 , //httpstatus
NULL,//cookie
"utf-8");

@ -249,6 +249,58 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
// get the dmoz catid if given
//long searchingDmoz = hr->getLong("dmoz",0);
//
// DO WE NEED TO ALTER cr->m_siteListBuf for a widget?
//
// when a wordpress user changes the "Websites to Include" for
// her widget, it should send a /search?sites=xyz.com&wpid=xxx
// request here...
// so we need to remove her old sites and add in her new ones.
//
/*
MDW TURN BACK ON IN A DAY. do indexing or err pages first.
// get wordpressid supplied with all widget requests
char *wpid = hr->getString("wpid");
// we have to add set &spidersites=1 which all widgets should do
if ( wpid ) {
// this returns NULL if cr->m_siteListBuf would be unchanged
// because we already have the whiteListBuf sites in there
// for this wordPressId (wpid)
SafeBuf newSiteListBuf;
makeNewSiteList( &si->m_whiteListBuf,
cr->m_siteListBuf ,
wpid ,
&newSiteListBuf);
// . update the list of sites to crawl/search & show in widget
// . if they give an empty list then allow that, stops crawling
SafeBuf parmList;
g_parms.addNewParmToList1 ( &parmList,
cr->m_collnum,
newSiteListBuf,
0,
"sitelist");
// send the parms to all hosts in the network
g_parms.broadcastParmList ( &parmList ,
NULL,//s,// state is socket i guess
NULL);//doneBroadcastingParms2 );
// nothing left to do now
return g_httpServer.sendDynamicPage(s,
"OK",//sb.getBufStart(),
2,//sb.length(),
cacheTime,//0,
false, // POST?
"text/html",
200, // httpstatus
NULL, // cookie
"UTF-8"); // charset
}
*/
//
// . send back page frame with the ajax call to get the real
// search results. do not do this if a "&dir=" (dmoz category)
@ -404,7 +456,7 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
"Copyright &copy; 2014. "
"All Rights Reserved.<br/>"
"Powered by the "
"<a href='http://www.gigablast.com/'>"
"<a href=\"http://www.gigablast.com/\">"
"GigaBlast</a> open source search engine."
"</font>"
"</center>\n"
@ -718,11 +770,6 @@ void freeMsg4Wrapper( void *st ) {
delete stau;
}
// height of each result div in the widget
#define RESULT_HEIGHT 120
#define SERP_SPACER 1
#define PADDING 8
// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
@ -909,45 +956,61 @@ bool gotResults ( void *state ) {
//
long numResults = msg40->getNumResults();
// if user is doing ajax widget we need to know the current docid
// that is listed at the top of their widget display so we can
// hide the new docids above that and scroll them down slowly.
long numResults = msg40->getNumResults();
long topDocIdPos = -1;
/*
//long topDocIdPos = -1;
bool hasInvisibleResults = false;
long numInvisible = 0;
//long numInvisible = 0;
long numAbove = 0;
HttpRequest *hr = &st->m_hr;
long long oldTop = 0LL;
long long lastDocId = 0LL;
double lastSerpScore = 0.0;
if ( si->m_format == FORMAT_WIDGET_AJAX ) {
// sanity, no stream mode here, it won't work
if ( si->m_streamResults )
log("results: do not use stream=1 for widget");
// get current top docid
long long topDocId = hr->getLongLong("topdocid",0LL);
// DEBUG: force it on for now
//topDocId = 4961990748LL;
// scan results
// scan results. this does not support &stream=1 streaming
// mode. it doesn't make sense that it needs to.
for ( long i = 0 ; i < numResults ; i++ ) {
// skip if already invisible
if ( msg40->m_msg3a.m_clusterLevels[i] != CR_OK )
continue;
// get it
Msg20 *m20 ;
if ( si->m_streamResults )
m20 = msg40->getCompletedSummary(i);
else
m20 = msg40->m_msg20[i];
Msg20 *m20 = msg40->m_msg20[i];
if ( ! m20 ) continue;
// checkdocid
Msg20Reply *mr = m20->m_r;
if ( ! mr ) continue;
// save this
lastDocId = mr->m_docId;
lastSerpScore = msg40->m_msg3a.m_scores[i];
// set "oldTop" to first docid we encounter
if ( ! oldTop ) oldTop = mr->m_docId;
// stop if no topdocid otherwise. oldTop is now set
if ( topDocId == 0 ) break;
if ( ! topDocId ) continue; // == 0 ) break;
if ( mr->m_docId != topDocId ) {
hasInvisibleResults = true;
numInvisible++;
// count # of docids above top docid
numAbove++;
continue;
}
topDocIdPos = i;
break;
// we match it, so set this if not already set
//if ( topDocIdPos != -1 ) topDocIdPos = i;
//break;
}
}
*/
SafeBuf *sb = &st->m_sb;
@ -992,20 +1055,48 @@ bool gotResults ( void *state ) {
// propagate "topdocid" so when he does another query every 30 secs
// or so we know what docid was on top for scrolling purposes
if ( si->m_format == FORMAT_WIDGET_AJAX )
sb->safePrintf("<input type=hidden "
"id=topdocid name=topdocid value=%lli>\n",
oldTop);
//if ( si->m_format == FORMAT_WIDGET_AJAX )
// sb->safePrintf("<input type=hidden "
// "id=topdocid name=topdocid value=%lli>\n",
// oldTop);
// report how many results we added above the topdocid provided, if any
// so widget can scroll down automatically
//if ( si->m_format == FORMAT_WIDGET_AJAX && numAbove )
// sb->safePrintf("<input type=hidden "
// "id=topadd name=topadd value=%li>\n",numAbove);
// we often can add 100s of things to the widget's result set per
// second especially when sorting by last spidered time and spidering
// a lot. setting the maxserpscore to the serp score of the last result
// allows us to append new search results to what we have in a
// consistent manner.
// if ( si->m_format == FORMAT_WIDGET_AJAX ) {
// // let's make this ascii encoded crap
// sb->safePrintf("<input type=hidden "
// "id=maxserpscore "
// "value=%f>\n",
// lastSerpScore);
// // let's make this ascii encoded crap
// sb->safePrintf("<input type=hidden "
// "id=maxserpdocid "
// "value=%lli>\n",
// lastDocId);
// }
// then print each result
// don't display more than docsWanted results
long count = msg40->getDocsWanted();
bool hadPrintError = false;
long numPrintedSoFar = 0;
//long widgetHeight = hr->getLong("widgetheight",400);
//long widgetwidth = hr->getLong("widgetwidth",250);
for ( long i = 0 ; count > 0 && i < numResults ; i++ ) {
/*
if ( hasInvisibleResults ) {
//
// MAKE THESE RESULTS INVISIBLE!
@ -1037,14 +1128,14 @@ bool gotResults ( void *state ) {
"position:absolute;>"
);
}
*/
//////////
//
// prints in xml or html
//
//////////
if ( ! printResult ( st , i ) ) {
if ( ! printResult ( st , i , numPrintedSoFar++ ) ) {
hadPrintError = true;
break;
}
@ -1066,7 +1157,7 @@ bool gotResults ( void *state ) {
// if we split the serps into 2 divs for scrolling purposes
// then close up the 2nd one
if ( hasInvisibleResults ) sb->safePrintf("</div>");
//if ( hasInvisibleResults ) sb->safePrintf("</div>");
// END SERP DIV
if ( si->m_format == FORMAT_WIDGET_IFRAME ||
@ -1111,9 +1202,7 @@ bool printSearchResultsHeader ( State0 *st ) {
sb->safePrintf("<body>");
}
if ( ! g_conf.m_isMattWells &&
(si->m_format==FORMAT_WIDGET_IFRAME ||
si->m_format==FORMAT_WIDGET_AJAX) ) {
if ( ! g_conf.m_isMattWells && si->m_format==FORMAT_WIDGET_IFRAME ) {
printCSSHead ( sb ,si->m_format );
sb->safePrintf("<body style=padding:0px;margin:0px;>");
}
@ -1155,26 +1244,43 @@ bool printSearchResultsHeader ( State0 *st ) {
// put image in this div which will have top:0px JUST like
// the div holding the search results we print out below
// so that the image does not scroll when you use the
// scrollbar.
sb->safePrintf("<div style=\"position:absolute;"
// scrollbar. holds the magnifying glass img and searchbox.
sb->safePrintf("<div class=magglassdiv "
"style=\"position:absolute;"
"right:15px;"
"z-index:10;"
"top:0px;\">");
long refresh = hr->getLong("refresh",15);
//long refresh = hr->getLong("refresh",15);
char *oq = hr->getString("q",NULL);
if ( ! oq ) oq = "";
char *prepend = hr->getString("prepend");
if ( ! prepend ) prepend = "";
char *displayStr = "none";
if ( prepend && prepend[0] ) displayStr = "";
sb->safePrintf("<form method=get action=/search>");
// to do a search we need to re-call the ajax,
// just call reload like the one that is called every 15s or so
sb->safePrintf("<form "//method=get action=/search "
// use "1" as arg to force reload
"onsubmit=\"widget123_reload(1);"
// let user know we are loading
"var w=document.getElementById("
"'widget123_scrolldiv');"
// just set the widget content to the reply
"if (w) "
"w.innerHTML='<br><br><b>Loading Results..."
"</b>';"
// prevent it from actually submitting
"return false;\">");
sb->safePrintf("<img "
"style=\""
//"position:absolute;" // absolute or relative?
// put it on TOP of the other stuff
"z-index:10;"
"margin-top:3px;"
//"right:10px;"
//"right:2px;"
//"width:%lipx;"
@ -1185,36 +1291,46 @@ bool printSearchResultsHeader ( State0 *st ) {
"var e=document.getElementById('sbox');"
"if(e.style.display == 'none') {"
"e.style.display = '';"
// give it focus
"var qb=document.getElementById('qbox');"
"qb.focus();"
"} else {"
"e.style.display = 'none';"
"}"
"\" " // end function
" "
"width=25 "
"height=25 "
"src=\"http://etc-mysitemyway.s3.amazonaws.com/icons/legacy-previews/icons/simple-black-square-icons-business/126715-simple-black-square-icon-business-magnifying-glass-ps.png\">"
"width=35 "
"height=31 "
"src=\"/magglass.png\">"
);
sb->safePrintf("<div id=sbox style=float:left;display:%s;>"
"<input type=text name=prepend size=%li "
"value=\"%s\" style=\"z-index:10;"
//char *origq = hr->getString("q");
// we sort all results by spider date now so PREPEND
// the actual user query
char *origq = hr->getString("prepend");
if ( ! origq ) origq = "";
sb->safePrintf("<div id=sbox style=\"float:left;"
"display:%s;"
"opacity:0.83;"
//"background-color:gray;"
//"padding:5px;"
"\">"
// the box that holds the query
"<input type=text id=qbox name=qbox "
"size=%li " //name=prepend "
"value=\"%s\" "
"style=\"z-index:10;"
"font-weight:bold;"
"font-size:18px;"
"border:4px solid black;"
"margin:3px;"
"\">"
// hidden parms like collection
"<input name=c type=hidden value=\"%s\">"
"<input name=format type=hidden value=widget>"
"<input name=widgetwidth type=hidden value=%li>"
"<input name=refresh type=hidden value=%li>"
"<input name=q type=hidden value=\"%s\">"
"</div>"
"</form>\n"
, displayStr
, widgetwidth / 15
, prepend
, coll
, widgetwidth
, refresh
, oq
, widgetwidth / 23
, origq
);
sb->safePrintf("</div>"
"</form>\n"
);
// . BEGIN SERP DIV
@ -1222,9 +1338,12 @@ bool printSearchResultsHeader ( State0 *st ) {
// . this will have the scrollbar to just scroll the serps
// and not the magnifying glass
sb->safePrintf("</div>"
"<div style=\"position:absolute;"
"<div id=widget123_scrolldiv "
"onscroll=widget123_append(); "
"style=\"position:absolute;"
"top:0px;"
"overflow-y:auto;"
"overflow-x:hidden;"
"width:%lipx;"
"height:%lipx;\">"
, widgetwidth
@ -1492,7 +1611,8 @@ bool printSearchResultsHeader ( State0 *st ) {
else if ( numResults == 0 &&
( si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_AJAX ) ) {
sb->safePrintf ( "No results found.");
sb->safePrintf ( "No results found. Wait for spider to "
"kick in.");
}
else if ( moreFollow && si->m_format == FORMAT_HTML ) {
if ( isAdmin && si->m_docsToScanForReranking > 1 )
@ -1927,12 +2047,13 @@ bool printSearchResultsTail ( State0 *st ) {
// carry over the sites we are restricting the search results to
if ( si->m_whiteListBuf.length() )
args.safePrintf("&sites=%s",si->m_whiteListBuf.getBufStart());
if ( firstNum > 0 &&
(si->m_format == FORMAT_HTML ||
si->m_format == FORMAT_WIDGET_AJAX ||
si->m_format == FORMAT_WIDGET_IFRAME ) ) {
si->m_format == FORMAT_WIDGET_IFRAME //||
//si->m_format == FORMAT_WIDGET_AJAX
) ) {
long ss = firstNum - msg40->getDocsWanted();
sb->safePrintf("<a href=\"/search?s=%li&q=",ss);
// our current query parameters
@ -1949,8 +2070,9 @@ bool printSearchResultsTail ( State0 *st ) {
// now print "Next X Results"
if ( msg40->moreResultsFollow() &&
(si->m_format == FORMAT_HTML ||
si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_AJAX )) {
si->m_format == FORMAT_WIDGET_IFRAME
//si->m_format == FORMAT_WIDGET_AJAX
)) {
long ss = firstNum + msg40->getDocsWanted();
// print a separator first if we had a prev results before us
if ( sb->length() > remember ) sb->safePrintf ( " &nbsp; " );
@ -2044,8 +2166,8 @@ bool printSearchResultsTail ( State0 *st ) {
"<font color=gray>"
"Copyright &copy; 2014. All Rights "
"Reserved.<br/>"
"Powered by the <a href='https://www."
"gigablast.com/'>GigaBlast</a> open source "
"Powered by the <a href=\"http://www."
"gigablast.com/\">GigaBlast</a> open source "
"search engine."
"</font>"
"</center>\n"
@ -2359,7 +2481,7 @@ static bool printDMOZCategoryUnderResult ( SafeBuf *sb ,
// use this for xml as well as html
bool printResult ( State0 *st, long ix ) {
bool printResult ( State0 *st, long ix , long numPrintedSoFar ) {
SafeBuf *sb = &st->m_sb;
@ -2440,7 +2562,7 @@ bool printResult ( State0 *st, long ix ) {
if ( mr->ptr_content ) {
// for json items separate with \n,\n
if ( si->m_format != FORMAT_HTML && ix>0 )
if ( si->m_format != FORMAT_HTML && numPrintedSoFar > 0 )
sb->safePrintf(",\n");
sb->safeStrcpy ( mr->ptr_content );
@ -2566,56 +2688,139 @@ bool printResult ( State0 *st, long ix ) {
// http://www.youtube.com/watch?v=auQbi_fkdGE
// http://img.youtube.com/vi/auQbi_fkdGE/2.jpg
// get the thumbnail url
if ( mr->ptr_imgUrl && si->m_format == FORMAT_HTML )
sb->safePrintf ("<a href=%s><image src=%s></a>",
if ( mr->ptr_imgUrl &&
si->m_format == FORMAT_HTML &&
// if we got thumbnail use that not this
! mr->ptr_imgData )
sb->safePrintf ("<a href=%s><img src=%s></a>",
url,mr->ptr_imgUrl);
// if we have a thumbnail show it next to the search result
if ( si->m_format == FORMAT_HTML &&
//! mr->ptr_imgUrl &&
mr->ptr_imgData ) {
ThumbnailArray *ta = (ThumbnailArray *)mr->ptr_imgData;
ThumbnailInfo *ti = ta->getThumbnailInfo(0);
ti->printThumbnailInHtml ( sb ,
100 , // max width
100 , // max height
true , // add <a href>
NULL ,
" style=\"margin:10px;\" ");
}
// print image for widget
if ( //mr->ptr_imgUrl &&
( si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_AJAX) ) {
si->m_format == FORMAT_WIDGET_AJAX ||
si->m_format == FORMAT_WIDGET_APPEND ) ) {
long widgetwidth = hr->getLong("widgetwidth",200);
// make a div around this for widget so we can print text
// on top
long widgetWidth = hr->getLong("widgetwidth",200);
// prevent coring
if ( widgetWidth < 1 ) widgetWidth = 1;
// each search result in widget has a div around it
sb->safePrintf("<div "
"class=result "
// we need the docid and score of last result
// when we append new results to the end
// of the widget for infinite scrolling
// using the scripts in PageBasic.cpp
"docid=%lli "
"score=%f " // double
"style=\""
"width:%lipx;"
"min-height:%lipx;"//140px;"
"height:%lipx;"//140px;"
"padding:%lipx;"
"display:table-cell;"
"vertical-align:bottom;"
, widgetwidth - 2*8 // padding is 8px
"position:relative;"
//"display:table-cell;"
//"vertical-align:bottom;"
"\""
">"
, mr->m_docId
// this is a double now. this won't work
// for streaming...
, msg40->m_msg3a.m_scores[ix]
, widgetWidth - 2*8 // padding is 8px
, (long)RESULT_HEIGHT
, (long)RESULT_HEIGHT
, (long)PADDING
);
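// Illustrative sketch (not part of this change): for a 200px-wide widget
// the div printed above would come out roughly as
//   <div class=result docid=123456789 score=3.500000
//    style="width:184px;min-height:120px;height:120px;padding:8px;
//    position:relative;">
// (docid/score values here are hypothetical). The append script can then
// read the last result's docid and score attributes and hand them back as
// minserpdocid/maxserpscore on the next request.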
if ( mr->ptr_imgUrl )
sb->safePrintf("background-repeat:no-repeat;"
"background-size:%lipx 140px;"
"background-image:url('%s');"
, widgetwidth - 2*8 // padding is 8px
, mr->ptr_imgUrl);
// if ( mr->ptr_imgUrl )
// sb->safePrintf("background-repeat:no-repeat;"
// "background-size:%lipx 140px;"
// "background-image:url('%s');"
// , widgetwidth - 2*8 // padding is 8px
// , mr->ptr_imgUrl);
long newdx = 0;
if ( mr->ptr_imgData ) {
ThumbnailArray *ta = (ThumbnailArray *)mr->ptr_imgData;
ThumbnailInfo *ti = ta->getThumbnailInfo(0);
// account for scrollbar on the right
long maxWidth = widgetWidth - (long)SCROLLBAR_WIDTH;
long maxHeight = (long)RESULT_HEIGHT;
// false = do not print <a href> link on image
ti->printThumbnailInHtml ( sb ,
maxWidth ,
maxHeight ,
false , // add <a href>
&newdx );
}
// end the div style attribute and div tag
sb->safePrintf("\">");
//sb->safePrintf("\">");
sb->safePrintf ( "<a "
"target=_blank "
"style=text-decoration:none; href=" );
"style=\"text-decoration:none;"
// don't let scroll bar obscure text
"margin-right:%lipx;"
,(long)SCROLLBAR_WIDTH
);
// if thumbnail is wide enough put text on top of it, otherwise
// image is to the left and text is to the right of image
if ( newdx > .5 * widgetWidth )
sb->safePrintf("position:absolute;"
"bottom:%li;"
"left:%li;"
, (long) PADDING
, (long) PADDING
);
// to align the text vertically we gotta make a textbox div
// otherwise it wraps below image! mdw
//else
// sb->safePrintf("vertical-align:middle;");
else
sb->safePrintf("position:absolute;"
"bottom:%li;"
"left:%li;"
, (long) PADDING
, (long) PADDING + newdx + 10 );
// close the style and begin the url
sb->safePrintf( "\" "
"href=\""
);
// truncate off -diffbotxyz%li
long newLen = urlLen;
if ( diffbotSuffix ) newLen = diffbotSuffix - url;
// print the url in the href tag
sb->safeMemcpy ( url , newLen );
// then finish the a href tag and start a bold for title
sb->safePrintf ( ">");//<font size=+0>" );
sb->safePrintf ( "\">");//<font size=+0>" );
sb->safePrintf("<b style=\""
"text-decoration:none;"
"font-size: 15px;"
"font-weight:bold;"
// add padding so shadow does not stick out
//"padding-left:4px;"
//"padding-right:4px;"
"background-color:rgba(0,0,0,.5);"
"color:white;"
"font-family:arial;"
@ -2634,11 +2839,28 @@ bool printResult ( State0 *st, long ix ) {
//"2px -2px 0 #000 "
//"-2px -2px 0 #000;"
"\">");
//sb->safePrintf ("<image width=50 height=50 src=%s></a>",
//sb->safePrintf ("<img width=50 height=50 src=%s></a>",
// mr->ptr_imgUrl);
// then title over image
}
// only do link here if we have no thumbnail so no bg image
if ( (si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_APPEND ||
si->m_format == FORMAT_WIDGET_AJAX ) &&
! mr->ptr_imgData ) {
sb->safePrintf ( "<a style=text-decoration:none;"
"color:white; "
"href=" );
// truncate off -diffbotxyz%li
long newLen = urlLen;
if ( diffbotSuffix ) newLen = diffbotSuffix - url;
// print the url in the href tag
sb->safeMemcpy ( url , newLen );
// then finish the a href tag and start a bold for title
sb->safePrintf ( ">");//<font size=+0>" );
}
// the a href tag
if ( si->m_format == FORMAT_HTML ) sb->safePrintf ( "\n\n" );
@ -2668,20 +2890,6 @@ bool printResult ( State0 *st, long ix ) {
}
// only do link here
if ( (si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_AJAX ) &&
! mr->ptr_imgUrl ) {
sb->safePrintf ( "<a href=" );
// truncate off -diffbotxyz%li
long newLen = urlLen;
if ( diffbotSuffix ) newLen = diffbotSuffix - url;
// print the url in the href tag
sb->safeMemcpy ( url , newLen );
// then finish the a href tag and start a bold for title
sb->safePrintf ( ">");//<font size=+0>" );
}
// . then the title (should be NULL terminated)
// . the title can be NULL
// . highlight it first
@ -2737,6 +2945,7 @@ bool printResult ( State0 *st, long ix ) {
backTag = "</b>";
}
if ( si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_APPEND ||
si->m_format == FORMAT_WIDGET_AJAX ) {
frontTag = "<font style=\"background-color:yellow\">" ;
}
@ -2784,10 +2993,11 @@ bool printResult ( State0 *st, long ix ) {
if ( si->m_format == FORMAT_HTML ) sb->safePrintf ("</a><br>\n" ) ;
// close the image div
// close the title tag stuff
if ( si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_APPEND ||
si->m_format == FORMAT_WIDGET_AJAX )
sb->safePrintf("</b></a></div>\n");
sb->safePrintf("</b></a>\n");
/////
@ -2796,7 +3006,7 @@ bool printResult ( State0 *st, long ix ) {
//
/////
unsigned char ctype = mr->m_contentType;
if ( ctype >= CT_HTML && ctype <= CT_JSON ) {
if ( ctype != CT_HTML && ctype != CT_UNKNOWN ){//&&ctype <= CT_JSON ) {
char *cs = g_contentTypeStrings[ctype];
if ( si->m_format == FORMAT_XML )
sb->safePrintf("\t\t<contentType>"
@ -2805,7 +3015,7 @@ bool printResult ( State0 *st, long ix ) {
"]]>"
"</contentType>\n",
cs);
else if ( si->m_format == FORMAT_HTML ) {
else if ( si->m_format == FORMAT_HTML && ctype != CT_HTML ) {
sb->safePrintf(" <b><font style=color:white;"
"background-color:maroon;>");
char *p = cs;
@ -2845,6 +3055,7 @@ bool printResult ( State0 *st, long ix ) {
// do not print summaries for widgets by default unless overridden
// with &summary=1
if ( (si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_APPEND ||
si->m_format == FORMAT_WIDGET_AJAX ) &&
hr->getLong("summaries",0) == 0 )
printSummary = false;
@ -3164,7 +3375,7 @@ bool printResult ( State0 *st, long ix ) {
sb->safePrintf(" - <a style=color:red; href=\"/addurl?u=");
sb->urlEncode ( url , gbstrlen(url) , false );
unsigned long long rand64 = gettimeofdayInMillisecondsLocal();
sb->safePrintf("&rand64=%llu\">respider</a>",rand64);
sb->safePrintf("&rand64=%llu&force=1\">respider</a>",rand64);
}
@ -3379,12 +3590,20 @@ bool printResult ( State0 *st, long ix ) {
*/
// end serp div
if ( si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_APPEND ||
si->m_format == FORMAT_WIDGET_AJAX )
sb->safePrintf("</div>");
if ( si->m_format == FORMAT_HTML )
sb->safePrintf ( "<br><br>\n");
// search result spacer
if ( si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_AJAX )
si->m_format == FORMAT_WIDGET_APPEND ||
si->m_format == FORMAT_WIDGET_AJAX )
sb->safePrintf("<div style=line-height:%lipx;><br></div>",
(long)SERP_SPACER);
@ -5839,6 +6058,9 @@ bool printJsonItemInCSV ( char *json , SafeBuf *sb , State0 *st ) {
return true;
}
/*
RIP: OLD IFRAME WIDGET CODE HACK
bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr , char *coll ) {
//
@ -6113,23 +6335,21 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr , char *coll ) {
"</td>"
"<td>"
// begin div with source in it
/*
"<div "
//"class=grad3 "
"style=\""
"border-radius:10px;"
"box-shadow: 6px 6px 3px %s;"
"border:2px solid black;"
"padding:15px;"
"width:600px;"
//"background-image:url('/ss.jpg');"
//"background-repeat:repeat;"
//"background-attachment:fixed;"
"background-color:lightgray;"
"\">"
, SHADOWCOLOR
//"<br>"
*/
// "<div "
// //"class=grad3 "
// "style=\""
// "border-radius:10px;"
// "box-shadow: 6px 6px 3px %s;"
// "border:2px solid black;"
// "padding:15px;"
// "width:600px;"
// //"background-image:url('/ss.jpg');"
// //"background-repeat:repeat;"
// //"background-attachment:fixed;"
// "background-color:lightgray;"
// "\">"
// , SHADOWCOLOR
// //"<br>"
);
// space widget to the right using this table
@ -6157,35 +6377,32 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr , char *coll ) {
// this iframe contains the WIDGET
sb->safePrintf (
/*
"<div "
"id=scrollerxyz "
"style=\""
// "<div "
// "id=scrollerxyz "
// "style=\""
//"width:%lipx;" // 200;"
//"height:%lipx;" // 400;"
//"overflow:hidden;"
"padding:0px;"
"margin:0px;"
"background-color:white;"
// "padding:0px;"
// "margin:0px;"
// "background-color:white;"
//"padding-left:7px;"
"%s"
//"%s"
//"background-color:%s;"//lightblue;"
//"foreground-color:%s;"
//"overflow:scroll;"
//"overflow-scrolling:touch;"
"\">"
*/
"<iframe width=\"%lipx\" height=\"%lipx\" "
//"scrolling=yes "
/*
"style=\"background-color:white;"
"padding-right:0px;"
//"style=\"background-color:white;"
//"padding-right:0px;"
//"%s\" "
"scrolling=no "
"frameborder=no "
//"scrolling=no "
//"frameborder=no "
//"src=\"http://neo.diffbot.com:8000/search?"
*/
// frameborder=no
"%s"
@ -6389,3 +6606,4 @@ bool sendPageWidget ( TcpSocket *s , HttpRequest *hr ) {
NULL, // cookie
"UTF-8"); // charset
}
*/

@ -6,6 +6,14 @@
#include "Msg40.h"
#include "Msg0.h"
// height of each search result div in the widget
#define RESULT_HEIGHT 120
// other widget parms
#define SERP_SPACER 1
#define PADDING 8
#define SCROLLBAR_WIDTH 20
class State0 {
public:
@ -50,7 +58,7 @@ public:
bool printSearchResultsHeader ( class State0 *st ) ;
bool printResult ( class State0 *st, long ix );
bool printResult ( class State0 *st, long ix , long numPrintedSoFar );
bool printSearchResultsTail ( class State0 *st ) ;

@ -171,7 +171,7 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("\n");
if ( cr && cr->m_coll && strcmp(cr->m_coll,"main") ) {
if ( cr && cr->m_coll ) { // && strcmp(cr->m_coll,"main") ) {
sb.safePrintf("<center>"
"Searching the <b>%s</b> collection."
"</center>",

@ -83,7 +83,9 @@ bool sendPageTitledb ( TcpSocket *s , HttpRequest *r ) {
xd->set3 ( docId , coll , 0 );
// callback
xd->setCallback ( st , gotTitleRec );
// and tell it to load from old title rec
// . and tell it to load from old title rec
// . this sets all the member vars from it and also sets
// m_titleRecBuf to contain the actual compressed title rec
if ( ! xd->loadFromOldTitleRec ( ) ) return false;
// we got it without blocking. cached?
return gotTitleRec ( st );
@ -118,7 +120,7 @@ bool gotTitleRec ( void *state ) {
// . deal with errors
// . print none if no title rec at or after the provided docId
if ( g_errno || docId == 0LL || xd->m_titleRecSize <= 0 ) {
if ( g_errno || docId == 0LL || xd->m_titleRecBuf.length() <= 0 ) {
// print docId in box
sb.safePrintf ( "<center>\nEnter docId: "
"<input type=text name=d value=%lli size=15>",

@ -69,9 +69,9 @@ static WebPage s_pages[] = {
{ PAGE_RESULTS , "search" , 0 , "search" , 0 , 0 ,
"results page",
sendPageResults, 0 },
{ PAGE_WIDGET , "widget" , 0 , "widget" , 0 , 0 ,
"widget page",
sendPageWidget, 0 },
//{ PAGE_WIDGET , "widget" , 0 , "widget" , 0 , 0 ,
// "widget page",
// sendPageWidget, 0 },
{ PAGE_ADDURL , "addurl" , 0 , "add url" , 0 , 0 ,
"Page where you can add url for spidering",
sendPageAddUrl, 0 },
@ -914,8 +914,9 @@ bool Pages::printAdminTop (SafeBuf *sb ,
//long user = getUserType ( s , r );
//char *username = g_users.getUsername ( r );
char *username = NULL;
char *coll = r->getString ( "c" );
if ( ! coll ) coll = "main";
//char *coll = r->getString ( "c" );
//if ( ! coll ) coll = "main";
char *coll = g_collectiondb.getDefaultColl(r);
//char *pwd = r->getString ( "pwd" );
// get username
@ -1041,8 +1042,8 @@ bool Pages::printAdminTop (SafeBuf *sb ,
username , pwd ,
coll, NULL, s->m_ip, qs );
if ( g_hostdb.getNumHosts() > 1 )
sb->safePrintf("<br><br>");
//if ( g_hostdb.getNumHosts() > 1 )
sb->safePrintf("<br><br>");
// end table
//sb->safePrintf ("</td></tr></table><br/>\n");//<br/>\n");
@ -1655,10 +1656,11 @@ bool Pages::printHostLinks ( SafeBuf* sb ,
// and proxies
total += g_hostdb.m_numProxyHosts;
// don't print host buttons if only 1 host
if ( total <= 1 ) return status;
//if ( total <= 1 ) return status;
sb->safePrintf ( //"&nbsp; &nbsp; &nbsp; "
"<a href=/admin/hosts>hosts</a>: ");
"<a style=text-decoration:none; href=/admin/hosts>"
"<b>hosts in cluster</b></a>: ");
if ( ! qs ) qs = "";
//if ( ! pwd ) pwd = "";

@ -38,7 +38,7 @@ bool sendPageBasicStatus ( TcpSocket *s , HttpRequest *r );
bool sendPageRoot ( TcpSocket *s , HttpRequest *r );
bool sendPageRoot ( TcpSocket *s , HttpRequest *r, char *cookie );
bool sendPageResults ( TcpSocket *s , HttpRequest *r );
bool sendPageWidget ( TcpSocket *s , HttpRequest *r );
//bool sendPageWidget ( TcpSocket *s , HttpRequest *r );
//bool sendPageEvents ( TcpSocket *s , HttpRequest *r );
bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r );
bool sendPageGet ( TcpSocket *s , HttpRequest *r );
@ -295,7 +295,7 @@ enum {
// public pages
PAGE_ROOT ,
PAGE_RESULTS ,
PAGE_WIDGET,
//PAGE_WIDGET,
PAGE_ADDURL , // 5
PAGE_GET ,
PAGE_LOGIN ,

Parms.cpp

@ -124,7 +124,7 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) ;
// from PageBasic.cpp:
bool updateSiteListTables(collnum_t collnum,bool addSeeds,char *siteListArg);
bool updateSiteListBuf(collnum_t collnum,bool addSeeds,char *siteListArg);
bool CommandUpdateSiteList ( char *rec ) {
// caller must specify collnum
@ -145,11 +145,12 @@ bool CommandUpdateSiteList ( char *rec ) {
CollectionRec *cr = g_collectiondb.getRec ( collnum );
// get the sitelist
char *data = getDataFromParmRec ( rec );
// update it
updateSiteListTables ( collnum ,
true , // add NEW seeds?
data // entire sitelist
);
// update the table that maps site to whether we should spider it
// and also add newly introduced sites in "data" into spiderdb.
updateSiteListBuf ( collnum ,
true , // add NEW seeds?
data // entire sitelist
);
// now that we deduped the old site list with the new one for
// purposes of adding NEW seeds, we can do the final copy
cr->m_siteListBuf.set ( data );
@ -445,7 +446,7 @@ bool CommandRestartColl ( char *rec , WaitEntry *we ) {
// re-add the buf so it re-seeds spiderdb. it will not dedup these
// urls in "oldSiteList" with "m_siteListBuf" which is now empty.
// "true" = addSeeds.
updateSiteListTables ( newCollnum , true , oldSiteList );
updateSiteListBuf ( newCollnum , true , oldSiteList );
// now put it back
if ( oldSiteList ) cr->m_siteListBuf.safeStrcpy ( oldSiteList );
@ -501,7 +502,7 @@ bool CommandResetColl ( char *rec , WaitEntry *we ) {
// re-add the buf so it re-seeds spiderdb. it will not dedup these
// urls in "oldSiteList" with "m_siteListBuf" which is now empty.
// "true" = addSeeds.
updateSiteListTables ( newCollnum , true , oldSiteList );
updateSiteListBuf ( newCollnum , true , oldSiteList );
// now put it back
if ( oldSiteList ) cr->m_siteListBuf.safeStrcpy ( oldSiteList );
@ -1318,9 +1319,9 @@ bool printDropDown ( long n , SafeBuf* sb, char *name, long select,
bool printDropDownProfile ( SafeBuf* sb, char *name, long select ) {
sb->safePrintf ( "<select name=%s>", name );
// the type of url filters profiles
char *items[] = {"custom","web","news"};
char *items[] = {"custom","web","news","chinese"};
char *s;
for ( long i = 0 ; i < 3 ; i++ ) {
for ( long i = 0 ; i < 4 ; i++ ) {
if ( i == select ) s = " selected";
else s = "";
sb->safePrintf ("<option value=%li%s>%s",i,s,items[i]);
@ -1386,9 +1387,13 @@ bool Parms::printParms (SafeBuf* sb, TcpSocket *s , HttpRequest *r) {
long page = g_pages.getDynamicPageNumber ( r );
long nc = r->getLong("nc",1);
long pd = r->getLong("pd",1);
char *coll = r->getString ( "c" );
if ( ! coll || ! coll[0] ) coll = "main";
CollectionRec *cr = g_collectiondb.getRec ( coll );
char *coll = g_collectiondb.getDefaultColl(r);
CollectionRec *cr = g_collectiondb.getRec(coll);//2(r,true);
//char *coll = r->getString ( "c" );
//if ( ! coll || ! coll[0] ) coll = "main";
//CollectionRec *cr = g_collectiondb.getRec ( coll );
// if "main" collection does not exist, try another
//if ( ! cr ) cr = getCollRecFromHttpRequest ( r );
printParms2 ( sb, page, cr, nc, pd,0,0 , s);
return true;
}
@ -5453,7 +5458,7 @@ void Parms::init ( ) {
m->m_cgi = "live";
m->m_off = (char *)&g_conf.m_isLive - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_def = "1";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m++;
@ -7702,7 +7707,9 @@ void Parms::init ( ) {
"tools. "
"Limit list to 300MB. If you have a lot of INDIVIDUAL urls "
"to add then consider using the <a href=/admin/addurl>add "
"urls</a> interface.";
"urls</a> interface. <b>IF YOU WANT TO SPIDER THE WHOLE "
"WEB</b> then only use the <i>seed:</i> directives here "
"lest you limit yourself to a set of domains.";
m->m_cgi = "sitelist";
m->m_off = (char *)&cr.m_siteListBuf - x;
m->m_page = PAGE_BASIC_SETTINGS;
@ -8084,6 +8091,14 @@ void Parms::init ( ) {
m->m_priv = 1;
m++;
m->m_title = "log debug image messages";
m->m_cgi = "ldi";
m->m_off = (char *)&g_conf.m_logDebugImage - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m++;
m->m_title = "log debug loop messages";
m->m_cgi = "ldl";
m->m_off = (char *)&g_conf.m_logDebugLoop - g;
@ -8518,6 +8533,29 @@ void Parms::init ( ) {
m->m_flags = PF_DIFFBOT;
m++;
m->m_cgi = "createdtime";
m->m_xml = "collectionCreatedTime";
m->m_desc = "Time when this collection was created, or time of "
"the last reset or restart.";
m->m_off = (char *)&cr.m_diffbotCrawlStartTime - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "0";
m->m_flags = 0;//PF_DIFFBOT;
m++;
m->m_cgi = "spiderendtime";
m->m_xml = "crawlEndTime";
m->m_desc = "If spider is done, when did it finish.";
m->m_off = (char *)&cr.m_diffbotCrawlEndTime - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m->m_def = "0";
m->m_flags = 0;//PF_DIFFBOT;
m++;
m->m_cgi = "dbcrawlname";
m->m_xml = "diffbotCrawlName";
m->m_off = (char *)&cr.m_diffbotCrawlName - x;
@ -10030,6 +10068,28 @@ void Parms::init ( ) {
m->m_group = 0;
m++;
m->m_title = "make image thumbnails";
m->m_desc = "Try to find the best image on each page and "
"store it as a thumbnail for presenting in the search "
"results.";
m->m_cgi = "mit";
m->m_off = (char *)&cr.m_makeImageThumbnails - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m++;
m->m_title = "index spider replies";
m->m_desc = "Index the spider replies of every url the spider "
"attempts to spider. Search for them using special "
"query operators like type:status or gberrorstr:success or "
"stats:gberrornum to get a histogram. They will not otherwise "
"show up in the search results.";
m->m_cgi = "isr";
m->m_off = (char *)&cr.m_indexSpiderReplies - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m++;
// i put this in here so i can save disk space for my global
// diffbot json index
m->m_title = "index body";
@ -15493,8 +15553,9 @@ void Parms::init ( ) {
m++;
m->m_title = "stream search results";
m->m_desc = "Stream search results back on socket as they arrive. Useful "
"when thousands of search results are requested.";
m->m_desc = "Stream search results back on socket as they arrive. "
"Useful when thousands/millions of search results are "
"requested.";
m->m_soff = (char *)&si.m_streamResults - y;
m->m_type = TYPE_CHAR;
m->m_obj = OBJ_SI;
@ -15505,6 +15566,36 @@ void Parms::init ( ) {
m++;
m->m_title = "max serp docid";
m->m_desc = "Start displaying results after this score/docid pair. "
"Used by widget to append results to end when index is "
"volatile.";
m->m_def = "0";
m->m_soff = (char *)&si.m_minSerpDocId - y;
m->m_type = TYPE_LONG_LONG;
m->m_sparm = 1;
m->m_scgi = "minserpdocid";
m->m_flags = PF_API;
m->m_smin = 0;
m->m_sprpg = 0;
m->m_sprpp = 0;
m++;
m->m_title = "max serp score";
m->m_desc = "Start displaying results after this score/docid pair. "
"Used by widget to append results to end when index is "
"volatile.";
m->m_def = "0";
m->m_soff = (char *)&si.m_maxSerpScore - y;
m->m_type = TYPE_DOUBLE;
m->m_sparm = 1;
m->m_scgi = "maxserpscore";
m->m_flags = PF_API;
m->m_smin = 0;
m->m_sprpg = 0;
m->m_sprpp = 0;
m++;
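// Illustrative sketch (assumption, not part of this change): in the
// widget's append mode these two cursor values would be handed back on
// the next search request, e.g.
//   /search?q=test&c=main&format=append&maxserpscore=3.500000&minserpdocid=1234567890
// so only results ranked after that score/docid pair come back.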
m->m_title = "restrict search to this url";
m->m_desc = "X is the url.";
m->m_sparm = 1;
@ -16387,6 +16478,7 @@ void Parms::init ( ) {
if ( t == TYPE_DATE2 ) size = 4;
if ( t == TYPE_DATE ) size = 4;
if ( t == TYPE_FLOAT ) size = 4;
if ( t == TYPE_DOUBLE ) size = 8;
if ( t == TYPE_IP ) size = 4;
if ( t == TYPE_RULESET ) size = 4;
if ( t == TYPE_LONG ) size = 4;
@ -18735,13 +18827,14 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
"<td>"
"This is true if the url was directly "
"injected from the "
"/inject page or API."
"<a href=/admin/inject>inject page</a> or API."
"</td></tr>"
"<tr class=poo><td>isdocidbased | !isdocidbased</td>"
"<td>"
"This is true if the url was added from the "
"reindex interface. The request does not contain "
"<a href=/admin/reindex>query reindex</a> "
"interface. The request does not contain "
"a url, but only a docid, that way we can add "
"millions of search results very quickly without "
"having to lookup each of their urls. You should "
@ -18932,6 +19025,16 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
"See table below for supported language "
"abbreviations.</td></tr>"
"<tr class=poo><td><nobr>parentlang==zh_cn,zh_tw,xx"
"</nobr></td>"
"<td>Matches if "
"the url's referring parent url is primarily in "
"this language. Useful for prioritizing spidering "
"pages of a certain language."
"See table below for supported language "
"abbreviations."
"</td></tr>"
/*
"<tr class=poo><td>link:gigablast</td>"
"<td>Matches if the document links to gigablast."

@ -22,7 +22,8 @@ enum {
UFP_CUSTOM = 0 ,
UFP_NONE = 0 ,
UFP_WEB = 1 ,
UFP_NEWS = 2
UFP_NEWS = 2 ,
UFP_CHINESE = 3
};
// special priorities for the priority drop down
@ -71,7 +72,8 @@ enum {
TYPE_SITERULE , // 29
TYPE_SAFEBUF ,
TYPE_UFP ,
TYPE_FILEUPLOADBUTTON
TYPE_FILEUPLOADBUTTON,
TYPE_DOUBLE
};
//forward decls to make compiler happy:

@ -4100,6 +4100,10 @@ bool PosdbTable::setQueryTermInfo ( ) {
m_minScoreTermNumInt = -1;
m_maxScoreTermNumInt = -1;
m_hasMaxSerpScore = false;
if ( m_r->m_minSerpDocId )
m_hasMaxSerpScore = true;
//for ( long i = 0 ; i < m_msg2->getNumLists() ; i++ ) {
for ( long i = 0 ; i < m_q->m_numTerms ; i++ ) {
QueryTerm *qt = &m_q->m_qterms[i];
@ -6618,6 +6622,10 @@ void PosdbTable::intersectLists10_r ( ) {
// no term?
if ( ! miniMergedList[m_sortByTermNumInt] ) goto advance;
intScore = g_posdb.getInt( miniMergedList[m_sortByTermNumInt]);
// do this so hasMaxSerpScore below works, although
// because of roundoff errors we might lose a docid
// through the cracks in the widget.
//score = (float)intScore;
}
// skip docid if outside of range
@ -6656,12 +6664,36 @@ void PosdbTable::intersectLists10_r ( ) {
if ( score3 > m_maxScoreValInt ) goto advance;
}
// now we have a maxscore/maxdocid upper range so the widget
// can append only new results to an older result set.
if ( m_hasMaxSerpScore ) {
// if dealing with an "int" score use the extra precision
// of the double that m_maxSerpScore is!
if ( m_sortByTermNumInt >= 0 ) {
if ( intScore > (long)m_r->m_maxSerpScore )
goto advance;
if ( intScore == (long)m_r->m_maxSerpScore &&
(long long)m_docId <= m_r->m_minSerpDocId )
goto advance;
}
else {
if ( score > (float)m_r->m_maxSerpScore )
goto advance;
if ( score == m_r->m_maxSerpScore &&
(long long)m_docId <= m_r->m_minSerpDocId )
goto advance;
}
}
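// Illustrative worked example (not part of this change): the checks above
// act like a pagination cursor. With maxSerpScore=3.5 and minSerpDocId=200
// a candidate (score,docid) is handled as follows:
//   (4.0, 150) -> skipped, score above the cursor score
//   (3.5, 150) -> skipped, tie on score but docid <= 200
//   (3.5, 999) -> kept, tie on score and docid past the cursor
//   (2.0,   7) -> kept, score below the cursor score
// so the widget can keep appending without re-receiving older results.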
// . seoDebug hack so we can set "dcs"
// . we only come here if we actually made it into m_topTree
if ( secondPass || m_r->m_seoDebug ) {
dcs.m_siteRank = siteRank;
dcs.m_finalScore = score;
// a double can capture an int without dropping any bits,
// unlike a mere float
if ( m_sortByTermNumInt >= 0 )
dcs.m_finalScore = (double)intScore;
dcs.m_docId = m_docId;
dcs.m_numRequiredTerms = m_numQueryTermInfos;
dcs.m_docLang = docLang;

@ -576,6 +576,8 @@ class PosdbTable {
unsigned long long m_docIdHack;
bool m_hasMaxSerpScore;
// hack for seo.cpp:
float m_finalScore;
float m_preFinalScore;
@ -795,7 +797,10 @@ class DocIdScore {
bool serialize ( class SafeBuf *sb );
long long m_docId;
float m_finalScore;
// made this a double because of intScores which can't be captured
// fully with a float. intScores are used to sort by spidered time
// for example. see Posdb.cpp "intScore".
double m_finalScore;
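// Illustrative note (not part of this change): a float only has a 24-bit
// significand, so integers above 2^24 lose precision; e.g. the spider
// timestamp 1391749680 rounds to 1391749632 as a float, while a double
// (53-bit significand) stores any 32-bit intScore exactly.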
char m_siteRank;
long m_docLang; // langId
long m_numRequiredTerms;

@ -112,6 +112,9 @@ char *g_files[] = {
// required for SSL server support for both getting web pages
// on https:// sites and for serving https:// pages
"gb.pem",
// the main binary!
"gb",
//"dict/unifiedDict",
//"dict/thesaurus.txt",
@ -150,30 +153,37 @@ char *g_files[] = {
"antiword-dir/koi8-r.txt",
"antiword-dir/koi8-u.txt",
"antiword-dir/roman.txt",
// . thumbnail generation
// . use 'apt-get install netpbm' to install
//"/usr/bin/giftopnm",
//"/usr/bin/tifftopnm",
//"/usr/bin/pngtopnm",
//"/usr/bin/jpegtopnm",
//"/usr/bin/bmptopnm",
//"/usr/bin/pnmscale",
//"/usr/bin/ppmtojpeg",
//"/usr/sbin/smartctl",
//"giftopnm",
//"tifftopnm",
//"pngtopnm",
//"jpegtopnm",
//"bmptopnm",
//"pnmscale",
//"ppmtojpeg",
// . thumbnail generation
// . i used 'apt-get install netpbm' to install
"bmptopnm",
"giftopnm",
"jpegtopnm",
"libjpeg.so.62",
"libnetpbm.so.10",
"libpng12.so.0",
"libtiff.so.4",
"libz.so.1",
"LICENSE",
"pngtopnm",
"pnmscale",
"ppmtojpeg",
"tifftopnm",
"mysynonyms.txt",
//"smartctl",
"wikititles.txt.part1",
"wikititles.txt.part2",
"wiktionary-buf.txt",
"wiktionary-lang.txt",
"wiktionary-syns.dat",
"unifiedDict.txt",
//"unifiedDict-buf.txt",
//"unifiedDict-map.dat",
//
// this junk can be generated
@ -188,6 +198,31 @@ char *g_files[] = {
};
bool Process::getFilesToCopy ( char *srcDir , SafeBuf *buf ) {
// sanity
long slen = gbstrlen(srcDir);
if ( srcDir[slen-1] != '/' ) { char *xx=NULL;*xx=0; }
for ( long i = 0 ; i < (long)sizeof(g_files)/4 ; i++ ) {
// terminate?
if ( ! g_files[i] ) break;
// skip subdir shit it won't work
if ( strstr(g_files[i],"/") ) continue;
// if not first
if ( i > 0 ) buf->pushChar(' ');
// append it
buf->safePrintf("%s%s"
, srcDir
, g_files[i] );
}
// and the required runtime subdirs
buf->safePrintf(" %santiword-dir",srcDir);
buf->safePrintf(" %sucdata",srcDir);
buf->safePrintf(" %shtml",srcDir);
return true;
}
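// Minimal usage sketch (assumption, not part of this change): build the
// space-separated list of files to copy when installing to another host.
//   SafeBuf fileList;
//   g_process.getFilesToCopy ( "/home/gb/" , &fileList );
//   // fileList now holds "/home/gb/gb.pem /home/gb/gb ... /home/gb/html"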
bool Process::checkFiles ( char *dir ) {
@ -265,6 +300,11 @@ bool Process::checkFiles ( char *dir ) {
}
if ( needsFiles ) {
log("db: Missing files. See above. Exiting.");
return false;
}
//if ( needsFiles ) {
// log("db: use 'apt-get install -y netpbm' to install "
// "pnmfiles");
@ -286,12 +326,16 @@ bool Process::checkFiles ( char *dir ) {
if ( ! g_conf.m_isLive ) return true;
m_swapEnabled = 0;
// first check to make sure swap is off
SafeBuf psb;
if ( psb.fillFromFile("/proc/swaps") < 0 ) {
log("gb: failed to read /proc/swaps");
if ( ! g_errno ) g_errno = EBADENGINEER;
return true;
//if ( ! g_errno ) g_errno = EBADENGINEER;
//return true;
// if we don't know if swap is enabled or not, use -1
m_swapEnabled = -1;
}
/*
@ -307,9 +351,15 @@ bool Process::checkFiles ( char *dir ) {
mstrerror(g_errno));
buf[size] = '\0';
*/
char *buf = psb.getBufStart();
if ( strstr ( buf,"dev" ) )
return log("gb: can not start live gb with swap enabled.");
// we should redbox this! or at least be on the optimizations page
if ( m_swapEnabled == 0 ) {
char *buf = psb.getBufStart();
if ( strstr ( buf,"dev" ) )
//return log("gb: can not start live gb with swap "
//"enabled.");
m_swapEnabled = 1;
}
// . make sure elvtune is being set right
// . must be in /etc/rcS.d/S99local
@ -336,6 +386,9 @@ bool Process::checkFiles ( char *dir ) {
mfree ( buf , size+1, "S99" );
*/
// now that we are open source skip the checks below
return true;
// check kernel version
FILE *fd;
fd = fopen ( "/proc/version" , "r" );
@ -377,7 +430,7 @@ bool Process::checkFiles ( char *dir ) {
"MST 2008\n")== 0)
return true;
log("gb: kernel version is not an approved version.");
return false;
//return false;
return true;
}

@ -16,6 +16,7 @@ class Process {
public:
bool getFilesToCopy ( char *srcDir , class SafeBuf *buf ) ;
bool checkFiles ( char *dir );
// . the big save command
@ -94,6 +95,7 @@ class Process {
long m_desiredFanState;
float m_diskUsage;
long long m_diskAvail;
char m_swapEnabled;
};
extern Process g_process;

@ -3084,9 +3084,15 @@ struct QueryField g_fields[] = {
{"isclean", FIELD_ISCLEAN, true,"Matches all pages that are deemed non-offensive and safe for children."},
{"gbrss", FIELD_GBRSS, true,"Matches all pages that are rss feeds."},
//{"gbruleset",FIELD_GBRULESET, true,"Obsolete."},
{"type", FIELD_TYPE, false,"Matches all pages of the specified file type. Example: type:pdf will match pdf documents, regardless of their file extension."},
{"type", FIELD_TYPE, false,"Matches all pages of the specified file type. Example: type:pdf will match pdf documents, regardless of their file extension. Examples: type:doc type:status type:json type:xls"},
{"filetype", FIELD_TYPE, false,"Same as type:"},
{"gbisadult",FIELD_TYPE,false,"use gbisadult:0 and gbisadult:1 to restrict results to non-adult and adult documents respectively."},
{"gbimage",FIELD_URL,false,"use gbimage:<url> to return all documents containing that image url."},
{"gbstatus",FIELD_TYPE,false,"If document is a spider reply, then search the spider status as a number using this. 0 means success, so gbstatus:0 would return all successful statuses."},
{"gbstatusmsg",FIELD_TYPE,false,"If document is a spider reply, then search the spider status description, which might be something like 'TCP Timed out' or 'Robots.txt disallows' or 'Success', if no error."},
{"gbhasthumbnail",FIELD_TYPE,false,"use gbhasthumbnail:0 and gbhasthumbnail:1 to restrict results to those that do not have or have thumbnails respectively."},
{"gbtag*", FIELD_TAG, false,"Matches all pages whose tag named * have the specified value. Example: gbtagingoogle:1 matches all pages that have a value of 1 for their ingoogle tag in tagdb."},
{"zip", FIELD_ZIP, false,"Matches all pages that have the specified zip code in their meta zip code tag. Not to be used with events."},
{"zipcode", FIELD_ZIP, false,"Same as zip:"},
@ -3143,7 +3149,7 @@ struct QueryField g_fields[] = {
},
{"gbminint", FIELD_GBNUMBERMININT, false,
"Example: 'gbminint:spiderdate:1391749680' "
"Example: 'gbminint:gbspiderdate:1391749680' "
"'gbminint:count:99'. Numeric "
"fields can be in JSON or in meta tag. "
"Use 'gbspiderdate' field for the last time the page was "
@ -3151,7 +3157,7 @@ struct QueryField g_fields[] = {
},
{"gbmaxint", FIELD_GBNUMBERMAXINT, false,
"Example: 'gbmaxint:spiderdate:1391749680' "
"Example: 'gbmaxint:gbspiderdate:1391749680' "
"'gbmaxint:count:99'. Numeric "
"fields can be in JSON or in meta tag. "
"Use 'gbspiderdate' field for the last time the page was "

Rdb.cpp

@ -241,6 +241,9 @@ bool Rdb::init ( char *dir ,
// . set tree to use our fixed data size
// . returns false and sets g_errno on error
if(m_useTree) {
long rdbId = m_rdbId;
// statsdb is really collectionless so pass -1 on to the tree
if ( rdbId == RDB_STATSDB ) rdbId = -1;
if ( ! m_tree.set ( fixedDataSize ,
maxTreeNodes , // max # nodes in tree
isTreeBalanced ,
@ -253,7 +256,7 @@ bool Rdb::init ( char *dir ,
// make useProtection true for debugging
false , // use protection?
false , // alowdups?
m_rdbId ) )
rdbId ) )
return false;
}
else {
@ -621,6 +624,24 @@ bool Rdb::deleteAllRecs ( collnum_t collnum ) {
return true;
}
bool makeTrashDir() {
char trash[1024];
sprintf(trash, "%strash/",g_hostdb.m_dir);
if ( ::mkdir ( trash,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) == -1 ) {
if ( errno != EEXIST ) {
log("dir: mkdir %s had error: %s",
trash,mstrerror(errno));
return false;
}
// clear it
errno = 0;
}
return true;
}
bool Rdb::deleteColl ( collnum_t collnum , collnum_t newCollnum ) {
@ -685,12 +706,7 @@ bool Rdb::deleteColl ( collnum_t collnum , collnum_t newCollnum ) {
(long)collnum,gettimeofdayInMilliseconds());
//Dir d; d.set ( dname );
// ensure ./trash dir is there
char trash[1024];
sprintf(trash, "%strash/",g_hostdb.m_dir);
::mkdir ( trash,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) ;
makeTrashDir();
// move into that dir
::rename ( oldname , newname );
@ -1089,8 +1105,8 @@ bool Rdb::loadTree ( ) {
return log("db: Could not load saved buckets.");
long numKeys = m_buckets.getNumKeys();
log("db: Loaded %li recs from %s's buckets on disk.",
numKeys, m_dbname);
// log("db: Loaded %li recs from %s's buckets on disk.",
// numKeys, m_dbname);
if(!m_buckets.testAndRepair()) {
log("db: unrepairable buckets, "
@ -1482,6 +1498,8 @@ bool Rdb::dumpCollLoop ( ) {
// just modify DiskPageCache.cpp to ignore breaches.
if(m_useTree) maxFileSize = m_tree.getMemOccupiedForList ();
else maxFileSize = m_buckets.getMemOccupied();
// sanity
if ( maxFileSize < 0 ) { char *xx=NULL;*xx=0; }
// because we are actively spidering the list we dump ends up
// being more, by like 20% or so, otherwise we do not make a
// big enough diskpagecache and it logs breach msgs... does not
@ -2389,9 +2407,10 @@ bool Rdb::addRecord ( collnum_t collnum,
}
}
// . cancel any spider request that is a dup in the dupcache to save disk space
// . twins might have different dupcaches so they might have different dups, but
// it shouldn't be a big deal because they are dups!
// . cancel any spider request that is a dup in the dupcache to save
// disk space
// . twins might have different dupcaches so they might have different
// dups, but it shouldn't be a big deal because they are dups!
if ( m_rdbId == RDB_SPIDERDB && ! KEYNEG(key) ) {
// . this will create it if spiders are on and its NULL
// . even if spiders are off we need to create it so
@ -2402,12 +2421,18 @@ bool Rdb::addRecord ( collnum_t collnum,
SpiderRequest *sreq=(SpiderRequest *)(orig-4-sizeof(key128_t));
// is it really a request and not a SpiderReply?
char isReq = g_spiderdb.isSpiderRequest ( &sreq->m_key );
// skip if in dup cache. do NOT add to cache since addToWaitingTree()
// in Spider.cpp will do that when called from addSpiderRequest() below
if ( isReq && sc->isInDupCache ( sreq , false ) ) return true;
// skip if in dup cache. do NOT add to cache since
// addToWaitingTree() in Spider.cpp will do that when called
// from addSpiderRequest() below
if ( isReq && sc->isInDupCache ( sreq , false ) ) {
if ( g_conf.m_logDebugSpider )
log("spider: adding spider req %s is dup. "
"skipping.",sreq->m_url);
return true;
}
}
if ( m_useTree && (tn=m_tree.addNode ( collnum, key , data , dataSize ))>=0) {
if ( m_useTree && (tn=m_tree.addNode (collnum,key,data,dataSize))>=0) {
// if adding to spiderdb, add to cache, too
if ( m_rdbId != RDB_SPIDERDB && m_rdbId != RDB_DOLEDB )
return true;
@ -2453,15 +2478,18 @@ bool Rdb::addRecord ( collnum_t collnum,
// add the request
if ( isReq ) {
// log that. why isn't this undoling always
/*
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: rdb: got spider "
logf(LOG_DEBUG,"spider: rdb: added spider "
"request to spiderdb rdb tree "
"addnode=%li "
"request for uh48=%llu prntdocid=%llu "
"firstIp=%s",
"firstIp=%s spiderdbkey=%s",
tn,
sreq->getUrlHash48(),
sreq->getParentDocId(),
iptoa(sreq->m_firstIp));
*/
iptoa(sreq->m_firstIp),
KEYSTR((char *)&sreq->m_key,
sizeof(key128_t)));
// false means to NOT call evaluateAllRequests()
// because we call it below. the reason we do this
// is because it does not always get called

Rdb.h

@ -13,6 +13,8 @@
//#include "Dir.h"
#include "RdbBuckets.h"
bool makeTrashDir() ;
// . each Rdb instance has an ID
// . these ids are also return values for getIdFromRdb()
#define RDB_START 1

@ -594,9 +594,9 @@ bool RdbBuckets::set ( long fixedDataSize , long maxMem,
return false;
}
log("init: Successfully initialized buckets for %s, "
"keysize is %li, max mem is %li, datasize is %li",
m_dbname, (long)m_ks, m_maxMem, m_fixedDataSize);
// log("init: Successfully initialized buckets for %s, "
// "keysize is %li, max mem is %li, datasize is %li",
// m_dbname, (long)m_ks, m_maxMem, m_fixedDataSize);
/*
@ -719,12 +719,12 @@ bool RdbBuckets::resizeTable(long numNeeded) {
g_errno = ENOMEM;
return false;
}
log(LOG_INFO,
"db: scaling down request for buckets. "
"Currently have %li "
"buckets, asked for %li, max number of buckets"
" for %li bytes is %li.",
m_maxBuckets, numNeeded, m_maxMem, m_maxBucketsCapacity);
// log(LOG_INFO,
// "db: scaling down request for buckets. "
// "Currently have %li "
// "buckets, asked for %li, max number of buckets"
// " for %li bytes is %li.",
// m_maxBuckets, numNeeded, m_maxMem, m_maxBucketsCapacity);
numNeeded = m_maxBucketsCapacity;
}
@ -1114,6 +1114,7 @@ bool RdbBuckets::selfTest(bool thorough, bool core) {
last = kk;
lastcoll = b->getCollnum();
}
if ( totalNumKeys != m_numKeysApprox )
log(LOG_WARN, "db have %li keys, should have %li. "
"%li buckets in %li colls for db %s",
totalNumKeys, m_numKeysApprox, m_numBuckets,

@ -1091,7 +1091,14 @@ bool RdbList::removeBadData_r ( ) {
// . if not fixed size, remove all the data for now
// . TODO: make this better, man
if ( m_fixedDataSize == -1 ) {
reset();
// don't call reset because it sets m_ks back to 12
//reset();
m_listSize = 0;
m_list = NULL;
m_listPtr = NULL;
m_listEnd = NULL;
m_mergeMinListSize = -1;
m_lastKeyIsValid = false;
return true;
}
//key_t oldk;

@ -1140,7 +1140,7 @@ void RdbTree::deleteOrderedList ( collnum_t collnum ,
bool RdbTree::fixTree ( ) {
// on error, fix the linked list
//log("RdbTree::fixTree: tree was corrupted on disk?");
log("db: Trying to fix tree.");
log("db: Trying to fix tree for %s.",m_dbname);
log("db: %li occupied nodes and %li empty "
"of top %li nodes.",
m_numUsedNodes , m_minUnusedNode - m_numUsedNodes ,
@ -1171,6 +1171,9 @@ bool RdbTree::fixTree ( ) {
// verify collnum
if ( cn < 0 ) continue;
if ( cn >= max ) continue;
// collnum of non-existent coll
if ( m_rdbId>=0 && ! g_collectiondb.m_recs[cn] )
continue;
// now add just to set m_right/m_left/m_parent
if ( m_fixedDataSize == 0 )
addNode(cn,&m_keys[i*m_ks], NULL, 0 );
@ -1183,11 +1186,11 @@ bool RdbTree::fixTree ( ) {
count++;
}
log("db: Fix tree removed %li nodes.",n - count);
log("db: Fix tree removed %li nodes for %s.",n - count,m_dbname);
// ensure it is still good
if ( ! checkTree ( false , true ) )
return log("db: Fix tree failed.");
log("db: Fix tree succeeded.");
log("db: Fix tree succeeded for %s.",m_dbname);
return true;
}
@ -1229,6 +1232,12 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
// for posdb
if ( m_ks == 18 &&(m_keys[i*m_ks] & 0x06) ) {
char *xx=NULL;*xx=0; }
// bad collnum?
collnum_t cn = m_collnums[i];
if ( m_rdbId>=0 && (cn >= g_collectiondb.m_numRecs || cn < 0) )
return log("db: bad collnum in tree");
if ( m_rdbId>=0 && ! g_collectiondb.m_recs[cn] )
return log("db: collnum is obsolete in tree");
// if no left/right kid it MUST be -1
if ( m_left[i] < -1 )
return log(
@ -1305,8 +1314,12 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
if ( ! doChainTest ) continue;
// ensure i goes back to head node
long j = i;
long loopCount = 0;
while ( j >= 0 ) {
if ( j == m_headNode ) break;
// sanity -- loop check
if ( ++loopCount > 10000 )
return log("db: tree had loop");
j = m_parents[j];
}
if ( j != m_headNode )
@ -2799,8 +2812,10 @@ long RdbTree::fastLoadBlock ( BigFile *f ,
m_corrupt++;
continue;
}
// must have rec as well
if ( ! recs[c] ) {
// must have rec as well. unless it is the statsdb tree
// or m_waitingTree which are collection-less and always use
// 0 for their collnum. if collection-less m_rdbId==-1.
if ( ! recs[c] && m_rdbId >= 0 ) {
m_corrupt++;
continue;
}
@ -3063,10 +3078,15 @@ long RdbTree::oldLoadBlock ( BigFile *f, long remainingNodes , RdbMem *stack,
void RdbTree::cleanTree ( ) { // char **bases ) {
// some trees always use 0 for all node collnum_t's like
// statsdb, waiting tree etc.
if ( m_rdbId < 0 ) return;
// the liberation count
long count = 0;
collnum_t collnum;
long max = g_collectiondb.m_numRecs;
for ( long i = 0 ; i < m_minUnusedNode ; i++ ) {
// skip node if parents is -2 (unoccupied)
if ( m_parents[i] == -2 ) continue;
@ -3103,7 +3123,8 @@ void RdbTree::cleanTree ( ) { // char **bases ) {
}
long RdbTree::getNumNegativeKeys ( collnum_t collnum ) {
if ( m_rdbId < 0 ) { char *xx=NULL;*xx=0; }
// fix for statsdb or other collectionless rdbs
if ( m_rdbId < 0 ) return m_numNegativeKeys;
CollectionRec *cr = g_collectiondb.m_recs[collnum];
if ( ! cr ) return 0;
//if ( ! m_countsInitialized ) { char *xx=NULL;*xx=0; }
@ -3111,7 +3132,8 @@ long RdbTree::getNumNegativeKeys ( collnum_t collnum ) {
}
long RdbTree::getNumPositiveKeys ( collnum_t collnum ) {
if ( m_rdbId < 0 ) { char *xx=NULL;*xx=0; }
// fix for statsdb or other collectionless rdbs
if ( m_rdbId < 0 ) return m_numPositiveKeys;
CollectionRec *cr = g_collectiondb.m_recs[collnum];
if ( ! cr ) return 0;
//if ( ! m_countsInitialized ) { char *xx=NULL;*xx=0; }

@ -2027,11 +2027,11 @@ bool Repair::injectTitleRec ( ) {
xd->m_tagRecValid = false;
// rebuild the title rec! otherwise we re-add the old one!!!!!!!
xd->m_titleRecValid = false;
xd->m_titleRecBufValid = false;
// free it since set2() should have uncompressed it!
//mfree ( titleRec , titleRecSize, "repair" );
// and so xd doesn't free it
xd->m_titleRec = NULL;
xd->m_titleRecBuf.purge();// = NULL;
// use the ptr_utf8Content that we have
xd->m_recycleContent = true;

@ -3285,3 +3285,92 @@ bool SafeBuf::csvEncode ( char *s , long len , long niceness ) {
return true;
}
bool SafeBuf::base64Encode ( char *sx , long len , long niceness ) {
unsigned char *s = (unsigned char *)sx;
if ( ! s ) return true;
// base64 output is about 4/3 the input size; reserving 2x the input
// plus '=' padding is more than enough
long need = len * 2 + 1 +3; // +3 for = padding
if ( ! reserve ( need ) ) return false;
// tmp vars
char *dst = m_buf + m_length;
long round = 0;
// the table of 64 entities
static char tab[] = {
'A','B','C','D','E','F','G','H','I','J','K','L','M',
'N','O','P','Q','R','S','T','U','V','W','X','Y','Z',
'a','b','c','d','e','f','g','h','i','j','k','l','m',
'n','o','p','q','r','s','t','u','v','w','x','y','z',
'0','1','2','3','4','5','6','7','8','9','+','/'
};
unsigned char val;
// scan through all
unsigned char *send = s + len;
for ( ; s < send ; ) {
// breathe
QUICKPOLL ( niceness );
unsigned char c1 = s[0];
unsigned char c2 = 0;
//unsigned char c3 = 0;
if ( s+1 < send ) c2 = s[1];
else c2 = 0;
if ( round == 0 ) {
val = c1 >>2;
}
else if ( round == 1 ) {
val = (c1 & 0x03) << 4;
val |= c2 >> 4;
// done with the first input byte now
s++;
}
else if ( round == 2 ) {
val = ((c1 & 0x0f) << 2);
val |= ((c2 & 0xc0) >> 6);
s++;
}
else if ( round == 3 ) {
val = (c1 & 0x3f);
s++;
}
// map the 6-bit value to its base64 character
*dst = tab[val];
// point to next char
dst++;
// keep going if more left
if ( s < send ) {
// repeat every 4 cycles since it is aligned then
if ( ++round == 4 ) round = 0;
continue;
}
// if we are done do padding
if ( round == 0 ) {
*dst++ = '=';
}
if ( round == 1 ) {
*dst++ = '=';
*dst++ = '=';
}
if ( round == 2 ) {
*dst++ = '=';
}
}
m_length += dst - (m_buf + m_length);
nullTerm();
return true;
}
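// Minimal usage sketch (not part of this change): the output matches
// standard base64, including '=' padding for short tails.
//   SafeBuf out;
//   out.base64Encode ( "Man" , 3 ); // out now holds "TWFu"
//   // "Ma" -> "TWE="   "M" -> "TQ=="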

@ -110,6 +110,8 @@ struct SafeBuf {
bool csvEncode ( char *s , long len , long niceness = 0 );
bool base64Encode ( char *s , long len , long niceness = 0 );
//bool pushLong ( long val ) { return safeMemcpy((char *)&val,4); }
bool cat(SafeBuf& c);
// . only cat the sections/tag that start with "tagFilter"

@ -394,6 +394,22 @@ m if (! cr->hasSearchPermission ( sock, encapIp ) ) {
v = atof(m->m_def);
*(float *)x = (float)v;
}
else if ( m->m_type == TYPE_DOUBLE ) {
double v = 0;
if ( def )
v = *(double *)def;
else if ( m->m_def )
v = atof(m->m_def);
*(double *)x = (double)v;
}
else if ( m->m_type == TYPE_LONG_LONG ) {
long long v = 0;
if ( def )
v = *(long long *)def;
else if ( m->m_def )
v = atoll(m->m_def);
*(long long *)x = (long long)v;
}
else if ( m->m_type == TYPE_STRING ||
m->m_type == TYPE_STRINGBOX ) {
//if ( m->m_cgi && strcmp ( m->m_cgi, "erpc" ) == 0 )
@ -549,6 +565,27 @@ m if (! cr->hasSearchPermission ( sock, encapIp ) ) {
// "name=%s value=\"%li\">\n",
// cgi , v );
}
else if ( m->m_type == TYPE_LONG_LONG ) {
// default was set above
			long long def = *(long long *)x;
// assume default
long long v = def;
// but cgi parms override cookie
v = r->getLongLong ( cgi , v );
			// but if it's a privileged parm and we're not an admin
// then do not allow overrides, but m_priv of 3 means
// to not display for clients, but to allow overrides
if ( ! m_isAdmin && m->m_priv && m->m_priv!=3) v = def;
// set it
*(long long *)x = v;
// if it is the same as its default, and the default is
// always from m_def and never from the CollectionRec,
			// then do not bother storing it in here! what's the
// point?
if ( v == def && m->m_off < 0 ) continue;
// if not default do not propagate
if ( v == def ) continue;
}
else if ( m->m_type == TYPE_FLOAT ) {
// default was set above
float def = *(float *)x;
@ -587,6 +624,34 @@ m if (! cr->hasSearchPermission ( sock, encapIp ) ) {
// "name=%s value=\"%f\">\n",
// cgi , v );
}
else if ( m->m_type == TYPE_DOUBLE ) {
// default was set above
double def = *(double *)x;
// get overriding from http request, if any
double v;
			// but if it's a privileged parm and we're not an admin
// then do not allow overrides
if ( ! m_isAdmin && m->m_priv && m->m_priv!=3) v = def;
else v = r->getDouble( cgi , def );
// bounds checks
if ( v < m->m_smin ) v = m->m_smin;
if ( v > m->m_smax ) v = m->m_smax;
if ( m->m_sminc >= 0 ) {
double vmin=*(double *)((char *)cr+m->m_sminc);
if ( v < vmin ) v = vmin;
}
if ( m->m_smaxc >= 0 ) {
double vmax=*(double *)((char *)cr+m->m_smaxc);
if ( v > vmax ) v = vmax;
}
// set it
*(double *)x = v;
// include for sure if explicitly provided
char *vp = r->getValue(cgi, NULL, NULL);
if ( ! vp ) continue;
// unchanged from default?
if ( v == def ) continue;
}
else if ( m->m_type == TYPE_BOOL ) {
// default was set above
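The new TYPE_LONG_LONG and TYPE_DOUBLE branches follow the same pattern as the existing long/float handlers: start from the default, let the cgi parm override it unless the parm is privileged and the caller is not an admin, clamp doubles against the static and dynamic bounds, and skip storage when the value is still the default. A stripped-down sketch of that pattern with made-up helper names (not the actual Parms.cpp code):

	// hedged sketch; "isPriv" stands in for the m->m_priv checks above
	double overrideDouble ( HttpRequest *r , char *cgi , double def ,
				bool isAdmin , bool isPriv ,
				double smin , double smax ) {
		double v = def;
		// privileged parms can only be overridden by the admin
		if ( isAdmin || ! isPriv ) v = r->getDouble ( cgi , def );
		// clamp to the static bounds from the Parm entry
		if ( v < smin ) v = smin;
		if ( v > smax ) v = smax;
		return v;
	}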
@ -927,11 +992,12 @@ m if (! cr->hasSearchPermission ( sock, encapIp ) ) {
if(m_firstResultNum < 0) m_firstResultNum = 0;
// DEBUG: temp hack
static bool first = true;
if ( first ) {
first = false;
m_firstResultNum = 1;
}
// static bool first = true;
// if ( first ) {
// first = false;
// m_firstResultNum = 10;
// }
// if useCache is -1 then pick a default value
if ( m_useCache == -1 ) {
@ -1422,6 +1488,8 @@ char getFormatFromRequest ( HttpRequest *r ) {
format=FORMAT_WIDGET_IFRAME;
if ( formatStr && strcmp(formatStr,"ajax")==0)
format=FORMAT_WIDGET_AJAX;
if ( formatStr && strcmp(formatStr,"append")==0)
format=FORMAT_WIDGET_APPEND;
// support old api &xml=1 to mean &format=1
@ -1446,5 +1514,9 @@ char getFormatFromRequest ( HttpRequest *r ) {
format = FORMAT_WIDGET_AJAX;
}
if ( r->getLong("append",0) ) {
format = FORMAT_WIDGET_APPEND;
}
return format;
}
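With both hooks in place a widget client can request the append format either way; two hypothetical example requests (the /search path and q parm are just illustrative):

	/search?q=test&format=append
	/search?q=test&append=1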

@ -362,6 +362,9 @@ class SearchInput {
long m_urlLen2;
char *m_url2;
double m_maxSerpScore;
long long m_minSerpDocId;
// for /get?d=xxxxx&strip=0&ih=1&qh=1
long long m_docId;
long m_strip;

@ -1039,13 +1039,14 @@ bool Speller::loadUnifiedDict() {
char *tail2 = m_unifiedBuf.getBufStart()+h2-1000;
h = hash64 ( tail1 , 1000 , h );
h = hash64 ( tail2 , 1000 , h );
long long n = 8346765853685546681LL;
//long long n = 8346765853685546681LL;
long long n = -14450509118443930LL;
if ( h != n ) {
log("gb: unifiedDict-buf.txt or "
"unifiedDict-map.dat "
"checksum is not approved for "
"live service (%lli != %lli)" ,h,n);
return false;
//return false;
}
return true;

@ -110,7 +110,7 @@ long SpiderRequest::print ( SafeBuf *sbarg ) {
sb->safePrintf("parentDomHash32=0x%lx ",m_parentDomHash32 );
sb->safePrintf("parentSiteHash32=0x%lx ",m_parentSiteHash32 );
sb->safePrintf("hopCount=%li ",m_hopCount );
sb->safePrintf("hopCount=%li ",(long)m_hopCount );
//timeStruct = gmtime ( &m_spiderTime );
//time[0] = 0;
@ -301,7 +301,7 @@ long SpiderRequest::printToTable ( SafeBuf *sb , char *status ,
sb->safePrintf(" <td>%li</td>\n",m_siteNumInlinks );
//sb->safePrintf(" <td>%li</td>\n",m_pageNumInlinks );
sb->safePrintf(" <td>%li</td>\n",m_hopCount );
sb->safePrintf(" <td>%li</td>\n",(long)m_hopCount );
// print time format: 7/23/1971 10:45:32
struct tm *timeStruct ;
@ -436,7 +436,7 @@ long SpiderRequest::printToTableSimple ( SafeBuf *sb , char *status ,
sb->safePrintf(" <td>%li</td>\n",(long)m_errCount );
sb->safePrintf(" <td>%li</td>\n",m_hopCount );
sb->safePrintf(" <td>%li</td>\n",(long)m_hopCount );
// print time format: 7/23/1971 10:45:32
struct tm *timeStruct ;
@ -1026,14 +1026,22 @@ bool tryToDeleteSpiderColl ( SpiderColl *sc ) {
(long)sc,(long)sc->m_collnum);
return true;
}
// this means msg5 is out
if ( sc->m_msg5.m_waitingForList ) {
log("spider: deleting sc=0x%lx for collnum=%li waiting4",
(long)sc,(long)sc->m_collnum);
return true;
}
	// there's still a core of someone trying to write to something
	// in "sc" so we have to try to fix that. somewhere in xmldoc.cpp
	// or spider.cpp. everyone should get sc from cr every time, i'd think
log("spider: deleting sc=0x%lx for collnum=%li",
(long)sc,(long)sc->m_collnum);
// . make sure nobody has it
// . cr might be NULL because Collectiondb.cpp::deleteRec2() might
// have nuked it
CollectionRec *cr = sc->m_cr;
// make sure nobody has it
cr->m_spiderColl = NULL;
if ( cr ) cr->m_spiderColl = NULL;
mdelete ( sc , sizeof(SpiderColl),"postdel1");
delete ( sc );
return true;
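The intent of the checks above: a SpiderColl belonging to a deleted or reset collection is only freed once no Msg5 list read still points at it, and the back-pointer in the CollectionRec is cleared first, null-safely, since deleteRec2() may already have nuked m_cr. A rough sketch of that teardown order (not the literal Spider.cpp code):

	// hedged sketch of the deferred-deletion guard
	void freeSpiderColl ( SpiderColl *sc ) {
		// a list read is still in flight? then defer; caller retries later
		if ( sc->m_msg5.m_waitingForList ) return;
		// unlink from the CollectionRec if it still exists
		if ( sc->m_cr ) sc->m_cr->m_spiderColl = NULL;
		mdelete ( sc , sizeof(SpiderColl) , "postdel1" );
		delete ( sc );
	}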
@ -3075,6 +3083,8 @@ void SpiderColl::populateDoledbFromWaitingTree ( ) { // bool reentry ) {
// reset this
long maxWinners = (long)MAX_WINNER_NODES;
if ( ! m_cr->m_isCustomCrawl ) maxWinners = 1;
if ( m_winnerTree.m_numNodes == 0 &&
! m_winnerTree.set ( -1 , // fixeddatasize
maxWinners , // maxnumnodes
@ -3348,7 +3358,8 @@ bool SpiderColl::evalIpLoop ( ) {
// if we started reading, then assume we got a fresh list here
if ( g_conf.m_logDebugSpider )
log("spider: back from msg5 spiderdb read2");
log("spider: back from msg5 spiderdb read2 of %li bytes",
m_list.m_listSize);
// . set the winning request for all lists we read so far
@ -3539,7 +3550,8 @@ bool SpiderColl::readListFromSpiderdb ( ) {
return false ;
// note its return
if ( g_conf.m_logDebugSpider )
log("spider: back from msg5 spiderdb read");
log("spider: back from msg5 spiderdb read of %li bytes",
m_list.m_listSize);
// no longer getting list
m_gettingList1 = false;
@ -4091,6 +4103,7 @@ bool SpiderColl::scanListForWinners ( ) {
// get the top 100 spider requests by priority/time/etc.
long maxWinners = (long)MAX_WINNER_NODES; // 40
if ( ! m_cr->m_isCustomCrawl ) maxWinners = 1;
// only put 40 urls from the same firstIp into doledb if
// we have a lot of urls in our spiderdb already.
@ -6139,9 +6152,23 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// get priority from doledb key
long pri = g_doledb.getPriority ( doledbKey );
if ( g_conf.m_logDebugSpider )
log("spider: setting pri2=%li nextkey to %s",
m_sc->m_pri2,KEYSTR(&m_sc->m_nextDoledbKey,12));
// if the key went out of its priority because its priority had no
// spider requests then it will bleed over into another priority so
// in that case reset it to the top of its priority for next time
long pri3 = g_doledb.getPriority ( &m_sc->m_nextDoledbKey );
if ( pri3 != m_sc->m_pri2 ) {
m_sc->m_nextDoledbKey = g_doledb.makeFirstKey2 ( m_sc->m_pri2);
// the key must match the priority queue its in as nextKey
//if ( pri3 != m_sc->m_pri2 ) { char *xx=NULL;*xx=0; }
}
if ( g_conf.m_logDebugSpider ) {
long pri4 = g_doledb.getPriority ( &m_sc->m_nextDoledbKey );
log("spider: setting pri2=%li queue doledb nextkey to "
"%s (pri=%li)",
m_sc->m_pri2,KEYSTR(&m_sc->m_nextDoledbKey,12),pri4);
if ( pri4 != m_sc->m_pri2 ) { char *xx=NULL;*xx=0; }
}
// update next doledbkey for this priority to avoid having to
// process excessive positive/negative key annihilations (mdw)
@ -9912,6 +9939,13 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
langLen = gbstrlen(lang);
}
// . get parent language in the request
	// . primary language of the parent page that linked to this url
char *plang = NULL;
long plangLen = 0;
plang = getLanguageAbbr(sreq->m_parentLangId);
if ( plang ) plangLen = gbstrlen(plang);
char *tld = (char *)-1;
long tldLen;
@ -10259,7 +10293,16 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
if ( strncmp(p,"insitelist",10) == 0 ) {
// skip for msg20
//if ( isForMsg20 ) continue;
if ( ! checkedRow ) {
			// if the sitelist contains only seeds and no domain
			// or url is explicitly listed,
// then assume user is spidering the whole internet
// and we basically ignore "insitelist"
if ( sc->m_siteListIsEmpty ) {
// use a dummy row match
row = (char *)1;
}
else if ( ! checkedRow ) {
// only do once for speed
checkedRow = true;
// this function is in PageBasic.cpp
@ -11026,6 +11069,67 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
// come here if we did not match the tld
}
// parentlang=en,zh_cn
if ( *p=='p' && strncmp(p,"parentlang",10)==0){
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// must have a reply
//if ( ! srep ) continue;
// skip if unknown? no, we support "xx" as unknown now
//if ( srep->m_langId == 0 ) continue;
// set these up
char *b = s;
// loop for the comma-separated list of langids
// like parentlang==en,es,...
subloop2b:
// get length of it in the expression box
char *start = b;
while ( *b && !is_wspace_a(*b) && *b!=',' ) b++;
long blen = b - start;
//char sm;
// if we had parentlang==en,es,...
if ( sign == SIGN_EQ &&
blen == plangLen &&
strncasecmp(start,plang,plangLen)==0 )
// if we matched any, that's great
goto matched2b;
// if its parentlang!=en,es,...
			// and we equal the string, then we do not match this
// particular rule!!!
if ( sign == SIGN_NE &&
blen == plangLen &&
strncasecmp(start,plang,plangLen)==0 )
// we do not match this rule if we matched
				// any of the langs in the != list
continue;
// might have another in the comma-separated list
if ( *b != ',' ) {
// if that was the end of the list and the
// sign was == then skip this rule
if ( sign == SIGN_EQ ) continue;
// otherwise, if the sign was != then we win!
if ( sign == SIGN_NE ) goto matched2b;
// otherwise, bad sign?
continue;
}
// advance to next list item if was a comma after us
b++;
// and try again
goto subloop2b;
// come here on a match
matched2b:
// we matched, now look for &&
p = strstr ( b , "&&" );
			// if nothing else, then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
			// come here if we did not match the parentlang
}
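So a url filter row can now key off the language of the referring page, with the same ==/!= and comma-list syntax as the lang and tld rules. Hypothetical example expressions:

	parentlang==zh_cn,zh_tw
	parentlang!=en,de,fr && hopcount==1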
// hopcount == 20 [&&]
if ( *p=='h' && strncmp(p, "hopcount", 8) == 0){
// skip if not valid
@ -12244,6 +12348,8 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
ci->m_hasUrlsReadyToSpider = 0;
// save that!
cr->m_needsSave = true;
// set the time that this happens
cr->m_diffbotCrawlEndTime = getTimeGlobalNoCore();
}
// save it

@ -528,29 +528,37 @@ class SpiderRequest {
// . this is zero if none or invalid
long m_contentHash32;
/*
char m_reserved1;
// . each request can have a different hop count
// . this is only valid if m_hopCountValid is true!
// . i made this a short from long to support m_parentLangId etc above
short m_hopCount;
	// when creating a Chinese search engine, for instance, it is nice
	// to know the language of the parent of the page we are spidering.
	// typically a Chinese page will link to another Chinese page,
	// though not always of course. this is the primary language of
	// the parent.
uint8_t m_parentLangId;//reserved1;
// the new add url control will allow user to control link spidering
// on each url they add. they can also specify file:// instead of
// http:// to index local files. so we have to allow file://
char m_onlyAddSameDomainLinks :1;
char m_onlyAddSameSubdomainLinks :1;
char m_onlyDoNotAddLinksLinks :1; // max hopcount 1
char m_onlyDoNotAddLinksLinksLinks :1; // max hopcount 2
/* char m_onlyAddSameDomainLinks :1; */
/* char m_onlyAddSameSubdomainLinks :1; */
/* char m_onlyDoNotAddLinksLinks :1; // max hopcount 1 */
/* char m_onlyDoNotAddLinksLinksLinks :1; // max hopcount 2 */
char m_reserved2a:1;
char m_reserved2b:1;
char m_reserved2c:1;
char m_reserved2d:1;
char m_reserved2e:1;
char m_reserved2f:1;
char m_reserved2g:1;
char m_reserved2h:1;
// . each request can have a different hop count
// . this is only valid if m_hopCountValid is true!
short m_hopCount;
*/
long m_hopCount;
//long m_hopCount;
// . this is now computed dynamically often based on the latest
// m_addedTime and m_percentChanged of all the SpideRec *replies*.
@ -715,6 +723,8 @@ class SpiderRequest {
m_ufn = -1;
// this too
m_priority = -1;
// this happens to be zero already, but just in case it changes
m_parentLangId = langUnknown;
};
static long getNeededSize ( long urlLen ) {

@ -116,12 +116,28 @@ bool Title::setTitle ( XmlDoc *xd ,
SafeBuf jsonTitle;
long vlen = 0;
if ( xd->m_contentType == CT_JSON ) {
// shortcut
char *s = xd->ptr_utf8Content;
char *jt;
jt = getJSONFieldValue(xd->ptr_utf8Content,"title",&vlen);
jt = getJSONFieldValue(s,"title",&vlen);
if ( jt && vlen > 0 ) {
jsonTitle.safeDecodeJSONToUtf8 (jt, vlen, m_niceness);
//true ); // decodeAll?
jsonTitle.nullTerm();
}
// if we got a product, try getting price
long oplen;
char *op = getJSONFieldValue(s,"offerPrice",&oplen);
if ( op && oplen ) {
if ( ! is_digit(op[0]) ) { op++; oplen--; }
float price = atof2(op,oplen);
// print without decimal point if ends in .00
if ( (float)(long)price == price )
jsonTitle.safePrintf(", &nbsp; $%li",
(long)price);
else
jsonTitle.safePrintf(", &nbsp; $%.02f",price);
}
if ( jsonTitle.length() ) {
val = jsonTitle.getBufStart();
vlen = jsonTitle.length();
}
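For a product object this means the displayed title carries the price as well; a hypothetical example of what the block above produces:

	// hypothetical JSON input (ptr_utf8Content):
	//   { "title":"Acme Widget", "offerPrice":"$19.99" }
	// resulting title text:
	//   Acme Widget, &nbsp; $19.99
	// whole-dollar prices such as "$20.00" drop the decimals:
	//   Acme Widget, &nbsp; $20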

@ -50,9 +50,16 @@ bool Wiki::load() {
close ( fd2 );
// save text size for getRandomPhrase() function below
m_txtSize = stats1.st_size;
// just use the .dat if we got it
if ( ! errno2 ) {
log(LOG_INFO,"wiki: Loading %s",ff2);
// "dir" is NULL since already included in ff2
return m_ht.load ( NULL , ff2 );
}
// if we got a newer binary version, use that
if ( ! errno2 && ! errno1 && stats2.st_mtime > stats1.st_mtime ) {
log(LOG_INFO,"wiki: loading %s",ff2);
	// add in 10 seconds of slack, i guess
if ( ! errno2 && ! errno1 && stats2.st_mtime +10> stats1.st_mtime ) {
log(LOG_INFO,"wiki: Loading %s",ff2);
// "dir" is NULL since already included in ff2
return m_ht.load ( NULL , ff2 );
}
@ -70,15 +77,17 @@ bool Wiki::load() {
bool Wiki::loadText ( long fileSize ) {
log(LOG_INFO,"wiki: generating wikititles2.dat file");
SafeBuf sb;
char ff1[256];
sprintf(ff1, "%swikititles.txt.part1", g_hostdb.m_dir);
log(LOG_INFO,"wiki: loading %s",ff1);
log(LOG_INFO,"wiki: Loading %s",ff1);
if ( ! sb.fillFromFile(ff1) ) return false;
char ff2[256];
sprintf(ff2, "%swikititles.txt.part2", g_hostdb.m_dir);
log(LOG_INFO,"wiki: loading %s",ff2);
log(LOG_INFO,"wiki: Loading %s",ff2);
if ( ! sb.catFile(ff2) ) return false;
@ -312,6 +321,9 @@ bool Wiki::loadText ( long fileSize ) {
//char ff2[256];
//sprintf(ff2, "%s/wikititles2.dat", g_hostdb.m_dir);
if ( ! m_ht.save ( g_hostdb.m_dir , "wikititles2.dat" ) ) return false;
log(LOG_INFO,"wiki: done generating wikititles2.dat file");
// success
return true;
}

@ -261,10 +261,10 @@ bool Wiktionary::load() {
( errno1 || stats3.st_mtime > stats1.st_mtime )
//&& ( errno2 || stats3.st_mtime > stats2.st_mtime )
) {
log(LOG_INFO,"wikt: loading %s",ff3);
log(LOG_INFO,"wikt: Loading %s",ff3);
if ( ! m_synTable .load ( NULL , ff3 ) )
return false;
log(LOG_INFO,"wikt: loading %s",ff4);
log(LOG_INFO,"wikt: Loading %s",ff4);
if ( m_synBuf.fillFromFile ( NULL , ff4 ) <= 0 )
return false;
@ -288,7 +288,7 @@ bool Wiktionary::load() {
log("gb: %s or %s checksum is not approved for "
"live service (%lli != %lli)", ff3, ff4,
h,nn);
return false;
//return false;
}
return true;
@ -517,7 +517,7 @@ bool Wiktionary::generateHashTableFromWiktionaryTxt ( long sizen ) {
//
char ff1[256];
sprintf(ff1, "%swiktionary.txt.aa", g_hostdb.m_dir);
log(LOG_INFO,"wikt: loading %s",ff1);
log(LOG_INFO,"wikt: Loading %s",ff1);
int fd1 = open ( ff1 , O_RDONLY );
if ( fd1 < 0 ) {
log("wikt: open %s : %s",ff1,mstrerror(errno));
@ -558,7 +558,7 @@ bool Wiktionary::generateHashTableFromWiktionaryTxt ( long sizen ) {
round++;
offset = 0;
sprintf(ff1,"%swiktionary.txt.ab",g_hostdb.m_dir);
log(LOG_INFO,"wikt: loading %s",ff1);
log(LOG_INFO,"wikt: Loading %s",ff1);
int fd1 = open ( ff1 , O_RDONLY );
if ( fd1 < 0 ) {
log("wikt: open %s : %s",ff1,mstrerror(errno));

1576
XmlDoc.cpp

File diff suppressed because it is too large.

@ -273,7 +273,9 @@ class XmlDoc {
// . we can avoid setting Xml and Words classes etc...
long m_contentHash32;
// like the above but hash of all tags in TagRec for this url
long m_tagHash32;
//long m_tagHash32;
	// this is a hash of all adjacent tag pairs for template identification
uint32_t m_tagPairHash32;
long m_siteNumInlinks;
long m_siteNumInlinksUniqueIp; // m_siteNumInlinksFresh
long m_siteNumInlinksUniqueCBlock; // m_sitePop;
@ -490,7 +492,13 @@ class XmlDoc {
key_t *getTitleRecKey() ;
//char *getSkipIndexing ( );
char *prepareToMakeTitleRec ( ) ;
char **getTitleRec ( ) ;
// store TitleRec into "buf" so it can be added to metalist
bool setTitleRecBuf ( SafeBuf *buf , long long docId, long long uh48 );
// sets m_titleRecBuf/m_titleRecBufValid/m_titleRecKey[Valid]
SafeBuf *getTitleRecBuf ( );
SafeBuf *getSpiderReplyMetaList ( class SpiderReply *reply ) ;
SafeBuf *getSpiderReplyMetaList2 ( class SpiderReply *reply ) ;
SafeBuf m_spiderReplyMetaList;
char *getIsAdult ( ) ;
long **getIndCatIds ( ) ;
long **getCatIds ( ) ;
@ -540,8 +548,6 @@ class XmlDoc {
class HashTableX *getCountTable ( ) ;
bool hashString_ct ( class HashTableX *ht, char *s , long slen ) ;
uint8_t *getSummaryLangId ( ) ;
long *getTagPairHashVector ( ) ;
uint32_t *getTagPairHash32 ( ) ;
long *getSummaryVector ( ) ;
long *getPageSampleVector ( ) ;
long *getPostLinkTextVector ( long linkNode ) ;
@ -601,6 +607,7 @@ class XmlDoc {
//bool *updateRootLangId ( );
char **getRootTitleRec ( ) ;
//char **getContactTitleRec ( char *url ) ;
long long *getAvailDocIdOnly ( long long preferredDocId ) ;
long long *getDocId ( ) ;
char *getIsIndexed ( ) ;
class TagRec *getTagRec ( ) ;
@ -666,11 +673,13 @@ class XmlDoc {
char **getUtf8Content ( ) ;
long *getContentHash32 ( ) ;
long *getContentHashJson32 ( ) ;
long *getTagHash32 ( ) ;
//long *getTagHash32 ( ) ;
long *getTagPairHashVector ( ) ;
uint32_t *getTagPairHash32 ( ) ;
long getHostHash32a ( ) ;
long getHostHash32b ( ) ;
long getDomHash32 ( );
char **getImageData();
char **getThumbnailData();
class Images *getImages ( ) ;
int8_t *getNextSpiderPriority ( ) ;
long *getPriorityQueueNum ( ) ;
@ -696,7 +705,7 @@ class XmlDoc {
SafeBuf *getNewTagBuf ( ) ;
char *updateTagdb ( ) ;
bool logIt ( ) ;
bool logIt ( class SafeBuf *bb = NULL ) ;
bool m_doConsistencyTesting;
bool doConsistencyTest ( bool forceTest ) ;
long printMetaList ( ) ;
@ -733,7 +742,9 @@ class XmlDoc {
// bool nosplit ) ;
long getSiteRank ();
bool addTable144 ( class HashTableX *tt1 );
bool addTable144 ( class HashTableX *tt1 ,
long long docId ,
class SafeBuf *buf = NULL );
bool addTable224 ( HashTableX *tt1 ) ;
@ -749,6 +760,7 @@ class XmlDoc {
bool hashNoSplit ( class HashTableX *tt ) ;
char *hashAll ( class HashTableX *table ) ;
long getBoostFromSiteNumInlinks ( long inlinks ) ;
bool hashSpiderReply (class SpiderReply *reply ,class HashTableX *tt) ;
bool hashMetaTags ( class HashTableX *table ) ;
bool hashIsClean ( class HashTableX *table ) ;
bool hashZipCodes ( class HashTableX *table ) ;
@ -756,7 +768,7 @@ class XmlDoc {
bool hashContentType ( class HashTableX *table ) ;
bool hashDMOZCategories ( class HashTableX *table ) ;
bool hashLinks ( class HashTableX *table ) ;
bool hashUrl ( class HashTableX *table ) ;
bool hashUrl ( class HashTableX *table , bool hashNonFieldTerms=true) ;
bool hashDateNumbers ( class HashTableX *tt ) ;
bool hashSections ( class HashTableX *table ) ;
bool hashIncomingLinkText ( class HashTableX *table ,
@ -783,10 +795,12 @@ class XmlDoc {
bool hashAds(class HashTableX *table ) ;
class Url *getBaseUrl ( ) ;
bool hashSubmitUrls ( class HashTableX *table ) ;
bool hashImageStuff ( class HashTableX *table ) ;
bool hashIsAdult ( class HashTableX *table ) ;
void set20 ( Msg20Request *req ) ;
class Msg20Reply *getMsg20Reply ( ) ;
char **getDiffbotPrimaryImageUrl ( ) ;
char **getImageUrl() ;
class MatchOffsets *getMatchOffsets () ;
Query *getQuery() ;
@ -823,6 +837,8 @@ class XmlDoc {
bool hashString ( char *s ,
long slen ,
class HashInfo *hi ) ;
bool hashString ( char *s ,
class HashInfo *hi ) ;
@ -1057,7 +1073,7 @@ class XmlDoc {
// fear of getting the buffer overwritten by crap
//TagRec m_savedTagRec1;
//char *m_sampleVector ;
uint32_t m_tagPairHash;
//uint32_t m_tagPairHash32;
long m_firstIp;
class SafeBuf *m_savedSb;
@ -1077,6 +1093,7 @@ class XmlDoc {
char m_firstUrlHash64Valid;
char m_lastUrlValid;
char m_docIdValid;
char m_availDocIdValid;
//char m_collValid;
char m_tagRecValid;
char m_robotsTxtLenValid;
@ -1162,11 +1179,9 @@ class XmlDoc {
//char m_msge2Valid;
//char m_sampleVectorValid;
char m_gigabitHashesValid;
char m_tagPairHashValid;
//char m_oldsrValid;
char m_sreqValid;
char m_srepValid;
char m_titleRecValid;
bool m_ipValid;
bool m_firstIpValid;
@ -1219,7 +1234,9 @@ class XmlDoc {
bool m_redirErrorValid;
bool m_domHash32Valid;
bool m_contentHash32Valid;
bool m_tagHash32Valid;
//bool m_tagHash32Valid;
bool m_tagPairHash32Valid;
bool m_linkInfo2Valid;
bool m_spiderLinksValid;
//bool m_nextSpiderPriorityValid;
@ -1320,6 +1337,7 @@ class XmlDoc {
bool m_crawlInfoValid;
bool m_isPageParserValid;
bool m_imageUrlValid;
bool m_imageUrl2Valid;
bool m_matchOffsetsValid;
bool m_queryValid;
bool m_matchesValid;
@ -1332,11 +1350,13 @@ class XmlDoc {
bool m_newTermInfoBufValid;
bool m_summaryValid;
bool m_gsbufValid;
bool m_spiderReplyMetaListValid;
bool m_isCompromisedValid;
bool m_isNoArchiveValid;
//bool m_isVisibleValid;
bool m_clockCandidatesTableValid;
bool m_clockCandidatesDataValid;
bool m_titleRecBufValid;
bool m_isLinkSpamValid;
bool m_isErrorPageValid;
bool m_isHijackedValid;
@ -1402,6 +1422,7 @@ class XmlDoc {
Msg0 m_msg0;
Msg5 m_msg5;
char m_isDup;
long long m_docIdWeAreADupOf;
long m_ei;
long m_lastLaunch;
Msg22Request m_msg22Request;
@ -1943,8 +1964,10 @@ class XmlDoc {
//long m_gsbufAllocSize;
char *m_note;
char *m_imageUrl;
char *m_imageUrl2;
//char m_imageUrlBuf[100];
SafeBuf m_imageUrlBuf;
SafeBuf m_imageUrlBuf2;
//long m_imageUrlSize;
MatchOffsets m_matchOffsets;
Query m_query;
@ -1973,11 +1996,12 @@ class XmlDoc {
bool m_deleteFromIndex;
// ptrs to stuff
char *m_titleRec;
long m_titleRecSize;
bool m_freeTitleRec;
long m_titleRecAllocSize;
key_t m_titleRecKey;
//char *m_titleRec;
SafeBuf m_titleRecBuf;
//long m_titleRecSize;
//bool m_freeTitleRec;
//long m_titleRecAllocSize;
key_t m_titleRecKey;
// for isDupOfUs()
char *m_dupTrPtr;
@ -2335,6 +2359,8 @@ public:
//m_useWeights = false;
m_useSynonyms = false;
m_hashGroup = -1;
m_useCountTable = true;
m_useSections = true;
m_startDist = 0;
m_siteHash32 = 0;
};
@ -2350,6 +2376,8 @@ public:
char m_hashGroup;
long m_startDist;
long m_siteHash32;
bool m_useCountTable;
bool m_useSections;
};

@ -281,6 +281,7 @@
<logDebugDiskMessages>0</>
<logDebugDnsMessages>0</>
<logDebugHttpMessages>0</>
<logDebugImageMessages>0</>
<logDebugLoopMessages>0</>
<logDebugLanguageDetectionMessages>0</>
<logDebugLinkInfo>0</>

@ -63,15 +63,29 @@ A work-in-progress <a href=/compare.html>comparison to SOLR</a>.
<br><br><a name=quickstart></a>
<h1>Quick Start</h1>
Until I get the binary packages ready, <a href=#src>build from the source code</a>, it should only take about 30 seconds to type the three commands.
<!--
Requirements: You will need an Intel or AMD system running Linux and at least 4GB of RAM.<br><br>
<!--Until I get the binary packages ready, <a href=#src>build from the source code</a>, it should only take about 30 seconds to type the three commands.-->
Requirements: You will need an Intel or AMD system running Linux and at least 4GB of RAM to run one instance/host of gb.<br><br>
Install the <a href=http://www.gigablast.com/gigablast-1.0-1.deb>Gigablast package for Ubuntu or Debian</a> or install the <a href=http://www.gigablast.com/gigablast-1.0-1.rpm>Gigablast package for RedHat</a>.
<br><br>
Once installed visit your <a href=http://127.0.0.1:8000/>local port 8000</a> to access the search engine controls and begin configuration. It could take up to 20 seconds to start the search engine for the first time.
<br><br>
<table><tr><td colspan=2><b>Installed Files</b></td></tr>
<tr><td>/var/gigablast/data0/</td><td>Directory of Gigablast binary and data files</td></tr>
<tr><td>/etc/gigablast/hosts.conf</td><td>Describes the hosts in the distributed cluster. Multiple hosts may exist on one physical server. Initially hosts.conf is just configured to use /var/gigablast/data0/ as the only host. See the section on <a href=#scaling>scaling</a> to add more hosts.</td></tr>
<tr><td>/etc/init.d/gb</td><td>start up script link</td></tr>
<tr><td>/etc/init/gb.conf</td><td>upstart conf file so you can type 'start gb' or 'stop gb', but that will only work on local instances of gb.</td></tr>
<tr><td>/usr/bin/gb</td><td>Link to /var/gigablast/data0/gb</td></tr>
</table>
Install the <a href=http://www.gigablast.com/gigablast-1.0-1.deb>Gigablast package for Ubuntu or Debian</a> or install the <a href=http://www.gigablast.com/gigablast-1.0-1.rpm>Gigablast package for RedHat</a>.
<br><br>
If you run into any bugs, let me know so I can fix them right away: mattdwells@hotmail.com.
-->
<br>
<br>
@ -100,11 +114,11 @@ You will need the following packages installed<br>
<b>1.</b> Do <b>apt-get install make g++ gcc-multilib lib32stdc++6</b>
<br>
<b>2.</b> Download the <a href=https://github.com/gigablast/open-source-search-engine>Gigablast source code</a> using <b>wget --no-check-certificate "https://github.com/gigablast/open-source-search-engine/archive/master.zip"</b>, unzip it and cd into it.
<b>2.</b> Download the <a href=https://github.com/gigablast/open-source-search-engine>Gigablast source code</a> using <b>wget --no-check-certificate "https://github.com/gigablast/open-source-search-engine/archive/master.zip"</b>, unzip it and cd into it. (optionally use <b>git clone https://github.com/gigablast/open-source-search-engine.git ./github</b> if you have <i>git</i> installed.)
<br>
<b>3.</b> Run <b>make</b> to compile. (e.g. use 'make -j 4' to compile on four cores)
<br>
<b>4.</b> Run <b>./gb 0 -d</b> to start a single gigablast node which listens on port 8000 running in daemon mode.
<b>4.</b> Run <b>./gb -d</b> to start a single Gigablast node that listens on port 8000, running in daemon mode (-d).
<br>
<b>5.</b> The first time you run gb, wait about 30 seconds for it to build some files. Check the log file to see when it completes.
<br>
@ -157,7 +171,9 @@ You will need the following packages installed<br>
<li> Sorting. Sort the search results by meta tags or JSON fields that contain numbers, simply by adding something like gbsortby:price or gbrevsortby:price as a query term, assuming you have meta price tags.
<li> Easy Scaling. Add new servers to the hosts.conf file then click 'rebalance shards' to automatically rebalance the sharded data.
<li> Use &stream=1 to stream back millions of search results for a query without running out of memory.
<li> Makes and displays thumbnail images in the search results.
<li> Nested boolean queries using AND, OR, NOT operators.
<li> Federated search over multiple Gigablast collections using syntax like &c=mycoll1+mycoll2+mycoll3+...
<li> Built-in support for <a href=http://www.diffbot.com/products/automatic/>diffbot.com's api</a>, which extracts various entities from web sites, like products, articles, etc. But you will need to get a free token from them for access to their API.
<li> Spellchecker will be re-enabled shortly.
</ul>
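For example, several of the features above combine in one request (hypothetical collection names):
<br><br>
<b>http://127.0.0.1:8000/search?c=mycoll1+mycoll2&stream=1&q=shoes+gbsortby%3Aprice</b>
<br><br>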

BIN
html/magglass.png Normal file

Binary file not shown.

Size: 2.0 KiB

22
init.gb.conf Normal file

@ -0,0 +1,22 @@
# Gigablast Search Engine Service
description "Gigablast Search Engine Service"
author "Matt Wells <gigablast@mail.com>"
start on runlevel [2345]
stop on starting rc RUNLEVEL=[016]
#respawn
#respawn limit 2 5
env HOME=/var/gigablast/shard0/
umask 007
# The default of 5 seconds is too low for mysql which needs to flush buffers
#kill timeout 300
# this will read /etc/gigablast/hosts.conf and start up the
# hosts in there that are local on this machine based on its ip address.
# if one is already running it should detect that it can not bind to the
# port and just exit right away without doing any harm.
exec /var/gigablast/shard0/gb localstart
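Since the package installs this job as /etc/init/gb.conf (see the installed-files table above), the local gb instances can be managed with the usual upstart commands, e.g.:

	sudo start gb
	sudo stop gb
	sudo status gb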

BIN
libjpeg.so.62 Normal file

Binary file not shown.

BIN
libnetpbm.so.10 Normal file

Binary file not shown.

BIN
libpng12.so.0 Normal file

Binary file not shown.

BIN
libtiff.so.4 Normal file

Binary file not shown.

BIN
libz.so.1 Normal file

Binary file not shown.

283
main.cpp

@ -181,6 +181,8 @@ bool g_recoveryMode = false;
bool isRecoveryFutile ( ) ;
int copyFiles ( char *dstDir ) ;
//////
//
// if seo.o is being linked to it needs to override these weak stubs:
@ -402,17 +404,22 @@ int main2 ( int argc , char *argv[] ) {
//vpointerObject.isValidPointer(&vpointerObject); // whiny compiler
// End Pointer Check setup
if (argc < 1) {
if (argc < 0) {
printHelp:
SafeBuf sb;
sb.safePrintf(
"\n"
"Usage: gb [-w workingDir] <CMD>\n");
"Usage: gb [CMD]\n");
sb.safePrintf(
"\n"
"\tItems in []'s are optional, and items "
"in <>'s are "
"required.");
"\tgb will first try to load "
"the hosts.conf in the same directory as the "
"gb binary, if not found, then it will try "
"/etc/gigablast/hosts.conf. "
"Then it will determine its hostId based on "
"the directory and IP address listed in the "
"hosts.conf file it loaded. Things in []'s "
"are optional.");
/*
sb.safePrintf(
"\n\t"
@ -425,26 +432,30 @@ int main2 ( int argc , char *argv[] ) {
"overwritten from git pulls.\n\n" );
*/
sb.safePrintf(
"<CMD> can have the following values:\n\n"
"[CMD] can have the following values:\n\n"
"-h\tprint this help.\n\n"
"-v\tprint version and exit.\n\n"
"<hostId>\n"
"\tstart the gb process for this <hostId> locally."
" <hostId> is 0 to run as host #0, for instance."
"\n\n"
//"<hostId>\n"
//"\tstart the gb process for this <hostId> locally."
//" <hostId> is 0 to run as host #0, for instance."
//"\n\n"
"<hostId> -d\n\trun as daemon.\n\n"
//"<hostId> -d\n\trun as daemon.\n\n"
"-d\trun as daemon.\n\n"
//"-o\tprint the overview documentation in HTML. "
//"Contains the format of hosts.conf.\n\n"
"<hostId> -r\n\tindicates recovery mode, "
// "<hostId> -r\n\tindicates recovery mode, "
// "sends email to addresses "
// "specified in Conf.h upon startup.\n\n"
"-r\tindicates recovery mode, "
"sends email to addresses "
"specified in Conf.h upon startup.\n\n"
"start [hostId]\n"
"\tstart the gb process on all hosts or just on "
"[hostId] if specified using an ssh command.\n\n"
@ -947,20 +958,30 @@ int main2 ( int argc , char *argv[] ) {
return 0;
}
//SafeBuf tt;
//tt.base64Encode("any carnal pleas",16);
//fprintf(stderr,"%s\n",tt.getBufStart());
//exit(0);
// get hosts.conf file
//char *hostsConf = "./hosts.conf";
long hostId = 0;
long cmdarg = 1;
char *workingDir = NULL;
if ( argc >= 3 && argv[1][0]=='-'&&argv[1][1]=='w'&&argv[1][2]=='\0') {
//hostsConf = argv[2];
workingDir = argv[2];
cmdarg = 3;
}
//long hostId = -1;
long cmdarg = 0;
//char *workingDir = NULL;
//if(argc >= 3 && argv[1][0]=='-'&&argv[1][1]=='w'&&argv[1][2]=='\0') {
// //hostsConf = argv[2];
// workingDir = argv[2];
// cmdarg = 3;
// }
// get command
if ( argc <= cmdarg ) goto printHelp;
char *cmd = argv[cmdarg];
//if ( argc <= cmdarg ) goto printHelp;
// it might not be there, might be a simple "./gb"
char *cmd = "";
if ( argc >= 2 ) {
cmdarg = 1;
cmd = argv[1];
}
// help
if ( strcmp ( cmd , "-h" ) == 0 ) goto printHelp;
@ -979,18 +1000,18 @@ int main2 ( int argc , char *argv[] ) {
// return 0;
//}
bool hadHostId = false;
//bool hadHostId = false;
// assume our hostId is the command!
// now we advance 'cmd' past the hostId if we detect
// the presence of more args
if ( is_digit(argv[cmdarg][0]) ) {
hostId = atoi(argv[cmdarg]);
if(argc > cmdarg+1) {
cmd = argv[++cmdarg];
}
hadHostId = true;
}
// the presence of more args.
// WE NO LONGER do it this way...
// if ( is_digit(argv[cmdarg][0]) ) {
// hostId = atoi(argv[cmdarg]);
// if(argc > cmdarg+1) {
// cmd = argv[++cmdarg];
// }
// hadHostId = true;
// }
if ( strcmp ( cmd , "dosopen" ) == 0 ) {
long ip;
@ -1024,6 +1045,25 @@ int main2 ( int argc , char *argv[] ) {
testMandrill = true;
}
/*
class foo {
public:
long poo;
};
class fart {
public:
short fart3;
char fart1;
char fart2;
};
foo xxx;
xxx.poo = 38123;
fart *yyy = (fart *)&xxx;
fprintf(stderr,"fart1=%li fart2=%li fart3=%li\n",
(long)yyy->fart1,(long)yyy->fart2,(long)yyy->fart3);
exit(0);
*/
// gb gendbs, preset the hostid at least
if ( //strcmp ( cmd , "gendbs" ) == 0 ||
//strcmp ( cmd , "gentfndb" ) == 0 ||
@ -1037,7 +1077,7 @@ int main2 ( int argc , char *argv[] ) {
// ensure we got a collection name after the cmd
if ( cmdarg + 2 > argc ) goto printHelp;
// may also have an optional hostid
if ( cmdarg + 3 == argc ) hostId = atoi ( argv[cmdarg+2] );
//if ( cmdarg + 3 == argc ) hostId = atoi ( argv[cmdarg+2] );
}
if( (strcmp( cmd, "countdomains" ) == 0) && (argc >= (cmdarg + 2)) ) {
@ -1047,7 +1087,7 @@ int main2 ( int argc , char *argv[] ) {
}
// set it for g_hostdb and for logging
g_hostdb.m_hostId = hostId;
//g_hostdb.m_hostId = hostId;
//if ( strcmp ( cmd , "gzip" ) == 0 ) {
// if ( argc > cmdarg+1 ) gbgzip(argv[cmdarg+1]);
@ -1061,7 +1101,6 @@ int main2 ( int argc , char *argv[] ) {
// return 0;
//}
// these tests do not need a hosts.conf
/*
if ( strcmp ( cmd , "trietest" ) == 0 ) {
@ -1111,8 +1150,8 @@ int main2 ( int argc , char *argv[] ) {
if ( strcmp ( cmd , "parsetest" ) == 0 ) {
if ( cmdarg+1 >= argc ) goto printHelp;
// load up hosts.conf
if ( ! g_hostdb.init(hostId) ) {
log("db: hostdb init failed." ); return 1; }
//if ( ! g_hostdb.init(hostId) ) {
// log("db: hostdb init failed." ); return 1; }
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("db: Failed to init hashtable." ); return 1; }
@ -1157,8 +1196,8 @@ int main2 ( int argc , char *argv[] ) {
*/
if ( strcmp ( cmd , "booltest" ) == 0 ){
if ( ! g_hostdb.init(hostId) ) {
log("db: hostdb init failed." ); return 1; }
//if ( ! g_hostdb.init(hostId) ) {
// log("db: hostdb init failed." ); return 1; }
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("db: Failed to init hashtable." ); return 1; }
@ -1282,7 +1321,7 @@ int main2 ( int argc , char *argv[] ) {
strcmp( argv[cmdarg+1] , "load" ) == 0 ) {
isProxy = true;
// we need to parse out the hostid too!
if ( cmdarg + 2 < argc ) hostId = atoi ( argv[cmdarg+2] );
//if ( cmdarg + 2 < argc ) hostId = atoi ( argv[cmdarg+2] );
}
// this is just like starting up a gb process, but we add one to
@ -1298,8 +1337,8 @@ int main2 ( int argc , char *argv[] ) {
if ( strcmp ( cmd , "tmpstarthost" ) == 0 ) {
useTmpCluster = 1;
// we need to parse out the hostid too!
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
else goto printHelp;
//if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
//else goto printHelp;
}
// gb inject <file> <ip:port> [startdocid]
@ -1325,16 +1364,23 @@ int main2 ( int argc , char *argv[] ) {
// get current working dir that the gb binary is in. all the data
// files should in there too!!
//
if ( ! workingDir ) workingDir = getcwd2 ( argv[0] );
//if ( ! workingDir ) workingDir = getcwd2 ( argv[0] );
char *workingDir = getcwd2 ( argv[0] );
//log("host: working directory is %s",workingDir);
// load up hosts.conf
if ( ! g_hostdb.init(hostId,
// . it will determine our hostid based on the directory path of this
// gb binary and the ip address of this server
if ( ! g_hostdb.init(-1, // we don't know it!!!hostId,
NULL,
isProxy,
useTmpCluster,
workingDir)){
log("db: hostdb init failed." ); return 1; }
Host *h9 = g_hostdb.m_myHost;
	// set clock file name so gettimeofdayInMillisecondsGlobal()
	// sees g_clockInSync as true... unless clockadjust.dat is more
	// than 2 days old, in which case not!
@ -1788,6 +1834,12 @@ int main2 ( int argc , char *argv[] ) {
char *cmd = argv[cmdarg+1];
return install ( ifk_dsh2 , -1,NULL,NULL,-1, cmd );
}
// gb copyfiles, like gb install but takes a dir not a host #
if ( strcmp ( cmd , "copyfiles" ) == 0 ) {
if ( cmdarg + 1 >= argc ) goto printHelp;
char *dir = argv[cmdarg+1];
return copyFiles ( dir );
}
// gb install
if ( strcmp ( cmd , "install" ) == 0 ) {
// get hostId to install TO (-1 means all)
@ -2307,8 +2359,8 @@ int main2 ( int argc , char *argv[] ) {
// mainStart:
// get host info for this host
Host *h = g_hostdb.getHost ( hostId );
if ( ! h ) { log("db: No host has id %li.",hostId); return 1;}
//Host *h = g_hostdb.getHost ( hostId );
//if ( ! h ) { log("db: No host has id %li.",hostId); return 1;}
// once we are in recoverymode, that means we are being restarted
// from having cored, so to prevent immediate core and restart
@ -2329,7 +2381,7 @@ int main2 ( int argc , char *argv[] ) {
// name gbHID.conf
// . now that hosts.conf has more of the burden, all gbHID.conf files
// can be identical
if ( ! g_conf.init ( h->m_dir ) ) { // , h->m_hostId ) ) {
if ( ! g_conf.init ( h9->m_dir ) ) { // , h->m_hostId ) ) {
log("db: Conf init failed." ); return 1; }
//if ( ! g_hostdb.validateIps ( &g_conf ) ) {
// log("db: Failed to validate ips." ); return 1;}
@ -2421,10 +2473,10 @@ int main2 ( int argc , char *argv[] ) {
if ( strcmp ( cmd , "dump" ) == 0 && argc > cmdarg + 1 &&
argv[cmdarg+1][0]=='I') {
if ( ! hadHostId ) {
log("you must supply hostid in the dump cmd");
return 0;
}
//if ( ! hadHostId ) {
// log("you must supply hostid in the dump cmd");
// return 0;
//}
long fileNum = 0;
long long off = 0LL;
@ -2440,10 +2492,10 @@ int main2 ( int argc , char *argv[] ) {
if ( strcmp ( cmd , "dump" ) == 0 && argc > cmdarg + 1 &&
argv[cmdarg+1][0]=='T') {
if ( ! hadHostId ) {
log("you must supply hostid in the dump cmd");
return 0;
}
//if ( ! hadHostId ) {
// log("you must supply hostid in the dump cmd");
// return 0;
//}
long fileNum = 0;
long long off = 0LL;
@ -2462,10 +2514,10 @@ int main2 ( int argc , char *argv[] ) {
// [priority] [printStats?]
if ( strcmp ( cmd , "dump" ) == 0 ) {
if ( ! hadHostId ) {
log("you must supply hostid in the dump cmd");
return 0;
}
// if ( ! hadHostId ) {
// log("you must supply hostid in the dump cmd");
// return 0;
// }
//
// tell Collectiondb, not to verify each rdb's data
@ -2749,6 +2801,8 @@ int main2 ( int argc , char *argv[] ) {
if ( ! g_httpServer.m_tcp.testBind(g_hostdb.getMyHost()->m_httpPort))
return 1;
long *ips;
//if ( strcmp ( cmd , "gendbs" ) == 0 ) goto jump;
//if ( strcmp ( cmd , "gentfndb" ) == 0 ) goto jump;
if ( strcmp ( cmd , "gencatdb" ) == 0 ) goto jump;
@ -2760,7 +2814,8 @@ int main2 ( int argc , char *argv[] ) {
g_hostdb.m_logFilename );
if ( ! g_conf.m_runAsDaemon )
log("db: Use ./gb <hostid> -d to run as daemon.");
log("db: Use ./gb -d to run as daemon. Example: "
"./gb 0 -d");
/*
// tmp stuff to generate new query log
@ -2776,7 +2831,16 @@ int main2 ( int argc , char *argv[] ) {
// start up log file
if ( ! g_log.init( g_hostdb.m_logFilename ) ) {
fprintf (stderr,"db: Log file init failed.\n" ); return 1; }
fprintf (stderr,"db: Log file init failed. Exiting.\n" );
return 1;
}
// in case we do not have one, we need it for Images.cpp
if ( ! makeTrashDir() ) {
fprintf (stderr,"db: failed to make trash dir. Exiting.\n" );
return 1;
}
g_errno = 0;
@ -2807,6 +2871,20 @@ int main2 ( int argc , char *argv[] ) {
g_log.m_logTimestamps = true;
// show current working dir
log("host: Working directory is %s",workingDir);
log("host: Using %shosts.conf",g_hostdb.m_dir);
// from Hostdb.cpp
ips = getLocalIps();
for ( ; ips && *ips ; ips++ )
log("host: Detected local ip %s",iptoa(*ips));
// show it
log("host: Running as host id #%li",g_hostdb.m_hostId );
if (!ucInit(g_hostdb.m_dir, true)) {
log("Unicode initialization failed!");
return 1;
@ -3275,7 +3353,7 @@ int main2 ( int argc , char *argv[] ) {
// . then dns Distributed client
// . server should listen to a socket and register with g_loop
// . Only the distributed cache shall call the dns server.
if ( ! g_dns.init( h->m_dnsClientPort ) ) {
if ( ! g_dns.init( h9->m_dnsClientPort ) ) {
log("db: Dns distributed client init failed." ); return 1; }
// . then dns Local client
//if ( ! g_dnsLocal.init( 0 , false ) ) {
@ -3283,7 +3361,7 @@ int main2 ( int argc , char *argv[] ) {
// . then webserver
// . server should listen to a socket and register with g_loop
// again:
if ( ! g_httpServer.init( h->m_httpPort, h->m_httpsPort ) ) {
if ( ! g_httpServer.init( h9->m_httpPort, h9->m_httpsPort ) ) {
log("db: HttpServer init failed. Another gb already "
"running?" );
// this is dangerous!!! do not do the shutdown thing
@ -3453,7 +3531,7 @@ int main2 ( int argc , char *argv[] ) {
char buf[256];
log("admin: Sending emails.");
sprintf(buf, "Host %li respawning after crash.(%s)",
hostId, iptoa(g_hostdb.getMyIp()));
h9->m_hostId, iptoa(g_hostdb.getMyIp()));
g_pingServer.sendEmail(NULL, buf);
}
@ -4642,28 +4720,31 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
// don't copy to ourselves
//if ( h2->m_hostId == h->m_hostId ) continue;
sprintf(tmp,
"rcp %sgb.conf %s:%sgb.conf &",
"scp %sgb.conf %shosts.conf %s:%s %s",
dir ,
dir ,
//h->m_hostId ,
iptoa(h2->m_ip),
h2->m_dir);
h2->m_dir,
//h2->m_hostId);
amp);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"rcp %shosts.conf %s:%shosts.conf &",
dir ,
iptoa(h2->m_ip),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"rcp %shosts2.conf %s:%shosts2.conf &",
dir ,
iptoa(h2->m_ip),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
// sprintf(tmp,
// "scp %shosts.conf %s:%shosts.conf &",
// dir ,
// iptoa(h2->m_ip),
// h2->m_dir);
// log(LOG_INIT,"admin: %s", tmp);
// system ( tmp );
// sprintf(tmp,
// "scp %shosts2.conf %s:%shosts2.conf &",
// dir ,
// iptoa(h2->m_ip),
// h2->m_dir);
// log(LOG_INIT,"admin: %s", tmp);
// system ( tmp );
}
else if ( installFlag == ifk_start ) {
// . save old log now, too
@ -4743,7 +4824,7 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
"ssh %s \"cd %s ; "
"cp -f tmpgb tmpgb.oldsave ; "
"mv -f tmpgb.installed tmpgb ; "
"./tmpgb -w %s tmpstarthost "
"%s/tmpgb tmpstarthost "
"%li >& ./tmplog%03li &\" &",
iptoa(h2->m_ip),
h2->m_dir ,
@ -16878,8 +16959,8 @@ bool isRecoveryFutile ( ) {
// get time stamp
long timestamp = ff.getLastModifiedTime ( );
		// skip if not within last minute
if ( timestamp < now - 60 ) continue;
		// skip if not within 2 minutes
if ( timestamp < now - 2*60 ) continue;
// open it up to see if ends with sighandle
long toRead = 3000;
@ -16931,16 +17012,27 @@ char *getcwd2 ( char *arg ) {
// store the relative path of gb in there now
static char s_cwdBuf[1025];
getcwd ( s_cwdBuf , 1024 );
getcwd ( s_cwdBuf , 1020 );
char *end = s_cwdBuf + gbstrlen(s_cwdBuf);
// if "arg" is a RELATIVE path then append it
if ( arg && arg[0]!='/' ) {
memcpy ( end , arg , alen );
end += alen;
*end = '\0';
}
// if our path started with / then it was absolute...
else {
strncpy(s_cwdBuf,arg,alen);
}
// make sure it ends in / for consistency
long clen = gbstrlen(s_cwdBuf);
if ( s_cwdBuf[clen-1] != '/' ) {
s_cwdBuf[clen++] = '/';
s_cwdBuf[clen++] = '\0';
}
*end = '\0';
// size of the whole thing
//long clen = gbstrlen(s_cwdBuf);
@ -16954,3 +17046,22 @@ char *getcwd2 ( char *arg ) {
return s_cwdBuf;
}
int copyFiles ( char *dstDir ) {
char *srcDir = "./";
SafeBuf fileListBuf;
g_process.getFilesToCopy ( srcDir , &fileListBuf );
SafeBuf tmp;
tmp.safePrintf(
"cp -r %s %s"
, fileListBuf.getBufStart()
, dstDir
);
//log(LOG_INIT,"admin: %s", tmp.getBufStart());
fprintf(stderr,"\nRunning cmd: %s\n",tmp.getBufStart());
system ( tmp.getBufStart() );
return 0;
}
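The new copyfiles command wired up above just shells out to cp with the file list from g_process.getFilesToCopy(); e.g. to seed a second host's working directory (hypothetical target path):

	./gb copyfiles /var/gigablast/data1/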

BIN
pdftohtml

Binary file not shown.