merge from master branch to diffbot-kevin

Commit 2eb106aaf5

Changed files:
BigFile.cpp, Collectiondb.cpp, Collectiondb.h, File.cpp, File.h, HashTableX.h, Hostdb.cpp, Hostdb.h, HttpServer.cpp, Json.cpp, Json.h, Linkdb.cpp, Linkdb.h, Loop.cpp, Makefile, Msg13.cpp, Msg3.cpp, Msg39.cpp, Msg39.h, Msg3a.cpp, Msg40.cpp, Msg40.h, PageCrawlBot.cpp, PageGet.cpp, PageHosts.cpp, PageReindex.cpp, PageResults.cpp, PageResults.h, PageRoot.cpp, PageSockets.cpp, Pages.cpp, Parms.cpp, Parms.h, PingServer.cpp, Posdb.cpp, Posdb.h, Process.cpp, Query.cpp, Query.h, README.md, Rdb.cpp, RdbBase.cpp, RdbBuckets.cpp, RdbBuckets.h, RdbDump.cpp, Repair.cpp, SearchInput.cpp, Spider.cpp, Spider.h, Tagdb.cpp, Test.cpp, UdpServer.cpp, UdpServer.h, UdpSlot.cpp, UdpSlot.h, Url.h, Xml.cpp, Xml.h, XmlDoc.cpp, XmlDoc.h, XmlNode.cpp, XmlNode.h
html/: blog.html, faq.html, news.html, ss_filters.png, ss_filters_thumb.png, ss_hosts.png, ss_hosts_thumb.png, ss_settings.png, ss_settings_thumb.png
main.cpp, mysynonyms.txt, qa.cpp

BigFile.cpp
@@ -386,7 +386,9 @@ bool BigFile::readwrite ( void *buf ,
// had negative offsets, bad engineer
if ( offset < 0 ) {
log(LOG_LOGIC,"disk: readwrite() offset is %"INT64" "
"< 0. dumping core.",offset);
"< 0. filename=%s/%s. dumping core. try deleting "
"the .map file for it and restarting.",offset,
m_dir,m_baseFilename);
char *xx = NULL; *xx = 0;
}
// if we're not blocking use a fake fstate
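The `char *xx = NULL; *xx = 0;` line above is the codebase's usual way of forcing an immediate segfault so the process leaves a core dump at the exact failure point. A minimal sketch of that log-then-crash idiom (function name and message are illustrative, not from the source):

#include <cstdio>
#include <cstdint>

// Log a fatal condition, then dereference NULL on purpose so the OS
// writes a core file right here instead of letting corrupt state spread.
static void fatalAt ( const char *msg , int64_t offset ) {
        fprintf ( stderr , "disk: %s offset=%lld. dumping core.\n" ,
                  msg , (long long)offset );
        char *xx = NULL; *xx = 0; // intentional crash for the core dump
}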
Collectiondb.cpp (153)
@@ -191,7 +191,8 @@ bool Collectiondb::cleanTrees ( ) {
|
||||
//r = g_indexdb.getRdb();
|
||||
//r->m_tree.cleanTree ((char **)r->m_bases);
|
||||
r = g_posdb.getRdb();
|
||||
r->m_tree.cleanTree ();//(char **)r->m_bases);
|
||||
//r->m_tree.cleanTree ();//(char **)r->m_bases);
|
||||
r->m_buckets.cleanBuckets();
|
||||
//r = g_datedb.getRdb();
|
||||
//r->m_tree.cleanTree ((char **)r->m_bases);
|
||||
|
||||
@ -284,6 +285,10 @@ bool Collectiondb::addExistingColl ( char *coll, collnum_t collnum ) {
|
||||
|
||||
if ( ! registerCollRec ( cr , false ) ) return false;
|
||||
|
||||
// always index spider status docs now for custom crawls
|
||||
if ( cr->m_isCustomCrawl )
|
||||
cr->m_indexSpiderReplies = true;
|
||||
|
||||
// we need to compile the regular expressions or update the url
|
||||
// filters with new logic that maps crawlbot parms to url filters
|
||||
return cr->rebuildUrlFilters ( );
|
||||
@ -476,6 +481,8 @@ bool Collectiondb::addNewColl ( char *coll ,
|
||||
|
||||
|
||||
if ( customCrawl ) {
|
||||
// always index spider status docs now
|
||||
cr->m_indexSpiderReplies = true;
|
||||
// remember the token
|
||||
cr->m_diffbotToken.set ( token );
|
||||
cr->m_diffbotCrawlName.set ( crawl );
|
||||
@ -1702,6 +1709,8 @@ CollectionRec::CollectionRec() {
|
||||
// m_spiderQuotas[i] = -1;
|
||||
memset( m_spiderPriorities, 0,
|
||||
MAX_FILTERS*sizeof(*m_spiderPriorities) );
|
||||
memset ( m_harvestLinks,0,MAX_FILTERS);
|
||||
memset ( m_forceDelete,0,MAX_FILTERS);
|
||||
//memset( m_rulesets, 0, MAX_FILTERS*sizeof(*m_rulesets) );
|
||||
//for ( int i = 0; i < MAX_SEARCH_PASSWORDS; i++ ) {
|
||||
// *(m_searchPwds[i]) = '\0';
|
||||
@ -2071,6 +2080,11 @@ bool CollectionRec::countEvents ( ) {
|
||||
*/
|
||||
|
||||
bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
|
||||
// tell spider loop to update active list
|
||||
g_spiderLoop.m_activeListValid = false;
|
||||
|
||||
|
||||
bool rebuild = true;
|
||||
if ( m_numRegExs == 0 )
|
||||
rebuild = true;
|
||||
@ -2106,9 +2120,6 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
// addDefault = true;
|
||||
if ( ! rebuild ) return true;
|
||||
|
||||
// tell spider loop to update active list
|
||||
g_spiderLoop.m_activeListValid = false;
|
||||
|
||||
|
||||
if ( !strcmp(s,"shallow" ) )
|
||||
return rebuildShallowRules();
|
||||
@ -2177,7 +2188,8 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = -3; // delete!
|
||||
m_spiderPriorities [n] = 100; // delete!
|
||||
m_forceDelete [n] = 1;
|
||||
n++;
|
||||
|
||||
// if not in the site list then nuke it
|
||||
@ -2187,7 +2199,8 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = -3; // delete!
|
||||
m_spiderPriorities [n] = 100;
|
||||
m_forceDelete [n] = 1;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("errorcount>=3 && hastmperror");
|
||||
@ -2196,7 +2209,8 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_maxSpidersPerRule [n] = 1; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 3;
|
||||
m_spiderPriorities [n] = 100;
|
||||
m_forceDelete [n] = 1;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("errorcount>=1 && hastmperror");
|
||||
@ -2221,6 +2235,32 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_spiderFreqs [n] = .00347; // 5 mins
|
||||
n++;
|
||||
|
||||
// 20+ unique c block parent request urls means it is important!
|
||||
m_regExs[n].set("numinlinks>7 && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 52;
|
||||
if ( ! strcmp(s,"news") )
|
||||
m_spiderFreqs [n] = .00347; // 5 mins
|
||||
n++;
|
||||
|
||||
// 20+ unique c block parent request urls means it is important!
|
||||
m_regExs[n].set("numinlinks>7");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 51;
|
||||
if ( ! strcmp(s,"news") )
|
||||
m_spiderFreqs [n] = .00347; // 5 mins
|
||||
n++;
|
||||
|
||||
|
||||
|
||||
m_regExs[n].set("hopcount==0 && iswww && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
@ -2265,6 +2305,55 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_spiderFreqs [n] = .00347; // 5 mins
|
||||
n++;
|
||||
|
||||
|
||||
m_regExs[n].set("isparentrss && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 45;
|
||||
if ( ! strcmp(s,"news") )
|
||||
m_spiderFreqs [n] = .00347; // 5 mins
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("isparentsitemap && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 44;
|
||||
if ( ! strcmp(s,"news") )
|
||||
m_spiderFreqs [n] = .00347; // 5 mins
|
||||
n++;
|
||||
|
||||
|
||||
m_regExs[n].set("isparentrss");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20.0; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 43;
|
||||
if ( ! strcmp(s,"news") )
|
||||
m_spiderFreqs [n] = .00347; // 5 mins
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("isparentsitemap");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20.0; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 42;
|
||||
if ( ! strcmp(s,"news") )
|
||||
m_spiderFreqs [n] = .00347; // 5 mins
|
||||
n++;
|
||||
|
||||
|
||||
|
||||
|
||||
m_regExs[n].set("hopcount==1 && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20.0;
|
||||
@ -2379,6 +2468,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_numRegExs5 = n;
|
||||
m_numRegExs6 = n;
|
||||
m_numRegExs8 = n;
|
||||
m_numRegExs7 = n;
|
||||
|
||||
// more rules
|
||||
|
||||
@ -2414,7 +2504,8 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = -3; // delete!
|
||||
m_spiderPriorities [n] = 100; // delete!
|
||||
m_forceDelete [n] = 1;
|
||||
n++;
|
||||
|
||||
// if not in the site list then nuke it
|
||||
@ -2424,7 +2515,8 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = -3; // delete!
|
||||
m_spiderPriorities [n] = 100; // delete!
|
||||
m_forceDelete [n] = 1;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("errorcount>=3 && hastmperror");
|
||||
@ -2433,7 +2525,8 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_maxSpidersPerRule [n] = 1; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 3;
|
||||
m_spiderPriorities [n] = 100;
|
||||
m_forceDelete [n] = 1;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("errorcount>=1 && hastmperror");
|
||||
@ -2794,6 +2887,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_numRegExs5 = n;
|
||||
m_numRegExs6 = n;
|
||||
m_numRegExs8 = n;
|
||||
m_numRegExs7 = n;
|
||||
|
||||
// done rebuilding CHINESE rules
|
||||
return true;
|
||||
@ -2818,7 +2912,8 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = -3; // delete!
|
||||
m_spiderPriorities [n] = 100; // delete!
|
||||
m_forceDelete [n] = 1;
|
||||
n++;
|
||||
|
||||
// if not in the site list then nuke it
|
||||
@ -2828,7 +2923,8 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = -3; // delete!
|
||||
m_spiderPriorities [n] = 100; // delete!
|
||||
m_forceDelete [n] = 1;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("errorcount>=3 && hastmperror");
|
||||
@ -2837,7 +2933,8 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_maxSpidersPerRule [n] = 1; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 3;
|
||||
m_spiderPriorities [n] = 100;
|
||||
m_forceDelete [n] = 1;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("errorcount>=1 && hastmperror");
|
||||
@ -3012,6 +3109,7 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_numRegExs5 = n;
|
||||
m_numRegExs6 = n;
|
||||
m_numRegExs8 = n;
|
||||
m_numRegExs7 = n;
|
||||
|
||||
// done rebuilding SHALLOW rules
|
||||
return true;
|
||||
@ -3388,6 +3486,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
m_spiderFreqs [i] = respiderFreq;
|
||||
//m_spiderDiffbotApiUrl[i].purge();
|
||||
m_harvestLinks[i] = true;
|
||||
m_forceDelete [i] = false;
|
||||
}
|
||||
|
||||
int32_t i = 0;
|
||||
@ -3400,7 +3499,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
|
||||
// 2nd default url
|
||||
m_regExs[i].set("ismedia && !ismanualadd");
|
||||
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
m_maxSpidersPerRule [i] = 0;
|
||||
m_spiderPriorities [i] = 100; // delete!
|
||||
m_forceDelete [i] = 1;
|
||||
i++;
|
||||
|
||||
// hopcount filter if asked for
|
||||
@ -3418,7 +3519,10 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
m_regExs[i].set(hopcountStr);
|
||||
|
||||
// means DELETE :
|
||||
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
|
||||
|
||||
// just don't spider
|
||||
m_maxSpidersPerRule[i] = 0;
|
||||
|
||||
// compatibility with m_spiderRoundStartTime:
|
||||
m_spiderFreqs[i] = 0.0;
|
||||
@ -3439,7 +3543,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
// MDW: even if they supplied a crawl pattern let's restrict to seed
|
||||
// domains 12/15/14
|
||||
m_regExs[i].set("!isonsamedomain && !ismanualadd");
|
||||
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
m_maxSpidersPerRule [i] = 0;
|
||||
m_spiderPriorities [i] = 100; // delete!
|
||||
m_forceDelete [i] = 1;
|
||||
i++;
|
||||
//}
|
||||
|
||||
@ -3452,7 +3558,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
// only negative patterns then restrict to domains of seeds
|
||||
if ( ucp && ! ucpHasPositive && ! m_hasucr ) {
|
||||
m_regExs[i].set("!isonsamedomain && !ismanualadd");
|
||||
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
m_maxSpidersPerRule [i] = 0;
|
||||
m_spiderPriorities [i] = 100; // delete!
|
||||
m_forceDelete [i] = 1;
|
||||
i++;
|
||||
}
|
||||
|
||||
@ -3478,7 +3586,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
|
||||
// excessive errors? (tcp/dns timed out, etc.) retry once per month?
|
||||
m_regExs[i].set("errorcount>=3 && hastmperror");
|
||||
m_spiderPriorities [i] = 30;
|
||||
m_spiderPriorities [i] = 3;
|
||||
m_spiderFreqs [i] = 30; // 30 days
|
||||
// if bulk job, do not download a url more than 3 times
|
||||
if ( m_isCustomCrawl == 2 ) m_maxSpidersPerRule [i] = 0;
|
||||
@ -3556,7 +3664,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
i++;
|
||||
// do not crawl anything else
|
||||
m_regExs[i].set("default");
|
||||
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
|
||||
// don't spider
|
||||
m_maxSpidersPerRule[i] = 0;
|
||||
// this needs to be zero so &spiderRoundStart=0
|
||||
// functionality which sets m_spiderRoundStartTime
|
||||
// to the current time works
|
||||
@ -3576,7 +3686,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
i++;
|
||||
// do not crawl anything else
|
||||
m_regExs[i].set("default");
|
||||
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
|
||||
// don't delete, just don't spider
|
||||
m_maxSpidersPerRule[i] = 0;
|
||||
// this needs to be zero so &spiderRoundStart=0
|
||||
// functionality which sets m_spiderRoundStartTime
|
||||
// to the current time works
|
||||
@ -3630,6 +3742,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
m_numRegExs6 = i;
|
||||
//m_numRegExs7 = i;
|
||||
m_numRegExs8 = i;
|
||||
m_numRegExs7 = i;
|
||||
//m_numRegExs11 = i;
|
||||
|
||||
|
||||
|
Collectiondb.h
@@ -814,6 +814,9 @@ class CollectionRec {
int32_t m_numRegExs8;
char m_harvestLinks [ MAX_FILTERS ];

int32_t m_numRegExs7;
char m_forceDelete [ MAX_FILTERS ];

// dummy?
int32_t m_numRegExs9;
File.cpp (96)
@@ -74,6 +74,8 @@ File::File ( ) {
|
||||
// threaded unlink sets this to true before spawning thread so we
|
||||
// do not try to open it!
|
||||
//m_gone = 0;
|
||||
m_nextActive = NULL;
|
||||
m_prevActive = NULL;
|
||||
}
|
||||
|
||||
|
||||
@ -129,6 +131,52 @@ bool File::rename ( char *newFilename ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
static File *s_activeHead = NULL;
|
||||
static File *s_activeTail = NULL;
|
||||
|
||||
void rmFileFromLinkedList ( File *f ) {
|
||||
// excise from linked list of active files
|
||||
if ( s_activeHead == f )
|
||||
s_activeHead = f->m_nextActive;
|
||||
if ( s_activeTail == f )
|
||||
s_activeTail = f->m_prevActive;
|
||||
if ( f->m_prevActive )
|
||||
f->m_prevActive->m_nextActive = f->m_nextActive;
|
||||
if ( f->m_nextActive )
|
||||
f->m_nextActive->m_prevActive = f->m_prevActive;
|
||||
// and so we do not try to re-excise it
|
||||
f->m_prevActive = NULL;
|
||||
f->m_nextActive = NULL;
|
||||
}
|
||||
|
||||
void addFileToLinkedList ( File *f ) {
|
||||
// must not be in there already, lest we double add it
|
||||
if ( f->m_nextActive ) return;
|
||||
if ( f->m_prevActive ) return;
|
||||
if ( s_activeHead == f ) return;
|
||||
|
||||
f->m_nextActive = NULL;
|
||||
f->m_prevActive = NULL;
|
||||
if ( ! s_activeTail ) {
|
||||
s_activeHead = f;
|
||||
s_activeTail = f;
|
||||
return;
|
||||
}
|
||||
// insert at end of linked list otherwise
|
||||
s_activeTail->m_nextActive = f;
|
||||
f->m_prevActive = s_activeTail;
|
||||
s_activeTail = f;
|
||||
}
|
||||
|
||||
// update linked list
|
||||
void promoteInLinkedList ( File *f ) {
|
||||
rmFileFromLinkedList ( f );
|
||||
addFileToLinkedList ( f );
|
||||
}
|
||||
*/
|
||||
|
||||
// . open the file
|
||||
// . only call once per File after calling set()
|
||||
bool File::open ( int flags , int permissions ) {
|
||||
@ -200,6 +248,8 @@ int File::write ( void *buf ,
|
||||
else n = pwrite ( fd , buf , numBytesToWrite , offset );
|
||||
// valgrind
|
||||
if ( n < 0 && errno == EINTR ) goto retry21;
|
||||
// update linked list
|
||||
//promoteInLinkedList ( this );
|
||||
// copy errno to g_errno
|
||||
if ( n < 0 ) g_errno = errno;
|
||||
// cancel blocking errors - not really errors
|
||||
@ -228,6 +278,8 @@ int File::read ( void *buf ,
|
||||
else n = pread ( fd , buf , numBytesToRead , offset );
|
||||
// valgrind
|
||||
if ( n < 0 && errno == EINTR ) goto retry9;
|
||||
// update linked list
|
||||
//promoteInLinkedList ( this );
|
||||
// copy errno to g_errno
|
||||
if ( n < 0 ) g_errno = errno;
|
||||
// cancel blocking errors - not really errors
|
||||
@ -340,6 +392,8 @@ void File::close2 ( ) {
|
||||
"This should never happen. vfd=%i fd=%i.", m_vfd,fd);
|
||||
return;
|
||||
}
|
||||
// excise from linked list of active files
|
||||
//rmFileFromLinkedList ( this );
|
||||
// mark this virtual file descriptor as available.
|
||||
s_fds [ m_vfd ] = -2;
|
||||
// no more virtual file descriptor
|
||||
@ -407,6 +461,8 @@ bool File::close ( ) {
|
||||
}
|
||||
// otherwise decrease the # of open files
|
||||
s_numOpenFiles--;
|
||||
// excise from linked list of active files
|
||||
//rmFileFromLinkedList ( this );
|
||||
// return true blue
|
||||
return true;
|
||||
}
|
||||
@ -524,6 +580,8 @@ int File::getfd () {
|
||||
s_unlinking [ m_vfd ] = 0;
|
||||
// update the time stamp
|
||||
s_timestamps [ m_vfd ] = gettimeofdayInMillisecondsLocal();
|
||||
// add file to linked list of active files
|
||||
//addFileToLinkedList ( this );
|
||||
return fd;
|
||||
}
|
||||
|
||||
@ -535,6 +593,31 @@ bool File::closeLeastUsed () {
|
||||
int mini = -1;
|
||||
int64_t now = gettimeofdayInMillisecondsLocal();
|
||||
|
||||
/*
|
||||
// use the new linked list of active file descriptors
|
||||
// . file at tail is the most active
|
||||
File *f = s_activeHead;
|
||||
|
||||
// if nothing to do return true
|
||||
if ( ! f ) return true;
|
||||
|
||||
// close the head if not writing
|
||||
for ( ; f ; f = f->m_nextActive ) {
|
||||
mini = f->m_vfd;
|
||||
// how can this be?
|
||||
if ( s_fds [ mini ] < 0 ) { char *xx=NULL;*xx=0; }
|
||||
if ( s_writing [ mini ] ) continue;
|
||||
if ( s_unlinking [ mini ] ) continue;
|
||||
// when we got like 1000 reads queued up, it uses a *lot* of
|
||||
// memory and we can end up never being able to complete a
|
||||
// read because the descriptors are always getting closed on us
|
||||
// so do a hack fix and do not close descriptors that are
|
||||
// about .5 seconds old on avg.
|
||||
if ( s_timestamps [ mini ] >= now - 1 ) continue;
|
||||
break;
|
||||
}
|
||||
*/
|
||||
|
||||
// get the least used of all the actively opened file descriptors.
|
||||
// we can't get files that were opened for writing!!!
|
||||
int i;
|
||||
@ -603,8 +686,19 @@ bool File::closeLeastUsed () {
|
||||
// we're just conserving file descriptors
|
||||
s_fds [ mini ] = -1;
|
||||
|
||||
|
||||
|
||||
// if the real close was successful then decrement the # of open files
|
||||
if ( status == 0 ) s_numOpenFiles--;
|
||||
if ( status == 0 ) {
|
||||
s_numOpenFiles--;
|
||||
// excise from linked list of active files
|
||||
//rmFileFromLinkedList ( f );
|
||||
// getfd() may not execute in time to ince the closeCount
|
||||
// so do it here. test by setting the max open files to like
|
||||
// 10 or so and spidering heavily.
|
||||
//s_closeCounts [ fd ]++;
|
||||
}
|
||||
|
||||
|
||||
if ( status == -1 )
|
||||
return log("disk: close(%i) : %s", fd , strerror(errno));
|
||||
|
File.h (6)
@@ -22,7 +22,8 @@
// . max # of VIRTUAL file descriptors
// . man, chris has 958 files, lets crank it up from 2k to 5k
// . boost up to 50,000 since we are hitting this limit with crawlbot
#define MAX_NUM_VFDS (50*1024)
// . we are hitting again with crawlbot, boost to 200k from 50k
#define MAX_NUM_VFDS (200*1024)

#include <sys/types.h> // for open/lseek
#include <sys/stat.h> // for open
@@ -182,6 +183,9 @@ class File {
time_t m_st_mtime; // file last mod date
int32_t m_st_size; // file size
time_t getLastModifiedDate ( ) ;

class File *m_nextActive;
class File *m_prevActive;
};
HashTableX.h (18)
@@ -144,13 +144,19 @@ class HashTableX {

bool addTerm144 ( key144_t *kp , int32_t score = 1 ) {

/*
// debug XmlDoc.cpp's hash table
//int64_t termId = ((key144_t *)kp)->n2 >> 16;
//if ( termId == 59194288760543LL ) {
// log("got it");
// char *xx=NULL;*xx=0;
//}

int64_t termId = ((key144_t *)kp)->n2 >> 16;
uint64_t d = 0LL;
d = ((unsigned char *)kp)[11];
d <<= 32;
d |= *(uint32_t *)(((unsigned char *)kp)+7);
d >>= 2;
if ( d==110324895284 && termId == 39206941907955LL ) {
log("got it");
char *xx=NULL;*xx=0;
}
*/
// grow it!
if ( (m_numSlots < 20 || 4 * m_numSlotsUsed >= m_numSlots) &&
m_numSlots < m_maxSlots ) {
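The growth check above (`m_numSlots < 20 || 4 * m_numSlotsUsed >= m_numSlots`) grows the table once it is roughly a quarter full, keeping probe chains short for the large posdb keys. A minimal sketch of that load-factor test, using hypothetical names rather than the real HashTableX API:

#include <cstdint>

// Grow when the table is tiny or at least 25% occupied, but never past maxSlots.
static bool needsGrowth ( int32_t numSlots , int32_t numSlotsUsed , int32_t maxSlots ) {
        if ( numSlots >= maxSlots ) return false;   // already at the cap
        if ( numSlots < 20 ) return true;           // tiny tables always grow
        return 4 * numSlotsUsed >= numSlots;        // load factor >= 1/4
}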
Hostdb.cpp
@@ -66,6 +66,7 @@ Hostdb::Hostdb ( ) {
m_crcValid = false;
m_crc = 0;
m_created = false;
m_myHost = NULL;
}

Hostdb::~Hostdb () {
@@ -108,6 +109,7 @@ bool Hostdb::init ( int32_t hostIdArg , char *netName ,
m_myIp = 0;
m_myIpShotgun = 0;
m_myPort = 0;
m_myHost = NULL;
//m_myPort2 = 0;
m_numHosts = 0;
m_numHostsPerShard = 0;
@@ -1833,7 +1835,7 @@ bool Hostdb::replaceHost ( int32_t origHostId, int32_t spareHostId ) {
oldHost->m_emailCode = 0;
oldHost->m_wasAlive = false;
oldHost->m_pingInfo.m_etryagains = 0;
oldHost->m_pingInfo.m_udpSlotsInUse = 0;
oldHost->m_pingInfo.m_udpSlotsInUseIncoming = 0;
oldHost->m_pingInfo.m_totalResends = 0;
oldHost->m_errorReplies = 0;
oldHost->m_dgramsTo = 0;
Hostdb.h (4)
@@ -106,7 +106,7 @@ class PingInfo {
int32_t m_totalResends;
int32_t m_etryagains;

int32_t m_udpSlotsInUse;
int32_t m_udpSlotsInUseIncoming;
int32_t m_tcpSocketsInUse;

int16_t m_currentSpiders;
@@ -115,7 +115,7 @@ class PingInfo {
char m_gbVersionStr[21];
char m_repairMode;
char m_kernelErrors;

uint8_t m_recoveryLevel;
};

class Host {
HttpServer.cpp
@@ -1026,6 +1026,7 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
// "GET /crawlbot/downloadpages"
if ( strncmp ( path , "/crawlbot/download/" ,19 ) == 0 ||
// add 4 to length of needle to account for /vXX.
// GET /v3/crawl/download/
(pathLen >= 20 && strnstr(path, "/crawl/download/", 20)) ||
(pathLen >= 19 && strnstr(path, "/bulk/download/", 19)) )
return sendBackDump ( s , r );
@@ -1243,8 +1244,8 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
return sendPagePretty ( s , r , "about.html","about" );

// decorate the plain html page, news.html, with our nav chrome
if ( ! strncmp ( path ,"/news.html", pathLen ) )
return sendPagePretty ( s , r , "news.html", "news");
if ( ! strncmp ( path ,"/blog.html", pathLen ) )
return sendPagePretty ( s , r , "blog.html", "blog");

// decorate the plain html page with our nav chrome
if ( ! strncmp ( path ,"/searchfeed.html", pathLen ) )
@@ -2340,7 +2341,7 @@ int32_t getMsgSize ( char *buf, int32_t bufSize, TcpSocket *s ) {
// /admin/basic etc
if ( pp + 7 < ppend && strncmp ( pp ,"/admin/",7)==0)
max = 0x7fffffff;
// bulk job. /v2/bulk
// bulk job. /v2/bulk or /v3/crawl/download/token-name...
if ( pp + 4 < ppend && strncmp ( pp ,"/v",2)==0 &&
// /v2/bulk
( ( pp[4] == 'b' && pp[5] == 'u' ) ||
Json.cpp (30)
@@ -233,6 +233,7 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , int32_t niceness ) {
// json must start with { or [ i guess
// otherwise getFirstItem() won't work!
if ( m_sb.m_length==0 ) {
log("json: length is 0");
g_errno = EBADJSONPARSER;
return NULL;
}
@@ -294,10 +295,12 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , int32_t niceness ) {
// what is the length of it?
int32_t slen = 4;
ji->m_valueLong = 1;
ji->m_value64 = 1;
ji->m_valueDouble = 1.0;
if ( *p == 'f' ) {
slen = 5;
ji->m_valueLong = 0;
ji->m_value64 = 0;
ji->m_valueDouble = 0;
}
// store decoded string right after jsonitem
@@ -342,6 +345,7 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , int32_t niceness ) {
//char c = str[slen];
//str[slen] = '\0';
ji->m_valueLong = atol(str);
ji->m_value64 = atoll(str);
ji->m_valueDouble = atof(str);
// copy the number as a string as well
int32_t curr = m_sb.length();
@@ -367,7 +371,11 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , int32_t niceness ) {

// for testing if we realloc
char *memEnd = m_sb.getBufStart();
if ( mem != memEnd ) { char *xx=NULL;*xx=0; }

// bitch if we had to do a realloc. should never happen but i
// saw it happen once, so do not core on that.
if ( mem != memEnd )
log("json: json parser reallocated buffer. inefficient.");

return (JsonItem *)m_sb.getBufStart();
}
@@ -465,14 +473,26 @@ char *JsonItem::getValueAsString ( int32_t *valueLen ) {
}

// numbers...
static char s_numBuf[64];
// seems like when this overflowed when it was 64 bytes
// it went into s_vbuf in Version.cpp
static char s_numBuf[256];
if ( (float)m_valueLong == m_valueDouble ) {
*valueLen = sprintf ( s_numBuf,"%"INT32"", m_valueLong );
*valueLen = snprintf ( s_numBuf,255,"%"INT32"", m_valueLong );
return s_numBuf;
}

*valueLen = sprintf ( s_numBuf,"%f", m_valueDouble );
return s_numBuf;
if ( (double)m_value64 == m_valueDouble ) {
*valueLen = snprintf ( s_numBuf,255,"%"INT64"", m_value64 );
return s_numBuf;
}

// otherwise return the number as it was written in the json
// because it might have too many digits for printing as a double
*valueLen = m_valueLen;
return (char *)this + sizeof(JsonItem);

// *valueLen = snprintf ( s_numBuf,255,"%f", m_valueDouble );
// return s_numBuf;
}

bool endsInCurly ( char *s , int32_t slen ) {
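The getValueAsString() change above prints the number from the stored 64-bit integer when it round-trips exactly through the double, and otherwise echoes the digits as they appeared in the JSON so no precision is lost. A small self-contained sketch of that decision, with hypothetical names standing in for the real JsonItem API:

#include <cstdio>
#include <cstdint>

// Decide how to print a parsed JSON number: as an integer if the 64-bit
// value is exact, otherwise fall back to the original text from the input.
static const char *numberToText ( int64_t v64 , double vd ,
                                  const char *rawText , char *buf , size_t bufLen ) {
        if ( (double)v64 == vd ) {                 // integer representable exactly
                snprintf ( buf , bufLen , "%lld" , (long long)v64 );
                return buf;
        }
        return rawText;                            // keep every digit the author wrote
}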
Json.h (1)
@@ -34,6 +34,7 @@ class JsonItem {

// for JT_NUMBER
int32_t m_valueLong;
int64_t m_value64;
// for JT_NUMBER
double m_valueDouble;
Linkdb.cpp (59)
@@ -5068,27 +5068,51 @@ bool Links::set ( bool useRelNoFollow ,
|
||||
// . continue if this tag ain't an <a href> tag
|
||||
// . atom feeds have a <link href=""> field in them
|
||||
int32_t id = xml->getNodeId ( i );
|
||||
|
||||
int32_t slen;
|
||||
char *s ;
|
||||
// reset
|
||||
linkflags_t flags = 0;
|
||||
|
||||
/*
|
||||
MDW: now we set m_nodeId properly to TAG_LINK even in
|
||||
pure xml docs
|
||||
if ( xml->m_pureXml ) {
|
||||
// if it's a back tag continue
|
||||
if ( xml->isBackTag ( i ) ) continue;
|
||||
// must be a <> tag not innerhtml of tag
|
||||
if ( xml->m_nodes[i].m_nodeId != TAG_XMLTAG ) continue;
|
||||
// must be <link> i guess
|
||||
if ( xml->m_nodes[i].m_tagNameLen != 4 ) continue;
|
||||
if ( strncmp ( xml->m_nodes[i].m_tagName , "link" , 4))
|
||||
continue;
|
||||
// pure xml does not have ids like this so force it
|
||||
id = TAG_LINK;
|
||||
goto gotOne;
|
||||
}
|
||||
*/
|
||||
|
||||
if ( id != TAG_A &&
|
||||
id != TAG_LINK &&
|
||||
id != TAG_LINK && // rss feed url
|
||||
id != TAG_LOC && // sitemap.xml url
|
||||
id != TAG_AREA &&
|
||||
id != TAG_ENCLOSURE &&
|
||||
id != TAG_WEBLOG &&
|
||||
id != TAG_URLFROM && // <UrlFrom> for ahrefs.com
|
||||
id != TAG_FBORIGLINK )
|
||||
continue;
|
||||
|
||||
//gotOne:
|
||||
|
||||
urlattr = "href";
|
||||
if ( id == TAG_WEBLOG ) urlattr ="url";
|
||||
if ( id == TAG_FBORIGLINK ) m_isFeedBurner = true;
|
||||
|
||||
// if it's a back tag continue
|
||||
if ( xml->isBackTag ( i ) ) continue;
|
||||
// reset
|
||||
linkflags_t flags = 0;
|
||||
// . if it has rel=nofollow then ignore it
|
||||
// . for old titleRecs we should skip this part so that the
|
||||
// link: terms are indexed/hashed the same way in XmlDoc.cpp
|
||||
int32_t slen;
|
||||
char *s ;
|
||||
if ( useRelNoFollow ) s = xml->getString ( i , "rel", &slen ) ;
|
||||
if ( useRelNoFollow &&
|
||||
slen==8 && // ASCII
|
||||
@ -5112,6 +5136,7 @@ bool Links::set ( bool useRelNoFollow ,
|
||||
// follow, like in an rss feed.
|
||||
if ( linkLen==0 &&
|
||||
(id == TAG_LINK ||
|
||||
id == TAG_LOC || // sitemap.xml urls
|
||||
id == TAG_URLFROM ||
|
||||
id == TAG_FBORIGLINK) ) {
|
||||
// the the <link> node
|
||||
@ -5343,6 +5368,30 @@ bool Links::set ( char *buf , int32_t niceness ) { //char *coll,int32_t nicenes
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Links::print ( SafeBuf *sb ) {
|
||||
sb->safePrintf(
|
||||
"<table cellpadding=3 border=1>\n"
|
||||
"<tr>"
|
||||
"<td>#</td>"
|
||||
"<td colspan=40>"
|
||||
// table header row
|
||||
"Outlink"
|
||||
"</td>"
|
||||
"</tr>"
|
||||
);
|
||||
// find the link point to our url
|
||||
int32_t i;
|
||||
for ( i = 0 ; i < m_numLinks ; i++ ) {
|
||||
char *link = getLinkPtr(i);
|
||||
int32_t linkLen = getLinkLen(i);
|
||||
sb->safePrintf("<tr><td>%"INT32"</td><td>",i);
|
||||
sb->safeMemcpy(link,linkLen);
|
||||
sb->safePrintf("</td></tr>\n");
|
||||
}
|
||||
sb->safePrintf("</table>\n<br>\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
// . the blogroll must consist of 2 outlinks to two different external blogs
|
||||
// in order to be a valid blogroll
|
||||
// . add the all the site root outlinks in the valid blogroll into the
|
||||
|
Linkdb.h (2)
@@ -1183,6 +1183,8 @@ public:
// set from a simple text buffer
bool set ( char *buf , int32_t niceness ) ;

bool print ( SafeBuf *sb ) ;

// Link in ascii text
bool addLink(char *link,int32_t linkLen,int32_t nodeNum,bool setLinkHashes,
int32_t titleRecVersion, int32_t niceness , bool isRSS ,
Loop.cpp (14)
@@ -1193,6 +1193,8 @@ void sigvtalrmHandler ( int x , siginfo_t *info , void *y ) {

}

float g_cpuUsage = 0.0;

void sigalrmHandler ( int x , siginfo_t *info , void *y ) {

#ifdef PTHREADS
@@ -1206,6 +1208,17 @@ void sigalrmHandler ( int x , siginfo_t *info , void *y ) {

// stats
g_numAlarms++;

if ( ! g_inWaitState )
g_cpuUsage = .99 * g_cpuUsage + .01 * 100;
else
g_cpuUsage = .99 * g_cpuUsage + .01 * 000;

if ( g_profiler.m_realTimeProfilerRunning )
g_profiler.getStackFrame(0);

return;
/*
// . see where we are in the code
// . for computing cpu usage
// . if idling we will be in sigtimedwait() at the lowest level
@@ -1224,6 +1237,7 @@ void sigalrmHandler ( int x , siginfo_t *info , void *y ) {

if ( g_profiler.m_realTimeProfilerRunning )
g_profiler.getStackFrame(0);
*/
}

/*
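The g_cpuUsage update above is an exponential moving average sampled from the SIGALRM handler: each tick blends in 100 when the process was busy and 0 when it was idle, so the value settles on the recent busy percentage. A tiny standalone sketch of the same smoothing (the sampling hook and names are illustrative):

// Exponentially weighted CPU-busy estimate: new = 0.99*old + 0.01*sample,
// where the sample is 100 for a busy tick and 0 for an idle tick.
static float g_cpuBusyPct = 0.0f;

static void onTimerTick ( bool busy ) {
        float sample = busy ? 100.0f : 0.0f;
        g_cpuBusyPct = 0.99f * g_cpuBusyPct + 0.01f * sample;
}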
Makefile (5)
@@ -177,6 +177,11 @@ vclean:
@echo ""
@echo "sudo yum install gcc-c++"
@echo ""
@echo ""
@echo "If make fails on CentOS then first run:"
@echo ""
@echo "sudo yum install gcc-c++ openssl-devel"
@echo ""
@echo "*****"
@echo ""
Msg13.cpp
@@ -1950,6 +1950,13 @@ void gotHttpReply2 ( void *state ,
if ( --count > 0 && ! err ) {
copy = (char *)mdup(reply,replySize,"msg13d");
copyAllocSize = replySize;
// oom doing the mdup? i've seen this core us so fix it
// because calling sendreply_ass with a NULL
// 'copy' cores it.
if ( reply && ! copy ) {
copyAllocSize = 0;
err = ENOMEM;
}
}
// this is not freeable
if ( copy == g_fakeReply ) copyAllocSize = 0;
Msg3.cpp (7)
@@ -782,6 +782,13 @@ bool Msg3::doneScanning ( ) {
}
}

// if shutting down gb then limit to 20 so we can shutdown because
// it can't shutdown until all threads are out of the queue i think
if ( g_process.m_mode == EXIT_MODE && max < 0 ) {
log("msg3: forcing retries to 0 because shutting down");
max = 0;
}

// get base, returns NULL and sets g_errno to ENOCOLLREC on error
RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_collnum))) return true;
Msg39.cpp (47)
@@ -1452,6 +1452,9 @@ void Msg39::estimateHitsAndSendReply ( ) {
|
||||
need += 4;
|
||||
// then buckets. keys and counts
|
||||
need += (4+sizeof(FacetEntry)) * used;
|
||||
// for # of ALL docs that have this facet, even if
|
||||
// not in search results
|
||||
need += sizeof(int64_t);
|
||||
}
|
||||
// allocate
|
||||
SafeBuf tmp;
|
||||
@ -1523,6 +1526,12 @@ void Msg39::estimateHitsAndSendReply ( ) {
|
||||
//
|
||||
/////////////
|
||||
|
||||
// how many docs IN TOTAL had the facet, including all docs
|
||||
// that did not match the query.
|
||||
// it's 1-1 with the query terms.
|
||||
mr.ptr_numDocsThatHaveFacetList = NULL;
|
||||
mr.size_numDocsThatHaveFacetList = nqt * sizeof(int64_t);
|
||||
|
||||
|
||||
// . that is pretty much it,so serialize it into buffer,"reply"
|
||||
// . mr.ptr_docIds, etc., will point into the buffer so we can
|
||||
@ -1548,6 +1557,44 @@ void Msg39::estimateHitsAndSendReply ( ) {
|
||||
topDocIds = (int64_t *) mr.ptr_docIds;
|
||||
topScores = (double *) mr.ptr_scores;
|
||||
topRecs = (key_t *) mr.ptr_clusterRecs;
|
||||
|
||||
// sanity
|
||||
if ( nqt != m_msg2.m_numLists )
|
||||
log("query: nqt mismatch for q=%s",m_tmpq.m_orig);
|
||||
int64_t *facetCounts=(int64_t*)mr.ptr_numDocsThatHaveFacetList;
|
||||
for ( int32_t i = 0 ; i < nqt ; i++ ) {
|
||||
QueryTerm *qt = &m_tmpq.m_qterms[i];
|
||||
// default is 0 for non-facet termlists
|
||||
facetCounts[i] = qt->m_numDocsThatHaveFacet;
|
||||
}
|
||||
/*
|
||||
MDW - no, because some docs have the same facet field
|
||||
multiple times and we want a doc count. so do it in Posdb.cpp
|
||||
// fill these in now too
|
||||
int64_t *facetCounts=(int64_t*)mr.ptr_numDocsThatHaveFacetList;
|
||||
for ( int32_t i = 0 ; i < nqt ; i++ ) {
|
||||
// default is 0 for non-facet termlists
|
||||
facetCounts[i] = 0;
|
||||
QueryTerm *qt = &m_tmpq.m_qterms[i];
|
||||
// skip if not facet term
|
||||
bool isFacetTerm = false;
|
||||
if ( qt->m_fieldCode == FIELD_GBFACETSTR )
|
||||
isFacetTerm = true;
|
||||
if ( qt->m_fieldCode == FIELD_GBFACETINT )
|
||||
isFacetTerm = true;
|
||||
if ( qt->m_fieldCode == FIELD_GBFACETFLOAT )
|
||||
isFacetTerm = true;
|
||||
if ( ! isFacetTerm )
|
||||
continue;
|
||||
RdbList *list = &m_lists[i];
|
||||
// they should be all 12 bytes except first rec which
|
||||
// is 18 bytes.
|
||||
int64_t count = list->m_listSize;
|
||||
count -= 6;
|
||||
count /= 12;
|
||||
facetCounts[i] = count;
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
int32_t docCount = 0;
|
||||
|
Msg39.h (2)
@@ -188,6 +188,7 @@ public:
char *ptr_scoreInfo ; // transparency info
char *ptr_pairScoreBuf ; // transparency info
char *ptr_singleScoreBuf ; // transparency info
char *ptr_numDocsThatHaveFacetList ;
// this is now 1-1 with # of query terms!
char *ptr_facetHashList ; // list of all the facet values in serps
char *ptr_clusterRecs ; // key_t (might be empty)
@@ -199,6 +200,7 @@ public:
int32_t size_scoreInfo;
int32_t size_pairScoreBuf ;
int32_t size_singleScoreBuf;
int32_t size_numDocsThatHaveFacetList ;
int32_t size_facetHashList;
int32_t size_clusterRecs;
Msg3a.cpp (28)
@@ -25,6 +25,7 @@ void Msg3a::constructor ( ) {
|
||||
m_numDocIds = 0;
|
||||
m_collnums = NULL;
|
||||
m_inUse = false;
|
||||
m_q = NULL;
|
||||
|
||||
// need to call all safebuf constructors now to set m_label
|
||||
m_rbuf2.constructor();
|
||||
@ -143,6 +144,7 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
|
||||
reset();
|
||||
// remember ALL the stuff
|
||||
m_r = r;
|
||||
// this should be &SearchInput::m_q
|
||||
m_q = q;
|
||||
m_callback = callback;
|
||||
m_state = state;
|
||||
@ -761,6 +763,16 @@ bool Msg3a::gotAllShardReplies ( ) {
|
||||
// of posdb...
|
||||
m_numTotalEstimatedHits += mr->m_estimatedHits;
|
||||
|
||||
// accumulate total facet count from all shards for each term
|
||||
int64_t *facetCounts;
|
||||
facetCounts = (int64_t*)mr->ptr_numDocsThatHaveFacetList;
|
||||
for ( int32_t k = 0 ; k < mr->m_nqt ; k++ ) {
|
||||
QueryTerm *qt = &m_q->m_qterms[k];
|
||||
// sanity. this should never happen.
|
||||
if ( k >= m_q->m_numTerms ) break;
|
||||
qt->m_numDocsThatHaveFacet += facetCounts[k];
|
||||
}
|
||||
|
||||
// debug log stuff
|
||||
if ( ! m_debug ) continue;
|
||||
// cast these for printing out
|
||||
@ -771,7 +783,8 @@ bool Msg3a::gotAllShardReplies ( ) {
|
||||
// print out score_t
|
||||
logf( LOG_DEBUG,
|
||||
"query: msg3a: [%"PTRFMT"] %03"INT32") "
|
||||
"shard=%"INT32" docId=%012"UINT64" domHash=0x%02"XINT32" "
|
||||
"shard=%"INT32" docId=%012"UINT64" "
|
||||
"domHash=0x%02"XINT32" "
|
||||
"score=%f" ,
|
||||
(PTRTYPE)this ,
|
||||
j ,
|
||||
@ -1063,13 +1076,21 @@ bool Msg3a::mergeLists ( ) {
|
||||
// and Msg40.cpp ultimately.
|
||||
HashTableX *ht = &qt->m_facetHashTable;
|
||||
// we have to manually call this because Query::constructor()
|
||||
// might have been called explicitly
|
||||
ht->constructor();
|
||||
// might have been called explicitly. not now because
|
||||
// i added a call the Query::constructor() to call
|
||||
// QueryTerm::constructor() for each QueryTerm in
|
||||
// Query::m_qterms[]. this was causing a mem leak of
|
||||
// 'fhtqt' too beacause we were re-using the query for each
|
||||
// coll in the federated loop search.
|
||||
//ht->constructor();
|
||||
// 4 byte key, 4 byte score for counting facet values
|
||||
if ( ! ht->set(4,sizeof(FacetEntry),
|
||||
128,NULL,0,false,
|
||||
m_r->m_niceness,"fhtqt"))
|
||||
return true;
|
||||
// debug note
|
||||
// log("results: alloc fhtqt of %"PTRFMT" for st0=%"PTRFMT,
|
||||
// (PTRTYPE)ht->m_buf,(PTRTYPE)m_q->m_st0Ptr);
|
||||
// sanity
|
||||
if ( ! ht->m_isWritable ) {char *xx=NULL;*xx=0;}
|
||||
}
|
||||
@ -1186,7 +1207,6 @@ bool Msg3a::mergeLists ( ) {
|
||||
if ( ! sortFacetEntries() )
|
||||
return true;
|
||||
|
||||
|
||||
//if ( m_r->m_getSectionStats ) return true;
|
||||
//
|
||||
// HACK: END section stats merge
|
||||
|
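The Msg3a.cpp hunk above accumulates, per query term, the facet document counts returned by each shard, so the merged result reports a collection-wide total. A compact sketch of that merge step, with simplified containers standing in for the real Msg39Reply/QueryTerm structures:

#include <cstdint>
#include <vector>

// Each shard reply carries one count per query term; summing them per term
// yields the global number of docs that have that facet.
static void mergeFacetCounts ( std::vector<int64_t> &totalPerTerm ,
                               const std::vector<int64_t> &shardCounts ) {
        size_t n = totalPerTerm.size() < shardCounts.size()
                 ? totalPerTerm.size() : shardCounts.size();
        for ( size_t k = 0 ; k < n ; k++ )
                totalPerTerm[k] += shardCounts[k];
}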
Msg40.cpp (70)
@@ -108,6 +108,7 @@ Msg40::Msg40() {
|
||||
m_omitCount = 0;
|
||||
m_printCount = 0;
|
||||
//m_numGigabitInfos = 0;
|
||||
m_numCollsToSearch = 0;
|
||||
}
|
||||
|
||||
#define MAX2 50
|
||||
@ -140,6 +141,14 @@ void Msg40::resetBuf2 ( ) {
|
||||
}
|
||||
|
||||
Msg40::~Msg40() {
|
||||
// free tmp msg3as now
|
||||
for ( int32_t i = 0 ; i < m_numCollsToSearch ; i++ ) {
|
||||
if ( ! m_msg3aPtrs[i] ) continue;
|
||||
if ( m_msg3aPtrs[i] == &m_msg3a ) continue;
|
||||
mdelete ( m_msg3aPtrs[i] , sizeof(Msg3a), "tmsg3a");
|
||||
delete ( m_msg3aPtrs[i] );
|
||||
m_msg3aPtrs[i] = NULL;
|
||||
}
|
||||
if ( m_buf ) mfree ( m_buf , m_bufMaxSize , "Msg40" );
|
||||
m_buf = NULL;
|
||||
resetBuf2();
|
||||
@ -2108,7 +2117,8 @@ bool Msg40::gotSummary ( ) {
|
||||
// socket but rather calls doneSendingWrapper() which can call
|
||||
// this function again to send another chunk
|
||||
// . when we are truly done sending all the data, then we set lastChunk
|
||||
// to true and TcpServer.cpp will destroy m_socket when done
|
||||
// to true and TcpServer.cpp will destroy m_socket when done.
|
||||
// no, actually we just set m_streamingMode to false i guess above
|
||||
if ( sb->length() &&
|
||||
// did client browser close the socket on us midstream?
|
||||
! m_socketHadError &&
|
||||
@ -5774,6 +5784,7 @@ bool printHttpMime ( State0 *st ) {
|
||||
//
|
||||
/////////////////
|
||||
|
||||
/*
|
||||
// return 1 if a should be before b
|
||||
static int csvPtrCmp ( const void *a, const void *b ) {
|
||||
//JsonItem *ja = (JsonItem **)a;
|
||||
@ -5791,6 +5802,7 @@ static int csvPtrCmp ( const void *a, const void *b ) {
|
||||
int val = strcmp(pa,pb);
|
||||
return val;
|
||||
}
|
||||
*/
|
||||
|
||||
#include "Json.h"
|
||||
|
||||
@ -5802,12 +5814,10 @@ bool Msg40::printCSVHeaderRow ( SafeBuf *sb ) {
|
||||
//Msg40 *msg40 = &st->m_msg40;
|
||||
//int32_t numResults = msg40->getNumResults();
|
||||
|
||||
/*
|
||||
char tmp1[1024];
|
||||
SafeBuf tmpBuf (tmp1 , 1024);
|
||||
|
||||
char tmp2[1024];
|
||||
SafeBuf nameBuf (tmp2, 1024);
|
||||
|
||||
char nbuf[27000];
|
||||
HashTableX nameTable;
|
||||
if ( ! nameTable.set ( 8,4,2048,nbuf,27000,false,0,"ntbuf") )
|
||||
@ -5905,9 +5915,8 @@ bool Msg40::printCSVHeaderRow ( SafeBuf *sb ) {
|
||||
}
|
||||
|
||||
// sort them
|
||||
qsort ( ptrs , numPtrs , 4 , csvPtrCmp );
|
||||
qsort ( ptrs , numPtrs , sizeof(char *) , csvPtrCmp );
|
||||
|
||||
// set up table to map field name to column for printing the json items
|
||||
HashTableX *columnTable = &m_columnTable;
|
||||
if ( ! columnTable->set ( 8,4, numPtrs * 4,NULL,0,false,0,"coltbl" ) )
|
||||
return false;
|
||||
@ -5922,6 +5931,37 @@ bool Msg40::printCSVHeaderRow ( SafeBuf *sb ) {
|
||||
if ( ! columnTable->addKey ( &h64 , &i ) )
|
||||
return false;
|
||||
}
|
||||
*/
|
||||
|
||||
Msg20 *msg20s[100];
|
||||
int32_t i;
|
||||
for ( i = 0 ; i < m_needFirstReplies && i < 100 ; i++ ) {
|
||||
Msg20 *m20 = getCompletedSummary(i);
|
||||
if ( ! m20 ) break;
|
||||
msg20s[i] = m20;
|
||||
}
|
||||
|
||||
int32_t numPtrs = 0;
|
||||
|
||||
char tmp2[1024];
|
||||
SafeBuf nameBuf (tmp2, 1024);
|
||||
|
||||
int32_t ct = 0;
|
||||
if ( msg20s[0] ) ct = msg20s[0]->m_r->m_contentType;
|
||||
|
||||
CollectionRec *cr =g_collectiondb.getRec(m_firstCollnum);
|
||||
|
||||
// . set up table to map field name to col for printing the json items
|
||||
// . call this from PageResults.cpp
|
||||
printCSVHeaderRow2 ( sb ,
|
||||
ct ,
|
||||
cr ,
|
||||
&nameBuf ,
|
||||
&m_columnTable ,
|
||||
msg20s ,
|
||||
i , // numResults ,
|
||||
&numPtrs
|
||||
);
|
||||
|
||||
m_numCSVColumns = numPtrs;
|
||||
|
||||
@ -6016,6 +6056,8 @@ bool Msg40::printJsonItemInCSV ( State0 *st , int32_t ix ) {
|
||||
|
||||
// sanity
|
||||
if ( column == -1 ) {//>= numCSVColumns ) {
|
||||
// don't show it any more...
|
||||
continue;
|
||||
// add a new column...
|
||||
int32_t newColnum = numCSVColumns + 1;
|
||||
// silently drop it if we already have too many cols
|
||||
@ -6467,9 +6509,12 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
|
||||
if ( format == FORMAT_XML ) {
|
||||
sb->safePrintf("\t<facet>\n"
|
||||
"\t\t<field>%s</field>\n"
|
||||
"\t\t<value>"
|
||||
, term
|
||||
);
|
||||
, term );
|
||||
sb->safePrintf("\t\t<totalDocsWithField>%"INT64""
|
||||
"</totalDocsWithField>\n"
|
||||
, qt->m_numDocsThatHaveFacet );
|
||||
sb->safePrintf("\t\t<value>");
|
||||
|
||||
if ( isString )
|
||||
sb->safePrintf("<![CDATA[%"UINT32",",
|
||||
(uint32_t)*fvh);
|
||||
@ -6569,9 +6614,12 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
|
||||
if ( format == FORMAT_JSON ) {
|
||||
sb->safePrintf("{\n"
|
||||
"\t\"field\":\"%s\",\n"
|
||||
"\t\"value\":\""
|
||||
, term
|
||||
, term
|
||||
);
|
||||
sb->safePrintf("\t\"totalDocsWithField\":%"INT64""
|
||||
",\n", qt->m_numDocsThatHaveFacet );
|
||||
sb->safePrintf("\t\"value\":\"");
|
||||
|
||||
if ( isString )
|
||||
sb->safePrintf("%"UINT32","
|
||||
, (uint32_t)*fvh);
|
||||
|
Msg40.h (1)
@@ -109,6 +109,7 @@ class Msg40 {
//void (* callback)(class Msg40 *THIS, void *state));
void (* callback)(void *state));

void makeCallback();
bool gotCacheReply();
// a continuation function of getResults() above
bool prepareToGetDocIds ( );
@ -244,6 +244,7 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
|
||||
, dr
|
||||
, cr->m_coll
|
||||
);
|
||||
log("crawlbot: %s",sb2.getBufStart());
|
||||
HttpRequest hr2;
|
||||
hr2.set ( sb2.getBufStart() , sb2.length() , sock );
|
||||
return sendPageResults ( sock , &hr2 );
|
||||
@ -283,6 +284,59 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
|
||||
, dr
|
||||
, cr->m_coll
|
||||
);
|
||||
log("crawlbot: %s",sb2.getBufStart());
|
||||
HttpRequest hr2;
|
||||
hr2.set ( sb2.getBufStart() , sb2.length() , sock );
|
||||
return sendPageResults ( sock , &hr2 );
|
||||
}
|
||||
|
||||
// . now the urls.csv is also a query on gbss files
|
||||
// . make an httprequest on stack and call it
|
||||
// . only do this for version 3
|
||||
// i.e. GET /v3/crawl/download/token-collectionname_urls.csv
|
||||
if ( fmt == FORMAT_CSV &&
|
||||
rdbId == RDB_SPIDERDB &&
|
||||
path[0] == '/' &&
|
||||
path[1] == 'v' &&
|
||||
path[2] == '3' ) {
|
||||
char tmp2[5000];
|
||||
SafeBuf sb2(tmp2,5000);
|
||||
// never dedup
|
||||
int32_t dr = 0;
|
||||
// do not dedup for crawls either it is too confusing!!!!
|
||||
// ppl wonder where the results are!
|
||||
dr = 0;
|
||||
sb2.safePrintf("GET /search?"
|
||||
// this is not necessary
|
||||
//"icc=1&"
|
||||
"format=csv&"
|
||||
// no site clustering
|
||||
"sc=0&"
|
||||
// never dedup.
|
||||
"dr=0&"
|
||||
"c=%s&"
|
||||
"n=10000000&"
|
||||
// stream it now
|
||||
// can't stream until we fix headers be printed
|
||||
// in Msg40.cpp. so gbssUrl->Url etc.
|
||||
// mdw: ok should work now
|
||||
"stream=1&"
|
||||
//"stream=0&"
|
||||
// no summary similarity dedup, only exact
|
||||
// doc content hash. otherwise too slow!!
|
||||
"pss=0&"
|
||||
// no gigabits
|
||||
"dsrt=0&"
|
||||
// do not compute summary. 0 lines.
|
||||
//"ns=0&"
|
||||
"q=gbrevsortbyint%%3AgbssSpiderTime+"
|
||||
"gbssIsDiffbotObject%%3A0"
|
||||
"&"
|
||||
//"prepend=type%%3Ajson"
|
||||
"\r\n\r\n"
|
||||
, cr->m_coll
|
||||
);
|
||||
log("crawlbot: %s",sb2.getBufStart());
|
||||
HttpRequest hr2;
|
||||
hr2.set ( sb2.getBufStart() , sb2.length() , sock );
|
||||
return sendPageResults ( sock , &hr2 );
|
||||
@ -768,7 +822,7 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
|
||||
lastSpidered = 0;
|
||||
|
||||
bool isProcessed = false;
|
||||
if ( srep ) isProcessed = srep->m_sentToDiffbot;
|
||||
if ( srep ) isProcessed = srep->m_sentToDiffbotThisTime;
|
||||
|
||||
if ( srep && srep->m_hadDiffbotError )
|
||||
isProcessed = false;
|
||||
@ -848,8 +902,10 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
|
||||
// lastspidertime>={roundstart} --> spiders disabled rule
|
||||
// so that we do not spider a url twice in the same round
|
||||
if ( ufn >= 0 && //! cr->m_spidersEnabled[ufn] ) {
|
||||
cr->m_regExs[ufn].length() &&
|
||||
// we set this to 0 instead of using the checkbox
|
||||
cr->m_maxSpidersPerRule[ufn] <= 0 ) {
|
||||
strstr(cr->m_regExs[ufn].getBufStart(),"round") ) {
|
||||
//cr->m_maxSpidersPerRule[ufn] <= 0 ) {
|
||||
priority = -5;
|
||||
}
|
||||
|
||||
@ -935,10 +991,12 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
|
||||
//, iptoa(sreq->m_firstIp)
|
||||
);
|
||||
// print priority
|
||||
if ( priority == SPIDER_PRIORITY_FILTERED )
|
||||
//if ( priority == SPIDER_PRIORITY_FILTERED )
|
||||
// we just turn off the spiders now
|
||||
if ( ufn >= 0 && cr->m_maxSpidersPerRule[ufn] <= 0 )
|
||||
sb->safePrintf("url ignored");
|
||||
else if ( priority == SPIDER_PRIORITY_BANNED )
|
||||
sb->safePrintf("url banned");
|
||||
//else if ( priority == SPIDER_PRIORITY_BANNED )
|
||||
// sb->safePrintf("url banned");
|
||||
else if ( priority == -4 )
|
||||
sb->safePrintf("error");
|
||||
else if ( priority == -5 )
|
||||
@ -4254,7 +4312,7 @@ bool getSpiderRequestMetaList ( char *doc ,
|
||||
sreq.m_hostHash32 = url.getHostHash32();
|
||||
sreq.m_domHash32 = url.getDomainHash32();
|
||||
sreq.m_siteHash32 = url.getHostHash32();
|
||||
sreq.m_probDocId = probDocId;
|
||||
//sreq.m_probDocId = probDocId;
|
||||
sreq.m_hopCount = 0; // we're a seed
|
||||
sreq.m_hopCountValid = true;
|
||||
sreq.m_addedTime = now;
|
||||
|
PageGet.cpp (15)
@@ -407,6 +407,10 @@ bool processLoop ( void *state ) {
|
||||
if ( format == FORMAT_XML ) sb->reset();
|
||||
if ( format == FORMAT_JSON ) sb->reset();
|
||||
|
||||
if ( xd->m_contentType == CT_JSON ) sb->reset();
|
||||
if ( xd->m_contentType == CT_XML ) sb->reset();
|
||||
if ( xd->m_contentType == CT_STATUS ) sb->reset();
|
||||
|
||||
// for undoing the stuff below
|
||||
int32_t startLen2 = sb->length();//p;
|
||||
|
||||
@ -431,6 +435,9 @@ bool processLoop ( void *state ) {
|
||||
if ( xd->m_contentType == CT_JSON )
|
||||
printDisclaimer = false;
|
||||
|
||||
if ( xd->m_contentType == CT_STATUS )
|
||||
printDisclaimer = false;
|
||||
|
||||
if ( format == FORMAT_XML ) printDisclaimer = false;
|
||||
if ( format == FORMAT_JSON ) printDisclaimer = false;
|
||||
|
||||
@ -624,6 +631,8 @@ bool processLoop ( void *state ) {
|
||||
includeHeader = false;
|
||||
if ( xd->m_contentType == CT_XML )
|
||||
includeHeader = false;
|
||||
if ( xd->m_contentType == CT_STATUS )
|
||||
includeHeader = false;
|
||||
|
||||
if ( format == FORMAT_XML ) includeHeader = false;
|
||||
if ( format == FORMAT_JSON ) includeHeader = false;
|
||||
@ -679,6 +688,7 @@ bool processLoop ( void *state ) {
|
||||
// do not calc title or print it if doc is xml or json
|
||||
if ( ctype == CT_XML ) sbend = sbstart;
|
||||
if ( ctype == CT_JSON ) sbend = sbstart;
|
||||
if ( ctype == CT_STATUS ) sbend = sbstart;
|
||||
|
||||
for ( char *t = sbstart ; t < sbend ; t++ ) {
|
||||
// title tag?
|
||||
@ -813,6 +823,8 @@ bool processLoop ( void *state ) {
|
||||
// do not do term highlighting if json
|
||||
if ( xd->m_contentType == CT_JSON )
|
||||
queryHighlighting = false;
|
||||
if ( xd->m_contentType == CT_STATUS )
|
||||
queryHighlighting = false;
|
||||
|
||||
SafeBuf tmp;
|
||||
SafeBuf *xb = sb;
|
||||
@ -917,6 +929,9 @@ bool processLoop ( void *state ) {
|
||||
if ( xd->m_contentType == CT_JSON )
|
||||
contentType = "application/json";
|
||||
|
||||
if ( xd->m_contentType == CT_STATUS )
|
||||
contentType = "application/json";
|
||||
|
||||
if ( xd->m_contentType == CT_XML )
|
||||
contentType = "test/xml";
|
||||
|
||||
|
@ -521,9 +521,18 @@ skipReplaceHost:
|
||||
}
|
||||
|
||||
// recovery mode? reocvered from coring?
|
||||
if ((flags & PFLAG_RECOVERYMODE)&& format == FORMAT_HTML )
|
||||
if ((flags & PFLAG_RECOVERYMODE)&& format == FORMAT_HTML ) {
|
||||
fb.safePrintf("<b title=\"Recovered from core"
|
||||
"\">x</b>");
|
||||
// this is only 8-bits at the moment so it's capped
|
||||
// at 255. this level is 1 the first time we core
|
||||
// and are restarted.
|
||||
if ( h->m_pingInfo.m_recoveryLevel > 1 )
|
||||
fb.safePrintf("<sup>%"INT32"</sup>",
|
||||
(int32_t)
|
||||
h->m_pingInfo.m_recoveryLevel);
|
||||
}
|
||||
|
||||
if ((flags & PFLAG_RECOVERYMODE)&& format != FORMAT_HTML )
|
||||
fb.safePrintf("Recovered from core");
|
||||
|
||||
@ -553,14 +562,15 @@ skipReplaceHost:
|
||||
,h->m_pingInfo.m_currentSpiders
|
||||
);
|
||||
|
||||
if ( format == FORMAT_HTML && h->m_pingInfo.m_udpSlotsInUse ) {
|
||||
if ( format == FORMAT_HTML &&
|
||||
h->m_pingInfo.m_udpSlotsInUseIncoming ) {
|
||||
char *f1 = "";
|
||||
char *f2 = "";
|
||||
if ( h->m_pingInfo.m_udpSlotsInUse >= 200 ) {
|
||||
if ( h->m_pingInfo.m_udpSlotsInUseIncoming >= 200 ) {
|
||||
f1 = "<b>";
|
||||
f2 = "</b>";
|
||||
}
|
||||
if ( h->m_pingInfo.m_udpSlotsInUse >= 400 ) {
|
||||
if ( h->m_pingInfo.m_udpSlotsInUseIncoming >= 400 ) {
|
||||
f1 = "<b><font color=red>";
|
||||
f2 = "</font></b>";
|
||||
}
|
||||
@ -571,7 +581,7 @@ skipReplaceHost:
|
||||
"%s"
|
||||
"</span>"
|
||||
,f1
|
||||
,h->m_pingInfo.m_udpSlotsInUse
|
||||
,h->m_pingInfo.m_udpSlotsInUseIncoming
|
||||
,f2
|
||||
);
|
||||
}
|
||||
@ -679,7 +689,7 @@ skipReplaceHost:
|
||||
|
||||
sb.safePrintf("\t\t<udpSlotsInUse>%"INT32""
|
||||
"</udpSlotsInUse>\n",
|
||||
h->m_pingInfo.m_udpSlotsInUse);
|
||||
h->m_pingInfo.m_udpSlotsInUseIncoming);
|
||||
|
||||
sb.safePrintf("\t\t<tcpSocketsInUse>%"INT32""
|
||||
"</tcpSocketsInUse>\n",
|
||||
@ -791,7 +801,7 @@ skipReplaceHost:
|
||||
sb.safePrintf("\t\t\"errorTryAgains\":%"INT32",\n",
|
||||
h->m_pingInfo.m_etryagains);
|
||||
sb.safePrintf("\t\t\"udpSlotsInUse\":%"INT32",\n",
|
||||
h->m_pingInfo.m_udpSlotsInUse);
|
||||
h->m_pingInfo.m_udpSlotsInUseIncoming);
|
||||
sb.safePrintf("\t\t\"tcpSocketsInUse\":%"INT32",\n",
|
||||
h->m_pingInfo.m_tcpSocketsInUse);
|
||||
|
||||
@ -1463,7 +1473,8 @@ skipReplaceHost:
|
||||
"<td>x (status flag)</td>"
|
||||
"<td>Indicates host has abruptly exited due to a fatal "
|
||||
"error (cored) and "
|
||||
"restarted itself."
|
||||
"restarted itself. The exponent is how many times it has "
|
||||
"done this. If no exponent, it only did it once."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
@ -1498,7 +1509,8 @@ skipReplaceHost:
|
||||
"<tr class=poo>"
|
||||
"<td><nobr>U (status flag)</nobr></td>"
|
||||
"<td>Indicates the number of active UDP transactions "
|
||||
"which are either outgoing or incoming requests."
|
||||
"which are incoming requests. These will pile up if a "
|
||||
"host can't handle them fast enough."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
|
PageReindex.cpp
@@ -449,9 +449,10 @@ bool Msg1c::gotList ( ) {
sr.m_urlIsDocId = 1;
sr.m_fakeFirstIp = 1;
// for msg12 locking
sr.m_probDocId = docId;
//sr.m_probDocId = docId;
// use test-parser not test-spider
sr.m_useTestSpiderDir = 0;
//sr.m_useTestSpiderDir = 0;
sr.m_parentIsSiteMap = 0;
// now you can recycle content instead of re-downloading it
// for every docid
sr.m_recycleContent = gr->m_recycleContent;
PageResults.cpp (405)
@@ -42,7 +42,7 @@ bool replaceParm2 ( char *cgi , SafeBuf *newUrl ,
|
||||
char *oldUrl , int32_t oldUrlLen ) ;
|
||||
|
||||
|
||||
bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) ;
|
||||
bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) ;
|
||||
|
||||
bool printJsonItemInCSV ( char *json , SafeBuf *sb , class State0 *st ) ;
|
||||
|
||||
@ -128,6 +128,8 @@ bool sendReply ( State0 *st , char *reply ) {
|
||||
|
||||
g_stats.logAvgQueryTime(st->m_startTime);
|
||||
|
||||
//log("results: debug: in sendReply deleting st=%"PTRFMT,(PTRTYPE)st);
|
||||
|
||||
if ( ! savedErr ) { // g_errno ) {
|
||||
g_stats.m_numSuccess++;
|
||||
// . one hour cache time... no 1000 hours, basically infinite
|
||||
@ -543,10 +545,6 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
|
||||
// save this count so we know if TcpServer.cpp calls destroySocket(s)
|
||||
st->m_numDestroys = s->m_numDestroys;
|
||||
|
||||
// you have to say "&header=1" to get back the header for json now.
|
||||
// later on maybe it will default to on.
|
||||
st->m_header = hr->getLong("header",0);
|
||||
|
||||
// . parse it up
|
||||
// . this returns false and sets g_errno and, maybe, g_msg on error
|
||||
SearchInput *si = &st->m_si;
|
||||
@ -563,6 +561,9 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
|
||||
return sendReply ( st, NULL );
|
||||
}
|
||||
|
||||
// for debug
|
||||
si->m_q.m_st0Ptr = (char *)st;
|
||||
|
||||
int32_t codeLen = 0;
|
||||
char *code = hr->getString("code", &codeLen, NULL);
|
||||
// allow up to 1000 results per query for paying clients
|
||||
@ -572,9 +573,15 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
|
||||
if ( cr ) st->m_collnum = cr->m_collnum;
|
||||
else st->m_collnum = -1;
|
||||
|
||||
// turn this on for json output, unless diffbot collection
|
||||
if ( format == FORMAT_JSON && ! cr->m_isCustomCrawl )
|
||||
st->m_header = 1;
|
||||
int32_t defHdr = 1;
|
||||
|
||||
// default is no header for diffbot only
|
||||
if ( cr->m_isCustomCrawl || strcmp(cr->m_coll,"GLOBAL-INDEX") == 0 )
|
||||
defHdr = 0;
|
||||
|
||||
// you have to say "&header=1" to get back the header for json now.
|
||||
// later on maybe it will default to on.
|
||||
st->m_header = hr->getLong("header",defHdr);
|
||||
|
||||
// take this out here as well!
|
||||
// limit here
|
||||
@ -635,7 +642,13 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
|
||||
return sendReply(st,NULL);
|
||||
}
|
||||
|
||||
|
||||
// filter that one query causing the memleak for now
|
||||
// if ( strstr(si->m_q.m_orig,
|
||||
// "type:json AND ((((query=humanLanguage:en") ) {
|
||||
// g_errno = EQUERYINGDISABLED;
|
||||
// return sendReply(st,NULL);
|
||||
// }
|
||||
|
||||
// LAUNCH ADS
|
||||
// . now get the ad space for this query
|
||||
// . don't get ads if we're not on the first page of results
|
||||
@ -692,6 +705,8 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
|
||||
// save error
|
||||
st->m_errno = g_errno;
|
||||
|
||||
//log("results: debug: new state=%"PTRFMT,(PTRTYPE)st);
|
||||
|
||||
// wait for ads and spellcheck and results?
|
||||
if ( !st->m_gotAds || !st->m_gotSpell || !st->m_gotResults )
|
||||
return false;
|
||||
@ -1128,6 +1143,7 @@ bool gotResults ( void *state ) {
|
||||
// record that
|
||||
st->m_took = took;
|
||||
|
||||
//log("results: debug: in gotResults state=%"PTRFMT,(PTRTYPE)st);
|
||||
|
||||
// grab the query
|
||||
Msg40 *msg40 = &(st->m_msg40);
|
||||
@ -1153,10 +1169,12 @@ bool gotResults ( void *state ) {
|
||||
log("res: socket still in streaming mode. wtf?");
|
||||
st->m_socket->m_streamingMode = false;
|
||||
}
|
||||
log("msg40: done streaming. nuking state=%"PTRFMT" q=%s. "
|
||||
log("msg40: done streaming. nuking state=0x%"PTRFMT" "
|
||||
"msg40=0x%"PTRFMT" q=%s. "
|
||||
"msg20sin=%i msg20sout=%i sendsin=%i sendsout=%i "
|
||||
"numrequests=%i numreplies=%i "
|
||||
,(PTRTYPE)st
|
||||
,(PTRTYPE)msg40
|
||||
,si->m_q.m_orig
|
||||
|
||||
, msg40->m_numMsg20sIn
|
||||
@ -1167,6 +1185,15 @@ bool gotResults ( void *state ) {
|
||||
, msg40->m_numReplies
|
||||
|
||||
);
|
||||
|
||||
// for some reason the socket still exists and will time out
|
||||
//g_tcpServer.destroySocket ( st->m_socket );
|
||||
|
||||
// just let tcpserver nuke it, but don't double call
|
||||
// the callback, doneSendingWrapper9()... because msg40
|
||||
// will have been deleted!
|
||||
st->m_socket->m_callback = NULL;
|
||||
|
||||
mdelete(st, sizeof(State0), "PageResults2");
|
||||
delete st;
|
||||
return true;
|
||||
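The comments above spell out the ordering: once streaming is done the State0 (and its Msg40) get deleted, so the socket's callback has to be nulled first or TcpServer could fire doneSendingWrapper9() on freed memory. A hedged sketch of that pattern with stand-in types; FakeSocket and FakeState are not the real classes, just enough structure to show the order of operations.

#include <cstdio>

// hypothetical stand-ins for TcpSocket / State0, just to show the ordering
struct FakeState  { const char *m_q; };
struct FakeSocket { void (*m_callback)(void *); void *m_state; };

static void doneSendingWrapper ( void *state ) {
	printf ( "callback ran on %s\n" , ((FakeState *)state)->m_q );
}

// what the server layer does later: invoke the callback only if still set
static void serverDestroySocket ( FakeSocket *s ) {
	if ( s->m_callback ) s->m_callback ( s->m_state );
	else                 printf ( "no callback, socket nuked quietly\n" );
}

int main ( ) {
	FakeState *st = new FakeState();
	st->m_q = "q=test";
	FakeSocket sock = { doneSendingWrapper , st };
	// done streaming: clear the callback BEFORE deleting the state it
	// points at, so the server can't call into freed memory
	sock.m_callback = NULL;
	delete st;
	serverDestroySocket ( &sock );
	return 0;
}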
@ -1729,6 +1756,8 @@ bool printLeftNavColumn ( SafeBuf &sb, State0 *st ) {
|
||||
// MDW: support gigabits in xml/json format again
|
||||
//if ( format != FORMAT_HTML ) numGigabits = 0;
|
||||
|
||||
if ( ! st->m_header )
|
||||
numGigabits = 0;
|
||||
|
||||
// print gigabits
|
||||
Gigabit *gigabits = (Gigabit *)gbuf->getBufStart();
|
||||
@ -2131,6 +2160,7 @@ bool printSearchResultsHeader ( State0 *st ) {
|
||||
// print first [ for json
|
||||
if ( si->m_format == FORMAT_JSON ) {
|
||||
if ( st->m_header ) sb->safePrintf("{\n");
|
||||
// this is just for diffbot really...
|
||||
else sb->safePrintf("[\n");
|
||||
}
|
||||
|
||||
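Per the hunk above and the matching tail hunk further down, JSON output is framed as a full object holding a "results" array when the header is requested, and as a bare array (the diffbot-style list) when it is not. A small sketch of that framing decision, using std::string in place of the real SafeBuf.

#include <string>
#include <cstdio>

// emit the JSON frame the hunks describe: full object with a "results"
// array when header==true, bare array otherwise
static std::string frameResults ( bool header , const std::string &items ) {
	std::string out;
	if ( header ) out += "{\n\"results\":[\n";
	else          out += "[\n";
	out += items;
	out += "]\n";
	if ( header ) out += "}\n";
	return out;
}

int main ( ) {
	printf ( "%s--\n%s" ,
	         frameResults ( true  , "{\"title\":\"a\"}\n" ).c_str() ,
	         frameResults ( false , "{\"title\":\"a\"}\n" ).c_str() );
	return 0;
}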
@ -2626,7 +2656,8 @@ bool printSearchResultsHeader ( State0 *st ) {
|
||||
|
||||
|
||||
// when streaming results we lookup the facets last
|
||||
if ( si->m_format != FORMAT_HTML && ! si->m_streamResults )
|
||||
if ( si->m_format != FORMAT_HTML && ! si->m_streamResults &&
|
||||
st->m_header )
|
||||
msg40->printFacetTables ( sb );
|
||||
|
||||
// now print gigabits if we are xml/json
|
||||
@ -2647,8 +2678,7 @@ bool printSearchResultsHeader ( State0 *st ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if ( si->m_format == FORMAT_JSON &&
|
||||
! cr->m_isCustomCrawl ) {
|
||||
if ( si->m_format == FORMAT_JSON && st->m_header ) {
|
||||
sb->safePrintf("\"results\":[\n");
|
||||
return true;
|
||||
}
|
||||
@ -3170,7 +3200,7 @@ bool printSearchResultsTail ( State0 *st ) {
|
||||
sb->m_length -= 2;
|
||||
sb->safePrintf("\n");
|
||||
}
|
||||
// print ending ] for json
|
||||
// print ending ] for json search results
|
||||
sb->safePrintf("]\n");
|
||||
|
||||
// when streaming results we lookup the facets last
|
||||
@ -3903,13 +3933,14 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
// ptr_content is set in the msg20reply.
|
||||
if ( si->m_format == FORMAT_CSV &&
|
||||
mr->ptr_content &&
|
||||
mr->m_contentType == CT_JSON ) {
|
||||
// spider STATUS docs are json
|
||||
(mr->m_contentType == CT_JSON || mr->m_contentType == CT_STATUS)){
|
||||
// parse it up
|
||||
char *json = mr->ptr_content;
|
||||
// only print header row once, so pass in that flag
|
||||
if ( ! st->m_printedHeaderRow ) {
|
||||
sb->reset();
|
||||
printCSVHeaderRow ( sb , st );
|
||||
printCSVHeaderRow ( sb , st , mr->m_contentType );
|
||||
st->m_printedHeaderRow = true;
|
||||
}
|
||||
printJsonItemInCSV ( json , sb , st );
|
||||
@ -4026,6 +4057,83 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
sb->safePrintf("\",\n");
|
||||
}
|
||||
|
||||
// print spider status pages special
|
||||
if ( mr->ptr_content &&
|
||||
si->m_format == FORMAT_HTML &&
|
||||
mr->m_contentType == CT_STATUS ) {
|
||||
if ( *numPrintedSoFar )
|
||||
sb->safePrintf("<br><hr><br>\n");
|
||||
// skip to gbssurl
|
||||
char *s = strstr ( mr->ptr_content,"\"gbssUrl\":");
|
||||
if ( ! s ) {
|
||||
log("results: missing gbssUrl");
|
||||
goto badformat;
|
||||
}
|
||||
// then do two columns after the two urls
|
||||
char *e = strstr ( s , "\"gbssStatusCode\":" );
|
||||
if ( ! e ) {
|
||||
log("results: missing gbssStatusCode");
|
||||
goto badformat;
|
||||
}
|
||||
char *m = strstr ( e , "\"gbssConsecutiveErrors\":");
|
||||
if ( ! m ) {
|
||||
log("results: missing gbssConsecutiveErrors");
|
||||
goto badformat;
|
||||
}
|
||||
// exclude \0
|
||||
char *end = mr->ptr_content + mr->size_content - 1;
|
||||
// use a table with 2 columns
|
||||
// so we can use \n to separate lines and don't have to add brs
|
||||
// and boldify just the main url, not the redir url!
|
||||
sb->safePrintf("<pre style=display:inline;>"
|
||||
"\"gbssUrl\":\""
|
||||
"<b style=color:blue;><a href=/get?"
|
||||
"c=%s&"
|
||||
"d=%"INT64">"
|
||||
, cr->m_coll
|
||||
, mr->m_docId
|
||||
);
|
||||
char *s2 = strstr ( s , "\"gbssFinalRedirectUrl\":");
|
||||
char *bend = e - 3;
|
||||
if ( s2 ) bend = s2 - 3;
|
||||
sb->safeMemcpy ( s+11 , bend - (s+11));
|
||||
sb->safePrintf("</a></b></pre>\",<br>");
|
||||
// now print redir url if there
|
||||
if ( s2 ) {
|
||||
sb->safePrintf("<pre style=display:inline;>");
|
||||
sb->safeMemcpy ( s2 , e-s2 );
|
||||
sb->removeLastChar('\n');
|
||||
sb->safePrintf("</pre>");
|
||||
}
|
||||
sb->safePrintf("<table border=0 cellpadding=0 cellspacing=0>"
|
||||
"<tr><td>");
|
||||
sb->safePrintf("<pre>");
|
||||
//int32_t off = sb->length();
|
||||
sb->safeMemcpy ( e , m - e );
|
||||
sb->safePrintf("</pre>");
|
||||
sb->safePrintf("</td><td>");
|
||||
sb->safePrintf("<pre>");
|
||||
sb->safeMemcpy ( m , end - m );
|
||||
// remove last \n
|
||||
sb->removeLastChar('\n');
|
||||
sb->removeLastChar('}');
|
||||
sb->removeLastChar('\n');
|
||||
sb->safePrintf("</pre>\n");
|
||||
sb->safePrintf("</td></tr></table>");
|
||||
// replace \n with <br>
|
||||
// sb->safeReplace2 ( "\n" , 1 ,
|
||||
// "<br>" , 4 ,
|
||||
// 0,//niceness ,
|
||||
// off );
|
||||
// inc it
|
||||
*numPrintedSoFar = *numPrintedSoFar + 1;
|
||||
// just in case
|
||||
sb->nullTerm();
|
||||
return true;
|
||||
}
|
||||
|
||||
badformat:
|
||||
|
||||
Highlight hi;
|
||||
|
||||
// get the url
|
||||
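The spider-status branch above renders the raw JSON by locating the "gbssUrl", "gbssStatusCode" and "gbssConsecutiveErrors" markers with strstr() and copying the spans between them into the url line and the two table columns. A reduced sketch of that slicing on a made-up status doc; only the three field names come from the code above, everything else is illustrative.

#include <cstdio>
#include <cstring>

int main ( ) {
	const char *json =
		"{\"gbssUrl\":\"http://abc.com/\",\n"
		"\"gbssStatusCode\":0,\n"
		"\"gbssIp\":\"1.2.3.4\",\n"
		"\"gbssConsecutiveErrors\":0,\n"
		"\"gbssHopCount\":1}";
	// locate the three markers the page code keys off of
	const char *s = strstr ( json , "\"gbssUrl\":" );
	const char *e = strstr ( json , "\"gbssStatusCode\":" );
	const char *m = strstr ( json , "\"gbssConsecutiveErrors\":" );
	if ( ! s || ! e || ! m ) { printf ( "badformat\n" ); return 1; }
	// url span first, then the two column spans, like the two-column table
	printf ( "url part : %.*s\n" , (int)(e - s) , s );
	printf ( "col 1    : %.*s\n" , (int)(m - e) , e );
	printf ( "col 2    : %s\n"   , m );
	return 0;
}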
@ -4359,7 +4467,6 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
//
|
||||
///////
|
||||
|
||||
|
||||
// the a href tag
|
||||
if ( si->m_format == FORMAT_HTML ) {
|
||||
sb->safePrintf ( "<a href=" );
|
||||
@ -4887,6 +4994,9 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
// . docId for possible cached link
|
||||
// . might have merged a bunch together
|
||||
sb->safePrintf("\t\t<docId>%"INT64"</docId>\n",mr->m_docId );
|
||||
}
|
||||
|
||||
if ( si->m_format == FORMAT_XML && mr->m_contentType != CT_STATUS ) {
|
||||
// . show the site root
|
||||
// . for hompages.com/users/fred/mypage.html this will be
|
||||
// homepages.com/users/fred/
|
||||
@ -4934,6 +5044,9 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
// . docId for possible cached link
|
||||
// . might have merged a bunch together
|
||||
sb->safePrintf("\t\t\"docId\":%"INT64",\n",mr->m_docId );
|
||||
}
|
||||
|
||||
if ( si->m_format == FORMAT_JSON && mr->m_contentType != CT_STATUS ) {
|
||||
// . show the site root
|
||||
// . for hompages.com/users/fred/mypage.html this will be
|
||||
// homepages.com/users/fred/
|
||||
@ -5182,10 +5295,12 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
sb->safePrintf (" - "
|
||||
"<a style=color:blue; "
|
||||
"href=\"/search?sb=1&c=%s&"
|
||||
"q=url2%%3A"
|
||||
//"q=url2%%3A"
|
||||
"q=gbfieldmatch%%3AgbssUrl%%3A"
|
||||
, coll
|
||||
);
|
||||
sb->urlEncode ( url , gbstrlen(url) , false );
|
||||
// do not include ending \0
|
||||
sb->urlEncode ( mr->ptr_ubuf , mr->size_ubuf-1 , false );
|
||||
sb->safePrintf ( "\">"
|
||||
"spider info</a>\n"
|
||||
);
|
||||
@ -7810,28 +7925,44 @@ int csvPtrCmp ( const void *a, const void *b ) {
|
||||
if ( strcmp(pb,"product.title") == 0 ) return 1;
|
||||
if ( strcmp(pa,"title") == 0 ) return -1;
|
||||
if ( strcmp(pb,"title") == 0 ) return 1;
|
||||
|
||||
// this is now taken care of from the 'supps[]' array below
|
||||
// by prepending two digits before each field name
|
||||
|
||||
// put url first for spider status docs
|
||||
// if ( strcmp(pa,"gbssUrl") == 0 ) return -1;
|
||||
// if ( strcmp(pb,"gbssUrl") == 0 ) return 1;
|
||||
|
||||
// if ( strcmp(pa,"gbssStatusMsg") == 0 ) return -1;
|
||||
// if ( strcmp(pb,"gbssStatusMsg") == 0 ) return 1;
|
||||
|
||||
// if ( strcmp(pa,"gbssStatusCode") == 0 ) return -1;
|
||||
// if ( strcmp(pb,"gbssStatusCode") == 0 ) return 1;
|
||||
|
||||
|
||||
// otherwise string compare
|
||||
int val = strcmp(pa,pb);
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
|
||||
#include "Json.h"
|
||||
|
||||
//
|
||||
// print header row in csv
|
||||
//
|
||||
bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) {
|
||||
bool printCSVHeaderRow2 ( SafeBuf *sb ,
|
||||
int32_t ct ,
|
||||
CollectionRec *cr ,
|
||||
SafeBuf *nameBuf ,
|
||||
HashTableX *columnTable ,
|
||||
Msg20 **msg20s ,
|
||||
int32_t numMsg20s ,
|
||||
int32_t *numPtrsArg ) {
|
||||
|
||||
Msg40 *msg40 = &st->m_msg40;
|
||||
int32_t numResults = msg40->getNumResults();
|
||||
*numPtrsArg = 0;
|
||||
|
||||
char tmp1[1024];
|
||||
SafeBuf tmpBuf (tmp1 , 1024);
|
||||
|
||||
char tmp2[1024];
|
||||
SafeBuf nameBuf (tmp2, 1024);
|
||||
|
||||
char nbuf[27000];
|
||||
HashTableX nameTable;
|
||||
if ( ! nameTable.set ( 8,4,2048,nbuf,27000,false,0,"ntbuf") )
|
||||
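As the comment above notes, the pinned spider-status columns now carry a two-digit prefix in supps[] ("00gbssUrl", "01gbssDocId", ...) so csvPtrCmp's plain strcmp() falls into a fixed order, and the digits are stripped again before the header is printed. A small sketch of that ordering trick; the column names are taken from supps[], the comparator and loop are a simplified stand-in.

#include <cstdio>
#include <cstring>
#include <cstdlib>
#include <cctype>

// same idea as csvPtrCmp's fallback: a plain string compare, which the
// two leading digits turn into an explicit column order
static int cmp ( const void *a , const void *b ) {
	return strcmp ( *(const char *const *)a , *(const char *const *)b );
}

int main ( ) {
	const char *cols[] = { "gbssIp" , "01gbssDocId" , "gbssLanguage" , "00gbssUrl" };
	int n = sizeof(cols) / sizeof(cols[0]);
	qsort ( cols , n , sizeof(char *) , cmp );
	for ( int i = 0 ; i < n ; i++ ) {
		const char *hdr = cols[i];
		// strip the two order digits before printing, like the header loop
		if ( isdigit((unsigned char)hdr[0]) && isdigit((unsigned char)hdr[1]) )
			hdr += 2;
		printf ( "%s%s" , i ? "," : "" , hdr );
	}
	printf ( "\n" );
	return 0;
}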
@ -7839,16 +7970,86 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) {
|
||||
|
||||
int32_t niceness = 0;
|
||||
|
||||
// if doing spider status docs not all will have dupofdocid field
|
||||
char *supps [] = {
|
||||
"00gbssUrl",
|
||||
"01gbssDocId",
|
||||
"02gbssDiscoveredTime",
|
||||
"03gbssSpiderTime",
|
||||
"06gbssContentLen",
|
||||
"07gbssDupOfDocId" ,
|
||||
"08gbssNumRedirects",
|
||||
"09gbssFinalRedirectUrl",
|
||||
"10gbssCrawlDelayMS",
|
||||
"11gbssCrawlRound",
|
||||
"12gbssPrevTotalNumIndexAttempts",
|
||||
"13gbssHopCount",
|
||||
"14gbssStatusMsg",
|
||||
"15gbssSentToDiffbotThisTime",
|
||||
"16gbssDiffbotReplyMsg",
|
||||
|
||||
"gbssIp",
|
||||
"gbssPercentContentChanged",
|
||||
"gbssDownloadStartTime",
|
||||
"gbssDownloadEndTime",
|
||||
"gbssContentType",
|
||||
"gbssHttpStatus",
|
||||
"gbssWasIndexed",
|
||||
"gbssAgeInIndex",
|
||||
"gbssPrevTotalNumIndexSuccesses",
|
||||
"gbssPrevTotalNumIndexFailures",
|
||||
"gbssDownloadStartTimeMS",
|
||||
"gbssDownloadEndTimeMS",
|
||||
"gbssDownloadDurationMS",
|
||||
"gbssIpLookupTimeMS",
|
||||
"gbssSiteNumInlinks",
|
||||
"gbssSiteRank",
|
||||
"gbssLanguage",
|
||||
"gbssDiffbotReplyCode",
|
||||
"gbssDiffbotLen",
|
||||
"gbssDiffbotReplyResponseTimeMS",
|
||||
"gbssDiffbotReplyRetries",
|
||||
NULL };
|
||||
|
||||
for ( int32_t i = 0 ; supps[i] ; i++ ) {
|
||||
// don't add these column headers to non spider status docs
|
||||
if ( ct != CT_STATUS ) break;
|
||||
char *skip = supps[i];
|
||||
// if custom crawl only show fields in supps with digits
|
||||
if ( cr->m_isCustomCrawl && ! is_digit(skip[0]) ) continue;
|
||||
// skip over the two order digits
|
||||
if ( is_digit(skip[0]) ) skip += 2;
|
||||
// don't include the order digits in the hash
|
||||
int64_t h64 = hash64n ( skip );
|
||||
if ( nameTable.isInTable ( &h64 ) ) continue;
|
||||
// only show diffbot column headers for custom (diffbot) crawls
|
||||
if ( strncmp(skip,"gbssDiffbot",11) == 0 &&
|
||||
( ! cr || ! cr->m_isCustomCrawl ) )
|
||||
break;
|
||||
// record offset of the name for our hash table
|
||||
int32_t nameBufOffset = nameBuf->length();
|
||||
// store the name in our name buffer
|
||||
if ( ! nameBuf->safeStrcpy (supps[i])) return false;
|
||||
if ( ! nameBuf->pushChar ( '\0' ) ) return false;
|
||||
// it's new. add it
|
||||
if ( ! nameTable.addKey ( &h64 ,&nameBufOffset)) return false;
|
||||
}
|
||||
|
||||
// . scan every fucking json item in the search results.
|
||||
// . we still need to deal with the case when there are so many
|
||||
// search results we have to dump each msg20 reply to disk in
|
||||
// order. then we'll have to update this code to scan that file.
|
||||
|
||||
for ( int32_t i = 0 ; i < numResults ; i++ ) {
|
||||
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) { // numResults
|
||||
|
||||
// if custom crawl urls.csv only show the supps[] from above
|
||||
if ( ct == CT_STATUS && cr->m_isCustomCrawl )
|
||||
break;
|
||||
|
||||
// get the msg20 reply for search result #i
|
||||
Msg20 *m20 = msg40->m_msg20[i];
|
||||
Msg20Reply *mr = m20->m_r;
|
||||
//Msg20 *m20 = msg40->m_msg20[i];
|
||||
//Msg20Reply *mr = m20->m_r;
|
||||
Msg20Reply *mr = msg20s[i]->m_r;
|
||||
|
||||
if ( ! mr ) {
|
||||
log("results: missing msg20 reply for result #%"INT32"",i);
|
||||
@ -7889,6 +8090,13 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) {
|
||||
strcmp(ji->m_name,"html")==0)
|
||||
continue;
|
||||
|
||||
// for spider status docs skip these
|
||||
if ( ct == CT_STATUS && ji->m_name ) {
|
||||
if (!strcmp(ji->m_name,"") )
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// reset length of buf to 0
|
||||
tmpBuf.reset();
|
||||
|
||||
@ -7902,12 +8110,12 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) {
|
||||
if ( nameTable.isInTable ( &h64 ) ) continue;
|
||||
|
||||
// record offset of the name for our hash table
|
||||
int32_t nameBufOffset = nameBuf.length();
|
||||
int32_t nameBufOffset = nameBuf->length();
|
||||
|
||||
// store the name in our name buffer
|
||||
if ( ! nameBuf.safeStrcpy ( tmpBuf.getBufStart() ) )
|
||||
if ( ! nameBuf->safeStrcpy ( tmpBuf.getBufStart() ) )
|
||||
return false;
|
||||
if ( ! nameBuf.pushChar ( '\0' ) )
|
||||
if ( ! nameBuf->pushChar ( '\0' ) )
|
||||
return false;
|
||||
|
||||
// it's new. add it
|
||||
@ -7923,30 +8131,129 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) {
|
||||
for ( int32_t i = 0 ; i < nameTable.m_numSlots ; i++ ) {
|
||||
if ( ! nameTable.m_flags[i] ) continue;
|
||||
int32_t off = *(int32_t *)nameTable.getValueFromSlot(i);
|
||||
char *p = nameBuf.getBufStart() + off;
|
||||
char *p = nameBuf->getBufStart() + off;
|
||||
ptrs[numPtrs++] = p;
|
||||
if ( numPtrs >= 1024 ) break;
|
||||
}
|
||||
|
||||
// pass back to caller
|
||||
*numPtrsArg = numPtrs;
|
||||
|
||||
// sort them
|
||||
qsort ( ptrs , numPtrs , sizeof(char *) , csvPtrCmp );
|
||||
|
||||
// set up table to map field name to column for printing the json items
|
||||
HashTableX *columnTable = &st->m_columnTable;
|
||||
//HashTableX *columnTable = &st->m_columnTable;
|
||||
if ( ! columnTable->set ( 8,4, numPtrs * 4,NULL,0,false,0,"coltbl" ) )
|
||||
return false;
|
||||
|
||||
// now print them out as the header row
|
||||
for ( int32_t i = 0 ; i < numPtrs ; i++ ) {
|
||||
|
||||
char *hdr = ptrs[i];
|
||||
|
||||
if ( i > 0 && ! sb->pushChar(',') ) return false;
|
||||
if ( ! sb->safeStrcpy ( ptrs[i] ) ) return false;
|
||||
|
||||
// skip the two order digits
|
||||
if ( ct == CT_STATUS && is_digit(hdr[0]) ) hdr += 2;
|
||||
|
||||
// save it
|
||||
char *skip = hdr;
|
||||
|
||||
// now transform the hdr from gbss* into the old way
|
||||
if ( ! cr->m_isCustomCrawl )
|
||||
goto skipTransform;
|
||||
|
||||
if ( ! strcmp(hdr,"gbssUrl") )
|
||||
hdr = "Url";
|
||||
if ( ! strcmp(hdr,"gbssDocId") )
|
||||
hdr = "Doc ID";
|
||||
// when url was first discovered
|
||||
if ( ! strcmp(hdr,"gbssDiscoveredTime") ) // need this!
|
||||
hdr = "Url Discovered Time";
|
||||
// when it was crawled this time
|
||||
if ( ! strcmp(hdr,"gbssSpiderTime" ) )
|
||||
hdr = "Crawled Time";
|
||||
if ( ! strcmp(hdr,"gbssContentLen") )
|
||||
hdr = "Content Length";
|
||||
if ( ! strcmp(hdr,"gbssDupOfDocId") )
|
||||
hdr = "Duplicate Of";
|
||||
if ( ! strcmp(hdr,"gbssNumRedirects") )
|
||||
hdr = "Redirects";
|
||||
if ( ! strcmp(hdr,"gbssFinalRedirectUrl") )
|
||||
hdr = "Redirected To";
|
||||
if ( ! strcmp(hdr,"gbssCrawlDelayMS") )
|
||||
hdr = "Robots.txt Crawl Delay (ms)";
|
||||
if ( ! strcmp(hdr,"gbssPercentContentChanged") )
|
||||
hdr = "Percent Changed";
|
||||
if ( ! strcmp(hdr,"gbssCrawlRound") )
|
||||
hdr = "Crawl Round";
|
||||
if ( ! strcmp(hdr,"gbssPrevTotalNumIndexAttempts") )
|
||||
hdr = "Crawl Try #";
|
||||
if ( ! strcmp(hdr,"gbssHopCount") )
|
||||
hdr = "Hop Count";
|
||||
if ( ! strcmp(hdr,"gbssIp") )
|
||||
hdr = "IP";
|
||||
if ( ! strcmp(hdr,"gbssSentToDiffbotThisTime") )
|
||||
hdr = "Process Attempted";
|
||||
if ( ! strcmp(hdr,"gbssDiffbotReplyMsg") )
|
||||
hdr = "Process Response";
|
||||
if ( ! strcmp(hdr,"gbssStatusMsg") )
|
||||
hdr = "Crawl Status";
|
||||
|
||||
//if ( ! strcmp(hdr,"gbssMatchingUrlFilter") )
|
||||
// hdr = "Matching Expression";
|
||||
// value is 'url ignored', 'will spider next round', 'error' or
|
||||
// a numeric priority
|
||||
// if ( ! strcmp(hdr,"gbssSpiderPriority") )
|
||||
// hdr = "Matching Action";
|
||||
|
||||
// new columns
|
||||
// if ( ! strcmp(hdr,"gbssAgeInIndex") )
|
||||
// hdr = "Age in Index";
|
||||
|
||||
// if not transformed, then do not print it out
|
||||
if ( ! strncmp(hdr,"gbss",4) )
|
||||
continue;
|
||||
|
||||
skipTransform:
|
||||
if ( ! sb->safeStrcpy ( hdr ) ) return false;
|
||||
|
||||
// record the hash of each one for printing out further json
|
||||
// objects in the same order so columns are aligned!
|
||||
int64_t h64 = hash64n ( ptrs[i] );
|
||||
int64_t h64 = hash64n ( skip ); // ptrs[i] );
|
||||
if ( ! columnTable->addKey ( &h64 , &i ) )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
//
|
||||
// print header row in csv
|
||||
//
|
||||
bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
|
||||
|
||||
Msg40 *msg40 = &st->m_msg40;
|
||||
int32_t numResults = msg40->getNumResults();
|
||||
|
||||
char tmp2[1024];
|
||||
SafeBuf nameBuf (tmp2, 1024);
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( st->m_collnum );
|
||||
|
||||
int32_t numPtrs = 0;
|
||||
|
||||
printCSVHeaderRow2 ( sb ,
|
||||
ct ,
|
||||
cr ,
|
||||
&nameBuf ,
|
||||
&st->m_columnTable ,
|
||||
msg40->m_msg20 ,
|
||||
numResults ,
|
||||
&numPtrs
|
||||
);
|
||||
|
||||
st->m_numCSVColumns = numPtrs;
|
||||
|
||||
if ( ! sb->pushChar('\n') )
|
||||
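printCSVHeaderRow2 above records hash(field name) -> column index in columnTable so printJsonItemInCSV can later drop each JSON value into the right column even when items carry their fields in a different order. A sketch of that alignment step, with std::unordered_map standing in for HashTableX and hash64n; the field names follow the spider-status fields, the rest is invented.

#include <cstdio>
#include <string>
#include <vector>
#include <utility>
#include <unordered_map>

int main ( ) {
	// the header row fixes the column order once
	std::vector<std::string> headers = { "gbssUrl" , "gbssStatusMsg" , "gbssHopCount" };
	std::unordered_map<std::string,int> columnOf;            // stand-in for columnTable
	for ( int i = 0 ; i < (int)headers.size() ; i++ ) columnOf[headers[i]] = i;

	// one "json item" whose fields arrive out of order
	std::vector<std::pair<std::string,std::string>> item =
		{ { "gbssHopCount" , "2" } , { "gbssUrl" , "http://abc.com/" } };

	// place each value by looked-up column, leave unknown columns empty
	std::vector<std::string> row ( headers.size() );
	for ( size_t k = 0 ; k < item.size() ; k++ ) {
		auto it = columnOf.find ( item[k].first );
		if ( it == columnOf.end() ) continue;             // not in header: skip, don't core
		row[it->second] = item[k].second;
	}
	for ( size_t i = 0 ; i < row.size() ; i++ )
		printf ( "%s%s" , i ? "," : "" , row[i].c_str() );
	printf ( "\n" );
	return 0;
}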
@ -7960,6 +8267,8 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) {
|
||||
// returns false and sets g_errno on error
|
||||
bool printJsonItemInCSV ( char *json , SafeBuf *sb , State0 *st ) {
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( st->m_collnum );
|
||||
|
||||
int32_t niceness = 0;
|
||||
|
||||
// parse the json
|
||||
@ -8018,6 +8327,9 @@ bool printJsonItemInCSV ( char *json , SafeBuf *sb , State0 *st ) {
|
||||
int32_t slot = columnTable->getSlot ( &h64 ) ;
|
||||
// MUST be in there
|
||||
if ( slot < 0 ) {
|
||||
// we do not transform all gbss fields any more for
|
||||
// diffbot to avoid overpopulating the csv
|
||||
if ( cr && cr->m_isCustomCrawl ) continue;
|
||||
// do not core on this anymore...
|
||||
log("serps: json column not in table : %s",ji->m_name);
|
||||
continue;
|
||||
@ -9039,6 +9351,12 @@ bool printSearchFiltersBar ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
s_mi[n].m_icon = NULL;
|
||||
n++;
|
||||
|
||||
s_mi[n].m_menuNum = 5;
|
||||
s_mi[n].m_title = "Output CSV";
|
||||
s_mi[n].m_cgi = "format=csv";
|
||||
s_mi[n].m_icon = NULL;
|
||||
n++;
|
||||
|
||||
// show/hide banned
|
||||
s_mi[n].m_menuNum = 6;
|
||||
s_mi[n].m_title = "Hide banned results";
|
||||
@ -9116,19 +9434,19 @@ bool printSearchFiltersBar ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
|
||||
s_mi[n].m_menuNum = 11;
|
||||
s_mi[n].m_title = "Respider all results";
|
||||
s_mi[n].m_cgi = "/admin/reindex";
|
||||
s_mi[n].m_cgi = "";//"/admin/reindex";
|
||||
s_mi[n].m_icon = NULL;
|
||||
n++;
|
||||
|
||||
s_mi[n].m_menuNum = 11;
|
||||
s_mi[n].m_title = "Delete all results";
|
||||
s_mi[n].m_cgi = "/admin/reindex";
|
||||
s_mi[n].m_cgi = "";//"/admin/reindex";
|
||||
s_mi[n].m_icon = NULL;
|
||||
n++;
|
||||
|
||||
s_mi[n].m_menuNum = 11;
|
||||
s_mi[n].m_title = "Scrape from google/bing";
|
||||
s_mi[n].m_cgi = "/admin/inject";
|
||||
s_mi[n].m_cgi = "";//"/admin/inject";
|
||||
s_mi[n].m_icon = NULL;
|
||||
n++;
|
||||
|
||||
@ -9355,7 +9673,7 @@ bool printMenu ( SafeBuf *sb , int32_t menuNum , HttpRequest *hr ) {
|
||||
}
|
||||
|
||||
bool replaceParm ( char *cgi , SafeBuf *newUrl , HttpRequest *hr ) {
|
||||
|
||||
if ( ! cgi[0] ) return true;
|
||||
// get original request url. this is not \0 terminated
|
||||
char *src = hr->m_origUrlRequest;
|
||||
int32_t srcLen = hr->m_origUrlRequestLen;
|
||||
@ -9371,7 +9689,8 @@ bool replaceParm2 ( char *cgi , SafeBuf *newUrl ,
|
||||
char *srcEnd = src + srcLen;
|
||||
|
||||
char *equal = strstr(cgi,"=");
|
||||
if ( ! equal ) return log("results: %s has no equal sign",cgi);
|
||||
if ( ! equal )
|
||||
return log("results: %s has no equal sign",cgi);
|
||||
int32_t cgiLen = equal - cgi;
|
||||
|
||||
char *found = NULL;
|
||||
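replaceParm2 above splits the cgi argument at '=' and then looks for that parameter name in the original request URL so the new value can be swapped in. A simplified sketch of the same substitution on std::string; the boundary matching here is naive and the helper name is made up, the real code works on the non-terminated request buffer.

#include <cstdio>
#include <string>

// replace (or append) one name=value pair in a query string
static std::string replaceParm ( const std::string &cgi , std::string url ) {
	size_t eq = cgi.find ( '=' );
	if ( eq == std::string::npos ) return url;           // "no equal sign"
	std::string name = cgi.substr ( 0 , eq + 1 );        // keep the '='
	size_t pos = url.find ( name );                      // naive substring match
	if ( pos == std::string::npos )
		return url + ( url.find('?') == std::string::npos ? "?" : "&" ) + cgi;
	size_t end = url.find ( '&' , pos );
	if ( end == std::string::npos ) end = url.size();
	return url.substr ( 0 , pos ) + cgi + url.substr ( end );
}

int main ( ) {
	printf ( "%s\n" , replaceParm ( "format=csv" ,
	         "/search?q=test&format=json&n=10" ).c_str() );
	printf ( "%s\n" , replaceParm ( "header=1" ,
	         "/search?q=test" ).c_str() );
	return 0;
}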
PageResults.h
@ -13,6 +13,14 @@
|
||||
#define PADDING 8
|
||||
#define SCROLLBAR_WIDTH 20
|
||||
|
||||
bool printCSVHeaderRow2 ( class SafeBuf *sb ,
|
||||
int32_t ct ,
|
||||
class CollectionRec *cr ,
|
||||
class SafeBuf *nameBuf ,
|
||||
class HashTableX *columnTable ,
|
||||
class Msg20 **msg20s ,
|
||||
int32_t numMsg20s ,
|
||||
int32_t *numPtrsArg ) ;
|
||||
|
||||
class State0 {
|
||||
public:
|
||||
|
46
PageRoot.cpp
@ -666,7 +666,7 @@ bool printLeftColumnRocketAndTabs ( SafeBuf *sb ,
|
||||
{"SYNTAX","/syntax.html"},
|
||||
{"USERS","/users.html"},
|
||||
{"ABOUT","/about.html"},
|
||||
{"NEWS","/news.html"},
|
||||
{"BLOG","/blog.html"},
|
||||
// take this out for now
|
||||
//{"FEED","/searchfeed.html"},
|
||||
{"FAQ","/faq.html"},
|
||||
@ -1202,11 +1202,7 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r , TcpSocket *sock ) {
|
||||
if ( printRedBox2 ( &sb , sock , r ) ) // true ) )
|
||||
sb.safePrintf("<br>\n");
|
||||
|
||||
/*
|
||||
|
||||
do not show table for open source installs
|
||||
|
||||
sb.safePrintf("<table cellpadding=3>\n");
|
||||
sb.safePrintf("<br><center><table cellpadding=3>\n");
|
||||
sb.safePrintf("\n");
|
||||
|
||||
char *root = "";
|
||||
@ -1216,16 +1212,42 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r , TcpSocket *sock ) {
|
||||
sb.safePrintf("<tr valign=top>\n");
|
||||
|
||||
//sb.safePrintf("<td align=center><div style=width:50px;height:50px;display:inline-block;background-color:red;></div></td>\n");
|
||||
sb.safePrintf("<td align=center><img height=71px width=50px "
|
||||
sb.safePrintf("<td width=10%% "
|
||||
"align=center><img style=padding-right:10px; "
|
||||
"height=71px width=50px "
|
||||
"src=%s/opensource.png></td>\n"
|
||||
, root );
|
||||
|
||||
sb.safePrintf("<td><font size=+1><b>Open Source!</b>"
|
||||
"</font><br>\n");
|
||||
sb.brify2("Gigablast is now available as an <a href=https://github.com/gigablast/open-source-search-engine>open source search engine</a> on github.com. Download it today. Finally a robust, scalable search solution in C/C++ that has been in development and used commercially since 2000. <a href=http://www.gigablast.com/faq.html#features>Features</a>. Limited support available for free."
|
||||
,80);
|
||||
sb.safePrintf("<td width=45%%><font size=+1><b>Open Source!</b>"
|
||||
"</font><br><br>\n");
|
||||
sb.brify2("Gigablast is now available as an <a href=https://github.com/gigablast/open-source-search-engine>open source search engine</a> on github.com. Download it today. Finally a robust, scalable search solution in C/C++ that has been in development and used commercially since 2000. <a href=http://www.gigablast.com/faq.html#features>Features</a>."
|
||||
,40);
|
||||
//sb.safePrintf("<br><br>");
|
||||
sb.safePrintf("</td>");
|
||||
|
||||
sb.safePrintf("<td><font size=+1><b>ScreenShots</b>"
|
||||
"</font><br><br>\n");
|
||||
|
||||
sb.safePrintf("<a href=/ss_settings.png><img width=150 height=81 src=ss_settings_thumb.png></a>");
|
||||
|
||||
sb.safePrintf("<br><br>");
|
||||
sb.safePrintf("</td></tr>\n");
|
||||
|
||||
sb.safePrintf("<a href=/ss_hosts.png><img width=150 height=81 src=ss_hosts_thumb.png></a>");
|
||||
|
||||
sb.safePrintf("<br><br>");
|
||||
|
||||
sb.safePrintf("<a href=/ss_filters.png><img width=150 height=81 src=ss_filters_thumb.png></a>");
|
||||
|
||||
sb.safePrintf("</td>");
|
||||
|
||||
|
||||
sb.safePrintf("</tr>\n");
|
||||
|
||||
sb.safePrintf("</table></center>\n");
|
||||
|
||||
/*
|
||||
|
||||
do not show table for open source installs
|
||||
|
||||
|
||||
// donate with paypal
|
||||
PageSockets.cpp
@ -349,11 +349,16 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
|
||||
"<td><b>hostname</b></td>";
|
||||
}
|
||||
|
||||
UdpSlot *slot = server->m_head3;
|
||||
int32_t callbackReadyCount = 0;
|
||||
for ( ; slot ; slot = slot->m_next3 , callbackReadyCount++ );
|
||||
|
||||
p->safePrintf ( "<table %s>"
|
||||
"<tr class=hdrow><td colspan=19>"
|
||||
"<center>"
|
||||
//"<font size=+1>"
|
||||
"<b>%s</b> (%"INT32" transactions)"
|
||||
"(%"INT32" reads ready)"
|
||||
//"</font>"
|
||||
"</td></tr>"
|
||||
"<tr bgcolor=#%s>"
|
||||
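The new callbackReadyCount above is just a walk of the UdpServer's m_head3 list via m_next3 to report how many slots have reads ready for their callbacks. The idiom in isolation, with a hypothetical slot node in place of UdpSlot.

#include <cstdio>

// hypothetical slot node mirroring UdpSlot's m_next3 chaining
struct Slot { Slot *m_next3; };

static int countReadsReady ( Slot *head ) {
	int n = 0;
	for ( Slot *s = head ; s ; s = s->m_next3 ) n++;
	return n;
}

int main ( ) {
	Slot c = { NULL } , b = { &c } , a = { &b };
	printf ( "%d reads ready\n" , countReadsReady ( &a ) );   // prints 3
	return 0;
}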
@ -380,6 +385,7 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
|
||||
"</tr>\n" ,
|
||||
TABLE_STYLE,
|
||||
title , server->getNumUsedSlots() ,
|
||||
callbackReadyCount ,
|
||||
DARK_BLUE ,
|
||||
dd );
|
||||
|
||||
Pages.cpp
@ -4092,7 +4092,7 @@ bool printRedBox ( SafeBuf *mb , TcpSocket *sock , HttpRequest *hr ) {
|
||||
for ( int32_t i = 1 ; i < g_hostdb.getNumHosts() ; i++ ) {
|
||||
Host *h = &g_hostdb.m_hosts[i];
|
||||
if ( g_hostdb.isDead( h ) ) continue;
|
||||
if ( h->m_pingInfo.m_udpSlotsInUse >= 400 ) jammedHosts++;
|
||||
if ( h->m_pingInfo.m_udpSlotsInUseIncoming>= 400)jammedHosts++;
|
||||
}
|
||||
if ( jammedHosts > 0 ) {
|
||||
if ( adds ) mb->safePrintf("<br>");
|
||||
@ -4101,8 +4101,8 @@ bool printRedBox ( SafeBuf *mb , TcpSocket *sock , HttpRequest *hr ) {
|
||||
if ( out == 1 ) s = " is";
|
||||
mb->safePrintf("%s",box);
|
||||
mb->safePrintf("%"INT32" host%s jammed with "
|
||||
"over %"INT32" outstanding "
|
||||
"udp transactions. "
|
||||
"over %"INT32" unhandled "
|
||||
"incoming udp requests. "
|
||||
"See <a href=/admin/sockets?c=%s>sockets</a>"
|
||||
" table.",jammedHosts,s,400,coll);
|
||||
mb->safePrintf("%s",boxEnd);
|
||||
|
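printRedBox now flags a host as jammed when its incoming (not total) UDP slot count reaches 400, since only unhandled incoming requests mean the host can't keep up. A standalone sketch of that scan; the host struct is stubbed and the counts are invented.

#include <cstdio>

// stubbed-down host record; only the fields the check needs
struct FakeHost { int m_udpSlotsInUseIncoming; bool m_dead; };

int main ( ) {
	FakeHost hosts[] = { { 12 , false } , { 450 , false } , { 900 , true } };
	int jammed = 0;
	for ( int i = 0 ; i < 3 ; i++ ) {
		if ( hosts[i].m_dead ) continue;                 // dead hosts are skipped
		if ( hosts[i].m_udpSlotsInUseIncoming >= 400 ) jammed++;
	}
	if ( jammed > 0 )
		printf ( "%d host%s jammed with over %d unhandled incoming udp requests\n" ,
		         jammed , jammed == 1 ? " is" : "s are" , 400 );
	return 0;
}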
121
Parms.cpp
@ -1625,6 +1625,11 @@ bool printDropDown ( int32_t n , SafeBuf* sb, char *name, int32_t select,
|
||||
// . by default, minus 2 includes minus 3, the new "FILTERED" priority
|
||||
// . it is like "BANNED" but does not mean the url is low quality necessarily
|
||||
if ( includeMinusTwo ) i = -3;
|
||||
|
||||
// no more DELETE, etc.
|
||||
i = 0;
|
||||
if ( select < 0 ) select = 0;
|
||||
|
||||
for ( ; i < n ; i++ ) {
|
||||
if ( i == select ) s = " selected";
|
||||
else s = "";
|
||||
@ -3446,8 +3451,11 @@ bool Parms::setFromFile ( void *THIS ,
|
||||
Xml xml;
|
||||
//char buf [ MAX_XML_CONF ];
|
||||
SafeBuf sb;
|
||||
if ( filename&&!setXmlFromFile(&xml,filename,&sb))//buf,MAX_XML_CONF) )
|
||||
if ( filename&&!setXmlFromFile(&xml,filename,&sb)){//buf,MAX_XML_CONF))
|
||||
log("parms: error setting from file %s: %s",filename,
|
||||
mstrerror(g_errno));
|
||||
return false;
|
||||
}
|
||||
|
||||
// . all the collectionRecs have the same default file in
|
||||
// the workingDir/collections/default.conf
|
||||
@ -3499,7 +3507,7 @@ bool Parms::setFromFile ( void *THIS ,
|
||||
if ( m->m_type == TYPE_CONSTANT ) continue;
|
||||
// these are special commands really
|
||||
if ( m->m_type == TYPE_BOOL2 ) continue;
|
||||
//if ( strcmp ( m->m_xml , "users" ) == 0 )
|
||||
//if ( strcmp ( m->m_xml , "forceDeleteUrls" ) == 0 )
|
||||
// log("got it");
|
||||
// we did not get one from first xml file yet
|
||||
bool first = true;
|
||||
@ -12985,11 +12993,15 @@ void Parms::init ( ) {
|
||||
"expressions. "
|
||||
"Use the <i>&&</i> operator to string multiple expressions "
|
||||
"together in the same expression text box. "
|
||||
"A <i>spider priority</i> of "
|
||||
"If you check the <i>delete</i> checkbox then urls matching "
|
||||
"that row will be deleted if already indexed, otherwise, "
|
||||
"they just won't be indexed."
|
||||
//"A <i>spider priority</i> of "
|
||||
//"<i>FILTERED</i> or <i>BANNED</i> "
|
||||
"<i>DELETE</i> "
|
||||
"will cause the URL to not be spidered, or if it has already "
|
||||
"been indexed, it will be deleted when it is respidered."
|
||||
// "<i>DELETE</i> "
|
||||
// "will cause the URL to not be spidered, "
|
||||
// "or if it has already "
|
||||
// "been indexed, it will be deleted when it is respidered."
|
||||
"<br><br>";
|
||||
|
||||
/*
|
||||
@ -13159,6 +13171,19 @@ void Parms::init ( ) {
|
||||
m++;
|
||||
*/
|
||||
|
||||
m->m_title = "delete";
|
||||
m->m_cgi = "fdu";
|
||||
m->m_xml = "forceDeleteUrls";
|
||||
m->m_max = MAX_FILTERS;
|
||||
m->m_off = (char *)cr.m_forceDelete - x;
|
||||
m->m_type = TYPE_CHECKBOX;
|
||||
m->m_def = "0";
|
||||
m->m_page = PAGE_FILTERS;
|
||||
m->m_rowid = 1;
|
||||
m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m++;
|
||||
|
||||
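The forceDeleteUrls parm above is a per-row checkbox (one slot per url filter, up to MAX_FILTERS), and the page description earlier says a checked row deletes matching urls that are already indexed and simply refuses to index the rest. A hedged sketch of consulting such a per-row flag; the MAX_FILTERS value and the helper are assumptions, not the real spider logic.

#include <cstdio>

#define MAX_FILTERS 96   // assumed cap for the sketch

// per-row "delete" checkboxes, like cr.m_forceDelete in the parm block above
static char s_forceDelete [ MAX_FILTERS ] = { 0 , 1 , 0 };

// decide what to do with a url that matched url-filter row "row":
// delete it if already indexed, otherwise just refuse to index it
static const char *actionForMatch ( int row , bool alreadyIndexed ) {
	if ( ! s_forceDelete[row] ) return "index normally";
	return alreadyIndexed ? "delete from index" : "do not index";
}

int main ( ) {
	printf ( "%s\n" , actionForMatch ( 1 , true  ) );
	printf ( "%s\n" , actionForMatch ( 1 , false ) );
	printf ( "%s\n" , actionForMatch ( 0 , true  ) );
	return 0;
}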
m->m_title = "spider priority";
|
||||
m->m_cgi = "fsp";
|
||||
m->m_xml = "filterPriority";
|
||||
@ -17754,7 +17779,8 @@ void Parms::init ( ) {
|
||||
// and we add gbdocspidertime and gbdocindextime terms so you
|
||||
// can use those to sort regular docs and not have spider reply
|
||||
// status docs in the serps.
|
||||
m->m_def = "0";
|
||||
// back on 4/21/2015 seems pretty stable.
|
||||
m->m_def = "1";
|
||||
m->m_page = PAGE_SPIDER;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m->m_flags = PF_CLONE;
|
||||
@ -22006,6 +22032,41 @@ bool Parms::updateParm ( char *rec , WaitEntry *we ) {
|
||||
cr->m_localCrawlInfo.m_lastSpiderAttempt = 0;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// if user changed the crawl/process max then reset here so
|
||||
// spiders will resume
|
||||
//
|
||||
if ( base == cr &&
|
||||
dst == (char *)&cr->m_maxToCrawl &&
|
||||
cr->m_spiderStatus == SP_MAXTOCRAWL ) {
|
||||
// reset this for rebuilding of active spider collections
|
||||
// so this collection can be in the linked list again
|
||||
cr->m_spiderStatus = SP_INPROGRESS;
|
||||
// rebuild list of active spider collections then
|
||||
g_spiderLoop.m_activeListValid = false;
|
||||
}
|
||||
|
||||
if ( base == cr &&
|
||||
dst == (char *)&cr->m_maxToProcess &&
|
||||
cr->m_spiderStatus == SP_MAXTOPROCESS ) {
|
||||
// reset this for rebuilding of active spider collections
|
||||
// so this collection can be in the linked list again
|
||||
cr->m_spiderStatus = SP_INPROGRESS;
|
||||
// rebuild list of active spider collections then
|
||||
g_spiderLoop.m_activeListValid = false;
|
||||
}
|
||||
|
||||
if ( base == cr &&
|
||||
dst == (char *)&cr->m_maxCrawlRounds &&
|
||||
cr->m_spiderStatus == SP_MAXROUNDS ) {
|
||||
// reset this for rebuilding of active spider collections
|
||||
// so this collection can be in the linked list again
|
||||
cr->m_spiderStatus = SP_INPROGRESS;
|
||||
// rebuild list of active spider collections then
|
||||
g_spiderLoop.m_activeListValid = false;
|
||||
}
|
||||
|
||||
//
|
||||
// END HACK
|
||||
//
|
||||
@ -22287,11 +22348,18 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
|
||||
|
||||
|
||||
"<tr class=poo><td>isrss | !isrss</td>"
|
||||
"<td>Matches if document is an rss feed. "
|
||||
"When harvesting outlinks we <i>guess</i> if they "
|
||||
"are an rss feed by seeing if their file extension "
|
||||
"is xml, rss or rdf. Or if they are in an "
|
||||
"alternative link tag.</td></tr>"
|
||||
"<td>Matches if document is an RSS feed. Will "
|
||||
"only match this rule if the document has been "
|
||||
"successfully spidered before, because it requires "
|
||||
"downloading the document content to see if it "
|
||||
"truly is an RSS feed.."
|
||||
"</td></tr>"
|
||||
|
||||
"<tr class=poo><td>isrssext | !isrssext</td>"
|
||||
"<td>Matches if url ends in .xml .rss or .atom. "
|
||||
"TODO: Or if the link was in an "
|
||||
"alternative link tag."
|
||||
"</td></tr>"
|
||||
|
||||
//"<tr class=poo><td>!isrss</td>"
|
||||
//"<td>Matches if document is NOT an rss feed."
|
||||
@ -22452,6 +22520,13 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
|
||||
"then this will be matched."
|
||||
"</td></tr>"
|
||||
|
||||
"<tr class=poo><td>isparentsitemap | "
|
||||
"!isparentsitemap</td>"
|
||||
"<td>"
|
||||
"If a parent of the URL was a sitemap.xml page "
|
||||
"then this will be matched."
|
||||
"</td></tr>"
|
||||
|
||||
/*
|
||||
"<tr class=poo><td>parentisnew | !parentisnew</td>"
|
||||
"<td>"
|
||||
@ -22518,6 +22593,20 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
|
||||
"Can use <, >, <=, >=, ==, != comparison operators. "
|
||||
"</td></tr>"
|
||||
|
||||
|
||||
"<tr class=poo><td>numinlinks>20</td>"
|
||||
"<td>"
|
||||
"How many inlinks does the URL itself have? "
|
||||
"We only count one link per unique C-Class IP "
|
||||
"address "
|
||||
"so that a webmaster who owns an entire C-Class "
|
||||
"of IP addresses will only have her inlinks counted "
|
||||
"once."
|
||||
"Can use <, >, <=, >=, ==, != comparison operators. "
|
||||
"This is useful for spidering popular URLs quickly."
|
||||
"</td></tr>"
|
||||
|
||||
|
||||
"<tr class=poo><td>httpstatus==404</td>"
|
||||
"<td>"
|
||||
"For matching the URL based on the http status "
|
||||
@ -22649,6 +22738,14 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
|
||||
"<i>foo.somesite.com</i> would NOT match."
|
||||
"</td></tr>"
|
||||
|
||||
|
||||
"<tr class=poo><td>isroot | !isroot</td>"
|
||||
"<td>Matches if the URL is a root URL. Like if "
|
||||
"its path is just '/'. Example: http://www.abc.com "
|
||||
"is a root ur but http://www.abc.com/foo is not. "
|
||||
"</td></tr>"
|
||||
|
||||
|
||||
"<tr class=poo><td>isonsamedomain | !isonsamedomain</td>"
|
||||
"<td>"
|
||||
"This is true if the url is from the same "
|
||||
|
8
Parms.h
@ -29,10 +29,10 @@ void handleRequest3f ( UdpSlot *slot , int32_t niceness ) ;
|
||||
|
||||
// special priorities for the priority drop down
|
||||
// in the url filters table
|
||||
enum {
|
||||
SPIDER_PRIORITY_FILTERED = -3 ,
|
||||
SPIDER_PRIORITY_BANNED = -2 ,
|
||||
SPIDER_PRIORITY_UNDEFINED = -1 };
|
||||
//enum {
|
||||
// SPIDER_PRIORITY_FILTERED = -3 ,
|
||||
// SPIDER_PRIORITY_BANNED = -2 ,
|
||||
// SPIDER_PRIORITY_UNDEFINED = -1 };
|
||||
|
||||
enum {
|
||||
OBJ_CONF = 1 ,
|
||||
PingServer.cpp
@ -28,6 +28,7 @@ int32_t klogctl( int, char *,int ) { return 0; }
|
||||
|
||||
// from main.cpp. when keepalive script restarts us this is true
|
||||
extern bool g_recoveryMode;
|
||||
extern int32_t g_recoveryLevel;
|
||||
|
||||
// a global class extern'd in .h file
|
||||
PingServer g_pingServer;
|
||||
@ -281,6 +282,9 @@ void PingServer::sendPingsToAll ( ) {
|
||||
|
||||
// };
|
||||
|
||||
// from Loop.cpp
|
||||
extern float g_cpuUsage;
|
||||
|
||||
// ping host #i
|
||||
void PingServer::pingHost ( Host *h , uint32_t ip , uint16_t port ) {
|
||||
// don't ping on interface machines
|
||||
@ -491,6 +495,10 @@ void PingServer::pingHost ( Host *h , uint32_t ip , uint16_t port ) {
|
||||
flags |= PFLAG_MERGEMODE0OR6;
|
||||
if ( ! isClockInSync() ) flags |= PFLAG_OUTOFSYNC;
|
||||
|
||||
uint8_t rv8 = (uint8_t)g_recoveryLevel;
|
||||
if ( g_recoveryLevel > 255 ) rv8 = 255;
|
||||
pi->m_recoveryLevel = rv8;
|
||||
|
||||
//*(int32_t *)p = flags; p += 4; // 4 bytes
|
||||
pi->m_flags = flags;
|
||||
|
||||
@ -504,10 +512,13 @@ void PingServer::pingHost ( Host *h , uint32_t ip , uint16_t port ) {
|
||||
|
||||
pi->m_localHostTimeMS = gettimeofdayInMillisecondsLocal();
|
||||
|
||||
pi->m_udpSlotsInUse = g_udpServer.getNumUsedSlots();
|
||||
pi->m_udpSlotsInUseIncoming = g_udpServer.getNumUsedSlotsIncoming();
|
||||
|
||||
pi->m_tcpSocketsInUse = g_httpServer.m_tcp.m_numUsed;
|
||||
|
||||
// from Loop.cpp
|
||||
pi->m_cpuUsage = g_cpuUsage;
|
||||
|
||||
// store hd temps
|
||||
// gbmemcpy ( p , me->m_hdtemps , 4 * 2 );
|
||||
// p += 4 * 2;
|
||||
|
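pingHost above now snapshots both the total and the incoming-only UDP slot counts, plus TCP sockets in use and CPU load, into the PingInfo it broadcasts. A sketch of filling such a snapshot from stubbed gauges; MiniPingInfo and the gauge functions are stand-ins, not the real g_udpServer / g_httpServer API.

#include <cstdio>
#include <cstdint>

// reduced stand-in for the PingInfo fields touched in the hunk above
struct MiniPingInfo {
	int32_t m_udpSlotsInUse;
	int32_t m_udpSlotsInUseIncoming;
	int32_t m_tcpSocketsInUse;
	float   m_cpuUsage;
};

// stubbed gauges standing in for the server globals
static int32_t usedSlots ( )         { return 37; }
static int32_t usedSlotsIncoming ( ) { return 12; }
static int32_t tcpUsed ( )           { return  5; }
static float   cpuUsage ( )          { return 48.5f; }

int main ( ) {
	MiniPingInfo pi;
	pi.m_udpSlotsInUse         = usedSlots();
	pi.m_udpSlotsInUseIncoming = usedSlotsIncoming();
	pi.m_tcpSocketsInUse       = tcpUsed();
	pi.m_cpuUsage              = cpuUsage();
	printf ( "udp=%d incoming=%d tcp=%d cpu=%.1f%%\n" ,
	         (int)pi.m_udpSlotsInUse , (int)pi.m_udpSlotsInUseIncoming ,
	         (int)pi.m_tcpSocketsInUse , pi.m_cpuUsage );
	return 0;
}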
61
Posdb.cpp
@ -686,8 +686,10 @@ PosdbTable::~PosdbTable() {
|
||||
}
|
||||
|
||||
void PosdbTable::reset() {
|
||||
// we can't reset this because we don't recall allocTopTree()
|
||||
// again when computing search results in docid ranges.
|
||||
//m_hasFacetTerm = false;
|
||||
// has init() been called?
|
||||
m_hasFacetTerm = false;
|
||||
m_initialized = false;
|
||||
m_estimatedTotalHits = -1;
|
||||
m_errno = 0;
|
||||
@ -4365,6 +4367,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
qti->m_qtermNum = i;
|
||||
// and vice versa
|
||||
qt->m_queryTermInfoNum = nrg;
|
||||
// now we count the total # of docs that have a facet
|
||||
// for doing tf/idf type things
|
||||
//qti->m_numDocsThatHaveFacet = 0;
|
||||
// this is not good enough, we need to count
|
||||
// non-whitespace punct as 2 units not 1 unit
|
||||
// otherwise qdist gets thrown off and our phrasing fails.
|
||||
@ -4960,10 +4965,36 @@ inline bool isInRange2 ( char *recPtr , char *subListEnd, QueryTerm *qt ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// for a facet
|
||||
int64_t PosdbTable::countUniqueDocids( QueryTermInfo *qti ) {
|
||||
// get that sublist. facets should only have one sublist since
|
||||
// they have no synonyms.
|
||||
char *start = qti->m_subLists[0]->getList();
|
||||
register char *recPtr = start;
|
||||
register char *subListEnd = qti->m_subLists[0]->getListEnd();
|
||||
int64_t count = 0;
|
||||
loop:
|
||||
if ( recPtr >= subListEnd ) {
|
||||
if ( m_debug )
|
||||
log(LOG_DEBUG,"posdb: term list size of %"
|
||||
INT32" has %"INT64" unique docids"
|
||||
, (int32_t)(subListEnd-start),count);
|
||||
return count;
|
||||
}
|
||||
// skip that docid record in our termlist. it MUST have been
|
||||
// 12 bytes, a docid heading record.
|
||||
recPtr += 12;
|
||||
count++;
|
||||
// skip any following keys that are 6 bytes, that means they
|
||||
// share the same docid
|
||||
for ( ; recPtr < subListEnd && ((*recPtr)&0x04); recPtr += 6 );
|
||||
goto loop;
|
||||
}
|
||||
|
||||
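countUniqueDocids above leans on the posdb list layout: every docid opens with a 12-byte key, and any following 6-byte keys (their first byte has the 0x04 bit set) belong to the same docid. A toy walk over a fabricated list that applies the same counting rule; the key bytes are dummies, only the sizes and the 0x04 bit matter here.

#include <cstdio>

int main ( ) {
	// fake termlist: 12-byte head key, two 6-byte follow-on keys (0x04 set),
	// then another 12-byte head key => 2 unique docids
	unsigned char list[] = {
		0x02,0,0,0,0,0,0,0,0,0,0,0,        // docid #1 head (12 bytes)
		0x06,0,0,0,0,0,                    // same docid     (6 bytes, 0x04 set)
		0x06,0,0,0,0,0,                    // same docid
		0x02,0,0,0,0,0,0,0,0,0,0,0 };      // docid #2 head
	unsigned char *p   = list;
	unsigned char *end = list + sizeof(list);
	long count = 0;
	while ( p < end ) {
		p += 12;                           // the docid-heading record is 12 bytes
		count++;
		// skip 6-byte keys that share the same docid
		while ( p < end && ( *p & 0x04 ) ) p += 6;
	}
	printf ( "%ld unique docids\n" , count );  // prints 2
	return 0;
}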
// . add a QueryTermInfo for a term (synonym lists,etc) to the docid vote buf
|
||||
// "m_docIdVoteBuf"
|
||||
// . this is how we intersect all the docids to end up with the winners
|
||||
void PosdbTable::addDocIdVotes ( QueryTermInfo *qti , int32_t listGroupNum ) {
|
||||
void PosdbTable::addDocIdVotes ( QueryTermInfo *qti , int32_t listGroupNum) {
|
||||
|
||||
// sanity check, we store this in a single byte below for voting
|
||||
if ( listGroupNum >= 256 ) { char *xx=NULL;*xx=0; }
|
||||
@ -5006,7 +5037,7 @@ void PosdbTable::addDocIdVotes ( QueryTermInfo *qti , int32_t listGroupNum ) {
|
||||
// the docid vote buf. that is, if the query is "jump car" we
|
||||
// just add all the docids for "jump" and then intersect with the
|
||||
// docids for "car".
|
||||
for ( int32_t i = 0 ; i < qti->m_numSubLists && listGroupNum > 0 ; i++ ) {
|
||||
for ( int32_t i = 0 ; i < qti->m_numSubLists && listGroupNum > 0; i++){
|
||||
// get that sublist
|
||||
recPtr = qti->m_subLists[i]->getList();
|
||||
subListEnd = qti->m_subLists[i]->getListEnd();
|
||||
@ -5049,6 +5080,7 @@ void PosdbTable::addDocIdVotes ( QueryTermInfo *qti , int32_t listGroupNum ) {
|
||||
dp[5] = listGroupNum;
|
||||
// skip it
|
||||
dp += 6;
|
||||
|
||||
// advance recPtr now
|
||||
break;
|
||||
}
|
||||
@ -5121,7 +5153,7 @@ void PosdbTable::addDocIdVotes ( QueryTermInfo *qti , int32_t listGroupNum ) {
|
||||
for ( int32_t i = 0 ; i < qti->m_numSubLists ; i++ ) {
|
||||
// skip if exhausted
|
||||
if ( ! cursor[i] ) continue;
|
||||
// int16_tcut
|
||||
// shortcut
|
||||
recPtr = cursor[i];
|
||||
// get the min docid
|
||||
if ( ! minRecPtr ) {
|
||||
@ -5628,6 +5660,23 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
//if ( s_special == 2836 )
|
||||
// log("hey");
|
||||
|
||||
// point to our array of query term infos set in setQueryTermInfos()
|
||||
QueryTermInfo *qip = (QueryTermInfo *)m_qiBuf.getBufStart();
|
||||
|
||||
// if a query term is for a facet (ie gbfacetstr:gbtagsite)
|
||||
// then count how many unique docids are in it. we were trying to
|
||||
// do this in addDocIdVotes() but it wasn't in the right place i guess.
|
||||
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
|
||||
QueryTermInfo *qti = &qip[i];
|
||||
QueryTerm *qt = qti->m_qt;
|
||||
bool isFacetTerm = false;
|
||||
if ( qt->m_fieldCode == FIELD_GBFACETSTR ) isFacetTerm = true;
|
||||
if ( qt->m_fieldCode == FIELD_GBFACETINT ) isFacetTerm = true;
|
||||
if ( qt->m_fieldCode == FIELD_GBFACETFLOAT ) isFacetTerm =true;
|
||||
if ( ! isFacetTerm ) continue;
|
||||
qt->m_numDocsThatHaveFacet += countUniqueDocids ( qti );
|
||||
}
|
||||
|
||||
|
||||
// setQueryTermInfos() should have set how many we have
|
||||
if ( m_numQueryTermInfos == 0 ) {
|
||||
@ -5662,8 +5711,6 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
|
||||
int32_t listGroupNum = 0;
|
||||
|
||||
// point to our array of query term infos set in setQueryTermInfos()
|
||||
QueryTermInfo *qip = (QueryTermInfo *)m_qiBuf.getBufStart();
|
||||
|
||||
// if all non-negative query terms are in the same wikiphrase then
|
||||
// we can apply the WIKI_WEIGHT in getMaxPossibleScore() which
|
||||
@ -5705,8 +5752,6 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
goto skip3;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// . create "m_docIdVoteBuf" filled with just the docids from the
|
||||
// smallest group of sublists
|
||||
// . m_minListi is the queryterminfo that had the smallest total
|
||||
|
2
Posdb.h
@ -711,6 +711,8 @@ class PosdbTable {
|
||||
|
||||
void shrinkSubLists ( class QueryTermInfo *qti );
|
||||
|
||||
int64_t countUniqueDocids( QueryTermInfo *qti ) ;
|
||||
|
||||
// for intersecting docids
|
||||
void addDocIdVotes ( class QueryTermInfo *qti , int32_t listGroupNum );
|
||||
|
||||
Process.cpp
@ -1471,6 +1471,11 @@ bool Process::shutdown2 ( ) {
|
||||
else
|
||||
log(LOG_INFO,"gb: Shutting down. Try #%"INT32".",m_try++);
|
||||
|
||||
|
||||
// switch to urgent if having problems
|
||||
if ( m_try >= 10 )
|
||||
m_urgent = true;
|
||||
|
||||
// turn off statsdb so it does not try to add records for these writes
|
||||
g_statsdb.m_disabled = true;
|
||||
|
||||
@ -1861,7 +1866,7 @@ bool Process::saveBlockingFiles1 ( ) {
|
||||
if ( g_conf.m_readOnlyMode ) return true;
|
||||
|
||||
// save user accounting files. 3 of them.
|
||||
if ( g_hostdb.m_myHost->m_isProxy )
|
||||
if ( g_hostdb.m_myHost && g_hostdb.m_myHost->m_isProxy )
|
||||
g_proxy.saveUserBufs();
|
||||
|
||||
// save the Conf file now
|
||||
|
421
Query.cpp
@ -32,6 +32,11 @@ void Query::constructor ( ) {
|
||||
m_qwords = NULL;
|
||||
m_numTerms = 0;
|
||||
m_containingParent = NULL;
|
||||
m_st0Ptr = NULL;
|
||||
// we have to manually call this because Query::constructor()
|
||||
// might have been called explicitly
|
||||
for ( int32_t i = 0 ; i < MAX_QUERY_TERMS ; i++ )
|
||||
m_qterms[i].constructor();
|
||||
//m_expressions = NULL;
|
||||
reset ( );
|
||||
}
|
||||
@ -48,10 +53,15 @@ void Query::reset ( ) {
|
||||
|
||||
// if Query::constructor() was called explicitly then we have to
|
||||
// call destructors explicitly as well...
|
||||
// essentially call QueryTerm::reset() on each query term
|
||||
for ( long i = 0 ; i < m_numTerms ; i++ ) {
|
||||
// get it
|
||||
QueryTerm *qt = &m_qterms[i];
|
||||
HashTableX *ht = &qt->m_facetHashTable;
|
||||
// debug note
|
||||
// log("results: free fhtqt of %"PTRFMT" for q=%"PTRFMT
|
||||
// " st0=%"PTRFMT,
|
||||
// (PTRTYPE)ht->m_buf,(PTRTYPE)this,(PTRTYPE)m_st0Ptr);
|
||||
ht->reset();
|
||||
qt->m_facetIndexBuf.purge();
|
||||
}
|
||||
@ -1285,6 +1295,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
qt->m_isPhrase = false ;
|
||||
qt->m_isUORed = false;
|
||||
qt->m_UORedTerm = NULL;
|
||||
qt->m_langIdBits = 0;
|
||||
// synonym of this term...
|
||||
qt->m_synonymOf = origTerm;
|
||||
// nuke this crap since it was done above and we
|
||||
@ -2570,6 +2581,11 @@ bool Query::setQWords ( char boolFlag ,
|
||||
qw->m_ignoreWordInBoolQuery = true;
|
||||
}
|
||||
|
||||
// this seems case sensitive now, gbfacetstr:humanLang
|
||||
if ( fieldCode == FIELD_GBFACETSTR ) {
|
||||
wid = hash64 ( w , wlen , 0LL );
|
||||
}
|
||||
|
||||
if ( fieldCode == FIELD_GBFIELDMATCH ) {
|
||||
// hash the json field name. (i.e. tag.uri)
|
||||
// make it case sensitive as
|
||||
@ -4105,8 +4121,10 @@ struct QueryField g_fields[] = {
|
||||
false,
|
||||
"gbdocspiderdate:1400081479",
|
||||
"Matches documents that have "
|
||||
"that spider date timestamp (UTC). Does not include the "
|
||||
"special spider status documents. This is the time the document "
|
||||
"that spider date timestamp (UTC). "
|
||||
//"Does not include the "
|
||||
//"special spider status documents. "
|
||||
"This is the time the document "
|
||||
"completed downloading.",
|
||||
"Date Related Query Operators",
|
||||
QTF_BEGINNEWTABLE},
|
||||
@ -4116,7 +4134,8 @@ struct QueryField g_fields[] = {
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbspiderdate:1400081479",
|
||||
"Like above, but DOES include the special spider status documents.",
|
||||
"Like above.",
|
||||
//, but DOES include the special spider status documents.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
@ -4126,8 +4145,8 @@ struct QueryField g_fields[] = {
|
||||
"gbdocindexdate:1400081479",
|
||||
"Like above, but is the time the document was last indexed. "
|
||||
"This time is "
|
||||
"slightly greater than or equal to the spider date. Does not "
|
||||
"include the special spider status documents.",
|
||||
"slightly greater than or equal to the spider date.",//Does not "
|
||||
//"include the special spider status documents.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
@ -4136,8 +4155,8 @@ struct QueryField g_fields[] = {
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbindexdate:1400081479",
|
||||
"Like above, but it does include the special spider status "
|
||||
"documents.",
|
||||
"Like above.",//, but it does include the special spider status "
|
||||
//"documents.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
@ -4251,6 +4270,384 @@ struct QueryField g_fields[] = {
|
||||
//
|
||||
// spider status docs queries
|
||||
//
|
||||
|
||||
{"gbssUrl",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssUrl:com",
|
||||
"Query the url of a spider status document.",
|
||||
"Spider Status Documents", // title
|
||||
QTF_BEGINNEWTABLE},
|
||||
|
||||
|
||||
{"gbssFinalRedirectUrl",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssFinalRedirectUrl:abc.com/page2.html",
|
||||
"Query on the last url redirect to, if any.",
|
||||
NULL, // title
|
||||
0},
|
||||
|
||||
{"gbssStatusCode",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssStatusCode:0",
|
||||
"Query on the status code of the index attempt. 0 means no error.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssStatusMsg",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssStatusMsg:\"Tcp timed\"",
|
||||
"Like gbssStatusCode but a textual representation.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssHttpStatus",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssHttpStatus:200",
|
||||
"Query on the HTTP status returned from the web server.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssWasIndexed",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssWasIndexed:0",
|
||||
"Was the document in the index before attempting to index? Use 0 "
|
||||
" or 1 to find all documents that were not or were, respectively.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssIsDiffbotObject",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssIsDiffbotObject:1",
|
||||
"This field is only present if the document was an object from "
|
||||
"a diffbot reply. Use gbssIsDiffbotObject:0 to find the non-diffbot "
|
||||
"objects.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssAgeInIndex",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbsortby:gbssAgeInIndex",
|
||||
"If the document was in the index at the time we attempted to "
|
||||
"reindex it, how long has it been since it was last indexed?",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDomain",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssDomain:yahoo.com",
|
||||
"Query on the domain of the url.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssSubdomain",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssSubdomain:www.yahoo.com",
|
||||
"Query on the subdomain of the url.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssNumRedirects",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssNumRedirects",
|
||||
"Query on the number of times the url redirect when attempting to "
|
||||
"index it.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDocId",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssDocId:1234567",
|
||||
"Show all the spider status docs for the document with this docId.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssHopCount",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssHopCount",
|
||||
"Query on the hop count of the document.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssCrawlRound",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssCrawlRound",
|
||||
"Query on the crawl round number.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDupOfDocId",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssDupOfDocId:123456",
|
||||
"Show all the documents that were considered dups of this docId.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssPrevTotalNumIndexAttempts",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssPrevTotalNumIndexAttempts:1",
|
||||
"Before this index attempt, how many attempts were there?",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssPrevTotalNumIndexSuccesses",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssPrevTotalNumIndexSuccesses:1",
|
||||
"Before this index attempt, how many successful attempts were there?",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssPrevTotalNumIndexFailures",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssPrevTotalNumIndexFailures:1",
|
||||
"Before this index attempt, how many failed attempts were there?",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssFirstIndexed",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbrevsortbyint:gbssFirsIndexed",
|
||||
"The date in utc that the document was first indexed.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssContentHash32",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssContentHash32",
|
||||
"The hash of the document content, excluding dates and times. Used "
|
||||
"internally for deduping.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDownloadDurationMS",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbsortbyint:gbssDownloadDurationMS",
|
||||
"How long it took in millisecons to download the document.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDownloadStartTime",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbsortbyint:gbssDownloadStartTime",
|
||||
"When the download started, in seconds since the epoch, UTC.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDownloadEndTime",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbsortbyint:gbssDownloadEndTime",
|
||||
"When the download ended, in seconds since the epoch, UTC.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssUsedRobotsTxt",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssUsedRobotsTxt",
|
||||
"This is 0 or 1 depending on if robots.txt was not obeyed or obeyed, "
|
||||
"respectively.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssConsecutiveErrors",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssConsecutiveErrors",
|
||||
"For the last set of indexing attempts how many were errors?",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssIp",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssIp:1.2.3.4",
|
||||
"The IP address of the document being indexed. Is 0.0.0.0 "
|
||||
"if unknown.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssIpLookupTimeMS",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbsortby:gbssIpLookupTimeMS",
|
||||
"How long it took to lookup the IP of the document. Might have been "
|
||||
"in the cache.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssSiteNumInlinks",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbsortby:gbssSiteNumInlinks",
|
||||
"How many good inlinks the document's site had.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssSiteRank",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbsortby:gbssSiteRank",
|
||||
"The site rank of the document. Based directly "
|
||||
"on the number of inlinks the site had.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssContentInjected",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssContentInjected",
|
||||
"This is 0 or 1 if the content was not injected or injected, "
|
||||
"respectively.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssPercentContentChanged",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetfloat:gbssPercentContentChanged",
|
||||
"A float between 0 and 100, inclusive. Represents how much "
|
||||
"the document has changed since the last time we indexed it. This is "
|
||||
"only valid if the document was successfully indexed this time."
|
||||
"respectively.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssSpiderPriority",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssSpiderPriority",
|
||||
"The spider priority, from 0 to 127, inclusive, of the document "
|
||||
"according to the url filters table.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssMatchingUrlFilter",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetstr:gbssMatchingUrlFilter",
|
||||
"The url filter expression the document matched.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssLanguage",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetstr:gbssLanguage",
|
||||
"The language of the document. If document was empty or not "
|
||||
"downloaded then this will not be present. Uses xx to mean "
|
||||
"unknown language. Uses the language abbreviations found at the "
|
||||
"bottom of the url filters page.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssContentType",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetstr:gbssContentType",
|
||||
"The content type of the document. Like html, xml, json, pdf, etc. "
|
||||
"This field is not present if unknown.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssContentLen",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbsortbyint:gbssContentLen",
|
||||
"The content length of the document. 0 if empty or not downloaded.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssCrawlDelayMS",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssCrawlDelay",
|
||||
"The crawl delay according to the robots.txt of the document. "
|
||||
"This is -1 if not specified in the robots.txt or not found.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssSentToDiffbot",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssSentToDiffbot:1",
|
||||
"Was the document's url sent to diffbot for processing?",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDiffbotReplyCode",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssDiffbotReplyCode:0",
|
||||
"The reply received from diffbot. 0 means success, otherwise, it "
|
||||
"indicates an error code.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDiffbotReplyMsg",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetstr:gbssDiffbotReplyMsg:0",
|
||||
"The reply received from diffbot represented in text.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDiffbotReplyLen",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbsortbyint:gbssDiffbotReplyLen",
|
||||
"The length of the reply received from diffbot.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDiffbotReplyResponseTimeMS",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbsortbyint:gbssDiffbotReplyResponseTimeMS",
|
||||
"The time in milliseconds it took to get a reply from diffbot.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDiffbotReplyRetries",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssDiffbotReplyRetries",
|
||||
"The number of times we had to resend the request to diffbot "
|
||||
"because diffbot returned a 504 gateway timed out error.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDiffbotReplyNumObjects",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssDiffbotReplyNumObjects",
|
||||
"The number of JSON objects diffbot excavated from the provided url.",
|
||||
NULL,
|
||||
0},
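Each entry above follows the positional QueryField layout used throughout g_fields[]. A minimal sketch of that layout for orientation only (the member names and the meaning of the third, sixth and seventh values are assumptions; the real definition lives in Query.h):

// sketch only -- member names are assumed, not the actual Query.h struct
struct QueryFieldSketch {
        const char *m_text;     // indexed field name, e.g. "gbssIp"
        char        m_type;     // field code, e.g. FIELD_GENERIC
        bool        m_flag;     // third positional value (purpose assumed)
        const char *m_example;  // sample query, e.g. "gbssIp:1.2.3.4"
        const char *m_desc;     // help text shown on the query-syntax page
        const char *m_aliases;  // NULL in the entries above (meaning assumed)
        char        m_flags;    // 0 in the entries above (meaning assumed)
};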
|
||||
|
||||
|
||||
/*
|
||||
{"gbstatus",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
@ -4362,7 +4759,7 @@ struct QueryField g_fields[] = {
|
||||
"spider status documents.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
*/
|
||||
|
||||
|
||||
// they don't need to know about this
|
||||
@ -5038,6 +5435,14 @@ bool Query::isSplit() {
|
||||
return false;
|
||||
}
|
||||
|
||||
void QueryTerm::constructor ( ) {
|
||||
m_facetHashTable.constructor(); // hashtablex
|
||||
m_facetIndexBuf.constructor(); // safebuf
|
||||
m_langIdBits = 0;
|
||||
m_langIdBitsValid = false;
|
||||
m_numDocsThatHaveFacet = 0;
|
||||
}
|
||||
|
||||
bool QueryTerm::isSplit() {
|
||||
if(!m_fieldCode) return true;
|
||||
if(m_fieldCode == FIELD_QUOTA) return false;
|
||||
|
9
Query.h
@ -397,6 +397,11 @@ class QueryWord {
|
||||
class QueryTerm {
|
||||
|
||||
public:
|
||||
|
||||
//QueryTerm ( ) { constructor(); };
|
||||
|
||||
void constructor ( ) ;
|
||||
|
||||
// the query word we were derived from
|
||||
QueryWord *m_qword;
|
||||
// . are we a phrase termid or single word termid from that QueryWord?
|
||||
@ -557,6 +562,7 @@ class QueryTerm {
|
||||
int64_t m_hash64d;
|
||||
int32_t m_popWeight;
|
||||
|
||||
uint64_t m_numDocsThatHaveFacet;
|
||||
};
|
||||
|
||||
//#define MAX_OPSLOTS 256
|
||||
@ -871,6 +877,9 @@ class Query {
|
||||
return NULL;
|
||||
};
|
||||
|
||||
// for debugging fhtqt mem leak
|
||||
char *m_st0Ptr;
|
||||
|
||||
// silly little functions that support the BIG HACK
|
||||
//int32_t getNumNonFieldedSingletonTerms() { return m_numTermsSpecial; };
|
||||
//int32_t getTermsFound ( Query *q , char *foundTermVector ) ;
|
||||
|
@ -6,7 +6,8 @@ An open source web and enterprise search engine and spider/crawler. As can be se
|
||||
RUNNING GIGABLAST
|
||||
-----------------
|
||||
|
||||
See html/faq.html for all administrative documentation including
|
||||
See <a href=html/faq.html>html/faq.html</a>
|
||||
for all administrative documentation including
|
||||
the quick start instructions.
|
||||
|
||||
Alternatively, visit http://www.gigablast.com/faq.html
|
||||
@ -16,7 +17,8 @@ Alternatively, visit http://www.gigablast.com/faq.html
|
||||
CODE ARCHITECTURE
|
||||
-----------------
|
||||
|
||||
See html/developer.html for all code documentation.
|
||||
See <a href=html/developer.html>html/developer.html</a>
|
||||
for all code documentation.
|
||||
|
||||
Alternatively, visit http://www.gigablast.com/developer.html
|
||||
|
||||
|
11
Rdb.cpp
@ -2324,11 +2324,13 @@ bool Rdb::addRecord ( collnum_t collnum,
|
||||
SpiderRequest *sreq = (SpiderRequest *)data;
|
||||
logf(LOG_DEBUG,"spider: added doledb key "
|
||||
"for pri=%"INT32" time=%"UINT32" "
|
||||
"uh48=%"UINT64" docid=%"INT64" u=%s",
|
||||
"uh48=%"UINT64" "
|
||||
//"docid=%"INT64" "
|
||||
"u=%s",
|
||||
(int32_t)g_doledb.getPriority(&doleKey),
|
||||
(uint32_t)g_doledb.getSpiderTime(&doleKey),
|
||||
g_doledb.getUrlHash48(&doleKey),
|
||||
sreq->m_probDocId,
|
||||
//sreq->m_probDocId,
|
||||
sreq->m_url);
|
||||
}
|
||||
}
|
||||
@ -3042,7 +3044,10 @@ char getKeySizeFromRdbId ( uint8_t rdbId ) {
|
||||
}
|
||||
}
|
||||
// sanity check
|
||||
if ( s_table1[rdbId] == 0 ) { char *xx=NULL;*xx=0; }
|
||||
if ( s_table1[rdbId] == 0 ) {
|
||||
log("rdb: bad lookup rdbid of %i",(int)rdbId);
|
||||
char *xx=NULL;*xx=0;
|
||||
}
|
||||
return s_table1[rdbId];
|
||||
}
|
||||
|
||||
|
@ -815,6 +815,7 @@ int32_t RdbBase::addFile ( int32_t id , bool isNew , int32_t mergeNum , int32_t
|
||||
ff->getFilename() ,
|
||||
(int64_t)ff->getFileSize(),
|
||||
(int64_t)MAX_PART_SIZE);
|
||||
exit(0);
|
||||
return -1;
|
||||
}
|
||||
|
||||
@ -2480,6 +2481,14 @@ bool RdbBase::verifyFileSharding ( ) {
|
||||
// not re-verify file sharding! only do at startup
|
||||
if ( g_loop.m_isDoingLoop ) return true;
|
||||
|
||||
// skip for now to speed up startup
|
||||
static int32_t s_count = 0;
|
||||
s_count++;
|
||||
if ( s_count == 50 )
|
||||
log("db: skipping shard verification for remaining files");
|
||||
if ( s_count >= 50 )
|
||||
return true;
|
||||
|
||||
g_threads.disableThreads();
|
||||
|
||||
Msg5 msg5;
|
||||
|
@ -885,6 +885,13 @@ bool RdbBuckets::addBucket (RdbBucket* newBucket, int32_t i) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// void RdbBuckets::deleteBucket ( int32_t i ) {
|
||||
// int32_t moveSize = (m_numBuckets - i)*sizeof(RdbBuckets*);
|
||||
// if(moveSize > 0)
|
||||
// memmove(&m_buckets[i+1], &m_buckets[i], moveSize);
|
||||
// m_numBuckets--;
|
||||
// }
|
||||
|
||||
bool RdbBuckets::getList ( collnum_t collnum ,
|
||||
char *startKey, char *endKey, int32_t minRecSizes ,
|
||||
RdbList *list , int32_t *numPosRecs ,
|
||||
@ -1768,6 +1775,66 @@ bool RdbBucket::deleteList(RdbList *list) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// remove keys from any non-existent collection
|
||||
void RdbBuckets::cleanBuckets ( ) {
|
||||
|
||||
// what buckets have -1 rdbid???
|
||||
if ( m_rdbId < 0 ) return;
|
||||
|
||||
// the liberation count
|
||||
int32_t count = 0;
|
||||
|
||||
/*
|
||||
char buf[50000];
|
||||
RdbList list;
|
||||
list.set ( NULL,
|
||||
0,
|
||||
buf,
|
||||
50000,
|
||||
0, // fixeddatasize
|
||||
false, // own data? should rdblist free it
|
||||
false, // usehalfkeys
|
||||
m_ks);
|
||||
*/
|
||||
|
||||
top:
|
||||
|
||||
for ( int32_t i = 0; i < m_numBuckets; i++ ) {
|
||||
RdbBucket *b = m_buckets[i];
|
||||
collnum_t collnum = b->getCollnum();
|
||||
CollectionRec *cr = g_collectiondb.m_recs[collnum];
|
||||
if ( cr ) continue;
|
||||
// count # deleted
|
||||
count += b->getNumKeys();
|
||||
// delete that coll
|
||||
delColl ( collnum );
|
||||
// restart
|
||||
goto top;
|
||||
/*
|
||||
int32_t nk = b->getNumKeys();
|
||||
for (int32_t j = 0 ; j < nk ; j++ ) {
|
||||
char *kp = b->m_keys + j*m_ks;
|
||||
// add into list. should just be a gbmemcpy()
|
||||
list.addKey ( kp , 0 , NULL );
|
||||
*/
|
||||
//deleteBucket ( i );
|
||||
}
|
||||
|
||||
// print it
|
||||
if ( count == 0 ) return;
|
||||
log(LOG_LOGIC,"db: Removed %"INT32" records from %s buckets "
|
||||
"for invalid collection numbers.",count,m_dbname);
|
||||
//log(LOG_LOGIC,"db: Records not actually removed for safety. Except "
|
||||
// "for those with negative colnums.");
|
||||
// static bool s_print = true;
|
||||
// if ( ! s_print ) return;
|
||||
// s_print = false;
|
||||
// log (LOG_LOGIC,"db: This is bad. Did you remove a collection "
|
||||
// "subdirectory? Don't do that, you should use the \"delete "
|
||||
// "collections\" interface because it also removes records from "
|
||||
// "memory, too.");
|
||||
}
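The scan-delete-restart idiom used by cleanBuckets() above, shown in isolation (the container and predicate are placeholders, not the real RdbBuckets API): deleting a collection can reshuffle the bucket array, so the loop index is thrown away and the scan restarts from zero.

#include <vector>

// sketch: remove entries matching isDead(); restart after each erase
// because erase() shifts the positions the loop index was based on
static void removeDeadEntries ( std::vector<int> &entries ,
                                bool (*isDead)(int) ) {
 top:
        for ( size_t i = 0 ; i < entries.size() ; i++ ) {
                if ( ! isDead ( entries[i] ) ) continue;
                entries.erase ( entries.begin() + i );
                goto top;
        }
}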
|
||||
|
||||
|
||||
bool RdbBuckets::delColl(collnum_t collnum) {
|
||||
|
||||
@ -1783,7 +1850,8 @@ bool RdbBuckets::delColl(collnum_t collnum) {
|
||||
minRecSizes /= 2;
|
||||
continue;
|
||||
} else {
|
||||
log("db: buckets could not delete collection: %s.",
|
||||
log("db: buckets could not delete "
|
||||
"collection: %s.",
|
||||
mstrerror(errno));
|
||||
return false;
|
||||
}
|
||||
@ -1791,6 +1859,8 @@ bool RdbBuckets::delColl(collnum_t collnum) {
|
||||
if(list.isEmpty()) break;
|
||||
deleteList(collnum, &list);
|
||||
}
|
||||
|
||||
log("buckets: deleted all keys for collnum %"INT32,(int32_t)collnum);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -168,6 +168,7 @@ class RdbBuckets {
|
||||
|
||||
int32_t getNumNegativeKeys ( );
|
||||
int32_t getNumPositiveKeys ( );
|
||||
void cleanBuckets ( );
|
||||
bool delColl ( collnum_t collnum );
|
||||
|
||||
//just for this collection
|
||||
|
24
RdbDump.cpp
@ -215,7 +215,7 @@ void RdbDump::doneDumping ( ) {
|
||||
// . map verify
|
||||
// . if continueDumping called us with no collectionrec, it got
|
||||
// deleted so RdbBase::m_map is nuked too i guess
|
||||
if ( saved != ENOCOLLREC )
|
||||
if ( saved != ENOCOLLREC && m_map )
|
||||
log("db: map # pos=%"INT64" neg=%"INT64"",
|
||||
m_map->getNumPositiveRecs(),
|
||||
m_map->getNumNegativeRecs()
|
||||
@ -230,11 +230,11 @@ void RdbDump::doneDumping ( ) {
|
||||
if ( saved == ENOCOLLREC ) return;
|
||||
|
||||
// save the map to disk
|
||||
m_map->writeMap();
|
||||
if ( m_map ) m_map->writeMap();
|
||||
#ifdef GBSANITYCHECK
|
||||
// sanity check
|
||||
log("DOING SANITY CHECK FOR MAP -- REMOVE ME");
|
||||
if ( ! m_map->verifyMap ( m_file ) ) {
|
||||
if ( m_map && ! m_map->verifyMap ( m_file ) ) {
|
||||
char *xx = NULL; *xx = 0; }
|
||||
// now check the whole file for consistency
|
||||
if ( m_ks == 18 ) { // map->m_rdbId == RDB_POSDB ) {
|
||||
@ -495,7 +495,7 @@ bool RdbDump::dumpList ( RdbList *list , int32_t niceness , bool recall ) {
|
||||
|
||||
// . SANITY CHECK
|
||||
// . ensure first key is >= last key added to the map map
|
||||
if ( m_offset > 0 ) {
|
||||
if ( m_offset > 0 && m_map ) {
|
||||
//key_t k = m_list->getCurrentKey();
|
||||
char k[MAX_KEY_BYTES];
|
||||
m_list->getCurrentKey(k);
|
||||
@ -748,6 +748,22 @@ void doneReadingForVerifyWrapper ( void *state ) {
|
||||
}
|
||||
|
||||
bool RdbDump::doneReadingForVerify ( ) {
|
||||
|
||||
// if someone reset/deleted the collection we were dumping...
|
||||
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
|
||||
// . do not do this for statsdb/catdb which always use collnum of 0
|
||||
// . RdbMerge also calls us but gives a NULL m_rdb so we can't
|
||||
// set m_isCollectionless to false
|
||||
if ( ! cr && m_doCollCheck ) {
|
||||
g_errno = ENOCOLLREC;
|
||||
// m_file is invalid if collrec got nuked because so did
|
||||
// the Rdbbase which has the files
|
||||
log("db: lost collection while dumping to disk. making "
|
||||
"map null so we can stop.");
|
||||
m_map = NULL;
|
||||
}
|
||||
|
||||
|
||||
// see if what we wrote is the same as what we read back
|
||||
if ( m_verifyBuf && memcmp(m_verifyBuf,m_buf,m_bytesToWrite) != 0 &&
|
||||
! g_errno ) {
|
||||
|
16
Repair.cpp
@ -686,14 +686,26 @@ void Repair::initScan ( ) {
|
||||
|
||||
|
||||
// init secondary rdbs
|
||||
if ( m_rebuildTitledb )
|
||||
if ( m_rebuildTitledb ) {
|
||||
if ( ! g_titledb2.init2 ( titledbMem ) ) goto hadError;
|
||||
// clean tree in case loaded from saved file
|
||||
Rdb *r = g_titledb2.getRdb();
|
||||
if ( r ) r->m_tree.cleanTree();
|
||||
}
|
||||
|
||||
//if ( m_rebuildTfndb )
|
||||
// if ( ! g_tfndb2.init2 ( tfndbMem ) ) goto hadError;
|
||||
//if ( m_rebuildIndexdb )
|
||||
// if ( ! g_indexdb2.init2 ( indexdbMem ) ) goto hadError;
|
||||
if ( m_rebuildPosdb )
|
||||
if ( m_rebuildPosdb ) {
|
||||
if ( ! g_posdb2.init2 ( posdbMem ) ) goto hadError;
|
||||
// clean tree in case loaded from saved file
|
||||
Rdb *r = g_posdb2.getRdb();
|
||||
if ( r ) r->m_buckets.cleanBuckets();
|
||||
}
|
||||
|
||||
|
||||
|
||||
//if ( m_rebuildDatedb )
|
||||
// if ( ! g_datedb2.init2 ( datedbMem ) ) goto hadError;
|
||||
if ( m_rebuildClusterdb )
|
||||
|
@ -29,7 +29,7 @@ void SearchInput::clear ( int32_t niceness ) {
|
||||
reset();
|
||||
// set all to 0 just to avoid any inconsistencies
|
||||
int32_t size = (char *)&m_END_TEST - (char *)&m_START;
|
||||
memset ( this , 0x00 , size );
|
||||
memset ( &m_START , 0x00 , size );
|
||||
m_sbuf1.reset();
|
||||
m_sbuf2.reset();
|
||||
m_sbuf3.reset();
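The one-line fix above matters because size is measured from m_START, so the memset must start there too; clearing from "this" would wipe whatever precedes m_START and stop short of m_END_TEST. A standalone sketch of the bracketed-member reset pattern (class and member names here are illustrative, not SearchInput's):

#include <string.h>
#include <stdint.h>

// sketch of the m_START/m_END bracketing idiom
class BracketedInput {
public:
        void clear ( ) {
                // wipe only the members between the two markers
                int32_t size = (char *)&m_END - (char *)&m_START;
                memset ( &m_START , 0x00 , size );
        }
        char    m_START;
        int32_t m_niceness;
        int32_t m_numResults;
        char    m_END;
};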
|
||||
@ -185,6 +185,8 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
|
||||
// store list of collection #'s to search here. usually just one.
|
||||
m_collnumBuf.reset();
|
||||
|
||||
m_q.reset();
|
||||
|
||||
// zero out everything, set niceness to 0
|
||||
clear ( 0 ) ;
|
||||
|
||||
@ -339,10 +341,11 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
|
||||
|
||||
|
||||
if ( m_streamResults &&
|
||||
tmpFormat != FORMAT_XML &&
|
||||
tmpFormat != FORMAT_XML &&
|
||||
tmpFormat != FORMAT_CSV &&
|
||||
tmpFormat != FORMAT_JSON ) {
|
||||
log("si: streamResults only supported for "
|
||||
"json/html. disabling");
|
||||
"xml/csv/json. disabling");
|
||||
m_streamResults = false;
|
||||
}
|
||||
|
||||
|
412
Spider.cpp
@ -127,7 +127,8 @@ int32_t SpiderRequest::print ( SafeBuf *sbarg ) {
|
||||
strftime ( time , 256 , "%b %e %T %Y UTC", timeStruct );
|
||||
sb->safePrintf("addedTime=%s(%"UINT32") ",time,(uint32_t)m_addedTime );
|
||||
|
||||
sb->safePrintf("parentFirstIp=%s ",iptoa(m_parentFirstIp) );
|
||||
//sb->safePrintf("parentFirstIp=%s ",iptoa(m_parentFirstIp) );
|
||||
sb->safePrintf("pageNumInlinks=%i ",(int)m_pageNumInlinks);
|
||||
sb->safePrintf("parentHostHash32=0x%"XINT32" ",m_parentHostHash32 );
|
||||
sb->safePrintf("parentDomHash32=0x%"XINT32" ",m_parentDomHash32 );
|
||||
sb->safePrintf("parentSiteHash32=0x%"XINT32" ",m_parentSiteHash32 );
|
||||
@ -174,6 +175,7 @@ int32_t SpiderRequest::print ( SafeBuf *sbarg ) {
|
||||
if ( m_parentIsRSS ) sb->safePrintf("PARENTISRSS ");
|
||||
if ( m_parentIsPermalink ) sb->safePrintf("PARENTISPERMALINK ");
|
||||
if ( m_parentIsPingServer ) sb->safePrintf("PARENTISPINGSERVER ");
|
||||
if ( m_parentIsSiteMap ) sb->safePrintf("PARENTISSITEMAP ");
|
||||
if ( m_isMenuOutlink ) sb->safePrintf("MENUOUTLINK ");
|
||||
|
||||
if ( m_parentHasAddress ) sb->safePrintf("PARENTHASADDRESS ");
|
||||
@ -355,7 +357,7 @@ int32_t SpiderRequest::printToTable ( SafeBuf *sb , char *status ,
|
||||
|
||||
//sb->safePrintf(" <td>%s(%"UINT32")</td>\n",mstrerror(m_errCode),m_errCode);
|
||||
//sb->safePrintf(" <td>%"INT32"ms</td>\n",m_crawlDelay );
|
||||
sb->safePrintf(" <td>%s</td>\n",iptoa(m_parentFirstIp) );
|
||||
sb->safePrintf(" <td>%i</td>\n",(int)m_pageNumInlinks);
|
||||
sb->safePrintf(" <td>%"UINT64"</td>\n",getParentDocId() );
|
||||
|
||||
//sb->safePrintf(" <td>0x%"XINT32"</td>\n",m_parentHostHash32);
|
||||
@ -387,6 +389,7 @@ int32_t SpiderRequest::printToTable ( SafeBuf *sb , char *status ,
|
||||
if ( m_parentIsRSS ) sb->safePrintf("PARENTISRSS ");
|
||||
if ( m_parentIsPermalink ) sb->safePrintf("PARENTISPERMALINK ");
|
||||
if ( m_parentIsPingServer ) sb->safePrintf("PARENTISPINGSERVER ");
|
||||
if ( m_parentIsSiteMap ) sb->safePrintf("PARENTISSITEMAP ");
|
||||
if ( m_isMenuOutlink ) sb->safePrintf("MENUOUTLINK ");
|
||||
|
||||
if ( m_parentHasAddress ) sb->safePrintf("PARENTHASADDRESS ");
|
||||
@ -1209,6 +1212,7 @@ CollectionRec *SpiderColl::getCollectionRec ( ) {
|
||||
SpiderColl::SpiderColl () {
|
||||
m_overflowList = NULL;
|
||||
m_lastOverflowFirstIp = 0;
|
||||
m_lastPrinted = 0;
|
||||
m_deleteMyself = false;
|
||||
m_isLoading = false;
|
||||
m_gettingList1 = false;
|
||||
@ -1798,6 +1802,9 @@ void SpiderColl::clearLocks ( ) {
|
||||
|
||||
void SpiderColl::reset ( ) {
|
||||
|
||||
m_numSuccessReplies = 0;
|
||||
m_numFailedReplies = 0;
|
||||
|
||||
// reset these for SpiderLoop;
|
||||
m_nextDoledbKey.setMin();
|
||||
//m_didRound = false;
|
||||
@ -2309,14 +2316,16 @@ bool SpiderColl::addSpiderRequest ( SpiderRequest *sreq ,
|
||||
if ( priority >= MAX_SPIDER_PRIORITIES) {char *xx=NULL;*xx=0;}
|
||||
|
||||
// do not add to doledb if bad
|
||||
if ( priority == SPIDER_PRIORITY_FILTERED ) {
|
||||
//if ( priority == SPIDER_PRIORITY_FILTERED ) {
|
||||
if ( m_cr->m_forceDelete[ufn] ) {
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: request %s is filtered ufn=%"INT32"",
|
||||
sreq->m_url,ufn);
|
||||
return true;
|
||||
}
|
||||
|
||||
if ( priority == SPIDER_PRIORITY_BANNED ) {
|
||||
//if ( priority == SPIDER_PRIORITY_BANNED ) {
|
||||
if ( m_cr->m_forceDelete[ufn] ) {
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: request %s is banned ufn=%"INT32"",
|
||||
sreq->m_url,ufn);
|
||||
@ -2370,7 +2379,7 @@ bool SpiderColl::addSpiderRequest ( SpiderRequest *sreq ,
|
||||
"spider: %s request to waiting tree %s "
|
||||
"uh48=%"UINT64" "
|
||||
"firstIp=%s "
|
||||
"parentFirstIp=%"UINT32" "
|
||||
"pageNumInlinks=%"UINT32" "
|
||||
"parentdocid=%"UINT64" "
|
||||
"isinjecting=%"INT32" "
|
||||
"ispagereindex=%"INT32" "
|
||||
@ -2383,7 +2392,7 @@ bool SpiderColl::addSpiderRequest ( SpiderRequest *sreq ,
|
||||
sreq->m_url,
|
||||
sreq->getUrlHash48(),
|
||||
iptoa(sreq->m_firstIp),
|
||||
(uint32_t)sreq->m_parentFirstIp,
|
||||
(uint32_t)sreq->m_pageNumInlinks,//(uint32_t)sreq->m_parentFirstIp
|
||||
sreq->getParentDocId(),
|
||||
(int32_t)(bool)sreq->m_isInjecting,
|
||||
(int32_t)(bool)sreq->m_isPageReindex,
|
||||
@ -2787,6 +2796,27 @@ int32_t SpiderColl::getNextIpFromWaitingTree ( ) {
|
||||
return firstIp;
|
||||
}
|
||||
|
||||
uint64_t SpiderColl::getNextSpiderTimeFromWaitingTree ( ) {
|
||||
// if nothing to scan, bail
|
||||
if ( m_waitingTree.isEmpty() ) return 0LL;
|
||||
// the key
|
||||
key_t mink; mink.setMin();
|
||||
// set node from wait tree key. this way we can resume from a prev key
|
||||
int32_t node = m_waitingTree.getNextNode (0,(char *)&mink );
|
||||
// if empty, stop
|
||||
if ( node < 0 ) return 0LL;
|
||||
// get the key
|
||||
key_t *wk = (key_t *)m_waitingTree.getKey ( node );
|
||||
// time from that
|
||||
uint64_t spiderTimeMS = (wk->n1);
|
||||
spiderTimeMS <<= 32;
|
||||
spiderTimeMS |= ((wk->n0) >> 32);
|
||||
// stop if need to wait for this one
|
||||
return spiderTimeMS;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void gotSpiderdbListWrapper2( void *state , RdbList *list,Msg5 *msg5) {
|
||||
|
||||
SpiderColl *THIS = (SpiderColl *)state;
|
||||
@ -3535,6 +3565,11 @@ bool SpiderColl::evalIpLoop ( ) {
|
||||
m_didRead = true;
|
||||
// reset some stuff
|
||||
m_lastScanningIp = 0;
|
||||
|
||||
// reset these that need to keep track of requests for
|
||||
// the same url that might span two spiderdb lists or more
|
||||
m_lastSreqUh48 = 0LL;
|
||||
|
||||
// do a read. if it blocks it will recall this loop
|
||||
if ( ! readListFromSpiderdb () ) return false;
|
||||
}
|
||||
@ -3941,8 +3976,20 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
}
|
||||
// if its a SpiderReply set it for an upcoming requests
|
||||
if ( ! g_spiderdb.isSpiderRequest ( (key128_t *)rec ) ) {
|
||||
|
||||
// see if this is the most recent one
|
||||
SpiderReply *tmp = (SpiderReply *)rec;
|
||||
|
||||
// reset reply stats if beginning a new url
|
||||
if ( srepUh48 != tmp->getUrlHash48() ) {
|
||||
m_numSuccessReplies = 0;
|
||||
m_numFailedReplies = 0;
|
||||
}
|
||||
|
||||
// inc stats
|
||||
if ( tmp->m_errCode == 0 ) m_numSuccessReplies++;
|
||||
else m_numFailedReplies ++;
|
||||
|
||||
// if we have a more recent reply already, skip this
|
||||
if ( srep &&
|
||||
srep->getUrlHash48() == tmp->getUrlHash48() &&
|
||||
@ -3962,6 +4009,12 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
|
||||
int64_t uh48 = sreq->getUrlHash48();
|
||||
|
||||
// reset reply stats if beginning a new url
|
||||
if ( ! srep ) {
|
||||
m_numSuccessReplies = 0;
|
||||
m_numFailedReplies = 0;
|
||||
}
|
||||
|
||||
// . skip if our twin should add it to doledb
|
||||
// . waiting tree only has firstIps assigned to us so
|
||||
// this should not be necessary
|
||||
@ -4000,7 +4053,58 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
! sreq->m_fakeFirstIp )
|
||||
m_totalNewSpiderRequests++;
|
||||
|
||||
//int32_t ipdom ( int32_t ip ) { return ip & 0x00ffffff; };
|
||||
int32_t cblock = ipdom ( sreq->m_firstIp );
|
||||
|
||||
bool countIt = true;
|
||||
|
||||
// reset page inlink count on url request change
|
||||
if ( m_lastSreqUh48 != uh48 ) {
|
||||
m_pageNumInlinks = 0;
|
||||
m_lastCBlockIp = 0;
|
||||
}
|
||||
|
||||
//if ( uh48 != m_lastSreqUh48 )
|
||||
// countIt = false;
|
||||
|
||||
if ( cblock == m_lastCBlockIp )
|
||||
countIt = false;
|
||||
|
||||
// do not count manually added spider requests
|
||||
if ( (sreq->m_isAddUrl || sreq->m_isInjecting) )
|
||||
countIt = false;
|
||||
|
||||
// 20 is good enough
|
||||
if ( m_pageNumInlinks >= 20 )
|
||||
countIt = false;
|
||||
|
||||
if ( countIt ) {
|
||||
int32_t ca;
|
||||
for ( ca = 0 ; ca < m_pageNumInlinks ; ca++ )
|
||||
if ( m_cblocks[ca] == cblock ) break;
|
||||
// if found in our list, do not count it, already did
|
||||
if ( ca < m_pageNumInlinks )
|
||||
countIt = false;
|
||||
}
|
||||
|
||||
if ( countIt ) {
|
||||
m_cblocks[m_pageNumInlinks] = cblock;
|
||||
m_pageNumInlinks++;
|
||||
if ( m_pageNumInlinks > 20 ) { char *xx=NULL;*xx=0;}
|
||||
}
|
||||
|
||||
// set this now. it does increase with each request. so
|
||||
// initial requests will not see the full # of inlinks.
|
||||
sreq->m_pageNumInlinks = (uint8_t)m_pageNumInlinks;
|
||||
|
||||
// put these in the spiderequest in doledb so we can
|
||||
// show in the json spider status docs in
|
||||
// XmlDoc::getSpiderStatusDocMetaList2()
|
||||
sreq->m_reservedc1 = m_numSuccessReplies;
|
||||
sreq->m_reservedc2 = m_numFailedReplies;
|
||||
|
||||
m_lastSreqUh48 = uh48;
|
||||
m_lastCBlockIp = cblock;
|
||||
|
||||
// only add firstip if manually added and not fake
|
||||
|
||||
@ -4198,8 +4302,11 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
}
|
||||
// set the priority (might be the same as old)
|
||||
int32_t priority = m_cr->m_spiderPriorities[ufn];
|
||||
// now get rid of negative priorities since we added a
|
||||
// separate force delete checkbox in the url filters
|
||||
if ( priority < 0 ) priority = 0;
|
||||
// sanity checks
|
||||
if ( priority == -1 ) { char *xx=NULL;*xx=0; }
|
||||
//if ( priority == -1 ) { char *xx=NULL;*xx=0; }
|
||||
if ( priority >= MAX_SPIDER_PRIORITIES) {char *xx=NULL;*xx=0;}
|
||||
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
@ -4214,26 +4321,37 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
//if ( ! m_cr->m_spidersEnabled[ufn] ) continue;
|
||||
if ( m_cr->m_maxSpidersPerRule[ufn] <= 0 ) continue;
|
||||
|
||||
// skip if banned
|
||||
if ( priority == SPIDER_PRIORITY_FILTERED ) continue;
|
||||
if ( priority == SPIDER_PRIORITY_BANNED ) continue;
|
||||
// skip if banned (unless need to delete from index)
|
||||
bool skip = false;
|
||||
// if ( priority == SPIDER_PRIORITY_FILTERED ) skip = true;
|
||||
// if ( priority == SPIDER_PRIORITY_BANNED ) skip = true;
|
||||
if ( m_cr->m_forceDelete[ufn] ) skip = true;
|
||||
// but if it is currently indexed we have to delete it
|
||||
if ( skip && srep && srep->m_isIndexed ) skip = false;
|
||||
if ( skip ) continue;
|
||||
|
||||
// temp debug
|
||||
//char *xx=NULL;*xx=0;
|
||||
|
||||
if ( m_cr->m_forceDelete[ufn] )
|
||||
// force it to a delete
|
||||
sreq->m_forceDelete = true;
|
||||
|
||||
int64_t spiderTimeMS;
|
||||
spiderTimeMS = getSpiderTimeMS ( sreq,ufn,srep,nowGlobalMS );
|
||||
// how many outstanding spiders on a single IP?
|
||||
//int32_t maxSpidersPerIp = m_cr->m_spiderIpMaxSpiders[ufn];
|
||||
// sanity
|
||||
if ( (int64_t)spiderTimeMS < 0 ) {
|
||||
log("spider: got corrupt 2 spiderRequest in scan (cn=%"INT32")",
|
||||
log("spider: got corrupt 2 spiderRequest in "
|
||||
"scan (cn=%"INT32")",
|
||||
(int32_t)m_collnum);
|
||||
continue;
|
||||
}
|
||||
// more corruption detection
|
||||
if ( sreq->m_hopCount < -1 ) {
|
||||
log("spider: got corrupt 5 spiderRequest in scan (cn=%"INT32")",
|
||||
log("spider: got corrupt 5 spiderRequest in "
|
||||
"scan (cn=%"INT32")",
|
||||
(int32_t)m_collnum);
|
||||
continue;
|
||||
}
|
||||
@ -4245,8 +4363,8 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
|
||||
// if it is in future, skip it and just set m_futureTime and
|
||||
// and we will update the waiting tree
|
||||
// with an entry based on that future time if the winnerTree turns
|
||||
// out to be empty after we've completed our scan
|
||||
// with an entry based on that future time if the winnerTree
|
||||
// turns out to be empty after we've completed our scan
|
||||
if ( spiderTimeMS > nowGlobalMS ) {
|
||||
// if futuretime is zero set it to this time
|
||||
if ( ! m_minFutureTimeMS )
|
||||
@ -4422,8 +4540,17 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
wsreq->m_hopCount = sreq->m_hopCount;
|
||||
if ( wsreq->m_hopCount < sreq->m_hopCount )
|
||||
sreq->m_hopCount = wsreq->m_hopCount;
|
||||
// and the min added time as well!
|
||||
// get the oldest timestamp so
|
||||
// gbssDiscoveryTime will be accurate.
|
||||
if ( sreq->m_addedTime < wsreq->m_addedTime )
|
||||
wsreq->m_addedTime = sreq->m_addedTime;
|
||||
if ( wsreq->m_addedTime < sreq->m_addedTime )
|
||||
sreq->m_addedTime = wsreq->m_addedTime;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// are we lower priority? (or equal)
|
||||
// smaller keys are HIGHER priority.
|
||||
if(KEYCMP((char *)&wk,(char *)oldwk,
|
||||
@ -4770,9 +4897,11 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
log("spider: Checked list of %"INT32" spiderdb "
|
||||
"bytes (%"INT32" recs) "
|
||||
"for winners "
|
||||
"for firstip=%s. winnerTreeUsedNodes=%"INT32"",
|
||||
list->getListSize(),recCount,iptoa(m_scanningIp),
|
||||
m_winnerTree.getNumUsedNodes());
|
||||
"for firstip=%s. winnerTreeUsedNodes=%"INT32" #newreqs=%"
|
||||
INT64
|
||||
,list->getListSize(),recCount,iptoa(m_scanningIp),
|
||||
m_winnerTree.getNumUsedNodes(),
|
||||
m_totalNewSpiderRequests);
|
||||
// reset any errno cuz we're just a cache
|
||||
g_errno = 0;
|
||||
|
||||
@ -6126,6 +6255,9 @@ void SpiderLoop::spiderDoledUrls ( ) {
|
||||
if ( ! m_activeListValid ) {
|
||||
buildActiveList();
|
||||
m_crx = m_activeList;
|
||||
// recompute every 3 seconds, it seems kinda buggy!!
|
||||
m_recalcTime = nowGlobal + 3;
|
||||
m_recalcTimeValid = true;
|
||||
}
|
||||
|
||||
// start again at head
|
||||
@ -7896,23 +8028,25 @@ bool SpiderLoop::indexedDoc ( XmlDoc *xd ) {
|
||||
// care of g_errno now by clearing it and adding an error spider
|
||||
// reply to release the lock!!
|
||||
if ( g_errno ) {
|
||||
log("spider: ----CRITICAL CRITICAL CRITICAL----");
|
||||
log("spider: ----CRITICAL CRITICAL CRITICAL----");
|
||||
log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
// log("spider: ----CRITICAL CRITICAL CRITICAL----");
|
||||
// log("spider: ----CRITICAL CRITICAL CRITICAL----");
|
||||
// log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
// log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
// log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
log("spider: spidering %s has error: %s. uh48=%"INT64". "
|
||||
"Respidering "
|
||||
"in %"INT32" seconds. MAX_LOCK_AGE when lock expires.",
|
||||
//"Respidering "
|
||||
//"in %"INT32" seconds. MAX_LOCK_AGE when lock expires. "
|
||||
"cn=%"INT32"",
|
||||
xd->m_firstUrl.m_url,
|
||||
mstrerror(g_errno),
|
||||
xd->getFirstUrlHash48(),
|
||||
(int32_t)MAX_LOCK_AGE);
|
||||
log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
log("spider: ----CRITICAL CRITICAL CRITICAL----");
|
||||
log("spider: ----CRITICAL CRITICAL CRITICAL----");
|
||||
//(int32_t)MAX_LOCK_AGE,
|
||||
(int32_t)collnum);
|
||||
// log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
// log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
// log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
// log("spider: ----CRITICAL CRITICAL CRITICAL----");
|
||||
// log("spider: ----CRITICAL CRITICAL CRITICAL----");
|
||||
// don't release the lock on it right now. just let the
|
||||
// lock expire on it after MAX_LOCK_AGE seconds. then it will
|
||||
// be retried. we need to debug gb so these things never
|
||||
@ -10782,6 +10916,10 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
HashTableX *quotaTable ,
|
||||
int32_t langIdArg ) {
|
||||
|
||||
if ( ! sreq ) {
|
||||
log("spider: sreq is NULL!");
|
||||
}
|
||||
|
||||
int32_t langId = langIdArg;
|
||||
if ( srep ) langId = srep->m_langId;
|
||||
|
||||
@ -11264,6 +11402,57 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
if ( strncmp(p,"isparentsitemap",15) == 0 ) {
|
||||
// skip for msg20
|
||||
if ( isForMsg20 ) continue;
|
||||
// if no match continue
|
||||
if ( (bool)sreq->m_parentIsSiteMap == val) continue;
|
||||
// skip
|
||||
p += 15;
|
||||
// skip to next constraint
|
||||
p = strstr(p, "&&");
|
||||
// all done?
|
||||
if ( ! p ) return i;
|
||||
p += 2;
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
// does it have an rss inlink? we want to expedite indexing
|
||||
// of such pages. i.e. that we gather from an rss feed that
|
||||
// we got from a pingserver...
|
||||
if ( strncmp(p,"isroot",6) == 0 ) {
|
||||
// skip for msg20
|
||||
//if ( isForMsg20 ) continue;
|
||||
// this is a docid only url, no actual url, so skip
|
||||
if ( sreq->m_isPageReindex ) continue;
|
||||
// a fast check
|
||||
char *u = sreq->m_url;
|
||||
// skip http
|
||||
u += 4;
|
||||
// then optional s for https
|
||||
if ( *u == 's' ) u++;
|
||||
// then ://
|
||||
u += 3;
|
||||
// scan until \0 or /
|
||||
for ( ; *u && *u !='/' ; u++ );
|
||||
// if \0 we are root
|
||||
bool isRoot = true;
|
||||
if ( *u == '/' ) {
|
||||
u++;
|
||||
if ( *u ) isRoot = false;
|
||||
}
|
||||
// if we are not root
|
||||
if ( isRoot == val ) continue;
|
||||
// skip
|
||||
p += 6;
|
||||
// skip to next constraint
|
||||
p = strstr(p, "&&");
|
||||
// all done?
|
||||
if ( ! p ) return i;
|
||||
p += 2;
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
/*
|
||||
if ( strncmp(p,"isparentindexed",16) == 0 ) {
|
||||
// skip for msg20
|
||||
@ -11506,6 +11695,21 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
// check for "isrss" aka "rss"
|
||||
if ( strncmp(p,"isrssext",8) == 0 ) {
|
||||
// if we are not rss, we do not match this rule
|
||||
if ( (bool)sreq->m_isRSSExt == val ) continue;
|
||||
// skip it
|
||||
p += 8;
|
||||
// check for &&
|
||||
p = strstr(p, "&&");
|
||||
// if nothing, else then it is a match
|
||||
if ( ! p ) return i;
|
||||
// skip the '&&' and go to next rule
|
||||
p += 2;
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
// check for permalinks. for new outlinks we *guess* if its
|
||||
// a permalink by calling isPermalink() function.
|
||||
if (!strncmp(p,"ispermalink",11) ) {
|
||||
@ -11602,10 +11806,9 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
}
|
||||
// iswww, means url is like www.xyz.com/...
|
||||
if ( strncmp(p,"iswww", 5) == 0 ) {
|
||||
// now this is a bit
|
||||
if ( (bool)sreq->m_isWWWSubdomain == (bool)val )
|
||||
continue;
|
||||
/*
|
||||
// now this is a bit - doesn't seem to be working yet
|
||||
//if ( (bool)sreq->m_isWWWSubdomain == (bool)val )
|
||||
// continue;
|
||||
// skip "iswww"
|
||||
p += 5;
|
||||
// skip over http:// or https://
|
||||
@ -11619,7 +11822,6 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
u[2] == 'w' ) isWWW = 1;
|
||||
// skip if no match
|
||||
if ( isWWW == val ) continue;
|
||||
*/
|
||||
// TODO: fix www.knightstown.skepter.com
|
||||
// maybe just have a bit in the spider request
|
||||
// another rule?
|
||||
@ -12141,12 +12343,37 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
if ( *p == 'n' && strncmp(p,"numinlinks",10) == 0 ) {
|
||||
// skip for msg20
|
||||
if ( isForMsg20 ) continue;
|
||||
// these are -1 if they are NOT valid
|
||||
int32_t a = sreq->m_pageNumInlinks;
|
||||
// make it point to the priority
|
||||
int32_t b = atoi(s);
|
||||
// compare
|
||||
if ( sign == SIGN_EQ && a != b ) continue;
|
||||
if ( sign == SIGN_NE && a == b ) continue;
|
||||
if ( sign == SIGN_GT && a <= b ) continue;
|
||||
if ( sign == SIGN_LT && a >= b ) continue;
|
||||
if ( sign == SIGN_GE && a < b ) continue;
|
||||
if ( sign == SIGN_LE && a > b ) continue;
|
||||
// skip fast
|
||||
p += 10;
|
||||
p = strstr(s, "&&");
|
||||
//if nothing, else then it is a match
|
||||
if ( ! p ) return i;
|
||||
//skip the '&&' and go to next rule
|
||||
p += 2;
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
// siteNumInlinks >= 300 [&&]
|
||||
if ( *p=='s' && strncmp(p, "sitenuminlinks", 14) == 0){
|
||||
// these are -1 if they are NOT valid
|
||||
int32_t a1 = sreq->m_siteNumInlinks;
|
||||
// only assign if valid
|
||||
int32_t a2 = -1; if ( srep ) a2 = srep->m_siteNumInlinks;
|
||||
int32_t a2 = -1;
|
||||
if ( srep ) a2 = srep->m_siteNumInlinks;
|
||||
// assume a1 is the best
|
||||
int32_t a ;
|
||||
// assign to the first valid one
|
||||
@ -12720,18 +12947,21 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
|
||||
}
|
||||
|
||||
// try to kinda grab the min hop count as well
|
||||
if ( sreq->m_hopCountValid && oldReq->m_hopCountValid ) {
|
||||
if ( oldReq->m_hopCount < sreq->m_hopCount )
|
||||
sreq->m_hopCount = oldReq->m_hopCount;
|
||||
else
|
||||
oldReq->m_hopCount = sreq->m_hopCount;
|
||||
}
|
||||
// do not alter spiderdb!
|
||||
// if ( sreq->m_hopCountValid && oldReq->m_hopCountValid ) {
|
||||
// if ( oldReq->m_hopCount < sreq->m_hopCount )
|
||||
// sreq->m_hopCount = oldReq->m_hopCount;
|
||||
// else
|
||||
// oldReq->m_hopCount = sreq->m_hopCount;
|
||||
// }
|
||||
|
||||
// if he's essentially different input parms but for the
|
||||
// same url, we want to keep him because he might map the
|
||||
// url to a different url priority!
|
||||
if ( oldReq->m_siteHash32 != sreq->m_siteHash32 ||
|
||||
oldReq->m_isNewOutlink != sreq->m_isNewOutlink ||
|
||||
// use hopcount now too!
|
||||
oldReq->m_hopCount != sreq->m_hopCount ||
|
||||
// makes a difference as far a m_minPubDate goes, because
|
||||
// we want to make sure not to delete that request that
|
||||
// has m_parentPrevSpiderTime
|
||||
@ -12748,7 +12978,8 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
|
||||
goto addIt;
|
||||
// . if the same check who has the most recent added time
|
||||
// . if we are not the most recent, just do not add us
|
||||
if ( sreq->m_addedTime <= oldReq->m_addedTime ) continue;
|
||||
// . no, now i want the oldest so we can do gbssDiscoveryTime
|
||||
if ( sreq->m_addedTime >= oldReq->m_addedTime ) continue;
|
||||
// otherwise, erase over him
|
||||
dst = restorePoint;
|
||||
lastKey = prevLastKey;
|
||||
@ -13342,6 +13573,8 @@ void handleRequestc1 ( UdpSlot *slot , int32_t niceness ) {
|
||||
|
||||
uint32_t now = (uint32_t)getTimeGlobalNoCore();
|
||||
|
||||
uint64_t nowMS = gettimeofdayInMillisecondsGlobalNoCore();
|
||||
|
||||
//SpiderColl *sc = g_spiderCache.getSpiderColl(collnum);
|
||||
|
||||
for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
|
||||
@ -13395,6 +13628,44 @@ void handleRequestc1 ( UdpSlot *slot , int32_t niceness ) {
|
||||
//g_conf.m_spideringEnabled &&
|
||||
ci->m_lastSpiderAttempt - ci->m_lastSpiderCouldLaunch >
|
||||
spiderDoneTimer ) {
|
||||
|
||||
// break it here for our collnum to see if
|
||||
// doledb was just lagging or not.
|
||||
bool printIt = true;
|
||||
if ( now < sc->m_lastPrinted ) printIt = false;
|
||||
if ( printIt ) sc->m_lastPrinted = now + 5;
|
||||
|
||||
// doledb must be empty
|
||||
if ( ! sc->m_doleIpTable.isEmpty() ) {
|
||||
if ( printIt )
|
||||
log("spider: not ending crawl because "
|
||||
"doledb not empty for coll=%s",cr->m_coll);
|
||||
goto doNotEnd;
|
||||
}
|
||||
|
||||
uint64_t nextTimeMS ;
|
||||
nextTimeMS = sc->getNextSpiderTimeFromWaitingTree ( );
|
||||
|
||||
// and no ips awaiting scans to get into doledb
|
||||
// except for ips needing scans 60+ seconds from now
|
||||
if ( nextTimeMS && nextTimeMS < nowMS + 60000 ) {
|
||||
if ( printIt )
|
||||
log("spider: not ending crawl because "
|
||||
"waiting tree key is ready for scan "
|
||||
"%"INT64" ms from now for coll=%s",
|
||||
nextTimeMS - nowMS,cr->m_coll );
|
||||
goto doNotEnd;
|
||||
}
|
||||
|
||||
// maybe wait for waiting tree population to finish
|
||||
if ( sc->m_waitingTreeNeedsRebuild ) {
|
||||
if ( printIt )
|
||||
log("spider: not ending crawl because "
|
||||
"waiting tree is building for coll=%s",
|
||||
cr->m_coll );
|
||||
goto doNotEnd;
|
||||
}
|
||||
|
||||
// this is the MOST IMPORTANT variable so note it
|
||||
log(LOG_INFO,
|
||||
"spider: coll %s has no more urls to spider",
|
||||
@ -13407,6 +13678,7 @@ void handleRequestc1 ( UdpSlot *slot , int32_t niceness ) {
|
||||
cr->m_needsSave = true;
|
||||
}
|
||||
|
||||
doNotEnd:
|
||||
|
||||
int32_t hostId = slot->m_host->m_hostId;
|
||||
|
||||
@ -13454,39 +13726,64 @@ void handleRequestc1 ( UdpSlot *slot , int32_t niceness ) {
|
||||
|
||||
bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
|
||||
|
||||
if ( ! g_conf.m_spideringEnabled && ! cx->m_isCustomCrawl )
|
||||
if ( ! g_conf.m_spideringEnabled && ! cx->m_isCustomCrawl ) {
|
||||
*status = SP_ADMIN_PAUSED;
|
||||
return msg->safePrintf("Spidering disabled in "
|
||||
"master controls. You can turn it "
|
||||
"back on there.");
|
||||
}
|
||||
|
||||
if ( g_conf.m_readOnlyMode )
|
||||
if ( g_conf.m_readOnlyMode ) {
|
||||
*status = SP_ADMIN_PAUSED;
|
||||
return msg->safePrintf("In read-only mode. Spidering off.");
|
||||
}
|
||||
|
||||
if ( g_dailyMerge.m_mergeMode )
|
||||
if ( g_dailyMerge.m_mergeMode ) {
|
||||
*status = SP_ADMIN_PAUSED;
|
||||
return msg->safePrintf("Daily merge engaged, spidering "
|
||||
"paused.");
|
||||
}
|
||||
|
||||
if ( g_udpServer.getNumUsedSlots() >= 1300 )
|
||||
if ( g_udpServer.getNumUsedSlots() >= 1300 ) {
|
||||
*status = SP_ADMIN_PAUSED;
|
||||
return msg->safePrintf("Too many UDP slots in use, "
|
||||
"spidering paused.");
|
||||
}
|
||||
|
||||
if ( g_repairMode )
|
||||
if ( g_repairMode ) {
|
||||
*status = SP_ADMIN_PAUSED;
|
||||
return msg->safePrintf("In repair mode, spidering paused.");
|
||||
}
|
||||
|
||||
// do not spider until collections/parms in sync with host #0
|
||||
if ( ! g_parms.m_inSyncWithHost0 )
|
||||
if ( ! g_parms.m_inSyncWithHost0 ) {
|
||||
*status = SP_ADMIN_PAUSED;
|
||||
return msg->safePrintf("Parms not in sync with host #0, "
|
||||
"spidering paused");
|
||||
}
|
||||
|
||||
// don't spider if not all hosts are up, or they do not all
|
||||
// have the same hosts.conf.
|
||||
if ( g_pingServer.m_hostsConfInDisagreement )
|
||||
if ( g_pingServer.m_hostsConfInDisagreement ) {
|
||||
*status = SP_ADMIN_PAUSED;
|
||||
return msg->safePrintf("Hosts.conf discrepancy, "
|
||||
"spidering paused.");
|
||||
|
||||
}
|
||||
|
||||
uint32_t now = (uint32_t)getTimeGlobal();
|
||||
|
||||
// try to fix crawlbot nightly test complaining about job status
|
||||
// for TestRepeatCrawlWithMaxToCrawl
|
||||
if ( (cx->m_spiderStatus == SP_MAXTOCRAWL ||
|
||||
cx->m_spiderStatus == SP_MAXTOPROCESS ) &&
|
||||
cx->m_collectiveRespiderFrequency > 0.0 &&
|
||||
now < cx->m_spiderRoundStartTime &&
|
||||
cx->m_spiderRoundNum >= cx->m_maxCrawlRounds ) {
|
||||
*status = SP_MAXROUNDS;
|
||||
return msg->safePrintf ( "Job has reached maxRounds "
|
||||
"limit." );
|
||||
}
|
||||
|
||||
// . 0 means not to RE-crawl
|
||||
// . indicate if we are WAITING for next round...
|
||||
if ( cx->m_spiderStatus == SP_MAXTOCRAWL &&
|
||||
@ -13587,6 +13884,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
|
||||
if ( ! cx->m_isCustomCrawl &&
|
||||
! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) {
|
||||
//*status = SP_COMPLETED;
|
||||
*status = SP_INPROGRESS;
|
||||
return msg->safePrintf ( "Nothing currently "
|
||||
"available to spider. "
|
||||
"Change your url filters, try "
|
||||
@ -13783,7 +14081,7 @@ bool SpiderRequest::setFromAddUrl ( char *url ) {
|
||||
m_isAddUrl = 1;
|
||||
m_addedTime = (uint32_t)getTimeGlobal();//now;
|
||||
m_fakeFirstIp = 1;
|
||||
m_probDocId = probDocId;
|
||||
//m_probDocId = probDocId;
|
||||
m_firstIp = firstIp;
|
||||
m_hopCount = 0;
|
||||
|
||||
@ -13893,10 +14191,12 @@ void SpiderLoop::buildActiveList ( ) {
|
||||
//
|
||||
if ( nowGlobal < cr->m_spiderRoundStartTime ) {
|
||||
active = false;
|
||||
if ( cr->m_spiderRoundStartTime < m_recalcTime ) {
|
||||
m_recalcTime = cr->m_spiderRoundStartTime;
|
||||
m_recalcTimeValid = true;
|
||||
}
|
||||
// no need to do this now since we recalc every
|
||||
// 3 seconds anyway...
|
||||
// if ( cr->m_spiderRoundStartTime < m_recalcTime ) {
|
||||
// m_recalcTime = cr->m_spiderRoundStartTime;
|
||||
// m_recalcTimeValid = true;
|
||||
// }
|
||||
}
|
||||
|
||||
if ( ! active ) continue;
|
||||
|
30
Spider.h
@ -509,15 +509,24 @@ class SpiderRequest {
|
||||
// spidered (when m_url was not an outlink on its parent page)
|
||||
uint32_t m_parentPrevSpiderTime; // time_t
|
||||
|
||||
//int32_t m_parentFirstIp;
|
||||
// # of spider requests from different c-blocks. capped at 255.
|
||||
// taken from the # of SpiderRequests.
|
||||
uint8_t m_pageNumInlinks;
|
||||
uint8_t m_reservedb2;
|
||||
uint8_t m_reservedb3;
|
||||
uint8_t m_reservedb4;
|
||||
|
||||
// info on the page we were harvested from
|
||||
int32_t m_parentFirstIp;
|
||||
int32_t m_parentHostHash32;
|
||||
int32_t m_parentDomHash32;
|
||||
int32_t m_parentSiteHash32;
|
||||
|
||||
// the PROBABLE DOCID. if there is a collision with another docid
|
||||
// then we increment the last 8 bits or so. see Msg22.cpp.
|
||||
int64_t m_probDocId;
|
||||
//int64_t m_probDocId;
|
||||
int32_t m_reservedc1;
|
||||
int32_t m_reservedc2;
|
||||
|
||||
//int32_t m_parentPubDate;
|
||||
|
||||
@ -583,11 +592,12 @@ class SpiderRequest {
|
||||
// or from PageParser.cpp directly
|
||||
int32_t m_isPageParser:1;
|
||||
// should we use the test-spider-dir for caching test coll requests?
|
||||
int32_t m_useTestSpiderDir:1;
|
||||
//int32_t m_useTestSpiderDir:1;
|
||||
int32_t m_parentIsSiteMap:1;
|
||||
// . is the url a docid (not an actual url)
|
||||
// . could be a "query reindex"
|
||||
int32_t m_urlIsDocId:1;
|
||||
// does m_url end in .rss? or a related rss file extension?
|
||||
// does m_url end in .rss .xml .atom? or a related rss file extension?
|
||||
int32_t m_isRSSExt:1;
|
||||
// is url in a format known to be a permalink format?
|
||||
int32_t m_isUrlPermalinkFormat:1;
|
||||
@ -921,7 +931,7 @@ class SpiderReply {
|
||||
// was the request an injection request
|
||||
int32_t m_fromInjectionRequest :1;
|
||||
// did we TRY to send it to the diffbot backend filter? might be err?
|
||||
int32_t m_sentToDiffbot :1;
|
||||
int32_t m_sentToDiffbotThisTime :1;
|
||||
int32_t m_hadDiffbotError :1;
|
||||
// . was it in the index when we started?
|
||||
// . we use this with m_isIndexed above to adjust quota counts for
|
||||
@ -1145,6 +1155,9 @@ class SpiderColl {
|
||||
int32_t m_tailHopCount;
|
||||
int64_t m_minFutureTimeMS;
|
||||
|
||||
int32_t m_numSuccessReplies;
|
||||
int32_t m_numFailedReplies;
|
||||
|
||||
// . do not re-send CrawlInfoLocal for a coll if not update
|
||||
// . we store the flags in here as true if we should send our
|
||||
// CrawlInfoLocal for this coll to this hostId
|
||||
@ -1212,6 +1225,7 @@ class SpiderColl {
|
||||
int32_t m_numAdded;
|
||||
int64_t m_numBytesScanned;
|
||||
int64_t m_lastPrintCount;
|
||||
int64_t m_lastPrinted;
|
||||
|
||||
// used by SpiderLoop.cpp
|
||||
int32_t m_spidersOut;
|
||||
@ -1253,6 +1267,7 @@ class SpiderColl {
|
||||
bool addToWaitingTree ( uint64_t spiderTime , int32_t firstIp ,
|
||||
bool callForScan );
|
||||
int32_t getNextIpFromWaitingTree ( );
|
||||
uint64_t getNextSpiderTimeFromWaitingTree ( ) ;
|
||||
void populateDoledbFromWaitingTree ( );
|
||||
|
||||
//bool scanSpiderdb ( bool needList );
|
||||
@ -1305,6 +1320,11 @@ class SpiderColl {
|
||||
int32_t *m_overflowList;
|
||||
int64_t m_totalNewSpiderRequests;
|
||||
int64_t m_lastSreqUh48;
|
||||
|
||||
int32_t m_cblocks[20];
|
||||
int32_t m_pageNumInlinks;
|
||||
int32_t m_lastCBlockIp;
|
||||
|
||||
int32_t m_lastOverflowFirstIp;
|
||||
|
||||
private:
|
||||
|
30
Tagdb.cpp
@ -4873,7 +4873,19 @@ bool isTagTypeUnique ( int32_t tt ) {
|
||||
// make sure table is valid
|
||||
if ( ! s_initialized ) g_tagdb.setHashTable();
|
||||
// look up in hash table
|
||||
TagDesc *td = *(TagDesc **)s_ht.getValue ( &tt );
|
||||
TagDesc **tdp = (TagDesc **)s_ht.getValue ( &tt );
|
||||
if ( ! tdp ) {
|
||||
log("tagdb: tag desc is NULL for tag type %"INT32" assuming "
|
||||
"not indexable",tt);
|
||||
return false;
|
||||
}
|
||||
// do not core for now
|
||||
TagDesc *td = *tdp;
|
||||
if ( ! td ) {
|
||||
log("tagdb: got unknown tag type %"INT32" assuming "
|
||||
"unique",tt);
|
||||
return true;
|
||||
}
|
||||
// if none, that is crazy
|
||||
if ( ! td ) { char *xx=NULL;*xx=0; }
|
||||
// return
|
||||
@ -4887,8 +4899,20 @@ bool isTagTypeIndexable ( int32_t tt ) {
|
||||
// make sure table is valid
|
||||
if ( ! s_initialized ) g_tagdb.setHashTable();
|
||||
// look up in hash table
|
||||
TagDesc *td = *(TagDesc **)s_ht.getValue ( &tt );
|
||||
// if none, that is crazy
|
||||
TagDesc **tdp = (TagDesc **)s_ht.getValue ( &tt );
|
||||
// do not core for now
|
||||
if ( ! tdp ) {
|
||||
log("tagdb: got unknown tag type %"INT32" assuming "
|
||||
"not indexable",tt);
|
||||
return false;
|
||||
}
|
||||
TagDesc *td = *tdp;
|
||||
if ( ! td ) {
|
||||
log("tagdb: tag desc is NULL for tag type %"INT32" assuming "
|
||||
"not indexable",tt);
|
||||
return false;
|
||||
}
|
||||
// if none, that is crazy MDW coring here:
|
||||
if ( ! td ) { char *xx=NULL;*xx=0; }
|
||||
// return false if we should not index it
|
||||
if ( td->m_flags & TDF_NOINDEX ) return false;
|
||||
|
5
Test.cpp
@ -932,11 +932,12 @@ bool Test::injectLoop ( ) {
|
||||
m_sreq.m_domHash32 = fakeIp;
|
||||
m_sreq.m_hostHash32 = fakeIp;
|
||||
m_sreq.m_siteHash32 = fakeIp;
|
||||
m_sreq.m_probDocId = g_titledb.getProbableDocId( m_sreq.m_url );
|
||||
//m_sreq.m_probDocId = g_titledb.getProbableDocId( m_sreq.m_url );
|
||||
// this crap is fake
|
||||
m_sreq.m_isInjecting = 1;
|
||||
// use test-spider subdir for storing pages and spider times?
|
||||
if ( g_conf.m_testSpiderEnabled ) m_sreq.m_useTestSpiderDir = 1;
|
||||
// MDW: this was replaced by m_isParentSiteMap bit.
|
||||
//if ( g_conf.m_testSpiderEnabled ) m_sreq.m_useTestSpiderDir = 1;
|
||||
// use this later
|
||||
m_sreq.m_hasContent = 0;
|
||||
// injected requests use this as the spider time i guess
|
||||
|
210
UdpServer.cpp
@ -251,7 +251,7 @@ bool UdpServer::init ( uint16_t port, UdpProtocol *proto, int32_t niceness,
|
||||
m_head2 = NULL;
|
||||
m_tail2 = NULL;
|
||||
// linked list of callback candidates
|
||||
//m_head3 = NULL;
|
||||
m_head3 = NULL;
|
||||
// . set up hash table that converts key (ip/port/transId) to a slot
|
||||
// . m_numBuckets must be power of 2
|
||||
m_numBuckets = getHighestLitBitValue ( m_maxSlots * 6 );
|
||||
@ -267,6 +267,7 @@ bool UdpServer::init ( uint16_t port, UdpProtocol *proto, int32_t niceness,
|
||||
log(LOG_DEBUG,"udp: Allocated %"INT32" bytes for table.",m_bufSize);
|
||||
|
||||
m_numUsedSlots = 0;
|
||||
m_numUsedSlotsIncoming = 0;
|
||||
// clear this
|
||||
m_isShuttingDown = false;
|
||||
// and this
|
||||
@ -555,7 +556,7 @@ bool UdpServer::sendRequest ( char *msg ,
|
||||
|
||||
// . create a new slot to control the transmission of this request
|
||||
// . should set g_errno on failure
|
||||
UdpSlot *slot = getEmptyUdpSlot_ass ( key );
|
||||
UdpSlot *slot = getEmptyUdpSlot_ass ( key , false );
|
||||
if ( ! slot ) {
|
||||
if ( flipped ) interruptsOn();
|
||||
return log("udp: All %"INT32" slots are in use.",m_maxSlots);
|
||||
@ -601,6 +602,8 @@ bool UdpServer::sendRequest ( char *msg ,
|
||||
return log("udp: Failed to initialize udp socket for "
|
||||
"sending req: %s",mstrerror(g_errno));
|
||||
}
|
||||
|
||||
if ( slot->m_next3 || slot->m_prev3 ) { char *xx=NULL;*xx=0; }
|
||||
// set this
|
||||
slot->m_maxResends = maxResends;
|
||||
// keep sending dgrams until we have no more or hit ACK_WINDOW limit
|
||||
@ -675,6 +678,9 @@ void UdpServer::sendReply_ass ( char *msg ,
|
||||
log(LOG_LOGIC,"udp: sendReply_ass: Callback is non-NULL.");
|
||||
return;
|
||||
}
|
||||
if ( ! msg && msgSize > 0 )
|
||||
log("udp: calling sendreply with null send buffer and "
|
||||
"positive size! will probably core.");
|
||||
// record some statistics on how long these msg handlers are taking
|
||||
int64_t now = gettimeofdayInMillisecondsLocal();
|
||||
// m_queuedTime should have been set before m_handlers[] was called
|
||||
@ -1069,6 +1075,8 @@ void UdpServer::process_ass ( int64_t now , int32_t maxNiceness) {
|
||||
// bail if no main sock
|
||||
if ( m_sock < 0 ) return ;
|
||||
|
||||
//log("process_ass");
|
||||
|
||||
// if we call this while in the sighandler it crashes since
|
||||
// gettimeofdayInMillisecondsLocal() is not async safe
|
||||
int64_t startTimer;
|
||||
@ -1099,7 +1107,16 @@ void UdpServer::process_ass ( int64_t now , int32_t maxNiceness) {
|
||||
// if no slot was set, it was a slotless read so keep looping
|
||||
if ( ! slot ) { g_errno = 0; goto readAgain; }
|
||||
// if there was a read error let makeCallback() know about it
|
||||
if ( status == -1 ) slot->m_errno = g_errno;
|
||||
if ( status == -1 ) {
|
||||
slot->m_errno = g_errno;
|
||||
// prepare to call the callback by adding it to this
|
||||
// special linked list
|
||||
if ( g_errno )
|
||||
addToCallbackLinkedList ( slot );
|
||||
// sanity
|
||||
if ( ! g_errno )
|
||||
log("udp: missing g_errno from read error");
|
||||
}
|
||||
// we read something
|
||||
something = true;
|
||||
// try sending an ACK on the slot we read something from
|
||||
@ -1108,6 +1125,7 @@ void UdpServer::process_ass ( int64_t now , int32_t maxNiceness) {
|
||||
// if we read something, try for more
|
||||
if ( something ) {
|
||||
//if ( slot->m_errno || slot->isTransactionComplete())
|
||||
//log("got something");
|
||||
needCallback = true;
|
||||
goto loop;
|
||||
}
|
||||
@ -1131,6 +1149,8 @@ void UdpServer::process_ass ( int64_t now , int32_t maxNiceness) {
|
||||
if ( makeCallbacks_ass ( /*niceness level*/ 0 ) ) {
|
||||
// set flag to call low priority callbacks
|
||||
m_needBottom = true;
|
||||
// note it
|
||||
//log("made callback");
|
||||
// but not now, only when we don't call any high priorities
|
||||
goto bigloop;
|
||||
}
|
||||
@ -1140,17 +1160,19 @@ void UdpServer::process_ass ( int64_t now , int32_t maxNiceness) {
|
||||
// gettimeofdayInMillisecondsLocal() is not async safe
|
||||
int64_t elapsed = 0;
|
||||
if ( ! g_inSigHandler )
|
||||
elapsed = gettimeofdayInMillisecondsLocal() - startTimer;
|
||||
elapsed = gettimeofdayInMillisecondsLocal() - startTimer;
|
||||
if(elapsed < 10) {
|
||||
// we did not call any, so resort to nice callbacks
|
||||
makeCallbacks_ass ( /*niceness level*/ 1 ) ;
|
||||
// . only go to bigloop if we called a callback
|
||||
if ( makeCallbacks_ass ( /*niceness level*/ 1 ) )
|
||||
goto bigloop;
|
||||
// no longer need to be called
|
||||
// if we did anything loop back up
|
||||
// . but only if we haven't been looping forever,
|
||||
// . if so we need to relinquish control to loop.
|
||||
// log(LOG_WARN, "udp: give back control. after %"INT64"",
|
||||
// elapsed);
|
||||
goto bigloop;
|
||||
//goto bigloop;
|
||||
}
|
||||
else {
|
||||
m_needBottom = true;
|
||||
@ -1239,12 +1261,19 @@ int32_t UdpServer::readSock_ass ( UdpSlot **slotPtr , int64_t now ) {
|
||||
log("loop: readsock_ass: peekSize=%i m_sock/fd=%i",
|
||||
peekSize,m_sock);
|
||||
|
||||
//static int s_ss = 0;
|
||||
|
||||
// cancel silly g_errnos and return 0 since we blocked
|
||||
if ( peekSize < 0 ) {
|
||||
g_errno = errno;
|
||||
if ( flipped ) interruptsOn();
|
||||
if ( g_errno == EAGAIN || g_errno == 0 ) { g_errno = 0; return 0; }
|
||||
if ( g_errno == EILSEQ ) { g_errno = 0; return 0; }
|
||||
if ( g_errno == EAGAIN || g_errno == 0 ) {
|
||||
// if ( s_ss++ == 100 ) {
|
||||
// log("foo");char *xx=NULL;*xx=0; }
|
||||
// log("udp: EAGAIN");
|
||||
g_errno = 0; return 0; }
|
||||
if ( g_errno == EILSEQ ) {
|
||||
g_errno = 0; return 0; }
|
||||
// Interrupted system call (4) (from valgrind)
|
||||
#ifdef _VALGRIND_
|
||||
if ( g_errno == 4 ) { g_errno = 0; return 0;}
|
||||
@ -1592,7 +1621,7 @@ int32_t UdpServer::readSock_ass ( UdpSlot **slotPtr , int64_t now ) {
|
||||
|
||||
if ( getSlot )
|
||||
// get a new UdpSlot
|
||||
slot = getEmptyUdpSlot_ass ( key );
|
||||
slot = getEmptyUdpSlot_ass ( key , true );
|
||||
// return -1 on failure
|
||||
if ( ! slot ) {
|
||||
// return -1
|
||||
@ -1693,8 +1722,25 @@ int32_t UdpServer::readSock_ass ( UdpSlot **slotPtr , int64_t now ) {
|
||||
// we we could not allocate a read buffer to hold the request/reply
|
||||
// just send a cancel ack so the send will call its callback with
|
||||
// g_errno set
|
||||
// MDW: it won't make it into the m_head3 callback linked list with
|
||||
// this logic.... maybe it just times out or resends later...
|
||||
if ( ! status && g_errno == ENOMEM ) goto cancelTrans;
|
||||
|
||||
// if it is now a complete REPLY, callback will need to be called
|
||||
// so insert into the callback linked list, m_head3.
|
||||
// we have to put slots with NULL callbacks in here since they
|
||||
// are incoming requests to handle.
|
||||
if ( //slot->m_callback &&
|
||||
// if we got an error reading the reply (or sending req?) then
|
||||
// consider it completed too?
|
||||
// ( slot->isTransactionComplete() || slot->m_errno ) &&
|
||||
( slot->isDoneReading() || slot->m_errno ) ) {
|
||||
// prepare to call the callback by adding it to this
|
||||
// special linked list
|
||||
addToCallbackLinkedList ( slot );
|
||||
}
|
||||
|
||||
|
||||
// if(g_conf.m_sequentialProfiling) {
|
||||
// if(slot->isDoneReading())
|
||||
// log(LOG_TIMING, "admin: read last dgram: "
|
||||
@ -1705,6 +1751,7 @@ int32_t UdpServer::readSock_ass ( UdpSlot **slotPtr , int64_t now ) {
|
||||
// discard if we should
|
||||
if ( discard ) {
|
||||
readSize=recvfrom(m_sock,tmpbuf,DGRAM_SIZE_CEILING,0,NULL,NULL);
|
||||
//log("udp: recvfrom3 = %i",(int)readSize);
|
||||
}
|
||||
// . update stats, just put them all in g_udpServer
|
||||
// . do not count acks
|
||||
@ -1886,13 +1933,18 @@ void UdpServer::resume ( ) {
|
||||
// . the problem is when we call this with niceness 1 and we convert
|
||||
// a niceness 1 callback to 0...
|
||||
bool UdpServer::makeCallbacks_ass ( int32_t niceness ) {
|
||||
if ( g_conf.m_logDebugUdp )
|
||||
|
||||
// if nothing to call, forget it
|
||||
if ( ! m_head3 ) return false;
|
||||
|
||||
//if ( g_conf.m_logDebugUdp )
|
||||
log(LOG_DEBUG,"udp: makeCallbacks_ass: start. nice=%"INT32" "
|
||||
"inquickpoll=%"INT32"",
|
||||
niceness,(int32_t)g_loop.m_inQuickPoll);
|
||||
// bail if suspended
|
||||
if ( m_isSuspended ) return false;
|
||||
|
||||
|
||||
// . if there are active high priority threads, do not
|
||||
// call low priority callbacks. in that case
|
||||
// . This seems to block things up to much?
|
||||
@ -1938,9 +1990,13 @@ bool UdpServer::makeCallbacks_ass ( int32_t niceness ) {
|
||||
|
||||
nextPass:
|
||||
|
||||
UdpSlot *nextSlot = NULL;
|
||||
|
||||
// only scan those slots that are ready
|
||||
//for ( UdpSlot *slot = m_head3 ; slot ; slot = slot->m_next3 )
|
||||
for ( UdpSlot *slot = m_head2 ; slot ; slot = slot->m_next2 ) {
|
||||
//for ( UdpSlot *slot = m_head2 ; slot ; slot = slot->m_next2 ) {
|
||||
for ( UdpSlot *slot = m_head3 ; slot ; slot = nextSlot ) {
|
||||
// because makeCallback_ass() can delete the slot, use this
|
||||
nextSlot = slot->m_next3;
|
||||
// call quick handlers in pass 0, they do not take any time
|
||||
// and if they do not get called right away they can cause this host
|
||||
// to bottleneck many hosts
|
||||
@ -2097,12 +2153,15 @@ bool UdpServer::makeCallbacks_ass ( int32_t niceness ) {
|
||||
//UdpSlot *next3 = slot->m_next2;
|
||||
|
||||
// . crap, this can alter the linked list we are scanning
|
||||
// if it deletes the slot!
|
||||
// if it deletes the slot! yes, but now we use "nextSlot"
|
||||
// . return false on error and sets g_errno, true otherwise
|
||||
// . return true if we called one
|
||||
// . skip to next slot if did not call callback/handler
|
||||
if ( ! makeCallback_ass ( slot ) ) continue;
|
||||
|
||||
// remove it from the callback list to avoid re-call
|
||||
removeFromCallbackLinkedList ( slot );
|
||||
|
||||
int64_t took = 0;
|
||||
if ( logIt )
|
||||
took = gettimeofdayInMillisecondsLocal()-start2;
|
||||
@ -2245,9 +2304,18 @@ bool UdpServer::makeCallback_ass ( UdpSlot *slot ) {
|
||||
start = gettimeofdayInMillisecondsLocal();
|
||||
// callback is non-NULL if we initiated the transaction
|
||||
if ( slot->m_callback ) {
|
||||
|
||||
// assume the slot's error when making callback
|
||||
// like EUDPTIMEDOUT
|
||||
if ( ! g_errno ) g_errno = slot->m_errno;
|
||||
|
||||
// . if transaction has not fully completed, bail
|
||||
// . unless there was an error
|
||||
if ( ! g_errno && ! slot->isTransactionComplete())return false;
|
||||
// . g_errno could be ECANCELLED
|
||||
if ( ! g_errno && ! slot->isTransactionComplete()) {
|
||||
log("udp: why calling callback when not ready???");
|
||||
return false;
|
||||
}
|
||||
/*
|
||||
#ifdef _UDPDEBUG_
|
||||
// if we had the token, give it up so others can send with it
|
||||
@ -2276,7 +2344,8 @@ bool UdpServer::makeCallback_ass ( UdpSlot *slot ) {
|
||||
"niceness=%"INT32" "
|
||||
"callback=%08"PTRFMT" "
|
||||
"took %"INT64" ms (%"INT32" Mbps).",
|
||||
slot->m_transId,msgType,mstrerror(g_errno),
|
||||
slot->m_transId,msgType,
|
||||
mstrerror(g_errno),
|
||||
slot->m_niceness,
|
||||
(PTRTYPE)slot->m_callback ,
|
||||
took , Mbps );
|
||||
@ -2389,7 +2458,14 @@ bool UdpServer::makeCallback_ass ( UdpSlot *slot ) {
|
||||
if ( slot->m_calledHandler ) {
|
||||
// . if transaction has not fully completed, keep sending
|
||||
// . unless there was an error
|
||||
if ( ! g_errno && ! slot->isTransactionComplete())return false;
|
||||
if ( ! g_errno &&
|
||||
! slot->isTransactionComplete() &&
|
||||
! slot->m_errno ) {
|
||||
if ( g_conf.m_logDebugUdp )
|
||||
log("udp: why calling handler "
|
||||
"when not ready?");
|
||||
return false;
|
||||
}
|
||||
// we should not destroy the slot here on ENOMEM error,
|
||||
// because handler might be referencing the slot's read buffer
|
||||
// still. that is what Msg20 does... the first dgram was
|
||||
@ -2468,6 +2544,7 @@ bool UdpServer::makeCallback_ass ( UdpSlot *slot ) {
|
||||
if ( g_inSigHandler ) goto queueSig;
|
||||
// nuke the slot, we gave them a reply...
|
||||
destroySlot ( slot );
|
||||
//log("udp: why double calling handler?");
|
||||
// this kind of callback doesn't count
|
||||
return false;
|
||||
}
|
||||
@ -2882,6 +2959,9 @@ bool UdpServer::readTimeoutPoll ( int64_t now ) {
|
||||
// . set slot's m_errno field
|
||||
// . makeCallbacks_ass() should call its callback
|
||||
slot->m_errno = EUDPTIMEDOUT;
|
||||
// prepare to call the callback by adding it to this
|
||||
// special linked list
|
||||
addToCallbackLinkedList ( slot );
|
||||
// let caller know we did something
|
||||
something = true;
|
||||
// keep going
|
||||
@ -2987,6 +3067,9 @@ bool UdpServer::readTimeoutPoll ( int64_t now ) {
|
||||
slot->m_callback ) {
|
||||
// should this be ENOACK or something?
|
||||
slot->m_errno = EUDPTIMEDOUT;
|
||||
// prepare to call the callback by adding it to this
|
||||
// special linked list
|
||||
addToCallbackLinkedList ( slot );
|
||||
// let caller know we did something
|
||||
something = true;
|
||||
// note it
|
||||
@ -3126,7 +3209,7 @@ bool UdpServer::shutdown ( bool urgent ) {
|
||||
time_t now = getTime();
|
||||
int32_t count = 0;
|
||||
if(!urgent) {
|
||||
//if ( m_head && m_head2->m_next2 ) return false;
|
||||
//if ( m_head && m_head2->m_next2 ) return false;
|
||||
for ( UdpSlot *slot = m_head2 ; slot ; slot = slot->m_next2 ) {
|
||||
// if we initiated, then don't count it
|
||||
if ( slot->m_callback ) continue;
|
||||
@ -3206,7 +3289,7 @@ bool UdpServer::timeoutDeadHosts ( Host *h ) {
|
||||
}
|
||||
|
||||
// verified that this is not interruptible
|
||||
UdpSlot *UdpServer::getEmptyUdpSlot_ass ( key_t k ) {
|
||||
UdpSlot *UdpServer::getEmptyUdpSlot_ass ( key_t k , bool incoming ) {
|
||||
// turn em off
|
||||
bool flipped = interruptsOff();
|
||||
// tmp debug
|
||||
@ -3244,14 +3327,19 @@ UdpSlot *UdpServer::getEmptyUdpSlot_ass ( key_t k ) {
|
||||
m_tail2 = slot;
|
||||
}
|
||||
// also to callback candidates if we should
|
||||
//if ( hasCallback ) {
|
||||
// slot->m_next3 = m_head3;
|
||||
// slot->m_prev3 = NULL;
|
||||
// if ( m_head3 ) m_head3->m_prev3 = slot;
|
||||
// m_head3 = slot;
|
||||
//}
|
||||
// if ( hasCallback ) {
|
||||
// slot->m_next3 = m_head3;
|
||||
// slot->m_prev3 = NULL;
|
||||
// if ( m_head3 ) m_head3->m_prev3 = slot;
|
||||
// m_head3 = slot;
|
||||
// }
|
||||
// count it
|
||||
m_numUsedSlots++;
|
||||
|
||||
if ( incoming ) m_numUsedSlotsIncoming++;
|
||||
|
||||
slot->m_incoming = incoming;
|
||||
|
||||
// now store ptr in hash table
|
||||
slot->m_key = k;
|
||||
addKey ( k , slot );
|
||||
@ -3281,6 +3369,71 @@ UdpSlot *UdpServer::getUdpSlot ( key_t k ) {
|
||||
return m_ptrs[i];
|
||||
}
|
||||
|
||||
void UdpServer::addToCallbackLinkedList ( UdpSlot *slot ) {
|
||||
// debug log
|
||||
if ( g_conf.m_logDebugUdp && slot->m_errno )
|
||||
log("udp: adding slot with err = %s to callback list"
|
||||
, mstrerror(slot->m_errno) );
|
||||
if ( g_conf.m_logDebugUdp )
|
||||
log("udp: adding slot=%"PTRFMT" to callback list"
|
||||
,(PTRTYPE)slot);
|
||||
// must not be in there already, lest we double add it
|
||||
if ( isInCallbackLinkedList ( slot ) ) {
|
||||
if ( g_conf.m_logDebugUdp )
|
||||
log("udp: avoided double add slot=%"PTRFMT
|
||||
,(PTRTYPE)slot);
|
||||
return;
|
||||
}
|
||||
slot->m_next3 = NULL;
|
||||
slot->m_prev3 = NULL;
|
||||
if ( ! m_tail3 ) {
|
||||
m_head3 = slot;
|
||||
m_tail3 = slot;
|
||||
}
|
||||
else {
|
||||
// insert at end of linked list otherwise
|
||||
m_tail3->m_next3 = slot;
|
||||
slot->m_prev3 = m_tail3;
|
||||
m_tail3 = slot;
|
||||
}
|
||||
}
|
||||
|
||||
bool UdpServer::isInCallbackLinkedList ( UdpSlot *slot ) {
|
||||
// return if not in the linked list
|
||||
if ( slot->m_prev3 ) return true;
|
||||
if ( slot->m_next3 ) return true;
|
||||
if ( m_head3 == slot ) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
void UdpServer::removeFromCallbackLinkedList ( UdpSlot *slot ) {
|
||||
|
||||
if ( g_conf.m_logDebugUdp )
|
||||
log("udp: removing slot=%"PTRFMT" from callback list"
|
||||
,(PTRTYPE)slot);
|
||||
|
||||
// return if not in the linked list
|
||||
if ( slot->m_prev3 == NULL &&
|
||||
slot->m_next3 == NULL &&
|
||||
m_head3 != slot )
|
||||
return;
|
||||
|
||||
// excise from linked list otherwise
|
||||
if ( m_head3 == slot )
|
||||
m_head3 = slot->m_next3;
|
||||
if ( m_tail3 == slot )
|
||||
m_tail3 = slot->m_prev3;
|
||||
|
||||
if ( slot->m_prev3 )
|
||||
slot->m_prev3->m_next3 = slot->m_next3;
|
||||
if ( slot->m_next3 )
|
||||
slot->m_next3->m_prev3 = slot->m_prev3;
|
||||
|
||||
// and so we do not try to re-excise it
|
||||
slot->m_prev3 = NULL;
|
||||
slot->m_next3 = NULL;
|
||||
}
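The three functions above implement a small intrusive doubly-linked list threaded through UdpSlot::m_next3/m_prev3, with m_head3/m_tail3 as the list anchors. As a reference, here is a self-contained toy version of the same pattern, including the save-the-next-pointer walk that makeCallbacks_ass() uses; the struct names and the main() driver are illustrative only, not from the codebase.

// Toy version of the intrusive m_head3/m_tail3 callback list pattern.
#include <cstdio>

struct Slot {
        Slot *m_next3;
        Slot *m_prev3;
        int   m_id;
};

struct Server {
        Slot *m_head3;
        Slot *m_tail3;

        void add ( Slot *s ) {
                // refuse a double add, like isInCallbackLinkedList()
                if ( s->m_prev3 || s->m_next3 || m_head3 == s ) return;
                s->m_next3 = NULL;
                s->m_prev3 = m_tail3;
                if ( m_tail3 ) m_tail3->m_next3 = s;
                else           m_head3          = s;
                m_tail3 = s;
        }

        void remove ( Slot *s ) {
                // not in the list? nothing to excise
                if ( ! s->m_prev3 && ! s->m_next3 && m_head3 != s ) return;
                if ( m_head3 == s ) m_head3 = s->m_next3;
                if ( m_tail3 == s ) m_tail3 = s->m_prev3;
                if ( s->m_prev3 ) s->m_prev3->m_next3 = s->m_next3;
                if ( s->m_next3 ) s->m_next3->m_prev3 = s->m_prev3;
                // so we do not try to re-excise it
                s->m_prev3 = NULL;
                s->m_next3 = NULL;
        }
};

int main ( ) {
        Server srv = { NULL , NULL };
        Slot a = { NULL , NULL , 1 };
        Slot b = { NULL , NULL , 2 };
        srv.add ( &a );
        srv.add ( &b );
        // walk like makeCallbacks_ass(): save m_next3 first because the
        // "callback" may remove (or destroy) the current slot
        Slot *next = NULL;
        for ( Slot *s = srv.m_head3 ; s ; s = next ) {
                next = s->m_next3;
                printf ( "calling back slot %d\n" , s->m_id );
                srv.remove ( s );
        }
        return 0;
}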
|
||||
|
||||
// verified that this is not interruptible
|
||||
void UdpServer::freeUdpSlot_ass ( UdpSlot *slot ) {
|
||||
bool flipped = interruptsOff();
|
||||
@ -3291,13 +3444,12 @@ void UdpServer::freeUdpSlot_ass ( UdpSlot *slot ) {
|
||||
if ( slot->m_prev2 ) slot->m_prev2->m_next2 = slot->m_next2;
|
||||
if ( slot->m_next2 ) slot->m_next2->m_prev2 = slot->m_prev2;
|
||||
// also from callback candidates if we should
|
||||
//if ( slot->m_callback ) {
|
||||
// if ( slot->m_prev3 ) slot->m_prev3->m_next3 = slot->m_next3;
|
||||
// else m_head3 = slot->m_next3;
|
||||
// if ( slot->m_next3 ) slot->m_next3->m_prev3 = slot->m_prev3;
|
||||
//}
|
||||
removeFromCallbackLinkedList ( slot );
|
||||
// discount it
|
||||
m_numUsedSlots--;
|
||||
|
||||
if ( slot->m_incoming ) m_numUsedSlotsIncoming--;
|
||||
|
||||
// add to linked list of available slots
|
||||
slot->m_next = m_head;
|
||||
m_head = slot;
|
||||
|
13
UdpServer.h
@ -170,6 +170,8 @@ class UdpServer {
|
||||
// an estimation as well
|
||||
//int32_t getNumUsedSlots () { return m_topUsedSlot + 1; };
|
||||
int32_t getNumUsedSlots () { return m_numUsedSlots; };
|
||||
|
||||
int32_t getNumUsedSlotsIncoming () { return m_numUsedSlotsIncoming; };
|
||||
|
||||
|
||||
// . when a request/msg of type "msgType" is received we call the
|
||||
@ -282,6 +284,11 @@ class UdpServer {
|
||||
|
||||
UdpSlot *getActiveHead ( ) { return m_head2; };
|
||||
|
||||
// callback linked list functions (m_head3)
|
||||
void addToCallbackLinkedList ( UdpSlot *slot ) ;
|
||||
bool isInCallbackLinkedList ( UdpSlot *slot );
|
||||
void removeFromCallbackLinkedList ( UdpSlot *slot ) ;
|
||||
|
||||
// cancel a transaction
|
||||
void cancel ( void *state , unsigned char msgType ) ;
|
||||
|
||||
@ -409,7 +416,7 @@ class UdpServer {
|
||||
int32_t m_maxSlots;
|
||||
|
||||
// routines
|
||||
UdpSlot *getEmptyUdpSlot_ass ( key_t k );
|
||||
UdpSlot *getEmptyUdpSlot_ass ( key_t k , bool incoming );
|
||||
void freeUdpSlot_ass ( UdpSlot *slot );
|
||||
|
||||
void addKey ( key_t key , UdpSlot *ptr ) ;
|
||||
@ -434,9 +441,11 @@ class UdpServer {
|
||||
UdpSlot *m_head2;
|
||||
UdpSlot *m_tail2;
|
||||
// linked list of callback candidates
|
||||
//UdpSlot *m_head3;
|
||||
UdpSlot *m_head3;
|
||||
UdpSlot *m_tail3;
|
||||
|
||||
int32_t m_numUsedSlots;
|
||||
int32_t m_numUsedSlotsIncoming;
|
||||
|
||||
// stats
|
||||
public:
|
||||
|
@ -1502,6 +1502,9 @@ bool UdpSlot::readDatagramOrAck ( int sock ,
|
||||
// if it's a msg 0x0c reply from a proxy over roadrunner wireless
|
||||
// they tend to damage our packets for some reason so i repeat
|
||||
// the ip for a total of an 8 byte reply
|
||||
/*
|
||||
MDW: this seems to be causing problems on local networks
|
||||
so taking it out. 4/7/2015.
|
||||
if ( m_msgType == 0x0c && msgSize == 12 && peekSize == 24 &&
|
||||
// must be reply! not request.
|
||||
m_callback ) {
|
||||
@ -1528,6 +1531,7 @@ bool UdpSlot::readDatagramOrAck ( int sock ,
|
||||
return true;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
// we're doing the call to recvfrom() for sure now
|
||||
*discard = false;
|
||||
@ -1572,6 +1576,7 @@ bool UdpSlot::readDatagramOrAck ( int sock ,
|
||||
0 ,
|
||||
NULL ,
|
||||
NULL );
|
||||
//log("udp: recvfrom1 = %i",(int)numRead);
|
||||
// let caller know how much we read for stats purposes
|
||||
*readSize = numRead;
|
||||
// restore what was at the header before we stored it there
|
||||
@ -1614,6 +1619,7 @@ bool UdpSlot::readDatagramOrAck ( int sock ,
|
||||
0 ,
|
||||
NULL ,
|
||||
NULL );
|
||||
//log("udp: recvfrom2 = %i",(int)dgramSize);
|
||||
// bail on error, how could this happen?
|
||||
if ( dgramSize < 0 ) {
|
||||
// valgrind
|
||||
|
13
UdpSlot.h
@ -412,6 +412,13 @@ class UdpSlot {
|
||||
// save cpu by not having to call memset() on m_sentBits et al
|
||||
int32_t m_numBitsInitialized;
|
||||
|
||||
// and for doubly linked list of callback candidates
|
||||
class UdpSlot *m_next3;
|
||||
class UdpSlot *m_prev3;
|
||||
|
||||
// memset clears from here and above. so put anything that needs to
|
||||
// be set to zero above this line.
|
||||
|
||||
// . i've discarded the window since msg size is limited
|
||||
// . this way is faster
|
||||
// . these bits determine what dgrams we've sent/read/sentAck/readAck
|
||||
@ -425,9 +432,7 @@ class UdpSlot {
|
||||
// and for doubly linked list of used slots
|
||||
class UdpSlot *m_next2;
|
||||
class UdpSlot *m_prev2;
|
||||
// and for doubly linked list of callback candidates
|
||||
//class UdpSlot *m_next3;
|
||||
//class UdpSlot *m_prev3;
|
||||
|
||||
// store the key so when returning slot we can remove from hash table
|
||||
key_t m_key;
|
||||
|
||||
@ -435,6 +440,8 @@ class UdpSlot {
|
||||
|
||||
char m_maxResends;
|
||||
|
||||
char m_incoming;
|
||||
|
||||
// . for the hot udp server, we cannot call malloc in the sig handler
|
||||
// so we set m_readBuf to this to read in int16_t requests
|
||||
// . caller should pre-allocated m_readBuf when calling sendRequest()
|
||||
|
2
Url.h
2
Url.h
@ -202,7 +202,7 @@ public:
|
||||
char *getShorthandUrl ( bool rmWWW , int32_t *len );
|
||||
|
||||
// count the path components (root url as 0 path components)
|
||||
int32_t getPathDepth ( bool countFilename = false );
|
||||
int32_t getPathDepth ( bool countFilename ); // = false );
|
||||
|
||||
// get path component #num. starts at 0.
|
||||
char *getPathComponent ( int32_t num , int32_t *clen );
|
||||
|
17
Xml.cpp
@ -287,10 +287,19 @@ bool Xml::set ( char *s ,
|
||||
return true;
|
||||
}
|
||||
|
||||
// override
|
||||
// override. no don't it hurts when parsing CT_XML docs!!
|
||||
// we need XmlNode.cpp's setNodeInfo() to identify xml tags in
|
||||
// an rss feed. No, this was here for XmlDoc::hashXml() i think
|
||||
// so let's just fix Links.cpp to get links from pure xml.
|
||||
// we can't do this any more. it's easier to fix xmldoc::hashxml()
|
||||
// some other way... because Links.cpp and Xml::isRSSFeed()
|
||||
// depend on having regular tagids. but without this here
|
||||
// then XmlDoc::hashXml() breaks.
|
||||
if ( contentType == CT_XML )
|
||||
pureXml = true;
|
||||
pureXml = true;
|
||||
|
||||
// is it an xml conf file?
|
||||
m_pureXml = pureXml;
|
||||
|
||||
QUICKPOLL((niceness));
|
||||
int32_t i;
|
||||
@ -372,8 +381,12 @@ bool Xml::set ( char *s ,
|
||||
bool endsInSlash = false;
|
||||
if ( xi->m_node[xi->m_nodeLen-2] == '/' ) endsInSlash = true;
|
||||
if ( xi->m_node[xi->m_nodeLen-2] == '?' ) endsInSlash = true;
|
||||
// disregard </> in the conf files
|
||||
if ( xi->m_nodeLen==3 && endsInSlash ) endsInSlash = false;
|
||||
|
||||
// if not text node then he's the new parent
|
||||
// if we don't do this for xhtml then we don't pop the parent
|
||||
// and run out of parent stack space very quickly.
|
||||
if ( pureXml &&
|
||||
xi->m_nodeId &&
|
||||
xi->m_nodeId != TAG_COMMENT &&
|
||||
|
2
Xml.h
@ -230,6 +230,8 @@ class Xml {
|
||||
int32_t m_numNodes;
|
||||
int32_t m_maxNumNodes;
|
||||
|
||||
bool m_pureXml;
|
||||
|
||||
char *m_xml;
|
||||
int32_t m_xmlLen;
|
||||
|
||||
|
823
XmlDoc.cpp
File diff suppressed because it is too large
29
XmlDoc.h
@ -506,7 +506,8 @@ class XmlDoc {
|
||||
bool setTitleRecBuf ( SafeBuf *buf , int64_t docId, int64_t uh48 );
|
||||
// sets m_titleRecBuf/m_titleRecBufValid/m_titleRecKey[Valid]
|
||||
SafeBuf *getTitleRecBuf ( );
|
||||
SafeBuf *getSpiderStatusDocMetaList ( class SpiderReply *reply ) ;
|
||||
SafeBuf *getSpiderStatusDocMetaList ( class SpiderReply *reply ,
|
||||
bool forDelete ) ;
|
||||
SafeBuf *getSpiderStatusDocMetaList2 ( class SpiderReply *reply ) ;
|
||||
SafeBuf m_spiderStatusDocMetaList;
|
||||
char *getIsAdult ( ) ;
|
||||
@ -532,6 +533,7 @@ class XmlDoc {
|
||||
char *getIsPermalink ( ) ;
|
||||
char *getIsUrlPermalinkFormat ( ) ;
|
||||
char *getIsRSS ( ) ;
|
||||
char *getIsSiteMap ( ) ;
|
||||
class Xml *getXml ( ) ;
|
||||
uint8_t *getLangVector ( ) ;
|
||||
uint8_t *getLangId ( ) ;
|
||||
@ -734,6 +736,18 @@ class XmlDoc {
|
||||
|
||||
char *getDiffbotParentUrl( char *myUrl );
|
||||
|
||||
int64_t m_diffbotReplyEndTime;
|
||||
int64_t m_diffbotReplyStartTime;
|
||||
int32_t m_diffbotReplyRetries;
|
||||
|
||||
bool m_sentToDiffbotThisTime;
|
||||
|
||||
uint64_t m_downloadStartTime;
|
||||
//uint64_t m_downloadEndTime;
|
||||
|
||||
uint64_t m_ipStartTime;
|
||||
uint64_t m_ipEndTime;
|
||||
|
||||
void copyFromOldDoc ( class XmlDoc *od ) ;
|
||||
|
||||
class SpiderReply *getFakeSpiderReply ( );
|
||||
@ -785,8 +799,8 @@ class XmlDoc {
|
||||
bool hashContentType ( class HashTableX *table ) ;
|
||||
bool hashDMOZCategories ( class HashTableX *table ) ;
|
||||
bool hashLinks ( class HashTableX *table ) ;
|
||||
bool hashUrl ( class HashTableX *table , bool isStatusDoc = false ) ;
|
||||
bool hashDateNumbers ( class HashTableX *tt , bool isStatusDoc=false) ;
|
||||
bool hashUrl ( class HashTableX *table );
|
||||
bool hashDateNumbers ( class HashTableX *tt );
|
||||
bool hashSections ( class HashTableX *table ) ;
|
||||
bool hashIncomingLinkText ( class HashTableX *table ,
|
||||
bool hashAnomalies ,
|
||||
@ -1148,6 +1162,7 @@ class XmlDoc {
|
||||
char m_addedSpiderRequestSizeValid;
|
||||
char m_addedSpiderReplySizeValid;
|
||||
char m_addedStatusDocSizeValid;
|
||||
char m_downloadStartTimeValid;
|
||||
//char m_docQualityValid;
|
||||
char m_siteValid;
|
||||
char m_startTimeValid;
|
||||
@ -1215,6 +1230,7 @@ class XmlDoc {
|
||||
char m_rootLangIdValid;
|
||||
char m_datedbDateValid;
|
||||
char m_isRSSValid;
|
||||
char m_isSiteMapValid;
|
||||
char m_spiderLinksArgValid;
|
||||
char m_isContentTruncatedValid;
|
||||
char m_xmlValid;
|
||||
@ -1436,6 +1452,8 @@ class XmlDoc {
|
||||
bool m_looseContentHash64Valid;
|
||||
bool m_jpValid;
|
||||
|
||||
char m_isSiteMap;
|
||||
|
||||
// shadows
|
||||
char m_isRSS2;
|
||||
char m_isPermalink2;
|
||||
@ -1634,7 +1652,7 @@ class XmlDoc {
|
||||
//class LinkInfo *m_linkInfo1Ptr;
|
||||
char *m_linkInfoColl;
|
||||
//char m_injectedReply;
|
||||
int32_t m_minInlinkerHopCount;
|
||||
//int32_t m_minInlinkerHopCount;
|
||||
//class LinkInfo *m_linkInfo2Ptr;
|
||||
SiteGetter m_siteGetter;
|
||||
int64_t m_siteHash64;
|
||||
@ -1712,6 +1730,9 @@ class XmlDoc {
|
||||
bool doesPageContentMatchDiffbotProcessPattern() ;
|
||||
int32_t *getDiffbotTitleHashes ( int32_t *numHashes ) ;
|
||||
char *hashJSONFields ( HashTableX *table );
|
||||
char *hashJSONFields2 ( HashTableX *table , HashInfo *hi , Json *jp ,
|
||||
bool hashWithoutFieldNames ) ;
|
||||
|
||||
char *hashXMLFields ( HashTableX *table );
|
||||
int32_t *reindexJSONObjects ( int32_t *newTitleHashes ,
|
||||
int32_t numNewHashes ) ;
|
||||
|
@ -194,8 +194,10 @@ NodeType g_nodes[] = {
|
||||
|
||||
{"scriptText",0, 1, 0, 0,0, TAG_SCRIPTTEXT,0 },
|
||||
{"BUTTON" , 1, 1, 1, 0,0, TAG_BUTTON ,0},
|
||||
{"UrlFrom", 0, 1, 1, 0,0, TAG_URLFROM ,1}
|
||||
{"UrlFrom", 0, 1, 1, 0,0, TAG_URLFROM ,1},
|
||||
|
||||
// for sitemap.xml
|
||||
{"LOC" , 0, 1, 1, 0,0, TAG_LOC,0}
|
||||
//{"BUTTON" , 1, 1, 1, 2, 122,0},
|
||||
//{"BDO" , 1, 1, 1, 2, 123,0},
|
||||
//{"LABEL" , 1, 1, 1, 2, 124,0},
|
||||
@ -312,7 +314,9 @@ int32_t XmlNode::set ( char *node , bool pureXml , int32_t version ) {
|
||||
m_hasBackTag = true;
|
||||
m_isBreaking = true;
|
||||
m_isVisible = true;
|
||||
m_nodeId = TAG_XMLTAG;//1;
|
||||
//m_nodeId = TAG_XMLTAG;//1;
|
||||
// this returns 1 if tag is not in the list
|
||||
m_nodeId = setNodeInfo ( m_hash );//&m_hasBackTag ,
|
||||
}
|
||||
// . determine if the nodeId for this node
|
||||
// . determine if it breaks lines (for phrasing purposes)
|
||||
|
@ -322,13 +322,15 @@ enum {
|
||||
TAG_BUTTON,
|
||||
TAG_URLFROM, // for ahrefs.com
|
||||
|
||||
// support sitemap.xml
|
||||
TAG_LOC,
|
||||
|
||||
//
|
||||
// fake tags below here
|
||||
//
|
||||
// a fake tag used by Sections.cpp
|
||||
TAG_SENTENCE,
|
||||
|
||||
|
||||
LAST_TAG
|
||||
};
|
||||
#endif
|
||||
|
406
html/blog.html
@ -1,12 +1,406 @@
|
||||
<html>
|
||||
<title>Gigablast - Blog</title>
|
||||
|
||||
<div style=max-width:700px;>
|
||||
|
||||
<br>
|
||||
<br><br>
|
||||
|
||||
<a name=comparetool></a>
|
||||
<font size=+1><b>Compare Tool</b></font><br>
|
||||
<i>Aug 17, 2013</i><br><br>
|
||||
This week I have begun constructing a tool that allows you to compare Gigablast with Solr and Elasticsearch, which are two of the more popular open-source search engines on the market today. Both of those are based on Lucene. So essentially I will be comparing everything I think is noteworthy; if I leave something out, drop me an email.
|
||||
<a name=revival></a>
|
||||
<font size=+1><b>15 Year Anniversary</b></font><br>
|
||||
<i>September 1, 2014</i><br><br>
|
||||
It's been 15 years since I first started Gigablast. It's taken some interesting directions as of late. Most notably being open source. I've decided to revive the old blog entries that you can find below and continue working on top of those.
|
||||
|
||||
|
||||
|
||||
|
||||
<br><br><br><br>
|
||||
|
||||
|
||||
|
||||
|
||||
<a name=gigabits></a>
|
||||
<font size=+1><b>Giga Bits Introduced</b></font><br>
|
||||
<i>Jan 31, 2004</i><br><br>
|
||||
Gigablast now generates related concepts for your query. I call them Giga Bits. I believe it is the best concept generator in the industry, but if you don't think so please <a href="/contact.html">drop me a note</a> explaining why not, so I can improve it.
|
||||
<br><br>
|
||||
You can also ask Gigablast a simple question like <a href="/search?q=Who+is+President+of+Russia%3F">"Who is President of Russia?"</a> and it often comes up with the correct answer in the Giga Bits section. How do you think it does that?
|
||||
<br><br>
|
||||
In other news, the spider speed ups I rolled a few weeks ago are tremendously successful. I can easily burn all my bandwidth quota with insignificant load on my servers. I could not be happier with this.
|
||||
<br><br>
|
||||
Now I'm planning on turning Gigablast into a default AND engine. Why? Because it will decrease query latency by several times, believe it or not. That should put Gigablast on par with the fastest engines in the world, even though it only runs on 8 desktop machines. But don't worry, I will still leave the default OR functionality intact.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=update></a>
|
||||
<font size=+1><b>January Update Rolled</b></font><br>
|
||||
<i>Jan 8, 2004</i><br><br>
|
||||
Gigablast now has a more professional, but still recognizable, logo, and a new catch phrase, "Information Acceleration". Lots of changes on the back end. You should notice significantly higher quality searches. The spider algorithm was sped up several times. Gigablast should be able to index several million documents per day, but that still remains to be tested. <knock on wood>. Site clustering was sped up. I added the ability to force all query terms to be required by using the &rat=1 cgi parm. Now Gigablast will automatically regenerate some of its databases when they are missing. And I think I wasted two weeks working like a dog on code that I'm not going to end up using! I hate when that happens...
|
||||
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=traffic></a>
|
||||
<font size=+1><b>An Easy way to Slash Motor Vehicle Emissions</b></font><br>
|
||||
<i>Dec 11, 2003</i><br><br>
|
||||
Blanket the whole city with wi-fi access. (like <a href="/?redir=http://story.news.yahoo.com/news?tmpl=story&ncid=1293&e=2&u=/ap/20031211/ap_on_hi_te/wi_fi_city&sid=95573418">Cerritos, California</a>) When you want to travel from point A
|
||||
to point B, tell the central traffic computer. It will then give you a time
|
||||
window in which to begin your voyage and, most importantly, it will ensure that
|
||||
as long as you stay within the window you will always hit green lights.
|
||||
<br><br>
|
||||
If you stray from your path, you'll be able to get a new window via the wi-fi network.
|
||||
If everyone's car has gps and is connected to the wi-fi network,
|
||||
the central computer will also be able to monitor the flow of traffic and
|
||||
make adjustments to your itinerary in real-time.
|
||||
Essentially, the traffic computer will be solving a large system of linear,
|
||||
and possibly non-linear, constraints in real-time. Lots of fun... and think of
|
||||
how much more efficient travel will be!! If someone wants to secure funding,
|
||||
count me in.
|
||||
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=spellchecker></a>
|
||||
<font size=+1><b>Spellchecker Finally Finished</b></font><br>
|
||||
<i>Nov 18, 2003</i><br><br>
|
||||
After a large, countable number of interruptions, I've finally completed the spellchecker. I tested the word '<b>dooty</b>' on several search engines to see how they handled that misspelling. Here's what I got:
|
||||
<br><br>
|
||||
<table>
|
||||
<tr><td><b>Source</b></td><td><b>Result</b></td></tr>
|
||||
<tr><td>Alltheweb</td><td><a href="http://www.alltheweb.com/search?query=dooty">booty</a><td></tr>
|
||||
<tr><td>Altavista</td><td><a href="http://search01.altavista.com/web/results?q=dooty">dhooti</a></td></tr>
|
||||
<tr><td>Gigablast</td><td><a href="http://www.gigablast.com/search?q=dooty">door</a></td></tr>
|
||||
<tr><td>Google</td><td><a href="http://www.google.com/search?q=dooty">doody</a></td></tr>
|
||||
<tr><td>Microsoft Word</td><td>Doty</td></tr>
|
||||
<tr><td>Teoma</td><td><a href="http://s.teoma.com/search?q=dooty">doty</a></td></tr>
|
||||
<tr><td>Wisenut</td><td>N/A (no spellchecker)</td></tr>
|
||||
</table>
|
||||
<br>
|
||||
So there is no one way to code a spellchecker. It's a guessing game. And, hey Wisenut, want to license a good spellchecker for cheap? <a href="/contact.html">Let me know</a>.
|
||||
|
||||
<br><br>
|
||||
Gigablast uses its cached web pages to generate its dictionary instead of the query logs. When a word or phrase is not found in the dictionary, Gigablast replaces it with the closest match in the dictionary. If multiple words or phrases are equally close, then Gigablast resorts to a popularity ranking.
|
||||
<br><br>
|
||||
One interesting thing I noticed is that in Google's spellchecker you must at least get the first letter of the word correct, otherwise, Google will not be able to recommend the correct spelling. I made Gigablast this way too, because it really cuts down on the number of words it has to search to come up with a recommendation. This also allows you to have an extremely large dictionary distributed amongst several machines, where each machine is responsible for a letter.
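A minimal, self-contained sketch of the lookup described above, assuming a dictionary bucketed by first letter with a popularity count per entry (illustrative only, not Gigablast's actual implementation): search just the bucket for the misspelled word's first letter, take the entry with the smallest edit distance, and break ties by popularity.

#include <algorithm>
#include <map>
#include <string>
#include <vector>

struct DictEntry { std::string word; int popularity; };

// classic Levenshtein edit distance, two-row dynamic programming
static int editDistance ( const std::string &a , const std::string &b ) {
        std::vector<int> prev ( b.size() + 1 ) , cur ( b.size() + 1 );
        for ( size_t j = 0 ; j <= b.size() ; j++ ) prev[j] = (int)j;
        for ( size_t i = 1 ; i <= a.size() ; i++ ) {
                cur[0] = (int)i;
                for ( size_t j = 1 ; j <= b.size() ; j++ ) {
                        int sub = prev[j-1] + ( a[i-1] == b[j-1] ? 0 : 1 );
                        cur[j] = std::min ( std::min ( prev[j] + 1 , cur[j-1] + 1 ) , sub );
                }
                std::swap ( prev , cur );
        }
        return prev[b.size()];
}

// only the bucket for the word's first letter is searched, so each
// bucket could live on its own machine
static std::string suggest ( const std::map< char , std::vector<DictEntry> > &dict ,
                             const std::string &misspelled ) {
        if ( misspelled.empty() ) return misspelled;
        std::map< char , std::vector<DictEntry> >::const_iterator it =
                dict.find ( misspelled[0] );
        if ( it == dict.end() ) return misspelled;
        std::string best = misspelled;
        int bestDist = 1 << 30 , bestPop = -1;
        for ( size_t i = 0 ; i < it->second.size() ; i++ ) {
                const DictEntry &e = it->second[i];
                int d = editDistance ( misspelled , e.word );
                // closest match wins; popularity breaks ties
                if ( d < bestDist || ( d == bestDist && e.popularity > bestPop ) ) {
                        bestDist = d; bestPop = e.popularity; best = e.word;
                }
        }
        return best;
}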
|
||||
<br><br>
|
||||
Also of note: I am planning on purchasing the hardware required for achieving a 5 billion document index capable of serving hundreds of queries per second within the next 12 months. Wish me luck... and thanks for using Gigablast.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=onagain></a>
|
||||
<font size=+1><b>Spiders On Again</b></font><br>
|
||||
<i>Nov 10, 2003</i><br><br>
|
||||
After updating the spider code I've reactivated the spiders. Gigablast should be able to spider at a faster rate with even less impact on query response time than before. So add your urls now while the addings good.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=speed></a>
|
||||
<font size=+1><b>Going For Speed</b></font><br>
|
||||
<i>Nov 3, 2003</i><br><br>
|
||||
I've finally got around to working on Gigablast's distributed caches. It was not doing a lot of caching before. The new cache class I rigged up has no memory fragmentation and minimal record overhead. It is vurhy nice.<br><br>
|
||||
I've stopped spidering just for a bit so I can dedicate all Gigablast's RAM to the multi-level cache system I have in place now and see how much I can reduce query latency. Disks are still my main point of contention by far so the caching helps out a lot. But I could still use more memory.<br><br>
|
||||
Take Gigablast for a <a href="/">spin</a>. See how fast it is.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=metas></a>
|
||||
<font size=+1><b>Bring Me Your Meta Tags</b></font><br>
|
||||
<i>Oct 11, 2003</i><br><br>
|
||||
As of now Gigablast supports the indexing, searching and displaying of generic meta tags. You name them I fame them. For instance, if you have a tag like <i><meta name="foo" content="bar baz"></i> in your document, then you will be able to do a search like <i><a href="/search?q=foo%3Abar&dt=foo">foo:bar</a></i> or <i><a href="/search?q=foo%3A%22bar+baz%22&dt=foo">foo:"bar baz"</a></i> and Gigablast will find your document.
|
||||
<br><br>
|
||||
You can tell Gigablast to display the contents of arbitrary meta tags in the search results, like <a href="/search?q=gigablast&s=10&dt=author+keywords%3A32">this</a>. Note that you must assign the <i>dt</i> cgi parameter to a space-separated list of the names of the meta tags you want to display. You can limit the number of returned characters of each tag to X characters by appending a <i>:X</i> to the name of the meta tag supplied to the <i>dt</i> parameter. In the link above, I limited the displayed keywords to 32 characters.
|
||||
<br><br>
|
||||
Why use generic metas? Because it is very powerful. It allows you to embed custom data in your documents, search for it and retrieve it. Originally I wanted to do something like this in XML, but now my gut instincts are that XML is not catching on because it is ugly and bloated. Meta tags are pretty and slick.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=verisignstopped></a>
|
||||
<font size=+1><b>Verisign Stops Destroying the Internet</b></font><br>
|
||||
<i>Oct 11, 2003</i><br><br>
|
||||
Ok, they actually stopped about a week ago, but I didn't get around to posting it until now. They really ought to lose their privileged position so this does not happen again. Please do not stop your boycott. They have not learned from their mistakes.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=moreverisign></a>
|
||||
<font size=+1><b>Verisign Continues to Damage Gigablast's Index</b></font><br>
|
||||
<i>September 30, 2003</i><br><br>
|
||||
When the Gigablast spider tries to download a page from a domain it first gets the associated robots.txt file for that domain. When the domain does not exist it ends up downloading a robots.txt file from verisign. There are two major problems with this. The first is that verisign's servers may be slow which will slow down Gigablast's indexing. Secondly, and this has been happening for a while now, Gigablast will still index any incoming link text for that domain, thinking that the domain still exists, but just that spider permission was denied by the robots.txt file.
|
||||
<br>
|
||||
<br>
|
||||
So, hats off to you verisign, thanks for enhancing my index with your fantastic "service". I hope your company is around for many years so you can continue providing me with your great "services".
|
||||
<br>
|
||||
<br>
|
||||
If you have been hurt because of verisign's greed you might want to consider joining the <a href="/?redir=http://www.geek.com/news/geeknews/2003Sep/gee20030929021965.htm">class-action lawsuit</a> announced Friday, September 26th, by the <a href="/?redir=http://www.techfirm.com/">Ira Rothken law firm</a>.
|
||||
<br>
|
||||
<br>
|
||||
Want to learn more about how the internet is run? Check out <a href="/?redir=http://www.paradigm.nu/icann/">the ICANN movie page</a>. Movie #1 portrays verisign's CEO, Stratton Sclavos, quite well in my opinion.
|
||||
<br>
|
||||
<br>
|
||||
<b>(10/01/03) Update #5:</b> verisign <a href="/?redir=http://www.pcworld.com/news/article/0,aid,112712,00.asp">comes under further scrutiny</a>.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=verisign></a>
|
||||
<font size=+1><b>Verisign Redesigns the Internet for their Own Profit</b></font><br>
|
||||
<i>September 24, 2003</i><br><br>
|
||||
My spiders expect to get "not found" messages when they look up a domain that does not have an IP. When verisign uses their privileged position to change the underlying fundamentals of the internet just to line their own greedy pockets it really, really perturbs me. Now, rather than get the "not found" message, my spiders get back a valid IP, the IP of verisign's commercial servers. That causes my spiders to then proceed to download the robots.txt from that domain. This can take forever if their servers are slow. What a pain. Now I have to fix my freakin' code. And that's just one of many problems this company has caused.
|
||||
<br>
|
||||
<br>
|
||||
Please join me in boycott. I'm going to discourage everyone I know from supporting this abusive, monopolistic entity.
|
||||
<br>
|
||||
<br>
|
||||
<b>(9/22/03) Update #1:</b> verisign <a href="/?redir=http://www.icann.org/correspondence/lewis-to-twomey-21sep03.htm">responded</a> to ICANN's request that they stop. <a href="/?redir=http://slashdot.org/articles/03/09/22/2255202.shtml?tid=126&tid=95&tid=99">See what the slashdot community has to say about this response.</a>
|
||||
<br>
|
||||
<br>
|
||||
<b>(9/22/03) Update #2:</b> ICANN has now posted some complaints in this <a href="/?redir=http://forum.icann.org/alac-forum/redirect/">forum</a>.
|
||||
<br>
|
||||
<br>
|
||||
<b>(9/24/03) Update #3:</b> Slashdot has more <a href="/?redir=http://yro.slashdot.org/yro/03/09/24/0134256.shtml?tid=126&tid=95&tid=98&tid=99">coverage</a>.
|
||||
<br>
|
||||
<br>
|
||||
<b>(9/24/03) Update #4:</b> Please sign the <a href="/?redir=http://www.whois.sc/verisign-dns/">petition</a> to stop verisign.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=geotags></a>
|
||||
<font size=+1><b>Geo-Sensitive Search</b></font><br>
|
||||
<i>September 18, 2003</i><br><br>
|
||||
Gigablast now supports some special new meta tags that allow for constraining a search to a particular zipcode, city, state or country. Support was also added for the standard author, language and classification meta tags. This <a href="/tagsdemo.html">page</a> explains more. These meta tags should be standard, everyone should use them (but not abuse them!) and things will be easier for everybody.
|
||||
<br><br>
|
||||
Secondly, I have declared jihad against stale indexes. I am planning a significantly faster update cycle, not to mention growing the index to about 400 million pages, all hopefully in the next few months.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=turing></a>
|
||||
<font size=+1><b>Foiling the Addurl Scripts</b></font><br>
|
||||
<i>September 6, 2003</i><br><br>
|
||||
The new pseudo-Turing test on the <a href="/addurl">addurl page</a> should prevent most automated scripts from submitting boatloads of URLs. If someone actually takes the time to code a way around it then I'll just have to take it a step further. I would rather work on other things, though, so please quit abusing my free service and discontinue your scripts. Thanks.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=boolean></a>
|
||||
<font size=+1><b>Boolean is Here</b></font><br>
|
||||
<i>September 1, 2003</i><br><br>
|
||||
I just rolled out the new boolean logic code. You should be able to do nested boolean queries using the traditional AND, OR and NOT boolean operators. See the updated <a href="/help.html#boolean">help page</a> for more detail.
|
||||
<br><br>
|
||||
I have declared jihad against swapping and am now running the 2.4.21-rc6-rmap15j Linux kernel with swap tuned to zero using the /proc/sys/vm/pagecache knobs. So far no machines have swapped, which is great, but I'm unsure of this kernel's stability.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=swap></a>
|
||||
<font size=+1><b>All Swapped Out</b></font><br>
|
||||
<i>August 29, 2003</i><br><br>
|
||||
I no longer recommend turning the swap off, at least not on linux 2.4.22. A kernel panicked on me and froze a server. Not good. If anyone has any ideas for how I can prevent my app from being swapped out, please let me know. I've tried mlockall() within my app but that makes its memory usage explode for some reason. I've also tried Rik van Riel's 2.4.21-rc6-rmap15j.txt patch on the 2.4.21 kernel, but it still does unnecessary swapping (although, strangely, only when spidering). If you know how to fix this problem, please help!!! <a href="vmstat.html">Here</a> is the output from the vmstat command on one of my production machines running 2.4.22. And <a href="vmstatrik.html">here</a> is the output from my test machine running 2.4.21-rc6-rmap15j.txt.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=kernel></a>
|
||||
<font size=+1><b>Kernel Update</b></font><br>
|
||||
<i>August 28, 2003</i><br><br>
|
||||
I updated the Linux kernel to 2.4.22, which was just released a few days ago on <a href="/?redir=http://www.kernel.org/">kernel.org</a>. Now my gigabit cards are working, yay! I finally had to turn off swap using the swapoff command. When an application runs out of memory the swapper is supposed to write infrequently used memory to disk so it can give that memory to the application that needs it. Unfortunately, the Linux virtual memory manager enjoys swapping out an application's memory for no good reason. This can often make an application disastrously slow, especially when the application ends up blocking on code that it doesn't expect to! And, furthermore, when the application uses the disk intensely it has to wait even longer for memory to get swapped back in from disk. I recommend that anyone who needs high performance turn off the swap and just make sure their program does not use more physical memory than is available.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=gang></a>
|
||||
<font size=+1><b>The Gang's All Here</b></font><br>
|
||||
<i>August 17, 2003</i><br><br>
|
||||
I decided to add PostScript (<a href="/search?q=type:ps">.ps</a>) , PowerPoint (<a href="/search?q=type:ppt">.ppt</a>), Excel SpreadSheet (<a href="/search?q=type:xls">.xls</a>) and Microsoft Word (<a href="/search?q=type:doc">.doc</a>) support in addition to the PDF support. Woo-hoo.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=pdf></a>
|
||||
<font size=+1><b>PDF Support</b></font><br>
|
||||
<i>August 14, 2003</i><br><br>
|
||||
Gigablast now indexes PDF documents. Try the search <a href="/search?q=type:pdf"><i>type:pdf</i></a> to see some PDF results. <i>type</i> is a new search field. It also support the text type, <a href="/search?q=type:text"><i>type:text</i></a>, and will support other file types in the future.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=codeupdate3></a>
|
||||
<font size=+1><b>Minor Code Updates</b></font><br>
|
||||
<i>July 17, 2003</i><br><br>
|
||||
I've cleaned up the keyword highlight routines so they don't highlight isolated stop words. Gigablast now displays a <a href="/superRecall.html">blue bar</a> above returned search results that do not have <b>all</b> of your query terms. When returning a page of search results Gigablast lets you know how long ago that page was cached by displaying a small message at the bottom of that page. NOTE: This small message is at the bottom of the page containing the search results, not at the bottom of any pages from the web page cache, that is a different cache entirely. Numerous updates to less user-visible things on the back end. Many bugs fixed, but still more to go. Thanks a bunch to Bruce Perens for writing the <a href="/?redir=http://www.perens.com/FreeSoftware/">Electric Fence</a> debug utility.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=codeupdate2></a>
|
||||
<font size=+1><b>Gigablast 2.0</b></font><br>
|
||||
<i>June 20, 2003</i><br><br>
|
||||
I've recently released Gigablast 2.0. Right now Gigablast can do about twice as many queries per second as before. When I take care of a few more things that rate should double again.
|
||||
<br><br>
|
||||
The ranking algorithm now treats phrase weights much better. If you search for something like <i><a href="/search?q=boots+in+the+uk">boots in the uk</a></i> you won't get a bunch of results that have that exact phrase in them, but rather you will get UK sites about boots (theoretically). And when you do a search like <i><a href="/search?q=all+the+king%27s+men">all the king's men</a></i> you will get results that have that exact phrase. If you find any queries for which Gigablast is especially bad, but a competing search engine is good, please <a href="/contact.html">let me know</a>; I am very interested.
|
||||
<br><br>
|
||||
2.0 also introduced a new index format. The new index is half the size of the old one. This allows my current setup to index over 400 million pages with dual redundancy. Before it was only able to index about 300 million pages. The decreased index size also speeds up the query process since only half as much data needs to be read from disk to satisfy a query.
|
||||
<br><br>
|
||||
I've also started a full index refresh, starting with top level pages that haven't been spidered in a while. This is especially nice because a lot of pages that were indexed before all my anti-spam algorithms were 100% in place are just now getting filtered appropriately. I've manually removed over 100,000 spam pages so far, too.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=grub></a>
|
||||
<font size=+1><b>My Take on Looksmart's Grub</b></font><br>
|
||||
<i>Apr 19, 2003</i><br><br>
|
||||
There's been some press about Grub, a program from Looksmart which you install on your machine to help Looksmart spider the web. Looksmart is only using Grub to save on their bandwidth. Essentially Grub just compresses web pages before sending them to Looksmart's indexer thus reducing the bandwidth they have to pay for by a factor of 5 or so. The same thing could be accomplished through a proxy which compresses web pages. Eventually, once the HTTP mime standard for requesting compressed web pages is better supported by web servers, Grub will not be necessary.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=codeupdate></a>
|
||||
<font size=+1><b>Code Update</b></font><br>
|
||||
<i>Mar 25, 2003</i><br><br>
|
||||
I just rolled some significant updates to Gigablast's back-end. Gigablast now has a uniformly-distributed, unreplicated search results cache. This means that if someone has done your search within the last several hours then you will get results back very fast. This also means that Gigablast can handle a lot more queries per second.
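A uniformly-distributed, unreplicated cache of this kind presumably boils down to hashing the normalized query to pick the single host that owns its cache entry, so each result set is stored exactly once across the cluster. A minimal sketch of that idea, with made-up names (the real code surely differs):

#include <cstdint>
#include <string>

// Toy illustration of a uniformly-distributed, unreplicated cache:
// each normalized query maps to exactly one host, so no entry is
// stored twice and cache capacity grows with the number of hosts.
static uint64_t hashQuery ( const std::string &q ) {
        uint64_t h = 1469598103934665603ULL;          // FNV-1a offset basis
        for ( size_t i = 0 ; i < q.size() ; i++ ) {
                h ^= (unsigned char)q[i];
                h *= 1099511628211ULL;                // FNV-1a prime
        }
        return h;
}

static int32_t pickCacheHost ( const std::string &q , int32_t numHosts ) {
        // the host that both stores and serves this query's cached results
        return (int32_t)( hashQuery ( q ) % (uint64_t)numHosts );
}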
|
||||
<br>
|
||||
<br>
|
||||
I also added lots of debug and timing messages that can be turned on and off via the Gigablast admin page. This allows me to quickly isolate problems and identify bottlenecks.
|
||||
<br>
|
||||
<br>
|
||||
Gigablast now synchronizes the clocks on all machines on the network so the instant add-url should be more "instant". Before I made this change, one machine would tell another to spider a new url "now", where "now" was actually a few minutes into the future on the spider machine. But since everyone's currently synchronized, this will not be a problem anymore.
|
||||
<br>
|
||||
<br>
|
||||
There were about 100 other changes and bug fixes, minor and major, that I made, too, that should result in significant performance gains. My next big set of changes should make searches at least 5 times faster, but it will probably take several months until completed. I will keep you posted.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=downtime></a>
|
||||
<font size=+1><b>Downtime</b></font><br>
|
||||
<i>Feb 20, 2003</i><br><br>
|
||||
To combat downtime I wrote a monitoring program. It will send me a text message on my cellphone if gigablast ever stops responding to queries. This should prevent extended periods of downtime by alerting me to the problem so I can promptly fix it.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=uunet></a>
|
||||
<font size=+1><b>Connectivity Problems. Bah!</b></font><br>
|
||||
<i>Feb 14, 2003</i><br><br>
|
||||
I had to turn off the main refresh spiders a few weeks ago because of internet connectivity problems. Lots of pages were inaccessible or were timing out to the point that spider performance was suffering too much.
|
||||
<br><br>
|
||||
After running tcpdump in combination with wget I noticed that the FIN packets of some web page transfers were being lost or delayed for over a minute. The TCP FIN packet is typically the last TCP packet sent to your browser when it retrieves a web page. It tells your browser to close the connection. Once it is received the little spinning logo in the upper right corner of your browser window should stop spinning.
|
||||
<br><br>
|
||||
The most significant problem was, however, that the initial incoming data packet for some URLs was being lost or excessively delayed. You can get by without receiving FIN packets but you absolutely need these TCP "P" packets. I've tested my equipment and my ISP has tested their equipment and we have both concluded that the problem is upstream. Yesterday my ISP submitted a ticket to Worldcom/UUNet. Worldcom's techs have verified the problem and thought it was... "interesting".
|
||||
<br><br>
|
||||
I personally think it is a bug in some filtering or monitoring software installed at one of Worldcom's NAPs (Network Access Points). NAPs are where the big internet providers interface with each other. The most popular NAPs are in big cities, the Tier-1 cities, as they're called. There are also companies that host NAP sites where the big carriers like Worldcom can install their equipment. The big carriers then set up Peering Agreements with each other. Peering Agreements state the conditions under which two or more carriers will exchange internet traffic.
|
||||
<br><br>
|
||||
Once you have a peering agreement in place with another carrier then you must pay them based on how much data you transfer from your network to their network across a NAP. This means that downloading a file is much cheaper than uploading a file. When you send a request to retrieve some information, that request is small compared to the amount of data it retrieves. Therefore, the carrier that hosted the server from which you got the data will end up paying more. Doh! I got off the topic. I hope they fix the problem soon!
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=ads></a>
|
||||
<font size=+1><b>Considering Advertisements</b></font><br>
|
||||
<i>Jan 10, 2003</i><br><br>
|
||||
I'm now looking into serving text advertisements on top of the search results page so I can continue to fund my information retrieval research. I am also exploring the possibility of injecting ads into some of my xml-based search feeds. If you're interested in a search feed I should be able to give you an even better deal provided you can display the ads I feed you, in addition to any other ads you might want to add. If anyone has any good advice concerning what ad company I should use, I'd love to hear it.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=codeupdate></a>
|
||||
<font size=+1><b>Code Update</b></font><br>
|
||||
<i>Dec 27, 2002</i><br><br>
|
||||
After a brief hiatus I've restarted the Gigablast spiders. The problem was they were having a negative impact on the query engine's performance, but now, all spider processing yields computer resources much better to the query traffic. The result is that the spidering process only runs in the space between queries. This actually involved a lot of work. I had to insert code to suspend spider-related, network transactions and cancel disk-read and disk-write threads.<br><br>
|
||||
I've also launched my <a href="/gigaboost.html">Gigaboost</a> campaign. This rewards pages that link to gigablast.com with a boost in the search results rankings. The boost is only utilized to resolve ties in ranking scores so it does not taint the quality of the index.<br><br>
|
||||
Gigablast.nu, in Scandinavia, now has a news index built from news sources in the Scandinavian region. It is not publicly available just yet because there are still a few details we are working out.
|
||||
I've also added better duplicate detection and removal. It won't be very noticeable until the index refresh cycle completes.
|
||||
In addition Gigablast now removes session ids from urls, but, this only applies to new links and will be back pedaled to fix urls already in the index at a later date.
|
||||
There is also a new summary generator installed. It's over ten times faster than the old one. If you notice any problems with it please contact me. As always, I appreciate any constructive input you have to give.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=corruption></a>
|
||||
<font size=+1><b>Data Corruption Mysteries</b></font><br>
|
||||
<i>Dec 20, 2002</i><br><br>
|
||||
I've been having problems with my hard drives. I have a bunch of Maxtor 160GB drives (Model # = 4G160J8) running on Linux 2.4.17 with the <a href="/ide.2.4.17.02152002.patch.bz2">48-bit LBA patch</a>. Each machine has 4 of these drives on them, 2 on each IDE slot. I've had about 160 gigabytes of data on one before so I know the patch seems to do the job. But every now and then a drive will mess up a write. I do a lot of writing and it usually takes tens of gigabytes of writing before a drive does this. It writes out about 8 bytes that don't match what should have been written. This causes index corruption and I've had to install work-arounds in my code to detect and patch it.
|
||||
<br>
|
||||
<br>
|
||||
I'm not sure if the problem is with the hard drive itself or with Linux. I've made sure that the problem wasn't in my code by doing a read after each write to verify. I thought it might be my motherboard or CPU. I use AMDs and Giga-byte motherboards. But gigablast.nu in Sweden has the same problem and it uses a Pentium 3. Furthermore, gigablast.nu uses a RAID of 160GB Maxtors, whereas gigablast.com does not. Gigablast.nu uses version 2.4.19 of Linux with the 48-bit LBA patch. So the problem seems to be with Linux, the LBA patch or the hard drive itself.
|
||||
<br>
|
||||
<br>
|
||||
On top of all this mess, about 1 Maxtor, out of the 32 I have, completely fails on me every 4 months. The drive just gives I/O errors to the kernel and brings the whole system down. Luckily, gigablast.com implements a redundant architecture so the failing server will be replaced by his backup. So far Maxtor has replaced the drives I had fail. If you give them your credit card number they'll even send the replacements out in advance. But I believe the failure problem is an indicator that the data corruption problem is hard drive related, not Linux related. If anyone has any insight into this problem please let me know, you could quite easily be my hero.
|
||||
<br>
|
||||
<br>
|
||||
If you're still reading this you're pretty hard core so <a href="/output.html">here's</a> what /var/log/messages says when the 4G160J8 completely fails.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=pvr></a>
|
||||
<font size=+1><b>Personal Video Recorders (PVRs)</b></font><br>
|
||||
<i>Dec 20, 2002</i><br><br>
|
||||
Boy, these things are great. I bought a Tivo last year for my wife and she loved it. At first, though, she wasn't that enthusiastic because she wasn't very familiar with it. But now we rarely rent any more video tapes from Blockbuster or Hollywood video because there's always something interesting to watch on the Tivo. You just let it know what shows you like and it will record them anytime they come on. We always have an overflow of Simpsons and Seinfeld episodes on there.
|
||||
<br>
|
||||
<br>
|
||||
In the future though I don't think Tivo is going to make it. The reason? Home networking. Because I'm a professional computer person, we already have a home network installed. If the TV had an ethernet jack it would be in our network. 100Mbps is fast enough to send it a high-quality video stream from the computers already on the network. I have a cable modem which, in the future, should allow the computer using it to rip signals from the cable station, as well. For now though, you could split your cable and plug the new end into a tuner card on your PC. So once someone comes out with a small device for the television that converts an ethernet-based mpeg stream to a video signal we can use our home PC to act as the TIVO. This device should be pretty cheap, I'd imagine around $30 or so. The only thing you'd need then is a way to allow the remote control to talk to your PC.
|
||||
<br>
|
||||
<br>
|
||||
Now I read about the EFF suing "Hollywood" in order to clarify consumer rights of fair use. Specifically, the EFF was said to be representing Replay TV. Hey! Isn't Replay TV owned in part by Disney (aka Hollywood)... hmmmm... Seems like Disney might have pretty good control over the outcome of this case. I think it's a conflict of interest when such an important trial, which would set a precedent for many cases to come, has the same plaintiff as defendant.
|
||||
<br>
|
||||
<br>
|
||||
This makes me wonder about when Disney's Go.com division got sued by Overture (then known as Goto.com) for logo infringement. Disney had to pay around 20 million to Overture. I wonder what kind of ties Disney had to Overture. Ok, maybe I'm being a conspiracy theorist, so I'll stop now.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=ecs></a>
|
||||
<font size=+1><b>ECS K7S5A Motherboard Mayhem</b></font><br>
|
||||
<i>Dec 20, 2002</i><br><br>
|
||||
I pinch pennies. When I bought my 8 servers I got the cheapest motherboards I could get for my AMD 1.4GHz Athlon T-Birds. At the time, in late January 2002, they turned out to be the K7S5A's. While running my search engine on them I experienced lots of segmentation faults. I spent a couple of days poring over the code wondering if I was tripping out. It wasn't until I ran memtest86 at boot time (run by lilo) that I found memory was being corrupted. I even tried new memory sticks to no avail. Fortunately I found some pages on the web that addressed the problem. It was the motherboard. It took me many hours to replace them on all 8 servers. I don't recommend ECS. I've been very happy with the Giga-byte motherboards I have now.
|
||||
|
||||
|
||||
<br><br><br>
|
||||
<br><br><br>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -270,7 +270,7 @@ For RedHat do a <b>yum install gcc-c++ glibc-static libstdc++-static openssl-sta
|
||||
</ul>
|
||||
-->
|
||||
|
||||
<b>1.0</b> For <u>Ubuntu 12.02 or 14.04</u>: do <b>sudo apt-get update ; apt-get install make g++ libssl-dev binutils</b>
|
||||
<b>1.0</b> For <u>Ubuntu 12.02 or 14.04</u>: do <b>sudo apt-get update ; sudo apt-get install make g++ libssl-dev binutils</b>
|
||||
|
||||
<br><br>
|
||||
|
||||
|
406
html/news.html
@ -1,406 +0,0 @@
|
||||
|
||||
<div style=max-width:700px;>
|
||||
|
||||
<br>
|
||||
<br><br>
|
||||
|
||||
<a name=revival></a>
|
||||
<font size=+1><b>15 Year Anniversary</b></font><br>
|
||||
<i>September 1, 2014</i><br><br>
|
||||
It's been 15 years since I first started Gigablast. It's taken some interesting directions as of late. Most notably being open source. I've decided to revive the old blog entries that you can find below and continue working on top of those.
|
||||
|
||||
|
||||
|
||||
|
||||
<br><br><br><br>
|
||||
|
||||
|
||||
|
||||
|
||||
<a name=gigabits></a>
|
||||
<font size=+1><b>Giga Bits Introduced</b></font><br>
|
||||
<i>Jan 31, 2004</i><br><br>
|
||||
Gigablast now generates related concepts for your query. I call them Giga Bits. I believe it is the best concept generator in the industry, but if you don't think so please <a href="/contact.html">drop me a note</a> explaining why not, so I can improve it.
|
||||
<br><br>
|
||||
You can also ask Gigablast a simple question like <a href="/search?q=Who+is+President+of+Russia%3F">"Who is President of Russia?"</a> and it often comes up with the correct answer in the Giga Bits section. How do you think it does that?
|
||||
<br><br>
|
||||
In other news, the spider speed ups I rolled a few weeks ago are tremendously successful. I can easily burn all my bandwidth quota with insignificant load on my servers. I could not be happier with this.
|
||||
<br><br>
|
||||
Now I'm planning on turning Gigablast into a default AND engine. Why? Because it will decrease query latency by several times, believe it or not. That should put Gigablast on par with the fastest engines in the world, even though it only runs on 8 desktop machines. But don't worry, I will still leave the OR functionality intact.
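<br><br>
To make the latency claim concrete, here is a minimal sketch (my own illustration, not Gigablast's actual code) of why default AND is cheaper than OR: intersecting sorted docid lists never scans past the shortest list, while OR has to merge every posting of every term.
<pre>
// hypothetical illustration: AND = intersection of sorted docid lists
#include <cstdint>
#include <vector>

std::vector<int64_t> intersectDocids ( const std::vector<int64_t> &a ,
                                       const std::vector<int64_t> &b ) {
	std::vector<int64_t> out;
	size_t i = 0, j = 0;
	// advance whichever cursor is behind; only equal docids survive
	while ( i < a.size() && j < b.size() ) {
		if      ( a[i] < b[j] ) i++;
		else if ( b[j] < a[i] ) j++;
		else { out.push_back(a[i]); i++; j++; }
	}
	return out;
}
</pre>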
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=update></a>
|
||||
<font size=+1><b>January Update Rolled</b></font><br>
|
||||
<i>Jan 8, 2004</i><br><br>
|
||||
Gigablast now has a more professional, but still recognizable, logo, and a new catch phrase, "Information Acceleration". Lots of changes on the back end. You should notice significantly higher quality searches. The spider algorithm was sped up several times. Gigablast should be able to index several million documents per day, but that still remains to be tested. <knock on wood>. Site clustering was sped up. I added the ability to force all query terms to be required by using the &rat=1 cgi parm. Now Gigablast will automatically regenerate some of its databases when they are missing. And I think I wasted two weeks working like a dog on code that I'm not going to end up using! I hate when that happens...
|
||||
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=traffic></a>
|
||||
<font size=+1><b>An Easy way to Slash Motor Vehicle Emissions</b></font><br>
|
||||
<i>Dec 11, 2003</i><br><br>
|
||||
Blanket the whole city with wi-fi access (like <a href="/?redir=http://story.news.yahoo.com/news?tmpl=story&ncid=1293&e=2&u=/ap/20031211/ap_on_hi_te/wi_fi_city&sid=95573418">Cerritos, California</a>). When you want to travel from point A to point B, tell the central traffic computer. It will then give you a time window in which to begin your voyage and, most importantly, it will ensure that as long as you stay within the window you will always hit green lights.
<br><br>
If you stray from your path, you'll be able to get a new window via the wi-fi network. If everyone's car has gps and is connected to the wi-fi network, the central computer will also be able to monitor the flow of traffic and make adjustments to your itinerary in real-time. Essentially, the traffic computer will be solving a large system of linear, and possibly non-linear, constraints in real-time. Lots of fun... and think of how much more efficient travel will be!! If someone wants to secure funding, count me in.
|
||||
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=spellchecker></a>
|
||||
<font size=+1><b>Spellchecker Finally Finished</b></font><br>
|
||||
<i>Nov 18, 2003</i><br><br>
|
||||
After a large, countable number of interruptions, I've finally completed the spellchecker. I tested the word '<b>dooty</b>' on several search engines to see how they handled that misspelling. Here's what I got:
|
||||
<br><br>
|
||||
<table>
|
||||
<tr><td><b>Source</b></td><td><b>Result</b></td></tr>
|
||||
<tr><td>Alltheweb</td><td><a href="http://www.alltheweb.com/search?query=dooty">booty</a><td></tr>
|
||||
<tr><td>Altavista</td><td><a href="http://search01.altavista.com/web/results?q=dooty">dhooti</a></td></tr>
|
||||
<tr><td>Gigablast</td><td><a href="http://www.gigablast.com/search?q=dooty">door</a></td></tr>
|
||||
<tr><td>Google</td><td><a href="http://www.google.com/search?q=dooty">doody</a></td></tr>
|
||||
<tr><td>Microsoft Word</td><td>Doty</td></tr>
|
||||
<tr><td>Teoma</td><td><a href="http://s.teoma.com/search?q=dooty">doty</a></td></tr>
|
||||
<tr><td>Wisenut</td><td>N/A (no spellchecker)</td></tr>
|
||||
</table>
|
||||
<br>
|
||||
So there is no one way to code a spellchecker. It's a guessing game. And, hey Wisenut, want to license a good spellchecker for cheap? <a href="/contact.html">Let me know</a>.
|
||||
|
||||
<br><br>
|
||||
Gigablast uses its cached web pages to generate its dictionary instead of the query logs. When a word or phrase is not found in the dictionary, Gigablast replaces it with the closest match in the dictionary. If multiple words or phrases are equally close, then Gigablast resorts to a popularity ranking.
|
||||
<br><br>
|
||||
One interesting thing I noticed is that in Google's spellchecker you must at least get the first letter of the word correct; otherwise Google will not be able to recommend the correct spelling. I made Gigablast this way too, because it really cuts down on the number of words it has to search to come up with a recommendation. This also allows you to have an extremely large dictionary distributed amongst several machines, where each machine is responsible for a letter.
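<br><br>
A minimal sketch of the recommendation step as described above (the names and structures are my own, not the real Gigablast spellchecker): only dictionary words sharing the misspelling's first letter are considered, the closest match by edit distance wins, and popularity breaks ties.
<pre>
// hypothetical sketch: first-letter-constrained spelling suggestion
#include <algorithm>
#include <string>
#include <vector>

struct DictEntry { std::string word; long popularity; };

static int editDistance ( const std::string &a , const std::string &b ) {
	std::vector<int> prev(b.size()+1), cur(b.size()+1);
	for ( size_t j = 0; j <= b.size(); j++ ) prev[j] = (int)j;
	for ( size_t i = 1; i <= a.size(); i++ ) {
		cur[0] = (int)i;
		for ( size_t j = 1; j <= b.size(); j++ )
			cur[j] = std::min ( { prev[j] + 1, cur[j-1] + 1,
			                      prev[j-1] + (a[i-1] != b[j-1]) } );
		prev = cur;
	}
	return prev[b.size()];
}

std::string suggest ( const std::string &misspelled ,
                      const std::vector<DictEntry> &dict ) {
	std::string best; int bestDist = 1<<30; long bestPop = -1;
	for ( const DictEntry &e : dict ) {
		// the first-letter rule: skip words starting with a different letter
		if ( e.word.empty() || e.word[0] != misspelled[0] ) continue;
		int d = editDistance ( misspelled , e.word );
		if ( d < bestDist || ( d == bestDist && e.popularity > bestPop ) ) {
			bestDist = d; best = e.word; bestPop = e.popularity;
		}
	}
	return best; // empty string means no candidate shared the first letter
}
</pre>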
<br><br>
|
||||
Also of note: I am planning on purchasing the hardware required for achieving a 5 billion document index capable of serving hundreds of queries per second within the next 12 months. Wish me luck... and thanks for using Gigablast.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=onagain></a>
|
||||
<font size=+1><b>Spiders On Again</b></font><br>
|
||||
<i>Nov 10, 2003</i><br><br>
|
||||
After updating the spider code I've reactivated the spiders. Gigablast should be able to spider at a faster rate with even less impact on query response time than before. So add your urls now while the adding's good.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=speed></a>
|
||||
<font size=+1><b>Going For Speed</b></font><br>
|
||||
<i>Nov 3, 2003</i><br><br>
|
||||
I've finally got around to working on Gigablast's distributed caches. It was not doing a lot of caching before. The new cache class I rigged up has no memory fragmentation and minimal record overhead. It is very nice.<br><br>
|
||||
I've stopped spidering just for a bit so I can dedicate all Gigablast's RAM to the multi-level cache system I have in place now and see how much I can reduce query latency. Disks are still my main point of contention by far so the caching helps out a lot. But I could still use more memory.<br><br>
|
||||
Take Gigablast for a <a href="/">spin</a>. See how fast it is.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=metas></a>
|
||||
<font size=+1><b>Bring Me Your Meta Tags</b></font><br>
|
||||
<i>Oct 11, 2003</i><br><br>
|
||||
As of now Gigablast supports the indexing, searching and displaying of generic meta tags. You name them I fame them. For instance, if you have a tag like <i><meta name="foo" content="bar baz"></i> in your document, then you will be able to do a search like <i><a href="/search?q=foo%3Abar&dt=foo">foo:bar</a></i> or <i><a href="/search?q=foo%3A%22bar+baz%22&dt=foo">foo:"bar baz"</a></i> and Gigablast will find your document.
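<br><br>
As a rough illustration of how a field-restricted search like this can be indexed (an assumption about the general technique, not Gigablast's actual index format), each word of the meta tag can be hashed together with the tag's name so that <i>foo:bar</i> becomes its own term:
<pre>
// hypothetical sketch: field-prefixed terms for generic meta tags
#include <cstdio>
#include <functional>
#include <string>

int main ( ) {
	std::string field = "foo";
	std::string words[] = { "bar", "baz" }; // from content="bar baz"
	for ( const std::string &w : words ) {
		// one term per (field,word) pair; a query for foo:bar hashes the same way
		size_t termId = std::hash<std::string>{}( field + ":" + w );
		std::printf ( "term %s:%s -> 0x%zx\n", field.c_str(), w.c_str(), termId );
	}
	return 0;
}
</pre>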
<br><br>
|
||||
You can tell Gigablast to display the contents of arbitrary meta tags in the search results, like <a href="/search?q=gigablast&s=10&dt=author+keywords%3A32">this</a>. Note that you must assign the <i>dt</i> cgi parameter to a space-separated list of the names of the meta tags you want to display. You can limit the number of returned characters of each tag to X characters by appending a <i>:X</i> to the name of the meta tag supplied to the <i>dt</i> parameter. In the link above, I limited the displayed keywords to 32 characters.
|
||||
<br><br>
|
||||
Why use generic metas? Because it is very powerful. It allows you to embed custom data in your documents, search for it and retrieve it. Originally I wanted to do something like this in XML, but now my gut instincts are that XML is not catching on because it is ugly and bloated. Meta tags are pretty and slick.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=verisignstopped></a>
|
||||
<font size=+1><b>Verisign Stops Destroying the Internet</b></font><br>
|
||||
<i>Oct 11, 2003</i><br><br>
|
||||
Ok, they actually stopped about a week ago, but I didn't get around to posting it until now. They really ought to lose their privileged position so this does not happen again. Please do not stop your boycott. They have not learned from their mistakes.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=moreverisign></a>
|
||||
<font size=+1><b>Verisign Continues to Damage Gigablast's Index</b></font><br>
|
||||
<i>September 30, 2003</i><br><br>
|
||||
When the Gigablast spider tries to download a page from a domain it first gets the associated robots.txt file for that domain. When the domain does not exist it ends up downloading a robots.txt file from verisign. There are two major problems with this. The first is that verisign's servers may be slow which will slow down Gigablast's indexing. Secondly, and this has been happening for a while now, Gigablast will still index any incoming link text for that domain, thinking that the domain still exists, but just that spider permission was denied by the robots.txt file.
|
||||
<br>
|
||||
<br>
|
||||
So, hats off to you verisign, thanks for enhancing my index with your fantastic "service". I hope your company is around for many years so you can continue providing me with your great "services".
|
||||
<br>
|
||||
<br>
|
||||
If you have been hurt because of verisign's greed you might want to consider joining the <a href="/?redir=http://www.geek.com/news/geeknews/2003Sep/gee20030929021965.htm">class-action lawsuit</a> announced Friday, September 26th, by the <a href="/?redir=http://www.techfirm.com/">Ira Rothken law firm</a>.
|
||||
<br>
|
||||
<br>
|
||||
Want to learn more about how the internet is run? Check out <a href="/?redir=http://www.paradigm.nu/icann/">the ICANN movie page</a>. Movie #1 portrays verisign's CEO, Stratton Sclavos, quite well in my opinion.
|
||||
<br>
|
||||
<br>
|
||||
<b>(10/01/03) Update #5:</b> verisign <a href="/?redir=http://www.pcworld.com/news/article/0,aid,112712,00.asp">comes under further scrutiny</a>.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=verisign></a>
|
||||
<font size=+1><b>Verisign Redesigns the Internet for their Own Profit</b></font><br>
|
||||
<i>September 24, 2003</i><br><br>
|
||||
My spiders expect to get "not found" messages when they look up a domain that does not have an IP. When verisign uses their privileged position to change the underlying fundamentals of the internet just to line their own greedy pockets it really, really perturbs me. Now, rather than get the "not found" message, my spiders get back a valid IP, the IP of verisign's commercial servers. That causes my spiders to then proceed to download the robots.txt from that domain. This can take forever if their servers are slow. What a pain. Now I have to fix my freakin' code. And that's just one of many problems this company has caused.
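<br><br>
One common workaround (my own sketch of the general idea, not necessarily the fix Gigablast shipped) is to resolve a random hostname that cannot possibly exist under the same TLD; if it resolves anyway, the "found" answer for the real domain is a wildcard hit and should be treated as "not found":
<pre>
// hypothetical sketch: detect a wildcarded TLD before trusting a DNS answer
#include <netdb.h>
#include <sys/socket.h>
#include <cstring>
#include <string>

// return the first A record, or 0 if the name does not resolve
static in_addr_t lookupIp ( const std::string &host ) {
	struct hostent *h = gethostbyname ( host.c_str() );
	if ( ! h || h->h_addrtype != AF_INET || ! h->h_addr_list[0] ) return 0;
	in_addr_t ip; std::memcpy ( &ip , h->h_addr_list[0] , sizeof(ip) );
	return ip;
}

bool isWildcardHit ( const std::string &domain ) {
	in_addr_t real = lookupIp ( domain );
	if ( ! real ) return false; // genuinely not found
	// a gibberish sibling name under the same TLD should NOT resolve
	size_t dot = domain.find ( '.' );
	std::string tld = ( dot == std::string::npos ) ? domain : domain.substr(dot+1);
	in_addr_t bogus = lookupIp ( "no-such-host-xq7z." + tld );
	return bogus != 0 && bogus == real; // same IP => wildcard, not a real site
}
</pre>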
<br>
|
||||
<br>
|
||||
Please join me in boycott. I'm going to discourage everyone I know from supporting this abusive, monopolistic entity.
|
||||
<br>
|
||||
<br>
|
||||
<b>(9/22/03) Update #1:</b> verisign <a href="/?redir=http://www.icann.org/correspondence/lewis-to-twomey-21sep03.htm">responded</a> to ICANN's request that they stop. <a href="/?redir=http://slashdot.org/articles/03/09/22/2255202.shtml?tid=126&tid=95&tid=99">See what the slashdot community has to say about this response.</a>
|
||||
<br>
|
||||
<br>
|
||||
<b>(9/22/03) Update #2:</b> ICANN has now posted some complaints in this <a href="/?redir=http://forum.icann.org/alac-forum/redirect/">forum</a>.
|
||||
<br>
|
||||
<br>
|
||||
<b>(9/24/03) Update #3:</b> Slashdot has more <a href="/?redir=http://yro.slashdot.org/yro/03/09/24/0134256.shtml?tid=126&tid=95&tid=98&tid=99">coverage</a>.
|
||||
<br>
|
||||
<br>
|
||||
<b>(9/24/03) Update #4:</b> Please sign the <a href="/?redir=http://www.whois.sc/verisign-dns/">petition</a> to stop verisign.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=geotags></a>
|
||||
<font size=+1><b>Geo-Sensitive Search</b></font><br>
|
||||
<i>September 18, 2003</i><br><br>
|
||||
Gigablast now supports some special new meta tags that allow for constraining a search to a particular zipcode, city, state or country. Support was also added for the standard author, language and classification meta tags. This <a href="/tagsdemo.html">page</a> explains more. These meta tags should be standard, everyone should use them (but not abuse them!) and things will be easier for everybody.
|
||||
<br><br>
|
||||
Secondly, I have declared jihad against stale indexes. I am planning a significantly faster update cycle, not to mention growing the index to about 400 million pages, all hopefully in the next few months.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=turing></a>
|
||||
<font size=+1><b>Foiling the Addurl Scripts</b></font><br>
|
||||
<i>September 6, 2003</i><br><br>
|
||||
The new pseudo-Turing test on the <a href="/addurl">addurl page</a> should prevent most automated scripts from submitting boatloads of URLs. If someone actually takes the time to code a way around it then I'll just have to take it a step further. I would rather work on other things, though, so please quit abusing my free service and discontinue your scripts. Thanks.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=boolean></a>
|
||||
<font size=+1><b>Boolean is Here</b></font><br>
|
||||
<i>September 1, 2003</i><br><br>
|
||||
I just rolled out the new boolean logic code. You should be able to do nested boolean queries using the traditional AND, OR and NOT boolean operators. See the updated <a href="/help.html#boolean">help page</a> for more detail.
|
||||
<br><br>
|
||||
I have declared jihad against swapping and am now running the 2.4.21-rc6-rmap15j Linux kernel with swap tuned to zero using the /proc/sys/vm/pagecache knobs. So far no machines have swapped, which is great, but I'm unsure of this kernel's stability.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=swap></a>
|
||||
<font size=+1><b>All Swapped Out</b></font><br>
|
||||
<i>August 29, 2003</i><br><br>
|
||||
I no longer recommend turning the swap off, at least not on linux 2.4.22. A kernel panicked on me and froze a server. Not good. If anyone has any ideas for how I can prevent my app from being swapped out, please let me know. I've tried mlockall() within my app but that makes its memory usage explode for some reason. I've also tried Rik van Riel's 2.4.21-rc6-rmap15j.txt patch on the 2.4.21 kernel, but it still does unnecessary swapping (although, strangely, only when spidering). If you know how to fix this problem, please help!!! <a href="vmstat.html">Here</a> is the output from the vmstat command on one of my production machines running 2.4.22. And <a href="vmstatrik.html">here</a> is the output from my test machine running 2.4.21-rc6-rmap15j.txt.
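<br><br>
For reference, here is the call in isolation (a minimal standalone sketch, not the gb integration). A plausible explanation for the memory blow-up, though only a guess on my part, is that MCL_FUTURE forces every later allocation to be backed by real pages immediately; it also usually requires root or a raised RLIMIT_MEMLOCK.
<pre>
// minimal standalone mlockall() example
#include <sys/mman.h>
#include <cstdio>

int main ( ) {
	// MCL_CURRENT pins pages already mapped; MCL_FUTURE also pins anything
	// mapped later (heap growth, thread stacks, mmaps)
	if ( mlockall ( MCL_CURRENT | MCL_FUTURE ) != 0 ) {
		std::perror ( "mlockall" );
		return 1;
	}
	std::puts ( "all current and future pages are locked in RAM" );
	return 0;
}
</pre>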
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=kernel></a>
|
||||
<font size=+1><b>Kernel Update</b></font><br>
|
||||
<i>August 28, 2003</i><br><br>
|
||||
I updated the Linux kernel to 2.4.22, which was just released a few days ago on <a href="/?redir=http://www.kernel.org/">kernel.org</a>. Now my gigabit cards are working, yay! I finally had to turn off swap using the swapoff command. When an application runs out of memory the swapper is supposed to write infrequently used memory to disk so it can give that memory to the application that needs it. Unfortunately, the Linux virtual memory manager enjoys swapping out an application's memory for no good reason. This can often make an application disastrously slow, especially when the application ends up blocking on code that it doesn't expect to! And, furthermore, when the application uses the disk intensely it has to wait even longer for memory to get swapped back in from disk. I recommend that anyone who needs high performance turn off the swap and just make sure their program does not use more physical memory than is available.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=gang></a>
|
||||
<font size=+1><b>The Gang's All Here</b></font><br>
|
||||
<i>August 17, 2003</i><br><br>
|
||||
I decided to add PostScript (<a href="/search?q=type:ps">.ps</a>) , PowerPoint (<a href="/search?q=type:ppt">.ppt</a>), Excel SpreadSheet (<a href="/search?q=type:xls">.xls</a>) and Microsoft Word (<a href="/search?q=type:doc">.doc</a>) support in addition to the PDF support. Woo-hoo.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=pdf></a>
|
||||
<font size=+1><b>PDF Support</b></font><br>
|
||||
<i>August 14, 2003</i><br><br>
|
||||
Gigablast now indexes PDF documents. Try the search <a href="/search?q=type:pdf"><i>type:pdf</i></a> to see some PDF results. <i>type</i> is a new search field. It also supports the text type, <a href="/search?q=type:text"><i>type:text</i></a>, and will support other file types in the future.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=codeupdate3></a>
|
||||
<font size=+1><b>Minor Code Updates</b></font><br>
|
||||
<i>July 17, 2003</i><br><br>
|
||||
I've cleaned up the keyword highlight routines so they don't highlight isolated stop words. Gigablast now displays a <a href="/superRecall.html">blue bar</a> above returned search results that do not have <b>all</b> of your query terms. When returning a page of search results Gigablast lets you know how long ago that page was cached by displaying a small message at the bottom of that page. NOTE: This small message is at the bottom of the page containing the search results, not at the bottom of any pages from the web page cache, that is a different cache entirely. Numerous updates to less user-visible things on the back end. Many bugs fixed, but still more to go. Thanks a bunch to Bruce Perens for writing the <a href="/?redir=http://www.perens.com/FreeSoftware/">Electric Fence</a> debug utility.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=codeupdate2></a>
|
||||
<font size=+1><b>Gigablast 2.0</b></font><br>
|
||||
<i>June 20, 2003</i><br><br>
|
||||
I've recently released Gigablast 2.0. Right now Gigablast can do about twice as many queries per second as before. When I take care of a few more things that rate should double again.
|
||||
<br><br>
|
||||
The ranking algorithm now treats phrase weights much better. If you search for something like <i><a href="/search?q=boots+in+the+uk">boots in the uk</a></i> you won't get a bunch of results that have that exact phrase in them, but rather you will get UK sites about boots (theoretically). And when you do a search like <i><a href="/search?q=all+the+king%27s+men">all the king's men</a></i> you will get results that have that exact phrase. If you find any queries for which Gigablast is especially bad, but a competing search engine is good, please <a href="/contact.html">let me know</a>; I am very interested.
|
||||
<br><br>
|
||||
2.0 also introduced a new index format. The new index is half the size of the old one. This allows my current setup to index over 400 million pages with dual redundancy. Before it was only able to index about 300 million pages. The decreased index size also speeds up the query process since only half as much data needs to be read from disk to satisfy a query.
|
||||
<br><br>
|
||||
I've also started a full index refresh, starting with top level pages that haven't been spidered in a while. This is especially nice because a lot of pages that were indexed before all my anti-spam algorithms were 100% in place are just now getting filtered appropriately. I've manually removed over 100,000 spam pages so far, too.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=grub></a>
|
||||
<font size=+1><b>My Take on Looksmart's Grub</b></font><br>
|
||||
<i>Apr 19, 2003</i><br><br>
|
||||
There's been some press about Grub, a program from Looksmart which you install on your machine to help Looksmart spider the web. Looksmart is only using Grub to save on their bandwidth. Essentially Grub just compresses web pages before sending them to Looksmart's indexer thus reducing the bandwidth they have to pay for by a factor of 5 or so. The same thing could be accomplished through a proxy which compresses web pages. Eventually, once the HTTP mime standard for requesting compressed web pages is better supported by web servers, Grub will not be necessary.
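<br><br>
For what it's worth, the standard mechanism alluded to above already exists in HTTP/1.1: the client advertises <i>Accept-Encoding: gzip</i> and a cooperating server replies with <i>Content-Encoding: gzip</i>. A sketch of the request (just the header construction; sockets omitted, host name is a placeholder):
<pre>
// hypothetical sketch: asking a web server for a compressed page
#include <cstdio>
#include <string>

int main ( ) {
	std::string req =
		"GET /index.html HTTP/1.1\r\n"
		"Host: www.example.com\r\n"      // placeholder host
		"Accept-Encoding: gzip\r\n"      // server may compress the body
		"Connection: close\r\n"
		"\r\n";
	std::fputs ( req.c_str() , stdout ); // would be written to the TCP socket
	return 0;
}
</pre>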
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=codeupdate></a>
|
||||
<font size=+1><b>Code Update</b></font><br>
|
||||
<i>Mar 25, 2003</i><br><br>
|
||||
I just rolled some significant updates to Gigablast's back-end. Gigablast now has a uniformly-distributed, unreplicated search results cache. This means that if someone has done your search within the last several hours then you will get results back very fast. This also means that Gigablast can handle a lot more queries per second.
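<br><br>
The "uniformly-distributed, unreplicated" part boils down to hashing each normalized query to exactly one host, so every host caches a disjoint slice of the query space and nothing is stored twice. A sketch under my own naming (not the actual Gigablast code):
<pre>
// hypothetical sketch: map a query to the single host that caches it
#include <cstdint>
#include <functional>
#include <string>

int32_t pickCacheHost ( const std::string &normalizedQuery , int32_t numHosts ) {
	uint64_t h = std::hash<std::string>{}( normalizedQuery );
	return (int32_t)( h % (uint64_t)numHosts ); // same query -> same host
}
</pre>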
<br>
|
||||
<br>
|
||||
I also added lots of debug and timing messages that can be turned on and off via the Gigablast admin page. This allows me to quickly isolate problems and identify bottlenecks.
|
||||
<br>
|
||||
<br>
|
||||
Gigablast now synchronizes the clocks on all machines on the network so the instant add-url should be more "instant". Before I made this change, one machine would tell another to spider a new url "now", where "now" was actually a few minutes into the future on the spider machine. But since everyone's currently synchronized, this will not be a problem anymore.
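<br><br>
A minimal sketch of the idea (invented names, not the actual implementation): each host keeps an offset from a designated master clock and schedules spider times with the adjusted clock instead of its raw local time.
<pre>
// hypothetical sketch: schedule with a master-adjusted clock
#include <cstdint>
#include <ctime>

static int64_t s_offsetFromMaster = 0; // learned during a sync exchange

int64_t syncedNow ( ) {
	return (int64_t)std::time(NULL) + s_offsetFromMaster;
}
</pre>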
<br>
|
||||
<br>
|
||||
I also made about 100 other changes and bug fixes, minor and major, that should result in significant performance gains. My next big set of changes should make searches at least 5 times faster, but it will probably take several months to complete. I will keep you posted.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=downtime></a>
|
||||
<font size=+1><b>Downtime</b></font><br>
|
||||
<i>Feb 20, 2003</i><br><br>
|
||||
To combat downtime I wrote a monitoring program. It will send me a text message on my cellphone if gigablast ever stops responding to queries. This should prevent extended periods of downtime by alerting me to the problem so I can promptly fix it.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=uunet></a>
|
||||
<font size=+1><b>Connectivity Problems. Bah!</b></font><br>
|
||||
<i>Feb 14, 2003</i><br><br>
|
||||
I had to turn off the main refresh spiders a few weeks ago because of internet connectivity problems. Lots of pages were inaccessible or were timing out to the point that spider performance was suffering too much.
|
||||
<br><br>
|
||||
After running tcpdump in combination with wget I noticed that the FIN packets of some web page transfers were being lost or delayed for over a minute. The TCP FIN packet is typically the last TCP packet sent to your browser when it retrieves a web page. It tells your browser to close the connection. Once it is received the little spinning logo in the upper right corner of your browser window should stop spinning.
|
||||
<br><br>
|
||||
The most significant problem was, however, that the initial incoming data packet for some URLs was being lost or excessively delayed. You can get by without receiving FIN packets but you absolutely need these TCP "P" (PSH) packets. I've tested my equipment and my ISP has tested their equipment and we have both concluded that the problem is upstream. Yesterday my ISP submitted a ticket to Worldcom/UUNet. Worldcom's techs have verified the problem and thought it was... "interesting".
|
||||
<br><br>
|
||||
I personally think it is a bug in some filtering or monitoring software installed at one of Worldcom's NAPs (Network Access Points). NAPs are where the big internet providers interface with each other. The most popular NAPs are in big cities, the Tier-1 cities, as they're called. There are also companies that host NAP sites where the big carriers like Worldcom can install their equipment. The big carriers then set up Peering Agreements with each other. Peering Agreements state the conditions under which two or more carriers will exchange internet traffic.
|
||||
<br><br>
|
||||
Once you have a peering agreement in place with another carrier then you must pay them based on how much data you transfer from your network to their network across a NAP. This means that downloading a file is much cheaper than uploading a file. When you send a request to retrieve some information, that request is small compared to the amount of data it retrieves. Therefore, the carrier that hosted the server from which you got the data will end up paying more. Doh! I got off the topic. I hope they fix the problem soon!
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=ads></a>
|
||||
<font size=+1><b>Considering Advertisements</b></font><br>
|
||||
<i>Jan 10, 2003</i><br><br>
|
||||
I'm now looking into serving text advertisements on top of the search results page so I can continue to fund my information retrieval research. I am also exploring the possibility of injecting ads into some of my xml-based search feeds. If you're interested in a search feed I should be able to give you an even better deal provided you can display the ads I feed you, in addition to any other ads you might want to add. If anyone has any good advice concerning what ad company I should use, I'd love to hear it.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=codeupdate></a>
|
||||
<font size=+1><b>Code Update</b></font><br>
|
||||
<i>Dec 27, 2002</i><br><br>
|
||||
After a brief hiatus I've restarted the Gigablast spiders. The problem was they were having a negative impact on the query engine's performance, but now, all spider processing yields computer resources much better to the query traffic. The result is that the spidering process only runs in the space between queries. This actually involved a lot of work. I had to insert code to suspend spider-related, network transactions and cancel disk-read and disk-write threads.<br><br>
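<br><br>
The "only runs in the space between queries" behavior can be pictured as a simple gate checked before each unit of spider work (the names here are invented for illustration; the real change also suspends network transactions and cancels disk threads as described):
<pre>
// hypothetical sketch: spider work yields to in-flight query traffic
#include <atomic>

static std::atomic<int> s_activeQueries(0); // ++ when a query starts, -- when done

bool spiderMayProceed ( ) {
	// back off completely while any query is being served
	return s_activeQueries.load() == 0;
}
</pre>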
I've also launched my <a href="/gigaboost.html">Gigaboost</a> campaign. This rewards pages that link to gigablast.com with a boost in the search results rankings. The boost is only utilized to resolve ties in ranking scores so it does not taint the quality of the index.<br><br>
|
||||
Gigablast.nu, in Scandinavia, now has a news index built from news sources in the Scandinavian region. It is not publicly available just yet because there are still a few details we are working out.
|
||||
I've also added better duplicate detection and removal. It won't be very noticeable until the index refresh cycle completes.
|
||||
In addition, Gigablast now removes session ids from urls, but this only applies to new links; urls already in the index will be fixed at a later date.
|
||||
There is also a new summary generator installed. It's over ten times faster than the old one. If you notice any problems with it please contact me. As always, I appreciate any constructive input you have to give.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=corruption></a>
|
||||
<font size=+1><b>Data Corruption Mysteries</b></font><br>
|
||||
<i>Dec 20, 2002</i><br><br>
|
||||
I've been having problems with my hard drives. I have a bunch of Maxtor 160GB drives (Model # = 4G160J8) running on Linux 2.4.17 with the <a href="/ide.2.4.17.02152002.patch.bz2">48-bit LBA patch</a>. Each machine has 4 of these drives on them, 2 on each IDE slot. I've had about 160 gigabytes of data on one before so I know the patch seems to do the job. But every now and then a drive will mess up a write. I do a lot of writing and it usually takes tens of gigabytes of writing before a drive does this. It writes out about 8 bytes that don't match what should have been written. This causes index corruption and I've had to install work-arounds in my code to detect and patch it.
|
||||
<br>
|
||||
<br>
|
||||
I'm not sure if the problem is with the hard drive itself or with Linux. I've made sure that the problem wasn't in my code by doing a read after each write to verify. I thought it might be my motherboard or CPU. I use AMDs and Giga-byte motherboards. But gigablast.nu in Sweden has the same problem and it uses a Pentium 3. Furthermore, gigablast.nu uses a RAID of 160GB Maxtors, whereas gigablast.com does not. Gigablast.nu uses version 2.4.19 of Linux with the 48-bit LBA patch. So the problem seems to be with Linux, the LBA patch or the hard drive itself.
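<br><br>
The read-after-write check mentioned above looks roughly like this (my own sketch, not the actual work-around code): write the block, read it straight back at the same offset, and flag the write as corrupt if the bytes differ. Note that without O_DIRECT this may simply re-read the page cache, so it mainly rules out bugs in the writing code rather than the drive itself.
<pre>
// hypothetical sketch: verified write used while chasing silent corruption
#include <unistd.h>
#include <cstring>
#include <vector>

bool writeVerified ( int fd , const void *buf , size_t len , off_t off ) {
	if ( pwrite ( fd , buf , len , off ) != (ssize_t)len ) return false;
	std::vector<char> check ( len );
	if ( pread ( fd , check.data() , len , off ) != (ssize_t)len ) return false;
	// a mismatch would look like the ~8-byte corruption described above
	return std::memcmp ( buf , check.data() , len ) == 0;
}
</pre>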
<br>
|
||||
<br>
|
||||
On top of all this mess, about 1 Maxtor, out of the 32 I have, completely fails on me every 4 months. The drive just gives I/O errors to the kernel and brings the whole system down. Luckily, gigablast.com implements a redundant architecture so the failing server will be replaced by his backup. So far Maxtor has replaced the drives I had fail. If you give them your credit card number they'll even send the replacements out in advance. But I believe the failure problem is an indicator that the data corruption problem is hard drive related, not Linux related. If anyone has any insight into this problem please let me know, you could quite easily be my hero.
|
||||
<br>
|
||||
<br>
|
||||
If you're still reading this you're pretty hard core so <a href="/output.html">here's</a> what /var/log/messages says when the 4G160J8 completely fails.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=pvr></a>
|
||||
<font size=+1><b>Personal Video Recorders (PVRs)</b></font><br>
|
||||
<i>Dec 20, 2002</i><br><br>
|
||||
Boy, these things are great. I bought a Tivo last year for my wife and she loved it. At first though she wasn't that enthusiastic because she wasn't very familiar with it. But now we rarely rent any more video tapes from Blockbuster or Hollywood video because there's always something interesting to watch on the Tivo. You just let it know what shows you like and it will record them anytime they come on. We always have an overflow of Simpsons and Seinfeld episodes on there.
|
||||
<br>
|
||||
<br>
|
||||
In the future though I don't think Tivo is going to make it. The reason? Home networking. Because I'm a professional computer person, we already have a home network installed. If the TV had an ethernet jack it would be in our network. 100Mbps is fast enough to send it a high-quality video stream from the computers already on the network. I have a cable modem which, in the future, should allow the computer using it to rip signals from the cable station, as well. For now though, you could split your cable and plug the new end into a tuner card on your PC. So once someone comes out with a small device for the television that converts an ethernet-based mpeg stream to a video signal we can use our home PC to act as the TIVO. This device should be pretty cheap, I'd imagine around $30 or so. The only thing you'd need then is a way to allow the remote control to talk to your PC.
|
||||
<br>
|
||||
<br>
|
||||
Now I read about the EFF suing "Hollywood" in order to clarify consumer rights of fair use. Specifically, the EFF was said to be representing Replay TV. Hey! Isn't Replay TV owned in part by Disney (aka Hollywood)... hmmmm... Seems like Disney might have pretty good control over the outcome of this case. I think it's a conflict of interest when such an important trial, which would set a precedent for many cases to come, has the same plaintiff as defendant.
|
||||
<br>
|
||||
<br>
|
||||
This makes me wonder about when Disney's Go.com division got sued by Overture (then known as Goto.com) for logo infringement. Disney had to pay around 20 million to Overture. I wonder what kind of ties Disney had to Overture. Ok, maybe I'm being a conspiracy theorist, so I'll stop now.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=ecs></a>
|
||||
<font size=+1><b>ECS K7S5A Motherboard Mayhem</b></font><br>
|
||||
<i>Dec 20, 2002</i><br><br>
|
||||
I pinch pennies. When I bought my 8 servers I got the cheapest motherboards I could get for my AMD 1.4GHz Athlon T-Birds. At the time, in late January 2002, they turned out to be the K7S5A's. While running my search engine on them I experienced lots of segmentation faults. I spent a couple of days poring over the code wondering if I was tripping out. It wasn't until I ran memtest86 at boot time (run by lilo) that I found memory was being corrupted. I even tried new memory sticks to no avail. Fortunately I found some pages on the web that addressed the problem. It was the motherboard. It took me many hours to replace them on all 8 servers. I don't recommend ECS. I've been very happy with the Giga-byte motherboards I have now.
<br><br><br>
<br><br><br>
</div>
BIN html/ss_filters.png (new binary file, 216 KiB; not shown)
BIN html/ss_filters_thumb.png (new binary file, 14 KiB; not shown)
BIN html/ss_hosts.png (new binary file, 214 KiB; not shown)
BIN html/ss_hosts_thumb.png (new binary file, 19 KiB; not shown)
BIN html/ss_settings.png (new binary file, 196 KiB; not shown)
BIN html/ss_settings_thumb.png (new binary file, 11 KiB; not shown)
24 main.cpp
@ -195,6 +195,8 @@ void dumpLinkdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeT
|
||||
void exitWrapper ( void *state ) { exit(0); };
|
||||
|
||||
bool g_recoveryMode = false;
|
||||
|
||||
int32_t g_recoveryLevel = 0;
|
||||
|
||||
bool isRecoveryFutile ( ) ;
|
||||
|
||||
@ -1116,8 +1118,15 @@ int main2 ( int argc , char *argv[] ) {
|
||||
//send an email on startup for -r, like if we are recovering from an
|
||||
//unclean shutdown.
|
||||
g_recoveryMode = false;
|
||||
if ( strcmp ( cmd , "-r" ) == 0 ) g_recoveryMode = true;
|
||||
if ( strcmp ( cmd2 , "-r" ) == 0 ) g_recoveryMode = true;
|
||||
char *cc = NULL;
|
||||
if ( strncmp ( cmd , "-r" ,2 ) == 0 ) cc = cmd;
|
||||
if ( strncmp ( cmd2 , "-r",2 ) == 0 ) cc = cmd2;
|
||||
if ( cc ) {
|
||||
g_recoveryMode = true;
|
||||
g_recoveryLevel = 1;
|
||||
if ( cc[2] ) g_recoveryLevel = atoi(cc+2);
|
||||
if ( g_recoveryLevel < 0 ) g_recoveryLevel = 0;
|
||||
}
|
||||
|
||||
// run as daemon? then we have to fork
|
||||
if ( strcmp ( cmd , "-d" ) == 0 ) g_conf.m_runAsDaemon = true;
|
||||
@ -3092,6 +3101,10 @@ int main2 ( int argc , char *argv[] ) {
|
||||
pid_t pid, sid;
|
||||
pid = fork();
|
||||
if ( pid < 0 ) exit(EXIT_FAILURE);
|
||||
// seems like we core unless parent sets this to NULL.
|
||||
// it does not affect the child.
|
||||
//if ( pid > 0 ) g_hostdb.m_myHost = NULL;
|
||||
// child gets a 0, parent gets the child's pid, so exit
|
||||
if ( pid > 0 ) exit(EXIT_SUCCESS);
|
||||
// change file mode mask
|
||||
umask(0);
|
||||
@ -4103,6 +4116,9 @@ bool doCmd ( const char *cmd , int32_t hostId , char *filename ,
|
||||
// doCmdAll()'s call to convertHttpRequestToParmList
|
||||
sock.m_ip = atoip("127.0.0.1");
|
||||
s_r.set ( s_buffer , gbstrlen ( s_buffer ) , &sock );
|
||||
// do not do sig alarms! for now just set this to null so
|
||||
// the sigalarmhandler doesn't core
|
||||
//g_hostdb.m_myHost = NULL;
|
||||
// run the loop
|
||||
if ( ! g_loop.runLoop() )
|
||||
return log("INJECT: loop run failed.");
|
||||
@ -5170,6 +5186,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
|
||||
"export MALLOC_CHECK_=0;"
|
||||
"cp -f gb gb.oldsave ; "
|
||||
"ADDARGS='' "
|
||||
"INC=1 "
|
||||
"EXITSTATUS=1 ; "
|
||||
"while [ \\$EXITSTATUS != 0 ]; do "
|
||||
"{ "
|
||||
@ -5191,7 +5208,8 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
|
||||
" ;"
|
||||
|
||||
"EXITSTATUS=\\$? ; "
|
||||
"ADDARGS='-r' ; "
|
||||
"ADDARGS='-r'\\$INC ; "
|
||||
"INC=\\$((INC+1));"
|
||||
"} "
|
||||
"done >& /dev/null & \" %s",
|
||||
//"\" %s",
|
||||
|
@ -15,6 +15,10 @@ en|nos
|
||||
en|at
|
||||
en|ats
|
||||
|
||||
# it's not good to do love <-> like, as wiktionary does. override it here.
|
||||
en|love,loved,loving,loves
|
||||
en|like,likes,liked,liking
|
||||
|
||||
|
||||
en|sunday,sundays,sun
|
||||
en|mon,monday,mondays
|
||||
@ -179,13 +183,7 @@ en|porno,pornographic,pornography
|
||||
en|tech,technology,technologies,technological
|
||||
en|car,auto,automobile,cars,autos,automobiles
|
||||
en|electric,electrical
|
||||
en|dinner,dining,dined,dines,food
|
||||
en|lunch,food
|
||||
en|breakfast,food
|
||||
en|restaurant,restaurants,restarant,restarants,diner,cafe,cafes,caf\xc3\xa9,caf\xc3\xa9s,caf\xc3\xa8,caf\xc3\xa8s
|
||||
en|breakfast,food
|
||||
en|lunch,food
|
||||
en|dine,dines,dined,dining,eat,eats,eating,ate
|
||||
|
||||
en|website,web site,websites,web sites,webpage,web page,webpages,web pages,homepage,home page,homepages,home pages,site,sites
|
||||
|
||||
@ -208,17 +206,12 @@ en|calendar,schedule,calendars,schedules
|
||||
# added tv
|
||||
en|tv,television,televisions,tvs
|
||||
|
||||
en|food,foods,nourishment,nourishments
|
||||
|
||||
# flower shop san francisco should match 'flower store'
|
||||
en|shop,shops,shopping,store,stores
|
||||
|
||||
# trigram
|
||||
en|nfl,national football league,football leauge
|
||||
|
||||
en|celebration,celebrations,bash,bashes,party,parties,partying
|
||||
en|rave,raves,party,parties,partying
|
||||
|
||||
en|rock groups,rock group,band,bands
|
||||
en|music groups,music group,band,bands
|
||||
|
||||
|
73 qa.cpp
@ -1,6 +1,7 @@
|
||||
#include <string.h>
|
||||
#include "SafeBuf.h"
|
||||
#include "HttpServer.h"
|
||||
#include "Posdb.h"
|
||||
|
||||
TcpSocket *g_qaSock = NULL;
|
||||
SafeBuf g_qaOutput;
|
||||
@ -183,6 +184,7 @@ void processReply ( char *reply , int32_t replyLen ) {
|
||||
markOut ( content , "spider is done (");
|
||||
markOut ( content , "spider is paused (");
|
||||
markOut ( content , "spider queue empty (");
|
||||
markOut ( content , "spider is active (");
|
||||
|
||||
markOut ( content , "<totalShards>");
|
||||
|
||||
@ -200,6 +202,23 @@ void processReply ( char *reply , int32_t replyLen ) {
|
||||
markOut ( content , "\"responseTimeMS\":");
|
||||
markOut ( content , "\"docsInCollection\":");
|
||||
|
||||
// if the results are in json, then status doc is encoded json
|
||||
markOut ( content , "\\\"gbssDownloadStartTime\\\":");
|
||||
markOut ( content , "\\\"gbssDownloadEndTime\\\":");
|
||||
markOut ( content , "\\\"gbssDownloadStartTimeMS\\\":");
|
||||
markOut ( content , "\\\"gbssDownloadEndTimeMS\\\":");
|
||||
markOut ( content , "\\\"gbssDownloadDurationMS\\\":");
|
||||
markOut ( content , "\\\"gbssAgeInIndex\\\":");
|
||||
|
||||
// if the results are in xml, then the status doc is xml encoded
|
||||
markOut ( content , "\"gbssDownloadStartTime\":");
|
||||
markOut ( content , "\"gbssDownloadEndTime\":");
|
||||
markOut ( content , "\"gbssDownloadStartTimeMS\":");
|
||||
markOut ( content , "\"gbssDownloadEndTimeMS\":");
|
||||
markOut ( content , "\"gbssDownloadDurationMS\":");
|
||||
markOut ( content , "\"gbssAgeInIndex\":");
|
||||
|
||||
|
||||
// for xml
|
||||
markOut ( content , "<currentTimeUTC>" );
|
||||
markOut ( content , "<responseTimeMS>");
|
||||
@ -776,6 +795,16 @@ bool qainject1 ( ) {
|
||||
log("qa: failed qa test of posdb0001.dat. "
|
||||
"has %i bytes of positive keys! coring.",
|
||||
(int)list.m_listSize);
|
||||
char rec [ 64];
|
||||
for ( list.getCurrentKey ( rec ) ;
|
||||
! list.isExhausted() ;
|
||||
list.skipCurrentRecord() ) {
|
||||
// parse it up
|
||||
int64_t tid = g_posdb.getTermId ( rec );
|
||||
int64_t d = g_posdb.getDocId ( rec ) ;
|
||||
log("qa: termid=%"INT64" docid=%"INT64,
|
||||
tid,d);
|
||||
}
|
||||
//char *xx=NULL;*xx=0;
|
||||
exit(0);
|
||||
}
|
||||
@ -980,7 +1009,8 @@ bool qainject2 ( ) {
|
||||
if ( ! s_flags[33] ) {
|
||||
s_flags[33] = true;
|
||||
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q="
|
||||
"url2%3Axyz.com%2F-13737921970569011262&xml=1"
|
||||
"gbssUrl%3Axyz.com%2F-13737921970569011262&"
|
||||
"xml=1"
|
||||
,-1405546537 ) )
|
||||
return false;
|
||||
}
|
||||
@ -1164,17 +1194,17 @@ bool qaSyntax ( ) {
|
||||
"gbpermalink:1",
|
||||
"gbdocid:123456",
|
||||
|
||||
"gbstatus:0",
|
||||
"gbstatusmsg:tcp",
|
||||
"url2:www.abc.com/page.html",
|
||||
"site2:mysite.com",
|
||||
"ip2:1.2.3.4",
|
||||
"inurl2:dog",
|
||||
"gbpathdepth2:2",
|
||||
"gbhopcount2:3",
|
||||
"gbhasfilename2:1",
|
||||
"gbiscgi2:1",
|
||||
"gbhasext2:1",
|
||||
"gbssStatusCode:0",
|
||||
"gbssStatusmsg:tcp",
|
||||
"gbssUrl:www.abc.com/page.html",
|
||||
"gbssDomain:mysite.com",
|
||||
"gbssIp:1.2.3.4",
|
||||
"gbssUrl:dog",
|
||||
//"gbpathdepth:2",
|
||||
"gbssHopcount:3",
|
||||
//"gbhasfilename2:1",
|
||||
//"gbiscgi2:1",
|
||||
//"gbhasext2:1",
|
||||
|
||||
"cat AND dog",
|
||||
"cat OR dog",
|
||||
@ -1553,6 +1583,7 @@ bool qareindex() {
|
||||
"ufp=custom&"
|
||||
// zero spiders if not isreindex
|
||||
"fe1=default&hspl1=0&hspl1=1&fsf1=1.000000&"
|
||||
"fdu1=0&"
|
||||
"mspr1=0&mspi1=0&xg1=1000&fsp1=45&"
|
||||
);
|
||||
if ( ! getUrl ( "/admin/filters",0,sb.getBufStart()) )
|
||||
@ -1776,15 +1807,15 @@ bool qaspider1 ( ) {
|
||||
// make it the custom filter
|
||||
"ufp=custom&"
|
||||
|
||||
"fe=%%21ismanualadd+%%26%%26+%%21insitelist&hspl=0&hspl=1&fsf=0.000000&mspr=0&mspi=1&xg=1000&fsp=-3&"
|
||||
"fdu=0&fe=%%21ismanualadd+%%26%%26+%%21insitelist&hspl=0&hspl=1&fsf=0.000000&mspr=0&mspi=1&xg=1000&fsp=-3&"
|
||||
|
||||
// take out hopcount for now, just test quotas
|
||||
// "fe1=tag%%3Ashallow+%%26%%26+hopcount%%3C%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=3&"
|
||||
|
||||
// just one spider out allowed for consistency
|
||||
"fe1=tag%%3Ashallow+%%26%%26+sitepages%%3C%%3D20&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"
|
||||
"fdu1=0&fe1=tag%%3Ashallow+%%26%%26+sitepages%%3C%%3D20&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"
|
||||
|
||||
"fe2=default&hspl2=0&hspl2=1&fsf2=1.000000&mspr2=0&mspi2=1&xg2=1000&fsp2=45&"
|
||||
"fdu2=0&fe2=default&hspl2=0&hspl2=1&fsf2=1.000000&mspr2=0&mspi2=1&xg2=1000&fsp2=45&"
|
||||
|
||||
);
|
||||
if ( ! getUrl ( "/admin/filters",0,sb.getBufStart()) )
|
||||
@ -1935,8 +1966,8 @@ bool qaspider1 ( ) {
|
||||
if ( ! s_flags[17] ) {
|
||||
s_flags[17] = true;
|
||||
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
|
||||
"q=site2%3Awww.walmart.com+"
|
||||
"gbsortby%3Agbspiderdate",
|
||||
"q=gbssSubdomain%3Awww.walmart.com+"
|
||||
"gbsortbyint%3AgbssDownloadStartTime",
|
||||
999 ) )
|
||||
return false;
|
||||
}
|
||||
@ -2039,7 +2070,7 @@ bool qaspider2 ( ) {
|
||||
// make it the custom filter
|
||||
"ufp=custom&"
|
||||
|
||||
"fe=%%21ismanualadd+%%26%%26+%%21insitelist&hspl=0&hspl=1&fsf=0.000000&mspr=0&mspi=1&xg=1000&fsp=-3&"
|
||||
"fdu=0&fe=%%21ismanualadd+%%26%%26+%%21insitelist&hspl=0&hspl=1&fsf=0.000000&mspr=0&mspi=1&xg=1000&fsp=-3&"
|
||||
|
||||
// take out hopcount for now, just test quotas
|
||||
// "fe1=tag%%3Ashallow+%%26%%26+hopcount%%3C%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=3&"
|
||||
@ -2047,9 +2078,9 @@ bool qaspider2 ( ) {
|
||||
// sitepages is a little fuzzy so take it
|
||||
// out for this test and use hopcount!!!
|
||||
//"fe1=tag%%3Ashallow+%%26%%26+sitepages%%3C%%3D20&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"
|
||||
"fe1=tag%%3Ashallow+%%26%%26+hopcount<%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"
|
||||
"fdu1=0&fe1=tag%%3Ashallow+%%26%%26+hopcount<%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"
|
||||
|
||||
"fe2=default&hspl2=0&hspl2=1&fsf2=1.000000&mspr2=0&mspi2=1&xg2=1000&fsp2=45&"
|
||||
"fdu2=0&fe2=default&hspl2=0&hspl2=1&fsf2=1.000000&mspr2=0&mspi2=1&xg2=1000&fsp2=45&"
|
||||
|
||||
);
|
||||
if ( ! getUrl ( "/admin/filters",0,sb.getBufStart()) )
|
||||
@ -2450,7 +2481,7 @@ bool qajson ( ) {
|
||||
if ( ! s_flags[12] ) {
|
||||
s_flags[12] = true;
|
||||
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&"
|
||||
"q=inurl2%3Aquirksmode.org%2Fm%2F",
|
||||
"q=gbssUrl%3Aquirksmode.org%2Fm%2F",
|
||||
-1310551262 ) )
|
||||
return false;
|
||||
}
|
||||