Merge branch 'master' into diffbot

Conflicts:
	PageResults.cpp
This commit is contained in:
Matt Wells
2013-09-13 09:24:28 -07:00
29 changed files with 213 additions and 69 deletions

@ -189,3 +189,10 @@ third-party archives.
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
10. Exclusions
The SEO (Search Engine Optimization) functionality is provided in a separate file named "seo.cpp", and is not included in this license. If you would like to license that then you can arrange a licensing agreement with Matt Wells.
Likewise, the Event datamining logic is in Events.cpp and must be separately licensed as well.

@ -196,7 +196,7 @@ static char *s_langAbbr[] = {
};
// fix bug:
//#ifndef _PRIVATESTUFF_
//#ifndef PRIVATESTUFF
#define csISOLatin6 cslatin6
//#endif

@ -833,9 +833,9 @@ void sigalrmHandler ( int x , siginfo_t *info , void *y ) {
// if we missed too many, then dump core
if ( g_niceness == 1 && g_missedQuickPolls >= 4 ) {
g_inSigHandler = true;
//g_inSigHandler = true;
log("loop: missed quickpoll");
g_inSigHandler = false;
//g_inSigHandler = false;
// seems to core a lot in gbcompress(); we need to
// put a quickpoll into the zlib deflate(),
// deflate_slow() or longest_match() function
@ -1594,7 +1594,10 @@ void sigHandlerRT ( int x , siginfo_t *info , void *v ) {
}
//fprintf (stderr,"in rt handler\n");
// let everyone know it
g_inSigHandler = true;
// MDW: turn this off for now, how is it getting set? we dont use
// real time signals any more. maybe a pthread is getting such
// a signal?
//g_inSigHandler = true;
// debug msg
//if ( g_conf.m_timingDebugEnabled )
// log("sigHandlerRT entered");

@ -376,7 +376,8 @@ fctypes.o: fctypes.cpp gb-include.h types.h fctypes.h Unicode.h \
TcpSocket.h Collectiondb.h Entities.h UCWordIterator.h Timedb.h Rdb.h \
RdbBase.h RdbScan.h BigFile.h RdbMap.h RdbList.h RdbDump.h RdbTree.h \
RdbMem.h RdbBuckets.h RdbCache.h Msg5.h Msg3.h HashTableX.h RdbMerge.h \
Dir.h Titledb.h DiskPageCache.h CollectionRec.h Parms.h HashTable.h
Dir.h Titledb.h DiskPageCache.h CollectionRec.h Parms.h HashTable.h \
Threads.h
File.o: File.cpp gb-include.h types.h fctypes.h Unicode.h \
UnicodeProperties.h UCPropTable.h iconv.h UCNormalizer.h hash.h Errno.h \
Log.h File.h Mem.h Conf.h Xml.h XmlNode.h Lang.h Iso8859.h \
@ -1536,19 +1537,19 @@ PageStats.o: PageStats.cpp gb-include.h types.h fctypes.h Unicode.h \
Msge0.h Msge1.h Msg8b.h SiteGetter.h Title.h Address.h Placedb.h
PageStatsdb.o: PageStatsdb.cpp gb-include.h types.h fctypes.h Unicode.h \
UnicodeProperties.h UCPropTable.h iconv.h UCNormalizer.h hash.h Errno.h \
Log.h plotter.h iostream.h streambuf.h CollectionRec.h Url.h ip.h \
Parms.h Xml.h XmlNode.h Lang.h Iso8859.h iana_charset.h SafeBuf.h \
HttpRequest.h Mem.h Conf.h File.h Loop.h Hostdb.h TcpSocket.h \
Collectiondb.h HashTable.h HashTableX.h RdbList.h Pages.h HttpServer.h \
TcpServer.h MsgC.h UdpServer.h UdpSlot.h UdpProtocol.h Dns.h \
DnsProtocol.h RdbCache.h Multicast.h Threads.h Rdb.h RdbBase.h RdbScan.h \
BigFile.h RdbMap.h RdbDump.h RdbTree.h RdbMem.h RdbBuckets.h Msg5.h \
Msg3.h RdbMerge.h Dir.h HttpMime.h Statsdb.h zlib.h zconf.h Stats.h \
IndexReadInfo.h Query.h IndexList.h Indexdb.h DiskPageCache.h Titledb.h \
Msg1.h Msg0.h Clusterdb.h Linkdb.h Msg2.h Msg20.h Summary.h matches.h \
Words.h StopWords.h Bits.h Pos.h Matches.h HashTableT.h Domains.h \
CountryCode.h Tagdb.h Events.h Sections.h Dates.h Msg22.h CatRec.h \
Categories.h Catdb.h Datedb.h Msg4.h Process.h Msg28.h SafeList.h
Log.h CollectionRec.h Url.h ip.h Parms.h Xml.h XmlNode.h Lang.h \
Iso8859.h iana_charset.h SafeBuf.h HttpRequest.h Mem.h Conf.h File.h \
Loop.h Hostdb.h TcpSocket.h Collectiondb.h HashTable.h HashTableX.h \
RdbList.h Pages.h HttpServer.h TcpServer.h MsgC.h UdpServer.h UdpSlot.h \
UdpProtocol.h Dns.h DnsProtocol.h RdbCache.h Multicast.h Threads.h Rdb.h \
RdbBase.h RdbScan.h BigFile.h RdbMap.h RdbDump.h RdbTree.h RdbMem.h \
RdbBuckets.h Msg5.h Msg3.h RdbMerge.h Dir.h HttpMime.h Statsdb.h zlib.h \
zconf.h Stats.h IndexReadInfo.h Query.h IndexList.h Indexdb.h \
DiskPageCache.h Titledb.h Msg1.h Msg0.h Clusterdb.h Linkdb.h Msg2.h \
Msg20.h Summary.h matches.h Words.h StopWords.h Bits.h Pos.h Matches.h \
HashTableT.h Domains.h CountryCode.h Tagdb.h Events.h Sections.h Dates.h \
Msg22.h CatRec.h Categories.h Catdb.h Datedb.h Msg4.h Process.h Msg28.h \
SafeList.h
PageSubmit.o: PageSubmit.cpp gb-include.h types.h fctypes.h Unicode.h \
UnicodeProperties.h UCPropTable.h iconv.h UCNormalizer.h hash.h Errno.h \
Log.h XmlDoc.h Lang.h Iso8859.h iana_charset.h Words.h Xml.h XmlNode.h \
@ -2116,17 +2117,17 @@ Stats.o: Stats.cpp gb-include.h types.h fctypes.h Unicode.h \
ip.h Hostdb.h HttpRequest.h Url.h TcpSocket.h Collectiondb.h IndexList.h \
RdbList.h Indexdb.h Rdb.h RdbBase.h RdbScan.h BigFile.h RdbMap.h \
RdbDump.h RdbTree.h RdbMem.h RdbBuckets.h RdbCache.h Msg5.h Msg3.h \
HashTableX.h RdbMerge.h Dir.h DiskPageCache.h Titledb.h plotter.h \
iostream.h streambuf.h PingServer.h Repair.h Msg1.h UdpServer.h \
UdpSlot.h Multicast.h Threads.h Msg0.h Clusterdb.h Linkdb.h Msg2.h \
Msg20.h Summary.h matches.h Words.h StopWords.h Bits.h Pos.h Matches.h \
HashTableT.h Domains.h CountryCode.h Tagdb.h CollectionRec.h Parms.h \
HashTable.h Events.h Sections.h Dates.h Msg22.h CatRec.h Categories.h \
Catdb.h Datedb.h Msg4.h XmlDoc.h Phrases.h LangList.h Images.h Msg36.h \
Msg13.h Msge0.h Msge1.h MsgC.h Dns.h DnsProtocol.h Msg8b.h SearchInput.h \
Msg40.h Msg39.h Msg37.h Posdb.h TopTree.h IndexTable2.h Msg51.h Msg17.h \
Msg2b.h Msg3a.h PostQueryRerank.h Sanity.h SiteGetter.h Title.h \
Address.h zlib.h zconf.h Spider.h HttpMime.h
HashTableX.h RdbMerge.h Dir.h DiskPageCache.h Titledb.h PingServer.h \
Repair.h Msg1.h UdpServer.h UdpSlot.h Multicast.h Threads.h Msg0.h \
Clusterdb.h Linkdb.h Msg2.h Msg20.h Summary.h matches.h Words.h \
StopWords.h Bits.h Pos.h Matches.h HashTableT.h Domains.h CountryCode.h \
Tagdb.h CollectionRec.h Parms.h HashTable.h Events.h Sections.h Dates.h \
Msg22.h CatRec.h Categories.h Catdb.h Datedb.h Msg4.h XmlDoc.h Phrases.h \
LangList.h Images.h Msg36.h Msg13.h Msge0.h Msge1.h MsgC.h Dns.h \
DnsProtocol.h Msg8b.h SearchInput.h Msg40.h Msg39.h Msg37.h Posdb.h \
TopTree.h IndexTable2.h Msg51.h Msg17.h Msg2b.h Msg3a.h \
PostQueryRerank.h Sanity.h SiteGetter.h Title.h Address.h zlib.h zconf.h \
Spider.h HttpMime.h
Statsdb.o: Statsdb.cpp Conf.h Xml.h XmlNode.h gb-include.h types.h \
fctypes.h Unicode.h UnicodeProperties.h UCPropTable.h iconv.h \
UCNormalizer.h hash.h Errno.h Log.h Lang.h Iso8859.h iana_charset.h \

@ -79,7 +79,7 @@ HOST=$(shell hostname)
# we can only build a 32-bit binary, so we have to use the 32-bit libraries
# provided for now.
ifeq ("titan","$(HOST)")
CPPFLAGS = -D_PRIVATESTUFF_ -m32 -g -Wall -pipe -Wno-write-strings -Wstrict-aliasing=0 -Wno-uninitialized -static
CPPFLAGS = -DPRIVATESTUFF -m32 -g -Wall -pipe -Wno-write-strings -Wstrict-aliasing=0 -Wno-uninitialized -static
LIBS = ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a
OBJS:=$(OBJS) seo.o
$(shell rm seo.o)

@ -163,12 +163,12 @@ static bool s_initialized = 0;
// our own memory manager
//static MemPoolVar s_pool;
void operator delete ( void *ptr ) {
void operator delete (void *ptr) throw () {
// now just call this
g_mem.gbfree ( (char *)ptr , -1 , NULL );
}
void operator delete [] ( void *ptr ) {
void operator delete [] ( void *ptr ) throw () {
// now just call this
g_mem.gbfree ( ((char *)ptr-4) , -1 , NULL );
}

2
Mem.h

@ -210,7 +210,7 @@ inline bool relabel ( void *ptr , long size , const char *note ) {
//#define delete(X) { delete X; g_mem.m_freed += sizeof(*X); }
//#endif
//#ifndef DMALLOC
void operator delete ( void *p ) ;
void operator delete ( void *p ) throw();
void * operator new (size_t size) throw (std::bad_alloc);
// you MUST call mmalloc, mcalloc and mrealloc!!
#define malloc coreme

@ -978,6 +978,10 @@ bool Msg3a::mergeLists ( ) {
// or whatever, so let it through regardless
ksPtr[maxj]->n0 != 0LL &&
ksPtr[maxj]->n1 != 0 ) {
// if family filter on and is adult...
if ( m_r->m_familyFilter &&
g_clusterdb.hasAdultContent((char *)ksPtr[maxj]) )
goto skip;
// get the hostname hash, a long long
long sh = g_clusterdb.getSiteHash26 ((char *)ksPtr[maxj]);
// do we have enough from this hostname already?

@ -497,6 +497,7 @@ bool Msg40::getDocIds ( bool recall ) {
m_r.m_debug = m_si->m_debug ;
m_r.m_getDocIdScoringInfo = m_si->m_getDocIdScoringInfo;
m_r.m_doSiteClustering = m_si->m_doSiteClustering ;
m_r.m_familyFilter = m_si->m_familyFilter;
m_r.m_useMinAlgo = m_si->m_useMinAlgo;
m_r.m_useNewAlgo = m_si->m_useNewAlgo;
m_r.m_doMaxScoreAlgo = m_si->m_doMaxScoreAlgo;

@ -66,7 +66,7 @@ bool sendPageParser2 ( TcpSocket *s ,
//addCheckboxSpan ( uh64 , divTag , addNum );
// make basic reply
char *reply;
reply = "HTTP/1.1 200 OK\r\n"
reply = "HTTP/1.0 200 OK\r\n"
"Connection: Close\r\n";
// that is it! send a basic reply ok
bool status = g_httpServer.sendDynamicPage( s ,

@ -289,6 +289,9 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
long clen;
char *coll = hr->getString("c",&clen,"",NULL);
if ( coll ) sb.safePrintf("&c=%s",coll);
// forward the "ff" family filter as well
long ff = hr->getLong("ff",0);
if ( ff ) sb.safePrintf("&ff=%li",ff);
// provide hash of the query so clients can't just pass in
// a bogus id to get search results from us
unsigned long h32 = hash32n(qstr);
@ -343,7 +346,11 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
"      "
/* SEO functionality not included yet - so redir to gigablast. */
#ifdef PRIVATESTUFF
"<a title=\"Rank higher in Google\" href='/seo'>"
#else
"<a title=\"Rank higher in Google\" href='https://www.gigablast.com/seo'>"
#endif
"seo"
"</a>"
@ -356,6 +363,9 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
" &nbsp;&nbsp;&nbsp;&nbsp; "
// i'm not sure why this was removed. perhaps
// because it is not working yet because of
// some bugs...
"<!-- <a title=\"Advanced web search\" "
"href=/adv.html>"
"advanced"
@ -2071,7 +2081,11 @@ static int printResult ( SafeBuf &sb,
// "c=%s&\">scoring</a>",
// coll );
//sb.safePrintf(" - <a href=\"/print?c=%s&",coll);
#ifdef PRIVATESTUFF
sb.safePrintf(" - <a href=\"/seo?");//c=%s&",coll);
#else
sb.safePrintf(" - <a href=\"https://www.gigablast.com/seo?");//c=%s&",coll);
#endif
//sb.safePrintf("d=%lli",mr->m_docId);
sb.safePrintf("u=");
sb.urlEncode ( url , gbstrlen(url) , false );
@ -3238,7 +3252,11 @@ bool printPairScore ( SafeBuf &sb , SearchInput *si , PairScore *ps ,
sb.safePrintf("<td>"
//"<a href=\"/print?d="
//"&page=4&recycle=1&"
#ifdef PRIVATESTUFF
"<a href=\"/seo?d="
#else
"<a href=\"https://www.gigablast.com/seo?d="
#endif
"%lli"
"&page=sections&"
"hipos=%li&c=%s\">"
@ -3316,7 +3334,11 @@ bool printPairScore ( SafeBuf &sb , SearchInput *si , PairScore *ps ,
//"<a href=\"/print?d="
//"%lli"
//"&page=4&recycle=1&"
#ifdef PRIVATESTUFF
"<a href=\"/seo?d="
#else
"<a href=\"https://www.gigablast.com/seo?d="
#endif
"%lli"
"&page=sections&"
"hipos=%li&c=%s\">"

@ -103,7 +103,12 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
//sb.safePrintf("<body>\n");
//g_proxy.insertLoginBarDirective ( &sb );
sb.safePrintf("<br><br>\n");
// try to avoid using https for images. it is like 10ms slower.
#ifdef PRIVATESTUFF
sb.safePrintf("<center><a href=/><img border=0 width=500 height=122 src=http://www.gigablast.com/logo-med.jpg></a>\n");
#else
sb.safePrintf("<center><a href=/><img border=0 width=500 height=122 src=/logo-med.jpg></a>\n");
#endif
sb.safePrintf("<br><br>\n");
sb.safePrintf("<br><br><br>\n");
sb.safePrintf("<b>web</b> &nbsp;&nbsp;&nbsp;&nbsp; <a href=/seo>seo</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=\"http://www.gigablast.com/?c=dmoz3\">directory</a> &nbsp;&nbsp;&nbsp;&nbsp; \n");
@ -305,7 +310,11 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
sb.safePrintf("\n");
sb.safePrintf("<br><br>\n");
#ifdef PRIVATESTUFF
sb.safePrintf("<center><a href=/><img border=0 width=500 height=122 src=http://www.gigablast.com/logo-med.jpg></a>\n");
#else
sb.safePrintf("<center><a href=/><img border=0 width=500 height=122 src=/logo-med.jpg></a>\n");
#endif
sb.safePrintf("<br><br>\n");
sb.safePrintf("<br><br><br>\n");
sb.safePrintf("<a href=/>web</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=/seo>seo</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=\"http://www.gigablast.com/?c=dmoz3\">directory</a> &nbsp;&nbsp;&nbsp;&nbsp; \n");
@ -434,7 +443,12 @@ bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("<body onload=\"x()\">\n");
sb.safePrintf("<body>\n");
sb.safePrintf("<br><br>\n");
// try to avoid using https for images. it is like 10ms slower.
#ifdef PRIVATESTUFF
sb.safePrintf("<center><a href=/><img border=0 width=500 height=122 src=http://www.gigablast.com/logo-med.jpg></a>\n");
#else
sb.safePrintf("<center><a href=/><img border=0 width=500 height=122 src=/logo-med.jpg></a>\n");
#endif
sb.safePrintf("<br><br>\n");
sb.safePrintf("<br><br><br>\n");
sb.safePrintf("<a href=/>web</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=/seo>seo</a> &nbsp;&nbsp;&nbsp;&nbsp; <b>directory</b> &nbsp;&nbsp;&nbsp;&nbsp; \n");

@ -1,7 +1,7 @@
#include "gb-include.h"
#define X_DISPLAY_MISSING 1
#include <plotter.h>
//#include <plotter.h>
//#include <fstream.h>
#include <math.h>
@ -109,7 +109,7 @@ bool sendPageStatsdb ( TcpSocket *s, HttpRequest *r ) {
// st->m_columns = DEF_COLUMNS;
if ( st->m_now )
st->m_startDate = (time_t)getTimeGlobal();
st->m_startDate = (time_t)getTimeGlobalNoCore();
st->m_startDateR = st->m_startDate;
st->m_endDateR = st->m_endDate;

@ -220,7 +220,7 @@ static WebPage s_pages[] = {
// "get queries a url matches",
// sendPageMatchingQueries , 2 } ,
#ifdef _PRIVATESTUFF_
#ifdef PRIVATESTUFF
{ PAGE_SEO, "seo",0,"seo" , 0 , 0 ,
"SEO info",
sendPageSEO , 2 } ,
@ -415,7 +415,7 @@ bool Pages::sendDynamicReply ( TcpSocket *s , HttpRequest *r , long page ) {
if ( page == PAGE_ROOT ) publicPage = true;
// do not deny /NM/Albuquerque urls
if ( page == PAGE_RESULTS ) publicPage = true;
#ifdef _PRIVATESTUFF_
#ifdef PRIVATESTUFF
if ( page == PAGE_SEO ) publicPage = true;
#endif
if ( page == PAGE_ADDURL ) publicPage = true;

@ -358,7 +358,7 @@ enum {
PAGE_REINDEX ,
PAGE_INJECT ,
//PAGE_KEYWORDS ,
#ifdef _PRIVATESTUFF_
#ifdef PRIVATESTUFF
PAGE_SEO ,
#endif
PAGE_ACCESS , //40

@ -7995,6 +7995,10 @@ void Parms::init ( ) {
m->m_def = "1";
m++;
/*
MDW: use the "onsite" directive in the url filters page now...
m->m_title = "only spider links from same host";
m->m_desc = "If this is true the spider will only harvest links "
"to pages that are contained on the same host as the page "
@ -8012,6 +8016,7 @@ void Parms::init ( ) {
m->m_def = "0";
m->m_group = 0;
m++;
*/
m->m_title = "do not re-add old outlinks more than this many days";
m->m_desc = "If less than this many days have elapsed since the "

@ -2712,7 +2712,7 @@ void checkKernelErrors( int fd, void *state ){
void PingServer::sendEmailMsg ( long *lastTimeStamp , char *msg ) {
// leave if we already sent an alert within 5 mins
//static long s_lasttime = 0;
long now = getTimeGlobal();
long now = getTimeGlobalNoCore();
if ( now - *lastTimeStamp < 5*60 ) return;
// prepare msg to send
//Host *h0 = g_hostdb.getHost ( 0 );

@ -678,6 +678,8 @@ bool Proxy::handleRequest (TcpSocket *s){
userId32b = ui->m_userId32;
break;
}
// code is invalid if it is not for an old client
if ( userId32b == 0 ) code = NULL;
}
// if we have both a code and userid, check to see if it is correct
if ( code ) {
@ -3545,7 +3547,7 @@ bool Proxy::hitCreditCard ( StateUser *su ) {
//
// INSERT YOUR secret transaction/api key for authorize.net
//
#ifdef _PRIVATESTUFF_
#ifdef PRIVATESTUFF
url.safePrintf("&x_tran_key=%s",g_secret_tran_key);
url.safePrintf("&x_login=%s",g_secret_api_key);
#else

17
Rdb.cpp

@ -1715,6 +1715,23 @@ bool Rdb::needsDump ( ) {
if ( m_useTree) {if(m_tree.is90PercentFull() ) return true;}
else if(m_buckets.needsDump() ) return true;
// if adding to doledb and it has been > 1 day then force a dump
// so that all the negative keys in the tree annihilate with the
// keys on disk to make it easier to read a doledb list
if ( m_rdbId != RDB_DOLEDB ) return false;
// set this if not valid
//static long s_lastDumpTryTime = -1;
//if ( s_lastDumpTryTime == -1 )
// s_lastDumpTryTime = getTimeLocal();
// try to dump doledb every 24 hrs
//long now = getTimeLocal();
//if ( now - s_lastDumpTryTime >= 3600*24 ) return true;
// or dump doledb if a ton of negative recs...
if ( m_tree.getNumNegativeKeys() > 50000 ) return true;
// otherwise, no need to dump doledb just yet
return false;
}

@ -797,6 +797,7 @@ m if (! cr->hasSearchPermission ( sock, encapIp ) ) {
}
m_familyFilter = r->getLong("ff",0);
long codeLen;
char *code = r->getString ("code",&codeLen,NULL);

@ -3323,6 +3323,7 @@ void SpiderLoop::spiderDoledUrls ( ) {
// . if priority is -1 that means try next priority
// . DO NOT reset the whole scan. that was what was happening
// when we just had "goto loop;" here
// . this means a reset above!!!
if ( m_sc->m_pri == -1 ) return;
// bail if waiting for lock reply, no point in reading more
if ( m_msg12.m_gettingLocks ) return;
@ -3377,13 +3378,25 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// bail if list is empty
if ( m_list.getListSize() <= 0 ) {
if ( g_conf.m_logDebugSpider )
log("spider: resetting doledb priority pri=%li",
m_sc->m_pri);
//if ( g_conf.m_logDebugSpider )
// log("spider: resetting doledb priority pri=%li",
// m_sc->m_pri);
// trigger a reset
m_sc->m_pri = -1;
//m_sc->m_pri = -1;
// . let the sleep timer init the loop again!
// . no, just continue the loop
//return true;
// this priority is EMPTY, try next
m_sc->m_pri = m_sc->m_pri - 1;
// how can this happen?
if ( m_sc->m_pri < -1 ) m_sc->m_pri = -1;
// all done if priority is negative, it will start over
// at the top most priority, we've completed a round
if ( m_sc->m_pri < 0 ) return true;
// set to next priority otherwise
//m_sc->m_nextDoledbKey=g_doledb.makeFirstKey2 ( m_sc->m_pri );
m_sc->m_nextDoledbKey = m_sc->m_nextKeys [m_sc->m_pri];
// and load that list from doledb for that priority
return true;
}

@ -39,10 +39,29 @@ bool updateCrawlInfo ( CollectionRec *cr ,
// There are SpiderRequests and SpiderReplies in Spiderdb. they now use
// 16 byte keys (key128_t). when a new spiderdb rec is added to spiderdb
// in Rdb.cpp we call addSpiderRequest() or addSpiderReply(). then
// that rec might be added to the waiting tree. then the waiting tree
// that rec might be added to the waiting tree. the waiting tree
// is scanned for IPs that have a SpiderRequest whose spiderTime is
// <= now and we grab ONE from spiderdb and add to doledb. any host
// in our group can spider a request in doledb, but they must lock it
// <= now and we grab ONE from spiderdb and add to doledb. we try to
// store every IP (firstIp) we have in Spiderdb into the waiting tree,
// but the IP is also paired up with a spider priority representing a
// SpiderRequest in that priority from that IP. then the entries in
// waiting tree are sorted by scheduled spider time. waiting tree does not
// even store the SpiderRequest, but just the scheduling info for each
// ip/priority pair. then we can quickly scan waiting tree to find
// the next ip/priority ready for spidering, so we read the SpiderRequest
// from spiderdb for the ip/priority/time we choose and add that
// SpiderRequest to doledb to be spidered.
//
// The waiting tree is populated at
// startup by scanning spiderdb, which might take a while to complete,
// so it is running in the background while the gb server is up. it will
// log "10836674298 spiderdb bytes scanned for waiting tree re-population"
// periodically in the log as it tries to do a complete spiderdb scan
// every 24 hours. it should not be necessary to scan spiderdb more than
// once, but it seems we are leaking ips somehow so we do the follow-up
// scans for now. (see populateWaitingTreeFromSpiderdb() in Spider.cpp)
//
// any host in our group can spider a request in doledb, but they must lock it
// by calling getLocks() first and all hosts in the group must grant
// them the lock for that url otherwise they remove all the locks and
// try again on another spiderRequest in doledb.

@ -3,7 +3,7 @@
#include <errno.h>
#include "Stats.h"
#define X_DISPLAY_MISSING 1
#include <plotter.h>
//#include <plotter.h>
#include <math.h>
#include "Conf.h"
#include "PingServer.h"

@ -1025,7 +1025,11 @@ void UdpServer::process_ass ( long long now , long maxNiceness) {
// bail if no main sock
if ( m_sock < 0 ) return ;
long long startTimer = gettimeofdayInMillisecondsLocal();
// if we call this while in the sighandler it crashes since
// gettimeofdayInMillisecondsLocal() is not async safe
long long startTimer;
if ( ! g_inSigHandler )
startTimer = gettimeofdayInMillisecondsLocal();
bigloop:
// . if we're real time, and not in a sig handler, turn 'em off
// . readSock() and doSending() are not Async Signal Safe (ass)
@ -1088,7 +1092,11 @@ void UdpServer::process_ass ( long long now , long maxNiceness) {
}
callBottom:
if(maxNiceness < 1) return;
long long elapsed = gettimeofdayInMillisecondsLocal() - startTimer;
// if we call this while in the sighandler it crashes since
// gettimeofdayInMillisecondsLocal() is not async safe
long long elapsed = 0;
if ( ! g_inSigHandler )
elapsed = gettimeofdayInMillisecondsLocal() - startTimer;
if(elapsed < 10) {
// we did not call any, so resort to nice callbacks
makeCallbacks_ass ( /*niceness level*/ 1 ) ;

@ -1817,10 +1817,15 @@ long long gettimeofdayInMillisecondsGlobal() {
return gettimeofdayInMillisecondsSynced();
}
#include "Threads.h"
long long gettimeofdayInMillisecondsSynced() {
// if in a sig handler then return g_now
//if ( g_inSigHandler ) return g_nowGlobal;
if ( g_inSigHandler ) { char *xx = NULL; *xx = 0; }
// i find that a pthread can call this function even though
// a signal handler is underway in the main thread!
if ( g_inSigHandler && ! g_threads.amThread() ) {
char *xx = NULL; *xx = 0; }
// sanity check
if ( ! isClockInSync() ) { char *xx = NULL; *xx = 0; }
//if ( ! g_clockInSync )
@ -1841,7 +1846,10 @@ long long gettimeofdayInMillisecondsSynced() {
long long gettimeofdayInMillisecondsGlobalNoCore() {
// if in a sig handler then return g_now
//if ( g_inSigHandler ) return g_nowGlobal;
if ( g_inSigHandler ) { char *xx = NULL; *xx = 0; }
// i find that a pthread can call this function even though
// a signal handler is underway in the main thread!
if ( g_inSigHandler && ! g_threads.amThread() ) {
char *xx = NULL; *xx = 0; }
// sanity check
//if ( ! g_clockInSync ) { char *xx = NULL; *xx = 0; }
//if ( ! g_clockInSync )
@ -1873,7 +1881,10 @@ uint64_t gettimeofdayInMicroseconds(void) {
long long gettimeofdayInMilliseconds() {
// if in a sig handler then return g_now
//if ( g_inSigHandler ) return g_now;
if ( g_inSigHandler ) { char *xx = NULL; *xx = 0; }
// i find that a pthread can call this function even though
// a signal handler is underway in the main thread!
if ( g_inSigHandler && ! g_threads.amThread() ) {
char *xx = NULL; *xx = 0; }
// this isn't async signal safe...
struct timeval tv;
//g_loop.disableTimer();
@ -1900,7 +1911,10 @@ time_t getTime () {
time_t getTimeLocal () {
// if in a sig handler then return g_now/1000
//if ( g_inSigHandler ) return (time_t)(g_now / 1000);
if ( g_inSigHandler ) { char *xx = NULL; *xx = 0; }
// i find that a pthread can call this function even though
// a signal handler is underway in the main thread!
if ( g_inSigHandler && ! g_threads.amThread() ) {
char *xx = NULL; *xx = 0; }
// get time now
unsigned long now = gettimeofdayInMilliseconds() / 1000;
// and adjust it

@ -141,6 +141,7 @@ void setS99Local () {
// make raid and mount if not already mounted
// a lot of times after reboot something fails!
fprintf(fd,
"raidstop /dev/md0\n"
"mkraid -c /etc/raidtab --really-force /dev/md0\n"
"mount /dev/md0\n"
"\n"
@ -725,7 +726,7 @@ void setEtcNetworkInterfaces() {
return;
}
fprintf("ip for %s is %s\n",g_name,ips);
fprintf(stderr,"ip for %s is %s\n",g_name,ips);
fprintf ( fd ,
"auto lo\n"

@ -2781,7 +2781,7 @@ int main ( int argc , char *argv[] ) {
}
// the query log split
#ifdef _PRIVATESTUFF_
#ifdef PRIVATESTUFF
if ( ! loadQueryLog() ) return 1;
#endif
@ -3105,7 +3105,7 @@ int main ( int argc , char *argv[] ) {
log("db: Failed to init merge sleep callback.");
// SEO MODULE
#ifdef _PRIVATESTUFF_
#ifdef PRIVATESTUFF
if ( ! g_loop.registerSleepCallback(2000,(void *)1,runSEOQueryLoop))
log("db: Failed to register seo query loop");
#endif
@ -4802,7 +4802,7 @@ bool registerMsgHandlers1(){
// to make things compile we need to declare this stuff since the seo
// module is not in the open source version
//
#ifndef _PRIVATESTUFF_
#ifndef PRIVATESTUFF
SafeBuf g_qbuf;
long g_qbufNeedSave = false;
bool sendPageSEO(TcpSocket *, HttpRequest *) { return true;}
@ -4835,7 +4835,7 @@ bool registerMsgHandlers2(){
if ( ! registerHandler4 () ) return false;
#ifdef _PRIVATESTUFF_
#ifdef PRIVATESTUFF
// seo module handlers
if(! g_udpServer.registerHandler(0x8e,handleRequest8e)) return false;
if(! g_udpServer.registerHandler(0x4f,handleRequest4f)) return false;
@ -5851,6 +5851,18 @@ void dumpDoledb (char *coll,long startFileNum,long numFiles,bool includeTree){
if ( (drec[0] & 0x01) == 0x00 ) {char *xx=NULL;*xx=0; }
// get spider rec in it
char *srec = drec + 12 + 4;
// print doledb info first then spider request
fprintf(stdout,"dolekey=%s (n1=%lu n0=%llu) "
"pri=%li "
"spidertime=%lu "
"uh48=0x%llx\n",
KEYSTR(&k,12),
k.n1,
k.n0,
(long)g_doledb.getPriority(&k),
g_doledb.getSpiderTime(&k),
g_doledb.getUrlHash48(&k));
fprintf(stdout,"spiderkey=");
// print it
g_spiderdb.print ( srec );
// the \n

BIN
pdftohtml

Binary file not shown.

18
types.h

@ -944,15 +944,15 @@ inline char *KEYMIN() { return "\0\0\0\0"
"\0\0\0\0"
"\0\0\0\0"
"\0\0\0\0"; };
static char s_foo[] = { 0xff , 0xff , 0xff , 0xff ,
0xff , 0xff , 0xff , 0xff ,
0xff , 0xff , 0xff , 0xff ,
0xff , 0xff , 0xff , 0xff ,
0xff , 0xff , 0xff , 0xff ,
0xff , 0xff , 0xff , 0xff ,
0xff , 0xff , 0xff , 0xff ,
0xff , 0xff , 0xff , 0xff };
inline char *KEYMAX() { return s_foo; };
static int s_foo[] = { 0xffffffff ,
0xffffffff ,
0xffffffff ,
0xffffffff ,
0xffffffff ,
0xffffffff ,
0xffffffff ,
0xffffffff };
inline char *KEYMAX() { return (char *)s_foo; };
#endif