integrate diffbot from svn back into git.

This commit is contained in:
Matt Wells 2013-09-13 09:23:18 -07:00
parent 9696c7936a
commit 5dc7bd2ab4
36 changed files with 1909 additions and 96 deletions

@ -59,6 +59,10 @@ CollectionRec::CollectionRec() {
// *(m_regExs[i]) = '\0';
//}
m_numRegExs = 0;
// for diffbot caching the global spider stats
reset();
// add default reg ex if we do not have one
fixRec();
}
@ -74,12 +78,22 @@ void CollectionRec::setToDefaults ( ) {
fixRec ();
}
// clear all of the diffbot crawl-stat state kept on this collection
void CollectionRec::reset() {
	// request/reply accounting used while gathering global stats
	// from the other hosts in the network
	m_requests = 0;
	m_replies  = 0;
	// counters for this host only
	m_localCrawlInfo.reset();
	// counters summed over every host in the network
	m_globalCrawlInfo.reset();
	// when the global counters were last aggregated (0 = never)
	m_globalCrawlInfoUpdateTime = 0;
}
// . load this data from a conf file
// . values we do not explicitly have will be taken from "default",
// collection config file. if it does not have them then we use
// the value we received from call to setToDefaults()
// . returns false and sets g_errno on load error
bool CollectionRec::load ( char *coll , long i ) {
// also reset some counts not included in parms list
reset();
// before we load, set to defaults in case some are not in xml file
g_parms.setToDefault ( (char *)this );
// get the filename with that id
@ -111,6 +125,47 @@ bool CollectionRec::load ( char *coll , long i ) {
// add default reg ex
fixRec ();
//
// LOAD the crawlinfo class in the collectionrec for diffbot
//
if ( g_conf.m_useDiffbot ) {
// LOAD LOCAL
sprintf ( tmp1 , "%scoll.%s.%li/localcrawlinfo.txt",
g_hostdb.m_dir , m_coll , (long)m_collnum );
log("coll: loading %s",tmp1);
SafeBuf sb;
// fillfromfile returns 0 if does not exist, -1 on read error
if ( sb.fillFromFile ( tmp1 ) > 0 )
sscanf ( sb.getBufStart() ,
"indexAttempts:%lli\n"
"processAttempts:%lli\n"
"downloadAttempts:%lli\n"
, &m_localCrawlInfo.m_pageIndexAttempts
, &m_localCrawlInfo.m_pageProcessAttempts
, &m_localCrawlInfo.m_pageDownloadAttempts
);
// LOAD GLOBAL
sprintf ( tmp1 , "%scoll.%s.%li/globalcrawlinfo.txt",
g_hostdb.m_dir , m_coll , (long)m_collnum );
log("coll: loading %s",tmp1);
sb.reset();
if ( sb.fillFromFile ( tmp1 ) > 0 )
sscanf ( sb.getBufStart() ,
"indexAttempts:%lli\n"
"processAttempts:%lli\n"
"downloadAttempts:%lli\n"
"lastupdate:%lu\n"
, &m_globalCrawlInfo.m_pageIndexAttempts
, &m_globalCrawlInfo.m_pageProcessAttempts
, &m_globalCrawlInfo.m_pageDownloadAttempts
, &m_globalCrawlInfoUpdateTime
);
// ignore errors i guess
g_errno = 0;
}
// always turn on distributed spider locking because otherwise
// we end up calling Msg50 which calls Msg25 for the same root url
// at the same time, thereby wasting massive resources. it is also
@ -242,6 +297,7 @@ void CollectionRec::fixRec ( ) {
//strcpy(m_regExs [n],"default");
m_regExs[n].set("default");
m_regExs[n].nullTerm();
m_numRegExs++;
m_spiderFreqs [n] = 30; // 30 days default
@ -281,6 +337,50 @@ bool CollectionRec::save ( ) {
if ( ! g_parms.saveToXml ( (char *)this , tmp ) ) return false;
// log msg
log (LOG_INFO,"db: Saved %s.",tmp);//f.getFilename());
//
// save the crawlinfo class in the collectionrec for diffbot
//
if ( g_conf.m_useDiffbot ) {
// SAVE LOCAL
sprintf ( tmp , "%scoll.%s.%li/localcrawlinfo.txt",
g_hostdb.m_dir , m_coll , (long)m_collnum );
log("coll: saving %s",tmp);
SafeBuf sb;
sb.safePrintf("indexAttempts:%lli\n"
"processAttempts:%lli\n"
"downloadAttempts:%lli\n"
, m_localCrawlInfo.m_pageIndexAttempts
, m_localCrawlInfo.m_pageProcessAttempts
, m_localCrawlInfo.m_pageDownloadAttempts
);
if ( sb.dumpToFile ( tmp ) == -1 ) {
log("coll: failed to save file %s : %s",
tmp,mstrerror(g_errno));
g_errno = 0;
}
// SAVE GLOBAL
sprintf ( tmp , "%scoll.%s.%li/globalcrawlinfo.txt",
g_hostdb.m_dir , m_coll , (long)m_collnum );
log("coll: saving %s",tmp);
sb.reset();
sb.safePrintf("indexAttempts:%lli\n"
"processAttempts:%lli\n"
"downloadAttempts:%lli\n"
"lastupdate:%lu\n"
, m_globalCrawlInfo.m_pageIndexAttempts
, m_globalCrawlInfo.m_pageProcessAttempts
, m_globalCrawlInfo.m_pageDownloadAttempts
, m_globalCrawlInfoUpdateTime
);
if ( sb.dumpToFile ( tmp ) == -1 ) {
log("coll: failed to save file %s : %s",
tmp,mstrerror(g_errno));
g_errno = 0;
}
}
// do not need a save now
m_needsSave = false;
return true;

@ -69,6 +69,15 @@
#include "RdbList.h"
#include "Rdb.h" // for RdbBase
// used by diffbot to control spidering per collection
// . holds the per-collection crawl counters that CollectionRec
//   saves/loads to localcrawlinfo.txt / globalcrawlinfo.txt
class CrawlInfo {
 public:
	// pages we attempted to index into the collection
	long long m_pageIndexAttempts;
	// pages we attempted to send through a diffbot api for processing
	long long m_pageProcessAttempts;
	// pages we attempted to download
	long long m_pageDownloadAttempts;
	// . zero every counter
	// . assign members explicitly rather than memset(this,...) so the
	//   class stays correct even if a non-trivial member (SafeBuf etc.)
	//   is ever added later
	void reset() {
		m_pageIndexAttempts    = 0;
		m_pageProcessAttempts  = 0;
		m_pageDownloadAttempts = 0;
	};
};
class CollectionRec {
@ -136,6 +145,7 @@ class CollectionRec {
bool m_needsSave;
bool load ( char *coll , long collNum ) ;
void reset();
void fixRec ( );
@ -355,6 +365,40 @@ class CollectionRec {
// priority of urls being retried, usually higher than normal
char m_retryPriority;
// new diffbot parms
SafeBuf m_diffbotToken;
SafeBuf m_diffbotSeed;
SafeBuf m_diffbotApi;
SafeBuf m_diffbotApiQueryString;
SafeBuf m_diffbotUrlCrawlPattern;
SafeBuf m_diffbotUrlProcessPattern;
SafeBuf m_diffbotPageProcessPattern;
SafeBuf m_diffbotClassify;
// format of output. "csv" or "xml" or "json" or null
SafeBuf m_diffbotFormat;
// what fields to return in the json output: (api dependent)
SafeBuf m_diffbotFields;
long long m_diffbotMaxToCrawl;
long long m_diffbotMaxToProcess;
long long m_diffbotCrawlStartTime;
long long m_diffbotCrawlEndTime;
// for testing their regexes etc...
char m_isDiffbotTestCrawl;
// our local crawling stats
CrawlInfo m_localCrawlInfo;
// total crawling stats summed up from all hosts in network
CrawlInfo m_globalCrawlInfo;
// last time we computed global crawl info
time_t m_globalCrawlInfoUpdateTime;
// for counting replies
long m_replies;
long m_requests;
// for storing callbacks waiting in line for freshest crawl info
SafeBuf m_callbackQueue;
// . now the url regular expressions
// . we chain down the regular expressions
// . if a url matches we use that tagdb rec #

@ -396,28 +396,30 @@ bool Collectiondb::addRec ( char *coll , char *cpc , long cpclen , bool isNew ,
// if we are doing a dump from the command line, skip this stuff
if ( isDump ) return true;
if(isNew) verify = false;
// tell rdbs to add one, too
//if ( ! g_indexdb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_posdb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_datedb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_titledb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_revdb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_sectiondb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_tagdb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_catdb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_checksumdb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_spiderdb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_doledb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_tfndb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_clusterdb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_linkdb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_spiderdb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_doledb.addColl ( coll, verify ) ) goto hadError;
// if first time adding a collrec, initialize the collectionless
// rdbs so they call Rdb::addColl() which makes a new RdbBase for them
// and stores ptr to that base in CollectionRec::m_bases[]
if ( m_numRecsUsed <= 1 ) {
if ( m_numRecsUsed == 1 ) {
g_statsdb.addColl ( NULL );
g_cachedb.addColl ( NULL );
g_serpdb.addColl ( NULL );
@ -505,12 +507,12 @@ bool Collectiondb::deleteRec ( char *coll , bool deleteTurkdb ) {
deleteTurkdb = true;
// no spiders can be out. they may be referencing the CollectionRec
// in XmlDoc.cpp... quite likely.
if ( g_conf.m_spideringEnabled ||
g_spiderLoop.m_numSpidersOut > 0 ) {
log("admin: Can not delete collection while "
"spiders are enabled or active.");
return false;
}
//if ( g_conf.m_spideringEnabled ||
// g_spiderLoop.m_numSpidersOut > 0 ) {
// log("admin: Can not delete collection while "
// "spiders are enabled or active.");
// return false;
//}
// do not allow this if in repair mode
if ( g_repairMode > 0 ) {
log("admin: Can not delete collection while in repair mode.");
@ -531,6 +533,16 @@ bool Collectiondb::deleteRec ( char *coll , bool deleteTurkdb ) {
}
CollectionRec *cr = m_recs [ collnum ];
if ( ! cr ) return log("admin: Collection id problem. Delete failed.");
// spiders off
if ( cr->m_spiderColl &&
cr->m_spiderColl->getTotalOutstandingSpiders() > 0 ) {
log("admin: Can not delete collection while "
"spiders are oustanding for collection. Turn off "
"spiders and wait for them to exit.");
return false;
}
// note it
log("coll: deleting coll %s",cr->m_coll);
// we need a save
m_needsSave = true;
// nuke it on disk

@ -156,8 +156,12 @@ bool Conf::init ( char *dir ) { // , long hostId ) {
if ( g_conf.m_isLive ) g_conf.m_doConsistencyTesting = false;
// and this on
g_conf.m_indexDeletes = true;
// leave it turned off for diffbot since it always needs to be crawling
#ifndef DIFFBOT
// these off
g_conf.m_spideringEnabled = false;
#endif
// this off
g_conf.m_repairingEnabled = false;
// make this 1 day for now (in seconds)
@ -203,7 +207,15 @@ bool Conf::init ( char *dir ) { // , long hostId ) {
// and always keep a decent site quality cache of at least 3M
if ( g_conf.m_siteQualityMaxCacheMem < 3000000 )
g_conf.m_siteQualityMaxCacheMem = 3000000;
m_useDiffbot = false;
#ifdef DIFFBOT
// make sure all collections index into a single unified collection
m_useDiffbot = true;
#endif
// HACK: set this now
setRootIps();

4
Conf.h

@ -183,6 +183,10 @@ class Conf {
long m_clusterdbMinFilesToMerge;
bool m_clusterdbSaveCache;
// if this is true, all collections index into the "main" collection
// but keep their own spiderdb in their collection.
bool m_useDiffbot;
//bool m_indexEventsOnly;
// linkdb for storing linking relations

@ -156,6 +156,9 @@ case EDOCIDCOLLISION : return "DocId collision in titledb";
case ESSLERROR : return "SSL error of some kind";
case EPERMDENIED : return "Permission denied";
case ENOFUNDS : return "Not enough funds in account";
case EDIFFBOTINTERNALERROR: return "Diffbot internal error";
case EDIFFBOTMIMEERROR: return "Diffbot mime error";
case EDIFFBOTBADHTTPSTATUS: return "Diffbot reply bad http status";
}
// if the remote error bit is clear it must be a regular errno
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );

@ -159,6 +159,9 @@ enum {
EDOCIDCOLLISION ,
ESSLERROR ,
EPERMDENIED ,
ENOFUNDS
ENOFUNDS ,
EDIFFBOTINTERNALERROR,
EDIFFBOTMIMEERROR,
EDIFFBOTBADHTTPSTATUS
};
#endif

@ -1434,6 +1434,22 @@ unsigned long Hostdb::makeGroupMask ( long numGroups ) {
return makeGroupId ( numGroups - 1 , numGroups );
}
// return first alive host in a group/shard
// . scans the hosts in the group identified by "groupId" and returns
//   the first one not marked dead
// . falls back to the group's first host if every host is dead
// . removed the unused "live" local from the original
Host *Hostdb::getLiveHostInGroup ( long groupId ) {
	Host *group = getGroup ( groupId );
	for ( long i = 0 ; i < m_numHostsPerGroup ; i++ ) {
		// get it
		Host *h = &group[i];
		// skip if dead
		if ( isDead(h->m_hostId) ) continue;
		// return it if alive
		return h;
	}
	// return first one if all dead
	return &group[0];
}
// . get the Hosts in group with "groupId"
Host *Hostdb::getGroup ( unsigned long groupId , long *numHosts ) {
// set hosts per group

@ -354,6 +354,8 @@ class Hostdb {
long long getNumGlobalEvents ( );
Host *getLiveHostInGroup ( long groupId );
// . returns false if blocks and will call your callback later
// . returns true if doesn't block
// . sets errno on error

@ -746,14 +746,21 @@ void HttpMime::makeMime ( long totalContentLen ,
//sprintf ( m_buf ,
p += sprintf( p,
"HTTP/1.0 %li%s\r\n"
// make it at least 4 spaces so we can change
// the length of the content should we insert
// a login bar in Proxy::storeLoginBar()
"Content-Length: %04li\r\n"
, httpStatus , smsg );
// if content length is not known, as in diffbot.cpp, then
// do not print it into the mime
if ( totalContentLen >= 0 )
p += sprintf ( p ,
// make it at least 4 spaces so we can
// change the length of the content
// should we insert a login bar in
// Proxy::storeLoginBar()
"Content-Length: %04li\r\n"
, totalContentLen );
p += sprintf ( p ,
"%s"
"Content-Type: %s",
httpStatus , smsg ,
totalContentLen , enc , contentType );
enc , contentType );
if ( charset ) p += sprintf ( p , "; charset=%s", charset );
p += sprintf ( p , "\r\n");
p += sprintf ( p ,

@ -614,6 +614,11 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
// procog's ip
// if ( sock && strncmp(iptoa(sock->m_ip),"216.168.36.21",13) == 0)
// m_isLocal = true;
#ifdef DIFFBOT
// diffbot comcast
if ( sock && strncmp(iptoa(sock->m_ip),"50.168.3.61",11) == 0)
m_isLocal = true;
#endif
// roadrunner ip
// if ( sock && strncmp(iptoa(sock->m_ip),"66.162.42.131",13) == 0)
@ -1022,9 +1027,9 @@ long HttpRequest::getLong ( char *field , long defaultLong ) {
if ( i >= len || !is_digit(value[i]) ) return defaultLong;
}
return res;
}
}
long long HttpRequest::getLongLong ( char *field ,
long long HttpRequest::getLongLong ( char *field ,
long long defaultLongLong ) {
long len;
char *value = getValue ( field, &len, NULL );
@ -1043,7 +1048,7 @@ long HttpRequest::getLong ( char *field , long defaultLong ) {
if ( i >= len || !is_digit(value[i]) ) return defaultLongLong;
}
return res;
}
}
float HttpRequest::getFloat ( char *field , double defaultFloat ) {
long len;
@ -1091,6 +1096,22 @@ double HttpRequest::getDouble ( char *field , double defaultDouble ) {
return res;
}
bool HttpRequest::hasField ( char *field ) {
// how long is it?
long fieldLen = gbstrlen ( field );
// scan the field table directly
long i = 0;
for ( ; i < m_numFields ; i++ ) {
if ( fieldLen != m_fieldLens[i] ) continue;
if ( strncmp ( field, m_fields[i], fieldLen ) != 0 ) continue;
// got a match return the true
return true;
}
return false;
}
char *HttpRequest::getValue ( char *field , long *len, long *next ) {
// how long is it?
long fieldLen = gbstrlen ( field );
@ -1146,8 +1167,21 @@ void HttpRequest::parseFields ( char *s , long slen ) {
m_fields [ n ] = s;
// point to = sign
char *equal = strchr ( s , '=' );
// try next field if none here
if ( ! equal ) { s += gbstrlen ( s ) + 1; continue; }
// if no equal sign, maybe it is one of diffbot's valueless
// fields, so support that now
if ( ! equal ) {
// just set value to NULL
char *end = strchr(s,'&');
long len = end - s;
if ( ! end ) len = gbstrlen(s);
m_fieldLens[n] = len;
s[len] = '\0';
m_fieldValues[n] = NULL;
n++;
// skip over the '&' too
s += len + 1;
continue;
}
// set field len
m_fieldLens [ n ] = equal - s;
// set = to \0 so getField() returns NULL terminated field name

@ -118,6 +118,9 @@ class HttpRequest {
char *defaultString = NULL ,
long *next=NULL);
bool hasField ( char *field );
// are we a redir? if so return non-NULL
char *getRedir ( ) { return m_redir; };
long getRedirLen ( ) { return m_redirLen; };

@ -9,6 +9,7 @@
#include "XmlDoc.h" // gbzip
#include "UdpServer.h"
#include "Proxy.h"
#include "Diffbot.h"
// a global class extern'd in .h file
HttpServer g_httpServer;
@ -128,6 +129,11 @@ bool HttpServer::getDoc ( char *url ,
char *proto ,
bool doPost ,
char *cookie ) {
// sanity
if ( ip == -1 )
log("http: you probably didn't mean to set ip=-1 did you? "
"try setting to 0.");
//log(LOG_WARN, "http: get doc %s", url->getUrl());
// use the HttpRequest class
HttpRequest r;
@ -886,6 +892,22 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
return sendErrorReply(s,404,"bad request");
// . if we get a request for this then allow Diffbot.cpp to
// handle it and send back the right stuff
if ( strcmp ( path , "/dev/crawl" ) == 0 ||
strcmp ( path , "/dev/crawl/" ) == 0 )
// this will call g_httpServer.sendDynamicPage() to send
// back the reply when it is done generating the reply.
// this function is in Diffbot.cpp.
return printCrawlBotPage ( s , r );
// . is it a diffbot api request, like "GET /api/*"
// . ie "/api/startcrawl" or "/api/stopcrawl" etc.?
if ( strncmp ( path , "/api/" , 5 ) == 0 )
// this will call g_httpServer.sendDynamicPage() to send
// back the reply when it is done generating the reply.
// this function is in Diffbot.cpp.
return handleDiffbotRequest ( s , r );
// for adding to browser list of search engines

@ -60,7 +60,7 @@ OBJS = Tfndb.o UdpSlot.o \
Users.o Images.o Wiki.o Wiktionary.o Scraper.o \
Dates.o Sections.o SiteGetter.o Syncdb.o \
Placedb.o Address.o Test.o GeoIP.o GeoIPCity.o Synonyms.o \
Cachedb.o Monitordb.o dlstubs.o
Cachedb.o Monitordb.o dlstubs.o Diffbot.o
CHECKFORMATSTRING = -D_CHECK_FORMAT_STRING_
@ -71,6 +71,7 @@ HOST=$(shell hostname)
#print_vars:
# $(HOST)
# force 32-bit mode using -m32 (apt-get install gcc-multilib to ensure works)
# and -m32 should use /usr/lib32/ as the library path.
# for old kernel 2.4 we don't use pthreads, just clone. so if compiling
@ -88,6 +89,13 @@ LIBS= -L. ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a ./libstdc++.a
endif
# special diffbot compiling case to default g_conf.m_useDiffbot to true
ifeq ("neo","$(HOST)")
CPPFLAGS = -m32 -g -Wall -pipe -Wno-write-strings -Wstrict-aliasing=0 -Wno-uninitialized -static -D_PTHREADS_ -Wno-unused-but-set-variable -DDIFFBOT
LIBS= -L. ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a ./libstdc++.a -lpthread
endif
# let's keep the libraries in the repo for easier bug reporting and debugging
# in general if we can. the includes are still in /usr/include/ however...
# which is kinda strange but seems to work so far.
@ -285,8 +293,8 @@ RdbBuckets.o:
Linkdb.o:
$(CC) $(DEFS) $(CPPFLAGS) -O3 -c $*.cpp
XmlDoc.o:
$(CC) $(DEFS) $(CPPFLAGS) -O3 -c $*.cpp
#XmlDoc.o:
# $(CC) $(DEFS) $(CPPFLAGS) -O3 -c $*.cpp
seo.o:
$(CC) $(DEFS) $(CPPFLAGS) -O3 -c $*.cpp

@ -1459,8 +1459,8 @@ void Mem::gbfree ( void *ptr , int size , const char *note ) {
if ( slot < 0 ) {
log(LOG_LOGIC,"mem: could not find slot (note=%s)",note);
// return for now so procog does not core all the time!
return;
//char *xx = NULL; *xx = 0;
//return;
char *xx = NULL; *xx = 0;
}
#ifdef _EFENCE_

@ -198,8 +198,12 @@ bool Msg0::getList ( long long hostId , // host to ask (-1 if none)
// . groupMask must turn on higher bits first (count downwards kinda)
// . titledb and spiderdb use special masks to get groupId
// if diffbot.cpp is reading spiderdb from each shard we have to
// get groupid from hostid here lest we core in getGroupId() below
if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB )
m_groupId = 0;
// did they force it? core until i figure out what this is
if ( forceParitySplit >= 0 )
else if ( forceParitySplit >= 0 )
m_groupId = g_hostdb.getGroupId ( forceParitySplit );
else
m_groupId = getGroupId ( m_rdbId , startKey , ! noSplit );

@ -285,6 +285,10 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
// show gigabits?
long gb = hr->getLong("gigabits",0);
if ( gb >= 1 ) sb.safePrintf("&gigabits=%li",gb);
// propagate collection
long clen;
char *coll = hr->getString("c",&clen,"",NULL);
if ( coll ) sb.safePrintf("&c=%s",coll);
// provide hash of the query so clients can't just pass in
// a bogus id to get search results from us
unsigned long h32 = hash32n(qstr);
@ -390,8 +394,10 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
);
// contents of search box
sb.htmlEncode ( qstr , qlen , false );
sb.safePrintf ("\">"
"<input type=submit value=\"Search\" border=0>"
sb.safePrintf ("\">");
// propagate collection on subsequent searches
sb.safePrintf("<input name=c type=hidden value=\"%s\">",coll);
sb.safePrintf("<input type=submit value=\"Search\" border=0>"
"<br>"
"<br>"
"Try your search (not secure) on: &nbsp;&nbsp; "
@ -1186,7 +1192,7 @@ bool gotResults ( void *state ) {
// print the word
char *t = qw->m_word;
long tlen = qw->m_wordLen;
sb.utf8Encode ( t , tlen );
sb.utf8Encode2 ( t , tlen );
sb.safePrintf (" ");
}
// print tail if we had ignored terms
@ -1246,7 +1252,7 @@ bool gotResults ( void *state ) {
qe2 );
// close it up
sb.safePrintf ("\"><i><b>");
sb.utf8Encode(st->m_spell, len);
sb.utf8Encode2(st->m_spell, len);
// then finish it off
sb.safePrintf ("</b></i></a></font>\n<br><br>\n");
}
@ -1830,13 +1836,13 @@ static int printResult ( SafeBuf &sb,
backTag,
0,
0 ); // niceness
//if (!sb.utf8Encode(tt, hlen)) return false;
//if (!sb.utf8Encode2(tt, hlen)) return false;
if ( ! sb.brify ( tt,hlen,0,cols) ) return false;
}
else if ( str && strLen ) {
// determine if TiTle wraps, if it does add a <br> count for
// each wrap
//if (!sb.utf8Encode(str , strLen )) return false;
//if (!sb.utf8Encode2(str , strLen )) return false;
if ( ! sb.brify ( str,strLen,0,cols) ) return false;
}
// . use "UNTITLED" if no title

@ -1624,6 +1624,81 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
*/
}
sb->safePrintf("</center><br/>" );
if ( top ) return status;
//
// if diffbot give the crawlbot api here mostly for testing
//
char *hyphen = NULL;
if ( g_conf.m_useDiffbot )
hyphen = strchr ( coll , '-');
if ( g_conf.m_useDiffbot ) {
sb->safePrintf("<br>"
"<center>"
"Diffbot API: &nbsp; " );
// /api/startcrawl
sb->safePrintf(" <a href=/dev/crawl>startcrawl</a>");
}
if ( hyphen ) {
// /api/stopcrawl
sb->safePrintf("&nbsp; <a href=/api/stopcrawl?token=");
sb->safeMemcpy ( coll, hyphen - coll );
sb->safePrintf("&id=%s>stopcrawl</a>"
,hyphen+1);
// /api/resumecrawl
sb->safePrintf("&nbsp; <a href=/api/resumecrawl?token=");
sb->safeMemcpy ( coll, hyphen - coll );
sb->safePrintf("&id=%s>resumecrawl</a>"
,hyphen+1);
// crawls
sb->safePrintf(" &nbsp; <a href=/api/crawls?token=");
sb->safeMemcpy ( coll, hyphen - coll );
sb->safePrintf(" title=\"show all crawl collections\">"
"crawls</a>");
// activecrawls
sb->safePrintf(" &nbsp; <a href=/api/activecrawls?id=%s ",
hyphen+1);
sb->safePrintf(" title=\"show stats on one crawl\">"
"activecrawls</a>");
// downloadurls
sb->safePrintf(" &nbsp; <a href=/api/downloadurls?id=%s ",
hyphen+1);
sb->safePrintf(" title=\"download urls in a crawl's "
"spiderdb\">downloadurls</a>");
// download crawl urls
sb->safePrintf(" &nbsp; <a href=/api/downloadcrawl?id=%s ",
hyphen+1);
sb->safePrintf(" title=\"download urls from crawl\">"
"downloadcrawl (urls)</a>");
// download json objects
sb->safePrintf(" &nbsp; <a href=/api/downloadcrawl?"
"id=%s&format=json ",
hyphen+1);
sb->safePrintf(" title=\"download urls from crawl\">"
"downloadcrawl (json)</a>");
}
if ( g_conf.m_useDiffbot ) {
sb->safePrintf("</center>\n");
sb->safePrintf("<br>");
}
//sprintf(p,"</font>\n" );
//p += gbstrlen(p);
return status;

184
Parms.cpp

@ -1204,7 +1204,8 @@ bool Parms::printParms ( SafeBuf* sb , long page , char *username,//long user,
status &=printParm ( sb, username,&m_parms[i],i,
j, jend, (char *)THIS,
coll,NULL,
bg,nc,pd);
bg,nc,pd,
false);
continue;
}
// if not first in a row, skip it, we printed it already
@ -1222,7 +1223,7 @@ bool Parms::printParms ( SafeBuf* sb , long page , char *username,//long user,
k++ )
status &=printParm(sb,username,&m_parms[k],k,
newj,jend,(char *)THIS,coll,NULL,bg,
nc,pd);
nc,pd, j==size-1);
}
// end array table
//if ( m->m_max > 1 ) {
@ -1656,7 +1657,8 @@ bool Parms::printParm ( SafeBuf* sb,
char *pwd ,
char *bg ,
long nc ,
long pd ) {
long pd ,
bool lastRow ) {
bool status = true;
// do not print if no permissions
if ( m->m_perms != 0 && !g_users.hasPermission(username,m->m_perms) )
@ -1864,8 +1866,14 @@ bool Parms::printParm ( SafeBuf* sb,
else if ( t == TYPE_CHECKBOX ) {
char *ddd = "";
if ( *s ) ddd = " checked";
sb->safePrintf("<center>"
"<input type=checkbox ");
// this is part of the "HACK" fix below. you have to
// specify the cgi parm in the POST request, and unchecked
// checkboxes are not included in the POST request.
if ( lastRow && m->m_page == PAGE_FILTERS )
sb->safePrintf("<center><input type=hidden ");
else
sb->safePrintf("<center>"
"<input type=checkbox ");
if ( m->m_page == PAGE_FILTERS)
sb->safePrintf("id=id_%s ",cgi);
@ -1933,6 +1941,22 @@ bool Parms::printParm ( SafeBuf* sb,
sb->dequote ( s , gbstrlen(s) );
sb->safePrintf ("\">");
}
else if ( t == TYPE_SAFEBUF ) {
long size = m->m_size;
// give regular expression box on url filters page more room
if ( m->m_page == PAGE_FILTERS ) {
if ( size > REGEX_TXT_MAX ) size = REGEX_TXT_MAX;
}
else {
if ( size > 20 ) size = 20;
}
sb->safePrintf ("<input type=text name=%s size=%li value=\"",
cgi,size);
//sb->dequote ( s , gbstrlen(s) );
SafeBuf *sx = (SafeBuf *)s;
sb->dequote ( sx->getBufStart() , sx->length() );
sb->safePrintf ("\">");
}
else if ( t == TYPE_STRINGBOX ) {
sb->safePrintf("<textarea rows=10 cols=64 name=%s>",cgi);
//p += urlEncode ( p , pend - p , s , gbstrlen(s) );
@ -2505,12 +2529,21 @@ void Parms::setParm ( char *THIS , Parm *m , long mm , long j , char *s ,
}
// if we are setting a guy in an array AND he is NOT the first
// in his row, ensure the guy before has a count of j+1 or more
// in his row, ensure the guy before has a count of j+1 or more.
//
// crap, on the url filters page if you do not check "spidering
// enabled" checkbox when adding a new rule at the bottom of the
// table, , then the spidering enabled parameter does not transmit so
// the "respider frequency" ends up checking the "spider enabled"
// array whose "count" was not incremented like it should have been.
// HACK: make new line at bottom always have spidering enabled
// checkbox set and make it impossible to unset.
if ( m->m_max > 1 && m->m_rowid >= 0 && mm > 0 &&
m_parms[mm-1].m_rowid == m->m_rowid ) {
char *pos = (char *)THIS + m_parms[mm-1].m_off - 4 ;
long maxcount = *(long *)pos;
if ( j >= maxcount ) {
log("admin: parm before \"m\" is limiting us");
//log("admin: try nuking the url filters or whatever "
// "and re-adding");
return;
@ -2609,15 +2642,19 @@ void Parms::setParm ( char *THIS , Parm *m , long mm , long j , char *s ,
! isHtmlEncoded && oldLen == len &&
memcmp ( sb->getBufStart() , s , len ) == 0 )
return;
// nuke it
sb->purge();
// this means that we can not use string POINTERS as parms!!
if ( ! isHtmlEncoded ) sb->safeMemcpy ( s , len );
else len = sb->htmlDecode (s,len,false,0);
// ensure null terminated
sb->nullTerm();
// null term it all
//dst[len] = '\0';
sb->reserve ( 1 );
//sb->reserve ( 1 );
// null terminate but do not include as m_length so the
// memcmp() above still works right
sb->m_buf[sb->m_length] = '\0';
//sb->m_buf[sb->m_length] = '\0';
// . might have to set length
// . used for CollectionRec::m_htmlHeadLen and m_htmlTailLen
//if ( m->m_plen >= 0 )
@ -2891,6 +2928,7 @@ bool Parms::setFromFile ( void *THIS ,
// now, extricate from the <![CDATA[ ... ]]> tag if we need to
if ( m->m_type == TYPE_STRING ||
m->m_type == TYPE_STRINGBOX ||
m->m_type == TYPE_SAFEBUF ||
m->m_type == TYPE_STRINGNONEMPTY ) {
char *oldv = v;
long oldvlen = vlen;
@ -3210,6 +3248,10 @@ skip2:
}
*/
// debug point
//if ( m->m_type == TYPE_SAFEBUF )
// log("hey");
// loop over all in this potential array
for ( j = 0 ; j < count ; j++ ) {
// the xml
@ -3219,6 +3261,7 @@ skip2:
// print CDATA if string
if ( m->m_type == TYPE_STRING ||
m->m_type == TYPE_STRINGBOX ||
m->m_type == TYPE_SAFEBUF ||
m->m_type == TYPE_STRINGNONEMPTY ) {
sprintf ( p , "<![CDATA[" );
p += gbstrlen ( p );
@ -3233,6 +3276,7 @@ skip2:
// print CDATA if string
if ( m->m_type == TYPE_STRING ||
m->m_type == TYPE_STRINGBOX ||
m->m_type == TYPE_SAFEBUF ||
m->m_type == TYPE_STRINGNONEMPTY ) {
sprintf ( p , "]]>" );
p += gbstrlen ( p );
@ -3343,6 +3387,14 @@ char *Parms::getParmHtmlEncoded ( char *p , char *pend , Parm *m , char *s ) {
sprintf (p,"%li",*(long *)s);
else if ( t == TYPE_LONG_LONG )
sprintf (p,"%lli",*(long long *)s);
else if ( t == TYPE_SAFEBUF ) {
SafeBuf *sb = (SafeBuf *)s;
p = htmlEncode ( p ,
pend ,
sb->getBufStart(),
sb->getBufStart() + sb->length(),
true ); // #?*
}
else if ( t == TYPE_STRING ||
t == TYPE_STRINGBOX ||
t == TYPE_STRINGNONEMPTY ||
@ -3434,6 +3486,7 @@ bool Parms::serialize( char *buf, long *bufSize ) {
if ( m->m_type == TYPE_STRING ) size = m->m_size;
if ( m->m_type == TYPE_STRINGBOX ) size = m->m_size;
if ( m->m_type == TYPE_STRINGNONEMPTY ) size = m->m_size;
if ( m->m_type == TYPE_SAFEBUF ) size = m->m_size;
if ( m->m_type == TYPE_SITERULE ) size = 4;
// . set size to the total size of array
@ -3573,6 +3626,7 @@ bool Parms::serializeConfParm( Parm *m, long i, char **p, char *end,
return false;
}
// TODO: add TYPE_SAFEBUF support
bool Parms::serializeCollParm( CollectionRec *cr,
Parm *m, long i, char **p, char *end,
long size, long cnt,
@ -5102,7 +5156,7 @@ void Parms::init ( ) {
m->m_cgi = "seatonep";
m->m_off = (char *)&g_conf.m_sendParmChangeAlertsToEmail1 - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_def = "0";
m->m_priv = 2;
m->m_group = 0;
m++;
@ -5156,7 +5210,7 @@ void Parms::init ( ) {
m->m_cgi = "seattwop";
m->m_off = (char *)&g_conf.m_sendParmChangeAlertsToEmail2 - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_def = "0";
m->m_priv = 2;
m->m_group = 0;
m++;
@ -5210,7 +5264,7 @@ void Parms::init ( ) {
m->m_cgi = "seatthreep";
m->m_off = (char *)&g_conf.m_sendParmChangeAlertsToEmail3 - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_def = "0";
m->m_priv = 2;
m->m_group = 0;
m++;
@ -5265,7 +5319,7 @@ void Parms::init ( ) {
m->m_cgi = "seatfourp";
m->m_off = (char *)&g_conf.m_sendParmChangeAlertsToEmail4 - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_def = "0";
m->m_priv = 2;
m->m_group = 0;
m++;
@ -7659,6 +7713,105 @@ void Parms::init ( ) {
m++;
*/
/////////////////////
//
// DIFFBOT CRAWLBOT PARMS
//
//////////////////////
m->m_cgi = "dbseed";
m->m_xml = "diffbotSeed";
m->m_off = (char *)&cr.m_diffbotSeed - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m++;
m->m_cgi = "dbtoken";
m->m_xml = "diffbotToken";
m->m_off = (char *)&cr.m_diffbotToken - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m++;
m->m_cgi = "dbapi";
m->m_xml = "diffbotApi";
m->m_off = (char *)&cr.m_diffbotApi - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m++;
m->m_cgi = "dbapiqs";
m->m_xml = "diffbotApiQueryString";
m->m_off = (char *)&cr.m_diffbotApiQueryString - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m++;
m->m_cgi = "dbucp";
m->m_xml = "diffbotUrlCrawlPattern";
m->m_off = (char *)&cr.m_diffbotUrlCrawlPattern - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m++;
m->m_cgi = "dbupp";
m->m_xml = "diffbotUrlProcessPattern";
m->m_off = (char *)&cr.m_diffbotUrlProcessPattern - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m++;
m->m_cgi = "dbppp";
m->m_xml = "diffbotPageProcessPattern";
m->m_off = (char *)&cr.m_diffbotPageProcessPattern - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m++;
m->m_cgi = "dbclassify";
m->m_xml = "diffbotClassify";
m->m_off = (char *)&cr.m_diffbotClassify - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m++;
m->m_cgi = "dbmaxtocrawl";
m->m_xml = "diffbotMaxToCrawl";
m->m_off = (char *)&cr.m_diffbotMaxToCrawl - x;
m->m_type = TYPE_LONG_LONG;
m->m_page = PAGE_NONE;
m++;
m->m_cgi = "dbmaxtoprocess";
m->m_xml = "diffbotMaxToProcess";
m->m_off = (char *)&cr.m_diffbotMaxToProcess - x;
m->m_type = TYPE_LONG_LONG;
m->m_page = PAGE_NONE;
m++;
m->m_cgi = "dbcrawlstarttime";
m->m_xml = "diffbotCrawlStartTime";
m->m_off = (char *)&cr.m_diffbotCrawlStartTime - x;
m->m_type = TYPE_LONG_LONG;
m->m_page = PAGE_NONE;
m++;
m->m_cgi = "dbcrawlendtime";
m->m_xml = "diffbotCrawlEndTime";
m->m_off = (char *)&cr.m_diffbotCrawlEndTime - x;
m->m_type = TYPE_LONG_LONG;
m->m_page = PAGE_NONE;
m++;
m->m_cgi = "isdbtestcrawl";
m->m_xml = "isDiffbotTestCrawl";
m->m_off = (char *)&cr.m_isDiffbotTestCrawl - x;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_NONE;
m++;
///////////////////////////////////////////
// SPIDER CONTROLS
///////////////////////////////////////////
@ -7678,7 +7831,7 @@ void Parms::init ( ) {
m->m_cgi = "cse";
m->m_off = (char *)&cr.m_spideringEnabled - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_def = "1";
m++;
/*
@ -12040,7 +12193,8 @@ void Parms::init ( ) {
m->m_off = (char *)cr.m_regExs - x;
// this is a safebuf, dynamically allocated string really
m->m_type = TYPE_SAFEBUF;//STRINGNONEMPTY
m->m_size = MAX_REGEX_LEN+1;
// the size of each element in the array:
m->m_size = sizeof(SafeBuf);//MAX_REGEX_LEN+1;
m->m_page = PAGE_FILTERS;
m->m_rowid = 1; // if we START a new row
m->m_def = "";
@ -15096,6 +15250,8 @@ void Parms::overlapTest ( char step ) {
m_parms[i].m_desc);
}
log("conf: try including \"m->m_obj = OBJ_COLL;\" or "
"\"m->m_obj = OBJ_CONF;\" in your parm definitions");
log("conf: failed overlap test. exiting.");
exit(-1);

24
Parms.h

@ -31,7 +31,7 @@ enum {
TYPE_FLOAT ,
TYPE_IP ,
TYPE_LONG ,
TYPE_LONG_LONG ,
TYPE_LONG_LONG , // 10
TYPE_NONE ,
TYPE_PRIORITY ,
TYPE_PRIORITY2 ,
@ -41,7 +41,7 @@ enum {
TYPE_STRINGBOX ,
TYPE_STRINGNONEMPTY ,
TYPE_TIME ,
TYPE_DATE2 ,
TYPE_DATE2 , // 20
TYPE_DATE ,
TYPE_RULESET ,
TYPE_FILTER ,
@ -50,7 +50,7 @@ enum {
TYPE_MONOD2 ,
TYPE_MONOM2 ,
TYPE_LONG_CONST ,
TYPE_SITERULE ,
TYPE_SITERULE , // 29
TYPE_SAFEBUF
};
@ -147,7 +147,7 @@ class Parm {
char * getValueAsString ( class SearchInput *si ) ;
};
#define MAX_PARMS 840
#define MAX_PARMS 940
#define MAX_XML_CONF (200*1024)
@ -171,15 +171,16 @@ class Parms {
long page , char *coll , char *pwd ) ;
char *printParms (char *p, char *pend, TcpSocket *s , HttpRequest *r );
//char *printParms (char *p, char *pend, TcpSocket *s, HttpRequest *r);
bool printParms (SafeBuf* sb, TcpSocket *s , HttpRequest *r );
char *printParms (char *p,char *pend,long page,char *username,
void *THIS, char *coll , char *pwd ,
long nc , long pd ) ;
//char *printParms (char *p,char *pend,long page,char *username,
// void *THIS, char *coll , char *pwd ,
// long nc , long pd ) ;
bool printParms (SafeBuf* sb, long page,char *username,void *THIS,
char *coll , char *pwd , long nc , long pd ) ;
char *coll , char *pwd , long nc , long pd );
/*
char *printParm ( char *p ,
char *pend ,
//long user ,
@ -194,6 +195,8 @@ class Parms {
char *bg ,
long nc ,
long pd ) ;
*/
bool printParm ( SafeBuf* sb,
//long user ,
char *username,
@ -206,7 +209,8 @@ class Parms {
char *pwd ,
char *bg ,
long nc ,
long pd ) ;
long pd ,
bool lastRow ) ;
char *getTHIS ( HttpRequest *r , long page ) ;

@ -1396,7 +1396,7 @@ void Process::disableTreeWrites ( ) {
}
// disable all spider trees and tables
for ( long i = 0 ; i < g_collectiondb.getNumRecs() ; i++ ) {
SpiderColl *sc = g_spiderCache.getSpiderColl(i);
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(i);
if ( ! sc ) continue;
sc->m_waitingTree .disableWrites();
sc->m_waitingTable.disableWrites();
@ -1413,7 +1413,7 @@ void Process::enableTreeWrites ( ) {
}
// enable all waiting trees
for ( long i = 0 ; i < g_collectiondb.getNumRecs() ; i++ ) {
SpiderColl *sc = g_spiderCache.getSpiderColl(i);
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(i);
if ( ! sc ) continue;
sc->m_waitingTree .enableWrites();
sc->m_waitingTable.enableWrites();

13
Rdb.cpp

@ -229,7 +229,9 @@ bool Rdb::init ( char *dir ,
m_dbname ,
m_ks ,
// make useProtection true for debugging
false ) ) // use protection?
false , // use protection?
false , // alowdups?
m_rdbId ) )
return false;
}
else {
@ -244,7 +246,9 @@ bool Rdb::init ( char *dir ,
m_dbname ,
m_ks ,
// make useProtection true for debugging
false ); // use protection?
false , // use protection?
false , // alowdups?
m_rdbId );
}
// set this then
sprintf(m_treeName,"buckets-%s",m_dbname);
@ -846,7 +850,8 @@ bool Rdb::loadTree ( ) {
//log (0,"Rdb::loadTree: loading %s",filename);
// set a BigFile to this filename
BigFile file;
file.set ( getDir() , filename , NULL ); // getStripeDir() );
char *dir = getDir();
file.set ( dir , filename , NULL ); // getStripeDir() );
bool treeExists = file.doesExist() > 0;
bool status = false ;
if ( treeExists ) {
@ -2163,7 +2168,7 @@ bool Rdb::addRecord ( collnum_t collnum,
}
else if ( (tn=m_tree.addNode ( collnum, key , data , dataSize ))>=0) {
// if adding to spiderdb, add to cache, too
if ( m_rdbId != RDB_SPIDERDB || m_rdbId != RDB_DOLEDB )
if ( m_rdbId != RDB_SPIDERDB && m_rdbId != RDB_DOLEDB )
return true;
// or if negative key
if ( KEYNEG(key) ) return true;

@ -92,7 +92,8 @@ bool RdbTree::set ( long fixedDataSize ,
char *dbname ,
char keySize ,
bool useProtection ,
bool allowDups ) {
bool allowDups ,
char rdbId ) {
reset();
m_fixedDataSize = fixedDataSize;
m_doBalancing = doBalancing;
@ -120,9 +121,9 @@ bool RdbTree::set ( long fixedDataSize ,
if ( dbname ) strncpy ( p , dbname , 8 ); p += 8;
*p++ = '\0';
// set rdbid
m_rdbId = -1;
m_rdbId = rdbId; // -1;
// if its doledb, set it
if ( dbname && strcmp(dbname,"doledb") == 0 ) m_rdbId = RDB_DOLEDB;
//if ( dbname && strcmp(dbname,"doledb") == 0 ) m_rdbId = RDB_DOLEDB;
// adjust m_maxMem to virtual infinity if it was -1
if ( m_maxMem < 0 ) m_maxMem = 0x7fffffff;
// . compute each node's memory overhead
@ -2994,11 +2995,11 @@ void RdbTree::cleanTree ( ) { // char **bases ) {
long RdbTree::getNumNegativeKeys ( collnum_t collnum ) {
return g_collectiondb.m_recs[collnum]->
m_numNegKeysInTree[m_rdbId];
m_numNegKeysInTree[(unsigned char)m_rdbId];
}
long RdbTree::getNumPositiveKeys ( collnum_t collnum ) {
return g_collectiondb.m_recs[collnum]->
m_numPosKeysInTree[m_rdbId];
m_numPosKeysInTree[(unsigned char)m_rdbId];
}

@ -91,7 +91,8 @@ class RdbTree {
bool dataInPtrs = false ,
char *dbname = NULL , char keySize = 12 ,
bool useProtection = false ,
bool allowDups = false );
bool allowDups = false ,
char rdbId = -1 );
// . frees the used memory, etc.
// . override so derivatives can free up extra header arrays

@ -680,7 +680,7 @@ bool SafeBuf::setEncoding(short cs) {
return true;
}
bool SafeBuf::utf8Encode(char *s, long len, bool encodeHTML,long niceness) {
bool SafeBuf::utf8Encode2(char *s, long len, bool encodeHTML,long niceness) {
long tmp = m_length;
if ( m_encoding == csUTF8 ) {
if (! safeMemcpy(s,len)) return false;
@ -1786,6 +1786,7 @@ bool SafeBuf::htmlEncodeXmlTags ( char *s , long slen , long niceness ) {
}
bool SafeBuf::safeStrcpy ( char *s ) {
if ( ! s ) return true;
long slen = gbstrlen(s);
return safeMemcpy(s,slen);
}
@ -2491,6 +2492,182 @@ bool SafeBuf::decodeJSON ( long niceness ) {
return true;
}
// . decode JSON backslash escapes in this buffer: \n \r \t \b \f \\ \/
//   and \uXXXX (BMP code points, encoded to utf8)
// . replaces our buffer with the decoded copy
// . returns false (g_errno set by reserve) on allocation failure
bool SafeBuf::decodeJSONToUtf8 ( long niceness ) {
	// count how many \u's we got so we can reserve enough room.
	// a "\uXXXX" is 6 source bytes and at most 3 bytes of utf8, but
	// the quote re-escape below can emit 2 bytes for 1, so pad by 2
	// per escape to be safe.
	long need = 0;
	char *p = m_buf;
	for ( ; *p ; p++ )
		if ( *p == '\\' && p[1] == 'u' ) need += 2;
	// decode into a fresh buffer, then steal its memory
	SafeBuf dbuf;
	// was unchecked before; a failed reserve would make us write
	// through a NULL m_buf below
	if ( ! dbuf.reserve ( need + m_length + 1 ) ) return false;
	char *src = m_buf;
	char *dst = dbuf.m_buf;
	for ( ; *src ; ) {
		QUICKPOLL(niceness);
		if ( *src == '\\' ) {
			// \n? (from json.org homepage)
			if ( src[1] == 'n' ) {
				*dst++ = '\n';
				src += 2;
				continue;
			}
			if ( src[1] == 'r' ) {
				*dst++ = '\r';
				src += 2;
				continue;
			}
			if ( src[1] == 't' ) {
				*dst++ = '\t';
				src += 2;
				continue;
			}
			if ( src[1] == 'b' ) {
				*dst++ = '\b';
				src += 2;
				continue;
			}
			if ( src[1] == 'f' ) {
				*dst++ = '\f';
				src += 2;
				continue;
			}
			// a "\\" is an encoded backslash
			if ( src[1] == '\\' ) {
				*dst++ = '\\';
				src += 2;
				continue;
			}
			// a "\/" is an encoded forward slash
			if ( src[1] == '/' ) {
				*dst++ = '/';
				src += 2;
				continue;
			}
			// utf8? if not, just skip the slash
			if ( src[1] != 'u' ) { src++; continue; }
			// "\u" must be followed by exactly 4 hex digits.
			// if not, skip the backslash like any other unknown
			// escape. (the old code did a bare "continue" here
			// without advancing src -- an infinite loop)
			char *hex = src + 2;
			if ( ! is_hex(hex[0]) ||
			     ! is_hex(hex[1]) ||
			     ! is_hex(hex[2]) ||
			     ! is_hex(hex[3]) ) { src++; continue; }
			// TODO: support surrogate pairs in utf16?
			UChar32 uc = 0;
			// store the 16-bit number in lower 16 bits of uc...
			hexToBin ( hex   , 2 , ((char *)&uc)+1 );
			hexToBin ( hex+2 , 2 , ((char *)&uc)+0 );
			long size = ::utf8Encode ( (UChar32)uc , (char *)dst );
			// a quote??? not allowed in json! keep it escaped
			if ( size == 1 && dst[0] == '\"' ) {
				size = 2;
				dst[0] = '\\';
				dst[1] = '\"';
			}
			dst += size;
			// skip over the "\u" and the 4 hex digits
			src += 6;
			continue;
		}
		*dst++ = *src++;
	}
	*dst = '\0';
	dbuf.m_length = dst - dbuf.m_buf;
	// purge ourselves
	purge();
	// and steal dbuf's m_buf
	m_buf       = dbuf.m_buf;
	m_length    = dbuf.m_length;
	m_capacity  = dbuf.m_capacity;
	m_usingStack = dbuf.m_usingStack;
	// detach from dbuf so he does not free it
	dbuf.detachBuf();
	return true;
}
// . REALLY just a print vanity function. makes json output prettier
//
// . after converting JSON to utf8 above we sometimes want to go back.
// . just print that out. encode \n's and \r's back to \\n \\r
// and backslash to a \\ ... etc.
// . but if they originally had a \u<backslash> encoding and we decoded
// it to a backslash, here it will be re-encoded as (double backslash)
// . like wise if that originally had a \u<quote> encoding we should
// have decoded it as a \"!
// . this does not need to be super fast because it will be used for
// showing cached pages or dumping out the json objects from a crawl for
// diffbot
// . really we could leave the newlines decoded etc, but it is prettier
// for printing
// . REALLY just a print vanity function. makes json output prettier.
// . re-encodes the control characters and backslashes that were decoded
//   from json back into their two-byte escaped forms for display
// . appends to whatever content is already in this buffer
// . returns false on allocation failure, true otherwise
bool SafeBuf::safeStrcpyPrettyJSON ( char *decodedJson ) {
	// worst case every input byte doubles, plus a null terminator
	long need = gbstrlen(decodedJson) * 2 + 1;
	if ( ! reserve ( need ) ) return false;
	// concatenate after any bytes already stored
	char *out = m_buf + m_length;
	for ( char *in = decodedJson ; *in ; in++ ) {
		char esc;
		switch ( *in ) {
		case '\t': esc = 't' ; break;
		case '\n': esc = 'n' ; break;
		case '\r': esc = 'r' ; break;
		case '\f': esc = 'f' ; break;
		case '\\': esc = '\\'; break;
		default:
			// plain byte, copy through untouched
			*out++ = *in;
			continue;
		}
		// emit the two-byte escape sequence
		*out++ = '\\';
		*out++ = esc;
	}
	// null term
	*out = '\0';
	m_length = out - m_buf;
	return true;
}
bool SafeBuf::linkify ( long niceness , long startPos ) {

@ -57,6 +57,7 @@ struct SafeBuf {
bool truncateLongWords ( char *src, long srcLen , long minmax );
bool safeTruncateEllipsis ( char *src , long maxLen );
bool convertJSONtoXML ( long niceness , long startConvertPos );
bool decodeJSONToUtf8 ( long niceness );
bool decodeJSON ( long niceness );
bool linkify ( long niceness , long startPos );
@ -70,6 +71,13 @@ struct SafeBuf {
return safeStrcpy ( str );
};
// . chop "lastChar" off the end of the buffer, if that is what it
//   ends with, and re-null-terminate
// . no-op when the buffer is empty or ends in a different char
void removeLastChar ( char lastChar ) {
if ( m_length <= 0 ) return;
if ( m_buf[m_length-1] != lastChar ) return;
m_length--;
m_buf[m_length] = '\0';
};
//MUTATORS
#ifdef _CHECK_FORMAT_STRING_
bool safePrintf(char *formatString, ...)
@ -83,6 +91,7 @@ struct SafeBuf {
bool safeMemcpy(SafeBuf *c){return safeMemcpy(c->m_buf,c->m_length);};
bool safeMemcpy ( class Words *w , long a , long b ) ;
bool safeStrcpy ( char *s ) ;
bool safeStrcpyPrettyJSON ( char *decodedJson ) ;
//bool pushLong ( long val ) { return safeMemcpy((char *)&val,4); }
bool cat(SafeBuf& c);
// . only cat the sections/tag that start with "tagFilter"
@ -96,7 +105,11 @@ struct SafeBuf {
bool reserve(long i, char *label=NULL);
bool reserve2x(long i);
bool inlineStyleTags();
void incrementLength(long i) { m_length += i; }
// . grow (or shrink, with negative "i") the logical length after the
//   caller wrote bytes into the buffer directly
// . does NOT grow capacity; caller must have reserved the space
void incrementLength(long i) {
m_length += i;
// watch out for negative i's
if ( m_length < 0 ) m_length = 0;
};
void setLength(long i) { m_length = i; };
char *getNextLine ( char *p ) ;
long catFile(char *filename) ;
@ -172,9 +185,9 @@ struct SafeBuf {
//insert strings in their native encoding
bool encode ( char *s , long len , long niceness=0) {
return utf8Encode(s,len,false,niceness); };
return utf8Encode2(s,len,false,niceness); };
// htmlEncode default = false
bool utf8Encode(char *s, long len, bool htmlEncode=false,
bool utf8Encode2(char *s, long len, bool htmlEncode=false,
long niceness=0);
bool latin1Encode(char *s, long len, bool htmlEncode=false,
long niceness=0);
@ -230,6 +243,16 @@ struct SafeBuf {
bool cdataEncode ( char *s ) ;
// . append a \0 but do not inc m_length
// . for null terminating strings
// . append a \0 but do not inc m_length, so the buffer can be used
//   as a C string without counting the terminator
// . returns false on allocation failure
// . NOTE(review): reserve(m_capacity + 1) looks like it requests one
//   byte more than the CURRENT capacity rather than one byte past
//   m_length -- confirm reserve()'s argument semantics
bool nullTerm ( ) {
if(m_length >= m_capacity && !reserve(m_capacity + 1) )
return false;
m_buf[m_length] = '\0';
return true;
};
bool safeCdataMemcpy(char *s, long len);
bool pushChar (char i) {
if(m_length >= m_capacity)

@ -196,6 +196,9 @@ class SearchInput *g_si = NULL;
bool SearchInput::set ( TcpSocket *sock , HttpRequest *r , Query *q ) {
// save it now
m_socket = sock;
// get coll rec
long collLen;
char *coll = r->getString ( "c" , &collLen );

@ -401,6 +401,8 @@ class SearchInput {
// make a cookie from parms with m_flags of PF_COOKIE set
SafeBuf m_cookieBuf;
TcpSocket *m_socket;
//char m_urlParms [ MAX_URLPARMS_LEN ];
//char m_postParms [ MAX_URLPARMS_LEN ];

@ -747,7 +747,7 @@ void SpiderCache::save ( bool useThread ) {
//m_isSaving = true;
// loop over all SpiderColls and get the best
for ( long i = 0 ; i < g_collectiondb.getNumRecs() ; i++ ) {
SpiderColl *sc = getSpiderColl(i);//m_spiderColls[i];
SpiderColl *sc = getSpiderCollIffNonNull(i);//m_spiderColls[i];
if ( ! sc ) continue;
RdbTree *tree = &sc->m_waitingTree;
char *filename = "waitingtree";
@ -797,7 +797,7 @@ void SpiderCache::save ( bool useThread ) {
bool SpiderCache::needsSave ( ) {
for ( long i = 0 ; i < g_collectiondb.getNumRecs() ; i++ ) {
SpiderColl *sc = getSpiderColl(i);//m_spiderColls[i];
SpiderColl *sc = getSpiderCollIffNonNull(i);//m_spiderColls[i];
if ( ! sc ) continue;
if ( sc->m_waitingTree.m_needsSave ) return true;
// also the doleIpTable
@ -809,7 +809,7 @@ bool SpiderCache::needsSave ( ) {
void SpiderCache::reset ( ) {
// loop over all SpiderColls and get the best
for ( long i = 0 ; i < g_collectiondb.getNumRecs() ; i++ ) {
SpiderColl *sc = getSpiderColl(i);
SpiderColl *sc = getSpiderCollIffNonNull(i);
if ( ! sc ) continue;
sc->reset();
mdelete ( sc , sizeof(SpiderColl) , "SpiderCache" );
@ -821,6 +821,13 @@ void SpiderCache::reset ( ) {
//m_numSpiderColls = 0;
}
// . like getSpiderColl() but NEVER allocates a new SpiderColl
// . returns NULL if no SpiderColl has been created for this collection
// . callers (save/needsSave/reset, Process tree-write toggles) iterate
//   every rec slot and skip NULL returns, so also return NULL when the
//   CollectionRec slot itself is NULL (e.g. a deleted collection)
//   instead of crashing on the dereference
SpiderColl *SpiderCache::getSpiderCollIffNonNull ( collnum_t collnum ) {
	// shortcut
	CollectionRec *cr = g_collectiondb.m_recs[collnum];
	// empty rec slot? then certainly no spider coll either
	if ( ! cr ) return NULL;
	// return it if non-NULL
	return cr->m_spiderColl;
}
// get SpiderColl for a collection
SpiderColl *SpiderCache::getSpiderColl ( collnum_t collnum ) {
// return it if non-NULL
@ -867,6 +874,8 @@ SpiderColl *SpiderCache::getSpiderColl ( collnum_t collnum ) {
sc->m_cr = cr;
// sanity check
if ( ! cr ) { char *xx=NULL;*xx=0; }
// note it!
log("spider: adding new spider collection for %s",cr->m_coll);
// that was it
return sc;
}
@ -894,6 +903,13 @@ SpiderColl::SpiderColl () {
memset ( m_outstandingSpiders , 0 , 4 * MAX_SPIDER_PRIORITIES );
}
// sum the outstanding spider counts across every priority level
long SpiderColl::getTotalOutstandingSpiders ( ) {
	long total = 0;
	for ( long pri = MAX_SPIDER_PRIORITIES - 1 ; pri >= 0 ; pri-- )
		total += m_outstandingSpiders[pri];
	return total;
}
// load the tables that we set when m_doInitialScan is true
bool SpiderColl::load ( ) {
// error?
@ -937,6 +953,8 @@ bool SpiderColl::load ( ) {
// . try going to 20M now since we hit it again...
if (!m_waitingTree.set(0,-1,true,20000000,true,"waittree2",
false,"waitingtree",sizeof(key_t)))return false;
// prevent core with this
m_waitingTree.m_rdbId = RDB_NONE;
// make dir
char dir[500];
@ -2326,7 +2344,8 @@ bool SpiderColl::scanSpiderdb ( bool needList ) {
if ( sreq->m_url[0] != 'h' &&
// might be a docid from a pagereindex.cpp
! is_digit(sreq->m_url[0]) ) {
log("spider: got corrupt 1 spiderRequest in scan");
log("spider: got corrupt 1 spiderRequest in scan "
"because url is %s",sreq->m_url);
continue;
}
@ -7814,3 +7833,144 @@ void dedupSpiderdbList ( RdbList *list , long niceness , bool removeNegRecs ) {
//mfree ( oldbuf , oldSize, "oldspbuf");
}
///////
//
// diffbot uses these for limiting crawls in a collection
//
///////
void gotCrawlInfoReply ( void *state , UdpSlot *slot);
// an entry in CollectionRec::m_callbackQueue: who to notify once the
// collection's global crawl stats have been (re)aggregated
class CallbackEntry2 {
public:
// opaque state handed back to m_callback
void *m_state;
// invoked by gotCrawlInfoReply() when all host replies are in
void (* m_callback ) ( void *state );
};
// . get total # of pages crawled in this collection over whole network
// . returns false if blocked
// . returns true and sets g_errno on error
bool updateCrawlInfo ( CollectionRec *cr ,
void *state ,
void (* callback)(void *state) ,
bool useCache ) {
// serve the cached global stats if aggregated within the last minute
long now = getTimeLocal();
if ( useCache && now - cr->m_globalCrawlInfoUpdateTime < 60 )
return true;
// wait in line if reply is pending
//if ( cr->m_replies < cr->m_requests || ) {
// . returns false and sets g_errno on error
// . this will store state/callback into a safebuf queue
CallbackEntry2 ce2;
ce2.m_state = state;
ce2.m_callback = callback;
// queue our callback; on alloc failure return true (did not block)
if ( ! cr->m_callbackQueue.safeMemcpy ( &ce2, sizeof(CallbackEntry2)) )
return true;
// if we were not the first, we do not initiate it, we just wait
// for all the replies to come back
if ( cr->m_replies < cr->m_requests ) return false;
// we initiate: zero the accumulators before fanning out
cr->m_globalCrawlInfo.reset();
cr->m_replies = 0;
cr->m_requests = 0;
// request is just the collnum
char *request = (char *)&cr->m_collnum;
long requestSize = sizeof(collnum_t);
// send out the msg request
for ( long i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
Host *h = g_hostdb.getHost(i);
// skip if dead
if ( g_hostdb.isDead(i) ) continue;
// count it as launched
cr->m_requests++;
// msgtype 0xc1 is answered by handleRequestc1() on each host;
// each reply triggers gotCrawlInfoReply() with cr as state
if ( ! g_udpServer.sendRequest ( request,
requestSize,
0xc1 , // msgtype
h->m_ip ,
h->m_port ,
h->m_hostId ,
NULL, // retslot
cr , // state
gotCrawlInfoReply ) ) {
// send failed: count the reply now so we do not wait forever
log("spider: error sending c1 request: %s",
mstrerror(g_errno));
cr->m_replies++;
}
}
// return false if we blocked awaiting replies
if ( cr->m_replies < cr->m_requests ) return false;
// somehow we did not block (all hosts dead or all sends failed):
// finish up synchronously with a NULL slot
gotCrawlInfoReply( cr , NULL );
// we did not block...
return true;
}
// . called once per host with its 0xc1 crawl-stats reply
// . "state" is the CollectionRec we are aggregating global stats into
// . "slot" may be NULL: updateCrawlInfo() calls us directly with NULL
//   when no request actually went out, so everything touching the slot
//   must live inside the if(slot) guard. the old code set
//   slot->m_sendBufAlloc before that check and crashed on NULL.
void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
	// cast it
	CollectionRec *cr = (CollectionRec *)state;
	// count this reply as received
	cr->m_replies++;
	// add it in to the stats
	if ( slot ) {
		// the sendbuf should never be freed! it points into collrec
		slot->m_sendBufAlloc = NULL;
		// the reply is the remote host's local CrawlInfo verbatim
		CrawlInfo *stats = (CrawlInfo *)(slot->m_readBuf);
		cr->m_globalCrawlInfo.m_pageIndexAttempts +=
			stats->m_pageIndexAttempts;
		cr->m_globalCrawlInfo.m_pageProcessAttempts +=
			stats->m_pageProcessAttempts;
		cr->m_globalCrawlInfo.m_pageDownloadAttempts +=
			stats->m_pageDownloadAttempts;
	}
	// return if still waiting on more to come in
	if ( cr->m_replies < cr->m_requests ) return;
	// update cache time so updateCrawlInfo() can serve from cache
	cr->m_globalCrawlInfoUpdateTime = getTime();
	// make it save to disk i guess
	cr->m_needsSave = true;
	// call everyone queued up in updateCrawlInfo()
	long nc = cr->m_callbackQueue.length() / sizeof(CallbackEntry2);
	char *p = cr->m_callbackQueue.getBufStart();
	for ( long i = 0 ; i < nc ; i++ ) {
		CallbackEntry2 *ce2 = (CallbackEntry2 *)p;
		p += sizeof(CallbackEntry2);
		// clear g_errno just in case
		g_errno = 0;
		// call that callback waiting in the queue
		ce2->m_callback ( ce2->m_state );
	}
	// save the mem!
	cr->m_callbackQueue.purge();
}
// . handles msgtype 0xc1: another host asking for our LOCAL crawl
//   stats for one collection (sent by updateCrawlInfo())
// . request payload is just a collnum_t
// . reply payload is our CollectionRec::m_localCrawlInfo verbatim,
//   built in the slot's tmp buf so no allocation is needed
void handleRequestc1 ( UdpSlot *slot , long niceness ) {
char *request = slot->m_readBuf;
// just a single collnum
if ( slot->m_readBufSize != sizeof(collnum_t) ) { char *xx=NULL;*xx=0;}
collnum_t collnum = *(collnum_t *)request;
// NOTE(review): cr is not NULL-checked; a request for a collection
// deleted on this host would crash in the memcpy below -- confirm
CollectionRec *cr = g_collectiondb.getRec(collnum);
char *reply = slot->m_tmpBuf;
// sanity: the fixed-size tmp buf must fit a CrawlInfo
if ( TMPBUFSIZE < sizeof(CrawlInfo) ) { char *xx=NULL;*xx=0; }
memcpy ( reply , &cr->m_localCrawlInfo , sizeof(CrawlInfo) );
g_udpServer.sendReply_ass ( reply ,
sizeof(CrawlInfo) ,
reply , // alloc
sizeof(CrawlInfo) , //alloc size
slot );
}

@ -24,6 +24,13 @@
#include "Msg4.h"
#include "hash.h"
// for diffbot, this is for xmldoc.cpp to update CollectionRec::m_crawlInfo
// which has m_pagesCrawled and m_pagesProcessed.
bool updateCrawlInfo ( CollectionRec *cr ,
void *state ,
void (* callback)(void *state) ,
bool useCache = true ) ;
///////////////////////////////////////
//
// QUICK OVERVIEW
@ -828,6 +835,8 @@ class SpiderColl {
bool load();
long getTotalOutstandingSpiders ( ) ;
key128_t m_firstKey;
// spiderdb is now 128bit keys
key128_t m_nextKey;
@ -966,6 +975,8 @@ class SpiderCache {
// what SpiderColl does a SpiderRec with this key belong?
SpiderColl *getSpiderColl ( collnum_t collNum ) ;
SpiderColl *getSpiderCollIffNonNull ( collnum_t collNum ) ;
// called by main.cpp on exit to free memory
void reset();
@ -1024,7 +1035,7 @@ class Msg12 {
};
void handleRequest12 ( UdpSlot *udpSlot , long niceness ) ;
void handleRequestc1 ( UdpSlot *slot , long niceness ) ;
// . the spider loop
// . it gets urls to spider from the SpiderCache global class, g_spiderCache

@ -1425,6 +1425,9 @@ void writeSocketWrapper ( int sd , void *state ) {
}
// if socket has nothing to send yet cuz we're waiting, wait...
if ( s->m_sendBufUsed == 0 ) return;
sendAgain:
// . writeSocket returns false if blocked, true otherwise
// . it also sets g_errno on errro
// . don't call it if we have g_errno set, however
@ -1435,8 +1438,16 @@ void writeSocketWrapper ( int sd , void *state ) {
if ( status == 1 && ! s->m_readBuf ) return;
// good?
g_errno = 0;
// otherwise, call callback on done reading or error
// otherwise, call callback on done writing or error
THIS->makeCallback ( s );
// if callback changed socket status to ST_SEND_AGAIN
// then let's send the new buffer that it has. Diffbot.cpp uses this.
if ( s->m_sockState == ST_SEND_AGAIN ) {
s->m_sockState = ST_WRITING;
goto sendAgain;
}
// . destroy the socket on error, recycle on transaction completion
// . this will also unregister all our callbacks for the socket
if ( status == -1 ) THIS->destroySocket ( s );

@ -26,6 +26,10 @@
#define ST_CLOSE_CALLED 7
#define ST_SSL_ACCEPT 8
#define ST_SSL_SHUTDOWN 9
// hack to repopulate the socket's send buf when its done sending
// it's current sendbuf in order to transmit large amounts of data that
// can't all fit in memory at the same time:
#define ST_SEND_AGAIN 10
#define TCP_READ_BUF_SIZE 1024

@ -103,6 +103,35 @@ bool Title::setTitle ( XmlDoc *xd ,
long long startTime = gettimeofdayInMilliseconds();
// . reset so matches.cpp using this does not core
// . assume no title tag
m_titleTagStart = -1;
m_titleTagEnd = -1;
// if we are a json object
if ( ! xd->m_contentTypeValid ) { char *xx=NULL;*xx=0; }
char *val = NULL;
long vlen;
// look for the "title:" field in json then use that
if ( xd->m_contentType == CT_JSON )
val = getJSONFieldValue ( xd->ptr_utf8Content,"title",&vlen);
// if we had a title: field in the json...
if ( val ) {
char *dst = NULL;
m_titleBytes = vlen;
if ( m_titleBytes+1 < TITLE_LOCAL_SIZE )
dst = m_localBuf;
else {
dst = (char *)mmalloc ( m_titleBytes+1,"titdst" );
if ( ! dst ) return false;
}
m_title = dst;
memcpy ( dst , val , m_titleBytes );
dst[m_titleBytes] = '\0';
return true;
}
bool status = setTitle4 ( xd ,
xml ,
words ,

File diff suppressed because it is too large Load Diff

@ -91,8 +91,16 @@ bool setLangVec ( class Words *words ,
class Sections *sections ,
long niceness ) ;
char *getJSONFieldValue ( char *json, char *field , long *valueLen ) ;
bool logQueryLogs ( );
bool checkRegex ( SafeBuf *regex ,
char *target ,
bool *boolVal ,
bool *boolValValid ,
long *compileError = NULL ) ;
// Address.cpp calls this to make a vector from the "place name" for comparing
// to other places in placedb using the computeSimilarity() function. if
// we got a >75% similarity we set the AF_VERIFIED_PLACE_NAME bit in the
@ -283,7 +291,13 @@ class XmlDoc {
char m_reserved3b;
uint16_t m_reserved4;//externalLinkTextWeight;
uint16_t m_reserved5;//internalLinkTextWeight;
uint16_t m_reserved6;//conceptWeight;
// a new parm from reserved6. need to know the count so we can
// delete the json objects derived from this page if we want to
// delete this page. or if this page is respidered then we get the
// json objects for it, REject the old json object urls, and inject
// the new ones i guess.
uint16_t m_diffbotJSONCount;
// these do not include header/footer (dup) addresses
//int16_t m_numAddresses;
@ -311,7 +325,24 @@ class XmlDoc {
uint16_t m_hasSiteVenue:1;
uint16_t m_hasContactInfo:1;
uint16_t m_isSiteRoot:1;
uint16_t m_reserved8;
uint16_t m_isDiffbotJSONObject:1;
uint16_t m_reserved802:1;
uint16_t m_reserved803:1;
uint16_t m_reserved804:1;
uint16_t m_reserved805:1;
uint16_t m_reserved806:1;
uint16_t m_reserved807:1;
uint16_t m_reserved808:1;
uint16_t m_reserved809:1;
uint16_t m_reserved810:1;
uint16_t m_reserved811:1;
uint16_t m_reserved812:1;
uint16_t m_reserved813:1;
uint16_t m_reserved814:1;
uint16_t m_reserved815:1;
uint16_t m_reserved816:1;
char *ptr_firstUrl;
char *ptr_redirUrl;
@ -1205,6 +1236,11 @@ class XmlDoc {
bool m_numOutlinksAddedValid;
bool m_baseUrlValid;
bool m_replyValid;
bool m_diffbotReplyValid;
bool m_diffbotUrlCrawlPatternMatchValid;
bool m_diffbotUrlProcessPatternMatchValid;
bool m_diffbotPageProcessPatternMatchValid;
bool m_crawlInfoValid;
bool m_isPageParserValid;
bool m_imageUrlValid;
bool m_matchOffsetsValid;
@ -1416,6 +1452,7 @@ class XmlDoc {
long m_siteHash32;
char *m_httpReply;
char m_downloadAttempted;
char m_incrementedAttemptsCount;
char m_redirectFlag;
//char m_isScraping;
//char m_throttleDownload;
@ -1447,6 +1484,25 @@ class XmlDoc {
//long *m_outlinkIpVector;
Msge1 m_msge1;
//
// diffbot parms for indexing diffbot's json output
//
XmlDoc *m_dx;
char *m_diffbotObj;
char *m_diffbotObjEnd;
char m_diffbotSavedChar;
SafeBuf m_diffbotReply;
long m_diffbotReplyError;
bool m_diffbotUrlCrawlPatternMatch;
bool m_diffbotUrlProcessPatternMatch;
bool m_diffbotPageProcessPatternMatch;
SafeBuf *getDiffbotReply ( ) ;
bool doesUrlMatchDiffbotCrawlPattern() ;
bool doesUrlMatchDiffbotProcessPattern() ;
bool doesPageContentMatchDiffbotProcessPattern() ;
char *hashJSON ( HashTableX *table );
//
// functions and vars for the seo query matching tool

@ -4828,6 +4828,7 @@ bool registerMsgHandlers2(){
//if ( ! MsgF ::registerHandler() ) return false;
//if(! g_udpServer.registerHandler(0x10,handleRequest10)) return false;
if ( ! g_udpServer.registerHandler(0xc1,handleRequestc1)) return false;
if ( ! g_udpServer.registerHandler(0x39,handleRequest39)) return false;
if ( ! g_udpServer.registerHandler(0x2c,handleRequest2c)) return false;
if ( ! g_udpServer.registerHandler(0x12,handleRequest12)) return false;