Merge branch 'testing' of github.com:gigablast/open-source-search-engine into testing

commit c20c30c53f

Files changed: Collectiondb.cpp, Dates.cpp, Errno.cpp, HttpRequest.h, Images.cpp, Json.cpp, Linkdb.cpp, Makefile, Msg13.cpp, Msg13.h, Msg4.cpp, Msg5.cpp, Msge1.cpp, PageBasic.cpp, PageCrawlBot.cpp, PageParser.cpp, PageResults.cpp, PageRoot.cpp, Pages.cpp, Pages.h, Parms.cpp, Proxy.cpp, SafeBuf.h, SearchInput.h, Sections.cpp, SiteGetter.cpp, Spider.cpp, Tagdb.cpp, Test.cpp, Title.cpp, XmlDoc.cpp, coll.main.0, dnstest.cpp, html, main.cpp, monitor.cpp, qa.cpp

273  Collectiondb.cpp
@@ -138,6 +138,19 @@ bool Collectiondb::loadAllCollRecs ( ) {
if ( ! addExistingColl ( coll , collnum ) )
return false;
}
// if no existing recs added... add coll.main.0 always at startup
if ( m_numRecs == 0 ) {
log("admin: adding main collection.");
addNewColl ( "main",
0 , // customCrawl ,
NULL,
0 ,
true , // bool saveIt ,
// Parms.cpp reserves this so it can be sure
// to add the same collnum to every shard
0 );
}

// note it
//log(LOG_INFO,"db: Loaded data for %li collections. Ranging from "
// "collection #0 to #%li.",m_numRecsUsed,m_numRecs-1);
@@ -838,8 +851,8 @@ bool Collectiondb::resetColl ( char *coll , bool purgeSeeds) {
return true;
}

// get the CollectionRec for "test"
CollectionRec *cr = getRec ( coll ); // "test" );
// get the CollectionRec for "qatest123"
CollectionRec *cr = getRec ( coll ); // "qatest123" );

// must be there. if not, we create test i guess
if ( ! cr ) {
@@ -972,6 +985,39 @@ bool Collectiondb::setRecPtr ( collnum_t collnum , CollectionRec *cr ) {
return true;
}

// moves a file by first trying rename, then copying since cross device renaming doesn't work
// returns 0 on success
int mv(char* src, char* dest) {
int status = rename( src , dest );

if (status == 0)
return 0;
FILE *fsrc, *fdest;
fsrc = fopen(src, "r");
if (fsrc == NULL)
return -1;
fdest = fopen(dest, "w");
if (fdest == NULL) {
fclose(fsrc);
return -1;
}

const int BUF_SIZE = 1024;
char buf[BUF_SIZE];
while (!ferror(fdest) && !ferror(fsrc) && !feof(fsrc)) {
int read = fread(buf, 1, BUF_SIZE, fsrc);
fwrite(buf, 1, read, fdest);
}

fclose(fsrc);
fclose(fdest);
if (ferror(fdest) || ferror(fsrc))
return -1;

remove(src);
return 0;
}

// . returns false if we need a re-call, true if we completed
// . returns true with g_errno set on error
bool Collectiondb::resetColl2( collnum_t oldCollnum,
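Note on the mv() helper above: rename(2) cannot move a file across filesystems (it fails with errno EXDEV), which is exactly the case the copy fallback covers, since /tmp is frequently a separate mount from the collection directories. A minimal sketch of the same pattern that checks errno explicitly, so that failures other than a cross-device move are not silently converted into copies; the function name and buffer size are illustrative, not part of the commit:

    #include <cstdio>
    #include <cerrno>

    // Sketch: move a file, copying only when rename() fails with EXDEV.
    static int moveFileSketch ( const char *src , const char *dest ) {
            if ( rename ( src , dest ) == 0 ) return 0;
            if ( errno != EXDEV ) return -1;         // a real error, do not mask it by copying
            FILE *in  = fopen ( src  , "rb" );
            if ( ! in ) return -1;
            FILE *out = fopen ( dest , "wb" );
            if ( ! out ) { fclose(in); return -1; }
            char buf[4096];
            size_t n;
            while ( (n = fread ( buf , 1 , sizeof(buf) , in )) > 0 )
                    if ( fwrite ( buf , 1 , n , out ) != n ) break;
            bool bad = ferror(in) || ferror(out);
            fclose ( in );
            if ( fclose ( out ) != 0 ) bad = true;   // catch flush errors too
            if ( bad ) return -1;
            remove ( src );
            return 0;
    }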
@@ -982,8 +1028,8 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
// save parms in case we block
//we->m_purgeSeeds = purgeSeeds;

// now must be "test" only for now
//if ( strcmp(coll,"test") ) { char *xx=NULL;*xx=0; }
// now must be "qatest123" only for now
//if ( strcmp(coll,"qatest123") ) { char *xx=NULL;*xx=0; }
// no spiders can be out. they may be referencing the CollectionRec
// in XmlDoc.cpp... quite likely.
//if ( g_conf.m_spideringEnabled ||
@@ -1018,6 +1064,18 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
//collnum_t oldCollnum = cr->m_collnum;
//collnum_t newCollnum = m_numRecs;

// in case of bulk job, be sure to save list of spots
// copy existing list to a /tmp, where they will later be transferred back to the new folder
char oldbulkurlsname[1036];
snprintf(oldbulkurlsname, 1036, "%scoll.%s.%li/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(long)oldCollnum);
char newbulkurlsname[1036];
snprintf(newbulkurlsname, 1036, "%scoll.%s.%li/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(long)newCollnum);
char tmpbulkurlsname[1036];
snprintf(tmpbulkurlsname, 1036, "/tmp/coll.%s.%li.bulkurls.txt",cr->m_coll,(long)oldCollnum);

if (cr->m_isCustomCrawl == 2)
mv( oldbulkurlsname , tmpbulkurlsname );

// reset spider info
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(oldCollnum);
if ( sc ) {
@@ -1127,6 +1185,9 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
// save coll.conf to new directory
cr->save();

// be sure to copy back the bulk urls for bulk jobs
if (cr->m_isCustomCrawl == 2)
mv( tmpbulkurlsname, newbulkurlsname );

// and clear the robots.txt cache in case we recently spidered a
// robots.txt, we don't want to use it, we want to use the one we
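Note on the bulkurls.txt handling above: resetColl2() gives the collection a new collnum, and therefore a new coll.<name>.<collnum> directory, so for bulk jobs the seed list is parked in /tmp keyed by the old collnum and only moved back once cr->save() has written coll.conf into the new directory. A compressed sketch of that round trip, using the names from the hunks above; the ellipsis stands for the reset work that happens in between:

    // Sketch of the bulk-job seed-list round trip (m_isCustomCrawl == 2).
    // g_hostdb.m_dir, cr->m_coll, oldCollnum and newCollnum are the same names
    // used in the hunks above; the 1036-byte buffers mirror the original code.
    char oldPath[1036], tmpPath[1036], newPath[1036];
    snprintf ( oldPath, sizeof(oldPath), "%scoll.%s.%li/bulkurls.txt",
               g_hostdb.m_dir, cr->m_coll, (long)oldCollnum );
    snprintf ( tmpPath, sizeof(tmpPath), "/tmp/coll.%s.%li.bulkurls.txt",
               cr->m_coll, (long)oldCollnum );
    snprintf ( newPath, sizeof(newPath), "%scoll.%s.%li/bulkurls.txt",
               g_hostdb.m_dir, cr->m_coll, (long)newCollnum );
    mv ( oldPath , tmpPath );   // park the seed list outside the collection dir
    // ... spider info reset, new collnum assigned, cr->save() writes coll.conf ...
    mv ( tmpPath , newPath );   // restore it into the new directory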
@ -1792,31 +1853,193 @@ void CollectionRec::setUrlFiltersToDefaults ( ) {
|
||||
|
||||
long n = 0;
|
||||
|
||||
//strcpy(m_regExs [n],"default");
|
||||
/*
|
||||
m_regExs[n].set("default");
|
||||
m_regExs[n].nullTerm();
|
||||
m_numRegExs++;
|
||||
|
||||
m_spiderFreqs [n] = 30; // 30 days default
|
||||
m_numRegExs2++;
|
||||
|
||||
m_spiderPriorities[n] = 0;
|
||||
m_numRegExs3++;
|
||||
|
||||
m_maxSpidersPerRule[n] = 99;
|
||||
m_numRegExs10++;
|
||||
|
||||
m_spiderIpWaits[n] = 1000;
|
||||
m_numRegExs5++;
|
||||
|
||||
m_spiderIpMaxSpiders[n] = 7;
|
||||
m_numRegExs6++;
|
||||
|
||||
//m_spidersEnabled[n] = 1;
|
||||
//m_numRegExs7++;
|
||||
|
||||
m_harvestLinks[n] = 1;
|
||||
m_numRegExs8++;
|
||||
*/
|
||||
|
||||
m_regExs[n].set("isdocidbased");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 0; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 80;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("ismedia");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 0; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = -3; // delete!
|
||||
n++;
|
||||
|
||||
// if not in the site list then nuke it
|
||||
m_regExs[n].set("!insitelist");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 0; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = -3; // delete!
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("errorcount>=3 && hastmperror");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 1; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 1; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 3;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("errorcount>=1 && hastmperror");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 1; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 1; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 45;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("isaddurl");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 85;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount==0 && iswww && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 50;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount==0 && iswww");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 48;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount==0 && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 49;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount==0");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 10; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 47;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount==1 && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 40;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount==1");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 39;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount==2 && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 40; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 30;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount==2");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 40; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 29;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount>=3 && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 60; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 20;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount>=3");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 60; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 19;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 30; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 2;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("default");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 30; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 1;
|
||||
n++;
|
||||
|
||||
|
||||
m_numRegExs = n;
|
||||
m_numRegExs2 = n;
|
||||
m_numRegExs3 = n;
|
||||
m_numRegExs10 = n;
|
||||
m_numRegExs5 = n;
|
||||
m_numRegExs6 = n;
|
||||
m_numRegExs8 = n;
|
||||
|
||||
// more rules
|
||||
|
||||
|
||||
|
||||
|
||||
//m_spiderDiffbotApiNum[n] = 1;
|
||||
//m_numRegExs11++;
|
||||
@@ -2064,7 +2287,7 @@ bool CollectionRec::hasSearchPermission ( TcpSocket *s , long encapIp ) {
}

bool expandRegExShortcuts ( SafeBuf *sb ) ;
bool updateSiteList ( collnum_t collnum , bool addSeeds );
bool updateSiteListTables ( collnum_t collnum,bool addSeeds,char *siteListArg);
void nukeDoledb ( collnum_t collnum );

// . anytime the url filters are updated, this function is called
@@ -2127,10 +2350,14 @@ bool CollectionRec::rebuildUrlFilters ( ) {
// maybe this is good enough
//if ( sc ) sc->m_waitingTreeNeedsRebuild = true;

CollectionRec *cr = sc->m_cr;

// . rebuild sitetable? in PageBasic.cpp.
// . re-adds seed spdierrequests using msg4
// . true = addSeeds
updateSiteList ( m_collnum , true );
updateSiteListTables ( m_collnum ,
true ,
cr->m_siteListBuf.getBufStart() );
}

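Note on the rebuildUrlFilters() hunk above: updateSiteList() is replaced by updateSiteListTables(), which takes the site-list text as an explicit argument instead of reading it from the CollectionRec inside the function, presumably so a caller can hand in whichever buffer is current. The before/after prototypes and the new call site, as they appear in this commit:

    // old: the function fetched the site list itself
    bool updateSiteList       ( collnum_t collnum , bool addSeeds );
    // new: the caller supplies the site-list text
    bool updateSiteListTables ( collnum_t collnum , bool addSeeds , char *siteListArg );

    // call from CollectionRec::rebuildUrlFilters() after this change:
    updateSiteListTables ( m_collnum ,
                           true ,                              // addSeeds
                           cr->m_siteListBuf.getBufStart() );  // cr = sc->m_cr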
@@ -1318,7 +1318,7 @@ sections. -- todo -- might be an alignment issue... check out later

// . make a whole new set of urls for pub date detection
// . grab that sample set from buzz wiki page
// . record the correct pub date for urls in the "test" coll and make sure
// . record the correct pub date for urls in the "qatest123" coll and make sure
// we get them each time, otherwise core dump!!
// . check the date we extract with the rss feed. that is a good test too!
// report on that accuracy in the logs and on the stats page.
@@ -2428,7 +2428,7 @@ bool Dates::setPart1 ( //char *u ,
//if ( m_nw != words->m_numWords ) { char *xx=NULL; *xx=0; }

// . get the current time in utc
// . NO! to ensure the "test" collection re-injects docs exactly
// . NO! to ensure the "qatest123" collection re-injects docs exactly
// the same, use the spideredTime from the doc
// . we make sure to save this in the test subdir somehow..
//m_now = nd->m_spideredTime; // getTimeSynced();
@@ -3283,7 +3283,7 @@ bool Dates::setPart1 ( //char *u ,
// DF_NOTCLOCK flags from this.

// . current time. sync'd with host #0 who uses ntp supposedly...! :(
// . to ensure that the "test" subdir re-injects docs exactly the
// . to ensure that the "qatest123" subdir re-injects docs exactly the
// same, we need to use this date now
long now = nd->m_spideredTime;
// how long has elapsed since we downloaded it last approx.?
@@ -3294,7 +3294,8 @@ bool Dates::setPart1 ( //char *u ,
// might have been different than ours... actually i think our
// spiderdate.txt file had an older date in it from a previous round!
// so disable this when test spidering.
if ( elapsed<0 && g_conf.m_testSpiderEnabled && !strcmp(m_coll,"test"))
if ( elapsed<0 && g_conf.m_testSpiderEnabled && !strcmp(m_coll,
"qatest123"))
elapsed = 0;
// is true.
if ( elapsed < 0 ) {

@@ -167,7 +167,7 @@ case EFAKEFIRSTIP: return "Fake firstIp";
case EBADHOSTSCONF: return "A hosts.conf is out of sync";
case EWAITINGTOSYNCHOSTSCONF: return "Wait to ensure hosts.conf in sync";
case EDOCNONCANONICAL: return "Url was dup of canonical page";
case ECUSTOMCRAWLMISMATCH: return "Crawl type mismatch";
case ECUSTOMCRAWLMISMATCH: return "Job name/type mismatch. Job name has already been used for a crawl or bulk job.";
}
// if the remote error bit is clear it must be a regulare errno
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );
@@ -28,10 +28,14 @@
#include "TcpSocket.h"

// values for HttpRequest::m_replyFormat
#define FORMAT_HTML 0
#define FORMAT_XML 1
#define FORMAT_JSON 2
#define FORMAT_CSV 3
#define FORMAT_HTML 1
#define FORMAT_XML 2
#define FORMAT_JSON 3
#define FORMAT_CSV 4
#define FORMAT_TXT 5
#define FORMAT_PROCOG 6



class HttpRequest {

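Note on the renumbering above: the FORMAT_* values move from 0-based to 1-based and gain FORMAT_TXT and FORMAT_PROCOG. The new values line up with the old per-file FMT_HTML..FMT_TXT constants that PageCrawlBot.cpp had been defining locally (they are commented out further down in this commit), and starting at 1 leaves 0 free as an unset value, though that last point is an inference rather than something stated in the diff. A hypothetical helper showing the same string-to-constant mapping the handlers below do inline:

    #include <string.h>

    // Hypothetical helper: map a "format=" query value onto HttpRequest::m_replyFormat.
    // Unknown or missing strings fall back to the caller's default.
    static char parseReplyFormat ( const char *fs , char def ) {
            if ( ! fs )                  return def;
            if ( ! strcmp(fs, "html") )  return FORMAT_HTML;
            if ( ! strcmp(fs, "json") )  return FORMAT_JSON;
            if ( ! strcmp(fs, "xml" ) )  return FORMAT_XML;
            if ( ! strcmp(fs, "csv" ) )  return FORMAT_CSV;
            if ( ! strcmp(fs, "txt" ) )  return FORMAT_TXT;
            return def;
    }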
@@ -445,7 +445,7 @@ bool Images::downloadImages () {
r->reset();
r->m_maxTextDocLen = 200000;
r->m_maxOtherDocLen = 500000;
if ( ! strcmp(cr->m_coll,"test")) {
if ( ! strcmp(cr->m_coll,"qatest123")) {
r->m_useTestCache = 1;
r->m_addToTestCache = 1;
}
2  Json.cpp

@@ -433,7 +433,7 @@ char *JsonItem::getValueAsString ( long *valueLen ) {

// numbers...
static char s_numBuf[64];
if ( m_valueLong == (long)m_valueDouble ) {
if ( (float)m_valueLong == m_valueDouble ) {
*valueLen = sprintf ( s_numBuf,"%li", m_valueLong );
return s_numBuf;
}
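Note on the JsonItem::getValueAsString() change above: each numeric item carries both a long (m_valueLong) and a double (m_valueDouble) view of the value. The old test truncated the double, so 3.7 compared equal to 3 and was printed as the integer "3"; the new test promotes the long instead, so only values with no fractional part take the integer branch. One caveat worth noting: going through float means longs above roughly 2^24 may not compare exactly, since float carries only 24 bits of mantissa. A small illustration with made-up values:

    long   valueLong   = 3;      // integer view stored by the parser
    double valueDouble = 3.7;    // full-precision view

    bool oldCheck = ( valueLong == (long)valueDouble );    // true  -> printed as "3"
    bool newCheck = ( (float)valueLong == valueDouble );   // false -> takes the
                                                           //          floating-point branch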
@@ -3935,7 +3935,7 @@ LinkInfo *makeLinkInfo ( char *coll ,
// . how many unique ips link to us?
// . this count includes internal IPs as well
info->m_numUniqueIps = msg25->m_uniqueIps;
// keep things consistent for the "test" coll
// keep things consistent for the "qatest123" coll
info->m_reserved1 = 0;
info->m_reserved2 = 0;
// how many total GOOD inlinks we got. does not include internal cblock

2  Makefile

@@ -57,7 +57,7 @@ OBJS = UdpSlot.o Rebalance.o \
PostQueryRerank.o Msge0.o Msge1.o \
CountryCode.o DailyMerge.o CatRec.o Tagdb.o \
Users.o Images.o Wiki.o Wiktionary.o Scraper.o \
Dates.o Sections.o SiteGetter.o Syncdb.o \
Dates.o Sections.o SiteGetter.o Syncdb.o qa.o \
Placedb.o Address.o Test.o GeoIP.o GeoIPCity.o Synonyms.o \
Cachedb.o Monitordb.o dlstubs.o PageCrawlBot.o Json.o PageBasic.o

25  Msg13.cpp
@@ -721,6 +721,25 @@ void downloadTheDocForReals ( Msg13Request *r ) {
"(compatible; MSIE 6.0; Windows 98; "
"Win 9x 4.90)" ;

// for bulk jobs avoid actual downloads of the page for efficiency
if ( r->m_isCustomCrawl == 2 ) {
char *s =
"HTTP/1.0 200 (OK)\r\n"
"Content-Length: 0\r\n"
"Connection: Close\r\n"
"Content-Type: text/html\r\n\r\n";
long slen = gbstrlen(s);
long fakeBufSize = slen + 1;
char *fakeBuf = mdup ( s , fakeBufSize , "fkblk");
gotHttpReply2 ( r ,
fakeBuf,
fakeBufSize, // include \0
fakeBufSize, // allocsize
NULL ); // tcpsock
return;
}


// download it
if ( ! g_httpServer.getDoc ( r->m_url ,
r->m_urlIp ,
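Note on the bulk-job shortcut above: when m_isCustomCrawl == 2 the spider never opens a socket; it fabricates an empty 200 response and feeds it to gotHttpReply2() as if it had been downloaded. mdup() copies the static string onto the heap, presumably because the reply path later frees its buffer; that ownership detail is inferred from the mdup call, not spelled out in the diff. A sketch of the same size bookkeeping in plain C, with malloc/memcpy standing in for mdup:

    #include <string.h>
    #include <stdlib.h>

    const char *s =
            "HTTP/1.0 200 (OK)\r\n"
            "Content-Length: 0\r\n"
            "Connection: Close\r\n"
            "Content-Type: text/html\r\n\r\n";
    size_t slen        = strlen ( s );
    size_t fakeBufSize = slen + 1;                 // the trailing NUL is counted on purpose
    char  *fakeBuf     = (char *)malloc ( fakeBufSize );
    memcpy ( fakeBuf , s , fakeBufSize );          // stands in for mdup(s, fakeBufSize, "fkblk")
    // fakeBuf / fakeBufSize then travel down the normal got-a-reply path.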
@@ -1390,7 +1409,7 @@ void passOnReply ( void *state , UdpSlot *slot ) {

//
//
// . UTILITY FUNCTIONS for injecting into the "test" collection
// . UTILITY FUNCTIONS for injecting into the "qatest123" collection
// . we need to ensure that the web pages remain constant so we store them
//
//
@@ -1400,7 +1419,7 @@ void passOnReply ( void *state , UdpSlot *slot ) {
// . now that we are lower level in Msg13.cpp, set "ts" not "slot"
bool getTestDoc ( char *u , TcpSocket *ts , Msg13Request *r ) {
// sanity check
//if ( strcmp(m_coll,"test") ) { char *xx=NULL;*xx=0; }
//if ( strcmp(m_coll,"qatest123") ) { char *xx=NULL;*xx=0; }
// hash the url into 64 bits
long long h = hash64 ( u , gbstrlen(u) );
// read the spider date file first
@@ -1547,7 +1566,7 @@ bool addTestSpideredDate ( Url *u , long spideredTime , char *testDir ) {
return true;
}

// add it to our "test" subdir
// add it to our "qatest123" subdir
bool addTestDoc ( long long urlHash64 , char *httpReply , long httpReplySize ,
long err , Msg13Request *r ) {

2  Msg13.h
@ -32,6 +32,8 @@ public:
|
||||
// if doing spider compression, compute contentHash32 of document
|
||||
// downloaded, and if it matches this then send back EDOCUNCHANGED
|
||||
long m_contentHash32;
|
||||
// copy of CollectionRec::m_customCrawl, 0 1 for crawls or 2 for bulks
|
||||
char m_isCustomCrawl;
|
||||
// send back error ENOGOODDATE if it does not have one. but if
|
||||
// harvestLinks is true, just send back a filtered list of links
|
||||
long m_requireGoodDate:1;
|
||||
|
2  Msg4.cpp
@ -159,7 +159,7 @@ public:
|
||||
};
|
||||
|
||||
|
||||
// . injecting into the "test" coll flushes after each inject
|
||||
// . injecting into the "qatest123" coll flushes after each inject
|
||||
// . returns false if blocked and callback will be called
|
||||
bool flushMsg4Buffers ( void *state , void (* callback) (void *) ) {
|
||||
// if all empty, return true now
|
||||
|
4  Msg5.cpp
@ -859,9 +859,9 @@ bool Msg5::needsRecall ( ) {
|
||||
if ( m_round == 0 ) logIt = false;
|
||||
if ( logIt )
|
||||
logf(LOG_DEBUG,"db: Reading %li again from %s (need %li total "
|
||||
"got %li) this=0x%lx round=%li.",
|
||||
"got %li) cn=%li this=0x%lx round=%li.",
|
||||
m_newMinRecSizes , base->m_dbname , m_minRecSizes,
|
||||
m_list->m_listSize, (long)this , m_round );
|
||||
m_list->m_listSize, (long)m_collnum,(long)this, m_round );
|
||||
m_round++;
|
||||
// record how many screw ups we had so we know if it hurts performance
|
||||
base->m_rdb->didReSeek ( );
|
||||
|
18  Msge1.cpp
@ -116,7 +116,7 @@ bool Msge1::getFirstIps ( TagRec **grv ,
|
||||
if ( ! launchRequests ( 0 ) ) return false;
|
||||
|
||||
// save it? might be a page parser
|
||||
//if ( ! strcmp(m_coll,"test") ) saveTestBuf();
|
||||
//if ( ! strcmp(m_coll,"qatest123") ) saveTestBuf();
|
||||
|
||||
// none blocked, we are done
|
||||
return true;
|
||||
@ -219,7 +219,7 @@ bool Msge1::launchRequests ( long starti ) {
|
||||
|
||||
/*
|
||||
// look up in our m_testBuf.
|
||||
if ( m_coll && ! strcmp(m_coll,"test") ) {
|
||||
if ( m_coll && ! strcmp(m_coll,"qatest123") ) {
|
||||
bool found = false;
|
||||
// do we got it?
|
||||
long quickIp ; bool status = getTestIp ( p , &quickIp, &found);
|
||||
@ -300,7 +300,7 @@ bool Msge1::sendMsgC ( long i , char *host , long hlen ) {
|
||||
|
||||
|
||||
// look up in our m_testBuf.
|
||||
if ( m_coll && ! strcmp(m_coll,"test") ) {
|
||||
if ( m_coll && ! strcmp(m_coll,"qatest123") ) {
|
||||
bool found = false;
|
||||
// shortcut
|
||||
//char *p = m_urlPtrs[n];
|
||||
@ -340,7 +340,7 @@ void gotMsgCWrapper ( void *state , long ip ) {
|
||||
if ( ! THIS->launchRequests(i) ) return;
|
||||
// . save it if we should. might be a page parser
|
||||
// . mdw i uncommented this when we cored all the time
|
||||
//if ( ! strcmp(THIS->m_coll,"test")) saveTestBuf();
|
||||
//if ( ! strcmp(THIS->m_coll,"qatest123")) saveTestBuf();
|
||||
// must be all done, call the callback
|
||||
THIS->m_callback ( THIS->m_state );
|
||||
}
|
||||
@ -364,7 +364,7 @@ bool Msge1::doneSending ( long i ) {
|
||||
// n, i, m_urls[i].getUrl() ,iptoa(ip));
|
||||
|
||||
// store it?
|
||||
if ( ! strcmp(m_coll,"test") ) {
|
||||
if ( ! strcmp(m_coll,"qatest123") ) {
|
||||
// get host
|
||||
long hlen = 0;
|
||||
char *host = getHostFast ( m_urlPtrs[n] , &hlen );
|
||||
@ -511,9 +511,9 @@ static char *s_last = NULL ;
|
||||
static long s_lastLen = 0 ;
|
||||
static HashTableX s_ht;
|
||||
|
||||
// . only call this if the collection is "test"
|
||||
// . only call this if the collection is "qatest123"
|
||||
// . we try to get the ip by accessing the "./test/ips.txt" file
|
||||
// . we also ad ips we lookup to that file in the collection is "test"
|
||||
// . we also ad ips we lookup to that file in the collection is "qatest123"
|
||||
// . returns false and sets g_errno on error, true on success
|
||||
bool getTestIp ( char *url , long *retIp , bool *found , long niceness ,
|
||||
char *testDir ) {
|
||||
@ -533,8 +533,8 @@ bool getTestIp ( char *url , long *retIp , bool *found , long niceness ,
|
||||
// assume not found
|
||||
*found = false;
|
||||
|
||||
// . if we are the "test" collection, check for "./test/ips.txt" file
|
||||
// that gives us the ips of the given urls.
|
||||
// . if we are the "qatestq123" collection, check for "./test/ips.txt"
|
||||
// file that gives us the ips of the given urls.
|
||||
// . if we end up doing some lookups we should append to that file
|
||||
if ( ! s_testBuf || s_needsReload ) {
|
||||
// assume needs reload now
|
||||
|
137  PageBasic.cpp
@ -73,7 +73,9 @@ public:
|
||||
// . uses msg4 to add seeds to spiderdb if necessary
|
||||
// . only adds seeds for the shard we are on iff we are responsible for
|
||||
// the fake firstip!!!
|
||||
bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
|
||||
bool updateSiteListTables ( collnum_t collnum ,
|
||||
bool addSeeds ,
|
||||
char *siteListArg ) {
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( collnum );
|
||||
if ( ! cr ) return true;
|
||||
@ -113,6 +115,8 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
|
||||
}
|
||||
|
||||
// get the old sitelist Domain Hash to PatternData mapping table
|
||||
// which tells us what domains, subdomains or paths we can or
|
||||
// can not spider...
|
||||
HashTableX *dt = &sc->m_siteListDomTable;
|
||||
|
||||
// reset it
|
||||
@ -142,10 +146,10 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
|
||||
// use this so it will be free automatically when msg4 completes!
|
||||
SafeBuf *spiderReqBuf = &sc->m_msg4x.m_tmpBuf;
|
||||
|
||||
char *siteList = cr->m_siteListBuf.getBufStart();
|
||||
//char *siteList = cr->m_siteListBuf.getBufStart();
|
||||
|
||||
// scan the list
|
||||
char *pn = siteList;
|
||||
char *pn = siteListArg;
|
||||
|
||||
// completely empty?
|
||||
if ( ! pn ) return true;
|
||||
@ -156,7 +160,7 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
|
||||
|
||||
Url u;
|
||||
|
||||
for ( ; *pn ; pn++ , lineNum++ ) {
|
||||
for ( ; *pn ; lineNum++ ) {
|
||||
|
||||
// get end
|
||||
char *s = pn;
|
||||
@ -169,6 +173,9 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
|
||||
char *pe = pn;
|
||||
for ( ; pe > s && is_wspace_a(pe[-1]) ; pe-- );
|
||||
|
||||
// advance over '\n' for next line
|
||||
if ( *pn && *pn == '\n' ) pn++;
|
||||
|
||||
// make hash of the line
|
||||
long h32 = hash32 ( s , pe - s );
|
||||
|
||||
@ -287,7 +294,7 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
|
||||
if ( ! isFilter ) continue;
|
||||
|
||||
|
||||
// make the data node
|
||||
// make the data node used for filtering urls during spidering
|
||||
PatternData pd;
|
||||
// hash of the subdomain or domain for this line in sitelist
|
||||
pd.m_thingHash32 = u.getHostHash32();
|
||||
@ -388,10 +395,15 @@ char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) {
|
||||
// check domain specific tables
|
||||
HashTableX *dt = &sc->m_siteListDomTable;
|
||||
|
||||
// get this
|
||||
CollectionRec *cr = sc->m_cr;
|
||||
|
||||
// need to build dom table for pattern matching?
|
||||
if ( dt->getNumSlotsUsed() == 0 ) {
|
||||
if ( dt->getNumSlotsUsed() == 0 && cr ) {
|
||||
// do not add seeds, just make siteListDomTable, etc.
|
||||
updateSiteList ( sc->m_collnum , false );
|
||||
updateSiteListTables ( sc->m_collnum ,
|
||||
false , // add seeds?
|
||||
cr->m_siteListBuf.getBufStart() );
|
||||
}
|
||||
|
||||
if ( dt->getNumSlotsUsed() == 0 ) {
|
||||
@ -728,6 +740,7 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
|
||||
char buf [ 128000 ];
|
||||
SafeBuf sb(buf,128000);
|
||||
sb.reset();
|
||||
|
||||
char *fs = hr->getString("format",NULL,NULL);
|
||||
char fmt = FORMAT_HTML;
|
||||
@ -761,7 +774,7 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
//
|
||||
// show stats
|
||||
//
|
||||
if ( fmt == FMT_HTML ) {
|
||||
if ( fmt == FORMAT_HTML ) {
|
||||
|
||||
char *seedStr = cr->m_diffbotSeeds.getBufStart();
|
||||
if ( ! seedStr ) seedStr = "";
|
||||
@ -773,45 +786,23 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
long sentAlert = (long)ci->m_sentCrawlDoneAlert;
|
||||
if ( sentAlert ) sentAlert = 1;
|
||||
|
||||
sb.safePrintf(
|
||||
//sb.safePrintf(
|
||||
// "<form method=get action=/crawlbot>"
|
||||
// "%s"
|
||||
// , sb.getBufStart() // hidden input token/name/..
|
||||
// );
|
||||
|
||||
char *hurts = "No";
|
||||
if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider )
|
||||
hurts = "Yes";
|
||||
|
||||
"<form method=get action=/crawlbot>"
|
||||
"%s"
|
||||
, sb.getBufStart() // hidden input token/name/..
|
||||
);
|
||||
sb.safePrintf("<TABLE border=0>"
|
||||
"<TR><TD valign=top>"
|
||||
|
||||
"<table border=0 cellpadding=5>"
|
||||
|
||||
//
|
||||
"<tr>"
|
||||
"<td><b>Crawl Name:</td>"
|
||||
"<td>%s</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Crawl Type:</td>"
|
||||
"<td>%li</td>"
|
||||
"</tr>"
|
||||
|
||||
//"<tr>"
|
||||
//"<td><b>Collection Alias:</td>"
|
||||
//"<td>%s%s</td>"
|
||||
//"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Token:</td>"
|
||||
"<td>%s</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Seeds:</td>"
|
||||
"<td>%s</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Crawl Status:</td>"
|
||||
"<td><b>Crawl Status Code:</td>"
|
||||
"<td>%li</td>"
|
||||
"</tr>"
|
||||
|
||||
@ -820,14 +811,14 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
"<td>%s</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Rounds Completed:</td>"
|
||||
"<td>%li</td>"
|
||||
"</tr>"
|
||||
//"<tr>"
|
||||
//"<td><b>Rounds Completed:</td>"
|
||||
//"<td>%li</td>"
|
||||
//"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Has Urls Ready to Spider:</td>"
|
||||
"<td>%li</td>"
|
||||
"<td>%s</td>"
|
||||
"</tr>"
|
||||
|
||||
|
||||
@ -838,12 +829,8 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
//"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Objects Found</b></td>"
|
||||
"<td>%lli</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>URLs Harvested</b> (inc. dups)</td>"
|
||||
"<td><b>URLs Harvested</b> "
|
||||
"(may include dups)</td>"
|
||||
"<td>%lli</td>"
|
||||
|
||||
"</tr>"
|
||||
@ -862,60 +849,24 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
"<td><b>Page Crawl Successes</b></td>"
|
||||
"<td>%lli</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Page Crawl Successes This Round</b></td>"
|
||||
"<td>%lli</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Page Process Attempts</b></td>"
|
||||
"<td>%lli</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Page Process Successes</b></td>"
|
||||
"<td>%lli</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Page Process Successes This Round</b></td>"
|
||||
"<td>%lli</td>"
|
||||
"</tr>"
|
||||
|
||||
|
||||
, cr->m_diffbotCrawlName.getBufStart()
|
||||
|
||||
, (long)cr->m_isCustomCrawl
|
||||
|
||||
, cr->m_diffbotToken.getBufStart()
|
||||
|
||||
, seedStr
|
||||
|
||||
, crawlStatus
|
||||
, tmp.getBufStart()
|
||||
, cr->m_spiderRoundNum
|
||||
, cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider
|
||||
//, cr->m_spiderRoundNum
|
||||
//, cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider
|
||||
, hurts
|
||||
|
||||
, cr->m_globalCrawlInfo.m_objectsAdded -
|
||||
cr->m_globalCrawlInfo.m_objectsDeleted
|
||||
, cr->m_globalCrawlInfo.m_urlsHarvested
|
||||
//, cr->m_globalCrawlInfo.m_urlsConsidered
|
||||
|
||||
, cr->m_globalCrawlInfo.m_pageDownloadAttempts
|
||||
, cr->m_globalCrawlInfo.m_pageDownloadSuccesses
|
||||
, cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound
|
||||
|
||||
, cr->m_globalCrawlInfo.m_pageProcessAttempts
|
||||
, cr->m_globalCrawlInfo.m_pageProcessSuccesses
|
||||
, cr->m_globalCrawlInfo.m_pageProcessSuccessesThisRound
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
if ( fmt != FORMAT_JSON )
|
||||
// wrap up the form, print a submit button
|
||||
g_pages.printAdminBottom ( &sb );
|
||||
//if ( fmt != FORMAT_JSON )
|
||||
// // wrap up the form, print a submit button
|
||||
// g_pages.printAdminBottom ( &sb );
|
||||
|
||||
return g_httpServer.sendDynamicPage (socket,
|
||||
sb.getBufStart(),
|
||||
|
161  PageCrawlBot.cpp
@ -25,11 +25,11 @@
|
||||
#include "Parms.h"
|
||||
|
||||
// so user can specify the format of the reply/output
|
||||
#define FMT_HTML 1
|
||||
#define FMT_XML 2
|
||||
#define FMT_JSON 3
|
||||
#define FMT_CSV 4
|
||||
#define FMT_TXT 5
|
||||
//#define FMT_HTML 1
|
||||
//#define FMT_XML 2
|
||||
//#define FMT_JSON 3
|
||||
//#define FMT_CSV 4
|
||||
//#define FMT_TXT 5
|
||||
|
||||
void doneSendingWrapper ( void *state , TcpSocket *sock ) ;
|
||||
bool sendBackDump ( TcpSocket *s,HttpRequest *hr );
|
||||
@ -158,25 +158,25 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
|
||||
|
||||
if ( ( xx = strstr ( path , "_data.json" ) ) ) {
|
||||
rdbId = RDB_TITLEDB;
|
||||
fmt = FMT_JSON;
|
||||
fmt = FORMAT_JSON;
|
||||
downloadJSON = true;
|
||||
}
|
||||
else if ( ( xx = strstr ( path , "_data.csv" ) ) ) {
|
||||
rdbId = RDB_TITLEDB;
|
||||
downloadJSON = true;
|
||||
fmt = FMT_CSV;
|
||||
fmt = FORMAT_CSV;
|
||||
}
|
||||
else if ( ( xx = strstr ( path , "_urls.csv" ) ) ) {
|
||||
rdbId = RDB_SPIDERDB;
|
||||
fmt = FMT_CSV;
|
||||
fmt = FORMAT_CSV;
|
||||
}
|
||||
else if ( ( xx = strstr ( path , "_urls.txt" ) ) ) {
|
||||
rdbId = RDB_SPIDERDB;
|
||||
fmt = FMT_TXT;
|
||||
fmt = FORMAT_TXT;
|
||||
}
|
||||
else if ( ( xx = strstr ( path , "_pages.txt" ) ) ) {
|
||||
rdbId = RDB_TITLEDB;
|
||||
fmt = FMT_TXT;
|
||||
fmt = FORMAT_TXT;
|
||||
}
|
||||
|
||||
// sanity, must be one of 3 download calls
|
||||
@ -213,7 +213,7 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
|
||||
|
||||
// . if doing download of csv, make it search results now!
|
||||
// . make an httprequest on stack and call it
|
||||
if ( fmt == FMT_CSV && rdbId == RDB_TITLEDB ) {
|
||||
if ( fmt == FORMAT_CSV && rdbId == RDB_TITLEDB ) {
|
||||
char tmp2[5000];
|
||||
SafeBuf sb2(tmp2,5000);
|
||||
long dr = 1;
|
||||
@ -247,7 +247,7 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
|
||||
|
||||
// . if doing download of json, make it search results now!
|
||||
// . make an httprequest on stack and call it
|
||||
if ( fmt == FMT_JSON && rdbId == RDB_TITLEDB ) {
|
||||
if ( fmt == FORMAT_JSON && rdbId == RDB_TITLEDB ) {
|
||||
char tmp2[5000];
|
||||
SafeBuf sb2(tmp2,5000);
|
||||
long dr = 1;
|
||||
@ -514,13 +514,13 @@ bool StateCD::sendList ( ) {
|
||||
//sb.setLabel("dbotdmp");
|
||||
|
||||
char *ct = "text/csv";
|
||||
if ( m_fmt == FMT_JSON )
|
||||
if ( m_fmt == FORMAT_JSON )
|
||||
ct = "application/json";
|
||||
if ( m_fmt == FMT_XML )
|
||||
if ( m_fmt == FORMAT_XML )
|
||||
ct = "text/xml";
|
||||
if ( m_fmt == FMT_TXT )
|
||||
if ( m_fmt == FORMAT_TXT )
|
||||
ct = "text/plain";
|
||||
if ( m_fmt == FMT_CSV )
|
||||
if ( m_fmt == FORMAT_CSV )
|
||||
ct = "text/csv";
|
||||
|
||||
// . if we haven't yet sent an http mime back to the user
|
||||
@ -545,13 +545,13 @@ bool StateCD::sendList ( ) {
|
||||
|
||||
//CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
|
||||
|
||||
if ( ! m_printedFirstBracket && m_fmt == FMT_JSON ) {
|
||||
if ( ! m_printedFirstBracket && m_fmt == FORMAT_JSON ) {
|
||||
sb.safePrintf("[\n");
|
||||
m_printedFirstBracket = true;
|
||||
}
|
||||
|
||||
// these are csv files not xls
|
||||
//if ( ! m_printedFirstBracket && m_fmt == FMT_CSV ) {
|
||||
//if ( ! m_printedFirstBracket && m_fmt == FORMAT_CSV ) {
|
||||
// sb.safePrintf("sep=,\n");
|
||||
// m_printedFirstBracket = true;
|
||||
//}
|
||||
@ -638,7 +638,7 @@ bool StateCD::sendList ( ) {
|
||||
// use this for printing out urls.csv as well...
|
||||
m_printedEndingBracket = true;
|
||||
// end array of json objects. might be empty!
|
||||
if ( m_rdbId == RDB_TITLEDB && m_fmt == FMT_JSON )
|
||||
if ( m_rdbId == RDB_TITLEDB && m_fmt == FORMAT_JSON )
|
||||
sb.safePrintf("\n]\n");
|
||||
//log("adding ]. len=%li",sb.length());
|
||||
// i'd like to exit streaming mode here. i fixed tcpserver.cpp
|
||||
@ -853,7 +853,7 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
|
||||
}
|
||||
|
||||
// "csv" is default if json not specified
|
||||
if ( m_fmt == FMT_JSON )
|
||||
if ( m_fmt == FORMAT_JSON )
|
||||
sb->safePrintf("[{"
|
||||
"{\"url\":"
|
||||
"\"%s\"},"
|
||||
@ -875,6 +875,19 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
|
||||
);
|
||||
// but default to csv
|
||||
else {
|
||||
if (cr && cr->m_isCustomCrawl == 1 && sreq && !sreq->m_isAddUrl && !sreq->m_isInjecting) {
|
||||
if (cr->m_diffbotUrlCrawlPattern.m_length == 0
|
||||
&& cr->m_diffbotUrlProcessPattern.m_length == 0) {
|
||||
// If a crawl and there are no urlCrawlPattern or urlCrawlRegEx values, only return URLs from seed domain
|
||||
if (sreq && !sreq->m_sameDom)
|
||||
continue;
|
||||
} else {
|
||||
// TODO: if we get here, we have a crawl with a custom urlCrawlPattern and/or custom
|
||||
// urlProcessPattern. We have to check if the current url matches the pattern
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
sb->safePrintf("\"%s\",\"%s\","
|
||||
, sreq->m_url
|
||||
, as
|
||||
@ -984,7 +997,7 @@ void StateCD::printTitledbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
|
||||
|
||||
// if not json, just print the json item out in csv
|
||||
// moved into PageResults.cpp...
|
||||
//if ( m_fmt == FMT_CSV ) {
|
||||
//if ( m_fmt == FORMAT_CSV ) {
|
||||
// printJsonItemInCsv ( json , sb );
|
||||
// continue;
|
||||
//}
|
||||
@ -1324,7 +1337,7 @@ bool sendReply2 (TcpSocket *socket , long fmt , char *msg ) {
|
||||
|
||||
// send this back to browser
|
||||
SafeBuf sb;
|
||||
if ( fmt == FMT_JSON ) {
|
||||
if ( fmt == FORMAT_JSON ) {
|
||||
sb.safePrintf("{\n\"response\":\"success\",\n"
|
||||
"\"message\":\"%s\"\n}\n"
|
||||
, msg );
|
||||
@ -1355,7 +1368,7 @@ bool sendErrorReply2 ( TcpSocket *socket , long fmt , char *msg ) {
|
||||
|
||||
// send this back to browser
|
||||
SafeBuf sb;
|
||||
if ( fmt == FMT_JSON ) {
|
||||
if ( fmt == FORMAT_JSON ) {
|
||||
sb.safePrintf("{\"error\":\"%s\"}\n"
|
||||
, msg );
|
||||
ct = "application/json";
|
||||
@ -1463,7 +1476,7 @@ void injectedUrlWrapper ( void *state ) {
|
||||
|
||||
// send back the html or json response?
|
||||
SafeBuf *response = &sb;
|
||||
if ( st->m_fmt == FMT_JSON ) response = &js;
|
||||
if ( st->m_fmt == FORMAT_JSON ) response = &js;
|
||||
|
||||
// . this will call g_httpServer.sendReply()
|
||||
// . pass it in the injection response, "sb"
|
||||
@ -1660,7 +1673,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
// . now show stats for the current crawl
|
||||
// . put in xml or json if format=xml or format=json or
|
||||
// xml=1 or json=1 ...
|
||||
char fmt = FMT_JSON;
|
||||
char fmt = FORMAT_JSON;
|
||||
|
||||
// token is always required. get from json or html form input
|
||||
//char *token = getInputString ( "token" );
|
||||
@ -1680,21 +1693,21 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
name++;
|
||||
}
|
||||
// change default formatting to html
|
||||
fmt = FMT_HTML;
|
||||
fmt = FORMAT_HTML;
|
||||
}
|
||||
|
||||
|
||||
char *fs = hr->getString("format",NULL,NULL);
|
||||
// give john a json api
|
||||
if ( fs && strcmp(fs,"html") == 0 ) fmt = FMT_HTML;
|
||||
if ( fs && strcmp(fs,"json") == 0 ) fmt = FMT_JSON;
|
||||
if ( fs && strcmp(fs,"xml") == 0 ) fmt = FMT_XML;
|
||||
if ( fs && strcmp(fs,"html") == 0 ) fmt = FORMAT_HTML;
|
||||
if ( fs && strcmp(fs,"json") == 0 ) fmt = FORMAT_JSON;
|
||||
if ( fs && strcmp(fs,"xml") == 0 ) fmt = FORMAT_XML;
|
||||
// if we got json as input, give it as output
|
||||
//if ( JS.getFirstItem() ) fmt = FMT_JSON;
|
||||
//if ( JS.getFirstItem() ) fmt = FORMAT_JSON;
|
||||
|
||||
|
||||
|
||||
if ( ! token && fmt == FMT_JSON ) { // (cast==0|| fmt == FMT_JSON ) ) {
|
||||
if ( ! token && fmt == FORMAT_JSON ) { // (cast==0|| fmt == FORMAT_JSON ) ) {
|
||||
char *msg = "invalid token";
|
||||
return sendErrorReply2 (socket,fmt,msg);
|
||||
}
|
||||
@ -1759,7 +1772,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
//}
|
||||
|
||||
// just send back a list of all the collections after the delete
|
||||
//if ( delColl && cast && fmt == FMT_JSON ) {
|
||||
//if ( delColl && cast && fmt == FORMAT_JSON ) {
|
||||
// char *msg = "Collection deleted.";
|
||||
// return sendReply2 (socket,fmt,msg);
|
||||
//}
|
||||
@ -2108,7 +2121,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
|
||||
char bulkurlsfile[1024];
|
||||
snprintf(bulkurlsfile, 1024, "%scoll.%s.%li/bulkurls.txt", g_hostdb.m_dir , coll , (long)st->m_collnum );
|
||||
if ( spots ) {
|
||||
if ( spots && cr && cr->m_isCustomCrawl == 2 ) {
|
||||
log("crawlbot: got spots (len=%li) to add coll=%s (%li)",
|
||||
(long)gbstrlen(spots),coll,(long)st->m_collnum);
|
||||
FILE *f = fopen(bulkurlsfile, "w");
|
||||
@ -2120,7 +2133,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
}
|
||||
|
||||
// if restart flag is on and the file with bulk urls exists, get spots from there
|
||||
if ( !spots && restartColl ) {
|
||||
if ( !spots && restartColl && cr && cr->m_isCustomCrawl ) {
|
||||
FILE *f = fopen(bulkurlsfile, "r");
|
||||
if (f != NULL) {
|
||||
fseek(f, 0, SEEK_END);
|
||||
@ -2250,7 +2263,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
/*
|
||||
bool printUrlFilters ( SafeBuf &sb , CollectionRec *cr , long fmt ) {
|
||||
|
||||
if ( fmt == FMT_JSON )
|
||||
if ( fmt == FORMAT_JSON )
|
||||
sb.safePrintf("\"urlFilters\":[");
|
||||
|
||||
// skip first filters that are:
|
||||
@ -2290,7 +2303,7 @@ bool printUrlFilters ( SafeBuf &sb , CollectionRec *cr , long fmt ) {
|
||||
// urls higher spider priority, so skip it
|
||||
if ( strncmp(expression,"ismanualadd && ",15) == 0 )
|
||||
continue;
|
||||
if ( fmt == FMT_HTML ) {
|
||||
if ( fmt == FORMAT_HTML ) {
|
||||
sb.safePrintf("<tr>"
|
||||
"<td>Expression "
|
||||
"<input type=text "
|
||||
@ -2315,7 +2328,7 @@ bool printUrlFilters ( SafeBuf &sb , CollectionRec *cr , long fmt ) {
|
||||
sb.pushChar('\n');
|
||||
}
|
||||
|
||||
if ( fmt == FMT_JSON ) {
|
||||
if ( fmt == FORMAT_JSON ) {
|
||||
// remove trailing comma
|
||||
sb.removeLastChar('\n');
|
||||
sb.removeLastChar(',');
|
||||
@ -2506,7 +2519,7 @@ bool printCrawlDetailsInJson ( SafeBuf *sb , CollectionRec *cx ) {
|
||||
true // isJSON?
|
||||
);
|
||||
*/
|
||||
//printUrlFilters ( sb , cx , FMT_JSON );
|
||||
//printUrlFilters ( sb , cx , FORMAT_JSON );
|
||||
// end that collection rec
|
||||
sb->safePrintf("}\n");
|
||||
|
||||
@ -2524,7 +2537,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
// store output into here
|
||||
SafeBuf sb;
|
||||
|
||||
if ( fmt == FMT_HTML )
|
||||
if ( fmt == FORMAT_HTML )
|
||||
sb.safePrintf(
|
||||
"<html>"
|
||||
"<title>Crawlbot - "
|
||||
@ -2560,7 +2573,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
lb.urlEncode(name);
|
||||
lb.safePrintf ("&token=");
|
||||
lb.urlEncode(token);
|
||||
if ( fmt == FMT_HTML ) lb.safePrintf("&format=html");
|
||||
if ( fmt == FORMAT_HTML ) lb.safePrintf("&format=html");
|
||||
lb.nullTerm();
|
||||
|
||||
|
||||
@ -2577,7 +2590,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
//}
|
||||
|
||||
|
||||
if ( fmt == FMT_HTML ) {
|
||||
if ( fmt == FORMAT_HTML ) {
|
||||
sb.safePrintf("<table border=0>"
|
||||
"<tr><td>"
|
||||
"<b><font size=+2>"
|
||||
@ -2632,7 +2645,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
//
|
||||
// print list of collections controlled by this token
|
||||
//
|
||||
for ( long i = 0 ; fmt == FMT_HTML && i<g_collectiondb.m_numRecs;i++ ){
|
||||
for ( long i = 0 ; fmt == FORMAT_HTML && i<g_collectiondb.m_numRecs;i++ ){
|
||||
CollectionRec *cx = g_collectiondb.m_recs[i];
|
||||
if ( ! cx ) continue;
|
||||
// get its token if any
|
||||
@ -2664,19 +2677,19 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
sb.safePrintf("</font></b>");
|
||||
}
|
||||
|
||||
if ( fmt == FMT_HTML )
|
||||
if ( fmt == FORMAT_HTML )
|
||||
sb.safePrintf ( "</center><br/>" );
|
||||
|
||||
// the ROOT JSON [
|
||||
if ( fmt == FMT_JSON )
|
||||
if ( fmt == FORMAT_JSON )
|
||||
sb.safePrintf("{\n");
|
||||
|
||||
// injection is currently not in use, so this is an artifact:
|
||||
if ( fmt == FMT_JSON && injectionResponse )
|
||||
if ( fmt == FORMAT_JSON && injectionResponse )
|
||||
sb.safePrintf("\"response\":\"%s\",\n\n"
|
||||
, injectionResponse->getBufStart() );
|
||||
|
||||
if ( fmt == FMT_JSON && urlUploadResponse )
|
||||
if ( fmt == FORMAT_JSON && urlUploadResponse )
|
||||
sb.safePrintf("\"response\":\"%s\",\n\n"
|
||||
, urlUploadResponse->getBufStart() );
|
||||
|
||||
@ -2689,14 +2702,14 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
|
||||
// the items in the array now have type:bulk or type:crawl
|
||||
// so call them 'jobs'
|
||||
if ( fmt == FMT_JSON )
|
||||
if ( fmt == FORMAT_JSON )
|
||||
sb.safePrintf("\"jobs\":[");//\"collections\":");
|
||||
|
||||
long summary = hr->getLong("summary",0);
|
||||
// enter summary mode for json
|
||||
if ( fmt != FMT_HTML ) summary = 1;
|
||||
if ( fmt != FORMAT_HTML ) summary = 1;
|
||||
// start the table
|
||||
if ( summary && fmt == FMT_HTML ) {
|
||||
if ( summary && fmt == FORMAT_HTML ) {
|
||||
sb.safePrintf("<table border=1 cellpadding=5>"
|
||||
"<tr>"
|
||||
"<td><b>Collection</b></td>"
|
||||
@ -2727,11 +2740,11 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
|
||||
|
||||
// just print out single crawl info for json
|
||||
if ( fmt != FMT_HTML && cx != cr && name3 )
|
||||
if ( fmt != FORMAT_HTML && cx != cr && name3 )
|
||||
continue;
|
||||
|
||||
// if json, print each collectionrec
|
||||
if ( fmt == FMT_JSON ) {
|
||||
if ( fmt == FORMAT_JSON ) {
|
||||
if ( ! firstOne )
|
||||
sb.safePrintf(",\n\t");
|
||||
firstOne = false;
|
||||
@ -2773,7 +2786,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
, cx->m_globalCrawlInfo.m_pageProcessSuccessesThisRound
|
||||
);
|
||||
}
|
||||
if ( summary && fmt == FMT_HTML ) {
|
||||
if ( summary && fmt == FORMAT_HTML ) {
|
||||
sb.safePrintf("</table></html>" );
|
||||
return g_httpServer.sendDynamicPage (socket,
|
||||
sb.getBufStart(),
|
||||
@ -2781,7 +2794,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
0); // cachetime
|
||||
}
|
||||
|
||||
if ( fmt == FMT_JSON )
|
||||
if ( fmt == FORMAT_JSON )
|
||||
// end the array of collection objects
|
||||
sb.safePrintf("\n]\n");
|
||||
|
||||
@ -2795,7 +2808,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
//
|
||||
// show urls being crawled (ajax) (from Spider.cpp)
|
||||
//
|
||||
if ( fmt == FMT_HTML ) {
|
||||
if ( fmt == FORMAT_HTML ) {
|
||||
sb.safePrintf ( "<table width=100%% cellpadding=5 "
|
||||
"style=border-width:1px;border-style:solid;"
|
||||
"border-color:black;>"
|
||||
@ -2866,7 +2879,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
rand64 |= r2;
|
||||
|
||||
|
||||
if ( fmt == FMT_HTML ) {
|
||||
if ( fmt == FORMAT_HTML ) {
|
||||
sb.safePrintf("<br>"
|
||||
"<table border=0 cellpadding=5>"
|
||||
|
||||
@ -2939,12 +2952,12 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
);
|
||||
}
|
||||
|
||||
if ( injectionResponse && fmt == FMT_HTML )
|
||||
if ( injectionResponse && fmt == FORMAT_HTML )
|
||||
sb.safePrintf("<br><font size=-1>%s</font>\n"
|
||||
,injectionResponse->getBufStart()
|
||||
);
|
||||
|
||||
if ( fmt == FMT_HTML )
|
||||
if ( fmt == FORMAT_HTML )
|
||||
sb.safePrintf(//"<input type=hidden name=c value=\"%s\">"
|
||||
//"<input type=hidden name=crawlbotapi value=1>"
|
||||
"</td>"
|
||||
@ -2983,7 +2996,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
//
|
||||
// show stats
|
||||
//
|
||||
if ( fmt == FMT_HTML ) {
|
||||
if ( fmt == FORMAT_HTML ) {
|
||||
|
||||
char *seedStr = cr->m_diffbotSeeds.getBufStart();
|
||||
if ( ! seedStr ) seedStr = "";
|
||||
@ -3641,7 +3654,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
|
||||
|
||||
// xml or json does not show the input boxes
|
||||
//if ( format != FMT_HTML )
|
||||
//if ( format != FORMAT_HTML )
|
||||
// return g_httpServer.sendDynamicPage ( s,
|
||||
// sb.getBufStart(),
|
||||
// sb.length(),
|
||||
@ -3664,7 +3677,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
s2 = "";
|
||||
}
|
||||
|
||||
if ( fmt == FMT_HTML )
|
||||
if ( fmt == FORMAT_HTML )
|
||||
sb.safePrintf(
|
||||
|
||||
"<a onclick="
|
||||
@ -3708,7 +3721,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
//
|
||||
// print url filters. HACKy...
|
||||
//
|
||||
if ( fmt == FMT_HTML )
|
||||
if ( fmt == FORMAT_HTML )
|
||||
g_parms.sendPageGeneric ( socket ,
|
||||
hr ,
|
||||
PAGE_FILTERS ,
|
||||
@ -3719,7 +3732,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
//
|
||||
// end HACKy hack
|
||||
//
|
||||
if ( fmt == FMT_HTML )
|
||||
if ( fmt == FORMAT_HTML )
|
||||
sb.safePrintf(
|
||||
"</form>"
|
||||
"</div>"
|
||||
@ -3747,7 +3760,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
//
|
||||
// show simpler url filters table
|
||||
//
|
||||
if ( fmt == FMT_HTML ) {
|
||||
if ( fmt == FORMAT_HTML ) {
|
||||
/*
|
||||
sb.safePrintf ( "<table>"
|
||||
"<tr><td colspan=2>"
|
||||
@ -3783,7 +3796,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
//
|
||||
// show reset and delete crawl buttons
|
||||
//
|
||||
if ( fmt == FMT_HTML ) {
|
||||
if ( fmt == FORMAT_HTML ) {
|
||||
sb.safePrintf(
|
||||
"<table cellpadding=5>"
|
||||
"<tr>"
|
||||
@ -3846,13 +3859,13 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
|
||||
|
||||
// the ROOT JSON }
|
||||
if ( fmt == FMT_JSON )
|
||||
if ( fmt == FORMAT_JSON )
|
||||
sb.safePrintf("}\n");
|
||||
|
||||
char *ct = "text/html";
|
||||
if ( fmt == FMT_JSON ) ct = "application/json";
|
||||
if ( fmt == FMT_XML ) ct = "text/xml";
|
||||
if ( fmt == FMT_CSV ) ct = "text/csv";
|
||||
if ( fmt == FORMAT_JSON ) ct = "application/json";
|
||||
if ( fmt == FORMAT_XML ) ct = "text/xml";
|
||||
if ( fmt == FORMAT_CSV ) ct = "text/csv";
|
||||
|
||||
// this could be in html json or xml
|
||||
return g_httpServer.sendDynamicPage ( socket,
|
||||
@ -3946,7 +3959,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
|
||||
// . do not add dups into m_diffbotSeeds safebuf
|
||||
// . return 0 if not in table, 1 if in table. -1 on error adding to table.
|
||||
long isInSeedBuf ( CollectionRec *cr , Url *url ) {
|
||||
long isInSeedBuf ( CollectionRec *cr , char *url, int len ) {
|
||||
|
||||
HashTableX *ht = &cr->m_seedHashTable;
|
||||
|
||||
@ -3973,7 +3986,7 @@ long isInSeedBuf ( CollectionRec *cr , Url *url ) {
|
||||
}
|
||||
|
||||
// is this url in the hash table?
|
||||
long long u64 = hash64 ( url->getUrl() , url->getUrlLen() );
|
||||
long long u64 = hash64 ( url, len );
|
||||
|
||||
if ( ht->isInTable ( &u64 ) ) return 1;
|
||||
|
||||
@ -4072,7 +4085,7 @@ bool getSpiderRequestMetaList ( char *doc ,
|
||||
if ( ! cr ) continue;
|
||||
|
||||
// do not add dups into m_diffbotSeeds safebuf
|
||||
long status = isInSeedBuf ( cr , &url );
|
||||
long status = isInSeedBuf ( cr , saved , end - saved );
|
||||
|
||||
// error?
|
||||
if ( status == -1 ) {
|
||||
@ -4129,7 +4142,7 @@ bool setSpiderParmsFromJSONPost ( TcpSocket *socket ,
|
||||
char *json = hr->getString("json");
|
||||
if ( ! json )
|
||||
return sendReply2 ( socket,
|
||||
FMT_JSON,
|
||||
FORMAT_JSON,
|
||||
"No &json= provided in request.");
|
||||
|
||||
|
||||
@ -4138,12 +4151,12 @@ bool setSpiderParmsFromJSONPost ( TcpSocket *socket ,
|
||||
|
||||
// wtf?
|
||||
if ( ! status )
|
||||
return sendReply2 ( socket, FMT_JSON,
|
||||
return sendReply2 ( socket, FORMAT_JSON,
|
||||
"Error with JSON parser.");
|
||||
|
||||
// error adding it?
|
||||
if ( ! cr )
|
||||
return sendReply2 ( socket,FMT_JSON,
|
||||
return sendReply2 ( socket,FORMAT_JSON,
|
||||
"Failed to create new collection.");
|
||||
|
||||
ji = JP.getFirstItem();
|
||||
|
@ -561,7 +561,7 @@ bool processLoop ( void *state ) {
|
||||
// . save the ips.txt file if we are the test coll
|
||||
// . saveTestBuf() is a function in Msge1.cpp
|
||||
CollectionRec *cr = xd->getCollRec();
|
||||
if ( xd && cr && cr->m_coll && ! strcmp ( cr->m_coll,"test") )
|
||||
if ( xd && cr && cr->m_coll && !strcmp(cr->m_coll,"qatest123"))
|
||||
// use same dir that XmlDoc::getTestDir() would use
|
||||
saveTestBuf ( "test-page-parser" );
|
||||
// now get the meta list, in the process it will print out a
|
||||
@ -855,7 +855,7 @@ bool gotXmlDoc ( void *state ) {
|
||||
|
||||
// . save the ips.txt file if we are the test coll
|
||||
// . saveTestBuf() is a function in Msge1.cpp
|
||||
//if ( xd && xd->m_coll && ! strcmp ( xd->m_coll , "test"))
|
||||
//if ( xd && xd->m_coll && ! strcmp ( xd->m_coll , "qatest123"))
|
||||
// // use same dir that XmlDoc::getTestDir() would use
|
||||
// saveTestBuf ( "test-page-parser" );
|
||||
|
||||
|
@@ -985,7 +985,7 @@ bool printSearchResultsHeader ( State0 *st ) {
sb->safePrintf("\"currentTimeUTC\":%lu,\n", (long)(globalNowMS/1000));
}

// show response time
// show response time if not doing Quality Assurance
if ( si->m_format == FORMAT_XML )
sb->safePrintf("\t<responseTimeMS>%lli</responseTimeMS>\n",
st->m_took);
@@ -2148,8 +2148,13 @@ bool printResult ( State0 *st, long ix ) {
// so fix that shit here...
//float f = mr->m_lastSpidered;
//sb->safePrintf(",\"lastCrawlTimeUTC\":%.0f}",f);
sb->safePrintf(",\"lastCrawlTimeUTC\":%li}\n",
sb->safePrintf(",\"lastCrawlTimeUTC\":%li\n",
mr->m_lastSpidered);
// also include a timestamp field with an RFC 1123 formatted date
char timestamp[50];
struct tm *ptm = gmtime ( &mr->m_lastSpidered );
strftime(timestamp, 50, "%a, %d %b %Y %X %Z", ptm);
sb->safePrintf(",\"timestamp\":\"%s\"}\n", timestamp);
}

//mr->size_content );
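Note on the timestamp field above: strftime's "%X %Z" is locale- and platform-dependent (the locale's time representation plus whatever the zone happens to be named), so the output matches RFC 1123 exactly only under the C locale with gmtime(). A self-contained sketch of the strict RFC 1123 form, with an example time value chosen purely for illustration:

    #include <ctime>
    #include <cstdio>

    int main ( ) {
            time_t lastSpidered = 1383868800;   // example only: 2013-11-08 00:00:00 UTC
            char timestamp[64];
            struct tm *ptm = gmtime ( &lastSpidered );
            strftime ( timestamp , sizeof(timestamp) , "%a, %d %b %Y %H:%M:%S GMT" , ptm );
            printf ( "\"timestamp\":\"%s\"\n" , timestamp );  // Fri, 08 Nov 2013 00:00:00 GMT
            return 0;
    }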
@@ -169,6 +169,11 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("</form>\n");
sb.safePrintf("<br>\n");
sb.safePrintf("\n");

// print any red boxes we might need to
if ( printRedBox2 ( &sb , true ) )
sb.safePrintf("<br>\n");

sb.safePrintf("<table cellpadding=3>\n");
sb.safePrintf("\n");

152  Pages.cpp
@ -50,6 +50,9 @@ static WebPage s_pages[] = {
|
||||
"dummy page - if set in the users row then user will have master=0 and "
|
||||
" collection links will be highlighted in red",
|
||||
NULL, 0 },
|
||||
|
||||
|
||||
|
||||
//{ PAGE_QUALITY , "quality", 0, "quality", 0, 0,
|
||||
// "dummy page - if set in the users row then \"Quality Control\""
|
||||
// " will be printed besides the logo for certain pages",
|
||||
@ -102,12 +105,66 @@ static WebPage s_pages[] = {
|
||||
// "Basic diffbot page.", sendPageBasicDiffbot , 0 } ,
|
||||
{ PAGE_BASIC_SECURITY, "admin/security", 0 , "security",1, 0 ,
|
||||
"Basic security page.", sendPageGeneric , 0 } ,
|
||||
{ PAGE_BASIC_SEARCH, "", 0 , "search",1, 0 ,
|
||||
"Basic search page.", sendPageRoot , 0 } ,
|
||||
|
||||
|
||||
|
||||
{ PAGE_MASTER , "admin/master" , 0 , "master controls" , 1 , 0 ,
|
||||
//USER_MASTER | USER_PROXY ,
|
||||
"master controls page",
|
||||
sendPageGeneric , 0 } ,
|
||||
{ PAGE_SEARCH , "admin" , 0 , "search controls" , 1 , 1,
|
||||
//USER_ADMIN | USER_MASTER ,
|
||||
"search controls page",
|
||||
sendPageGeneric , 0 } ,
|
||||
{ PAGE_SPIDER , "admin/spider" , 0 , "spider controls" , 1 , 0,
|
||||
//USER_ADMIN | USER_MASTER | USER_PROXY ,
|
||||
"spider controls page",
|
||||
sendPageGeneric , 0 } ,
|
||||
{ PAGE_LOG , "admin/log" , 0 , "log controls" , 1 , 0 ,
|
||||
//USER_MASTER | USER_PROXY,
|
||||
"log page",
|
||||
sendPageGeneric , 0 } ,
|
||||
{ PAGE_SECURITY, "admin/security2", 0 , "security" , 1 , 0 ,
|
||||
//USER_MASTER | USER_PROXY ,
|
||||
"advanced security page",
|
||||
sendPageGeneric , 0 } ,
|
||||
{ PAGE_ADDCOLL , "admin/addcoll" , 0 , "add collection" , 1 , 0 ,
|
||||
//USER_MASTER ,
|
||||
"add a new collection using this page",
|
||||
sendPageAddColl , 0 } ,
|
||||
{ PAGE_DELCOLL , "admin/delcoll" , 0 , "delete collections" , 1 ,0,
|
||||
//USER_MASTER ,
|
||||
"delete a collection using this page",
|
||||
sendPageDelColl , 0 } ,
|
||||
{ PAGE_REPAIR , "admin/repair" , 0 , "repair" , 1 , 0 ,
|
||||
//USER_MASTER ,
|
||||
"repair page",
|
||||
sendPageGeneric , 0 },
|
||||
{ PAGE_SITES , "admin/sites", 0 , "site list" , 1 , 1,
|
||||
"what sites can be spidered",
|
||||
sendPageGeneric , 0 } , // sendPageBasicSettings
|
||||
{ PAGE_FILTERS , "admin/filters", 0 , "url filters" , 1 , 1,
|
||||
//USER_ADMIN | USER_MASTER ,
|
||||
"prioritize urls for spidering",
|
||||
sendPageGeneric , 0 } ,
|
||||
{ PAGE_INJECT , "admin/inject" , 0 , "inject url" , 0 , 1 ,
|
||||
//USER_ADMIN | USER_MASTER ,
|
||||
"inject url in the index here",
|
||||
sendPageInject , 2 } ,
|
||||
// this is the addurl page the the admin!
|
||||
{ PAGE_ADDURL2 , "admin/addurl" , 0 , "add urls" , 0 , 0 ,
|
||||
"add url page for admin",
|
||||
sendPageAddUrl2 , 0 } ,
|
||||
{ PAGE_REINDEX , "admin/reindex" , 0 , "query reindex" , 0 , 0 ,
|
||||
//USER_ADMIN | USER_MASTER,
|
||||
"reindex url page",
|
||||
sendPageReindex , 0 } ,
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
{ PAGE_HOSTS , "admin/hosts" , 0 , "hosts" , 0 , 0 ,
|
||||
//USER_MASTER | USER_PROXY,
|
||||
@ -134,10 +191,7 @@ static WebPage s_pages[] = {
|
||||
//USER_MASTER | USER_PROXY,
|
||||
"sockets page",
|
||||
sendPageSockets , 0 } ,
|
||||
{ PAGE_LOG , "admin/log" , 0 , "log controls" , 1 , 0 ,
|
||||
//USER_MASTER | USER_PROXY,
|
||||
"log page",
|
||||
sendPageGeneric , 0 } ,
|
||||
|
||||
{ PAGE_LOGVIEW , "admin/logview" , 0 , "log view" , 0 , 0 ,
|
||||
//USER_MASTER ,
|
||||
"logview page",
|
||||
@ -147,18 +201,6 @@ static WebPage s_pages[] = {
|
||||
// "sync page",
|
||||
// sendPageGeneric , 0 } ,
|
||||
|
||||
{ PAGE_SECURITY, "admin/security2", 0 , "security" , 1 , 0 ,
|
||||
//USER_MASTER | USER_PROXY ,
|
||||
"advanced security page",
|
||||
sendPageGeneric , 0 } ,
|
||||
{ PAGE_ADDCOLL , "admin/addcoll" , 0 , "add collection" , 1 , 0 ,
|
||||
//USER_MASTER ,
|
||||
"add a new collection using this page",
|
||||
sendPageAddColl , 0 } ,
|
||||
{ PAGE_DELCOLL , "admin/delcoll" , 0 , "delete collections" , 1 ,0,
|
||||
//USER_MASTER ,
|
||||
"delete a collection using this page",
|
||||
sendPageDelColl , 0 } ,
|
||||
{ PAGE_AUTOBAN ,"admin/autoban" , 0 , "autoban" , 1 , 1 ,
|
||||
//USER_MASTER | USER_PROXY ,
|
||||
"autobanned ips",
|
||||
@ -175,10 +217,6 @@ static WebPage s_pages[] = {
|
||||
//USER_MASTER ,
|
||||
"threads page",
|
||||
sendPageThreads , 0 },
|
||||
{ PAGE_REPAIR , "admin/repair" , 0 , "repair" , 1 , 0 ,
|
||||
//USER_MASTER ,
|
||||
"repair page",
|
||||
sendPageGeneric , 0 },
|
||||
//{ PAGE_THESAURUS, "admin/thesaurus", 0 , "thesaurus", 0 , 0 ,
|
||||
// //USER_MASTER ,
|
||||
// "thesaurus page",
|
||||
@ -207,14 +245,6 @@ static WebPage s_pages[] = {
|
||||
"titledb page",
|
||||
sendPageTitledb , 2 } ,
|
||||
// 1 = usePost
|
||||
{ PAGE_SEARCH , "admin" , 0 , "search controls" , 1 , 1,
|
||||
//USER_ADMIN | USER_MASTER ,
|
||||
"search controls page",
|
||||
sendPageGeneric , 0 } ,
|
||||
{ PAGE_SPIDER , "admin/spider" , 0 , "spider controls" , 1 , 0,
|
||||
//USER_ADMIN | USER_MASTER | USER_PROXY ,
|
||||
"spider controls page",
|
||||
sendPageGeneric , 0 } ,
|
||||
|
||||
{ PAGE_CRAWLBOT , "crawlbot" , 0 , "crawlbot" , 1 , 0,
|
||||
"simplified spider controls page",
|
||||
@ -229,30 +259,6 @@ static WebPage s_pages[] = {
|
||||
// "spider priorities page",
|
||||
// sendPageGeneric , 0 } ,
|
||||
|
||||
{ PAGE_SITES , "admin/sites", 0 , "site list" , 1 , 1,
|
||||
"what sites can be spidered",
|
||||
sendPageGeneric , 0 } , // sendPageBasicSettings
|
||||
|
||||
{ PAGE_FILTERS , "admin/filters", 0 , "url filters" , 1 , 1,
|
||||
//USER_ADMIN | USER_MASTER ,
|
||||
"prioritize urls for spidering",
|
||||
sendPageGeneric , 0 } ,
|
||||
|
||||
{ PAGE_INJECT , "admin/inject" , 0 , "inject url" , 0 , 1 ,
|
||||
//USER_ADMIN | USER_MASTER ,
|
||||
"inject url in the index here",
|
||||
sendPageInject , 2 } ,
|
||||
|
||||
// this is the addurl page the the admin!
|
||||
{ PAGE_ADDURL2 , "admin/addurl" , 0 , "add urls" , 0 , 0 ,
|
||||
"add url page for admin",
|
||||
sendPageAddUrl2 , 0 } ,
|
||||
|
||||
{ PAGE_REINDEX , "admin/reindex" , 0 , "query reindex" , 0 , 0 ,
|
||||
//USER_ADMIN | USER_MASTER,
|
||||
"reindex url page",
|
||||
sendPageReindex , 0 } ,
|
||||
|
||||
//{ PAGE_KEYWORDS, "admin/queries",0,"queries" , 0 , 1 ,
|
||||
// "get queries a url matches",
|
||||
// sendPageMatchingQueries , 2 } ,
|
||||
@ -893,8 +899,6 @@ bool Pages::getNiceness ( long page ) {
return s_pages[page].m_niceness;
}

bool printRedBox ( SafeBuf *mb ) ;

///////////////////////////////////////////////////////////
//
// Convenient html printing routines
@ -1056,6 +1060,7 @@ bool Pages::printAdminTop (SafeBuf *sb ,
//if ( page == PAGE_BASIC_DIFFBOT ) isBasic = true;
//if ( page == PAGE_BASIC_SEARCH ) isBasic = true;
if ( page == PAGE_BASIC_SECURITY ) isBasic = true;
if ( page == PAGE_BASIC_SEARCH ) isBasic = true;

//
// print breadcrumb. main > Basic > Settings
@ -1791,7 +1796,7 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
// is this page basic?
bool pageBasic = false;
if ( i >= PAGE_BASIC_SETTINGS &&
i <= PAGE_BASIC_SECURITY )
i <= PAGE_BASIC_SEARCH )
pageBasic = true;

// print basic pages under the basic menu, advanced pages
@ -2627,9 +2632,18 @@ bool sendPageLogin ( TcpSocket *socket , HttpRequest *hr ) {
NULL);// cookie
}

bool printRedBox2 ( SafeBuf *sb , bool isRootWebPage ) {
SafeBuf mb;
// return false if no red box
if ( ! printRedBox ( &mb , isRootWebPage ) ) return false;
// otherwise, print it
sb->safeStrcpy ( mb.getBufStart() );
// return true since we printed one
return true;
}

// emergency message box
bool printRedBox ( SafeBuf *mb ) {
bool printRedBox ( SafeBuf *mb , bool isRootWebPage ) {

PingServer *ps = &g_pingServer;

@ -2649,11 +2663,33 @@ bool printRedBox ( SafeBuf *mb ) {
char *boxEnd =
"</td></tr></table>";

bool adds = false;
long adds = 0;


mb->safePrintf("<div style=max-width:500px;>");

// are we just starting off? give them a little help.
CollectionRec *cr = g_collectiondb.getRec("main");
if ( g_collectiondb.m_numRecs == 1 &&
cr &&
isRootWebPage &&
cr->m_globalCrawlInfo.m_pageDownloadAttempts == 0 ) {
if ( adds ) mb->safePrintf("<br>");
adds++;
mb->safePrintf("%s",box);
mb->safePrintf("Welcome to Gigablast. The most powerful "
"search engine you can legally download. "
"Please add the websites you want to spider "
"<a href=/admin/settings?c=main>here</a>."
);
mb->safePrintf("%s",boxEnd);
}

if ( isRootWebPage ) {
mb->safePrintf("</div>");
return (bool)adds;
}

if ( g_conf.m_numConnectIps == 0 && g_conf.m_numMasterPwds == 0 ) {
if ( adds ) mb->safePrintf("<br>");
adds++;
@ -2738,5 +2774,5 @@ bool printRedBox ( SafeBuf *mb ) {

mb->safePrintf("</div>");

return adds;
return (bool)adds;
}
31
Pages.h
@ -5,6 +5,9 @@
#ifndef _PAGES_H_
#define _PAGES_H_

bool printRedBox2 ( SafeBuf *sb , bool isRootWebPage = false ) ;
bool printRedBox ( SafeBuf *mb , bool isRootWebPage = false ) ;

// for PageEvents.cpp and Accessdb.cpp
//#define RESULTSWIDTHSTR "550px"

@ -304,25 +307,36 @@ enum {
//PAGE_BASIC_SEARCH , // TODO
//PAGE_BASIC_DIFFBOT , // TODO
PAGE_BASIC_SECURITY ,
PAGE_BASIC_SEARCH ,

// master admin pages
PAGE_MASTER ,
PAGE_SEARCH ,
PAGE_SPIDER ,
PAGE_LOG ,
PAGE_SECURITY ,
PAGE_ADDCOLL ,
PAGE_DELCOLL ,
PAGE_REPAIR ,
PAGE_SITES , // site filters
PAGE_FILTERS ,
PAGE_INJECT ,
PAGE_ADDURL2 ,
PAGE_REINDEX ,

PAGE_HOSTS ,
PAGE_STATS , // 10
PAGE_STATSDB ,
PAGE_PERF ,
PAGE_SOCKETS ,
PAGE_LOG ,

PAGE_LOGVIEW ,
// PAGE_SYNC ,
PAGE_SECURITY ,
PAGE_ADDCOLL ,
PAGE_DELCOLL ,
PAGE_AUTOBAN , // 20
//PAGE_SPIDERLOCKS ,
PAGE_PROFILER ,
PAGE_THREADS ,
PAGE_REPAIR ,

// PAGE_THESAURUS ,

// . non master-admin pages (collection controls)
@ -335,16 +349,9 @@ enum {
PAGE_TITLEDB ,
//PAGE_STATSDB ,

PAGE_SEARCH ,
PAGE_SPIDER ,
PAGE_CRAWLBOT , // 35
PAGE_SPIDERDB ,
//PAGE_PRIORITIES , // priority queue controls
PAGE_SITES , // site filters
PAGE_FILTERS ,
PAGE_INJECT ,
PAGE_ADDURL2 ,
PAGE_REINDEX ,
//PAGE_KEYWORDS ,
PAGE_SEO ,
PAGE_ACCESS , //40
121
Parms.cpp
@ -122,6 +122,40 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) ;
//
////////


// from PageBasic.cpp:
bool updateSiteListTables(collnum_t collnum,bool addSeeds,char *siteListArg);

bool CommandUpdateSiteList ( char *rec ) {
// caller must specify collnum
collnum_t collnum = getCollnumFromParmRec ( rec );
if ( collnum < 0 ) {
log("parms: bad collnum for update site list");
g_errno = ENOCOLLREC;
return true;
}
// sanity
long dataSize = getDataSizeFromParmRec ( rec );
if ( dataSize < 0 ) {
log("parms: bad site list size = %li bad!",dataSize);
g_errno = EBADENGINEER;
return true;
}
// need this
CollectionRec *cr = g_collectiondb.getRec ( collnum );
// get the sitelist
char *data = getDataFromParmRec ( rec );
// update it
updateSiteListTables ( collnum ,
true , // add NEW seeds?
data // entire sitelist
);
// now that we deduped the old site list with the new one for
// purposes of adding NEW seeds, we can do the final copy
cr->m_siteListBuf.set ( data );
return true;
}

// . require user manually execute this to prevent us fucking up the data
|
||||
// at first initially because of a bad hosts.conf file!!!
|
||||
// . maybe put a red 'A' in the hosts table on the web page to indicate
|
||||
@ -450,7 +484,7 @@ bool CommandParserTestInit ( char *rec ) {
|
||||
g_conf.m_spideringEnabled = 1;
|
||||
//g_conf.m_webSpideringEnabled = 1;
|
||||
// turn on for test coll too
|
||||
CollectionRec *cr = g_collectiondb.getRec("test");
|
||||
CollectionRec *cr = g_collectiondb.getRec("qatest123");
|
||||
// turn on spiders
|
||||
if ( cr ) cr->m_spideringEnabled = 1;
|
||||
// if we are not host 0, turn on spiders for testing
|
||||
@ -470,7 +504,7 @@ bool CommandSpiderTestInit ( char *rec ) {
|
||||
g_conf.m_spideringEnabled = 1;
|
||||
//g_conf.m_webSpideringEnabled = 1;
|
||||
// turn on for test coll too
|
||||
CollectionRec *cr = g_collectiondb.getRec("test");
|
||||
CollectionRec *cr = g_collectiondb.getRec("qatest123");
|
||||
// turn on spiders
|
||||
if ( cr ) cr->m_spideringEnabled = 1;
|
||||
// if we are not host 0, turn on spiders for testing
|
||||
@ -488,7 +522,7 @@ bool CommandSpiderTestCont ( char *rec ) {
|
||||
g_conf.m_spideringEnabled = 1;
|
||||
//g_conf.m_webSpideringEnabled = 1;
|
||||
// turn on for test coll too
|
||||
CollectionRec *cr = g_collectiondb.getRec("test");
|
||||
CollectionRec *cr = g_collectiondb.getRec("qatest123");
|
||||
// turn on spiders
|
||||
if ( cr ) cr->m_spideringEnabled = 1;
|
||||
// done
|
||||
@ -1888,7 +1922,7 @@ bool Parms::printParm ( SafeBuf* sb,
|
||||
"value=\"%f\" "
|
||||
// 3 was ok on firefox but need 6
|
||||
// on chrome
|
||||
"size=6>",cgi,*(float *)s);
|
||||
"size=7>",cgi,*(float *)s);
|
||||
}
|
||||
else if ( t == TYPE_IP ) {
|
||||
if ( m->m_max > 0 && j == jend )
|
||||
@ -1896,7 +1930,7 @@ bool Parms::printParm ( SafeBuf* sb,
|
||||
"size=12>",cgi);
|
||||
else
|
||||
sb->safePrintf ("<input type=text name=%s value=\"%s\" "
|
||||
"size=6>",cgi,iptoa(*(long *)s));
|
||||
"size=12>",cgi,iptoa(*(long *)s));
|
||||
}
|
||||
else if ( t == TYPE_LONG ) {
|
||||
// just show the parm name and value if printing in json
|
||||
@ -5080,6 +5114,27 @@ void Parms::init ( ) {
m++;
*/

m->m_title = "init QA tests";
m->m_desc = "If initiated gb performs some integrity tests "
"to ensure injecting, spidering and searching works "
"properly. Uses ./test/ subdirectory. Injects "
"urls in ./test/inject.txt. Spiders urls "
"in ./test/spider.txt. "
"Each of those two files is essentially a simple format of "
"a url followed by the http reply received from the server "
"for that url. "
// TODO: generate these files
;
m->m_cgi = "qasptei";
m->m_type = TYPE_CMD;
m->m_func = CommandSpiderTestInit;
m->m_def = "1";
m->m_cast = 1;
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m++;

m->m_title = "init parser test run";
|
||||
m->m_desc = "If enabled gb injects the urls in the "
|
||||
"./test-parser/urls.txt "
|
||||
@ -7513,6 +7568,7 @@ void Parms::init ( ) {
|
||||
m->m_flags = PF_TEXTAREA;
|
||||
m++;
|
||||
|
||||
/*
|
||||
// the new upload post submit button
|
||||
m->m_title = "upload urls";
|
||||
m->m_desc = "Upload your file of urls.";
|
||||
@ -7521,6 +7577,7 @@ void Parms::init ( ) {
|
||||
m->m_obj = OBJ_NONE;
|
||||
m->m_type = TYPE_FILEUPLOADBUTTON;
|
||||
m++;
|
||||
*/
|
||||
|
||||
m->m_title = "strip sessionids";
|
||||
m->m_desc = "Strip added urls of their session ids.";
|
||||
@ -7570,6 +7627,7 @@ void Parms::init ( ) {
|
||||
m->m_title = "site list";
|
||||
m->m_xml = "siteList";
|
||||
m->m_desc = "List of sites to spider, one per line. "
|
||||
"See <a href=#examples>example site list</a> below. "
|
||||
"Gigablast uses the "
|
||||
"<a href=/admin/filters#insitelist>insitelist</a> "
|
||||
"directive on "
|
||||
@ -7578,8 +7636,7 @@ void Parms::init ( ) {
|
||||
"that match the site patterns you specify here, other than "
|
||||
"urls you add individually via the add urls or inject url "
|
||||
"tools. "
|
||||
"See <a href=#examples>example site list</a> below. "
|
||||
"Limit list to 300MB. If you have a lot of INDIVIDUAL URLS "
|
||||
"Limit list to 300MB. If you have a lot of INDIVIDUAL urls "
|
||||
"to add then consider using the <a href=/admin/addurl>add "
|
||||
"urls</a> interface.";
|
||||
m->m_cgi = "sitelist";
|
||||
@ -7587,6 +7644,7 @@ void Parms::init ( ) {
|
||||
m->m_page = PAGE_BASIC_SETTINGS;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m->m_type = TYPE_SAFEBUF;
|
||||
m->m_func = CommandUpdateSiteList;
|
||||
m->m_def = "";
|
||||
// rebuild urlfilters now will nuke doledb and call updateSiteList()
|
||||
m->m_flags = PF_TEXTAREA | PF_DUP | PF_REBUILDURLFILTERS;
|
||||
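For illustration only, a site list in the one-pattern-per-line form this parameter describes might look like the lines below. The exact pattern syntax is governed by the insitelist directive documented on the url filters page; these entries are placeholders, not anything taken from the commit.

example.com
www.example.org/blog/
http://test.example.net/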
@ -7608,6 +7666,7 @@ void Parms::init ( ) {
|
||||
m++;
|
||||
*/
|
||||
|
||||
/*
|
||||
// the new upload post submit button
|
||||
m->m_title = "upload site list";
|
||||
m->m_desc = "Upload your file of site patterns. Completely replaces "
|
||||
@ -7619,12 +7678,13 @@ void Parms::init ( ) {
|
||||
m->m_type = TYPE_FILEUPLOADBUTTON;
|
||||
m->m_flags = PF_NOSAVE | PF_DUP;
|
||||
m++;
|
||||
*/
|
||||
|
||||
m->m_title = "restart collection";
|
||||
m->m_desc = "Remove all documents from this collection and starts "
|
||||
"spidering over again. If you do this accidentally there "
|
||||
"is a <a href=/admin.html#recover>recovery procedure</a> to "
|
||||
"get back the trashed data.";
|
||||
m->m_desc = "Remove all documents from this collection and restart "
|
||||
"spidering.";// If you do this accidentally there "
|
||||
//"is a <a href=/admin.html#recover>recovery procedure</a> to "
|
||||
// "get back the trashed data.";
|
||||
m->m_cgi = "restart";
|
||||
m->m_page = PAGE_BASIC_SETTINGS;
|
||||
m->m_obj = OBJ_COLL;
|
||||
@ -7638,6 +7698,7 @@ void Parms::init ( ) {
|
||||
m->m_title = "site list";
|
||||
m->m_xml = "siteList";
|
||||
m->m_desc = "List of sites to spider, one per line. "
|
||||
"See <a href=#examples>example site list</a> below. "
|
||||
"Gigablast uses the "
|
||||
"<a href=/admin/filters#insitelist>insitelist</a> "
|
||||
"directive on "
|
||||
@ -7646,8 +7707,7 @@ void Parms::init ( ) {
|
||||
"that match the site patterns you specify here, other than "
|
||||
"urls you add individually via the add urls or inject url "
|
||||
"tools. "
|
||||
"See <a href=#examples>example site list</a> below. "
|
||||
"Limit list to 300MB. If you have a lot of INDIVIDUAL URLS "
|
||||
"Limit list to 300MB. If you have a lot of INDIVIDUAL urls "
|
||||
"to add then consider using the <a href=/admin/addurl>addurl"
|
||||
"</a> interface.";
|
||||
m->m_cgi = "sitelist";
|
||||
@ -7655,6 +7715,7 @@ void Parms::init ( ) {
|
||||
m->m_page = PAGE_SITES;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m->m_type = TYPE_SAFEBUF;
|
||||
m->m_func = CommandUpdateSiteList;
|
||||
m->m_def = "";
|
||||
// rebuild urlfilters now will nuke doledb and call updateSiteList()
|
||||
m->m_flags = PF_TEXTAREA | PF_REBUILDURLFILTERS;
|
||||
@ -8741,11 +8802,11 @@ void Parms::init ( ) {
|
||||
m++;
|
||||
|
||||
m->m_title = "max robots.txt cache age";
|
||||
m->m_desc = "How many second to cache a robots.txt file for. "
|
||||
m->m_desc = "How many seconds to cache a robots.txt file for. "
|
||||
"86400 is 1 day. 0 means Gigablast will not read from the "
|
||||
"cache at all and will download the robots.txt before every "
|
||||
"page if robots.txt use is enabled above. However, if this is "
|
||||
"0 then Gigablast will still store robots.txt files into the "
|
||||
"0 then Gigablast will still store robots.txt files in the "
|
||||
"cache.";
|
||||
m->m_cgi = "mrca";
|
||||
m->m_off = (char *)&cr.m_maxRobotsCacheAge - x;
|
||||
@ -10618,8 +10679,9 @@ void Parms::init ( ) {
|
||||
m++;
|
||||
|
||||
m->m_title = "do query expansion";
|
||||
m->m_desc = "Query expansion will include word stems and synonyms in "
|
||||
"its search results.";
|
||||
m->m_desc = "If enabled, query expansion will expand your query "
|
||||
"to include word stems and "
|
||||
"synonyms of the query terms.";
|
||||
m->m_def = "1";
|
||||
m->m_off = (char *)&cr.m_queryExpansion - x;
|
||||
m->m_soff = (char *)&si.m_queryExpansion - y;
|
||||
@ -10632,7 +10694,7 @@ void Parms::init ( ) {
|
||||
|
||||
// more general parameters
|
||||
m->m_title = "max search results";
|
||||
m->m_desc = "What is the limit to the total number "
|
||||
m->m_desc = "What is the maximum total number "
|
||||
"of returned search results.";
|
||||
m->m_cgi = "msr";
|
||||
m->m_off = (char *)&cr.m_maxSearchResults - x;
|
||||
@ -12436,7 +12498,7 @@ void Parms::init ( ) {
|
||||
m++;
|
||||
|
||||
m->m_title = "max summary line width";
|
||||
m->m_desc = "<br> tags are inserted to keep the number "
|
||||
m->m_desc = "<br> tags are inserted to keep the number "
|
||||
"of chars in the summary per line at or below this width. "
|
||||
"Strings without spaces that exceed this "
|
||||
"width are not split.";
|
||||
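A minimal sketch of the wrapping rule the "max summary line width" parameter above describes: insert <br> so each summary line stays at or below the width, but never split a token that contains no spaces. This is an illustrative reimplementation under those stated rules, not the engine's actual summary code.

#include <string>
#include <cstddef>

// Wrap a summary string with <br> tags so no line exceeds maxWidth chars,
// except for single space-free tokens longer than maxWidth, which are kept
// intact on their own line.
static std::string wrapSummary ( const std::string &in , size_t maxWidth ) {
	std::string out;
	size_t lineLen = 0;
	size_t i = 0;
	while ( i < in.size() ) {
		// grab the next space-delimited token
		size_t j = in.find(' ', i);
		if ( j == std::string::npos ) j = in.size();
		size_t tokLen = j - i;
		// break the line if adding the token would exceed maxWidth,
		// unless the line is empty (an over-long token is never split)
		if ( lineLen > 0 && lineLen + 1 + tokLen > maxWidth ) {
			out += "<br>";
			lineLen = 0;
		}
		else if ( lineLen > 0 ) {
			out += ' ';
			lineLen++;
		}
		out.append(in, i, tokLen);
		lineLen += tokLen;
		i = ( j < in.size() ) ? j + 1 : j;
	}
	return out;
}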
@ -15299,6 +15361,18 @@ void Parms::init ( ) {
|
||||
m->m_smin = 0;
|
||||
m++;
|
||||
|
||||
// when we do &qa=1 we do not show things like responseTime in
|
||||
// search results so we can verify serp checksum consistency for QA
|
||||
// in qa.cpp
|
||||
m->m_title = "quality assurance";
|
||||
m->m_desc = "This is 1 if doing a QA test in qa.cpp";
|
||||
m->m_def = "0";
|
||||
m->m_soff = (char *)&si.m_qa - y;
|
||||
m->m_type = TYPE_CHAR;
|
||||
m->m_sparm = 1;
|
||||
m->m_scgi = "qa";
|
||||
m++;
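A hedged sketch of how this flag can be consumed: the new SearchInput::m_qa field (added in SearchInput.h later in this diff) lets a result printer skip volatile fields such as the responseTimeMS value printed near the top of this diff, so QA serp checksums stay stable across runs. The exact gating below is an assumption for illustration, not necessarily how qa.cpp or PageResults.cpp does it.

// only print timing info when this is not a &qa=1 consistency run
if ( ! si->m_qa )
	sb->safePrintf("\t<responseTimeMS>%lli</responseTimeMS>\n",
	               st->m_took);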
|
||||
|
||||
//m->m_title = "show turk forms";
|
||||
//m->m_desc = "If enabled summaries in search results will be "
|
||||
// "turkable input forms.";
|
||||
@ -16744,7 +16818,6 @@ bool Parms::addCurrentParmToList2 ( SafeBuf *parmList ,
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// returns false and sets g_errno on error
|
||||
bool Parms::convertHttpRequestToParmList (HttpRequest *hr, SafeBuf *parmList,
|
||||
long page ){
|
||||
@ -18019,7 +18092,11 @@ bool Parms::updateParm ( char *rec , WaitEntry *we ) {
|
||||
}
|
||||
|
||||
// cmd to execute?
|
||||
if ( parm->m_type == TYPE_CMD ) {
|
||||
if ( parm->m_type == TYPE_CMD ||
|
||||
// sitelist is a safebuf but it requires special deduping
|
||||
// logic to update it so it uses CommandUpdateSiteList() to
|
||||
// do the updating
|
||||
parm->m_func ) {
|
||||
// all parm rec data for TYPE_CMD should be ascii/utf8 chars
|
||||
// and should be \0 terminated
|
||||
char *data = getDataFromParmRec ( rec );
|
||||
@ -18268,7 +18345,7 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
|
||||
CollectionRec *cr = (CollectionRec *)THIS;
|
||||
// if testUrl is provided, find in the table
|
||||
char testUrl [ 1025 ];
|
||||
char *tt = r->getString ( "test" , NULL );
|
||||
char *tt = r->getString ( "qatest123" , NULL );
|
||||
testUrl[0]='\0';
|
||||
if ( tt ) strncpy ( testUrl , tt , 1024 );
|
||||
char *tu = testUrl;
|
||||
|
@ -5158,7 +5158,7 @@ char *Proxy::storeLoginBar ( char *reply ,
|
||||
}
|
||||
// point to first digit in there
|
||||
mp += 16;
|
||||
// store our new content length as ascii into "test" buf
|
||||
// store our new content length as ascii into test buf
|
||||
char test[64];
|
||||
long len = sprintf(test,"%li",(long)(newReplySize-mimeLen));
|
||||
// find end
|
||||
|
@ -60,6 +60,7 @@ struct SafeBuf {
long fillFromFile(char *filename);
long fillFromFile(char *dir,char *filename);
long load(char *dir,char *fname) { return fillFromFile(dir,fname);};
long load(char *fname) { return fillFromFile(fname);};

void filterTags();
void filterQuotes();
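A small usage sketch for the load() wrappers declared above. The file names are made up, and the negative-return-on-error check is an assumption about fillFromFile()'s convention rather than something this diff states.

SafeBuf sb;
// pull ./coll.main.0/coll.conf into the buffer via the new wrapper
if ( sb.load ( "./coll.main.0" , "coll.conf" ) < 0 )
	log("admin: failed to load coll.conf");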
@ -179,6 +179,9 @@ class SearchInput {
long m_queryMatchOffsets;
long m_summaryMode;

// are we doing a QA query for quality assurance consistency
char m_qa;

float m_pqr_demFactSubPhrase;
float m_pqr_demFactCommonInlinks;
float m_pqr_demFactLocTitle;
15
Sections.cpp
@ -1288,7 +1288,7 @@ bool Sections::set ( Words *w ,
|
||||
}
|
||||
|
||||
|
||||
m_isTestColl = ! strcmp(m_coll,"test") ;
|
||||
m_isTestColl = ! strcmp(m_coll,"qatest123") ;
|
||||
|
||||
//
|
||||
//
|
||||
@ -15163,7 +15163,7 @@ bool Sections::printVotingInfoInJSON ( SafeBuf *sb ) {
|
||||
// breathe
|
||||
QUICKPOLL ( m_niceness );
|
||||
// print this section
|
||||
printSectionDiv ( sk , FMT_JSON ); // forProCog );
|
||||
printSectionDiv ( sk , FORMAT_JSON ); // forProCog );
|
||||
// advance
|
||||
long b = sk->m_b;
|
||||
// stop if last
|
||||
@ -15190,7 +15190,8 @@ bool Sections::print2 ( SafeBuf *sbuf ,
|
||||
HashTableX *st2 ,
|
||||
HashTableX *tt ,
|
||||
Addresses *aa ,
|
||||
char format ) { // bool forProCog ){//FMT_PROCOG FMT_JSON HTML
|
||||
char format ) { // bool forProCog ){
|
||||
//FORMAT_PROCOG FORMAT_JSON HTML
|
||||
|
||||
//sbuf->safePrintf("<b>Sections in Document</b>\n");
|
||||
|
||||
@ -15244,7 +15245,7 @@ bool Sections::print2 ( SafeBuf *sbuf ,
|
||||
sk = m_sectionPtrs[b];
|
||||
}
|
||||
|
||||
if ( format != FMT_HTML ) return true; // forProCog
|
||||
if ( format != FORMAT_HTML ) return true; // forProCog
|
||||
|
||||
// print header
|
||||
char *hdr =
|
||||
@ -15553,7 +15554,7 @@ bool Sections::printSectionDiv ( Section *sk , char format ) { // bool forProCog
|
||||
// m_sbuf->safePrintf("A=%li ",sk->m_a);
|
||||
|
||||
|
||||
if ( format == FMT_PROCOG && sk->m_stats.m_numUniqueSites >= 2 ) {
|
||||
if ( format == FORMAT_PROCOG && sk->m_stats.m_numUniqueSites >= 2 ) {
|
||||
// do not count our own site!
|
||||
m_sbuf->safePrintf("<i>"
|
||||
"<font size=-1>"
|
||||
@ -15573,7 +15574,7 @@ bool Sections::printSectionDiv ( Section *sk , char format ) { // bool forProCog
|
||||
|
||||
m_sbuf->safePrintf("<i>");
|
||||
|
||||
if ( format == FMT_PROCOG && (sk->m_flags & SEC_SENTENCE) ) {
|
||||
if ( format == FORMAT_PROCOG && (sk->m_flags & SEC_SENTENCE) ) {
|
||||
sec_t f = sk->m_flags;
|
||||
//if ( f & SEC_SENTENCE )
|
||||
// m_sbuf->safePrintf("sentence " );
|
||||
@ -15598,7 +15599,7 @@ bool Sections::printSectionDiv ( Section *sk , char format ) { // bool forProCog
|
||||
// m_sbuf->safePrintf("notdupvotes=%li ",
|
||||
// sk->m_votesForNotDup);
|
||||
|
||||
if ( format != FMT_PROCOG ) {
|
||||
if ( format != FORMAT_PROCOG ) {
|
||||
// print the flags
|
||||
m_sbuf->safePrintf("A=%li ",sk->m_a);
|
||||
|
||||
|
@ -146,8 +146,8 @@ bool SiteGetter::getSite ( char *url ,
|
||||
long age = -1;
|
||||
//long now = getTimeGlobal();
|
||||
//if ( tag ) age = now - tag->m_timestamp;
|
||||
// to parse conssitently for the qa test "test" coll use "timestamp"
|
||||
// as the "current time"
|
||||
// to parse conssitently for the qa test "qatest123" coll use
|
||||
// "timestamp" as the "current time"
|
||||
if ( tag ) age = timestamp - tag->m_timestamp;
|
||||
// if there, at least get it (might be -1)
|
||||
if ( tag ) m_oldSitePathDepth = atol ( tag->getTagData() );
|
||||
@ -534,7 +534,7 @@ bool SiteGetter::setSite ( ) {
|
||||
//TagRec gr;
|
||||
m_addedTag.addTag ( "sitepathdepth" ,
|
||||
// now XmlDoc must provide it to ensure that are
|
||||
// injects into the "test" coll are consistent
|
||||
// injects into the "qatest123" coll are consistent
|
||||
m_timestamp ,//getTime()// use now as timestamp
|
||||
"sitegit" , // username
|
||||
0 , // ip
|
||||
|
12
Spider.cpp
@ -1082,7 +1082,7 @@ SpiderColl *SpiderCache::getSpiderColl ( collnum_t collnum ) {
|
||||
// save this
|
||||
strcpy ( sc->m_coll , cr->m_coll );
|
||||
// set this
|
||||
if ( ! strcmp ( cr->m_coll,"test" ) ) sc->m_isTestColl = true;
|
||||
if ( ! strcmp ( cr->m_coll,"qatest123" ) ) sc->m_isTestColl = true;
|
||||
else sc->m_isTestColl = false;
|
||||
|
||||
// set first doledb scan key
|
||||
@ -6761,12 +6761,12 @@ bool SpiderLoop::spiderUrl2 ( ) {
|
||||
char *coll = "collnumwasinvalid";
|
||||
if ( cr ) coll = cr->m_coll;
|
||||
|
||||
// . pass in a pbuf if this is the "test" collection
|
||||
// . pass in a pbuf if this is the "qatest123" collection
|
||||
// . we will dump the SafeBuf output into a file in the
|
||||
// test subdir for comparison with previous versions of gb
|
||||
// in order to see what changed
|
||||
SafeBuf *pbuf = NULL;
|
||||
if ( !strcmp( coll,"test") && g_conf.m_testParserEnabled )
|
||||
if ( !strcmp( coll,"qatest123") && g_conf.m_testParserEnabled )
|
||||
pbuf = &xd->m_sbuf;
|
||||
|
||||
//
|
||||
@ -6969,10 +6969,10 @@ bool SpiderLoop::indexedDoc ( XmlDoc *xd ) {
|
||||
bool respider = false;
|
||||
if ( xd->m_oldDocValid && xd->m_oldDoc ) respider = true;
|
||||
|
||||
// . dump it out to a file in the "test" subdir
|
||||
// . dump it out to a file in the "qatest123" subdir
|
||||
// . but only the first time we spider it...
|
||||
/*
|
||||
if ( ! strcmp(xd->m_coll,"test") && ! respider &&
|
||||
if ( ! strcmp(xd->m_coll,"qatest123") && ! respider &&
|
||||
// no longer need this when qa testing spider, not parser
|
||||
g_conf.m_testParserEnabled ) {
|
||||
// save the buffers
|
||||
@ -12414,7 +12414,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , long *status ) {
|
||||
if ( cx->m_isCustomCrawl )
|
||||
return msg->safePrintf("Job is in progress.");
|
||||
else
|
||||
return true;
|
||||
return msg->safePrintf("Spider is in progress.");
|
||||
}
|
||||
|
||||
// pattern is a ||-separted list of substrings
|
||||
|
@ -3103,7 +3103,7 @@ void TagRec::gotAllReplies ( ) {
|
||||
|
||||
// site getter sometimes adds recs to tagdb to add in a new subsite
|
||||
// it finds... i'd imagine this will create a parsing inconsistency
|
||||
// when injecting docs into the "test" coll... but oh well!
|
||||
// when injecting docs into the "qatest123" coll... but oh well!
|
||||
long timestamp = getTimeGlobal();
|
||||
|
||||
// . begin the "inheritance loop"
|
||||
@ -3288,7 +3288,7 @@ bool Msg9a::addTags ( char *sites ,
|
||||
// when we add the "site" tag to it use the timestamp from one
|
||||
// of the tags we are adding... therefore we must require there be
|
||||
// some tags! we do this to insure injection consistency into the
|
||||
// "test" collection.
|
||||
// "qatest123" collection.
|
||||
if ( ! tagRec || tagRec->getNumTags() <= 0 ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// use the first timestamp
|
||||
|
10
Test.cpp
@ -55,7 +55,7 @@ bool Test::init ( ) {
|
||||
}
|
||||
|
||||
void Test::reset ( ) {
|
||||
if ( m_urlBuf ) mfree ( m_urlBuf , m_urlEnd - m_urlBuf , "test");
|
||||
if ( m_urlBuf ) mfree ( m_urlBuf , m_urlEnd - m_urlBuf , "test999");
|
||||
//m_spiderLinks = true;//false;
|
||||
m_bypassMenuElimination = false;
|
||||
}
|
||||
@ -122,7 +122,7 @@ void Test::removeFiles ( ) {
|
||||
long saved = g_conf.m_useQuickpoll;
|
||||
g_conf.m_useQuickpoll = false;
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec("test");
|
||||
CollectionRec *cr = g_collectiondb.getRec("qatest123");
|
||||
|
||||
// . reset the qatest collection to zero docs
|
||||
// . TODO: implement this. only allow it for qatest coll.
|
||||
@ -172,8 +172,8 @@ void Test::initTestRun ( ) {
|
||||
//if ( m_testSpiderEnabledSaved ) return;
|
||||
//if ( m_testParserEnabledSaved ) return;
|
||||
|
||||
// you must have the "test" coll already setup!
|
||||
CollectionRec *cr = g_collectiondb.getRec("test");
|
||||
// you must have the "qatest123" coll already setup!
|
||||
CollectionRec *cr = g_collectiondb.getRec("qatest123");
|
||||
if ( ! cr ) {
|
||||
// note it
|
||||
log("test: please add a collection named \"test\" first.");
|
||||
@ -233,7 +233,7 @@ void Test::initTestRun ( ) {
|
||||
// save it
|
||||
m_runId = i;
|
||||
|
||||
cr = g_collectiondb.getRec ( "test" );
|
||||
cr = g_collectiondb.getRec ( "qatest123" );
|
||||
if ( ! cr ) {
|
||||
// and no more of this
|
||||
g_conf.m_testParserEnabled = false;
|
||||
|
12
Title.cpp
@ -71,6 +71,7 @@ void Title::reset() {
|
||||
mfree ( m_title , m_titleAllocSize , "Title" );
|
||||
m_title = NULL;
|
||||
m_titleBytes = 0;
|
||||
m_titleAllocSize = 0;
|
||||
m_query = NULL;
|
||||
m_titleTagStart = -1;
|
||||
m_titleTagEnd = -1;
|
||||
@ -113,7 +114,7 @@ bool Title::setTitle ( XmlDoc *xd ,
|
||||
char *val = NULL;
|
||||
// look for the "title:" field in json then use that
|
||||
SafeBuf jsonTitle;
|
||||
long vlen;
|
||||
long vlen = 0;
|
||||
if ( xd->m_contentType == CT_JSON ) {
|
||||
char *jt;
|
||||
jt = getJSONFieldValue(xd->ptr_utf8Content,"title",&vlen);
|
||||
@ -124,7 +125,6 @@ bool Title::setTitle ( XmlDoc *xd ,
|
||||
val = jsonTitle.getBufStart();
|
||||
vlen = jsonTitle.length();
|
||||
}
|
||||
|
||||
}
|
||||
// if we had a title: field in the json...
|
||||
if ( val && vlen > 0 ) {
|
||||
@ -135,6 +135,7 @@ bool Title::setTitle ( XmlDoc *xd ,
|
||||
else {
|
||||
dst = (char *)mmalloc ( m_titleBytes+1,"titdst" );
|
||||
if ( ! dst ) return false;
|
||||
m_titleAllocSize = m_titleBytes+1;
|
||||
}
|
||||
m_title = dst;
|
||||
memcpy ( dst , val , m_titleBytes );
|
||||
@ -142,6 +143,13 @@ bool Title::setTitle ( XmlDoc *xd ,
|
||||
return true;
|
||||
}
|
||||
|
||||
// json content, if has no explicit title field, has no title then
|
||||
if ( xd->m_contentType == CT_JSON ) {
|
||||
m_localBuf[0] = '\0';
|
||||
m_title = m_localBuf;
|
||||
m_titleBytes = 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool status = setTitle4 ( xd ,
|
||||
xml ,
|
||||
|
93
XmlDoc.cpp
@ -879,8 +879,8 @@ bool XmlDoc::set1 ( char *url ,
|
||||
char *XmlDoc::getTestDir ( ) {
|
||||
CollectionRec *cr = getCollRec();
|
||||
if ( ! cr ) return NULL;
|
||||
// return NULL if we are not the "test" collection
|
||||
if ( strcmp(cr->m_coll,"test") ) return NULL;
|
||||
// return NULL if we are not the "qatest123" collection
|
||||
if ( strcmp(cr->m_coll,"qatest123") ) return NULL;
|
||||
// if Test.cpp explicitly set SpiderRequest::m_useTestSpiderDir bit
|
||||
// then return "test-spider" otherwise...
|
||||
if ( m_sreqValid && m_sreq.m_useTestSpiderDir )
|
||||
@ -914,7 +914,7 @@ long XmlDoc::getSpideredTime ( ) {
|
||||
if ( ! cr ) return 0;
|
||||
|
||||
// if not test collection keep it simple
|
||||
if ( strcmp(cr->m_coll,"test") ) {
|
||||
if ( strcmp(cr->m_coll,"qatest123") ) {
|
||||
// . set spider time to current time
|
||||
// . this might already be valid if we set it in
|
||||
// getTestSpideredDate()
|
||||
@ -3295,13 +3295,13 @@ char *XmlDoc::prepareToMakeTitleRec ( ) {
|
||||
CollectionRec *cr = getCollRec();
|
||||
if ( ! cr ) return NULL;
|
||||
|
||||
// if we are injecting into the "test" coll, then we need to have
|
||||
// if we are injecting into the "qatest123" coll, then we need to have
|
||||
// m_spideredTimeValid be true before calling getIsSpam() which calls
|
||||
// getSiteNumInlinks() which adds tags to tagdb using that date, but
|
||||
// only for the "test" coll! that keeps our parser output consistent
|
||||
// across runs!
|
||||
// only for the "qatest123" coll!
|
||||
// that keeps our parser output consistent across runs!
|
||||
char **content = NULL;
|
||||
if ( ! strcmp ( cr->m_coll,"test") ) {
|
||||
if ( ! strcmp ( cr->m_coll,"qatest123") ) {
|
||||
content = getContent ( );
|
||||
if ( ! content || content == (void *)-1 )
|
||||
return (char *)content;
|
||||
@ -11842,7 +11842,7 @@ long *XmlDoc::getSiteNumInlinks ( ) {
|
||||
// current time
|
||||
long now = getTimeGlobal();
|
||||
// use the spidered time for the test collection for consistency
|
||||
if ( !strcmp(cr->m_coll,"test") ) {
|
||||
if ( !strcmp(cr->m_coll,"qatest123") ) {
|
||||
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
||||
now = getSpideredTime();//m_spideredTime;
|
||||
}
|
||||
@ -12061,8 +12061,8 @@ LinkInfo *XmlDoc::getSiteLinkInfo() {
|
||||
// get from spider request if there
|
||||
//bool injected = false;
|
||||
//if ( m_sreqValid && m_sreq.m_isInjecting ) injected = true;
|
||||
// but be consistent if doing the "test" collection
|
||||
if ( ! strcmp(cr->m_coll,"test") ) {
|
||||
// but be consistent if doing the "qatest123" collection
|
||||
if ( ! strcmp(cr->m_coll,"qatest123") ) {
|
||||
//if ( ! m_spideredTimeValid ) {char *xx=NULL;*xx=0;}
|
||||
lastUpdateTime = getSpideredTime();//m_spideredTime;
|
||||
}
|
||||
@ -12164,14 +12164,14 @@ long *XmlDoc::getIp ( ) {
|
||||
if ( ! cr ) return NULL;
|
||||
|
||||
bool useTestCache = false;
|
||||
if ( ! strcmp(cr->m_coll,"test") ) useTestCache = true;
|
||||
if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
|
||||
// unless its the pagesubmit.cpp event submission tool
|
||||
//if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false;
|
||||
|
||||
|
||||
// when building the "test" collection try to get the ip from
|
||||
// when building the "qatest123" collection try to get the ip from
|
||||
// "./test/ips.txt" so our injections are consistent every time
|
||||
// Test.cpp runs its injection loop into the "test" collection
|
||||
// Test.cpp runs its injection loop into the "qatest123" collection
|
||||
if ( useTestCache ) { // && m_useIpsTxtFile ) {
|
||||
// stolen from msgc.cpp:
|
||||
// if url is already in a.b.c.d format return that
|
||||
@ -12204,7 +12204,7 @@ long *XmlDoc::getIp ( ) {
|
||||
// this basically slows the spider down.
|
||||
long delay = cr->m_spiderDelayInMilliseconds;
|
||||
// ignore for testing
|
||||
if ( ! strcmp(cr->m_coll,"test") ) delay = 0;
|
||||
if ( ! strcmp(cr->m_coll,"qatest123") ) delay = 0;
|
||||
// injected?
|
||||
if ( m_sreqValid && m_sreq.m_isInjecting ) delay = 0;
|
||||
if ( m_sreqValid && m_sreq.m_isPageParser ) delay = 0;
|
||||
@ -12281,14 +12281,14 @@ long *XmlDoc::gotIp ( bool save ) {
|
||||
if ( ! cr ) return NULL;
|
||||
|
||||
bool useTestCache = false;
|
||||
if ( ! strcmp(cr->m_coll,"test") ) useTestCache = true;
|
||||
if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
|
||||
// unless its the pagesubmit.cpp event submission tool
|
||||
//if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false;
|
||||
|
||||
|
||||
// when building the "test" collection try to get the ip from
|
||||
// when building the "qatest123" collection try to get the ip from
|
||||
// "./test/ips.txt" so our injections are consistent every time
|
||||
// Test.cpp runs its injection loop into the "test" collection
|
||||
// Test.cpp runs its injection loop into the "qatest123" collection
|
||||
if ( save && useTestCache ) {
|
||||
// ip of 0 means NXDOMAIN i think (-1 means error)
|
||||
//if ( m_ip == 0 ) {
|
||||
@ -12592,8 +12592,8 @@ bool *XmlDoc::getIsAllowed ( ) {
|
||||
return &m_isAllowed;
|
||||
}
|
||||
|
||||
// or if using the "test" collection, assume yes!
|
||||
//if ( ! strcmp ( m_coll , "test" ) ) {
|
||||
// or if using the "qatest123" collection, assume yes!
|
||||
//if ( ! strcmp ( m_coll , "qatest123" ) ) {
|
||||
// m_isAllowed = true;
|
||||
// m_isAllowedValid = true;
|
||||
// return &m_isAllowed;
|
||||
@ -12939,8 +12939,8 @@ LinkInfo *XmlDoc::getLinkInfo1 ( ) {
|
||||
if ( ! m_calledMsg25 ) {
|
||||
// get this
|
||||
long lastUpdateTime = getTimeGlobal();
|
||||
// but be consistent if doing the "test" collection
|
||||
if ( ! strcmp(cr->m_coll,"test") ) {
|
||||
// but be consistent if doing the "qatest123" collection
|
||||
if ( ! strcmp(cr->m_coll,"qatest123") ) {
|
||||
//if ( ! m_spideredTimeValid ) {char *xx=NULL;*xx=0;}
|
||||
lastUpdateTime = getSpideredTime();//m_spideredTime;
|
||||
}
|
||||
@ -14184,7 +14184,7 @@ char **XmlDoc::getHttpReply ( ) {
|
||||
// come back up here if a redirect invalidates it
|
||||
loop:
|
||||
// sanity test -- only if not the test collection (NO, might be EBADIP)
|
||||
//if ( m_indexCode && strcmp(m_coll,"test") ) { char *xx=NULL;*xx=0; }
|
||||
//if ( m_indexCode && strcmp(m_coll,"qatest123")){char*xx=NULL;*xx=0;}
|
||||
// get the http reply
|
||||
char **replyPtr = getHttpReply2();
|
||||
if ( ! replyPtr || replyPtr == (void *)-1 ) return (char **)replyPtr;
|
||||
@ -14382,7 +14382,7 @@ char **XmlDoc::getHttpReply2 ( ) {
|
||||
// return gotHttpReply ( );
|
||||
|
||||
bool useTestCache = false;
|
||||
if ( ! strcmp(cr->m_coll,"test") ) useTestCache = true;
|
||||
if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
|
||||
// unless its the pagesubmit.cpp event submission tool
|
||||
//if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false;
|
||||
|
||||
@ -14474,11 +14474,12 @@ char **XmlDoc::getHttpReply2 ( ) {
|
||||
// turn off
|
||||
r->m_useCompressionProxy = false;
|
||||
r->m_compressReply = false;
|
||||
r->m_isCustomCrawl = cr->m_isCustomCrawl;
|
||||
|
||||
// set it for this too
|
||||
if ( g_conf.m_useCompressionProxy &&
|
||||
// do not use for the test collection ever, that is qa'ing
|
||||
strcmp(cr->m_coll,"test") ) {
|
||||
strcmp(cr->m_coll,"qatest123") ) {
|
||||
r->m_useCompressionProxy = true;
|
||||
r->m_compressReply = true;
|
||||
}
|
||||
@ -14539,7 +14540,7 @@ char **XmlDoc::getHttpReply2 ( ) {
|
||||
// . msg13 uses XmlDoc::getHttpReply() function to handle
|
||||
// redirects, etc.? no...
|
||||
bool isTestColl = false;
|
||||
if ( ! strcmp(cr->m_coll,"test") ) isTestColl = true;
|
||||
if ( ! strcmp(cr->m_coll,"qatest123") ) isTestColl = true;
|
||||
|
||||
// sanity check. keep injections fast. no downloading!
|
||||
if ( m_wasInjected ) {
|
||||
@ -14613,7 +14614,7 @@ char **XmlDoc::gotHttpReply ( ) {
|
||||
// . i.e. what are you doing downloading the page if there was
|
||||
// a problem with the page we already know about
|
||||
if ( m_indexCode && m_indexCodeValid &&
|
||||
strcmp(cr->m_coll,"test") ) { char *xx=NULL;*xx=0; }
|
||||
strcmp(cr->m_coll,"qatest123") ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// fix this
|
||||
if ( saved == EDOCUNCHANGED ) {
|
||||
@ -17207,6 +17208,8 @@ long *XmlDoc::getContentHashJson32 ( ) {
|
||||
if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
|
||||
continue;
|
||||
|
||||
char *topName = NULL;
|
||||
|
||||
// what name level are we?
|
||||
long numNames = 1;
|
||||
JsonItem *pi = ji->m_parent;
|
||||
@ -17214,6 +17217,7 @@ long *XmlDoc::getContentHashJson32 ( ) {
|
||||
// empty name?
|
||||
if ( ! pi->m_name ) continue;
|
||||
if ( ! pi->m_name[0] ) continue;
|
||||
topName = pi->m_name;
|
||||
numNames++;
|
||||
}
|
||||
|
||||
@ -17232,6 +17236,22 @@ long *XmlDoc::getContentHashJson32 ( ) {
|
||||
strcmp(ji->m_name,"resolved_url") == 0 )
|
||||
continue;
|
||||
|
||||
if ( topName && strcmp(topName,"stats") == 0 )
|
||||
continue;
|
||||
|
||||
if ( topName && strcmp(topName,"queryString") == 0 )
|
||||
continue;
|
||||
|
||||
if ( topName && strcmp(topName,"nextPages") == 0 )
|
||||
continue;
|
||||
|
||||
if ( topName && strcmp(topName,"textAnalysis") == 0 )
|
||||
continue;
|
||||
|
||||
if ( topName && strcmp(topName,"links") == 0 )
|
||||
continue;
|
||||
|
||||
|
||||
// hash the fully compound name
|
||||
long nameHash32 = 0;
|
||||
JsonItem *p = ji;
|
||||
@ -17607,7 +17627,7 @@ long **XmlDoc::getOutlinkFirstIpVector () {
|
||||
if ( ! cr ) return NULL;
|
||||
|
||||
// . go get it
|
||||
// . if coll is "test" then try to use the file ./test/ips.txt to
|
||||
// . if coll is "qatest123" then try to use the file ./test/ips.txt to
|
||||
// see if the ip is in there for the given url hostname
|
||||
// . this will now update Tagdb with the "firstip" tags if it should!!
|
||||
// . this just dns looks up the DOMAINS of each outlink because these
|
||||
@ -17747,7 +17767,7 @@ long *XmlDoc::getUrlFilterNum ( ) {
|
||||
|
||||
// . look it up
|
||||
// . use the old spidered date for "nowGlobal" so we can be consistent
|
||||
// for injecting into the "test" coll
|
||||
// for injecting into the "qatest123" coll
|
||||
long ufn = ::getUrlFilterNum ( oldsr,&fakeReply,spideredTime,false,
|
||||
m_niceness,cr,
|
||||
false, // isOutlink?
|
||||
@ -18754,7 +18774,7 @@ bool XmlDoc::doConsistencyTest ( bool forceTest ) {
|
||||
return true;
|
||||
|
||||
// if not test coll skip this
|
||||
//if ( strcmp(cr->m_coll,"test") ) return true;
|
||||
//if ( strcmp(cr->m_coll,"qatest123") ) return true;
|
||||
|
||||
// title rec is null if we are reindexing an old doc
|
||||
// and "unchanged" was true.
|
||||
@ -19200,7 +19220,7 @@ void XmlDoc::printMetaList ( char *p , char *pend , SafeBuf *sb ) {
|
||||
else if ( rdbId == RDB_TITLEDB ) {
|
||||
//XmlDoc tr;
|
||||
//SafeBuf tmp;
|
||||
//tr.set2 ( rec,recSize ,"test",&tmp,m_niceness);
|
||||
//tr.set2 ( rec,recSize ,"qatest123",&tmp,m_niceness);
|
||||
// print each offset and size for the variable crap
|
||||
sb->safePrintf("<td><nobr>titlerec datasize=%li "
|
||||
//"sizeofxmldoc=%li "
|
||||
@ -19273,7 +19293,7 @@ bool XmlDoc::verifyMetaList ( char *p , char *pend , bool forDelete ) {
|
||||
if ( ! cr ) return true;
|
||||
|
||||
// do not do this if not test collection for now
|
||||
if ( strcmp(cr->m_coll,"test") ) return true;
|
||||
if ( strcmp(cr->m_coll,"qatest123") ) return true;
|
||||
|
||||
// store each record in the list into the send buffers
|
||||
for ( ; p < pend ; ) {
|
||||
@ -22437,7 +22457,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
|
||||
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// . set other fields besides key
|
||||
// . crap! if we are the "test" collection then m_spideredTime
|
||||
// . crap! if we are the "qatest123" collection then m_spideredTime
|
||||
// was read from disk usually and is way in the past! watch out!!
|
||||
m_srep.m_spideredTime = getSpideredTime();//m_spideredTime;
|
||||
|
||||
@ -22447,7 +22467,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
|
||||
// crap, for the test coll this is often a very old time and it
|
||||
// causes the spider request to be repeatedly executed, so let's
|
||||
// fix that
|
||||
if ( ! strcmp(cr->m_coll,"test") )
|
||||
if ( ! strcmp(cr->m_coll,"qatest123") )
|
||||
m_srep.m_spideredTime = getTimeGlobal();
|
||||
|
||||
|
||||
@ -23031,7 +23051,7 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
|
||||
if ( ! cr ) return NULL;
|
||||
|
||||
// do not do this if not test collection for now
|
||||
bool isTestColl = (! strcmp(cr->m_coll,"test") );
|
||||
bool isTestColl = (! strcmp(cr->m_coll,"qatest123") );
|
||||
// turn off for now
|
||||
isTestColl = false;
|
||||
|
||||
@ -30297,6 +30317,9 @@ bool XmlDoc::hashNumber ( char *beginBuf ,
|
||||
// . this now allows for commas in numbers like "1,500.62"
|
||||
float f = atof2 ( p , bufEnd - p );
|
||||
|
||||
// debug
|
||||
//log("build: hashing %s %f",hi->m_prefix,f);
|
||||
|
||||
if ( ! hashNumber2 ( f , hi , "gbsortby" ) )
|
||||
return false;
|
||||
|
||||
@ -33687,7 +33710,7 @@ SafeBuf *XmlDoc::getNewTagBuf ( ) {
|
||||
long now = getTimeGlobal();
|
||||
// actually, use spider download time if we can. that way
|
||||
// Test.cpp's injection runs will be more consistent!
|
||||
if ( ! strcmp(cr->m_coll,"test") ) {
|
||||
if ( ! strcmp(cr->m_coll,"qatest123") ) {
|
||||
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
|
||||
now = getSpideredTime();//m_spideredTime;
|
||||
}
|
||||
|
@ -1,416 +0,0 @@
|
||||
# List of sites to spider, one per line. Gigablast uses the <a
|
||||
# href=/admin/filters#insitelist>insitelist</a> directive on the <a
|
||||
# href=/admin/filters>url filters</a> page to make sure that the spider only
|
||||
# indexes urls that match the site patterns you specify here, other than urls
|
||||
# you add individually via the add urls or inject url tools. See <a
|
||||
# href=#examples>example site list</a> below. Limit list to 300MB. If you have
|
||||
# a lot of INDIVIDUAL URLS to add then consider using the <a
|
||||
# href=/admin/addurl>addurl</a> interface.
|
||||
<siteList><![CDATA[]]></>
|
||||
|
||||
# All <, >, " and # characters that are values for a field contained herein
|
||||
# must be represented as <, >, " and # respectively.
|
||||
|
||||
# Controls just the spiders for this collection.
|
||||
<spideringEnabled>1</>
|
||||
|
||||
# What is the maximum number of web pages the spider is allowed to download
|
||||
# simultaneously PER HOST for THIS collection?
|
||||
<maxSpiders>100</>
|
||||
|
||||
# make each spider wait this many milliseconds before getting the ip and
|
||||
# downloading the page.
|
||||
<spiderDelayInMilliseconds>0</>
|
||||
|
||||
# If this is true Gigablast will respect the robots.txt convention.
|
||||
<useRobotstxt>1</>
|
||||
|
||||
# How many second to cache a robots.txt file for. 86400 is 1 day. 0 means
|
||||
# Gigablast will not read from the cache at all and will download the
|
||||
# robots.txt before every page if robots.txt use is enabled above. However, if
|
||||
# this is 0 then Gigablast will still store robots.txt files into the cache.
|
||||
<maxRobotstxtCacheAge>86400</>
|
||||
|
||||
# Do a tight merge on posdb and titledb at this time every day. This is
|
||||
# expressed in MINUTES past midnight UTC. UTC is 5 hours ahead of EST and 7
|
||||
# hours ahead of MST. Leave this as -1 to NOT perform a daily merge. To merge
|
||||
# at midnight EST use 60*5=300 and midnight MST use 60*7=420.
|
||||
<dailyMergeTime>-1</>
|
||||
|
||||
# Comma separated list of days to merge on. Use 0 for Sunday, 1 for Monday,
|
||||
# ... 6 for Saturday. Leaving this parmaeter empty or without any numbers will
|
||||
# make the daily merge happen every day
|
||||
<dailyMergeDays><![CDATA[0]]></>
|
||||
|
||||
# When the daily merge was last kicked off. Expressed in UTC in seconds since
|
||||
# the epoch.
|
||||
<dailyMergeLastStarted>-1</>
|
||||
|
||||
# If this is true, users will have to pass a simple Turing test to add a url.
|
||||
# This prevents automated url submission.
|
||||
<turingTestEnabled>0</>
|
||||
|
||||
# Maximum number of urls that can be submitted via the addurl interface, per
|
||||
# IP domain, per 24 hour period. A value less than or equal to zero implies no
|
||||
# limit.
|
||||
<maxAddUrls>0</>
|
||||
|
||||
# When the spider round started
|
||||
<spiderRoundStartTime>0</>
|
||||
|
||||
# The spider round number.
|
||||
<spiderRoundNum>0</>
|
||||
|
||||
# When enabled, the spider will discard web pages which are identical to other
|
||||
# web pages that are already in the index. However, root urls, urls that have
|
||||
# no path, are never discarded. It most likely has to hit disk to do these
|
||||
# checks so it does cause some slow down. Only use it if you need it.
|
||||
<dedupingEnabled>0</>
|
||||
|
||||
# When enabled, the spider will discard web pages which, when a www is
|
||||
# prepended to the page's url, result in a url already in the index.
|
||||
<dedupingEnabledForWww>1</>
|
||||
|
||||
# Detect and do not index pages which have a 200 status code, but are likely
|
||||
# to be error pages.
|
||||
<detectCustomErrorPages>1</>
|
||||
|
||||
# Should pages be removed from the index if they are no longer accessible on
|
||||
# the web?
|
||||
<delete404s>1</>
|
||||
|
||||
# If this is true, the spider, when a url redirects to a "simpler" url, will
|
||||
# add that simpler url into the spider queue and abandon the spidering of the
|
||||
# current url.
|
||||
<useSimplifiedRedirects>1</>
|
||||
|
||||
# If this is true, the spider, when updating a web page that is already in the
|
||||
# index, will not even download the whole page if it hasn't been updated since
|
||||
# the last time Gigablast spidered it. This is primarily a bandwidth saving
|
||||
# feature. It relies on the remote webserver's returned Last-Modified-Since
|
||||
# field being accurate.
|
||||
<useIfModifiedSince>0</>
|
||||
|
||||
# If this is true, do not allow spammy inlinks to vote. This check is too
|
||||
# aggressive for some collections, i.e. it does not allow pages with cgi in
|
||||
# their urls to vote.
|
||||
<doLinkSpamChecking>1</>
|
||||
|
||||
# If this is true Gigablast will only allow one vote per the top 2 significant
|
||||
# bytes of the IP address. Otherwise, multiple pages from the same top IP can
|
||||
# contribute to the link text and link-based quality ratings of a particular
|
||||
# URL. Furthermore, no votes will be accepted from IPs that have the same top
|
||||
# 2 significant bytes as the IP of the page being indexed.
|
||||
<restrictLinkVotingByIp>1</>
|
||||
|
||||
# How often should Gigablast recompute the link info for a url. Also applies
|
||||
# to getting the quality of a site or root url, which is based on the link
|
||||
# info. In days. Can use decimals. 0 means to update the link info every time
|
||||
# the url's content is re-indexed. If the content is not reindexed because it
|
||||
# is unchanged then the link info will not be updated. When getting the link
|
||||
# info or quality of the root url from an external cluster, Gigablast will
|
||||
# tell the external cluster to recompute it if its age is this or higher.
|
||||
<updateLinkInfoFrequency>60.000000</>
|
||||
|
||||
# If this is eabled the spider will not allow any docs which are determined to
|
||||
# be serps.
|
||||
<doSerpDetection>1</>
|
||||
|
||||
# If this is false then the filter will not be used on html or text pages.
|
||||
<applyFilterToTextPages>0</>
|
||||
|
||||
# Program to spawn to filter all HTTP replies the spider receives. Leave blank
|
||||
# for none.
|
||||
<filterName><![CDATA[]]></>
|
||||
|
||||
# Kill filter shell after this many seconds. Assume it stalled permanently.
|
||||
<filterTimeout>40</>
|
||||
|
||||
# Retrieve pages from the proxy at this IP address.
|
||||
<proxyIp>0.0.0.0</>
|
||||
|
||||
# Retrieve pages from the proxy on this port.
|
||||
<proxyPort>0</>
|
||||
|
||||
# Index the body of the documents so you can search it. Required for searching
|
||||
# that. You wil pretty much always want to keep this enabled.
|
||||
<indexBody>1</>
|
||||
|
||||
# Send every spidered url to this diffbot.com by appending a &url=<url> to it
|
||||
# before trinyg to downloading it. We expect get get back a JSON reply which
|
||||
# we index. You will need to supply your token to this as well.
|
||||
<diffbotApiUrl><![CDATA[]]></>
|
||||
|
||||
# Get scoring information for each result so you can see how each result is
|
||||
# scored? You must explicitly request this using &scores=1 for the XML feed
|
||||
# because it is not included by default.
|
||||
<getDocidScoringInfo>1</>
|
||||
|
||||
# Query expansion will include word stems and synonyms in its search results.
|
||||
<doQueryExpansion>1</>
|
||||
|
||||
# What is the limit to the total number of returned search results.
|
||||
<maxSearchResults>1000</>
|
||||
|
||||
# What is the limit to the total number of returned search results per query?
|
||||
<maxSearchResultsPerQuery>100</>
|
||||
|
||||
# What is the maximum number of characters allowed in titles displayed in the
|
||||
# search results?
|
||||
<maxTitleLen>80</>
|
||||
|
||||
# Should search results be site clustered by default?
|
||||
<siteClusterByDefault>1</>
|
||||
|
||||
# Hide all clustered results instead of displaying two results from each site.
|
||||
<hideAllClusteredResults>0</>
|
||||
|
||||
# Should duplicate search results be removed by default?
|
||||
<dedupResultsByDefault>1</>
|
||||
|
||||
# Should we dedup URLs with case insensitivity? This is mainly to correct
|
||||
# duplicate wiki pages.
|
||||
<dedupURLs>0</>
|
||||
|
||||
# If document summary is this percent similar to a document summary above it,
|
||||
# then remove it from the search results. 100 means only to remove if exactly
|
||||
# the same. 0 means no summary deduping.
|
||||
<percentSimilarDedupSummary>90</>
|
||||
|
||||
# Sets the number of lines to generate for summary deduping. This is to help
|
||||
# the deduping process not thorw out valid summaries when normally displayed
|
||||
# summaries are smaller values. Requires percent similar dedup summary to be
|
||||
# enabled.
|
||||
<numberOfLinesToUseInSummaryToDedup>4</>
|
||||
|
||||
# Default language to use for ranking results. Value should be any language
|
||||
# abbreviation, for example "en" for English.
|
||||
<sortLanguagePreference><![CDATA[en]]></>
|
||||
|
||||
# Default country to use for ranking results. Value should be any country code
|
||||
# abbreviation, for example "us" for United States.
|
||||
<sortCountryPreference><![CDATA[us]]></>
|
||||
|
||||
# What is the maximum number of characters displayed in a summary for a search
|
||||
# result?
|
||||
<maxSummaryLen>512</>
|
||||
|
||||
# What is the maximum number of excerpts displayed in the summary of a search
|
||||
# result?
|
||||
<maxSummaryExcerpts>4</>
|
||||
|
||||
# What is the maximum number of characters allowed per summary excerpt?
|
||||
<maxSummaryExcerptLength>300</>
|
||||
|
||||
# What is the default number of summary excerpts displayed per search result?
|
||||
<defaultNumberOfSummaryExcerpts>3</>
|
||||
|
||||
# <br> tags are inserted to keep the number of chars in the summary per line
|
||||
# at or below this width. Strings without spaces that exceed this width are
|
||||
# not split.
|
||||
<maxSummaryLineWidth>80</>
|
||||
|
||||
# Truncating this will miss out on good summaries, but performance will
|
||||
# increase.
|
||||
<bytesOfDocToScanForSummaryGeneration>70000</>
|
||||
|
||||
# Front html tag used for highlightig query terms in the summaries displated
|
||||
# in the search results.
|
||||
<frontHighlightTag><![CDATA[<b style="color:black;background-color:#ffff66">]]></>
|
||||
|
||||
# Front html tag used for highlightig query terms in the summaries displated
|
||||
# in the search results.
|
||||
<backHighlightTag><![CDATA[</b>]]></>
|
||||
|
||||
# How many search results should we scan for related topics (gigabits) per
# query?
<docsToScanForTopics>300</>

# Should Gigablast only get one document per IP domain and per domain for
# topic (gigabit) generation?
<ipRestrictionForTopics>0</>

# Should Gigablast remove overlapping topics (gigabits)?
<removeOverlappingTopics>1</>

# What is the number of related topics (gigabits) displayed per query? Set to
# 0 to save CPU time.
<numberOfRelatedTopics>11</>

# Related topics (gigabits) with scores below this will be excluded. Scores
# range from 0% to over 100%.
<minTopicsScore>5</>

# How many documents must contain the topic (gigabit) for it to be displayed.
<minTopicDocCount>2</>

# If a document is this percent similar to another document with a higher
# score, then it will not contribute to the topic (gigabit) generation.
<dedupDocPercentForTopics>80</>

# Maximum number of words a topic (gigabit) can have. Affects raw feeds, too.
<maxWordsPerTopic>6</>

# Max chars to sample from each doc for topics (gigabits).
<topicMaxSampleSize>4096</>

# If enabled, results in dmoz will display their categories on the results
# page.
<displayDmozCategoriesInResults>1</>

# If enabled, results in dmoz will display their indirect categories on the
# results page.
<displayIndirectDmozCategoriesInResults>0</>

# If enabled, a link will appear next to each category on each result allowing
# the user to perform their query on that entire category.
<displaySearchCategoryLinkToQueryCategoryOfResult>0</>

# Yes to use the DMOZ-given title when a page is untitled but is in DMOZ.
<useDmozForUntitled>1</>

# Yes to always show DMOZ summaries with search results that are in DMOZ.
<showDmozSummaries>1</>

# Yes to display the Adult category in the Top category.
<showAdultCategoryOnTop>0</>

# Before downloading the contents of a URL, Gigablast first chains down this
# list of expressions, starting with expression #0. The first expression
# it matches is the ONE AND ONLY matching row for that URL. It then uses the
# respider frequency, spider priority, etc. on the MATCHING ROW when spidering
# that URL. If you specify the <i>expression</i> as <i><b>default</b></i> then
# that MATCHES ALL URLs. URLs with high spider priorities take spidering
# precedence over URLs with lower spider priorities. The respider frequency
# dictates how often a URL will be respidered. See the help table below for
# examples of all the supported expressions. Use the <i>&&</i> operator to
# string multiple expressions together in the same expression text box. A
# <i>spider priority</i> of <i>DELETE</i> will cause the URL to not be
# spidered, or if it has already been indexed, it will be deleted when it is
# respidered.<br><br>
<filterExpression><![CDATA[isdocidbased]]></>
<filterExpression><![CDATA[ismedia]]></>
<filterExpression><![CDATA[errorcount>=3 && hastmperror]]></>
<filterExpression><![CDATA[errorcount>=1 && hastmperror]]></>
<filterExpression><![CDATA[isaddurl]]></>
<filterExpression><![CDATA[hopcount==0 && iswww && isnew]]></>
<filterExpression><![CDATA[hopcount==0 && iswww]]></>
<filterExpression><![CDATA[hopcount==0 && isnew]]></>
<filterExpression><![CDATA[hopcount==0]]></>
<filterExpression><![CDATA[hopcount==1 && isnew]]></>
<filterExpression><![CDATA[hopcount==1]]></>
<filterExpression><![CDATA[hopcount==2 && isnew]]></>
<filterExpression><![CDATA[hopcount==2]]></>
<filterExpression><![CDATA[hopcount>=3 && isnew]]></>
<filterExpression><![CDATA[hopcount>=3]]></>
<filterExpression><![CDATA[isnew]]></>
<filterExpression><![CDATA[default]]></>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>1.000000</>
<filterFrequency>1.000000</>
<filterFrequency>1.000000</>
<filterFrequency>7.000000</>
<filterFrequency>7.000000</>
<filterFrequency>7.000000</>
<filterFrequency>10.000000</>
<filterFrequency>20.000000</>
<filterFrequency>20.000000</>
<filterFrequency>40.000000</>
<filterFrequency>40.000000</>
<filterFrequency>60.000000</>
<filterFrequency>60.000000</>
<filterFrequency>30.000000</>
<filterFrequency>30.000000</>

# Do not allow more than this many outstanding spiders for all urls in this
# priority.
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>4</>
<maxSpidersPerRule>2</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>2</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>

# Allow this many spiders per IP.
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>

# Wait at least this long before downloading urls from the same IP address.
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<filterPriority>80</>
<filterPriority>-3</>
<filterPriority>3</>
<filterPriority>45</>
<filterPriority>85</>
<filterPriority>50</>
<filterPriority>48</>
<filterPriority>49</>
<filterPriority>47</>
<filterPriority>40</>
<filterPriority>39</>
<filterPriority>30</>
<filterPriority>29</>
<filterPriority>20</>
<filterPriority>19</>
<filterPriority>1</>
<filterPriority>0</>
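
The blocks above are parallel arrays: row N's expression goes with row N's harvestLinks, respider frequency (apparently in days, judging by the values), max spiders, IP wait (ms) and priority. As a sketch of the first-match-wins chaining the description above talks about (simplified, hypothetical predicates; not Spider.cpp's real evaluator):

// urlfilter_sketch.cpp -- illustrative only, not Spider.cpp.
#include <iostream>
#include <vector>

// A tiny stand-in for the url state the real expressions test.
struct UrlInfo {
	long hopCount;
	bool isNew;
	bool isWWW;
	bool isMedia;
};

struct FilterRow {
	// here the "expression" is a plain function for brevity;
	// gb parses strings like "hopcount==0 && iswww && isnew"
	bool (*matches) ( const UrlInfo & );
	double respiderDays;
	long   priority;
};

static bool isMediaRow ( const UrlInfo &u ) { return u.isMedia; }
static bool wwwNewRow  ( const UrlInfo &u ) { return u.hopCount==0 && u.isWWW && u.isNew; }
static bool defaultRow ( const UrlInfo &u ) { return true; }

// chain down the rows; the FIRST matching row decides everything
static const FilterRow *getMatchingRow ( const std::vector<FilterRow> &rows ,
                                         const UrlInfo &u ) {
	for ( size_t i = 0 ; i < rows.size() ; i++ )
		if ( rows[i].matches ( u ) ) return &rows[i];
	return 0; // only possible if no "default" row exists
}

int main ( ) {
	// negative priority: special handling, as in the table above
	FilterRow r0 = { isMediaRow ,  0.0 , -3 };
	FilterRow r1 = { wwwNewRow  ,  7.0 , 50 };
	FilterRow r2 = { defaultRow , 30.0 ,  1 };
	std::vector<FilterRow> rows;
	rows.push_back ( r0 );
	rows.push_back ( r1 );
	rows.push_back ( r2 );

	UrlInfo u = { 0 , true , true , false };
	const FilterRow *row = getMatchingRow ( rows , u );
	std::cout << "priority " << row->priority
	          << " respider every " << row->respiderDays << " days\n";
	return 0;
}
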
@@ -104,7 +104,7 @@ void timeWrapper ( int fd , void *state ) {
// bail if too many launched
if ( s_count >= s_max ) return;
// new state
StateT *st = (StateT *)mmalloc ( sizeof(StateT) , "test" );
StateT *st = (StateT *)mmalloc ( sizeof(StateT) , "dnstest" );
// get url from stdin into buf
char *p = st->m_buf;
if ( ! fgets ( p , 1023 , stdin ) ) exit ( 0 );
@@ -147,6 +147,6 @@ void dnsWrapper ( void *state , long ip ) {
st->m_buf , iptoa(ip) , mstrerror(g_errno));
//if ( g_errno == ETRYAGAIN )
// log("hey");
mfree ( st , sizeof(StateT), "test" );
mfree ( st , sizeof(StateT), "dnstest" );
s_count--;
}

@@ -127,11 +127,14 @@ a{cursor:hand;cursor:pointer;text-decoration:none;color:blue;}
<td style="padding-bottom:12px"> </td>
<td style="padding-bottom:12px"> </td>
</tr>
<!--
<tr bgcolor="#006699">
<th><a name="boolean" id="boolean"></a><font color="#FFFFFF">Boolean Search</font></th>
<th><font color="#FFFFFF">Description</font></th>

<tr bgcolor="#0340fd">

<th><font color=33dcff>Boolean Search</font></th>
<th><font color=33dcff>Description</font></th>

</tr>

<tr>
<td colspan="2" bgcolor="#FFFFCC"><center>
Note: boolean operators must be in UPPER CASE.
@@ -214,16 +217,17 @@ a{cursor:hand;cursor:pointer;text-decoration:none;color:blue;}
expressions and can be optionally enclosed in parentheses. A NOT
operator can optionally precede the left or the right operand.</td>
</tr>
-->

</table>



</td></tr>
</table>
<br>

<center>
Copyright © 2013. All rights reserved.
Copyright © 2014. All rights reserved.
</center>
</body>
</html>

12
main.cpp
@@ -5680,7 +5680,7 @@ void zlibtest() {
// malloc 1,000 bufs of size about 100-64k each
for ( long i = 0 ; i < 100 ; i++ ) {
long bufSize = 1000 + (rand() % 65000);
ptrs[i] = (char *)mmalloc ( bufSize , "test" );
ptrs[i] = (char *)mmalloc ( bufSize , "ztest" );
if ( ! ptrs[i] ) {
log("no mem!"); exit(-1); }
lens[i] = bufSize;
@@ -5690,7 +5690,7 @@ void zlibtest() {
}
// now free them
for ( long i = 0 ; i < 100 ; i++ )
mfree (ptrs[i] , lens[i] , "test" );
mfree (ptrs[i] , lens[i] , "ztest" );
}
}
*/
@@ -11555,8 +11555,8 @@ bool parseTest ( char *coll , long long docId , char *query ) {
// speed test
t = gettimeofdayInMilliseconds();
for ( long k = 0 ; k < 100 ; k++ ) {
char *mm = (char *)mmalloc ( 300*1024 , "test");
mfree ( mm , 300*1024 ,"test");
char *mm = (char *)mmalloc ( 300*1024 , "ztest");
mfree ( mm , 300*1024 ,"ztest");
}
e = gettimeofdayInMilliseconds();
logf(LOG_DEBUG,"build: Took %.3f ms to do mallocs.",
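
These hunks simply rename the labels passed to mmalloc()/mfree() from the generic "test" to caller-specific names ("ztest" here, "dnstest" and "cachetest" elsewhere in this commit), so per-label memory accounting points at the right caller. A standalone sketch of that kind of label-tagged tracking (not gb's actual Mem.cpp, just the idea):

// memlabel_sketch.cpp -- illustrative only.
#include <cstdio>
#include <cstdlib>
#include <map>
#include <string>

static std::map<std::string,long> s_bytesByLabel;

// allocate and charge the bytes to a human-readable label
static void *labelledMalloc ( size_t size , const char *label ) {
	void *p = malloc ( size );
	if ( p ) s_bytesByLabel[label] += (long)size;
	return p;
}

static void labelledFree ( void *p , size_t size , const char *label ) {
	s_bytesByLabel[label] -= (long)size;
	free ( p );
}

int main ( ) {
	void *a = labelledMalloc ( 1024 , "ztest"     );
	void *b = labelledMalloc ( 2048 , "cachetest" );
	labelledFree ( a , 1024 , "ztest" );
	// anything still charged to a label points at the leaking caller
	for ( std::map<std::string,long>::iterator it = s_bytesByLabel.begin() ;
	      it != s_bytesByLabel.end() ; ++it )
		printf ( "%-10s %ld bytes outstanding\n" ,
		         it->first.c_str() , it->second );
	labelledFree ( b , 2048 , "cachetest" );
	return 0;
}
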
@@ -14833,7 +14833,7 @@ bool cacheTest() {
false , // support lists of recs?
maxCacheNodes ,
false , // use half keys?
"test" , // dbname
"cachetest" , // dbname
false )) // save cache to disk?
return log("test: Cache init failed.");

@@ -14906,7 +14906,7 @@ bool cacheTest() {
false , // support lists of recs?
maxCacheNodes ,
false , // use half keys?
"test" , // dbname
"cachetest" , // dbname
false )) // save cache to disk?
return log("test: Cache init failed.");

@@ -233,7 +233,6 @@ long g_qn = 0;

char *g_queries[] = {
//"buzzlogic",
//"test",
"broncos",
"ibm",
"yahoo",
446
qa.cpp
Normal file
@@ -0,0 +1,446 @@
#include <string.h>
#include "SafeBuf.h"
#include "HttpServer.h"

static long s_failures = 0;

bool getUrl( char *path , void (* callback) (void *state, TcpSocket *sock) ) {
SafeBuf sb;
sb.safePrintf ( "http://%s:%li%s"
, iptoa(g_hostdb.m_myHost->m_ip)
, (long)g_hostdb.m_myHost->m_port
, path
);
Url u;
u.set ( sb.getBufStart() );
if ( ! g_httpServer.getDoc ( u.getUrl() ,
0 , // ip
0 , // offset
-1 , // size
0 , // ifmodsince
NULL ,
callback ,
60*1000, // timeout
0, // proxyip
0, // proxyport
-1, // maxtextdoclen
-1, // maxotherdoclen
NULL ) ) // useragent
return false;
// error?
log("qa: getUrl error: %s",mstrerror(g_errno));
return true;
}

bool qatest ( ) ;

void qatestWrapper ( void *state , TcpSocket *sock ) { qatest(); }

// return false if blocked, true otherwise
bool addColl ( ) {
static bool s_flag = false;
if ( s_flag ) return true;
s_flag = true;
return getUrl ( "/admin/addcoll?c=qatest123" , qatestWrapper );
}


// first inject a set list of urls
static char **s_urlPtrs = NULL;
static long s_numUrls = 0;
static SafeBuf s_ubuf1;
static SafeBuf s_ubuf2;


bool loadUrls ( ) {
static bool s_loaded = false;
if ( s_loaded ) return true;
// only load once
s_loaded = true;
// use injectme3 file
s_ubuf1.load("./injectme3");
// scan for +++URL: xxxxx
char *s = s_ubuf1.getBufStart();
for ( ; *s ; s++ ) {
if ( strncmp(s,"+++URL: ",8) ) continue;
// got one
// find end of it
s += 8;
char *e = s;
for ( ; *e && ! is_wspace_a(*e); e++ );
// null term it
if ( *e ) *e = '\0';
// store ptr
s_ubuf2.pushLong((long)s);
// skip past that
s = e;
}
// make array of url ptrs
s_urlPtrs = (char **)s_ubuf2.getBufStart();
// set the count too, otherwise s_numUrls stays 0 and injectUrls()
// and deleteUrls() have nothing to loop over
s_numUrls = s_ubuf2.length() / (long)sizeof(char *);
return true;
}

bool injectUrls ( ) {
loadUrls();
static long s_ii = 0;
for ( ; s_ii < s_numUrls ; ) {
// pre-inc it
s_ii++;
// inject using html api
SafeBuf sb;
sb.safePrintf("/admin/inject?c=qatest123&delete=0&u=");
// use s_ii-1 since we pre-incremented above
sb.urlEncode ( s_urlPtrs[s_ii-1] );
return getUrl ( sb.getBufStart() , qatestWrapper );
}
return true;
}

static char *s_queries[] = {
"the",
"+the",
"cats",
"+cats dog",
"+cats +dog",
"cat OR dog",
"cat AND dog",
"cat AND NOT dog",
"NOT cat AND NOT dog",
"cat -dog",
"site:wisc.edu"
};

static long s_checksums[] = {
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
};

static long s_qi1 = 0;

void doneSearching1 ( void *state , TcpSocket *sock ) {
//loadQueries1();
long ii = s_qi1 - 1;
// get checksum of it
HttpMime hm;
hm.set ( sock->m_readBuf , sock->m_readOffset , NULL );
char *page = sock->m_readBuf + hm.getMimeLen() ;
// we will need to ignore fields like the latency etc.
// perhaps pass that in as a cgi parm. &qa=1
long crc = hash32n ( page );
if ( crc != s_checksums[ii] ) {
log("qatest: query '%s' checksum %lu != %lu",
s_queries[ii],
s_checksums[ii],
crc);
s_failures++;
}
// resume the qa loop
qatest();
}


// ensure search results are consistent
bool searchTest1 () {
long nq = sizeof(s_queries)/sizeof(char *);
for ( ; s_qi1 < nq ; ) {
// pre-inc it
s_qi1++;
// search using html api
SafeBuf sb;
// qa=1 tells gb to exclude "variable" or "random" things
// from the serps so we can checksum it consistently
sb.safePrintf ( "/search?c=qatest123&qa=1&q=" );
// use s_qi1-1 since we pre-incremented above
sb.urlEncode ( s_queries[s_qi1-1] );
return getUrl ( sb.getBufStart() , doneSearching1 );
}
return true;
}

static long s_qi2 = 0;

void doneSearching2 ( void *state , TcpSocket *sock ) {
//loadQueries1();
long ii = s_qi2 - 1;
// get checksum of it
HttpMime hm;
hm.set ( sock->m_readBuf , sock->m_readOffset , NULL );
char *page = sock->m_readBuf + hm.getMimeLen() ;
// we will need to ignore fields like the latency etc.
// perhaps pass that in as a cgi parm. &qa=1
long crc = hash32n ( page );
if ( crc != s_checksums[ii] ) {
log("qatest: query '%s' checksum %lu != %lu",
s_queries[ii],
s_checksums[ii],
crc);
s_failures++;
}
// resume the qa loop
qatest();
}


// ensure search results are consistent
bool searchTest2 () {
long nq = sizeof(s_queries)/sizeof(char *);
for ( ; s_qi2 < nq ; ) {
// pre-inc it
s_qi2++;
// search using html api
SafeBuf sb;
// qa=1 tells gb to exclude "variable" or "random" things
// from the serps so we can checksum it consistently
sb.safePrintf ( "/search?c=qatest123&qa=1&q=" );
// use s_qi2-1 since we pre-incremented above
sb.urlEncode ( s_queries[s_qi2-1] );
return getUrl ( sb.getBufStart() , doneSearching2 );
}
return true;
}

bool deleteUrls ( ) {
static long s_ii2 = 0;
for ( ; s_ii2 < s_numUrls ; ) {
// pre-inc it
s_ii2++;
// reject using html api
SafeBuf sb;
sb.safePrintf( "/admin/inject?c=qatest123&delete=1&u=");
// use s_ii2-1 since we pre-incremented above
sb.urlEncode ( s_urlPtrs[s_ii2-1] );
return getUrl ( sb.getBufStart() , qatestWrapper );
}
return true;
}

#include "Msg0.h"
|
||||
static Msg0 s_msg0;
|
||||
static RdbList s_list;
|
||||
|
||||
void gotList33 ( void *state ) {
|
||||
long *rdbId = (long *)state;
|
||||
if ( ! s_list.isEmpty() ) {
|
||||
log("qa: delete failed. list is not empty rdbid=%li.",*rdbId);
|
||||
s_failures++;
|
||||
}
|
||||
// resume main loop
|
||||
qatest();
|
||||
}
|
||||
|
||||
// scan all Rdb databases and ensure no recs (it was a clean delete)
|
||||
bool checkRdbLists ( long *rdbId ) {
|
||||
CollectionRec *cr = g_collectiondb.getRec("qatest123");
|
||||
if ( ! cr ) return true;
|
||||
collnum_t cn = cr->m_collnum;
|
||||
for ( ; *rdbId < RDB_END ; ) {
|
||||
// pre-inc it
|
||||
*rdbId = *rdbId + 1;
|
||||
char minKey[MAX_KEY_BYTES];
|
||||
char maxKey[MAX_KEY_BYTES];
|
||||
KEYMIN(minKey,MAX_KEY_BYTES);
|
||||
KEYMAX(maxKey,MAX_KEY_BYTES);
|
||||
if ( ! s_msg0.getList ( 0 , // hostid
|
||||
0 , // ip
|
||||
0 , // port
|
||||
0 , // cacheage
|
||||
false, // addtocache
|
||||
*rdbId , // rdbid
|
||||
cn , // collnum
|
||||
&s_list ,
|
||||
minKey ,
|
||||
maxKey ,
|
||||
1000 , // minrecsizes
|
||||
rdbId , // state
|
||||
gotList33,
|
||||
0 // niceness
|
||||
) )
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// once we have triggered the dump this will cause all rdbs to tightmerge
void doneDumping ( void *state , TcpSocket *sock ) {
CollectionRec *cr = g_collectiondb.getRec("qatest123");
if ( ! cr ) { qatest(); return; }
// tight merge the rdb that was dumped
for ( long i = 0 ; i < RDB_END ; i++ ) {
Rdb *rdb = getRdbFromId ( i );
if ( ! rdb ) continue;
RdbBase *base = rdb->getBase ( cr->m_collnum );
if ( ! base ) continue;
// . force a tight merge as soon as dump completes
// . the dump should already be going
base->m_nextMergeForced = true;
}
// wait for tight merges to complete now
qatest();
}

bool dumpTreesToDisk () {
static bool s_done = false;
if ( s_done ) return true;
s_done = true;
// force dump data to disk. dumps all rdbs.
return getUrl("/admin/master?dump=1",doneDumping );
}

void doneAddingUrls ( void *state ) {
qatest();
}

void sleepCallback ( int fd , void *state ) {
qatest();
}

// check every second to see if merges are done
bool waitForMergeToFinish ( ) {
// if registered
static bool s_registered = false;
if ( s_registered ) {
g_loop.unregisterSleepCallback ( NULL , sleepCallback );
s_registered = false;
}
CollectionRec *cr = g_collectiondb.getRec("qatest123");
if ( ! cr ) { qatest(); return true; }
// tight merge the rdb that was dumped
long i; for ( i = 0 ; i < RDB_END ; i++ ) {
Rdb *rdb = getRdbFromId ( i );
if ( ! rdb ) continue;
RdbBase *base = rdb->getBase ( cr->m_collnum );
if ( ! base ) continue;
// . force a tight merge as soon as dump completes
// . the dump should already be going
if ( base->m_nextMergeForced ) return false;
// still waiting on this merge
break;
}
// if not still waiting return true
if ( i >= RDB_END ) return true;
// sleep for 1 second
g_loop.registerSleepCallback ( 1000 , // 1000 ms
NULL , // state
sleepCallback ,
0 ); // niceness
s_registered = true;
return false;
}

bool resetColl ( ) {
static bool s_flag = false;
if ( s_flag ) return true;
s_flag = true;
// also turn spiders on
return getUrl("/admin/master?reset=qatest123&se=1", qatestWrapper );
}

bool addUrlTest ( ) {
static bool s_flag = false;
if ( s_flag ) return true;
s_flag = true;
return getUrl ( "/admin/addurl"
"?c=qatest123&u=www.dmoz.org+www.ibm.com+"
"www.diffbot.com"
, qatestWrapper );
}

// check every second to see if spidering phase is completed
bool checkSpidersDone ( ) {
// if registered
static bool s_registered = false;
if ( s_registered ) {
g_loop.unregisterSleepCallback ( NULL , sleepCallback );
s_registered = false;
}
// we have to adjust this once we know how many pages we'll archive
CollectionRec *cr = g_collectiondb.getRec("qatest123");
if ( ! cr ) { qatest(); return true; }
// return true if all done
if ( cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound >= 200 )
return true;
// sleep for 1 second
g_loop.registerSleepCallback ( 1000 , // 1000 ms
NULL , // state
sleepCallback ,
0 ); // niceness
s_registered = true;
return false;
}

bool delColl ( ) {
static bool s_flag = false;
if ( s_flag ) return true;
s_flag = true;
return getUrl ( "/admin/delcoll?c=qatest123" , qatestWrapper );
}


static long s_rdbId1 = 0;
static long s_rdbId2 = 0;
//static long s_rdbId3 = 0;

// . run a series of tests to ensure that gb is functioning properly
// . use the s_urlPtrs[] array of urls for injecting and spider seeding
// . contain an archive copy of all webpages in the injectme3 file and
// in pagearchive1.txt file
// . while initially spidering store pages in pagearchive1.txt so we can
// replay later. store up to 100,000 pages in there.
bool qatest ( ) {

// add the 'qatest123' collection
if ( ! addColl () ) return false;

// inject urls, return false if not done yet
if ( ! injectUrls ( ) ) return false;

// test search results
if ( ! searchTest1 () ) return false;

// delete all urls cleanly now
if ( ! deleteUrls ( ) ) return false;

// now get rdblist for every rdb for this coll and make sure all zero!
if ( ! checkRdbLists ( &s_rdbId1 ) ) return false;

// dump, tight merge and ensure no data in our rdbs for this coll
if ( ! dumpTreesToDisk() ) return false;

// wait for tight merge to complete
if ( ! waitForMergeToFinish() ) return false;

// now get rdblist for every rdb for this coll and make sure all zero!
if ( ! checkRdbLists ( &s_rdbId2 ) ) return false;

// reset the collection so we can test spidering
if ( ! resetColl ( ) ) return false;

// add urls to seed spider with. make msg13.cpp recognize qatest123
// collection and return 404 on urls not in our official list so
// we can ensure search result consistency. msg13.cpp will initially
// store the pages in a file, like the first 1,000 or so pages.
if ( ! addUrlTest () ) return false;

// wait for spidering to complete. sleep callback. # of spidered urls
// will be x, so we know when to stop
if ( ! checkSpidersDone() ) return false;

// . now search again on the large collection most likely
// . store search queries and checksum into queries2.txt
// . a 0 (or no) checksum means we should fill it in
if ( ! searchTest2 () ) return false;

// try a query delete
//if ( ! queryDeleteTest() ) return false;

// ensure empty
//if ( ! checkRdbLists ( &s_rdbId3 ) ) return false;

// delete the collection
if ( ! delColl() ) return false;

return true;
}
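
The doneSearching callbacks above hash the whole result page with hash32n() and compare it against a stored baseline; the &qa=1 flag is meant to keep volatile fields (timings and the like) out of that page first. A standalone sketch of the normalize-then-hash idea (hypothetical helper names and a stand-in hash, not part of this commit):

// qa_checksum_sketch.cpp -- illustrative only.
#include <cstdio>
#include <string>

// Drop lines that change run to run (timings, dates) before hashing,
// so a byte-identical result set always produces the same checksum.
static std::string stripVolatile ( const std::string &page ) {
	std::string out;
	size_t start = 0;
	while ( start < page.size() ) {
		size_t end = page.find ( '\n' , start );
		if ( end == std::string::npos ) end = page.size();
		std::string line = page.substr ( start , end - start );
		if ( line.find ( "took " ) == std::string::npos &&
		     line.find ( "ms)"   ) == std::string::npos )
			out += line + "\n";
		start = end + 1;
	}
	return out;
}

// tiny FNV-1a stand-in for hash32n()
static unsigned long checksum32 ( const std::string &s ) {
	unsigned long h = 2166136261UL;
	for ( size_t i = 0 ; i < s.size() ; i++ ) {
		h ^= (unsigned char)s[i];
		h *= 16777619UL;
	}
	return h & 0xffffffffUL;
}

int main ( ) {
	std::string page = "10 results\nquery took 12 ms)\n<b>broncos</b>\n";
	printf ( "crc=%lu\n" , checksum32 ( stripVolatile ( page ) ) );
	return 0;
}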