Merge branch 'ia' into testing

commit 1114deeb29

Changed files: Collectiondb.cpp, Collectiondb.h, Conf.h, Errno.cpp, Errno.h, HttpMime.cpp, HttpMime.h, HttpRequest.cpp, HttpServer.cpp, Linkdb.cpp, Makefile, Mem.cpp, Msg13.cpp, Msg13.h, Msg2.cpp, Msg3a.cpp, Msg40.cpp, Msg40.h, Msg5.cpp, PageAddUrl.cpp, PageInject.cpp, PageInject.h, PageReindex.cpp, PageRoot.cpp, Parms.cpp, Parms.h, PingServer.cpp, Process.cpp, Spider.cpp, Spider.h, TcpServer.cpp, Threads.cpp, Url.cpp, Url.h, XmlDoc.cpp, XmlDoc.h, fctypes.cpp, fctypes.h, html, qa.cpp

Collectiondb.cpp (297 changed lines)
@@ -463,12 +463,24 @@ bool Collectiondb::addNewColl ( char *coll ,
    if ( ! h ) {
        log("crawlbot: bad custom collname");
        g_errno = EBADENGINEER;
        mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
        delete ( cr );
        return true;
    }
    *h = '\0';
    crawl = h + 1;
    if ( ! crawl[0] ) {
        log("crawlbot: bad custom crawl name");
        mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
        delete ( cr );
        g_errno = EBADENGINEER;
        return true;
    }
    // or if too big!
    if ( gbstrlen(crawl) > 30 ) {
        log("crawlbot: crawlbot crawl NAME is over 30 chars");
        mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
        delete ( cr );
        g_errno = EBADENGINEER;
        return true;
    }
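The hunk above tightens validation of crawlbot ("custom crawl") collection names: the part after the separator must exist and be at most 30 characters, otherwise the new CollectionRec is freed and EBADENGINEER is returned. A hedged sketch of the same checks, assuming a "token-crawlname" layout with '-' as the separator (the helper name and separator are illustrative, not taken from the diff):

```cpp
#include <cstring>

// Returns true only if the name has a separator, a non-empty crawl-name
// part, and that part is at most 30 characters long.
bool isValidCustomCollName(const char *coll) {
    const char *h = strchr(coll, '-');     // assumed separator between token and crawl name
    if (!h) return false;                  // "bad custom collname"
    const char *crawl = h + 1;
    if (!crawl[0]) return false;           // "bad custom crawl name"
    if (strlen(crawl) > 30) return false;  // "crawl NAME is over 30 chars"
    return true;
}
```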
@@ -1936,6 +1948,17 @@ bool CollectionRec::load ( char *coll , int32_t i ) {
           m_coll,
           (int32_t)m_collnum,
           (int32_t)m_globalCrawlInfo.m_hasUrlsReadyToSpider);

+    // the list of ip addresses that we have detected as being throttled
+    // and therefore backoff and use proxies for
+    if ( ! g_conf.m_doingCommandLine ) {
+        sb.reset();
+        sb.safePrintf("%scoll.%s.%"INT32"/",
+                      g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
+        m_twitchyTable.m_allocName = "twittbl";
+        m_twitchyTable.load ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
+    }

     ////////////
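This hunk loads a per-collection "twitchy" table of IPs that looked throttled, persisted as ipstouseproxiesfor.dat in the collection directory, so the spider can back off and route those hosts through proxies. A minimal illustrative sketch of the persistence idea only, using std::unordered_set and a plain text file rather than Gigablast's HashTableX and its binary .dat format:

```cpp
#include <cstdint>
#include <fstream>
#include <string>
#include <unordered_set>

struct TwitchyIps {
    std::unordered_set<uint32_t> ips;   // IPs we should back off from / proxy for

    void load(const std::string &path) {
        std::ifstream in(path);
        uint32_t ip;
        while (in >> ip) ips.insert(ip);
    }
    void save(const std::string &path) const {
        std::ofstream out(path);
        for (uint32_t ip : ips) out << ip << "\n";
    }
    bool shouldUseProxy(uint32_t ip) const { return ips.count(ip) != 0; }
};
```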
@@ -2173,6 +2196,9 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
     m_harvestLinks[n] = 1;
     */

+    // max spiders per ip
+    int32_t ipms = 7;
+
     m_regExs[n].set("isreindex");
     m_harvestLinks [n] = 1;
     m_spiderFreqs  [n] = 0; // 30 days default
@ -2228,7 +2254,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 85;
|
||||
if ( ! strcmp(s,"news") )
|
||||
@ -2240,7 +2266,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 52;
|
||||
if ( ! strcmp(s,"news") )
|
||||
@ -2252,7 +2278,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 51;
|
||||
if ( ! strcmp(s,"news") )
|
||||
@ -2265,7 +2291,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 50;
|
||||
if ( ! strcmp(s,"news") )
|
||||
@ -2276,7 +2302,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7.0; // days b4 respider
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 48;
|
||||
if ( ! strcmp(s,"news") )
|
||||
@ -2287,7 +2313,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 49;
|
||||
if ( ! strcmp(s,"news") )
|
||||
@ -2298,7 +2324,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 10.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 47;
|
||||
if ( ! strcmp(s,"news") )
|
||||
@ -2310,7 +2336,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 45;
|
||||
if ( ! strcmp(s,"news") )
|
||||
@ -2321,7 +2347,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 44;
|
||||
if ( ! strcmp(s,"news") )
|
||||
@ -2333,7 +2359,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20.0; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 43;
|
||||
if ( ! strcmp(s,"news") )
|
||||
@ -2344,7 +2370,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20.0; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 42;
|
||||
if ( ! strcmp(s,"news") )
|
||||
@ -2358,7 +2384,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 40;
|
||||
if ( ! strcmp(s,"news") )
|
||||
@ -2369,7 +2395,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 39;
|
||||
if ( ! strcmp(s,"news") )
|
||||
@ -2380,7 +2406,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 40;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 30;
|
||||
// do not harvest links if we are spiderings NEWS
|
||||
@ -2394,7 +2420,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 40;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 29;
|
||||
// do not harvest links if we are spiderings NEWS
|
||||
@ -2408,7 +2434,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 60;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 20;
|
||||
// turn off spidering if hopcount is too big and we are spiderings NEWS
|
||||
@ -2424,7 +2450,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 60;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 19;
|
||||
// turn off spidering if hopcount is too big and we are spiderings NEWS
|
||||
@ -2451,7 +2477,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 60;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 1;
|
||||
if ( ! strcmp(s,"news") ) {
|
||||
@ -2487,6 +2513,9 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
|
||||
bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
|
||||
// max spiders per ip
|
||||
int32_t ipms = 7;
|
||||
|
||||
int32_t n = 0;
|
||||
|
||||
m_regExs[n].set("isreindex");
|
||||
@ -2542,7 +2571,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 85;
|
||||
n++;
|
||||
@ -2553,7 +2582,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 50;
|
||||
n++;
|
||||
@ -2565,7 +2594,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 50;
|
||||
n++;
|
||||
@ -2574,7 +2603,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
// m_harvestLinks [n] = 1;
|
||||
// m_spiderFreqs [n] = 7; // 30 days default
|
||||
// m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
// m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
// m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
// m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
// m_spiderPriorities [n] = 20;
|
||||
// n++;
|
||||
@ -2586,7 +2615,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7.0; // days b4 respider
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 48;
|
||||
n++;
|
||||
@ -2597,7 +2626,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7.0; // days b4 respider
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 48;
|
||||
n++;
|
||||
@ -2606,7 +2635,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7.0; // days b4 respider
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 19;
|
||||
n++;
|
||||
@ -2620,7 +2649,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 49;
|
||||
n++;
|
||||
@ -2631,7 +2660,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 49;
|
||||
n++;
|
||||
@ -2640,7 +2669,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 18;
|
||||
n++;
|
||||
@ -2652,7 +2681,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 10.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 47;
|
||||
n++;
|
||||
@ -2662,7 +2691,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 10.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 47;
|
||||
n++;
|
||||
@ -2671,7 +2700,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 10.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 17;
|
||||
n++;
|
||||
@ -2684,7 +2713,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 40;
|
||||
n++;
|
||||
@ -2695,7 +2724,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 40;
|
||||
n++;
|
||||
@ -2704,7 +2733,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 16;
|
||||
n++;
|
||||
@ -2716,7 +2745,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 39;
|
||||
n++;
|
||||
@ -2726,7 +2755,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 39;
|
||||
n++;
|
||||
@ -2735,7 +2764,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 15;
|
||||
n++;
|
||||
@ -2747,7 +2776,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 40;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 30;
|
||||
n++;
|
||||
@ -2758,7 +2787,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 40;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 30;
|
||||
n++;
|
||||
@ -2767,7 +2796,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 40;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 14;
|
||||
n++;
|
||||
@ -2780,7 +2809,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 40;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 29;
|
||||
n++;
|
||||
@ -2790,7 +2819,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 40;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 29;
|
||||
n++;
|
||||
@ -2799,7 +2828,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 40;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 13;
|
||||
n++;
|
||||
@ -2812,7 +2841,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 60;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 22;
|
||||
n++;
|
||||
@ -2823,7 +2852,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 60;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 22;
|
||||
n++;
|
||||
@ -2832,7 +2861,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 60;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 12;
|
||||
n++;
|
||||
@ -2845,7 +2874,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 60;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 21;
|
||||
n++;
|
||||
@ -2855,7 +2884,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 60;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 21;
|
||||
n++;
|
||||
@ -2864,7 +2893,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 60;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 11;
|
||||
n++;
|
||||
@ -2875,7 +2904,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 60;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 1;
|
||||
n++;
|
||||
@ -2895,6 +2924,9 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
|
||||
bool CollectionRec::rebuildShallowRules ( ) {
|
||||
|
||||
// max spiders per ip
|
||||
int32_t ipms = 7;
|
||||
|
||||
int32_t n = 0;
|
||||
|
||||
m_regExs[n].set("isreindex");
|
||||
@ -2950,7 +2982,7 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 85;
|
||||
n++;
|
||||
@ -2965,7 +2997,7 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 40;
|
||||
m_maxSpidersPerRule [n] = 0; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 30;
|
||||
n++;
|
||||
@ -2976,7 +3008,7 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 40;
|
||||
m_maxSpidersPerRule [n] = 0; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 30;
|
||||
n++;
|
||||
@ -2988,7 +3020,7 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 50;
|
||||
n++;
|
||||
@ -2997,7 +3029,7 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7.0; // days b4 respider
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 48;
|
||||
n++;
|
||||
@ -3009,7 +3041,7 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 49;
|
||||
n++;
|
||||
@ -3021,7 +3053,7 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 10.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 47;
|
||||
n++;
|
||||
@ -3034,7 +3066,7 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 40;
|
||||
n++;
|
||||
@ -3044,7 +3076,7 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 39;
|
||||
n++;
|
||||
@ -3056,7 +3088,7 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 40;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 30;
|
||||
n++;
|
||||
@ -3065,7 +3097,7 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 40;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 29;
|
||||
n++;
|
||||
@ -3077,7 +3109,7 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 60;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 22;
|
||||
n++;
|
||||
@ -3086,7 +3118,7 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 60;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 21;
|
||||
n++;
|
||||
@ -3097,7 +3129,7 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 60;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 1;
|
||||
n++;
|
||||
@@ -3207,6 +3239,13 @@ bool CollectionRec::save ( ) {
         g_errno = 0;
     }

+    // the list of ip addresses that we have detected as being throttled
+    // and therefore backoff and use proxies for
+    sb.reset();
+    sb.safePrintf("%scoll.%s.%"INT32"/",
+                  g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
+    m_twitchyTable.save ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
+
     // do not need a save now
     m_needsSave = false;
@@ -3356,24 +3395,93 @@ bool CollectionRec::hasSearchPermission ( TcpSocket *s , int32_t encapIp ) {
bool expandRegExShortcuts ( SafeBuf *sb ) ;
void nukeDoledb ( collnum_t collnum );

bool CollectionRec::rebuildUrlFiltersDiffbot() {

    //logf(LOG_DEBUG,"db: rebuilding url filters");

// rebuild the regexes related to diffbot, such as the one for the URL pattern
bool CollectionRec::rebuildDiffbotRegexes() {
    //logf(LOG_DEBUG,"db: rebuilding url filters");
    char *ucp = m_diffbotUrlCrawlPattern.getBufStart();
    if ( ucp && ! ucp[0] ) ucp = NULL;

    // get the regexes
    if ( ! ucp ) ucp = m_diffbotUrlCrawlRegEx.getBufStart();
    if ( ucp && ! ucp[0] ) ucp = NULL;
    char *upp = m_diffbotUrlProcessPattern.getBufStart();
    if ( upp && ! upp[0] ) upp = NULL;

    if ( ! upp ) upp = m_diffbotUrlProcessRegEx.getBufStart();
    if ( upp && ! upp[0] ) upp = NULL;
    char *ppp = m_diffbotPageProcessPattern.getBufStart();
    if ( ppp && ! ppp[0] ) ppp = NULL;

    // recompiling regexes starts now
    if ( m_hasucr ) {
        regfree ( &m_ucr );
        m_hasucr = false;
    }
    if ( m_hasupr ) {
        regfree ( &m_upr );
        m_hasupr = false;
    }
    // copy into tmpbuf
    SafeBuf tmp;
    char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
    if ( rx && ! rx[0] ) rx = NULL;
    if ( rx ) {
        tmp.reset();
        tmp.safeStrcpy ( rx );
        expandRegExShortcuts ( &tmp );
        m_hasucr = true;
    }
    if ( rx && regcomp ( &m_ucr , tmp.getBufStart() ,
                         REG_EXTENDED| //REG_ICASE|
                         REG_NEWLINE ) ) { // |REG_NOSUB) ) {
        // error!
        log("coll: regcomp %s failed: %s. "
            "Ignoring.",
            rx,mstrerror(errno));
        regfree ( &m_ucr );
        m_hasucr = false;
    }

    rx = m_diffbotUrlProcessRegEx.getBufStart();
    if ( rx && ! rx[0] ) rx = NULL;
    if ( rx ) m_hasupr = true;
    if ( rx ) {
        tmp.reset();
        tmp.safeStrcpy ( rx );
        expandRegExShortcuts ( &tmp );
        m_hasupr = true;
    }
    if ( rx && regcomp ( &m_upr , tmp.getBufStart() ,
                         REG_EXTENDED| // REG_ICASE|
                         REG_NEWLINE ) ) { // |REG_NOSUB) ) {
        // error!
        log("coll: regcomp %s failed: %s. "
            "Ignoring.",
            rx,mstrerror(errno));
        regfree ( &m_upr );
        m_hasupr = false;
    }
    return true;

}
bool CollectionRec::rebuildUrlFiltersDiffbot() {

    //logf(LOG_DEBUG,"db: rebuilding url filters");
    char *ucp = m_diffbotUrlCrawlPattern.getBufStart();
    if ( ucp && ! ucp[0] ) ucp = NULL;

    // if we had a regex, that works for this purpose as well
    if ( ! ucp ) ucp = m_diffbotUrlCrawlRegEx.getBufStart();
    if ( ucp && ! ucp[0] ) ucp = NULL;

    char *upp = m_diffbotUrlProcessPattern.getBufStart();
    if ( upp && ! upp[0] ) upp = NULL;

    // if we had a regex, that works for this purpose as well
    if ( ! upp ) upp = m_diffbotUrlProcessRegEx.getBufStart();
    if ( upp && ! upp[0] ) upp = NULL;

    char *ppp = m_diffbotPageProcessPattern.getBufStart();
    if ( ppp && ! ppp[0] ) ppp = NULL;
@@ -3405,13 +3513,16 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
         expandRegExShortcuts ( &tmp );
         m_hasucr = true;
     }
-    if ( rx && regcomp ( &m_ucr , tmp.getBufStart() ,
-                         REG_EXTENDED| //REG_ICASE|
-                         REG_NEWLINE ) ) { // |REG_NOSUB) ) {
+    int32_t err;
+    if ( rx && ( err = regcomp ( &m_ucr , tmp.getBufStart() ,
+                                 REG_EXTENDED| //REG_ICASE|
+                                 REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
         // error!
+        char errbuf[1024];
+        regerror(err,&m_ucr,errbuf,1000);
         log("coll: regcomp %s failed: %s. "
-            "Ignoring.",
-            rx,mstrerror(errno));
+            "Ignoring.",
+            rx,errbuf);
         regfree ( &m_ucr );
         m_hasucr = false;
     }
@@ -3426,18 +3537,19 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
         expandRegExShortcuts ( &tmp );
         m_hasupr = true;
     }
-    if ( rx && regcomp ( &m_upr , tmp.getBufStart() ,
-                         REG_EXTENDED| // REG_ICASE|
-                         REG_NEWLINE ) ) { // |REG_NOSUB) ) {
+    if ( rx && ( err = regcomp ( &m_upr , tmp.getBufStart() ,
+                                 REG_EXTENDED| // REG_ICASE|
+                                 REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
+        char errbuf[1024];
+        regerror(err,&m_upr,errbuf,1000);
         // error!
         log("coll: regcomp %s failed: %s. "
             "Ignoring.",
-            rx,mstrerror(errno));
+            rx,errbuf);
         regfree ( &m_upr );
         m_hasupr = false;
     }

     // what diffbot url to use for processing
     char *api = m_diffbotApiUrl.getBufStart();
     if ( api && ! api[0] ) api = NULL;
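These hunks change how regex compilation failures are reported: regcomp() returns its error code rather than setting errno, so the old mstrerror(errno) message was meaningless; the patch captures the return value and formats it with regerror(). A small standalone sketch of that POSIX pattern (not Gigablast code; the pattern string is just an intentionally invalid example):

```cpp
#include <regex.h>
#include <cstdio>

int main() {
    regex_t re;
    const char *pattern = "*broken(";   // intentionally invalid regex
    // regcomp() reports failure through its return code, not errno.
    int err = regcomp(&re, pattern, REG_EXTENDED | REG_NEWLINE);
    if (err) {
        char errbuf[1024];
        regerror(err, &re, errbuf, sizeof(errbuf));   // human-readable message
        fprintf(stderr, "regcomp %s failed: %s\n", pattern, errbuf);
        return 1;
    }
    regfree(&re);
    return 0;
}
```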
@@ -3837,9 +3949,15 @@ bool CollectionRec::rebuildUrlFilters ( ) {
     }

-    // the code beow is only for diffbot custom crawls
-    if ( ! m_isCustomCrawl ) return true; //!= 1 && // crawl api
+    // If the crawl is not generated by crawlbot, then we will just update
+    // the regexes concerning the urls to process
+    rebuildDiffbotRegexes();
+    if ( ! m_isCustomCrawl ){
+        return true;
+    }
+
+    // on the other hand, if it is a crawlbot job, then by convention the url filters are all set
+    // to some default ones.
     return rebuildUrlFiltersDiffbot();
 }
@@ -3872,17 +3990,20 @@ void testRegex ( ) {
     rx = ".*?article[0-9]*?.html";

     regex_t ucr;
+    int32_t err;

-    if ( regcomp ( &ucr , rx ,
-                   REG_ICASE
-                   |REG_EXTENDED
-                   //|REG_NEWLINE
-                   //|REG_NOSUB
-                   ) ) {
+    if ( ( err = regcomp ( &ucr , rx ,
+                           REG_ICASE
+                           |REG_EXTENDED
+                           //|REG_NEWLINE
+                           //|REG_NOSUB
+                           ) ) ) {
         // error!
+        char errbuf[1024];
+        regerror(err,&ucr,errbuf,1000);
         log("xmldoc: regcomp %s failed: %s. "
             "Ignoring.",
-            rx,mstrerror(errno));
+            rx,errbuf);
     }

     logf(LOG_DEBUG,"db: compiled '%s' for crawl pattern",rx);
Collectiondb.h

@@ -394,6 +394,9 @@ class CollectionRec {
     // for diffbot crawl or bulk jobs
     bool rebuildUrlFiltersDiffbot();

+    // rebuild the regexes related to diffbot, such as the one for the URL pattern
+    bool rebuildDiffbotRegexes();
+
     bool rebuildLangRules( char *lang , char *tld );

     bool rebuildShallowRules();
@@ -426,6 +429,10 @@ class CollectionRec {

     int64_t m_spiderCorruptCount;

+    // holds ips that have been detected as being throttled and we need
+    // to backoff and use proxies on
+    HashTableX m_twitchyTable;
+
     //
     // CLOUD SEARCH ENGINE SUPPORT
     //
@@ -479,6 +486,7 @@ class CollectionRec {
     char m_detectCustomErrorPages ;
     char m_useSimplifiedRedirects ;
     char m_useIfModifiedSince ;
+    char m_useTimeAxis ;
     char m_buildVecFromCont ;
     int32_t m_maxPercentSimilarPublishDate;
     char m_useSimilarityPublishDate;
@@ -511,6 +519,8 @@ class CollectionRec {
     char m_doIpLookups ; // considered iff using proxy
     char m_useRobotsTxt ;
     char m_forceUseFloaters ;
+    char m_automaticallyUseProxies ;
+    char m_automaticallyBackOff ;
     //char m_restrictDomain ; // say on same domain as seeds?
     char m_doTuringTest ; // for addurl
     char m_applyFilterToText ; // speeds us up
Conf.h (3 changed lines)

@@ -157,6 +157,7 @@ class Conf {
     SafeBuf m_proxyTestUrl;
     bool m_useRandAgents;
     bool m_useProxyIps;
+    bool m_automaticallyUseProxyIps;
     SafeBuf m_proxyAuth;

     // built-in dns parameters using name servers
@@ -679,6 +680,8 @@ class Conf {
     bool m_logDebugStats ;
     bool m_logDebugSummary ;
     bool m_logDebugSpider ;
+    bool m_logDebugMsg13 ;
+    bool m_diffbotMsg13Hack ;
     bool m_logDebugUrlAttempts ;
     bool m_logDebugTcp ;
     bool m_logDebugThread ;
Errno.cpp

@@ -108,6 +108,7 @@ case EDNSBAD : return "DNS sent an unknown response code";
 case EDNSREFUSED : return "DNS refused to talk";
 case EDNSDEAD : return "DNS hostname does not exist";
 case EDNSTIMEDOUT : return "DNS timed out";
+case EDNSERROR : return "DNS lookup error";
 case ECOLLTOOBIG : return "Collection is too long";
 case ESTRIKEOUT : return "Retried enough times, deleting doc";
 case ENOPERM : return "Permission Denied";
@@ -191,6 +192,7 @@ case EREADONLYMODE: return "In read only mode. Failed.";
 case ENOTITLEREC: return "No title rec found when recycling content";
 case EQUERYINGDISABLED: return "Querying is disabled in the master controls";
 case EJSONMISSINGLASTCURLY: return "JSON was missing last curly bracket";
+case EADMININTERFERENCE: return "Adminstrative interference";
 }
 // if the remote error bit is clear it must be a regulare errno
 //if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );
Errno.h (4 changed lines)

@@ -112,6 +112,7 @@ enum {
     EDNSREFUSED , //dns refused to talk to us
     EDNSDEAD , //dns is dead
     EDNSTIMEDOUT , //was just EUDPTIMEDOUT
+    EDNSERROR ,
     ECOLLTOOBIG , //collection is too long
     ESTRIKEOUT , //retried enough times; deleting doc & giving up
     ENOPERM , //permission denied
@@ -195,6 +196,7 @@ enum {
     EREADONLYMODE,
     ENOTITLEREC,
     EQUERYINGDISABLED,
-    EJSONMISSINGLASTCURLY
+    EJSONMISSINGLASTCURLY,
+    EADMININTERFERENCE
 };
 #endif
33
HttpMime.cpp
33
HttpMime.cpp
@ -45,6 +45,7 @@ void HttpMime::reset ( ) {
|
||||
m_locationFieldLen = 0;
|
||||
m_contentEncodingPos = NULL;
|
||||
m_contentLengthPos = NULL;
|
||||
m_contentTypePos = NULL;
|
||||
}
|
||||
|
||||
// . returns false if could not get a valid mime
|
||||
@ -67,7 +68,12 @@ bool HttpMime::set ( char *buf , int32_t bufLen , Url *url ) {
|
||||
// . return false if we had no mime boundary
|
||||
// . but set m_bufLen to 0 so getMimeLen() will return 0 instead of -1
|
||||
// thus avoiding a potential buffer overflow
|
||||
if ( m_bufLen < 0 ) { m_bufLen = 0; m_boundaryLen = 0; return false; }
|
||||
if ( m_bufLen < 0 ) {
|
||||
m_bufLen = 0;
|
||||
m_boundaryLen = 0;
|
||||
log("mime: no rnrn boundary detected");
|
||||
return false;
|
||||
}
|
||||
// set this
|
||||
m_content = buf + m_bufLen;
|
||||
// . parse out m_status, m_contentLen, m_lastModifiedData, contentType
|
||||
@ -157,8 +163,12 @@ bool HttpMime::parse ( char *mime , int32_t mimeLen , Url *url ) {
|
||||
time_t now = time(NULL);
|
||||
if (m_lastModifiedDate > now) m_lastModifiedDate = now;
|
||||
}
|
||||
else if ( strncasecmp ( p , "Content-Type:" ,13) == 0 )
|
||||
else if ( strncasecmp ( p , "Content-Type:" ,13) == 0 ) {
|
||||
m_contentType = getContentTypePrivate ( p + 13 );
|
||||
char *s = p + 13;
|
||||
while ( *s == ' ' || *s == '\t' ) s++;
|
||||
m_contentTypePos = s;
|
||||
}
|
||||
else if ( strncasecmp ( p , "Set-Cookie:" ,10) == 0 ) {
|
||||
m_cookie = p + 11;
|
||||
if ( m_cookie[0] == ' ' ) m_cookie++;
|
||||
@ -533,6 +543,8 @@ int32_t getContentTypeFromStr ( char *s ) {
|
||||
else if (!strcasecmp(s,"application/vnd.ms-powerpoint")) ct = CT_PPT;
|
||||
else if (!strcasecmp(s,"application/mspowerpoint") ) ct = CT_PPT;
|
||||
else if (!strcasecmp(s,"application/postscript" ) ) ct = CT_PS;
|
||||
else if (!strcasecmp(s,"application/warc" ) ) ct = CT_WARC;
|
||||
else if (!strcasecmp(s,"application/arc" ) ) ct = CT_ARC;
|
||||
else if (!strcasecmp(s,"image/gif" ) ) ct = CT_GIF;
|
||||
else if (!strcasecmp(s,"image/jpeg" ) ) ct = CT_JPG;
|
||||
else if (!strcasecmp(s,"image/png" ) ) ct = CT_PNG;
|
||||
@ -540,6 +552,7 @@ int32_t getContentTypeFromStr ( char *s ) {
|
||||
else if (!strncasecmp(s,"image/",6 ) ) ct = CT_IMAGE;
|
||||
else if (!strcasecmp(s,"application/javascript" ) ) ct = CT_JS;
|
||||
else if (!strcasecmp(s,"application/x-javascript") ) ct = CT_JS;
|
||||
else if (!strcasecmp(s,"application/x-gzip" ) ) ct = CT_GZ;
|
||||
else if (!strcasecmp(s,"text/javascript" ) ) ct = CT_JS;
|
||||
else if (!strcasecmp(s,"text/x-js" ) ) ct = CT_JS;
|
||||
else if (!strcasecmp(s,"text/js" ) ) ct = CT_JS;
|
||||
@ -626,6 +639,17 @@ void resetHttpMime ( ) {
|
||||
s_mimeTable.reset();
|
||||
}
|
||||
|
||||
const char *extensionToContentTypeStr2 ( char *ext , int32_t elen ) {
|
||||
// assume text/html if no extension provided
|
||||
if ( ! ext || ! ext[0] ) return NULL;
|
||||
if ( elen <= 0 ) return NULL;
|
||||
// get hash for table look up
|
||||
int32_t key = hash32 ( ext , elen );
|
||||
char **pp = (char **)s_mimeTable.getValue ( &key );
|
||||
if ( ! pp ) return NULL;
|
||||
return *pp;
|
||||
}
|
||||
|
||||
const char *HttpMime::getContentTypeFromExtension ( char *ext , int32_t elen) {
|
||||
// assume text/html if no extension provided
|
||||
if ( ! ext || ! ext[0] ) return "text/html";
|
||||
@ -1051,7 +1075,10 @@ static char *s_ext[] = {
|
||||
"xwd" , "image/x-xwindowdump",
|
||||
"xyz" , "chemical/x-pdb",
|
||||
"zip" , "application/zip" ,
|
||||
"xpi", "application/x-xpinstall"
|
||||
"xpi", "application/x-xpinstall",
|
||||
// newstuff
|
||||
"warc", "application/warc",
|
||||
"arc", "application/arc"
|
||||
};
|
||||
|
||||
// . init s_mimeTable in this call
|
||||
|
@ -9,6 +9,8 @@
|
||||
// convert application/json to CT_JSON for instance
|
||||
int32_t getContentTypeFromStr ( char *s ) ;
|
||||
|
||||
const char *extensionToContentTypeStr2 ( char *ext , int32_t elen ) ;
|
||||
|
||||
#include <time.h>
|
||||
|
||||
void getTime ( char *s , int *sec , int *min , int *hour ) ;
|
||||
@ -42,6 +44,9 @@ time_t atotime5 ( char *s ) ;
|
||||
#define CT_JSON 16
|
||||
#define CT_IMAGE 17
|
||||
#define CT_STATUS 18 // an internal type indicating spider reply
|
||||
#define CT_GZ 19
|
||||
#define CT_ARC 20
|
||||
#define CT_WARC 21
|
||||
|
||||
#define ET_IDENTITY 0
|
||||
#define ET_GZIP 1
|
||||
@ -127,6 +132,7 @@ class HttpMime {
|
||||
int32_t getContentEncoding () {return m_contentEncoding;}
|
||||
char *getContentEncodingPos() {return m_contentEncodingPos;}
|
||||
char *getContentLengthPos() {return m_contentLengthPos;}
|
||||
char *getContentTypePos() {return m_contentTypePos;}
|
||||
|
||||
|
||||
// private:
|
||||
@ -166,6 +172,7 @@ class HttpMime {
|
||||
int32_t m_contentEncoding;
|
||||
char *m_contentEncodingPos;
|
||||
char *m_contentLengthPos;
|
||||
char *m_contentTypePos;
|
||||
|
||||
// the size of the terminating boundary, either 1 or 2 bytes.
|
||||
// just the last \n in the case of a \n\n or \r in the case
|
||||
|
@ -1548,8 +1548,8 @@ void HttpRequest::parseFieldsMultipart ( char *s , int32_t slen ) {
|
||||
// Content-Disposition: form-data; name=\"file\"; filename=\"poo.txt\"\r\nContent-Type: text/plain\r\n\r\nsomething here\n=====\nagain we do it...
|
||||
char *equal2 = strstr ( s , "\"" );
|
||||
// debug point
|
||||
if ( strncmp(s,"file",4) == 0 )
|
||||
log("hey");
|
||||
// if ( strncmp(s,"file",4) == 0 )
|
||||
// log("hey");
|
||||
// so if we had that then we had an uploaded file
|
||||
bool uploadedFile = false;
|
||||
if ( equal2 && equal && equal2 < equal ) {
|
||||
|
218
HttpServer.cpp
218
HttpServer.cpp
@ -197,7 +197,6 @@ bool HttpServer::getDoc ( char *url ,
|
||||
if ( ! ip || useHttpTunnel )
|
||||
host = getHostFast ( url , &hostLen , &port );
|
||||
|
||||
|
||||
// this returns false and sets g_errno on error
|
||||
if ( ! fullRequest ) {
|
||||
if ( ! r.set ( url , offset , size , ifModifiedSince ,
|
||||
@ -212,6 +211,7 @@ bool HttpServer::getDoc ( char *url ,
|
||||
// TODO: ensure we close the socket on this error!
|
||||
return true;
|
||||
}
|
||||
//log("archive: %s",r.m_reqBuf.getBufStart());
|
||||
reqSize = r.getRequestLen();
|
||||
int32_t need = reqSize + pcLen;
|
||||
// if we are requesting an HTTPS url through a proxy then
|
||||
@ -1035,6 +1035,44 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
|
||||
if ( strncmp ( path , "/download/", 10 ) == 0 )
|
||||
return sendBackDump ( s , r );
|
||||
|
||||
if ( strncmp ( path , "/gbiaitem/" , 10 ) == 0 ) {
|
||||
SafeBuf cmd;
|
||||
char *iaItem = path + 10;
|
||||
char c = iaItem[pathLen];
|
||||
iaItem[pathLen] = '\0';
|
||||
// iaItem is like "webgroup-20100422114008-00011"
|
||||
// print out the warc files as if they were urls
|
||||
// so we can spider them through the spider pipeline as-is.
|
||||
// this hack only works on internet archive servers
|
||||
// that have the '/home/mwells/ia' obviously
|
||||
cmd.safePrintf("/home/mwells/ia list %s --glob='*arc.gz' | "
|
||||
"awk '{print \"<a "
|
||||
"href=http://archive.org/download/"
|
||||
"%s/\"$1\">\"$1\"</a><br>\"}' > ./tmpiaout"
|
||||
//, g_hostdb.m_dir
|
||||
,iaItem
|
||||
,iaItem
|
||||
);
|
||||
iaItem[pathLen] = c;
|
||||
log("system: %s",cmd.getBufStart());
|
||||
gbsystem ( cmd.getBufStart() );
|
||||
SafeBuf sb;
|
||||
sb.safePrintf("<title>%s</title>\n<br>\n",iaItem);
|
||||
sb.load ( "./tmpiaout" );
|
||||
// remove those pesky ^M guys. i guess ia is windows based.
|
||||
sb.safeReplace3("\r","");
|
||||
//log("system: output(%"INT32"=%s",sb.getBufStart(),
|
||||
//sb.length());
|
||||
return g_httpServer.sendDynamicPage(s,
|
||||
sb.getBufStart(),
|
||||
sb.length(),
|
||||
0, false,
|
||||
"text/html",
|
||||
-1, NULL,
|
||||
"UTF-8");
|
||||
}
|
||||
|
||||
|
||||
// . is it a diffbot api request, like "GET /api/*"
|
||||
// . ie "/api/startcrawl" or "/api/stopcrawl" etc.?
|
||||
//if ( strncmp ( path , "/api/" , 5 ) == 0 )
|
||||
@ -2357,6 +2395,7 @@ int32_t getMsgSize ( char *buf, int32_t bufSize, TcpSocket *s ) {
|
||||
}
|
||||
// if has no content then it must end in \n\r\n\r or \r\n\r\n
|
||||
if ( ! hasContent ) return bufSize;
|
||||
|
||||
// look for a Content-Type: field because we now limit how much
|
||||
// we read based on this
|
||||
char *p = buf;
|
||||
@ -2380,45 +2419,71 @@ int32_t getMsgSize ( char *buf, int32_t bufSize, TcpSocket *s ) {
|
||||
// as well index that at least.
|
||||
if ( p + 15 < pend && strncasecmp( p,"application/pdf",15)==0)
|
||||
allOrNothing = true;
|
||||
if ( p + 15 < pend&&strncasecmp(p,"application/x-gzip",18)==0)
|
||||
allOrNothing = true;
|
||||
// adjust "max to read" if we don't have an html/plain doc
|
||||
if ( ! isPost ) {
|
||||
max = s->m_maxOtherDocLen + 10*1024 ;
|
||||
if ( s->m_maxOtherDocLen == -1 ) max = 0x7fffffff;
|
||||
}
|
||||
}
|
||||
|
||||
// // if it is a warc or arc.gz allow it for now but we should
|
||||
// // only allow one spider at a time per host
|
||||
if ( s->m_sendBuf ) {
|
||||
char *p = s->m_sendBuf;
|
||||
char *pend = p + s->m_sendBufSize;
|
||||
if ( strncmp(p,"GET /",5) == 0 ) p += 4;
|
||||
// find end of url we are getting
|
||||
char *e = p;
|
||||
for ( ; *e && e < pend && ! is_wspace_a(*e) ; e++ );
|
||||
if ( e - 8 > p && strncmp(e-8,".warc.gz", 8 ) == 0 )
|
||||
max = 0x7fffffff;
|
||||
if ( e - 7 > p && strncmp(e-7, ".arc.gz", 7 ) == 0 )
|
||||
max = 0x7fffffff;
|
||||
}
|
||||
|
||||
int32_t contentSize = 0;
|
||||
int32_t totalReplySize = 0;
|
||||
|
||||
// now look for Content-Length in the mime
|
||||
for ( int32_t j = 0; j < i ; j++ ) {
|
||||
int32_t j; for ( j = 0; j < i ; j++ ) {
|
||||
if ( buf[j] != 'c' && buf[j] != 'C' ) continue;
|
||||
if ( j + 16 >= i ) break;
|
||||
if ( strncasecmp ( &buf[j], "Content-Length:" , 15 ) != 0 )
|
||||
continue;
|
||||
int32_t contentSize = atol2 ( &buf[j+15] , i - (j+15) );
|
||||
int32_t totalReplySize = contentSize + mimeSize ;
|
||||
// all-or-nothing filter
|
||||
if ( totalReplySize > max && allOrNothing ) {
|
||||
log(LOG_INFO,
|
||||
"http: pdf reply/request size of %"INT32" is larger "
|
||||
"than limit of %"INT32". Cutoff pdf's are useless. "
|
||||
"Abandoning.",totalReplySize,max);
|
||||
// do not read any more than what we have
|
||||
return bufSize;
|
||||
}
|
||||
// warn if we received a post that was truncated
|
||||
if ( totalReplySize > max && isPost ) {
|
||||
log("http: Truncated POST request from %"INT32" "
|
||||
"to %"INT32" bytes. Increase \"max other/text doc "
|
||||
"len\" in Spider Controls page to prevent this.",
|
||||
totalReplySize,max);
|
||||
}
|
||||
// truncate the reply if we have to
|
||||
if ( totalReplySize > max ) {
|
||||
log("http: truncating reply of %"INT32" to %"INT32" bytes",
|
||||
totalReplySize,max);
|
||||
totalReplySize = max;
|
||||
}
|
||||
// truncate if we need to
|
||||
return totalReplySize;
|
||||
contentSize = atol2 ( &buf[j+15] , i - (j+15) );
|
||||
totalReplySize = contentSize + mimeSize ;
|
||||
break;
|
||||
}
|
||||
|
||||
// all-or-nothing filter
|
||||
if ( totalReplySize > max && allOrNothing ) {
|
||||
log(LOG_INFO,
|
||||
"http: reply/request size of %"INT32" is larger "
|
||||
"than limit of %"INT32". Cutoff documents "
|
||||
"of this type are useless. "
|
||||
"Abandoning.",totalReplySize,max);
|
||||
// do not read any more than what we have
|
||||
return bufSize;
|
||||
}
|
||||
// warn if we received a post that was truncated
|
||||
if ( totalReplySize > max && isPost ) {
|
||||
log("http: Truncated POST request from %"INT32" "
|
||||
"to %"INT32" bytes. Increase \"max other/text doc "
|
||||
"len\" in Spider Controls page to prevent this.",
|
||||
totalReplySize,max);
|
||||
}
|
||||
// truncate the reply if we have to
|
||||
if ( totalReplySize > max ) {
|
||||
log("http: truncating reply of %"INT32" to %"INT32" bytes",
|
||||
totalReplySize,max);
|
||||
totalReplySize = max;
|
||||
}
|
||||
// truncate if we need to
|
||||
if ( totalReplySize )
|
||||
return totalReplySize;
|
||||
|
||||
// if it is a POST request with content but no content length...
|
||||
// we don't know how big it is...
|
||||
if ( isPost ) {
|
||||
@ -2849,16 +2914,34 @@ TcpSocket *HttpServer::unzipReply(TcpSocket* s) {
|
||||
// so we need to rewrite the Content-Length: and the
|
||||
// Content-Encoding: http mime field values so they are no longer
|
||||
// "gzip" and use the uncompressed content-length.
|
||||
char *ptr1 = NULL;
|
||||
char *ptr2 = NULL;
|
||||
if(mime.getContentEncodingPos() &&
|
||||
mime.getContentEncodingPos() < mime.getContentLengthPos()) {
|
||||
ptr1 = mime.getContentEncodingPos();
|
||||
ptr2 = mime.getContentLengthPos();
|
||||
}
|
||||
else {
|
||||
ptr1 = mime.getContentLengthPos();
|
||||
ptr2 = mime.getContentEncodingPos();
|
||||
char *ptr1 = mime.getContentEncodingPos();
|
||||
char *ptr2 = mime.getContentLengthPos();
|
||||
char *ptr3 = NULL;
|
||||
|
||||
// change the content type based on the extension before the
|
||||
// .gz extension since we are uncompressing it
|
||||
char *p = s->m_sendBuf + 4;
|
||||
char *pend = s->m_sendBuf + s->m_sendBufSize;
|
||||
const char *newCT = NULL;
|
||||
char *lastPeriod = NULL;
|
||||
// get the extension, if any, before the .gz
|
||||
for ( ; *p && ! is_wspace_a(*p) && p < pend ; p++ ) {
|
||||
if ( p[0] != '.' ) continue;
|
||||
if ( p[1] != 'g' ) { lastPeriod = p; continue; }
|
||||
if ( p[2] != 'z' ) { lastPeriod = p; continue; }
|
||||
if ( ! is_wspace_a(p[3]) ) { lastPeriod = p; continue; }
|
||||
// no prev?
|
||||
if ( ! lastPeriod ) break;
|
||||
// skip period
|
||||
lastPeriod++;
|
||||
// back up
|
||||
newCT = extensionToContentTypeStr2 (lastPeriod,p-lastPeriod);
|
||||
// this is NULL if the file extension is unrecognized
|
||||
if ( ! newCT ) break;
|
||||
// this should be like text/html or
|
||||
// WARC/html or something like that...
|
||||
ptr3 = mime.getContentTypePos();
|
||||
break;
|
||||
}
|
||||
|
||||
// this was writing a number at the start of the mime and messing
|
||||
@ -2870,38 +2953,47 @@ TcpSocket *HttpServer::unzipReply(TcpSocket* s) {
|
||||
char *src = s->m_readBuf;
|
||||
|
||||
// sometimes they are missing Content-Length:
|
||||
if ( ptr1 ) {
|
||||
// copy ptr1 to src
|
||||
gbmemcpy ( pnew, src, ptr1 - src );
|
||||
pnew += ptr1 - src;
|
||||
src += ptr1 - src;
|
||||
// store either the new content encoding or new length
|
||||
if(ptr1 == mime.getContentEncodingPos())
|
||||
pnew += sprintf(pnew, " identity");
|
||||
else
|
||||
pnew += sprintf(pnew, " %"INT32"",newSize);
|
||||
// scan to \r\n at end of that line we replace
|
||||
while ( *src != '\r' && *src != '\n') src++;
|
||||
|
||||
subloop:
|
||||
|
||||
char *nextMin = (char *)-1;
|
||||
if ( ptr1 && (ptr1 < nextMin || nextMin==(char *)-1)) nextMin = ptr1;
|
||||
if ( ptr2 && (ptr2 < nextMin || nextMin==(char *)-1)) nextMin = ptr2;
|
||||
if ( ptr3 && (ptr3 < nextMin || nextMin==(char *)-1)) nextMin = ptr3;
|
||||
|
||||
// if all ptrs are NULL then copy the tail
|
||||
if ( nextMin == (char *)-1 ) nextMin = mimeEnd;
|
||||
|
||||
// copy ptr1 to src
|
||||
gbmemcpy ( pnew, src, nextMin - src );
|
||||
pnew += nextMin - src;
|
||||
src += nextMin - src;
|
||||
// store either the new content encoding or new length
|
||||
if ( nextMin == mime.getContentEncodingPos()) {
|
||||
pnew += sprintf(pnew, " identity");
|
||||
ptr1 = NULL;
|
||||
}
|
||||
else if ( nextMin == mime.getContentLengthPos() ) {
|
||||
pnew += sprintf(pnew, " %"INT32"",newSize);
|
||||
ptr2 = NULL;
|
||||
}
|
||||
else if ( nextMin == mime.getContentTypePos() ) {
|
||||
pnew += sprintf(pnew," %s",newCT);
|
||||
ptr3 = NULL;
|
||||
}
|
||||
|
||||
if ( ptr2 ) {
|
||||
// copy ptr2 to src
|
||||
gbmemcpy ( pnew , src , ptr2 - src );
|
||||
pnew += ptr2 - src;
|
||||
src += ptr2 - src;
|
||||
// now insert the new shit
|
||||
if(ptr2 == mime.getContentEncodingPos())
|
||||
pnew += sprintf(pnew, " identity");
|
||||
else
|
||||
pnew += sprintf(pnew, " %"INT32"",newSize);
|
||||
// loop for more
|
||||
if ( nextMin < mimeEnd ) {
|
||||
// scan to \r\n at end of that line we replace
|
||||
while ( *src != '\r' && *src != '\n') src++;
|
||||
goto subloop;
|
||||
}
|
||||
|
||||
|
||||
// copy the rest
|
||||
gbmemcpy ( pnew , src , mimeEnd - src );
|
||||
pnew += mimeEnd - src;
|
||||
src += mimeEnd - src;
|
||||
// gbmemcpy ( pnew , src , mimeEnd - src );
|
||||
// pnew += mimeEnd - src;
|
||||
// src += mimeEnd - src;
|
||||
|
||||
|
||||
// before restLen was negative because we were skipping over
|
||||
|
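The unzipReply() changes above rewrite three mime fields in place once the body has been gunzipped: Content-Encoding becomes "identity", Content-Length gets the uncompressed size, and Content-Type is optionally swapped for the type implied by the extension that precedes ".gz". The patch walks the three field positions in ascending order (the subloop picking nextMin) so the header can be rebuilt in a single forward copy. A condensed standalone sketch of the same rewrite, using std::string instead of the pnew/src pointer walk; function names here are illustrative, not from the tree:

// Rewrite the header fields after gunzipping a fetched document.
#include <string>
#include <cstdint>

static void replaceFieldValue(std::string &mime, const std::string &field,
                              const std::string &newValue) {
    size_t pos = mime.find(field);
    if (pos == std::string::npos) return;            // field not present
    size_t valStart = pos + field.size();
    size_t eol = mime.find_first_of("\r\n", valStart);
    if (eol == std::string::npos) eol = mime.size();
    mime.replace(valStart, eol - valStart, " " + newValue);
}

std::string rewriteMimeAfterUnzip(std::string mime, int64_t uncompressedLen,
                                  const std::string &newContentType) {
    replaceFieldValue(mime, "Content-Encoding:", "identity");
    replaceFieldValue(mime, "Content-Length:", std::to_string(uncompressedLen));
    if (!newContentType.empty())
        replaceFieldValue(mime, "Content-Type:", newContentType);
    return mime;
}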
35
Linkdb.cpp
35
Linkdb.cpp
@ -5774,23 +5774,32 @@ bool Links::addLink ( char *link , int32_t linkLen , int32_t nodeNum ,
|
||||
if ( link[k] == '>' ) { hasSpaces = false; break; }
|
||||
}
|
||||
bool hitQuestionMark = false;
|
||||
int32_t k; for(k=0;hasSpaces && linkLen<MAX_URL_LEN && k<linkLen ;k++){
|
||||
if ( link[k] == '?' ) hitQuestionMark = true;
|
||||
tmp[k ] = link[k];
|
||||
tmp[k+1] = '\0';
|
||||
if ( tmp[k] != ' ' ) continue;
|
||||
// if we are part of the cgi stuff, use +
|
||||
if ( hitQuestionMark ) { tmp[k] = '+'; continue; }
|
||||
int32_t src = 0;
|
||||
int32_t dst = 0;
|
||||
for ( ;hasSpaces && linkLen<MAX_URL_LEN && src<linkLen ; src++ ){
|
||||
// if not enough buffer then we couldn't do the conversion.
|
||||
if ( k+3 >= MAX_URL_LEN ) { hasSpaces = false; break; }
|
||||
if ( dst+3 >= MAX_URL_LEN ) { hasSpaces = false; break; }
|
||||
if ( link[src] == '?' )
|
||||
hitQuestionMark = true;
|
||||
if ( link[src] != ' ' ) {
|
||||
tmp[dst++] = link[src];
|
||||
continue;
|
||||
}
|
||||
// if we are part of the cgi stuff, use +
|
||||
if ( hitQuestionMark ) {
|
||||
tmp[dst++] = '+';
|
||||
continue;
|
||||
}
|
||||
// if before the '?' then use %20
|
||||
tmp[k ] = '%';
|
||||
tmp[k+1] = '2';
|
||||
tmp[k+2] = '0';
|
||||
tmp[k+3] = '\0';
|
||||
tmp[dst++] = '%';
|
||||
tmp[dst++] = '2';
|
||||
tmp[dst++] = '0';
|
||||
}
|
||||
if ( hasSpaces )
|
||||
if ( hasSpaces ) {
|
||||
link = tmp;
|
||||
linkLen = dst;
|
||||
tmp[dst] = '\0';
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
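The addLink() change above replaces the in-place k-indexed patching with a src/dst copy so the link can grow while spaces are escaped: spaces in the path become "%20", spaces after the first '?' become '+', and the conversion is abandoned if the temporary buffer would overflow. A self-contained sketch of that loop; the buffer size and names stand in for MAX_URL_LEN and the tmp buffer:

// Escape spaces in a link: %20 before the query string, '+' inside it.
#include <cstdint>
#include <cstdio>

bool escapeSpacesInLink(const char *link, int32_t linkLen,
                        char *tmp, int32_t tmpSize, int32_t *newLen) {
    bool hitQuestionMark = false;
    int32_t dst = 0;
    for (int32_t src = 0; src < linkLen; src++) {
        // need room for up to 3 output bytes plus the null terminator
        if (dst + 3 >= tmpSize) return false;
        if (link[src] == '?') hitQuestionMark = true;
        if (link[src] != ' ') { tmp[dst++] = link[src]; continue; }
        if (hitQuestionMark)  { tmp[dst++] = '+';        continue; }
        tmp[dst++] = '%'; tmp[dst++] = '2'; tmp[dst++] = '0';
    }
    tmp[dst] = '\0';
    *newLen = dst;
    return true;
}

int main() {
    char out[1024]; int32_t outLen;
    if (escapeSpacesInLink("/a b.html?q=x y", 15, out, sizeof(out), &outLen))
        printf("%s (%d)\n", out, (int)outLen);   // /a%20b.html?q=x+y (17)
    return 0;
}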
4
Makefile
@ -396,8 +396,8 @@ RdbBuckets.o:
Linkdb.o:
$(CC) $(DEFS) $(CPPFLAGS) -O3 -c $*.cpp

XmlDoc.o:
$(CC) $(DEFS) $(CPPFLAGS) $(XMLDOCOPT) -c $*.cpp
#XmlDoc.o:
# $(CC) $(DEFS) $(CPPFLAGS) $(XMLDOCOPT) -c $*.cpp

# final gigabit generation in here:
Msg40.o:

4
Mem.cpp
@ -700,7 +700,7 @@ void Mem::addMem ( void *mem , int32_t size , const char *note , char isnew ) {
//(int32_t)mem,size,h,s_n,note);
s_n++;
// debug
if ( size > MINMEM && g_conf.m_logDebugMemUsage )
if ( (size > MINMEM && g_conf.m_logDebugMemUsage) || size>=100000000 )
log(LOG_INFO,"mem: addMem(%"INT32"): %s. ptr=0x%"PTRFMT" "
"used=%"INT64"",
size,note,(PTRTYPE)mem,m_used);
@ -1023,7 +1023,7 @@ bool Mem::rmMem ( void *mem , int32_t size , const char *note ) {

keepgoing:
// debug
if ( size > MINMEM && g_conf.m_logDebugMemUsage )
if ( (size > MINMEM && g_conf.m_logDebugMemUsage) || size>=100000000 )
log(LOG_INFO,"mem: rmMem (%"INT32"): "
"ptr=0x%"PTRFMT" %s.",size,(PTRTYPE)mem,note);
297
Msg13.cpp
297
Msg13.cpp
@ -281,6 +281,12 @@ bool Msg13::forwardRequest ( ) {
//
int32_t nh = g_hostdb.m_numHosts;
int32_t hostId = hash32h(((uint32_t)r->m_firstIp >> 8), 0) % nh;

// avoid host #0 for diffbot hack which is dropping some requests
// because of the streaming bug methinks
if ( hostId == 0 && nh >= 2 && g_conf.m_diffbotMsg13Hack )
hostId = 1;

// get host to send to from hostId
Host *h = NULL;
// . pick first alive host, starting with "hostId" as the hostId
@ -295,6 +301,8 @@ bool Msg13::forwardRequest ( ) {
if ( ++hostId >= nh ) hostId = 0;
}

hostId = 0; // HACK!!

// forward it to self if we are the spider proxy!!!
if ( g_hostdb.m_myHost->m_isProxy )
h = g_hostdb.m_myHost;
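The forwarding change above derives the handling host from the top 24 bits of the request's first IP, so every URL on the same /24 is funneled through the same host, with a temporary workaround that keeps traffic off host #0. A sketch of that routing decision; hash32h() is Gigablast's own hash, so a placeholder mixer stands in for it here:

// Route a download request to a host based on the target's /24 block.
#include <cstdint>

static uint32_t mix32(uint32_t x) {            // placeholder for hash32h()
    x ^= x >> 16; x *= 0x7feb352dU;
    x ^= x >> 15; x *= 0x846ca68bU;
    x ^= x >> 16; return x;
}

int32_t pickForwardHost(int32_t firstIp, int32_t numHosts, bool avoidHostZero) {
    int32_t hostId = mix32((uint32_t)firstIp >> 8) % numHosts;
    // optional workaround from the patch: keep traffic off host #0
    if (avoidHostZero && hostId == 0 && numHosts >= 2) hostId = 1;
    return hostId;
}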
@ -504,6 +512,21 @@ bool Msg13::gotFinalReply ( char *reply, int32_t replySize, int32_t replyAllocSi
|
||||
return true;
|
||||
}
|
||||
|
||||
bool isIpInTwitchyTable ( CollectionRec *cr , int32_t ip ) {
if ( ! cr ) return false;
HashTableX *ht = &cr->m_twitchyTable;
if ( ht->m_numSlots == 0 ) return false;
return ( ht->getSlot ( &ip ) >= 0 );
}

bool addIpToTwitchyTable ( CollectionRec *cr , int32_t ip ) {
if ( ! cr ) return true;
HashTableX *ht = &cr->m_twitchyTable;
if ( ht->m_numSlots == 0 )
ht->set ( 4,0,16,NULL,0,false,MAX_NICENESS,"twitchtbl",true);
return ht->addKey ( &ip );
}
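These two helpers are the whole interface to the per-collection "twitchy" table: a hash set of IPs that have throttled us, created lazily on the first insert and consulted before every download. The same idea with standard containers, for illustration only; the patch uses HashTableX with 16 initial slots and MAX_NICENESS:

// A per-collection set of throttling IPs, keyed on the 32-bit address.
#include <unordered_set>
#include <cstdint>

struct Collection {
    std::unordered_set<int32_t> twitchyIps;   // stand-in for m_twitchyTable
};

bool isIpTwitchy(const Collection *cr, int32_t ip) {
    return cr && cr->twitchyIps.count(ip) != 0;
}

void markIpTwitchy(Collection *cr, int32_t ip) {
    if (cr) cr->twitchyIps.insert(ip);        // lazy: set grows on first ban
}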
RdbCache s_hammerCache;
|
||||
static bool s_flag = false;
|
||||
Msg13Request *s_hammerQueueHead = NULL;
|
||||
@ -591,7 +614,8 @@ void handleRequest13 ( UdpSlot *slot , int32_t niceness ) {
|
||||
}
|
||||
|
||||
// log it so we can see if we are hammering
|
||||
if ( g_conf.m_logDebugRobots || g_conf.m_logDebugDownloads )
|
||||
if ( g_conf.m_logDebugRobots || g_conf.m_logDebugDownloads ||
|
||||
g_conf.m_logDebugMsg13 )
|
||||
logf(LOG_DEBUG,"spider: DOWNLOADING %s firstIp=%s",
|
||||
r->ptr_url,iptoa(r->m_firstIp));
|
||||
|
||||
@ -654,7 +678,7 @@ void handleRequest13 ( UdpSlot *slot , int32_t niceness ) {
|
||||
int32_t key = ((uint32_t)r->m_firstIp >> 8);
|
||||
// send to host "h"
|
||||
Host *h = g_hostdb.getBestSpiderCompressionProxy(&key);
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
if ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 )
|
||||
log(LOG_DEBUG,"spider: sending to compression proxy "
|
||||
"%s:%"UINT32"",iptoa(h->m_ip),(uint32_t)h->m_port);
|
||||
// . otherwise, send the request to the key host
|
||||
@ -699,6 +723,11 @@ void handleRequest13 ( UdpSlot *slot , int32_t niceness ) {
|
||||
// do not get .google.com/ crap
|
||||
//if ( strstr(r->ptr_url,".google.com/") ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( r->m_collnum );
|
||||
|
||||
// was it in our table of ips that are throttling us?
|
||||
r->m_wasInTableBeforeStarting = isIpInTwitchyTable ( cr , r->m_urlIp );
|
||||
|
||||
downloadTheDocForReals ( r );
|
||||
}
|
||||
|
||||
@ -716,13 +745,14 @@ void downloadTheDocForReals ( Msg13Request *r ) {
|
||||
bool firstInLine = s_rt.isEmpty ( &r->m_cacheKey );
|
||||
// wait in line cuz someone else downloading it now
|
||||
if ( ! s_rt.addKey ( &r->m_cacheKey , &r ) ) {
|
||||
log("spider: error adding to waiting table %s",r->ptr_url);
|
||||
g_udpServer.sendErrorReply(r->m_udpSlot,g_errno);
|
||||
return;
|
||||
}
|
||||
|
||||
// this means our callback will be called
|
||||
if ( ! firstInLine ) {
|
||||
//log("spider: inlining %s",r->ptr_url);
|
||||
log("spider: waiting in line %s",r->ptr_url);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -733,14 +763,27 @@ void downloadTheDocForReals ( Msg13Request *r ) {
// we tried seemed to be ip-banned
void downloadTheDocForReals2 ( Msg13Request *r ) {

bool useProxies = true;
bool useProxies = false;

// user can turn off proxy use with this switch
if ( ! g_conf.m_useProxyIps ) useProxies = false;
//if ( ! g_conf.m_useProxyIps ) useProxies = false;

// for diffbot turn ON if use robots is off
if ( r->m_forceUseFloaters ) useProxies = true;

CollectionRec *cr = g_collectiondb.getRec ( r->m_collnum );

// if you turned on automatically use proxies in spider controls...
if ( ! useProxies &&
cr &&
r->m_urlIp != 0 &&
r->m_urlIp != -1 &&
// either the global or local setting will work
//( g_conf.m_automaticallyUseProxyIps ||
cr->m_automaticallyUseProxies &&
isIpInTwitchyTable( cr, r->m_urlIp ) )
useProxies = true;

// we gotta have some proxy ips that we can use
if ( ! g_conf.m_proxyIps.hasDigits() ) useProxies = false;
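downloadTheDocForReals2() now defaults useProxies to false and turns it on in a fixed order: the per-request force flag wins, otherwise the collection's "automatically use proxies" setting applies only when the target IP is already in the twitchy table, and an empty proxy list vetoes everything. A compact sketch of that decision; the struct names are simplified stand-ins for Msg13Request and CollectionRec:

// Decide whether a fetch should go out through a spider proxy.
#include <cstdint>

struct FetchRequest { int32_t urlIp; bool forceUseFloaters; };
struct CollSettings { bool automaticallyUseProxies; };

bool shouldUseProxies(const FetchRequest &r, const CollSettings *cr,
                      bool ipIsTwitchy, bool haveProxyList) {
    bool useProxies = false;
    if (r.forceUseFloaters) useProxies = true;          // explicit override
    if (!useProxies && cr && r.urlIp != 0 && r.urlIp != -1 &&
        cr->automaticallyUseProxies && ipIsTwitchy)
        useProxies = true;                               // auto mode, known-throttled IP
    if (!haveProxyList) useProxies = false;              // nothing to use anyway
    return useProxies;
}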
@ -797,7 +840,10 @@ void downloadTheDocForReals2 ( Msg13Request *r ) {
|
||||
// sanity check
|
||||
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
||||
// report it
|
||||
log("spider: msg54 request: %s",mstrerror(g_errno));
|
||||
log("spider: msg54 request1: %s %s",
|
||||
mstrerror(g_errno),r->ptr_url);
|
||||
// crap we gotta send back a reply i guess
|
||||
g_udpServer.sendErrorReply(r->m_udpSlot,g_errno);
|
||||
// g_errno must be set!
|
||||
return;
|
||||
}
|
||||
@ -827,8 +873,8 @@ void gotProxyHostReplyWrapper ( void *state , UdpSlot *slot ) {
|
||||
//int32_t replyAllocSize = slot->m_readBufMaxSize;
|
||||
// bad reply? ip/port/LBid
|
||||
if ( replySize != sizeof(ProxyReply) ) {
|
||||
log("sproxy: bad 54 reply size of %"INT32" != %"INT32"",
|
||||
replySize,(int32_t)sizeof(ProxyReply));
|
||||
log("sproxy: bad 54 reply size of %"INT32" != %"INT32" %s",
|
||||
replySize,(int32_t)sizeof(ProxyReply),r->ptr_url);
|
||||
g_udpServer.sendErrorReply(r->m_udpSlot,g_errno);
|
||||
return;
|
||||
}
|
||||
@ -947,7 +993,7 @@ void downloadTheDocForReals3b ( Msg13Request *r ) {
|
||||
// flag this
|
||||
//if ( g_conf.m_qaBuildMode ) r->m_addToTestCache = true;
|
||||
// note it here
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
if ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 )
|
||||
log("spider: downloading %s (%s) (skiphammercheck=%"INT32")",
|
||||
r->ptr_url,iptoa(r->m_urlIp) ,
|
||||
(int32_t)r->m_skipHammerCheck);
|
||||
@ -1115,7 +1161,33 @@ void doneReportingStatsWrapper ( void *state, UdpSlot *slot ) {
|
||||
s_55Out--;
|
||||
}
|
||||
|
||||
bool ipWasBanned ( TcpSocket *ts , const char **msg ) {
bool ipWasBanned ( TcpSocket *ts , const char **msg , Msg13Request *r ) {

// ts will be null if we got a fake reply from a bulk job
if ( ! ts )
return false;

// do not do this on robots.txt files
if ( r->m_isRobotsTxt )
return false;

// g_errno is 104 for 'connection reset by peer'
if ( g_errno == ECONNRESET ) {
*msg = "connection reset";
return true;
}

// proxy returns empty reply not ECONNRESET if it experiences
// a conn reset
if ( g_errno == EBADMIME && ts->m_readOffset == 0 ) {
*msg = "empty reply";
return true;
}

// on other errors do not do the ban check. it might be a
// tcp time out or something so we have no reply. but connection resets
// are a popular way of saying, hey, don't hit me so hard.
if ( g_errno ) return false;

// if they closed the socket on us we read 0 bytes, assumed
// we were banned...
@ -1140,11 +1212,25 @@ bool ipWasBanned ( TcpSocket *ts , const char **msg ) {

// if it has link to "google.com/recaptcha"
// TODO: use own gbstrstr so we can do QUICKPOLL(niceness)
// TODO: ensure NOT in an invisible div
if ( strstr ( ts->m_readBuf , "google.com/recaptcha/api/challenge") ) {
*msg = "recaptcha link";
return true;
}

//CollectionRec *cr = g_collectiondb.getRec ( r->m_collnum );

// if it is a seed url and there are no links, then perhaps we
// are in a blacklist somewhere already from triggering a spider trap
if ( //isInSeedBuf ( cr , r->ptr_url ) &&
// this is set in XmlDoc.cpp based on hopcount really
r->m_isRootSeedUrl &&
! strstr ( ts->m_readBuf, "<a href" ) ) {
*msg = "root/seed url with no outlinks";
return true;
}

// TODO: compare a simple checksum of the page content to what
// we have downloaded previously from this domain or ip. if it
// seems to be the same no matter what the url, then perhaps we
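ipWasBanned() now takes the request so it can exempt robots.txt fetches and read per-request flags, and it treats a connection reset, an empty reply from the proxy, a reCAPTCHA challenge link, or a seed/root page with no outlinks as signs the IP is throttling us. The heuristics condensed into a standalone sketch; error codes and field names are simplified stand-ins:

// Guess whether a reply means the remote IP has banned or throttled us.
#include <cstring>

enum FetchError { FETCH_OK, FETCH_CONN_RESET, FETCH_BAD_MIME, FETCH_OTHER };

bool looksBanned(FetchError err, const char *body, int bodyLen,
                 bool isRobotsTxt, bool isRootSeedUrl, const char **why) {
    if (isRobotsTxt) return false;                       // never judge robots.txt
    if (err == FETCH_CONN_RESET) { *why = "connection reset"; return true; }
    if (err == FETCH_BAD_MIME && bodyLen == 0) { *why = "empty reply"; return true; }
    if (err != FETCH_OK) return false;                   // timeouts etc. prove nothing
    if (body && strstr(body, "google.com/recaptcha/api/challenge")) {
        *why = "recaptcha link"; return true;
    }
    if (isRootSeedUrl && body && !strstr(body, "<a href")) {
        *why = "root/seed url with no outlinks"; return true;
    }
    return false;
}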
@ -1167,13 +1253,18 @@ void gotHttpReply9 ( void *state , TcpSocket *ts ) {
|
||||
// if we got a 403 Forbidden or an empty reply
|
||||
// then assume the proxy ip got banned so try another.
|
||||
const char *banMsg = NULL;
|
||||
bool banned = false;
|
||||
if ( ! g_errno )
|
||||
banned = ipWasBanned ( ts , &banMsg );
|
||||
//bool banned = false;
|
||||
|
||||
if ( g_errno )
|
||||
log("msg13: got error from proxy: %s",mstrerror(g_errno));
|
||||
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("msg13: got proxy reply for %s",r->ptr_url);
|
||||
|
||||
//if ( ! g_errno )
|
||||
bool banned = ipWasBanned ( ts , &banMsg , r );
|
||||
|
||||
|
||||
// inc this every time we try
|
||||
r->m_proxyTries++;
|
||||
|
||||
@ -1183,13 +1274,15 @@ void gotHttpReply9 ( void *state , TcpSocket *ts ) {
|
||||
if ( r->m_hasMoreProxiesToTry ) msg = "Trying another proxy.";
|
||||
char tmpIp[64];
|
||||
sprintf(tmpIp,"%s",iptoa(r->m_urlIp));
|
||||
log("msg13: detected that proxy %s is banned (tries=%"INT32") by "
|
||||
"url %s %s [%s]. %s"
|
||||
log("msg13: detected that proxy %s is banned "
|
||||
"(banmsg=%s) "
|
||||
"(tries=%"INT32") by "
|
||||
"url %s %s. %s"
|
||||
, iptoa(r->m_proxyIp) // r->m_banProxyIp
|
||||
, banMsg
|
||||
, r->m_proxyTries
|
||||
, tmpIp
|
||||
, r->ptr_url
|
||||
, banMsg
|
||||
, msg );
|
||||
}
|
||||
|
||||
@ -1252,7 +1345,8 @@ void gotHttpReply9 ( void *state , TcpSocket *ts ) {
|
||||
// sanity check
|
||||
//if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
||||
// report it
|
||||
if ( g_errno ) log("spider: msg54 request: %s",mstrerror(g_errno));
|
||||
if ( g_errno ) log("spider: msg54 request2: %s %s",
|
||||
mstrerror(g_errno),r->ptr_url);
|
||||
// it failed i guess proceed
|
||||
gotHttpReply( state , ts );
|
||||
}
|
||||
@ -1393,11 +1487,77 @@ void gotHttpReply2 ( void *state ,
|
||||
Msg13Request *r = (Msg13Request *) state;
|
||||
UdpSlot *slot = r->m_udpSlot;
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( r->m_collnum );
|
||||
|
||||
// ' connection reset' debug stuff
|
||||
// log("spider: httpreplysize=%i",(int)replySize);
|
||||
// if ( replySize == 0 )
|
||||
// log("hey");
|
||||
|
||||
// error?
|
||||
if ( g_errno && g_conf.m_logDebugSpider )
|
||||
if ( g_errno && ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 ) )
|
||||
log("spider: http reply (msg13) had error = %s "
|
||||
"for %s at ip %s",
|
||||
mstrerror(g_errno),r->ptr_url,iptoa(r->m_urlIp));
|
||||
mstrerror(savedErr),r->ptr_url,iptoa(r->m_urlIp));
|
||||
|
||||
bool inTable = false;
|
||||
bool checkIfBanned = false;
|
||||
if ( cr && cr->m_automaticallyBackOff ) checkIfBanned = true;
|
||||
if ( cr && cr->m_automaticallyUseProxies ) checkIfBanned = true;
|
||||
// must have a collrec to hold the ips
|
||||
if ( checkIfBanned && cr && r->m_urlIp != 0 && r->m_urlIp != -1 )
|
||||
inTable = isIpInTwitchyTable ( cr , r->m_urlIp );
|
||||
|
||||
// check if our ip seems banned. if g_errno was ECONNRESET that
|
||||
// is an indicator it was throttled/banned.
|
||||
const char *banMsg = NULL;
|
||||
bool banned = false;
|
||||
if ( checkIfBanned )
|
||||
banned = ipWasBanned ( ts , &banMsg , r );
|
||||
if ( banned )
|
||||
// should we turn proxies on for this IP address only?
|
||||
log("msg13: url %s detected as banned (%s), "
|
||||
"for ip %s"
|
||||
, r->ptr_url
|
||||
, banMsg
|
||||
, iptoa(r->m_urlIp)
|
||||
);
|
||||
|
||||
// . add to the table if not in there yet
|
||||
// . store in our table of ips we should use proxies for
|
||||
// . also start off with a crawldelay of like 1 sec for this
|
||||
// which is not normal for using proxies.
|
||||
if ( banned && ! inTable )
|
||||
addIpToTwitchyTable ( cr , r->m_urlIp );
|
||||
|
||||
// did we detect it as banned?
|
||||
if ( banned &&
|
||||
// retry iff we haven't already, but if we did stop the inf loop
|
||||
! r->m_wasInTableBeforeStarting &&
|
||||
cr &&
|
||||
( cr->m_automaticallyBackOff || cr->m_automaticallyUseProxies ) &&
|
||||
// but this is not for proxies... only native crawlbot backoff
|
||||
! r->m_proxyIp ) {
|
||||
// note this as well
|
||||
log("msg13: retrying spidered page with new logic for %s",
|
||||
r->ptr_url);
|
||||
// reset this so we don't endless loop it
|
||||
r->m_wasInTableBeforeStarting = true;
|
||||
// reset error
|
||||
g_errno = 0;
|
||||
/// and retry. it should use the proxy... or at least
|
||||
// use a crawldelay of 3 seconds since we added it to the
|
||||
// twitchy table.
|
||||
downloadTheDocForReals2 ( r );
|
||||
// that's it. if it had an error it will send back a reply.
|
||||
return;
|
||||
}
|
||||
|
||||
// do not print this if we are already using proxies, it is for
|
||||
// the auto crawldelay backoff logic only
|
||||
if ( banned && r->m_wasInTableBeforeStarting && ! r->m_proxyIp )
|
||||
log("msg13: can not retry banned download of %s "
|
||||
"because we knew ip was banned at start",r->ptr_url);
|
||||
|
||||
// get time now
|
||||
int64_t nowms = gettimeofdayInMilliseconds();
|
||||
@ -1421,9 +1581,10 @@ void gotHttpReply2 ( void *state ,
|
||||
timeToAdd,iptoa(r->m_firstIp),r->ptr_url);
|
||||
|
||||
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log(LOG_DEBUG,"spider: got http reply for firstip=%s url=%s",
|
||||
iptoa(r->m_firstIp),r->ptr_url);
|
||||
if ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 )
|
||||
log(LOG_DEBUG,"spider: got http reply for firstip=%s url=%s "
|
||||
"err=%s",
|
||||
iptoa(r->m_firstIp),r->ptr_url,mstrerror(savedErr));
|
||||
|
||||
|
||||
// sanity. this was happening from iframe download
|
||||
@ -1449,8 +1610,10 @@ void gotHttpReply2 ( void *state ,
|
||||
savedErr , r );
|
||||
|
||||
// note it
|
||||
if ( r->m_useTestCache && g_conf.m_logDebugSpider )
|
||||
logf(LOG_DEBUG,"spider: got reply for %s firstIp=%s uh48=%"UINT64"",
|
||||
if ( r->m_useTestCache &&
|
||||
( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 ) )
|
||||
logf(LOG_DEBUG,"spider: got reply for %s "
|
||||
"firstIp=%s uh48=%"UINT64"",
|
||||
r->ptr_url,iptoa(r->m_firstIp),r->m_urlHash48);
|
||||
|
||||
int32_t niceness = r->m_niceness;
|
||||
@ -1677,8 +1840,13 @@ void gotHttpReply2 ( void *state ,
|
||||
// . returns false if blocks
|
||||
// . returns true if did not block, sets g_errno on error
|
||||
// . if it blocked it will recall THIS function
|
||||
if ( ! getIframeExpandedContent ( r , ts ) )
|
||||
if ( ! getIframeExpandedContent ( r , ts ) ) {
|
||||
if ( g_conf.m_logDebugMsg13 ||
|
||||
g_conf.m_logDebugSpider )
|
||||
log("msg13: iframe expansion blocked %s",
|
||||
r->ptr_url);
|
||||
return;
|
||||
}
|
||||
// ok, did we have an error?
|
||||
if ( g_errno )
|
||||
log("scproxy: xml set for %s had error: %s",
|
||||
@ -1832,6 +2000,7 @@ void gotHttpReply2 ( void *state ,
|
||||
char *compressedBuf = (char*)mmalloc(need, "Msg13Zip");
|
||||
if ( ! compressedBuf ) {
|
||||
g_errno = ENOMEM;
|
||||
log("msg13: compression failed1 %s",r->ptr_url);
|
||||
g_udpServer.sendErrorReply(slot,g_errno);
|
||||
return;
|
||||
}
|
||||
@ -1848,10 +2017,11 @@ void gotHttpReply2 ( void *state ,
|
||||
replySize);
|
||||
if(zipErr != Z_OK) {
|
||||
log("spider: had error zipping Msg13 reply. %s "
|
||||
"(%"INT32")",
|
||||
zError(zipErr),(int32_t)zipErr);
|
||||
"(%"INT32") url=%s",
|
||||
zError(zipErr),(int32_t)zipErr,r->ptr_url);
|
||||
mfree (compressedBuf, need, "Msg13ZipError");
|
||||
g_errno = ECORRUPTDATA;
|
||||
log("msg13: compression failed2 %s",r->ptr_url);
|
||||
g_udpServer.sendErrorReply(slot,g_errno);
|
||||
return;
|
||||
}
|
||||
@ -1940,7 +2110,10 @@ void gotHttpReply2 ( void *state ,
|
||||
err != EINLINESECTIONS &&
|
||||
// connection reset by peer
|
||||
err != ECONNRESET ) {
|
||||
char*xx=NULL;*xx=0;}
|
||||
log("http: bad error from httpserver get doc: %s",
|
||||
mstrerror(err));
|
||||
char*xx=NULL;*xx=0;
|
||||
}
|
||||
}
|
||||
// replicate the reply. might return NULL and set g_errno
|
||||
char *copy = reply;
|
||||
@ -1969,7 +2142,8 @@ void gotHttpReply2 ( void *state ,
|
||||
s_rt.removeSlot ( tableSlot );
|
||||
// send back error? maybe...
|
||||
if ( err ) {
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
if ( g_conf.m_logDebugSpider ||
|
||||
g_conf.m_logDebugMsg13 )
|
||||
log("proxy: msg13: sending back error: %s "
|
||||
"for url %s with ip %s",
|
||||
mstrerror(err),
|
||||
@ -1978,6 +2152,9 @@ void gotHttpReply2 ( void *state ,
|
||||
g_udpServer.sendErrorReply ( slot , err );
|
||||
continue;
|
||||
}
|
||||
// for debug for now
|
||||
if ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 )
|
||||
log("msg13: sending reply for %s",r->ptr_url);
|
||||
// send reply
|
||||
us->sendReply_ass ( copy,replySize,copy,copyAllocSize, slot );
|
||||
// now final udp slot will free the reply, so tcp server
|
||||
@ -1998,6 +2175,9 @@ void gotHttpReply2 ( void *state ,
|
||||
// we free it - if it was never sent over a udp slot
|
||||
if ( savedErr && compressed )
|
||||
mfree ( reply , replyAllocSize , "msg13ubuf" );
|
||||
|
||||
if ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 )
|
||||
log("msg13: handled reply ok %s",r->ptr_url);
|
||||
}
|
||||
|
||||
|
||||
@ -2847,6 +3027,10 @@ void gotIframeExpandedContent ( void *state ) {
|
||||
|
||||
#define DELAYPERBAN 500
|
||||
|
||||
// how many milliseconds should spiders use for a crawldelay if
|
||||
// ban was detected and no proxies are being used.
|
||||
#define AUTOCRAWLDELAY 5000
|
||||
|
||||
// returns true if we queue the request to download later
|
||||
bool addToHammerQueue ( Msg13Request *r ) {
|
||||
|
||||
@ -2867,10 +3051,36 @@ bool addToHammerQueue ( Msg13Request *r ) {
|
||||
|
||||
int32_t crawlDelayMS = r->m_crawlDelayMS;
|
||||
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log(LOG_DEBUG,"spider: got timestamp of %"INT64" from "
|
||||
"hammercache (waited=%"INT64") for %s",last,waited,
|
||||
iptoa(r->m_firstIp));
|
||||
CollectionRec *cr = g_collectiondb.getRec ( r->m_collnum );
|
||||
|
||||
bool canUseProxies = false;
|
||||
if ( cr && cr->m_automaticallyUseProxies ) canUseProxies = true;
|
||||
if ( r->m_forceUseFloaters ) canUseProxies = true;
|
||||
//if ( g_conf.m_useProxyIps ) canUseProxies = true;
|
||||
//if ( g_conf.m_automaticallyUseProxyIps ) canUseProxies = true;
|
||||
|
||||
// if no proxies listed, then it is pointless
|
||||
if ( ! g_conf.m_proxyIps.hasDigits() ) canUseProxies = false;
|
||||
|
||||
// if not using proxies, but the ip is banning us, then at least
|
||||
// backoff a bit
|
||||
if ( cr &&
|
||||
r->m_urlIp != 0 &&
|
||||
r->m_urlIp != -1 &&
|
||||
cr->m_automaticallyBackOff &&
|
||||
// and it is in the twitchy table
|
||||
isIpInTwitchyTable ( cr , r->m_urlIp ) ) {
|
||||
// and no proxies are available to use
|
||||
//! canUseProxies ) {
|
||||
// then just back off with a crawldelay of 3 seconds
|
||||
if ( ! canUseProxies && crawlDelayMS < AUTOCRAWLDELAY )
|
||||
crawlDelayMS = AUTOCRAWLDELAY;
|
||||
// mark this so we do not retry pointlessly
|
||||
r->m_wasInTableBeforeStarting = true;
|
||||
// and obey crawl delay
|
||||
r->m_skipHammerCheck = false;
|
||||
}
|
||||
|
||||
|
||||
// . if we got a proxybackoff base it on # of banned proxies for urlIp
|
||||
// . try to be more sensitive for more sensitive website policies
|
||||
@ -2884,6 +3094,18 @@ bool addToHammerQueue ( Msg13Request *r ) {
|
||||
crawlDelayMS = MAX_PROXYCRAWLDELAYMS;
|
||||
}
|
||||
|
||||
// set the crawldelay we actually used when downloading this
|
||||
//r->m_usedCrawlDelay = crawlDelayMS;
|
||||
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log(LOG_DEBUG,"spider: got timestamp of %"INT64" from "
|
||||
"hammercache (waited=%"INT64" crawlDelayMS=%"INT32") "
|
||||
"for %s"
|
||||
,last
|
||||
,waited
|
||||
,crawlDelayMS
|
||||
,iptoa(r->m_firstIp));
|
||||
|
||||
bool queueIt = false;
|
||||
if ( last > 0 && waited < crawlDelayMS ) queueIt = true;
|
||||
// a "last" of 0 means currently downloading
|
||||
@ -2901,11 +3123,15 @@ bool addToHammerQueue ( Msg13Request *r ) {
|
||||
if ( queueIt ) {
|
||||
// debug
|
||||
log(LOG_INFO,
|
||||
"spider: adding %s to crawldelayqueue cd=%"INT32"ms",
|
||||
r->ptr_url,crawlDelayMS);
|
||||
"spider: adding %s to crawldelayqueue cd=%"INT32"ms "
|
||||
"ip=%s",
|
||||
r->ptr_url,crawlDelayMS,iptoa(r->m_urlIp));
|
||||
// save this
|
||||
//r->m_udpSlot = slot; // this is already saved!
|
||||
r->m_nextLink = NULL;
|
||||
// we gotta update the crawldelay here in case we modified
|
||||
// it in the above logic.
|
||||
r->m_crawlDelayMS = crawlDelayMS;
|
||||
// add it to queue
|
||||
if ( ! s_hammerQueueHead ) {
|
||||
s_hammerQueueHead = r;
|
||||
@ -2918,7 +3144,6 @@ bool addToHammerQueue ( Msg13Request *r ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// if we had it in cache check the wait time
|
||||
if ( last > 0 && waited < crawlDelayMS ) {
|
||||
log("spider: hammering firstIp=%s url=%s "
|
||||
|
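addToHammerQueue() is where the automatic back-off takes effect: when the collection has back-off enabled, the target IP is in the twitchy table, and no proxies can be used, the crawl delay is raised to AUTOCRAWLDELAY (5000 ms) before deciding whether to hold the request in the crawl-delay queue. A minimal sketch of that delay computation, with illustrative names:

// Compute the crawl delay to enforce and whether to queue the request.
#include <cstdint>

const int32_t kAutoCrawlDelayMS = 5000;   // AUTOCRAWLDELAY in the patch

int32_t effectiveCrawlDelay(int32_t requestedDelayMS, bool ipIsTwitchy,
                            bool autoBackOff, bool canUseProxies,
                            int64_t msSinceLastFetch, bool *queueIt) {
    int32_t delay = requestedDelayMS < 0 ? 0 : requestedDelayMS;
    // back off on known-throttling IPs when we cannot fall back to proxies
    if (autoBackOff && ipIsTwitchy && !canUseProxies && delay < kAutoCrawlDelayMS)
        delay = kAutoCrawlDelayMS;
    // hold the request if the last hit on this IP was inside the delay window
    *queueIt = (msSinceLastFetch >= 0 && msSinceLastFetch < delay);
    return delay;
}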
6
Msg13.h
6
Msg13.h
@ -36,6 +36,8 @@ public:
|
||||
char m_opCode;
|
||||
char m_lastHack;
|
||||
|
||||
collnum_t m_collnum;
|
||||
|
||||
// not part of the proxy request, but set from ProxyReply:
|
||||
int32_t m_numBannedProxies;
|
||||
// . if using proxies, how many proxies have we tried to download
|
||||
@ -108,6 +110,9 @@ public:
|
||||
int32_t m_foundInCache:1;
|
||||
int32_t m_forceUseFloaters:1;
|
||||
|
||||
int32_t m_wasInTableBeforeStarting:1;
|
||||
int32_t m_isRootSeedUrl:1;
|
||||
|
||||
//int32_t m_testParserEnabled:1;
|
||||
//int32_t m_testSpiderEnabled:1;
|
||||
//int32_t m_isPageParser:1;
|
||||
@ -153,6 +158,7 @@ public:
|
||||
m_maxTextDocLen = -1; // no limit
|
||||
m_maxOtherDocLen = -1; // no limit
|
||||
m_crawlDelayMS = -1; // unknown or none
|
||||
m_collnum = (collnum_t)-1;
|
||||
};
|
||||
};
|
||||
|
||||
|
23
Msg2.cpp
23
Msg2.cpp
@ -194,11 +194,11 @@ bool Msg2::getLists ( ) {
|
||||
int32_t minRecSize = m_minRecSizes[m_i];
|
||||
|
||||
// sanity check
|
||||
if ( ( minRecSize > ( 500 * 1024 * 1024 ) ||
|
||||
minRecSize < 0) ){
|
||||
log( "minRecSize = %"INT32"", minRecSize );
|
||||
char *xx=NULL; *xx=0;
|
||||
}
|
||||
// if ( ( minRecSize > ( 500 * 1024 * 1024 ) ||
|
||||
// minRecSize < 0) ){
|
||||
// log( "minRecSize = %"INT32"", minRecSize );
|
||||
// char *xx=NULL; *xx=0;
|
||||
// }
|
||||
|
||||
//bool forceLocalIndexdb = true;
|
||||
// if it is a no-split term, we may gotta get it over the net
|
||||
@ -407,7 +407,13 @@ bool Msg2::getLists ( ) {
|
||||
|
||||
// like 90MB last time i checked. so it won't read more
|
||||
// than that...
|
||||
int32_t minRecSizes = DEFAULT_POSDB_READSIZE;
|
||||
// MDW: no, it's better to print oom then not give all the
|
||||
// results leaving users scratching their heads. besides,
|
||||
// we should do docid range splitting before we go out of
|
||||
// mem. we should also report the size of each termlist
|
||||
// in bytes in the query info header.
|
||||
//int32_t minRecSizes = DEFAULT_POSDB_READSIZE;
|
||||
int32_t minRecSizes = -1;
|
||||
|
||||
// start up the read. thread will wait in thread queue to
|
||||
// launch if too many threads are out.
|
||||
@ -596,12 +602,13 @@ bool Msg2::gotList ( RdbList *list ) {
|
||||
for ( int32_t i = 0 ; i < m_numLists ; i++ ) {
|
||||
if ( m_lists[i].m_listSize < m_minRecSizes[i] ) continue;
|
||||
if ( m_minRecSizes[i] == 0 ) continue;
|
||||
if ( m_minRecSizes[i] == -1 ) continue;
|
||||
// do not print this if compiling section xpathsitehash stats
|
||||
// because we only need like 10k of list to get a decent
|
||||
// reading
|
||||
if ( m_req->m_forSectionStats ) break;
|
||||
log("msg2: read termlist #%"INT32" size=%"INT32" maxSize=%"INT32". losing "
|
||||
"docIds!",
|
||||
log("msg2: read termlist #%"INT32" size=%"INT32" "
|
||||
"maxSize=%"INT32". losing docIds!",
|
||||
i,m_lists[i].m_listSize,m_minRecSizes[i]);
|
||||
}
|
||||
|
||||
|
@ -377,6 +377,9 @@ bool Msg3a::gotCacheReply ( ) {
|
||||
// 'time enough for love' query was hitting 30MB termlists.
|
||||
//rs = 50000000;
|
||||
rs = DEFAULT_POSDB_READSIZE;//90000000; // 90MB!
|
||||
// it is better to go oom then leave users scratching their
|
||||
// heads as to why some results are not being returned.
|
||||
rs = -1;
|
||||
// if section stats, limit to 1MB
|
||||
//if ( m_r->m_getSectionStats ) rs = 1000000;
|
||||
// get the jth query term
|
||||
|
93
Msg40.cpp
93
Msg40.cpp
@ -6352,6 +6352,41 @@ bool Msg40::printFacetTables ( SafeBuf *sb ) {
|
||||
|
||||
int32_t saved = sb->length();
|
||||
|
||||
// If json, print beginning of json array
if ( format == FORMAT_JSON ) {
if ( m_si->m_streamResults ) {
// if we are streaming results in json, we may have hacked off
// the last ,\n so we need a comma to put it back
bool needComma = true;

// check if the last non-whitespace char in the
// buffer is a comma
for (int32_t i= sb->m_length-1; i >= 0; i--) {
char c = sb->getBufStart()[i];
if (c == '\n' || c == ' ') {
// ignore whitespace chars
continue;
}

// If the loop reaches this point, we have a
// non-whitespace char, so we break the loop
// either way
if (c == ',') {
// last non-whitespace char is a comma,
// so we don't need to add an extra one
needComma = false;
}
break;
}

if ( needComma ) {
sb->safeStrcpy(",\n\n");
}
}
sb->safePrintf("\"facets\":[");
}
int numTablesPrinted = 0;
|
||||
for ( int32_t i = 0 ; i < m_si->m_q.getNumTerms() ; i++ ) {
|
||||
// only for html for now i guess
|
||||
//if ( m_si->m_format != FORMAT_HTML ) break;
|
||||
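The printFacetTables() change above has to append a "facets":[...] array to a JSON buffer that may or may not already end in a comma when results are being streamed, and it rolls the buffer back entirely if no facet table gets printed. The same bookkeeping as a small sketch, with std::string standing in for SafeBuf:

// Append a "facets" array to a JSON buffer, fixing up commas as needed.
#include <string>

void appendFacetsArray(std::string &out, const std::string &facetBody) {
    size_t saved = out.size();
    // does the buffer already end (ignoring whitespace) in a comma?
    size_t i = out.find_last_not_of(" \n");
    bool needComma = !(i != std::string::npos && out[i] == ',');
    if (needComma && !out.empty()) out += ",\n\n";
    out += "\"facets\":[";
    if (facetBody.empty()) { out.resize(saved); return; }   // nothing printed
    out += facetBody;            // entries, each ending in ",\n"
    out.resize(out.size() - 2);  // hack off the trailing ",\n"
    out += "],\n";
}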
@ -6363,10 +6398,25 @@ bool Msg40::printFacetTables ( SafeBuf *sb ) {
|
||||
continue;
|
||||
|
||||
// if had facet ranges, print them out
|
||||
printFacetsForTable ( sb , qt );;
|
||||
|
||||
if ( printFacetsForTable ( sb , qt ) > 0 )
|
||||
numTablesPrinted++;
|
||||
}
|
||||
|
||||
// If json, print end of json array
|
||||
if ( format == FORMAT_JSON ) {
|
||||
if ( numTablesPrinted > 0 ) {
|
||||
sb->m_length -= 2; // hack off trailing comma
|
||||
sb->safePrintf("],\n"); // close off json array
|
||||
}
|
||||
// if no facets then do not print "facets":[]\n,
|
||||
else {
|
||||
// revert string buf to original length
|
||||
sb->m_length = saved;
|
||||
// and cap the string buf just in case
|
||||
sb->nullTerm();
|
||||
}
|
||||
}
|
||||
|
||||
// if json, remove ending ,\n and make it just \n
|
||||
if ( format == FORMAT_JSON && sb->length() != saved ) {
|
||||
// remove ,\n
|
||||
@ -6387,7 +6437,7 @@ bool Msg40::printFacetTables ( SafeBuf *sb ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
|
||||
int32_t Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
|
||||
|
||||
//QueryWord *qw = qt->m_qword;
|
||||
//if ( qw->m_numFacetRanges > 0 )
|
||||
@ -6397,9 +6447,14 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
|
||||
int32_t *ptrs = (int32_t *)qt->m_facetIndexBuf.getBufStart();
|
||||
int32_t numPtrs = qt->m_facetIndexBuf.length() / sizeof(int32_t);
|
||||
|
||||
if ( numPtrs == 0 )
|
||||
return 0;
|
||||
|
||||
int32_t numPrinted = 0;
|
||||
|
||||
// now scan the slots and print out
|
||||
HttpRequest *hr = &m_si->m_hr;
|
||||
bool firstTime = true;
|
||||
|
||||
bool isString = false;
|
||||
bool isFloat = false;
|
||||
bool isInt = false;
|
||||
@ -6409,6 +6464,7 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
|
||||
char format = m_si->m_format;
|
||||
// a new table for each facet query term
|
||||
bool needTable = true;
|
||||
|
||||
// print out the dumps
|
||||
for ( int32_t x= 0 ; x < numPtrs ; x++ ) {
|
||||
// skip empty slots
|
||||
@ -6516,7 +6572,9 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
|
||||
text = m_facetTextBuf.getBufStart() + *offset;
|
||||
}
|
||||
|
||||
|
||||
if ( format == FORMAT_XML ) {
|
||||
numPrinted++;
|
||||
sb->safePrintf("\t<facet>\n"
|
||||
"\t\t<field>%s</field>\n"
|
||||
, term );
|
||||
@ -6573,17 +6631,6 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( format == FORMAT_JSON && firstTime ) {
|
||||
firstTime = false;
|
||||
// if streaming results we may have hacked off
|
||||
// the last ,\n so put it back
|
||||
if ( m_si->m_streamResults ) {
|
||||
//sb->m_length -= 1;
|
||||
sb->safeStrcpy(",\n\n");
|
||||
}
|
||||
//sb->safePrintf("\"facets\":[\n");
|
||||
}
|
||||
|
||||
// print that out
|
||||
if ( needTable && format == FORMAT_HTML ) {
|
||||
needTable = false;
|
||||
@ -6619,13 +6666,8 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
|
||||
}
|
||||
|
||||
|
||||
if ( needTable && format == FORMAT_JSON ) {
|
||||
needTable = false;
|
||||
sb->safePrintf("\"facets\":[");
|
||||
}
|
||||
|
||||
|
||||
if ( format == FORMAT_JSON ) {
|
||||
numPrinted++;
|
||||
sb->safePrintf("{\n"
|
||||
"\t\"field\":\"%s\",\n"
|
||||
, term
|
||||
@ -6779,6 +6821,8 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
|
||||
SafeBuf newUrl;
|
||||
replaceParm ( newStuff.getBufStart(), &newUrl , hr );
|
||||
|
||||
numPrinted++;
|
||||
|
||||
// print the facet in its numeric form
|
||||
// we will have to lookup based on its docid
|
||||
// and get it from the cached page later
|
||||
@ -6799,13 +6843,8 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
|
||||
,count); // count for printing
|
||||
}
|
||||
|
||||
if ( ! needTable && format == FORMAT_JSON ) {
|
||||
sb->m_length -= 2; // hack off trailing comma
|
||||
sb->safePrintf("],\n"); // close off json array
|
||||
}
|
||||
|
||||
if ( ! needTable && format == FORMAT_HTML )
|
||||
sb->safePrintf("</table></div><br>\n");
|
||||
|
||||
return true;
|
||||
return numPrinted;
|
||||
}
|
||||
|
2
Msg40.h
2
Msg40.h
@ -227,7 +227,7 @@ class Msg40 {
|
||||
int32_t m_omitCount;
|
||||
|
||||
bool printFacetTables ( class SafeBuf *sb ) ;
|
||||
bool printFacetsForTable ( SafeBuf *sb , QueryTerm *qt );
|
||||
int32_t printFacetsForTable ( SafeBuf *sb , QueryTerm *qt );
|
||||
bool lookupFacets ( ) ;
|
||||
void lookupFacets2 ( ) ;
|
||||
void gotFacetText ( class Msg20 *msg20 ) ;
|
||||
|
7
Msg5.cpp
@ -182,9 +182,10 @@ bool Msg5::getList ( char rdbId ,
// log("Msg5::readList: startKey > endKey warning");
// we no longer allow negative minRecSizes
if ( minRecSizes < 0 ) {
log(LOG_LOGIC,"net: msg5: MinRecSizes < 0, using 1.");
minRecSizes = 1;
char *xx = NULL; *xx = 0;
if ( g_conf.m_logDebugDb )
log(LOG_LOGIC,"net: msg5: MinRecSizes < 0, using 2GB.");
minRecSizes = 0x7fffffff;
//char *xx = NULL; *xx = 0;
}
// ensure startKey last bit clear, endKey last bit set
//if ( (startKey.n0 & 0x01) == 0x01 )
@ -242,7 +242,7 @@ bool sendReply ( void *state , bool addUrlEnabled ) {
|
||||
mbuf.safePrintf("<center><font color=red>");
|
||||
mbuf.safePrintf("<b><u>");
|
||||
mbuf.safeTruncateEllipsis(gr->m_urlsBuf,200);
|
||||
mbuf.safePrintf("</u></b> added to spider "
|
||||
mbuf.safePrintf("</u></b></font> added to spider "
|
||||
"queue "
|
||||
"successfully<br><br>");
|
||||
mbuf.safePrintf("</font></center>");
|
||||
|
794
PageInject.cpp
794
PageInject.cpp
File diff suppressed because one or more lines are too long
109
PageInject.h
109
PageInject.h
@ -1,6 +1,8 @@
|
||||
#ifndef GBINJECT_H
|
||||
#define GBINJECT_H
|
||||
|
||||
void handleRequest7Import ( class UdpSlot *slot , int32_t netnice ) ;
|
||||
|
||||
void handleRequest7 ( class UdpSlot *slot , int32_t netnice ) ;
|
||||
|
||||
bool sendPageInject ( class TcpSocket *s, class HttpRequest *hr );
|
||||
@ -12,27 +14,88 @@ void saveImportStates ( ) ;
|
||||
|
||||
#include "XmlDoc.h"
|
||||
#include "Users.h"
|
||||
#include "Parms.h" // GigablastRequest
|
||||
#include "Parms.h"
|
||||
|
||||
|
||||
void setInjectionRequestFromParms ( class TcpSocket *sock ,
|
||||
class HttpRequest *hr ,
|
||||
class CollectionRec *cr ,
|
||||
class InjectionRequest *ir ) ;
|
||||
|
||||
class InjectionRequest {
|
||||
public:
|
||||
|
||||
int32_t m_injectDocIp;
|
||||
char m_injectLinks;
|
||||
char m_spiderLinks;
|
||||
char m_shortReply;
|
||||
char m_newOnly;
|
||||
char m_deleteUrl;
|
||||
char m_recycle;
|
||||
char m_dedup;
|
||||
char m_hasMime;
|
||||
char m_doConsistencyTesting;
|
||||
char m_getSections;
|
||||
char m_gotSections;
|
||||
int32_t m_charset;
|
||||
int32_t m_hopCount;
|
||||
collnum_t m_collnum; // more reliable than m_coll
|
||||
uint32_t m_firstIndexed;
|
||||
uint32_t m_lastSpidered;
|
||||
|
||||
char *ptr_url;
|
||||
char *ptr_queryToScrape;
|
||||
char *ptr_contentDelim;
|
||||
char *ptr_contentFile;
|
||||
char *ptr_contentTypeStr;
|
||||
char *ptr_content;
|
||||
char *ptr_diffbotReply; // secret thing from dan
|
||||
|
||||
int32_t size_url;
|
||||
int32_t size_queryToScrape;
|
||||
int32_t size_contentDelim;
|
||||
int32_t size_contentFile;
|
||||
int32_t size_contentTypeStr;
|
||||
int32_t size_content;
|
||||
int32_t size_diffbotReply; // secret thing from dan
|
||||
|
||||
// serialized space for the ptr_* strings above
|
||||
char m_buf[0];
|
||||
};
|
||||
|
||||
|
||||
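InjectionRequest is laid out for serialization: fixed-width fields first, one ptr_*/size_* pair per string, and a zero-length m_buf[] where the strings are packed for transmission so the receiver can re-point the pointers into the tail. A hypothetical packer illustrating that convention on a cut-down struct; this is not code from the tree, and the real request is sent via sendInjectionRequestToHost():

// Flatten a request with ptr_/size_ string fields into one wire buffer.
#include <cstring>
#include <cstdlib>
#include <cstdint>

struct MiniRequest {
    int32_t m_hopCount;        // example fixed field
    char   *ptr_url;           // string fields: pointer...
    char   *ptr_content;
    int32_t size_url;          // ...and serialized length including the NUL
    int32_t size_content;
    char    m_buf[0];          // strings are packed here on the wire
};

// Returns a malloc'd flat copy; caller frees. *outLen is the wire size.
MiniRequest *packRequest(const MiniRequest *in, int32_t *outLen) {
    int32_t su = in->ptr_url     ? (int32_t)strlen(in->ptr_url)     + 1 : 0;
    int32_t sc = in->ptr_content ? (int32_t)strlen(in->ptr_content) + 1 : 0;
    *outLen = (int32_t)sizeof(MiniRequest) + su + sc;
    MiniRequest *out = (MiniRequest *)malloc(*outLen);
    memcpy(out, in, sizeof(MiniRequest));
    char *p = out->m_buf;
    if (su) { memcpy(p, in->ptr_url, su);     p += su; }
    if (sc) { memcpy(p, in->ptr_content, sc); p += sc; }
    out->size_url = su; out->size_content = sc;
    out->ptr_url = out->ptr_content = NULL;   // receiver re-points into m_buf
    return out;
}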
class Msg7 {
|
||||
|
||||
public:
|
||||
|
||||
GigablastRequest m_gr;
|
||||
SafeBuf m_injectUrlBuf;
|
||||
bool m_firstTime;
|
||||
char *m_start;
|
||||
bool m_fixMe;
|
||||
int32_t m_injectCount;
|
||||
//GigablastRequest m_gr;
|
||||
InjectionRequest m_injectionRequest;
|
||||
|
||||
int32_t m_replyIndexCode;
|
||||
int64_t m_replyDocId;
|
||||
|
||||
//SafeBuf m_injectUrlBuf;
|
||||
//bool m_firstTime;
|
||||
//char *m_start;
|
||||
//bool m_fixMe;
|
||||
//char m_saved;
|
||||
//int32_t m_injectCount;
|
||||
//bool m_isDoneInjecting;
|
||||
|
||||
char *m_sir;
|
||||
int32_t m_sirSize;
|
||||
|
||||
bool m_needsSet;
|
||||
XmlDoc m_xd;
|
||||
XmlDoc *m_xd;
|
||||
TcpSocket *m_socket;
|
||||
SafeBuf m_sb;
|
||||
char m_round;
|
||||
char m_useAhrefs;
|
||||
HashTableX m_linkDedupTable;
|
||||
|
||||
// referenced by InjectionRequest::ptr_content
|
||||
SafeBuf m_contentBuf;
|
||||
|
||||
SafeBuf m_sbuf; // for holding entire titlerec for importing
|
||||
|
||||
void *m_state;
|
||||
@ -49,27 +112,39 @@ public:
|
||||
Msg7 ();
|
||||
~Msg7 ();
|
||||
bool m_inUse;
|
||||
int32_t m_format;
|
||||
HttpRequest m_hr;
|
||||
|
||||
class XmlDoc *m_stashxd;
|
||||
|
||||
void reset();
|
||||
|
||||
bool scrapeQuery ( );
|
||||
|
||||
bool inject ( char *coll,
|
||||
char *proxiedUrl,
|
||||
int32_t proxiedUrlLen,
|
||||
char *content,
|
||||
void *state ,
|
||||
void (*callback)(void *state) );
|
||||
void gotUdpReply ( class UdpSlot *slot ) ;
|
||||
|
||||
bool inject ( void *state ,
|
||||
void (*callback)(void *state) );
|
||||
bool sendInjectionRequestToHost ( InjectionRequest *ir ,
|
||||
void *state ,
|
||||
void (* callback)(void *) );
|
||||
|
||||
// msg7request m_req7 must be valid
|
||||
//bool inject ( char *coll,
|
||||
// char *proxiedUrl,
|
||||
// int32_t proxiedUrlLen,
|
||||
// char *content,
|
||||
// void *state ,
|
||||
// void (*callback)(void *state) );
|
||||
|
||||
// msg7request m_req7 must be valid
|
||||
// bool inject2 ( void *state , */
|
||||
// void (*callback)(void *state) ); */
|
||||
|
||||
|
||||
//bool injectTitleRec ( void *state ,
|
||||
// void (*callback)(void *state) ,
|
||||
// class CollectionRec *cr );
|
||||
|
||||
void gotMsg7Reply ();
|
||||
//void gotMsg7Reply ();
|
||||
|
||||
};
|
||||
|
||||
|
@ -438,6 +438,25 @@ bool Msg1c::gotList ( ) {
// use only 64k values so we don't stress doledb/waittrees/etc.
// for large #'s of docids
int32_t firstIp = (docId & 0x0000ffff);

// bits 6-13 of the docid are the domain hash so use those
// when doing a REINDEX (not delete!) to ensure that requests
// on the same domain go to the same shard, at least when
// we have up to 256 shards. if we have more than 256 shards
// at this point some shards will not participate in the
// query reindex/delete process because of this, so
// we'll want to allow more bits in that case perhaps.
// check out Hostdb::getShardNum(RDB_SPIDERDB) in Hostdb.cpp
// to see what shard is responsible for storing and indexing
// this SpiderRequest based on the firstIp.
if ( ! m_forceDel ) {
// if we are a REINDEX not a delete because
// deletes don't need to spider/redownload the doc
// so the distribution can be more random
firstIp >>= 6;
firstIp &= 0xff;
}

// 0 is not a legit val. it'll core below.
if ( firstIp == 0 ) firstIp = 1;
// use a fake ip
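The Msg1c::gotList() hunk above fakes a firstIp from the docid so reindex and delete requests shard sensibly: deletes take the low 16 bits, while reindexes keep only bits 6-13 (the domain-hash bits) so documents from the same domain are handled by the same shard, and zero is remapped to 1 because a zero firstIp is invalid. The same derivation as a standalone helper:

// Derive the fake firstIp used to route query reindex/delete requests.
#include <cstdint>

int32_t fakeFirstIpFromDocId(int64_t docId, bool isDelete) {
    int32_t firstIp = (int32_t)(docId & 0x0000ffff);
    if (!isDelete) {               // reindex: group by the domain hash bits
        firstIp >>= 6;
        firstIp &= 0xff;
    }
    if (firstIp == 0) firstIp = 1; // 0 is not a legit value
    return firstIp;
}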
46
PageRoot.cpp
46
PageRoot.cpp
@ -2246,11 +2246,11 @@ public:
|
||||
//SpiderRequest m_sreq;
|
||||
};
|
||||
|
||||
static void doneInjectingWrapper3 ( void *st1 ) ;
|
||||
|
||||
// only allow up to 1 Msg10's to be in progress at a time
|
||||
static bool s_inprogress = false;
|
||||
|
||||
void doneInjectingWrapper3 ( void *st ) ;
|
||||
|
||||
// . returns false if blocked, true otherwise
|
||||
// . sets g_errno on error
|
||||
bool sendPageAddUrl ( TcpSocket *sock , HttpRequest *hr ) {
|
||||
@ -2511,17 +2511,30 @@ bool sendPageAddUrl ( TcpSocket *sock , HttpRequest *hr ) {
|
||||
}
|
||||
*/
|
||||
|
||||
Msg7 *msg7 = &st1->m_msg7;
|
||||
// set this.
|
||||
InjectionRequest *ir = &msg7->m_injectionRequest;
|
||||
|
||||
// default to zero
|
||||
memset ( ir , 0 , sizeof(InjectionRequest) );
|
||||
|
||||
// set this. also sets gr->m_hr
|
||||
GigablastRequest *gr = &st1->m_msg7.m_gr;
|
||||
// this will fill in GigablastRequest so all the parms we need are set
|
||||
g_parms.setGigablastRequest ( sock , hr , gr );
|
||||
//setInjectionRequestFromParms ( sock , hr , cr , ir );
|
||||
|
||||
ir->ptr_url = hr->getString("u",NULL);
|
||||
if ( ! ir->ptr_url ) ir->ptr_url = hr->getString("url",NULL);
|
||||
|
||||
// get back a short reply so we can show the status code easily
|
||||
ir->m_shortReply = 1;
|
||||
|
||||
ir->m_spiderLinks = st1->m_spiderLinks;
|
||||
|
||||
// this is really an injection, not add url, so make
|
||||
// GigablastRequest::m_url point to Gigablast::m_urlsBuf because
|
||||
// the PAGE_ADDURLS2 parms in Parms.cpp fill in the m_urlsBuf.
|
||||
// HACK!
|
||||
gr->m_url = gr->m_urlsBuf;
|
||||
//gr->m_url = gr->m_urlsBuf;
|
||||
//ir->ptr_url = gr->m_urlsBuf;
|
||||
|
||||
//
|
||||
// inject using msg7
|
||||
@ -2529,10 +2542,8 @@ bool sendPageAddUrl ( TcpSocket *sock , HttpRequest *hr ) {
|
||||
|
||||
// . pass in the cleaned url
|
||||
// . returns false if blocked, true otherwise
|
||||
if ( ! st1->m_msg7.inject ( //s ,
|
||||
//r ,
|
||||
st1 ,
|
||||
doneInjectingWrapper3 ) )
|
||||
|
||||
if ( ! msg7->sendInjectionRequestToHost ( ir, st1 , doneInjectingWrapper3 ) )
|
||||
return false;
|
||||
|
||||
// some kinda error, g_errno should be set i guess
|
||||
@ -2551,10 +2562,14 @@ void doneInjectingWrapper3 ( void *st ) {
|
||||
// in order to see what sites are being added log it, then we can
|
||||
// more easily remove sites from sitesearch.gigablast.com that are
|
||||
// being added but not being searched
|
||||
char *url = st1->m_msg7.m_xd.m_firstUrl.m_url;
|
||||
//char *url = st1->m_msg7.m_xd.m_firstUrl.m_url;
|
||||
Msg7 *msg7 = &st1->m_msg7;
|
||||
InjectionRequest *ir = &msg7->m_injectionRequest;
|
||||
char *url = ir->ptr_url;
|
||||
log(LOG_INFO,"http: add url %s (%s)",url ,mstrerror(g_errno));
|
||||
// extract info from state
|
||||
TcpSocket *sock = st1->m_socket;
|
||||
|
||||
//bool isAdmin = st1->m_isMasterAdmin;
|
||||
//char *url = NULL;
|
||||
//if ( st1->m_urlLen ) url = st1->m_url;
|
||||
@ -2654,11 +2669,12 @@ void doneInjectingWrapper3 ( void *st ) {
|
||||
// " is enabled.");
|
||||
sb.safePrintf("%s",pm);
|
||||
}
|
||||
else if ( st1->m_msg7.m_xd.m_indexCodeValid &&
|
||||
st1->m_msg7.m_xd.m_indexCode ) {
|
||||
int32_t ic = st1->m_msg7.m_xd.m_indexCode;
|
||||
else if ( msg7->m_replyIndexCode ) {
|
||||
//st1->m_msg7.m_xd.m_indexCodeValid &&
|
||||
// st1->m_msg7.m_xd.m_indexCode ) {
|
||||
//int32_t ic = st1->m_msg7.m_xd.m_indexCode;
|
||||
sb.safePrintf("<b>Had error injecting url: %s</b>",
|
||||
mstrerror(ic));
|
||||
mstrerror(msg7->m_replyIndexCode));
|
||||
}
|
||||
/*
|
||||
if ( url && ! st1->m_ufu[0] && url[0] && printUrl ) {
|
||||
|
272
Parms.cpp
272
Parms.cpp
@ -32,7 +32,7 @@
|
||||
#include "Test.h"
|
||||
#include "Rebalance.h"
|
||||
#include "SpiderProxy.h" // buildProxyTable()
|
||||
#include "PageInject.h"
|
||||
#include "PageInject.h" // InjectionRequest
|
||||
|
||||
// width of input box in characters for url filter expression
|
||||
#define REGEX_TXT_MAX 80
|
||||
@ -1085,6 +1085,9 @@ bool Parms::setGigablastRequest ( TcpSocket *socket ,
|
||||
return false;
|
||||
}
|
||||
|
||||
// just in case
|
||||
memset ( gr , 0 , sizeof(GigablastRequest) );
|
||||
|
||||
gr->m_socket = socket;
|
||||
|
||||
// make a copy of the httprequest because the original is on the stack
|
||||
@ -1798,6 +1801,8 @@ bool Parms::printParms2 ( SafeBuf* sb ,
|
||||
GigablastRequest gr;
|
||||
g_parms.setToDefault ( (char *)&gr , OBJ_GBREQUEST , NULL);
|
||||
|
||||
InjectionRequest ir;
|
||||
g_parms.setToDefault ( (char *)&ir , OBJ_IR , NULL);
|
||||
|
||||
// Begin "parms":[]
|
||||
if (format == FORMAT_JSON ) {
|
||||
@ -1841,6 +1846,8 @@ bool Parms::printParms2 ( SafeBuf* sb ,
|
||||
}
|
||||
if ( m->m_obj == OBJ_GBREQUEST )
|
||||
THIS = (char *)&gr;
|
||||
if ( m->m_obj == OBJ_IR )
|
||||
THIS = (char *)&ir;
|
||||
// might have an array, do not exceed the array size
|
||||
int32_t jend = m->m_max;
|
||||
int32_t size = jend ;
|
||||
@ -2237,6 +2244,7 @@ bool Parms::printParm ( SafeBuf* sb,
|
||||
// test it
|
||||
if ( m->m_def &&
|
||||
m->m_obj != OBJ_NONE &&
|
||||
m->m_obj != OBJ_IR && // do not do for injectionrequest
|
||||
m->m_obj != OBJ_GBREQUEST && // do not do for GigablastRequest
|
||||
strcmp ( val1.getBufStart() , m->m_def ) )
|
||||
// put non-default valued parms in orange!
|
||||
@ -4883,6 +4891,8 @@ void Parms::init ( ) {
|
||||
|
||||
GigablastRequest gr;
|
||||
|
||||
InjectionRequest ir;
|
||||
|
||||
/*
|
||||
m->m_title = "delete collection";
|
||||
m->m_desc = "A collection name to delete. You can specify multiple "
|
||||
@ -8787,19 +8797,47 @@ void Parms::init ( ) {
|
||||
//
|
||||
///////////////////////////////////////////
|
||||
|
||||
m->m_title = "use spider proxies";
|
||||
m->m_desc = "Use the spider proxies listed below. If none are "
|
||||
"listed then gb will not use any.";
|
||||
m->m_title = "always use spider proxies for all collections";
|
||||
m->m_desc = "ALWAYS Use the spider proxies listed below for "
|
||||
"spidering. If none are "
|
||||
"listed then gb will not use any. Applies to all collections. "
|
||||
"If you want to regulate this on a per collection basis then "
|
||||
"set this to <b>NO</b> here and adjust the "
|
||||
"proxy controls on the "
|
||||
"<b>spider controls</b> page. If the list of proxy IPs below "
|
||||
"is empty, then of course, no proxies will be used.";
|
||||
m->m_cgi = "useproxyips";
|
||||
m->m_xml = "useSpiderProxies";
|
||||
m->m_off = (char *)&g_conf.m_useProxyIps - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "1";
|
||||
m->m_flags = 0;
|
||||
m->m_def = "0";
|
||||
// hide this for now. just make it a per collection parm.
|
||||
m->m_flags = PF_HIDDEN;
|
||||
m->m_page = PAGE_SPIDERPROXIES;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
m->m_title = "automatically use spider proxies for all collections";
|
||||
m->m_desc = "AUTOMATICALLY use the spider proxies listed below for "
|
||||
"spidering. If none are "
|
||||
"listed then gb will not use any. Applies to all collections. "
|
||||
"If you want to regulate this on a per collection basis then "
|
||||
"set this to <b>NO</b> here and adjust the "
|
||||
"proxy controls on the "
|
||||
"<b>spider controls</b> page. If the list of proxy IPs below "
|
||||
"is empty, then of course, no proxies will be used.";
|
||||
m->m_cgi = "autouseproxyips";
|
||||
m->m_xml = "automaticallyUseSpiderProxies";
|
||||
m->m_off = (char *)&g_conf.m_automaticallyUseProxyIps - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "0";
|
||||
// hide this for now. just make it a per collection parm.
|
||||
m->m_flags = PF_HIDDEN;
|
||||
m->m_page = PAGE_SPIDERPROXIES;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
|
||||
m->m_title = "spider proxy ips";
|
||||
m->m_desc = "List of white space-separated spider proxy IPs. Put "
|
||||
"in IP:port format. Example <i>1.2.3.4:80 4.5.6.7:99</i>. "
|
||||
@ -14853,50 +14891,57 @@ void Parms::init ( ) {
|
||||
"The injection api is described on the "
|
||||
"<a href=/admin/api>api</a> page. "
|
||||
"Make up a fake url if you are injecting content that "
|
||||
"does not have one.";
|
||||
"does not have one."
|
||||
"<br>"
|
||||
"<br>"
|
||||
"If the url ends in .warc or .arc or .warc.gz or .arc.gz "
|
||||
"Gigablast will index the contained documents as individual "
|
||||
"documents, using the appropriate dates and other meta "
|
||||
"information contained in the containing archive file."
|
||||
;
|
||||
m->m_cgi = "url";
|
||||
//m->m_cgi2 = "u";
|
||||
//m->m_cgi3 = "seed"; // pagerawlbot
|
||||
//m->m_cgi4 = "injecturl";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHARPTR;
|
||||
m->m_def = NULL;
|
||||
m->m_flags = PF_API | PF_REQUIRED;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_url - (char *)&gr;
|
||||
m->m_off = (char *)&ir.ptr_url - (char *)&ir;
|
||||
m++;
|
||||
|
||||
// alias #1
|
||||
m->m_title = "url";
|
||||
m->m_cgi = "u";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHARPTR;
|
||||
m->m_def = NULL;
|
||||
m->m_flags = PF_HIDDEN;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_url - (char *)&gr;
|
||||
m->m_off = (char *)&ir.ptr_url - (char *)&ir;
|
||||
m++;
|
||||
|
||||
// alias #2
|
||||
m->m_title = "url";
|
||||
m->m_cgi = "seed";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHARPTR;
|
||||
m->m_def = NULL;
|
||||
m->m_flags = PF_HIDDEN | PF_DIFFBOT;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_url - (char *)&gr;
|
||||
m->m_off = (char *)&ir.ptr_url - (char *)&ir;
|
||||
m++;
|
||||
|
||||
// alias #3
|
||||
m->m_title = "url";
|
||||
m->m_cgi = "injecturl";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHARPTR;
|
||||
m->m_def = NULL;
|
||||
m->m_flags = PF_HIDDEN;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_url - (char *)&gr;
|
||||
m->m_off = (char *)&ir.ptr_url - (char *)&ir;
|
||||
m++;
|
||||
|
||||
|
||||
@ -14905,24 +14950,24 @@ void Parms::init ( ) {
|
||||
"and inject their links. You are not required to supply "
|
||||
"the <i>url</i> parm if you supply this parm.";
|
||||
m->m_cgi = "qts";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHARPTR;
|
||||
m->m_def = NULL;
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_queryToScrape - (char *)&gr;
|
||||
m->m_off = (char *)&ir.ptr_queryToScrape - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "inject links";
|
||||
m->m_desc = "Should we inject the links found in the injected "
|
||||
"content as well?";
|
||||
m->m_cgi = "injectlinks";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHECKBOX;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_injectLinks - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_injectLinks - (char *)&ir;
|
||||
m++;
|
||||
|
||||
|
||||
@ -14930,47 +14975,47 @@ void Parms::init ( ) {
|
||||
m->m_desc = "Add the outlinks of the injected content into spiderdb "
|
||||
"for spidering?";
|
||||
m->m_cgi = "spiderlinks";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHECKBOX;
|
||||
// leave off because could start spidering whole web unintentionally
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_spiderLinks - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_spiderLinks - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "int16_t reply";
|
||||
m->m_desc = "Should the injection response be int16_t and simple?";
|
||||
m->m_title = "short reply";
|
||||
m->m_desc = "Should the injection response be short and simple?";
|
||||
m->m_cgi = "quick";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHECKBOX;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_HIDDEN;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_shortReply - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_shortReply - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "only inject content if new";
|
||||
m->m_desc = "If the specified url is already in the index then "
|
||||
"skip the injection.";
|
||||
m->m_cgi = "newonly";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHECKBOX;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_newOnly - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_newOnly - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "delete from index";
|
||||
m->m_desc = "Delete the specified url from the index.";
|
||||
m->m_cgi = "deleteurl";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHECKBOX;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_deleteUrl - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_deleteUrl - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "recycle content";
|
||||
@ -14978,68 +15023,68 @@ void Parms::init ( ) {
|
||||
"re-download the content, just use the content that was "
|
||||
"stored in the cache from last time.";
|
||||
m->m_cgi = "recycle";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHECKBOX;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_recycle - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_recycle - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "dedup url";
|
||||
m->m_desc = "Do not index the url if there is already another "
|
||||
"url in the index with the same content.";
|
||||
m->m_cgi = "dedup";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHECKBOX;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_dedup - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_dedup - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "do consistency checking";
|
||||
m->m_desc = "Turn this on for debugging.";
|
||||
m->m_cgi = "consist";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHECKBOX;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_HIDDEN; // | PF_API
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_doConsistencyTesting - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_doConsistencyTesting - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "hop count";
|
||||
m->m_desc = "Use this hop count when injecting the page.";
|
||||
m->m_cgi = "hopcount";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_HIDDEN; // | PF_API
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_hopCount - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_hopCount - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "last spider time";
|
||||
m->m_desc = "Override last time spidered";
|
||||
m->m_cgi = "lastspidered";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_HIDDEN; // | PF_API
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_lastSpidered - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_lastSpidered - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "first indexed";
|
||||
m->m_desc = "Override first indexed time";
|
||||
m->m_cgi = "firstindexed";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_HIDDEN; // | PF_API
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_firstIndexed - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_firstIndexed - (char *)&ir;
|
||||
m++;
|
||||
|
||||
|
||||
@ -15047,12 +15092,12 @@ void Parms::init ( ) {
|
||||
m->m_desc = "If the content of the url is provided below, does "
|
||||
"it begin with an HTTP mime header?";
|
||||
m->m_cgi = "hasmime";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHECKBOX;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_hasMime - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_hasMime - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "content delimeter";
|
||||
@ -15066,12 +15111,12 @@ void Parms::init ( ) {
|
||||
"injected url. Otherwise it will append numbers to the "
|
||||
"url you provide above.";
|
||||
m->m_cgi = "delim";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHARPTR;
|
||||
m->m_def = NULL;
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_contentDelim - (char *)&gr;
|
||||
m->m_off = (char *)&ir.ptr_contentDelim - (char *)&ir;
|
||||
m++;
|
||||
|
||||
|
||||
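The "delim" parm above lets one injection carry several documents in a single content buffer, split on a caller-supplied delimiter, with numbers appended to the base url for each piece. A minimal standalone sketch of that splitting, with a hypothetical helper (this is an illustration, not the container-doc code in this commit):

// Illustrative only: split an injected buffer on a delimiter string.
// splitOnDelim() is hypothetical and not part of this commit.
#include <cstring>
#include <string>
#include <vector>

static std::vector<std::string> splitOnDelim ( char *content , const char *delim ) {
	std::vector<std::string> docs;
	size_t dlen = strlen ( delim );
	char *p = content;
	while ( p && *p ) {
		char *next = strstr ( p , delim );
		if ( ! next ) { docs.push_back ( p ); break; }
		docs.push_back ( std::string ( p , next - p ) );
		p = next + dlen;
	}
	return docs;
}

// usage sketch: one sub-document per entry, injected under "<baseUrl>-0",
// "<baseUrl>-1", ... (the exact numbering scheme is an assumption here).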
@ -15082,12 +15127,12 @@ void Parms::init ( ) {
|
||||
"Possible values: <b>text/html text/plain text/xml "
|
||||
"application/json</b>";
|
||||
m->m_cgi = "contenttype";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHARPTR; //text/html application/json application/xml
|
||||
m->m_def = "text/html";
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_contentTypeStr - (char *)&gr;
|
||||
m->m_off = (char *)&ir.ptr_contentTypeStr - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "content charset";
|
||||
@ -15097,24 +15142,24 @@ void Parms::init ( ) {
|
||||
"which is 106. "
|
||||
"See iana_charset.h for the numeric values.";
|
||||
m->m_cgi = "charset";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "106";
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_charset - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_charset - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "upload content file";
|
||||
m->m_desc = "Instead of specifying the content to be injected in "
|
||||
"the text box below, upload this file for it.";
|
||||
m->m_cgi = "file";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_FILEUPLOADBUTTON;
|
||||
m->m_def = NULL;
|
||||
m->m_flags = PF_NOAPI;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_contentFile - (char *)&gr;
|
||||
m->m_off = (char *)&ir.ptr_contentFile - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "content";
|
||||
@ -15128,35 +15173,35 @@ void Parms::init ( ) {
|
||||
"inject empty content, otherwise the content will "
|
||||
"be downloaded from the url.";
|
||||
m->m_cgi = "content";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHARPTR;
|
||||
m->m_def = NULL;
|
||||
m->m_flags = PF_API|PF_TEXTAREA;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_content - (char *)&gr;
|
||||
m->m_off = (char *)&ir.ptr_content - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "get sectiondb voting info";
|
||||
m->m_desc = "Return section information of injected content for "
|
||||
"the injected subdomain. ";
|
||||
m->m_cgi = "sections";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_API|PF_NOHTML;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_getSections - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_getSections - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "diffbot reply";
|
||||
m->m_desc = "Used exclusively by diffbot. Do not use.";
|
||||
m->m_cgi = "diffbotreply";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHARPTR;
|
||||
m->m_def = NULL;
|
||||
m->m_flags = PF_API|PF_TEXTAREA|PF_NOHTML; // do not show in our api
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_diffbotReply - (char *)&gr;
|
||||
m->m_off = (char *)&ir.ptr_diffbotReply - (char *)&ir;
|
||||
m++;
|
||||
|
||||
|
||||
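Every entry above follows the same registration pattern: fill in the next Parm slot (title, description, cgi name, owning object, type, default, flags, page, and the byte offset of the backing field) and advance m. A sketch of one more entry in that style, using a made-up cgi name and field that do not exist in this commit:

// Hypothetical example only; "myflag" / m_myFlag are illustrative.
m->m_title = "my example flag";
m->m_desc  = "Illustrative checkbox parm wired to an InjectionRequest field.";
m->m_cgi   = "myflag";
m->m_obj   = OBJ_IR;
m->m_type  = TYPE_CHECKBOX;
m->m_def   = "0";
m->m_flags = PF_API;
m->m_page  = PAGE_INJECT;
m->m_off   = (char *)&ir.m_myFlag - (char *)&ir;
m++;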
@ -16408,7 +16453,6 @@ void Parms::init ( ) {
|
||||
m->m_flags = PF_CLONE;
|
||||
m++;
|
||||
|
||||
|
||||
m->m_title = "use robots.txt";
|
||||
m->m_desc = "If this is true Gigablast will respect "
|
||||
"the robots.txt convention.";
|
||||
@ -16440,12 +16484,15 @@ void Parms::init ( ) {
|
||||
m++;
|
||||
|
||||
|
||||
m->m_title = "use proxies for spidering";
|
||||
m->m_desc = "If this is true Gigablast will use the proxies "
|
||||
"listed on the <a href=/admin/proxies>proxies</a> page for "
|
||||
m->m_title = "always use spider proxies";
|
||||
m->m_desc = "If this is true Gigablast will ALWAYS use the proxies "
|
||||
"listed on the <a href=/admin/proxies>proxies</a> "
|
||||
"page for "
|
||||
"spidering for "
|
||||
"this collection regardless whether the proxies are enabled "
|
||||
"on the <a href=/admin/proxies>proxies</a> page.";
|
||||
"this collection."
|
||||
//"regardless whether the proxies are enabled "
|
||||
//"on the <a href=/admin/proxies>proxies</a> page."
|
||||
;
|
||||
m->m_cgi = "useproxies";
|
||||
m->m_off = (char *)&cr.m_forceUseFloaters - x;
|
||||
m->m_type = TYPE_BOOL;
|
||||
@ -16455,6 +16502,58 @@ void Parms::init ( ) {
|
||||
m->m_flags = PF_CLONE;
|
||||
m++;
|
||||
|
||||
m->m_title = "automatically use spider proxies";
|
||||
m->m_desc = "Use the spider proxies listed on the proxies page "
|
||||
"if gb detects that "
|
||||
"a webserver is throttling the spiders. This way we can "
|
||||
"learn the webserver's spidering policy so that our spiders "
|
||||
"can be more polite. If no proxies are listed on the "
|
||||
"proxies page then this parameter will have no effect.";
|
||||
m->m_cgi = "automaticallyuseproxies";
|
||||
m->m_off = (char *)&cr.m_automaticallyUseProxies - x;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "0";
|
||||
m->m_group = 0;
|
||||
m->m_page = PAGE_SPIDER;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m->m_flags = PF_CLONE;
|
||||
m++;
|
||||
|
||||
|
||||
|
||||
m->m_title = "automatically back off";
|
||||
m->m_desc = "Set the crawl delay to 5 seconds if gb detects "
|
||||
"that an IP is throttling or banning gigabot from crawling "
|
||||
"it. The crawl delay just applies to that IP. "
|
||||
"Such throttling will be logged.";
|
||||
m->m_cgi = "automaticallybackoff";
|
||||
m->m_xml = "automaticallyBackOff";
|
||||
m->m_off = (char *)&cr.m_automaticallyBackOff - x;
|
||||
m->m_type = TYPE_BOOL;
|
||||
// a lot of pages have recaptcha links but they have valid content
|
||||
// so leave this off for now... they have it in a hidden div which
|
||||
// popups to email the article link or whatever to someone.
|
||||
m->m_def = "0";
|
||||
m->m_group = 0;
|
||||
m->m_page = PAGE_SPIDER;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m->m_flags = PF_CLONE;
|
||||
m++;
|
||||
|
||||
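The two parms above ("automatically use spider proxies" and "automatically back off") both key off detecting that a web server's IP is throttling or banning the crawler: back off to a 5 second crawl delay for that IP, and optionally route it through the configured proxies. A minimal sketch of the bookkeeping such a feature needs, with hypothetical names (the real detection and storage live elsewhere in this commit):

// Illustrative sketch only; types and names are not from this commit.
#include <stdint.h>
#include <map>

struct ThrottleTable {
	// ips we believe are throttling or banning the crawler
	std::map<int32_t,bool> m_ips;

	void noteThrottled ( int32_t ip ) { m_ips[ip] = true; }

	// crawl delay to use for this ip, in milliseconds
	int32_t crawlDelayMS ( int32_t ip , int32_t normalDelayMS ) {
		// back off to 5 seconds for ips seen throttling us
		if ( m_ips.count(ip) ) return 5000;
		return normalDelayMS;
	}

	// only route through a proxy if proxies are configured at all
	bool useProxyFor ( int32_t ip , bool proxiesConfigured ) {
		return proxiesConfigured && m_ips.count(ip) > 0;
	}
};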
m->m_title = "use time axis";
|
||||
m->m_desc = "If this is true Gigablast will index the same "
|
||||
"url multiple times if its content varies over time, "
|
||||
"rather than overwriting the older version in the index. "
|
||||
"Useful for archive web pages as they change over time.";
|
||||
m->m_cgi = "usetimeaxis";
|
||||
m->m_off = (char *)&cr.m_useTimeAxis - x;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "0";
|
||||
m->m_page = PAGE_SPIDER;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m->m_flags = PF_CLONE;
|
||||
m++;
|
||||
|
||||
/*
|
||||
m->m_title = "add url enabled";
|
||||
m->m_desc = "If this is enabled others can add "
|
||||
@ -19463,6 +19562,26 @@ void Parms::init ( ) {
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
m->m_title = "log debug msg13 messages";
|
||||
m->m_cgi = "ldspmth";
|
||||
m->m_off = (char *)&g_conf.m_logDebugMsg13 - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "0";
|
||||
m->m_priv = 1;
|
||||
m->m_page = PAGE_LOG;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
m->m_title = "disable host0 for msg13 reception hack";
|
||||
m->m_cgi = "dmth";
|
||||
m->m_off = (char *)&g_conf.m_diffbotMsg13Hack - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "0";
|
||||
m->m_priv = 1;
|
||||
m->m_page = PAGE_LOG;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
m->m_title = "log debug spider proxies";
|
||||
m->m_cgi = "ldspr";
|
||||
m->m_off = (char *)&g_conf.m_logDebugProxies - g;
|
||||
@ -19995,6 +20114,7 @@ void Parms::overlapTest ( char step ) {
|
||||
|
||||
SearchInput tmpsi;
|
||||
GigablastRequest tmpgr;
|
||||
InjectionRequest tmpir;
|
||||
CollectionRec tmpcr;
|
||||
Conf tmpconf;
|
||||
char b;
|
||||
@ -20020,6 +20140,7 @@ void Parms::overlapTest ( char step ) {
|
||||
if ( m_parms[i].m_obj == OBJ_CONF ) p1 = (char *)&tmpconf;
|
||||
if ( m_parms[i].m_obj == OBJ_SI ) p1 = (char *)&tmpsi;
|
||||
if ( m_parms[i].m_obj == OBJ_GBREQUEST ) p1 = (char *)&tmpgr;
|
||||
if ( m_parms[i].m_obj == OBJ_IR ) p1 = (char *)&tmpir;
|
||||
if ( p1 ) p1 += m_parms[i].m_off;
|
||||
p2 = NULL;
|
||||
int32_t size = m_parms[i].m_size;
|
||||
@ -20068,6 +20189,7 @@ void Parms::overlapTest ( char step ) {
|
||||
if ( m_parms[i].m_obj == OBJ_CONF ) p1 = (char *)&tmpconf;
|
||||
if ( m_parms[i].m_obj == OBJ_SI ) p1 = (char *)&tmpsi;
|
||||
if ( m_parms[i].m_obj == OBJ_GBREQUEST ) p1 = (char *)&tmpgr;
|
||||
if ( m_parms[i].m_obj == OBJ_IR ) p1 = (char *)&tmpir;
|
||||
if ( p1 ) p1 += m_parms[i].m_off;
|
||||
p2 = NULL;
|
||||
int32_t size = m_parms[i].m_size;
|
||||
@ -20095,6 +20217,8 @@ void Parms::overlapTest ( char step ) {
|
||||
objStr = "SearchInput.h";
|
||||
if ( m_parms[i].m_obj == OBJ_GBREQUEST )
|
||||
objStr = "GigablastRequest/Parms.h";
|
||||
if ( m_parms[i].m_obj == OBJ_IR )
|
||||
objStr = "InjectionRequest/PageInject.h";
|
||||
// save it
|
||||
infringerB = p1[j];
|
||||
savedi = i;
|
||||
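overlapTest() above builds one scratch instance per parm object type (Conf, SearchInput, GigablastRequest, InjectionRequest, ...), resolves each parm's m_off into that instance, and flags any two parms whose byte ranges collide. A stripped-down sketch of the same idea with generic names (illustrative, not the Parms code):

// Generic illustration of the overlap check; not Parms::overlapTest itself.
#include <vector>

struct FieldReg { int off; int size; };   // analogous to Parm::m_off / m_size

// returns true if any two registered fields share a byte of the object
static bool fieldsOverlap ( const std::vector<FieldReg> &regs , int objSize ) {
	std::vector<char> used ( objSize , 0 );
	for ( const FieldReg &r : regs )
		for ( int j = r.off ; j < r.off + r.size ; j++ ) {
			if ( j < 0 || j >= objSize ) return true; // out of bounds
			if ( used[j] ) return true;               // collision
			used[j] = 1;
		}
	return false;
}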
@ -22473,6 +22597,14 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
|
||||
"You have to use the respider frequency as well "
|
||||
"to adjust how often you want things respidered."
|
||||
"</td></tr>"
|
||||
|
||||
"<tr class=poo><td>urlage</td>"
|
||||
"<td>"
|
||||
"This is the time, in seconds, since a url was first "
|
||||
"added to spiderdb to be spidered. This is "
|
||||
"its discovery date. "
|
||||
"Can use <, >, <=, >=, ==, != comparison operators."
|
||||
"</td></tr>"
|
||||
|
||||
|
||||
//"<tr class=poo><td>!newoutlink</td>"
|
||||
@ -22495,6 +22627,20 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
|
||||
"older permalinks into a slower spider queue."
|
||||
"</td></tr>"
|
||||
|
||||
"<tr class=poo><td>spiderwaited < 3600</td>"
|
||||
"<td>"
|
||||
"<i>spiderwaited</i> is how many seconds have elapsed "
|
||||
"since the last time "
|
||||
"we tried to spider/download the url. "
|
||||
"The constaint containing <i>spiderwaited</i> will "
|
||||
"fail to be matched if the url has never been "
|
||||
"attempted to be spidered/downloaded before. Therefore, "
|
||||
"it will only ever match urls that have a spider reply "
|
||||
"of some sort, so there is no need to add an additional "
|
||||
"<i>hasreply</i>-based constraint."
|
||||
"</td></tr>"
|
||||
|
||||
|
||||
"<tr class=poo><td>"
|
||||
"<a name=insitelist>"
|
||||
"insitelist | !insitelist"
|
||||
|
47 Parms.h
@ -39,6 +39,7 @@ enum {
|
||||
OBJ_COLL ,
|
||||
OBJ_SI , // SearchInput class
|
||||
OBJ_GBREQUEST , // for GigablastRequest class of parms
|
||||
OBJ_IR , // InjectionRequest class from PageInject.h
|
||||
OBJ_NONE
|
||||
};
|
||||
|
||||
@ -121,28 +122,32 @@ class GigablastRequest {
|
||||
////////////
|
||||
// these all reference into m_hr or into the Parm::m_def string!
|
||||
char *m_url; // also for /get
|
||||
char *m_queryToScrape;
|
||||
char *m_contentDelim;
|
||||
char *m_contentTypeStr;
|
||||
char *m_contentFile;
|
||||
char *m_content;
|
||||
char *m_diffbotReply; // secret thing from dan
|
||||
char m_injectLinks;
|
||||
char m_spiderLinks;
|
||||
char m_shortReply;
|
||||
char m_newOnly;
|
||||
char m_deleteUrl;
|
||||
char m_recycle;
|
||||
char m_dedup;
|
||||
char m_hasMime;
|
||||
char m_doConsistencyTesting;
|
||||
char m_getSections;
|
||||
char m_gotSections;
|
||||
int32_t m_charset;
|
||||
int32_t m_hopCount; // hopcount
|
||||
//char *m_queryToScrape;
|
||||
//char *m_contentDelim;
|
||||
//char m_containerContentType; // CT_UNKNOWN, CT_WARC, CT_ARC
|
||||
//int32_t m_injectDocIp;
|
||||
//char *m_contentTypeStr;
|
||||
//char *m_contentFile;
|
||||
//char *m_content;
|
||||
//char *m_diffbotReply; // secret thing from dan
|
||||
//char m_injectLinks;
|
||||
//char m_spiderLinks;
|
||||
//char m_shortReply;
|
||||
//char m_newOnly;
|
||||
//char m_deleteUrl;
|
||||
//char m_recycle;
|
||||
//char m_dedup;
|
||||
//char m_hasMime;
|
||||
//char m_doConsistencyTesting;
|
||||
//char m_getSections;
|
||||
//char m_gotSections;
|
||||
//int32_t m_charset;
|
||||
//int32_t m_hopCount; // hopcount
|
||||
//collnum_t m_collnum; // more reliable than m_coll
|
||||
// older ones
|
||||
uint32_t m_firstIndexed; // firstimdexed
|
||||
uint32_t m_lastSpidered; // lastspidered;
|
||||
//uint32_t m_firstIndexed; // firstimdexed
|
||||
//uint32_t m_lastSpidered; // lastspidered;
|
||||
//SafeBuf m_contentBuf; // for holding a warc/arc file
|
||||
|
||||
|
||||
|
||||
|
@ -3074,8 +3074,17 @@ bool gotMxIp ( EmailInfo *ei ) {
|
||||
|
||||
|
||||
static void gotMandrillReplyWrapper ( void *state , TcpSocket *s ) {
|
||||
// log the mandril reply
|
||||
log("email: got mandrill reply: %s",s->m_readBuf);
|
||||
// why core here with s NULL
|
||||
if ( ! s ) {
|
||||
// crap seems like we do not retry so they will not get
|
||||
// the notification... how to fix better?
|
||||
log("email: failed to lookup mandrill ip. sock is null.");
|
||||
g_errno = EBADIP;
|
||||
}
|
||||
else {
|
||||
// log the mandril reply
|
||||
log("email: got mandrill reply: %s",s->m_readBuf);
|
||||
}
|
||||
EmailInfo *ei = (EmailInfo *)state;
|
||||
if ( ei->m_callback ) ei->m_callback ( ei->m_state );
|
||||
}
|
||||
|
@ -961,7 +961,10 @@ float getDiskUsage ( int64_t *diskAvail ) {
|
||||
g_hostdb.m_dir,
|
||||
out);
|
||||
errno = 0;
|
||||
// time it to see how long it took. could it be causing load spikes?
|
||||
//log("process: begin df -ka");
|
||||
int err = system ( cmd );
|
||||
//log("process: end df -ka");
|
||||
if ( err == 127 ) {
|
||||
log("build: /bin/sh does not exist. can not get disk usage.");
|
||||
return -1.0; // unknown
|
||||
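The hunk above shells out to df via system(), times the call because it may be causing load spikes, and treats exit code 127 as "/bin/sh missing". If the shell-out ever proves too heavy, the same figure can be read in-process; a sketch using statvfs(3) instead of df (an alternative approach, not what this commit does):

// Alternative sketch using statvfs(3); not the code in this commit.
#include <sys/statvfs.h>
#include <stdint.h>

static float getDiskUsagePct ( const char *dir , int64_t *diskAvail ) {
	struct statvfs vfs;
	if ( statvfs ( dir , &vfs ) != 0 ) return -1.0; // unknown
	int64_t total = (int64_t)vfs.f_blocks * (int64_t)vfs.f_frsize;
	int64_t avail = (int64_t)vfs.f_bavail * (int64_t)vfs.f_frsize;
	if ( diskAvail ) *diskAvail = avail;
	if ( total <= 0 ) return -1.0;
	return 100.0 * (float)(total - avail) / (float)total;
}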
@ -1466,10 +1469,12 @@ bool Process::shutdown2 ( ) {
|
||||
if ( g_threads.amThread() ) return true;
|
||||
|
||||
if ( m_urgent )
|
||||
log(LOG_INFO,"gb: Shutting down urgently. Try #%"INT32".",
|
||||
log(LOG_INFO,"gb: Shutting down urgently. "
|
||||
"Timed try #%"INT32".",
|
||||
m_try++);
|
||||
else
|
||||
log(LOG_INFO,"gb: Shutting down. Try #%"INT32".",m_try++);
|
||||
log(LOG_INFO,"gb: Shutting down. Timed try #%"INT32".",
|
||||
m_try++);
|
||||
|
||||
|
||||
// switch to urgent if having problems
|
||||
|
211 Spider.cpp
@ -797,6 +797,8 @@ bool Spiderdb::verify ( char *coll ) {
|
||||
key128_t Spiderdb::makeKey ( int32_t firstIp ,
|
||||
int64_t urlHash48 ,
|
||||
bool isRequest ,
|
||||
// MDW: now we use timestamp instead of parentdocid
|
||||
// for spider replies. so they do not dedup...
|
||||
int64_t parentDocId ,
|
||||
bool isDel ) {
|
||||
key128_t k;
|
||||
@ -814,6 +816,9 @@ key128_t Spiderdb::makeKey ( int32_t firstIp ,
|
||||
if ( isRequest ) k.n0 |= 0x01;
|
||||
// parent docid
|
||||
k.n0 <<= 38;
|
||||
// if we are making a spider reply key just leave the parentdocid as 0
|
||||
// so we only store one reply per url. the last reply we got.
|
||||
// if ( isRequest ) k.n0 |= parentDocId & DOCID_MASK;
|
||||
k.n0 |= parentDocId & DOCID_MASK;
|
||||
// reserved (padding)
|
||||
k.n0 <<= 8;
|
||||
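The visible part of this key change packs the request/reply flag, 38 bits of parent docid (left as 0 for spider replies so only the most recent reply per url is kept), and 8 reserved low bits into the low word of the 128-bit key. A simplified sketch of just that packing; the upper bits (firstIp, urlHash48) are handled earlier in the function and are not reproduced here, and the 38-bit DOCID_MASK width is an assumption:

// Sketch of the low-word packing shown above; not the full Spiderdb key layout.
#include <stdint.h>

static uint64_t packLowWord ( uint64_t hi , bool isRequest , int64_t parentDocId ) {
	uint64_t n0 = hi;                              // bits already shifted in
	if ( isRequest ) n0 |= 0x01;                   // request vs. reply bit
	n0 <<= 38;                                     // room for the parent docid
	n0 |= (uint64_t)parentDocId & ((1ULL<<38)-1);  // 0 for replies: one reply/url
	n0 <<= 8;                                      // 8 reserved padding bits
	return n0;
}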
@ -1802,8 +1807,13 @@ void SpiderColl::clearLocks ( ) {
|
||||
|
||||
void SpiderColl::reset ( ) {
|
||||
|
||||
m_numSuccessReplies = 0;
|
||||
m_numFailedReplies = 0;
|
||||
// these don't work because we only store one reply
|
||||
// which overwrites any older reply. that's how the
|
||||
// key is. we can change the key to use the timestamp
|
||||
// and not parent docid in makeKey() for spider
|
||||
// replies later.
|
||||
// m_numSuccessReplies = 0;
|
||||
// m_numFailedReplies = 0;
|
||||
|
||||
// reset these for SpiderLoop;
|
||||
m_nextDoledbKey.setMin();
|
||||
@ -3980,15 +3990,65 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
// see if this is the most recent one
|
||||
SpiderReply *tmp = (SpiderReply *)rec;
|
||||
|
||||
// reset reply stats if beginning a new url
|
||||
if ( srepUh48 != tmp->getUrlHash48() ) {
|
||||
m_numSuccessReplies = 0;
|
||||
m_numFailedReplies = 0;
|
||||
// . MDW: we have to detect corrupt replies up here so
|
||||
// they do not become the winning reply because
|
||||
// their date is in the future!!
|
||||
|
||||
// . this is -1 on corruption
|
||||
// . i've seen -31757, 21... etc for bad http replies
|
||||
// in the qatest123 doc cache... so turn off for that
|
||||
if ( tmp->m_httpStatus >= 1000 ) {
|
||||
if ( m_cr->m_spiderCorruptCount == 0 ) {
|
||||
log("spider: got corrupt 3 "
|
||||
"spiderReply in "
|
||||
"scan "
|
||||
"uh48=%"INT64" "
|
||||
"httpstatus=%"INT32" "
|
||||
"(cn=%"INT32")",
|
||||
tmp->getUrlHash48(),
|
||||
(int32_t)tmp->m_httpStatus,
|
||||
(int32_t)m_collnum);
|
||||
}
|
||||
m_cr->m_spiderCorruptCount++;
|
||||
// don't nuke it just for that...
|
||||
//srep = NULL;
|
||||
continue;
|
||||
}
|
||||
// bad langid?
|
||||
if ( ! getLanguageAbbr (tmp->m_langId) ) {
|
||||
log("spider: got corrupt 4 spiderReply in "
|
||||
"scan uh48=%"INT64" "
|
||||
"langid=%"INT32" (cn=%"INT32")",
|
||||
tmp->getUrlHash48(),
|
||||
(int32_t)tmp->m_langId,
|
||||
(int32_t)m_collnum);
|
||||
m_cr->m_spiderCorruptCount++;
|
||||
//srep = NULL;
|
||||
// if ( tmp->getUrlHash48() ==
|
||||
// 271713196158770LL )
|
||||
// log("hey");
|
||||
continue;
|
||||
}
|
||||
|
||||
// reset reply stats if beginning a new url
|
||||
// these don't work because we only store one reply
|
||||
// which overwrites any older reply. that's how the
|
||||
// key is. we can change the key to use the timestamp
|
||||
// and not parent docid in makeKey() for spider
|
||||
// replies later.
|
||||
// if ( srepUh48 != tmp->getUrlHash48() ) {
|
||||
// m_numSuccessReplies = 0;
|
||||
// m_numFailedReplies = 0;
|
||||
// }
|
||||
|
||||
// inc stats
|
||||
if ( tmp->m_errCode == 0 ) m_numSuccessReplies++;
|
||||
else m_numFailedReplies ++;
|
||||
// these don't work because we only store one reply
|
||||
// which overwrites any older reply. that's how the
|
||||
// key is. we can change the key to use the timestamp
|
||||
// and not parent docid in makeKey() for spider
|
||||
// replies later.
|
||||
// if ( tmp->m_errCode == 0 ) m_numSuccessReplies++;
|
||||
// else m_numFailedReplies ++;
|
||||
|
||||
// if we have a more recent reply already, skip this
|
||||
if ( srep &&
|
||||
@ -4010,10 +4070,14 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
int64_t uh48 = sreq->getUrlHash48();
|
||||
|
||||
// reset reply stats if beginning a new url
|
||||
if ( ! srep ) {
|
||||
m_numSuccessReplies = 0;
|
||||
m_numFailedReplies = 0;
|
||||
}
|
||||
// these don't work because we only store one reply
|
||||
// which overwrites any older reply. that's how the key is.
|
||||
// we can change the key to use the timestamp and not
|
||||
// parent docid in makeKey() for spider replies later.
|
||||
// if ( ! srep ) {
|
||||
// m_numSuccessReplies = 0;
|
||||
// m_numFailedReplies = 0;
|
||||
// }
|
||||
|
||||
// . skip if our twin should add it to doledb
|
||||
// . waiting tree only has firstIps assigned to us so
|
||||
@ -4100,8 +4164,13 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
// put these in the spiderequest in doledb so we can
|
||||
// show in the json spider status docs in
|
||||
// XmlDoc::getSpiderStatusDocMetaList2()
|
||||
sreq->m_reservedc1 = m_numSuccessReplies;
|
||||
sreq->m_reservedc2 = m_numFailedReplies;
|
||||
// these don't work because we only store one reply
|
||||
// which overwrites any older reply. that's how the
|
||||
// key is. we can change the key to use the timestamp
|
||||
// and not parent docid in makeKey() for spider
|
||||
// replies later.
|
||||
// sreq->m_reservedc1 = m_numSuccessReplies;
|
||||
// sreq->m_reservedc2 = m_numFailedReplies;
|
||||
|
||||
m_lastSreqUh48 = uh48;
|
||||
m_lastCBlockIp = cblock;
|
||||
@ -4256,28 +4325,6 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
// if we tried it before
|
||||
sreq->m_hadReply = true;
|
||||
}
|
||||
// . this is -1 on corruption
|
||||
// . i've seen -31757, 21... etc for bad http replies
|
||||
// in the qatest123 doc cache... so turn off for that
|
||||
if ( srep && srep->m_httpStatus >= 1000 ) {
|
||||
if ( m_cr->m_spiderCorruptCount == 0 ) {
|
||||
log("spider: got corrupt 3 spiderReply in "
|
||||
"scan httpstatus=%"INT32" (cn=%"INT32")",
|
||||
(int32_t)srep->m_httpStatus,
|
||||
(int32_t)m_collnum);
|
||||
}
|
||||
m_cr->m_spiderCorruptCount++;
|
||||
// don't nuke it just for that...
|
||||
//srep = NULL;
|
||||
}
|
||||
// bad langid?
|
||||
if ( srep && ! getLanguageAbbr (srep->m_langId) ) {
|
||||
log("spider: got corrupt 4 spiderReply in scan "
|
||||
"langid=%"INT32" (cn=%"INT32")",
|
||||
(int32_t)srep->m_langId,
|
||||
(int32_t)m_collnum);
|
||||
srep = NULL;
|
||||
}
|
||||
|
||||
// . get the url filter we match
|
||||
// . if this is slow see the TODO below in dedupSpiderdbList()
|
||||
@ -4310,7 +4357,8 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
if ( priority >= MAX_SPIDER_PRIORITIES) {char *xx=NULL;*xx=0;}
|
||||
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: got ufn=%"INT32" for %s",ufn,sreq->m_url);
|
||||
log("spider: got ufn=%"INT32" for %s (%"INT64"",
|
||||
ufn,sreq->m_url,sreq->getUrlHash48());
|
||||
|
||||
if ( g_conf.m_logDebugSpider && srep )
|
||||
log("spider: lastspidered=%"UINT32"",
|
||||
@ -4514,6 +4562,9 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
spiderTimeMS ,
|
||||
uh48 );
|
||||
|
||||
// assume our added time is the first time this url was added
|
||||
sreq->m_discoveryTime = sreq->m_addedTime;
|
||||
|
||||
// if ( uh48 == 110582802025376LL )
|
||||
// log("hey");
|
||||
|
||||
@ -4543,10 +4594,12 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
// and the min added time as well!
|
||||
// get the oldest timestamp so
|
||||
// gbssDiscoveryTime will be accurate.
|
||||
if ( sreq->m_addedTime < wsreq->m_addedTime )
|
||||
wsreq->m_addedTime = sreq->m_addedTime;
|
||||
if ( wsreq->m_addedTime < sreq->m_addedTime )
|
||||
sreq->m_addedTime = wsreq->m_addedTime;
|
||||
if ( sreq->m_discoveryTime < wsreq->m_discoveryTime )
|
||||
wsreq->m_discoveryTime =
|
||||
sreq->m_discoveryTime;
|
||||
if ( wsreq->m_discoveryTime < sreq->m_discoveryTime )
|
||||
sreq->m_discoveryTime =
|
||||
wsreq->m_discoveryTime;
|
||||
}
|
||||
|
||||
|
||||
@ -7592,11 +7645,24 @@ bool SpiderLoop::spiderUrl9 ( SpiderRequest *sreq ,
|
||||
int32_t node = g_doledb.m_rdb.m_tree.deleteNode(m_collnum,
|
||||
(char *)m_doledbKey,
|
||||
true);
|
||||
if ( node == -1 ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: deleting doledb tree node %"INT32,node);
|
||||
|
||||
// if url filters rebuilt then doledb gets reset and i've seen us hit
|
||||
// this node == -1 condition here... so maybe ignore it... just log
|
||||
// what happened? i think we did a quickpoll somewhere between here
|
||||
// and the call to spiderDoledUrls() and it the url filters changed
|
||||
// so it reset doledb's tree. so in that case we should bail on this
|
||||
// url.
|
||||
if ( node == -1 ) {
|
||||
g_errno = EADMININTERFERENCE;
|
||||
log("spider: lost url about to spider from url filters "
|
||||
"and doledb tree reset. %s",mstrerror(g_errno));
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// now remove from doleiptable since we removed from doledb
|
||||
m_sc->removeFromDoledbTable ( sreq->m_firstIp );
|
||||
|
||||
@ -11663,11 +11729,14 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
goto gotOne;
|
||||
}
|
||||
// two letter extensions
|
||||
else if ( ext[1] == '.' ) {
|
||||
if ( to_lower_a(ext[2]) == 'g' &&
|
||||
to_lower_a(ext[3]) == 'z' )
|
||||
goto gotOne;
|
||||
}
|
||||
// .warc.gz and .arc.gz is ok
|
||||
// take this out for now
|
||||
// else if ( ext[1] == '.' ) {
|
||||
// if ( to_lower_a(ext[2]) == 'g' &&
|
||||
// to_lower_a(ext[3]) == 'z' )
|
||||
// goto gotOne;
|
||||
// }
|
||||
|
||||
// check for ".css?" substring
|
||||
// these two suck up a lot of time:
|
||||
// take them out for now. MDW 2/21/2015
|
||||
@ -12338,6 +12407,37 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
// selector using the first time it was added to the Spiderdb
|
||||
// added by Sam, May 5th 2015
|
||||
if ( *p=='u' && strncmp(p,"urlage",6) == 0 ) {
|
||||
// skip for msg20
|
||||
if ( isForMsg20 ) {
|
||||
//log("was for message 20");
|
||||
continue;
|
||||
|
||||
}
|
||||
// get the age of the spider_request.
|
||||
// (substraction of uint with int, hope
|
||||
// every thing goes well there)
|
||||
int32_t sreq_age = 0;
|
||||
if ( sreq ) sreq_age = nowGlobal-sreq->m_discoveryTime;
|
||||
//log("spiderage=%d",sreq_age);
|
||||
// the argument entered by user
|
||||
int32_t argument_age=atoi(s) ;
|
||||
if ( sign == SIGN_EQ && sreq_age != argument_age ) continue;
|
||||
if ( sign == SIGN_NE && sreq_age == argument_age ) continue;
|
||||
if ( sign == SIGN_GT && sreq_age <= argument_age ) continue;
|
||||
if ( sign == SIGN_LT && sreq_age >= argument_age ) continue;
|
||||
if ( sign == SIGN_GE && sreq_age < argument_age ) continue;
|
||||
if ( sign == SIGN_LE && sreq_age > argument_age ) continue;
|
||||
p = strstr(s, "&&");
|
||||
//if nothing, else then it is a match
|
||||
if ( ! p ) return i;
|
||||
//skip the '&&' and go to next rule
|
||||
p += 2;
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
|
||||
if ( *p=='e' && strncmp(p,"errorcount",10) == 0 ) {
|
||||
// if we do not have enough info for outlink, all done
|
||||
@ -12460,16 +12560,16 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
// skip for msg20
|
||||
if ( isForMsg20 ) continue;
|
||||
// do not match rule if never attempted
|
||||
if ( srep->m_spideredTime == 0 ) {
|
||||
char*xx=NULL;*xx=0;}
|
||||
if ( srep->m_spideredTime == (uint32_t)-1){
|
||||
char*xx=NULL;*xx=0;}
|
||||
// int16_tcut
|
||||
float af = (srep->m_spideredTime - nowGlobal);
|
||||
// if ( srep->m_spideredTime == 0 ) {
|
||||
// char*xx=NULL;*xx=0;}
|
||||
// if ( srep->m_spideredTime == (uint32_t)-1){
|
||||
// char*xx=NULL;*xx=0;}
|
||||
// shortcut
|
||||
int32_t a = nowGlobal - srep->m_spideredTime;
|
||||
// make into days
|
||||
af /= (3600.0*24.0);
|
||||
//af /= (3600.0*24.0);
|
||||
// back to a int32_t, round it
|
||||
int32_t a = (int32_t)(af + 0.5);
|
||||
//int32_t a = (int32_t)(af + 0.5);
|
||||
// make it point to the priority
|
||||
int32_t b = atoi(s);
|
||||
// compare
|
||||
@ -13001,6 +13101,7 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
|
||||
// . if the same check who has the most recent added time
|
||||
// . if we are not the most recent, just do not add us
|
||||
// . no, now i want the oldest so we can do gbssDiscoveryTime
|
||||
// and set sreq->m_discoveryTime accurately, above
|
||||
if ( sreq->m_addedTime >= oldReq->m_addedTime ) continue;
|
||||
// otherwise, erase over him
|
||||
dst = restorePoint;
|
||||
|
18 Spider.h
@ -522,10 +522,16 @@ class SpiderRequest {
|
||||
int32_t m_parentDomHash32;
|
||||
int32_t m_parentSiteHash32;
|
||||
|
||||
// if there are several spiderrequests for a url, this should be
|
||||
// the earliest m_addedTime, basically, the url discovery time. this is
|
||||
// NOT valid in spiderdb, but only set upon selecting the url to spider
|
||||
// when we scan all of the SpiderRequests it has.
|
||||
int32_t m_discoveryTime;
|
||||
|
||||
// the PROBABLE DOCID. if there is a collision with another docid
|
||||
// then we increment the last 8 bits or so. see Msg22.cpp.
|
||||
//int64_t m_probDocId;
|
||||
int32_t m_reservedc1;
|
||||
//int32_t m_reservedc1;
|
||||
int32_t m_reservedc2;
|
||||
|
||||
//int32_t m_parentPubDate;
|
||||
@ -829,6 +835,7 @@ class SpiderReply {
|
||||
// a SpiderRec outright
|
||||
key128_t m_key;
|
||||
|
||||
// this can be used for something else really. all SpiderReplies are fixed sz
|
||||
int32_t m_dataSize;
|
||||
|
||||
// for calling getHostIdToDole()
|
||||
@ -1155,8 +1162,13 @@ class SpiderColl {
|
||||
int32_t m_tailHopCount;
|
||||
int64_t m_minFutureTimeMS;
|
||||
|
||||
int32_t m_numSuccessReplies;
|
||||
int32_t m_numFailedReplies;
|
||||
// these don't work because we only store one reply
|
||||
// which overwrites any older reply. that's how the
|
||||
// key is. we can change the key to use the timestamp
|
||||
// and not parent docid in makeKey() for spider
|
||||
// replies later.
|
||||
// int32_t m_numSuccessReplies;
|
||||
// int32_t m_numFailedReplies;
|
||||
|
||||
// . do not re-send CrawlInfoLocal for a coll if not update
|
||||
// . we store the flags in here as true if we should send our
|
||||
|
@ -1491,6 +1491,16 @@ int32_t TcpServer::readSocket ( TcpSocket *s ) {
|
||||
// . MDW: add "&& s->m_sendBuf to it"
|
||||
// . just return -1 WITHOUT setting g_errno
|
||||
if ( n == 0 ) {
|
||||
// set g_errno to 0 then otherwise it seems g_errno was set to
|
||||
// ETRYAGAIN from some other time and when readSocket
|
||||
// calls makeCallback() it ends up calling Msg13.cpp::gotHttpReply2
|
||||
// eventually and coring because the error is not recognized.
|
||||
// even though there was no error but the read just finished.
|
||||
// also see TcpServer.cpp:readSocketWrapper2() to see where
|
||||
// it calls makeCallback() after noticing we return -1 from here.
|
||||
// the site was content.time.com in this case that we read 0
|
||||
// bytes on to indicate the read was done.
|
||||
g_errno = 0;
|
||||
// for debug. seems like content-length: is counting
|
||||
// the \r\n when it shouldn't be
|
||||
//char *xx=NULL;*xx=0;
|
||||
@ -2861,6 +2871,10 @@ int TcpServer::sslHandshake ( TcpSocket *s ) {
|
||||
(int32_t)sslError,r,iptoa(s->m_ip),sslMsg);
|
||||
|
||||
g_errno = ESSLERROR;
|
||||
// note in log
|
||||
log("tcp: ssl: try running "
|
||||
"'openssl s_client -connect www.hostnamehere.com:443 "
|
||||
"-debug' to debug the webserver on the other side.");
|
||||
// make sure read callback is registered
|
||||
// g_loop.registerReadCallback (s->m_sd,this,readSocketWrapper,
|
||||
// s->m_niceness);
|
||||
|
13 Threads.cpp
@ -671,6 +671,18 @@ bool ThreadQueue::init ( char threadType, int32_t maxThreads, int32_t maxEntries
|
||||
}
|
||||
|
||||
int32_t ThreadQueue::getNumThreadsOutOrQueued() {
|
||||
// MDW: we also need to count threads that are returned but need their
|
||||
// callback called so, in the case of RdbDump, the rdblist that was written
|
||||
// to disk can update the rdbmap before it gets saved, so it doesn't get
|
||||
// out of sync. Process.cpp calls .suspendMerge() to make sure that all
|
||||
// merge operations are suspended as well.
|
||||
int32_t n = 0;
|
||||
for ( int32_t i = 0 ; i < m_maxEntries ; i++ ) {
|
||||
ThreadEntry *e = &m_entries[i];
|
||||
if ( e->m_isOccupied ) n++;
|
||||
}
|
||||
return n;
|
||||
/*
|
||||
int32_t n = m_launched - m_returned;
|
||||
for ( int32_t i = 0 ; i < m_maxEntries ; i++ ) {
|
||||
ThreadEntry *e = &m_entries[i];
|
||||
@ -684,6 +696,7 @@ int32_t ThreadQueue::getNumThreadsOutOrQueued() {
|
||||
//}
|
||||
}
|
||||
return n;
|
||||
*/
|
||||
}
|
||||
|
||||
// return NULL and set g_errno on error
|
||||
|
72 Url.cpp
@ -32,6 +32,8 @@ void Url::reset() {
|
||||
//m_siteLen = 0;
|
||||
// ip related stuff
|
||||
m_ip = 0;
|
||||
// m_isWarcValid = false;
|
||||
// m_isArcValid = false;
|
||||
}
|
||||
|
||||
// set from another Url, does a copy
|
||||
@ -1426,13 +1428,79 @@ bool Url::isBadExtension ( int32_t version ) {
|
||||
s_badExtInitialized = true;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int myKey = hash64Lower_a(m_extension,m_elen);
|
||||
//zero unless we have a bad extention, otherwise
|
||||
//we return TR version in which it was banned
|
||||
int32_t badVersion = s_badExtTable.getValue(myKey);
|
||||
if (badVersion == 0) return false;
|
||||
if(badVersion <= version) return true;
|
||||
//if(badVersion <= version) return true;
|
||||
if ( badVersion > version ) return false;
|
||||
// exceptions for .warc.gz .warc .arc .argc.gz
|
||||
if ( isWarc() || isArc() ) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Url::isWarc ( ) {
|
||||
|
||||
// if ( ulen>8 && strncmp(uend-8,".warc.gz",8)==0 )
|
||||
// m_isWarc = true;
|
||||
// if ( ulen>8 && strncmp(uend-5,".warc" ,5)==0 )
|
||||
// m_isWarc = true;
|
||||
|
||||
// if ( ulen>8 && strncmp(uend-7,".arc.gz",7)==0 )
|
||||
// m_isArc = true;
|
||||
// if ( ulen>8 && strncmp(uend-4,".arc" ,4)==0 )
|
||||
// m_isArc = true;
|
||||
|
||||
if ( m_elen == 4 &&
|
||||
m_extension[0] == 'w' &&
|
||||
m_extension[1] == 'a' &&
|
||||
m_extension[2] == 'r' &&
|
||||
m_extension[3] == 'c' )
|
||||
return true;
|
||||
|
||||
if ( m_elen == 2 &&
|
||||
m_extension[0] == 'g' &&
|
||||
m_extension[1] == 'z' &&
|
||||
m_ulen > 10 &&
|
||||
m_extension[-1] == '.' &&
|
||||
m_extension[-2] == 'c' &&
|
||||
m_extension[-3] == 'r' &&
|
||||
m_extension[-4] == 'a' &&
|
||||
m_extension[-5] == 'w' &&
|
||||
m_extension[-6] == '.' ) {
|
||||
// m_isWarc = true;
|
||||
// m_isWarcValid = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool Url::isArc ( ) {
|
||||
|
||||
if ( m_elen == 3 &&
|
||||
m_extension[0] == 'a' &&
|
||||
m_extension[1] == 'r' &&
|
||||
m_extension[2] == 'c' )
|
||||
return true;
|
||||
|
||||
// hack to allow for .gz if it is .warc.gz or .arc.gz
|
||||
if ( m_elen == 2 &&
|
||||
m_extension[0] == 'g' &&
|
||||
m_extension[1] == 'z' &&
|
||||
m_ulen > 10 &&
|
||||
m_extension[-1] == '.' &&
|
||||
m_extension[-2] == 'c' &&
|
||||
m_extension[-3] == 'r' &&
|
||||
m_extension[-4] == 'a' &&
|
||||
m_extension[-5] == '.' ) {
|
||||
// m_isArc = true;
|
||||
// m_isArcValid = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
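Url::isWarc() and Url::isArc() above work off m_extension, and they also accept a ".gz" extension when the characters just before it spell ".warc." or ".arc.". A standalone sketch of the same suffix rules using plain C string functions (an illustration only, not the Url class code):

// Standalone illustration of the suffix rules above; not the Url class code.
#include <cstring>

static bool endsWith ( const char *s , const char *suffix ) {
	size_t sl = strlen(s), xl = strlen(suffix);
	return sl >= xl && strcmp ( s + sl - xl , suffix ) == 0;
}

static bool looksLikeWarc ( const char *path ) {
	return endsWith ( path , ".warc" ) || endsWith ( path , ".warc.gz" );
}

static bool looksLikeArc ( const char *path ) {
	return endsWith ( path , ".arc" ) || endsWith ( path , ".arc.gz" );
}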
|
||||
|
||||
|
5 Url.h
@ -92,6 +92,11 @@ public:
|
||||
bool isBadExtension(int32_t xxx);
|
||||
bool isSet() { return m_ulen != 0; }
|
||||
|
||||
// is this url a warc or arc url? i.e. ends in .warc or .arc or
|
||||
// .warc.gz or .arc.gz?
|
||||
bool isWarc ( );
|
||||
bool isArc ( );
|
||||
|
||||
// does it end in .xml, .rdb or .rss, etc. kinda thing
|
||||
//bool isRSSFormat ( ) ;
|
||||
|
||||
|
1375 XmlDoc.cpp (file diff suppressed because it is too large)
59 XmlDoc.h
@ -249,6 +249,8 @@ public:
|
||||
|
||||
#define MAX_XML_DOCS 4
|
||||
|
||||
#define MAXMSG7S 50
|
||||
|
||||
class XmlDoc {
|
||||
|
||||
public:
|
||||
@ -339,7 +341,7 @@ class XmlDoc {
|
||||
uint16_t m_isDiffbotJSONObject:1;
|
||||
uint16_t m_sentToDiffbot:1;
|
||||
uint16_t m_gotDiffbotSuccessfulReply:1;
|
||||
uint16_t m_reserved804:1;
|
||||
uint16_t m_useTimeAxis:1; // m_reserved804:1;
|
||||
uint16_t m_reserved805:1;
|
||||
uint16_t m_reserved806:1;
|
||||
uint16_t m_reserved807:1;
|
||||
@ -473,7 +475,9 @@ class XmlDoc {
|
||||
int32_t forcedIp = 0 ,
|
||||
uint8_t contentType = CT_HTML ,
|
||||
uint32_t spideredTime = 0 , // time_t
|
||||
bool contentHasMime = false ) ;
|
||||
bool contentHasMime = false ,
|
||||
// for container docs, what is the separator of subdocs?
|
||||
char *contentDelim = NULL ) ;
|
||||
|
||||
// we now call this right away rather than at download time!
|
||||
int32_t getSpideredTime();
|
||||
@ -499,6 +503,9 @@ class XmlDoc {
|
||||
void getRebuiltSpiderRequest ( class SpiderRequest *sreq ) ;
|
||||
bool indexDoc ( );
|
||||
bool indexDoc2 ( );
|
||||
bool isContainerDoc ( );
|
||||
bool indexContainerDoc ( );
|
||||
bool indexWarcOrArc ( char ct ) ;
|
||||
key_t *getTitleRecKey() ;
|
||||
//char *getSkipIndexing ( );
|
||||
char *prepareToMakeTitleRec ( ) ;
|
||||
@ -600,6 +607,7 @@ class XmlDoc {
|
||||
class Url *getFirstUrl() ;
|
||||
int64_t getFirstUrlHash48();
|
||||
int64_t getFirstUrlHash64();
|
||||
class Url **getLastRedirUrl() ;
|
||||
class Url **getRedirUrl() ;
|
||||
class Url **getMetaRedirUrl() ;
|
||||
class Url **getCanonicalRedirUrl ( ) ;
|
||||
@ -608,7 +616,7 @@ class XmlDoc {
|
||||
//int32_t *getNumBannedOutlinks ( ) ;
|
||||
uint16_t *getCountryId ( ) ;
|
||||
class XmlDoc **getOldXmlDoc ( ) ;
|
||||
bool isRobotsTxtFile ( char *url , int32_t urlLen ) ;
|
||||
//bool isRobotsTxtFile ( char *url , int32_t urlLen ) ;
|
||||
class XmlDoc **getExtraDoc ( char *url , int32_t maxCacheAge = 0 ) ;
|
||||
bool getIsPageParser ( ) ;
|
||||
class XmlDoc **getRootXmlDoc ( int32_t maxCacheAge = 0 ) ;
|
||||
@ -686,6 +694,8 @@ class XmlDoc {
|
||||
char **getRawUtf8Content ( ) ;
|
||||
char **getExpandedUtf8Content ( ) ;
|
||||
char **getUtf8Content ( ) ;
|
||||
// we download large files to a file on disk, like warcs and arcs
|
||||
File *getUtf8ContentInFile ( int64_t *fileSizeArg );
|
||||
int32_t *getContentHash32 ( ) ;
|
||||
int32_t *getContentHashJson32 ( ) ;
|
||||
//int32_t *getTagHash32 ( ) ;
|
||||
@ -799,6 +809,8 @@ class XmlDoc {
|
||||
bool hashContentType ( class HashTableX *table ) ;
|
||||
bool hashDMOZCategories ( class HashTableX *table ) ;
|
||||
bool hashLinks ( class HashTableX *table ) ;
|
||||
bool getUseTimeAxis ( ) ;
|
||||
SafeBuf *getTimeAxisUrl ( );
|
||||
bool hashUrl ( class HashTableX *table );
|
||||
bool hashDateNumbers ( class HashTableX *tt );
|
||||
bool hashSections ( class HashTableX *table ) ;
|
||||
@ -1009,6 +1021,7 @@ class XmlDoc {
|
||||
|
||||
Url m_redirUrl;
|
||||
Url *m_redirUrlPtr;
|
||||
Url *m_lastRedirUrlPtr;
|
||||
SafeBuf m_redirCookieBuf;
|
||||
Url m_metaRedirUrl;
|
||||
Url *m_metaRedirUrlPtr;
|
||||
@ -1044,6 +1057,33 @@ class XmlDoc {
|
||||
SafeBuf m_zbuf;
|
||||
SafeBuf m_kbuf;
|
||||
|
||||
// warc parsing member vars
|
||||
class Msg7 *m_msg7;
|
||||
class Msg7 *m_msg7s[MAXMSG7S];
|
||||
char *m_warcContentPtr;
|
||||
char *m_arcContentPtr;
|
||||
char *m_anyContentPtr;
|
||||
char *m_contentDelim;
|
||||
SafeBuf m_injectUrlBuf;
|
||||
bool m_subDocsHaveMime;
|
||||
int32_t m_warcError ;
|
||||
int32_t m_arcError ;
|
||||
bool m_doneInjectingWarc ;
|
||||
bool m_doneInjectingArc ;
|
||||
int64_t m_fileOff ;
|
||||
char *m_fileBuf ;
|
||||
int32_t m_fileBufAllocSize;
|
||||
char *m_fptr ;
|
||||
char *m_fptrEnd ;
|
||||
File m_file;
|
||||
int64_t m_fileSize;
|
||||
bool m_hasMoreToRead;
|
||||
int32_t m_numInjectionsOut;
|
||||
bool m_calledWgetThread;
|
||||
|
||||
// used by msg7 to store udp slot
|
||||
class UdpSlot *m_injectionSlot;
|
||||
|
||||
// . same thing, a little more complicated
|
||||
// . these classes are only set on demand
|
||||
Xml m_xml;
|
||||
@ -1114,6 +1154,8 @@ class XmlDoc {
|
||||
//bool m_storedVoteCache;
|
||||
//SafeBuf m_cacheRecBuf;
|
||||
|
||||
SafeBuf m_timeAxisUrl;
|
||||
|
||||
HashTableX m_turkVotingTable;
|
||||
HashTableX m_turkBitsTable;
|
||||
uint32_t m_confirmedTitleContentHash ;
|
||||
@ -1154,6 +1196,8 @@ class XmlDoc {
|
||||
class SafeBuf *m_savedSb;
|
||||
class HttpRequest *m_savedHr;
|
||||
|
||||
char m_savedChar;
|
||||
|
||||
|
||||
// validity flags. on reset() all these are set to false.
|
||||
char m_VALIDSTART;
|
||||
@ -1163,10 +1207,14 @@ class XmlDoc {
|
||||
char m_addedSpiderReplySizeValid;
|
||||
char m_addedStatusDocSizeValid;
|
||||
char m_downloadStartTimeValid;
|
||||
char m_contentDelimValid;
|
||||
char m_fileValid;
|
||||
//char m_docQualityValid;
|
||||
char m_siteValid;
|
||||
char m_startTimeValid;
|
||||
char m_currentUrlValid;
|
||||
char m_useTimeAxisValid;
|
||||
char m_timeAxisUrlValid;
|
||||
char m_firstUrlValid;
|
||||
char m_firstUrlHash48Valid;
|
||||
char m_firstUrlHash64Valid;
|
||||
@ -2397,7 +2445,10 @@ class XmlDoc {
|
||||
void (*callback)(void *state) ,
|
||||
|
||||
uint32_t firstIndexedTime = 0,
|
||||
uint32_t lastSpideredDate = 0 );
|
||||
uint32_t lastSpideredDate = 0 ,
|
||||
int32_t injectDocIp = 0 ,
|
||||
// for container docs consisting of subdocs to inject
|
||||
char *contentDelim = NULL );
|
||||
|
||||
|
||||
bool injectLinks ( HashTableX *linkDedupTable ,
|
||||
|
104 fctypes.cpp
@ -2408,6 +2408,83 @@ char *serializeMsg ( int32_t baseSize ,
|
||||
return buf;
|
||||
}
|
||||
|
||||
char *serializeMsg2 ( void *thisPtr ,
|
||||
int32_t objSize ,
|
||||
char **firstStrPtr ,
|
||||
int32_t *firstSizeParm ,
|
||||
int32_t *retSize ) {
|
||||
|
||||
// make a buffer to serialize into
|
||||
char *buf = NULL;
|
||||
int32_t baseSize = (char *)firstStrPtr - (char *)thisPtr;
|
||||
int nptrs=((char *)firstSizeParm-(char *)firstStrPtr)/sizeof(char *);
|
||||
int32_t need = baseSize;
|
||||
need += nptrs * sizeof(char *);
|
||||
need += nptrs * sizeof(int32_t);
|
||||
// tally up the string sizes
|
||||
int32_t *srcSizePtr = (int32_t *)firstSizeParm;
|
||||
char **srcStrPtr = (char **)firstStrPtr;
|
||||
int32_t totalStringSizes = 0;
|
||||
for ( int i = 0 ; i < nptrs ; i++ ) {
|
||||
if ( srcStrPtr[i] == NULL ) continue;
|
||||
totalStringSizes += srcSizePtr[i];
|
||||
}
|
||||
int32_t stringBufferOffset = need;
|
||||
need += totalStringSizes;
|
||||
// alloc if we should
|
||||
if ( ! buf ) buf = (char *)mmalloc ( need , "sm2" );
|
||||
// bail on error, g_errno should be set
|
||||
if ( ! buf ) return NULL;
|
||||
// set how many bytes we will serialize into
|
||||
*retSize = need;
|
||||
// copy everything over except strings themselves
|
||||
char *p = buf;
|
||||
gbmemcpy ( p , (char *)thisPtr , stringBufferOffset );//need );
|
||||
// point to the string buffer
|
||||
p += stringBufferOffset;
|
||||
// then store the strings!
|
||||
char **dstStrPtr = (char **)(buf + baseSize );
|
||||
int32_t *dstSizePtr = (int32_t *)(buf + baseSize+sizeof(char *)*nptrs);
|
||||
for ( int count = 0 ; count < nptrs ; count++ ) {
|
||||
// copy ptrs
|
||||
//*dstStrPtr = *srcStrPtr;
|
||||
//*dstSizePtr = *srcSizePtr;
|
||||
// if we are NULL, we are a "bookmark", so
|
||||
// we alloc'd space for it, but don't copy into
|
||||
// the space until after this call toe serialize()
|
||||
if ( ! *srcStrPtr )
|
||||
goto skip;
|
||||
// if this is valid then size can't be 0! fix upstream.
|
||||
if ( ! *srcSizePtr ) { char *xx=NULL;*xx=0; }
|
||||
// if size is 0 use gbstrlen. helps with InjectionRequest
|
||||
// where we set ptr_url or ptr_content but not size_url, etc.
|
||||
//if ( ! *srcSizePtr )
|
||||
// *srcSizePtr = gbstrlen(*strPtr);
|
||||
// sanity check -- cannot copy onto ourselves
|
||||
if ( p > *srcStrPtr && p < *srcStrPtr + *srcSizePtr ) {
|
||||
char *xx = NULL; *xx = 0; }
|
||||
// copy the string into the buffer
|
||||
gbmemcpy ( p , *srcStrPtr , *srcSizePtr );
|
||||
skip:
|
||||
// point it now into the string buffer
|
||||
*dstStrPtr = p;
|
||||
// if it is 0 length, make ptr NULL in destination
|
||||
if ( *srcSizePtr == 0 || *srcStrPtr == NULL ) {
|
||||
*dstStrPtr = NULL;
|
||||
*dstSizePtr = 0;
|
||||
}
|
||||
// advance our destination ptr
|
||||
p += *dstSizePtr;
|
||||
// advance both ptrs to next string
|
||||
srcSizePtr++;
|
||||
srcStrPtr++;
|
||||
dstSizePtr++;
|
||||
dstStrPtr++;
|
||||
}
|
||||
return buf;
|
||||
}
|
||||
|
||||
|
||||
// convert offsets back into ptrs
|
||||
int32_t deserializeMsg ( int32_t baseSize ,
|
||||
int32_t *firstSizeParm ,
|
||||
@ -2437,6 +2514,33 @@ int32_t deserializeMsg ( int32_t baseSize ,
|
||||
return baseSize + (p - stringBuf);//getStringBuf());
|
||||
}
|
||||
|
||||
void deserializeMsg2 ( char **firstStrPtr , // ptr_url
|
||||
int32_t *firstSizeParm ) { // size_url
|
||||
int nptrs=((char *)firstSizeParm-(char *)firstStrPtr)/sizeof(char *);
|
||||
// point to our string buffer
|
||||
char *p = ((char *)firstSizeParm + sizeof(int32_t)*nptrs);
|
||||
// then store the strings!
|
||||
int32_t *sizePtr = firstSizeParm;//getFirstSizeParm(); // &size_qbuf;
|
||||
//int32_t *sizeEnd = lastSizeParm;//getLastSizeParm (); // &size_displ
|
||||
char **strPtr = firstStrPtr;//getFirstStrPtr (); // &ptr_qbuf;
|
||||
int count = 0;
|
||||
for ( ; count < nptrs ; count++ ) { // sizePtr <= sizeEnd ; ) {
|
||||
// convert the offset to a ptr
|
||||
*strPtr = p;
|
||||
// make it NULL if size is 0 though
|
||||
if ( *sizePtr == 0 ) *strPtr = NULL;
|
||||
// sanity check
|
||||
if ( *sizePtr < 0 ) { char *xx = NULL; *xx =0; }
|
||||
// advance our destination ptr
|
||||
p += *sizePtr;
|
||||
// advance both ptrs to next string
|
||||
sizePtr++;
|
||||
strPtr++;
|
||||
}
|
||||
// return how many bytes we processed
|
||||
//return baseSize + (p - stringBuf);//getStringBuf());
|
||||
}
|
||||
|
||||
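serializeMsg2() above copies an object whose trailing members are a block of char* string pointers followed by a matching block of int32_t sizes, appends the referenced strings after the fixed part, and repoints the copied pointers into that appended region; deserializeMsg2() then rebuilds the pointers on the receiving side from the sizes alone. A usage sketch with a made-up request struct laid out the same way (the struct and field names are hypothetical):

// Hypothetical request laid out the way serializeMsg2()/deserializeMsg2()
// expect: all ptr_* members contiguous, followed by all size_* members.
#include <stdint.h>

class ExampleRequest {
public:
	int32_t  m_flag;        // fixed-size ("base") part
	char    *ptr_url;       // first string ptr
	char    *ptr_content;
	int32_t  size_url;      // first size parm, same order as the ptrs
	int32_t  size_content;
};

// sender side (sketch):
//   ExampleRequest req; ... fill m_flag, ptr_*/size_* ...
//   int32_t sendSize;
//   char *buf = serializeMsg2 ( &req , sizeof(req) ,
//                               &req.ptr_url , &req.size_url , &sendSize );
//   ... transmit buf/sendSize, then free the buffer ...
//
// receiver side (sketch): the strings arrive right after the struct image,
// so just fix up the pointers in place:
//   ExampleRequest *r = (ExampleRequest *)recvBuf;
//   deserializeMsg2 ( &r->ptr_url , &r->size_url );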
// print it to stdout for debugging Dates.cpp
|
||||
int32_t printTime ( time_t ttt ) {
|
||||
//char *s = ctime(&ttt);
|
||||
|
@ -612,6 +612,13 @@ char *serializeMsg ( int32_t baseSize ,
|
||||
char *userBuf ,
|
||||
int32_t userBufSize ,
|
||||
bool makePtrsRefNewBuf ) ;
|
||||
|
||||
char *serializeMsg2 ( void *thisPtr ,
|
||||
int32_t objSize ,
|
||||
char **firstStrPtr ,
|
||||
int32_t *firstSizeParm ,
|
||||
int32_t *retSize );
|
||||
|
||||
// convert offsets back into ptrs
|
||||
int32_t deserializeMsg ( int32_t baseSize ,
|
||||
int32_t *firstSizeParm ,
|
||||
@ -619,4 +626,6 @@ int32_t deserializeMsg ( int32_t baseSize ,
|
||||
char **firstStrPtr ,
|
||||
char *stringBuf ) ;
|
||||
|
||||
void deserializeMsg2 ( char **firstStrPtr , int32_t *firstSizeParm );
|
||||
|
||||
#endif
|
||||
|
BIN html/test.arc.gz (new file, binary file not shown)
BIN html/test.warc.gz (new file, binary file not shown)
218 qa.cpp
@ -75,6 +75,127 @@ void markOut ( char *content , char *needle ) {
|
||||
goto loop;
|
||||
}
|
||||
|
||||
|
||||
void markOut2 ( char *content , char *needle ) {
|
||||
|
||||
if ( ! content ) return;
|
||||
|
||||
int32_t nlen = gbstrlen(needle);
|
||||
|
||||
loop:
|
||||
|
||||
char *s = strstr ( content , needle );
|
||||
if ( ! s ) return;
|
||||
|
||||
// advance over name like "rand64=" to avoid hitting those digits
|
||||
//s += gbstrlen(needle);
|
||||
|
||||
for (int32_t i = 0 ; i < nlen ; i++ )
|
||||
*s++ = ' ';
|
||||
|
||||
//for ( ; *s && ! is_digit(*s); s++ );
|
||||
|
||||
// find end of digit stream
|
||||
//char *end = s;
|
||||
//while ( ; *end && is_digit(*s); end++ );
|
||||
// just bury the digit stream now, zeroing out was not
|
||||
// a consistent LENGTH if we had 10 hits vs 9... making the hash
|
||||
// different
|
||||
|
||||
// space out digits. including decimal point.
|
||||
//for ( ; *s && (is_digit(*s)||*s=='.'); s++ ) *s = ' ';
|
||||
|
||||
// loop for more for the "rand64=" thing
|
||||
content = s;
|
||||
goto loop;
|
||||
}
|
||||
|
||||
|
||||
void markOutBuf ( char *content ) {
|
||||
|
||||
// take out <responseTimeMS>
|
||||
markOut ( content , "<currentTimeUTC>");
|
||||
markOut ( content , "<responseTimeMS>");
|
||||
|
||||
// ...from an index of about 429 pages in 0.91 seconds in collection...
|
||||
markOut ( content , " pages in ");
|
||||
|
||||
// until i figure this one out, take it out
|
||||
markOut ( content , "<docsInCollection>");
|
||||
|
||||
markOut ( content , "spider is done (");
|
||||
markOut ( content , "spider is paused (");
|
||||
markOut ( content , "spider queue empty (");
|
||||
markOut ( content , "spider is active (");
|
||||
|
||||
markOut ( content , "<totalShards>");
|
||||
|
||||
// 3 Collections etc.
|
||||
markOut ( content , "/rocket.jpg></div></a></center><br><br><div style=\"width:190px;padding:4px;margin-left:10px;background-color:white;border-top-left-radius:10px;border-bottom-left-radius:10px;border-color:blue;border-width:3px;border-style:solid;margin-right:-3px;border-right-color:white;overflow-y:auto;overflow-x:hidden;line-height:23px;color:black;\"><center><nobr><b>" );
|
||||
|
||||
// until i figure this one out, take it out
|
||||
markOut ( content , "<hits>");
|
||||
|
||||
// for those links in the html pages
|
||||
markOut ( content, "rand64=");
|
||||
|
||||
// for json
|
||||
markOut ( content , "\"currentTimeUTC\":" );
|
||||
markOut ( content , "\"responseTimeMS\":");
|
||||
markOut ( content , "\"docsInCollection\":");
|
||||
|
||||
// if the results are in json, then status doc is encoded json
|
||||
markOut ( content , "\\\"gbssDownloadStartTime\\\":");
|
||||
markOut ( content , "\\\"gbssDownloadEndTime\\\":");
|
||||
markOut ( content , "\\\"gbssDownloadStartTimeMS\\\":");
|
||||
markOut ( content , "\\\"gbssDownloadEndTimeMS\\\":");
|
||||
markOut ( content , "\\\"gbssDownloadDurationMS\\\":");
|
||||
markOut ( content , "\\\"gbssAgeInIndex\\\":");
|
||||
markOut ( content , "\\\"gbssDiscoveredTime\\\":");
|
||||
|
||||
|
||||
// if the results are in xml, then the status doc is xml encoded
|
||||
markOut ( content , "\"gbssDownloadStartTime\":");
|
||||
markOut ( content , "\"gbssDownloadEndTime\":");
|
||||
markOut ( content , "\"gbssDownloadStartTimeMS\":");
|
||||
markOut ( content , "\"gbssDownloadEndTimeMS\":");
|
||||
markOut ( content , "\"gbssDownloadDurationMS\":");
|
||||
markOut ( content , "\"gbssAgeInIndex\":");
|
||||
|
||||
|
||||
// for xml
|
||||
markOut ( content , "<currentTimeUTC>" );
|
||||
markOut ( content , "<responseTimeMS>");
|
||||
markOut ( content , "<docsInCollection>");
|
||||
markOut ( content , "<firstIndexedDateUTC>");
|
||||
|
||||
// indexed 1 day ago
|
||||
markOut ( content,"indexed:");
|
||||
// modified 1 day ago
|
||||
markOut ( content,"modified:");
|
||||
|
||||
// s_gigabitCount... it is perpetually incrementing static counter
|
||||
// in PageResults.cpp
|
||||
markOut(content,"ccc(");
|
||||
markOut(content,"id=fd");
|
||||
markOut(content,"id=sd");
|
||||
|
||||
// for some reason the term freq seems to change a little in
|
||||
// the scoring table
|
||||
markOut(content,"id=tf");
|
||||
|
||||
// # of collections in the admin page: ..."4 Collections"
|
||||
markOut(content,"px;color:black;\"><center><nobr><b>");
|
||||
|
||||
markOut(content,"spider is done (");
|
||||
markOut(content,"spider is paused (");
|
||||
markOut(content,"spider is active (");
|
||||
markOut(content,"spider queue empty (");
|
||||
|
||||
markOut2(content,"bgcolor=#c0c0f0");
|
||||
markOut2(content,"bgcolor=#d0d0e0");
|
||||
}
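markOutBuf() blanks out every volatile field in a reply (timestamps, response times, perpetually incrementing counters) so two replies can be compared by checksum or diff without spurious mismatches. A short usage sketch, assuming qa_hash32() below is the checksum helper and that both functions are visible at the call site:

// Usage sketch (illustrative): normalize a reply in place, then checksum it
// so fields like <responseTimeMS> cannot change the hash.
static int32_t checksumReply ( char *reply ) {
	markOutBuf ( reply );          // spaces out timestamps, counters, etc.
	return qa_hash32 ( reply );    // hash of what is left
}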
|
||||
|
||||
// do not hash
|
||||
int32_t qa_hash32 ( char *s ) {
|
||||
uint32_t h = 0;
|
||||
@ -171,84 +292,8 @@ void processReply ( char *reply , int32_t replyLen ) {
|
||||
|
||||
s_content = content;
|
||||
|
||||
// take out <responseTimeMS>
|
||||
markOut ( content , "<currentTimeUTC>");
|
||||
markOut ( content , "<responseTimeMS>");
|
||||
markOutBuf ( content );
|
||||
|
||||
// ...from an index of about 429 pages in 0.91 seconds in collection...
|
||||
markOut ( content , " pages in ");
|
||||
|
||||
// until i figure this one out, take it out
|
||||
markOut ( content , "<docsInCollection>");
|
||||
|
||||
markOut ( content , "spider is done (");
|
||||
markOut ( content , "spider is paused (");
|
||||
markOut ( content , "spider queue empty (");
|
||||
markOut ( content , "spider is active (");
|
||||
|
||||
markOut ( content , "<totalShards>");
|
||||
|
||||
// 3 Collections etc.
|
||||
markOut ( content , "/rocket.jpg></div></a></center><br><br><div style=\"width:190px;padding:4px;margin-left:10px;background-color:white;border-top-left-radius:10px;border-bottom-left-radius:10px;border-color:blue;border-width:3px;border-style:solid;margin-right:-3px;border-right-color:white;overflow-y:auto;overflow-x:hidden;line-height:23px;color:black;\"><center><nobr><b>" );
|
||||
|
||||
// until i figure this one out, take it out
|
||||
markOut ( content , "<hits>");
|
||||
|
||||
// for those links in the html pages
|
||||
markOut ( content, "rand64=");
|
||||
|
||||
// for json
|
||||
markOut ( content , "\"currentTimeUTC\":" );
|
||||
markOut ( content , "\"responseTimeMS\":");
|
||||
markOut ( content , "\"docsInCollection\":");
|
||||
|
||||
// if the results are in json, then status doc is encoded json
|
||||
markOut ( content , "\\\"gbssDownloadStartTime\\\":");
|
||||
markOut ( content , "\\\"gbssDownloadEndTime\\\":");
|
||||
markOut ( content , "\\\"gbssDownloadStartTimeMS\\\":");
|
||||
markOut ( content , "\\\"gbssDownloadEndTimeMS\\\":");
|
||||
markOut ( content , "\\\"gbssDownloadDurationMS\\\":");
|
||||
markOut ( content , "\\\"gbssAgeInIndex\\\":");
|
||||
markOut ( content , "\\\"gbssDiscoveredTime\\\":");
|
||||
|
||||
|
||||
// if the results are in xml, then the status doc is xml encoded
|
||||
markOut ( content , "\"gbssDownloadStartTime\":");
|
||||
markOut ( content , "\"gbssDownloadEndTime\":");
|
||||
markOut ( content , "\"gbssDownloadStartTimeMS\":");
|
||||
markOut ( content , "\"gbssDownloadEndTimeMS\":");
|
||||
markOut ( content , "\"gbssDownloadDurationMS\":");
|
||||
markOut ( content , "\"gbssAgeInIndex\":");
|
||||
|
||||
|
||||
// for xml
|
||||
markOut ( content , "<currentTimeUTC>" );
|
||||
markOut ( content , "<responseTimeMS>");
|
||||
markOut ( content , "<docsInCollection>");
|
||||
markOut ( content , "<firstIndexedDateUTC>");
|
||||
|
||||
// indexed 1 day ago
|
||||
markOut ( content,"indexed:");
|
||||
// modified 1 day ago
|
||||
markOut ( content,"modified:");
|
||||
|
||||
// s_gigabitCount... it is perpetually incrementing static counter
|
||||
// in PageResults.cpp
|
||||
markOut(content,"ccc(");
|
||||
markOut(content,"id=fd");
|
||||
markOut(content,"id=sd");
|
||||
|
||||
// for some reason the term freq seems to change a little in
|
||||
// the scoring table
|
||||
markOut(content,"id=tf");
|
||||
|
||||
// # of collections in the admin page: ..."4 Collections"
|
||||
markOut(content,"px;color:black;\"><center><nobr><b>");
|
||||
|
||||
markOut(content,"spider is done (");
|
||||
markOut(content,"spider is paused (");
|
||||
markOut(content,"spider is active (");
|
||||
markOut(content,"spider queue empty (");
|
||||
|
||||
// make checksum. we ignore back to back spaces so this
|
||||
// hash works for <docsInCollection>10 vs <docsInCollection>9
|
||||
@ -361,9 +406,26 @@ void processReply ( char *reply , int32_t replyLen ) {
|
||||
fb1.load(fn1);
|
||||
fb1.nullTerm();
|
||||
|
||||
// markout both
|
||||
markOutBuf ( fb1.getBufStart() );
|
||||
markOutBuf ( fb2.getBufStart() );
|
||||
|
||||
// save temps
|
||||
SafeBuf tmpfn1;
|
||||
SafeBuf tmpfn2;
|
||||
tmpfn1.safePrintf("%strash/tmpdiff1.txt",g_hostdb.m_dir);
|
||||
tmpfn2.safePrintf("%strash/tmpdiff2.txt",g_hostdb.m_dir);
|
||||
fb1.save(tmpfn1.getBufStart());
|
||||
fb2.save(tmpfn2.getBufStart());
|
||||
|
||||
// do the diff between the two replies so we can see what changed
|
||||
// now do the diffs between the marked out versions so it is less
|
||||
// spammy
|
||||
char cmd[1024];
|
||||
sprintf(cmd,"diff %s %s > /tmp/diffout",fn1,fn2);
|
||||
sprintf(cmd,"diff %s %s > /tmp/diffout",
|
||||
tmpfn1.getBufStart(),
|
||||
tmpfn2.getBufStart());
|
||||
//fn1,fn2);
|
||||
//log("qa: %s\n",cmd);
|
||||
gbsystem(cmd);
|
||||
|
||||
|