fix url filter bugs.
commit 487d3f0a0e
parent 39d9760e5d
CollectionRec.cpp
@@ -305,6 +305,9 @@ void CollectionRec::setUrlFiltersToDefaults ( ) {
 	m_spidersEnabled[n] = 1;
 	m_numRegExs7++;
+
+	m_spiderSendToDiffbot[n] = 1;
+	m_numRegExs11++;
 }

 /*
CollectionRec.h
@@ -459,6 +459,11 @@ class CollectionRec {
 	long m_numRegExs7;
 	char m_spidersEnabled [ MAX_FILTERS ];
+
+	// should urls in this queue be sent to diffbot for processing
+	// when we are trying to index them?
+	long m_numRegExs11;
+	char m_spiderSendToDiffbot [ MAX_FILTERS ];

 	// dummy?
 	long m_numRegExs9;
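For context, url filters are stored as parallel arrays indexed by rule number, and each column carries its own m_numRegExsN element count; the bug fixed above is that the new m_spiderSendToDiffbot column was not being written and counted alongside the others. A minimal sketch of why the counts must move in lockstep, using a hypothetical mini-struct (not the real CollectionRec, and 96 is only a stand-in for MAX_FILTERS):

#include <cstdio>

#define MINI_MAX_FILTERS 96 // stand-in value for illustration only

// Hypothetical miniature of the parallel-array layout: one entry per
// url-filter rule, each column with its own element count.
struct MiniRec {
	char m_spidersEnabled     [ MINI_MAX_FILTERS ]; long m_numRegExs7;
	char m_spiderSendToDiffbot[ MINI_MAX_FILTERS ]; long m_numRegExs11;
};

static void addRule ( MiniRec *r , long n , char enabled , char sendToDiffbot ) {
	// every column must be written AND counted for the same row n,
	// otherwise one column ends up shorter than the table it renders in
	r->m_spidersEnabled     [n] = enabled;       r->m_numRegExs7++;
	r->m_spiderSendToDiffbot[n] = sendToDiffbot; r->m_numRegExs11++;
}

int main ( ) {
	MiniRec r = {};
	addRule ( &r , 0 , 1 , 1 );
	printf ( "rows: %ld == %ld\n" , r.m_numRegExs7 , r.m_numRegExs11 );
	return 0;
}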
Diffbot.cpp
@@ -2469,6 +2469,7 @@ CollectionRec *addNewDiffbotColl ( HttpRequest *hr ) {
 		cr->m_spiderIpMaxSpiders[i] = 10;
 		cr->m_spidersEnabled    [i] = 1;
 		cr->m_spiderFreqs       [i] = 7.0;
+		cr->m_spiderSendToDiffbot[i] = 0;
 	}

@@ -2485,57 +2486,33 @@ CollectionRec *addNewDiffbotColl ( HttpRequest *hr ) {
 	i++;
-	// if user did not specify a url crawl pattern then keep
-	// the crawl limited to the same subdomain of the seed url
-	if ( cr->m_diffbotUrlCrawlPattern.length() == 0 ) {
-		// first limit to http://subdomain
-		cr->m_regExs[i].safePrintf("isonsite");//^http://");
-		//cr->m_regExs[i].safeMemcpy(norm.getHost(),norm.getHostLen());
-		//cr->m_regExs[i].pushChar('/');
-		cr->m_regExs[i].nullTerm();
-		cr->m_spiderPriorities  [i] = 50;
-		cr->m_maxSpidersPerRule [i] = 10;
-		cr->m_spiderIpWaits     [i] = 250; // 500 ms for now
-		cr->m_spiderIpMaxSpiders[i] = 10;
-		cr->m_spidersEnabled    [i] = 1;
-		i++;
-		/*
-		// then include HTTPS
-		cr->m_regExs[i].safePrintf("^https://");
-		cr->m_regExs[i].safeMemcpy(norm.getHost(),norm.getHostLen());
-		cr->m_regExs[i].pushChar('/');
-		cr->m_regExs[i].nullTerm();
-		cr->m_spiderPriorities  [i] = 50;
-		cr->m_maxSpidersPerRule [i] = 10;
-		cr->m_spiderIpWaits     [i] = 250; // 500 ms for now
-		cr->m_spiderIpMaxSpiders[i] = 10;
-		cr->m_spidersEnabled    [i] = 1;
-		i++;
-		*/
-		// and make all else filtered
-		cr->m_regExs[i].safePrintf("default");
-		cr->m_spiderPriorities  [i] = SPIDER_PRIORITY_FILTERED;
-		cr->m_maxSpidersPerRule [i] = 10;
-		cr->m_spiderIpWaits     [i] = 250; // 500 ms for now
-		cr->m_spiderIpMaxSpiders[i] = 10;
-		cr->m_spidersEnabled    [i] = 1;
-		i++;
-	}
-	else {
-		cr->m_regExs[i].safePrintf("default");
-		cr->m_spiderPriorities  [i] = 50;
-		cr->m_maxSpidersPerRule [i] = 10;
-		cr->m_spiderIpWaits     [i] = 250; // 500 ms for now
-		cr->m_spiderIpMaxSpiders[i] = 10;
-		cr->m_spidersEnabled    [i] = 1;
-		i++;
-	}
+	//if ( cr->m_diffbotUrlCrawlPattern.length() == 0 ) {
+	// first limit to http://subdomain
+	cr->m_regExs[i].safePrintf("isonsite");//^http://");
+	//cr->m_regExs[i].safeMemcpy(norm.getHost(),norm.getHostLen());
+	//cr->m_regExs[i].pushChar('/');
+	cr->m_regExs[i].nullTerm();
+	cr->m_spiderPriorities  [i] = 50;
+	cr->m_maxSpidersPerRule [i] = 10;
+	cr->m_spiderIpWaits     [i] = 250; // 500 ms for now
+	cr->m_spiderIpMaxSpiders[i] = 10;
+	cr->m_spidersEnabled    [i] = 1;
+	i++;
+	// and make all else filtered
+	cr->m_regExs[i].safePrintf("default");
+	cr->m_spiderPriorities  [i] = SPIDER_PRIORITY_FILTERED;
+	cr->m_maxSpidersPerRule [i] = 10;
+	cr->m_spiderIpWaits     [i] = 250; // 500 ms for now
+	cr->m_spiderIpMaxSpiders[i] = 10;
+	cr->m_spidersEnabled    [i] = 1;
+	i++;

 	// just the default rule!
 	cr->m_numRegExs   = i;
 	cr->m_numRegExs2  = i;
 	cr->m_numRegExs3  = i;
 	cr->m_numRegExs10 = i;
+	cr->m_numRegExs11 = i;
 	cr->m_numRegExs5  = i;
 	cr->m_numRegExs6  = i;
 	cr->m_numRegExs7  = i;

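For orientation, the resulting default table for a new diffbot collection has just two rows: urls on the seed's own site match "isonsite" and spider at priority 50, and everything else falls through to "default" and is filtered. A minimal sketch of that first-match semantics, with a hypothetical MiniRule struct and a stand-in for SPIDER_PRIORITY_FILTERED (neither is the real layout):

#include <cstring>
#include <cstdio>

#define MINI_FILTERED -3 // stand-in for SPIDER_PRIORITY_FILTERED

// Hypothetical, simplified view of the two rules installed above.
struct MiniRule { const char *expr; int priority; };

static int getPriority ( bool urlIsOnSite ) {
	static const MiniRule rules[] = {
		{ "isonsite" , 50            },
		{ "default"  , MINI_FILTERED },
	};
	// first matching rule wins, mirroring url-filter evaluation order
	for ( long i = 0 ; i < 2 ; i++ ) {
		if ( ! strcmp ( rules[i].expr , "isonsite" ) && ! urlIsOnSite )
			continue;
		return rules[i].priority; // "default" matches everything
	}
	return MINI_FILTERED;
}

int main ( ) {
	printf ( "onsite=%d offsite=%d\n" ,
		 getPriority(true) , getPriority(false) );
	return 0;
}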
Parms.cpp
@@ -444,7 +444,17 @@ bool Parms::sendPageGeneric ( TcpSocket *s , HttpRequest *r , long page ,
 		       "</td></tr>%s%s\n",
 		       LIGHT_BLUE,DARK_BLUE,tt,bb,e1,e2);

-	char *THIS = g_parms.getTHIS ( r , page );
+	char *THIS ;
+	// when being called from Diffbot.cpp crawlbot page it is kind of
+	// hacky and we want to print the url filters for the supplied
+	// collection dictated by collOveride. if we don't have this
+	// fix here it ends up printing the url filters for the default
+	// "main" collection
+	if ( collOveride )
+		THIS = (char *)g_collectiondb.getRec(collOveride);
+	else
+		THIS = g_parms.getTHIS ( r , page );
+
 	if ( ! THIS ) {
 		log("admin: Could not get parameter object.");
 		return g_httpServer.sendErrorReply ( s , 505 , "Bad Request");
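The cast of a CollectionRec* to char* works because Parms treats THIS as an opaque byte base: each Parm records m_off, a byte offset computed in init() (see the (char *)cr.m_spiderSendToDiffbot - x line later in this commit), and values are read and written at THIS + m_off + j. A minimal sketch of that addressing scheme, with hypothetical Mini* names rather than the real Parms layout:

#include <cstdio>

// Hypothetical miniature of Gigablast's offset-based parm addressing.
struct MiniCollRec { long m_numRegExs; char m_spidersEnabled[8]; };

struct MiniParm { long m_off; }; // byte offset into the record

int main ( ) {
	MiniCollRec cr = {};
	MiniCollRec sample; // used only to compute offsets, like "cr" in Parms::init()
	MiniParm p;
	p.m_off = (char *)sample.m_spidersEnabled - (char *)&sample;

	char *THIS = (char *)&cr;          // any record works as a raw base
	*(char *)(THIS + p.m_off + 3) = 1; // set row j=3, like Parms::setParm()
	printf ( "enabled[3]=%d\n" , cr.m_spidersEnabled[3] );
	return 0;
}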
@@ -2013,6 +2023,8 @@ bool Parms::printParm ( SafeBuf* sb,
 			cgi,size);
 	//sb->dequote ( s , gbstrlen(s) );
+	SafeBuf *sx = (SafeBuf *)s;
 	// note it
 	//log("hack: %s",sx->getBufStart());
+	sb->dequote ( sx->getBufStart() , sx->length() );
 	sb->safePrintf ("\">");
 }
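The dequote here matters because the url-filter value is echoed back inside a value="..." HTML attribute (note the closing sb->safePrintf ("\">") just after): an unescaped double quote in the stored expression would terminate the attribute early and break the form. A hedged sketch of the kind of escaping involved, not SafeBuf's actual implementation:

#include <cstdio>
#include <cstring>

// Minimal stand-in for what an HTML-attribute "dequote" must do:
// escape double quotes so value="..." is not terminated early.
static void printDequoted ( const char *s , long len ) {
	for ( long i = 0 ; i < len ; i++ ) {
		if ( s[i] == '"' ) fputs ( "&#34;" , stdout );
		else               fputc ( s[i] , stdout );
	}
}

int main ( ) {
	const char *expr = "title:\"cheap deals\""; // hypothetical filter value
	printf ( "<input type=text value=\"" );
	printDequoted ( expr , strlen(expr) );
	printf ( "\">\n" );
	return 0;
}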
@@ -2645,7 +2657,7 @@ void Parms::setParm ( char *THIS , Parm *m , long mm , long j , char *s ,
 			return;
 		if ( fromRequest)oldVal = (float)*(char *)(THIS + m->m_off +j);
 		*(char *)(THIS + m->m_off + j) = atol ( s );
-		newVal = (float)*(char *)(THIS + m->m_off + j);
+		newVal = (float)*(char *)(THIS + m->m_off + j);
 		goto changed; }
 	else if ( t == TYPE_CMD ) {
 		log(LOG_LOGIC, "conf: Parms: TYPE_CMD is not a cgi var.");
@@ -2720,6 +2732,9 @@ void Parms::setParm ( char *THIS , Parm *m , long mm , long j , char *s ,
 	else len = sb->htmlDecode (s,len,false,0);
+	// ensure null terminated
+	sb->nullTerm();
+	// note it
+	//log("hack: %s",s);

 	// null term it all
 	//dst[len] = '\0';
 	//sb->reserve ( 1 );
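The nullTerm() call matters because the decoded bytes are later consumed as a C string; the buffer's length() deliberately excludes the terminator. A sketch of that invariant with a hypothetical mini-buffer, assuming SafeBuf-like semantics rather than reproducing the real class:

#include <cstdlib>
#include <cstring>
#include <cstdio>

// Hypothetical miniature of a growable buffer that keeps the same
// invariant the diff enforces: data can always be null terminated,
// and the terminator is never counted in length().
struct MiniBuf {
	char *m_buf = nullptr; long m_len = 0; long m_cap = 0;
	void append ( const char *s , long n ) {
		if ( m_len + n + 1 > m_cap ) { // +1 reserves terminator room
			m_cap = ( m_len + n + 1 ) * 2;
			m_buf = (char *)realloc ( m_buf , m_cap );
		}
		memcpy ( m_buf + m_len , s , n );
		m_len += n;
	}
	void nullTerm ( ) { m_buf[m_len] = '\0'; } // safe: append reserved +1
	long length ( ) { return m_len; }
};

int main ( ) {
	MiniBuf sb;
	sb.append ( "isonsite" , 8 );
	sb.nullTerm();
	printf ( "%s (%ld bytes)\n" , sb.m_buf , sb.length() );
	return 0;
}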
@@ -2835,6 +2850,15 @@ void Parms::setToDefault ( char *THIS ) {
 	// init if we should
 	init();

+	// . clear out any coll rec to get the sendToDiffbot checkboxes
+	// . this is a backwards-compatibility hack since this new parm
+	//   will not be in old coll.conf files and will not be properly
+	//   initialized when displaying a url filter row.
+	if ( THIS != (char *)&g_conf ) {
+		CollectionRec *cr = (CollectionRec *)THIS;
+		memset ( cr->m_spiderSendToDiffbot , 0 , MAX_FILTERS);
+	}
+
 	for ( long i = 0 ; i < m_numParms ; i++ ) {
 		Parm *m = &m_parms[i];
 		if ( m->m_type == TYPE_COMMENT ) continue;
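Why the memset works as a backwards-compatibility default: setToDefault() runs before the coll.conf file is parsed, so a <spiderSendToDiffbot> tag missing from an old file simply leaves every row at 0 instead of displaying garbage in the checkbox column. A schematic of that load order, with a hypothetical parse step and a stand-in MAX_FILTERS:

#include <cstring>

#define MINI_MAX_FILTERS 96 // stand-in; the real constant is in the headers

struct MiniRec { char m_spiderSendToDiffbot [ MINI_MAX_FILTERS ]; };

// Hypothetical: only tags actually present in the file overwrite defaults.
static void parseConf ( MiniRec *cr , bool fileHasNewTag ) {
	if ( fileHasNewTag ) cr->m_spiderSendToDiffbot[0] = 1;
	// an old coll.conf has no such tag, so nothing is written here
}

int main ( ) {
	MiniRec cr;
	// setToDefault(): zero the new parm so old files get a sane value
	memset ( cr.m_spiderSendToDiffbot , 0 , MINI_MAX_FILTERS );
	parseConf ( &cr , false ); // old file: rows stay 0, boxes unchecked
	return 0;
}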
@@ -12408,10 +12432,21 @@ void Parms::init ( ) {
 	m->m_type = TYPE_PRIORITY2; // includes UNDEFINED priority in dropdown
 	m->m_page = PAGE_FILTERS;
 	m->m_rowid = 1;
 	m->m_addin = 1; // "insert" follows?
 	m->m_def = "";
 	m++;

+	m->m_title = "send to diffbot";
+	m->m_cgi = "stdb";
+	m->m_xml = "spiderSendToDiffbot";
+	m->m_max = MAX_FILTERS;
+	m->m_off = (char *)cr.m_spiderSendToDiffbot - x;
+	m->m_type = TYPE_CHECKBOX;
+	m->m_def = "0";
+	m->m_page = PAGE_FILTERS;
+	m->m_rowid = 1;
+	m->m_addin = 1; // "insert" follows?
+	m++;

 	//m->m_title = "<a href=/overview.html#ruleset>ruleset</a>";
 	//m->m_cgi = "frs";
 	//m->m_xml = "filterRuleset";
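For context, each m++ block above registers one column of the url-filters table: m_cgi names the form variable ("stdb"), m_xml the coll.conf tag, m_off the byte offset into CollectionRec, and a shared m_rowid makes the columns render in the same table row. A hedged miniature of that cursor-style registration, with a hypothetical MiniParm rather than the real Parm class:

// Hypothetical miniature of the Parms::init() registration pattern:
// a cursor walks a static table and each block fills in one column.
struct MiniParm {
	const char *m_title , *m_cgi , *m_xml , *m_def;
	long  m_off , m_max;
	char  m_type , m_rowid;
};

#define MINI_TYPE_CHECKBOX 1

static MiniParm s_parms [ 8 ];

static long registerParms ( long sendToDiffbotOff ) {
	MiniParm *m = s_parms;
	m->m_title = "send to diffbot";
	m->m_cgi   = "stdb";                // name of the cgi form variable
	m->m_xml   = "spiderSendToDiffbot"; // tag written to coll.conf
	m->m_max   = 96;                    // stand-in: one value per filter row
	m->m_off   = sendToDiffbotOff;      // byte offset into the record
	m->m_type  = MINI_TYPE_CHECKBOX;
	m->m_def   = "0";                   // unchecked unless conf overrides
	m->m_rowid = 1;                     // same rowid => same table row
	m++;
	return m - s_parms;                 // number of parms registered
}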
gb.conf
@@ -573,6 +573,7 @@
 <logDebugBuildMessages>0</>
 <logDebugBuildTimeMessages>0</>
 <logDebugDatabaseMessages>0</>
+<logDebugDirtyMessages>0</>
 <logDebugDiskMessages>0</>
 <logDebugDnsMessages>0</>
 <logDebugHttpMessages>0</>