fix url filter bugs: hook up the new per-rule "send to diffbot" checkbox and print the requested collection's url filters on the crawlbot page instead of the default "main" collection's.

Matt Wells 2013-09-18 11:02:09 -07:00
parent 39d9760e5d
commit 487d3f0a0e
5 changed files with 69 additions and 48 deletions

@@ -305,6 +305,9 @@ void CollectionRec::setUrlFiltersToDefaults ( ) {
m_spidersEnabled[n] = 1;
m_numRegExs7++;
m_spiderSendToDiffbot[n] = 1;
m_numRegExs11++;
}
/*

@@ -459,6 +459,11 @@ class CollectionRec {
long m_numRegExs7;
char m_spidersEnabled [ MAX_FILTERS ];
// should urls in this queue be sent to diffbot for processing
// when we are trying to index them?
long m_numRegExs11;
char m_spiderSendToDiffbot [ MAX_FILTERS ];
// dummy?
long m_numRegExs9;
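
For context: CollectionRec stores the url filters table as parallel arrays, one column per field, and each column carries its own element count (m_numRegExs, m_numRegExs7, the new m_numRegExs11, and so on). Every count must move in lock step or the columns desynchronize, which is the class of bug fixed above. A minimal sketch of the idiom; FilterTable, addRule and the array bound are hypothetical names, not from the source:

    // parallel-array idiom: every column has its own count, and a rule is
    // only consistent if all counts are bumped together, exactly what
    // setUrlFiltersToDefaults() does above with m_numRegExs7/m_numRegExs11
    enum { MAX_FILTERS_SKETCH = 96 }; // bound is arbitrary for this sketch

    struct FilterTable {
        long m_numSpidersEnabled;                     // like m_numRegExs7
        char m_spidersEnabled [ MAX_FILTERS_SKETCH ];
        long m_numSendToDiffbot;                      // like m_numRegExs11
        char m_sendToDiffbot  [ MAX_FILTERS_SKETCH ];

        bool addRule ( char enabled , char sendToDiffbot ) {
            if ( m_numSpidersEnabled >= MAX_FILTERS_SKETCH ) return false;
            long n = m_numSpidersEnabled;
            m_spidersEnabled [n] = enabled;       m_numSpidersEnabled++;
            m_sendToDiffbot  [n] = sendToDiffbot; m_numSendToDiffbot++;
            return true;
        }
    };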

@@ -2469,6 +2469,7 @@ CollectionRec *addNewDiffbotColl ( HttpRequest *hr ) {
cr->m_spiderIpMaxSpiders[i] = 10;
cr->m_spidersEnabled [i] = 1;
cr->m_spiderFreqs [i] = 7.0;
cr->m_spiderSendToDiffbot[i] = 0;
}
@@ -2485,57 +2486,33 @@ CollectionRec *addNewDiffbotColl ( HttpRequest *hr ) {
i++;
// if user did not specify a url crawl pattern then keep
// the crawl limited to the same subdomain of the seed url
if ( cr->m_diffbotUrlCrawlPattern.length() == 0 ) {
// first limit to http://subdomain
cr->m_regExs[i].safePrintf("isonsite");//^http://");
//cr->m_regExs[i].safeMemcpy(norm.getHost(),norm.getHostLen());
//cr->m_regExs[i].pushChar('/');
cr->m_regExs[i].nullTerm();
cr->m_spiderPriorities [i] = 50;
cr->m_maxSpidersPerRule [i] = 10;
cr->m_spiderIpWaits [i] = 250; // 500 ms for now
cr->m_spiderIpMaxSpiders[i] = 10;
cr->m_spidersEnabled [i] = 1;
i++;
/*
// then include HTTPS
cr->m_regExs[i].safePrintf("^https://");
cr->m_regExs[i].safeMemcpy(norm.getHost(),norm.getHostLen());
cr->m_regExs[i].pushChar('/');
cr->m_regExs[i].nullTerm();
cr->m_spiderPriorities [i] = 50;
cr->m_maxSpidersPerRule [i] = 10;
cr->m_spiderIpWaits [i] = 250; // 500 ms for now
cr->m_spiderIpMaxSpiders[i] = 10;
cr->m_spidersEnabled [i] = 1;
i++;
*/
// and make all else filtered
cr->m_regExs[i].safePrintf("default");
cr->m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
cr->m_maxSpidersPerRule [i] = 10;
cr->m_spiderIpWaits [i] = 250; // 500 ms for now
cr->m_spiderIpMaxSpiders[i] = 10;
cr->m_spidersEnabled [i] = 1;
i++;
}
else {
cr->m_regExs[i].safePrintf("default");
cr->m_spiderPriorities [i] = 50;
cr->m_maxSpidersPerRule [i] = 10;
cr->m_spiderIpWaits [i] = 250; // 500 ms for now
cr->m_spiderIpMaxSpiders[i] = 10;
cr->m_spidersEnabled [i] = 1;
i++;
}
//if ( cr->m_diffbotUrlCrawlPattern.length() == 0 ) {
// first limit to http://subdomain
cr->m_regExs[i].safePrintf("isonsite");//^http://");
//cr->m_regExs[i].safeMemcpy(norm.getHost(),norm.getHostLen());
//cr->m_regExs[i].pushChar('/');
cr->m_regExs[i].nullTerm();
cr->m_spiderPriorities [i] = 50;
cr->m_maxSpidersPerRule [i] = 10;
cr->m_spiderIpWaits [i] = 250; // 250 ms for now
cr->m_spiderIpMaxSpiders[i] = 10;
cr->m_spidersEnabled [i] = 1;
i++;
// and make all else filtered
cr->m_regExs[i].safePrintf("default");
cr->m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
cr->m_maxSpidersPerRule [i] = 10;
cr->m_spiderIpWaits [i] = 250; // 250 ms for now
cr->m_spiderIpMaxSpiders[i] = 10;
cr->m_spidersEnabled [i] = 1;
i++;
// that's all the rules; store the count in every column's parm
cr->m_numRegExs = i;
cr->m_numRegExs2 = i;
cr->m_numRegExs3 = i;
cr->m_numRegExs10 = i;
cr->m_numRegExs11 = i;
cr->m_numRegExs5 = i;
cr->m_numRegExs6 = i;
cr->m_numRegExs7 = i;
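
With the old if/else removed, the table now always ends with the same two rules: an "isonsite" rule at priority 50 that keeps the crawl on the seed's site, then a catch-all "default" rule at SPIDER_PRIORITY_FILTERED, after which every per-column count parm is set to the final rule count i. The two repeated seven-line blocks could share a helper; a sketch of that refactor, where appendRule is hypothetical and the field list simply mirrors the hunk above:

    // hypothetical helper collapsing the copy/paste above
    static long appendRule ( CollectionRec *cr , long i ,
                             const char *pattern , long priority ) {
        cr->m_regExs[i].safePrintf ( "%s" , pattern );
        cr->m_regExs[i].nullTerm();
        cr->m_spiderPriorities  [i] = priority;
        cr->m_maxSpidersPerRule [i] = 10;
        cr->m_spiderIpWaits     [i] = 250; // ms between launches per ip
        cr->m_spiderIpMaxSpiders[i] = 10;
        cr->m_spidersEnabled    [i] = 1;
        return i + 1;
    }

    // usage matching the two blocks:
    //   i = appendRule ( cr , i , "isonsite" , 50 );
    //   i = appendRule ( cr , i , "default"  , SPIDER_PRIORITY_FILTERED );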

@@ -444,7 +444,17 @@ bool Parms::sendPageGeneric ( TcpSocket *s , HttpRequest *r , long page ,
"</td></tr>%s%s\n",
LIGHT_BLUE,DARK_BLUE,tt,bb,e1,e2);
char *THIS = g_parms.getTHIS ( r , page );
char *THIS ;
// when called from the Diffbot.cpp crawlbot page this is kind of
// hacky: we want to print the url filters of the collection
// dictated by collOveride. without this fix we end up printing
// the url filters of the default "main" collection
if ( collOveride )
THIS = (char *)g_collectiondb.getRec(collOveride);
else
THIS = g_parms.getTHIS ( r , page );
if ( ! THIS ) {
log("admin: Could not get parameter object.");
return g_httpServer.sendErrorReply ( s , 505 , "Bad Request");
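
The cast from a CollectionRec pointer to char *THIS works because Parms addresses every setting as a raw byte offset from the base of whatever object THIS points at, either g_conf or a CollectionRec, as the setParm() hunks below show with expressions like *(char *)(THIS + m->m_off + j). A minimal self-contained sketch of the scheme; ParmSketch and readCharParm are hypothetical names:

    struct ParmSketch { long m_off; }; // byte offset of the field

    // fetch one char-typed element of an array parm from any object base,
    // the same pointer arithmetic setParm() uses to store it
    static char readCharParm ( char *THIS , const ParmSketch *m , long j ) {
        return *( THIS + m->m_off + j );
    }
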
@@ -2013,6 +2023,8 @@ bool Parms::printParm ( SafeBuf* sb,
cgi,size);
//sb->dequote ( s , gbstrlen(s) );
SafeBuf *sx = (SafeBuf *)s;
// note it
//log("hack: %s",sx->getBufStart());
sb->dequote ( sx->getBufStart() , sx->length() );
sb->safePrintf ("\">");
}
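
The cast here is the flip side of the same offset scheme: for string-valued parms the field at the recorded offset is itself a SafeBuf, so the generic char *s really points at a SafeBuf, and dequote() escapes its contents before they land inside the value="..." attribute of what is presumably an <input> tag. The same lines with those assumptions spelled out as comments:

    SafeBuf *sx = (SafeBuf *)s;  // s addresses a SafeBuf field, not a C string
    // escape quotes so the stored text cannot terminate the attribute early
    sb->dequote ( sx->getBufStart() , sx->length() );
    sb->safePrintf ( "\">" );    // close the value="..." attribute and tag
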
@@ -2645,7 +2657,7 @@ void Parms::setParm ( char *THIS , Parm *m , long mm , long j , char *s ,
return;
if ( fromRequest)oldVal = (float)*(char *)(THIS + m->m_off +j);
*(char *)(THIS + m->m_off + j) = atol ( s );
newVal = (float)*(char *)(THIS + m->m_off + j);
goto changed; }
else if ( t == TYPE_CMD ) {
log(LOG_LOGIC, "conf: Parms: TYPE_CMD is not a cgi var.");
@@ -2720,6 +2732,9 @@ void Parms::setParm ( char *THIS , Parm *m , long mm , long j , char *s ,
else len = sb->htmlDecode (s,len,false,0);
// ensure null terminated
sb->nullTerm();
// note it
//log("hack: %s",s);
// null term it all
//dst[len] = '\0';
//sb->reserve ( 1 );
@@ -2835,6 +2850,15 @@ void Parms::setToDefault ( char *THIS ) {
// init if we should
init();
// . clear out any coll rec so the sendToDiffbot checkboxes default off
// . this is a backwards-compatibility hack since this new parm
// will not be in old coll.conf files and would not be properly
// initialized when displaying a url filter row.
if ( THIS != (char *)&g_conf ) {
CollectionRec *cr = (CollectionRec *)THIS;
memset ( cr->m_spiderSendToDiffbot , 0 , MAX_FILTERS);
}
for ( long i = 0 ; i < m_numParms ; i++ ) {
Parm *m = &m_parms[i];
if ( m->m_type == TYPE_COMMENT ) continue;
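
setToDefault() then walks m_parms[] applying each default, but an old coll.conf carries no <spiderSendToDiffbot> tag at all, so without the memset the new array would keep whatever bytes it started with. The guard in isolation; using sizeof instead of the MAX_FILTERS literal is a small hardening suggestion, not what the source does:

    // backwards-compatibility guard: zero a field that postdates existing
    // conf files, before parsing runs, so a missing xml tag reads as 0s
    if ( THIS != (char *)&g_conf ) {
        CollectionRec *cr = (CollectionRec *)THIS;
        memset ( cr->m_spiderSendToDiffbot , 0 ,
                 sizeof(cr->m_spiderSendToDiffbot) ); // MAX_FILTERS bytes
    }
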
@@ -12408,10 +12432,21 @@ void Parms::init ( ) {
m->m_type = TYPE_PRIORITY2; // includes UNDEFINED priority in dropdown
m->m_page = PAGE_FILTERS;
m->m_rowid = 1;
m->m_addin = 1; // "insert" follows?
m->m_def = "";
m++;
m->m_title = "send to diffbot";
m->m_cgi = "stdb";
m->m_xml = "spiderSendToDiffbot";
m->m_max = MAX_FILTERS;
m->m_off = (char *)cr.m_spiderSendToDiffbot - x;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_page = PAGE_FILTERS;
m->m_rowid = 1;
m->m_addin = 1; // "insert" follows?
m++;
//m->m_title = "<a href=/overview.html#ruleset>ruleset</a>";
//m->m_cgi = "frs";
//m->m_xml = "filterRuleset";
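
This is the standard registration pattern for a url filters column: m_rowid groups the parm into the same table row on PAGE_FILTERS, m_off records the field's byte offset inside CollectionRec (computed from the dummy cr/x pair used throughout Parms::init), and m_max bounds the array at one value per rule. An annotated copy of the new registration; the per-field glosses are inferred from surrounding code, so treat them as assumptions:

    m->m_title = "send to diffbot";     // column header shown in the table
    m->m_cgi   = "stdb";                // cgi variable used on form submit
    m->m_xml   = "spiderSendToDiffbot"; // tag name written to coll.conf
    m->m_max   = MAX_FILTERS;           // array parm: one value per rule
    m->m_off   = (char *)cr.m_spiderSendToDiffbot - x; // byte offset scheme
    m->m_type  = TYPE_CHECKBOX;         // rendered as a checkbox cell
    m->m_def   = "0";                   // default: do not send to diffbot
    m->m_page  = PAGE_FILTERS;          // lives on the url filters page
    m->m_rowid = 1;                     // same row group as the other columns
    m->m_addin = 1;                     // "insert" follows?
    m++;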

@@ -573,6 +573,7 @@
<logDebugBuildMessages>0</>
<logDebugBuildTimeMessages>0</>
<logDebugDatabaseMessages>0</>
<logDebugDirtyMessages>0</>
<logDebugDiskMessages>0</>
<logDebugDnsMessages>0</>
<logDebugHttpMessages>0</>