added more langs to url filters drop down

This commit is contained in:
mwells
2014-09-21 18:16:11 -07:00
parent 251d72b8b1
commit dcc775eae7
4 changed files with 155 additions and 59 deletions

@@ -1948,12 +1948,12 @@ bool CollectionRec::countEvents ( ) {
*/
bool CollectionRec::rebuildUrlFilters2 ( ) {
bool rebuild = false;
bool rebuild = true;
if ( m_numRegExs == 0 )
rebuild = true;
// don't touch it if not supposed to as long as we have some already
if ( m_urlFiltersProfile != UFP_NONE )
rebuild = true;
//if ( m_urlFiltersProfile != UFP_NONE )
// rebuild = true;
// never for custom crawls however
if ( m_isCustomCrawl )
rebuild = false;
@@ -1961,12 +1961,47 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
// addDefault = true;
if ( ! rebuild ) return true;
if ( m_urlFiltersProfile == UFP_CHINESE )
return rebuildChineseRules();
char *s = m_urlFiltersProfile.getBufStart();
if ( m_urlFiltersProfile == UFP_SHALLOW )
if ( !strcmp(s,"shallow" ) )
return rebuildShallowRules();
//if ( strcmp(s,"web") )
// just fall through for that
if ( !strcmp(s,"english") )
return rebuildLangRules( "en","com,us,gov");
if ( !strcmp(s,"german") )
return rebuildLangRules( "de","de");
if ( !strcmp(s,"") )
return rebuildLangRules( "fr","fr");
if ( !strcmp(s,"norwegian") )
return rebuildLangRules( "nl","nl");
if ( !strcmp(s,"spanish") )
return rebuildLangRules( "es","es");
//if ( m_urlFiltersProfile == UFP_EURO )
// return rebuildLangRules( "de,fr,nl,es,sv,no,it",
// "com,gov,org,de,fr,nl,es,sv,no,it");
if ( !strcmp(s,"romantic") )
return rebuildLangRules("en,de,fr,nl,es,sv,no,it,fi,pt",
"de,fr,nl,es,sv,no,it,fi,pt,"
"com,gov,org"
);
if ( !strcmp(s,"chinese") )
return rebuildLangRules( "zh_cn,zh_tw","cn");
long n = 0;
/*
@@ -2024,7 +2059,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 45;
if ( m_urlFiltersProfile == UFP_NEWS )
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
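(m_spiderFreqs is expressed in days, so the constant checks out: 0.00347 days × 1440 minutes/day ≈ 5.0 minutes, matching the comment; the 0.04166 values in the hunks further down likewise work out to ≈ 60 minutes.)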
@@ -2035,7 +2070,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 85;
if ( m_urlFiltersProfile == UFP_NEWS )
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
@@ -2046,7 +2081,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 50;
if ( m_urlFiltersProfile == UFP_NEWS )
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
@@ -2057,7 +2092,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 48;
if ( m_urlFiltersProfile == UFP_NEWS )
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
@@ -2068,7 +2103,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 49;
if ( m_urlFiltersProfile == UFP_NEWS )
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
@@ -2079,7 +2114,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 47;
if ( m_urlFiltersProfile == UFP_NEWS )
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
@@ -2090,7 +2125,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 40;
if ( m_urlFiltersProfile == UFP_NEWS )
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .04166; // 60 minutes
n++;
@@ -2101,7 +2136,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 39;
if ( m_urlFiltersProfile == UFP_NEWS )
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .04166; // 60 minutes
n++;
@@ -2113,7 +2148,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
// do not harvest links if we are spidering NEWS
if ( m_urlFiltersProfile == UFP_NEWS ) {
if ( ! strcmp(s,"news") ) {
m_spiderFreqs [n] = 5.0;
m_harvestLinks [n] = 0;
}
@@ -2127,7 +2162,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 29;
// do not harvest links if we are spidering NEWS
if ( m_urlFiltersProfile == UFP_NEWS ) {
if ( ! strcmp(s,"news") ) {
m_spiderFreqs [n] = 5.0;
m_harvestLinks [n] = 0;
}
@@ -2141,7 +2176,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 20;
// turn off spidering if hopcount is too big and we are spidering NEWS
if ( m_urlFiltersProfile == UFP_NEWS ) {
if ( ! strcmp(s,"news") ) {
m_maxSpidersPerRule [n] = 0;
m_harvestLinks [n] = 0;
}
@@ -2157,7 +2192,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 19;
// turn off spidering if hopcount is too big and we are spidering NEWS
if ( m_urlFiltersProfile == UFP_NEWS ) {
if ( ! strcmp(s,"news") ) {
m_maxSpidersPerRule [n] = 0;
m_harvestLinks [n] = 0;
}
@@ -2183,7 +2218,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 1;
if ( m_urlFiltersProfile == UFP_NEWS ) {
if ( ! strcmp(s,"news") ) {
m_maxSpidersPerRule [n] = 0;
m_harvestLinks [n] = 0;
}
@@ -2212,7 +2247,8 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
return true;
}
bool CollectionRec::rebuildChineseRules ( ) {
bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
long n = 0;
@@ -2271,7 +2307,9 @@ bool CollectionRec::rebuildChineseRules ( ) {
m_spiderPriorities [n] = 85;
n++;
m_regExs[n].set("hopcount==0 && iswww && isnew && tld==cn");
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && iswww && isnew && tld==%s",
tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
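Every conversion in rebuildLangRules repeats this two-call pattern: reset() empties the SafeBuf, then safePrintf() splices the language or TLD list into the previously fixed expression (safePrintf appends, hence the reset). A minimal self-contained sketch of the pattern; StandInBuf is a hypothetical stand-in for Gigablast's SafeBuf, present only so the example compiles on its own:

#include <cstdarg>
#include <cstdio>

// hypothetical stand-in for SafeBuf, just enough to show the
// reset()-then-safePrintf() pattern used above
class StandInBuf {
	char   m_buf[256];
	size_t m_len;
public:
	StandInBuf() : m_len(0) { m_buf[0] = '\0'; }
	void reset() { m_len = 0; m_buf[0] = '\0'; }
	bool safePrintf(const char *fmt, ...) {
		va_list ap;
		va_start(ap, fmt);
		// append at m_len, like the real safePrintf
		size_t avail = sizeof(m_buf) - m_len;
		int r = vsnprintf(m_buf + m_len, avail, fmt, ap);
		va_end(ap);
		if (r < 0) return false;
		m_len += ((size_t)r < avail) ? (size_t)r : avail - 1;
		return true;
	}
	char *getBufStart() { return m_buf; }
};

int main() {
	StandInBuf re;
	re.reset();
	re.safePrintf("hopcount==0 && iswww && isnew && tld==%s", "de");
	printf("%s\n", re.getBufStart()); // hopcount==0 && iswww && isnew && tld==de
	return 0;
}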
@@ -2280,7 +2318,10 @@ bool CollectionRec::rebuildChineseRules ( ) {
m_spiderPriorities [n] = 50;
n++;
m_regExs[n].set("hopcount==0 && iswww && isnew && parentlang==zh_cn,zh_tw,xx");
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && iswww && isnew && "
"parentlang==%s,xx"
,langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
@@ -2300,7 +2341,8 @@ bool CollectionRec::rebuildChineseRules ( ) {
m_regExs[n].set("hopcount==0 && iswww && tld==cn");
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && iswww && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
@@ -2309,7 +2351,9 @@ bool CollectionRec::rebuildChineseRules ( ) {
m_spiderPriorities [n] = 48;
n++;
m_regExs[n].set("hopcount==0 && iswww && parentlang==zh_cn,zh_tw,xx");
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && iswww && parentlang==%s,xx",
langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
@@ -2331,7 +2375,8 @@ bool CollectionRec::rebuildChineseRules ( ) {
m_regExs[n].set("hopcount==0 && isnew && tld==cn");
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && isnew && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
@@ -2340,7 +2385,9 @@ bool CollectionRec::rebuildChineseRules ( ) {
m_spiderPriorities [n] = 49;
n++;
m_regExs[n].set("hopcount==0 && isnew && parentlang==zh_cn,zh_tw,xx");
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && isnew && parentlang==%s,xx",
langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
@@ -2360,7 +2407,8 @@ bool CollectionRec::rebuildChineseRules ( ) {
m_regExs[n].set("hopcount==0 && tld==cn");
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
@@ -2369,7 +2417,8 @@ bool CollectionRec::rebuildChineseRules ( ) {
m_spiderPriorities [n] = 47;
n++;
m_regExs[n].set("hopcount==0 && parentlang==zh_cn,zh_tw,xx");
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && parentlang==%s,xx",langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
@@ -2390,7 +2439,8 @@ bool CollectionRec::rebuildChineseRules ( ) {
m_regExs[n].set("hopcount==1 && isnew && tld==cn");
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==1 && isnew && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
@@ -2399,7 +2449,9 @@ bool CollectionRec::rebuildChineseRules ( ) {
m_spiderPriorities [n] = 40;
n++;
m_regExs[n].set("hopcount==1 && isnew && parentlang==zh_cn,zh_tw,xx");
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==1 && isnew && parentlang==%s,xx",
langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
@@ -2419,7 +2471,8 @@ bool CollectionRec::rebuildChineseRules ( ) {
m_regExs[n].set("hopcount==1 && tld==cn");
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==1 && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
@@ -2428,7 +2481,8 @@ bool CollectionRec::rebuildChineseRules ( ) {
m_spiderPriorities [n] = 39;
n++;
m_regExs[n].set("hopcount==1 && parentlang==zh_cn,zh_tw,xx");
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==1 && parentlang==%s,xx",langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
@@ -2448,8 +2502,8 @@ bool CollectionRec::rebuildChineseRules ( ) {
m_regExs[n].set("hopcount==2 && isnew && tld==cn");
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==2 && isnew && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
@@ -2458,7 +2512,9 @@ bool CollectionRec::rebuildChineseRules ( ) {
m_spiderPriorities [n] = 30;
n++;
m_regExs[n].set("hopcount==2 && isnew && parentlang==zh_cn,zh_tw,xx");
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==2 && isnew && parentlang==%s,xx",
langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
@@ -2479,7 +2535,8 @@ bool CollectionRec::rebuildChineseRules ( ) {
m_regExs[n].set("hopcount==2 && tld==cn");
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==2 && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
@@ -2488,7 +2545,8 @@ bool CollectionRec::rebuildChineseRules ( ) {
m_spiderPriorities [n] = 29;
n++;
m_regExs[n].set("hopcount==2 && parentlang==zh_cn,zh_tw,xx");
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==2 && parentlang==%s,xx",langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
@@ -2509,7 +2567,8 @@ bool CollectionRec::rebuildChineseRules ( ) {
m_regExs[n].set("hopcount>=3 && isnew && tld==cn");
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount>=3 && isnew && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
@@ -2518,7 +2577,9 @@ bool CollectionRec::rebuildChineseRules ( ) {
m_spiderPriorities [n] = 22;
n++;
m_regExs[n].set("hopcount>=3 && isnew && parentlang==zh_cn,zh_tw,xx");
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount>=3 && isnew && parentlang==%s,xx",
langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
@@ -2539,7 +2600,8 @@ bool CollectionRec::rebuildChineseRules ( ) {
m_regExs[n].set("hopcount>=3 && tld==cn");
m_regExs[n].safePrintf("hopcount>=3 && tld==%s",tldStr);
m_regExs[n].reset();
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
@@ -2548,7 +2610,8 @@ bool CollectionRec::rebuildChineseRules ( ) {
m_spiderPriorities [n] = 21;
n++;
m_regExs[n].set("hopcount>=3 && parentlang==zh_cn,zh_tw,xx");
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount>=3 && parentlang==%s,xx",langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
@@ -3053,8 +3116,8 @@ void nukeDoledb ( collnum_t collnum );
bool CollectionRec::rebuildUrlFilters ( ) {
if ( ! g_conf.m_doingCommandLine )
log("coll: Rebuilding url filters for %s ufp=%li",m_coll,
(long)m_urlFiltersProfile);
log("coll: Rebuilding url filters for %s ufp=%s",m_coll,
m_urlFiltersProfile.getBufStart());
// if not a custom crawl, and no expressions, add a default one
//if ( m_numRegExs == 0 && ! m_isCustomCrawl ) {

@@ -385,7 +385,7 @@ class CollectionRec {
// for regular crawls
bool rebuildUrlFilters2();
bool rebuildChineseRules();
bool rebuildLangRules( char *lang , char *tld );
bool rebuildShallowRules();
@@ -730,8 +730,8 @@ class CollectionRec {
SafeBuf m_siteListBuf;
char m_spiderToo;
// see UFP_* values in Parms.h. i.e. UFP_NEWS for crawling for NEWS
char m_urlFiltersProfile;
// can be "web" "english" "romantic" "german" etc.
SafeBuf m_urlFiltersProfile;
// . now the url regular expressions
// . we chain down the regular expressions
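With m_urlFiltersProfile now a SafeBuf rather than a char, the old == UFP_* comparisons no longer compile, which is why every test in the CollectionRec.cpp hunks above became a strcmp against getBufStart(). A hypothetical one-line helper (not in the commit) that would centralize that pattern:

// hypothetical; assumes the profile buffer is NUL-terminated
bool CollectionRec::isUrlFiltersProfile ( char *name ) {
	char *s = m_urlFiltersProfile.getBufStart();
	if ( ! s ) return false;
	return strcmp ( s , name ) == 0;
}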

@@ -1601,17 +1601,48 @@ bool printDropDown ( long n , SafeBuf* sb, char *name, long select,
return true;
}
bool printDropDownProfile ( SafeBuf* sb, char *name, long select ) {
class DropLangs {
public:
char *m_title;
char *m_lang;
char *m_tld;
};
DropLangs g_drops[] = {
{"custom",NULL,NULL},
{"web",NULL,NULL},
{"news",NULL,NULL},
{"english","en","com,us.gov,org"},
{"german","de","de"},
{"french","fr","fr"},
{"norweigian","nl","nl"},
{"spanish","es","es"},
{"italian","it","it"},
{"romantic","en,de,fr,nl,es,it","com,us.gov,org,de,fr,nl,es,it"}
};
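// note: entries with NULL m_lang/m_tld ("custom","web","news") carry
// no per-language rules; rebuildUrlFilters2() gives "news" its own
// frequency tweaks and lets the rest fall through to the web defaults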
// "url filters profile" values. used to set default crawl rules
// in Collectiondb.cpp's CollectionRec::setUrlFiltersToDefaults().
// for instance, UFP_NEWS spiders sites more frequently but less deeply in
// order to get "news" pages and articles
bool printDropDownProfile ( SafeBuf* sb, char *name, CollectionRec *cr ) {
sb->safePrintf ( "<select name=%s>", name );
// the type of url filters profiles
char *items[] = {"custom","web","news","chinese","shallow"};
char *s;
for ( long i = 0 ; i < 5 ; i++ ) {
if ( i == select ) s = " selected";
else s = "";
sb->safePrintf ("<option value=%li%s>%s",i,s,items[i]);
//char *items[] = {"custom","web","news","chinese","shallow"};
long nd = sizeof(g_drops)/sizeof(DropLangs);
for ( long i = 0 ; i < nd ; i++ ) {
//if ( i == select ) s = " selected";
//else s = "";
char *x = cr->m_urlFiltersProfile.getBufStart();
char *s;
if ( strcmp(g_drops[i].m_title, x) == 0 ) s = " selected";
else s = "";
sb->safePrintf ("<option value=%s%s>%s",
g_drops[i].m_title,
s,
g_drops[i].m_title );
}
sb->safePrintf ( "</select>" );
sb->safePrintf ( "</select>");
return true;
}
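For a collection whose m_urlFiltersProfile holds "german", the loop above emits markup along these lines (line breaks added here for readability; note the option values are now the profile strings themselves rather than the old numeric indexes):

<select name=ufp>
<option value=custom>custom
<option value=web>web
<option value=news>news
<option value=english>english
<option value=german selected>german
<option value=french>french
<option value=norwegian>norwegian
<option value=spanish>spanish
<option value=italian>italian
<option value=romantic>romantic
</select>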
@@ -2354,9 +2385,11 @@ bool Parms::printParm ( SafeBuf* sb,
//else if ( t == TYPE_DIFFBOT_DROPDOWN ) {
// char *xx=NULL;*xx=0;
//}
else if ( t == TYPE_UFP )
//else if ( t == TYPE_UFP )
else if ( t == TYPE_SAFEBUF &&
strcmp(m->m_title,"url filters profile")==0)
// url filters profile drop down "ufp"
printDropDownProfile ( sb , "ufp" , *s );
printDropDownProfile ( sb , "ufp" , cr );//*s );
else if ( t == TYPE_RETRIES )
printDropDown ( 4 , sb , cgi , *s , false , false );
else if ( t == TYPE_FILEUPLOADBUTTON ) {
@@ -12491,8 +12524,8 @@ void Parms::init ( ) {
"to the table will be lost.";
m->m_off = (char *)&cr.m_urlFiltersProfile - x;
m->m_colspan = 3;
m->m_type = TYPE_UFP;// 1 byte dropdown menu
m->m_def = "1"; // UFP_WEB
m->m_type = TYPE_SAFEBUF;//UFP;// 1 byte dropdown menu
m->m_def = "web"; // UFP_WEB
m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE;
m->m_page = PAGE_FILTERS;
m->m_obj = OBJ_COLL;

@@ -23,7 +23,7 @@ enum {
UFP_NONE = 0 ,
UFP_WEB = 1 ,
UFP_NEWS = 2 ,
UFP_CHINESE = 3,
UFP_LANG = 3,
UFP_SHALLOW = 4
};
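The UFP_* constants survive (UFP_CHINESE renamed to UFP_LANG) even though the parm is now a string whose default changed from "1" to "web". A hypothetical helper for mapping old numeric values onto the new profile strings, e.g. when migrating saved collection records; nothing like it appears in the commit:

// hypothetical migration helper; the strings are the profile names
// rebuildUrlFilters2() recognizes
char *ufpToProfileString ( long ufp ) {
	switch ( ufp ) {
		case UFP_WEB     : return "web";
		case UFP_NEWS    : return "news";
		case UFP_LANG    : return "chinese"; // was UFP_CHINESE
		case UFP_SHALLOW : return "shallow";
		default          : return "custom";
	}
}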