mirror of https://github.com/privacore/open-source-search-engine.git
synced 2025-07-15 02:36:08 -04:00
added support for spidering a particular language with higher priority.
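
For orientation, the sketch below (illustrative only, not a verbatim excerpt) shows the pattern the new rebuildChineseRules() repeats for every URL-filter rule in the Collectiondb.cpp diff: each rule fills one slot n across a set of parallel arrays on CollectionRec, and Chinese-related pages (tld==cn, or a referring parent whose language is zh_cn/zh_tw/xx) simply get a larger m_spiderPriorities value than the generic rule for the same hop count.

    // sketch only: one rule slot, using the member arrays shown in the diff below
    long n = 0;
    m_regExs[n].set("hopcount==0 && iswww && isnew && parentlang==zh_cn,zh_tw,xx");
    m_harvestLinks       [n] = 1;    // harvest outlinks from matching pages
    m_spiderFreqs        [n] = 7;    // days before respidering
    m_maxSpidersPerRule  [n] = 9;    // max concurrent spiders for this rule
    m_spiderIpMaxSpiders [n] = 7;    // max concurrent spiders per ip
    m_spiderIpWaits      [n] = 1000; // same-ip wait between fetches
    m_spiderPriorities   [n] = 50;   // vs. 19/20 for the generic hop-0 www rules
    n++;
    // ... more rules ...
    m_numRegExs = n;                 // record how many rules were defined
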
Collectiondb.cpp (382 changed lines)

@@ -1862,6 +1862,9 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
// addDefault = true;
if ( ! rebuild ) return true;

if ( m_urlFiltersProfile == UFP_CHINESE )
return rebuildChineseRules();

long n = 0;

/*

@@ -1945,7 +1948,6 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {

m_spiderFreqs [n] = .00347; // 5 mins
n++;

m_regExs[n].set("hopcount==0 && iswww");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider

@@ -2108,6 +2110,383 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {

return true;
}

bool CollectionRec::rebuildChineseRules ( ) {

long n = 0;

m_regExs[n].set("isdocidbased");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 80;
n++;

m_regExs[n].set("ismedia");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = -3; // delete!
n++;

// if not in the site list then nuke it
m_regExs[n].set("!ismanualadd && !insitelist");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = -3; // delete!
n++;

m_regExs[n].set("errorcount>=3 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // 30 days default
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 3;
n++;

m_regExs[n].set("errorcount>=1 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // 30 days default
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 45;
n++;

m_regExs[n].set("isaddurl");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 85;
n++;
m_regExs[n].set("hopcount==0 && iswww && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 50;
n++;

m_regExs[n].set("hopcount==0 && iswww && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 50;
n++;

m_regExs[n].set("hopcount==0 && iswww && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 20;

m_regExs[n].set("hopcount==0 && iswww && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 48;
n++;

m_regExs[n].set("hopcount==0 && iswww && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 48;
n++;

m_regExs[n].set("hopcount==0 && iswww");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 19;
n++;

m_regExs[n].set("hopcount==0 && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 49;
n++;

m_regExs[n].set("hopcount==0 && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 49;
n++;

m_regExs[n].set("hopcount==0 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 18;
n++;
m_regExs[n].set("hopcount==0 && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 47;
n++;

m_regExs[n].set("hopcount==0 && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 47;
n++;

m_regExs[n].set("hopcount==0");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 17;
n++;

m_regExs[n].set("hopcount==1 && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 40;
n++;

m_regExs[n].set("hopcount==1 && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 40;
n++;

m_regExs[n].set("hopcount==1 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 16;
n++;

m_regExs[n].set("hopcount==1 && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 39;
n++;

m_regExs[n].set("hopcount==1 && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 39;
n++;

m_regExs[n].set("hopcount==1");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 15;
n++;
m_regExs[n].set("hopcount==2 && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
n++;

m_regExs[n].set("hopcount==2 && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
n++;

m_regExs[n].set("hopcount==2 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 14;
n++;

m_regExs[n].set("hopcount==2 && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 29;
n++;

m_regExs[n].set("hopcount==2 && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 29;
n++;

m_regExs[n].set("hopcount==2");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 13;
n++;

m_regExs[n].set("hopcount>=3 && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 22;
n++;

m_regExs[n].set("hopcount>=3 && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 22;
n++;

m_regExs[n].set("hopcount>=3 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 12;
n++;
m_regExs[n].set("hopcount>=3 && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 21;
n++;

m_regExs[n].set("hopcount>=3 && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 21;
n++;

m_regExs[n].set("hopcount>=3");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 11;
n++;

m_regExs[n].set("default");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 1;
n++;

m_numRegExs = n;
m_numRegExs2 = n;
m_numRegExs3 = n;
m_numRegExs10 = n;
m_numRegExs5 = n;
m_numRegExs6 = n;
m_numRegExs8 = n;

// done rebuilding CHINESE rules
return true;
}

/*
bool CrawlInfo::print (SafeBuf *sb ) {
return sb->safePrintf("objectsAdded:%lli\n"

@@ -2758,3 +3137,4 @@ void testRegex ( ) {

url,rx);
exit(0);
}

Collectiondb.h

@@ -368,6 +368,8 @@ class CollectionRec {

// for regular crawls
bool rebuildUrlFilters2();

bool rebuildChineseRules();

bool m_urlFiltersHavePageCounts;

// moved from SpiderColl so we can load up at startup

Parms.cpp (23 changed lines)

@@ -1318,9 +1318,9 @@ bool printDropDown ( long n , SafeBuf* sb, char *name, long select,

bool printDropDownProfile ( SafeBuf* sb, char *name, long select ) {
sb->safePrintf ( "<select name=%s>", name );
// the type of url filters profiles
char *items[] = {"custom","web","news"};
char *items[] = {"custom","web","news","chinese"};
char *s;
for ( long i = 0 ; i < 3 ; i++ ) {
for ( long i = 0 ; i < 4 ; i++ ) {
if ( i == select ) s = " selected";
else s = "";
sb->safePrintf ("<option value=%li%s>%s",i,s,items[i]);

@@ -7702,7 +7702,9 @@ void Parms::init ( ) {

"tools. "
"Limit list to 300MB. If you have a lot of INDIVIDUAL urls "
"to add then consider using the <a href=/admin/addurl>add "
"urls</a> interface.";
"urls</a> interface. <b>IF YOU WANT TO SPIDER THE WHOLE "
"WEB</b> then only use the <i>seed:</i> directives here "
"lest you limit yourself to a set of domains.";
m->m_cgi = "sitelist";
m->m_off = (char *)&cr.m_siteListBuf - x;
m->m_page = PAGE_BASIC_SETTINGS;

@@ -18735,13 +18737,14 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {

"<td>"
"This is true if the url was directly "
"injected from the "
"/inject page or API."
"<a href=/admin/inject>inject page</a> or API."
"</td></tr>"

"<tr class=poo><td>isdocidbased | !isdocidbased</td>"
"<td>"
"This is true if the url was added from the "
"reindex interface. The request does not contain "
"<a href=/admin/reindex>query reindex</a> "
"interface. The request does not contain "
"a url, but only a docid, that way we can add "
"millions of search results very quickly without "
"having to lookup each of their urls. You should "

@@ -18932,6 +18935,16 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {

"See table below for supported language "
"abbreviations.</td></tr>"

"<tr class=poo><td><nobr>parentlang==zh_cn,zh_tw,xx"
"</nobr></td>"
"<td>Matches if "
"the url's referring parent url is primarily in "
"this language. Useful for prioritizing spidering "
"pages of a certain language."
"See table below for supported language "
"abbreviations."
"</td></tr>"

/*
"<tr class=poo><td>link:gigablast</td>"
"<td>Matches if the document links to gigablast."

Parms.h (3 changed lines)

@@ -22,7 +22,8 @@ enum {

UFP_CUSTOM = 0 ,
UFP_NONE = 0 ,
UFP_WEB = 1 ,
UFP_NEWS = 2
UFP_NEWS = 2 ,
UFP_CHINESE = 3
};

// special priorities for the priority drop down

Spider.cpp (74 changed lines)

@@ -110,7 +110,7 @@ long SpiderRequest::print ( SafeBuf *sbarg ) {

sb->safePrintf("parentDomHash32=0x%lx ",m_parentDomHash32 );
sb->safePrintf("parentSiteHash32=0x%lx ",m_parentSiteHash32 );

sb->safePrintf("hopCount=%li ",m_hopCount );
sb->safePrintf("hopCount=%li ",(long)m_hopCount );

//timeStruct = gmtime ( &m_spiderTime );
//time[0] = 0;

@@ -301,7 +301,7 @@ long SpiderRequest::printToTable ( SafeBuf *sb , char *status ,

sb->safePrintf(" <td>%li</td>\n",m_siteNumInlinks );
//sb->safePrintf(" <td>%li</td>\n",m_pageNumInlinks );
sb->safePrintf(" <td>%li</td>\n",m_hopCount );
sb->safePrintf(" <td>%li</td>\n",(long)m_hopCount );

// print time format: 7/23/1971 10:45:32
struct tm *timeStruct ;

@@ -436,7 +436,7 @@ long SpiderRequest::printToTableSimple ( SafeBuf *sb , char *status ,

sb->safePrintf(" <td>%li</td>\n",(long)m_errCount );

sb->safePrintf(" <td>%li</td>\n",m_hopCount );
sb->safePrintf(" <td>%li</td>\n",(long)m_hopCount );

// print time format: 7/23/1971 10:45:32
struct tm *timeStruct ;

@@ -9912,6 +9912,13 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,

langLen = gbstrlen(lang);
}

// . get parent language in the request
// . primary language of the parent page that linked to this url
char *plang = NULL;
long plangLen = 0;
plang = getLanguageAbbr(sreq->m_parentLangId);
if ( plang ) plangLen = gbstrlen(plang);

char *tld = (char *)-1;
long tldLen;

@@ -11026,6 +11033,67 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,

// come here if we did not match the tld
}

// parentlang=en,zh_cn
if ( *p=='p' && strncmp(p,"parentlang",10)==0){
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// must have a reply
//if ( ! srep ) continue;
// skip if unknown? no, we support "xx" as unknown now
//if ( srep->m_langId == 0 ) continue;
// set these up
char *b = s;
// loop for the comma-separated list of langids
// like parentlang==en,es,...
subloop2b:
// get length of it in the expression box
char *start = b;
while ( *b && !is_wspace_a(*b) && *b!=',' ) b++;
long blen = b - start;
//char sm;
// if we had parentlang==en,es,...
if ( sign == SIGN_EQ &&
blen == plangLen &&
strncasecmp(start,plang,plangLen)==0 )
// if we matched any, that's great
goto matched2b;
// if its parentlang!=en,es,...
// and we equal the string, then we do not match this
// particular rule!!!
if ( sign == SIGN_NE &&
blen == plangLen &&
strncasecmp(start,plang,plangLen)==0 )
// we do not match this rule if we matched
// any of the langs in the != list
continue;
// might have another in the comma-separated list
if ( *b != ',' ) {
// if that was the end of the list and the
// sign was == then skip this rule
if ( sign == SIGN_EQ ) continue;
// otherwise, if the sign was != then we win!
if ( sign == SIGN_NE ) goto matched2b;
// otherwise, bad sign?
continue;
}
// advance to next list item if was a comma after us
b++;
// and try again
goto subloop2b;
// come here on a match
matched2b:
// we matched, now look for &&
p = strstr ( b , "&&" );
// if nothing else then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
// come here if we did not match the parentlang
}

// hopcount == 20 [&&]
if ( *p=='h' && strncmp(p, "hopcount", 8) == 0){
// skip if not valid
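
To make the control flow above easier to follow, here is a simplified, standalone sketch (the helper name parentLangInList is hypothetical, not part of the codebase) of the comma-list comparison the new parentlang handling performs for the == case: the parent-language abbreviation is compared case-insensitively against each item of a list such as "zh_cn,zh_tw,xx". The real code above additionally handles the != sign and chains to the next "&&" term of the expression.

    #include <string.h>
    #include <strings.h>   // strncasecmp

    // sketch: return true if plang (e.g. "zh_tw") appears in the
    // comma-separated list (e.g. "zh_cn,zh_tw,xx")
    static bool parentLangInList ( const char *list , const char *plang ) {
            long plen = strlen ( plang );
            const char *b = list;
            for ( ; ; ) {
                    const char *start = b;
                    // an item ends at whitespace or a comma, as in the loop above
                    while ( *b && *b != ',' && *b != ' ' && *b != '\t' ) b++;
                    long blen = b - start;
                    if ( blen == plen && strncasecmp ( start , plang , plen ) == 0 )
                            return true;
                    if ( *b != ',' ) return false; // end of the list, no match
                    b++;                           // skip the comma, try the next item
            }
    }
    // usage: parentLangInList ( "zh_cn,zh_tw,xx" , "zh_tw" ) returns true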

Spider.h (34 changed lines)

@@ -528,29 +528,37 @@ class SpiderRequest {

// . this is zero if none or invalid
long m_contentHash32;

/*
char m_reserved1;
// . each request can have a different hop count
// . this is only valid if m_hopCountValid is true!
// . i made this a short from long to support m_parentLangId etc above
short m_hopCount;

// when creating a chinese search engine for instance it is nice
// to know the language of the page we are spidering's parent.
// typically a chinese page will link to another chinese page,
// though not always of course. this is the primary language of
// the parent.
uint8_t m_parentLangId;//reserved1;

// the new add url control will allow user to control link spidering
// on each url they add. they can also specify file:// instead of
// http:// to index local files. so we have to allow file://
char m_onlyAddSameDomainLinks :1;
char m_onlyAddSameSubdomainLinks :1;
char m_onlyDoNotAddLinksLinks :1; // max hopcount 1
char m_onlyDoNotAddLinksLinksLinks :1; // max hopcount 2
/* char m_onlyAddSameDomainLinks :1; */
/* char m_onlyAddSameSubdomainLinks :1; */
/* char m_onlyDoNotAddLinksLinks :1; // max hopcount 1 */
/* char m_onlyDoNotAddLinksLinksLinks :1; // max hopcount 2 */
char m_reserved2a:1;
char m_reserved2b:1;
char m_reserved2c:1;
char m_reserved2d:1;

char m_reserved2e:1;
char m_reserved2f:1;
char m_reserved2g:1;
char m_reserved2h:1;

// . each request can have a different hop count
// . this is only valid if m_hopCountValid is true!
short m_hopCount;
*/

long m_hopCount;
//long m_hopCount;

// . this is now computed dynamically often based on the latest
// m_addedTime and m_percentChanged of all the SpideRec *replies*.

@@ -715,6 +723,8 @@ class SpiderRequest {

m_ufn = -1;
// this too
m_priority = -1;
// this happens to be zero already, but just in case it changes
m_parentLangId = langUnknown;
};

static long getNeededSize ( long urlLen ) {

XmlDoc.cpp

@@ -22996,6 +22996,9 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {

Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) return (char *)cu;

uint8_t *langId = getLangId();
if ( ! langId || langId == (uint8_t *)-1 ) return (char *)langId;

// validate this to prevent core for simplified redirect links
long hostHash32a = getHostHash32a();

@@ -23388,6 +23391,11 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {

ksr.m_isAddUrl = 1;
}

// it is useful to know the primary langid of the parent
// when prioritizing links for spidering in the case of
// focussing the search engine on a particular set of langs
ksr.m_parentLangId = *langId;

// don't forget this one!
//ksr.m_spiderTime = nowGlobal;

main.cpp (19 changed lines)

@@ -1024,6 +1024,25 @@ int main2 ( int argc , char *argv[] ) {

testMandrill = true;
}

/*
class foo {
public:
long poo;
};
class fart {
public:
short fart3;
char fart1;
char fart2;
};
foo xxx;
xxx.poo = 38123;
fart *yyy = (fart *)&xxx;
fprintf(stderr,"fart1=%li fart2=%li fart3=%li\n",
(long)yyy->fart1,(long)yyy->fart2,(long)yyy->fart3);
exit(0);
*/

// gb gendbs, preset the hostid at least
if ( //strcmp ( cmd , "gendbs" ) == 0 ||
//strcmp ( cmd , "gentfndb" ) == 0 ||