mirror of https://github.com/privacore/open-source-search-engine.git
synced 2025-07-15 02:36:08 -04:00
added support for spidering a particular language with higher priority.
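
For orientation, the sketch below (illustrative only, not a verbatim excerpt) shows the pattern the new rebuildChineseRules() repeats for every URL-filter rule in the Collectiondb.cpp diff: each rule fills one slot n across a set of parallel arrays on CollectionRec, and Chinese-related pages (tld==cn, or a referring parent whose language is zh_cn/zh_tw/xx) simply get a larger m_spiderPriorities value than the generic rule for the same hop count.

    // sketch only: one rule slot, using the member arrays shown in the diff below
    long n = 0;
    m_regExs[n].set("hopcount==0 && iswww && isnew && parentlang==zh_cn,zh_tw,xx");
    m_harvestLinks       [n] = 1;    // harvest outlinks from matching pages
    m_spiderFreqs        [n] = 7;    // days before respidering
    m_maxSpidersPerRule  [n] = 9;    // max concurrent spiders for this rule
    m_spiderIpMaxSpiders [n] = 7;    // max concurrent spiders per ip
    m_spiderIpWaits      [n] = 1000; // same-ip wait between fetches
    m_spiderPriorities   [n] = 50;   // vs. 19/20 for the generic hop-0 www rules
    n++;
    // ... more rules ...
    m_numRegExs = n;                 // record how many rules were defined
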
Collectiondb.cpp (382 changed lines)

@@ -1862,6 +1862,9 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
// addDefault = true;
if ( ! rebuild ) return true;

if ( m_urlFiltersProfile == UFP_CHINESE )
return rebuildChineseRules();

long n = 0;

/*

@@ -1945,7 +1948,6 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {

m_spiderFreqs [n] = .00347; // 5 mins
n++;

m_regExs[n].set("hopcount==0 && iswww");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider

@@ -2108,6 +2110,383 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {

return true;
}

bool CollectionRec::rebuildChineseRules ( ) {

long n = 0;

m_regExs[n].set("isdocidbased");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 80;
n++;

m_regExs[n].set("ismedia");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = -3; // delete!
n++;

// if not in the site list then nuke it
m_regExs[n].set("!ismanualadd && !insitelist");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = -3; // delete!
n++;

m_regExs[n].set("errorcount>=3 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // 30 days default
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 3;
n++;

m_regExs[n].set("errorcount>=1 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // 30 days default
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 45;
n++;

m_regExs[n].set("isaddurl");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 85;
n++;
m_regExs[n].set("hopcount==0 && iswww && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 50;
n++;

m_regExs[n].set("hopcount==0 && iswww && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 50;
n++;

m_regExs[n].set("hopcount==0 && iswww && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 20;

m_regExs[n].set("hopcount==0 && iswww && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 48;
n++;

m_regExs[n].set("hopcount==0 && iswww && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 48;
n++;

m_regExs[n].set("hopcount==0 && iswww");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 19;
n++;

m_regExs[n].set("hopcount==0 && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 49;
n++;

m_regExs[n].set("hopcount==0 && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 49;
n++;

m_regExs[n].set("hopcount==0 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 18;
n++;
m_regExs[n].set("hopcount==0 && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 47;
n++;

m_regExs[n].set("hopcount==0 && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 47;
n++;

m_regExs[n].set("hopcount==0");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 17;
n++;

m_regExs[n].set("hopcount==1 && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 40;
n++;

m_regExs[n].set("hopcount==1 && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 40;
n++;

m_regExs[n].set("hopcount==1 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 16;
n++;

m_regExs[n].set("hopcount==1 && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 39;
n++;

m_regExs[n].set("hopcount==1 && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 39;
n++;

m_regExs[n].set("hopcount==1");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 15;
n++;
m_regExs[n].set("hopcount==2 && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
n++;

m_regExs[n].set("hopcount==2 && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
n++;

m_regExs[n].set("hopcount==2 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 14;
n++;

m_regExs[n].set("hopcount==2 && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 29;
n++;

m_regExs[n].set("hopcount==2 && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 29;
n++;

m_regExs[n].set("hopcount==2");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 13;
n++;

m_regExs[n].set("hopcount>=3 && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 22;
n++;

m_regExs[n].set("hopcount>=3 && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 22;
n++;

m_regExs[n].set("hopcount>=3 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 12;
n++;
m_regExs[n].set("hopcount>=3 && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 21;
n++;

m_regExs[n].set("hopcount>=3 && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 21;
n++;

m_regExs[n].set("hopcount>=3");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 11;
n++;

m_regExs[n].set("default");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 1;
n++;

m_numRegExs = n;
m_numRegExs2 = n;
m_numRegExs3 = n;
m_numRegExs10 = n;
m_numRegExs5 = n;
m_numRegExs6 = n;
m_numRegExs8 = n;

// done rebuilding CHINESE rules
return true;
}

/*
bool CrawlInfo::print (SafeBuf *sb ) {
return sb->safePrintf("objectsAdded:%lli\n"

@@ -2758,3 +3137,4 @@ void testRegex ( ) {

url,rx);
exit(0);
}

Collectiondb.h

@@ -368,6 +368,8 @@ class CollectionRec {

// for regular crawls
bool rebuildUrlFilters2();

bool rebuildChineseRules();

bool m_urlFiltersHavePageCounts;

// moved from SpiderColl so we can load up at startup

Parms.cpp (23 changed lines)

@@ -1318,9 +1318,9 @@ bool printDropDown ( long n , SafeBuf* sb, char *name, long select,

bool printDropDownProfile ( SafeBuf* sb, char *name, long select ) {
sb->safePrintf ( "<select name=%s>", name );
// the type of url filters profiles
char *items[] = {"custom","web","news"};
char *items[] = {"custom","web","news","chinese"};
char *s;
for ( long i = 0 ; i < 3 ; i++ ) {
for ( long i = 0 ; i < 4 ; i++ ) {
if ( i == select ) s = " selected";
else s = "";
sb->safePrintf ("<option value=%li%s>%s",i,s,items[i]);

@@ -7702,7 +7702,9 @@ void Parms::init ( ) {

"tools. "
"Limit list to 300MB. If you have a lot of INDIVIDUAL urls "
"to add then consider using the <a href=/admin/addurl>add "
"urls</a> interface.";
"urls</a> interface. <b>IF YOU WANT TO SPIDER THE WHOLE "
"WEB</b> then only use the <i>seed:</i> directives here "
"lest you limit yourself to a set of domains.";
m->m_cgi = "sitelist";
m->m_off = (char *)&cr.m_siteListBuf - x;
m->m_page = PAGE_BASIC_SETTINGS;

@@ -18735,13 +18737,14 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {

"<td>"
"This is true if the url was directly "
"injected from the "
"/inject page or API."
"<a href=/admin/inject>inject page</a> or API."
"</td></tr>"

"<tr class=poo><td>isdocidbased | !isdocidbased</td>"
"<td>"
"This is true if the url was added from the "
"reindex interface. The request does not contain "
"<a href=/admin/reindex>query reindex</a> "
"interface. The request does not contain "
"a url, but only a docid, that way we can add "
"millions of search results very quickly without "
"having to lookup each of their urls. You should "

@@ -18932,6 +18935,16 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {

"See table below for supported language "
"abbreviations.</td></tr>"

"<tr class=poo><td><nobr>parentlang==zh_cn,zh_tw,xx"
"</nobr></td>"
"<td>Matches if "
"the url's referring parent url is primarily in "
"this language. Useful for prioritizing spidering "
"pages of a certain language."
"See table below for supported language "
"abbreviations."
"</td></tr>"

/*
"<tr class=poo><td>link:gigablast</td>"
"<td>Matches if the document links to gigablast."

Parms.h (3 changed lines)

@@ -22,7 +22,8 @@ enum {

UFP_CUSTOM = 0 ,
UFP_NONE = 0 ,
UFP_WEB = 1 ,
UFP_NEWS = 2
UFP_NEWS = 2 ,
UFP_CHINESE = 3
};

// special priorities for the priority drop down

Spider.cpp (74 changed lines)

@@ -110,7 +110,7 @@ long SpiderRequest::print ( SafeBuf *sbarg ) {

sb->safePrintf("parentDomHash32=0x%lx ",m_parentDomHash32 );
sb->safePrintf("parentSiteHash32=0x%lx ",m_parentSiteHash32 );

sb->safePrintf("hopCount=%li ",m_hopCount );
sb->safePrintf("hopCount=%li ",(long)m_hopCount );

//timeStruct = gmtime ( &m_spiderTime );
//time[0] = 0;

@@ -301,7 +301,7 @@ long SpiderRequest::printToTable ( SafeBuf *sb , char *status ,

sb->safePrintf(" <td>%li</td>\n",m_siteNumInlinks );
//sb->safePrintf(" <td>%li</td>\n",m_pageNumInlinks );
sb->safePrintf(" <td>%li</td>\n",m_hopCount );
sb->safePrintf(" <td>%li</td>\n",(long)m_hopCount );

// print time format: 7/23/1971 10:45:32
struct tm *timeStruct ;

@@ -436,7 +436,7 @@ long SpiderRequest::printToTableSimple ( SafeBuf *sb , char *status ,

sb->safePrintf(" <td>%li</td>\n",(long)m_errCount );

sb->safePrintf(" <td>%li</td>\n",m_hopCount );
sb->safePrintf(" <td>%li</td>\n",(long)m_hopCount );

// print time format: 7/23/1971 10:45:32
struct tm *timeStruct ;

@@ -9912,6 +9912,13 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,

langLen = gbstrlen(lang);
}

// . get parent language in the request
// . primary language of the parent page that linked to this url
char *plang = NULL;
long plangLen = 0;
plang = getLanguageAbbr(sreq->m_parentLangId);
if ( plang ) plangLen = gbstrlen(plang);

char *tld = (char *)-1;
long tldLen;

@@ -11026,6 +11033,67 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,

// come here if we did not match the tld
}

// parentlang=en,zh_cn
if ( *p=='p' && strncmp(p,"parentlang",10)==0){
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// must have a reply
//if ( ! srep ) continue;
// skip if unknown? no, we support "xx" as unknown now
//if ( srep->m_langId == 0 ) continue;
// set these up
char *b = s;
// loop for the comma-separated list of langids
// like parentlang==en,es,...
subloop2b:
// get length of it in the expression box
char *start = b;
while ( *b && !is_wspace_a(*b) && *b!=',' ) b++;
long blen = b - start;
//char sm;
// if we had parentlang==en,es,...
if ( sign == SIGN_EQ &&
blen == plangLen &&
strncasecmp(start,plang,plangLen)==0 )
// if we matched any, that's great
goto matched2b;
// if its parentlang!=en,es,...
// and we equal the string, then we do not match this
// particular rule!!!
if ( sign == SIGN_NE &&
blen == plangLen &&
strncasecmp(start,plang,plangLen)==0 )
// we do not match this rule if we matched
// any of the langs in the != list
continue;
// might have another in the comma-separated list
if ( *b != ',' ) {
// if that was the end of the list and the
// sign was == then skip this rule
if ( sign == SIGN_EQ ) continue;
// otherwise, if the sign was != then we win!
if ( sign == SIGN_NE ) goto matched2b;
// otherwise, bad sign?
continue;
}
// advance to next list item if was a comma after us
b++;
// and try again
goto subloop2b;
// come here on a match
matched2b:
// we matched, now look for &&
p = strstr ( b , "&&" );
// if nothing else then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
// come here if we did not match the parentlang
}

// hopcount == 20 [&&]
if ( *p=='h' && strncmp(p, "hopcount", 8) == 0){
// skip if not valid
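
To make the control flow above easier to follow, here is a simplified, standalone sketch (the helper name parentLangInList is hypothetical, not part of the codebase) of the comma-list comparison the new parentlang handling performs for the == case: the parent-language abbreviation is compared case-insensitively against each item of a list such as "zh_cn,zh_tw,xx". The real code above additionally handles the != sign and chains to the next "&&" term of the expression.

    #include <string.h>
    #include <strings.h>   // strncasecmp

    // sketch: return true if plang (e.g. "zh_tw") appears in the
    // comma-separated list (e.g. "zh_cn,zh_tw,xx")
    static bool parentLangInList ( const char *list , const char *plang ) {
            long plen = strlen ( plang );
            const char *b = list;
            for ( ; ; ) {
                    const char *start = b;
                    // an item ends at whitespace or a comma, as in the loop above
                    while ( *b && *b != ',' && *b != ' ' && *b != '\t' ) b++;
                    long blen = b - start;
                    if ( blen == plen && strncasecmp ( start , plang , plen ) == 0 )
                            return true;
                    if ( *b != ',' ) return false; // end of the list, no match
                    b++;                           // skip the comma, try the next item
            }
    }
    // usage: parentLangInList ( "zh_cn,zh_tw,xx" , "zh_tw" ) returns true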

Spider.h (34 changed lines)

@@ -528,29 +528,37 @@ class SpiderRequest {

// . this is zero if none or invalid
long m_contentHash32;

/*
char m_reserved1;
// . each request can have a different hop count
// . this is only valid if m_hopCountValid is true!
// . i made this a short from long to support m_parentLangId etc above
short m_hopCount;

// when creating a chinese search engine for instance it is nice
// to know the language of the page we are spidering's parent.
// typically a chinese page will link to another chinese page,
// though not always of course. this is the primary language of
// the parent.
uint8_t m_parentLangId;//reserved1;

// the new add url control will allow user to control link spidering
// on each url they add. they can also specify file:// instead of
// http:// to index local files. so we have to allow file://
char m_onlyAddSameDomainLinks :1;
char m_onlyAddSameSubdomainLinks :1;
char m_onlyDoNotAddLinksLinks :1; // max hopcount 1
char m_onlyDoNotAddLinksLinksLinks :1; // max hopcount 2
/* char m_onlyAddSameDomainLinks :1; */
/* char m_onlyAddSameSubdomainLinks :1; */
/* char m_onlyDoNotAddLinksLinks :1; // max hopcount 1 */
/* char m_onlyDoNotAddLinksLinksLinks :1; // max hopcount 2 */
char m_reserved2a:1;
char m_reserved2b:1;
char m_reserved2c:1;
char m_reserved2d:1;

char m_reserved2e:1;
char m_reserved2f:1;
char m_reserved2g:1;
char m_reserved2h:1;

// . each request can have a different hop count
// . this is only valid if m_hopCountValid is true!
short m_hopCount;
*/

long m_hopCount;
//long m_hopCount;

// . this is now computed dynamically often based on the latest
// m_addedTime and m_percentChanged of all the SpideRec *replies*.

@@ -715,6 +723,8 @@ class SpiderRequest {

m_ufn = -1;
// this too
m_priority = -1;
// this happens to be zero already, but just in case it changes
m_parentLangId = langUnknown;
};

static long getNeededSize ( long urlLen ) {

XmlDoc.cpp

@@ -22996,6 +22996,9 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {

Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) return (char *)cu;

uint8_t *langId = getLangId();
if ( ! langId || langId == (uint8_t *)-1 ) return (char *)langId;

// validate this to prevent core for simplified redirect links
long hostHash32a = getHostHash32a();

@@ -23388,6 +23391,11 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {

ksr.m_isAddUrl = 1;
}

// it is useful to know the primary langid of the parent
// when prioritizing links for spidering in the case of
// focussing the search engine on a particular set of langs
ksr.m_parentLangId = *langId;

// don't forget this one!
//ksr.m_spiderTime = nowGlobal;

main.cpp (19 changed lines)

@@ -1024,6 +1024,25 @@ int main2 ( int argc , char *argv[] ) {

testMandrill = true;
}

/*
class foo {
public:
long poo;
};
class fart {
public:
short fart3;
char fart1;
char fart2;
};
foo xxx;
xxx.poo = 38123;
fart *yyy = (fart *)&xxx;
fprintf(stderr,"fart1=%li fart2=%li fart3=%li\n",
(long)yyy->fart1,(long)yyy->fart2,(long)yyy->fart3);
exit(0);
*/

// gb gendbs, preset the hostid at least
if ( //strcmp ( cmd , "gendbs" ) == 0 ||
//strcmp ( cmd , "gentfndb" ) == 0 ||