added support for spidering a particular language with higher priority.
mwells
2014-05-09 10:03:24 -06:00
parent 45e2506598
commit 6048ae849b
8 changed files with 523 additions and 22 deletions

@@ -1862,6 +1862,9 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
// addDefault = true;
if ( ! rebuild ) return true;
if ( m_urlFiltersProfile == UFP_CHINESE )
return rebuildChineseRules();
long n = 0;
/*
@@ -1945,7 +1948,6 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("hopcount==0 && iswww");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
@@ -2108,6 +2110,383 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
return true;
}
bool CollectionRec::rebuildChineseRules ( ) {
long n = 0;
m_regExs[n].set("isdocidbased");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 80;
n++;
m_regExs[n].set("ismedia");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = -3; // delete!
n++;
// if not in the site list then nuke it
m_regExs[n].set("!ismanualadd && !insitelist");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = -3; // delete!
n++;
m_regExs[n].set("errorcount>=3 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // 30 days default
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 3;
n++;
m_regExs[n].set("errorcount>=1 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // 30 days default
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 45;
n++;
m_regExs[n].set("isaddurl");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 85;
n++;
m_regExs[n].set("hopcount==0 && iswww && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 50;
n++;
m_regExs[n].set("hopcount==0 && iswww && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 50;
n++;
m_regExs[n].set("hopcount==0 && iswww && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 20;
n++;
m_regExs[n].set("hopcount==0 && iswww && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 48;
n++;
m_regExs[n].set("hopcount==0 && iswww && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 48;
n++;
m_regExs[n].set("hopcount==0 && iswww");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 19;
n++;
m_regExs[n].set("hopcount==0 && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 49;
n++;
m_regExs[n].set("hopcount==0 && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 49;
n++;
m_regExs[n].set("hopcount==0 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 18;
n++;
m_regExs[n].set("hopcount==0 && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 47;
n++;
m_regExs[n].set("hopcount==0 && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 47;
n++;
m_regExs[n].set("hopcount==0");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 17;
n++;
m_regExs[n].set("hopcount==1 && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 40;
n++;
m_regExs[n].set("hopcount==1 && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 40;
n++;
m_regExs[n].set("hopcount==1 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 16;
n++;
m_regExs[n].set("hopcount==1 && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 39;
n++;
m_regExs[n].set("hopcount==1 && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 39;
n++;
m_regExs[n].set("hopcount==1");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 15;
n++;
m_regExs[n].set("hopcount==2 && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
n++;
m_regExs[n].set("hopcount==2 && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
n++;
m_regExs[n].set("hopcount==2 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 14;
n++;
m_regExs[n].set("hopcount==2 && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 29;
n++;
m_regExs[n].set("hopcount==2 && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 29;
n++;
m_regExs[n].set("hopcount==2");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 13;
n++;
m_regExs[n].set("hopcount>=3 && isnew && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 22;
n++;
m_regExs[n].set("hopcount>=3 && isnew && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 22;
n++;
m_regExs[n].set("hopcount>=3 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 12;
n++;
m_regExs[n].set("hopcount>=3 && tld==cn");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 21;
n++;
m_regExs[n].set("hopcount>=3 && parentlang==zh_cn,zh_tw,xx");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 21;
n++;
m_regExs[n].set("hopcount>=3");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 11;
n++;
m_regExs[n].set("default");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 1;
n++;
m_numRegExs = n;
m_numRegExs2 = n;
m_numRegExs3 = n;
m_numRegExs10 = n;
m_numRegExs5 = n;
m_numRegExs6 = n;
m_numRegExs8 = n;
// done rebuilding CHINESE rules
return true;
}
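The per-rule blocks above all have the same shape; for readability they could be generated by a small helper. A minimal sketch under that assumption (addChineseRule() is hypothetical, would need a declaration in CollectionRec, and only touches the member arrays already used above):

// hypothetical helper mirroring one block of rebuildChineseRules()
void CollectionRec::addChineseRule ( long &n , char *expr , float freq ,
                                     long maxSpiders , long priority ) {
	m_regExs             [n].set ( expr );
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = freq;       // days before respider
	m_maxSpidersPerRule  [n] = maxSpiders; // max spiders
	m_spiderIpMaxSpiders [n] = 7;          // max spiders per ip
	m_spiderIpWaits      [n] = 1000;       // same ip wait (ms)
	m_spiderPriorities   [n] = priority;
	n++;
}
// usage, equivalent to two of the blocks above:
// addChineseRule ( n , "hopcount==0 && iswww && tld==cn" , 7.0 , 9 , 48 );
// addChineseRule ( n , "hopcount==0 && iswww && parentlang==zh_cn,zh_tw,xx" , 7.0 , 9 , 48 );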
/*
bool CrawlInfo::print (SafeBuf *sb ) {
return sb->safePrintf("objectsAdded:%lli\n"
@@ -2758,3 +3137,4 @@ void testRegex ( ) {
url,rx);
exit(0);
}

@@ -368,6 +368,8 @@ class CollectionRec {
// for regular crawls
bool rebuildUrlFilters2();
bool rebuildChineseRules();
bool m_urlFiltersHavePageCounts;
// moved from SpiderColl so we can load up at startup

@@ -1318,9 +1318,9 @@ bool printDropDown ( long n , SafeBuf* sb, char *name, long select,
bool printDropDownProfile ( SafeBuf* sb, char *name, long select ) {
sb->safePrintf ( "<select name=%s>", name );
// the type of url filters profiles
char *items[] = {"custom","web","news"};
char *items[] = {"custom","web","news","chinese"};
char *s;
for ( long i = 0 ; i < 3 ; i++ ) {
for ( long i = 0 ; i < 4 ; i++ ) {
if ( i == select ) s = " selected";
else s = "";
sb->safePrintf ("<option value=%li%s>%s",i,s,items[i]);
@@ -7702,7 +7702,9 @@ void Parms::init ( ) {
"tools. "
"Limit list to 300MB. If you have a lot of INDIVIDUAL urls "
"to add then consider using the <a href=/admin/addurl>add "
"urls</a> interface.";
"urls</a> interface. <b>IF YOU WANT TO SPIDER THE WHOLE "
"WEB</b> then only use the <i>seed:</i> directives here "
"lest you limit yourself to a set of domains.";
m->m_cgi = "sitelist";
m->m_off = (char *)&cr.m_siteListBuf - x;
m->m_page = PAGE_BASIC_SETTINGS;
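To illustrate the distinction the help text draws, a hedged sketch of two site lists (hostnames invented; the seed: syntax is taken from the text above and the deletion behaviour from the "!ismanualadd && !insitelist" rule earlier in this diff):

// crawl only these domains; urls outside them are nuked by the
// "!ismanualadd && !insitelist" url-filter rule
const char *restrictedSiteList = "www.example.com\nwww.example.org\n";
// seed a whole-web crawl: start here, but do not restrict the spider
// to these domains
const char *wholeWebSiteList   = "seed:www.example.com\nseed:www.example.org\n";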
@@ -18735,13 +18737,14 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
"<td>"
"This is true if the url was directly "
"injected from the "
"/inject page or API."
"<a href=/admin/inject>inject page</a> or API."
"</td></tr>"
"<tr class=poo><td>isdocidbased | !isdocidbased</td>"
"<td>"
"This is true if the url was added from the "
"reindex interface. The request does not contain "
"<a href=/admin/reindex>query reindex</a> "
"interface. The request does not contain "
"a url, but only a docid, that way we can add "
"millions of search results very quickly without "
"having to lookup each of their urls. You should "
@@ -18932,6 +18935,16 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
"See table below for supported language "
"abbreviations.</td></tr>"
"<tr class=poo><td><nobr>parentlang==zh_cn,zh_tw,xx"
"</nobr></td>"
"<td>Matches if "
"the url's referring parent url is primarily in "
"this language. Useful for prioritizing spidering "
"pages of a certain language."
"See table below for supported language "
"abbreviations."
"</td></tr>"
/*
"<tr class=poo><td>link:gigablast</td>"
"<td>Matches if the document links to gigablast."

@@ -22,7 +22,8 @@ enum {
UFP_CUSTOM = 0 ,
UFP_NONE = 0 ,
UFP_WEB = 1 ,
UFP_NEWS = 2
UFP_NEWS = 2 ,
UFP_CHINESE = 3
};
// special priorities for the priority drop down

@@ -110,7 +110,7 @@ long SpiderRequest::print ( SafeBuf *sbarg ) {
sb->safePrintf("parentDomHash32=0x%lx ",m_parentDomHash32 );
sb->safePrintf("parentSiteHash32=0x%lx ",m_parentSiteHash32 );
sb->safePrintf("hopCount=%li ",m_hopCount );
sb->safePrintf("hopCount=%li ",(long)m_hopCount );
//timeStruct = gmtime ( &m_spiderTime );
//time[0] = 0;
@@ -301,7 +301,7 @@ long SpiderRequest::printToTable ( SafeBuf *sb , char *status ,
sb->safePrintf(" <td>%li</td>\n",m_siteNumInlinks );
//sb->safePrintf(" <td>%li</td>\n",m_pageNumInlinks );
sb->safePrintf(" <td>%li</td>\n",m_hopCount );
sb->safePrintf(" <td>%li</td>\n",(long)m_hopCount );
// print time format: 7/23/1971 10:45:32
struct tm *timeStruct ;
@@ -436,7 +436,7 @@ long SpiderRequest::printToTableSimple ( SafeBuf *sb , char *status ,
sb->safePrintf(" <td>%li</td>\n",(long)m_errCount );
sb->safePrintf(" <td>%li</td>\n",m_hopCount );
sb->safePrintf(" <td>%li</td>\n",(long)m_hopCount );
// print time format: 7/23/1971 10:45:32
struct tm *timeStruct ;
@@ -9912,6 +9912,13 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
langLen = gbstrlen(lang);
}
// . get parent language in the request
// . primary language of the parent page that linked to this url
char *plang = NULL;
long plangLen = 0;
plang = getLanguageAbbr(sreq->m_parentLangId);
if ( plang ) plangLen = gbstrlen(plang);
char *tld = (char *)-1;
long tldLen;
@@ -11026,6 +11033,67 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
// come here if we did not match the tld
}
// parentlang=en,zh_cn
if ( *p=='p' && strncmp(p,"parentlang",10)==0){
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// must have a reply
//if ( ! srep ) continue;
// skip if unknown? no, we support "xx" as unknown now
//if ( srep->m_langId == 0 ) continue;
// set these up
char *b = s;
// loop for the comma-separated list of langids
// like parentlang==en,es,...
subloop2b:
// get length of it in the expression box
char *start = b;
while ( *b && !is_wspace_a(*b) && *b!=',' ) b++;
long blen = b - start;
//char sm;
// if we had parentlang==en,es,...
if ( sign == SIGN_EQ &&
blen == plangLen &&
strncasecmp(start,plang,plangLen)==0 )
// if we matched any, that's great
goto matched2b;
// if its parentlang!=en,es,...
// and we equal the string, then we do not match this
// particular rule!!!
if ( sign == SIGN_NE &&
blen == plangLen &&
strncasecmp(start,plang,plangLen)==0 )
// we do not match this rule if we matched
// any of the langs in the != list
continue;
// might have another in the comma-separated list
if ( *b != ',' ) {
// if that was the end of the list and the
// sign was == then skip this rule
if ( sign == SIGN_EQ ) continue;
// otherwise, if the sign was != then we win!
if ( sign == SIGN_NE ) goto matched2b;
// otherwise, bad sign?
continue;
}
// advance to the next list item if there was a comma after us
b++;
// and try again
goto subloop2b;
// come here on a match
matched2b:
// we matched, now look for &&
p = strstr ( b , "&&" );
// if nothing else, then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
// come here if we did not match the parentlang
}
// hopcount == 20 [&&]
if ( *p=='h' && strncmp(p, "hopcount", 8) == 0){
// skip if not valid
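The comma-list handling above boils down to a simple rule, summarized by this standalone sketch (an approximation of the goto loop's == / != behaviour, not the code actually used by getUrlFilterNum2()):

#include <string.h>
#include <strings.h>
#include <ctype.h>

// == : matches if plang equals at least one item of the comma-separated list
// != : matches if plang equals none of the items
static bool parentLangListMatches ( const char *list , const char *plang ,
                                    bool isEqSign ) {
	long plangLen = (long)strlen ( plang );
	const char *b = list;
	for ( ; ; ) {
		const char *start = b;
		// an item ends at a comma, whitespace or end of string
		while ( *b && ! isspace((unsigned char)*b) && *b != ',' ) b++;
		long blen = (long)( b - start );
		if ( blen == plangLen &&
		     strncasecmp ( start , plang , plangLen ) == 0 )
			return isEqSign;   // == matched, != did not
		if ( *b != ',' ) break;    // end of the list
		b++;                       // try the next item
	}
	return ! isEqSign;                 // no item matched
}
// parentLangListMatches ( "zh_cn,zh_tw,xx" , "zh_tw" , true  ) is true
// parentLangListMatches ( "zh_cn,zh_tw,xx" , "en"    , false ) is true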

@@ -528,29 +528,37 @@ class SpiderRequest {
// . this is zero if none or invalid
long m_contentHash32;
/*
char m_reserved1;
// . each request can have a different hop count
// . this is only valid if m_hopCountValid is true!
// . i made this a short from a long to support m_parentLangId etc above
short m_hopCount;
// when creating a chinese search engine, for instance, it is nice
// to know the language of the parent of the page we are spidering.
// typically a chinese page will link to another chinese page,
// though not always of course. this is the primary language of
// the parent.
uint8_t m_parentLangId;//reserved1;
// the new add url control will allow the user to control link spidering
// on each url they add. they can also specify file:// instead of
// http:// to index local files. so we have to allow file://
char m_onlyAddSameDomainLinks :1;
char m_onlyAddSameSubdomainLinks :1;
char m_onlyDoNotAddLinksLinks :1; // max hopcount 1
char m_onlyDoNotAddLinksLinksLinks :1; // max hopcount 2
/* char m_onlyAddSameDomainLinks :1; */
/* char m_onlyAddSameSubdomainLinks :1; */
/* char m_onlyDoNotAddLinksLinks :1; // max hopcount 1 */
/* char m_onlyDoNotAddLinksLinksLinks :1; // max hopcount 2 */
char m_reserved2a:1;
char m_reserved2b:1;
char m_reserved2c:1;
char m_reserved2d:1;
char m_reserved2e:1;
char m_reserved2f:1;
char m_reserved2g:1;
char m_reserved2h:1;
// . each request can have a different hop count
// . this is only valid if m_hopCountValid is true!
short m_hopCount;
*/
long m_hopCount;
//long m_hopCount;
// . this is now computed dynamically often based on the latest
// m_addedTime and m_percentChanged of all the SpiderRec *replies*.
@@ -715,6 +723,8 @@ class SpiderRequest {
m_ufn = -1;
// this too
m_priority = -1;
// this happens to be zero already, but just in case it changes
m_parentLangId = langUnknown;
};
static long getNeededSize ( long urlLen ) {

@@ -22996,6 +22996,9 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) return (char *)cu;
uint8_t *langId = getLangId();
if ( ! langId || langId == (uint8_t *)-1 ) return (char *)langId;
// validate this to prevent core for simplified redirect links
long hostHash32a = getHostHash32a();
@@ -23388,6 +23391,11 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
ksr.m_isAddUrl = 1;
}
// it is useful to know the primary langid of the parent
// when prioritizing links for spidering in the case of
// focusing the search engine on a particular set of langs
ksr.m_parentLangId = *langId;
// don't forget this one!
//ksr.m_spiderTime = nowGlobal;

@@ -1024,6 +1024,25 @@ int main2 ( int argc , char *argv[] ) {
testMandrill = true;
}
/*
class foo {
public:
long poo;
};
class fart {
public:
short fart3;
char fart1;
char fart2;
};
foo xxx;
xxx.poo = 38123;
fart *yyy = (fart *)&xxx;
fprintf(stderr,"fart1=%li fart2=%li fart3=%li\n",
(long)yyy->fart1,(long)yyy->fart2,(long)yyy->fart3);
exit(0);
*/
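The commented-out scratch code above checks that a 4-byte long can be overlaid by a short plus two chars, which is the layout question raised by carving SpiderRequest's old long m_hopCount into smaller fields such as m_parentLangId (see the SpiderRequest hunk earlier in this diff). A compile-time version of the same check, as a sketch with illustrative struct names (assumes the 32-bit build where long is 4 bytes):

#include <stdint.h>

struct OldHopCountField { int32_t m_hopCount; };   // the old 4-byte field
struct NewHopCountField {                          // its replacement
	int16_t m_hopCount;
	uint8_t m_parentLangId;
	uint8_t m_reservedBits;                    // m_reserved2a..2h
};
// same size, so existing on-disk records still line up
static_assert ( sizeof(OldHopCountField) == sizeof(NewHopCountField) ,
                "field overlay must not change the SpiderRequest size" );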
// gb gendbs, preset the hostid at least
if ( //strcmp ( cmd , "gendbs" ) == 0 ||
//strcmp ( cmd , "gentfndb" ) == 0 ||