forked from Mirrors/privacore-open-source-search-engine
Remove unused rootlang
This commit is contained in:
@ -1690,6 +1690,7 @@ void RdbList::merge_r ( RdbList **lists ,
|
||||
std::set<int64_t> remove_tags;
|
||||
if ( rdbId == RDB_TAGDB ) {
|
||||
/// @todo ALC only need this to clean out existing tagdb records. (remove once it's cleaned up!)
|
||||
remove_tags.insert( getTagTypeFromStr( "rootlang" ) );
|
||||
remove_tags.insert( getTagTypeFromStr( "manualfilter" ) );
|
||||
remove_tags.insert( getTagTypeFromStr( "dateformat" ) );
|
||||
remove_tags.insert( getTagTypeFromStr( "venueaddress" ) );
|
||||
|
@ -886,11 +886,6 @@ static TagDesc s_tagDesc[] = {
|
||||
/// and removing them will cause missing info in the TagDB dump code
|
||||
/// (when clicking 'page info' in search results)
|
||||
|
||||
// data for the "lang" tag is 2 char language id followed by
|
||||
// a comma then a score from 1 to 100 to indicate percentage.
|
||||
// Allow multiple "lang" tags in one tagrec.
|
||||
{"rootlang" ,TDF_STRING,0},
|
||||
|
||||
// title tag and incoming link text of the root page is stored here
|
||||
// for determining default venue addresses
|
||||
{"roottitles" ,TDF_STRING|TDF_NOINDEX,0},
|
||||
@ -930,9 +925,10 @@ static TagDesc s_tagDesc[] = {
|
||||
// doing the throttling, really messing things up
|
||||
{"firstip" ,0x00,0},
|
||||
|
||||
/// @todo ALC only need this until existing tagdb records is merged. (remove once it's cleaned up!)
|
||||
/// @todo ALC only need this until we cater for unknown tags for display (remember titlerec!)
|
||||
// As above, we can't remove the following definition unless if we're sure it's not set anymore
|
||||
// Anything below this point is unused.
|
||||
{"rootlang" ,TDF_STRING,0},
|
||||
{"manualfilter", 0x00, 0},
|
||||
{"dateformat", 0x00, 0}, // 1 = american, 2 = european
|
||||
|
||||
@ -958,7 +954,6 @@ static TagDesc s_tagDesc[] = {
|
||||
|
||||
{"pagerank" ,0x00,0},
|
||||
{"ruleset" ,0x00,0}
|
||||
|
||||
};
|
||||
|
||||
// . convert "domain_squatter" to ST_DOMAIN_SQUATTER
|
||||
|
119
XmlDoc.cpp
119
XmlDoc.cpp
@ -1162,7 +1162,6 @@ bool XmlDoc::set2 ( char *titleRec ,
|
||||
|
||||
// new stuff
|
||||
m_siteNumInlinksValid = true;
|
||||
m_rootLangIdValid = true;
|
||||
m_metaListCheckSum8Valid = true;
|
||||
|
||||
m_hopCountValid = true;
|
||||
@ -2872,8 +2871,6 @@ char *XmlDoc::prepareToMakeTitleRec ( ) {
|
||||
if ( ! ls || ls == (void *)-1 ) return (char *)ls;
|
||||
uint32_t *tph = getTagPairHash32();
|
||||
if ( ! tph || tph == (uint32_t *)-1 ) return (char *)tph;
|
||||
uint8_t *rl = getRootLangId();
|
||||
if ( ! rl || rl == (void *)-1 ) return (char *)rl;
|
||||
|
||||
m_prepared = true;
|
||||
return (char *)1;
|
||||
@ -3104,7 +3101,6 @@ SafeBuf *XmlDoc::getTitleRecBuf ( ) {
|
||||
if ( ! m_httpStatusValid ) { g_process.shutdownAbort(true); }
|
||||
|
||||
if ( ! m_siteNumInlinksValid ) { g_process.shutdownAbort(true); }
|
||||
if ( ! m_rootLangIdValid ) { g_process.shutdownAbort(true); }
|
||||
|
||||
if ( ! m_hopCountValid ) { g_process.shutdownAbort(true); }
|
||||
if ( ! m_metaListCheckSum8Valid ) { g_process.shutdownAbort(true); }
|
||||
@ -5820,86 +5816,6 @@ uint16_t *XmlDoc::getCountryId ( ) {
|
||||
return &m_countryId;
|
||||
}
|
||||
|
||||
uint8_t *XmlDoc::getRootLangId ( ) {
|
||||
|
||||
// return it if we got it
|
||||
if ( m_rootLangIdValid ) return &m_rootLangId;
|
||||
// note it
|
||||
setStatus ( "getting root lang id from tagdb");
|
||||
// are we a root?
|
||||
char *isRoot = getIsSiteRoot();
|
||||
if ( ! isRoot || isRoot == (char *)-1 ) return (uint8_t *)isRoot;
|
||||
// sanity check - should not be called on a root url
|
||||
if ( *isRoot ) {
|
||||
uint8_t *langId = getLangId();
|
||||
if ( ! langId || langId == (uint8_t *)-1 )
|
||||
return (uint8_t *) langId;
|
||||
m_rootLangId = *langId;
|
||||
m_rootLangIdValid = true;
|
||||
return &m_rootLangId;
|
||||
//g_process.shutdownAbort(true); }
|
||||
}
|
||||
// get the tag rec
|
||||
TagRec *gr = getTagRec ();
|
||||
if ( ! gr || gr == (TagRec *)-1 ) return (uint8_t *)gr;
|
||||
// just use one. there may be multiple ones!
|
||||
Tag *tag = gr->getTag("rootlang");
|
||||
// if there use that
|
||||
if ( ! tag ) {
|
||||
// . get the root doc
|
||||
// . allow for a one hour cache of the titleRec
|
||||
XmlDoc **prd = getRootXmlDoc( 3600 );
|
||||
if ( ! prd || prd == (void *)-1 ) return (uint8_t *)prd;
|
||||
// shortcut
|
||||
XmlDoc *rd = *prd;
|
||||
// . if no root doc, then assume language unknown
|
||||
// . this happens if we are injecting because we do not want
|
||||
// to download the root page for speed purposes
|
||||
if ( ! rd ) {
|
||||
m_rootLangId = langUnknown;
|
||||
m_rootLangIdValid = true;
|
||||
return &m_rootLangId;
|
||||
}
|
||||
// . update tagdb rec
|
||||
// . on root download error use language "xx" (unknown) to
|
||||
// avoid hammering the root page
|
||||
//bool *status = rd->updateRootLangId ();
|
||||
//if (! status || status==(void *)-1) return (uint8_t *)status;
|
||||
// update our tag rec now
|
||||
//Tag *tt = rd->m_newTagRec.getTag("rootlang");
|
||||
// must be there
|
||||
//if ( ! tt ) { g_process.shutdownAbort(true); }
|
||||
// add it for us
|
||||
//if ( ! m_newTagRec.addTag ( tt ) ) return NULL;
|
||||
// get it
|
||||
uint8_t *rl = rd->getLangId();
|
||||
if ( ! rl || rl == (void *)-1 ) return (uint8_t *)rl;
|
||||
// must be legit now!
|
||||
if ( ! rd->m_langIdValid ) { g_process.shutdownAbort(true);}
|
||||
// now validate our stuff
|
||||
m_rootLangIdValid = true;
|
||||
m_rootLangId = rd->m_langId;
|
||||
return &m_rootLangId;
|
||||
}
|
||||
|
||||
// sanity check ( must be like "en,50\0" or could be
|
||||
// "en_US,50\0" or "zh_cn,50"
|
||||
if ( tag->getTagDataSize() > 6 ) {
|
||||
g_process.shutdownAbort(true);
|
||||
}
|
||||
|
||||
// point to 2 character language abbreviation
|
||||
char *abbr = tag->getTagData();
|
||||
// map it to an id
|
||||
uint8_t langId = getLangIdFromAbbr( abbr );
|
||||
|
||||
// set that up
|
||||
m_rootLangId = langId;
|
||||
//m_rootLangIdScore = score;
|
||||
m_rootLangIdValid = true;
|
||||
return &m_rootLangId;
|
||||
}
|
||||
|
||||
XmlDoc **XmlDoc::getOldXmlDoc ( ) {
|
||||
|
||||
if ( m_oldDocValid ) return &m_oldDoc;
|
||||
@ -20948,10 +20864,6 @@ SafeBuf *XmlDoc::getNewTagBuf ( ) {
|
||||
if ( ! grv || grv == (void *)-1 ) return (SafeBuf *)grv;
|
||||
}
|
||||
|
||||
// get root langid of root page
|
||||
uint8_t *rl = getRootLangId();
|
||||
if ( ! rl || rl == (void *)-1 ) return (SafeBuf *)rl;
|
||||
|
||||
//
|
||||
// init stuff
|
||||
//
|
||||
@ -20989,37 +20901,6 @@ SafeBuf *XmlDoc::getNewTagBuf ( ) {
|
||||
// reserve it all now
|
||||
if ( ! tbuf->reserve(need) ) return NULL;
|
||||
|
||||
|
||||
|
||||
//
|
||||
// add root langid if we need to
|
||||
//
|
||||
const char *oldrl = gr->getString("rootlang", NULL, NULL, &timestamp);
|
||||
// assume no valid id
|
||||
int32_t oldrlid = -99;
|
||||
// convert to id
|
||||
if ( oldrl ) oldrlid = getLangIdFromAbbr ( oldrl );
|
||||
|
||||
// if not in old tag, or changed from what was in tag, or it has
|
||||
// been 10 days or more, then update tagdb with this tag.
|
||||
bool addRootLang = false;
|
||||
if ( ! oldrl ) addRootLang = true;
|
||||
if ( oldrlid != *rl ) addRootLang = true;
|
||||
if ( oldrl && now-timestamp > 10*86400 ) addRootLang = true;
|
||||
// injects do not download the root doc for speed reasons, so do not
|
||||
// bother for them unless the doc itself is the root.
|
||||
if ( m_wasContentInjected && !*isRoot ) addRootLang = false;
|
||||
// . get the two letter (usually) language code from the id
|
||||
// . i think the two chinese languages are 5 letters
|
||||
const char *newrl = NULL;
|
||||
if ( addRootLang )
|
||||
// i've seen this return NULL because *rl is a corrupt 215
|
||||
// for some reason
|
||||
newrl = getLanguageAbbr( *rl );
|
||||
|
||||
if ( newrl )
|
||||
tbuf->addTag3(mysite,"rootlang",now,"xmldoc",*ip,newrl,rdbId);
|
||||
|
||||
//
|
||||
// add "site" tag
|
||||
//
|
||||
|
4
XmlDoc.h
4
XmlDoc.h
@ -197,7 +197,7 @@ public:
|
||||
|
||||
int8_t m_hopCount;
|
||||
uint8_t m_langId;
|
||||
uint8_t m_rootLangId;
|
||||
uint8_t m_reserved6;
|
||||
uint8_t m_contentType;
|
||||
|
||||
|
||||
@ -406,7 +406,6 @@ public:
|
||||
bool getIsPageParser ( ) ;
|
||||
class XmlDoc **getRootXmlDoc ( int32_t maxCacheAge = 0 ) ;
|
||||
char **getOldTitleRec ( );
|
||||
uint8_t *getRootLangId ();
|
||||
char **getRootTitleRec ( ) ;
|
||||
int64_t *getAvailDocIdOnly ( int64_t preferredDocId ) ;
|
||||
int64_t *getDocId ( ) ;
|
||||
@ -731,7 +730,6 @@ public:
|
||||
char m_charsetValid;
|
||||
char m_langVectorValid;
|
||||
char m_langIdValid;
|
||||
char m_rootLangIdValid;
|
||||
char m_datedbDateValid;
|
||||
char m_isRSSValid;
|
||||
char m_isSiteMapValid;
|
||||
|
Reference in New Issue
Block a user