Remove unused rootlang

This commit is contained in:
Ai Lin Chia
2016-06-24 17:18:14 +02:00
parent 1dca8bc37d
commit 992631a482
4 changed files with 4 additions and 129 deletions

@ -1690,6 +1690,7 @@ void RdbList::merge_r ( RdbList **lists ,
std::set<int64_t> remove_tags;
if ( rdbId == RDB_TAGDB ) {
/// @todo ALC only need this to clean out existing tagdb records. (remove once it's cleaned up!)
remove_tags.insert( getTagTypeFromStr( "rootlang" ) );
remove_tags.insert( getTagTypeFromStr( "manualfilter" ) );
remove_tags.insert( getTagTypeFromStr( "dateformat" ) );
remove_tags.insert( getTagTypeFromStr( "venueaddress" ) );

@ -886,11 +886,6 @@ static TagDesc s_tagDesc[] = {
/// and removing them will cause missing info in the TagDB dump code
/// (when clicking 'page info' in search results)
// data for the "lang" tag is 2 char language id followed by
// a comma then a score from 1 to 100 to indicate percentage.
// Allow multiple "lang" tags in one tagrec.
{"rootlang" ,TDF_STRING,0},
// title tag and incoming link text of the root page is stored here
// for determining default venue addresses
{"roottitles" ,TDF_STRING|TDF_NOINDEX,0},
@ -930,9 +925,10 @@ static TagDesc s_tagDesc[] = {
// doing the throttling, really messing things up
{"firstip" ,0x00,0},
/// @todo ALC only need this until existing tagdb records are merged. (remove once it's cleaned up!)
/// @todo ALC only need this until we cater for unknown tags for display (remember titlerec!)
// As above, we can't remove the following definitions unless we're sure they are no longer set
// Anything below this point is unused.
{"rootlang" ,TDF_STRING,0},
{"manualfilter", 0x00, 0},
{"dateformat", 0x00, 0}, // 1 = american, 2 = european
@ -958,7 +954,6 @@ static TagDesc s_tagDesc[] = {
{"pagerank" ,0x00,0},
{"ruleset" ,0x00,0}
};
// . convert "domain_squatter" to ST_DOMAIN_SQUATTER

@ -1162,7 +1162,6 @@ bool XmlDoc::set2 ( char *titleRec ,
// new stuff
m_siteNumInlinksValid = true;
m_rootLangIdValid = true;
m_metaListCheckSum8Valid = true;
m_hopCountValid = true;
@ -2872,8 +2871,6 @@ char *XmlDoc::prepareToMakeTitleRec ( ) {
if ( ! ls || ls == (void *)-1 ) return (char *)ls;
uint32_t *tph = getTagPairHash32();
if ( ! tph || tph == (uint32_t *)-1 ) return (char *)tph;
uint8_t *rl = getRootLangId();
if ( ! rl || rl == (void *)-1 ) return (char *)rl;
m_prepared = true;
return (char *)1;
@ -3104,7 +3101,6 @@ SafeBuf *XmlDoc::getTitleRecBuf ( ) {
if ( ! m_httpStatusValid ) { g_process.shutdownAbort(true); }
if ( ! m_siteNumInlinksValid ) { g_process.shutdownAbort(true); }
if ( ! m_rootLangIdValid ) { g_process.shutdownAbort(true); }
if ( ! m_hopCountValid ) { g_process.shutdownAbort(true); }
if ( ! m_metaListCheckSum8Valid ) { g_process.shutdownAbort(true); }
@ -5820,86 +5816,6 @@ uint16_t *XmlDoc::getCountryId ( ) {
return &m_countryId;
}
uint8_t *XmlDoc::getRootLangId ( ) {
// return it if we got it
if ( m_rootLangIdValid ) return &m_rootLangId;
// note it
setStatus ( "getting root lang id from tagdb");
// are we a root?
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (uint8_t *)isRoot;
// sanity check - should not be called on a root url
if ( *isRoot ) {
uint8_t *langId = getLangId();
if ( ! langId || langId == (uint8_t *)-1 )
return (uint8_t *) langId;
m_rootLangId = *langId;
m_rootLangIdValid = true;
return &m_rootLangId;
//g_process.shutdownAbort(true); }
}
// get the tag rec
TagRec *gr = getTagRec ();
if ( ! gr || gr == (TagRec *)-1 ) return (uint8_t *)gr;
// just use one. there may be multiple ones!
Tag *tag = gr->getTag("rootlang");
// if there use that
if ( ! tag ) {
// . get the root doc
// . allow for a one hour cache of the titleRec
XmlDoc **prd = getRootXmlDoc( 3600 );
if ( ! prd || prd == (void *)-1 ) return (uint8_t *)prd;
// shortcut
XmlDoc *rd = *prd;
// . if no root doc, then assume language unknown
// . this happens if we are injecting because we do not want
// to download the root page for speed purposes
if ( ! rd ) {
m_rootLangId = langUnknown;
m_rootLangIdValid = true;
return &m_rootLangId;
}
// . update tagdb rec
// . on root download error use language "xx" (unknown) to
// avoid hammering the root page
//bool *status = rd->updateRootLangId ();
//if (! status || status==(void *)-1) return (uint8_t *)status;
// update our tag rec now
//Tag *tt = rd->m_newTagRec.getTag("rootlang");
// must be there
//if ( ! tt ) { g_process.shutdownAbort(true); }
// add it for us
//if ( ! m_newTagRec.addTag ( tt ) ) return NULL;
// get it
uint8_t *rl = rd->getLangId();
if ( ! rl || rl == (void *)-1 ) return (uint8_t *)rl;
// must be legit now!
if ( ! rd->m_langIdValid ) { g_process.shutdownAbort(true);}
// now validate our stuff
m_rootLangIdValid = true;
m_rootLangId = rd->m_langId;
return &m_rootLangId;
}
// sanity check ( must be like "en,50\0" or could be
// "en_US,50\0" or "zh_cn,50"
if ( tag->getTagDataSize() > 6 ) {
g_process.shutdownAbort(true);
}
// point to 2 character language abbreviation
char *abbr = tag->getTagData();
// map it to an id
uint8_t langId = getLangIdFromAbbr( abbr );
// set that up
m_rootLangId = langId;
//m_rootLangIdScore = score;
m_rootLangIdValid = true;
return &m_rootLangId;
}
XmlDoc **XmlDoc::getOldXmlDoc ( ) {
if ( m_oldDocValid ) return &m_oldDoc;
@ -20948,10 +20864,6 @@ SafeBuf *XmlDoc::getNewTagBuf ( ) {
if ( ! grv || grv == (void *)-1 ) return (SafeBuf *)grv;
}
// get root langid of root page
uint8_t *rl = getRootLangId();
if ( ! rl || rl == (void *)-1 ) return (SafeBuf *)rl;
//
// init stuff
//
@ -20989,37 +20901,6 @@ SafeBuf *XmlDoc::getNewTagBuf ( ) {
// reserve it all now
if ( ! tbuf->reserve(need) ) return NULL;
//
// add root langid if we need to
//
const char *oldrl = gr->getString("rootlang", NULL, NULL, &timestamp);
// assume no valid id
int32_t oldrlid = -99;
// convert to id
if ( oldrl ) oldrlid = getLangIdFromAbbr ( oldrl );
// if not in old tag, or changed from what was in tag, or it has
// been 10 days or more, then update tagdb with this tag.
bool addRootLang = false;
if ( ! oldrl ) addRootLang = true;
if ( oldrlid != *rl ) addRootLang = true;
if ( oldrl && now-timestamp > 10*86400 ) addRootLang = true;
// injects do not download the root doc for speed reasons, so do not
// bother for them unless the doc itself is the root.
if ( m_wasContentInjected && !*isRoot ) addRootLang = false;
// . get the two letter (usually) language code from the id
// . i think the two chinese languages are 5 letters
const char *newrl = NULL;
if ( addRootLang )
// i've seen this return NULL because *rl is a corrupt 215
// for some reason
newrl = getLanguageAbbr( *rl );
if ( newrl )
tbuf->addTag3(mysite,"rootlang",now,"xmldoc",*ip,newrl,rdbId);
//
// add "site" tag
//

@ -197,7 +197,7 @@ public:
int8_t m_hopCount;
uint8_t m_langId;
uint8_t m_rootLangId;
uint8_t m_reserved6;
uint8_t m_contentType;
@ -406,7 +406,6 @@ public:
bool getIsPageParser ( ) ;
class XmlDoc **getRootXmlDoc ( int32_t maxCacheAge = 0 ) ;
char **getOldTitleRec ( );
uint8_t *getRootLangId ();
char **getRootTitleRec ( ) ;
int64_t *getAvailDocIdOnly ( int64_t preferredDocId ) ;
int64_t *getDocId ( ) ;
@ -731,7 +730,6 @@ public:
char m_charsetValid;
char m_langVectorValid;
char m_langIdValid;
char m_rootLangIdValid;
char m_datedbDateValid;
char m_isRSSValid;
char m_isSiteMapValid;