Remove unused rootlang tag generation and XmlDoc root language id

2016-06-24 17:18:14 +02:00
parent 1dca8bc37d
commit 992631a482
4 changed files with 4 additions and 129 deletions
--- a/RdbList.cpp
+++ b/RdbList.cpp
@ -1690,6 +1690,7 @@ void RdbList::merge_r ( RdbList **lists         ,
 	std::set<int64_t> remove_tags;
 	if ( rdbId == RDB_TAGDB ) {
 		/// @todo ALC only need this to clean out existing tagdb records. (remove once it's cleaned up!)
+		remove_tags.insert( getTagTypeFromStr( "rootlang" ) );
 		remove_tags.insert( getTagTypeFromStr( "manualfilter" ) );
 		remove_tags.insert( getTagTypeFromStr( "dateformat" ) );
 		remove_tags.insert( getTagTypeFromStr( "venueaddress" ) );
--- a/Tagdb.cpp
+++ b/Tagdb.cpp
@ -886,11 +886,6 @@ static TagDesc s_tagDesc[] = {
 	///     and removing them will cause missing info in the TagDB dump code
 	///     (when clicking 'page info' in search results)

-	// data for the "lang" tag is 2 char language id followed by
-	// a comma then a score from 1 to 100 to indicate percentage.
-	// Allow multiple "lang" tags in one tagrec.
-	{"rootlang"             ,TDF_STRING,0},
-
 	// title tag and incoming link text of the root page is stored here
 	// for determining default venue addresses
 	{"roottitles"             ,TDF_STRING|TDF_NOINDEX,0},
@ -930,9 +925,10 @@ static TagDesc s_tagDesc[] = {
 	//   doing the throttling, really messing things up
 	{"firstip"              ,0x00,0},

-	/// @todo ALC only need this until existing tagdb records is merged. (remove once it's cleaned up!)
+	/// @todo ALC only needed until unknown tags can be handled for display (remember titlerec!)
    // As above, we can't remove the following definition unless if we're sure it's not set anymore
    // Anything below this point is unused.
+	{"rootlang"             ,TDF_STRING,0},
 	{"manualfilter", 0x00, 0},
 	{"dateformat", 0x00, 0}, // 1 = american, 2 = european

@ -958,7 +954,6 @@ static TagDesc s_tagDesc[] = {

 	{"pagerank"             ,0x00,0},
 	{"ruleset"              ,0x00,0}
-
 };

 // . convert "domain_squatter" to ST_DOMAIN_SQUATTER
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -1162,7 +1162,6 @@ bool XmlDoc::set2 ( char    *titleRec ,

 	// new stuff
 	m_siteNumInlinksValid         = true;
-	m_rootLangIdValid             = true;
 	m_metaListCheckSum8Valid      = true;

 	m_hopCountValid               = true;
@ -2872,8 +2871,6 @@ char *XmlDoc::prepareToMakeTitleRec ( ) {
 	if ( ! ls || ls == (void *)-1 ) return (char *)ls;
 	uint32_t *tph = getTagPairHash32();
 	if ( ! tph || tph == (uint32_t *)-1 ) return (char *)tph;
-	uint8_t *rl = getRootLangId();
-	if ( ! rl || rl == (void *)-1 ) return (char *)rl;

 	m_prepared = true;
 	return (char *)1;
@ -3104,7 +3101,6 @@ SafeBuf *XmlDoc::getTitleRecBuf ( ) {
 	if ( ! m_httpStatusValid             ) { g_process.shutdownAbort(true); }

 	if ( ! m_siteNumInlinksValid         ) { g_process.shutdownAbort(true); }
-	if ( ! m_rootLangIdValid             ) { g_process.shutdownAbort(true); }

 	if ( ! m_hopCountValid               ) { g_process.shutdownAbort(true); }
 	if ( ! m_metaListCheckSum8Valid      ) { g_process.shutdownAbort(true); }
@ -5820,86 +5816,6 @@ uint16_t *XmlDoc::getCountryId ( ) {
 	return &m_countryId;
 }

-uint8_t *XmlDoc::getRootLangId ( ) {
-
-	// return it if we got it
-	if ( m_rootLangIdValid ) return &m_rootLangId;
-	// note it
-	setStatus ( "getting root lang id from tagdb");
-	// are we a root?
-	char *isRoot = getIsSiteRoot();
-	if ( ! isRoot || isRoot == (char *)-1 ) return (uint8_t *)isRoot;
-	// sanity check - should not be called on a root url
-	if ( *isRoot ) {
-		uint8_t *langId = getLangId();
-		if ( ! langId || langId == (uint8_t *)-1 )
-			return (uint8_t *) langId;
-		m_rootLangId = *langId;
-		m_rootLangIdValid = true;
-		return &m_rootLangId;
-		//g_process.shutdownAbort(true); }
-	}
-	// get the tag rec
-	TagRec *gr = getTagRec ();
-	if ( ! gr || gr == (TagRec *)-1 ) return (uint8_t *)gr;
-	// just use one. there may be multiple ones!
-	Tag *tag = gr->getTag("rootlang");
-	// if there use that
-	if ( ! tag ) {
-		// . get the root doc
- 		// . allow for a one hour cache of the titleRec
-		XmlDoc **prd = getRootXmlDoc( 3600 );
-		if ( ! prd || prd == (void *)-1 ) return (uint8_t *)prd;
-		// shortcut
-		XmlDoc *rd = *prd;
-		// . if no root doc, then assume language unknown
-		// . this happens if we are injecting because we do not want
-		//   to download the root page for speed purposes
-		if ( ! rd ) {
-			m_rootLangId = langUnknown;
-			m_rootLangIdValid = true;
-			return &m_rootLangId;
-		}
-		// . update tagdb rec
-		// . on root download error use language "xx" (unknown) to
-		//   avoid hammering the root page
-		//bool *status = rd->updateRootLangId ();
-		//if (! status || status==(void *)-1) return (uint8_t *)status;
-		// update our tag rec now
-		//Tag *tt = rd->m_newTagRec.getTag("rootlang");
-		// must be there
-		//if ( ! tt ) { g_process.shutdownAbort(true); }
-		// add it for us
-		//if ( ! m_newTagRec.addTag ( tt ) ) return NULL;
-		// get it
-		uint8_t *rl = rd->getLangId();
-		if ( ! rl || rl == (void *)-1 ) return (uint8_t *)rl;
-		// must be legit now!
-		if ( ! rd->m_langIdValid ) { g_process.shutdownAbort(true);}
-		// now validate our stuff
-		m_rootLangIdValid = true;
-		m_rootLangId      = rd->m_langId;
-		return &m_rootLangId;
-	}
-
-	// sanity check ( must be like "en,50\0" or could be
-	// "en_US,50\0" or "zh_cn,50"
-	if ( tag->getTagDataSize() > 6 ) {
-		g_process.shutdownAbort(true);
-	}
-
-	// point to 2 character language abbreviation
-	char *abbr = tag->getTagData();
-	// map it to an id
-	uint8_t langId = getLangIdFromAbbr( abbr );
-
-	// set that up
-	m_rootLangId      = langId;
-	//m_rootLangIdScore = score;
-	m_rootLangIdValid = true;
-	return &m_rootLangId;
-}
-
 XmlDoc **XmlDoc::getOldXmlDoc ( ) {

 	if ( m_oldDocValid ) return &m_oldDoc;
@ -20948,10 +20864,6 @@ SafeBuf *XmlDoc::getNewTagBuf ( ) {
 		if ( ! grv || grv == (void *)-1 ) return (SafeBuf *)grv;
 	}

-	// get root langid of root page
-	uint8_t *rl = getRootLangId();
-	if ( ! rl || rl == (void *)-1 ) return (SafeBuf *)rl;
-
 	//
 	// init stuff
 	//
@ -20989,37 +20901,6 @@ SafeBuf *XmlDoc::getNewTagBuf ( ) {
 	// reserve it all now
 	if ( ! tbuf->reserve(need) ) return NULL;

-
-
-	//
-	// add root langid if we need to
-	//
-	const char *oldrl = gr->getString("rootlang", NULL, NULL, &timestamp);
-	// assume no valid id
-	int32_t oldrlid = -99;
-	// convert to id
-	if ( oldrl ) oldrlid = getLangIdFromAbbr ( oldrl );
-
-	// if not in old tag, or changed from what was in tag, or it has
-	// been 10 days or more, then update tagdb with this tag.
-	bool addRootLang = false;
-	if ( ! oldrl ) addRootLang = true;
-	if ( oldrlid != *rl ) addRootLang = true;
-	if ( oldrl && now-timestamp > 10*86400 ) addRootLang = true;
-	// injects do not download the root doc for speed reasons, so do not
-	// bother for them unless the doc itself is the root.
-	if ( m_wasContentInjected && !*isRoot ) addRootLang = false;
-	// . get the two letter (usually) language code from the id
-	// . i think the two chinese languages are 5 letters
-	const char *newrl = NULL;
-	if ( addRootLang )
-		// i've seen this return NULL because *rl is a corrupt 215
-		// for some reason
-		newrl = getLanguageAbbr( *rl );
-
-	if ( newrl )
-		tbuf->addTag3(mysite,"rootlang",now,"xmldoc",*ip,newrl,rdbId);
-
 	//
 	// add "site" tag
 	//
--- a/XmlDoc.h
+++ b/XmlDoc.h
@ -197,7 +197,7 @@ public:
 	
 	int8_t    m_hopCount;
 	uint8_t   m_langId;
-	uint8_t   m_rootLangId;
+	uint8_t   m_reserved6;
 	uint8_t   m_contentType;


@ -406,7 +406,6 @@ public:
 	bool getIsPageParser ( ) ;
 	class XmlDoc **getRootXmlDoc ( int32_t maxCacheAge = 0 ) ;
 	char **getOldTitleRec ( );
-	uint8_t *getRootLangId ();
 	char **getRootTitleRec ( ) ;
 	int64_t *getAvailDocIdOnly ( int64_t preferredDocId ) ;
 	int64_t *getDocId ( ) ;
@ -731,7 +730,6 @@ public:
 	char     m_charsetValid;
 	char     m_langVectorValid;
 	char     m_langIdValid;
-	char     m_rootLangIdValid;
 	char     m_datedbDateValid;
 	char     m_isRSSValid;
 	char     m_isSiteMapValid;