Remove useTimeAxis feature

Author: Ai Lin Chia
Date: 2017-12-11 12:25:12 +01:00
parent aa48008d11
commit bd5fe9397c
7 changed files with 7 additions and 90 deletions

@@ -998,7 +998,6 @@ CollectionRec::CollectionRec() {
m_dedupingEnabled = false;
m_dupCheckWWW = false;
m_useSimplifiedRedirects = false;
-m_useTimeAxis = false;
m_oneVotePerIpDom = false;
m_doUrlSpamCheck = false;
m_doLinkSpamCheck = false;

@@ -219,7 +219,6 @@ public:
bool m_dedupingEnabled ; // dedup content on same hostname
bool m_dupCheckWWW ;
bool m_useSimplifiedRedirects ;
-bool m_useTimeAxis ;
bool m_oneVotePerIpDom ;
bool m_doUrlSpamCheck ; //filter urls w/ naughty hostnames
bool m_doLinkSpamCheck ; //filters dynamically generated pages

@@ -7698,19 +7698,6 @@ void Parms::init ( ) {
m->m_flags = PF_CLONE;
m++;
-m->m_title = "use time axis";
-m->m_desc = "If this is true Gigablast will index the same "
-"url multiple times if its content varies over time, "
-"rather than overwriting the older version in the index. "
-"Useful for archive web pages as they change over time.";
-m->m_cgi = "usetimeaxis";
-simple_m_set(CollectionRec,m_useTimeAxis);
-m->m_def = "0";
-m->m_page = PAGE_SPIDER;
-m->m_flags = PF_CLONE;
-m++;
m->m_title = "daily merge time";
m->m_desc = "Do a tight merge on posdb and titledb at this time "
"every day. This is expressed in MINUTES past midnight UTC. "

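For context on the setting removed above: it was a per-collection boolean ("usetimeaxis", default off, shown on the spider page and cloned with the collection). The sketch below shows how that flag was resolved at spider time, mirroring the getUseTimeAxis() hunk further down; CollectionRec and the g_collectiondb lookup are reduced to stubs, so this is an illustration of the control flow, not the engine's code.

// Sketch only: stand-ins for CollectionRec and the g_collectiondb lookup,
// isolating the flag-resolution logic of the removed XmlDoc::getUseTimeAxis().
#include <cstdio>

struct CollectionRec { bool m_useTimeAxis = false; };

static CollectionRec *getCollRecStub(int /*collnum*/) {
    static CollectionRec rec;              // stand-in for g_collectiondb.getRec()
    return &rec;
}

// A doc rebuilt from a stored titlerec keeps the flag recorded in its
// titlerec header; otherwise the per-collection "usetimeaxis" setting applies.
static bool getUseTimeAxisSketch(bool setFromTitleRec, bool titleRecFlag,
                                 int collnum) {
    if (setFromTitleRec)
        return titleRecFlag;
    CollectionRec *cr = getCollRecStub(collnum);
    return cr && cr->m_useTimeAxis;
}

int main() {
    std::printf("collection default : %d\n", (int)getUseTimeAxisSketch(false, false, 0));
    std::printf("from titlerec flag : %d\n", (int)getUseTimeAxisSketch(true, true, 0));
    return 0;
}

Because the flag also lived in the titlerec header bitfield (see the XmlDoc.h hunk below), this suggests a document already in the index kept the keying it was stored with even if the collection setting changed later.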
@@ -159,9 +159,6 @@ void XmlDoc::reset ( ) {
m_mySiteLinkInfoBuf.purge();
m_myPageLinkInfoBuf.purge();
-// we need to reset this to false
-m_useTimeAxis = false;
m_loaded = false;
m_indexedDoc = false;
@@ -5333,11 +5330,6 @@ int64_t XmlDoc::getFirstUrlHash48() {
if ( m_firstUrlHash48Valid ) return m_firstUrlHash48;
// this must work
if ( ! m_firstUrlValid ) { g_process.shutdownAbort(true); }
-if ( getUseTimeAxis() ) {
-m_firstUrlHash48 = hash64b ( getTimeAxisUrl()->getBufStart() ) & 0x0000ffffffffffffLL;
-m_firstUrlHash48Valid = true;
-return m_firstUrlHash48;
-}
m_firstUrlHash48 = hash64b ( m_firstUrl.getUrl() ) & 0x0000ffffffffffffLL;
m_firstUrlHash48Valid = true;
@@ -5349,12 +5341,6 @@ int64_t XmlDoc::getFirstUrlHash64() {
// this must work
if ( ! m_firstUrlValid ) { g_process.shutdownAbort(true); }
-if ( getUseTimeAxis() ) {
-m_firstUrlHash64 = hash64b ( getTimeAxisUrl()->getBufStart() );
-m_firstUrlHash64Valid = true;
-return m_firstUrlHash64;
-}
m_firstUrlHash64 = hash64b ( m_firstUrl.getUrl() );
m_firstUrlHash64Valid = true;
return m_firstUrlHash64;
@@ -6230,16 +6216,6 @@ XmlDoc **XmlDoc::getRootXmlDoc ( int32_t maxCacheAge ) {
return &m_rootDoc;
}
-SafeBuf *XmlDoc::getTimeAxisUrl ( ) {
-if ( m_timeAxisUrlValid ) return &m_timeAxisUrl;
-if ( m_setFromDocId ) return &m_timeAxisUrl;
-m_timeAxisUrlValid = true;
-Url *fu = getFirstUrl();
-m_timeAxisUrl.reset();
-m_timeAxisUrl.safePrintf("%s.%u",fu->getUrl(),m_contentHash32);
-return &m_timeAxisUrl;
-}
// . look up TitleRec using Msg22 if we need to
// . set our m_titleRec member from titledb
// . the twin brother of XmlDoc::getTitleRecBuf() which makes the title rec
@@ -6313,15 +6289,6 @@ char **XmlDoc::getOldTitleRec() {
return NULL;
}
-// if using time axis then append the timestamp to the end of
-// the url. this way Msg22::getAvailDocId() will return a docid
-// based on that so we don't collide with other instances of this
-// same url.
-if ( u && getUseTimeAxis() ) { // g_conf.m_useTimeAxis ) {
-SafeBuf *tau = getTimeAxisUrl();
-u = tau->getBufStart();
-}
// the title must be local since we're spidering it
if ( ! m_msg22a.getTitleRec ( &m_msg22Request ,
u ,
@@ -6448,13 +6415,12 @@ int64_t *XmlDoc::getDocId ( ) {
}
// ensure it is within probable range
-if ( ! getUseTimeAxis () ) {
-char *u = getFirstUrl()->getUrl();
-int64_t pd = Titledb::getProbableDocId(u);
-int64_t d1 = Titledb::getFirstProbableDocId ( pd );
-int64_t d2 = Titledb::getLastProbableDocId ( pd );
-if ( m_docId < d1 || m_docId > d2 ) {
-g_process.shutdownAbort(true); }
+char *u = getFirstUrl()->getUrl();
+int64_t pd = Titledb::getProbableDocId(u);
+int64_t d1 = Titledb::getFirstProbableDocId ( pd );
+int64_t d2 = Titledb::getLastProbableDocId ( pd );
+if ( m_docId < d1 || m_docId > d2 ) {
+g_process.shutdownAbort(true);
+}
m_docIdValid = true;
@@ -11543,7 +11509,6 @@ void XmlDoc::logIt (SafeBuf *bb ) {
sb->safePrintf("probdocid=%" PRIu64" ",pd);
sb->safePrintf("probdocidmin=%" PRIu64" ",d1);
sb->safePrintf("probdocidmax=%" PRIu64" ",d2);
-sb->safePrintf("usetimeaxis=%i ",(int)m_useTimeAxis);
if ( m_siteNumInlinksValid ) {
sb->safePrintf("siteinlinks=%04" PRId32" ",m_siteNumInlinks );
@@ -17388,7 +17353,6 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
"<tr><td>content type</td><td>%" PRId32" (%s)</td></tr>\n"
"<tr><td>language</td><td>%" PRId32" (%s)</td></tr>\n"
"<tr><td>country</td><td>%" PRId32" (%s)</td></tr>\n"
-"<tr><td>time axis used</td><td>%" PRId32"</td></tr>\n"
"<tr><td>metadata</td><td>%s</td></tr>\n"
"</td></tr>\n",
@@ -17432,7 +17396,6 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
(int32_t)m_countryId,
g_countryCode.getName(m_countryId),
-m_useTimeAxis,
"");
if ( info1 ) {

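For context on the mechanism removed from XmlDoc.cpp above: getTimeAxisUrl() built a "<url>.<contentHash32>" variant of the first url, and getFirstUrlHash48()/getFirstUrlHash64() hashed that variant instead of the plain url, so Msg22::getAvailDocId() handed out a separate docid for each content version rather than overwriting the older one. Below is a minimal, self-contained sketch of that keying; std::hash stands in for the engine's hash64b() (an assumption, only the shape of the computation is taken from the removed lines).

// Sketch of the removed time-axis keying; hash64b() is replaced by std::hash.
#include <cstdint>
#include <cstdio>
#include <functional>
#include <string>

// Stand-in for hash64b(); the real engine uses its own 64-bit string hash.
static uint64_t hash64b_stub(const std::string &s) {
    return std::hash<std::string>{}(s);
}

// With the time axis on, the key was "<url>.<contentHash32>", so each distinct
// content version produced a different 48-bit url hash (and hence could be
// assigned its own docid). With it off, the plain first url is always the key.
static uint64_t firstUrlHash48(const std::string &url,
                               uint32_t contentHash32,
                               bool useTimeAxis) {
    std::string key = url;
    if (useTimeAxis)
        key += "." + std::to_string(contentHash32);    // "<url>.<contentHash32>"
    return hash64b_stub(key) & 0x0000ffffffffffffULL;  // keep the low 48 bits
}

int main() {
    const std::string url = "http://example.com/page.html";
    std::printf("no time axis : %012llx\n",
                (unsigned long long)firstUrlHash48(url, 0, false));
    std::printf("version 1    : %012llx\n",
                (unsigned long long)firstUrlHash48(url, 0x1111aaaau, true));
    std::printf("version 2    : %012llx\n",
                (unsigned long long)firstUrlHash48(url, 0x2222bbbbu, true));
    return 0;
}

With the feature gone, the plain first url is always the key, which is why the probable-docid range check in getDocId() above becomes unconditional.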
@@ -197,7 +197,7 @@ public:
uint16_t m_reserved800:1;
uint16_t m_reserved801:1;
uint16_t m_reserved802:1;
-uint16_t m_useTimeAxis:1;
+uint16_t m_reserved803:1;
uint16_t m_reserved805:1;
uint16_t m_reserved806:1;
uint16_t m_reserved807:1;
@@ -497,8 +497,6 @@ public:
bool hashContentType ( class HashTableX *table ) ;
bool hashLinks ( class HashTableX *table ) ;
-bool getUseTimeAxis ( ) ;
-SafeBuf *getTimeAxisUrl ( );
bool hashUrl ( class HashTableX *table, bool urlOnly );
bool hashDateNumbers ( class HashTableX *tt );
bool hashIncomingLinkText(HashTableX *table);
@@ -655,8 +653,6 @@ public:
char m_logLangId;
int32_t m_logSiteNumInlinks;
-SafeBuf m_timeAxisUrl;
bool isFirstUrlRobotsTxt();
bool m_isRobotsTxtUrl;
@@ -688,8 +684,6 @@ public:
bool m_siteValid;
bool m_startTimeValid;
bool m_currentUrlValid;
-bool m_useTimeAxisValid;
-bool m_timeAxisUrlValid;
bool m_firstUrlValid;
bool m_firstUrlHash48Valid;
bool m_firstUrlHash64Valid;

@@ -886,20 +886,6 @@ bool XmlDoc::hashLinksForLinkdb ( HashTableX *dt ) {
return true;
}
-bool XmlDoc::getUseTimeAxis ( ) {
-if ( m_useTimeAxisValid )
-return m_useTimeAxis;
-if ( m_setFromTitleRec )
-// return from titlerec header
-return m_useTimeAxis;
-CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
-if ( ! cr ) return false;
-m_useTimeAxis = cr->m_useTimeAxis;
-m_useTimeAxisValid = true;
-return m_useTimeAxis;
-}
// . returns false and sets g_errno on error
// . copied Url2.cpp into here basically, so we can now dump Url2.cpp
bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc ) {
@@ -937,12 +923,6 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
gbshutdownLogicError();
}
-if ( getUseTimeAxis() ) {
-hi.m_prefix = "gbtimeurl";
-SafeBuf *tau = getTimeAxisUrl();
-hashSingleTerm ( tau->getBufStart(),tau->length(),&hi);
-}
char *s = fu->getUrl();
int32_t slen = fu->getUrlLen();

@@ -2549,8 +2549,6 @@ void dumpTitledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool
"cs=%04d "
"lang=%02d "
"sni=%03" PRId32" "
-"usetimeaxis=%i "
//"cats=%" PRId32" "
"lastspidered=%s "
"ip=%s "
"numLinkTexts=%04" PRId32" "
@@ -2574,7 +2572,6 @@ void dumpTitledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool
xd->m_charset,//tr.getCharset(),
xd->m_langId,//tr.getLanguage(),
(int32_t)xd->m_siteNumInlinks,//tr.getDo
-xd->m_useTimeAxis,
//nc,
ppp,
iptoa(xd->m_ip,ipbuf2),
@@ -2628,7 +2625,6 @@ void dumpTitledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool
"ctype=%s "
"lang=%02d "
"sni=%03" PRId32" "
-"usetimeaxis=%i "
"lastspidered=%s "
"ip=%s "
"numLinkTexts=%04" PRId32" "
@@ -2648,7 +2644,6 @@ void dumpTitledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool
g_contentTypeStrings[xd->m_contentType],
xd->m_langId,//tr.getLanguage(),
(int32_t)xd->m_siteNumInlinks,//tr.getDocQuality(),
-xd->m_useTimeAxis,
ppp,
iptoa(xd->m_ip,ipbuf2),
info->getNumGoodInlinks(),