Remove useTimeAxis feature

Author: Ai Lin Chia
Date: 2017-12-11 12:25:12 +01:00
parent aa48008d11
commit bd5fe9397c
7 changed files with 7 additions and 90 deletions

@@ -998,7 +998,6 @@ CollectionRec::CollectionRec() {
m_dedupingEnabled = false;
m_dupCheckWWW = false;
m_useSimplifiedRedirects = false;
-m_useTimeAxis = false;
m_oneVotePerIpDom = false;
m_doUrlSpamCheck = false;
m_doLinkSpamCheck = false;

@@ -219,7 +219,6 @@ public:
bool m_dedupingEnabled ; // dedup content on same hostname
bool m_dupCheckWWW ;
bool m_useSimplifiedRedirects ;
-bool m_useTimeAxis ;
bool m_oneVotePerIpDom ;
bool m_doUrlSpamCheck ; //filter urls w/ naughty hostnames
bool m_doLinkSpamCheck ; //filters dynamically generated pages

@@ -7698,19 +7698,6 @@ void Parms::init ( ) {
m->m_flags = PF_CLONE;
m++;
-m->m_title = "use time axis";
-m->m_desc = "If this is true Gigablast will index the same "
-"url multiple times if its content varies over time, "
-"rather than overwriting the older version in the index. "
-"Useful for archive web pages as they change over time.";
-m->m_cgi = "usetimeaxis";
-simple_m_set(CollectionRec,m_useTimeAxis);
-m->m_def = "0";
-m->m_page = PAGE_SPIDER;
-m->m_flags = PF_CLONE;
-m++;
m->m_title = "daily merge time";
m->m_desc = "Do a tight merge on posdb and titledb at this time "
"every day. This is expressed in MINUTES past midnight UTC. "

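For context on the setting removed above: it was a per-collection boolean ("usetimeaxis", default off, shown on the spider page and cloned with the collection). The sketch below shows how that flag was resolved at spider time, mirroring the getUseTimeAxis() hunk further down; CollectionRec and the g_collectiondb lookup are reduced to stubs, so this is an illustration of the control flow, not the engine's code.

// Sketch only: stand-ins for CollectionRec and the g_collectiondb lookup,
// isolating the flag-resolution logic of the removed XmlDoc::getUseTimeAxis().
#include <cstdio>

struct CollectionRec { bool m_useTimeAxis = false; };

static CollectionRec *getCollRecStub(int /*collnum*/) {
    static CollectionRec rec;              // stand-in for g_collectiondb.getRec()
    return &rec;
}

// A doc rebuilt from a stored titlerec keeps the flag recorded in its
// titlerec header; otherwise the per-collection "usetimeaxis" setting applies.
static bool getUseTimeAxisSketch(bool setFromTitleRec, bool titleRecFlag,
                                 int collnum) {
    if (setFromTitleRec)
        return titleRecFlag;
    CollectionRec *cr = getCollRecStub(collnum);
    return cr && cr->m_useTimeAxis;
}

int main() {
    std::printf("collection default : %d\n", (int)getUseTimeAxisSketch(false, false, 0));
    std::printf("from titlerec flag : %d\n", (int)getUseTimeAxisSketch(true, true, 0));
    return 0;
}

Because the flag also lived in the titlerec header bitfield (see the XmlDoc.h hunk below), this suggests a document already in the index kept the keying it was stored with even if the collection setting changed later.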
@@ -159,9 +159,6 @@ void XmlDoc::reset ( ) {
m_mySiteLinkInfoBuf.purge();
m_myPageLinkInfoBuf.purge();
-// we need to reset this to false
-m_useTimeAxis = false;
m_loaded = false;
m_indexedDoc = false;
@@ -5333,11 +5330,6 @@ int64_t XmlDoc::getFirstUrlHash48() {
if ( m_firstUrlHash48Valid ) return m_firstUrlHash48;
// this must work
if ( ! m_firstUrlValid ) { g_process.shutdownAbort(true); }
-if ( getUseTimeAxis() ) {
-m_firstUrlHash48 = hash64b ( getTimeAxisUrl()->getBufStart() ) & 0x0000ffffffffffffLL;
-m_firstUrlHash48Valid = true;
-return m_firstUrlHash48;
-}
m_firstUrlHash48 = hash64b ( m_firstUrl.getUrl() ) & 0x0000ffffffffffffLL;
m_firstUrlHash48Valid = true;
@@ -5349,12 +5341,6 @@ int64_t XmlDoc::getFirstUrlHash64() {
// this must work
if ( ! m_firstUrlValid ) { g_process.shutdownAbort(true); }
-if ( getUseTimeAxis() ) {
-m_firstUrlHash64 = hash64b ( getTimeAxisUrl()->getBufStart() );
-m_firstUrlHash64Valid = true;
-return m_firstUrlHash64;
-}
m_firstUrlHash64 = hash64b ( m_firstUrl.getUrl() );
m_firstUrlHash64Valid = true;
return m_firstUrlHash64;
@@ -6230,16 +6216,6 @@ XmlDoc **XmlDoc::getRootXmlDoc ( int32_t maxCacheAge ) {
return &m_rootDoc;
}
-SafeBuf *XmlDoc::getTimeAxisUrl ( ) {
-if ( m_timeAxisUrlValid ) return &m_timeAxisUrl;
-if ( m_setFromDocId ) return &m_timeAxisUrl;
-m_timeAxisUrlValid = true;
-Url *fu = getFirstUrl();
-m_timeAxisUrl.reset();
-m_timeAxisUrl.safePrintf("%s.%u",fu->getUrl(),m_contentHash32);
-return &m_timeAxisUrl;
-}
// . look up TitleRec using Msg22 if we need to
// . set our m_titleRec member from titledb
// . the twin brother of XmlDoc::getTitleRecBuf() which makes the title rec
@@ -6313,15 +6289,6 @@ char **XmlDoc::getOldTitleRec() {
return NULL;
}
-// if using time axis then append the timestamp to the end of
-// the url. this way Msg22::getAvailDocId() will return a docid
-// based on that so we don't collide with other instances of this
-// same url.
-if ( u && getUseTimeAxis() ) { // g_conf.m_useTimeAxis ) {
-SafeBuf *tau = getTimeAxisUrl();
-u = tau->getBufStart();
-}
// the title must be local since we're spidering it
if ( ! m_msg22a.getTitleRec ( &m_msg22Request ,
u ,
@@ -6448,13 +6415,12 @@ int64_t *XmlDoc::getDocId ( ) {
}
// ensure it is within probable range
-if ( ! getUseTimeAxis () ) {
-char *u = getFirstUrl()->getUrl();
-int64_t pd = Titledb::getProbableDocId(u);
-int64_t d1 = Titledb::getFirstProbableDocId ( pd );
-int64_t d2 = Titledb::getLastProbableDocId ( pd );
-if ( m_docId < d1 || m_docId > d2 ) {
-g_process.shutdownAbort(true); }
+char *u = getFirstUrl()->getUrl();
+int64_t pd = Titledb::getProbableDocId(u);
+int64_t d1 = Titledb::getFirstProbableDocId ( pd );
+int64_t d2 = Titledb::getLastProbableDocId ( pd );
+if ( m_docId < d1 || m_docId > d2 ) {
+g_process.shutdownAbort(true);
+}
m_docIdValid = true;
@@ -11543,7 +11509,6 @@ void XmlDoc::logIt (SafeBuf *bb ) {
sb->safePrintf("probdocid=%" PRIu64" ",pd);
sb->safePrintf("probdocidmin=%" PRIu64" ",d1);
sb->safePrintf("probdocidmax=%" PRIu64" ",d2);
-sb->safePrintf("usetimeaxis=%i ",(int)m_useTimeAxis);
if ( m_siteNumInlinksValid ) {
sb->safePrintf("siteinlinks=%04" PRId32" ",m_siteNumInlinks );
@@ -17388,7 +17353,6 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
"<tr><td>content type</td><td>%" PRId32" (%s)</td></tr>\n"
"<tr><td>language</td><td>%" PRId32" (%s)</td></tr>\n"
"<tr><td>country</td><td>%" PRId32" (%s)</td></tr>\n"
-"<tr><td>time axis used</td><td>%" PRId32"</td></tr>\n"
"<tr><td>metadata</td><td>%s</td></tr>\n"
"</td></tr>\n",
@@ -17432,7 +17396,6 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
(int32_t)m_countryId,
g_countryCode.getName(m_countryId),
-m_useTimeAxis,
"");
if ( info1 ) {

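For context on the mechanism removed from XmlDoc.cpp above: getTimeAxisUrl() built a "<url>.<contentHash32>" variant of the first url, and getFirstUrlHash48()/getFirstUrlHash64() hashed that variant instead of the plain url, so Msg22::getAvailDocId() handed out a separate docid for each content version rather than overwriting the older one. Below is a minimal, self-contained sketch of that keying; std::hash stands in for the engine's hash64b() (an assumption, only the shape of the computation is taken from the removed lines).

// Sketch of the removed time-axis keying; hash64b() is replaced by std::hash.
#include <cstdint>
#include <cstdio>
#include <functional>
#include <string>

// Stand-in for hash64b(); the real engine uses its own 64-bit string hash.
static uint64_t hash64b_stub(const std::string &s) {
    return std::hash<std::string>{}(s);
}

// With the time axis on, the key was "<url>.<contentHash32>", so each distinct
// content version produced a different 48-bit url hash (and hence could be
// assigned its own docid). With it off, the plain first url is always the key.
static uint64_t firstUrlHash48(const std::string &url,
                               uint32_t contentHash32,
                               bool useTimeAxis) {
    std::string key = url;
    if (useTimeAxis)
        key += "." + std::to_string(contentHash32);    // "<url>.<contentHash32>"
    return hash64b_stub(key) & 0x0000ffffffffffffULL;  // keep the low 48 bits
}

int main() {
    const std::string url = "http://example.com/page.html";
    std::printf("no time axis : %012llx\n",
                (unsigned long long)firstUrlHash48(url, 0, false));
    std::printf("version 1    : %012llx\n",
                (unsigned long long)firstUrlHash48(url, 0x1111aaaau, true));
    std::printf("version 2    : %012llx\n",
                (unsigned long long)firstUrlHash48(url, 0x2222bbbbu, true));
    return 0;
}

With the feature gone, the plain first url is always the key, which is why the probable-docid range check in getDocId() above becomes unconditional.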
@@ -197,7 +197,7 @@ public:
uint16_t m_reserved800:1;
uint16_t m_reserved801:1;
uint16_t m_reserved802:1;
-uint16_t m_useTimeAxis:1;
+uint16_t m_reserved803:1;
uint16_t m_reserved805:1;
uint16_t m_reserved806:1;
uint16_t m_reserved807:1;
@@ -497,8 +497,6 @@ public:
bool hashContentType ( class HashTableX *table ) ;
bool hashLinks ( class HashTableX *table ) ;
-bool getUseTimeAxis ( ) ;
-SafeBuf *getTimeAxisUrl ( );
bool hashUrl ( class HashTableX *table, bool urlOnly );
bool hashDateNumbers ( class HashTableX *tt );
bool hashIncomingLinkText(HashTableX *table);
@@ -655,8 +653,6 @@ public:
char m_logLangId;
int32_t m_logSiteNumInlinks;
-SafeBuf m_timeAxisUrl;
bool isFirstUrlRobotsTxt();
bool m_isRobotsTxtUrl;
@@ -688,8 +684,6 @@ public:
bool m_siteValid;
bool m_startTimeValid;
bool m_currentUrlValid;
-bool m_useTimeAxisValid;
-bool m_timeAxisUrlValid;
bool m_firstUrlValid;
bool m_firstUrlHash48Valid;
bool m_firstUrlHash64Valid;

@@ -886,20 +886,6 @@ bool XmlDoc::hashLinksForLinkdb ( HashTableX *dt ) {
return true;
}
-bool XmlDoc::getUseTimeAxis ( ) {
-if ( m_useTimeAxisValid )
-return m_useTimeAxis;
-if ( m_setFromTitleRec )
-// return from titlerec header
-return m_useTimeAxis;
-CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
-if ( ! cr ) return false;
-m_useTimeAxis = cr->m_useTimeAxis;
-m_useTimeAxisValid = true;
-return m_useTimeAxis;
-}
// . returns false and sets g_errno on error
// . copied Url2.cpp into here basically, so we can now dump Url2.cpp
bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc ) {
@@ -937,12 +923,6 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
gbshutdownLogicError();
}
-if ( getUseTimeAxis() ) {
-hi.m_prefix = "gbtimeurl";
-SafeBuf *tau = getTimeAxisUrl();
-hashSingleTerm ( tau->getBufStart(),tau->length(),&hi);
-}
char *s = fu->getUrl();
int32_t slen = fu->getUrlLen();

@@ -2549,8 +2549,6 @@ void dumpTitledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool
"cs=%04d "
"lang=%02d "
"sni=%03" PRId32" "
-"usetimeaxis=%i "
//"cats=%" PRId32" "
"lastspidered=%s "
"ip=%s "
"numLinkTexts=%04" PRId32" "
@@ -2574,7 +2572,6 @@ void dumpTitledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool
xd->m_charset,//tr.getCharset(),
xd->m_langId,//tr.getLanguage(),
(int32_t)xd->m_siteNumInlinks,//tr.getDo
-xd->m_useTimeAxis,
//nc,
ppp,
iptoa(xd->m_ip,ipbuf2),
@@ -2628,7 +2625,6 @@ void dumpTitledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool
"ctype=%s "
"lang=%02d "
"sni=%03" PRId32" "
-"usetimeaxis=%i "
"lastspidered=%s "
"ip=%s "
"numLinkTexts=%04" PRId32" "
@@ -2648,7 +2644,6 @@ void dumpTitledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool
g_contentTypeStrings[xd->m_contentType],
xd->m_langId,//tr.getLanguage(),
(int32_t)xd->m_siteNumInlinks,//tr.getDocQuality(),
-xd->m_useTimeAxis,
ppp,
iptoa(xd->m_ip,ipbuf2),
info->getNumGoodInlinks(),