forked from Mirrors/privacore-open-source-search-engine
Remove useTimeAxis feature
This commit is contained in:
@ -998,7 +998,6 @@ CollectionRec::CollectionRec() {
|
||||
m_dedupingEnabled = false;
|
||||
m_dupCheckWWW = false;
|
||||
m_useSimplifiedRedirects = false;
|
||||
m_useTimeAxis = false;
|
||||
m_oneVotePerIpDom = false;
|
||||
m_doUrlSpamCheck = false;
|
||||
m_doLinkSpamCheck = false;
|
||||
|
@ -219,7 +219,6 @@ public:
|
||||
bool m_dedupingEnabled ; // dedup content on same hostname
|
||||
bool m_dupCheckWWW ;
|
||||
bool m_useSimplifiedRedirects ;
|
||||
bool m_useTimeAxis ;
|
||||
bool m_oneVotePerIpDom ;
|
||||
bool m_doUrlSpamCheck ; //filter urls w/ naughty hostnames
|
||||
bool m_doLinkSpamCheck ; //filters dynamically generated pages
|
||||
|
13
Parms.cpp
13
Parms.cpp
@ -7698,19 +7698,6 @@ void Parms::init ( ) {
|
||||
m->m_flags = PF_CLONE;
|
||||
m++;
|
||||
|
||||
|
||||
m->m_title = "use time axis";
|
||||
m->m_desc = "If this is true Gigablast will index the same "
|
||||
"url multiple times if its content varies over time, "
|
||||
"rather than overwriting the older version in the index. "
|
||||
"Useful for archive web pages as they change over time.";
|
||||
m->m_cgi = "usetimeaxis";
|
||||
simple_m_set(CollectionRec,m_useTimeAxis);
|
||||
m->m_def = "0";
|
||||
m->m_page = PAGE_SPIDER;
|
||||
m->m_flags = PF_CLONE;
|
||||
m++;
|
||||
|
||||
m->m_title = "daily merge time";
|
||||
m->m_desc = "Do a tight merge on posdb and titledb at this time "
|
||||
"every day. This is expressed in MINUTES past midnight UTC. "
|
||||
|
49
XmlDoc.cpp
49
XmlDoc.cpp
@ -159,9 +159,6 @@ void XmlDoc::reset ( ) {
|
||||
m_mySiteLinkInfoBuf.purge();
|
||||
m_myPageLinkInfoBuf.purge();
|
||||
|
||||
// we need to reset this to false
|
||||
m_useTimeAxis = false;
|
||||
|
||||
m_loaded = false;
|
||||
|
||||
m_indexedDoc = false;
|
||||
@ -5333,11 +5330,6 @@ int64_t XmlDoc::getFirstUrlHash48() {
|
||||
if ( m_firstUrlHash48Valid ) return m_firstUrlHash48;
|
||||
// this must work
|
||||
if ( ! m_firstUrlValid ) { g_process.shutdownAbort(true); }
|
||||
if ( getUseTimeAxis() ) {
|
||||
m_firstUrlHash48 = hash64b ( getTimeAxisUrl()->getBufStart() ) & 0x0000ffffffffffffLL;
|
||||
m_firstUrlHash48Valid = true;
|
||||
return m_firstUrlHash48;
|
||||
}
|
||||
|
||||
m_firstUrlHash48 = hash64b ( m_firstUrl.getUrl() ) & 0x0000ffffffffffffLL;
|
||||
m_firstUrlHash48Valid = true;
|
||||
@ -5349,12 +5341,6 @@ int64_t XmlDoc::getFirstUrlHash64() {
|
||||
// this must work
|
||||
if ( ! m_firstUrlValid ) { g_process.shutdownAbort(true); }
|
||||
|
||||
if ( getUseTimeAxis() ) {
|
||||
m_firstUrlHash64 = hash64b ( getTimeAxisUrl()->getBufStart() );
|
||||
m_firstUrlHash64Valid = true;
|
||||
return m_firstUrlHash64;
|
||||
}
|
||||
|
||||
m_firstUrlHash64 = hash64b ( m_firstUrl.getUrl() );
|
||||
m_firstUrlHash64Valid = true;
|
||||
return m_firstUrlHash64;
|
||||
@ -6230,16 +6216,6 @@ XmlDoc **XmlDoc::getRootXmlDoc ( int32_t maxCacheAge ) {
|
||||
return &m_rootDoc;
|
||||
}
|
||||
|
||||
// . returns a pointer to m_timeAxisUrl, building it on first use
// . the built string is the first url followed by '.' and m_contentHash32
//   printed with %u, i.e. "<firstUrl>.<contentHash32>"
// . never returns NULL in the code shown here; it always hands back the
//   address of the member buffer
SafeBuf *XmlDoc::getTimeAxisUrl ( ) {
|
||||
// already built, return the cached buffer
if ( m_timeAxisUrlValid ) return &m_timeAxisUrl;
|
||||
// when set from a docid the buffer is returned as-is without being
// built here and without setting the valid flag
// NOTE(review): the buffer may be empty in this case - confirm callers
// can cope with that
if ( m_setFromDocId ) return &m_timeAxisUrl;
|
||||
// the valid flag is set before the buffer is actually filled in below
m_timeAxisUrlValid = true;
|
||||
Url *fu = getFirstUrl();
|
||||
m_timeAxisUrl.reset();
|
||||
// NOTE(review): m_contentHash32 is read without any visible validity
// check - confirm it is always set before this is called
m_timeAxisUrl.safePrintf("%s.%u",fu->getUrl(),m_contentHash32);
|
||||
return &m_timeAxisUrl;
|
||||
}
|
||||
|
||||
// . look up TitleRec using Msg22 if we need to
|
||||
// . set our m_titleRec member from titledb
|
||||
// . the twin brother of XmlDoc::getTitleRecBuf() which makes the title rec
|
||||
@ -6313,15 +6289,6 @@ char **XmlDoc::getOldTitleRec() {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// if using time axis then append the content hash to the end of
|
||||
// the url. this way Msg22::getAvailDocId() will return a docid
|
||||
// based on that so we don't collide with other instances of this
|
||||
// same url.
|
||||
if ( u && getUseTimeAxis() ) { // g_conf.m_useTimeAxis ) {
|
||||
SafeBuf *tau = getTimeAxisUrl();
|
||||
u = tau->getBufStart();
|
||||
}
|
||||
|
||||
// the title must be local since we're spidering it
|
||||
if ( ! m_msg22a.getTitleRec ( &m_msg22Request ,
|
||||
u ,
|
||||
@ -6448,13 +6415,12 @@ int64_t *XmlDoc::getDocId ( ) {
|
||||
}
|
||||
|
||||
// ensure it is within probable range
|
||||
if ( ! getUseTimeAxis () ) {
|
||||
char *u = getFirstUrl()->getUrl();
|
||||
int64_t pd = Titledb::getProbableDocId(u);
|
||||
int64_t d1 = Titledb::getFirstProbableDocId ( pd );
|
||||
int64_t d2 = Titledb::getLastProbableDocId ( pd );
|
||||
if ( m_docId < d1 || m_docId > d2 ) {
|
||||
g_process.shutdownAbort(true); }
|
||||
char *u = getFirstUrl()->getUrl();
|
||||
int64_t pd = Titledb::getProbableDocId(u);
|
||||
int64_t d1 = Titledb::getFirstProbableDocId ( pd );
|
||||
int64_t d2 = Titledb::getLastProbableDocId ( pd );
|
||||
if ( m_docId < d1 || m_docId > d2 ) {
|
||||
g_process.shutdownAbort(true);
|
||||
}
|
||||
|
||||
m_docIdValid = true;
|
||||
@ -11543,7 +11509,6 @@ void XmlDoc::logIt (SafeBuf *bb ) {
|
||||
sb->safePrintf("probdocid=%" PRIu64" ",pd);
|
||||
sb->safePrintf("probdocidmin=%" PRIu64" ",d1);
|
||||
sb->safePrintf("probdocidmax=%" PRIu64" ",d2);
|
||||
sb->safePrintf("usetimeaxis=%i ",(int)m_useTimeAxis);
|
||||
|
||||
if ( m_siteNumInlinksValid ) {
|
||||
sb->safePrintf("siteinlinks=%04" PRId32" ",m_siteNumInlinks );
|
||||
@ -17388,7 +17353,6 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
|
||||
"<tr><td>content type</td><td>%" PRId32" (%s)</td></tr>\n"
|
||||
"<tr><td>language</td><td>%" PRId32" (%s)</td></tr>\n"
|
||||
"<tr><td>country</td><td>%" PRId32" (%s)</td></tr>\n"
|
||||
"<tr><td>time axis used</td><td>%" PRId32"</td></tr>\n"
|
||||
"<tr><td>metadata</td><td>%s</td></tr>\n"
|
||||
"</td></tr>\n",
|
||||
|
||||
@ -17432,7 +17396,6 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
|
||||
|
||||
(int32_t)m_countryId,
|
||||
g_countryCode.getName(m_countryId),
|
||||
m_useTimeAxis,
|
||||
"");
|
||||
|
||||
if ( info1 ) {
|
||||
|
8
XmlDoc.h
8
XmlDoc.h
@ -197,7 +197,7 @@ public:
|
||||
uint16_t m_reserved800:1;
|
||||
uint16_t m_reserved801:1;
|
||||
uint16_t m_reserved802:1;
|
||||
uint16_t m_useTimeAxis:1;
|
||||
uint16_t m_reserved803:1;
|
||||
uint16_t m_reserved805:1;
|
||||
uint16_t m_reserved806:1;
|
||||
uint16_t m_reserved807:1;
|
||||
@ -497,8 +497,6 @@ public:
|
||||
bool hashContentType ( class HashTableX *table ) ;
|
||||
|
||||
bool hashLinks ( class HashTableX *table ) ;
|
||||
bool getUseTimeAxis ( ) ;
|
||||
SafeBuf *getTimeAxisUrl ( );
|
||||
bool hashUrl ( class HashTableX *table, bool urlOnly );
|
||||
bool hashDateNumbers ( class HashTableX *tt );
|
||||
bool hashIncomingLinkText(HashTableX *table);
|
||||
@ -655,8 +653,6 @@ public:
|
||||
char m_logLangId;
|
||||
int32_t m_logSiteNumInlinks;
|
||||
|
||||
SafeBuf m_timeAxisUrl;
|
||||
|
||||
bool isFirstUrlRobotsTxt();
|
||||
bool m_isRobotsTxtUrl;
|
||||
|
||||
@ -688,8 +684,6 @@ public:
|
||||
bool m_siteValid;
|
||||
bool m_startTimeValid;
|
||||
bool m_currentUrlValid;
|
||||
bool m_useTimeAxisValid;
|
||||
bool m_timeAxisUrlValid;
|
||||
bool m_firstUrlValid;
|
||||
bool m_firstUrlHash48Valid;
|
||||
bool m_firstUrlHash64Valid;
|
||||
|
@ -886,20 +886,6 @@ bool XmlDoc::hashLinksForLinkdb ( HashTableX *dt ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// . returns whether the time axis feature applies to this document
// . the value comes from m_useTimeAxis, which is either already cached,
//   already present because the doc was set from a title rec, or copied
//   from the collection record looked up by m_collnum
// . returns false if the collection record can not be found
bool XmlDoc::getUseTimeAxis ( ) {
|
||||
// use the cached value if we already resolved it
if ( m_useTimeAxisValid )
|
||||
return m_useTimeAxis;
|
||||
// set from a title rec: use the stored bit, the valid flag is not
// touched on this path
if ( m_setFromTitleRec )
|
||||
// return from titlerec header
|
||||
return m_useTimeAxis;
|
||||
// otherwise fall back to the per-collection setting
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
|
||||
// no collection record: report false without caching, so the lookup
// is repeated on the next call
if ( ! cr ) return false;
|
||||
m_useTimeAxis = cr->m_useTimeAxis;
|
||||
m_useTimeAxisValid = true;
|
||||
return m_useTimeAxis;
|
||||
}
|
||||
|
||||
|
||||
// . returns false and sets g_errno on error
|
||||
// . copied Url2.cpp into here basically, so we can now dump Url2.cpp
|
||||
bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc ) {
|
||||
@ -937,12 +923,6 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
|
||||
gbshutdownLogicError();
|
||||
}
|
||||
|
||||
if ( getUseTimeAxis() ) {
|
||||
hi.m_prefix = "gbtimeurl";
|
||||
SafeBuf *tau = getTimeAxisUrl();
|
||||
hashSingleTerm ( tau->getBufStart(),tau->length(),&hi);
|
||||
}
|
||||
|
||||
char *s = fu->getUrl();
|
||||
int32_t slen = fu->getUrlLen();
|
||||
|
||||
|
5
main.cpp
5
main.cpp
@ -2549,8 +2549,6 @@ void dumpTitledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool
|
||||
"cs=%04d "
|
||||
"lang=%02d "
|
||||
"sni=%03" PRId32" "
|
||||
"usetimeaxis=%i "
|
||||
//"cats=%" PRId32" "
|
||||
"lastspidered=%s "
|
||||
"ip=%s "
|
||||
"numLinkTexts=%04" PRId32" "
|
||||
@ -2574,7 +2572,6 @@ void dumpTitledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool
|
||||
xd->m_charset,//tr.getCharset(),
|
||||
xd->m_langId,//tr.getLanguage(),
|
||||
(int32_t)xd->m_siteNumInlinks,//tr.getDo
|
||||
xd->m_useTimeAxis,
|
||||
//nc,
|
||||
ppp,
|
||||
iptoa(xd->m_ip,ipbuf2),
|
||||
@ -2628,7 +2625,6 @@ void dumpTitledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool
|
||||
"ctype=%s "
|
||||
"lang=%02d "
|
||||
"sni=%03" PRId32" "
|
||||
"usetimeaxis=%i "
|
||||
"lastspidered=%s "
|
||||
"ip=%s "
|
||||
"numLinkTexts=%04" PRId32" "
|
||||
@ -2648,7 +2644,6 @@ void dumpTitledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool
|
||||
g_contentTypeStrings[xd->m_contentType],
|
||||
xd->m_langId,//tr.getLanguage(),
|
||||
(int32_t)xd->m_siteNumInlinks,//tr.getDocQuality(),
|
||||
xd->m_useTimeAxis,
|
||||
ppp,
|
||||
iptoa(xd->m_ip,ipbuf2),
|
||||
info->getNumGoodInlinks(),
|
||||
|
Reference in New Issue
Block a user