Remove unused variables/functions from XmlDoc

Ai Lin Chia
2016-03-02 22:22:22 +01:00
parent da3e30490b
commit c46a4c5841
5 changed files with 17 additions and 1021 deletions

@@ -312,20 +312,6 @@ bool processLoop ( void *state ) {
return status;
}
/*
// this was calling XmlDoc and setting sections, etc. to
// get the SpiderReply junk... no no no
// is it banned or filtered? this ignores the TagRec in the titleRec
// and uses msg8a to get it fresh instead
char *vi = xd->getIsFiltered();//Visible( );
// wait if blocked
if ( vi == (void *)-1 ) return false;
// error?
if ( ! vi ) return sendErrorReply ( st , g_errno );
// banned?
if ( ! st->m_isMasterAdmin && ! *vi ) return sendErrorReply (st,EDOCBANNED);
*/
// get the utf8 content
char **utf8 = xd->getUtf8Content();
//int32_t len = xd->size_utf8Content - 1;

@@ -567,12 +567,6 @@ class SpiderRequest {
//int32_t m_reservedc1;
int32_t m_reservedc2;
//int32_t m_parentPubDate;
// . pub date taken from url directly, not content
// . ie. http://mysite.com/blog/nov-06-2009/food.html
// . ie. http://mysite.com/blog/11062009/food.html
//int32_t m_urlPubDate;
// . replace this with something we need for smart compression
// . this is zero if none or invalid
int32_t m_contentHash32;

@@ -100,7 +100,6 @@ XmlDoc::XmlDoc() {
m_wasInIndex = false;
m_outlinkHopCountVector = NULL;
m_extraDoc = NULL;
m_wikiqbuf = NULL;
reset();
}
@@ -126,16 +125,11 @@ void XmlDoc::reset ( ) {
m_printedMenu = false;
m_tmpBuf2.purge();
m_bodyStartPos = 0;
m_skipIframeExpansion = false;
m_indexedTime = 0;
m_metaList2.purge();
m_zbuf.purge();
m_kbuf.purge();
m_mySiteLinkInfoBuf.purge();
m_myPageLinkInfoBuf.purge();
@@ -155,10 +149,6 @@ void XmlDoc::reset ( ) {
m_fakeIpBuf.purge();
m_fakeTagRecPtrBuf.purge();
m_tlbufTimer = 0LL;
m_gsbuf.reset();
m_doConsistencyTesting = g_conf.m_doConsistencyTesting;
m_computedMetaListCheckSum = false;
@@ -191,9 +181,6 @@ void XmlDoc::reset ( ) {
"be saved in addsinprogress.dat.");
}
m_ei = 0;
m_lastLaunch = -1;
m_pbuf = NULL;
m_wts = NULL;
@@ -277,9 +264,6 @@ void XmlDoc::reset ( ) {
}
m_outlinkHopCountVector = NULL;
m_gsbuf.reset();
// reset all *valid* flags to false
void *p = &m_VALIDSTART;
void *pend = &m_VALIDEND;
@@ -329,7 +313,6 @@ void XmlDoc::reset ( ) {
m_setFromDocId = false;
m_setFromSpiderRec = false;
m_freeLinkInfo1 = false;
m_freeLinkInfo2 = false;
m_checkedUrlFilters = false;
@@ -351,50 +334,35 @@ void XmlDoc::reset ( ) {
// keep track of updates to the rdbs we have done, so we do not re-do
m_listAdded = false;
m_listFlushed = false;
m_updatedCounts = false;
m_updatedCounts2 = false;
m_copied1 = false;
m_updatingSiteLinkInfoTags = false;
m_hashedTitle = false;
m_registeredSleepCallback = false;
m_addedNegativeDoledbRec = false;
m_numRedirects = 0;
m_numOutlinksAdded = 0;
m_spamCheckDisabled = false;
m_useRobotsTxt = true;
m_redirectFlag = false;
m_allowSimplifiedRedirs = false;
m_didDelay = false;
m_didDelayUnregister = false;
m_calledMsg22d = 0LL;
m_calledMsg22e = false;
m_calledMsg22f = false;
m_calledMsg25 = false;
m_calledMsg25b = false;
m_calledMsg40 = false;
m_calledSections = false;
m_calledThread = false;
m_alreadyRegistered = false;
m_loaded = false;
m_firstEntry = true;
m_firstEntry2 = true;
m_launchedSpecialMsg8a = false;
m_launchedMsg8a2 = false;
m_setTr = false;
m_calledMsg8b = false;
m_recycleContent = false;
m_callback1 = NULL;
m_callback2 = NULL;
m_state = NULL;
m_processedLang = false;
m_doingConsistencyCheck = false;
@@ -431,19 +399,6 @@ void XmlDoc::reset ( ) {
void *px = &ptr_firstUrl;
void *pxend = &m_dummyEnd;
memset ( px , 0 , (char *)pxend - (char *)px );
ptr_unused6 = NULL;
size_unused6 = 0;
ptr_unused7 = NULL;
size_unused7 = 0;
ptr_unused1 = NULL;
size_unused1 = 0;
ptr_unused2 = NULL;
size_unused2 = 0;
ptr_unused3 = NULL;
size_unused3 = 0;
ptr_unused5 = NULL;
size_unused5 = 0;
}
int64_t XmlDoc::logQueryTimingStart() {
@@ -638,8 +593,6 @@ bool XmlDoc::setCollNum ( const char *coll ) {
// we can store this safely:
m_collnum = cr->m_collnum;
m_collnumValid = true;
// if user "resets" the collection we need to know
m_lastCollRecResetCount = cr->m_lastResetCount;
return true;
}
@@ -1286,10 +1239,6 @@ bool XmlDoc::set2 ( char *titleRec ,
return false;
}
// debug thing
ptr_sectiondbData = NULL;
size_sectiondbData = 0;
// success, return true then
return true;
}
@@ -2255,37 +2204,6 @@ bool XmlDoc::indexDoc2 ( ) {
m_registeredSleepCallback = false;
}
//////////
// . add the doledb negative key quickly to our tree to avoid a
// respider because the msg4 doledb negative key is buffered by msg4
// . make it negative
// . well it should not be respidered because the lock is on it!!
// -- so let's comment this out
/////////
/*
key_t negative = m_doledbKey;
// make it negative
negative.n0 &= 0xfffffffffffffffeLL;
// . store it in our tree if we can
// . returns false and sets g_errno on error
// . i.e. g_errno == ETRYAGAIN
if ( ! m_addedNegativeDoledbRec &&
! g_doledb.m_rdb.addRecord(m_coll,(char *)&negative,
NULL,0,m_niceness)){
log("build: error trying to add to doledb: %s",
mstrerror(g_errno));
// set sleep wrapper
g_loop.registerSleepCallback(1000,m_masterState,
indexDocWrapper2,m_niceness);
// note it
m_registeredSleepCallback = true;
// sleep and retry
return false;
}
*/
// we did that
m_addedNegativeDoledbRec = true;
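	// A minimal hypothetical sketch of the convention the commented-out
	// block above relies on: a doledb key is "negative" (a delete marker)
	// when the low bit of its first 64-bit word is clear, so masking with
	// 0xfffffffffffffffeLL turns a positive key into its delete twin:
	//
	//   uint64_t pos = n0 | 0x01ULL;                 // positive key word
	//   uint64_t neg = pos & 0xfffffffffffffffeLL;   // negative (delete) key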
// now add it
if ( ! m_listAdded && m_metaListSize ) {
// only call this once
@@ -5610,18 +5528,6 @@ int32_t XmlDoc::computeVector( Words *words, uint32_t *vec, int32_t start, int32
return nd * 4;
}
float *XmlDoc::getTagSimilarity ( XmlDoc *xd2 ) {
int32_t *tv1 = getTagPairHashVector();
if ( ! tv1 || tv1 == (int32_t *)-1 ) return (float *)tv1;
int32_t *tv2 = xd2->getTagPairHashVector();
if ( ! tv2 || tv2 == (int32_t *)-1 ) return (float *)tv2;
m_tagSimilarity = computeSimilarity ( tv1, tv2, NULL, NULL, NULL ,
m_niceness );
// this means error, g_errno should be set
if ( m_tagSimilarity == -1.0 ) return NULL;
return &m_tagSimilarity;
}
float *XmlDoc::getPageSimilarity ( XmlDoc *xd2 ) {
int32_t *sv1 = getPageSampleVector();
if ( ! sv1 || sv1 == (int32_t *)-1 ) return (float *)sv1;
@@ -5869,17 +5775,6 @@ bool isSimilar_sorted ( int32_t *vec0 ,
goto mergeLoop;
}
uint64_t *XmlDoc::getFuzzyDupHash ( ) {
if ( m_dupHashValid ) return &m_dupHash;
uint32_t *h1 = getTagPairHash32();
if ( ! h1 || h1 == (uint32_t *)-1 ) return (uint64_t *)h1;
m_dupHash = *h1;
m_dupHashValid = true;
return &m_dupHash;
}
int64_t *XmlDoc::getExactContentHash64 ( ) {
if ( m_exactContentHash64Valid )
@@ -6599,7 +6494,6 @@ Url **XmlDoc::getRedirUrl() {
if ( cu->getDomainLen() != dlen ) sameDom=false;
else if ( strncmp(cu->getDomain(),loc->getDomain(),dlen))sameDom=false;
if ( ! sameDom ) {
m_redirectFlag = true;
m_redirUrl.set ( loc , false ); // addWWW=false
m_redirUrlPtr = &m_redirUrl;
ptr_redirUrl = m_redirUrl.m_url;
@@ -6701,7 +6595,6 @@ Url **XmlDoc::getRedirUrl() {
return &m_redirUrlPtr;
}
// good to go
m_redirectFlag = true;
m_redirUrl.set ( loc , false ); // addWWW=false
m_redirUrlPtr = &m_redirUrl;
ptr_redirUrl = m_redirUrl.m_url;
@@ -7118,9 +7011,6 @@ XmlDoc **XmlDoc::getExtraDoc ( char *u , int32_t maxCacheAge ) {
// carry this forward always!
m_extraDoc->m_isSpiderProxy = m_isSpiderProxy;
// disable spam check because that is not necessary for this doc!
m_extraDoc->m_spamCheckDisabled = true;
// tell msg13 to get this from its robots.txt cache if it can. it also
// keeps a separate html page cache for the root pages, etc. in case
m_extraDoc->m_maxCacheAge = maxCacheAge;
@@ -12559,14 +12449,6 @@ int32_t XmlDoc::getHostHash32a ( ) {
return m_hostHash32a;
}
int32_t XmlDoc::getHostHash32b ( ) {
if ( m_hostHash32bValid ) return m_hostHash32b;
m_hostHash32bValid = true;
Url *c = getCurrentUrl();
m_hostHash32b = c->getHostHash32();
return m_hostHash32b;
}
int32_t XmlDoc::getDomHash32( ) {
if ( m_domHash32Valid ) return m_domHash32;
m_domHash32Valid = true;
@@ -13421,19 +13303,6 @@ char *XmlDoc::getSpiderLinks ( ) {
return &m_spiderLinks2;
}
// should we index the doc? if already indexed, and is filtered, we delete it
char *XmlDoc::getIsFiltered ( ) {
if ( m_isFilteredValid ) return &m_isFiltered;
int32_t *priority = getSpiderPriority();
if ( ! priority || priority == (void *)-1 ) return (char *)priority;
m_isFiltered = false;
// if ( *priority == SPIDER_PRIORITY_FILTERED ) m_isFiltered = true;
// if ( *priority == SPIDER_PRIORITY_BANNED ) m_isFiltered = true;
if ( *priority == -3 ) m_isFiltered = true;
m_isFilteredValid = true;
return &m_isFiltered;
}
int32_t *XmlDoc::getSpiderPriority ( ) {
if ( m_priorityValid ) return &m_priority;
setStatus ("getting spider priority");
@@ -14121,17 +13990,6 @@ bool XmlDoc::doConsistencyTest ( bool forceTest ) {
return true;
}
int32_t XmlDoc::printMetaList ( ) {
SafeBuf sb;
printMetaList ( m_metaList ,
m_metaList + m_metaListSize ,
&sb );
fprintf(stderr,"%s\n",sb.getBufStart());
return 0;
}
#define TABLE_ROWS 25
// print this also for page parser output!
@@ -16684,10 +16542,6 @@ void XmlDoc::copyFromOldDoc ( XmlDoc *od ) {
size_linkInfo1 = od->size_linkInfo1;
if ( ptr_linkInfo1 && size_linkInfo1 ) m_linkInfo1Valid = true;
else m_linkInfo1Valid = false;
// turn off for debug
ptr_sectiondbData = NULL;
size_sectiondbData = 0;
}
// for adding a quick reply for EFAKEIP and for diffbot query reindex requests
@@ -17726,7 +17580,6 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
ksr.m_addedTime = getSpideredTime();//m_spideredTime;
//ksr.m_lastAttempt = 0;
//ksr.m_urlPubDate = urlPubDate;
//ksr.m_errCode = 0;
ksr.m_parentHostHash32 = hostHash32a;
ksr.m_parentDomHash32 = m_domHash32;
@@ -17955,8 +17808,6 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
m_numOutlinksAdded = numAdded;
m_numOutlinksAddedValid = true;
m_numOutlinksAddedFromSameDomain = numAddedFromSameDomain;
m_numOutlinksFiltered = linksFiltered;
m_numOutlinksBanned = linksBanned;
// update end of list once we have successfully added all spider recs
m_p = p;
// return current ptr
@@ -23457,8 +23308,6 @@ char *XmlDoc::getWordSpamVec ( ) {
// fix this a bit so we're not always totally spammed
maxPercent = 25;
// assume not totally spammed
m_totallySpammed = false;
// get # of words we have to set spam for
int32_t numWords = words->getNumWords();
@@ -23670,9 +23519,7 @@ char *XmlDoc::getWordSpamVec ( ) {
// if we had < 100 candidates and < 20% spam, don't bother
//if ( percent < 5 ) goto done;
if ( percent <= maxPercent ) goto done;
// set flag so linkspam.cpp can see if all is spam and will not allow
// this page to vote
m_totallySpammed = true;
// now only set to 99 so each singleton usually gets hashed
for ( i = 0 ; i < numWords ; i++ )
if ( words->getWordId(i) && spam[i] < 99 )
@@ -24627,70 +24474,6 @@ bool XmlDoc::getIsInjecting ( ) {
return isInjecting;
}
// this is still used by Title.cpp to get the title: field quickly
char *getJSONFieldValue ( char *json , char *field , int32_t *valueLen ) {
if ( ! json ) return NULL;
// get length
int32_t fieldLen = gbstrlen(field);
// keep track of in a quote or not
bool inQuotes = false;
char *stringStart = NULL;
char *p = json;
bool gotOne = false;
int32_t depth = 0;
// scan
for ( ; *p ; p++ ) {
// escaping a quote? ignore quote then.
if ( *p == '\\' && p[1] == '\"' ) {
// skip two bytes then..
p++;
continue;
}
// count {} depth
if ( ! inQuotes ) {
if ( *p == '{' ) depth++;
if ( *p == '}' ) depth--;
}
// a quote?
if ( *p == '\"' ) {
inQuotes = ! inQuotes;
// set start of the string if quote is beginning
if ( inQuotes ) stringStart = p + 1;
// if quote is ending and a colon follows then
// it was a json field name. so if it matches the
// field we want return the following field for it.
else if ( ! inQuotes &&
! gotOne &&
p[1] == ':' &&
// {"title":"whatever",...}
// could be product:{title:... depth=2
(depth == 1 ||depth==2) &&
stringStart &&
(p - stringStart) == fieldLen &&
strncmp(field,stringStart,fieldLen)==0 ) {
// now, the next time we set stringStart
// it will be set to the VALUE of this field
// assuming the field is a STRING!!!!
gotOne = true;
// return after the quote
//return p + 2;
}
// ok, we got the string after the field string...
else if ( ! inQuotes && gotOne ) {
if ( valueLen ) *valueLen = p - stringStart;
return stringStart;
}
// keep chugging
continue;
}
}
// done, not found
return NULL;
}
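For reference, a minimal sketch of how a caller such as Title.cpp might invoke this helper; the sample JSON is invented, and the snippet assumes it is linked against the definition above:

#include <cstdio>
#include <cstdint>

// assumes linking against the getJSONFieldValue() defined above
char *getJSONFieldValue ( char *json , char *field , int32_t *valueLen );

int main ( ) {
	char json[] = "{\"title\":\"Example Page\",\"author\":\"nobody\"}";
	int32_t vlen = 0;
	char *v = getJSONFieldValue ( json , (char *)"title" , &vlen );
	// prints: title = Example Page
	if ( v ) printf ( "title = %.*s\n" , (int)vlen , v );
	return 0;
}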
Json *XmlDoc::getParsedJson ( ) {
if ( m_jpValid ) return &m_jp;

XmlDoc.h (152 changed lines)

@@ -71,10 +71,6 @@ bool setLangVec ( class Words *words ,
class Sections *sections ,
int32_t niceness ) ;
char *getJSONFieldValue ( char *json, char *field , int32_t *valueLen ) ;
bool logQueryLogs ( );
bool getDensityRanks ( int64_t *wids ,
int32_t nw,
//int32_t wordStart ,
@@ -192,7 +188,7 @@ public:
uint32_t m_spideredTime; // time_t
uint32_t m_indexedTime; // slightly > m_spideredTime (time_t)
uint32_t m_reserved32;
uint32_t reserved3; //was: m_pubDate; // aka m_datedbDate // time_t
uint32_t reserved3;
uint32_t m_firstIndexedDate; // time_t
uint32_t m_outlinksAddedDate; // time_t
@@ -206,7 +202,7 @@ public:
uint16_t m_bodyStartPos;
uint16_t m_reserved5;
uint16_t m_unused0; //was: m_diffbotJSONCount
uint16_t m_unused0;
int16_t m_httpStatus; // -1 if not found (empty http reply)
@@ -230,9 +226,9 @@ public:
uint16_t m_reserved799:1;
uint16_t m_isSiteRoot:1;
uint16_t m_reserved800:1; //was:m_isDiffbotJSONObject
uint16_t m_reserved801:1; //was:m_sentToDiffbot
uint16_t m_reserved802:1; //was:m_gotDiffbotSuccessfulReply
uint16_t m_reserved800:1;
uint16_t m_reserved801:1;
uint16_t m_reserved802:1;
uint16_t m_useTimeAxis:1; // m_reserved804:1;
uint16_t m_reserved805:1;
uint16_t m_reserved806:1;
@@ -273,7 +269,7 @@ public:
char *ptr_site;
LinkInfo *ptr_linkInfo1;
char *ptr_linkdbData;
char *ptr_sectiondbData;
char *ptr_unused14;
char *ptr_tagRecData;
LinkInfo *ptr_unused9;
@@ -296,7 +292,7 @@ public:
int32_t size_site;
int32_t size_linkInfo1;
int32_t size_linkdbData;
int32_t size_sectiondbData;
int32_t size_unused14;
int32_t size_tagRecData;
int32_t size_unused9;
@@ -395,10 +391,8 @@ public:
int32_t *getPageSampleVector ( ) ;
int32_t *getPostLinkTextVector ( int32_t linkNode ) ;
int32_t computeVector ( class Words *words, uint32_t *vec , int32_t start = 0 , int32_t end = -1 );
float *getTagSimilarity ( class XmlDoc *xd2 ) ;
float *getPageSimilarity ( class XmlDoc *xd2 ) ;
float *getPercentChanged ( );
uint64_t *getFuzzyDupHash ( );
int64_t *getExactContentHash64();
class RdbList *getDupList ( ) ;
char *getIsDup ( ) ;
@@ -471,7 +465,6 @@ public:
int32_t *getTagPairHashVector ( ) ;
uint32_t *getTagPairHash32 ( ) ;
int32_t getHostHash32a ( ) ;
int32_t getHostHash32b ( ) ;
int32_t getDomHash32 ( );
char **getThumbnailData();
class Images *getImages ( ) ;
@@ -482,7 +475,6 @@ public:
char *getIsSiteRoot ( ) ;
int8_t *getHopCount ( ) ;
char *getSpiderLinks ( ) ;
char *getIsFiltered ();
bool getIsInjecting();
int32_t *getSpiderPriority ( ) ;
int32_t *getIndexCode ( ) ;
@@ -492,7 +484,7 @@ public:
bool logIt ( class SafeBuf *bb = NULL ) ;
bool m_doConsistencyTesting;
bool doConsistencyTest ( bool forceTest ) ;
int32_t printMetaList ( ) ;
void printMetaList ( char *metaList , char *metaListEnd ,
class SafeBuf *pbuf );
bool verifyMetaList ( char *p , char *pend , bool forDelete ) ;
@@ -518,10 +510,6 @@ public:
// m_indexCode or g_errno was set!
class SpiderReply *getNewSpiderReply ( );
SpiderRequest m_redirSpiderRequest;
SpiderRequest *m_redirSpiderRequestPtr;
void setSpiderReqForMsg20 ( class SpiderRequest *sreq ,
class SpiderReply *srep );
@@ -542,7 +530,6 @@ public:
bool hashNoSplit ( class HashTableX *tt ) ;
char *hashAll ( class HashTableX *table ) ;
bool hashMetaTags ( class HashTableX *table ) ;
bool hashMetaZip ( class HashTableX *table ) ;
bool hashContentType ( class HashTableX *table ) ;
bool hashLinks ( class HashTableX *table ) ;
@@ -550,12 +537,9 @@ public:
SafeBuf *getTimeAxisUrl ( );
bool hashUrl ( class HashTableX *table, bool urlOnly );
bool hashDateNumbers ( class HashTableX *tt );
bool hashSections ( class HashTableX *table ) ;
bool hashIncomingLinkText( class HashTableX *table, bool hashAnomalies, bool hashNonAnomalies );
bool hashLinksForLinkdb ( class HashTableX *table ) ;
bool hashNeighborhoods ( class HashTableX *table ) ;
bool hashRSSInfo ( class HashTableX *table ) ;
bool hashRSSTerm ( class HashTableX *table , bool inRSS ) ;
bool hashTitle ( class HashTableX *table );
bool hashBody2 ( class HashTableX *table );
bool hashMetaKeywords ( class HashTableX *table );
@@ -564,12 +548,8 @@ public:
bool hashLanguage ( class HashTableX *table ) ;
bool hashLanguageString ( class HashTableX *table ) ;
bool hashCountry ( class HashTableX *table ) ;
bool hashSiteNumInlinks ( class HashTableX *table ) ;
bool hashCharset ( class HashTableX *table ) ;
bool hashTagRec ( class HashTableX *table ) ;
bool hashPermalink ( class HashTableX *table ) ;
bool hashVectors(class HashTableX *table ) ;
class Url *getBaseUrl ( ) ;
bool hashIsAdult ( class HashTableX *table ) ;
@@ -611,10 +591,6 @@ public:
int32_t bufLen ,
class HashInfo *hi ) ;
bool hashNumberForSortingAsFloat ( float f ,
class HashInfo *hi ,
char *gbsortByStr ) ;
bool hashNumberForSortingAsInt32 ( int32_t x,
class HashInfo *hi ,
char *gbsortByStr ) ;
@@ -678,9 +654,7 @@ public:
int64_t m_firstUrlHash64;
Url m_currentUrl;
CollectionRec *m_lastcr;
collnum_t m_collnum;
int32_t m_lastCollRecResetCount;
class CollectionRec *getCollRec ( ) ;
bool setCollNum ( const char *coll ) ;
@@ -696,8 +670,6 @@ public:
int32_t m_addedStatusDocSize;
SafeBuf m_metaList2;
SafeBuf m_zbuf;
SafeBuf m_kbuf;
// used by msg7 to store udp slot
class UdpSlot *m_injectionSlot;
@@ -719,8 +691,6 @@ public:
char m_logLangId;
int32_t m_logSiteNumInlinks;
SafeBuf m_tmpBuf2;
SafeBuf m_timeAxisUrl;
Images m_images;
@@ -767,7 +737,6 @@ public:
char m_fragBufValid;
char m_wordSpamBufValid;
char m_finalSummaryBufValid;
char m_redirSpiderRequestValid;
char m_hopCountValid;
char m_isInjectingValid;
@@ -782,7 +751,6 @@ public:
char m_datedbDateValid;
char m_isRSSValid;
char m_isSiteMapValid;
char m_spiderLinksArgValid;
char m_isContentTruncatedValid;
char m_xmlValid;
char m_linksValid;
@@ -790,10 +758,8 @@ public:
char m_bitsValid;
char m_bits2Valid;
char m_posValid;
char m_isUrlBadYearValid;
char m_phrasesValid;
char m_sectionsValid;
char m_subSentsValid;
char m_imageDataValid;
char m_imagesValid;
@@ -806,7 +772,6 @@ public:
bool m_firstIpValid;
bool m_spideredTimeValid;
bool m_indexedTimeValid;
bool m_firstIndexedValid;
bool m_isInIndexValid;
bool m_wasInIndexValid;
bool m_outlinksAddedDateValid;
@@ -828,9 +793,7 @@ public:
bool m_canonicalRedirUrlValid;
bool m_statusMsgValid;
bool m_mimeValid;
bool m_pubDateValid;
bool m_hostHash32aValid;
bool m_hostHash32bValid;
bool m_indexCodeValid;
bool m_priorityValid;
bool m_downloadStatusValid;
@@ -845,29 +808,23 @@ public:
bool m_isPermalinkValid;
bool m_isAdultValid;
bool m_urlPubDateValid;
bool m_isUrlPermalinkFormatValid;
bool m_percentChangedValid;
bool m_unchangedValid;
bool m_countTableValid;
bool m_summaryLangIdValid;
bool m_tagPairHashVecValid;
bool m_summaryVecValid;
bool m_titleVecValid;
bool m_pageSampleVecValid;
bool m_postVecValid;
bool m_dupListValid;
bool m_likedbListValid;
bool m_isDupValid;
bool m_metaDescValid;
bool m_metaSummaryValid;
bool m_metaKeywordsValid;
bool m_metaGeoPlacenameValid;
bool m_siteSpiderQuotaValid;
bool m_oldDocValid;
bool m_extraDocValid;
bool m_rootDocValid;
bool m_oldMetaListValid;
bool m_oldTitleRecValid;
bool m_rootTitleRecValid;
bool m_isIndexedValid;
@@ -881,12 +838,10 @@ public:
bool m_siteHash32Valid;
bool m_httpReplyValid;
bool m_contentTypeValid;
bool m_priorityQueueNumValid;
bool m_outlinkTagRecVectorValid;
bool m_outlinkIpVectorValid;
bool m_hasNoIndexMetaTagValid;
bool m_hasUseFakeIpsMetaTagValid;
bool m_outlinkIsIndexedVectorValid;
bool m_isSiteRootValid;
bool m_wasContentInjectedValid;
bool m_outlinkHopCountVectorValid;
@@ -906,7 +861,6 @@ public:
bool m_htbValid;
bool m_collnumValid;
bool m_summaryValid;
bool m_gsbufValid;
bool m_spiderStatusDocMetaListValid;
bool m_isCompromisedValid;
bool m_isNoArchiveValid;
@@ -914,9 +868,7 @@ public:
bool m_isLinkSpamValid;
bool m_isErrorPageValid;
bool m_isHijackedValid;
bool m_dupHashValid;
bool m_exactContentHash64Valid;
bool m_looseContentHash64Valid;
bool m_jpValid;
char m_isSiteMap;
@@ -933,43 +885,31 @@ public:
// DO NOT add validity flags below this line!
char m_VALIDEND;
bool m_printedMenu;
int32_t m_urlPubDate;
char m_isUrlPermalinkFormat;
uint8_t m_summaryLangId;
int32_t m_tagPairHashVec[MAX_TAG_PAIR_HASHES];
int32_t m_tagPairHashVecSize;
int32_t m_summaryVec [SAMPLE_VECTOR_SIZE/4];
int32_t m_summaryVecSize;
int32_t m_titleVec [SAMPLE_VECTOR_SIZE/4];
int32_t m_titleVecSize;
int32_t m_pageSampleVec[SAMPLE_VECTOR_SIZE/4];
int32_t m_pageSampleVecSize;
int32_t m_postVec[POST_VECTOR_SIZE/4];
int32_t m_postVecSize;
float m_tagSimilarity;
float m_pageSimilarity;
float m_percentChanged;
bool m_unchanged;
// what docids are similar to us? docids are in this list
RdbList m_dupList;
RdbList m_likedbList;
uint64_t m_dupHash;
int64_t m_exactContentHash64;
int64_t m_looseContentHash64;
Msg0 m_msg0;
Msg5 m_msg5;
char m_isDup;
int64_t m_docIdWeAreADupOf;
int32_t m_ei;
int32_t m_lastLaunch;
Msg22Request m_msg22Request;
Msg22Request m_msg22Requestc;
Msg22 m_msg22a;
Msg22 m_msg22b;
Msg22 m_msg22c;
Msg22 m_msg22d;
Msg22 m_msg22e;
Msg22 m_msg22f;
// these now reference directly into the html src so our
@@ -983,13 +923,10 @@ public:
char *m_metaGeoPlacename;
int32_t m_metaGeoPlacenameLen;
int32_t m_siteSpiderQuota;
class XmlDoc *m_oldDoc;
class XmlDoc *m_extraDoc;
class XmlDoc *m_rootDoc;
RdbList m_oldMetaList;
char *m_oldTitleRec;
int32_t m_oldTitleRecSize;
char *m_rootTitleRec;
@@ -1002,12 +939,9 @@ public:
char m_wasInIndex;
Msg8a m_msg8a;
char *m_tagdbColl;
int32_t m_tagdbCollLen;
Url m_extraUrl;
uint8_t m_siteNumInlinks8;
LinkInfo m_siteLinkInfo;
SafeBuf m_mySiteLinkInfoBuf;
SafeBuf m_myPageLinkInfoBuf;
SafeBuf m_myTempLinkInfoBuf;
@@ -1021,9 +955,6 @@ public:
SafeBuf m_tmpBuf12;
Multicast m_mcast11;
Multicast m_mcast12;
// lists from cachedb for msg25's msg20 replies serialized
RdbList m_siteReplyList;
RdbList m_pageReplyList;
MsgC m_msgc;
bool m_isAllowed;
bool m_forwardDownloadRequest;
@@ -1035,22 +966,17 @@ public:
int32_t m_numExpansions;
char m_newOnly;
char m_isWWWDup;
char m_calledMsg0b;
SafeBuf m_linkSiteHashBuf;
SafeBuf m_linkdbDataBuf;
SafeBuf m_langVec;
Msg0 m_msg0b;
class RdbList *m_ulist;
char *m_linkInfoColl;
SiteGetter m_siteGetter;
int64_t m_siteHash64;
int32_t m_siteHash32;
char *m_httpReply;
char m_incrementedAttemptsCount;
char m_incrementedDownloadCount;
char m_redirectFlag;
char m_spamCheckDisabled;
char m_useRobotsTxt;
int32_t m_robotsTxtLen;
int32_t m_httpReplySize;
@@ -1062,16 +988,13 @@ public:
char m_calledThread;
int32_t m_errno;
int32_t m_hostHash32a;
int32_t m_hostHash32b;
int32_t m_domHash32;
int32_t m_priorityQueueNum;
// this points into m_msge0 i guess
Msge0 m_msge0;
// this points into m_msge1 i guess
int32_t *m_outlinkIpVector;
SafeBuf m_outlinkTagRecPtrBuf;
SafeBuf m_fakeIpBuf;
char m_hasNoIndexMetaTag;
char m_hasUseFakeIpsMetaTag;
@@ -1080,23 +1003,13 @@ public:
SafeBuf m_fakeTagRecPtrBuf;
TagRec m_fakeTagRec;
//
// diffbot parms for indexing diffbot's json output
//
char *hashJSONFields ( HashTableX *table );
char *hashJSONFields2 ( HashTableX *table , HashInfo *hi , Json *jp ,
bool hashWithoutFieldNames ) ;
char *hashXMLFields ( HashTableX *table );
Json *getParsedJson();
// object that parses the json
Json m_jp;
// related query algo stuff
int64_t m_tlbufTimer;
// flow flags
bool m_computedMetaListCheckSum;
@@ -1104,18 +1017,11 @@ public:
// cachedb related args
bool m_allHashed;
// for getRelatedDocIdsWithTitles() launching msg20s
int32_t m_numMsg20Replies;
int32_t m_numMsg20Requests;
int8_t *m_outlinkHopCountVector;
int32_t m_outlinkHopCountVectorSize;
char m_isFiltered;
int32_t m_urlFilterNum;
int32_t m_numOutlinksAdded;
int32_t m_numOutlinksAddedFromSameDomain;
int32_t m_numOutlinksFiltered;
int32_t m_numOutlinksBanned;
int32_t m_numRedirects;
bool m_isPageParser;
Url m_baseUrl;
@@ -1124,12 +1030,8 @@ public:
char m_linkTextBuf[MAX_LINK_TEXT_LEN];
char m_surroundingTextBuf[MAX_SURROUNDING_TEXT_WIDTH];
char m_rssItemBuf[MAX_RSSITEM_SIZE];
SafeBuf m_gsbuf;
char *m_note;
char *m_imageUrl;
char *m_imageUrl2;
SafeBuf m_imageUrlBuf;
SafeBuf m_imageUrlBuf2;
Query m_query;
Matches m_matches;
// meta description buf
@@ -1138,7 +1040,6 @@ public:
SafeBuf m_htb;
Title m_title;
Summary m_summary;
char m_isCompromised;
char m_isNoArchive;
char m_isErrorPage;
char m_isHijacked;
@@ -1146,8 +1047,6 @@ public:
// stuff
char *m_statusMsg;
Msg4 m_msg4;
bool m_incCount;
bool m_decCount;
bool m_deleteFromIndex;
@@ -1171,33 +1070,19 @@ public:
bool m_check1 ;
bool m_check2 ;
bool m_prepared ;
bool m_updatedCounts ;
bool m_updatedCounts2 ;
bool m_copied1 ;
bool m_updatingSiteLinkInfoTags ;
int64_t m_calledMsg22d ;
bool m_didDelay ;
bool m_didDelayUnregister ;
bool m_calledMsg22e ;
bool m_calledMsg22f ;
bool m_calledMsg25 ;
bool m_calledMsg25b ;
bool m_calledMsg8b ;
bool m_calledMsg40 ;
bool m_calledSections ;
bool m_firstEntry ;
bool m_firstEntry2 ;
bool m_launchedSpecialMsg8a ;
bool m_launchedMsg8a2 ;
bool m_loaded ;
bool m_processedLang ;
bool m_doingConsistencyCheck ;
int32_t m_langIdScore;
int32_t m_dist;
// use to store a \0 list of "titles" of the root page so we can
@@ -1217,7 +1102,6 @@ public:
char m_titleBuf[ROOT_TITLE_BUF_MAX];
int32_t m_titleBufSize;
bool m_setTr ;
void (* m_masterLoop) ( void *state );
@@ -1227,8 +1111,6 @@ public:
bool (* m_callback2) ( void *state );
void *m_state;
bool m_skipIframeExpansion;
// this is non-zero if we decided not to index the doc
int32_t m_indexCode;
@@ -1250,12 +1132,8 @@ public:
int32_t m_maxCacheAge;
char *m_wikiqbuf;
int32_t m_wikiqbufSize;
bool m_registeredSleepCallback;
bool m_addedNegativeDoledbRec;
bool m_hashedTitle;
bool m_hashedMetas;
@@ -1293,7 +1171,6 @@ public:
bool m_setFromUrl;
bool m_setFromDocId;
bool m_freeLinkInfo1;
bool m_freeLinkInfo2;
bool m_contentInjected;
bool m_recycleContent;
@@ -1329,7 +1206,6 @@ public:
int32_t getProbSpam ( int32_t *profile, int32_t plen , int32_t step );
bool m_isRepeatSpammer;
int32_t m_numRepeatSpam;
bool m_totallySpammed;
// frag vector (repeated fragments). 0 means repeated, 1 means not.
// vector is 1-1 with words in the document body.
@@ -1358,10 +1234,6 @@ public:
void logQueryTimingEnd(const char* function, int64_t startTime);
int32_t m_i;
int32_t m_blocked;
void *m_finalState;
void (* m_finalCallback) ( void *state );
int64_t m_cacheStartTime;
};
// . PageParser.cpp uses this class for printing hashed terms out by calling

@@ -431,9 +431,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashCountry failed", __FILE__,__func__, __LINE__);
return NULL;
}
// BR 20160117 removed: if ( ! hashSiteNumInlinks( table ) ) return NULL;
// BR 20160117 removed: if ( ! hashTagRec ( table ) ) return NULL;
// BR 20160106 removed: if ( ! hashAds ( table ) ) return NULL;
// BR 20160106 removed: if ( ! hashSubmitUrls ( table ) ) return NULL;
if ( ! hashIsAdult ( table ) )
@@ -445,10 +443,6 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
// has gbhasthumbnail:1 or 0
// BR 20160106 removed: if ( ! hashImageStuff ( table ) ) return NULL;
// . hash sectionhash:xxxx terms
// . diffbot still needs to hash this for voting info
// BR 20160106 removed: if ( ! hashSections ( table ) ) return NULL;
// now hash the terms sharded by termid and not docid here since they
// just set a special bit in posdb key so Rebalance.cpp can work.
// this will hash the content checksum which we need for deduping
@@ -477,17 +471,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
return (char *)1;
}
// hash json fields
if ( *ct == CT_JSON ) {
// this hashes both with and without the fieldname
// BR 20160107 removed: hashJSONFields ( table );
goto skip;
}
// same for xml now, so we can search for field:value like w/ json
if ( *ct == CT_XML ) {
// this hashes both with and without the fieldname
// BR 20160107 removed: hashXMLFields ( table );
if ( *ct == CT_JSON || *ct == CT_XML ) {
goto skip;
}
@@ -579,18 +563,8 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashMetaTags failed", __FILE__,__func__, __LINE__);
return NULL;
}
/*
BR 20160220 removed.
if ( ! hashMetaZip ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashMetaZip failed", __FILE__,__func__, __LINE__);
return NULL;
}
*/
// BR 20160107 removed: if ( ! hashCharset ( table ) ) return NULL;
// BR 20160107 removed: if ( ! hashRSSInfo ( table ) ) return NULL;
if ( ! hashPermalink ( table ) )
if ( ! hashPermalink ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashPermaLink failed", __FILE__,__func__, __LINE__);
return NULL;
@@ -619,8 +593,6 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
}
bool XmlDoc::setSpiderStatusDocMetaList ( SafeBuf *jd , int64_t uqd ) {
// the posdb table
@@ -870,57 +842,6 @@ bool XmlDoc::hashDateNumbers ( HashTableX *tt ) { // , bool isStatusDoc ) {
return true;
}
bool XmlDoc::hashMetaZip ( HashTableX *tt ) {
setStatus ( "hashing meta zip" );
// . set the score based on quality
// . scores are multiplied by 256 to preserve fractions for adding
uint32_t score = *getSiteNumInlinks8() * 256 ;
if ( score <= 0 ) score = 1;
// search for meta date
char buf [ 32 ];
int32_t bufLen = m_xml.getMetaContent ( buf, 32, "zipcode", 7 );
if ( bufLen <= 0 ) bufLen = m_xml.getMetaContent ( buf, 32, "zip",3);
char *p = buf;
char *pend = buf + bufLen ;
if ( bufLen <= 0 ) return true;
// set up the hashing parms
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
//hi.m_prefix = "zipcode";
hi.m_prefix = "gbzipcode";
nextZip:
// . parse out the zip codes, may be multiple ones
// . skip non-digits
while ( p < pend && ! is_digit(*p) ) p++;
// skip if no digits
if ( p == pend ) return true;
// need at least 5 consecutive digits
if ( p + 5 > pend ) return true;
// if not a zip code, skip it
if ( ! is_digit(p[1]) ) { p += 1; goto nextZip; }
if ( ! is_digit(p[2]) ) { p += 2; goto nextZip; }
if ( ! is_digit(p[3]) ) { p += 3; goto nextZip; }
if ( ! is_digit(p[4]) ) { p += 4; goto nextZip; }
// do we have too many consecutive digits?
if ( p + 5 != pend && is_digit(p[5]) ) {
// if so skip this whole string of digits
p += 5; while ( p < pend && is_digit(*p) ) p++;
goto nextZip;
}
// 90210 --> 90 902 9021 90210
for ( int32_t i = 0 ; i <= 3 ; i++ )
// use prefix as description
if ( ! hashString ( p,5-i,&hi ) ) return false;
p += 5;
goto nextZip;
}
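To make the prefix expansion concrete, a tiny standalone sketch (illustration only) of the terms the loop above generates for one zip code:

#include <cstdio>

// For "90210" the loop hashes lengths 5, 4, 3 and 2 under the
// gbzipcode prefix: 90210, 9021, 902 and 90, so partial-zip
// queries can still match. Standalone illustration only.
int main ( ) {
	const char *zip = "90210";
	for ( int i = 0 ; i <= 3 ; i++ )
		printf ( "gbzipcode:%.*s\n" , 5 - i , zip );
	return 0;
}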
// returns false and sets g_errno on error
bool XmlDoc::hashContentType ( HashTableX *tt ) {
@@ -1788,13 +1709,6 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
return true;
}
// . returns false and sets g_errno on error
bool XmlDoc::hashSections ( HashTableX *tt ) {
// BR 20160106: No longer store xpath-hashes in posdb as we do not use them.
return true;
}
// . returns false and sets g_errno on error
bool XmlDoc::hashIncomingLinkText ( HashTableX *tt ,
bool hashAnomalies ,
@@ -1981,186 +1895,6 @@ bool XmlDoc::hashNeighborhoods ( HashTableX *tt ) {
goto loop;
}
// . returns false and sets g_errno on error
bool XmlDoc::hashRSSInfo ( HashTableX *tt ) {
setStatus ( "hashing rss info" );
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) { char *xx=NULL;*xx=0; }
// . finally hash in the linkText terms from the LinkInfo
// . the LinkInfo class has all the terms of hashed anchor text for us
// . if we're using an old TitleRec linkTermList is just a ptr to
// somewhere in TitleRec
// . otherwise, we generated it from merging a bunch of LinkInfos
// and storing them in this new TitleRec
LinkInfo *linkInfo = getLinkInfo1();
// get the xml of the first rss/atom item/entry referencing this url
Xml xml;
// . returns NULL if no item xml
// . this could also be a "channel" blurb now, so we index channel pgs
if ( ! linkInfo->getItemXml ( &xml , m_niceness ) ) return false;
if ( xml.isEmpty() )
// hash gbrss:0
return hashRSSTerm ( tt , false );
// parser info msg
//if ( m_pbuf ) {
// m_pbuf->safePrintf(
// "<br><b>--BEGIN RSS/ATOM INFO HASH--</b><br><br>");
//}
// hash nothing if not a permalink and eliminating "menus"
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
// . IMPORTANT: you must be using the new link algo, so turn it on
// in the spider controls. this allows us to include LinkTexts from
// the same IP in our LinkInfo class in the TitleRec.
// . is it rss or atom? both use title tag, so doesn't matter
// . get the title tag
bool isHtmlEncoded;
int32_t titleLen;
char *title = xml.getRSSTitle ( &titleLen , &isHtmlEncoded );
char c = 0;
// sanity check
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
bool hashIffUnique = true;
// but if we had no content because we were an mp3 or whatever,
// do not worry about avoiding double hashing
if ( size_utf8Content <= 0 ) hashIffUnique = false;
// decode it?
// should we decode it? if they don't use [CDATA[]] then we should
// ex: http://www.abc.net.au/rn/podcast/feeds/lawrpt.xml has CDATA,
// but most other feeds do not use it
if ( isHtmlEncoded && title && titleLen > 0 ) {
// it is html encoded so that the <'s are encoded to &lt;'s so
// we must decode them back. this could turn latin1 into utf8
// though? no, because the &'s should have been encoded, too!
int32_t newLen =htmlDecode(title,title,titleLen,false,m_niceness);
// make sure we don't overflow the buffer
if ( newLen > titleLen ) { char *xx = NULL; *xx = 0; }
// reassign the length
titleLen = newLen;
// NULL terminate it
c = title[titleLen];
title[titleLen] = '\0';
}
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_TITLE;
hi.m_desc = "rss title";
// . hash the rss title
// . only hash the terms if they are unique to stay balanced with docs
// that are not referenced by an rss feed
bool status = hashString ( title,titleLen,&hi ) ;
// pop the end back just in case
if ( c ) title[titleLen] = c;
// return false with g_errno set on error
if ( ! status ) return false;
// get the rss description
int32_t descLen;
char *desc = xml.getRSSDescription ( &descLen , &isHtmlEncoded );
// for advanced hashing
Xml xml2;
Words w;
//Scores scores;
Words *wordsPtr = NULL;
//Scores *scoresPtr = NULL;
c = 0;
// should we decode it? if they don't use [CDATA[]] then we should
// ex: http://www.abc.net.au/rn/podcast/feeds/lawrpt.xml has CDATA,
// but most other feeds do not use it
if ( isHtmlEncoded && desc && descLen > 0 ) {
// it is html encoded so that the <'s are encoded to &lt;'s so
// we must decode them back. this could turn latin1 into utf8
// though? no, because the &'s should have been encoded, too!
int32_t newLen = htmlDecode(desc,desc,descLen,false,m_niceness);
// make sure we don't overflow the buffer
if ( newLen > descLen ) { char *xx = NULL; *xx = 0; }
// reassign the length
descLen = newLen;
}
// NULL terminate it
if ( desc ) {
c = desc[descLen];
desc[descLen] = '\0';
// set the xml class from the decoded html
if ( !xml2.set( desc, descLen, m_version, m_niceness, *ct ) ) {
return false;
}
// set the words class from the xml, returns false and sets
// g_errno on error
if ( !w.set( &xml2, true, true ) ) {
return false;
}
// pass it in to TermTable::hash() below
wordsPtr = &w;
}
// update hash parms
hi.m_tt = tt;
hi.m_desc = "rss body";
hi.m_hashGroup = HASHGROUP_BODY;
// . hash the rss/atom description
// . only hash the terms if they are unique to stay balanced with docs
// that are not referenced by an rss feed
status = hashString ( desc, descLen, &hi );
// pop the end back just in case
if ( c ) desc[descLen] = c;
// return false with g_errno set
if ( ! status ) return false;
// hash gbrss:1
if ( ! hashRSSTerm ( tt , true ) ) return false;
// parser info msg
//if ( m_pbuf ) {
// m_pbuf->safePrintf("<br><b>--END RSS/ATOM INFO HASH--"
// "</b><br><br>");
//}
return true;
}
bool XmlDoc::hashRSSTerm ( HashTableX *tt , bool inRSS ) {
// hash gbrss:0 or gbrss:1
char *value;
if ( inRSS ) value = "1";
else value = "0";
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_prefix = "gbinrss";
hi.m_hashGroup = HASHGROUP_INTAG;
// returns false and sets g_errno on error
if ( ! hashString(value,1,&hi ) ) return false;
// hash gbisrss:1 if we are an rss page ourselves
if ( *getIsRSS() ) value = "1";
else value = "0";
// update hash parms
hi.m_prefix = "gbisrss";
// returns false and sets g_errno on error
if ( ! hashString(value,1,&hi) ) return false;
return true;
}
// . we now do the title hashing here for newer titlerecs, version 80+, rather
// than use the <index> block in the ruleset for titles.
// . this is not to be confused with hashing the title: terms which still
@@ -2426,170 +2160,6 @@ bool XmlDoc::hashCountry ( HashTableX *tt ) {
return true;
}
bool XmlDoc::hashSiteNumInlinks ( HashTableX *tt ) {
setStatus ( "hashing site num inlinks" );
char s[32];
int32_t slen = sprintf(s, "%"INT32"", (int32_t)*getSiteNumInlinks() );
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbsitenuminlinks";
// hack test
// slen = sprintf(s,"%"UINT32"",
// ((uint32_t)m_firstUrl.getUrlHash32()) % 1000);
// log("xmldoc: sitenuminlinks for %s is %s",m_firstUrl.getUrl(),s);
return hashString ( s, slen, &hi );
}
bool XmlDoc::hashCharset ( HashTableX *tt ) {
setStatus ( "hashing charset" );
char s[128]; // charset string
int32_t slen;
// hash the charset as a string
if ( ! get_charset_str(*getCharset()))
slen = sprintf(s, "unknown");
else
slen = sprintf(s, "%s", get_charset_str(*getCharset()));
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbcharset";
if ( ! hashString ( s,slen, &hi ) ) return false;
// hash charset as a number
slen = sprintf(s, "%d", *getCharset());
return hashString ( s,slen, &hi ) ;
}
// . only hash certain tags (single byte scores and ST_COMMENT)
// . do not hash clocks, ST_SITE, ST_COMMENT
// . term = gbtag:blog1 score=0-100
// . term = gbtag:blog2 score=0-100
// . term = gbtag:english1 score=0-100
// . term = gbtag:pagerank1 score=0-100, etc. ...
// . term = gbtagmeta:"this site"(special hashing,ST_META,score=qlty)
// . later we can support query like gbtag:english1>30
bool XmlDoc::hashTagRec ( HashTableX *tt ) {
setStatus ( "hashing tag rec" );
//char *field = "gbtag:";
//int32_t fieldlen = gbstrlen(field);
//bool retval = true;
// . this tag rec does not have the ST_SITE tag in it to save space
// . it does not have clocks either?
TagRec *gr = getTagRec();
// count occurence of each tag id
//int16_t count [ LAST_TAG ];
//memset ( count , 0 , 2 * LAST_TAG );
// loop over all tags in the title rec
for ( Tag *tag = gr->getFirstTag(); tag ; tag = gr->getNextTag(tag) ) {
// breathe
QUICKPOLL(m_niceness);
// get id
int32_t type = tag->m_type;
// skip tags we are not supposed to index, like
// ST_CLOCK, etc. or anything with a dataSize not 1
if ( ! tag->isIndexable() ) continue;
// hash these metas below
//if ( type == ST_META ) continue;
//if ( tag->isType("meta") ) continue;
// only single byters. this should have been covered by the
// isIndexable() function.
//if ( tag->getTagDataSize() != 1 ) continue;
// get the name
char *str = getTagStrFromType ( type );
// get data size
//uint8_t *data = (uint8_t *)tag->getTagData();
// make it a string
//char dataStr[6];
//sprintf ( dataStr , "%"INT32"",(int32_t)*data );
// skip if has non numbers
//bool num = true;
//for ( int32_t i = 0 ; i < tag->getTagDataSize() ; i++ )
// if ( ! is_digit(tag->getTagData()[i]) ) num = false;
// skip if it has more than just digits, we are not indexing
// strings at this point
//if ( ! num ) continue;
// point to it, should be a NULL terminated string
char *dataStr = tag->getTagData();
// skip if number is too big
//int32_t val = atol ( dataStr );
// boost by one so we can index "0" score
//val++;
// we really only want to index scores from 0-255
//if ( val > 255 ) continue;
// no negatives
//if ( val <= 0 ) continue;
// count occurence
//count [ type ]++;
// . make the term name to hash after the gbtag:
// . we want to hash "gbtag:english3" for example, for the
// ST_ENGLISH tag id.
char prefix[64];
// . do not include the count for the first occurence
// . follows the gbruleset:36 convention
// . index gbtagspam:0 or gbtagspam:1, etc.!!!
//if ( count[type] == 1 )
sprintf ( prefix , "gbtag%s",str);
// assume that is good enough
//char *prefix = tmp;
// store prefix into m_wbuf so XmlDoc::print() works!
//if ( m_pbuf ) {
// int32_t tlen = gbstrlen(tmp);
// m_wbuf.safeMemcpy(tmp,tlen+1);
// prefix = m_wbuf.getBuf() - (tlen+1);
//}
//else
// sprintf ( tmp , "gbtag%s%"INT32"",str,(int32_t)count[type]);
// "unmap" it so when it is hashed it will have the correct
// 8-bit score. IndexList.cpp will convert it back to 8 bits
// in IndexList::set(table), which sets our termlist from
// this "table".
//int32_t score = score8to32 ( val );
// we already incorporate the score as a string when we hash
// gbtagtagname:tagvalue so why repeat it?
//int32_t score = 1;
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_prefix = prefix;
hi.m_hashGroup = HASHGROUP_INTAG;
// meta is special now
if ( tag->isType("meta") ) {
hi.m_prefix = NULL;
}
// hash it. like "gbtagenglish:1" with a score of 1, etc.
// or "gbtagspam:33" with a score of 33. this would also
// hash gbtagclock:0xfe442211 type things as well.
int32_t dlen = gbstrlen(dataStr);
if ( ! hashString ( dataStr,dlen,&hi ) ) return false;
}
return true;
}
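A small standalone sketch of the term strings the loop above ends up hashing; the tag names and values here are made up:

#include <cstdio>

// For a tag rec holding an "english" tag with data "1" and a "spam"
// tag with data "33", the loop hashes "1" under the prefix
// gbtagenglish and "33" under gbtagspam. Illustration only.
int main ( ) {
	const char *names[] = { "english" , "spam" };
	const char *data [] = { "1" , "33" };
	for ( int i = 0 ; i < 2 ; i++ )
		printf ( "gbtag%s:%s\n" , names[i] , data[i] );
	return 0;
}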
bool XmlDoc::hashPermalink ( HashTableX *tt ) {
setStatus ( "hashing is permalink" );
@@ -2607,15 +2177,6 @@ bool XmlDoc::hashPermalink ( HashTableX *tt ) {
return hashString ( s,1,&hi );
}
bool XmlDoc::hashVectors ( HashTableX *tt ) {
setStatus ( "hashing vectors" );
return true;
}
// returns false and sets g_errno on error
bool XmlDoc::hashIsAdult ( HashTableX *tt ) {
@@ -3399,20 +2960,6 @@ bool XmlDoc::hashNumberForSorting ( char *beginBuf ,
// negative sign?
if ( p > beginBuf && p[-1] == '-' ) p--;
// BR 20160108: Removed all float numbers as we don't plan to use them
// . convert it to a float
// . this now allows for commas in numbers like "1,500.62"
// float f = atof2 ( p , bufEnd - p );
// if ( ! hashNumberForSortingAsFloat ( f , hi , "gbsortby" ) )
// return false;
// also hash in reverse order for sorting from low to high
// f = -1.0 * f;
// if ( ! hashNumberForSortingAsFloat ( f , hi , "gbrevsortby" ) )
// return false;
//
// also hash as an int, 4 byte-integer so our lastSpidered timestamps
// dont lose 128 seconds of resolution
@@ -3433,116 +2980,6 @@ bool XmlDoc::hashNumberForSorting ( char *beginBuf ,
return true;
}
bool XmlDoc::hashNumberForSortingAsFloat ( float f , HashInfo *hi , char *sortByStr ) {
// prefix is something like price. like the meta "name" or
// the json name with dots in it like "product.info.price" or something
int64_t nameHash = 0LL;
int32_t nameLen = 0;
if ( hi->m_prefix ) nameLen = gbstrlen ( hi->m_prefix );
if ( hi->m_prefix && nameLen )
nameHash = hash64Lower_utf8_nospaces( hi->m_prefix , nameLen );
// need a prefix for hashing numbers... for now
else { char *xx=NULL; *xx=0; }
// combine prefix hash with a special hash to make it unique to avoid
// collisions. this is the "TRUE" prefix.
int64_t truePrefix64 = hash64n ( sortByStr ); // "gbsortby");
// hash with the "TRUE" prefix
int64_t ph2 = hash64 ( nameHash , truePrefix64 );
// . now store it
// . use field hash as the termid. normally this would just be
// a prefix hash
// . use mostly fake value otherwise
key144_t k;
g_posdb.makeKey ( &k ,
ph2 ,
0,//docid
0,// word pos #
0,// densityRank , // 0-15
0 , // MAXDIVERSITYRANK
0 , // wordSpamRank ,
0 , //siterank
0 , // hashGroup,
// we set to docLang final hash loop
//langUnknown, // langid
// unless already set. so set to english here
// so it will not be set to something else
// otherwise our floats would be ordered by langid!
// somehow we have to indicate that this is a float
// termlist so it will not be mangled any more.
//langEnglish,
langUnknown,
0 , // multiplier
false, // syn?
false , // delkey?
hi->m_shardByTermId );
//int64_t final = hash64n("products.offerprice",0);
//int64_t prefix = hash64n("gbsortby",0);
//int64_t h64 = hash64 ( final , prefix);
//if ( ph2 == h64 )
// log("hey: got offer price");
// now set the float in that key
g_posdb.setFloat ( &k , f );
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
// so that we can b-step into a posdb list and make sure
// we are aligned on a 6 byte or 12 byte key, since they come
// in both sizes. but for this, hack it off to tell
// addTable144() that we are a special posdb key, a "numeric"
// key that has a float stored in it. then it will NOT
// set the siterank and langid bits which throw our sorting
// off!!
g_posdb.setAlignmentBit ( &k , 0 );
// sanity
float t = g_posdb.getFloat ( &k );
if ( t != f ) { char *xx=NULL;*xx=0; }
HashTableX *dt = hi->m_tt;
// the key may indeed collide, but that's ok for this application
if ( ! dt->addTerm144 ( &k ) )
return false;
if ( ! m_wts )
return true;
// store in buffer
char buf[128];
snprintf(buf,126,"%s:%s float32=%f",sortByStr,hi->m_prefix,f);
int32_t bufLen = gbstrlen(buf);
// add to wts for PageParser.cpp display
// store it
if ( ! storeTerm ( buf,
bufLen,
ph2,
hi,
0, // word#, i,
0, // wordPos
0,// densityRank , // 0-15
0, // MAXDIVERSITYRANK,//phrase
0, // ws,
0, // hashGroup,
//true,
&m_wbuf,
m_wts,
// a hack for display in wts:
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
langUnknown ,
k) )
return false;
return true;
}
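The same float-in-key idea, reduced to a standalone sketch; the 18-byte layout, the offset, and the alignment-bit position below are hypothetical, not posdb's actual 144-bit format:

#include <cstdio>
#include <cstring>

// Store a float's raw 32 bits inside a fixed-size key so a termlist
// can be sorted numerically, then read it back; clear a flag bit so
// readers know the key carries a number. All offsets are made up.
struct Key144 { unsigned char b[18]; };

static void  setFloat ( Key144 *k , float f ) { memcpy ( k->b + 2 , &f , 4 ); }
static float getFloat ( Key144 *k ) { float f; memcpy ( &f , k->b + 2 , 4 ); return f; }

int main ( ) {
	Key144 k; memset ( &k , 0 , sizeof(k) );
	setFloat ( &k , 19.99f );
	k.b[0] &= (unsigned char)~0x02;   // hypothetical "alignment" flag bit
	printf ( "stored=%.2f\n" , getFloat ( &k ) );
	return 0;
}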
bool XmlDoc::hashNumberForSortingAsInt32 ( int32_t n , HashInfo *hi , char *sortByStr ) {
// prefix is something like price. like the meta "name" or
@@ -3662,22 +3099,6 @@ bool XmlDoc::hashNumberForSortingAsInt32 ( int32_t n , HashInfo *hi , char *sort
#include "Json.h"
char *XmlDoc::hashJSONFields ( HashTableX *table ) {
setStatus ( "hashing json fields" );
HashInfo hi;
hi.m_tt = table;
hi.m_desc = "json object";
// use new json parser
Json *jp = getParsedJson();
if ( ! jp || jp == (void *)-1 ) return (char *)jp;
return hashJSONFields2 ( table , &hi , jp , true );
}
char *XmlDoc::hashJSONFields2 ( HashTableX *table ,
HashInfo *hi , Json *jp ,
bool hashWithoutFieldNames ) {
@@ -3815,63 +3236,3 @@ char *XmlDoc::hashJSONFields2 ( HashTableX *table ,
return (char *)0x01;
}
char *XmlDoc::hashXMLFields ( HashTableX *table ) {
setStatus ( "hashing xml fields" );
HashInfo hi;
hi.m_tt = table;
hi.m_desc = "xml object";
hi.m_hashGroup = HASHGROUP_BODY;
Xml *xml = getXml();
int32_t n = xml->getNumNodes();
XmlNode *nodes = xml->getNodes ();
SafeBuf nameBuf;
// scan the xml nodes
for ( int32_t i = 0 ; i < n ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// . skip if it's a tag not text node skip it
// . we just want the "text" nodes
if ( nodes[i].isTag() ) continue;
//if(!strncmp(nodes[i].m_node,"Congress%20Presses%20Uber",20))
// log("hey:hy");
// assemble the full parent name
// like "tag1.tag2.tag3"
nameBuf.reset();
xml->getCompoundName ( i , &nameBuf );
// this is \0 terminated
char *tagName = nameBuf.getBufStart();
// get the utf8 text
char *val = nodes[i].m_node;
int32_t vlen = nodes[i].m_nodeLen;
// index like "title:whatever"
if ( tagName && tagName[0] ) {
hi.m_prefix = tagName;
hashString ( val , vlen , &hi );
}
// hash without the field name as well
hi.m_prefix = NULL;
hashString ( val , vlen , &hi );
}
return (char *)0x01;
}
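For clarity, a standalone sketch of what the loop above would index for a small made-up document like <product><title>Red Shoe</title></product>:

#include <cstdio>

// The text node "Red Shoe" is hashed twice: once field-qualified by
// its compound tag name, and once bare. Document and terms invented.
int main ( ) {
	printf ( "product.title:Red Shoe\n" ); // with hi.m_prefix = "product.title"
	printf ( "Red Shoe\n" );               // and again with no prefix
	return 0;
}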