forked from Mirrors/privacore-open-source-search-engine
Remove unused variables/functions from XmlDoc
PageGet.cpp (14 changes)
@@ -312,20 +312,6 @@ bool processLoop ( void *state ) {
return status;
}

/*
// this was calling XmlDoc and setting sections, etc. to
// get the SpiderReply junk... no no no
// is it banned or filtered? this ignores the TagRec in the titleRec
// and uses msg8a to get it fresh instead
char *vi = xd->getIsFiltered();//Visible( );
// wait if blocked
if ( vi == (void *)-1 ) return false;
// error?
if ( ! vi ) return sendErrorReply ( st , g_errno );
// banned?
if ( ! st->m_isMasterAdmin && ! *vi ) return sendErrorReply (st,EDOCBANNED);
*/

// get the utf8 content
char **utf8 = xd->getUtf8Content();
//int32_t len = xd->size_utf8Content - 1;
Spider.h (6 changes)
@@ -567,12 +567,6 @@ class SpiderRequest {
//int32_t m_reservedc1;
int32_t m_reservedc2;

//int32_t m_parentPubDate;

// . pub date taken from url directly, not content
// . ie. http://mysite.com/blog/nov-06-2009/food.html
// . ie. http://mysite.com/blog/11062009/food.html
//int32_t m_urlPubDate;
// . replace this with something we need for smart compression
// . this is zero if none or invalid
int32_t m_contentHash32;
XmlDoc.cpp (219 changes)
@@ -100,7 +100,6 @@ XmlDoc::XmlDoc() {
m_wasInIndex = false;
m_outlinkHopCountVector = NULL;
m_extraDoc = NULL;
m_wikiqbuf = NULL;

reset();
}

@@ -126,16 +125,11 @@ void XmlDoc::reset ( ) {

m_printedMenu = false;

m_tmpBuf2.purge();

m_bodyStartPos = 0;

m_skipIframeExpansion = false;
m_indexedTime = 0;

m_metaList2.purge();
m_zbuf.purge();
m_kbuf.purge();

m_mySiteLinkInfoBuf.purge();
m_myPageLinkInfoBuf.purge();

@@ -155,10 +149,6 @@ void XmlDoc::reset ( ) {
m_fakeIpBuf.purge();
m_fakeTagRecPtrBuf.purge();

m_tlbufTimer = 0LL;
m_gsbuf.reset();


m_doConsistencyTesting = g_conf.m_doConsistencyTesting;

m_computedMetaListCheckSum = false;

@@ -191,9 +181,6 @@ void XmlDoc::reset ( ) {
"be saved in addsinprogress.dat.");
}

m_ei = 0;
m_lastLaunch = -1;

m_pbuf = NULL;
m_wts = NULL;

@@ -277,9 +264,6 @@ void XmlDoc::reset ( ) {
}
m_outlinkHopCountVector = NULL;

m_gsbuf.reset();


// reset all *valid* flags to false
void *p = &m_VALIDSTART;
void *pend = &m_VALIDEND;

@@ -329,7 +313,6 @@ void XmlDoc::reset ( ) {
m_setFromDocId = false;
m_setFromSpiderRec = false;
m_freeLinkInfo1 = false;
m_freeLinkInfo2 = false;

m_checkedUrlFilters = false;
@@ -351,50 +334,35 @@ void XmlDoc::reset ( ) {
// keep track of updates to the rdbs we have done, so we do not re-do
m_listAdded = false;
m_listFlushed = false;
m_updatedCounts = false;
m_updatedCounts2 = false;
m_copied1 = false;
m_updatingSiteLinkInfoTags = false;
m_hashedTitle = false;

m_registeredSleepCallback = false;
m_addedNegativeDoledbRec = false;

m_numRedirects = 0;
m_numOutlinksAdded = 0;
m_spamCheckDisabled = false;
m_useRobotsTxt = true;
m_redirectFlag = false;

m_allowSimplifiedRedirs = false;

m_didDelay = false;
m_didDelayUnregister = false;
m_calledMsg22d = 0LL;
m_calledMsg22e = false;
m_calledMsg22f = false;
m_calledMsg25 = false;
m_calledMsg25b = false;
m_calledMsg40 = false;
m_calledSections = false;
m_calledThread = false;
m_alreadyRegistered = false;
m_loaded = false;
m_firstEntry = true;
m_firstEntry2 = true;
m_launchedSpecialMsg8a = false;
m_launchedMsg8a2 = false;

m_setTr = false;
m_calledMsg8b = false;

m_recycleContent = false;
m_callback1 = NULL;
m_callback2 = NULL;
m_state = NULL;

m_processedLang = false;

m_doingConsistencyCheck = false;


@@ -431,19 +399,6 @@ void XmlDoc::reset ( ) {
void *px = &ptr_firstUrl;
void *pxend = &m_dummyEnd;
memset ( px , 0 , (char *)pxend - (char *)px );

ptr_unused6 = NULL;
size_unused6 = 0;
ptr_unused7 = NULL;
size_unused7 = 0;
ptr_unused1 = NULL;
size_unused1 = 0;
ptr_unused2 = NULL;
size_unused2 = 0;
ptr_unused3 = NULL;
size_unused3 = 0;
ptr_unused5 = NULL;
size_unused5 = 0;
}

int64_t XmlDoc::logQueryTimingStart() {

@@ -638,8 +593,6 @@ bool XmlDoc::setCollNum ( const char *coll ) {
// we can store this safely:
m_collnum = cr->m_collnum;
m_collnumValid = true;
// if user "resets" the collection we need to know
m_lastCollRecResetCount = cr->m_lastResetCount;
return true;
}
@@ -1286,10 +1239,6 @@ bool XmlDoc::set2 ( char *titleRec ,
return false;
}

// debug thing
ptr_sectiondbData = NULL;
size_sectiondbData = 0;

// success, return true then
return true;
}

@@ -2255,37 +2204,6 @@ bool XmlDoc::indexDoc2 ( ) {
m_registeredSleepCallback = false;
}

//////////
// . add the doledb negative key quickly to our tree to avoid a
// respider because the msg4 doledb negative key is buffered by msg4
// . make it negative
// . well it should not be respidered because the lock is on it!!
// -- so let's comment this out
/////////
/*
key_t negative = m_doledbKey;
// make it negative
negative.n0 &= 0xfffffffffffffffeLL;
// . store it in our tree if we can
// . returns false and sets g_errno on error
// . i.e. g_errno == ETRYAGAIN
if ( ! m_addedNegativeDoledbRec &&
! g_doledb.m_rdb.addRecord(m_coll,(char *)&negative,
NULL,0,m_niceness)){
log("build: error trying to add to doledb: %s",
mstrerror(g_errno));
// set sleep wrapper
g_loop.registerSleepCallback(1000,m_masterState,
indexDocWrapper2,m_niceness);
// note it
m_registeredSleepCallback = true;
// sleep and retry
return false;
}
*/
// we did that
m_addedNegativeDoledbRec = true;

// now add it
if ( ! m_listAdded && m_metaListSize ) {
// only call this once
@@ -5610,18 +5528,6 @@ int32_t XmlDoc::computeVector( Words *words, uint32_t *vec, int32_t start, int32
return nd * 4;
}

float *XmlDoc::getTagSimilarity ( XmlDoc *xd2 ) {
int32_t *tv1 = getTagPairHashVector();
if ( ! tv1 || tv1 == (int32_t *)-1 ) return (float *)tv1;
int32_t *tv2 = xd2->getTagPairHashVector();
if ( ! tv2 || tv2 == (int32_t *)-1 ) return (float *)tv2;
m_tagSimilarity = computeSimilarity ( tv1, tv2, NULL, NULL, NULL ,
m_niceness );
// this means error, g_errno should be set
if ( m_tagSimilarity == -1.0 ) return NULL;
return &m_tagSimilarity;
}

float *XmlDoc::getPageSimilarity ( XmlDoc *xd2 ) {
int32_t *sv1 = getPageSampleVector();
if ( ! sv1 || sv1 == (int32_t *)-1 ) return (float *)sv1;

@@ -5869,17 +5775,6 @@ bool isSimilar_sorted ( int32_t *vec0 ,
goto mergeLoop;
}

uint64_t *XmlDoc::getFuzzyDupHash ( ) {

if ( m_dupHashValid ) return &m_dupHash;
uint32_t *h1 = getTagPairHash32();
if ( ! h1 || h1 == (uint32_t *)-1 ) return (uint64_t *)h1;

m_dupHash = *h1;
m_dupHashValid = true;
return &m_dupHash;
}

int64_t *XmlDoc::getExactContentHash64 ( ) {

if ( m_exactContentHash64Valid )

@@ -6599,7 +6494,6 @@ Url **XmlDoc::getRedirUrl() {
if ( cu->getDomainLen() != dlen ) sameDom=false;
else if ( strncmp(cu->getDomain(),loc->getDomain(),dlen))sameDom=false;
if ( ! sameDom ) {
m_redirectFlag = true;
m_redirUrl.set ( loc , false ); // addWWW=false
m_redirUrlPtr = &m_redirUrl;
ptr_redirUrl = m_redirUrl.m_url;

@@ -6701,7 +6595,6 @@ Url **XmlDoc::getRedirUrl() {
return &m_redirUrlPtr;
}
// good to go
m_redirectFlag = true;
m_redirUrl.set ( loc , false ); // addWWW=false
m_redirUrlPtr = &m_redirUrl;
ptr_redirUrl = m_redirUrl.m_url;

@@ -7118,9 +7011,6 @@ XmlDoc **XmlDoc::getExtraDoc ( char *u , int32_t maxCacheAge ) {
// carry this forward always!
m_extraDoc->m_isSpiderProxy = m_isSpiderProxy;

// disable spam check because that is not necessary for this doc!
m_extraDoc->m_spamCheckDisabled = true;

// tell msg13 to get this from it robots.txt cache if it can. it also
// keeps a separate html page cache for the root pages, etc. in case
m_extraDoc->m_maxCacheAge = maxCacheAge;

@@ -12559,14 +12449,6 @@ int32_t XmlDoc::getHostHash32a ( ) {
return m_hostHash32a;
}

int32_t XmlDoc::getHostHash32b ( ) {
if ( m_hostHash32bValid ) return m_hostHash32b;
m_hostHash32bValid = true;
Url *c = getCurrentUrl();
m_hostHash32b = c->getHostHash32();
return m_hostHash32b;
}

int32_t XmlDoc::getDomHash32( ) {
if ( m_domHash32Valid ) return m_domHash32;
m_domHash32Valid = true;

@@ -13421,19 +13303,6 @@ char *XmlDoc::getSpiderLinks ( ) {
return &m_spiderLinks2;
}

// should we index the doc? if already indexed, and is filtered, we delete it
char *XmlDoc::getIsFiltered ( ) {
if ( m_isFilteredValid ) return &m_isFiltered;
int32_t *priority = getSpiderPriority();
if ( ! priority || priority == (void *)-1 ) return (char *)priority;
m_isFiltered = false;
// if ( *priority == SPIDER_PRIORITY_FILTERED ) m_isFiltered = true;
// if ( *priority == SPIDER_PRIORITY_BANNED ) m_isFiltered = true;
if ( *priority == -3 ) m_isFiltered = true;
m_isFilteredValid = true;
return &m_isFiltered;
}

int32_t *XmlDoc::getSpiderPriority ( ) {
if ( m_priorityValid ) return &m_priority;
setStatus ("getting spider priority");
@@ -14121,17 +13990,6 @@ bool XmlDoc::doConsistencyTest ( bool forceTest ) {
return true;
}

int32_t XmlDoc::printMetaList ( ) {

SafeBuf sb;
printMetaList ( m_metaList ,
m_metaList + m_metaListSize ,
&sb );
fprintf(stderr,"%s\n",sb.getBufStart());
return 0;
}


#define TABLE_ROWS 25

// print this also for page parser output!

@@ -16684,10 +16542,6 @@ void XmlDoc::copyFromOldDoc ( XmlDoc *od ) {
size_linkInfo1 = od->size_linkInfo1;
if ( ptr_linkInfo1 && size_linkInfo1 ) m_linkInfo1Valid = true;
else m_linkInfo1Valid = false;

// turn off for debug
ptr_sectiondbData = NULL;
size_sectiondbData = 0;
}

// for adding a quick reply for EFAKEIP and for diffbot query reindex requests

@@ -17726,7 +17580,6 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {

ksr.m_addedTime = getSpideredTime();//m_spideredTime;
//ksr.m_lastAttempt = 0;
//ksr.m_urlPubDate = urlPubDate;
//ksr.m_errCode = 0;
ksr.m_parentHostHash32 = hostHash32a;
ksr.m_parentDomHash32 = m_domHash32;

@@ -17955,8 +17808,6 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
m_numOutlinksAdded = numAdded;
m_numOutlinksAddedValid = true;
m_numOutlinksAddedFromSameDomain = numAddedFromSameDomain;
m_numOutlinksFiltered = linksFiltered;
m_numOutlinksBanned = linksBanned;
// update end of list once we have successfully added all spider recs
m_p = p;
// return current ptr

@@ -23457,8 +23308,6 @@ char *XmlDoc::getWordSpamVec ( ) {
// fix this a bit so we're not always totally spammed
maxPercent = 25;

// assume not totally spammed
m_totallySpammed = false;
// get # of words we have to set spam for
int32_t numWords = words->getNumWords();

@@ -23670,9 +23519,7 @@ char *XmlDoc::getWordSpamVec ( ) {
// if we had < 100 candidates and < 20% spam, don't bother
//if ( percent < 5 ) goto done;
if ( percent <= maxPercent ) goto done;
// set flag so linkspam.cpp can see if all is spam and will not allow
// this page to vote
m_totallySpammed = true;

// now only set to 99 so each singleton usually gets hashed
for ( i = 0 ; i < numWords ; i++ )
if ( words->getWordId(i) && spam[i] < 99 )
@@ -24627,70 +24474,6 @@ bool XmlDoc::getIsInjecting ( ) {
return isInjecting;
}

// this is still used by Title.cpp to get the title: field quickly
char *getJSONFieldValue ( char *json , char *field , int32_t *valueLen ) {

if ( ! json ) return NULL;

// get length
int32_t fieldLen = gbstrlen(field);
// keep track of in a quote or not
bool inQuotes = false;
char *stringStart = NULL;
char *p = json;
bool gotOne = false;
int32_t depth = 0;
// scan
for ( ; *p ; p++ ) {
// escaping a quote? ignore quote then.
if ( *p == '\\' && p[1] == '\"' ) {
// skip two bytes then..
p++;
continue;
}
// count {} depth
if ( ! inQuotes ) {
if ( *p == '{' ) depth++;
if ( *p == '}' ) depth--;
}
// a quote?
if ( *p == '\"' ) {
inQuotes = ! inQuotes;
// set start of the string if quote is beginning
if ( inQuotes ) stringStart = p + 1;
// if quote is ending and a colon follows then
// it was a json field name. so if it matches the
// field we want return the following field for it.
else if ( ! inQuotes &&
! gotOne &&
p[1] == ':' &&
// {"title":"whatever",...}
// could be product:{title:... depth=2
(depth == 1 ||depth==2) &&
stringStart &&
(p - stringStart) == fieldLen &&
strncmp(field,stringStart,fieldLen)==0 ) {
// now, the next time we set stringStart
// it will be set to the VALUE of this field
// assuming the field is a STRING!!!!
gotOne = true;
// return after the quote
//return p + 2;
}
// ok, we got the string after the field string...
else if ( ! inQuotes && gotOne ) {
if ( valueLen ) *valueLen = p - stringStart;
return stringStart;
}
// keep chugging
continue;
}
}
// done, not found
return NULL;
}


Json *XmlDoc::getParsedJson ( ) {

if ( m_jpValid ) return &m_jp;
XmlDoc.h (152 changes)
@@ -71,10 +71,6 @@ bool setLangVec ( class Words *words ,
class Sections *sections ,
int32_t niceness ) ;

char *getJSONFieldValue ( char *json, char *field , int32_t *valueLen ) ;

bool logQueryLogs ( );

bool getDensityRanks ( int64_t *wids ,
int32_t nw,
//int32_t wordStart ,

@@ -192,7 +188,7 @@ public:
uint32_t m_spideredTime; // time_t
uint32_t m_indexedTime; // slightly > m_spideredTime (time_t)
uint32_t m_reserved32;
uint32_t reserved3; //was: m_pubDate; // aka m_datedbDate // time_t
uint32_t reserved3;
uint32_t m_firstIndexedDate; // time_t
uint32_t m_outlinksAddedDate; // time_t

@@ -206,7 +202,7 @@ public:
uint16_t m_bodyStartPos;
uint16_t m_reserved5;

uint16_t m_unused0; //was: m_diffbotJSONCount
uint16_t m_unused0;

int16_t m_httpStatus; // -1 if not found (empty http reply)

@@ -230,9 +226,9 @@ public:
uint16_t m_reserved799:1;
uint16_t m_isSiteRoot:1;

uint16_t m_reserved800:1; //was:m_isDiffbotJSONObject
uint16_t m_reserved801:1; //was:m_sentToDiffbot
uint16_t m_reserved802:1; //was:m_gotDiffbotSuccessfulReply
uint16_t m_reserved800:1;
uint16_t m_reserved801:1;
uint16_t m_reserved802:1;
uint16_t m_useTimeAxis:1; // m_reserved804:1;
uint16_t m_reserved805:1;
uint16_t m_reserved806:1;

@@ -273,7 +269,7 @@ public:
char *ptr_site;
LinkInfo *ptr_linkInfo1;
char *ptr_linkdbData;
char *ptr_sectiondbData;
char *ptr_unused14;
char *ptr_tagRecData;
LinkInfo *ptr_unused9;

@@ -296,7 +292,7 @@ public:
int32_t size_site;
int32_t size_linkInfo1;
int32_t size_linkdbData;
int32_t size_sectiondbData;
int32_t size_unused14;
int32_t size_tagRecData;
int32_t size_unused9;
@@ -395,10 +391,8 @@ public:
int32_t *getPageSampleVector ( ) ;
int32_t *getPostLinkTextVector ( int32_t linkNode ) ;
int32_t computeVector ( class Words *words, uint32_t *vec , int32_t start = 0 , int32_t end = -1 );
float *getTagSimilarity ( class XmlDoc *xd2 ) ;
float *getPageSimilarity ( class XmlDoc *xd2 ) ;
float *getPercentChanged ( );
uint64_t *getFuzzyDupHash ( );
int64_t *getExactContentHash64();
class RdbList *getDupList ( ) ;
char *getIsDup ( ) ;

@@ -471,7 +465,6 @@ public:
int32_t *getTagPairHashVector ( ) ;
uint32_t *getTagPairHash32 ( ) ;
int32_t getHostHash32a ( ) ;
int32_t getHostHash32b ( ) ;
int32_t getDomHash32 ( );
char **getThumbnailData();
class Images *getImages ( ) ;

@@ -482,7 +475,6 @@ public:
char *getIsSiteRoot ( ) ;
int8_t *getHopCount ( ) ;
char *getSpiderLinks ( ) ;
char *getIsFiltered ();
bool getIsInjecting();
int32_t *getSpiderPriority ( ) ;
int32_t *getIndexCode ( ) ;

@@ -492,7 +484,7 @@ public:
bool logIt ( class SafeBuf *bb = NULL ) ;
bool m_doConsistencyTesting;
bool doConsistencyTest ( bool forceTest ) ;
int32_t printMetaList ( ) ;

void printMetaList ( char *metaList , char *metaListEnd ,
class SafeBuf *pbuf );
bool verifyMetaList ( char *p , char *pend , bool forDelete ) ;

@@ -518,10 +510,6 @@ public:
// m_indexCode or g_errno was set!
class SpiderReply *getNewSpiderReply ( );

SpiderRequest m_redirSpiderRequest;
SpiderRequest *m_redirSpiderRequestPtr;


void setSpiderReqForMsg20 ( class SpiderRequest *sreq ,
class SpiderReply *srep );

@@ -542,7 +530,6 @@ public:
bool hashNoSplit ( class HashTableX *tt ) ;
char *hashAll ( class HashTableX *table ) ;
bool hashMetaTags ( class HashTableX *table ) ;
bool hashMetaZip ( class HashTableX *table ) ;
bool hashContentType ( class HashTableX *table ) ;

bool hashLinks ( class HashTableX *table ) ;

@@ -550,12 +537,9 @@ public:
SafeBuf *getTimeAxisUrl ( );
bool hashUrl ( class HashTableX *table, bool urlOnly );
bool hashDateNumbers ( class HashTableX *tt );
bool hashSections ( class HashTableX *table ) ;
bool hashIncomingLinkText( class HashTableX *table, bool hashAnomalies, bool hashNonAnomalies );
bool hashLinksForLinkdb ( class HashTableX *table ) ;
bool hashNeighborhoods ( class HashTableX *table ) ;
bool hashRSSInfo ( class HashTableX *table ) ;
bool hashRSSTerm ( class HashTableX *table , bool inRSS ) ;
bool hashTitle ( class HashTableX *table );
bool hashBody2 ( class HashTableX *table );
bool hashMetaKeywords ( class HashTableX *table );

@@ -564,12 +548,8 @@ public:
bool hashLanguage ( class HashTableX *table ) ;
bool hashLanguageString ( class HashTableX *table ) ;
bool hashCountry ( class HashTableX *table ) ;
bool hashSiteNumInlinks ( class HashTableX *table ) ;
bool hashCharset ( class HashTableX *table ) ;
bool hashTagRec ( class HashTableX *table ) ;
bool hashPermalink ( class HashTableX *table ) ;
bool hashVectors(class HashTableX *table ) ;


class Url *getBaseUrl ( ) ;
bool hashIsAdult ( class HashTableX *table ) ;

@@ -611,10 +591,6 @@ public:
int32_t bufLen ,
class HashInfo *hi ) ;

bool hashNumberForSortingAsFloat ( float f ,
class HashInfo *hi ,
char *gbsortByStr ) ;

bool hashNumberForSortingAsInt32 ( int32_t x,
class HashInfo *hi ,
char *gbsortByStr ) ;
@@ -678,9 +654,7 @@ public:
int64_t m_firstUrlHash64;
Url m_currentUrl;

CollectionRec *m_lastcr;
collnum_t m_collnum;
int32_t m_lastCollRecResetCount;
class CollectionRec *getCollRec ( ) ;
bool setCollNum ( const char *coll ) ;

@@ -696,8 +670,6 @@ public:
int32_t m_addedStatusDocSize;

SafeBuf m_metaList2;
SafeBuf m_zbuf;
SafeBuf m_kbuf;

// used by msg7 to store udp slot
class UdpSlot *m_injectionSlot;

@@ -719,8 +691,6 @@ public:
char m_logLangId;
int32_t m_logSiteNumInlinks;

SafeBuf m_tmpBuf2;

SafeBuf m_timeAxisUrl;

Images m_images;

@@ -767,7 +737,6 @@ public:
char m_fragBufValid;
char m_wordSpamBufValid;
char m_finalSummaryBufValid;
char m_redirSpiderRequestValid;

char m_hopCountValid;
char m_isInjectingValid;

@@ -782,7 +751,6 @@ public:
char m_datedbDateValid;
char m_isRSSValid;
char m_isSiteMapValid;
char m_spiderLinksArgValid;
char m_isContentTruncatedValid;
char m_xmlValid;
char m_linksValid;

@@ -790,10 +758,8 @@ public:
char m_bitsValid;
char m_bits2Valid;
char m_posValid;
char m_isUrlBadYearValid;
char m_phrasesValid;
char m_sectionsValid;
char m_subSentsValid;

char m_imageDataValid;
char m_imagesValid;

@@ -806,7 +772,6 @@ public:
bool m_firstIpValid;
bool m_spideredTimeValid;
bool m_indexedTimeValid;
bool m_firstIndexedValid;
bool m_isInIndexValid;
bool m_wasInIndexValid;
bool m_outlinksAddedDateValid;

@@ -828,9 +793,7 @@ public:
bool m_canonicalRedirUrlValid;
bool m_statusMsgValid;
bool m_mimeValid;
bool m_pubDateValid;
bool m_hostHash32aValid;
bool m_hostHash32bValid;
bool m_indexCodeValid;
bool m_priorityValid;
bool m_downloadStatusValid;
@@ -845,29 +808,23 @@ public:
bool m_isPermalinkValid;

bool m_isAdultValid;
bool m_urlPubDateValid;
bool m_isUrlPermalinkFormatValid;
bool m_percentChangedValid;
bool m_unchangedValid;
bool m_countTableValid;
bool m_summaryLangIdValid;
bool m_tagPairHashVecValid;
bool m_summaryVecValid;
bool m_titleVecValid;
bool m_pageSampleVecValid;
bool m_postVecValid;
bool m_dupListValid;
bool m_likedbListValid;
bool m_isDupValid;
bool m_metaDescValid;
bool m_metaSummaryValid;
bool m_metaKeywordsValid;
bool m_metaGeoPlacenameValid;
bool m_siteSpiderQuotaValid;
bool m_oldDocValid;
bool m_extraDocValid;
bool m_rootDocValid;
bool m_oldMetaListValid;
bool m_oldTitleRecValid;
bool m_rootTitleRecValid;
bool m_isIndexedValid;

@@ -881,12 +838,10 @@ public:
bool m_siteHash32Valid;
bool m_httpReplyValid;
bool m_contentTypeValid;
bool m_priorityQueueNumValid;
bool m_outlinkTagRecVectorValid;
bool m_outlinkIpVectorValid;
bool m_hasNoIndexMetaTagValid;
bool m_hasUseFakeIpsMetaTagValid;
bool m_outlinkIsIndexedVectorValid;
bool m_isSiteRootValid;
bool m_wasContentInjectedValid;
bool m_outlinkHopCountVectorValid;

@@ -906,7 +861,6 @@ public:
bool m_htbValid;
bool m_collnumValid;
bool m_summaryValid;
bool m_gsbufValid;
bool m_spiderStatusDocMetaListValid;
bool m_isCompromisedValid;
bool m_isNoArchiveValid;

@@ -914,9 +868,7 @@ public:
bool m_isLinkSpamValid;
bool m_isErrorPageValid;
bool m_isHijackedValid;
bool m_dupHashValid;
bool m_exactContentHash64Valid;
bool m_looseContentHash64Valid;
bool m_jpValid;

char m_isSiteMap;

@@ -933,43 +885,31 @@ public:
// DO NOT add validity flags below this line!
char m_VALIDEND;


bool m_printedMenu;
int32_t m_urlPubDate;
char m_isUrlPermalinkFormat;
uint8_t m_summaryLangId;
int32_t m_tagPairHashVec[MAX_TAG_PAIR_HASHES];
int32_t m_tagPairHashVecSize;
int32_t m_summaryVec [SAMPLE_VECTOR_SIZE/4];
int32_t m_summaryVecSize;
int32_t m_titleVec [SAMPLE_VECTOR_SIZE/4];
int32_t m_titleVecSize;
int32_t m_pageSampleVec[SAMPLE_VECTOR_SIZE/4];
int32_t m_pageSampleVecSize;
int32_t m_postVec[POST_VECTOR_SIZE/4];
int32_t m_postVecSize;
float m_tagSimilarity;
float m_pageSimilarity;
float m_percentChanged;
bool m_unchanged;
// what docids are similar to us? docids are in this list
RdbList m_dupList;
RdbList m_likedbList;
uint64_t m_dupHash;
int64_t m_exactContentHash64;
int64_t m_looseContentHash64;
Msg0 m_msg0;
Msg5 m_msg5;
char m_isDup;
int64_t m_docIdWeAreADupOf;
int32_t m_ei;
int32_t m_lastLaunch;
Msg22Request m_msg22Request;
Msg22Request m_msg22Requestc;
Msg22 m_msg22a;
Msg22 m_msg22b;
Msg22 m_msg22c;
Msg22 m_msg22d;
Msg22 m_msg22e;
Msg22 m_msg22f;
// these now reference directly into the html src so our
@@ -983,13 +923,10 @@ public:

char *m_metaGeoPlacename;
int32_t m_metaGeoPlacenameLen;


int32_t m_siteSpiderQuota;

class XmlDoc *m_oldDoc;
class XmlDoc *m_extraDoc;
class XmlDoc *m_rootDoc;
RdbList m_oldMetaList;
char *m_oldTitleRec;
int32_t m_oldTitleRecSize;
char *m_rootTitleRec;

@@ -1002,12 +939,9 @@ public:
char m_wasInIndex;

Msg8a m_msg8a;
char *m_tagdbColl;
int32_t m_tagdbCollLen;

Url m_extraUrl;
uint8_t m_siteNumInlinks8;
LinkInfo m_siteLinkInfo;
SafeBuf m_mySiteLinkInfoBuf;
SafeBuf m_myPageLinkInfoBuf;
SafeBuf m_myTempLinkInfoBuf;

@@ -1021,9 +955,6 @@ public:
SafeBuf m_tmpBuf12;
Multicast m_mcast11;
Multicast m_mcast12;
// lists from cachedb for msg25's msg20 replies serialized
RdbList m_siteReplyList;
RdbList m_pageReplyList;
MsgC m_msgc;
bool m_isAllowed;
bool m_forwardDownloadRequest;

@@ -1035,22 +966,17 @@ public:
int32_t m_numExpansions;
char m_newOnly;
char m_isWWWDup;
char m_calledMsg0b;

SafeBuf m_linkSiteHashBuf;
SafeBuf m_linkdbDataBuf;
SafeBuf m_langVec;
Msg0 m_msg0b;
class RdbList *m_ulist;
char *m_linkInfoColl;
SiteGetter m_siteGetter;
int64_t m_siteHash64;
int32_t m_siteHash32;
char *m_httpReply;
char m_incrementedAttemptsCount;
char m_incrementedDownloadCount;
char m_redirectFlag;
char m_spamCheckDisabled;
char m_useRobotsTxt;
int32_t m_robotsTxtLen;
int32_t m_httpReplySize;

@@ -1062,16 +988,13 @@ public:
char m_calledThread;
int32_t m_errno;
int32_t m_hostHash32a;
int32_t m_hostHash32b;
int32_t m_domHash32;
int32_t m_priorityQueueNum;

// this points into m_msge0 i guess
Msge0 m_msge0;

// this points into m_msge1 i guess
int32_t *m_outlinkIpVector;
SafeBuf m_outlinkTagRecPtrBuf;
SafeBuf m_fakeIpBuf;
char m_hasNoIndexMetaTag;
char m_hasUseFakeIpsMetaTag;

@@ -1080,23 +1003,13 @@ public:
SafeBuf m_fakeTagRecPtrBuf;
TagRec m_fakeTagRec;

//
// diffbot parms for indexing diffbot's json output
//

char *hashJSONFields ( HashTableX *table );
char *hashJSONFields2 ( HashTableX *table , HashInfo *hi , Json *jp ,
bool hashWithoutFieldNames ) ;

char *hashXMLFields ( HashTableX *table );

Json *getParsedJson();
// object that parses the json
Json m_jp;

// related query algo stuff
int64_t m_tlbufTimer;

// flow flags

bool m_computedMetaListCheckSum;
@@ -1104,18 +1017,11 @@ public:
// cachedb related args
bool m_allHashed;

// for getRelatedDocIdsWithTitles() launching msg20s
int32_t m_numMsg20Replies;
int32_t m_numMsg20Requests;

int8_t *m_outlinkHopCountVector;
int32_t m_outlinkHopCountVectorSize;
char m_isFiltered;
int32_t m_urlFilterNum;
int32_t m_numOutlinksAdded;
int32_t m_numOutlinksAddedFromSameDomain;
int32_t m_numOutlinksFiltered;
int32_t m_numOutlinksBanned;
int32_t m_numRedirects;
bool m_isPageParser;
Url m_baseUrl;

@@ -1124,12 +1030,8 @@ public:
char m_linkTextBuf[MAX_LINK_TEXT_LEN];
char m_surroundingTextBuf[MAX_SURROUNDING_TEXT_WIDTH];
char m_rssItemBuf[MAX_RSSITEM_SIZE];
SafeBuf m_gsbuf;

char *m_note;
char *m_imageUrl;
char *m_imageUrl2;
SafeBuf m_imageUrlBuf;
SafeBuf m_imageUrlBuf2;
Query m_query;
Matches m_matches;
// meta description buf

@@ -1138,7 +1040,6 @@ public:
SafeBuf m_htb;
Title m_title;
Summary m_summary;
char m_isCompromised;
char m_isNoArchive;
char m_isErrorPage;
char m_isHijacked;

@@ -1146,8 +1047,6 @@ public:
// stuff
char *m_statusMsg;
Msg4 m_msg4;
bool m_incCount;
bool m_decCount;

bool m_deleteFromIndex;

@@ -1171,33 +1070,19 @@ public:
bool m_check1 ;
bool m_check2 ;
bool m_prepared ;
bool m_updatedCounts ;
bool m_updatedCounts2 ;
bool m_copied1 ;
bool m_updatingSiteLinkInfoTags ;

int64_t m_calledMsg22d ;
bool m_didDelay ;
bool m_didDelayUnregister ;
bool m_calledMsg22e ;
bool m_calledMsg22f ;
bool m_calledMsg25 ;
bool m_calledMsg25b ;
bool m_calledMsg8b ;
bool m_calledMsg40 ;
bool m_calledSections ;
bool m_firstEntry ;
bool m_firstEntry2 ;
bool m_launchedSpecialMsg8a ;
bool m_launchedMsg8a2 ;
bool m_loaded ;

bool m_processedLang ;

bool m_doingConsistencyCheck ;

int32_t m_langIdScore;

int32_t m_dist;

// use to store a \0 list of "titles" of the root page so we can

@@ -1217,7 +1102,6 @@ public:
char m_titleBuf[ROOT_TITLE_BUF_MAX];
int32_t m_titleBufSize;


bool m_setTr ;

void (* m_masterLoop) ( void *state );

@@ -1227,8 +1111,6 @@ public:
bool (* m_callback2) ( void *state );
void *m_state;

bool m_skipIframeExpansion;

// this is non-zero if we decided not to index the doc
int32_t m_indexCode;
@@ -1250,12 +1132,8 @@ public:

int32_t m_maxCacheAge;

char *m_wikiqbuf;
int32_t m_wikiqbufSize;

bool m_registeredSleepCallback;
bool m_addedNegativeDoledbRec;


bool m_hashedTitle;
bool m_hashedMetas;

@@ -1293,7 +1171,6 @@ public:
bool m_setFromUrl;
bool m_setFromDocId;
bool m_freeLinkInfo1;
bool m_freeLinkInfo2;
bool m_contentInjected;

bool m_recycleContent;

@@ -1329,7 +1206,6 @@ public:
int32_t getProbSpam ( int32_t *profile, int32_t plen , int32_t step );
bool m_isRepeatSpammer;
int32_t m_numRepeatSpam;
bool m_totallySpammed;

// frag vector (repeated fragments). 0 means repeated, 1 means not.
// vector is 1-1 with words in the document body.

@@ -1358,10 +1234,6 @@ public:
void logQueryTimingEnd(const char* function, int64_t startTime);

int32_t m_i;
int32_t m_blocked;
void *m_finalState;
void (* m_finalCallback) ( void *state );
int64_t m_cacheStartTime;
};

// . PageParser.cpp uses this class for printing hashed terms out by calling
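The m_VALIDSTART/m_VALIDEND markers above ("DO NOT add validity flags below this line!") exist so XmlDoc::reset() can clear every validity flag with one memset over the member range, as seen in the XmlDoc.cpp hunks earlier. A minimal standalone sketch of that bracketing trick (hypothetical Doc class, illustrative only):

#include <cstdio>
#include <cstring>

class Doc {
public:
	// every flag declared between the two markers is wiped in one call
	char m_VALIDSTART;
	char m_aValid;
	char m_bValid;
	char m_cValid;
	char m_VALIDEND; // marker only; not itself cleared
	void reset ( ) {
		void *p    = &m_VALIDSTART;
		void *pend = &m_VALIDEND;
		memset ( p , 0 , (char *)pend - (char *)p );
	}
};

int main ( ) {
	Doc d;
	d.m_aValid = 1; d.m_bValid = 1; d.m_cValid = 1;
	d.reset();
	printf ( "%d %d %d\n" , d.m_aValid , d.m_bValid , d.m_cValid ); // 0 0 0
	return 0;
}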
@@ -431,9 +431,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashCountry failed", __FILE__,__func__, __LINE__);
return NULL;
}

// BR 20160117 removed: if ( ! hashSiteNumInlinks( table ) ) return NULL;
// BR 20160117 removed: if ( ! hashTagRec ( table ) ) return NULL;

// BR 20160106 removed: if ( ! hashAds ( table ) ) return NULL;
// BR 20160106 removed: if ( ! hashSubmitUrls ( table ) ) return NULL;
if ( ! hashIsAdult ( table ) )

@@ -445,10 +443,6 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
// has gbhasthumbnail:1 or 0
// BR 20160106 removed: if ( ! hashImageStuff ( table ) ) return NULL;

// . hash sectionhash:xxxx terms
// . diffbot still needs to hash this for voting info
// BR 20160106 removed: if ( ! hashSections ( table ) ) return NULL;

// now hash the terms sharded by termid and not docid here since they
// just set a special bit in posdb key so Rebalance.cpp can work.
// this will hash the content checksum which we need for deduping

@@ -477,17 +471,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
return (char *)1;
}

// hash json fields
if ( *ct == CT_JSON ) {
// this hashes both with and without the fieldname
// BR 20160107 removed: hashJSONFields ( table );
goto skip;
}

// same for xml now, so we can search for field:value like w/ json
if ( *ct == CT_XML ) {
// this hashes both with and without the fieldname
// BR 20160107 removed: hashXMLFields ( table );
if ( *ct == CT_JSON || *ct == CT_XML ) {
goto skip;
}

@@ -579,18 +563,8 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashMetaTags failed", __FILE__,__func__, __LINE__);
return NULL;
}

/*
BR 20160220 removed.
if ( ! hashMetaZip ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashMetaZip failed", __FILE__,__func__, __LINE__);
return NULL;
}
*/
// BR 20160107 removed: if ( ! hashCharset ( table ) ) return NULL;
// BR 20160107 removed: if ( ! hashRSSInfo ( table ) ) return NULL;
if ( ! hashPermalink ( table ) )

if ( ! hashPermalink ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashPermaLink failed", __FILE__,__func__, __LINE__);
return NULL;

@@ -619,8 +593,6 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
}




bool XmlDoc::setSpiderStatusDocMetaList ( SafeBuf *jd , int64_t uqd ) {

// the posdb table
@@ -870,57 +842,6 @@ bool XmlDoc::hashDateNumbers ( HashTableX *tt ) { // , bool isStatusDoc ) {
return true;
}

bool XmlDoc::hashMetaZip ( HashTableX *tt ) {

setStatus ( "hashing meta zip" );

// . set the score based on quality
// . scores are multiplied by 256 to preserve fractions for adding
uint32_t score = *getSiteNumInlinks8() * 256 ;
if ( score <= 0 ) score = 1;
// search for meta date
char buf [ 32 ];
int32_t bufLen = m_xml.getMetaContent ( buf, 32, "zipcode", 7 );
if ( bufLen <= 0 ) bufLen = m_xml.getMetaContent ( buf, 32, "zip",3);
char *p = buf;
char *pend = buf + bufLen ;
if ( bufLen <= 0 ) return true;

// set up the hashing parms
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
//hi.m_prefix = "zipcode";
hi.m_prefix = "gbzipcode";

nextZip:
// . parse out the zip codes, may be multiple ones
// . skip non-digits
while ( p < pend && ! is_digit(*p) ) p++;
// skip if no digits
if ( p == pend ) return true;
// need at least 5 consecutive digits
if ( p + 5 > pend ) return true;
// if not a zip code, skip it
if ( ! is_digit(p[1]) ) { p += 1; goto nextZip; }
if ( ! is_digit(p[2]) ) { p += 2; goto nextZip; }
if ( ! is_digit(p[3]) ) { p += 3; goto nextZip; }
if ( ! is_digit(p[4]) ) { p += 4; goto nextZip; }
// do we have too many consectuive digits?
if ( p + 5 != pend && is_digit(p[5]) ) {
// if so skip this whole string of digits
p += 5; while ( p < pend && is_digit(*p) ) p++;
goto nextZip;
}

// 90210 --> 90 902 9021 90210
for ( int32_t i = 0 ; i <= 3 ; i++ )
// use prefix as description
if ( ! hashString ( p,5-i,&hi ) ) return false;
p += 5;
goto nextZip;
}

// returns false and sets g_errno on error
bool XmlDoc::hashContentType ( HashTableX *tt ) {

@@ -1788,13 +1709,6 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
return true;
}


// . returns false and sets g_errno on error
bool XmlDoc::hashSections ( HashTableX *tt ) {
// BR 20160106: No longer store xpath-hashes in posdb as we do not use them.
return true;
}

// . returns false and sets g_errno on error
bool XmlDoc::hashIncomingLinkText ( HashTableX *tt ,
bool hashAnomalies ,
@@ -1981,186 +1895,6 @@ bool XmlDoc::hashNeighborhoods ( HashTableX *tt ) {
goto loop;
}


// . returns false and sets g_errno on error
bool XmlDoc::hashRSSInfo ( HashTableX *tt ) {

setStatus ( "hashing rss info" );

uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) { char *xx=NULL;*xx=0; }

// . finally hash in the linkText terms from the LinkInfo
// . the LinkInfo class has all the terms of hashed anchor text for us
// . if we're using an old TitleRec linkTermList is just a ptr to
// somewhere in TitleRec
// . otherwise, we generated it from merging a bunch of LinkInfos
// and storing them in this new TitleRec
LinkInfo *linkInfo = getLinkInfo1();

// get the xml of the first rss/atom item/entry referencing this url
Xml xml;
// . returns NULL if no item xml
// . this could also be a "channel" blurb now, so we index channel pgs
if ( ! linkInfo->getItemXml ( &xml , m_niceness ) ) return false;

if ( xml.isEmpty() )
// hash gbrss:0
return hashRSSTerm ( tt , false );

// parser info msg
//if ( m_pbuf ) {
// m_pbuf->safePrintf(
// "<br><b>--BEGIN RSS/ATOM INFO HASH--</b><br><br>");
//}

// hash nothing if not a permalink and eliminating "menus"
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;

// . IMPORTANT: you must be using the new link algo, so turn it on
// in the spider controls. this allows us to include LinkTexts from
// the same IP in our LinkInfo class in the TitleRec.
// . is it rss or atom? both use title tag, so doesn't matter
// . get the title tag
bool isHtmlEncoded;
int32_t titleLen;
char *title = xml.getRSSTitle ( &titleLen , &isHtmlEncoded );
char c = 0;

// sanity check
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }

bool hashIffUnique = true;
// but if we had no content because we were an mp3 or whatever,
// do not worry about avoiding double hashing
if ( size_utf8Content <= 0 ) hashIffUnique = false;

// decode it?
// should we decode it? if they don't use [CDATA[]] then we should
// ex: http://www.abc.net.au/rn/podcast/feeds/lawrpt.xml has CDATA,
// but most other feeds do not use it
if ( isHtmlEncoded && title && titleLen > 0 ) {
// it is html encoded so that the <'s are encoded to &lt;'s so
// we must decode them back. this could turn latin1 into utf8
// though? no, because the &'s should have been encoded, too!
int32_t newLen =htmlDecode(title,title,titleLen,false,m_niceness);
// make sure we don't overflow the buffer
if ( newLen > titleLen ) { char *xx = NULL; *xx = 0; }
// reassign the length
titleLen = newLen;
// NULL terminate it
c = title[titleLen];
title[titleLen] = '\0';
}

// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_TITLE;
hi.m_desc = "rss title";

// . hash the rss title
// . only hash the terms if they are unique to stay balanced with docs
// that are not referenced by an rss feed
bool status = hashString ( title,titleLen,&hi ) ;
// pop the end back just in case
if ( c ) title[titleLen] = c;
// return false with g_errno set on error
if ( ! status ) return false;

// get the rss description
int32_t descLen;
char *desc = xml.getRSSDescription ( &descLen , &isHtmlEncoded );

// for adavanced hashing
Xml xml2;
Words w;
//Scores scores;
Words *wordsPtr = NULL;
//Scores *scoresPtr = NULL;
c = 0;
// should we decode it? if they don't use [CDATA[]] then we should
// ex: http://www.abc.net.au/rn/podcast/feeds/lawrpt.xml has CDATA,
// but most other feeds do not use it
if ( isHtmlEncoded && desc && descLen > 0 ) {
// it is html encoded so that the <'s are encoded to &lt;'s so
// we must decode them back. this could turn latin1 into utf8
// though? no, because the &'s should have been encoded, too!
int32_t newLen = htmlDecode(desc,desc,descLen,false,m_niceness);
// make sure we don't overflow the buffer
if ( newLen > descLen ) { char *xx = NULL; *xx = 0; }
// reassign the length
descLen = newLen;
}

// NULL terminate it
if ( desc ) {
c = desc[descLen];
desc[descLen] = '\0';
// set the xml class from the decoded html
if ( !xml2.set( desc, descLen, m_version, m_niceness, *ct ) ) {
return false;
}

// set the words class from the xml, returns false and sets
// g_errno on error
if ( !w.set( &xml2, true, true ) ) {
return false;
}

// pass it in to TermTable::hash() below
wordsPtr = &w;
}

// update hash parms
hi.m_tt = tt;
hi.m_desc = "rss body";
hi.m_hashGroup = HASHGROUP_BODY;
// . hash the rss/atom description
// . only hash the terms if they are unique to stay balanced with docs
// that are not referenced by an rss feed
status = hashString ( desc, descLen, &hi );
// pop the end back just in case
if ( c ) desc[descLen] = c;
// return false with g_errno set
if ( ! status ) return false;

// hash gbrss:1
if ( ! hashRSSTerm ( tt , true ) ) return false;

// parser info msg
//if ( m_pbuf ) {
// m_pbuf->safePrintf("<br><b>--END RSS/ATOM INFO HASH--"
// "</b><br><br>");
//}
return true;
}

bool XmlDoc::hashRSSTerm ( HashTableX *tt , bool inRSS ) {
// hash gbrss:0 or gbrss:1
char *value;
if ( inRSS ) value = "1";
else value = "0";

// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_prefix = "gbinrss";
hi.m_hashGroup = HASHGROUP_INTAG;

// returns false and sets g_errno on error
if ( ! hashString(value,1,&hi ) ) return false;

// hash gbisrss:1 if we are an rss page ourselves
if ( *getIsRSS() ) value = "1";
else value = "0";
// update hash parms
hi.m_prefix = "gbisrss";
// returns false and sets g_errno on error
if ( ! hashString(value,1,&hi) ) return false;
return true;
}

// . we now do the title hashing here for newer titlerecs, version 80+, rather
// than use the <index> block in the ruleset for titles.
// . this is not to be confused with hashing the title: terms which still
@@ -2426,170 +2160,6 @@ bool XmlDoc::hashCountry ( HashTableX *tt ) {
return true;
}

bool XmlDoc::hashSiteNumInlinks ( HashTableX *tt ) {

setStatus ( "hashing site num inlinks" );

char s[32];
int32_t slen = sprintf(s, "%"INT32"", (int32_t)*getSiteNumInlinks() );

// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbsitenuminlinks";

// hack test
// slen = sprintf(s,"%"UINT32"",
// ((uint32_t)m_firstUrl.getUrlHash32()) % 1000);
// log("xmldoc: sitenuminlinks for %s is %s",m_firstUrl.getUrl(),s);

return hashString ( s, slen, &hi );
}

bool XmlDoc::hashCharset ( HashTableX *tt ) {

setStatus ( "hashing charset" );

char s[128]; // charset string
int32_t slen;

// hash the charset as a string
if ( ! get_charset_str(*getCharset()))
slen = sprintf(s, "unknown");
else
slen = sprintf(s, "%s", get_charset_str(*getCharset()));

// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbcharset";

if ( ! hashString ( s,slen, &hi ) ) return false;

// hash charset as a number
slen = sprintf(s, "%d", *getCharset());

return hashString ( s,slen, &hi ) ;
}


// . only hash certain tags (single byte scores and ST_COMMENT)
// . do not hash clocks, ST_SITE, ST_COMMENT
// . term = gbtag:blog1 score=0-100
// . term = gbtag:blog2 score=0-100
// . term = gbtag:english1 score=0-100
// . term = gbtag:pagerank1 score=0-100, etc. ...
// . term = gbtagmeta:"this site"(special hashing,ST_META,score=qlty)
// . later we can support query like gbtag:english1>30
bool XmlDoc::hashTagRec ( HashTableX *tt ) {

setStatus ( "hashing tag rec" );

//char *field = "gbtag:";
//int32_t fieldlen = gbstrlen(field);
//bool retval = true;

// . this tag rec does not have the ST_SITE tag in it to save space
// . it does not have clocks either?
TagRec *gr = getTagRec();

// count occurence of each tag id
//int16_t count [ LAST_TAG ];
//memset ( count , 0 , 2 * LAST_TAG );

// loop over all tags in the title rec
for ( Tag *tag = gr->getFirstTag(); tag ; tag = gr->getNextTag(tag) ) {
// breathe
QUICKPOLL(m_niceness);
// get id
int32_t type = tag->m_type;
// skip tags we are not supposed to index, like
// ST_CLOCK, etc. or anything with a dataSize not 1
if ( ! tag->isIndexable() ) continue;
// hash these metas below
//if ( type == ST_META ) continue;
//if ( tag->isType("meta") ) continue;
// only single byters. this should have been covered by the
// isIndexable() function.
//if ( tag->getTagDataSize() != 1 ) continue;
// get the name
char *str = getTagStrFromType ( type );
// get data size
//uint8_t *data = (uint8_t *)tag->getTagData();
// make it a string
//char dataStr[6];
//sprintf ( dataStr , "%"INT32"",(int32_t)*data );
// skip if has non numbers
//bool num = true;
//for ( int32_t i = 0 ; i < tag->getTagDataSize() ; i++ )
// if ( ! is_digit(tag->getTagData()[i]) ) num = false;
// skip if it has more than just digits, we are not indexing
// strings at this point
//if ( ! num ) continue;
// point to it, should be a NULL terminated string
char *dataStr = tag->getTagData();
// skip if number is too big
//int32_t val = atol ( dataStr );
// boost by one so we can index "0" score
//val++;
// we really only want to index scores from 0-255
//if ( val > 255 ) continue;
// no negatives
//if ( val <= 0 ) continue;
// count occurence
//count [ type ]++;
// . make the term name to hash after the gbtag:
// . we want to hash "gbtag:english3" for example, for the
// ST_ENGLISH tag id.
char prefix[64];
// . do not include the count for the first occurence
// . follows the gbruleset:36 convention
// . index gbtagspam:0 or gbtagspam:1, etc.!!!
//if ( count[type] == 1 )
sprintf ( prefix , "gbtag%s",str);
// assume that is good enough
//char *prefix = tmp;
// store prefix into m_wbuf so XmlDoc::print() works!
//if ( m_pbuf ) {
// int32_t tlen = gbstrlen(tmp);
// m_wbuf.safeMemcpy(tmp,tlen+1);
// prefix = m_wbuf.getBuf() - (tlen+1);
//}
//else
// sprintf ( tmp , "gbtag%s%"INT32"",str,(int32_t)count[type]);
// "unmap" it so when it is hashed it will have the correct
// 8-bit score. IndexList.cpp will convert it back to 8 bits
// in IndexList::set(table), which sets our termlist from
// this "table".
//int32_t score = score8to32 ( val );
// we already incorporate the score as a string when we hash
// gbtagtagname:tagvalue so why repeat it?
//int32_t score = 1;

// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_prefix = prefix;
hi.m_hashGroup = HASHGROUP_INTAG;

// meta is special now
if ( tag->isType("meta") ) {
hi.m_prefix = NULL;
}

// hash it. like "gbtagenglish:1" with a score of 1, etc.
// or "gbtagspam:33" with a score of 33. this would also
// hash gbtagclock:0xfe442211 type things as well.
int32_t dlen = gbstrlen(dataStr);
if ( ! hashString ( dataStr,dlen,&hi ) ) return false;
}

return true;
}


bool XmlDoc::hashPermalink ( HashTableX *tt ) {

setStatus ( "hashing is permalink" );

@@ -2607,15 +2177,6 @@ bool XmlDoc::hashPermalink ( HashTableX *tt ) {
return hashString ( s,1,&hi );
}


bool XmlDoc::hashVectors ( HashTableX *tt ) {

setStatus ( "hashing vectors" );

return true;
}


// returns false and sets g_errno on error
bool XmlDoc::hashIsAdult ( HashTableX *tt ) {
@@ -3399,20 +2960,6 @@ bool XmlDoc::hashNumberForSorting ( char *beginBuf ,
// negative sign?
if ( p > beginBuf && p[-1] == '-' ) p--;

// BR 20160108: Removed all float numbers as we don't plan to use them
// . convert it to a float
// . this now allows for commas in numbers like "1,500.62"
// float f = atof2 ( p , bufEnd - p );

// if ( ! hashNumberForSortingAsFloat ( f , hi , "gbsortby" ) )
// return false;

// also hash in reverse order for sorting from low to high
// f = -1.0 * f;

// if ( ! hashNumberForSortingAsFloat ( f , hi , "gbrevsortby" ) )
// return false;

//
// also hash as an int, 4 byte-integer so our lastSpidered timestamps
// dont lose 128 seconds of resolution

@@ -3433,116 +2980,6 @@ bool XmlDoc::hashNumberForSorting ( char *beginBuf ,
return true;
}




bool XmlDoc::hashNumberForSortingAsFloat ( float f , HashInfo *hi , char *sortByStr ) {

// prefix is something like price. like the meta "name" or
// the json name with dots in it like "product.info.price" or something
int64_t nameHash = 0LL;
int32_t nameLen = 0;
if ( hi->m_prefix ) nameLen = gbstrlen ( hi->m_prefix );
if ( hi->m_prefix && nameLen )
nameHash = hash64Lower_utf8_nospaces( hi->m_prefix , nameLen );
// need a prefix for hashing numbers... for now
else { char *xx=NULL; *xx=0; }

// combine prefix hash with a special hash to make it unique to avoid
// collisions. this is the "TRUE" prefix.
int64_t truePrefix64 = hash64n ( sortByStr ); // "gbsortby");
// hash with the "TRUE" prefix
int64_t ph2 = hash64 ( nameHash , truePrefix64 );

// . now store it
// . use field hash as the termid. normally this would just be
// a prefix hash
// . use mostly fake value otherwise
key144_t k;
g_posdb.makeKey ( &k ,
ph2 ,
0,//docid
0,// word pos #
0,// densityRank , // 0-15
0 , // MAXDIVERSITYRANK
0 , // wordSpamRank ,
0 , //siterank
0 , // hashGroup,
// we set to docLang final hash loop
//langUnknown, // langid
// unless already set. so set to english here
// so it will not be set to something else
// otherwise our floats would be ordered by langid!
// somehow we have to indicate that this is a float
// termlist so it will not be mangled any more.
//langEnglish,
langUnknown,
0 , // multiplier
false, // syn?
false , // delkey?
hi->m_shardByTermId );

//int64_t final = hash64n("products.offerprice",0);
//int64_t prefix = hash64n("gbsortby",0);
//int64_t h64 = hash64 ( final , prefix);
//if ( ph2 == h64 )
// log("hey: got offer price");

// now set the float in that key
g_posdb.setFloat ( &k , f );

// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
// so that we can b-step into a posdb list and make sure
// we are aligned on a 6 byte or 12 byte key, since they come
// in both sizes. but for this, hack it off to tell
// addTable144() that we are a special posdb key, a "numeric"
// key that has a float stored in it. then it will NOT
// set the siterank and langid bits which throw our sorting
// off!!
g_posdb.setAlignmentBit ( &k , 0 );

// sanity
float t = g_posdb.getFloat ( &k );
if ( t != f ) { char *xx=NULL;*xx=0; }

HashTableX *dt = hi->m_tt;

// the key may indeed collide, but that's ok for this application
if ( ! dt->addTerm144 ( &k ) )
return false;

if ( ! m_wts )
return true;

// store in buffer
char buf[128];
snprintf(buf,126,"%s:%s float32=%f",sortByStr,hi->m_prefix,f);
int32_t bufLen = gbstrlen(buf);

// add to wts for PageParser.cpp display
// store it
if ( ! storeTerm ( buf,
bufLen,
ph2,
hi,
0, // word#, i,
0, // wordPos
0,// densityRank , // 0-15
0, // MAXDIVERSITYRANK,//phrase
0, // ws,
0, // hashGroup,
//true,
&m_wbuf,
m_wts,
// a hack for display in wts:
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
langUnknown ,
k) )
return false;

return true;
}

bool XmlDoc::hashNumberForSortingAsInt32 ( int32_t n , HashInfo *hi , char *sortByStr ) {

// prefix is something like price. like the meta "name" or
@@ -3662,22 +3099,6 @@ bool XmlDoc::hashNumberForSortingAsInt32 ( int32_t n , HashInfo *hi , char *sort

#include "Json.h"

char *XmlDoc::hashJSONFields ( HashTableX *table ) {

setStatus ( "hashing json fields" );

HashInfo hi;
hi.m_tt = table;
hi.m_desc = "json object";

// use new json parser
Json *jp = getParsedJson();
if ( ! jp || jp == (void *)-1 ) return (char *)jp;

return hashJSONFields2 ( table , &hi , jp , true );
}


char *XmlDoc::hashJSONFields2 ( HashTableX *table ,
HashInfo *hi , Json *jp ,
bool hashWithoutFieldNames ) {

@@ -3815,63 +3236,3 @@ char *XmlDoc::hashJSONFields2 ( HashTableX *table ,

return (char *)0x01;
}

char *XmlDoc::hashXMLFields ( HashTableX *table ) {

setStatus ( "hashing xml fields" );

HashInfo hi;
hi.m_tt = table;
hi.m_desc = "xml object";
hi.m_hashGroup = HASHGROUP_BODY;


Xml *xml = getXml();
int32_t n = xml->getNumNodes();
XmlNode *nodes = xml->getNodes ();

SafeBuf nameBuf;

// scan the xml nodes
for ( int32_t i = 0 ; i < n ; i++ ) {

// breathe
QUICKPOLL(m_niceness);

// . skip if it's a tag not text node skip it
// . we just want the "text" nodes
if ( nodes[i].isTag() ) continue;

//if(!strncmp(nodes[i].m_node,"Congress%20Presses%20Uber",20))
// log("hey:hy");

// assemble the full parent name
// like "tag1.tag2.tag3"
nameBuf.reset();
xml->getCompoundName ( i , &nameBuf );

// this is \0 terminated
char *tagName = nameBuf.getBufStart();

// get the utf8 text
char *val = nodes[i].m_node;
int32_t vlen = nodes[i].m_nodeLen;

// index like "title:whatever"
if ( tagName && tagName[0] ) {
hi.m_prefix = tagName;
hashString ( val , vlen , &hi );
}

// hash without the field name as well
hi.m_prefix = NULL;
hashString ( val , vlen , &hi );
}

return (char *)0x01;
}