//-*- coding: utf-8 -*-

#include "gb-include.h"

#include "XmlDoc.h"
#include "CountryCode.h" // g_countryCode
#include "Collectiondb.h"
#include "Speller.h"
#include "Synonyms.h"
#include "Process.h"
#include "ip.h"
#include "Posdb.h"
#include "Conf.h"
#include "UrlBlockCheck.h"
#include "Domains.h"

#ifdef _VALGRIND_
#include <valgrind/memcheck.h>
#endif

// a ptr to HashInfo is passed to hashString() and hashWords()
class HashInfo {
public:
	HashInfo() {
		m_tt = NULL;
		m_prefix = NULL;
		m_desc = NULL;
		m_date = 0;
		// should we do sharding based on termid and not the usual docid???
		// in general this is false, but for checksum we want to shard
		// by the checksum and not docid to avoid having to do a
		// gbchecksum:xxxxx search on ALL shards. much more efficient.
		m_shardByTermId = false;
		m_hashGroup = -1;
		m_useCountTable = true;
		m_useSections = true;
		m_startDist = 0;

		// BR 20160108: Now default to false since we will only use it for
		// very specific cases like spiderdate, which is for debugging only.
		// If true, creates 4 posdb entries for numbers in posdb, e.g.
		// gbsortbyint:gbisadultint32, gbrevsortbyint:gbisadultint32
		// gbsortby:gbisadultfloat32, gbrevsortby:gbisadultfloat32
		m_createSortByForNumbers = false;
		m_hashNumbers = true;
		m_filterUrlIndexableWords = false;
		m_linkerSiteRank = 0;
	}

	class HashTableX *m_tt;
	const char *m_prefix;
	// "m_desc" should detail the algorithm
	const char *m_desc;
	int32_t m_date;
	bool m_shardByTermId;
	char m_linkerSiteRank;
	char m_hashGroup;
	int32_t m_startDist;
	bool m_useCountTable;
	bool m_useSections;
	bool m_createSortByForNumbers;
	bool m_hashNumbers;
	bool m_filterUrlIndexableWords; //Do special filtering on words in url, eg. exclude "com" before path
};
|
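// typical usage (see hashTitle(), hashUrl(), etc. below): the caller fills in
// a HashInfo on the stack, points m_tt at the destination posdb term table,
// optionally sets a prefix such as "title" or "ip" and a HASHGROUP_* value,
// and then passes it to hashString()/hashSingleTerm()/hashWords()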
|
|
|
|
|
|
|
static bool storeTerm ( const char *s ,
|
|
int32_t slen ,
|
|
int64_t termId ,
|
|
HashInfo *hi ,
|
|
int32_t wordNum ,
|
|
int32_t wordPos ,
|
|
char densityRank,
|
|
char diversityRank ,
|
|
char wordSpamRank ,
|
|
char hashGroup,
|
|
//bool isPhrase ,
|
|
SafeBuf *wbuf ,
|
|
HashTableX *wts ,
|
|
char synSrc ,
|
|
char langId ,
|
|
posdbkey_t key ) {
|
|
|
|
// store prefix
|
|
int32_t poff = wbuf->length();
|
|
// shortcut
|
|
const char *p = hi->m_prefix;
|
|
// add the prefix too!
|
|
if ( p && ! wbuf->safeMemcpy(p,strlen(p)+1)) return false;
|
|
// none?
|
|
if ( ! p ) poff = -1;
|
|
|
|
|
|
// store description
|
|
int32_t doff = wbuf->length();
|
|
// shortcut
|
|
const char *d = hi->m_desc;
|
|
// add the desc too!
|
|
if ( d && ! wbuf->safeMemcpy(d,strlen(d)+1) ) return false;
|
|
// none?
|
|
if ( ! d ) doff = -1;
|
|
|
|
// store term
|
|
int32_t toff = wbuf->length();
|
|
// add it
|
|
if ( ! wbuf->safeMemcpy ( s , slen ) ) return false;
|
|
// make this
|
|
TermDebugInfo ti;
|
|
ti.m_termOff = toff;
|
|
ti.m_termLen = slen;
|
|
ti.m_descOff = doff;
|
|
ti.m_prefixOff = poff;
|
|
ti.m_date = hi->m_date;
|
|
ti.m_shardByTermId = hi->m_shardByTermId;
|
|
ti.m_termId = termId;
|
|
//ti.m_weight = 1.0;
|
|
//ti.m_spam = -1.0;
|
|
ti.m_diversityRank = diversityRank;
|
|
ti.m_densityRank = densityRank;
|
|
ti.m_wordSpamRank = wordSpamRank;
|
|
ti.m_hashGroup = hashGroup;
|
|
ti.m_wordNum = wordNum;
|
|
ti.m_wordPos = wordPos;
|
|
ti.m_langId = langId;
|
|
ti.m_key = key;
|
|
|
|
// save for printing out an asterisk
|
|
ti.m_synSrc = synSrc; // isSynonym = isSynonym;
|
|
|
|
// get language bit vec
|
|
ti.m_langBitVec64 = g_speller.getLangBits64(termId);
|
|
|
|
// make the key
|
|
key96_t k;
|
|
k.n1 = 0; // date
|
|
k.n0 = termId;
|
|
|
|
// store it
|
|
return wts->addKey ( &k , &ti ) ;
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
//
|
|
// . hash terms that are sharded by TERMID not DOCID!!
|
|
//
|
|
// . returns false and sets g_errno on error
|
|
// . these terms are stored in indexdb, but all terms with the same
|
|
// termId reside in one and only one group. whereas normally the records
|
|
// are split based on docid and every group gets 1/nth of the termlist.
|
|
// . we do this "no splitting" so that only one disk seek is required, and
|
|
// we know the termlist is small, or the termlist is being used for spidering
|
|
// or parsing purposes and is usually not sent across the network.
|
|
bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
|
|
// constructor should set to defaults automatically
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = tt;
|
|
// usually we shard by docid, but these are terms we shard by termid!
|
|
hi.m_shardByTermId = true;
|
|
|
|
if ((size_utf8Content - 1) > 0) {
|
|
// for exact content deduping
|
|
setStatus("hashing gbcontenthash (deduping) no-split keys");
|
|
|
|
// this should be ready to go and not block!
|
|
int64_t *pch64 = getExactContentHash64();
|
|
if (!pch64 || pch64 == (void *)-1) { g_process.shutdownAbort(true); }
|
|
|
|
char cbuf[64];
|
|
int32_t clen = sprintf(cbuf, "%" PRIu64, (uint64_t)*pch64);
|
|
hi.m_prefix = "gbcontenthash";
|
|
if (!hashString(cbuf, clen, &hi)) return false;
|
|
}
|
|
|
|
// now hash the site
|
|
setStatus ( "hashing no-split SiteGetter terms");
|
|
|
|
Url *fu = getFirstUrl();
|
|
char *host = fu->getHost ();
|
|
|
|
//
|
|
// HASH terms for SiteGetter.cpp
|
|
//
|
|
// these are now no-split terms
|
|
//
|
|
char *s = fu->getUrl ();
|
|
int32_t slen = fu->getUrlLen();
|
|
// . this termId is used by SiteGetter.cpp for determining subsites
|
|
// . matches what is in SiteGet::getSiteList()
|
|
// for www.xyz.com/a/ HASH www.xyz.com
|
|
// for www.xyz.com/a/b/ HASH www.xyz.com/a/
|
|
// for www.xyz.com/a/b/c/ HASH www.xyz.com/a/b/
|
|
bool add = true;
|
|
// we only hash this for urls that end in '/'
|
|
if ( s[slen-1] != '/' ) add = false;
|
|
// and no cgi
|
|
if ( fu->isCgi() ) add = false;
|
|
// skip if root
|
|
if ( fu->getPathLen() <= 1 ) add = false;
|
|
// sanity check
|
|
if ( ! m_linksValid ) { g_process.shutdownAbort(true); }
|
|
// . skip if we have no subdirectory outlinks
|
|
// . that way we do not confuse all the pages in dictionary.com or
|
|
// wikipedia.org as subsites!!
|
|
if ( ! m_links.hasSubdirOutlink() ) add = false;
|
|
// hash it
|
|
if ( add ) {
|
|
// remove the last path component
|
|
char *end2 = s + slen - 2;
|
|
// back up over last component
|
|
for ( ; end2 > fu->getPath() && *end2 != '/' ; end2-- ) ;
|
|
// hash that part of the url
|
|
hi.m_prefix = "siteterm";
|
|
if ( ! hashSingleTerm ( host,end2-host,&hi) ) return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// . returns -1 if blocked, returns NULL and sets g_errno on error
|
|
// . "sr" is the tagdb Record
|
|
// . "ws" store the terms for PageParser.cpp display
|
|
char *XmlDoc::hashAll(HashTableX *table) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "BEGIN");
|
|
|
|
setStatus("hashing document");
|
|
|
|
if (m_allHashed) {
|
|
return (char *)1;
|
|
}
|
|
|
|
// sanity checks
|
|
if (table->getKeySize() != 18 || table->getDataSize() != 4) {
|
|
g_process.shutdownAbort(true);
|
|
}
|
|
|
|
// ptr to term = 4 + score = 4 + ptr to sec = 4
|
|
if (m_wts && (m_wts->getKeySize() != 12 || m_wts->getDataSize() != sizeof(TermDebugInfo))) {
|
|
g_process.shutdownAbort(true);
|
|
}
|
|
|
|
uint8_t *ct = getContentType();
|
|
if (!ct) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, getContentType failed");
|
|
return NULL;
|
|
}
|
|
|
|
// BR 20160127: Never index JSON and XML content
|
|
if (*ct == CT_JSON || *ct == CT_XML) {
|
|
// For XML (JSON should not get here as it should be filtered out during spidering)
|
|
// store the URL as the only thing in posdb so we are able to find it, and
|
|
// eventually ban it.
|
|
if (!hashUrl(table, true)) { // urlOnly (skip IP and term generation)
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashUrl failed");
|
|
return NULL;
|
|
}
|
|
m_allHashed = true;
|
|
return (char *)1;
|
|
}
|
|
|
|
unsigned char *hc = (unsigned char *)getHopCount();
|
|
if (!hc || hc == (void *)-1) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, getHopCount returned -1");
|
|
return (char *)hc;
|
|
}
|
|
|
|
// need this for hashing
|
|
HashTableX *cnt = getCountTable();
|
|
if (!cnt) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, getCountTable failed");
|
|
return (char *)cnt;
|
|
}
|
|
if (cnt == (void *)-1) {
|
|
g_process.shutdownAbort(true);
|
|
}
|
|
|
|
// and this
|
|
Links *links = getLinks();
|
|
if (!links) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, getLinks failed");
|
|
return (char *)links;
|
|
}
|
|
if (links == (Links *)-1) {
|
|
g_process.shutdownAbort(true);
|
|
}
|
|
|
|
char *wordSpamVec = getWordSpamVec();
|
|
if (!wordSpamVec) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, getWordSpamVec failed");
|
|
return wordSpamVec;
|
|
}
|
|
if (wordSpamVec == (void *)-1) {
|
|
g_process.shutdownAbort(true);
|
|
}
|
|
|
|
char *fragVec = getFragVec();
|
|
if (!fragVec) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, getFragVec failed");
|
|
return fragVec;
|
|
}
|
|
if (fragVec == (void *)-1) {
|
|
g_process.shutdownAbort(true);
|
|
}
|
|
|
|
// why do we need this?
|
|
if ( m_wts ) {
|
|
uint8_t *lv = getLangVector();
|
|
if (!lv) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, getLangVector failed");
|
|
return (char *)lv;
|
|
}
|
|
if (lv == (void *)-1) {
|
|
g_process.shutdownAbort(true);
|
|
}
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, getCollRec failed");
|
|
return NULL;
|
|
}
|
|
|
|
// do not repeat this if the cachedb storage call blocks
|
|
m_allHashed = true;
|
|
|
|
// reset distance cursor
|
|
m_dist = 0;
|
|
|
|
if (!hashContentType(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashContentType failed");
|
|
return NULL;
|
|
}
|
|
|
|
if (!hashUrl(table, false)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashUrl failed");
|
|
return NULL;
|
|
}
|
|
|
|
if (!hashLanguage(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashLanguage failed");
|
|
return NULL;
|
|
}
|
|
|
|
if (!hashCountry(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashCountry failed");
|
|
return NULL;
|
|
}
|
|
|
|
// now hash the terms sharded by termid and not docid here since they
|
|
// just set a special bit in posdb key so Rebalance.cpp can work.
|
|
// this will hash the content checksum which we need for deduping
|
|
// which we use for diffbot custom crawls as well.
|
|
if (!hashNoSplit(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashNoSplit failed");
|
|
return NULL;
|
|
}
|
|
|
|
// MDW: i think we just inject empty html with a diffbotreply into
|
|
// global index now, so don't need this... 9/28/2014
|
|
|
|
// stop indexing xml docs in the global index unless this is a json object,
// in which case it is hashed above in the call to hashJSON(). this will
// decrease disk usage by about half; posdb* files are pretty big.
|
|
if (!cr->m_indexBody) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, !indexDoc");
|
|
return (char *)1;
|
|
}
|
|
|
|
bool *ini = getIsNoIndex();
|
|
if (ini == nullptr || ini == (bool*)-1) {
|
|
// must not be blocked
|
|
gbshutdownLogicError();
|
|
}
|
|
|
|
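// for newer titlerec versions, documents flagged noindex stop here: only the
// content-type, url, language, country and no-split terms hashed above get
// indexed, nothing from the body, title, meta tags or link text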
if (*ini && m_version > 126) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, noindex");
|
|
return (char *)1;
|
|
}
|
|
|
|
if ((size_utf8Content - 1) <= 0) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, contentLen == 0");
|
|
return (char *)1;
|
|
}
|
|
|
|
// hash the body of the doc first so m_dist is 0 to match
|
|
// the rainbow display of sections
|
|
if (!hashBody2(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashBody2 failed");
|
|
return NULL;
|
|
}
|
|
|
|
// hash the title now too so neighborhood singles have more
|
|
// to match. plus, we only hash these title terms iff they
|
|
// are not already in the hash table, so as to avoid hashing
|
|
// repeated title terms because we do not do spam detection
|
|
// on them. thus, we need to hash these first before anything
|
|
// else. give them triple the body score
|
|
if (!hashTitle(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashTitle failed");
|
|
return NULL;
|
|
}
|
|
|
|
// . hash the keywords tag, limited to first 2k of them so far
|
|
// . hash above the neighborhoods so the neighborhoods only index
|
|
// what is already in the hash table
|
|
if (!hashMetaKeywords(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaKeywords failed");
|
|
return NULL;
|
|
}
|
|
|
|
// then hash the incoming link text, NO ANOMALIES, because
|
|
// we index the single words in the neighborhoods next, and
|
|
// we had songfacts.com coming up for the 'street light facts'
|
|
// query because it had a bunch of anomalous inlink text.
|
|
if (!hashIncomingLinkText(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashIncomingLinkText failed");
|
|
return NULL;
|
|
}
|
|
|
|
// then the meta summary and description tags with half the score of
|
|
// the body, and only hash a term if was not already hashed above
|
|
// somewhere.
|
|
if (!hashMetaSummary(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaSummary failed");
|
|
return NULL;
|
|
}
|
|
|
|
// BR 20160220
|
|
// Store value of meta tag "geo.placename" to help aid searches for
|
|
// location specific sites, e.g. 'Restaurant in London'
|
|
if (!hashMetaGeoPlacename(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaGeoPlacename failed");
|
|
return NULL;
|
|
}
|
|
|
|
// this will only increment the scores of terms already in the table
// because the neighborhoods are not technically part of the document
// itself and we do not want to ruin our precision
|
|
if (!hashNeighborhoods(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashNeighborhoods failed");
|
|
return NULL;
|
|
}
|
|
|
|
if (!hashLinks(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashLinks failed");
|
|
return NULL;
|
|
}
|
|
|
|
if (!hashDateNumbers(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashDateNumbers failed");
|
|
return NULL;
|
|
}
|
|
|
|
if (!hashMetaTags(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaTags failed");
|
|
return NULL;
|
|
}
|
|
|
|
// hash gblang:de last for parsing consistency
|
|
if (!hashLanguageString(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashLanguageString failed");
|
|
return NULL;
|
|
}
|
|
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, OK");
|
|
return (char *)1;
|
|
}
|
|
|
|
// returns false and sets g_errno on error
|
|
bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing meta tags" );
|
|
|
|
// assume it's empty
|
|
char buf [ 32*1024 ];
|
|
int32_t bufLen = 32*1024 - 1;
|
|
buf[0] = '\0';
|
|
int32_t n = m_xml.getNumNodes();
|
|
XmlNode *nodes = m_xml.getNodes();
|
|
|
|
// set up the hashing parms
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INMETATAG;
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "custom meta tag";
|
|
|
|
// find the first meta summary node
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// continue if not a meta tag
|
|
if ( nodes[i].m_nodeId != TAG_META ) continue;
|
|
// only get content for <meta name=..> not <meta http-equiv=..>
|
|
int32_t tagLen;
|
|
char *tag = m_xml.getString ( i , "name" , &tagLen );
|
|
char tagLower[128];
|
|
int32_t j ;
|
|
int32_t code;
|
|
// skip if empty
|
|
if ( ! tag || tagLen <= 0 ) continue;
|
|
// make tag name lower case and do not allow bad chars
|
|
if ( tagLen > 126 ) tagLen = 126 ;
|
|
to_lower3_a ( tag , tagLen , tagLower );
|
|
for ( j = 0 ; j < tagLen ; j++ ) {
|
|
// bail if has unacceptable chars
|
|
if ( ! is_alnum_a ( tag[j] ) &&
|
|
tag[j] != '-' &&
|
|
tag[j] != '_' &&
|
|
tag[j] != '.' ) break;
|
|
// convert to lower
|
|
tagLower[j] = to_lower_a ( tag[j] );
|
|
}
|
|
// skip this meta if had unacceptable chars
|
|
if ( j < tagLen ) continue;
|
|
// is it recognized?
|
|
code = getFieldCode ( tag , tagLen );
|
|
|
|
// . do not allow reserved tag names
|
|
// . title,url,suburl,
|
|
if ( code != FIELD_GENERIC ) continue;
|
|
// this is now reserved
|
|
// do not hash keyword, keywords, description, or summary metas
|
|
// because that is done in hashRange() below based on the
|
|
// tagdb (ruleset) record
|
|
if ((tagLen== 7&&strncasecmp(tag,"keyword" , 7)== 0)||
|
|
(tagLen== 7&&strncasecmp(tag,"summary" , 7)== 0)||
|
|
(tagLen== 8&&strncasecmp(tag,"keywords" , 8)== 0)||
|
|
(tagLen==11&&strncasecmp(tag,"description",11)== 0) )
|
|
continue;
|
|
|
|
|
|
// BR 20160107: Only hash certain custom meta tags and ignore the rest
|
|
if(
|
|
(strncasecmp(tag,"subject", 7) != 0) &&
|
|
(strncasecmp(tag,"abstract", 8) != 0) &&
|
|
(strncasecmp(tag,"news_keywords", 13) != 0) && // http://www.metatags.org/meta_name_news_keywords
|
|
(strncasecmp(tag,"author", 6) != 0) &&
|
|
(strncasecmp(tag,"title", 5) != 0) &&
|
|
(strncasecmp(tag,"og:title", 8) != 0) &&
|
|
(strncasecmp(tag,"og:description", 14) != 0) &&
|
|
(strncasecmp(tag,"twitter:title", 13) != 0) &&
|
|
(strncasecmp(tag,"twitter:description", 19) != 0) )
|
|
{
|
|
// If none of the above, it is an unwanted meta tag
|
|
continue;
|
|
}
|
|
|
|
// get the content
|
|
int32_t len;
|
|
char *s = m_xml.getString ( i , "content" , &len );
|
|
if ( ! s || len <= 0 ) continue;
|
|
// . ensure not too big for our buffer (keep room for a \0)
|
|
// . TODO: this is wrong, should be len+1 > bufLen,
|
|
// but can't fix w/o resetting the index (COME BACK HERE
|
|
// and see where we index meta tags besides this place!!!)
|
|
// remove those other places, except... what about keywords
|
|
// and description?
|
|
if ( len+1 >= bufLen ) {
|
|
//len = bufLen - 1;
|
|
// assume no punct to break on!
|
|
len = 0;
|
|
// only cut off at punctuation
|
|
char *p = s;
|
|
char *pend = s + len;
|
|
char *last = NULL;
|
|
int32_t size ;
|
|
for ( ; p < pend ; p += size ) {
|
|
// skip if utf8 char
|
|
size = getUtf8CharSize(*p);
|
|
// skip if 2+ bytes
|
|
if ( size > 1 ) continue;
|
|
// skip if not punct
|
|
if ( is_alnum_a(*p) ) continue;
|
|
// mark it
|
|
last = p;
|
|
}
|
|
if ( last ) len = last - s;
|
|
// this old way was faster...:
|
|
//while ( len > 0 && is_alnum(s[len-1]) ) len--;
|
|
}
|
|
// convert html entities to their chars
|
|
len = saftenTags ( buf , bufLen , s , len );
|
|
// NULL terminate the buffer
|
|
buf[len] = '\0';
|
|
|
|
// Now index the wanted meta tags as normal text without prefix so they
|
|
// are used in user searches automatically.
|
|
hi.m_prefix = NULL;
|
|
|
|
// desc is NULL, prefix will be used as desc
|
|
bool status = hashString ( buf,len,&hi );
|
|
|
|
// bail on error, g_errno should be set
|
|
if ( ! status ) return false;
|
|
|
|
// return false with g_errno set on error
|
|
//if ( ! hashNumberForSorting ( buf , bufLen , &hi ) )
|
|
// return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
|
|
// . hash dates for sorting by using gbsortby: and gbrevsortby:
|
|
// . do 'gbsortby:gbspiderdate' as your query to see this in action
|
|
bool XmlDoc::hashDateNumbers ( HashTableX *tt ) { // , bool isStatusDoc ) {
|
|
|
|
// stop if already set
|
|
if ( ! m_spideredTimeValid ) return true;
|
|
|
|
int32_t indexedTime = getIndexedTime();
|
|
|
|
// first the last spidered date
|
|
HashInfo hi;
|
|
hi.m_hashGroup = 0;// this doesn't matter, it's a numeric field
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "last spidered date";
|
|
hi.m_prefix = "gbspiderdate";
|
|
hi.m_createSortByForNumbers = true;
|
|
|
|
char buf[64];
|
|
int32_t bufLen = sprintf ( buf , "%" PRIu32, (uint32_t)m_spideredTime );
|
|
if ( ! hashNumberForSorting( buf , buf , bufLen , &hi ) )
|
|
return false;
|
|
|
|
// and index time is >= spider time, so you want to sort by that for
|
|
// the widget for instance
|
|
hi.m_desc = "last indexed date";
|
|
hi.m_prefix = "gbindexdate";
|
|
bufLen = sprintf ( buf , "%" PRIu32, (uint32_t)indexedTime );
|
|
if ( ! hashNumberForSorting ( buf , buf , bufLen , &hi ) )
|
|
return false;
|
|
|
|
// all done
|
|
return true;
|
|
}
|
|
|
|
// returns false and sets g_errno on error
|
|
bool XmlDoc::hashContentType ( HashTableX *tt ) {
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return false;
|
|
|
|
|
|
uint8_t *ctype = getContentType();
|
|
if( !ctype ) {
|
|
return false;
|
|
}
|
|
|
|
char *s = NULL;
|
|
|
|
setStatus ( "hashing content type" );
|
|
|
|
|
|
// hash numerically so we can do gbfacetint:type on it
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = tt;
|
|
hi.m_prefix = "type";
|
|
|
|
char tmp[6];
|
|
sprintf(tmp,"%" PRIu32,(uint32_t)*ctype);
|
|
if ( ! hashString (tmp,strlen(tmp),&hi ) ) return false;
|
|
|
|
|
|
// these ctypes are defined in HttpMime.h
|
|
switch (*ctype) {
|
|
case CT_HTML: s = "html"; break;
|
|
case CT_TEXT: s = "text"; break;
|
|
case CT_XML : s = "xml" ; break;
|
|
case CT_PDF : s = "pdf" ; break;
|
|
case CT_DOC : s = "doc" ; break;
|
|
case CT_XLS : s = "xls" ; break;
|
|
case CT_PPT : s = "ppt" ; break;
|
|
case CT_PS : s = "ps" ; break;
|
|
// for diffbot. so we can limit search to json objects
|
|
// in Diffbot.cpp
|
|
case CT_JSON: s = "json" ; break;
|
|
}
|
|
// bail if unrecognized content type
|
|
if ( ! s ) return true;
|
|
|
|
// . now hash it
|
|
// . use a score of 1 for all
|
|
// . TODO: ensure doc counting works ok with this when it does
|
|
// it's interpolation
|
|
return hashString (s,strlen(s),&hi );
|
|
}
|
|
|
|
// . hash the link: terms
|
|
// . ensure that more useful linkers are scored higher
|
|
// . useful for computing offsite link text for qdb-ish algorithm
|
|
// . NOTE: for now i do not hash links to the same domain in order to
|
|
// hopefully save 10%-25% index space
|
|
// . NOTE: PLUS, they may clog up the link-adjusted quality ratings since
|
|
// different site links with no link text will be ranked behind them
|
|
// . the 8-bit bitmap of the score of a link: term:
|
|
// . 00ubdcss u = link is Unbanned? b = link isBanned?
|
|
// d = link dirty? c = link clean?
|
|
// s = 01 if no link text, 10 if link text
|
|
// . NOTE: this is used in Msg18.cpp for extraction
|
|
// . CAUTION: IndexList::score32to8() will warp our score if it's >= 128
|
|
// so i moved the bits down
|
|
bool XmlDoc::hashLinks ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing links" );
|
|
|
|
// shortcuts
|
|
bool isRSSFeed = *getIsRSS();
|
|
|
|
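// small stack-backed hash table used to dedup the link: and sitelink: terms
// below by the 64-bit hash of the link url / host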
char dbuf[8*4*1024];
|
|
HashTableX dedup;
|
|
dedup.set( 8,0,1024,dbuf,8*4*1024,false,"hldt");
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) {
|
|
logTrace( g_conf.m_logTraceXmlDoc, "END, getCollRec failed" );
|
|
return false;
|
|
}
|
|
|
|
// see ../url/Url2.cpp for hashAsLink() algorithm
|
|
for ( int32_t i = 0 ; i < m_links.m_numLinks ; i++ ) {
|
|
// skip links with zero length
|
|
if ( m_links.m_linkLens[i] == 0 ) {
|
|
continue;
|
|
}
|
|
|
|
// . skip if we are rss page and this link is an <a href> link
|
|
// . we only harvest/index <link> urls from rss feeds
|
|
// . or in the case of feedburner, those orig tags
|
|
if ( isRSSFeed && (m_links.m_linkFlags[i] & LF_AHREFTAG) ) {
|
|
continue;
|
|
}
|
|
|
|
// if we have a <feedburner:origLink> tag, then ignore <link>
|
|
// tags and only get the links from the original links
|
|
if ( m_links.m_isFeedBurner && !(m_links.m_linkFlags[i] & LF_FBTAG) ) {
|
|
continue;
|
|
}
|
|
|
|
// normalize the link
|
|
Url link;
|
|
|
|
// now we always add "www" to these links so that any link
|
|
// to cnn.com is same as link to www.cnn.com, because either
|
|
// we index cnn.com or www.cnn.com but not both providing
|
|
// their content is identical (deduping). This way whichever
|
|
// one we index, we can take advantage of all link text whether
|
|
// it's to cnn.com or www.cnn.com.
|
|
// Every now and then we add new session ids to our list in
|
|
// Url.cpp, too, so we have to version that.
|
|
// Since this is just for hashing, it shouldn't matter that
|
|
// www.tmblr.co has no IP whereas only tmblr.co does.
|
|
link.set( m_links.m_linkPtrs[i], m_links.m_linkLens[i], true, m_links.m_stripParams, m_version );
|
|
|
|
// BR 20160105: Do not create "link:" hashes for media URLs etc.
|
|
if( link.hasNonIndexableExtension(TITLEREC_CURRENT_VERSION) || // @todo BR: For now ignore actual TitleDB version. // m_version) ||
|
|
link.hasScriptExtension() ||
|
|
link.hasJsonExtension() ||
|
|
link.hasXmlExtension() ||
|
|
isUrlBlocked(link)) {
|
|
logTrace( g_conf.m_logTraceXmlDoc, "Unwanted for indexing [%s]", link.getUrl());
|
|
continue;
|
|
}
|
|
|
|
// dedup this crap
|
|
int64_t h = hash64 ( link.getUrl(), link.getUrlLen() );
|
|
if ( dedup.isInTable ( &h ) ) continue;
|
|
if ( ! dedup.addKey ( &h ) ) return false;
|
|
|
|
// set up the hashing parms
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = tt;
|
|
hi.m_prefix = "link";
|
|
|
|
// hash link:<url>
|
|
if ( ! hashSingleTerm ( link.getUrl(),link.getUrlLen(),&hi )) {
|
|
return false;
|
|
}
|
|
|
|
h = hash64 ( link.getHost() , link.getHostLen() );
|
|
if ( dedup.isInTable ( &h ) ) continue;
|
|
if ( ! dedup.addKey ( &h ) ) return false;
|
|
|
|
// fix parm
|
|
hi.m_prefix = "sitelink";
|
|
|
|
// hash sitelink:<urlHost>
|
|
if ( ! hashSingleTerm ( link.getHost(),link.getHostLen(),&hi)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . hash for linkdb
|
|
bool XmlDoc::hashLinksForLinkdb ( HashTableX *dt ) {
|
|
|
|
// sanity check
|
|
if ( dt->getKeySize() != sizeof(key224_t) ) { g_process.shutdownAbort(true); }
|
|
if ( dt->getDataSize() != 0 ) { g_process.shutdownAbort(true); }
|
|
|
|
// this will be different with our new site definitions
|
|
uint32_t linkerSiteHash32 = *getSiteHash32();
|
|
|
|
char siteRank = getSiteRank();
|
|
|
|
if ( ! m_linksValid ) { g_process.shutdownAbort(true); }
|
|
|
|
int32_t *linkSiteHashes = getLinkSiteHashes();
|
|
if ( ! linkSiteHashes || linkSiteHashes == (void *)-1 ) {
|
|
g_process.shutdownAbort(true);
|
|
}
|
|
|
|
// use spidered time! might not be current time! like if rebuilding
|
|
// or injecting from a past spider time
|
|
int32_t discoveryDate = getSpideredTime();
|
|
|
|
// add in new links
|
|
for ( int32_t i = 0 ; i < m_links.m_numLinks ; i++ ) {
|
|
// skip if empty
|
|
if (m_links.m_linkLens[i] == 0) {
|
|
continue;
|
|
}
|
|
|
|
// . skip if spam, ALWAYS allow internal outlinks though!!
|
|
// . CAUTION: now we must version islinkspam()
|
|
bool spam = m_links.isLinkSpam(i);
|
|
|
|
// get site of outlink from tagrec if in there
|
|
int32_t linkeeSiteHash32 = linkSiteHashes[i];
|
|
|
|
//
|
|
// when setting the links class it should set the site hash
|
|
//
|
|
|
|
|
|
#ifdef _VALGRIND_
|
|
VALGRIND_CHECK_MEM_IS_DEFINED(&linkeeSiteHash32,sizeof(linkeeSiteHash32));
|
|
uint64_t tmp1 = m_links.getLinkHash64(i);
|
|
VALGRIND_CHECK_MEM_IS_DEFINED(&tmp1,sizeof(tmp1));
|
|
VALGRIND_CHECK_MEM_IS_DEFINED(&spam,sizeof(spam));
|
|
VALGRIND_CHECK_MEM_IS_DEFINED(&siteRank,sizeof(siteRank));
|
|
// uint32_t tmp2 = *getIp();
|
|
// VALGRIND_CHECK_MEM_IS_DEFINED(&tmp2,sizeof(tmp2));
|
|
uint64_t tmp3 = *getDocId();
|
|
VALGRIND_CHECK_MEM_IS_DEFINED(&tmp3,sizeof(tmp3));
|
|
VALGRIND_CHECK_MEM_IS_DEFINED(&discoveryDate,sizeof(discoveryDate));
|
|
VALGRIND_CHECK_MEM_IS_DEFINED(&linkerSiteHash32,sizeof(linkerSiteHash32));
|
|
#endif
|
|
|
|
int32_t *ipptr = getIp();
|
|
int32_t ip = ipptr ? *ipptr : 0;
|
|
|
|
// set this key, it is the entire record
|
|
key224_t k = Linkdb::makeKey_uk ( linkeeSiteHash32 ,
|
|
m_links.getLinkHash64(i) ,
|
|
spam , // link spam?
|
|
siteRank , // was quality
|
|
ip,
|
|
*getDocId() ,
|
|
discoveryDate ,
|
|
0 ,
|
|
false , // new add?
|
|
linkerSiteHash32 ,
|
|
false );// delete?
|
|
#ifdef _VALGRIND_
|
|
VALGRIND_CHECK_MEM_IS_DEFINED(&k,sizeof(k));
|
|
#endif
|
|
|
|
// store in hash table
|
|
if (!dt->addKey(&k, NULL)) {
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . copied Url2.cpp into here basically, so we can now dump Url2.cpp
|
|
bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc ) {
|
|
|
|
setStatus ( "hashing url colon" );
|
|
|
|
// get the first url
|
|
Url *fu = getFirstUrl();
|
|
|
|
// set up the hashing parms
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = tt;
|
|
|
|
// we do not need diversity bits for this
|
|
hi.m_useCountTable = false;
|
|
//
|
|
// HASH url: term
|
|
//
|
|
// append a "www." for doing url: searches
|
|
Url uw;
|
|
uw.set( fu->getUrl(), fu->getUrlLen(), true, false );
|
|
hi.m_prefix = "url";
|
|
|
|
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
|
|
return false;
|
|
|
|
if (urlOnly) {
|
|
return true;
|
|
}
|
|
|
|
bool *ini = getIsNoIndex();
|
|
if (ini == nullptr || ini == (bool*)-1) {
|
|
// must not be blocked
|
|
gbshutdownLogicError();
|
|
}
|
|
|
|
char *s = fu->getUrl();
|
|
int32_t slen = fu->getUrlLen();
|
|
|
|
if (!*ini || m_version <= 126) {
|
|
setStatus("hashing inurl colon");
|
|
|
|
//
|
|
// HASH inurl: terms
|
|
//
|
|
hi.m_prefix = "inurl";
|
|
|
|
// BR 20160114: Skip numbers in urls when doing "inurl:" queries
|
|
hi.m_hashNumbers = false;
|
|
hi.m_filterUrlIndexableWords = true;
|
|
if (!hashString(s, slen, &hi)) return false;
|
|
}
|
|
|
|
{
|
|
setStatus("hashing ip colon");
|
|
hi.m_hashNumbers = true;
|
|
hi.m_filterUrlIndexableWords = false;
|
|
|
|
//
|
|
// HASH ip:a.b.c.d
|
|
//
|
|
if (!m_ipValid) { g_process.shutdownAbort(true); }
|
|
// copy it to save it
|
|
char ipbuf[64];
|
|
int32_t iplen = strlen(iptoa(m_ip, ipbuf));
|
|
hi.m_prefix = "ip";
|
|
if (!hashSingleTerm(ipbuf, iplen, &hi)) return false;
|
|
|
|
// . sanity check
|
|
if (!m_siteNumInlinksValid) { g_process.shutdownAbort(true); }
|
|
}
|
|
|
|
|
|
//
|
|
// HASH the url's mid domain and host as they were in the body
|
|
//
|
|
setStatus ( "hashing site colon terms");
|
|
|
|
//
|
|
// HASH the site: terms
|
|
//
|
|
// . hash the pieces of the site
|
|
// . http://host.domain.com/~harry/level1/ should hash to:
|
|
// . site:host.domain.com/~harry/level1/
|
|
// . site:host.domain.com/~harry/
|
|
// . site:host.domain.com/~
|
|
// . site:host.domain.com/
|
|
// . site:domain.com/~harry/level1/
|
|
// . site:domain.com/~harry/
|
|
// . site:domain.com/~
|
|
// . site:domain.com/
|
|
// ensure score is positive
|
|
//if ( siteScore <= 0 ) siteScore = 1;
|
|
// get the hostname (later we set to domain name)
|
|
char *name = fu->getHost();
|
|
int32_t nameLen = fu->getHostLen();
|
|
|
|
#ifdef _VALGRIND_
|
|
VALGRIND_CHECK_MEM_IS_DEFINED(name,nameLen);
|
|
#endif
|
|
// . point to the end of the whole thing, including port field
|
|
// . add in port, if non default
|
|
char *end3 = name + fu->getHostLen() + fu->getPortLen();
|
|
|
|
// Generate string with port if server runs on non-standard ports
|
|
char pbuf[12];
|
|
int pbufLen=0;
|
|
int32_t port = fu->getPort();
|
|
if( port > 0 && port != 80 && port != 443 ) {
|
|
pbufLen=snprintf(pbuf, 12, ":%" PRIu32, (uint32_t)fu->getPort());
|
|
}
|
|
|
|
|
|
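// hash a site: term for every subpath of the current hostname; after the
// inner loop below we strip the leading host component from "name" and jump
// back here so the same subpaths also get hashed under the shorter hostname,
// down to the bare domain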
loop:
|
|
// now loop through the sub paths of this url's path
|
|
int32_t prev_len = -1;
|
|
for ( int32_t i = 0 ; ; i++ ) {
|
|
// get the subpath
|
|
int32_t len = fu->getSubPathLen(i);
|
|
if(len==prev_len) //work around bug (?) in Url
|
|
continue;
|
|
prev_len = len;
|
|
|
|
// FIX: always include first /
|
|
if ( len == 0 ) {
|
|
len = 1;
|
|
}
|
|
|
|
// write http://www.whatever.com/path into buf
|
|
char buf[MAX_URL_LEN+10];
|
|
char *p = buf;
|
|
|
|
// BR 20160122: Do NOT fix this for https sites. The search is
|
|
// always prefixed with http:// (sigh ...)
|
|
gbmemcpy ( p , "http://" , 7 ); p += 7;
|
|
gbmemcpy ( p , name, nameLen); p += nameLen;
|
|
if( pbufLen > 0 )
|
|
{
|
|
gbmemcpy ( p , pbuf, pbufLen); p += pbufLen;
|
|
}
|
|
gbmemcpy ( p , fu->getPath() , len ); p += len;
|
|
*p = '\0';
|
|
|
|
// update hash parms
|
|
if (m_version <= 126) {
|
|
hi.m_prefix = "site";
|
|
} else {
|
|
hi.m_prefix = *ini ? "sitenoindex" : "site";
|
|
}
|
|
|
|
hi.m_hashGroup = HASHGROUP_INURL;
|
|
|
|
|
|
// this returns false on failure
|
|
if ( ! hashSingleTerm (buf,p-buf,&hi ) ) {
|
|
return false;
|
|
}
|
|
|
|
// break when we hash the root path
|
|
if ( len <=1 ) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
// now keep moving the period over in the hostname
|
|
while ( name < end3 && *name != '.' ) {
|
|
name++;
|
|
nameLen--;
|
|
}
|
|
|
|
// skip the '.'
|
|
name++; nameLen--;
|
|
|
|
// Check that there is a dot before first slash after domain
|
|
// to avoid junk entries like http://com/subpath/pagename.html
|
|
bool dom_valid = false;
|
|
if( nameLen > 0 )
|
|
{
|
|
int32_t dom_offset=0;
|
|
if( strncmp(name,"http://" ,7)==0 )
|
|
{
|
|
dom_offset=7;
|
|
}
|
|
else
|
|
if( strncmp(name,"https://",8)==0 )
|
|
{
|
|
dom_offset=8;
|
|
}
|
|
|
|
const char *dotpos = (const char *)memchr(name,'.',nameLen);
|
|
const char *slashpos= (const char *)memchr(name+dom_offset,'/',nameLen-dom_offset);
|
|
|
|
if( dotpos && (!slashpos || (slashpos > dotpos)) )
|
|
{
|
|
dom_valid = true;
|
|
}
|
|
}
|
|
|
|
if ( name < end3 && dom_valid ) goto loop;
|
|
|
|
|
|
|
|
// BR 20160121: Make searching for e.g. site:dk work
|
|
setStatus ( "hashing tld for site search");
|
|
const char *tld = fu->getTLD();
|
|
int32_t tldLen = fu->getTLDLen();
|
|
|
|
if( tldLen > 0 && tldLen < 64 ) {
|
|
char tldBuf[72]; // http:// (7) + tld (63) + / (1) + 0 (1)
|
|
char *p = tldBuf;
|
|
gbmemcpy ( p , "http://", 7 ); p += 7;
|
|
gbmemcpy ( p , tld, tldLen); p += tldLen;
|
|
gbmemcpy ( p , "/", 1 ); p += 1;
|
|
*p = '\0';
|
|
if ( ! hashSingleTerm (tldBuf, p - tldBuf, &hi ) ) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
const char *ext = fu->getExtension();
|
|
int32_t elen = fu->getExtensionLen();
|
|
if (!*ini || m_version <= 126) {
|
|
//
|
|
// HASH ext: term
|
|
//
|
|
// i.e. ext:gif ext:html ext:htm ext:pdf, etc.
|
|
setStatus("hashing ext colon");
|
|
// update hash parms
|
|
hi.m_prefix = "ext";
|
|
if (!hashSingleTerm(ext, elen, &hi)) return false;
|
|
}
|
|
|
|
{
|
|
setStatus("hashing gbdocid");
|
|
hi.m_prefix = "gbdocid";
|
|
char buf2[32];
|
|
sprintf(buf2, "%" PRIu64, (uint64_t)m_docId);
|
|
if (!hashSingleTerm(buf2, strlen(buf2), &hi)) return false;
|
|
}
|
|
|
|
setStatus ( "hashing SiteGetter terms");
|
|
|
|
//
|
|
// HASH terms for SiteGetter.cpp
|
|
//
|
|
// . this termId is used by SiteGetter.cpp for determining subsites
|
|
// . matches what is in SiteGet::getSiteList()
|
|
// for www.xyz.com/a/ HASH www.xyz.com
|
|
// for www.xyz.com/a/b/ HASH www.xyz.com/a/
|
|
// for www.xyz.com/a/b/c/ HASH www.xyz.com/a/b/
|
|
bool add = true;
|
|
// we only hash this for urls that end in '/'
|
|
if ( s[slen-1] != '/' ) add = false;
|
|
// and no cgi
|
|
if ( fu->isCgi() ) add = false;
|
|
// skip if root
|
|
if ( fu->getPathLen() <= 1 ) add = false;
|
|
// sanity check
|
|
if ( ! m_linksValid ) { g_process.shutdownAbort(true); }
|
|
// . skip if we have no subdirectory outlinks
|
|
// . that way we do not confuse all the pages in dictionary.com or
|
|
// wikipedia.org as subsites!!
|
|
if ( ! m_links.hasSubdirOutlink() ) add = false;
|
|
|
|
char *host = fu->getHost ();
|
|
int32_t hlen = fu->getHostLen ();
|
|
|
|
// tags from here out
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_shardByTermId = true;
|
|
// hash it
|
|
if ( add ) {
|
|
// remove the last path component
|
|
char *end2 = s + slen - 2;
|
|
// back up over last component
|
|
for ( ; end2 > fu->getPath() && *end2 != '/' ; end2-- ) ;
|
|
// hash that part of the url
|
|
hi.m_prefix = "siteterm";
|
|
if ( ! hashSingleTerm ( host,end2-host,&hi) ) return false;
|
|
}
|
|
hi.m_shardByTermId = false;
|
|
|
|
setStatus ( "hashing urlhashdiv10 etc");
|
|
|
|
//
|
|
// HASH urlhash: urlhashdiv10: urlhashdiv100: terms
|
|
//
|
|
// this is for proving how many docs are in the index
|
|
char buf[20];
|
|
int32_t blen;
|
|
|
|
uint32_t h = hash32 ( s , slen );
|
|
blen = sprintf(buf,"%" PRIu32,h);
|
|
hi.m_prefix = "urlhash";
|
|
if ( ! hashString(buf,blen,&hi) ) return false;
|
|
|
|
// don't index mid domain or url path for noindex document
|
|
if (*ini && m_version > 126) {
|
|
return true;
|
|
}
|
|
|
|
if (size_utf8Content - 1 > 0 || m_indexCode == EDOCDISALLOWEDROOT) {
|
|
setStatus("hashing url mid domain");
|
|
|
|
// update parms
|
|
hi.m_prefix = NULL;
|
|
hi.m_desc = "middle domain";
|
|
hi.m_hashGroup = HASHGROUP_INURL;
|
|
hi.m_filterUrlIndexableWords = true; // Skip com, http etc.
|
|
if (!hashString(host, hlen, &hi)) {
|
|
return false;
|
|
}
|
|
|
|
hi.m_filterUrlIndexableWords = false;
|
|
if (!hashSingleTerm(fu->getDomain(), fu->getDomainLen(), &hi)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
if (size_utf8Content - 1 > 0) {
|
|
setStatus("hashing url path");
|
|
char *path = fu->getPath();
|
|
int32_t plen = fu->getPathLen();
|
|
|
|
// BR 20160113: Do not hash and combine the page filename extension with the page name (skip e.g. .com)
|
|
if (elen > 0) {
|
|
elen++; // also skip the dot
|
|
}
|
|
plen -= elen;
|
|
|
|
// BR 20160113: Do not hash the most common page names
|
|
if (strncmp(path, "/index", plen) != 0) {
|
|
// hash the path
|
|
// BR 20160114: Exclude numbers in paths (usually dates)
|
|
hi.m_hashNumbers = false;
|
|
if (!hashString(path, plen, &hi)) return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// . returns false and sets g_errno on error
|
|
bool XmlDoc::hashIncomingLinkText(HashTableX *tt) {
|
|
|
|
setStatus ( "hashing link text" );
|
|
|
|
// sanity
|
|
if ( ! m_linkInfo1Valid ) { g_process.shutdownAbort(true); }
|
|
|
|
// . finally hash in the linkText terms from the LinkInfo
|
|
// . the LinkInfo class has all the terms of hashed anchor text for us
|
|
// . if we're using an old TitleRec linkTermList is just a ptr to
|
|
// somewhere in TitleRec
|
|
// . otherwise, we generated it from merging a bunch of LinkInfos
|
|
// and storing them in this new TitleRec
|
|
LinkInfo *linkInfo = getLinkInfo1();
|
|
|
|
// sanity checks
|
|
if ( ! m_ipValid ) { g_process.shutdownAbort(true); }
|
|
if ( ! m_siteNumInlinksValid ) { g_process.shutdownAbort(true); }
|
|
|
|
//
|
|
// brought the following code in from LinkInfo.cpp
|
|
//
|
|
|
|
// count "external" inlinkers
|
|
int32_t ecount = 0;
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
// hashstring should update this like a cursor.
|
|
hi.m_startDist = 0;
|
|
|
|
// loop through the link texts and hash them
|
|
for ( Inlink *k = NULL; linkInfo && (k = linkInfo->getNextInlink(k)) ; ) {
|
|
// is this inlinker internal?
|
|
bool internal=((m_ip&0x0000ffff)==(k->m_ip&0x0000ffff));
|
|
// count external inlinks we have for indexing gbmininlinks:
|
|
if ( ! internal ) ecount++;
|
|
|
|
// get length of link text
|
|
int32_t tlen = k->size_linkText;
|
|
if ( tlen > 0 ) tlen--;
|
|
// get the text
|
|
char *txt = k->getLinkText();
|
|
// sanity check
|
|
if ( ! verifyUtf8 ( txt , tlen ) ) {
|
|
log("xmldoc: bad link text 2 from url=%s for %s",
|
|
k->getUrl(),m_firstUrl.getUrl());
|
|
continue;
|
|
}
|
|
|
|
if ( internal ) hi.m_hashGroup = HASHGROUP_INTERNALINLINKTEXT;
|
|
else hi.m_hashGroup = HASHGROUP_INLINKTEXT;
|
|
// store the siterank of the linker in this and use that
|
|
// to set the multiplier M bits i guess
|
|
hi.m_linkerSiteRank = k->m_siteRank;
|
|
if(hi.m_linkerSiteRank>MAXSITERANK) {
|
|
log(LOG_INFO,"Inlink had siteRank>max (%d), probably from docid %ld", k->m_siteRank, k->m_docId);
|
|
hi.m_linkerSiteRank = MAXSITERANK;
|
|
}
|
|
// now record this so we can match the link text to
|
|
// a matched offsite inlink text term in the scoring info
|
|
k->m_wordPosStart = m_dist; // hi.m_startDist;
|
|
// . hash the link text into the table
|
|
// . returns false and sets g_errno on error
|
|
// . we still have the score punish from # of words though!
|
|
// . for inlink texts that are the same it should accumulate
|
|
// and use the reserved bits as a multiplier i guess...
|
|
if ( ! hashString ( txt,tlen,&hi) ) return false;
|
|
// now record this so we can match the link text to
|
|
// a matched offsite inlink text term in the scoring info
|
|
//k->m_wordPosEnd = hi.m_startDist;
|
|
// spread it out
|
|
hi.m_startDist += 20;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// . returns false and sets g_errno on error
|
|
bool XmlDoc::hashNeighborhoods ( HashTableX *tt ) {
|
|
setStatus ( "hashing neighborhoods" );
|
|
|
|
// . now we also hash the neighborhood text of each inlink, that is,
|
|
// the text surrounding the inlink text.
|
|
// . this is also destructive in that it will remove termids that
|
|
// were not in the document being linked to in order to save
|
|
// space in the titleRec
|
|
// . now we only do one or the other, not both
|
|
LinkInfo *linkInfo = getLinkInfo1();
|
|
if(!linkInfo)
|
|
return true;
|
|
|
|
// loop over all the Inlinks
|
|
for(Inlink *k = linkInfo->getNextInlink(NULL); k; k = linkInfo->getNextInlink(k)) {
|
|
// skip if internal, they often have the same neighborhood text
|
|
if((k->m_ip&0x0000ffff)==(m_ip&0x0000ffff))
|
|
continue;
|
|
|
|
// get the left and right texts and hash both
|
|
char *s = k->getSurroundingText();
|
|
if(!s || k->size_surroundingText <= 1)
|
|
continue;
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "surrounding text";
|
|
hi.m_hashGroup = HASHGROUP_NEIGHBORHOOD;
|
|
|
|
// . hash that
|
|
// . this returns false and sets g_errno on error
|
|
int32_t len = k->size_surroundingText - 1;
|
|
if(!hashString(s, len, &hi))
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// . we now do the title hashing here for newer titlerecs, version 80+, rather
|
|
// than use the <index> block in the ruleset for titles.
|
|
// . this is not to be confused with hashing the title: terms which still
|
|
// does have an <index> block in the ruleset.
|
|
// . the new Weights class hashes title as part of body now with a high weight
|
|
// given by "titleWeight" parm
|
|
bool XmlDoc::hashTitle ( HashTableX *tt ) {
|
|
// sanity check
|
|
if ( m_hashedTitle ) { g_process.shutdownAbort(true); }
|
|
|
|
setStatus ( "hashing title" );
|
|
|
|
// this has been called, note it
|
|
m_hashedTitle = true;
|
|
|
|
const nodeid_t *tids = m_words.getTagIds();
|
|
int32_t nw = m_words.getNumWords();
|
|
|
|
// find the first <title> tag in the doc
|
|
int32_t i ;
|
|
for ( i = 0 ; i < nw ; i++ )
|
|
if ( tids[i] == TAG_TITLE ) break;
|
|
|
|
// return true if no title
|
|
if ( i >= nw ) return true;
|
|
|
|
// skip tag
|
|
i++;
|
|
// mark it as start of title
|
|
int32_t a = i;
|
|
|
|
// limit end
|
|
int32_t max = i + 40;
|
|
if ( max > nw ) max = nw;
|
|
|
|
// find end of title, either another <title> or a </title> tag
|
|
for ( ; i < max ; i++ )
|
|
if ( (tids[i] & BACKBITCOMP) == TAG_TITLE ) break;
|
|
|
|
// ends on a <title> tag?
|
|
if ( i == a ) return true;
|
|
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_prefix = "title";
|
|
|
|
// the new posdb info
|
|
hi.m_hashGroup = HASHGROUP_TITLE;
|
|
|
|
// . hash it up! use 0 for the date
|
|
// . use XmlDoc::hashWords()
|
|
// . use "title" as both prefix and description
|
|
//if ( ! hashWords (a,i,&hi ) ) return false;
|
|
|
|
char **wptrs = m_words.getWordPtrs();
|
|
int32_t *wlens = m_words.getWordLens();
|
|
char *title = wptrs[a];
|
|
char *titleEnd = wptrs[i-1] + wlens[i-1];
|
|
int32_t titleLen = titleEnd - title;
|
|
if ( ! hashString ( title, titleLen, &hi) ) return false;
|
|
|
|
// now hash as without title: prefix
|
|
hi.m_prefix = NULL;
|
|
if ( ! hashString ( title, titleLen, &hi) ) return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
// . we now do the title hashing here for newer titlerecs, version 80+, rather
|
|
// than use the <index> block in the ruleset for titles.
|
|
// . this is not to be confused with hashing the title: terms which still
|
|
// does have an <index> block in the ruleset.
|
|
bool XmlDoc::hashBody2 ( HashTableX *tt ) {
|
|
|
|
// do not index ANY of the body if it is NOT a permalink and
|
|
// "menu elimination" technology is enabled.
|
|
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
|
|
|
|
setStatus ( "hashing body" );
|
|
|
|
// record this
|
|
m_bodyStartPos = m_dist;
|
|
m_bodyStartPosValid = true;
|
|
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "body";
|
|
hi.m_hashGroup = HASHGROUP_BODY;
|
|
|
|
// use NULL for the prefix
|
|
return hashWords (&hi );
|
|
}
|
|
|
|
bool XmlDoc::hashMetaKeywords ( HashTableX *tt ) {
|
|
|
|
// do not index meta tags if "menu elimination" technology is enabled.
|
|
//if ( m_eliminateMenus ) return true;
|
|
|
|
setStatus ( "hashing meta keywords" );
|
|
|
|
// hash the meta keywords tag
|
|
//char buf [ 2048 + 2 ];
|
|
//int32_t len=m_xml.getMetaContentPointer ( buf , 2048 , "keywords" , 8 );
|
|
int32_t mklen;
|
|
char *mk = getMetaKeywords( &mklen );
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "meta keywords";
|
|
hi.m_hashGroup = HASHGROUP_INMETATAG;
|
|
|
|
// call XmlDoc::hashString
|
|
return hashString ( mk , mklen , &hi);
|
|
}
|
|
|
|
|
|
// . hash the meta summary, description and keyword tags
|
|
// . we now do the title hashing here for newer titlerecs, version 80+, rather
|
|
// than use the <index> block in the ruleset for titles.
|
|
bool XmlDoc::hashMetaSummary ( HashTableX *tt ) {
|
|
|
|
// sanity check
|
|
if ( m_hashedMetas ) { g_process.shutdownAbort(true); }
|
|
|
|
// this has been called, note it
|
|
m_hashedMetas = true;
|
|
|
|
// do not index meta tags if "menu elimination" technology is enabled.
|
|
//if ( m_eliminateMenus ) return true;
|
|
|
|
setStatus ( "hashing meta summary" );
|
|
|
|
// hash the meta keywords tag
|
|
//char buf [ 2048 + 2 ];
|
|
//int32_t len = m_xml.getMetaContent ( buf , 2048 , "summary" , 7 );
|
|
int32_t mslen;
|
|
char *ms = getMetaSummary ( &mslen );
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_hashGroup = HASHGROUP_INMETATAG;
|
|
|
|
// update hashing parms
|
|
hi.m_desc = "meta summary";
|
|
// hash it
|
|
if ( ! hashString ( ms , mslen , &hi )) return false;
|
|
|
|
|
|
//len = m_xml.getMetaContent ( buf , 2048 , "description" , 11 );
|
|
int32_t mdlen;
|
|
char *md = getMetaDescription ( &mdlen );
|
|
|
|
// update hashing parms
|
|
hi.m_desc = "meta desc";
|
|
// . TODO: only hash if unique????? set a flag on ht then i guess
|
|
if ( ! hashString ( md , mdlen , &hi ) ) return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
bool XmlDoc::hashMetaGeoPlacename( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing meta geo.placename" );
|
|
|
|
int32_t mgplen;
|
|
char *mgp = getMetaGeoPlacename( &mgplen );
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "meta geo.placename";
|
|
hi.m_hashGroup = HASHGROUP_INMETATAG;
|
|
|
|
// call XmlDoc::hashString
|
|
return hashString ( mgp , mgplen , &hi);
|
|
}
|
|
|
|
|
|
|
|
|
|
bool XmlDoc::hashLanguage ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing language" );
|
|
|
|
int32_t langId = (int32_t)*getLangId();
|
|
|
|
char s[32]; // numeric langid
|
|
int32_t slen = sprintf(s, "%" PRId32, langId );
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_prefix = "gblang";
|
|
|
|
if ( ! hashString ( s, slen, &hi ) ) return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
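// like hashLanguage() above, but indexes the language abbreviation
// (e.g. "de") under the same gblang prefix instead of the numeric id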
bool XmlDoc::hashLanguageString ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing language string" );
|
|
|
|
int32_t langId = (int32_t)*getLangId();
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_prefix = "gblang";
|
|
|
|
// try lang abbreviation
|
|
char s[32];
|
|
int32_t slen = sprintf(s , "%s ", getLanguageAbbr(langId) );
|
|
// go back to broken way to try to fix parsing consistency bug
|
|
if ( ! hashString ( s, slen, &hi ) ) return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::hashCountry ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing country" );
|
|
|
|
uint16_t *cid = getCountryId();
|
|
if ( ! cid || cid == (uint16_t *)-1 ) return false;
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_prefix = "gbcountry";
|
|
|
|
for ( int32_t i = 0 ; i < 1 ; i++ ) {
|
|
// convert it
|
|
char buf[32];
|
|
int32_t blen = sprintf(buf,"%s", g_countryCode.getAbbr(*cid) );
|
|
// hash it
|
|
if ( ! hashString ( buf, blen, &hi ) ) return false;
|
|
}
|
|
// all done
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::hashSingleTerm( const char *s, int32_t slen, HashInfo *hi ) {
|
|
// empty?
|
|
if ( slen <= 0 ) return true;
|
|
if ( ! m_versionValid ) { g_process.shutdownAbort(true); }
|
|
if ( hi->m_useCountTable && ! m_countTableValid){g_process.shutdownAbort(true); }
|
|
|
|
// a single blob hash
|
|
int64_t termId = hash64 ( s , slen );
|
|
// combine with prefix
|
|
int64_t final = termId;
|
|
// combine with a non-NULL prefix
|
|
int64_t prefixHash = 0LL;
|
|
if ( hi->m_prefix ) {
|
|
prefixHash = hash64b ( hi->m_prefix );
|
|
final = hash64 ( termId , prefixHash );
|
|
}
|
|
// call the other guy now
|
|
//return hashSingleTerm ( final , hi );
|
|
|
|
|
|
// shortcut
|
|
HashTableX *dt = hi->m_tt;
|
|
// sanity check
|
|
if ( dt->getKeySize() != sizeof(key144_t) ) { g_process.shutdownAbort(true); }
|
|
// make the key like we do in hashWords()
|
|
|
|
|
|
key144_t k;
|
|
Posdb::makeKey ( &k ,
|
|
final,
|
|
0LL, // docid
|
|
0, // dist
|
|
MAXDENSITYRANK, // density rank
|
|
MAXDIVERSITYRANK, // diversity rank
|
|
MAXWORDSPAMRANK, // wordspamrank
|
|
0, // siterank
|
|
hi->m_hashGroup,
|
|
// we set to docLang in final hash loop
|
|
langUnknown,// langid
|
|
0, // multiplier
|
|
0, // syn?
|
|
false , // delkey?
|
|
hi->m_shardByTermId );
|
|
|
|
// . otherwise, add a new slot
|
|
// . key should NEVER collide since we are always
|
|
// incrementing the distance cursor, m_dist
|
|
if ( ! dt->addTerm144 ( &k ) ) return false;
|
|
|
|
// add to wts for PageParser.cpp display
|
|
if ( m_wts && ! storeTerm ( s,slen,final,hi,
|
|
0, // wordnum
|
|
0, // wordPos,
|
|
MAXDENSITYRANK,
|
|
MAXDIVERSITYRANK,
|
|
MAXWORDSPAMRANK,
|
|
hi->m_hashGroup,
|
|
//false,
|
|
&m_wbuf,
|
|
m_wts,
|
|
SOURCE_NONE, // synsrc
|
|
langUnknown,
|
|
k) )
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::hashString( char *s, int32_t slen, HashInfo *hi ) {
|
|
if ( ! m_versionValid ) { g_process.shutdownAbort(true); }
|
|
|
|
if ( hi->m_useCountTable && ! m_countTableValid){g_process.shutdownAbort(true); }
|
|
|
|
if ( ! m_siteNumInlinksValid ) { g_process.shutdownAbort(true); }
|
|
|
|
return hashString3( s ,
|
|
slen ,
|
|
hi ,
|
|
&m_countTable ,
|
|
m_wts ,
|
|
&m_wbuf );
|
|
}
|
|
|
|
|
|
bool XmlDoc::hashString3( char *s ,
|
|
int32_t slen ,
|
|
HashInfo *hi ,
|
|
HashTableX *countTable ,
|
|
HashTableX *wts ,
|
|
SafeBuf *wbuf) {
|
|
Words words;
|
|
Bits bits;
|
|
Phrases phrases;
|
|
|
|
if ( ! words.set ( s , slen , true ) )
|
|
return false;
|
|
if ( !bits.set(&words))
|
|
return false;
|
|
if ( !phrases.set( &words, &bits ) )
|
|
return false;
|
|
|
|
// use primary langid of doc
|
|
if ( ! m_langIdValid ) { g_process.shutdownAbort(true); }
|
|
|
|
return hashWords3( hi, &words, &phrases, NULL, countTable, NULL, NULL, NULL, wts, wbuf );
|
|
}
|
|
|
|
bool XmlDoc::hashWords ( HashInfo *hi ) {
|
|
// sanity checks
|
|
if ( ! m_wordsValid ) { g_process.shutdownAbort(true); }
|
|
if ( ! m_phrasesValid ) { g_process.shutdownAbort(true); }
|
|
if ( hi->m_useCountTable &&!m_countTableValid){g_process.shutdownAbort(true); }
|
|
if ( ! m_bitsValid ) { g_process.shutdownAbort(true); }
|
|
if ( ! m_sectionsValid) { g_process.shutdownAbort(true); }
|
|
//if ( ! m_synonymsValid) { g_process.shutdownAbort(true); }
|
|
if ( ! m_fragBufValid ) { g_process.shutdownAbort(true); }
|
|
if ( ! m_wordSpamBufValid ) { g_process.shutdownAbort(true); }
|
|
if ( m_wts && ! m_langVectorValid ) { g_process.shutdownAbort(true); }
|
|
if ( ! m_langIdValid ) { g_process.shutdownAbort(true); }
|
|
// . is the word repeated in a pattern?
|
|
// . this should only be used for document body, for meta tags,
|
|
// inlink text, etc. we should make sure words are unique
|
|
char *wordSpamVec = getWordSpamVec();
|
|
char *fragVec = m_fragBuf.getBufStart();
|
|
char *langVec = m_langVec.getBufStart();
|
|
|
|
return hashWords3(hi, &m_words, &m_phrases, &m_sections, &m_countTable, fragVec, wordSpamVec, langVec, m_wts, &m_wbuf);
|
|
}
|
|
|
|
// . this now uses posdb exclusively
|
|
bool XmlDoc::hashWords3( HashInfo *hi, const Words *words, Phrases *phrases, Sections *sectionsArg, HashTableX *countTable,
|
|
char *fragVec, char *wordSpamVec, char *langVec, HashTableX *wts, SafeBuf *wbuf) {
|
|
Sections *sections = sectionsArg;
|
|
// for getSpiderStatusDocMetaList() we don't use sections it'll mess us up
|
|
if ( ! hi->m_useSections ) sections = NULL;
|
|
|
|
// shortcuts
|
|
const uint64_t *wids = reinterpret_cast<const uint64_t*>(words->getWordIds());
|
|
const uint64_t *pids2 = reinterpret_cast<const uint64_t*>(phrases->getPhraseIds2());
|
|
|
|
HashTableX *dt = hi->m_tt;
|
|
|
|
// . sanity checks
|
|
// . posdb just uses the full keys with docid
|
|
if ( dt->getKeySize() != 18 ) { g_process.shutdownAbort(true); }
|
|
if ( dt->getDataSize() != 4 ) { g_process.shutdownAbort(true); }
|
|
|
|
// if provided...
|
|
if ( wts ) {
|
|
if ( wts->getKeySize() != 12 ) { g_process.shutdownAbort(true); }
|
|
if ( wts->getDataSize() != sizeof(TermDebugInfo)){g_process.shutdownAbort(true); }
|
|
if ( ! wts->isAllowDups() ) { g_process.shutdownAbort(true); }
|
|
}
|
|
|
|
// ensure caller set the hashGroup
|
|
if ( hi->m_hashGroup < 0 ) { g_process.shutdownAbort(true); }
|
|
|
|
// handy
|
|
const char *const*wptrs = words->getWordPtrs();
|
|
const int32_t *wlens = words->getWordLens();
|
|
|
|
// hash in the prefix
|
|
uint64_t prefixHash = 0LL;
|
|
int32_t plen = 0;
|
|
if ( hi->m_prefix ) plen = strlen ( hi->m_prefix );
|
|
if ( hi->m_prefix && plen ) {
|
|
// we gotta make this case insensitive, and skip spaces
|
|
// because if it is 'focal length' we can't search
|
|
// 'focal length:10' because that comes across as TWO terms.
|
|
prefixHash = hash64Lower_utf8_nospaces ( hi->m_prefix , plen );
|
|
// . sanity test, make sure it is in supported list
|
|
// . hashing diffbot json output of course fails this so
|
|
// skip in that case if diffbot
|
|
}
|
|
|
|
bool hashIffUnique = false;
|
|
if ( hi->m_hashGroup == HASHGROUP_INMETATAG ) hashIffUnique = true;
|
|
if ( hi->m_hashGroup == HASHGROUP_INTAG ) hashIffUnique = true;
|
|
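// tracks which term hashes we have already added so that meta-tag and
// in-tag terms are only hashed once each (see hashIffUnique above)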
HashTableX ut; ut.set ( 8,0,0,NULL,0,false,"uqtbl");
|
|
|
|
///////
|
|
//
|
|
// diversity rank vector.
|
|
//
|
|
///////
|
|
// the final diversity which is a multiplier
|
|
// is converted into a rank from 0-15 i guess.
|
|
// so 'mexico' in "new mexico" should receive a low word score but high
|
|
// phrase score. thus, a search for 'mexico' should not bring up
|
|
// the page for university of new mexico!
|
|
SafeBuf dwbuf;
|
|
if ( !getDiversityVec( words, phrases, countTable, &dwbuf ) ) {
|
|
return false;
|
|
}
|
|
char *wdv = dwbuf.getBufStart();
|
|
|
|
int32_t nw = words->getNumWords();
|
|
|
|
/////
|
|
//
|
|
// calculate density ranks
|
|
//
|
|
/////
|
|
//
|
|
// this now varies depending on the length of the sentence/header etc.
|
|
// so if the hashGroup is not title, link text or meta tag, we have to
|
|
// use a safebuf.
|
|
SafeBuf densBuf;
|
|
// returns false and sets g_errno on error
|
|
if ( ! getDensityRanks((int64_t *)wids,
|
|
nw,
|
|
hi->m_hashGroup,
|
|
&densBuf,
|
|
sections))
|
|
return false;
|
|
// a handy ptr
|
|
char *densvec = (char *)densBuf.getBufStart();
|
|
|
|
////////////
|
|
//
|
|
// get word positions
|
|
//
|
|
///////////
|
|
Section **sp = NULL;
|
|
if ( sections ) sp = sections->m_sectionPtrs;
|
|
|
|
SafeBuf wpos;
|
|
if ( ! getWordPosVec ( words , sections, m_dist, fragVec, &wpos) )
|
|
return false;
|
|
|
|
// a handy ptr
|
|
int32_t *wposvec = (int32_t *)wpos.getBufStart();
|
|
|
|
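// when hashing a url with m_filterUrlIndexableWords set, seen_slash tells us
// whether we are still in the scheme/host/domain part (before the first '/')
// or already in the path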
bool seen_slash = false;
|
|
int32_t i;
|
|
for ( i = 0 ; i < nw ; i++ ) {
|
|
if(wlens[i]==1 && wptrs[i][0]=='/')
|
|
seen_slash = true;
|
|
|
|
if ( ! wids[i] ) continue;
|
|
// ignore if in repeated fragment
|
|
if ( fragVec && i<MAXFRAGWORDS && fragVec[i] == 0 ) continue;
|
|
// ignore if in style section
|
|
if ( sp && (sp[i]->m_flags & NOINDEXFLAGS) ) continue;
|
|
|
|
// do not breach wordpos bits
|
|
if ( wposvec[i] > MAXWORDPOS ) break;
|
|
|
|
// BR: 20160114 if digit, do not hash it if disabled
|
|
if( is_digit( wptrs[i][0] ) && !hi->m_hashNumbers ) {
|
|
continue;
|
|
}
|
|
|
		// . hash the startHash with the wordId for this word
		// . we must mask it before adding it to the table because
		//   IndexLists that come from LinkInfo classes (incoming link
		//   text) are also hashed into this table, and when those
		//   IndexLists are hashed they use masked termIds.
		//   So we should too...
		uint64_t h;
		if ( plen > 0 ) h = hash64 ( wids[i] , prefixHash );
		else            h = wids[i];

		int32_t hashGroup = hi->m_hashGroup;

		Section *sx = NULL;
		if ( sp ) {
			sx = sp[i];
			// . this is taken care of in hashTitle()
			// . it is slightly different if the title is
			//   multiple sentences because when hashing the
			//   body the density rank is per sentence, but in
			//   hashTitle we count all the words in the title
			//   towards the density rank even if they are
			//   in different sentences
			if ( sx->m_flags & SEC_IN_TITLE ) {
				continue;
			}
			if ( sx->m_flags & SEC_IN_HEADER ) {
				hashGroup = HASHGROUP_HEADING;
			}
			if ( sx->m_flags & ( SEC_MENU | SEC_MENU_SENTENCE | SEC_MENU_HEADER ) ) {
				hashGroup = HASHGROUP_INMENU;
			}
		}
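		// hashGroup records where this occurrence lives (body, heading,
		// menu, meta tag, link text, ...) and is stored in the posdb
		// key, so query-time scoring can presumably weight e.g. heading
		// hits above menu hits. Title words are skipped here entirely
		// because hashTitle() indexes them with its own density rank.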
		// this is for link text and meta tags mostly
		if ( hashIffUnique ) {
			// skip if already did it
			if ( ut.isInTable ( &h ) ) continue;
			if ( ! ut.addKey ( &h ) ) return false;
		}

		char ws = 15;
		if ( wordSpamVec ) ws = wordSpamVec[i];

		// HACK:
		// if this is inlink text, use the wordspamrank to hold the
		// inlinker's site rank!
		if ( hashGroup == HASHGROUP_INLINKTEXT )
			ws = hi->m_linkerSiteRank;

		// default to the document's primary language if it is not
		// clear what language this word belongs to.
		// if the word is only in german it should be german,
		// otherwise it will be the document's primary language.
		char langId = langUnknown;
		if ( m_wts && langVec ) langId = langVec[i];

		char wd;
		if ( hi->m_useCountTable ) {
			wd = wdv[i];
		} else {
			wd = MAXDIVERSITYRANK;
		}
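		// note the overload above: for inlink text the 0-15 word-spam
		// slot of the posdb key carries the linking site's rank instead,
		// so anchor-text hits can presumably be weighted by the quality
		// of the site that linked to us.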
		bool skipword = false;
		if ( hi->m_filterUrlIndexableWords ) {
			if ( !seen_slash ) {
				// Scheme/host/domain part of the URL.
				// The http/https prefix is not indexed at all.
				if ( (wlens[i]==4 && memcmp(wptrs[i],"http",4)==0) ||
				     (wlens[i]==5 && memcmp(wptrs[i],"https",5)==0) )
				{
					// Never include as single word or in bigrams
					continue; //skip to next word
				}
				// The terms .com .co .dk etc. have lots of hits and give
				// very little value for indexing. We only index the bigrams.
				if ( isTLD(wptrs[i], wlens[i]) ) {
					skipword = true; //skip the single word but still index the bigram
				}
			} else {
				// Path part of the URL.
				// Potentially filter out "html", "aspx", "index", "cgi" etc.
			}
		}
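		// e.g. for a URL tokenized as "www example com / products":
		// "http"/"https" are dropped outright, "com" is skipped as a
		// single term (isTLD) but bigrams containing it (e.g.
		// "example com") are still indexed, and once the '/' is seen
		// the path words are indexed normally (for now).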
		if ( !skipword ) {
			key144_t k;

			Posdb::makeKey(&k,
			               h,
			               0LL,        // docid
			               wposvec[i], // dist
			               densvec[i], // densityRank 0-15
			               wd,         // diversityRank 0-15
			               ws,         // wordSpamRank 0-15
			               0,          // siterank
			               hashGroup,
			               // we set to docLang final hash loop
			               langUnknown, // langid
			               0,          // multiplier
			               false,      // syn?
			               false,      // delkey?
			               hi->m_shardByTermId);

			// key should NEVER collide since we are always incrementing
			// the distance cursor, m_dist
			dt->addTerm144(&k);

			// add to wts for PageParser.cpp display
			if ( wts ) {
				if ( !storeTerm(wptrs[i], wlens[i], h, hi, i,
				                wposvec[i], // wordPos
				                densvec[i], // densityRank 0-15
				                wd,         // diversityRank
				                ws,         // wordSpamRank
				                hashGroup,
				                wbuf,
				                wts,
				                SOURCE_NONE, // synsrc
				                langId,
				                k))
					return false;
			}
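			// note: the key above is built with docid=0 and
			// langUnknown; per the comments these fields are filled
			// in later (the "final hash loop") before the keys are
			// actually written out to posdb.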
			//
			// STRIP POSSESSIVE WORDS for indexing
			//
			// . for now do simple stripping here
			// . if word is "bob's" hash "bob"
			//

			//@todo BR 20160107: Is this always good? Is the same done in Query.cpp?
			if ( wlens[i] >= 3 &&
			     wptrs[i][wlens[i]-2] == '\'' &&
			     to_lower_a(wptrs[i][wlens[i]-1]) == 's' )
			{
				int64_t nah = hash64Lower_utf8(wptrs[i], wlens[i]-2);
				if ( plen > 0 ) nah = hash64(nah, prefixHash);
				Posdb::makeKey(&k,
				               nah,
				               0LL,        // docid
				               wposvec[i], // dist
				               densvec[i], // densityRank 0-15
				               wd,         // diversityRank
				               ws,         // wordSpamRank
				               0,          // siterank
				               hashGroup,
				               // we set to docLang final hash loop
				               langUnknown, // langid
				               0,          // multiplier
				               true,       // syn?
				               false,      // delkey?
				               hi->m_shardByTermId);
				// key should NEVER collide since we are always
				// incrementing the distance cursor, m_dist
				dt->addTerm144(&k);
				// keep going if not debug
				if ( !wts ) continue;
				// print the synonym
				if ( !storeTerm(wptrs[i],     // synWord
				                wlens[i] - 2, // strlen(synWord)
				                nah,          // termid
				                hi,
				                i,            // wordnum
				                wposvec[i],   // wordPos
				                densvec[i],   // densityRank 0-15
				                wd,           // diversityRank
				                ws,           // wordSpamRank
				                hashGroup,
				                wbuf,
				                wts,
				                SOURCE_GENERATED,
				                langId,
				                k))
					return false;
			}
		} //!skipword
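		// the stripped form ("bob" from "bob's") is added with syn=true
		// and SOURCE_GENERATED, i.e. it is treated like a generated
		// synonym of the literal token rather than an exact body hit.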
		////////
		//
		// two-word phrase
		//
		////////

		int64_t npid = pids2[i];
		uint64_t ph2 = 0;

		// repeat for the two word hash if different!
		if ( npid ) {
			// hash with prefix
			if ( plen > 0 ) ph2 = hash64 ( npid , prefixHash );
			else            ph2 = npid;
			key144_t k;
			Posdb::makeKey ( &k ,
			                 ph2 ,
			                 0LL,        // docid
			                 wposvec[i], // dist
			                 densvec[i], // densityRank 0-15
			                 MAXDIVERSITYRANK, // phrase
			                 ws,         // wordSpamRank
			                 0,          // siterank
			                 hashGroup,
			                 // we set to docLang final hash loop
			                 langUnknown, // langid
			                 0,          // multiplier
			                 false,      // syn?
			                 false,      // delkey?
			                 hi->m_shardByTermId );

			// key should NEVER collide since we are always
			// incrementing the distance cursor, m_dist
			dt->addTerm144 ( &k );

			// add to wts for PageParser.cpp display
			if ( wts ) {
				// get phrase as a string
				int32_t plen;
				char phraseBuffer[256];
				phrases->getPhrase(i, phraseBuffer, sizeof(phraseBuffer), &plen);
				// store it
				if ( !storeTerm(phraseBuffer, plen, ph2, hi, i,
				                wposvec[i],       // wordPos
				                densvec[i],       // densityRank 0-15
				                MAXDIVERSITYRANK, // phrase
				                ws,
				                hashGroup,
				                wbuf,
				                wts,
				                SOURCE_BIGRAM, // synsrc
				                langId,
				                k))
					return false;
			}
		}
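		// bigrams always get MAXDIVERSITYRANK: the phrase term carries
		// full weight while the diversity rank above may demote the
		// repeated single word, matching the "new mexico" example near
		// the top of this function.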
		//
		// NUMERIC SORTING AND RANGES
		//

		// only store numbers in fields this way
		if ( prefixHash == 0 )
		{
			continue;
		}

		// this may or may not be numeric.
		if ( ! is_digit ( wptrs[i][0] ) )
		{
			continue;
		}

		// Avoid creating "sortby" number values in posdb if not wanted
		if ( !hi->m_createSortByForNumbers )
		{
			continue;
		}

		// this might have to "back up" before any '.' or '-' symbols
		if ( ! hashNumberForSorting ( wptrs[0] ,
		                              wptrs[i] ,
		                              wlens[i] ,
		                              hi ) )
			return false;
	}

	// advance the position cursor so word positions keep increasing
	// between calls, i.e. hashTitle() and hashBody()
	if ( i > 0 ) m_dist = wposvec[i-1] + 100;

	return true;
}
// . we store numbers as floats in the top 4 bytes of the lower 6 bytes of the
//   posdb key
// . the termid is the hash of the preceding field
// . in json docs a field is like "object.details.price"
// . in meta tags it is just the meta tag name
// . credit card numbers are 16 digits. we'd need about 58 bits to store those
//   so we can't do that here, but we can approximate as a float
// . the binary representation of floating point numbers is ordered in the
//   same order as the floating points themselves! so we are lucky and can
//   keep our usual KEYCMP sorting algos to keep the floats in order.
bool XmlDoc::hashNumberForSorting ( const char *beginBuf ,
                                    const char *buf ,
                                    int32_t bufLen ,
                                    HashInfo *hi ) {

	if ( ! is_digit(buf[0]) ) return true;

	const char *p = buf;
	const char *bufEnd = buf + bufLen;

	// back up over any '.'
	if ( p > beginBuf && p[-1] == '.' ) p--;

	// negative sign?
	if ( p > beginBuf && p[-1] == '-' ) p--;

	//
	// hash as a 4-byte integer so our lastSpidered timestamps
	// don't lose 128 seconds of resolution
	//

	int32_t i = (int32_t) atoll2 ( p , bufEnd - p );

	if ( ! hashNumberForSortingAsInt32 ( i , hi , "gbsortbyint" ) )
		return false;

	// also hash the negated value so we can sort from low to high
	i = -1 * i;

	if ( ! hashNumberForSortingAsInt32 ( i , hi , "gbrevsortbyint" ) )
		return false;

	return true;
}
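// Example (sketch): a meta field "price" containing "42" would reach here with
// hi->m_prefix == "price"; the int32 42 is indexed under "gbsortbyint:price"
// and -42 under "gbrevsortbyint:price", so presumably both sort directions can
// be served by plain posdb key order.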
bool XmlDoc::hashNumberForSortingAsInt32 ( int32_t n , HashInfo *hi , const char *sortByStr ) {

	// prefix is something like "price", i.e. the meta tag "name" or
	// the json name with dots in it like "product.info.price"
	int64_t nameHash = 0LL;
	int32_t nameLen = 0;
	if ( hi->m_prefix ) nameLen = strlen ( hi->m_prefix );
	if ( hi->m_prefix && nameLen )
		nameHash = hash64Lower_utf8_nospaces( hi->m_prefix , nameLen );
	// need a prefix for hashing numbers... for now
	else { g_process.shutdownAbort(true); }

	// combine the prefix hash with a special hash to make it unique and
	// avoid collisions. this is the "TRUE" prefix.
	int64_t truePrefix64 = hash64n ( sortByStr ); // e.g. "gbsortbyint"
	// hash with the "TRUE" prefix
	int64_t ph2 = hash64 ( nameHash , truePrefix64 );

	// . now store it
	// . use the field hash as the termid. normally this would just be
	//   a prefix hash
	// . use mostly fake values otherwise
	key144_t k;
	Posdb::makeKey ( &k ,
	                 ph2 ,
	                 0,  // docid
	                 0,  // word pos #
	                 0,  // densityRank 0-15
	                 0,  // diversityRank
	                 0,  // wordSpamRank
	                 0,  // siterank
	                 0,  // hashGroup
	                 // langid is normally patched to docLang in the final
	                 // hash loop unless already set; keep it fixed here,
	                 // otherwise our numbers would be ordered by langid!
	                 // somehow we have to indicate that this is a numeric
	                 // termlist so it will not be mangled any more.
	                 langUnknown,
	                 0 ,    // multiplier
	                 false, // syn?
	                 false, // delkey?
	                 hi->m_shardByTermId );

	Posdb::setInt ( &k , n );
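	// Posdb::setInt() stores the raw int32 in the key bytes that normally
	// hold the position/rank fields zeroed above, which is presumably why
	// the alignment-bit hack below is needed to keep later code from
	// overwriting those bytes with siterank/langid.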
	// HACK: this bit is ALWAYS set to 1 by Posdb::makeKey()
	// so that we can b-step into a posdb list and make sure
	// we are aligned on a 6 byte or 12 byte key, since they come
	// in both sizes. but for this key, hack it off to tell
	// addTable144() that we are a special "numeric" posdb key
	// that has a number stored in it. then it will NOT
	// set the siterank and langid bits which would throw our
	// sorting off!!
	Posdb::setAlignmentBit ( &k , 0 );

	// sanity
	int32_t x = Posdb::getInt ( &k );
	if ( x != n ) { g_process.shutdownAbort(true); }

	HashTableX *dt = hi->m_tt;

	// the key may indeed collide, but that's ok for this application
	if ( ! dt->addTerm144 ( &k ) )
		return false;

	if ( ! m_wts )
		return true;

	// store in buffer
	char buf[128];
	snprintf(buf,126,"%s:%s int32=%" PRId32, sortByStr, hi->m_prefix, n);
	int32_t bufLen = strlen(buf);

	// add to wts for PageParser.cpp display
	// store it
	if ( ! storeTerm ( buf,
	                   bufLen,
	                   ph2,
	                   hi,
	                   0, // word#
	                   0, // wordPos
	                   0, // densityRank
	                   0, // diversityRank
	                   0, // wordSpamRank
	                   0, // hashGroup
	                   &m_wbuf,
	                   m_wts,
	                   // a hack for display in wts:
	                   SOURCE_NUMBER, // synsrc
	                   langUnknown ,
	                   k ) )
		return false;

	return true;
}