//-*- coding: utf-8 -*-
#include "XmlDoc.h"
#include "CountryCode.h" // g_countryCode
#include "Collectiondb.h"
#include "Speller.h"
#include "Synonyms.h"
#include "Process.h"
#include "ip.h"
#include "Posdb.h"
#include "Conf.h"
#include "UrlBlockCheck.h"
#include "Domains.h"
#include "FxExplicitKeywords.h"
#include <algorithm>
#include "Lemma.h"
#include <unordered_set>
#include <string>
#include "Errno.h"
#include "gbmemcpy.h"

#ifdef _VALGRIND_
#include <valgrind/memcheck.h>
#endif

static void possiblyDecodeHtmlEntitiesAgain(const char **s, int32_t *len, SafeBuf *sb, bool also_remove_certain_html_elements) {
	//some documents have incorrectly encoded html entities twice. Example:
	//correct:   <meta name="foo" content="Boa">
	//incorrect: <meta name="foo" content="&#66;oa">
	//If it seems likely that this has happened then we decode the entities again and put the result in 'sb' and update '*s' and '*len'

	//Due to the (il)logic of GB the correct form is decoded, while the incorrect form is still raw, needing double decoding

	//require "&amp;" followed by a second semicolon
	const char *amppos = (const char*)memmem(*s,*len, "&amp;", 5);
	if((amppos && memchr(amppos+5, ';', *len-(amppos-*s)-5)!=NULL) ||
	   (memmem(*s,*len,"&lt;",4)!=NULL && memmem(*s,*len,"&gt;",4)!=NULL)) {
		//shortest entity is 4 chars (&lt;), longest utf8 encoding of a codepoint is 4 bytes + a bit
		StackBuf<1024> tmpBuf;
		if(!tmpBuf.reserve(*len + *len/2 + 4))
			return;
		if(!sb->reserve(*len + *len/2 + 4))
			return;

		int32_t tmpLen = htmlDecode(tmpBuf.getBufStart(), *s,*len, false);

		int32_t newlen = htmlDecode(sb->getBufStart(), tmpBuf.getBufStart(), tmpLen, false);

		sb->setLength(newlen);

		//Furthermore, some websites have junk in their meta tags. Eg <br> in the meta description
		//We don't fix all cases as that could hurt correctly written pages about how to write proper html. But
		//if they don't mention "html", "tag" nor "element" then we remove the most common offenders br/b/u/p
		//When changing this function consider keeping in sync with Summary::maybeRemoveHtmlFormatting()
		if(also_remove_certain_html_elements) {
			if(memmem(sb->getBufStart(),sb->length(),"html",4)==0 &&
			   memmem(sb->getBufStart(),sb->length(),"HTML",4)==0 &&
			   memmem(sb->getBufStart(),sb->length(),"tag",3)==0 &&
			   memmem(sb->getBufStart(),sb->length(),"Tag",3)==0 &&
			   memmem(sb->getBufStart(),sb->length(),"element",7)==0 &&
			   memmem(sb->getBufStart(),sb->length(),"Element",7)==0)
			{
				sb->safeReplace2("<br>",4," ",1,0);
				sb->safeReplace2("<b>",3,"",0,0);
				sb->safeReplace2("<u>",3,"",0,0);
				sb->safeReplace2("<p>",3," ",1,0);
			}
		}
		*s = sb->getBufStart();
		*len = sb->length();
	}
}
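
// Illustrative sketch (hypothetical input): a meta description that arrived as
//   "Fish &amp;amp; Chips &amp;lt;br&amp;gt; daily"
// decodes once to "Fish &amp; Chips &lt;br&gt; daily" and a second time to
// "Fish & Chips <br> daily"; with also_remove_certain_html_elements set, the
// stray "<br>" is then replaced by a space.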
// a ptr to HashInfo is passed to hashString() and hashWords()
|
|
class HashInfo {
|
|
public:
|
|
HashInfo() {
|
|
m_tt = NULL;
|
|
m_prefix = NULL;
|
|
m_desc = NULL;
|
|
m_date = 0;
|
|
// should we do sharding based on termid and not the usual docid???
|
|
// in general this is false, but for checksum we want to shard
|
|
// by the checksum and not docid to avoid having to do a
|
|
// gbchecksum:xxxxx search on ALL shards. much more efficient.
|
|
m_shardByTermId = false;
|
|
m_hashGroup = -1;
|
|
m_useCountTable = true;
|
|
m_useSections = true;
|
|
m_startDist = 0;
|
|
|
|
m_hashNumbers = true;
|
|
m_filterUrlIndexableWords = false;
|
|
m_linkerSiteRank = 0;
|
|
}
|
|
class HashTableX *m_tt;
|
|
const char *m_prefix;
|
|
// "m_desc" should detail the algorithm
|
|
const char *m_desc;
|
|
int32_t m_date;
|
|
bool m_shardByTermId;
|
|
char m_linkerSiteRank;
|
|
char m_hashGroup;
|
|
int32_t m_startDist;
|
|
bool m_useCountTable;
|
|
bool m_useSections;
|
|
bool m_hashNumbers;
|
|
bool m_filterUrlIndexableWords; //Do special filtering on words in url, eg. exclude "com" before path
|
|
};
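
// Typical setup (sketch; mirrors callers such as hashContentType() and hashTitle() below):
//
//   HashInfo hi;
//   hi.m_tt        = tt;                // destination term table (posdb keys)
//   hi.m_hashGroup = HASHGROUP_TITLE;   // which part of the document the text came from
//   hi.m_prefix    = "title";           // optional prefix, for "title:foo" style queries
//   if(!hashString(s, slen, &hi)) return false;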
|
|
|
|
|
|
|
|
static bool storeTerm ( const char *s ,
|
|
int32_t slen ,
|
|
int64_t termId ,
|
|
HashInfo *hi ,
|
|
int32_t wordNum ,
|
|
int32_t wordPos ,
|
|
char densityRank,
|
|
char diversityRank ,
|
|
char wordSpamRank ,
|
|
char hashGroup,
|
|
//bool isPhrase ,
|
|
SafeBuf *wbuf ,
|
|
HashTableX *wts ,
|
|
char synSrc ,
|
|
char langId ,
|
|
posdbkey_t key ) {
|
|
|
|
// store prefix
|
|
int32_t poff = wbuf->length();
|
|
// shortcut
|
|
const char *p = hi->m_prefix;
|
|
// add the prefix too!
|
|
if ( p && ! wbuf->safeMemcpy(p,strlen(p)+1)) return false;
|
|
// none?
|
|
if ( ! p ) poff = -1;
|
|
|
|
|
|
// store description
|
|
int32_t doff = wbuf->length();
|
|
// shortcut
|
|
const char *d = hi->m_desc;
|
|
// add the desc too!
|
|
if ( d && ! wbuf->safeMemcpy(d,strlen(d)+1) ) return false;
|
|
// none?
|
|
if ( ! d ) doff = -1;
|
|
|
|
// store term
|
|
int32_t toff = wbuf->length();
|
|
// add it
|
|
if ( ! wbuf->safeMemcpy ( s , slen ) ) return false;
|
|
// make this
|
|
TermDebugInfo ti;
|
|
ti.m_termOff = toff;
|
|
ti.m_termLen = slen;
|
|
ti.m_descOff = doff;
|
|
ti.m_prefixOff = poff;
|
|
ti.m_date = hi->m_date;
|
|
ti.m_shardByTermId = hi->m_shardByTermId;
|
|
ti.m_termId = termId;
|
|
//ti.m_weight = 1.0;
|
|
//ti.m_spam = -1.0;
|
|
ti.m_diversityRank = diversityRank;
|
|
ti.m_densityRank = densityRank;
|
|
ti.m_wordSpamRank = wordSpamRank;
|
|
ti.m_hashGroup = hashGroup;
|
|
ti.m_wordNum = wordNum;
|
|
ti.m_wordPos = wordPos;
|
|
ti.m_langId = langId;
|
|
ti.m_key = key;
|
|
|
|
// save for printing out an asterisk
|
|
ti.m_synSrc = synSrc; // isSynonym = isSynonym;
|
|
|
|
// get language bit vec
|
|
ti.m_langBitVec64 = g_speller.getLangBits64(termId);
|
|
|
|
// make the key
|
|
key96_t k;
|
|
k.n1 = 0; // date
|
|
k.n0 = termId;
|
|
|
|
// store it
|
|
return wts->addKey ( &k , &ti ) ;
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
//
|
|
// . hash terms that are sharded by TERMID not DOCID!!
|
|
//
|
|
// . returns false and sets g_errno on error
|
|
// . these terms are stored in indexdb, but all terms with the same
|
|
// termId reside in one and only one group. whereas normally the records
|
|
// are split based on docid and every group gets 1/nth of the termlist.
|
|
// . we do this "no splitting" so that only one disk seek is required, and
|
|
// we know the termlist is small, or the termlist is being used for spidering
|
|
// or parsing purposes and is usually not sent across the network.
|
|
bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
|
|
// constructor should set to defaults automatically
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = tt;
|
|
// usually we shard by docid, but these are terms we shard by termid!
|
|
hi.m_shardByTermId = true;
|
|
|
|
if ((size_utf8Content - 1) > 0) {
|
|
// for exact content deduping
|
|
setStatus("hashing gbcontenthash (deduping) no-split keys");
|
|
|
|
// this should be ready to go and not block!
|
|
int64_t *pch64 = getExactContentHash64();
|
|
if (!pch64 || pch64 == (void *)-1) { g_process.shutdownAbort(true); }
|
|
|
|
char cbuf[64];
|
|
int32_t clen = sprintf(cbuf, "%" PRIu64, (uint64_t)*pch64);
|
|
hi.m_prefix = "gbcontenthash";
|
|
if (!hashString(cbuf, clen, &hi)) return false;
|
|
}
|
|
|
|
// now hash the site
|
|
setStatus ( "hashing no-split SiteGetter terms");
|
|
|
|
Url *fu = getFirstUrl();
|
|
const char *host = fu->getHost();
|
|
|
|
//
|
|
// HASH terms for SiteGetter.cpp
|
|
//
|
|
// these are now no-split terms
|
|
//
|
|
const char *s = fu->getUrl();
|
|
int32_t slen = fu->getUrlLen();
|
|
// . this termId is used by SiteGetter.cpp for determining subsites
|
|
// . matches what is in SiteGet::getSiteList()
|
|
// for www.xyz.com/a/ HASH www.xyz.com
|
|
// for www.xyz.com/a/b/ HASH www.xyz.com/a/
|
|
// for www.xyz.com/a/b/c/ HASH www.xyz.com/a/b/
|
|
bool add = true;
|
|
// we only hash this for urls that end in '/'
|
|
if ( s[slen-1] != '/' ) add = false;
|
|
// and no cgi
|
|
if ( fu->isCgi() ) add = false;
|
|
// skip if root
|
|
if ( fu->getPathLen() <= 1 ) add = false;
|
|
// sanity check
|
|
if ( ! m_linksValid ) { g_process.shutdownAbort(true); }
|
|
// . skip if we have no subdirectory outlinks
|
|
// . that way we do not confuse all the pages in dictionary.com or
|
|
// wikipedia.org as subsites!!
|
|
if ( ! m_links.hasSubdirOutlink() ) add = false;
|
|
// hash it
|
|
if ( add ) {
|
|
// remove the last path component
|
|
const char *end2 = s + slen - 2;
|
|
// back up over last component
|
|
for ( ; end2 > fu->getPath() && *end2 != '/' ; end2-- ) ;
|
|
// hash that part of the url
|
|
hi.m_prefix = "siteterm";
|
|
if ( ! hashSingleTerm ( host,end2-host,&hi) ) return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// . returns -1 if blocked, returns NULL and sets g_errno on error
|
|
// . "sr" is the tagdb Record
|
|
// . "ws" store the terms for PageParser.cpp display
|
|
char *XmlDoc::hashAll(HashTableX *table) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "BEGIN");
|
|
|
|
setStatus("hashing document");
|
|
|
|
if (m_allHashed) {
|
|
return (char *)1;
|
|
}
|
|
|
|
// sanity checks
|
|
if (table->getKeySize() != 18 || table->getDataSize() != 4) {
|
|
g_process.shutdownAbort(true);
|
|
}
|
|
|
|
// ptr to term = 4 + score = 4 + ptr to sec = 4
|
|
if (m_wts && (m_wts->getKeySize() != 12 || m_wts->getDataSize() != sizeof(TermDebugInfo))) {
|
|
g_process.shutdownAbort(true);
|
|
}
|
|
|
|
uint8_t *ct = getContentType();
|
|
if (!ct) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, getContentType failed");
|
|
return NULL;
|
|
}
|
|
|
|
lemma_words.clear();
|
|
|
|
// BR 20160127: Never index JSON and XML content
|
|
if (*ct == CT_JSON || *ct == CT_XML) {
|
|
// For XML (JSON should not get here as it should be filtered out during spidering)
|
|
// store the URL as the only thing in posdb so we are able to find it, and
|
|
// eventually ban it.
|
|
if (!hashUrl(table, true)) { // urlOnly (skip IP and term generation)
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashUrl failed");
|
|
return NULL;
|
|
}
|
|
m_allHashed = true;
|
|
return (char *)1;
|
|
}
|
|
|
|
// need this for hashing
|
|
HashTableX *cnt = getCountTable();
|
|
if (!cnt) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, getCountTable failed");
|
|
return (char *)cnt;
|
|
}
|
|
if (cnt == (void *)-1) {
|
|
g_process.shutdownAbort(true);
|
|
}
|
|
|
|
// and this
|
|
Links *links = getLinks();
|
|
if (!links) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, getLinks failed");
|
|
return (char *)links;
|
|
}
|
|
if (links == (Links *)-1) {
|
|
g_process.shutdownAbort(true);
|
|
}
|
|
|
|
char *wordSpamVec = getWordSpamVec();
|
|
if (!wordSpamVec) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, getWordSpamVec failed");
|
|
return wordSpamVec;
|
|
}
|
|
if (wordSpamVec == (void *)-1) {
|
|
g_process.shutdownAbort(true);
|
|
}
|
|
|
|
char *fragVec = getFragVec();
|
|
if (!fragVec) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, getFragVec failed");
|
|
return fragVec;
|
|
}
|
|
if (fragVec == (void *)-1) {
|
|
g_process.shutdownAbort(true);
|
|
}
|
|
|
|
// why do we need this?
|
|
if ( m_wts ) {
|
|
uint8_t *lv = getLangVector();
|
|
if (!lv) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, getLangVector failed");
|
|
return (char *)lv;
|
|
}
|
|
if (lv == (void *)-1) {
|
|
g_process.shutdownAbort(true);
|
|
}
|
|
}
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, getCollRec failed");
|
|
return NULL;
|
|
}
|
|
|
|
// do not repeat this if the cachedb storage call blocks
|
|
m_allHashed = true;
|
|
|
|
// reset distance cursor
|
|
m_dist = 0;
|
|
|
|
if (!hashContentType(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashContentType failed");
|
|
return NULL;
|
|
}
|
|
|
|
if (!hashUrl(table, false)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashUrl failed");
|
|
return NULL;
|
|
}
|
|
|
|
if (!hashLanguage(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashLanguage failed");
|
|
return NULL;
|
|
}
|
|
|
|
if (!hashCountry(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashCountry failed");
|
|
return NULL;
|
|
}
|
|
|
|
// now hash the terms sharded by termid and not docid here since they
|
|
// just set a special bit in posdb key so Rebalance.cpp can work.
|
|
// this will hash the content checksum which we need for deduping
|
|
// which we use for diffbot custom crawls as well.
|
|
if (!hashNoSplit(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashNoSplit failed");
|
|
return NULL;
|
|
}
|
|
|
|
// MDW: i think we just inject empty html with a diffbotreply into
|
|
// global index now, so don't need this... 9/28/2014
|
|
|
|
// stop indexing xml docs
|
|
// global index unless this is a json object in which case it is
|
|
// hashed above in the call to hashJSON(). this will decrease disk
|
|
// usage by about half, posdb* files are pretty big.
|
|
if (!cr->m_indexBody) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, !indexDoc");
|
|
return (char *)1;
|
|
}
|
|
|
|
bool *ini = getIsNoIndex();
|
|
if (ini == nullptr || ini == (bool*)-1) {
|
|
// must not be blocked
|
|
gbshutdownLogicError();
|
|
}
|
|
|
|
if (*ini && m_version > 126) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, noindex");
|
|
return (char *)1;
|
|
}
|
|
|
|
if ((size_utf8Content - 1) <= 0) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, contentLen == 0");
|
|
return (char *)1;
|
|
}
|
|
|
|
// hash the body of the doc first so m_dist is 0 to match
|
|
// the rainbow display of sections
|
|
if (!hashBody2(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashBody2 failed");
|
|
return NULL;
|
|
}
|
|
|
|
// hash the title now too so neighborhood singles have more
|
|
// to match. plus, we only hash these title terms iff they
|
|
// are not already in the hash table, so as to avoid hashing
|
|
// repeated title terms because we do not do spam detection
|
|
// on them. thus, we need to hash these first before anything
|
|
// else. give them triple the body score
|
|
if (!hashTitle(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashTitle failed");
|
|
return NULL;
|
|
}
|
|
|
|
// . hash the keywords tag, limited to first 2k of them so far
|
|
// . hash above the neighborhoods so the neighborhoods only index
|
|
// what is already in the hash table
|
|
if (!hashMetaKeywords(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaKeywords failed");
|
|
return NULL;
|
|
}
|
|
|
|
//Hash explicit keywords, if any
|
|
if(!hashExplicitKeywords(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashExplicityKeywords failed");
|
|
return NULL;
|
|
}
|
|
|
|
// then hash the incoming link text, NO ANOMALIES, because
|
|
// we index the single words in the neighborhoods next, and
|
|
// we had songfacts.com coming up for the 'street light facts'
|
|
// query because it had a bunch of anomalous inlink text.
|
|
if (!hashIncomingLinkText(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashIncomingLinkText failed");
|
|
return NULL;
|
|
}
|
|
|
|
// then the meta summary and description tags with half the score of
|
|
// the body, and only hash a term if was not already hashed above
|
|
// somewhere.
|
|
if (!hashMetaSummary(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaSummary failed");
|
|
return NULL;
|
|
}
|
|
|
|
// BR 20160220
|
|
// Store value of meta tag "geo.placename" to help aid searches for
|
|
// location specific sites, e.g. 'Restaurant in London'
|
|
if (!hashMetaGeoPlacename(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaGeoPlacename failed");
|
|
return NULL;
|
|
}
|
|
|
|
// this will only increment the scores of terms already in the table
// because the neighborhoods are not necessarily technically part of the
// document and we do not want to ruin our precision
|
|
if (!hashNeighborhoods(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashNeighborhoods failed");
|
|
return NULL;
|
|
}
|
|
|
|
if (!hashLinks(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashLinks failed");
|
|
return NULL;
|
|
}
|
|
|
|
if (!hashMetaTags(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaTags failed");
|
|
return NULL;
|
|
}
|
|
|
|
// hash gblang:de last for parsing consistency
|
|
if (!hashLanguageString(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashLanguageString failed");
|
|
return NULL;
|
|
}
|
|
|
|
if(!hashLemmas(table)) {
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, hashLemmas failed");
|
|
return NULL;
|
|
}
|
|
lemma_words.clear(); //release memory early
|
|
|
|
logTrace(g_conf.m_logTraceXmlDoc, "END, OK");
|
|
return (char *)1;
|
|
}
|
|
|
|
// returns false and sets g_errno on error
|
|
bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing meta tags" );
|
|
|
|
int32_t n = m_xml.getNumNodes();
|
|
XmlNode *nodes = m_xml.getNodes();
|
|
|
|
// set up the hashing parms
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INMETATAG;
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "custom meta tag";
|
|
|
|
// loop over the nodes looking for meta tags
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
//we are only interested in meta tags
|
|
if(nodes[i].m_nodeId != TAG_META)
|
|
continue;
|
|
// only get content for <meta name=..> not <meta http-equiv=..>
|
|
int32_t tagLen;
|
|
const char *tag = m_xml.getString(i, "name", &tagLen);
|
|
// skip if error/empty
|
|
if ( ! tag || tagLen <= 0 ) continue;
|
|
|
|
// this is now reserved
|
|
// do not hash keyword, keywords, description, or summary metas
|
|
// because that is done in hashRange() below based on the
|
|
// tagdb (ruleset) record
|
|
if ((tagLen== 7&&strncasecmp(tag,"keyword" , 7)== 0)||
|
|
(tagLen== 7&&strncasecmp(tag,"summary" , 7)== 0)||
|
|
(tagLen== 8&&strncasecmp(tag,"keywords" , 8)== 0)||
|
|
(tagLen==11&&strncasecmp(tag,"description",11)== 0) )
|
|
continue;
|
|
|
|
|
|
// BR 20160107: Only hash certain custom meta tags and ignore the rest
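// note: these are prefix comparisons on the tag name; tagLen is not checked here,
// so e.g. a meta tag named "authors" also passes the "author" test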
|
|
if(
|
|
(strncasecmp(tag,"subject", 7) != 0) &&
|
|
(strncasecmp(tag,"abstract", 8) != 0) &&
|
|
(strncasecmp(tag,"news_keywords", 13) != 0) && // http://www.metatags.org/meta_name_news_keywords
|
|
(strncasecmp(tag,"author", 6) != 0) &&
|
|
(strncasecmp(tag,"title", 5) != 0) &&
|
|
(strncasecmp(tag,"og:title", 8) != 0) &&
|
|
(strncasecmp(tag,"og:description", 14) != 0) &&
|
|
(strncasecmp(tag,"twitter:title", 13) != 0) &&
|
|
(strncasecmp(tag,"twitter:description", 19) != 0) )
|
|
{
|
|
// If none of the above, it is an unwanted meta tag
|
|
continue;
|
|
}
|
|
|
|
// get the content
|
|
int32_t len;
|
|
const char *s = m_xml.getString ( i , "content" , &len );
|
|
if ( ! s || len <= 0 ) continue;
|
|
|
|
StackBuf<1024> doubleDecodedContent;
|
|
possiblyDecodeHtmlEntitiesAgain(&s, &len, &doubleDecodedContent, true);
|
|
|
|
// Now index the wanted meta tags as normal text without prefix so they
|
|
// are used in user searches automatically.
|
|
hi.m_prefix = NULL;
|
|
|
|
bool status = hashString4(s,len,&hi);
|
|
|
|
// bail on error, g_errno should be set
|
|
if ( ! status ) return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
|
|
// returns false and sets g_errno on error
|
|
bool XmlDoc::hashContentType ( HashTableX *tt ) {
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) return false;
|
|
|
|
|
|
uint8_t *ctype = getContentType();
|
|
if( !ctype ) {
|
|
return false;
|
|
}
|
|
|
|
const char *s = NULL;
|
|
|
|
setStatus ( "hashing content type" );
|
|
|
|
|
|
// hash numerically so we can do gbfacetint:type on it
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = tt;
|
|
hi.m_prefix = "type";
|
|
|
|
char tmp[6];
|
|
sprintf(tmp,"%" PRIu32,(uint32_t)*ctype);
|
|
if ( ! hashString (tmp,strlen(tmp),&hi ) ) return false;
|
|
|
|
|
|
// these ctypes are defined in HttpMime.h
|
|
switch (*ctype) {
|
|
case CT_HTML: s = "html"; break;
|
|
case CT_TEXT: s = "text"; break;
|
|
case CT_XML : s = "xml" ; break;
|
|
case CT_PDF : s = "pdf" ; break;
|
|
case CT_DOC : s = "doc" ; break;
|
|
case CT_XLS : s = "xls" ; break;
|
|
case CT_PPT : s = "ppt" ; break;
|
|
case CT_PS : s = "ps" ; break;
|
|
// for diffbot. so we can limit search to json objects
|
|
// in Diffbot.cpp
|
|
case CT_JSON: s = "json" ; break;
|
|
}
|
|
// bail if unrecognized content type
|
|
if ( ! s ) return true;
|
|
|
|
// . now hash it
|
|
// . use a score of 1 for all
|
|
// . TODO: ensure doc counting works ok with this when it does
|
|
// its interpolation
|
|
return hashString (s,strlen(s),&hi );
|
|
}
|
|
|
|
// . hash the link: terms
|
|
// . ensure that more useful linkers are scored higher
|
|
// . useful for computing offsite link text for qdb-ish algorithm
|
|
// . NOTE: for now i do not hash links to the same domain in order to
|
|
// hopefully save 10%-25% index space
|
|
// . NOTE: PLUS, they may clog up the link-adjusted quality ratings since
|
|
// different site links with no link text will be ranked behind them
|
|
// . the 8-bit bitmap of the score of a link: term:
|
|
// . 00ubdcss u = link is Unbanned? b = link isBanned?
|
|
// d = link dirty? c = link clean?
|
|
// s = 01 if no link text, 10 if link text
|
|
// . NOTE: this is used in Msg18.cpp for extraction
|
|
// . CAUTION: IndexList::score32to8() will warp our score if its >= 128
|
|
// so i moved the bits down
|
|
bool XmlDoc::hashLinks ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing links" );
|
|
|
|
// shortcuts
|
|
bool isRSSFeed = *getIsRSS();
|
|
|
|
char dbuf[8*4*1024];
|
|
HashTableX dedup;
|
|
dedup.set( 8,0,1024,dbuf,8*4*1024,false,"hldt");
|
|
|
|
CollectionRec *cr = getCollRec();
|
|
if ( ! cr ) {
|
|
logTrace( g_conf.m_logTraceXmlDoc, "END, getCollRec failed" );
|
|
return false;
|
|
}
|
|
|
|
// see ../url/Url2.cpp for hashAsLink() algorithm
|
|
for ( int32_t i = 0 ; i < m_links.m_numLinks ; i++ ) {
|
|
// skip links with zero length
|
|
if ( m_links.m_linkLens[i] == 0 ) {
|
|
continue;
|
|
}
|
|
|
|
// . skip if we are rss page and this link is an <a href> link
|
|
// . we only harvest/index <link> urls from rss feeds
|
|
// . or in the case of feedburner, those orig tags
|
|
if ( isRSSFeed && (m_links.m_linkFlags[i] & LF_AHREFTAG) ) {
|
|
continue;
|
|
}
|
|
|
|
// if we have a <feedburner:origLink> tag, then ignore <link>
|
|
// tags and only get the links from the original links
|
|
if ( m_links.m_isFeedBurner && !(m_links.m_linkFlags[i] & LF_FBTAG) ) {
|
|
continue;
|
|
}
|
|
|
|
// normalize the link
|
|
Url link;
|
|
|
|
// now we always add "www" to these links so that any link
|
|
// to cnn.com is same as link to www.cnn.com, because either
|
|
// we index cnn.com or www.cnn.com but not both providing
|
|
// their content is identical (deduping). This way whichever
|
|
// one we index, we can take advantage of all link text whether
|
|
// it's to cnn.com or www.cnn.com.
|
|
// Every now and then we add new session ids to our list in
|
|
// Url.cpp, too, so we have to version that.
|
|
// Since this is just for hashing, it shouldn't matter that
|
|
// www.tmblr.co has no IP whereas only tmblr.co does.
|
|
link.set( m_links.m_linkPtrs[i], m_links.m_linkLens[i], true, m_links.m_stripParams, m_version );
|
|
|
|
// BR 20160105: Do not create "link:" hashes for media URLs etc.
|
|
if( link.hasNonIndexableExtension(TITLEREC_CURRENT_VERSION) || // @todo BR: For now ignore actual TitleDB version. // m_version) ||
|
|
link.hasScriptExtension() ||
|
|
link.hasJsonExtension() ||
|
|
link.hasXmlExtension() ||
|
|
isUrlBlocked(link)) {
|
|
logTrace( g_conf.m_logTraceXmlDoc, "Unwanted for indexing [%s]", link.getUrl());
|
|
continue;
|
|
}
|
|
|
|
// dedup this crap
|
|
int64_t h = hash64 ( link.getUrl(), link.getUrlLen() );
|
|
if ( dedup.isInTable ( &h ) ) continue;
|
|
if ( ! dedup.addKey ( &h ) ) return false;
|
|
|
|
// set up the hashing parms
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = tt;
|
|
hi.m_prefix = "link";
|
|
|
|
// hash link:<url>
|
|
if ( ! hashSingleTerm ( link.getUrl(),link.getUrlLen(),&hi )) {
|
|
return false;
|
|
}
|
|
|
|
h = hash64 ( link.getHost() , link.getHostLen() );
|
|
if ( dedup.isInTable ( &h ) ) continue;
|
|
if ( ! dedup.addKey ( &h ) ) return false;
|
|
|
|
// fix parm
|
|
hi.m_prefix = "sitelink";
|
|
|
|
// hash sitelink:<urlHost>
|
|
if ( ! hashSingleTerm ( link.getHost(),link.getHostLen(),&hi)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . hash for linkdb
|
|
bool XmlDoc::hashLinksForLinkdb ( HashTableX *dt ) {
|
|
|
|
// sanity check
|
|
if ( dt->getKeySize() != sizeof(key224_t) ) { g_process.shutdownAbort(true); }
|
|
if ( dt->getDataSize() != 0 ) { g_process.shutdownAbort(true); }
|
|
|
|
// this will be different with our new site definitions
|
|
uint32_t linkerSiteHash32 = *getSiteHash32();
|
|
|
|
char siteRank = getSiteRank();
|
|
|
|
if ( ! m_linksValid ) { g_process.shutdownAbort(true); }
|
|
|
|
int32_t *linkSiteHashes = getLinkSiteHashes();
|
|
if ( ! linkSiteHashes || linkSiteHashes == (void *)-1 ) {
|
|
g_process.shutdownAbort(true);
|
|
}
|
|
|
|
// use spidered time! might not be current time! like if rebuilding
|
|
// or injecting from a past spider time
|
|
int32_t discoveryDate = getSpideredTime();
|
|
|
|
// add in new links
|
|
for ( int32_t i = 0 ; i < m_links.m_numLinks ; i++ ) {
|
|
// skip if empty
|
|
if (m_links.m_linkLens[i] == 0) {
|
|
continue;
|
|
}
|
|
|
|
// . skip if spam, ALWAYS allow internal outlinks though!!
|
|
// . CAUTION: now we must version islinkspam()
|
|
bool spam = m_links.isLinkSpam(i);
|
|
|
|
// get site of outlink from tagrec if in there
|
|
int32_t linkeeSiteHash32 = linkSiteHashes[i];
|
|
|
|
//
|
|
// when setting the links class it should set the site hash
|
|
//
|
|
|
|
|
|
#ifdef _VALGRIND_
|
|
VALGRIND_CHECK_MEM_IS_DEFINED(&linkeeSiteHash32,sizeof(linkeeSiteHash32));
|
|
uint64_t tmp1 = m_links.getLinkHash64(i);
|
|
VALGRIND_CHECK_MEM_IS_DEFINED(&tmp1,sizeof(tmp1));
|
|
VALGRIND_CHECK_MEM_IS_DEFINED(&spam,sizeof(spam));
|
|
VALGRIND_CHECK_MEM_IS_DEFINED(&siteRank,sizeof(siteRank));
|
|
// uint32_t tmp2 = *getIp();
|
|
// VALGRIND_CHECK_MEM_IS_DEFINED(&tmp2,sizeof(tmp2));
|
|
uint64_t tmp3 = *getDocId();
|
|
VALGRIND_CHECK_MEM_IS_DEFINED(&tmp3,sizeof(tmp3));
|
|
VALGRIND_CHECK_MEM_IS_DEFINED(&discoveryDate,sizeof(discoveryDate));
|
|
VALGRIND_CHECK_MEM_IS_DEFINED(&linkerSiteHash32,sizeof(linkerSiteHash32));
|
|
#endif
|
|
|
|
int32_t *ipptr = getIp();
|
|
int32_t ip = ipptr ? *ipptr : 0;
|
|
|
|
// set this key, it is the entire record
|
|
key224_t k = Linkdb::makeKey_uk ( linkeeSiteHash32 ,
|
|
m_links.getLinkHash64(i) ,
|
|
spam , // link spam?
|
|
siteRank , // was quality
|
|
ip,
|
|
*getDocId() ,
|
|
discoveryDate ,
|
|
0 ,
|
|
false , // new add?
|
|
linkerSiteHash32 ,
|
|
false );// delete?
|
|
#ifdef _VALGRIND_
|
|
VALGRIND_CHECK_MEM_IS_DEFINED(&k,sizeof(k));
|
|
#endif
|
|
|
|
// store in hash table
|
|
if (!dt->addKey(&k, NULL)) {
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . copied Url2.cpp into here basically, so we can now dump Url2.cpp
|
|
bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc ) {
|
|
|
|
setStatus ( "hashing url colon" );
|
|
|
|
// get the first url
|
|
Url *fu = getFirstUrl();
|
|
|
|
// set up the hashing parms
|
|
HashInfo hi;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_tt = tt;
|
|
|
|
// we do not need diversity bits for this
|
|
hi.m_useCountTable = false;
|
|
//
|
|
// HASH url: term
|
|
//
|
|
// append a "www." for doing url: searches
|
|
Url uw;
|
|
uw.set( fu->getUrl(), fu->getUrlLen(), true, false );
|
|
hi.m_prefix = "url";
|
|
|
|
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
|
|
return false;
|
|
|
|
if (urlOnly) {
|
|
return true;
|
|
}
|
|
|
|
bool *ini = getIsNoIndex();
|
|
if (ini == nullptr || ini == (bool*)-1) {
|
|
// must not be blocked
|
|
gbshutdownLogicError();
|
|
}
|
|
|
|
const char *s = fu->getUrl();
|
|
int32_t slen = fu->getUrlLen();
|
|
|
|
SafeBuf sb_punyDecodedHost;
|
|
//no-index support was added in version 126. So if noindex is not present, or if the titlerec version is older than that, then index as usual
|
|
if (!*ini || m_version <= 126) {
|
|
setStatus("hashing inurl colon");
|
|
|
|
//
|
|
// HASH inurl: terms
|
|
//
|
|
hi.m_prefix = "inurl";
|
|
|
|
// BR 20160114: Skip numbers in urls when doing "inurl:" queries
|
|
hi.m_hashNumbers = false;
|
|
hi.m_filterUrlIndexableWords = true;
|
|
if (!hashString(s, slen, &hi)) return false;
|
|
|
|
//If the host has punycode encoded characters in it and the TLD has some enforcement against phishing
|
|
//and misleading domains then index the punycode-decoded string too
|
|
if(fu->isPunycodeSafeTld() && fu->hasPunycode()) {
|
|
if(fu->getPunycodeDecodedHost(&sb_punyDecodedHost)) {
|
|
//note: we index non-punycode labels too, it is not worth the effort to avoid that
//because we also need them for bigram generation. So eg www.ærtesuppe.dk will get
//indexed for "www", "xn--rtesuppe-i0a", and "dk" in the hashString() call above
//and then for "www", "ærtesuppe" and "dk" below.
|
|
if (!hashString(sb_punyDecodedHost.getBufStart(), sb_punyDecodedHost.length(), &hi))
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
{
|
|
setStatus("hashing ip colon");
|
|
hi.m_hashNumbers = true;
|
|
hi.m_filterUrlIndexableWords = false;
|
|
|
|
//
|
|
// HASH ip:a.b.c.d
|
|
//
|
|
if (!m_ipValid) { g_process.shutdownAbort(true); }
|
|
// copy it to save it
|
|
char ipbuf[64];
|
|
int32_t iplen = strlen(iptoa(m_ip, ipbuf));
|
|
hi.m_prefix = "ip";
|
|
if (!hashSingleTerm(ipbuf, iplen, &hi)) return false;
|
|
|
|
// . sanity check
|
|
if (!m_siteNumInlinksValid) { g_process.shutdownAbort(true); }
|
|
}
|
|
|
|
|
|
//
|
|
// HASH the url's mid domain and host as they were in the body
|
|
//
|
|
setStatus ( "hashing site colon terms");
|
|
|
|
//
|
|
// HASH the site: terms
|
|
//
|
|
// . hash the pieces of the site
|
|
// . http://host.domain.com/~harry/level1/ should hash to:
|
|
// . site:host.domain.com/~harry/level1/
|
|
// . site:host.domain.com/~harry/
|
|
// . site:host.domain.com/~
|
|
// . site:host.domain.com/
|
|
// . site:domain.com/~harry/level1/
|
|
// . site:domain.com/~harry/
|
|
// . site:domain.com/~
|
|
// . site:domain.com/
|
|
// ensure score is positive
|
|
//if ( siteScore <= 0 ) siteScore = 1;
|
|
// get the hostname (later we set to domain name)
|
|
const char *name = fu->getHost();
|
|
int32_t nameLen = fu->getHostLen();
|
|
|
|
#ifdef _VALGRIND_
|
|
VALGRIND_CHECK_MEM_IS_DEFINED(name,nameLen);
|
|
#endif
|
|
// . point to the end of the whole thing, including port field
|
|
// . add in port, if non default
|
|
const char *end3 = name + fu->getHostLen() + fu->getPortLen();
|
|
|
|
// Generate string with port if server runs on non-standard ports
|
|
char pbuf[12];
|
|
int pbufLen=0;
|
|
int32_t port = fu->getPort();
|
|
if( port > 0 && port != 80 && port != 443 ) {
|
|
pbufLen=snprintf(pbuf, 12, ":%" PRIu32, (uint32_t)fu->getPort());
|
|
}
|
|
|
|
|
|
loop:
|
|
// now loop through the sub paths of this url's path
|
|
int32_t prev_len = -1;
|
|
for ( int32_t i = 0 ; ; i++ ) {
|
|
// get the subpath
|
|
int32_t len = fu->getSubPathLen(i);
|
|
if(len==prev_len) //work around bug (?) in Url
|
|
continue;
|
|
prev_len = len;
|
|
|
|
// FIX: always include first /
|
|
if ( len == 0 ) {
|
|
len = 1;
|
|
}
|
|
|
|
// write http://www.whatever.com/path into buf
|
|
char buf[MAX_URL_LEN+10];
|
|
char *p = buf;
|
|
|
|
// BR 20160122: Do NOT fix this for https sites. The search is
|
|
// always prefixed with http:// (sigh ...)
|
|
gbmemcpy ( p , "http://" , 7 ); p += 7;
|
|
gbmemcpy ( p , name, nameLen); p += nameLen;
|
|
if( pbufLen > 0 )
|
|
{
|
|
gbmemcpy ( p , pbuf, pbufLen); p += pbufLen;
|
|
}
|
|
gbmemcpy ( p , fu->getPath() , len ); p += len;
|
|
*p = '\0';
|
|
|
|
// update hash parms
|
|
if (m_version <= 126) {
|
|
hi.m_prefix = "site";
|
|
} else {
|
|
hi.m_prefix = *ini ? "sitenoindex" : "site";
|
|
}
|
|
|
|
hi.m_hashGroup = HASHGROUP_INURL;
|
|
|
|
|
|
// this returns false on failure
|
|
if ( ! hashSingleTerm (buf,p-buf,&hi ) ) {
|
|
return false;
|
|
}
|
|
|
|
// break when we hash the root path
|
|
if ( len <=1 ) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
// now keep moving the period over in the hostname
|
|
while ( name < end3 && *name != '.' ) {
|
|
name++;
|
|
nameLen--;
|
|
}
|
|
|
|
// skip the '.'
|
|
name++; nameLen--;
|
|
|
|
// Check that there is a dot before first slash after domain
|
|
// to avoid junk entries like http://com/subpath/pagename.html
|
|
bool dom_valid = false;
|
|
if( nameLen > 0 )
|
|
{
|
|
int32_t dom_offset=0;
|
|
if( strncmp(name,"http://" ,7)==0 )
|
|
{
|
|
dom_offset=7;
|
|
}
|
|
else
|
|
if( strncmp(name,"https://",8)==0 )
|
|
{
|
|
dom_offset=8;
|
|
}
|
|
|
|
const char *dotpos = (const char *)memchr(name,'.',nameLen);
|
|
const char *slashpos= (const char *)memchr(name+dom_offset,'/',nameLen-dom_offset);
|
|
|
|
if( dotpos && (!slashpos || (slashpos > dotpos)) )
|
|
{
|
|
dom_valid = true;
|
|
}
|
|
}
|
|
|
|
if ( name < end3 && dom_valid ) goto loop;
|
|
|
|
|
|
|
|
// BR 20160121: Make searching for e.g. site:dk work
|
|
setStatus ( "hashing tld for site search");
|
|
const char *tld = fu->getTLD();
|
|
int32_t tldLen = fu->getTLDLen();
|
|
|
|
if( tldLen > 0 && tldLen < 64 ) {
|
|
char tldBuf[72]; // http:// (7) + tld (63) + / (1) + 0 (1)
|
|
char *p = tldBuf;
|
|
gbmemcpy ( p , "http://", 7 ); p += 7;
|
|
gbmemcpy ( p , tld, tldLen); p += tldLen;
|
|
gbmemcpy ( p , "/", 1 ); p += 1;
|
|
*p = '\0';
|
|
if ( ! hashSingleTerm (tldBuf, p - tldBuf, &hi ) ) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
const char *ext = fu->getExtension();
|
|
int32_t elen = fu->getExtensionLen();
|
|
if (!*ini || m_version <= 126) {
|
|
//
|
|
// HASH ext: term
|
|
//
|
|
// i.e. ext:gif ext:html ext:htm ext:pdf, etc.
|
|
setStatus("hashing ext colon");
|
|
// update hash parms
|
|
hi.m_prefix = "ext";
|
|
if (!hashSingleTerm(ext, elen, &hi)) return false;
|
|
}
|
|
|
|
{
|
|
setStatus("hashing gbdocid");
|
|
hi.m_prefix = "gbdocid";
|
|
char buf2[32];
|
|
sprintf(buf2, "%" PRIu64, (uint64_t)m_docId);
|
|
if (!hashSingleTerm(buf2, strlen(buf2), &hi)) return false;
|
|
}
|
|
|
|
setStatus ( "hashing SiteGetter terms");
|
|
|
|
//
|
|
// HASH terms for SiteGetter.cpp
|
|
//
|
|
// . this termId is used by SiteGetter.cpp for determining subsites
|
|
// . matches what is in SiteGet::getSiteList()
|
|
// for www.xyz.com/a/ HASH www.xyz.com
|
|
// for www.xyz.com/a/b/ HASH www.xyz.com/a/
|
|
// for www.xyz.com/a/b/c/ HASH www.xyz.com/a/b/
|
|
bool add = true;
|
|
// we only hash this for urls that end in '/'
|
|
if ( s[slen-1] != '/' ) add = false;
|
|
// and no cgi
|
|
if ( fu->isCgi() ) add = false;
|
|
// skip if root
|
|
if ( fu->getPathLen() <= 1 ) add = false;
|
|
// sanity check
|
|
if ( ! m_linksValid ) { g_process.shutdownAbort(true); }
|
|
// . skip if we have no subdirectory outlinks
|
|
// . that way we do not confuse all the pages in dictionary.com or
|
|
// wikipedia.org as subsites!!
|
|
if ( ! m_links.hasSubdirOutlink() ) add = false;
|
|
|
|
const char *host = fu->getHost();
|
|
int32_t hlen = fu->getHostLen ();
|
|
|
|
// tags from here out
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_shardByTermId = true;
|
|
// hash it
|
|
if ( add ) {
|
|
// remove the last path component
|
|
const char *end2 = s + slen - 2;
|
|
// back up over last component
|
|
for ( ; end2 > fu->getPath() && *end2 != '/' ; end2-- ) ;
|
|
// hash that part of the url
|
|
hi.m_prefix = "siteterm";
|
|
if ( ! hashSingleTerm ( host,end2-host,&hi) ) return false;
|
|
}
|
|
hi.m_shardByTermId = false;
|
|
|
|
setStatus ( "hashing urlhashdiv10 etc");
|
|
|
|
//
|
|
// HASH urlhash: urlhashdiv10: urlhashdiv100: terms
|
|
//
|
|
// this is for proving how many docs are in the index
|
|
char buf[20];
|
|
int32_t blen;
|
|
|
|
uint32_t h = hash32 ( s , slen );
|
|
blen = sprintf(buf,"%" PRIu32,h);
|
|
hi.m_prefix = "urlhash";
|
|
if ( ! hashString(buf,blen,&hi) ) return false;
|
|
|
|
// don't index mid domain or url path for noindex document
|
|
if (*ini && m_version > 126) {
|
|
return true;
|
|
}
|
|
|
|
if (size_utf8Content - 1 > 0 || m_indexCode == EDOCDISALLOWEDROOT) {
|
|
setStatus("hashing url mid domain");
|
|
|
|
// update parms
|
|
hi.m_prefix = NULL;
|
|
hi.m_desc = "middle domain";
|
|
hi.m_hashGroup = HASHGROUP_INURL;
|
|
hi.m_filterUrlIndexableWords = true; // Skip com, http etc.
|
|
if (!hashString(host, hlen, &hi)) {
|
|
return false;
|
|
}
|
|
if(sb_punyDecodedHost.length()>1) {
|
|
if(!hashString(sb_punyDecodedHost.getBufStart(),sb_punyDecodedHost.length(), &hi))
|
|
return false;
|
|
}
|
|
|
|
hi.m_filterUrlIndexableWords = false;
|
|
if (!hashSingleTerm(fu->getDomain(), fu->getDomainLen(), &hi)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
if (size_utf8Content - 1 > 0) {
|
|
setStatus("hashing url path");
|
|
const char *path = fu->getPath();
|
|
int32_t plen = fu->getPathLen();
|
|
|
|
// BR 20160113: Do not hash and combine the page filename extension with the page name (skip e.g. .com)
|
|
if (elen > 0) {
|
|
elen++; // also skip the dot
|
|
}
|
|
plen -= elen;
|
|
|
|
// BR 20160113: Do not hash the most common page names
|
|
if (strncmp(path, "/index", plen) != 0) {
|
|
// hash the path
|
|
// BR 20160114: Exclude numbers in paths (usually dates)
|
|
hi.m_hashGroup = HASHGROUP_INURL;
|
|
hi.m_hashNumbers = false;
|
|
if (!hashString(path, plen, &hi)) return false;
|
|
}
|
|
}
|
|
|
|
//actually index the middle domain. The above hashing of the filtered host and the single-term domain was in the original code, so its "middle domain" description was always misleading
|
|
{
|
|
setStatus("hashing url mid domain");
|
|
hi.m_prefix = NULL;
|
|
hi.m_desc = "middle domain(2)";
|
|
hi.m_hashGroup = HASHGROUP_MIDDOMAIN;
|
|
hi.m_filterUrlIndexableWords = false;
|
|
const char *mdom = fu->getMidDomain();
|
|
int32_t mdomlen = fu->getMidDomainLen();
|
|
if (!hashString(mdom, mdomlen, &hi)) {
|
|
return false;
|
|
}
|
|
if(fu->isPunycodeSafeTld() && fu->hasPunycode()) {
|
|
SafeBuf sb_punyDecodedMidDomain;
|
|
if(fu->getPunycodeDecodedMidDomain(&sb_punyDecodedMidDomain)) {
|
|
if (!hashString(sb_punyDecodedMidDomain.getBufStart(), sb_punyDecodedMidDomain.length(), &hi))
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// . returns false and sets g_errno on error
|
|
bool XmlDoc::hashIncomingLinkText(HashTableX *tt) {
|
|
|
|
setStatus ( "hashing link text" );
|
|
|
|
// sanity
|
|
if ( ! m_linkInfo1Valid ) { g_process.shutdownAbort(true); }
|
|
|
|
// . finally hash in the linkText terms from the LinkInfo
|
|
// . the LinkInfo class has all the terms of hashed anchor text for us
|
|
// . if we're using an old TitleRec linkTermList is just a ptr to
|
|
// somewhere in TitleRec
|
|
// . otherwise, we generated it from merging a bunch of LinkInfos
|
|
// and storing them in this new TitleRec
|
|
LinkInfo *linkInfo = getLinkInfo1();
|
|
|
|
// sanity checks
|
|
if ( ! m_ipValid ) { g_process.shutdownAbort(true); }
|
|
if ( ! m_siteNumInlinksValid ) { g_process.shutdownAbort(true); }
|
|
|
|
//
|
|
// brought the following code in from LinkInfo.cpp
|
|
//
|
|
|
|
// count "external" inlinkers
|
|
int32_t ecount = 0;
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
// hashstring should update this like a cursor.
|
|
hi.m_startDist = 0;
|
|
|
|
// loop through the link texts and hash them
|
|
for ( Inlink *k = NULL; linkInfo && (k = linkInfo->getNextInlink(k)) ; ) {
|
|
// is this inlinker internal?
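// (same low 16 bits of the IP, i.e. presumably the same /16 given network-byte-order storage)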
|
|
bool internal=((m_ip&0x0000ffff)==(k->m_ip&0x0000ffff));
|
|
// count external inlinks we have for indexing gbmininlinks:
|
|
if ( ! internal ) ecount++;
|
|
|
|
// get length of link text
|
|
int32_t tlen = k->size_linkText;
|
|
if ( tlen > 0 ) tlen--;
|
|
// get the text
|
|
const char *txt = k->getLinkText();
|
|
// sanity check
|
|
if ( ! verifyUtf8 ( txt , tlen ) ) {
|
|
log("xmldoc: bad link text 2 from url=%s for %s",
|
|
k->getUrl(),m_firstUrl.getUrl());
|
|
continue;
|
|
}
|
|
|
|
if ( internal ) hi.m_hashGroup = HASHGROUP_INTERNALINLINKTEXT;
|
|
else hi.m_hashGroup = HASHGROUP_INLINKTEXT;
|
|
// store the siterank of the linker in this and use that
|
|
// to set the multiplier M bits i guess
|
|
hi.m_linkerSiteRank = k->m_siteRank;
|
|
if(hi.m_linkerSiteRank>MAXSITERANK) {
|
|
log(LOG_INFO,"Inlink had siteRank>max (%d), probably from docid %ld", k->m_siteRank, k->m_docId);
|
|
hi.m_linkerSiteRank = MAXSITERANK;
|
|
}
|
|
// now record this so we can match the link text to
|
|
// a matched offsite inlink text term in the scoring info
|
|
k->m_wordPosStart = m_dist; // hi.m_startDist;
|
|
// . hash the link text into the table
|
|
// . returns false and sets g_errno on error
|
|
// . we still have the score punish from # of words though!
|
|
// . for inlink texts that are the same it should accumulate
|
|
// and use the reserved bits as a multiplier i guess...
|
|
if ( ! hashString4(txt,tlen,&hi) ) return false;
|
|
// now record this so we can match the link text to
|
|
// a matched offsite inlink text term in the scoring info
|
|
//k->m_wordPosEnd = hi.m_startDist;
|
|
// spread it out
|
|
hi.m_startDist += 20;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// . returns false and sets g_errno on error
|
|
bool XmlDoc::hashNeighborhoods ( HashTableX *tt ) {
|
|
setStatus ( "hashing neighborhoods" );
|
|
|
|
// . now we also hash the neighborhood text of each inlink, that is,
|
|
// the text surrounding the inlink text.
|
|
// . this is also destructive in that it will remove termids that
|
|
// were not in the document being linked to in order to save
|
|
// space in the titleRec
|
|
// . now we only do one or the other, not both
|
|
LinkInfo *linkInfo = getLinkInfo1();
|
|
if(!linkInfo)
|
|
return true;
|
|
|
|
// loop over all the Inlinks
|
|
for(Inlink *k = linkInfo->getNextInlink(NULL); k; k = linkInfo->getNextInlink(k)) {
|
|
// skip if internal, they often have the same neighborhood text
|
|
if((k->m_ip&0x0000ffff)==(m_ip&0x0000ffff))
|
|
continue;
|
|
|
|
// get the left and right texts and hash both
|
|
const char *s = k->getSurroundingText();
|
|
if(!s || k->size_surroundingText <= 1)
|
|
continue;
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "surrounding text";
|
|
hi.m_hashGroup = HASHGROUP_NEIGHBORHOOD;
|
|
|
|
// . hash that
|
|
// . this returns false and sets g_errno on error
|
|
int32_t len = k->size_surroundingText - 1;
|
|
if(!hashString(s, len, &hi))
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// . we now do the title hashing here for newer titlerecs, version 80+, rather
|
|
// than use the <index> block in the ruleset for titles.
|
|
// . this is not to be confused with hashing the title: terms which still
|
|
// does have an <index> block in the ruleset.
|
|
// . the new Weights class hashes title as part of body now with a high weight
|
|
// given by "titleWeight" parm
|
|
bool XmlDoc::hashTitle ( HashTableX *tt ) {
|
|
// sanity check
|
|
if ( m_hashedTitle ) { g_process.shutdownAbort(true); }
|
|
|
|
setStatus ( "hashing title" );
|
|
|
|
// this has been called, note it
|
|
m_hashedTitle = true;
|
|
|
|
//getXml()->getUtf8Content() results in the HTML being mostly decoded, but &lt;/&gt;/&amp; are still escaped.
|
|
//So get the title text from m_xml, retokenize it, and then index that
|
|
int rawTitleLen;
|
|
const char *rawTitle = m_xml.getString("title",&rawTitleLen);
|
|
if(!rawTitle) {
|
|
//no title - nothing to do
|
|
return true;
|
|
}
|
|
|
|
//The amp/lt/gt are still there so decode them once again to get rid of them.
|
|
//Due to bad webmasters there can be double-encoded entities in the title. Technically it is
|
|
//their error but we can make some repairs on those pages.
|
|
const char *title = rawTitle;
|
|
int32_t titleLen = rawTitleLen;
|
|
StackBuf<1024> doubleDecodedContent;
|
|
possiblyDecodeHtmlEntitiesAgain(&title, &titleLen, &doubleDecodedContent, false);
|
|
|
|
//get language and country if known, so tokenizer phase 2 can do its magic
|
|
lang_t lang_id;
|
|
const char *countryCode;
|
|
getLanguageAndCountry(&lang_id,&countryCode);
|
|
|
|
TokenizerResult tr;
|
|
plain_tokenizer_phase_1(title,titleLen,&tr);
|
|
plain_tokenizer_phase_2(lang_id, countryCode, &tr);
|
|
calculate_tokens_hashes(&tr);
|
|
sortTokenizerResult(&tr);
|
|
|
|
Bits bits;
|
|
if(!bits.set(&tr))
|
|
return false;
|
|
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_hashGroup = HASHGROUP_TITLE;
|
|
|
|
// hash with title: prefix
|
|
hi.m_prefix = "title";
|
|
if(!hashWords3(&hi, &tr, NULL, &bits, NULL, NULL, NULL, m_wts, &m_wbuf))
|
|
return false;
|
|
// hash without title: prefix
|
|
hi.m_prefix = NULL;
|
|
if(!hashWords3(&hi, &tr, NULL, &bits, NULL, NULL, NULL, m_wts, &m_wbuf))
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
// . we now do the title hashing here for newer titlerecs, version 80+, rather
|
|
// than use the <index> block in the ruleset for titles.
|
|
// . this is not to be confused with hashing the title: terms which still
|
|
// does have an <index> block in the ruleset.
|
|
bool XmlDoc::hashBody2 ( HashTableX *tt ) {
|
|
|
|
// do not index ANY of the body if it is NOT a permalink and
|
|
// "menu elimination" technology is enabled.
|
|
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
|
|
|
|
setStatus ( "hashing body" );
|
|
|
|
// record this
|
|
m_bodyStartPos = m_dist;
|
|
m_bodyStartPosValid = true;
|
|
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "body";
|
|
hi.m_hashGroup = HASHGROUP_BODY;
|
|
|
|
// use NULL for the prefix
|
|
return hashWords (&hi );
|
|
}
|
|
|
|
bool XmlDoc::hashMetaKeywords ( HashTableX *tt ) {
|
|
|
|
// do not index meta tags if "menu elimination" technology is enabled.
|
|
//if ( m_eliminateMenus ) return true;
|
|
|
|
setStatus ( "hashing meta keywords" );
|
|
|
|
// hash the meta keywords tag
|
|
//char buf [ 2048 + 2 ];
|
|
//int32_t len=m_xml.getMetaContentPointer ( buf , 2048 , "keywords" , 8 );
|
|
int32_t mklen;
|
|
const char *mk = getMetaKeywords( &mklen );
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "meta keywords";
|
|
hi.m_hashGroup = HASHGROUP_INMETATAG;
|
|
|
|
// call XmlDoc::hashString
|
|
return hashString4(mk, mklen, &hi);
|
|
}
|
|
|
|
|
|
void XmlDoc::lookupAndSetExplicitKeywords() {
|
|
std::string kw;
|
|
kw = ExplicitKeywords::lookupExplicitKeywords(m_firstUrl.getUrl());
|
|
if(kw.empty())
|
|
kw = ExplicitKeywords::lookupExplicitKeywords(m_currentUrl.getUrl());
|
|
if(!kw.empty()) {
|
|
log(LOG_DEBUG,"spider: found explicit keywords '%s' for %s", kw.c_str(),m_firstUrl.getUrl());
|
|
m_explicitKeywordsBuf.set(kw.c_str());
|
|
ptr_explicitKeywords = m_explicitKeywordsBuf.getBufStart();
|
|
size_explicitKeywords = m_explicitKeywordsBuf.length();
|
|
} else {
|
|
m_explicitKeywordsBuf.purge();
|
|
ptr_explicitKeywords = NULL;
|
|
size_explicitKeywords = 0;
|
|
}
|
|
}
|
|
|
|
bool XmlDoc::hashExplicitKeywords(HashTableX *tt) {
|
|
if(m_version<128)
|
|
return true;
|
|
setStatus("hashing explicit keywords");
|
|
|
|
if(size_explicitKeywords>0) {
|
|
log(LOG_DEBUG,"spider: hashing explicit keywords '%.*s' for %s", size_explicitKeywords, ptr_explicitKeywords, m_firstUrl.getUrl());
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "explicit keywords";
|
|
hi.m_hashGroup = HASHGROUP_EXPLICIT_KEYWORDS;
|
|
return hashString4(ptr_explicitKeywords, size_explicitKeywords, &hi);
|
|
} else
|
|
return true; //nothing done - no error
|
|
}
|
|
|
|
|
|
// . hash the meta summary, description and keyword tags
|
|
// . we now do the title hashing here for newer titlerecs, version 80+, rather
|
|
// than use the <index> block in the ruleset for titles.
|
|
bool XmlDoc::hashMetaSummary ( HashTableX *tt ) {
|
|
|
|
// sanity check
|
|
if ( m_hashedMetas ) { g_process.shutdownAbort(true); }
|
|
|
|
// this has been called, note it
|
|
m_hashedMetas = true;
|
|
|
|
// do not index meta tags if "menu elimination" technology is enabled.
|
|
//if ( m_eliminateMenus ) return true;
|
|
|
|
setStatus ( "hashing meta summary" );
|
|
|
|
StackBuf<1024> doubleDecodedContent;
|
|
|
|
// hash the meta keywords tag
|
|
//char buf [ 2048 + 2 ];
|
|
//int32_t len = m_xml.getMetaContent ( buf , 2048 , "summary" , 7 );
|
|
int32_t mslen;
|
|
const char *ms = getMetaSummary ( &mslen );
|
|
possiblyDecodeHtmlEntitiesAgain(&ms, &mslen, &doubleDecodedContent, true);
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_hashGroup = HASHGROUP_INMETATAG;
|
|
|
|
// update hashing parms
|
|
hi.m_desc = "meta summary";
|
|
// hash it
|
|
if(!hashString4(ms,mslen,&hi))
|
|
return false;
|
|
|
|
|
|
//len = m_xml.getMetaContent ( buf , 2048 , "description" , 11 );
|
|
int32_t mdlen;
|
|
const char *md = getMetaDescription ( &mdlen );
|
|
possiblyDecodeHtmlEntitiesAgain(&md, &mdlen, &doubleDecodedContent, true);
|
|
|
|
// update hashing parms
|
|
hi.m_desc = "meta desc";
|
|
// . TODO: only hash if unique????? set a flag on ht then i guess
|
|
if(!hashString4(md,mdlen, &hi))
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
bool XmlDoc::hashMetaGeoPlacename( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing meta geo.placename" );
|
|
|
|
int32_t mgplen;
|
|
const char *mgp = getMetaGeoPlacename( &mgplen );
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_desc = "meta geo.placename";
|
|
hi.m_hashGroup = HASHGROUP_INMETATAG;
|
|
|
|
// call XmlDoc::hashString
|
|
return hashString4(mgp, mgplen, &hi);
|
|
}
|
|
|
|
|
|
|
|
|
|
bool XmlDoc::hashLanguage ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing language" );
|
|
|
|
int32_t langId = (int32_t)*getLangId();
|
|
|
|
char s[32]; // numeric langid
|
|
int32_t slen = sprintf(s, "%" PRId32, langId );
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_prefix = "gblang";
|
|
|
|
if ( ! hashString ( s, slen, &hi ) ) return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::hashLanguageString ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing language string" );
|
|
|
|
int32_t langId = (int32_t)*getLangId();
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_prefix = "gblang";
|
|
|
|
// try lang abbreviation
|
|
char s[32];
|
|
int32_t slen = sprintf(s , "%s ", getLanguageAbbr(langId) );
|
|
// go back to broken way to try to fix parsing consistency bug
|
|
if ( ! hashString ( s, slen, &hi ) ) return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::hashCountry ( HashTableX *tt ) {
|
|
|
|
setStatus ( "hashing country" );
|
|
|
|
uint16_t *cid = getCountryId();
|
|
if ( ! cid || cid == (uint16_t *)-1 ) return false;
|
|
|
|
// update hash parms
|
|
HashInfo hi;
|
|
hi.m_tt = tt;
|
|
hi.m_hashGroup = HASHGROUP_INTAG;
|
|
hi.m_prefix = "gbcountry";
|
|
|
|
for ( int32_t i = 0 ; i < 1 ; i++ ) {
|
|
// convert it
|
|
char buf[32];
|
|
int32_t blen = sprintf(buf,"%s", g_countryCode.getAbbr(*cid) );
|
|
// hash it
|
|
if ( ! hashString ( buf, blen, &hi ) ) return false;
|
|
}
|
|
// all done
|
|
return true;
|
|
}
|
|
|
|
|
|
bool XmlDoc::hashLemmas(HashTableX *table) {
|
|
setStatus("hashing lemmas"); //Not llamas
|
|
logTrace(g_conf.m_logTraceTokenIndexing,"lemma_words.size()=%zu", lemma_words.size());
|
|
HashInfo hi; //storeTerm wants a HashInfo instance.
|
|
|
|
if(m_dist > MAXWORDPOS) {
|
|
log(LOG_INFO,"hashLemmas(): wordpos limit hit in document %.*s", m_firstUrl.getUrlLen(), m_firstUrl.getUrl());
|
|
return true;
|
|
}
|
|
|
|
for(const auto &e : lemma_words) {
|
|
uint64_t h = hash64Lower_utf8(e.data(),e.length());
|
|
logTrace(g_conf.m_logTraceTokenIndexing,"Indexing lemma '%s', h=%ld, termid=%lld", e.c_str(), h, h&TERMID_MASK);
|
|
key144_t k;
|
|
Posdb::makeKey(&k,
|
|
h,
|
|
0LL,//docid
|
|
m_dist,
|
|
0,// densityRank , // 0-15
|
|
0, //diversityrank
|
|
0, //wordspamrank
|
|
0, // siterank
|
|
HASHGROUP_LEMMA,
|
|
m_langId, // we set to docLang final hash loop
|
|
0, // multiplier
|
|
false, // syn?
|
|
false, // delkey?
|
|
false); //shardByTermId
|
|
if(!table->addTerm144(&k)) return false;
|
|
|
|
if(m_wts) {
|
|
// add to wts for PageParser.cpp display
|
|
if(!storeTerm(e.data(),e.length(),
|
|
h, &hi,
|
|
0, //word index. We could keep track of the first word that generated this base form. But we don't.
|
|
m_dist, // wordPos
|
|
0,// densityRank , // 0-15
|
|
0, //diversityrank
|
|
0, //wordspamrank
|
|
HASHGROUP_LEMMA,
|
|
&m_wbuf,
|
|
m_wts,
|
|
SOURCE_NONE, // synsrc
|
|
m_langId,
|
|
k))
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
|
|
void XmlDoc::sortTokenizerResult(TokenizerResult *tr) {
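// Order tokens by start position, ties broken by end position, so downstream
// hashing walks them in document order (presumably needed because phase-2
// tokenization can append tokens out of sequence).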
|
|
std::sort(tr->tokens.begin(), tr->tokens.end(), [](const TokenRange&tr0, const TokenRange &tr1) {
|
|
return tr0.start_pos < tr1.start_pos ||
|
|
(tr0.start_pos == tr1.start_pos && tr0.end_pos<tr1.end_pos);
|
|
});
|
|
}
|
|
|
|
void XmlDoc::getLanguageAndCountry(lang_t *lang, const char **country_code) {
|
|
//get language and country if known, so tokenizer phase 2 can do its magic
|
|
uint8_t *tmpLangId = getLangId();
|
|
if(tmpLangId!=NULL && tmpLangId!=(uint8_t*)-1)
|
|
*lang = (lang_t)*tmpLangId;
|
|
else
|
|
*lang = langUnknown;
|
|
|
|
uint16_t *countryId = getCountryId();
|
|
if(countryId!=NULL && countryId!=(uint16_t*)-1)
|
|
*country_code = g_countryCode.getAbbr(*countryId);
|
|
else
|
|
*country_code = NULL;
|
|
}
|
|
|
|
bool XmlDoc::hashSingleTerm( const char *s, int32_t slen, HashInfo *hi ) {
|
|
// empty?
|
|
if ( slen <= 0 ) return true;
|
|
if ( ! m_versionValid ) { g_process.shutdownAbort(true); }
|
|
if ( hi->m_useCountTable && ! m_countTableValid){g_process.shutdownAbort(true); }
|
|
|
|
// a single blob hash
|
|
int64_t termId = hash64 ( s , slen );
|
|
// combine with prefix
|
|
int64_t final = termId;
|
|
// combine with a non-NULL prefix
|
|
int64_t prefixHash = 0LL;
|
|
if ( hi->m_prefix ) {
|
|
prefixHash = hash64b ( hi->m_prefix );
|
|
final = hash64 ( termId , prefixHash );
|
|
}
|
|
// call the other guy now
|
|
//return hashSingleTerm ( final , hi );
|
|
|
|
|
|
// shortcut
|
|
HashTableX *dt = hi->m_tt;
|
|
// sanity check
|
|
if ( dt->getKeySize() != sizeof(key144_t) ) { g_process.shutdownAbort(true); }
|
|
// make the key like we do in hashWords()
|
|
|
|
|
|
key144_t k;
|
|
Posdb::makeKey ( &k ,
|
|
final,
|
|
0LL, // docid
|
|
0, // dist
|
|
MAXDENSITYRANK, // density rank
|
|
MAXDIVERSITYRANK, // diversity rank
|
|
MAXWORDSPAMRANK, // wordspamrank
|
|
0, // siterank
|
|
hi->m_hashGroup,
|
|
// we set to docLang in final hash loop
|
|
langUnknown,// langid
|
|
0, // multiplier
|
|
0, // syn?
|
|
false , // delkey?
|
|
hi->m_shardByTermId );
|
|
|
|
// . otherwise, add a new slot
|
|
// . key should NEVER collide since we are always
|
|
// incrementing the distance cursor, m_dist
|
|
if ( ! dt->addTerm144 ( &k ) ) return false;
|
|
|
|
// add to wts for PageParser.cpp display
|
|
if ( m_wts && ! storeTerm ( s,slen,final,hi,
|
|
0, // wordnum
|
|
0, // wordPos,
|
|
MAXDENSITYRANK,
|
|
MAXDIVERSITYRANK,
|
|
MAXWORDSPAMRANK,
|
|
hi->m_hashGroup,
|
|
//false,
|
|
&m_wbuf,
|
|
m_wts,
|
|
SOURCE_NONE, // synsrc
|
|
langUnknown,
|
|
k) )
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
bool XmlDoc::hashString( const char *s, int32_t slen, HashInfo *hi ) {
|
|
if ( ! m_versionValid ) { g_process.shutdownAbort(true); }
|
|
|
|
if ( hi->m_useCountTable && ! m_countTableValid){g_process.shutdownAbort(true); }
|
|
|
|
if ( ! m_siteNumInlinksValid ) { g_process.shutdownAbort(true); }
|
|
|
|
return hashString3( s ,
|
|
slen ,
|
|
hi ,
|
|
m_wts ,
|
|
&m_wbuf );
|
|
}
|
|
|
|
bool XmlDoc::hashString(size_t begin_token, size_t end_token, HashInfo *hi) {
|
|
if(!m_versionValid)
|
|
gbshutdownLogicError();
|
|
return hashString3(begin_token, end_token, hi,
|
|
m_wts,
|
|
&m_wbuf);
|
|
}


bool XmlDoc::hashString3( const char *s ,
			  int32_t slen ,
			  HashInfo *hi ,
			  HashTableX *wts ,
			  SafeBuf *wbuf) {
	TokenizerResult tr;
	Bits bits;

	plain_tokenizer_phase_1(s,slen,&tr);
	calculate_tokens_hashes(&tr);
	if ( !bits.set(&tr))
		return false;

	// use primary langid of doc
	if ( ! m_langIdValid ) { g_process.shutdownAbort(true); }

	return hashWords3( hi, &tr, NULL, &bits, NULL, NULL, NULL, wts, wbuf );
}


bool XmlDoc::hashString3(size_t begin_token, size_t end_token, HashInfo *hi,
			 HashTableX *wts, SafeBuf *wbuf)
{
	Bits bits;

	if ( !bits.set(&m_tokenizerResult))
		return false;

	return hashWords3( hi, &m_tokenizerResult, begin_token, end_token, NULL, &bits, NULL, NULL, NULL, wts, wbuf );
}


bool XmlDoc::hashString4(const char *s, int32_t slen, HashInfo *hi) {
	TokenizerResult tr;
	Bits bits;
	lang_t lang_id;
	const char *countryCode;

	getLanguageAndCountry(&lang_id,&countryCode);
	plain_tokenizer_phase_1(s,slen,&tr);
	plain_tokenizer_phase_2(lang_id,countryCode,&tr);
	calculate_tokens_hashes(&tr);
	sortTokenizerResult(&tr);
	if(!bits.set(&tr))
		return false;

	return hashWords3( hi, &tr, NULL, &bits, NULL, NULL, NULL, m_wts, &m_wbuf );
}
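
// hashString4() differs from hashString3() in that it runs the full tokenizer
// pipeline: phase 2 is language- and country-aware and the token list is sorted
// (sortTokenizerResult) before hashing, whereas hashString3() only runs phase 1
// and the token-hash calculation.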


bool XmlDoc::hashWords ( HashInfo *hi ) {
	// sanity checks
	if ( ! m_tokenizerResultValid ) { g_process.shutdownAbort(true); }
	if ( ! m_tokenizerResultValid2 ) { g_process.shutdownAbort(true); }
	//if ( hi->m_useCountTable &&!m_countTableValid){g_process.shutdownAbort(true); }
	if ( ! m_bitsValid ) { g_process.shutdownAbort(true); }
	if ( ! m_sectionsValid) { g_process.shutdownAbort(true); }
	//if ( ! m_synonymsValid) { g_process.shutdownAbort(true); }
	if ( ! m_fragBufValid ) { g_process.shutdownAbort(true); }
	if ( ! m_wordSpamBufValid ) { g_process.shutdownAbort(true); }
	if ( m_wts && ! m_langVectorValid ) { g_process.shutdownAbort(true); }
	if ( ! m_langIdValid ) { g_process.shutdownAbort(true); }

	// . is the word repeated in a pattern?
	// . this should only be used for the document body; for meta tags,
	//   inlink text, etc. we should make sure words are unique
	char *wordSpamVec = getWordSpamVec();
	char *fragVec = m_fragBuf.getBufStart();
	char *langVec = m_langVec.getBufStart();

	return hashWords3(hi, &m_tokenizerResult, &m_sections, &m_bits, fragVec, wordSpamVec, langVec, m_wts, &m_wbuf);
}
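
// hashWords() is the body-hashing entry point: unlike the hashString*() wrappers
// above, it passes the per-token vectors (repeated-fragment, word-spam and language
// vectors) plus the Sections, all of which the wrappers leave as NULL.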


// . this now uses posdb exclusively
bool XmlDoc::hashWords3(HashInfo *hi, const TokenizerResult *tr,
			Sections *sections, const Bits *bits,
			const char *fragVec, const char *wordSpamVec, const char *langVec,
			HashTableX *wts, SafeBuf *wbuf)
{
	return hashWords3(hi,tr, 0,tr->size(), sections, bits, fragVec, wordSpamVec, langVec, wts, wbuf);
}


bool XmlDoc::hashWords3(HashInfo *hi, const TokenizerResult *tr, size_t begin_token, size_t end_token,
			Sections *sections, const Bits *bits,
			const char *fragVec, const char *wordSpamVec, const char *langVec,
			HashTableX *wts, SafeBuf *wbuf)
{
	// for getSpiderStatusDocMetaList() we don't use sections; they'd mess us up
	if ( ! hi->m_useSections ) sections = NULL;

	HashTableX *dt = hi->m_tt;
	std::unordered_set<std::string> candidate_lemma_words;

	// . sanity checks
	// . posdb just uses the full keys with docid
	if ( dt->getKeySize() != 18 ) { g_process.shutdownAbort(true); }
	if ( dt->getDataSize() != 4 ) { g_process.shutdownAbort(true); }

	// if provided...
	if ( wts ) {
		if ( wts->getKeySize() != 12 ) { g_process.shutdownAbort(true); }
		if ( wts->getDataSize() != sizeof(TermDebugInfo)){g_process.shutdownAbort(true); }
		if ( ! wts->isAllowDups() ) { g_process.shutdownAbort(true); }
	}

	// ensure caller set the hashGroup
	if ( hi->m_hashGroup < 0 ) { g_process.shutdownAbort(true); }

	// hash in the prefix
	uint64_t prefixHash = 0LL;
	int32_t plen = 0;
	if ( hi->m_prefix ) plen = strlen ( hi->m_prefix );
	if ( hi->m_prefix && plen ) {
		// we gotta make this case insensitive, and skip spaces
		// because if it is 'focal length' we can't search
		// 'focal length:10' because that comes across as TWO terms.
		prefixHash = hash64Lower_utf8_nospaces ( hi->m_prefix , plen );
		// . sanity test, make sure it is in supported list
		// . hashing diffbot json output of course fails this so
		//   skip in that case if diffbot
	}
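
	// Example of the prefix mechanism above (illustrative values only): for a term
	// hashed with hi->m_prefix = "description", each token's termid below becomes
	// hash64 ( token.token_hash , hash64Lower_utf8_nospaces("description",11) ),
	// so the token only matches queries qualified with that field prefix.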

	bool hashIffUnique = false;
	if ( hi->m_hashGroup == HASHGROUP_INMETATAG ) hashIffUnique = true;
	if ( hi->m_hashGroup == HASHGROUP_INTAG     ) hashIffUnique = true;
	HashTableX ut; ut.set ( 8,0,0,NULL,0,false,"uqtbl");

	// The diversity rank was effectively disabled (minweight=maxweight) and the
	// algorithm was either suspect or severely limited by phrases being only 2
	// words (bigrams).
	// Currently disabled until we can investigate if it is worth fixing, worth
	// implementing in another way, or simply dropping it completely.
	//
	// Diversity rank is currently hardcoded to be 10 for individual words, and
	// maxdiversityrank for bigrams.
	SafeBuf dwbuf;
	if(!dwbuf.reserve(tr->size()*sizeof(char)))
		return false;
	memset(dwbuf.getBufStart(), MAXDIVERSITYRANK, tr->size());
#if 0
	///////
	//
	// diversity rank vector.
	//
	///////
	// the final diversity which is a multiplier
	// is converted into a rank from 0-15 i guess.
	// so 'mexico' in "new mexico" should receive a low word score but high
	// phrase score. thus, a search for 'mexico' should not bring up
	// the page for university of new mexico!
	SafeBuf dwbuf;
	if ( !getDiversityVec( tr, phrases, countTable, &dwbuf ) ) {
		return false;
	}
#endif
	char *wdv = dwbuf.getBufStart();

	size_t nw = tr->size();

	/////
	//
	// calculate density ranks
	//
	/////
	//
	// this now varies depending on the length of the sentence/header etc.
	// so if the hashGroup is not title, link text or meta tag, we have to
	// use a safebuf.
	SafeBuf densBuf;
	// returns false and sets g_errno on error
	if ( ! getDensityRanks(tr,
			       hi->m_hashGroup,
			       &densBuf,
			       sections))
		return false;
	// a handy ptr
	char *densvec = (char *)densBuf.getBufStart();

	////////////
	//
	// get word positions
	//
	///////////
	Section **sp = NULL;
	if ( sections ) sp = sections->m_sectionPtrs;

	SafeBuf wpos;
	if ( ! getWordPosVec ( tr, sections, m_dist, fragVec, &wpos) )
		return false;

	// a handy ptr
	int32_t *wposvec = (int32_t *)wpos.getBufStart();

	if(end_token>begin_token && wposvec[end_token-1]>MAXWORDPOS) {
		log(LOG_INFO,"hashWords3(): wordpos limit will be hit in document %.*s", m_firstUrl.getUrlLen(), m_firstUrl.getUrl());
	}
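
	// Note: word positions only increase; once wposvec[i] exceeds MAXWORDPOS the
	// loop below breaks out, so the tail of a document that hits the limit logged
	// above is simply not indexed.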

	bool seen_slash = false;
	for(unsigned i = begin_token; i < end_token; i++) {
		const auto &token = (*tr)[i];
		logTrace(g_conf.m_logTraceTokenIndexing,"Looking at token #%u: '%.*s', hash=%ld, nodeid=%u", i, (int)token.token_len, token.token_start, token.token_hash, token.nodeid);
		if(token.token_len==1 && token.token_start[0]=='/')
			seen_slash = true;

		if ( ! token.is_alfanum ) continue;
		// ignore if in repeated fragment
		if ( fragVec && i<MAXFRAGWORDS && fragVec[i] == 0 ) continue;
		// ignore if in style section
		if ( sp && (sp[i]->m_flags & NOINDEXFLAGS) ) continue;

		// do not breach wordpos bits
		if ( wposvec[i] > MAXWORDPOS ) break;

		// BR: 20160114 if digit, do not hash it if disabled
		if( is_digit( token.token_start[0] ) && !hi->m_hashNumbers ) {
			continue;
		}

		// . hash the startHash with the wordId for this word
		// . we must mask it before adding it to the table because
		//   this table is also used to hash IndexLists into that come
		//   from LinkInfo classes (incoming link text). And when
		//   those IndexLists are hashed they use masked termIds.
		//   So we should too...
		uint64_t h;
		if ( plen > 0 ) h = hash64 ( token.token_hash, prefixHash );
		else            h = token.token_hash;

		int32_t hashGroup = hi->m_hashGroup;

		Section *sx = NULL;
		if ( sp ) {
			sx = sp[i];
			// . this is taken care of in hashTitle()
			// . it is slightly different if the title is
			//   multiple sentences because when hashing the
			//   body the density rank is per sentence, but in
			//   hashTitle we count all the words in the title
			//   towards the density rank even if they are
			//   in different sentences
			if ( sx->m_flags & SEC_IN_TITLE ) {
				continue;
			}
			if ( sx->m_flags & SEC_IN_HEADER ) {
				hashGroup = HASHGROUP_HEADING;
			}
			if ( sx->m_flags & ( SEC_MENU | SEC_MENU_SENTENCE | SEC_MENU_HEADER ) ) {
				hashGroup = HASHGROUP_INMENU;
			}
		}

		// this is for link text and meta tags mostly
		if ( hashIffUnique ) {
			// skip if already did it
			if ( ut.isInTable ( &h ) ) continue;
			if ( ! ut.addKey ( &h ) ) return false;
		}

		char ws = 15;
		if ( wordSpamVec ) ws = wordSpamVec[i];

		// HACK:
		// if this is inlink text, use the wordspamrank to hold the
		// inlinker's site rank!
		if ( hashGroup == HASHGROUP_INLINKTEXT )
			ws = hi->m_linkerSiteRank;

		// default to the document's primary language if it is not
		// clear what language this word belongs to.
		// if the word is only in german it should be german,
		// otherwise it will be the document's primary language.
		// note: the above comment is wrong. The language is overwritten by
		// addTable144(). It is unclear if this is a good thing.
		char langId = langUnknown;
		if ( m_wts && langVec ) langId = langVec[i];

		char wd;
		if ( hi->m_useCountTable ) {
			wd = wdv[i];
		} else {
			wd = MAXDIVERSITYRANK;
		}

		bool skipword = false;
		if(hi->m_filterUrlIndexableWords) {
			if(!seen_slash) {
				// Scheme/host/domain part of the URL.
				// The http/https prefix is not indexed at all.
				if((token.token_len==4 && memcmp(token.token_start,"http",4)==0) ||
				   (token.token_len==5 && memcmp(token.token_start,"https",5)==0))
				{
					// Never include as single word or in bigrams
					continue; //skip to next word
				}
				// The terms .com .co .dk etc. have lots of hits and give very
				// little value for indexing. We only index the bigrams.
				if(isTLD(token.token_start, token.token_len)) {
					skipword = true; //skip the word but still index the bigram
				}
			} else {
				// Path part of the URL.
				// Potentially filter out "html", "aspx", "index", "cgi" etc.
			}
		}
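
		// Example of the URL filtering above (illustrative): for the tokens of
		// "https://example.com/index.html", "https" is dropped entirely, "com" is
		// skipped as a single word but still participates in the "example com"
		// bigram below, and everything after the first '/' is treated as path
		// tokens.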

		if(!skipword) {
			logTrace(g_conf.m_logTraceTokenIndexing,"Indexing '%.*s', h=%ld, termid=%lld", (int)token.token_len, token.token_start, h, h&TERMID_MASK);
			key144_t k;

			Posdb::makeKey(&k,
				       h,
				       0LL,         // docid
				       wposvec[i],  // dist
				       densvec[i],  // densityRank, 0-15
				       wd,          // diversityRank 0-15
				       ws,          // wordSpamRank 0-15
				       0,           // siterank
				       hashGroup,
				       // we set to docLang in final hash loop
				       langUnknown, // langid
				       0,           // multiplier
				       false,       // syn?
				       false,       // delkey?
				       hi->m_shardByTermId);

			// key should NEVER collide since we are always incrementing
			// the distance cursor, m_dist
			dt->addTerm144(&k);

			// add to wts for PageParser.cpp display
			if(wts) {
				if(!storeTerm(token.token_start,token.token_len,h,hi,i,
					      wposvec[i], // wordPos
					      densvec[i], // densityRank, 0-15
					      wd,         // diversityRank
					      ws,         // wordSpamRank
					      hashGroup,
					      wbuf,
					      wts,
					      SOURCE_NONE, // synsrc
					      langId,
					      k))
					return false;
			}
			if(token.is_alfanum)
				candidate_lemma_words.emplace(token.token_start,token.token_len);
		} else {
			logTrace(g_conf.m_logTraceTokenIndexing,"not indexing '%.*s', h=%ld", (int)token.token_len, token.token_start, h);
		}


		////////
		//
		// two-word phrase
		//
		////////
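
		// A bigram termid is built by continuing the first token's hash over the
		// second token's text (hash64Lower_utf8_cont below). Illustrative example:
		// for "new mexico" the bigram term is hashed from "new" continued with
		// "mexico", and it is indexed with MAXDIVERSITYRANK at the first token's
		// word position.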

		// Find the next alfanum token that starts at or after token.end_pos.
		// Also detect if we see a don't-pair-across token while scanning.
		unsigned j;
		bool generate_bigram = true;
		for(j=i+1; j<end_token; j++) {
			const auto &t2 = (*tr)[j];
			if(t2.is_alfanum && t2.start_pos>=token.end_pos)
				break;
			if(!bits->canBeInPhrase(j) && !bits->canPairAcross(j)) {
				generate_bigram = false;
				break;
			}
		}
		if(j>=end_token)
			generate_bigram = false;

		if(generate_bigram) {
			unsigned first_match_start_pos = (*tr)[j].start_pos;
			for( ; j<end_token && (*tr)[j].start_pos == first_match_start_pos; j++) {
				const auto &token2 = (*tr)[j];
				if(!token2.is_alfanum)
					continue; //ampersand-rewrites in tokenizer2.cpp can result in non-alfanum tokens that must be ignored and skipped
				int32_t pos = token.token_len;
				int64_t npid = hash64Lower_utf8_cont(token2.token_start, token2.token_len, token.token_hash, &pos);
				uint64_t ph2;

				logTrace(g_conf.m_logTraceTokenIndexing,"Indexing two-word phrase '%.*s'+'%.*s' with h=%ld, termid=%lld", (int)token.token_len, token.token_start, (int)token2.token_len, token2.token_start, npid, npid&TERMID_MASK);
				// hash with prefix
				if ( plen > 0 ) ph2 = hash64 ( npid , prefixHash );
				else            ph2 = npid;
				key144_t k;
				Posdb::makeKey ( &k ,
						 ph2 ,
						 0LL,              // docid
						 wposvec[i],       // dist
						 densvec[i],       // densityRank, 0-15
						 MAXDIVERSITYRANK, // phrase
						 ws,               // wordSpamRank
						 0,                // siterank
						 hashGroup,
						 // we set to docLang in final hash loop
						 langUnknown,      // langid
						 0 ,               // multiplier
						 false,            // syn?
						 false ,           // delkey?
						 hi->m_shardByTermId );

				// key should NEVER collide since we are always
				// incrementing the distance cursor, m_dist
				dt->addTerm144 ( &k );

				// add to wts for PageParser.cpp display
				if(wts) {
					// get phrase as a string
					size_t plen;
					char phraseBuffer[256];
					//TODO: Collect the intermediate tokens too. It is complicated because the two tokens generating
					//the bigram can be either primary or secondary tokens from the tokenizer, and there can be
					//non-alfanum tokens between them too.
					//simplification: just grab the chars from token+token2
					if(token.token_len<=sizeof(phraseBuffer)) {
						memcpy(phraseBuffer, token.token_start, token.token_len);
						plen = token.token_len;
					} else {
						memcpy(phraseBuffer, token.token_start, sizeof(phraseBuffer));
						plen = sizeof(phraseBuffer);
					}
					if(token2.token_len<=sizeof(phraseBuffer)-plen) {
						memcpy(phraseBuffer+plen, token2.token_start, token2.token_len);
						plen += token2.token_len;
					} else {
						memcpy(phraseBuffer+plen, token2.token_start, sizeof(phraseBuffer)-plen);
						plen = sizeof(phraseBuffer);
					}
					// store it
					if(!storeTerm(phraseBuffer,plen,ph2,hi,i,
						      wposvec[i],       // wordPos
						      densvec[i],       // densityRank, 0-15
						      MAXDIVERSITYRANK, // phrase
						      ws,
						      hashGroup,
						      //true,
						      wbuf,
						      wts,
						      SOURCE_BIGRAM, // synsrc
						      langId,
						      k))
						return false;
				}
			}
		} else {
			logTrace(g_conf.m_logTraceTokenIndexing,"NOT indexing two-word phrase(s)");
		}
	}

	// advance the word position cursor, m_dist, so positions keep increasing
	// between calls, i.e. hashTitle() and hashBody()
	if ( nw > 0 ) m_dist = wposvec[nw-1] + 100;

	if(m_langId==langDanish && lemma_lexicon) {
		// we only have a lexicon for Danish so far for this test
		logTrace(g_conf.m_logTraceTokenIndexing,"candidate_lemma_words.size()=%zu", candidate_lemma_words.size());
		for(auto e : candidate_lemma_words) {
			// Find the word in the lexicon and find the lemma. If the word is unknown
			// or already in its base form then don't generate a lemma entry.
			logTrace(g_conf.m_logTraceTokenIndexing,"candidate word for lemma: %s", e.c_str());
			auto le = lemma_lexicon->lookup(e);
			if(!le) {
				// Not found as-is in lexicon. Try lowercase in case it is a capitalized word.
				char lowercase_word[128];
				if(e.size()<sizeof(lowercase_word)) {
					size_t sz = to_lower_utf8(lowercase_word,lowercase_word+sizeof(lowercase_word), e.data(), e.data()+e.size());
					lowercase_word[sz] = '\0';
					if(sz!=e.size() || memcmp(e.data(),lowercase_word,e.size())!=0) {
						e = lowercase_word;
						le = lemma_lexicon->lookup(e);
					}
				}
			}
			if(!le) {
				// Still not found. Try capitalized in case it is a lowercase or uppercase word.
				char capitalized_word[128];
				if(e.size()<sizeof(capitalized_word)) {
					size_t sz = to_capitalized_utf8(capitalized_word,capitalized_word+sizeof(capitalized_word), e.data(), e.data()+e.size());
					capitalized_word[sz] = '\0';
					if(sz!=e.size() || memcmp(e.data(),capitalized_word,e.size())!=0) {
						e = capitalized_word;
						le = lemma_lexicon->lookup(e);
					}
				}
			}
			if(!le) {
				// Still not found. Try uppercasing it.
				char uppercase_word[128];
				if(e.size()<sizeof(uppercase_word)) {
					size_t sz = to_upper_utf8(uppercase_word,uppercase_word+sizeof(uppercase_word), e.data(), e.data()+e.size());
					uppercase_word[sz] = '\0';
					if(sz!=e.size() || memcmp(e.data(),uppercase_word,e.size())!=0) {
						e = uppercase_word;
						le = lemma_lexicon->lookup(e);
					}
				}
			}
			if(!le)
				continue; //unknown word
			logTrace(g_conf.m_logTraceTokenIndexing,"lexical entry found for lemma candidate: %s", e.c_str());

			auto wf = le->find_base_wordform();
			if(!wf)
				continue; //no base form
			if(wf->written_form_length==e.size() && memcmp(wf->written_form,e.data(),e.size())==0)
				continue; //already in base form
			logTrace(g_conf.m_logTraceTokenIndexing,"baseform is different than source: '%.*s'", (int)wf->written_form_length, wf->written_form);
			lemma_words.emplace(wf->written_form,wf->written_form_length);
		}
	}
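
	// Illustrative example (not from the source): for a Danish document containing
	// "husene" ("the houses"), the lexicon lookup above would yield the base form
	// "hus", which is collected in lemma_words; words already in their base form or
	// unknown to the lexicon are skipped.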

	return true;
}