//-*- coding: utf-8 -*-
#include "XmlDoc.h"
#include "CountryCode.h" // g_countryCode
#include "Collectiondb.h"
#include "Speller.h"
#include "Synonyms.h"
#include "Process.h"
#include "ip.h"
#include "Posdb.h"
#include "Conf.h"
#include "UrlBlockCheck.h"
#include "Domains.h"
#include "FxExplicitKeywords.h"
#include <algorithm>
#include "Lemma.h"
#include <unordered_set>
#include <string>
#include "Errno.h"
#include "gbmemcpy.h"
#ifdef _VALGRIND_
#include <valgrind/memcheck.h>
#endif
static void possiblyDecodeHtmlEntitiesAgain(const char **s, int32_t *len, SafeBuf *sb, bool also_remove_certain_html_elements) {
//some documents have incorrectly encoded html entities twice. Example:
//correct: <meta name="foo" content="&#66;oa">
//incorrect: <meta name="foo" content="&amp;#66;oa">
//If it seems likely that this has happened then we decode the entities again and put the result in 'sb' and update '*s' and '*len'
//Due to the (il)logic of GB the correct form is decoded, while the incorrect form is still raw, needing double decoding
//require &amp; following by a second semicolon
const char *amppos = (const char*)memmem(*s,*len, "&amp;", 5);
if((amppos && memchr(amppos+5, ';', *len-(amppos-*s)-5)!=NULL) ||
(memmem(*s,*len,"&lt;",4)!=NULL && memmem(*s,*len,"&gt;",4)!=NULL)) {
//shortest entity is 4 char (&lt;), longest utf8 encoding of a codepoint is 4 + a bit
StackBuf<1024> tmpBuf;
if(!tmpBuf.reserve(*len + *len/2 + 4))
return;
if(!sb->reserve(*len + *len/2 + 4))
return;
int32_t tmpLen = htmlDecode(tmpBuf.getBufStart(), *s,*len, false);
int32_t newlen = htmlDecode(sb->getBufStart(), tmpBuf.getBufStart(), tmpLen, false);
sb->setLength(newlen);
//Furthermore, some websites have junk in their meta tags. Eg <br> in the meta description
//We don't fix all cases as that could hurt correctly written pages about how to write proper html. But
//if they don't mention "html", "tag" nor "element" then we remove the most common offenders br/b/u/p
//When changing this function consider keeping in sync with Summary::maybeRemoveHtmlFormatting()
if(also_remove_certain_html_elements) {
if(memmem(sb->getBufStart(),sb->length(),"html",4)==0 &&
memmem(sb->getBufStart(),sb->length(),"HTML",4)==0 &&
memmem(sb->getBufStart(),sb->length(),"tag",3)==0 &&
memmem(sb->getBufStart(),sb->length(),"Tag",3)==0 &&
memmem(sb->getBufStart(),sb->length(),"element",7)==0 &&
memmem(sb->getBufStart(),sb->length(),"Element",7)==0)
{
sb->safeReplace2("<br>",4," ",1,0);
sb->safeReplace2("<b>",3,"",0,0);
sb->safeReplace2("<u>",3,"",0,0);
sb->safeReplace2("<p>",3," ",1,0);
}
}
*s = sb->getBufStart();
*len = sb->length();
}
}
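//Minimal usage sketch (mirrors the callers below, e.g. hashMetaTags() and hashTitle()):
//  StackBuf<1024> decoded;
//  possiblyDecodeHtmlEntitiesAgain(&s, &len, &decoded, true);
//  //if a second decode was performed, 's'/'len' now point into 'decoded'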
// a ptr to HashInfo is passed to hashString() and hashWords()
class HashInfo {
public:
HashInfo() {
m_tt = NULL;
m_prefix = NULL;
m_desc = NULL;
m_date = 0;
// should we do sharding based on termid and not the usual docid???
// in general this is false, but for checksum we want to shard
// by the checksum and not docid to avoid having to do a
// gbchecksum:xxxxx search on ALL shards. much more efficient.
m_shardByTermId = false;
m_hashGroup = -1;
m_useCountTable = true;
m_useSections = true;
m_startDist = 0;
m_hashNumbers = true;
m_filterUrlIndexableWords = false;
m_linkerSiteRank = 0;
}
class HashTableX *m_tt;
const char *m_prefix;
// "m_desc" should detail the algorithm
const char *m_desc;
int32_t m_date;
bool m_shardByTermId;
char m_linkerSiteRank;
char m_hashGroup;
int32_t m_startDist;
bool m_useCountTable;
bool m_useSections;
bool m_hashNumbers;
bool m_filterUrlIndexableWords; //Do special filtering on words in url, eg. exclude "com" before path
};
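//Typical setup, as used by the hash*() methods below (illustrative only):
//  HashInfo hi;
//  hi.m_tt        = tt;               // destination term table (posdb keys)
//  hi.m_hashGroup = HASHGROUP_INTAG;  // which part of the doc the term is from
//  hi.m_prefix    = "site";           // optional field prefix, e.g. site:xyz.com
//  hashString(buf, blen, &hi);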
static bool storeTerm ( const char *s ,
int32_t slen ,
int64_t termId ,
HashInfo *hi ,
int32_t wordNum ,
int32_t wordPos ,
char densityRank,
char diversityRank ,
char wordSpamRank ,
char hashGroup,
//bool isPhrase ,
SafeBuf *wbuf ,
HashTableX *wts ,
char synSrc ,
char langId ,
posdbkey_t key ) {
// store prefix
int32_t poff = wbuf->length();
// shortcut
const char *p = hi->m_prefix;
// add the prefix too!
if ( p && ! wbuf->safeMemcpy(p,strlen(p)+1)) return false;
// none?
if ( ! p ) poff = -1;
// store description
int32_t doff = wbuf->length();
// shortcut
const char *d = hi->m_desc;
// add the desc too!
if ( d && ! wbuf->safeMemcpy(d,strlen(d)+1) ) return false;
// none?
if ( ! d ) doff = -1;
// store term
int32_t toff = wbuf->length();
// add it
if ( ! wbuf->safeMemcpy ( s , slen ) ) return false;
// make this
TermDebugInfo ti;
ti.m_termOff = toff;
ti.m_termLen = slen;
ti.m_descOff = doff;
ti.m_prefixOff = poff;
ti.m_date = hi->m_date;
ti.m_shardByTermId = hi->m_shardByTermId;
ti.m_termId = termId;
//ti.m_weight = 1.0;
//ti.m_spam = -1.0;
ti.m_diversityRank = diversityRank;
ti.m_densityRank = densityRank;
ti.m_wordSpamRank = wordSpamRank;
ti.m_hashGroup = hashGroup;
ti.m_wordNum = wordNum;
ti.m_wordPos = wordPos;
ti.m_langId = langId;
ti.m_key = key;
// save for printing out an asterisk
ti.m_synSrc = synSrc; // isSynonym = isSynonym;
// get language bit vec
ti.m_langBitVec64 = g_speller.getLangBits64(termId);
// make the key
key96_t k;
k.n1 = 0; // date
k.n0 = termId;
// store it
return wts->addKey ( &k , &ti ) ;
}
//
// . hash terms that are sharded by TERMID not DOCID!!
//
// . returns false and sets g_errno on error
// . these terms are stored in indexdb, but all terms with the same
// termId reside in one and only one group. whereas normally the records
// are split based on docid and every group gets 1/nth of the termlist.
// . we do this "no splitting" so that only one disk seek is required, and
// we know the termlist is small, or the termlist is being used for spidering
// or parsing purposes and is usually not sent across the network.
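// . e.g. a gbcontenthash:/gbchecksum: lookup then only needs to contact the one
//   shard that owns that termid instead of querying every shard (see the
//   comment in HashInfo's constructor above)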
bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
// constructor should set to defaults automatically
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
// usually we shard by docid, but these are terms we shard by termid!
hi.m_shardByTermId = true;
if ((size_utf8Content - 1) > 0) {
// for exact content deduping
setStatus("hashing gbcontenthash (deduping) no-split keys");
// this should be ready to go and not block!
int64_t *pch64 = getExactContentHash64();
if (!pch64 || pch64 == (void *)-1) { g_process.shutdownAbort(true); }
char cbuf[64];
int32_t clen = sprintf(cbuf, "%" PRIu64, (uint64_t)*pch64);
hi.m_prefix = "gbcontenthash";
if (!hashString(cbuf, clen, &hi)) return false;
}
// now hash the site
setStatus ( "hashing no-split SiteGetter terms");
Url *fu = getFirstUrl();
const char *host = fu->getHost();
//
// HASH terms for SiteGetter.cpp
//
// these are now no-split terms
//
const char *s = fu->getUrl();
int32_t slen = fu->getUrlLen();
// . this termId is used by SiteGetter.cpp for determining subsites
// . matches what is in SiteGet::getSiteList()
// for www.xyz.com/a/ HASH www.xyz.com
// for www.xyz.com/a/b/ HASH www.xyz.com/a/
// for www.xyz.com/a/b/c/ HASH www.xyz.com/a/b/
bool add = true;
// we only hash this for urls that end in '/'
if ( s[slen-1] != '/' ) add = false;
// and no cgi
if ( fu->isCgi() ) add = false;
// skip if root
if ( fu->getPathLen() <= 1 ) add = false;
// sanity check
if ( ! m_linksValid ) { g_process.shutdownAbort(true); }
// . skip if we have no subdirectory outlinks
// . that way we do not confuse all the pages in dictionary.com or
// wikipedia.org as subsites!!
if ( ! m_links.hasSubdirOutlink() ) add = false;
// hash it
if ( add ) {
// remove the last path component
const char *end2 = s + slen - 2;
// back up over last component
for ( ; end2 > fu->getPath() && *end2 != '/' ; end2-- ) ;
// hash that part of the url
hi.m_prefix = "siteterm";
if ( ! hashSingleTerm ( host,end2-host,&hi) ) return false;
}
return true;
}
// . returns -1 if blocked, returns NULL and sets g_errno on error
// . "table" is the posdb term table to add to
// . when m_wts is set, terms are also stored there for PageParser.cpp display
char *XmlDoc::hashAll(HashTableX *table) {
logTrace(g_conf.m_logTraceXmlDoc, "BEGIN");
setStatus("hashing document");
if (m_allHashed) {
return (char *)1;
}
// sanity checks
if (table->getKeySize() != 18 || table->getDataSize() != 4) {
g_process.shutdownAbort(true);
}
// ptr to term = 4 + score = 4 + ptr to sec = 4
if (m_wts && (m_wts->getKeySize() != 12 || m_wts->getDataSize() != sizeof(TermDebugInfo))) {
g_process.shutdownAbort(true);
}
uint8_t *ct = getContentType();
if (!ct) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getContentType failed");
return NULL;
}
lemma_words.clear();
// BR 20160127: Never index JSON and XML content
if (*ct == CT_JSON || *ct == CT_XML) {
// For XML (JSON should not get here as it should be filtered out during spidering)
// store the URL as the only thing in posdb so we are able to find it, and
// eventually ban it.
if (!hashUrl(table, true)) { // urlOnly (skip IP and term generation)
logTrace(g_conf.m_logTraceXmlDoc, "END, hashUrl failed");
return NULL;
}
m_allHashed = true;
return (char *)1;
}
// need this for hashing
HashTableX *cnt = getCountTable();
if (!cnt) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getCountTable failed");
return (char *)cnt;
}
if (cnt == (void *)-1) {
g_process.shutdownAbort(true);
}
// and this
Links *links = getLinks();
if (!links) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getLinks failed");
return (char *)links;
}
if (links == (Links *)-1) {
g_process.shutdownAbort(true);
}
char *wordSpamVec = getWordSpamVec();
if (!wordSpamVec) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getWordSpamVec failed");
return wordSpamVec;
}
if (wordSpamVec == (void *)-1) {
g_process.shutdownAbort(true);
}
char *fragVec = getFragVec();
if (!fragVec) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getFragVec failed");
return fragVec;
}
if (fragVec == (void *)-1) {
g_process.shutdownAbort(true);
}
// why do we need this?
if ( m_wts ) {
uint8_t *lv = getLangVector();
if (!lv) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getLangVector failed");
return (char *)lv;
}
if (lv == (void *)-1) {
g_process.shutdownAbort(true);
}
}
CollectionRec *cr = getCollRec();
if ( ! cr ) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getCollRec failed");
return NULL;
}
// do not repeat this if the cachedb storage call blocks
m_allHashed = true;
// reset distance cursor
m_dist = 0;
if (!hashContentType(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashContentType failed");
return NULL;
}
if (!hashUrl(table, false)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashUrl failed");
return NULL;
}
if (!hashLanguage(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashLanguage failed");
return NULL;
}
if (!hashCountry(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashCountry failed");
return NULL;
}
// now hash the terms sharded by termid and not docid here since they
// just set a special bit in posdb key so Rebalance.cpp can work.
// this will hash the content checksum which we need for deduping
// which we use for diffbot custom crawls as well.
if (!hashNoSplit(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashNoSplit failed");
return NULL;
}
// MDW: i think we just inject empty html with a diffbotreply into
// global index now, so don't need this... 9/28/2014
// stop indexing xml docs in the global index unless this is a json
// object, in which case it is hashed above in the call to hashJSON().
// this will decrease disk usage by about half, posdb* files are pretty big.
if (!cr->m_indexBody) {
logTrace(g_conf.m_logTraceXmlDoc, "END, !indexDoc");
return (char *)1;
}
bool *ini = getIsNoIndex();
if (ini == nullptr || ini == (bool*)-1) {
// must not be blocked
gbshutdownLogicError();
}
if (*ini && m_version > 126) {
logTrace(g_conf.m_logTraceXmlDoc, "END, noindex");
return (char *)1;
}
if ((size_utf8Content - 1) <= 0) {
logTrace(g_conf.m_logTraceXmlDoc, "END, contentLen == 0");
return (char *)1;
}
// hash the body of the doc first so m_dist is 0 to match
// the rainbow display of sections
if (!hashBody2(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashBody2 failed");
return NULL;
}
// hash the title now too so neighborhood singles have more
// to match. plus, we only hash these title terms iff they
// are not already in the hash table, so as to avoid hashing
// repeated title terms because we do not do spam detection
// on them. thus, we need to hash these first before anything
// else. give them triple the body score
if (!hashTitle(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashTitle failed");
return NULL;
}
// . hash the keywords tag, limited to first 2k of them so far
// . hash above the neighborhoods so the neighborhoods only index
// what is already in the hash table
if (!hashMetaKeywords(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaKeywords failed");
return NULL;
}
//Hash explicit keywords, if any
if(!hashExplicitKeywords(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashExplicityKeywords failed");
return NULL;
}
// then hash the incoming link text, NO ANOMALIES, because
// we index the single words in the neighborhoods next, and
// we had songfacts.com coming up for the 'street light facts'
// query because it had a bunch of anomalous inlink text.
if (!hashIncomingLinkText(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashIncomingLinkText failed");
return NULL;
}
// then the meta summary and description tags with half the score of
// the body, and only hash a term if was not already hashed above
// somewhere.
if (!hashMetaSummary(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaSummary failed");
return NULL;
}
// BR 20160220
// Store value of meta tag "geo.placename" to help aid searches for
// location specific sites, e.g. 'Restaurant in London'
if (!hashMetaGeoPlacename(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaGeoPlacename failed");
return NULL;
}
// this will only increment the scores of terms already in the table
// because the neighborhoods are not technically part of the document
// and we do not want to ruin our precision
if (!hashNeighborhoods(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashNeighborhoods failed");
return NULL;
}
if (!hashLinks(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashLinks failed");
return NULL;
}
if (!hashMetaTags(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaTags failed");
return NULL;
}
// hash gblang:de last for parsing consistency
if (!hashLanguageString(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashLanguageString failed");
return NULL;
}
if(!hashLemmas(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashLemmas failed");
return NULL;
}
lemma_words.clear(); //release memory early
logTrace(g_conf.m_logTraceXmlDoc, "END, OK");
return (char *)1;
}
// returns false and sets g_errno on error
bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
setStatus ( "hashing meta tags" );
int32_t n = m_xml.getNumNodes();
XmlNode *nodes = m_xml.getNodes();
// set up the hashing parms
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INMETATAG;
hi.m_tt = tt;
hi.m_desc = "custom meta tag";
// find the first meta summary node
for ( int32_t i = 0 ; i < n ; i++ ) {
//we are only interested in meta tags
if(nodes[i].m_nodeId != TAG_META)
continue;
// only get content for <meta name=..> not <meta http-equiv=..>
int32_t tagLen;
const char *tag = m_xml.getString(i, "name", &tagLen);
// skip if error/empty
if ( ! tag || tagLen <= 0 ) continue;
// these meta names are reserved:
// do not hash keyword, keywords, description, or summary metas
// here because those are hashed separately (see hashMetaKeywords()
// and hashMetaSummary() below)
if ((tagLen== 7&&strncasecmp(tag,"keyword" , 7)== 0)||
(tagLen== 7&&strncasecmp(tag,"summary" , 7)== 0)||
(tagLen== 8&&strncasecmp(tag,"keywords" , 8)== 0)||
(tagLen==11&&strncasecmp(tag,"description",11)== 0) )
continue;
// BR 20160107: Only hash certain custom meta tags and ignore the rest
if(
(strncasecmp(tag,"subject", 7) != 0) &&
(strncasecmp(tag,"abstract", 8) != 0) &&
(strncasecmp(tag,"news_keywords", 13) != 0) && // http://www.metatags.org/meta_name_news_keywords
(strncasecmp(tag,"author", 6) != 0) &&
(strncasecmp(tag,"title", 5) != 0) &&
(strncasecmp(tag,"og:title", 8) != 0) &&
(strncasecmp(tag,"og:description", 14) != 0) &&
(strncasecmp(tag,"twitter:title", 13) != 0) &&
(strncasecmp(tag,"twitter:description", 19) != 0) )
{
// If none of the above, it is an unwanted meta tag
continue;
}
// get the content
int32_t len;
const char *s = m_xml.getString ( i , "content" , &len );
if ( ! s || len <= 0 ) continue;
StackBuf<1024> doubleDecodedContent;
possiblyDecodeHtmlEntitiesAgain(&s, &len, &doubleDecodedContent, true);
// Now index the wanted meta tags as normal text without prefix so they
// are used in user searches automatically.
hi.m_prefix = NULL;
bool status = hashString4(s,len,&hi);
// bail on error, g_errno should be set
if ( ! status ) return false;
}
return true;
}
// returns false and sets g_errno on error
bool XmlDoc::hashContentType ( HashTableX *tt ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return false;
uint8_t *ctype = getContentType();
if( !ctype ) {
return false;
}
const char *s = NULL;
setStatus ( "hashing content type" );
// hash numerically so we can do gbfacetint:type on it
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
hi.m_prefix = "type";
char tmp[6];
sprintf(tmp,"%" PRIu32,(uint32_t)*ctype);
if ( ! hashString (tmp,strlen(tmp),&hi ) ) return false;
// these ctypes are defined in HttpMime.h
switch (*ctype) {
case CT_HTML: s = "html"; break;
case CT_TEXT: s = "text"; break;
case CT_XML : s = "xml" ; break;
case CT_PDF : s = "pdf" ; break;
case CT_DOC : s = "doc" ; break;
case CT_XLS : s = "xls" ; break;
case CT_PPT : s = "ppt" ; break;
case CT_PS : s = "ps" ; break;
// for diffbot. so we can limit search to json objects
// in Diffbot.cpp
case CT_JSON: s = "json" ; break;
}
// bail if unrecognized content type
if ( ! s ) return true;
// . now hash it
// . use a score of 1 for all
// . TODO: ensure doc counting works ok with this when it does
// its interpolation
return hashString (s,strlen(s),&hi );
}
// . hash the link: terms
// . ensure that more useful linkers are scored higher
// . useful for computing offsite link text for qdb-ish algorithm
// . NOTE: for now i do not hash links to the same domain in order to
// hopefully save 10%-25% index space
// . NOTE: PLUS, they may clog up the link-adjusted quality ratings since
// different site links with no link text will be ranked behind them
// . the 8-bit bitmap of the score of a link: term:
// . 00ubdcss u = link is Unbanned? b = link isBanned?
// d = link dirty? c = link clean?
// s = 01 if no link text, 10 if link text
// . NOTE: this is used in Msg18.cpp for extraction
// . CAUTION: IndexList::score32to8() will warp our score if it's >= 128
// so i moved the bits down
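// . e.g. an outlink to cnn.com/world/ is indexed below as
//   link:http://www.cnn.com/world/ and sitelink:www.cnn.com
//   (the "www." is added by the normalization in the loop)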
bool XmlDoc::hashLinks ( HashTableX *tt ) {
setStatus ( "hashing links" );
// shortcuts
bool isRSSFeed = *getIsRSS();
char dbuf[8*4*1024];
HashTableX dedup;
dedup.set( 8,0,1024,dbuf,8*4*1024,false,"hldt");
CollectionRec *cr = getCollRec();
if ( ! cr ) {
logTrace( g_conf.m_logTraceXmlDoc, "END, getCollRec failed" );
return false;
}
// see ../url/Url2.cpp for hashAsLink() algorithm
for ( int32_t i = 0 ; i < m_links.m_numLinks ; i++ ) {
// skip links with zero length
if ( m_links.m_linkLens[i] == 0 ) {
continue;
}
// . skip if we are rss page and this link is an <a href> link
// . we only harvest/index <link> urls from rss feeds
// . or in the case of feedburner, those orig tags
if ( isRSSFeed && (m_links.m_linkFlags[i] & LF_AHREFTAG) ) {
continue;
}
// if we have a <feedburner:origLink> tag, then ignore <link>
// tags and only get the links from the original links
if ( m_links.m_isFeedBurner && !(m_links.m_linkFlags[i] & LF_FBTAG) ) {
continue;
}
// normalize the link
Url link;
// now we always add "www" to these links so that any link
// to cnn.com is same as link to www.cnn.com, because either
// we index cnn.com or www.cnn.com but not both providing
// their content is identical (deduping). This way whichever
// one we index, we can take advantage of all link text whether
// it's to cnn.com or www.cnn.com.
// Every now and then we add new session ids to our list in
// Url.cpp, too, so we have to version that.
// Since this is just for hashing, it shouldn't matter that
// www.tmblr.co has no IP whereas only tmblr.co does.
link.set( m_links.m_linkPtrs[i], m_links.m_linkLens[i], true, m_links.m_stripParams, m_version );
// BR 20160105: Do not create "link:" hashes for media URLs etc.
if( link.hasNonIndexableExtension(TITLEREC_CURRENT_VERSION) || // @todo BR: For now ignore actual TitleDB version. // m_version) ||
link.hasScriptExtension() ||
link.hasJsonExtension() ||
link.hasXmlExtension() ||
isUrlBlocked(link)) {
logTrace( g_conf.m_logTraceXmlDoc, "Unwanted for indexing [%s]", link.getUrl());
continue;
}
// dedup this crap
int64_t h = hash64 ( link.getUrl(), link.getUrlLen() );
if ( dedup.isInTable ( &h ) ) continue;
if ( ! dedup.addKey ( &h ) ) return false;
// set up the hashing parms
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
hi.m_prefix = "link";
// hash link:<url>
if ( ! hashSingleTerm ( link.getUrl(),link.getUrlLen(),&hi )) {
return false;
}
h = hash64 ( link.getHost() , link.getHostLen() );
if ( dedup.isInTable ( &h ) ) continue;
if ( ! dedup.addKey ( &h ) ) return false;
// fix parm
hi.m_prefix = "sitelink";
// hash sitelink:<urlHost>
if ( ! hashSingleTerm ( link.getHost(),link.getHostLen(),&hi)) {
return false;
}
}
return true;
}
// . returns false and sets g_errno on error
// . hash for linkdb
bool XmlDoc::hashLinksForLinkdb ( HashTableX *dt ) {
// sanity check
if ( dt->getKeySize() != sizeof(key224_t) ) { g_process.shutdownAbort(true); }
if ( dt->getDataSize() != 0 ) { g_process.shutdownAbort(true); }
// this will be different with our new site definitions
uint32_t linkerSiteHash32 = *getSiteHash32();
char siteRank = getSiteRank();
if ( ! m_linksValid ) { g_process.shutdownAbort(true); }
int32_t *linkSiteHashes = getLinkSiteHashes();
if ( ! linkSiteHashes || linkSiteHashes == (void *)-1 ) {
g_process.shutdownAbort(true);
}
// use spidered time! might not be current time! like if rebuilding
// or injecting from a past spider time
int32_t discoveryDate = getSpideredTime();
// add in new links
for ( int32_t i = 0 ; i < m_links.m_numLinks ; i++ ) {
// skip if empty
if (m_links.m_linkLens[i] == 0) {
continue;
}
// . skip if spam, ALWAYS allow internal outlinks though!!
// . CAUTION: now we must version islinkspam()
bool spam = m_links.isLinkSpam(i);
// get site of outlink from tagrec if in there
int32_t linkeeSiteHash32 = linkSiteHashes[i];
//
// when setting the links class it should set the site hash
//
#ifdef _VALGRIND_
VALGRIND_CHECK_MEM_IS_DEFINED(&linkeeSiteHash32,sizeof(linkeeSiteHash32));
uint64_t tmp1 = m_links.getLinkHash64(i);
VALGRIND_CHECK_MEM_IS_DEFINED(&tmp1,sizeof(tmp1));
VALGRIND_CHECK_MEM_IS_DEFINED(&spam,sizeof(spam));
VALGRIND_CHECK_MEM_IS_DEFINED(&siteRank,sizeof(siteRank));
// uint32_t tmp2 = *getIp();
// VALGRIND_CHECK_MEM_IS_DEFINED(&tmp2,sizeof(tmp2));
uint64_t tmp3 = *getDocId();
VALGRIND_CHECK_MEM_IS_DEFINED(&tmp3,sizeof(tmp3));
VALGRIND_CHECK_MEM_IS_DEFINED(&discoveryDate,sizeof(discoveryDate));
VALGRIND_CHECK_MEM_IS_DEFINED(&linkerSiteHash32,sizeof(linkerSiteHash32));
#endif
int32_t *ipptr = getIp();
int32_t ip = ipptr ? *ipptr : 0;
// set this key, it is the entire record
key224_t k = Linkdb::makeKey_uk ( linkeeSiteHash32 ,
m_links.getLinkHash64(i) ,
spam , // link spam?
siteRank , // was quality
ip,
*getDocId() ,
discoveryDate ,
0 ,
false , // new add?
linkerSiteHash32 ,
false );// delete?
#ifdef _VALGRIND_
VALGRIND_CHECK_MEM_IS_DEFINED(&k,sizeof(k));
#endif
// store in hash table
if (!dt->addKey(&k, NULL)) {
return false;
}
}
return true;
}
// . returns false and sets g_errno on error
// . copied Url2.cpp into here basically, so we can now dump Url2.cpp
bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc ) {
setStatus ( "hashing url colon" );
// get the first url
Url *fu = getFirstUrl();
// set up the hashing parms
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
// we do not need diversity bits for this
hi.m_useCountTable = false;
//
// HASH url: term
//
// append a "www." for doing url: searches
Url uw;
uw.set( fu->getUrl(), fu->getUrlLen(), true, false );
hi.m_prefix = "url";
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
return false;
if (urlOnly) {
return true;
}
bool *ini = getIsNoIndex();
if (ini == nullptr || ini == (bool*)-1) {
// must not be blocked
gbshutdownLogicError();
}
const char *s = fu->getUrl();
int32_t slen = fu->getUrlLen();
SafeBuf sb_punyDecodedHost;
//no-index support was added in version 126. So if noindex is not set, or if we
//are (un)indexing an older titlerec version, then generate the index terms
if (!*ini || m_version <= 126) {
setStatus("hashing inurl colon");
//
// HASH inurl: terms
//
hi.m_prefix = "inurl";
// BR 20160114: Skip numbers in urls when doing "inurl:" queries
hi.m_hashNumbers = false;
hi.m_filterUrlIndexableWords = true;
if (!hashString(s, slen, &hi)) return false;
//If the host has punycode encoded characters in it and the TLD has some enforcement against phishing
//and misleading domains then index the punycode-decoded string too
if(fu->isPunycodeSafeTld() && fu->hasPunycode()) {
if(fu->getPunycodeDecodedHost(&sb_punyDecodedHost)) {
//note: we index non-punycode labels too, it is not worth the effort to avoid that
//because we also need them for bigram generation. So eg www.ærtesuppe.dk will get
//indexed for "www", "xn--rtesuppe-i0a", and "dk" in the hashString() call above
//and then for "www", "ærtesuppe" and "dk" below.
if (!hashString(sb_punyDecodedHost.getBufStart(), sb_punyDecodedHost.length(), &hi))
return false;
}
}
}
{
setStatus("hashing ip colon");
hi.m_hashNumbers = true;
hi.m_filterUrlIndexableWords = false;
//
// HASH ip:a.b.c.d
//
if (!m_ipValid) { g_process.shutdownAbort(true); }
// copy it to save it
char ipbuf[64];
int32_t iplen = strlen(iptoa(m_ip, ipbuf));
hi.m_prefix = "ip";
if (!hashSingleTerm(ipbuf, iplen, &hi)) return false;
// . sanity check
if (!m_siteNumInlinksValid) { g_process.shutdownAbort(true); }
}
//
// HASH the url's mid domain and host as they were in the body
//
setStatus ( "hashing site colon terms");
//
// HASH the site: terms
//
// . hash the pieces of the site
// . http://host.domain.com/~harry/level1/ should hash to:
// . site:host.domain.com/~harry/level1/
// . site:host.domain.com/~harry/
// . site:host.domain.com/~
// . site:host.domain.com/
// . site:domain.com/~harry/level1/
// . site:domain.com/~harry/
// . site:domain.com/~
// . site:domain.com/
// ensure score is positive
//if ( siteScore <= 0 ) siteScore = 1;
// get the hostname (later we set to domain name)
const char *name = fu->getHost();
int32_t nameLen = fu->getHostLen();
#ifdef _VALGRIND_
VALGRIND_CHECK_MEM_IS_DEFINED(name,nameLen);
#endif
// . point to the end of the whole thing, including port field
// . add in port, if non default
const char *end3 = name + fu->getHostLen() + fu->getPortLen();
// Generate string with port if server runs on non-standard ports
char pbuf[12];
int pbufLen=0;
int32_t port = fu->getPort();
if( port > 0 && port != 80 && port != 443 ) {
pbufLen=snprintf(pbuf, 12, ":%" PRIu32, (uint32_t)fu->getPort());
}
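// The 'loop' label below is taken once per hostname suffix: the inner for-loop
// hashes a site: term for every subpath of the current name (the full host
// first, then the name with its leftmost label stripped off, down to the
// domain), producing the site:host.domain.com/... and site:domain.com/...
// variants listed above.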
loop:
// now loop through the sub paths of this url's path
int32_t prev_len = -1;
for ( int32_t i = 0 ; ; i++ ) {
// get the subpath
int32_t len = fu->getSubPathLen(i);
if(len==prev_len) //work around bug (?) in Url
continue;
prev_len = len;
// FIX: always include first /
if ( len == 0 ) {
len = 1;
}
// write http://www.whatever.com/path into buf
char buf[MAX_URL_LEN+10];
char *p = buf;
// BR 20160122: Do NOT fix this for https sites. The search is
// always prefixed with http:// (sigh ...)
gbmemcpy ( p , "http://" , 7 ); p += 7;
gbmemcpy ( p , name, nameLen); p += nameLen;
if( pbufLen > 0 )
{
gbmemcpy ( p , pbuf, pbufLen); p += pbufLen;
}
gbmemcpy ( p , fu->getPath() , len ); p += len;
*p = '\0';
// update hash parms
if (m_version <= 126) {
hi.m_prefix = "site";
} else {
hi.m_prefix = *ini ? "sitenoindex" : "site";
}
hi.m_hashGroup = HASHGROUP_INURL;
// this returns false on failure
if ( ! hashSingleTerm (buf,p-buf,&hi ) ) {
return false;
}
// break when we hash the root path
if ( len <=1 ) {
break;
}
}
// now keep moving the period over in the hostname
while ( name < end3 && *name != '.' ) {
name++;
nameLen--;
}
// skip the '.'
name++; nameLen--;
// Check that there is a dot before first slash after domain
// to avoid junk entries like http://com/subpath/pagename.html
bool dom_valid = false;
if( nameLen > 0 )
{
int32_t dom_offset=0;
if( strncmp(name,"http://" ,7)==0 )
{
dom_offset=7;
}
else
if( strncmp(name,"https://",8)==0 )
{
dom_offset=8;
}
const char *dotpos = (const char *)memchr(name,'.',nameLen);
const char *slashpos= (const char *)memchr(name+dom_offset,'/',nameLen-dom_offset);
if( dotpos && (!slashpos || (slashpos > dotpos)) )
{
dom_valid = true;
}
}
if ( name < end3 && dom_valid ) goto loop;
// BR 20160121: Make searching for e.g. site:dk work
setStatus ( "hashing tld for site search");
const char *tld = fu->getTLD();
int32_t tldLen = fu->getTLDLen();
if( tldLen > 0 && tldLen < 64 ) {
char tldBuf[72]; // http:// (7) + tld (63) + / (1) + 0 (1)
char *p = tldBuf;
gbmemcpy ( p , "http://", 7 ); p += 7;
gbmemcpy ( p , tld, tldLen); p += tldLen;
gbmemcpy ( p , "/", 1 ); p += 1;
*p = '\0';
if ( ! hashSingleTerm (tldBuf, p - tldBuf, &hi ) ) {
return false;
}
}
const char *ext = fu->getExtension();
int32_t elen = fu->getExtensionLen();
if (!*ini || m_version <= 126) {
//
// HASH ext: term
//
// i.e. ext:gif ext:html ext:htm ext:pdf, etc.
setStatus("hashing ext colon");
// update hash parms
hi.m_prefix = "ext";
if (!hashSingleTerm(ext, elen, &hi)) return false;
}
{
setStatus("hashing gbdocid");
hi.m_prefix = "gbdocid";
char buf2[32];
sprintf(buf2, "%" PRIu64, (uint64_t)m_docId);
if (!hashSingleTerm(buf2, strlen(buf2), &hi)) return false;
}
setStatus ( "hashing SiteGetter terms");
//
// HASH terms for SiteGetter.cpp
//
// . this termId is used by SiteGetter.cpp for determining subsites
// . matches what is in SiteGet::getSiteList()
// for www.xyz.com/a/ HASH www.xyz.com
// for www.xyz.com/a/b/ HASH www.xyz.com/a/
// for www.xyz.com/a/b/c/ HASH www.xyz.com/a/b/
bool add = true;
// we only hash this for urls that end in '/'
if ( s[slen-1] != '/' ) add = false;
// and no cgi
if ( fu->isCgi() ) add = false;
// skip if root
if ( fu->getPathLen() <= 1 ) add = false;
// sanity check
if ( ! m_linksValid ) { g_process.shutdownAbort(true); }
// . skip if we have no subdirectory outlinks
// . that way we do not confuse all the pages in dictionary.com or
// wikipedia.org as subsites!!
if ( ! m_links.hasSubdirOutlink() ) add = false;
const char *host = fu->getHost();
int32_t hlen = fu->getHostLen ();
// tags from here out
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_shardByTermId = true;
// hash it
if ( add ) {
// remove the last path component
const char *end2 = s + slen - 2;
// back up over last component
for ( ; end2 > fu->getPath() && *end2 != '/' ; end2-- ) ;
// hash that part of the url
hi.m_prefix = "siteterm";
if ( ! hashSingleTerm ( host,end2-host,&hi) ) return false;
}
hi.m_shardByTermId = false;
setStatus ( "hashing urlhashdiv10 etc");
//
// HASH urlhash: urlhashdiv10: urlhashdiv100: terms
//
// this is for proving how many docs are in the index
char buf[20];
int32_t blen;
uint32_t h = hash32 ( s , slen );
blen = sprintf(buf,"%" PRIu32,h);
hi.m_prefix = "urlhash";
if ( ! hashString(buf,blen,&hi) ) return false;
// don't index mid domain or url path for noindex document
if (*ini && m_version > 126) {
return true;
}
if (size_utf8Content - 1 > 0 || m_indexCode == EDOCDISALLOWEDROOT) {
setStatus("hashing url mid domain");
// update parms
hi.m_prefix = NULL;
hi.m_desc = "middle domain";
hi.m_hashGroup = HASHGROUP_INURL;
hi.m_filterUrlIndexableWords = true; // Skip com, http etc.
if (!hashString(host, hlen, &hi)) {
return false;
}
if(sb_punyDecodedHost.length()>1) {
if(!hashString(sb_punyDecodedHost.getBufStart(),sb_punyDecodedHost.length(), &hi))
return false;
}
hi.m_filterUrlIndexableWords = false;
if (!hashSingleTerm(fu->getDomain(), fu->getDomainLen(), &hi)) {
return false;
}
}
if (size_utf8Content - 1 > 0) {
setStatus("hashing url path");
const char *path = fu->getPath();
int32_t plen = fu->getPathLen();
// BR 20160113: Do not hash and combine the page filename extension with the page name (skip e.g. .com)
if (elen > 0) {
elen++; // also skip the dot
}
plen -= elen;
// BR 20160113: Do not hash the most common page names
if (strncmp(path, "/index", plen) != 0) {
// hash the path
// BR 20160114: Exclude numbers in paths (usually dates)
hi.m_hashGroup = HASHGROUP_INURL;
hi.m_hashNumbers = false;
if (!hashString(path, plen, &hi)) return false;
}
}
//actually index the middle domain. The above indexing of filtered-host and singleterm-domain was in the original code so it was always misleading
{
setStatus("hashing url mid domain");
hi.m_prefix = NULL;
hi.m_desc = "middle domain(2)";
hi.m_hashGroup = HASHGROUP_MIDDOMAIN;
hi.m_filterUrlIndexableWords = false;
const char *mdom = fu->getMidDomain();
int32_t mdomlen = fu->getMidDomainLen();
if (!hashString(mdom, mdomlen, &hi)) {
return false;
}
if(fu->isPunycodeSafeTld() && fu->hasPunycode()) {
SafeBuf sb_punyDecodedMidDomain;
if(fu->getPunycodeDecodedMidDomain(&sb_punyDecodedMidDomain)) {
if (!hashString(sb_punyDecodedMidDomain.getBufStart(), sb_punyDecodedMidDomain.length(), &hi))
return false;
}
}
}
return true;
}
// . returns false and sets g_errno on error
bool XmlDoc::hashIncomingLinkText(HashTableX *tt) {
setStatus ( "hashing link text" );
// sanity
if ( ! m_linkInfo1Valid ) { g_process.shutdownAbort(true); }
// . finally hash in the linkText terms from the LinkInfo
// . the LinkInfo class has all the terms of hashed anchor text for us
// . if we're using an old TitleRec linkTermList is just a ptr to
// somewhere in TitleRec
// . otherwise, we generated it from merging a bunch of LinkInfos
// and storing them in this new TitleRec
LinkInfo *linkInfo = getLinkInfo1();
// sanity checks
if ( ! m_ipValid ) { g_process.shutdownAbort(true); }
if ( ! m_siteNumInlinksValid ) { g_process.shutdownAbort(true); }
//
// brought the following code in from LinkInfo.cpp
//
// count "external" inlinkers
int32_t ecount = 0;
// update hash parms
HashInfo hi;
hi.m_tt = tt;
// hashstring should update this like a cursor.
hi.m_startDist = 0;
// loop through the link texts and hash them
for ( Inlink *k = NULL; linkInfo && (k = linkInfo->getNextInlink(k)) ; ) {
// is this inlinker internal?
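// (a linker whose IP shares its low two bytes with ours is treated as
// internal; presumably the first two octets of the address, i.e. roughly
// the same /16 network)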
bool internal=((m_ip&0x0000ffff)==(k->m_ip&0x0000ffff));
// count external inlinks we have for indexing gbmininlinks:
if ( ! internal ) ecount++;
// get length of link text
int32_t tlen = k->size_linkText;
if ( tlen > 0 ) tlen--;
// get the text
const char *txt = k->getLinkText();
// sanity check
if ( ! verifyUtf8 ( txt , tlen ) ) {
log("xmldoc: bad link text 2 from url=%s for %s",
k->getUrl(),m_firstUrl.getUrl());
continue;
}
if ( internal ) hi.m_hashGroup = HASHGROUP_INTERNALINLINKTEXT;
else hi.m_hashGroup = HASHGROUP_INLINKTEXT;
// store the siterank of the linker in this and use that
// to set the multiplier M bits i guess
hi.m_linkerSiteRank = k->m_siteRank;
if(hi.m_linkerSiteRank>MAXSITERANK) {
log(LOG_INFO,"Inlink had siteRank>max (%d), probably from docid %ld", k->m_siteRank, k->m_docId);
hi.m_linkerSiteRank = MAXSITERANK;
}
// now record this so we can match the link text to
// a matched offsite inlink text term in the scoring info
k->m_wordPosStart = m_dist; // hi.m_startDist;
// . hash the link text into the table
// . returns false and sets g_errno on error
// . we still have the score punish from # of words though!
// . for inlink texts that are the same it should accumulate
// and use the reserved bits as a multiplier i guess...
if ( ! hashString4(txt,tlen,&hi) ) return false;
// now record this so we can match the link text to
// a matched offsite inlink text term in the scoring info
//k->m_wordPosEnd = hi.m_startDist;
// spread it out: leave a 20-position gap so terms from different inlink texts are not treated as adjacent
hi.m_startDist += 20;
}
return true;
}
// . returns false and sets g_errno on error
bool XmlDoc::hashNeighborhoods ( HashTableX *tt ) {
setStatus ( "hashing neighborhoods" );
// . now we also hash the neighborhood text of each inlink, that is,
// the text surrounding the inlink text.
// . this is also destructive in that it will remove termids that
// were not in the document being linked to in order to save
// space in the titleRec
// . now we only do one or the other, not both
LinkInfo *linkInfo = getLinkInfo1();
if(!linkInfo)
return true;
// loop over all the Inlinks
for(Inlink *k = linkInfo->getNextInlink(NULL); k; k = linkInfo->getNextInlink(k)) {
// skip if internal, they often have the same neighborhood text
if((k->m_ip&0x0000ffff)==(m_ip&0x0000ffff))
continue;
// get the left and right texts and hash both
const char *s = k->getSurroundingText();
if(!s || k->size_surroundingText <= 1)
continue;
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_desc = "surrounding text";
hi.m_hashGroup = HASHGROUP_NEIGHBORHOOD;
// . hash that
// . this returns false and sets g_errno on error
int32_t len = k->size_surroundingText - 1;
if(!hashString(s, len, &hi))
return false;
}
return true;
}
// . we now do the title hashing here for newer titlerecs, version 80+, rather
// than use the <index> block in the ruleset for titles.
// . this is not to be confused with hashing the title: terms which still
// does have an <index> block in the ruleset.
// . the new Weights class hashes title as part of body now with a high weight
// given by "titleWeight" parm
bool XmlDoc::hashTitle ( HashTableX *tt ) {
// sanity check
if ( m_hashedTitle ) { g_process.shutdownAbort(true); }
setStatus ( "hashing title" );
// this has been called, note it
m_hashedTitle = true;
//getXml()->getUtf8Content() returns the HTML ~mostly~ decoded, but lt/gt/amp are still escaped.
//So get the title text from m_xml, retokenize it, and then index that
int rawTitleLen;
const char *rawTitle = m_xml.getString("title",&rawTitleLen);
if(!rawTitle) {
//no title - nothing to do
return true;
}
//The amp/lt/gt are still there so decode them once again to get rid of them.
//Due to bad webmasters there can be double-encoded entities in the title. Technically it is
//their error but we can make some repairs on those pages.
const char *title = rawTitle;
int32_t titleLen = rawTitleLen;
StackBuf<1024> doubleDecodedContent;
possiblyDecodeHtmlEntitiesAgain(&title, &titleLen, &doubleDecodedContent, false);
//get language and country if known, so tokenizer phase 2 can do its magic
lang_t lang_id;
const char *countryCode;
getLanguageAndCountry(&lang_id,&countryCode);
TokenizerResult tr;
plain_tokenizer_phase_1(title,titleLen,&tr);
plain_tokenizer_phase_2(lang_id, countryCode, &tr);
calculate_tokens_hashes(&tr);
sortTokenizerResult(&tr);
Bits bits;
if(!bits.set(&tr))
return false;
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_TITLE;
// hash with title: prefix
hi.m_prefix = "title";
if(!hashWords3(&hi, &tr, NULL, &bits, NULL, NULL, NULL, m_wts, &m_wbuf))
return false;
// hash without title: prefix
hi.m_prefix = NULL;
if(!hashWords3(&hi, &tr, NULL, &bits, NULL, NULL, NULL, m_wts, &m_wbuf))
return false;
return true;
}
// . we now do the title hashing here for newer titlerecs, version 80+, rather
// than use the <index> block in the ruleset for titles.
// . this is not to be confused with hashing the title: terms which still
// does have an <index> block in the ruleset.
bool XmlDoc::hashBody2 ( HashTableX *tt ) {
// do not index ANY of the body if it is NOT a permalink and
// "menu elimination" technology is enabled.
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
setStatus ( "hashing body" );
// record this
m_bodyStartPos = m_dist;
m_bodyStartPosValid = true;
HashInfo hi;
hi.m_tt = tt;
hi.m_desc = "body";
hi.m_hashGroup = HASHGROUP_BODY;
// use NULL for the prefix
return hashWords (&hi );
}
bool XmlDoc::hashMetaKeywords ( HashTableX *tt ) {
// do not index meta tags if "menu elimination" technology is enabled.
//if ( m_eliminateMenus ) return true;
setStatus ( "hashing meta keywords" );
// hash the meta keywords tag
//char buf [ 2048 + 2 ];
//int32_t len=m_xml.getMetaContentPointer ( buf , 2048 , "keywords" , 8 );
int32_t mklen;
const char *mk = getMetaKeywords( &mklen );
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_desc = "meta keywords";
hi.m_hashGroup = HASHGROUP_INMETATAG;
// call XmlDoc::hashString
return hashString4(mk, mklen, &hi);
}
void XmlDoc::lookupAndSetExplicitKeywords() {
std::string kw;
kw = ExplicitKeywords::lookupExplicitKeywords(m_firstUrl.getUrl());
if(kw.empty())
kw = ExplicitKeywords::lookupExplicitKeywords(m_currentUrl.getUrl());
if(!kw.empty()) {
log(LOG_DEBUG,"spider: found explicit keywords '%s' for %s", kw.c_str(),m_firstUrl.getUrl());
m_explicitKeywordsBuf.set(kw.c_str());
ptr_explicitKeywords = m_explicitKeywordsBuf.getBufStart();
size_explicitKeywords = m_explicitKeywordsBuf.length();
} else {
m_explicitKeywordsBuf.purge();
ptr_explicitKeywords = NULL;
size_explicitKeywords = 0;
}
}
bool XmlDoc::hashExplicitKeywords(HashTableX *tt) {
if(m_version<128)
return true;
setStatus("hashing explicit keywords");
if(size_explicitKeywords>0) {
log(LOG_DEBUG,"spider: hashing explicit keywords '%.*s' for %s", size_explicitKeywords, ptr_explicitKeywords, m_firstUrl.getUrl());
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_desc = "explicit keywords";
hi.m_hashGroup = HASHGROUP_EXPLICIT_KEYWORDS;
return hashString4(ptr_explicitKeywords, size_explicitKeywords, &hi);
} else
return true; //nothing done - no error
}
// . hash the meta summary, description and keyword tags
// . we now do the title hashing here for newer titlerecs, version 80+, rather
// than use the <index> block in the ruleset for titles.
bool XmlDoc::hashMetaSummary ( HashTableX *tt ) {
// sanity check
if ( m_hashedMetas ) { g_process.shutdownAbort(true); }
// this has been called, note it
m_hashedMetas = true;
// do not index meta tags if "menu elimination" technology is enabled.
//if ( m_eliminateMenus ) return true;
setStatus ( "hashing meta summary" );
StackBuf<1024> doubleDecodedContent;
// hash the meta keywords tag
//char buf [ 2048 + 2 ];
//int32_t len = m_xml.getMetaContent ( buf , 2048 , "summary" , 7 );
int32_t mslen;
const char *ms = getMetaSummary ( &mslen );
possiblyDecodeHtmlEntitiesAgain(&ms, &mslen, &doubleDecodedContent, true);
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INMETATAG;
// update hashing parms
hi.m_desc = "meta summary";
// hash it
if(!hashString4(ms,mslen,&hi))
return false;
//len = m_xml.getMetaContent ( buf , 2048 , "description" , 11 );
int32_t mdlen;
const char *md = getMetaDescription ( &mdlen );
possiblyDecodeHtmlEntitiesAgain(&md, &mdlen, &doubleDecodedContent, true);
// update hashing parms
hi.m_desc = "meta desc";
// . TODO: only hash if unique????? set a flag on ht then i guess
if(!hashString4(md,mdlen, &hi))
return false;
return true;
}
bool XmlDoc::hashMetaGeoPlacename( HashTableX *tt ) {
setStatus ( "hashing meta geo.placename" );
int32_t mgplen;
const char *mgp = getMetaGeoPlacename( &mgplen );
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_desc = "meta geo.placename";
hi.m_hashGroup = HASHGROUP_INMETATAG;
// call XmlDoc::hashString
return hashString4(mgp, mgplen, &hi);
}
bool XmlDoc::hashLanguage ( HashTableX *tt ) {
setStatus ( "hashing language" );
int32_t langId = (int32_t)*getLangId();
char s[32]; // numeric langid
int32_t slen = sprintf(s, "%" PRId32, langId );
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gblang";
if ( ! hashString ( s, slen, &hi ) ) return false;
return true;
}
bool XmlDoc::hashLanguageString ( HashTableX *tt ) {
setStatus ( "hashing language string" );
int32_t langId = (int32_t)*getLangId();
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gblang";
// try lang abbreviation
char s[32];
int32_t slen = sprintf(s , "%s ", getLanguageAbbr(langId) );
// go back to broken way to try to fix parsing consistency bug
if ( ! hashString ( s, slen, &hi ) ) return false;
return true;
}
bool XmlDoc::hashCountry ( HashTableX *tt ) {
setStatus ( "hashing country" );
uint16_t *cid = getCountryId();
if ( ! cid || cid == (uint16_t *)-1 ) return false;
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbcountry";
for ( int32_t i = 0 ; i < 1 ; i++ ) {
// convert it
char buf[32];
int32_t blen = sprintf(buf,"%s", g_countryCode.getAbbr(*cid) );
// hash it
if ( ! hashString ( buf, blen, &hi ) ) return false;
}
// all done
return true;
}
bool XmlDoc::hashLemmas(HashTableX *table) {
setStatus("hashing lemmas"); //Not llamas
logTrace(g_conf.m_logTraceTokenIndexing,"lemma_words.size()=%zu", lemma_words.size());
HashInfo hi; //storeTerm wants a HashInfo instance.
if(m_dist > MAXWORDPOS) {
log(LOG_INFO,"hashLemmas(): wordpos limit hit in document %.*s", m_firstUrl.getUrlLen(), m_firstUrl.getUrl());
return true;
}
for(const auto &e : lemma_words) {
uint64_t h = hash64Lower_utf8(e.data(),e.length());
logTrace(g_conf.m_logTraceTokenIndexing,"Indexing lemma '%s', h=%ld, termid=%lld", e.c_str(), h, h&TERMID_MASK);
key144_t k;
Posdb::makeKey(&k,
h,
0LL,//docid
m_dist,
0,// densityRank , // 0-15
0, //diversityrank
0, //wordspamrank
0, // siterank
HASHGROUP_LEMMA,
m_langId, // we set to docLang final hash loop
0, // multiplier
false, // syn?
false, // delkey?
false); //shardByTermId
table->addTerm144(&k);
if(m_wts) {
// add to wts for PageParser.cpp display
if(!storeTerm(e.data(),e.length(),
h, &hi,
0, //word index. We could keep track of the first word that generated this base form. But we don't.
m_dist, // wordPos
0,// densityRank , // 0-15
0, //diversityrank
0, //wordspamrank
HASHGROUP_LEMMA,
&m_wbuf,
m_wts,
SOURCE_NONE, // synsrc
m_langId,
k))
return false;
}
}
return true;
}
void XmlDoc::sortTokenizerResult(TokenizerResult *tr) {
std::sort(tr->tokens.begin(), tr->tokens.end(), [](const TokenRange&tr0, const TokenRange &tr1) {
return tr0.start_pos < tr1.start_pos ||
(tr0.start_pos == tr1.start_pos && tr0.end_pos<tr1.end_pos);
});
}
void XmlDoc::getLanguageAndCountry(lang_t *lang, const char **country_code) {
//get language and country if known, so tokenizer phase 2 can do its magic
uint8_t *tmpLangId = getLangId();
if(tmpLangId!=NULL && tmpLangId!=(uint8_t*)-1)
*lang = (lang_t)*tmpLangId;
else
*lang = langUnknown;
uint16_t *countryId = getCountryId();
if(countryId!=NULL && countryId!=(uint16_t*)-1)
*country_code = g_countryCode.getAbbr(*countryId);
else
*country_code = NULL;
}
bool XmlDoc::hashSingleTerm( const char *s, int32_t slen, HashInfo *hi ) {
// empty?
if ( slen <= 0 ) return true;
if ( ! m_versionValid ) { g_process.shutdownAbort(true); }
if ( hi->m_useCountTable && ! m_countTableValid){g_process.shutdownAbort(true); }
// a single blob hash
int64_t termId = hash64 ( s , slen );
// combine with prefix
int64_t final = termId;
// combine with a non-NULL prefix
int64_t prefixHash = 0LL;
if ( hi->m_prefix ) {
prefixHash = hash64b ( hi->m_prefix );
final = hash64 ( termId , prefixHash );
}
// call the other guy now
//return hashSingleTerm ( final , hi );
// shortcut
HashTableX *dt = hi->m_tt;
// sanity check
if ( dt->getKeySize() != sizeof(key144_t) ) { g_process.shutdownAbort(true); }
// make the key like we do in hashWords()
key144_t k;
Posdb::makeKey ( &k ,
final,
0LL, // docid
0, // dist
MAXDENSITYRANK, // density rank
MAXDIVERSITYRANK, // diversity rank
MAXWORDSPAMRANK, // wordspamrank
0, // siterank
hi->m_hashGroup,
// we set to docLang in final hash loop
langUnknown,// langid
0, // multiplier
0, // syn?
false , // delkey?
hi->m_shardByTermId );
// . otherwise, add a new slot
// . key should NEVER collide since we are always
// incrementing the distance cursor, m_dist
if ( ! dt->addTerm144 ( &k ) ) return false;
// add to wts for PageParser.cpp display
if ( m_wts && ! storeTerm ( s,slen,final,hi,
0, // wordnum
0, // wordPos,
MAXDENSITYRANK,
MAXDIVERSITYRANK,
MAXWORDSPAMRANK,
hi->m_hashGroup,
//false,
&m_wbuf,
m_wts,
SOURCE_NONE, // synsrc
langUnknown,
k) )
return false;
return true;
}
bool XmlDoc::hashString( const char *s, int32_t slen, HashInfo *hi ) {
if ( ! m_versionValid ) { g_process.shutdownAbort(true); }
if ( hi->m_useCountTable && ! m_countTableValid){g_process.shutdownAbort(true); }
if ( ! m_siteNumInlinksValid ) { g_process.shutdownAbort(true); }
return hashString3( s ,
slen ,
hi ,
m_wts ,
&m_wbuf );
}
bool XmlDoc::hashString(size_t begin_token, size_t end_token, HashInfo *hi) {
if(!m_versionValid)
gbshutdownLogicError();
return hashString3(begin_token, end_token, hi,
m_wts,
&m_wbuf);
}
bool XmlDoc::hashString3( const char *s ,
int32_t slen ,
HashInfo *hi ,
HashTableX *wts ,
SafeBuf *wbuf) {
TokenizerResult tr;
Bits bits;
plain_tokenizer_phase_1(s,slen,&tr);
calculate_tokens_hashes(&tr);
if ( !bits.set(&tr))
return false;
// use primary langid of doc
if ( ! m_langIdValid ) { g_process.shutdownAbort(true); }
return hashWords3( hi, &tr, NULL, &bits, NULL, NULL, NULL, wts, wbuf );
}
bool XmlDoc::hashString3(size_t begin_token, size_t end_token, HashInfo *hi,
HashTableX *wts, SafeBuf *wbuf)
{
Bits bits;
if ( !bits.set(&m_tokenizerResult))
return false;
return hashWords3( hi, &m_tokenizerResult, begin_token, end_token, NULL, &bits, NULL, NULL, NULL, wts, wbuf );
}
bool XmlDoc::hashString4(const char *s, int32_t slen, HashInfo *hi) {
TokenizerResult tr;
Bits bits;
lang_t lang_id;
const char *countryCode;
getLanguageAndCountry(&lang_id,&countryCode);
plain_tokenizer_phase_1(s,slen,&tr);
plain_tokenizer_phase_2(lang_id,countryCode,&tr);
calculate_tokens_hashes(&tr);
sortTokenizerResult(&tr);
if(!bits.set(&tr))
return false;
return hashWords3( hi, &tr, NULL, &bits, NULL, NULL, NULL, m_wts, &m_wbuf );
}
bool XmlDoc::hashWords ( HashInfo *hi ) {
// sanity checks
if ( ! m_tokenizerResultValid ) { g_process.shutdownAbort(true); }
if ( ! m_tokenizerResultValid2 ) { g_process.shutdownAbort(true); }
//if ( hi->m_useCountTable &&!m_countTableValid){g_process.shutdownAbort(true); }
if ( ! m_bitsValid ) { g_process.shutdownAbort(true); }
if ( ! m_sectionsValid) { g_process.shutdownAbort(true); }
//if ( ! m_synonymsValid) { g_process.shutdownAbort(true); }
if ( ! m_fragBufValid ) { g_process.shutdownAbort(true); }
if ( ! m_wordSpamBufValid ) { g_process.shutdownAbort(true); }
if ( m_wts && ! m_langVectorValid ) { g_process.shutdownAbort(true); }
if ( ! m_langIdValid ) { g_process.shutdownAbort(true); }
// . is the word repeated in a pattern?
// . this should only be used for document body, for meta tags,
// inlink text, etc. we should make sure words are unique
char *wordSpamVec = getWordSpamVec();
char *fragVec = m_fragBuf.getBufStart();
char *langVec = m_langVec.getBufStart();
return hashWords3(hi, &m_tokenizerResult, &m_sections, &m_bits, fragVec, wordSpamVec, langVec, m_wts, &m_wbuf);
}
// . this now uses posdb exclusively
bool XmlDoc::hashWords3(HashInfo *hi, const TokenizerResult *tr,
Sections *sections, const Bits *bits,
const char *fragVec, const char *wordSpamVec, const char *langVec,
HashTableX *wts, SafeBuf *wbuf)
{
return hashWords3(hi,tr, 0,tr->size(), sections, bits, fragVec, wordSpamVec, langVec, wts, wbuf);
}
bool XmlDoc::hashWords3(HashInfo *hi, const TokenizerResult *tr, size_t begin_token, size_t end_token,
Sections *sections, const Bits *bits,
const char *fragVec, const char *wordSpamVec, const char *langVec,
HashTableX *wts, SafeBuf *wbuf)
{
// for getSpiderStatusDocMetaList() we don't use sections; they'll mess us up
if ( ! hi->m_useSections ) sections = NULL;
HashTableX *dt = hi->m_tt;
std::unordered_set<std::string> candidate_lemma_words;
// . sanity checks
// . posdb just uses the full keys with docid
if ( dt->getKeySize() != 18 ) { g_process.shutdownAbort(true); }
if ( dt->getDataSize() != 4 ) { g_process.shutdownAbort(true); }
// if provided...
if ( wts ) {
if ( wts->getKeySize() != 12 ) { g_process.shutdownAbort(true); }
if ( wts->getDataSize() != sizeof(TermDebugInfo)){g_process.shutdownAbort(true); }
if ( ! wts->isAllowDups() ) { g_process.shutdownAbort(true); }
}
// ensure caller set the hashGroup
if ( hi->m_hashGroup < 0 ) { g_process.shutdownAbort(true); }
// hash in the prefix
uint64_t prefixHash = 0LL;
int32_t plen = 0;
if ( hi->m_prefix ) plen = strlen ( hi->m_prefix );
if ( hi->m_prefix && plen ) {
// we gotta make this case insensitive, and skip spaces
// because if it is 'focal length' we can't search
// 'focal length:10' because that comes across as TWO terms.
prefixHash = hash64Lower_utf8_nospaces ( hi->m_prefix , plen );
// . sanity test, make sure it is in supported list
// . hashing diffbot json output of course fails this so
// skip in that case if diffbot
}
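// e.g. a hypothetical prefix such as "focal length" is hashed as if written
// "focallength", so a query typed as focallength:10 maps to the same prefix
// termid that was indexed here.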
bool hashIffUnique = false;
if ( hi->m_hashGroup == HASHGROUP_INMETATAG ) hashIffUnique = true;
if ( hi->m_hashGroup == HASHGROUP_INTAG ) hashIffUnique = true;
HashTableX ut; ut.set ( 8,0,0,NULL,0,false,"uqtbl");
//The diversity rank was effectively disabled (minweight=maxweight) and the algorithm was either suspect or severely limited by phrases being only 2 words (bigrams).
//Currently disabled until we can investigate if it is worth fixing, worth implementing in another way, or simply dropped completely.
//
//Diversityrank is currently hardcoded to be 10 for individual words, and maxdiversityrank for bigrams
SafeBuf dwbuf;
if(!dwbuf.reserve(tr->size()*sizeof(char)))
return false;
memset(dwbuf.getBufStart(), MAXDIVERSITYRANK, tr->size());
#if 0
///////
//
// diversity rank vector.
//
///////
// the final diversity which is a multiplier
// is converted into a rank from 0-15 i guess.
// so 'mexico' in "new mexico" should receive a low word score but high
// phrase score. thus, a search for 'mexico' should not bring up
// the page for university of new mexico!
SafeBuf dwbuf;
if ( !getDiversityVec( tr, phrases, countTable, &dwbuf ) ) {
return false;
}
#endif
char *wdv = dwbuf.getBufStart();
size_t nw = tr->size();
/////
//
// calculate density ranks
//
/////
//
// this now varies depending on the length of the sentence/header etc.
// so if the hasgroup is not title, link text or meta tag, we have to
// use a safebuf.
SafeBuf densBuf;
// returns false and sets g_errno on error
if ( ! getDensityRanks(tr,
hi->m_hashGroup,
&densBuf,
sections))
return false;
// a handy ptr
char *densvec = (char *)densBuf.getBufStart();
////////////
//
// get word positions
//
///////////
Section **sp = NULL;
if ( sections ) sp = sections->m_sectionPtrs;
SafeBuf wpos;
if ( ! getWordPosVec ( tr, sections, m_dist, fragVec, &wpos) )
return false;
// a handy ptr
int32_t *wposvec = (int32_t *)wpos.getBufStart();
if(end_token>begin_token && wposvec[end_token-1]>MAXWORDPOS) {
log(LOG_INFO,"hashWords3(): wordpos limit will be hit in document %.*s", m_firstUrl.getUrlLen(), m_firstUrl.getUrl());
}
bool seen_slash = false;
for(unsigned i = begin_token; i < end_token; i++) {
const auto &token = (*tr)[i];
logTrace(g_conf.m_logTraceTokenIndexing,"Looking at token #%u: '%.*s', hash=%ld, nodeid=%u", i, (int)token.token_len, token.token_start, token.token_hash, token.nodeid);
if(token.token_len==1 && token.token_start[0]=='/')
seen_slash = true;
if ( ! token.is_alfanum ) continue;
// ignore if in repeated fragment
if ( fragVec && i<MAXFRAGWORDS && fragVec[i] == 0 ) continue;
// ignore if in style section
if ( sp && (sp[i]->m_flags & NOINDEXFLAGS) ) continue;
// do not breach wordpos bits
if ( wposvec[i] > MAXWORDPOS ) break;
// BR: 20160114 if digit, do not hash it if disabled
if( is_digit( token.token_start[0] ) && !hi->m_hashNumbers ) {
continue;
}
// . hash the startHash with the wordId for this word
// . we must mask it before adding it to the table because
// this table is also used to hash IndexLists into that come
// from LinkInfo classes (incoming link text). And when
// those IndexLists are hashed they used masked termIds.
// So we should too...
uint64_t h ;
if ( plen > 0 ) h = hash64 ( token.token_hash, prefixHash );
else h = token.token_hash;
int32_t hashGroup = hi->m_hashGroup;
Section *sx = NULL;
if ( sp ) {
sx = sp[i];
// . this is taken care of in hashTitle()
// . it is slightly different if the title is
// multiple sentences because when hashing the
// body the density rank is per sentence, but in
// hashTitle we count all the words in the title
// towards the density rank even if they are
// in different sentences
if ( sx->m_flags & SEC_IN_TITLE ) {
continue;
}
if ( sx->m_flags & SEC_IN_HEADER ) {
hashGroup = HASHGROUP_HEADING;
}
if ( sx->m_flags & ( SEC_MENU | SEC_MENU_SENTENCE | SEC_MENU_HEADER ) ) {
hashGroup = HASHGROUP_INMENU;
}
}
// this is for link text and meta tags mostly
if ( hashIffUnique ) {
// skip if already did it
if ( ut.isInTable ( &h ) ) continue;
if ( ! ut.addKey ( &h ) ) return false;
}
char ws = 15;
if ( wordSpamVec ) ws = wordSpamVec[i];
// HACK:
// if this is inlink text, use the wordspamrank to hold the
// inlinker's site rank!
if ( hashGroup == HASHGROUP_INLINKTEXT )
ws = hi->m_linkerSiteRank;
// default to the document's primary language if it is not
// clear what language this word belongs to.
// if the word is only in german it should be german,
// otherwise it will be the document's primary language.
		//note: the above comment is wrong. The language is overwritten by addTable144(). It is unclear if this is a good thing.
char langId = langUnknown;
if ( m_wts && langVec ) langId = langVec[i];
char wd;
if ( hi->m_useCountTable ) {
wd = wdv[i];
} else {
wd = MAXDIVERSITYRANK;
}
bool skipword = false;
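		// Worked example (hypothetical input): for a URL tokenized as
		// "www", "example", "dk", "/", "foo", "html", the tokens before the
		// first '/' are treated as the scheme/host/domain part: an "http" or
		// "https" token there is dropped entirely, and "dk" (a TLD) is only
		// indexed via its bigram ("example dk"). "foo" and "html" come after
		// the '/' and are currently indexed normally.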
if(hi->m_filterUrlIndexableWords) {
if(!seen_slash) {
//Scheme/host/domain part of URL
//the http/https prefix is not indexed at all
if((token.token_len==4 && memcmp(token.token_start,"http",4)==0) ||
(token.token_len==5 && memcmp(token.token_start,"https",5)==0))
{
// Never include as single word or in bigrams
continue; //skip to next word
}
//the terms .com .co .dk etc have lots of hits and give very little value for indexing. We only index the bigrams.
if(isTLD(token.token_start, token.token_len)) {
					skipword = true; //skip the word but index the bigram
}
} else {
				//Path part of the URL
				//potentially filter out "html", "aspx", "index", "cgi" etc.
}
}
if(!skipword) {
logTrace(g_conf.m_logTraceTokenIndexing,"Indexing '%.*s', h=%ld, termid=%lld", (int)token.token_len, token.token_start, h, h&TERMID_MASK);
key144_t k;
Posdb::makeKey(&k,
h,
0LL,//docid
wposvec[i], // dist,
densvec[i],// densityRank , // 0-15
wd, // diversityRank 0-15
ws, // wordSpamRank 0-15
0, // siterank
hashGroup,
				// we set this to docLang in the final hash loop
langUnknown, // langid
0, // multiplier
false, // syn?
false, // delkey?
hi->m_shardByTermId);
// key should NEVER collide since we are always incrementing
// the distance cursor, m_dist
dt->addTerm144(&k);
// add to wts for PageParser.cpp display
if(wts) {
if(!storeTerm(token.token_start,token.token_len,h,hi,i,
wposvec[i], // wordPos
densvec[i],// densityRank , // 0-15
wd,//v[i],
ws,
hashGroup,
wbuf,
wts,
SOURCE_NONE, // synsrc
langId,
k))
return false;
}
if(token.is_alfanum)
candidate_lemma_words.emplace(token.token_start,token.token_len);
} else {
logTrace(g_conf.m_logTraceTokenIndexing,"not indexing '%.*s', h=%ld", (int)token.token_len, token.token_start, h);
}
////////
//
// two-word phrase
//
////////
		//Find the next alfanum token that starts at or after token.end_pos.
		//Also detect, while scanning, a token that can neither be in a phrase nor be paired across; in that case no bigram is generated.
unsigned j;
bool generate_bigram = true;
for(j=i+1; j<end_token; j++) {
const auto &t2 = (*tr)[j];
if(t2.is_alfanum && t2.start_pos>=token.end_pos)
break;
if(!bits->canBeInPhrase(j) && !bits->canPairAcross(j)) {
generate_bigram = false;
break;
}
}
if(j>=end_token)
generate_bigram = false;
if(generate_bigram) {
unsigned first_match_start_pos = (*tr)[j].start_pos;
for( ; j<end_token && (*tr)[j].start_pos == first_match_start_pos; j++) {
const auto &token2 = (*tr)[j];
if(!token2.is_alfanum)
continue; //ampersand-rewrites in tokenizer2.cpp can result in non-alfanum tokens that must be ignored and skipped
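				// The bigram term id continues the hash of the first token over the
				// second token's (lowercased) bytes, i.e. presumably equivalent to
				// hashing the two tokens as if they were concatenated.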
int32_t pos = token.token_len;
				int64_t npid = hash64Lower_utf8_cont(token2.token_start, token2.token_len, token.token_hash, &pos);
				// hash with prefix
				uint64_t ph2;
				if ( plen > 0 ) ph2 = hash64 ( npid , prefixHash );
				else ph2 = npid;
				logTrace(g_conf.m_logTraceTokenIndexing,"Indexing two-word phrase '%.*s'+'%.*s' with h=%ld, termid=%lld", (int)token.token_len, token.token_start, (int)token2.token_len, token2.token_start, ph2, ph2&TERMID_MASK);
key144_t k;
Posdb::makeKey ( &k ,
ph2 ,
0LL,//docid
wposvec[i],//dist,
densvec[i],// densityRank , // 0-15
MAXDIVERSITYRANK, //phrase
ws, // wordSpamRank ,
0,//siterank
hashGroup,
					// we set this to docLang in the final hash loop
langUnknown, // langid
0 , // multiplier
false, // syn?
false , // delkey?
hi->m_shardByTermId );
// key should NEVER collide since we are always
// incrementing the distance cursor, m_dist
dt->addTerm144 ( &k );
// add to wts for PageParser.cpp display
if(wts) {
// get phrase as a string
size_t plen;
char phraseBuffer[256];
					//TODO: Collect the intermediate tokens too. It is complicated because the two tokens generating the bigram can be either primary or secondary tokens from the tokenizer, as can the non-alfanum tokens between them.
//simplification: just grab the chars from token+token2
if(token.token_len<=sizeof(phraseBuffer)) {
memcpy(phraseBuffer, token.token_start, token.token_len);
plen = token.token_len;
} else {
memcpy(phraseBuffer, token.token_start, sizeof(phraseBuffer));
plen = sizeof(phraseBuffer);
}
if(token2.token_len<=sizeof(phraseBuffer)-plen) {
memcpy(phraseBuffer+plen, token2.token_start, token2.token_len);
plen += token2.token_len;
} else {
memcpy(phraseBuffer+plen, token2.token_start, sizeof(phraseBuffer)-plen);
plen = sizeof(phraseBuffer);
}
// store it
if(!storeTerm(phraseBuffer,plen,ph2,hi,i,
wposvec[i], // wordPos
densvec[i],// densityRank , // 0-15
MAXDIVERSITYRANK,//phrase
ws,
hashGroup,
//true,
wbuf,
wts,
SOURCE_BIGRAM, // synsrc
langId,
k))
return false;
}
}
} else {
logTrace(g_conf.m_logTraceTokenIndexing,"NOT indexing two-word phrase(s)");
}
}
	// advance the distance cursor so word positions never overlap between calls, e.g. hashTitle() and hashBody()
if ( nw > 0 ) m_dist = wposvec[nw-1] + 100;
if(m_langId==langDanish && lemma_lexicon) {
//we only have a lexicon for Danish so far for this test
logTrace(g_conf.m_logTraceTokenIndexing,"candidate_lemma_words.size()=%zu", candidate_lemma_words.size());
for(auto e : candidate_lemma_words) {
//find the word in the lexicon. find the lemma. If the word is unknown or already in its base form then don't generate a lemma entry
logTrace(g_conf.m_logTraceTokenIndexing,"candidate word for lemma: %s", e.c_str());
auto le = lemma_lexicon->lookup(e);
if(!le) {
//Not found as-is in lexicon. Try lowercase in case it is a capitalized word
char lowercase_word[128];
if(e.size()<sizeof(lowercase_word)) {
size_t sz = to_lower_utf8(lowercase_word,lowercase_word+sizeof(lowercase_word), e.data(), e.data()+e.size());
lowercase_word[sz] = '\0';
if(sz!=e.size() || memcmp(e.data(),lowercase_word,e.size())!=0) {
e = lowercase_word;
le = lemma_lexicon->lookup(e);
}
}
}
if(!le) {
				//Still not found. Try capitalized in case it is a lowercase or uppercase word
char capitalized_word[128];
if(e.size()<sizeof(capitalized_word)) {
size_t sz = to_capitalized_utf8(capitalized_word,capitalized_word+sizeof(capitalized_word), e.data(), e.data()+e.size());
capitalized_word[sz] = '\0';
if(sz!=e.size() || memcmp(e.data(),capitalized_word,e.size())!=0) {
e = capitalized_word;
le = lemma_lexicon->lookup(e);
}
}
}
if(!le) {
				//Still not found. Try uppercasing it
char uppercase_word[128];
if(e.size()<sizeof(uppercase_word)) {
size_t sz = to_upper_utf8(uppercase_word,uppercase_word+sizeof(uppercase_word), e.data(), e.data()+e.size());
uppercase_word[sz] = '\0';
if(sz!=e.size() || memcmp(e.data(),uppercase_word,e.size())!=0) {
e = uppercase_word;
le = lemma_lexicon->lookup(e);
}
}
}
if(!le)
continue; //unknown word
logTrace(g_conf.m_logTraceTokenIndexing,"lexicalentry found for for lemma: %s", e.c_str());
auto wf = le->find_base_wordform();
if(!wf)
continue; //no base form
if(wf->written_form_length==e.size() && memcmp(wf->written_form,e.data(),e.size())==0)
continue; //already in base form
logTrace(g_conf.m_logTraceTokenIndexing,"baseform is different than source: '%.*s'", (int)wf->written_form_length, wf->written_form);
lemma_words.emplace(wf->written_form,wf->written_form_length);
}
}
return true;
}