//-*- coding: utf-8 -*-
#include "gb-include.h"
#include "XmlDoc.h"
#include "CountryCode.h" // g_countryCode
#include "Collectiondb.h"
#include "Speller.h"
#include "Synonyms.h"
#include "Process.h"
#include "ip.h"
#include "Posdb.h"
#include "Conf.h"
#include "UrlBlockCheck.h"
#include "Domains.h"
#ifdef _VALGRIND_
#include <valgrind/memcheck.h>
#endif
// a ptr to HashInfo is passed to hashString() and hashWords()
class HashInfo {
public:
HashInfo() {
m_tt = NULL;
m_prefix = NULL;
m_desc = NULL;
m_date = 0;
// should we do sharding based on termid and not the usual docid???
// in general this is false, but for checksum we want to shard
// by the checksum and not docid to avoid having to do a
// gbchecksum:xxxxx search on ALL shards. much more efficient.
m_shardByTermId = false;
m_hashGroup = -1;
m_useCountTable = true;
m_useSections = true;
m_startDist = 0;
// BR 20160108: Now default to false since we will only use it for
// very specific cases like spiderdate, which is for debugging only.
// If true, creates 4 posdb entries for numbers in posdb, e.g.
// gbsortbyint:gbisadultint32, gbrevsortbyint:gbisadultint32
// gbsortby:gbisadultfloat32, gbrevsortby:gbisadultfloat32
m_createSortByForNumbers = false;
m_hashNumbers = true;
m_filterUrlIndexableWords = false;
m_linkerSiteRank = 0;
}
class HashTableX *m_tt;
const char *m_prefix;
// "m_desc" should detail the algorithm
const char *m_desc;
int32_t m_date;
bool m_shardByTermId;
char m_linkerSiteRank;
char m_hashGroup;
int32_t m_startDist;
bool m_useCountTable;
bool m_useSections;
bool m_createSortByForNumbers;
bool m_hashNumbers;
bool m_filterUrlIndexableWords; //Do special filtering on words in the url, e.g. exclude "com" before the path
};
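// A minimal usage sketch (illustration only, kept out of the build with
// "#if 0"); it mirrors the pattern the hash*() methods below use, e.g.
// XmlDoc::hashLanguage(). "tt" stands in for the caller's posdb term table
// and "s"/"slen" for the text being indexed.
#if 0
HashInfo hi;                       // constructor fills in the defaults above
hi.m_tt        = tt;               // destination table of 18-byte posdb keys
hi.m_hashGroup = HASHGROUP_INTAG;  // where in the document this text lives
hi.m_prefix    = "gblang";         // optional field prefix, e.g. gblang:<text>
if ( ! hashString ( s , slen , &hi ) ) return false;
#endif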
static bool storeTerm ( const char *s ,
int32_t slen ,
int64_t termId ,
HashInfo *hi ,
int32_t wordNum ,
int32_t wordPos ,
char densityRank,
char diversityRank ,
char wordSpamRank ,
char hashGroup,
//bool isPhrase ,
SafeBuf *wbuf ,
HashTableX *wts ,
char synSrc ,
char langId ,
posdbkey_t key ) {
// store prefix
int32_t poff = wbuf->length();
// shortcut
const char *p = hi->m_prefix;
// add the prefix too!
if ( p && ! wbuf->safeMemcpy(p,strlen(p)+1)) return false;
// none?
if ( ! p ) poff = -1;
// store description
int32_t doff = wbuf->length();
// shortcut
const char *d = hi->m_desc;
// add the desc too!
if ( d && ! wbuf->safeMemcpy(d,strlen(d)+1) ) return false;
// none?
if ( ! d ) doff = -1;
// store term
int32_t toff = wbuf->length();
// add it
if ( ! wbuf->safeMemcpy ( s , slen ) ) return false;
// make this
TermDebugInfo ti;
ti.m_termOff = toff;
ti.m_termLen = slen;
ti.m_descOff = doff;
ti.m_prefixOff = poff;
ti.m_date = hi->m_date;
ti.m_shardByTermId = hi->m_shardByTermId;
ti.m_termId = termId;
//ti.m_weight = 1.0;
//ti.m_spam = -1.0;
ti.m_diversityRank = diversityRank;
ti.m_densityRank = densityRank;
ti.m_wordSpamRank = wordSpamRank;
ti.m_hashGroup = hashGroup;
ti.m_wordNum = wordNum;
ti.m_wordPos = wordPos;
ti.m_langId = langId;
ti.m_key = key;
// save for printing out an asterisk
ti.m_synSrc = synSrc; // isSynonym = isSynonym;
// get language bit vec
ti.m_langBitVec64 = g_speller.getLangBits64(termId);
// make the key
key96_t k;
k.n1 = 0; // date
k.n0 = termId;
// store it
return wts->addKey ( &k , &ti ) ;
}
//
// . hash terms that are sharded by TERMID not DOCID!!
//
// . returns false and sets g_errno on error
// . these terms are stored in posdb, but all terms with the same
// termId reside in one and only one group. whereas normally the records
// are split based on docid and every group gets 1/nth of the termlist.
// . we do this "no splitting" so that only one disk seek is required, and
// we know the termlist is small, or the termlist is being used for spidering
// or parsing purposes and is usually not sent across the network.
bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
// constructor should set to defaults automatically
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
// usually we shard by docid, but these are terms we shard by termid!
hi.m_shardByTermId = true;
if ((size_utf8Content - 1) > 0) {
// for exact content deduping
setStatus("hashing gbcontenthash (deduping) no-split keys");
// this should be ready to go and not block!
int64_t *pch64 = getExactContentHash64();
if (!pch64 || pch64 == (void *)-1) { g_process.shutdownAbort(true); }
char cbuf[64];
int32_t clen = sprintf(cbuf, "%" PRIu64, (uint64_t)*pch64);
hi.m_prefix = "gbcontenthash";
if (!hashString(cbuf, clen, &hi)) return false;
}
// now hash the site
setStatus ( "hashing no-split SiteGetter terms");
Url *fu = getFirstUrl();
char *host = fu->getHost ();
//
// HASH terms for SiteGetter.cpp
//
// these are now no-split terms
//
char *s = fu->getUrl ();
int32_t slen = fu->getUrlLen();
// . this termId is used by SiteGetter.cpp for determining subsites
// . matches what is in SiteGet::getSiteList()
// for www.xyz.com/a/ HASH www.xyz.com
// for www.xyz.com/a/b/ HASH www.xyz.com/a/
// for www.xyz.com/a/b/c/ HASH www.xyz.com/a/b/
bool add = true;
// we only hash this for urls that end in '/'
if ( s[slen-1] != '/' ) add = false;
// and no cgi
if ( fu->isCgi() ) add = false;
// skip if root
if ( fu->getPathLen() <= 1 ) add = false;
// sanity check
if ( ! m_linksValid ) { g_process.shutdownAbort(true); }
// . skip if we have no subdirectory outlinks
// . that way we do not confuse all the pages in dictionary.com or
// wikipedia.org as subsites!!
if ( ! m_links.hasSubdirOutlink() ) add = false;
// hash it
if ( add ) {
// remove the last path component
char *end2 = s + slen - 2;
// back up over last component
for ( ; end2 > fu->getPath() && *end2 != '/' ; end2-- ) ;
// hash that part of the url
hi.m_prefix = "siteterm";
if ( ! hashSingleTerm ( host,end2-host,&hi) ) return false;
}
return true;
}
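// Illustration of the siteterm above: a page like http://www.xyz.com/a/b/
// (url ends in '/', is not cgi, is not the root, and the page has at least
// one subdirectory outlink) gets a "siteterm" for www.xyz.com/a/ per the
// mapping in the comment above; pages failing any of those checks are not
// hashed here.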
// . returns (char *)-1 if blocked, returns NULL and sets g_errno on error
// . hashes all indexable terms for this document into "table"
// . if m_wts is set, the terms are also stored for PageParser.cpp display
char *XmlDoc::hashAll(HashTableX *table) {
logTrace(g_conf.m_logTraceXmlDoc, "BEGIN");
setStatus("hashing document");
if (m_allHashed) {
return (char *)1;
}
// sanity checks
if (table->getKeySize() != 18 || table->getDataSize() != 4) {
g_process.shutdownAbort(true);
}
// ptr to term = 4 + score = 4 + ptr to sec = 4
if (m_wts && (m_wts->getKeySize() != 12 || m_wts->getDataSize() != sizeof(TermDebugInfo))) {
g_process.shutdownAbort(true);
}
uint8_t *ct = getContentType();
if (!ct) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getContentType failed");
return NULL;
}
// BR 20160127: Never index JSON and XML content
if (*ct == CT_JSON || *ct == CT_XML) {
// For XML (JSON should not get here as it should be filtered out during spidering)
// store the URL as the only thing in posdb so we are able to find it, and
// eventually ban it.
if (!hashUrl(table, true)) { // urlOnly (skip IP and term generation)
logTrace(g_conf.m_logTraceXmlDoc, "END, hashUrl failed");
return NULL;
}
m_allHashed = true;
return (char *)1;
}
unsigned char *hc = (unsigned char *)getHopCount();
if (!hc || hc == (void *)-1) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getHopCount returned -1");
return (char *)hc;
}
// need this for hashing
HashTableX *cnt = getCountTable();
if (!cnt) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getCountTable failed");
return (char *)cnt;
}
if (cnt == (void *)-1) {
g_process.shutdownAbort(true);
}
// and this
Links *links = getLinks();
if (!links) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getLinks failed");
return (char *)links;
}
if (links == (Links *)-1) {
g_process.shutdownAbort(true);
}
char *wordSpamVec = getWordSpamVec();
if (!wordSpamVec) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getWordSpamVec failed");
return wordSpamVec;
}
if (wordSpamVec == (void *)-1) {
g_process.shutdownAbort(true);
}
char *fragVec = getFragVec();
if (!fragVec) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getFragVec failed");
return fragVec;
}
if (fragVec == (void *)-1) {
g_process.shutdownAbort(true);
}
// why do we need this?
if ( m_wts ) {
uint8_t *lv = getLangVector();
if (!lv) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getLangVector failed");
return (char *)lv;
}
if (lv == (void *)-1) {
g_process.shutdownAbort(true);
}
}
CollectionRec *cr = getCollRec();
if ( ! cr ) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getCollRec failed");
return NULL;
}
// do not repeat this if the cachedb storage call blocks
m_allHashed = true;
// reset distance cursor
m_dist = 0;
if (!hashContentType(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashContentType failed");
return NULL;
}
if (!hashUrl(table, false)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashUrl failed");
return NULL;
}
if (!hashLanguage(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashLanguage failed");
return NULL;
}
if (!hashCountry(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashCountry failed");
return NULL;
}
// now hash the terms sharded by termid and not docid here since they
// just set a special bit in posdb key so Rebalance.cpp can work.
// this will hash the content checksum which we need for deduping
// which we use for diffbot custom crawls as well.
if (!hashNoSplit(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashNoSplit failed");
return NULL;
}
// MDW: i think we just inject empty html with a diffbotreply into the
// global index now, so we don't need this... 9/28/2014
// stop indexing the body of xml docs in the global index unless this is
// a json object, in which case it is hashed above in the call to
// hashJSON(). this will decrease disk usage by about half; posdb*
// files are pretty big.
if (!cr->m_indexBody) {
logTrace(g_conf.m_logTraceXmlDoc, "END, !indexDoc");
return (char *)1;
}
bool *ini = getIsNoIndex();
if (ini == nullptr || ini == (bool*)-1) {
// must not be blocked
gbshutdownLogicError();
}
if (*ini && m_version > 126) {
logTrace(g_conf.m_logTraceXmlDoc, "END, noindex");
return (char *)1;
}
if ((size_utf8Content - 1) <= 0) {
logTrace(g_conf.m_logTraceXmlDoc, "END, contentLen == 0");
return (char *)1;
}
// hash the body of the doc first so m_dist is 0 to match
// the rainbow display of sections
if (!hashBody2(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashBody2 failed");
return NULL;
}
// hash the title now too so neighborhood singles have more
// to match. plus, we only hash these title terms iff they
// are not already in the hash table, so as to avoid hashing
// repeated title terms because we do not do spam detection
// on them. thus, we need to hash these first before anything
// else. give them triple the body score
if (!hashTitle(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashTitle failed");
return NULL;
}
// . hash the keywords tag, limited to first 2k of them so far
// . hash above the neighborhoods so the neighborhoods only index
// what is already in the hash table
if (!hashMetaKeywords(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaKeywords failed");
return NULL;
}
// then hash the incoming link text, NO ANOMALIES, because
// we index the single words in the neighborhoods next, and
// we had songfacts.com coming up for the 'street light facts'
// query because it had a bunch of anomalous inlink text.
if (!hashIncomingLinkText(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashIncomingLinkText failed");
return NULL;
}
// then the meta summary and description tags with half the score of
// the body, and only hash a term if it was not already hashed above
// somewhere.
if (!hashMetaSummary(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaSummary failed");
return NULL;
}
// BR 20160220
// Store value of meta tag "geo.placename" to help aid searches for
// location specific sites, e.g. 'Restaurant in London'
if (!hashMetaGeoPlacename(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaGeoPlacename failed");
return NULL;
}
// this will only increment the scores of terms already in the table
// because the neighborhoods are not technically part of the document
// itself and we do not want to ruin our precision
if (!hashNeighborhoods(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashNeighborhoods failed");
return NULL;
}
if (!hashLinks(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashLinks failed");
return NULL;
}
if (!hashDateNumbers(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashDateNumbers failed");
return NULL;
}
if (!hashMetaTags(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaTags failed");
return NULL;
}
// hash gblang:de last for parsing consistency
if (!hashLanguageString(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashLanguageString failed");
return NULL;
}
logTrace(g_conf.m_logTraceXmlDoc, "END, OK");
return (char *)1;
}
// returns false and sets g_errno on error
bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
setStatus ( "hashing meta tags" );
// assume it's empty
char buf [ 32*1024 ];
int32_t bufLen = 32*1024 - 1;
buf[0] = '\0';
int32_t n = m_xml.getNumNodes();
XmlNode *nodes = m_xml.getNodes();
// set up the hashing parms
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INMETATAG;
hi.m_tt = tt;
hi.m_desc = "custom meta tag";
// find the first meta summary node
for ( int32_t i = 0 ; i < n ; i++ ) {
// continue if not a meta tag
if ( nodes[i].m_nodeId != TAG_META ) continue;
// only get content for <meta name=..> not <meta http-equiv=..>
int32_t tagLen;
char *tag = m_xml.getString ( i , "name" , &tagLen );
char tagLower[128];
int32_t j ;
int32_t code;
// skip if empty
if ( ! tag || tagLen <= 0 ) continue;
// make tag name lower case and do not allow bad chars
if ( tagLen > 126 ) tagLen = 126 ;
to_lower3_a ( tag , tagLen , tagLower );
for ( j = 0 ; j < tagLen ; j++ ) {
// bail if has unacceptable chars
if ( ! is_alnum_a ( tag[j] ) &&
tag[j] != '-' &&
tag[j] != '_' &&
tag[j] != '.' ) break;
// convert to lower
tagLower[j] = to_lower_a ( tag[j] );
}
// skip this meta if had unacceptable chars
if ( j < tagLen ) continue;
// is it recognized?
code = getFieldCode ( tag , tagLen );
// . do not allow reserved tag names
// . title,url,suburl,
if ( code != FIELD_GENERIC ) continue;
// this is now reserved
// do not hash keyword, keywords, description, or summary metas
// because that is done in hashRange() below based on the
// tagdb (ruleset) record
if ((tagLen== 7&&strncasecmp(tag,"keyword" , 7)== 0)||
(tagLen== 7&&strncasecmp(tag,"summary" , 7)== 0)||
(tagLen== 8&&strncasecmp(tag,"keywords" , 8)== 0)||
(tagLen==11&&strncasecmp(tag,"description",11)== 0) )
continue;
// BR 20160107: Only hash certain custom meta tags and ignore the rest
if(
(strncasecmp(tag,"subject", 7) != 0) &&
(strncasecmp(tag,"abstract", 8) != 0) &&
(strncasecmp(tag,"news_keywords", 13) != 0) && // http://www.metatags.org/meta_name_news_keywords
(strncasecmp(tag,"author", 6) != 0) &&
(strncasecmp(tag,"title", 5) != 0) &&
(strncasecmp(tag,"og:title", 8) != 0) &&
(strncasecmp(tag,"og:description", 14) != 0) &&
(strncasecmp(tag,"twitter:title", 13) != 0) &&
(strncasecmp(tag,"twitter:description", 19) != 0) )
{
// If none of the above, it is an unwanted meta tag
continue;
}
// get the content
int32_t len;
char *s = m_xml.getString ( i , "content" , &len );
if ( ! s || len <= 0 ) continue;
// . ensure not too big for our buffer (keep room for a \0)
// . TODO: this is wrong, should be len+1 > bufLen,
// but can't fix w/o resetting the index (COME BACK HERE
// and see where we index meta tags besides this place!!!)
// remove those other places, except... what about keywords
// and description?
if ( len+1 >= bufLen ) {
// assume no punct to break on!
len = 0;
// only cut off at punctuation, scanning at most the part of the
// content that fits in our buffer (len >= bufLen-1 here, so the
// limit below stays inside the content)
char *p = s;
char *pend = s + bufLen - 1;
char *last = NULL;
int32_t size ;
for ( ; p < pend ; p += size ) {
// skip if utf8 char
size = getUtf8CharSize(*p);
// skip if 2+ bytes
if ( size > 1 ) continue;
// skip if not punct
if ( is_alnum_a(*p) ) continue;
// mark it
last = p;
}
if ( last ) len = last - s;
// this old way was faster...:
//while ( len > 0 && is_alnum(s[len-1]) ) len--;
}
// convert html entities to their chars
len = saftenTags ( buf , bufLen , s , len );
// NULL terminate the buffer
buf[len] = '\0';
// Now index the wanted meta tags as normal text without prefix so they
// are used in user searches automatically.
hi.m_prefix = NULL;
// desc is NULL, prefix will be used as desc
bool status = hashString ( buf,len,&hi );
// bail on error, g_errno should be set
if ( ! status ) return false;
// return false with g_errno set on error
//if ( ! hashNumberForSorting ( buf , bufLen , &hi ) )
// return false;
}
return true;
}
// . hash dates for sorting by using gbsortby: and gbrevsortby:
// . do 'gbsortby:gbspiderdate' as your query to see this in action
bool XmlDoc::hashDateNumbers ( HashTableX *tt ) { // , bool isStatusDoc ) {
// bail if the spidered time has not been set yet
if ( ! m_spideredTimeValid ) return true;
int32_t indexedTime = getIndexedTime();
// first the last spidered date
HashInfo hi;
hi.m_hashGroup = 0;// this doesn't matter, it's a numeric field
hi.m_tt = tt;
hi.m_desc = "last spidered date";
hi.m_prefix = "gbspiderdate";
hi.m_createSortByForNumbers = true;
char buf[64];
int32_t bufLen = sprintf ( buf , "%" PRIu32, (uint32_t)m_spideredTime );
if ( ! hashNumberForSorting( buf , buf , bufLen , &hi ) )
return false;
// and index time is >= spider time, so you want to sort by that for
// the widget for instance
hi.m_desc = "last indexed date";
hi.m_prefix = "gbindexdate";
bufLen = sprintf ( buf , "%" PRIu32, (uint32_t)indexedTime );
if ( ! hashNumberForSorting ( buf , buf , bufLen , &hi ) )
return false;
// all done
return true;
}
// returns false and sets g_errno on error
bool XmlDoc::hashContentType ( HashTableX *tt ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return false;
uint8_t *ctype = getContentType();
if( !ctype ) {
return false;
}
char *s = NULL;
setStatus ( "hashing content type" );
// hash numerically so we can do gbfacetint:type on it
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
hi.m_prefix = "type";
char tmp[6];
sprintf(tmp,"%" PRIu32,(uint32_t)*ctype);
if ( ! hashString (tmp,strlen(tmp),&hi ) ) return false;
// these ctypes are defined in HttpMime.h
switch (*ctype) {
case CT_HTML: s = "html"; break;
case CT_TEXT: s = "text"; break;
case CT_XML : s = "xml" ; break;
case CT_PDF : s = "pdf" ; break;
case CT_DOC : s = "doc" ; break;
case CT_XLS : s = "xls" ; break;
case CT_PPT : s = "ppt" ; break;
case CT_PS : s = "ps" ; break;
// for diffbot. so we can limit search to json objects
// in Diffbot.cpp
case CT_JSON: s = "json" ; break;
}
// bail if unrecognized content type
if ( ! s ) return true;
// . now hash it
// . use a score of 1 for all
// . TODO: ensure doc counting works ok with this when it does
// its interpolation
return hashString (s,strlen(s),&hi );
}
// . hash the link: terms
// . ensure that more useful linkers are scored higher
// . useful for computing offsite link text for qdb-ish algorithm
// . NOTE: for now i do not hash links to the same domain in order to
// hopefully save 10%-25% index space
// . NOTE: PLUS, they may clog up the link-adjusted quality ratings since
// different site links with no link text will be ranked behind them
// . the 8-bit bitmap of the score of a link: term:
// . 00ubdcss u = link is Unbanned? b = link isBanned?
// d = link dirty? c = link clean?
// s = 01 if no link text, 10 if link text
// . NOTE: this is used in Msg18.cpp for extraction
// . CAUTION: IndexList::score32to8() will warp our score if it's >= 128
// so i moved the bits down
bool XmlDoc::hashLinks ( HashTableX *tt ) {
setStatus ( "hashing links" );
// shortcuts
bool isRSSFeed = *getIsRSS();
char dbuf[8*4*1024];
HashTableX dedup;
dedup.set( 8,0,1024,dbuf,8*4*1024,false,"hldt");
CollectionRec *cr = getCollRec();
if ( ! cr ) {
logTrace( g_conf.m_logTraceXmlDoc, "END, getCollRec failed" );
return false;
}
// see ../url/Url2.cpp for hashAsLink() algorithm
for ( int32_t i = 0 ; i < m_links.m_numLinks ; i++ ) {
// skip links with zero length
if ( m_links.m_linkLens[i] == 0 ) {
continue;
}
// . skip if we are rss page and this link is an <a href> link
// . we only harvest/index <link> urls from rss feeds
// . or in the case of feedburner, those orig tags
if ( isRSSFeed && (m_links.m_linkFlags[i] & LF_AHREFTAG) ) {
continue;
}
// if we have a <feedburner:origLink> tag, then ignore <link>
// tags and only get the links from the original links
if ( m_links.m_isFeedBurner && !(m_links.m_linkFlags[i] & LF_FBTAG) ) {
continue;
}
// normalize the link
Url link;
// now we always add "www" to these links so that any link
// to cnn.com is same as link to www.cnn.com, because either
// we index cnn.com or www.cnn.com but not both providing
// their content is identical (deduping). This way whichever
// one we index, we can take advantage of all link text whether
// it's to cnn.com or www.cnn.com.
// Every now and then we add new session ids to our list in
// Url.cpp, too, so we have to version that.
// Since this is just for hashing, it shouldn't matter that
// www.tmblr.co has no IP whereas only tmblr.co does.
link.set( m_links.m_linkPtrs[i], m_links.m_linkLens[i], true, m_links.m_stripParams, m_version );
// BR 20160105: Do not create "link:" hashes for media URLs etc.
if( link.hasNonIndexableExtension(TITLEREC_CURRENT_VERSION) || // @todo BR: For now ignore actual TitleDB version. // m_version) ||
link.hasScriptExtension() ||
link.hasJsonExtension() ||
link.hasXmlExtension() ||
isUrlBlocked(link)) {
logTrace( g_conf.m_logTraceXmlDoc, "Unwanted for indexing [%s]", link.getUrl());
continue;
}
// dedup this crap
int64_t h = hash64 ( link.getUrl(), link.getUrlLen() );
if ( dedup.isInTable ( &h ) ) continue;
if ( ! dedup.addKey ( &h ) ) return false;
// set up the hashing parms
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
hi.m_prefix = "link";
// hash link:<url>
if ( ! hashSingleTerm ( link.getUrl(),link.getUrlLen(),&hi )) {
return false;
}
h = hash64 ( link.getHost() , link.getHostLen() );
if ( dedup.isInTable ( &h ) ) continue;
if ( ! dedup.addKey ( &h ) ) return false;
// fix parm
hi.m_prefix = "sitelink";
// hash sitelink:<urlHost>
if ( ! hashSingleTerm ( link.getHost(),link.getHostLen(),&hi)) {
return false;
}
}
return true;
}
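// Worked example (illustration only): an outlink to http://cnn.com/world/x.html
// is normalized with a "www." prefix and, if it survives the extension and
// url-block checks above, produces two single terms:
//   link:http://www.cnn.com/world/x.html
//   sitelink:www.cnn.com
// duplicate urls and hosts are deduped within this document via the local
// "dedup" table.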
// . returns false and sets g_errno on error
// . hash for linkdb
bool XmlDoc::hashLinksForLinkdb ( HashTableX *dt ) {
// sanity check
if ( dt->getKeySize() != sizeof(key224_t) ) { g_process.shutdownAbort(true); }
if ( dt->getDataSize() != 0 ) { g_process.shutdownAbort(true); }
// this will be different with our new site definitions
uint32_t linkerSiteHash32 = *getSiteHash32();
char siteRank = getSiteRank();
if ( ! m_linksValid ) { g_process.shutdownAbort(true); }
int32_t *linkSiteHashes = getLinkSiteHashes();
if ( ! linkSiteHashes || linkSiteHashes == (void *)-1 ) {
g_process.shutdownAbort(true);
}
// use spidered time! might not be current time! like if rebuilding
// or injecting from a past spider time
int32_t discoveryDate = getSpideredTime();
// add in new links
for ( int32_t i = 0 ; i < m_links.m_numLinks ; i++ ) {
// skip if empty
if (m_links.m_linkLens[i] == 0) {
continue;
}
// . skip if spam, ALWAYS allow internal outlinks though!!
// . CAUTION: now we must version islinkspam()
bool spam = m_links.isLinkSpam(i);
// get site of outlink from tagrec if in there
int32_t linkeeSiteHash32 = linkSiteHashes[i];
//
// when setting the links class it should set the site hash
//
#ifdef _VALGRIND_
VALGRIND_CHECK_MEM_IS_DEFINED(&linkeeSiteHash32,sizeof(linkeeSiteHash32));
uint64_t tmp1 = m_links.getLinkHash64(i);
VALGRIND_CHECK_MEM_IS_DEFINED(&tmp1,sizeof(tmp1));
VALGRIND_CHECK_MEM_IS_DEFINED(&spam,sizeof(spam));
VALGRIND_CHECK_MEM_IS_DEFINED(&siteRank,sizeof(siteRank));
// uint32_t tmp2 = *getIp();
// VALGRIND_CHECK_MEM_IS_DEFINED(&tmp2,sizeof(tmp2));
uint64_t tmp3 = *getDocId();
VALGRIND_CHECK_MEM_IS_DEFINED(&tmp3,sizeof(tmp3));
VALGRIND_CHECK_MEM_IS_DEFINED(&discoveryDate,sizeof(discoveryDate));
VALGRIND_CHECK_MEM_IS_DEFINED(&linkerSiteHash32,sizeof(linkerSiteHash32));
#endif
int32_t *ipptr = getIp();
int32_t ip = ipptr ? *ipptr : 0;
// set this key, it is the entire record
key224_t k = Linkdb::makeKey_uk ( linkeeSiteHash32 ,
m_links.getLinkHash64(i) ,
spam , // link spam?
siteRank , // was quality
ip,
*getDocId() ,
discoveryDate ,
0 ,
false , // new add?
linkerSiteHash32 ,
false );// delete?
#ifdef _VALGRIND_
VALGRIND_CHECK_MEM_IS_DEFINED(&k,sizeof(k));
#endif
// store in hash table
if (!dt->addKey(&k, NULL)) {
return false;
}
}
return true;
}
// . returns false and sets g_errno on error
// . copied Url2.cpp into here basically, so we can now dump Url2.cpp
bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc ) {
setStatus ( "hashing url colon" );
// get the first url
Url *fu = getFirstUrl();
// set up the hashing parms
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
// we do not need diversity bits for this
hi.m_useCountTable = false;
//
// HASH url: term
//
// append a "www." for doing url: searches
Url uw;
uw.set( fu->getUrl(), fu->getUrlLen(), true, false );
hi.m_prefix = "url";
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
return false;
if (urlOnly) {
return true;
}
bool *ini = getIsNoIndex();
if (ini == nullptr || ini == (bool*)-1) {
// must not be blocked
gbshutdownLogicError();
}
char *s = fu->getUrl();
int32_t slen = fu->getUrlLen();
if (!*ini || m_version <= 126) {
setStatus("hashing inurl colon");
//
// HASH inurl: terms
//
hi.m_prefix = "inurl";
// BR 20160114: Skip numbers in urls when doing "inurl:" queries
hi.m_hashNumbers = false;
hi.m_filterUrlIndexableWords = true;
if (!hashString(s, slen, &hi)) return false;
}
{
setStatus("hashing ip colon");
hi.m_hashNumbers = true;
hi.m_filterUrlIndexableWords = false;
//
// HASH ip:a.b.c.d
//
if (!m_ipValid) { g_process.shutdownAbort(true); }
// copy it to save it
char ipbuf[64];
int32_t iplen = strlen(iptoa(m_ip, ipbuf));
hi.m_prefix = "ip";
if (!hashSingleTerm(ipbuf, iplen, &hi)) return false;
// . sanity check
if (!m_siteNumInlinksValid) { g_process.shutdownAbort(true); }
}
//
// HASH the url's mid domain and host as they were in the body
//
setStatus ( "hashing site colon terms");
//
// HASH the site: terms
//
// . hash the pieces of the site
// . http://host.domain.com/~harry/level1/ should hash to:
// . site:host.domain.com/~harry/level1/
// . site:host.domain.com/~harry/
// . site:host.domain.com/~
// . site:host.domain.com/
// . site:domain.com/~harry/level1/
// . site:domain.com/~harry/
// . site:domain.com/~
// . site:domain.com/
// ensure score is positive
//if ( siteScore <= 0 ) siteScore = 1;
// get the hostname (later we set to domain name)
char *name = fu->getHost();
int32_t nameLen = fu->getHostLen();
#ifdef _VALGRIND_
VALGRIND_CHECK_MEM_IS_DEFINED(name,nameLen);
#endif
// . point to the end of the whole thing, including port field
// . add in port, if non default
char *end3 = name + fu->getHostLen() + fu->getPortLen();
// Generate string with port if server runs on non-standard ports
char pbuf[12];
int pbufLen=0;
int32_t port = fu->getPort();
if( port > 0 && port != 80 && port != 443 ) {
pbufLen=snprintf(pbuf, 12, ":%" PRIu32, (uint32_t)fu->getPort());
}
loop:
// now loop through the sub paths of this url's path
int32_t prev_len = -1;
for ( int32_t i = 0 ; ; i++ ) {
// get the subpath
int32_t len = fu->getSubPathLen(i);
if(len==prev_len) //work around bug (?) in Url
continue;
prev_len = len;
// FIX: always include first /
if ( len == 0 ) {
len = 1;
}
// write http://www.whatever.com/path into buf
char buf[MAX_URL_LEN+10];
char *p = buf;
// BR 20160122: Do NOT fix this for https sites. The search is
// always prefixed with http:// (sigh ...)
gbmemcpy ( p , "http://" , 7 ); p += 7;
gbmemcpy ( p , name, nameLen); p += nameLen;
if( pbufLen > 0 )
{
gbmemcpy ( p , pbuf, pbufLen); p += pbufLen;
}
gbmemcpy ( p , fu->getPath() , len ); p += len;
*p = '\0';
// update hash parms
if (m_version <= 126) {
hi.m_prefix = "site";
} else {
hi.m_prefix = *ini ? "sitenoindex" : "site";
}
hi.m_hashGroup = HASHGROUP_INURL;
// this returns false on failure
if ( ! hashSingleTerm (buf,p-buf,&hi ) ) {
return false;
}
// break when we hash the root path
if ( len <=1 ) {
break;
}
}
// now keep moving the period over in the hostname
while ( name < end3 && *name != '.' ) {
name++;
nameLen--;
}
// skip the '.'
name++; nameLen--;
// Check that there is a dot before first slash after domain
// to avoid junk entries like http://com/subpath/pagename.html
bool dom_valid = false;
if( nameLen > 0 )
{
int32_t dom_offset=0;
if( strncmp(name,"http://" ,7)==0 )
{
dom_offset=7;
}
else
if( strncmp(name,"https://",8)==0 )
{
dom_offset=8;
}
const char *dotpos = (const char *)memchr(name,'.',nameLen);
const char *slashpos= (const char *)memchr(name+dom_offset,'/',nameLen-dom_offset);
if( dotpos && (!slashpos || (slashpos > dotpos)) )
{
dom_valid = true;
}
}
if ( name < end3 && dom_valid ) goto loop;
// BR 20160121: Make searching for e.g. site:dk work
setStatus ( "hashing tld for site search");
const char *tld = fu->getTLD();
int32_t tldLen = fu->getTLDLen();
if( tldLen > 0 && tldLen < 64 ) {
char tldBuf[72]; // http:// (7) + tld (63) + / (1) + 0 (1)
char *p = tldBuf;
gbmemcpy ( p , "http://", 7 ); p += 7;
gbmemcpy ( p , tld, tldLen); p += tldLen;
gbmemcpy ( p , "/", 1 ); p += 1;
*p = '\0';
if ( ! hashSingleTerm (tldBuf, p - tldBuf, &hi ) ) {
return false;
}
}
const char *ext = fu->getExtension();
int32_t elen = fu->getExtensionLen();
if (!*ini || m_version <= 126) {
//
// HASH ext: term
//
// i.e. ext:gif ext:html ext:htm ext:pdf, etc.
setStatus("hashing ext colon");
// update hash parms
hi.m_prefix = "ext";
if (!hashSingleTerm(ext, elen, &hi)) return false;
}
{
setStatus("hashing gbdocid");
hi.m_prefix = "gbdocid";
char buf2[32];
sprintf(buf2, "%" PRIu64, (uint64_t)m_docId);
if (!hashSingleTerm(buf2, strlen(buf2), &hi)) return false;
}
setStatus ( "hashing SiteGetter terms");
//
// HASH terms for SiteGetter.cpp
//
// . this termId is used by SiteGetter.cpp for determining subsites
// . matches what is in SiteGet::getSiteList()
// for www.xyz.com/a/ HASH www.xyz.com
// for www.xyz.com/a/b/ HASH www.xyz.com/a/
// for www.xyz.com/a/b/c/ HASH www.xyz.com/a/b/
bool add = true;
// we only hash this for urls that end in '/'
if ( s[slen-1] != '/' ) add = false;
// and no cgi
if ( fu->isCgi() ) add = false;
// skip if root
if ( fu->getPathLen() <= 1 ) add = false;
// sanity check
if ( ! m_linksValid ) { g_process.shutdownAbort(true); }
// . skip if we have no subdirectory outlinks
// . that way we do not confuse all the pages in dictionary.com or
// wikipedia.org as subsites!!
if ( ! m_links.hasSubdirOutlink() ) add = false;
char *host = fu->getHost ();
int32_t hlen = fu->getHostLen ();
// tags from here out
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_shardByTermId = true;
// hash it
if ( add ) {
// remove the last path component
char *end2 = s + slen - 2;
// back up over last component
for ( ; end2 > fu->getPath() && *end2 != '/' ; end2-- ) ;
// hash that part of the url
hi.m_prefix = "siteterm";
if ( ! hashSingleTerm ( host,end2-host,&hi) ) return false;
}
hi.m_shardByTermId = false;
setStatus ( "hashing urlhashdiv10 etc");
//
// HASH the urlhash: term
//
// this is for proving how many docs are in the index
char buf[20];
int32_t blen;
uint32_t h = hash32 ( s , slen );
blen = sprintf(buf,"%" PRIu32,h);
hi.m_prefix = "urlhash";
if ( ! hashString(buf,blen,&hi) ) return false;
// don't index mid domain or url path for noindex document
if (*ini && m_version > 126) {
return true;
}
if (size_utf8Content - 1 > 0 || m_indexCode == EDOCDISALLOWEDROOT) {
setStatus("hashing url mid domain");
// update parms
hi.m_prefix = NULL;
hi.m_desc = "middle domain";
hi.m_hashGroup = HASHGROUP_INURL;
hi.m_filterUrlIndexableWords = true; // Skip com, http etc.
if (!hashString(host, hlen, &hi)) {
return false;
}
hi.m_filterUrlIndexableWords = false;
if (!hashSingleTerm(fu->getDomain(), fu->getDomainLen(), &hi)) {
return false;
}
}
if (size_utf8Content - 1 > 0) {
setStatus("hashing url path");
char *path = fu->getPath();
int32_t plen = fu->getPathLen();
// BR 20160113: Do not hash and combine the page filename extension with the page name (skip e.g. .com)
if (elen > 0) {
elen++; // also skip the dot
}
plen -= elen;
// BR 20160113: Do not hash the most common page names
if (strncmp(path, "/index", plen) != 0) {
// hash the path
// BR 20160114: Exclude numbers in paths (usually dates)
hi.m_hashNumbers = false;
if (!hashString(path, plen, &hi)) return false;
}
}
return true;
}
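// Rough summary of what hashUrl() above produces for a url like
// http://host.domain.com/~harry/level1/index.html (illustration only):
//   url:<the url with "www." added>             exact-url lookup term
//   inurl:<words of the url>                    numbers and scheme/tld words filtered
//   ip:a.b.c.d
//   site:<the host/domain sub-path ladder listed above>
//   site:http://<tld>/                          so "site:dk" style searches work
//   ext:html   gbdocid:<docid>   urlhash:<hash32 of the url>
//   siteterm:<subsite>                          only for the SiteGetter cases above
//   plus the middle-domain and url-path words as plain un-prefixed terms
// which of these are actually hashed depends on the noindex flag and
// title-rec version checks above.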
// . returns false and sets g_errno on error
bool XmlDoc::hashIncomingLinkText(HashTableX *tt) {
setStatus ( "hashing link text" );
// sanity
if ( ! m_linkInfo1Valid ) { g_process.shutdownAbort(true); }
// . finally hash in the linkText terms from the LinkInfo
// . the LinkInfo class has all the terms of hashed anchor text for us
// . if we're using an old TitleRec linkTermList is just a ptr to
// somewhere in TitleRec
// . otherwise, we generated it from merging a bunch of LinkInfos
// and storing them in this new TitleRec
LinkInfo *linkInfo = getLinkInfo1();
// sanity checks
if ( ! m_ipValid ) { g_process.shutdownAbort(true); }
if ( ! m_siteNumInlinksValid ) { g_process.shutdownAbort(true); }
//
// brought the following code in from LinkInfo.cpp
//
// count "external" inlinkers
int32_t ecount = 0;
// update hash parms
HashInfo hi;
hi.m_tt = tt;
// hashstring should update this like a cursor.
hi.m_startDist = 0;
// loop through the link texts and hash them
for ( Inlink *k = NULL; linkInfo && (k = linkInfo->getNextInlink(k)) ; ) {
// is this inlinker internal?
bool internal=((m_ip&0x0000ffff)==(k->m_ip&0x0000ffff));
// count external inlinks we have for indexing gbmininlinks:
if ( ! internal ) ecount++;
// get length of link text
int32_t tlen = k->size_linkText;
if ( tlen > 0 ) tlen--;
// get the text
char *txt = k->getLinkText();
// sanity check
if ( ! verifyUtf8 ( txt , tlen ) ) {
log("xmldoc: bad link text 2 from url=%s for %s",
k->getUrl(),m_firstUrl.getUrl());
continue;
}
if ( internal ) hi.m_hashGroup = HASHGROUP_INTERNALINLINKTEXT;
else hi.m_hashGroup = HASHGROUP_INLINKTEXT;
// store the siterank of the linker in this and use that
// to set the multiplier M bits i guess
hi.m_linkerSiteRank = k->m_siteRank;
if(hi.m_linkerSiteRank>MAXSITERANK) {
log(LOG_INFO,"Inlink had siteRank>max (%d), probably from docid %ld", k->m_siteRank, k->m_docId);
hi.m_linkerSiteRank = MAXSITERANK;
}
// now record this so we can match the link text to
// a matched offsite inlink text term in the scoring info
k->m_wordPosStart = m_dist; // hi.m_startDist;
// . hash the link text into the table
// . returns false and sets g_errno on error
// . we still have the score punish from # of words though!
// . for inlink texts that are the same it should accumulate
// and use the reserved bits as a multiplier i guess...
if ( ! hashString ( txt,tlen,&hi) ) return false;
// now record this so we can match the link text to
// a matched offsite inlink text term in the scoring info
//k->m_wordPosEnd = hi.m_startDist;
// spread it out
hi.m_startDist += 20;
}
return true;
}
// . returns false and sets g_errno on error
bool XmlDoc::hashNeighborhoods ( HashTableX *tt ) {
setStatus ( "hashing neighborhoods" );
// . now we also hash the neighborhood text of each inlink, that is,
// the text surrounding the inlink text.
// . this is also destructive in that it will remove termids that
// were not in the document being linked to in order to save
// space in the titleRec
// . now we only do one or the other, not both
LinkInfo *linkInfo = getLinkInfo1();
if(!linkInfo)
return true;
// loop over all the Inlinks
for(Inlink *k = linkInfo->getNextInlink(NULL); k; k = linkInfo->getNextInlink(k)) {
// skip if internal, they often have the same neighborhood text
if((k->m_ip&0x0000ffff)==(m_ip&0x0000ffff))
continue;
// get the left and right texts and hash both
char *s = k->getSurroundingText();
if(!s || k->size_surroundingText <= 1)
continue;
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_desc = "surrounding text";
hi.m_hashGroup = HASHGROUP_NEIGHBORHOOD;
// . hash that
// . this returns false and sets g_errno on error
int32_t len = k->size_surroundingText - 1;
if(!hashString(s, len, &hi))
return false;
}
return true;
}
// . we now do the title hashing here for newer titlerecs, version 80+, rather
// than use the <index> block in the ruleset for titles.
// . this is not to be confused with hashing the title: terms which still
// does have an <index> block in the ruleset.
// . the new Weights class hashes title as part of body now with a high weight
// given by "titleWeight" parm
bool XmlDoc::hashTitle ( HashTableX *tt ) {
// sanity check
if ( m_hashedTitle ) { g_process.shutdownAbort(true); }
setStatus ( "hashing title" );
// this has been called, note it
m_hashedTitle = true;
const nodeid_t *tids = m_words.getTagIds();
int32_t nw = m_words.getNumWords();
// find the first <title> tag in the doc
int32_t i ;
for ( i = 0 ; i < nw ; i++ )
if ( tids[i] == TAG_TITLE ) break;
// return true if no title
if ( i >= nw ) return true;
// skip tag
i++;
// mark it as start of title
int32_t a = i;
// limit end
int32_t max = i + 40;
if ( max > nw ) max = nw;
// find end of title, either another <title> or a </title> tag
for ( ; i < max ; i++ )
if ( (tids[i] & BACKBITCOMP) == TAG_TITLE ) break;
// empty title? (another title tag right away)
if ( i == a ) return true;
HashInfo hi;
hi.m_tt = tt;
hi.m_prefix = "title";
// the new posdb info
hi.m_hashGroup = HASHGROUP_TITLE;
// . hash it up! use 0 for the date
// . use XmlDoc::hashWords()
// . use "title" as both prefix and description
//if ( ! hashWords (a,i,&hi ) ) return false;
char **wptrs = m_words.getWordPtrs();
int32_t *wlens = m_words.getWordLens();
char *title = wptrs[a];
char *titleEnd = wptrs[i-1] + wlens[i-1];
int32_t titleLen = titleEnd - title;
if ( ! hashString ( title, titleLen, &hi) ) return false;
// now hash as without title: prefix
hi.m_prefix = NULL;
if ( ! hashString ( title, titleLen, &hi) ) return false;
return true;
}
// . we now do the title hashing here for newer titlerecs, version 80+, rather
// than use the <index> block in the ruleset for titles.
// . this is not to be confused with hashing the title: terms which still
// does have an <index> block in the ruleset.
bool XmlDoc::hashBody2 ( HashTableX *tt ) {
// do not index ANY of the body if it is NOT a permalink and
// "menu elimination" technology is enabled.
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
setStatus ( "hashing body" );
// record this
m_bodyStartPos = m_dist;
m_bodyStartPosValid = true;
HashInfo hi;
hi.m_tt = tt;
hi.m_desc = "body";
hi.m_hashGroup = HASHGROUP_BODY;
// use NULL for the prefix
return hashWords (&hi );
}
bool XmlDoc::hashMetaKeywords ( HashTableX *tt ) {
// do not index meta tags if "menu elimination" technology is enabled.
//if ( m_eliminateMenus ) return true;
setStatus ( "hashing meta keywords" );
// hash the meta keywords tag
//char buf [ 2048 + 2 ];
//int32_t len=m_xml.getMetaContentPointer ( buf , 2048 , "keywords" , 8 );
int32_t mklen;
char *mk = getMetaKeywords( &mklen );
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_desc = "meta keywords";
hi.m_hashGroup = HASHGROUP_INMETATAG;
// call XmlDoc::hashString
return hashString ( mk , mklen , &hi);
}
// . hash the meta summary, description and keyword tags
// . we now do the title hashing here for newer titlerecs, version 80+, rather
// than use the <index> block in the ruleset for titles.
bool XmlDoc::hashMetaSummary ( HashTableX *tt ) {
// sanity check
if ( m_hashedMetas ) { g_process.shutdownAbort(true); }
// this has been called, note it
m_hashedMetas = true;
// do not index meta tags if "menu elimination" technology is enabled.
//if ( m_eliminateMenus ) return true;
setStatus ( "hashing meta summary" );
// hash the meta keywords tag
//char buf [ 2048 + 2 ];
//int32_t len = m_xml.getMetaContent ( buf , 2048 , "summary" , 7 );
int32_t mslen;
char *ms = getMetaSummary ( &mslen );
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INMETATAG;
// update hashing parms
hi.m_desc = "meta summary";
// hash it
if ( ! hashString ( ms , mslen , &hi )) return false;
//len = m_xml.getMetaContent ( buf , 2048 , "description" , 11 );
int32_t mdlen;
char *md = getMetaDescription ( &mdlen );
// update hashing parms
hi.m_desc = "meta desc";
// . TODO: only hash if unique????? set a flag on ht then i guess
if ( ! hashString ( md , mdlen , &hi ) ) return false;
return true;
}
bool XmlDoc::hashMetaGeoPlacename( HashTableX *tt ) {
setStatus ( "hashing meta geo.placename" );
int32_t mgplen;
char *mgp = getMetaGeoPlacename( &mgplen );
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_desc = "meta geo.placename";
hi.m_hashGroup = HASHGROUP_INMETATAG;
// call XmlDoc::hashString
return hashString ( mgp , mgplen , &hi);
}
bool XmlDoc::hashLanguage ( HashTableX *tt ) {
setStatus ( "hashing language" );
int32_t langId = (int32_t)*getLangId();
char s[32]; // numeric langid
int32_t slen = sprintf(s, "%" PRId32, langId );
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gblang";
if ( ! hashString ( s, slen, &hi ) ) return false;
return true;
}
bool XmlDoc::hashLanguageString ( HashTableX *tt ) {
setStatus ( "hashing language string" );
int32_t langId = (int32_t)*getLangId();
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gblang";
// try lang abbreviation
char s[32];
int32_t slen = sprintf(s , "%s ", getLanguageAbbr(langId) );
// go back to broken way to try to fix parsing consistency bug
if ( ! hashString ( s, slen, &hi ) ) return false;
return true;
}
bool XmlDoc::hashCountry ( HashTableX *tt ) {
setStatus ( "hashing country" );
uint16_t *cid = getCountryId();
if ( ! cid || cid == (uint16_t *)-1 ) return false;
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbcountry";
for ( int32_t i = 0 ; i < 1 ; i++ ) {
// convert it
char buf[32];
int32_t blen = sprintf(buf,"%s", g_countryCode.getAbbr(*cid) );
// hash it
if ( ! hashString ( buf, blen, &hi ) ) return false;
}
// all done
return true;
}
bool XmlDoc::hashSingleTerm( const char *s, int32_t slen, HashInfo *hi ) {
// empty?
if ( slen <= 0 ) return true;
if ( ! m_versionValid ) { g_process.shutdownAbort(true); }
if ( hi->m_useCountTable && ! m_countTableValid){g_process.shutdownAbort(true); }
// a single blob hash
int64_t termId = hash64 ( s , slen );
// combine with prefix
int64_t final = termId;
// combine with a non-NULL prefix
int64_t prefixHash = 0LL;
if ( hi->m_prefix ) {
prefixHash = hash64b ( hi->m_prefix );
final = hash64 ( termId , prefixHash );
}
// call the other guy now
//return hashSingleTerm ( final , hi );
// shortcut
HashTableX *dt = hi->m_tt;
// sanity check
if ( dt->getKeySize() != sizeof(key144_t) ) { g_process.shutdownAbort(true); }
// make the key like we do in hashWords()
key144_t k;
Posdb::makeKey ( &k ,
final,
0LL, // docid
0, // dist
MAXDENSITYRANK, // density rank
MAXDIVERSITYRANK, // diversity rank
MAXWORDSPAMRANK, // wordspamrank
0, // siterank
hi->m_hashGroup,
// we set to docLang in final hash loop
langUnknown,// langid
0, // multiplier
0, // syn?
false , // delkey?
hi->m_shardByTermId );
// . otherwise, add a new slot
// . key should NEVER collide since we are always
// incrementing the distance cursor, m_dist
if ( ! dt->addTerm144 ( &k ) ) return false;
// add to wts for PageParser.cpp display
if ( m_wts && ! storeTerm ( s,slen,final,hi,
0, // wordnum
0, // wordPos,
MAXDENSITYRANK,
MAXDIVERSITYRANK,
MAXWORDSPAMRANK,
hi->m_hashGroup,
//false,
&m_wbuf,
m_wts,
SOURCE_NONE, // synsrc
langUnknown,
k) )
return false;
return true;
}
bool XmlDoc::hashString( char *s, int32_t slen, HashInfo *hi ) {
if ( ! m_versionValid ) { g_process.shutdownAbort(true); }
if ( hi->m_useCountTable && ! m_countTableValid){g_process.shutdownAbort(true); }
if ( ! m_siteNumInlinksValid ) { g_process.shutdownAbort(true); }
return hashString3( s ,
slen ,
hi ,
&m_countTable ,
m_wts ,
&m_wbuf );
}
bool XmlDoc::hashString3( char *s ,
int32_t slen ,
HashInfo *hi ,
HashTableX *countTable ,
HashTableX *wts ,
SafeBuf *wbuf) {
Words words;
Bits bits;
Phrases phrases;
if ( ! words.set ( s , slen , true ) )
return false;
if ( !bits.set(&words))
return false;
if ( !phrases.set( &words, &bits ) )
return false;
// use primary langid of doc
if ( ! m_langIdValid ) { g_process.shutdownAbort(true); }
return hashWords3( hi, &words, &phrases, NULL, countTable, NULL, NULL, NULL, wts, wbuf );
}
bool XmlDoc::hashWords ( HashInfo *hi ) {
// sanity checks
if ( ! m_wordsValid ) { g_process.shutdownAbort(true); }
if ( ! m_phrasesValid ) { g_process.shutdownAbort(true); }
if ( hi->m_useCountTable &&!m_countTableValid){g_process.shutdownAbort(true); }
if ( ! m_bitsValid ) { g_process.shutdownAbort(true); }
if ( ! m_sectionsValid) { g_process.shutdownAbort(true); }
//if ( ! m_synonymsValid) { g_process.shutdownAbort(true); }
if ( ! m_fragBufValid ) { g_process.shutdownAbort(true); }
if ( ! m_wordSpamBufValid ) { g_process.shutdownAbort(true); }
if ( m_wts && ! m_langVectorValid ) { g_process.shutdownAbort(true); }
if ( ! m_langIdValid ) { g_process.shutdownAbort(true); }
// . is the word repeated in a pattern?
// . this should only be used for document body, for meta tags,
// inlink text, etc. we should make sure words are unique
char *wordSpamVec = getWordSpamVec();
char *fragVec = m_fragBuf.getBufStart();
char *langVec = m_langVec.getBufStart();
return hashWords3(hi, &m_words, &m_phrases, &m_sections, &m_countTable, fragVec, wordSpamVec, langVec, m_wts, &m_wbuf);
}
// . this now uses posdb exclusively
bool XmlDoc::hashWords3( HashInfo *hi, const Words *words, Phrases *phrases, Sections *sectionsArg, HashTableX *countTable,
char *fragVec, char *wordSpamVec, char *langVec, HashTableX *wts, SafeBuf *wbuf) {
Sections *sections = sectionsArg;
// for getSpiderStatusDocMetaList() we don't use sections; they'd mess us up
if ( ! hi->m_useSections ) sections = NULL;
// shortcuts
const uint64_t *wids = reinterpret_cast<const uint64_t*>(words->getWordIds());
const uint64_t *pids2 = reinterpret_cast<const uint64_t*>(phrases->getPhraseIds2());
HashTableX *dt = hi->m_tt;
// . sanity checks
// . posdb just uses the full keys with docid
if ( dt->getKeySize() != 18 ) { g_process.shutdownAbort(true); }
if ( dt->getDataSize() != 4 ) { g_process.shutdownAbort(true); }
// if provided...
if ( wts ) {
if ( wts->getKeySize() != 12 ) { g_process.shutdownAbort(true); }
if ( wts->getDataSize() != sizeof(TermDebugInfo)){g_process.shutdownAbort(true); }
if ( ! wts->isAllowDups() ) { g_process.shutdownAbort(true); }
}
// ensure caller set the hashGroup
if ( hi->m_hashGroup < 0 ) { g_process.shutdownAbort(true); }
// handy
const char *const*wptrs = words->getWordPtrs();
const int32_t *wlens = words->getWordLens();
// hash in the prefix
uint64_t prefixHash = 0LL;
int32_t plen = 0;
if ( hi->m_prefix ) plen = strlen ( hi->m_prefix );
if ( hi->m_prefix && plen ) {
// we gotta make this case insensitive, and skip spaces
// because if it is 'focal length' we can't search
// 'focal length:10' because that comes across as TWO terms.
prefixHash = hash64Lower_utf8_nospaces ( hi->m_prefix , plen );
// . sanity test, make sure it is in supported list
// . hashing diffbot json output of course fails this so
// skip in that case if diffbot
}
bool hashIffUnique = false;
if ( hi->m_hashGroup == HASHGROUP_INMETATAG ) hashIffUnique = true;
if ( hi->m_hashGroup == HASHGROUP_INTAG ) hashIffUnique = true;
HashTableX ut; ut.set ( 8,0,0,NULL,0,false,"uqtbl");
///////
//
// diversity rank vector.
//
///////
// the final diversity which is a multiplier
// is converted into a rank from 0-15 i guess.
// so 'mexico' in "new mexico" should receive a low word score but high
// phrase score. thus, a search for 'mexico' should not bring up
// the page for university of new mexico!
SafeBuf dwbuf;
if ( !getDiversityVec( words, phrases, countTable, &dwbuf ) ) {
return false;
}
char *wdv = dwbuf.getBufStart();
int32_t nw = words->getNumWords();
/////
//
// calculate density ranks
//
/////
//
// this now varies depending on the length of the sentence/header etc.
// so if the hashGroup is not title, link text or meta tag, we have to
// use a safebuf.
SafeBuf densBuf;
// returns false and sets g_errno on error
if ( ! getDensityRanks((int64_t *)wids,
nw,
hi->m_hashGroup,
&densBuf,
sections))
return false;
// a handy ptr
char *densvec = (char *)densBuf.getBufStart();
////////////
//
// get word positions
//
///////////
Section **sp = NULL;
if ( sections ) sp = sections->m_sectionPtrs;
SafeBuf wpos;
if ( ! getWordPosVec ( words , sections, m_dist, fragVec, &wpos) )
return false;
// a handy ptr
int32_t *wposvec = (int32_t *)wpos.getBufStart();
bool seen_slash = false;
int32_t i;
for ( i = 0 ; i < nw ; i++ ) {
if(wlens[i]==1 && wptrs[i][0]=='/')
seen_slash = true;
if ( ! wids[i] ) continue;
// ignore if in repeated fragment
if ( fragVec && i<MAXFRAGWORDS && fragVec[i] == 0 ) continue;
// ignore if in style section
if ( sp && (sp[i]->m_flags & NOINDEXFLAGS) ) continue;
// do not breach wordpos bits
if ( wposvec[i] > MAXWORDPOS ) break;
// BR: 20160114 if digit, do not hash it if disabled
if( is_digit( wptrs[i][0] ) && !hi->m_hashNumbers ) {
continue;
}
// . hash the startHash with the wordId for this word
// . we must mask it before adding it to the table because
// this table is also used to hash IndexLists into that come
// from LinkInfo classes (incoming link text). And when
// those IndexLists are hashed they used masked termIds.
// So we should too...
uint64_t h ;
if ( plen > 0 ) h = hash64 ( wids[i] , prefixHash );
else h = wids[i];
int32_t hashGroup = hi->m_hashGroup;
Section *sx = NULL;
if ( sp ) {
sx = sp[i];
// . this is taken care of in hashTitle()
// . it is slightly different if the title is
// multiple sentences because when hashing the
// body the density rank is per sentence, but in
// hashTitle we count all the words in the title
// towards the density rank even if they are
// in different sentences
if ( sx->m_flags & SEC_IN_TITLE ) {
continue;
}
if ( sx->m_flags & SEC_IN_HEADER ) {
hashGroup = HASHGROUP_HEADING;
}
if ( sx->m_flags & ( SEC_MENU | SEC_MENU_SENTENCE | SEC_MENU_HEADER ) ) {
hashGroup = HASHGROUP_INMENU;
}
}
// this is for link text and meta tags mostly
if ( hashIffUnique ) {
// skip if already did it
if ( ut.isInTable ( &h ) ) continue;
if ( ! ut.addKey ( &h ) ) return false;
}
char ws = 15;
if ( wordSpamVec ) ws = wordSpamVec[i];
// HACK:
// if this is inlink text, use the wordspamrank to hold the
// inlinker's site rank!
if ( hashGroup == HASHGROUP_INLINKTEXT )
ws = hi->m_linkerSiteRank;
// default to the document's primary language if it is not
// clear what language this word belongs to.
// if the word is only in german it should be german,
// otherwise it will be the document's primary language.
char langId = langUnknown;
if ( m_wts && langVec ) langId = langVec[i];
char wd;
if ( hi->m_useCountTable ) {
wd = wdv[i];
} else {
wd = MAXDIVERSITYRANK;
}
bool skipword = false;
if(hi->m_filterUrlIndexableWords) {
if(!seen_slash) {
//Scheme/host/domain part of URL
//the http/https prefix is not indexed at all
if((wlens[i]==4 && memcmp(wptrs[i],"http",4)==0) ||
(wlens[i]==5 && memcmp(wptrs[i],"https",5)==0))
{
// Never include as single word or in bigrams
continue; //skip to next word
}
//the terms .com .co .dk etc have lots of hits and give very little value for indexing. We only index the bigrams.
if(isTLD(wptrs[i], wlens[i])) {
skipword = true; //skip the single word but still index the bigram
}
} else {
//Path part of URL
//potentially filter out "html" "aspx" "index" "cgi" etc.
}
}
if(!skipword) {
key144_t k;
Posdb::makeKey(&k,
h,
0LL,//docid
wposvec[i], // dist,
densvec[i],// densityRank , // 0-15
wd, // diversityRank 0-15
ws, // wordSpamRank 0-15
0, // siterank
hashGroup,
// we set to docLang final hash loop
langUnknown, // langid
0, // multiplier
false, // syn?
false, // delkey?
hi->m_shardByTermId);
// key should NEVER collide since we are always incrementing
// the distance cursor, m_dist
dt->addTerm144(&k);
// add to wts for PageParser.cpp display
if(wts) {
if(!storeTerm(wptrs[i],wlens[i],h,hi,i,
wposvec[i], // wordPos
densvec[i],// densityRank , // 0-15
wd,//v[i],
ws,
hashGroup,
wbuf,
wts,
SOURCE_NONE, // synsrc
langId,
k))
return false;
}
//
// STRIP POSSESSIVE WORDS for indexing
//
// . for now do simple stripping here
// . if word is "bob's" hash "bob"
//
//@todo BR 20160107: Is this always good? Is the same done in Query.cpp?
if(wlens[i] >= 3 &&
wptrs[i][wlens[i]-2] == '\'' &&
to_lower_a(wptrs[i][wlens[i]-1]) == 's')
{
int64_t nah = hash64Lower_utf8(wptrs[i], wlens[i]-2);
if(plen>0) nah = hash64(nah, prefixHash);
Posdb::makeKey(&k,
nah,
0LL,//docid
wposvec[i], // dist,
densvec[i],// densityRank , // 0-15
wd,//v[i], // diversityRank ,
ws, // wordSpamRank ,
0, //siterank
hashGroup,
// we set to docLang final hash loop
langUnknown, // langid
0 , // multiplier
true, // syn?
false, // delkey?
hi->m_shardByTermId );
// key should NEVER collide since we are always
// incrementing the distance cursor, m_dist
dt->addTerm144(&k);
// keep going if not debug
if(!wts) continue;
// print the synonym
if(!storeTerm(wptrs[i], // synWord,
wlens[i] -2, // strlen(synWord),
nah, // termid
hi,
i, // wordnum
wposvec[i], // wordPos
densvec[i],// densityRank , // 0-15
wd,//v[i],
ws,
hashGroup,
//false, // is phrase?
wbuf,
wts,
SOURCE_GENERATED,
langId,
k))
return false;
}
} //!skipword
////////
//
// two-word phrase
//
////////
int64_t npid = pids2[i];
uint64_t ph2 = 0;
// repeat for the two word hash if different!
if ( npid ) {
// hash with prefix
if ( plen > 0 ) ph2 = hash64 ( npid , prefixHash );
else ph2 = npid;
key144_t k;
Posdb::makeKey ( &k ,
ph2 ,
0LL,//docid
wposvec[i],//dist,
densvec[i],// densityRank , // 0-15
MAXDIVERSITYRANK, //phrase
ws, // wordSpamRank ,
0,//siterank
hashGroup,
// we set to docLang final hash loop
langUnknown, // langid
0 , // multiplier
false, // syn?
false , // delkey?
hi->m_shardByTermId );
// key should NEVER collide since we are always
// incrementing the distance cursor, m_dist
dt->addTerm144 ( &k );
// add to wts for PageParser.cpp display
if(wts) {
// get phrase as a string
int32_t plen;
char phraseBuffer[256];
phrases->getPhrase(i, phraseBuffer, sizeof(phraseBuffer), &plen);
// store it
if(!storeTerm(phraseBuffer,plen,ph2,hi,i,
wposvec[i], // wordPos
densvec[i],// densityRank , // 0-15
MAXDIVERSITYRANK,//phrase
ws,
hashGroup,
//true,
wbuf,
wts,
SOURCE_BIGRAM, // synsrc
langId,
k))
return false;
}
}
//
// NUMERIC SORTING AND RANGES
//
// only store numbers in fields this way
if ( prefixHash == 0 )
{
continue;
}
// this may or may not be numeric.
if ( ! is_digit ( wptrs[i][0] ) )
{
continue;
}
// Avoid creating "sortby" number values in posdb if not wanted
if( !hi->m_createSortByForNumbers )
{
continue;
}
// this might have to "back up" before any '.' or '-' symbols
if ( ! hashNumberForSorting ( wptrs[0] ,
wptrs[i] ,
wlens[i] ,
hi ) )
return false;
}
// between calls? i.e. hashTitle() and hashBody()
if ( i > 0 ) m_dist = wposvec[i-1] + 100;
return true;
}
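// Example of the m_filterUrlIndexableWords handling above (illustration
// only): when hashUrl() feeds something like "www.example.com/blog/post.html"
// through here, a leading "http"/"https" word would be dropped entirely,
// tld words such as "com" are skipped as single terms but still contribute
// to bigrams, and every word after the first '/' ("blog", "post", "html")
// is indexed normally since the path branch does no filtering yet.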
// . we store numbers as floats in the top 4 bytes of the lower 6 bytes of the
// posdb key
// . the termid is the hash of the preceding field
// . in json docs a field is like "object.details.price"
// . in meta tags it is just the meta tag name
// . credit card numbers are 16 digits. we'd need like 58 bits to store those
// so we can't do that here, but we can approximate as a float
// . the binary representation of non-negative floating point numbers is
// ordered in the same order as the floating points themselves! so we are
// lucky and can keep our usual KEYCMP sorting algos to keep the floats in order.
bool XmlDoc::hashNumberForSorting ( const char *beginBuf ,
const char *buf ,
int32_t bufLen ,
HashInfo *hi ) {
if ( ! is_digit(buf[0]) ) return true;
const char *p = buf;
const char *bufEnd = buf + bufLen;
// back-up over any .
if ( p > beginBuf && p[-1] == '.' ) p--;
// negative sign?
if ( p > beginBuf && p[-1] == '-' ) p--;
//
// also hash as an int (4-byte integer) so our lastSpidered timestamps
// don't lose 128 seconds of resolution
//
int32_t i = (int32_t) atoll2 ( p , bufEnd - p );
if ( ! hashNumberForSortingAsInt32 ( i , hi , "gbsortbyint" ) )
return false;
// also hash in reverse order for sorting from low to high
i = -1 * i;
if ( ! hashNumberForSortingAsInt32 ( i , hi , "gbrevsortbyint" ) )
return false;
return true;
}
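// Worked example (illustration only): a field value of "42" is indexed twice
// by the code above, once under gbsortbyint:<field> with the int32 value 42
// and once under gbrevsortbyint:<field> with the value -42, per the
// "hash in reverse order" comment -- negating the value is what lets the
// same low-to-high termlist scan serve the opposite sort direction. The
// dates from hashDateNumbers() (gbspiderdate, gbindexdate) go through this
// same path.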
bool XmlDoc::hashNumberForSortingAsInt32 ( int32_t n , HashInfo *hi , const char *sortByStr ) {
// prefix is something like price. like the meta "name" or
// the json name with dots in it like "product.info.price" or something
int64_t nameHash = 0LL;
int32_t nameLen = 0;
if ( hi->m_prefix ) nameLen = strlen ( hi->m_prefix );
if ( hi->m_prefix && nameLen )
nameHash = hash64Lower_utf8_nospaces( hi->m_prefix , nameLen );
// need a prefix for hashing numbers... for now
else { g_process.shutdownAbort(true); }
// combine prefix hash with a special hash to make it unique to avoid
// collisions. this is the "TRUE" prefix.
int64_t truePrefix64 = hash64n ( sortByStr ); // "gbsortby");
// hash with the "TRUE" prefix
int64_t ph2 = hash64 ( nameHash , truePrefix64 );
// . now store it
// . use field hash as the termid. normally this would just be
// a prefix hash
// . use mostly fake value otherwise
key144_t k;
Posdb::makeKey ( &k ,
ph2 ,
0,//docid
0,// word pos #
0,// densityRank , // 0-15
0 , // MAXDIVERSITYRANK
0 , // wordSpamRank ,
0 , //siterank
0 , // hashGroup,
// we set to docLang final hash loop
//langUnknown, // langid
// unless already set. so set to english here
// so it will not be set to something else
// otherwise our floats would be ordered by langid!
// somehow we have to indicate that this is a float
// termlist so it will not be mangled any more.
//langEnglish,
langUnknown,
0 , // multiplier
false, // syn?
false , // delkey?
hi->m_shardByTermId );
Posdb::setInt ( &k , n );
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
// so that we can b-step into a posdb list and make sure
// we are aligned on a 6 byte or 12 byte key, since they come
// in both sizes. but for this, hack it off to tell
// addTerm144() that we are a special posdb key, a "numeric"
// key that has a float stored in it. then it will NOT
// set the siterank and langid bits which throw our sorting
// off!!
Posdb::setAlignmentBit ( &k , 0 );
// sanity
//float t = Posdb::getFloat ( &k );
int32_t x = Posdb::getInt ( &k );
if ( x != n ) { g_process.shutdownAbort(true); }
HashTableX *dt = hi->m_tt;
// the key may indeed collide, but that's ok for this application
if ( ! dt->addTerm144 ( &k ) )
return false;
if ( ! m_wts )
return true;
// store in buffer
char buf[128];
snprintf(buf,126,"%s:%s int32=%" PRId32,sortByStr, hi->m_prefix,n);
int32_t bufLen = strlen(buf);
// add to wts for PageParser.cpp display
// store it
if ( ! storeTerm ( buf,
bufLen,
ph2,
hi,
0, // word#, i,
0, // wordPos
0,// densityRank , // 0-15
0, // MAXDIVERSITYRANK,//phrase
0, // ws,
0, // hashGroup,
//true,
&m_wbuf,
m_wts,
// a hack for display in wts:
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
langUnknown ,
k ) )
return false;
return true;
}