Merge branch 'master' into dev-proxy
@@ -3967,10 +3967,7 @@ TokenizerResult *XmlDoc::getTokenizerResult2() {
 	m_sectionsValid = false;
 	m_posValid = false;
 	//and the bigram generation in XmlDoc::hashWords3() requires that the tokens are sorted by <startpos,endpos>
-	std::sort(m_tokenizerResult.tokens.begin(), m_tokenizerResult.tokens.end(), [](const TokenRange &tr0, const TokenRange &tr1) {
-		return tr0.start_pos < tr1.start_pos ||
-		       (tr0.start_pos == tr1.start_pos && tr0.end_pos < tr1.end_pos);
-	});
+	sortTokenizerResult(&m_tokenizerResult);
 
 	logQueryTimingEnd( __func__, start );
 
XmlDoc.h (3 changed lines)
@@ -526,6 +526,8 @@ public:
 	bool hashLanguage ( class HashTableX *table ) ;
 	bool hashLanguageString ( class HashTableX *table ) ;
 	bool hashCountry ( class HashTableX *table ) ;
+	void sortTokenizerResult(TokenizerResult *tr);
+	void getLanguageAndCountry(lang_t *lang, const char **country_code);
 
 	class Url *getBaseUrl ( ) ;
 
@@ -571,6 +573,7 @@ public:
 	                Sections *sections, const Bits *bits,
 	                const char *fragVec, const char *wordSpamVec, const char *langVec,
 	                HashTableX *wts, SafeBuf *wbuf);
+	bool hashString4(const char *s, int32_t slen, HashInfo *hi);
 
 
 	// print out for PageTitledb.cpp and PageParser.cpp
@@ -14,6 +14,7 @@
 #include "UrlBlockCheck.h"
 #include "Domains.h"
 #include "FxExplicitKeywords.h"
+#include <algorithm>
 
 
 #ifdef _VALGRIND_
@@ -31,7 +32,7 @@ static void possiblyDecodeHtmlEntitiesAgain(const char **s, int32_t *len, SafeBu
 
 	//require &amp; followed by a second semicolon
 	const char *amppos = (const char*)memmem(*s,*len,"&amp;",5);
-	if((amppos && memchr(amppos+5, ';', *len-(amppos-*s))!=NULL) ||
+	if((amppos && memchr(amppos+5, ';', *len-(amppos-*s)-5)!=NULL) ||
 	   (memmem(*s,*len,"&lt;",4)!=NULL && memmem(*s,*len,"&gt;",4)!=NULL)) {
 		//shortest entity is 4 char (&lt;), longest utf8 encoding of a codepoint is 4 + a bit
 		StackBuf<1024> tmpBuf;
@@ -575,15 +576,10 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
 		// are used in user searches automatically.
 		hi.m_prefix = NULL;
 
 		// desc is NULL, prefix will be used as desc
-		bool status = hashString ( s,len, &hi );
+		bool status = hashString4(s,len,&hi);
-
 		// bail on error, g_errno should be set
 		if ( ! status ) return false;
-
-		// return false with g_errno set on error
-		//if ( ! hashNumberForSorting ( buf , bufLen , &hi ) )
-		//	return false;
 	}
 
 	return true;
@@ -1266,7 +1262,7 @@ bool XmlDoc::hashIncomingLinkText(HashTableX *tt) {
 		// . we still have the score punish from # of words though!
 		// . for inlink texts that are the same it should accumulate
 		//   and use the reserved bits as a multiplier i guess...
-		if ( ! hashString ( txt,tlen,&hi) ) return false;
+		if ( ! hashString4(txt,tlen,&hi) ) return false;
 		// now record this so we can match the link text to
 		// a matched offsite inlink text term in the scoring info
 		//k->m_wordPosEnd = hi.m_startDist;
@@ -1332,84 +1328,51 @@ bool XmlDoc::hashTitle ( HashTableX *tt ) {
 	// this has been called, note it
 	m_hashedTitle = true;
 
-	int32_t nw = m_tokenizerResult.size();
-
-	// find the first <title> tag in the doc
-	int32_t i ;
-	for ( i = 0 ; i < nw ; i++ )
-		if ( m_tokenizerResult[i].nodeid == TAG_TITLE ) break;
-
-	// return true if no title
-	if ( i >= nw ) return true;
-
-	// skip tag
-	i++;
-	// mark it as start of title
-	int32_t a = i;
-
-	// limit end
-	int32_t max = i + 40;
-	if ( max > nw ) max = nw;
-
-	// find end of title, either another <title> or a </title> tag
-	for ( ; i < max ; i++ )
-		if ( (m_tokenizerResult[i].nodeid & BACKBITCOMP) == TAG_TITLE ) break;
-
-	// ends on a </title> tag?
-	if ( i == a ) return true;
-
-	HashInfo hi;
-	hi.m_tt = tt;
-	hi.m_prefix = "title";
-
-	// the new posdb info
-	hi.m_hashGroup = HASHGROUP_TITLE;
-
-	// . hash it up! use 0 for the date
-	// . use XmlDoc::hashWords()
-	// . use "title" as both prefix and description
-	//if ( ! hashWords (a,i,&hi ) ) return false;
-
-	//FIXME: also grab the alternative tokens from phase 2 in the title part
-
-	//clean indexing:
-	//	if ( ! hashString(a, i, &hi) ) return false;
-	//but due to bad webmasters we have to decode html entities multiple times.
-
-	bool any_non_primary_tokens = false;
-	for(int j=a; j<=i; j++) {
-		if(!m_tokenizerResult[j].is_primary) {
-			any_non_primary_tokens = true;
-			break;
-		}
-	}
+	//getXml()->getUtf8Content() results in the HTML to be ~mostly~ decoded but lt/gt/amp are still there escaped.
+	//So get the title text from m_xml, retokenize it, and then index that
+	int rawTitleLen;
+	const char *rawTitle = m_xml.getString("title",&rawTitleLen);
+	if(!rawTitle) {
+		//no title - nothing to do
+		return true;
+	}
 
-	const char *title;
-	const char *titleEnd;
-	StackBuf<1024> tmpTitleBuf;
-
-	if(!any_non_primary_tokens) {
-		//use the raw source memory because the tokens match up
-		title = m_tokenizerResult[a].token_start;
-		titleEnd = m_tokenizerResult[i].token_end();
-	} else {
-		//copy primary tokens to tmpTitleBuf
-		for(int j=a; j<=i; j++)
-			tmpTitleBuf.safeMemcpy(m_tokenizerResult[j].token_start,m_tokenizerResult[j].token_len);
-		title = tmpTitleBuf.getBufStart();
-		titleEnd = tmpTitleBuf.getBufPtr();
-		//TODO: Investigate if it isn't better to just find the relevant XmlNode and get the text (title) from there.
-	}
-	int32_t titleLen = titleEnd - title;
+	//The amp/lt/gt are still there so decode them once again to get rid of them.
+	//Due to bad webmasters there can be double-encoded entities in the title. Technically it is
+	//their error but we can make some repairs on those pages.
+	const char *title = rawTitle;
+	int32_t titleLen = rawTitleLen;
+	StackBuf<1024> doubleDecodedContent;
+	possiblyDecodeHtmlEntitiesAgain(&title, &titleLen, &doubleDecodedContent, false);
 
-	if ( ! hashString(title, titleLen, &hi)) return false;
-
-	// now hash as without title: prefix
-	hi.m_prefix = NULL;
-	if ( ! hashString(title, titleLen, &hi)) return false;
+	//get language and country if known, so tokenizer phase 2 can do its magic
+	lang_t lang_id;
+	const char *countryCode;
+	getLanguageAndCountry(&lang_id,&countryCode);
+
+	TokenizerResult tr;
+	plain_tokenizer_phase_1(title,titleLen,&tr);
+	plain_tokenizer_phase_2(lang_id, countryCode, &tr);
+	calculate_tokens_hashes(&tr);
+	sortTokenizerResult(&tr);
+
+	Bits bits;
+	if(!bits.set(&tr))
+		return false;
+
+	HashInfo hi;
+	hi.m_tt = tt;
+	hi.m_hashGroup = HASHGROUP_TITLE;
+
+	// hash with title: prefix
+	hi.m_prefix = "title";
+	if(!hashWords3(&hi, &tr, NULL, &bits, NULL, NULL, NULL, m_wts, &m_wbuf))
+		return false;
+	// hash without title: prefix
+	hi.m_prefix = NULL;
+	if(!hashWords3(&hi, &tr, NULL, &bits, NULL, NULL, NULL, m_wts, &m_wbuf))
+		return false;
 
 	return true;
 }
 
@@ -1458,7 +1421,7 @@ bool XmlDoc::hashMetaKeywords ( HashTableX *tt ) {
 	hi.m_hashGroup = HASHGROUP_INMETATAG;
 
 	// call XmlDoc::hashString
-	return hashString ( mk , mklen , &hi);
+	return hashString4(mk, mklen, &hi);
 }
 
 
@@ -1491,7 +1454,7 @@ bool XmlDoc::hashExplicitKeywords(HashTableX *tt) {
 		hi.m_tt = tt;
 		hi.m_desc = "explicit keywords";
 		hi.m_hashGroup = HASHGROUP_EXPLICIT_KEYWORDS;
-		return hashString(ptr_explicitKeywords, size_explicitKeywords, &hi);
+		return hashString4(ptr_explicitKeywords, size_explicitKeywords, &hi);
 	} else
 		return true; //nothing done - no error
 }
@@ -1530,7 +1493,8 @@ bool XmlDoc::hashMetaSummary ( HashTableX *tt ) {
 	// udpate hashing parms
 	hi.m_desc = "meta summary";
 	// hash it
-	if ( ! hashString ( ms , mslen , &hi )) return false;
+	if(!hashString4(ms,mslen,&hi))
+		return false;
 
 
 	//len = m_xml.getMetaContent ( buf , 2048 , "description" , 11 );
@@ -1541,7 +1505,8 @@ bool XmlDoc::hashMetaSummary ( HashTableX *tt ) {
 	// udpate hashing parms
 	hi.m_desc = "meta desc";
 	// . TODO: only hash if unique????? set a flag on ht then i guess
-	if ( ! hashString ( md , mdlen , &hi ) ) return false;
+	if(!hashString4(md,mdlen, &hi))
+		return false;
 
 	return true;
 }
@@ -1561,7 +1526,7 @@ bool XmlDoc::hashMetaGeoPlacename( HashTableX *tt ) {
 	hi.m_hashGroup = HASHGROUP_INMETATAG;
 
 	// call XmlDoc::hashString
-	return hashString ( mgp , mgplen , &hi);
+	return hashString4(mgp, mgplen, &hi);
 }
 
 
@@ -1632,6 +1597,28 @@ bool XmlDoc::hashCountry ( HashTableX *tt ) {
 	return true;
 }
 
+void XmlDoc::sortTokenizerResult(TokenizerResult *tr) {
+	std::sort(tr->tokens.begin(), tr->tokens.end(), [](const TokenRange &tr0, const TokenRange &tr1) {
+		return tr0.start_pos < tr1.start_pos ||
+		       (tr0.start_pos == tr1.start_pos && tr0.end_pos < tr1.end_pos);
+	});
+}
+
+void XmlDoc::getLanguageAndCountry(lang_t *lang, const char **country_code) {
+	//get language and country if known, so tokenizer phase 2 can do its magic
+	uint8_t *tmpLangId = getLangId();
+	if(tmpLangId!=NULL && tmpLangId!=(uint8_t*)-1)
+		*lang = (lang_t)*tmpLangId;
+	else
+		*lang = langUnknown;
+
+	uint16_t *countryId = getCountryId();
+	if(countryId!=NULL && countryId!=(uint16_t*)-1)
+		*country_code = g_countryCode.getAbbr(*countryId);
+	else
+		*country_code = NULL;
+}
+
 bool XmlDoc::hashSingleTerm( const char *s, int32_t slen, HashInfo *hi ) {
 	// empty?
 	if ( slen <= 0 ) return true;
@@ -1753,6 +1740,23 @@ bool XmlDoc::hashString3(size_t begin_token, size_t end_token, HashInfo *hi,
 	return hashWords3( hi, &m_tokenizerResult, begin_token, end_token, NULL, &bits, NULL, NULL, NULL, wts, wbuf );
 }
 
+bool XmlDoc::hashString4(const char *s, int32_t slen, HashInfo *hi) {
+	TokenizerResult tr;
+	Bits bits;
+	lang_t lang_id;
+	const char *countryCode;
+
+	getLanguageAndCountry(&lang_id,&countryCode);
+	plain_tokenizer_phase_1(s,slen,&tr);
+	plain_tokenizer_phase_2(lang_id,countryCode,&tr);
+	calculate_tokens_hashes(&tr);
+	sortTokenizerResult(&tr);
+	if(!bits.set(&tr))
+		return false;
+
+	return hashWords3( hi, &tr, NULL, &bits, NULL, NULL, NULL, m_wts, &m_wbuf );
+}
+
 
 bool XmlDoc::hashWords ( HashInfo *hi ) {
 	// sanity checks
@@ -2055,7 +2059,7 @@ bool XmlDoc::hashWords3(HashInfo *hi, const TokenizerResult *tr, size_t begin_to
 			const auto &t2 = (*tr)[j];
 			if(t2.is_alfanum && t2.start_pos>=token.end_pos)
 				break;
-			if(!bits->canBeInPhrase(i) && !bits->canPairAcross(j)) {
+			if(!bits->canBeInPhrase(j) && !bits->canPairAcross(j)) {
 				generate_bigram = false;
 				break;
 			}
@@ -433,7 +433,7 @@ static void combine_possessive_s_tokens(TokenizerResult *tr, lang_t lang) {
 		//t1 must be a single blotch
 		if(t1.token_len>4)
 			continue;
-		UChar32 uc[2];
+		UChar32 uc[4];
 		int ucs = decode_utf8_string(t1.token_start,t1.token_len,uc);
 		if(ucs!=1)
 			continue;
@@ -781,6 +781,8 @@ static void recognize_telephone_numbers_sweden(TokenizerResult *tr) {
 				}
 			}
 		}
+		if(last_digit_token_idx>=org_token_count)
+			last_digit_token_idx = org_token_count-1;
 		if(digit_count<5)
 			continue;
 		if(digit_count>10)
@@ -1171,7 +1173,9 @@ static void rewrite_ampersands(TokenizerResult *tr, lang_t lang, const char *cou
 
 static void rewrite_ampersands(TokenizerResult *tr, const char *ampersand_word, size_t ampersand_word_len) {
 	char *s = NULL;
-	for(const auto &t : tr->tokens) {
+	size_t org_token_count = tr->size();
+	for(size_t i=1; i<org_token_count; i++) {
+		const auto &t = (*tr)[i];
 		if(t.token_len==1 && *t.token_start=='&') {
 			if(!s) {
 				s = (char*)tr->egstack.alloc(ampersand_word_len);
@@ -667,6 +667,13 @@ int main(void) {
 		assert(t.has_token("Johns"));
 	}
 
+	printf("Test line %d\n",__LINE__);
+	{
+		T2 t("John''''s dog",langEnglish);
+		assert(!t.has_token("John's"));
+		assert(!t.has_token("Johns"));
+	}
+
 
 	//hyphenation
 	printf("Test line %d\n",__LINE__);
@@ -844,6 +851,12 @@ int main(void) {
 		T2 t("foo 040-99 88 77 boo",langSwedish);
 		assert(t.has_token("040998877"));
 	}
+	printf("Test line %d\n",__LINE__);
+	{
+		T2 t("foo 08-24 50 55",langSwedish);
+		assert(t.has_token("08245055"));
+	}
+	return 0;
 
 	printf("Test line %d\n",__LINE__);
 	{