forked from Mirrors/privacore-open-source-search-engine
Merge branch 'master' into dev-siteinfo
Makefile (2 changes)

@@ -433,7 +433,7 @@ doc:
 
 # used for tools/unittest
 libgb.a: $(OBJS) libsto.a libword_variations.a libunicode.a
-	ar rcs $@ $^ word_variations/*.o sto/*.o unicode/*.o
+	ar rcs $@ $^ word_variations/*.o sto/*.o unicode/*.o tokenizer/*.o
 
 .PHONY: tools
 tools:
Pos.cpp (2 changes)

@@ -285,7 +285,7 @@ unsigned Pos::filter( const TokenizerResult *tr, int32_t a, int32_t b, bool addE
 
 
 	bool resetPunctCount = true;
-	if ( is_punct_utf8( p ) ) {
+	if (is_punct_utf8(p) && !is_wspace_utf8(p)) {
 		if ( ( cs == lastPunctSize) && ( memcmp(lastPunct, p, cs) == 0 ) ) {
 			resetPunctCount = false;
 			++samePunctCount;
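The new condition also requires the character not be whitespace. Judging from the fix, the project's is_punct_utf8() classifies whitespace as punctuation, so a run of blanks could previously masquerade as a repeated-punctuation sequence and inflate samePunctCount. A self-contained toy of the corrected counting rule (standard <cctype> classifiers stand in for the project's UTF-8 helpers, so this is an illustration, not the project code):

#include <cctype>
#include <cstdio>

// Longest run of the *same* punctuation character, never counting whitespace.
// Mirrors the fixed guard in Pos::filter(): is_punct && !is_wspace.
static int longest_same_punct_run(const char *s) {
    int best = 0, run = 0;
    char last = '\0';
    for (; *s; ++s) {
        if (ispunct((unsigned char)*s) && !isspace((unsigned char)*s)) {
            run = (*s == last) ? run + 1 : 1;
            last = *s;
        } else {
            run = 0;
            last = '\0';
        }
        if (run > best) best = run;
    }
    return best;
}

int main() {
    printf("%d\n", longest_same_punct_run("wow....!"));  // prints 4
    printf("%d\n", longest_same_punct_run("a    b"));    // prints 0: spaces never count
    return 0;
}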
@@ -17684,7 +17684,7 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
 	if ( linkdbHostNum < numHosts ) {
 		linkdbHostId = hosts[linkdbHostNum].m_hostId ;
 		if( !hosts[linkdbHostNum].m_spiderEnabled) {
-			linkdbHostId = g_hostdb.getHostIdWithSpideringEnabled(linkdbShardNum, true);
+			linkdbHostId = g_hostdb.getHostIdWithSpideringEnabled(linkdbShardNum, false);
 		}
 	}
 	else {
XmlDoc.h (2 changes)

@@ -526,6 +526,7 @@ public:
 	bool hashLanguageString ( class HashTableX *table ) ;
 	bool hashCountry ( class HashTableX *table ) ;
 	void sortTokenizerResult(TokenizerResult *tr);
+	void getLanguageAndCountry(lang_t *lang, const char **country_code);
 
 	class Url *getBaseUrl ( ) ;
 

@@ -571,6 +572,7 @@ public:
 	          Sections *sections, const Bits *bits,
 	          const char *fragVec, const char *wordSpamVec, const char *langVec,
 	          HashTableX *wts, SafeBuf *wbuf);
+	bool hashString4(const char *s, int32_t slen, HashInfo *hi);
 
 
 	// print out for PageTitledb.cpp and PageParser.cpp
@@ -576,15 +576,10 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
 		// are used in user searches automatically.
 		hi.m_prefix = NULL;
 
 		// desc is NULL, prefix will be used as desc
-		bool status = hashString ( s,len, &hi );
+		bool status = hashString4(s,len,&hi);
 
 		// bail on error, g_errno should be set
 		if ( ! status ) return false;
 
 		// return false with g_errno set on error
 		//if ( ! hashNumberForSorting ( buf , bufLen , &hi ) )
 		//	return false;
 	}
 
 	return true;
@@ -1267,7 +1262,7 @@ bool XmlDoc::hashIncomingLinkText(HashTableX *tt) {
 	// . we still have the score punish from # of words though!
 	// . for inlink texts that are the same it should accumulate
 	//   and use the reserved bits as a multiplier i guess...
-	if ( ! hashString ( txt,tlen,&hi) ) return false;
+	if ( ! hashString4(txt,tlen,&hi) ) return false;
 	// now record this so we can match the link text to
 	// a matched offsite inlink text term in the scoring info
 	//k->m_wordPosEnd = hi.m_startDist;
@@ -1352,20 +1347,12 @@ bool XmlDoc::hashTitle ( HashTableX *tt ) {
 
 	//get language and country if known, so tokenizer phase 2 can do its magic
 	lang_t lang_id;
-	uint8_t *tmpLangId = getLangId();
-	if(tmpLangId!=NULL && tmpLangId!=(uint8_t*)-1)
-		lang_id = (lang_t)*tmpLangId;
-	else
-		lang_id = langUnknown;
-
-	const char *countryCode = NULL;
-	uint16_t *countryId = getCountryId();
-	if(countryId!=NULL && countryId!=(uint16_t*)-1)
-		countryCode = g_countryCode.getAbbr(*countryId);
+	const char *countryCode;
+	getLanguageAndCountry(&lang_id,&countryCode);
 
 	TokenizerResult tr;
 	plain_tokenizer_phase_1(title,titleLen,&tr);
-	plain_tokenizer_phase_2((lang_t)lang_id, countryCode, &tr);
+	plain_tokenizer_phase_2(lang_id, countryCode, &tr);
 	calculate_tokens_hashes(&tr);
 	sortTokenizerResult(&tr);
 
@@ -1434,7 +1421,7 @@ bool XmlDoc::hashMetaKeywords ( HashTableX *tt ) {
 	hi.m_hashGroup = HASHGROUP_INMETATAG;
 
 	// call XmlDoc::hashString
-	return hashString ( mk , mklen , &hi);
+	return hashString4(mk, mklen, &hi);
 }
 
 
@@ -1467,7 +1454,7 @@ bool XmlDoc::hashExplicitKeywords(HashTableX *tt) {
 		hi.m_tt = tt;
 		hi.m_desc = "explicit keywords";
 		hi.m_hashGroup = HASHGROUP_EXPLICIT_KEYWORDS;
-		return hashString(ptr_explicitKeywords, size_explicitKeywords, &hi);
+		return hashString4(ptr_explicitKeywords, size_explicitKeywords, &hi);
 	} else
 		return true; //nothing done - no error
 }
@@ -1506,7 +1493,8 @@ bool XmlDoc::hashMetaSummary ( HashTableX *tt ) {
 		// udpate hashing parms
 		hi.m_desc = "meta summary";
 		// hash it
-		if ( ! hashString ( ms , mslen , &hi )) return false;
+		if(!hashString4(ms,mslen,&hi))
+			return false;
 
 
 	//len = m_xml.getMetaContent ( buf , 2048 , "description" , 11 );
@@ -1517,7 +1505,8 @@ bool XmlDoc::hashMetaSummary ( HashTableX *tt ) {
 		// udpate hashing parms
 		hi.m_desc = "meta desc";
 		// . TODO: only hash if unique????? set a flag on ht then i guess
-		if ( ! hashString ( md , mdlen , &hi ) ) return false;
+		if(!hashString4(md,mdlen, &hi))
+			return false;
 
 	return true;
 }
@@ -1537,7 +1526,7 @@ bool XmlDoc::hashMetaGeoPlacename( HashTableX *tt ) {
 	hi.m_hashGroup = HASHGROUP_INMETATAG;
 
 	// call XmlDoc::hashString
-	return hashString ( mgp , mgplen , &hi);
+	return hashString4(mgp, mgplen, &hi);
 }
 
 
@@ -1615,6 +1604,21 @@ void XmlDoc::sortTokenizerResult(TokenizerResult *tr) {
 	});
 }
 
+void XmlDoc::getLanguageAndCountry(lang_t *lang, const char **country_code) {
+	//get language and country if known, so tokenizer phase 2 can do its magic
+	uint8_t *tmpLangId = getLangId();
+	if(tmpLangId!=NULL && tmpLangId!=(uint8_t*)-1)
+		*lang = (lang_t)*tmpLangId;
+	else
+		*lang = langUnknown;
+
+	uint16_t *countryId = getCountryId();
+	if(countryId!=NULL && countryId!=(uint16_t*)-1)
+		*country_code = g_countryCode.getAbbr(*countryId);
+	else
+		*country_code = NULL;
+}
+
 bool XmlDoc::hashSingleTerm( const char *s, int32_t slen, HashInfo *hi ) {
 	// empty?
 	if ( slen <= 0 ) return true;
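Both getLangId() and getCountryId() follow the same tri-state pointer convention visible in the guards above: NULL and (T*)-1 both mean the value is unavailable, anything else is a valid pointer (that reading is inferred from this hunk; the exact meaning of each sentinel is not documented here). The new helper folds the check and the fallback defaults (langUnknown, no country) into one place instead of duplicating them per call site. A minimal compilable sketch of the convention:

#include <cstdint>
#include <cstdio>

// Tri-state pointer check, as in the guards of getLanguageAndCountry():
// NULL and (T*)-1 are both "unavailable" sentinels; anything else is valid.
template <typename T>
static bool isAvailable(const T *p) {
    return p != NULL && p != (const T *)-1;
}

int main() {
    uint8_t langId = 17;  // some detected language id
    const uint8_t *ok   = &langId;
    const uint8_t *none = NULL;
    const uint8_t *err  = (const uint8_t *)-1;
    printf("%d %d %d\n", isAvailable(ok), isAvailable(none), isAvailable(err)); // 1 0 0
    return 0;
}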
@@ -1736,6 +1740,23 @@ bool XmlDoc::hashString3(size_t begin_token, size_t end_token, HashInfo *hi,
 	return hashWords3( hi, &m_tokenizerResult, begin_token, end_token, NULL, &bits, NULL, NULL, NULL, wts, wbuf );
 }
 
+bool XmlDoc::hashString4(const char *s, int32_t slen, HashInfo *hi) {
+	TokenizerResult tr;
+	Bits bits;
+	lang_t lang_id;
+	const char *countryCode;
+
+	getLanguageAndCountry(&lang_id,&countryCode);
+	plain_tokenizer_phase_1(s,slen,&tr);
+	plain_tokenizer_phase_2(lang_id,countryCode,&tr);
+	calculate_tokens_hashes(&tr);
+	sortTokenizerResult(&tr);
+	if(!bits.set(&tr))
+		return false;
+
+	return hashWords3( hi, &tr, NULL, &bits, NULL, NULL, NULL, m_wts, &m_wbuf );
+}
+
 
 bool XmlDoc::hashWords ( HashInfo *hi ) {
 	// sanity checks
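hashString4() is the commit's central addition: every hashString call site converted above (meta tags, inlink text, keywords, summaries, geo placename) now funnels through the same tokenizer pipeline that hashTitle() uses — phase-1 tokenization, language/country-aware phase 2, per-token hashing, a position sort, bit flags, then hashWords3(). A self-contained toy in standard C++ showing the shape of that flow (the real phases and the Bits logic are project-specific; the comments name the corresponding functions from the hunk):

#include <algorithm>
#include <cctype>
#include <cstdint>
#include <functional>
#include <string>
#include <vector>

struct Token {
    size_t start_pos, end_pos;  // byte range in the source text
    std::string text;
    uint64_t hash = 0;
};

// Stand-in for plain_tokenizer_phase_1(): split into alphanumeric runs.
static std::vector<Token> phase1(const std::string &s) {
    std::vector<Token> out;
    for (size_t i = 0; i < s.size();) {
        if (!isalnum((unsigned char)s[i])) { ++i; continue; }
        size_t j = i;
        while (j < s.size() && isalnum((unsigned char)s[j])) ++j;
        out.push_back({i, j, s.substr(i, j - i)});
        i = j;
    }
    return out;
}

int main() {
    std::vector<Token> tr = phase1("John's cat");
    // plain_tokenizer_phase_2() would apply language/country-specific rewrites
    // here, possibly appending derived tokens out of positional order.
    for (Token &t : tr)                      // calculate_tokens_hashes()
        t.hash = std::hash<std::string>{}(t.text);
    std::sort(tr.begin(), tr.end(),          // sortTokenizerResult()
              [](const Token &a, const Token &b) { return a.start_pos < b.start_pos; });
    // ...the real code then builds Bits from the tokens and calls hashWords3().
    return 0;
}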
@@ -2038,7 +2059,7 @@ bool XmlDoc::hashWords3(HashInfo *hi, const TokenizerResult *tr, size_t begin_to
 			const auto &t2 = (*tr)[j];
 			if(t2.is_alfanum && t2.start_pos>=token.end_pos)
 				break;
-			if(!bits->canBeInPhrase(i) && !bits->canPairAcross(j)) {
+			if(!bits->canBeInPhrase(j) && !bits->canPairAcross(j)) {
 				generate_bigram = false;
 				break;
 			}
@@ -415,6 +415,7 @@ static void remove_some_combining_marks(TokenizerResult *tr, const UChar32 nativ
 //that could conceivably stand in for apostrophe. We do this in all languages because the abuse seem to know no language barrier
 static void combine_possessive_s_tokens(TokenizerResult *tr, lang_t lang) {
 	//Loop through original tokens, looking for <word> <blotch> "s". Combine the word with the letter s.
+	bool any_deleted = false;
 	const size_t org_token_count = tr->size();
 	for(size_t i=0; i+2<org_token_count; i++) {
 		const auto &t0 = (*tr)[i];
@@ -433,7 +434,7 @@ static void combine_possessive_s_tokens(TokenizerResult *tr, lang_t lang) {
 		//t1 must be a single blotch
 		if(t1.token_len>4)
 			continue;
-		UChar32 uc[2];
+		UChar32 uc[4];
 		int ucs = decode_utf8_string(t1.token_start,t1.token_len,uc);
 		if(ucs!=1)
 			continue;
@@ -473,7 +474,23 @@ static void combine_possessive_s_tokens(TokenizerResult *tr, lang_t lang) {
 		//	car
 		//and XmlDoc_indexing.cpp will generate the bigram "johns+car", but also "s+car".
 		//We remove the 's' token because it (a) causes trouble with weird bigrams, and (b) it has little meaning by itself.
-		tr->tokens.erase(tr->tokens.begin()+i+2);
+		tr->tokens[i+2].token_len = 0; //mark for delete
+		any_deleted = true;
+		//tr->tokens.erase(tr->tokens.begin()+i+2);
 	}
+	if(any_deleted) {
+		size_t src_idx=0;
+		size_t dst_idx = 0;
+		while(src_idx<tr->size()) {
+			if(tr->tokens[src_idx].token_len!=0) {
+				if(src_idx!=dst_idx)
+					tr->tokens[dst_idx] = tr->tokens[src_idx];
+				src_idx++;
+				dst_idx++;
+			} else
+				src_idx++;
+		}
+		tr->tokens.erase(tr->tokens.begin()+dst_idx,tr->tokens.end());
+	}
 }
 
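combine_possessive_s_tokens() previously erased each doomed token on the spot; with many deletions that makes the pass quadratic, because vector::erase shifts the entire tail on every call. The new version marks victims with token_len = 0 and compacts once at the end, doing one move per surviving token instead of one tail-shift per deletion. The hand-rolled src_idx/dst_idx loop is the classic erase/remove idiom; a compilable sketch of the same compaction on a simplified token type:

#include <algorithm>
#include <cstdio>
#include <vector>

struct TokenRange { size_t token_len; };  // simplified: only the field the pass uses

// One-pass compaction equivalent to the src_idx/dst_idx loop in the hunk:
// keep every token except those previously marked with token_len == 0.
static void compact(std::vector<TokenRange> &tokens) {
    tokens.erase(std::remove_if(tokens.begin(), tokens.end(),
                                [](const TokenRange &t) { return t.token_len == 0; }),
                 tokens.end());
}

int main() {
    std::vector<TokenRange> tokens = {{4}, {0}, {3}, {0}, {5}};
    compact(tokens);
    printf("%zu tokens left\n", tokens.size());  // prints: 3 tokens left
    return 0;
}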
@@ -781,6 +798,8 @@ static void recognize_telephone_numbers_sweden(TokenizerResult *tr) {
 			}
 		}
 	}
+	if(last_digit_token_idx>=org_token_count)
+		last_digit_token_idx = org_token_count-1;
 	if(digit_count<5)
 		continue;
 	if(digit_count>10)
@@ -1171,7 +1190,9 @@ static void rewrite_ampersands(TokenizerResult *tr, lang_t lang, const char *cou
 
 static void rewrite_ampersands(TokenizerResult *tr, const char *ampersand_word, size_t ampersand_word_len) {
 	char *s = NULL;
-	for(const auto &t : tr->tokens) {
+	size_t org_token_count = tr->size();
+	for(size_t i=1; i<org_token_count; i++) {
+		const auto &t = (*tr)[i];
 		if(t.token_len==1 && *t.token_start=='&') {
 			if(!s) {
 				s = (char*)tr->egstack.alloc(ampersand_word_len);
@@ -1285,6 +1306,8 @@ bool is_slash_abbreviation(const char *s, size_t slen) {
 
 
 static void collapse_slash_abbreviations(TokenizerResult *tr) {
 	//Replace simple <singleletter> '/' <singleletter> with a single token without the slash.
+#if 0
 	size_t org_token_count = tr->size();
 	for(size_t i=1; i+2<org_token_count; i++) {
 		const auto &t0 = (*tr)[i+0];
@@ -1309,4 +1332,43 @@ static void collapse_slash_abbreviations(TokenizerResult *tr) {
 			org_token_count -= 3;
 			i -= 2;
 		}
+#endif
+	//The ifdef'fed-out code above is the clean and simple algorithm. But it is horribly inefficient when encountering
+	//documents consisting almost entirely of slash-abbreviations, such as genome tables.
+	//Instead we iterate over the tokens with src,dst iterators, copying, deleting and modifying underway without causing
+	//reallocation of the underlying token vector (the eg stack is used though).
+	if(tr->size()<3)
+		return;
+	size_t src_idx = 0;
+	size_t dst_idx = 0;
+	size_t org_token_count = tr->tokens.size();
+	while(src_idx+2<org_token_count) {
+		const auto &t0 = (*tr)[src_idx+0];
+		const auto &t1 = (*tr)[src_idx+1];
+		const auto &t2 = (*tr)[src_idx+2];
+		if((!t0.is_alfanum || t1.is_alfanum || !t2.is_alfanum) ||
+		   (t1.token_len!=1 || t1.token_start[0]!='/') ||
+		   (!t0.is_primary || !t1.is_primary || !t2.is_primary) ||
+		   (t0.token_end()!=t1.token_start || t1.token_end()!=t2.token_start) ||
+		   (!is_slash_abbreviation(t0.token_start, t0.token_len+t1.token_len+t2.token_len)))
+		{
+			if(src_idx!=dst_idx)
+				tr->tokens[dst_idx] = tr->tokens[src_idx];
+			src_idx++;
+			dst_idx++;
+		} else {
+			size_t sl = t0.token_len + t2.token_len;
+			char *s = (char*)tr->egstack.alloc(sl);
+			memcpy(s, t0.token_start, t0.token_len);
+			memcpy(s+t0.token_len, t2.token_start, t2.token_len);
+			tr->tokens[dst_idx] = TokenRange(t0.start_pos, t2.end_pos, s,sl, false, true);
+
+			dst_idx++;
+			src_idx += 3;
+		}
+	}
+	while(src_idx<org_token_count)
+		tr->tokens[dst_idx++] = tr->tokens[src_idx++];
+	if(src_idx!=dst_idx)
+		tr->tokens.erase(tr->tokens.begin()+dst_idx,tr->tokens.end());
 }
 
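The replacement loop merges each qualifying <letters> '/' <letters> triple into one freshly allocated token while walking the vector once with src/dst cursors, so nothing is shifted or reallocated per match; the unit tests below expect, for example, "km/h" to come out as the single token "kmh". A self-contained toy of the same windowed two-pointer rewrite over plain strings (the real code additionally checks adjacency, is_primary, and is_slash_abbreviation()):

#include <cstdio>
#include <string>
#include <vector>

// Merge every  a "/" b  triple into "ab", in place, in a single pass.
static void collapse(std::vector<std::string> &tok) {
    size_t src = 0, dst = 0;
    const size_t n = tok.size();
    while (src + 2 < n) {
        if (tok[src + 1] == "/") {                 // the real predicate is far stricter
            tok[dst++] = tok[src] + tok[src + 2];  // consume three tokens, emit one
            src += 3;
        } else {
            tok[dst++] = tok[src++];
        }
    }
    while (src < n) tok[dst++] = tok[src++];       // copy the unprocessed tail
    tok.erase(tok.begin() + dst, tok.end());
}

int main() {
    std::vector<std::string> tok = {"80", "km", "/", "h", "limit"};
    collapse(tok);
    for (const std::string &t : tok) printf("'%s' ", t.c_str());  // '80' 'kmh' 'limit'
    printf("\n");
    return 0;
}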
@@ -42,7 +42,11 @@ public:
 		while(p1tokens<tr.size() && tr[p1tokens].is_primary)
 			p1tokens++;
 		printf("phase2-tokens: %u\n", (unsigned)(tr.size()-p1tokens));
-		for(unsigned i=p1tokens; i<tr.size(); i++)
+		for(unsigned i=0; i<tr.size(); i++)
+			if(!tr[i].is_primary || i>=p1tokens)
+				printf("  #%u: [%lu..%lu) '%.*s'\n", i, tr[i].start_pos, tr[i].end_pos, (int)tr[i].token_len, tr[i].token_start);
+		printf("all tokens: %u\n", (unsigned)(tr.size()));
+		for(unsigned i=0; i<tr.size(); i++)
 			printf("  #%u: [%lu..%lu) '%.*s'\n", i, tr[i].start_pos, tr[i].end_pos, (int)tr[i].token_len, tr[i].token_start);
 	}
 	bool empty() const { return tr.empty(); }
@@ -667,6 +671,23 @@ int main(void) {
 		assert(t.has_token("Johns"));
 	}
 
+	printf("Test line %d\n",__LINE__);
+	{
+		T2 t("John''''s dog",langEnglish);
+		assert(!t.has_token("John's"));
+		assert(!t.has_token("Johns"));
+	}
+
+	printf("Test line %d\n",__LINE__);
+	{
+		T2 t("John's cat bit Mary's dog's tail",langEnglish);
+		assert(t.has_token("John's"));
+		assert(t.has_token("cat"));
+		assert(t.has_token("bit"));
+		assert(t.has_token("Mary's"));
+		assert(t.has_token("dog's"));
+		assert(t.has_token("tail"));
+	}
+
 	//hyphenation
 	printf("Test line %d\n",__LINE__);
@@ -844,6 +865,11 @@ int main(void) {
 		T2 t("foo 040-99 88 77 boo",langSwedish);
 		assert(t.has_token("040998877"));
 	}
+	printf("Test line %d\n",__LINE__);
+	{
+		T2 t("foo 08-24 50 55",langSwedish);
+		assert(t.has_token("08245055"));
+	}
 
 	printf("Test line %d\n",__LINE__);
 	{
@@ -1056,12 +1082,22 @@ int main(void) {
 
 	printf("Test line %d\n",__LINE__);
 	{
-		T2 t("The smurf drove 80 km/h on the highway",langUnknown);
+		T2 t("The smurf drove 80 km/h on the highway, which is 22 m/s approximately",langUnknown);
 		assert(t.has_token("The"));
 		assert(t.has_token("smurf"));
 		assert(t.has_token("drove"));
 		assert(t.has_token("80"));
 		assert(t.has_token("kmh"));
 		assert(!t.has_token("km"));
 		assert(!t.has_token("h"));
 		assert(t.has_token("80"));
 		assert(t.has_token("on"));
 		assert(t.has_token("the"));
 		assert(t.has_token("highway"));
+		assert(t.has_token("which"));
+		assert(t.has_token("is"));
+		assert(t.has_token("ms"));
+		assert(t.has_token("approximately"));
 	}
 
 	return 0;