Merge branch 'master' into dev-siteinfo

Ai Lin Chia
2018-06-06 13:30:32 +02:00
7 changed files with 153 additions and 32 deletions

@@ -433,7 +433,7 @@ doc:
# used for tools/unittest
libgb.a: $(OBJS) libsto.a libword_variations.a libunicode.a
ar rcs $@ $^ word_variations/*.o sto/*.o unicode/*.o
ar rcs $@ $^ word_variations/*.o sto/*.o unicode/*.o tokenizer/*.o
.PHONY: tools
tools:

@@ -285,7 +285,7 @@ unsigned Pos::filter( const TokenizerResult *tr, int32_t a, int32_t b, bool addE
bool resetPunctCount = true;
if ( is_punct_utf8( p ) ) {
if (is_punct_utf8(p) && !is_wspace_utf8(p)) {
if ( ( cs == lastPunctSize) && ( memcmp(lastPunct, p, cs) == 0 ) ) {
resetPunctCount = false;
++samePunctCount;
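
Note on the Pos::filter change: the added !is_wspace_utf8() guard implies is_punct_utf8() also matches whitespace; without the guard, spaces would feed or reset the repeated-punctuation counter. A minimal ASCII sketch of the intended behaviour, using isalnum()/isspace() as stand-ins for the UTF-8 classifiers:

#include <cctype>
#include <cstdio>

// Sketch: count a run of identical non-whitespace punctuation, as the fixed
// condition does. Whitespace and alphanumerics leave the run untouched.
int main() {
    const char *s = "wow !!! !!!";
    int samePunctCount = 0;
    char lastPunct = '\0';
    for (const char *p = s; *p; p++) {
        if (!isalnum((unsigned char)*p) && !isspace((unsigned char)*p)) {
            if (*p == lastPunct) {
                ++samePunctCount;        // same mark repeated
            } else {
                lastPunct = *p;          // new punctuation mark, restart run
                samePunctCount = 1;
            }
        }
    }
    printf("same-punct run: %d\n", samePunctCount);  // prints 6: spaces ignored
    return 0;
}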

@@ -17684,7 +17684,7 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
if ( linkdbHostNum < numHosts ) {
linkdbHostId = hosts[linkdbHostNum].m_hostId ;
if( !hosts[linkdbHostNum].m_spiderEnabled) {
linkdbHostId = g_hostdb.getHostIdWithSpideringEnabled(linkdbShardNum, true);
linkdbHostId = g_hostdb.getHostIdWithSpideringEnabled(linkdbShardNum, false);
}
}
else {

@@ -526,6 +526,7 @@ public:
bool hashLanguageString ( class HashTableX *table ) ;
bool hashCountry ( class HashTableX *table ) ;
void sortTokenizerResult(TokenizerResult *tr);
void getLanguageAndCountry(lang_t *lang, const char **country_code);
class Url *getBaseUrl ( ) ;
@@ -571,6 +572,7 @@ public:
Sections *sections, const Bits *bits,
const char *fragVec, const char *wordSpamVec, const char *langVec,
HashTableX *wts, SafeBuf *wbuf);
bool hashString4(const char *s, int32_t slen, HashInfo *hi);
// print out for PageTitledb.cpp and PageParser.cpp

@@ -576,15 +576,10 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
// are used in user searches automatically.
hi.m_prefix = NULL;
// desc is NULL, prefix will be used as desc
bool status = hashString ( s,len, &hi );
bool status = hashString4(s,len,&hi);
// bail on error, g_errno should be set
if ( ! status ) return false;
// return false with g_errno set on error
//if ( ! hashNumberForSorting ( buf , bufLen , &hi ) )
// return false;
}
return true;
@@ -1267,7 +1262,7 @@ bool XmlDoc::hashIncomingLinkText(HashTableX *tt) {
// . we still have the score punish from # of words though!
// . for inlink texts that are the same it should accumulate
// and use the reserved bits as a multiplier i guess...
if ( ! hashString ( txt,tlen,&hi) ) return false;
if ( ! hashString4(txt,tlen,&hi) ) return false;
// now record this so we can match the link text to
// a matched offsite inlink text term in the scoring info
//k->m_wordPosEnd = hi.m_startDist;
@@ -1352,20 +1347,12 @@ bool XmlDoc::hashTitle ( HashTableX *tt ) {
//get language and country if known, so tokenizer phase 2 can do its magic
lang_t lang_id;
uint8_t *tmpLangId = getLangId();
if(tmpLangId!=NULL && tmpLangId!=(uint8_t*)-1)
lang_id = (lang_t)*tmpLangId;
else
lang_id = langUnknown;
const char *countryCode = NULL;
uint16_t *countryId = getCountryId();
if(countryId!=NULL && countryId!=(uint16_t*)-1)
countryCode = g_countryCode.getAbbr(*countryId);
const char *countryCode;
getLanguageAndCountry(&lang_id,&countryCode);
TokenizerResult tr;
plain_tokenizer_phase_1(title,titleLen,&tr);
plain_tokenizer_phase_2((lang_t)lang_id, countryCode, &tr);
plain_tokenizer_phase_2(lang_id, countryCode, &tr);
calculate_tokens_hashes(&tr);
sortTokenizerResult(&tr);
@@ -1434,7 +1421,7 @@ bool XmlDoc::hashMetaKeywords ( HashTableX *tt ) {
hi.m_hashGroup = HASHGROUP_INMETATAG;
// call XmlDoc::hashString
return hashString ( mk , mklen , &hi);
return hashString4(mk, mklen, &hi);
}
@@ -1467,7 +1454,7 @@ bool XmlDoc::hashExplicitKeywords(HashTableX *tt) {
hi.m_tt = tt;
hi.m_desc = "explicit keywords";
hi.m_hashGroup = HASHGROUP_EXPLICIT_KEYWORDS;
return hashString(ptr_explicitKeywords, size_explicitKeywords, &hi);
return hashString4(ptr_explicitKeywords, size_explicitKeywords, &hi);
} else
return true; //nothing done - no error
}
@@ -1506,7 +1493,8 @@ bool XmlDoc::hashMetaSummary ( HashTableX *tt ) {
// update hashing parms
hi.m_desc = "meta summary";
// hash it
if ( ! hashString ( ms , mslen , &hi )) return false;
if(!hashString4(ms,mslen,&hi))
return false;
//len = m_xml.getMetaContent ( buf , 2048 , "description" , 11 );
@@ -1517,7 +1505,8 @@
// update hashing parms
hi.m_desc = "meta desc";
// . TODO: only hash if unique????? set a flag on ht then i guess
if ( ! hashString ( md , mdlen , &hi ) ) return false;
if(!hashString4(md,mdlen, &hi))
return false;
return true;
}
@@ -1537,7 +1526,7 @@ bool XmlDoc::hashMetaGeoPlacename( HashTableX *tt ) {
hi.m_hashGroup = HASHGROUP_INMETATAG;
// call XmlDoc::hashString
return hashString ( mgp , mgplen , &hi);
return hashString4(mgp, mgplen, &hi);
}
@@ -1615,6 +1604,21 @@ void XmlDoc::sortTokenizerResult(TokenizerResult *tr) {
});
}
void XmlDoc::getLanguageAndCountry(lang_t *lang, const char **country_code) {
//get language and country if known, so tokenizer phase 2 can do its magic
uint8_t *tmpLangId = getLangId();
if(tmpLangId!=NULL && tmpLangId!=(uint8_t*)-1)
*lang = (lang_t)*tmpLangId;
else
*lang = langUnknown;
uint16_t *countryId = getCountryId();
if(countryId!=NULL && countryId!=(uint16_t*)-1)
*country_code = g_countryCode.getAbbr(*countryId);
else
*country_code = NULL;
}
bool XmlDoc::hashSingleTerm( const char *s, int32_t slen, HashInfo *hi ) {
// empty?
if ( slen <= 0 ) return true;
@@ -1736,6 +1740,23 @@ bool XmlDoc::hashString3(size_t begin_token, size_t end_token, HashInfo *hi,
return hashWords3( hi, &m_tokenizerResult, begin_token, end_token, NULL, &bits, NULL, NULL, NULL, wts, wbuf );
}
bool XmlDoc::hashString4(const char *s, int32_t slen, HashInfo *hi) {
TokenizerResult tr;
Bits bits;
lang_t lang_id;
const char *countryCode;
getLanguageAndCountry(&lang_id,&countryCode);
plain_tokenizer_phase_1(s,slen,&tr);
plain_tokenizer_phase_2(lang_id,countryCode,&tr);
calculate_tokens_hashes(&tr);
sortTokenizerResult(&tr);
if(!bits.set(&tr))
return false;
return hashWords3( hi, &tr, NULL, &bits, NULL, NULL, NULL, m_wts, &m_wbuf );
}
bool XmlDoc::hashWords ( HashInfo *hi ) {
// sanity checks
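
hashString4() is the new entry point that gives arbitrary strings (meta tags, titles, link text) the same treatment as document text: phase-1 tokenization, language/country-aware phase-2 rewriting, per-token hashing, then a position sort before handing off to hashWords3(). A self-contained toy of that flow with stand-in types (the real ones are TokenizerResult and friends); the phase-2 body below is invented solely to show why the sort step is needed, since phase 2 appends its derived tokens after the primary ones:

#include <cctype>
#include <cstdio>
#include <algorithm>
#include <functional>
#include <string>
#include <vector>

// Toy walk-through of the flow hashString4() wires together: phase-1 split,
// language-aware phase-2 rewrite, per-token hashing, position sort. Token and
// both phase functions are illustrative stand-ins, not the real
// TokenizerResult API.
struct Token { size_t start; std::string text; };

static void phase1(const std::string &s, std::vector<Token> &out) {
    for (size_t i = 0; i < s.size(); ) {
        if (isalnum((unsigned char)s[i])) {
            size_t b = i;
            while (i < s.size() && isalnum((unsigned char)s[i]))
                i++;
            out.push_back({b, s.substr(b, i - b)});
        } else {
            i++;
        }
    }
}

static void phase2(std::vector<Token> &toks) {
    // Invented stand-in for the language/country-dependent rewrites
    // (possessive-s, ampersand words, slash abbreviations, ...): append a
    // lower-cased variant of each token, the way the real phase 2 appends
    // derived tokens after the primary ones.
    const size_t n = toks.size();
    for (size_t i = 0; i < n; i++) {
        std::string lower = toks[i].text;
        std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower);
        if (lower != toks[i].text)
            toks.push_back({toks[i].start, lower});
    }
}

int main() {
    std::vector<Token> toks;
    phase1("Johns Car", toks);
    phase2(toks);                        // appends out-of-position variants
    std::sort(toks.begin(), toks.end(),  // hence the sortTokenizerResult() step
              [](const Token &a, const Token &b) { return a.start < b.start; });
    for (const auto &t : toks)
        printf("[%zu] %-6s -> %016zx\n", t.start, t.text.c_str(),
               std::hash<std::string>{}(t.text));
    return 0;
}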
@@ -2038,7 +2059,7 @@ bool XmlDoc::hashWords3(HashInfo *hi, const TokenizerResult *tr, size_t begin_to
const auto &t2 = (*tr)[j];
if(t2.is_alfanum && t2.start_pos>=token.end_pos)
break;
if(!bits->canBeInPhrase(i) && !bits->canPairAcross(j)) {
if(!bits->canBeInPhrase(j) && !bits->canPairAcross(j)) {
generate_bigram = false;
break;
}
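
The i-to-j fix above matters for bigram generation: the inner loop scans forward from token i to the candidate second token j, so the phrase test must interrogate j; re-testing i (which already passed) could let bigrams form across tokens that should block them. A toy illustration, assuming a single hypothetical per-token flag and leaving out the canPairAcross() half of the real condition:

#include <cstdio>
#include <string>
#include <vector>

// Hypothetical tokens with a "may participate in a phrase" bit, mimicking
// Bits::canBeInPhrase(). The quoted token should block its bigram.
struct Tok { std::string text; bool can_be_in_phrase; };

int main() {
    std::vector<Tok> toks = { {"john", true}, {"car", true}, {"\"quoted\"", false} };
    for (size_t i = 0; i + 1 < toks.size(); i++) {
        if (!toks[i].can_be_in_phrase)
            continue;
        size_t j = i + 1;
        if (!toks[j].can_be_in_phrase)   // fixed test: ask about j...
            continue;                    // ...the old code re-tested i here
        printf("bigram: %s+%s\n", toks[i].text.c_str(), toks[j].text.c_str());
    }
    // prints only "john+car"; re-testing i would also emit car+"quoted"
    return 0;
}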

@@ -415,6 +415,7 @@ static void remove_some_combining_marks(TokenizerResult *tr, const UChar32 nativ
//that could conceivably stand in for an apostrophe. We do this in all languages because the abuse seems to know no language barrier
static void combine_possessive_s_tokens(TokenizerResult *tr, lang_t lang) {
//Loop through original tokens, looking for <word> <blotch> "s". Combine the word with the letter s.
bool any_deleted = false;
const size_t org_token_count = tr->size();
for(size_t i=0; i+2<org_token_count; i++) {
const auto &t0 = (*tr)[i];
@@ -433,7 +434,7 @@ static void combine_possessive_s_tokens(TokenizerResult *tr, lang_t lang) {
//t1 must be a single blotch
if(t1.token_len>4)
continue;
UChar32 uc[2];
UChar32 uc[4];
int ucs = decode_utf8_string(t1.token_start,t1.token_len,uc);
if(ucs!=1)
continue;
@@ -473,7 +474,23 @@ static void combine_possessive_s_tokens(TokenizerResult *tr, lang_t lang) {
// car
//and XmlDoc_indexing.cpp will generate the bigram "johns+car", but also "s+car".
//We remove the 's' token because it (a) causes trouble with weird bigrams, and (b) it has little meaning by itself.
tr->tokens.erase(tr->tokens.begin()+i+2);
tr->tokens[i+2].token_len = 0; //mark for delete
any_deleted = true;
//tr->tokens.erase(tr->tokens.begin()+i+2);
}
if(any_deleted) {
size_t src_idx=0;
size_t dst_idx = 0;
while(src_idx<tr->size()) {
if(tr->tokens[src_idx].token_len!=0) {
if(src_idx!=dst_idx)
tr->tokens[dst_idx] = tr->tokens[src_idx];
src_idx++;
dst_idx++;
} else
src_idx++;
}
tr->tokens.erase(tr->tokens.begin()+dst_idx,tr->tokens.end());
}
}
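
The combine_possessive_s_tokens() change replaces one tokens.erase() per match (each erase shifts the whole tail, O(n^2) on 's-heavy input) with a mark-for-delete pass plus a single compaction. The hand-rolled compaction loop is the classic erase-remove idiom; a minimal equivalent on a plain vector:

#include <algorithm>
#include <cstdio>
#include <vector>

// Equivalent of the diff's mark-then-compact loop: one linear pass instead
// of one vector::erase() per deleted element.
int main() {
    std::vector<int> lens = {3, 0, 5, 0, 0, 2};  // 0 = "marked for delete"
    lens.erase(std::remove_if(lens.begin(), lens.end(),
                              [](int len) { return len == 0; }),
               lens.end());
    for (int len : lens)
        printf("%d ", len);                      // prints: 3 5 2
    printf("\n");
    return 0;
}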
@@ -781,6 +798,8 @@ static void recognize_telephone_numbers_sweden(TokenizerResult *tr) {
}
}
}
if(last_digit_token_idx>=org_token_count)
last_digit_token_idx = org_token_count-1;
if(digit_count<5)
continue;
if(digit_count>10)
@@ -1171,7 +1190,9 @@ static void rewrite_ampersands(TokenizerResult *tr, lang_t lang, const char *cou
static void rewrite_ampersands(TokenizerResult *tr, const char *ampersand_word, size_t ampersand_word_len) {
char *s = NULL;
for(const auto &t : tr->tokens) {
size_t org_token_count = tr->size();
for(size_t i=1; i<org_token_count; i++) {
const auto &t = (*tr)[i];
if(t.token_len==1 && *t.token_start=='&') {
if(!s) {
s = (char*)tr->egstack.alloc(ampersand_word_len);
@@ -1285,6 +1306,8 @@ bool is_slash_abbreviation(const char *s, size_t slen) {
static void collapse_slash_abbreviations(TokenizerResult *tr) {
//Replace simple <singleletter> '/' <singleletter> with a single token without the slash.
#if 0
size_t org_token_count = tr->size();
for(size_t i=1; i+2<org_token_count; i++) {
const auto &t0 = (*tr)[i+0];
@@ -1309,4 +1332,43 @@ static void collapse_slash_abbreviations(TokenizerResult *tr) {
org_token_count -= 3;
i -= 2;
}
#endif
//The ifdef'ed-out code above is the clean and simple algorithm. But it is horribly inefficient when encountering
//documents consisting almost entirely of slash-abbreviations, such as genome tables.
//Instead we iterate over the tokens with src,dst iterators, copying, deleting and modifying along the way without causing
//reallocation of the underlying token vector (the eg stack is used though).
if(tr->size()<3)
return;
size_t src_idx = 0;
size_t dst_idx = 0;
size_t org_token_count = tr->tokens.size();
while(src_idx+2<org_token_count) {
const auto &t0 = (*tr)[src_idx+0];
const auto &t1 = (*tr)[src_idx+1];
const auto &t2 = (*tr)[src_idx+2];
if((!t0.is_alfanum || t1.is_alfanum || !t2.is_alfanum) ||
(t1.token_len!=1 || t1.token_start[0]!='/') ||
(!t0.is_primary || !t1.is_primary || !t2.is_primary) ||
(t0.token_end()!=t1.token_start || t1.token_end()!=t2.token_start) ||
(!is_slash_abbreviation(t0.token_start, t0.token_len+t1.token_len+t2.token_len)))
{
if(src_idx!=dst_idx)
tr->tokens[dst_idx] = tr->tokens[src_idx];
src_idx++;
dst_idx++;
} else {
size_t sl = t0.token_len + t2.token_len;
char *s = (char*)tr->egstack.alloc(sl);
memcpy(s, t0.token_start, t0.token_len);
memcpy(s+t0.token_len, t2.token_start, t2.token_len);
tr->tokens[dst_idx] = TokenRange(t0.start_pos, t2.end_pos, s,sl, false, true);
dst_idx++;
src_idx += 3;
}
}
while(src_idx<org_token_count)
tr->tokens[dst_idx++] = tr->tokens[src_idx++];
if(src_idx!=dst_idx)
tr->tokens.erase(tr->tokens.begin()+dst_idx,tr->tokens.end());
}
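
The same concern drives the collapse_slash_abbreviations() rewrite: the #if 0 version erases three tokens and inserts one per match, which degenerates on slash-heavy documents such as the genome tables mentioned in the comment. The replacement generalizes erase-remove to a three-token window with src/dst cursors: fuse or copy at dst while scanning src, then truncate once. A toy version on strings, assuming every <word> '/' <word> triple should fuse:

#include <cstdio>
#include <string>
#include <vector>

// Two-pointer compaction over a 3-element window: fuse "a", "/", "b" into
// "ab" in place, truncating the tail only once. Mirrors the shape of the
// rewritten collapse_slash_abbreviations().
int main() {
    std::vector<std::string> toks = {"km", "/", "h", "on", "m", "/", "s"};
    size_t src = 0, dst = 0;
    const size_t n = toks.size();
    while (src + 2 < n) {
        if (toks[src + 1] == "/") {                 // match: fuse the triple
            toks[dst++] = toks[src] + toks[src + 2];
            src += 3;
        } else {                                    // no match: keep one token
            if (src != dst)
                toks[dst] = toks[src];
            src++;
            dst++;
        }
    }
    while (src < n)                                 // copy the unexamined tail
        toks[dst++] = toks[src++];
    toks.erase(toks.begin() + dst, toks.end());     // single truncation
    for (const auto &t : toks)
        printf("%s ", t.c_str());                   // prints: kmh on ms
    printf("\n");
    return 0;
}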

@@ -42,7 +42,11 @@ public:
while(p1tokens<tr.size() && tr[p1tokens].is_primary)
p1tokens++;
printf("phase2-tokens: %u\n", (unsigned)(tr.size()-p1tokens));
for(unsigned i=p1tokens; i<tr.size(); i++)
for(unsigned i=0; i<tr.size(); i++)
if(!tr[i].is_primary || i>=p1tokens)
printf(" #%u: [%lu..%lu) '%.*s'\n", i, tr[i].start_pos, tr[i].end_pos, (int)tr[i].token_len, tr[i].token_start);
printf("all tokens: %u\n", (unsigned)(tr.size()));
for(unsigned i=0; i<tr.size(); i++)
printf(" #%u: [%lu..%lu) '%.*s'\n", i, tr[i].start_pos, tr[i].end_pos, (int)tr[i].token_len, tr[i].token_start);
}
bool empty() const { return tr.empty(); }
@@ -667,6 +671,23 @@ int main(void) {
assert(t.has_token("Johns"));
}
printf("Test line %d\n",__LINE__);
{
T2 t("John''''s dog",langEnglish);
assert(!t.has_token("John's"));
assert(!t.has_token("Johns"));
}
printf("Test line %d\n",__LINE__);
{
T2 t("John's cat bit Mary's dog's tail",langEnglish);
assert(t.has_token("John's"));
assert(t.has_token("cat"));
assert(t.has_token("bit"));
assert(t.has_token("Mary's"));
assert(t.has_token("dog's"));
assert(t.has_token("tail"));
}
//hyphenation
printf("Test line %d\n",__LINE__);
@@ -844,6 +865,11 @@ int main(void) {
T2 t("foo 040-99 88 77 boo",langSwedish);
assert(t.has_token("040998877"));
}
printf("Test line %d\n",__LINE__);
{
T2 t("foo 08-24 50 55",langSwedish);
assert(t.has_token("08245055"));
}
printf("Test line %d\n",__LINE__);
{
@@ -1056,12 +1082,22 @@ int main(void) {
printf("Test line %d\n",__LINE__);
{
T2 t("The smurf drove 80 km/h on the highway",langUnknown);
T2 t("The smurf drove 80 km/h on the highway, which is 22 m/s approximately",langUnknown);
assert(t.has_token("The"));
assert(t.has_token("smurf"));
assert(t.has_token("drove"));
assert(t.has_token("80"));
assert(t.has_token("kmh"));
assert(!t.has_token("km"));
assert(!t.has_token("h"));
assert(t.has_token("80"));
assert(t.has_token("on"));
assert(t.has_token("the"));
assert(t.has_token("highway"));
assert(t.has_token("which"));
assert(t.has_token("is"));
assert(t.has_token("ms"));
assert(t.has_token("approximately"));
}
return 0;