Merge branch 'master' into dev-siteinfo

Ai Lin Chia
2018-06-06 13:30:32 +02:00
7 changed files with 153 additions and 32 deletions

@@ -433,7 +433,7 @@ doc:
# used for tools/unittest
libgb.a: $(OBJS) libsto.a libword_variations.a libunicode.a
ar rcs $@ $^ word_variations/*.o sto/*.o unicode/*.o
ar rcs $@ $^ word_variations/*.o sto/*.o unicode/*.o tokenizer/*.o
.PHONY: tools
tools:

@@ -285,7 +285,7 @@ unsigned Pos::filter( const TokenizerResult *tr, int32_t a, int32_t b, bool addE
bool resetPunctCount = true;
if ( is_punct_utf8( p ) ) {
if (is_punct_utf8(p) && !is_wspace_utf8(p)) {
if ( ( cs == lastPunctSize) && ( memcmp(lastPunct, p, cs) == 0 ) ) {
resetPunctCount = false;
++samePunctCount;
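
Note on the Pos::filter change: the added !is_wspace_utf8() guard implies is_punct_utf8() also matches whitespace; without the guard, spaces would feed or reset the repeated-punctuation counter. A minimal ASCII sketch of the intended behaviour, using isalnum()/isspace() as stand-ins for the UTF-8 classifiers:

#include <cctype>
#include <cstdio>

// Sketch: count a run of identical non-whitespace punctuation, as the fixed
// condition does. Whitespace and alphanumerics leave the run untouched.
int main() {
    const char *s = "wow !!! !!!";
    int samePunctCount = 0;
    char lastPunct = '\0';
    for (const char *p = s; *p; p++) {
        if (!isalnum((unsigned char)*p) && !isspace((unsigned char)*p)) {
            if (*p == lastPunct) {
                ++samePunctCount;        // same mark repeated
            } else {
                lastPunct = *p;          // new punctuation mark, restart run
                samePunctCount = 1;
            }
        }
    }
    printf("same-punct run: %d\n", samePunctCount);  // prints 6: spaces ignored
    return 0;
}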

@@ -17684,7 +17684,7 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
if ( linkdbHostNum < numHosts ) {
linkdbHostId = hosts[linkdbHostNum].m_hostId ;
if( !hosts[linkdbHostNum].m_spiderEnabled) {
linkdbHostId = g_hostdb.getHostIdWithSpideringEnabled(linkdbShardNum, true);
linkdbHostId = g_hostdb.getHostIdWithSpideringEnabled(linkdbShardNum, false);
}
}
else {

@@ -526,6 +526,7 @@ public:
bool hashLanguageString ( class HashTableX *table ) ;
bool hashCountry ( class HashTableX *table ) ;
void sortTokenizerResult(TokenizerResult *tr);
void getLanguageAndCountry(lang_t *lang, const char **country_code);
class Url *getBaseUrl ( ) ;
@@ -571,6 +572,7 @@ public:
Sections *sections, const Bits *bits,
const char *fragVec, const char *wordSpamVec, const char *langVec,
HashTableX *wts, SafeBuf *wbuf);
bool hashString4(const char *s, int32_t slen, HashInfo *hi);
// print out for PageTitledb.cpp and PageParser.cpp

@@ -576,15 +576,10 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
// are used in user searches automatically.
hi.m_prefix = NULL;
// desc is NULL, prefix will be used as desc
bool status = hashString ( s,len, &hi );
bool status = hashString4(s,len,&hi);
// bail on error, g_errno should be set
if ( ! status ) return false;
// return false with g_errno set on error
//if ( ! hashNumberForSorting ( buf , bufLen , &hi ) )
// return false;
}
return true;
@@ -1267,7 +1262,7 @@ bool XmlDoc::hashIncomingLinkText(HashTableX *tt) {
// . we still have the score punish from # of words though!
// . for inlink texts that are the same it should accumulate
// and use the reserved bits as a multiplier i guess...
if ( ! hashString ( txt,tlen,&hi) ) return false;
if ( ! hashString4(txt,tlen,&hi) ) return false;
// now record this so we can match the link text to
// a matched offsite inlink text term in the scoring info
//k->m_wordPosEnd = hi.m_startDist;
@@ -1352,20 +1347,12 @@ bool XmlDoc::hashTitle ( HashTableX *tt ) {
//get language and country if known, so tokenizer phase 2 can do its magic
lang_t lang_id;
uint8_t *tmpLangId = getLangId();
if(tmpLangId!=NULL && tmpLangId!=(uint8_t*)-1)
lang_id = (lang_t)*tmpLangId;
else
lang_id = langUnknown;
const char *countryCode = NULL;
uint16_t *countryId = getCountryId();
if(countryId!=NULL && countryId!=(uint16_t*)-1)
countryCode = g_countryCode.getAbbr(*countryId);
const char *countryCode;
getLanguageAndCountry(&lang_id,&countryCode);
TokenizerResult tr;
plain_tokenizer_phase_1(title,titleLen,&tr);
plain_tokenizer_phase_2((lang_t)lang_id, countryCode, &tr);
plain_tokenizer_phase_2(lang_id, countryCode, &tr);
calculate_tokens_hashes(&tr);
sortTokenizerResult(&tr);
@@ -1434,7 +1421,7 @@ bool XmlDoc::hashMetaKeywords ( HashTableX *tt ) {
hi.m_hashGroup = HASHGROUP_INMETATAG;
// call XmlDoc::hashString
return hashString ( mk , mklen , &hi);
return hashString4(mk, mklen, &hi);
}
@@ -1467,7 +1454,7 @@ bool XmlDoc::hashExplicitKeywords(HashTableX *tt) {
hi.m_tt = tt;
hi.m_desc = "explicit keywords";
hi.m_hashGroup = HASHGROUP_EXPLICIT_KEYWORDS;
return hashString(ptr_explicitKeywords, size_explicitKeywords, &hi);
return hashString4(ptr_explicitKeywords, size_explicitKeywords, &hi);
} else
return true; //nothing done - no error
}
@@ -1506,7 +1493,8 @@ bool XmlDoc::hashMetaSummary ( HashTableX *tt ) {
// update hashing parms
hi.m_desc = "meta summary";
// hash it
if ( ! hashString ( ms , mslen , &hi )) return false;
if(!hashString4(ms,mslen,&hi))
return false;
//len = m_xml.getMetaContent ( buf , 2048 , "description" , 11 );
@@ -1517,7 +1505,8 @@
// update hashing parms
hi.m_desc = "meta desc";
// . TODO: only hash if unique????? set a flag on ht then i guess
if ( ! hashString ( md , mdlen , &hi ) ) return false;
if(!hashString4(md,mdlen, &hi))
return false;
return true;
}
@@ -1537,7 +1526,7 @@ bool XmlDoc::hashMetaGeoPlacename( HashTableX *tt ) {
hi.m_hashGroup = HASHGROUP_INMETATAG;
// call XmlDoc::hashString
return hashString ( mgp , mgplen , &hi);
return hashString4(mgp, mgplen, &hi);
}
@@ -1615,6 +1604,21 @@ void XmlDoc::sortTokenizerResult(TokenizerResult *tr) {
});
}
void XmlDoc::getLanguageAndCountry(lang_t *lang, const char **country_code) {
//get language and country if known, so tokenizer phase 2 can do its magic
uint8_t *tmpLangId = getLangId();
if(tmpLangId!=NULL && tmpLangId!=(uint8_t*)-1)
*lang = (lang_t)*tmpLangId;
else
*lang = langUnknown;
uint16_t *countryId = getCountryId();
if(countryId!=NULL && countryId!=(uint16_t*)-1)
*country_code = g_countryCode.getAbbr(*countryId);
else
*country_code = NULL;
}
bool XmlDoc::hashSingleTerm( const char *s, int32_t slen, HashInfo *hi ) {
// empty?
if ( slen <= 0 ) return true;
@@ -1736,6 +1740,23 @@ bool XmlDoc::hashString3(size_t begin_token, size_t end_token, HashInfo *hi,
return hashWords3( hi, &m_tokenizerResult, begin_token, end_token, NULL, &bits, NULL, NULL, NULL, wts, wbuf );
}
bool XmlDoc::hashString4(const char *s, int32_t slen, HashInfo *hi) {
TokenizerResult tr;
Bits bits;
lang_t lang_id;
const char *countryCode;
getLanguageAndCountry(&lang_id,&countryCode);
plain_tokenizer_phase_1(s,slen,&tr);
plain_tokenizer_phase_2(lang_id,countryCode,&tr);
calculate_tokens_hashes(&tr);
sortTokenizerResult(&tr);
if(!bits.set(&tr))
return false;
return hashWords3( hi, &tr, NULL, &bits, NULL, NULL, NULL, m_wts, &m_wbuf );
}
bool XmlDoc::hashWords ( HashInfo *hi ) {
// sanity checks
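
hashString4() is the new entry point that gives arbitrary strings (meta tags, titles, link text) the same treatment as document text: phase-1 tokenization, language/country-aware phase-2 rewriting, per-token hashing, then a position sort before handing off to hashWords3(). A self-contained toy of that flow with stand-in types (the real ones are TokenizerResult and friends); the phase-2 body below is invented solely to show why the sort step is needed, since phase 2 appends its derived tokens after the primary ones:

#include <cctype>
#include <cstdio>
#include <algorithm>
#include <functional>
#include <string>
#include <vector>

// Toy walk-through of the flow hashString4() wires together: phase-1 split,
// language-aware phase-2 rewrite, per-token hashing, position sort. Token and
// both phase functions are illustrative stand-ins, not the real
// TokenizerResult API.
struct Token { size_t start; std::string text; };

static void phase1(const std::string &s, std::vector<Token> &out) {
    for (size_t i = 0; i < s.size(); ) {
        if (isalnum((unsigned char)s[i])) {
            size_t b = i;
            while (i < s.size() && isalnum((unsigned char)s[i]))
                i++;
            out.push_back({b, s.substr(b, i - b)});
        } else {
            i++;
        }
    }
}

static void phase2(std::vector<Token> &toks) {
    // Invented stand-in for the language/country-dependent rewrites
    // (possessive-s, ampersand words, slash abbreviations, ...): append a
    // lower-cased variant of each token, the way the real phase 2 appends
    // derived tokens after the primary ones.
    const size_t n = toks.size();
    for (size_t i = 0; i < n; i++) {
        std::string lower = toks[i].text;
        std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower);
        if (lower != toks[i].text)
            toks.push_back({toks[i].start, lower});
    }
}

int main() {
    std::vector<Token> toks;
    phase1("Johns Car", toks);
    phase2(toks);                        // appends out-of-position variants
    std::sort(toks.begin(), toks.end(),  // hence the sortTokenizerResult() step
              [](const Token &a, const Token &b) { return a.start < b.start; });
    for (const auto &t : toks)
        printf("[%zu] %-6s -> %016zx\n", t.start, t.text.c_str(),
               std::hash<std::string>{}(t.text));
    return 0;
}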
@@ -2038,7 +2059,7 @@ bool XmlDoc::hashWords3(HashInfo *hi, const TokenizerResult *tr, size_t begin_to
const auto &t2 = (*tr)[j];
if(t2.is_alfanum && t2.start_pos>=token.end_pos)
break;
if(!bits->canBeInPhrase(i) && !bits->canPairAcross(j)) {
if(!bits->canBeInPhrase(j) && !bits->canPairAcross(j)) {
generate_bigram = false;
break;
}
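
The i-to-j fix above matters for bigram generation: the inner loop scans forward from token i to the candidate second token j, so the phrase test must interrogate j; re-testing i (which already passed) could let bigrams form across tokens that should block them. A toy illustration, assuming a single hypothetical per-token flag and leaving out the canPairAcross() half of the real condition:

#include <cstdio>
#include <string>
#include <vector>

// Hypothetical tokens with a "may participate in a phrase" bit, mimicking
// Bits::canBeInPhrase(). The quoted token should block its bigram.
struct Tok { std::string text; bool can_be_in_phrase; };

int main() {
    std::vector<Tok> toks = { {"john", true}, {"car", true}, {"\"quoted\"", false} };
    for (size_t i = 0; i + 1 < toks.size(); i++) {
        if (!toks[i].can_be_in_phrase)
            continue;
        size_t j = i + 1;
        if (!toks[j].can_be_in_phrase)   // fixed test: ask about j...
            continue;                    // ...the old code re-tested i here
        printf("bigram: %s+%s\n", toks[i].text.c_str(), toks[j].text.c_str());
    }
    // prints only "john+car"; re-testing i would also emit car+"quoted"
    return 0;
}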

@@ -415,6 +415,7 @@ static void remove_some_combining_marks(TokenizerResult *tr, const UChar32 nativ
//that could conceivably stand in for an apostrophe. We do this in all languages because the abuse seems to know no language barrier
static void combine_possessive_s_tokens(TokenizerResult *tr, lang_t lang) {
//Loop through original tokens, looking for <word> <blotch> "s". Combine the word with the letter s.
bool any_deleted = false;
const size_t org_token_count = tr->size();
for(size_t i=0; i+2<org_token_count; i++) {
const auto &t0 = (*tr)[i];
@@ -433,7 +434,7 @@ static void combine_possessive_s_tokens(TokenizerResult *tr, lang_t lang) {
//t1 must be a single blotch
if(t1.token_len>4)
continue;
UChar32 uc[2];
UChar32 uc[4];
int ucs = decode_utf8_string(t1.token_start,t1.token_len,uc);
if(ucs!=1)
continue;
@@ -473,7 +474,23 @@ static void combine_possessive_s_tokens(TokenizerResult *tr, lang_t lang) {
// car
//and XmlDoc_indexing.cpp will generate the bigram "johns+car", but also "s+car".
//We remove the 's' token because it (a) causes trouble with weird bigrams, and (b) it has little meaning by itself.
tr->tokens.erase(tr->tokens.begin()+i+2);
tr->tokens[i+2].token_len = 0; //mark for delete
any_deleted = true;
//tr->tokens.erase(tr->tokens.begin()+i+2);
}
if(any_deleted) {
size_t src_idx=0;
size_t dst_idx = 0;
while(src_idx<tr->size()) {
if(tr->tokens[src_idx].token_len!=0) {
if(src_idx!=dst_idx)
tr->tokens[dst_idx] = tr->tokens[src_idx];
src_idx++;
dst_idx++;
} else
src_idx++;
}
tr->tokens.erase(tr->tokens.begin()+dst_idx,tr->tokens.end());
}
}
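
The combine_possessive_s_tokens() change replaces one tokens.erase() per match (each erase shifts the whole tail, O(n^2) on 's-heavy input) with a mark-for-delete pass plus a single compaction. The hand-rolled compaction loop is the classic erase-remove idiom; a minimal equivalent on a plain vector:

#include <algorithm>
#include <cstdio>
#include <vector>

// Equivalent of the diff's mark-then-compact loop: one linear pass instead
// of one vector::erase() per deleted element.
int main() {
    std::vector<int> lens = {3, 0, 5, 0, 0, 2};  // 0 = "marked for delete"
    lens.erase(std::remove_if(lens.begin(), lens.end(),
                              [](int len) { return len == 0; }),
               lens.end());
    for (int len : lens)
        printf("%d ", len);                      // prints: 3 5 2
    printf("\n");
    return 0;
}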
@@ -781,6 +798,8 @@ static void recognize_telephone_numbers_sweden(TokenizerResult *tr) {
}
}
}
if(last_digit_token_idx>=org_token_count)
last_digit_token_idx = org_token_count-1;
if(digit_count<5)
continue;
if(digit_count>10)
@@ -1171,7 +1190,9 @@ static void rewrite_ampersands(TokenizerResult *tr, lang_t lang, const char *cou
static void rewrite_ampersands(TokenizerResult *tr, const char *ampersand_word, size_t ampersand_word_len) {
char *s = NULL;
for(const auto &t : tr->tokens) {
size_t org_token_count = tr->size();
for(size_t i=1; i<org_token_count; i++) {
const auto &t = (*tr)[i];
if(t.token_len==1 && *t.token_start=='&') {
if(!s) {
s = (char*)tr->egstack.alloc(ampersand_word_len);
@@ -1285,6 +1306,8 @@ bool is_slash_abbreviation(const char *s, size_t slen) {
static void collapse_slash_abbreviations(TokenizerResult *tr) {
//Replace simple <singleletter> '/' <singleletter> with a single token without the slash.
#if 0
size_t org_token_count = tr->size();
for(size_t i=1; i+2<org_token_count; i++) {
const auto &t0 = (*tr)[i+0];
@@ -1309,4 +1332,43 @@ static void collapse_slash_abbreviations(TokenizerResult *tr) {
org_token_count -= 3;
i -= 2;
}
#endif
//The ifdef'ed-out code above is the clean and simple algorithm. But it is horribly inefficient when encountering
//documents consisting almost entirely of slash-abbreviations, such as genome tables.
//Instead we iterate over the tokens with src,dst iterators, copying, deleting and modifying along the way without causing
//reallocation of the underlying token vector (the eg stack is used though).
if(tr->size()<3)
return;
size_t src_idx = 0;
size_t dst_idx = 0;
size_t org_token_count = tr->tokens.size();
while(src_idx+2<org_token_count) {
const auto &t0 = (*tr)[src_idx+0];
const auto &t1 = (*tr)[src_idx+1];
const auto &t2 = (*tr)[src_idx+2];
if((!t0.is_alfanum || t1.is_alfanum || !t2.is_alfanum) ||
(t1.token_len!=1 || t1.token_start[0]!='/') ||
(!t0.is_primary || !t1.is_primary || !t2.is_primary) ||
(t0.token_end()!=t1.token_start || t1.token_end()!=t2.token_start) ||
(!is_slash_abbreviation(t0.token_start, t0.token_len+t1.token_len+t2.token_len)))
{
if(src_idx!=dst_idx)
tr->tokens[dst_idx] = tr->tokens[src_idx];
src_idx++;
dst_idx++;
} else {
size_t sl = t0.token_len + t2.token_len;
char *s = (char*)tr->egstack.alloc(sl);
memcpy(s, t0.token_start, t0.token_len);
memcpy(s+t0.token_len, t2.token_start, t2.token_len);
tr->tokens[dst_idx] = TokenRange(t0.start_pos, t2.end_pos, s,sl, false, true);
dst_idx++;
src_idx += 3;
}
}
while(src_idx<org_token_count)
tr->tokens[dst_idx++] = tr->tokens[src_idx++];
if(src_idx!=dst_idx)
tr->tokens.erase(tr->tokens.begin()+dst_idx,tr->tokens.end());
}
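
The same concern drives the collapse_slash_abbreviations() rewrite: the #if 0 version erases three tokens and inserts one per match, which degenerates on slash-heavy documents such as the genome tables mentioned in the comment. The replacement generalizes erase-remove to a three-token window with src/dst cursors: fuse or copy at dst while scanning src, then truncate once. A toy version on strings, assuming every <word> '/' <word> triple should fuse:

#include <cstdio>
#include <string>
#include <vector>

// Two-pointer compaction over a 3-element window: fuse "a", "/", "b" into
// "ab" in place, truncating the tail only once. Mirrors the shape of the
// rewritten collapse_slash_abbreviations().
int main() {
    std::vector<std::string> toks = {"km", "/", "h", "on", "m", "/", "s"};
    size_t src = 0, dst = 0;
    const size_t n = toks.size();
    while (src + 2 < n) {
        if (toks[src + 1] == "/") {                 // match: fuse the triple
            toks[dst++] = toks[src] + toks[src + 2];
            src += 3;
        } else {                                    // no match: keep one token
            if (src != dst)
                toks[dst] = toks[src];
            src++;
            dst++;
        }
    }
    while (src < n)                                 // copy the unexamined tail
        toks[dst++] = toks[src++];
    toks.erase(toks.begin() + dst, toks.end());     // single truncation
    for (const auto &t : toks)
        printf("%s ", t.c_str());                   // prints: kmh on ms
    printf("\n");
    return 0;
}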

@@ -42,7 +42,11 @@ public:
while(p1tokens<tr.size() && tr[p1tokens].is_primary)
p1tokens++;
printf("phase2-tokens: %u\n", (unsigned)(tr.size()-p1tokens));
for(unsigned i=p1tokens; i<tr.size(); i++)
for(unsigned i=0; i<tr.size(); i++)
if(!tr[i].is_primary || i>=p1tokens)
printf(" #%u: [%lu..%lu) '%.*s'\n", i, tr[i].start_pos, tr[i].end_pos, (int)tr[i].token_len, tr[i].token_start);
printf("all tokens: %u\n", (unsigned)(tr.size()));
for(unsigned i=0; i<tr.size(); i++)
printf(" #%u: [%lu..%lu) '%.*s'\n", i, tr[i].start_pos, tr[i].end_pos, (int)tr[i].token_len, tr[i].token_start);
}
bool empty() const { return tr.empty(); }
@@ -667,6 +671,23 @@ int main(void) {
assert(t.has_token("Johns"));
}
printf("Test line %d\n",__LINE__);
{
T2 t("John''''s dog",langEnglish);
assert(!t.has_token("John's"));
assert(!t.has_token("Johns"));
}
printf("Test line %d\n",__LINE__);
{
T2 t("John's cat bit Mary's dog's tail",langEnglish);
assert(t.has_token("John's"));
assert(t.has_token("cat"));
assert(t.has_token("bit"));
assert(t.has_token("Mary's"));
assert(t.has_token("dog's"));
assert(t.has_token("tail"));
}
//hyphenation
printf("Test line %d\n",__LINE__);
@@ -844,6 +865,11 @@ int main(void) {
T2 t("foo 040-99 88 77 boo",langSwedish);
assert(t.has_token("040998877"));
}
printf("Test line %d\n",__LINE__);
{
T2 t("foo 08-24 50 55",langSwedish);
assert(t.has_token("08245055"));
}
printf("Test line %d\n",__LINE__);
{
@@ -1056,12 +1082,22 @@ int main(void) {
printf("Test line %d\n",__LINE__);
{
T2 t("The smurf drove 80 km/h on the highway",langUnknown);
T2 t("The smurf drove 80 km/h on the highway, which is 22 m/s approximately",langUnknown);
assert(t.has_token("The"));
assert(t.has_token("smurf"));
assert(t.has_token("drove"));
assert(t.has_token("80"));
assert(t.has_token("kmh"));
assert(!t.has_token("km"));
assert(!t.has_token("h"));
assert(t.has_token("80"));
assert(t.has_token("on"));
assert(t.has_token("the"));
assert(t.has_token("highway"));
assert(t.has_token("which"));
assert(t.has_token("is"));
assert(t.has_token("ms"));
assert(t.has_token("approximately"));
}
return 0;