Merge branch 'master' into tokenizer

Ivan Skytte Jørgensen
2018-05-15 14:41:11 +02:00
5 changed files with 127 additions and 73 deletions

@@ -164,7 +164,7 @@ bool Msg20::getSummary ( Msg20Request *req ) {
// we might be getting inlinks for a spider request
// so make sure timeout is infinite for that...
const int32_t timeout = (req->m_niceness==0)
const int64_t timeout = (req->m_niceness==0)
? multicast_msg20_summary_timeout
: multicast_infinite_send_timeout;
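
The widening from int32_t to int64_t matters because an "infinite" timeout has to be a value that can never be reached. A minimal standalone sketch of the truncation risk; the constant values below are invented for illustration:

	#include <cstdint>
	#include <cstdio>

	// Assumed stand-ins for the real multicast timeout constants (ms):
	static const int64_t multicast_msg20_summary_timeout = 10000;           // 10 s
	static const int64_t multicast_infinite_send_timeout = 0x7fffffffffLL;  // "never"

	int main() {
		int niceness = 0;  // 0 = interactive summary request, as in the hunk
		int64_t timeout = (niceness == 0)
			? multicast_msg20_summary_timeout
			: multicast_infinite_send_timeout;
		// In an int32_t the "infinite" value truncates to a bogus timeout:
		printf("chosen: %lld  truncated infinite: %d\n",
		       (long long)timeout, (int32_t)multicast_infinite_send_timeout);
		return 0;
	}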

@@ -311,12 +311,11 @@ bool SiteGetter::gotSiteList ( ) {
log("site: sitegetter gotList: %s",mstrerror(g_errno));
// mark it so caller knows
m_errno = g_errno;
// so try again without increasing m_pathDepth
// I've seen a host return EBADRDBID for some reason
// and put host #0 in an infinite log-spam loop, so stop it
if ( g_errno != EBADRDBID ) m_tryAgain = true;
// let UdpServer do the retries for error scenario
return true;
}
// how many urls at this path depth?
int32_t count = ( m_list.getListSize() - 6 ) / 6;
// if we do not have enough to qualify this as a subsite path depth
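
For context, a hedged sketch of the log-spam loop the comments above describe; names and shape are invented, not the real SiteGetter call chain:

	#include <cstdio>

	// Pretend the remote host answers every request with EBADRDBID:
	static bool gotList(bool *tryAgain) {
		printf("site: sitegetter gotList: bad rdbid\n");
		*tryAgain = true;   // pre-guard behavior: retry on any error
		return true;
	}

	int main() {
		bool tryAgain = true;
		int passes = 0;
		while(tryAgain && passes < 3) {   // the real loop had no bound
			tryAgain = false;
			gotList(&tryAgain);
			passes++;
		}
		// Delegating retries to UdpServer instead bounds them by its own
		// timeout/backoff policy, so host #0 cannot spin here forever.
		return 0;
	}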

@@ -124,6 +124,7 @@ bool Summary::setSummaryFromTags( Xml *xml, unsigned maxSummaryLen, const char *
// itemprop = "description"
if ( xml->getTagContent("itemprop", "description", m_summary, MAX_SUMMARY_LEN, minSummaryLen, maxSummaryLen, &m_summaryLen) ) {
maybeRemoveHtmlFormatting();
if ( verifySummary( titleBuf, titleBufLen ) ) {
m_isSetFromTags = true;
@@ -136,6 +137,7 @@ bool Summary::setSummaryFromTags( Xml *xml, unsigned maxSummaryLen, const char *
// meta property = "og:description"
if ( xml->getTagContent("property", "og:description", m_summary, MAX_SUMMARY_LEN, minSummaryLen, maxSummaryLen, &m_summaryLen, true, TAG_META ) ) {
maybeRemoveHtmlFormatting();
if ( verifySummary( titleBuf, titleBufLen ) ) {
m_isSetFromTags = true;
@@ -148,6 +150,7 @@ bool Summary::setSummaryFromTags( Xml *xml, unsigned maxSummaryLen, const char *
// meta name = "description"
if ( xml->getTagContent("name", "description", m_summary, MAX_SUMMARY_LEN, minSummaryLen, maxSummaryLen, &m_summaryLen, true, TAG_META ) ) {
maybeRemoveHtmlFormatting();
if ( verifySummary( titleBuf, titleBufLen ) ) {
m_isSetFromTags = true;
@@ -162,6 +165,7 @@ bool Summary::setSummaryFromTags( Xml *xml, unsigned maxSummaryLen, const char *
if ( xml->getTagContent("property", "description", m_summary, MAX_SUMMARY_LEN, minSummaryLen, maxSummaryLen, &m_summaryLen, true, TAG_META ) ) {
if ( verifySummary( titleBuf, titleBufLen ) ) {
m_isSetFromTags = true;
maybeRemoveHtmlFormatting();
logDebug(g_conf.m_logDebugSummary, "sum: generated from meta property description. summary='%.*s'", m_summaryLen, m_summary);
logTrace(g_conf.m_logTraceSummary, "END. Generated from meta property description. Returning true");
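
All four hunks above follow the same pattern: pull a candidate summary from a meta source, clean it with maybeRemoveHtmlFormatting(), and accept it only if verifySummary() passes. A hedged before/after illustration (the input string is invented):

	// Candidate pulled from <meta property="og:description" ...>:
	//   "Fast search.<br>Now with <b>phrase support"
	// After maybeRemoveHtmlFormatting():
	//   "Fast search.Now with phrase support"
	// Only stray <br>/<b>/<i>/<p> are dropped; closing and unknown tags are
	// left alone, and text mentioning "html", "tag" or "element" is never
	// touched.
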
@@ -1153,3 +1157,42 @@ bool Summary::getDefaultSummary(const Xml *xml, const TokenizerResult *tr, const
logTrace(g_conf.m_logTraceSummary, "END. Returning true");
return true;
}
void Summary::maybeRemoveHtmlFormatting() {
	// Some websites have junk in their meta tags, e.g. <br> in the meta
	// description. We don't fix all cases, as that could hurt correctly
	// written pages about how to write proper HTML. But if the text mentions
	// neither "html", "tag" nor "element", we remove the most common
	// offenders: br/b/i/p.
	// When changing this function, consider keeping it in sync with
	// XmlDoc_Indexing.cpp:possiblyDecodeHtmlEntitiesAgain()
	if(memmem(m_summary,m_summaryLen,"html",4)==0 &&
	   memmem(m_summary,m_summaryLen,"HTML",4)==0 &&
	   memmem(m_summary,m_summaryLen,"tag",3)==0 &&
	   memmem(m_summary,m_summaryLen,"Tag",3)==0 &&
	   memmem(m_summary,m_summaryLen,"element",7)==0 &&
	   memmem(m_summary,m_summaryLen,"Element",7)==0)
	{
		for(int i=0; i<m_summaryLen; ) {
			char *p = (char*)memchr(m_summary+i,'<',m_summaryLen-i);
			if(!p)
				break;
			i = p-m_summary;
			if(i+4<=m_summaryLen && memcmp(p,"<br>",4)==0) {
				memmove(m_summary+i,m_summary+i+4,m_summaryLen-i-4);
				m_summaryLen -= 4;
			} else if(i+3<=m_summaryLen && memcmp(p,"<b>",3)==0) {
				memmove(m_summary+i,m_summary+i+3,m_summaryLen-i-3);
				m_summaryLen -= 3;
			} else if(i+3<=m_summaryLen && memcmp(p,"<i>",3)==0) {
				memmove(m_summary+i,m_summary+i+3,m_summaryLen-i-3);
				m_summaryLen -= 3;
			} else if(i+3<=m_summaryLen && memcmp(p,"<p>",3)==0) {
				memmove(m_summary+i,m_summary+i+3,m_summaryLen-i-3);
				m_summaryLen -= 3;
			} else
				i++;
		}
	}
}
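
A minimal standalone check of the removal loop above, run on a plain char buffer; the mentions-"html"/"tag"/"element" guard is omitted for brevity and the test string is invented:

	#include <cstring>
	#include <cstdio>

	int main() {
		char sum[] = "One<br>two <b>three <blink>four";
		int len = (int)strlen(sum);
		for(int i=0; i<len; ) {
			char *p = (char*)memchr(sum+i, '<', len-i);
			if(!p)
				break;
			i = (int)(p-sum);
			if(i+4<=len && memcmp(p,"<br>",4)==0) {
				memmove(sum+i, sum+i+4, len-i-4); len -= 4;
			} else if(i+3<=len && memcmp(p,"<b>",3)==0) {
				memmove(sum+i, sum+i+3, len-i-3); len -= 3;
			} else if(i+3<=len && memcmp(p,"<i>",3)==0) {
				memmove(sum+i, sum+i+3, len-i-3); len -= 3;
			} else if(i+3<=len && memcmp(p,"<p>",3)==0) {
				memmove(sum+i, sum+i+3, len-i-3); len -= 3;
			} else
				i++;
		}
		printf("%.*s\n", len, sum);   // prints: Onetwo three <blink>four
		return 0;
	}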

@@ -63,6 +63,8 @@ private:
int64_t getBestWindow (const Matches *matches, int32_t mn, int32_t *lasta, int32_t *besta, int32_t *bestb,
char *gotIt, char *retired, int32_t maxExcerptLen );
void maybeRemoveHtmlFormatting();
// null terminate and store the summary here.
char m_summary[ MAX_SUMMARY_LEN ];
int32_t m_summaryLen;

@@ -21,6 +21,56 @@
#endif
static void possiblyDecodeHtmlEntitiesAgain(const char **s, int32_t *len, SafeBuf *sb, bool also_remove_certain_html_elements) {
	// Some documents have their HTML entities incorrectly encoded twice. Example:
	//   correct:   <meta name="foo" content="&#66;oa">
	//   incorrect: <meta name="foo" content="&amp;#66;oa">
	// If it seems likely that this has happened, we decode the entities again,
	// put the result in 'sb' and update '*s' and '*len'.
	// Due to the (il)logic of GB the correct form has already been decoded,
	// while the incorrect form is still raw and needs decoding twice.
	// Require "&amp;" followed by a second semicolon:
	const char *amppos = (const char*)memmem(*s,*len, "&amp;", 5);
	if((amppos && memchr(amppos+5, ';', *len-(amppos-*s)-5)!=NULL) ||
	   (memmem(*s,*len,"&lt;",4)!=NULL && memmem(*s,*len,"&gt;",4)!=NULL)) {
		// The shortest entity is 4 chars (&lt;) and the longest UTF-8
		// encoding of a codepoint is 4 bytes, so decoding cannot grow the
		// text; reserve with headroom anyway.
		StackBuf<1024> tmpBuf;
		if(!tmpBuf.reserve(*len + *len/2 + 4))
			return;
		if(!sb->reserve(*len + *len/2 + 4))
			return;
		int32_t tmpLen = htmlDecode(tmpBuf.getBufStart(), *s,*len, false);
		int32_t newlen = htmlDecode(sb->getBufStart(), tmpBuf.getBufStart(), tmpLen, false);
		sb->setLength(newlen);
		// Furthermore, some websites have junk in their meta tags, e.g. <br>
		// in the meta description. We don't fix all cases, as that could hurt
		// correctly written pages about how to write proper HTML. But if the
		// text mentions neither "html", "tag" nor "element", we remove the
		// most common offenders: br/b/i/p.
		// When changing this function, consider keeping it in sync with
		// Summary::maybeRemoveHtmlFormatting()
		if(also_remove_certain_html_elements) {
			if(memmem(sb->getBufStart(),sb->length(),"html",4)==0 &&
			   memmem(sb->getBufStart(),sb->length(),"HTML",4)==0 &&
			   memmem(sb->getBufStart(),sb->length(),"tag",3)==0 &&
			   memmem(sb->getBufStart(),sb->length(),"Tag",3)==0 &&
			   memmem(sb->getBufStart(),sb->length(),"element",7)==0 &&
			   memmem(sb->getBufStart(),sb->length(),"Element",7)==0)
			{
				sb->safeReplace2("<br>",4,"",0,0);
				sb->safeReplace2("<b>",3,"",0,0);
				sb->safeReplace2("<i>",3,"",0,0);
				sb->safeReplace2("<p>",3,"",0,0);
			}
		}
		*s = sb->getBufStart();
		*len = sb->length();
	}
}
// a ptr to HashInfo is passed to hashString() and hashWords()
class HashInfo {
public:
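
A self-contained toy demonstration of the double decode performed by possiblyDecodeHtmlEntitiesAgain() above; toyDecode() is a hypothetical stand-in for htmlDecode(), handling only "&amp;" and decimal "&#NN;", just enough for this input:

	#include <cstdio>
	#include <cstring>

	static int toyDecode(char *dst, const char *src, int len) {
		int o = 0;
		for(int i=0; i<len; ) {
			if(i+5<=len && memcmp(src+i,"&amp;",5)==0) {
				dst[o++] = '&';
				i += 5;
			} else if(i+3<len && src[i]=='&' && src[i+1]=='#') {
				int j = i+2, code = 0;
				while(j<len && src[j]>='0' && src[j]<='9')
					code = code*10 + (src[j++]-'0');
				if(j<len && src[j]==';' && code>0 && code<128) {
					dst[o++] = (char)code;
					i = j+1;
				} else
					dst[o++] = src[i++];
			} else
				dst[o++] = src[i++];
		}
		return o;
	}

	int main() {
		const char *raw = "&amp;#66;oa";   // a site double-encoded "Boa"
		char pass1[64], pass2[64];
		int l1 = toyDecode(pass1, raw, (int)strlen(raw));
		int l2 = toyDecode(pass2, pass1, l1);
		// prints: &amp;#66;oa -> &#66;oa -> Boa
		printf("%s -> %.*s -> %.*s\n", raw, l1, pass1, l2, pass2);
		return 0;
	}
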
@@ -466,10 +516,6 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
setStatus ( "hashing meta tags" );
// assume it's empty
char buf [ 32*1024 ];
int32_t bufLen = 32*1024 - 1;
buf[0] = '\0';
int32_t n = m_xml.getNumNodes();
XmlNode *nodes = m_xml.getNodes();
@@ -481,36 +527,15 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
// find the first meta summary node
for ( int32_t i = 0 ; i < n ; i++ ) {
// continue if not a meta tag
if ( nodes[i].m_nodeId != TAG_META ) continue;
//we are only interested in meta tags
if(nodes[i].m_nodeId != TAG_META)
continue;
// only get content for <meta name=..> not <meta http-equiv=..>
int32_t tagLen;
char *tag = m_xml.getString ( i , "name" , &tagLen );
char tagLower[128];
int32_t j ;
int32_t code;
// skip if empty
const char *tag = m_xml.getString(i, "name", &tagLen);
// skip if error/empty
if ( ! tag || tagLen <= 0 ) continue;
// make tag name lower case and do not allow bad chars
if ( tagLen > 126 ) tagLen = 126 ;
to_lower3_a ( tag , tagLen , tagLower );
for ( j = 0 ; j < tagLen ; j++ ) {
// bail if has unacceptable chars
if ( ! is_alnum_a ( tag[j] ) &&
tag[j] != '-' &&
tag[j] != '_' &&
tag[j] != '.' ) break;
// convert to lower
tagLower[j] = to_lower_a ( tag[j] );
}
// skip this meta if had unacceptable chars
if ( j < tagLen ) continue;
// is it recognized?
code = getFieldCode ( tag , tagLen );
// . do not allow reserved tag names
// . title,url,suburl,
if ( code != FIELD_GENERIC ) continue;
// this is now reserved
// do not hash keyword, keywords, description, or summary metas
// because that is done in hashRange() below based on the
@@ -540,48 +565,18 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
// get the content
int32_t len;
char *s = m_xml.getString ( i , "content" , &len );
const char *s = m_xml.getString ( i , "content" , &len );
if ( ! s || len <= 0 ) continue;
// . ensure not too big for our buffer (keep room for a \0)
// . TODO: this is wrong, should be len+1 > bufLen,
// but can't fix w/o resetting the index (COME BACK HERE
// and see where we index meta tags besides this place!!!)
// remove those other places, except... what about keywords
// and description?
if ( len+1 >= bufLen ) {
//len = bufLen - 1;
// assume no punct to break on!
len = 0;
// only cut off at punctuation
char *p = s;
char *pend = s + len;
char *last = NULL;
int32_t size ;
for ( ; p < pend ; p += size ) {
// skip if utf8 char
size = getUtf8CharSize(*p);
// skip if 2+ bytes
if ( size > 1 ) continue;
// skip if not punct
if ( is_alnum_a(*p) ) continue;
// mark it
last = p;
}
if ( last ) len = last - s;
// this old way was faster...:
//while ( len > 0 && is_alnum(s[len-1]) ) len--;
}
// convert html entities to their chars
len = saftenTags ( buf , bufLen , s , len );
// NULL terminate the buffer
buf[len] = '\0';
StackBuf<1024> doubleDecodedContent;
possiblyDecodeHtmlEntitiesAgain(&s, &len, &doubleDecodedContent, true);
// Now index the wanted meta tags as normal text without prefix so they
// are used in user searches automatically.
hi.m_prefix = NULL;
// desc is NULL, prefix will be used as desc
bool status = hashString ( buf,len,&hi );
bool status = hashString ( s,len, &hi );
// bail on error, g_errno should be set
if ( ! status ) return false;
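
Since the interleaved old and new lines above are hard to read without +/- markers, here is the resulting per-tag flow as a hedged sketch (names are from the diff; the surrounding loop is elided):

	int32_t len;
	const char *s = m_xml.getString(i, "content", &len);
	if(!s || len<=0) continue;
	// Re-decode doubly-encoded entities and strip stray <br>/<b>/<i>/<p>;
	// afterwards s/len point either at the original content or at the
	// cleaned copy in doubleDecodedContent:
	StackBuf<1024> doubleDecodedContent;
	possiblyDecodeHtmlEntitiesAgain(&s, &len, &doubleDecodedContent, true);
	hi.m_prefix = NULL;
	bool status = hashString(s, len, &hi);
	// The fixed 32 KB stack buffer, the truncate-at-punctuation fallback and
	// the saftenTags() copy are all gone.
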
@@ -1377,11 +1372,22 @@ bool XmlDoc::hashTitle ( HashTableX *tt ) {
//FIXME: assumption: title tokens are the phase-1 tokens and the tokens are in contiguous memory
//FIXME: also grab the alternative tokens from phase 2 in the title part
if ( ! hashString(a, i, &hi) ) return false;
//clean indexing:
// if ( ! hashString(a, i, &hi) ) return false;
//but due to bad webmasters:
const char *title = m_tokenizerResult[a].token_start;
const char *titleEnd = m_tokenizerResult[i].token_end();
int32_t titleLen = titleEnd - title;
StackBuf<1024> doubleDecodedContent;
possiblyDecodeHtmlEntitiesAgain(&title, &titleLen, &doubleDecodedContent, false);
if ( ! hashString(title, titleLen, &hi)) return false;
// now hash as without title: prefix
hi.m_prefix = NULL;
if ( ! hashString(a, i, &hi) ) return false;
if ( ! hashString(title, titleLen, &hi)) return false;
return true;
}
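
A hedged worked example of the "bad webmasters" case this handles (the title text is invented):

	// Doubly-encoded title as it arrives here:  "Acme &amp;#8211; Home"
	// after the first htmlDecode():             "Acme &#8211; Home"
	// after the second htmlDecode():            "Acme – Home"  (U+2013)
	// The heuristic fires because "&amp;" is followed by a later semicolon.
	// The cleaned span is hashed twice: once with the title prefix, once
	// with hi.m_prefix = NULL so it also matches plain-text queries.
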
@@ -1486,11 +1492,14 @@ bool XmlDoc::hashMetaSummary ( HashTableX *tt ) {
setStatus ( "hashing meta summary" );
StackBuf<1024> doubleDecodedContent;
// hash the meta keywords tag
//char buf [ 2048 + 2 ];
//int32_t len = m_xml.getMetaContent ( buf , 2048 , "summary" , 7 );
int32_t mslen;
const char *ms = getMetaSummary ( &mslen );
possiblyDecodeHtmlEntitiesAgain(&ms, &mslen, &doubleDecodedContent, true);
// update hash parms
HashInfo hi;
@@ -1505,7 +1514,8 @@ bool XmlDoc::hashMetaSummary ( HashTableX *tt ) {
//len = m_xml.getMetaContent ( buf , 2048 , "description" , 11 );
int32_t mdlen;
char *md = getMetaDescription ( &mdlen );
const char *md = getMetaDescription ( &mdlen );
possiblyDecodeHtmlEntitiesAgain(&md, &mdlen, &doubleDecodedContent, true);
// update hashing parms
hi.m_desc = "meta desc";