Merge branch 'master' into dev-proxy

Ai Lin Chia
2018-06-13 17:38:33 +02:00
7 changed files with 117 additions and 37 deletions

@ -400,14 +400,11 @@ bool Msg3a::getDocIds(const SearchInput *si, Query *q, void *state, void (*callb
// if all hosts in group dead, just skip it!
if ( g_hostdb.isShardDead ( shardNum ) ) {
m_numReplies++;
log("msg3a: skipping dead shard # %i "
"(elapsed=%li)",(int)shardNum,elapsed);
log(LOG_DEBUG,"msg3a: skipping dead shard # %i (elapsed=%li)", (int)shardNum, elapsed);
continue;
}
if ( si && !si->m_askOtherShards && h!=g_hostdb.getMyHost()) {
m_numReplies++;
continue;
}

@ -34,6 +34,9 @@
#include "RobotsBlockedResultOverride.h"
#include "QueryLanguage.h"
#include "FxLanguage.h"
#ifdef _VALGRIND_
#include <valgrind/memcheck.h>
#endif
static bool printSearchResultsHeader(State0 *st);
@ -85,7 +88,7 @@ State0::State0()
, m_primaryQueryLanguage(langUnknown) {
}
static bool sendReply(State0 *st, char *reply) {
static bool sendReply(State0 *st, const char *reply, int32_t rlen) {
int32_t savedErr = g_errno;
@ -114,8 +117,6 @@ static bool sendReply(State0 *st, char *reply) {
}
int32_t rlen = 0;
if ( reply ) rlen = strlen(reply);
logf(LOG_DEBUG,"gb: sending back %" PRId32" bytes",rlen);
Statistics::register_query_time(si->m_q.m_numWords, si->m_queryLangId, savedErr, (gettimeofdayInMilliseconds() - st->m_startTime));
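The reworked sendReply() takes the reply length as an explicit argument instead of recomputing it with strlen(), which removes a redundant scan and lets callers that already know the buffer length (such as the SafeBuf case further down) pass it directly. A minimal sketch of that calling convention, with an illustrative send_bytes() stub in place of the real TcpServer path and the State0 argument dropped for brevity:

#include <cstring>
#include <cstdio>
#include <cstdint>

// Stub standing in for the real socket send; it only reports what would be sent.
static void send_bytes(const char *buf, int32_t len) {
    (void)buf;
    printf("sending %d bytes\n", (int)len);
}

// New-style reply helper: the caller supplies the length, nullptr means "no body".
static bool sendReply(const char *reply, int32_t rlen) {
    if (!reply) rlen = 0;              // error path: nothing to send
    send_bytes(reply, rlen);
    return true;                       // callers use the return value as their own "done" result
}

int main() {
    sendReply(nullptr, 0);                          // error path
    const char *msg = "some results";
    sendReply(msg, (int32_t)strlen(msg));           // text reply of known length
    return 0;
}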
@ -255,7 +256,7 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
// copy yhits
if (!st->m_hr.copy(hr)) {
return sendReply(st, nullptr);
return sendReply(st,nullptr,0);
}
// set this in case SearchInput::set fails!
@ -391,7 +392,7 @@ static bool gotQueryLanguage(State0 *st, const std::vector<std::pair<lang_t, dou
if (!si->set(st->m_socket, &st->m_hr, st->m_primaryQueryLanguage, language_weights)) {
log("query: set search input: %s",mstrerror(g_errno));
if ( ! g_errno ) g_errno = EBADENGINEER;
return sendReply ( st, NULL );
return sendReply(st,NULL,0);
}
// save collnum now
@ -408,7 +409,7 @@ static bool gotQueryLanguage(State0 *st, const std::vector<std::pair<lang_t, dou
// for now disable queries
if (!g_conf.m_queryingEnabled) {
g_errno = EQUERYINGDISABLED;
return sendReply(st, nullptr);
return sendReply(st, nullptr,0);
}
// LAUNCH RESULTS
@ -480,7 +481,7 @@ static bool gotResults ( void *state ) {
// mess with their TcpSocket settings.
if ( ! st->m_socket ) {
log("results: socket is NULL. sending failed.");
return sendReply(st,NULL);
return sendReply(st,NULL,0);
}
// if we skipped a shard because it was dead, usually we provide
@ -495,7 +496,7 @@ static bool gotResults ( void *state ) {
, msg40->m_msg3a.m_skippedShards
, g_hostdb.m_numShards );
g_errno = ESHARDDOWN;
return sendReply(st,reply);
return sendReply(st,reply,strlen(reply));
}
@ -504,7 +505,7 @@ static bool gotResults ( void *state ) {
CollectionRec *cr = si->m_cr;//g_collectiondb.getRec ( collnum );
if ( ! cr ) { // || cr != si->m_cr ) {
g_errno = ENOCOLLREC;
return sendReply(st,NULL);
return sendReply(st,NULL,0);
}
// this causes ooms everywhere, not a good fix
@ -512,7 +513,7 @@ static bool gotResults ( void *state ) {
log("msg40: failed to get results q=%s",si->m_q.originalQuery());
//g_errno = ENOMEM;
g_errno = msg40->m_errno;
return sendReply(st,NULL);
return sendReply(st,NULL,0);
}
@ -548,7 +549,6 @@ static bool gotResults ( void *state ) {
if ( hadPrintError ) {
if ( ! g_errno ) g_errno = EBADENGINEER;
log("query: had error: %s",mstrerror(g_errno));
//return sendReply ( st , sb.getBufStart() );
}
// wrap it up with Next 10 etc.
@ -560,7 +560,7 @@ static bool gotResults ( void *state ) {
sb->safePrintf("</div>");
// send it off
sendReply ( st , st->m_sb.getBufStart() );
sendReply(st, st->m_sb.getBufStart(), st->m_sb.length());
return true;
}
@ -930,7 +930,6 @@ static bool printSearchResultsHeader(State0 *st) {
log("query: Query failed. Had error processing query: %s",
mstrerror(st->m_errno));
g_errno = st->m_errno;
//return sendReply(st,sb->getBufStart());
return false;
}
@ -1224,7 +1223,7 @@ static bool printSearchResultsHeader(State0 *st) {
si->m_allowHighFrequencyTermCache, ABS_MAX_QUERY_TERMS);
//syn-todo: in the call above si->m_queryExpansion was used for both 'queryExpansion' and 'useQueryStopWords'. Why?
if ( g_errno ) return false;//sendReply (st,NULL);
if ( g_errno ) return false;
DocIdScore *dpx = NULL;
if ( numResults > 0 ) dpx = msg40->getScoreInfo(0);
@ -1765,7 +1764,7 @@ static bool printInlinkText ( SafeBuf *sb , Msg20Reply *mr , SearchInput *si ,
si->m_format == FORMAT_HTML )
continue;
const char *str = k->getLinkText();//ptr_linkText;
int32_t strLen = k->size_linkText;
int32_t strLen = strnlen(k->getLinkText(),k->size_linkText);
const char *frontTag =
"<font style=\"color:black;background-color:yellow\">" ;

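The printInlinkText() change measures the link text with strnlen() bounded by size_linkText instead of trusting size_linkText itself, so trailing NUL padding is not counted and an unterminated buffer cannot be over-read. A small standalone sketch of the difference, assuming an illustrative buffer rather than the actual Msg20Reply layout:

#include <cstring>
#include <cstdio>

int main() {
    char linkText[16] = "anchor text";           // buffer larger than the text it holds
    int size_linkText = sizeof(linkText);        // declared size: 16, includes NUL/padding

    // strnlen() measures the real string but never reads past size_linkText,
    // so it is also safe if the buffer happens to lack a terminator.
    size_t strLen = strnlen(linkText, size_linkText);

    printf("declared=%d actual=%zu\n", size_linkText, strLen);   // declared=16 actual=11
    return 0;
}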
@ -341,7 +341,7 @@ bool Xml::set( char *s, int32_t slen, int32_t version, char contentType ) {
XmlNode *xi = &m_nodes[m_numNodes];
// set that node
i += xi->set( &m_xml[i], pureXml );
i += xi->set( &m_xml[i], m_xmlLen-i, pureXml );
// set his parent xml node if is xml
xi->m_parent = parent;

@ -4,7 +4,7 @@
#include "Mem.h"
#include "Sanity.h"
static int32_t getTagLen(const char *node);
static int32_t getTagLen(const char *node, int maxNodeLen);
// . Here's a nice list of all the html nodes names, lengths, whether they're
// a breaking node or not and their node id
@ -274,7 +274,7 @@ static bool isTagStart(const char *s) {
// . called by Xml class
// . returns the length of the node
// . TODO: "node" is now guaranteed to be \0 terminated -- make this faster
int32_t XmlNode::set( char *node, bool pureXml ) {
int32_t XmlNode::set( char *node, int maxNodeLen, bool pureXml ) {
// save head of node
m_node = node;
@ -352,7 +352,7 @@ int32_t XmlNode::set( char *node, bool pureXml ) {
// . otherwise it's a regular tag
// . might be <!DOCTYPE ...> or something though
m_nodeLen = getTagLen ( node );
m_nodeLen = getTagLen(node, maxNodeLen);
// . get the node's name's length (i-1)
// . node name ends at non alnum char
@ -400,13 +400,13 @@ int32_t XmlNode::set( char *node, bool pureXml ) {
}
// . return the length of a node starting at "node"
static int32_t getTagLen ( const char *node ) {
static int32_t getTagLen ( const char *node, int maxNodeLen) {
// skip over first <
int32_t i ;
// . keep looping until we hit a < or > OR while we're in quotes
// . ignore < and > when they're in quotes
for ( i = 1 ; node[i] ; i++ ) {
for ( i = 1 ; node[i] && i<maxNodeLen; i++ ) {
// this switch should speed things up... no!
if ( node[i] != '<' &&
node[i] != '>' &&
@ -439,17 +439,23 @@ static int32_t getTagLen ( const char *node ) {
while ( node[i] && node[i]!='\"' ) {
// crap some pages have unbalanced quotes.
// see /test/doc.14541556377486183454.html
if ( node[i ]=='>' &&
node[i-1]=='\"' ) {
i--;
break;
}
if ( node[i ]=='>' &&
node[i-1]==' ' &&
node[i-2]=='\"' ) {
i--;
break;
if(node[i]=='>') {
if((node[i-1]=='\"') ||
(node[i-1]==' ' && node[i-2]=='\"'))
{
//Well, what about those that have balanced quotes and just happen to have a '>' first in an attribute value?
//Scan forward and check if '<' or '>' comes first. If '>' comes first then this (node[i]) greater-than sign
//is really a greater-than sign in an attribute value.
int max_bytes_to_scan = std::min(maxNodeLen-i,100);
const char *next_gt = (const char*)memchr(node+i+1,'>',max_bytes_to_scan);
const char *next_lt = (const char*)memchr(node+i+1,'<',max_bytes_to_scan);
if(!next_lt || (next_gt && next_gt<next_lt))
; // greater-than comes first
else {
i--;
break;
}
}
}
// skip this char

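A compact standalone sketch of the look-ahead heuristic added to getTagLen(): when a '>' is seen right after a quote inside an attribute value, scan at most 100 bytes ahead; if another '>' appears before any '<', the current '>' is taken to be data inside a balanced attribute value rather than the end of the tag. The gtIsAttributeData() helper and the sample tag below are illustrative, not repository code:

#include <cstring>
#include <algorithm>
#include <cstdio>

// Returns true if node[i] (a '>') should be treated as text inside a quoted
// attribute value, false if it should be taken as the end of the tag.
static bool gtIsAttributeData(const char *node, int i, int maxNodeLen) {
    int max_bytes_to_scan = std::min(maxNodeLen - i, 100);
    const char *next_gt = (const char*)memchr(node + i + 1, '>', max_bytes_to_scan);
    const char *next_lt = (const char*)memchr(node + i + 1, '<', max_bytes_to_scan);
    // No '<' ahead, or a '>' comes before the next '<': the quotes are probably
    // balanced and this '>' is just attribute data.
    return !next_lt || (next_gt && next_gt < next_lt);
}

int main() {
    const char tag[] = "<a title=\"5 > 3\" href=\"/x\">link</a>";
    int i = 12;                        // index of the '>' inside the title value
    printf("%c -> %s\n", tag[i],
           gtIsAttributeData(tag, i, (int)strlen(tag)) ? "attribute data" : "end of tag");
    return 0;
}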
@ -240,7 +240,7 @@ public:
// . returns the length of the node
// . pureXml is true if node cannot be an html tag, except comment
//int32_t set ( char *node , bool pureXml );
int32_t set (char *node , bool pureXml );
int32_t set(char *node, int maxNodeLen, bool pureXml);
// . called by set() to get the length of a COMMENT node (and set it)
int32_t setCommentNode ( char *node );

@ -230,6 +230,7 @@ static void remove_combining_marks_norwegian(TokenizerResult *tr);
static void remove_combining_marks_swedish(TokenizerResult *tr);
static void remove_combining_marks_german(TokenizerResult *tr);
static void remove_combining_marks_swiss_german(TokenizerResult *tr);
static void remove_combining_marks_italian(TokenizerResult *tr);
static void remove_some_combining_marks(TokenizerResult *tr, const UChar32 native_marked_letters[], size_t native_marked_letters_count);
@ -250,6 +251,9 @@ static void remove_combining_marks(TokenizerResult *tr, lang_t lang, const char
else
remove_combining_marks_swiss_german(tr);
return;
case langItalian:
remove_combining_marks_italian(tr);
break;
default:
break;
}
@ -333,6 +337,37 @@ static void remove_combining_marks_swiss_german(TokenizerResult *tr) {
}
//Combining marks in Italian:
// - grave àèìòù Mandatory for lowercase. Dedicated keys on keyboard
// - acute é Mandatory for lowercase. Dedicated keys on keyboard
// - cedilla ç Non-native. Dedicated key on keyboard - lowercase only
//Swiss-Italian keyboard has access to umlaut.
//Major problem is that none of the three Italian keyboard layouts have easy access to uppercase accented letters, so the accents are frequently
//omitted or typed as an apostrophe. More discussion here: https://italian.stackexchange.com/questions/3878/how-do-italians-customarily-insert-uppercase-italian-vowels-with-diacritics-with
//So one way to deal with this is to just remove all diacritics in both document and query, but that would lose precision. But given that most documents have been run through word
//processing software, the documents are mostly written correctly, and when users type queries they rarely use uppercase, so the accents are probably also typed correctly there.
//So we keep the native and easily accessible marks. Then at a later date we should detect the incorrect forms and fix them (requires a dictionary though).
static void remove_combining_marks_italian(TokenizerResult *tr) {
static const UChar32 native_marked_letters[] = {
0x00C0, //À
0x00C8, //È
0x00CC, //Ì
0x00D2, //Ò
0x00D9, //Ù
0x00E0, //à
0x00E8, //è
0x00EC, //ì
0x00F2, //ò
0x00F9, //ù
0x00C9, //É
0x00E9, //é
0x00C7, //Ç
0x00E7, //ç
};
remove_some_combining_marks(tr, native_marked_letters, sizeof(native_marked_letters)/sizeof(native_marked_letters[0]));
}
//Remove combining marks from the codepoints except for the native marked letters
static void remove_some_combining_marks(TokenizerResult *tr, const UChar32 native_marked_letters[], size_t native_marked_letters_count) {
const size_t org_token_count = tr->size();

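A minimal sketch of the keep-list idea behind remove_combining_marks_italian() and remove_some_combining_marks(): marks on the language's native letters survive, everything else is folded to its base letter. The strip_mark() table and normalize_for_lang() helper are illustrative stand-ins working on single codepoints; the repository function operates on whole tokens in a TokenizerResult.

#include <algorithm>
#include <cstdio>
#include <cstdint>
typedef int32_t UChar32;

// Toy mapping from a few marked letters to their base letter. The real code
// uses proper Unicode handling; this table exists only for the sketch.
static UChar32 strip_mark(UChar32 c) {
    switch (c) {
        case 0x00E8: return 'e';   // è -> e
        case 0x00FC: return 'u';   // ü -> u
        case 0x015F: return 's';   // ş -> s
        default:     return c;
    }
}

// Keep the mark if the codepoint is on the language's native list,
// otherwise replace the letter with its unmarked base form.
static UChar32 normalize_for_lang(UChar32 c, const UChar32 keep[], size_t keep_count) {
    if (std::find(keep, keep + keep_count, c) != keep + keep_count)
        return c;                  // native marked letter: leave untouched
    return strip_mark(c);          // foreign mark: remove it
}

int main() {
    // Lowercase subset of the Italian keep-list from the patch above.
    static const UChar32 italian_keep[] = { 0x00E0, 0x00E8, 0x00EC, 0x00F2,
                                            0x00F9, 0x00E9, 0x00E7 };
    size_t n = sizeof(italian_keep) / sizeof(italian_keep[0]);
    printf("U+%04X -> U+%04X\n", 0x00E8u, (unsigned)normalize_for_lang(0x00E8, italian_keep, n)); // è kept
    printf("U+%04X -> U+%04X\n", 0x00FCu, (unsigned)normalize_for_lang(0x00FC, italian_keep, n)); // ü -> u
    printf("U+%04X -> U+%04X\n", 0x015Fu, (unsigned)normalize_for_lang(0x015F, italian_keep, n)); // ş -> s
    return 0;
}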
@ -609,6 +609,49 @@ int main(void) {
assert(t.str(6)=="Noel");
}
//italian diacritics
printf("Test line %d\n",__LINE__);
{
T2 t("aaa bbb",langItalian);
assert(t.token_count()==3);
}
printf("Test line %d\n",__LINE__);
{
T2 t("Ragù",langItalian);
assert(t.token_count()==1);
assert(t.str(0)=="Ragù");
}
printf("Test line %d\n",__LINE__);
{
T2 t("àèìòùéç",langItalian);
assert(t.token_count()==1);
assert(t.str(0)=="àèìòùéç");
}
printf("Test line %d\n",__LINE__);
{
T2 t("ÀÈÌÒÙÉÇ",langItalian);
assert(t.token_count()==1);
assert(t.str(0)=="ÀÈÌÒÙÉÇ");
}
printf("Test line %d\n",__LINE__);
{
T2 t("monaco münchen",langItalian);
assert(t.token_count()==4);
assert(t.str(3)=="munchen");
}
printf("Test line %d\n",__LINE__);
{
T2 t("Eskişehir",langItalian);
assert(t.token_count()==2);
assert(t.str(1)=="Eskisehir");
}
//diacritics hands-off
printf("Test line %d\n",__LINE__);
{