Merge branch 'diffbot' of github.com:gigablast/open-source-search-engine into diffbot

2013-11-25 15:06:11 -08:00
parent 1826860094 1bbbcff755
commit ca544ddb90
2 changed files with 51 additions and 22 deletions
--- a/PageGet.cpp
+++ b/PageGet.cpp
@@ -33,7 +33,7 @@ public:
 	//TagRec     m_tagRec;
 	TcpSocket *m_socket;
 	HttpRequest m_r;
-	char m_coll[50];
+	char m_coll[MAX_COLL_LEN+2];
 	//CollectionRec *m_cr;
 	bool       m_isAdmin;
 	bool       m_isLocal;
@@ -136,7 +136,7 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
 		uint8_t langId = getLangIdFromAbbr ( langAbbr );
 		st->m_langId = langId;
 	}
-	strncpy ( st->m_coll , coll , 40 );
+	strncpy ( st->m_coll , coll , MAX_COLL_LEN+1 );
 	// store query for query highlighting
 	st->m_netTestResults    = r->getLong ("rnettest", false );
 	if( st->m_netTestResults ) {
@@ -179,14 +179,22 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
 		sreq.reset();
 		strcpy(sreq.m_url, url );
 		sreq.setDataSize();
-		xd->set4 ( &sreq , NULL , coll , NULL , st->m_niceness ); 
+		// this returns false if "coll" is invalid
+		if ( ! xd->set4 ( &sreq , NULL , coll , NULL , st->m_niceness ) ) 
+			goto hadSetError;
 	}
 	// . when getTitleRec() is called it will load the old one
 	//   since XmlDoc::m_setFromTitleRec will be true
 	// . niceness is 0
-	else {
-		// use st->m_coll since XmlDoc just points to it!
-		xd->set3 ( docId , st->m_coll , 0 );
+	// . use st->m_coll since XmlDoc just points to it!
+	// . this returns false if "coll" is invalid
+	else if ( ! xd->set3 ( docId , st->m_coll , 0 ) ) {
+	hadSetError:
+		mdelete ( st , sizeof(State2) , "PageGet1" );
+		delete ( st );
+		g_errno = ENOMEM;
+		log("PageGet: set3: %s", mstrerror(g_errno));
+		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
 	}
 	// if it blocks while it loads title rec, it will re-call this routine
 	xd->setCallback ( st , processLoopWrapper );
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@@ -12916,34 +12916,43 @@ SafeBuf *XmlDoc::getTokenizedDiffbotReply ( ) {
 	// in order for us to do the array separation logic below.
 	// we don't want to do this logic for articles because they
 	// contain an image array!!!
-	char *needleA = "\"type\":\"product";
-	char *needleB = "\"type\":\"image";
-	char *productPtr = strstr ( text , needleA );
-	char *imagePtr   = strstr ( text , needleB );
-	if ( ! productPtr && ! imagePtr ) {
+
+	// this must be on the FIRST level of the json object, otherwise
+	// we get errors because we got type:article and it
+	// contains an images array!
+	
+	long valLen;
+	char *val = getJSONFieldValue ( text , "type", &valLen );
+
+	bool isProduct = false;
+	bool isImage = false;
+
+	if ( val && valLen == 7 && strncmp ( val , "product", 7) == 0 )
+		isProduct = true;
+
+	if ( val && valLen == 5 && strncmp ( val , "image", 5) == 0 )
+		isImage = true;
+
+	if ( ! isProduct && ! isImage ) {
 		m_tokenizedDiffbotReplyValid = true;
 		m_tokenizedDiffbotReplyPtr = &m_diffbotReply;
 		return m_tokenizedDiffbotReplyPtr;
 	}


-	char *needle1 = ",\"products\":[";
-	char *needle2 = ",\"images\":[";
-	char *parray = strstr ( text , needle1 );
-	char *pstart = NULL;
-	char *newTerm = NULL;
-	if ( parray ) {
-		// point to [
-		pstart = parray + 13 - 1;
+	char *needle;
+	char *newTerm;
+	if ( isProduct ) {
+		needle = ",\"products\":[";
 		newTerm = "product";
 	}
 	else {
-		parray = strstr ( text , needle2 );
-		// point to [
-		if ( parray ) pstart = parray + 11 - 1;
+		needle = ",\"images\":[";
 		newTerm = "image";
 	}

+	char *parray = strstr ( text , needle );
+
 	// if not found, no need to do anything...
 	if ( ! parray ) {
 		m_tokenizedDiffbotReplyValid = true;
@@ -12951,6 +12960,10 @@ SafeBuf *XmlDoc::getTokenizedDiffbotReply ( ) {
 		return m_tokenizedDiffbotReplyPtr;
 	}

+
+	// point to [
+	char *pstart = parray + gbstrlen(needle) - 1;
+
 	//
 	// ok, now we have to do so json ju jitsu to fix it
 	//
@@ -43913,6 +43926,7 @@ char *getJSONFieldValue ( char *json , char *field , long *valueLen ) {
 	char *stringStart = NULL;
 	char *p = json;
 	bool gotOne = false;
+	long depth = 0;
 	// scan
 	for ( ; *p ; p++ ) {
 		// escaping a quote? ignore quote then.
@@ -43921,6 +43935,11 @@ char *getJSONFieldValue ( char *json , char *field , long *valueLen ) {
 			p++;
 			continue;
 		}
+		// count {} depth
+		if ( ! inQuotes ) {
+			if ( *p == '{' ) depth++;
+			if ( *p == '}' ) depth--;
+		}
 		// a quote?
 		if ( *p == '\"' ) {
 			inQuotes = ! inQuotes;
@@ -43932,6 +43951,8 @@ char *getJSONFieldValue ( char *json , char *field , long *valueLen ) {
 			else if ( ! inQuotes && 
 				  ! gotOne &&
 				  p[1] == ':' &&
+				  // {"title":"whatever",...}
+				  depth == 1 &&
 				  stringStart &&
 				  (p - stringStart) == fieldLen &&
 				  strncmp(field,stringStart,fieldLen)==0 ) {