Merge branch 'diffbot' of github.com:gigablast/open-source-search-engine into diffbot

This commit is contained in:
Matt Wells
2013-11-25 15:06:11 -08:00
2 changed files with 51 additions and 22 deletions

@ -33,7 +33,7 @@ public:
//TagRec m_tagRec;
TcpSocket *m_socket;
HttpRequest m_r;
char m_coll[50];
char m_coll[MAX_COLL_LEN+2];
//CollectionRec *m_cr;
bool m_isAdmin;
bool m_isLocal;
@ -136,7 +136,7 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
uint8_t langId = getLangIdFromAbbr ( langAbbr );
st->m_langId = langId;
}
strncpy ( st->m_coll , coll , 40 );
strncpy ( st->m_coll , coll , MAX_COLL_LEN+1 );
// store query for query highlighting
st->m_netTestResults = r->getLong ("rnettest", false );
if( st->m_netTestResults ) {
@ -179,14 +179,22 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
sreq.reset();
strcpy(sreq.m_url, url );
sreq.setDataSize();
xd->set4 ( &sreq , NULL , coll , NULL , st->m_niceness );
// this returns false if "coll" is invalid
if ( ! xd->set4 ( &sreq , NULL , coll , NULL , st->m_niceness ) )
goto hadSetError;
}
// . when getTitleRec() is called it will load the old one
// since XmlDoc::m_setFromTitleRec will be true
// . niceness is 0
else {
// use st->m_coll since XmlDoc just points to it!
xd->set3 ( docId , st->m_coll , 0 );
// . use st->m_coll since XmlDoc just points to it!
// . this returns false if "coll" is invalid
else if ( ! xd->set3 ( docId , st->m_coll , 0 ) ) {
hadSetError:
mdelete ( st , sizeof(State2) , "PageGet1" );
delete ( st );
g_errno = ENOMEM;
log("PageGet: set3: %s", mstrerror(g_errno));
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
}
// if it blocks while it loads title rec, it will re-call this routine
xd->setCallback ( st , processLoopWrapper );

@ -12916,34 +12916,43 @@ SafeBuf *XmlDoc::getTokenizedDiffbotReply ( ) {
// in order for us to do the array separation logic below.
// we don't want to do this logic for articles because they
// contain an image array!!!
char *needleA = "\"type\":\"product";
char *needleB = "\"type\":\"image";
char *productPtr = strstr ( text , needleA );
char *imagePtr = strstr ( text , needleB );
if ( ! productPtr && ! imagePtr ) {
// this must be on the FIRST level of the json object, otherwise
// we get errors because we got type:article and it
// contains an images array!
long valLen;
char *val = getJSONFieldValue ( text , "type", &valLen );
bool isProduct = false;
bool isImage = false;
if ( val && valLen == 7 && strncmp ( val , "product", 7) == 0 )
isProduct = true;
if ( val && valLen == 5 && strncmp ( val , "image", 5) == 0 )
isImage = true;
if ( ! isProduct && ! isImage ) {
m_tokenizedDiffbotReplyValid = true;
m_tokenizedDiffbotReplyPtr = &m_diffbotReply;
return m_tokenizedDiffbotReplyPtr;
}
char *needle1 = ",\"products\":[";
char *needle2 = ",\"images\":[";
char *parray = strstr ( text , needle1 );
char *pstart = NULL;
char *newTerm = NULL;
if ( parray ) {
// point to [
pstart = parray + 13 - 1;
char *needle;
char *newTerm;
if ( isProduct ) {
needle = ",\"products\":[";
newTerm = "product";
}
else {
parray = strstr ( text , needle2 );
// point to [
if ( parray ) pstart = parray + 11 - 1;
needle = ",\"images\":[";
newTerm = "image";
}
char *parray = strstr ( text , needle );
// if not found, no need to do anything...
if ( ! parray ) {
m_tokenizedDiffbotReplyValid = true;
@ -12951,6 +12960,10 @@ SafeBuf *XmlDoc::getTokenizedDiffbotReply ( ) {
return m_tokenizedDiffbotReplyPtr;
}
// point to [
char *pstart = parray + gbstrlen(needle) - 1;
//
// ok, now we have to do some json jujitsu to fix it
//
@ -43913,6 +43926,7 @@ char *getJSONFieldValue ( char *json , char *field , long *valueLen ) {
char *stringStart = NULL;
char *p = json;
bool gotOne = false;
long depth = 0;
// scan
for ( ; *p ; p++ ) {
// escaping a quote? ignore quote then.
@ -43921,6 +43935,11 @@ char *getJSONFieldValue ( char *json , char *field , long *valueLen ) {
p++;
continue;
}
// count {} depth
if ( ! inQuotes ) {
if ( *p == '{' ) depth++;
if ( *p == '}' ) depth--;
}
// a quote?
if ( *p == '\"' ) {
inQuotes = ! inQuotes;
@ -43932,6 +43951,8 @@ char *getJSONFieldValue ( char *json , char *field , long *valueLen ) {
else if ( ! inQuotes &&
! gotOne &&
p[1] == ':' &&
// {"title":"whatever",...}
depth == 1 &&
stringStart &&
(p - stringStart) == fieldLen &&
strncmp(field,stringStart,fieldLen)==0 ) {