now fix embedded products and images in the diffbot

json reply properly!
This commit is contained in:
Matt Wells
2013-11-14 12:51:34 -08:00
parent 28cd1e6490
commit be213ca28f
2 changed files with 151 additions and 164 deletions

@ -12850,6 +12850,139 @@ bool *XmlDoc::getRecycleDiffbotReply ( ) {
return &m_recycleDiffbotReply;
}
// . we now get the TOKENIZED diffbot reply.
// . that converts a single diffbot reply into multiple \0 separated
// json objects.
// . for instance, the diffbot product api returns an array like
// "products":[{...},{...}],"url":... that consists of multiple
// json product items, but the json elements that are not in
// this array are description of the page itself, like url and title.
// so we need to carry over these outter json objects to each
// inner json object we tokenize.
// . in this fashion we'll have separate objects that can each be indexed
// as a single page, which is what we want for searching.
SafeBuf *XmlDoc::getTokenizedDiffbotReply ( ) {
if ( m_tokenizedDiffbotReplyValid )
return m_tokenizedDiffbotReplyPtr;
SafeBuf *dbr = getDiffbotReply();
if ( ! dbr || dbr == (void *)-1 ) return dbr;
// empty? that's easy. might be just "{}\n" i guess
if ( dbr->length() <= 3 ) return dbr;
char *text = dbr->getBufStart();
char *needle1 = ",\"products\":[";
char *needle2 = ",\"images\":[";
char *parray = strstr ( text , needle1 );
char *pstart = NULL;
if ( parray ) {
// point to [
pstart = parray + 13 - 1;
}
else {
parray = strstr ( text , needle2 );
// point to [
if ( parray ) pstart = parray + 11 - 1;
}
// if not found, no need to do anything...
if ( ! parray ) {
m_tokenizedDiffbotReplyValid = true;
m_tokenizedDiffbotReplyPtr = &m_diffbotReply;
return m_tokenizedDiffbotReplyPtr;
}
//
// ok, now we have to do so json ju jitsu to fix it
//
// point to array. starting at the '['
char *p = pstart;
long brackets = 0;
for ( ; *p ; p++ ) {
if ( *p == '[' ) brackets++;
if ( *p != ']' ) continue;
brackets--;
// stop if array is done. p points to ']'
if ( brackets == 0 ) break;
}
// now point to outter items to the left of the ",\"products\":[...
char *left1 = dbr->getBufStart();
char *left2 = parray;
// then to the right. skip over the ending ']'
char *right1 = p + 1;
char *right2 = dbr->getBuf(); // end of the buffer
SafeBuf *tbuf = &m_tokenizedDiffbotReply;
// now scan the json products or images in the array
char *x = pstart;
// skip over [
x++;
// each product item in array is enclosed in {}'s
if ( *x != '{' ) {
log("build: something is wrong with diffbot reply");
g_errno = EBADENGINEER;
return NULL;
}
// reset CURLY bracket count
long curlies = 0;
char *xstart = NULL;
// scan now
for ( ; *x ; x++ ) {
if ( *x== '{' ) {
if ( curlies == 0 ) xstart = x;
curlies++;
continue;
}
if ( *x == '}' ) {
curlies--;
if ( curlies != 0 ) continue;
// unreciprocated '{'? wtf???
if ( ! xstart ) continue;
//
// ok, we got an item!
//
// left top items
if ( ! tbuf->safeMemcpy ( left1 , left2-left1 ) )
return NULL;
// use "product":
if ( ! tbuf->safePrintf(",\"product\":") )
return NULL;
// the item itself, include it's curlies.
if ( ! tbuf->safeMemcpy ( xstart , x - xstart+1 ) )
return NULL;
// right top items
if ( ! tbuf->safeMemcpy ( right1 , right2-right1 ) )
return NULL;
// then a \0
if ( ! tbuf->pushChar('\0') )
return NULL;
// reset this!
xstart = NULL;
}
}
// now show the items. debug!
//p = tbuf->getBufStart();
//for ( ; p < tbuf->getBuf() ; p += gbstrlen(p) + 1 )
// fprintf(stderr,"ITEM\n%s\n\n",p);
m_tokenizedDiffbotReplyPtr = tbuf;
m_tokenizedDiffbotReplyValid = true;
return m_tokenizedDiffbotReplyPtr;
}
// the diffbot reply will be a list of json objects we want to index
SafeBuf *XmlDoc::getDiffbotReply ( ) {
@ -19261,23 +19394,26 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
//
///////////
// . get the reply of json objects from diffbot
// . this will be empty if we are a json object!
// . will also be empty if not meant to be sent to diffbot
SafeBuf *dbr = getDiffbotReply();
if ( ! dbr || dbr == (void *)-1 ) return (char *)dbr;
// . the TOKENIZED reply consists of \0 separated json objects that
// we create from the original diffbot reply
SafeBuf *tdbr = getTokenizedDiffbotReply();
if ( ! tdbr || tdbr == (void *)-1 ) return (char *)tdbr;
long dbrLen = dbr->length();
long tdbrLen = tdbr->length();
// do not index json items as separate docs if we are page parser
if ( getIsPageParser() ) dbrLen = 0;
if ( getIsPageParser() ) tdbrLen = 0;
//
// if we got a json object or two from diffbot, index them
// as their own child xmldocs.
// watch out for reply from diffbot of "-1" indicating error!
//
if ( dbrLen > 3 ) {
if ( tdbrLen > 3 ) {
// make sure diffbot reply is valid for sure
if ( ! m_diffbotReplyValid ) { char *xx=NULL;*xx=0; }
// set status for this
@ -19291,22 +19427,10 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
return NULL;
}
mnew ( m_dx , sizeof(XmlDoc),"xmldocdx");
// init cursor to first json object
//m_diffbotObj = m_diffbotReply.getBufStart();
char *rp = m_diffbotReply.getBufStart();
// we now parse the array of products out of the
// diffbot reply. each product is an item/object.
m_diffbotObj = getFirstJSONObject ( rp ,
m_niceness ,
&m_isJsonProduct ,
&m_isJsonImage );
m_diffbotObj = tdbr->getBufStart();
m_diffbotJSONCount = 0;
// set end of it
m_diffbotObjEnd = getJSONObjectEnd ( m_diffbotObj,
m_niceness);
// temp null it
m_diffbotSavedChar = *m_diffbotObjEnd;
*m_diffbotObjEnd = '\0';
}
// loop back up here to process next json object from below
jsonloop:
@ -19337,32 +19461,6 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
sreq.m_fakeFirstIp = 1;
sreq.m_firstIp = firstIp;
// copy the content
m_tmpBuf.reset();
// how much
long clen = m_diffbotObjEnd - m_diffbotObj;
// include \0
long need = clen + 1;
// insert ,"type":"product" or
// possibly ,"type":"image" to make it kosher
need += 32;
// reserve the mem
if ( ! m_tmpBuf.reserve ( need ) )
return NULL;
// sanity
if ( m_diffbotObj[0] != '{' ) { char *xx=NULL;*xx=0;}
// copy first '{'
m_tmpBuf.pushChar(m_diffbotObj[0]);
// HACK: insert the type: thing here
if ( m_isJsonProduct )
m_tmpBuf.safePrintf("\"type\":\"product\",");
else if ( m_isJsonImage )
m_tmpBuf.safePrintf("\"type\":\"image\",");
// do the copy of the rest, title, etc.
m_tmpBuf.safeMemcpy ( m_diffbotObj+1 , clen-1 );
// null term
m_tmpBuf.nullTerm();
// set this
if (!m_dx->set4 ( &sreq ,
NULL ,
@ -19373,7 +19471,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// niceness of 0!!!!
m_niceness, // 1 ,
// inject this content
m_tmpBuf.getBufStart(), // content ,
m_diffbotObj,
false, // deleteFromIndex ,
0, // forcedIp ,
CT_JSON, // contentType ,
@ -19420,23 +19518,15 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// count as deleted
cr->m_localCrawlInfo.m_objectsAdded++;
cr->m_globalCrawlInfo.m_objectsAdded++;
// undo the \0 termination we did above
*m_diffbotObjEnd = m_diffbotSavedChar;
// we successfully index the json object, skip to next one
m_diffbotObj = m_diffbotObjEnd;
// advance to first '{'
for ( ; *m_diffbotObj && *m_diffbotObj!='{' ; m_diffbotObj++);
// point to next json object again
m_diffbotObjEnd = getJSONObjectEnd ( m_diffbotObj,m_niceness);
// re-save
m_diffbotSavedChar = *m_diffbotObjEnd;
m_diffbotObj += gbstrlen(m_diffbotObj) + 1;
// but gotta set this crap back
log("diffbot: resetting %s",m_dx->m_firstUrl.m_url);
// clear for next guy if there is one. clears
// m_dx->m_contentValid so the set4() can be called again above
m_dx->reset();
// if more json... this will not be \0
if ( *m_diffbotObj ) goto jsonloop;
// have we breached the buffer of json objects? if not, do more
if ( m_diffbotObj < tdbr->getBuf() ) goto jsonloop;
}
/////
@ -43720,107 +43810,7 @@ char *getJsonArrayEnd ( char *p ) {
return NULL;
}
// . the products and image types are listed as arrays in the json object.
// . so go to those first if there...
char *getFirstJSONObject ( char *p ,
long niceness ,
bool *isProduct ,
bool *isImage ) {
// do we have a "products": array?
char *needle = ",\"products\":[";
char *s = strstr(p,needle);
*isProduct = false;
*isImage = false;
// return ptr to first product if there
if ( s ) {
*isProduct = true;
// find ending ] and null term it!
char *start = s + gbstrlen(needle);
char *p = getJsonArrayEnd ( start );
if ( p ) *p = '\0';
return start;
}
QUICKPOLL ( niceness );
// images?
needle = ",\"images\":[";
s = strstr(p,needle);
// return ptr to first product if there
if ( s ) {
*isImage = true;
// find ending ] and null term it!
char *start = s + gbstrlen(needle);
char *p = getJsonArrayEnd ( start );
if ( p ) *p = '\0';
return start;
}
// default to just that json otherwise
return p;
}
// . advance p to skip over the json object it is pointing to and return
// ptr to the following json object
// . deal with nested {}'s
// . basically skips over current json object in a list of json objects to
// point to the next brother object
// .
char *getJSONObjectEnd ( char *p , long niceness ) {
// otherwise, *p must be {
for ( ; *p && *p != '{' ; p++ );
// empty?
if ( ! *p ) return p;
// count the nests
long nest = 0;
// skip first {
p++;
// keep track of in a quote or not
bool inQuotes = false;
// scan
for ( ; *p ; p++ ) {
// breathe
QUICKPOLL ( niceness );
// escaping a quote? ignore quote then.
if ( *p == '\\' && p[1] == '\"' ) {
// skip two bytes then..
p++;
continue;
}
// a quote?
if ( *p == '\"' ) {
inQuotes = ! inQuotes;
continue;
}
// if in a quote, ignore {} in there
if ( inQuotes ) continue;
// skip if no {}'s
if ( *p != '{' && *p !='}' ) continue;
// otherwise, check for { or }
if ( *p == '{' ) { nest++; continue; }
// otherwise, it must be a }
nest--;
// if we hit the } corresponding to the first }
// then stop!
if ( nest == -1 ) break;
}
// done?
if ( ! *p ) return p;
// must be this then
if ( *p != '}' ) { char *xx=NULL;*xx=0; }
// skip that
p++;
// skip til next {
//for ( ; *p && *p != '{' ; p++ );
// done
return p;
}
// this is still used by Title.cpp to get the title: field quickly
char *getJSONFieldValue ( char *json , char *field , long *valueLen ) {
// get length
long fieldLen = gbstrlen(field);

@ -1289,6 +1289,7 @@ class XmlDoc {
bool m_replyValid;
bool m_recycleDiffbotReplyValid;
bool m_diffbotReplyValid;
bool m_tokenizedDiffbotReplyValid;
//bool m_diffbotUrlCrawlPatternMatchValid;
//bool m_diffbotUrlProcessPatternMatchValid;
//bool m_diffbotPageProcessPatternMatchValid;
@ -1491,11 +1492,6 @@ class XmlDoc {
char m_calledMsg0b;
Url m_tmpUrl;
// hack stuff:
SafeBuf m_tmpBuf;
bool m_isJsonProduct;
bool m_isJsonImage;
SafeBuf m_tmpsb1;
SafeBuf m_tmpsb2;
SafeBuf m_turkBuf;
@ -1564,9 +1560,9 @@ class XmlDoc {
//
XmlDoc *m_dx;
char *m_diffbotObj;
char *m_diffbotObjEnd;
char m_diffbotSavedChar;
SafeBuf m_diffbotReply;
SafeBuf *m_tokenizedDiffbotReplyPtr;
SafeBuf m_tokenizedDiffbotReply;
long m_diffbotReplyError;
bool m_recycleDiffbotReply;
//bool m_diffbotUrlCrawlPatternMatch;
@ -1578,6 +1574,7 @@ class XmlDoc {
SafeBuf m_diffbotApiUrl;
bool *getRecycleDiffbotReply ( ) ;
SafeBuf *getTokenizedDiffbotReply ( ) ;
SafeBuf *getDiffbotReply ( ) ;
//bool doesUrlMatchDiffbotCrawlPattern() ;
//bool doesUrlMatchDiffbotProcessPattern() ;