forked from Mirrors/privacore-open-source-search-engine
Fix JSON double-decoding issue. No more
partial decodes; the JSON parser now stores the fully decoded string in a separate buffer.
This commit is contained in:
12
Json.cpp
12
Json.cpp
@ -61,7 +61,7 @@ JsonItem *Json::getItem ( char *name ) {
|
||||
|
||||
#include "Mem.h" // gbstrlen()
|
||||
|
||||
JsonItem *Json::parseJsonStringIntoJsonItems ( char *json ) {
|
||||
JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , long niceness ) {
|
||||
|
||||
m_prev = NULL;
|
||||
|
||||
@ -228,7 +228,8 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json ) {
|
||||
// get length decoded
|
||||
long curr = m_sb.length();
|
||||
// store decoded string right after jsonitem
|
||||
if ( !m_sb.safeDecodeJSONToUtf8 ( str, slen,0))
|
||||
if ( !m_sb.safeDecodeJSONToUtf8 (str,slen,
|
||||
niceness ))
|
||||
return NULL;
|
||||
// store length decoded json
|
||||
ji->m_valueLen = m_sb.length() - curr;
|
||||
@ -261,7 +262,7 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json ) {
|
||||
ji->m_valueDouble = 0;
|
||||
}
|
||||
// store decoded string right after jsonitem
|
||||
if ( !m_sb.safeDecodeJSONToUtf8 (p,slen,0))
|
||||
if ( !m_sb.safeDecodeJSONToUtf8 (p,slen,niceness))
|
||||
return NULL;
|
||||
// store length decoded json
|
||||
ji->m_valueLen = m_sb.length() - curr;
|
||||
@ -304,7 +305,7 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json ) {
|
||||
// copy the number as a string as well
|
||||
long curr = m_sb.length();
|
||||
// store decoded string right after jsonitem
|
||||
if ( !m_sb.safeDecodeJSONToUtf8 ( str, slen,0))
|
||||
if ( !m_sb.safeDecodeJSONToUtf8 ( str, slen,niceness))
|
||||
return NULL;
|
||||
// store length decoded json
|
||||
ji->m_valueLen = m_sb.length() - curr;
|
||||
@ -344,7 +345,8 @@ void Json::test ( ) {
|
||||
"in 2010\",\"18083009\":\"Apple personal digital assistants\",\"23475157\":\"Touchscreen portable media players\",\"30107877\":\"IPad\",\"9301031\":\"Apple Inc. hardware\",\"27765345\":\"IOS (Apple)\",\"26588084\":\"Tablet computers\"},\"type\":1,\"senseRank\":1,\"variety\":0.49056603773584906,\"depth\":0.5882352941176471},{\"id\":18839,\"positions\":[[1945,1950],[2204,2209]],\"name\":\"Music\",\"score\":0.7,\"contentMatch\":1,\"categories\":{\"991222\":\"Performing arts\",\"693016\":\"Entertainment\",\"691484\":\"Music\"},\"type\":1,\"senseRank\":1,\"variety\":0.22264150943396221,\"depth\":0.7058823529411764}],\"media\":[{\"pixelHeight\":350,\"link\":\"http://www.onlinemba.com/wp-content/uploads/2013/02/apple-innovates-invert-350x350.png\",\"primary\":\"true\",\"pixelWidth\":350,\"type\":\"image\"}]}";
|
||||
|
||||
|
||||
JsonItem *ji = parseJsonStringIntoJsonItems ( json );
|
||||
long niceness = 0;
|
||||
JsonItem *ji = parseJsonStringIntoJsonItems ( json , niceness );
|
||||
|
||||
// print them out?
|
||||
log("json: type0=%li",(long)ji->m_type);
|
||||
|
2
Json.h
2
Json.h
@ -63,7 +63,7 @@ class Json {
|
||||
|
||||
void test();
|
||||
|
||||
JsonItem *parseJsonStringIntoJsonItems ( char *json );
|
||||
JsonItem *parseJsonStringIntoJsonItems ( char *json , long niceness );
|
||||
|
||||
JsonItem *getFirstItem ( ) ;
|
||||
|
||||
|
19
Make.depend
19
Make.depend
@ -354,7 +354,7 @@ Collectiondb.o: Collectiondb.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
IndexTable2.h Msg51.h Msg17.h IndexReadInfo.h Msg3a.h Stats.h \
|
||||
PostQueryRerank.h Sanity.h SiteGetter.h Title.h Address.h zlib.h zconf.h \
|
||||
HttpMime.h Users.h Pages.h HttpServer.h TcpServer.h openssl/err.h \
|
||||
PageCrawlBot.h Statsdb.h Process.h Msg28.h Cachedb.h Syncdb.h PageTurk.h
|
||||
PageCrawlBot.h Statsdb.h Process.h Msg28.h Cachedb.h Syncdb.h
|
||||
CollectionRec.o: CollectionRec.cpp gb-include.h types.h fctypes.h \
|
||||
Unicode.h UnicodeProperties.h UCPropTable.h iconv.h UCNormalizer.h \
|
||||
hash.h Errno.h Log.h CollectionRec.h Url.h ip.h Parms.h Xml.h XmlNode.h \
|
||||
@ -374,7 +374,11 @@ CollectionRec.o: CollectionRec.cpp gb-include.h types.h fctypes.h \
|
||||
RdbBuckets.h RdbCache.h Msg5.h Msg3.h RdbMerge.h Dir.h PingServer.h \
|
||||
HttpServer.h TcpServer.h openssl/err.h MsgC.h UdpServer.h UdpSlot.h \
|
||||
UdpProtocol.h Dns.h DnsProtocol.h Multicast.h Threads.h HttpMime.h \
|
||||
Datedb.h Indexdb.h DiskPageCache.h Titledb.h Timedb.h
|
||||
Datedb.h Indexdb.h DiskPageCache.h Titledb.h Timedb.h Spider.h Msg4.h \
|
||||
Msg1.h Msg0.h Clusterdb.h Linkdb.h Msg2.h Query.h Msg20.h Summary.h \
|
||||
matches2.h Words.h StopWords.h Bits.h Pos.h Matches.h HashTableT.h \
|
||||
Domains.h CountryCode.h Tagdb.h Events.h Sections.h IndexList.h Dates.h \
|
||||
Msg22.h CatRec.h Categories.h Catdb.h
|
||||
Conf.o: Conf.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
UnicodeProperties.h UCPropTable.h iconv.h UCNormalizer.h hash.h Errno.h \
|
||||
Log.h Conf.h Xml.h XmlNode.h Lang.h Iso8859.h iana_charset.h File.h \
|
||||
@ -668,6 +672,7 @@ Entities.o: Entities.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
Errno.o: Errno.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
UnicodeProperties.h UCPropTable.h iconv.h UCNormalizer.h hash.h Errno.h \
|
||||
Log.h
|
||||
errnotest.o: errnotest.cpp
|
||||
Facebook.o: Facebook.cpp Facebook.h Conf.h Xml.h XmlNode.h gb-include.h \
|
||||
types.h fctypes.h Unicode.h UnicodeProperties.h UCPropTable.h iconv.h \
|
||||
UCNormalizer.h hash.h Errno.h Log.h Lang.h Iso8859.h iana_charset.h \
|
||||
@ -1349,10 +1354,10 @@ main.o: main.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
Msge0.h Msge1.h Msg8b.h SearchInput.h Msg40.h Msg39.h Msg37.h TopTree.h \
|
||||
IndexTable2.h Msg51.h Msg17.h Msg3a.h PostQueryRerank.h Sanity.h \
|
||||
SiteGetter.h Title.h Address.h DailyMerge.h Speller.h Language.h Wiki.h \
|
||||
Wiktionary.h Scraper.h Msg2a.h Msg9b.h Msg35.h Msg30.h Msg3e.h \
|
||||
PageNetTest.h AutoBan.h TuringTest.h Msg1f.h Profiler.h Blaster.h \
|
||||
Proxy.h linkspam.h sort.h Ads.h LanguagePages.h ValidPointer.h Placedb.h \
|
||||
Test.h seo.h Json.h
|
||||
Wiktionary.h Scraper.h Msg2a.h Msg9b.h Msg35.h Msg3e.h PageNetTest.h \
|
||||
AutoBan.h TuringTest.h Msg1f.h Profiler.h Blaster.h Proxy.h linkspam.h \
|
||||
sort.h Ads.h LanguagePages.h ValidPointer.h Placedb.h Test.h seo.h \
|
||||
Json.h
|
||||
matches2.o: matches2.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
UnicodeProperties.h UCPropTable.h iconv.h UCNormalizer.h hash.h Errno.h \
|
||||
Log.h matches2.h Titledb.h Rdb.h RdbBase.h Conf.h Xml.h XmlNode.h Lang.h \
|
||||
@ -2694,7 +2699,7 @@ PageResults.o: PageResults.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
Highlight.h AutoBan.h TuringTest.h sort.h LanguageIdentifier.h \
|
||||
LanguagePages.h LangList.h XmlDoc.h Phrases.h Images.h Msg13.h Msge0.h \
|
||||
Msge1.h Msg8b.h SiteGetter.h Title.h Address.h Spider.h PageResults.h \
|
||||
Proxy.h
|
||||
Proxy.h Json.h
|
||||
PageRoot.o: PageRoot.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
UnicodeProperties.h UCPropTable.h iconv.h UCNormalizer.h hash.h Errno.h \
|
||||
Log.h Indexdb.h Rdb.h RdbBase.h Conf.h Xml.h XmlNode.h Lang.h Iso8859.h \
|
||||
|
@ -4806,6 +4806,8 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) {
|
||||
if ( ! nameTable.set ( 8,4,2048,nbuf,27000,false,0,"ntbuf") )
|
||||
return false;
|
||||
|
||||
long niceness = 0;
|
||||
|
||||
// . scan every fucking json item in the search results.
|
||||
// . we still need to deal with the case when there are so many
|
||||
// search results we have to dump each msg20 reply to disk in
|
||||
@ -4824,7 +4826,7 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) {
|
||||
|
||||
// parse it up
|
||||
Json jp;
|
||||
jp.parseJsonStringIntoJsonItems ( json );
|
||||
jp.parseJsonStringIntoJsonItems ( json , niceness );
|
||||
|
||||
// scan each json item
|
||||
for ( JsonItem *ji = jp.getFirstItem(); ji ; ji = ji->m_next ){
|
||||
@ -4913,9 +4915,11 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) {
|
||||
// returns false and sets g_errno on error
|
||||
bool printJsonItemInCSV ( char *json , SafeBuf *sb , State0 *st ) {
|
||||
|
||||
long niceness = 0;
|
||||
|
||||
// parse the json
|
||||
Json jp;
|
||||
jp.parseJsonStringIntoJsonItems ( json );
|
||||
jp.parseJsonStringIntoJsonItems ( json , niceness );
|
||||
|
||||
HashTableX *columnTable = &st->m_columnTable;
|
||||
long numCSVColumns = st->m_numCSVColumns;
|
||||
|
@ -2535,8 +2535,7 @@ bool SafeBuf::decodeJSON ( long niceness ) {
|
||||
// decode quotation marks as well then set decodeAll to TRUE!
|
||||
bool SafeBuf::safeDecodeJSONToUtf8 ( char *json,
|
||||
long jsonLen,
|
||||
long niceness,
|
||||
bool decodeAll ) {
|
||||
long niceness ) {
|
||||
|
||||
// how much space to reserve for the copy?
|
||||
long need = jsonLen;
|
||||
@ -2602,7 +2601,7 @@ bool SafeBuf::safeDecodeJSONToUtf8 ( char *json,
|
||||
// the doc so we can preserve json names/value pair
|
||||
// information for indexing purposes. however,
|
||||
// Title.cpp DOES want to decode quotations.
|
||||
if ( src[1] == '\"' && decodeAll ) {
|
||||
if ( src[1] == '\"' ) { // && decodeAll ) {
|
||||
*dst++ = '\"';
|
||||
src += 2;
|
||||
continue;
|
||||
|
@ -60,8 +60,8 @@ struct SafeBuf {
|
||||
bool safeTruncateEllipsis ( char *src , long maxLen );
|
||||
bool convertJSONtoXML ( long niceness , long startConvertPos );
|
||||
|
||||
bool safeDecodeJSONToUtf8 ( char *json, long jsonLen, long niceness,
|
||||
bool decodeAll = false );
|
||||
bool safeDecodeJSONToUtf8 ( char *json, long jsonLen, long niceness);
|
||||
// bool decodeAll = false );
|
||||
|
||||
bool decodeJSONToUtf8 ( long niceness );
|
||||
bool decodeJSON ( long niceness );
|
||||
|
@ -118,8 +118,8 @@ bool Title::setTitle ( XmlDoc *xd ,
|
||||
char *jt;
|
||||
jt = getJSONFieldValue(xd->ptr_utf8Content,"title",&vlen);
|
||||
if ( jt && vlen > 0 ) {
|
||||
jsonTitle.safeDecodeJSONToUtf8 (jt, vlen, m_niceness,
|
||||
true ); // decodeAll?
|
||||
jsonTitle.safeDecodeJSONToUtf8 (jt, vlen, m_niceness);
|
||||
//true ); // decodeAll?
|
||||
jsonTitle.nullTerm();
|
||||
val = jsonTitle.getBufStart();
|
||||
}
|
||||
|
23
XmlDoc.cpp
23
XmlDoc.cpp
@ -12794,8 +12794,16 @@ void gotDiffbotReplyWrapper ( void *state , TcpSocket *s ) {
|
||||
// au->length() );
|
||||
//THIS->m_diffbotReply.pushChar('\n');
|
||||
// convert the \u1f23 to utf8 (\n and \r as well)
|
||||
THIS->m_diffbotReply.safeDecodeJSONToUtf8 ( page , pageLen ,
|
||||
THIS->m_niceness );
|
||||
// crap, this decodes \\\\\" to \\" which is causing
|
||||
// the json parser to believe it is an encoded \ then
|
||||
// a REAL quote... but quote is contained...
|
||||
//THIS->m_diffbotReply.safeDecodeJSONToUtf8 ( page , pageLen ,
|
||||
// THIS->m_niceness );
|
||||
|
||||
// do not do that any more then, jsonparse can call it
|
||||
// on a per string basis
|
||||
THIS->m_diffbotReply.safeMemcpy ( page , pageLen );
|
||||
|
||||
// convert embedded \0 to space
|
||||
//char *p = THIS->m_diffbotReply.getBufStart();
|
||||
//char *pend = p + THIS->m_diffbotReply.getLength();
|
||||
@ -29774,11 +29782,11 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
|
||||
//
|
||||
SafeBuf *dbr = getDiffbotReply();
|
||||
if ( dbr->length() ) {
|
||||
sb->safePrintf("<b>START PARTIALLY-DECODED DIFFBOT REPLY</b><br>\n");
|
||||
sb->safePrintf("<b>START EXACT DIFFBOT REPLY</b><br>\n");
|
||||
sb->safePrintf("<pre>");
|
||||
sb->safeMemcpy ( dbr );
|
||||
sb->safePrintf("</pre>");
|
||||
sb->safePrintf("<b>END DIFFBOT REPLY</b><br><br>\n");
|
||||
sb->safePrintf("<b>END EXACT DIFFBOT REPLY</b><br><br>\n");
|
||||
}
|
||||
|
||||
//
|
||||
@ -43999,7 +44007,7 @@ char *XmlDoc::hashJSON ( HashTableX *table ) {
|
||||
// use new json parser
|
||||
Json jp;
|
||||
// returns NULL and sets g_errno on error
|
||||
if ( ! jp.parseJsonStringIntoJsonItems ( p ) ) {
|
||||
if ( ! jp.parseJsonStringIntoJsonItems ( p , m_niceness ) ) {
|
||||
g_errno = EBADJSONPARSER;
|
||||
return NULL;
|
||||
}
|
||||
@ -44068,6 +44076,11 @@ char *XmlDoc::hashJSON ( HashTableX *table ) {
|
||||
hi.m_hashGroup = HASHGROUP_INTAG;
|
||||
if ( strstr(name,"meta") == 0 )
|
||||
hi.m_hashGroup = HASHGROUP_INMETATAG;
|
||||
//
|
||||
// now Json.cpp decodes and stores the value into
|
||||
// a buffer, so ji->getValue() should be decoded completely
|
||||
//
|
||||
|
||||
// index like "title:whatever"
|
||||
hi.m_prefix = name;
|
||||
hashString ( ji->getValue(),ji->getValueLen() , &hi );
|
||||
|
@ -55,7 +55,7 @@ num-mirrors: 0
|
||||
# The working directory is the last string on each line. That is where the
|
||||
# 'gb' binary resides.
|
||||
#
|
||||
0 5998 7000 8000 9000 127.0.0.1 127.0.0.1 /home/mwells/github/
|
||||
0 5999 7001 8001 9001 127.0.0.1 127.0.0.1 /home/mwells/bugfixes/
|
||||
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user