Add const for some Json encode/decode methods (used in unit test)

2015-11-12 12:10:39 +01:00
parent 64874b91ee
commit d43bc2d92b
12 changed files with 86 additions and 231 deletions
--- a/Json.cpp
+++ b/Json.cpp
@ -64,7 +64,7 @@ JsonItem *Json::getItem ( char *name ) {

 #include "Mem.h" // gbstrlen()

-JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , int32_t niceness ) {
+JsonItem *Json::parseJsonStringIntoJsonItems (const char *json , int32_t niceness ) {

 	m_prev = NULL;

@ -76,7 +76,7 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , int32_t niceness ) {
 	if ( ! json ) return NULL;

 	// how much space will we need to avoid any reallocs?
-	char *p = json;
+	const char *p = json;
 	bool inQuote = false;
 	int32_t need = 0;
 	for ( ; *p ; p++ ) {
@ -121,7 +121,7 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , int32_t niceness ) {
 	// reset p
 	p = json;
 	// json maybe bad utf8 causing us to miss the \0 char, so use "pend"
-	char *pend = json + gbstrlen(json);
+	const char *pend = json + gbstrlen(json);

 	// scan
 	for ( ; p < pend ; p += size ) {
@ -210,7 +210,7 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , int32_t niceness ) {
 		// a quote?
 		if ( *p == '\"' ) {
 			// find end of quote
-			char *end = p + 1;
+			const char *end = p + 1;
 			for ( ; *end ; end++ ) {
 				// skip two chars if escaped
 				if ( *end == '\\' && end[1] ) {
@ -221,11 +221,11 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , int32_t niceness ) {
 				if ( *end == '\"' ) break;
 			}
 			// field?
-			char *x = end + 1;
+			const char *x = end + 1;
 			// skip spaces
 			for ( ; *x && is_wspace_a(*x) ; x++ );
 			// define the string
-			char *str  = p + 1;
+			const char *str  = p + 1;
 			int32_t  slen = end - str;
 			// . if a colon follows, it was a field
 			if ( *x == ':' ) {
@ -330,12 +330,12 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , int32_t niceness ) {
 		     // like .123 ?
 		     ( *p == '.' && is_digit(p[1]) ) ) {
 			// find end of the number
-			char *end = p + 1;
+			const char *end = p + 1;
 			// . allow '.' for decimal numbers
 			// . TODO: allow E for exponent
 			for ( ; *end && (is_digit(*end) || *end=='.');end++) ;
 			// define the string
-			char *str  = p;
+			const char *str  = p;
 			int32_t  slen = end - str;
 			// make a new one
 			ji = addNewItem();
--- a/Json.h
+++ b/Json.h
@ -42,14 +42,14 @@ class JsonItem {
 	// for JT_String
 	int32_t m_valueLen;

-	char *m_valueArray;
+	const char *m_valueArray;

 	// for JT_String
-	int32_t  getValueLen() { return m_valueLen; };
+	int32_t  getValueLen() { return m_valueLen; }

 	// for arrays (JT_ARRAY), hack the char ptr into m_valueLong
-	char *getArrayStart() { return m_valueArray;}; //(char *)m_valueLong; }
-	int32_t  getArrayLen  () { return m_valueLen; };
+	const char *getArrayStart() { return m_valueArray;}
+	int32_t  getArrayLen  () { return m_valueLen; }

 	// for JT_String
 	char *getValue () { 
@ -74,7 +74,7 @@ class JsonItem {
 class Json {
 public:

-	JsonItem *parseJsonStringIntoJsonItems ( char *json , int32_t niceness );
+	JsonItem *parseJsonStringIntoJsonItems ( const char *json , int32_t niceness );

 	bool printToString(SafeBuf& out);

--- a/SafeBuf.cpp
+++ b/SafeBuf.cpp
@ -144,7 +144,7 @@ bool SafeBuf::safePrintf(char *formatString , ...) {
 }


-bool SafeBuf::safeMemcpy(char *s, int32_t len) {
+bool SafeBuf::safeMemcpy(const char *s, int32_t len) {
 	// put a silent \0 at the end
 	//int32_t tmp = len + m_length+1;
 	//if(tmp >= m_capacity ) {
@ -2425,7 +2425,7 @@ bool SafeBuf::decodeJSON ( int32_t niceness ) {
 // . SO we do keep \" 
 // . so when indexing a doc we set decodeAll to FALSE, but if you want to 
 //   decode quotation marks as well then set decodeAll to TRUE!
-bool SafeBuf::safeDecodeJSONToUtf8 ( char *json, 
+bool SafeBuf::safeDecodeJSONToUtf8 ( const char *json,
 				     int32_t jsonLen, 
 				     int32_t niceness ) {

@ -2433,8 +2433,8 @@ bool SafeBuf::safeDecodeJSONToUtf8 ( char *json,
 	int32_t need = jsonLen;

 	// count how many \u's we got
-	char *p = json;//m_buf;
-	char *pend = json + jsonLen;
+	const char *p = json;//m_buf;
+	const char *pend = json + jsonLen;
 	for ( ; p < pend ; p++ ) 
 		// for the 'x' and the ';'
 		if ( *p == '\\' && p[1] == 'u' ) need += 2;
@ -2443,8 +2443,8 @@ bool SafeBuf::safeDecodeJSONToUtf8 ( char *json,
 	//SafeBuf dbuf;
 	if ( ! reserve ( need + 1) ) return false;

-	char *src = json;//m_buf;
-	char *srcEnd = json + jsonLen;
+	const char *src = json;//m_buf;
+	const char *srcEnd = json + jsonLen;

 	char *dst = m_buf + m_length;

@ -2507,7 +2507,7 @@ bool SafeBuf::safeDecodeJSONToUtf8 ( char *json,
 				continue; 
 			}
 			// otherwise, decode. can do in place like this...
-			char *p = src + 2;
+			const char *p = src + 2;
 			// skip the /ug or /ugg or /uggg or /ugggg in its
 			// entirety i guess... to avoid infinite loop
 			if ( ! is_hex(p[0]) ) { src +=2; continue;}
@ -2551,7 +2551,7 @@ bool SafeBuf::jsonEncode ( char *src , int32_t srcLen ) {
 }

 // encode into json
-bool SafeBuf::safeUtf8ToJSON ( char *utf8 ) {
+bool SafeBuf::safeUtf8ToJSON ( const char *utf8 ) {

 	if ( ! utf8 ) return true;

@ -2560,7 +2560,7 @@ bool SafeBuf::safeUtf8ToJSON ( char *utf8 ) {
 	int32_t need = gbstrlen(utf8) * 2 + 1;
 	if ( ! reserve ( need ) ) return false;
 	// scan and copy
-	char *src = utf8;
+	const char *src = utf8;
 	// concatenate to what's already there
 	char *dst = m_buf + m_length;
 	for ( ; *src ; src++ ) {
--- a/SafeBuf.h
+++ b/SafeBuf.h
@ -89,9 +89,7 @@ public:

 	bool convertJSONtoXML ( int32_t niceness , int32_t startConvertPos );

-	bool safeDecodeJSONToUtf8 ( char *json, int32_t jsonLen, 
-				    int32_t niceness);
-	//			    bool decodeAll = false );
+	bool safeDecodeJSONToUtf8 ( const char *json, int32_t jsonLen, int32_t niceness);

 	bool decodeJSONToUtf8 ( int32_t niceness );
 	bool decodeJSON ( int32_t niceness );
@ -122,14 +120,14 @@ public:
 	bool  safePrintf(char *formatString, ...);
 #endif
 	bool  safeMemcpy(void *s, int32_t len){return safeMemcpy((char *)s,len);};
-	bool  safeMemcpy(char *s, int32_t len);
+	bool  safeMemcpy(const char *s, int32_t len);
 	bool  safeMemcpy_nospaces(char *s, int32_t len);
 	bool  safeMemcpy(SafeBuf *c){return safeMemcpy(c->m_buf,c->m_length);};
 	bool  safeMemcpy ( class Words *w , int32_t a , int32_t b ) ;
 	bool  safeStrcpy ( char *s ) ;
 	//bool  safeStrcpyPrettyJSON ( char *decodedJson ) ;
-	bool  safeUtf8ToJSON ( char *utf8 ) ;
-	bool jsonEncode ( char *utf8 ) { return safeUtf8ToJSON(utf8); };
+	bool  safeUtf8ToJSON ( const char *utf8 ) ;
+	bool jsonEncode ( const char *utf8 ) { return safeUtf8ToJSON(utf8); }
 	bool jsonEncode ( char *utf8 , int32_t utf8Len );

 	bool  csvEncode ( char *s , int32_t len , int32_t niceness = 0 );
--- a/Unicode.h
+++ b/Unicode.h
@ -62,7 +62,7 @@ inline char getUtf8CharSize ( uint8_t *p ) {
 		return bytes_in_utf8_code[c];
 }

-inline char getUtf8CharSize ( char *p ) {
+inline char getUtf8CharSize ( const char *p ) {
 	uint8_t c = (uint8_t)*p;
 	if(c<128)
 		return 1;
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -15971,7 +15971,7 @@ SafeBuf *XmlDoc::getTokenizedDiffbotReply ( ) {
 	}

 	JsonItem *jsonItem = jp.getItem("objects");
-	char *array = NULL;
+	const char *array = NULL;
 	int32_t arrayLen = 0;
 	if ( jsonItem ) {
 		array = jsonItem->getArrayStart();
--- a/fctypes.cpp
+++ b/fctypes.cpp
@ -993,8 +993,8 @@ int64_t htoint32_tint32_t ( const char *s, int32_t len ) {
 }

 // convert hex ascii string into binary at "dst"
-void hexToBin ( char *src , int32_t srcLen , char *dst ) {
-	char *srcEnd = src + srcLen;
+void hexToBin ( const char *src , int32_t srcLen , char *dst ) {
+	const char *srcEnd = src + srcLen;
 	for ( ; src && src < srcEnd ; ) {
 		*dst  = htob(*src++);
 		*dst <<= 4;
--- a/fctypes.h
+++ b/fctypes.h
@ -92,7 +92,7 @@ bool is_urlchar(char s);
 int32_t htob ( char s ) ;
 char btoh ( char s ) ;
 // convert hex ascii string into binary
-void hexToBin ( char *src , int32_t srcLen , char *dst );
+void hexToBin ( const char *src , int32_t srcLen , char *dst );
 // convert binary number of size srcLen bytes into hex string in "dst"
 void binToHex ( unsigned char *src , int32_t srcLen , char *dst );

--- a/main.cpp
+++ b/main.cpp
@ -1323,21 +1323,6 @@ int main2 ( int argc , char *argv[] ) {
 		
 	}

-	/*
-	//  test json parser error with bad json
-	Json jp;
-	char xxx[1024];
-	//sprintf(xxx,"\"categories\":[\"shop\"");
-	sprintf(xxx,"\"too small\"");
-	jp.parseJsonStringIntoJsonItems(xxx,0);
-	JsonItem *ji = jp.getFirstItem();
-	for ( ; ji ; ji = ji->m_next ) {
-		if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
-			continue;
-	}
-	*/
-
-
 	/*
 	if ( strcmp ( cmd , "querytest" ) == 0){
 		if ( ! g_hostdb.init(hostsConf, hostId) ) {
--- a/test/unit/JsonTest.cpp
+++ b/test/unit/JsonTest.cpp
@ -1,20 +1,19 @@
 #include "gtest/gtest.h"

 #include "Json.h"
+#include "SafeBuf.h"

 TEST(JsonTest, ParseValid) {
 	Json json;

-	char *json_input = "{\"tags\":[\"Apple Inc.\",\"Symbian\",\"IPad\",\"Music\"],\"summary\":\"Good timing and shrewd planning have played as much of a role as innovative thinking for the Silicon Valley juggernaut.\",\"icon\":\"http://www.onlinemba.com/wp-content/themes/onlinemba/assets/img/ico/apple-touch-icon.png\",\"text\":\"How did Apple rise through the ranks to become the world’s most profitable tech company? As it turns out, good timing and shrewd planning have played as much of a role as innovative thinking for the Silicon Valley juggernaut.For example, take the first MP3 player — MPMan, produced by South Korea-based SaeHan Information Systems. MPMan appeared in 1998, three years before the first iPods were released. As the original pioneer of portable MP3 player technology, SaeHan spent a good deal of time in court negotiating terms of use with various record companies. By 2001, a clear legal precedent was set for MP3 access — allowing Apple to focus less on courtroom proceedings and more on cutting-edge marketing campaigns for their new product."
+	const char *json_input = "{\"tags\":[\"Apple Inc.\",\"Symbian\",\"IPad\",\"Music\"],\"summary\":\"Good timing and shrewd planning have played as much of a role as innovative thinking for the Silicon Valley juggernaut.\",\"icon\":\"http://www.onlinemba.com/wp-content/themes/onlinemba/assets/img/ico/apple-touch-icon.png\",\"text\":\"How did Apple rise through the ranks to become the world’s most profitable tech company? As it turns out, good timing and shrewd planning have played as much of a role as innovative thinking for the Silicon Valley juggernaut.For example, take the first MP3 player — MPMan, produced by South Korea-based SaeHan Information Systems. MPMan appeared in 1998, three years before the first iPods were released. As the original pioneer of portable MP3 player technology, SaeHan spent a good deal of time in court negotiating terms of use with various record companies. By 2001, a clear legal precedent was set for MP3 access — allowing Apple to focus less on courtroom proceedings and more on cutting-edge marketing campaigns for their new product."
 		"When all else fails, they buy it: While iPads had fan boys salivating in the streets –the technology has been around for decades. One of the most obvious precursors to the iPad is FingerWorks, a finger gesture operated keyboard with a mouse very similar to Apple’s iPad controller. Fingerworks was bought in 2005 by none other than Apple – not surprisingly a couple years before the release of the iPhone and later the iPad.		 Of course, this isn’t to say that Apple doesn’t deserve to be the most valuable tech company in the world – just that innovation isn’t always about being first or best, sometimes, it’s just perception.\",\"stats\":{\"fetchTime\":2069,\"confidence\":\"0.780\"},\"type\":\"article\",\"meta\":{\"twitter\":{\"twitter:creator\":\"@germanny\",\"twitter:domain\":\"OnlineMBA.com\",\"twitter:card\":\"summary\",\"twitter:site\":\"@OnlineMBA_com\"},\"microdata\":{\"itemprop:image\":\"http://www.onlinemba.com/wp-content/uploads/2013/02/apple-innovates-featured-150x150.png\"},\"title\":\"3 Ways Apple Actually Innovates - OnlineMBA.com\",\"article:publisher\":\"https://www.facebook.com/OnlineMBAcom\",\"fb:app_id\":\"274667389269609\",\"og\":{\"og:type\":\"article\",\"og:title\":\"3 Ways Apple Actually Innovates - OnlineMBA.com\",\"og:description\":\"Good timing and shrewd planning have played as much of a role as innovative thinking for the Silicon Valley juggernaut.\",\"og:site_name\":\"OnlineMBA.com\",\"og:image\":\"http://www.onlinemba.com/wp-content/uploads/2013/02/apple-innovates-featured-150x150.png\",\"og:locale\":\"en_US\",\"og:url\":\"http://www.onlinemba.com/blog/3-ways-apple-innovates\"}},\"human_language\":\"en\",\"url\":\"http://www.onlinemba.com/blog/3-ways-apple-innovates\",\"title\":\"3 Ways Apple Actually Innovates\",\"textAnalysis\":{\"error\":\"Timeout during text analysis\"},\"html\":\"<div><div class=\\\"image_frame\\\"><img data-blend-adjustment=\\\"http://www.onlinemba.com/wp-content/themes/onlinemba/assets/img/backgrounds/bg.gif\\\" 
data-blend-mode=\\\"screen\\\" src=\\\"http://www.onlinemba.com/wp-content/uploads/2013/02/apple-innovates-invert-350x350.png\\\"></img></div><p>How did Apple rise"
 		"\",\"supertags\":[{\"id\":856,\"positions\":[[7,12],[41,46],[663,668],[776,781],[1188,1193],[1380,1385],[1645,1650],[1841,1848],[2578,2583],[2856,2863],[2931,2936]],\"name\":\"Apple Inc.\",\"score\":0.8,\"contentMatch\":1,\"categories\":{\"1752615\":\"Home computer hardware companies\",\"27841529\":\"Technology companies of the United States\",\"33847259\":\"Publicly traded companies of the United States\",\"15168154\":\"Mobile phone manufacturers\",\"732736\":\"Retail companies of the United States\",\"9300270\":\"Apple Inc.\",\"23568549\":\"Companies based in Cupertino, "
 		"California\",\"34056227\":\"Article Feedback 5\",\"37595560\":\"1976 establishments in California\",\"7415072\":\"Networking hardware companies\",\"699547\":\"Computer hardware companies\",\"37191508\":\"Software companies based in the San Francisco Bay Area\",\"855278\":\"Electronics companies\",\"5800057\":\"Steve Jobs\",\"7652766\":\"Display technology companies\",\"14698378\":\"Warrants issued in Hong Kong Stock Exchange\",\"4478067\":\"Portable audio player manufacturers\",\"31628257\":\"Multinational companies headquartered in the United States\",\"732825\":\"Electronics companies of the United States\",\"733759\":\"Computer companies of the United States\",\"6307421\":\"Companies established in 1976\"},\"type\":1,\"senseRank\":1,\"variety\":0.21886792452830184,\"depth\":0.6470588235294117},{\"id\":25686223,\"positions\":[[895,902],[2318,2325]],\"name\":\"Symbian\",\"score\":"
 		"0.8,\"contentMatch\":0.9162303664921466,\"categories\":{\"33866248\":\"Nokia platforms\",\"20290726\":\"Microkernel-based operating systems\",\"39774425\":\"ARM operating systems\",\"2148723\":\"Real-time operating systems\",\"953043\":\"Smartphones\",\"10817505\":\"History of software\",\"17862682\":\"Mobile phone operating systems\",\"33569166\":\"Accenture\",\"2150815\":\"Embedded operating systems\",\"22533699\":\"Symbian OS\",\"22280474\":\"Mobile operating systems\"},\"type\":1,\"senseRank\":1,\"variety\":0.6566037735849057,\"depth\":0.6470588235294117},{\"id\":25970423,\"positions\":[[2639,2644],[2771,2775],[2864,2868]],\"name\":\"IPad\",\"score\":0.8,\"contentMatch\":1,\"categories\":{\"33578068\":\"Products introduced "
 		"in 2010\",\"18083009\":\"Apple personal digital assistants\",\"23475157\":\"Touchscreen portable media players\",\"30107877\":\"IPad\",\"9301031\":\"Apple Inc. hardware\",\"27765345\":\"IOS (Apple)\",\"26588084\":\"Tablet computers\"},\"type\":1,\"senseRank\":1,\"variety\":0.49056603773584906,\"depth\":0.5882352941176471},{\"id\":18839,\"positions\":[[1945,1950],[2204,2209]],\"name\":\"Music\",\"score\":0.7,\"contentMatch\":1,\"categories\":{\"991222\":\"Performing arts\",\"693016\":\"Entertainment\",\"691484\":\"Music\"},\"type\":1,\"senseRank\":1,\"variety\":0.22264150943396221,\"depth\":0.7058823529411764}],\"media\":[{\"pixelHeight\":350,\"link\":\"http://www.onlinemba.com/wp-content/uploads/2013/02/apple-innovates-invert-350x350.png\",\"primary\":\"true\",\"pixelWidth\":350,\"type\":\"image\"}]}";

-	int32_t niceness = 0;
-
-	JsonItem *ji = json.parseJsonStringIntoJsonItems ( json_input , niceness );
+	JsonItem *ji = json.parseJsonStringIntoJsonItems(json_input , 0);
 	ASSERT_TRUE(ji);

 	EXPECT_EQ(JT_OBJECT, ji->m_type);
@ -23,3 +22,47 @@ TEST(JsonTest, ParseValid) {

 	json.reset();
 }
+
+TEST(JsonTest, ParseInvalid) { // malformed JSON must yield no parsed items
+	const char *json_inputs[] = {
+	    "\"too small\"", // bare quoted string with no enclosing object
+	    "\"categories\":[\"shop\"" // field without enclosing braces; array is never closed
+	};
+
+	size_t len = sizeof(json_inputs) / sizeof(json_inputs[0]); // number of inputs
+	for (size_t i = 0; i < len; i++) {
+		Json jp; // fresh parser for every input
+		jp.parseJsonStringIntoJsonItems(json_inputs[i], 0); // second argument is the niceness
+		JsonItem *ji = jp.getFirstItem();
+		ASSERT_FALSE(ji); // fatal: the test stops at the first input that produced an item
+	}
+}
+
+TEST(JsonTest, EncodeValid) { // control characters must come out as JSON escapes
+	const char *input_strs[] = {
+	    "hello\tworld", // contains a literal tab
+	    "apple\norange" // contains a literal newline
+	};
+
+	const char *expected_encoded[] = { // index-aligned with input_strs
+	    "hello\\tworld", // backslash followed by 't'
+	    "apple\\norange" // backslash followed by 'n'
+	};
+
+	ASSERT_EQ(sizeof(input_strs), sizeof(expected_encoded)); // both tables must hold the same number of entries
+
+	size_t len = sizeof(input_strs) / sizeof(input_strs[0]);
+	for (size_t i = 0; i < len; i++) {
+		SafeBuf safe_buf; // fresh buffer per input; the encoder appends to existing content
+		safe_buf.jsonEncode(input_strs[i]);
+		EXPECT_STREQ(expected_encoded[i], safe_buf.getBufStart()); // non-fatal: remaining inputs still run
+	}
+}
+
+TEST(JsonTest, EncodeInvalid) {
+	SafeBuf safe_buf;
+	safe_buf.jsonEncode("ÿ³Qtñw \fpUÈ(ÉÍ±ãââ² 2l òS*í¸¸ 1 $Ìe£6ÓFb `+Ð");
+	log("json: %s", safe_buf.getBufStart());
+
+	/// @todo validate that invalid utf-8 characters are stripped
+}
--- a/test/unit/UrlTest.cpp
+++ b/test/unit/UrlTest.cpp
@ -49,15 +49,14 @@ TEST(UrlTest, SetNonAsciiValid) {
 	    "http://xn--kjvp61d69f6wc3zf.jp/"
 	};

-	uint32_t len = sizeof(input_urls) / sizeof(input_urls[0]);
-	for (uint32_t i = 0; i < len; i++) {
+	ASSERT_EQ(sizeof(input_urls), sizeof(expected_normalized));
+
+	size_t len = sizeof(input_urls) / sizeof(input_urls[0]);
+	for (size_t i = 0; i < len; i++) {
 		Url url;
 		url.set(input_urls[i], strlen(input_urls[i]));

 		EXPECT_STREQ(expected_normalized[i], (const char*)url.getUrl());
-
-		//StackBuf(sb);
-		//EXPECT_STREQ(input_urls[i], Url::getDisplayUrl(url.getUrl(), &sb));
 	}
 }

@ -74,15 +73,13 @@ TEST(UrlTest, SetNonAsciiInValid) {
 		"http://undocs.org/ru/A/C.3/68/%0BSR.48"
 	};

-	//StackBuf(sb);
-	uint32_t len = sizeof(input_urls) / sizeof(input_urls[0]);
-	for (uint32_t i = 0; i < len; i++) {
+	ASSERT_EQ(sizeof(input_urls), sizeof(expected_normalized));
+
+	size_t len = sizeof(input_urls) / sizeof(input_urls[0]);
+	for (size_t i = 0; i < len; i++) {
 		Url url;
 		url.set(input_urls[i], strlen(input_urls[i]));

 		EXPECT_STREQ(expected_normalized[i], (const char*)url.getUrl());
-
-		//StackBuf(sb);
-		//EXPECT_STREQ(input_urls[i], Url::getDisplayUrl(url.getUrl(), &sb));
 	}
 }
--- a/test_unicode.cpp
+++ b/test_unicode.cpp
@ -1,168 +0,0 @@
-#include "gb-include.h"
-
-#include "Unicode.h"
-#include "Words.h"
-//#include "Tokens.h"
-#include <sys/time.h>
-
-
-int32_t elapsed_usec(const timeval* tv1, const timeval *tv2)
-{
-	int32_t sec_elapsed = (tv2->tv_sec - tv1->tv_sec);
-	int32_t usec_elapsed = tv2->tv_usec - tv1->tv_usec;
-	if (usec_elapsed<0){
-		usec_elapsed += 1000000;
-		sec_elapsed -=1;
-	}
-	usec_elapsed += sec_elapsed*1000000;
-	return usec_elapsed;
-}
-
-// Read unicode from a file and parse into words
-int main(int argc, char**argv)
-{
-	if (argc < 2){
-		fprintf(stderr, "Usage: %s filename ...\n", argv[0]);
-		exit(1);
-	}
-	init_unicode();
-	if ( ! hashinit() ) {
-		log("db: Failed to init hashtable." ); return 1; }
-	// . hashinit() calls srand() w/ a fixed number
-	// . let's mix it up again
-	srand ( time(NULL) );
-
-	int i;
-	for (i=1;i<argc;i++){
-		char * filename = argv[i];
-		fprintf(stderr, "Reading \"%s\"\n", filename);
-		FILE *fp = fopen(filename,"r");
-		if (!fp){
-			fprintf(stderr, "Error: could not open file \"%s\"\n", 
-				filename);
-			continue;
-		}
-		// Get File size
-		size_t file_size;
-		fseek(fp, 0L, SEEK_END);
-		file_size = (size_t)ftell(fp);
-		fseek(fp, 0L, SEEK_SET);
-		
-		char *file_buf = (char*)malloc(file_size+1);
-		char *text_buf = (char*)malloc(file_size+1);
-		size_t nread = fread(file_buf, (size_t)1,file_size, fp);
-		fclose(fp);
-
-		if (nread != file_size){
-			fprintf(stderr, "Warning: wanted %d chars, but read %d\n",
-				file_size, nread);
-		}
-		file_buf[nread] = '\0';
-
-		//utf8_parse_buf(file_buf);
-		Xml xml;
-		xml.set(file_buf,nread,false, false);
-
-		struct timeval tv1, tv2;
-		struct timezone tz1, tz2;
-
-		int foo;
-		
-
-		
-		// Extract text from (x)html
-		int32_t textlen = xml.getText(text_buf, 
-					   nread, 
-					   0,
-					   99999999,
-					   false,
-					   true,
-					   true,
-					   true,
-					   false);
-#define NUM_RUNS 1
-		///////////////////////////////////////
-		// Parse buffer the old way first for baseline comparision
-		Words words;
-		gettimeofday(&tv1, &tz1);
-		// just tokenize words
-		for(foo=0;foo<NUM_RUNS;foo++){
-			words.set(false, text_buf, TITLEREC_CURRENT_VERSION,
-				  false);
-		}
-		gettimeofday(&tv2, &tz2);
-		int32_t usec_elapsed = elapsed_usec(&tv1, &tv2);
-
-		printf("\nDocument parsed (iso-8851-1): %"INT32" usec (%"INT32" words)\n", 
-		       usec_elapsed,
-		       words.getNumWords());
-		
-		///////////////////////////////////////
-		// Parse buffer the new way 
-
-		Tokens tokens;
-		gettimeofday(&tv1, &tz1);
-		// just tokenize words
-		for(foo=0;foo<NUM_RUNS;foo++){
-			tokens.set(text_buf, false);
-		}
-		//int32_t count = utf8_count_words(file_buf);
-		gettimeofday(&tv2, &tz2);
-		usec_elapsed = elapsed_usec(&tv1, &tv2);
-
-		printf("\nDocument parsed (Unicode): %"INT32" usec (%"INT32" words)\n", 
-		       usec_elapsed,
-		       tokens.getNumTokens());
-		int32_t max_words = words.getNumWords();
-		if (tokens.getNumTokens() > max_words)
-			max_words = tokens.getNumTokens();
-		//
-		// Print tokenization side by side
-		for (foo=0;foo<max_words;foo++){
-			printf("%5d: ", foo);
-			if (foo<words.getNumWords()){
-				int n;
-				char *s;
-				s = words.getWord(foo);
-				for(n=0;n<words.getWordLen(foo);n++){
-					unsigned char c = s[n];
-					if (c == '\n') 
-						printf("[\\n]");
-					else if ((c>=0x20) && ((unsigned)c<=0x7f)){
-						//putchar(c);
-						printf("%4c", (unsigned char)c);
-					}
-					else{
-						printf("<%02lX>", (u_int32_t)c);
-					}
-				}
-				for(n=words.getWordLen(foo);n<15;n++)
-					printf("    ");
-			}
-			else{
-				printf("%60s", "");
-			}
-			
-			printf(" | ");
-			if (foo<tokens.getNumTokens()){
-				char *s;
-				s = tokens.getToken(foo);
-				char *pp;
-				for(pp=s;(pp-s)<tokens.getTokenLen(foo);){
-					u_int32_t c = utf8_read(pp,&pp);
-					if (c == (u_int32_t)'\n') 
-						printf("\\n");
-					else
-						utf8_putchar(c);
-				}
-			}
-			putchar('\n');
-
-			
-		}
-	}
-	fprintf(stderr, "Done\n");
-}
-
-
-