trying to bring back dmoz integration.

This commit is contained in:
mwells 2013-10-02 22:34:21 -06:00
parent 91b8921b9e
commit 6c2c9f7774
21 changed files with 1069 additions and 189 deletions

@ -51,8 +51,8 @@ bool Catdb::init ( ) {
// . initialize our own internal rdb
// . i no longer use cache so changes to tagdb are instant
// . we still use page cache however, which is good enough!
if ( this == &g_catdb )
return m_rdb.init ( g_hostdb.m_dir ,
//if ( this == &g_catdb )
if ( ! m_rdb.init ( g_hostdb.m_dir ,
"catdb" ,
true , // dedup same keys?
-1 , // fixed record size
@ -72,8 +72,14 @@ bool Catdb::init ( ) {
false,
12,
false,
true ); // is collectionless?
return true;
true )) // is collectionless?
return false;
// normally Collectiondb.addColl() will call Rdb::addColl() which
// will init the CollectionRec::m_rdbBase, which is what
// Rdb::getBase(collnum_t) will return. however, for collectionless
// rdb databases we set Rdb::m_collectionlessBase special here.
return m_rdb.addColl ( NULL );
}
bool Catdb::init2 ( long treeMem ) {
@ -119,7 +125,7 @@ bool Catdb::verify ( char *coll ) {
g_threads.disableThreads();
Msg5 msg5;
Msg5 msg5b;
//Msg5 msg5b;
RdbList list;
key_t startKey;
key_t endKey;
@ -128,7 +134,7 @@ bool Catdb::verify ( char *coll ) {
//long minRecSizes = 64000;
if ( ! msg5.getList ( RDB_CATDB ,
coll ,
"",//coll ,
&list ,
startKey ,
endKey ,
@ -147,7 +153,7 @@ bool Catdb::verify ( char *coll ) {
-1 ,
true ,
-1LL ,
&msg5b ,
NULL,//&msg5b ,
true )) {
g_threads.enableThreads();
return log("db: HEY! it did not block");
@ -309,6 +315,19 @@ void Catdb::listSearch ( RdbList *list,
// for small lists, just loop through the list
if (list->getListSize() < 16*1024) {
while ( ! list->isExhausted() ) {
// for debug!
/*
CatRec crec;
crec.set ( NULL,
list->getCurrentData(),
list->getCurrentDataSize(),
false);
log("catdb: caturl=%s #catid=%li version=%li"
,crec.m_url
,(long)crec.m_numCatids
,(long)crec.m_version
);
*/
// check the current key
if ( list->getCurrentKey() != exactKey ) {
// miss, next

@ -1011,13 +1011,17 @@ errEnd:
return false;
}
// generate sub categories for a given catid
// . generate sub categories for a given catid
// . store list of SubCategories into "subCatBuf" return # stored
long Categories::generateSubCats ( long catid,
SubCategory *subCats,
char **catBuffer,
long *catBufferSize,
long *catBufferLen,
bool allowRealloc ) {
SafeBuf *subCatBuf
//SubCategory *subCats,
//char **catBuffer,
//long *catBufferSize,
//long *catBufferLen,
//bool allowRealloc
) {
long catIndex;
unsigned long fileOffset;
unsigned long n;
@ -1029,15 +1033,22 @@ long Categories::generateSubCats ( long catid,
long prefixLen;
long nameStart;
long nameLen;
long catp = 0;
long catBufferInc = *catBufferSize;
// lookup the index for this catid
long need ;
SubCategory *cat;
char *p ;
//long catp = 0;
//long catBufferInc = *catBufferSize;
// . lookup the index for this catid
// . binary step, guessing to approximate place
// and then scanning from there
catIndex = getIndexFromId(catid);
if (catIndex < 0)
goto errEnd;
// get the file offset
fileOffset = m_cats[catIndex].m_structureOffset;
// open the structure file
// cat/content.rdf.u8 in utf8
char filename[512];
sprintf(filename, "%scat/%s", g_hostdb.m_dir, RDFSTRUCTURE_FILE);
//m_rdfStream.clear();
@ -1066,12 +1077,16 @@ long Categories::generateSubCats ( long catid,
log("cat: Error Reading Structure Offset");
goto errEnd;
}
// point to the buffer we just read with m_rdfPtr
m_rdfPtr = m_rdfBuffer;
m_rdfEnd = &m_rdfBuffer[n];
m_currOffset = fileOffset;
// parse tags for the sub categories or until we hit /Topic
nextTag:
// . this increments m_rdfPtr until it points to the beginning of a tag
// . it may end up reading another chunk from disk
// . it memcopies the name of the tag it points to into a buffer
if (rdfNextTag() < 0)
goto gotSubCats;
// check for /Topic
@ -1173,37 +1188,36 @@ nextTag:
break;
}
// . fill the next sub category
if (catp + prefixLen + nameLen >= *catBufferSize) {
if (!allowRealloc)
goto gotSubCats;
// realloc the buffer
char *re_catBuffer = (char*)mrealloc ( *catBuffer,
*catBufferSize,
*catBufferSize+catBufferInc,
"Categories" );
if (!re_catBuffer) {
log ( "Could not allocate %li bytes for catBuffer",
*catBufferSize+catBufferInc );
g_errno = ENOMEM;
goto errEnd;
}
*catBuffer = re_catBuffer;
*catBufferSize += catBufferInc;
}
// fill the prefix and name in the buffer and subcat
// . fill the prefix and name in the buffer and subcat
need = sizeof(SubCategory) + prefixLen + 1 + nameLen + 1;
if ( ! subCatBuf->reserve(need) ) goto errEnd;
cat = (SubCategory *)(subCatBuf->getBuf());
cat->m_prefixLen = prefixLen;
cat->m_nameLen = nameLen;
cat->m_type = currType;
p = cat->m_buf;
memcpy ( p , catStr + prefixStart , prefixLen );
p += prefixLen;
*p++ = '\0';
memcpy ( p , catStr + nameStart , nameLen );
p += nameLen;
*p++ = '\0';
/*
subCats[numSubCats].m_prefixOffset = catp;
subCats[numSubCats].m_prefixLen = prefixLen;
if (prefixLen > 0) {
memcpy(&((*catBuffer)[catp]), &catStr[prefixStart], prefixLen);
catp += prefixLen;
}
subCats[numSubCats].m_nameOffset = catp;
subCats[numSubCats].m_nameOffset = catBuf->length();//catp;
subCats[numSubCats].m_nameLen = nameLen;
if (nameLen > 0) {
memcpy(&((*catBuffer)[catp]), &catStr[nameStart], nameLen);
catp += nameLen;
}
subCats[numSubCats].m_type = currType;
*/
// next sub cat
numSubCats++;
if (numSubCats >= MAX_SUB_CATS) {
@ -1214,14 +1228,14 @@ nextTag:
// next tag
goto nextTag;
gotSubCats:
*catBufferLen = catp;
//*catBufferLen = catp;
//m_rdfStream.close();
//m_rdfStream.clear();
close(m_rdfStream);
return numSubCats;
errEnd:
*catBufferLen = 0;
//*catBufferLen = 0;
//m_rdfStream.close();
//m_rdfStream.clear();
close(m_rdfStream);

@ -61,11 +61,15 @@ struct CategoryHash {
};
struct SubCategory {
long m_prefixOffset;
//long m_prefixOffset;
long m_prefixLen;
long m_nameOffset;
//long m_nameOffset;
long m_nameLen;
char m_type;
long getRecSize () { return sizeof(SubCategory)+m_prefixLen+m_nameLen+2;};
char *getPrefix() { return m_buf; };
char *getName () { return m_buf+m_prefixLen+1;};
char m_buf[0];
};
class Categories {
@ -153,13 +157,10 @@ public:
// normalize a url string
long fixUrl ( char *url, long urlLen );
// generate sub categories for a given catid
long generateSubCats ( long catid,
SubCategory *subCats,
char **catBuffer,
long *catBufferSize,
long *catBufferLen,
bool allowRealloc = true );
// . generate sub categories for a given catid
// . store list of SubCategories into "subCatBuf" return # stored
// . hits disk without using threads... so kinda sucks...
long generateSubCats ( long catid, SafeBuf *subCatBuf );
long getNumUrlsFromIndex ( long catIndex ) {
return m_cats[catIndex].m_numUrls; };

@ -2308,10 +2308,9 @@ uint32_t Hostdb::getGroupId ( char rdbId,void *k,bool split ) {
unsigned long long d = g_revdb.getDocId( (key_t *)k );
return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
}
//else if ( rdbId == RDB_CATDB || rdbId == RDB2_CATDB2 ) {
// return m_map [(*(uint16_t *)((char *)k + 10))>>3];
//}
else if ( rdbId == RDB_CATDB || rdbId == RDB2_CATDB2 ) {
return m_map [(*(uint16_t *)((char *)k + 10))>>3];
}
// core -- must be provided
char *xx = NULL; *xx = 0;
//groupId=key.n1 & g_hostdb.m_groupMask;

@ -39,7 +39,7 @@ OBJS = Tfndb.o UdpSlot.o \
Parms.o Pages.o Msg28.o Msg30.o \
Unicode.o iana_charset.o Iso8859.o \
SearchInput.o \
Categories.o Msg2a.o PageCatdb.o PageDirectory.o Msg2b.o \
Categories.o Msg2a.o PageCatdb.o PageDirectory.o \
SafeBuf.o Datedb.o \
UCNormalizer.o UCPropTable.o UnicodeProperties.o \
Pops.o Title.o Pos.o LangList.o \

@ -148,6 +148,10 @@ bool Msg40::getResults ( SearchInput *si ,
// we need this info for caching as well
//m_numGigabitInfos = 0;
//just getfrom searchinput
//.... m_catId = hr->getLong("catid",0);m_si->m_catId;
m_postQueryRerank.set1( this, si );
// get the collection rec
@ -680,6 +684,20 @@ bool Msg40::gotDocIds ( ) {
// if ( ! m_msg1a.generateReferences(m_si,(void*)this,didTaskWrapper) )
// m_tasksRemaining++;
//
// call Msg2b to generate directory
//
// why is this here? it does not depend on the docids. (mdw 9/25/13)
// dissect it and fix it!!
//
//if ( m_si->m_catId &&
// ! m_msg2b.generateDirectory ( m_si->m_catId,
// (void*)this,
// didTaskWrapper ) )
// m_tasksRemaining++;
return launchMsg20s ( false );
}
@ -878,7 +896,6 @@ bool Msg40::reallocMsg20Buf ( ) {
return true;
}
/*
void didTaskWrapper ( void* state ) {
Msg40 *THIS = (Msg40 *) state;
// one less task
@ -888,7 +905,6 @@ void didTaskWrapper ( void* state ) {
// we are done, call the callback
THIS->m_callback ( THIS->m_state );
}
*/
bool Msg40::launchMsg20s ( bool recalled ) {
@ -2128,7 +2144,7 @@ long Msg40::getStoredSize ( ) {
//size += m_msg24.getStoredSize ( );
//size += m_msg1a.getStoredSize ( );
// cache msg2b if we have it
size += m_msg2b.getStoredSize();
//size += m_msg2b.getStoredSize();
return size;
}
@ -2203,9 +2219,9 @@ long Msg40::serialize ( char *buf , long bufLen ) {
//if ( y == -1 ) return -1;
//p += y;
long z = m_msg2b.serialize (p, pend - p);
if ( z == -1 ) return -1;
p += z;
//long z = m_msg2b.serialize (p, pend - p);
//if ( z == -1 ) return -1;
//p += z;
if ( m_r.m_debug )
log("query: msg40 serialize nd=%li "
@ -2258,9 +2274,9 @@ long Msg40::deserialize ( char *buf , long bufSize ) {
}
// msg2b
long z = m_msg2b.deserialize ( p , pend - p );
if ( z == -1 ) return -1;
p += z;
//long z = m_msg2b.deserialize ( p , pend - p );
//if ( z == -1 ) return -1;
//p += z;
// return bytes read
return p - buf;

@ -14,7 +14,7 @@
#include "Msg39.h" // getTermFreqs()
#include "Msg20.h" // for getting summary from docId
#include "Msg17.h" // a distributed cache of serialized/compressed Msg40s
#include "Msg2b.h" // for generating directories
//#include "Msg2b.h" // for generating directories
#include "IndexReadInfo.h" // STAGE0,...
#include "Msg3a.h"
#include "PostQueryRerank.h"
@ -302,7 +302,7 @@ class Msg40 {
long m_docsToScanForTopics;
// Msg2b for generating a directory
Msg2b m_msg2b;
//Msg2b m_msg2b;
PostQueryRerank m_postQueryRerank;

@ -50,8 +50,8 @@ bool Msg8b::getCatRec ( Url *url ,
m_state = state;
m_callback = callback;
m_url = url;
m_coll = coll;
m_collLen = collLen;
//m_coll = coll;
//m_collLen = collLen;
m_cr = cr;
m_niceness = niceness;
@ -68,10 +68,10 @@ bool Msg8b::getCatRec ( Url *url ,
//m_coll = g_conf.m_dirColl;
//m_collLen = gbstrlen(m_coll);
// catdb uses a dummy collection now, should not be looked at
m_coll = "catdb";
m_collLen = 5;
//m_coll = "catdb";
//m_collLen = 5;
m_collnum = g_collectiondb.getCollnum ( m_coll , m_collLen );
//m_collnum = g_collectiondb.getCollnum ( m_coll , m_collLen );
// . first, try it by canonical domain name
// . if that finds no matches, then try it by ip domain
@ -89,7 +89,7 @@ bool Msg8b::getCatRec ( Url *url ,
//
if ( g_hostdb.m_groupId != m_groupId ) {
// coll, url, niceness(1), rdbid(1), useCanonicalName(1)
long requestSize = m_collLen + m_url->getUrlLen() + 4 + 4;
long requestSize = m_url->getUrlLen() + 4 + 3;
// make the request
char *p = m_request;
*(long *)p = m_url->getIp() ; p+=4;
@ -97,10 +97,10 @@ bool Msg8b::getCatRec ( Url *url ,
*p = (char)niceness ; p++;
*p = (char)useCanonicalName; p++;
// coll
memcpy(p, m_coll, m_collLen);
p += m_collLen;
*p = '\0';
p++;
//memcpy(p, m_coll, m_collLen);
//p += m_collLen;
//*p = '\0';
//p++;
// url
memcpy(p, m_url->getUrl(), m_url->getUrlLen());
p += m_url->getUrlLen();
@ -186,7 +186,7 @@ bool Msg8b::getCatRec ( Url *url ,
0 , // max cached age in seconds (60)
false , // add net recv'd list to cache?
RDB_CATDB, // specifies the rdb, 1 = tagdb
m_coll ,
"",//NULL,//m_coll ,
//&m_list ,
m_list ,
startKey ,
@ -545,7 +545,7 @@ bool Msg8b::gotList ( ) {
char *rec;
//rec = g_catdb->getRec ( &m_list , m_url , &recSize );
rec = g_catdb.getRec(m_list,m_url,&recSize,m_coll,m_collLen);
rec = g_catdb.getRec(m_list,m_url,&recSize,NULL,0);//m_coll,m_collLen);
// if record found then set it and also set gotIt to true
if ( rec ) {
@ -588,8 +588,8 @@ void Msg8b::getIndirectCatids ( ) {
matchRecs,
matchRecSizes,
MAX_IND_CATIDS,
m_coll,
m_collLen);
NULL,//m_coll,
0);//m_collLen);
// parse out the catids from the matches
m_cr->m_numIndCatids = 0;
for ( long i = 0; i < numMatches; i++ ) {

@ -68,11 +68,11 @@ class Msg8b {
void cleanSlot ( );
// some specified input
char *m_coll;
long m_collLen;
//char *m_coll;
//long m_collLen;
Url *m_url;
collnum_t m_collnum;
//collnum_t m_collnum;
void (*m_callback ) ( void *state );//, CatRec *rec );
void *m_state; // ptr to caller's private state data

@ -105,8 +105,8 @@ bool sendPageCatdb ( TcpSocket *s , HttpRequest *r ) {
st->m_url.set(url, urlLen);
// call msg8b to lookup in catdb
if (!st->m_msg8b.getCatRec ( &st->m_url,
st->m_coll,
st->m_collLen,
NULL,//st->m_coll,
0,//st->m_collLen,
true,
1,
&st->m_catRec,

@ -263,7 +263,7 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
char *qstr = hr->getString("q",&qlen,"",NULL);
// . crap! also gotta encode apostrophe since "var url='..."
// . true = encodeApostrophes?
sb.urlEncode ( qstr , true );
sb.urlEncode2 ( qstr , true );
// propagate "admin" if set
long admin = hr->getLong("admin",-1);
if ( admin != -1 ) sb.safePrintf("&admin=%li",admin);
@ -272,7 +272,7 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
char *sites = hr->getString("sites",&sitesLen,NULL);
if ( sites ) {
sb.safePrintf("&sites=");
sb.urlEncode ( sites,true);
sb.urlEncode2 ( sites,true);
}
// propagate "debug" if set
long debug = hr->getLong("debug",0);
@ -744,6 +744,8 @@ static bool printGigabit ( State0 *st,
return true;
}
bool printDMOZSubTopics ( SafeBuf& sb, long catId, State0 *st, bool inXml ) ;
// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
@ -805,6 +807,70 @@ bool gotResults ( void *state ) {
return sendReply(st,NULL);
}
// grab the query
char *q = msg40->getQuery();
long qlen = msg40->getQueryLen();
bool xml = si->m_xml;
// display it?
if ( si->m_catId >= 0 ) {
long dirIndex = g_categories->getIndexFromId(si->m_catId);
// dirIndex = g_categories->getIndexFromId(si->m_cat_sdir);
if (dirIndex < 0) dirIndex = 0;
// display the directory bread crumb
//if( (si->m_cat_dirId > 0 && si->m_isAdmin && !si->m_isFriend)
// || (si->m_cat_sdir > 0 && si->m_cat_sdirt != 0) )
// sb.safePrintf("<br><br>");
// shortcut. rtl=Right To Left language format.
bool rtl = g_categories->isIdRTL ( si->m_catId ) ;
//st->m_isRTL = rtl;
if ( ! xml ) {
sb.safePrintf("\n<font size=4><b>");
if ( rtl ) sb.safePrintf("<span dir=ltr>");
sb.safePrintf("<a href=\"/\">Top</a>: ");
}
// put the crumb in xml?
if ( xml )
sb.safePrintf("<breadcrumb><![CDATA[");
// display the breadcrumb in xml or html?
g_categories->printPathCrumbFromIndex(&sb,dirIndex,rtl);
sb.safePrintf("]]></breadcrumb>\n" );
// print the num
if ( ! xml ) {
sb.safePrintf("</b>&nbsp&nbsp<i>");
// how many urls/entries in this topic?
long nu =g_categories->getNumUrlsFromIndex(dirIndex);
if ( rtl )
sb.safePrintf("<span dir=ltr>(%li)</span>",nu);
else
sb.safePrintf("(%li)", nu);
sb.safePrintf("</i></font><br><br>\n");
}
}
///////////
//
// show DMOZ subcategories if doing either a
// "gbpdcat:<catid> |" (Search restricted to category)
// "gbdcat:<catid>" (DMOZ urls in that topic, c=dmoz3)
//
// The search gbdcat: results should be sorted by siterank i guess
// since it is only search a single term: gbdcat:<catid> so we can
// put our stars back onto that and should be sorted by them.
//
///////////
if ( si->m_catId >= 0 )
// print the subtopics in this topic. show as links above
// the search results
printDMOZSubTopics ( sb, si->m_catId , st, xml );
// save how many docs are in it
long long docsInColl = -1;
//RdbBase *base = getRdbBase ( RDB_CHECKSUMDB , si->m_coll );
@ -854,9 +920,6 @@ bool gotResults ( void *state ) {
// numResults may be more than we requested now!
long n = msg40->getDocsWanted();
if ( n > numResults ) n = numResults;
// grab the query
char *q = msg40->getQuery();
long qlen = msg40->getQueryLen();
// . make the query class here for highlighting
// . keepAllSingles means to convert all individual words into
// QueryTerms even if they're in quotes or in a connection (cd-rom).
@ -1204,7 +1267,7 @@ bool gotResults ( void *state ) {
// print the word
char *t = qw->m_word;
long tlen = qw->m_wordLen;
sb.utf8Encode ( t , tlen );
sb.utf8Encode2 ( t , tlen );
sb.safePrintf (" ");
}
// print tail if we had ignored terms
@ -1264,7 +1327,7 @@ bool gotResults ( void *state ) {
qe2 );
// close it up
sb.safePrintf ("\"><i><b>");
sb.utf8Encode(st->m_spell, len);
sb.utf8Encode2(st->m_spell, len);
// then finish it off
sb.safePrintf ("</b></i></a></font>\n<br><br>\n");
}
@ -1682,6 +1745,60 @@ bool printInlinkText ( SafeBuf &sb , Msg20Reply *mr , SearchInput *si ,
return true;
}
//
// . print a dmoz topic for the given numeric catid UNDER search result
// . print "Search in Category" link as well
//
static bool printDMOZCategoryUnderResult ( SafeBuf &sb ,
SearchInput *si,
long catid ,
State0 *st ) {
uint8_t queryLanguage = langUnknown;
// Don't print category if not in native language category
// Note that this only trims out "World" cats, not all
// of them. Some of them may still sneak in.
if(si->m_langHint)
queryLanguage = si->m_langHint;
if(queryLanguage != langUnknown) {
char tmpbuf[1024];
SafeBuf langsb(tmpbuf, 1024);
g_categories->printPathFromId(&langsb, catid, false);
char *ptr = langsb.getBufStart();
uint8_t lang = g_langId.findLangFromDMOZTopic(ptr + 7);
if(!strncmp("World: ", ptr, 6) &&
lang != langUnknown &&
lang != queryLanguage)
// do not print it if not in our language
return true;
}
//////
//
// print a link to apply your query to this DMOZ category
//
//////
sb.safePrintf("<a href=\"/search?s=0&q=gbpdcat%%3A%li",catid);
sb.urlEncode("|",1);
sb.urlEncode(si->m_sbuf1.getBufStart(),si->m_sbuf1.length());
sb.safePrintf("\">Search in Category</a>: ");
// setup the host of the url
//if ( dmozHost )
// sb.safePrintf("<a href=\"http://%s/", dmozHost );
//else
sb.safePrintf("<a href=\"/");
// print link
g_categories->printPathFromId(&sb, catid, true,si->m_isRTL);
sb.safePrintf("/\">");
// print the name of the dmoz category
sb.safePrintf("<font color=#c62939>");
g_categories->printPathFromId(&sb, catid, false,si->m_isRTL);
sb.safePrintf("</font></a><br>");
//++tr.brCount;
return true;
}
// use this for xml as well as html
static int printResult ( SafeBuf &sb,
State0 *st,
@ -1806,6 +1923,13 @@ static int printResult ( SafeBuf &sb,
if ( mr->m_isBanned && ! si->m_xml )
sb.safePrintf("<font color=red><b>BANNED</b></font> ");
///////
//
// PRINT THE TITLE
//
///////
// the a href tag
if ( ! si->m_xml ) {
sb.safePrintf ( "<a href=" );
@ -1824,6 +1948,41 @@ static int printResult ( SafeBuf &sb,
long strLen = mr->size_tbuf - 1;// msg40->getTitleLen(i);
if ( ! str || strLen < 0 ) strLen = 0;
/////
//
// are we printing a dmoz category page?
// get the appropriate dmoz title/summary to use since the same
// url can exist in multiple topics (catIds) with different
// titles summaries.
//
/////
char *dmozSummary = NULL;
// TODO: just get the catid from httprequest directly?
if ( si->m_catId > 0 ) { // si->m_cat_dirId > 0) {
// . get the dmoz title and summary
// . if empty then just a bunch of \0s, except for catIds
Msg20Reply *mr = m20->getReply();
char *dmozTitle = mr->ptr_dmozTitles;
dmozSummary = mr->ptr_dmozSumms;
char *dmozAnchor = mr->ptr_dmozAnchors;
long *catIds = mr->ptr_catIds;
long numCats = mr->size_catIds / 4;
// loop through looking for the right ID
for (long i = 0; i < numCats ; i++ ) {
// assign shit if we match the dmoz cat we are showing
if ( catIds[i] == si->m_catId) break;
dmozTitle +=gbstrlen(dmozTitle)+1;
dmozSummary +=gbstrlen(dmozSummary)+1;
dmozAnchor += gbstrlen(dmozAnchor)+1;
}
// now make the title the dmoz title
str = dmozTitle;
strLen = gbstrlen(str);
}
long hlen;
//copy all summary and title excerpts for this result into here
char tt[1024*32];
@ -1872,7 +2031,11 @@ static int printResult ( SafeBuf &sb,
if ( ! si->m_xml ) sb.safePrintf ("</a><br>\n" ) ;
/////
//
// print content type after title
//
/////
unsigned char ctype = mr->m_contentType;
if ( ctype > 2 && ctype <= 13 ) {
char *cs = g_contentTypeStrings[ctype];
@ -1887,6 +2050,12 @@ static int printResult ( SafeBuf &sb,
sb.safePrintf(" (%s) &nbsp;" ,cs);
}
////////////
//
// print the summary
//
////////////
// . then the summary
// . "s" is a string of null terminated strings
char *send;
@ -1897,22 +2066,56 @@ static int printResult ( SafeBuf &sb,
if ( strLen < 0 ) strLen = 0;
send = str + strLen;
// dmoz summary might override if we are showing a dmoz topic page
if ( dmozSummary ) {
str = dmozSummary;
strLen = gbstrlen(dmozSummary);
}
if ( si->m_xml ) sb.safePrintf("\t\t<sum><![CDATA[");
// print summary out
//sb.safeMemcpy ( str , strLen );
sb.brify ( str , strLen, 0 , cols ); // niceness = 0
// remove \0's... wtf?
//char *xend = sb.getBuf();
//char *x = xend - strLen;
//for ( ; x < xend ; x++ ) if ( ! *x ) *x = ' ';
// close xml tag
if ( si->m_xml ) sb.safePrintf("]]></sum>\n");
// new line if not xml
else if ( strLen ) sb.safePrintf("<br>\n");
////////////
//
// . print DMOZ topics under the summary
// . will print the "Search in Category" link too
//
////////////
//Msg20Reply *mr = m20->getMsg20Reply();
long nCatIds = mr->getNumCatIds();
for (long i = 0; i < nCatIds; i++) {
long catid = ((long *)(mr->ptr_catIds))[i];
printDMOZCategoryUnderResult(sb,si,catid,st);
}
// skipCatsPrint:
// print the indirect category Ids
long nIndCatids = mr->size_indCatIds / 4;
//if ( !cr->m_displayIndirectDmozCategories )
// goto skipCatsPrint2;
for ( long i = 0; i < nIndCatids; i++ ) {
long catid = ((long *)(mr->ptr_indCatIds))[i];
// skip it if it's a regular category
//bool skip = false;
long d; for ( d = 0; d < nCatIds; d++) {
if ( catid == mr->ptr_catIds[i] ) break;
}
// skip if the indirect catid matched a directed catid
if ( d < nCatIds ) continue;
// otherwise print it
printDMOZCategoryUnderResult(sb,si,catid,st);
}
////////////
//
// print the URL
//
////////////
// hack off the http:// if any for displaying it on screen
if ( urlLen > 8 && strncmp ( url , "http://" , 7 )==0 ) {
url += 7; urlLen -= 7; }
@ -1928,7 +2131,6 @@ static int printResult ( SafeBuf &sb,
// so hack off the last slash
if ( j < 0 ) urlLen--;
}
if ( ! si->m_xml ) {
sb.safePrintf ("<font color=gray>" );
//sb.htmlEncode ( url , gbstrlen(url) , false );
@ -1937,7 +2139,6 @@ static int printResult ( SafeBuf &sb,
// turn off the color
sb.safePrintf ( "</font>\n" );
}
if ( si->m_xml ) {
sb.safePrintf("\t\t<url><![CDATA[");
sb.safeMemcpy ( url , urlLen );
@ -3880,3 +4081,440 @@ bool printSingleScore ( SafeBuf &sb ,
// "<br>");
return true;
}
// print the search options under a dmoz search box
bool printDirectorySearchType ( SafeBuf& sb, long sdirt ) {
// default to entire directory
if (sdirt < 1 || sdirt > 4)
sdirt = 3;
// by default search the whole thing
sb.safePrintf("<input type=\"radio\" name=\"sdirt\" value=\"3\"");
if (sdirt == 3) sb.safePrintf(" checked>");
else sb.safePrintf(">");
sb.safePrintf("Entire Directory<br>\n");
// entire category
sb.safePrintf("<input type=\"radio\" name=\"sdirt\" value=\"1\"");
if (sdirt == 1) sb.safePrintf(" checked>");
else sb.safePrintf(">");
sb.safePrintf("Entire Category<br>\n");
// base category only
sb.safePrintf("<nobr><input type=\"radio\" name=\"sdirt\" value=\"2\"");
if (sdirt == 2) sb.safePrintf(" checked>");
else sb.safePrintf(">");
sb.safePrintf("Pages in Base Category</nobr><br>\n");
// sites in base category
sb.safePrintf("<input type=\"radio\" name=\"sdirt\" value=\"7\"");
if (sdirt == 7) sb.safePrintf(" checked>");
else sb.safePrintf(">");
sb.safePrintf("Sites in Base Category<br>\n");
// sites in entire category
sb.safePrintf("<input type=\"radio\" name=\"sdirt\" value=\"6\"");
if (sdirt == 6) sb.safePrintf(" checked>");
else sb.safePrintf(">");
sb.safePrintf("Sites in Entire Category<br>\n");
// end it
return true;
}
////////
//
// . print the directory subtopics
// . show these when we are in a directory topic browsing dmoz
// . just a list of all the topics/categories
//
////////
bool printDMOZSubTopics ( SafeBuf& sb, long catId, State0 *st, bool inXml ) {
long currType;
bool first;
bool nextColumn;
long maxPerColumn;
long currInColumn;
long currIndex;
char *prefixp;
long prefixLen;
char *catName;
long catNameLen;
char encodedName[2048];
SearchInput *si = &st->m_si;
SafeBuf subCatBuf;
// stores a list of SubCategories into "subCatBuf"
long numSubCats = g_categories->generateSubCats ( si->m_catId , &subCatBuf );
// . get the subcategories for a given categoriy
// . msg2b::generateDirectory() was launched in Msg40.cpp
//long numSubCats = st->m_msg40.m_msg2b.m_numSubCats;
//SubCategory *subCats = st->m_msg40.m_msg2b.m_subCats;
//char *catBuffer = st->m_msg40.m_msg2b.m_catBuffer;
//bool showAdultOnTop = st->m_si.m_cr->m_showAdultCategoryOnTop;
// just print <hr> if no sub categories
if (inXml) {
sb.safePrintf ( "\t<directory>\n"
"\t\t<dirId>%li</dirId>\n"
"\t\t<dirName><![CDATA[",
si->m_catId);//si.m_cat_dirId );
g_categories->printPathFromId ( &sb,
si->m_catId, // st->m_si.m_cat_dirId,
true );
sb.safePrintf ( "]]></dirName>\n");
sb.safePrintf ( "\t\t<dirIsRTL>%li</dirIsRTL>\n",
(long)si->m_isRTL);
}
char *p = subCatBuf.getBufStart();
char *pend = subCatBuf.getBuf();
SubCategory *ptrs[MAX_SUB_CATS];
long count = 0;
if (numSubCats <= 0)
goto dirEnd;
// print out the cats
currType = 0;
// first make ptrs to them
for ( ; p < pend ; ) {
SubCategory *cat = (SubCategory *)p;
ptrs[count++] = cat;
p += cat->getRecSize();
}
for (long i = 0; i < count ; i++ ) {
SubCategory *cat = ptrs[i];
first = false;
catName = cat->getName();//&catBuffer[subCats[i].m_nameOffset];
catNameLen = cat->m_nameLen;//subCats[i].m_nameLen;
prefixp = cat->getPrefix();//&catBuffer[subCats[i].m_prefixOffset];
prefixLen = cat->m_prefixLen;//subCats[i].m_prefixLen;
// skip bad categories
currIndex = g_categories->getIndexFromPath(catName, catNameLen);
if (currIndex < 0)
continue;
// skip top adult category if we're supposed to
if ( !inXml &&
st->m_si.m_catId == 1 &&
si->m_familyFilter &&
g_categories->isIndexAdultStart ( currIndex ) )
continue;
// check for room
//if (p + subCats[i].m_prefixLen*2 +
// subCats[i].m_nameLen*2 +
// 512 > pend){
// goto diroverflow;
//}
// print simple xml tag for inXml
if (inXml) {
switch ( cat->m_type ) {
case SUBCAT_LETTERBAR:
sb.safePrintf ( "\t\t<letterbar><![CDATA[" );
sb.safePrintf ( "]]>" );
sb.safePrintf ( "<urlcount>%li</urlcount>",
g_categories->getNumUrlsFromIndex(
currIndex) );
sb.safePrintf ( "</letterbar>\n" );
break;
case SUBCAT_NARROW2:
sb.safePrintf ( "\t\t<narrow2><![CDATA[" );
sb.utf8Encode2 ( catName, catNameLen );
sb.safePrintf ( "]]>");
sb.safePrintf ( "<urlcount>%li</urlcount>",
g_categories->getNumUrlsFromIndex(
currIndex) );
sb.safePrintf ( "</narrow2>\n" );
break;
case SUBCAT_NARROW1:
sb.safePrintf ( "\t\t<narrow1><![CDATA[" );
sb.utf8Encode2 ( catName, catNameLen );
sb.safePrintf ( "]]>" );
sb.safePrintf ( "<urlcount>%li</urlcount>",
g_categories->getNumUrlsFromIndex(
currIndex) );
sb.safePrintf ( "</narrow1>\n" );
break;
case SUBCAT_NARROW:
sb.safePrintf ( "\t\t<narrow><![CDATA[" );
sb.utf8Encode2 ( catName, catNameLen );
sb.safePrintf ( "]]>" );
sb.safePrintf ( "<urlcount>%li</urlcount>",
g_categories->getNumUrlsFromIndex(
currIndex) );
sb.safePrintf ( "</narrow>\n" );
break;
case SUBCAT_SYMBOLIC2:
sb.safePrintf ( "\t\t<symbolic2><![CDATA[" );
sb.utf8Encode2 ( prefixp, prefixLen );
sb.safePrintf ( ":" );
sb.utf8Encode2 ( catName, catNameLen );
sb.safePrintf ( "]]>" );
sb.safePrintf ( "<urlcount>%li</urlcount>",
g_categories->getNumUrlsFromIndex(
currIndex) );
sb.safePrintf ( "</symbolic2>\n" );
break;
case SUBCAT_SYMBOLIC1:
sb.safePrintf ( "\t\t<symbolic1><![CDATA[" );
sb.utf8Encode2 ( prefixp, prefixLen );
sb.safePrintf ( ":" );
sb.utf8Encode2 ( catName, catNameLen );
sb.safePrintf ( "]]>" );
sb.safePrintf ( "<urlcount>%li</urlcount>",
g_categories->getNumUrlsFromIndex(
currIndex) );
sb.safePrintf ( "</symbolic1>\n" );
break;
case SUBCAT_SYMBOLIC:
sb.safePrintf ( "\t\t<symbolic><![CDATA[" );
sb.utf8Encode2 ( prefixp, prefixLen );
sb.safePrintf ( ":" );
sb.utf8Encode2 ( catName, catNameLen );
sb.safePrintf ( "]]>" );
sb.safePrintf ( "<urlcount>%li</urlcount>",
g_categories->getNumUrlsFromIndex(
currIndex) );
sb.safePrintf ( "</symbolic>\n" );
break;
case SUBCAT_RELATED:
sb.safePrintf ( "\t\t<related><![CDATA[" );
sb.utf8Encode2 ( catName, catNameLen );
sb.safePrintf ( "]]>" );
sb.safePrintf ( "<urlcount>%li</urlcount>",
g_categories->getNumUrlsFromIndex(
currIndex) );
sb.safePrintf ( "</related>\n" );
break;
case SUBCAT_ALTLANG:
sb.safePrintf ( "\t\t<altlang><![CDATA[" );
sb.utf8Encode2 ( prefixp, prefixLen );
sb.safePrintf ( ":" );
sb.utf8Encode2 ( catName, catNameLen );
sb.safePrintf ( "]]>" );
sb.safePrintf ( "<urlcount>%li</urlcount>",
g_categories->getNumUrlsFromIndex(
currIndex) );
sb.safePrintf ( "</altlang>\n");
break;
}
continue;
}
// print type header
if ( cat->m_type - currType >= 10) {
// end the last type
if (currType == SUBCAT_LETTERBAR)
sb.safePrintf(" ]</center>\n");
else if (currType != 0)
sb.safePrintf ( "\n</span></ul></td></tr>"
"</table>\n" );
// start the new type
switch (cat->m_type) {
case SUBCAT_LETTERBAR:
sb.safePrintf ( "<span class=\"directory\">"
"<center>[ " );
break;
case SUBCAT_NARROW2:
case SUBCAT_SYMBOLIC2:
case SUBCAT_NARROW1:
case SUBCAT_SYMBOLIC1:
case SUBCAT_NARROW:
case SUBCAT_SYMBOLIC:
sb.safePrintf("<hr>\n");
break;
case SUBCAT_RELATED:
if (currType == 0 ||
currType == SUBCAT_LETTERBAR)
sb.safePrintf("<hr>");
else
sb.safePrintf("<br>");
if (si->m_isRTL)
sb.safePrintf("<span dir=ltr>");
sb.safePrintf ( "<b>Related Categories:"
"</b>" );
if (si->m_isRTL)
sb.safePrintf("</span>");
break;
case SUBCAT_ALTLANG:
if (currType == 0 ||
currType == SUBCAT_LETTERBAR)
sb.safePrintf("<hr>");
else
sb.safePrintf("<br>");
if (si->m_isRTL)
sb.safePrintf("<span dir=ltr>");
sb.safePrintf ( "<b>This category in other"
" languages:</b>");
if (si->m_isRTL)
sb.safePrintf("</span>");
break;
}
currType = ( cat->m_type/10)*10;
first = true;
nextColumn = false;
currInColumn = 0;
if (currType == SUBCAT_LETTERBAR ||
currType == SUBCAT_RELATED)
maxPerColumn = 999;
else {
// . check how many columns we'll use for this
// type
long numInType = 1;
for (long j = i+1; j < numSubCats; j++) {
if ( ptrs[j]->m_type - currType >= 10)
break;
numInType++;
}
// column for every 5, up to 3 columns
long numColumns = numInType/5;
if ( numInType%5 > 0 ) numColumns++;
if ( currType == SUBCAT_ALTLANG &&
numColumns > 4)
numColumns = 4;
else if (numColumns > 3)
numColumns = 3;
// max number of links per column
maxPerColumn = numInType/numColumns;
if (numInType%numColumns > 0)
maxPerColumn++;
}
}
// start the sub cat
if (first) {
if (currType != SUBCAT_LETTERBAR)
sb.safePrintf ( "<table border=0>"
"<tr><td valign=top>"
"<ul><span class=\"directory\">"
"\n<li>");
}
// check for the next column
else if (nextColumn) {
sb.safePrintf ( "\n</span></ul></td><td valign=top>"
"<ul><span class=\"directory\">"
"\n<li>");
nextColumn = false;
}
// or just next link
else {
if (currType == SUBCAT_LETTERBAR)
sb.safePrintf("| ");
else
sb.safePrintf("<li>");
}
// print out the prefix as a link
//if ( p + catNameLen + 16 > pend ) {
// goto diroverflow;
//}
sb.safePrintf("<a href=\"/");
sb.utf8Encode2(catName, catNameLen);
sb.safePrintf("/\">");
// prefix...
//if ( p + prefixLen + 512 > pend ) {
// goto diroverflow;
//}
if (currType != SUBCAT_ALTLANG)
sb.safePrintf("<b>");
else {
// check for coded <b> or <strong> tags, remove
if (prefixLen >= 19 &&
strncasecmp(prefixp, "&lt;b&gt;", 9) == 0 &&
strncasecmp(prefixp + (prefixLen-10),
"&lt;/b&gt;", 10) == 0) {
prefixp += 9;
prefixLen -= 19;
}
else if (prefixLen >= 29 &&
strncasecmp(prefixp, "&lt;strong&gt;", 14) == 0 &&
strncasecmp(prefixp + (prefixLen-15),
"&lt;/strong&gt;", 15) == 0) {
prefixp += 14;
prefixLen -= 29;
}
}
if (currType == SUBCAT_RELATED) {
// print the full path
if (g_categories->isIndexRTL(currIndex))
sb.safePrintf("<span dir=ltr>");
g_categories->printPathFromIndex (
&sb,
currIndex,
false,
si->m_isRTL);
}
else {
char *encodeEnd = htmlEncode ( encodedName,
encodedName + 2047,
prefixp,
prefixp + prefixLen );
prefixp = encodedName;
prefixLen = encodeEnd - encodedName;
//if ( p + prefixLen + 512 > pend ) {
// goto diroverflow;
//}
for (long c = 0; c < prefixLen; c++) {
if (*prefixp == '_')
//*p = ' ';
sb.safePrintf(" ");
else
//*p = *prefixp;
sb.utf8Encode2(prefixp, 1);
//p++;
prefixp++;
}
}
//if ( p + 512 > pend ) {
// goto diroverflow;
//}
// end the link
if (currType != SUBCAT_ALTLANG)
sb.safePrintf("</b>");
sb.safePrintf("</a>");
// print an @ for symbolic links
if ( (cat->m_type % 10) == 1)
sb.safePrintf("@");
// print number of urls under here
if ( cat->m_type != SUBCAT_LETTERBAR) {
sb.safePrintf("&nbsp&nbsp<i>");
if (si->m_isRTL)
sb.safePrintf ( "<span dir=ltr>(%li)"
"</span></i>",
g_categories->getNumUrlsFromIndex(
currIndex) );
else
sb.safePrintf ( "(%li)</i>",
g_categories->getNumUrlsFromIndex(
currIndex) );
}
// next line/letter
if ( cat->m_type == SUBCAT_LETTERBAR) {
sb.safePrintf(" ");
continue;
}
// check for next column
currInColumn++;
if (currInColumn >= maxPerColumn) {
currInColumn = 0;
nextColumn = true;
}
}
//if ( p + 512 > pend ) {
// goto diroverflow;
//}
// end the last type
if (!inXml) {
if (currType == SUBCAT_LETTERBAR)
sb.safePrintf(" ]</center>\n");
else
sb.safePrintf("</ul></td></tr></table>\n");
}
dirEnd:
if (inXml)
sb.safePrintf("\t</directory>\n");
else {
sb.safePrintf("</span>");
sb.safePrintf("<hr><br>\n");
}
return true;
}

13
Rdb.cpp

@ -5,7 +5,7 @@
#include "Clusterdb.h"
#include "Hostdb.h"
#include "Tagdb.h"
//#include "Catdb.h"
#include "Catdb.h"
#include "Indexdb.h"
#include "Posdb.h"
#include "Cachedb.h"
@ -1340,7 +1340,7 @@ void attemptMergeAll ( int fd , void *state ) {
g_titledb.getRdb()->attemptMerge ( 1 , false , !state);
//g_tfndb.getRdb()->attemptMerge ( 1 , false , !state);
g_tagdb.getRdb()->attemptMerge ( 1 , false , !state);
//g_catdb.getRdb()->attemptMerge ( 1 , false , !state);
g_catdb.getRdb()->attemptMerge ( 1 , false , !state);
g_clusterdb.getRdb()->attemptMerge ( 1 , false , !state);
g_statsdb.getRdb()->attemptMerge ( 1 , false , !state);
g_syncdb.getRdb()->attemptMerge ( 1 , false , !state);
@ -2351,7 +2351,7 @@ Rdb *getRdbFromId ( uint8_t rdbId ) {
s_table9 [ RDB_DOLEDB ] = g_doledb.getRdb();
s_table9 [ RDB_TFNDB ] = g_tfndb.getRdb();
s_table9 [ RDB_CLUSTERDB ] = g_clusterdb.getRdb();
//s_table9 [ RDB_CATDB ] = g_catdb.getRdb();
s_table9 [ RDB_CATDB ] = g_catdb.getRdb();
s_table9 [ RDB_DATEDB ] = g_datedb.getRdb();
s_table9 [ RDB_LINKDB ] = g_linkdb.getRdb();
s_table9 [ RDB_CACHEDB ] = g_cachedb.getRdb();
@ -2380,7 +2380,7 @@ Rdb *getRdbFromId ( uint8_t rdbId ) {
// the opposite of the above
char getIdFromRdb ( Rdb *rdb ) {
if ( rdb == g_tagdb.getRdb () ) return RDB_TAGDB;
//if ( rdb == g_catdb.getRdb () ) return RDB_CATDB;
if ( rdb == g_catdb.getRdb () ) return RDB_CATDB;
if ( rdb == g_indexdb.getRdb () ) return RDB_INDEXDB;
if ( rdb == g_posdb.getRdb () ) return RDB_POSDB;
if ( rdb == g_datedb.getRdb () ) return RDB_DATEDB;
@ -2401,7 +2401,7 @@ char getIdFromRdb ( Rdb *rdb ) {
if ( rdb == g_revdb.getRdb () ) return RDB_REVDB;
//if ( rdb == g_sitedb.getRdb () ) return RDB_SITEDB;
//if ( rdb == g_tagdb2.getRdb () ) return RDB2_SITEDB2;
//if ( rdb == g_catdb.getRdb () ) return RDB_CATDB;
if ( rdb == g_catdb.getRdb () ) return RDB_CATDB;
if ( rdb == g_indexdb2.getRdb () ) return RDB2_INDEXDB2;
if ( rdb == g_posdb2.getRdb () ) return RDB2_POSDB2;
if ( rdb == g_datedb2.getRdb () ) return RDB2_DATEDB2;
@ -2425,7 +2425,7 @@ char getIdFromRdb ( Rdb *rdb ) {
char isSecondaryRdb ( uint8_t rdbId ) {
switch ( rdbId ) {
//case RDB2_SITEDB2 : return true;
//case RDB_CATDB2 : return g_catdb2.getRdb();
case RDB2_CATDB2 : return true;
case RDB2_INDEXDB2 : return true;
case RDB2_POSDB2 : return true;
case RDB2_DATEDB2 : return true;
@ -2532,6 +2532,7 @@ long getDataSizeFromRdbId ( uint8_t rdbId ) {
else if ( i == RDB2_TITLEDB2 ||
i == RDB2_REVDB2 ||
i == RDB2_TAGDB2 ||
i == RDB2_CATDB2 ||
i == RDB2_SPIDERDB2 ||
i == RDB2_PLACEDB2 )
ds = -1;

1
Rdb.h

@ -52,6 +52,7 @@ enum {
RDB2_REVDB2,
RDB2_TAGDB2,
RDB2_POSDB2, // 31
RDB2_CATDB2,
RDB_END
};
// how many rdbs are in "urgent merge" mode?

@ -680,7 +680,7 @@ bool SafeBuf::setEncoding(short cs) {
return true;
}
bool SafeBuf::utf8Encode(char *s, long len, bool encodeHTML,long niceness) {
bool SafeBuf::utf8Encode2(char *s, long len, bool encodeHTML,long niceness) {
long tmp = m_length;
if ( m_encoding == csUTF8 ) {
if (! safeMemcpy(s,len)) return false;
@ -1235,7 +1235,8 @@ void initTable ( ) {
}
}
bool SafeBuf::urlEncode ( bool spaceToPlus ) {
// url encode the whole buffer
bool SafeBuf::urlEncodeAllBuf ( bool spaceToPlus ) {
// this makes things faster
if ( ! s_init23 ) initTable();
// how many chars do we need?

@ -178,9 +178,9 @@ struct SafeBuf {
//insert strings in their native encoding
bool encode ( char *s , long len , long niceness=0) {
return utf8Encode(s,len,false,niceness); };
return utf8Encode2(s,len,false,niceness); };
// htmlEncode default = false
bool utf8Encode(char *s, long len, bool htmlEncode=false,
bool utf8Encode2(char *s, long len, bool htmlEncode=false,
long niceness=0);
bool latin1Encode(char *s, long len, bool htmlEncode=false,
long niceness=0);
@ -203,11 +203,15 @@ struct SafeBuf {
bool requestPath = false,
bool encodeApostrophes = false );
bool urlEncode (char *s ,
bool encodeApostrophes = false ) {
bool urlEncode (char *s ) {
return urlEncode ( s,strlen(s),false,false); };
bool urlEncode2 (char *s ,
bool encodeApostrophes ) { // usually false
return urlEncode ( s,strlen(s),false,encodeApostrophes); };
bool urlEncode ( bool spaceToPlus = true );
bool urlEncodeAllBuf ( bool spaceToPlus = true );
bool latin1CdataEncode(char *s, long len);
bool utf8CdataEncode(char *s, long len);

@ -1210,6 +1210,40 @@ bool SearchInput::setQueryBuffers ( ) {
m_displayQuery,
m_displayQueryLen);
//////////
//
// show DMOZ BREADCRUMB if doing a
// "gbpdcat:<catid> |" (Search restricted to category)
// "gbdcat:<catid>" (DMOZ urls in that topic, c=dmoz3)
// NOTE(review): the sscanf below scans the "gbcat:" prefix, but this
// comment — and the term prefix hashed by the indexer — say "gbdcat:".
// Confirm which query prefix is intended; as written, "gbdcat:<catid>"
// queries will not populate dcatId.
//
//////////
long pcatId = -1;
long dcatId = -1;
// get the final query
char *q =m_sbuf1.getBufStart();
if ( q ) sscanf(q,"gbpdcat:%li",&pcatId);
if ( q ) sscanf(q,"gbcat:%li",&dcatId);
// pick the one that is valid
long catId = -1;
if ( pcatId >= 0 ) catId = pcatId;
if ( dcatId >= 0 ) catId = dcatId;
//////
//
// save catid into the state
m_catId = catId;
//
///////
// are we a right to left language like hebrew?
if ( catId > 0 && g_categories->isIdRTL(catId) )
m_isRTL = true;
else
m_isRTL = false;
return true;
}

@ -400,6 +400,9 @@ class SearchInput {
SafeBuf m_sbuf2;
SafeBuf m_sbuf3;
long m_catId;
bool m_isRTL;
// make a cookie from parms with m_flags of PF_COOKIE set
SafeBuf m_cookieBuf;

@ -2790,7 +2790,11 @@ char **XmlDoc::getTitleRec ( ) {
long dslen = 0;
unsigned char dalen = 0;
// store all dmoz info separated by \0's into titles[] buffer
// . store all dmoz info separated by \0's into titles[] buffer
// . crap, this does a disk read and blocks on that
//
// . TODO: make it non-blocking!!!!
//
g_categories->getTitleAndSummary ( m_firstUrl.getUrl(),
m_firstUrl.getUrlLen(),
ptr_catIds[i],
@ -3372,7 +3376,7 @@ CatRec *XmlDoc::getCatRec ( ) {
// return what we got
if ( m_catRecValid ) return &m_catRec;
// call that
setStatus ("getting cat rec");
setStatus ("getting dmoz cat rec");
// callback?
if ( m_calledMsg8b ) {
// return NULL on error
@ -3386,7 +3390,8 @@ CatRec *XmlDoc::getCatRec ( ) {
// assume empty and skip the call for now
m_catRec.reset();
m_catRecValid = true;
return &m_catRec;
// let's bring dmoz back
//return &m_catRec;
// compute it otherwise
if ( ! m_msg8b.getCatRec ( &m_firstUrl ,
m_coll ,
@ -20303,7 +20308,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
if ( ! hashUrl ( table ) ) return NULL;
if ( ! hashMetaTags ( table ) ) return NULL;
if ( ! hashMetaZip ( table ) ) return NULL;
//if ( ! hashCategories ( table ) ) return NULL;
if ( ! hashDMOZCategories( table ) ) return NULL;
if ( ! hashLanguage ( table ) ) return NULL;
if ( ! hashCountry ( table ) ) return NULL;
if ( ! hashSiteNumInlinks( table ) ) return NULL;
@ -21789,6 +21794,113 @@ bool XmlDoc::searchboxToGigablast ( ) {
return m_xml.hasGigablastForm();
}
// . bring back support for dmoz integration
// . when clicking on a "search within this category" it does a gbpdcat:<catid>
//   search to capture all pages that have that dmoz category as one of their
//   parent topics
// . for each DIRECT catid this doc is in, hashes:
//   - "gbdcat:<catid>"            (the category itself)
//   - "gbpdcat:<parentid>"        (the category and every ancestor up to Top)
//   - the DMOZ title   (as HASHGROUP_TITLE)
//   - the DMOZ summary (as HASHGROUP_BODY)
// . for each INDIRECT catid, hashes "gbicat:"/"gbpicat:" analogously
// . returns true (no failure paths)
bool XmlDoc::hashDMOZCategories ( HashTableX *tt ) {
	// dmoz titles/summaries are stored \0-separated in the title rec,
	// one entry per direct catid, in the same order as ptr_catIds
	char *titlePtr = ptr_dmozTitles;
	char *sumPtr   = ptr_dmozSumms;
	//char *anchPtr = ptr_dmozAnchors;
	char buf[128];
	HashInfo hi;
	hi.m_tt = tt;
	// catids are stored as 4-byte longs in ptr_catIds
	long *catIds    = (long *)ptr_catIds;
	long  numCatIds = size_catIds / 4;
	// go through the catIds and hash them
	for (long i = 0; i < numCatIds; i++) {
		// . reset the hash group every iteration. the previous
		//   iteration left it set to HASHGROUP_BODY (from hashing
		//   the summary) which would wrongly apply to this catid's
		//   gbdcat:/gbpdcat: terms. the indirect-catid loop below
		//   already resets it per-iteration.
		hi.m_hashGroup = HASHGROUP_INTAG;
		// . write the catid as a string
		// . cast: catids are non-negative, and "%lu" requires an
		//   unsigned long argument to be well-defined
		sprintf(buf, "%lu", (unsigned long)catIds[i]);
		// term prefix for hashing
		hi.m_prefix = "gbdcat";
		// hash it
		hashString ( buf , gbstrlen(buf) , &hi );
		// we also want to hash the parents
		long currCatId    = catIds[i];
		long currParentId = catIds[i];
		long currCatIndex;
		// . loop to the Top, Top = 1
		// . note: the category itself is hashed as its own first
		//   "parent" so a gbpdcat: search matches it too
		while ( currCatId > 1 ) {
			// hash the parent
			sprintf(buf, "%lu", (unsigned long)currParentId);
			hi.m_prefix = "gbpdcat";
			hashString ( buf , gbstrlen(buf), &hi );
			// next cat
			currCatId = currParentId;
			// . get the index for this cat
			// . bail if unknown so we never loop forever on a
			//   corrupt/incomplete hierarchy
			currCatIndex = g_categories->getIndexFromId(currCatId);
			if ( currCatIndex <= 0 ) break;
			// get the parent for this cat
			currParentId =
				g_categories->m_cats[currCatIndex].m_parentid;
		}
		// do not hash titles or summaries if "index article content
		// only" parm is on
		//if ( tr->eliminateMenus() ) continue;
		// hash dmoz title
		hi.m_prefix = NULL;
		// call this DMOZ title as regular title i guess
		hi.m_hashGroup = HASHGROUP_TITLE;
		// hash the DMOZ title
		hashString ( titlePtr , gbstrlen(titlePtr), &hi );
		// next title (entries are \0-separated)
		titlePtr += gbstrlen(titlePtr) + 1;
		// hash DMOZ summary
		hi.m_prefix = NULL;
		// call this DMOZ summary as body i guess
		hi.m_hashGroup = HASHGROUP_BODY;
		// hash the DMOZ summary
		hashString ( sumPtr , gbstrlen(sumPtr), &hi );
		// next summary (entries are \0-separated)
		sumPtr += gbstrlen(sumPtr) + 1;
	}
	long  numIndCatIds = size_indCatIds / 4;
	long *indCatIds    = (long *)ptr_indCatIds;
	// go through the INDIRECT catIds and hash them
	for (long i = 0 ; i < numIndCatIds; i++) {
		// write the catid as a string (cast: see note above)
		sprintf(buf, "%lu", (unsigned long)indCatIds[i]);
		// use prefix
		hi.m_prefix = "gbicat";
		hi.m_hashGroup = HASHGROUP_INTAG;
		// hash it
		hashString ( buf , gbstrlen(buf), &hi );
		// we also want to hash the parents
		long currCatId    = indCatIds[i];
		long currParentId = indCatIds[i];
		long currCatIndex;
		// loop to the Top, Top = 1
		while (currCatId > 1) {
			// hash the parent
			sprintf(buf, "%lu", (unsigned long)currParentId);
			// new prefix
			hi.m_prefix = "gbpicat";
			// hash it
			hashString ( buf , gbstrlen(buf), &hi );
			// next cat
			currCatId = currParentId;
			// get the index for this cat
			currCatIndex = g_categories->getIndexFromId(currCatId);
			if ( currCatIndex <= 0 ) break;
			// get the parent for this cat
			currParentId =
				g_categories->m_cats[currCatIndex].m_parentid;
		}
	}
	return true;
}
bool XmlDoc::hashLanguage ( HashTableX *tt ) {
setStatus ( "hashing language" );

@ -693,6 +693,7 @@ class XmlDoc {
bool hashZipCodes ( class HashTableX *table ) ;
bool hashMetaZip ( class HashTableX *table ) ;
bool hashContentType ( class HashTableX *table ) ;
bool hashDMOZCategories ( class HashTableX *table ) ;
bool hashLinks ( class HashTableX *table ) ;
bool hashUrl ( class HashTableX *table ) ;
bool hashSections ( class HashTableX *table ) ;

@ -21,6 +21,11 @@
bool closeAll ( void *state , void (* callback)(void *state) ) { return true; }
bool allExit ( ) { return true; };
bool sendPageSEO(TcpSocket *s, HttpRequest *hr) {return true;}
//long g_qbufNeedSave = false;
//SafeBuf g_qbuf;
#define RDFBUFFER_SIZE (1024*1024*10)
#define RDFSTRUCTURE_FILE "structure.rdf.u8"
#define RDFCONTENT_FILE "content.rdf.u8"
@ -518,7 +523,7 @@ bool isGoodUrl ( char *url, long urlLen ) {
if ( urlLen <= 0 )
return false;
for (long i = 0; i < urlLen; i++) {
if (is_space(url[i]))
if (is_wspace_a(url[i]))
return false;
}
// check for [prot]://[url]
@ -621,7 +626,7 @@ long fixUrl ( char *url, long urlLen ) {
memmove(&url[slashi-1], &url[slashi], newUrlLen - slashi);
newUrlLen--;
}
if (is_space(url[slashi])) {
if (is_wspace_a(url[slashi])) {
memmove(&url[slashi], &url[slashi+1], newUrlLen - (slashi+1));
newUrlLen--;
}
@ -678,7 +683,7 @@ int main ( int argc, char *argv[] ) {
long m = 0;
long newNameBufferSize = 0;
long newOffset = 0;
char filename[256];
char filename[1256];
long urlTxtCount = 0;
long urlTxtFile = 0;
Url normUrl;
@ -695,6 +700,7 @@ int main ( int argc, char *argv[] ) {
bool splitUrls = false;
char mode = MODE_NONE;
long totalNEC = 0;
char *dir;
// check the options and mode
for (long i = 0; i < argc; i++) {
@ -783,20 +789,29 @@ int main ( int argc, char *argv[] ) {
goto errExit;
}
dir = "";
retry:
// open the structure file
if ( mode == MODE_NEW || mode == MODE_CATDUMP )
sprintf(filename, "%s", RDFSTRUCTURE_FILE);
sprintf(filename, "%s%s", dir,RDFSTRUCTURE_FILE);
else
sprintf(filename, "%s.new", RDFSTRUCTURE_FILE);
sprintf(filename, "%s%s.new", dir,RDFSTRUCTURE_FILE);
//rdfStream.open(filename, ifstream::in);
rdfStream = open ( filename, O_RDONLY );
// make sure it openned okay
// make sure it opened okay
//if (!rdfStream.is_open()) {
if ( rdfStream < 0 ) {
printf("Error Openning %s\n", filename);
// try ./cat/ subdir if not found
if ( ! dir[0] ) {
dir = "./cat/";
goto retry;
}
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("Openned Structure File: %s\n", filename);
printf("Opened Structure File: %s\n", filename);
// take the first chunk
//rdfStream.read(rdfBuffer, RDFBUFFER_SIZE);
@ -832,7 +847,9 @@ int main ( int argc, char *argv[] ) {
nameLen = MAX_HTTP_FILENAME_LEN;
nameLen = htmlDecode ( htmlDecoded,
&nameBuffer[nameOffset],
nameLen );
nameLen ,
false,
0);
memcpy(&nameBuffer[nameOffset], htmlDecoded, nameLen);
nameBufferLen += nameLen;
// parse the catid
@ -977,7 +994,9 @@ nextChildTag:
childNameLen = MAX_HTTP_FILENAME_LEN;
childNameLen = htmlDecode ( htmlDecoded,
childName,
childNameLen );
childNameLen ,
false,
0);
memcpy(childName, htmlDecoded, childNameLen);
// cut off the leading label if symbolic
// if (parentType == 2) {
@ -1066,25 +1085,25 @@ fileEnd1:
for (long i = 0; i < numRdfCats; i++) {
// get the hash of the path
rawPathLen = printCatPath(rawPath, rdfCats[i].m_catid, true);
rdfCats[i].m_catHash = hash32Lower(rawPath, rawPathLen, 0);
rdfCats[i].m_catHash = hash32Lower_a(rawPath, rawPathLen, 0);
}
// . now we want to serialize the needed data into
// one (or more?) file(s) to be quickly read by gb
if ( mode == MODE_NEW )
sprintf(filename, "%s", STRUCTURE_OUTPUT_FILE);
sprintf(filename, "%s%s", dir,STRUCTURE_OUTPUT_FILE);
else
sprintf(filename, "%s.new", STRUCTURE_OUTPUT_FILE);
sprintf(filename, "%s%s.new", dir,STRUCTURE_OUTPUT_FILE);
//outStream.open(filename, ofstream::out|ofstream::trunc);
outStream = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it openned okay
// make sure it opened okay
//if (!outStream.is_open()) {
if ( outStream < 0 ) {
printf("Error Openning %s\n", filename);
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("\nOpenned %s for writing.\n", filename);
printf("\nOpened %s for writing.\n", filename);
// write the size of the truncated name buffer
//outStream.write((char*)&newNameBufferSize, sizeof(long));
@ -1152,18 +1171,18 @@ contentParse:
// open the content file
if ( mode == MODE_NEW || mode == MODE_URLDUMP )
sprintf(filename, "%s", RDFCONTENT_FILE);
sprintf(filename, "%s%s", dir,RDFCONTENT_FILE);
else
sprintf(filename, "%s.new", RDFCONTENT_FILE);
sprintf(filename, "%s%s.new", dir,RDFCONTENT_FILE);
//rdfStream.open(filename, ifstream::in);
rdfStream = open ( filename, O_RDONLY );
// make sure it openned okay
// make sure it opened okay
//if (!rdfStream.is_open()) {
if ( rdfStream < 0 ) {
printf("Error Openning %s\n", filename);
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("\nOpenned Content File: %s\n", filename);
printf("\nOpened Content File: %s\n", filename);
// take the first chunk
//rdfStream.read(rdfBuffer, RDFBUFFER_SIZE);
@ -1199,13 +1218,13 @@ contentParse:
//outStream2.open(filename, ofstream::out|ofstream::trunc);
outStream2 = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it openned okay
// make sure it opened okay
//if (!outStream2.is_open()) {
if ( outStream2 < 0 ) {
printf("Error Openning %s\n", filename);
printf("Error Opening %s\n", filename);
goto errExit1;
}
printf("Openned %s for writing.\n", filename);
printf("Opened %s for writing.\n", filename);
// if we're doing a diffurldump, load up the diff file first
if ( mode == MODE_DIFFURLDUMP ) {
@ -1219,10 +1238,10 @@ contentParse:
diffInStream = open(filename, O_RDONLY);
//if (!diffInStream.is_open()) {
if ( diffInStream < 0 ) {
printf("Error Openning %s\n", filename);
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("Openned Diff File: %s\n", filename);
printf("Opened Diff File: %s\n", filename);
// read in the number of urls to update/add
//diffInStream.read((char*)&numUpdateIndexes,
@ -1326,14 +1345,14 @@ contentParse:
outStream2 = open ( filename,
O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it openned okay
// make sure it opened okay
//if (!outStream2.is_open()) {
if ( outStream2 < 0 ) {
printf("Error Openning %s\n",
printf("Error Opening %s\n",
filename);
goto errExit1;
}
printf("Openned %s for writing.\n",
printf("Opened %s for writing.\n",
filename);
urlTxtCount = 0;
}
@ -1348,20 +1367,20 @@ contentParse:
}
else {
if ( mode == MODE_NEW )
sprintf(filename, "%s", CONTENT_OUTPUT_FILE);
sprintf(filename, "%s%s", dir,CONTENT_OUTPUT_FILE);
else
sprintf(filename, "%s.new", CONTENT_OUTPUT_FILE);
sprintf(filename, "%s%s.new", dir,CONTENT_OUTPUT_FILE);
// stream the urls into the content
//outStream.open(filename, ofstream::out|ofstream::trunc);
outStream = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it openned okay
// make sure it opened okay
//if (!outStream.is_open()) {
if ( outStream < 0 ) {
printf("Error Openning %s\n", filename);
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("Openned %s for writing.\n", filename);
printf("Opened %s for writing.\n", filename);
// store a space for the number of urls at the start of the file
//outStream.write((char*)&numUrlInfos, sizeof(long));
@ -1442,7 +1461,8 @@ hashLink:
// html decode the url
if (urlLen > MAX_URL_LEN)
urlLen = MAX_URL_LEN;
urlLen = htmlDecode(decodedUrl, &urlBuffer[urlOffset], urlLen);
urlLen = htmlDecode(decodedUrl, &urlBuffer[urlOffset], urlLen,
false,0);
memcpy(&urlBuffer[urlOffset], decodedUrl, urlLen);
// fix up bad urls
urlLen = fixUrl(&urlBuffer[urlOffset], urlLen);
@ -1473,7 +1493,7 @@ hashLink:
//urlBufferLen += urlLen;
// get the hash value
unsigned long long urlHash =
hash64Lower(&urlBuffer[urlOffset], urlLen, 0);
hash64Lower_a(&urlBuffer[urlOffset], urlLen, 0);
//unsigned long urlHash2 =
// hash32Lower(&urlBuffer[urlOffset], urlLen, 0);
// see if it's already indexed
@ -1530,14 +1550,14 @@ hashLink:
outStream2 = open ( filename,
O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it openned okay
// make sure it opened okay
//if (!outStream2.is_open()) {
if ( outStream2 < 0 ) {
printf("Error Openning %s\n",
printf("Error Opening %s\n",
filename);
goto errExit1;
}
printf("Openned %s for writing.\n",
printf("Opened %s for writing.\n",
filename);
urlTxtCount = 0;
}
@ -1697,19 +1717,19 @@ fileEnd2:
// load the content and url files
// url info (content) file
sprintf(filename, "%s", CONTENT_OUTPUT_FILE);
sprintf(filename, "%s%s", dir,CONTENT_OUTPUT_FILE);
//rdfStream.open(filename, ifstream::in);
rdfStream = open ( filename, O_RDONLY );
//if (!rdfStream.is_open()) {
if ( rdfStream < 0 ) {
printf("Error Openning %s\n", CONTENT_OUTPUT_FILE);
printf("Error Opening %s\n", filename);
goto oldErrExit;
}
// read in the number of urls
//rdfStream.read((char*)&oldNumUrls, sizeof(long));
if (fileRead(rdfStream, &oldNumUrls, sizeof(long)) !=
sizeof(long)) {
printf("Error Reading %s\n", CONTENT_OUTPUT_FILE);
printf("Error Reading %s\n", filename);
goto oldErrExit;
}
@ -1749,8 +1769,8 @@ fileEnd2:
//rdfStream.read((char*)&urlLen, sizeof(short));
long n = fileRead(rdfStream, &urlLen, sizeof(short));
if ( n < 0 || n > (long)sizeof(short) ) {
printf("Error Reading %s\n",
CONTENT_OUTPUT_FILE);
printf("Error Reading %s\n",filename);
//CONTENT_OUTPUT_FILE);
goto oldErrExit;
}
if ( n == 0 )
@ -1780,8 +1800,8 @@ fileEnd2:
}
n = fileRead(rdfStream, &oldUrls[urlp], urlLen);
if ( n < 0 || n > urlLen ) {
printf("Error Reading %s\n",
CONTENT_OUTPUT_FILE);
printf("Error Reading %s\n",filename);
//CONTENT_OUTPUT_FILE);
goto oldErrExit;
}
if ( n == 0 )
@ -1791,7 +1811,7 @@ fileEnd2:
urlLen = fixUrl(&oldUrls[urlp], urlLen);
// make the hash
oldUrlHashes[currUrl] =
hash64Lower(&oldUrls[urlp], urlLen, 0);
hash64Lower_a(&oldUrls[urlp], urlLen, 0);
removeOldUrl[currUrl] = 0;
// increment the buffer pointer
if (urlLen <= 0) {
@ -1814,8 +1834,8 @@ fileEnd2:
//rdfStream.read((char*)&oldNumCatids[currUrl], 1);
long n = fileRead(rdfStream, &oldNumCatids[currUrl], 1);
if ( n < 0 || n > 1 ) {
printf("Error Reading %s\n",
CONTENT_OUTPUT_FILE);
printf("Error Reading %s\n",filename);
//CONTENT_OUTPUT_FILE);
goto oldErrExit;
}
if ( n == 0 )
@ -1839,8 +1859,8 @@ fileEnd2:
long readSize = sizeof(long)*oldNumCatids[currUrl];
n = fileRead(rdfStream, &oldCatids[catidp], readSize);
if ( n < 0 || n > readSize ) {
printf("Error Reading %s\n",
CONTENT_OUTPUT_FILE);
printf("Error Reading %s\n",filename);
//CONTENT_OUTPUT_FILE);
goto oldErrExit;
}
if ( n == 0 )
@ -1907,17 +1927,17 @@ oldIsDifferent:
// also urls to remove
//
// open the new diff file for writing
sprintf(filename, "%s.new.diff", CONTENT_OUTPUT_FILE);
sprintf(filename, "%s%s.new.diff", dir,CONTENT_OUTPUT_FILE);
//outStream.open(filename, ofstream::out|ofstream::trunc);
outStream = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it openned okay
// make sure it opened okay
//if (!outStream.is_open()) {
if ( outStream < 0 ) {
printf("Error Openning %s\n", filename);
printf("Error Opening %s\n", filename);
goto oldErrExit;
}
printf("\nOpenned %s for writing.\n", filename);
printf("\nOpened %s for writing.\n", filename);
// write out the number of urls to update/add
//outStream.write(&numUpdateUrls, sizeof(long));
@ -2027,19 +2047,19 @@ oldGoodExit:
// . now we want to serialize the needed data into
// one (or more?) file(s) to be quickly read by gb
if ( mode == MODE_NEW )
sprintf(filename, "%s", STRUCTURE_OUTPUT_FILE);
sprintf(filename, "%s%s", dir,STRUCTURE_OUTPUT_FILE);
else
sprintf(filename, "%s.new", STRUCTURE_OUTPUT_FILE);
sprintf(filename, "%s%s.new", dir,STRUCTURE_OUTPUT_FILE);
//outStream.open(filename, ofstream::out|ofstream::ate);
outStream = open ( filename, O_WRONLY|O_APPEND,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it openned okay
// make sure it opened okay
//if (!outStream.is_open()) {
if ( outStream < 0 ) {
printf("Error Openning %s\n", filename);
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("\nOpenned %s for writing.\n", filename);
printf("\nOpened %s for writing.\n", filename);
// write the cats
//outStream.write((char*)rdfCats, sizeof(RdfCat)*numRdfCats);
@ -2109,21 +2129,21 @@ oldGoodExit:
// write another file for the urls
if ( mode == MODE_NEW )
sprintf(filename, "%s", CONTENT_OUTPUT_FILE);
sprintf(filename, "%s%s", dir,CONTENT_OUTPUT_FILE);
else
sprintf(filename, "%s.new", CONTENT_OUTPUT_FILE);
sprintf(filename, "%s%s.new", dir,CONTENT_OUTPUT_FILE);
//outStream.open(filename, ofstream::out|ofstream::ate);
outStream = open ( filename, O_WRONLY,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
//outStream.open(filename, ofstream::out|ofstream::trunc);
//endpos = outStream.tellp();
// make sure it openned okay
// make sure it opened okay
//if (!outStream.is_open()) {
if ( outStream < 0 ) {
printf("Error Openning %s\n", filename);
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("\nOpenned %s for writing.\n", filename);
printf("\nOpened %s for writing.\n", filename);
//outStream.seekp(0);
lseek(outStream, 0, SEEK_SET);

@ -22,7 +22,7 @@
#include "Titledb.h"
#include "Revdb.h"
#include "Tagdb.h"
//#include "Catdb.h"
#include "Catdb.h"
#include "Users.h"
#include "Tfndb.h"
#include "Spider.h"
@ -2624,8 +2624,8 @@ int main ( int argc , char *argv[] ) {
if ( ! g_tagdb.init() ) {
log("db: Tagdb init failed." ); return 1; }
// the catdb, it's an instance of tagdb, pass RDB_CATDB
//if ( ! g_catdb.init() ) {
// log("db: Catdb1 init failed." ); return 1; }
if ( ! g_catdb.init() ) {
log("db: Catdb1 init failed." ); return 1; }
// initialize Users
if ( ! g_users.init() ){
log("db: Users init failed. "); return 1;}
@ -10986,7 +10986,8 @@ void dumpTagdb (char *coll,long startFileNum,long numFiles,bool includeTree,
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
g_tagdb.init ();
g_collectiondb.init(true);
g_tagdb.addColl ( coll, false );
if ( rdbId == RDB_TAGDB ) g_tagdb.addColl ( coll, false );
if ( rdbId == RDB_CATDB ) g_catdb.init();
key128_t startKey ;
key128_t endKey ;
startKey.setMin();
@ -11051,6 +11052,21 @@ void dumpTagdb (char *coll,long startFileNum,long numFiles,bool includeTree,
printf("corrupt tagdb rec k.n0=%llu",k.n0);
continue;
}
// catdb?
if ( rdbId == RDB_CATDB ) {
// for debug!
CatRec crec;
crec.set ( NULL,
data ,
size ,
false);
printf("caturl=%s #catids=%li version=%li\n"
,crec.m_url
,(long)crec.m_numCatids
,(long)crec.m_version
);
continue;
}
// parse it up
//TagRec *tagRec = (TagRec *)rec;
Tag *tag = (Tag *)rec;
@ -13945,10 +13961,10 @@ void saveRdbs ( int fd , void *state ) {
last = rdb->getLastWriteTime();
if ( now - last > delta )
if ( ! rdb->close(NULL,NULL,false,false)) return;
//rdb = g_catdb.getRdb();
//last = rdb->getLastWriteTime();
//if ( now - last > delta )
// if ( ! rdb->close(NULL,NULL,false,false)) return;
rdb = g_catdb.getRdb();
last = rdb->getLastWriteTime();
if ( now - last > delta )
if ( ! rdb->close(NULL,NULL,false,false)) return;
//rdb = g_indexdb.getRdb();
//last = rdb->getLastWriteTime();
//if ( now - last > delta )