open-source-search-engine/Categories.cpp

#include "gb-include.h"

#include "Categories.h"
#include "Catdb.h"
#include "Loop.h"
#include "sort.h"
#include "LanguageIdentifier.h"
using namespace std;

// Two statically allocated category sets plus a pointer to a Categories
// object.
// NOTE(review): presumably g_categories points at whichever of the two
// instances is currently live so the other can be (re)loaded -- confirm
// against the callers, nothing in this file assigns it.
Categories  g_categories1;
Categories  g_categories2;
Categories *g_categories;

// qsort-style comparator for CategoryHash entries, defined further below
static int sortCatHash ( const void *h1, const void *h2 );

// . read exactly "count" bytes from descriptor "fileid" into "buf"
// . loops because a single read() may hand back fewer bytes than asked for
// . returns "count" on success
// . on EOF or error returns the result of the last read() (0 or negative),
//   even if some bytes had already been consumed, so callers must compare
//   the return value against "count"
int32_t Categories::fileRead ( int fileid, void *buf, size_t count ) {
	char *dst = (char *)buf;
	uint32_t total = 0;
	while ( total < count ) {
		int32_t n = read ( fileid, dst, count - total );
		// a signal interrupted us before any data arrived; that is
		// not a real failure, so just try again
		if ( n < 0 && errno == EINTR ) continue;
		// EOF, hard error or a nonsensical byte count
		if ( n <= 0 || n > (int32_t)count )
			return n;
		total += n;
		dst   += n;
	}
	return total;
}

// start out empty; nothing is allocated until loadCategories() is called
Categories::Categories() {
	m_cats = NULL;
	// m_catHash points into m_buffer just like m_cats does, so it must
	// start out NULL as well rather than holding garbage
	m_catHash = NULL;
	m_numCats = 0;
	m_nameBuffer = NULL;
	m_nameBufferSize = 0;
	m_buffer = NULL;
	m_bufferSize = 0;
}

// release whatever loadCategories() allocated
Categories::~Categories() { reset(); }

void Categories::reset() {
	if (m_buffer) {
		mfree ( m_buffer,
			m_bufferSize,
			"Categories" );
		m_buffer = NULL;
	}
}

// . load the category structure from "filename", which is usually
//   ./catdb/gbdmoz.structure.dat
// . layout as consumed here: int32 name buffer size, int32 number of cats,
//   the name buffer itself, then 26 packed bytes per category (catid,
//   parentid, nameOffset, int16 nameLen, structureOffset, contentOffset,
//   numUrls) and finally one int32 path hash per category
// . the hashes are sorted afterwards so getIndexFromPath() can binary search
// . returns 0 on success and 1 on error; g_errno is set to ENOMEM when an
//   allocation fails
int32_t Categories::loadCategories ( char *filename ) {
	// open the structure file
	int inStream = open(filename, O_RDONLY);
	// make sure it opened okay
	if ( inStream < 0 ) {
		log("cat: Error opening structure file: %s", filename);
		return 1;
	}
	// read the size of the name buffer, then the number of cats
	if ( fileRead ( inStream, &m_nameBufferSize, sizeof(int32_t) ) !=
			sizeof(int32_t) ||
	     fileRead ( inStream, &m_numCats, sizeof(int32_t) ) !=
			sizeof(int32_t) ) {
		log("cat: Error reading structure file: %s", filename);
		close(inStream);
		return 1;
	}
	// a corrupt header must not be allowed to drive the allocations and
	// copies below with negative sizes
	if ( m_nameBufferSize < 0 || m_numCats < 0 ) {
		log("cat: Error reading structure file: %s", filename);
		close(inStream);
		return 1;
	}
	// one allocation holds the names, the cats and the hash array
	m_bufferSize = m_nameBufferSize +
		       sizeof(Category)*m_numCats +
		       sizeof(CategoryHash)*m_numCats;
	m_buffer = (char*)mmalloc(m_bufferSize, "Categories");
	if (!m_buffer) {
		log("cat: Could not allocate %" INT32 " bytes for Category Buffer",
		    m_bufferSize);
		close(inStream);
		g_errno = ENOMEM;
		return 1;
	}
	// carve the allocation up into its three regions
	m_nameBuffer = m_buffer;
	m_cats       = (Category*)(m_buffer + (sizeof(char)*m_nameBufferSize));
	m_catHash    = (CategoryHash*)(m_buffer +
				       (sizeof(char)*m_nameBufferSize) +
				       (sizeof(Category)*m_numCats));

	// . temp buffer so the rest of the file is read in one go
	// . 30 = 26 bytes of packed category fields + 4 bytes of hash
	int32_t readSize = m_nameBufferSize + (m_numCats * 30);
	char *tempBuffer = (char*)mmalloc(readSize, "Categories");
	if ( !tempBuffer ) {
		log("cat: Could not allocate %" INT32 " bytes for File Temp Buffer",
		    readSize);
		close(inStream);
		g_errno = ENOMEM;
		return 1;
	}
	// read the rest of the file into the temp buffer
	if ( fileRead ( inStream, tempBuffer, readSize ) != readSize ) {
		log("cat: Error reading structure file: %s", filename);
		close(inStream);
		// do not leak the temp buffer on a short or failed read
		mfree(tempBuffer, readSize, "Categories");
		return 1;
	}
	// everything we need is in memory now
	close(inStream);

	// the names come first
	char *p = tempBuffer;
	gbmemcpy ( m_nameBuffer, p, m_nameBufferSize );
	p += m_nameBufferSize;

	// then the cats; the on-disk record is packed so copy field by field
	for (int32_t i = 0; i < m_numCats; i++) {
		gbmemcpy(&m_cats[i].m_catid, p, sizeof(int32_t));
		p += sizeof(int32_t);
		gbmemcpy(&m_cats[i].m_parentid, p, sizeof(int32_t));
		p += sizeof(int32_t);
		gbmemcpy(&m_cats[i].m_nameOffset, p, sizeof(int32_t));
		p += sizeof(int32_t);
		gbmemcpy(&m_cats[i].m_nameLen, p, sizeof(int16_t));
		p += sizeof(int16_t);
		gbmemcpy(&m_cats[i].m_structureOffset, p, sizeof(int32_t));
		p += sizeof(int32_t);
		gbmemcpy(&m_cats[i].m_contentOffset, p, sizeof(int32_t));
		p += sizeof(int32_t);
		gbmemcpy(&m_cats[i].m_numUrls, p, sizeof(int32_t));
		p += sizeof(int32_t);
	}
	// then the category hashes, remembering which cat each belongs to
	for (int32_t i = 0; i < m_numCats; i++) {
		gbmemcpy(&m_catHash[i].m_hash, p, sizeof(int32_t));
		p += sizeof(int32_t);
		m_catHash[i].m_catIndex = i;
	}
	// is this a bottleneck? shouldn't it be stored that way on disk?
	int64_t start = gettimeofdayInMilliseconds();
	// sort the category hash by hash value
	gbsort(m_catHash, m_numCats, sizeof(CategoryHash), sortCatHash);

	// sanity check - dups are only reported, not treated as fatal
	uint32_t last = 0xffffffff;
	for ( int32_t i = 0 ; i < m_numCats ; i++ ) {
		if ( m_catHash[i].m_hash == last )
			log("dmoz: hash collision on %" UINT32 "",last);
		last = m_catHash[i].m_hash;
	}

	// time it
	int64_t took = gettimeofdayInMilliseconds();
	if ( took - start > 100 ) log(LOG_INIT,"admin: Took %" INT64 " ms to "
				      "sort cat hashes.",took-start);
	// free the temp buffer
	mfree(tempBuffer, readSize, "Categories");
	// now create the "bad" hash table, so we can quickly see if a
	// url is in the adult, gambling or online pharmacies categories
	if ( ! makeBadHashTable() ) return 1;
	// success
	return 0;
}

// . fill m_badTable with the url hashes of every "bad" category
// . a previously saved badcattable.dat is used if it has any entries,
//   otherwise the table is rebuilt from the cats and saved for next time
// . returns false (with g_errno set by the load) only on a load error
bool Categories::makeBadHashTable ( ) {
	// always start from an empty table
	m_badTable.reset();

	// . load() returns false and sets g_errno on a load error
	// . it returns true if the file simply does not exist
	bool loadedOk = m_badTable.load ( g_hostdb.m_dir , "badcattable.dat" );
	if ( ! loadedOk ) return false;

	// anything in there means the saved copy existed, so we are done
	if ( m_badTable.getNumSlotsUsed() > 0 ) return true;

	log(LOG_INFO,"cat: Generating hash table of bad url hashes.");

	// walk every cat and pull in the urls of the bad ones; the result
	// of addUrlsToBadHashTable() is deliberately ignored
	for ( int32_t idx = 0 ; idx < m_numCats ; idx++ ) {
		if ( isIdBad ( m_cats[idx].m_catid ) )
			addUrlsToBadHashTable ( m_cats[idx].m_catid );
	}

	// best effort save so the next startup is faster
	m_badTable.save ( g_hostdb.m_dir , "badcattable.dat" ) ;

	return true;
}

// true if the hash of this url's full text is present in m_badTable
bool Categories::isInBadCat ( Url *u ) {
	uint32_t urlHash = hash32 ( u->getUrl() , u->getUrlLen() );
	// a non-negative slot means the hash was found
	return m_badTable.getSlot ( urlHash ) >= 0;
}

// same as above for callers that already have the url hash in hand
bool Categories::isInBadCat ( uint32_t h ) {
	// a non-negative slot means the hash was found
	return m_badTable.getSlot ( h ) >= 0;
}

int sortCatHash ( const void *h1, const void *h2 ) {
	if (((CategoryHash*)h1)->m_hash < ((CategoryHash*)h2)->m_hash)
		return -1;
	else if (((CategoryHash*)h1)->m_hash > ((CategoryHash*)h2)->m_hash)
		return 1;
	else
		return 0;
}

// do a binary search to get a cat from an id
int32_t Categories::getIndexFromId ( int32_t catid ) {
	int32_t low  = 0;
	int32_t high = m_numCats-1;
	int32_t currCat;
	// binary search
	while (low <= high) {
		// next check spot
		currCat = (low + high)/2;
		// check for hit
		if (m_cats[currCat].m_catid == catid)
			return currCat;
		// shift search range
		else if (m_cats[currCat].m_catid > catid)
			high = currCat-1;
		else
			low  = currCat+1;
	}
	// not found
	return -1;
}

// . look up a category by its path, e.g. "Top/Arts" or "/Top/Arts/"
// . one leading and one trailing '/' are ignored
// . the path is hashed with hash32Lower_a() and binary searched in
//   m_catHash, which loadCategories() sorted by hash
// . returns the index into m_cats, 0 for "Top", or -1 if not found or if
//   the path is empty
int32_t Categories::getIndexFromPath ( char *str, int32_t strLen ) {
	if (!str || strLen <= 0)
		return -1;
	// remove any leading /
	if (str[0] == '/') {
		str++;
		strLen--;
	}
	// . remove any trailing /
	// . the length check matters: for a path of just "/" the length is
	//   already 0 here and str[strLen-1] would read before the string
	if (strLen > 0 && str[strLen-1] == '/')
		strLen--;
	// nothing left to look up
	if (strLen <= 0)
		return -1;
	// check for top
	if (strLen == 3 &&
	    strncasecmp(str, "Top", 3) == 0)
		// it is catid 2 right? but i guess zero is symbolic for us!
		return 0;
	// get the hash
	uint32_t hash = hash32Lower_a(str, strLen, 0);
	// binary search
	int32_t low  = 0;
	int32_t high = m_numCats-1;
	while (low <= high) {
		// next check spot
		int32_t currCat = (low + high)/2;
		// check for hit
		if (m_catHash[currCat].m_hash == hash)
			return m_catHash[currCat].m_catIndex;
		// shift search range
		else if (m_catHash[currCat].m_hash > hash)
			high = currCat-1;
		else
			low  = currCat+1;
	}
	// not found
	return -1;
}

// . return the catid for the given path
// . returns -1 if nothing is loaded or the path is unknown
int32_t Categories::getIdFromPath ( char *str, int32_t strLen ) {
	if ( ! m_cats ) return -1;
	int32_t index = getIndexFromPath(str, strLen);
	// an unknown path yields -1, which must not be used as an index
	if ( index < 0 ) return -1;
	return m_cats[index].m_catid;
}

// true if "catid" is one of the hard-coded roots of a right-to-left
// language subtree
bool Categories::isIdRTLStart ( int32_t catid ) {
	switch ( catid ) {
	case 88070:   // Top:World:Arabic
	case 39341:   // Top:World:Farsi
	case 118215:  // Top:World:Hebrew
	case 1214070: // Top:K&T:Inter:Arabic
	case 1262316: // Top:K&T:Inter:Farsi
	case 910298:  // Top:K&T:Inter:Hebrew
		return true;
	default:
		return false;
	}
}

// . same test as isIdRTLStart() but for an index into m_cats
// . index 0 and negative indexes are never an RTL start
bool Categories::isIndexRTLStart ( int32_t catIndex ) {
	if ( catIndex <= 0 ) return false;
	return isIdRTLStart(m_cats[catIndex].m_catid);
}

// . is this catid inside a right-to-left subtree?
// . an unknown catid is reported as not RTL
bool Categories::isIdRTL ( int32_t catid ) {
	int32_t catIndex = getIndexFromId(catid);
	return catIndex >= 0 && isIndexRTL(catIndex);
}

// . is the category at this index inside a right-to-left subtree?
// . walks up the parent chain until an RTL start is hit or the chain ends
//   (index 0, or a parentid that is not a known catid)
bool Categories::isIndexRTL ( int32_t catIndex ) {
	int32_t currIndex = catIndex;
	while (currIndex > 0) {
		// check if this is one of the RTLs
		if (isIdRTLStart(m_cats[currIndex].m_catid))
			return true;
		// otherwise check the parent
		int32_t parentIndex =
			getIndexFromId(m_cats[currIndex].m_parentid);
		// newer dmoz dumps mark a parentless topic by making its
		// parentid equal its catid (see printPathFromIndex); without
		// this check we would spin here forever
		if (parentIndex == currIndex)
			break;
		currIndex = parentIndex;
	}
	return false;
}

// true only for the root of the adult subtree, catid 17 (Top:Adult)
bool Categories::isIdAdultStart ( int32_t catid ) {
	return catid == 17;
}

// true if "catid" is the root of one of the subtrees we treat as "bad"
bool Categories::isIdBadStart ( int32_t catid ) {
	switch ( catid ) {
	case 17:     // Top:Adult
	case 144:    // Top:Games:Gambling
	case 128206: // Top:Shopping:Health:Pharmacy:Online_Pharmacies
		return true;
	default:
		return false;
	}
}

// . same test as isIdAdultStart() but for an index into m_cats
// . index 0 and negative indexes are never the adult root
bool Categories::isIndexAdultStart ( int32_t catIndex ) {
	if ( catIndex <= 0 ) return false;
	return isIdAdultStart(m_cats[catIndex].m_catid);
}

// . is this catid inside the adult subtree?
// . an unknown catid is reported as not adult
bool Categories::isIdAdult ( int32_t catid ) {
	int32_t catIndex = getIndexFromId(catid);
	return catIndex >= 0 && isIndexAdult(catIndex);
}

// . is this catid inside one of the "bad" subtrees?
// . an unknown catid is reported as not bad
bool Categories::isIdBad ( int32_t catid ) {
	int32_t catIndex = getIndexFromId(catid);
	return catIndex >= 0 && isIndexBad(catIndex);
}

// . is the category at this index inside the adult subtree?
// . walks up the parent chain until the adult root is hit or the chain
//   ends (index 0, or a parentid that is not a known catid)
bool Categories::isIndexAdult ( int32_t catIndex ) {
	int32_t currIndex = catIndex;
	while (currIndex > 0) {
		// check if this is the Adult category
		if ( isIdAdultStart(m_cats[currIndex].m_catid) )
			return true;
		// otherwise check the parent
		int32_t parentIndex =
			getIndexFromId(m_cats[currIndex].m_parentid);
		// newer dmoz dumps mark a parentless topic by making its
		// parentid equal its catid (see printPathFromIndex); without
		// this check we would spin here forever
		if (parentIndex == currIndex)
			break;
		currIndex = parentIndex;
	}
	return false;
}

// . is the category at this index under Adult, Gambling or Online
//   Pharmacies?
// . walks up the parent chain until a "bad" root is hit or the chain ends
//   (index 0, or a parentid that is not a known catid)
bool Categories::isIndexBad ( int32_t catIndex ) {
	int32_t currIndex = catIndex;
	while (currIndex > 0) {
		// check if this is a "bad" category
		if ( isIdBadStart(m_cats[currIndex].m_catid) )
			return true;
		// otherwise check the parent
		int32_t parentIndex =
			getIndexFromId(m_cats[currIndex].m_parentid);
		// newer dmoz dumps mark a parentless topic by making its
		// parentid equal its catid (see printPathFromIndex); without
		// this check we would spin here forever
		if (parentIndex == currIndex)
			break;
		currIndex = parentIndex;
	}
	return false;
}

// . log a human readable dump of the cats with index in [start,end)
// . no range checking is done on start/end
// . names longer than 300 bytes are truncated in the dump so the record
//   always fits in the local buffer
void Categories::printCats ( int32_t start, int32_t end ) {
	// . the fixed labels plus six worst-case 11 character int32 values
	//   come to under 200 bytes, so 300 bytes of name always fits in
	//   str[512]
	// . without this cap a long name overran the stack buffer
	const int32_t maxNameLen = 300;
	for (int32_t i = start; i < end; i++) {
		char str[512];
		char *s = str;
		s += sprintf(s, "Cat %" INT32 ":\n", i);
		s += sprintf(s, "  CatID: %" INT32 "\n", m_cats[i].m_catid);
		s += sprintf(s, "  Name:  ");
		int32_t nameLen = m_cats[i].m_nameLen;
		if ( nameLen > maxNameLen ) nameLen = maxNameLen;
		const char *name = m_nameBuffer + m_cats[i].m_nameOffset;
		for (int32_t n = 0; n < nameLen; n++)
			s += sprintf(s, "%c", name[n]);
		s += sprintf(s, "\n");
		s += sprintf(s, "  Name Offset:      %" INT32 "\n",
				m_cats[i].m_nameOffset);
		s += sprintf(s, "  Structure Offset: %" INT32 "\n",
				m_cats[i].m_structureOffset);
		s += sprintf(s, "  Content Offset:   %" INT32 "\n",
				m_cats[i].m_contentOffset);
		s += sprintf(s, "  Parent:           %" INT32 "\n",
				m_cats[i].m_parentid);
		s += sprintf(s, "\n");
		log ( LOG_INFO, "%s", str );
	}
}

// . append the path of category "catid" to "sb"
// . just maps the id to an index; an unknown id yields -1, which
//   printPathFromIndex() silently ignores
void Categories::printPathFromId ( SafeBuf *sb ,
				   int32_t catid,
				   bool raw,
				   bool isRTL ) {
	printPathFromIndex(sb, getIndexFromId(catid), raw, isRTL);
}

// . append the full path of the category at "catIndex" to "sb"
// . ancestors are printed first by recursing through printPathFromId()
// . raw:  names are copied verbatim and joined with "/"
// . !raw: names are html encoded, '_' becomes a space, parts are joined
//   with ": " and "</span><br>" follows a parent that starts an RTL subtree
// . "isRTL" is only handed on to the recursive call
// . does nothing for catIndex < 1
void Categories::printPathFromIndex ( SafeBuf *sb ,
				      int32_t catIndex,
				      bool raw,
				      bool isRTL ) {
	if ( catIndex < 1 ) return;

	const Category *cat = &m_cats[catIndex];
	int32_t parentId = cat->m_parentid;
	int32_t catid    = cat->m_catid;

	// . print the parent(s) first
	// . the new dmoz data dumps signify a parentless topic by having
	//   its parentid equal its catid, so do not recurse in that case
	bool hasParent = ( parentId >= 1 && parentId != catid );
	if ( hasParent ) {
		bool parentStartsRTL = isIdRTLStart(parentId);
		printPathFromId(sb, parentId, raw, isRTL);
		// separator between the parent path and this name
		if ( raw ) sb->safePrintf("/");
		else       sb->safePrintf(": ");
		// if parent was the start of RTL, <br>
		if ( parentStartsRTL && !raw )
			sb->safePrintf("</span><br>");
	}

	// now this category's own name
	int32_t nameLen = cat->m_nameLen;
	char *name = &m_nameBuffer[cat->m_nameOffset];

	// raw output is a straight copy
	if ( raw ) {
		sb->safeMemcpy(name, nameLen);
		return;
	}

	// otherwise html encode the name into a scratch buffer...
	char encodedName[2048];
	char *encodeEnd = htmlEncode ( encodedName,
				       encodedName + 2047,
				       name,
				       name + nameLen );
	int32_t encodedLen = encodeEnd - encodedName;
	// ...and emit it with every _ turned into a space
	for ( int32_t k = 0 ; k < encodedLen ; k++ ) {
		char c = encodedName[k];
		if ( c == '_' ) sb->safePrintf(" ");
		else            sb->safePrintf("%c", c);
	}
}

// . append a linked "bread crumb" path for category "catid" to "sb"
// . just maps the id to an index; an unknown id yields -1, which
//   printPathCrumbFromIndex() silently ignores
void Categories::printPathCrumbFromId ( SafeBuf *sb ,
					int32_t catid,
					bool isRTL ) {
	printPathCrumbFromIndex(sb, getIndexFromId(catid), isRTL);
}

// . append a "bread crumb" trail for the category at "catIndex" to "sb"
// . every path component becomes <a href="/raw/path/">Name</a>, with the
//   components joined by ": "
// . ancestors are printed first by recursing through
//   printPathCrumbFromId(), but only while the parentid is > 1 and differs
//   from the catid
// . does nothing for catIndex < 1
void Categories::printPathCrumbFromIndex ( SafeBuf *sb,
					   int32_t catIndex,
					   bool isRTL ) {
	if ( catIndex < 1 ) return;

	const Category *cat = &m_cats[catIndex];
	int32_t parentId = cat->m_parentid;
	int32_t catid    = cat->m_catid;

	// print the crumbs of the parent(s) first
	bool hasParent = ( parentId > 1 && parentId != catid );
	if ( hasParent ) {
		bool parentStartsRTL = isIdRTLStart(parentId);
		printPathCrumbFromId(sb, parentId, isRTL);
		// print a spacing
		sb->safePrintf(": ");
		// if parent starts RTL, <br>
		if ( parentStartsRTL && isRTL )
			sb->safePrintf("</span><br>");
	}

	// the link target is this category's raw path
	sb->safePrintf("<a href=\"/");
	printPathFromIndex(sb, catIndex, true, isRTL);
	sb->safePrintf("/\">");

	// the link text is the html encoded name...
	char *name = &m_nameBuffer[cat->m_nameOffset];
	int32_t nameLen = cat->m_nameLen;
	char encodedName[2048];
	char *encodeEnd = htmlEncode ( encodedName,
				       encodedName + 2047,
				       name,
				       name + nameLen );
	int32_t encodedLen = encodeEnd - encodedName;
	// ...with every _ turned into a space
	for ( int32_t k = 0 ; k < encodedLen ; k++ ) {
		char c = encodedName[k];
		if ( c == '_' ) sb->safePrintf(" ");
		else            sb->safePrintf("%c", c);
	}

	sb->safePrintf("</a>");
}

// . advance m_rdfPtr (and m_currOffset) by "skip" bytes
// . whenever the end of the buffered chunk is reached the next chunk is
//   read from m_rdfStream into m_rdfBuffer
// . returns the new m_rdfPtr, or NULL if that read hits EOF or fails
char* Categories::incRdfPtr( int32_t skip ) {
	for ( int32_t left = skip ; left > 0 ; left-- ) {
		m_rdfPtr++;
		m_currOffset++;
		// still inside the current chunk?
		if ( m_rdfPtr != m_rdfEnd ) continue;
		// no, so pull the next chunk
		int32_t got = read ( m_rdfStream, m_rdfBuffer, m_rdfBufferSize );
		if ( got <= 0 || got > m_rdfBufferSize )
			return NULL;
		// restart at the front of the refilled buffer
		m_rdfPtr = m_rdfBuffer;
		m_rdfEnd = &m_rdfBuffer[got];
	}
	return m_rdfPtr;
}

// . scan forward until a '<' outside of double quotes is immediately
//   followed by "tagName"
// . on a match the pointer is left one byte past the matched name and 0 is
//   returned
// . returns -1 if the data runs out first
int32_t Categories::rdfParse ( char *tagName ) {
	// quote state carries over from one candidate tag to the next
	bool quoted = false;
	for ( ; ; ) {
		// find the next '<' that is not inside a quoted string
		while ( quoted || *m_rdfPtr != '<' ) {
			if ( *m_rdfPtr == '"' )
				quoted = !quoted;
			if ( ! incRdfPtr() )
				return -1;
		}
		// compare what follows against tagName, byte by byte
		int32_t pos = 0;
		do {
			if ( ! incRdfPtr() )
				return -1;
			if ( *m_rdfPtr != tagName[pos] )
				break;
			pos++;
		} while ( tagName[pos] );
		// stopped before the end of tagName? then try the next tag
		if ( tagName[pos] )
			continue;
		// matched, step past the last byte of the name
		if ( ! incRdfPtr() )
			return -1;
		return 0;
	}
}

// . advance to the next '<' outside of double quotes and copy the tag name
//   that follows (everything up to a space or '>') into m_tagRecfer
// . m_tagLen is set to the stored length; at most MAX_TAG_LEN bytes are
//   kept and the name is NUL terminated
// . NOTE(review): the terminator is written at index m_tagLen, so this
//   assumes m_tagRecfer holds at least MAX_TAG_LEN+1 bytes -- confirm in
//   the header
// . returns 0 on success, -1 if the data runs out first
int32_t Categories::rdfNextTag ( ) {
	bool quoted = false;
	// hunt for the opening <
	for ( ; ; ) {
		char c = *m_rdfPtr;
		if ( c == '<' && ! quoted )
			break;
		if ( c == '"' )
			quoted = !quoted;
		if ( ! incRdfPtr() )
			return -1;
	}
	// step over the <
	if ( ! incRdfPtr() )
		return -1;
	// collect the name; bytes beyond MAX_TAG_LEN are read but dropped
	m_tagLen = 0;
	for ( ; *m_rdfPtr != ' ' && *m_rdfPtr != '>' ; ) {
		if ( m_tagLen < MAX_TAG_LEN )
			m_tagRecfer[m_tagLen++] = *m_rdfPtr;
		if ( ! incRdfPtr() )
			return -1;
	}
	m_tagRecfer[m_tagLen] = '\0';
	return 0;
}

// . copy the next double-quoted string in the rdf data into "str"
// . at most "max" bytes are stored; the rest of the string is skipped
// . "str" is NOT NUL terminated
// . leaves the pointer just past the closing quote
// . returns the number of bytes stored, or -1 if the data runs out
int32_t Categories::fillNextString(char *str, int32_t max) {
	// run up to the opening quote
	for ( ; *m_rdfPtr != '"' ; )
		if ( ! incRdfPtr() )
			return -1;
	// and step over it
	if ( ! incRdfPtr() )
		return -1;
	// copy until the closing quote
	int32_t stored = 0;
	for ( ; *m_rdfPtr != '"' ; ) {
		if ( stored < max )
			str[stored++] = *m_rdfPtr;
		if ( ! incRdfPtr() )
			return -1;
	}
	// step past the closing quote
	if ( ! incRdfPtr() )
		return -1;
	return stored;
}

// . copy the text between the next '>' and the '<' that follows it into
//   "str"
// . at most "max" bytes are stored; the rest of the body is skipped
// . "str" is NOT NUL terminated
// . leaves the pointer on that '<'
// . returns the number of bytes stored, or -1 if the data runs out
int32_t Categories::fillNextTagBody(char *str, int32_t max) {
	// run up to the end of the current tag
	for ( ; *m_rdfPtr != '>' ; )
		if ( ! incRdfPtr() )
			return -1;
	// and step over the >
	if ( ! incRdfPtr() )
		return -1;
	// copy until the next tag opens
	int32_t stored = 0;
	for ( ; *m_rdfPtr != '<' ; ) {
		if ( stored < max )
			str[stored++] = *m_rdfPtr;
		if ( ! incRdfPtr() )
			return -1;
	}
	return stored;
}

// . clean up, in place, the part of "url" that follows the first "://"
// . runs of '/' are collapsed to a single '/', whitespace is removed and
//   one trailing '/' is dropped
// . "url" is not NUL terminated here; the new length is returned
// . if there is no "://" within the first urlLen bytes the url is left
//   alone and urlLen is returned
int32_t Categories::fixUrl ( char *url, int32_t urlLen ) {
	if ( ! url || urlLen <= 0 )
		return urlLen;
	// . get past the first ://
	// . all three bytes must lie inside the url, we used to peek up to
	//   two bytes beyond urlLen
	int32_t schemeEnd = 0;
	while ( schemeEnd + 2 < urlLen &&
		( url[schemeEnd]   != ':' ||
		  url[schemeEnd+1] != '/' ||
		  url[schemeEnd+2] != '/' ) )
		schemeEnd++;
	if ( schemeEnd + 2 >= urlLen )
		return urlLen;
	// . compact the rest in a single pass, "dst" trails "src"
	// . comparing against the last byte KEPT (url[dst-1]) means that
	//   "///" and slashes separated only by whitespace collapse fully,
	//   which the old shift-and-advance loop missed
	int32_t dst = schemeEnd + 3;
	for ( int32_t src = dst ; src < urlLen ; src++ ) {
		char c = url[src];
		// drop whitespace
		if ( is_wspace_a(c) )
			continue;
		// drop a slash that directly follows another slash
		if ( c == '/' && url[dst-1] == '/' )
			continue;
		url[dst++] = c;
	}
	// remove any trailing /
	if ( url[dst-1] == '/' )
		dst--;
	// return the new length
	return dst;
}

// . add the hash of every root url listed under "catid" to m_badTable
// . runs getTitleAndSummary() in its "just add to table" mode, so there is
//   no url to match and no output buffers are supplied
// . returns whatever getTitleAndSummary() returns
bool Categories::addUrlsToBadHashTable ( int32_t catid  ) {
	const bool justAddToTable = true;
	return getTitleAndSummary ( NULL , 0 ,       // urlOrig, urlOrigLen
				    catid ,
				    NULL , NULL , 0 ,  // title, len, max
				    NULL , NULL , 0 ,  // summ, len, max
				    NULL , NULL , 0 ,  // anchor, len, max
				    0 ,                // niceness
				    justAddToTable );
}

// . append an html list of the urls dmoz files under "catid" to "sb"
// . seeks to the topic's m_contentOffset in the rdf content file and prints
//   url, d:Title and d:Description of every ExternalPage tag until the
//   "catid" tag of the next topic (or the end of the data) is reached
// . the list is wrapped in <ul>...</ul>; nothing at all is printed when no
//   url is found
// . urls, titles and descriptions are copied from the file as they are
// . always returns false, also when urls were printed; this is kept as is
//   because callers may depend on it
bool Categories::printUrlsInTopic ( SafeBuf *sb, int32_t catid ) {
	int32_t catIndex;
	uint32_t fileOffset;
	uint32_t n;
	int32_t nread;
	char filename[512];
	char title[1024];
	char summ[5000];
	int32_t maxTitleLen = 1024;
	int32_t maxSummLen = 5000;
	int32_t titleLen;
	int32_t summLen;
	int32_t urlStrLen;
	char urlStr[MAX_URL_LEN];
	int32_t niceness = 0;
	bool printedStart = false;

	// nothing is open yet; this keeps the cleanup at "done" from closing
	// a descriptor left over from an earlier call
	m_rdfStream = -1;

	// lookup the index for this catid
	catIndex = getIndexFromId(catid);
	if (catIndex < 0)
		goto done;
	// get the file offset
	fileOffset = m_cats[catIndex].m_contentOffset;

	QUICKPOLL( niceness );

	// . open the file
	snprintf(filename, sizeof(filename), "%scatdb/%s",
		 g_hostdb.m_dir, RDFCONTENT_FILE);
	m_rdfStream = open(filename, O_RDONLY | O_NONBLOCK);
	if ( m_rdfStream < 0 ) {
		log("cat: Error Opening %s\n", filename);
		goto done;
	}
	// . seek to the offset
	n = lseek ( m_rdfStream, fileOffset, SEEK_SET );
	if ( n != fileOffset ) {
		log("cat: Error seeking to Content Offset %" INT32 "", fileOffset);
		goto done;
	}
	// . read in a chunk
	m_rdfBuffer     = m_rdfSmallBuffer;
	m_rdfBufferSize = RDFSMALLBUFFER_SIZE;
	// . the descriptor is non-blocking, so try again on EAGAIN
	// . the result must be signed for that test to work, it used to be
	//   stored in an unsigned which made the retry unreachable
	do {
		nread = read ( m_rdfStream, m_rdfBuffer, m_rdfBufferSize );
		QUICKPOLL(niceness);
	} while ( nread < 0 && errno == EAGAIN );

	if ( nread <= 0 || nread > m_rdfBufferSize ) {
		log("cat: Error Reading Content");
		goto done;
	}
	m_rdfPtr = m_rdfBuffer;
	m_rdfEnd = &m_rdfBuffer[nread];
	m_currOffset = fileOffset;
	// . parse to the correct url
	// parse the first topic and catid
	if (rdfNextTag() < 0)
		goto done;
	if (rdfNextTag() < 0)
		goto done;
	// parse until "ExternalPage"
nextTag:
	QUICKPOLL((niceness));
	if (rdfNextTag() < 0)
		goto done;
	// the catid tag of the next topic means we have seen them all
	if (m_tagLen == 5 &&
	    strncmp(m_tagRecfer, "catid", 5) == 0)
		goto done;
	if (m_tagLen != 12 ) goto nextTag;
	if ( strncmp(m_tagRecfer, "ExternalPage", 12) != 0) goto nextTag;

	//
	// got one
	//

	// the url is the next quoted string
	urlStrLen = fillNextString(urlStr, MAX_URL_LEN-1);
	if (urlStrLen < 0)
		goto done;

	// . parse out the title
	if (rdfParse("d:Title") < 0)
		goto done;
	titleLen = fillNextTagBody(title, maxTitleLen);
	// -1 means the data ran out, do not use it as a length
	if (titleLen < 0)
		goto done;

	QUICKPOLL(niceness);

	// . parse out the summary
	if (rdfParse("d:Description") < 0)
		goto done;
	summLen = fillNextTagBody(summ, maxSummLen);
	if (summLen < 0)
		goto done;

	if ( ! printedStart ) {
		printedStart = true;
		sb->safePrintf("<ul>");
	}

	// print it out
	sb->safePrintf("<li><a href=\"");
	sb->safeMemcpy ( urlStr , urlStrLen );
	sb->safePrintf("\">");
	sb->safeMemcpy ( title , titleLen );
	sb->safePrintf("</a><br>");
	sb->safeMemcpy( summ, summLen );
	sb->safePrintf("<br>");

	// DO NEXT tag
	goto nextTag;

done:
	// only close a list that was actually opened
	if ( printedStart )
		sb->safePrintf("</ul>");
	// only close a file that was actually opened
	if ( m_rdfStream >= 0 ) {
		close(m_rdfStream);
		m_rdfStream = -1;
	}
	return false;
}


// . get the dmoz title, summary and anchor of "urlOrig" as listed under
//   topic "catid"
// . the url is normalized and compared (case insensitively) against the
//   normalized url of every ExternalPage tag of the topic in the rdf
//   content file
// . title/titleLen, summ/summLen and anchor/anchorLen are optional outputs;
//   the text outputs are not NUL terminated
// . returns true if the url was found and its d:Title and d:Description
//   could be parsed
// . returns false otherwise, with the supplied length outputs set to 0
// . if "justAddToTable" is true nothing is matched: the hash of every ROOT
//   url in the topic is added to m_badTable and false is returned once the
//   topic has been scanned
bool Categories::getTitleAndSummary ( char  *urlOrig,
				      int32_t   urlOrigLen,
				      int32_t   catid,
				      char  *title,
				      int32_t  *titleLen,
				      int32_t   maxTitleLen,
				      char  *summ,
				      int32_t  *summLen,
				      int32_t   maxSummLen,
				      char  *anchor,
				      unsigned char *anchorLen,
				      int32_t   maxAnchorLen ,
				      int32_t   niceness ,
				      bool   justAddToTable ) {
	int32_t catIndex;
	uint32_t fileOffset;
	uint32_t n;
	int32_t nread;
	char filename[512];
	char url[MAX_URL_LEN];
	int32_t urlLen;
	char urlStr[MAX_URL_LEN];
	int32_t urlStrLen = 0;
	char decodedUrl[MAX_URL_LEN];
	char *urlAnchor = NULL;
	int32_t urlAnchorLen = 0;
	Url  normUrl;

	// nothing is open yet; this keeps the cleanup at errEnd from closing
	// a descriptor left over from an earlier call
	m_rdfStream = -1;

	// normalize the url we are looking for
	normUrl.set(urlOrig, urlOrigLen, true);
	g_catdb.normalizeUrl(&normUrl, &normUrl);
	gbmemcpy(url, normUrl.getUrl(), normUrl.getUrlLen());
	urlLen = normUrl.getUrlLen();
	// lookup the index for this catid
	catIndex = getIndexFromId(catid);
	if (catIndex < 0)
		goto errEnd;
	// get the file offset
	fileOffset = m_cats[catIndex].m_contentOffset;

	QUICKPOLL( niceness );

	// . open the file
	snprintf(filename, sizeof(filename), "%scatdb/%s",
		 g_hostdb.m_dir, RDFCONTENT_FILE);
	m_rdfStream = open(filename, O_RDONLY | O_NONBLOCK);
	if ( m_rdfStream < 0 ) {
		log("cat: Error Opening %s\n", filename);
		goto errEnd;
	}
	// . seek to the offset
	n = lseek ( m_rdfStream, fileOffset, SEEK_SET );
	if ( n != fileOffset ) {
		log("cat: Error seeking to Content Offset %" INT32 "", fileOffset);
		goto errEnd;
	}
	// . read in a chunk
	m_rdfBuffer     = m_rdfSmallBuffer;
	m_rdfBufferSize = RDFSMALLBUFFER_SIZE;
	// . the descriptor is non-blocking, so try again on EAGAIN
	// . the result must be signed for that test to work, it used to be
	//   stored in an unsigned which made the retry unreachable
	do {
		nread = read ( m_rdfStream, m_rdfBuffer, m_rdfBufferSize );
		QUICKPOLL(niceness);
	} while ( nread < 0 && errno == EAGAIN );

	if ( nread <= 0 || nread > m_rdfBufferSize ) {
		log("cat: Error Reading Content");
		goto errEnd;
	}
	m_rdfPtr = m_rdfBuffer;
	m_rdfEnd = &m_rdfBuffer[nread];
	m_currOffset = fileOffset;
	// . parse to the correct url
	// parse the first topic and catid
	if (rdfNextTag() < 0)
		goto errEnd;
	if (rdfNextTag() < 0)
		goto errEnd;
	// parse until "ExternalPage" and correct url or "Topic"
nextTag:
	QUICKPOLL((niceness));
	if (rdfNextTag() < 0)
		goto errEnd;
	// check for catid of next topic to stop looking
	if (m_tagLen == 5 &&
	    strncmp(m_tagRecfer, "catid", 5) == 0)
		goto errEnd;
	if (m_tagLen == 12 &&
	    strncmp(m_tagRecfer, "ExternalPage", 12) == 0) {
		// the url is the next quoted string
		urlStrLen = fillNextString(urlStr, MAX_URL_LEN-1);
		if (urlStrLen < 0)
			goto errEnd;
		// html decode the url
		urlStrLen = htmlDecode(decodedUrl, urlStr, urlStrLen,false,
				       niceness);
		gbmemcpy(urlStr, decodedUrl, urlStrLen);
		// normalize it the same way the original url was
		normUrl.set(urlStr, urlStrLen, true);
		g_catdb.normalizeUrl(&normUrl, &normUrl);
		// if we just want the hashes of all the urls, add them
		if ( justAddToTable ) {
			// but skip if not a root url... because
			// LinkText::isBadCatUrl() only checks roots...
			if ( ! normUrl.isRoot() ) goto nextTag;
			uint32_t h = hash32 ( normUrl.getUrl() ,
						   normUrl.getUrlLen() );
			m_badTable.addKey ( h , 1 );
			goto nextTag;
		}
		// copy it back
		urlStrLen = normUrl.getUrlLen();
		gbmemcpy(urlStr, normUrl.getUrl(), urlStrLen);
		// remember the anchor, if any
		urlAnchor = NULL;
		urlAnchorLen = 0;
		if (normUrl.getAnchorLen() > 0) {
			urlAnchor = normUrl.getAnchor();
			urlAnchorLen = normUrl.getAnchorLen();
		}
		// check against the url
		if (urlStrLen == urlLen &&
		    strncasecmp(url, urlStr, urlLen) == 0)
			goto foundTag;
	}
	// miss, goto next tag
	goto nextTag;
foundTag:
	// . parse out the title
	if (rdfParse("d:Title") < 0)
		goto errEnd;
	if (title && titleLen) {
		*titleLen = fillNextTagBody(title, maxTitleLen);
		// -1 means the data ran out, do not report it as a length
		if (*titleLen < 0)
			goto errEnd;
	}

	QUICKPOLL(niceness);

	// . parse out the summary
	if (rdfParse("d:Description") < 0)
		goto errEnd;
	if (summ && summLen) {
		*summLen = fillNextTagBody(summ, maxSummLen);
		if (*summLen < 0)
			goto errEnd;
	}
	// . fill the anchor
	if (anchor && anchorLen) {
		if (urlAnchor) {
			if (urlAnchorLen > maxAnchorLen)
				urlAnchorLen = maxAnchorLen;
			gbmemcpy(anchor, urlAnchor, urlAnchorLen);
			*anchorLen = urlAnchorLen;
		}
		else
			*anchorLen = 0;
	}
	// . close the file
	close(m_rdfStream);
	m_rdfStream = -1;
	return true;

errEnd:
	if (titleLen)
		*titleLen = 0;
	if (summLen)
		*summLen  = 0;
	if (anchor && anchorLen)
		*anchorLen = 0;
	// only close a file that was actually opened
	if ( m_rdfStream >= 0 ) {
		close(m_rdfStream);
		m_rdfStream = -1;
	}
	return false;
}

// . generate sub categories for a given catid
// . store list of SubCategories into "subCatBuf" return # stored
// . seeks to the category's m_structureOffset in the structure rdf file
//   and parses the tags that follow until the closing /Topic tag, until
//   the tag/string readers report an error or until MAX_SUB_CATS records
//   have been stored
// . every altlang/related/symbolic*/narrow*/letterbar tag appends one
//   SubCategory record (prefix, '\0', name, '\0') to "subCatBuf"
// . returns 0 on error: unknown catid, failure to open/seek/read the
//   file or failure to reserve space in "subCatBuf". records appended
//   before a reserve failure are left in "subCatBuf"
int32_t Categories::generateSubCats ( int32_t catid,
				   SafeBuf *subCatBuf ) {

	int32_t numSubCats = 0;
	char catStr[MAX_CATNAME_LEN];
	char htmlDecoded[MAX_HTTP_FILENAME_LEN*2];
	char filename[512];

	// . lookup the index for this catid
	int32_t catIndex = getIndexFromId(catid);
	// . this call has not opened anything yet, so just return. do NOT
	//   close m_rdfStream here, it may still hold an old descriptor
	//   number that has since been handed out to somebody else
	if (catIndex < 0)
		return 0;
	// get the file offset
	uint32_t fileOffset = m_cats[catIndex].m_structureOffset;
	// open the structure file, bounded so a long m_dir can't overflow
	snprintf(filename, sizeof(filename), "%scatdb/%s",
		 g_hostdb.m_dir, RDFSTRUCTURE_FILE);
	m_rdfStream = open(filename, O_RDONLY);
	if ( m_rdfStream < 0 ) {
		log("cat: Error Opening %s\n", filename);
		// nothing to close
		return 0;
	}
	// seek to the offset
	uint32_t n = lseek ( m_rdfStream, fileOffset, SEEK_SET );
	if ( n != fileOffset ) {
		log("cat: Error seeking to Structure Offset %" INT32 "", fileOffset);
		close(m_rdfStream);
		return 0;
	}
	// . read in a chunk
	m_rdfBuffer     = m_rdfSmallBuffer;
	m_rdfBufferSize = RDFSMALLBUFFER_SIZE;
	n = read ( m_rdfStream, m_rdfBuffer, m_rdfBufferSize );
	// . "n" is unsigned, so a -1 from read() shows up as a huge value
	//   and is rejected by the second test
	if ( n == 0 || n > (uint32_t)m_rdfBufferSize ) {
		log("cat: Error Reading Structure Offset");
		close(m_rdfStream);
		return 0;
	}
	// point to the buffer we just read with m_rdfPtr
	m_rdfPtr = m_rdfBuffer;
	m_rdfEnd = &m_rdfBuffer[n];
	m_currOffset = fileOffset;

	// parse tags for the sub categories or until we hit /Topic
	for ( ; ; ) {
		// . advance to the next tag, its name ends up in
		//   m_tagRecfer/m_tagLen
		// . a negative return ends the scan
		if (rdfNextTag() < 0)
			break;
		// check for /Topic
		if (m_tagLen == 6 &&
		    strncmp(m_tagRecfer, "/Topic", 6) == 0)
			break;
		// map the tag name to a sub category type
		int32_t currType;
		if (m_tagLen == 7 &&
		    strncmp(m_tagRecfer, "altlang", 7) == 0)
			currType = SUBCAT_ALTLANG;
		else if (m_tagLen == 7 &&
			 strncmp(m_tagRecfer, "related", 7) == 0)
			currType = SUBCAT_RELATED;
		else if (m_tagLen == 8 &&
			 strncmp(m_tagRecfer, "symbolic", 8) == 0)
			currType = SUBCAT_SYMBOLIC;
		else if (m_tagLen == 6 &&
			 strncmp(m_tagRecfer, "narrow", 6) == 0)
			currType = SUBCAT_NARROW;
		else if (m_tagLen == 9 &&
			 strncmp(m_tagRecfer, "symbolic1", 9) == 0)
			currType = SUBCAT_SYMBOLIC1;
		else if (m_tagLen == 7 &&
			 strncmp(m_tagRecfer, "narrow1", 7) == 0)
			currType = SUBCAT_NARROW1;
		else if (m_tagLen == 9 &&
			 strncmp(m_tagRecfer, "symbolic2", 9) == 0)
			currType = SUBCAT_SYMBOLIC2;
		else if (m_tagLen == 7 &&
			 strncmp(m_tagRecfer, "narrow2", 7) == 0)
			currType = SUBCAT_NARROW2;
		else if (m_tagLen == 9 &&
			 strncmp(m_tagRecfer, "letterbar", 9) == 0)
			currType = SUBCAT_LETTERBAR;
		else
			// not a tag we care about
			continue;
		// read the name for this category
		int32_t catStrLen = fillNextString(catStr, MAX_CATNAME_LEN-1);
		if (catStrLen < 0)
			break;
		// html decode it first, never feeding more than the
		// decode buffer can hold
		if (catStrLen > MAX_HTTP_FILENAME_LEN*2)
			catStrLen = MAX_HTTP_FILENAME_LEN*2;
		catStrLen = htmlDecode ( htmlDecoded,
					 catStr,
					 catStrLen ,
					 false,
					 0);
		gbmemcpy(catStr, htmlDecoded, catStrLen);
		// by default there is no prefix and the name is everything
		int32_t prefixStart = 0;
		int32_t prefixLen   = 0;
		int32_t nameStart   = 0;
		int32_t nameLen     = catStrLen;
		// get the prefix and name position/length
		switch (currType) {
		case SUBCAT_ALTLANG:
		case SUBCAT_SYMBOLIC:
		case SUBCAT_SYMBOLIC1:
		case SUBCAT_SYMBOLIC2: {
			// . prefix is everything before the first ':'
			// . stay inside the string, catStr is not
			//   '\0' terminated at this point
			int32_t colon = 0;
			while (colon < catStrLen && catStr[colon] != ':')
				colon++;
			// no ':' at all? then keep the defaults, i.e. an
			// empty prefix and the whole string as the name
			if (colon >= catStrLen)
				break;
			prefixLen = colon;
			// skip the : in :Top/
			nameStart = colon + 1;
			nameLen   = catStrLen - nameStart;
			break;
		}
		case SUBCAT_LETTERBAR:
			// prefix is the very last letter, if there is one
			if (catStrLen > 0) {
				prefixStart = catStrLen - 1;
				prefixLen   = 1;
			}
			break;
		default:
			// . prefix the last folder, i.e. what follows the
			//   last '/'
			// . test the index BEFORE using it so we never
			//   look at catStr[-1]
			prefixStart = catStrLen;
			while (prefixStart > 0 &&
			       catStr[prefixStart-1] != '/') {
				prefixStart--;
				prefixLen++;
			}
			// name includes "Top/" now because
			// PageResults.cpp calls
			// g_categories->getIndexFromPath(catName,catNameLen)
			// on this name and the category is looked up by
			// the hash of its full name
			break;
		}
		// . fill the next sub category
		// . fill the prefix and name in the buffer and subcat
		int32_t need = sizeof(SubCategory) +
			       prefixLen + 1 + nameLen + 1;

		// reserve space in safebuf for it
		if ( ! subCatBuf->reserve(need) ) {
			close(m_rdfStream);
			return 0;
		}

		// point to it in safebuf
		SubCategory *cat = (SubCategory *)(subCatBuf->getBuf());

		cat->m_prefixLen = prefixLen;
		cat->m_nameLen = nameLen;
		cat->m_type = currType;
		char *p = cat->m_buf;
		gbmemcpy ( p , catStr + prefixStart , prefixLen );
		p += prefixLen;
		*p++ = '\0';
		gbmemcpy ( p , catStr + nameStart , nameLen );
		p += nameLen;
		*p++ = '\0';

		// update safebuf length
		subCatBuf->incrementLength ( cat->getRecSize() );

		// next sub cat
		numSubCats++;
		if (numSubCats >= MAX_SUB_CATS) {
			log ( LOG_WARN, "categories: Attempted to load too many"
					" sub-categories, truncating." );
			break;
		}
	}

	// got the sub cats
	close(m_rdfStream);
	return numSubCats;
}

// creates a directory search request url
// . writes a '\0' terminated http request into "requestBuf":
//   "<GET|ZET> /search?q=gbcatid%3A<catid>&dir=<catid>&dr=0&sc=0&c=<coll>"
//   followed by the optional cgi, " HTTP/1.0", a Host: line built from
//   "hostname" and the Accept-Language/Accept lines
// . returns the length of the request, not counting the '\0', or 0 if
//   "requestBuf" is too small to hold it
// . the command is "ZET" if the buffer of the original request "r"
//   starts with 'Z', otherwise "GET"
// . "cgi" is only appended if it fits along with everything that must
//   follow it, otherwise it is silently left out
// . if cgiFromRequest is true a '&' is written first and every '\0' in
//   "cgi" is replaced, alternating between '=' and '&', '=' first
int32_t Categories::createDirSearchRequest ( char *requestBuf,
					  int32_t  requestBufSize,
					  int32_t  catid,
					  char *hostname,
					  int32_t  hostnameLen,
					  char *coll,
					  int32_t  collLen,
					  char *cgi,
					  int32_t  cgiLen,
					  bool  cgiFromRequest ,
					  HttpRequest *r ) {
	// the fixed text that follows the url
	static const char s_hostHdr[] = " HTTP/1.0\r\nHost: http://";
	static const char s_trailer[] = "\r\n"
					"Accept-Language: en\r\n"
					"Accept: text/html\r\n\r\n";
	// negative lengths would wreck the size checks and copies below
	if ( hostnameLen < 0 || collLen < 0 )
		return 0;
	char *p    = requestBuf;
	char *pend = requestBuf + requestBufSize;
	// coarse check, also guarantees room for the sprintf below
	if ( p + (hostnameLen + collLen + 128 ) >= pend )
		return 0;
	// damnit, keep the ZET if that's what we had, that's how we know
	// if the sender requires a compressed reply (qcproxy = query
	// compression proxy)
	const char *cmd = "GET";
	char *rrr = NULL;
	if ( r ) rrr = r->m_reqBuf.getBufStart();
	if ( rrr && rrr[0] == 'Z' ) cmd = "ZET";
	// request
	p += sprintf(p,
		     "%s /search?q=gbcatid%%3A%" INT32 "&dir=%" INT32 "&dr=0&sc=0&c="
		     , cmd
		     , catid
		     , catid);
	// . bytes that MUST still fit after coll and the optional cgi:
	//   host header, hostname, trailer and the terminating '\0'
	// . the coarse check above is a few bytes short of this when
	//   catid has the maximum number of digits, so check exactly
	int32_t tailNeed = (int32_t)(sizeof(s_hostHdr) - 1) +
			   hostnameLen +
			   (int32_t)(sizeof(s_trailer) - 1) + 1;
	if ( collLen + tailNeed > pend - p )
		return 0;
	// coll
	gbmemcpy(p, coll, collLen);
	p += collLen;
	// . add extra cgi if we have it and have room
	// . the room check has to include the hostname that gets
	//   appended afterwards or a long hostname overflows the buffer
	int32_t cgiNeed = cgiLen;
	// the leading '&'
	if ( cgiFromRequest ) cgiNeed++;
	if ( cgi && cgiLen > 0 && cgiNeed + tailNeed <= pend - p ) {
		// if it's from the request, need to add &'s and ='s
		if ( cgiFromRequest ) {
			*p = '&'; p++;
			bool ampToggle = false;
			for (int32_t i = 0; i < cgiLen; i++) {
				*p = cgi[i];
				// '\0's separate the field names from
				// the values and the pairs from each other
				if (*p == '\0') {
					if (ampToggle) *p = '&';
					else           *p = '=';
					ampToggle = !ampToggle;
				}
				p++;
			}
		}
		else {
			gbmemcpy(p, cgi, cgiLen);
			p += cgiLen;
		}
	}
	// hostname
	gbmemcpy(p, s_hostHdr, sizeof(s_hostHdr) - 1);
	p += sizeof(s_hostHdr) - 1;
	gbmemcpy(p, hostname, hostnameLen);
	p += hostnameLen;
	// rest of the request, the copy includes the terminating '\0'
	gbmemcpy(p, s_trailer, sizeof(s_trailer));
	p += sizeof(s_trailer) - 1;
	return p - requestBuf;
}

// . one hash table per language id, indexed by the language id
// . loadLangTables() adds the 32-bit hash of a url as the key with a
//   value of 1, findLanguage() looks the hash of a url up in them
static HashTable langTables[MAX_LANGUAGES+1];

// Horrible hack, must fix later
// . scans catdb/content.rdf.u8 line by line and, for every
//   <ExternalPage> that has a <topic> starting with
//   Top/World/<langToTopic[i]>, adds the hash of the page's url to
//   langTables[i]. the first 7 chars of the url, the "http://", are
//   not part of the hash
// . then saves every table as catlang%03d.dat in g_hostdb.m_dir
// . returns false if the content file could not be opened
// . NOTE(review): the content file is opened relative to the current
//   working directory while the tables are saved in g_hostdb.m_dir,
//   confirm that is intended
bool Categories::loadLangTables(void) {
	char line[10240];
	FILE *content;
	// . hash of the url of the <ExternalPage> we are in, 0 if none
	// . has to start out as 0 because it is tested on <topic> lines
	//   that may come before the first <ExternalPage> line
	uint32_t h = 0;
	uint32_t lineno = 0L;
	uint32_t entries = 0L;
	char *cp;
	char *cpEnd = line + 10239;
	if(!(content = fopen("catdb/content.rdf.u8", "r"))) {
		log(LOG_INFO, "cat: could not open content file.\n");
		return(false);
	}

	// fgets() returns NULL at end of file and on error
	while(fgets(line, 10239, content)) {
		lineno++;

		if(lineno % 1000000 == 0)
			log(LOG_INFO, "cat: Parsing line %" INT32 "\n", lineno);

		if(!strncmp(line, "</ExternalPage>", 15)) {
			h = 0L; // end tag, clear hash
			continue;
		}

		if(!strncmp(line, "<ExternalPage about=\"", 21)) {
			// forget the previous page
			h = 0;
			// . the url is hashed from line + 28 on, that
			//   skips the http:// too
			// . make sure the line really is that long or we
			//   end up hashing what an earlier, longer line
			//   left behind in the buffer
			cp = line + 21;
			while(cp < line + 28 && *cp)
				cp++;
			if(cp < line + 28)
				continue;
			// find the closing quote, but stay in this line
			while(cp < cpEnd && *cp && *cp != '"')
				cp++;
			*cp = 0;
			h = hash32n(line + 28);
			continue;
		}

		if(h && !strncmp(line, "  <topic>Top/World/", 19)) {
			for(int i = 2; i <= langTagalog; i++) {
				const char *topic = (const char *)langToTopic[i];
				// an empty topic would match every line
				if(!topic || !*topic)
					continue;
				// strncmp() stops at the end of the line
				if(strncmp(line + 19, topic,
					   gbstrlen((char *)langToTopic[i])))
					continue;
				langTables[i].addKey(h, 1);
				entries++;
				h = 0; // paranoia, clear hash
				// . one language per page
				// . without this the cleared hash, 0, could
				//   be added to the table of another
				//   language whose topic matches as well
				break;
			}
		}
	}

	log(LOG_INFO, "cat: Added %" INT32 " total entries.\n", entries);

	fclose(content);

	// Save all the tables for later
	for(int i = 2; i <= langTagalog; i++) {
		sprintf(line, "catlang%03d.dat", i);
		langTables[i].save(g_hostdb.m_dir, line);
		if(langTables[i].getNumSlotsUsed() <= 0 ) {
			log(LOG_INFO, "cat: Don't seem to have any data in table %d\n", i);
		}
	}

	return(true);
}

// . loads the per-language url hash tables in langTables[] from
//   g_hostdb.m_dir, language 5 is always skipped
// . each table is loaded from lang%03d.dat first. if it is still empty
//   and the language is one loadLangTables() handles, catlang%03d.dat
//   is tried as well since that is the name loadLangTables() saves
//   the tables under
// . if any table from 2 to langTagalog is empty after that, all tables
//   are rebuilt from the content file with loadLangTables() and the
//   time that took is logged
// . always returns true
bool Categories::initLangTables(void) {
	char name[512];
	int i;
	uint64_t start;
	uint64_t stop;
	for(i = 2; i <= MAX_LANGUAGES; i++) {

		// There is no language 5!
		if(i == 5) continue;

		langTables[i].set(10); // paranoia
		snprintf(name, 511, "lang%03d.dat", i);
		langTables[i].load(g_hostdb.m_dir, name);

		// got something, or loadLangTables() never saves this one
		if(langTables[i].getNumSlotsUsed() > 0) continue;
		if(i > langTagalog) continue;

		// . loadLangTables() saves under a different name
		// . without this what it saved was never read back and
		//   the content file got parsed all over again
		snprintf(name, 511, "catlang%03d.dat", i);
		langTables[i].load(g_hostdb.m_dir, name);
	}

	// check for any empty tables
	for(i = 2; i <= langTagalog; i++) {

		// There is no language 5!
		if(i == 5) continue;

		if(langTables[i].getNumSlotsUsed() > 0) continue;

		// one empty table is enough to rebuild all of them
		log(LOG_INFO, "cat: Starting language load.\n");
		start = gettimeofdayInMicroseconds();
		bool loaded = loadLangTables();
		stop = gettimeofdayInMicroseconds();
		if(!loaded)
			log(LOG_INFO,
			    "cat: Could not build the language tables.\n");
		log(LOG_INFO,
				"cat: Parsing content took %" INT64 " microseconds\n", stop - start);
		break;
	}
	return(true);
}

// . returns the id of the first language, from 2 to langTagalog but
//   never 5, whose table in langTables[] has a slot for the hash of
//   "addr"
// . returns 0 if no table has it or if "addr" is NULL
// . a leading "http://" is not part of the hash
uint8_t Categories::findLanguage(char *addr) {
	// nothing to look up
	if(!addr) return(0);
	char *cp = addr;
	if(!strncmp(cp, "http://", 7)) cp += 7;
	uint32_t h = hash32(cp, gbstrlen(cp));
	for(int i = 2; i <= langTagalog; i++) {
		if(i == 5) continue; // There is no language 5!
		// don't bother with tables that were never filled
		if(langTables[i].getNumSlotsUsed() <= 0) continue;
		if(langTables[i].getSlot(h) >= 0)
			return((uint8_t)i);
	}
	return(0);
}