Many DMOZ fixes, but there is still more we need to do.

Subcategories are not being printed right now.
2013-10-08 23:55:11 -07:00 · 2013-10-08 23:55:11 -07:00 · 7ba9994804
commit 7ba9994804
parent 63c7764cd1
8 changed files with 168 additions and 33 deletions
--- a/Categories.cpp
+++ b/Categories.cpp
@ -212,6 +212,15 @@ long Categories::loadCategories ( char *filename ) {
 	long long start = gettimeofdayInMilliseconds();
 	// sort the category hash by hash value
 	gbsort(m_catHash, m_numCats, sizeof(CategoryHash), sortCatHash);
+
+	// sanity check - no dups allowed
+	unsigned long last = 0xffffffff;
+	for ( long i = 0 ; i < m_numCats ; i++ ) {
+		if ( m_catHash[i].m_hash == last ) 
+			log("dmoz: hash collision on %lu",last);
+		last = m_catHash[i].m_hash;
+	}
+
 	// time it
 	long long took = gettimeofdayInMilliseconds();
 	if ( took - start > 100 ) log(LOG_INIT,"admin: Took %lli ms to "
@ -330,6 +339,8 @@ long Categories::getIndexFromPath ( char *str, long strLen ) {
 		return 0;
 	// get the hash
 	unsigned long hash = hash32Lower_a(str, strLen, 0);
+	// debug
+	log("dmoz: looking up hash %lu",hash);
 	// binary search
 	while (low <= high) {
 		// next check spot
@ -514,7 +525,10 @@ void Categories::printPathFromIndex ( SafeBuf *sb ,
 	// . the new dmoz data dumps signify a parentless topic by
 	//   havings its parentid equal its catid, so avoid infinite
 	//   loops by checking for that here now. mdw oct 2013.
-	if (parentId > 1 && parentId != catid ) {
+	// . in the new DMOZ, Top has catid 2 now, even though it is
+	//   mistakenly labelled as Top/World, which is really catid 3.
+	//   so make this parentId > 2...
+	if (parentId > 2 && parentId != catid ) {
 		bool isParentRTL = isIdRTLStart(parentId);
 		// print spacing here if RTL
 		//if (isRTL && !raw)
@ -574,8 +588,10 @@ void Categories::printPathCrumbFromIndex ( SafeBuf *sb,
 	// get the parent
 	parentId = m_cats[catIndex].m_parentid;
 	long catid = m_cats[catIndex].m_catid;
-	// print the parent(s) first
-	if (parentId > 1 && parentId != catid ) {
+	// . print the parent(s) first
+	// . in the new dmoz, Top has catid 2 now, and Top/World is
+	//   catid 3. so make this parentId > 2, not parentId > 1
+	if (parentId > 2 && parentId != catid ) {
 		bool isParentRTL = isIdRTLStart(parentId);
 		printPathCrumbFromId(sb, parentId, isRTL);
 		// print a spacing
@ -1195,8 +1211,13 @@ nextTag:
 	// . fill the next sub category
 	// . fill the prefix and name in the buffer and subcat
 	need = sizeof(SubCategory) + prefixLen + 1 + nameLen + 1;
+
+	// reserve space in safebuf for it
 	if ( ! subCatBuf->reserve(need) ) goto errEnd;
+
+	// point to it in safebuf
 	cat = (SubCategory *)(subCatBuf->getBuf());
+
 	cat->m_prefixLen = prefixLen;
 	cat->m_nameLen = nameLen;
 	cat->m_type = currType;
@ -1208,6 +1229,9 @@ nextTag:
 	p += nameLen;
 	*p++ = '\0';

+	// update safebuf length
+	subCatBuf->incrementLength ( cat->getRecSize() );
+
 	/*
 	subCats[numSubCats].m_prefixOffset = catp;
 	subCats[numSubCats].m_prefixLen    = prefixLen;
@ -1278,8 +1302,13 @@ long Categories::createDirSearchRequest ( char *requestBuf,
 	char *rrr = r->m_reqBuf.getBufStart();
 	if ( rrr && rrr[0] == 'Z' ) cmd = "ZET";
 	// request
-	p += sprintf(p, "%s /search?dir=%li&dr=0&sc=0&sdir=%li&sdirt=0&c=",
-			cmd, catid, catid);
+	//p += sprintf(p, "%s /search?dir=%li&dr=0&sc=0&sdir=%li&sdirt=0&c=",
+	//		cmd, catid, catid);
+	p += sprintf(p, 
+		     "%s /search?q=gbcatid%%3A%li&dir=%li&dr=0&sc=0&c="
+		     , cmd
+		     , catid
+		     , catid);
 	// coll
 	memcpy(p, coll, collLen);
 	p += collLen;
--- a/HttpRequest.cpp
+++ b/HttpRequest.cpp
@ -23,6 +23,7 @@ void HttpRequest::reset() {
 	m_userIP = 0;
 	m_isMSIE = false;
 	m_reqBufValid = false;
+	m_reqBuf.purge();

 	if (m_cgiBuf2) {
 		mfree(m_cgiBuf2, m_cgiBuf2Size, "extraParms");
--- a/PageDirectory.cpp
+++ b/PageDirectory.cpp
@ -4,6 +4,9 @@
 #include "Pages.h"
 #include "Categories.h"

+// function is in PageRoot.cpp:
+bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) ;
+
 // . returns false if blocked, true otherwise
 // . sets g_errno on error
 bool sendPageDirectory ( TcpSocket *s , HttpRequest *r ) {
@ -39,11 +42,34 @@ bool sendPageDirectory ( TcpSocket *s , HttpRequest *r ) {
 	// look it up
 	long catId = g_categories->getIdFromPath(decodedPath, decodedPathLen);

+	// if /Top print the directory homepage
+	if ( catId == 1 ) {
+		SafeBuf sb;
+		// this is in PageRoot.cpp
+		printDirHomePage(sb,r);
+		return g_httpServer.sendDynamicPage ( s,
+						      (char*) sb.getBufStart(),
+						      sb.length(),
+						      // 120 seconds cachetime
+						      // don't cache anymore 
+						      // since
+						      // we have the login bar
+						      // @ the top of the page
+						      0,//120, // cachetime
+						      false,// post?
+						      "text/html",
+						      200,
+						      NULL, // cookie
+						      "UTF-8",
+						      r);
+	}
+
 	// . make a new request for PageResults
 	//Url dirUrl;
 	char requestBuf[1024+MAX_COLL_LEN+128];
 	long requestBufSize = 1024+MAX_COLL_LEN+128;
 	//g_categories.createDirectorySearchUrl ( &dirUrl,
+	log("dmoz: creating search request");
 	long requestBufLen = g_categories->createDirSearchRequest(
 						 requestBuf,
 						 requestBufSize,
--- a/PageResults.cpp
+++ b/PageResults.cpp
@ -212,11 +212,15 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
 	long rawFormat = hr->getLong("xml", 0); // was "raw"
 	long xml = hr->getLong("xml",0);

+	// get the dmoz catid if given
+	long catid = hr->getLong("dir",-1);
+
 	//
 	// send back page frame with the ajax call to get the real
-	// search results
+	// search results. do not do this if a "&dir=" (dmoz category)
+	// is given
 	//
-	if ( hr->getLong("id",0) == 0 && ! xml ) {
+	if ( hr->getLong("id",0) == 0 && ! xml && catid == -1 ) {
 		SafeBuf sb;
 		sb.safePrintf(
 			      "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML "
@ -830,14 +834,16 @@ bool gotResults ( void *state ) {
 		if ( ! xml ) {
 			sb.safePrintf("\n<font size=4><b>");
 			if ( rtl ) sb.safePrintf("<span dir=ltr>");
-			sb.safePrintf("<a href=\"/\">Top</a>: ");
+			sb.safePrintf("<a href=\"/Top\">Top</a>: ");
 		}
 		// put crumbin xml?
 		if ( xml ) 
 			sb.safePrintf("<breacdcrumb><![CDATA[");
 		// display the breadcrumb in xml or html?
 		g_categories->printPathCrumbFromIndex(&sb,dirIndex,rtl);
-		sb.safePrintf("]]></breadcrumb>\n" );
+
+		if ( xml )
+			sb.safePrintf("]]></breadcrumb>\n" );

 		// print the num
 		if ( ! xml ) {
@ -4192,7 +4198,8 @@ bool printDMOZSubTopics ( SafeBuf&  sb, long catId, State0 *st, bool inXml ) {
 		prefixp = cat->getPrefix();//&catBuffer[subCats[i].m_prefixOffset];
 		prefixLen = cat->m_prefixLen;//subCats[i].m_prefixLen;
 		// skip bad categories
-		currIndex = g_categories->getIndexFromPath(catName, catNameLen);
+		//currIndex=g_categories->getIndexFromPath(catName,catNameLen);
+		currIndex=g_categories->getIndexFromPath(prefixp,prefixLen);
 		if (currIndex < 0)
 			continue;
 		// skip top adult category if we're supposed to
--- a/Pages.cpp
+++ b/Pages.cpp
@ -379,7 +379,10 @@ long Pages::getDynamicPageNumber ( HttpRequest *r ) {
 	}
 	// sanity
 	if ( ! g_categories ) log("process: no categories loaded");
-	// look it up for a category
+
+	//
+	// dmoz - look it up for a category
+	//
 	if ( g_categories &&
 	     g_categories->getIndexFromPath(decodedPath, decodedPathLen) >= 0)
 		return PAGE_DIRECTORY;
@ -482,7 +485,7 @@ bool Pages::sendDynamicReply ( TcpSocket *s , HttpRequest *r , long page ) {
 	//   often times my cookie says username=mwells but i am not logged
 	//   in and i don't want to type my password to see the root page,
 	//   or any other public page
-	if ( ! publicPage && ! g_users.hasPermission( r, page , s ) &&
+	if ( ! publicPage &&!isLocal&&//g_users.hasPermission( r, page , s ) &&
 	     ! isLoopback ) {
 		log("login: access denied 2 from ip=%s",iptoa(s->m_ip));
 		return sendPageLogin ( s , r, "Access Denied. No permission.");
@ -614,7 +617,7 @@ bool Pages::sendDynamicReply ( TcpSocket *s , HttpRequest *r , long page ) {
 	// . now, so it can be responsible for calling pg->m_function
 	//if ( userType > USER_PUBLIC ) {
 	// check if user has public page access 
-	if ( g_users.hasPermission( r, page , s ) ) {
+	if ( isLocal ) { // g_users.hasPermission( r, page , s ) ) {
 		// . this will set various parms
 		// . we know the request came from a host in the cluster
 		//   because "isHost" is true.
--- a/SearchInput.cpp
+++ b/SearchInput.cpp
@ -1224,8 +1224,8 @@ bool SearchInput::setQueryBuffers ( ) {
 	long dcatId  = -1;
 	// get the final query
 	char *q =m_sbuf1.getBufStart();
-	if ( q ) sscanf(q,"gbpdcat:%li",&pcatId);
-	if ( q ) sscanf(q,"gbcat:%li",&dcatId);
+	if ( q ) sscanf(q,"gbpdcatid:%li",&pcatId);
+	if ( q ) sscanf(q,"gbcatid:%li",&dcatId);
 	// pick the one that is valid
 	long catId = -1;
 	if ( pcatId >= 0 ) catId = pcatId;
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -21983,7 +21983,7 @@ bool XmlDoc::hashDMOZCategories ( HashTableX *tt ) {
 		// write the catid as a string
 		sprintf(buf, "%lu", catIds[i]);
 		// term prefix for hashing
-		hi.m_prefix = "gbdcat";
+		hi.m_prefix = "gbcatid";
 		// hash it
 		hashString ( buf , gbstrlen(buf) , &hi );
 		// we also want to hash the parents
@ -21994,7 +21994,7 @@ bool XmlDoc::hashDMOZCategories ( HashTableX *tt ) {
 		while ( currCatId > 1 ) {
 			// hash the parent
 			sprintf(buf, "%lu", currParentId);
-			hi.m_prefix = "gbpdcat";
+			hi.m_prefix = "gbpcatid";
 			hashString ( buf , gbstrlen(buf), &hi );
 			// next cat
 			currCatId = currParentId;
@ -22037,7 +22037,7 @@ bool XmlDoc::hashDMOZCategories ( HashTableX *tt ) {
 		// write the catid as a string
 		sprintf(buf, "%lu", indCatIds[i]);
 		// use prefix
-		hi.m_prefix = "gbicat";
+		hi.m_prefix = "gbicatid";
 		hi.m_hashGroup = HASHGROUP_INTAG;
 		// hash it
 		hashString ( buf , gbstrlen(buf), &hi );
@ -22051,7 +22051,7 @@ bool XmlDoc::hashDMOZCategories ( HashTableX *tt ) {
 			// hash the parent
 			sprintf(buf, "%lu", currParentId);
 			// new prefix
-			hi.m_prefix = "gbpicat";
+			hi.m_prefix = "gbipcatid";
 			// hash it
 			hashString ( buf , gbstrlen(buf), &hi );
 			// next cat
--- a/dmozparse.cpp
+++ b/dmozparse.cpp
@ -209,12 +209,15 @@ long rdfParse ( char *tagName ) {

 // move to the next tag in the file
 long rdfNextTag ( ) {
-	bool inQuote = false;
+	//bool inQuote = false;
 	// move to the next tag
-	while (*rdfPtr != '<' || inQuote ) {
+	while (*rdfPtr != '<' ) { // || inQuote ) {
 		// check for quotes
-		if (*rdfPtr == '"')
-			inQuote = !inQuote;
+		// NO! too many unbalanced quotes all over the place!
+		// and i think quotes in tags do not have < or > in them
+		// because they should be encoded as &gt; and &lt;
+		//if (*rdfPtr == '"')
+		//	inQuote = !inQuote;
 		// next char
 		if (!incRdfPtr())
 			return -1;
@ -560,8 +563,12 @@ long printCatPath ( char *str, long catid, bool raw ) {
 		return 0;
 	// get the parent
 	parentId = rdfCats[catIndex].m_parentid;
-	// print the parent(s) first
-	if (parentId > 1 && 
+	// . print the parent(s) first
+	// . in NEWER DMOZ dumps, "Top" is catid 2 and catid 1 is an
+	//   empty title. really catid 2 is Top/World but that is an
+	//   error that we correct below. (see "Top/World" below).
+	//   but do not include the "Top/" as part of the path name
+	if (parentId > 2 && 
 	    // the newer dmoz files have the catid == the parent id of
 	    // i guess top most categories, like "Top/Arts"... i would think
 	    // it should have a parentId of 1 like the old dmoz files,
@ -907,12 +914,29 @@ int main ( int argc, char *argv[] ) {
 				       nameLen ,
 				       false,
 				       0);
-		memcpy(&nameBuffer[nameOffset], htmlDecoded, nameLen);
-		nameBufferLen  += nameLen;
+
 		// parse the catid
 		long catid = parseNextCatid();
 		if (catid == -1)
 			goto fileEnd;
+
+		// in the new dmoz structure.rdf.u8, catid 1 has an
+		// empty name and catid 2 has Topic tag "Top/World" but
+		// Title tag "Top".
+		// its name should probably be "Top" and not "World": there is
+		// another entry in structure.rdf.u8, catid 3, that has
+		// <Topic r:id="Top/World"> and is the real Top/World,
+		// so catid 2 is just "Top". this looks like a bug in the dmoz
+		// output, so fix it here.
+		if ( catid == 2 ) {
+			nameLen = 3;
+			memcpy(&nameBuffer[nameOffset],"Top",nameLen); 
+			nameBufferLen += nameLen;
+		}
+		else {
+			memcpy(&nameBuffer[nameOffset], htmlDecoded, nameLen);
+			nameBufferLen  += nameLen;
+		}
 		// . fill the current cat
 		//   make sure there's room
 		if (numRdfCats >= rdfCatsSize) {
@ -1002,10 +1026,16 @@ fileEnd:
 	rdfEnd = &rdfBuffer[n];
 	currOffset = 0;

+	//
+	// set m_parentid using structure.rdf.u8
+	//
+
 	// read and parse the file again
 	printf("Building Hierarchy...\n");
 	while (true) {
-		// parse the next catid
+		// parse the next catid in the file, sequentially
+		//if ( currOffset == 545468935 )
+		//	printf("shit\n");
 		long catid = parseNextCatid();
 		if (catid == -1)
 			goto fileEnd1;
@ -1060,6 +1090,14 @@ nextChildTag:
 					    false,
 					    0);
 		memcpy(childName, htmlDecoded, childNameLen);
+
+		// debug log
+		//if ( currOffset >= 506362430 ) // 556362463
+		//	printf("off=%li\n",currOffset);
+		// debug point
+		//if ( currOffset == 545467573 )
+		//	printf("GOT DEBUG POINT before giant skip\n");
+
 		// cut off the leading label if symbolic
 //		if (parentType == 2) {
 //			while (*childName != ':') {
@ -1069,20 +1107,27 @@ nextChildTag:
 //			childName++;
 //			childNameLen--;
 //		}
+		// debug point
+		//if (strcmp(childName,"Top/World/Català/Arts") == 0 )
+		//	printf("hey\n");
 		// get the catid for the child
 		long childid = getCatHash(childName, childNameLen);
 		// get the cat for this id
 		long cat = getIndexFromId(childid);
 		// make sure we have a match
 		if (cat == -1) {
-			//printf("Warning: Child Topic Not Found: ");
-			//for (long i = 0; i < childNameLen; i++)
-			//	printf("%c", childName[i]);
-			//printf("\n");
+			// debug. why does Top/World/Catala/Arts
+			// not have a parent??
+			printf("Warning: Child Topic Not Found: ");
+			for (long i = 0; i < childNameLen; i++)
+				printf("%c", childName[i]);
+			printf("\n");
 			m++;
 			goto nextChildTag;
 		}
-		// assign the parent to the cat
+		// . assign the parent to the cat
+		// . this means we are in a "child" tag within the "catid"
+		// . catid 84192 
 		if (parentType == 1) {
 			if (rdfCats[cat].m_parentid != 0)
 				printf("Warning: Overwriting Parent Id!\n");
@ -1114,6 +1159,14 @@ fileEnd1:
 	printf("  Total Topics:                  %li\n", numRdfCats);
 	printf("  Topics with Parents:           %li\n", t);
 	printf("  Topics Linked but Nonexistent: %li\n", m);
+
+	if ( t != numRdfCats ) {
+		printf("\n"
+		       "  *Topics without parents is bad because they\n"
+		       "   can not have their entired rawPath printed out\n"
+		       "   in order to get their proper hash\n");
+	}
+
 	//printf("  Number of Symbolic Links:      %li\n", numSymParents);
 	printf("\n");

@ -1148,6 +1201,22 @@ fileEnd1:
 		// get the hash of the path
 		rawPathLen = printCatPath(rawPath, rdfCats[i].m_catid, true);
 		rdfCats[i].m_catHash = hash32Lower_a(rawPath, rawPathLen, 0);
+		// fix. so that xyz/Arts does not just hash "Arts"
+		// because it has no parent...
+		if ( rdfCats[i].m_parentid == 0 ) {
+			printf("Missing parent for catid %li. Will be "
+			       "excluded from DMOZ so we avoid hash "
+			       "collisions.\n",rdfCats[i].m_catid);
+		}
+		//
+		// DEBUG!
+		// print this shit out to find the collisions
+		//
+		//printf("hash32=%lu catid=%li parentid=%li path=%s\n",
+		//       rdfCats[i].m_catHash,
+		//       rdfCats[i].m_catid,
+		//       rdfCats[i].m_parentid,
+		//       rawPath);
 	}

 	// . now we want to serialize the needed data into