Many DMOZ fixes, but there is still more we need to do.

Subcategories are not being printed right now.
This commit is contained in:
mwells 2013-10-08 23:55:11 -07:00
parent 63c7764cd1
commit 7ba9994804
8 changed files with 168 additions and 33 deletions

@ -212,6 +212,15 @@ long Categories::loadCategories ( char *filename ) {
long long start = gettimeofdayInMilliseconds();
// sort the category hash by hash value
gbsort(m_catHash, m_numCats, sizeof(CategoryHash), sortCatHash);
// sanity check - no dups allowed
unsigned long last = 0xffffffff;
for ( long i = 0 ; i < m_numCats ; i++ ) {
if ( m_catHash[i].m_hash == last )
log("dmoz: hash collision on %lu",last);
last = m_catHash[i].m_hash;
}
// time it
long long took = gettimeofdayInMilliseconds();
if ( took - start > 100 ) log(LOG_INIT,"admin: Took %lli ms to "
@ -330,6 +339,8 @@ long Categories::getIndexFromPath ( char *str, long strLen ) {
return 0;
// get the hash
unsigned long hash = hash32Lower_a(str, strLen, 0);
// debug
log("dmoz: looking up hash %lu",hash);
// binary search
while (low <= high) {
// next check spot
@ -514,7 +525,10 @@ void Categories::printPathFromIndex ( SafeBuf *sb ,
// . the new dmoz data dumps signify a parentless topic by
// having its parentid equal to its catid, so avoid infinite
// loops by checking for that here now. mdw oct 2013.
if (parentId > 1 && parentId != catid ) {
// . in the new DMOZ, Top has catid 2 now, even though it is
// mistakenly labelled as Top/World, which is really catid 3.
// so make this parentId > 2...
if (parentId > 2 && parentId != catid ) {
bool isParentRTL = isIdRTLStart(parentId);
// print spacing here if RTL
//if (isRTL && !raw)
@ -574,8 +588,10 @@ void Categories::printPathCrumbFromIndex ( SafeBuf *sb,
// get the parent
parentId = m_cats[catIndex].m_parentid;
long catid = m_cats[catIndex].m_catid;
// print the parent(s) first
if (parentId > 1 && parentId != catid ) {
// . print the parent(s) first
// . in the new dmoz, Top is catid 2 now, and Top/World is
// catid 3. so make this parentId > 2 not parentId > 1
if (parentId > 2 && parentId != catid ) {
bool isParentRTL = isIdRTLStart(parentId);
printPathCrumbFromId(sb, parentId, isRTL);
// print a spacing
@ -1195,8 +1211,13 @@ nextTag:
// . fill the next sub category
// . fill the prefix and name in the buffer and subcat
need = sizeof(SubCategory) + prefixLen + 1 + nameLen + 1;
// reserve space in safebuf for it
if ( ! subCatBuf->reserve(need) ) goto errEnd;
// point to it in safebuf
cat = (SubCategory *)(subCatBuf->getBuf());
cat->m_prefixLen = prefixLen;
cat->m_nameLen = nameLen;
cat->m_type = currType;
@ -1208,6 +1229,9 @@ nextTag:
p += nameLen;
*p++ = '\0';
// update safebuf length
subCatBuf->incrementLength ( cat->getRecSize() );
/*
subCats[numSubCats].m_prefixOffset = catp;
subCats[numSubCats].m_prefixLen = prefixLen;
@ -1278,8 +1302,13 @@ long Categories::createDirSearchRequest ( char *requestBuf,
char *rrr = r->m_reqBuf.getBufStart();
if ( rrr && rrr[0] == 'Z' ) cmd = "ZET";
// request
p += sprintf(p, "%s /search?dir=%li&dr=0&sc=0&sdir=%li&sdirt=0&c=",
cmd, catid, catid);
//p += sprintf(p, "%s /search?dir=%li&dr=0&sc=0&sdir=%li&sdirt=0&c=",
// cmd, catid, catid);
p += sprintf(p,
"%s /search?q=gbcatid%%3A%li&dir=%li&dr=0&sc=0&c="
, cmd
, catid
, catid);
// coll
memcpy(p, coll, collLen);
p += collLen;

@ -23,6 +23,7 @@ void HttpRequest::reset() {
m_userIP = 0;
m_isMSIE = false;
m_reqBufValid = false;
m_reqBuf.purge();
if (m_cgiBuf2) {
mfree(m_cgiBuf2, m_cgiBuf2Size, "extraParms");

@ -4,6 +4,9 @@
#include "Pages.h"
#include "Categories.h"
// function is in PageRoot.cpp:
bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) ;
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageDirectory ( TcpSocket *s , HttpRequest *r ) {
@ -39,11 +42,34 @@ bool sendPageDirectory ( TcpSocket *s , HttpRequest *r ) {
// look it up
long catId = g_categories->getIdFromPath(decodedPath, decodedPathLen);
// if /Top print the directory homepage
if ( catId == 1 ) {
SafeBuf sb;
// this is in PageRoot.cpp
printDirHomePage(sb,r);
return g_httpServer.sendDynamicPage ( s,
(char*) sb.getBufStart(),
sb.length(),
// 120 seconds cachetime
// don't cache anymore
// since
// we have the login bar
// @ the top of the page
0,//120, // cachetime
false,// post?
"text/html",
200,
NULL, // cookie
"UTF-8",
r);
}
// . make a new request for PageResults
//Url dirUrl;
char requestBuf[1024+MAX_COLL_LEN+128];
long requestBufSize = 1024+MAX_COLL_LEN+128;
//g_categories.createDirectorySearchUrl ( &dirUrl,
log("dmoz: creating search request");
long requestBufLen = g_categories->createDirSearchRequest(
requestBuf,
requestBufSize,

@ -212,11 +212,15 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
long rawFormat = hr->getLong("xml", 0); // was "raw"
long xml = hr->getLong("xml",0);
// get the dmoz catid if given
long catid = hr->getLong("dir",-1);
//
// send back page frame with the ajax call to get the real
// search results
// search results. do not do this if a "&dir=" (dmoz category)
// is given
//
if ( hr->getLong("id",0) == 0 && ! xml ) {
if ( hr->getLong("id",0) == 0 && ! xml && catid == -1 ) {
SafeBuf sb;
sb.safePrintf(
"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML "
@ -830,14 +834,16 @@ bool gotResults ( void *state ) {
if ( ! xml ) {
sb.safePrintf("\n<font size=4><b>");
if ( rtl ) sb.safePrintf("<span dir=ltr>");
sb.safePrintf("<a href=\"/\">Top</a>: ");
sb.safePrintf("<a href=\"/Top\">Top</a>: ");
}
// put crumb in xml?
if ( xml )
sb.safePrintf("<breacdcrumb><![CDATA[");
// display the breadcrumb in xml or html?
g_categories->printPathCrumbFromIndex(&sb,dirIndex,rtl);
sb.safePrintf("]]></breadcrumb>\n" );
if ( xml )
sb.safePrintf("]]></breadcrumb>\n" );
// print the num
if ( ! xml ) {
@ -4192,7 +4198,8 @@ bool printDMOZSubTopics ( SafeBuf& sb, long catId, State0 *st, bool inXml ) {
prefixp = cat->getPrefix();//&catBuffer[subCats[i].m_prefixOffset];
prefixLen = cat->m_prefixLen;//subCats[i].m_prefixLen;
// skip bad categories
currIndex = g_categories->getIndexFromPath(catName, catNameLen);
//currIndex=g_categories->getIndexFromPath(catName,catNameLen);
currIndex=g_categories->getIndexFromPath(prefixp,prefixLen);
if (currIndex < 0)
continue;
// skip top adult category if we're supposed to

@ -379,7 +379,10 @@ long Pages::getDynamicPageNumber ( HttpRequest *r ) {
}
// sanity
if ( ! g_categories ) log("process: no categories loaded");
// look it up for a category
//
// dmoz - look it up for a category
//
if ( g_categories &&
g_categories->getIndexFromPath(decodedPath, decodedPathLen) >= 0)
return PAGE_DIRECTORY;
@ -482,7 +485,7 @@ bool Pages::sendDynamicReply ( TcpSocket *s , HttpRequest *r , long page ) {
// often times my cookie says username=mwells but i am not logged
// in and i don't want to type my password to see the root page,
// or any other public page
if ( ! publicPage && ! g_users.hasPermission( r, page , s ) &&
if ( ! publicPage &&!isLocal&&//g_users.hasPermission( r, page , s ) &&
! isLoopback ) {
log("login: access denied 2 from ip=%s",iptoa(s->m_ip));
return sendPageLogin ( s , r, "Access Denied. No permission.");
@ -614,7 +617,7 @@ bool Pages::sendDynamicReply ( TcpSocket *s , HttpRequest *r , long page ) {
// . now, so it can be responsible for calling pg->m_function
//if ( userType > USER_PUBLIC ) {
// check if user has public page access
if ( g_users.hasPermission( r, page , s ) ) {
if ( isLocal ) { // g_users.hasPermission( r, page , s ) ) {
// . this will set various parms
// . we know the request came from a host in the cluster
// because "isHost" is true.

@ -1224,8 +1224,8 @@ bool SearchInput::setQueryBuffers ( ) {
long dcatId = -1;
// get the final query
char *q =m_sbuf1.getBufStart();
if ( q ) sscanf(q,"gbpdcat:%li",&pcatId);
if ( q ) sscanf(q,"gbcat:%li",&dcatId);
if ( q ) sscanf(q,"gbpdcatid:%li",&pcatId);
if ( q ) sscanf(q,"gbcatid:%li",&dcatId);
// pick the one that is valid
long catId = -1;
if ( pcatId >= 0 ) catId = pcatId;

@ -21983,7 +21983,7 @@ bool XmlDoc::hashDMOZCategories ( HashTableX *tt ) {
// write the catid as a string
sprintf(buf, "%lu", catIds[i]);
// term prefix for hashing
hi.m_prefix = "gbdcat";
hi.m_prefix = "gbcatid";
// hash it
hashString ( buf , gbstrlen(buf) , &hi );
// we also want to hash the parents
@ -21994,7 +21994,7 @@ bool XmlDoc::hashDMOZCategories ( HashTableX *tt ) {
while ( currCatId > 1 ) {
// hash the parent
sprintf(buf, "%lu", currParentId);
hi.m_prefix = "gbpdcat";
hi.m_prefix = "gbpcatid";
hashString ( buf , gbstrlen(buf), &hi );
// next cat
currCatId = currParentId;
@ -22037,7 +22037,7 @@ bool XmlDoc::hashDMOZCategories ( HashTableX *tt ) {
// write the catid as a string
sprintf(buf, "%lu", indCatIds[i]);
// use prefix
hi.m_prefix = "gbicat";
hi.m_prefix = "gbicatid";
hi.m_hashGroup = HASHGROUP_INTAG;
// hash it
hashString ( buf , gbstrlen(buf), &hi );
@ -22051,7 +22051,7 @@ bool XmlDoc::hashDMOZCategories ( HashTableX *tt ) {
// hash the parent
sprintf(buf, "%lu", currParentId);
// new prefix
hi.m_prefix = "gbpicat";
hi.m_prefix = "gbipcatid";
// hash it
hashString ( buf , gbstrlen(buf), &hi );
// next cat

@ -209,12 +209,15 @@ long rdfParse ( char *tagName ) {
// move to the next tag in the file
long rdfNextTag ( ) {
bool inQuote = false;
//bool inQuote = false;
// move to the next tag
while (*rdfPtr != '<' || inQuote ) {
while (*rdfPtr != '<' ) { // || inQuote ) {
// check for quotes
if (*rdfPtr == '"')
inQuote = !inQuote;
// NO! too many unbalanced quotes all over the place!
// and i think quotes in tags do not have < or > in them
// because they should be encoded as &gt; and &lt;
//if (*rdfPtr == '"')
// inQuote = !inQuote;
// next char
if (!incRdfPtr())
return -1;
@ -560,8 +563,12 @@ long printCatPath ( char *str, long catid, bool raw ) {
return 0;
// get the parent
parentId = rdfCats[catIndex].m_parentid;
// print the parent(s) first
if (parentId > 1 &&
// . print the parent(s) first
// . in NEWER DMOZ dumps, "Top" is catid 2 and catid 1 is an
// empty title. really catid 2 is Top/World but that is an
// error that we correct below. (see "Top/World" below).
// but do not include the "Top/" as part of the path name
if (parentId > 2 &&
// the newer dmoz files have the catid == the parent id of
// i guess top most categories, like "Top/Arts"... i would think
// it should have a parentId of 1 like the old dmoz files,
@ -907,12 +914,29 @@ int main ( int argc, char *argv[] ) {
nameLen ,
false,
0);
memcpy(&nameBuffer[nameOffset], htmlDecoded, nameLen);
nameBufferLen += nameLen;
// parse the catid
long catid = parseNextCatid();
if (catid == -1)
goto fileEnd;
// crap, in the new dmoz structure.rdf.u8 catid 1 is
// empty name and catid 2 has Topic tag "Top/World" but
// Title tag "Top".
// but it should probably be "Top" and not "World". There is
// another catid 3 in structure.rdf.u8 that has
// <Topic r:id="Top/World"> and catid 3 which is the real one,
// so catid 2 is just "Top". this is a bug in the dmoz output
// i think, so fix it here.
if ( catid == 2 ) {
nameLen = 3;
memcpy(&nameBuffer[nameOffset],"Top",nameLen);
nameBufferLen += nameLen;
}
else {
memcpy(&nameBuffer[nameOffset], htmlDecoded, nameLen);
nameBufferLen += nameLen;
}
// . fill the current cat
// make sure there's room
if (numRdfCats >= rdfCatsSize) {
@ -1002,10 +1026,16 @@ fileEnd:
rdfEnd = &rdfBuffer[n];
currOffset = 0;
//
// set m_parentid using structure.rdf.u8
//
// read and parse the file again
printf("Building Hierarchy...\n");
while (true) {
// parse the next catid
// parse the next catid in the file, sequentially
//if ( currOffset == 545468935 )
// printf("shit\n");
long catid = parseNextCatid();
if (catid == -1)
goto fileEnd1;
@ -1060,6 +1090,14 @@ nextChildTag:
false,
0);
memcpy(childName, htmlDecoded, childNameLen);
// debug log
//if ( currOffset >= 506362430 ) // 556362463
// printf("off=%li\n",currOffset);
// debug point
//if ( currOffset == 545467573 )
// printf("GOT DEBUG POINT before giant skip\n");
// cut off the leading label if symbolic
// if (parentType == 2) {
// while (*childName != ':') {
@ -1069,20 +1107,27 @@ nextChildTag:
// childName++;
// childNameLen--;
// }
// debug point
//if (strcmp(childName,"Top/World/Català/Arts") == 0 )
// printf("hey\n");
// get the catid for the child
long childid = getCatHash(childName, childNameLen);
// get the cat for this id
long cat = getIndexFromId(childid);
// make sure we have a match
if (cat == -1) {
//printf("Warning: Child Topic Not Found: ");
//for (long i = 0; i < childNameLen; i++)
// printf("%c", childName[i]);
//printf("\n");
// debug. why does Top/World/Catala/Arts
// not have a parent??
printf("Warning: Child Topic Not Found: ");
for (long i = 0; i < childNameLen; i++)
printf("%c", childName[i]);
printf("\n");
m++;
goto nextChildTag;
}
// assign the parent to the cat
// . assign the parent to the cat
// . this means we are in a "child" tag within the "catid"
// . catid 84192
if (parentType == 1) {
if (rdfCats[cat].m_parentid != 0)
printf("Warning: Overwriting Parent Id!\n");
@ -1114,6 +1159,14 @@ fileEnd1:
printf(" Total Topics: %li\n", numRdfCats);
printf(" Topics with Parents: %li\n", t);
printf(" Topics Linked but Nonexistent: %li\n", m);
if ( t != numRdfCats ) {
printf("\n"
" *Topics without parents is bad because they\n"
" can not have their entired rawPath printed out\n"
" in order to get their proper hash\n");
}
//printf(" Number of Symbolic Links: %li\n", numSymParents);
printf("\n");
@ -1148,6 +1201,22 @@ fileEnd1:
// get the hash of the path
rawPathLen = printCatPath(rawPath, rdfCats[i].m_catid, true);
rdfCats[i].m_catHash = hash32Lower_a(rawPath, rawPathLen, 0);
// fix. so that xyz/Arts does not just hash "Arts"
// because it has no parent...
if ( rdfCats[i].m_parentid == 0 ) {
printf("Missing parent for catid %li. Will be "
"excluded from DMOZ so we avoid hash "
"collisions.\n",rdfCats[i].m_catid);
}
//
// DEBUG!
// print this shit out to find the collisions
//
//printf("hash32=%lu catid=%li parentid=%li path=%s\n",
// rdfCats[i].m_catHash,
// rdfCats[i].m_catid,
// rdfCats[i].m_parentid,
// rawPath);
}
// . now we want to serialize the needed data into