git dmoz nagivation system working.

now we just need to index the urls to
populate dmoz.
This commit is contained in:
mwells
2013-10-10 22:08:21 -07:00
parent 7ba9994804
commit ca6af65217
3 changed files with 73 additions and 20 deletions

@ -50,6 +50,7 @@ void Categories::reset() {
}
}
// filename usually ./catdb/gbdmoz.structure.dat
long Categories::loadCategories ( char *filename ) {
//ifstream inStream;
int inStream;
@ -69,6 +70,7 @@ long Categories::loadCategories ( char *filename ) {
return 1;
}
// read in the number of cats
// filename usually ./catdb/gbdmoz.structure.dat
if ( fileRead ( inStream, &m_numCats, sizeof(long) ) != sizeof(long) ) {
log("cat: Error reading structure file: %s", filename);
close(inStream);
@ -114,7 +116,8 @@ long Categories::loadCategories ( char *filename ) {
g_errno = ENOMEM;
return 1;
}
// read the rest of the file into the temp buffer
// . read the rest of the file into the temp buffer
// . filename usually ./catdb/gbdmoz.structure.dat
if ( fileRead ( inStream, tempBuffer, readSize ) != readSize ) {
log("cat: Error reading structure file: %s", filename);
close(inStream);
@ -336,11 +339,15 @@ long Categories::getIndexFromPath ( char *str, long strLen ) {
// check for top
if (strLen == 3 &&
strncasecmp(str, "Top", 3) == 0)
// it is catid 2 right? but i guess zero is symbolic for us!
return 0;
// get the hash
unsigned long hash = hash32Lower_a(str, strLen, 0);
// debug
log("dmoz: looking up hash %lu",hash);
//char c = str[strLen];
//str[strLen] = '\0';
//log("dmoz: looking up hash %lu for %s",hash,str);
//str[strLen] = c;
// binary search
while (low <= high) {
// next check spot
@ -521,6 +528,13 @@ void Categories::printPathFromIndex ( SafeBuf *sb ,
// get the parent
parentId = m_cats[catIndex].m_parentid;
long catid = m_cats[catIndex].m_catid;
// include Top now. in newer dmoz it is catid2.
if ( catid == 2 ) {
sb->safePrintf("Top");
return;
}
// . print the parent(s) first
// . the new dmoz data dumps signify a parentless topic by
// havings its parentid equal its catid, so avoid infinite
@ -528,7 +542,7 @@ void Categories::printPathFromIndex ( SafeBuf *sb ,
// . the new DMOZ has Top has catid 2 now, even though it is
// mistakenly labelled as Top/World, which is really catid 3.
// so make this parentId > 2...
if (parentId > 2 && parentId != catid ) {
if (parentId >= 1 && parentId != catid ) {
bool isParentRTL = isIdRTLStart(parentId);
// print spacing here if RTL
//if (isRTL && !raw)
@ -588,10 +602,17 @@ void Categories::printPathCrumbFromIndex ( SafeBuf *sb,
// get the parent
parentId = m_cats[catIndex].m_parentid;
long catid = m_cats[catIndex].m_catid;
// include Top now. in newer dmoz it is catid2.
if ( catid == 2 ) {
sb->safePrintf("Top");
return;
}
// . print the parent(s) first
// . the new dmoz has Top has parentid 2 now, and Top/World is
// catid 3. so make this parentId > 2 not parentId > 1
if (parentId > 2 && parentId != catid ) {
if (parentId > 1 && parentId != catid ) {
bool isParentRTL = isIdRTLStart(parentId);
printPathCrumbFromId(sb, parentId, isRTL);
// print a spacing
@ -1157,6 +1178,9 @@ nextTag:
false,
0);
memcpy(catStr, htmlDecoded, catStrLen);
// reset this offset
nameStart = 0;
nameLen = catStrLen;
// get the prefix and name position/length
switch (currType) {
case SUBCAT_ALTLANG:
@ -1166,14 +1190,14 @@ nextTag:
// prefix is at the start
prefixStart = 0;
prefixLen = 0;
nameStart = 0;
//nameStart = 0;
// go to the end of the prefix
while (catStr[nameStart] != ':') {
nameStart++;
prefixLen++;
}
// skip the :Top/
nameStart += 5;
// skip the : in :Top/
nameStart += 1;
nameLen = catStrLen - nameStart;
break;
case SUBCAT_LETTERBAR:
@ -1181,9 +1205,9 @@ nextTag:
prefixStart = catStrLen - 1;
prefixLen = 1;
// skip the Top/ for the name
nameStart = 4;
//nameStart = 4;
// lose the Top/, keep the end letter
nameLen = catStrLen - 4;
//nameLen = catStrLen - 4;
break;
// . don't do this because of ltr?
//case SUBCAT_RELATED:
@ -1203,9 +1227,15 @@ nextTag:
prefixStart--;
prefixLen++;
}
// name skips Top/
nameStart = 4;
nameLen = catStrLen - 4;
// name skips Top/ ... no! we include Top now
// because we need it so PageResults.cpp can call
// currIndex=g_categories->getIndexFromPath(catName,catNameLen)
// on this name, and it needs "Top/" because it was part
// of the hash of the full name for the category now.
// and we lookup the Category record by that hash
// in getIndexFromPath().
//nameStart = 4;
//nameLen = catStrLen - 4;
break;
}
// . fill the next sub category

@ -4195,11 +4195,13 @@ bool printDMOZSubTopics ( SafeBuf& sb, long catId, State0 *st, bool inXml ) {
first = false;
catName = cat->getName();//&catBuffer[subCats[i].m_nameOffset];
catNameLen = cat->m_nameLen;//subCats[i].m_nameLen;
// this is the last topic in the dmoz dir path
// so if the dmoz topic is Top/Arts/Directories then
// the prefixp is "Directories"
prefixp = cat->getPrefix();//&catBuffer[subCats[i].m_prefixOffset];
prefixLen = cat->m_prefixLen;//subCats[i].m_prefixLen;
// skip bad categories
//currIndex=g_categories->getIndexFromPath(catName,catNameLen);
currIndex=g_categories->getIndexFromPath(prefixp,prefixLen);
currIndex=g_categories->getIndexFromPath(catName,catNameLen);
if (currIndex < 0)
continue;
// skip top adult category if we're supposed to

@ -563,12 +563,22 @@ long printCatPath ( char *str, long catid, bool raw ) {
return 0;
// get the parent
parentId = rdfCats[catIndex].m_parentid;
// . print the parent(s) first
// . in NEWER DMOZ dumps, "Top" is catid 2 and catid 1 is an
// empty title. really catid 2 is Top/World but that is an
// error that we correct below. (see "Top/World" below).
// but do not include the "Top/" as part of the path name
if (parentId > 2 &&
if ( catid == 2 ) {
// no! we now include Top as part of the path. let's
// be consistent. i'd rather have www.gigablast.com/Top
// and www.gigablast.com/Top/Arts etc. then i know if the
// path starts with /Top that it is dmoz!!
sprintf(p,"Top");
return 3;
}
if (parentId > 1 &&
// the newer dmoz files have the catid == the parent id of
// i guess top most categories, like "Top/Arts"... i would think
// it should have a parentId of 1 like the old dmoz files,
@ -888,6 +898,13 @@ int main ( int argc, char *argv[] ) {
unsigned long catOffset = currOffset - 6;
// get the topic name, preserve it on the buffer
long nameOffset = nameBufferLen;
// the name inserted by this function into "nameBuffer"
// does not seem to contain "Top/" at the beginning.
// it is from structure.rdf.u8, but it seems to be there!
// yeah, later on we hack the name buffer and nameOffset
// so it is just the last word in the directory to save
// mem. then we print out all the parent names to
// reconstruct.
long nameLen = fillNextString();
if (nameLen == -1)
goto fileEnd;
@ -1200,6 +1217,9 @@ fileEnd1:
for (long i = 0; i < numRdfCats; i++) {
// get the hash of the path
rawPathLen = printCatPath(rawPath, rdfCats[i].m_catid, true);
// crap, this rawpath contains "Top/" in the beginning
// but the rdfCats[i].m_nameOffset refers to a name
// that does not include "Top/"
rdfCats[i].m_catHash = hash32Lower_a(rawPath, rawPathLen, 0);
// fix. so that xyz/Arts does not just hash "Arts"
// because it has no parent...
@ -1212,11 +1232,12 @@ fileEnd1:
// DEBUG!
// print this shit out to find the collisions
//
//printf("hash32=%lu catid=%li parentid=%li path=%s\n",
// rdfCats[i].m_catHash,
// rdfCats[i].m_catid,
// rdfCats[i].m_parentid,
// rawPath);
continue;
printf("hash32=%lu catid=%li parentid=%li path=%s\n",
rdfCats[i].m_catHash,
rdfCats[i].m_catid,
rdfCats[i].m_parentid,
rawPath);
}
// . now we want to serialize the needed data into