Many dmoz fixes, but there is still more we need to do.
Subcategories are not being printed right now.
This commit is contained in:
parent
63c7764cd1
commit
7ba9994804
@ -212,6 +212,15 @@ long Categories::loadCategories ( char *filename ) {
|
||||
long long start = gettimeofdayInMilliseconds();
|
||||
// sort the category hash by hash value
|
||||
gbsort(m_catHash, m_numCats, sizeof(CategoryHash), sortCatHash);
|
||||
|
||||
// sanity check - no dups allowed
|
||||
unsigned long last = 0xffffffff;
|
||||
for ( long i = 0 ; i < m_numCats ; i++ ) {
|
||||
if ( m_catHash[i].m_hash == last )
|
||||
log("dmoz: hash collision on %lu",last);
|
||||
last = m_catHash[i].m_hash;
|
||||
}
|
||||
|
||||
// time it
|
||||
long long took = gettimeofdayInMilliseconds();
|
||||
if ( took - start > 100 ) log(LOG_INIT,"admin: Took %lli ms to "
|
||||
@ -330,6 +339,8 @@ long Categories::getIndexFromPath ( char *str, long strLen ) {
|
||||
return 0;
|
||||
// get the hash
|
||||
unsigned long hash = hash32Lower_a(str, strLen, 0);
|
||||
// debug
|
||||
log("dmoz: looking up hash %lu",hash);
|
||||
// binary search
|
||||
while (low <= high) {
|
||||
// next check spot
|
||||
@ -514,7 +525,10 @@ void Categories::printPathFromIndex ( SafeBuf *sb ,
|
||||
// . the new dmoz data dumps signify a parentless topic by
|
||||
// having its parentid equal its catid, so avoid infinite
|
||||
// loops by checking for that here now. mdw oct 2013.
|
||||
if (parentId > 1 && parentId != catid ) {
|
||||
// . in the new DMOZ, Top has catid 2 now, even though it is
|
||||
// mistakenly labelled as Top/World, which is really catid 3.
|
||||
// so make this parentId > 2...
|
||||
if (parentId > 2 && parentId != catid ) {
|
||||
bool isParentRTL = isIdRTLStart(parentId);
|
||||
// print spacing here if RTL
|
||||
//if (isRTL && !raw)
|
||||
@ -574,8 +588,10 @@ void Categories::printPathCrumbFromIndex ( SafeBuf *sb,
|
||||
// get the parent
|
||||
parentId = m_cats[catIndex].m_parentid;
|
||||
long catid = m_cats[catIndex].m_catid;
|
||||
// print the parent(s) first
|
||||
if (parentId > 1 && parentId != catid ) {
|
||||
// . print the parent(s) first
|
||||
// . in the new dmoz, Top has parentid 2 now, and Top/World is
|
||||
// catid 3. so make this parentId > 2 not parentId > 1
|
||||
if (parentId > 2 && parentId != catid ) {
|
||||
bool isParentRTL = isIdRTLStart(parentId);
|
||||
printPathCrumbFromId(sb, parentId, isRTL);
|
||||
// print a spacing
|
||||
@ -1195,8 +1211,13 @@ nextTag:
|
||||
// . fill the next sub category
|
||||
// . fill the prefix and name in the buffer and subcat
|
||||
need = sizeof(SubCategory) + prefixLen + 1 + nameLen + 1;
|
||||
|
||||
// reserve space in safebuf for it
|
||||
if ( ! subCatBuf->reserve(need) ) goto errEnd;
|
||||
|
||||
// point to it in safebuf
|
||||
cat = (SubCategory *)(subCatBuf->getBuf());
|
||||
|
||||
cat->m_prefixLen = prefixLen;
|
||||
cat->m_nameLen = nameLen;
|
||||
cat->m_type = currType;
|
||||
@ -1208,6 +1229,9 @@ nextTag:
|
||||
p += nameLen;
|
||||
*p++ = '\0';
|
||||
|
||||
// update safebuf length
|
||||
subCatBuf->incrementLength ( cat->getRecSize() );
|
||||
|
||||
/*
|
||||
subCats[numSubCats].m_prefixOffset = catp;
|
||||
subCats[numSubCats].m_prefixLen = prefixLen;
|
||||
@ -1278,8 +1302,13 @@ long Categories::createDirSearchRequest ( char *requestBuf,
|
||||
char *rrr = r->m_reqBuf.getBufStart();
|
||||
if ( rrr && rrr[0] == 'Z' ) cmd = "ZET";
|
||||
// request
|
||||
p += sprintf(p, "%s /search?dir=%li&dr=0&sc=0&sdir=%li&sdirt=0&c=",
|
||||
cmd, catid, catid);
|
||||
//p += sprintf(p, "%s /search?dir=%li&dr=0&sc=0&sdir=%li&sdirt=0&c=",
|
||||
// cmd, catid, catid);
|
||||
p += sprintf(p,
|
||||
"%s /search?q=gbcatid%%3A%li&dir=%li&dr=0&sc=0&c="
|
||||
, cmd
|
||||
, catid
|
||||
, catid);
|
||||
// coll
|
||||
memcpy(p, coll, collLen);
|
||||
p += collLen;
|
||||
|
@ -23,6 +23,7 @@ void HttpRequest::reset() {
|
||||
m_userIP = 0;
|
||||
m_isMSIE = false;
|
||||
m_reqBufValid = false;
|
||||
m_reqBuf.purge();
|
||||
|
||||
if (m_cgiBuf2) {
|
||||
mfree(m_cgiBuf2, m_cgiBuf2Size, "extraParms");
|
||||
|
@ -4,6 +4,9 @@
|
||||
#include "Pages.h"
|
||||
#include "Categories.h"
|
||||
|
||||
// function is in PageRoot.cpp:
|
||||
bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) ;
|
||||
|
||||
// . returns false if blocked, true otherwise
|
||||
// . sets g_errno on error
|
||||
bool sendPageDirectory ( TcpSocket *s , HttpRequest *r ) {
|
||||
@ -39,11 +42,34 @@ bool sendPageDirectory ( TcpSocket *s , HttpRequest *r ) {
|
||||
// look it up
|
||||
long catId = g_categories->getIdFromPath(decodedPath, decodedPathLen);
|
||||
|
||||
// if /Top print the directory homepage
|
||||
if ( catId == 1 ) {
|
||||
SafeBuf sb;
|
||||
// this is in PageRoot.cpp
|
||||
printDirHomePage(sb,r);
|
||||
return g_httpServer.sendDynamicPage ( s,
|
||||
(char*) sb.getBufStart(),
|
||||
sb.length(),
|
||||
// 120 seconds cachetime
|
||||
// don't cache anymore
|
||||
// since
|
||||
// we have the login bar
|
||||
// @ the top of the page
|
||||
0,//120, // cachetime
|
||||
false,// post?
|
||||
"text/html",
|
||||
200,
|
||||
NULL, // cookie
|
||||
"UTF-8",
|
||||
r);
|
||||
}
|
||||
|
||||
// . make a new request for PageResults
|
||||
//Url dirUrl;
|
||||
char requestBuf[1024+MAX_COLL_LEN+128];
|
||||
long requestBufSize = 1024+MAX_COLL_LEN+128;
|
||||
//g_categories.createDirectorySearchUrl ( &dirUrl,
|
||||
log("dmoz: creating search request");
|
||||
long requestBufLen = g_categories->createDirSearchRequest(
|
||||
requestBuf,
|
||||
requestBufSize,
|
||||
|
@ -212,11 +212,15 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
|
||||
long rawFormat = hr->getLong("xml", 0); // was "raw"
|
||||
long xml = hr->getLong("xml",0);
|
||||
|
||||
// get the dmoz catid if given
|
||||
long catid = hr->getLong("dir",-1);
|
||||
|
||||
//
|
||||
// send back page frame with the ajax call to get the real
|
||||
// search results
|
||||
// search results. do not do this if a "&dir=" (dmoz category)
|
||||
// is given
|
||||
//
|
||||
if ( hr->getLong("id",0) == 0 && ! xml ) {
|
||||
if ( hr->getLong("id",0) == 0 && ! xml && catid == -1 ) {
|
||||
SafeBuf sb;
|
||||
sb.safePrintf(
|
||||
"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML "
|
||||
@ -830,14 +834,16 @@ bool gotResults ( void *state ) {
|
||||
if ( ! xml ) {
|
||||
sb.safePrintf("\n<font size=4><b>");
|
||||
if ( rtl ) sb.safePrintf("<span dir=ltr>");
|
||||
sb.safePrintf("<a href=\"/\">Top</a>: ");
|
||||
sb.safePrintf("<a href=\"/Top\">Top</a>: ");
|
||||
}
|
||||
// put crumb in xml?
|
||||
if ( xml )
|
||||
sb.safePrintf("<breacdcrumb><![CDATA[");
|
||||
// display the breadcrumb in xml or html?
|
||||
g_categories->printPathCrumbFromIndex(&sb,dirIndex,rtl);
|
||||
sb.safePrintf("]]></breadcrumb>\n" );
|
||||
|
||||
if ( xml )
|
||||
sb.safePrintf("]]></breadcrumb>\n" );
|
||||
|
||||
// print the num
|
||||
if ( ! xml ) {
|
||||
@ -4192,7 +4198,8 @@ bool printDMOZSubTopics ( SafeBuf& sb, long catId, State0 *st, bool inXml ) {
|
||||
prefixp = cat->getPrefix();//&catBuffer[subCats[i].m_prefixOffset];
|
||||
prefixLen = cat->m_prefixLen;//subCats[i].m_prefixLen;
|
||||
// skip bad categories
|
||||
currIndex = g_categories->getIndexFromPath(catName, catNameLen);
|
||||
//currIndex=g_categories->getIndexFromPath(catName,catNameLen);
|
||||
currIndex=g_categories->getIndexFromPath(prefixp,prefixLen);
|
||||
if (currIndex < 0)
|
||||
continue;
|
||||
// skip top adult category if we're supposed to
|
||||
|
@ -379,7 +379,10 @@ long Pages::getDynamicPageNumber ( HttpRequest *r ) {
|
||||
}
|
||||
// sanity
|
||||
if ( ! g_categories ) log("process: no categories loaded");
|
||||
// look it up for a category
|
||||
|
||||
//
|
||||
// dmoz - look it up for a category
|
||||
//
|
||||
if ( g_categories &&
|
||||
g_categories->getIndexFromPath(decodedPath, decodedPathLen) >= 0)
|
||||
return PAGE_DIRECTORY;
|
||||
@ -482,7 +485,7 @@ bool Pages::sendDynamicReply ( TcpSocket *s , HttpRequest *r , long page ) {
|
||||
// often times my cookie says username=mwells but i am not logged
|
||||
// in and i don't want to type my password to see the root page,
|
||||
// or any other public page
|
||||
if ( ! publicPage && ! g_users.hasPermission( r, page , s ) &&
|
||||
if ( ! publicPage &&!isLocal&&//g_users.hasPermission( r, page , s ) &&
|
||||
! isLoopback ) {
|
||||
log("login: access denied 2 from ip=%s",iptoa(s->m_ip));
|
||||
return sendPageLogin ( s , r, "Access Denied. No permission.");
|
||||
@ -614,7 +617,7 @@ bool Pages::sendDynamicReply ( TcpSocket *s , HttpRequest *r , long page ) {
|
||||
// . now, so it can be responsible for calling pg->m_function
|
||||
//if ( userType > USER_PUBLIC ) {
|
||||
// check if user has public page access
|
||||
if ( g_users.hasPermission( r, page , s ) ) {
|
||||
if ( isLocal ) { // g_users.hasPermission( r, page , s ) ) {
|
||||
// . this will set various parms
|
||||
// . we know the request came from a host in the cluster
|
||||
// because "isHost" is true.
|
||||
|
@ -1224,8 +1224,8 @@ bool SearchInput::setQueryBuffers ( ) {
|
||||
long dcatId = -1;
|
||||
// get the final query
|
||||
char *q =m_sbuf1.getBufStart();
|
||||
if ( q ) sscanf(q,"gbpdcat:%li",&pcatId);
|
||||
if ( q ) sscanf(q,"gbcat:%li",&dcatId);
|
||||
if ( q ) sscanf(q,"gbpdcatid:%li",&pcatId);
|
||||
if ( q ) sscanf(q,"gbcatid:%li",&dcatId);
|
||||
// pick the one that is valid
|
||||
long catId = -1;
|
||||
if ( pcatId >= 0 ) catId = pcatId;
|
||||
|
@ -21983,7 +21983,7 @@ bool XmlDoc::hashDMOZCategories ( HashTableX *tt ) {
|
||||
// write the catid as a string
|
||||
sprintf(buf, "%lu", catIds[i]);
|
||||
// term prefix for hashing
|
||||
hi.m_prefix = "gbdcat";
|
||||
hi.m_prefix = "gbcatid";
|
||||
// hash it
|
||||
hashString ( buf , gbstrlen(buf) , &hi );
|
||||
// we also want to hash the parents
|
||||
@ -21994,7 +21994,7 @@ bool XmlDoc::hashDMOZCategories ( HashTableX *tt ) {
|
||||
while ( currCatId > 1 ) {
|
||||
// hash the parent
|
||||
sprintf(buf, "%lu", currParentId);
|
||||
hi.m_prefix = "gbpdcat";
|
||||
hi.m_prefix = "gbpcatid";
|
||||
hashString ( buf , gbstrlen(buf), &hi );
|
||||
// next cat
|
||||
currCatId = currParentId;
|
||||
@ -22037,7 +22037,7 @@ bool XmlDoc::hashDMOZCategories ( HashTableX *tt ) {
|
||||
// write the catid as a string
|
||||
sprintf(buf, "%lu", indCatIds[i]);
|
||||
// use prefix
|
||||
hi.m_prefix = "gbicat";
|
||||
hi.m_prefix = "gbicatid";
|
||||
hi.m_hashGroup = HASHGROUP_INTAG;
|
||||
// hash it
|
||||
hashString ( buf , gbstrlen(buf), &hi );
|
||||
@ -22051,7 +22051,7 @@ bool XmlDoc::hashDMOZCategories ( HashTableX *tt ) {
|
||||
// hash the parent
|
||||
sprintf(buf, "%lu", currParentId);
|
||||
// new prefix
|
||||
hi.m_prefix = "gbpicat";
|
||||
hi.m_prefix = "gbipcatid";
|
||||
// hash it
|
||||
hashString ( buf , gbstrlen(buf), &hi );
|
||||
// next cat
|
||||
|
@ -209,12 +209,15 @@ long rdfParse ( char *tagName ) {
|
||||
|
||||
// move to the next tag in the file
|
||||
long rdfNextTag ( ) {
|
||||
bool inQuote = false;
|
||||
//bool inQuote = false;
|
||||
// move to the next tag
|
||||
while (*rdfPtr != '<' || inQuote ) {
|
||||
while (*rdfPtr != '<' ) { // || inQuote ) {
|
||||
// check for quotes
|
||||
if (*rdfPtr == '"')
|
||||
inQuote = !inQuote;
|
||||
// NO! too many unbalanced quotes all over the place!
|
||||
// and i think quotes in tags do not have < or > in them
|
||||
// because they should be encoded as &gt; and &lt;
|
||||
//if (*rdfPtr == '"')
|
||||
// inQuote = !inQuote;
|
||||
// next char
|
||||
if (!incRdfPtr())
|
||||
return -1;
|
||||
@ -560,8 +563,12 @@ long printCatPath ( char *str, long catid, bool raw ) {
|
||||
return 0;
|
||||
// get the parent
|
||||
parentId = rdfCats[catIndex].m_parentid;
|
||||
// print the parent(s) first
|
||||
if (parentId > 1 &&
|
||||
// . print the parent(s) first
|
||||
// . in NEWER DMOZ dumps, "Top" is catid 2 and catid 1 is an
|
||||
// empty title. really catid 2 is Top/World but that is an
|
||||
// error that we correct below. (see "Top/World" below).
|
||||
// but do not include the "Top/" as part of the path name
|
||||
if (parentId > 2 &&
|
||||
// the newer dmoz files have the catid == the parent id of
|
||||
// i guess top most categories, like "Top/Arts"... i would think
|
||||
// it should have a parentId of 1 like the old dmoz files,
|
||||
@ -907,12 +914,29 @@ int main ( int argc, char *argv[] ) {
|
||||
nameLen ,
|
||||
false,
|
||||
0);
|
||||
memcpy(&nameBuffer[nameOffset], htmlDecoded, nameLen);
|
||||
nameBufferLen += nameLen;
|
||||
|
||||
// parse the catid
|
||||
long catid = parseNextCatid();
|
||||
if (catid == -1)
|
||||
goto fileEnd;
|
||||
|
||||
// crap, in the new dmoz structure.rdf.u8 catid 1 is
|
||||
// empty name and catid 2 has Topic tag "Top/World" but
|
||||
// Title tag "Top".
|
||||
// but it should probably be "Top" and not "World". There is
|
||||
// another catid 3 in structure.rdf.u8 that has
|
||||
// <Topic r:id="Top/World"> and catid 3 which is the real one,
|
||||
// so catid 2 is just "Top". this is a bug in the dmoz output
|
||||
// i think, so fix it here.
|
||||
if ( catid == 2 ) {
|
||||
nameLen = 3;
|
||||
memcpy(&nameBuffer[nameOffset],"Top",nameLen);
|
||||
nameBufferLen += nameLen;
|
||||
}
|
||||
else {
|
||||
memcpy(&nameBuffer[nameOffset], htmlDecoded, nameLen);
|
||||
nameBufferLen += nameLen;
|
||||
}
|
||||
// . fill the current cat
|
||||
// make sure there's room
|
||||
if (numRdfCats >= rdfCatsSize) {
|
||||
@ -1002,10 +1026,16 @@ fileEnd:
|
||||
rdfEnd = &rdfBuffer[n];
|
||||
currOffset = 0;
|
||||
|
||||
//
|
||||
// set m_parentid using structure.rdf.u8
|
||||
//
|
||||
|
||||
// read and parse the file again
|
||||
printf("Building Hierarchy...\n");
|
||||
while (true) {
|
||||
// parse the next catid
|
||||
// parse the next catid in the file, sequentially
|
||||
//if ( currOffset == 545468935 )
|
||||
// printf("shit\n");
|
||||
long catid = parseNextCatid();
|
||||
if (catid == -1)
|
||||
goto fileEnd1;
|
||||
@ -1060,6 +1090,14 @@ nextChildTag:
|
||||
false,
|
||||
0);
|
||||
memcpy(childName, htmlDecoded, childNameLen);
|
||||
|
||||
// debug log
|
||||
//if ( currOffset >= 506362430 ) // 556362463
|
||||
// printf("off=%li\n",currOffset);
|
||||
// debug point
|
||||
//if ( currOffset == 545467573 )
|
||||
// printf("GOT DEBUG POINT before giant skip\n");
|
||||
|
||||
// cut off the leading label if symbolic
|
||||
// if (parentType == 2) {
|
||||
// while (*childName != ':') {
|
||||
@ -1069,20 +1107,27 @@ nextChildTag:
|
||||
// childName++;
|
||||
// childNameLen--;
|
||||
// }
|
||||
// debug point
|
||||
//if (strcmp(childName,"Top/World/Català/Arts") == 0 )
|
||||
// printf("hey\n");
|
||||
// get the catid for the child
|
||||
long childid = getCatHash(childName, childNameLen);
|
||||
// get the cat for this id
|
||||
long cat = getIndexFromId(childid);
|
||||
// make sure we have a match
|
||||
if (cat == -1) {
|
||||
//printf("Warning: Child Topic Not Found: ");
|
||||
//for (long i = 0; i < childNameLen; i++)
|
||||
// printf("%c", childName[i]);
|
||||
//printf("\n");
|
||||
// debug. why does Top/World/Catala/Arts
|
||||
// not have a parent??
|
||||
printf("Warning: Child Topic Not Found: ");
|
||||
for (long i = 0; i < childNameLen; i++)
|
||||
printf("%c", childName[i]);
|
||||
printf("\n");
|
||||
m++;
|
||||
goto nextChildTag;
|
||||
}
|
||||
// assign the parent to the cat
|
||||
// . assign the parent to the cat
|
||||
// . this means we are in a "child" tag within the "catid"
|
||||
// . catid 84192
|
||||
if (parentType == 1) {
|
||||
if (rdfCats[cat].m_parentid != 0)
|
||||
printf("Warning: Overwriting Parent Id!\n");
|
||||
@ -1114,6 +1159,14 @@ fileEnd1:
|
||||
printf(" Total Topics: %li\n", numRdfCats);
|
||||
printf(" Topics with Parents: %li\n", t);
|
||||
printf(" Topics Linked but Nonexistent: %li\n", m);
|
||||
|
||||
if ( t != numRdfCats ) {
|
||||
printf("\n"
|
||||
" *Topics without parents is bad because they\n"
|
||||
" can not have their entired rawPath printed out\n"
|
||||
" in order to get their proper hash\n");
|
||||
}
|
||||
|
||||
//printf(" Number of Symbolic Links: %li\n", numSymParents);
|
||||
printf("\n");
|
||||
|
||||
@ -1148,6 +1201,22 @@ fileEnd1:
|
||||
// get the hash of the path
|
||||
rawPathLen = printCatPath(rawPath, rdfCats[i].m_catid, true);
|
||||
rdfCats[i].m_catHash = hash32Lower_a(rawPath, rawPathLen, 0);
|
||||
// fix. so that xyz/Arts does not just hash "Arts"
|
||||
// because it has no parent...
|
||||
if ( rdfCats[i].m_parentid == 0 ) {
|
||||
printf("Missing parent for catid %li. Will be "
|
||||
"excluded from DMOZ so we avoid hash "
|
||||
"collisions.\n",rdfCats[i].m_catid);
|
||||
}
|
||||
//
|
||||
// DEBUG!
|
||||
// print this shit out to find the collisions
|
||||
//
|
||||
//printf("hash32=%lu catid=%li parentid=%li path=%s\n",
|
||||
// rdfCats[i].m_catHash,
|
||||
// rdfCats[i].m_catid,
|
||||
// rdfCats[i].m_parentid,
|
||||
// rawPath);
|
||||
}
|
||||
|
||||
// . now we want to serialize the needed data into
|
||||
|
Loading…
x
Reference in New Issue
Block a user