trying to bring back dmoz integration.

This commit is contained in:
mwells 2013-10-02 22:34:21 -06:00
parent 91b8921b9e
commit 6c2c9f7774
21 changed files with 1069 additions and 189 deletions

@ -51,8 +51,8 @@ bool Catdb::init ( ) {
// . initialize our own internal rdb
// . i no longer use cache so changes to tagdb are instant
// . we still use page cache however, which is good enough!
if ( this == &g_catdb )
return m_rdb.init ( g_hostdb.m_dir ,
//if ( this == &g_catdb )
if ( ! m_rdb.init ( g_hostdb.m_dir ,
"catdb" ,
true , // dedup same keys?
-1 , // fixed record size
@ -72,8 +72,14 @@ bool Catdb::init ( ) {
false,
12,
false,
true ); // is collectionless?
return true;
true )) // is collectionless?
return false;
// normally Collectiondb.addColl() will call Rdb::addColl() which
// will init the CollectionRec::m_rdbBase, which is what
// Rdb::getBase(collnum_t) will return. however, for collectionless
// rdb databases we set Rdb::m_collectionlessBase special here.
return m_rdb.addColl ( NULL );
}
bool Catdb::init2 ( long treeMem ) {
@ -119,7 +125,7 @@ bool Catdb::verify ( char *coll ) {
g_threads.disableThreads();
Msg5 msg5;
Msg5 msg5b;
//Msg5 msg5b;
RdbList list;
key_t startKey;
key_t endKey;
@ -128,7 +134,7 @@ bool Catdb::verify ( char *coll ) {
//long minRecSizes = 64000;
if ( ! msg5.getList ( RDB_CATDB ,
coll ,
"",//coll ,
&list ,
startKey ,
endKey ,
@ -147,7 +153,7 @@ bool Catdb::verify ( char *coll ) {
-1 ,
true ,
-1LL ,
&msg5b ,
NULL,//&msg5b ,
true )) {
g_threads.enableThreads();
return log("db: HEY! it did not block");
@ -309,6 +315,19 @@ void Catdb::listSearch ( RdbList *list,
// for small lists, just loop through the list
if (list->getListSize() < 16*1024) {
while ( ! list->isExhausted() ) {
// for debug!
/*
CatRec crec;
crec.set ( NULL,
list->getCurrentData(),
list->getCurrentDataSize(),
false);
log("catdb: caturl=%s #catid=%li version=%li"
,crec.m_url
,(long)crec.m_numCatids
,(long)crec.m_version
);
*/
// check the current key
if ( list->getCurrentKey() != exactKey ) {
// miss, next

@ -1011,13 +1011,17 @@ errEnd:
return false;
}
// generate sub categories for a given catid
// . generate sub categories for a given catid
// . store list of SubCategories into "subCatBuf" return # stored
long Categories::generateSubCats ( long catid,
SubCategory *subCats,
char **catBuffer,
long *catBufferSize,
long *catBufferLen,
bool allowRealloc ) {
SafeBuf *subCatBuf
//SubCategory *subCats,
//char **catBuffer,
//long *catBufferSize,
//long *catBufferLen,
//bool allowRealloc
) {
long catIndex;
unsigned long fileOffset;
unsigned long n;
@ -1029,15 +1033,22 @@ long Categories::generateSubCats ( long catid,
long prefixLen;
long nameStart;
long nameLen;
long catp = 0;
long catBufferInc = *catBufferSize;
// lookup the index for this catid
long need ;
SubCategory *cat;
char *p ;
//long catp = 0;
//long catBufferInc = *catBufferSize;
// . lookup the index for this catid
// . binary step, guessing to approximate place
// and then scanning from there
catIndex = getIndexFromId(catid);
if (catIndex < 0)
goto errEnd;
// get the file offset
fileOffset = m_cats[catIndex].m_structureOffset;
// open the structure file
// cat/content.rdf.u8 in utf8
char filename[512];
sprintf(filename, "%scat/%s", g_hostdb.m_dir, RDFSTRUCTURE_FILE);
//m_rdfStream.clear();
@ -1066,12 +1077,16 @@ long Categories::generateSubCats ( long catid,
log("cat: Error Reading Structure Offset");
goto errEnd;
}
// point to the buffer we just read with m_rdfPtr
m_rdfPtr = m_rdfBuffer;
m_rdfEnd = &m_rdfBuffer[n];
m_currOffset = fileOffset;
// parse tags for the sub categories or until we hit /Topic
nextTag:
// . this increments m_rdfPtr until it points to the beginning of a tag
// . it may end up reading another chunk from disk
// . it memcopies the name of the tag it points to into a buffer
if (rdfNextTag() < 0)
goto gotSubCats;
// check for /Topic
@ -1173,37 +1188,36 @@ nextTag:
break;
}
// . fill the next sub category
if (catp + prefixLen + nameLen >= *catBufferSize) {
if (!allowRealloc)
goto gotSubCats;
// realloc the buffer
char *re_catBuffer = (char*)mrealloc ( *catBuffer,
*catBufferSize,
*catBufferSize+catBufferInc,
"Categories" );
if (!re_catBuffer) {
log ( "Could not allocate %li bytes for catBuffer",
*catBufferSize+catBufferInc );
g_errno = ENOMEM;
goto errEnd;
}
*catBuffer = re_catBuffer;
*catBufferSize += catBufferInc;
}
// fill the prefix and name in the buffer and subcat
// . fill the prefix and name in the buffer and subcat
need = sizeof(SubCategory) + prefixLen + 1 + nameLen + 1;
if ( ! subCatBuf->reserve(need) ) goto errEnd;
cat = (SubCategory *)(subCatBuf->getBuf());
cat->m_prefixLen = prefixLen;
cat->m_nameLen = nameLen;
cat->m_type = currType;
p = cat->m_buf;
memcpy ( p , catStr + prefixStart , prefixLen );
p += prefixLen;
*p++ = '\0';
memcpy ( p , catStr + nameStart , nameLen );
p += nameLen;
*p++ = '\0';
/*
subCats[numSubCats].m_prefixOffset = catp;
subCats[numSubCats].m_prefixLen = prefixLen;
if (prefixLen > 0) {
memcpy(&((*catBuffer)[catp]), &catStr[prefixStart], prefixLen);
catp += prefixLen;
}
subCats[numSubCats].m_nameOffset = catp;
subCats[numSubCats].m_nameOffset = catBuf->length();//catp;
subCats[numSubCats].m_nameLen = nameLen;
if (nameLen > 0) {
memcpy(&((*catBuffer)[catp]), &catStr[nameStart], nameLen);
catp += nameLen;
}
subCats[numSubCats].m_type = currType;
*/
// next sub cat
numSubCats++;
if (numSubCats >= MAX_SUB_CATS) {
@ -1214,14 +1228,14 @@ nextTag:
// next tag
goto nextTag;
gotSubCats:
*catBufferLen = catp;
//*catBufferLen = catp;
//m_rdfStream.close();
//m_rdfStream.clear();
close(m_rdfStream);
return numSubCats;
errEnd:
*catBufferLen = 0;
//*catBufferLen = 0;
//m_rdfStream.close();
//m_rdfStream.clear();
close(m_rdfStream);

@ -61,11 +61,15 @@ struct CategoryHash {
};
struct SubCategory {
long m_prefixOffset;
//long m_prefixOffset;
long m_prefixLen;
long m_nameOffset;
//long m_nameOffset;
long m_nameLen;
char m_type;
long getRecSize () { return sizeof(SubCategory)+m_prefixLen+m_nameLen+2;};
char *getPrefix() { return m_buf; };
char *getName () { return m_buf+m_prefixLen+1;};
char m_buf[0];
};
class Categories {
@ -153,13 +157,10 @@ public:
// normalize a url string
long fixUrl ( char *url, long urlLen );
// generate sub categories for a given catid
long generateSubCats ( long catid,
SubCategory *subCats,
char **catBuffer,
long *catBufferSize,
long *catBufferLen,
bool allowRealloc = true );
// . generate sub categories for a given catid
// . store list of SubCategories into "subCatBuf" return # stored
// . hits disk without using threads... so kinda sucks...
long generateSubCats ( long catid, SafeBuf *subCatBuf );
long getNumUrlsFromIndex ( long catIndex ) {
return m_cats[catIndex].m_numUrls; };

@ -2308,10 +2308,9 @@ uint32_t Hostdb::getGroupId ( char rdbId,void *k,bool split ) {
unsigned long long d = g_revdb.getDocId( (key_t *)k );
return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
}
//else if ( rdbId == RDB_CATDB || rdbId == RDB2_CATDB2 ) {
// return m_map [(*(uint16_t *)((char *)k + 10))>>3];
//}
else if ( rdbId == RDB_CATDB || rdbId == RDB2_CATDB2 ) {
return m_map [(*(uint16_t *)((char *)k + 10))>>3];
}
// core -- must be provided
char *xx = NULL; *xx = 0;
//groupId=key.n1 & g_hostdb.m_groupMask;

@ -39,7 +39,7 @@ OBJS = Tfndb.o UdpSlot.o \
Parms.o Pages.o Msg28.o Msg30.o \
Unicode.o iana_charset.o Iso8859.o \
SearchInput.o \
Categories.o Msg2a.o PageCatdb.o PageDirectory.o Msg2b.o \
Categories.o Msg2a.o PageCatdb.o PageDirectory.o \
SafeBuf.o Datedb.o \
UCNormalizer.o UCPropTable.o UnicodeProperties.o \
Pops.o Title.o Pos.o LangList.o \

@ -148,6 +148,10 @@ bool Msg40::getResults ( SearchInput *si ,
// we need this info for caching as well
//m_numGigabitInfos = 0;
//just getfrom searchinput
//.... m_catId = hr->getLong("catid",0);m_si->m_catId;
m_postQueryRerank.set1( this, si );
// get the collection rec
@ -680,6 +684,20 @@ bool Msg40::gotDocIds ( ) {
// if ( ! m_msg1a.generateReferences(m_si,(void*)this,didTaskWrapper) )
// m_tasksRemaining++;
//
// call Msg2b to generate directory
//
// why is this here? it does not depend on the docids. (mdw 9/25/13)
// dissect it and fix it!!
//
//if ( m_si->m_catId &&
// ! m_msg2b.generateDirectory ( m_si->m_catId,
// (void*)this,
// didTaskWrapper ) )
// m_tasksRemaining++;
return launchMsg20s ( false );
}
@ -878,7 +896,6 @@ bool Msg40::reallocMsg20Buf ( ) {
return true;
}
/*
void didTaskWrapper ( void* state ) {
Msg40 *THIS = (Msg40 *) state;
// one less task
@ -888,7 +905,6 @@ void didTaskWrapper ( void* state ) {
// we are done, call the callback
THIS->m_callback ( THIS->m_state );
}
*/
bool Msg40::launchMsg20s ( bool recalled ) {
@ -2128,7 +2144,7 @@ long Msg40::getStoredSize ( ) {
//size += m_msg24.getStoredSize ( );
//size += m_msg1a.getStoredSize ( );
// cache msg2b if we have it
size += m_msg2b.getStoredSize();
//size += m_msg2b.getStoredSize();
return size;
}
@ -2203,9 +2219,9 @@ long Msg40::serialize ( char *buf , long bufLen ) {
//if ( y == -1 ) return -1;
//p += y;
long z = m_msg2b.serialize (p, pend - p);
if ( z == -1 ) return -1;
p += z;
//long z = m_msg2b.serialize (p, pend - p);
//if ( z == -1 ) return -1;
//p += z;
if ( m_r.m_debug )
log("query: msg40 serialize nd=%li "
@ -2258,9 +2274,9 @@ long Msg40::deserialize ( char *buf , long bufSize ) {
}
// msg2b
long z = m_msg2b.deserialize ( p , pend - p );
if ( z == -1 ) return -1;
p += z;
//long z = m_msg2b.deserialize ( p , pend - p );
//if ( z == -1 ) return -1;
//p += z;
// return bytes read
return p - buf;

@ -14,7 +14,7 @@
#include "Msg39.h" // getTermFreqs()
#include "Msg20.h" // for getting summary from docId
#include "Msg17.h" // a distributed cache of serialized/compressed Msg40s
#include "Msg2b.h" // for generating directories
//#include "Msg2b.h" // for generating directories
#include "IndexReadInfo.h" // STAGE0,...
#include "Msg3a.h"
#include "PostQueryRerank.h"
@ -302,7 +302,7 @@ class Msg40 {
long m_docsToScanForTopics;
// Msg2b for generating a directory
Msg2b m_msg2b;
//Msg2b m_msg2b;
PostQueryRerank m_postQueryRerank;

@ -50,8 +50,8 @@ bool Msg8b::getCatRec ( Url *url ,
m_state = state;
m_callback = callback;
m_url = url;
m_coll = coll;
m_collLen = collLen;
//m_coll = coll;
//m_collLen = collLen;
m_cr = cr;
m_niceness = niceness;
@ -68,10 +68,10 @@ bool Msg8b::getCatRec ( Url *url ,
//m_coll = g_conf.m_dirColl;
//m_collLen = gbstrlen(m_coll);
// catdb uses a dummy collection now, should not be looked at
m_coll = "catdb";
m_collLen = 5;
//m_coll = "catdb";
//m_collLen = 5;
m_collnum = g_collectiondb.getCollnum ( m_coll , m_collLen );
//m_collnum = g_collectiondb.getCollnum ( m_coll , m_collLen );
// . first, try it by canonical domain name
// . if that finds no matches, then try it by ip domain
@ -89,7 +89,7 @@ bool Msg8b::getCatRec ( Url *url ,
//
if ( g_hostdb.m_groupId != m_groupId ) {
// coll, url, niceness(1), rdbid(1), useCanonicalName(1)
long requestSize = m_collLen + m_url->getUrlLen() + 4 + 4;
long requestSize = m_url->getUrlLen() + 4 + 3;
// make the request
char *p = m_request;
*(long *)p = m_url->getIp() ; p+=4;
@ -97,10 +97,10 @@ bool Msg8b::getCatRec ( Url *url ,
*p = (char)niceness ; p++;
*p = (char)useCanonicalName; p++;
// coll
memcpy(p, m_coll, m_collLen);
p += m_collLen;
*p = '\0';
p++;
//memcpy(p, m_coll, m_collLen);
//p += m_collLen;
//*p = '\0';
//p++;
// url
memcpy(p, m_url->getUrl(), m_url->getUrlLen());
p += m_url->getUrlLen();
@ -186,7 +186,7 @@ bool Msg8b::getCatRec ( Url *url ,
0 , // max cached age in seconds (60)
false , // add net recv'd list to cache?
RDB_CATDB, // specifies the rdb, 1 = tagdb
m_coll ,
"",//NULL,//m_coll ,
//&m_list ,
m_list ,
startKey ,
@ -545,7 +545,7 @@ bool Msg8b::gotList ( ) {
char *rec;
//rec = g_catdb->getRec ( &m_list , m_url , &recSize );
rec = g_catdb.getRec(m_list,m_url,&recSize,m_coll,m_collLen);
rec = g_catdb.getRec(m_list,m_url,&recSize,NULL,0);//m_coll,m_collLen);
// if record found then set it and also set gotIt to true
if ( rec ) {
@ -588,8 +588,8 @@ void Msg8b::getIndirectCatids ( ) {
matchRecs,
matchRecSizes,
MAX_IND_CATIDS,
m_coll,
m_collLen);
NULL,//m_coll,
0);//m_collLen);
// parse out the catids from the matches
m_cr->m_numIndCatids = 0;
for ( long i = 0; i < numMatches; i++ ) {

@ -68,11 +68,11 @@ class Msg8b {
void cleanSlot ( );
// some specified input
char *m_coll;
long m_collLen;
//char *m_coll;
//long m_collLen;
Url *m_url;
collnum_t m_collnum;
//collnum_t m_collnum;
void (*m_callback ) ( void *state );//, CatRec *rec );
void *m_state; // ptr to caller's private state data

@ -105,8 +105,8 @@ bool sendPageCatdb ( TcpSocket *s , HttpRequest *r ) {
st->m_url.set(url, urlLen);
// call msg8b to lookup in catdb
if (!st->m_msg8b.getCatRec ( &st->m_url,
st->m_coll,
st->m_collLen,
NULL,//st->m_coll,
0,//st->m_collLen,
true,
1,
&st->m_catRec,

@ -263,7 +263,7 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
char *qstr = hr->getString("q",&qlen,"",NULL);
// . crap! also gotta encode apostrophe since "var url='..."
// . true = encodeApostrophes?
sb.urlEncode ( qstr , true );
sb.urlEncode2 ( qstr , true );
// propagate "admin" if set
long admin = hr->getLong("admin",-1);
if ( admin != -1 ) sb.safePrintf("&admin=%li",admin);
@ -272,7 +272,7 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
char *sites = hr->getString("sites",&sitesLen,NULL);
if ( sites ) {
sb.safePrintf("&sites=");
sb.urlEncode ( sites,true);
sb.urlEncode2 ( sites,true);
}
// propagate "debug" if set
long debug = hr->getLong("debug",0);
@ -744,6 +744,8 @@ static bool printGigabit ( State0 *st,
return true;
}
bool printDMOZSubTopics ( SafeBuf& sb, long catId, State0 *st, bool inXml ) ;
// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
@ -805,6 +807,70 @@ bool gotResults ( void *state ) {
return sendReply(st,NULL);
}
// grab the query
char *q = msg40->getQuery();
long qlen = msg40->getQueryLen();
bool xml = si->m_xml;
// display it?
if ( si->m_catId >= 0 ) {
long dirIndex = g_categories->getIndexFromId(si->m_catId);
// dirIndex = g_categories->getIndexFromId(si->m_cat_sdir);
if (dirIndex < 0) dirIndex = 0;
// display the directory bread crumb
//if( (si->m_cat_dirId > 0 && si->m_isAdmin && !si->m_isFriend)
// || (si->m_cat_sdir > 0 && si->m_cat_sdirt != 0) )
// sb.safePrintf("<br><br>");
// shortcut. rtl=Right To Left language format.
bool rtl = g_categories->isIdRTL ( si->m_catId ) ;
//st->m_isRTL = rtl;
if ( ! xml ) {
sb.safePrintf("\n<font size=4><b>");
if ( rtl ) sb.safePrintf("<span dir=ltr>");
sb.safePrintf("<a href=\"/\">Top</a>: ");
}
// put the crumb in xml?
if ( xml )
sb.safePrintf("<breadcrumb><![CDATA[");
// display the breadcrumb in xml or html?
g_categories->printPathCrumbFromIndex(&sb,dirIndex,rtl);
sb.safePrintf("]]></breadcrumb>\n" );
// print the num
if ( ! xml ) {
sb.safePrintf("</b>&nbsp&nbsp<i>");
// how many urls/entries in this topic?
long nu =g_categories->getNumUrlsFromIndex(dirIndex);
if ( rtl )
sb.safePrintf("<span dir=ltr>(%li)</span>",nu);
else
sb.safePrintf("(%li)", nu);
sb.safePrintf("</i></font><br><br>\n");
}
}
///////////
//
// show DMOZ subcategories if doing either a
// "gbpdcat:<catid> |" (Search restricted to category)
// "gbdcat:<catid>" (DMOZ urls in that topic, c=dmoz3)
//
// The search gbdcat: results should be sorted by siterank i guess
// since it is only search a single term: gbdcat:<catid> so we can
// put our stars back onto that and should be sorted by them.
//
///////////
if ( si->m_catId >= 0 )
// print the subtopics in this topic. show as links above
// the search results
printDMOZSubTopics ( sb, si->m_catId , st, xml );
// save how many docs are in it
long long docsInColl = -1;
//RdbBase *base = getRdbBase ( RDB_CHECKSUMDB , si->m_coll );
@ -854,9 +920,6 @@ bool gotResults ( void *state ) {
// numResults may be more than we requested now!
long n = msg40->getDocsWanted();
if ( n > numResults ) n = numResults;
// grab the query
char *q = msg40->getQuery();
long qlen = msg40->getQueryLen();
// . make the query class here for highlighting
// . keepAllSingles means to convert all individual words into
// QueryTerms even if they're in quotes or in a connection (cd-rom).
@ -1204,7 +1267,7 @@ bool gotResults ( void *state ) {
// print the word
char *t = qw->m_word;
long tlen = qw->m_wordLen;
sb.utf8Encode ( t , tlen );
sb.utf8Encode2 ( t , tlen );
sb.safePrintf (" ");
}
// print tail if we had ignored terms
@ -1264,7 +1327,7 @@ bool gotResults ( void *state ) {
qe2 );
// close it up
sb.safePrintf ("\"><i><b>");
sb.utf8Encode(st->m_spell, len);
sb.utf8Encode2(st->m_spell, len);
// then finish it off
sb.safePrintf ("</b></i></a></font>\n<br><br>\n");
}
@ -1682,6 +1745,60 @@ bool printInlinkText ( SafeBuf &sb , Msg20Reply *mr , SearchInput *si ,
return true;
}
//
// . print a dmoz topic for the given numeric catid UNDER search result
// . print "Search in Category" link as well
//
static bool printDMOZCategoryUnderResult ( SafeBuf &sb ,
SearchInput *si,
long catid ,
State0 *st ) {
uint8_t queryLanguage = langUnknown;
// Don't print category if not in native language category
// Note that this only trims out "World" cats, not all
// of them. Some of them may still sneak in.
if(si->m_langHint)
queryLanguage = si->m_langHint;
if(queryLanguage != langUnknown) {
char tmpbuf[1024];
SafeBuf langsb(tmpbuf, 1024);
g_categories->printPathFromId(&langsb, catid, false);
char *ptr = langsb.getBufStart();
uint8_t lang = g_langId.findLangFromDMOZTopic(ptr + 7);
if(!strncmp("World: ", ptr, 6) &&
lang != langUnknown &&
lang != queryLanguage)
// do not print it if not in our language
return true;
}
//////
//
// print a link to apply your query to this DMOZ category
//
//////
sb.safePrintf("<a href=\"/search?s=0&q=gbpdcat%%3A%li",catid);
sb.urlEncode("|",1);
sb.urlEncode(si->m_sbuf1.getBufStart(),si->m_sbuf1.length());
sb.safePrintf("\">Search in Category</a>: ");
// setup the host of the url
//if ( dmozHost )
// sb.safePrintf("<a href=\"http://%s/", dmozHost );
//else
sb.safePrintf("<a href=\"/");
// print link
g_categories->printPathFromId(&sb, catid, true,si->m_isRTL);
sb.safePrintf("/\">");
// print the name of the dmoz category
sb.safePrintf("<font color=#c62939>");
g_categories->printPathFromId(&sb, catid, false,si->m_isRTL);
sb.safePrintf("</font></a><br>");
//++tr.brCount;
return true;
}
// use this for xml as well as html
static int printResult ( SafeBuf &sb,
State0 *st,
@ -1806,6 +1923,13 @@ static int printResult ( SafeBuf &sb,
if ( mr->m_isBanned && ! si->m_xml )
sb.safePrintf("<font color=red><b>BANNED</b></font> ");
///////
//
// PRINT THE TITLE
//
///////
// the a href tag
if ( ! si->m_xml ) {
sb.safePrintf ( "<a href=" );
@ -1824,6 +1948,41 @@ static int printResult ( SafeBuf &sb,
long strLen = mr->size_tbuf - 1;// msg40->getTitleLen(i);
if ( ! str || strLen < 0 ) strLen = 0;
/////
//
// are we printing a dmoz category page?
// get the appropriate dmoz title/summary to use since the same
// url can exist in multiple topics (catIds) with different
// titles summaries.
//
/////
char *dmozSummary = NULL;
// TODO: just get the catid from httprequest directly?
if ( si->m_catId > 0 ) { // si->m_cat_dirId > 0) {
// . get the dmoz title and summary
// . if empty then just a bunch of \0s, except for catIds
Msg20Reply *mr = m20->getReply();
char *dmozTitle = mr->ptr_dmozTitles;
dmozSummary = mr->ptr_dmozSumms;
char *dmozAnchor = mr->ptr_dmozAnchors;
long *catIds = mr->ptr_catIds;
long numCats = mr->size_catIds / 4;
// loop through looking for the right ID
for (long i = 0; i < numCats ; i++ ) {
// assign shit if we match the dmoz cat we are showing
if ( catIds[i] == si->m_catId) break;
dmozTitle +=gbstrlen(dmozTitle)+1;
dmozSummary +=gbstrlen(dmozSummary)+1;
dmozAnchor += gbstrlen(dmozAnchor)+1;
}
// now make the title the dmoz title
str = dmozTitle;
strLen = gbstrlen(str);
}
long hlen;
//copy all summary and title excerpts for this result into here
char tt[1024*32];
@ -1872,7 +2031,11 @@ static int printResult ( SafeBuf &sb,
if ( ! si->m_xml ) sb.safePrintf ("</a><br>\n" ) ;
/////
//
// print content type after title
//
/////
unsigned char ctype = mr->m_contentType;
if ( ctype > 2 && ctype <= 13 ) {
char *cs = g_contentTypeStrings[ctype];
@ -1887,6 +2050,12 @@ static int printResult ( SafeBuf &sb,
sb.safePrintf(" (%s) &nbsp;" ,cs);
}
////////////
//
// print the summary
//
////////////
// . then the summary
// . "s" is a string of null terminated strings
char *send;
@ -1897,22 +2066,56 @@ static int printResult ( SafeBuf &sb,
if ( strLen < 0 ) strLen = 0;
send = str + strLen;
// dmoz summary might override if we are showing a dmoz topic page
if ( dmozSummary ) {
str = dmozSummary;
strLen = gbstrlen(dmozSummary);
}
if ( si->m_xml ) sb.safePrintf("\t\t<sum><![CDATA[");
// print summary out
//sb.safeMemcpy ( str , strLen );
sb.brify ( str , strLen, 0 , cols ); // niceness = 0
// remove \0's... wtf?
//char *xend = sb.getBuf();
//char *x = xend - strLen;
//for ( ; x < xend ; x++ ) if ( ! *x ) *x = ' ';
// close xml tag
if ( si->m_xml ) sb.safePrintf("]]></sum>\n");
// new line if not xml
else if ( strLen ) sb.safePrintf("<br>\n");
////////////
//
// . print DMOZ topics under the summary
// . will print the "Search in Category" link too
//
////////////
//Msg20Reply *mr = m20->getMsg20Reply();
long nCatIds = mr->getNumCatIds();
for (long i = 0; i < nCatIds; i++) {
long catid = ((long *)(mr->ptr_catIds))[i];
printDMOZCategoryUnderResult(sb,si,catid,st);
}
// skipCatsPrint:
// print the indirect category Ids
long nIndCatids = mr->size_indCatIds / 4;
//if ( !cr->m_displayIndirectDmozCategories )
// goto skipCatsPrint2;
for ( long i = 0; i < nIndCatids; i++ ) {
long catid = ((long *)(mr->ptr_indCatIds))[i];
// skip it if it's a regular category
//bool skip = false;
long d; for ( d = 0; d < nCatIds; d++) {
if ( catid == mr->ptr_catIds[i] ) break;
}
// skip if the indirect catid matched a directed catid
if ( d < nCatIds ) continue;
// otherwise print it
printDMOZCategoryUnderResult(sb,si,catid,st);
}
////////////
//
// print the URL
//
////////////
// hack off the http:// if any for displaying it on screen
if ( urlLen > 8 && strncmp ( url , "http://" , 7 )==0 ) {
url += 7; urlLen -= 7; }
@ -1928,7 +2131,6 @@ static int printResult ( SafeBuf &sb,
// so hack off the last slash
if ( j < 0 ) urlLen--;
}
if ( ! si->m_xml ) {
sb.safePrintf ("<font color=gray>" );
//sb.htmlEncode ( url , gbstrlen(url) , false );
@ -1937,7 +2139,6 @@ static int printResult ( SafeBuf &sb,
// turn off the color
sb.safePrintf ( "</font>\n" );
}
if ( si->m_xml ) {
sb.safePrintf("\t\t<url><![CDATA[");
sb.safeMemcpy ( url , urlLen );
@ -3880,3 +4081,440 @@ bool printSingleScore ( SafeBuf &sb ,
// "<br>");
return true;
}
// print the search options under a dmoz search box
bool printDirectorySearchType ( SafeBuf& sb, long sdirt ) {
// default to entire directory
if (sdirt < 1 || sdirt > 4)
sdirt = 3;
// by default search the whole thing
sb.safePrintf("<input type=\"radio\" name=\"sdirt\" value=\"3\"");
if (sdirt == 3) sb.safePrintf(" checked>");
else sb.safePrintf(">");
sb.safePrintf("Entire Directory<br>\n");
// entire category
sb.safePrintf("<input type=\"radio\" name=\"sdirt\" value=\"1\"");
if (sdirt == 1) sb.safePrintf(" checked>");
else sb.safePrintf(">");
sb.safePrintf("Entire Category<br>\n");
// base category only
sb.safePrintf("<nobr><input type=\"radio\" name=\"sdirt\" value=\"2\"");
if (sdirt == 2) sb.safePrintf(" checked>");
else sb.safePrintf(">");
sb.safePrintf("Pages in Base Category</nobr><br>\n");
// sites in base category
sb.safePrintf("<input type=\"radio\" name=\"sdirt\" value=\"7\"");
if (sdirt == 7) sb.safePrintf(" checked>");
else sb.safePrintf(">");
sb.safePrintf("Sites in Base Category<br>\n");
// sites in entire category
sb.safePrintf("<input type=\"radio\" name=\"sdirt\" value=\"6\"");
if (sdirt == 6) sb.safePrintf(" checked>");
else sb.safePrintf(">");
sb.safePrintf("Sites in Entire Category<br>\n");
// end it
return true;
}
////////
//
// . print the directory subtopics
// . show these when we are in a directory topic browsing dmoz
// . just a list of all the topics/categories
//
////////
bool printDMOZSubTopics ( SafeBuf& sb, long catId, State0 *st, bool inXml ) {
long currType;
bool first;
bool nextColumn;
long maxPerColumn;
long currInColumn;
long currIndex;
char *prefixp;
long prefixLen;
char *catName;
long catNameLen;
char encodedName[2048];
SearchInput *si = &st->m_si;
SafeBuf subCatBuf;
// stores a list of SubCategories into "subCatBuf"
long numSubCats = g_categories->generateSubCats ( si->m_catId , &subCatBuf );
// . get the subcategories for a given categoriy
// . msg2b::generateDirectory() was launched in Msg40.cpp
//long numSubCats = st->m_msg40.m_msg2b.m_numSubCats;
//SubCategory *subCats = st->m_msg40.m_msg2b.m_subCats;
//char *catBuffer = st->m_msg40.m_msg2b.m_catBuffer;
//bool showAdultOnTop = st->m_si.m_cr->m_showAdultCategoryOnTop;
// just print <hr> if no sub categories
if (inXml) {
sb.safePrintf ( "\t<directory>\n"
"\t\t<dirId>%li</dirId>\n"
"\t\t<dirName><![CDATA[",
si->m_catId);//si.m_cat_dirId );
g_categories->printPathFromId ( &sb,
si->m_catId, // st->m_si.m_cat_dirId,
true );
sb.safePrintf ( "]]></dirName>\n");
sb.safePrintf ( "\t\t<dirIsRTL>%li</dirIsRTL>\n",
(long)si->m_isRTL);
}
char *p = subCatBuf.getBufStart();
char *pend = subCatBuf.getBuf();
SubCategory *ptrs[MAX_SUB_CATS];
long count = 0;
if (numSubCats <= 0)
goto dirEnd;
// print out the cats
currType = 0;
// first make ptrs to them
for ( ; p < pend ; ) {
SubCategory *cat = (SubCategory *)p;
ptrs[count++] = cat;
p += cat->getRecSize();
}
for (long i = 0; i < count ; i++ ) {
SubCategory *cat = ptrs[i];
first = false;
catName = cat->getName();//&catBuffer[subCats[i].m_nameOffset];
catNameLen = cat->m_nameLen;//subCats[i].m_nameLen;
prefixp = cat->getPrefix();//&catBuffer[subCats[i].m_prefixOffset];
prefixLen = cat->m_prefixLen;//subCats[i].m_prefixLen;
// skip bad categories
currIndex = g_categories->getIndexFromPath(catName, catNameLen);
if (currIndex < 0)
continue;
// skip top adult category if we're supposed to
if ( !inXml &&
st->m_si.m_catId == 1 &&
si->m_familyFilter &&
g_categories->isIndexAdultStart ( currIndex ) )
continue;
// check for room
//if (p + subCats[i].m_prefixLen*2 +
// subCats[i].m_nameLen*2 +
// 512 > pend){
// goto diroverflow;
//}
// print simple xml tag for inXml
if (inXml) {
switch ( cat->m_type ) {
case SUBCAT_LETTERBAR:
sb.safePrintf ( "\t\t<letterbar><![CDATA[" );
sb.safePrintf ( "]]>" );
sb.safePrintf ( "<urlcount>%li</urlcount>",
g_categories->getNumUrlsFromIndex(
currIndex) );
sb.safePrintf ( "</letterbar>\n" );
break;
case SUBCAT_NARROW2:
sb.safePrintf ( "\t\t<narrow2><![CDATA[" );
sb.utf8Encode2 ( catName, catNameLen );
sb.safePrintf ( "]]>");
sb.safePrintf ( "<urlcount>%li</urlcount>",
g_categories->getNumUrlsFromIndex(
currIndex) );
sb.safePrintf ( "</narrow2>\n" );
break;
case SUBCAT_NARROW1:
sb.safePrintf ( "\t\t<narrow1><![CDATA[" );
sb.utf8Encode2 ( catName, catNameLen );
sb.safePrintf ( "]]>" );
sb.safePrintf ( "<urlcount>%li</urlcount>",
g_categories->getNumUrlsFromIndex(
currIndex) );
sb.safePrintf ( "</narrow1>\n" );
break;
case SUBCAT_NARROW:
sb.safePrintf ( "\t\t<narrow><![CDATA[" );
sb.utf8Encode2 ( catName, catNameLen );
sb.safePrintf ( "]]>" );
sb.safePrintf ( "<urlcount>%li</urlcount>",
g_categories->getNumUrlsFromIndex(
currIndex) );
sb.safePrintf ( "</narrow>\n" );
break;
case SUBCAT_SYMBOLIC2:
sb.safePrintf ( "\t\t<symbolic2><![CDATA[" );
sb.utf8Encode2 ( prefixp, prefixLen );
sb.safePrintf ( ":" );
sb.utf8Encode2 ( catName, catNameLen );
sb.safePrintf ( "]]>" );
sb.safePrintf ( "<urlcount>%li</urlcount>",
g_categories->getNumUrlsFromIndex(
currIndex) );
sb.safePrintf ( "</symbolic2>\n" );
break;
case SUBCAT_SYMBOLIC1:
sb.safePrintf ( "\t\t<symbolic1><![CDATA[" );
sb.utf8Encode2 ( prefixp, prefixLen );
sb.safePrintf ( ":" );
sb.utf8Encode2 ( catName, catNameLen );
sb.safePrintf ( "]]>" );
sb.safePrintf ( "<urlcount>%li</urlcount>",
g_categories->getNumUrlsFromIndex(
currIndex) );
sb.safePrintf ( "</symbolic1>\n" );
break;
case SUBCAT_SYMBOLIC:
sb.safePrintf ( "\t\t<symbolic><![CDATA[" );
sb.utf8Encode2 ( prefixp, prefixLen );
sb.safePrintf ( ":" );
sb.utf8Encode2 ( catName, catNameLen );
sb.safePrintf ( "]]>" );
sb.safePrintf ( "<urlcount>%li</urlcount>",
g_categories->getNumUrlsFromIndex(
currIndex) );
sb.safePrintf ( "</symbolic>\n" );
break;
case SUBCAT_RELATED:
sb.safePrintf ( "\t\t<related><![CDATA[" );
sb.utf8Encode2 ( catName, catNameLen );
sb.safePrintf ( "]]>" );
sb.safePrintf ( "<urlcount>%li</urlcount>",
g_categories->getNumUrlsFromIndex(
currIndex) );
sb.safePrintf ( "</related>\n" );
break;
case SUBCAT_ALTLANG:
sb.safePrintf ( "\t\t<altlang><![CDATA[" );
sb.utf8Encode2 ( prefixp, prefixLen );
sb.safePrintf ( ":" );
sb.utf8Encode2 ( catName, catNameLen );
sb.safePrintf ( "]]>" );
sb.safePrintf ( "<urlcount>%li</urlcount>",
g_categories->getNumUrlsFromIndex(
currIndex) );
sb.safePrintf ( "</altlang>\n");
break;
}
continue;
}
// print type header
if ( cat->m_type - currType >= 10) {
// end the last type
if (currType == SUBCAT_LETTERBAR)
sb.safePrintf(" ]</center>\n");
else if (currType != 0)
sb.safePrintf ( "\n</span></ul></td></tr>"
"</table>\n" );
// start the new type
switch (cat->m_type) {
case SUBCAT_LETTERBAR:
sb.safePrintf ( "<span class=\"directory\">"
"<center>[ " );
break;
case SUBCAT_NARROW2:
case SUBCAT_SYMBOLIC2:
case SUBCAT_NARROW1:
case SUBCAT_SYMBOLIC1:
case SUBCAT_NARROW:
case SUBCAT_SYMBOLIC:
sb.safePrintf("<hr>\n");
break;
case SUBCAT_RELATED:
if (currType == 0 ||
currType == SUBCAT_LETTERBAR)
sb.safePrintf("<hr>");
else
sb.safePrintf("<br>");
if (si->m_isRTL)
sb.safePrintf("<span dir=ltr>");
sb.safePrintf ( "<b>Related Categories:"
"</b>" );
if (si->m_isRTL)
sb.safePrintf("</span>");
break;
case SUBCAT_ALTLANG:
if (currType == 0 ||
currType == SUBCAT_LETTERBAR)
sb.safePrintf("<hr>");
else
sb.safePrintf("<br>");
if (si->m_isRTL)
sb.safePrintf("<span dir=ltr>");
sb.safePrintf ( "<b>This category in other"
" languages:</b>");
if (si->m_isRTL)
sb.safePrintf("</span>");
break;
}
currType = ( cat->m_type/10)*10;
first = true;
nextColumn = false;
currInColumn = 0;
if (currType == SUBCAT_LETTERBAR ||
currType == SUBCAT_RELATED)
maxPerColumn = 999;
else {
// . check how many columns we'll use for this
// type
long numInType = 1;
for (long j = i+1; j < numSubCats; j++) {
if ( ptrs[j]->m_type - currType >= 10)
break;
numInType++;
}
// column for every 5, up to 3 columns
long numColumns = numInType/5;
if ( numInType%5 > 0 ) numColumns++;
if ( currType == SUBCAT_ALTLANG &&
numColumns > 4)
numColumns = 4;
else if (numColumns > 3)
numColumns = 3;
// max number of links per column
maxPerColumn = numInType/numColumns;
if (numInType%numColumns > 0)
maxPerColumn++;
}
}
// start the sub cat
if (first) {
if (currType != SUBCAT_LETTERBAR)
sb.safePrintf ( "<table border=0>"
"<tr><td valign=top>"
"<ul><span class=\"directory\">"
"\n<li>");
}
// check for the next column
else if (nextColumn) {
sb.safePrintf ( "\n</span></ul></td><td valign=top>"
"<ul><span class=\"directory\">"
"\n<li>");
nextColumn = false;
}
// or just next link
else {
if (currType == SUBCAT_LETTERBAR)
sb.safePrintf("| ");
else
sb.safePrintf("<li>");
}
// print out the prefix as a link
//if ( p + catNameLen + 16 > pend ) {
// goto diroverflow;
//}
sb.safePrintf("<a href=\"/");
sb.utf8Encode2(catName, catNameLen);
sb.safePrintf("/\">");
// prefix...
//if ( p + prefixLen + 512 > pend ) {
// goto diroverflow;
//}
if (currType != SUBCAT_ALTLANG)
sb.safePrintf("<b>");
else {
// check for coded <b> or <strong> tags, remove
if (prefixLen >= 19 &&
strncasecmp(prefixp, "&lt;b&gt;", 9) == 0 &&
strncasecmp(prefixp + (prefixLen-10),
"&lt;/b&gt;", 10) == 0) {
prefixp += 9;
prefixLen -= 19;
}
else if (prefixLen >= 29 &&
strncasecmp(prefixp, "&lt;strong&gt;", 14) == 0 &&
strncasecmp(prefixp + (prefixLen-15),
"&lt;/strong&gt;", 15) == 0) {
prefixp += 14;
prefixLen -= 29;
}
}
if (currType == SUBCAT_RELATED) {
// print the full path
if (g_categories->isIndexRTL(currIndex))
sb.safePrintf("<span dir=ltr>");
g_categories->printPathFromIndex (
&sb,
currIndex,
false,
si->m_isRTL);
}
else {
char *encodeEnd = htmlEncode ( encodedName,
encodedName + 2047,
prefixp,
prefixp + prefixLen );
prefixp = encodedName;
prefixLen = encodeEnd - encodedName;
//if ( p + prefixLen + 512 > pend ) {
// goto diroverflow;
//}
for (long c = 0; c < prefixLen; c++) {
if (*prefixp == '_')
//*p = ' ';
sb.safePrintf(" ");
else
//*p = *prefixp;
sb.utf8Encode2(prefixp, 1);
//p++;
prefixp++;
}
}
//if ( p + 512 > pend ) {
// goto diroverflow;
//}
// end the link
if (currType != SUBCAT_ALTLANG)
sb.safePrintf("</b>");
sb.safePrintf("</a>");
// print an @ for symbolic links
if ( (cat->m_type % 10) == 1)
sb.safePrintf("@");
// print number of urls under here
if ( cat->m_type != SUBCAT_LETTERBAR) {
sb.safePrintf("&nbsp&nbsp<i>");
if (si->m_isRTL)
sb.safePrintf ( "<span dir=ltr>(%li)"
"</span></i>",
g_categories->getNumUrlsFromIndex(
currIndex) );
else
sb.safePrintf ( "(%li)</i>",
g_categories->getNumUrlsFromIndex(
currIndex) );
}
// next line/letter
if ( cat->m_type == SUBCAT_LETTERBAR) {
sb.safePrintf(" ");
continue;
}
// check for next column
currInColumn++;
if (currInColumn >= maxPerColumn) {
currInColumn = 0;
nextColumn = true;
}
}
//if ( p + 512 > pend ) {
// goto diroverflow;
//}
// end the last type
if (!inXml) {
if (currType == SUBCAT_LETTERBAR)
sb.safePrintf(" ]</center>\n");
else
sb.safePrintf("</ul></td></tr></table>\n");
}
dirEnd:
if (inXml)
sb.safePrintf("\t</directory>\n");
else {
sb.safePrintf("</span>");
sb.safePrintf("<hr><br>\n");
}
return true;
}

13
Rdb.cpp

@ -5,7 +5,7 @@
#include "Clusterdb.h"
#include "Hostdb.h"
#include "Tagdb.h"
//#include "Catdb.h"
#include "Catdb.h"
#include "Indexdb.h"
#include "Posdb.h"
#include "Cachedb.h"
@ -1340,7 +1340,7 @@ void attemptMergeAll ( int fd , void *state ) {
g_titledb.getRdb()->attemptMerge ( 1 , false , !state);
//g_tfndb.getRdb()->attemptMerge ( 1 , false , !state);
g_tagdb.getRdb()->attemptMerge ( 1 , false , !state);
//g_catdb.getRdb()->attemptMerge ( 1 , false , !state);
g_catdb.getRdb()->attemptMerge ( 1 , false , !state);
g_clusterdb.getRdb()->attemptMerge ( 1 , false , !state);
g_statsdb.getRdb()->attemptMerge ( 1 , false , !state);
g_syncdb.getRdb()->attemptMerge ( 1 , false , !state);
@ -2351,7 +2351,7 @@ Rdb *getRdbFromId ( uint8_t rdbId ) {
s_table9 [ RDB_DOLEDB ] = g_doledb.getRdb();
s_table9 [ RDB_TFNDB ] = g_tfndb.getRdb();
s_table9 [ RDB_CLUSTERDB ] = g_clusterdb.getRdb();
//s_table9 [ RDB_CATDB ] = g_catdb.getRdb();
s_table9 [ RDB_CATDB ] = g_catdb.getRdb();
s_table9 [ RDB_DATEDB ] = g_datedb.getRdb();
s_table9 [ RDB_LINKDB ] = g_linkdb.getRdb();
s_table9 [ RDB_CACHEDB ] = g_cachedb.getRdb();
@ -2380,7 +2380,7 @@ Rdb *getRdbFromId ( uint8_t rdbId ) {
// the opposite of the above
char getIdFromRdb ( Rdb *rdb ) {
if ( rdb == g_tagdb.getRdb () ) return RDB_TAGDB;
//if ( rdb == g_catdb.getRdb () ) return RDB_CATDB;
if ( rdb == g_catdb.getRdb () ) return RDB_CATDB;
if ( rdb == g_indexdb.getRdb () ) return RDB_INDEXDB;
if ( rdb == g_posdb.getRdb () ) return RDB_POSDB;
if ( rdb == g_datedb.getRdb () ) return RDB_DATEDB;
@ -2401,7 +2401,7 @@ char getIdFromRdb ( Rdb *rdb ) {
if ( rdb == g_revdb.getRdb () ) return RDB_REVDB;
//if ( rdb == g_sitedb.getRdb () ) return RDB_SITEDB;
//if ( rdb == g_tagdb2.getRdb () ) return RDB2_SITEDB2;
//if ( rdb == g_catdb.getRdb () ) return RDB_CATDB;
if ( rdb == g_catdb.getRdb () ) return RDB_CATDB;
if ( rdb == g_indexdb2.getRdb () ) return RDB2_INDEXDB2;
if ( rdb == g_posdb2.getRdb () ) return RDB2_POSDB2;
if ( rdb == g_datedb2.getRdb () ) return RDB2_DATEDB2;
@ -2425,7 +2425,7 @@ char getIdFromRdb ( Rdb *rdb ) {
char isSecondaryRdb ( uint8_t rdbId ) {
switch ( rdbId ) {
//case RDB2_SITEDB2 : return true;
//case RDB_CATDB2 : return g_catdb2.getRdb();
case RDB2_CATDB2 : return true;
case RDB2_INDEXDB2 : return true;
case RDB2_POSDB2 : return true;
case RDB2_DATEDB2 : return true;
@ -2532,6 +2532,7 @@ long getDataSizeFromRdbId ( uint8_t rdbId ) {
else if ( i == RDB2_TITLEDB2 ||
i == RDB2_REVDB2 ||
i == RDB2_TAGDB2 ||
i == RDB2_CATDB2 ||
i == RDB2_SPIDERDB2 ||
i == RDB2_PLACEDB2 )
ds = -1;

1
Rdb.h

@ -52,6 +52,7 @@ enum {
RDB2_REVDB2,
RDB2_TAGDB2,
RDB2_POSDB2, // 31
RDB2_CATDB2,
RDB_END
};
// how many rdbs are in "urgent merge" mode?

@ -680,7 +680,7 @@ bool SafeBuf::setEncoding(short cs) {
return true;
}
bool SafeBuf::utf8Encode(char *s, long len, bool encodeHTML,long niceness) {
bool SafeBuf::utf8Encode2(char *s, long len, bool encodeHTML,long niceness) {
long tmp = m_length;
if ( m_encoding == csUTF8 ) {
if (! safeMemcpy(s,len)) return false;
@ -1235,7 +1235,8 @@ void initTable ( ) {
}
}
bool SafeBuf::urlEncode ( bool spaceToPlus ) {
// url encode the whole buffer
bool SafeBuf::urlEncodeAllBuf ( bool spaceToPlus ) {
// this makes things faster
if ( ! s_init23 ) initTable();
// how many chars do we need?

@ -178,9 +178,9 @@ struct SafeBuf {
//insert strings in their native encoding
bool encode ( char *s , long len , long niceness=0) {
return utf8Encode(s,len,false,niceness); };
return utf8Encode2(s,len,false,niceness); };
// htmlEncode default = false
bool utf8Encode(char *s, long len, bool htmlEncode=false,
bool utf8Encode2(char *s, long len, bool htmlEncode=false,
long niceness=0);
bool latin1Encode(char *s, long len, bool htmlEncode=false,
long niceness=0);
@ -203,11 +203,15 @@ struct SafeBuf {
bool requestPath = false,
bool encodeApostrophes = false );
bool urlEncode (char *s ,
bool encodeApostrophes = false ) {
bool urlEncode (char *s ) {
return urlEncode ( s,strlen(s),false,false); };
bool urlEncode2 (char *s ,
bool encodeApostrophes ) { // usually false
return urlEncode ( s,strlen(s),false,encodeApostrophes); };
bool urlEncode ( bool spaceToPlus = true );
bool urlEncodeAllBuf ( bool spaceToPlus = true );
bool latin1CdataEncode(char *s, long len);
bool utf8CdataEncode(char *s, long len);

@ -1210,6 +1210,40 @@ bool SearchInput::setQueryBuffers ( ) {
m_displayQuery,
m_displayQueryLen);
//////////
//
// show DMOZ BREADCRUMB if doing a
// "gbpdcat:<catid> |" (Search restricted to category)
// "gbdcat:<catid>" (DMOZ urls in that topic, c=dmoz3)
// NOTE(review): the sscanf below scans the "gbcat:" prefix, but this
// comment — and the term prefix hashed by the indexer — say "gbdcat:".
// Confirm which query prefix is intended; as written, "gbdcat:<catid>"
// queries will not populate dcatId.
//
//////////
long pcatId = -1;
long dcatId = -1;
// get the final query
char *q =m_sbuf1.getBufStart();
if ( q ) sscanf(q,"gbpdcat:%li",&pcatId);
if ( q ) sscanf(q,"gbcat:%li",&dcatId);
// pick the one that is valid
long catId = -1;
if ( pcatId >= 0 ) catId = pcatId;
if ( dcatId >= 0 ) catId = dcatId;
//////
//
// save catid into the state
m_catId = catId;
//
///////
// are we a right to left language like hebrew?
if ( catId > 0 && g_categories->isIdRTL(catId) )
m_isRTL = true;
else
m_isRTL = false;
return true;
}

@ -400,6 +400,9 @@ class SearchInput {
SafeBuf m_sbuf2;
SafeBuf m_sbuf3;
long m_catId;
bool m_isRTL;
// make a cookie from parms with m_flags of PF_COOKIE set
SafeBuf m_cookieBuf;

@ -2790,7 +2790,11 @@ char **XmlDoc::getTitleRec ( ) {
long dslen = 0;
unsigned char dalen = 0;
// store all dmoz info separated by \0's into titles[] buffer
// . store all dmoz info separated by \0's into titles[] buffer
// . crap, this does a disk read and blocks on that
//
// . TODO: make it non-blocking!!!!
//
g_categories->getTitleAndSummary ( m_firstUrl.getUrl(),
m_firstUrl.getUrlLen(),
ptr_catIds[i],
@ -3372,7 +3376,7 @@ CatRec *XmlDoc::getCatRec ( ) {
// return what we got
if ( m_catRecValid ) return &m_catRec;
// call that
setStatus ("getting cat rec");
setStatus ("getting dmoz cat rec");
// callback?
if ( m_calledMsg8b ) {
// return NULL on error
@ -3386,7 +3390,8 @@ CatRec *XmlDoc::getCatRec ( ) {
// assume empty and skip the call for now
m_catRec.reset();
m_catRecValid = true;
return &m_catRec;
// let's bring dmoz back
//return &m_catRec;
// compute it otherwise
if ( ! m_msg8b.getCatRec ( &m_firstUrl ,
m_coll ,
@ -20303,7 +20308,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
if ( ! hashUrl ( table ) ) return NULL;
if ( ! hashMetaTags ( table ) ) return NULL;
if ( ! hashMetaZip ( table ) ) return NULL;
//if ( ! hashCategories ( table ) ) return NULL;
if ( ! hashDMOZCategories( table ) ) return NULL;
if ( ! hashLanguage ( table ) ) return NULL;
if ( ! hashCountry ( table ) ) return NULL;
if ( ! hashSiteNumInlinks( table ) ) return NULL;
@ -21789,6 +21794,113 @@ bool XmlDoc::searchboxToGigablast ( ) {
return m_xml.hasGigablastForm();
}
// . bring back support for dmoz integration
// . when clicking on a "search within this category" it does a gbpdcat:<catid>
//   search to capture all pages that have that dmoz category as one of their
//   parent topics
// . for each DIRECT catid this doc is in, hashes:
//   - "gbdcat:<catid>"            (the category itself)
//   - "gbpdcat:<parentid>"        (the category and every ancestor up to Top)
//   - the DMOZ title   (as HASHGROUP_TITLE)
//   - the DMOZ summary (as HASHGROUP_BODY)
// . for each INDIRECT catid, hashes "gbicat:"/"gbpicat:" analogously
// . returns true (no failure paths)
bool XmlDoc::hashDMOZCategories ( HashTableX *tt ) {
	// dmoz titles/summaries are stored \0-separated in the title rec,
	// one entry per direct catid, in the same order as ptr_catIds
	char *titlePtr = ptr_dmozTitles;
	char *sumPtr   = ptr_dmozSumms;
	//char *anchPtr = ptr_dmozAnchors;
	char buf[128];
	HashInfo hi;
	hi.m_tt = tt;
	// catids are stored as 4-byte longs in ptr_catIds
	long *catIds    = (long *)ptr_catIds;
	long  numCatIds = size_catIds / 4;
	// go through the catIds and hash them
	for (long i = 0; i < numCatIds; i++) {
		// . reset the hash group every iteration. the previous
		//   iteration left it set to HASHGROUP_BODY (from hashing
		//   the summary) which would wrongly apply to this catid's
		//   gbdcat:/gbpdcat: terms. the indirect-catid loop below
		//   already resets it per-iteration.
		hi.m_hashGroup = HASHGROUP_INTAG;
		// . write the catid as a string
		// . cast: catids are non-negative, and "%lu" requires an
		//   unsigned long argument to be well-defined
		sprintf(buf, "%lu", (unsigned long)catIds[i]);
		// term prefix for hashing
		hi.m_prefix = "gbdcat";
		// hash it
		hashString ( buf , gbstrlen(buf) , &hi );
		// we also want to hash the parents
		long currCatId    = catIds[i];
		long currParentId = catIds[i];
		long currCatIndex;
		// . loop to the Top, Top = 1
		// . note: the category itself is hashed as its own first
		//   "parent" so a gbpdcat: search matches it too
		while ( currCatId > 1 ) {
			// hash the parent
			sprintf(buf, "%lu", (unsigned long)currParentId);
			hi.m_prefix = "gbpdcat";
			hashString ( buf , gbstrlen(buf), &hi );
			// next cat
			currCatId = currParentId;
			// . get the index for this cat
			// . bail if unknown so we never loop forever on a
			//   corrupt/incomplete hierarchy
			currCatIndex = g_categories->getIndexFromId(currCatId);
			if ( currCatIndex <= 0 ) break;
			// get the parent for this cat
			currParentId =
				g_categories->m_cats[currCatIndex].m_parentid;
		}
		// do not hash titles or summaries if "index article content
		// only" parm is on
		//if ( tr->eliminateMenus() ) continue;
		// hash dmoz title
		hi.m_prefix = NULL;
		// call this DMOZ title as regular title i guess
		hi.m_hashGroup = HASHGROUP_TITLE;
		// hash the DMOZ title
		hashString ( titlePtr , gbstrlen(titlePtr), &hi );
		// next title (entries are \0-separated)
		titlePtr += gbstrlen(titlePtr) + 1;
		// hash DMOZ summary
		hi.m_prefix = NULL;
		// call this DMOZ summary as body i guess
		hi.m_hashGroup = HASHGROUP_BODY;
		// hash the DMOZ summary
		hashString ( sumPtr , gbstrlen(sumPtr), &hi );
		// next summary (entries are \0-separated)
		sumPtr += gbstrlen(sumPtr) + 1;
	}
	long  numIndCatIds = size_indCatIds / 4;
	long *indCatIds    = (long *)ptr_indCatIds;
	// go through the INDIRECT catIds and hash them
	for (long i = 0 ; i < numIndCatIds; i++) {
		// write the catid as a string (cast: see note above)
		sprintf(buf, "%lu", (unsigned long)indCatIds[i]);
		// use prefix
		hi.m_prefix = "gbicat";
		hi.m_hashGroup = HASHGROUP_INTAG;
		// hash it
		hashString ( buf , gbstrlen(buf), &hi );
		// we also want to hash the parents
		long currCatId    = indCatIds[i];
		long currParentId = indCatIds[i];
		long currCatIndex;
		// loop to the Top, Top = 1
		while (currCatId > 1) {
			// hash the parent
			sprintf(buf, "%lu", (unsigned long)currParentId);
			// new prefix
			hi.m_prefix = "gbpicat";
			// hash it
			hashString ( buf , gbstrlen(buf), &hi );
			// next cat
			currCatId = currParentId;
			// get the index for this cat
			currCatIndex = g_categories->getIndexFromId(currCatId);
			if ( currCatIndex <= 0 ) break;
			// get the parent for this cat
			currParentId =
				g_categories->m_cats[currCatIndex].m_parentid;
		}
	}
	return true;
}
bool XmlDoc::hashLanguage ( HashTableX *tt ) {
setStatus ( "hashing language" );

@ -693,6 +693,7 @@ class XmlDoc {
bool hashZipCodes ( class HashTableX *table ) ;
bool hashMetaZip ( class HashTableX *table ) ;
bool hashContentType ( class HashTableX *table ) ;
bool hashDMOZCategories ( class HashTableX *table ) ;
bool hashLinks ( class HashTableX *table ) ;
bool hashUrl ( class HashTableX *table ) ;
bool hashSections ( class HashTableX *table ) ;

@ -21,6 +21,11 @@
bool closeAll ( void *state , void (* callback)(void *state) ) { return true; }
bool allExit ( ) { return true; };
bool sendPageSEO(TcpSocket *s, HttpRequest *hr) {return true;}
//long g_qbufNeedSave = false;
//SafeBuf g_qbuf;
#define RDFBUFFER_SIZE (1024*1024*10)
#define RDFSTRUCTURE_FILE "structure.rdf.u8"
#define RDFCONTENT_FILE "content.rdf.u8"
@ -518,7 +523,7 @@ bool isGoodUrl ( char *url, long urlLen ) {
if ( urlLen <= 0 )
return false;
for (long i = 0; i < urlLen; i++) {
if (is_space(url[i]))
if (is_wspace_a(url[i]))
return false;
}
// check for [prot]://[url]
@ -621,7 +626,7 @@ long fixUrl ( char *url, long urlLen ) {
memmove(&url[slashi-1], &url[slashi], newUrlLen - slashi);
newUrlLen--;
}
if (is_space(url[slashi])) {
if (is_wspace_a(url[slashi])) {
memmove(&url[slashi], &url[slashi+1], newUrlLen - (slashi+1));
newUrlLen--;
}
@ -678,7 +683,7 @@ int main ( int argc, char *argv[] ) {
long m = 0;
long newNameBufferSize = 0;
long newOffset = 0;
char filename[256];
char filename[1256];
long urlTxtCount = 0;
long urlTxtFile = 0;
Url normUrl;
@ -695,6 +700,7 @@ int main ( int argc, char *argv[] ) {
bool splitUrls = false;
char mode = MODE_NONE;
long totalNEC = 0;
char *dir;
// check the options and mode
for (long i = 0; i < argc; i++) {
@ -783,20 +789,29 @@ int main ( int argc, char *argv[] ) {
goto errExit;
}
dir = "";
retry:
// open the structure file
if ( mode == MODE_NEW || mode == MODE_CATDUMP )
sprintf(filename, "%s", RDFSTRUCTURE_FILE);
sprintf(filename, "%s%s", dir,RDFSTRUCTURE_FILE);
else
sprintf(filename, "%s.new", RDFSTRUCTURE_FILE);
sprintf(filename, "%s%s.new", dir,RDFSTRUCTURE_FILE);
//rdfStream.open(filename, ifstream::in);
rdfStream = open ( filename, O_RDONLY );
// make sure it openned okay
// make sure it opened okay
//if (!rdfStream.is_open()) {
if ( rdfStream < 0 ) {
printf("Error Openning %s\n", filename);
// try ./cat/ subdir if not found
if ( ! dir[0] ) {
dir = "./cat/";
goto retry;
}
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("Openned Structure File: %s\n", filename);
printf("Opened Structure File: %s\n", filename);
// take the first chunk
//rdfStream.read(rdfBuffer, RDFBUFFER_SIZE);
@ -832,7 +847,9 @@ int main ( int argc, char *argv[] ) {
nameLen = MAX_HTTP_FILENAME_LEN;
nameLen = htmlDecode ( htmlDecoded,
&nameBuffer[nameOffset],
nameLen );
nameLen ,
false,
0);
memcpy(&nameBuffer[nameOffset], htmlDecoded, nameLen);
nameBufferLen += nameLen;
// parse the catid
@ -977,7 +994,9 @@ nextChildTag:
childNameLen = MAX_HTTP_FILENAME_LEN;
childNameLen = htmlDecode ( htmlDecoded,
childName,
childNameLen );
childNameLen ,
false,
0);
memcpy(childName, htmlDecoded, childNameLen);
// cut off the leading label if symbolic
// if (parentType == 2) {
@ -1066,25 +1085,25 @@ fileEnd1:
for (long i = 0; i < numRdfCats; i++) {
// get the hash of the path
rawPathLen = printCatPath(rawPath, rdfCats[i].m_catid, true);
rdfCats[i].m_catHash = hash32Lower(rawPath, rawPathLen, 0);
rdfCats[i].m_catHash = hash32Lower_a(rawPath, rawPathLen, 0);
}
// . now we want to serialize the needed data into
// one (or more?) file(s) to be quickly read by gb
if ( mode == MODE_NEW )
sprintf(filename, "%s", STRUCTURE_OUTPUT_FILE);
sprintf(filename, "%s%s", dir,STRUCTURE_OUTPUT_FILE);
else
sprintf(filename, "%s.new", STRUCTURE_OUTPUT_FILE);
sprintf(filename, "%s%s.new", dir,STRUCTURE_OUTPUT_FILE);
//outStream.open(filename, ofstream::out|ofstream::trunc);
outStream = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it openned okay
// make sure it opened okay
//if (!outStream.is_open()) {
if ( outStream < 0 ) {
printf("Error Openning %s\n", filename);
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("\nOpenned %s for writing.\n", filename);
printf("\nOpened %s for writing.\n", filename);
// write the size of the truncated name buffer
//outStream.write((char*)&newNameBufferSize, sizeof(long));
@ -1152,18 +1171,18 @@ contentParse:
// open the content file
if ( mode == MODE_NEW || mode == MODE_URLDUMP )
sprintf(filename, "%s", RDFCONTENT_FILE);
sprintf(filename, "%s%s", dir,RDFCONTENT_FILE);
else
sprintf(filename, "%s.new", RDFCONTENT_FILE);
sprintf(filename, "%s%s.new", dir,RDFCONTENT_FILE);
//rdfStream.open(filename, ifstream::in);
rdfStream = open ( filename, O_RDONLY );
// make sure it openned okay
// make sure it opened okay
//if (!rdfStream.is_open()) {
if ( rdfStream < 0 ) {
printf("Error Openning %s\n", filename);
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("\nOpenned Content File: %s\n", filename);
printf("\nOpened Content File: %s\n", filename);
// take the first chunk
//rdfStream.read(rdfBuffer, RDFBUFFER_SIZE);
@ -1199,13 +1218,13 @@ contentParse:
//outStream2.open(filename, ofstream::out|ofstream::trunc);
outStream2 = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it openned okay
// make sure it opened okay
//if (!outStream2.is_open()) {
if ( outStream2 < 0 ) {
printf("Error Openning %s\n", filename);
printf("Error Opening %s\n", filename);
goto errExit1;
}
printf("Openned %s for writing.\n", filename);
printf("Opened %s for writing.\n", filename);
// if we're doing a diffurldump, load up the diff file first
if ( mode == MODE_DIFFURLDUMP ) {
@ -1219,10 +1238,10 @@ contentParse:
diffInStream = open(filename, O_RDONLY);
//if (!diffInStream.is_open()) {
if ( diffInStream < 0 ) {
printf("Error Openning %s\n", filename);
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("Openned Diff File: %s\n", filename);
printf("Opened Diff File: %s\n", filename);
// read in the number of urls to update/add
//diffInStream.read((char*)&numUpdateIndexes,
@ -1326,14 +1345,14 @@ contentParse:
outStream2 = open ( filename,
O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it openned okay
// make sure it opened okay
//if (!outStream2.is_open()) {
if ( outStream2 < 0 ) {
printf("Error Openning %s\n",
printf("Error Opening %s\n",
filename);
goto errExit1;
}
printf("Openned %s for writing.\n",
printf("Opened %s for writing.\n",
filename);
urlTxtCount = 0;
}
@ -1348,20 +1367,20 @@ contentParse:
}
else {
if ( mode == MODE_NEW )
sprintf(filename, "%s", CONTENT_OUTPUT_FILE);
sprintf(filename, "%s%s", dir,CONTENT_OUTPUT_FILE);
else
sprintf(filename, "%s.new", CONTENT_OUTPUT_FILE);
sprintf(filename, "%s%s.new", dir,CONTENT_OUTPUT_FILE);
// stream the urls into the content
//outStream.open(filename, ofstream::out|ofstream::trunc);
outStream = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it openned okay
// make sure it opened okay
//if (!outStream.is_open()) {
if ( outStream < 0 ) {
printf("Error Openning %s\n", filename);
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("Openned %s for writing.\n", filename);
printf("Opened %s for writing.\n", filename);
// store a space for the number of urls at the start of the file
//outStream.write((char*)&numUrlInfos, sizeof(long));
@ -1442,7 +1461,8 @@ hashLink:
// html decode the url
if (urlLen > MAX_URL_LEN)
urlLen = MAX_URL_LEN;
urlLen = htmlDecode(decodedUrl, &urlBuffer[urlOffset], urlLen);
urlLen = htmlDecode(decodedUrl, &urlBuffer[urlOffset], urlLen,
false,0);
memcpy(&urlBuffer[urlOffset], decodedUrl, urlLen);
// fix up bad urls
urlLen = fixUrl(&urlBuffer[urlOffset], urlLen);
@ -1473,7 +1493,7 @@ hashLink:
//urlBufferLen += urlLen;
// get the hash value
unsigned long long urlHash =
hash64Lower(&urlBuffer[urlOffset], urlLen, 0);
hash64Lower_a(&urlBuffer[urlOffset], urlLen, 0);
//unsigned long urlHash2 =
// hash32Lower(&urlBuffer[urlOffset], urlLen, 0);
// see if it's already indexed
@ -1530,14 +1550,14 @@ hashLink:
outStream2 = open ( filename,
O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it openned okay
// make sure it opened okay
//if (!outStream2.is_open()) {
if ( outStream2 < 0 ) {
printf("Error Openning %s\n",
printf("Error Opening %s\n",
filename);
goto errExit1;
}
printf("Openned %s for writing.\n",
printf("Opened %s for writing.\n",
filename);
urlTxtCount = 0;
}
@ -1697,19 +1717,19 @@ fileEnd2:
// load the content and url files
// url info (content) file
sprintf(filename, "%s", CONTENT_OUTPUT_FILE);
sprintf(filename, "%s%s", dir,CONTENT_OUTPUT_FILE);
//rdfStream.open(filename, ifstream::in);
rdfStream = open ( filename, O_RDONLY );
//if (!rdfStream.is_open()) {
if ( rdfStream < 0 ) {
printf("Error Openning %s\n", CONTENT_OUTPUT_FILE);
printf("Error Opening %s\n", filename);
goto oldErrExit;
}
// read in the number of urls
//rdfStream.read((char*)&oldNumUrls, sizeof(long));
if (fileRead(rdfStream, &oldNumUrls, sizeof(long)) !=
sizeof(long)) {
printf("Error Reading %s\n", CONTENT_OUTPUT_FILE);
printf("Error Reading %s\n", filename);
goto oldErrExit;
}
@ -1749,8 +1769,8 @@ fileEnd2:
//rdfStream.read((char*)&urlLen, sizeof(short));
long n = fileRead(rdfStream, &urlLen, sizeof(short));
if ( n < 0 || n > (long)sizeof(short) ) {
printf("Error Reading %s\n",
CONTENT_OUTPUT_FILE);
printf("Error Reading %s\n",filename);
//CONTENT_OUTPUT_FILE);
goto oldErrExit;
}
if ( n == 0 )
@ -1780,8 +1800,8 @@ fileEnd2:
}
n = fileRead(rdfStream, &oldUrls[urlp], urlLen);
if ( n < 0 || n > urlLen ) {
printf("Error Reading %s\n",
CONTENT_OUTPUT_FILE);
printf("Error Reading %s\n",filename);
//CONTENT_OUTPUT_FILE);
goto oldErrExit;
}
if ( n == 0 )
@ -1791,7 +1811,7 @@ fileEnd2:
urlLen = fixUrl(&oldUrls[urlp], urlLen);
// make the hash
oldUrlHashes[currUrl] =
hash64Lower(&oldUrls[urlp], urlLen, 0);
hash64Lower_a(&oldUrls[urlp], urlLen, 0);
removeOldUrl[currUrl] = 0;
// increment the buffer pointer
if (urlLen <= 0) {
@ -1814,8 +1834,8 @@ fileEnd2:
//rdfStream.read((char*)&oldNumCatids[currUrl], 1);
long n = fileRead(rdfStream, &oldNumCatids[currUrl], 1);
if ( n < 0 || n > 1 ) {
printf("Error Reading %s\n",
CONTENT_OUTPUT_FILE);
printf("Error Reading %s\n",filename);
//CONTENT_OUTPUT_FILE);
goto oldErrExit;
}
if ( n == 0 )
@ -1839,8 +1859,8 @@ fileEnd2:
long readSize = sizeof(long)*oldNumCatids[currUrl];
n = fileRead(rdfStream, &oldCatids[catidp], readSize);
if ( n < 0 || n > readSize ) {
printf("Error Reading %s\n",
CONTENT_OUTPUT_FILE);
printf("Error Reading %s\n",filename);
//CONTENT_OUTPUT_FILE);
goto oldErrExit;
}
if ( n == 0 )
@ -1907,17 +1927,17 @@ oldIsDifferent:
// also urls to remove
//
// open the new diff file for writing
sprintf(filename, "%s.new.diff", CONTENT_OUTPUT_FILE);
sprintf(filename, "%s%s.new.diff", dir,CONTENT_OUTPUT_FILE);
//outStream.open(filename, ofstream::out|ofstream::trunc);
outStream = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it openned okay
// make sure it opened okay
//if (!outStream.is_open()) {
if ( outStream < 0 ) {
printf("Error Openning %s\n", filename);
printf("Error Opening %s\n", filename);
goto oldErrExit;
}
printf("\nOpenned %s for writing.\n", filename);
printf("\nOpened %s for writing.\n", filename);
// write out the number of urls to update/add
//outStream.write(&numUpdateUrls, sizeof(long));
@ -2027,19 +2047,19 @@ oldGoodExit:
// . now we want to serialize the needed data into
// one (or more?) file(s) to be quickly read by gb
if ( mode == MODE_NEW )
sprintf(filename, "%s", STRUCTURE_OUTPUT_FILE);
sprintf(filename, "%s%s", dir,STRUCTURE_OUTPUT_FILE);
else
sprintf(filename, "%s.new", STRUCTURE_OUTPUT_FILE);
sprintf(filename, "%s%s.new", dir,STRUCTURE_OUTPUT_FILE);
//outStream.open(filename, ofstream::out|ofstream::ate);
outStream = open ( filename, O_WRONLY|O_APPEND,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it openned okay
// make sure it opened okay
//if (!outStream.is_open()) {
if ( outStream < 0 ) {
printf("Error Openning %s\n", filename);
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("\nOpenned %s for writing.\n", filename);
printf("\nOpened %s for writing.\n", filename);
// write the cats
//outStream.write((char*)rdfCats, sizeof(RdfCat)*numRdfCats);
@ -2109,21 +2129,21 @@ oldGoodExit:
// write another file for the urls
if ( mode == MODE_NEW )
sprintf(filename, "%s", CONTENT_OUTPUT_FILE);
sprintf(filename, "%s%s", dir,CONTENT_OUTPUT_FILE);
else
sprintf(filename, "%s.new", CONTENT_OUTPUT_FILE);
sprintf(filename, "%s%s.new", dir,CONTENT_OUTPUT_FILE);
//outStream.open(filename, ofstream::out|ofstream::ate);
outStream = open ( filename, O_WRONLY,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
//outStream.open(filename, ofstream::out|ofstream::trunc);
//endpos = outStream.tellp();
// make sure it openned okay
// make sure it opened okay
//if (!outStream.is_open()) {
if ( outStream < 0 ) {
printf("Error Openning %s\n", filename);
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("\nOpenned %s for writing.\n", filename);
printf("\nOpened %s for writing.\n", filename);
//outStream.seekp(0);
lseek(outStream, 0, SEEK_SET);

@ -22,7 +22,7 @@
#include "Titledb.h"
#include "Revdb.h"
#include "Tagdb.h"
//#include "Catdb.h"
#include "Catdb.h"
#include "Users.h"
#include "Tfndb.h"
#include "Spider.h"
@ -2624,8 +2624,8 @@ int main ( int argc , char *argv[] ) {
if ( ! g_tagdb.init() ) {
log("db: Tagdb init failed." ); return 1; }
// the catdb, it's an instance of tagdb, pass RDB_CATDB
//if ( ! g_catdb.init() ) {
// log("db: Catdb1 init failed." ); return 1; }
if ( ! g_catdb.init() ) {
log("db: Catdb1 init failed." ); return 1; }
// initialize Users
if ( ! g_users.init() ){
log("db: Users init failed. "); return 1;}
@ -10986,7 +10986,8 @@ void dumpTagdb (char *coll,long startFileNum,long numFiles,bool includeTree,
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
g_tagdb.init ();
g_collectiondb.init(true);
g_tagdb.addColl ( coll, false );
if ( rdbId == RDB_TAGDB ) g_tagdb.addColl ( coll, false );
if ( rdbId == RDB_CATDB ) g_catdb.init();
key128_t startKey ;
key128_t endKey ;
startKey.setMin();
@ -11051,6 +11052,21 @@ void dumpTagdb (char *coll,long startFileNum,long numFiles,bool includeTree,
printf("corrupt tagdb rec k.n0=%llu",k.n0);
continue;
}
// catdb?
if ( rdbId == RDB_CATDB ) {
// for debug!
CatRec crec;
crec.set ( NULL,
data ,
size ,
false);
printf("caturl=%s #catids=%li version=%li\n"
,crec.m_url
,(long)crec.m_numCatids
,(long)crec.m_version
);
continue;
}
// parse it up
//TagRec *tagRec = (TagRec *)rec;
Tag *tag = (Tag *)rec;
@ -13945,10 +13961,10 @@ void saveRdbs ( int fd , void *state ) {
last = rdb->getLastWriteTime();
if ( now - last > delta )
if ( ! rdb->close(NULL,NULL,false,false)) return;
//rdb = g_catdb.getRdb();
//last = rdb->getLastWriteTime();
//if ( now - last > delta )
// if ( ! rdb->close(NULL,NULL,false,false)) return;
rdb = g_catdb.getRdb();
last = rdb->getLastWriteTime();
if ( now - last > delta )
if ( ! rdb->close(NULL,NULL,false,false)) return;
//rdb = g_indexdb.getRdb();
//last = rdb->getLastWriteTime();
//if ( now - last > delta )