mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-07-19 03:14:38 -04:00
Merge branch 'master' into diffbot
Conflicts: Hostdb.cpp Makefile PageResults.cpp PageRoot.cpp Pages.cpp Rdb.cpp SearchInput.cpp SearchInput.h Spider.cpp Spider.h XmlDoc.cpp
This commit is contained in:
CatRec.cpp Catdb.cpp Categories.cpp Categories.h Conf.h Hostdb.cpp HttpMime.cpp HttpMime.h HttpRequest.cpp Lang.cpp Makefile Mem.cpp Msg1.cpp Msg2a.cpp Msg40.cpp Msg40.h Msg8b.cpp Msg8b.h Msg9b.cpp PageAddUrl.cpp PageCatdb.cpp PageDirectory.cpp PageOverview.cpp PagePerf.cpp PageResults.cpp PageResults.h PageRoot.cpp PageStats.cpp PageStatsdb.cpp Pages.cpp Parms.cpp Process.cpp Proxy.cpp Rdb.cpp Rdb.h SafeBuf.cpp SafeBuf.h SearchInput.cpp SearchInput.h Sections.cpp Sections.h Speller.cpp Spider.cpp Spider.h Stats.cpp Stats.h Statsdb.cpp Statsdb.h TcpServer.cpp XmlDoc.cpp XmlDoc.h dmozparse.cpp
html
libplot.alibplotter.amain.cppmatches2.cppplotter.hurlinfo.cpp@ -198,6 +198,7 @@ bool CatRec::set ( Url *url , char *data , long dataSize , bool gotByIp ) {
|
||||
log ( "tagdb: Deserialized datasize %i != %li for url %s so "
|
||||
"ignoring tagdb record.",
|
||||
p - m_data, m_dataSize , url->getUrl() );
|
||||
return false;
|
||||
char *xx = NULL; *xx = 0;
|
||||
}
|
||||
|
||||
@ -308,7 +309,9 @@ bool CatRec::set ( Url *site ,
|
||||
// add the ids
|
||||
m_catids = (long*)p;
|
||||
memcpy(p, catids, 4*m_numCatids);
|
||||
p += 4*m_numCatids;
|
||||
// skip over "numCatids" NOT m_numCatids which is TRUNCATED
|
||||
// to MAX_CATIDS
|
||||
p += 4*numCatids;
|
||||
//}
|
||||
// point to the filenum so we can mod it!
|
||||
//m_filenumPtr = p;
|
||||
|
43
Catdb.cpp
43
Catdb.cpp
@ -29,7 +29,11 @@ bool Catdb::init ( ) {
|
||||
// . what's max # of tree nodes?
|
||||
// . assume avg tagdb rec size (siteUrl) is about 82 bytes we get:
|
||||
// . NOTE: 32 bytes of the 82 are overhead
|
||||
long treeMem = g_conf.m_catdbMaxTreeMem;
|
||||
//long treeMem = g_conf.m_catdbMaxTreeMem;
|
||||
// speed up gen catdb, use 15MB. later maybe once gen is complete
|
||||
// we can free this tree or something...
|
||||
// TODO!
|
||||
long treeMem = 15000000;
|
||||
//long treeMem = 100000000;
|
||||
//long maxTreeNodes = g_conf.m_catdbMaxTreeMem / 82;
|
||||
long maxTreeNodes = treeMem / 82;
|
||||
@ -51,14 +55,14 @@ bool Catdb::init ( ) {
|
||||
// . initialize our own internal rdb
|
||||
// . i no longer use cache so changes to tagdb are instant
|
||||
// . we still use page cache however, which is good enough!
|
||||
if ( this == &g_catdb )
|
||||
return m_rdb.init ( g_hostdb.m_dir ,
|
||||
//if ( this == &g_catdb )
|
||||
if ( ! m_rdb.init ( g_hostdb.m_dir ,
|
||||
"catdb" ,
|
||||
true , // dedup same keys?
|
||||
-1 , // fixed record size
|
||||
//g_hostdb.m_groupMask ,
|
||||
//g_hostdb.m_groupId ,
|
||||
g_conf.m_catdbMinFilesToMerge ,
|
||||
2,//g_conf.m_catdbMinFilesToMerge ,
|
||||
treeMem ,//g_conf.m_catdbMaxTreeMem ,
|
||||
maxTreeNodes ,
|
||||
// now we balance so Sync.cpp can ordered huge list
|
||||
@ -70,9 +74,17 @@ bool Catdb::init ( ) {
|
||||
&m_pc ,
|
||||
false,
|
||||
false,
|
||||
12,
|
||||
12, // keysize
|
||||
false,
|
||||
true ); // is collectionless?
|
||||
true )) // is collectionless?
|
||||
return false;
|
||||
|
||||
// normally Collectiondb.addColl() will call Rdb::addColl() which
|
||||
// will init the CollectionRec::m_rdbBase, which is what
|
||||
// Rdb::getBase(collnum_t) will return. however, for collectionless
|
||||
// rdb databases we set Rdb::m_collectionlessBase special here.
|
||||
// This is in Rdb.cpp::init() now.
|
||||
//return m_rdb.addColl ( NULL );
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -119,7 +131,7 @@ bool Catdb::verify ( char *coll ) {
|
||||
g_threads.disableThreads();
|
||||
|
||||
Msg5 msg5;
|
||||
Msg5 msg5b;
|
||||
//Msg5 msg5b;
|
||||
RdbList list;
|
||||
key_t startKey;
|
||||
key_t endKey;
|
||||
@ -128,7 +140,7 @@ bool Catdb::verify ( char *coll ) {
|
||||
//long minRecSizes = 64000;
|
||||
|
||||
if ( ! msg5.getList ( RDB_CATDB ,
|
||||
coll ,
|
||||
"",//coll ,
|
||||
&list ,
|
||||
startKey ,
|
||||
endKey ,
|
||||
@ -147,7 +159,7 @@ bool Catdb::verify ( char *coll ) {
|
||||
-1 ,
|
||||
true ,
|
||||
-1LL ,
|
||||
&msg5b ,
|
||||
NULL,//&msg5b ,
|
||||
true )) {
|
||||
g_threads.enableThreads();
|
||||
return log("db: HEY! it did not block");
|
||||
@ -311,6 +323,19 @@ void Catdb::listSearch ( RdbList *list,
|
||||
// for small lists, just loop through the list
|
||||
if (list->getListSize() < 16*1024) {
|
||||
while ( ! list->isExhausted() ) {
|
||||
// for debug!
|
||||
/*
|
||||
CatRec crec;
|
||||
crec.set ( NULL,
|
||||
list->getCurrentData(),
|
||||
list->getCurrentDataSize(),
|
||||
false);
|
||||
log("catdb: caturl=%s #catid=%li version=%li"
|
||||
,crec.m_url
|
||||
,(long)crec.m_numCatids
|
||||
,(long)crec.m_version
|
||||
);
|
||||
*/
|
||||
// check the current key
|
||||
if ( list->getCurrentKey() != exactKey ) {
|
||||
// miss, next
|
||||
|
360
Categories.cpp
360
Categories.cpp
@ -50,6 +50,7 @@ void Categories::reset() {
|
||||
}
|
||||
}
|
||||
|
||||
// filename usually ./catdb/gbdmoz.structure.dat
|
||||
long Categories::loadCategories ( char *filename ) {
|
||||
//ifstream inStream;
|
||||
int inStream;
|
||||
@ -69,6 +70,7 @@ long Categories::loadCategories ( char *filename ) {
|
||||
return 1;
|
||||
}
|
||||
// read in the number of cats
|
||||
// filename usually ./catdb/gbdmoz.structure.dat
|
||||
if ( fileRead ( inStream, &m_numCats, sizeof(long) ) != sizeof(long) ) {
|
||||
log("cat: Error reading structure file: %s", filename);
|
||||
close(inStream);
|
||||
@ -114,7 +116,8 @@ long Categories::loadCategories ( char *filename ) {
|
||||
g_errno = ENOMEM;
|
||||
return 1;
|
||||
}
|
||||
// read the rest of the file into the temp buffer
|
||||
// . read the rest of the file into the temp buffer
|
||||
// . filename usually ./catdb/gbdmoz.structure.dat
|
||||
if ( fileRead ( inStream, tempBuffer, readSize ) != readSize ) {
|
||||
log("cat: Error reading structure file: %s", filename);
|
||||
close(inStream);
|
||||
@ -212,6 +215,15 @@ long Categories::loadCategories ( char *filename ) {
|
||||
long long start = gettimeofdayInMilliseconds();
|
||||
// sort the category hash by hash value
|
||||
gbsort(m_catHash, m_numCats, sizeof(CategoryHash), sortCatHash);
|
||||
|
||||
// sanity check - no dups allowed
|
||||
unsigned long last = 0xffffffff;
|
||||
for ( long i = 0 ; i < m_numCats ; i++ ) {
|
||||
if ( m_catHash[i].m_hash == last )
|
||||
log("dmoz: hash collision on %lu",last);
|
||||
last = m_catHash[i].m_hash;
|
||||
}
|
||||
|
||||
// time it
|
||||
long long took = gettimeofdayInMilliseconds();
|
||||
if ( took - start > 100 ) log(LOG_INIT,"admin: Took %lli ms to "
|
||||
@ -327,9 +339,15 @@ long Categories::getIndexFromPath ( char *str, long strLen ) {
|
||||
// check for top
|
||||
if (strLen == 3 &&
|
||||
strncasecmp(str, "Top", 3) == 0)
|
||||
// it is catid 2 right? but i guess zero is symbolic for us!
|
||||
return 0;
|
||||
// get the hash
|
||||
unsigned long hash = hash32Lower_a(str, strLen, 0);
|
||||
// debug
|
||||
//char c = str[strLen];
|
||||
//str[strLen] = '\0';
|
||||
//log("dmoz: looking up hash %lu for %s",hash,str);
|
||||
//str[strLen] = c;
|
||||
// binary search
|
||||
while (low <= high) {
|
||||
// next check spot
|
||||
@ -349,6 +367,7 @@ long Categories::getIndexFromPath ( char *str, long strLen ) {
|
||||
|
||||
// return the catid from the given path
|
||||
long Categories::getIdFromPath ( char *str, long strLen ) {
|
||||
if ( ! m_cats ) return -1;
|
||||
long index = getIndexFromPath(str, strLen);
|
||||
return m_cats[index].m_catid;
|
||||
}
|
||||
@ -497,7 +516,7 @@ void Categories::printPathFromId ( SafeBuf *sb ,
|
||||
long catIndex;
|
||||
// get the index
|
||||
catIndex = getIndexFromId(catid);
|
||||
if (catIndex < 1) return;
|
||||
//if (catIndex < 1) return;
|
||||
printPathFromIndex(sb, catIndex, raw, isRTL);
|
||||
}
|
||||
|
||||
@ -509,8 +528,22 @@ void Categories::printPathFromIndex ( SafeBuf *sb ,
|
||||
if (catIndex < 1) return;
|
||||
// get the parent
|
||||
parentId = m_cats[catIndex].m_parentid;
|
||||
// print the parent(s) first
|
||||
if (parentId > 1) {
|
||||
long catid = m_cats[catIndex].m_catid;
|
||||
|
||||
// include Top now. in newer dmoz it is catid2.
|
||||
//if ( catid == 2 ) {
|
||||
// sb->safePrintf("Top");
|
||||
// return;
|
||||
//}
|
||||
|
||||
// . print the parent(s) first
|
||||
// . the new dmoz data dumps signify a parentless topic by
|
||||
// havings its parentid equal its catid, so avoid infinite
|
||||
// loops by checking for that here now. mdw oct 2013.
|
||||
// . the new DMOZ has Top has catid 2 now, even though it is
|
||||
// mistakenly labelled as Top/World, which is really catid 3.
|
||||
// so make this parentId > 2...
|
||||
if (parentId >= 1 && parentId != catid ) {
|
||||
bool isParentRTL = isIdRTLStart(parentId);
|
||||
// print spacing here if RTL
|
||||
//if (isRTL && !raw)
|
||||
@ -558,7 +591,7 @@ void Categories::printPathCrumbFromId ( SafeBuf *sb ,
|
||||
long catIndex;
|
||||
// get the index
|
||||
catIndex = getIndexFromId(catid);
|
||||
if (catIndex < 1) return;
|
||||
//if (catIndex < 1) return;
|
||||
printPathCrumbFromIndex(sb, catIndex, isRTL);
|
||||
}
|
||||
|
||||
@ -569,8 +602,20 @@ void Categories::printPathCrumbFromIndex ( SafeBuf *sb,
|
||||
if (catIndex < 1) return;
|
||||
// get the parent
|
||||
parentId = m_cats[catIndex].m_parentid;
|
||||
// print the parent(s) first
|
||||
if (parentId > 1) {
|
||||
long catid = m_cats[catIndex].m_catid;
|
||||
|
||||
// include Top now. in newer dmoz it is catid2.
|
||||
// seems to already be included below... because you made it
|
||||
// parentId>1 not parentId>2
|
||||
//if ( catid == 2 ) {
|
||||
// sb->safePrintf("Top");
|
||||
// return;
|
||||
//}
|
||||
|
||||
// . print the parent(s) first
|
||||
// . the new dmoz has Top has parentid 2 now, and Top/World is
|
||||
// catid 3. so make this parentId > 2 not parentId > 1
|
||||
if (parentId > 1 && parentId != catid ) {
|
||||
bool isParentRTL = isIdRTLStart(parentId);
|
||||
printPathCrumbFromId(sb, parentId, isRTL);
|
||||
// print a spacing
|
||||
@ -793,7 +838,7 @@ long Categories::fixUrl ( char *url, long urlLen ) {
|
||||
return newUrlLen;
|
||||
}
|
||||
|
||||
bool Categories::addUrlsToBadHashTable ( long catid ) {
|
||||
bool Categories::addUrlsToBadHashTable ( long catid ) {
|
||||
return getTitleAndSummary ( NULL , // urlorig
|
||||
0 , // urloriglen
|
||||
catid ,
|
||||
@ -810,6 +855,183 @@ long Categories::fixUrl ( char *url, long urlLen ) {
|
||||
true );// just add to table
|
||||
}
|
||||
|
||||
// just show the urls in dmoz
|
||||
bool Categories::printUrlsInTopic ( SafeBuf *sb, long catid ) {
|
||||
long catIndex;
|
||||
unsigned long fileOffset;
|
||||
unsigned long n;
|
||||
char* p;
|
||||
unsigned long readSize;
|
||||
char title[1024];
|
||||
char summ[5000];
|
||||
long maxTitleLen = 1024;
|
||||
long maxSummLen = 5000;
|
||||
long titleLen;
|
||||
long summLen;
|
||||
long urlStrLen;
|
||||
char urlStr[MAX_URL_LEN];
|
||||
long niceness = 0;
|
||||
bool printedStart = false;
|
||||
|
||||
// lookup the index for this catid
|
||||
catIndex = getIndexFromId(catid);
|
||||
if (catIndex < 0)
|
||||
goto errEnd;
|
||||
// get the file offset
|
||||
fileOffset = m_cats[catIndex].m_contentOffset;
|
||||
|
||||
QUICKPOLL( niceness );
|
||||
|
||||
// . open the file
|
||||
char filename[512];
|
||||
sprintf(filename, "%scatdb/%s", g_hostdb.m_dir, RDFCONTENT_FILE);
|
||||
m_rdfStream = open(filename, O_RDONLY | O_NONBLOCK);
|
||||
if ( m_rdfStream < 0 ) {
|
||||
log("cat: Error Opening %s\n", filename);
|
||||
goto errEnd;
|
||||
}
|
||||
// . seek to the offset
|
||||
n = lseek ( m_rdfStream, fileOffset, SEEK_SET );
|
||||
if ( n != fileOffset ) {
|
||||
log("cat: Error seeking to Content Offset %li", fileOffset);
|
||||
goto errEnd;
|
||||
}
|
||||
// . read in a chunk
|
||||
m_rdfBuffer = m_rdfSmallBuffer;
|
||||
m_rdfBufferSize = RDFSMALLBUFFER_SIZE;
|
||||
|
||||
p = m_rdfBuffer;
|
||||
readSize = m_rdfBufferSize;
|
||||
readLoop:
|
||||
n = read ( m_rdfStream, p, readSize );
|
||||
if(n > 0 && n != readSize) {
|
||||
p += n;
|
||||
readSize -= n;
|
||||
}
|
||||
//log(LOG_WARN,"build: reading %li bytes out of %li",n,m_rdfBufferSize);
|
||||
QUICKPOLL(niceness);
|
||||
|
||||
if(n < 0 && errno == EAGAIN) goto readLoop;
|
||||
|
||||
if ( n <= 0 || n > (unsigned long)m_rdfBufferSize ) {
|
||||
log("cat: Error Reading Content");
|
||||
goto errEnd;
|
||||
}
|
||||
m_rdfPtr = m_rdfBuffer;
|
||||
m_rdfEnd = &m_rdfBuffer[n];
|
||||
m_currOffset = fileOffset;
|
||||
// . parse to the correct url
|
||||
// parse the first topic and catid
|
||||
if (rdfNextTag() < 0)
|
||||
goto errEnd;
|
||||
if (rdfNextTag() < 0)
|
||||
goto errEnd;
|
||||
// parse until "ExternalPage"
|
||||
nextTag:
|
||||
QUICKPOLL((niceness));
|
||||
if (rdfNextTag() < 0)
|
||||
goto errEnd;
|
||||
// check for catid of next topic to stop looking
|
||||
if (m_tagLen == 5 &&
|
||||
strncmp(m_tagRecfer, "catid", 5) == 0)
|
||||
goto errEnd;
|
||||
if (m_tagLen != 12 ) goto nextTag;
|
||||
if ( strncmp(m_tagRecfer, "ExternalPage", 12) != 0) goto nextTag;
|
||||
|
||||
//
|
||||
// got one
|
||||
//
|
||||
|
||||
// get the next string
|
||||
urlStrLen = fillNextString(urlStr, MAX_URL_LEN-1);
|
||||
if (urlStrLen < 0)
|
||||
goto errEnd;
|
||||
|
||||
// html decode the url
|
||||
/*
|
||||
urlStrLen = htmlDecode(decodedUrl, urlStr, urlStrLen,false,
|
||||
niceness);
|
||||
memcpy(urlStr, decodedUrl, urlStrLen);
|
||||
|
||||
normUrl.set(urlStr, urlStrLen, true);
|
||||
g_catdb.normalizeUrl(&normUrl, &normUrl);
|
||||
// copy it back
|
||||
urlStrLen = normUrl.getUrlLen();
|
||||
memcpy(urlStr, normUrl.getUrl(), urlStrLen);
|
||||
// make sure there's a trailing / on root urls
|
||||
// and no www.
|
||||
//urlStrLen = fixUrl(urlStr, urlStrLen);
|
||||
// check for an anchor
|
||||
urlAnchor = NULL;
|
||||
urlAnchorLen = 0;
|
||||
//for (long i = 0; i < urlStrLen; i++) {
|
||||
//if (urlStr[i] == '#') {
|
||||
if (normUrl.getAnchorLen() > 0) {
|
||||
//urlAnchor = &urlStr[i];
|
||||
//urlAnchorLen = urlStrLen - i;
|
||||
//urlStrLen = i;
|
||||
urlAnchor = normUrl.getAnchor();
|
||||
urlAnchorLen = normUrl.getAnchorLen();
|
||||
//break;
|
||||
}
|
||||
*/
|
||||
|
||||
// . parse out the title
|
||||
if (rdfParse("d:Title") < 0)
|
||||
goto errEnd;
|
||||
|
||||
titleLen = fillNextTagBody(title, maxTitleLen);
|
||||
|
||||
QUICKPOLL(niceness);
|
||||
|
||||
// . parse out the summary
|
||||
if (rdfParse("d:Description") < 0)
|
||||
goto errEnd;
|
||||
|
||||
summLen = fillNextTagBody(summ, maxSummLen);
|
||||
|
||||
if ( ! printedStart ) {
|
||||
printedStart = true;
|
||||
sb->safePrintf("<ul>");
|
||||
}
|
||||
|
||||
// print it out
|
||||
sb->safePrintf("<li><a href=\"");
|
||||
sb->safeMemcpy ( urlStr , urlStrLen );
|
||||
sb->safePrintf("\">");
|
||||
sb->safeMemcpy ( title , titleLen );
|
||||
sb->safePrintf("</a><br>");
|
||||
sb->safeMemcpy( summ, summLen );
|
||||
sb->safePrintf("<br>");//<br>");
|
||||
|
||||
|
||||
/*
|
||||
// . fill the anchor
|
||||
if (anchor) {
|
||||
if (urlAnchor) {
|
||||
if (urlAnchorLen > maxAnchorLen)
|
||||
urlAnchorLen = maxAnchorLen;
|
||||
memcpy(anchor, urlAnchor, urlAnchorLen);
|
||||
*anchorLen = urlAnchorLen;
|
||||
}
|
||||
else
|
||||
*anchorLen = 0;
|
||||
}
|
||||
*/
|
||||
|
||||
// DO NEXT tag
|
||||
goto nextTag;
|
||||
|
||||
errEnd:
|
||||
|
||||
sb->safePrintf("</ul>");
|
||||
|
||||
close(m_rdfStream);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// . get the title and summary for a specific url
|
||||
// and catid
|
||||
bool Categories::getTitleAndSummary ( char *urlOrig,
|
||||
@ -857,7 +1079,7 @@ bool Categories::getTitleAndSummary ( char *urlOrig,
|
||||
|
||||
// . open the file
|
||||
char filename[512];
|
||||
sprintf(filename, "%scat/%s", g_hostdb.m_dir, RDFCONTENT_FILE);
|
||||
sprintf(filename, "%scatdb/%s", g_hostdb.m_dir, RDFCONTENT_FILE);
|
||||
//m_rdfStream.clear();
|
||||
//m_rdfStream.open(filename, ifstream::in);
|
||||
m_rdfStream = open(filename, O_RDONLY | O_NONBLOCK);
|
||||
@ -1011,13 +1233,17 @@ errEnd:
|
||||
return false;
|
||||
}
|
||||
|
||||
// generate sub categories for a given catid
|
||||
// . generate sub categories for a given catid
|
||||
// . store list of SubCategories into "subCatBuf" return # stored
|
||||
long Categories::generateSubCats ( long catid,
|
||||
SubCategory *subCats,
|
||||
char **catBuffer,
|
||||
long *catBufferSize,
|
||||
long *catBufferLen,
|
||||
bool allowRealloc ) {
|
||||
SafeBuf *subCatBuf
|
||||
//SubCategory *subCats,
|
||||
//char **catBuffer,
|
||||
//long *catBufferSize,
|
||||
//long *catBufferLen,
|
||||
//bool allowRealloc
|
||||
) {
|
||||
|
||||
long catIndex;
|
||||
unsigned long fileOffset;
|
||||
unsigned long n;
|
||||
@ -1029,17 +1255,24 @@ long Categories::generateSubCats ( long catid,
|
||||
long prefixLen;
|
||||
long nameStart;
|
||||
long nameLen;
|
||||
long catp = 0;
|
||||
long catBufferInc = *catBufferSize;
|
||||
// lookup the index for this catid
|
||||
long need ;
|
||||
SubCategory *cat;
|
||||
char *p ;
|
||||
|
||||
//long catp = 0;
|
||||
//long catBufferInc = *catBufferSize;
|
||||
// . lookup the index for this catid
|
||||
// . binary step, guessing to approximate place
|
||||
// and then scanning from there
|
||||
catIndex = getIndexFromId(catid);
|
||||
if (catIndex < 0)
|
||||
goto errEnd;
|
||||
// get the file offset
|
||||
fileOffset = m_cats[catIndex].m_structureOffset;
|
||||
// open the structure file
|
||||
// catdb/structure.rdf.u8 in utf8
|
||||
char filename[512];
|
||||
sprintf(filename, "%scat/%s", g_hostdb.m_dir, RDFSTRUCTURE_FILE);
|
||||
sprintf(filename, "%scatdb/%s", g_hostdb.m_dir, RDFSTRUCTURE_FILE);
|
||||
//m_rdfStream.clear();
|
||||
//m_rdfStream.open(filename, ifstream::in);
|
||||
m_rdfStream = open(filename, O_RDONLY);
|
||||
@ -1066,12 +1299,16 @@ long Categories::generateSubCats ( long catid,
|
||||
log("cat: Error Reading Structure Offset");
|
||||
goto errEnd;
|
||||
}
|
||||
// point to the buffer we just read with m_rdfPtr
|
||||
m_rdfPtr = m_rdfBuffer;
|
||||
m_rdfEnd = &m_rdfBuffer[n];
|
||||
m_currOffset = fileOffset;
|
||||
|
||||
// parse tags for the sub categories or until we hit /Topic
|
||||
nextTag:
|
||||
// . this increments m_rdfPtr until it points to the beginning of a tag
|
||||
// . it may end up reading another chunk from disk
|
||||
// . it memcopies m_tagRecfer to be the name of the tag it points to
|
||||
if (rdfNextTag() < 0)
|
||||
goto gotSubCats;
|
||||
// check for /Topic
|
||||
@ -1121,6 +1358,9 @@ nextTag:
|
||||
false,
|
||||
0);
|
||||
memcpy(catStr, htmlDecoded, catStrLen);
|
||||
// reset this offset
|
||||
nameStart = 0;
|
||||
nameLen = catStrLen;
|
||||
// get the prefix and name position/length
|
||||
switch (currType) {
|
||||
case SUBCAT_ALTLANG:
|
||||
@ -1130,14 +1370,14 @@ nextTag:
|
||||
// prefix is at the start
|
||||
prefixStart = 0;
|
||||
prefixLen = 0;
|
||||
nameStart = 0;
|
||||
//nameStart = 0;
|
||||
// go to the end of the prefix
|
||||
while (catStr[nameStart] != ':') {
|
||||
nameStart++;
|
||||
prefixLen++;
|
||||
}
|
||||
// skip the :Top/
|
||||
nameStart += 5;
|
||||
// skip the : in :Top/
|
||||
nameStart += 1;
|
||||
nameLen = catStrLen - nameStart;
|
||||
break;
|
||||
case SUBCAT_LETTERBAR:
|
||||
@ -1145,9 +1385,9 @@ nextTag:
|
||||
prefixStart = catStrLen - 1;
|
||||
prefixLen = 1;
|
||||
// skip the Top/ for the name
|
||||
nameStart = 4;
|
||||
//nameStart = 4;
|
||||
// lose the Top/, keep the end letter
|
||||
nameLen = catStrLen - 4;
|
||||
//nameLen = catStrLen - 4;
|
||||
break;
|
||||
// . don't do this because of ltr?
|
||||
//case SUBCAT_RELATED:
|
||||
@ -1167,43 +1407,56 @@ nextTag:
|
||||
prefixStart--;
|
||||
prefixLen++;
|
||||
}
|
||||
// name skips Top/
|
||||
nameStart = 4;
|
||||
nameLen = catStrLen - 4;
|
||||
// name skips Top/ ... no! we include Top now
|
||||
// because we need it so PageResults.cpp can call
|
||||
// currIndex=g_categories->getIndexFromPath(catName,catNameLen)
|
||||
// on this name, and it needs "Top/" because it was part
|
||||
// of the hash of the full name for the category now.
|
||||
// and we lookup the Category record by that hash
|
||||
// in getIndexFromPath().
|
||||
//nameStart = 4;
|
||||
//nameLen = catStrLen - 4;
|
||||
break;
|
||||
}
|
||||
// . fill the next sub category
|
||||
if (catp + prefixLen + nameLen >= *catBufferSize) {
|
||||
if (!allowRealloc)
|
||||
goto gotSubCats;
|
||||
// realloc the buffer
|
||||
char *re_catBuffer = (char*)mrealloc ( *catBuffer,
|
||||
*catBufferSize,
|
||||
*catBufferSize+catBufferInc,
|
||||
"Categories" );
|
||||
if (!re_catBuffer) {
|
||||
log ( "Could not allocate %li bytes for catBuffer",
|
||||
*catBufferSize+catBufferInc );
|
||||
g_errno = ENOMEM;
|
||||
goto errEnd;
|
||||
}
|
||||
*catBuffer = re_catBuffer;
|
||||
*catBufferSize += catBufferInc;
|
||||
}
|
||||
// fill the prefix and name in the buffer and subcat
|
||||
// . fill the prefix and name in the buffer and subcat
|
||||
need = sizeof(SubCategory) + prefixLen + 1 + nameLen + 1;
|
||||
|
||||
// reserve space in safebuf for it
|
||||
if ( ! subCatBuf->reserve(need) ) goto errEnd;
|
||||
|
||||
// point to it in safebuf
|
||||
cat = (SubCategory *)(subCatBuf->getBuf());
|
||||
|
||||
cat->m_prefixLen = prefixLen;
|
||||
cat->m_nameLen = nameLen;
|
||||
cat->m_type = currType;
|
||||
p = cat->m_buf;
|
||||
memcpy ( p , catStr + prefixStart , prefixLen );
|
||||
p += prefixLen;
|
||||
*p++ = '\0';
|
||||
memcpy ( p , catStr + nameStart , nameLen );
|
||||
p += nameLen;
|
||||
*p++ = '\0';
|
||||
|
||||
// update safebuf length
|
||||
subCatBuf->incrementLength ( cat->getRecSize() );
|
||||
|
||||
/*
|
||||
subCats[numSubCats].m_prefixOffset = catp;
|
||||
subCats[numSubCats].m_prefixLen = prefixLen;
|
||||
if (prefixLen > 0) {
|
||||
memcpy(&((*catBuffer)[catp]), &catStr[prefixStart], prefixLen);
|
||||
catp += prefixLen;
|
||||
}
|
||||
subCats[numSubCats].m_nameOffset = catp;
|
||||
subCats[numSubCats].m_nameOffset = catBuf->length();//catp;
|
||||
subCats[numSubCats].m_nameLen = nameLen;
|
||||
if (nameLen > 0) {
|
||||
memcpy(&((*catBuffer)[catp]), &catStr[nameStart], nameLen);
|
||||
catp += nameLen;
|
||||
}
|
||||
subCats[numSubCats].m_type = currType;
|
||||
*/
|
||||
// next sub cat
|
||||
numSubCats++;
|
||||
if (numSubCats >= MAX_SUB_CATS) {
|
||||
@ -1214,14 +1467,14 @@ nextTag:
|
||||
// next tag
|
||||
goto nextTag;
|
||||
gotSubCats:
|
||||
*catBufferLen = catp;
|
||||
//*catBufferLen = catp;
|
||||
//m_rdfStream.close();
|
||||
//m_rdfStream.clear();
|
||||
close(m_rdfStream);
|
||||
return numSubCats;
|
||||
|
||||
errEnd:
|
||||
*catBufferLen = 0;
|
||||
//*catBufferLen = 0;
|
||||
//m_rdfStream.close();
|
||||
//m_rdfStream.clear();
|
||||
close(m_rdfStream);
|
||||
@ -1259,8 +1512,13 @@ long Categories::createDirSearchRequest ( char *requestBuf,
|
||||
char *rrr = r->m_reqBuf.getBufStart();
|
||||
if ( rrr && rrr[0] == 'Z' ) cmd = "ZET";
|
||||
// request
|
||||
p += sprintf(p, "%s /search?dir=%li&dr=0&sc=0&sdir=%li&sdirt=0&c=",
|
||||
cmd, catid, catid);
|
||||
//p += sprintf(p, "%s /search?dir=%li&dr=0&sc=0&sdir=%li&sdirt=0&c=",
|
||||
// cmd, catid, catid);
|
||||
p += sprintf(p,
|
||||
"%s /search?q=gbcatid%%3A%li&dir=%li&dr=0&sc=0&c="
|
||||
, cmd
|
||||
, catid
|
||||
, catid);
|
||||
// coll
|
||||
memcpy(p, coll, collLen);
|
||||
p += collLen;
|
||||
@ -1314,7 +1572,7 @@ bool Categories::loadLangTables(void) {
|
||||
unsigned long entries = 0L;
|
||||
char *cp;
|
||||
char *cpEnd = line + 10239;
|
||||
if(!(content = fopen("cat/content.rdf.u8", "r"))) {
|
||||
if(!(content = fopen("catdb/content.rdf.u8", "r"))) {
|
||||
log(LOG_INFO, "cat: could not open content file.\n");
|
||||
return(false);
|
||||
}
|
||||
|
24
Categories.h
24
Categories.h
@ -26,7 +26,7 @@
|
||||
#define MAX_TAG_LEN 127
|
||||
#define MAX_URL_CATIDS 64
|
||||
#define MAX_URLTXT_SIZE 500000
|
||||
#define MAX_CATIDS 64
|
||||
#define MAX_CATIDS 96
|
||||
#define MAX_CATNAME_LEN 1024
|
||||
|
||||
#define HASHTABLE_SIZE (1024*1024)
|
||||
@ -61,11 +61,15 @@ struct CategoryHash {
|
||||
};
|
||||
|
||||
struct SubCategory {
|
||||
long m_prefixOffset;
|
||||
//long m_prefixOffset;
|
||||
long m_prefixLen;
|
||||
long m_nameOffset;
|
||||
//long m_nameOffset;
|
||||
long m_nameLen;
|
||||
char m_type;
|
||||
long getRecSize () { return sizeof(SubCategory)+m_prefixLen+m_nameLen+2;};
|
||||
char *getPrefix() { return m_buf; };
|
||||
char *getName () { return m_buf+m_prefixLen+1;};
|
||||
char m_buf[0];
|
||||
};
|
||||
|
||||
class Categories {
|
||||
@ -133,6 +137,8 @@ public:
|
||||
long catid,
|
||||
bool isRTL = false );
|
||||
|
||||
bool printUrlsInTopic ( class SafeBuf *sb , long catid ) ;
|
||||
|
||||
// . get the title and summary for a specific url
|
||||
// and catid
|
||||
bool getTitleAndSummary ( char *url,
|
||||
@ -153,15 +159,13 @@ public:
|
||||
// normalize a url string
|
||||
long fixUrl ( char *url, long urlLen );
|
||||
|
||||
// generate sub categories for a given catid
|
||||
long generateSubCats ( long catid,
|
||||
SubCategory *subCats,
|
||||
char **catBuffer,
|
||||
long *catBufferSize,
|
||||
long *catBufferLen,
|
||||
bool allowRealloc = true );
|
||||
// . generate sub categories for a given catid
|
||||
// . store list of SubCategories into "subCatBuf" return # stored
|
||||
// . hits disk without using threads... so kinda sucks...
|
||||
long generateSubCats ( long catid, SafeBuf *subCatBuf );
|
||||
|
||||
long getNumUrlsFromIndex ( long catIndex ) {
|
||||
if ( ! m_cats ) return 0;
|
||||
return m_cats[catIndex].m_numUrls; };
|
||||
|
||||
// creates a directory search request url
|
||||
|
2
Conf.h
2
Conf.h
@ -164,7 +164,7 @@ class Conf {
|
||||
long m_catdbMaxTreeMem;
|
||||
long m_catdbMaxDiskPageCacheMem;
|
||||
long m_catdbMaxCacheMem;
|
||||
long m_catdbMinFilesToMerge;
|
||||
//long m_catdbMinFilesToMerge;
|
||||
|
||||
long m_revdbMaxTreeMem;
|
||||
long m_timedbMaxTreeMem;
|
||||
|
@ -2359,6 +2359,9 @@ uint32_t Hostdb::getShardNum ( char rdbId,void *k,bool split ) {
|
||||
//else if ( rdbId == RDB_CATDB || rdbId == RDB2_CATDB2 ) {
|
||||
// return m_map [(*(uint16_t *)((char *)k + 10))>>3];
|
||||
//}
|
||||
else if ( rdbId == RDB_CATDB || rdbId == RDB2_CATDB2 ) {
|
||||
return m_map [(*(uint16_t *)((char *)k + 10))>>3];
|
||||
}
|
||||
// core -- must be provided
|
||||
char *xx = NULL; *xx = 0;
|
||||
//groupId=key.n1 & g_hostdb.m_groupMask;
|
||||
|
@ -518,6 +518,7 @@ long HttpMime::getContentTypePrivate ( char *s ) {
|
||||
else if (!strcasecmp(s,"image/jpeg" ) ) ct = CT_JPG;
|
||||
else if (!strcasecmp(s,"image/png" ) ) ct = CT_PNG;
|
||||
else if (!strcasecmp(s,"image/tiff" ) ) ct = CT_TIFF;
|
||||
else if (!strncasecmp(s,"image/",6 ) ) ct = CT_IMAGE;
|
||||
else if (!strcasecmp(s,"application/javascript" ) ) ct = CT_JS;
|
||||
else if (!strcasecmp(s,"application/x-javascript") ) ct = CT_JS;
|
||||
else if (!strcasecmp(s,"text/javascript" ) ) ct = CT_JS;
|
||||
|
@ -36,6 +36,7 @@ time_t atotime5 ( char *s ) ;
|
||||
#define CT_JS 14
|
||||
#define CT_CSS 15
|
||||
#define CT_JSON 16
|
||||
#define CT_IMAGE 17
|
||||
|
||||
#define ET_IDENTITY 0
|
||||
#define ET_GZIP 1
|
||||
|
@ -23,6 +23,7 @@ void HttpRequest::reset() {
|
||||
m_userIP = 0;
|
||||
m_isMSIE = false;
|
||||
m_reqBufValid = false;
|
||||
m_reqBuf.purge();
|
||||
|
||||
if (m_cgiBuf2) {
|
||||
mfree(m_cgiBuf2, m_cgiBuf2Size, "extraParms");
|
||||
|
6
Lang.cpp
6
Lang.cpp
@ -459,7 +459,11 @@ unsigned char getLanguageFromUserAgent(char *abbr) {
|
||||
return langUnknown;
|
||||
}
|
||||
|
||||
// these are going to be adult, in any language
|
||||
// . these are going to be adult, in any language
|
||||
// . this seems only to be used by Speller.cpp when splitting up words
|
||||
// in the url domain.
|
||||
// . s/slen is a full word that is found in our "dictionary" so using
|
||||
// phrases like biglittlestuff probably should not go here.
|
||||
bool isAdult( char *s, long slen, char **loc ) {
|
||||
char **p = NULL;
|
||||
char *a = NULL;
|
||||
|
6
Makefile
6
Makefile
@ -33,13 +33,13 @@ OBJS = Tfndb.o UdpSlot.o \
|
||||
HttpMime.o Hostdb.o \
|
||||
Highlight.o File.o Errno.o Entities.o \
|
||||
Dns.o Dir.o Conf.o Bits.o \
|
||||
Stats.o BigFile.o AdultBit.o Msg17.o \
|
||||
Stats.o BigFile.o Msg17.o \
|
||||
Speller.o DiskPageCache.o \
|
||||
PingServer.o StopWords.o TopTree.o \
|
||||
Parms.o Pages.o Msg28.o Msg30.o \
|
||||
Unicode.o iana_charset.o Iso8859.o \
|
||||
SearchInput.o \
|
||||
Categories.o Msg2a.o PageCatdb.o PageDirectory.o Msg2b.o \
|
||||
Categories.o Msg2a.o PageCatdb.o PageDirectory.o \
|
||||
SafeBuf.o Datedb.o \
|
||||
UCNormalizer.o UCPropTable.o UnicodeProperties.o \
|
||||
Pops.o Title.o Pos.o LangList.o \
|
||||
@ -99,7 +99,7 @@ endif
|
||||
# let's keep the libraries in the repo for easier bug reporting and debugging
|
||||
# in general if we can. the includes are still in /usr/include/ however...
|
||||
# which is kinda strange but seems to work so far.
|
||||
#LIBS= -L. ./libplotter.a ./libplot.a ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a ./libgcc.a ./libpthread.a ./libc.a ./libstdc++.a
|
||||
#LIBS= -L. ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a ./libgcc.a ./libpthread.a ./libc.a ./libstdc++.a
|
||||
|
||||
|
||||
|
||||
|
6
Mem.cpp
6
Mem.cpp
@ -1275,10 +1275,12 @@ mallocmemloop:
|
||||
long long avail = (long long)m_maxMem -
|
||||
(long long)m_used;
|
||||
if ( now - s_lastTime >= 1000LL ) {
|
||||
log("mem: system malloc(%i) availShouldBe=%lli: "
|
||||
log("mem: system malloc(%i,%s) availShouldBe=%lli: "
|
||||
"%s (%s) (ooms suppressed since "
|
||||
"last log msg = %li)",
|
||||
size+UNDERPAD+OVERPAD,avail,
|
||||
size+UNDERPAD+OVERPAD,
|
||||
note,
|
||||
avail,
|
||||
mstrerror(g_errno),
|
||||
note,
|
||||
s_missed);
|
||||
|
10
Msg1.cpp
10
Msg1.cpp
@ -388,6 +388,12 @@ bool Msg1::sendSomeOfList ( ) {
|
||||
if ( m_list->m_fixedDataSize != getDataSizeFromRdbId(m_rdbId) ) {
|
||||
char *xx=NULL;*xx=0; }
|
||||
|
||||
// little debug thing for genCatdb from msg9b's huge list add
|
||||
//if ( m_list->m_listSize > 10000000 )
|
||||
// log("msg1: adding chunk @ %li of %li bytes",
|
||||
// (long)(dataStart - m_list->m_list) ,
|
||||
// (long)m_list->m_listSize );
|
||||
|
||||
// . now send this list to the host
|
||||
// . this returns false if blocked, true otherwise
|
||||
// . it also sets g_errno on error
|
||||
@ -480,7 +486,9 @@ bool Msg1::sendData ( unsigned long shardNum, char *listData , long listSize) {
|
||||
if ( ! g_errno ) sendToSelf = false;
|
||||
else {
|
||||
log("rdb: msg1 had error: %s",mstrerror(g_errno));
|
||||
return true;
|
||||
// this is messing up generate catdb's huge rdblist add
|
||||
// why did we put it in there??? from msg9b.cpp
|
||||
//return true;
|
||||
}
|
||||
|
||||
QUICKPOLL(m_niceness);
|
||||
|
48
Msg2a.cpp
48
Msg2a.cpp
@ -58,9 +58,9 @@ bool Msg2a::makeCatdb( char *coll,
|
||||
char inFile[256];
|
||||
// url info (content) file
|
||||
if ( m_updateFromNew )
|
||||
sprintf(inFile, "%scat/gbdmoz.content.dat.new", g_hostdb.m_dir);
|
||||
sprintf(inFile, "%scatdb/gbdmoz.content.dat.new", g_hostdb.m_dir);
|
||||
else
|
||||
sprintf(inFile, "%scat/gbdmoz.content.dat", g_hostdb.m_dir);
|
||||
sprintf(inFile, "%scatdb/gbdmoz.content.dat", g_hostdb.m_dir);
|
||||
//m_inStream.open(inFile, ifstream::in);
|
||||
m_inStream = open(inFile, O_RDONLY);
|
||||
//if (!m_inStream.is_open()) {
|
||||
@ -118,7 +118,7 @@ bool Msg2a::makeCatdb( char *coll,
|
||||
// open the new diff file
|
||||
//ifstream diffInStream;
|
||||
int diffInStream;
|
||||
sprintf(inFile, "%scat/gbdmoz.content.dat.new.diff",
|
||||
sprintf(inFile, "%scatdb/gbdmoz.content.dat.new.diff",
|
||||
g_hostdb.m_dir);
|
||||
//diffInStream.open(inFile, ifstream::in);
|
||||
diffInStream = open(inFile, O_RDONLY);
|
||||
@ -328,6 +328,12 @@ bool Msg2a::makeCatdb( char *coll,
|
||||
// null terminate
|
||||
m_urls[urlp] = '\0';
|
||||
currUrl++;
|
||||
// debug
|
||||
//SafeBuf sb;
|
||||
//sb.safeMemcpy(&m_urls[urlp-urlLen],urlLen);
|
||||
//sb.nullTerm();
|
||||
//log("gencat: url=%s",sb.getBufStart());
|
||||
|
||||
}
|
||||
log(LOG_INFO, "db: Wrote %li urls to update (%li)\n",
|
||||
currUrl - m_numRemoveUrls, m_numUpdateIndexes);
|
||||
@ -581,9 +587,9 @@ void handleRequest2a ( UdpSlot *slot, long netnice ) {
|
||||
otherCategories = &g_categories1;
|
||||
// load the new file
|
||||
if ( updateFromNew )
|
||||
sprintf(buff, "%scat/gbdmoz.structure.dat.new", g_hostdb.m_dir);
|
||||
sprintf(buff, "%scatdb/gbdmoz.structure.dat.new", g_hostdb.m_dir);
|
||||
else
|
||||
sprintf(buff, "%scat/gbdmoz.structure.dat", g_hostdb.m_dir);
|
||||
sprintf(buff, "%scatdb/gbdmoz.structure.dat", g_hostdb.m_dir);
|
||||
if (otherCategories->loadCategories(buff) != 0) {
|
||||
log("db: Loading Categories From %s Failed", buff);
|
||||
// send error reply
|
||||
@ -605,51 +611,51 @@ void handleRequest2a ( UdpSlot *slot, long netnice ) {
|
||||
}
|
||||
|
||||
// move the current files to .old
|
||||
sprintf(buff, "mv %scat/content.rdf.u8 %scat/content.rdf.u8.old",
|
||||
sprintf(buff, "mv %scatdb/content.rdf.u8 %scatdb/content.rdf.u8.old",
|
||||
g_hostdb.m_dir, g_hostdb.m_dir);
|
||||
log ( LOG_INFO, "%s", buff);
|
||||
system ( buff );
|
||||
sprintf(buff, "mv %scat/structure.rdf.u8 %scat/structure.rdf.u8.old",
|
||||
sprintf(buff, "mv %scatdb/structure.rdf.u8 %scatdb/structure.rdf.u8.old",
|
||||
g_hostdb.m_dir, g_hostdb.m_dir);
|
||||
log ( LOG_INFO, "%s", buff);
|
||||
system ( buff );
|
||||
sprintf(buff, "mv %scat/gbdmoz.content.dat "
|
||||
"%scat/gbdmoz.content.dat.old",
|
||||
sprintf(buff, "mv %scatdb/gbdmoz.content.dat "
|
||||
"%scatdb/gbdmoz.content.dat.old",
|
||||
g_hostdb.m_dir, g_hostdb.m_dir);
|
||||
log ( LOG_INFO, "%s", buff);
|
||||
system ( buff );
|
||||
sprintf(buff, "mv %scat/gbdmoz.structure.dat "
|
||||
"%scat/gbdmoz.structure.dat.old",
|
||||
sprintf(buff, "mv %scatdb/gbdmoz.structure.dat "
|
||||
"%scatdb/gbdmoz.structure.dat.old",
|
||||
g_hostdb.m_dir, g_hostdb.m_dir);
|
||||
log ( LOG_INFO, "%s", buff);
|
||||
system ( buff );
|
||||
sprintf(buff, "mv %scat/gbdmoz.content.dat.diff "
|
||||
"%scat/gbdmoz.content.dat.diff.old",
|
||||
sprintf(buff, "mv %scatdb/gbdmoz.content.dat.diff "
|
||||
"%scatdb/gbdmoz.content.dat.diff.old",
|
||||
g_hostdb.m_dir, g_hostdb.m_dir);
|
||||
log ( LOG_INFO, "%s", buff);
|
||||
system ( buff );
|
||||
|
||||
// move the .new files to current
|
||||
sprintf(buff, "mv %scat/content.rdf.u8.new %scat/content.rdf.u8",
|
||||
sprintf(buff, "mv %scatdb/content.rdf.u8.new %scatdb/content.rdf.u8",
|
||||
g_hostdb.m_dir, g_hostdb.m_dir);
|
||||
log ( LOG_INFO, "%s", buff);
|
||||
system ( buff );
|
||||
sprintf(buff, "mv %scat/structure.rdf.u8.new %scat/structure.rdf.u8",
|
||||
sprintf(buff, "mv %scatdb/structure.rdf.u8.new %scatdb/structure.rdf.u8",
|
||||
g_hostdb.m_dir, g_hostdb.m_dir);
|
||||
log ( LOG_INFO, "%s", buff);
|
||||
system ( buff );
|
||||
sprintf(buff, "mv %scat/gbdmoz.content.dat.new "
|
||||
"%scat/gbdmoz.content.dat",
|
||||
sprintf(buff, "mv %scatdb/gbdmoz.content.dat.new "
|
||||
"%scatdb/gbdmoz.content.dat",
|
||||
g_hostdb.m_dir, g_hostdb.m_dir);
|
||||
log ( LOG_INFO, "%s", buff);
|
||||
system ( buff );
|
||||
sprintf(buff, "mv %scat/gbdmoz.structure.dat.new "
|
||||
"%scat/gbdmoz.structure.dat",
|
||||
sprintf(buff, "mv %scatdb/gbdmoz.structure.dat.new "
|
||||
"%scatdb/gbdmoz.structure.dat",
|
||||
g_hostdb.m_dir, g_hostdb.m_dir);
|
||||
log ( LOG_INFO, "%s", buff);
|
||||
system ( buff );
|
||||
//sprintf(buff, "mv %scat/gbdmoz.content.dat.new.diff "
|
||||
// "%scat/gbdmoz.content.dat.diff",
|
||||
//sprintf(buff, "mv %scatdb/gbdmoz.content.dat.new.diff "
|
||||
// "%scatdb/gbdmoz.content.dat.diff",
|
||||
// g_hostdb.m_dir, g_hostdb.m_dir);
|
||||
//log ( LOG_INFO, "%s", buff);
|
||||
//system ( buff );
|
||||
|
34
Msg40.cpp
34
Msg40.cpp
@ -148,6 +148,10 @@ bool Msg40::getResults ( SearchInput *si ,
|
||||
// we need this info for caching as well
|
||||
//m_numGigabitInfos = 0;
|
||||
|
||||
|
||||
//just get from searchinput
|
||||
//.... m_catId = hr->getLong("catid",0);m_si->m_catId;
|
||||
|
||||
m_postQueryRerank.set1( this, si );
|
||||
|
||||
// get the collection rec
|
||||
@ -680,6 +684,20 @@ bool Msg40::gotDocIds ( ) {
|
||||
// if ( ! m_msg1a.generateReferences(m_si,(void*)this,didTaskWrapper) )
|
||||
// m_tasksRemaining++;
|
||||
|
||||
|
||||
//
|
||||
// call Msg2b to generate directory
|
||||
//
|
||||
// why is this here? it does not depend on the docids. (mdw 9/25/13)
|
||||
// dissect it and fix it!!
|
||||
//
|
||||
//if ( m_si->m_catId &&
|
||||
// ! m_msg2b.generateDirectory ( m_si->m_catId,
|
||||
// (void*)this,
|
||||
// didTaskWrapper ) )
|
||||
// m_tasksRemaining++;
|
||||
|
||||
|
||||
return launchMsg20s ( false );
|
||||
}
|
||||
|
||||
@ -878,7 +896,6 @@ bool Msg40::reallocMsg20Buf ( ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
void didTaskWrapper ( void* state ) {
|
||||
Msg40 *THIS = (Msg40 *) state;
|
||||
// one less task
|
||||
@ -888,7 +905,6 @@ void didTaskWrapper ( void* state ) {
|
||||
// we are done, call the callback
|
||||
THIS->m_callback ( THIS->m_state );
|
||||
}
|
||||
*/
|
||||
|
||||
bool Msg40::launchMsg20s ( bool recalled ) {
|
||||
|
||||
@ -2128,7 +2144,7 @@ long Msg40::getStoredSize ( ) {
|
||||
//size += m_msg24.getStoredSize ( );
|
||||
//size += m_msg1a.getStoredSize ( );
|
||||
// cache msg2b if we have it
|
||||
size += m_msg2b.getStoredSize();
|
||||
//size += m_msg2b.getStoredSize();
|
||||
|
||||
return size;
|
||||
}
|
||||
@ -2203,9 +2219,9 @@ long Msg40::serialize ( char *buf , long bufLen ) {
|
||||
//if ( y == -1 ) return -1;
|
||||
//p += y;
|
||||
|
||||
long z = m_msg2b.serialize (p, pend - p);
|
||||
if ( z == -1 ) return -1;
|
||||
p += z;
|
||||
//long z = m_msg2b.serialize (p, pend - p);
|
||||
//if ( z == -1 ) return -1;
|
||||
//p += z;
|
||||
|
||||
if ( m_r.m_debug )
|
||||
log("query: msg40 serialize nd=%li "
|
||||
@ -2258,9 +2274,9 @@ long Msg40::deserialize ( char *buf , long bufSize ) {
|
||||
}
|
||||
|
||||
// msg2b
|
||||
long z = m_msg2b.deserialize ( p , pend - p );
|
||||
if ( z == -1 ) return -1;
|
||||
p += z;
|
||||
//long z = m_msg2b.deserialize ( p , pend - p );
|
||||
//if ( z == -1 ) return -1;
|
||||
//p += z;
|
||||
|
||||
// return bytes read
|
||||
return p - buf;
|
||||
|
4
Msg40.h
4
Msg40.h
@ -14,7 +14,7 @@
|
||||
#include "Msg39.h" // getTermFreqs()
|
||||
#include "Msg20.h" // for getting summary from docId
|
||||
#include "Msg17.h" // a distributed cache of serialized/compressed Msg40s
|
||||
#include "Msg2b.h" // for generating directories
|
||||
//#include "Msg2b.h" // for generating directories
|
||||
#include "IndexReadInfo.h" // STAGE0,...
|
||||
#include "Msg3a.h"
|
||||
#include "PostQueryRerank.h"
|
||||
@ -302,7 +302,7 @@ class Msg40 {
|
||||
long m_docsToScanForTopics;
|
||||
|
||||
// Msg2b for generating a directory
|
||||
Msg2b m_msg2b;
|
||||
//Msg2b m_msg2b;
|
||||
|
||||
PostQueryRerank m_postQueryRerank;
|
||||
|
||||
|
30
Msg8b.cpp
30
Msg8b.cpp
@ -45,13 +45,13 @@ bool Msg8b::getCatRec ( Url *url ,
|
||||
// clear g_errno
|
||||
g_errno = 0;
|
||||
// warning
|
||||
if ( ! coll ) log(LOG_LOGIC,"net: NULL collection. msg8b.");
|
||||
//if ( ! coll ) log(LOG_LOGIC,"net: NULL collection. msg8b.");
|
||||
// store the calling parameters in this class for retrieval by callback
|
||||
m_state = state;
|
||||
m_callback = callback;
|
||||
m_url = url;
|
||||
m_coll = coll;
|
||||
m_collLen = collLen;
|
||||
//m_coll = coll;
|
||||
//m_collLen = collLen;
|
||||
m_cr = cr;
|
||||
m_niceness = niceness;
|
||||
|
||||
@ -68,10 +68,10 @@ bool Msg8b::getCatRec ( Url *url ,
|
||||
//m_coll = g_conf.m_dirColl;
|
||||
//m_collLen = gbstrlen(m_coll);
|
||||
// catdb uses a dummy collection now, should not be looked at
|
||||
m_coll = "catdb";
|
||||
m_collLen = 5;
|
||||
//m_coll = "catdb";
|
||||
//m_collLen = 5;
|
||||
|
||||
m_collnum = g_collectiondb.getCollnum ( m_coll , m_collLen );
|
||||
//m_collnum = g_collectiondb.getCollnum ( m_coll , m_collLen );
|
||||
|
||||
// . first, try it by canonical domain name
|
||||
// . if that finds no matches, then try it by ip domain
|
||||
@ -90,7 +90,7 @@ bool Msg8b::getCatRec ( Url *url ,
|
||||
//
|
||||
if ( getMyShardNum() != m_shardNum ) {//g_hostdb.m_groupId!=m_groupId){
|
||||
// coll, url, niceness(1), rdbid(1), useCanonicalName(1)
|
||||
long requestSize = m_collLen + m_url->getUrlLen() + 4 + 4;
|
||||
long requestSize = m_url->getUrlLen() + 4 + 3;
|
||||
// make the request
|
||||
char *p = m_request;
|
||||
*(long *)p = m_url->getIp() ; p+=4;
|
||||
@ -98,10 +98,10 @@ bool Msg8b::getCatRec ( Url *url ,
|
||||
*p = (char)niceness ; p++;
|
||||
*p = (char)useCanonicalName; p++;
|
||||
// coll
|
||||
memcpy(p, m_coll, m_collLen);
|
||||
p += m_collLen;
|
||||
*p = '\0';
|
||||
p++;
|
||||
//memcpy(p, m_coll, m_collLen);
|
||||
//p += m_collLen;
|
||||
//*p = '\0';
|
||||
//p++;
|
||||
// url
|
||||
memcpy(p, m_url->getUrl(), m_url->getUrlLen());
|
||||
p += m_url->getUrlLen();
|
||||
@ -187,7 +187,7 @@ bool Msg8b::getCatRec ( Url *url ,
|
||||
0 , // max cached age in seconds (60)
|
||||
false , // add net recv'd list to cache?
|
||||
RDB_CATDB, // specifies the rdb, 1 = tagdb
|
||||
m_coll ,
|
||||
"",//NULL,//m_coll ,
|
||||
//&m_list ,
|
||||
m_list ,
|
||||
startKey ,
|
||||
@ -546,7 +546,7 @@ bool Msg8b::gotList ( ) {
|
||||
char *rec;
|
||||
|
||||
//rec = g_catdb->getRec ( &m_list , m_url , &recSize );
|
||||
rec = g_catdb.getRec(m_list,m_url,&recSize,m_coll,m_collLen);
|
||||
rec = g_catdb.getRec(m_list,m_url,&recSize,NULL,0);//m_coll,m_collLen);
|
||||
|
||||
// if record found then set it and also set gotIt to true
|
||||
if ( rec ) {
|
||||
@ -589,8 +589,8 @@ void Msg8b::getIndirectCatids ( ) {
|
||||
matchRecs,
|
||||
matchRecSizes,
|
||||
MAX_IND_CATIDS,
|
||||
m_coll,
|
||||
m_collLen);
|
||||
NULL,//m_coll,
|
||||
0);//m_collLen);
|
||||
// parse out the catids from the matches
|
||||
m_cr->m_numIndCatids = 0;
|
||||
for ( long i = 0; i < numMatches; i++ ) {
|
||||
|
6
Msg8b.h
6
Msg8b.h
@ -68,11 +68,11 @@ class Msg8b {
|
||||
void cleanSlot ( );
|
||||
|
||||
// some specified input
|
||||
char *m_coll;
|
||||
long m_collLen;
|
||||
//char *m_coll;
|
||||
//long m_collLen;
|
||||
Url *m_url;
|
||||
|
||||
collnum_t m_collnum;
|
||||
//collnum_t m_collnum;
|
||||
|
||||
void (*m_callback ) ( void *state );//, CatRec *rec );
|
||||
void *m_state; // ptr to caller's private state data
|
||||
|
40
Msg9b.cpp
40
Msg9b.cpp
@ -93,10 +93,17 @@ bool Msg9b::addCatRecs ( char *urls ,
|
||||
char *e = p; while ( *e && ! is_wspace_a (*e) ) e++;
|
||||
// . set the url
|
||||
// . but don't add the "www."
|
||||
// . watch out for
|
||||
// http://twitter.com/#!/ronpaul to http://www.twitter.com/
|
||||
// so do not strip # hashtags
|
||||
Url site;
|
||||
site.set ( p , e - p , false/*addwww?*/);
|
||||
site.set ( p , e - p , false ); // addwww?
|
||||
// normalize the url
|
||||
g_catdb.normalizeUrl(&site, &site);
|
||||
|
||||
// sanity
|
||||
if ( numCatids[k] > MAX_CATIDS ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// make a siteRec from this url
|
||||
CatRec sr;
|
||||
// returns false and sets g_errno on error
|
||||
@ -107,6 +114,16 @@ bool Msg9b::addCatRecs ( char *urls ,
|
||||
char *data = sr.getData ();
|
||||
long dataSize = sr.getDataSize ();
|
||||
key_t key;
|
||||
// sanity test
|
||||
CatRec cr2;
|
||||
if ( ! cr2.set ( NULL , sr.getData(), sr.getDataSize(),false)){
|
||||
char *xx=NULL;*xx=0; }
|
||||
// debug when generating catdb
|
||||
//char *x = p;
|
||||
//for ( ; x<e ; x++ ) {
|
||||
// if ( x[0] == '#' )
|
||||
// log("hey");
|
||||
//}
|
||||
if ( numCatids[k] == 0 )
|
||||
key = g_catdb.makeKey(&site, true);
|
||||
else
|
||||
@ -123,7 +140,23 @@ bool Msg9b::addCatRecs ( char *urls ,
|
||||
}
|
||||
else if ( ! m_list.addRecord ( key, dataSize, data ) )
|
||||
return true;
|
||||
|
||||
|
||||
/*
|
||||
// debug point
|
||||
SafeBuf sb;
|
||||
//sb.safeMemcpy(p , e-p );
|
||||
sb.safeStrcpy(sr.m_url);
|
||||
sb.safePrintf(" ");
|
||||
for ( long i = 0 ; i < numCatids[k] ; i++ )
|
||||
sb.safePrintf ( "%li " , catids[c+i] );
|
||||
log("catdb: adding key=%s url=%s",
|
||||
KEYSTR(&key,12),
|
||||
sb.getBufStart());
|
||||
*/
|
||||
|
||||
// debug
|
||||
//log("gencat: adding url=%s",sr.m_url);
|
||||
|
||||
//skip:
|
||||
// now advance p to e
|
||||
p = e;
|
||||
@ -133,7 +166,8 @@ bool Msg9b::addCatRecs ( char *urls ,
|
||||
|
||||
QUICKPOLL((niceness));
|
||||
}
|
||||
log ( LOG_INFO, "Msg9b: %li sites and %li links added", k , c );
|
||||
log ( LOG_INFO, "Msg9b: %li sites and %li links added. "
|
||||
"listSize=%li", k , c , m_list.m_listSize );
|
||||
// . now add the m_list to tagdb using msg1
|
||||
// . use high priority (niceness of 0)
|
||||
// . i raised niceness from 0 to 1 so multicast does not use the
|
||||
|
@ -66,7 +66,8 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
|
||||
|
||||
// see if they provided a url of a file of urls if they did not
|
||||
// provide a url to add directly
|
||||
bool isAdmin = g_collectiondb.isAdmin ( r , s );
|
||||
//bool isAdmin = g_collectiondb.isAdmin ( r , s );
|
||||
bool isAdmin = r->getIsLocal();
|
||||
long ufuLen = 0;
|
||||
char *ufu = NULL;
|
||||
if ( isAdmin )
|
||||
|
@ -105,8 +105,8 @@ bool sendPageCatdb ( TcpSocket *s , HttpRequest *r ) {
|
||||
st->m_url.set(url, urlLen);
|
||||
// call msg8b to lookup in catdb
|
||||
if (!st->m_msg8b.getCatRec ( &st->m_url,
|
||||
st->m_coll,
|
||||
st->m_collLen,
|
||||
NULL,//st->m_coll,
|
||||
0,//st->m_collLen,
|
||||
true,
|
||||
1,
|
||||
&st->m_catRec,
|
||||
|
@ -3,6 +3,10 @@
|
||||
#include "CollectionRec.h"
|
||||
#include "Pages.h"
|
||||
#include "Categories.h"
|
||||
#include "PageResults.h" // printDMOZSubtopics()
|
||||
|
||||
// function is in PageRoot.cpp:
|
||||
bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) ;
|
||||
|
||||
// . returns false if blocked, true otherwise
|
||||
// . sets g_errno on error
|
||||
@ -36,14 +40,61 @@ bool sendPageDirectory ( TcpSocket *s , HttpRequest *r ) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// look it up
|
||||
// look it up. returns catId <= 0 if dmoz not setup yet.
|
||||
long catId = g_categories->getIdFromPath(decodedPath, decodedPathLen);
|
||||
|
||||
SafeBuf sb;
|
||||
|
||||
long xml = r->getLong("xml",0);
|
||||
|
||||
// if /Top print the directory homepage
|
||||
if ( catId == 1 || catId <= 0 ) {
|
||||
// this is in PageRoot.cpp
|
||||
printDirHomePage(sb,r);
|
||||
}
|
||||
//
|
||||
// try printing this shit out not as search results right now
|
||||
// but just verbatim from dmoz files
|
||||
//
|
||||
else {
|
||||
// search box
|
||||
printLogoAndSearchBox(sb,r,catId);
|
||||
// radio buttons for search dmoz. no, this is printed
|
||||
// from call to printLogoAndSearchBox()
|
||||
//printDmozRadioButtons(sb,catId);
|
||||
// the dmoz breadcrumb
|
||||
printDMOZCrumb ( sb,catId,xml);
|
||||
// print the subtopics in this topic. show as links above
|
||||
// the search results
|
||||
printDMOZSubTopics ( sb, catId , xml );
|
||||
// ok, for now just print the dmoz topics since our search
|
||||
// results will be empty... until populated!
|
||||
g_categories->printUrlsInTopic ( &sb , catId );
|
||||
}
|
||||
|
||||
return g_httpServer.sendDynamicPage ( s,
|
||||
(char*) sb.getBufStart(),
|
||||
sb.length(),
|
||||
// 120 seconds cachetime
|
||||
// don't cache anymore
|
||||
// since
|
||||
// we have the login bar
|
||||
// @ the top of the page
|
||||
0,//120, // cachetime
|
||||
false,// post?
|
||||
"text/html",
|
||||
200,
|
||||
NULL, // cookie
|
||||
"UTF-8",
|
||||
r);
|
||||
|
||||
|
||||
// . make a new request for PageResults
|
||||
//Url dirUrl;
|
||||
char requestBuf[1024+MAX_COLL_LEN+128];
|
||||
long requestBufSize = 1024+MAX_COLL_LEN+128;
|
||||
//g_categories.createDirectorySearchUrl ( &dirUrl,
|
||||
log("dmoz: creating search request");
|
||||
long requestBufLen = g_categories->createDirSearchRequest(
|
||||
requestBuf,
|
||||
requestBufSize,
|
||||
|
@ -2494,14 +2494,14 @@ z 122 7a { 123 7b\
|
||||
" query with \"prefix:sufix\", i.e. \"gbpdcat:1\" will"
|
||||
" list all pages under the Top category (or all pages"
|
||||
" in the entire directory).<br>"
|
||||
" <ul><li>gbdcat - The page is listed directly"
|
||||
" <ul><li>gbcatid - The page is listed directly"
|
||||
" under this base category.<br>"
|
||||
" <li>gbpdcat - The page is listed under this"
|
||||
" <li>gbpcatid - The page is listed under this"
|
||||
" category or any child of this category.<br>"
|
||||
" <li>gbicat - The page is listed indirectly"
|
||||
" <li>gbicatid - The page is listed indirectly"
|
||||
" under this base category, meaning it is a page found"
|
||||
" under a site listed in the base category.<br>"
|
||||
" <li>gbpicat - The page is listed indirectly"
|
||||
" <li>gbipcat - The page is listed indirectly"
|
||||
" under this category, meaning it is a page found under"
|
||||
" a site listed under this category or any child of"
|
||||
" this category.<br>"
|
||||
|
21
PagePerf.cpp
21
PagePerf.cpp
@ -14,11 +14,11 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
|
||||
// allow connection if i'm running this on lenny, too
|
||||
//if ( s->m_ip != matt1 && s->m_ip != matt2 )
|
||||
// return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
|
||||
long refreshLen = 0;
|
||||
if(r->getString ( "refresh" , &refreshLen) ) {
|
||||
g_stats.dumpGIF ();
|
||||
return g_httpServer.sendDynamicPage ( s , "x", 1 );
|
||||
}
|
||||
//long refreshLen = 0;
|
||||
//if(r->getString ( "refresh" , &refreshLen) ) {
|
||||
// g_stats.dumpGIF ();
|
||||
// return g_httpServer.sendDynamicPage ( s , "x", 1 );
|
||||
//}
|
||||
|
||||
// don't allow pages bigger than 128k in cache
|
||||
char buf [ 64*1024 ];
|
||||
@ -77,7 +77,7 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
|
||||
|
||||
|
||||
// dump stats to /tmp/diskGraph.gif
|
||||
g_stats.dumpGIF ();
|
||||
//g_stats.dumpGIF ();
|
||||
|
||||
if(autoRefresh > 0)
|
||||
p.safePrintf("<body onLoad=\"timeit();\">");
|
||||
@ -105,8 +105,13 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
|
||||
p.safePrintf(
|
||||
//"<center>Disk Statistics<br><br>"
|
||||
"<center><br>"
|
||||
"<img name=\"diskgraph\" src=/diskGraph%li.gif><br><br>",
|
||||
g_hostdb.m_hostId );
|
||||
//"<img name=\"diskgraph\"
|
||||
//src=/diskGraph%li.gif><br><br>",
|
||||
//g_hostdb.m_hostId );
|
||||
);
|
||||
|
||||
// now try using absolute divs instead of a GIF
|
||||
g_stats.printGraphInHtml ( p );
|
||||
|
||||
if(autoRefresh > 0) {
|
||||
if(refresh) *(refresh+4) = '0';
|
||||
|
971
PageResults.cpp
971
PageResults.cpp
File diff suppressed because it is too large
Load Diff
@ -3,6 +3,9 @@
|
||||
|
||||
#include "SafeBuf.h"
|
||||
|
||||
bool printDmozRadioButtons ( SafeBuf &sb , long catId ) ;
|
||||
bool printLogoAndSearchBox ( SafeBuf &sb , class HttpRequest *hr, long catId );
|
||||
|
||||
bool printTermPairs ( SafeBuf &sb , class Query *q , class PairScore *ps ) ;
|
||||
bool printSingleTerm ( SafeBuf &sb , class Query *q , class SingleScore *ss );
|
||||
|
||||
@ -17,6 +20,9 @@ bool printEventAddress ( SafeBuf &sb , char *addrStr , class SearchInput *si ,
|
||||
double eventGeocoderLon,
|
||||
char *eventBestPlaceName );
|
||||
|
||||
bool printDMOZCrumb ( SafeBuf &sb , long catId , bool xml ) ;
|
||||
bool printDMOZSubTopics ( SafeBuf& sb, long catId, bool inXml ) ;
|
||||
|
||||
bool printEventCountdown2 ( SafeBuf &sb ,
|
||||
SearchInput *si,
|
||||
long now ,
|
||||
|
175
PageRoot.cpp
175
PageRoot.cpp
@ -115,7 +115,7 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
|
||||
|
||||
sb.safePrintf("<br><br>\n");
|
||||
sb.safePrintf("<br><br><br>\n");
|
||||
sb.safePrintf("<b>web</b> <a href=/seo>seo</a> <a href=\"http://www.gigablast.com/?c=dmoz3\">directory</a> \n");
|
||||
sb.safePrintf("<b>web</b> <a href=/seo>seo</a> <a href=\"/Top\">directory</a> \n");
|
||||
sb.safePrintf("<a href=/adv.html>advanced search</a>");
|
||||
sb.safePrintf(" ");
|
||||
sb.safePrintf("<a href=/addurl title=\"Instantly add your url to "
|
||||
@ -325,7 +325,7 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
|
||||
|
||||
sb.safePrintf("<br><br>\n");
|
||||
sb.safePrintf("<br><br><br>\n");
|
||||
sb.safePrintf("<a href=/>web</a> <a href=/seo>seo</a> <a href=\"http://www.gigablast.com/?c=dmoz3\">directory</a> \n");
|
||||
sb.safePrintf("<a href=/>web</a> <a href=/seo>seo</a> <a href=\"/Top\">directory</a> \n");
|
||||
sb.safePrintf("<a href=/adv.html>advanced search</a>");
|
||||
sb.safePrintf(" ");
|
||||
sb.safePrintf("<b title=\"Instantly add your url to Gigablast's "
|
||||
@ -474,6 +474,8 @@ bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) {
|
||||
sb.safePrintf("<form method=get "
|
||||
"action=/search name=f>\n");
|
||||
sb.safePrintf("<input name=q type=text size=60 value=\"\"> <input type=\"submit\" value=\"Search Green\">\n");
|
||||
sb.safePrintf("<input type=hidden "
|
||||
"name=prepend value=\"gbipcatid:2\">");
|
||||
sb.safePrintf("\n");
|
||||
sb.safePrintf("</form>\n");
|
||||
sb.safePrintf("<br>\n");
|
||||
@ -570,10 +572,10 @@ bool sendPageRoot ( TcpSocket *s , HttpRequest *r, char *cookie ) {
|
||||
*/
|
||||
|
||||
|
||||
if ( ! strcmp(coll,"dmoz3" ) )
|
||||
printDirHomePage(sb,r);
|
||||
else
|
||||
printWebHomePage(sb,r);
|
||||
//if ( ! strcmp(coll,"dmoz" ) )
|
||||
// printDirHomePage(sb,r);
|
||||
//else
|
||||
printWebHomePage(sb,r);
|
||||
|
||||
|
||||
// . print last 5 queries
|
||||
@ -947,136 +949,151 @@ long printLastQueries ( char *p , char *pend ) {
|
||||
|
||||
//char *printTopDirectory ( char *p, char *pend ) {
|
||||
bool printTopDirectory ( SafeBuf& sb ) {
|
||||
|
||||
// if no recs in catdb, print instructions
|
||||
if ( g_catdb.getRdb()->getNumTotalRecs() == 0 )
|
||||
return sb.safePrintf("<center>"
|
||||
"<b>DMOZ functionality is not set up.</b>"
|
||||
"<br>"
|
||||
"<br>"
|
||||
"<b>"
|
||||
"Please follow the set up "
|
||||
"<a href=/admin.html#dmoz>"
|
||||
"instructions"
|
||||
"</a>."
|
||||
"</b>"
|
||||
"</center>");
|
||||
|
||||
//char topList[4096];
|
||||
//sprintf(topList,
|
||||
return sb.safePrintf (
|
||||
"<center>"
|
||||
"<table cellspacing=\"4\" cellpadding=\"4\"><tr><td valign=top>\n"
|
||||
"<b><a href=\"/Arts/\">Arts</a></b><br>"
|
||||
"<b><a href=\"/Top/Arts/\">Arts</a></b><br>"
|
||||
"<small>"
|
||||
"<a href=\"/Arts/Movies/\">Movies</a>, "
|
||||
"<a href=\"/Arts/Television/\">Television</a>, "
|
||||
"<a href=\"/Arts/Music/\">Music</a>..."
|
||||
"<a href=\"/Top/Arts/Movies/\">Movies</a>, "
|
||||
"<a href=\"/Top/Arts/Television/\">Television</a>, "
|
||||
"<a href=\"/Top/Arts/Music/\">Music</a>..."
|
||||
"</small>\n"
|
||||
"</td><td valign=top>"
|
||||
"<b><a href=\"/Business/\">Business</a></b><br>"
|
||||
"<b><a href=\"/Top/Business/\">Business</a></b><br>"
|
||||
"<small>"
|
||||
"<a href=\"/Business/Employment/\">Jobs</a>, "
|
||||
"<a href=\"/Business/Real_Estate/\">Real Estate</a>, "
|
||||
"<a href=\"/Business/Investing/\">Investing</a>..."
|
||||
"<a href=\"/Top/Business/Employment/\">Jobs</a>, "
|
||||
"<a href=\"/Top/Business/Real_Estate/\">Real Estate</a>, "
|
||||
"<a href=\"/Top/Business/Investing/\">Investing</a>..."
|
||||
"</small>\n"
|
||||
"</td><td valign=top>"
|
||||
"<b><a href=\"/Computers/\">Computers</a></b><br>"
|
||||
"<b><a href=\"/Top/Computers/\">Computers</a></b><br>"
|
||||
"<small>"
|
||||
"<a href=\"/Computers/Internet/\">Internet</a>, "
|
||||
"<a href=\"/Computers/Software/\">Software</a>, "
|
||||
"<a href=\"/Computers/Hardware/\">Hardware</a>..."
|
||||
"<a href=\"/Top/Computers/Internet/\">Internet</a>, "
|
||||
"<a href=\"/Top/Computers/Software/\">Software</a>, "
|
||||
"<a href=\"/Top/Computers/Hardware/\">Hardware</a>..."
|
||||
"</small>\n"
|
||||
"</td></tr><tr><td valign=top>"
|
||||
"<b><a href=\"/Games/\">Games</a></b><br>"
|
||||
"<b><a href=\"/Top/Games/\">Games</a></b><br>"
|
||||
"<small>"
|
||||
"<a href=\"/Games/Video_Games/\">Video Games</a>, "
|
||||
"<a href=\"/Games/Roleplaying/\">RPGs</a>, "
|
||||
"<a href=\"/Games/Gambling/\">Gambling</a>..."
|
||||
"<a href=\"/Top/Games/Video_Games/\">Video Games</a>, "
|
||||
"<a href=\"/Top/Games/Roleplaying/\">RPGs</a>, "
|
||||
"<a href=\"/Top/Games/Gambling/\">Gambling</a>..."
|
||||
"</small>\n"
|
||||
"</td><td valign=top>"
|
||||
"<b><a href=\"/Health/\">Health</a></b><br>"
|
||||
"<b><a href=\"/Top/Health/\">Health</a></b><br>"
|
||||
"<small>"
|
||||
"<a href=\"/Health/Fitness/\">Fitness</a>, "
|
||||
"<a href=\"/Health/Medicine/\">Medicine</a>, "
|
||||
"<a href=\"/Health/Alternative/\">Alternative</a>..."
|
||||
"<a href=\"/Top/Health/Fitness/\">Fitness</a>, "
|
||||
"<a href=\"/Top/Health/Medicine/\">Medicine</a>, "
|
||||
"<a href=\"/Top/Health/Alternative/\">Alternative</a>..."
|
||||
"</small>\n"
|
||||
"</td><td valign=top>"
|
||||
"<b><a href=\"/Home/\">Home</a></b><br>"
|
||||
"<b><a href=\"/Top/Home/\">Home</a></b><br>"
|
||||
"<small>"
|
||||
"<a href=\"/Home/Family/\">Family</a>, "
|
||||
"<a href=\"/Home/Consumer_Information/\">Consumers</a>, "
|
||||
"<a href=\"/Home/Cooking/\">Cooking</a>..."
|
||||
"<a href=\"/Top/Home/Family/\">Family</a>, "
|
||||
"<a href=\"/Top/Home/Consumer_Information/\">Consumers</a>, "
|
||||
"<a href=\"/Top/Home/Cooking/\">Cooking</a>..."
|
||||
"</small>\n"
|
||||
"</td></tr><tr><td valign=top>"
|
||||
//"<b><a href=\"/Kids_and_Teens/\">"
|
||||
//"<b><a href=\"/Top/Kids_and_Teens/\">"
|
||||
//"<font color=\"#ff0000\">K</font>"
|
||||
//"<font color=\"339900\">i</font>"
|
||||
//"<font color=\"#ff6600\">d</font>"
|
||||
//"<font color=\"#0066ff\">s</font>"
|
||||
//" and Teens</a></b><br>"
|
||||
"<b><a href=\"/Kids_and_Teens/\">Kids and Teens</a></b><br>"
|
||||
"<b><a href=\"/Top/Kids_and_Teens/\">Kids and Teens</a></b><br>"
|
||||
"<small>"
|
||||
"<a href=\"/Kids_and_Teens/Arts/\">Arts</a>, "
|
||||
"<a href=\"/Kids_and_Teens/School_Time/\">School Time</a>, "
|
||||
"<a href=\"/Kids_and_Teens/Teen_Life/\">Teen Life</a>..."
|
||||
"<a href=\"/Top/Kids_and_Teens/Arts/\">Arts</a>, "
|
||||
"<a href=\"/Top/Kids_and_Teens/School_Time/\">School Time</a>, "
|
||||
"<a href=\"/Top/Kids_and_Teens/Teen_Life/\">Teen Life</a>..."
|
||||
"</small>\n"
|
||||
"</td><td valign=top>"
|
||||
"<b><a href=\"/News/\">News</a></b><br>"
|
||||
"<b><a href=\"/Top/News/\">News</a></b><br>"
|
||||
"<small>"
|
||||
"<a href=\"/News/Media/\">Media</a>, "
|
||||
"<a href=\"/News/Newspapers/\">Newspapers</a>, "
|
||||
"<a href=\"/News/Weather/\">Weather</a>..."
|
||||
"<a href=\"/Top/News/Media/\">Media</a>, "
|
||||
"<a href=\"/Top/News/Newspapers/\">Newspapers</a>, "
|
||||
"<a href=\"/Top/News/Weather/\">Weather</a>..."
|
||||
"</small>\n"
|
||||
"</td><td valign=top>"
|
||||
"<b><a href=\"/Recreation/\">Recreation</a></b><br>"
|
||||
"<b><a href=\"/Top/Recreation/\">Recreation</a></b><br>"
|
||||
"<small>"
|
||||
"<a href=\"/Recreation/Travel/\">Travel</a>, "
|
||||
"<a href=\"/Recreation/Food/\">Food</a>, "
|
||||
"<a href=\"/Recreation/Outdoors/\">Outdoors</a>, "
|
||||
"<a href=\"/Recreation/Humor/\">Humor</a>..."
|
||||
"<a href=\"/Top/Recreation/Travel/\">Travel</a>, "
|
||||
"<a href=\"/Top/Recreation/Food/\">Food</a>, "
|
||||
"<a href=\"/Top/Recreation/Outdoors/\">Outdoors</a>, "
|
||||
"<a href=\"/Top/Recreation/Humor/\">Humor</a>..."
|
||||
"</small>\n"
|
||||
"</td></tr><tr><td valign=top>"
|
||||
"<b><a href=\"/Reference/\">Reference</a></b><br>"
|
||||
"<b><a href=\"/Top/Reference/\">Reference</a></b><br>"
|
||||
"<small>"
|
||||
"<a href=\"/Reference/Maps/\">Maps</a>, "
|
||||
"<a href=\"/Reference/Education/\">Education</a>, "
|
||||
"<a href=\"/Reference/Libraries/\">Libraries</a>..."
|
||||
"<a href=\"/Top/Reference/Maps/\">Maps</a>, "
|
||||
"<a href=\"/Top/Reference/Education/\">Education</a>, "
|
||||
"<a href=\"/Top/Reference/Libraries/\">Libraries</a>..."
|
||||
"</small>\n"
|
||||
"</td><td valign=top>"
|
||||
"<b><a href=\"/Regional/\">Regional</a></b><br>"
|
||||
"<b><a href=\"/Top/Regional/\">Regional</a></b><br>"
|
||||
"<small>"
|
||||
"<a href=\"/Regional/North_America/United_States/\">US</a>, "
|
||||
"<a href=\"/Regional/North_America/Canada/\">Canada</a>, "
|
||||
"<a href=\"/Regional/Europe/United_Kingdom/\">UK</a>, "
|
||||
"<a href=\"/Regional/Europe/\">Europe</a>..."
|
||||
"<a href=\"/Top/Regional/North_America/United_States/\">US</a>, "
|
||||
"<a href=\"/Top/Regional/North_America/Canada/\">Canada</a>, "
|
||||
"<a href=\"/Top/Regional/Europe/United_Kingdom/\">UK</a>, "
|
||||
"<a href=\"/Top/Regional/Europe/\">Europe</a>..."
|
||||
"</small>\n"
|
||||
"</td><td valign=top>"
|
||||
"<b><a href=\"/Science/\">Science</a></b><br>"
|
||||
"<b><a href=\"/Top/Science/\">Science</a></b><br>"
|
||||
"<small>"
|
||||
"<a href=\"/Science/Biology/\">Biology</a>, "
|
||||
"<a href=\"/Science/Social_Sciences/Psychology/\">Psychology</a>, "
|
||||
"<a href=\"/Science/Physics/\">Physics</a>..."
|
||||
"<a href=\"/Top/Science/Biology/\">Biology</a>, "
|
||||
"<a href=\"/Top/Science/Social_Sciences/Psychology/\">Psychology</a>, "
|
||||
"<a href=\"/Top/Science/Physics/\">Physics</a>..."
|
||||
"</small>\n"
|
||||
"</td></tr><tr><td valign=top>"
|
||||
"<b><a href=\"/Shopping/\">Shopping</a></b><br>"
|
||||
"<b><a href=\"/Top/Shopping/\">Shopping</a></b><br>"
|
||||
"<small>"
|
||||
"<a href=\"/Shopping/Vehicles/Autos/\">Autos</a>, "
|
||||
"<a href=\"/Shopping/Clothing/\">Clothing</a>, "
|
||||
"<a href=\"/Shopping/Gifts/\">Gifts</a>..."
|
||||
"<a href=\"/Top/Shopping/Vehicles/Autos/\">Autos</a>, "
|
||||
"<a href=\"/Top/Shopping/Clothing/\">Clothing</a>, "
|
||||
"<a href=\"/Top/Shopping/Gifts/\">Gifts</a>..."
|
||||
"</small>\n"
|
||||
"</td><td valign=top>"
|
||||
"<b><a href=\"/Society/\">Society</a></b><br>"
|
||||
"<b><a href=\"/Top/Society/\">Society</a></b><br>"
|
||||
"<small>"
|
||||
"<a href=\"/Society/People/\">People</a>, "
|
||||
"<a href=\"/Society/Religion_and_Spirituality/\">Religion</a>, "
|
||||
"<a href=\"/Society/Issues/\">Issues</a>..."
|
||||
"<a href=\"/Top/Society/People/\">People</a>, "
|
||||
"<a href=\"/Top/Society/Religion_and_Spirituality/\">Religion</a>, "
|
||||
"<a href=\"/Top/Society/Issues/\">Issues</a>..."
|
||||
"</small>\n"
|
||||
"</td><td valign=top>"
|
||||
"<b><a href=\"/Sports/\">Sports</a></b><br>"
|
||||
"<b><a href=\"/Top/Sports/\">Sports</a></b><br>"
|
||||
"<small>"
|
||||
"<a href=\"/Sports/Baseball/\">Baseball</a>, "
|
||||
"<a href=\"/Sports/Soccer/\">Soccer</a>, "
|
||||
"<a href=\"/Sports/Basketball/\">Basketball</a>..."
|
||||
"<a href=\"/Top/Sports/Baseball/\">Baseball</a>, "
|
||||
"<a href=\"/Top/Sports/Soccer/\">Soccer</a>, "
|
||||
"<a href=\"/Top/Sports/Basketball/\">Basketball</a>..."
|
||||
"</small>\n"
|
||||
"</td></tr>"
|
||||
"<tr><td colspan=3 valign=top>"
|
||||
"<b><a href=\"/World/\">World</a></b><br>"
|
||||
"<b><a href=\"/Top/World/\">World</a></b><br>"
|
||||
"<small>"
|
||||
"<a href=\"/World/Deutsch/\">Deutsch</a>, "
|
||||
"<a href=\"/World/Espa%%c3%%b1ol/\">Espa%c%col</a>, "
|
||||
"<a href=\"/World/Fran%%c3%%a7ais/\">Fran%c%cais</a>, "
|
||||
"<a href=\"/World/Italiano/\">Italiano</a>, "
|
||||
"<a href=\"/World/Japanese/\">Japanese</a>, "
|
||||
"<a href=\"/World/Nederlands/\">Nederlands</a>, "
|
||||
"<a href=\"/World/Polska/\">Polska</a>, "
|
||||
"<a href=\"/World/Dansk/\">Dansk</a>, "
|
||||
"<a href=\"/World/Svenska/\">Svenska</a>..."
|
||||
"<a href=\"/Top/World/Deutsch/\">Deutsch</a>, "
|
||||
"<a href=\"/Top/World/Espa%%c3%%b1ol/\">Espa%c%col</a>, "
|
||||
"<a href=\"/Top/World/Fran%%c3%%a7ais/\">Fran%c%cais</a>, "
|
||||
"<a href=\"/Top/World/Italiano/\">Italiano</a>, "
|
||||
"<a href=\"/Top/World/Japanese/\">Japanese</a>, "
|
||||
"<a href=\"/Top/World/Nederlands/\">Nederlands</a>, "
|
||||
"<a href=\"/Top/World/Polska/\">Polska</a>, "
|
||||
"<a href=\"/Top/World/Dansk/\">Dansk</a>, "
|
||||
"<a href=\"/Top/World/Svenska/\">Svenska</a>..."
|
||||
"</small>\n"
|
||||
"</td></tr></table></center>\n",
|
||||
195, 177, 195, 167);
|
||||
|
@ -1087,12 +1087,12 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
|
||||
//g_tfndb.getRdb(),
|
||||
g_tagdb.getRdb(),
|
||||
g_clusterdb.getRdb(),
|
||||
//g_catdb.getRdb(),
|
||||
g_linkdb.getRdb(),
|
||||
g_cachedb.getRdb(),
|
||||
g_serpdb.getRdb(),
|
||||
g_monitordb.getRdb(),
|
||||
g_statsdb.getRdb()
|
||||
g_statsdb.getRdb(),
|
||||
g_catdb.getRdb()
|
||||
//g_placedb.getRdb() ,
|
||||
//g_sectiondb.getRdb()
|
||||
};
|
||||
|
@ -67,7 +67,8 @@ bool sendPageStatsdb ( TcpSocket *s, HttpRequest *r ) {
|
||||
st->m_niceness = MAX_NICENESS;
|
||||
|
||||
st->m_socket = s;
|
||||
st->m_request = *r;
|
||||
//st->m_request = *r;
|
||||
st->m_request.copy ( r );
|
||||
|
||||
// hostId must be one of the following:
|
||||
// 0-n - a valid hostId
|
||||
@ -120,7 +121,9 @@ bool sendPageStatsdb ( TcpSocket *s, HttpRequest *r ) {
|
||||
st->m_endDate = st->m_endDateR;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// this is no longer a gif, but an html graph in g_statsdb.m_sb
|
||||
//
|
||||
if ( ! g_statsdb.makeGIF ( st->m_endDateR ,
|
||||
st->m_startDateR ,
|
||||
st->m_samples ,
|
||||
@ -211,15 +214,28 @@ void sendReply ( void *state ) {
|
||||
buf.safePrintf("<table cellpadding=10 border=0>\n");
|
||||
|
||||
buf.safePrintf("<tr><td>"
|
||||
"<center>"
|
||||
"<img src=\"/stats%li.gif\" height=%li width=%li "
|
||||
"border=\"0px\">"
|
||||
"</center>"
|
||||
"<center>");
|
||||
|
||||
/////////////////////////
|
||||
//
|
||||
// insert the div graph here
|
||||
//
|
||||
/////////////////////////
|
||||
buf.cat ( g_statsdb.m_gw );
|
||||
|
||||
// purge it
|
||||
g_statsdb.m_gw.purge();
|
||||
g_statsdb.m_dupTable.reset();
|
||||
|
||||
//"<img src=\"/stats%li.gif\" height=%li width=%li "
|
||||
//"border=\"0px\">"
|
||||
//st->m_hostId,
|
||||
//g_statsdb.getImgHeight(),
|
||||
//g_statsdb.getImgWidth());
|
||||
|
||||
buf.safePrintf("</center>"
|
||||
//"class=\"statsdb_image\">"
|
||||
"</td></tr>\n",
|
||||
st->m_hostId,
|
||||
g_statsdb.getImgHeight(),
|
||||
g_statsdb.getImgWidth());
|
||||
"</td></tr>\n");
|
||||
|
||||
// the map key
|
||||
buf.safePrintf("<tr><td>");
|
||||
|
10
Pages.cpp
10
Pages.cpp
@ -384,7 +384,10 @@ long Pages::getDynamicPageNumber ( HttpRequest *r ) {
|
||||
}
|
||||
// sanity
|
||||
if ( ! g_categories ) log("process: no categories loaded");
|
||||
// look it up for a category
|
||||
|
||||
//
|
||||
// dmoz - look it up for a category
|
||||
//
|
||||
if ( g_categories &&
|
||||
g_categories->getIndexFromPath(decodedPath, decodedPathLen) >= 0)
|
||||
return PAGE_DIRECTORY;
|
||||
@ -497,6 +500,10 @@ bool Pages::sendDynamicReply ( TcpSocket *s , HttpRequest *r , long page ) {
|
||||
// log("login: access denied 3 from ip=%s",iptoa(s->m_ip));
|
||||
// return sendPageLogin(s,r,"Access Denied. Bad or no password.");
|
||||
//}
|
||||
if ( ! publicPage && ! isLocal && ! isLoopback ) {
|
||||
log("login: access denied 2 from ip=%s",iptoa(s->m_ip));
|
||||
return sendPageLogin ( s , r, "Access Denied. No permission.");
|
||||
}
|
||||
|
||||
g_errno = 0;
|
||||
|
||||
@ -635,7 +642,6 @@ bool Pages::sendDynamicReply ( TcpSocket *s , HttpRequest *r , long page ) {
|
||||
// . now, so it can be responsible for calling pg->m_function
|
||||
//if ( userType > USER_PUBLIC ) {
|
||||
// check if user has public page access
|
||||
//if ( g_users.hasPermission( r, page , s ) ) {
|
||||
if ( isLocal ) { //g_users.hasPermission( r, page , s )){
|
||||
// . this will set various parms
|
||||
// . we know the request came from a host in the cluster
|
||||
|
@ -4657,6 +4657,7 @@ void Parms::init ( ) {
|
||||
m->m_type = TYPE_LONG;
|
||||
m++;
|
||||
|
||||
/*
|
||||
m->m_title = "catdb min files to merge";
|
||||
m->m_desc = "";
|
||||
m->m_off = (char *)&g_conf.m_catdbMinFilesToMerge - g;
|
||||
@ -4665,7 +4666,6 @@ void Parms::init ( ) {
|
||||
m->m_save = 0;
|
||||
m++;
|
||||
|
||||
/*
|
||||
m->m_title = "revdb max tree mem";
|
||||
m->m_desc = "Revdb holds the meta list we added for this doc.";
|
||||
m->m_off = (char *)&g_conf.m_revdbMaxTreeMem - g;
|
||||
|
12
Process.cpp
12
Process.cpp
@ -6,7 +6,7 @@
|
||||
#include "Clusterdb.h"
|
||||
#include "Hostdb.h"
|
||||
#include "Tagdb.h"
|
||||
//#include "Catdb.h"
|
||||
#include "Catdb.h"
|
||||
#include "Posdb.h"
|
||||
#include "Cachedb.h"
|
||||
#include "Monitordb.h"
|
||||
@ -56,7 +56,7 @@ long g_qbufNeedSave = 0;
|
||||
extern void resetPageAddUrl ( );
|
||||
extern void resetHttpMime ( );
|
||||
extern void reset_iana_charset ( );
|
||||
extern void resetAdultBit ( );
|
||||
//extern void resetAdultBit ( );
|
||||
extern void resetDomains ( );
|
||||
extern void resetEntities ( );
|
||||
extern void resetQuery ( );
|
||||
@ -411,7 +411,7 @@ bool Process::init ( ) {
|
||||
m_rdbs[m_numRdbs++] = g_spiderdb.getRdb ();
|
||||
m_rdbs[m_numRdbs++] = g_clusterdb.getRdb ();
|
||||
m_rdbs[m_numRdbs++] = g_tagdb.getRdb ();
|
||||
//m_rdbs[m_numRdbs++] = g_catdb.getRdb ();
|
||||
m_rdbs[m_numRdbs++] = g_catdb.getRdb ();
|
||||
m_rdbs[m_numRdbs++] = g_statsdb.getRdb ();
|
||||
m_rdbs[m_numRdbs++] = g_linkdb.getRdb ();
|
||||
m_rdbs[m_numRdbs++] = g_cachedb.getRdb ();
|
||||
@ -1660,7 +1660,7 @@ void Process::resetAll ( ) {
|
||||
rdb->reset();
|
||||
}
|
||||
|
||||
//g_catdb .reset();
|
||||
g_catdb .reset();
|
||||
g_collectiondb .reset();
|
||||
g_categories1 .reset();
|
||||
g_categories2 .reset();
|
||||
@ -1712,7 +1712,7 @@ void Process::resetAll ( ) {
|
||||
resetPageAddUrl();
|
||||
resetHttpMime();
|
||||
reset_iana_charset();
|
||||
resetAdultBit();
|
||||
//resetAdultBit();
|
||||
resetDomains();
|
||||
resetEntities();
|
||||
resetQuery();
|
||||
@ -1761,7 +1761,7 @@ void Process::resetPageCaches ( ) {
|
||||
//g_tfndb .getDiskPageCache()->reset();
|
||||
//g_checksumdb .getDiskPageCache()->reset();
|
||||
g_clusterdb .getDiskPageCache()->reset();
|
||||
//g_catdb .getDiskPageCache()->reset();
|
||||
g_catdb .getDiskPageCache()->reset();
|
||||
//g_placedb .getDiskPageCache()->reset();
|
||||
g_doledb .getDiskPageCache()->reset();
|
||||
//g_statsdb .getDiskPageCache()->reset();
|
||||
|
@ -256,7 +256,7 @@ bool Proxy::initProxy ( long proxyId, unsigned short udpPort,
|
||||
g_pages.init ( );
|
||||
// load up the dmoz categories here
|
||||
char structureFile[256];
|
||||
sprintf(structureFile, "%scat/gbdmoz.structure.dat", g_hostdb.m_dir);
|
||||
sprintf(structureFile, "%scatdb/gbdmoz.structure.dat", g_hostdb.m_dir);
|
||||
g_categories = &g_categories1;
|
||||
if (g_categories->loadCategories(structureFile) != 0) {
|
||||
log("cat: Loading Categories From %s Failed.",
|
||||
|
36
Rdb.cpp
36
Rdb.cpp
@ -5,7 +5,7 @@
|
||||
#include "Clusterdb.h"
|
||||
#include "Hostdb.h"
|
||||
#include "Tagdb.h"
|
||||
//#include "Catdb.h"
|
||||
#include "Catdb.h"
|
||||
#include "Indexdb.h"
|
||||
#include "Posdb.h"
|
||||
#include "Cachedb.h"
|
||||
@ -302,8 +302,20 @@ bool Rdb::init ( char *dir ,
|
||||
if ( ! loadTree ( ) ) return false;
|
||||
|
||||
// add the single dummy collection for catdb
|
||||
//if ( g_catdb.getRdb() == this ) //||
|
||||
// return g_catdb.addColl ( NULL );
|
||||
if ( g_catdb.getRdb() == this )
|
||||
return g_catdb.addColl ( NULL );
|
||||
if ( g_statsdb.getRdb() == this )
|
||||
return g_statsdb.addColl ( NULL );
|
||||
if ( g_cachedb.getRdb() == this )
|
||||
return g_cachedb.addColl ( NULL );
|
||||
if ( g_serpdb.getRdb() == this )
|
||||
return g_serpdb.addColl ( NULL );
|
||||
//else if ( g_accessdb.getRdb() == this )
|
||||
// return g_accessdb.addColl ( NULL );
|
||||
//else if ( g_facebookdb.getRdb() == this )
|
||||
// return g_facebookdb.addColl ( NULL );
|
||||
if ( g_syncdb.getRdb() == this )
|
||||
return g_syncdb.addColl ( NULL );
|
||||
|
||||
// set this for use below
|
||||
//*(long long *)m_gbcounteventsTermId =
|
||||
@ -1404,7 +1416,7 @@ void attemptMergeAll ( int fd , void *state ) {
|
||||
g_titledb.getRdb()->attemptMerge ( 1 , false , !state);
|
||||
//g_tfndb.getRdb()->attemptMerge ( 1 , false , !state);
|
||||
g_tagdb.getRdb()->attemptMerge ( 1 , false , !state);
|
||||
//g_catdb.getRdb()->attemptMerge ( 1 , false , !state);
|
||||
g_catdb.getRdb()->attemptMerge ( 1 , false , !state);
|
||||
g_clusterdb.getRdb()->attemptMerge ( 1 , false , !state);
|
||||
g_statsdb.getRdb()->attemptMerge ( 1 , false , !state);
|
||||
g_syncdb.getRdb()->attemptMerge ( 1 , false , !state);
|
||||
@ -2035,6 +2047,13 @@ bool Rdb::addRecord ( collnum_t collnum,
|
||||
}
|
||||
*/
|
||||
|
||||
// debug testing
|
||||
//if ( m_rdbId == RDB_CATDB ) {
|
||||
// // show key
|
||||
// log("rdb: adding key=%s to tree n=%li",KEYSTR(key,12) ,n);
|
||||
//}
|
||||
|
||||
|
||||
//jumpdown:
|
||||
|
||||
// if it exists then annihilate it
|
||||
@ -2423,7 +2442,7 @@ Rdb *getRdbFromId ( uint8_t rdbId ) {
|
||||
s_table9 [ RDB_DOLEDB ] = g_doledb.getRdb();
|
||||
s_table9 [ RDB_TFNDB ] = g_tfndb.getRdb();
|
||||
s_table9 [ RDB_CLUSTERDB ] = g_clusterdb.getRdb();
|
||||
//s_table9 [ RDB_CATDB ] = g_catdb.getRdb();
|
||||
s_table9 [ RDB_CATDB ] = g_catdb.getRdb();
|
||||
s_table9 [ RDB_DATEDB ] = g_datedb.getRdb();
|
||||
s_table9 [ RDB_LINKDB ] = g_linkdb.getRdb();
|
||||
s_table9 [ RDB_CACHEDB ] = g_cachedb.getRdb();
|
||||
@ -2453,7 +2472,7 @@ Rdb *getRdbFromId ( uint8_t rdbId ) {
|
||||
// the opposite of the above
|
||||
char getIdFromRdb ( Rdb *rdb ) {
|
||||
if ( rdb == g_tagdb.getRdb () ) return RDB_TAGDB;
|
||||
//if ( rdb == g_catdb.getRdb () ) return RDB_CATDB;
|
||||
if ( rdb == g_catdb.getRdb () ) return RDB_CATDB;
|
||||
if ( rdb == g_indexdb.getRdb () ) return RDB_INDEXDB;
|
||||
if ( rdb == g_posdb.getRdb () ) return RDB_POSDB;
|
||||
if ( rdb == g_datedb.getRdb () ) return RDB_DATEDB;
|
||||
@ -2474,7 +2493,7 @@ char getIdFromRdb ( Rdb *rdb ) {
|
||||
if ( rdb == g_revdb.getRdb () ) return RDB_REVDB;
|
||||
//if ( rdb == g_sitedb.getRdb () ) return RDB_SITEDB;
|
||||
//if ( rdb == g_tagdb2.getRdb () ) return RDB2_SITEDB2;
|
||||
//if ( rdb == g_catdb.getRdb () ) return RDB_CATDB;
|
||||
if ( rdb == g_catdb.getRdb () ) return RDB_CATDB;
|
||||
if ( rdb == g_indexdb2.getRdb () ) return RDB2_INDEXDB2;
|
||||
if ( rdb == g_posdb2.getRdb () ) return RDB2_POSDB2;
|
||||
if ( rdb == g_datedb2.getRdb () ) return RDB2_DATEDB2;
|
||||
@ -2498,7 +2517,7 @@ char getIdFromRdb ( Rdb *rdb ) {
|
||||
char isSecondaryRdb ( uint8_t rdbId ) {
|
||||
switch ( rdbId ) {
|
||||
//case RDB2_SITEDB2 : return true;
|
||||
//case RDB_CATDB2 : return g_catdb2.getRdb();
|
||||
case RDB2_CATDB2 : return true;
|
||||
case RDB2_INDEXDB2 : return true;
|
||||
case RDB2_POSDB2 : return true;
|
||||
case RDB2_DATEDB2 : return true;
|
||||
@ -2606,6 +2625,7 @@ long getDataSizeFromRdbId ( uint8_t rdbId ) {
|
||||
else if ( i == RDB2_TITLEDB2 ||
|
||||
i == RDB2_REVDB2 ||
|
||||
i == RDB2_TAGDB2 ||
|
||||
i == RDB2_CATDB2 ||
|
||||
i == RDB2_SPIDERDB2 ||
|
||||
i == RDB2_PLACEDB2 )
|
||||
ds = -1;
|
||||
|
1
Rdb.h
1
Rdb.h
@ -53,6 +53,7 @@ enum {
|
||||
RDB2_REVDB2,
|
||||
RDB2_TAGDB2,
|
||||
RDB2_POSDB2, // 31
|
||||
RDB2_CATDB2,
|
||||
RDB_END
|
||||
};
|
||||
// how many rdbs are in "urgent merge" mode?
|
||||
|
@ -1244,7 +1244,8 @@ void initTable ( ) {
|
||||
}
|
||||
}
|
||||
|
||||
bool SafeBuf::urlEncode ( bool spaceToPlus ) {
|
||||
// url encode the whole buffer
|
||||
bool SafeBuf::urlEncodeAllBuf ( bool spaceToPlus ) {
|
||||
// this makes things faster
|
||||
if ( ! s_init23 ) initTable();
|
||||
// how many chars do we need?
|
||||
|
10
SafeBuf.h
10
SafeBuf.h
@ -229,11 +229,15 @@ struct SafeBuf {
|
||||
bool requestPath = false,
|
||||
bool encodeApostrophes = false );
|
||||
|
||||
bool urlEncode (char *s ,
|
||||
bool encodeApostrophes = false ) {
|
||||
bool urlEncode (char *s ) {
|
||||
return urlEncode ( s,strlen(s),false,false); };
|
||||
|
||||
|
||||
bool urlEncode2 (char *s ,
|
||||
bool encodeApostrophes ) { // usually false
|
||||
return urlEncode ( s,strlen(s),false,encodeApostrophes); };
|
||||
|
||||
bool urlEncode ( bool spaceToPlus = true );
|
||||
bool urlEncodeAllBuf ( bool spaceToPlus = true );
|
||||
bool latin1CdataEncode(char *s, long len);
|
||||
bool utf8CdataEncode(char *s, long len);
|
||||
|
||||
|
@ -711,7 +711,6 @@ m if (! cr->hasSearchPermission ( sock, encapIp ) ) {
|
||||
// . sets m_qbuf1 and m_qbuf2
|
||||
if ( ! setQueryBuffers ( r ) ) return false;
|
||||
|
||||
|
||||
/* --- Virtual host language detection --- */
|
||||
if(r->getHost()) {
|
||||
bool langset = getLanguageFromAbbr(m_defaultSortLanguage);
|
||||
@ -1226,6 +1225,40 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
|
||||
m_displayQuery,
|
||||
m_displayQueryLen);
|
||||
|
||||
|
||||
|
||||
|
||||
//////////
|
||||
//
|
||||
// show DMOZ BREADCRUMB if doing a
|
||||
// "gbpcatid:<catid> |" (Search restricted to category)
|
||||
// "gbcatid:<catid>" (DMOZ urls in that topic, c=dmoz3)
|
||||
//
|
||||
//////////
|
||||
long pcatId = -1;
|
||||
long dcatId = -1;
|
||||
// get the final query
|
||||
char *q =m_sbuf1.getBufStart();
|
||||
if ( q ) sscanf(q,"gbpcatid:%li",&pcatId);
|
||||
if ( q ) sscanf(q,"gbcatid:%li",&dcatId);
|
||||
// pick the one that is valid
|
||||
long catId = -1;
|
||||
if ( pcatId >= 0 ) catId = pcatId;
|
||||
if ( dcatId >= 0 ) catId = dcatId;
|
||||
|
||||
//////
|
||||
//
|
||||
// save catid into the state
|
||||
m_catId = catId;
|
||||
//
|
||||
///////
|
||||
|
||||
// are we a right to left language like hebrew?
|
||||
if ( catId > 0 && g_categories->isIdRTL(catId) )
|
||||
m_isRTL = true;
|
||||
else
|
||||
m_isRTL = false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -402,6 +402,9 @@ class SearchInput {
|
||||
SafeBuf m_sbuf2;
|
||||
SafeBuf m_sbuf3;
|
||||
|
||||
long m_catId;
|
||||
bool m_isRTL;
|
||||
|
||||
// make a cookie from parms with m_flags of PF_COOKIE set
|
||||
SafeBuf m_cookieBuf;
|
||||
|
||||
|
154
Sections.cpp
154
Sections.cpp
@ -36,8 +36,10 @@ Sections::Sections ( ) {
|
||||
}
|
||||
|
||||
void Sections::reset() {
|
||||
if ( m_sections && m_needsFree )
|
||||
mfree ( m_sections , m_sectionsBufSize , "Sections" );
|
||||
//if ( m_sections && m_needsFree )
|
||||
// mfree ( m_sections , m_sectionsBufSize , "Sections" );
|
||||
m_sectionBuf.purge();
|
||||
m_sectionPtrBuf.purge();
|
||||
if ( m_buf && m_bufSize )
|
||||
mfree ( m_buf , m_bufSize , "sdata" );
|
||||
if ( m_buf2 && m_bufSize2 )
|
||||
@ -228,10 +230,20 @@ bool Sections::set ( Words *w ,
|
||||
max++;
|
||||
// and each section may create a sentence section
|
||||
max *= 2;
|
||||
|
||||
// truncate if excessive. growSections() will kick in then i guess
|
||||
// if we need more sections.
|
||||
if ( max > 1000000 ) {
|
||||
log("sections: truncating max sections to 1000000");
|
||||
max = 1000000;
|
||||
}
|
||||
|
||||
//max += 5000;
|
||||
long need = max * sizeof(Section);
|
||||
|
||||
|
||||
// and we need one section ptr for every word!
|
||||
need += nw * 4;
|
||||
//need += nw * 4;
|
||||
// and a section ptr for m_sorted[]
|
||||
//need += max * sizeof(Section *);
|
||||
// set this
|
||||
@ -240,8 +252,21 @@ bool Sections::set ( Words *w ,
|
||||
// breathe
|
||||
QUICKPOLL(m_niceness);
|
||||
|
||||
// allocate m_sections[] buffer
|
||||
// separate buf now for section ptr for each word
|
||||
if ( ! m_sectionPtrBuf.reserve ( nw *4 ) ) return true;
|
||||
m_sectionPtrs = (Section **)m_sectionPtrBuf.getBufStart();
|
||||
m_sectionPtrsEnd = (Section **)m_sectionPtrBuf.getBufEnd();
|
||||
|
||||
// allocate m_sectionBuf
|
||||
m_sections = NULL;
|
||||
|
||||
if ( ! m_sectionBuf.reserve ( need ) )
|
||||
return true;
|
||||
|
||||
// point into it
|
||||
m_sections = (Section *)m_sectionBuf.getBufStart();
|
||||
|
||||
/*
|
||||
// assume no malloc
|
||||
m_needsFree = false;
|
||||
if ( need < SECTIONS_LOCALBUFSIZE ) {
|
||||
@ -259,6 +284,7 @@ bool Sections::set ( Words *w ,
|
||||
m_sectionsBufSize = need;
|
||||
m_needsFree = true;
|
||||
}
|
||||
*/
|
||||
|
||||
// clear it nicely
|
||||
//memset_nice ( m_sections , 0 , m_sectionsBufSize, m_niceness );
|
||||
@ -270,20 +296,20 @@ bool Sections::set ( Words *w ,
|
||||
m_titleEnd = -1;
|
||||
|
||||
// bail if no luck
|
||||
if ( ! m_sections ) return true;
|
||||
//if ( ! m_sections ) return true;
|
||||
|
||||
// point to buf
|
||||
char *ppp = (char *)m_sections;
|
||||
//char *ppp = (char *)m_sections;
|
||||
// skip Sections array
|
||||
ppp += max * sizeof(Section);
|
||||
//ppp += max * sizeof(Section);
|
||||
// assign space for m_sorted
|
||||
//m_sorted = (Section **)ppp;
|
||||
// skip that
|
||||
//ppp += max * sizeof(Section *);
|
||||
// assign space for our ptrs that are 1-1 with the words array
|
||||
m_sectionPtrs = (Section **)ppp;
|
||||
//m_sectionPtrs = (Section **)ppp;
|
||||
// the end
|
||||
m_sectionPtrsEnd = (Section **)(ppp + nw * 4);
|
||||
//m_sectionPtrsEnd = (Section **)(ppp + nw * 4);
|
||||
// save this too
|
||||
m_nw = nw;
|
||||
|
||||
@ -375,6 +401,10 @@ bool Sections::set ( Words *w ,
|
||||
if ( fullTid == TAG_INPUT ||
|
||||
fullTid == TAG_HR ||
|
||||
fullTid == TAG_COMMENT ) {
|
||||
// try to realloc i guess. should keep ptrs intact.
|
||||
if ( m_numSections >= m_maxNumSections &&
|
||||
! growSections() )
|
||||
return true;
|
||||
// get the section
|
||||
Section *sn = &m_sections[m_numSections];
|
||||
// clear
|
||||
@ -397,6 +427,10 @@ bool Sections::set ( Words *w ,
|
||||
|
||||
// a section of multiple br tags in a sequence
|
||||
if ( fullTid == TAG_BR ) {
|
||||
// try to realloc i guess. should keep ptrs intact.
|
||||
if ( m_numSections >= m_maxNumSections &&
|
||||
! growSections() )
|
||||
return true;
|
||||
// get the section
|
||||
Section *sn = &m_sections[m_numSections];
|
||||
// clear
|
||||
@ -884,6 +918,9 @@ bool Sections::set ( Words *w ,
|
||||
// with the address above it, and it shouldn't do that!
|
||||
if ( tid == TAG_FONT ) continue;
|
||||
|
||||
// try to realloc i guess. should keep ptrs intact.
|
||||
if ( m_numSections >= m_maxNumSections && ! growSections() )
|
||||
return true;
|
||||
// get the section
|
||||
Section *sn = &m_sections[m_numSections];
|
||||
// clear
|
||||
@ -11034,8 +11071,11 @@ Section *Sections::insertSubSection ( Section *parentArg , long a , long b ,
|
||||
// debug
|
||||
//log("sect: inserting subsection [%li,%li)",a,b);
|
||||
|
||||
// sanity check
|
||||
if ( m_numSections >= m_maxNumSections ) { char *xx=NULL;*xx=0;}
|
||||
// try to realloc i guess. should keep ptrs intact.
|
||||
if ( m_numSections >= m_maxNumSections )
|
||||
// try to realloc i guess
|
||||
if ( ! growSections() ) return NULL;
|
||||
//char *xx=NULL;*xx=0;}
|
||||
|
||||
//
|
||||
// make a new section
|
||||
@ -17270,3 +17310,95 @@ bool Sections::setListFlags ( ) {
|
||||
Section *ps;
|
||||
*/
|
||||
}
|
||||
|
||||
bool Sections::growSections ( ) {
|
||||
// make a log note b/c this should not happen a lot because it's slow
|
||||
log("build: growing sections!");
|
||||
// record old buf start
|
||||
char *oldBuf = m_sectionBuf.getBufStart();
|
||||
// grow by 20MB at a time
|
||||
if ( ! m_sectionBuf.reserve ( 20000000 ) ) return false;
|
||||
// for fixing ptrs:
|
||||
char *newBuf = m_sectionBuf.getBufStart();
|
||||
// set the new max
|
||||
m_maxNumSections = m_sectionBuf.getCapacity() / sizeof(Section);
|
||||
// update ptrs in the old sections
|
||||
for ( long i = 0 ; i < m_numSections ; i++ ) {
|
||||
// breathe
|
||||
QUICKPOLL(m_niceness);
|
||||
Section *si = &m_sections[i];
|
||||
if ( si->m_parent ) {
|
||||
char *np = (char *)si->m_parent;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_parent = (Section *)np;
|
||||
}
|
||||
if ( si->m_next ) {
|
||||
char *np = (char *)si->m_next;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_next = (Section *)np;
|
||||
}
|
||||
if ( si->m_prev ) {
|
||||
char *np = (char *)si->m_prev;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_prev = (Section *)np;
|
||||
}
|
||||
if ( si->m_listContainer ) {
|
||||
char *np = (char *)si->m_listContainer;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_listContainer = (Section *)np;
|
||||
}
|
||||
if ( si->m_prevBrother ) {
|
||||
char *np = (char *)si->m_prevBrother;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_prevBrother = (Section *)np;
|
||||
}
|
||||
if ( si->m_nextBrother ) {
|
||||
char *np = (char *)si->m_nextBrother;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_nextBrother = (Section *)np;
|
||||
}
|
||||
if ( si->m_sentenceSection ) {
|
||||
char *np = (char *)si->m_sentenceSection;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_sentenceSection = (Section *)np;
|
||||
}
|
||||
if ( si->m_prevSent ) {
|
||||
char *np = (char *)si->m_prevSent;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_prevSent = (Section *)np;
|
||||
}
|
||||
if ( si->m_nextSent ) {
|
||||
char *np = (char *)si->m_nextSent;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_nextSent = (Section *)np;
|
||||
}
|
||||
if ( si->m_tableSec ) {
|
||||
char *np = (char *)si->m_tableSec;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_tableSec = (Section *)np;
|
||||
}
|
||||
if ( si->m_headColSection ) {
|
||||
char *np = (char *)si->m_headColSection;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_headColSection = (Section *)np;
|
||||
}
|
||||
if ( si->m_headRowSection ) {
|
||||
char *np = (char *)si->m_headRowSection;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_headRowSection = (Section *)np;
|
||||
}
|
||||
if ( si->m_leftCell ) {
|
||||
char *np = (char *)si->m_leftCell;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_leftCell = (Section *)np;
|
||||
}
|
||||
if ( si->m_aboveCell ) {
|
||||
char *np = (char *)si->m_aboveCell;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_aboveCell = (Section *)np;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
|
13
Sections.h
13
Sections.h
@ -680,7 +680,9 @@ class Sections {
|
||||
long getStoredSize ( ) ;
|
||||
static long getStoredSize ( char *p ) ;
|
||||
long serialize ( char *p ) ;
|
||||
long getMemUsed ( ) { return m_sectionsBufSize; };
|
||||
//long getMemUsed ( ) { return m_sectionsBufSize; };
|
||||
|
||||
bool growSections ( );
|
||||
|
||||
bool getSectiondbList ( );
|
||||
bool gotSectiondbList ( bool *needsRecall ) ;
|
||||
@ -828,10 +830,17 @@ class Sections {
|
||||
|
||||
// allocate m_sections[] buffer
|
||||
class Section *m_sections;
|
||||
long m_sectionsBufSize;
|
||||
//long m_sectionsBufSize;
|
||||
long m_numSections;
|
||||
long m_maxNumSections;
|
||||
|
||||
// this holds the Sections instances in a growable array
|
||||
SafeBuf m_sectionBuf;
|
||||
|
||||
// this holds ptrs to sections 1-1 with words array, so we can
|
||||
// see what section a word is in.
|
||||
SafeBuf m_sectionPtrBuf;
|
||||
|
||||
long m_numSentenceSections;
|
||||
|
||||
bool m_firstDateValid;
|
||||
|
@ -1537,7 +1537,8 @@ bool Speller::findNext( char *s, char *send, char **nextWord, bool *isPorn,
|
||||
long slen = send - s;
|
||||
// check if there is an adult word in there
|
||||
// NOTE: The word 'adult' gives a lot of false positives, so even
|
||||
// though it is in the isAdult() list, skip it
|
||||
// though it is in the isAdult() list, skip it.
|
||||
// s/slen constitutes an individual word.
|
||||
if ( isAdult ( s, slen, &loc ) && strncmp ( s, "adult", 5 ) != 0 ){
|
||||
// if this string starts with the adult word, don't check
|
||||
// further
|
||||
|
@ -3878,7 +3878,7 @@ void SpiderLoop::spiderDoledUrls ( ) {
|
||||
if ( m_cri >= g_collectiondb.m_numRecs ) m_cri = 0;
|
||||
// get rec
|
||||
cr = g_collectiondb.m_recs[m_cri];
|
||||
// skip if empty
|
||||
// skip if gone
|
||||
if ( ! cr ) continue;
|
||||
// stop if not enabled
|
||||
if ( ! cr->m_spideringEnabled ) continue;
|
||||
|
8
Spider.h
8
Spider.h
@ -601,7 +601,13 @@ class SpiderRequest {
|
||||
// this 0 and to not avoid spidering the links.
|
||||
long m_avoidSpiderLinks:1;
|
||||
// for identifying address heavy sites...
|
||||
long m_tagYellowPages:1;
|
||||
//long m_tagYellowPages:1;
|
||||
// when indexing urls for dmoz, i.e. the urls outputted from
|
||||
// 'dmozparse urldump -s' we need to index them even if there
|
||||
// was a ETCPTIMEDOUT because we have to have indexed the same
|
||||
// urls that dmoz has in it in order to be identical to dmoz.
|
||||
long m_ignoreExternalErrors:1;
|
||||
|
||||
// called XmlDoc::set4() from PageSubmit.cpp?
|
||||
//long m_isPageSubmit:1;
|
||||
|
||||
|
215
Stats.cpp
215
Stats.cpp
@ -2,7 +2,7 @@
|
||||
|
||||
#include <errno.h>
|
||||
#include "Stats.h"
|
||||
#define X_DISPLAY_MISSING 1
|
||||
//#define X_DISPLAY_MISSING 1
|
||||
//#include <plotter.h>
|
||||
#include <math.h>
|
||||
#include "Conf.h"
|
||||
@ -133,6 +133,7 @@ void Stats::addStat_r ( long numBytes ,
|
||||
//pthread_mutex_unlock ( &s_lock );
|
||||
}
|
||||
|
||||
/*
|
||||
// . dump a graph to /tmp/diskGraph.gif
|
||||
// . use libplotter.a or .so ?
|
||||
// . docs at http://www.gnu.org/manual/plotutils/html_mono/plotutils.html#SEC54
|
||||
@ -341,7 +342,7 @@ void Stats::dumpGIF ( long long startTime , long long endTime ) {
|
||||
mfree(lrgBuf, lrgSize, "Stats.cpp");
|
||||
#endif
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
void Stats::addPoint (StatPoint **points ,
|
||||
long *numPoints ,
|
||||
@ -486,3 +487,213 @@ void Stats::addSpiderPoint ( long errCode, bool isNew ) {
|
||||
m_allErrorsOld[errCode]++;
|
||||
}
|
||||
}
|
||||
|
||||
// draw a HORIZONTAL line in html
|
||||
void drawLine2 ( SafeBuf &sb ,
|
||||
long x1 ,
|
||||
long x2 ,
|
||||
long fy1 ,
|
||||
long color ,
|
||||
long width ) {
|
||||
|
||||
sb.safePrintf("<div style=\"position:absolute;"
|
||||
"left:%li;"
|
||||
"top:%li;"
|
||||
"background-color:#%lx;"
|
||||
"z-index:-5;"
|
||||
"min-height:%lipx;"
|
||||
"min-width:%lipx;\"></div>\n"
|
||||
, x1
|
||||
, (fy1 - width/2) - 20 //- 300
|
||||
, color
|
||||
, width
|
||||
, x2 - x1
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// new code for drawing graph in html with absolute divs instead
|
||||
// of using GIF plotter library which had issues
|
||||
//
|
||||
// . print the stat points stored in m_pts[] as an html graph into "sb"
// . the graph is one relatively positioned div that holds an absolutely
//   positioned div per tick mark, tick label and stat point
// . the left edge is the earliest start time among the points that
//   started no more than DT before the latest end time, and DX pixels
//   span DT milliseconds
// . nothing is printed if the scratch buffer cannot be allocated
void Stats::printGraphInHtml ( SafeBuf &sb ) {

	// find the latest end time of any point
	long long t2 = 0;
	for ( long i = 0 ; i < MAX_POINTS ; i++ ) {
		// skip empties
		if ( m_pts[i].m_startTime == 0 ) continue;
		// set max
		if ( m_pts[i].m_endTime > t2 ) t2 = m_pts[i].m_endTime;
	}
	// now compute the start time for the graph
	long long t1 = 0x7fffffffffffffffLL;
	for ( long i = 0 ; i < MAX_POINTS ; i++ ) {
		// skip empties
		if ( m_pts[i].m_startTime == 0 ) continue;
		// can't start more than DT before the latest end time
		if ( m_pts[i].m_startTime < t2 - DT ) continue;
		// otherwise, it's a candidate for the first time
		if ( m_pts[i].m_startTime < t1 ) t1 = m_pts[i].m_startTime;
	}

	// . each line consists of several points
	// . we need to know each point for adding otherlines
	// . each line can contain multiple data points
	// . each data point is expressed as a horizontal line segment
	// . allocate before printing anything so a failure here does not
	//   leave an unclosed div in "sb"
	long lrgSize = 0;
	lrgSize += MAX_LINES * MAX_POINTS * sizeof(StatPoint *);
	lrgSize += MAX_LINES * sizeof(long);
	char *lrgBuf = (char *) mmalloc(lrgSize, "Stats.cpp");
	if ( ! lrgBuf ) {
		log("could not allocate memory for local buffer in Stats.cpp. "
		    "%li bytes needed", lrgSize);
		return;
	}
	// points[MAX_POINTS * line + n] is the nth point on that line
	StatPoint **points = (StatPoint **)lrgBuf;
	// numPoints[line] follows the points array in the same buffer
	long *numPoints = (long *)
		( lrgBuf + MAX_LINES * MAX_POINTS * sizeof(StatPoint *) );
	memset ( (char *)numPoints , 0 , MAX_LINES * sizeof(long) );

	//
	// main graphing window
	//
	sb.safePrintf("<div style=\"position:relative;"
		      "background-color:#c0c0c0;"
		      //"overflow-y:hidden;"
		      "overflow-x:hidden;"
		      "z-index:-10;"
		      // the tick marks we print below are based on it
		      // being a window of the last 20 seconds... and using
		      // DX pixels
		      "min-width:%lipx;"
		      "min-height:%lipx;"
		      //"width:100%%;"
		      //"min-height:600px;"
		      "margin-top:10px;"
		      "margin-bottom:10px;"
		      "margin-right:10px;"
		      "margin-left:10px;\">"
		      ,(long)DX
		      ,(long)DY +20); // add 20 more for "2s" labels etc.

	// 20 x-axis tick marks, one every DX/20 pixels
	for ( int x = DX/20 ; x <= DX ; x += DX/20 ) {
		// tick mark
		sb.safePrintf("<div style=\"position:absolute;"
			      "left:%lipx;"
			      "bottom:0;"
			      "background-color:#000000;"
			      "z-index:110;"
			      "min-height:20px;"
			      "min-width:3px;\"></div>\n"
			      , (long)x-1
			      );
		// LABEL
		sb.safePrintf("<div style=\"position:absolute;"
			      "left:%lipx;"
			      "bottom:20px;"
			      //"background-color:#000000;"
			      "z-index:110;"
			      "min-height:20px;"
			      "min-width:3px;\">%lis</div>\n"
			      , (long)x-10
			      // the label: seconds from the left edge
			      ,(long)(DT * (long long)x / (long long)DX)/1000
			      );
	}

	// . store the data points into "lines"
	// . walk the ring buffer exactly once starting at m_next. the old
	//   "count >= 0" test made MAX_POINTS+1 trips and so added the
	//   point at m_next twice.
	long count = MAX_POINTS;
	for ( long i = m_next ; count > 0 ; i++ , count-- ) {
		// wrap around the array
		if ( i >= MAX_POINTS ) i = 0;
		// skip point if empty
		if ( m_pts[i].m_startTime == 0 ) continue;
		// skip if too early
		if ( m_pts[i].m_endTime < t1 ) continue;
		// . find the lowest line that will hold us
		// . this adds point to points[x][n] where x is determined
		addPoint ( points , numPoints , &m_pts[i] );
	}

	int y1 = 21;
	// plot the points (lines) in each line
	for ( long i = 0 ; i < MAX_LINES ; i++ ) {
		// increase vert
		y1 += MAX_WIDTH + 1;
		// wrap back down if necessary
		if ( y1 >= DY ) y1 = 21;
		// plot all points in this row
		for ( long j = 0 ; j < numPoints[i] ; j++ ) {
			// get the point
			StatPoint *p = points[MAX_POINTS * i + j];
			// transform time to x coordinates
			int x1 = (p->m_startTime - t1) * (long long)DX / DT;
			int x2 = (p->m_endTime   - t1) * (long long)DX / DT;
			// if x2 is negative, skip it
			if ( x2 < 0 ) continue;
			// if x1 is negative, boost it to -2
			if ( x1 < 0 ) x1 = -2;
			// . line thickness is function of read/write size
			// . take logs, but only of positive sizes: log() of
			//   zero or less is -inf/nan and casting that to an
			//   int is undefined
			int w = 3;
			if ( p->m_numBytes > 0 )
				w = (int)log(((double)p->m_numBytes)/8192.0)+3;
			if ( w < 3         ) w = 3;
			if ( w > MAX_WIDTH ) w = MAX_WIDTH;
			// ensure at least 3 units wide for visibility
			if ( x2 < x1 + 3 ) x2 = x1 + 3;
			// . flip the y so we don't have to scroll the browser down
			// . DY does not include the axis and tick marks
			long fy1 = DY - y1 + 20 ;
			// plot it using the color given to addStat_r()
			drawLine2 ( sb , x1 , x2 , fy1 , p->m_color , w );
		}
	}

	sb.safePrintf("</div>\n");

	mfree(lrgBuf, lrgSize, "Stats.cpp");
}
|
||||
|
9
Stats.h
9
Stats.h
@ -25,9 +25,9 @@ class StatPoint {
|
||||
|
||||
#define MAX_POINTS 6000
|
||||
#define MAX_WIDTH 6
|
||||
#define DY 900 // pixels vertical
|
||||
#define DY 600 // pixels vertical
|
||||
#define DX 1000 // pixels across
|
||||
#define DT (20*1000) // time window, 10 seconds
|
||||
#define DT (20*1000) // time window, 20 seconds
|
||||
#define MAX_LINES (DY / (MAX_WIDTH+1)) // leave free pixel above each line
|
||||
|
||||
#define STAT_GENERIC 0
|
||||
@ -53,7 +53,10 @@ class Stats {
|
||||
// . dumps a bar graph
|
||||
// . each bar represents a stat in time, from inception to completion
|
||||
// . useful for seeing possible sources of contention
|
||||
void dumpGIF ( long long startTime = -1 , long long endTime = -1 );
|
||||
//void dumpGIF ( long long startTime = -1 , long long endTime = -1 );
|
||||
|
||||
|
||||
void printGraphInHtml ( SafeBuf &sb );
|
||||
|
||||
// this graphs:
|
||||
// 1. stats per second
|
||||
|
299
Statsdb.cpp
299
Statsdb.cpp
@ -80,7 +80,7 @@ static Label s_labels[] = {
|
||||
// . max = -1, means dynamic size the ymax!
|
||||
// . use 1B for now again...
|
||||
// . color=pink
|
||||
{GRAPH_QUANTITY,1000000000.0,"docs_indexed", .1,"%.0fK docs" , .001 , 0x00cc0099,"docs indexed" }
|
||||
{GRAPH_QUANTITY,50000000.0,"docs_indexed", .1,"%.0fK docs" , .001 , 0x00cc0099,"docs indexed" }
|
||||
|
||||
|
||||
//{ "termlist_intersect",0x0000ff00},
|
||||
@ -101,6 +101,13 @@ static Label s_labels[] = {
|
||||
//{ "parm_change",0xffc0c0} // pink?
|
||||
};
|
||||
|
||||
void drawLine3 ( SafeBuf &sb ,
|
||||
long x1 ,
|
||||
long x2 ,
|
||||
long fy1 ,
|
||||
long color ,
|
||||
long width ) ;
|
||||
|
||||
Label *Statsdb::getLabel ( long labelHash ) {
|
||||
Label **label = (Label **)m_labelTable.getValue ( &labelHash );
|
||||
if ( ! label ) return NULL;
|
||||
@ -116,7 +123,7 @@ bool Statsdb::init ( ) {
|
||||
|
||||
// 20 pixel borders
|
||||
m_bx = 10;
|
||||
m_by = 30;
|
||||
m_by = 40;
|
||||
|
||||
// keep it at least at 20MB otherwise it is filling up the tree
|
||||
// constantly and dumping
|
||||
@ -477,6 +484,11 @@ bool Statsdb::makeGIF ( long t1Arg ,
|
||||
m_sb3.reset();
|
||||
m_ht3.reset();
|
||||
|
||||
// print graph in here as a bunch of divs now:
|
||||
m_gw.purge();
|
||||
m_dupTable.reset();
|
||||
m_dupTable.set(4,0,20000,NULL,0,false,0,"statstbl");
|
||||
|
||||
// . start at t1 and get stats lists, up to 1MB of stats at a time
|
||||
// . subtract 60 seconds so we can have a better shot at having
|
||||
// a moving average for the last SAMPLE points
|
||||
@ -495,6 +507,7 @@ bool Statsdb::makeGIF ( long t1Arg ,
|
||||
return true;
|
||||
|
||||
// open the file for the gif
|
||||
/*
|
||||
char fname [ 1024 ];
|
||||
sprintf ( fname , "%s/stats%li.gif" ,
|
||||
g_hostdb.m_httpRootDir , g_hostdb.m_hostId );
|
||||
@ -504,13 +517,16 @@ bool Statsdb::makeGIF ( long t1Arg ,
|
||||
fname , mstrerror(errno) );
|
||||
return true;
|
||||
}
|
||||
*/
|
||||
|
||||
return gifLoop ();
|
||||
}
|
||||
|
||||
#define POINTWIDTH 8
|
||||
|
||||
#define MAX_POINTS 6000
|
||||
#define MAX_WIDTH 6
|
||||
#define DY 900 // pixels vertical
|
||||
#define DY 600 // pixels vertical
|
||||
#define DX 1000 // pixels across
|
||||
#define MAX_LINES (DY / (MAX_WIDTH+1)) // leave free pixel above each line
|
||||
|
||||
@ -542,9 +558,9 @@ bool Statsdb::gifLoop ( ) {
|
||||
// shortcut
|
||||
Msg5 *m = &m_msg5;
|
||||
|
||||
#ifndef _USEPLOTTER_
|
||||
return true;
|
||||
#endif
|
||||
//#ifndef _USEPLOTTER_
|
||||
//return true;
|
||||
//#endif
|
||||
|
||||
// loop over all the lists in the time range, [m_t1,m_t2]
|
||||
for ( ; ! m_done ; ) {
|
||||
@ -576,53 +592,87 @@ bool Statsdb::gifLoop ( ) {
|
||||
}
|
||||
|
||||
// define time delta - commented out because it's currently not used.
|
||||
//long dt = m_t2 - m_t1;
|
||||
long dt = m_t2 - m_t1;
|
||||
|
||||
#ifdef _USEPLOTTER_
|
||||
//#ifdef _USEPLOTTER_
|
||||
|
||||
// gif size
|
||||
char tmp[64];
|
||||
//char tmp[64];
|
||||
// dimensions of the gif
|
||||
sprintf ( tmp , "%lix%li", (long)DX+m_bx*2 , (long)DY+m_by*2 );
|
||||
GIFPlotter::parampl ( "BITMAPSIZE" , (void *)tmp );
|
||||
//sprintf ( tmp , "%lix%li", (long)DX+m_bx*2 , (long)DY+m_by*2 );
|
||||
//GIFPlotter::parampl ( "BITMAPSIZE" , (void *)tmp );
|
||||
// create one
|
||||
GIFPlotter plotter ( NULL , m_fd , NULL );
|
||||
//GIFPlotter plotter ( NULL , m_fd , NULL );
|
||||
// open it
|
||||
plotter.openpl ( );
|
||||
//plotter.openpl ( );
|
||||
|
||||
// define the space with boundaries 100 unit wide boundaries
|
||||
//plotter.space ( -m_bx , -m_by , DX + m_bx , DY + m_by );
|
||||
plotter.space ( 0 , 0 , DX + m_bx * 2 , DY + m_by * 2 );
|
||||
//plotter.space ( 0 , 0 , DX + m_bx * 2 , DY + m_by * 2 );
|
||||
|
||||
// line thickness in user coordinates (pixels for us)
|
||||
plotter.linewidth ( 1 );
|
||||
//plotter.linewidth ( 1 );
|
||||
// set bg color to gray (r/g/b)
|
||||
plotter.bgcolor ( 0xd600 , 0xce00 , 0xd600 );
|
||||
// set bg color to white (r/g/b)
|
||||
//plotter.bgcolor ( 0xff00 , 0xff00 , 0xff00 );
|
||||
//plotter.bgcolor ( 0xd600 , 0xce00 , 0xd600 );
|
||||
// erase Plotter's graphics display
|
||||
plotter.erase ();
|
||||
//plotter.erase ();
|
||||
// draw axes in black
|
||||
plotter.pencolorname ("black");
|
||||
//plotter.pencolorname ("black");
|
||||
|
||||
//
|
||||
// main graphing window
|
||||
//
|
||||
m_gw.safePrintf("<div style=\"position:relative;"
|
||||
"background-color:#c0c0c0;"
|
||||
//"overflow-y:hidden;"
|
||||
"overflow-x:hidden;"
|
||||
"z-index:-10;"
|
||||
// the tick marks we print below are based on it
|
||||
// being a window of the last 20 seconds... and using
|
||||
// DX pixels
|
||||
"min-width:%lipx;"
|
||||
"min-height:%lipx;"
|
||||
//"width:100%%;"
|
||||
//"min-height:600px;"
|
||||
"margin-top:10px;"
|
||||
"margin-bottom:10px;"
|
||||
"margin-right:10px;"
|
||||
"margin-left:10px;\">"
|
||||
,(long)DX + 2 *m_bx
|
||||
,(long)DY + 2*m_by);
|
||||
|
||||
|
||||
// draw the x-axis
|
||||
plotter.line ( m_bx , m_by , DX + m_bx , m_by );
|
||||
// draw the y-axis
|
||||
plotter.line ( m_bx , m_by , m_bx , DY + m_by);
|
||||
//plotter.line ( m_bx , m_by , DX + m_bx , m_by );
|
||||
|
||||
// 10 x-axis tick marks
|
||||
for ( int x = DX/10 + m_bx ; x < DX - m_bx ; x += DX/10 ) {
|
||||
for ( int x = DX/20 ; x <= DX ; x += DX/20 ) {
|
||||
// tick mark
|
||||
plotter.line ( x , m_by - 15 , x , m_by + 15 );
|
||||
// generate label
|
||||
long xv = (long)(dt * (long long)x / (long long)DX) -(long)dt;
|
||||
char buf [ 32 ];
|
||||
// in seconds, so put "s" in there
|
||||
sprintf ( buf , "%lis" , xv );//(float)xv / 1000.0 );
|
||||
// move cursor
|
||||
plotter.move ( x , m_by - m_by / 2 - 9 );
|
||||
// plot label
|
||||
plotter.alabel ( 'c' , 'c' , buf );
|
||||
//plotter.line ( x , -20 , x , 20 );
|
||||
m_gw.safePrintf("<div style=\"position:absolute;"
|
||||
"left:%li;"
|
||||
"bottom:0;"
|
||||
"background-color:#000000;"
|
||||
"z-index:110;"
|
||||
"min-height:20px;"
|
||||
"min-width:3px;\"></div>\n"
|
||||
, m_bx + (long)x-1
|
||||
);
|
||||
long xv = (long)(dt * (long long)x/(long long)DX)-(long)dt;
|
||||
// LABEL
|
||||
m_gw.safePrintf("<div style=\"position:absolute;"
|
||||
"left:%li;"
|
||||
"bottom:20;"
|
||||
//"background-color:#000000;"
|
||||
"z-index:110;"
|
||||
"min-height:20px;"
|
||||
"min-width:3px;\">%lis</div>\n"
|
||||
, (long)x-10 + m_bx
|
||||
// the label:
|
||||
, xv
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
HashTableX tmpht;
|
||||
tmpht.set(4,0,0,NULL,0,false,m_niceness,"statsparms");
|
||||
|
||||
@ -651,7 +701,7 @@ bool Statsdb::gifLoop ( ) {
|
||||
|
||||
// . graph this single graph of this color
|
||||
// . returns ptr to first point of different color!
|
||||
plotGraph ( p , pend , gh , &plotter , zoff );
|
||||
plotGraph ( p , pend , gh , m_gw , zoff );
|
||||
// prevent collisions
|
||||
zoff += 20;
|
||||
|
||||
@ -709,7 +759,7 @@ bool Statsdb::gifLoop ( ) {
|
||||
}
|
||||
|
||||
// set the line width
|
||||
plotter.linewidth ( pp->m_thickness );
|
||||
//plotter.linewidth ( pp->m_thickness );
|
||||
|
||||
// get parm hash
|
||||
long colorHash = pp->m_parmHash;
|
||||
@ -720,9 +770,9 @@ bool Statsdb::gifLoop ( ) {
|
||||
// . is really the parm hash in disguise
|
||||
long c1 = colorHash & 0x00ffffff;
|
||||
// use the color specified from addStat_r() for this line/pt
|
||||
plotter.pencolor ( ((c1 >> 16) & 0xff) << 8 ,
|
||||
((c1 >> 8) & 0xff) << 8 ,
|
||||
((c1 >> 0) & 0xff) << 8 );
|
||||
//plotter.pencolor ( ((c1 >> 16) & 0xff) << 8 ,
|
||||
// ((c1 >> 8) & 0xff) << 8 ,
|
||||
// ((c1 >> 0) & 0xff) << 8 );
|
||||
|
||||
long x1 = pp->m_a;
|
||||
long x2 = pp->m_b;
|
||||
@ -731,9 +781,10 @@ bool Statsdb::gifLoop ( ) {
|
||||
if ( x2 < x1 + 10 ) x2 = x1 + 10;
|
||||
// . flip the y so we don't have to scroll the browser down
|
||||
// . DY does not include the axis and tick marks
|
||||
long fy1 = DY - y1 + m_by ;
|
||||
//long fy1 = DY - y1 + m_by ;
|
||||
// plot it
|
||||
plotter.line ( x1 , fy1 , x2 , fy1 );
|
||||
//plotter.line ( x1 , fy1 , x2 , fy1 );
|
||||
drawLine3 ( m_gw , x1 , x2 , y1 , c1 , pp->m_thickness );
|
||||
|
||||
// add to map key? only if we haven't already
|
||||
if ( tmpht.isInTable ( &colorHash ) ) continue;
|
||||
@ -785,12 +836,15 @@ bool Statsdb::gifLoop ( ) {
|
||||
//
|
||||
|
||||
// all done
|
||||
if ( plotter.closepl () < 0 )
|
||||
log("admin: Could not close performance graph object.");
|
||||
//if ( plotter.closepl () < 0 )
|
||||
// log("admin: Could not close performance graph object.");
|
||||
// close the file
|
||||
fclose ( m_fd );
|
||||
//fclose ( m_fd );
|
||||
|
||||
#endif
|
||||
//#endif
|
||||
|
||||
// close main graphing window
|
||||
m_gw.safePrintf("</div>\n");
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -799,15 +853,10 @@ bool Statsdb::gifLoop ( ) {
|
||||
char *Statsdb::plotGraph ( char *pstart ,
|
||||
char *pend ,
|
||||
long graphHash ,
|
||||
GIFPlotter *plotter ,
|
||||
//GIFPlotter *plotter ,
|
||||
SafeBuf &gw ,
|
||||
long zoff ) {
|
||||
|
||||
#ifndef _USEPLOTTER_
|
||||
|
||||
return NULL;
|
||||
|
||||
#else
|
||||
|
||||
// . use "graphHash" to map to unit display
|
||||
// . this is a disk read volume
|
||||
Label *label = getLabel ( graphHash );
|
||||
@ -857,20 +906,16 @@ char *Statsdb::plotGraph ( char *pstart ,
|
||||
char *retp = p;
|
||||
|
||||
// set the line width
|
||||
plotter->linewidth ( 1 );
|
||||
//plotter->linewidth ( 1 );
|
||||
|
||||
long color = label->m_color;
|
||||
|
||||
// use the color specified from addStat_r() for this line/pt
|
||||
plotter->pencolor ( ((color >> 16) & 0xff) << 8 ,
|
||||
((color >> 8) & 0xff) << 8 ,
|
||||
((color >> 0) & 0xff) << 8 );
|
||||
//plotter->pencolor ( ((color >> 16) & 0xff) << 8 ,
|
||||
// ((color >> 8) & 0xff) << 8 ,
|
||||
// ((color >> 0) & 0xff) << 8 );
|
||||
|
||||
|
||||
// how many points per pixel do we have now
|
||||
//float res = (ymax - ymin) / (float)DY;
|
||||
|
||||
|
||||
// . the minimum difference between ymax and ymin is minDiff.
|
||||
// . this prevents us from zooming in too close!
|
||||
float minDiff = (float)DY * label->m_minRes ;
|
||||
@ -896,7 +941,7 @@ char *Statsdb::plotGraph ( char *pstart ,
|
||||
|
||||
|
||||
// set the line width
|
||||
plotter->linewidth ( 2 );
|
||||
//plotter->linewidth ( 2 );
|
||||
|
||||
// reset for 2nd scan
|
||||
p = pstart;
|
||||
@ -940,8 +985,8 @@ char *Statsdb::plotGraph ( char *pstart ,
|
||||
// . flip the y so we don't have to scroll the browser down
|
||||
// . DY does not include the axis and tick marks
|
||||
// . do not flip y any more for statsdb graphs
|
||||
long fy1 = (long)(y1+.5) + m_by ;
|
||||
long fy2 = (long)(y2+.5) + m_by ;
|
||||
long fy1 = (long)(y1+.5);// + m_by ;
|
||||
long fy2 = (long)(y2+.5);// + m_by ;
|
||||
|
||||
// how are we getting -.469 for "query" point?
|
||||
if ( fy1 < 0 ) continue;
|
||||
@ -949,7 +994,10 @@ char *Statsdb::plotGraph ( char *pstart ,
|
||||
|
||||
// skip if can't make a line
|
||||
if ( firstPoint ) {
|
||||
plotter->circle ( x2 , fy2 , 2 );
|
||||
//plotter->circle ( x2 , fy2 , 2 );
|
||||
long width = POINTWIDTH;
|
||||
// draw a POINTWIDTH x POINTWIDTH box now:
|
||||
drawLine3(m_gw,x2-width/2,x2+width/2,fy2,color,width);
|
||||
firstPoint = false;
|
||||
continue;
|
||||
}
|
||||
@ -963,32 +1011,38 @@ char *Statsdb::plotGraph ( char *pstart ,
|
||||
|
||||
// plot it
|
||||
// BUT only iff not more than 5 seconds difference
|
||||
float secondsPerPixel = (m_t2-m_t1)/(float)DX;
|
||||
float dt = (x2 - x1) * secondsPerPixel;
|
||||
//float secondsPerPixel = (m_t2-m_t1)/(float)DX;
|
||||
|
||||
// avoid this for now. mdw oct 14 2013.
|
||||
//float dt = (x2 - x1) * secondsPerPixel;
|
||||
//if ( dt <= 13 || x2 - x1 <= 10 )
|
||||
// plotter->line ( x1 , fy1 , x2 , fy2 );
|
||||
|
||||
if ( dt <= 13 || x2 - x1 <= 10 )
|
||||
plotter->line ( x1 , fy1 , x2 , fy2 );
|
||||
// circle second point
|
||||
plotter->circle ( x1 , fy1 , 2 );
|
||||
plotter->circle ( x2 , fy2 , 2 );
|
||||
//plotter->circle ( x1 , fy1 , 2 );
|
||||
//plotter->circle ( x2 , fy2 , 2 );
|
||||
// draw POINTWIDTH x POINTWIDTH boxes now:
|
||||
long width = POINTWIDTH;
|
||||
drawLine3 ( m_gw,x1-width/2, x1+width/2, fy1,color, width);
|
||||
drawLine3 ( m_gw,x2-width/2, x2+width/2, fy2,color, width);
|
||||
}
|
||||
|
||||
plotter->linewidth ( 1 );
|
||||
//plotter->linewidth ( 1 );
|
||||
|
||||
// plot unit lines
|
||||
float deltaz = (ymax-ymin) / 6;
|
||||
if ( strstr(label->m_keyDesc,"latency" ) ) {
|
||||
// draw it
|
||||
drawHR ( 400.0 - 111.0 , ymin , ymax , plotter , label , zoff,0xff0000);
|
||||
drawHR ( 600.0 - 111.0 , ymin , ymax , plotter , label , zoff , color);
|
||||
drawHR ( 400.0 - 111.0 , ymin,ymax,m_gw,label,zoff,0xff0000);
|
||||
drawHR ( 600.0-111.0,ymin,ymax,m_gw,label,zoff,color);
|
||||
}
|
||||
|
||||
if ( strstr(label->m_keyDesc,"queries per sec" ) ) {
|
||||
// draw it
|
||||
//deltaz /= 2;
|
||||
//drawHR ( 120.0 , ymin , ymax , plotter , label , zoff , color );
|
||||
//drawHR ( 130.0 , ymin , ymax , plotter , label , zoff , color );
|
||||
drawHR ( 140.0 , ymin , ymax , plotter , label , zoff , color );
|
||||
//drawHR(120.0, ymin , ymax , plotter , label , zoff , color );
|
||||
//drawHR(130.0, ymin , ymax , plotter , label , zoff , color );
|
||||
drawHR ( 140.0 , ymin , ymax ,m_gw , label , zoff , color );
|
||||
}
|
||||
|
||||
|
||||
@ -996,18 +1050,19 @@ char *Statsdb::plotGraph ( char *pstart ,
|
||||
// breathe
|
||||
QUICKPOLL ( m_niceness );
|
||||
// draw it
|
||||
drawHR ( z , ymin , ymax , plotter , label , zoff , color );
|
||||
drawHR ( z , ymin , ymax , m_gw , label , zoff , color );
|
||||
}
|
||||
|
||||
return retp;
|
||||
#endif
|
||||
//#endif
|
||||
|
||||
}
|
||||
|
||||
void Statsdb::drawHR ( float z ,
|
||||
float ymin ,
|
||||
float ymax ,
|
||||
GIFPlotter *plotter ,
|
||||
//GIFPlotter *plotter ,
|
||||
SafeBuf &gw,
|
||||
Label *label ,
|
||||
float zoff ,
|
||||
long color ) {
|
||||
@ -1017,29 +1072,34 @@ void Statsdb::drawHR ( float z ,
|
||||
// avoid collisions with other graphs
|
||||
z2 += zoff;
|
||||
// border
|
||||
z2 += m_by;
|
||||
//z2 += m_by;
|
||||
// round off error
|
||||
z2 += 0.5;
|
||||
// for adjustment
|
||||
//float ptsPerPixel = (ymax-ymin)/ (float)DY;
|
||||
float ptsPerPixel = (ymax-ymin)/ (float)DY;
|
||||
// make an adjustment to the label then! -- Commented out because it's currently not used.
|
||||
//float zadj = zoff * ptsPerPixel;
|
||||
float zadj = zoff * ptsPerPixel;
|
||||
|
||||
#ifdef _USEPLOTTER_
|
||||
//#ifdef _USEPLOTTER_
|
||||
|
||||
// use the color specified from addStat_r() for this line/pt
|
||||
plotter->pencolor ( ((color >> 16) & 0xff) << 8 ,
|
||||
((color >> 8) & 0xff) << 8 ,
|
||||
((color >> 0) & 0xff) << 8 );
|
||||
//plotter->pencolor ( ((color >> 16) & 0xff) << 8 ,
|
||||
// ((color >> 8) & 0xff) << 8 ,
|
||||
// ((color >> 0) & 0xff) << 8 );
|
||||
|
||||
// horizontal line
|
||||
plotter->line ( m_bx, (long)z2 , DX + m_bx, (long)z2 );
|
||||
//plotter->line ( m_bx, (long)z2 , DX + m_bx, (long)z2 );
|
||||
long width = 1;
|
||||
drawLine3 ( m_gw, 0, DX , (long)z2,color, width);
|
||||
|
||||
|
||||
// make label
|
||||
char tmp[128];
|
||||
// . use "graphHash" to map to unit display
|
||||
// . this is a disk read volume
|
||||
sprintf(tmp,label->m_format,z +zadj);//* label->m_yscalar);
|
||||
|
||||
/*
|
||||
// a white shadow
|
||||
plotter->pencolor ( 0xffff,0xffff,0xffff );
|
||||
plotter->move ( m_bx + 80 + 2 , z2 + 10 - 2 );
|
||||
@ -1060,7 +1120,24 @@ void Statsdb::drawHR ( float z ,
|
||||
plotter->move ( m_bx + 80 , z2 + 10 );
|
||||
// plot label
|
||||
plotter->alabel ( 'c' , 'c' , tmp );
|
||||
#endif
|
||||
*/
|
||||
|
||||
// LABEL
|
||||
gw.safePrintf("<div style=\"position:absolute;"
|
||||
"left:%li;"
|
||||
"bottom:%li;"
|
||||
"color:#%lx;"
|
||||
"z-index:110;"
|
||||
"font-size:14px;"
|
||||
"min-height:20px;"
|
||||
"min-width:3px;\">%s</div>\n"
|
||||
, (long)(m_bx)
|
||||
, (long)z2 +m_by
|
||||
, color
|
||||
// the label:
|
||||
, tmp
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
void gotListWrapper ( void *state , RdbList *list, Msg5 *msg5 ) {
|
||||
@ -1289,7 +1366,7 @@ bool Statsdb::addPoint ( long x ,
|
||||
// convert x into pixel position
|
||||
float xf = (float)DX * (float)(x - m_t1) / (float)(m_t2 - m_t1);
|
||||
// round it to nearest pixel
|
||||
long x2 = (long)(xf + .5) + m_bx;
|
||||
long x2 = (long)(xf + .5) ;//+ m_bx;
|
||||
// make this our y pos
|
||||
float y2 = y;
|
||||
// average values if tied
|
||||
@ -1371,7 +1448,7 @@ bool Statsdb::addEventPoint ( long t1 ,
|
||||
// convert t1 into pixel position
|
||||
float af = (float)DX * (float)(t1 - m_t1) / (float)(m_t2 - m_t1);
|
||||
// round it to nearest pixel
|
||||
long a = (long)(af + .5) + m_bx;
|
||||
long a = (long)(af + .5) ;//+ m_bx;
|
||||
|
||||
// convert t2 into pixel position
|
||||
//float bf = (float)DX * (float)(t2 - m_t1) / (float)(m_t2 - m_t1);
|
||||
@ -1439,3 +1516,43 @@ bool Statsdb::addEventPoint ( long t1 ,
|
||||
log("stats: no room in graph for event");
|
||||
return true;
|
||||
}
|
||||
|
||||
//////////
|
||||
//
|
||||
// NEW CODE HERE
|
||||
//
|
||||
//////////
|
||||
|
||||
|
||||
// draw a HORIZONTAL line in html
|
||||
// . draw a HORIZONTAL line in html
// . appends one absolutely positioned <div> to "sb"
// . x1/x2 are the left/right ends of the line; the div is placed at
//   x1 + m_bx and is (x2 - x1) pixels wide
// . fy1 is the vertical center of the line measured up from the bottom;
//   the div's bottom edge is placed at (fy1 - width/2) + m_by
// . "color" is 0xRRGGBB, only the low 24 bits are used
// . "width" is the thickness of the line in pixels
// . a line whose (x1,x2,fy1,color,width) hash is already in m_dupTable
//   is silently skipped
void Statsdb::drawLine3 ( SafeBuf &sb ,
			  long x1 ,
			  long x2 ,
			  long fy1 ,
			  long color ,
			  long width ) {

	// do not draw repeats in the case we have a ton of points to plot
	long key32 ;
	key32 = hash32h ( x1    , 0     );
	key32 = hash32h ( x2    , key32 );
	key32 = hash32h ( fy1   , key32 );
	key32 = hash32h ( color , key32 );
	key32 = hash32h ( width , key32 );
	if ( m_dupTable.isInTable(&key32) ) return;
	m_dupTable.addKey(&key32);

	// . keep just the RRGGBB bits and always print six hex digits
	// . plain %lx drops leading zeroes so 0x0000ff00 came out as
	//   "#ff00" which is not the color that was asked for
	unsigned long rgb = (unsigned long)color & 0x00ffffffUL;

	sb.safePrintf("<div style=\"position:absolute;"
		      // give the lengths explicit units, a bare non-zero
		      // number is not a valid css length
		      "left:%lipx;"
		      "bottom:%lipx;"
		      "background-color:#%06lx;"
		      "z-index:-5;"
		      "min-height:%lipx;"
		      "min-width:%lipx;\"></div>\n"
		      , x1 + m_bx
		      , (fy1 - width/2) + m_by
		      , rgb
		      , width
		      , x2 - x1
		      );
}
|
||||
|
17
Statsdb.h
17
Statsdb.h
@ -73,13 +73,22 @@ class Statsdb {
|
||||
char *plotGraph ( char *pstart ,
|
||||
char *pend ,
|
||||
long graphHash ,
|
||||
class GIFPlotter *plotter ,
|
||||
//class GIFPlotter *plotter ,
|
||||
SafeBuf &gw,
|
||||
long zoff );
|
||||
|
||||
void drawLine3 ( SafeBuf &sb ,
|
||||
long x1 ,
|
||||
long x2 ,
|
||||
long fy1 ,
|
||||
long color ,
|
||||
long width ) ;
|
||||
|
||||
void drawHR ( float z ,
|
||||
float ymin ,
|
||||
float ymax ,
|
||||
class GIFPlotter *plotter ,
|
||||
//class GIFPlotter *plotter ,
|
||||
SafeBuf &gw,
|
||||
class Label *label ,
|
||||
float zoff ,
|
||||
long color ) ;
|
||||
@ -119,6 +128,10 @@ class Statsdb {
|
||||
RdbList m_list;
|
||||
Msg1 m_msg1;
|
||||
|
||||
// the graphing window. now a bunch of absolute divs in html
|
||||
SafeBuf m_gw;
|
||||
HashTableX m_dupTable;
|
||||
|
||||
SafeBuf m_sb0;
|
||||
SafeBuf m_sb1;
|
||||
|
||||
|
@ -1735,7 +1735,10 @@ void TcpServer::destroySocket ( TcpSocket *s ) {
|
||||
//log("tcp: closing fd=%i",sd);
|
||||
|
||||
// TODO: does this block or what?
|
||||
long cret = ::close ( sd );
|
||||
long cret = 0;
|
||||
// if sd is 0 do not really close it. seems to fix that bug.
|
||||
// 0 is the FD for stdin so i don't know how that is happening.
|
||||
if ( sd != 0 ) cret = ::close ( sd );
|
||||
if ( cret != 0 ) // == -1 )
|
||||
log("tcp: close(%li) = %li = %s",
|
||||
(long)sd,cret,mstrerror(errno));
|
||||
|
925
XmlDoc.cpp
925
XmlDoc.cpp
File diff suppressed because it is too large
Load Diff
28
XmlDoc.h
28
XmlDoc.h
@ -495,6 +495,13 @@ class XmlDoc {
|
||||
long **getIndCatIds ( ) ;
|
||||
long **getCatIds ( ) ;
|
||||
class CatRec *getCatRec ( ) ;
|
||||
|
||||
long *getNumDmozEntries() ;
|
||||
char **getDmozTitles ( ) ;
|
||||
char **getDmozSummaries ( ) ;
|
||||
char **getDmozAnchors ( ) ;
|
||||
bool setDmozInfo () ;
|
||||
|
||||
long long **getWikiDocIds ( ) ;
|
||||
void gotWikiResults ( class UdpSlot *slot );
|
||||
long *getPubDate ( ) ;
|
||||
@ -663,6 +670,8 @@ class XmlDoc {
|
||||
int8_t *getNextSpiderPriority ( ) ;
|
||||
long *getPriorityQueueNum ( ) ;
|
||||
class TagRec ***getOutlinkTagRecVector () ;
|
||||
char *hasNoIndexMetaTag();
|
||||
char *hasFakeIpsMetaTag ( );
|
||||
long **getOutlinkFirstIpVector () ;
|
||||
//char **getOutlinkIsIndexedVector () ;
|
||||
long *getRegExpNum ( long outlinkNum ) ;
|
||||
@ -678,6 +687,7 @@ class XmlDoc {
|
||||
bool getIsInjecting();
|
||||
long *getSpiderPriority ( ) ;
|
||||
long *getIndexCode ( ) ;
|
||||
long *getIndexCode2 ( ) ;
|
||||
SafeBuf *getNewTagBuf ( ) ;
|
||||
|
||||
char *updateTagdb ( ) ;
|
||||
@ -733,6 +743,7 @@ class XmlDoc {
|
||||
bool hashZipCodes ( class HashTableX *table ) ;
|
||||
bool hashMetaZip ( class HashTableX *table ) ;
|
||||
bool hashContentType ( class HashTableX *table ) ;
|
||||
bool hashDMOZCategories ( class HashTableX *table ) ;
|
||||
bool hashLinks ( class HashTableX *table ) ;
|
||||
bool hashUrl ( class HashTableX *table ) ;
|
||||
bool hashSections ( class HashTableX *table ) ;
|
||||
@ -1038,7 +1049,6 @@ class XmlDoc {
|
||||
char m_fragBufValid;
|
||||
char m_wordSpamBufValid;
|
||||
char m_finalSummaryBufValid;
|
||||
|
||||
char m_matchingQueryBufValid;
|
||||
char m_relatedQueryBufValid;
|
||||
char m_queryLinkBufValid;
|
||||
@ -1143,6 +1153,7 @@ class XmlDoc {
|
||||
bool m_dmozTitlesValid;
|
||||
bool m_dmozSummsValid;
|
||||
bool m_dmozAnchorsValid;
|
||||
bool m_dmozInfoValid;
|
||||
bool m_rawUtf8ContentValid;
|
||||
bool m_expandedUtf8ContentValid;
|
||||
bool m_utf8ContentValid;
|
||||
@ -1239,6 +1250,8 @@ class XmlDoc {
|
||||
bool m_priorityQueueNumValid;
|
||||
bool m_outlinkTagRecVectorValid;
|
||||
bool m_outlinkIpVectorValid;
|
||||
bool m_hasNoIndexMetaTagValid;
|
||||
bool m_hasUseFakeIpsMetaTagValid;
|
||||
bool m_outlinkIsIndexedVectorValid;
|
||||
bool m_isSiteRootValid;
|
||||
bool m_wasInjectedValid;
|
||||
@ -1499,8 +1512,15 @@ class XmlDoc {
|
||||
Msge0 m_msge0;
|
||||
|
||||
// this points into m_msge1 i guess
|
||||
//long *m_outlinkIpVector;
|
||||
long *m_outlinkIpVector;
|
||||
SafeBuf m_outlinkTagRecPtrBuf;
|
||||
SafeBuf m_fakeIpBuf;
|
||||
char m_hasNoIndexMetaTag;
|
||||
char m_hasUseFakeIpsMetaTag;
|
||||
Msge1 m_msge1;
|
||||
TagRec **m_outlinkTagRecVector;
|
||||
SafeBuf m_fakeTagRecPtrBuf;
|
||||
TagRec m_fakeTagRec;
|
||||
|
||||
//
|
||||
// diffbot parms for indexing diffbot's json output
|
||||
@ -1860,7 +1880,9 @@ class XmlDoc {
|
||||
char m_isErrorPage;
|
||||
char m_isHijacked;
|
||||
//char m_isVisible;
|
||||
char m_dmozBuf[12000];
|
||||
//char m_dmozBuf[12000];
|
||||
SafeBuf m_dmozBuf;
|
||||
long m_numDmozEntries;
|
||||
|
||||
// stuff
|
||||
char *m_statusMsg;
|
||||
|
414
dmozparse.cpp
414
dmozparse.cpp
@ -21,6 +21,11 @@
|
||||
// stub: ignores both arguments and always returns true
// NOTE(review): presumably here only to satisfy a symbol that the shared
// sources reference when linking this standalone tool -- confirm
bool closeAll ( void *state , void (* callback)(void *state) ) { return true; }
|
||||
// stub: always returns true
bool allExit ( ) { return true; };
|
||||
|
||||
// stub: ignores the socket and request and always returns true
bool sendPageSEO(TcpSocket *s, HttpRequest *hr) {return true;}
|
||||
|
||||
//long g_qbufNeedSave = false;
|
||||
//SafeBuf g_qbuf;
|
||||
|
||||
#define RDFBUFFER_SIZE (1024*1024*10)
|
||||
#define RDFSTRUCTURE_FILE "structure.rdf.u8"
|
||||
#define RDFCONTENT_FILE "content.rdf.u8"
|
||||
@ -167,14 +172,18 @@ char* incRdfPtr( long skip = 1 ) {
|
||||
|
||||
// parse the rdf file up past a given start tag
|
||||
long rdfParse ( char *tagName ) {
|
||||
bool inQuote = false;
|
||||
//bool inQuote = false;
|
||||
do {
|
||||
long matchPos = 0;
|
||||
// move to the next tag
|
||||
while (*rdfPtr != '<' || inQuote ) {
|
||||
// . quotes are no longer escaped out in the newer
|
||||
// dmoz files in oct 2013... so take that out. i do
|
||||
// this < is < though.. perhaps only check for
|
||||
// quotes when in a tag?
|
||||
while (*rdfPtr != '<' ) { // || inQuote ) {
|
||||
// check for quotes
|
||||
if (*rdfPtr == '"')
|
||||
inQuote = !inQuote;
|
||||
//if (*rdfPtr == '"')
|
||||
// inQuote = !inQuote;
|
||||
// next char
|
||||
if (!incRdfPtr())
|
||||
return -1;
|
||||
@ -200,12 +209,15 @@ long rdfParse ( char *tagName ) {
|
||||
|
||||
// move to the next tag in the file
|
||||
long rdfNextTag ( ) {
|
||||
bool inQuote = false;
|
||||
//bool inQuote = false;
|
||||
// move to the next tag
|
||||
while (*rdfPtr != '<' || inQuote ) {
|
||||
while (*rdfPtr != '<' ) { // || inQuote ) {
|
||||
// check for quotes
|
||||
if (*rdfPtr == '"')
|
||||
inQuote = !inQuote;
|
||||
// NO! too many unbalanced quotes all over the place!
|
||||
// and i think quotes in tags do not have < or > in them
|
||||
// because they should be encoded as &gt; and &lt;
|
||||
//if (*rdfPtr == '"')
|
||||
// inQuote = !inQuote;
|
||||
// next char
|
||||
if (!incRdfPtr())
|
||||
return -1;
|
||||
@ -395,6 +407,11 @@ long getIndexFromId ( long catid ) {
|
||||
else
|
||||
low = currCat+1;
|
||||
}
|
||||
//printf("catid %li not found. sanity checking.\n",catid);
|
||||
// sanity check our algo
|
||||
//for ( long i = 0 ; i < numRdfCats ; i++ ) {
|
||||
// if ( rdfCats[i].m_catid == catid ) { char *xx=NULL;*xx=0;}
|
||||
//}
|
||||
// not found
|
||||
return -1;
|
||||
}
|
||||
@ -518,7 +535,7 @@ bool isGoodUrl ( char *url, long urlLen ) {
|
||||
if ( urlLen <= 0 )
|
||||
return false;
|
||||
for (long i = 0; i < urlLen; i++) {
|
||||
if (is_space(url[i]))
|
||||
if (is_wspace_a(url[i]))
|
||||
return false;
|
||||
}
|
||||
// check for [prot]://[url]
|
||||
@ -546,8 +563,27 @@ long printCatPath ( char *str, long catid, bool raw ) {
|
||||
return 0;
|
||||
// get the parent
|
||||
parentId = rdfCats[catIndex].m_parentid;
|
||||
// print the parent(s) first
|
||||
if (parentId > 1) {
|
||||
|
||||
// . print the parent(s) first
|
||||
// . in NEWER DMOZ dumps, "Top" is catid 2 and catid 1 is an
|
||||
// empty title. really catid 2 is Top/World but that is an
|
||||
// error that we correct below. (see "Top/World" below).
|
||||
// but do not include the "Top/" as part of the path name
|
||||
if ( catid == 2 ) {
|
||||
// no! we now include Top as part of the path. let's
|
||||
// be consistent. i'd rather have www.gigablast.com/Top
|
||||
// and www.gigablast.com/Top/Arts etc. then i know if the
|
||||
// path starts with /Top that it is dmoz!!
|
||||
sprintf(p,"Top");
|
||||
return 3;
|
||||
}
|
||||
|
||||
if (parentId > 1 &&
|
||||
// the newer dmoz files have the catid == the parent id of
|
||||
// i guess top most categories, like "Top/Arts"... i would think
|
||||
// it should have a parentId of 1 like the old dmoz files,
|
||||
// so it's probably a bug on dmoz's end
|
||||
parentId != catid ) {
|
||||
p += printCatPath(p, parentId, raw);
|
||||
// print spacing
|
||||
if (!raw) p += sprintf(p, " / ");
|
||||
@ -621,18 +657,22 @@ long fixUrl ( char *url, long urlLen ) {
|
||||
memmove(&url[slashi-1], &url[slashi], newUrlLen - slashi);
|
||||
newUrlLen--;
|
||||
}
|
||||
if (is_space(url[slashi])) {
|
||||
if (is_wspace_a(url[slashi])) {
|
||||
memmove(&url[slashi], &url[slashi+1], newUrlLen - (slashi+1));
|
||||
newUrlLen--;
|
||||
}
|
||||
}
|
||||
// remove any anchor
|
||||
// mdw, sep 2013, no because there is twitter.com/#!/ronpaul
|
||||
// and others...
|
||||
/*
|
||||
for (long i = 0; i < newUrlLen; i++) {
|
||||
if (url[i] == '#') {
|
||||
newUrlLen = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
*/
|
||||
// remove any trailing /
|
||||
if (url[newUrlLen-1] == '/')
|
||||
newUrlLen--;
|
||||
@ -670,6 +710,38 @@ long fileWrite ( int fileid, void *buf, size_t count ) {
|
||||
return sizeWrote;
|
||||
}
|
||||
|
||||
// . write special meta tags to the file descriptor "outStream2"
// . they tell gigablast to only spider/index the links and not the links
//   of the links, b/c we only want to index the dmoz urls
// . AND to ignore any external error like ETCPTIMEDOUT when indexing a
//   dmoz url so we can be sure to index all of them under the proper
//   category so our gbcatid:xxx search works and we can replicate dmoz
//   accurately
// . see XmlDoc.cpp addOutlinksSpiderRecsToMetaList() and indexDoc() to
//   see where these meta tags come into play
// . prints an error message to stdout if the write comes up short
void writeMetaTags ( int outStream2 ) {
	// . a string literal is const, do not point a plain "char *" at it
	// . as an array its length is known at compile time too
	static const char str[] =
		"<!-- do not spider the links of the links -->\n"
		"<meta name=spiderlinkslinks content=0>\n"
		"<!--ignore tcp timeouts, dns timeouts, etc.-->\n"
		"<meta name=ignorelinksexternalerrors content=1>\n"
		"<!--do not index this document, but get links from it-->\n"
		"<meta name=noindex content=1>\n"
		// tell gigablast to not do a dns lookup on every
		// outlink when adding spiderRequests to spiderdb
		// for each outlink. will save time up front but
		// will have to be done when spidering the doc.
		"<!-- do not lookup the ip address of every outlink, "
		"but use hash of the subdomain as the ip -->\n"
		"<meta name=usefakeips content=1>\n"
		;
	// do not count the terminating \0
	const long len = (long)sizeof(str) - 1;
	if ( write ( outStream2, str , len ) != len )
		printf("Error writing to outStream2b\n");
}
|
||||
|
||||
|
||||
|
||||
|
||||
// main parser
|
||||
int main ( int argc, char *argv[] ) {
|
||||
long n;
|
||||
@ -678,7 +750,7 @@ int main ( int argc, char *argv[] ) {
|
||||
long m = 0;
|
||||
long newNameBufferSize = 0;
|
||||
long newOffset = 0;
|
||||
char filename[256];
|
||||
char filename[1256];
|
||||
long urlTxtCount = 0;
|
||||
long urlTxtFile = 0;
|
||||
Url normUrl;
|
||||
@ -695,6 +767,8 @@ int main ( int argc, char *argv[] ) {
|
||||
bool splitUrls = false;
|
||||
char mode = MODE_NONE;
|
||||
long totalNEC = 0;
|
||||
char *dir="";
|
||||
bool firstTime;
|
||||
|
||||
// check the options and mode
|
||||
for (long i = 0; i < argc; i++) {
|
||||
@ -783,20 +857,29 @@ int main ( int argc, char *argv[] ) {
|
||||
goto errExit;
|
||||
}
|
||||
|
||||
dir = "";
|
||||
|
||||
retry:
|
||||
|
||||
// open the structure file
|
||||
if ( mode == MODE_NEW || mode == MODE_CATDUMP )
|
||||
sprintf(filename, "%s", RDFSTRUCTURE_FILE);
|
||||
sprintf(filename, "%s%s", dir,RDFSTRUCTURE_FILE);
|
||||
else
|
||||
sprintf(filename, "%s.new", RDFSTRUCTURE_FILE);
|
||||
sprintf(filename, "%s%s.new", dir,RDFSTRUCTURE_FILE);
|
||||
//rdfStream.open(filename, ifstream::in);
|
||||
rdfStream = open ( filename, O_RDONLY );
|
||||
// make sure it openned okay
|
||||
// make sure it opened okay
|
||||
//if (!rdfStream.is_open()) {
|
||||
if ( rdfStream < 0 ) {
|
||||
printf("Error Openning %s\n", filename);
|
||||
// try ./catdb/ subdir if not found
|
||||
if ( ! dir[0] ) {
|
||||
dir = "./catdb/";
|
||||
goto retry;
|
||||
}
|
||||
printf("Error Opening %s\n", filename);
|
||||
goto errExit;
|
||||
}
|
||||
printf("Openned Structure File: %s\n", filename);
|
||||
printf("Opened Structure File: %s\n", filename);
|
||||
|
||||
// take the first chunk
|
||||
//rdfStream.read(rdfBuffer, RDFBUFFER_SIZE);
|
||||
@ -809,6 +892,7 @@ int main ( int argc, char *argv[] ) {
|
||||
rdfPtr = rdfBuffer;
|
||||
rdfEnd = &rdfBuffer[n];
|
||||
currOffset = 0;
|
||||
firstTime = true;
|
||||
|
||||
// read and parse the file
|
||||
printf("Parsing Topics...\n");
|
||||
@ -820,6 +904,13 @@ int main ( int argc, char *argv[] ) {
|
||||
unsigned long catOffset = currOffset - 6;
|
||||
// get the topic name, preserve it on the buffer
|
||||
long nameOffset = nameBufferLen;
|
||||
// the name inserted by this function into "nameBuffer"
|
||||
// does not seem to contain "Top/" at the beginning.
|
||||
// it is from structure.rdf.u8, but it seems to be there!
|
||||
// yeah, later on we hack the name buffer and nameOffset
|
||||
// so it is just the last word in the directory to save
|
||||
// mem. then we print out all the parent names to
|
||||
// reconstruct.
|
||||
long nameLen = fillNextString();
|
||||
if (nameLen == -1)
|
||||
goto fileEnd;
|
||||
@ -827,18 +918,48 @@ int main ( int argc, char *argv[] ) {
|
||||
printf("Out of Memory!\n");
|
||||
goto errExit1;
|
||||
}
|
||||
// fix <Topic r:id=\"\"> in the newer content.rdf.u8
|
||||
if ( nameLen == 0 ) {
|
||||
// only do this once!
|
||||
if ( ! firstTime ) {
|
||||
printf("Encountered zero length name");
|
||||
continue;
|
||||
}
|
||||
memcpy(nameBuffer+nameOffset,"Top\0",4);
|
||||
nameLen = 3;
|
||||
firstTime = false;
|
||||
}
|
||||
// html decode it
|
||||
if (nameLen > MAX_HTTP_FILENAME_LEN)
|
||||
nameLen = MAX_HTTP_FILENAME_LEN;
|
||||
nameLen = htmlDecode ( htmlDecoded,
|
||||
&nameBuffer[nameOffset],
|
||||
nameLen );
|
||||
memcpy(&nameBuffer[nameOffset], htmlDecoded, nameLen);
|
||||
nameBufferLen += nameLen;
|
||||
nameLen ,
|
||||
false,
|
||||
0);
|
||||
|
||||
// parse the catid
|
||||
long catid = parseNextCatid();
|
||||
if (catid == -1)
|
||||
goto fileEnd;
|
||||
|
||||
// crap, in the new dmoz structure.rdf.u8 catid 1 is
|
||||
// empty name and catid 2 has Topic tag "Top/World" but
|
||||
// Title tag "Top".
|
||||
// but it should probably be "Top" and not "World". There is
|
||||
// another catid 3 in structure.rdf.u8 that has
|
||||
// <Topic r:id="Top/World"> and catid 3 which is the real one,
|
||||
// so catid 2 is just "Top". this is a bug in the dmoz output
|
||||
// i think, so fix it here.
|
||||
if ( catid == 2 ) {
|
||||
nameLen = 3;
|
||||
memcpy(&nameBuffer[nameOffset],"Top",nameLen);
|
||||
nameBufferLen += nameLen;
|
||||
}
|
||||
else {
|
||||
memcpy(&nameBuffer[nameOffset], htmlDecoded, nameLen);
|
||||
nameBufferLen += nameLen;
|
||||
}
|
||||
// . fill the current cat
|
||||
// make sure there's room
|
||||
if (numRdfCats >= rdfCatsSize) {
|
||||
@ -856,6 +977,11 @@ int main ( int argc, char *argv[] ) {
|
||||
printf("Out of Memory!\n");
|
||||
goto errExit1;
|
||||
}
|
||||
// debug
|
||||
//printf("gbcat=");
|
||||
//for ( long i = 0 ; i < nameLen ; i++ )
|
||||
// printf("%c",htmlDecoded[i]);
|
||||
//printf("\n");
|
||||
// fill it
|
||||
rdfCats[numRdfCats].m_catid = catid;
|
||||
rdfCats[numRdfCats].m_parentid = 0;
|
||||
@ -923,10 +1049,16 @@ fileEnd:
|
||||
rdfEnd = &rdfBuffer[n];
|
||||
currOffset = 0;
|
||||
|
||||
//
|
||||
// set m_parentid using structure.rdf.u8
|
||||
//
|
||||
|
||||
// read and parse the file again
|
||||
printf("Building Hierarchy...\n");
|
||||
while (true) {
|
||||
// parse the next catid
|
||||
// parse the next catid in the file, sequentially
|
||||
//if ( currOffset == 545468935 )
|
||||
// printf("shit\n");
|
||||
long catid = parseNextCatid();
|
||||
if (catid == -1)
|
||||
goto fileEnd1;
|
||||
@ -977,8 +1109,18 @@ nextChildTag:
|
||||
childNameLen = MAX_HTTP_FILENAME_LEN;
|
||||
childNameLen = htmlDecode ( htmlDecoded,
|
||||
childName,
|
||||
childNameLen );
|
||||
childNameLen ,
|
||||
false,
|
||||
0);
|
||||
memcpy(childName, htmlDecoded, childNameLen);
|
||||
|
||||
// debug log
|
||||
//if ( currOffset >= 506362430 ) // 556362463
|
||||
// printf("off=%li\n",currOffset);
|
||||
// debug point
|
||||
//if ( currOffset == 545467573 )
|
||||
// printf("GOT DEBUG POINT before giant skip\n");
|
||||
|
||||
// cut off the leading label if symbolic
|
||||
// if (parentType == 2) {
|
||||
// while (*childName != ':') {
|
||||
@ -988,20 +1130,27 @@ nextChildTag:
|
||||
// childName++;
|
||||
// childNameLen--;
|
||||
// }
|
||||
// debug point
|
||||
//if (strcmp(childName,"Top/World/Català/Arts") == 0 )
|
||||
// printf("hey\n");
|
||||
// get the catid for the child
|
||||
long childid = getCatHash(childName, childNameLen);
|
||||
// get the cat for this id
|
||||
long cat = getIndexFromId(childid);
|
||||
// make sure we have a match
|
||||
if (cat == -1) {
|
||||
//printf("Warning: Child Topic Not Found: ");
|
||||
//for (long i = 0; i < childNameLen; i++)
|
||||
// printf("%c", childName[i]);
|
||||
//printf("\n");
|
||||
// debug. why does Top/World/Catala/Arts
|
||||
// not have a parent??
|
||||
printf("Warning: Child Topic Not Found: ");
|
||||
for (long i = 0; i < childNameLen; i++)
|
||||
printf("%c", childName[i]);
|
||||
printf("\n");
|
||||
m++;
|
||||
goto nextChildTag;
|
||||
}
|
||||
// assign the parent to the cat
|
||||
// . assign the parent to the cat
|
||||
// . this means we are in a "child" tag within the "catid"
|
||||
// . catid 84192
|
||||
if (parentType == 1) {
|
||||
if (rdfCats[cat].m_parentid != 0)
|
||||
printf("Warning: Overwriting Parent Id!\n");
|
||||
@ -1033,6 +1182,14 @@ fileEnd1:
|
||||
printf(" Total Topics: %li\n", numRdfCats);
|
||||
printf(" Topics with Parents: %li\n", t);
|
||||
printf(" Topics Linked but Nonexistent: %li\n", m);
|
||||
|
||||
if ( t != numRdfCats ) {
|
||||
printf("\n"
|
||||
" *Topics without parents is bad because they\n"
|
||||
" can not have their entired rawPath printed out\n"
|
||||
" in order to get their proper hash\n");
|
||||
}
|
||||
|
||||
//printf(" Number of Symbolic Links: %li\n", numSymParents);
|
||||
printf("\n");
|
||||
|
||||
@ -1066,25 +1223,45 @@ fileEnd1:
|
||||
for (long i = 0; i < numRdfCats; i++) {
|
||||
// get the hash of the path
|
||||
rawPathLen = printCatPath(rawPath, rdfCats[i].m_catid, true);
|
||||
rdfCats[i].m_catHash = hash32Lower(rawPath, rawPathLen, 0);
|
||||
// crap, this rawpath contains "Top/" in the beginning
|
||||
// but the rdfCats[i].m_nameOffset refers to a name
|
||||
// that does not include "Top/"
|
||||
rdfCats[i].m_catHash = hash32Lower_a(rawPath, rawPathLen, 0);
|
||||
// fix. so that xyz/Arts does not just hash "Arts"
|
||||
// because it has no parent...
|
||||
if ( rdfCats[i].m_parentid == 0 ) {
|
||||
printf("Missing parent for catid %li. Will be "
|
||||
"excluded from DMOZ so we avoid hash "
|
||||
"collisions.\n",rdfCats[i].m_catid);
|
||||
}
|
||||
//
|
||||
// DEBUG!
|
||||
// print this shit out to find the collisions
|
||||
//
|
||||
continue;
|
||||
printf("hash32=%lu catid=%li parentid=%li path=%s\n",
|
||||
rdfCats[i].m_catHash,
|
||||
rdfCats[i].m_catid,
|
||||
rdfCats[i].m_parentid,
|
||||
rawPath);
|
||||
}
|
||||
|
||||
// . now we want to serialize the needed data into
|
||||
// one (or more?) file(s) to be quickly read by gb
|
||||
if ( mode == MODE_NEW )
|
||||
sprintf(filename, "%s", STRUCTURE_OUTPUT_FILE);
|
||||
sprintf(filename, "%s%s", dir,STRUCTURE_OUTPUT_FILE);
|
||||
else
|
||||
sprintf(filename, "%s.new", STRUCTURE_OUTPUT_FILE);
|
||||
sprintf(filename, "%s%s.new", dir,STRUCTURE_OUTPUT_FILE);
|
||||
//outStream.open(filename, ofstream::out|ofstream::trunc);
|
||||
outStream = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
|
||||
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
|
||||
// make sure it openned okay
|
||||
// make sure it opened okay
|
||||
//if (!outStream.is_open()) {
|
||||
if ( outStream < 0 ) {
|
||||
printf("Error Openning %s\n", filename);
|
||||
printf("Error Opening %s\n", filename);
|
||||
goto errExit;
|
||||
}
|
||||
printf("\nOpenned %s for writing.\n", filename);
|
||||
printf("\nOpened %s for writing.\n", filename);
|
||||
|
||||
// write the size of the truncated name buffer
|
||||
//outStream.write((char*)&newNameBufferSize, sizeof(long));
|
||||
@ -1149,21 +1326,26 @@ contentParse:
|
||||
printf("Out of Memory!\n");
|
||||
goto errExit;
|
||||
}
|
||||
|
||||
|
||||
again:
|
||||
// open the content file
|
||||
if ( mode == MODE_NEW || mode == MODE_URLDUMP )
|
||||
sprintf(filename, "%s", RDFCONTENT_FILE);
|
||||
sprintf(filename, "%s%s", dir,RDFCONTENT_FILE);
|
||||
else
|
||||
sprintf(filename, "%s.new", RDFCONTENT_FILE);
|
||||
sprintf(filename, "%s%s.new", dir,RDFCONTENT_FILE);
|
||||
//rdfStream.open(filename, ifstream::in);
|
||||
rdfStream = open ( filename, O_RDONLY );
|
||||
// make sure it openned okay
|
||||
// make sure it opened okay
|
||||
//if (!rdfStream.is_open()) {
|
||||
if ( rdfStream < 0 ) {
|
||||
printf("Error Openning %s\n", filename);
|
||||
if ( ! dir[0] ) {
|
||||
dir = "./catdb/";
|
||||
goto again;
|
||||
}
|
||||
printf("Error Opening %s\n", filename);
|
||||
goto errExit;
|
||||
}
|
||||
printf("\nOpenned Content File: %s\n", filename);
|
||||
printf("\nOpened Content File: %s\n", filename);
|
||||
|
||||
// take the first chunk
|
||||
//rdfStream.read(rdfBuffer, RDFBUFFER_SIZE);
|
||||
@ -1184,28 +1366,32 @@ contentParse:
|
||||
// write another file for the urls
|
||||
if ( mode == MODE_URLDUMP ) {
|
||||
if (!splitUrls)
|
||||
sprintf(filename, "%s", URLTEXT_OUTPUT_FILE);
|
||||
sprintf(filename, "html/%s", URLTEXT_OUTPUT_FILE);
|
||||
else
|
||||
sprintf(filename, "%s.0", URLTEXT_OUTPUT_FILE);
|
||||
// put them directly into html/ now for
|
||||
// easy add url'ing
|
||||
sprintf(filename, "html/%s.0", URLTEXT_OUTPUT_FILE);
|
||||
}
|
||||
else {
|
||||
if (!splitUrls)
|
||||
sprintf(filename, "%s",
|
||||
sprintf(filename, "html/%s",
|
||||
DIFFURLTEXT_OUTPUT_FILE);
|
||||
else
|
||||
sprintf(filename, "%s.0",
|
||||
sprintf(filename, "html/%s.0",
|
||||
DIFFURLTEXT_OUTPUT_FILE);
|
||||
}
|
||||
//outStream2.open(filename, ofstream::out|ofstream::trunc);
|
||||
outStream2 = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
|
||||
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
|
||||
// make sure it openned okay
|
||||
// make sure it opened okay
|
||||
//if (!outStream2.is_open()) {
|
||||
if ( outStream2 < 0 ) {
|
||||
printf("Error Openning %s\n", filename);
|
||||
printf("Error Opening %s\n", filename);
|
||||
goto errExit1;
|
||||
}
|
||||
printf("Openned %s for writing.\n", filename);
|
||||
printf("Opened %s for writing.\n", filename);
|
||||
|
||||
writeMetaTags ( outStream2 );
|
||||
|
||||
// if we're doing a diffurldump, load up the diff file first
|
||||
if ( mode == MODE_DIFFURLDUMP ) {
|
||||
@ -1219,10 +1405,10 @@ contentParse:
|
||||
diffInStream = open(filename, O_RDONLY);
|
||||
//if (!diffInStream.is_open()) {
|
||||
if ( diffInStream < 0 ) {
|
||||
printf("Error Openning %s\n", filename);
|
||||
printf("Error Opening %s\n", filename);
|
||||
goto errExit;
|
||||
}
|
||||
printf("Openned Diff File: %s\n", filename);
|
||||
printf("Opened Diff File: %s\n", filename);
|
||||
|
||||
// read in the number of urls to update/add
|
||||
//diffInStream.read((char*)&numUpdateIndexes,
|
||||
@ -1318,7 +1504,7 @@ contentParse:
|
||||
printf("Completed Writing File.\n");
|
||||
// write another file for the urls
|
||||
urlTxtFile++;
|
||||
sprintf(filename, "%s.%li",
|
||||
sprintf(filename, "html/%s.%li",
|
||||
URLTEXT_OUTPUT_FILE,
|
||||
urlTxtFile);
|
||||
//outStream2.open(filename,
|
||||
@ -1326,14 +1512,14 @@ contentParse:
|
||||
outStream2 = open ( filename,
|
||||
O_CREAT|O_WRONLY|O_TRUNC,
|
||||
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
|
||||
// make sure it openned okay
|
||||
// make sure it opened okay
|
||||
//if (!outStream2.is_open()) {
|
||||
if ( outStream2 < 0 ) {
|
||||
printf("Error Openning %s\n",
|
||||
printf("Error Opening %s\n",
|
||||
filename);
|
||||
goto errExit1;
|
||||
}
|
||||
printf("Openned %s for writing.\n",
|
||||
printf("Opened %s for writing.\n",
|
||||
filename);
|
||||
urlTxtCount = 0;
|
||||
}
|
||||
@ -1348,20 +1534,20 @@ contentParse:
|
||||
}
|
||||
else {
|
||||
if ( mode == MODE_NEW )
|
||||
sprintf(filename, "%s", CONTENT_OUTPUT_FILE);
|
||||
sprintf(filename, "%s%s", dir,CONTENT_OUTPUT_FILE);
|
||||
else
|
||||
sprintf(filename, "%s.new", CONTENT_OUTPUT_FILE);
|
||||
sprintf(filename, "%s%s.new", dir,CONTENT_OUTPUT_FILE);
|
||||
// stream the urls into the content
|
||||
//outStream.open(filename, ofstream::out|ofstream::trunc);
|
||||
outStream = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
|
||||
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
|
||||
// make sure it openned okay
|
||||
// make sure it opened okay
|
||||
//if (!outStream.is_open()) {
|
||||
if ( outStream < 0 ) {
|
||||
printf("Error Openning %s\n", filename);
|
||||
printf("Error Opening %s\n", filename);
|
||||
goto errExit;
|
||||
}
|
||||
printf("Openned %s for writing.\n", filename);
|
||||
printf("Opened %s for writing.\n", filename);
|
||||
|
||||
// store a space for the number of urls at the start of the file
|
||||
//outStream.write((char*)&numUrlInfos, sizeof(long));
|
||||
@ -1371,7 +1557,7 @@ contentParse:
|
||||
goto errExit;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// read and parse the file again
|
||||
printf("Building Links...\n");
|
||||
while (true) {
|
||||
@ -1389,6 +1575,9 @@ contentParse:
|
||||
if ( mode == MODE_URLDUMP || mode == MODE_DIFFURLDUMP )
|
||||
goto nextLink;
|
||||
// . set the content offset for this cat
|
||||
// . it's missing catid 425187... why? because it had
|
||||
// a double quote in it like '4"'!! so i took out inQuotes
|
||||
// logic above.
|
||||
cat = getIndexFromId(catid);
|
||||
if (cat == -1) {
|
||||
totalNEC++;
|
||||
@ -1442,15 +1631,35 @@ hashLink:
|
||||
// html decode the url
|
||||
if (urlLen > MAX_URL_LEN)
|
||||
urlLen = MAX_URL_LEN;
|
||||
urlLen = htmlDecode(decodedUrl, &urlBuffer[urlOffset], urlLen);
|
||||
urlLen = htmlDecode(decodedUrl, &urlBuffer[urlOffset], urlLen,
|
||||
false,0);
|
||||
// debug point
|
||||
//if ( strcmp(decodedUrl,"http://twitter.com/#!/ronpaul")==0)
|
||||
// printf("hey\n");
|
||||
|
||||
// ignore any url with # in it for now like
|
||||
// http://twitter.com/#!/ronpaul because it bastardizes
|
||||
// the meaning of the # (hashtag) and we need to protest that
|
||||
if ( strchr ( decodedUrl , '#' ) )
|
||||
goto nextLink;
|
||||
|
||||
memcpy(&urlBuffer[urlOffset], decodedUrl, urlLen);
|
||||
// fix up bad urls
|
||||
urlLen = fixUrl(&urlBuffer[urlOffset], urlLen);
|
||||
if (urlLen == 0)
|
||||
goto nextLink;
|
||||
// normalize with Url
|
||||
normUrl.set(&urlBuffer[urlOffset], urlLen,
|
||||
true, false, false, true);
|
||||
// . normalize with Url
|
||||
// . watch out for
|
||||
// http://twitter.com/#!/ronpaul to http://www.twitter.com/
|
||||
// so do not strip # hashtags
|
||||
normUrl.set(&urlBuffer[urlOffset],
|
||||
urlLen,
|
||||
true, // addwww?
|
||||
false, // stripsessionid
|
||||
false, // strippound?
|
||||
true); // stripcommonfile? (i.e. index.htm)
|
||||
// debug print
|
||||
//printf("gburl %s -> %s\n",decodedUrl,normUrl.getUrl());
|
||||
// put it back
|
||||
urlLen = normUrl.getUrlLen();
|
||||
if (urlBufferLen+urlLen+10 >= urlBufferSize) {
|
||||
@ -1473,7 +1682,7 @@ hashLink:
|
||||
//urlBufferLen += urlLen;
|
||||
// get the hash value
|
||||
unsigned long long urlHash =
|
||||
hash64Lower(&urlBuffer[urlOffset], urlLen, 0);
|
||||
hash64Lower_a(&urlBuffer[urlOffset], urlLen, 0);
|
||||
//unsigned long urlHash2 =
|
||||
// hash32Lower(&urlBuffer[urlOffset], urlLen, 0);
|
||||
// see if it's already indexed
|
||||
@ -1491,6 +1700,10 @@ hashLink:
|
||||
currUrl == updateIndexes[currDiffIndex] ) {
|
||||
//outStream2.write(&urlBuffer[urlOffset],
|
||||
// urlLen);
|
||||
// print it in an anchor tag
|
||||
// now so gigablast can spider
|
||||
// these links
|
||||
write ( outStream2,"<a href=\"",9);
|
||||
if ( write ( outStream2,
|
||||
&urlBuffer[urlOffset],
|
||||
urlLen ) != urlLen ) {
|
||||
@ -1498,6 +1711,7 @@ hashLink:
|
||||
"outStream2\n");
|
||||
goto errExit1;
|
||||
}
|
||||
write ( outStream2,"\"></a>",6);
|
||||
//outStream2.write("\n", 1);
|
||||
if (write(outStream2, "\n", 1) != 1) {
|
||||
printf("Error writing to "
|
||||
@ -1518,11 +1732,11 @@ hashLink:
|
||||
// write another file for the urls
|
||||
urlTxtFile++;
|
||||
if ( mode == MODE_URLDUMP )
|
||||
sprintf(filename, "%s.%li",
|
||||
sprintf(filename, "html/%s.%li",
|
||||
URLTEXT_OUTPUT_FILE,
|
||||
urlTxtFile);
|
||||
else
|
||||
sprintf(filename, "%s.%li",
|
||||
sprintf(filename, "html/%s.%li",
|
||||
DIFFURLTEXT_OUTPUT_FILE,
|
||||
urlTxtFile);
|
||||
//outStream2.open(filename,
|
||||
@ -1530,15 +1744,16 @@ hashLink:
|
||||
outStream2 = open ( filename,
|
||||
O_CREAT|O_WRONLY|O_TRUNC,
|
||||
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
|
||||
// make sure it openned okay
|
||||
// make sure it opened okay
|
||||
//if (!outStream2.is_open()) {
|
||||
if ( outStream2 < 0 ) {
|
||||
printf("Error Openning %s\n",
|
||||
printf("Error Opening %s\n",
|
||||
filename);
|
||||
goto errExit1;
|
||||
}
|
||||
printf("Openned %s for writing.\n",
|
||||
printf("Opened %s for writing.\n",
|
||||
filename);
|
||||
writeMetaTags ( outStream2 );
|
||||
urlTxtCount = 0;
|
||||
}
|
||||
}
|
||||
@ -1634,8 +1849,17 @@ hashLink:
|
||||
long currIndex = getIndexFromId(catid);
|
||||
while (currIndex >= 0) {
|
||||
rdfCats[currIndex].m_numUrls++;
|
||||
// the new dmoz files have catids whose parents
|
||||
// are the same cat id! so stop infinite loops
|
||||
if ( rdfCats[currIndex].m_parentid ==
|
||||
rdfCats[currIndex].m_catid )
|
||||
break;
|
||||
// otherwise, make "currIndex" point to the parent
|
||||
currIndex = getIndexFromId(
|
||||
rdfCats[currIndex].m_parentid );
|
||||
// in the newer dmoz files 0 is a bad catid i guess
|
||||
// not -1 any more?
|
||||
// ??????
|
||||
}
|
||||
|
||||
goto nextLink;
|
||||
@ -1697,19 +1921,19 @@ fileEnd2:
|
||||
|
||||
// load the content and url files
|
||||
// url info (content) file
|
||||
sprintf(filename, "%s", CONTENT_OUTPUT_FILE);
|
||||
sprintf(filename, "%s%s", dir,CONTENT_OUTPUT_FILE);
|
||||
//rdfStream.open(filename, ifstream::in);
|
||||
rdfStream = open ( filename, O_RDONLY );
|
||||
//if (!rdfStream.is_open()) {
|
||||
if ( rdfStream < 0 ) {
|
||||
printf("Error Openning %s\n", CONTENT_OUTPUT_FILE);
|
||||
printf("Error Opening %s\n", filename);
|
||||
goto oldErrExit;
|
||||
}
|
||||
// read in the number of urls
|
||||
//rdfStream.read((char*)&oldNumUrls, sizeof(long));
|
||||
if (fileRead(rdfStream, &oldNumUrls, sizeof(long)) !=
|
||||
sizeof(long)) {
|
||||
printf("Error Reading %s\n", CONTENT_OUTPUT_FILE);
|
||||
printf("Error Reading %s\n", filename);
|
||||
goto oldErrExit;
|
||||
}
|
||||
|
||||
@ -1749,8 +1973,8 @@ fileEnd2:
|
||||
//rdfStream.read((char*)&urlLen, sizeof(short));
|
||||
long n = fileRead(rdfStream, &urlLen, sizeof(short));
|
||||
if ( n < 0 || n > (long)sizeof(short) ) {
|
||||
printf("Error Reading %s\n",
|
||||
CONTENT_OUTPUT_FILE);
|
||||
printf("Error Reading %s\n",filename);
|
||||
//CONTENT_OUTPUT_FILE);
|
||||
goto oldErrExit;
|
||||
}
|
||||
if ( n == 0 )
|
||||
@ -1780,8 +2004,8 @@ fileEnd2:
|
||||
}
|
||||
n = fileRead(rdfStream, &oldUrls[urlp], urlLen);
|
||||
if ( n < 0 || n > urlLen ) {
|
||||
printf("Error Reading %s\n",
|
||||
CONTENT_OUTPUT_FILE);
|
||||
printf("Error Reading %s\n",filename);
|
||||
//CONTENT_OUTPUT_FILE);
|
||||
goto oldErrExit;
|
||||
}
|
||||
if ( n == 0 )
|
||||
@ -1791,7 +2015,7 @@ fileEnd2:
|
||||
urlLen = fixUrl(&oldUrls[urlp], urlLen);
|
||||
// make the hash
|
||||
oldUrlHashes[currUrl] =
|
||||
hash64Lower(&oldUrls[urlp], urlLen, 0);
|
||||
hash64Lower_a(&oldUrls[urlp], urlLen, 0);
|
||||
removeOldUrl[currUrl] = 0;
|
||||
// increment the buffer pointer
|
||||
if (urlLen <= 0) {
|
||||
@ -1814,8 +2038,8 @@ fileEnd2:
|
||||
//rdfStream.read((char*)&oldNumCatids[currUrl], 1);
|
||||
long n = fileRead(rdfStream, &oldNumCatids[currUrl], 1);
|
||||
if ( n < 0 || n > 1 ) {
|
||||
printf("Error Reading %s\n",
|
||||
CONTENT_OUTPUT_FILE);
|
||||
printf("Error Reading %s\n",filename);
|
||||
//CONTENT_OUTPUT_FILE);
|
||||
goto oldErrExit;
|
||||
}
|
||||
if ( n == 0 )
|
||||
@ -1839,8 +2063,8 @@ fileEnd2:
|
||||
long readSize = sizeof(long)*oldNumCatids[currUrl];
|
||||
n = fileRead(rdfStream, &oldCatids[catidp], readSize);
|
||||
if ( n < 0 || n > readSize ) {
|
||||
printf("Error Reading %s\n",
|
||||
CONTENT_OUTPUT_FILE);
|
||||
printf("Error Reading %s\n",filename);
|
||||
//CONTENT_OUTPUT_FILE);
|
||||
goto oldErrExit;
|
||||
}
|
||||
if ( n == 0 )
|
||||
@ -1907,17 +2131,17 @@ oldIsDifferent:
|
||||
// also urls to remove
|
||||
//
|
||||
// open the new diff file for writing
|
||||
sprintf(filename, "%s.new.diff", CONTENT_OUTPUT_FILE);
|
||||
sprintf(filename, "%s%s.new.diff", dir,CONTENT_OUTPUT_FILE);
|
||||
//outStream.open(filename, ofstream::out|ofstream::trunc);
|
||||
outStream = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
|
||||
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
|
||||
// make sure it openned okay
|
||||
// make sure it opened okay
|
||||
//if (!outStream.is_open()) {
|
||||
if ( outStream < 0 ) {
|
||||
printf("Error Openning %s\n", filename);
|
||||
printf("Error Opening %s\n", filename);
|
||||
goto oldErrExit;
|
||||
}
|
||||
printf("\nOpenned %s for writing.\n", filename);
|
||||
printf("\nOpened %s for writing.\n", filename);
|
||||
|
||||
// write out the number of urls to update/add
|
||||
//outStream.write(&numUpdateUrls, sizeof(long));
|
||||
@ -2027,19 +2251,19 @@ oldGoodExit:
|
||||
// . now we want to serialize the needed data into
|
||||
// one (or more?) file(s) to be quickly read by gb
|
||||
if ( mode == MODE_NEW )
|
||||
sprintf(filename, "%s", STRUCTURE_OUTPUT_FILE);
|
||||
sprintf(filename, "%s%s", dir,STRUCTURE_OUTPUT_FILE);
|
||||
else
|
||||
sprintf(filename, "%s.new", STRUCTURE_OUTPUT_FILE);
|
||||
sprintf(filename, "%s%s.new", dir,STRUCTURE_OUTPUT_FILE);
|
||||
//outStream.open(filename, ofstream::out|ofstream::ate);
|
||||
outStream = open ( filename, O_WRONLY|O_APPEND,
|
||||
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
|
||||
// make sure it openned okay
|
||||
// make sure it opened okay
|
||||
//if (!outStream.is_open()) {
|
||||
if ( outStream < 0 ) {
|
||||
printf("Error Openning %s\n", filename);
|
||||
printf("Error Opening %s\n", filename);
|
||||
goto errExit;
|
||||
}
|
||||
printf("\nOpenned %s for writing.\n", filename);
|
||||
printf("\nOpened %s for writing.\n", filename);
|
||||
|
||||
// write the cats
|
||||
//outStream.write((char*)rdfCats, sizeof(RdfCat)*numRdfCats);
|
||||
@ -2109,21 +2333,21 @@ oldGoodExit:
|
||||
|
||||
// write another file for the urls
|
||||
if ( mode == MODE_NEW )
|
||||
sprintf(filename, "%s", CONTENT_OUTPUT_FILE);
|
||||
sprintf(filename, "%s%s", dir,CONTENT_OUTPUT_FILE);
|
||||
else
|
||||
sprintf(filename, "%s.new", CONTENT_OUTPUT_FILE);
|
||||
sprintf(filename, "%s%s.new", dir,CONTENT_OUTPUT_FILE);
|
||||
//outStream.open(filename, ofstream::out|ofstream::ate);
|
||||
outStream = open ( filename, O_WRONLY,
|
||||
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
|
||||
//outStream.open(filename, ofstream::out|ofstream::trunc);
|
||||
//endpos = outStream.tellp();
|
||||
// make sure it openned okay
|
||||
// make sure it opened okay
|
||||
//if (!outStream.is_open()) {
|
||||
if ( outStream < 0 ) {
|
||||
printf("Error Openning %s\n", filename);
|
||||
printf("Error Opening %s\n", filename);
|
||||
goto errExit;
|
||||
}
|
||||
printf("\nOpenned %s for writing.\n", filename);
|
||||
printf("\nOpened %s for writing.\n", filename);
|
||||
|
||||
//outStream.seekp(0);
|
||||
lseek(outStream, 0, SEEK_SET);
|
||||
|
File diff suppressed because one or more lines are too long
BIN
libplot.a
BIN
libplot.a
Binary file not shown.
BIN
libplotter.a
BIN
libplotter.a
Binary file not shown.
118
main.cpp
118
main.cpp
@ -22,7 +22,7 @@
|
||||
#include "Titledb.h"
|
||||
#include "Revdb.h"
|
||||
#include "Tagdb.h"
|
||||
//#include "Catdb.h"
|
||||
#include "Catdb.h"
|
||||
#include "Users.h"
|
||||
#include "Tfndb.h"
|
||||
#include "Spider.h"
|
||||
@ -1390,7 +1390,7 @@ int main ( int argc , char *argv[] ) {
|
||||
char structureFile[256];
|
||||
g_conf.m_maxMem = 1000000000LL; // 1G
|
||||
g_mem.m_maxMem = 1000000000LL; // 1G
|
||||
sprintf(structureFile, "%scat/gbdmoz.structure.dat", g_hostdb.m_dir);
|
||||
sprintf(structureFile, "%scatdb/gbdmoz.structure.dat", g_hostdb.m_dir);
|
||||
g_categories = &g_categories1;
|
||||
if (g_categories->loadCategories(structureFile) != 0) {
|
||||
log("cat: Loading Categories From %s Failed.", structureFile);
|
||||
@ -2633,8 +2633,8 @@ int main ( int argc , char *argv[] ) {
|
||||
if ( ! g_tagdb.init() ) {
|
||||
log("db: Tagdb init failed." ); return 1; }
|
||||
// the catdb, it's an instance of tagdb, pass RDB_CATDB
|
||||
//if ( ! g_catdb.init() ) {
|
||||
// log("db: Catdb1 init failed." ); return 1; }
|
||||
if ( ! g_catdb.init() ) {
|
||||
log("db: Catdb1 init failed." ); return 1; }
|
||||
// initialize Users
|
||||
if ( ! g_users.init() ){
|
||||
log("db: Users init failed. "); return 1;}
|
||||
@ -2842,7 +2842,7 @@ int main ( int argc , char *argv[] ) {
|
||||
|
||||
// load up the dmoz categories here
|
||||
char structureFile[256];
|
||||
sprintf(structureFile, "%scat/gbdmoz.structure.dat", g_hostdb.m_dir);
|
||||
sprintf(structureFile, "%scatdb/gbdmoz.structure.dat", g_hostdb.m_dir);
|
||||
g_categories = &g_categories1;
|
||||
if (g_categories->loadCategories(structureFile) != 0) {
|
||||
log("cat: Loading Categories From %s Failed.",
|
||||
@ -4511,8 +4511,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
|
||||
if ( h2->m_hostId == 0 ) continue;
|
||||
sprintf(tmp,
|
||||
"rcp "
|
||||
"%scat/content.rdf.u8 "
|
||||
"%s:%scat/content.rdf.u8",
|
||||
"%scatdb/content.rdf.u8 "
|
||||
"%s:%scatdb/content.rdf.u8",
|
||||
dir,
|
||||
iptoa(h2->m_ip),
|
||||
h2->m_dir);
|
||||
@ -4520,8 +4520,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
|
||||
system ( tmp );
|
||||
sprintf(tmp,
|
||||
"rcp "
|
||||
"%scat/structure.rdf.u8 "
|
||||
"%s:%scat/structure.rdf.u8",
|
||||
"%scatdb/structure.rdf.u8 "
|
||||
"%s:%scatdb/structure.rdf.u8",
|
||||
dir,
|
||||
iptoa(h2->m_ip),
|
||||
h2->m_dir);
|
||||
@ -4529,8 +4529,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
|
||||
system ( tmp );
|
||||
sprintf(tmp,
|
||||
"rcp "
|
||||
"%scat/gbdmoz.structure.dat "
|
||||
"%s:%scat/gbdmoz.structure.dat",
|
||||
"%scatdb/gbdmoz.structure.dat "
|
||||
"%s:%scatdb/gbdmoz.structure.dat",
|
||||
dir,
|
||||
iptoa(h2->m_ip),
|
||||
h2->m_dir);
|
||||
@ -4538,8 +4538,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
|
||||
system ( tmp );
|
||||
sprintf(tmp,
|
||||
"rcp "
|
||||
"%scat/gbdmoz.content.dat "
|
||||
"%s:%scat/gbdmoz.content.dat",
|
||||
"%scatdb/gbdmoz.content.dat "
|
||||
"%s:%scatdb/gbdmoz.content.dat",
|
||||
dir,
|
||||
iptoa(h2->m_ip),
|
||||
h2->m_dir);
|
||||
@ -4547,8 +4547,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
|
||||
//system ( tmp );
|
||||
//sprintf(tmp,
|
||||
// "rcp "
|
||||
// "%scat/gbdmoz.content.dat.diff "
|
||||
// "%s:%scat/gbdmoz.content.dat.diff",
|
||||
// "%scatdb/gbdmoz.content.dat.diff "
|
||||
// "%s:%scatdb/gbdmoz.content.dat.diff",
|
||||
// dir,
|
||||
// iptoa(h2->m_ip),
|
||||
// h2->m_dir);
|
||||
@ -4561,8 +4561,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
|
||||
if ( h2->m_hostId == 0 ) continue;
|
||||
sprintf(tmp,
|
||||
"rcp "
|
||||
"%scat/content.rdf.u8.new "
|
||||
"%s:%scat/content.rdf.u8.new",
|
||||
"%scatdb/content.rdf.u8.new "
|
||||
"%s:%scatdb/content.rdf.u8.new",
|
||||
dir,
|
||||
iptoa(h2->m_ip),
|
||||
h2->m_dir);
|
||||
@ -4570,8 +4570,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
|
||||
system ( tmp );
|
||||
sprintf(tmp,
|
||||
"rcp "
|
||||
"%scat/structure.rdf.u8.new "
|
||||
"%s:%scat/structure.rdf.u8.new",
|
||||
"%scatdb/structure.rdf.u8.new "
|
||||
"%s:%scatdb/structure.rdf.u8.new",
|
||||
dir,
|
||||
iptoa(h2->m_ip),
|
||||
h2->m_dir);
|
||||
@ -4579,8 +4579,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
|
||||
system ( tmp );
|
||||
sprintf(tmp,
|
||||
"rcp "
|
||||
"%scat/gbdmoz.structure.dat.new "
|
||||
"%s:%scat/gbdmoz.structure.dat.new",
|
||||
"%scatdb/gbdmoz.structure.dat.new "
|
||||
"%s:%scatdb/gbdmoz.structure.dat.new",
|
||||
dir,
|
||||
iptoa(h2->m_ip),
|
||||
h2->m_dir);
|
||||
@ -4588,8 +4588,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
|
||||
system ( tmp );
|
||||
sprintf(tmp,
|
||||
"rcp "
|
||||
"%scat/gbdmoz.content.dat.new "
|
||||
"%s:%scat/gbdmoz.content.dat.new",
|
||||
"%scatdb/gbdmoz.content.dat.new "
|
||||
"%s:%scatdb/gbdmoz.content.dat.new",
|
||||
dir,
|
||||
iptoa(h2->m_ip),
|
||||
h2->m_dir);
|
||||
@ -4597,8 +4597,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
|
||||
system ( tmp );
|
||||
sprintf(tmp,
|
||||
"rcp "
|
||||
"%scat/gbdmoz.content.dat.new.diff "
|
||||
"%s:%scat/gbdmoz.content.dat.new.diff",
|
||||
"%scatdb/gbdmoz.content.dat.new.diff "
|
||||
"%s:%scatdb/gbdmoz.content.dat.new.diff",
|
||||
dir,
|
||||
iptoa(h2->m_ip),
|
||||
h2->m_dir);
|
||||
@ -4694,8 +4694,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
|
||||
if ( h2->m_hostId == 0 ) continue;
|
||||
sprintf(tmp,
|
||||
"rcp "
|
||||
"%scat/content.rdf.u8 "
|
||||
"%s:%scat/content.rdf.u8",
|
||||
"%scatdb/content.rdf.u8 "
|
||||
"%s:%scatdb/content.rdf.u8",
|
||||
dir,
|
||||
iptoa(h2->m_ipShotgun),
|
||||
h2->m_dir);
|
||||
@ -4703,8 +4703,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
|
||||
system ( tmp );
|
||||
sprintf(tmp,
|
||||
"rcp "
|
||||
"%scat/structure.rdf.u8 "
|
||||
"%s:%scat/structure.rdf.u8",
|
||||
"%scatdb/structure.rdf.u8 "
|
||||
"%s:%scatdb/structure.rdf.u8",
|
||||
dir,
|
||||
iptoa(h2->m_ipShotgun),
|
||||
h2->m_dir);
|
||||
@ -4712,8 +4712,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
|
||||
system ( tmp );
|
||||
sprintf(tmp,
|
||||
"rcp "
|
||||
"%scat/gbdmoz.structure.dat "
|
||||
"%s:%scat/gbdmoz.structure.dat",
|
||||
"%scatdb/gbdmoz.structure.dat "
|
||||
"%s:%scatdb/gbdmoz.structure.dat",
|
||||
dir,
|
||||
iptoa(h2->m_ipShotgun),
|
||||
h2->m_dir);
|
||||
@ -4721,8 +4721,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
|
||||
system ( tmp );
|
||||
sprintf(tmp,
|
||||
"rcp "
|
||||
"%scat/gbdmoz.content.dat "
|
||||
"%s:%scat/gbdmoz.content.dat",
|
||||
"%scatdb/gbdmoz.content.dat "
|
||||
"%s:%scatdb/gbdmoz.content.dat",
|
||||
dir,
|
||||
iptoa(h2->m_ipShotgun),
|
||||
h2->m_dir);
|
||||
@ -4730,8 +4730,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
|
||||
//system ( tmp );
|
||||
//sprintf(tmp,
|
||||
// "rcp "
|
||||
// "%scat/gbdmoz.content.dat.diff "
|
||||
// "%s:%scat/gbdmoz.content.dat.diff",
|
||||
// "%scatdb/gbdmoz.content.dat.diff "
|
||||
// "%s:%scatdb/gbdmoz.content.dat.diff",
|
||||
// dir,
|
||||
// iptoa(h2->m_ip),
|
||||
// h2->m_dir);
|
||||
@ -4745,8 +4745,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
|
||||
if ( h2->m_hostId == 0 ) continue;
|
||||
sprintf(tmp,
|
||||
"rcp "
|
||||
"%scat/content.rdf.u8.new "
|
||||
"%s:%scat/content.rdf.u8.new",
|
||||
"%scatdb/content.rdf.u8.new "
|
||||
"%s:%scatdb/content.rdf.u8.new",
|
||||
dir,
|
||||
iptoa(h2->m_ipShotgun),
|
||||
h2->m_dir);
|
||||
@ -4754,8 +4754,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
|
||||
system ( tmp );
|
||||
sprintf(tmp,
|
||||
"rcp "
|
||||
"%scat/structure.rdf.u8.new "
|
||||
"%s:%scat/structure.rdf.u8.new",
|
||||
"%scatdb/structure.rdf.u8.new "
|
||||
"%s:%scatdb/structure.rdf.u8.new",
|
||||
dir,
|
||||
iptoa(h2->m_ipShotgun),
|
||||
h2->m_dir);
|
||||
@ -4763,8 +4763,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
|
||||
system ( tmp );
|
||||
sprintf(tmp,
|
||||
"rcp "
|
||||
"%scat/gbdmoz.structure.dat.new "
|
||||
"%s:%scat/gbdmoz.structure.dat.new",
|
||||
"%scatdb/gbdmoz.structure.dat.new "
|
||||
"%s:%scatdb/gbdmoz.structure.dat.new",
|
||||
dir,
|
||||
iptoa(h2->m_ipShotgun),
|
||||
h2->m_dir);
|
||||
@ -4772,8 +4772,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
|
||||
system ( tmp );
|
||||
sprintf(tmp,
|
||||
"rcp "
|
||||
"%scat/gbdmoz.content.dat.new "
|
||||
"%s:%scat/gbdmoz.content.dat.new",
|
||||
"%scatdb/gbdmoz.content.dat.new "
|
||||
"%s:%scatdb/gbdmoz.content.dat.new",
|
||||
dir,
|
||||
iptoa(h2->m_ipShotgun),
|
||||
h2->m_dir);
|
||||
@ -4781,8 +4781,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
|
||||
system ( tmp );
|
||||
sprintf(tmp,
|
||||
"rcp "
|
||||
"%scat/gbdmoz.content.dat.new.diff "
|
||||
"%s:%scat/gbdmoz.content.dat.new.diff",
|
||||
"%scatdb/gbdmoz.content.dat.new.diff "
|
||||
"%s:%scatdb/gbdmoz.content.dat.new.diff",
|
||||
dir,
|
||||
iptoa(h2->m_ipShotgun),
|
||||
h2->m_dir);
|
||||
@ -11036,7 +11036,8 @@ void dumpTagdb (char *coll,long startFileNum,long numFiles,bool includeTree,
|
||||
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
|
||||
g_tagdb.init ();
|
||||
g_collectiondb.init(true);
|
||||
g_tagdb.addColl ( coll, false );
|
||||
if ( rdbId == RDB_TAGDB ) g_tagdb.addColl ( coll, false );
|
||||
if ( rdbId == RDB_CATDB ) g_catdb.init();
|
||||
key128_t startKey ;
|
||||
key128_t endKey ;
|
||||
startKey.setMin();
|
||||
@ -11101,6 +11102,23 @@ void dumpTagdb (char *coll,long startFileNum,long numFiles,bool includeTree,
|
||||
printf("corrupt tagdb rec k.n0=%llu",k.n0);
|
||||
continue;
|
||||
}
|
||||
// catdb?
|
||||
if ( rdbId == RDB_CATDB ) {
|
||||
// for debug!
|
||||
CatRec crec;
|
||||
crec.set ( NULL,
|
||||
data ,
|
||||
size ,
|
||||
false);
|
||||
fprintf(stdout,
|
||||
"key=%s caturl=%s #catids=%li version=%li\n"
|
||||
,KEYSTR(&k,12)
|
||||
,crec.m_url
|
||||
,(long)crec.m_numCatids
|
||||
,(long)crec.m_version
|
||||
);
|
||||
continue;
|
||||
}
|
||||
// parse it up
|
||||
//TagRec *tagRec = (TagRec *)rec;
|
||||
Tag *tag = (Tag *)rec;
|
||||
@ -13997,10 +14015,10 @@ void saveRdbs ( int fd , void *state ) {
|
||||
last = rdb->getLastWriteTime();
|
||||
if ( now - last > delta )
|
||||
if ( ! rdb->close(NULL,NULL,false,false)) return;
|
||||
//rdb = g_catdb.getRdb();
|
||||
//last = rdb->getLastWriteTime();
|
||||
//if ( now - last > delta )
|
||||
// if ( ! rdb->close(NULL,NULL,false,false)) return;
|
||||
rdb = g_catdb.getRdb();
|
||||
last = rdb->getLastWriteTime();
|
||||
if ( now - last > delta )
|
||||
if ( ! rdb->close(NULL,NULL,false,false)) return;
|
||||
//rdb = g_indexdb.getRdb();
|
||||
//last = rdb->getLastWriteTime();
|
||||
//if ( now - last > delta )
|
||||
|
110
matches2.cpp
110
matches2.cpp
@ -6,7 +6,8 @@
|
||||
#include "HashTableT.h"
|
||||
|
||||
//make the key, it is just the needles ptr
|
||||
static HashTableT<unsigned long long , char*> s_quickTables;
|
||||
//static HashTableT<unsigned long long , char*> s_quickTables;
|
||||
static HashTableX s_quickTables;
|
||||
|
||||
/*
|
||||
// returns false and sets g_errno on error
|
||||
@ -63,6 +64,9 @@ bool fast_highlight ( // highlight these query terms:
|
||||
// to lower and store into tmp[]. TODO.
|
||||
// . a space (includes \r \n) in a needle will match a consecutive sequence
|
||||
// of spaces in the haystack
|
||||
|
||||
#define BITVEC unsigned long long
|
||||
|
||||
char *getMatches2 ( Needle *needles ,
|
||||
long numNeedles ,
|
||||
char *haystack ,
|
||||
@ -108,51 +112,69 @@ char *getMatches2 ( Needle *needles ,
|
||||
// . TODO: use a static cache of like 4 of these tables where the key
|
||||
// is the Needles ptr ... done
|
||||
long numNeedlesToInit = numNeedles;
|
||||
char space[256 * 5 * sizeof(unsigned long)];
|
||||
char space[256 * 6 * sizeof(BITVEC)];
|
||||
char *buf = NULL;
|
||||
|
||||
unsigned long *s0;
|
||||
unsigned long *s1;
|
||||
unsigned long *s2;
|
||||
unsigned long *s3;
|
||||
unsigned long *s4;
|
||||
BITVEC *s0;
|
||||
BITVEC *s1;
|
||||
BITVEC *s2;
|
||||
BITVEC *s3;
|
||||
BITVEC *s4;
|
||||
BITVEC *s5;
|
||||
|
||||
/*
|
||||
static bool s_quickTableInit = false;
|
||||
static char s_qtbuf[128*(12+1)*2];
|
||||
|
||||
long slot = -1;
|
||||
if(saveQuickTables) {
|
||||
uint64_t key = (uint32_t)needles;
|
||||
long slot = s_quickTables.getSlot(key);
|
||||
if(slot == -1) {
|
||||
buf = (char*)mcalloc(sizeof(unsigned long)*256*5,
|
||||
"matches");
|
||||
if(buf) s_quickTables.addKey(key, buf);
|
||||
//sanity check, no reason why there needs to be a
|
||||
//limit, I just don't expect there to be this many
|
||||
//static needles at this point.
|
||||
if(s_quickTables.getNumSlotsUsed() > 32){
|
||||
char *xx=NULL; *xx = 0;
|
||||
}
|
||||
if ( ! s_quickTableInit ) {
|
||||
s_quickTableInit = true;
|
||||
s_quickTables.set(8,4,128,s_qtbuf,256*13,false,0,"qx");
|
||||
}
|
||||
else {
|
||||
uint64_t key = (uint32_t)needles;
|
||||
slot = s_quickTables.getSlot(&key);
|
||||
if ( slot >= 0 ) {
|
||||
buf = s_quickTables.getValueFromSlot(slot);
|
||||
numNeedlesToInit = 0;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
if(!buf) {
|
||||
buf = space;
|
||||
memset ( buf , 0 , sizeof(unsigned long)*256*5);
|
||||
memset ( buf , 0 , sizeof(BITVEC)*256*6);
|
||||
}
|
||||
|
||||
long offset = 0;
|
||||
s0 = (unsigned long*)(buf + offset);
|
||||
offset += sizeof(unsigned long)*256;
|
||||
s1 = (unsigned long*)(buf + offset);
|
||||
offset += sizeof(unsigned long)*256;
|
||||
s2 = (unsigned long*)(buf + offset);
|
||||
offset += sizeof(unsigned long)*256;
|
||||
s3 = (unsigned long*)(buf + offset);
|
||||
offset += sizeof(unsigned long)*256;
|
||||
s4 = (unsigned long*)(buf + offset);
|
||||
/*
|
||||
if( useQuickTables && slot == -1 ) {
|
||||
//buf = (char*)mcalloc(sizeof(unsigned long)*256*5,
|
||||
// "matches");
|
||||
if(buf) s_quickTables.addKey(&key, &buf);
|
||||
//sanity check, no reason why there needs to be a
|
||||
//limit, I just don't expect there to be this many
|
||||
//static needles at this point.
|
||||
if(s_quickTables.getNumSlotsUsed() > 32){
|
||||
char *xx=NULL; *xx = 0;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
unsigned long mask;
|
||||
// try 64 bit bit vectors now since we doubled # of needles
|
||||
long offset = 0;
|
||||
s0 = (BITVEC *)(buf + offset);
|
||||
offset += sizeof(BITVEC)*256;
|
||||
s1 = (BITVEC *)(buf + offset);
|
||||
offset += sizeof(BITVEC)*256;
|
||||
s2 = (BITVEC *)(buf + offset);
|
||||
offset += sizeof(BITVEC)*256;
|
||||
s3 = (BITVEC *)(buf + offset);
|
||||
offset += sizeof(BITVEC)*256;
|
||||
s4 = (BITVEC *)(buf + offset);
|
||||
offset += sizeof(BITVEC)*256;
|
||||
s5 = (BITVEC *)(buf + offset);
|
||||
|
||||
BITVEC mask;
|
||||
|
||||
// set the letter tables, s0[] through sN[], for each needle
|
||||
for ( long i = 0 ; i < numNeedlesToInit ; i++ ) {
|
||||
@ -160,7 +182,8 @@ char *getMatches2 ( Needle *needles ,
|
||||
QUICKPOLL(niceness);
|
||||
unsigned char *w = (unsigned char *)needles[i].m_string;
|
||||
unsigned char *wend = w + needles[i].m_stringSize;
|
||||
mask = (1<<(i&0x1f)); // (1<<(i%32));
|
||||
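// BITVEC is now 64 bits. NOTE(review): the literal 1 in the shift below is an int, so shifting it by 32..63 is undefined; it likely needs to be widened to BITVEC before shifting -- confirm.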
|
||||
mask = (1<<(i&0x3f)); // (1<<(i%64));
|
||||
// if the needle is small, fill up the remaining letter tables
|
||||
// with its mask... so it matches any character in haystack.
|
||||
s0[(unsigned char)to_lower_a(*w)] |= mask;
|
||||
@ -172,6 +195,7 @@ char *getMatches2 ( Needle *needles ,
|
||||
s2[j] |= mask;
|
||||
s3[j] |= mask;
|
||||
s4[j] |= mask;
|
||||
s5[j] |= mask;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
@ -184,6 +208,7 @@ char *getMatches2 ( Needle *needles ,
|
||||
s2[j] |= mask;
|
||||
s3[j] |= mask;
|
||||
s4[j] |= mask;
|
||||
s5[j] |= mask;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
@ -195,6 +220,7 @@ char *getMatches2 ( Needle *needles ,
|
||||
for ( long j = 0 ; j < 256 ; j++ ) {
|
||||
s3[j] |= mask;
|
||||
s4[j] |= mask;
|
||||
s5[j] |= mask;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
@ -206,12 +232,24 @@ char *getMatches2 ( Needle *needles ,
|
||||
if ( w >= wend ) {
|
||||
for ( long j = 0 ; j < 256 ; j++ ) {
|
||||
s4[j] |= mask;
|
||||
s5[j] |= mask;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
s4[(unsigned char)to_lower_a(*w)] |= mask;
|
||||
s4[(unsigned char)to_upper_a(*w)] |= mask;
|
||||
w += 1;//step;
|
||||
|
||||
if ( w >= wend ) {
|
||||
for ( long j = 0 ; j < 256 ; j++ ) {
|
||||
s5[j] |= mask;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
s5[(unsigned char)to_lower_a(*w)] |= mask;
|
||||
s5[(unsigned char)to_upper_a(*w)] |= mask;
|
||||
w += 1;//step;
|
||||
|
||||
}
|
||||
|
||||
// return a ptr to the first match if we should, this is it
|
||||
@ -245,6 +283,8 @@ char *getMatches2 ( Needle *needles ,
|
||||
if ( ! mask ) continue;
|
||||
mask &= s4[*(p+4)];
|
||||
if ( ! mask ) continue;
|
||||
mask &= s5[*(p+5)];
|
||||
if ( ! mask ) continue;
|
||||
//debugCount++;
|
||||
/*
|
||||
// display
|
||||
@ -273,7 +313,7 @@ char *getMatches2 ( Needle *needles ,
|
||||
// we got a good candidate, loop through all the needles
|
||||
for ( long j = 0 ; j < numNeedles ; j++ ) {
|
||||
// skip if does not match mask, will save time
|
||||
if ( ! ((1<<(j&0x1f)) & mask) ) continue;
|
||||
if ( ! ((1<<(j&0x3f)) & mask) ) continue;
|
||||
if( needles[j].m_stringSize > 3) {
|
||||
// ensure first 4 bytes matches this needle's
|
||||
if (needles[j].m_string[0]!=to_lower_a(*(p+0)))
|
||||
@ -421,7 +461,7 @@ char *getMatches2 ( Needle *needles ,
|
||||
// we got a good candidate, loop through all the needles
|
||||
for ( long j = 0 ; j < numNeedles ; j++ ) {
|
||||
// skip if does not match mask, will save time
|
||||
if ( ! ((1<<(j&0x1f)) & mask) ) continue;
|
||||
if ( ! ((1<<(j&0x3f)) & mask) ) continue;
|
||||
if( needles[j].m_stringSize > 3) {
|
||||
// ensure first 4 bytes matches this needle's
|
||||
if (needles[j].m_string[0]!=to_lower_a(*(p+0)))
|
||||
|
@ -16,8 +16,10 @@
|
||||
// Stub that always reports success; the "urgent" flag is ignored.
// NOTE(review): presumably exists only to satisfy a symbol referenced by
// linked-in code -- confirm.
bool mainShutdown ( bool urgent ) {
	(void)urgent;
	return true;
}
|
||||
// Stub that reports success without doing any work: neither "state" nor
// "callback" is used, and the callback is never invoked.
bool closeAll ( void *state , void (* callback)(void *state) ) {
	(void)state;
	(void)callback;
	return true;
}
|
||||
// Stub that unconditionally returns true.
bool allExit ( ) {
	return true;
}
|
||||
long g_qbufNeedSave = false;
|
||||
SafeBuf g_qbuf;
|
||||
//long g_qbufNeedSave = false;
|
||||
//SafeBuf g_qbuf;
|
||||
// Stub that reports success without touching the socket or the request.
bool sendPageSEO(class TcpSocket *s, class HttpRequest *hr) {
	(void)s;
	(void)hr;
	return true;
}
|
||||
|
||||
|
||||
int main ( int argc , char *argv[] ) {
|
||||
bool addWWW = true;
|
||||
|
Reference in New Issue
Block a user