Merge branch 'master' into diffbot

Conflicts:
	Hostdb.cpp
	Makefile
	PageResults.cpp
	PageRoot.cpp
	Pages.cpp
	Rdb.cpp
	SearchInput.cpp
	SearchInput.h
	Spider.cpp
	Spider.h
	XmlDoc.cpp
This commit is contained in:
Matt Wells
2013-10-16 14:28:42 -07:00
59 changed files with 3678 additions and 3262 deletions

@ -198,6 +198,7 @@ bool CatRec::set ( Url *url , char *data , long dataSize , bool gotByIp ) {
log ( "tagdb: Deserialized datasize %i != %li for url %s so "
"ignoring tagdb record.",
p - m_data, m_dataSize , url->getUrl() );
return false;
char *xx = NULL; *xx = 0;
}
@ -308,7 +309,9 @@ bool CatRec::set ( Url *site ,
// add the ids
m_catids = (long*)p;
memcpy(p, catids, 4*m_numCatids);
p += 4*m_numCatids;
// skip over "numCatids" NOT m_numCatids which is TRUNCATED
// to MAX_CATIDS
p += 4*numCatids;
//}
// point to the filenum so we can mod it!
//m_filenumPtr = p;

@ -29,7 +29,11 @@ bool Catdb::init ( ) {
// . what's max # of tree nodes?
// . assume avg tagdb rec size (siteUrl) is about 82 bytes we get:
// . NOTE: 32 bytes of the 82 are overhead
long treeMem = g_conf.m_catdbMaxTreeMem;
//long treeMem = g_conf.m_catdbMaxTreeMem;
// speed up gen catdb, use 15MB. later maybe once gen is complete
// we can free this tree or something...
// TODO!
long treeMem = 15000000;
//long treeMem = 100000000;
//long maxTreeNodes = g_conf.m_catdbMaxTreeMem / 82;
long maxTreeNodes = treeMem / 82;
@ -51,14 +55,14 @@ bool Catdb::init ( ) {
// . initialize our own internal rdb
// . i no longer use cache so changes to tagdb are instant
// . we still use page cache however, which is good enough!
if ( this == &g_catdb )
return m_rdb.init ( g_hostdb.m_dir ,
//if ( this == &g_catdb )
if ( ! m_rdb.init ( g_hostdb.m_dir ,
"catdb" ,
true , // dedup same keys?
-1 , // fixed record size
//g_hostdb.m_groupMask ,
//g_hostdb.m_groupId ,
g_conf.m_catdbMinFilesToMerge ,
2,//g_conf.m_catdbMinFilesToMerge ,
treeMem ,//g_conf.m_catdbMaxTreeMem ,
maxTreeNodes ,
// now we balance so Sync.cpp can ordered huge list
@ -70,9 +74,17 @@ bool Catdb::init ( ) {
&m_pc ,
false,
false,
12,
12, // keysize
false,
true ); // is collectionless?
true )) // is collectionless?
return false;
// normally Collectiondb.addColl() will call Rdb::addColl() which
// will init the CollectionRec::m_rdbBase, which is what
// Rdb::getBase(collnum_t) will return. however, for collectionless
// rdb databases we set Rdb::m_collectionlessBase special here.
// This is in Rdb.cpp::init() now.
//return m_rdb.addColl ( NULL );
return true;
}
@ -119,7 +131,7 @@ bool Catdb::verify ( char *coll ) {
g_threads.disableThreads();
Msg5 msg5;
Msg5 msg5b;
//Msg5 msg5b;
RdbList list;
key_t startKey;
key_t endKey;
@ -128,7 +140,7 @@ bool Catdb::verify ( char *coll ) {
//long minRecSizes = 64000;
if ( ! msg5.getList ( RDB_CATDB ,
coll ,
"",//coll ,
&list ,
startKey ,
endKey ,
@ -147,7 +159,7 @@ bool Catdb::verify ( char *coll ) {
-1 ,
true ,
-1LL ,
&msg5b ,
NULL,//&msg5b ,
true )) {
g_threads.enableThreads();
return log("db: HEY! it did not block");
@ -311,6 +323,19 @@ void Catdb::listSearch ( RdbList *list,
// for small lists, just loop through the list
if (list->getListSize() < 16*1024) {
while ( ! list->isExhausted() ) {
// for debug!
/*
CatRec crec;
crec.set ( NULL,
list->getCurrentData(),
list->getCurrentDataSize(),
false);
log("catdb: caturl=%s #catid=%li version=%li"
,crec.m_url
,(long)crec.m_numCatids
,(long)crec.m_version
);
*/
// check the current key
if ( list->getCurrentKey() != exactKey ) {
// miss, next

@ -50,6 +50,7 @@ void Categories::reset() {
}
}
// filename usually ./catdb/gbdmoz.structure.dat
long Categories::loadCategories ( char *filename ) {
//ifstream inStream;
int inStream;
@ -69,6 +70,7 @@ long Categories::loadCategories ( char *filename ) {
return 1;
}
// read in the number of cats
// filename usually ./catdb/gbdmoz.structure.dat
if ( fileRead ( inStream, &m_numCats, sizeof(long) ) != sizeof(long) ) {
log("cat: Error reading structure file: %s", filename);
close(inStream);
@ -114,7 +116,8 @@ long Categories::loadCategories ( char *filename ) {
g_errno = ENOMEM;
return 1;
}
// read the rest of the file into the temp buffer
// . read the rest of the file into the temp buffer
// . filename usually ./catdb/gbdmoz.structure.dat
if ( fileRead ( inStream, tempBuffer, readSize ) != readSize ) {
log("cat: Error reading structure file: %s", filename);
close(inStream);
@ -212,6 +215,15 @@ long Categories::loadCategories ( char *filename ) {
long long start = gettimeofdayInMilliseconds();
// sort the category hash by hash value
gbsort(m_catHash, m_numCats, sizeof(CategoryHash), sortCatHash);
// sanity check - no dups allowed
unsigned long last = 0xffffffff;
for ( long i = 0 ; i < m_numCats ; i++ ) {
if ( m_catHash[i].m_hash == last )
log("dmoz: hash collision on %lu",last);
last = m_catHash[i].m_hash;
}
// time it
long long took = gettimeofdayInMilliseconds();
if ( took - start > 100 ) log(LOG_INIT,"admin: Took %lli ms to "
@ -327,9 +339,15 @@ long Categories::getIndexFromPath ( char *str, long strLen ) {
// check for top
if (strLen == 3 &&
strncasecmp(str, "Top", 3) == 0)
// it is catid 2 right? but i guess zero is symbolic for us!
return 0;
// get the hash
unsigned long hash = hash32Lower_a(str, strLen, 0);
// debug
//char c = str[strLen];
//str[strLen] = '\0';
//log("dmoz: looking up hash %lu for %s",hash,str);
//str[strLen] = c;
// binary search
while (low <= high) {
// next check spot
@ -349,6 +367,7 @@ long Categories::getIndexFromPath ( char *str, long strLen ) {
// return the catid from the given path
long Categories::getIdFromPath ( char *str, long strLen ) {
if ( ! m_cats ) return -1;
long index = getIndexFromPath(str, strLen);
return m_cats[index].m_catid;
}
@ -497,7 +516,7 @@ void Categories::printPathFromId ( SafeBuf *sb ,
long catIndex;
// get the index
catIndex = getIndexFromId(catid);
if (catIndex < 1) return;
//if (catIndex < 1) return;
printPathFromIndex(sb, catIndex, raw, isRTL);
}
@ -509,8 +528,22 @@ void Categories::printPathFromIndex ( SafeBuf *sb ,
if (catIndex < 1) return;
// get the parent
parentId = m_cats[catIndex].m_parentid;
// print the parent(s) first
if (parentId > 1) {
long catid = m_cats[catIndex].m_catid;
// include Top now. in newer dmoz it is catid2.
//if ( catid == 2 ) {
// sb->safePrintf("Top");
// return;
//}
// . print the parent(s) first
// . the new dmoz data dumps signify a parentless topic by
// having its parentid equal its catid, so avoid infinite
// loops by checking for that here now. mdw oct 2013.
// . the new DMOZ has Top as catid 2 now, even though it is
// mistakenly labelled as Top/World, which is really catid 3.
// so make this parentId > 2...
if (parentId >= 1 && parentId != catid ) {
bool isParentRTL = isIdRTLStart(parentId);
// print spacing here if RTL
//if (isRTL && !raw)
@ -558,7 +591,7 @@ void Categories::printPathCrumbFromId ( SafeBuf *sb ,
long catIndex;
// get the index
catIndex = getIndexFromId(catid);
if (catIndex < 1) return;
//if (catIndex < 1) return;
printPathCrumbFromIndex(sb, catIndex, isRTL);
}
@ -569,8 +602,20 @@ void Categories::printPathCrumbFromIndex ( SafeBuf *sb,
if (catIndex < 1) return;
// get the parent
parentId = m_cats[catIndex].m_parentid;
// print the parent(s) first
if (parentId > 1) {
long catid = m_cats[catIndex].m_catid;
// include Top now. in newer dmoz it is catid2.
// seems to already be included below... because you made it
// parentId>1 not parentId>2
//if ( catid == 2 ) {
// sb->safePrintf("Top");
// return;
//}
// . print the parent(s) first
// . the new dmoz has Top as parentid 2 now, and Top/World is
// catid 3. so make this parentId > 2 not parentId > 1
if (parentId > 1 && parentId != catid ) {
bool isParentRTL = isIdRTLStart(parentId);
printPathCrumbFromId(sb, parentId, isRTL);
// print a spacing
@ -793,7 +838,7 @@ long Categories::fixUrl ( char *url, long urlLen ) {
return newUrlLen;
}
bool Categories::addUrlsToBadHashTable ( long catid ) {
bool Categories::addUrlsToBadHashTable ( long catid ) {
return getTitleAndSummary ( NULL , // urlorig
0 , // urloriglen
catid ,
@ -810,6 +855,183 @@ long Categories::fixUrl ( char *url, long urlLen ) {
true );// just add to table
}
// just show the urls in dmoz
bool Categories::printUrlsInTopic ( SafeBuf *sb, long catid ) {
long catIndex;
unsigned long fileOffset;
unsigned long n;
char* p;
unsigned long readSize;
char title[1024];
char summ[5000];
long maxTitleLen = 1024;
long maxSummLen = 5000;
long titleLen;
long summLen;
long urlStrLen;
char urlStr[MAX_URL_LEN];
long niceness = 0;
bool printedStart = false;
// lookup the index for this catid
catIndex = getIndexFromId(catid);
if (catIndex < 0)
goto errEnd;
// get the file offset
fileOffset = m_cats[catIndex].m_contentOffset;
QUICKPOLL( niceness );
// . open the file
char filename[512];
sprintf(filename, "%scatdb/%s", g_hostdb.m_dir, RDFCONTENT_FILE);
m_rdfStream = open(filename, O_RDONLY | O_NONBLOCK);
if ( m_rdfStream < 0 ) {
log("cat: Error Opening %s\n", filename);
goto errEnd;
}
// . seek to the offset
n = lseek ( m_rdfStream, fileOffset, SEEK_SET );
if ( n != fileOffset ) {
log("cat: Error seeking to Content Offset %li", fileOffset);
goto errEnd;
}
// . read in a chunk
m_rdfBuffer = m_rdfSmallBuffer;
m_rdfBufferSize = RDFSMALLBUFFER_SIZE;
p = m_rdfBuffer;
readSize = m_rdfBufferSize;
readLoop:
n = read ( m_rdfStream, p, readSize );
if(n > 0 && n != readSize) {
p += n;
readSize -= n;
}
//log(LOG_WARN,"build: reading %li bytes out of %li",n,m_rdfBufferSize);
QUICKPOLL(niceness);
if(n < 0 && errno == EAGAIN) goto readLoop;
if ( n <= 0 || n > (unsigned long)m_rdfBufferSize ) {
log("cat: Error Reading Content");
goto errEnd;
}
m_rdfPtr = m_rdfBuffer;
m_rdfEnd = &m_rdfBuffer[n];
m_currOffset = fileOffset;
// . parse to the correct url
// parse the first topic and catid
if (rdfNextTag() < 0)
goto errEnd;
if (rdfNextTag() < 0)
goto errEnd;
// parse until "ExternalPage"
nextTag:
QUICKPOLL((niceness));
if (rdfNextTag() < 0)
goto errEnd;
// check for catid of next topic to stop looking
if (m_tagLen == 5 &&
strncmp(m_tagRecfer, "catid", 5) == 0)
goto errEnd;
if (m_tagLen != 12 ) goto nextTag;
if ( strncmp(m_tagRecfer, "ExternalPage", 12) != 0) goto nextTag;
//
// got one
//
// get the next string
urlStrLen = fillNextString(urlStr, MAX_URL_LEN-1);
if (urlStrLen < 0)
goto errEnd;
// html decode the url
/*
urlStrLen = htmlDecode(decodedUrl, urlStr, urlStrLen,false,
niceness);
memcpy(urlStr, decodedUrl, urlStrLen);
normUrl.set(urlStr, urlStrLen, true);
g_catdb.normalizeUrl(&normUrl, &normUrl);
// copy it back
urlStrLen = normUrl.getUrlLen();
memcpy(urlStr, normUrl.getUrl(), urlStrLen);
// make sure there's a trailing / on root urls
// and no www.
//urlStrLen = fixUrl(urlStr, urlStrLen);
// check for an anchor
urlAnchor = NULL;
urlAnchorLen = 0;
//for (long i = 0; i < urlStrLen; i++) {
//if (urlStr[i] == '#') {
if (normUrl.getAnchorLen() > 0) {
//urlAnchor = &urlStr[i];
//urlAnchorLen = urlStrLen - i;
//urlStrLen = i;
urlAnchor = normUrl.getAnchor();
urlAnchorLen = normUrl.getAnchorLen();
//break;
}
*/
// . parse out the title
if (rdfParse("d:Title") < 0)
goto errEnd;
titleLen = fillNextTagBody(title, maxTitleLen);
QUICKPOLL(niceness);
// . parse out the summary
if (rdfParse("d:Description") < 0)
goto errEnd;
summLen = fillNextTagBody(summ, maxSummLen);
if ( ! printedStart ) {
printedStart = true;
sb->safePrintf("<ul>");
}
// print it out
sb->safePrintf("<li><a href=\"");
sb->safeMemcpy ( urlStr , urlStrLen );
sb->safePrintf("\">");
sb->safeMemcpy ( title , titleLen );
sb->safePrintf("</a><br>");
sb->safeMemcpy( summ, summLen );
sb->safePrintf("<br>");//<br>");
/*
// . fill the anchor
if (anchor) {
if (urlAnchor) {
if (urlAnchorLen > maxAnchorLen)
urlAnchorLen = maxAnchorLen;
memcpy(anchor, urlAnchor, urlAnchorLen);
*anchorLen = urlAnchorLen;
}
else
*anchorLen = 0;
}
*/
// DO NEXT tag
goto nextTag;
errEnd:
sb->safePrintf("</ul>");
close(m_rdfStream);
return false;
}
// . get the title and summary for a specific url
// and catid
bool Categories::getTitleAndSummary ( char *urlOrig,
@ -857,7 +1079,7 @@ bool Categories::getTitleAndSummary ( char *urlOrig,
// . open the file
char filename[512];
sprintf(filename, "%scat/%s", g_hostdb.m_dir, RDFCONTENT_FILE);
sprintf(filename, "%scatdb/%s", g_hostdb.m_dir, RDFCONTENT_FILE);
//m_rdfStream.clear();
//m_rdfStream.open(filename, ifstream::in);
m_rdfStream = open(filename, O_RDONLY | O_NONBLOCK);
@ -1011,13 +1233,17 @@ errEnd:
return false;
}
// generate sub categories for a given catid
// . generate sub categories for a given catid
// . store list of SubCategories into "subCatBuf" return # stored
long Categories::generateSubCats ( long catid,
SubCategory *subCats,
char **catBuffer,
long *catBufferSize,
long *catBufferLen,
bool allowRealloc ) {
SafeBuf *subCatBuf
//SubCategory *subCats,
//char **catBuffer,
//long *catBufferSize,
//long *catBufferLen,
//bool allowRealloc
) {
long catIndex;
unsigned long fileOffset;
unsigned long n;
@ -1029,17 +1255,24 @@ long Categories::generateSubCats ( long catid,
long prefixLen;
long nameStart;
long nameLen;
long catp = 0;
long catBufferInc = *catBufferSize;
// lookup the index for this catid
long need ;
SubCategory *cat;
char *p ;
//long catp = 0;
//long catBufferInc = *catBufferSize;
// . lookup the index for this catid
// . binary step, guessing to approximate place
// and then scanning from there
catIndex = getIndexFromId(catid);
if (catIndex < 0)
goto errEnd;
// get the file offset
fileOffset = m_cats[catIndex].m_structureOffset;
// open the structure file
// catdb/structure.rdf.u8 in utf8
char filename[512];
sprintf(filename, "%scat/%s", g_hostdb.m_dir, RDFSTRUCTURE_FILE);
sprintf(filename, "%scatdb/%s", g_hostdb.m_dir, RDFSTRUCTURE_FILE);
//m_rdfStream.clear();
//m_rdfStream.open(filename, ifstream::in);
m_rdfStream = open(filename, O_RDONLY);
@ -1066,12 +1299,16 @@ long Categories::generateSubCats ( long catid,
log("cat: Error Reading Structure Offset");
goto errEnd;
}
// point to the buffer we just read with m_rdfPtr
m_rdfPtr = m_rdfBuffer;
m_rdfEnd = &m_rdfBuffer[n];
m_currOffset = fileOffset;
// parse tags for the sub categories or until we hit /Topic
nextTag:
// . this increments m_rdfPtr until it points to the beginning of a tag
// . it may end up reading another chunk from disk
// . it memcopies m_tagRecfer to be the name of the tag it points to
if (rdfNextTag() < 0)
goto gotSubCats;
// check for /Topic
@ -1121,6 +1358,9 @@ nextTag:
false,
0);
memcpy(catStr, htmlDecoded, catStrLen);
// reset this offset
nameStart = 0;
nameLen = catStrLen;
// get the prefix and name position/length
switch (currType) {
case SUBCAT_ALTLANG:
@ -1130,14 +1370,14 @@ nextTag:
// prefix is at the start
prefixStart = 0;
prefixLen = 0;
nameStart = 0;
//nameStart = 0;
// go to the end of the prefix
while (catStr[nameStart] != ':') {
nameStart++;
prefixLen++;
}
// skip the :Top/
nameStart += 5;
// skip the : in :Top/
nameStart += 1;
nameLen = catStrLen - nameStart;
break;
case SUBCAT_LETTERBAR:
@ -1145,9 +1385,9 @@ nextTag:
prefixStart = catStrLen - 1;
prefixLen = 1;
// skip the Top/ for the name
nameStart = 4;
//nameStart = 4;
// lose the Top/, keep the end letter
nameLen = catStrLen - 4;
//nameLen = catStrLen - 4;
break;
// . don't do this because of ltr?
//case SUBCAT_RELATED:
@ -1167,43 +1407,56 @@ nextTag:
prefixStart--;
prefixLen++;
}
// name skips Top/
nameStart = 4;
nameLen = catStrLen - 4;
// name skips Top/ ... no! we include Top now
// because we need it so PageResults.cpp can call
// currIndex=g_categories->getIndexFromPath(catName,catNameLen)
// on this name, and it needs "Top/" because it was part
// of the hash of the full name for the category now.
// and we lookup the Category record by that hash
// in getIndexFromPath().
//nameStart = 4;
//nameLen = catStrLen - 4;
break;
}
// . fill the next sub category
if (catp + prefixLen + nameLen >= *catBufferSize) {
if (!allowRealloc)
goto gotSubCats;
// realloc the buffer
char *re_catBuffer = (char*)mrealloc ( *catBuffer,
*catBufferSize,
*catBufferSize+catBufferInc,
"Categories" );
if (!re_catBuffer) {
log ( "Could not allocate %li bytes for catBuffer",
*catBufferSize+catBufferInc );
g_errno = ENOMEM;
goto errEnd;
}
*catBuffer = re_catBuffer;
*catBufferSize += catBufferInc;
}
// fill the prefix and name in the buffer and subcat
// . fill the prefix and name in the buffer and subcat
need = sizeof(SubCategory) + prefixLen + 1 + nameLen + 1;
// reserve space in safebuf for it
if ( ! subCatBuf->reserve(need) ) goto errEnd;
// point to it in safebuf
cat = (SubCategory *)(subCatBuf->getBuf());
cat->m_prefixLen = prefixLen;
cat->m_nameLen = nameLen;
cat->m_type = currType;
p = cat->m_buf;
memcpy ( p , catStr + prefixStart , prefixLen );
p += prefixLen;
*p++ = '\0';
memcpy ( p , catStr + nameStart , nameLen );
p += nameLen;
*p++ = '\0';
// update safebuf length
subCatBuf->incrementLength ( cat->getRecSize() );
/*
subCats[numSubCats].m_prefixOffset = catp;
subCats[numSubCats].m_prefixLen = prefixLen;
if (prefixLen > 0) {
memcpy(&((*catBuffer)[catp]), &catStr[prefixStart], prefixLen);
catp += prefixLen;
}
subCats[numSubCats].m_nameOffset = catp;
subCats[numSubCats].m_nameOffset = catBuf->length();//catp;
subCats[numSubCats].m_nameLen = nameLen;
if (nameLen > 0) {
memcpy(&((*catBuffer)[catp]), &catStr[nameStart], nameLen);
catp += nameLen;
}
subCats[numSubCats].m_type = currType;
*/
// next sub cat
numSubCats++;
if (numSubCats >= MAX_SUB_CATS) {
@ -1214,14 +1467,14 @@ nextTag:
// next tag
goto nextTag;
gotSubCats:
*catBufferLen = catp;
//*catBufferLen = catp;
//m_rdfStream.close();
//m_rdfStream.clear();
close(m_rdfStream);
return numSubCats;
errEnd:
*catBufferLen = 0;
//*catBufferLen = 0;
//m_rdfStream.close();
//m_rdfStream.clear();
close(m_rdfStream);
@ -1259,8 +1512,13 @@ long Categories::createDirSearchRequest ( char *requestBuf,
char *rrr = r->m_reqBuf.getBufStart();
if ( rrr && rrr[0] == 'Z' ) cmd = "ZET";
// request
p += sprintf(p, "%s /search?dir=%li&dr=0&sc=0&sdir=%li&sdirt=0&c=",
cmd, catid, catid);
//p += sprintf(p, "%s /search?dir=%li&dr=0&sc=0&sdir=%li&sdirt=0&c=",
// cmd, catid, catid);
p += sprintf(p,
"%s /search?q=gbcatid%%3A%li&dir=%li&dr=0&sc=0&c="
, cmd
, catid
, catid);
// coll
memcpy(p, coll, collLen);
p += collLen;
@ -1314,7 +1572,7 @@ bool Categories::loadLangTables(void) {
unsigned long entries = 0L;
char *cp;
char *cpEnd = line + 10239;
if(!(content = fopen("cat/content.rdf.u8", "r"))) {
if(!(content = fopen("catdb/content.rdf.u8", "r"))) {
log(LOG_INFO, "cat: could not open content file.\n");
return(false);
}

@ -26,7 +26,7 @@
#define MAX_TAG_LEN 127
#define MAX_URL_CATIDS 64
#define MAX_URLTXT_SIZE 500000
#define MAX_CATIDS 64
#define MAX_CATIDS 96
#define MAX_CATNAME_LEN 1024
#define HASHTABLE_SIZE (1024*1024)
@ -61,11 +61,15 @@ struct CategoryHash {
};
struct SubCategory {
long m_prefixOffset;
//long m_prefixOffset;
long m_prefixLen;
long m_nameOffset;
//long m_nameOffset;
long m_nameLen;
char m_type;
long getRecSize () { return sizeof(SubCategory)+m_prefixLen+m_nameLen+2;};
char *getPrefix() { return m_buf; };
char *getName () { return m_buf+m_prefixLen+1;};
char m_buf[0];
};
class Categories {
@ -133,6 +137,8 @@ public:
long catid,
bool isRTL = false );
bool printUrlsInTopic ( class SafeBuf *sb , long catid ) ;
// . get the title and summary for a specific url
// and catid
bool getTitleAndSummary ( char *url,
@ -153,15 +159,13 @@ public:
// normalize a url string
long fixUrl ( char *url, long urlLen );
// generate sub categories for a given catid
long generateSubCats ( long catid,
SubCategory *subCats,
char **catBuffer,
long *catBufferSize,
long *catBufferLen,
bool allowRealloc = true );
// . generate sub categories for a given catid
// . store list of SubCategories into "subCatBuf" return # stored
// . hits disk without using threads... so kinda sucks...
long generateSubCats ( long catid, SafeBuf *subCatBuf );
long getNumUrlsFromIndex ( long catIndex ) {
if ( ! m_cats ) return 0;
return m_cats[catIndex].m_numUrls; };
// creates a directory search request url

2
Conf.h

@ -164,7 +164,7 @@ class Conf {
long m_catdbMaxTreeMem;
long m_catdbMaxDiskPageCacheMem;
long m_catdbMaxCacheMem;
long m_catdbMinFilesToMerge;
//long m_catdbMinFilesToMerge;
long m_revdbMaxTreeMem;
long m_timedbMaxTreeMem;

@ -2359,6 +2359,9 @@ uint32_t Hostdb::getShardNum ( char rdbId,void *k,bool split ) {
//else if ( rdbId == RDB_CATDB || rdbId == RDB2_CATDB2 ) {
// return m_map [(*(uint16_t *)((char *)k + 10))>>3];
//}
else if ( rdbId == RDB_CATDB || rdbId == RDB2_CATDB2 ) {
return m_map [(*(uint16_t *)((char *)k + 10))>>3];
}
// core -- must be provided
char *xx = NULL; *xx = 0;
//groupId=key.n1 & g_hostdb.m_groupMask;

@ -518,6 +518,7 @@ long HttpMime::getContentTypePrivate ( char *s ) {
else if (!strcasecmp(s,"image/jpeg" ) ) ct = CT_JPG;
else if (!strcasecmp(s,"image/png" ) ) ct = CT_PNG;
else if (!strcasecmp(s,"image/tiff" ) ) ct = CT_TIFF;
else if (!strncasecmp(s,"image/",6 ) ) ct = CT_IMAGE;
else if (!strcasecmp(s,"application/javascript" ) ) ct = CT_JS;
else if (!strcasecmp(s,"application/x-javascript") ) ct = CT_JS;
else if (!strcasecmp(s,"text/javascript" ) ) ct = CT_JS;

@ -36,6 +36,7 @@ time_t atotime5 ( char *s ) ;
#define CT_JS 14
#define CT_CSS 15
#define CT_JSON 16
#define CT_IMAGE 17
#define ET_IDENTITY 0
#define ET_GZIP 1

@ -23,6 +23,7 @@ void HttpRequest::reset() {
m_userIP = 0;
m_isMSIE = false;
m_reqBufValid = false;
m_reqBuf.purge();
if (m_cgiBuf2) {
mfree(m_cgiBuf2, m_cgiBuf2Size, "extraParms");

@ -459,7 +459,11 @@ unsigned char getLanguageFromUserAgent(char *abbr) {
return langUnknown;
}
// these are going to be adult, in any language
// . these are going to be adult, in any language
// . this seems only to be used by Speller.cpp when splitting up words
// in the url domain.
// . s/slen is a full word that is found in our "dictionary" so using
// phrases like biglittlestuff probably should not go here.
bool isAdult( char *s, long slen, char **loc ) {
char **p = NULL;
char *a = NULL;

@ -33,13 +33,13 @@ OBJS = Tfndb.o UdpSlot.o \
HttpMime.o Hostdb.o \
Highlight.o File.o Errno.o Entities.o \
Dns.o Dir.o Conf.o Bits.o \
Stats.o BigFile.o AdultBit.o Msg17.o \
Stats.o BigFile.o Msg17.o \
Speller.o DiskPageCache.o \
PingServer.o StopWords.o TopTree.o \
Parms.o Pages.o Msg28.o Msg30.o \
Unicode.o iana_charset.o Iso8859.o \
SearchInput.o \
Categories.o Msg2a.o PageCatdb.o PageDirectory.o Msg2b.o \
Categories.o Msg2a.o PageCatdb.o PageDirectory.o \
SafeBuf.o Datedb.o \
UCNormalizer.o UCPropTable.o UnicodeProperties.o \
Pops.o Title.o Pos.o LangList.o \
@ -99,7 +99,7 @@ endif
# let's keep the libraries in the repo for easier bug reporting and debugging
# in general if we can. the includes are still in /usr/include/ however...
# which is kinda strange but seems to work so far.
#LIBS= -L. ./libplotter.a ./libplot.a ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a ./libgcc.a ./libpthread.a ./libc.a ./libstdc++.a
#LIBS= -L. ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a ./libgcc.a ./libpthread.a ./libc.a ./libstdc++.a

@ -1275,10 +1275,12 @@ mallocmemloop:
long long avail = (long long)m_maxMem -
(long long)m_used;
if ( now - s_lastTime >= 1000LL ) {
log("mem: system malloc(%i) availShouldBe=%lli: "
log("mem: system malloc(%i,%s) availShouldBe=%lli: "
"%s (%s) (ooms suppressed since "
"last log msg = %li)",
size+UNDERPAD+OVERPAD,avail,
size+UNDERPAD+OVERPAD,
note,
avail,
mstrerror(g_errno),
note,
s_missed);

@ -388,6 +388,12 @@ bool Msg1::sendSomeOfList ( ) {
if ( m_list->m_fixedDataSize != getDataSizeFromRdbId(m_rdbId) ) {
char *xx=NULL;*xx=0; }
// little debug thing for genCatdb from msg9b's huge list add
//if ( m_list->m_listSize > 10000000 )
// log("msg1: adding chunk @ %li of %li bytes",
// (long)(dataStart - m_list->m_list) ,
// (long)m_list->m_listSize );
// . now send this list to the host
// . this returns false if blocked, true otherwise
// . it also sets g_errno on error
@ -480,7 +486,9 @@ bool Msg1::sendData ( unsigned long shardNum, char *listData , long listSize) {
if ( ! g_errno ) sendToSelf = false;
else {
log("rdb: msg1 had error: %s",mstrerror(g_errno));
return true;
// this is messing up generate catdb's huge rdblist add
// why did we put it in there??? from msg9b.cpp
//return true;
}
QUICKPOLL(m_niceness);

@ -58,9 +58,9 @@ bool Msg2a::makeCatdb( char *coll,
char inFile[256];
// url info (content) file
if ( m_updateFromNew )
sprintf(inFile, "%scat/gbdmoz.content.dat.new", g_hostdb.m_dir);
sprintf(inFile, "%scatdb/gbdmoz.content.dat.new", g_hostdb.m_dir);
else
sprintf(inFile, "%scat/gbdmoz.content.dat", g_hostdb.m_dir);
sprintf(inFile, "%scatdb/gbdmoz.content.dat", g_hostdb.m_dir);
//m_inStream.open(inFile, ifstream::in);
m_inStream = open(inFile, O_RDONLY);
//if (!m_inStream.is_open()) {
@ -118,7 +118,7 @@ bool Msg2a::makeCatdb( char *coll,
// open the new diff file
//ifstream diffInStream;
int diffInStream;
sprintf(inFile, "%scat/gbdmoz.content.dat.new.diff",
sprintf(inFile, "%scatdb/gbdmoz.content.dat.new.diff",
g_hostdb.m_dir);
//diffInStream.open(inFile, ifstream::in);
diffInStream = open(inFile, O_RDONLY);
@ -328,6 +328,12 @@ bool Msg2a::makeCatdb( char *coll,
// null terminate
m_urls[urlp] = '\0';
currUrl++;
// debug
//SafeBuf sb;
//sb.safeMemcpy(&m_urls[urlp-urlLen],urlLen);
//sb.nullTerm();
//log("gencat: url=%s",sb.getBufStart());
}
log(LOG_INFO, "db: Wrote %li urls to update (%li)\n",
currUrl - m_numRemoveUrls, m_numUpdateIndexes);
@ -581,9 +587,9 @@ void handleRequest2a ( UdpSlot *slot, long netnice ) {
otherCategories = &g_categories1;
// load the new file
if ( updateFromNew )
sprintf(buff, "%scat/gbdmoz.structure.dat.new", g_hostdb.m_dir);
sprintf(buff, "%scatdb/gbdmoz.structure.dat.new", g_hostdb.m_dir);
else
sprintf(buff, "%scat/gbdmoz.structure.dat", g_hostdb.m_dir);
sprintf(buff, "%scatdb/gbdmoz.structure.dat", g_hostdb.m_dir);
if (otherCategories->loadCategories(buff) != 0) {
log("db: Loading Categories From %s Failed", buff);
// send error reply
@ -605,51 +611,51 @@ void handleRequest2a ( UdpSlot *slot, long netnice ) {
}
// move the current files to .old
sprintf(buff, "mv %scat/content.rdf.u8 %scat/content.rdf.u8.old",
sprintf(buff, "mv %scatdb/content.rdf.u8 %scatdb/content.rdf.u8.old",
g_hostdb.m_dir, g_hostdb.m_dir);
log ( LOG_INFO, "%s", buff);
system ( buff );
sprintf(buff, "mv %scat/structure.rdf.u8 %scat/structure.rdf.u8.old",
sprintf(buff, "mv %scatdb/structure.rdf.u8 %scatdb/structure.rdf.u8.old",
g_hostdb.m_dir, g_hostdb.m_dir);
log ( LOG_INFO, "%s", buff);
system ( buff );
sprintf(buff, "mv %scat/gbdmoz.content.dat "
"%scat/gbdmoz.content.dat.old",
sprintf(buff, "mv %scatdb/gbdmoz.content.dat "
"%scatdb/gbdmoz.content.dat.old",
g_hostdb.m_dir, g_hostdb.m_dir);
log ( LOG_INFO, "%s", buff);
system ( buff );
sprintf(buff, "mv %scat/gbdmoz.structure.dat "
"%scat/gbdmoz.structure.dat.old",
sprintf(buff, "mv %scatdb/gbdmoz.structure.dat "
"%scatdb/gbdmoz.structure.dat.old",
g_hostdb.m_dir, g_hostdb.m_dir);
log ( LOG_INFO, "%s", buff);
system ( buff );
sprintf(buff, "mv %scat/gbdmoz.content.dat.diff "
"%scat/gbdmoz.content.dat.diff.old",
sprintf(buff, "mv %scatdb/gbdmoz.content.dat.diff "
"%scatdb/gbdmoz.content.dat.diff.old",
g_hostdb.m_dir, g_hostdb.m_dir);
log ( LOG_INFO, "%s", buff);
system ( buff );
// move the .new files to current
sprintf(buff, "mv %scat/content.rdf.u8.new %scat/content.rdf.u8",
sprintf(buff, "mv %scatdb/content.rdf.u8.new %scatdb/content.rdf.u8",
g_hostdb.m_dir, g_hostdb.m_dir);
log ( LOG_INFO, "%s", buff);
system ( buff );
sprintf(buff, "mv %scat/structure.rdf.u8.new %scat/structure.rdf.u8",
sprintf(buff, "mv %scatdb/structure.rdf.u8.new %scatdb/structure.rdf.u8",
g_hostdb.m_dir, g_hostdb.m_dir);
log ( LOG_INFO, "%s", buff);
system ( buff );
sprintf(buff, "mv %scat/gbdmoz.content.dat.new "
"%scat/gbdmoz.content.dat",
sprintf(buff, "mv %scatdb/gbdmoz.content.dat.new "
"%scatdb/gbdmoz.content.dat",
g_hostdb.m_dir, g_hostdb.m_dir);
log ( LOG_INFO, "%s", buff);
system ( buff );
sprintf(buff, "mv %scat/gbdmoz.structure.dat.new "
"%scat/gbdmoz.structure.dat",
sprintf(buff, "mv %scatdb/gbdmoz.structure.dat.new "
"%scatdb/gbdmoz.structure.dat",
g_hostdb.m_dir, g_hostdb.m_dir);
log ( LOG_INFO, "%s", buff);
system ( buff );
//sprintf(buff, "mv %scat/gbdmoz.content.dat.new.diff "
// "%scat/gbdmoz.content.dat.diff",
//sprintf(buff, "mv %scatdb/gbdmoz.content.dat.new.diff "
// "%scatdb/gbdmoz.content.dat.diff",
// g_hostdb.m_dir, g_hostdb.m_dir);
//log ( LOG_INFO, "%s", buff);
//system ( buff );

@ -148,6 +148,10 @@ bool Msg40::getResults ( SearchInput *si ,
// we need this info for caching as well
//m_numGigabitInfos = 0;
//just getfrom searchinput
//.... m_catId = hr->getLong("catid",0);m_si->m_catId;
m_postQueryRerank.set1( this, si );
// get the collection rec
@ -680,6 +684,20 @@ bool Msg40::gotDocIds ( ) {
// if ( ! m_msg1a.generateReferences(m_si,(void*)this,didTaskWrapper) )
// m_tasksRemaining++;
//
// call Msg2b to generate directory
//
// why is this here? it does not depend on the docids. (mdw 9/25/13)
// dissect it and fix it!!
//
//if ( m_si->m_catId &&
// ! m_msg2b.generateDirectory ( m_si->m_catId,
// (void*)this,
// didTaskWrapper ) )
// m_tasksRemaining++;
return launchMsg20s ( false );
}
@ -878,7 +896,6 @@ bool Msg40::reallocMsg20Buf ( ) {
return true;
}
/*
void didTaskWrapper ( void* state ) {
Msg40 *THIS = (Msg40 *) state;
// one less task
@ -888,7 +905,6 @@ void didTaskWrapper ( void* state ) {
// we are done, call the callback
THIS->m_callback ( THIS->m_state );
}
*/
bool Msg40::launchMsg20s ( bool recalled ) {
@ -2128,7 +2144,7 @@ long Msg40::getStoredSize ( ) {
//size += m_msg24.getStoredSize ( );
//size += m_msg1a.getStoredSize ( );
// cache msg2b if we have it
size += m_msg2b.getStoredSize();
//size += m_msg2b.getStoredSize();
return size;
}
@ -2203,9 +2219,9 @@ long Msg40::serialize ( char *buf , long bufLen ) {
//if ( y == -1 ) return -1;
//p += y;
long z = m_msg2b.serialize (p, pend - p);
if ( z == -1 ) return -1;
p += z;
//long z = m_msg2b.serialize (p, pend - p);
//if ( z == -1 ) return -1;
//p += z;
if ( m_r.m_debug )
log("query: msg40 serialize nd=%li "
@ -2258,9 +2274,9 @@ long Msg40::deserialize ( char *buf , long bufSize ) {
}
// msg2b
long z = m_msg2b.deserialize ( p , pend - p );
if ( z == -1 ) return -1;
p += z;
//long z = m_msg2b.deserialize ( p , pend - p );
//if ( z == -1 ) return -1;
//p += z;
// return bytes read
return p - buf;

@ -14,7 +14,7 @@
#include "Msg39.h" // getTermFreqs()
#include "Msg20.h" // for getting summary from docId
#include "Msg17.h" // a distributed cache of serialized/compressed Msg40s
#include "Msg2b.h" // for generating directories
//#include "Msg2b.h" // for generating directories
#include "IndexReadInfo.h" // STAGE0,...
#include "Msg3a.h"
#include "PostQueryRerank.h"
@ -302,7 +302,7 @@ class Msg40 {
long m_docsToScanForTopics;
// Msg2b for generating a directory
Msg2b m_msg2b;
//Msg2b m_msg2b;
PostQueryRerank m_postQueryRerank;

@ -45,13 +45,13 @@ bool Msg8b::getCatRec ( Url *url ,
// clear g_errno
g_errno = 0;
// warning
if ( ! coll ) log(LOG_LOGIC,"net: NULL collection. msg8b.");
//if ( ! coll ) log(LOG_LOGIC,"net: NULL collection. msg8b.");
// store the calling parameters in this class for retrieval by callback
m_state = state;
m_callback = callback;
m_url = url;
m_coll = coll;
m_collLen = collLen;
//m_coll = coll;
//m_collLen = collLen;
m_cr = cr;
m_niceness = niceness;
@ -68,10 +68,10 @@ bool Msg8b::getCatRec ( Url *url ,
//m_coll = g_conf.m_dirColl;
//m_collLen = gbstrlen(m_coll);
// catdb uses a dummy collection now, should not be looked at
m_coll = "catdb";
m_collLen = 5;
//m_coll = "catdb";
//m_collLen = 5;
m_collnum = g_collectiondb.getCollnum ( m_coll , m_collLen );
//m_collnum = g_collectiondb.getCollnum ( m_coll , m_collLen );
// . first, try it by canonical domain name
// . if that finds no matches, then try it by ip domain
@ -90,7 +90,7 @@ bool Msg8b::getCatRec ( Url *url ,
//
if ( getMyShardNum() != m_shardNum ) {//g_hostdb.m_groupId!=m_groupId){
// coll, url, niceness(1), rdbid(1), useCanonicalName(1)
long requestSize = m_collLen + m_url->getUrlLen() + 4 + 4;
long requestSize = m_url->getUrlLen() + 4 + 3;
// make the request
char *p = m_request;
*(long *)p = m_url->getIp() ; p+=4;
@ -98,10 +98,10 @@ bool Msg8b::getCatRec ( Url *url ,
*p = (char)niceness ; p++;
*p = (char)useCanonicalName; p++;
// coll
memcpy(p, m_coll, m_collLen);
p += m_collLen;
*p = '\0';
p++;
//memcpy(p, m_coll, m_collLen);
//p += m_collLen;
//*p = '\0';
//p++;
// url
memcpy(p, m_url->getUrl(), m_url->getUrlLen());
p += m_url->getUrlLen();
@ -187,7 +187,7 @@ bool Msg8b::getCatRec ( Url *url ,
0 , // max cached age in seconds (60)
false , // add net recv'd list to cache?
RDB_CATDB, // specifies the rdb, 1 = tagdb
m_coll ,
"",//NULL,//m_coll ,
//&m_list ,
m_list ,
startKey ,
@ -546,7 +546,7 @@ bool Msg8b::gotList ( ) {
char *rec;
//rec = g_catdb->getRec ( &m_list , m_url , &recSize );
rec = g_catdb.getRec(m_list,m_url,&recSize,m_coll,m_collLen);
rec = g_catdb.getRec(m_list,m_url,&recSize,NULL,0);//m_coll,m_collLen);
// if record found then set it and also set gotIt to true
if ( rec ) {
@ -589,8 +589,8 @@ void Msg8b::getIndirectCatids ( ) {
matchRecs,
matchRecSizes,
MAX_IND_CATIDS,
m_coll,
m_collLen);
NULL,//m_coll,
0);//m_collLen);
// parse out the catids from the matches
m_cr->m_numIndCatids = 0;
for ( long i = 0; i < numMatches; i++ ) {

@ -68,11 +68,11 @@ class Msg8b {
void cleanSlot ( );
// some specified input
char *m_coll;
long m_collLen;
//char *m_coll;
//long m_collLen;
Url *m_url;
collnum_t m_collnum;
//collnum_t m_collnum;
void (*m_callback ) ( void *state );//, CatRec *rec );
void *m_state; // ptr to caller's private state data

@ -93,10 +93,17 @@ bool Msg9b::addCatRecs ( char *urls ,
char *e = p; while ( *e && ! is_wspace_a (*e) ) e++;
// . set the url
// . but don't add the "www."
// . watch out for
// http://twitter.com/#!/ronpaul to http://www.twitter.com/
// so do not strip # hashtags
Url site;
site.set ( p , e - p , false/*addwww?*/);
site.set ( p , e - p , false ); // addwww?
// normalize the url
g_catdb.normalizeUrl(&site, &site);
// sanity
if ( numCatids[k] > MAX_CATIDS ) { char *xx=NULL;*xx=0; }
// make a siteRec from this url
CatRec sr;
// returns false and sets g_errno on error
@ -107,6 +114,16 @@ bool Msg9b::addCatRecs ( char *urls ,
char *data = sr.getData ();
long dataSize = sr.getDataSize ();
key_t key;
// sanity test
CatRec cr2;
if ( ! cr2.set ( NULL , sr.getData(), sr.getDataSize(),false)){
char *xx=NULL;*xx=0; }
// debug when generating catdb
//char *x = p;
//for ( ; x<e ; x++ ) {
// if ( x[0] == '#' )
// log("hey");
//}
if ( numCatids[k] == 0 )
key = g_catdb.makeKey(&site, true);
else
@ -123,7 +140,23 @@ bool Msg9b::addCatRecs ( char *urls ,
}
else if ( ! m_list.addRecord ( key, dataSize, data ) )
return true;
/*
// debug point
SafeBuf sb;
//sb.safeMemcpy(p , e-p );
sb.safeStrcpy(sr.m_url);
sb.safePrintf(" ");
for ( long i = 0 ; i < numCatids[k] ; i++ )
sb.safePrintf ( "%li " , catids[c+i] );
log("catdb: adding key=%s url=%s",
KEYSTR(&key,12),
sb.getBufStart());
*/
// debug
//log("gencat: adding url=%s",sr.m_url);
//skip:
// now advance p to e
p = e;
@ -133,7 +166,8 @@ bool Msg9b::addCatRecs ( char *urls ,
QUICKPOLL((niceness));
}
log ( LOG_INFO, "Msg9b: %li sites and %li links added", k , c );
log ( LOG_INFO, "Msg9b: %li sites and %li links added. "
"listSize=%li", k , c , m_list.m_listSize );
// . now add the m_list to tagdb using msg1
// . use high priority (niceness of 0)
// . i raised niceness from 0 to 1 so multicast does not use the

@ -66,7 +66,8 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
// see if they provided a url of a file of urls if they did not
// provide a url to add directly
bool isAdmin = g_collectiondb.isAdmin ( r , s );
//bool isAdmin = g_collectiondb.isAdmin ( r , s );
bool isAdmin = r->getIsLocal();
long ufuLen = 0;
char *ufu = NULL;
if ( isAdmin )

@ -105,8 +105,8 @@ bool sendPageCatdb ( TcpSocket *s , HttpRequest *r ) {
st->m_url.set(url, urlLen);
// call msg8b to lookup in catdb
if (!st->m_msg8b.getCatRec ( &st->m_url,
st->m_coll,
st->m_collLen,
NULL,//st->m_coll,
0,//st->m_collLen,
true,
1,
&st->m_catRec,

@ -3,6 +3,10 @@
#include "CollectionRec.h"
#include "Pages.h"
#include "Categories.h"
#include "PageResults.h" // printDMOZSubtopics()
// function is in PageRoot.cpp:
bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) ;
// . returns false if blocked, true otherwise
// . sets g_errno on error
@ -36,14 +40,61 @@ bool sendPageDirectory ( TcpSocket *s , HttpRequest *r ) {
break;
}
}
// look it up
// look it up. returns catId <= 0 if dmoz not setup yet.
long catId = g_categories->getIdFromPath(decodedPath, decodedPathLen);
SafeBuf sb;
long xml = r->getLong("xml",0);
// if /Top print the directory homepage
if ( catId == 1 || catId <= 0 ) {
// this is in PageRoot.cpp
printDirHomePage(sb,r);
}
//
// try printing this shit out not as search results right now
// but just verbatim from dmoz files
//
else {
// search box
printLogoAndSearchBox(sb,r,catId);
// radio buttons for search dmoz. no, this is printed
// from call to printLogoAndSearchBox()
//printDmozRadioButtons(sb,catId);
// the dmoz breadcrumb
printDMOZCrumb ( sb,catId,xml);
// print the subtopcis in this topic. show as links above
// the search results
printDMOZSubTopics ( sb, catId , xml );
// ok, for now just print the dmoz topics since our search
// results will be empty... until populated!
g_categories->printUrlsInTopic ( &sb , catId );
}
return g_httpServer.sendDynamicPage ( s,
(char*) sb.getBufStart(),
sb.length(),
// 120 seconds cachetime
// don't cache anymore
// since
// we have the login bar
// @ the top of the page
0,//120, // cachetime
false,// post?
"text/html",
200,
NULL, // cookie
"UTF-8",
r);
// . make a new request for PageResults
//Url dirUrl;
char requestBuf[1024+MAX_COLL_LEN+128];
long requestBufSize = 1024+MAX_COLL_LEN+128;
//g_categories.createDirectorySearchUrl ( &dirUrl,
log("dmoz: creating search request");
long requestBufLen = g_categories->createDirSearchRequest(
requestBuf,
requestBufSize,

@ -2494,14 +2494,14 @@ z 122 7a { 123 7b\
" query with \"prefix:sufix\", i.e. \"gbpdcat:1\" will"
" list all pages under the Top category (or all pages"
" in the entire directory).<br>"
" <ul><li>gbdcat - The page is listed directly"
" <ul><li>gbcatid - The page is listed directly"
" under this base category.<br>"
" <li>gbpdcat - The page is listed under this"
" <li>gbpcatid - The page is listed under this"
" category or any child of this category.<br>"
" <li>gbicat - The page is listed indirectly"
" <li>gbicatid - The page is listed indirectly"
" under this base category, meaning it is a page found"
" under a site listed in the base category.<br>"
" <li>gbpicat - The page is listed indirectly"
" <li>gbipcat - The page is listed indirectly"
" under this category, meaning it is a page found under"
" a site listed under this category or any child of"
" this category.<br>"

@ -14,11 +14,11 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
// allow connection if i'm running this on lenny, too
//if ( s->m_ip != matt1 && s->m_ip != matt2 )
// return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
long refreshLen = 0;
if(r->getString ( "refresh" , &refreshLen) ) {
g_stats.dumpGIF ();
return g_httpServer.sendDynamicPage ( s , "x", 1 );
}
//long refreshLen = 0;
//if(r->getString ( "refresh" , &refreshLen) ) {
// g_stats.dumpGIF ();
// return g_httpServer.sendDynamicPage ( s , "x", 1 );
//}
// don't allow pages bigger than 128k in cache
char buf [ 64*1024 ];
@ -77,7 +77,7 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
// dump stats to /tmp/diskGraph.gif
g_stats.dumpGIF ();
//g_stats.dumpGIF ();
if(autoRefresh > 0)
p.safePrintf("<body onLoad=\"timeit();\">");
@ -105,8 +105,13 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
p.safePrintf(
//"<center>Disk Statistics<br><br>"
"<center><br>"
"<img name=\"diskgraph\" src=/diskGraph%li.gif><br><br>",
g_hostdb.m_hostId );
//"<img name=\"diskgraph\"
//src=/diskGraph%li.gif><br><br>",
//g_hostdb.m_hostId );
);
// now try using absolute divs instead of a GIF
g_stats.printGraphInHtml ( p );
if(autoRefresh > 0) {
if(refresh) *(refresh+4) = '0';

File diff suppressed because it is too large Load Diff

@ -3,6 +3,9 @@
#include "SafeBuf.h"
bool printDmozRadioButtons ( SafeBuf &sb , long catId ) ;
bool printLogoAndSearchBox ( SafeBuf &sb , class HttpRequest *hr, long catId );
bool printTermPairs ( SafeBuf &sb , class Query *q , class PairScore *ps ) ;
bool printSingleTerm ( SafeBuf &sb , class Query *q , class SingleScore *ss );
@ -17,6 +20,9 @@ bool printEventAddress ( SafeBuf &sb , char *addrStr , class SearchInput *si ,
double eventGeocoderLon,
char *eventBestPlaceName );
bool printDMOZCrumb ( SafeBuf &sb , long catId , bool xml ) ;
bool printDMOZSubTopics ( SafeBuf& sb, long catId, bool inXml ) ;
bool printEventCountdown2 ( SafeBuf &sb ,
SearchInput *si,
long now ,

@ -115,7 +115,7 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("<br><br>\n");
sb.safePrintf("<br><br><br>\n");
sb.safePrintf("<b>web</b> &nbsp;&nbsp;&nbsp;&nbsp; <a href=/seo>seo</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=\"http://www.gigablast.com/?c=dmoz3\">directory</a> &nbsp;&nbsp;&nbsp;&nbsp; \n");
sb.safePrintf("<b>web</b> &nbsp;&nbsp;&nbsp;&nbsp; <a href=/seo>seo</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=\"/Top\">directory</a> &nbsp;&nbsp;&nbsp;&nbsp; \n");
sb.safePrintf("<a href=/adv.html>advanced search</a>");
sb.safePrintf(" &nbsp;&nbsp;&nbsp;&nbsp; ");
sb.safePrintf("<a href=/addurl title=\"Instantly add your url to "
@ -325,7 +325,7 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
sb.safePrintf("<br><br>\n");
sb.safePrintf("<br><br><br>\n");
sb.safePrintf("<a href=/>web</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=/seo>seo</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=\"http://www.gigablast.com/?c=dmoz3\">directory</a> &nbsp;&nbsp;&nbsp;&nbsp; \n");
sb.safePrintf("<a href=/>web</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=/seo>seo</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=\"/Top\">directory</a> &nbsp;&nbsp;&nbsp;&nbsp; \n");
sb.safePrintf("<a href=/adv.html>advanced search</a>");
sb.safePrintf(" &nbsp;&nbsp;&nbsp;&nbsp; ");
sb.safePrintf("<b title=\"Instantly add your url to Gigablast's "
@ -474,6 +474,8 @@ bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("<form method=get "
"action=/search name=f>\n");
sb.safePrintf("<input name=q type=text size=60 value=\"\">&nbsp;<input type=\"submit\" value=\"Search Green\">\n");
sb.safePrintf("<input type=hidden "
"name=prepend value=\"gbipcatid:2\">");
sb.safePrintf("\n");
sb.safePrintf("</form>\n");
sb.safePrintf("<br>\n");
@ -570,10 +572,10 @@ bool sendPageRoot ( TcpSocket *s , HttpRequest *r, char *cookie ) {
*/
if ( ! strcmp(coll,"dmoz3" ) )
printDirHomePage(sb,r);
else
printWebHomePage(sb,r);
//if ( ! strcmp(coll,"dmoz" ) )
// printDirHomePage(sb,r);
//else
printWebHomePage(sb,r);
// . print last 5 queries
@ -947,136 +949,151 @@ long printLastQueries ( char *p , char *pend ) {
//char *printTopDirectory ( char *p, char *pend ) {
bool printTopDirectory ( SafeBuf& sb ) {
// if no recs in catdb, print instructions
if ( g_catdb.getRdb()->getNumTotalRecs() == 0 )
return sb.safePrintf("<center>"
"<b>DMOZ functionality is not set up.</b>"
"<br>"
"<br>"
"<b>"
"Please follow the set up "
"<a href=/admin.html#dmoz>"
"instructions"
"</a>."
"</b>"
"</center>");
//char topList[4096];
//sprintf(topList,
return sb.safePrintf (
"<center>"
"<table cellspacing=\"4\" cellpadding=\"4\"><tr><td valign=top>\n"
"<b><a href=\"/Arts/\">Arts</a></b><br>"
"<b><a href=\"/Top/Arts/\">Arts</a></b><br>"
"<small>"
"<a href=\"/Arts/Movies/\">Movies</a>, "
"<a href=\"/Arts/Television/\">Television</a>, "
"<a href=\"/Arts/Music/\">Music</a>..."
"<a href=\"/Top/Arts/Movies/\">Movies</a>, "
"<a href=\"/Top/Arts/Television/\">Television</a>, "
"<a href=\"/Top/Arts/Music/\">Music</a>..."
"</small>\n"
"</td><td valign=top>"
"<b><a href=\"/Business/\">Business</a></b><br>"
"<b><a href=\"/Top/Business/\">Business</a></b><br>"
"<small>"
"<a href=\"/Business/Employment/\">Jobs</a>, "
"<a href=\"/Business/Real_Estate/\">Real Estate</a>, "
"<a href=\"/Business/Investing/\">Investing</a>..."
"<a href=\"/Top/Business/Employment/\">Jobs</a>, "
"<a href=\"/Top/Business/Real_Estate/\">Real Estate</a>, "
"<a href=\"/Top/Business/Investing/\">Investing</a>..."
"</small>\n"
"</td><td valign=top>"
"<b><a href=\"/Computers/\">Computers</a></b><br>"
"<b><a href=\"/Top/Computers/\">Computers</a></b><br>"
"<small>"
"<a href=\"/Computers/Internet/\">Internet</a>, "
"<a href=\"/Computers/Software/\">Software</a>, "
"<a href=\"/Computers/Hardware/\">Hardware</a>..."
"<a href=\"/Top/Computers/Internet/\">Internet</a>, "
"<a href=\"/Top/Computers/Software/\">Software</a>, "
"<a href=\"/Top/Computers/Hardware/\">Hardware</a>..."
"</small>\n"
"</td></tr><tr><td valign=top>"
"<b><a href=\"/Games/\">Games</a></b><br>"
"<b><a href=\"/Top/Games/\">Games</a></b><br>"
"<small>"
"<a href=\"/Games/Video_Games/\">Video Games</a>, "
"<a href=\"/Games/Roleplaying/\">RPGs</a>, "
"<a href=\"/Games/Gambling/\">Gambling</a>..."
"<a href=\"/Top/Games/Video_Games/\">Video Games</a>, "
"<a href=\"/Top/Games/Roleplaying/\">RPGs</a>, "
"<a href=\"/Top/Games/Gambling/\">Gambling</a>..."
"</small>\n"
"</td><td valign=top>"
"<b><a href=\"/Health/\">Health</a></b><br>"
"<b><a href=\"/Top/Health/\">Health</a></b><br>"
"<small>"
"<a href=\"/Health/Fitness/\">Fitness</a>, "
"<a href=\"/Health/Medicine/\">Medicine</a>, "
"<a href=\"/Health/Alternative/\">Alternative</a>..."
"<a href=\"/Top/Health/Fitness/\">Fitness</a>, "
"<a href=\"/Top/Health/Medicine/\">Medicine</a>, "
"<a href=\"/Top/Health/Alternative/\">Alternative</a>..."
"</small>\n"
"</td><td valign=top>"
"<b><a href=\"/Home/\">Home</a></b><br>"
"<b><a href=\"/Top/Home/\">Home</a></b><br>"
"<small>"
"<a href=\"/Home/Family/\">Family</a>, "
"<a href=\"/Home/Consumer_Information/\">Consumers</a>, "
"<a href=\"/Home/Cooking/\">Cooking</a>..."
"<a href=\"/Top/Home/Family/\">Family</a>, "
"<a href=\"/Top/Home/Consumer_Information/\">Consumers</a>, "
"<a href=\"/Top/Home/Cooking/\">Cooking</a>..."
"</small>\n"
"</td></tr><tr><td valign=top>"
//"<b><a href=\"/Kids_and_Teens/\">"
//"<b><a href=\"/Top/Kids_and_Teens/\">"
//"<font color=\"#ff0000\">K</font>"
//"<font color=\"339900\">i</font>"
//"<font color=\"#ff6600\">d</font>"
//"<font color=\"#0066ff\">s</font>"
//" and Teens</a></b><br>"
"<b><a href=\"/Kids_and_Teens/\">Kids and Teens</a></b><br>"
"<b><a href=\"/Top/Kids_and_Teens/\">Kids and Teens</a></b><br>"
"<small>"
"<a href=\"/Kids_and_Teens/Arts/\">Arts</a>, "
"<a href=\"/Kids_and_Teens/School_Time/\">School Time</a>, "
"<a href=\"/Kids_and_Teens/Teen_Life/\">Teen Life</a>..."
"<a href=\"/Top/Kids_and_Teens/Arts/\">Arts</a>, "
"<a href=\"/Top/Kids_and_Teens/School_Time/\">School Time</a>, "
"<a href=\"/Top/Kids_and_Teens/Teen_Life/\">Teen Life</a>..."
"</small>\n"
"</td><td valign=top>"
"<b><a href=\"/News/\">News</a></b><br>"
"<b><a href=\"/Top/News/\">News</a></b><br>"
"<small>"
"<a href=\"/News/Media/\">Media</a>, "
"<a href=\"/News/Newspapers/\">Newspapers</a>, "
"<a href=\"/News/Weather/\">Weather</a>..."
"<a href=\"/Top/News/Media/\">Media</a>, "
"<a href=\"/Top/News/Newspapers/\">Newspapers</a>, "
"<a href=\"/Top/News/Weather/\">Weather</a>..."
"</small>\n"
"</td><td valign=top>"
"<b><a href=\"/Recreation/\">Recreation</a></b><br>"
"<b><a href=\"/Top/Recreation/\">Recreation</a></b><br>"
"<small>"
"<a href=\"/Recreation/Travel/\">Travel</a>, "
"<a href=\"/Recreation/Food/\">Food</a>, "
"<a href=\"/Recreation/Outdoors/\">Outdoors</a>, "
"<a href=\"/Recreation/Humor/\">Humor</a>..."
"<a href=\"/Top/Recreation/Travel/\">Travel</a>, "
"<a href=\"/Top/Recreation/Food/\">Food</a>, "
"<a href=\"/Top/Recreation/Outdoors/\">Outdoors</a>, "
"<a href=\"/Top/Recreation/Humor/\">Humor</a>..."
"</small>\n"
"</td></tr><tr><td valign=top>"
"<b><a href=\"/Reference/\">Reference</a></b><br>"
"<b><a href=\"/Top/Reference/\">Reference</a></b><br>"
"<small>"
"<a href=\"/Reference/Maps/\">Maps</a>, "
"<a href=\"/Reference/Education/\">Education</a>, "
"<a href=\"/Reference/Libraries/\">Libraries</a>..."
"<a href=\"/Top/Reference/Maps/\">Maps</a>, "
"<a href=\"/Top/Reference/Education/\">Education</a>, "
"<a href=\"/Top/Reference/Libraries/\">Libraries</a>..."
"</small>\n"
"</td><td valign=top>"
"<b><a href=\"/Regional/\">Regional</a></b><br>"
"<b><a href=\"/Top/Regional/\">Regional</a></b><br>"
"<small>"
"<a href=\"/Regional/North_America/United_States/\">US</a>, "
"<a href=\"/Regional/North_America/Canada/\">Canada</a>, "
"<a href=\"/Regional/Europe/United_Kingdom/\">UK</a>, "
"<a href=\"/Regional/Europe/\">Europe</a>..."
"<a href=\"/Top/Regional/North_America/United_States/\">US</a>, "
"<a href=\"/Top/Regional/North_America/Canada/\">Canada</a>, "
"<a href=\"/Top/Regional/Europe/United_Kingdom/\">UK</a>, "
"<a href=\"/Top/Regional/Europe/\">Europe</a>..."
"</small>\n"
"</td><td valign=top>"
"<b><a href=\"/Science/\">Science</a></b><br>"
"<b><a href=\"/Top/Science/\">Science</a></b><br>"
"<small>"
"<a href=\"/Science/Biology/\">Biology</a>, "
"<a href=\"/Science/Social_Sciences/Psychology/\">Psychology</a>, "
"<a href=\"/Science/Physics/\">Physics</a>..."
"<a href=\"/Top/Science/Biology/\">Biology</a>, "
"<a href=\"/Top/Science/Social_Sciences/Psychology/\">Psychology</a>, "
"<a href=\"/Top/Science/Physics/\">Physics</a>..."
"</small>\n"
"</td></tr><tr><td valign=top>"
"<b><a href=\"/Shopping/\">Shopping</a></b><br>"
"<b><a href=\"/Top/Shopping/\">Shopping</a></b><br>"
"<small>"
"<a href=\"/Shopping/Vehicles/Autos/\">Autos</a>, "
"<a href=\"/Shopping/Clothing/\">Clothing</a>, "
"<a href=\"/Shopping/Gifts/\">Gifts</a>..."
"<a href=\"/Top/Shopping/Vehicles/Autos/\">Autos</a>, "
"<a href=\"/Top/Shopping/Clothing/\">Clothing</a>, "
"<a href=\"/Top/Shopping/Gifts/\">Gifts</a>..."
"</small>\n"
"</td><td valign=top>"
"<b><a href=\"/Society/\">Society</a></b><br>"
"<b><a href=\"/Top/Society/\">Society</a></b><br>"
"<small>"
"<a href=\"/Society/People/\">People</a>, "
"<a href=\"/Society/Religion_and_Spirituality/\">Religion</a>, "
"<a href=\"/Society/Issues/\">Issues</a>..."
"<a href=\"/Top/Society/People/\">People</a>, "
"<a href=\"/Top/Society/Religion_and_Spirituality/\">Religion</a>, "
"<a href=\"/Top/Society/Issues/\">Issues</a>..."
"</small>\n"
"</td><td valign=top>"
"<b><a href=\"/Sports/\">Sports</a></b><br>"
"<b><a href=\"/Top/Sports/\">Sports</a></b><br>"
"<small>"
"<a href=\"/Sports/Baseball/\">Baseball</a>, "
"<a href=\"/Sports/Soccer/\">Soccer</a>, "
"<a href=\"/Sports/Basketball/\">Basketball</a>..."
"<a href=\"/Top/Sports/Baseball/\">Baseball</a>, "
"<a href=\"/Top/Sports/Soccer/\">Soccer</a>, "
"<a href=\"/Top/Sports/Basketball/\">Basketball</a>..."
"</small>\n"
"</td></tr>"
"<tr><td colspan=3 valign=top>"
"<b><a href=\"/World/\">World</a></b><br>"
"<b><a href=\"/Top/World/\">World</a></b><br>"
"<small>"
"<a href=\"/World/Deutsch/\">Deutsch</a>, "
"<a href=\"/World/Espa%%c3%%b1ol/\">Espa%c%col</a>, "
"<a href=\"/World/Fran%%c3%%a7ais/\">Fran%c%cais</a>, "
"<a href=\"/World/Italiano/\">Italiano</a>, "
"<a href=\"/World/Japanese/\">Japanese</a>, "
"<a href=\"/World/Nederlands/\">Nederlands</a>, "
"<a href=\"/World/Polska/\">Polska</a>, "
"<a href=\"/World/Dansk/\">Dansk</a>, "
"<a href=\"/World/Svenska/\">Svenska</a>..."
"<a href=\"/Top/World/Deutsch/\">Deutsch</a>, "
"<a href=\"/Top/World/Espa%%c3%%b1ol/\">Espa%c%col</a>, "
"<a href=\"/Top/World/Fran%%c3%%a7ais/\">Fran%c%cais</a>, "
"<a href=\"/Top/World/Italiano/\">Italiano</a>, "
"<a href=\"/Top/World/Japanese/\">Japanese</a>, "
"<a href=\"/Top/World/Nederlands/\">Nederlands</a>, "
"<a href=\"/Top/World/Polska/\">Polska</a>, "
"<a href=\"/Top/World/Dansk/\">Dansk</a>, "
"<a href=\"/Top/World/Svenska/\">Svenska</a>..."
"</small>\n"
"</td></tr></table></center>\n",
195, 177, 195, 167);

@ -1087,12 +1087,12 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
//g_tfndb.getRdb(),
g_tagdb.getRdb(),
g_clusterdb.getRdb(),
//g_catdb.getRdb(),
g_linkdb.getRdb(),
g_cachedb.getRdb(),
g_serpdb.getRdb(),
g_monitordb.getRdb(),
g_statsdb.getRdb()
g_statsdb.getRdb(),
g_catdb.getRdb()
//g_placedb.getRdb() ,
//g_sectiondb.getRdb()
};

@ -67,7 +67,8 @@ bool sendPageStatsdb ( TcpSocket *s, HttpRequest *r ) {
st->m_niceness = MAX_NICENESS;
st->m_socket = s;
st->m_request = *r;
//st->m_request = *r;
st->m_request.copy ( r );
// hostId must be one of the following:
// 0-n - a valid hostId
@ -120,7 +121,9 @@ bool sendPageStatsdb ( TcpSocket *s, HttpRequest *r ) {
st->m_endDate = st->m_endDateR;
}
//
// this is no longer a gif, but an html graph in g_statsdb.m_sb
//
if ( ! g_statsdb.makeGIF ( st->m_endDateR ,
st->m_startDateR ,
st->m_samples ,
@ -211,15 +214,28 @@ void sendReply ( void *state ) {
buf.safePrintf("<table cellpadding=10 border=0>\n");
buf.safePrintf("<tr><td>"
"<center>"
"<img src=\"/stats%li.gif\" height=%li width=%li "
"border=\"0px\">"
"</center>"
"<center>");
/////////////////////////
//
// insert the div graph here
//
/////////////////////////
buf.cat ( g_statsdb.m_gw );
// purge it
g_statsdb.m_gw.purge();
g_statsdb.m_dupTable.reset();
//"<img src=\"/stats%li.gif\" height=%li width=%li "
//"border=\"0px\">"
//st->m_hostId,
//g_statsdb.getImgHeight(),
//g_statsdb.getImgWidth());
buf.safePrintf("</center>"
//"class=\"statsdb_image\">"
"</td></tr>\n",
st->m_hostId,
g_statsdb.getImgHeight(),
g_statsdb.getImgWidth());
"</td></tr>\n");
// the map key
buf.safePrintf("<tr><td>");

@ -384,7 +384,10 @@ long Pages::getDynamicPageNumber ( HttpRequest *r ) {
}
// sanity
if ( ! g_categories ) log("process: no categories loaded");
// look it up for a category
//
// dmoz - look it up for a category
//
if ( g_categories &&
g_categories->getIndexFromPath(decodedPath, decodedPathLen) >= 0)
return PAGE_DIRECTORY;
@ -497,6 +500,10 @@ bool Pages::sendDynamicReply ( TcpSocket *s , HttpRequest *r , long page ) {
// log("login: access denied 3 from ip=%s",iptoa(s->m_ip));
// return sendPageLogin(s,r,"Access Denied. Bad or no password.");
//}
if ( ! publicPage && ! isLocal && ! isLoopback ) {
log("login: access denied 2 from ip=%s",iptoa(s->m_ip));
return sendPageLogin ( s , r, "Access Denied. No permission.");
}
g_errno = 0;
@ -635,7 +642,6 @@ bool Pages::sendDynamicReply ( TcpSocket *s , HttpRequest *r , long page ) {
// . now, so it can be responsible for calling pg->m_function
//if ( userType > USER_PUBLIC ) {
// check if user has public page access
//if ( g_users.hasPermission( r, page , s ) ) {
if ( isLocal ) { //g_users.hasPermission( r, page , s )){
// . this will set various parms
// . we know the request came from a host in the cluster

@ -4657,6 +4657,7 @@ void Parms::init ( ) {
m->m_type = TYPE_LONG;
m++;
/*
m->m_title = "catdb min files to merge";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_catdbMinFilesToMerge - g;
@ -4665,7 +4666,6 @@ void Parms::init ( ) {
m->m_save = 0;
m++;
/*
m->m_title = "revdb max tree mem";
m->m_desc = "Revdb holds the meta list we added for this doc.";
m->m_off = (char *)&g_conf.m_revdbMaxTreeMem - g;

@ -6,7 +6,7 @@
#include "Clusterdb.h"
#include "Hostdb.h"
#include "Tagdb.h"
//#include "Catdb.h"
#include "Catdb.h"
#include "Posdb.h"
#include "Cachedb.h"
#include "Monitordb.h"
@ -56,7 +56,7 @@ long g_qbufNeedSave = 0;
extern void resetPageAddUrl ( );
extern void resetHttpMime ( );
extern void reset_iana_charset ( );
extern void resetAdultBit ( );
//extern void resetAdultBit ( );
extern void resetDomains ( );
extern void resetEntities ( );
extern void resetQuery ( );
@ -411,7 +411,7 @@ bool Process::init ( ) {
m_rdbs[m_numRdbs++] = g_spiderdb.getRdb ();
m_rdbs[m_numRdbs++] = g_clusterdb.getRdb ();
m_rdbs[m_numRdbs++] = g_tagdb.getRdb ();
//m_rdbs[m_numRdbs++] = g_catdb.getRdb ();
m_rdbs[m_numRdbs++] = g_catdb.getRdb ();
m_rdbs[m_numRdbs++] = g_statsdb.getRdb ();
m_rdbs[m_numRdbs++] = g_linkdb.getRdb ();
m_rdbs[m_numRdbs++] = g_cachedb.getRdb ();
@ -1660,7 +1660,7 @@ void Process::resetAll ( ) {
rdb->reset();
}
//g_catdb .reset();
g_catdb .reset();
g_collectiondb .reset();
g_categories1 .reset();
g_categories2 .reset();
@ -1712,7 +1712,7 @@ void Process::resetAll ( ) {
resetPageAddUrl();
resetHttpMime();
reset_iana_charset();
resetAdultBit();
//resetAdultBit();
resetDomains();
resetEntities();
resetQuery();
@ -1761,7 +1761,7 @@ void Process::resetPageCaches ( ) {
//g_tfndb .getDiskPageCache()->reset();
//g_checksumdb .getDiskPageCache()->reset();
g_clusterdb .getDiskPageCache()->reset();
//g_catdb .getDiskPageCache()->reset();
g_catdb .getDiskPageCache()->reset();
//g_placedb .getDiskPageCache()->reset();
g_doledb .getDiskPageCache()->reset();
//g_statsdb .getDiskPageCache()->reset();

@ -256,7 +256,7 @@ bool Proxy::initProxy ( long proxyId, unsigned short udpPort,
g_pages.init ( );
// load up the dmoz categories here
char structureFile[256];
sprintf(structureFile, "%scat/gbdmoz.structure.dat", g_hostdb.m_dir);
sprintf(structureFile, "%scatdb/gbdmoz.structure.dat", g_hostdb.m_dir);
g_categories = &g_categories1;
if (g_categories->loadCategories(structureFile) != 0) {
log("cat: Loading Categories From %s Failed.",

36
Rdb.cpp

@ -5,7 +5,7 @@
#include "Clusterdb.h"
#include "Hostdb.h"
#include "Tagdb.h"
//#include "Catdb.h"
#include "Catdb.h"
#include "Indexdb.h"
#include "Posdb.h"
#include "Cachedb.h"
@ -302,8 +302,20 @@ bool Rdb::init ( char *dir ,
if ( ! loadTree ( ) ) return false;
// add the single dummy collection for catdb
//if ( g_catdb.getRdb() == this ) //||
// return g_catdb.addColl ( NULL );
if ( g_catdb.getRdb() == this )
return g_catdb.addColl ( NULL );
if ( g_statsdb.getRdb() == this )
return g_statsdb.addColl ( NULL );
if ( g_cachedb.getRdb() == this )
return g_cachedb.addColl ( NULL );
if ( g_serpdb.getRdb() == this )
return g_serpdb.addColl ( NULL );
//else if ( g_accessdb.getRdb() == this )
// return g_accessdb.addColl ( NULL );
//else if ( g_facebookdb.getRdb() == this )
// return g_facebookdb.addColl ( NULL );
if ( g_syncdb.getRdb() == this )
return g_syncdb.addColl ( NULL );
// set this for use below
//*(long long *)m_gbcounteventsTermId =
@ -1404,7 +1416,7 @@ void attemptMergeAll ( int fd , void *state ) {
g_titledb.getRdb()->attemptMerge ( 1 , false , !state);
//g_tfndb.getRdb()->attemptMerge ( 1 , false , !state);
g_tagdb.getRdb()->attemptMerge ( 1 , false , !state);
//g_catdb.getRdb()->attemptMerge ( 1 , false , !state);
g_catdb.getRdb()->attemptMerge ( 1 , false , !state);
g_clusterdb.getRdb()->attemptMerge ( 1 , false , !state);
g_statsdb.getRdb()->attemptMerge ( 1 , false , !state);
g_syncdb.getRdb()->attemptMerge ( 1 , false , !state);
@ -2035,6 +2047,13 @@ bool Rdb::addRecord ( collnum_t collnum,
}
*/
// debug testing
//if ( m_rdbId == RDB_CATDB ) {
// // show key
// log("rdb: adding key=%s to tree n=%li",KEYSTR(key,12) ,n);
//}
//jumpdown:
// if it exists then annihilate it
@ -2423,7 +2442,7 @@ Rdb *getRdbFromId ( uint8_t rdbId ) {
s_table9 [ RDB_DOLEDB ] = g_doledb.getRdb();
s_table9 [ RDB_TFNDB ] = g_tfndb.getRdb();
s_table9 [ RDB_CLUSTERDB ] = g_clusterdb.getRdb();
//s_table9 [ RDB_CATDB ] = g_catdb.getRdb();
s_table9 [ RDB_CATDB ] = g_catdb.getRdb();
s_table9 [ RDB_DATEDB ] = g_datedb.getRdb();
s_table9 [ RDB_LINKDB ] = g_linkdb.getRdb();
s_table9 [ RDB_CACHEDB ] = g_cachedb.getRdb();
@ -2453,7 +2472,7 @@ Rdb *getRdbFromId ( uint8_t rdbId ) {
// the opposite of the above
char getIdFromRdb ( Rdb *rdb ) {
if ( rdb == g_tagdb.getRdb () ) return RDB_TAGDB;
//if ( rdb == g_catdb.getRdb () ) return RDB_CATDB;
if ( rdb == g_catdb.getRdb () ) return RDB_CATDB;
if ( rdb == g_indexdb.getRdb () ) return RDB_INDEXDB;
if ( rdb == g_posdb.getRdb () ) return RDB_POSDB;
if ( rdb == g_datedb.getRdb () ) return RDB_DATEDB;
@ -2474,7 +2493,7 @@ char getIdFromRdb ( Rdb *rdb ) {
if ( rdb == g_revdb.getRdb () ) return RDB_REVDB;
//if ( rdb == g_sitedb.getRdb () ) return RDB_SITEDB;
//if ( rdb == g_tagdb2.getRdb () ) return RDB2_SITEDB2;
//if ( rdb == g_catdb.getRdb () ) return RDB_CATDB;
if ( rdb == g_catdb.getRdb () ) return RDB_CATDB;
if ( rdb == g_indexdb2.getRdb () ) return RDB2_INDEXDB2;
if ( rdb == g_posdb2.getRdb () ) return RDB2_POSDB2;
if ( rdb == g_datedb2.getRdb () ) return RDB2_DATEDB2;
@ -2498,7 +2517,7 @@ char getIdFromRdb ( Rdb *rdb ) {
char isSecondaryRdb ( uint8_t rdbId ) {
switch ( rdbId ) {
//case RDB2_SITEDB2 : return true;
//case RDB_CATDB2 : return g_catdb2.getRdb();
case RDB2_CATDB2 : return true;
case RDB2_INDEXDB2 : return true;
case RDB2_POSDB2 : return true;
case RDB2_DATEDB2 : return true;
@ -2606,6 +2625,7 @@ long getDataSizeFromRdbId ( uint8_t rdbId ) {
else if ( i == RDB2_TITLEDB2 ||
i == RDB2_REVDB2 ||
i == RDB2_TAGDB2 ||
i == RDB2_CATDB2 ||
i == RDB2_SPIDERDB2 ||
i == RDB2_PLACEDB2 )
ds = -1;

1
Rdb.h

@ -53,6 +53,7 @@ enum {
RDB2_REVDB2,
RDB2_TAGDB2,
RDB2_POSDB2, // 31
RDB2_CATDB2,
RDB_END
};
// how many rdbs are in "urgent merge" mode?

@ -1244,7 +1244,8 @@ void initTable ( ) {
}
}
bool SafeBuf::urlEncode ( bool spaceToPlus ) {
// url encode the whole buffer
bool SafeBuf::urlEncodeAllBuf ( bool spaceToPlus ) {
// this makes things faster
if ( ! s_init23 ) initTable();
// how many chars do we need?

@ -229,11 +229,15 @@ struct SafeBuf {
bool requestPath = false,
bool encodeApostrophes = false );
bool urlEncode (char *s ,
bool encodeApostrophes = false ) {
bool urlEncode (char *s ) {
return urlEncode ( s,strlen(s),false,false); };
bool urlEncode2 (char *s ,
bool encodeApostrophes ) { // usually false
return urlEncode ( s,strlen(s),false,encodeApostrophes); };
bool urlEncode ( bool spaceToPlus = true );
bool urlEncodeAllBuf ( bool spaceToPlus = true );
bool latin1CdataEncode(char *s, long len);
bool utf8CdataEncode(char *s, long len);

@ -711,7 +711,6 @@ m if (! cr->hasSearchPermission ( sock, encapIp ) ) {
// . sets m_qbuf1 and m_qbuf2
if ( ! setQueryBuffers ( r ) ) return false;
/* --- Virtual host language detection --- */
if(r->getHost()) {
bool langset = getLanguageFromAbbr(m_defaultSortLanguage);
@ -1226,6 +1225,40 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
m_displayQuery,
m_displayQueryLen);
//////////
//
// show DMOZ BREADCRUMB if doing a
// "gbpcatid:<catid> |" (Search restricted to category)
// "gbcatid:<catid>" (DMOZ urls in that topic, c=dmoz3)
//
//////////
long pcatId = -1;
long dcatId = -1;
// get the final query
char *q =m_sbuf1.getBufStart();
if ( q ) sscanf(q,"gbpcatid:%li",&pcatId);
if ( q ) sscanf(q,"gbcatid:%li",&dcatId);
// pick the one that is valid
long catId = -1;
if ( pcatId >= 0 ) catId = pcatId;
if ( dcatId >= 0 ) catId = dcatId;
//////
//
// save catid into the state
m_catId = catId;
//
///////
// are we a right to left language like hebrew?
if ( catId > 0 && g_categories->isIdRTL(catId) )
m_isRTL = true;
else
m_isRTL = false;
return true;
}

@ -402,6 +402,9 @@ class SearchInput {
SafeBuf m_sbuf2;
SafeBuf m_sbuf3;
long m_catId;
bool m_isRTL;
// make a cookie from parms with m_flags of PF_COOKIE set
SafeBuf m_cookieBuf;

@ -36,8 +36,10 @@ Sections::Sections ( ) {
}
void Sections::reset() {
if ( m_sections && m_needsFree )
mfree ( m_sections , m_sectionsBufSize , "Sections" );
//if ( m_sections && m_needsFree )
// mfree ( m_sections , m_sectionsBufSize , "Sections" );
m_sectionBuf.purge();
m_sectionPtrBuf.purge();
if ( m_buf && m_bufSize )
mfree ( m_buf , m_bufSize , "sdata" );
if ( m_buf2 && m_bufSize2 )
@ -228,10 +230,20 @@ bool Sections::set ( Words *w ,
max++;
// and each section may create a sentence section
max *= 2;
// truncate if excessive. growSections() will kick in then i guess
// if we need more sections.
if ( max > 1000000 ) {
log("sections: truncating max sections to 1000000");
max = 1000000;
}
//max += 5000;
long need = max * sizeof(Section);
// and we need one section ptr for every word!
need += nw * 4;
//need += nw * 4;
// and a section ptr for m_sorted[]
//need += max * sizeof(Section *);
// set this
@ -240,8 +252,21 @@ bool Sections::set ( Words *w ,
// breathe
QUICKPOLL(m_niceness);
// allocate m_sections[] buffer
// separate buf now for section ptr for each word
if ( ! m_sectionPtrBuf.reserve ( nw *4 ) ) return true;
m_sectionPtrs = (Section **)m_sectionPtrBuf.getBufStart();
m_sectionPtrsEnd = (Section **)m_sectionPtrBuf.getBufEnd();
// allocate m_sectionBuf
m_sections = NULL;
if ( ! m_sectionBuf.reserve ( need ) )
return true;
// point into it
m_sections = (Section *)m_sectionBuf.getBufStart();
/*
// assume no malloc
m_needsFree = false;
if ( need < SECTIONS_LOCALBUFSIZE ) {
@ -259,6 +284,7 @@ bool Sections::set ( Words *w ,
m_sectionsBufSize = need;
m_needsFree = true;
}
*/
// clear it nicely
//memset_nice ( m_sections , 0 , m_sectionsBufSize, m_niceness );
@ -270,20 +296,20 @@ bool Sections::set ( Words *w ,
m_titleEnd = -1;
// bail if no luck
if ( ! m_sections ) return true;
//if ( ! m_sections ) return true;
// point to buf
char *ppp = (char *)m_sections;
//char *ppp = (char *)m_sections;
// skip Sections array
ppp += max * sizeof(Section);
//ppp += max * sizeof(Section);
// assign space for m_sorted
//m_sorted = (Section **)ppp;
// skip that
//ppp += max * sizeof(Section *);
// assign space for our ptrs that are 1-1 with the words array
m_sectionPtrs = (Section **)ppp;
//m_sectionPtrs = (Section **)ppp;
// the end
m_sectionPtrsEnd = (Section **)(ppp + nw * 4);
//m_sectionPtrsEnd = (Section **)(ppp + nw * 4);
// save this too
m_nw = nw;
@ -375,6 +401,10 @@ bool Sections::set ( Words *w ,
if ( fullTid == TAG_INPUT ||
fullTid == TAG_HR ||
fullTid == TAG_COMMENT ) {
// try to realloc i guess. should keep ptrs intact.
if ( m_numSections >= m_maxNumSections &&
! growSections() )
return true;
// get the section
Section *sn = &m_sections[m_numSections];
// clear
@ -397,6 +427,10 @@ bool Sections::set ( Words *w ,
// a section of multiple br tags in a sequence
if ( fullTid == TAG_BR ) {
// try to realloc i guess. should keep ptrs intact.
if ( m_numSections >= m_maxNumSections &&
! growSections() )
return true;
// get the section
Section *sn = &m_sections[m_numSections];
// clear
@ -884,6 +918,9 @@ bool Sections::set ( Words *w ,
// with the address above it, and it shouldn't do that!
if ( tid == TAG_FONT ) continue;
// try to realloc i guess. should keep ptrs intact.
if ( m_numSections >= m_maxNumSections && ! growSections() )
return true;
// get the section
Section *sn = &m_sections[m_numSections];
// clear
@ -11034,8 +11071,11 @@ Section *Sections::insertSubSection ( Section *parentArg , long a , long b ,
// debug
//log("sect: inserting subsection [%li,%li)",a,b);
// sanity check
if ( m_numSections >= m_maxNumSections ) { char *xx=NULL;*xx=0;}
// try to realloc i guess. should keep ptrs intact.
if ( m_numSections >= m_maxNumSections )
// try to realloc i guess
if ( ! growSections() ) return NULL;
//char *xx=NULL;*xx=0;}
//
// make a new section
@ -17270,3 +17310,95 @@ bool Sections::setListFlags ( ) {
Section *ps;
*/
}
bool Sections::growSections ( ) {
// make a log note b/c this should not happen a lot because it's slow
log("build: growing sections!");
// record old buf start
char *oldBuf = m_sectionBuf.getBufStart();
// grow by 20MB at a time
if ( ! m_sectionBuf.reserve ( 20000000 ) ) return false;
// for fixing ptrs:
char *newBuf = m_sectionBuf.getBufStart();
// set the new max
m_maxNumSections = m_sectionBuf.getCapacity() / sizeof(Section);
// update ptrs in the old sections
for ( long i = 0 ; i < m_numSections ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
Section *si = &m_sections[i];
if ( si->m_parent ) {
char *np = (char *)si->m_parent;
np = np - oldBuf + newBuf;
si->m_parent = (Section *)np;
}
if ( si->m_next ) {
char *np = (char *)si->m_next;
np = np - oldBuf + newBuf;
si->m_next = (Section *)np;
}
if ( si->m_prev ) {
char *np = (char *)si->m_prev;
np = np - oldBuf + newBuf;
si->m_prev = (Section *)np;
}
if ( si->m_listContainer ) {
char *np = (char *)si->m_listContainer;
np = np - oldBuf + newBuf;
si->m_listContainer = (Section *)np;
}
if ( si->m_prevBrother ) {
char *np = (char *)si->m_prevBrother;
np = np - oldBuf + newBuf;
si->m_prevBrother = (Section *)np;
}
if ( si->m_nextBrother ) {
char *np = (char *)si->m_nextBrother;
np = np - oldBuf + newBuf;
si->m_nextBrother = (Section *)np;
}
if ( si->m_sentenceSection ) {
char *np = (char *)si->m_sentenceSection;
np = np - oldBuf + newBuf;
si->m_sentenceSection = (Section *)np;
}
if ( si->m_prevSent ) {
char *np = (char *)si->m_prevSent;
np = np - oldBuf + newBuf;
si->m_prevSent = (Section *)np;
}
if ( si->m_nextSent ) {
char *np = (char *)si->m_nextSent;
np = np - oldBuf + newBuf;
si->m_nextSent = (Section *)np;
}
if ( si->m_tableSec ) {
char *np = (char *)si->m_tableSec;
np = np - oldBuf + newBuf;
si->m_tableSec = (Section *)np;
}
if ( si->m_headColSection ) {
char *np = (char *)si->m_headColSection;
np = np - oldBuf + newBuf;
si->m_headColSection = (Section *)np;
}
if ( si->m_headRowSection ) {
char *np = (char *)si->m_headRowSection;
np = np - oldBuf + newBuf;
si->m_headRowSection = (Section *)np;
}
if ( si->m_leftCell ) {
char *np = (char *)si->m_leftCell;
np = np - oldBuf + newBuf;
si->m_leftCell = (Section *)np;
}
if ( si->m_aboveCell ) {
char *np = (char *)si->m_aboveCell;
np = np - oldBuf + newBuf;
si->m_aboveCell = (Section *)np;
}
}
return true;
}

@ -680,7 +680,9 @@ class Sections {
long getStoredSize ( ) ;
static long getStoredSize ( char *p ) ;
long serialize ( char *p ) ;
long getMemUsed ( ) { return m_sectionsBufSize; };
//long getMemUsed ( ) { return m_sectionsBufSize; };
bool growSections ( );
bool getSectiondbList ( );
bool gotSectiondbList ( bool *needsRecall ) ;
@ -828,10 +830,17 @@ class Sections {
// allocate m_sections[] buffer
class Section *m_sections;
long m_sectionsBufSize;
//long m_sectionsBufSize;
long m_numSections;
long m_maxNumSections;
// this holds the Sections instances in a growable array
SafeBuf m_sectionBuf;
// this holds ptrs to sections 1-1 with words array, so we can
// see what section a word is in.
SafeBuf m_sectionPtrBuf;
long m_numSentenceSections;
bool m_firstDateValid;

@ -1537,7 +1537,8 @@ bool Speller::findNext( char *s, char *send, char **nextWord, bool *isPorn,
long slen = send - s;
// check if there is an adult word in there
// NOTE: The word 'adult' gives a lot of false positives, so even
// though it is in the isAdult() list, skip it
// though it is in the isAdult() list, skip it.
// s/slen constitutes an individual word.
if ( isAdult ( s, slen, &loc ) && strncmp ( s, "adult", 5 ) != 0 ){
// if this string starts with the adult word, don't check
// further

@ -3878,7 +3878,7 @@ void SpiderLoop::spiderDoledUrls ( ) {
if ( m_cri >= g_collectiondb.m_numRecs ) m_cri = 0;
// get rec
cr = g_collectiondb.m_recs[m_cri];
// skip if empty
// skip if gone
if ( ! cr ) continue;
// stop if not enabled
if ( ! cr->m_spideringEnabled ) continue;

@ -601,7 +601,13 @@ class SpiderRequest {
// this 0 and to not avoid spidering the links.
long m_avoidSpiderLinks:1;
// for identifying address heavy sites...
long m_tagYellowPages:1;
//long m_tagYellowPages:1;
// when indexing urls for dmoz, i.e. the urls outputted from
// 'dmozparse urldump -s' we need to index them even if there
// was a ETCPTIMEDOUT because we have to have indexed the same
// urls that dmoz has in it in order to be identical to dmoz.
long m_ignoreExternalErrors:1;
// called XmlDoc::set4() from PageSubmit.cpp?
//long m_isPageSubmit:1;

215
Stats.cpp

@ -2,7 +2,7 @@
#include <errno.h>
#include "Stats.h"
#define X_DISPLAY_MISSING 1
//#define X_DISPLAY_MISSING 1
//#include <plotter.h>
#include <math.h>
#include "Conf.h"
@ -133,6 +133,7 @@ void Stats::addStat_r ( long numBytes ,
//pthread_mutex_unlock ( &s_lock );
}
/*
// . dump a graph to /tmp/diskGraph.gif
// . use libplotter.a or .so ?
// . docs at http://www.gnu.org/manual/plotutils/html_mono/plotutils.html#SEC54
@ -341,7 +342,7 @@ void Stats::dumpGIF ( long long startTime , long long endTime ) {
mfree(lrgBuf, lrgSize, "Stats.cpp");
#endif
}
*/
void Stats::addPoint (StatPoint **points ,
long *numPoints ,
@ -486,3 +487,213 @@ void Stats::addSpiderPoint ( long errCode, bool isNew ) {
m_allErrorsOld[errCode]++;
}
}
// . draw a HORIZONTAL line in html by emitting one absolutely-positioned
//   div into "sb"
// . the segment runs from pixel x1 to x2 at vertical position fy1 with
//   the given thickness ("width", in pixels) and background "color"
void drawLine2 ( SafeBuf &sb ,
		 long x1 ,
		 long x2 ,
		 long fy1 ,
		 long color ,
		 long width ) {
	// center the div vertically on fy1, then shift up 20 pixels to
	// line up with the graphing window
	long top = (fy1 - width/2) - 20;
	// horizontal extent of the segment in pixels
	long len = x2 - x1;
	sb.safePrintf("<div style=\"position:absolute;"
		      "left:%li;"
		      "top:%li;"
		      "background-color:#%lx;"
		      "z-index:-5;"
		      "min-height:%lipx;"
		      "min-width:%lipx;\"></div>\n"
		      , x1
		      , top
		      , color
		      , width
		      , len
		      );
}
//
// new code for drawing the graph in html with absolute divs instead
// of using the GIF plotter library which had issues
//
// . print the performance graph into "sb" as a set of absolutely
//   positioned html divs
// . plots the stat points in the m_pts[] ring buffer covering the last
//   DT milliseconds into a DX x DY pixel window, one horizontal bar
//   (drawLine2) per stat point
void Stats::printGraphInHtml ( SafeBuf &sb ) {
	// find time ranges. t2 = the latest end time over all stat points.
	long long t2 = 0;
	for ( long i = 0 ; i < MAX_POINTS ; i++ ) {
		// skip empties
		if ( m_pts[i].m_startTime == 0 ) continue;
		// set max
		if ( m_pts[i].m_endTime > t2 ) t2 = m_pts[i].m_endTime;
	}
	// now compute the start time for the graph
	long long t1 = 0x7fffffffffffffffLL;
	// now recompute t1
	for ( long i = 0 ; i < MAX_POINTS ; i++ ) {
		// skip empties
		if ( m_pts[i].m_startTime == 0 ) continue;
		// ignore points entirely before the window start (t2 - DT)
		if ( m_pts[i].m_startTime < t2 - DT ) continue;
		// otherwise, it's a candidate for the first time
		if ( m_pts[i].m_startTime < t1 ) t1 = m_pts[i].m_startTime;
	}
	//
	// main graphing window
	//
	sb.safePrintf("<div style=\"position:relative;"
		      "background-color:#c0c0c0;"
		      "overflow-x:hidden;"
		      "z-index:-10;"
		      // the tick marks we print below are based on it
		      // being a window of the last 20 seconds... and using
		      // DX pixels
		      "min-width:%lipx;"
		      "min-height:%lipx;"
		      "margin-top:10px;"
		      "margin-bottom:10px;"
		      "margin-right:10px;"
		      "margin-left:10px;\">"
		      ,(long)DX
		      ,(long)DY +20); // add 10 more for "2s" labels etc.
	// x-axis tick marks, one every DX/20 pixels
	for ( int x = DX/20 ; x <= DX ; x += DX/20 ) {
		// tick mark
		sb.safePrintf("<div style=\"position:absolute;"
			      "left:%li;"
			      "bottom:0;"
			      "background-color:#000000;"
			      "z-index:110;"
			      "min-height:20px;"
			      "min-width:3px;\"></div>\n"
			      , (long)x-1
			      );
		// LABEL: elapsed seconds at this tick
		sb.safePrintf("<div style=\"position:absolute;"
			      "left:%li;"
			      "bottom:20;"
			      "z-index:110;"
			      "min-height:20px;"
			      "min-width:3px;\">%lis</div>\n"
			      , (long)x-10
			      // the label:
			      ,(long)(DT * (long long)x / (long long)DX)/1000
			      );
	}
	// . each line consists of several points
	// . we need to know each point for adding other lines
	// . is about [400/6][1024] = 70k
	// . each line can contain multiple data points
	// . each data point is expressed as a horizontal line segment
	void *lrgBuf;
	long lrgSize = 0;
	lrgSize += MAX_LINES * MAX_POINTS * sizeof(StatPoint *);
	lrgSize += MAX_LINES * sizeof(long);
	lrgBuf = (char *) mmalloc(lrgSize, "Stats.cpp");
	if (! lrgBuf) {
		// fixed message: was two adjacent literals with no
		// separator ("...Stats.cpp%li bytes needed")
		log("could not allocate memory for local buffer in "
		    "Stats.cpp. %li bytes needed", lrgSize);
		return;
	}
	// carve the single allocation into points[] and numPoints[]
	char *lrgPtr = (char *)lrgBuf;
	StatPoint **points = (StatPoint **)lrgPtr;
	lrgPtr += MAX_LINES * MAX_POINTS * sizeof(StatPoint *);
	long *numPoints = (long *)lrgPtr;
	lrgPtr += MAX_LINES * sizeof(long);
	memset ( (char *)numPoints , 0 , MAX_LINES * sizeof(long) );
	// . store the data points into "lines"
	// . scan the MAX_POINTS ring buffer slots exactly once starting at
	//   m_next. was "count >= 0" which iterated MAX_POINTS+1 times and
	//   visited the starting slot twice.
	long count = MAX_POINTS;
	for ( long i = m_next ; count > 0 ; i++ , count-- ) {
		// wrap around the array
		if ( i >= MAX_POINTS ) i = 0;
		// skip point if empty
		if ( m_pts[i].m_startTime == 0 ) continue;
		// skip if too early
		if ( m_pts[i].m_endTime < t1 ) continue;
		// . find the lowest line that will hold us
		// . this adds point to points[x][n] where x is determined
		addPoint ( points , numPoints , &m_pts[i] );
	}
	int y1 = 21;
	// plot the points (segments) in each line
	for ( long i = 0 ; i < MAX_LINES ; i++ ) {
		// increase vert
		y1 += MAX_WIDTH + 1;
		// wrap back down if necessary
		if ( y1 >= DY ) y1 = 21;
		// plot all points in this row
		for ( long j = 0 ; j < numPoints[i] ; j++ ) {
			// get the point
			StatPoint *p = points[MAX_POINTS * i + j];
			// transform time to x coordinates
			int x1 = (p->m_startTime - t1) * (long long)DX / DT;
			int x2 = (p->m_endTime   - t1) * (long long)DX / DT;
			// if x2 is negative, skip it
			if ( x2 < 0 ) continue;
			// if x1 is negative, boost it to -2
			if ( x1 < 0 ) x1 = -2;
			// . line thickness is function of read/write size
			// . take logs (math log(), not the logging fn)
			int w = (int)log(((double)p->m_numBytes)/8192.0) + 3;
			if ( w < 3 ) w = 3;
			if ( w > MAX_WIDTH ) w = MAX_WIDTH;
			// ensure at least 3 units wide for visibility
			if ( x2 < x1 + 3 ) x2 = x1 + 3;
			// . flip the y so we don't have to scroll the
			//   browser down
			// . DY does not include the axis and tick marks
			long fy1 = DY - y1 + 20 ;
			// plot it in the color given to addStat_r()
			drawLine2 ( sb , x1 , x2 , fy1 , p->m_color , w );
		}
	}
	// close the main graphing window div
	sb.safePrintf("</div>\n");
	mfree(lrgBuf, lrgSize, "Stats.cpp");
}

@ -25,9 +25,9 @@ class StatPoint {
#define MAX_POINTS 6000
#define MAX_WIDTH 6
#define DY 900 // pixels vertical
#define DY 600 // pixels vertical
#define DX 1000 // pixels across
#define DT (20*1000) // time window, 10 seconds
#define DT (20*1000) // time window, 20 seconds
#define MAX_LINES (DY / (MAX_WIDTH+1)) // leave free pixel above each line
#define STAT_GENERIC 0
@ -53,7 +53,10 @@ class Stats {
// . dumps a bar graph
// . each bar represents a stat in time, from inception to completion
// . useful for seeing possible sources of contention
void dumpGIF ( long long startTime = -1 , long long endTime = -1 );
//void dumpGIF ( long long startTime = -1 , long long endTime = -1 );
void printGraphInHtml ( SafeBuf &sb );
// this graphs:
// 1. stats per second

@ -80,7 +80,7 @@ static Label s_labels[] = {
// . max = -1, means dynamic size the ymax!
// . use 1B for now again...
// . color=pink
{GRAPH_QUANTITY,1000000000.0,"docs_indexed", .1,"%.0fK docs" , .001 , 0x00cc0099,"docs indexed" }
{GRAPH_QUANTITY,50000000.0,"docs_indexed", .1,"%.0fK docs" , .001 , 0x00cc0099,"docs indexed" }
//{ "termlist_intersect",0x0000ff00},
@ -101,6 +101,13 @@ static Label s_labels[] = {
//{ "parm_change",0xffc0c0} // pink?
};
void drawLine3 ( SafeBuf &sb ,
long x1 ,
long x2 ,
long fy1 ,
long color ,
long width ) ;
Label *Statsdb::getLabel ( long labelHash ) {
Label **label = (Label **)m_labelTable.getValue ( &labelHash );
if ( ! label ) return NULL;
@ -116,7 +123,7 @@ bool Statsdb::init ( ) {
// 20 pixel borders
m_bx = 10;
m_by = 30;
m_by = 40;
// keep it at least at 20MB otherwise it is filling up the tree
// constantly and dumping
@ -477,6 +484,11 @@ bool Statsdb::makeGIF ( long t1Arg ,
m_sb3.reset();
m_ht3.reset();
// print graph in here as a bunch of divs now:
m_gw.purge();
m_dupTable.reset();
m_dupTable.set(4,0,20000,NULL,0,false,0,"statstbl");
// . start at t1 and get stats lists, up to 1MB of stats at a time
// . subtract 60 seconds so we can have a better shot at having
// a moving average for the last SAMPLE points
@ -495,6 +507,7 @@ bool Statsdb::makeGIF ( long t1Arg ,
return true;
// open the file for the gif
/*
char fname [ 1024 ];
sprintf ( fname , "%s/stats%li.gif" ,
g_hostdb.m_httpRootDir , g_hostdb.m_hostId );
@ -504,13 +517,16 @@ bool Statsdb::makeGIF ( long t1Arg ,
fname , mstrerror(errno) );
return true;
}
*/
return gifLoop ();
}
#define POINTWIDTH 8
#define MAX_POINTS 6000
#define MAX_WIDTH 6
#define DY 900 // pixels vertical
#define DY 600 // pixels vertical
#define DX 1000 // pixels across
#define MAX_LINES (DY / (MAX_WIDTH+1)) // leave free pixel above each line
@ -542,9 +558,9 @@ bool Statsdb::gifLoop ( ) {
// shortcut
Msg5 *m = &m_msg5;
#ifndef _USEPLOTTER_
return true;
#endif
//#ifndef _USEPLOTTER_
//return true;
//#endif
// loop over all the lists in the time range, [m_t1,m_t2]
for ( ; ! m_done ; ) {
@ -576,53 +592,87 @@ bool Statsdb::gifLoop ( ) {
}
// define time delta - commented out because it's currently not used.
//long dt = m_t2 - m_t1;
long dt = m_t2 - m_t1;
#ifdef _USEPLOTTER_
//#ifdef _USEPLOTTER_
// gif size
char tmp[64];
//char tmp[64];
// dimensions of the gif
sprintf ( tmp , "%lix%li", (long)DX+m_bx*2 , (long)DY+m_by*2 );
GIFPlotter::parampl ( "BITMAPSIZE" , (void *)tmp );
//sprintf ( tmp , "%lix%li", (long)DX+m_bx*2 , (long)DY+m_by*2 );
//GIFPlotter::parampl ( "BITMAPSIZE" , (void *)tmp );
// create one
GIFPlotter plotter ( NULL , m_fd , NULL );
//GIFPlotter plotter ( NULL , m_fd , NULL );
// open it
plotter.openpl ( );
//plotter.openpl ( );
// define the space with boundaries 100 unit wide boundaries
//plotter.space ( -m_bx , -m_by , DX + m_bx , DY + m_by );
plotter.space ( 0 , 0 , DX + m_bx * 2 , DY + m_by * 2 );
//plotter.space ( 0 , 0 , DX + m_bx * 2 , DY + m_by * 2 );
// line thickness in user coordinates (pixels for us)
plotter.linewidth ( 1 );
//plotter.linewidth ( 1 );
// set bg color to gray (r/g/b)
plotter.bgcolor ( 0xd600 , 0xce00 , 0xd600 );
// set bg color to white (r/g/b)
//plotter.bgcolor ( 0xff00 , 0xff00 , 0xff00 );
//plotter.bgcolor ( 0xd600 , 0xce00 , 0xd600 );
// erase Plotter's graphics display
plotter.erase ();
//plotter.erase ();
// draw axises in black
plotter.pencolorname ("black");
//plotter.pencolorname ("black");
//
// main graphing window
//
m_gw.safePrintf("<div style=\"position:relative;"
"background-color:#c0c0c0;"
//"overflow-y:hidden;"
"overflow-x:hidden;"
"z-index:-10;"
// the tick marks we print below are based on it
// being a window of the last 20 seconds... and using
// DX pixels
"min-width:%lipx;"
"min-height:%lipx;"
//"width:100%%;"
//"min-height:600px;"
"margin-top:10px;"
"margin-bottom:10px;"
"margin-right:10px;"
"margin-left:10px;\">"
,(long)DX + 2 *m_bx
,(long)DY + 2*m_by);
// draw the x-axis
plotter.line ( m_bx , m_by , DX + m_bx , m_by );
// draw the y-axis
plotter.line ( m_bx , m_by , m_bx , DY + m_by);
//plotter.line ( m_bx , m_by , DX + m_bx , m_by );
// 10 x-axis tick marks
for ( int x = DX/10 + m_bx ; x < DX - m_bx ; x += DX/10 ) {
for ( int x = DX/20 ; x <= DX ; x += DX/20 ) {
// tick mark
plotter.line ( x , m_by - 15 , x , m_by + 15 );
// generate label
long xv = (long)(dt * (long long)x / (long long)DX) -(long)dt;
char buf [ 32 ];
// in seconds, so put "s" in there
sprintf ( buf , "%lis" , xv );//(float)xv / 1000.0 );
// move cursor
plotter.move ( x , m_by - m_by / 2 - 9 );
// plot label
plotter.alabel ( 'c' , 'c' , buf );
//plotter.line ( x , -20 , x , 20 );
m_gw.safePrintf("<div style=\"position:absolute;"
"left:%li;"
"bottom:0;"
"background-color:#000000;"
"z-index:110;"
"min-height:20px;"
"min-width:3px;\"></div>\n"
, m_bx + (long)x-1
);
long xv = (long)(dt * (long long)x/(long long)DX)-(long)dt;
// LABEL
m_gw.safePrintf("<div style=\"position:absolute;"
"left:%li;"
"bottom:20;"
//"background-color:#000000;"
"z-index:110;"
"min-height:20px;"
"min-width:3px;\">%lis</div>\n"
, (long)x-10 + m_bx
// the label:
, xv
);
}
HashTableX tmpht;
tmpht.set(4,0,0,NULL,0,false,m_niceness,"statsparms");
@ -651,7 +701,7 @@ bool Statsdb::gifLoop ( ) {
// . graph this single graph of this color
// . returns ptr to first point of different color!
plotGraph ( p , pend , gh , &plotter , zoff );
plotGraph ( p , pend , gh , m_gw , zoff );
// prevent collisions
zoff += 20;
@ -709,7 +759,7 @@ bool Statsdb::gifLoop ( ) {
}
// set the line width
plotter.linewidth ( pp->m_thickness );
//plotter.linewidth ( pp->m_thickness );
// get parm hash
long colorHash = pp->m_parmHash;
@ -720,9 +770,9 @@ bool Statsdb::gifLoop ( ) {
// . is really the parm hash in disguise
long c1 = colorHash & 0x00ffffff;
// use the color specified from addStat_r() for this line/pt
plotter.pencolor ( ((c1 >> 16) & 0xff) << 8 ,
((c1 >> 8) & 0xff) << 8 ,
((c1 >> 0) & 0xff) << 8 );
//plotter.pencolor ( ((c1 >> 16) & 0xff) << 8 ,
// ((c1 >> 8) & 0xff) << 8 ,
// ((c1 >> 0) & 0xff) << 8 );
long x1 = pp->m_a;
long x2 = pp->m_b;
@ -731,9 +781,10 @@ bool Statsdb::gifLoop ( ) {
if ( x2 < x1 + 10 ) x2 = x1 + 10;
// . flip the y so we don't have to scroll the browser down
// . DY does not include the axis and tick marks
long fy1 = DY - y1 + m_by ;
//long fy1 = DY - y1 + m_by ;
// plot it
plotter.line ( x1 , fy1 , x2 , fy1 );
//plotter.line ( x1 , fy1 , x2 , fy1 );
drawLine3 ( m_gw , x1 , x2 , y1 , c1 , pp->m_thickness );
// add to map key? only if we haven't already
if ( tmpht.isInTable ( &colorHash ) ) continue;
@ -785,12 +836,15 @@ bool Statsdb::gifLoop ( ) {
//
// all done
if ( plotter.closepl () < 0 )
log("admin: Could not close performance graph object.");
//if ( plotter.closepl () < 0 )
// log("admin: Could not close performance graph object.");
// close the file
fclose ( m_fd );
//fclose ( m_fd );
#endif
//#endif
// close main graphing window
m_gw.safePrintf("</div>\n");
return true;
}
@ -799,15 +853,10 @@ bool Statsdb::gifLoop ( ) {
char *Statsdb::plotGraph ( char *pstart ,
char *pend ,
long graphHash ,
GIFPlotter *plotter ,
//GIFPlotter *plotter ,
SafeBuf &gw ,
long zoff ) {
#ifndef _USEPLOTTER_
return NULL;
#else
// . use "graphHash" to map to unit display
// . this is a disk read volume
Label *label = getLabel ( graphHash );
@ -857,20 +906,16 @@ char *Statsdb::plotGraph ( char *pstart ,
char *retp = p;
// set the line width
plotter->linewidth ( 1 );
//plotter->linewidth ( 1 );
long color = label->m_color;
// use the color specified from addStat_r() for this line/pt
plotter->pencolor ( ((color >> 16) & 0xff) << 8 ,
((color >> 8) & 0xff) << 8 ,
((color >> 0) & 0xff) << 8 );
//plotter->pencolor ( ((color >> 16) & 0xff) << 8 ,
// ((color >> 8) & 0xff) << 8 ,
// ((color >> 0) & 0xff) << 8 );
// how many points per pixel do we have now
//float res = (ymax - ymin) / (float)DY;
// . the minimum difference between ymax and ymin is minDiff.
// . this prevents us from zooming in too close!
float minDiff = (float)DY * label->m_minRes ;
@ -896,7 +941,7 @@ char *Statsdb::plotGraph ( char *pstart ,
// set the line width
plotter->linewidth ( 2 );
//plotter->linewidth ( 2 );
// reset for 2nd scan
p = pstart;
@ -940,8 +985,8 @@ char *Statsdb::plotGraph ( char *pstart ,
// . flip the y so we don't have to scroll the browser down
// . DY does not include the axis and tick marks
// . do not flip y any more for statsdb graphs
long fy1 = (long)(y1+.5) + m_by ;
long fy2 = (long)(y2+.5) + m_by ;
long fy1 = (long)(y1+.5);// + m_by ;
long fy2 = (long)(y2+.5);// + m_by ;
// how are we getting -.469 for "query" point?
if ( fy1 < 0 ) continue;
@ -949,7 +994,10 @@ char *Statsdb::plotGraph ( char *pstart ,
// skip if can't make a line
if ( firstPoint ) {
plotter->circle ( x2 , fy2 , 2 );
//plotter->circle ( x2 , fy2 , 2 );
long width = POINTWIDTH;
// draw a 4x4 box now:
drawLine3(m_gw,x2-width/2,x2+width/2,fy2,color,width);
firstPoint = false;
continue;
}
@ -963,32 +1011,38 @@ char *Statsdb::plotGraph ( char *pstart ,
// plot it
// BUT only iff not more than 5 seconds difference
float secondsPerPixel = (m_t2-m_t1)/(float)DX;
float dt = (x2 - x1) * secondsPerPixel;
//float secondsPerPixel = (m_t2-m_t1)/(float)DX;
// avoid this for now. mdw oct 14 2013.
//float dt = (x2 - x1) * secondsPerPixel;
//if ( dt <= 13 || x2 - x1 <= 10 )
// plotter->line ( x1 , fy1 , x2 , fy2 );
if ( dt <= 13 || x2 - x1 <= 10 )
plotter->line ( x1 , fy1 , x2 , fy2 );
// circle second point
plotter->circle ( x1 , fy1 , 2 );
plotter->circle ( x2 , fy2 , 2 );
//plotter->circle ( x1 , fy1 , 2 );
//plotter->circle ( x2 , fy2 , 2 );
// draw a 4x4 boxes now:
long width = POINTWIDTH;
drawLine3 ( m_gw,x1-width/2, x1+width/2, fy1,color, width);
drawLine3 ( m_gw,x2-width/2, x2+width/2, fy2,color, width);
}
plotter->linewidth ( 1 );
//plotter->linewidth ( 1 );
// plot unit lines
float deltaz = (ymax-ymin) / 6;
if ( strstr(label->m_keyDesc,"latency" ) ) {
// draw it
drawHR ( 400.0 - 111.0 , ymin , ymax , plotter , label , zoff,0xff0000);
drawHR ( 600.0 - 111.0 , ymin , ymax , plotter , label , zoff , color);
drawHR ( 400.0 - 111.0 , ymin,ymax,m_gw,label,zoff,0xff0000);
drawHR ( 600.0-111.0,ymin,ymax,m_gw,label,zoff,color);
}
if ( strstr(label->m_keyDesc,"queries per sec" ) ) {
// draw it
//deltaz /= 2;
//drawHR ( 120.0 , ymin , ymax , plotter , label , zoff , color );
//drawHR ( 130.0 , ymin , ymax , plotter , label , zoff , color );
drawHR ( 140.0 , ymin , ymax , plotter , label , zoff , color );
//drawHR(120.0, ymin , ymax , plotter , label , zoff , color );
//drawHR(130.0, ymin , ymax , plotter , label , zoff , color );
drawHR ( 140.0 , ymin , ymax ,m_gw , label , zoff , color );
}
@ -996,18 +1050,19 @@ char *Statsdb::plotGraph ( char *pstart ,
// breathe
QUICKPOLL ( m_niceness );
// draw it
drawHR ( z , ymin , ymax , plotter , label , zoff , color );
drawHR ( z , ymin , ymax , m_gw , label , zoff , color );
}
return retp;
#endif
//#endif
}
void Statsdb::drawHR ( float z ,
float ymin ,
float ymax ,
GIFPlotter *plotter ,
//GIFPlotter *plotter ,
SafeBuf &gw,
Label *label ,
float zoff ,
long color ) {
@ -1017,29 +1072,34 @@ void Statsdb::drawHR ( float z ,
// avoid collisions with other graphs
z2 += zoff;
// border
z2 += m_by;
//z2 += m_by;
// round off error
z2 += 0.5;
// for adjustment
//float ptsPerPixel = (ymax-ymin)/ (float)DY;
float ptsPerPixel = (ymax-ymin)/ (float)DY;
// make an adjustment to the label then! -- Commented out because it's currently not used.
//float zadj = zoff * ptsPerPixel;
float zadj = zoff * ptsPerPixel;
#ifdef _USEPLOTTER_
//#ifdef _USEPLOTTER_
// use the color specified from addStat_r() for this line/pt
plotter->pencolor ( ((color >> 16) & 0xff) << 8 ,
((color >> 8) & 0xff) << 8 ,
((color >> 0) & 0xff) << 8 );
//plotter->pencolor ( ((color >> 16) & 0xff) << 8 ,
// ((color >> 8) & 0xff) << 8 ,
// ((color >> 0) & 0xff) << 8 );
// horizontal line
plotter->line ( m_bx, (long)z2 , DX + m_bx, (long)z2 );
//plotter->line ( m_bx, (long)z2 , DX + m_bx, (long)z2 );
long width = 1;
drawLine3 ( m_gw, 0, DX , (long)z2,color, width);
// make label
char tmp[128];
// . use "graphHash" to map to unit display
// . this is a disk read volume
sprintf(tmp,label->m_format,z +zadj);//* label->m_yscalar);
/*
// a white shadow
plotter->pencolor ( 0xffff,0xffff,0xffff );
plotter->move ( m_bx + 80 + 2 , z2 + 10 - 2 );
@ -1060,7 +1120,24 @@ void Statsdb::drawHR ( float z ,
plotter->move ( m_bx + 80 , z2 + 10 );
// plot label
plotter->alabel ( 'c' , 'c' , tmp );
#endif
*/
// LABEL
gw.safePrintf("<div style=\"position:absolute;"
"left:%li;"
"bottom:%li;"
"color:#%lx;"
"z-index:110;"
"font-size:14px;"
"min-height:20px;"
"min-width:3px;\">%s</div>\n"
, (long)(m_bx)
, (long)z2 +m_by
, color
// the label:
, tmp
);
}
void gotListWrapper ( void *state , RdbList *list, Msg5 *msg5 ) {
@ -1289,7 +1366,7 @@ bool Statsdb::addPoint ( long x ,
// convert x into pixel position
float xf = (float)DX * (float)(x - m_t1) / (float)(m_t2 - m_t1);
// round it to nearest pixel
long x2 = (long)(xf + .5) + m_bx;
long x2 = (long)(xf + .5) ;//+ m_bx;
// make this our y pos
float y2 = y;
// average values if tied
@ -1371,7 +1448,7 @@ bool Statsdb::addEventPoint ( long t1 ,
// convert t1 into pixel position
float af = (float)DX * (float)(t1 - m_t1) / (float)(m_t2 - m_t1);
// round it to nearest pixel
long a = (long)(af + .5) + m_bx;
long a = (long)(af + .5) ;//+ m_bx;
// convert t2 into pixel position
//float bf = (float)DX * (float)(t2 - m_t1) / (float)(m_t2 - m_t1);
@ -1439,3 +1516,43 @@ bool Statsdb::addEventPoint ( long t1 ,
log("stats: no room in graph for event");
return true;
}
//////////
//
// NEW CODE HERE
//
//////////
// . draw a HORIZONTAL line in html for the statsdb graph by emitting one
//   absolutely-positioned div into "sb"
// . identical segments are only emitted once (deduped via m_dupTable)
//   because with a ton of points we would otherwise emit tons of
//   identical divs
void Statsdb::drawLine3 ( SafeBuf &sb ,
			  long x1 ,
			  long x2 ,
			  long fy1 ,
			  long color ,
			  long width ) {
	// chain-hash all five segment parms into a single dedup key,
	// in the same order every time so identical segments collide
	long parms[5];
	parms[0] = x1;
	parms[1] = x2;
	parms[2] = fy1;
	parms[3] = color;
	parms[4] = width;
	long key32 = 0;
	for ( long i = 0 ; i < 5 ; i++ )
		key32 = hash32h ( parms[i] , key32 );
	// do not draw repeats in the case we have a ton of points to plot
	if ( m_dupTable.isInTable(&key32) ) return;
	m_dupTable.addKey(&key32);
	// offset by the graph border (m_bx/m_by) and center vertically
	sb.safePrintf("<div style=\"position:absolute;"
		      "left:%li;"
		      "bottom:%li;"
		      "background-color:#%lx;"
		      "z-index:-5;"
		      "min-height:%lipx;"
		      "min-width:%lipx;\"></div>\n"
		      , x1 + m_bx
		      , (fy1 - width/2) + m_by
		      , color
		      , width
		      , x2 - x1
		      );
}

@ -73,13 +73,22 @@ class Statsdb {
char *plotGraph ( char *pstart ,
char *pend ,
long graphHash ,
class GIFPlotter *plotter ,
//class GIFPlotter *plotter ,
SafeBuf &gw,
long zoff );
void drawLine3 ( SafeBuf &sb ,
long x1 ,
long x2 ,
long fy1 ,
long color ,
long width ) ;
void drawHR ( float z ,
float ymin ,
float ymax ,
class GIFPlotter *plotter ,
//class GIFPlotter *plotter ,
SafeBuf &gw,
class Label *label ,
float zoff ,
long color ) ;
@ -119,6 +128,10 @@ class Statsdb {
RdbList m_list;
Msg1 m_msg1;
// the graphing window. now a bunch of absolute divs in html
SafeBuf m_gw;
HashTableX m_dupTable;
SafeBuf m_sb0;
SafeBuf m_sb1;

@ -1735,7 +1735,10 @@ void TcpServer::destroySocket ( TcpSocket *s ) {
//log("tcp: closing fd=%i",sd);
// TODO: does this block or what?
long cret = ::close ( sd );
long cret = 0;
// if sd is 0 do not really close it. seems to fix that bug.
// 0 is the FD for stdin so i don't know how that is happening.
if ( sd != 0 ) cret = ::close ( sd );
if ( cret != 0 ) // == -1 )
log("tcp: close(%li) = %li = %s",
(long)sd,cret,mstrerror(errno));

File diff suppressed because it is too large Load Diff

@ -495,6 +495,13 @@ class XmlDoc {
long **getIndCatIds ( ) ;
long **getCatIds ( ) ;
class CatRec *getCatRec ( ) ;
long *getNumDmozEntries() ;
char **getDmozTitles ( ) ;
char **getDmozSummaries ( ) ;
char **getDmozAnchors ( ) ;
bool setDmozInfo () ;
long long **getWikiDocIds ( ) ;
void gotWikiResults ( class UdpSlot *slot );
long *getPubDate ( ) ;
@ -663,6 +670,8 @@ class XmlDoc {
int8_t *getNextSpiderPriority ( ) ;
long *getPriorityQueueNum ( ) ;
class TagRec ***getOutlinkTagRecVector () ;
char *hasNoIndexMetaTag();
char *hasFakeIpsMetaTag ( );
long **getOutlinkFirstIpVector () ;
//char **getOutlinkIsIndexedVector () ;
long *getRegExpNum ( long outlinkNum ) ;
@ -678,6 +687,7 @@ class XmlDoc {
bool getIsInjecting();
long *getSpiderPriority ( ) ;
long *getIndexCode ( ) ;
long *getIndexCode2 ( ) ;
SafeBuf *getNewTagBuf ( ) ;
char *updateTagdb ( ) ;
@ -733,6 +743,7 @@ class XmlDoc {
bool hashZipCodes ( class HashTableX *table ) ;
bool hashMetaZip ( class HashTableX *table ) ;
bool hashContentType ( class HashTableX *table ) ;
bool hashDMOZCategories ( class HashTableX *table ) ;
bool hashLinks ( class HashTableX *table ) ;
bool hashUrl ( class HashTableX *table ) ;
bool hashSections ( class HashTableX *table ) ;
@ -1038,7 +1049,6 @@ class XmlDoc {
char m_fragBufValid;
char m_wordSpamBufValid;
char m_finalSummaryBufValid;
char m_matchingQueryBufValid;
char m_relatedQueryBufValid;
char m_queryLinkBufValid;
@ -1143,6 +1153,7 @@ class XmlDoc {
bool m_dmozTitlesValid;
bool m_dmozSummsValid;
bool m_dmozAnchorsValid;
bool m_dmozInfoValid;
bool m_rawUtf8ContentValid;
bool m_expandedUtf8ContentValid;
bool m_utf8ContentValid;
@ -1239,6 +1250,8 @@ class XmlDoc {
bool m_priorityQueueNumValid;
bool m_outlinkTagRecVectorValid;
bool m_outlinkIpVectorValid;
bool m_hasNoIndexMetaTagValid;
bool m_hasUseFakeIpsMetaTagValid;
bool m_outlinkIsIndexedVectorValid;
bool m_isSiteRootValid;
bool m_wasInjectedValid;
@ -1499,8 +1512,15 @@ class XmlDoc {
Msge0 m_msge0;
// this points into m_msge1 i guess
//long *m_outlinkIpVector;
long *m_outlinkIpVector;
SafeBuf m_outlinkTagRecPtrBuf;
SafeBuf m_fakeIpBuf;
char m_hasNoIndexMetaTag;
char m_hasUseFakeIpsMetaTag;
Msge1 m_msge1;
TagRec **m_outlinkTagRecVector;
SafeBuf m_fakeTagRecPtrBuf;
TagRec m_fakeTagRec;
//
// diffbot parms for indexing diffbot's json output
@ -1860,7 +1880,9 @@ class XmlDoc {
char m_isErrorPage;
char m_isHijacked;
//char m_isVisible;
char m_dmozBuf[12000];
//char m_dmozBuf[12000];
SafeBuf m_dmozBuf;
long m_numDmozEntries;
// stuff
char *m_statusMsg;

@ -21,6 +21,11 @@
// . no-op stand-ins for symbols referenced by object files linked into
//   this standalone dmozparse tool; presumably the real implementations
//   live only in the main gb binary -- TODO(review): confirm against
//   the Makefile link line
bool closeAll ( void *state , void (* callback)(void *state) ) { return true; }
bool allExit ( ) { return true; };
bool sendPageSEO(TcpSocket *s, HttpRequest *hr) {return true;}
//long g_qbufNeedSave = false;
//SafeBuf g_qbuf;
#define RDFBUFFER_SIZE (1024*1024*10)
#define RDFSTRUCTURE_FILE "structure.rdf.u8"
#define RDFCONTENT_FILE "content.rdf.u8"
@ -167,14 +172,18 @@ char* incRdfPtr( long skip = 1 ) {
// parse the rdf file up past a given start tag
long rdfParse ( char *tagName ) {
bool inQuote = false;
//bool inQuote = false;
do {
long matchPos = 0;
// move to the next tag
while (*rdfPtr != '<' || inQuote ) {
// . quotes are no longer escaped out in the newer
// dmoz files in oct 2013... so take that out. i do
// this < is &lt; though.. perhaps only check for
// quotes when in a tag?
while (*rdfPtr != '<' ) { // || inQuote ) {
// check for quotes
if (*rdfPtr == '"')
inQuote = !inQuote;
//if (*rdfPtr == '"')
// inQuote = !inQuote;
// next char
if (!incRdfPtr())
return -1;
@ -200,12 +209,15 @@ long rdfParse ( char *tagName ) {
// move to the next tag in the file
long rdfNextTag ( ) {
bool inQuote = false;
//bool inQuote = false;
// move to the next tag
while (*rdfPtr != '<' || inQuote ) {
while (*rdfPtr != '<' ) { // || inQuote ) {
// check for quotes
if (*rdfPtr == '"')
inQuote = !inQuote;
// NO! too many unbalanced quotes all over the place!
// and i think quotes in tags do not have < or > in them
// because they should be encoded as &gt; and &lt;
//if (*rdfPtr == '"')
// inQuote = !inQuote;
// next char
if (!incRdfPtr())
return -1;
@ -395,6 +407,11 @@ long getIndexFromId ( long catid ) {
else
low = currCat+1;
}
//printf("catid %li not found. sanity checking.\n",catid);
// sanity check our algo
//for ( long i = 0 ; i < numRdfCats ; i++ ) {
// if ( rdfCats[i].m_catid == catid ) { char *xx=NULL;*xx=0;}
//}
// not found
return -1;
}
@ -518,7 +535,7 @@ bool isGoodUrl ( char *url, long urlLen ) {
if ( urlLen <= 0 )
return false;
for (long i = 0; i < urlLen; i++) {
if (is_space(url[i]))
if (is_wspace_a(url[i]))
return false;
}
// check for [prot]://[url]
@ -546,8 +563,27 @@ long printCatPath ( char *str, long catid, bool raw ) {
return 0;
// get the parent
parentId = rdfCats[catIndex].m_parentid;
// print the parent(s) first
if (parentId > 1) {
// . print the parent(s) first
// . in NEWER DMOZ dumps, "Top" is catid 2 and catid 1 is an
// empty title. really catid 2 is Top/World but that is an
// error that we correct below. (see "Top/World" below).
// but do not include the "Top/" as part of the path name
if ( catid == 2 ) {
// no! we now include Top as part of the path. let's
// be consistent. i'd rather have www.gigablast.com/Top
// and www.gigablast.com/Top/Arts etc. then i know if the
// path starts with /Top that it is dmoz!!
sprintf(p,"Top");
return 3;
}
if (parentId > 1 &&
// the newer dmoz files have the catid == the parent id of
// i guess top most categories, like "Top/Arts"... i would think
// it should have a parentId of 1 like the old dmoz files,
// so it's probably a bug on dmoz's end
parentId != catid ) {
p += printCatPath(p, parentId, raw);
// print spacing
if (!raw) p += sprintf(p, " / ");
@ -621,18 +657,22 @@ long fixUrl ( char *url, long urlLen ) {
memmove(&url[slashi-1], &url[slashi], newUrlLen - slashi);
newUrlLen--;
}
if (is_space(url[slashi])) {
if (is_wspace_a(url[slashi])) {
memmove(&url[slashi], &url[slashi+1], newUrlLen - (slashi+1));
newUrlLen--;
}
}
// remove any anchor
// mdw, sep 2013, no because there is twitter.com/#!/ronpaul
// and others...
/*
for (long i = 0; i < newUrlLen; i++) {
if (url[i] == '#') {
newUrlLen = i;
break;
}
}
*/
// remove any trailing /
if (url[newUrlLen-1] == '/')
newUrlLen--;
@ -670,6 +710,38 @@ long fileWrite ( int fileid, void *buf, size_t count ) {
return sizeWrote;
}
// . write the special gigablast meta tags into a dmoz url dump file so
//   the spider handles it correctly: spider the listed links themselves
//   but NOT the links of those links, ignore external errors like
//   ETCPTIMEDOUT when indexing a dmoz url (so every url lands under its
//   proper category and gbcatid:xxx searches stay complete), do not
//   index the dump page itself, and use a fake ip (hash of the
//   subdomain) for each outlink to skip a dns lookup per outlink at
//   add time. see XmlDoc.cpp addOutlinksSpiderRecsToMetaList() and
//   indexDoc() for where these tags are consumed.
// . "outStream2" is an already-open file descriptor. a short or failed
//   write is only logged to stdout; we keep going (best-effort, like
//   the rest of this tool).
void writeMetaTags ( int outStream2 ) {
	char *metaTags =
		"<!-- do not spider the links of the links -->\n"
		"<meta name=spiderlinkslinks content=0>\n"
		"<!--ignore tcp timeouts, dns timeouts, etc.-->\n"
		"<meta name=ignorelinksexternalerrors content=1>\n"
		"<!--do not index this document, but get links from it-->\n"
		"<meta name=noindex content=1>\n"
		// skipping the per-outlink dns lookup saves time up front;
		// the real ip gets resolved later when the doc itself is
		// spidered
		"<!-- do not lookup the ip address of every outlink, "
		"but use hash of the subdomain as the ip -->\n"
		"<meta name=usefakeips content=1>\n"
		;
	long metaLen = gbstrlen ( metaTags );
	long nw = write ( outStream2 , metaTags , metaLen );
	if ( nw != metaLen )
		printf("Error writing to outStream2b\n");
}
// main parser
int main ( int argc, char *argv[] ) {
long n;
@ -678,7 +750,7 @@ int main ( int argc, char *argv[] ) {
long m = 0;
long newNameBufferSize = 0;
long newOffset = 0;
char filename[256];
char filename[1256];
long urlTxtCount = 0;
long urlTxtFile = 0;
Url normUrl;
@ -695,6 +767,8 @@ int main ( int argc, char *argv[] ) {
bool splitUrls = false;
char mode = MODE_NONE;
long totalNEC = 0;
char *dir="";
bool firstTime;
// check the options and mode
for (long i = 0; i < argc; i++) {
@ -783,20 +857,29 @@ int main ( int argc, char *argv[] ) {
goto errExit;
}
dir = "";
retry:
// open the structure file
if ( mode == MODE_NEW || mode == MODE_CATDUMP )
sprintf(filename, "%s", RDFSTRUCTURE_FILE);
sprintf(filename, "%s%s", dir,RDFSTRUCTURE_FILE);
else
sprintf(filename, "%s.new", RDFSTRUCTURE_FILE);
sprintf(filename, "%s%s.new", dir,RDFSTRUCTURE_FILE);
//rdfStream.open(filename, ifstream::in);
rdfStream = open ( filename, O_RDONLY );
// make sure it openned okay
// make sure it opened okay
//if (!rdfStream.is_open()) {
if ( rdfStream < 0 ) {
printf("Error Openning %s\n", filename);
// try ./catdb/ subdir if not found
if ( ! dir[0] ) {
dir = "./catdb/";
goto retry;
}
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("Openned Structure File: %s\n", filename);
printf("Opened Structure File: %s\n", filename);
// take the first chunk
//rdfStream.read(rdfBuffer, RDFBUFFER_SIZE);
@ -809,6 +892,7 @@ int main ( int argc, char *argv[] ) {
rdfPtr = rdfBuffer;
rdfEnd = &rdfBuffer[n];
currOffset = 0;
firstTime = true;
// read and parse the file
printf("Parsing Topics...\n");
@ -820,6 +904,13 @@ int main ( int argc, char *argv[] ) {
unsigned long catOffset = currOffset - 6;
// get the topic name, preserve it on the buffer
long nameOffset = nameBufferLen;
// the name inserted by this function into "nameBuffer"
// does not seem to contain "Top/" at the beginning.
// it is from structure.rdf.u8, but it seems to be there!
// yeah, later on we hack the name buffer and nameOffset
// so it is just the last word in the directory to save
// mem. then we print out all the parent names to
// reconstruct.
long nameLen = fillNextString();
if (nameLen == -1)
goto fileEnd;
@ -827,18 +918,48 @@ int main ( int argc, char *argv[] ) {
printf("Out of Memory!\n");
goto errExit1;
}
// fix <Topic r:id=\"\"> in the newer content.rdf.u8
if ( nameLen == 0 ) {
// only do this once!
if ( ! firstTime ) {
printf("Encountered zero length name");
continue;
}
memcpy(nameBuffer+nameOffset,"Top\0",4);
nameLen = 3;
firstTime = false;
}
// html decode it
if (nameLen > MAX_HTTP_FILENAME_LEN)
nameLen = MAX_HTTP_FILENAME_LEN;
nameLen = htmlDecode ( htmlDecoded,
&nameBuffer[nameOffset],
nameLen );
memcpy(&nameBuffer[nameOffset], htmlDecoded, nameLen);
nameBufferLen += nameLen;
nameLen ,
false,
0);
// parse the catid
long catid = parseNextCatid();
if (catid == -1)
goto fileEnd;
// crap, in the new dmoz structure.rdf.u8 catid 1 is
// empty name and catid 2 has Topic tag "Top/World" but
// Title tag "Top".
// but it should probably be "Top" and not "World". There is
// another catid 3 in structure.rdf.u8 that has
// <Topic r:id="Top/World"> and catid 3 which is the real one,
// so catid 2 is just "Top". this is a bug in the dmoz output
// i think, so fix it here.
if ( catid == 2 ) {
nameLen = 3;
memcpy(&nameBuffer[nameOffset],"Top",nameLen);
nameBufferLen += nameLen;
}
else {
memcpy(&nameBuffer[nameOffset], htmlDecoded, nameLen);
nameBufferLen += nameLen;
}
// . fill the current cat
// make sure there's room
if (numRdfCats >= rdfCatsSize) {
@ -856,6 +977,11 @@ int main ( int argc, char *argv[] ) {
printf("Out of Memory!\n");
goto errExit1;
}
// debug
//printf("gbcat=");
//for ( long i = 0 ; i < nameLen ; i++ )
// printf("%c",htmlDecoded[i]);
//printf("\n");
// fill it
rdfCats[numRdfCats].m_catid = catid;
rdfCats[numRdfCats].m_parentid = 0;
@ -923,10 +1049,16 @@ fileEnd:
rdfEnd = &rdfBuffer[n];
currOffset = 0;
//
// set m_parentid using structure.rdf.u8
//
// read and parse the file again
printf("Building Hierarchy...\n");
while (true) {
// parse the next catid
// parse the next catid in the file, sequentially
//if ( currOffset == 545468935 )
// printf("shit\n");
long catid = parseNextCatid();
if (catid == -1)
goto fileEnd1;
@ -977,8 +1109,18 @@ nextChildTag:
childNameLen = MAX_HTTP_FILENAME_LEN;
childNameLen = htmlDecode ( htmlDecoded,
childName,
childNameLen );
childNameLen ,
false,
0);
memcpy(childName, htmlDecoded, childNameLen);
// debug log
//if ( currOffset >= 506362430 ) // 556362463
// printf("off=%li\n",currOffset);
// debug point
//if ( currOffset == 545467573 )
// printf("GOT DEBUG POINT before giant skip\n");
// cut off the leading label if symbolic
// if (parentType == 2) {
// while (*childName != ':') {
@ -988,20 +1130,27 @@ nextChildTag:
// childName++;
// childNameLen--;
// }
// debug point
//if (strcmp(childName,"Top/World/Català/Arts") == 0 )
// printf("hey\n");
// get the catid for the child
long childid = getCatHash(childName, childNameLen);
// get the cat for this id
long cat = getIndexFromId(childid);
// make sure we have a match
if (cat == -1) {
//printf("Warning: Child Topic Not Found: ");
//for (long i = 0; i < childNameLen; i++)
// printf("%c", childName[i]);
//printf("\n");
// debug. why does Top/World/Catala/Arts
// not have a parent??
printf("Warning: Child Topic Not Found: ");
for (long i = 0; i < childNameLen; i++)
printf("%c", childName[i]);
printf("\n");
m++;
goto nextChildTag;
}
// assign the parent to the cat
// . assign the parent to the cat
// . this means we are in a "child" tag within the "catid"
// . catid 84192
if (parentType == 1) {
if (rdfCats[cat].m_parentid != 0)
printf("Warning: Overwriting Parent Id!\n");
@ -1033,6 +1182,14 @@ fileEnd1:
printf(" Total Topics: %li\n", numRdfCats);
printf(" Topics with Parents: %li\n", t);
printf(" Topics Linked but Nonexistent: %li\n", m);
if ( t != numRdfCats ) {
printf("\n"
" *Topics without parents is bad because they\n"
" can not have their entired rawPath printed out\n"
" in order to get their proper hash\n");
}
//printf(" Number of Symbolic Links: %li\n", numSymParents);
printf("\n");
@ -1066,25 +1223,45 @@ fileEnd1:
for (long i = 0; i < numRdfCats; i++) {
// get the hash of the path
rawPathLen = printCatPath(rawPath, rdfCats[i].m_catid, true);
rdfCats[i].m_catHash = hash32Lower(rawPath, rawPathLen, 0);
// crap, this rawpath contains "Top/" in the beginning
// but the rdfCats[i].m_nameOffset refers to a name
// that does not include "Top/"
rdfCats[i].m_catHash = hash32Lower_a(rawPath, rawPathLen, 0);
// fix. so that xyz/Arts does not just hash "Arts"
// because it has no parent...
if ( rdfCats[i].m_parentid == 0 ) {
printf("Missing parent for catid %li. Will be "
"excluded from DMOZ so we avoid hash "
"collisions.\n",rdfCats[i].m_catid);
}
//
// DEBUG!
// print this shit out to find the collisions
//
continue;
printf("hash32=%lu catid=%li parentid=%li path=%s\n",
rdfCats[i].m_catHash,
rdfCats[i].m_catid,
rdfCats[i].m_parentid,
rawPath);
}
// . now we want to serialize the needed data into
// one (or more?) file(s) to be quickly read by gb
if ( mode == MODE_NEW )
sprintf(filename, "%s", STRUCTURE_OUTPUT_FILE);
sprintf(filename, "%s%s", dir,STRUCTURE_OUTPUT_FILE);
else
sprintf(filename, "%s.new", STRUCTURE_OUTPUT_FILE);
sprintf(filename, "%s%s.new", dir,STRUCTURE_OUTPUT_FILE);
//outStream.open(filename, ofstream::out|ofstream::trunc);
outStream = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it openned okay
// make sure it opened okay
//if (!outStream.is_open()) {
if ( outStream < 0 ) {
printf("Error Openning %s\n", filename);
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("\nOpenned %s for writing.\n", filename);
printf("\nOpened %s for writing.\n", filename);
// write the size of the truncated name buffer
//outStream.write((char*)&newNameBufferSize, sizeof(long));
@ -1149,21 +1326,26 @@ contentParse:
printf("Out of Memory!\n");
goto errExit;
}
again:
// open the content file
if ( mode == MODE_NEW || mode == MODE_URLDUMP )
sprintf(filename, "%s", RDFCONTENT_FILE);
sprintf(filename, "%s%s", dir,RDFCONTENT_FILE);
else
sprintf(filename, "%s.new", RDFCONTENT_FILE);
sprintf(filename, "%s%s.new", dir,RDFCONTENT_FILE);
//rdfStream.open(filename, ifstream::in);
rdfStream = open ( filename, O_RDONLY );
// make sure it openned okay
// make sure it opened okay
//if (!rdfStream.is_open()) {
if ( rdfStream < 0 ) {
printf("Error Openning %s\n", filename);
if ( ! dir[0] ) {
dir = "./catdb/";
goto again;
}
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("\nOpenned Content File: %s\n", filename);
printf("\nOpened Content File: %s\n", filename);
// take the first chunk
//rdfStream.read(rdfBuffer, RDFBUFFER_SIZE);
@ -1184,28 +1366,32 @@ contentParse:
// write another file for the urls
if ( mode == MODE_URLDUMP ) {
if (!splitUrls)
sprintf(filename, "%s", URLTEXT_OUTPUT_FILE);
sprintf(filename, "html/%s", URLTEXT_OUTPUT_FILE);
else
sprintf(filename, "%s.0", URLTEXT_OUTPUT_FILE);
// put them directly into html/ now for
// easy add url'ing
sprintf(filename, "html/%s.0", URLTEXT_OUTPUT_FILE);
}
else {
if (!splitUrls)
sprintf(filename, "%s",
sprintf(filename, "html/%s",
DIFFURLTEXT_OUTPUT_FILE);
else
sprintf(filename, "%s.0",
sprintf(filename, "html/%s.0",
DIFFURLTEXT_OUTPUT_FILE);
}
//outStream2.open(filename, ofstream::out|ofstream::trunc);
outStream2 = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it openned okay
// make sure it opened okay
//if (!outStream2.is_open()) {
if ( outStream2 < 0 ) {
printf("Error Openning %s\n", filename);
printf("Error Opening %s\n", filename);
goto errExit1;
}
printf("Openned %s for writing.\n", filename);
printf("Opened %s for writing.\n", filename);
writeMetaTags ( outStream2 );
// if we're doing a diffurldump, load up the diff file first
if ( mode == MODE_DIFFURLDUMP ) {
@ -1219,10 +1405,10 @@ contentParse:
diffInStream = open(filename, O_RDONLY);
//if (!diffInStream.is_open()) {
if ( diffInStream < 0 ) {
printf("Error Openning %s\n", filename);
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("Openned Diff File: %s\n", filename);
printf("Opened Diff File: %s\n", filename);
// read in the number of urls to update/add
//diffInStream.read((char*)&numUpdateIndexes,
@ -1318,7 +1504,7 @@ contentParse:
printf("Completed Writing File.\n");
// write another file for the urls
urlTxtFile++;
sprintf(filename, "%s.%li",
sprintf(filename, "html/%s.%li",
URLTEXT_OUTPUT_FILE,
urlTxtFile);
//outStream2.open(filename,
@ -1326,14 +1512,14 @@ contentParse:
outStream2 = open ( filename,
O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it openned okay
// make sure it opened okay
//if (!outStream2.is_open()) {
if ( outStream2 < 0 ) {
printf("Error Openning %s\n",
printf("Error Opening %s\n",
filename);
goto errExit1;
}
printf("Openned %s for writing.\n",
printf("Opened %s for writing.\n",
filename);
urlTxtCount = 0;
}
@ -1348,20 +1534,20 @@ contentParse:
}
else {
if ( mode == MODE_NEW )
sprintf(filename, "%s", CONTENT_OUTPUT_FILE);
sprintf(filename, "%s%s", dir,CONTENT_OUTPUT_FILE);
else
sprintf(filename, "%s.new", CONTENT_OUTPUT_FILE);
sprintf(filename, "%s%s.new", dir,CONTENT_OUTPUT_FILE);
// stream the urls into the content
//outStream.open(filename, ofstream::out|ofstream::trunc);
outStream = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it openned okay
// make sure it opened okay
//if (!outStream.is_open()) {
if ( outStream < 0 ) {
printf("Error Openning %s\n", filename);
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("Openned %s for writing.\n", filename);
printf("Opened %s for writing.\n", filename);
// store a space for the number of urls at the start of the file
//outStream.write((char*)&numUrlInfos, sizeof(long));
@ -1371,7 +1557,7 @@ contentParse:
goto errExit;
}
}
// read and parse the file again
printf("Building Links...\n");
while (true) {
@ -1389,6 +1575,9 @@ contentParse:
if ( mode == MODE_URLDUMP || mode == MODE_DIFFURLDUMP )
goto nextLink;
// . set the content offset for this cat
// . it's missing catid 425187... why? because it had
// a double quote in it like '4"'!! so i took out inQuotes
// logic above.
cat = getIndexFromId(catid);
if (cat == -1) {
totalNEC++;
@ -1442,15 +1631,35 @@ hashLink:
// html decode the url
if (urlLen > MAX_URL_LEN)
urlLen = MAX_URL_LEN;
urlLen = htmlDecode(decodedUrl, &urlBuffer[urlOffset], urlLen);
urlLen = htmlDecode(decodedUrl, &urlBuffer[urlOffset], urlLen,
false,0);
// debug point
//if ( strcmp(decodedUrl,"http://twitter.com/#!/ronpaul")==0)
// printf("hey\n");
// ignore any url with # in it for now like
// http://twitter.com/#!/ronpaul because it bastardizes
// the meaning of the # (hashtag) and we need to protest that
if ( strchr ( decodedUrl , '#' ) )
goto nextLink;
memcpy(&urlBuffer[urlOffset], decodedUrl, urlLen);
// fix up bad urls
urlLen = fixUrl(&urlBuffer[urlOffset], urlLen);
if (urlLen == 0)
goto nextLink;
// normalize with Url
normUrl.set(&urlBuffer[urlOffset], urlLen,
true, false, false, true);
// . normalize with Url
// . watch out for
// http://twitter.com/#!/ronpaul to http://www.twitter.com/
// so do not strip # hashtags
normUrl.set(&urlBuffer[urlOffset],
urlLen,
true, // addwww?
false, // stripsessionid
false, // strippound?
true); // stripcommonfile? (i.e. index.htm)
// debug print
//printf("gburl %s -> %s\n",decodedUrl,normUrl.getUrl());
// put it back
urlLen = normUrl.getUrlLen();
if (urlBufferLen+urlLen+10 >= urlBufferSize) {
@ -1473,7 +1682,7 @@ hashLink:
//urlBufferLen += urlLen;
// get the hash value
unsigned long long urlHash =
hash64Lower(&urlBuffer[urlOffset], urlLen, 0);
hash64Lower_a(&urlBuffer[urlOffset], urlLen, 0);
//unsigned long urlHash2 =
// hash32Lower(&urlBuffer[urlOffset], urlLen, 0);
// see if it's already indexed
@ -1491,6 +1700,10 @@ hashLink:
currUrl == updateIndexes[currDiffIndex] ) {
//outStream2.write(&urlBuffer[urlOffset],
// urlLen);
// print it in an anchor tag
// now so gigablast can spider
// these links
write ( outStream2,"<a href=\"",9);
if ( write ( outStream2,
&urlBuffer[urlOffset],
urlLen ) != urlLen ) {
@ -1498,6 +1711,7 @@ hashLink:
"outStream2\n");
goto errExit1;
}
write ( outStream2,"\"></a>",6);
//outStream2.write("\n", 1);
if (write(outStream2, "\n", 1) != 1) {
printf("Error writing to "
@ -1518,11 +1732,11 @@ hashLink:
// write another file for the urls
urlTxtFile++;
if ( mode == MODE_URLDUMP )
sprintf(filename, "%s.%li",
sprintf(filename, "html/%s.%li",
URLTEXT_OUTPUT_FILE,
urlTxtFile);
else
sprintf(filename, "%s.%li",
sprintf(filename, "html/%s.%li",
DIFFURLTEXT_OUTPUT_FILE,
urlTxtFile);
//outStream2.open(filename,
@ -1530,15 +1744,16 @@ hashLink:
outStream2 = open ( filename,
O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it openned okay
// make sure it opened okay
//if (!outStream2.is_open()) {
if ( outStream2 < 0 ) {
printf("Error Openning %s\n",
printf("Error Opening %s\n",
filename);
goto errExit1;
}
printf("Openned %s for writing.\n",
printf("Opened %s for writing.\n",
filename);
writeMetaTags ( outStream2 );
urlTxtCount = 0;
}
}
@ -1634,8 +1849,17 @@ hashLink:
long currIndex = getIndexFromId(catid);
while (currIndex >= 0) {
rdfCats[currIndex].m_numUrls++;
// the new dmoz files have catids whose parents
// are the same cat id! so stop infinite loops
if ( rdfCats[currIndex].m_parentid ==
rdfCats[currIndex].m_catid )
break;
// otherwise, make "currIndex" point to the parent
currIndex = getIndexFromId(
rdfCats[currIndex].m_parentid );
// in the newer dmoz files 0 is a bad catid i guess
// not -1 any more?
// ??????
}
goto nextLink;
@ -1697,19 +1921,19 @@ fileEnd2:
// load the content and url files
// url info (content) file
sprintf(filename, "%s", CONTENT_OUTPUT_FILE);
sprintf(filename, "%s%s", dir,CONTENT_OUTPUT_FILE);
//rdfStream.open(filename, ifstream::in);
rdfStream = open ( filename, O_RDONLY );
//if (!rdfStream.is_open()) {
if ( rdfStream < 0 ) {
printf("Error Openning %s\n", CONTENT_OUTPUT_FILE);
printf("Error Opening %s\n", filename);
goto oldErrExit;
}
// read in the number of urls
//rdfStream.read((char*)&oldNumUrls, sizeof(long));
if (fileRead(rdfStream, &oldNumUrls, sizeof(long)) !=
sizeof(long)) {
printf("Error Reading %s\n", CONTENT_OUTPUT_FILE);
printf("Error Reading %s\n", filename);
goto oldErrExit;
}
@ -1749,8 +1973,8 @@ fileEnd2:
//rdfStream.read((char*)&urlLen, sizeof(short));
long n = fileRead(rdfStream, &urlLen, sizeof(short));
if ( n < 0 || n > (long)sizeof(short) ) {
printf("Error Reading %s\n",
CONTENT_OUTPUT_FILE);
printf("Error Reading %s\n",filename);
//CONTENT_OUTPUT_FILE);
goto oldErrExit;
}
if ( n == 0 )
@ -1780,8 +2004,8 @@ fileEnd2:
}
n = fileRead(rdfStream, &oldUrls[urlp], urlLen);
if ( n < 0 || n > urlLen ) {
printf("Error Reading %s\n",
CONTENT_OUTPUT_FILE);
printf("Error Reading %s\n",filename);
//CONTENT_OUTPUT_FILE);
goto oldErrExit;
}
if ( n == 0 )
@ -1791,7 +2015,7 @@ fileEnd2:
urlLen = fixUrl(&oldUrls[urlp], urlLen);
// make the hash
oldUrlHashes[currUrl] =
hash64Lower(&oldUrls[urlp], urlLen, 0);
hash64Lower_a(&oldUrls[urlp], urlLen, 0);
removeOldUrl[currUrl] = 0;
// increment the buffer pointer
if (urlLen <= 0) {
@ -1814,8 +2038,8 @@ fileEnd2:
//rdfStream.read((char*)&oldNumCatids[currUrl], 1);
long n = fileRead(rdfStream, &oldNumCatids[currUrl], 1);
if ( n < 0 || n > 1 ) {
printf("Error Reading %s\n",
CONTENT_OUTPUT_FILE);
printf("Error Reading %s\n",filename);
//CONTENT_OUTPUT_FILE);
goto oldErrExit;
}
if ( n == 0 )
@ -1839,8 +2063,8 @@ fileEnd2:
long readSize = sizeof(long)*oldNumCatids[currUrl];
n = fileRead(rdfStream, &oldCatids[catidp], readSize);
if ( n < 0 || n > readSize ) {
printf("Error Reading %s\n",
CONTENT_OUTPUT_FILE);
printf("Error Reading %s\n",filename);
//CONTENT_OUTPUT_FILE);
goto oldErrExit;
}
if ( n == 0 )
@ -1907,17 +2131,17 @@ oldIsDifferent:
// also urls to remove
//
// open the new diff file for writing
sprintf(filename, "%s.new.diff", CONTENT_OUTPUT_FILE);
sprintf(filename, "%s%s.new.diff", dir,CONTENT_OUTPUT_FILE);
//outStream.open(filename, ofstream::out|ofstream::trunc);
outStream = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it openned okay
// make sure it opened okay
//if (!outStream.is_open()) {
if ( outStream < 0 ) {
printf("Error Openning %s\n", filename);
printf("Error Opening %s\n", filename);
goto oldErrExit;
}
printf("\nOpenned %s for writing.\n", filename);
printf("\nOpened %s for writing.\n", filename);
// write out the number of urls to update/add
//outStream.write(&numUpdateUrls, sizeof(long));
@ -2027,19 +2251,19 @@ oldGoodExit:
// . now we want to serialize the needed data into
// one (or more?) file(s) to be quickly read by gb
if ( mode == MODE_NEW )
sprintf(filename, "%s", STRUCTURE_OUTPUT_FILE);
sprintf(filename, "%s%s", dir,STRUCTURE_OUTPUT_FILE);
else
sprintf(filename, "%s.new", STRUCTURE_OUTPUT_FILE);
sprintf(filename, "%s%s.new", dir,STRUCTURE_OUTPUT_FILE);
//outStream.open(filename, ofstream::out|ofstream::ate);
outStream = open ( filename, O_WRONLY|O_APPEND,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it openned okay
// make sure it opened okay
//if (!outStream.is_open()) {
if ( outStream < 0 ) {
printf("Error Openning %s\n", filename);
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("\nOpenned %s for writing.\n", filename);
printf("\nOpened %s for writing.\n", filename);
// write the cats
//outStream.write((char*)rdfCats, sizeof(RdfCat)*numRdfCats);
@ -2109,21 +2333,21 @@ oldGoodExit:
// write another file for the urls
if ( mode == MODE_NEW )
sprintf(filename, "%s", CONTENT_OUTPUT_FILE);
sprintf(filename, "%s%s", dir,CONTENT_OUTPUT_FILE);
else
sprintf(filename, "%s.new", CONTENT_OUTPUT_FILE);
sprintf(filename, "%s%s.new", dir,CONTENT_OUTPUT_FILE);
//outStream.open(filename, ofstream::out|ofstream::ate);
outStream = open ( filename, O_WRONLY,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
//outStream.open(filename, ofstream::out|ofstream::trunc);
//endpos = outStream.tellp();
// make sure it openned okay
// make sure it opened okay
//if (!outStream.is_open()) {
if ( outStream < 0 ) {
printf("Error Openning %s\n", filename);
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("\nOpenned %s for writing.\n", filename);
printf("\nOpened %s for writing.\n", filename);
//outStream.seekp(0);
lseek(outStream, 0, SEEK_SET);

File diff suppressed because one or more lines are too long

BIN
libplot.a

Binary file not shown.

Binary file not shown.

118
main.cpp

@ -22,7 +22,7 @@
#include "Titledb.h"
#include "Revdb.h"
#include "Tagdb.h"
//#include "Catdb.h"
#include "Catdb.h"
#include "Users.h"
#include "Tfndb.h"
#include "Spider.h"
@ -1390,7 +1390,7 @@ int main ( int argc , char *argv[] ) {
char structureFile[256];
g_conf.m_maxMem = 1000000000LL; // 1G
g_mem.m_maxMem = 1000000000LL; // 1G
sprintf(structureFile, "%scat/gbdmoz.structure.dat", g_hostdb.m_dir);
sprintf(structureFile, "%scatdb/gbdmoz.structure.dat", g_hostdb.m_dir);
g_categories = &g_categories1;
if (g_categories->loadCategories(structureFile) != 0) {
log("cat: Loading Categories From %s Failed.", structureFile);
@ -2633,8 +2633,8 @@ int main ( int argc , char *argv[] ) {
if ( ! g_tagdb.init() ) {
log("db: Tagdb init failed." ); return 1; }
// the catdb, it's an instance of tagdb, pass RDB_CATDB
//if ( ! g_catdb.init() ) {
// log("db: Catdb1 init failed." ); return 1; }
if ( ! g_catdb.init() ) {
log("db: Catdb1 init failed." ); return 1; }
// initialize Users
if ( ! g_users.init() ){
log("db: Users init failed. "); return 1;}
@ -2842,7 +2842,7 @@ int main ( int argc , char *argv[] ) {
// load up the dmoz categories here
char structureFile[256];
sprintf(structureFile, "%scat/gbdmoz.structure.dat", g_hostdb.m_dir);
sprintf(structureFile, "%scatdb/gbdmoz.structure.dat", g_hostdb.m_dir);
g_categories = &g_categories1;
if (g_categories->loadCategories(structureFile) != 0) {
log("cat: Loading Categories From %s Failed.",
@ -4511,8 +4511,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
if ( h2->m_hostId == 0 ) continue;
sprintf(tmp,
"rcp "
"%scat/content.rdf.u8 "
"%s:%scat/content.rdf.u8",
"%scatdb/content.rdf.u8 "
"%s:%scatdb/content.rdf.u8",
dir,
iptoa(h2->m_ip),
h2->m_dir);
@ -4520,8 +4520,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
system ( tmp );
sprintf(tmp,
"rcp "
"%scat/structure.rdf.u8 "
"%s:%scat/structure.rdf.u8",
"%scatdb/structure.rdf.u8 "
"%s:%scatdb/structure.rdf.u8",
dir,
iptoa(h2->m_ip),
h2->m_dir);
@ -4529,8 +4529,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
system ( tmp );
sprintf(tmp,
"rcp "
"%scat/gbdmoz.structure.dat "
"%s:%scat/gbdmoz.structure.dat",
"%scatdb/gbdmoz.structure.dat "
"%s:%scatdb/gbdmoz.structure.dat",
dir,
iptoa(h2->m_ip),
h2->m_dir);
@ -4538,8 +4538,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
system ( tmp );
sprintf(tmp,
"rcp "
"%scat/gbdmoz.content.dat "
"%s:%scat/gbdmoz.content.dat",
"%scatdb/gbdmoz.content.dat "
"%s:%scatdb/gbdmoz.content.dat",
dir,
iptoa(h2->m_ip),
h2->m_dir);
@ -4547,8 +4547,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
//system ( tmp );
//sprintf(tmp,
// "rcp "
// "%scat/gbdmoz.content.dat.diff "
// "%s:%scat/gbdmoz.content.dat.diff",
// "%scatdb/gbdmoz.content.dat.diff "
// "%s:%scatdb/gbdmoz.content.dat.diff",
// dir,
// iptoa(h2->m_ip),
// h2->m_dir);
@ -4561,8 +4561,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
if ( h2->m_hostId == 0 ) continue;
sprintf(tmp,
"rcp "
"%scat/content.rdf.u8.new "
"%s:%scat/content.rdf.u8.new",
"%scatdb/content.rdf.u8.new "
"%s:%scatdb/content.rdf.u8.new",
dir,
iptoa(h2->m_ip),
h2->m_dir);
@ -4570,8 +4570,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
system ( tmp );
sprintf(tmp,
"rcp "
"%scat/structure.rdf.u8.new "
"%s:%scat/structure.rdf.u8.new",
"%scatdb/structure.rdf.u8.new "
"%s:%scatdb/structure.rdf.u8.new",
dir,
iptoa(h2->m_ip),
h2->m_dir);
@ -4579,8 +4579,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
system ( tmp );
sprintf(tmp,
"rcp "
"%scat/gbdmoz.structure.dat.new "
"%s:%scat/gbdmoz.structure.dat.new",
"%scatdb/gbdmoz.structure.dat.new "
"%s:%scatdb/gbdmoz.structure.dat.new",
dir,
iptoa(h2->m_ip),
h2->m_dir);
@ -4588,8 +4588,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
system ( tmp );
sprintf(tmp,
"rcp "
"%scat/gbdmoz.content.dat.new "
"%s:%scat/gbdmoz.content.dat.new",
"%scatdb/gbdmoz.content.dat.new "
"%s:%scatdb/gbdmoz.content.dat.new",
dir,
iptoa(h2->m_ip),
h2->m_dir);
@ -4597,8 +4597,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
system ( tmp );
sprintf(tmp,
"rcp "
"%scat/gbdmoz.content.dat.new.diff "
"%s:%scat/gbdmoz.content.dat.new.diff",
"%scatdb/gbdmoz.content.dat.new.diff "
"%s:%scatdb/gbdmoz.content.dat.new.diff",
dir,
iptoa(h2->m_ip),
h2->m_dir);
@ -4694,8 +4694,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
if ( h2->m_hostId == 0 ) continue;
sprintf(tmp,
"rcp "
"%scat/content.rdf.u8 "
"%s:%scat/content.rdf.u8",
"%scatdb/content.rdf.u8 "
"%s:%scatdb/content.rdf.u8",
dir,
iptoa(h2->m_ipShotgun),
h2->m_dir);
@ -4703,8 +4703,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
system ( tmp );
sprintf(tmp,
"rcp "
"%scat/structure.rdf.u8 "
"%s:%scat/structure.rdf.u8",
"%scatdb/structure.rdf.u8 "
"%s:%scatdb/structure.rdf.u8",
dir,
iptoa(h2->m_ipShotgun),
h2->m_dir);
@ -4712,8 +4712,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
system ( tmp );
sprintf(tmp,
"rcp "
"%scat/gbdmoz.structure.dat "
"%s:%scat/gbdmoz.structure.dat",
"%scatdb/gbdmoz.structure.dat "
"%s:%scatdb/gbdmoz.structure.dat",
dir,
iptoa(h2->m_ipShotgun),
h2->m_dir);
@ -4721,8 +4721,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
system ( tmp );
sprintf(tmp,
"rcp "
"%scat/gbdmoz.content.dat "
"%s:%scat/gbdmoz.content.dat",
"%scatdb/gbdmoz.content.dat "
"%s:%scatdb/gbdmoz.content.dat",
dir,
iptoa(h2->m_ipShotgun),
h2->m_dir);
@ -4730,8 +4730,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
//system ( tmp );
//sprintf(tmp,
// "rcp "
// "%scat/gbdmoz.content.dat.diff "
// "%s:%scat/gbdmoz.content.dat.diff",
// "%scatdb/gbdmoz.content.dat.diff "
// "%s:%scatdb/gbdmoz.content.dat.diff",
// dir,
// iptoa(h2->m_ip),
// h2->m_dir);
@ -4745,8 +4745,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
if ( h2->m_hostId == 0 ) continue;
sprintf(tmp,
"rcp "
"%scat/content.rdf.u8.new "
"%s:%scat/content.rdf.u8.new",
"%scatdb/content.rdf.u8.new "
"%s:%scatdb/content.rdf.u8.new",
dir,
iptoa(h2->m_ipShotgun),
h2->m_dir);
@ -4754,8 +4754,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
system ( tmp );
sprintf(tmp,
"rcp "
"%scat/structure.rdf.u8.new "
"%s:%scat/structure.rdf.u8.new",
"%scatdb/structure.rdf.u8.new "
"%s:%scatdb/structure.rdf.u8.new",
dir,
iptoa(h2->m_ipShotgun),
h2->m_dir);
@ -4763,8 +4763,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
system ( tmp );
sprintf(tmp,
"rcp "
"%scat/gbdmoz.structure.dat.new "
"%s:%scat/gbdmoz.structure.dat.new",
"%scatdb/gbdmoz.structure.dat.new "
"%s:%scatdb/gbdmoz.structure.dat.new",
dir,
iptoa(h2->m_ipShotgun),
h2->m_dir);
@ -4772,8 +4772,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
system ( tmp );
sprintf(tmp,
"rcp "
"%scat/gbdmoz.content.dat.new "
"%s:%scat/gbdmoz.content.dat.new",
"%scatdb/gbdmoz.content.dat.new "
"%s:%scatdb/gbdmoz.content.dat.new",
dir,
iptoa(h2->m_ipShotgun),
h2->m_dir);
@ -4781,8 +4781,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
system ( tmp );
sprintf(tmp,
"rcp "
"%scat/gbdmoz.content.dat.new.diff "
"%s:%scat/gbdmoz.content.dat.new.diff",
"%scatdb/gbdmoz.content.dat.new.diff "
"%s:%scatdb/gbdmoz.content.dat.new.diff",
dir,
iptoa(h2->m_ipShotgun),
h2->m_dir);
@ -11036,7 +11036,8 @@ void dumpTagdb (char *coll,long startFileNum,long numFiles,bool includeTree,
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
g_tagdb.init ();
g_collectiondb.init(true);
g_tagdb.addColl ( coll, false );
if ( rdbId == RDB_TAGDB ) g_tagdb.addColl ( coll, false );
if ( rdbId == RDB_CATDB ) g_catdb.init();
key128_t startKey ;
key128_t endKey ;
startKey.setMin();
@ -11101,6 +11102,23 @@ void dumpTagdb (char *coll,long startFileNum,long numFiles,bool includeTree,
printf("corrupt tagdb rec k.n0=%llu",k.n0);
continue;
}
// catdb?
if ( rdbId == RDB_CATDB ) {
// for debug!
CatRec crec;
crec.set ( NULL,
data ,
size ,
false);
fprintf(stdout,
"key=%s caturl=%s #catids=%li version=%li\n"
,KEYSTR(&k,12)
,crec.m_url
,(long)crec.m_numCatids
,(long)crec.m_version
);
continue;
}
// parse it up
//TagRec *tagRec = (TagRec *)rec;
Tag *tag = (Tag *)rec;
@ -13997,10 +14015,10 @@ void saveRdbs ( int fd , void *state ) {
last = rdb->getLastWriteTime();
if ( now - last > delta )
if ( ! rdb->close(NULL,NULL,false,false)) return;
//rdb = g_catdb.getRdb();
//last = rdb->getLastWriteTime();
//if ( now - last > delta )
// if ( ! rdb->close(NULL,NULL,false,false)) return;
rdb = g_catdb.getRdb();
last = rdb->getLastWriteTime();
if ( now - last > delta )
if ( ! rdb->close(NULL,NULL,false,false)) return;
//rdb = g_indexdb.getRdb();
//last = rdb->getLastWriteTime();
//if ( now - last > delta )

@ -6,7 +6,8 @@
#include "HashTableT.h"
//make the key, it is just the needles ptr
static HashTableT<unsigned long long , char*> s_quickTables;
//static HashTableT<unsigned long long , char*> s_quickTables;
static HashTableX s_quickTables;
/*
// returns false and sets g_errno on error
@ -63,6 +64,9 @@ bool fast_highlight ( // highlight these query terms:
// to lower and store into tmp[]. TODO.
// . a space (includes \r \n) in a needle will match a consecutive sequence
// of spaces in the haystack
#define BITVEC unsigned long long
char *getMatches2 ( Needle *needles ,
long numNeedles ,
char *haystack ,
@ -108,51 +112,69 @@ char *getMatches2 ( Needle *needles ,
// . TODO: use a static cache of like 4 of these tables where the key
// is the Needles ptr ... done
long numNeedlesToInit = numNeedles;
char space[256 * 5 * sizeof(unsigned long)];
char space[256 * 6 * sizeof(BITVEC)];
char *buf = NULL;
unsigned long *s0;
unsigned long *s1;
unsigned long *s2;
unsigned long *s3;
unsigned long *s4;
BITVEC *s0;
BITVEC *s1;
BITVEC *s2;
BITVEC *s3;
BITVEC *s4;
BITVEC *s5;
/*
static bool s_quickTableInit = false;
static char s_qtbuf[128*(12+1)*2];
long slot = -1;
if(saveQuickTables) {
uint64_t key = (uint32_t)needles;
long slot = s_quickTables.getSlot(key);
if(slot == -1) {
buf = (char*)mcalloc(sizeof(unsigned long)*256*5,
"matches");
if(buf) s_quickTables.addKey(key, buf);
//sanity check, no reason why there needs to be a
//limit, I just don't expect there to be this many
//static needles at this point.
if(s_quickTables.getNumSlotsUsed() > 32){
char *xx=NULL; *xx = 0;
}
if ( ! s_quickTableInit ) {
s_quickTableInit = true;
s_quickTables.set(8,4,128,s_qtbuf,256*13,false,0,"qx");
}
else {
uint64_t key = (uint32_t)needles;
slot = s_quickTables.getSlot(&key);
if ( slot >= 0 ) {
buf = s_quickTables.getValueFromSlot(slot);
numNeedlesToInit = 0;
}
}
*/
if(!buf) {
buf = space;
memset ( buf , 0 , sizeof(unsigned long)*256*5);
memset ( buf , 0 , sizeof(BITVEC)*256*6);
}
long offset = 0;
s0 = (unsigned long*)(buf + offset);
offset += sizeof(unsigned long)*256;
s1 = (unsigned long*)(buf + offset);
offset += sizeof(unsigned long)*256;
s2 = (unsigned long*)(buf + offset);
offset += sizeof(unsigned long)*256;
s3 = (unsigned long*)(buf + offset);
offset += sizeof(unsigned long)*256;
s4 = (unsigned long*)(buf + offset);
/*
if( useQuickTables && slot == -1 ) {
//buf = (char*)mcalloc(sizeof(unsigned long)*256*5,
// "matches");
if(buf) s_quickTables.addKey(&key, &buf);
//sanity check, no reason why there needs to be a
//limit, I just don't expect there to be this many
//static needles at this point.
if(s_quickTables.getNumSlotsUsed() > 32){
char *xx=NULL; *xx = 0;
}
}
*/
unsigned long mask;
// try 64 bit bit vectors now since we doubled # of needles
long offset = 0;
s0 = (BITVEC *)(buf + offset);
offset += sizeof(BITVEC)*256;
s1 = (BITVEC *)(buf + offset);
offset += sizeof(BITVEC)*256;
s2 = (BITVEC *)(buf + offset);
offset += sizeof(BITVEC)*256;
s3 = (BITVEC *)(buf + offset);
offset += sizeof(BITVEC)*256;
s4 = (BITVEC *)(buf + offset);
offset += sizeof(BITVEC)*256;
s5 = (BITVEC *)(buf + offset);
BITVEC mask;
// set the letter tables, s0[] through sN[], for each needle
for ( long i = 0 ; i < numNeedlesToInit ; i++ ) {
@ -160,7 +182,8 @@ char *getMatches2 ( Needle *needles ,
QUICKPOLL(niceness);
unsigned char *w = (unsigned char *)needles[i].m_string;
unsigned char *wend = w + needles[i].m_stringSize;
mask = (1<<(i&0x1f)); // (1<<(i%32));
// BITVEC is now 64 bits
mask = (1<<(i&0x3f)); // (1<<(i%64));
// if the needle is small, fill up the remaining letter tables
// with its mask... so it matches any character in haystack.
s0[(unsigned char)to_lower_a(*w)] |= mask;
@ -172,6 +195,7 @@ char *getMatches2 ( Needle *needles ,
s2[j] |= mask;
s3[j] |= mask;
s4[j] |= mask;
s5[j] |= mask;
}
continue;
}
@ -184,6 +208,7 @@ char *getMatches2 ( Needle *needles ,
s2[j] |= mask;
s3[j] |= mask;
s4[j] |= mask;
s5[j] |= mask;
}
continue;
}
@ -195,6 +220,7 @@ char *getMatches2 ( Needle *needles ,
for ( long j = 0 ; j < 256 ; j++ ) {
s3[j] |= mask;
s4[j] |= mask;
s5[j] |= mask;
}
continue;
}
@ -206,12 +232,24 @@ char *getMatches2 ( Needle *needles ,
if ( w >= wend ) {
for ( long j = 0 ; j < 256 ; j++ ) {
s4[j] |= mask;
s5[j] |= mask;
}
continue;
}
s4[(unsigned char)to_lower_a(*w)] |= mask;
s4[(unsigned char)to_upper_a(*w)] |= mask;
w += 1;//step;
if ( w >= wend ) {
for ( long j = 0 ; j < 256 ; j++ ) {
s5[j] |= mask;
}
continue;
}
s5[(unsigned char)to_lower_a(*w)] |= mask;
s5[(unsigned char)to_upper_a(*w)] |= mask;
w += 1;//step;
}
// return a ptr to the first match if we should, this is it
@ -245,6 +283,8 @@ char *getMatches2 ( Needle *needles ,
if ( ! mask ) continue;
mask &= s4[*(p+4)];
if ( ! mask ) continue;
mask &= s5[*(p+5)];
if ( ! mask ) continue;
//debugCount++;
/*
// display
@ -273,7 +313,7 @@ char *getMatches2 ( Needle *needles ,
// we got a good candidate, loop through all the needles
for ( long j = 0 ; j < numNeedles ; j++ ) {
// skip if does not match mask, will save time
if ( ! ((1<<(j&0x1f)) & mask) ) continue;
if ( ! ((1<<(j&0x3f)) & mask) ) continue;
if( needles[j].m_stringSize > 3) {
// ensure first 4 bytes matches this needle's
if (needles[j].m_string[0]!=to_lower_a(*(p+0)))
@ -421,7 +461,7 @@ char *getMatches2 ( Needle *needles ,
// we got a good candidate, loop through all the needles
for ( long j = 0 ; j < numNeedles ; j++ ) {
// skip if does not match mask, will save time
if ( ! ((1<<(j&0x1f)) & mask) ) continue;
if ( ! ((1<<(j&0x3f)) & mask) ) continue;
if( needles[j].m_stringSize > 3) {
// ensure first 4 bytes matches this needle's
if (needles[j].m_string[0]!=to_lower_a(*(p+0)))

2520
plotter.h

File diff suppressed because it is too large Load Diff

@ -16,8 +16,10 @@
bool mainShutdown ( bool urgent ) { return true; }
bool closeAll ( void *state , void (* callback)(void *state) ) {return true;}
bool allExit ( ) { return true; }
long g_qbufNeedSave = false;
SafeBuf g_qbuf;
//long g_qbufNeedSave = false;
//SafeBuf g_qbuf;
bool sendPageSEO(class TcpSocket *s, class HttpRequest *hr) {return true;}
int main ( int argc , char *argv[] ) {
bool addWWW = true;