//
// Gigablast, Copyright March 2005
// Author: Javier Olivares <jolivares@gigablast.com>
//
// DMOZ RDF file parser into proprietary format.
// See the "usage" note in the main function for usage and features.
// I apologize to anyone who must maintain or even simply read this code.
//

#include "gb-include.h"

#include <iostream>
#include <fstream>
#include "Url.h"
#include "HttpRequest.h"
#include "sort.h"

#undef malloc
#undef calloc
#undef realloc

// no-op stubs so this standalone tool links against the gb objects
bool closeAll ( void *state , void (* callback)(void *state) ) { return true; }
bool allExit ( ) { return true; }

bool sendPageSEO ( TcpSocket *s , HttpRequest *hr ) { return true; }

//int32_t g_qbufNeedSave = false;
//SafeBuf g_qbuf;

bool    g_recoveryMode;
int32_t g_recoveryLevel;

int g_inMemcpy;

#define RDFBUFFER_SIZE    (1024*1024*10)
#define RDFSTRUCTURE_FILE "structure.rdf.u8"
#define RDFCONTENT_FILE   "content.rdf.u8"

#define STRUCTURE_OUTPUT_FILE   "gbdmoz.structure.dat"
#define CONTENT_OUTPUT_FILE     "gbdmoz.content.dat"
#define URL_OUTPUT_FILE         "gbdmoz.urls.dat"
#define URLTEXT_OUTPUT_FILE     "gbdmoz.urls.txt"
#define DIFFURLTEXT_OUTPUT_FILE "gbdmoz.diffurls.txt"
#define CATEGORY_OUTPUT_FILE    "gbdmoz.categories.txt"

#define NAME_BUFFER_SIZE    (24*1024*1024)
#define CAT_BUFFER_SIZE     (256*1024)
#define URL_BUFFER_SIZE     (32*1024*1024)
#define URLINFO_BUFFER_SIZE (1024*1024)

#define MAX_CATID_LEN   63
#define MAX_TAG_LEN     127
#define MAX_URL_CATIDS  32
#define MAX_URLTXT_SIZE 500000

#define HASHTABLE_SIZE    (1024*1024)
#define URLHASHTABLE_SIZE (10*1024*1024)

#define MODE_NONE        0
#define MODE_NEW         1
#define MODE_UPDATE      2
#define MODE_URLDUMP     3
#define MODE_DIFFURLDUMP 4
#define MODE_CATDUMP     5

#define OLDURL_BUFFER_SIZE   (32*1024*1024)
#define OLDCATID_BUFFER_SIZE (1024*1024)

using namespace std;

// struct for a linked-list hash table
struct HashLink {
	int32_t   m_keyOffset;
	int32_t   m_keyLen;
	int32_t   m_data;
	HashLink *m_next;
};

// another hash table, for urls
struct UrlHashLink {
	uint64_t     m_key;
	//uint32_t   m_key2;
	//int32_t    m_urlOffset;
	//int32_t    m_urlLen;
	int32_t      m_index;
	UrlHashLink *m_next;
};

// structure to store url info
struct UrlInfo {
	//uint64_t m_hash;
	//int16_t  m_urlLen;
	//int32_t  m_urlOffset;
	unsigned char m_numCatids;
	//int32_t  m_catids[MAX_URL_CATIDS];
	int32_t      *m_catids;
	char          m_changed;
};

// struct for storing categories and their related info
struct RdfCat {
	int32_t  m_catid;
	int32_t  m_parentid;
	//int16_t  m_numSymParents;
	//int32_t *m_symParents;
	int32_t  m_nameOffset;
	int16_t  m_nameLen;
	uint32_t m_structureOffset;
	uint32_t m_contentOffset;
	uint32_t m_catHash;
	int32_t  m_numUrls;
};

// hash tables
HashLink    *hashTable    [HASHTABLE_SIZE];
UrlHashLink *urlHashTable [URLHASHTABLE_SIZE];

// url buffer
char    *urlBuffer     = NULL;
int32_t  urlBufferSize = 0;
int32_t  urlBufferLen  = 0;

// url info array
UrlInfo *urlInfos     = NULL;
int32_t  urlInfosSize = 0;
int32_t  numUrlInfos  = 0;

// categories
RdfCat  *rdfCats     = NULL;
int32_t  rdfCatsSize = 0;
int32_t  numRdfCats  = 0;

// rdf file stream
//ifstream rdfStream;
int   rdfStream;
char *rdfBuffer = NULL;
char *rdfPtr    = NULL;
char *rdfEnd    = NULL;

// output file streams for serialization
//ofstream outStream;
//ofstream outStream2;
int outStream;
int outStream2;

// offset into the file
uint32_t currOffset = 0;

// cat name buffer
char    *nameBuffer     = NULL;
int32_t  nameBufferSize = 0;
int32_t  nameBufferLen  = 0;

// catid buffer
char    catidBuffer[MAX_CATID_LEN+1];
int32_t catidLen = 0;

// tag buffer
char    tagBuffer[MAX_TAG_LEN+1];
int32_t tagLen = 0;

bool mainShutdown ( bool urgent ) { return true; }

// increment the ptr into the file, possibly reading the next chunk
char* incRdfPtr( int32_t skip = 1 ) {
	int32_t n;
	for (int32_t i = 0; i < skip; i++) {
		rdfPtr++;
		currOffset++;
		// pull the next chunk if we're at the end
		if (rdfPtr >= rdfEnd) {
			// if nothing left, return NULL
			//if (!rdfStream.good())
			//	return NULL;
			// get the next chunk
			//rdfStream.read(rdfBuffer, RDFBUFFER_SIZE);
			//n = rdfStream.gcount();
			n = read(rdfStream, rdfBuffer, RDFBUFFER_SIZE);
			if ( n <= 0 || n > RDFBUFFER_SIZE )
				return NULL;
			rdfPtr = rdfBuffer;
			rdfEnd = &rdfBuffer[n];
		}
	}
	return rdfPtr;
}

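// Usage sketch (disabled, illustrative only): the parser never indexes the
// rdf file directly; it walks one char at a time through rdfPtr and lets
// incRdfPtr() refill rdfBuffer from rdfStream whenever the RDFBUFFER_SIZE
// window runs out. This assumes rdfStream/rdfBuffer/rdfPtr/rdfEnd have
// already been set up as in main().
#if 0
static int32_t countRemainingChar ( char c ) {
	int32_t count = 0;
	// examine the current char, then advance until end of file
	do {
		if ( *rdfPtr == c )
			count++;
	} while ( incRdfPtr() );
	return count;
}
#endif
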
// parse the rdf file up past a given start tag
int32_t rdfParse ( char *tagName ) {
	//bool inQuote = false;
	do {
		int32_t matchPos = 0;
		// . move to the next tag
		// . quotes are no longer escaped out in the newer dmoz
		//   files of oct 2013, so take that quote logic out.
		//   '<' is still encoded as &lt; though... perhaps only
		//   check for quotes when in a tag?
		while (*rdfPtr != '<' ) { // || inQuote ) {
			// check for quotes
			//if (*rdfPtr == '"')
			//	inQuote = !inQuote;
			// next char
			if (!incRdfPtr())
				return -1;
		}
		// check if the tag is good
		do {
			if (!incRdfPtr())
				return -1;
			if (*rdfPtr != tagName[matchPos])
				break;
			matchPos++;
		} while (tagName[matchPos]);
		// matched if we're at the end of the tagName
		if (!tagName[matchPos]) {
			if (!incRdfPtr())
				return -1;
			return 0;
		}
		// otherwise it's not a match, keep going
		matchPos = 0;
	} while (true);
}

// move to the next tag in the file
int32_t rdfNextTag ( ) {
	//bool inQuote = false;
	// move to the next tag
	while (*rdfPtr != '<' ) { // || inQuote ) {
		// check for quotes
		// NO! too many unbalanced quotes all over the place!
		// and i think quotes in tags do not have < or > in them
		// because they should be encoded as &gt; and &lt;
		//if (*rdfPtr == '"')
		//	inQuote = !inQuote;
		// next char
		if (!incRdfPtr())
			return -1;
	}
	// skip the <
	if (!incRdfPtr())
		return -1;
	// put the tag name in a buffer
	tagLen = 0;
	while ( *rdfPtr != ' ' &&
		*rdfPtr != '>' ) {
		// insert the current char
		if (tagLen < MAX_TAG_LEN) {
			tagBuffer[tagLen] = *rdfPtr;
			tagLen++;
		}
		// next char
		if (!incRdfPtr())
			return -1;
	}
	tagBuffer[tagLen] = '\0';
	// success
	return 0;
}

// compare two cats, for gbsort
int catcomp ( const void *c1, const void *c2 ) {
	return (((RdfCat*)c1)->m_catid - ((RdfCat*)c2)->m_catid);
}

// hash a string
uint32_t catHash ( char *key, int32_t keyLen ) {
	// simple hash
	uint32_t hash = 0;
	for (int32_t i = 0; i < keyLen; i++)
		hash ^= key[i]*i;
	return (hash % HASHTABLE_SIZE);
}

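// Worked example (illustrative): catHash("Arts", 4) computes
// ('A'*0) ^ ('r'*1) ^ ('t'*2) ^ ('s'*3) = 0 ^ 114 ^ 232 ^ 345 = 451,
// then 451 % HASHTABLE_SIZE = 451. Since the i=0 byte is multiplied by
// zero, the first char never contributes, so every single-char key
// lands in bucket 0.
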
// NOTE: these hash functions assume the name buffer
// and key offset are preserved throughout the
// use of the hash

// init the hash table
void initHashTable ( ) {
	for (int32_t i = 0; i < HASHTABLE_SIZE; i++)
		hashTable[i] = NULL;
}

// clear the hash table
void clearHashTable ( ) {
	for (int32_t i = 0; i < HASHTABLE_SIZE; i++) {
		while (hashTable[i]) {
			HashLink *next = hashTable[i]->m_next;
			free(hashTable[i]);
			hashTable[i] = next;
		}
		hashTable[i] = NULL;
	}
}

// add a string to the hash table with the given data
int32_t addCatHash ( int32_t keyOffset, int32_t keyLen, int32_t data ) {
	// get the hash value
	uint32_t hashKey = catHash(&nameBuffer[keyOffset], keyLen);
	// get the first node
	HashLink **currLink = &hashTable[hashKey];
	// go to the first empty node
	while (*currLink)
		currLink = &((*currLink)->m_next);
	// fill the node
	*currLink = (HashLink*)malloc(sizeof(HashLink));
	if (!(*currLink))
		return -1;
	(*currLink)->m_keyOffset = keyOffset;
	(*currLink)->m_keyLen    = keyLen;
	(*currLink)->m_data      = data;
	(*currLink)->m_next      = NULL;
	return 0;
}

// get the data in the hash using a string key
int32_t getCatHash ( char *key, int32_t keyLen ) {
	// get the hash value
	uint32_t hashKey = catHash(key, keyLen);
	// get the first node
	HashLink *currLink = hashTable[hashKey];
	// go to the correct node
	while ( currLink &&
		( currLink->m_keyLen != keyLen ||
		  strncmp(&nameBuffer[currLink->m_keyOffset],
			  key, keyLen) != 0 ) )
		currLink = currLink->m_next;
	// return -1 if not found
	if (!currLink)
		return -1;
	else
		return currLink->m_data;
}

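// Round-trip sketch (disabled; the catid value 5 is hypothetical): keys
// live in nameBuffer, so a name must be appended there before calling
// addCatHash() and must stay at that offset for as long as getCatHash()
// is used (see the NOTE above catHash).
#if 0
static void catHashDemo ( ) {
	int32_t off = nameBufferLen;
	gbmemcpy ( &nameBuffer[off], "Top/Arts", 8 );
	nameBufferLen += 8;
	// map "Top/Arts" to a hypothetical catid of 5
	addCatHash ( off, 8, 5 );
	// looks up the same bytes and returns 5
	int32_t catid = getCatHash ( &nameBuffer[off], 8 );
	printf ( "catid=%"INT32"\n", catid );
}
#endif
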
// init the url hash table
void initUrlHashTable ( ) {
	for (int32_t i = 0; i < URLHASHTABLE_SIZE; i++)
		urlHashTable[i] = NULL;
}

// clear the url hash table
void clearUrlHashTable ( ) {
	for (int32_t i = 0; i < URLHASHTABLE_SIZE; i++) {
		while (urlHashTable[i]) {
			UrlHashLink *next = urlHashTable[i]->m_next;
			free(urlHashTable[i]);
			urlHashTable[i] = next;
		}
		urlHashTable[i] = NULL;
	}
}

// add a url hash to the hash table with the given index
|
|
int32_t addUrlHash ( uint64_t key,
|
|
//uint32_t key2,
|
|
int32_t index ) {
|
|
//int32_t index,
|
|
//int32_t urlOffset,
|
|
//int32_t urlLen ) {
|
|
// get the hash value
|
|
uint32_t hashKey = (key%(uint64_t)URLHASHTABLE_SIZE);
|
|
// get the first node
|
|
UrlHashLink **currLink = &urlHashTable[hashKey];
|
|
// go to the first empty node
|
|
while (*currLink)
|
|
currLink = &((*currLink)->m_next);
|
|
// fill the node
|
|
*currLink = (UrlHashLink*)malloc(sizeof(UrlHashLink));
|
|
if (!(*currLink))
|
|
return -1;
|
|
(*currLink)->m_key = key;
|
|
//(*currLink)->m_key2 = key2;
|
|
(*currLink)->m_index = index;
|
|
//(*currLink)->m_urlOffset = urlOffset;
|
|
//(*currLink)->m_urlLen = urlLen;
|
|
(*currLink)->m_next = NULL;
|
|
return 0;
|
|
}
|
|
|
|
// get the index in the hash using the hash key
int32_t getUrlHash ( uint64_t key ) {
		     //uint32_t key2 ) {
		     //int32_t urlOffset,
		     //int32_t urlLen ) {
	// get the hash value
	uint32_t hashKey = (key%(uint64_t)URLHASHTABLE_SIZE);
	// get the first node
	UrlHashLink *currLink = urlHashTable[hashKey];
	// go to the correct node
	while ( currLink && currLink->m_key != key )
		//( currLink->m_key != key || currLink->m_key2 != key2 ) )
		//( currLink->m_key != key || currLink->m_key2 != key2 ||
		//  currLink->m_urlLen != urlLen ||
		//  strncasecmp(&urlBuffer[currLink->m_urlOffset],
		//	      &urlBuffer[urlOffset], urlLen) != 0) )
		currLink = currLink->m_next;
	// return -1 if not found
	if (!currLink)
		return -1;
	else
		return currLink->m_index;
}

// do a binary search to get a cat from an id
int32_t getIndexFromId ( int32_t catid ) {
	int32_t low  = 0;
	int32_t high = numRdfCats-1;
	int32_t currCat;
	// binary search
	//while (rdfCats[currCat].m_catid != catid) {
	while (low <= high) {
		// next check spot
		currCat = (low + high)/2;
		// check for hit
		if (rdfCats[currCat].m_catid == catid)
			return currCat;
		// shift search range
		else if (rdfCats[currCat].m_catid > catid)
			high = currCat-1;
		else
			low = currCat+1;
	}
	//printf("catid %"INT32" not found. sanity checking.\n",catid);
	// sanity check our algo
	//for ( int32_t i = 0 ; i < numRdfCats ; i++ ) {
	//	if ( rdfCats[i].m_catid == catid ) { char *xx=NULL;*xx=0;}
	//}
	// not found
	return -1;
}

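// Note: this binary search is only valid after main() sorts the array
// with gbsort(rdfCats, numRdfCats, sizeof(RdfCat), catcomp). E.g. with
// sorted catids {2, 5, 9}, getIndexFromId(5) probes the middle slot and
// returns index 1, while getIndexFromId(7) narrows to an empty range
// and returns -1.
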
// print cat information
void printCats ( int32_t start, int32_t end ) {
	for (int32_t i = start; i < end; i++) {
		printf("Cat %"INT32":\n", i);
		printf("  CatID: %"INT32"\n", rdfCats[i].m_catid);
		printf("  Name: ");
		for (int32_t n = rdfCats[i].m_nameOffset;
		     n < rdfCats[i].m_nameOffset + rdfCats[i].m_nameLen; n++)
			printf("%c", nameBuffer[n]);
		printf("\n");
		printf("  Name Offset: %"INT32"\n", rdfCats[i].m_nameOffset);
		printf("  Structure Offset: %"UINT32"\n",
		       rdfCats[i].m_structureOffset);
		printf("  Content Offset: %"UINT32"\n",
		       rdfCats[i].m_contentOffset);
		printf("  Parent: %"INT32"\n", rdfCats[i].m_parentid);
		printf("\n");
	}
}

// parse out the next catid
int32_t parseNextCatid() {
	// parse for <catid>, this will be the next cat
	if (rdfParse("catid") == -1)
		return -1;
	// go to the catid, skip '>'
	if (!incRdfPtr())
		return -1;
	catidLen = 0;
	while (*rdfPtr != '<') {
		if (catidLen < MAX_CATID_LEN) {
			catidBuffer[catidLen] = *rdfPtr;
			catidLen++;
		}
		if (!incRdfPtr())
			return -1;
	}
	catidBuffer[catidLen] = '\0';
	// translate the id
	return atol(catidBuffer);
}

// fill the next quoted string in the name buffer
int32_t fillNextString() {
	// get the next string, skip to the next quote
	while (*rdfPtr != '"') {
		if (!incRdfPtr())
			return -1;
	}
	// skip the quote
	if (!incRdfPtr())
		return -1;
	// . pointing at the string now
	// . dump it in the buffer
	int32_t nameLen = 0;
	while (*rdfPtr != '"') {
		// make sure there's room in the buffer
		if (nameBufferLen+nameLen >= nameBufferSize) {
			nameBufferSize += NAME_BUFFER_SIZE;
			nameBuffer = (char*)realloc((void*)nameBuffer,
						sizeof(char)*nameBufferSize);
			printf("nameBuffer: %"INT32" bytes\n", nameBufferSize);
			if (!nameBuffer)
				return -2;
		}
		// fill the next character
		nameBuffer[nameBufferLen+nameLen] = *rdfPtr;
		nameLen++;
		if (!incRdfPtr())
			return -1;
	}
	// step past the quote
	if (!incRdfPtr())
		return -1;
	// return the length
	return nameLen;
}

// fill the next quoted url in the url buffer
int32_t fillNextUrl() {
	// get the next string, skip to the next quote
	while (*rdfPtr != '"') {
		if (!incRdfPtr())
			return -1;
	}
	// skip the quote
	if (!incRdfPtr())
		return -1;
	// . pointing at the string now
	// . dump it in the buffer
	int32_t urlLen = 0;
	while (*rdfPtr != '"') {
		// make sure there's room in the buffer
		if (urlBufferLen+urlLen+10 >= urlBufferSize) {
			urlBufferSize += URL_BUFFER_SIZE;
			urlBuffer = (char*)realloc((void*)urlBuffer,
						sizeof(char)*urlBufferSize);
			printf("urlBuffer: %"INT32" bytes\n", urlBufferSize);
			if (!urlBuffer)
				return -2;
		}
		// fill the next character
		urlBuffer[urlBufferLen+urlLen] = *rdfPtr;
		urlLen++;
		if (!incRdfPtr())
			return -1;
	}
	// step past the quote
	if (!incRdfPtr())
		return -1;
	// return the length
	return urlLen;
}

// check the url for all valid characters
bool isGoodUrl ( char *url, int32_t urlLen ) {
	// . all we're going to check for right now are
	//   characters that show up as spaces
	if ( urlLen <= 0 )
		return false;
	for (int32_t i = 0; i < urlLen; i++) {
		if (is_wspace_a(url[i]))
			return false;
	}
	// check for [prot]://[url]
	int32_t bef = 0;
	char *p    = url;
	char *pend = url + urlLen;
	while ( p < pend && *p != ':' ) {
		p++;
		bef++;
	}
	if ( bef == 0 || pend - p < 3 || p[1] != '/' || p[2] != '/' )
		return false;
	// good url
	return true;
}

// print the category path
int32_t printCatPath ( char *str, int32_t catid, bool raw ) {
	int32_t catIndex;
	int32_t parentId;
	char *p = str;
	// get the index
	catIndex = getIndexFromId(catid);
	if (catIndex < 1)
		return 0;
	// get the parent
	parentId = rdfCats[catIndex].m_parentid;

	// . print the parent(s) first
	// . in NEWER DMOZ dumps, "Top" is catid 2 and catid 1 is an
	//   empty title. really catid 2 is Top/World but that is an
	//   error that we correct below. (see "Top/World" below.)
	//   but do not include the "Top/" as part of the path name
	if ( catid == 2 ) {
		// no! we now include Top as part of the path. let's
		// be consistent. i'd rather have www.gigablast.com/Top
		// and www.gigablast.com/Top/Arts etc. then i know if the
		// path starts with /Top that it is dmoz!!
		sprintf(p,"Top");
		return 3;
	}

	if (parentId > 1 &&
	    // the newer dmoz files have the catid == the parent id of
	    // i guess top most categories, like "Top/Arts"... i would think
	    // it should have a parentId of 1 like the old dmoz files,
	    // so it's probably a bug on dmoz's end
	    parentId != catid ) {
		p += printCatPath(p, parentId, raw);
		// print spacing
		if (!raw) p += sprintf(p, " / ");
		else      p += sprintf(p, "/");
	}
	// print this category name
	int32_t nameLen = rdfCats[catIndex].m_nameLen;
	gbmemcpy ( p,
		   &nameBuffer[rdfCats[catIndex].m_nameOffset],
		   nameLen );
	p += nameLen;
	// null terminate
	*p = '\0';
	// return length
	return (p - str);
}

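// Worked example (illustrative catids): if catid 7 is "Music" with
// parent 4, catid 4 is "Arts" with parent 2, and catid 2 is the special
// "Top" root, then printCatPath(buf, 7, true) recurses parent-first and
// leaves "Top/Arts/Music" in buf; with raw=false it yields
// "Top / Arts / Music" instead.
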
// normalize a url in place; returns the new length, or 0 to throw it out
int32_t fixUrl ( char *url, int32_t urlLen ) {
	int32_t slashi = 0;
	int32_t newUrlLen = urlLen;
	// check for a bad protocol, something:
	while (url[slashi] != ':') {
		slashi++;
		// if no :, throw it out
		if (slashi >= newUrlLen)
			return 0;
	}
	// check for a ://
	if (newUrlLen - slashi < 3)
		return 0;
	if (url[slashi]   != ':' ||
	    url[slashi+1] != '/' ||
	    url[slashi+2] != '/') {
		// fix news: to news://
		if (strncasecmp(url, "news:", 5) == 0) {
			char newsFix[1024];
			// throw out urls too long for the local copy
			if (newUrlLen + 2 > (int32_t)sizeof(newsFix))
				return 0;
			gbmemcpy(newsFix, url, newUrlLen);
			gbmemcpy(url, newsFix, 5);
			gbmemcpy(&url[5], "//", 2);
			gbmemcpy(&url[7], &newsFix[5], newUrlLen - 5);
			newUrlLen += 2;
		}
		// otherwise throw it out
		else
			return 0;
	}
	slashi += 3;
	// . jump over http:// if it starts with http://http://
	// . generic for any protocol
	char prot[1024];
	if ( slashi*2 >= (int32_t)sizeof(prot) )
		return 0;
	gbmemcpy(prot, url, slashi);
	// double the protocol ("http://" -> "http://http://") so we can
	// match a repeated protocol prefix below
	gbmemcpy(&prot[slashi], prot, slashi);
	prot[slashi*2] = '\0';
	while ( newUrlLen > slashi*2 &&
		strncasecmp(url, prot, slashi*2) == 0 ) {
		// remove the extra protocol
		memmove(url, &url[slashi], newUrlLen - slashi);
		newUrlLen -= slashi;
	}
	/*
	// remove a www.
	if (newUrlLen - slashi >= 4 &&
	    strncasecmp(&url[slashi], "www.", 4) == 0) {
		memmove(&url[slashi], &url[slashi+4], newUrlLen - (slashi+4));
		newUrlLen -= 4;
	}
	*/
	// look for //, cut down to a single /, remove any spaces
	for (; slashi < newUrlLen; slashi++) {
		if (url[slashi-1] == '/' && url[slashi] == '/') {
			memmove(&url[slashi-1], &url[slashi],
				newUrlLen - slashi);
			newUrlLen--;
			// recheck this spot for a longer run of slashes
			slashi--;
			continue;
		}
		if (is_wspace_a(url[slashi])) {
			memmove(&url[slashi], &url[slashi+1],
				newUrlLen - (slashi+1));
			newUrlLen--;
			// recheck this spot for another space
			slashi--;
		}
	}
	// remove any anchor
	// mdw, sep 2013, no because there is twitter.com/#!/ronpaul
	// and others...
	/*
	for (int32_t i = 0; i < newUrlLen; i++) {
		if (url[i] == '#') {
			newUrlLen = i;
			break;
		}
	}
	*/
	// remove any trailing /
	if (url[newUrlLen-1] == '/')
		newUrlLen--;
	// return the new length
	return newUrlLen;
}

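// Expected-behavior sketch (disabled; the sample urls are hypothetical):
#if 0
static void fixUrlDemo ( ) {
	char u[64];
	// duplicate protocol stripped, "//" collapsed, trailing '/' cut:
	// yields "http://example.com/a" (length 20)
	strcpy ( u, "http://http://example.com//a/" );
	int32_t len = fixUrl ( u, gbstrlen(u) );
	// "news:" urls get rewritten to "news://":
	// yields "news://comp.lang.c" (length 18)
	strcpy ( u, "news:comp.lang.c" );
	len = fixUrl ( u, gbstrlen(u) );
	// no "://" and no "news:" prefix means the url is thrown out:
	// yields 0
	strcpy ( u, "mailto:someone" );
	len = fixUrl ( u, gbstrlen(u) );
	(void)len;
}
#endif
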
// properly read from file: loop until exactly count bytes are read
int32_t fileRead ( int fileid, void *buf, size_t count ) {
	char *p = (char*)buf;
	int32_t n = 0;
	uint32_t sizeRead = 0;
	while ( sizeRead < count ) {
		n = read ( fileid, p, count - sizeRead );
		if ( n <= 0 || n > (int32_t)count )
			return n;
		sizeRead += n;
		p += n;
	}
	return sizeRead;
}

// properly write to file: loop until exactly count bytes are written
int32_t fileWrite ( int fileid, void *buf, size_t count ) {
	char *p = (char*)buf;
	int32_t n = 0;
	uint32_t sizeWrote = 0;
	while ( sizeWrote < count ) {
		n = write ( fileid, p, count - sizeWrote );
		if ( n <= 0 || n > (int32_t)count )
			return n;
		sizeWrote += n;
		p += n;
	}
	return sizeWrote;
}

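// Usage sketch (disabled; the fds are hypothetical): read() and write()
// may transfer fewer bytes than requested, so the fixed-width records in
// the .dat files below are moved with these wrappers rather than with
// bare read()/write().
#if 0
static bool copyCount ( int inFd, int outFd ) {
	int32_t count;
	if ( fileRead  ( inFd,  &count, sizeof(int32_t) ) !=
	     sizeof(int32_t) ) return false;
	if ( fileWrite ( outFd, &count, sizeof(int32_t) ) !=
	     sizeof(int32_t) ) return false;
	return true;
}
#endif
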
// print special meta tags to tell gigablast to only spider/index
// the links and not the links of the links, b/c we only want
// to index the dmoz urls. AND ignore any external error like
// ETCPTIMEDOUT when indexing a dmoz url so we can be sure to index
// all of them under the proper category, so our gbcatid:xxx search
// works and we can replicate dmoz accurately. see XmlDoc.cpp
// addOutlinksSpiderRecsToMetaList() and indexDoc() to see
// where these meta tags come into play.
void writeMetaTags ( int outStream2 ) {
	char *str =
		"<!-- do not spider the links of the links -->\n"
		"<meta name=spiderlinkslinks content=0>\n"
		"<!--ignore tcp timeouts, dns timeouts, etc.-->\n"
		"<meta name=ignorelinksexternalerrors content=1>\n"
		"<!--do not index this document, but get links from it-->\n"
		"<meta name=noindex content=1>\n"
		// tell gigablast to not do a dns lookup on every
		// outlink when adding spiderRequests to spiderdb
		// for each outlink. will save time up front but
		// will have to be done when spidering the doc.
		"<!-- do not lookup the ip address of every outlink, "
		"but use hash of the subdomain as the ip -->\n"
		"<meta name=usefakeips content=1>\n"
		;
	int32_t len = gbstrlen(str);
	if ( write ( outStream2, str , len ) != len )
		printf("Error writing to outStream2\n");
}

// main parser
int main ( int argc, char *argv[] ) {
	int32_t n;
	int32_t t  = 0;
	int32_t ti = 0;
	int32_t m  = 0;
	int32_t newNameBufferSize = 0;
	int32_t newOffset = 0;
	char filename[1256];
	int32_t urlTxtCount = 0;
	int32_t urlTxtFile  = 0;
	Url normUrl;
	char decodedUrl[MAX_URL_LEN];
	char htmlDecoded[MAX_HTTP_FILENAME_LEN];
	//int32_t numSymParents = 0;
	//int32_t endpos;
	// url diff stuff
	int32_t numUpdateIndexes = 0;
	int32_t *updateIndexes = NULL;
	int32_t currUrl = 0;
	int32_t currDiffIndex = 0;
	// options
	bool splitUrls = false;
	char mode = MODE_NONE;
	int32_t totalNEC = 0;
	char *dir = "";
	bool firstTime;

	// check the options and mode
	for (int32_t i = 0; i < argc; i++) {
		if (strcmp(argv[i], "-s") == 0)
			splitUrls = true;
		else if (strcmp(argv[i], "urldump") == 0)
			mode = MODE_URLDUMP;
		else if (strcasecmp(argv[i], "update") == 0)
			mode = MODE_UPDATE;
		else if (strcasecmp(argv[i], "new") == 0)
			mode = MODE_NEW;
		else if (strcasecmp(argv[i], "diffurldump") == 0)
			mode = MODE_DIFFURLDUMP;
		else if (strcasecmp(argv[i], "catdump") == 0)
			mode = MODE_CATDUMP;
	}

	// check for correct call
	if (mode == MODE_NONE) {
		printf("\n"
		       "Usage: dmozparse [OPTIONS] [MODE]\n"
		       "\n"
		       "Modes:\n"
		       "  new          Generate new .dat files.\n"
		       "\n"
		       "  update       Generate new .dat.new files, updating\n"
		       "               existing .dat files. Changes will be\n"
		       "               written to gbdmoz.changes.dat.new.\n"
		       "               Catdb will update using these files\n"
		       "               when told to update.\n"
		       "\n"
		       "  urldump      Dump urls to file only. This will not\n"
		       "               create any .dat files, only url txt\n"
		       "               files.\n"
		       "\n"
		       "  diffurldump  Dump urls that are new, changed, or\n"
		       "               removed in the latest update. (Uses\n"
		       "               gbdmoz.content.dat.new.diff)\n"
		       "\n"
		       "  catdump      Dump categories to file only.\n"
		       "\n"
		       "Options:\n"
		       "  -s           Split url output into multiple files.\n"
		       "               This is used for adding urls to gb\n"
		       "               which has a limit to the file size.\n"
		       "\n"
		       "\n" );
		exit(0);
	}

	// init the hash table for hashing urls
	if (!hashinit()) {
		printf("Hash Init Failed!\n");
		goto errExit;
	}

	// init the hash table
	initHashTable();

	printf("\n");
	// . create a large buffer for reading chunks
	//   of the rdf files
	rdfBuffer = (char*)malloc(sizeof(char)*(RDFBUFFER_SIZE+1));
	if (!rdfBuffer) {
		printf("Out of memory!!\n");
		goto errExit;
	}

	// skip hierarchy stuff for url dump
	if ( mode == MODE_URLDUMP || mode == MODE_DIFFURLDUMP )
		goto contentParse;

	// create the cat array
	rdfCatsSize = CAT_BUFFER_SIZE;
	rdfCats = (RdfCat*)malloc(sizeof(RdfCat)*rdfCatsSize);
	if (!rdfCats) {
		printf("Out of memory!!\n");
		goto errExit;
	}

	// create the name buffer
	nameBufferSize = NAME_BUFFER_SIZE;
	nameBuffer = (char*)malloc(sizeof(char)*nameBufferSize);
	if (!nameBuffer) {
		printf("Out of memory!!\n");
		goto errExit;
	}

	dir = "";

 retry:

	// open the structure file
	if ( mode == MODE_NEW || mode == MODE_CATDUMP )
		sprintf(filename, "%s%s", dir, RDFSTRUCTURE_FILE);
	else
		sprintf(filename, "%s%s.new", dir, RDFSTRUCTURE_FILE);
	//rdfStream.open(filename, ifstream::in);
	rdfStream = open ( filename, O_RDONLY );
	// make sure it opened okay
	//if (!rdfStream.is_open()) {
	if ( rdfStream < 0 ) {
		// try the ./catdb/ subdir if not found
		if ( ! dir[0] ) {
			dir = "./catdb/";
			goto retry;
		}
		printf("Error Opening %s\n", filename);
		goto errExit;
	}
	printf("Opened Structure File: %s\n", filename);

	// take the first chunk
	//rdfStream.read(rdfBuffer, RDFBUFFER_SIZE);
	//n = rdfStream.gcount();
	n = read ( rdfStream, rdfBuffer, RDFBUFFER_SIZE );
	if ( n <= 0 || n > RDFBUFFER_SIZE ) {
		printf("Error Reading %s\n", filename);
		goto errExit;
	}
	rdfPtr = rdfBuffer;
	rdfEnd = &rdfBuffer[n];
	currOffset = 0;
	firstTime = true;

	// read and parse the file
	printf("Parsing Topics...\n");
	while (true) {
		// parse for <Topic...
		if (rdfParse("Topic") == -1)
			goto fileEnd;
		// the offset for this cat is 6 chars back
		uint32_t catOffset = currOffset - 6;
		// get the topic name, preserve it on the buffer
		int32_t nameOffset = nameBufferLen;
		// . the name this function inserts into "nameBuffer" does
		//   not seem to contain "Top/" at the beginning, even
		//   though it comes from structure.rdf.u8 where the "Top/"
		//   seems to be there.
		// . later on we hack the name buffer and nameOffset
		//   so it is just the last word in the directory to save
		//   mem. then we print out all the parent names to
		//   reconstruct.
		int32_t nameLen = fillNextString();
		if (nameLen == -1)
			goto fileEnd;
		if (nameLen == -2) {
			printf("Out of Memory!\n");
			goto errExit1;
		}
		// fix <Topic r:id=""> in the newer content.rdf.u8
		if ( nameLen == 0 ) {
			// only do this once!
			if ( ! firstTime ) {
				printf("Encountered zero length name\n");
				continue;
			}
			gbmemcpy(nameBuffer+nameOffset,"Top\0",4);
			nameLen = 3;
			firstTime = false;
		}
		// html decode it
		if (nameLen > MAX_HTTP_FILENAME_LEN)
			nameLen = MAX_HTTP_FILENAME_LEN;
		nameLen = htmlDecode ( htmlDecoded,
				       &nameBuffer[nameOffset],
				       nameLen ,
				       false,
				       0);

		// parse the catid
		int32_t catid = parseNextCatid();
		if (catid == -1)
			goto fileEnd;

		// crap, in the new dmoz structure.rdf.u8 catid 1 is an
		// empty name and catid 2 has Topic tag "Top/World" but
		// Title tag "Top".
		// it should probably be "Top" and not "World". there is
		// another entry, catid 3, in structure.rdf.u8 that has
		// <Topic r:id="Top/World"> and that one is the real one,
		// so catid 2 is just "Top". this is a bug in the dmoz
		// output i think, so fix it here.
		if ( catid == 2 ) {
			nameLen = 3;
			gbmemcpy(&nameBuffer[nameOffset],"Top",nameLen);
			nameBufferLen += nameLen;
		}
		else {
			gbmemcpy(&nameBuffer[nameOffset], htmlDecoded,
				 nameLen);
			nameBufferLen += nameLen;
		}
		// . fill the current cat
		// . make sure there's room
		if (numRdfCats >= rdfCatsSize) {
			rdfCatsSize += CAT_BUFFER_SIZE;
			rdfCats = (RdfCat*)realloc((void*)rdfCats,
						   sizeof(RdfCat)*rdfCatsSize);
			printf("rdfCats: %"INT32" bytes\n", rdfCatsSize);
			if (!rdfCats) {
				printf("Out of Memory\n");
				goto errExit1;
			}
		}
		// hash the name to the catid
		if (addCatHash ( nameOffset, nameLen, catid ) == -1) {
			printf("Out of Memory!\n");
			goto errExit1;
		}
		// debug
		//printf("gbcat=");
		//for ( int32_t i = 0 ; i < nameLen ; i++ )
		//	printf("%c",htmlDecoded[i]);
		//printf("\n");
		// fill it
		rdfCats[numRdfCats].m_catid      = catid;
		rdfCats[numRdfCats].m_parentid   = 0;
		//rdfCats[numRdfCats].m_numSymParents = 0;
		//rdfCats[numRdfCats].m_symParents = NULL;
		rdfCats[numRdfCats].m_nameLen    = nameLen;
		rdfCats[numRdfCats].m_nameOffset = nameOffset;
		rdfCats[numRdfCats].m_structureOffset = catOffset;
		rdfCats[numRdfCats].m_contentOffset   = 0;
		rdfCats[numRdfCats].m_catHash    = 0;
		rdfCats[numRdfCats].m_numUrls    = 0;
		numRdfCats++;
	}

 fileEnd:
	// sort the cats by catid so getIndexFromId can binary search
	gbsort(rdfCats, numRdfCats, sizeof(RdfCat), catcomp);

	// dump out categories for category dump
	if ( mode == MODE_CATDUMP ) {
		char catTemp[16384];
		for ( int32_t i = 0; i < numRdfCats; i++ ) {
			//for (int32_t n = rdfCats[i].m_nameOffset;
			//     n < rdfCats[i].m_nameOffset +
			//	   rdfCats[i].m_nameLen; n++)
			//	printf("%c", nameBuffer[n]);
			//printf("\n");
			int32_t encLen = urlEncode(catTemp, 16383,
					&nameBuffer[rdfCats[i].m_nameOffset],
					rdfCats[i].m_nameLen);
			catTemp[encLen] = '\0';
			printf("http://dir.gigablast.com%s\n", &catTemp[3]);
		}
		close(rdfStream);
		goto goodEnd;
	}

	// . now we need to reparse the whole file again and
	//   parse out the children of each topic, this includes:
	//     <narrow>     hard links
	//     <narrow1>    hard links
	//     <narrow2>    hard links
	//     <letterbar>  hard links
	//     <symbolic>   sym links
	//     <symbolic1>  sym links
	//     <symbolic2>  sym links
	//     </Topic>     ends the topic

	// reset to the beginning of the file
	//rdfStream.clear();
	//rdfStream.seekg(0, ios::beg);
	if ( lseek(rdfStream, 0, SEEK_SET) < 0 ) {
		printf ( "Error Resetting RDF File\n" );
		goto errExit1;
	}
	// reset the buffer to the first block
	//rdfStream.read(rdfBuffer, RDFBUFFER_SIZE);
	//n = rdfStream.gcount();
	n = read(rdfStream, rdfBuffer, RDFBUFFER_SIZE);
	if ( n <= 0 || n > RDFBUFFER_SIZE ) {
		printf("Error Reading %s\n", filename);
		goto errExit1;
	}
	rdfPtr = rdfBuffer;
	rdfEnd = &rdfBuffer[n];
	currOffset = 0;

	//
	// set m_parentid using structure.rdf.u8
	//

	// read and parse the file again
	printf("Building Hierarchy...\n");
	while (true) {
		// parse the next catid in the file, sequentially
		//if ( currOffset == 545468935 )
		//	printf("debug point\n");
		int32_t catid = parseNextCatid();
		if (catid == -1)
			goto fileEnd1;
	nextChildTag:
		// now go through the tags looking for what we want
		if (rdfNextTag() == -1)
			goto fileEnd1;
		// check it for one of the tags we're looking for
		int32_t parentType;
		if ( tagLen == 6 &&
		     strncmp ( tagBuffer, "/Topic", 6 ) == 0 )
			continue;
		else if ( tagLen == 6 &&
			  strncmp ( tagBuffer, "narrow", 6 ) == 0 )
			parentType = 1;
		else if ( tagLen == 7 &&
			  strncmp ( tagBuffer, "narrow1", 7 ) == 0 )
			parentType = 1;
		else if ( tagLen == 7 &&
			  strncmp ( tagBuffer, "narrow2", 7 ) == 0 )
			parentType = 1;
		else if ( tagLen == 9 &&
			  strncmp ( tagBuffer, "letterbar", 9 ) == 0 )
			parentType = 1;
		// else if ( tagLen == 8 &&
		//	  strncmp ( tagBuffer, "symbolic", 8 ) == 0 )
		//	parentType = 2;
		// else if ( tagLen == 9 &&
		//	  strncmp ( tagBuffer, "symbolic1", 9 ) == 0 )
		//	parentType = 2;
		// else if ( tagLen == 9 &&
		//	  strncmp ( tagBuffer, "symbolic2", 9 ) == 0 )
		//	parentType = 2;
		else
			goto nextChildTag;
		// will only reach here if we're at a child cat
		// get the name, use the end of nameBuffer
		char *childName = &nameBuffer[nameBufferLen];
		int32_t childNameLen = fillNextString();
		if (childNameLen == -1)
			goto fileEnd1;
		if (childNameLen == -2) {
			printf("Out of Memory!\n");
			goto errExit1;
		}
		// html decode it
		if (childNameLen > MAX_HTTP_FILENAME_LEN)
			childNameLen = MAX_HTTP_FILENAME_LEN;
		childNameLen = htmlDecode ( htmlDecoded,
					    childName,
					    childNameLen ,
					    false,
					    0);
		gbmemcpy(childName, htmlDecoded, childNameLen);

		// debug log
		//if ( currOffset >= 506362430 ) // 556362463
		//	printf("off=%"INT32"\n",currOffset);
		// debug point
		//if ( currOffset == 545467573 )
		//	printf("GOT DEBUG POINT before giant skip\n");

		// cut off the leading label if symbolic
		// if (parentType == 2) {
		//	while (*childName != ':') {
		//		childName++;
		//		childNameLen--;
		//	}
		//	childName++;
		//	childNameLen--;
		// }
		// debug point
		//if (strcmp(childName,"Top/World/Català/Arts") == 0 )
		//	printf("hey\n");
		// get the catid for the child
		int32_t childid = getCatHash(childName, childNameLen);
		// get the cat for this id
		int32_t cat = getIndexFromId(childid);
		// make sure we have a match
		if (cat == -1) {
			// debug. why does Top/World/Catala/Arts
			// not have a parent??
			printf("Warning: Child Topic Not Found: ");
			for (int32_t i = 0; i < childNameLen; i++)
				printf("%c", childName[i]);
			printf("\n");
			m++;
			goto nextChildTag;
		}
		// . assign the parent to the cat
		// . this means we are in a "child" tag within the "catid"
		if (parentType == 1) {
			if (rdfCats[cat].m_parentid != 0)
				printf("Warning: Overwriting Parent Id!\n");
			rdfCats[cat].m_parentid = catid;
			t++;
		}
		// assign symbolic parent to the cat
		// else if (parentType == 2) {
		//	// grow the buffer
		//	rdfCats[cat].m_numSymParents++;
		//	rdfCats[cat].m_symParents = (int32_t*)realloc(
		//		rdfCats[cat].m_symParents,
		//		sizeof(int32_t)*rdfCats[cat].m_numSymParents);
		//	if (!rdfCats[cat].m_symParents) {
		//		printf("Out of Memory!\n");
		//		goto errExit1;
		//	}
		//	// assign the sym parent
		//	rdfCats[cat].m_symParents[
		//		rdfCats[cat].m_numSymParents-1] = catid;
		//	// inc overall number of sym parents
		//	numSymParents++;
		// }
		// go to the next tag
		goto nextChildTag;
	}

 fileEnd1:
	printf("Completed Structure:\n");
	printf("  Total Topics: %"INT32"\n", numRdfCats);
	printf("  Topics with Parents: %"INT32"\n", t);
	printf("  Topics Linked but Nonexistent: %"INT32"\n", m);

	if ( t != numRdfCats ) {
		printf("\n"
		       " *Topics without parents are bad because they\n"
		       "  cannot have their entire rawPath printed out\n"
		       "  in order to get their proper hash\n");
	}

	//printf("  Number of Symbolic Links: %"INT32"\n", numSymParents);
	printf("\n");

	// clear the hash table
	clearHashTable();
	// close the structure file
	//rdfStream.clear();
	//rdfStream.close();
	close(rdfStream);

	printf("Truncating Category Names...\n");
	// . truncate the category names to the last directory
	// . also calculate the size of the truncated buffer
	for (int32_t i = 0; i < numRdfCats; i++) {
		// find the position of the last /
		newOffset = rdfCats[i].m_nameOffset +
			    rdfCats[i].m_nameLen - 1;
		while ( newOffset != rdfCats[i].m_nameOffset &&
			nameBuffer[newOffset-1] != '/' )
			newOffset--;
		// assign the new length and offset
		rdfCats[i].m_nameLen -= newOffset - rdfCats[i].m_nameOffset;
		rdfCats[i].m_nameOffset = newOffset;
		newNameBufferSize += rdfCats[i].m_nameLen;
	}

printf("Creating Category Hashes...\n");
|
|
// make the hashes
|
|
char rawPath[4096];
|
|
int32_t rawPathLen;
|
|
for (int32_t i = 0; i < numRdfCats; i++) {
|
|
// get the hash of the path
|
|
rawPathLen = printCatPath(rawPath, rdfCats[i].m_catid, true);
|
|
// crap, this rawpath contains "Top/" in the beginning
|
|
// but the rdfCats[i].m_nameOffset refers to a name
|
|
// that does not include "Top/"
|
|
rdfCats[i].m_catHash = hash32Lower_a(rawPath, rawPathLen, 0);
|
|
// fix. so that xyz/Arts does not just hash "Arts"
|
|
// because it has no parent...
|
|
if ( rdfCats[i].m_parentid == 0 ) {
|
|
printf("Missing parent for catid %"INT32". Will be "
|
|
"excluded from DMOZ so we avoid hash "
|
|
"collisions.\n",rdfCats[i].m_catid);
|
|
}
|
|
//
|
|
// DEBUG!
|
|
// print this shit out to find the collisions
|
|
//
|
|
continue;
|
|
printf("hash32=%"UINT32" catid=%"INT32" parentid=%"INT32" path=%s\n",
|
|
rdfCats[i].m_catHash,
|
|
rdfCats[i].m_catid,
|
|
rdfCats[i].m_parentid,
|
|
rawPath);
|
|
}
|
|
|
|
	// . now we want to serialize the needed data into
	//   one (or more?) file(s) to be quickly read by gb
	if ( mode == MODE_NEW )
		sprintf(filename, "%s%s", dir, STRUCTURE_OUTPUT_FILE);
	else
		sprintf(filename, "%s%s.new", dir, STRUCTURE_OUTPUT_FILE);
	//outStream.open(filename, ofstream::out|ofstream::trunc);
	outStream = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
			   S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
	// make sure it opened okay
	//if (!outStream.is_open()) {
	if ( outStream < 0 ) {
		printf("Error Opening %s\n", filename);
		goto errExit;
	}
	printf("\nOpened %s for writing.\n", filename);

	// write the size of the truncated name buffer
	//outStream.write((char*)&newNameBufferSize, sizeof(int32_t));
	if (write(outStream, &newNameBufferSize, sizeof(int32_t)) !=
	    sizeof(int32_t)) {
		printf("Error writing to %s\n", filename);
		goto errExit;
	}
	// write the number of cats
	//outStream.write((char*)&numRdfCats, sizeof(int32_t));
	if (write(outStream, &numRdfCats, sizeof(int32_t)) !=
	    sizeof(int32_t)) {
		printf("Error writing to %s\n", filename);
		goto errExit;
	}
	// write the number of symbolic parents
	//outStream.write((char*)&numSymParents, sizeof(int32_t));
	// write the truncated buffer and further reassign the offsets
	newOffset = 0;
	for (int32_t i = 0; i < numRdfCats; i++) {
		int32_t writeSize = rdfCats[i].m_nameLen;
		//outStream.write((char*)&nameBuffer[rdfCats[i].m_nameOffset],
		//		sizeof(char)*rdfCats[i].m_nameLen);
		if ( write ( outStream, &nameBuffer[rdfCats[i].m_nameOffset],
			     writeSize ) != writeSize ) {
			printf("Error writing to %s\n", filename);
			goto errExit;
		}
		rdfCats[i].m_nameOffset = newOffset;
		newOffset += rdfCats[i].m_nameLen;
	}

	// close the output file
	//outStream.clear();
	//outStream.close();
	close(outStream);
	printf("Completed Writing File.\n");

	// clear up the name buffer
	free(nameBuffer);
	nameBuffer = NULL;

 contentParse:
	// . now we need to parse up the content file,
	//   hash the urls with a gb hash, and store the
	//   catids associated with each
	t = 0;
	m = 0;

	// create the url buffer
	urlBufferSize = URL_BUFFER_SIZE;
	urlBuffer = (char*)malloc(sizeof(char)*urlBufferSize);
	if (!urlBuffer) {
		printf("Out of Memory!\n");
		goto errExit;
	}

	// create the url info buffer
	urlInfosSize = URLINFO_BUFFER_SIZE;
	urlInfos = (UrlInfo*)malloc(sizeof(UrlInfo)*urlInfosSize);
	if (!urlInfos) {
		printf("Out of Memory!\n");
		goto errExit;
	}

 again:
	// open the content file
	if ( mode == MODE_NEW || mode == MODE_URLDUMP )
		sprintf(filename, "%s%s", dir, RDFCONTENT_FILE);
	else
		sprintf(filename, "%s%s.new", dir, RDFCONTENT_FILE);
	//rdfStream.open(filename, ifstream::in);
	rdfStream = open ( filename, O_RDONLY );
	// make sure it opened okay
	//if (!rdfStream.is_open()) {
	if ( rdfStream < 0 ) {
		// try the ./catdb/ subdir if not found
		if ( ! dir[0] ) {
			dir = "./catdb/";
			goto again;
		}
		printf("Error Opening %s\n", filename);
		goto errExit;
	}
	printf("\nOpened Content File: %s\n", filename);

	// take the first chunk
	//rdfStream.read(rdfBuffer, RDFBUFFER_SIZE);
	//n = rdfStream.gcount();
	n = read ( rdfStream, rdfBuffer, RDFBUFFER_SIZE );
	if ( n <= 0 || n > RDFBUFFER_SIZE ) {
		printf("Error Reading %s\n", filename);
		goto errExit;
	}
	rdfPtr = rdfBuffer;
	rdfEnd = &rdfBuffer[n];
	currOffset = 0;

	// init hash tables for indexing urls
	initUrlHashTable();

	if ( mode == MODE_URLDUMP || mode == MODE_DIFFURLDUMP ) {
		// write another file for the urls
		if ( mode == MODE_URLDUMP ) {
			if (!splitUrls)
				sprintf(filename, "html/%s",
					URLTEXT_OUTPUT_FILE);
			else
				// put them directly into html/ now for
				// easy add url'ing
				sprintf(filename, "html/%s.0",
					URLTEXT_OUTPUT_FILE);
		}
		else {
			if (!splitUrls)
				sprintf(filename, "html/%s",
					DIFFURLTEXT_OUTPUT_FILE);
			else
				sprintf(filename, "html/%s.0",
					DIFFURLTEXT_OUTPUT_FILE);
		}
		//outStream2.open(filename, ofstream::out|ofstream::trunc);
		outStream2 = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
				    S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
		// make sure it opened okay
		//if (!outStream2.is_open()) {
		if ( outStream2 < 0 ) {
			printf("Error Opening %s\n", filename);
			goto errExit1;
		}
		printf("Opened %s for writing.\n", filename);

		writeMetaTags ( outStream2 );

		// if we're doing a diffurldump, load up the diff file first
		if ( mode == MODE_DIFFURLDUMP ) {
			char diffUrl[MAX_URL_LEN*2];
			int32_t numRemoveUrls = 0;
			// open the new diff file
			//ifstream diffInStream;
			int diffInStream;
			sprintf(filename, "gbdmoz.content.dat.new.diff");
			//diffInStream.open(filename, ifstream::in);
			diffInStream = open(filename, O_RDONLY);
			//if (!diffInStream.is_open()) {
			if ( diffInStream < 0 ) {
				printf("Error Opening %s\n", filename);
				goto errExit;
			}
			printf("Opened Diff File: %s\n", filename);

			// read in the number of urls to update/add
			//diffInStream.read((char*)&numUpdateIndexes,
			//		  sizeof(int32_t));
			if ( fileRead ( diffInStream,
					&numUpdateIndexes,
					sizeof(int32_t) ) !=
			     sizeof(int32_t) ) {
				printf("Error Reading %s\n", filename);
				goto errExit;
			}
			// read in the number of urls to remove
			//diffInStream.read((char*)&numRemoveUrls,
			//		  sizeof(int32_t));
			if ( fileRead ( diffInStream,
					&numRemoveUrls,
					sizeof(int32_t) ) !=
			     sizeof(int32_t) ) {
				printf("Error Reading %s\n", filename);
				goto errExit;
			}
			// create the buffer for the update/add indexes
			updateIndexes = (int32_t*)malloc(
					sizeof(int32_t)*numUpdateIndexes);
			if ( !updateIndexes ) {
				printf("Out of Memory!\n");
				//diffInStream.clear();
				//diffInStream.close();
				close(diffInStream);
				goto errExit;
			}
			// read in the update/add indexes
			//for ( int32_t i = 0; i < numUpdateIndexes &&
			//      diffInStream.good(); i++ ) {
			for ( int32_t i = 0; i < numUpdateIndexes; i++ ) {
				//diffInStream.read((char*)&updateIndexes[i],
				//		  sizeof(int32_t));
				int32_t n = fileRead ( diffInStream,
						       &updateIndexes[i],
						       sizeof(int32_t) );
				if ( n < 0 || n > (int32_t)sizeof(int32_t) ) {
					printf("Error Reading %s\n", filename);
					goto errExit;
				}
				if ( n == 0 )
					break;
			}
			// read in the urls to remove
			//for ( int32_t i = 0; i < numRemoveUrls &&
			//      diffInStream.good(); i++ ) {
			for ( int32_t i = 0; i < numRemoveUrls; i++ ) {
				int16_t urlLen;
				//diffInStream.read((char*)&urlLen,
				//		  sizeof(int16_t));
				if ( fileRead(diffInStream, &urlLen,
					      sizeof(int16_t)) !=
				     sizeof(int16_t) ) {
					printf("Error reading "
					       "diffInStream\n");
					goto errExit;
				}
				if ( urlLen <= 0 ) {
					printf("WARNING: Found %"INT32" "
					       "length url, exiting!\n",
					       (int32_t)urlLen);
					//diffInStream.clear();
					//diffInStream.close();
					close(diffInStream);
					goto errExit;
				}
				// read it in
				//diffInStream.read(diffUrl, urlLen);
				if ( fileRead(diffInStream, diffUrl,
					      urlLen) != urlLen ) {
					printf("Error reading "
					       "diffInStream\n");
					goto errExit;
				}
				// normalize it
				urlLen = fixUrl(diffUrl, urlLen);
				// write it out to the diffurl file
				//outStream2.write(diffUrl, urlLen);
				if ( write(outStream2, diffUrl, urlLen) !=
				     urlLen ) {
					printf("Error writing to "
					       "outStream2\n");
					goto errExit;
				}
				//outStream2.write("\n", 1);
				if ( write(outStream2, "\n", 1) != 1 ) {
					printf("Error writing to "
					       "outStream2\n");
					goto errExit;
				}
				urlTxtCount++;

				if ( splitUrls &&
				     urlTxtCount >= MAX_URLTXT_SIZE) {
					//outStream2.clear();
					//outStream2.close();
					close(outStream2);
					printf("Completed Writing File.\n");
					// write another file for the urls
					urlTxtFile++;
					sprintf(filename, "html/%s.%"INT32"",
						DIFFURLTEXT_OUTPUT_FILE,
						urlTxtFile);
					//outStream2.open(filename,
					//   ofstream::out|ofstream::trunc);
					outStream2 = open ( filename,
						O_CREAT|O_WRONLY|O_TRUNC,
						S_IRUSR|S_IWUSR|
						S_IRGRP|S_IWGRP );
					// make sure it opened okay
					//if (!outStream2.is_open()) {
					if ( outStream2 < 0 ) {
						printf("Error Opening %s\n",
						       filename);
						goto errExit1;
					}
					printf("Opened %s for writing.\n",
					       filename);
					urlTxtCount = 0;
				}

			}
			// close up the diff file
			//diffInStream.clear();
			//diffInStream.close();
			close(diffInStream);
			printf("Successfully Built Diff\n");
		}
	}
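	// Layout of gbdmoz.content.dat.new.diff as implied by the reads
	// above (a sketch, not an official spec):
	//   int32_t numUpdateIndexes                  // urls added/changed
	//   int32_t numRemoveUrls                     // urls removed
	//   int32_t updateIndexes[numUpdateIndexes]   // indexes into the
	//                                             // new url ordering
	//   then, repeated numRemoveUrls times:
	//     int16_t urlLen
	//     char    url[urlLen]                     // not null terminated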
	else {
		if ( mode == MODE_NEW )
			sprintf(filename, "%s%s", dir, CONTENT_OUTPUT_FILE);
		else
			sprintf(filename, "%s%s.new", dir,
				CONTENT_OUTPUT_FILE);
		// stream the urls into the content file
		//outStream.open(filename, ofstream::out|ofstream::trunc);
		outStream = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
				   S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
		// make sure it opened okay
		//if (!outStream.is_open()) {
		if ( outStream < 0 ) {
			printf("Error Opening %s\n", filename);
			goto errExit;
		}
		printf("Opened %s for writing.\n", filename);

		// store a space for the number of urls at the start
		// of the file
		//outStream.write((char*)&numUrlInfos, sizeof(int32_t));
		if ( write(outStream, &numUrlInfos, sizeof(int32_t)) !=
		     sizeof(int32_t) ) {
			printf("Error writing to %s\n", filename);
			goto errExit;
		}
	}
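	// Layout of CONTENT_OUTPUT_FILE as written in this pass (a
	// sketch): a leading int32_t url count (the placeholder written
	// above, to be filled in once the real count is known), then for
	// each unique url an int16_t length followed by the raw url bytes
	// with no null terminator.
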
	// read and parse the file again
	printf("Building Links...\n");
	while (true) {
		// parse for <Topic...
		if (rdfParse("Topic") == -1)
			goto fileEnd2;
		// the offset for this cat is 6 chars back
		uint32_t catOffset = currOffset - 6;
		// parse the next catid
		int32_t catid = parseNextCatid();
		if (catid == -1)
			goto fileEnd2;
		int32_t cat;
		// skip ahead for url dump
		if ( mode == MODE_URLDUMP || mode == MODE_DIFFURLDUMP )
			goto nextLink;
		// . set the content offset for this cat
		// . it was missing catid 425187... why? because it had
		//   a double quote in it like '4"'!! so i took out the
		//   inQuotes logic above.
		cat = getIndexFromId(catid);
		if (cat == -1) {
			totalNEC++;
			printf("Warning: Nonexistent Category, %"INT32", "
			       "found in Content\n", catid );
			continue;
		}
		rdfCats[cat].m_contentOffset = catOffset;
	nextLink:
		// get the next tag
		if (rdfNextTag() == -1)
			goto fileEnd2;
		// check it for one of the tags we're looking for
		if ( tagLen == 6 &&
		     strncmp ( tagBuffer, "/Topic", 6 ) == 0 )
			continue;
		else if ( tagLen == 4 &&
			  strncmp ( tagBuffer, "link", 4 ) == 0 )
			goto hashLink;
		else if ( tagLen == 5 &&
			  strncmp ( tagBuffer, "link1", 5 ) == 0 )
			goto hashLink;
		else if ( tagLen == 4 &&
			  strncmp ( tagBuffer, "atom", 4 ) == 0 )
			goto hashLink;
		else if ( tagLen == 3 &&
			  strncmp ( tagBuffer, "pdf", 3 ) == 0 )
			goto hashLink;
		else if ( tagLen == 4 &&
			  strncmp ( tagBuffer, "pdf1", 4 ) == 0 )
			goto hashLink;
		else if ( tagLen == 3 &&
			  strncmp ( tagBuffer, "rss", 3 ) == 0 )
			goto hashLink;
		else if ( tagLen == 4 &&
			  strncmp ( tagBuffer, "rss1", 4 ) == 0 )
			goto hashLink;
		else
			goto nextLink;
	hashLink:
		// . hash the link with the catid
		// . get the link url
		int32_t urlOffset = urlBufferLen;
		int16_t urlLen = fillNextUrl();
		if (urlLen == -1)
			goto fileEnd2;
		if (urlLen == -2) {
			printf("Out of Memory!\n");
			goto errExit1;
		}
		// html decode the url
		if (urlLen > MAX_URL_LEN)
			urlLen = MAX_URL_LEN;
		urlLen = htmlDecode(decodedUrl, &urlBuffer[urlOffset], urlLen,
				    false,0);
		// debug point
		//if ( strcmp(decodedUrl,"http://twitter.com/#!/ronpaul")==0)
		//	printf("hey\n");

		// ignore any url with # in it for now like
		// http://twitter.com/#!/ronpaul because it bastardizes
		// the meaning of the # (hashtag) and we need to protest that
		if ( strchr ( decodedUrl , '#' ) )
			goto nextLink;

		gbmemcpy(&urlBuffer[urlOffset], decodedUrl, urlLen);
		// fix up bad urls
		urlLen = fixUrl(&urlBuffer[urlOffset], urlLen);
		if (urlLen == 0)
			goto nextLink;
		// . normalize with Url
		// . watch out for
		//   http://twitter.com/#!/ronpaul to http://www.twitter.com/
		//   so do not strip # hashtags
		normUrl.set(&urlBuffer[urlOffset],
			    urlLen,
			    true,   // add www?
			    false,  // strip session id?
			    false,  // strip pound?
			    true);  // strip common file? (i.e. index.htm)
		// debug print
		//printf("gburl %s -> %s\n",decodedUrl,normUrl.getUrl());
		// put it back
		urlLen = normUrl.getUrlLen();
		if (urlBufferLen+urlLen+10 >= urlBufferSize) {
			urlBufferSize += URL_BUFFER_SIZE;
			urlBuffer = (char*)realloc((void*)urlBuffer,
						sizeof(char)*urlBufferSize);
			printf("urlBuffer: %"INT32" bytes\n", urlBufferSize);
			if (!urlBuffer)
				goto errExit1;
		}
		gbmemcpy(&urlBuffer[urlOffset], normUrl.getUrl(), urlLen);
		// run it through the fixer once more
		urlLen = fixUrl(&urlBuffer[urlOffset], urlLen);
		if (urlLen == 0)
			goto nextLink;
		// check the url to make sure it is all valid characters
		if (!isGoodUrl(&urlBuffer[urlOffset], urlLen))
			goto nextLink;
		// if good, add it to the buffer and add the cat
		//urlBufferLen += urlLen;
		// get the hash value
		uint64_t urlHash =
			hash64Lower_a(&urlBuffer[urlOffset], urlLen, 0);
		//uint32_t urlHash2 =
		//	hash32Lower(&urlBuffer[urlOffset], urlLen, 0);
		// see if it's already indexed
		//int32_t urlIndex = getUrlHash(urlHash, urlOffset, urlLen);
		//int32_t urlIndex = getUrlHash(urlHash, urlHash2);
		int32_t urlIndex = getUrlHash(urlHash);
		if (urlIndex == -1) {
			if ( mode == MODE_URLDUMP ||
			     mode == MODE_DIFFURLDUMP ) {
				//outStream2.write((char*)&urlLen,
				//		 sizeof(int16_t));
				if ( mode != MODE_DIFFURLDUMP ||
				     ( currDiffIndex < numUpdateIndexes &&
				       currUrl ==
				       updateIndexes[currDiffIndex] ) ) {
					//outStream2.write(
					//	&urlBuffer[urlOffset],
					//	urlLen);
					// print it in an anchor tag
					// now so gigablast can spider
					// these links
					write ( outStream2,"<a href=\"",9);
					if ( write ( outStream2,
						     &urlBuffer[urlOffset],
						     urlLen ) != urlLen ) {
						printf("Error writing to "
						       "outStream2\n");
						goto errExit1;
					}
					write ( outStream2,"\"></a>",6);
					//outStream2.write("\n", 1);
					if (write(outStream2, "\n", 1) != 1) {
						printf("Error writing to "
						       "outStream2\n");
						goto errExit1;
					}
					urlTxtCount++;
					currDiffIndex++;
				}
				currUrl++;

				if ( splitUrls &&
				     urlTxtCount >= MAX_URLTXT_SIZE) {
					//outStream2.clear();
					//outStream2.close();
					close(outStream2);
					printf("Completed Writing File.\n");
					// write another file for the urls
					urlTxtFile++;
					if ( mode == MODE_URLDUMP )
						sprintf(filename,
							"html/%s.%"INT32"",
							URLTEXT_OUTPUT_FILE,
							urlTxtFile);
					else
						sprintf(filename,
							"html/%s.%"INT32"",
						      DIFFURLTEXT_OUTPUT_FILE,
							urlTxtFile);
					//outStream2.open(filename,
					//   ofstream::out|ofstream::trunc);
					outStream2 = open ( filename,
						O_CREAT|O_WRONLY|O_TRUNC,
						S_IRUSR|S_IWUSR|
						S_IRGRP|S_IWGRP );
					// make sure it opened okay
					//if (!outStream2.is_open()) {
					if ( outStream2 < 0 ) {
						printf("Error Opening %s\n",
						       filename);
						goto errExit1;
					}
					printf("Opened %s for writing.\n",
					       filename);
					writeMetaTags ( outStream2 );
					urlTxtCount = 0;
				}
			}
			else {
				// write the url to the content file
				//outStream.write((char*)&urlLen,
				//		sizeof(int16_t));
				if ( write(outStream, &urlLen,
					   sizeof(int16_t)) !=
				     sizeof(int16_t) ) {
					printf("Error writing to "
					       "outStream\n");
					goto errExit1;
				}
				//outStream.write(&urlBuffer[urlOffset],
				//		urlLen);
				if ( write ( outStream,
					     &urlBuffer[urlOffset],
					     urlLen ) != urlLen ) {
					printf("Error writing to "
					       "outStream\n");
					goto errExit1;
				}
			}
			// add the url info to the buffer
			if (numUrlInfos >= urlInfosSize) {
				urlInfosSize += URLINFO_BUFFER_SIZE;
				urlInfos = (UrlInfo*)realloc((void*)urlInfos,
						sizeof(UrlInfo)*urlInfosSize);
				printf("urlInfos: %"INT32" bytes\n",
				       (int32_t)(urlInfosSize*
						 sizeof(UrlInfo)));
				if (!urlInfos) {
					printf("Out of Memory!\n");
					goto errExit1;
				}
			}
			// fill the url info
			//urlInfos[numUrlInfos].m_hash = urlHash;
			//urlInfos[numUrlInfos].m_urlLen = urlLen;
			//urlInfos[numUrlInfos].m_urlOffset = urlOffset;
			urlInfos[numUrlInfos].m_numCatids = 1;
			urlInfos[numUrlInfos].m_catids =
				(int32_t*)malloc(sizeof(int32_t));
			if (!urlInfos[numUrlInfos].m_catids) {
				printf("Out of memory!\n");
				goto errExit1;
			}
			urlInfos[numUrlInfos].m_catids[0] = catid;
			// set changed to true so new urls get in the diff
			urlInfos[numUrlInfos].m_changed = 1;
			// add it to the hash
			//if (addUrlHash(urlHash, numUrlInfos,
			//	       urlOffset, urlLen) == -1) {
			//if (addUrlHash ( urlHash,
			//		 urlHash2,
			//		 numUrlInfos) == -1) {
			if (addUrlHash(urlHash, numUrlInfos) == -1) {
				printf("Out of Memory!\n");
				goto errExit1;
			}
			// next url info
			numUrlInfos++;
		}
		else {
			// make sure we aren't duping the catid
			for (int32_t i = 0;
			     i < urlInfos[urlIndex].m_numCatids; i++)
				if (urlInfos[urlIndex].m_catids[i] == catid)
					goto nextLink;
			// add the catid
			int32_t numCatids = urlInfos[urlIndex].m_numCatids;
			//if (numCatids < MAX_URL_CATIDS) {
			urlInfos[urlIndex].m_catids = (int32_t*)realloc(
				urlInfos[urlIndex].m_catids,
				sizeof(int32_t) *
				(urlInfos[urlIndex].m_numCatids+1));
			if (!urlInfos[urlIndex].m_catids) {
				printf("Out of Memory!\n");
				goto errExit1;
			}
			urlInfos[urlIndex].m_catids[numCatids] = catid;
			urlInfos[urlIndex].m_numCatids++;

			// track the most-duplicated link for the stats below
			if (urlInfos[urlIndex].m_numCatids > t) {
				t  = urlInfos[urlIndex].m_numCatids;
				ti = urlIndex;
			}
			//}
			m++;
		}
		// skip the url-count increment for url dump
		if ( mode == MODE_URLDUMP || mode == MODE_DIFFURLDUMP )
			goto nextLink;

		// increment the url count for this cat and its parents
		int32_t currIndex = getIndexFromId(catid);
		while (currIndex >= 0) {
			rdfCats[currIndex].m_numUrls++;
			// the new dmoz files have catids whose parents
			// are the same cat id! so stop infinite loops
			if ( rdfCats[currIndex].m_parentid ==
			     rdfCats[currIndex].m_catid )
				break;
			// otherwise, make "currIndex" point to the parent
			currIndex = getIndexFromId(
				rdfCats[currIndex].m_parentid );
			// in the newer dmoz files 0 is a bad catid i guess,
			// not -1 any more?
		}

		goto nextLink;
	}

 fileEnd2:
	// close the output file
	if ( mode == MODE_URLDUMP || mode == MODE_DIFFURLDUMP ) {
		//outStream2.clear();
		//outStream2.close();
		close(outStream2);
		printf("Completed Writing File.\n");
	}
	else {
		//outStream.clear();
		//outStream.close();
		close(outStream);
		printf("Completed Writing File.\n");
	}

	printf("Completed Content:\n");
	printf("  Total Links: %"INT32"\n", numUrlInfos);
	printf("  Duplicated Links: %"INT32"\n", m);
	printf("  Max Link Duplicated: %"INT32"\n", t);
	printf("  Nonexistent Categories: %"INT32"\n", totalNEC );
	//printf("  ");
	//for (int32_t i = 0; i < urlInfos[ti].m_urlLen; i++)
	//	printf("%c", urlBuffer[urlInfos[ti].m_urlOffset + i]);
	printf("\n");
	printf("\n");

	// close the content file
	//rdfStream.clear();
	//rdfStream.close();
	close(rdfStream);

	// if we're updating, load up the old content here
	if ( mode == MODE_UPDATE ) {
	//if ( false ) {
		// fill the buffers
		int32_t currUrl = 0;
		int32_t urlp = 0;
		int32_t catidp = 0;
		bool oldErr = false;
		int32_t oldNumUrls;
		char *oldUrls = NULL;
		int32_t oldUrlsBufferSize = OLDURL_BUFFER_SIZE;
		// NULL-init these so the frees at oldErrExit are safe
		// even when an earlier malloc fails
		uint64_t *oldUrlHashes = NULL;
		char *removeOldUrl = NULL;
		//char oldUrl[MAX_URL_LEN*2];
		int32_t *oldCatids = NULL;
		int32_t oldCatidsBufferSize = OLDCATID_BUFFER_SIZE;
		unsigned char *oldNumCatids = NULL;
		int32_t numUpdateUrls = numUrlInfos;
		int32_t numRemoveUrls = 0;
		int32_t numChangedUrls = 0;
		int32_t updateIndexesWritten = 0;
		int32_t numIdsToUpdate = 0;
		// load the content and url files
		// url info (content) file
		sprintf(filename, "%s%s", dir,CONTENT_OUTPUT_FILE);
		//rdfStream.open(filename, ifstream::in);
		rdfStream = open ( filename, O_RDONLY );
		//if (!rdfStream.is_open()) {
		if ( rdfStream < 0 ) {
			printf("Error Opening %s\n", filename);
			goto oldErrExit;
		}
		// read in the number of urls
		//rdfStream.read((char*)&oldNumUrls, sizeof(int32_t));
		if (fileRead(rdfStream, &oldNumUrls, sizeof(int32_t)) !=
		    sizeof(int32_t)) {
			printf("Error Reading %s\n", filename);
			goto oldErrExit;
		}

		// create the buffers for the urls and catids
		oldUrls = (char*)malloc(oldUrlsBufferSize);
		if (!oldUrls) {
			printf("Out of Memory!\n");
			goto oldErrExit;
		}
		oldUrlHashes = (uint64_t*)malloc (
				sizeof(uint64_t)*oldNumUrls );
		if (!oldUrlHashes) {
			printf("Out of Memory!\n");
			goto oldErrExit;
		}
		removeOldUrl = (char*)malloc(oldNumUrls);
		if (!removeOldUrl) {
			printf("Out of Memory!\n");
			goto oldErrExit;
		}
		oldCatids = (int32_t*)malloc(sizeof(int32_t)*oldCatidsBufferSize);
		if (!oldCatids) {
			printf("Out of Memory!\n");
			goto oldErrExit;
		}
		oldNumCatids = (unsigned char*)malloc(oldNumUrls);
		if (!oldNumCatids) {
			printf("Out of Memory!\n");
			goto oldErrExit;
		}
printf("Loading Old Content Data...\n");
|
|
//while ( rdfStream.good() && currUrl < oldNumUrls ) {
|
|
while ( currUrl < oldNumUrls ) {
|
|
// read the next url
|
|
int16_t urlLen = 0;
|
|
//rdfStream.read((char*)&urlLen, sizeof(int16_t));
|
|
int32_t n = fileRead(rdfStream, &urlLen, sizeof(int16_t));
|
|
if ( n < 0 || n > (int32_t)sizeof(int16_t) ) {
|
|
printf("Error Reading %s\n",filename);
|
|
//CONTENT_OUTPUT_FILE);
|
|
goto oldErrExit;
|
|
}
|
|
if ( n == 0 )
|
|
break;
|
|
// make sure there's room in the buffer
|
|
if (urlp + urlLen + 4 >= oldUrlsBufferSize) {
|
|
char *re_urls = (char*)realloc(
|
|
oldUrls,
|
|
oldUrlsBufferSize +
|
|
OLDURL_BUFFER_SIZE );
|
|
if (!re_urls) {
|
|
printf("Out of Memory!\n");
|
|
goto oldErrExit;
|
|
}
|
|
oldUrls = re_urls;
|
|
oldUrlsBufferSize += OLDURL_BUFFER_SIZE;
|
|
}
|
|
// insert a space between urls
|
|
//oldUrls[urlp] = '\n';
|
|
//urlp++;
|
|
//char *url = &m_urls[urlp];
|
|
//rdfStream.read(&oldUrls[urlp], urlLen);
|
|
if (urlLen <= 0) {
|
|
printf("WARNING: FOUND %"INT32" LENGTH URL, "
|
|
"WILL BE SKIPPED (1)\n",
|
|
(int32_t)urlLen );
|
|
}
|
|
n = fileRead(rdfStream, &oldUrls[urlp], urlLen);
|
|
if ( n < 0 || n > urlLen ) {
|
|
printf("Error Reading %s\n",filename);
|
|
//CONTENT_OUTPUT_FILE);
|
|
goto oldErrExit;
|
|
}
|
|
if ( n == 0 )
|
|
break;
|
|
//rdfStream.read(oldUrl, urlLen);
|
|
// normalize it
|
|
urlLen = fixUrl(&oldUrls[urlp], urlLen);
|
|
// make the hash
|
|
oldUrlHashes[currUrl] =
|
|
hash64Lower_a(&oldUrls[urlp], urlLen, 0);
|
|
removeOldUrl[currUrl] = 0;
|
|
// increment the buffer pointer
|
|
if (urlLen <= 0) {
|
|
printf("WARNING: FOUND %"INT32" LENGTH URL, "
|
|
"WILL BE SKIPPED (2)\n",
|
|
(int32_t)urlLen );
|
|
}
|
|
urlp += urlLen;
|
|
//urlLen = fixUrl(oldUrl, urlLen);
|
|
// null terminate
|
|
oldUrls[urlp] = '\0';
|
|
urlp++;
|
|
currUrl++;
|
|
}
|
|
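		// Old and new urls are matched purely by their 64-bit
		// hash64Lower_a key over the normalized (fixUrl'd) url,
		// the same key addUrlHash()/getUrlHash() use for the new
		// set, so neither side needs to keep both url strings
		// around. A 64-bit collision would silently conflate two
		// urls, but at DMOZ scale (a few million urls) that is
		// vanishingly unlikely.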
		currUrl = 0;
		//while ( rdfStream.good() && currUrl < oldNumUrls ) {
		while ( currUrl < oldNumUrls ) {
			// get the number of catids
			oldNumCatids[currUrl] = 0;
			//rdfStream.read((char*)&oldNumCatids[currUrl], 1);
			int32_t n = fileRead(rdfStream,
					     &oldNumCatids[currUrl], 1);
			if ( n < 0 || n > 1 ) {
				printf("Error Reading %s\n",filename);
				//CONTENT_OUTPUT_FILE);
				goto oldErrExit;
			}
			if ( n == 0 )
				break;
			// make sure there's room
			if ( catidp + oldNumCatids[currUrl] + 1 >=
			     oldCatidsBufferSize ) {
				int32_t *re_catids = (int32_t*)realloc(
					oldCatids,
					sizeof(int32_t)*(oldCatidsBufferSize+
							 OLDCATID_BUFFER_SIZE) );
				if (!re_catids) {
					printf("Out of Memory!\n");
					goto oldErrExit;
				}
				oldCatids = re_catids;
				oldCatidsBufferSize += OLDCATID_BUFFER_SIZE;
			}
			//rdfStream.read((char*)&oldCatids[catidp],
			//	sizeof(int32_t)*oldNumCatids[currUrl]);
			int32_t readSize = sizeof(int32_t)*oldNumCatids[currUrl];
			n = fileRead(rdfStream, &oldCatids[catidp], readSize);
			// a short read here means a truncated record, which
			// is an error; n == 0 is fine only when this url
			// has no catids (readSize == 0)
			if ( n != readSize ) {
				printf("Error Reading %s\n",filename);
				//CONTENT_OUTPUT_FILE);
				goto oldErrExit;
			}
			// next url
			catidp += oldNumCatids[currUrl];
			currUrl++;
		}
		// now check the old urls against the new for changes
		catidp = 0;
		for ( int32_t i = 0; i < oldNumUrls; i++ ) {
			// this old url's catid count
			int32_t n = oldNumCatids[i];
			// skip bad urls
			if ( oldUrlHashes[i] == 0 ) {
				printf("WARNING: FOUND 0 LENGTH URL, "
				       "SKIPPING\n" );
				catidp += n;
				continue;
			}
			// check the new url hash for the old url
			int32_t urlIndex = getUrlHash(oldUrlHashes[i]);
			// check for a removed url
			if ( urlIndex == -1 ) {
				removeOldUrl[i] = 1;
				numRemoveUrls++;
				catidp += n;
				continue;
			}
			// check if we have the same number of catids
			if ( urlInfos[urlIndex].m_numCatids != n )
				goto oldIsDifferent;
			// check if all the catids match
			for ( int32_t co = 0; co < n; co++ ) {
				bool catMatch = false;
				for ( int32_t cn = 0; cn < n; cn++ ) {
					if ( urlInfos[urlIndex].m_catids[cn] ==
					     oldCatids[catidp + co] ) {
						catMatch = true;
						break;
					}
				}
				if ( !catMatch )
					goto oldIsDifferent;
			}
			// exact match, mark it unchanged and go to the next
			catidp += n;
			urlInfos[urlIndex].m_changed = 0;
			numUpdateUrls--;
			continue;
		oldIsDifferent:
			// just go on, this is already marked as changed
			catidp += n;
			numChangedUrls++;
			continue;
		}
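		// At this point every url is in one of four buckets:
		//   - in both sets, identical catids: m_changed = 0 and
		//     dropped from numUpdateUrls (nothing to write)
		//   - in both sets, different catids: m_changed stays 1
		//     (counted in numChangedUrls, gets updated)
		//   - only in the new set: m_changed stayed 1 from
		//     parsing (an add)
		//   - only in the old set: removeOldUrl[i] = 1 (a delete)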
printf(" Urls to Update: %"INT32"\n", numChangedUrls);
|
|
printf(" Urls to Add: %"INT32"\n",
|
|
numUpdateUrls - numChangedUrls);
|
|
printf(" Urls to Remove: %"INT32"\n", numRemoveUrls);
|
|
|
|
		//
		// . write out the diff file, contains new and changed
		//   urls and also urls to remove
		//
		// open the new diff file for writing
		sprintf(filename, "%s%s.new.diff", dir,CONTENT_OUTPUT_FILE);
		//outStream.open(filename, ofstream::out|ofstream::trunc);
		outStream = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
				   S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
		// make sure it opened okay
		//if (!outStream.is_open()) {
		if ( outStream < 0 ) {
			printf("Error Opening %s\n", filename);
			goto oldErrExit;
		}
		printf("\nOpened %s for writing.\n", filename);

		// write out the number of urls to update/add
		//outStream.write(&numUpdateUrls, sizeof(int32_t));
		if ( write(outStream, &numUpdateUrls, sizeof(int32_t)) !=
		     sizeof(int32_t) ) {
			printf("Error writing to %s\n", filename);
			goto oldErrExit;
		}
		// write out the number of urls to delete
		//outStream.write(&numRemoveUrls, sizeof(int32_t));
		if ( write(outStream, &numRemoveUrls, sizeof(int32_t)) !=
		     sizeof(int32_t) ) {
			printf("Error writing to %s\n", filename);
			goto oldErrExit;
		}
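		// Layout of the .new.diff file written below (inferred
		// from these writes, not an official spec):
		//   int32_t numUpdateUrls
		//   int32_t numRemoveUrls
		//   int32_t index[numUpdateUrls]  (indexes into the new
		//                                  gbdmoz.content.dat)
		//   numRemoveUrls records of: int16_t urlLen;
		//                             char url[urlLen];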
		// write out the urls to update/add
		for ( int32_t i = 0; i < numUrlInfos; i++ ) {
			if ( urlInfos[i].m_changed == 0 )
				continue;
			// write the changed url's index
			//outStream.write((char*)&urlInfos[i].m_urlLen,
			//	sizeof(int16_t));
			//outStream.write(&urlBuffer[urlInfos[i].m_urlOffset],
			//	sizeof(char)*urlInfos[i].m_urlLen);
			//outStream.write((char*)&urlInfos[i].m_numCatids,
			//	sizeof(char));
			//outStream.write((char*)urlInfos[i].m_catids,
			//	sizeof(int32_t)*urlInfos[i].m_numCatids);
			//outStream.write((char*)&i, sizeof(int32_t));
			if ( write(outStream, &i, sizeof(int32_t)) !=
			     sizeof(int32_t) ) {
				printf("Error writing to outStream\n");
				goto oldErrExit;
			}
			updateIndexesWritten++;
			numIdsToUpdate += urlInfos[i].m_numCatids;
		}
		printf ( "Wrote %"INT32" urls and %"INT32" catids to "
			 "update/add.\n",
			 updateIndexesWritten, numIdsToUpdate );
		if ( updateIndexesWritten != numUpdateUrls )
			printf ( "WARNING: Wrote %"INT32" Update Indexes, "
				 "Should be %"INT32"!\n",
				 updateIndexesWritten, numUpdateUrls );
		// write out the urls to delete
		urlp = 0;
		for ( int32_t i = 0; i < oldNumUrls; i++ ) {
			int16_t oldUrlLen = gbstrlen(&oldUrls[urlp]);
			if ( removeOldUrl[i] == 0 ) {
				urlp += oldUrlLen + 1;
				continue;
			}
			// write the url to remove
			if ( oldUrlLen <= 0 )
				printf("WARNING: ATTEMPTING TO WRITE %"INT32" "
				       "LENGTH URL.\n", (int32_t)oldUrlLen );
			//outStream.write((char*)&oldUrlLen, sizeof(int16_t));
			if ( write(outStream, &oldUrlLen, sizeof(int16_t)) !=
			     sizeof(int16_t) ) {
				printf("Error writing to outStream\n");
				goto oldErrExit;
			}
			//outStream.write((char*)&oldUrls[urlp], oldUrlLen);
			if ( write(outStream, &oldUrls[urlp], oldUrlLen) !=
			     oldUrlLen ) {
				printf("Error writing to outStream\n");
				goto oldErrExit;
			}
			urlp += oldUrlLen + 1;
		}
		// close the file
		//outStream.clear();
		//outStream.close();
		close(outStream);
		printf("Completed Writing File.\n");
		printf("\n");

		// no error
		oldErr = false;
		goto oldGoodExit;
	oldErrExit:
		// set error
		oldErr = true;
	oldGoodExit:
		// close the file (it may never have opened successfully)
		//rdfStream.clear();
		//rdfStream.close();
		if ( rdfStream >= 0 )
			close(rdfStream);
		// free the buffers
		if (oldUrls) free(oldUrls);
		if (oldUrlHashes) free(oldUrlHashes);
		if (removeOldUrl) free(removeOldUrl);
		if (oldCatids) free(oldCatids);
		if (oldNumCatids) free(oldNumCatids);

		if (oldErr) goto errExit;
	}
printf("Clearing Url Hash Table...\n");
|
|
// clear the url index hash
|
|
clearUrlHashTable();
|
|
|
|
// finish up if we're just dumping urls
|
|
if ( mode == MODE_URLDUMP || mode == MODE_DIFFURLDUMP )
|
|
goto goodEnd;
|
|
|
|
	// . now we want to serialize the needed data into
	//   one (or more?) file(s) to be quickly read by gb
	if ( mode == MODE_NEW )
		sprintf(filename, "%s%s", dir,STRUCTURE_OUTPUT_FILE);
	else
		sprintf(filename, "%s%s.new", dir,STRUCTURE_OUTPUT_FILE);
	//outStream.open(filename, ofstream::out|ofstream::ate);
	outStream = open ( filename, O_WRONLY|O_APPEND,
			   S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
	// make sure it opened okay
	//if (!outStream.is_open()) {
	if ( outStream < 0 ) {
		printf("Error Opening %s\n", filename);
		goto errExit;
	}
	printf("\nOpened %s for writing.\n", filename);
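	// The structure file is opened O_APPEND with no O_CREAT: it is
	// assumed to already exist from the earlier pass over
	// structure.rdf.u8, and the fixed-width category records below
	// get appended to it. Each record is written field by field
	// rather than as a raw RdfCat struct so the on-disk format does
	// not depend on the compiler's struct padding. Inferred layout
	// per category:
	//   int32_t catid, int32_t parentid, int32_t nameOffset,
	//   int16_t nameLen, int32_t structureOffset,
	//   int32_t contentOffset, int32_t numUrls
	// followed by one int32_t catHash per category.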
	// write the cats, one field at a time
	//outStream.write((char*)rdfCats, sizeof(RdfCat)*numRdfCats);
	for (int32_t i = 0; i < numRdfCats; i++) {
		//outStream.write((char*)&rdfCats[i].m_catid, sizeof(int32_t));
		if ( write(outStream, &rdfCats[i].m_catid, sizeof(int32_t)) !=
		     sizeof(int32_t) ) {
			printf("Error writing cats to outStream.\n");
			goto errExit;
		}
		//outStream.write((char*)&rdfCats[i].m_parentid, sizeof(int32_t));
		if ( write(outStream, &rdfCats[i].m_parentid, sizeof(int32_t)) !=
		     sizeof(int32_t) ) {
			printf("Error writing cats to outStream.\n");
			goto errExit;
		}
		//outStream.write((char*)&rdfCats[i].m_numSymParents, sizeof(int16_t));
		//outStream.write((char*)&rdfCats[i].m_nameOffset, sizeof(int32_t));
		if ( write(outStream, &rdfCats[i].m_nameOffset, sizeof(int32_t)) !=
		     sizeof(int32_t) ) {
			printf("Error writing cats to outStream.\n");
			goto errExit;
		}
		//outStream.write((char*)&rdfCats[i].m_nameLen, sizeof(int16_t));
		if ( write(outStream, &rdfCats[i].m_nameLen, sizeof(int16_t)) !=
		     sizeof(int16_t) ) {
			printf("Error writing cats to outStream.\n");
			goto errExit;
		}
		//outStream.write((char*)&rdfCats[i].m_structureOffset, sizeof(int32_t));
		if ( write(outStream, &rdfCats[i].m_structureOffset,
			   sizeof(int32_t)) != sizeof(int32_t) ) {
			printf("Error writing cats to outStream.\n");
			goto errExit;
		}
		//outStream.write((char*)&rdfCats[i].m_contentOffset, sizeof(int32_t));
		if ( write(outStream, &rdfCats[i].m_contentOffset,
			   sizeof(int32_t)) != sizeof(int32_t) ) {
			printf("Error writing cats to outStream.\n");
			goto errExit;
		}
		//outStream.write((char*)&rdfCats[i].m_numUrls, sizeof(int32_t));
		if ( write(outStream, &rdfCats[i].m_numUrls, sizeof(int32_t)) !=
		     sizeof(int32_t) ) {
			printf("Error writing cats to outStream.\n");
			goto errExit;
		}
	}
	// write the symbolic parents
	//for (int32_t i = 0; i < numRdfCats; i++)
	//	for (int32_t s = 0; s < rdfCats[i].m_numSymParents; s++)
	//		outStream.write((char*)&rdfCats[i].m_symParents[s],
	//			sizeof(int32_t));
	// write the cat hashes
	for (int32_t i = 0; i < numRdfCats; i++) {
		//outStream.write((char*)&rdfCats[i].m_catHash, sizeof(int32_t));
		if ( write(outStream, &rdfCats[i].m_catHash, sizeof(int32_t)) !=
		     sizeof(int32_t) ) {
			printf("Error writing cats to outStream.\n");
			goto errExit;
		}
	}
	// close the output file
	//outStream.clear();
	//outStream.close();
	close(outStream);
	printf("Completed Writing File.\n");
	// write another file for the urls
	if ( mode == MODE_NEW )
		sprintf(filename, "%s%s", dir,CONTENT_OUTPUT_FILE);
	else
		sprintf(filename, "%s%s.new", dir,CONTENT_OUTPUT_FILE);
	//outStream.open(filename, ofstream::out|ofstream::ate);
	outStream = open ( filename, O_WRONLY,
			   S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
	//outStream.open(filename, ofstream::out|ofstream::trunc);
	//endpos = outStream.tellp();
	// make sure it opened okay
	//if (!outStream.is_open()) {
	if ( outStream < 0 ) {
		printf("Error Opening %s\n", filename);
		goto errExit;
	}
	printf("\nOpened %s for writing.\n", filename);
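	// The content file already holds the url count placeholder and
	// the url text records written during parsing above (presumably:
	// it is opened here without O_CREAT or O_TRUNC). We seek to
	// offset 0 to patch in the final url count, then seek to the end
	// and append the per-url catid records, completing the layout
	// sketched before the MODE_UPDATE block.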
	//outStream.seekp(0);
	lseek(outStream, 0, SEEK_SET);
	// write the number of urls at the start of the file
	//outStream.write((char*)&numUrlInfos, sizeof(int32_t));
	if ( write(outStream, &numUrlInfos, sizeof(int32_t)) != sizeof(int32_t) ) {
		printf("Error writing to outStream\n");
		goto errExit;
	}
	// seek to the end
	//outStream.seekp(endpos);
	lseek(outStream, 0, SEEK_END);
	// write the catids for each url
	for (int32_t i = 0; i < numUrlInfos; i++) {
		//outStream.write((char*)&urlInfos[i].m_hash, sizeof(int64_t));
		//outStream.write((char*)&urlInfos[i].m_urlLen, sizeof(int16_t));
		//outStream.write(&urlBuffer[urlInfos[i].m_urlOffset],
		//	sizeof(char)*urlInfos[i].m_urlLen);
		//outStream.write((char*)&urlInfos[i].m_numCatids, sizeof(char));
		if ( write(outStream, &urlInfos[i].m_numCatids, sizeof(char)) !=
		     sizeof(char) ) {
			printf("Error writing to outStream\n");
			goto errExit;
		}
		//outStream.write((char*)urlInfos[i].m_catids, sizeof(int32_t)*
		//	urlInfos[i].m_numCatids);
		int32_t writeSize = sizeof(int32_t)*urlInfos[i].m_numCatids;
		if ( write(outStream, urlInfos[i].m_catids, writeSize) !=
		     writeSize ) {
			printf("Error writing to outStream\n");
			goto errExit;
		}
	}
	// close the output file
	//outStream.clear();
	//outStream.close();
	close(outStream);

	printf("Completed Writing File.\n\n");
goodEnd:
	// free up the buffers
	if (urlBuffer)
		free(urlBuffer);
	if (urlInfos) {
		for (int32_t i = 0; i < numUrlInfos; i++) {
			if (urlInfos[i].m_catids)
				free(urlInfos[i].m_catids);
		}
		free(urlInfos);
	}
	//free(nameBuffer);
	if (rdfCats)
		free(rdfCats);
	if (rdfBuffer)
		free(rdfBuffer);
	// success
	return 0;
	// error exit points
errExit1:
	clearUrlHashTable();
	clearHashTable();
	//rdfStream.clear();
	//rdfStream.close();
	close(rdfStream);
errExit:
	if (updateIndexes)
		free(updateIndexes);
	if (urlBuffer)
		free(urlBuffer);
	if (urlInfos) {
		for (int32_t i = 0; i < numUrlInfos; i++) {
			if (urlInfos[i].m_catids)
				free(urlInfos[i].m_catids);
		}
		free(urlInfos);
	}
	if (nameBuffer)
		free(nameBuffer);
	if (rdfCats)
		free(rdfCats);
	if (rdfBuffer)
		free(rdfBuffer);
	// failure
	return 1;
}