// Matt Wells, copyright Jul 2008
|
|
|
|
#ifndef _TAGDB_H_
|
|
#define _TAGDB_H_
|
|
|
|
#include "Conf.h" // for setting rdb from Conf file
|
|
#include "Rdb.h"
|
|
#include "Xml.h"
|
|
#include "Url.h"
|
|
#include "Loop.h"
|
|
//#include "DiskPageCache.h"
|
|
//#include "CollectionRec.h"
|
|
#include "SafeBuf.h"
|
|
#include "Msg0.h"
|
|
|
|
// . now we can store multiple ratings by multiple users or algorithms
// . we can use accountability, we can merge sources, etc.
// . we can use time-based merging

// . predicates on a tag type value "tt" (Tag::isIndexable() passes its
//   Tag::m_type to isTagTypeIndexable())
// . NOTE(review): the definitions are not in this header. presumably
//   "unique" means at most one tag of that type is kept -- confirm
bool isTagTypeUnique    ( int32_t tt ) ;
bool isTagTypeIndexable ( int32_t tt ) ;
//bool isTagTypeString  ( int32_t tt ) ;

// . NOTE(review): presumably decodes the hex text in [src,srcEnd) into
//   "dst" and returns a byte count; the meaning of "decrement" is not
//   visible from here -- confirm against the definition
int32_t hexToBinary ( char *src , char *srcEnd , char *dst , bool decrement );
|
|
|
|
// . Tag::m_type is set to this if the tag is a dup in the TagRec
// . so if www.xyz.com has one tag and xyz.com has another, then
//   the xyz.com tag should have its m_type set to TT_DUP, but only if
//   we can only have one of those tag types...
#define TT_DUP 123456

// the key type used for tagdb records
#define TAGDB_KEY key128_t
|
|
|
|
// a TagRec can contain multiple Tags, even of the same Tag::m_type
|
|
class Tag {
|
|
public:
|
|
|
|
int32_t getSize ( ) { return sizeof(key128_t) + 4 + m_recDataSize; };
|
|
int32_t getRecSize ( ) { return sizeof(key128_t) + 4 + m_recDataSize; };
|
|
|
|
void set ( char *site ,
|
|
char *tagname ,
|
|
int32_t timestamp ,
|
|
char *user ,
|
|
int32_t ip ,
|
|
char *data ,
|
|
int32_t dataSize );
|
|
|
|
int32_t print ( ) ;
|
|
bool printToBuf ( SafeBuf *sb );
|
|
bool printToBufAsAddRequest ( SafeBuf *sb );
|
|
bool printToBufAsXml ( SafeBuf *sb );
|
|
bool printToBufAsXml2 ( SafeBuf *sb );
|
|
bool printToBufAsHtml ( SafeBuf *sb , char *prefix );
|
|
bool printToBufAsTagVector ( SafeBuf *sb );
|
|
// just print the m_data...
|
|
bool printDataToBuf ( SafeBuf *sb );
|
|
bool isType ( char *t );
|
|
bool isIndexable ( ) {
|
|
return isTagTypeIndexable ( m_type ); }
|
|
//( m_dataSize == 1 || isType("meta") ); };
|
|
// for parsing output of printToBuf()
|
|
int32_t setFromBuf ( char *p , char *pend ) ;
|
|
int32_t setDataFromBuf ( char *p , char *pend ) ;
|
|
|
|
// skip of the username, whose size (including \0) is encoded
|
|
// as the first byte in the m_recData buffer
|
|
char *getTagData ( ) {return m_buf + *m_buf + 1;};
|
|
int32_t getTagDataSize ( ) {return m_bufSize - *m_buf - 1; };
|
|
// what user added this tag?
|
|
char *getUser ( ) { return m_buf + 1;};
|
|
// remove the terminating \0 which is included as part of the size
|
|
int32_t getUserLen ( ) { return *m_buf - 1; };
|
|
|
|
// used to determine if one Tag should overwrite the other! if they
|
|
// have the same dedup hash... then yes...
|
|
int32_t getDedupHash ( );
|
|
|
|
// tagdb uses 128 bit keys now
|
|
key128_t m_key;
|
|
int32_t m_recDataSize;
|
|
|
|
// when tag was added/updated
|
|
int32_t m_timestamp;
|
|
// . ip address of user adding tag
|
|
// . prevent multiple turk voters from same ip!
|
|
int32_t m_ip;
|
|
|
|
// each tag in a TagRec now has a unique id for ez deletion
|
|
//int32_t m_tagId;
|
|
|
|
// the "type" of tag. see the TagDesc array in Tagdb.cpp for a list
|
|
// of all the tag types. m_type is a hash of the type name.
|
|
int32_t m_type;
|
|
|
|
// . m_user[] IS ACTUALLY a 6-byte KEY for another TagRec
|
|
// . this is also a user's name like "mwells"
|
|
// . each user has a TagRec whose FULL key is this userHash
|
|
// . m_user[7] is always \0
|
|
//char m_user[8];
|
|
|
|
int32_t m_bufSize;
|
|
char m_buf[0];
|
|
} __attribute__((packed, aligned(4)));
|
|
|
|
// . convert "domain_squatter" to ST_DOMAIN_SQUATTER
// . used by CollectionRec::getRegExpNum()
// . NOTE(review): tagnameLen defaults to -1, presumably meaning
//   "tagTypeName is NUL-terminated, measure it" -- confirm
int32_t getTagTypeFromStr( char *tagTypeName , int32_t tagnameLen = -1 );

// . convert ST_DOMAIN_SQUATTER to "domain_squatter"
char *getTagStrFromType ( int32_t tagType ) ;

// . max # of tags any one site or url can have
// . even AFTER the "inheritance loop"
// . includes the 4 bytes used for size and # of tags
//#define MAX_TAGREC_SIZE 1024

// . max "outstanding" msg0 requests sent by TagRec::lookup()
// . also sizes the TagRec::m_lists[], TagRec::m_listPtrs[] and
//   Msg8a::m_msg0s[] arrays
#define MAX_TAGDB_REQUESTS 3

// . the latest version of the TagRec
//#define TAGREC_CURRENT_VERSION 0
|
|
|
|
class TagRec {
|
|
|
|
public:
|
|
|
|
|
|
TagRec();
|
|
~TagRec();
|
|
void reset();
|
|
void constructor ();
|
|
|
|
// . an rdb record is a key, dataSize, then the data
|
|
// . the "data" is al the stuff after "m_dataSize"
|
|
//key_t m_key;
|
|
//int32_t m_dataSize;
|
|
//uint16_t m_numTags;
|
|
//char m_version;
|
|
//char m_buf [ MAX_TAGREC_SIZE - 12 - 4 - 2 ];
|
|
|
|
//char *getKey () { return (char *)&m_key; };
|
|
//char *getData () { return (char *)this + 12 + 4; };
|
|
//int32_t getDataSize () { return m_dataSize; };
|
|
|
|
//void copy (class TagRec *tp ) {
|
|
// gbmemcpy ( this , (void *)tp , tp->getSize() ); };
|
|
|
|
// includes the 4 byte "header" which consists of the first 2 bytes
|
|
// being the size of the actual tag buffer and the second two bytes
|
|
// being the number of tags in the actual tag buffer
|
|
//int32_t getSize ( ) { return 12+4+1+2+m_dataSize; };
|
|
//int32_t getSize ( ) { return 12+4+m_dataSize;};
|
|
|
|
int32_t getNumTags ( );
|
|
|
|
int32_t getSize ( ) { return sizeof(TagRec); };
|
|
|
|
class Tag *getFirstTag ( ) {
|
|
if ( m_numListPtrs == 0 ) return NULL;
|
|
return (Tag *)m_listPtrs[0]->m_list;
|
|
}
|
|
|
|
bool isEmpty ( ) { return (getFirstTag()==NULL); };
|
|
|
|
// lists should be in order of precedence i guess
|
|
class Tag *getNextTag ( class Tag *tag ) {
|
|
// watch out
|
|
if ( ! tag ) return NULL;
|
|
// get rec size
|
|
int32_t recSize = tag->getRecSize();
|
|
// point to current tag
|
|
char *current = (char *)tag;
|
|
// find what list we are in
|
|
int32_t i;
|
|
for ( i = 0 ; i < m_numListPtrs ; i++ ) {
|
|
if ( current < m_listPtrs[i]->m_list ) continue;
|
|
if ( current >= m_listPtrs[i]->m_listEnd ) continue;
|
|
break;
|
|
}
|
|
// sanity
|
|
if ( i >= m_numListPtrs ) { char *xx=NULL;*xx=0; }
|
|
// advance
|
|
current += recSize;
|
|
// sanity check
|
|
if ( recSize > 500000 || recSize < 12 ) {
|
|
log("tagdb: corrupt tag recsize %i",(int)recSize);
|
|
return NULL;
|
|
char *xx=NULL;*xx=0;}
|
|
// breach list?
|
|
if ( current < m_listPtrs[i]->m_listEnd) return (Tag *)current;
|
|
// advance list
|
|
i++;
|
|
// breach of lists?
|
|
if ( i >= m_numListPtrs ) return NULL;
|
|
// return that list record then
|
|
return (Tag *)(m_listPtrs[i]->m_list);
|
|
};
|
|
|
|
// return the number the tags having particular tag types
|
|
int32_t getNumTagTypes ( char *tagTypeStr );
|
|
|
|
// get a tag from the tagType
|
|
class Tag *getTag ( char *tagTypeStr );
|
|
|
|
class Tag *getTag2 ( int32_t tagType );
|
|
|
|
//char *getRecEnd ( ) { return (char *)this + getSize(); };
|
|
//char *getMaxEnd ( ) { return (char *)this + (int32_t)MAX_TAGREC_SIZE; };
|
|
|
|
// . for showing under the summary of a search result in PageResults
|
|
// . also for Msg6a
|
|
int32_t print ( ) ;
|
|
bool printToBuf ( SafeBuf *sb );
|
|
bool printToBufAsAddRequest ( SafeBuf *sb );
|
|
bool printToBufAsXml ( SafeBuf *sb );
|
|
bool printToBufAsHtml ( SafeBuf *sb , char *prefix );
|
|
bool printToBufAsTagVector ( SafeBuf *sb );
|
|
|
|
// . make sure not a dup of a pre-existing tag
|
|
// . used by the clock code to not at a clock if already in there
|
|
// in Msg14.cpp
|
|
Tag *getTag ( char *tagTypeStr , char *dataPtr , int32_t dataSize );
|
|
|
|
int32_t getTimestamp ( char *tagTypeStr , int32_t defalt );
|
|
|
|
// . functions to act on a site "tag buf", like that in Msg16::m_tagRec
|
|
// . first 2 bytes is size, 2nd to bytes is # of tags, then the tags
|
|
int32_t getLong ( char *tagTypeStr ,
|
|
int32_t defalt ,
|
|
Tag **bookmark = NULL ,
|
|
int32_t *timeStamp = NULL ,
|
|
char **user = NULL );
|
|
int32_t getLong ( int32_t tagId ,
|
|
int32_t defalt ,
|
|
Tag **bookmark = NULL ,
|
|
int32_t *timeStamp = NULL ,
|
|
char **user = NULL );
|
|
|
|
int64_t getLongLong ( char *tagTypeStr,
|
|
int64_t defalt ,
|
|
Tag **bookmark = NULL ,
|
|
int32_t *timeStamp = NULL ,
|
|
char **user = NULL );
|
|
|
|
char *getString ( char *tagTypeStr ,
|
|
char *defalt = NULL ,
|
|
int32_t *size = NULL ,
|
|
Tag **bookmark = NULL ,
|
|
int32_t *timestamp = NULL ,
|
|
char **user = NULL );
|
|
|
|
// we only store the first 6 chars of "user" into this TagRec, m_buf
|
|
/*
|
|
bool addTag ( char *tagTypeStr,
|
|
int32_t timestamp ,
|
|
char *user ,
|
|
int32_t ip ,
|
|
char *data ,
|
|
int32_t dataSize );
|
|
|
|
|
|
// we convert the int32_t to a string for you here...
|
|
bool addTag ( char *tagTypeStr ,
|
|
int32_t timestamp ,
|
|
char *user ,
|
|
int32_t ip ,
|
|
int32_t dataAsLong ) {
|
|
char buf[16]; sprintf(buf,"%"INT32"",dataAsLong);
|
|
return addTag(tagTypeStr,timestamp,user,ip,buf,
|
|
gbstrlen(buf)); };
|
|
|
|
|
|
// same as above
|
|
bool addTag ( Tag *tag );
|
|
|
|
// add "negative" tags, so when these tags are added to the
|
|
// TagRec in tagdb, they will delete them
|
|
bool addDelTag ( char *tagTypeStr );
|
|
|
|
// now you can specify a unique tag id in the case of multiple tags
|
|
// that have the same tagType and user
|
|
bool removeTags ( char *tagTypeStr , char *user , int32_t tagId = 0 ) ;
|
|
bool removeTags ( int32_t tagType , char *user , int32_t tagId = 0 ) ;
|
|
bool removeTag ( class Tag *rmTag ) ;
|
|
|
|
// add/remove all the tags from "tagRec" to our list of tags
|
|
bool addTags ( TagRec *tagRec ) ;
|
|
bool removeTags ( TagRec *tagRec ) ;
|
|
|
|
// return false and set g_errno on error
|
|
bool addTags ( Tag *tags , char *tagEnd , char *bufEnd );
|
|
bool removeTags ( Tag *tags , char *tagEnd , char *bufEnd );
|
|
bool replaceTags ( Tag *tags , char *tagEnd , char *bufEnd );
|
|
*/
|
|
|
|
bool setFromBuf ( char *buf , int32_t bufSize );
|
|
bool serialize ( SafeBuf &dst );
|
|
|
|
bool setFromHttpRequest ( HttpRequest *r , TcpSocket *s );
|
|
|
|
|
|
// use this for setFromBuf()
|
|
SafeBuf m_sbuf;
|
|
|
|
|
|
// some specified input
|
|
//char *m_coll;
|
|
Url *m_url;
|
|
|
|
collnum_t m_collnum;
|
|
|
|
void (*m_callback ) ( void *state );
|
|
void *m_state;
|
|
|
|
// hold possible tagdb records
|
|
RdbList m_lists[MAX_TAGDB_REQUESTS];
|
|
|
|
// ptrs to lists in the m_lists[] array
|
|
RdbList *m_listPtrs[MAX_TAGDB_REQUESTS];
|
|
int32_t m_numListPtrs;
|
|
};
|
|
|
|
class Tagdb {
|
|
|
|
public:
|
|
// reset rdb
|
|
void reset();
|
|
|
|
// . TODO: specialized cache because to store pre-parsed tagdb recs
|
|
// . TODO: have m_useSeals parameter???
|
|
bool init ( );
|
|
bool init2 ( int32_t treeMem );
|
|
|
|
bool verify ( char *coll );
|
|
|
|
//bool convert ( char *coll ) ;
|
|
|
|
bool addColl ( char *coll, bool doVerify = true );
|
|
|
|
// used by ../rdb/Msg0 and ../rdb/Msg1
|
|
Rdb *getRdb ( ) { return &m_rdb; };
|
|
|
|
//key128_t makeKey ( Url *u , bool isDelete ) ;
|
|
key128_t makeStartKey ( char *site );//Url *u ) ;
|
|
key128_t makeEndKey ( char *site );//Url *u ) ;
|
|
|
|
|
|
key128_t makeDomainStartKey ( Url *u ) ;
|
|
key128_t makeDomainEndKey ( Url *u ) ;
|
|
|
|
// . get the serialized TagRec from an RdbList of TagRecs from tagdb
|
|
// that is the best match for "url"
|
|
char *getRec ( RdbList *list , Url *url , int32_t *recSize ,char* coll,
|
|
int32_t collLen, RdbList *retList) ;
|
|
|
|
//DiskPageCache *getDiskPageCache() { return &m_pc; };
|
|
|
|
//int32_t getGroupId (key_t *key) {return key->n1 & g_hostdb.m_groupMask;}
|
|
|
|
// . dump tagdb to stdout
|
|
// . dump as URL requests so we can re-add with blaster on each host
|
|
// . this replaces dumpTagdb() in main.cpp
|
|
// . sendPageTagdb() will process such URL requests
|
|
void dumpTagdb ( );
|
|
|
|
// private:
|
|
|
|
// . returns 0 if url is not a suburl of "site"
|
|
int32_t getMatchPoints ( Url *url , Url *site ) ;
|
|
|
|
bool setHashTable ( ) ;
|
|
|
|
// . we use the cache in here also for caching tagdb records
|
|
// and "not-founds" stored remotely (net cache)
|
|
Rdb m_rdb;
|
|
|
|
//DiskPageCache m_pc;
|
|
|
|
bool loadMinSiteInlinksBuffer ( );
|
|
bool loadMinSiteInlinksBuffer2 ( );
|
|
int32_t getMinSiteInlinks ( uint32_t hostHash32 ) ;
|
|
SafeBuf m_siteBuf1;
|
|
SafeBuf m_siteBuf2;
|
|
|
|
};
|
|
|
|
// derive this from tagdb
|
|
class Turkdb {
|
|
public:
|
|
void reset();
|
|
bool init ( );
|
|
bool addColl ( char *coll, bool doVerify = true );
|
|
Rdb *getRdb ( ) { return &m_rdb; };
|
|
Rdb m_rdb;
|
|
//DiskPageCache m_pc;
|
|
};
|
|
|
|
// the global tagdb/turkdb instances; they are only declared here and
// must be defined in exactly one source file
extern class Tagdb  g_tagdb;
extern class Tagdb  g_tagdb2;
extern class Turkdb g_turkdb;
//extern class Tagdb g_sitedb;
|
|
|
|
// . http page handler for tagdb, taking the socket and the parsed request
// . per the note on Tagdb::dumpTagdb(), this also processes the URL
//   requests that the dump emits
// . NOTE(review): the return value convention is not visible in this header
bool sendPageTagdb ( TcpSocket *s , HttpRequest *req ) ;
|
|
|
|
|
|
|
|
///////////////////////////////////////////////
|
|
//
|
|
// Msg8a gets TagRecs from Tagdb
|
|
//
|
|
///////////////////////////////////////////////
|
|
|
|
// msg8a needs to keep a ptr to the tags it
|
|
// generates in the "inheritance loop". so don't
|
|
// store more than this! log if we do.
|
|
//#define MAX_TAGS 128
|
|
|
|
//#define MSG8A_MAX_REQUEST_SIZE 2048
|
|
|
|
// this msg class is for getting AND adding to tagdb
|
|
class Msg8a {
|
|
|
|
public:
|
|
|
|
Msg8a ();
|
|
~Msg8a ();
|
|
void reset();
|
|
|
|
// . get records from multiple subdomains of url
|
|
// . calls g_udpServer.sendRequest() on each subdomain of url
|
|
// . all matching records are merge into a final record
|
|
// i.e. site tags are also propagated accordingly
|
|
// . closest matching "site" is used as the "site" (the site url)
|
|
// . stores the tagRec in your "tagRec"
|
|
bool getTagRec ( Url *url ,
|
|
char *site , // set to NULL to auto set
|
|
//char *coll ,
|
|
collnum_t collnum,
|
|
//bool useCanonicalName ,
|
|
bool skipDomainLookup ,
|
|
int32_t niceness ,
|
|
void *state ,
|
|
void (* callback)(void *state ),
|
|
TagRec *tagRec ,
|
|
bool doInheritance = true ,
|
|
char rdbId = RDB_TAGDB);
|
|
|
|
bool launchGetRequests();
|
|
void gotAllReplies ( ) ;
|
|
|
|
// some specified input
|
|
//char *m_coll;
|
|
//int32_t m_collLen;
|
|
Url *m_url;
|
|
//bool m_doFullUrl;
|
|
char m_rdbId;
|
|
|
|
collnum_t m_collnum;
|
|
|
|
void (*m_callback ) ( void *state );
|
|
void *m_state;
|
|
|
|
Msg0 m_msg0s[MAX_TAGDB_REQUESTS];
|
|
|
|
|
|
key128_t m_siteStartKey ;
|
|
key128_t m_siteEndKey ;
|
|
|
|
// hold possible tagdb records
|
|
//RdbList m_lists[MAX_TAGDB_REQUESTS];
|
|
|
|
int32_t m_niceness;
|
|
|
|
char *m_dom;
|
|
char *m_hostEnd;
|
|
char *m_p;
|
|
|
|
int32_t m_requests;
|
|
int32_t m_replies;
|
|
char m_doneLaunching;
|
|
|
|
int32_t m_errno;
|
|
|
|
// we set this for the caller
|
|
TagRec *m_tagRec;
|
|
|
|
// hacks for msg6b
|
|
void *m_parent;
|
|
int32_t m_slotNum;
|
|
|
|
// hack for MsgE
|
|
void *m_state2;
|
|
void *m_state3;
|
|
|
|
bool m_doInheritance;
|
|
};
|
|
|
|
/*
|
|
void handleRequest9a ( UdpSlot *slot , int32_t niceness ) ;
|
|
|
|
class Msg9a {
|
|
|
|
public:
|
|
|
|
Msg9a ();
|
|
~Msg9a();
|
|
void reset() ;
|
|
bool launchAddRequests ( ) ;
|
|
|
|
static bool registerHandler ( ) {
|
|
return g_udpServer.registerHandler ( 0x9a, handleRequest9a );};
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . sets errno on error
|
|
// . "sites" is a NULL-terminated list of space-separated urls
|
|
// . if "deleteTags" is false, then the tags will be added to the
|
|
/// the TagRecs specified by the sites in "sites". if a TagRec
|
|
// does not exist for a given "site" then it will be added just
|
|
// so we can add the Tags to it. If it does exist, we will
|
|
// just append the given Tags to it. Tags with the same tagType and
|
|
// and "user" will be replaced by these Tags in the tagRecPtrs[].
|
|
// . if "deleteTags" is true, then the Tags matching the Tags
|
|
// given will be removed from each TagRec. If a nonzero timestamp
|
|
// or a username that does not start with \0 or a non-zero data
|
|
// is specified in the Tag, then that must match the Tag
|
|
// being removed as well. In this way we can remove specific
|
|
// Tags from a list of Tags that share the same m_tagType.
|
|
// . if "deleteTagRecs" is false then the entire TagRec will be removed
|
|
// from Tagdb. tagRecPtrs must be NULL in this case.
|
|
// . if provided, the ip vector is 1-1 with the sites in "sitesPtrs[]"
|
|
// and we set the Tag::m_ip with the corresponding
|
|
// ip given by that. this allows XmlDoc.cpp to "ip stamp" a tag.
|
|
// and the tag will be rendered invalid by XmlDoc.cpp if the ip of
|
|
// the site changes from that! in this way we can invalidate tags
|
|
// when a site changes ownership. assuming it also changes ip!
|
|
// . you can now pass in either "sites" or sitePtrs/numSitePtrs,
|
|
// whichever is easiest for you...
|
|
bool addTags ( char *sites ,
|
|
char **sitePtrs ,
|
|
int32_t numSitePtrs ,
|
|
char *coll ,
|
|
void *state ,
|
|
void (*callback)(void *state) ,
|
|
int32_t niceness ,
|
|
TagRec *tagRec ,
|
|
bool nukeTagRecs ,
|
|
int32_t *ipVector );//= NULL );
|
|
|
|
// like above, but we are adding the output of a
|
|
// './gb dump S main 0 -1 1' cmd
|
|
bool addTags ( char *dumpFile ,
|
|
char *coll ,
|
|
void *state ,
|
|
void (*callback)(void *state) ,
|
|
int32_t niceness );
|
|
|
|
void (*m_callback ) ( void *state );
|
|
void *m_state;
|
|
|
|
int32_t m_errno;
|
|
int32_t m_requests;
|
|
int32_t m_replies;
|
|
|
|
char *m_requestBuf;
|
|
int32_t m_requestBufSize;
|
|
|
|
char *m_p;
|
|
char *m_pend;
|
|
|
|
int32_t m_niceness;
|
|
};
|
|
*/
|
|
|
|
// . NOTE(review): the definition is not in this header. presumably maps X
//   to a Y value using the n sample points (x[i],y[i]) -- confirm
int32_t getY ( int64_t X , int64_t *x , int64_t *y , int32_t n ) ;
|
|
|
|
#endif
|
|
|
|
// Lookup order for the url hostname.domainname.com/mydir/mypage.html
// (aka 1.2.3.4/mydir/mypage.html):

// . hostname.domainname.com/mydir/mypage.html
// . hostname.domainname.com/mydir/
// . hostname.domainname.com
// . domainname.com/mydir/mypage.html
// . domainname.com/mydir/
// . domainname.com
// . 1.2.3.4 /mydir/mypage.html
// . 1.2.3.4 /mydir/
// . 1.2.3.4
// . 1.2.3
|