591 lines
16 KiB
C++
591 lines
16 KiB
C++
// Matt Wells, copyright Feb 2002
|
|
|
|
// Ideally, CollectionRec.h and SearchInput.h should be automatically generated
|
|
// from Parms.cpp. But Parms need to be marked if they contribute to
|
|
// SearchInput::makeKey() for caching the SERPS.
|
|
|
|
#ifndef _PARMS_H_
|
|
#define _PARMS_H_
|
|
|
|
#include "Rdb.h"
|
|
|
|
//#include "CollectionRec.h"
|
|
|
|
void handleRequest3e ( UdpSlot *slot , int32_t niceness ) ;
|
|
void handleRequest3f ( UdpSlot *slot , int32_t niceness ) ;
|
|
|
|
// "url filters profile" values. used to set default crawl rules
|
|
// in Collectiondb.cpp's CollectionRec::setUrlFiltersToDefaults().
|
|
// for instance, UFP_NEWS spiders sites more frequently but less deep in
|
|
// order to get "news" pages and articles
|
|
//enum {
|
|
// UFP_CUSTOM = 0 ,
|
|
// UFP_NONE = 0 ,
|
|
// UFP_WEB = 1 ,
|
|
// UFP_NEWS = 2 ,
|
|
// UFP_LANG = 3,
|
|
// UFP_SHALLOW = 4
|
|
//};
|
|
|
|
// special priorities for the priority drop down
|
|
// in the url filters table
|
|
//enum {
|
|
// SPIDER_PRIORITY_FILTERED = -3 ,
|
|
// SPIDER_PRIORITY_BANNED = -2 ,
|
|
// SPIDER_PRIORITY_UNDEFINED = -1 };
|
|
|
|
enum {
|
|
OBJ_CONF = 1 ,
|
|
OBJ_COLL ,
|
|
OBJ_SI , // SearchInput class
|
|
OBJ_GBREQUEST , // for GigablastRequest class of parms
|
|
OBJ_IR , // InjectionRequest class from PageInject.h
|
|
OBJ_NONE
|
|
};
|
|
|
|
enum {
|
|
TYPE_BOOL = 1 ,
|
|
TYPE_BOOL2 ,
|
|
TYPE_CHECKBOX ,
|
|
TYPE_CHAR ,
|
|
TYPE_CHAR2 , //needed to display char as a number (maxNumHops)
|
|
TYPE_CMD ,
|
|
TYPE_FLOAT ,
|
|
TYPE_IP ,
|
|
TYPE_LONG ,
|
|
TYPE_LONG_LONG , // 10
|
|
TYPE_NONE ,
|
|
TYPE_PRIORITY ,
|
|
TYPE_PRIORITY2 ,
|
|
TYPE_PRIORITY_BOXES ,
|
|
TYPE_RETRIES ,
|
|
TYPE_STRING ,
|
|
TYPE_STRINGBOX ,
|
|
TYPE_STRINGNONEMPTY ,
|
|
TYPE_TIME ,
|
|
TYPE_DATE2 , // 20
|
|
TYPE_DATE ,
|
|
TYPE_RULESET ,
|
|
TYPE_FILTER ,
|
|
TYPE_COMMENT ,
|
|
TYPE_CONSTANT ,
|
|
TYPE_MONOD2 ,
|
|
TYPE_MONOM2 ,
|
|
TYPE_LONG_CONST ,
|
|
TYPE_SITERULE , // 29
|
|
TYPE_SAFEBUF ,
|
|
TYPE_UFP ,
|
|
TYPE_FILEUPLOADBUTTON,
|
|
TYPE_DOUBLE,
|
|
TYPE_CHARPTR
|
|
};
|
|
|
|
//forward decls to make compiler happy:
|
|
class HttpRequest;
|
|
class TcpSocket;
|
|
|
|
class Page {
|
|
public:
|
|
int32_t m_page; // from the PAGE_* enums above
|
|
char *m_bgcolor; // color of the cells in the table
|
|
char *m_topcolor; // color of the table's first row
|
|
char *m_title; // browser title bar
|
|
};
|
|
|
|
#include "Msg4.h"
|
|
|
|
// generic gigablast request. for all apis offered.
|
|
class GigablastRequest {
|
|
public:
|
|
|
|
//
|
|
// make a copy of the http request because the original is
|
|
// on the stack. AND the "char *" types below will reference into
|
|
// this because they are listed as TYPE_CHARPTR in Parms.cpp.
|
|
// that saves us memory as opposed to making them all SafeBufs.
|
|
//
|
|
HttpRequest m_hr;
|
|
|
|
// ptr to socket to send reply back on
|
|
TcpSocket *m_socket;
|
|
|
|
// TYPE_CHARPTR
|
|
char *m_coll;
|
|
|
|
// pretty universal char ptr
|
|
char *m_formatStr;
|
|
|
|
////////////
|
|
//
|
|
// /admin/inject parms
|
|
//
|
|
////////////
|
|
// these all reference into m_hr or into the Parm::m_def string!
|
|
char *m_url; // also for /get
|
|
//char *m_queryToScrape;
|
|
//char *m_contentDelim;
|
|
//char m_containerContentType; // CT_UNKNOWN, CT_WARC, CT_ARC
|
|
//int32_t m_injectDocIp;
|
|
//char *m_contentTypeStr;
|
|
//char *m_contentFile;
|
|
//char *m_content;
|
|
//char *m_diffbotReply; // secret thing from dan
|
|
//char m_injectLinks;
|
|
//char m_spiderLinks;
|
|
//char m_shortReply;
|
|
//char m_newOnly;
|
|
//char m_deleteUrl;
|
|
//char m_recycle;
|
|
//char m_dedup;
|
|
//char m_hasMime;
|
|
//char m_doConsistencyTesting;
|
|
//char m_getSections;
|
|
//char m_gotSections;
|
|
//int32_t m_charset;
|
|
//int32_t m_hopCount; // hopcount
|
|
//collnum_t m_collnum; // more reliable than m_coll
|
|
// older ones
|
|
//uint32_t m_firstIndexed; // firstimdexed
|
|
//uint32_t m_lastSpidered; // lastspidered;
|
|
//SafeBuf m_contentBuf; // for holding a warc/arc file
|
|
|
|
|
|
|
|
|
|
|
|
///////////
|
|
//
|
|
// /admin/import parms
|
|
//
|
|
///////////
|
|
char *m_importDir; // TYPE_CHARPTR
|
|
int32_t m_importInjects;
|
|
|
|
|
|
///////////
|
|
//
|
|
// /get parms (for getting cached web pages)
|
|
//
|
|
///////////
|
|
int64_t m_docId;
|
|
int32_t m_strip;
|
|
char m_includeHeader;
|
|
char m_highlightQuery;
|
|
|
|
///////////
|
|
//
|
|
// /admin/addurl parms
|
|
//
|
|
///////////
|
|
char *m_urlsBuf;
|
|
char m_stripBox;
|
|
char m_harvestLinks;
|
|
SafeBuf m_listBuf;
|
|
Msg4 m_msg4;
|
|
|
|
/////////////
|
|
//
|
|
// /admin/reindex parms
|
|
//
|
|
////////////
|
|
char *m_query;
|
|
int32_t m_srn;
|
|
int32_t m_ern;
|
|
char *m_qlang;
|
|
bool m_forceDel;
|
|
char m_recycleContent;
|
|
// useful bufs to copy data over
|
|
SafeBuf m_tmpBuf1;
|
|
SafeBuf m_tmpBuf2;
|
|
SafeBuf m_tmpBuf3;
|
|
|
|
void clear() {
|
|
|
|
m_hr = {};
|
|
m_socket = {};
|
|
m_coll = {};
|
|
m_formatStr = {};
|
|
m_url = {};
|
|
m_importDir = {};
|
|
m_importInjects = {};
|
|
m_docId = {};
|
|
m_strip = {};
|
|
m_includeHeader = {};
|
|
m_highlightQuery = {};
|
|
m_urlsBuf = {};
|
|
m_stripBox = {};
|
|
m_harvestLinks = {};
|
|
m_listBuf = {};
|
|
m_msg4 = {};
|
|
m_query = {};
|
|
m_srn = {};
|
|
m_ern = {};
|
|
m_qlang = {};
|
|
m_forceDel = {};
|
|
m_recycleContent = {};
|
|
m_tmpBuf1 = {};
|
|
m_tmpBuf2 = {};
|
|
m_tmpBuf3 = {};
|
|
}
|
|
};
|
|
|
|
|
|
// values for Parm::m_subMenu
|
|
#define SUBMENU_DISPLAY 1
|
|
#define SUBMENU_MAP 2
|
|
#define SUBMENU_CALENDAR 3
|
|
#define SUBMENU_LOCATION 4
|
|
#define SUBMENU_SOCIAL 5
|
|
#define SUBMENU_TIME 6
|
|
#define SUBMENU_CATEGORIES 7
|
|
#define SUBMENU_LINKS 8
|
|
#define SUBMENU_WIDGET 9
|
|
#define SUBMENU_SUGGESTIONS 10
|
|
#define SUBMENU_SEARCH 11
|
|
#define SUBMENU_CHECKBOX 0x80 // flag
|
|
|
|
// values for Parm::m_flags
|
|
#define PF_COOKIE 0x01 // store in cookie?
|
|
#define PF_REDBOX 0x02 // redbox constraint on search results
|
|
#define PF_SUBMENU_HEADER 0x04
|
|
#define PF_WIDGET_PARM 0x08
|
|
#define PF_API 0x10
|
|
#define PF_REBUILDURLFILTERS 0x20
|
|
#define PF_NOSYNC 0x40
|
|
#define PF_DIFFBOT 0x80
|
|
|
|
#define PF_HIDDEN 0x0100
|
|
#define PF_NOSAVE 0x0200
|
|
#define PF_DUP 0x0400
|
|
#define PF_TEXTAREA 0x0800
|
|
#define PF_COLLDEFAULT 0x1000
|
|
#define PF_NOAPI 0x2000
|
|
#define PF_REQUIRED 0x4000
|
|
#define PF_REBUILDPROXYTABLE 0x8000
|
|
|
|
#define PF_NOHTML 0x10000
|
|
|
|
#define PF_CLONE 0x20000
|
|
#define PF_PRIVATE 0x40000 // for password to not show in api
|
|
#define PF_SMALLTEXTAREA 0x80000
|
|
#define PF_REBUILDACTIVELIST 0x100000
|
|
|
|
class Parm {
|
|
public:
|
|
char *m_title; // displayed above m_desc on admin gui page
|
|
char *m_desc; // description of variable displayed on admin gui page
|
|
char *m_cgi; // cgi name, contains %i if an array
|
|
char *m_cgi2; // alias
|
|
char *m_cgi3; // alias
|
|
char *m_cgi4; // alias
|
|
char *m_xml; // default to rendition of m_title if NULL
|
|
int32_t m_off; // this variable's offset into the CollectionRec class
|
|
char m_colspan;
|
|
char m_type; // TYPE_BOOL, TYPE_LONG, ...
|
|
int32_t m_page; // PAGE_MASTER, PAGE_SPIDER, ... see Pages.h
|
|
char m_obj; // OBJ_CONF or OBJ_COLL
|
|
// the maximum number of elements supported in the array.
|
|
// this is 1 if NOT an array (i.e. array of only one parm).
|
|
// in such cases a "count" is NOT stored before the parm in
|
|
// CollectionRec.h or Conf.h.
|
|
bool isArray() { return (m_max>1); };
|
|
|
|
int32_t getNumInArray() ;
|
|
|
|
int32_t m_max; // max elements in the array
|
|
// if array is fixed size, how many elements in it?
|
|
// this is 0 if not a FIXED size array.
|
|
int32_t m_fixed;
|
|
int32_t m_size; // max string size
|
|
char *m_def; // default value of this variable if not in either conf
|
|
int32_t m_defOff; // if default value points to a collectionrec parm!
|
|
char m_cast; // true if we should broadcast to all hosts (default)
|
|
char *m_units;
|
|
char m_addin; // add "insert above" link to gui when displaying array
|
|
char m_rowid; // id of row controls are in, if any
|
|
char m_rdonly;// if in read-only mode, blank out this control?
|
|
char m_hdrs; // print headers for row or print title/desc for single?
|
|
char m_perms; // 0 means same as WebPages' m_perms
|
|
char m_subMenu;
|
|
int32_t m_flags;
|
|
char *m_class;
|
|
char *m_icon;
|
|
char *m_qterm;
|
|
char *m_pstr; // for sorting by in sendPageAPI()
|
|
int32_t m_parmNum; // slot # in the m_parms[] array that we are
|
|
//bool (*m_func)(TcpSocket *s , HttpRequest *r,
|
|
// bool (*cb)(TcpSocket *s , HttpRequest *r));
|
|
bool (*m_func)(char *parmRec);
|
|
// some functions can block, like when deleting a coll because
|
|
// the tree might be saving, so they take a "we" ptr
|
|
bool (*m_func2)(char *parmRec,class WaitEntry *we);
|
|
int32_t m_plen; // offset of length for TYPE_STRINGS (m_htmlHeadLen...)
|
|
char m_group; // start of a new group of controls?
|
|
// m_priv = 1 means gigablast's software license clients cannot see
|
|
// or change.
|
|
// m_priv = 2 means gigablast's software license clients, including
|
|
// even metalincs, cannot see or change.
|
|
// m_priv = 3 means nobody can see in admin controls, but can be
|
|
// in search input by anybody. really a hack for yaron
|
|
// from quigo so he can set "t2" to something bigger.
|
|
char m_priv; // true if gigablast's software clients cannot see
|
|
char m_save; // save to xml file? almost always true
|
|
int32_t m_min;
|
|
// these are used for search parms in PageResults.cpp
|
|
//char m_sparm;// is this a search parm? for passing to PageResults.cpp
|
|
//char *m_scgi; // parm in the search url
|
|
char m_spriv; // is it private? only admins can see/use private parms
|
|
//char *m_scmd; // the url path for this m_scgi variable
|
|
//int32_t m_sdefo; // offset of default into CollectionRec (use m_off)
|
|
int32_t m_sminc ;// offset of min in CollectionRec (-1 for none)
|
|
int32_t m_smaxc ;// offset of max in CollectionRec (-1 for none)
|
|
int32_t m_smin; // absolute min
|
|
int32_t m_smax; // absolute max
|
|
//int32_t m_soff; // offset into SearchInput to store value in
|
|
char m_sprpg; // propagate the cgi variable to other pages via GET?
|
|
char m_sprpp; // propagate the cgi variable to other pages via POST?
|
|
bool m_sync; // this parm should be synced
|
|
int32_t m_hash; // hash of "title"
|
|
int32_t m_cgiHash; // hash of m_cgi
|
|
|
|
bool getValueAsBool ( class SearchInput *si ) ;
|
|
int32_t getValueAsLong ( class SearchInput *si ) ;
|
|
char * getValueAsString ( class SearchInput *si ) ;
|
|
|
|
int32_t getNumInArray ( collnum_t collnum ) ;
|
|
|
|
bool printVal ( class SafeBuf *sb , collnum_t collnum , int32_t occNum ) ;
|
|
};
|
|
|
|
#define MAX_PARMS 940
|
|
|
|
#define MAX_XML_CONF (200*1024)
|
|
|
|
#include "Xml.h"
|
|
#include "SafeBuf.h"
|
|
|
|
struct SerParm;
|
|
|
|
class Parms {
|
|
|
|
public:
|
|
|
|
Parms();
|
|
|
|
void init();
|
|
|
|
bool sendPageGeneric ( class TcpSocket *s, class HttpRequest *r );
|
|
|
|
bool printParmTable ( SafeBuf *sb , TcpSocket *s , HttpRequest *r );
|
|
|
|
//char *printParms (char *p, char *pend, TcpSocket *s, HttpRequest *r);
|
|
bool printParms (SafeBuf* sb, TcpSocket *s , HttpRequest *r );
|
|
|
|
bool printParms2 (SafeBuf* sb,
|
|
int32_t page,
|
|
CollectionRec *cr,
|
|
int32_t nc ,
|
|
int32_t pd ,
|
|
bool isCrawlbot ,
|
|
char format, //bool isJSON,
|
|
TcpSocket *sock,
|
|
bool isMasterAdmin,
|
|
bool isCollAdmin
|
|
);
|
|
|
|
/*
|
|
char *printParm ( char *p ,
|
|
char *pend ,
|
|
//int32_t user ,
|
|
char *username,
|
|
Parm *m ,
|
|
int32_t mm , // m = &m_parms[mm]
|
|
int32_t j ,
|
|
int32_t jend ,
|
|
char *THIS ,
|
|
char *coll ,
|
|
char *pwd ,
|
|
char *bg ,
|
|
int32_t nc ,
|
|
int32_t pd ) ;
|
|
*/
|
|
|
|
bool printParm ( SafeBuf* sb,
|
|
//int32_t user ,
|
|
char *username,
|
|
Parm *m ,
|
|
int32_t mm , // m = &m_parms[mm]
|
|
int32_t j ,
|
|
int32_t jend ,
|
|
char *THIS ,
|
|
char *coll ,
|
|
char *pwd ,
|
|
char *bg ,
|
|
int32_t nc ,
|
|
int32_t pd ,
|
|
bool lastRow ,
|
|
bool isCrawlbot ,//= false,
|
|
char format , //= FORMAT_HTML,
|
|
bool isMasterAdmin ,
|
|
bool isCollAdmin ,
|
|
class TcpSocket *sock );
|
|
|
|
char *getTHIS ( HttpRequest *r , int32_t page );
|
|
|
|
class Parm *getParmFromParmHash ( int32_t parmHash );
|
|
|
|
bool setFromRequest ( HttpRequest *r , //int32_t user,
|
|
TcpSocket* s,
|
|
class CollectionRec *newcr ,
|
|
char *THIS ,
|
|
int32_t objType );
|
|
|
|
bool insertParm ( int32_t i , int32_t an , char *THIS ) ;
|
|
bool removeParm ( int32_t i , int32_t an , char *THIS ) ;
|
|
|
|
void setParm ( char *THIS, Parm *m, int32_t mm, int32_t j, char *s,
|
|
bool isHtmlEncoded , bool fromRequest ) ;
|
|
|
|
void setToDefault ( char *THIS , char objType ,
|
|
CollectionRec *argcr );//= NULL ) ;
|
|
|
|
bool setFromFile ( void *THIS ,
|
|
char *filename ,
|
|
char *filenameDef ,
|
|
char objType ) ;
|
|
|
|
bool setParmsFromXml ( Xml &xml , void *THIS, char objType ) ;
|
|
|
|
bool setXmlFromFile(Xml *xml, char *filename, class SafeBuf *sb );
|
|
|
|
bool saveToXml ( char *THIS , char *f , char objType ) ;
|
|
|
|
bool convertToXml ( char *buf , char *THIS , char objType ) ;
|
|
|
|
// get the parm with the associated cgi name. must be NULL terminated.
|
|
Parm *getParm ( char *cgi ) ;
|
|
|
|
bool getParmHtmlEncoded ( SafeBuf *sb , Parm *m , char *s );
|
|
|
|
bool setGigablastRequest ( class TcpSocket *s ,
|
|
class HttpRequest *hr ,
|
|
class GigablastRequest *gr );
|
|
|
|
// . make it so a collectionrec can be copied in Collectiondb.cpp
|
|
// . so the rec can be copied and the old one deleted without
|
|
// freeing the safebufs now used by the new one.
|
|
void detachSafeBufs ( class CollectionRec *cr ) ;
|
|
|
|
// calc checksum of parms
|
|
uint32_t calcChecksum();
|
|
|
|
// get size of serialized parms
|
|
//int32_t getStoredSize();
|
|
// . serialized to buf
|
|
// . if buf is NULL, just calcs size
|
|
//bool serialize( char *buf, int32_t *bufSize );
|
|
//void deserialize( char *buf );
|
|
|
|
void overlapTest ( char step ) ;
|
|
|
|
|
|
/////
|
|
//
|
|
// parms now in parmdb
|
|
//
|
|
/////
|
|
|
|
// all parm recs need to be in the tree
|
|
//Rdb m_rdb;
|
|
|
|
//
|
|
// new functions
|
|
//
|
|
|
|
bool addNewParmToList1 ( SafeBuf *parmList ,
|
|
collnum_t collnum ,
|
|
char *parmValString ,
|
|
int32_t occNum ,
|
|
char *parmName ) ;
|
|
bool addNewParmToList2 ( SafeBuf *parmList ,
|
|
collnum_t collnum ,
|
|
char *parmValString ,
|
|
int32_t occNum ,
|
|
Parm *m ) ;
|
|
bool addCurrentParmToList1 ( SafeBuf *parmList ,
|
|
CollectionRec *cr ,
|
|
char *parmName ) ;
|
|
bool addCurrentParmToList2 ( SafeBuf *parmList ,
|
|
collnum_t collnum ,
|
|
int32_t occNum ,
|
|
Parm *m ) ;
|
|
bool convertHttpRequestToParmList (HttpRequest *hr,SafeBuf *parmList,
|
|
int32_t page , TcpSocket *sock );
|
|
Parm *getParmFast2 ( int32_t cgiHash32 ) ;
|
|
Parm *getParmFast1 ( char *cgi , int32_t *occNum ) ;
|
|
bool broadcastParmList ( SafeBuf *parmList ,
|
|
void *state ,
|
|
void (* callback)(void *) ,
|
|
bool sendToGrunts = true ,
|
|
bool sendToProxies = false ,
|
|
// send to this single hostid? -1 means all
|
|
int32_t hostId = -1 ,
|
|
int32_t hostId2 = -1 ); // hostid range?
|
|
bool doParmSendingLoop ( ) ;
|
|
bool syncParmsWithHost0 ( ) ;
|
|
bool makeSyncHashList ( SafeBuf *hashList ) ;
|
|
int32_t getNumInArray ( collnum_t collnum ) ;
|
|
bool addAllParmsToList ( SafeBuf *parmList, collnum_t collnum ) ;
|
|
bool updateParm ( char *rec , class WaitEntry *we ) ;
|
|
|
|
bool cloneCollRec ( char *srcCR , char *dstCR ) ;
|
|
|
|
//
|
|
// end new functions
|
|
//
|
|
|
|
bool m_inSyncWithHost0;
|
|
bool m_triedToSync;
|
|
|
|
bool m_isDefaultLoaded;
|
|
|
|
Page m_pages [ 50 ];
|
|
int32_t m_numPages;
|
|
|
|
Parm m_parms [ MAX_PARMS ];
|
|
int32_t m_numParms;
|
|
|
|
// just those Parms that have a m_sparm of 1
|
|
Parm *m_searchParms [ MAX_PARMS ];
|
|
int32_t m_numSearchParms;
|
|
|
|
/*
|
|
private:
|
|
// these return true if overflow
|
|
bool serializeConfParm( Parm *m, int32_t i, char **p, char *end,
|
|
int32_t size, int32_t cnt,
|
|
bool sizeChk, int32_t *bufSz );
|
|
bool serializeCollParm( class CollectionRec *cr,
|
|
Parm *m, int32_t i, char **p, char *end,
|
|
int32_t size, int32_t cnt,
|
|
bool sizeChk, int32_t *bufSz );
|
|
|
|
|
|
void deserializeConfParm( Parm *m, SerParm *sp, char **p,
|
|
bool *confChgd );
|
|
void deserializeCollParm( class CollectionRec *cr,
|
|
Parm *m, SerParm *sp, char **p );
|
|
*/
|
|
|
|
// for holding default.conf file for collection recs for OBJ_COLL
|
|
char m_buf [ MAX_XML_CONF ];
|
|
|
|
// for parsing default.conf file for collection recs for OBJ_COLL
|
|
Xml m_xml2;
|
|
};
|
|
|
|
extern Parms g_parms;
|
|
|
|
#endif
|
|
|