528 lines
22 KiB
C++
528 lines
22 KiB
C++
// Matt Wells, copyright Jul 201
|
|
|
|
// . the record retrieved from tagdb
|
|
// . used for describing a site
|
|
// . can parse out record from our rdb or from a network msg
|
|
// . has siteUrl and filenum of the file that holds the Xml that has the
|
|
// parsing rules and quotas for docs in that site
|
|
// . we have the fields you can use at the bottom of this file
|
|
|
|
#ifndef _CATREC_H_
|
|
#define _CATREC_H_
|
|
|
|
#include "Conf.h"
|
|
#include "Xml.h"
|
|
#include "RdbList.h"
|
|
#include "Tagdb.h"
|
|
#include "Categories.h"
|
|
#include "Lang.h"
|
|
#include "Tagdb.h"
|
|
#include "Catdb.h"
|
|
|
|
// capacity of CatRec::m_indCatids (indirect category ids kept per record)
#define MAX_IND_CATIDS 1024
// NOTE(review): only referenced by commented-out members in this header
// (m_siteTypes/m_siteLangs) -- confirm other users before removing
#define MAX_SITE_TYPES 12
// . size in bytes of CatRec::m_data, the serialized site record
// . url, catids, indirect catids, numCatids, numIndCatids, filenum
// . MAX_URL_LEN and MAX_CATIDS come from headers included above
// . the whole sum is parenthesized so the macro stays a single operand when
//   it is used inside a larger expression, e.g. 2 * CATREC_BUF_SIZE
#define CATREC_BUF_SIZE (MAX_URL_LEN + MAX_CATIDS*4 + 9)
|
|
|
|
class CatRec {
|
|
|
|
public:
|
|
|
|
// these just set m_xml to NULL
|
|
void reset() ;
|
|
CatRec();
|
|
~CatRec();
|
|
|
|
// . extract the site url for "url"
|
|
// . extract the filenum of the file that holds the xml we want
|
|
// . returns false and sets errno on error setting
|
|
// . if rec is NULL we use the default rec for this collection
|
|
bool set ( Url *url, char *data,int32_t dataSize,
|
|
bool gotByIp ); // , char rdbId = RDB_TAGDB );
|
|
|
|
// we're empty if m_xml is NULL
|
|
//bool isEmpty() { return (! m_xml); };
|
|
|
|
// . used to by Msg9 to make a CatRec to add
|
|
// . serializes filenum/site into our m_data/m_dataSize
|
|
// . returns false and sets errno on error
|
|
/*
|
|
bool set ( Url *site , char *coll , int32_t collLen , int32_t filenum ,
|
|
char version , char rdbId = RDB_TAGDB , int32_t timeStamp = 0,
|
|
char *comment = NULL, char *username = NULL,
|
|
int32_t *catids = NULL, unsigned char numCatids = 0,
|
|
unsigned char spamBits = 0, char siteQuality = 0,
|
|
char adultLevel = 0,
|
|
SiteType *siteTypes = NULL,
|
|
uint8_t numTypes = 0,
|
|
SiteType *langs = NULL,
|
|
uint8_t numLangs = 0);
|
|
*/
|
|
bool set ( Url *site , int32_t filenum ,
|
|
int32_t *catids = NULL, unsigned char numCatids = 0 );
|
|
|
|
//Xml *getXml() { return m_xml; };
|
|
|
|
//bool set ( int32_t filenum ) ;
|
|
|
|
// . this method just sets the filenum, version, url and url-len from
|
|
// data-pointer "data"
|
|
// . this method is written as an alternative to the above set methods
|
|
// Useful if the caller is interested just in the url and url len
|
|
// saves time
|
|
bool set (char *data, int32_t dataSize);//, char rdbId );
|
|
|
|
// set the indirect catids
|
|
void setIndirectCatids ( int32_t *indCatids, int32_t numIndCatids );
|
|
|
|
// . did this url have an entry in tagdb?
|
|
// . we need this to know because if it didn't it will have default rec
|
|
// . Msg16 will override Url::isSpam() if this record is not default
|
|
// . Msg25 will also not bother checking for link bans via Msg18
|
|
bool hadRec() { return m_hadRec; };
|
|
|
|
// . did we get it by ip? (if not, we got it by canonical domain name)
|
|
// . if we got it by IP and it was banned, admin has the option to
|
|
// tell gigablast to automatically add the domain name as banned
|
|
// to tagdb in Msg14.cpp
|
|
bool gotByIp() { return m_gotByIp; };
|
|
|
|
// get the record itself (just templateNum/site/coll)
|
|
char *getData ( ) { return m_data; };
|
|
int32_t getDataSize ( ) { return m_dataSize; };
|
|
|
|
// along with coll/collLen identifies a unique xml file
|
|
//int32_t getFilenum ( ) { return m_filenum; };
|
|
//int32_t getRuleset ( ) { return m_filenum; };
|
|
|
|
|
|
// . these should both be NULL terminated
|
|
// . they both reference into the data contained in m_list
|
|
// or m_buf if the list doesn't have a site record for us
|
|
Url *getSite ( ) { return &m_site; };
|
|
//char *getCollection ( ) { return m_coll; };
|
|
//int32_t getCollectionLen ( ) { return m_collLen; };
|
|
|
|
/*
|
|
char* printFormattedRec(char* p);
|
|
void printFormattedRec(SafeBuf *sb);
|
|
char* printXmlRec (char* p);
|
|
void printXmlRec ( SafeBuf *sb );
|
|
|
|
//status of manually set bits.
|
|
bool isSpamUnknown() { return m_spamBits == SPAM_UNKNOWN; }
|
|
bool isSpam() { return m_spamBits == SPAM_BIT; }
|
|
bool isNotSpam() { return m_spamBits == NOT_SPAM; }
|
|
char* getSpamStr();
|
|
unsigned char getSpamStatus() { return m_spamBits; }
|
|
|
|
//
|
|
bool isRatingUnknown() { return m_adultLevel == NOT_RATED; }
|
|
bool isAdultButNotPorn() { return m_adultLevel == RATED_R; }
|
|
bool isPorn() { return m_adultLevel == RATED_X; }
|
|
bool isKidSafe() { return m_adultLevel == RATED_G; }
|
|
char* getAdultStr();
|
|
|
|
char *getPubDateFmtStr();
|
|
|
|
int32_t getTimeStamp() { return m_timeStamp; }
|
|
char *getComment() { return m_comment; }
|
|
char *getUsername() { return m_username; }
|
|
char getSiteQuality() { return m_siteQuality; }
|
|
int32_t getNumSiteTypes () { return m_numTypes; }
|
|
int32_t getNumSiteLangs () { return m_numLangs; }
|
|
SiteType *getSiteTypes () { return m_siteTypes; }
|
|
SiteType *getSiteLangs () { return m_siteLangs; }
|
|
uint32_t getScoreForType(uint8_t type);
|
|
|
|
// . mod functions
|
|
// . pain in the butt cuz we gotta change m_data/m_dataSize buffer too
|
|
void addSiteType (uint8_t type, uint32_t score ) ;
|
|
void setFilenum (int32_t newFilenum );
|
|
|
|
// . [n0,n1] constitute an xml node range in "xml"
|
|
// . "len" is the length of another node's data in another xml doc
|
|
// . gets the scoreWeight from docQuality and a node's dataLen
|
|
// . 2nd one gets the maxScore from docQuality
|
|
int32_t getScoreWeightFromQuality ( int32_t n0, int32_t n1, int32_t quality );
|
|
int32_t getScoreWeightFromQuality2( int32_t quality );
|
|
int32_t getMaxScoreFromQuality ( int32_t n0, int32_t n1, int32_t quality );
|
|
int32_t getMaxLenFromQuality ( int32_t n0, int32_t n1, int32_t quality );
|
|
|
|
//bool hasMaxCountFromQualityTag ( int32_t n0, int32_t n1 ) ;
|
|
//int32_t getMaxCountFromQuality ( int32_t n0, int32_t n1, int32_t quality ) ;
|
|
|
|
int32_t getScoreWeightFromLen ( int32_t n0, int32_t n1, int32_t len );
|
|
int32_t getScoreWeightFromLen2 ( int32_t len );
|
|
int32_t getScoreWeightFromNumWords( int32_t n0, int32_t n1, int32_t len );
|
|
int32_t getMaxScoreFromLen ( int32_t n0, int32_t n1, int32_t quality );
|
|
int32_t getMaxScoreFromNumWords ( int32_t n0, int32_t n1, int32_t quality );
|
|
|
|
// 2 new maps for boosting base quality from link statistics
|
|
int32_t getQualityBoostFromNumLinks ( int32_t numLinks );
|
|
int32_t getQualityBoostFromLinkQualitySum ( int32_t linkBaseQualitySum );
|
|
|
|
// 2 new maps for maxScore/scoreWeight of outgoing linkText
|
|
int32_t getLinkTextScoreWeightFromLinkerQuality ( int32_t quality );
|
|
int32_t getLinkTextScoreWeightFromLinkeeQuality ( int32_t quality );
|
|
int32_t getLinkTextMaxScoreFromQuality ( int32_t quality );
|
|
int32_t getLinkTextScoreWeightFromNumWords( int32_t numWords );
|
|
|
|
|
|
// . another new map for boosting quality from the link-adjusted
|
|
// quality of our root page
|
|
// . root page is just our site url (i.e. http://about.com/)
|
|
// . "rootQuality" is link-adjusted
|
|
int32_t getQualityBoostFromRootQuality ( int32_t rootQuality ) ;
|
|
|
|
int32_t getQuotaBoostFromRootQuality ( int32_t rootQuality ) ;
|
|
int32_t getQuotaBoostFromQuality ( int32_t quality ) ;
|
|
|
|
// if X% of the words are spammed, consider ALL the words to be spammed
|
|
int32_t getMaxPercentForSpamFromQuality ( int32_t quality ) ;
|
|
|
|
//private:
|
|
|
|
// . parses and accesses a map/graph in the xml for us
|
|
// . returns default "def" if map not present or x's in map unordered
|
|
int32_t getY (int32_t n0,int32_t n1,int32_t X,char *strx,char *stry,int32_t def) ;
|
|
*/
|
|
|
|
// these reference into m_data???
|
|
Url m_site;
|
|
//char m_coll[64];
|
|
//int32_t m_collLen;
|
|
|
|
// filenum determines the xml uniquely
|
|
int32_t m_filenum;
|
|
|
|
// did this rec have it's own entry in tagdb?
|
|
bool m_hadRec;
|
|
// did we get it by ip? (if not, we got it by canonical domain name)
|
|
bool m_gotByIp;
|
|
|
|
/*
|
|
// . the xml describing this site
|
|
// . references into an Xml stored in Sitedb class
|
|
Xml *m_xml;
|
|
*/
|
|
|
|
// a buffer for holding the little site record itself
|
|
char m_data[CATREC_BUF_SIZE];
|
|
int32_t m_dataSize;
|
|
|
|
// category ID info for catdb
|
|
unsigned char m_numCatids;
|
|
int32_t *m_catids;
|
|
int32_t m_numIndCatids;
|
|
int32_t m_indCatids[MAX_IND_CATIDS];
|
|
|
|
// version
|
|
unsigned char m_version;
|
|
/*
|
|
|
|
|
|
unsigned char m_spamBits;
|
|
unsigned char m_adultLevel;
|
|
char m_siteQuality;
|
|
|
|
uint8_t m_numTypes;
|
|
uint8_t m_numLangs;
|
|
SiteType m_siteTypes[MAX_SITE_TYPES];
|
|
SiteType m_siteLangs[MAX_SITE_TYPES];
|
|
*/
|
|
|
|
// url pointer
|
|
char *m_url;
|
|
int32_t m_urlLen;
|
|
|
|
/*
|
|
// time stamp, comment, username
|
|
int32_t m_timeStamp;
|
|
char *m_comment;
|
|
char *m_username;
|
|
|
|
// hack for addSiteType()
|
|
int32_t *m_incHere;
|
|
char *m_addHere ;
|
|
// hack for changeFilenum()
|
|
char *m_filenumPtr;
|
|
*/
|
|
};
|
|
|
|
#endif
|
|
|
|
// format of a template or default record in xml:
|
|
|
|
// ## NOTE: the key of the record is the sitename prefixed with the collection:
|
|
// ## NOTE: "collectionName:" is prefixed to all hashed terms before hashing
|
|
// ## LATER: do permission system
|
|
|
|
// ## all indexed terms will be preceded by "collection:" when indexed so you
|
|
// ## can do a search within that collection.
|
|
// <comment> %s </>
|
|
// ## <addedDate> %s </> (stored as a int32_t)
|
|
// <allowMimeType> %s </> (text, html?)
|
|
// <allowExtension> %s </> (used iff allowAllExtensions is false)
|
|
|
|
// ## the base quality of all docs from this site
|
|
// <baseQuality> %c </> (0-100%,default 30,qual of docs in site)
|
|
|
|
// ## the computed link-adjusted quality should not exceed this
|
|
// <maxQuality> %c </> (0-100%, def 100)
|
|
|
|
// ## should we treat incoming link text as if it were on our page?
|
|
// ## score weights and maxes for the link text is determined by the linker's
|
|
// ## own link-adjusted quality. (see graphs/maps below)
|
|
// <indexIncomingLinkText> %b </> (0-100, default = 100, a %)
|
|
|
|
// ## do links from this site always point to clean pages?
|
|
// <linksClean> %b </> (default no)
|
|
|
|
// ## a doc w/ link-adjusted quality LESS THAN this will not be indexed
|
|
// <minQualityToIndex> %c </> (default 0% )
|
|
|
|
// ## a doc w/ link-adjusted quality at or below this will be checked for
|
|
// ## adult content.
|
|
// <maxQualityForAdultDetect> %c </> (default 0%, 0 means none)
|
|
|
|
// ## how often do we re-spider it?
|
|
// ## we try to compute the best spider rate based on last modified times
|
|
// <minSpiderFrequency> %i </> (default 60*60*24*30=1month, in seconds)
|
|
// <maxSpiderFrequency> %i </> (default 60*60*24*30=1month, in seconds)
|
|
// <spiderLinks> %b </> (default true)
|
|
// <spiderLinkPriority> %"INT32" </> (0-7, default -1) -1 means prntPriorty-1
|
|
// <spiderMaxPriority> %"INT32" </> (0-7, default 7)
|
|
|
|
|
|
// ## these are fairly self-explanatory
|
|
// <maxUrlLen> %i </> (default 0, 0 means none)
|
|
// <minMetaRefresh> %i </> (default 6 )
|
|
// <isBanned> %b </> (default no )
|
|
// <isAdult> %b </> (default no )
|
|
// <isISP> %b </> (default no )
|
|
// <isTrusted> %b </> (default no )
|
|
// <allowAdultContent> %b </> (default yes)
|
|
// <allowCgiUrls> %b </> (default yes)
|
|
// <allowIpUrls> %b </> (default yes)
|
|
// <allowAllExtensions> %b </> (default yes)
|
|
// <allowNonAsciiDocs> %b </> (default yes)
|
|
// <delete404s> %b </> (default yes) from cache/titledb
|
|
// <indexDupContent> %b </> (default yes)
|
|
// <indexSite> %b </> (default yes) site: terms
|
|
// <indexSubSite> %b </> (default yes) subsite: terms
|
|
// <indexUrl> %b </> (default yes) url: terms
|
|
// <indexSubUrl> %b </> (default yes) suburl: terms
|
|
// <indexIp> %b </> (default yes) ip: terms
|
|
// <indexLinks> %b </> (default yes) link:/href: terms
|
|
|
|
// <maxDocs> %ul </> (default -1 = no max)
|
|
|
|
// ## we don't have a security system... yet...
|
|
// ## TODO: <maxCacheSpace> %ul </> (default 1024*1024)
|
|
// ## TODO: <directorMaxScore> %s </> (256bit seal for maxScore tag above)
|
|
|
|
// ## Now for some maps/graphs.
|
|
// ## we list the 5 X components followed by the 5 Y components.
|
|
// ## all maps/graphs linearly interpolate between the points.
|
|
// ## the edge pieces are horizontal.
|
|
// ## these maps can have up to 32 points but i typically just use 5.
|
|
|
|
// ## we map the NUMBER of incoming links to a baseQuality BOOST for our doc.
|
|
// ## the resulting new quality is the link-adjusted quality of the linkee doc.
|
|
// ## These boosts are ADDED to the existing quality.
|
|
// <numLinks11> %i </> (default 0 )
|
|
// <numLinks12> %i </> (default 5 )
|
|
// <numLinks13> %i </> (default 10 )
|
|
// <numLinks14> %i </> (default 20 )
|
|
// <numLinks15> %i </> (default 50 )
|
|
// <qualityBoost11> %i </> (default 0% )
|
|
// <qualityBoost12> %i </> (default 5% )
|
|
// <qualityBoost13> %i </> (default 10% )
|
|
// <qualityBoost14> %i </> (default 15% )
|
|
// <qualityBoost15> %i </> (default 20% )
|
|
|
|
// ## we map the SUM of the baseQuality of all linkers to a baseQuality BOOST.
|
|
// ## the resulting new quality is the link-adjusted quality of the linkee doc.
|
|
// ## we only add up BASE quality of the linkers.
|
|
// ## we only add up 1 linker's BASE quality per site.
|
|
// ## These boosts are ADDED to the existing quality.
|
|
// <linkQualitySum21> %i </> (default 0 )
|
|
// <linkQualitySum22> %i </> (default 50 )
|
|
// <linkQualitySum23> %i </> (default 100 )
|
|
// <linkQualitySum24> %i </> (default 150 )
|
|
// <linkQualitySum25> %i </> (default 200 )
|
|
// <qualityBoost21> %i </> (default 0% )
|
|
// <qualityBoost22> %i </> (default 5% )
|
|
// <qualityBoost23> %i </> (default 10% )
|
|
// <qualityBoost24> %i </> (default 15% )
|
|
// <qualityBoost25> %i </> (default 20% )
|
|
|
|
// ## we map the LINK-ADJUSTED QUALITY of our root page (site url) to a
|
|
// ## quality BOOST for us.
|
|
// ## the site url is just our site, could be like http://about.com/
|
|
// ## These boosts are ADDED to the existing quality.
|
|
// <rootQuality31> %i </> (default 0 )
|
|
// <rootQuality32> %i </> (default 50 )
|
|
// <rootQuality33> %i </> (default 100 )
|
|
// <rootQuality34> %i </> (default 200 )
|
|
// <rootQuality35> %i </> (default 500 )
|
|
// <qualityBoost31> %i </> (default 0% )
|
|
// <qualityBoost32> %i </> (default 5% )
|
|
// <qualityBoost33> %i </> (default 10% )
|
|
// <qualityBoost34> %i </> (default 15% )
|
|
// <qualityBoost35> %i </> (default 20% )
|
|
|
|
// ## TODO: make based on quality of doc and length of link text!!
|
|
// ## currently we limit link text to up to 256 chars in LinkInfo.cpp.
|
|
// ## map doc's link-adjusted quality to scoreWeight of its outgoing link text
|
|
// <quality41> %i </> (default 0% )
|
|
// <quality42> %i </> (default 30% )
|
|
// <quality43> %i </> (default 50% )
|
|
// <quality44> %i </> (default 70% )
|
|
// <quality45> %i </> (default 85% )
|
|
// <linkTextScoreWeight41> %i </> (default 50% )
|
|
// <linkTextScoreWeight42> %i </> (default 100% )
|
|
// <linkTextScoreWeight43> %i </> (default 130% )
|
|
// <linkTextScoreWeight44> %i </> (default 180% )
|
|
// <linkTextScoreWeight45> %i </> (default 250% )
|
|
|
|
// ## map doc's link-adjusted quality to maxScore of its outgoing link text.
|
|
// ## maxScore applies to all docs from this site as to limit a site's impact.
|
|
// <quality51> %i </>
|
|
// <quality52> %i </>
|
|
// <quality53> %i </>
|
|
// <quality54> %i </>
|
|
// <quality55> %i </>
|
|
// <linkTextMaxScore51> %i </>
|
|
// <linkTextMaxScore52> %i </>
|
|
// <linkTextMaxScore53> %i </>
|
|
// <linkTextMaxScore54> %i </>
|
|
// <linkTextMaxScore55> %i </>
|
|
|
|
// ## we map the LINK-ADJUSTED QUALITY of our ROOT page (site url) to a quota
|
|
// ## boost. (can be negative)
|
|
// ## the site url is just our site, could be like http://about.com/
|
|
// ## These boosts are MULTIPLIED by the existing quota.
|
|
// <rootQuality71> %i </> (default 0 )
|
|
// <rootQuality72> %i </> (default 50 )
|
|
// <rootQuality73> %i </> (default 100 )
|
|
// <rootQuality74> %i </> (default 200 )
|
|
// <rootQuality75> %i </> (default 500 )
|
|
// <quotaBoost71> %i </> (default 0% )
|
|
// <quotaBoost72> %i </> (default 0% )
|
|
// <quotaBoost73> %i </> (default 0% )
|
|
// <quotaBoost74> %i </> (default 0% )
|
|
// <quotaBoost75> %i </> (default 0% )
|
|
|
|
// ## we map the LINK-ADJUSTED QUALITY of our page (site url) to a quota
|
|
// ## boost. (can be negative)
|
|
// ## the site url is just our site, could be like http://about.com/
|
|
// ## These boosts are MULTIPLIED by the existing quota.
|
|
// <quality81> %i </> (default 0 )
|
|
// <quality82> %i </> (default 50 )
|
|
// <quality83> %i </> (default 100 )
|
|
// <quality84> %i </> (default 200 )
|
|
// <quality85> %i </> (default 500 )
|
|
// <quotaBoost81> %i </> (default 0% )
|
|
// <quotaBoost82> %i </> (default 0% )
|
|
// <quotaBoost83> %i </> (default 0% )
|
|
// <quotaBoost84> %i </> (default 0% )
|
|
// <quotaBoost85> %i </> (default 0% )
|
|
|
|
// ## the <index> node describes parsing/indexing rules
|
|
// ## used for xhtml tags (title, meta summary/keywords/description)
|
|
// ## NOTE: <score2> <weight2> defines a point on the #words-to-score function
|
|
// ## NOTE: omit <name> to index whole body (excludes meta tags and xml tags)
|
|
// ## NOTE: set <name> to "meta.summary" for indexing meta tag summary
|
|
// ## NOTE: set <name> to "meta.keywords" for indexing meta tag keywords
|
|
// ## NOTE: set <name> to "meta.description" for indexing meta tag description
|
|
// ## NOTE: set <name> to "Xml" for indexing ALL xml tags
|
|
// ## NOTE: set <name> to ??? for indexing text under that tag <???>...</>
|
|
// <index>
|
|
// <name> %s </> ("title","meta.summary","Xml","W")
|
|
// <indexAsName> %s </> (for mapping pure xml tags)
|
|
// <prefix> %s </> (like "title", "myTag:" -can omit)
|
|
// <maxQualityForSpamDetect> %c </> (default 0, 0 means none)
|
|
// <minQualityToIndex> %ul </> (0-255, default 0 ) do not index
|
|
// <minDepth> %ul </> (0-inf, default 0 )
|
|
// <maxDepth> %ul </> (0-inf, default inf)
|
|
// <maxLenToIndex> %ul </> (0-inf, default inf)
|
|
// <indexAllOccurences> %b </> (default no) (ex.: no for title)
|
|
// <indexCRC> %b </> (default no ) index checksum?
|
|
// <filterHtmlEntities> %b </> (default yes)
|
|
// <indexIfUniqueOnly> %b </> (default no ) hash word iff unique
|
|
// <indexSingletons> %b </> (default yes)
|
|
// <indexPhrases> %b </> (default yes)
|
|
// <indexAsWhole> %b </> (default no ) hash a checksum
|
|
// <useStopWords> %b </> (default yes)
|
|
// <useStems> %b </> (default yes)
|
|
//
|
|
// ## Map doc's (link-adjusted) quality to a maxLen for this field.
|
|
// ## 30% quality is probably average.
|
|
// ## NOTE: there really are no defaults for these, use tagdb default rec.
|
|
// <quality11> %c </> (default 15% )
|
|
// <quality12> %c </> (default 30% )
|
|
// <quality13> %c </> (default 45% )
|
|
// <quality14> %c </> (default 60% )
|
|
// <quality15> %c </> (default 80% )
|
|
// <maxLen11> %ul </> (default 80k )
|
|
// <maxLen12> %ul </> (default 100k)
|
|
// <maxLen13> %ul </> (default 150k)
|
|
// <maxLen14> %ul </> (default 200k)
|
|
// <maxLen15> %ul </> (default 250k)
|
|
//
|
|
// ## Map doc's (link-adjusted) quality to a maxScore for this field.
|
|
// <quality21> %c </> (default 15% )
|
|
// <quality22> %c </> (default 30% )
|
|
// <quality23> %c </> (default 45% )
|
|
// <quality24> %c </> (default 60% )
|
|
// <quality25> %c </> (default 80% )
|
|
// <maxScore21> %ul </> (default 30% )
|
|
// <maxScore22> %ul </> (default 45% )
|
|
// <maxScore23> %ul </> (default 60% )
|
|
// <maxScore24> %ul </> (default 80% )
|
|
// <maxScore25> %ul </> (default 100%)
|
|
//
|
|
// ## map doc (link-adjusted) quality to a scoreWeight for this field
|
|
// <quality31> %c </> (default 15% )
|
|
// <quality32> %c </> (default 30% )
|
|
// <quality33> %c </> (default 45% )
|
|
// <quality34> %c </> (default 60% )
|
|
// <quality35> %c </> (default 80% )
|
|
// <scoreWeight31> %ul </> (default 60% )
|
|
// <scoreWeight32> %ul </> (default 100%)
|
|
// <scoreWeight33> %ul </> (default 150%)
|
|
// <scoreWeight34> %ul </> (default 200%)
|
|
// <scoreWeight35> %ul </> (default 250%)
|
|
//
|
|
// ## map field length to a scoreWeight for this field
|
|
// <len41> %ul </> (default 100) #w<100 -->wght=300
|
|
// <len42> %ul </> (default 500) score in[200,300]
|
|
// <len43> %ul </> (default 1000)
|
|
// <len44> %ul </> (default 2000)
|
|
// <len45> %ul </> (default 5000) if under/over 5000
|
|
// <scoreWeight41> %ul </> (default 300%)
|
|
// <scoreWeight42> %ul </> (default 200%)
|
|
// <scoreWeight43> %ul </> (default 150%)
|
|
// <scoreWeight44> %ul </> (default 100%)
|
|
// <scoreWeight45> %ul </> (default 50%)
|
|
//
|
|
// ## map field length to a maxScore for this field
|
|
// <len51> %ul </> (default 100) #w<100 -->wght=300
|
|
// <len52> %ul </> (default 500) score in[200,300]
|
|
// <len53> %ul </> (default 1000)
|
|
// <len54> %ul </> (default 2000)
|
|
// <len55> %ul </> (default 5000) if under/over 5000
|
|
// <maxScore51> %ul </> (default 30% )
|
|
// <maxScore52> %ul </> (default 45% )
|
|
// <maxScore53> %ul </> (default 60% )
|
|
// <maxScore54> %ul </> (default 80% )
|
|
// <maxScore55> %ul </> (default 100%)
|
|
//
|
|
// </>
|
|
|
|
// TODO:
|
|
// <indexAsLong>, <indexAsBool>, ... for pure xml tags w/ special meaning
|
|
//
|