mirror of https://github.com/privacore/open-source-search-engine.git
synced 2025-07-15 02:36:08 -04:00
Merge branch 'master' of github.com:privacore/open-source-search-engine
This commit is contained in: test

Files changed (75):
Bits.h, Clusterdb.h, Collectiondb.cpp, Collectiondb.h, Conf.h, Highlight.cpp, Hostdb.cpp, Hostdb.h, HttpServer.cpp, Images.cpp, Linkdb.cpp, Log.cpp, Log.h, Makefile, Matches.cpp, Msg0.cpp, Msg2.cpp, Msg20.cpp, Msg20.h, Msg39.cpp, Msg39.h, Msg3a.cpp, Msg3a.h, Msg40.cpp, Msg40.h, Msg5.cpp, Multicast.cpp, Multicast.h, PageBasic.cpp, PageCrawlBot.cpp, PageParser.cpp, PageParser.h, PageReindex.cpp, PageReindex.h, PageResults.cpp, PageRoot.cpp, PageRoot.h, PageSockets.cpp, Pages.cpp, Parms.cpp, Phrases.cpp, Phrases.h, Pos.h, Posdb.cpp, Posdb.h, PostQueryRerank.cpp, Process.cpp, Profiler.h, Query.cpp, Query.h, Rdb.cpp, Repair.cpp, Repair.h, Revdb.cpp, Revdb.h, SafeBuf.cpp, SafeBuf.h, SearchInput.cpp, SearchInput.h, Sections.cpp, Sections.h, Spider.h, Statsdb.cpp, StopWords.cpp, StopWords.h, Summary.cpp, Title.h, Words.cpp, Words.h, XmlDoc.cpp, XmlDoc.h, XmlDoc_Indexing.cpp, gb-include.h, main.cpp, qa.cpp
Bits.h (19 changes)
@@ -47,24 +47,21 @@
// set by Sections.cpp::setMenu() function
#define D_IN_LINK 0x0400

// in the place name part of an address?
//#define D_UNUSED_2 0x0800
//#define D_UNUSED 0x0800

// allow for dows for texasdrums.org, so TUESDAYS is set with this and
// we can keep it as part of the sentence and not split on the colon
//#define D_IS_IN_DATE_2 0x1000
// this is so we can still set EV_HASTITLEBYVOTES if a tod date is in the
// title, all other dates are no-no!
#define D_IS_DAYNUM 0x1000
// for setting event titles in Events.cpp
#define D_GENERIC_WORD 0x2000
#define D_CRUFTY 0x4000
#define D_IS_NUM 0x00008000
//#define D_UNUSED_3 0x00010000
#define D_IS_IN_URL 0x00020000
//#define D_UNUSED 0x4000
#define D_IS_NUM 0x00008000
//#define D_UNUSED 0x00010000
#define D_IS_IN_URL 0x00020000
// like D_IS_TOD above
#define D_IS_MONTH 0x00040000
#define D_IS_HEX_NUM 0x00080000

//
// the bits below here are used for Summary.cpp when calling
// Bits::setForSummary()
Clusterdb.h (57 changes)
@@ -36,12 +36,9 @@

// these are now just TitleRec keys
#define CLUSTER_REC_SIZE (sizeof(key_t))
// this now includes the gigabit vector
#define VECTOR_REC_SIZE (sizeof(key_t)+SAMPLE_VECTOR_SIZE+GIGABIT_VECTOR_SIZE)

class Clusterdb {

public:

// reset rdb
void reset();
@@ -85,16 +82,6 @@ class Clusterdb {
// convert a titlerec key into a clusterec key
key_t convertTitleRecKey ( key_t titleKey );

/*
uint32_t getGroupId ( int64_t docId ) {
return g_titledb.getGroupId ( docId ); };

// cluster rec should be stored on same host as titleRec with the
// same docId that this key contains
uint32_t getGroupIdFromKey ( key_t *key ) {
return g_titledb.getGroupId ( getDocId ( *key ) ); };
*/

// NOTE: THESE NOW USE THE REAL CLUSTERDB REC
// // docId occupies the most significant bytes of the key
// now docId occupies the bits after the first 23
@@ -106,10 +93,6 @@ class Clusterdb {
return docId;
};

//int64_t getDocId ( char *r ) {
// return getDocId(*(key_t*)r);
//}

uint32_t getSiteHash26 ( const char *r ) {
//return g_titledb.getSiteHash ( (key_t *)r ); };
return ((uint32_t)(((const key_t*)r)->n0 >> 2) & 0x03FFFFFF);
@@ -124,52 +107,16 @@ class Clusterdb {
return ((unsigned char)(((const key_t*)r)->n0 >> 28) & 0x0000003F);
}

// NOTE: THESE USE THE OLD "CLUSTERDB" REC GENERATED BY MSG22 (VECTOR)
//uint32_t getContentHash ( char *r ) {
// return g_titledb.getContentHash ( *(key_t *)r ); };

char getFamilyFilter ( const char *r ) {
if ( (*(const int64_t *)r) & 0x0000000400000000LL ) return 1;
return 0;
};

//uint32_t hasAdultWords ( char *r ) {
// return g_titledb.hasAdultWords ( *(key_t *)r ); };

//uint32_t hasAdultCategory ( char *r ) {
// return g_titledb.hasAdultCategory ( *(key_t *)r ); };

//unsigned char getLanguageFromVector ( char *r ) {
// return 0;
//}

// the random sample vector
/*
void getSampleVector ( char *vec ,
class Doc *doc,
char *coll ,
int32_t collLen ,
int32_t niceness = 0 );
*/
//void getSampleVector ( char *vec , class TermTable *table );
char getSampleSimilarity ( char *vec0 , char *vec1 , int32_t size );
// get the content vector from a cluster rec (used by Msg38.cpp)
//char *getSampleVector ( char *rec ) { return rec + sizeof(key_t); };

//char *getGigabitVector ( char *rec ) {
// return rec + sizeof(key_t) + SAMPLE_VECTOR_SIZE ; };
//char getGigabitSimilarity ( char *vec0 , char *vec1 ,
// int32_t *qtable , int32_t numSlots ) ;

//DiskPageCache *getDiskPageCache() { return &m_pc; };

private:

// this rdb holds urls waiting to be spidered or being spidered
Rdb m_rdb;

//DiskPageCache m_pc;
};

extern class Clusterdb g_clusterdb;
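A note on the accessors that survive above: the cluster rec is now just a titlerec-style key, and fields such as the 26-bit site hash and the family-filter bit are pulled straight out of the key's low 64 bits with shift-and-mask. A minimal self-contained sketch of that idiom (the struct below is a simplified stand-in, not the repo's real 12-byte key_t):

#include <cstdint>

// simplified stand-in for gb's key_t: n0 holds the low 64 bits
struct key96_t { uint64_t n0; uint32_t n1; };

// 26-bit site hash stored at bits 2..27 of n0, as in getSiteHash26()
uint32_t siteHash26 ( const key96_t *k ) {
	return (uint32_t)(k->n0 >> 2) & 0x03FFFFFF;
}

// family-filter flag stored at bit 34 of n0, as in getFamilyFilter()
bool familyFilter ( const key96_t *k ) {
	return ( k->n0 & 0x0000000400000000ULL ) != 0;
}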
Collectiondb.cpp
@@ -456,8 +456,6 @@ bool Collectiondb::addNewColl ( char *coll ,
cr->m_collectiveRespiderFrequency = 0.0;
//cr->m_restrictDomain = true;
// reset the crawl stats
// always turn off gigabits so &s=1000 can do summary skipping
cr->m_docsToScanForTopics = 0;
// turn off link voting, etc. to speed up
cr->m_getLinkInfo = false;
cr->m_computeSiteNumInlinks = false;
@@ -1283,12 +1281,12 @@ char *Collectiondb::getDefaultColl ( HttpRequest *r ) {

// . get collectionRec from name
// . returns NULL if not available
CollectionRec *Collectiondb::getRec ( char *coll ) {
CollectionRec *Collectiondb::getRec ( const char *coll ) {
if ( ! coll ) coll = "";
return getRec ( coll , gbstrlen(coll) );
}

CollectionRec *Collectiondb::getRec ( char *coll , int32_t collLen ) {
CollectionRec *Collectiondb::getRec ( const char *coll , int32_t collLen ) {
if ( ! coll ) coll = "";
collnum_t collnum = getCollnum ( coll , collLen );
if ( collnum < 0 ) return NULL;
@@ -1333,14 +1331,14 @@ char *Collectiondb::getCollName ( collnum_t collnum ) {
return m_recs[collnum]->m_coll;
}

collnum_t Collectiondb::getCollnum ( char *coll ) {
collnum_t Collectiondb::getCollnum ( const char *coll ) {

int32_t clen = 0;
if ( coll ) clen = gbstrlen(coll );
return getCollnum ( coll , clen );
}

collnum_t Collectiondb::getCollnum ( char *coll , int32_t clen ) {
collnum_t Collectiondb::getCollnum ( const char *coll , int32_t clen ) {

// default empty collection names
if ( coll && ! coll[0] ) coll = NULL;
@@ -1674,9 +1672,6 @@ bool CollectionRec::load ( char *coll , int32_t i ) {
// fix for diffbot, spider time deduping
if ( m_isCustomCrawl ) m_dedupingEnabled = true;

// always turn off gigabits so &s=1000 can do summary skipping
if ( m_isCustomCrawl ) m_docsToScanForTopics = 0;

// make min to merge smaller than normal since most collections are
// small and we want to reduce the # of vfds (files) we have
if ( m_isCustomCrawl ) {

Collectiondb.h
@@ -61,8 +61,8 @@ class Collectiondb {
bool m_needsSave;

// returns i so that m_recs[i].m_coll = coll
collnum_t getCollnum ( char *coll , int32_t collLen );
collnum_t getCollnum ( char *coll ); // coll is NULL terminated here
collnum_t getCollnum ( const char *coll , int32_t collLen );
collnum_t getCollnum ( const char *coll ); // coll is NULL terminated here

char *getCollName ( collnum_t collnum );
char *getColl ( collnum_t collnum ) {return getCollName(collnum);};
@@ -79,9 +79,9 @@ class Collectiondb {

// . get collectionRec from name
// returns NULL if not available
class CollectionRec *getRec ( char *coll );
class CollectionRec *getRec ( const char *coll );

class CollectionRec *getRec ( char *coll , int32_t collLen );
class CollectionRec *getRec ( const char *coll , int32_t collLen );

class CollectionRec *getRec ( collnum_t collnum);

@@ -501,8 +501,7 @@ class CollectionRec {
float m_updateVotesFreq ; // in days. replaced m_recycleVotes
float m_sortByDateWeight ;

char m_dedupURLDefault ;
int32_t m_topicSimilarCutoffDefault ;
char m_dedupURLDefault ;
char m_useNewDeduping ;
char m_doTierJumping ;
float m_numDocsMultiplier ;
@@ -716,20 +715,6 @@ class CollectionRec {

int32_t m_compoundListMaxSize;

// . related topics control
// . this can all be overridden by passing in your own cgi parms
// for the query request
int32_t m_numTopics; // how many do they want by default?
int32_t m_minTopicScore;
int32_t m_docsToScanForTopics; // how many to scan by default?
int32_t m_maxWordsPerTopic;
int32_t m_minDocCount; // min docs that must contain topic
char m_ipRestrict;
int32_t m_dedupSamplePercent;
char m_topicRemoveOverlaps; // this is generally a good thing
int32_t m_topicSampleSize; // sample about 5k per document
int32_t m_topicMaxPunctLen; // keep it set to 1 for speed

// SPELL CHECK
char m_spellCheck;

@@ -887,26 +872,15 @@ class CollectionRec {
// post query reranking
int32_t m_pqr_docsToScan; // also for # docs for language
float m_pqr_demFactCountry; // demotion for foreign countries
float m_pqr_demFactQTTopicsInUrl; // demotion factor fewer for query terms or gigabits in the url
int32_t m_pqr_maxValQTTopicsInUrl; // max value for fewer query terms or gigabits in the url
float m_pqr_demFactPaths; // demotion factor for more paths
int32_t m_pqr_maxValPaths; // max value for more paths
float m_pqr_demFactCatidHasSupers; // demotion factor for catids with many super topics
int32_t m_pqr_maxValCatidHasSupers; // max value for catids with many super topics
float m_pqr_demFactPageSize; // demotion factor for higher page sizes
int32_t m_pqr_maxValPageSize; // max value for higher page sizes
float m_pqr_demFactLocTitle; // demotion factor for non-location specific queries with location specific results
float m_pqr_demFactLocSummary; // demotion factor for non-location specific queries with location specific results
bool m_pqr_demInTopics; // true to demote if location is in the gigabits, otherwise these locs won't be demoted
int32_t m_pqr_maxValLoc; // max value for non-location specific queries with location specific results
float m_pqr_demFactNonHtml; // demotion factor for non-html content type
float m_pqr_demFactXml; // demotion factor for xml content type
float m_pqr_demFactOthFromHost; // demotion factor for no other pages from same host
int32_t m_pqr_maxValOthFromHost; // max value for no other pages from same host
float m_pqr_demFactDmozCatNmNoQT; // demotion factor for dmoz category names that don't contain a query term
int32_t m_pqr_maxValDmozCatNmNoQT; // max value for dmoz category names that don't contain a query term
float m_pqr_demFactDmozCatNmNoGigabits; // demotion factor for dmoz category names that don't contain a gigabit
int32_t m_pqr_maxValDmozCatNmNoGigabits; // max value for dmoz category names that don't contain a gigabit
float m_pqr_demFactDatedbDate; // demotion for datedb date
int32_t m_pqr_minValDatedbDate; // dates earlier than this will be demoted to the max
int32_t m_pqr_maxValDatedbDate; // dates later than this will not be demoted
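The Collectiondb changes above are a const-correctness pass: getRec() and getCollnum() now take const char *, so callers holding read-only collection names no longer need casts. A small self-contained sketch of the same overload pattern; Rec and findRec are hypothetical stand-ins, not the repo's classes:

#include <cstdint>
#include <cstdio>
#include <cstring>

struct Rec { const char *m_coll; };
static Rec g_recs[] = { { "main" } , { "demo" } };

Rec *findRec ( const char *coll , int32_t collLen ) {
	for ( Rec &r : g_recs )
		if ( (int32_t)strlen(r.m_coll) == collLen &&
		     strncmp ( r.m_coll , coll , collLen ) == 0 )
			return &r;
	return NULL;
}

// NULL-terminated convenience overload, like getRec(const char *coll)
Rec *findRec ( const char *coll ) {
	if ( ! coll ) coll = ""; // same NULL -> "" defaulting as the diff
	return findRec ( coll , (int32_t)strlen(coll) );
}

int main ( ) {
	if ( Rec *r = findRec ( "demo" ) ) printf ( "found %s\n" , r->m_coll );
	return 0;
}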
Conf.h (47 changes)
@@ -147,7 +147,6 @@ class Conf {
// tagdb parameters
int32_t m_tagdbMaxTreeMem;

int32_t m_revdbMaxTreeMem;
int32_t m_timedbMaxTreeMem;

// clusterdb for site clustering, each rec is 16 bytes
@@ -173,6 +172,9 @@ class Conf {
int32_t m_sendEmailTimeout;
int32_t m_pingSpacer;

int64_t m_msg40_msg39_timeout; // timeout for entire get-docid-list phase, in milliseconds.
int64_t m_msg3a_msg39_network_overhead; // additional latency/overhead of sending request+response over network.

// the spiderdb holds url records for spidering, when to spider, etc..
int32_t m_maxWriteThreads ;
int32_t m_spiderMaxDiskThreads ;
@@ -184,7 +186,6 @@ class Conf {
bool m_useStatsdb;

bool m_spideringEnabled ;
bool m_turkingEnabled ;
bool m_injectionsEnabled ;
bool m_queryingEnabled ;
bool m_returnResultsAnyway;
@@ -385,8 +386,6 @@ class Conf {
bool m_detectMemLeaks;

// . if false we will not keep spelling information in memory
// . we will keep the popularity info from dict though, since related
// topics requires that
bool m_doSpellChecking;

// are we running in Matt Wells's private data center? if so we
@@ -395,23 +394,6 @@ class Conf {

bool m_forceIt;

// maximum number of synonyms/stems to expand a word into
//int32_t m_maxSynonyms;

// default affinity for spelling suggestions/numbers
//float m_defaultAffinity;

// threshold for synonym usage
//float m_frequencyThreshold;

// thesaurus configuration
//int32_t m_maxAffinityRequests;
//int32_t m_maxAffinityErrors;
//int32_t m_maxAffinityAge;
//int32_t m_affinityTimeout;
//char m_affinityServer[MAX_URL_LEN];
//char m_affinityParms[MAX_URL_LEN];

// new syncing information
bool m_syncEnabled;
bool m_syncIndexdb;
@@ -561,7 +543,6 @@ class Conf {
bool m_logDebugThread ;
bool m_logDebugTimedb ;
bool m_logDebugTitle ;
bool m_logDebugTopics ;
bool m_logDebugTopDocs ;
bool m_logDebugUdp ;
bool m_logDebugUnicode ;
@@ -586,7 +567,6 @@ class Conf {
bool m_logTimingNet;
bool m_logTimingQuery;
bool m_logTimingSpcache;
bool m_logTimingTopics;
// programmer reminders.
bool m_logReminders;

@@ -653,17 +633,6 @@ class Conf {
int32_t m_maxHeartbeatDelay;
int32_t m_maxCallbackDelay;

// balance value for Msg6, each host can have this many ready domains
// per global host
//int32_t m_distributedSpiderBalance;
//int32_t m_distributedIpWait;

// parameters for indexdb splitting and tfndb extension bits
//int32_t m_indexdbSplit;
//char m_fullSplit;
//char m_legacyIndexdbSplit;
//int32_t m_tfndbExtBits;

// used by Repair.cpp
char m_repairingEnabled ;
int32_t m_maxRepairSpiders ;
@@ -673,23 +642,13 @@ class Conf {
char m_fullRebuild ;
char m_rebuildAddOutlinks;
char m_rebuildRecycleLinkInfo ;
//char m_rebuildRecycleLinkInfo2 ;
//char m_removeBadPages ;
char m_rebuildTitledb ;
//char m_rebuildTfndb ;
//char m_rebuildIndexdb ;
char m_rebuildPosdb ;
//char m_rebuildNoSplits ;
//char m_rebuildDatedb ;
char m_rebuildClusterdb ;
char m_rebuildSpiderdb ;
//char m_rebuildSitedb ;
char m_rebuildLinkdb ;
//char m_rebuildTagdb ;
//char m_rebuildPlacedb ;
char m_rebuildTimedb ;
char m_rebuildSectiondb ;
//char m_rebuildRevdb ;
char m_rebuildRoots ;
char m_rebuildNonRoots ;
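The two new Conf fields give the msg40/msg3a/msg39 pipeline explicit timeout knobs. A sketch of how a caller can combine them, mirroring the Msg3a.cpp hunk further down (the 250 ms constant is from that hunk; the 30,000 ms cap is an assumed placeholder, not a repo constant):

#include <cstdint>
#include <algorithm>

// base request timeout, plus the fixed 250 ms overhead Msg3a adds, plus the
// configurable network overhead, clamped to a maximum
int64_t effectiveTimeoutMs ( int64_t requestTimeoutMs ,
                             int64_t networkOverheadMs ,
                             int64_t maximumMs = 30000 ) { // assumed cap
	return std::min ( requestTimeoutMs + 250 + networkOverheadMs ,
	                  maximumMs );
}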
Highlight.cpp
@@ -71,7 +71,7 @@ int32_t Highlight::set( SafeBuf *sb, char *content, int32_t contentLen, Query *q
}

Phrases phrases;
if ( !phrases.set( &words, &bits, true, false, version, niceness ) ) {
if ( !phrases.set( &words, &bits, version, niceness ) ) {
return -1;
}
Hostdb.cpp (30 changes)
@@ -16,7 +16,6 @@
#include "Clusterdb.h"
#include "Datedb.h"
#include "Dns.h"
#include "Revdb.h"

// a global class extern'd in .h file
Hostdb g_hostdb;
@@ -1760,7 +1759,7 @@ int64_t Hostdb::getNumGlobalEvents ( ) {
return n / m_numHostsPerShard;
}

bool Hostdb::setNote ( int32_t hostId, char *note, int32_t noteLen ) {
bool Hostdb::setNote ( int32_t hostId, const char *note, int32_t noteLen ) {
// replace the note on the host
if ( noteLen > 125 ) noteLen = 125;
Host *h = getHost ( hostId );
@@ -1773,7 +1772,7 @@ bool Hostdb::setNote ( int32_t hostId, char *note, int32_t noteLen ) {
return saveHostsConf();
}

bool Hostdb::setSpareNote ( int32_t spareId, char *note, int32_t noteLen ) {
bool Hostdb::setSpareNote ( int32_t spareId, const char *note, int32_t noteLen ) {
// replace the note on the host
if ( noteLen > 125 ) noteLen = 125;
Host *h = getSpare ( spareId );
@@ -2354,8 +2353,7 @@ int32_t getShardNumFromTermId ( int64_t termId ) {
// . this allows us to have any # of groups in a stripe, not just power of 2
// . now we can use 3 stripes of 96 hosts each so spiders will almost never
// go down
//uint32_t Hostdb::getGroupId ( char rdbId,void *k,bool split ) {
uint32_t Hostdb::getShardNum ( char rdbId, const void *k ) { // ,bool split ) {
uint32_t Hostdb::getShardNum ( char rdbId, const void *k ) {

if ( (rdbId == RDB_POSDB || rdbId == RDB2_POSDB2) &&
// split by termid and not docid?
@@ -2372,10 +2370,6 @@ uint32_t Hostdb::getShardNum ( char rdbId, const void *k ) {
uint64_t d = g_posdb.getDocId ( k );
return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
}
//if ( rdbId == RDB_INDEXDB || rdbId == RDB2_INDEXDB2 ) {
// uint64_t d = g_indexdb.getDocId ( (key_t *)k );
// return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
//}
else if ( rdbId == RDB_DATEDB || rdbId == RDB2_DATEDB2 ) {
uint64_t d = g_datedb.getDocId ( k );
return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
@@ -2383,10 +2377,6 @@ uint32_t Hostdb::getShardNum ( char rdbId, const void *k ) {
else if ( rdbId == RDB_LINKDB || rdbId == RDB2_LINKDB2 ) {
return m_map [(*(uint16_t *)((char *)k + 26))>>3];
}
//else if ( rdbId == RDB_TFNDB || rdbId == RDB2_TFNDB2 ) {
// uint64_t d = g_tfndb.getDocId ( (key_t *)k );
// return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
//}
else if ( rdbId == RDB_TITLEDB || rdbId == RDB2_TITLEDB2 ) {
uint64_t d = g_titledb.getDocId ( (key_t *)k );
return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
@@ -2416,23 +2406,11 @@ uint32_t Hostdb::getShardNum ( char rdbId, const void *k ) {
rdbId == RDB2_TAGDB2 ) {
return m_map [(*(uint16_t *)((char *)k + 10))>>3];
}
else if ( rdbId == RDB_DOLEDB ) { // || rdbId == RDB2_DOLEDB2 ) {
else if ( rdbId == RDB_DOLEDB ) {
// HACK:!!!!!! this is a trick!!! it is us!!!
//return g_hostdb.m_myHost->m_groupId;
return g_hostdb.m_myHost->m_shardNum;
}
else if ( rdbId == RDB_SECTIONDB || rdbId == RDB2_SECTIONDB2 ) {
// use top 13 bits of key
return m_map [(*(uint16_t *)((char *)k + 14))>>3];
//uint64_t d = g_datedb.getDocId ( k );
//return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
}
else if ( rdbId == RDB_REVDB || rdbId == RDB2_REVDB2 ) {
// key is formed like title key is
//int64_t d = g_titledb.getDocId ( (key_t *)k );
uint64_t d = g_revdb.getDocId( (key_t *)k );
return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
}

// core -- must be provided
char *xx = NULL; *xx = 0;
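getShardNum() above maps a key to a shard by folding the docid into a slot table. A standalone sketch of that hash (MAX_KSLOTS and the table contents are illustrative; in Hostdb the map is built from hosts.conf):

#include <cstdint>

enum { MAX_KSLOTS = 8192 };         // illustrative; must be a power of two
static uint16_t s_map[MAX_KSLOTS];  // slot -> shard, built from hosts.conf

// same fold used above for posdb/datedb/titledb/revdb keys
uint32_t shardForDocId ( uint64_t d ) {
	return s_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
}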
Hostdb.h (4 changes)
@@ -593,8 +593,8 @@ class Hostdb {

// sets the note for a host
bool setNote ( int32_t hostId, char *note, int32_t noteLen );
bool setSpareNote ( int32_t spareId, char *note, int32_t noteLen );
bool setNote ( int32_t hostId, const char *note, int32_t noteLen );
bool setSpareNote ( int32_t spareId, const char *note, int32_t noteLen );

// replace a host with a spare
bool replaceHost ( int32_t origHostId, int32_t spareHostId );
HttpServer.cpp
@@ -10,6 +10,7 @@
#include "Proxy.h"
#include "PageCrawlBot.h"
#include "Parms.h"
#include "PageRoot.h"
#ifdef _VALGRIND_
#include <valgrind/memcheck.h>
#endif
@@ -471,76 +472,6 @@ void HttpServer::requestHandler ( TcpSocket *s ) {
// parse the http request
HttpRequest r;

// debug
/*
unsigned char foo[1024];
unsigned char *pp = foo;
pp += sprintf ( (char *)pp,"GET /search?qcs=iso-8859-1&k0c=107207&code=1M9VNT6&spell=1&ns=2&nrt=0&rat=0&sc=1&DR=1&qh=0&bq2&q=");
//pp += sprintf ( (char *)pp,"GET /search?k0c=107207&code=1M9VNT6&spell=1&ns=2&nrt=0&rat=0&sc=1&DR=1&qh=0&bq2&q=");

static char ddd[] = {
0xc3, 0x83, 0xc6, 0x92, 0xc3, 0xa2, 0xe2, 0x80, 0x9e, 0xc2,
0xa2, 0xc3, 0x83, 0xc2, 0xa2, 0xc3, 0xa2, 0xe2, 0x80, 0x9a,
0xc2, 0xac, 0xc3, 0x82, 0xc2, 0xa6, 0xc3, 0x83, 0xc6, 0x92,
0xc3, 0xa2, 0xe2, 0x80, 0x9e, 0xc2, 0xa2, 0xc3, 0x83, 0xe2,
0x80, 0x9a, 0xc3, 0x82, 0xc2, 0x81, 0xc3, 0x83, 0xc6, 0x92,
0xc3, 0xa2, 0xe2, 0x80, 0x9e, 0xc2, 0xa2, 0xc3, 0x83, 0xc2,

0xa2, 0xc3, 0xa2, 0xe2, 0x80, 0x9a, 0xc2, 0xac, 0xc3, 0x82,
0xc2, 0xa1, 0xc3, 0x83, 0xc6, 0x92, 0xc3, 0xa2, 0xe2, 0x80,
0x9e, 0xc2, 0xa2, 0xc3, 0x83, 0xe2, 0x80, 0xb9, 0xc3, 0xa2,
0xe2, 0x82, 0xac, 0xc2, 0xa0, 0xc3, 0x83, 0xc6, 0x92, 0xc3,
0xa2, 0xe2, 0x80, 0x9e, 0xc2, 0xa2, 0xc3, 0x83, 0xc2, 0xa2,
0xc3, 0xa2, 0xe2, 0x80, 0x9a, 0xc2, 0xac, 0xc3, 0x82, 0xc2,
0xa6, 0x20, 0xc3, 0x83, 0xc6, 0x92, 0xc3, 0x8b, 0xc5, 0x93,
0xc3, 0x83, 0xe2, 0x80, 0x9a, 0xc3, 0x82, 0xc2, 0xa7, 0xc3,
0x83, 0xc6, 0x92, 0xc3, 0xa2, 0xe2, 0x80, 0x9e, 0xc2, 0xa2,
0xc3, 0x83, 0xc2, 0xa2, 0xc3, 0xa2, 0xe2, 0x80, 0x9a, 0xc2,
0xac, 0xc3, 0x85, 0xc2, 0xbe, 0xc3, 0x83, 0xc6, 0x92, 0xc3,
0xa2, 0xe2, 0x80, 0x9e, 0xc2, 0xa2, 0xc3, 0x83, 0xc2, 0xa2,
0xc3, 0xa2, 0xe2, 0x80, 0x9a, 0xc2, 0xac, 0xc3, 0x82, 0xc2,
0xa6, 0xc3, 0x83, 0xc6, 0x92, 0xc3, 0xa2, 0xe2, 0x80, 0x9e,
0xc2, 0xa2, 0xc3, 0x83, 0xc2, 0xa2, 0xc3, 0xa2, 0xe2, 0x80,
0x9a, 0xc2, 0xac, 0xc3, 0x82, 0xc2, 0xa0, 0xc3, 0x83, 0xc6,
0x92, 0xc3, 0x8b, 0xc5, 0x93, 0xc3, 0x83, 0xe2, 0x80, 0x9a,
0xc3, 0x82, 0xc2, 0xb8, 0xc3, 0x83, 0xc6, 0x92, 0xc3, 0xa2,
0xe2, 0x80, 0x9e, 0xc2, 0xa2, 0xc3, 0x83, 0xe2, 0x80, 0xb9,
0xc3, 0xa2, 0xe2, 0x82, 0xac, 0xc2, 0xa0, 0xc3, 0x83, 0xc6,
0x92, 0xc3, 0xa2, 0xe2, 0x80, 0x9e, 0xc2, 0xa2, 0xc3, 0x83,
0xc2, 0xa2, 0xc3, 0xa2, 0xe2, 0x80, 0x9a, 0xc2, 0xac, 0xc3,
0x82, 0xc2, 0xa6, 0xc3, 0x83, 0xc6, 0x92, 0xc3, 0x8b, 0xc5,
0x93, 0xc3, 0x83, 0xe2, 0x80, 0x9a, 0xc3, 0x82, 0xc2, 0xa9,
0x20, 0xc3, 0x83, 0xc6, 0x92, 0xc3, 0x8b, 0xc5, 0x93, 0xc3,
0x83, 0xe2, 0x80, 0x9a, 0xc3, 0x82, 0xc2, 0xa7, 0xc3, 0x83,
0xc6, 0x92, 0xc3, 0xa2, 0xe2, 0x80, 0x9e, 0xc2, 0xa2, 0xc3,
0x83, 0xc2, 0xa2, 0xc3, 0xa2, 0xe2, 0x80, 0x9a, 0xc2, 0xac,
0xc3, 0x85, 0xc2, 0xbe, 0xc3, 0x83, 0xc6, 0x92, 0xc3, 0x8b,
0xc5, 0x93, 0xc3, 0x83, 0xe2, 0x80, 0x9a, 0xc3, 0x82, 0xc2,
0xa8, 0xc3, 0x83, 0xc6, 0x92, 0xc3, 0xa2, 0xe2, 0x80, 0x9e,
0xc2, 0xa2, 0xc3, 0x83, 0xe2, 0x80, 0xa6, 0xc3, 0x82, 0xc2,
0xa0, 0xc3, 0x83, 0xc6, 0x92, 0xc3, 0x8b, 0xc5, 0x93, 0xc3,
0x83, 0xe2, 0x80, 0x9a, 0xc3, 0x82, 0xc2, 0xa6, 0xc3, 0x83,
0xc6, 0x92, 0xc3, 0xa2, 0xe2, 0x80, 0x9e, 0xc2, 0xa2, 0xc3,
0x83, 0xe2, 0x80, 0xa6, 0xc3, 0x82, 0xc2, 0xa0, 0xc3, 0x83,
0xc6, 0x92, 0xc3, 0x8b, 0xc5, 0x93, 0xc3, 0x83, 0xe2, 0x80,
0x9a, 0xc3, 0x82, 0xc2, 0xa9, 0x00, 0x00, 0xda, 0xda, 0xda,
0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda,
0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda,
0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0x74,
0x65, 0x73, 0x2c, 0x20, 0x68, 0x59, 0x00, 0x00, 0x00, 0xac,
0xed, 0x3b, 0x09, 0xac, 0xed, 0x3b, 0x09, 0x78, 0x51, 0xa7,
0x24, 0xf8, 0xd0, 0xa7, 0x24, 0x00, 0x00, 0x00, 0x00, 0x0a,
0x00};

for ( int32_t i = 0 ; i < 435 ; i++ ) {
// again:
*pp = ddd[i]; // rand() % 256;
//if ( *pp < 0x80 ) goto again;
pp++;
}
*pp = 0;
*/

// . since we own the data, we'll free readBuf on r's destruction
// . this returns false and sets g_errno on error
// . but it should still set m_request to the readBuf to delete it
@@ -2592,9 +2523,6 @@ TcpSocket *HttpServer::unzipReply(TcpSocket* s) {
}

bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr ,
bool printGigablast );

bool sendPagePretty ( TcpSocket *s ,
HttpRequest *r ,
char *filename ,
Images.cpp
@@ -108,7 +108,7 @@ void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml ,
// the positive scored window
int32_t firstPosScore = -1;
int32_t lastPosScore = -1;
int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_MARQUEE;
int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT;
// find positive scoring window
for ( int32_t i = 0 ; i < nw ; i++ ) {
// skip if in bad section
Linkdb.cpp (44 changes)
@@ -356,7 +356,6 @@ key224_t Linkdb::makeKey_uk ( uint32_t linkeeSiteHash32 ,
/////////

#include "Collectiondb.h"
//#include "CollectionRec.h"
#include "matches2.h"

// 1MB read size for now
@@ -364,12 +363,8 @@ key224_t Linkdb::makeKey_uk ( uint32_t linkeeSiteHash32 ,

#define MAX_INTERNAL_INLINKS 10

//static void gotRootTitleRecWrapper25 ( void *state ) ;
//static void gotTermFreqWrapper ( void *state ) ;
static void gotListWrapper ( void *state ,RdbList *list,Msg5 *msg5);
static bool gotLinkTextWrapper ( void *state );
//static void sendLinkInfoReplyWrapper ( void *state );//, LinkInfo *info ) ;
//static void gotReplyWrapper25 ( void *state , void *state2 ) ;

Msg25::Msg25() {
m_numRequests = 0;
@@ -391,12 +386,6 @@ void Msg25::reset() {
mfree ( m_replyPtrs[i], m_replySizes[i], "msg25r");
// reset array count to 0
m_numReplyPtrs = 0;
// . free the linkinfo if we are responsible for it
// . if someone "steals" it from us, they should set this to NULL
//if ( m_linkInfo )
// mfree ( m_linkInfo , m_linkInfo->getStoredSize(),"msg25s");
// this now points into m_linkInfoBuf safebuf, just NULL it
//m_linkInfo = NULL;

m_table.reset();
m_ipTable.reset();
@@ -3359,7 +3348,6 @@ void Inlink::set ( Msg20Reply *r ) {
r->size_surroundingText +
r->size_rssItem +
r->size_categories +
r->size_gigabitQuery +
r->size_templateVector;

char *pend = p + need;
@@ -3372,7 +3360,7 @@ void Inlink::set ( Msg20Reply *r ) {
size_surroundingText = r->size_surroundingText;
size_rssItem = r->size_rssItem;
size_categories = r->size_categories;
size_gigabitQuery = r->size_gigabitQuery;
size_gigabitQuery = 0;
size_templateVector = r->size_templateVector;

@@ -3432,13 +3420,8 @@ void Inlink::set ( Msg20Reply *r ) {
/////////////

off_gigabitQuery = poff;
if ( p + r->size_gigabitQuery < pend ) {
gbmemcpy ( p , r->ptr_gigabitQuery , size_gigabitQuery );
}
else {
size_gigabitQuery = 1;
*p = '\0';
}
size_gigabitQuery = 1;
*p = '\0';
poff += size_gigabitQuery;
p += size_gigabitQuery;

@@ -3468,37 +3451,27 @@ void Inlink::setMsg20Reply ( Msg20Reply *r ) {
r->m_firstSpidered = m_firstSpidered;

r->m_lastSpidered = m_lastSpidered;
//r->m_nextSpiderTime = m_nextSpiderDate;
r->m_datedbDate = m_datedbDate;
r->m_firstIndexedDate = m_firstIndexedDate;
r->m_numOutlinks = m_numOutlinks;
//r->m_linkTextBaseScore = m_baseScore;
//r->m_pagePop = m_pagePop;
//r->m_sitePop = m_sitePop;
//r->m_siteNumInlinks = m_siteNumInlinks;

r->m_isPermalink = m_isPermalink;
r->m_outlinkInContent = m_outlinkInContent;
r->m_outlinkInComment = m_outlinkInComment;

r->m_isLinkSpam = m_isLinkSpam;
//r->m_isAnomaly = m_isAnomaly;
r->m_hasAllQueryTerms = m_hasAllQueryTerms;

r->m_country = m_country;
r->m_language = m_language;
//r->m_docQuality = m_docQuality;
r->m_siteRank = m_siteRank;
//r->m_ruleset = m_ruleset;
r->m_hopcount = m_hopcount;
//r->m_linkTextScoreWeight = m_linkTextScoreWeight;

r->ptr_ubuf = getUrl();//ptr_urlBuf;
r->ptr_linkText = getLinkText();//ptr_linkText;
r->ptr_surroundingText = getSurroundingText();//ptr_surroundingText;
r->ptr_rssItem = getRSSItem();//ptr_rssItem;
r->ptr_categories = getCategories();//ptr_categories;
r->ptr_gigabitQuery = getGigabitQuery();//ptr_gigabitQuery;
r->ptr_templateVector = getTemplateVector();//ptr_templateVector;

r->size_ubuf = size_urlBuf;
@@ -3506,7 +3479,6 @@ void Inlink::setMsg20Reply ( Msg20Reply *r ) {
r->size_surroundingText = size_surroundingText;
r->size_rssItem = size_rssItem;
r->size_categories = size_categories;
r->size_gigabitQuery = size_gigabitQuery;
r->size_templateVector = size_templateVector;
}

@@ -3583,7 +3555,7 @@ bool LinkInfo::print ( SafeBuf *sb , char *coll ) {
int32_t dlen = k->size_surroundingText - 1;
char *r = k->getRSSItem();//ptr_rssItem;
int32_t rlen = k->size_rssItem - 1;
char *g = k->getGigabitQuery();//ptr_gigabitQuery;
char *g = k->getGigabitQuery();
int32_t glen = k->size_gigabitQuery - 1;
char *c = k->getCategories();//ptr_categories;
int32_t clen = k->size_categories - 1;
@@ -4068,12 +4040,6 @@ bool Links::addLink ( char *link , int32_t linkLen , int32_t nodeNum ,

// don't add 0 length links
if ( linkLen <= 0 ) return true;
// ensure buf has enough room
// if (titleRecVersion < 72){
// if ( m_bufPtr-m_buf + linkLen + 1 > LINK_BUF_SIZE ){
// return true;
// }
// }

// do we need to alloc more link space?
if (m_numLinks >= m_allocLinks) {
@@ -4250,8 +4216,6 @@ bool Links::addLink ( char *link , int32_t linkLen , int32_t nodeNum ,
else bufSpace = 0;
// allocate dynamic buffer for lotsa links
if ( url.getUrlLen() + 1 > bufSpace ) {
//if (titleRecVersion < 72 && m_allocSize >= LINK_BUF_SIZE)
// return true;
// grow by 100K
int32_t newAllocSize;// = m_allocSize+LINK_BUF_SIZE;
if ( ! m_allocSize ) newAllocSize = LINK_BUF_SIZE;
Log.cpp (3 changes)
@@ -165,7 +165,6 @@ bool Log::shouldLog ( int32_t type , const char *msg ) {
if ( msg[0] == 'n' ) return g_conf.m_logTimingNet;
if ( msg[0] == 'q' ) return g_conf.m_logTimingQuery;
if ( msg[0] == 's' ) return g_conf.m_logTimingSpcache;
if ( msg[0] == 't' ) return g_conf.m_logTimingTopics;
return false;
}
if ( type != LOG_DEBUG ) return true;
@@ -205,8 +204,6 @@ bool Log::shouldLog ( int32_t type , const char *msg ) {
if (msg[0]=='u'&&msg[1]=='n' ) return g_conf.m_logDebugUnicode;
if (msg[0]=='t'&&msg[1]=='o'&&msg[3]=='D' )
return g_conf.m_logDebugTopDocs;
if (msg[0]=='t'&&msg[1]=='o'&&msg[3]!='D' )
return g_conf.m_logDebugTopics;
if (msg[0]=='d'&&msg[1]=='a' ) return g_conf.m_logDebugDate;
if (msg[0]=='d'&&msg[1]=='d' ) return g_conf.m_logDebugDetailed;
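Log::shouldLog() dispatches on the first letter or two of the message to pick a config flag, so removing the topics flags just deletes branches. A toy version of the timing dispatch as it stands after this change (the mock flags stand in for g_conf members):

// mock flags standing in for the g_conf members this sketch reads
static bool s_logTimingNet = true;
static bool s_logTimingQuery = true;
static bool s_logTimingSpcache = false;

// first-letter dispatch; the 't' ("topics") branch is gone after this
// commit, so such messages now fall through to "don't log"
bool shouldLogTiming ( const char *msg ) {
	if ( msg[0] == 'n' ) return s_logTimingNet;     // "net"
	if ( msg[0] == 'q' ) return s_logTimingQuery;   // "query"
	if ( msg[0] == 's' ) return s_logTimingSpcache; // "spcache"
	return false;
}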
Log.h (6 changes)
@@ -69,7 +69,6 @@
// spcache related to determining what urls to spider next
// speller query spell checking
// thread calling threads
// topics related topics
// udp udp networking

// example log:
@@ -87,11 +86,6 @@
#define MAX_LOG_MSGS 1024 // in memory

// this is for printing out how a page is parsed by PageParser.cpp
/* extern char *g_pbuf ; */
/* extern char *g_pbufPtr ; */
/* extern char *g_pterms ; */
/* extern char *g_ptermPtr ; */
/* extern char *g_pend; */
extern char *g_dbuf;
extern int32_t g_dbufSize;
Makefile (6 changes)
@@ -35,7 +35,7 @@ OBJS = UdpSlot.o Rebalance.o \
Msg1.o \
Msg0.o Mem.o Matches.o Loop.o \
Log.o Lang.o \
Indexdb.o Posdb.o Clusterdb.o IndexList.o Revdb.o \
Indexdb.o Posdb.o Clusterdb.o IndexList.o \
HttpServer.o HttpRequest.o \
HttpMime.o Hostdb.o \
Highlight.o File.o Errno.o Entities.o \
@@ -75,6 +75,7 @@ CPPFLAGS = -g -Wall -fno-stack-protector -DPTHREADS -Wstrict-aliasing=0

ifeq ($(CXX), g++)
CPPFLAGS += -Wno-write-strings -Wno-uninitialized -Wno-unused-but-set-variable
CPPFLAGS += -Wno-invalid-offsetof
else ifeq ($(CXX), clang++)
CPPFLAGS += -Weverything -Wno-cast-align -Wno-reserved-id-macro -Wno-padded -Wno-c++11-long-long -Wno-tautological-undefined-compare -Wno-c++11-compat-reserved-user-defined-literal -Wno-zero-length-array -Wno-float-equal -Wno-c99-extensions -Wno-weak-vtables -Wno-global-constructors -Wno-exit-time-destructors
CPPFLAGS += -Wno-shadow -Wno-conversion -Wno-extra-semi -Wno-sign-conversion -Wno-old-style-cast -Wno-shorten-64-to-32 -Wno-unused-parameter -Wno-missing-prototypes -Wno-c++11-compat-deprecated-writable-strings
@@ -114,6 +115,9 @@ GIT_VERSION=$(shell git rev-parse HEAD)$(DIRTY)

all: gb

debug: DEFS += -D_VALGRIND_
debug: all

utils: blaster2 hashtest monitor seektest urlinfo treetest dnstest gbtitletest

# third party libraries
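The new debug target builds with -D_VALGRIND_, which is what gates the Valgrind client checks used in HttpServer.cpp and Msg20.cpp. A sketch of the pattern (the no-op fallback macro is an illustration; the real macro comes from valgrind/memcheck.h):

#ifdef _VALGRIND_
#include <valgrind/memcheck.h>
#else
// no-op stand-in so non-Valgrind builds compile unchanged (illustrative)
#define VALGRIND_CHECK_MEM_IS_DEFINED(p,n) ((void)0)
#endif

#include <cstring>

// copy that asks Valgrind to flag uninitialized source bytes early
void checkedCopy ( char *dst , const char *src , int n ) {
	VALGRIND_CHECK_MEM_IS_DEFINED ( src , n );
	memcpy ( dst , src , n );
}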
Matches.cpp
@@ -508,7 +508,6 @@ bool Matches::addMatches(Words *words, Phrases *phrases, Sections *sections, Bit

if ( getNumXmlNodes() > 512 ) { char *xx=NULL;*xx=0; }

// google seems to index SEC_MARQUEE so i took that out of here
int32_t badFlags =SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;

int32_t qwn;
@@ -627,10 +626,7 @@ bool Matches::addMatches(Words *words, Phrases *phrases, Sections *sections, Bit

// this is 0 if we were an unmatched quote
if ( numWords <= 0 ) continue;
// we matched a bigram in the document
//numWords = 3;
// i guess we matched the query phrase bigram
//numQWords = 3;

// got a match
goto gotMatch2;
}
@@ -641,7 +637,6 @@ bool Matches::addMatches(Words *words, Phrases *phrases, Sections *sections, Bit
numWords = 1;
numQWords = 1;
goto gotMatch2;
//char *xx=NULL;*xx=0;
}

//
Msg0.cpp (99 changes)
@@ -1164,105 +1164,6 @@ void gotListWrapper ( void *state , RdbList *listb , Msg5 *msg5xx )
log(LOG_LOGIC,"net: msg0: Sending more data than what was "
"requested. Ineffcient. Bad engineer. dataSize=%"INT32" "
"minRecSizes=%"INT32".",dataSize,oldSize);
/*
// always compress these lists
if ( st0->m_rdbId == RDB_SECTIONDB ) { // && 1 == 3) {

// get sh48, the sitehash
key128_t *startKey = (key128_t *)msg5->m_startKey ;
int64_t sh48 = g_datedb.getTermId(startKey);

// debug
//log("msg0: got sectiondblist from disk listsize=%"INT32"",
// list->getListSize());

if ( dataSize > 50000 )
log("msg0: sending back list rdb=%"INT32" "
"listsize=%"INT32" sh48=0x%"XINT64"",
(int32_t)st0->m_rdbId,
dataSize,
sh48);

// save it
int32_t origDataSize = dataSize;
// store compressed list on itself
char *dst = list->m_list;
// warn if niceness is 0!
if ( st0->m_niceness == 0 )
log("msg0: compressing sectiondb list at niceness 0!");
// compress the list
uint32_t lastVoteHash32 = 0LL;
SectionVote *lastVote = NULL;
for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
// breathe
QUICKPOLL ( st0->m_niceness );
// get rec
char *rec = list->getCurrentRec();
// for ehre
key128_t *key = (key128_t *)rec;
// the score is the bit which is was set in
// Section::m_flags for that docid
int32_t secType = g_indexdb.getScore ( (char *)key );
// 0 means it probably used to count # of voters
// from this site, so i don't think xmldoc uses
// that any more
if ( secType == SV_SITE_VOTER ) continue;
// treat key like a datedb key and get the taghash
uint32_t h32 = g_datedb.getDate ( key );
// get data/vote from the current record in the
// sectiondb list
SectionVote *sv=(SectionVote *)list->getCurrentData ();
// get the average score for this doc
float avg = sv->m_score ;
if ( sv->m_numSampled > 0.0 ) avg /= sv->m_numSampled;
// if same as last guy, add to it
if ( lastVoteHash32 == h32 && lastVote ) {
// turn possible multi-vote into single docid
// into a single vote, with the score averaged.
lastVote->m_score += avg;
lastVote->m_numSampled++;
continue;
}
// otherwise, add in a new guy!
*(key128_t *)dst = *key;
dst += sizeof(key128_t);
// the new vote
SectionVote *dsv = (SectionVote *)dst;
dsv->m_score = avg;
dsv->m_numSampled = 1;
// set this
lastVote = dsv;
lastVoteHash32 = h32;
// skip over
dst += sizeof(SectionVote);
}
// update the list size now for sending back
dataSize = dst - data;
// if the list was over the requested minrecsizes we need
// to set a flag so that the caller will do a re-call.
// so making the entire odd, will be the flag.
if ( origDataSize > msg5->m_minRecSizes &&
dataSize < origDataSize ) {
*dst++ = '\0';
dataSize++;
}

// debug
//log("msg0: compressed sectiondblist from disk "
// "newlistsize=%"INT32"", dataSize);

// use this timestamp
int32_t now = getTimeLocal();//Global();
// finally, cache this sucker
s_sectiondbCache.addRecord ( msg5->m_coll,
(char *)startKey,//(char *)&sh48
data,
dataSize ,
now );
// ignore errors
g_errno = 0;
}
*/

//
// for linkdb lists, remove all the keys that have the same IP32
Msg2.cpp (5 changes)
@@ -446,10 +446,7 @@ bool Msg2::gotList ( RdbList *list ) {
if ( m_lists[i].m_listSize < m_minRecSizes[i] ) continue;
if ( m_minRecSizes[i] == 0 ) continue;
if ( m_minRecSizes[i] == -1 ) continue;
// do not print this if compiling section xpathsitehash stats
// because we only need like 10k of list to get a decent
// reading
if ( m_req->m_forSectionStats ) break;

log("msg2: read termlist #%"INT32" size=%"INT32" "
"maxSize=%"INT32". losing docIds!",
i,m_lists[i].m_listSize,m_minRecSizes[i]);
Msg20.cpp
@@ -691,8 +691,6 @@ int32_t Msg20Reply::serialize ( char *buf , int32_t bufSize ) {
VALGRIND_CHECK_MEM_IS_DEFINED(ptr_displaySum,size_displaySum);
if(ptr_dbuf)
VALGRIND_CHECK_MEM_IS_DEFINED(ptr_dbuf,size_dbuf);
if(ptr_gigabitSample)
VALGRIND_CHECK_MEM_IS_DEFINED(ptr_gigabitSample,size_gigabitSample);
if(ptr_mbuf)
VALGRIND_CHECK_MEM_IS_DEFINED(ptr_mbuf,size_mbuf);
if(ptr_vbuf)
@@ -723,12 +721,8 @@ int32_t Msg20Reply::serialize ( char *buf , int32_t bufSize ) {
VALGRIND_CHECK_MEM_IS_DEFINED(ptr_rssItem,size_rssItem);
if(ptr_categories)
VALGRIND_CHECK_MEM_IS_DEFINED(ptr_categories,size_categories);
if(ptr_gigabitQuery)
VALGRIND_CHECK_MEM_IS_DEFINED(ptr_gigabitQuery,size_gigabitQuery);
if(ptr_content)
VALGRIND_CHECK_MEM_IS_DEFINED(ptr_content,size_content);
if(ptr_sectionVotingInfo)
VALGRIND_CHECK_MEM_IS_DEFINED(ptr_sectionVotingInfo,size_sectionVotingInfo);
if(ptr_tr)
VALGRIND_CHECK_MEM_IS_DEFINED(ptr_tr,size_tr);
if(ptr_tlistBuf)
Msg20.h (10 changes)
@@ -58,8 +58,6 @@ class Msg20Request {
int32_t m_summaryMaxLen ;
int32_t m_summaryMaxNumCharsPerLine ;
int32_t m_maxNumCharsPerLine ;
int32_t m_bigSampleRadius ;
int32_t m_bigSampleMaxLen ;
int32_t m_maxCacheAge ;
int32_t m_discoveryDate ;

@@ -83,12 +81,10 @@ class Msg20Request {
unsigned char m_getSummaryVector :1;
unsigned char m_showBanned :1;
unsigned char m_includeCachedCopy :1;
unsigned char m_getSectionVotingInfo :1; // in JSON for now
unsigned char m_getMatches :1;
unsigned char m_getTermListBuf :1;
unsigned char m_getOutlinks :1;
unsigned char m_getTitleRec :1; // sets ptr_tr in reply
unsigned char m_getGigabitVector :1;
unsigned char m_doLinkSpamCheck :1;
unsigned char m_isLinkSpam :1; // Msg25 uses for storage
unsigned char m_isSiteLinkInfo :1; // site link info?
@@ -237,7 +233,6 @@ public:
char *ptr_rubuf ; // redirect url buffer
char *ptr_displaySum ; // summary for displaying
char *ptr_dbuf ; // display metas \0 separated
char *ptr_gigabitSample ;
char *ptr_mbuf ; // match offsets
char *ptr_vbuf ; // summary vector
char *ptr_imgData ; // for encoded images
@@ -263,9 +258,7 @@ public:
char *ptr_linkUrl ; // what we link to
char *ptr_rssItem ; // set for m_getLinkText
char *ptr_categories ;
char *ptr_gigabitQuery ; // , separated list of gigabits
char *ptr_content ; // page content in utf8
char *ptr_sectionVotingInfo ; // in JSON
char *ptr_tr ; // like just using msg22
char *ptr_tlistBuf ;
char *ptr_tiBuf ; // terminfobuf
@@ -285,7 +278,6 @@ public:
int32_t size_rubuf ;
int32_t size_displaySum ;
int32_t size_dbuf ;
int32_t size_gigabitSample ; // includes \0
int32_t size_mbuf ;
int32_t size_vbuf ;
int32_t size_imgData ;
@@ -304,9 +296,7 @@ public:
int32_t size_linkUrl ;
int32_t size_rssItem ;
int32_t size_categories ;
int32_t size_gigabitQuery ;
int32_t size_content ; // page content in utf8
int32_t size_sectionVotingInfo ; // in json, includes \0
int32_t size_tr ;
int32_t size_tlistBuf ;
int32_t size_tiBuf ;
Msg39.cpp (14 changes)
@@ -327,17 +327,15 @@ bool Msg39::controlLoop ( ) {
// fix it
m_r->m_minDocId = d0;
m_r->m_maxDocId = d1; // -1; // exclude d1
// allow posdbtable re-initialization each time to set
// the msg2 termlist ptrs anew, otherwise we core in
// call to PosdbTable::init() below
//m_posdbTable.m_initialized = false;

// reset ourselves, partially, anyway, not tmpq etc.
reset2();

// debug log
if ( ! m_r->m_forSectionStats && m_debug )
if ( m_debug ) {
log("msg39: docid split %d/%d range %"INT64"-%"INT64"", m_docIdSplitNumber-1, m_r->m_numDocIdSplits, d0,d1);
// wtf?
//if ( d0 >= d1 ) break;
}

// load termlists for these docid ranges using msg2 from posdb
if ( ! getLists() ) return false;
}
@@ -1068,7 +1066,7 @@ void Msg39::estimateHitsAndSendReply ( ) {
mr.size_pairScoreBuf = 0;
mr.size_singleScoreBuf = 0;
}
//mr.m_sectionStats = pt->m_sectionStats;

// reserve space for these guys, we fill them in below
mr.ptr_docIds = NULL;
mr.ptr_scores = NULL;
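controlLoop() walks the docid space one split at a time, setting m_minDocId/m_maxDocId per pass, which is what the surviving debug log prints. A rough standalone model of that walk (the 38-bit docid ceiling is an assumption here; the real bounds come from the Msg39Request):

#include <cstdint>
#include <cstdio>

// walk the docid space in numDocIdSplits contiguous ranges, as controlLoop()
// does with m_r->m_minDocId / m_r->m_maxDocId
void walkDocIdSplits ( int32_t numDocIdSplits ) {
	const int64_t maxDocId = 1LL << 38; // assumed ceiling for the sketch
	int64_t span = maxDocId / numDocIdSplits + 1;
	for ( int32_t i = 0 ; i < numDocIdSplits ; i++ ) {
		int64_t d0 = (int64_t)i * span;
		int64_t d1 = d0 + span; // exclusive upper bound
		printf ( "msg39: docid split %d/%d range %lld-%lld\n",
		         (int)i, (int)numDocIdSplits,
		         (long long)d0, (long long)d1 );
	}
}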
Msg39.h (8 changes)
@@ -53,14 +53,11 @@ class Msg39Request {
ptr_readSizes = NULL;
ptr_query = NULL; // in utf8?
ptr_whiteList = NULL;
//ptr_coll = NULL;
m_forSectionStats = false;
size_readSizes = 0;
size_query = 0;
size_whiteList = 0;
m_sameLangWeight = 20.0;
m_maxFacets = -1;
//size_coll = 0;

m_getDocIdScoringInfo = 1;

@@ -115,11 +112,6 @@ class Msg39Request {
char m_useQueryStopWords;
char m_doMaxScoreAlgo;

char m_forSectionStats;

// Msg3a still uses this
//int32_t m_myFacetVal32; // for gbfacet:xpathsite really sectionstats

collnum_t m_collnum;

int64_t m_minDocId;
Msg3a.cpp (98 changes)
@@ -131,13 +131,6 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
log(LOG_LOGIC,"net: bad collection. msg3a. %"INT32"",
(int32_t)m_r->m_collnum);

//m_indexdbSplit = g_hostdb.m_indexSplits;
// certain query term, like, gbdom:xyz.com, are NOT split
// at all in order to keep performance high because such
// terms are looked up by the spider. if a query contains
// multiple "no split" terms, then it becomes split unfortunately...
//if ( ! m_q->isSplit() ) m_indexdbSplit = 1;

// for a sanity check in Msg39.cpp
r->m_nqt = m_q->getNumTerms();

@@ -154,10 +147,7 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
// . return now if query empty, no docids, or none wanted...
// . if query terms = 0, might have been "x AND NOT x"
if ( m_q->getNumTerms() <= 0 ) return true;
// sometimes we want to get section stats from the hacked
// sectionhash: posdb termlists
//if ( m_docsToGet <= 0 && ! m_r->m_getSectionStats )
// return true;

// . set g_errno if not found and return true
// . coll is null terminated
CollectionRec *cr = g_collectiondb.getRec(r->m_collnum);
@@ -234,24 +224,17 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
// update our read info
for ( int32_t j = 0; j < n ; j++ ) {
// the read size for THIS query term
int32_t rs = 300000000; // toRead; 300MB i guess...
// limit to 50MB man! this was 30MB but the
// 'time enough for love' query was hitting 30MB termlists.
//rs = 50000000;
rs = DEFAULT_POSDB_READSIZE;//90000000; // 90MB!
// it is better to go oom then leave users scratching their
// heads as to why some results are not being returned.
// no, because we are going out of mem for queries like
// 'www.disney.nl' etc.
//rs = -1;
// if section stats, limit to 1MB
//if ( m_r->m_getSectionStats ) rs = 1000000;
int32_t rs = DEFAULT_POSDB_READSIZE;//90000000; // 90MB!

// get the jth query term
QueryTerm *qt = &m_q->m_qterms[j];

// if query term is ignored, skip it
if ( qt->m_ignored ) rs = 0;

// set it
readSizes[j] = rs;

// serialize these too
tfw[j] = qt->m_termFreqWeight;
}
@@ -265,8 +248,7 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
// Query::expandQuery() above
m_r->ptr_query = m_q->m_orig;
m_r->size_query = m_q->m_origLen+1;
// the white list now too...
//m_r->ptr_whiteList = si->m_whiteListBuf.getBufStart();

// free us?
if ( m_rbufPtr && m_rbufPtr != m_rbuf ) {
mfree ( m_rbufPtr , m_rbufSize, "Msg3a" );
@@ -314,7 +296,7 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
// high because it is a spider time thing.
if ( m_r->m_timeout > 0 ) {
timeout = m_r->m_timeout;
timeout += 250; //add 250ms for general overhead
timeout += g_conf.m_msg3a_msg39_network_overhead;
}
if ( timeout > multicast_msg3a_maximum_timeout )
timeout = multicast_msg3a_maximum_timeout;
@@ -774,64 +756,6 @@ bool Msg3a::mergeLists ( ) {
//m_totalDocCount = 0; // int32_t docCount = 0;
m_moreDocIdsAvail = true;

/*

this version is too simple. now each query term can be a
gbfacet:price or gbfacet:type term and each has a
list in the Msg39Reply::ptr_facetHashList for its termid

//
// compile facet stats
//
for ( int32_t j = 0; j < m_numHosts ; j++ ) {
Msg39Reply *mr =m_reply[j];
// one table for each query term
char *p = mr->ptr_facetHashList;
// loop over all query terms
int32_t n = m_q->getNumTerms();
// use this
HashTableX tmp;
// do the loop
for ( int32_t i = 0 ; i < n ; i++ ) {
// size of it
int32_t psize = *(int32_t *)p;
p += 4;
tmp.deserialize ( p , psize );
p += psize;
// now compile the stats into a master table
for ( int32_t k = 0 ; k < tmp.m_numSlots ; k++ ) {
if ( ! tmp.m_flags[k] ) continue;
// get the vlaue
int32_t v32 = *(int32_t *)tmp.getKeyFromSlot(k);
// and how many of them there where
int32_t count = *(int32_t *)tmp.getValueFromSlot(k);
// add to master
master.addScore32 ( v32 , count );
}
}
}
////////
//
// now set m_facetStats
//
////////
// add up all counts
int64_t count = 0LL;
for ( int32_t i = 0 ; i < master.getNumSlots() ; i++ ) {
if ( ! master.m_flags[i] ) continue;
int64_t slotCount = *(int32_t *)master.getValueFromSlot(i);
int32_t h32 = *(int32_t *)master.getKeyFromSlot(i);
if ( h32 == m_r->m_myFacetVal32 )
m_facetStats.m_myValCount = slotCount;
count += slotCount;
}
m_facetStats.m_totalUniqueValues = master.getNumUsedSlots();
m_facetStats.m_totalValues = count;
*/

// shortcut
//int32_t numSplits = m_numHosts;//indexdbSplit;

// . point to the various docids, etc. in each shard reply
// . tcPtr = term count. how many required query terms does the doc
@@ -920,11 +844,6 @@ bool Msg3a::mergeLists ( ) {
for ( int32_t j = 0; j < m_numQueriedHosts ; j++ ) {
Msg39Reply *mr =m_reply[j];
if ( ! mr ) continue;
//SectionStats *src = &mr->m_sectionStats;
//dst->m_onSiteDocIds += src->m_onSiteDocIds;
//dst->m_offSiteDocIds += src->m_offSiteDocIds;
//dst->m_totalMatches += src->m_totalMatches;
//dst->m_totalEntries += src->m_totalEntries;
// now the list should be the unique site hashes that
// had the section hash. we need to uniquify them again
// here.
@@ -1036,7 +955,6 @@ bool Msg3a::mergeLists ( ) {
if ( ! sortFacetEntries() )
return true;

//if ( m_r->m_getSectionStats ) return true;
//
// HACK: END section stats merge
//
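The readSizes simplification above collapses years of commented-out tuning into one rule: every term reads up to DEFAULT_POSDB_READSIZE, and ignored terms read nothing. A condensed sketch (QueryTerm is pared down to the one flag the loop uses; the 90 MB value follows the inline comment):

#include <cstdint>
#include <vector>

const int32_t DEFAULT_POSDB_READSIZE = 90000000; // ~90MB, per the comment

struct QueryTerm { bool m_ignored; };

// every term reads the default amount; ignored terms read nothing
std::vector<int32_t> computeReadSizes ( const std::vector<QueryTerm> &terms ) {
	std::vector<int32_t> readSizes ( terms.size() , DEFAULT_POSDB_READSIZE );
	for ( size_t j = 0 ; j < terms.size() ; j++ )
		if ( terms[j].m_ignored ) readSizes[j] = 0;
	return readSizes;
}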
Msg3a.h (11 changes)
@@ -145,17 +145,6 @@ public:
// when merging this list of docids into a final list keep
// track of the cursor into m_docIds[]
int32_t m_cursor;

// what collection # are these docids from if m_collnums[] is NULL
//collnum_t m_collnum;

// we don't have FacetStats because we have the actual
// Msg39Reply::ptr_facetHashList from each shard which contains
// all the facet hash lists for each gbfacet: query term we had
// and the query "Msg3a::m_q.m_qterms[].m_dt" is the hash table
// where each key is a facethash for that gbfacet:xxxx term and
// the value if the # of occurences.
//SectionStats m_sectionStats;
};

#endif
Msg40.h (109 changes)
@@ -14,78 +14,12 @@
#include "Msg39.h" // getTermFreqs()
#include "Msg20.h" // for getting summary from docId
#include "Msg17.h" // a distributed cache of serialized/compressed Msg40s
//#include "Msg2b.h" // for generating directories
//#include "IndexReadInfo.h" // STAGE0,...
#include "Msg3a.h"
#include "PostQueryRerank.h"

// replace CollectionRec::m_maxDocIdsToCompute with this
//#define MAXDOCIDSTOCOMPUTE 500000
// make it 2B now. no reason not too limit it so low.
#define MAXDOCIDSTOCOMPUTE 2000000000

#define MAX_GIGABIT_WORDS 10

class Gigabit {
public:
char *m_term;
int32_t m_termLen;
int64_t m_termId64;
float m_gbscore;
int32_t m_minPop;
int32_t m_numWords;
int32_t m_numPages;
int64_t m_lastDocId;
// the wordids of the words in the gigabit (m_numWords of them)
int64_t m_wordIds[MAX_GIGABIT_WORDS];
};

//
// TODO: add Gigabit::m_firstFastFactOffset..
//

#define MAX_GIGABIT_PTRS 10

class Fact {
public:
// offset of the gigabit in m_gigabitBuf we belong to
int32_t m_gigabitOffset;
// . the sentence contaning the gigabit and a lot of the query terms
// . ptr refrences into Msg20Reply::ptr_gigabitSample buffers
char *m_fact;
int32_t m_factLen;
float m_gigabitModScore;
float m_queryScore;
float m_maxGigabitModScore; // gigabitscore * #pagesItIsOn
int32_t m_numGigabits;
char m_printed;
class Gigabit *m_gigabitPtrs[MAX_GIGABIT_PTRS];
int32_t m_numQTerms;
int64_t m_docId; // from where it came
Msg20Reply *m_reply; // reply from where it came
// for deduping sentences
char m_dedupVector[SAMPLE_VECTOR_SIZE]; // 128
};

class GigabitInfo {
public:
int32_t m_pts;
uint32_t m_hash;
int32_t m_pop;
int32_t m_count;
int32_t m_numDocs;
int64_t m_lastDocId;
int32_t m_currentDocCount;
char *m_ptr;
int32_t m_len;
};

static const int64_t msg40_msg39_timeout = 5000; //timeout for entire get-docid-list phase, in milliseconds.

class Msg40 {

public:
@@ -116,21 +50,6 @@ class Msg40 {
bool prepareToGetDocIds ( );
bool getDocIds ( bool recall );

bool computeGigabits( class TopicGroup *tg );
SafeBuf m_gigabitBuf;

// nuggabits...
bool computeFastFacts ( );
bool addFacts ( HashTableX *queryTable,
HashTableX *gbitTable ,
char *pstart,
char *pend,
bool debugGigabits ,
class Msg20Reply *reply,
SafeBuf *factBuf ) ;

SafeBuf m_factBuf;

// keep these public since called by wrapper functions
bool federatedLoop ( ) ;
bool gotDocIds ( ) ;
@@ -181,14 +100,7 @@ class Msg40 {
bool moreResultsFollow ( ) {return m_moreToCome; };
time_t getCachedTime ( ) {return m_cachedTime; };

int32_t getNumGigabits (){return m_gigabitBuf.length()/sizeof(Gigabit);};
Gigabit *getGigabit ( int32_t i ) {
Gigabit *gbs = (Gigabit *)m_gigabitBuf.getBufStart();
return &gbs[i];
};

int64_t *getDocIdPtr() { return m_msg3a.m_docIds; }

bool printSearchResult9 ( int32_t ix , int32_t *numPrintedSoFar ,
class Msg20Reply *mr ) ;
@@ -277,15 +189,10 @@ class Msg40 {
char *m_cachePtr;
int32_t m_cacheSize;

//int32_t m_maxDocIdsToCompute;

// count summary replies (msg20 replies) we get
int32_t m_numRequests;
int32_t m_numReplies;

// we launched all docids from 0 to m_maxiLaunched
//int32_t m_maxiLaunched;

// true if more results follow these
bool m_moreToCome;

@@ -303,12 +210,6 @@ class Msg40 {
bool m_cachedResults;
time_t m_cachedTime;

// gigabits
//Msg24 m_msg24;

// references
//Msg1a m_msg1a;

int32_t m_tasksRemaining;

int32_t m_printCount;
@@ -334,14 +235,6 @@ class Msg40 {

SearchInput *m_si;

// for topic clustering, saved from CollectionRec
int32_t m_topicSimilarCutoff;
int32_t m_docsToScanForTopics;

// Msg2b for generating a directory
//Msg2b m_msg2b;

bool mergeDocIdsIntoBaseMsg3a();
int32_t m_numCollsToSearch;
class Msg3a **m_msg3aPtrs;
|
23
Msg5.cpp
@ -1178,29 +1178,6 @@ bool Msg5::gotList2 ( ) {
	// . why???
	if ( m_totalSize < 32*1024 ) goto skipThread;

	// if we are an interruptible niceness 1, do not use a thread,
	// we can be interrupted by the alarm callback and serve niceness
	// 0 requests, that is probably better! although the resolution is
	// on like 10ms on those alarms... BUT if you use a smaller
	// mergeBufSize of like 100k, that might make it responsive enough!
	// allow it to do a thread again so we can take advantage of
	// multiple cores, or hyperthreads i guess because i am seeing
	// some missed quickpoll log msgs, i suppose because we did not
	// insert QUICKPOLL() statements in the RdbList::merge_r() code
	//if ( m_niceness >= 1 ) goto skipThread;

	// super duper hack!
	//if ( m_rdbId == RDB_REVDB ) goto skipThread;

	// i'm not sure why we core in Msg5's call to RdbList::merge_r().
	// the list appears to be corrupt...
	//if ( m_rdbId == RDB_FACEBOOKDB ) goto skipThread;

	// skip it for now
	//goto skipThread;

	//m_waitingForMerge = true;

	// . if size is big, make a thread
	// . let's always make niceness 0 since it wasn't being very
	//   aggressive before
@ -854,8 +854,6 @@ void sleepWrapper1 ( int bogusfd , void *state ) {
	case 0x20: if ( elapsed < 5000 ) return; break;
	// msg 0x20 calls this to get the title rec
	case 0x22: if ( elapsed < 1000 ) return; break;
	// Msg23 niceness 0 is only for doing &rerank=X queries
	//case 0x23: if ( elapsed < 100000 ) return; break;
	// a request to get the score of a docid, can be *very* intensive
	case 0x3b: if ( elapsed < 500000 ) return; break;
	// related topics request, calls many Msg22 to get titlerecs...
@ -868,21 +866,6 @@ void sleepWrapper1 ( int bogusfd , void *state ) {
	// performance reasons, cuz we do pretty good load balancing
	// and when things get saturated, rerouting exacerbates it
	if ( elapsed < 8000 ) return; break;
	// how many bytes were requested?
	/*
	if ( THIS->m_msg ) nb=*(int32_t *)(THIS->m_msg + sizeof(key_t)*2);
	else nb=2000000;
	// . give them 300ms + 1ms per 5000 bytes
	// . a 6M read would be allowed 1500ms before re-routing
	// . a 1M read would be allowed 500ms
	// . a 100k read would be allowed 320ms
	ta = 300 + nb / 5000;
	// limit it
	if ( ta < 100 ) ta = 100;
	if ( ta > 9000 ) ta = 9000; // could this hurt us?
	if ( elapsed < ta ) return;
	break;
	*/
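The disabled block above sizes the re-route timeout to the read: 300 ms base plus 1 ms per 5000 bytes, clamped to [100, 9000] ms, so 6MB -> 1500 ms, 1MB -> 500 ms, 100KB -> 320 ms, matching its comments. A self-contained restatement of that arithmetic:

// illustrative restatement of the commented-out adaptive timeout
static int32_t allowedWaitMs ( int32_t nb ) {
	int32_t ta = 300 + nb / 5000;  // 300ms base + 1ms per 5000 bytes
	if ( ta < 100  ) ta = 100;     // floor
	if ( ta > 9000 ) ta = 9000;    // ceiling ("could this hurt us?")
	return ta;                     // nb=6000000 -> 1500, nb=1000000 -> 500
}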
	// msg to get a clusterdb rec
	case 0x38: if ( elapsed < 2000 ) return; break;
	// msg to get docIds from a query, may take a while

@ -30,7 +30,6 @@ static const int64_t multicast_msg20_summary_timeout = 1500;
static const int64_t multicast_msg1_senddata_timeout = 60000;
static const int64_t multicast_msg3a_default_timeout = 10000;
static const int64_t multicast_msg3a_maximum_timeout = 60000;
static const int64_t multicast_xmldoc_sectionstats_timeout = 30000;
static const int64_t multicast_msg1c_getip_default_timeout = 60000;

@ -8,9 +8,8 @@
#include "SpiderLoop.h"
#include "PageResults.h" // for RESULT_HEIGHT
#include "Stats.h"
#include "PageRoot.h"

bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr ,
			   bool printGigablast ) ;

// 5 seconds
#define DEFAULT_WIDGET_RELOAD 1000

@ -242,8 +242,6 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
		// no summary similarity dedup, only exact
		// doc content hash. otherwise too slow!!
		"pss=0&"
		// no gigabits
		"dsrt=0&"
		// do not compute summary. 0 lines.
		"ns=0&"
		"q=gbsortby%%3Agbspiderdate&"
@ -282,8 +280,6 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
		// no summary similarity dedup, only exact
		// doc content hash. otherwise too slow!!
		"pss=0&"
		// no gigabits
		"dsrt=0&"
		// do not compute summary. 0 lines.
		"ns=0&"
		//"q=gbsortby%%3Agbspiderdate&"
@ -321,8 +317,6 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
		// no summary similarity dedup, only exact
		// doc content hash. otherwise too slow!!
		"pss=0&"
		// no gigabits
		"dsrt=0&"
		// do not compute summary. 0 lines.
		"ns=0&"
		"q=gbsortby%%3Agbspiderdate&"
@ -372,8 +366,6 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
		// no summary similarity dedup, only exact
		// doc content hash. otherwise too slow!!
		"pss=0&"
		// no gigabits
		"dsrt=0&"
		// do not compute summary. 0 lines.
		//"ns=0&"
		"q=gbrevsortbyint%%3AgbssSpiderTime+"
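All four dump variants above disable the expensive per-result work the same way. Gathered in one place, the speed knobs they pass are (SafeBuf assembly shown purely for illustration; the parameter meanings come from the comments above):

// illustrative: the CGI knobs the dump queries use to stay fast
SafeBuf qs;
qs.safePrintf ( "pss=0&"     // exact content-hash dedup only, no similarity pass
		"dsrt=0&"    // scan no results for gigabits
		"ns=0&" );   // compute no summary lines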
117
PageParser.cpp
@ -5,6 +5,86 @@
//#include "IndexTable2.h"
//#include "XmlDoc.h" // addCheckboxSpan()

class State8 {
public:
	TopTree m_topTree;
	//Msg16 m_msg16;
	//Msg14 m_msg14;
	//Msg15 m_msg15;
	Msg22 m_msg22;
	SafeBuf m_dbuf;
	//XmlDoc m_doc;
	//Url m_url;
	//Url m_rootUrl;
	char *m_u;
	int32_t m_ulen;
	bool m_applyRulesetToRoot;
	char m_rootQuality;
	int32_t m_reparseRootRetries;
	char m_coll[MAX_COLL_LEN];
	int32_t m_collLen;
	//int32_t m_sfn;
	//int32_t m_urlLen;
	TcpSocket *m_s;
	bool m_isLocal;
	char m_pwd[32];
	HttpRequest m_r;
	int32_t m_old;
	// recycle the link info from the title rec?
	int32_t m_recycle;
	// recycle the link info that was imported from another coll?
	int32_t m_recycle2;
	int32_t m_render;
	char m_recompute;
	int32_t m_oips;
	char m_linkInfoColl[11];
	// char m_buf[16384 * 1024];

	//int32_t m_page;
	// m_pbuf now points to m_sbuf if we are showing the parsing junk
	SafeBuf m_xbuf;
	SafeBuf m_wbuf;
	bool m_donePrinting;
	//SafeBuf m_sbuf;
	// this is a buffer which cats m_sbuf into it
	//SafeBuf m_sbuf2;

	// new state vars for Msg3b.cpp
	int64_t m_docId;
	void *m_state ;
	void (* m_callback) (void *state);
	Query m_tq;
	Query *m_q;
	int64_t *m_termFreqs;
	float *m_termFreqWeights;
	float *m_affWeights;
	//score_t m_total;
	bool m_freeIt;
	bool m_blocked;

	// these are from rearranging the code
	int32_t m_indexCode;
	//uint64_t m_chksum1;
	int64_t m_took1;
	int64_t m_took1b;
	int64_t m_took2;
	int64_t m_took3;

	char m_didRootDom;
	char m_didRootWWW;
	char m_wasRootDom;

	// call Msg16 with a version of title rec to do
	int32_t m_titleRecVersion;

	char m_hopCount;

	//TitleRec m_tr;

	//XmlDoc m_oldDoc;
	XmlDoc m_xd;
};

bool g_inPageParser = false;
bool g_inPageInject = false;
@ -12,6 +92,17 @@ bool g_inPageInject = false;
static bool processLoop ( void *state ) ;
static bool gotXmlDoc ( void *state ) ;
static bool sendErrorReply ( void *state , int32_t err ) ;
static bool sendPageParser2 ( TcpSocket *s ,
			      HttpRequest *r ,
			      class State8 *st ,
			      int64_t docId ,
			      Query *q ,
			      int64_t *termFreqs ,
			      float *termFreqWeights ,
			      float *affWeights ,
			      void *state ,
			      void (* callback)(void *state) ) ;

// . returns false if blocked, true otherwise
// . sets g_errno on error
@ -26,19 +117,19 @@ bool sendPageParser ( TcpSocket *s , HttpRequest *r ) {
// . a new interface so Msg3b can call this with "s" set to NULL
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageParser2 ( TcpSocket *s ,
		       HttpRequest *r ,
		       State8 *st ,
		       int64_t docId ,
		       Query *q ,
		       // in query term space, not imap space
		       int64_t *termFreqs ,
		       // in imap space
		       float *termFreqWeights ,
		       // in imap space
		       float *affWeights ,
		       void *state ,
		       void (* callback)(void *state) ) {
static bool sendPageParser2 ( TcpSocket *s ,
			      HttpRequest *r ,
			      State8 *st ,
			      int64_t docId ,
			      Query *q ,
			      // in query term space, not imap space
			      int64_t *termFreqs ,
			      // in imap space
			      float *termFreqWeights ,
			      // in imap space
			      float *affWeights ,
			      void *state ,
			      void (* callback)(void *state) ) {

	//log("parser: read sock=%"INT32"",s->m_sd);

91
PageParser.h
@ -16,95 +16,4 @@ extern bool g_inPageInject ;

bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) ;

bool sendPageParser2 ( TcpSocket *s ,
		       HttpRequest *r ,
		       class State8 *st ,
		       int64_t docId ,
		       Query *q ,
		       int64_t *termFreqs ,
		       float *termFreqWeights ,
		       float *affWeights ,
		       void *state ,
		       void (* callback)(void *state) ) ;

class State8 {
public:
	TopTree m_topTree;
	//Msg16 m_msg16;
	//Msg14 m_msg14;
	//Msg15 m_msg15;
	Msg22 m_msg22;
	SafeBuf m_dbuf;
	//XmlDoc m_doc;
	//Url m_url;
	//Url m_rootUrl;
	char *m_u;
	int32_t m_ulen;
	bool m_applyRulesetToRoot;
	char m_rootQuality;
	int32_t m_reparseRootRetries;
	char m_coll[MAX_COLL_LEN];
	int32_t m_collLen;
	//int32_t m_sfn;
	//int32_t m_urlLen;
	TcpSocket *m_s;
	bool m_isLocal;
	char m_pwd[32];
	HttpRequest m_r;
	int32_t m_old;
	// recycle the link info from the title rec?
	int32_t m_recycle;
	// recycle the link info that was imported from another coll?
	int32_t m_recycle2;
	int32_t m_render;
	char m_recompute;
	int32_t m_oips;
	char m_linkInfoColl[11];
	// char m_buf[16384 * 1024];

	//int32_t m_page;
	// m_pbuf now points to m_sbuf if we are showing the parsing junk
	SafeBuf m_xbuf;
	SafeBuf m_wbuf;
	bool m_donePrinting;
	//SafeBuf m_sbuf;
	// this is a buffer which cats m_sbuf into it
	//SafeBuf m_sbuf2;

	// new state vars for Msg3b.cpp
	int64_t m_docId;
	void *m_state ;
	void (* m_callback) (void *state);
	Query m_tq;
	Query *m_q;
	int64_t *m_termFreqs;
	float *m_termFreqWeights;
	float *m_affWeights;
	//score_t m_total;
	bool m_freeIt;
	bool m_blocked;

	// these are from rearranging the code
	int32_t m_indexCode;
	//uint64_t m_chksum1;
	int64_t m_took1;
	int64_t m_took1b;
	int64_t m_took2;
	int64_t m_took3;

	char m_didRootDom;
	char m_didRootWWW;
	char m_wasRootDom;

	// call Msg16 with a version of title rec to do
	int32_t m_titleRecVersion;

	char m_hopCount;

	//TitleRec m_tr;

	//XmlDoc m_oldDoc;
	XmlDoc m_xd;
};

#endif
@ -15,7 +15,6 @@
#include "Msg40.h"
#include "sort.h"
#include "Spider.h"
#include "Revdb.h"
#include "XmlDoc.h"
#include "PageInject.h" // Msg7
#include "PageReindex.h"

@ -42,64 +42,4 @@ public:
	Query m_qq;
};

/*
// . for indexing tags for events after you add to tagdb
// . created so zak can very quickly tag eventids that are already indexed
// . will just add the tag terms directly to datedb for the eventid
class Msg1d {

public:

	bool updateQuery ( char *query ,
			   HttpRequest *r,
			   TcpSocket *sock,
			   char *coll ,
			   int32_t startNum ,
			   int32_t endNum ,
			   void *state ,
			   void (* callback) (void *state ) ) ;

	bool updateTagTerms ( ) ;

	bool getMetaList ( int64_t docId ,
			   int32_t eventId ,
			   TagRec *egr ,
			   RdbList *oldList ,
			   int32_t niceness ,
			   SafeBuf *addBuf ) ;

	void *m_state;
	void (* m_callback) (void *state);

	Msg40 m_msg40;
	SearchInput m_si;
	int32_t m_startNum;
	int32_t m_endNum;
	int32_t m_numDocIds;
	int32_t m_i;
	Msg12 m_msg12;
	Msg8a m_msg8a;
	Msg0 m_msg0;
	char *m_coll;
	int32_t m_niceness;
	TagRec m_tagRec;
	RdbList m_revdbList;
	SafeBuf m_addBuf;
	SafeBuf m_rr;
	char *m_metaList;
	int32_t m_metaListSize;
	Msg4 m_msg4;
	Query m_qq;

	Url m_fakeUrl;

	int32_t m_gotLock;
	int32_t m_gotTagRec;
	int32_t m_gotRevdbRec;
	int32_t m_madeList;
	int32_t m_addedList;
	int32_t m_removeLock;
	int32_t m_flushedList;
};
*/
#endif
684
PageResults.cpp
@ -19,17 +19,15 @@
#include "LanguageIdentifier.h"
#include "CountryCode.h"
#include "Unicode.h"
#include "XmlDoc.h" // GigabitInfo class
#include "Posdb.h" // MAX_TOP definition
#include "PageResults.h"
#include "PageRoot.h"
#include "Proxy.h"

static bool printSearchFiltersBar ( SafeBuf *sb , HttpRequest *hr ) ;
static bool printMenu ( SafeBuf *sb , int32_t menuNum , HttpRequest *hr ) ;

//static void gotSpellingWrapper ( void *state ) ;
static void gotResultsWrapper ( void *state ) ;
//static void gotAdsWrapper ( void *state ) ;
static void gotState ( void *state ) ;
static bool gotResults ( void *state ) ;

@ -163,34 +161,6 @@ bool sendReply ( State0 *st , char *reply ) {
	mdelete(st, sizeof(State0), "PageResults2");
	delete st;

	/*
	if ( format == FORMAT_XML ) {
		SafeBuf sb;
		sb.safePrintf("<?xml version=\"1.0\" "
			      "encoding=\"UTF-8\" ?>\n"
			      "<response>\n"
			      "\t<errno>%"INT32"</errno>\n"
			      "\t<errmsg>%s</errmsg>\n"
			      "</response>\n"
			      ,(int32_t)savedErr
			      ,mstrerror(savedErr)
			      );
		// clear it for sending back
		g_errno = 0;
		// send back as normal reply
		g_httpServer.sendDynamicPage(s,
					     sb.getBufStart(),
					     sb.length(),
					     0, // cachetime in secs
					     false, // POSTReply?
					     ct,
					     -1, // httpstatus -1 -> 200
					     NULL, // cookieptr
					     charset );
		return true;
	}
	*/

	// if we had a broken pipe from the browser while sending
	// them the search results, then we end up closing the socket fd
	// in TcpServer::sendChunk() > sendMsg() > destroySocket()
@ -332,15 +302,6 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
	// propagate "n"
	int32_t n = hr->getLong("n",-1);
	if ( n >= 0 ) sb.safePrintf("&n=%"INT32"",n);
	// Docs to Scan for Related Topics
	int32_t dsrt = hr->getLong("dsrt",-1);
	if ( dsrt >= 0 ) sb.safePrintf("&dsrt=%"INT32"",dsrt);
	// debug gigabits?
	int32_t dg = hr->getLong("dg",-1);
	if ( dg >= 0 ) sb.safePrintf("&dg=%"INT32"",dg);
	// show gigabits?
	//int32_t gb = hr->getLong("gigabits",1);
	//if ( gb >= 1 ) sb.safePrintf("&gigabits=%"INT32"",gb);
	// show banned results?
	int32_t showBanned = hr->getLong("sb",0);
	if ( showBanned ) sb.safePrintf("&sb=1");
@ -367,12 +328,6 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
		      , h32
		      , rand64
		      );
	//
	// . login bar
	// . proxy will replace it byte by byte with a login/logout
	//   link etc.
	//
	//g_proxy.insertLoginBarDirective(&sb);

	//
	// logo header
@ -392,23 +347,6 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
		      "}}\n"

		      // gigabit unhide function
		      "function ccc ( gn ) {\n"
		      "var e = document.getElementById('fd'+gn);\n"
		      "var f = document.getElementById('sd'+gn);\n"
		      "if ( e.style.display == 'none' ){\n"
		      "e.style.display = '';\n"
		      "f.style.display = 'none';\n"
		      "}\n"
		      "else {\n"
		      "e.style.display = 'none';\n"
		      "f.style.display = '';\n"
		      "}\n"
		      "}\n"
		      "</script>\n"

		      // put search results into this div
		      "<div id=results>"
		      "<img height=50 width=50 "
@ -623,259 +561,6 @@ void gotState ( void *state ){
	gotResults ( state );
}

// print all sentences containing this gigabit (fast facts) (nuggabits)
static bool printGigabitContainingSentences ( State0 *st,
					      SafeBuf *sb ,
					      Msg40 *msg40 ,
					      Gigabit *gi ,
					      SearchInput *si ,
					      Query *gigabitQuery ,
					      int32_t gigabitId ) {
	char format = si->m_format;

	HttpRequest *hr = &st->m_hr;
	CollectionRec *cr = si->m_cr;//g_collectiondb.getRec(collnum );

	int32_t numOff;
	int32_t revert;
	int32_t spaceOutOff;

	if ( format == FORMAT_HTML ) {
		sb->safePrintf("<nobr><b>");

		// make a new query
		sb->safePrintf("<a href=\"/search?c=%s&q=",cr->m_coll);
		sb->urlEncode(gi->m_term,gi->m_termLen);
		sb->safeMemcpy("+|+",3);
		char *q = hr->getString("q",NULL,"");
		sb->urlEncode(q);
		sb->safePrintf("\">");
		sb->safeMemcpy(gi->m_term,gi->m_termLen);
		sb->safePrintf("</a></b>");
		sb->safePrintf(" <font color=gray size=-1>");
		numOff = sb->m_length;
		sb->safePrintf(" ");//,gi->m_numPages);
		sb->safePrintf("</font>");
		sb->safePrintf("</b>");

		revert = sb->length();

		sb->safePrintf("<font color=blue style=align:right;>"
			       "<a style=cursor:hand;cursor:pointer; "
			       "onclick=ccc(%"INT32");>"
			       , gigabitId // s_gigabitCount
			       );
		spaceOutOff = sb->length();
		sb->safePrintf( "%c%c%c",
				0xe2,
				0x87,
				0x93);
		sb->safePrintf(//"[more]"
			       "</a></font>");

		sb->safePrintf("</nobr>"); // <br>
	}

	if ( format == FORMAT_XML ) {
		sb->safePrintf("\t\t<gigabit>\n");
		sb->safePrintf("\t\t\t<term><![CDATA[");
		sb->cdataEncode(gi->m_term,gi->m_termLen);
		sb->safePrintf("]]></term>\n");
		sb->safePrintf("\t\t\t<score>%f</score>\n",gi->m_gbscore);
		sb->safePrintf("\t\t\t<minPop>%"INT32"</minPop>\n",gi->m_minPop);
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("\t{\n");
		//sb->safePrintf("\t\"gigabit\":{\n");
		sb->safePrintf("\t\t\"term\":\"");
		sb->jsonEncode(gi->m_term,gi->m_termLen);
		sb->safePrintf("\",\n");
		sb->safePrintf("\t\t\"score\":%f,\n",gi->m_gbscore);
		sb->safePrintf("\t\t\"minPop\":%"INT32",\n",gi->m_minPop);
	}

	// get facts
	int32_t numNuggets = 0;
	int32_t numFacts = msg40->m_factBuf.length() / sizeof(Fact);
	Fact *facts = (Fact *)msg40->m_factBuf.getBufStart();
	bool first = true;
	bool second = false;
	bool printedSecond = false;
	//int64_t lastDocId = -1LL;
	int32_t saveOffset = 0;
	for ( int32_t i = 0 ; i < numFacts ; i++ ) {
		Fact *fi = &facts[i];

		// if printed for a higher scoring gigabit, skip
		if ( fi->m_printed ) continue;

		// check gigabit match
		int32_t k; for ( k = 0 ; k < fi->m_numGigabits ; k++ )
			if ( fi->m_gigabitPtrs[k] == gi ) break;
		// skip this fact/sentence if it does not contain the gigabit
		if ( k >= fi->m_numGigabits ) continue;

		// do not print if no period at end
		char *s = fi->m_fact;
		char *e = s + fi->m_factLen;
		if ( e[-1] != '*' ) continue;
		e--;

	again:

		// first time, print in the single fact div
		if ( first && format == FORMAT_HTML ) {
			sb->safePrintf("<div "
				       //"style=\"border:1px lightgray solid;\"
				       "id=fd%"INT32">",gigabitId);//s_gigabitCount);
		}

		if ( second && format == FORMAT_HTML ) {
			sb->safePrintf("<div style=\"max-height:300px;"
				       "display:none;"
				       "overflow-x:hidden;"
				       "overflow-y:auto;"//scroll;"
				       //"border:1px lightgray solid; "
				       "\" "
				       "id=sd%"INT32">",gigabitId);//s_gigabitCount);
			printedSecond = true;
		}

		Msg20Reply *reply = fi->m_reply;

		// ok, print it out
		if ( ! first && ! second && format == FORMAT_HTML ) {
			sb->safePrintf("<br><br>\n");
		}

		numNuggets++;

		// let's highlight with gigabits and query terms
		SafeBuf tmpBuf;
		Highlight h;
		h.set ( &tmpBuf , s , e - s , gigabitQuery, "<u>", "</u>", 0 );

		// now highlight the original query as well but in black bold
		SafeBuf tmpBuf2;
		h.set ( &tmpBuf2, tmpBuf.getBufStart(), tmpBuf.length(), &si->m_q, "<b>", "</b>", 0 );

		int32_t dlen; char *dom = getDomFast(reply->ptr_ubuf,&dlen);

		// print the sentence
		if ( format == FORMAT_HTML )
			sb->safeStrcpy(tmpBuf2.getBufStart());

		if ( format == FORMAT_XML ) {
			sb->safePrintf("\t\t\t<instance>\n"
				       "\t\t\t\t<sentence><![CDATA[");
			sb->cdataEncode(tmpBuf2.getBufStart());
			sb->safePrintf("]]></sentence>\n");
			sb->safePrintf("\t\t\t\t<url><![CDATA[");
			sb->cdataEncode(reply->ptr_ubuf);
			sb->safePrintf("]]></url>\n");
			sb->safePrintf("\t\t\t\t<domain><![CDATA[");
			sb->cdataEncode(dom,dlen);
			sb->safePrintf("]]></domain>\n");
			sb->safePrintf("\t\t\t</instance>\n");
		}

		if ( format == FORMAT_JSON ) {
			sb->safePrintf("\t\t\"instance\":{\n"
				       "\t\t\t\"sentence\":\"");
			sb->jsonEncode(tmpBuf2.getBufStart());
			sb->safePrintf("\",\n");

			sb->safePrintf("\t\t\t\"url\":\"");
			sb->jsonEncode(reply->ptr_ubuf);
			sb->safePrintf("\",\n");

			sb->safePrintf("\t\t\t\"domain\":\"");
			sb->jsonEncode(dom,dlen);
			sb->safePrintf("\"\n");
			sb->safePrintf("\t\t},\n");
		}

		fi->m_printed = 1;
		saveOffset = sb->length();
		if ( format == FORMAT_HTML ) {
			sb->safePrintf(" <a href=/get?c=%s&cnsp=0&"
				       "strip=0&d=%"INT64">",
				       cr->m_coll,reply->m_docId);
			sb->safeMemcpy(dom,dlen);
			sb->safePrintf("</a>\n");
			sb->safePrintf("</div>");
		}

		if ( second ) {
			second = false;
		}

		if ( first ) {
			first = false;
			second = true;

			// print first gigabit all over again but in 2nd div
			goto again;
		}
	}

	if ( format == FORMAT_XML )
		sb->safePrintf("\t\t</gigabit>\n");

	if ( format == FORMAT_JSON ) {
		// remove last ,\n
		sb->m_length -= 2;
		// replace with just \n
		// end the gigabit
		sb->safePrintf("\n\t},\n");
	}

	// all done if not html
	if ( format != FORMAT_HTML )
		return true;

	// we counted the first one twice since we had to throw it into
	// the hidden div too!
	if ( numNuggets > 1 ) numNuggets--;

	// do not print the double down arrow if no nuggets printed
	if ( numNuggets <= 0 ) {
		sb->m_length = revert;
		sb->safePrintf("</nobr>");
	}
	// just remove down arrow if only 1...
	else if ( numNuggets == 1 ) {
		char *dst = sb->getBufStart()+spaceOutOff;
		dst[0] = ' ';
		dst[1] = ' ';
		dst[2] = ' ';
	}
	// store the # of nuggets in ()'s like (10 )
	else {
		char tmp[10];
		sprintf(tmp,"(%"INT32")",numNuggets);
		char *src = tmp;
		// start storing digits after "( "
		char *dst = sb->getBufStart()+numOff;
		int32_t srcLen = gbstrlen(tmp);
		if ( srcLen > 5 ) srcLen = 5;
		for ( int32_t k = 0 ; k < srcLen ; k++ )
			dst[k] = src[k];
	}

	if ( printedSecond ) {
		sb->safePrintf("</div>");
	}

	return true;
}
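Note the JSON pattern this function uses twice: every element is written with a trailing ",\n", and the final separator is trimmed by backing sb->m_length up two bytes before closing. In isolation, and assuming a SafeBuf written that way:

// illustrative: trailing-separator trim used by the JSON branches above
sb->safePrintf ( "\t{ ... },\n" );            // per element, separator included
if ( sb->length() >= 2 ) sb->m_length -= 2;   // drop the final ",\n"
sb->safePrintf ( "\n\t]\n" );                 // then close the array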

// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
@ -1045,22 +730,6 @@ bool gotResults ( void *state ) {
		return true;
	}

// defined in PageRoot.cpp
bool expandHtml ( SafeBuf& sb,
		  char *head ,
		  int32_t hlen ,
		  char *q ,
		  int32_t qlen ,
		  HttpRequest *r ,
		  SearchInput *si,
		  char *method ,
		  CollectionRec *cr ) ;

bool printLeftColumnRocketAndTabs ( SafeBuf *sb,
				    bool isSearchResultsPage ,
				    CollectionRec *cr ,
				    char *tabName );

bool printLeftNavColumn ( SafeBuf &sb, State0 *st ) {

@ -1135,271 +804,26 @@ bool printLeftNavColumn ( SafeBuf &sb, State0 *st ) {
	// . tabName = "search"
	printLeftColumnRocketAndTabs ( &sb , true , cr , "search" );

	}
	//
	// BEGIN FACET PRINTING
	//
	//
	// . print out one table for each gbfacet: term in the query
	// . LATER: show the text string corresponding to the hash
	//   by looking it up in the titleRec
	//
	msg40->printFacetTables ( &sb );
	//
	// END FACET PRINTING
	//

	//
	// BEGIN FACET PRINTING
	//
	//
	// . print out one table for each gbfacet: term in the query
	// . LATER: show the text string corresponding to the hash
	//   by looking it up in the titleRec
	//
	if ( format == FORMAT_HTML ) msg40->printFacetTables ( &sb );
	//
	// END FACET PRINTING
	//

	//
	// BEGIN PRINT GIGABITS
	//

	SafeBuf *gbuf = &msg40->m_gigabitBuf;
	int32_t numGigabits = gbuf->length()/sizeof(Gigabit);

	if ( ! st->m_header )
		numGigabits = 0;

	// print gigabits
	Gigabit *gigabits = (Gigabit *)gbuf->getBufStart();

	if ( numGigabits && format == FORMAT_XML )
		sb.safePrintf("\t<gigabits>\n");

	if ( numGigabits && format == FORMAT_JSON )
		sb.safePrintf("\"gigabits\":[\n");

	if ( numGigabits && format == FORMAT_HTML )
		// gigabit unhide function
		sb.safePrintf (
			       "<script>"
			       "function ccc ( gn ) {\n"
			       "var e = document.getElementById('fd'+gn);\n"
			       "var f = document.getElementById('sd'+gn);\n"
			       "if ( e.style.display == 'none' ){\n"
			       "e.style.display = '';\n"
			       "f.style.display = 'none';\n"
			       "}\n"
			       "else {\n"
			       "e.style.display = 'none';\n"
			       "f.style.display = '';\n"
			       "}\n"
			       "}\n"
			       "</script>\n"
			       );

	if ( numGigabits && format == FORMAT_HTML )
		sb.safePrintf("<div id=gigabits "
			      "style="
			      "padding:5px;"
			      "position:relative;"
			      "border-width:3px;"
			      "border-right-width:0px;"
			      "border-style:solid;"
			      "margin-left:10px;"
			      "border-top-left-radius:10px;"
			      "border-bottom-left-radius:10px;"
			      "border-color:blue;"
			      "background-color:white;"
			      "border-right-color:white;"
			      "margin-right:-3px;"
			      ">"
			      "<table cellspacing=7>"
			      "<tr><td width=200px; valign=top>"
			      "<center><img src=/gigabits40.jpg></center>"
			      "<br>"
			      "<br>"
			      );

	Query gigabitQuery;
	char tmp[1024];
	SafeBuf ttt(tmp, 1024);
	// limit it to 40 gigabits for now
	for ( int32_t i = 0 ; i < numGigabits && i < 40 ; i++ ) {
		Gigabit *gi = &gigabits[i];
		ttt.pushChar('\"');
		ttt.safeMemcpy(gi->m_term,gi->m_termLen);
		ttt.pushChar('\"');
		ttt.pushChar(' ');
	}
	// null term it
	ttt.nullTerm();

	if ( numGigabits > 0 )
		gigabitQuery.set2 ( ttt.getBufStart() ,
				    si->m_queryLangId ,
				    true , // queryexpansion?
				    true ); // usestopwords?

	for ( int32_t i = 0 ; i < numGigabits ; i++ ) {
		//if ( i > 0 && format == FORMAT_HTML )
		//	sb.safePrintf("<hr>");
		//if ( perRow && (i % perRow == 0) )
		//	sb.safePrintf("</td><td valign=top>");
		// print all sentences containing this gigabit
		Gigabit *gi = &gigabits[i];
		// after the first 3 hide them with a more link
		if ( i == 1 && format == FORMAT_HTML ) {
			sb.safePrintf("</span><a onclick="
				      "\""
				      "var e = "
				      "document.getElementById('hidegbits');"
				      "if ( e.style.display == 'none' ){\n"
				      "e.style.display = '';\n"
				      "this.innerHtml='Show less';"
				      "}"
				      "else {\n"
				      "e.style.display = 'none';\n"
				      "this.innerHtml='Show more';\n"
				      "}\n"
				      "\" style=cursor:hand;cursor:pointer;>"
				      "Show more</a>");
			sb.safePrintf("<span id=hidegbits "
				      "style=display:none;>"
				      "<br><br>");
		}

		printGigabitContainingSentences( st, &sb, msg40, gi, si, &gigabitQuery, i );
		if ( format == FORMAT_HTML )
			sb.safePrintf("<br><br>");
	}

	//if ( numGigabits >= 1 && format == FORMAT_HTML )

	if ( numGigabits && format == FORMAT_HTML )
		sb.safePrintf("</td></tr></table></div><br>");

	if ( numGigabits && format == FORMAT_XML )
		sb.safePrintf("\t</gigabits>\n");

	if ( numGigabits && format == FORMAT_JSON ) {
		// remove ,\n
		sb.m_length -=2;
		// add back just \n
		// end the gigabits array
		sb.safePrintf("\n],\n");
	}

	//
	// now print various knobs
	//

	//
	// print date constraint functions now
	//
	if ( format == FORMAT_HTML && 1 == 2)
		sb.safePrintf(
			      "<div id=best "
			      "style="
			      "font-size:14px;"
			      "padding:5px;"
			      "position:relative;"
			      "border-width:3px;"
			      "border-right-width:0px;"
			      "border-style:solid;"
			      "margin-left:10px;"
			      "border-top-left-radius:10px;"
			      "border-bottom-left-radius:10px;"
			      "border-color:blue;"
			      "background-color:white;"
			      "border-right-color:white;"
			      "margin-right:-3px;"
			      "text-align:right;"
			      ">"
			      "<b>"
			      "ANYTIME "
			      "</b>"
			      "</div>"

			      "<br>"

			      "<div id=newsest "
			      "style="
			      "font-size:14px;"
			      "padding:5px;"
			      "position:relative;"
			      "border-width:3px;"
			      "border-right-width:0px;"
			      "border-style:solid;"
			      "margin-left:10px;"
			      "border-top-left-radius:10px;"
			      "border-bottom-left-radius:10px;"
			      "border-color:white;"
			      "background-color:blue;"
			      "border-right-color:blue;"
			      "margin-right:0px;"
			      "text-align:right;"
			      "color:white;"
			      ">"
			      "<b>"
			      "LAST 24 HOURS "
			      "</b>"
			      "</div>"

			      "<br>"

			      "<div id=newsest "
			      "style="
			      "font-size:14px;"
			      "padding:5px;"
			      "position:relative;"
			      "border-width:3px;"
			      "border-right-width:0px;"
			      "border-style:solid;"
			      "margin-left:10px;"
			      "border-top-left-radius:10px;"
			      "border-bottom-left-radius:10px;"
			      "border-color:white;"
			      "background-color:blue;"
			      "border-right-color:blue;"
			      "margin-right:0px;"
			      "text-align:right;"
			      "color:white;"
			      ">"
			      "<b>"
			      "LAST 7 DAYS "
			      "</b>"
			      "</div>"
			      "<br>"

			      "<div id=newsest "
			      "style="
			      "font-size:14px;"
			      "padding:5px;"
			      "position:relative;"
			      "border-width:3px;"
			      "border-right-width:0px;"
			      "border-style:solid;"
			      "margin-left:10px;"
			      "border-top-left-radius:10px;"
			      "border-bottom-left-radius:10px;"
			      "border-color:white;"
			      "background-color:blue;"
			      "border-right-color:blue;"
			      "margin-right:0px;"
			      "text-align:right;"
			      "color:white;"
			      ">"
			      "<b>"
			      "LAST 30 DAYS "
			      "</b>"
			      "</div>"
			      "<br>"

			      );

	//
	// now the MAIN column
	//
	if ( format == FORMAT_HTML )
		//
		// now the MAIN column
		//
		sb.safePrintf("\n</TD>"
			      "<TD valign=top style=padding-left:30px;>\n");
	}

	return true;
}
@ -1525,7 +949,6 @@ bool printSearchResultsHeader ( State0 *st ) {
		if ( header ) sb->safeStrcpy ( header );
	}

	// this also prints gigabits and nuggabits
	// if we are xml/json we call this below otherwise we lose
	// the header of <?xml...> or whatever
	if ( ! g_conf.m_isMattWells && si->m_format == FORMAT_HTML ) {
@ -1962,12 +1385,6 @@ bool printSearchResultsHeader ( State0 *st ) {
	     st->m_header )
		msg40->printFacetTables ( sb );

	// now print gigabits if we are xml/json
	if ( si->m_format != FORMAT_HTML ) {
		// this will print gigabits
		printLeftNavColumn ( *sb,st );
	}

	// global-index is not a custom crawl but we should use "objects"
	bool isDiffbot = cr->m_isCustomCrawl;
	if ( strcmp(cr->m_coll,"GLOBAL-INDEX") == 0 ) isDiffbot = true;
@ -2822,12 +2239,6 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
		return true;
	}

	// . if section voting info was requested, display it now; it's in json
	// . so if in csv it will mess things up!!!
	if ( mr->ptr_sectionVotingInfo )
		// it is possible this is just "\0"
		sb->safeStrcpy ( mr->ptr_sectionVotingInfo );

	// each "result" is the actual cached page, in this case, a json
	// object, because we were called with &icc=1. in that situation
	// ptr_content is set in the msg20reply.
@ -5202,7 +4613,6 @@ bool printSingleScore ( SafeBuf *sb, SearchInput *si, SingleScore *ss, Msg20Repl
		wbw = WIKI_BIGRAM_WEIGHT;
	}
	float hgw = getHashGroupWeight(ss->m_hashGroup);
	//float dvw = getDiversityWeight(ss->m_diversityRank);
	float dnw = getDensityWeight(ss->m_densityRank);
	float wsw = getWordSpamWeight(ss->m_wordSpamRank);
	// HACK for inlink text!
@ -5508,13 +4918,6 @@ bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr,

// if catId >= 1 then print the dmoz radio button
bool printLogoAndSearchBox ( SafeBuf *sb, HttpRequest *hr, SearchInput *si ) {
	char *root = "";

	if ( g_conf.m_isMattWells )
		root = "http://www.gigablast.com";

	// now make a TABLE, left PANE contains gigabits and stuff

	char *coll = hr->getString("c");
	if ( ! coll ) coll = "";
@ -6323,53 +5726,6 @@ bool printSearchFiltersBar ( SafeBuf *sb , HttpRequest *hr ) {
	s_mi[n].m_icon = NULL;
	n++;

#ifdef SUPPORT_FACETS
	// BR 20160801: Disabled by default

	s_mi[n].m_menuNum = 4;
	s_mi[n].m_title = "Language facet";
	s_mi[n].m_cgi = "facet=gbfacetint:gblang";
	s_mi[n].m_icon = NULL;
	n++;

	s_mi[n].m_menuNum = 4;
	s_mi[n].m_title = "Content type facet";
	s_mi[n].m_cgi = "facet=gbfacetstr:type";
	s_mi[n].m_icon = NULL;
	n++;

	s_mi[n].m_menuNum = 4;
	s_mi[n].m_title = "Url path depth";
	s_mi[n].m_cgi = "facet=gbfacetint:gbpathdepth";
	s_mi[n].m_icon = NULL;
	n++;

	s_mi[n].m_menuNum = 4;
	s_mi[n].m_title = "Spider date facet";
	s_mi[n].m_cgi = "facet=gbfacetint:gbspiderdate";
	s_mi[n].m_icon = NULL;
	n++;

	// everything in tagdb is hashed
	s_mi[n].m_menuNum = 4;
	s_mi[n].m_title = "Site num inlinks facet";
	s_mi[n].m_cgi = "facet=gbfacetint:gbtagsitenuminlinks";
	s_mi[n].m_icon = NULL;
	n++;

	// s_mi[n].m_menuNum = 4;
	// s_mi[n].m_title = "Domains facet";
	// s_mi[n].m_cgi = "facet=gbfacetint:gbdomhash";
	// n++;

	s_mi[n].m_menuNum = 4;
	s_mi[n].m_title = "Hopcount facet";
	s_mi[n].m_cgi = "facet=gbfacetint:gbhopcount";
	s_mi[n].m_icon = NULL;
	n++;
#endif

	// output
	s_mi[n].m_menuNum = 5;
	s_mi[n].m_title = "Output HTML";
@ -6600,10 +5956,8 @@ bool printSearchFiltersBar ( SafeBuf *sb , HttpRequest *hr ) {
		// after 4 make a new line
		if ( i == 5 ) sb->safePrintf("<br><br>");
		if ( i == 9 ) sb->safePrintf("<br><br>");

#ifndef SUPPORT_FACETS
		if( i == 4 ) continue;
#endif

		printMenu ( sb , i , hr );
	}
27
PageRoot.cpp
@ -1,5 +1,6 @@
#include "gb-include.h"

#include "PageRoot.h"
#include "Indexdb.h" // makeKey(int64_t docId)
#include "Titledb.h"
#include "Spider.h"
@ -21,7 +22,7 @@ bool sendPageRoot ( TcpSocket *s, HttpRequest *r ){
	return sendPageRoot ( s, r, NULL );
}

bool printNav ( SafeBuf &sb , HttpRequest *r ) {
static bool printNav ( SafeBuf &sb , HttpRequest *r ) {
	sb.safePrintf("</TD></TR></TABLE>"
		      "</body></html>");
	return true;
@ -33,7 +34,7 @@ bool printNav ( SafeBuf &sb , HttpRequest *r ) {
//
//////////////

bool printFamilyFilter ( SafeBuf& sb , bool familyFilterOn ) {
static bool printFamilyFilter ( SafeBuf& sb , bool familyFilterOn ) {
	char *s1 = "";
	char *s2 = "";
	if ( familyFilterOn ) s1 = " checked";
@ -49,7 +50,7 @@ bool printFamilyFilter ( SafeBuf& sb , bool familyFilterOn ) {

#include "SearchInput.h"

bool printRadioButtons ( SafeBuf& sb , SearchInput *si ) {
static bool printRadioButtons ( SafeBuf& sb , SearchInput *si ) {
	// don't display this for directory search
	// look it up. returns catId <= 0 if dmoz not setup yet.
	// From PageDirectory.cpp
@ -127,7 +128,7 @@ bool printRadioButtons ( SafeBuf& sb , SearchInput *si ) {
	return true;
}

bool printLogo ( SafeBuf& sb , SearchInput *si ) {
static bool printLogo ( SafeBuf& sb , SearchInput *si ) {
	// if an image was provided...
	if ( ! si->m_imgUrl || ! si->m_imgUrl[0] ) {
		// no, now we default to our logo
@ -172,7 +173,7 @@ bool printLogo ( SafeBuf& sb , SearchInput *si ) {

bool expandHtml ( SafeBuf& sb,
		  char *head ,
		  const char *head ,
		  int32_t hlen ,
		  char *q ,
		  int32_t qlen ,
@ -433,7 +434,7 @@ bool expandHtml ( SafeBuf& sb,
bool printLeftColumnRocketAndTabs ( SafeBuf *sb ,
				    bool isSearchResultsPage ,
				    CollectionRec *cr ,
				    char *tabName ) {
				    const char *tabName ) {

class MenuItem {
public:
@ -661,7 +662,7 @@ bool printLeftColumnRocketAndTabs ( SafeBuf *sb ,
	return true;
}

bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr ,
bool printFrontPageShell ( SafeBuf *sb , const char *tabName , CollectionRec *cr ,
			   bool printGigablast ) {

	sb->safePrintf("<html>\n");
@ -670,7 +671,7 @@ bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr ,
	sb->safePrintf("<meta name=\"description\" content=\"A powerful, new search engine that does real-time indexing!\">\n");
	sb->safePrintf("<meta name=\"keywords\" content=\"search, search engine, search engines, search the web, fresh index, green search engine, green search, clean search engine, clean search\">\n");

	char *title = "An Alternative Open Source Search Engine";
	const char *title = "An Alternative Open Source Search Engine";
	if ( strcasecmp(tabName,"search") ) {
		title = tabName;
	}
@ -736,7 +737,7 @@ bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr ,
	return true;
}

bool printWebHomePage ( SafeBuf &sb , HttpRequest *r , TcpSocket *sock ) {
static bool printWebHomePage ( SafeBuf &sb , HttpRequest *r , TcpSocket *sock ) {
	SearchInput si;
	si.set ( sock , r );

@ -870,7 +871,7 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r , TcpSocket *sock ) {
	return true;
}

bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
static bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {

	CollectionRec *cr = g_collectiondb.getRec ( r );

@ -1167,7 +1168,7 @@ public:
// only allow up to 1 Msg10's to be in progress at a time
static bool s_inprogress = false;

void doneInjectingWrapper3 ( void *st ) ;
static void doneInjectingWrapper3 ( void *st ) ;

// . returns false if blocked, true otherwise
// . sets g_errno on error
@ -1438,7 +1439,7 @@ bool sendPageAddUrl ( TcpSocket *sock , HttpRequest *hr ) {
}

void doneInjectingWrapper3 ( void *st ) {
static void doneInjectingWrapper3 ( void *st ) {
	State1i *st1 = (State1i *)st;
	// allow others to add now
	s_inprogress = false;
@ -1645,7 +1646,7 @@ void doneInjectingWrapper3 ( void *st ) {
static HashTable s_htable;
static bool s_init = false;
static int32_t s_lastTime = 0;
bool canSubmit ( uint32_t h , int32_t now , int32_t maxAddUrlsPerIpDomPerDay ) {
static bool canSubmit ( uint32_t h , int32_t now , int32_t maxAddUrlsPerIpDomPerDay ) {
	// . sometimes no limit
	// . 0 means no limit because if they don't want any submission they
	//   can just turn off add url and we want to avoid excess
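canSubmit() enforces the per-IP-domain daily add-url quota, and the comment pins down one edge case: a max of 0 means unlimited. A self-contained sketch of the counting logic it implies (the std::map stand-in and the daily-reset handling are assumptions, not this function's actual body, which uses the static HashTable above):

// hypothetical sketch of a per-IP-domain daily submission cap
#include <map>
static std::map<uint32_t,int32_t> s_counts; // ip-domain hash -> count today
static bool canSubmitSketch ( uint32_t h , int32_t maxPerDay ) {
	if ( maxPerDay == 0 ) return true;      // 0 means no limit
	int32_t &count = s_counts[h];
	if ( count >= maxPerDay ) return false; // quota hit for today
	count++;
	return true;
}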

29
PageRoot.h
Normal file
@ -0,0 +1,29 @@
#ifndef PAGEROOT_H_
#define PAGEROOT_H_

#include "SafeBuf.h"
#include "Collectiondb.h"
class SearchInput;

bool printFrontPageShell ( SafeBuf *sb,
			   const char *tabName,
			   CollectionRec *cr,
			   bool printGigablast );

bool expandHtml ( SafeBuf& sb,
		  const char *head ,
		  int32_t hlen ,
		  char *q ,
		  int32_t qlen ,
		  HttpRequest *r ,
		  SearchInput *si,
		  char *method ,
		  CollectionRec *cr );

bool printLeftColumnRocketAndTabs ( SafeBuf *sb ,
				    bool isSearchResultsPage ,
				    CollectionRec *cr ,
				    const char *tabName );

#endif
@ -21,27 +21,12 @@ bool sendPageSockets ( TcpSocket *s , HttpRequest *r ) {
	// don't allow pages bigger than 128k in cache
	char buf [ 128*1024 ];
	SafeBuf p(buf, 128*1024);
	//char *bufEnd = buf + 256*1024;
	// a ptr into "buf"
	// password, too
	//int32_t pwdLen = 0;
	//char *pwd = r->getString ( "pwd" , &pwdLen );
	//if ( pwdLen > 31 ) pwdLen = 31;
	//if ( pwd ) pwd[pwdLen]='\0';
	int32_t collLen = 0;
	char *coll = r->getString( "c", &collLen );
	if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN;
	if ( coll ) coll[collLen] = '\0';
	//char pbuf [32];
	//if ( pwdLen > 0 ) strncpy ( pbuf , pwd , pwdLen );
	//pbuf[pwdLen]='\0';
	// print standard header

	// char *ss = p.getBuf();
	// char *ssend = p.getBufEnd();
	g_pages.printAdminTop ( &p, s , r );
	//p.incrementLength(sss - ss);

	// now print out the sockets table for each tcp server we have
	printTcpTable(&p,"HTTP Server" ,g_httpServer.getTcp());
@ -62,66 +47,6 @@ bool sendPageSockets ( TcpSocket *s , HttpRequest *r ) {
		if ( m == 0 ) count++;
	}

	/*
	sprintf ( p , "<table width=100%% bgcolor=#d0d0f0 border=1>"
		      "<tr><td bgcolor=#c0c0f0 colspan=%"INT32">"
		      "<center><font size=+1><b>Wait Times</b></font>"
		      "</td></tr>\n" , 3 + count );
	p += gbstrlen ( p );
	// print columns
	sprintf ( p ,
		  "<tr>"
		  "<td><b>machine #</b></td>"
		  "<td><b>send wait</b></td>"
		  "<td><b>read wait</b></td>" );
	p += gbstrlen ( p );
	// print disk columns
	for ( int32_t i = 0 ; i < count ; i++ ) {
		sprintf ( p , "<td><b>disk %"INT32" wait</b></td>",i);
		p += gbstrlen ( p );
	}
	// end the top row
	sprintf ( p , "</tr>\n" );
	p += gbstrlen ( p );
	// print rows
	for ( int32_t i = 0 ; i < g_hostdb.getNumMachines() ; i++ ) {
		// print machine #
		sprintf ( p , "<tr><td><b>%"INT32"</b></td>",i);
		p += gbstrlen ( p );
		// then net send
		float x = (float)g_queryRouter.m_sendWaits[i] / 1000;
		sprintf ( p , "<td>%.1fms</td>", x );
		p += gbstrlen ( p );
		// then net read
		x = (float)g_queryRouter.m_readWaits[i] / 1000;
		sprintf ( p , "<td>%.1fms</td>", x );
		p += gbstrlen ( p );
		// print disk wait in milliseconds (it's in microseconds)
		// find any host that matches this machine
		for ( int32_t j = 0 ; j < g_hostdb.getNumHosts() ; j++ ) {
			// use in order of ip
			int32_t hid = g_hostdb.m_hostPtrs[j]->m_hostId;
			// get machine #
			int32_t m = g_hostdb.getMachineNum(hid);
			// skip if no match
			if ( m != i ) continue;
			// otherwise print
			x = (float)g_queryRouter.m_diskWaits[hid] / 1000;
			sprintf ( p , "<td>%.1fms</td>", x );
			p += gbstrlen ( p );
		}
		// end row
		sprintf ( p , "</tr>\n");
		p += gbstrlen ( p );
	}
	// end table
	sprintf ( p , "</table>");
	p += gbstrlen ( p );
	*/

	// print the final tail
	//p += g_httpServer.printTail ( p , pend - p );

	// calculate buffer length
	int32_t bufLen = p.length();
	// . send this page
78
Pages.cpp
@ -8,6 +8,7 @@
#include "PageParser.h" // g_inPageParser
#include "Rebalance.h"
#include "Profiler.h"
#include "PageRoot.h"

// a global class extern'd in Pages.h
Pages g_pages;
@ -1518,8 +1519,6 @@ bool sendPageReportSpam ( TcpSocket *s , HttpRequest *r ) {
	return retval;
}

bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr ,
			   bool printGigablast ) ;

// let's use a separate section for each "page"
// then have 3 tables, the input parms,
@ -2200,81 +2199,6 @@ bool printApiForPage ( SafeBuf *sb , int32_t PAGENUM , CollectionRec *cr ) {
	sb->safePrintf("<b>\t},\n</b>\n");

	// gigabits
	sb->brify2 (
		    "\t# The start of the gigabits array. Each gigabit "
		    "is mined from the content of the search results. "
		    "The top "
		    "N results are mined, and you can control N with the "
		    "&dsrt input parameter described above.\n"
		    , cols , "\n\t# " , false );
	sb->safePrintf("<b>\t\"gigabits\":[\n\n</b>");

	// print gigabit #0
	sb->brify2 ( "\t\t# The first gigabit in the array.\n"
		     , cols , "\n\t\t# " , false );
	sb->safePrintf("<b>\t\t{\n\n</b>");

	sb->brify2 ( "\t\t# The gigabit as a string in utf8.\n"
		     , cols , "\n\t\t# " , false );
	sb->safePrintf("<b>\t\t\"term\":\"Membership\",\n\n</b>");

	sb->brify2 ( "\t\t# The numeric score of the gigabit.\n"
		     , cols , "\n\t\t# " , false );
	sb->safePrintf("<b>\t\t\"score\":240,\n\n</b>");

	sb->brify2 ( "\t\t# The popularity ranking of the gigabit. "
		     "Out of 10000 random documents, how many "
		     "documents contain it?\n"
		     , cols , "\n\t\t# " , false );
	sb->safePrintf("<b>\t\t\"minPop\":480,\n\n</b>");

	sb->brify2 ( "\t\t# The gigabit in the context of a "
		     "document.\n"
		     , cols , "\n\t\t# " , false );
	sb->safePrintf("<b>\t\t\"instance\":{\n\n</b>");

	sb->brify2 ( "\t\t\t"
		     "# A sentence, if it exists, "
		     "from one of the search results "
		     "which also contains the gigabit and as many "
		     "significant query terms as possible. In UTF-8.\n"
		     , cols , "\n\t\t\t# " , false );
	sb->brify2("<b>\t\t\t\"sentence\":"
		   "\"Get a free "
		   "<b>Tested</b> Premium Membership here!\","
		   "\n\n</b>"
		   , 80 , "\n\t\t\t " , false );

	sb->brify2 ( "\t\t\t"
		     "# The url that contained that sentence. Always "
		     "starts with http.\n"
		     , cols , "\n\t\t\t# " , false );
	sb->safePrintf("<b>\t\t\t\"url\":"
		       "\"http://www.tested.com/\","
		       "\n\n</b>");

	sb->brify2 ( "\t\t\t"
		     "# The domain of that url.\n"
		     , cols , "\n\t\t\t# " , false );
	sb->safePrintf("<b>\t\t\t\"domain\":"
		       "\"tested.com\""
		       "\n</b>");
	// end instance
	sb->safePrintf("<b>\t\t}\n\n</b>");
	// end gigabit
	sb->safePrintf("\t\t# End of the first gigabit\n"
		       "<b>\t\t},\n\n</b>");

	sb->safePrintf("\t\t...\n\n");

	sb->brify2 (
		    "\t# End of the JSON gigabits array.\n"
		    , cols , "\n\t# " , false );
	sb->safePrintf("<b>\t],\n\n</b>");

	// BEGIN FACETS
	sb->safePrintf( "\t# Start of the facets array, if any.\n");
	sb->safePrintf("<b>\t\"facets\":[\n</b>\n");
463
Parms.cpp
@ -2345,13 +2345,6 @@ bool Parms::setFromRequest ( HttpRequest *r ,
		char *xx=NULL;*xx=0;
	}

	// need this for searchInput which takes default from "cr"
	//CollectionRec *cr = g_collectiondb.getRec ( r , true );

	// no SearchInput.cpp does this and then overrides if xml feed
	// to set m_docsToScanForTopics
	//setToDefault ( THIS , objType , cr );

	// loop through cgi parms
	for ( int32_t i = 0 ; i < r->getNumFields() ; i++ ) {
		// get cgi parm name
@ -4655,59 +4648,6 @@ void Parms::init ( ) {
	m->m_obj = OBJ_COLL;
	m++;

	m->m_title = "demotion for query terms or gigabits in url";
	m->m_desc = "Demotion factor for query terms or gigabits "
		"in a result's url. "
		"Score will be penalized by this factor times the number "
		"of query terms or gigabits in the url divided by "
		"the max value below such that fewer "
		"query terms or gigabits in the url causes the result "
		"to be demoted more heavily, depending on the factor. "
		"Higher factors demote more per query term or gigabit "
		"in the page's url. "
		"Generally, a page may not be demoted more than this "
		"factor as a percent. Also, how it is demoted is "
		"dependent on the max value. For example, "
		"a factor of 0.2 will demote the page 20% if it has no "
		"query terms or gigabits in its url. And if the max value is "
		"10, then a page with 5 query terms or gigabits in its "
		"url will be demoted 10%; and 10 or more query terms or "
		"gigabits in the url will not be demoted at all. "
		"0 means no demotion. "
		"A safe range is from 0 to 0.35. ";
	m->m_cgi = "pqrqttiu";
	m->m_off = (char *)&cr.m_pqr_demFactQTTopicsInUrl - x;
	m->m_type = TYPE_FLOAT;
	m->m_def = "0";
	m->m_group = 0;
	m->m_flags = PF_HIDDEN | PF_NOSAVE;
	m->m_page = PAGE_SEARCH;
	m->m_obj = OBJ_COLL;
	m++;
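The description above amounts to a formula: with factor f and max value M (the next parm), a page whose URL contains n query terms or gigabits is demoted by f * (1 - n/M), floored at zero. Checking it against the worked example: f=0.2, M=10 gives 20% at n=0, 10% at n=5, and 0% at n>=10. As a sketch with hypothetical names:

	// hypothetical restatement of the url-term demotion described above
	static float urlTermDemotion ( int32_t n , float f , int32_t M ) {
		if ( n >= M ) return 0.0f;   // enough terms in the url: no penalty
		return f * ( 1.0f - (float)n / (float)M ); // n=0 -> full factor f
	}
	// usage: score *= 1.0f - urlTermDemotion ( n , f , M );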

	m->m_title = "max value for pages with query terms or gigabits "
		"in url";
	m->m_desc = "Max number of query terms or gigabits in a url. "
		"Pages with a number of query terms or gigabits in their "
		"urls greater than or equal to this value will not be "
		"demoted. "
		"This controls the range of values expected to represent "
		"the number of query terms or gigabits in a url. It should "
		"be set to or near the estimated max number of query terms "
		"or topics that can be in a url. Setting to a lower value "
		"increases the penalty per query term or gigabit that is "
		"not in a url, but decreases the range of values that "
		"will be demoted.";
	m->m_cgi = "pqrqttium";
	m->m_off = (char *)&cr.m_pqr_maxValQTTopicsInUrl - x;
	m->m_type = TYPE_LONG;
	m->m_def = "10";
	m->m_group = 0;
	m->m_flags = PF_HIDDEN | PF_NOSAVE;
	m->m_page = PAGE_SEARCH;
	m->m_obj = OBJ_COLL;
	m++;

	m->m_title = "demotion for pages that are not "
		"root or have many paths in the url";
	m->m_desc = "Demotion factor for each path in the url. "
@ -4775,60 +4715,6 @@ void Parms::init ( ) {
	m->m_obj = OBJ_COLL;
	m++;

	m->m_title = "demotion for non-location specific queries "
		"with a location specific title";
	m->m_desc = "Demotion factor for non-location specific queries "
		"with a location specific title. "
		"Pages which contain a location in their title which is "
		"not in the query or the gigabits will be demoted by their "
		"population multiplied by this factor divided by the max "
		"place population specified below. "
		"Generally, a page will not be demoted more than this "
		"value as a percent. "
		"0 means no demotion. ";
	m->m_cgi = "pqrloct";
	m->m_off = (char *)&cr.m_pqr_demFactLocTitle - x;
	m->m_type = TYPE_FLOAT;
	m->m_def = "0.99";
	m->m_group = 0;
	m->m_flags = PF_HIDDEN | PF_NOSAVE;
	m->m_page = PAGE_SEARCH;
	m->m_obj = OBJ_COLL;
	m++;

	m->m_title = "demotion for non-location specific queries "
		"with a location specific summary";
	m->m_desc = "Demotion factor for non-location specific queries "
		"with a location specific summary. "
		"Pages which contain a location in their summary which is "
		"not in the query or the gigabits will be demoted by their "
		"population multiplied by this factor divided by the max "
		"place population specified below. "
		"Generally, a page will not be demoted more than this "
		"value as a percent. "
		"0 means no demotion. ";
	m->m_cgi = "pqrlocs";
	m->m_off = (char *)&cr.m_pqr_demFactLocSummary - x;
	m->m_type = TYPE_FLOAT;
	m->m_def = "0.95";
	m->m_group = 0;
	m->m_flags = PF_HIDDEN | PF_NOSAVE;
	m->m_page = PAGE_SEARCH;
	m->m_obj = OBJ_COLL;
	m++;
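The two location parms above share one rule: a result whose title (default factor 0.99) or summary (0.95) names a place found in neither the query nor the gigabits is demoted by factor * population / maxPlacePopulation, and generally by no more than the factor itself. As a sketch with hypothetical names:

	// hypothetical restatement of the location demotion described above
	static float locationDemotion ( int32_t pop , float factor , int32_t maxPop ) {
		float d = factor * (float)pop / (float)maxPop;
		if ( d > factor ) d = factor; // never demote more than the factor
		return d;                     // fraction of score to take away
	}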

	m->m_title = "demote locations that appear in gigabits";
	m->m_desc = "Demote locations that appear in gigabits.";
	m->m_cgi = "pqrlocg";
	m->m_off = (char *)&cr.m_pqr_demInTopics - x;
	m->m_type = TYPE_BOOL;
	m->m_def = "1";
	m->m_group = 0;
	m->m_flags = PF_HIDDEN | PF_NOSAVE;
	m->m_page = PAGE_SEARCH;
	m->m_obj = OBJ_COLL;
	m++;

	m->m_title = "max value for non-location specific queries "
		"with location specific results";
	m->m_desc = "Max place population. "
@ -5093,19 +4979,6 @@ void Parms::init ( ) {
	m->m_obj = OBJ_COLL;
	m++;

	m->m_title = "percent topic similar default";
	m->m_desc = "Like above, but used for deciding when to cluster "
		"results by topic for the news collection.";
	m->m_cgi = "ptcd";
	m->m_off = (char *)&cr.m_topicSimilarCutoffDefault - x;
	m->m_type = TYPE_LONG;
	m->m_def = "50";
	m->m_group = 0;
	m->m_flags = PF_HIDDEN | PF_NOSAVE;
	m->m_page = PAGE_SEARCH;
	m->m_obj = OBJ_COLL;
	m++;

	m->m_title = "max query terms";
	m->m_desc = "Do not allow more than this many query terms. Helps "
		"prevent big queries from resource hogging.";
@ -5844,97 +5717,6 @@ void Parms::init ( ) {
	m->m_obj = OBJ_SI;
	m++;

	m->m_title = "results to scan for gigabits generation";
	m->m_desc = "How many search results should we "
		"scan for gigabit (related topics) generation. Set this to "
		"zero to disable gigabits!";
	m->m_cgi = "dsrt";
	m->m_off = (char *)&si.m_docsToScanForTopics - y;
	m->m_type = TYPE_LONG;
	m->m_defOff= (char *)&cr.m_docsToScanForTopics - x;
	m->m_flags = PF_API;
	m->m_page = PAGE_RESULTS;
	m->m_obj = OBJ_SI;
	m++;

	m->m_title = "ip restriction for gigabits";
	m->m_desc = "Should Gigablast only get one document per IP domain "
		"and per domain for gigabits (related topics) generation?";
	m->m_cgi = "ipr";
	m->m_off = (char *)&si.m_ipRestrictForTopics - y;
	m->m_defOff= (char *)&cr.m_ipRestrict - x;
	m->m_type = TYPE_BOOL;
	m->m_group = 0;
	m->m_flags = PF_API;
	m->m_page = PAGE_RESULTS;
	m->m_obj = OBJ_SI;
	m++;

	m->m_title = "number of gigabits to show";
	m->m_desc = "What is the number of gigabits (related topics) "
		"displayed per query? Set to 0 to save a little CPU time.";
	m->m_cgi = "nrt";
	m->m_defOff= (char *)&cr.m_numTopics - x;
	m->m_off = (char *)&si.m_numTopicsToDisplay - y;
	m->m_type = TYPE_LONG;
	m->m_def = "11";
	m->m_group = 0;
	m->m_sprpg = 0; // do not propagate
	m->m_sprpp = 0; // do not propagate
	m->m_flags = PF_API;
	m->m_page = PAGE_RESULTS;
	m->m_obj = OBJ_SI;
	m++;

	m->m_title = "min topics score";
	m->m_desc = "Gigabits (related topics) with scores below this "
		"will be excluded. Scores range from 0% to over 100%.";
	m->m_cgi = "mts";
	m->m_defOff= (char *)&cr.m_minTopicScore - x;
	m->m_off = (char *)&si.m_minTopicScore - y;
	m->m_type = TYPE_LONG;
	m->m_group = 0;
	m->m_flags = PF_API;
	m->m_page = PAGE_RESULTS;
	m->m_obj = OBJ_SI;
	m++;

	m->m_title = "min gigabit doc count by default";
	m->m_desc = "How many documents must contain the gigabit "
		"(related topic) in order for it to be displayed.";
	m->m_cgi = "mdc";
	m->m_defOff= (char *)&cr.m_minDocCount - x;
	m->m_off = (char *)&si.m_minDocCount - y;
	m->m_type = TYPE_LONG;
	m->m_def = "2";
|
||||
m->m_group = 0;
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_RESULTS;
|
||||
m->m_obj = OBJ_SI;
|
||||
m++;
|
||||
|
||||
|
||||
|
||||
m->m_title = "dedup doc percent for gigabits (related topics)";
|
||||
m->m_desc = "If a document is this percent similar to another "
|
||||
"document with a higher score, then it will not contribute "
|
||||
"to the gigabit generation.";
|
||||
m->m_cgi = "dsp";
|
||||
m->m_defOff= (char *)&cr.m_dedupSamplePercent - x;
|
||||
m->m_off = (char *)&si.m_dedupSamplePercent - y;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "80";
|
||||
m->m_group = 0;
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_RESULTS;
|
||||
m->m_obj = OBJ_SI;
|
||||
m++;
|
||||
|
||||
///////////////////////////////////////////
|
||||
// SPIDER PROXY CONTROLS
|
||||
@ -6050,19 +5832,6 @@ void Parms::init ( ) {
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
m->m_title = "max words per gigabit (related topic) by default";
|
||||
m->m_desc = "Maximum number of words a gigabit (related topic) "
|
||||
"can have. Affects xml feeds, too.";
|
||||
m->m_cgi = "mwpt";
|
||||
m->m_defOff= (char *)&cr.m_maxWordsPerTopic - x;
|
||||
m->m_off = (char *)&si.m_maxWordsPerTopic - y;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "6";
|
||||
m->m_group = 0;
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_RESULTS;
|
||||
m->m_obj = OBJ_SI;
|
||||
m++;
|
||||
|
||||
m->m_title = "show images";
|
||||
m->m_desc = "Should we return or show the thumbnail images in the "
|
||||
@ -6364,52 +6133,6 @@ void Parms::init ( ) {
|
||||
m->m_obj = OBJ_SI;
|
||||
m++;
|
||||
|
||||
m->m_title = "return number of docs per topic";
|
||||
m->m_desc = "Use 1 if you want Gigablast to return the number of "
|
||||
"documents in the search results that contained each topic "
|
||||
"(gigabit).";
|
||||
m->m_def = "1";
|
||||
m->m_off = (char *)&si.m_returnDocIdCount - y;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_cgi = "rdc";
|
||||
m->m_page = PAGE_RESULTS;
|
||||
m->m_obj = OBJ_SI;
|
||||
m++;
|
||||
|
||||
m->m_title = "return docids per topic";
|
||||
m->m_desc = "Use 1 if you want Gigablast to return the list of "
|
||||
"docIds from the search results that contained each topic "
|
||||
"(gigabit).";
|
||||
m->m_def = "0";
|
||||
m->m_off = (char *)&si.m_returnDocIds - y;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_cgi = "rd";
|
||||
m->m_page = PAGE_RESULTS;
|
||||
m->m_obj = OBJ_SI;
|
||||
m++;
|
||||
|
||||
m->m_title = "return popularity per topic";
|
||||
m->m_desc = "Use 1 if you want Gigablast to return the popularity "
|
||||
"of each topic (gigabit).";
|
||||
m->m_def = "0";
|
||||
m->m_off = (char *)&si.m_returnPops - y;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_cgi = "rp";
|
||||
m->m_flags = PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_page = PAGE_RESULTS;
|
||||
m->m_obj = OBJ_SI;
|
||||
m++;
|
||||
|
||||
m->m_title = "debug gigabits flag";
|
||||
m->m_desc = "Is 1 to log gigabits debug information, 0 otherwise.";
|
||||
m->m_def = "0";
|
||||
m->m_off = (char *)&si.m_debugGigabits - y;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_cgi = "debuggigabits";
|
||||
m->m_page = PAGE_RESULTS;
|
||||
m->m_obj = OBJ_SI;
|
||||
m++;
|
||||
|
||||
m->m_title = "return docids only";
|
||||
m->m_desc = "Is 1 to return only docids as query results.";
|
||||
m->m_def = "0";
|
||||
@ -9864,147 +9587,6 @@ void Parms::init ( ) {
|
||||
m->m_obj = OBJ_COLL;
|
||||
m++;
|
||||
|
||||
m->m_title = "results to scan for gigabits generation by default";
|
||||
m->m_desc = "How many search results should we "
|
||||
"scan for gigabit (related topics) generation. Set this to "
|
||||
"zero to disable gigabits generation by default.";
|
||||
m->m_cgi = "dsrt";
|
||||
m->m_off = (char *)&cr.m_docsToScanForTopics - x;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_API | PF_CLONE;
|
||||
m->m_page = PAGE_SEARCH;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m++;
|
||||
|
||||
m->m_title = "ip restriction for gigabits by default";
|
||||
m->m_desc = "Should Gigablast only get one document per IP domain "
|
||||
"and per domain for gigabits (related topics) generation?";
|
||||
m->m_cgi = "ipr";
|
||||
m->m_off = (char *)&cr.m_ipRestrict - x;
|
||||
m->m_type = TYPE_BOOL;
|
||||
// default to 0 since newspaperarchive only has docs from same IP dom
|
||||
m->m_def = "0";
|
||||
m->m_group = 0;
|
||||
m->m_flags = PF_API | PF_CLONE;
|
||||
m->m_page = PAGE_SEARCH;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m++;
|
||||
|
||||
|
||||
m->m_title = "remove overlapping topics";
|
||||
m->m_desc = "Should Gigablast remove overlapping topics (gigabits)?";
|
||||
m->m_cgi = "rot";
|
||||
m->m_off = (char *)&cr.m_topicRemoveOverlaps - x;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "1";
|
||||
m->m_group = 0;
|
||||
m->m_flags = PF_API | PF_CLONE;
|
||||
m->m_page = PAGE_SEARCH;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m++;
|
||||
|
||||
m->m_title = "number of gigabits to show by default";
|
||||
m->m_desc = "What is the number of "
|
||||
"related topics (gigabits) "
|
||||
"displayed per query? Set to 0 to save "
|
||||
"CPU time.";
|
||||
m->m_cgi = "nrt";
|
||||
m->m_off = (char *)&cr.m_numTopics - x;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "0";
|
||||
m->m_group = 0;
|
||||
m->m_sprpg = 0; // do not propagate
|
||||
m->m_sprpp = 0; // do not propagate
|
||||
m->m_flags = PF_API | PF_CLONE;
|
||||
m->m_page = PAGE_SEARCH;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m++;
|
||||
|
||||
|
||||
|
||||
m->m_title = "min gigabit score by default";
|
||||
m->m_desc = "Gigabits (related topics) with scores below this "
|
||||
"will be excluded. Scores range from 0% to over 100%.";
|
||||
m->m_cgi = "mts";
|
||||
m->m_off = (char *)&cr.m_minTopicScore - x;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "5";
|
||||
m->m_group = 0;
|
||||
m->m_flags = PF_API | PF_CLONE;
|
||||
m->m_page = PAGE_SEARCH;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m++;
|
||||
|
||||
m->m_title = "min gigabit doc count by default";
|
||||
m->m_desc = "How many documents must contain the gigabit "
|
||||
"(related topic) in order for it to be displayed.";
|
||||
m->m_cgi = "mdc";
|
||||
m->m_off = (char *)&cr.m_minDocCount - x;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "2";
|
||||
m->m_group = 0;
|
||||
m->m_flags = PF_API | PF_CLONE;
|
||||
m->m_page = PAGE_SEARCH;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m++;
|
||||
|
||||
m->m_title = "dedup doc percent for gigabits (related topics)";
|
||||
m->m_desc = "If a document is this percent similar to another "
|
||||
"document with a higher score, then it will not contribute "
|
||||
"to the gigabit generation.";
|
||||
m->m_cgi = "dsp";
|
||||
m->m_off = (char *)&cr.m_dedupSamplePercent - x;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "80";
|
||||
m->m_group = 0;
|
||||
m->m_flags = PF_API | PF_CLONE;
|
||||
m->m_page = PAGE_SEARCH;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m++;
|
||||
|
||||
m->m_title = "max words per gigabit (related topic) by default";
|
||||
m->m_desc = "Maximum number of words a gigabit (related topic) "
|
||||
"can have. Affects xml feeds, too.";
|
||||
m->m_cgi = "mwpt";
|
||||
m->m_off = (char *)&cr.m_maxWordsPerTopic - x;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "6";
|
||||
m->m_group = 0;
|
||||
m->m_flags = PF_API | PF_CLONE;
|
||||
m->m_page = PAGE_SEARCH;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m++;
|
||||
|
||||
|
||||
m->m_title = "gigabit max sample size";
|
||||
m->m_desc = "Max chars to sample from each doc for gigabits "
|
||||
"(related topics).";
|
||||
m->m_cgi = "tmss";
|
||||
m->m_off = (char *)&cr.m_topicSampleSize - x;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "4096";
|
||||
m->m_group = 0;
|
||||
m->m_flags = PF_API | PF_CLONE;
|
||||
m->m_page = PAGE_SEARCH;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m++;
|
||||
|
||||
m->m_title = "gigabit max punct len";
|
||||
m->m_desc = "Max sequential punct chars allowed in a gigabit "
|
||||
"(related topic). "
|
||||
" Set to 1 for speed, 5 or more for best topics but twice as "
|
||||
"slow.";
|
||||
m->m_cgi = "tmpl";
|
||||
m->m_off = (char *)&cr.m_topicMaxPunctLen - x;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "1";
|
||||
m->m_group = 0;
|
||||
m->m_flags = PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_page = PAGE_SEARCH;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m++;
|
||||
|
||||
m->m_title = "display indexed date";
|
||||
m->m_desc = "Display the indexed date along with results.";
|
||||
m->m_cgi = "didt";
|
||||
@ -10331,6 +9913,31 @@ void Parms::init ( ) {
|
||||
m++;
|
||||
|
||||
|
||||
m->m_title = "msg40->39 timeout";
|
||||
m->m_desc = "Timeout for Msg40/Msg3a to collect candidate docids with Msg39. In milliseconds";
|
||||
m->m_cgi = "msgfourty_msgthirtynine_timeout";
|
||||
m->m_off = offsetof(Conf,m_msg40_msg39_timeout);
|
||||
m->m_xml = "msg40_msg39_timeout";
|
||||
m->m_type = TYPE_LONG_LONG;
|
||||
m->m_page = PAGE_SEARCH;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m->m_def = "5000";
|
||||
m->m_flags = 0;
|
||||
m++;
|
||||
|
||||
|
||||
m->m_title = "msg3a->39 network overhead";
|
||||
m->m_desc = "Additional overhead/latecny for msg39 request+response over the network";
|
||||
m->m_cgi = "msgthreea_msgthirtynine_network_overhead";
|
||||
m->m_off = offsetof(Conf,m_msg3a_msg39_network_overhead);
|
||||
m->m_xml = "msg3a_msg39_network_overhead";
|
||||
m->m_type = TYPE_LONG_LONG;
|
||||
m->m_page = PAGE_SEARCH;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m->m_def = "250";
|
||||
m->m_flags = 0;
|
||||
m++;
|
||||
|
||||
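[Editor's note: the two new Conf parms above use the same offset-based registration as every other entry in Parms::init(). A minimal standalone sketch of that mechanism follows; the struct, field, and cgi names are copied from the diff, everything else (the trimmed stand-in types, main()) is illustrative, not the actual Parms machinery.]

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // trimmed stand-ins for the real Conf and Parm classes
    struct Conf { int64_t m_msg40_msg39_timeout; };
    struct Parm { const char *m_cgi; size_t m_off; };

    int main ( ) {
        Conf conf;
        conf.m_msg40_msg39_timeout = 5000; // the "5000" default registered above
        Parm p;
        p.m_cgi = "msgfourty_msgthirtynine_timeout";
        p.m_off = offsetof ( Conf , m_msg40_msg39_timeout );
        // generic access: any parm can be read or written through
        // base pointer + recorded offset, which is why m_off is stored
        // instead of a typed accessor
        int64_t *val = (int64_t *)((char *)&conf + p.m_off);
        printf ( "%s = %lld ms\n" , p.m_cgi , (long long)*val );
        return 0;
    }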
    ///////////////////////////////////////////
    // PAGE SPIDER CONTROLS
    ///////////////////////////////////////////

@@ -12108,16 +11715,6 @@ void Parms::init ( ) {

    m->m_obj = OBJ_CONF;
    m++;

    m->m_title = "log debug topic messages";
    m->m_cgi = "ldto";
    m->m_off = (char *)&g_conf.m_logDebugTopics - g;
    m->m_type = TYPE_BOOL;
    m->m_def = "0";
    m->m_priv = 1;
    m->m_page = PAGE_LOG;
    m->m_obj = OBJ_CONF;
    m++;

    m->m_title = "log debug topDoc messages";
    m->m_cgi = "ldtopd";
    m->m_off = (char *)&g_conf.m_logDebugTopDocs - g;

@@ -12334,16 +11931,6 @@ void Parms::init ( ) {

    m->m_obj = OBJ_CONF;
    m++;

    m->m_title = "log timing messages for related topics";
    m->m_cgi = "ltt";
    m->m_off = (char *)&g_conf.m_logTimingTopics - g;
    m->m_type = TYPE_BOOL;
    m->m_def = "0";
    m->m_priv = 1;
    m->m_page = PAGE_LOG;
    m->m_obj = OBJ_CONF;
    m++;

    m->m_title = "log reminder messages";
    m->m_desc = "Log reminders to the programmer. You do not need this.";
    m->m_cgi = "lr";
140
Phrases.cpp
@@ -5,7 +5,6 @@

Phrases::Phrases ( ) {
    m_buf = NULL;
    m_phraseSpam = NULL;
}

Phrases::~Phrases ( ) {

@@ -18,29 +17,13 @@ void Phrases::reset() {

    }

    m_buf = NULL;
    m_phraseSpam = NULL;
}

// initialize this token array with the string, "s" of length, "len".
bool Phrases::set( Words *words,
                   Bits *bits ,
                   bool useStopWords ,
                   bool useStems ,
                   int32_t titleRecVersion,
                   int32_t niceness) {
bool Phrases::set( Words *words, Bits *bits, int32_t titleRecVersion, int32_t niceness ) {
    // reset in case being re-used
    reset();

    // now we never use stop words and we just index two-word phrases
    // so that a search for "get a" in quotes will match a doc that has
    // the phrase "get a clue". it might impact performance, but it should
    // be insignificant... but we need to have this level of precision.
    // ok -- but what about 'kick a ball'. we might not have that phrase
    // in the results for "kick a" AND "a ball"!! so we really need to
    // index "kick a ball" as well as "kick a" and "a ball". i don't think
    // that will cause too much bloat.
    //useStopWords = false;

    // ensure we have words
    if ( ! words ) return true;

@@ -49,7 +32,7 @@ bool Phrases::set( Words *words,

    m_numPhrases = words->getNumWords();

    // how much mem do we need?
    int32_t need = m_numPhrases * (8+8+1+1+1);
    int32_t need = m_numPhrases * (8+1);

    // alloc if we need to
    if ( need > PHRASE_BUF_SIZE )

@@ -65,26 +48,17 @@ bool Phrases::set( Words *words,

    // phrase not using stop words
    m_phraseIds2 = (int64_t *)p ; p += m_numPhrases * 8;
    m_phraseIds3 = (int64_t *)p ; p += m_numPhrases * 8;
    m_phraseSpam = (unsigned char *)p ; p += m_numPhrases * 1;
    m_numWordsTotal2= (unsigned char *)p ; p += m_numPhrases * 1;
    m_numWordsTotal3= (unsigned char *)p ; p += m_numPhrases * 1;

    // sanity
    if ( p != m_buf + need ) { char *xx=NULL;*xx=0; }

    // clear this
    memset ( m_numWordsTotal2 , 0 , m_numPhrases );
    memset ( m_numWordsTotal3 , 0 , m_numPhrases );

    // point to this info while we parse
    m_words = words;
    m_wptrs = words->getWords();
    m_wlens = words->getWordLens();
    m_wids = words->getWordIds();
    m_bits = bits;
    m_useStopWords = useStopWords;
    m_useStems = useStems;

    // we now are dependent on this
    m_titleRecVersion = titleRecVersion;

@@ -93,7 +67,10 @@ bool Phrases::set( Words *words,

    // . sets m_phraseIds [i]
    // . sets m_phraseSpam[i] to PSKIP if NO phrase exists
    for ( int32_t i = 0 ; i < words->getNumWords() ; i++ ) {
        if ( ! m_wids[i] ) continue;
        if ( ! m_wids[i] ) {
            continue;
        }

        setPhrase ( i , niceness);
    }
    // success

@@ -109,16 +86,15 @@ void Phrases::setPhrase ( int32_t i, int32_t niceness ) {

    // hash of the phrase
    int64_t h = 0LL;

    // the hash of the two-word phrase (now we do 3,4 and 5 word phrases)
    // the hash of the two-word phrase
    int64_t h2 = 0LL;
    int64_t h3 = 0LL;

    // reset
    unsigned char pos = 0;

    // now look for other tokens that should follow the ith token
    int32_t nw = m_words->getNumWords();
    int32_t numWordsInPhrase = 1;
    int32_t nw = m_words->getNumWords();
    int32_t numWordsInPhrase = 1;

    // use the min spam from all words in the phrase as the spam for phrase
    char minSpam = -1;

@@ -142,9 +118,10 @@ void Phrases::setPhrase ( int32_t i, int32_t niceness ) {

    // a phrase but not be in it, then the phrase id ends up just
    // being the following word's id. causing the synonyms code to
    // give a synonym which it should not in Synonyms::set()
    if ( ! m_bits->canBeInPhrase(i) )
    if ( ! m_bits->canBeInPhrase(i) ) {
        // so indeed, skip it then
        goto nophrase;
    }

    h = m_wids[i];

@@ -160,14 +137,21 @@ void Phrases::setPhrase ( int32_t i, int32_t niceness ) {

    // . do not allow more than 32 alnum/punct "words" in a phrase
    // . this prevents phrases with 100,000 words from slowing
    //   us down. would put us in a huge double-nested for loop
    if ( j > i + 32 ) goto nophrase;
    if ( j > i + 32 ) {
        goto nophrase;
    }

    // deal with punct words
    if ( ! m_wids[j] ) {
        // if we cannot pair across word j then break
        if ( ! m_bits->canPairAcross (j) ) break;
        if ( !m_bits->canPairAcross( j ) ) {
            break;
        }

        // does it have a hyphen?
        if (j==i+1 && m_words->hasChar(j,'-')) hasHyphen=true;
        if ( j == i + 1 && m_words->hasChar( j, '-' ) ) {
            hasHyphen = true;
        }

        continue;
    }

@@ -180,51 +164,35 @@ void Phrases::setPhrase ( int32_t i, int32_t niceness ) {

    int32_t conti = pos;

    // hash the jth word into the hash
    h = hash64Lower_utf8_cont(m_wptrs[j],
                              m_wlens[j],
                              h,
                              &conti );
    h = hash64Lower_utf8_cont( m_wptrs[j], m_wlens[j], h, &conti );

    pos = conti;

    numWordsInPhrase++;
    ++numWordsInPhrase;

    // N-word phrases?
    if ( numWordsInPhrase == 2 ) {
        h2 = h;
        m_numWordsTotal2[i] = j-i+1;
        if ( m_bits->isStopWord(j) )
            hasStopWord2 = true;
        continue;
    }
    if ( numWordsInPhrase == 3 ) {
        h3 = h;
        m_numWordsTotal3[i] = j-i+1;
        //continue;
        m_numWordsTotal2[i] = j - i + 1;
        hasStopWord2 = m_bits->isStopWord(j);

        break;
    }
    }

    // if we cannot pair across word j then break
    if ( ! m_bits->canPairAcross (j) ) break;

    // keep chugging?
    if ( numWordsInPhrase >= 5 ) {
        // if we're not using stop words then break
        if ( ! m_useStopWords ) break;
        // if it's not a stop word then break
        if ( ! m_bits->isStopWord (j) ) break;
    if ( ! m_bits->canPairAcross (j) ) {
        break;
    }

    // otherwise, get the next word
    }

    // if we had no phrase then use 0 as id (need 2+ words to be a phrase)
    if ( numWordsInPhrase <= 1 ) {
    nophrase:
        m_phraseSpam[i] = PSKIP;
        m_phraseIds2[i] = 0LL;
        m_phraseIds3[i] = 0LL;
        m_numWordsTotal2[i] = 0;
        m_numWordsTotal3[i] = 0;
        return;
    }

@@ -236,7 +204,6 @@ void Phrases::setPhrase ( int32_t i, int32_t niceness ) {

    // set the phrase spam
    if ( minSpam == -1 ) minSpam = 0;
    m_phraseSpam[i] = minSpam;

    // hyphen between numbers does not count (so 1-2 != 12)
    if ( isNum ) hasHyphen = false;

@@ -247,25 +214,23 @@ void Phrases::setPhrase ( int32_t i, int32_t niceness ) {

    // . "i-phone" -> iphone
    // . "e-mail" -> email
    if ( hasHyphen || ! hasStopWord2 ) {
        //m_phraseIds [i] = h;
        m_phraseIds2[i] = h2;
    }
    // . "st. and" !-> stand
    // . "the rapist" !-> therapist
    else {
        //m_phraseIds [i] = h ^ 0x768867;
        m_phraseIds2[i] = h2 ^ 0x768867;
    }

    // forget hyphen logic for these
    m_phraseIds3[i] = h3;
}

// . store phrase that starts with word #i into "printBuf"
// . return bytes stored in "printBuf"
char *Phrases::getPhrase ( int32_t i , int32_t *phrLen , int32_t npw ) {
    // return 0 if no phrase
    if ( m_phraseSpam[i] == PSKIP ) return NULL;
    if ( m_phraseIds2[i] == 0LL ) {
        return NULL;
    }

    // store the phrase in here
    static char buf[256];
    // . how many words, including punct words, are in phrase?

@@ -273,7 +238,6 @@ char *Phrases::getPhrase ( int32_t i , int32_t *phrLen , int32_t npw ) {

    //int32_t n = m_numWordsTotal[i] ;
    int32_t n ;
    if ( npw == 2 ) n = m_numWordsTotal2[i] ;
    else if ( npw == 3 ) n = m_numWordsTotal3[i] ;
    else { char *xx=NULL; *xx=0; }

    char *s = buf;

@@ -303,42 +267,6 @@ char *Phrases::getPhrase ( int32_t i , int32_t *phrLen , int32_t npw ) {

    return buf;
}

// . word #n is in a phrase if he has [word][punct] or [punct][word]
//   before/after him and you can pair across the punct and include both
//   in a phrase
// . used by SimpleQuery class to see if a word is in a phrase or not
// . if it is then the query may choose not to represent the word by itself
bool Phrases::isInPhrase ( int32_t n ) {
    // returns true if we started a phrase (our phraseSpam is not PSKIP)
    if ( m_phraseSpam[n] != PSKIP ) return true;

    // . see if we were in a phrase started by a word before us
    // . this only works since stop words - whose previous word cannot be
    //   paired across - are able to start phrases
    if ( n < 2 ) return false;
    if ( ! m_bits->canPairAcross(n-1) ) return false;
    if ( ! m_bits->canBeInPhrase(n-2) ) return false;
    return true;
}

int32_t Phrases::getMaxWordsInPhrase ( int32_t i , int64_t *pid ) {
    *pid = 0LL;

    if ( m_numWordsTotal3[i] ) {
        *pid = m_phraseIds3[i];
        return m_numWordsTotal3[i];
    }

    if ( m_numWordsTotal2[i] ) {
        *pid = m_phraseIds2[i];
        return m_numWordsTotal2[i];
    }

    return 0;
}

int32_t Phrases::getMinWordsInPhrase ( int32_t i , int64_t *pid ) {
    *pid = 0LL;

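[Editor's note: a hedged sketch of how callers use the slimmed-down API above. The set() call mirrors the updated call site in Query.cpp later in this commit; the surrounding setup is assumed (words, bits and the index i are presumed already constructed), and the error handling is illustrative.]

    Phrases phrases;
    // four arguments now; the old useStopWords/useStems flags are gone
    if ( ! phrases.set ( &words , &bits , TITLEREC_CURRENT_VERSION , 0 /*niceness*/ ) )
        return false;
    // prefer the trigram id, fall back to the bigram, exactly as
    // getMaxWordsInPhrase() above does internally
    int64_t pid;
    int32_t nw = phrases.getMaxWordsInPhrase ( i , &pid );
    if ( nw == 0 ) {
        // word #i starts no phrase (its phrase spam is PSKIP)
    }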
98
Phrases.h
@@ -8,17 +8,11 @@

#ifndef _PHRASES_H_
#define _PHRASES_H_

//#include "TermTable.h"
#include "Bits.h"
//#include "Spam.h"
//#include "Scores.h"
#include "Words.h"
//#include "Weights.h"

#define PHRASE_BUF_SIZE (MAX_WORDS * 14)

#define PSKIP 201

class Phrases {

public:

@@ -27,82 +21,32 @@ class Phrases {

    ~Phrases();
    void reset() ;

    bool set2 ( Words *words, Bits *bits , int32_t niceness ) {
        return set ( words,bits,true,false,TITLEREC_CURRENT_VERSION,
                     niceness); };
    bool set2( Words *words, Bits *bits, int32_t niceness ) {
        return set( words, bits, TITLEREC_CURRENT_VERSION, niceness );
    }

    // . set the hashes (m_phraseIds) of the phrases for these words
    // . a phraseSpam of PSKIP means word is not in a phrase
    // . "bits" describes the words in a phrasing context
    // . "spam" is % spam of each word (spam may be NULL)
    bool set ( Words *words,
               Bits *bits ,
               //Spam *spam ,
               //Scores *scores ,
               bool useStopWords ,
               bool useStems ,
               int32_t titleRecVersion,
               int32_t niceness);
    bool set( Words *words, Bits *bits, int32_t titleRecVersion, int32_t niceness );

    //int64_t getPhraseId ( int32_t n ) { return m_phraseIds [n]; };
    int64_t getPhraseId2 ( int32_t n ) { return m_phraseIds2[n]; };
    //int64_t *getPhraseIds ( ) { return m_phraseIds ; };
    int64_t *getPhraseIds2( ) { return m_phraseIds2; };
    int64_t *getPhraseIds3( ) { return m_phraseIds3; };
    //int64_t *getPhraseIds4( ) { return m_phraseIds4; };
    //int64_t *getPhraseIds5( ) { return m_phraseIds5; };

    //int64_t *getStripPhraseIds ( ) { return m_stripPhraseIds ; };
    //int64_t getStripPhraseId ( int32_t n )
    //	{ return m_stripPhraseIds [n]; };
    int32_t getPhraseSpam ( int32_t n ) { return m_phraseSpam[n]; };
    bool hasPhraseId ( int32_t n ) { return (m_phraseSpam[n]!=PSKIP);};
    bool startsAPhrase ( int32_t n ) { return (m_phraseSpam[n]!=PSKIP);};
    bool isInPhrase ( int32_t n ) ;
    // . often word #i is involved in 2 phrases
    // . m_phraseIds[i] only holds the one he starts
    // . this gets the one he's in the middle of or on the right of
    // . used by Query.cpp for phrase-forcing
    //int64_t getLeftPhraseId ( int32_t i ) ;
    //int64_t getLeftStripPhraseId ( int32_t i ) ;
    //int32_t getLeftPhraseIndex ( int32_t i ) ;

    // . each non-spammy occurrence of phrase adds "baseScore" to its score
    /*
    bool hash ( TermTable *table ,
                Weights *weightsPtr ,
                uint32_t baseScore ,
                uint32_t maxScore ,
                int64_t startHash ,
                char *prefix1 ,
                int32_t prefixLen1 ,
                char *prefix2 ,
                int32_t prefixLen2 ,
                bool hashUniqueOnly ,
                int32_t titleRecVersion,
                int32_t niceness = 0);
    */
    int64_t *getPhraseIds2( ) { return m_phraseIds2; }

    // . store phrase that starts with word #i into "dest"
    // . we also NULL terminate it in "dest"
    // . return length
    char *getPhrase ( int32_t i , int32_t *phrLen , int32_t npw );
    //char *getNWordPhrase ( int32_t i , int32_t *phrLen , int32_t npw ) ;
    //char *getStripPhrase ( int32_t i , int32_t *phrLen );

    //int32_t getNumWords ( int32_t i ) { return m_numWordsTotal[i]; };
    //int32_t getNumWordsInPhrase ( int32_t i ) { return m_numWordsTotal [i]; };
    int32_t getNumWordsInPhrase2( int32_t i ) { return m_numWordsTotal2[i]; };
    int32_t getNumWordsInPhrase2( int32_t i ) { return m_numWordsTotal2[i]; }

    int32_t getMaxWordsInPhrase( int32_t i , int64_t *pid ) ;
    int32_t getMinWordsInPhrase( int32_t i , int64_t *pid ) ;

    // . leave this public so SimpleQuery.cpp can mess with it
    // . called by Phrases::set() above for each i
    // . we set phraseSpam to 0 to 100% typically
    // . we set phraseSpam to PSKIP if word #i cannot start a phrase
    void setPhrase ( int32_t i ,
                     int32_t niceness);
    void setPhrase( int32_t i, int32_t niceness );

    // private:

@@ -111,26 +55,10 @@ class Phrases {

    char *m_buf;
    int32_t m_bufSize;

    // . these are 1-1 with the words in the Words class
    // . phraseSpam is PSKIP if the phraseId is invalid
    //int64_t *m_phraseIds ;
    // the two word hash
    int64_t *m_phraseIds2 ;
    int64_t *m_phraseIds3 ;
    //int64_t *m_phraseIds4 ;
    //int64_t *m_phraseIds5 ;
    //int64_t *m_stripPhraseIds ;
    unsigned char *m_phraseSpam ;
    // . # words in phrase TOTAL (including punct words)
    // . used for printing
    // . used by SimpleQuery::getTermIds() for setting word ranges
    //   for phrases
    //unsigned char *m_numWordsTotal ;
    // for the two word phrases:
    unsigned char *m_numWordsTotal2 ;
    unsigned char *m_numWordsTotal3 ;
    //unsigned char *m_numWordsTotal4 ;
    //unsigned char *m_numWordsTotal5 ;
    int32_t m_numPhrases; // should equal the # of words

    // placeholders to avoid passing to subroutine

@@ -140,19 +68,7 @@ class Phrases {

    int32_t *m_wlens;

    Bits *m_bits;
    bool m_useStems;
    bool m_useStopWords;
    int32_t m_titleRecVersion;

    // replaces Scores
    //class Sections *m_sections;
    //class Section *m_sectionPtrs;

    // word scores, set in Scores.cpp
    //int32_t *m_wordScores;
    // the score of the phrase is the min of the scores of the words that
    // make up the phrase
    //int32_t *m_phraseScores ;
};

#endif
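[Editor's note: PSKIP (201) in this header is a sentinel, not a spam value: m_phraseSpam[] normally holds 0-100, so 201 can mark "word starts no phrase" without a separate flags array, and hasPhraseId()/startsAPhrase() simply test against it. A tiny illustrative loop (a sketch, not from the source; numWords and phrases are assumed set up):]

    for ( int32_t n = 0 ; n < numWords ; n++ ) {
        if ( ! phrases.hasPhraseId ( n ) ) continue; // spam == PSKIP
        int64_t h2 = phrases.getPhraseId2 ( n );     // valid bigram hash
        // ... index or score the bigram here ...
    }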
2
Pos.h
@@ -4,7 +4,7 @@

#define _POS_H_

#include <stdint.h>
#include <Titledb.h>
#include "Titledb.h"

// this class is used to measure the number of characters between two "words"
// (as defined in the Words.cpp class) in units of "characters". A utf8
99
Posdb.cpp
@@ -3839,95 +3839,6 @@ void PosdbTable::intersectLists10_r ( ) {

    if( g_conf.m_logTracePosdb ) log(LOG_TRACE,"%s:%s:%d: seoHack: %s, numTerms: %"INT32"", __FILE__,__func__, __LINE__, seoHack?"true":"false", m_q->m_numTerms);

    // if we are just a sitehash:xxxxx list and m_getSectionStats is
    // true then assume the list is one of hacked posdb keys where
    // the wordposition bits and others are really a 32-bit site hash
    // and we have to see how many different docids and sites have
    // this term. and we compare to our site hash,
    // m_r->m_sectionSiteHash32 to determine if the posdb key is
    // onsite or offsite. then XmlDoc::printRainbowSections()
    // can print out how many page/sites duplicate your section's content.

    // MDW: TODO: for the facet terms just compile the stats and do not
    // send to intersecting. they are ignored for those purposes. send
    // the hashtable back so msg3a can integrate the stats. keep in mind
    // we have multiple docid ranges sometimes for one query!!!!

    /*

    MDW: take this out. now treat as a normal termlist but
    do not use for scoring. so it is kinda like gbmin: gbmax:
    query operators but it will just add the facet values to
    QueryTerm::m_facetHashList for transmission back to the aggregator
    node. however, it is only for docids in the final result set!

    if ( m_r->m_getFacetStats ) {
        // reset
        m_facetStats.m_totalMatches = 0;
        m_facetStats.m_totalEntries = 0;
        m_dt.clear();
        // scan the posdb keys
        //for ( int32_t i = 0 ; i < m_msg2->getNumListsInGroup(0); i++) {
        // get the sublist
        RdbList *list = m_msg2->getList(0);//Group(0)[i];
        char *p = list->getList ();
        char *pend = p + list->getListSize();
        // test
        //int64_t final = 5663137686803656554LL;
        //final &= TERMID_MASK;
        //if ( p<pend && g_posdb.getTermId(p) == final )
        //	log("boo");
        // scan it
        for ( ; p < pend ; ) {
            // . first key is the full size
            // . uses the w,G,s,v and F bits to hold this
            // . this is no longer necessarily sitehash, but
            //   can be any val, like now FacetStats is using
            //   it for the innerHtml sentence content hash32
            int32_t sh32 = g_posdb.getFacetVal32 ( p );
            //int64_t d = g_posdb.getDocId(p);
            //int32_t rs = list->getRecSize(p);
            // this will not update listptrlo, watch out!
            p += list->getRecSize ( p );
            // does this xpath from another docid have the
            // same inner html as us?
            if ( sh32 == m_r->m_myFacetVal32 ) // m_siteHash32 )
                m_facetStats.m_totalMatches++;
            // always this
            m_facetStats.m_totalEntries++;
            // unique site count
            if ( m_dt.isInTable ( &sh32 ) ) continue;
            // count it
            m_facetStats.m_numUniqueVals++;
            // only once
            m_dt.addKey ( &sh32 );
            // log it
            //log("usite: %08"XINT32" %"INT64" rs=%"INT32"",sh32,d,rs);
            // stop if too much so we do not try to
            // re-alloc in a thread!
            if ( m_dt.m_numSlotsUsed >= 1000000 ) break;
        }
        // and return the list of merging
        int32_t *s = (int32_t *)m_facetHashList.getBufStart();
        int32_t *send = (int32_t *)m_facetHashList.getBufEnd();
        //if ( m_facetStats.m_numUniqueSites == 17 ) {
        //	log("q=%s",m_r->ptr_query);
        //	log("hey");
        //	//char *xx = NULL;*xx=0;
        //}
        //if(!strcmp(m_r->ptr_query,"gbsectionhash:3335323672699668766"
        //	log("boo");
        int32_t *orig = s;
        for ( int32_t i = 0 ; i < m_dt.m_numSlots ; i++ ) {
            if ( ! m_dt.m_flags[i] ) continue;
            *s++ = *(int32_t *)m_dt.getKeyFromSlot(i);
            if ( s >= send ) break;
        }
        m_facetHashList.setLength((char *)s-(char *)orig);
        return;
    }
    */

    //
    // hash the docids in the whitelist termlists into a hashtable.
    // every docid in the search results must be in there. the

@@ -5826,9 +5737,7 @@ void PosdbTable::intersectLists10_r ( ) {

            // . first key is the full size
            // . uses the w,G,s,v and F bits to hold this
            // . this is no longer necessarily sitehash,but
            //   can be any val, like now SectionStats is
            //   using it for the innerHtml sentence
            //   content hash32
            //   can be any val
            int32_t val32 = g_posdb.getFacetVal32 ( p2 );

            // PREADVANCE "p"

@@ -5967,12 +5876,6 @@ void PosdbTable::intersectLists10_r ( ) {

    skipFacetCheck:

    // if only one term like gbfacetstr:gbxpathsitehash123456
    // then do not bother adding to top tree
    if ( m_r->m_forSectionStats ) goto advance;

    // . seoDebug hack so we can set "dcs"
    // . we only come here if we actually made it into m_topTree
    if ( secondPass ) {
115
Posdb.h
@@ -132,15 +132,6 @@ class Posdb {

    bool addColl ( char *coll, bool doVerify = true );

    // . xmldoc.cpp should call this
    // . store all posdb keys from revdbList into one hashtable
    //   and only add to new list if not in there
    //bool makeList ( class RdbList *revdbList ,
    //		  int64_t docId ,
    //		  class Words *words );

    // . make a 16-byte key from all these components
    // . since it is 16 bytes, the big bit will be set
    void makeKey ( void *kp ,

@@ -440,80 +431,8 @@ public:

    int32_t m_quotedStartId;
};

/*
#include "RdbList.h"

class PosdbList : public RdbList {

public:

    // why do i have to repeat this for LinkInfo::set() calling our set()??
    void set ( char *list , int32_t listSize , bool ownData ) {
        RdbList::set ( list ,
                       listSize ,
                       list , // alloc
                       listSize , // alloc size
                       0 , // fixed data size
                       ownData ,
                       true , // use half keys?
                       sizeof(key_t));// 12 bytes per key
    };

    // clear the low bits on the keys so terms are DELETED
    void clearDelBits ( );

    void print();

    // . these are made for special IndexLists, too
    // . getTermId() assumes as 12 byte key
    int64_t getCurrentTermId12 ( ) {
        return getTermId12 ( m_listPtr ); };
    int64_t getTermId12 ( char *rec ) {
        return (*(uint64_t *)(&rec[4])) >> 16 ;
    };
    int64_t getTermId16 ( char *rec ) {
        return (*(uint64_t *)(&rec[8])) >> 16 ;
    };
    // these 2 assume 12 and 6 byte keys respectively
    int64_t getCurrentDocId () {
        if ( isHalfBitOn ( m_listPtr ) ) return getDocId6 (m_listPtr);
        else return getDocId12(m_listPtr);
    };
    int64_t getDocId ( char *rec ) {
        if ( isHalfBitOn ( rec ) ) return getDocId6 (rec);
        else return getDocId12(rec);
    };
    int64_t getCurrentDocId12 ( ) {
        return getDocId12 ( m_listPtr ); };
    int64_t getDocId12 ( char *rec ) {
        return ((*(uint64_t *)(rec)) >> 2) & DOCID_MASK; };
    int64_t getDocId6 ( char *rec ) {
        int64_t docid;
        *(int32_t *)(&docid) = *(int32_t *)rec;
        ((char *)&docid)[4] = rec[4];
        docid >>= 2;
        return docid & DOCID_MASK;
    };
    // this works with either 12 or 6 byte keys
    unsigned char getCurrentScore ( ) {
        return getScore(m_listPtr); };
    unsigned char getScore ( char *rec ) { return ~rec[5]; };

    // uncomplemented...
    void setScore ( char *rec , char score ) { rec[5] = score; };

    // for date lists only...
    int32_t getCurrentDate ( ) { return ~*(int32_t *)(m_listPtr+6); };
};
*/

#include "Query.h" // MAX_QUERY_TERMS, qvec_t

// max # search results that can be viewed without using TopTree
//#define MAX_RESULTS 1000

class PosdbTable {

public:

@@ -525,10 +444,7 @@ class PosdbTable {

              char debug ,
              void *logstate ,
              class TopTree *topTree ,
              //char *coll ,
              collnum_t collnum ,
              //IndexList *lists ,
              //int32_t numLists ,
              class Msg2 *msg2,
              class Msg39Request *r );

@@ -538,12 +454,6 @@ class PosdbTable {

    // pre-allocate memory since intersection runs in a thread
    bool allocTopTree ( );

    // . returns false on error and sets errno
    // . we assume there are "m_numTerms" lists passed in (see set() above)
    //void intersectLists_r ( );

    //void intersectLists9_r ( );

    void getTermPairScoreForNonBody ( int32_t i, int32_t j,
                                      char *wpi, char *wpj,
                                      char *endi, char *endj,

@@ -580,7 +490,9 @@ class PosdbTable {

    void freeMem ( ) ;

    // has init already been called?
    bool isInitialized ( ) { return m_initialized; };
    bool isInitialized() {
        return m_initialized;
    }

    uint64_t m_docId;

@@ -609,56 +521,37 @@ class PosdbTable {

    int32_t m_maxScores;

    //char *m_coll;
    collnum_t m_collnum;

    int32_t *m_qpos;
    int32_t *m_wikiPhraseIds;
    int32_t *m_quotedStartIds;
    //class DocIdScore *m_ds;
    int32_t m_qdist;
    float *m_freqWeights;
    //int64_t *m_freqs;
    char *m_bflags;
    int32_t *m_qtermNums;
    float m_bestWindowScore;
    //char **m_finalWinners1;
    //char **m_finalWinners2;
    //float *m_finalScores;
    char **m_windowTermPtrs;

    // how many docs in the collection?
    int64_t m_docsInColl;

    //SectionStats m_sectionStats;
    //SafeBuf m_facetHashList;
    //HashTableX m_dt;

    class Msg2 *m_msg2;

    // if getting more than MAX_RESULTS results, use this top tree to hold
    // them rather than the m_top*[] arrays above
    class TopTree *m_topTree;

    //HashTableX m_docIdTable;

    SafeBuf m_scoreInfoBuf;
    SafeBuf m_pairScoreBuf;
    SafeBuf m_singleScoreBuf;

    SafeBuf m_stackBuf;

    //SafeBuf m_mergeBuf;

    // a reference to the query
    Query *m_q;
    int32_t m_nqt;

    // these are NOT in imap space, but in query term space, 1-1 with
    // Query::m_qterms[]
    //IndexList *m_lists;
    //int32_t m_numLists;

    // has init() been called?
    bool m_initialized;

@@ -668,8 +561,6 @@ class PosdbTable {

    // for debug msgs
    void *m_logstate;

    //int64_t m_numDocsInColl;

    class Msg39Request *m_r;

    // for gbsortby:item.price ...
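[Editor's note: the commented-out PosdbList block deleted above was the last copy of the old 12-byte key docid decoding. As a sketch of what getDocId12() did: the docid sits above the two low delete/half-key bits of the key's first 8 bytes. The mask value here is an assumption (Gigablast docids are 38-bit); only the shift-and-mask layout comes from the removed code.]

    #include <stdint.h>

    #define DOCID_MASK 0x0000003fffffffffULL // assumption: 38-bit docid mask

    // mirrors the removed PosdbList::getDocId12()
    static inline int64_t getDocId12 ( const char *rec ) {
        // skip the 2 low control bits, then mask down to the docid
        return ( ( *(const uint64_t *)rec ) >> 2 ) & DOCID_MASK;
    }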
PostQueryRerank.cpp

@@ -341,7 +341,7 @@ bool PostQueryRerank::preRerank ( ) {

        return false;

    // . calculate maximum url length in pages for reranking
    //   by query terms or topics in a url
    //   by query terms in a url
    int32_t urlLen = mr->size_ubuf - 1;//msg20->getUrlLen();
    if ( urlLen > m_maxUrlLen )
        m_maxUrlLen = urlLen;

@@ -379,7 +379,7 @@ bool PostQueryRerank::preRerank ( ) {

    }

    // . setup reranking for query terms or topics in url (pqrqttiu)
    // . setup reranking for query terms in url (pqrqttiu)
    // . add space to max url length for terminating NULL and allocate
    //   room for max length
    m_maxUrlLen++;
Process.cpp

@@ -266,7 +266,6 @@ bool Process::init ( ) {

    // . let's try to save tfndb first, that is the most important,
    //   followed by titledb perhaps...
    m_rdbs[m_numRdbs++] = g_titledb.getRdb ();
    m_rdbs[m_numRdbs++] = g_sectiondb.getRdb ();
    m_rdbs[m_numRdbs++] = g_posdb.getRdb ();
    m_rdbs[m_numRdbs++] = g_spiderdb.getRdb ();
    m_rdbs[m_numRdbs++] = g_clusterdb.getRdb ();

@@ -277,7 +276,6 @@ bool Process::init ( ) {

    // save what urls we have been doled
    m_rdbs[m_numRdbs++] = g_doledb.getRdb ();
    m_rdbs[m_numRdbs++] = g_titledb2.getRdb ();
    m_rdbs[m_numRdbs++] = g_sectiondb2.getRdb ();
    m_rdbs[m_numRdbs++] = g_posdb2.getRdb ();
    m_rdbs[m_numRdbs++] = g_spiderdb2.getRdb ();
    m_rdbs[m_numRdbs++] = g_clusterdb2.getRdb ();
Profiler.h

@@ -1,6 +1,4 @@

#define MAX_TOPICS_PER_TERM 28
#define MAX_ALLOWED_TOPICS 100
#define EI_NIDENT 16
#ifndef _PROFILER_H_
#define _PROFILER_H_
740
Query.cpp
@ -2211,7 +2211,7 @@ bool Query::setQWords ( char boolFlag ,
|
||||
else if ( wp[0]=='-' && wplen==1 )
|
||||
posNum += 0;
|
||||
// 'mr. x'
|
||||
else if ( wp[0]=='.' && words.isSpaces2(i,1))
|
||||
else if ( wp[0]=='.' && words.isSpaces(i,1))
|
||||
posNum += 0;
|
||||
// animal (dog)
|
||||
else
|
||||
@ -3242,14 +3242,7 @@ bool Query::setQWords ( char boolFlag ,
|
||||
|
||||
|
||||
// make the phrases from the words and the tweaked Bits class
|
||||
//Phrases phrases;
|
||||
if ( ! phrases.set ( &words ,
|
||||
&bits ,
|
||||
//NULL ,
|
||||
true , // use stop words?
|
||||
false , // use stems?
|
||||
TITLEREC_CURRENT_VERSION,
|
||||
0 /*niceness*/))//disallows HUGE phrases
|
||||
if ( !phrases.set( &words, &bits, TITLEREC_CURRENT_VERSION, 0 ) )
|
||||
return false;
|
||||
|
||||
int64_t *wids = words.getWordIds();
|
||||
@ -3258,17 +3251,7 @@ bool Query::setQWords ( char boolFlag ,
|
||||
for ( int32_t i = 0 ; i < numWords ; i++ ) {
|
||||
// get the ith QueryWord
|
||||
QueryWord *qw = &m_qwords[i];
|
||||
// if word is ignored because it is opcode, or whatever,
|
||||
// it cannot start a phrase
|
||||
// THIS IS BROKEN
|
||||
//if ( qw->m_queryOp && qw->m_opcode == OP_PIPE){
|
||||
// for (int32_t j = i-1;j>=0;j--){
|
||||
// if (!m_qwords[j].m_phraseId) continue;
|
||||
// m_qwords[j].m_ignorePhrase = IGNORE_BOOLOP;
|
||||
// break;
|
||||
// }
|
||||
//
|
||||
//}
|
||||
|
||||
if ( qw->m_ignoreWord ) continue;
|
||||
if ( qw->m_fieldCode && qw->m_quoteStart < 0) continue;
|
||||
// get the first word # to our left that starts a phrase
|
||||
@ -3280,8 +3263,7 @@ bool Query::setQWords ( char boolFlag ,
|
||||
if ( ! bits.canPairAcross(j+1) ) break;
|
||||
//if ( ! bits.canStartPhrase(j) ) continue;
|
||||
if ( ! wids[j] ) continue;
|
||||
// phrases.getNumWordsInPhrase()
|
||||
//if( j + phrases.getMaxWordsInPhrase(j,&tmp)<i) break;
|
||||
|
||||
qw->m_leftPhraseStart = j;
|
||||
// we can't pair across alnum words now, we just want bigrams
|
||||
if ( wids[j] ) break;
|
||||
@ -3335,8 +3317,7 @@ bool Query::setQWords ( char boolFlag ,
|
||||
else qw->m_phraseId = pid;
|
||||
// how many regular words int32_t is the bigram?
|
||||
int32_t plen2; phrases.getPhrase ( i , &plen2 ,2);
|
||||
// the trigram?
|
||||
int32_t plen3; phrases.getPhrase ( i , &plen3 ,3);
|
||||
|
||||
// get just the bigram for now
|
||||
qw->m_phraseLen = plen2;
|
||||
// do not ignore the phrase, it's valid
|
||||
@ -3736,22 +3717,6 @@ static bool s_isInitialized = false;
|
||||
|
||||
// 3rd field = m_hasColon
|
||||
struct QueryField g_fields[] = {
|
||||
|
||||
/*
|
||||
BR 20160117: No longer hashed
|
||||
{"gbfieldmatch",
|
||||
FIELD_GBFIELDMATCH,
|
||||
true,
|
||||
"gbfieldmatch:strings.vendor:\"My Vendor Inc.\"",
|
||||
"Matches all the meta tag or JSON or XML fields that have "
|
||||
"the name \"strings.vendor\" and contain the exactly provided "
|
||||
"value, in this case, <i>My Vendor Inc.</i>. This is CASE "
|
||||
"SENSITIVE and includes punctuation, so it's exact match. In "
|
||||
"general, it should be a very short termlist, so it should be fast.",
|
||||
"Advanced Query Operators",
|
||||
QTF_BEGINNEWTABLE },
|
||||
*/
|
||||
|
||||
{"url",
|
||||
FIELD_URL,
|
||||
true,
|
||||
@ -3779,10 +3744,6 @@ struct QueryField g_fields[] = {
|
||||
NULL,
|
||||
0 },
|
||||
|
||||
//{"links", FIELD_LINKS, true,"Same as link:."},
|
||||
//{"ilink", FIELD_ILINK, true,"Similar to above."},
|
||||
|
||||
|
||||
{"sitelink",
|
||||
FIELD_SITELINK,
|
||||
true,
|
||||
@ -3809,8 +3770,6 @@ struct QueryField g_fields[] = {
|
||||
NULL,
|
||||
QTF_DUP },
|
||||
|
||||
|
||||
//{"coll", FIELD_COLL, true,"Not sure if this works."},
|
||||
{"ip",
|
||||
FIELD_IP,
|
||||
true,
|
||||
@ -3877,22 +3836,6 @@ struct QueryField g_fields[] = {
|
||||
NULL,
|
||||
0},
|
||||
|
||||
|
||||
//{"isclean", FIELD_ISCLEAN, true,"Matches all pages that are deemed non-offensive and safe for children."},
|
||||
|
||||
|
||||
/*
|
||||
BR 20160108: No longer stored in our posdb as we don't plan to use it
|
||||
{"gbinrss",
|
||||
FIELD_GBRSS,
|
||||
true,
|
||||
"gbinrss:1",
|
||||
"Matches all documents that are in RSS feeds. Likewise, use "
|
||||
"<i>gbinrss:0</i> to match all documents that are NOT in RSS feeds.",
|
||||
NULL,
|
||||
0},
|
||||
*/
|
||||
|
||||
{"type",
|
||||
FIELD_TYPE,
|
||||
false,
|
||||
@ -3925,44 +3868,6 @@ struct QueryField g_fields[] = {
|
||||
NULL,
|
||||
0},
|
||||
|
||||
/*
|
||||
BR 20160117: No longer hash image info
|
||||
{"gbimage",
|
||||
FIELD_URL,
|
||||
false,
|
||||
"gbimage:site.com/image.jpg",
|
||||
"Matches all documents that contain the specified image.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbhasthumbnail",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbhasthumbnail:1",
|
||||
"Matches all documents for which Gigablast detected a thumbnail. "
|
||||
"Likewise use <i>gbhasthumbnail:0</i> to match all documents that "
|
||||
"do not have thumbnails.",
|
||||
NULL,
|
||||
0},
|
||||
*/
|
||||
|
||||
/*
|
||||
BR 20160117: No longer hash tags
|
||||
{"gbtag*",
|
||||
FIELD_TAG,
|
||||
false,
|
||||
"gbtag*",
|
||||
"Matches all documents whose tag named * have the specified value "
|
||||
"in the tagdb entry for the url. Example: gbtagsitenuminlinks:2 "
|
||||
"matches all documents that have 2 qualified "
|
||||
"inlinks pointing to their site "
|
||||
"based on the tagdb record. You can also provide your own "
|
||||
"tags in addition to the tags already present. See the <i>tagdb</i> "
|
||||
"menu for more information.",
|
||||
NULL,
|
||||
0},
|
||||
*/
|
||||
|
||||
{"gbzipcode",
|
||||
FIELD_ZIP,
|
||||
false,
|
||||
@ -3972,25 +3877,6 @@ struct QueryField g_fields[] = {
|
||||
NULL,
|
||||
0},
|
||||
|
||||
/*
|
||||
BR 20160108: No longer stored in our posdb as we don't plan to use it
|
||||
|
||||
{"gbcharset",
|
||||
FIELD_CHARSET,
|
||||
false,
|
||||
"gbcharset:windows-1252",
|
||||
"Matches all documents originally in the Windows-1252 charset. "
|
||||
"Available character sets are listed in the <i>iana_charset.cpp</i> "
|
||||
"file in the open source distribution. There are a lot. Some "
|
||||
"more popular ones are: <i>us, latin1, iso-8859-1, csascii, ascii, "
|
||||
"latin2, latin3, latin4, greek, utf-8, shift_jis.",
|
||||
NULL,
|
||||
0},
|
||||
*/
|
||||
|
||||
// this just complicates things for now, so comment out
|
||||
//{"urlhash",FIELD_URLHASH, false,""},
|
||||
|
||||
{"gblang",
|
||||
FIELD_GBLANG,
|
||||
false,
|
||||
@ -4005,91 +3891,6 @@ struct QueryField g_fields[] = {
|
||||
NULL,
|
||||
0},
|
||||
|
||||
//{"gbquality",FIELD_GBQUALITY,true,""},
|
||||
//{"gblinktextin",FIELD_LINKTEXTIN,true,""},
|
||||
//{"gblinktextout",FIELD_LINKTEXTOUT,true,""},
|
||||
//{"gbkeyword",FIELD_KEYWORD,true,""},
|
||||
//{"gbcharset", FIELD_CHARSET, false,""},
|
||||
|
||||
/*
|
||||
// BR 20160106: No longer stored in our posdb as we don't use it
|
||||
{"gbpathdepth",
|
||||
FIELD_GBOTHER,
|
||||
false,
|
||||
"gbpathdepth:3",
|
||||
"Matches all documents whose url has 3 path components to it like "
|
||||
"http://somedomain.com/dir1/dir2/dir3/foo.html",
|
||||
NULL,
|
||||
0},
|
||||
*/
|
||||
|
||||
/*
|
||||
// BR 20160108: No longer stored in our posdb as we don't use it
|
||||
{"gbhopcount",
|
||||
FIELD_GBOTHER,
|
||||
false,
|
||||
"gbhopcount:2",
|
||||
"Matches all documents that are a minimum of two link hops away "
|
||||
"from a root url.",
|
||||
NULL,
|
||||
0},
|
||||
*/
|
||||
|
||||
/*
|
||||
// BR 20160108: No longer stored in our posdb as we don't use it
|
||||
{"gbhasfilename",
|
||||
FIELD_GBOTHER,
|
||||
false,
|
||||
"gbhasfilename:1",
|
||||
"Matches all documents whose url ends in a filename like "
|
||||
"<i>http://somedomain.com/dir1/myfile</i> and not "
|
||||
"<i>http://somedomain.com/dir1/dir2/</i>. Likewise, use "
|
||||
"<i>gbhasfilename:0</i> to match all the documents that do not "
|
||||
"have a filename in their url.",
|
||||
NULL,
|
||||
0},
|
||||
*/
|
||||
|
||||
/*
|
||||
BR 20160108: No longer stored in our posdb as we don't plan to use it
|
||||
|
||||
{"gbiscgi",
|
||||
FIELD_GBOTHER,
|
||||
false,
|
||||
"gbiscgi:1",
|
||||
"Matches all documents that have a question mark in their url. "
|
||||
"Likewise gbiscgi:0 matches all documents that do not.",
|
||||
NULL,
|
||||
0},
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
BR 20160108: No longer stored in our posdb as we don't use it
|
||||
|
||||
{"gbhasext",
|
||||
FIELD_GBOTHER,
|
||||
false,
|
||||
"gbhasext:1",
|
||||
"Matches all documents that have a file extension in their url. "
|
||||
"Likewise, <i>gbhasext:0</i> matches all documents that do not have "
|
||||
"a file extension in their url.",
|
||||
NULL,
|
||||
0},
|
||||
*/
|
||||
|
||||
/*
|
||||
BR 20160106 removed
|
||||
{"gbsubmiturl",
|
||||
FIELD_GBOTHER,
|
||||
false,
|
||||
"gbsubmiturl:domain.com/process.php",
|
||||
"Matches all documents that have a form that submits to the "
|
||||
"specified url.",
|
||||
NULL,
|
||||
0},
|
||||
*/
|
||||
|
||||
// diffbot only
|
||||
{"gbparenturl",
|
||||
FIELD_GBPARENTURL,
|
||||
@ -4131,92 +3932,10 @@ struct QueryField g_fields[] = {
|
||||
NULL,
|
||||
0},
|
||||
|
||||
|
||||
|
||||
//
|
||||
// for content type CT_STATUS documents (Spider status docs)
|
||||
//
|
||||
|
||||
|
||||
|
||||
//{"qdom", FIELD_QUOTA, false,""},
|
||||
//{"qhost", FIELD_QUOTA, false,""},
|
||||
|
||||
/*
|
||||
// BR 20160117: No longer supported
|
||||
{"gbsortbyfloat",
|
||||
FIELD_GBSORTBYFLOAT,
|
||||
false,
|
||||
"cameras gbsortbyfloat:price",
|
||||
"Sort all documents that "
|
||||
"contain 'camera' by price. <i>price</i> can be a root JSON field or "
|
||||
"in a meta tag, or in an xml <price> tag.",
|
||||
"Numeric Field Query Operators",
|
||||
QTF_BEGINNEWTABLE },
|
||||
|
||||
|
||||
{"gbsortbyfloat",
|
||||
FIELD_GBSORTBYFLOAT,
|
||||
false,
|
||||
"cameras gbsortbyfloat:product.price",
|
||||
"Sort all documents that "
|
||||
"contain 'camera' by price. <i>price</i> can be in a JSON document "
|
||||
"like "
|
||||
"<i>{ \"product\":{\"price\":1500.00}} "
|
||||
"</i> or, alternatively, an XML document like <i>"
|
||||
"<product><price>1500.00</price></product>"
|
||||
"</i>",
|
||||
NULL,
|
||||
0 },
|
||||
|
||||
|
||||
{"gbrevsortbyfloat",
|
||||
FIELD_GBREVSORTBYFLOAT,
|
||||
false,
|
||||
"cameras gbrevsortbyfloat:product.price",
|
||||
"Like above example but sorted with highest prices on top.",
|
||||
NULL,
|
||||
0 },
|
||||
|
||||
|
||||
{"gbsortby",
|
||||
FIELD_GBSORTBYFLOAT,
|
||||
false,
|
||||
"dog gbsortbyint:gbdocspiderdate",
|
||||
"Sort the documents that contain 'dog' by "
|
||||
"the date they were last spidered, with the newest "
|
||||
"on top.",
|
||||
NULL,
|
||||
QTF_HIDE},
|
||||
|
||||
{"gbrevsortby",
|
||||
FIELD_GBREVSORTBYFLOAT,
|
||||
false,
|
||||
"dog gbrevsortbyint:gbdocspiderdate",
|
||||
"Sort the documents that contain 'dog' by "
|
||||
"the date they were last spidered, but with the "
|
||||
"oldest on top.",
|
||||
NULL,
|
||||
QTF_HIDE},
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
// BR 20160117: No longer supported
|
||||
|
||||
{"gbsortbyint",
|
||||
FIELD_GBSORTBYINT,
|
||||
false,
|
||||
"pilots gbsortbyint:employees",
|
||||
"Sort all documents that "
|
||||
"contain 'pilots' by employees. "
|
||||
"<i>employees</i> can be a root JSON field or "
|
||||
"in a meta tag, or in an xml <price> tag. The value it "
|
||||
"contains is interpreted as a 32-bit integer.",
|
||||
NULL,
|
||||
0 },
|
||||
*/
|
||||
|
||||
{"gbsortbyint",
|
||||
FIELD_GBSORTBYINT,
|
||||
false,
|
||||
@ -4225,33 +3944,6 @@ struct QueryField g_fields[] = {
|
||||
NULL,
|
||||
0},
|
||||
|
||||
/*
|
||||
// BR 20160117: No longer supported
|
||||
|
||||
{"gbsortbyint",
|
||||
FIELD_GBSORTBYINT,
|
||||
false,
|
||||
"gbsortbyint:company.employees",
|
||||
"Sort all documents by employees. Documents can contain "
|
||||
"<i>employees</i> in a JSON document "
|
||||
"like "
|
||||
"<i>{ \"product\":{\"price\":1500.00}} "
|
||||
"</i> or, alternatively, an XML document like <i>"
|
||||
"<product><price>1500.00</price></product>"
|
||||
"</i>",
|
||||
NULL,
|
||||
0 },
|
||||
|
||||
{"gbsortbyint",
|
||||
FIELD_GBSORTBYINT,
|
||||
false,
|
||||
"gbsortbyint:gbsitenuminlinks",
|
||||
"Sort all documents by the number of distinct inlinks the "
|
||||
"document's site has.",
|
||||
NULL,
|
||||
0 },
|
||||
*/
|
||||
|
||||
{"gbrevsortbyint",
|
||||
FIELD_GBREVSORTBYINT,
|
||||
false,
|
||||
@ -4261,114 +3953,6 @@ struct QueryField g_fields[] = {
|
||||
NULL,
|
||||
0},
|
||||
|
||||
|
||||
/*
|
||||
// BR 20160117: No longer supported
|
||||
|
||||
// gbmin:price:1.23
|
||||
{"gbminfloat",
|
||||
FIELD_GBNUMBERMIN,
|
||||
false,
|
||||
"cameras gbminfloat:price:109.99",
|
||||
"Matches all documents that "
|
||||
"contain 'camera' or 'cameras' and have a price of at least 109.99. "
|
||||
"<i>price</i> can be a root JSON field or "
|
||||
"in a meta tag name <i>price</i>, or in an xml <price> tag.",
|
||||
NULL,
|
||||
0 },
|
||||
|
||||
|
||||
{"gbminfloat",
|
||||
FIELD_GBNUMBERMIN,
|
||||
false,
|
||||
"cameras gbminfloat:product.price:109.99",
|
||||
"Matches all documents that "
|
||||
"contain 'camera' or 'cameras' and have a price of at least 109.99 "
|
||||
"in a JSON document like "
|
||||
"<i>{ \"product\":{\"price\":1500.00}} "
|
||||
"</i> or, alternatively, an XML document like <i>"
|
||||
"<product><price>1500.00</price></product>"
|
||||
"</i>",
|
||||
NULL,
|
||||
0 },
|
||||

// alias we need to bury
{"gbmin",
FIELD_GBNUMBERMIN,
false,
"",
"",
NULL,
QTF_HIDE},

{"gbmaxfloat",
FIELD_GBNUMBERMAX,
false,
"cameras gbmaxfloat:price:109.99",
"Like the gbminfloat examples above, but is an upper bound.",
NULL,
0 },

{"gbequalfloat",
FIELD_GBNUMBEREQUALFLOAT,
false,
"gbequalfloat:product.price:1.23",
"Similar to gbminfloat and gbmaxfloat but is an equality constraint.",
NULL,
0 },

{"gbmax",
FIELD_GBNUMBERMAX,
false,
"",
"",
NULL,
QTF_HIDE},

{"gbminint",
FIELD_GBNUMBERMININT,
false,
"gbminint:gbspiderdate:1391749680",
"Matches all documents with a spider timestamp of at least "
"1391749680. Use this as opposed to gbminfloat when you need "
"32 bits of integer precision.",
NULL,
0},

{"gbmaxint",
FIELD_GBNUMBERMAXINT,
false,
"gbmaxint:company.employees:20",
"Matches all companies with 20 or fewer employees "
"in a JSON document like "
"<i>{ \"company\":{\"employees\":13}} "
"</i> or, alternatively, an XML document like <i>"
"<company><employees>13</employees>"
"</company>"
"</i>",
NULL,
0},

{"gbequalint",
FIELD_GBNUMBEREQUALINT,
false,
"gbequalint:company.employees:13",
"Similar to gbminint and gbmaxint but is an equality constraint.",
NULL,
0},
*/
{"gbdocspiderdate",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
@ -4413,114 +3997,6 @@ struct QueryField g_fields[] = {
|
||||
NULL,
|
||||
0},
|
||||
|
||||
// {"gbreplyspiderdate",FIELD_GENERIC,false,
|
||||
// "Example: gbspiderdate:1400081479 will return spider log "
|
||||
// "results that have "
|
||||
// "that spider date timestamp (UTC)"},
|
||||
|
||||
/* BR 20160108: All facets disabled as test. Don't think we will need any of them */
|
||||
#ifdef SUPPORT_FACETS
{"gbfacetstr",
FIELD_GBFACETSTR,
false,
"gbfacetstr:color",
"Returns facets in "
"the search results "
"by their color field. <i>color</i> is case INsensitive.",
"Facet Related Query Operators",
QTF_BEGINNEWTABLE},

{"gbfacetstr",
FIELD_GBFACETSTR,
false,
"gbfacetstr:product.color",
"Returns facets in "
"the color field in a JSON document like "
"<i>{ \"product\":{\"color\":\"red\"}} "
"</i> or, alternatively, an XML document like <i>"
"<product><color>red</color></product>"
"</i>. <i>product.color</i> is case INsensitive.",
NULL,
0},

{"gbfacetstr",
FIELD_GBFACETSTR,
false,
"gbfacetstr:gbtagsite cat",
"Returns facets from the site names of all pages "
"that contain the word 'cat' or 'cats', etc. <i>gbtagsite</i> is case insensitive.",
NULL,
0},

{"gbfacetint", FIELD_GBFACETINT, false,
"gbfacetint:product.cores",
"Returns facets "
"of the <i>cores</i> field in a JSON document like "
"<i>{ \"product\":{\"cores\":10}} "
"</i> or, alternatively, an XML document like <i>"
"<product><cores>10</cores></product>"
"</i>. <i>product.cores</i> is case INsensitive.",
NULL,
0},

{"gbfacetint", FIELD_GBFACETINT, false,
"gbfacetint:gbhopcount",
"Returns facets "
"of the <i>gbhopcount</i> field over the documents so you can "
"see the distribution of hopcounts over the index. <i>gbhopcount</i> is "
"case INsensitive.",
NULL,
0},

{"gbfacetint", FIELD_GBFACETINT, false,
"gbfacetint:gbtagsitenuminlinks",
"Returns facets "
"of the <i>sitenuminlinks</i> value in the tagdb tag "
"for each site. Any numeric tag in tagdb can be "
"facetized "
"in this manner, so you can add your own facets this way on a per "
"site or per url basis by making tagdb entries. Case INsensitive.",
NULL,
0},

{"gbfacetint", FIELD_GBFACETINT, false,
"gbfacetint:size,0-10,10-20,30-100,100-200,200-1000,1000-10000",
"Returns facets "
"of the <i>size</i> field (in JSON, a field, or a meta tag) "
"and clusters the results into the specified ranges. <i>size</i> is "
"case INsensitive.",
NULL,
0},

{"gbfacetint", FIELD_GBFACETINT, false,
"gbfacetint:gbsitenuminlinks",
"Returns facets based on the # of site inlinks the site of each "
"result has. <i>gbsitenuminlinks</i> is case INsensitive.",
NULL,
0},

{"gbfacetfloat", FIELD_GBFACETFLOAT, false,
"gbfacetfloat:product.weight",
"Returns facets "
"of the <i>weight</i> field in a JSON document like "
"<i>{ \"product\":{\"weight\":1.45}} "
"</i> or, alternatively, an XML document like <i>"
"<product><weight>1.45</weight></product>"
"</i>. <i>product.weight</i> is case INsensitive.",
NULL,
0},

{"gbfacetfloat", FIELD_GBFACETFLOAT, false,
"gbfacetfloat:product.price,0-1.5,1.5-5,5.0-20,20-100.0",
"Similar to above but clusters the prices into the specified ranges. "
"<i>product.price</i> is case insensitive.",
NULL,
0},
#endif
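// ----------------------------------------------------------------
// Illustrative aside (not part of this commit): the range form
// documented above, e.g. "gbfacetint:size,0-10,10-20,...", buckets
// facet values into explicit ranges. A toy, standalone parser for
// that comma-separated "lo-hi" syntax -- integer-only and NOT the
// engine's real parser -- just to make the syntax concrete:
// ----------------------------------------------------------------
#include <cstdio>
#include <cstdlib>
#include <cstring>

int main() {
    char spec[] = "0-10,10-20,30-100,100-200,200-1000,1000-10000";
    for (char *tok = strtok(spec, ","); tok; tok = strtok(NULL, ",")) {
        char *dash = strchr(tok, '-');   // split each token on its dash
        if (!dash) continue;             // skip a malformed range
        *dash = '\0';
        long lo = strtol(tok, NULL, 10);
        long hi = strtol(dash + 1, NULL, 10);
        printf("bucket [%ld,%ld)\n", lo, hi);
    }
    return 0;
}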

//
// spider status docs queries
//
@@ -4610,17 +4086,6 @@ struct QueryField g_fields[] = {
NULL,
0},

#ifdef SUPPORT_FACETS
{"gbssNumRedirects",
FIELD_GENERIC,
false,
"gbfacetint:gbssNumRedirects",
"Query on the number of times the url redirected when attempting to "
"index it.",
NULL,
0},
#endif

{"gbssDocId",
FIELD_GENERIC,
false,
@@ -4629,26 +4094,6 @@ struct QueryField g_fields[] = {
NULL,
0},

#ifdef SUPPORT_FACETS
{"gbssHopCount",
FIELD_GENERIC,
false,
"gbfacetint:gbssHopCount",
"Query on the hop count of the document.",
NULL,
0},

{"gbssCrawlRound",
FIELD_GENERIC,
false,
"gbfacetint:gbssCrawlRound",
"Query on the crawl round number.",
NULL,
0},
#endif

{"gbssDupOfDocId",
FIELD_GENERIC,
false,
@@ -4689,17 +4134,6 @@ struct QueryField g_fields[] = {
NULL,
0},

#ifdef SUPPORT_FACETS
{"gbssContentHash32",
FIELD_GENERIC,
false,
"gbfacetint:gbssContentHash32",
"The hash of the document content, excluding dates and times. Used "
"internally for deduping.",
NULL,
0},
#endif

{"gbssDownloadDurationMS",
FIELD_GENERIC,
false,
@@ -4724,25 +4158,6 @@ struct QueryField g_fields[] = {
NULL,
0},

#ifdef SUPPORT_FACETS
{"gbssUsedRobotsTxt",
FIELD_GENERIC,
false,
"gbfacetint:gbssUsedRobotsTxt",
"This is 0 if robots.txt was not obeyed and 1 if it was.",
NULL,
0},

{"gbssConsecutiveErrors",
FIELD_GENERIC,
false,
"gbfacetint:gbssConsecutiveErrors",
"For the last set of indexing attempts, how many were errors?",
NULL,
0},
#endif

{"gbssIp",
FIELD_GENERIC,
false,
@@ -4778,65 +4193,6 @@ struct QueryField g_fields[] = {
NULL,
0},

#ifdef SUPPORT_FACETS
{"gbssContentInjected",
FIELD_GENERIC,
false,
"gbfacetint:gbssContentInjected",
"This is 0 if the content was not injected and 1 if it was.",
NULL,
0},

{"gbssPercentContentChanged",
FIELD_GENERIC,
false,
"gbfacetfloat:gbssPercentContentChanged",
"A float between 0 and 100, inclusive. Represents how much "
"the document has changed since the last time we indexed it. This is "
"only valid if the document was successfully indexed this time.",
NULL,
0},

{"gbssSpiderPriority",
FIELD_GENERIC,
false,
"gbfacetint:gbssSpiderPriority",
"The spider priority, from 0 to 127, inclusive, of the document "
"according to the url filters table.",
NULL,
0},

{"gbssMatchingUrlFilter",
FIELD_GENERIC,
false,
"gbfacetstr:gbssMatchingUrlFilter",
"The url filter expression the document matched.",
NULL,
0},

{"gbssLanguage",
FIELD_GENERIC,
false,
"gbfacetstr:gbssLanguage",
"The language of the document. If the document was empty or not "
"downloaded then this will not be present. Uses xx to mean "
"unknown language. Uses the language abbreviations found at the "
"bottom of the url filters page.",
NULL,
0},

{"gbssContentType",
FIELD_GENERIC,
false,
"gbfacetstr:gbssContentType",
"The content type of the document, like html, xml, json, pdf, etc. "
"This field is not present if unknown.",
NULL,
0},
#endif

{"gbssContentLen",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
@ -4845,93 +4201,8 @@ struct QueryField g_fields[] = {
|
||||
NULL,
|
||||
0},
|
||||
|
||||
#ifdef SUPPORT_FACETS
|
||||
{"gbssCrawlDelayMS",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssCrawlDelay",
|
||||
"The crawl delay according to the robots.txt of the document. "
|
||||
"This is -1 if not specified in the robots.txt or not found.",
|
||||
NULL,
|
||||
0},
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
{"gbssSentToDiffbotThisTime",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssSentToDiffbotThisTime:1",
|
||||
"Was the document's url sent to diffbot for processing this time "
|
||||
"of spidering the url?",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssSentToDiffbotAtSomeTime",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssSentToDiffbotAtSomeTime:1",
|
||||
"Was the document's url sent to diffbot for processing, either this "
|
||||
"time or some time before?",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDiffbotReplyCode",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssDiffbotReplyCode:0",
|
||||
"The reply received from diffbot. 0 means success, otherwise, it "
|
||||
"indicates an error code.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDiffbotReplyMsg",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetstr:gbssDiffbotReplyMsg:0",
|
||||
"The reply received from diffbot represented in text.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDiffbotReplyLen",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbsortbyint:gbssDiffbotReplyLen",
|
||||
"The length of the reply received from diffbot.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDiffbotReplyResponseTimeMS",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbsortbyint:gbssDiffbotReplyResponseTimeMS",
|
||||
"The time in milliseconds it took to get a reply from diffbot.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDiffbotReplyRetries",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssDiffbotReplyRetries",
|
||||
"The number of times we had to resend the request to diffbot "
|
||||
"because diffbot returned a 504 gateway timed out error.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDiffbotReplyNumObjects",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssDiffbotReplyNumObjects",
|
||||
"The number of JSON objects diffbot excavated from the provided url.",
|
||||
NULL,
|
||||
0},
|
||||
*/
|
||||
|
||||
|
||||
|
||||
// they don't need to know about this
|
||||
{"gbad",FIELD_GBAD,false,"","",NULL,QTF_HIDE},
|
||||
//BR 20160117 removed: {"gbtagvector", FIELD_GBTAGVECTOR, false,"","",NULL,QTF_HIDE},
|
||||
{"gbsamplevector", FIELD_GBSAMPLEVECTOR, false,"","",NULL,QTF_HIDE},
|
||||
{"gbcontenthash", FIELD_GBCONTENTHASH, false,"","",NULL,QTF_HIDE},
|
||||
{"gbduphash" ,FIELD_GBOTHER,false,"","",NULL,QTF_HIDE},
|
||||
@ -5606,7 +4877,6 @@ bool QueryTerm::isSplit() {
|
||||
if(!m_fieldCode) return true;
|
||||
if(m_fieldCode == FIELD_QUOTA) return false;
|
||||
//BR 20160117 removed: if(m_fieldCode == FIELD_GBTAGVECTOR) return false;
|
||||
//BR 20160106 removed: if(m_fieldCode == FIELD_GBGIGABITVECTOR) return false;
|
||||
if(m_fieldCode == FIELD_GBSAMPLEVECTOR) return false;
|
||||
if(m_fieldCode == FIELD_GBSECTIONHASH) return false;
|
||||
if(m_fieldCode == FIELD_GBCONTENTHASH) return false;
|
||||
|
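// ----------------------------------------------------------------
// Illustrative aside (not part of this commit): every g_fields[]
// initializer above supplies the same seven values. A hypothetical
// reconstruction of the row shape -- the real struct QueryField is
// declared elsewhere in the source, so the member names below are
// guesses for illustration only:
// ----------------------------------------------------------------
#include <cstdint>

struct QueryFieldSketch {
    const char *text;     // operator as typed in a query, e.g. "gbminint"
    char        field;    // FIELD_* code it maps to
    bool        flag1;    // third member; its meaning is not visible here
    const char *example;  // sample query shown on the help page
    const char *desc;     // help text; inline HTML such as <i> is allowed
    const char *title;    // help-table heading, or NULL to continue a table
    int32_t     qtfFlags; // QTF_HIDE, QTF_BEGINNEWTABLE, or 0
};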
Query.h: 25 lines changed

@@ -569,15 +569,9 @@ class QueryTerm {
char m_endKey [MAX_KEY_BYTES];
char m_ks;

// used by Msg40.cpp for gigabits generation
int64_t m_hash64d;
int32_t m_popWeight;

uint64_t m_numDocsThatHaveFacet;
};

//#define MAX_OPSLOTS 256

#define MAX_EXPRESSIONS 100

// operand1 AND operand2 OR ...
@@ -646,26 +640,14 @@ class Query {
int32_t serialize(char *buf, int32_t bufLen);
int32_t deserialize(char *buf, int32_t bufLen);

// . if a term is truncated in indexdb, change its '+' sign to a '*'
// . will recompute m_bitScores to fix bit #7
//void softenTruncatedTerms ( );

bool setQueryTermScores ( int64_t *termFreqsArg ) ;

// about how many hits for this query?
//int64_t getEstimatedTotalHits ( );

char *getQuery ( ) { return m_orig ; };
int32_t getQueryLen ( ) { return m_origLen; };

//int32_t getNumIgnored ( ) { return m_numIgnored; };
//int32_t getNumNotIgnored ( ) { return m_numTerms ; };

int32_t getNumTerms ( ) { return m_numTerms; };
char getTermSign ( int32_t i ) { return m_qterms[i].m_termSign; };
bool isPhrase ( int32_t i ) { return m_qterms[i].m_isPhrase; };
bool isInPhrase ( int32_t i ) { return m_qterms[i].m_inPhrase; };
bool isInQuotes ( int32_t i ) { return m_qterms[i].m_inQuotes; };
int64_t getTermId ( int32_t i ) { return m_qterms[i].m_termId; };
char getFieldCode2( int32_t i ) { return m_qterms[i].m_fieldCode; };
int64_t getRawTermId ( int32_t i ) { return m_qterms[i].m_rawTermId; };
@@ -687,13 +669,6 @@ class Query {

bool isSplit(int32_t i) { return m_qterms[i].isSplit(); };

// . Msg39 calls this to get our vector so it can pass it to Msg37
// . the signs and ids are dupped in the QueryTerm classes, too
//int64_t *getTermFreqs ( ) { return m_termFreqs ; };
//int64_t getTermFreq ( int32_t i ) { return m_termFreqs[i]; };
//int64_t *getTermIds ( ) { return m_termIds ; };
//char *getTermSigns ( ) { return m_termSigns ; };
//int32_t *getComponentCodes ( ) { return m_componentCodes; };
int64_t getRawWordId ( int32_t i ) { return m_qwords[i].m_rawWordId;};

int32_t getNumComponentTerms ( ) { return m_numComponents; };
Rdb.cpp: 85 lines changed

@@ -16,7 +16,6 @@
#include "Spider.h"
#include "SpiderColl.h"
#include "Doledb.h"
#include "Revdb.h"
#include "hash.h"

void attemptMergeAll ( int fd , void *state ) ;
@@ -168,10 +167,6 @@ bool Rdb::init ( char *dir ,
if ( m_rdbId == RDB2_INDEXDB2 ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
if ( m_rdbId == RDB_POSDB ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
if ( m_rdbId == RDB2_POSDB2 ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
//if ( m_rdbId == RDB_DATEDB ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
//if ( m_rdbId == RDB2_DATEDB2 ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
if ( m_rdbId == RDB_SECTIONDB ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
if ( m_rdbId == RDB2_SECTIONDB2) m_pageSize = GB_INDEXDB_PAGE_SIZE;
if ( m_rdbId == RDB_TITLEDB ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
if ( m_rdbId == RDB2_TITLEDB2 ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
if ( m_rdbId == RDB_SPIDERDB ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
@@ -180,30 +175,7 @@ bool Rdb::init ( char *dir ,
if ( m_rdbId == RDB_SERPDB ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
if ( m_rdbId == RDB_LINKDB ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
if ( m_rdbId == RDB2_LINKDB2 ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
if ( m_rdbId == RDB_REVDB ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
if ( m_rdbId == RDB2_REVDB2 ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
// let's obsolete this rec/list cache because using the
// disk page cache cleverly is usually better than this:
// this cache ignores newly added data (it is not realtime),
// and it really only saves us from having to intersect a
// bunch of indexdb/datedb lists.
/*
loadCacheFromDisk = false;
maxCacheMem = 0;
maxCacheNodes = 0;
// . set up our cache
// . we could be adding lists so keep fixedDataSize -1 for cache
if ( ! m_cache.init ( maxCacheMem ,
fixedDataSize ,
true , // support lists
maxCacheNodes ,
m_useHalfKeys ,
m_dbname ,
loadCacheFromDisk ,
m_ks , // cache key size
m_ks ) ) // data key size
return false;
*/

// we can't merge more than MAX_RDB_FILES files at a time
if ( minToMerge > MAX_RDB_FILES ) minToMerge = MAX_RDB_FILES;
m_minToMerge = minToMerge;
@@ -1736,17 +1708,14 @@ bool Rdb::addList ( collnum_t collnum , RdbList *list,
//! g_conf.m_rebuildNoSplits &&
//! g_conf.m_removeBadPages &&
( m_rdbId == RDB_TITLEDB ||
//m_rdbId == RDB_SECTIONDB ||
m_rdbId == RDB_PLACEDB ||
m_rdbId == RDB_TFNDB ||
m_rdbId == RDB_INDEXDB ||
m_rdbId == RDB_POSDB ||
//m_rdbId == RDB_DATEDB ||
m_rdbId == RDB_POSDB ||
m_rdbId == RDB_CLUSTERDB ||
m_rdbId == RDB_LINKDB ||
m_rdbId == RDB_DOLEDB ||
m_rdbId == RDB_SPIDERDB ||
m_rdbId == RDB_REVDB ) ) {
m_rdbId == RDB_SPIDERDB ) ) {

// exception, spider status docs can be deleted from titledb
// if user turns off 'index spider replies' before doing
@@ -1765,20 +1734,6 @@ bool Rdb::addList ( collnum_t collnum , RdbList *list,

exception:

/*
if ( g_repair.isRepairActive() &&
g_repair.m_fullRebuild &&
collnum != g_repair.m_newCollnum &&
m_rdbId != RDB_TAGDB ) {
log("db: How did an add come in while in full repair mode?"
" addCollnum=%"INT32" repairCollnum=%"INT32" db=%s",
(int32_t)collnum , (int32_t)g_repair.m_newCollnum ,
m_dbname );
g_errno = EREPAIRING;
return false;
}
*/

// if we are currently in a quickpoll, make sure we are not in
// RdbTree::getList(), because we could mess that loop up by adding
// or deleting a record into/from the tree now
@@ -2811,23 +2766,19 @@ Rdb *getRdbFromId ( uint8_t rdbId ) {
s_table9 [ RDB_INDEXDB ] = g_indexdb.getRdb();
s_table9 [ RDB_POSDB ] = g_posdb.getRdb();
s_table9 [ RDB_TITLEDB ] = g_titledb.getRdb();
s_table9 [ RDB_SECTIONDB ] = g_sectiondb.getRdb();
s_table9 [ RDB_SPIDERDB ] = g_spiderdb.getRdb();
s_table9 [ RDB_DOLEDB ] = g_doledb.getRdb();
s_table9 [ RDB_CLUSTERDB ] = g_clusterdb.getRdb();
s_table9 [ RDB_LINKDB ] = g_linkdb.getRdb();
s_table9 [ RDB_STATSDB ] = g_statsdb.getRdb();
s_table9 [ RDB_REVDB ] = g_revdb.getRdb();
s_table9 [ RDB_PARMDB ] = NULL;

s_table9 [ RDB2_INDEXDB2 ] = g_indexdb2.getRdb();
s_table9 [ RDB2_POSDB2 ] = g_posdb2.getRdb();
s_table9 [ RDB2_TITLEDB2 ] = g_titledb2.getRdb();
s_table9 [ RDB2_SECTIONDB2 ] = g_sectiondb2.getRdb();
s_table9 [ RDB2_SPIDERDB2 ] = g_spiderdb2.getRdb();
s_table9 [ RDB2_CLUSTERDB2 ] = g_clusterdb2.getRdb();
s_table9 [ RDB2_LINKDB2 ] = g_linkdb2.getRdb();
s_table9 [ RDB2_REVDB2 ] = g_revdb2.getRdb();
s_table9 [ RDB2_TAGDB2 ] = g_tagdb2.getRdb();
}
if ( rdbId >= RDB_END ) return NULL;
@@ -2840,22 +2791,18 @@ char getIdFromRdb ( Rdb *rdb ) {
if ( rdb == g_indexdb.getRdb () ) return RDB_INDEXDB;
if ( rdb == g_posdb.getRdb () ) return RDB_POSDB;
if ( rdb == g_titledb.getRdb () ) return RDB_TITLEDB;
if ( rdb == g_sectiondb.getRdb () ) return RDB_SECTIONDB;
if ( rdb == g_spiderdb.getRdb () ) return RDB_SPIDERDB;
if ( rdb == g_doledb.getRdb () ) return RDB_DOLEDB;
if ( rdb == g_clusterdb.getRdb () ) return RDB_CLUSTERDB;
if ( rdb == g_statsdb.getRdb () ) return RDB_STATSDB;
if ( rdb == g_linkdb.getRdb () ) return RDB_LINKDB;
if ( rdb == g_revdb.getRdb () ) return RDB_REVDB;
if ( rdb == g_indexdb2.getRdb () ) return RDB2_INDEXDB2;
if ( rdb == g_posdb2.getRdb () ) return RDB2_POSDB2;
if ( rdb == g_tagdb2.getRdb () ) return RDB2_TAGDB2;
if ( rdb == g_titledb2.getRdb () ) return RDB2_TITLEDB2;
if ( rdb == g_sectiondb2.getRdb () ) return RDB2_SECTIONDB2;
if ( rdb == g_spiderdb2.getRdb () ) return RDB2_SPIDERDB2;
if ( rdb == g_clusterdb2.getRdb () ) return RDB2_CLUSTERDB2;
if ( rdb == g_linkdb2.getRdb () ) return RDB2_LINKDB2;
if ( rdb == g_revdb2.getRdb () ) return RDB2_REVDB2;

log(LOG_LOGIC,"db: getIdFromRdb: no rdbId for %s.",rdb->m_dbname);
return 0;
@@ -2868,12 +2815,10 @@ char isSecondaryRdb ( uint8_t rdbId ) {
case RDB2_POSDB2 : return true;
case RDB2_TAGDB2 : return true;
case RDB2_TITLEDB2 : return true;
case RDB2_SECTIONDB2 : return true;
case RDB2_PLACEDB2 : return true;
case RDB2_SPIDERDB2 : return true;
case RDB2_TFNDB2 : return true;
case RDB2_CLUSTERDB2 : return true;
case RDB2_REVDB2 : return true;
case RDB2_LINKDB2 : return true;
}
return false;
@@ -2898,13 +2843,9 @@ char getKeySizeFromRdbId ( uint8_t rdbId ) {
i == RDB_SPIDERDB ||
i == RDB_TAGDB ||
i == RDB_SYNCDB ||
i == RDB_SECTIONDB ||
i == RDB_PLACEDB ||

//i == RDB2_DATEDB2 ||
i == RDB2_SPIDERDB2 ||
i == RDB2_TAGDB2 ||
i == RDB2_SECTIONDB2 ||
i == RDB2_PLACEDB2 )
ks = 16;
if ( i == RDB_POSDB || i == RDB2_POSDB2 )
@@ -2942,11 +2883,9 @@ int32_t getDataSizeFromRdbId ( uint8_t rdbId ) {
i == RDB_TFNDB ||
i == RDB_CLUSTERDB ||
i == RDB_DATEDB ||
//i == RDB_FAKEDB ||
i == RDB_LINKDB )
ds = 0;
else if ( i == RDB_TITLEDB ||
i == RDB_REVDB ||
i == RDB_SYNCDB ||
i == RDB_CACHEDB ||
i == RDB_SERPDB ||
@@ -2960,8 +2899,6 @@ int32_t getDataSizeFromRdbId ( uint8_t rdbId ) {
ds = -1;
else if ( i == RDB_STATSDB )
ds = sizeof(StatData);
else if ( i == RDB_SECTIONDB )
ds = sizeof(SectionVote);
else if ( i == RDB2_POSDB2 ||
i == RDB2_INDEXDB2 ||
i == RDB2_TFNDB2 ||
@@ -2970,23 +2907,17 @@ int32_t getDataSizeFromRdbId ( uint8_t rdbId ) {
i == RDB2_DATEDB2 )
ds = 0;
else if ( i == RDB2_TITLEDB2 ||
i == RDB2_REVDB2 ||
i == RDB2_TAGDB2 ||
i == RDB2_CATDB2 ||
i == RDB2_SPIDERDB2 ||
i == RDB2_PLACEDB2 )
ds = -1;
else if ( i == RDB2_SECTIONDB2 )
ds = sizeof(SectionVote);
else { char *xx=NULL;*xx=0; }
// get the rdb for this rdbId
//Rdb *rdb = getRdbFromId ( i );
// sanity check
//if ( ! rdb ) continue;//{ char *xx=NULL;*xx=0; }
// sanity!
//if ( rdb->m_ks == 0 ) { char *xx=NULL;*xx=0; }
else {
continue;
}

// set the table
s_table2[i] = ds;//rdb->m_fixedDataSize;
s_table2[i] = ds;
}
}
return s_table2[rdbId];
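// ----------------------------------------------------------------
// Illustrative aside (not part of this commit): getKeySizeFromRdbId()
// and getDataSizeFromRdbId() above share one pattern -- a static
// table indexed by rdbId, filled once on first use and consulted
// cheaply thereafter. A standalone sketch of that pattern with
// made-up ids and sizes (not the real RDB_* values):
// ----------------------------------------------------------------
#include <cstdint>
#include <cstring>

enum { MY_DB_A = 1, MY_DB_B = 2, MY_DB_END = 3 };

int32_t getDataSizeFromId(uint8_t id) {
    static int32_t s_table[MY_DB_END];
    static bool s_init = false;
    if (!s_init) {                 // fill the whole table exactly once
        s_init = true;
        memset(s_table, 0, sizeof(s_table));
        s_table[MY_DB_A] = 0;      // 0  = records carry no data payload
        s_table[MY_DB_B] = -1;     // -1 = variable-size records
    }
    if (id >= MY_DB_END) return 0; // out of range, like rdbId >= RDB_END
    return s_table[id];            // O(1) lookup from here on
}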
Repair.cpp: 738 lines changed
(file diff suppressed because it is too large)

Repair.h: 27 lines changed

@@ -80,40 +80,27 @@ public:
Msg5 m_msg5b;
Msg4 m_msg4;
bool m_needsCallback;
//Msg50 m_msg50;
char m_docQuality;
//Msg14 m_msg14;
//RdbList m_scanList;
RdbList m_titleRecList;
int64_t m_docId;
char m_isDelete;
RdbList m_ulist;
RdbList m_addlist;
//int32_t m_ruleset;
//LinkTextReply m_rootLinkText;
int64_t m_totalMem;
int32_t m_stage ;
int32_t m_tfn;
int32_t m_count;
bool m_updated;
//key_t m_currentTitleRecKey; // for tfndb

// titledb scan vars
//key_t m_nextRevdbKey;
key_t m_nextTitledbKey;
key_t m_nextSpiderdbKey;
//key_t m_nextIndexdbKey;
key_t m_nextPosdbKey;
//key_t m_nextDatedbKey;
key128_t m_nextLinkdbKey;
//key128_t m_nextPlacedbKey;
key_t m_endKey;
int64_t m_uh48;
//TitleRec m_tr;
//Msg8a m_msg8a;
int32_t m_priority;
uint64_t m_contentHash;
//key_t m_tfndbKey;
key_t m_clusterdbKey ;
key_t m_spiderdbKey;
char m_srBuf[SR_BUFSIZE];
@@ -127,8 +114,6 @@ public:

// spiderdb scan vars
bool m_isNew;
//SpiderRec m_sr;
//SiteRec m_siteRec;
TagRec m_tagRec;

@@ -139,8 +124,6 @@ public:
int64_t m_prevDocId;
bool m_completedFirstScan ;
bool m_completedSpiderdbScan ;
//bool m_completedIndexdbScan ;
//key_t m_lastRevdbKey;
key_t m_lastTitledbKey;
key_t m_lastSpiderdbKey;

@@ -158,7 +141,6 @@ public:
int64_t m_recsRoot;
int64_t m_recsNonRoot;
int64_t m_recsInjected;
//int32_t m_fn;

// spiderdb scan stats
int32_t m_spiderRecsScanned ;
@@ -168,21 +150,13 @@ public:

// generic scan parms
char m_rebuildTitledb ;
//char m_rebuildIndexdb ;
char m_rebuildPosdb ;
//char m_rebuildNoSplits ;
//char m_rebuildDatedb ;
//char m_rebuildTfndb ;
char m_rebuildClusterdb ;
char m_rebuildSpiderdb ;
char m_rebuildSitedb ;
char m_rebuildLinkdb ;
char m_rebuildTagdb ;
//char m_rebuildPlacedb ;
//char m_rebuildSectiondb ;
//char m_rebuildRevdb ;
char m_fullRebuild ;
//char m_removeBadPages ;

char m_rebuildRoots ;
char m_rebuildNonRoots ;
@@ -208,7 +182,6 @@ public:
char m_SAVE_END;

// i'd like to save these but they are ptrs
//char *m_coll;
CollectionRec *m_cr;

// for timing a repair process
Revdb.cpp: 169 lines changed (file removed)

@@ -1,169 +0,0 @@
#include "gb-include.h"

#include "Revdb.h"
#include "Threads.h"

Revdb g_revdb;
Revdb g_revdb2;

// reset rdb
void Revdb::reset() { m_rdb.reset(); }

// init our rdb
bool Revdb::init ( ) {

int64_t maxTreeMem = 200000000;
// . what's max # of tree nodes?
// . assuming an avg RevRec size (compressed html doc) of about 1k, we get:
// . NOTE: overhead is about 32 bytes per node
int32_t maxTreeNodes = maxTreeMem / (1*1024);

// each entry in the cache is usually just a single record, no lists
int32_t maxCacheNodes = 0;//g_conf.m_revdbMaxCacheMem / (10*1024);

// initialize our own internal rdb
if ( ! m_rdb.init ( g_hostdb.m_dir ,
"revdb" ,
true , // dedup same keys?
-1 , // fixed record size
// this should not really be changed...
2 , // min files to merge
maxTreeMem,//g_conf.m_revdbMaxTreeMem ,
maxTreeNodes ,
// now we balance so Sync.cpp can order huge lists
true , // balance tree?
0 , // cache mem
maxCacheNodes ,
false ,// half keys?
false ,// g_conf.m_revdbSav
NULL , // page cache ptr
false ) )// is titledb?
return false;
return true;
}

// init the rebuild/secondary rdb, used by PageRepair.cpp
bool Revdb::init2 ( int32_t treeMem ) {
// . what's max # of tree nodes?
// . assuming an avg RevRec size (compressed html doc) of about 1k, we get:
// . NOTE: overhead is about 32 bytes per node
int32_t maxTreeNodes = treeMem / (1*1024);
// initialize our own internal rdb
if ( ! m_rdb.init ( g_hostdb.m_dir ,
"revdbRebuild" ,
true , // dedup same keys?
-1 , // fixed record size
240 , // MinFilesToMerge
treeMem ,
maxTreeNodes ,
// now we balance so Sync.cpp can order huge lists
true , // balance tree?
0 , // MaxCacheMem ,
0 , // maxCacheNodes
false , // half keys?
false , // revdbSaveCache
NULL , // page cache ptr
false ) )// is titledb?
return false;
return true;
}
/*
bool Revdb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;
// verify
if ( verify(coll) ) return true;
// if not allowing scale, return false
if ( ! g_conf.m_allowScale ) return false;
// otherwise let it go
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}
*/
bool Revdb::verify ( char *coll ) {
log ( LOG_INFO, "db: Verifying Revdb for coll %s...", coll );
g_threads.disableThreads();

Msg5 msg5;
Msg5 msg5b;
RdbList list;
key_t startKey;
key_t endKey;
startKey.setMin();
endKey.setMax();
//int32_t minRecSizes = 64000;
CollectionRec *cr = g_collectiondb.getRec(coll);

if ( ! msg5.getList ( RDB_REVDB ,
cr->m_collnum ,
&list ,
startKey ,
endKey ,
1024*1024 , // minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
NULL , // cache key ptr
0 , // retry num
-1 , // maxRetries
true , // compensate for merge
-1LL , // sync point
&msg5b ,
false )) {
g_threads.enableThreads();
return log("db: HEY! it did not block");
}

int32_t count = 0;
int32_t got = 0;
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
key_t k = list.getCurrentKey();
count++;
//uint32_t groupId = getGroupId ( RDB_REVDB , &k );
//if ( groupId == g_hostdb.m_groupId ) got++;
uint32_t shardNum = getShardNum( RDB_REVDB , &k );
if ( shardNum == getMyShardNum() ) got++;
}
if ( got != count ) {
log ("db: Out of first %"INT32" records in revdb, "
"only %"INT32" belong to our group.",count,got);
// exit if NONE, we probably got the wrong data
if ( count > 10 && got == 0 )
log("db: Are you sure you have the right "
"data in the right directory? "
"Exiting.");
log ( "db: Exiting due to Revdb inconsistency." );
g_threads.enableThreads();
return g_conf.m_bypassValidation;
}

log ( LOG_INFO, "db: Revdb passed verification successfully for %"INT32""
" recs.", count );
// DONE
g_threads.enableThreads();
return true;
}

// . make the key of a RevRec from a docId
// . remember to set the low bit so it's not a delete
// . hi bits are set in the key
key_t Revdb::makeKey ( int64_t docId, bool isDel ){
key_t key ;
key.n1 = 0;
// shift up for delbit
key.n0 = ((uint64_t)docId) << 1;
// final del bit
if ( ! isDel ) key.n0 |= 0x01;
return key;
};

int64_t Revdb::getDocId ( key_t *k ) {
return (k->n0 >> 1);
}
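// ----------------------------------------------------------------
// Illustrative aside (not part of this commit): the deleted
// Revdb::makeKey() above packs the docId into the low quadword of
// the key, one bit up, with bit 0 SET meaning "not a delete". A
// self-contained round-trip check of that packing, using a bare
// uint64_t in place of key_t's n0 half:
// ----------------------------------------------------------------
#include <cassert>
#include <cstdint>
#include <cstdio>

uint64_t makeN0(int64_t docId, bool isDel) {
    uint64_t n0 = ((uint64_t)docId) << 1; // shift up to leave room for bit 0
    if (!isDel) n0 |= 0x01;               // low bit set == positive key
    return n0;
}

int64_t docIdFromN0(uint64_t n0) { return (int64_t)(n0 >> 1); }

int main() {
    int64_t docId = 137438953471LL;       // docIds fit in 38 bits
    assert(docIdFromN0(makeN0(docId, false)) == docId);
    assert(docIdFromN0(makeN0(docId, true))  == docId); // del bit is masked off
    printf("round-trip ok\n");
    return 0;
}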
Revdb.h: 52 lines changed (file removed)

@@ -1,52 +0,0 @@
// Matt Wells, copyright Jun 2001

// . db of metalists used to delete a doc now

#ifndef _REVDB_H_
#define _REVDB_H_

#include "Rdb.h"
#include "Url.h"
#include "Conf.h"
#include "Xml.h"
#include "Titledb.h"

// new key format:
// . <docId> - 38 bits
// . <delBit> - 1 bit

// data format:
// . a metalist that is passed in to Msg4

class Revdb {

public:

// reset rdb
void reset();

bool verify ( char *coll );

bool addColl ( char *coll, bool doVerify = true );

// init m_rdb
bool init ();

// init secondary/rebuild revdb
bool init2 ( int32_t treeMem ) ;

// like titledb basically
key_t makeKey ( int64_t docId , bool del ) ;

int64_t getDocId ( key_t *k );

Rdb *getRdb() { return &m_rdb; };

// holds binary format rev entries
Rdb m_rdb;
};

extern class Revdb g_revdb;
extern class Revdb g_revdb2;

#endif
SafeBuf.cpp: 34 lines changed

@@ -7,7 +7,7 @@
#include "Words.h"
#include "Sections.h"

SafeBuf::SafeBuf(int32_t initSize, char *label ) {
SafeBuf::SafeBuf(int32_t initSize, const char *label ) {
if(initSize <= 0) initSize = 1;
m_capacity = initSize;
m_length = 0;
@@ -36,11 +36,11 @@ SafeBuf::SafeBuf() {
m_label = NULL;
}

void SafeBuf::setLabel ( char *label ) {
void SafeBuf::setLabel ( const char *label ) {
m_label = label;
}

SafeBuf::SafeBuf(char* stackBuf, int32_t cap, char* label) {
SafeBuf::SafeBuf(char* stackBuf, int32_t cap, const char* label) {
m_usingStack = true;
m_capacity = cap;
m_buf = stackBuf;
@@ -133,7 +133,7 @@ bool SafeBuf::safeMemcpy(const char *s, int32_t len) {
return true;
}

bool SafeBuf::safeMemcpy_nospaces(char *s, int32_t len) {
bool SafeBuf::safeMemcpy_nospaces(const char *s, int32_t len) {
// put a silent \0 at the end
int32_t tmp = len + m_length+1;
if(tmp >= m_capacity ) {
@@ -158,7 +158,7 @@ bool SafeBuf::safeMemcpy ( Words *w , int32_t a , int32_t b ) {
return safeMemcpy ( p , pend - p );
}

char* SafeBuf::pushStr (char* str, uint32_t len) {
char* SafeBuf::pushStr (const char* str, uint32_t len) {
int32_t initLen = m_length;
bool status = safeMemcpy ( str , len );
status &= nullTerm();
@@ -273,7 +273,7 @@ bool SafeBuf::cat(SafeBuf& c) {
return safeMemcpy(c.getBufStart(), c.length());
}

bool SafeBuf::reserve(int32_t i , char *label, bool clearIt ) {
bool SafeBuf::reserve(int32_t i , const char *label, bool clearIt ) {

// if we don't already have a label and they provided one, use it
if ( ! m_label ) {
@@ -333,7 +333,7 @@ bool SafeBuf::reserve(int32_t i , char *label, bool clearIt ) {

//reserve this many bytes, if we need to alloc, we double the
//buffer size.
bool SafeBuf::reserve2x(int32_t i, char *label) {
bool SafeBuf::reserve2x(int32_t i, const char *label) {
//watch out for overflow!
if((m_capacity << 1) + i < m_capacity) return false;
if(i + m_length >= m_capacity)
@@ -433,7 +433,7 @@ int32_t SafeBuf::safeSave (char *filename ) {
}

int32_t SafeBuf::fillFromFile(char *dir,char *filename,char *label) {
int32_t SafeBuf::fillFromFile(const char *dir, const char *filename, const char *label) {
m_label = label;
char buf[1024];
if ( dir ) snprintf(buf,1024,"%s/%s",dir,filename);
@@ -451,7 +451,7 @@ char *SafeBuf::getNextLine ( char *p ) {
}

// returns -1 on error
int32_t SafeBuf::catFile(char *filename) {
int32_t SafeBuf::catFile(const char *filename) {
SafeBuf sb2;
if ( sb2.fillFromFile(filename) < 0 ) return -1;
// add 1 for a null
@@ -462,7 +462,7 @@ int32_t SafeBuf::catFile(char *filename) {

// returns -1 on error
int32_t SafeBuf::fillFromFile(char *filename) {
int32_t SafeBuf::fillFromFile(const char *filename) {
struct stat results;
if (stat(filename, &results) != 0) {
// An error occurred
@@ -1135,7 +1135,7 @@ bool SafeBuf::addTag ( Tag *tag ) {
}

// this puts a \0 at the end but does not update m_length for the \0
bool SafeBuf::safeStrcpy ( char *s ) {
bool SafeBuf::safeStrcpy ( const char *s ) {
if ( ! s ) return true;
int32_t slen = gbstrlen(s);
if ( ! reserve ( slen+1 ) ) return false;
@@ -1565,7 +1565,7 @@ void SafeBuf::replaceChar ( char src , char dst ) {

// encode a double quote char to two double quote chars
bool SafeBuf::csvEncode ( char *s , int32_t len , int32_t niceness ) {
bool SafeBuf::csvEncode ( const char *s , int32_t len , int32_t niceness ) {

if ( ! s ) return true;

@@ -1578,7 +1578,7 @@ bool SafeBuf::csvEncode ( char *s , int32_t len , int32_t niceness ) {
//char *dstEnd = m_buf + m_capacity;

// scan through all
char *send = s + len;
const char *send = s + len;
for ( ; s < send ; s++ ) {
// breathe
QUICKPOLL ( niceness );
@@ -1603,9 +1603,9 @@ bool SafeBuf::csvEncode ( char *s , int32_t len , int32_t niceness ) {
return true;
}

bool SafeBuf::base64Encode ( char *sx , int32_t len , int32_t niceness ) {
bool SafeBuf::base64Encode ( const char *sx , int32_t len , int32_t niceness ) {

unsigned char *s = (unsigned char *)sx;
const unsigned char *s = (const unsigned char *)sx;

if ( ! s ) return true;

@@ -1630,7 +1630,7 @@ bool SafeBuf::base64Encode ( char *sx , int32_t len , int32_t niceness ) {

unsigned char val;
// scan through all
unsigned char *send = s + len;
const unsigned char *send = s + len;
for ( ; s < send ; ) {
// breathe
QUICKPOLL ( niceness );
@@ -1696,7 +1696,7 @@ bool SafeBuf::base64Encode( char *s ) {
return base64Encode(s,gbstrlen(s));
}

bool SafeBuf::base64Decode ( char *src , int32_t srcLen , int32_t niceness ) {
bool SafeBuf::base64Decode ( const char *src , int32_t srcLen , int32_t niceness ) {

// make the map
static unsigned char s_bmap[256];
SafeBuf.h: 36 lines changed

@@ -17,17 +17,17 @@ class SafeBuf {
public:
//*TRUCTORS
SafeBuf();
SafeBuf(int32_t initSize, char *label);
SafeBuf(int32_t initSize, const char *label);

void constructor();

//be careful with passing in a stackBuf! it could go out
//of scope independently of the safebuf.
SafeBuf(char* stackBuf, int32_t cap, char* label = NULL);
SafeBuf(char* stackBuf, int32_t cap, const char* label = NULL);
SafeBuf(char *heapBuf, int32_t bufMax, int32_t bytesInUse, bool ownData);
~SafeBuf();

void setLabel ( char *label );
void setLabel ( const char *label );

// CAUTION: BE CAREFUL WHEN USING THE FOLLOWING TWO FUNCTIONS!!
// setBuf() allows you reset the contents of the SafeBuf to either
@@ -68,11 +68,11 @@ public:
// saves to tmp file and if that succeeds then renames to orig filename
int32_t safeSave (char *filename );

int32_t fillFromFile(char *filename);
int32_t fillFromFile(char *dir,char *filename, char *label=NULL);
int32_t load(char *dir,char *fname,char *label = NULL) {
int32_t fillFromFile(const char *filename);
int32_t fillFromFile(const char *dir, const char *filename, const char *label=NULL);
int32_t load(const char *dir, const char *fname, const char *label = NULL) {
return fillFromFile(dir,fname,label);};
int32_t load(char *fname) { return fillFromFile(fname);};
int32_t load(const char *fname) { return fillFromFile(fname);};

bool safeTruncateEllipsis ( char *src , int32_t maxLen );
bool safeTruncateEllipsis ( char *src , int32_t srcLen, int32_t maxLen );
@@ -103,21 +103,21 @@ public:
#else
bool safePrintf(const char *formatString, ...);
#endif
bool safeMemcpy(void *s, int32_t len){return safeMemcpy((char *)s,len);}
bool safeMemcpy(const void *s, int32_t len){return safeMemcpy((const char*)s,len);}
bool safeMemcpy(const char *s, int32_t len);
bool safeMemcpy_nospaces(char *s, int32_t len);
bool safeMemcpy_nospaces(const char *s, int32_t len);
bool safeMemcpy(SafeBuf *c){return safeMemcpy(c->m_buf,c->m_length);}
bool safeMemcpy ( class Words *w , int32_t a , int32_t b ) ;
bool safeStrcpy ( char *s ) ;
bool safeStrcpy ( const char *s ) ;
//bool safeStrcpyPrettyJSON ( char *decodedJson ) ;
bool safeUtf8ToJSON ( const char *utf8 ) ;
bool jsonEncode ( const char *utf8 ) { return safeUtf8ToJSON(utf8); }
bool jsonEncode ( char *utf8 , int32_t utf8Len );

bool csvEncode ( char *s , int32_t len , int32_t niceness = 0 );
bool csvEncode ( const char *s , int32_t len , int32_t niceness = 0 );

bool base64Encode ( char *s , int32_t len , int32_t niceness = 0 );
bool base64Decode ( char *src , int32_t srcLen , int32_t niceness = 0 ) ;
bool base64Encode ( const char *s , int32_t len , int32_t niceness = 0 );
bool base64Decode ( const char *src , int32_t srcLen , int32_t niceness = 0 ) ;

bool base64Encode( char *s ) ;

@@ -132,8 +132,8 @@ public:

// . if clearIt is true we init the new buffer space to zeroes
// . used by Collectiondb.cpp
bool reserve(int32_t i, char *label=NULL , bool clearIt = false );
bool reserve2x(int32_t i, char *label = NULL );
bool reserve(int32_t i, const char *label=NULL , bool clearIt = false );
bool reserve2x(int32_t i, const char *label = NULL );

char *makeSpace ( int32_t size ) {
if ( ! reserve ( size ) ) return NULL;
@@ -147,7 +147,7 @@ public:
};
void setLength(int32_t i) { m_length = i; }
char *getNextLine ( char *p ) ;
int32_t catFile(char *filename) ;
int32_t catFile(const char *filename) ;

void detachBuf();
bool insert ( class SafeBuf *c , int32_t insertPos ) ;
@@ -266,7 +266,7 @@ public:
// hack off trailing 0's
bool printFloatPretty ( float f ) ;

char* pushStr (char* str, uint32_t len);
char* pushStr (const char* str, uint32_t len);
bool pushPtr ( void *ptr );
bool pushLong (int32_t i);
bool pushLongLong (int64_t i);
@@ -307,7 +307,7 @@ public:
protected:
char *m_buf;
public:
char *m_label;
const char *m_label;
bool m_usingStack;
int16_t m_encoding; // output charset
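// ----------------------------------------------------------------
// Illustrative aside (not part of this commit): the SafeBuf changes
// above are a const-correctness pass -- every read-only char*
// parameter (labels, source buffers, filenames) becomes const char*.
// A standalone sketch of why that matters; the two functions are toy
// stand-ins, not SafeBuf methods:
// ----------------------------------------------------------------
#include <cstdio>
#include <cstring>

size_t copyOld(char *dst, char *src)       { strcpy(dst, src); return strlen(dst); }
size_t copyNew(char *dst, const char *src) { strcpy(dst, src); return strlen(dst); }

int main() {
    char buf[64];
    // copyOld(buf, "hello");  // rejected by C++11: a literal is const char*
    copyNew(buf, "hello");     // literals and other const data bind cleanly
    printf("%s\n", buf);
    return 0;
}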
SearchInput.cpp

@@ -61,59 +61,13 @@ key_t SearchInput::makeKey ( ) {
// space separated, NULL terminated, list of meta tag names to display
if ( m_displayMetas )
k.n0 = hash64b ( m_displayMetas , k.n0 );
// name of collection in external cluster to get titleRecs for
// related pages from
//if ( m_rp_getExternalPages && m_rp_externalColl )
// k.n0 = hash64b ( m_rp_externalColl , k.n0 );
// collection we import from
//if ( m_importColl )
// k.n0 = hash64b ( m_importColl , k.n0 );
// the special query parm
//if ( m_sq && m_sqLen > 0 )
// k.n0 = hash64 ( m_sq , m_sqLen , k.n0 );
//if ( m_noDocIds && m_noDocIdsLen )
// k.n0 = hash64 ( m_noDocIds , m_noDocIdsLen , k.n0 );
//if ( m_noSiteIds && m_noSiteIdsLen )
// k.n0 = hash64 ( m_noSiteIds , m_noSiteIdsLen , k.n0 );

// no need to hash these again separately, they are in between
// m_START and m_END_HASH
// language
//if ( m_language )
// k.n0 = hash64 ( m_language , k.n0 );
//if ( m_gblang )
// k.n0 = hash64 ( m_gblang , k.n0 );
// . now include the hash of the search parameters
// . not including m_docsToScanForTopics since we got TopicGroups
char *a = ((char *)&m_START) + 4 ; // msg40->m_dpf;
char *b = (char *)&m_END_HASH ; // msg40->m_topicGroups;
int32_t size = b - a;
// push and flush some parms that should not contribute
//int32_t save1 = m_refs_numToDisplay;
//int32_t save2 = m_rp_numToDisplay;
//int32_t save3 = m_numTopicsToDisplay;
//m_refs_numToDisplay = 0;
//m_rp_numToDisplay = 0;
//m_numTopicsToDisplay = 0;
// and hash it all up
k.n0 = hash64 ( a , size , k.n0 );
// and pop out the parms that did not contribute
//m_refs_numToDisplay = save1;
//m_rp_numToDisplay = save2;
//m_numTopicsToDisplay = save3;
// hash each topic group
for ( int32_t i = 0 ; i < 1 ; i++ ) {
TopicGroup *t = &m_topicGroups[i];
//k.n0 = hash64 ( t->m_numTopics , k.n0 );
k.n0 = hash64 ( t->m_maxTopics , k.n0 );
k.n0 = hash64 ( t->m_docsToScanForTopics , k.n0 );
k.n0 = hash64 ( t->m_minTopicScore , k.n0 );
k.n0 = hash64 ( t->m_maxWordsPerTopic , k.n0 );
k.n0 = hash64b( t->m_meta , k.n0 );
k.n0 = hash64 ( t->m_delimeter , k.n0 );
k.n0 = hash64 ( t->m_useIdfForTopics , k.n0 );
k.n0 = hash64 ( t->m_dedup , k.n0 );
}
// . boolean queries have operators (AND OR NOT ( ) ) that we need
// to consider in this hash as well. so
// . so just hash the whole damn query
@@ -313,18 +267,13 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
// now override automatic defaults for special cases
if ( tmpFormat != FORMAT_HTML ) {
m_familyFilter = 0;
m_numTopicsToDisplay = 0;
m_doQueryHighlighting = 0;
//m_spellCheck = 0;
m_getDocIdScoringInfo = false;
// turn gigabits off by default if not html
//m_docsToScanForTopics = 0;
}

// if they have a list of sites...
if ( m_sites && m_sites[0] ) {
m_doSiteClustering = false;
m_ipRestrictForTopics = false;
}

@@ -576,18 +525,10 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
m_doSiteClustering = true;

// turn off some parms
if ( m_q.m_hasUrlField )
m_ipRestrictForTopics = false;
if ( m_q.m_hasIpField )
m_ipRestrictForTopics = false;
if ( m_q.m_hasPositiveSiteField ) {
m_ipRestrictForTopics = false;
m_doSiteClustering = false;
}

if ( cr && ! cr->m_ipRestrict )
m_ipRestrictForTopics = false;

if ( m_q.m_hasQuotaField ) {
m_doSiteClustering = false;
m_doDupContentRemoval = false;
@@ -629,36 +570,6 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
// save it
m_rcache = readFromCache;

//
// TODO: use Parms.cpp defaults
//
TopicGroup *tg = &m_topicGroups[0];

//
//
// gigabits
//
//
tg->m_numTopics = 50;
tg->m_maxTopics = 50;
tg->m_docsToScanForTopics = m_docsToScanForTopics;
tg->m_minTopicScore = 0;
tg->m_maxWordsPerTopic = 6;
tg->m_meta[0] = '\0';
tg->m_delimeter = '\0';
tg->m_useIdfForTopics = false;
tg->m_dedup = true;
// need to be on at least 2 pages!
tg->m_minDocCount = 2;
tg->m_ipRestrict = m_ipRestrictForTopics;
tg->m_dedupSamplePercent = 80;
tg->m_topicRemoveOverlaps = true;
tg->m_topicSampleSize = 4096;
// max sequential punct chars allowed in a topic
tg->m_topicMaxPunctLen = 1;

return true;
}
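// ----------------------------------------------------------------
// Illustrative aside (not part of this commit): makeKey() above
// hashes the raw bytes between two sentinel members, m_START and
// m_END_HASH, so any parm declared between them automatically feeds
// the cache key. A minimal standalone sketch of that pattern; mix64
// is a toy FNV-1a step standing in for gb's hash64, and the parms
// are invented. (Like the original, this assumes the span holds no
// uninitialized padding bytes.)
// ----------------------------------------------------------------
#include <cstdint>

struct ParmsSketch {
    int32_t m_START;        // sentinel: hashing starts just past this member
    int32_t m_familyFilter; // everything declared in between contributes
    int32_t m_docsToScan;
    int32_t m_END_HASH;     // sentinel: hashing stops here
};

static uint64_t mix64(uint64_t h, unsigned char b) {
    return (h ^ b) * 0x100000001b3ULL;
}

uint64_t makeKeySketch(const ParmsSketch &p) {
    // same trick as "((char *)&m_START) + 4": skip the marker itself
    const char *a = (const char *)&p.m_START + sizeof(p.m_START);
    const char *b = (const char *)&p.m_END_HASH;
    uint64_t h = 0xcbf29ce484222325ULL;
    for (; a < b; a++) h = mix64(h, (unsigned char)*a);
    return h;
}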
SearchInput.h

@@ -22,27 +22,6 @@

#define MAX_TOPIC_GROUPS 1

// . parameters used to generate a set of related topics (gigabits)
// . you can have Msg24 generate multiple sets of related topics in one call
class TopicGroup {
public:
int32_t m_numTopics;
int32_t m_maxTopics;
int32_t m_docsToScanForTopics;
int32_t m_minTopicScore;
int32_t m_maxWordsPerTopic;
char m_meta[32];
char m_delimeter;
bool m_useIdfForTopics;
bool m_dedup;
int32_t m_minDocCount ;
bool m_ipRestrict ;
char m_dedupSamplePercent; // -1 means no deduping
bool m_topicRemoveOverlaps;
int32_t m_topicSampleSize;
int32_t m_topicMaxPunctLen;
};

class SearchInput {

public:
@@ -53,9 +32,6 @@ class SearchInput {
void test ( );
key_t makeKey ( ) ;

// private
void setTopicGroups ( class HttpRequest *r ,
class CollectionRec *cr ) ;
bool setQueryBuffers ( class HttpRequest *hr ) ;

//void setToDefaults ( class CollectionRec *cr , int32_t niceness ) ;
@@ -110,7 +86,6 @@ class SearchInput {
char m_isCollAdmin;

// these are set from things above
TopicGroup m_topicGroups [ MAX_TOPIC_GROUPS ];// msg40
SafeBuf m_sbuf1;
SafeBuf m_sbuf2;

@@ -146,7 +121,6 @@ class SearchInput {
char m_wcache; // msg40

char m_debug; // msg40
char m_debugGigabits;

char m_spiderResults;
char m_spiderResultRoots;
@@ -157,7 +131,6 @@ class SearchInput {

// do not include these in makeKey()
int32_t m_numTopicsToDisplay;
int32_t m_refs_numToDisplay;
int32_t m_rp_numToDisplay;

@@ -204,7 +177,6 @@ class SearchInput {
char m_excludeMetaText;
char m_doBotDetection;
int32_t m_includeCachedCopy;
char m_getSectionVotingInfo;
char m_familyFilter; // msg40
char m_showErrors;
char m_doSiteClustering; // msg40
@@ -228,18 +200,6 @@ class SearchInput {

char *m_filetype;

// . related topic (gigabits) parameters
// . TODO: prepend m_top_ to these var names
int32_t m_docsToScanForTopics; // msg40
int32_t m_minTopicScore; // msg40
int32_t m_minDocCount; // msg40
int32_t m_dedupSamplePercent; // msg40
int32_t m_maxWordsPerTopic; // msg40
int32_t m_ipRestrictForTopics; // msg40
char m_returnDocIdCount; // msg40
char m_returnDocIds; // msg40
char m_returnPops; // msg40

// . reference page parameters
// . copied from CollectionRec.h
int32_t m_refs_numToGenerate; // msg40
@@ -306,12 +266,9 @@ class SearchInput {
int32_t m_docsToScanForReranking;
float m_pqr_demFactSubPhrase;
float m_pqr_demFactCommonInlinks;
float m_pqr_demFactLocTitle;
float m_pqr_demFactLocSummary;
float m_pqr_demFactProximity;
float m_pqr_demFactInSection;
float m_pqr_demFactOrigScore;
bool m_pqr_demInTopics;
// . buzz stuff (buzz)
// . these control the set of results, so should be in the makeKey()
// as it is, in between the start and end hash vars
@@ -348,15 +305,9 @@ class SearchInput {
////////

// . end the section we hash in SearchInput::makeKey()
// . we also hash displayMetas, TopicGroups and Query into the key
// . we also hash displayMetas and Query into the key
int32_t m_END_HASH;

//////
//
// STUFF NOT REALLY USED NOW
//
//////

// a marker for SearchInput::test()
int32_t m_END_TEST;
Sections.cpp: 9293 lines changed
(file diff suppressed because it is too large)

Sections.h: 651 lines changed
@@ -7,8 +7,6 @@
#include "Bits.h"
#include "Words.h"
#include "Rdb.h"
//#include "DiskPageCache.h"

// KEY:
// ssssssss ssssssss ssssssss ssssssss s = 48 bit site hash
@@ -21,7 +19,7 @@
// NNNNNNNN NNNNNNNN NNNNNNNN NNNNNNNN N = SectionVote::m_numSampled

// h: hash value. typically the lower 32 bits of the
// Section::m_sentenceContentHash64 or the Section::m_contentHash64 vars. we
// Section::m_contentHash64 vars. we
// do not need the full 64 bits because we have the 48 bit site hash included
// to reduce collisions substantially.

@@ -33,80 +31,43 @@
// . these are descriptive flags, they are computed when Sections is set
// . SEC_NOTEXT sections do not vote, i.e. they are not stored in Sectiondb
#define SEC_NOTEXT 0x0001 // implies section has no alnum words

// . Weights.cpp zeroes out the weights for these types of sections
// . is section delimited by the <script> tag, <marquee> tag, etc.
//#define SEC_UNUSED 0x0002
//#define SEC_UNUSED 0x0004
#define SEC_SCRIPT 0x0008
#define SEC_STYLE 0x0010
#define SEC_SELECT 0x0020
#define SEC_MARQUEE 0x0040
#define SEC_CONTAINER 0x0080

// . in title/header. for gigabits in XmlDoc.cpp
// . is section delimited by <title> or <hN> tags?
#define SEC_IN_TITLE 0x0100
#define SEC_IN_HEADER 0x0200

// used by Events.cpp to indicate if section contains a TimeOfDay ("7 p.m.")
#define SEC_HAS_TOD 0x0400
//#define SEC_UNUSED 0x0040
//#define SEC_UNUSED 0x0080
#define SEC_IN_TITLE 0x0100 // in title
#define SEC_IN_HEADER 0x0200 // in <hN> tags
//#define SEC_UNUSED 0x0400
#define SEC_HIDDEN 0x0800 // <div style="display: none">
#define SEC_IN_TABLE 0x1000
//#define SEC_UNUSED 0x1000
#define SEC_FAKE 0x2000 // <hr>/<br>/sentence based faux section
#define SEC_NOSCRIPT 0x4000
//#define SEC_UNUSED 0x8000

#define SEC_HEADING_CONTAINER 0x8000

#define SEC_MENU 0x010000
#define SEC_LINK_TEXT 0x020000
#define SEC_MENU_HEADER 0x040000
#define SEC_INPUT_HEADER 0x080000
#define SEC_INPUT_FOOTER 0x100000
#define SEC_HEADING 0x200000

// reasons why a section is not an event
#define SEC_UNBALANCED 0x00400000 // interlaced section/tags
#define SEC_OPEN_ENDED 0x00800000 // no closing tag found
#define SEC_MENU 0x00010000
#define SEC_LINK_TEXT 0x00020000
#define SEC_MENU_HEADER 0x00040000
#define SEC_INPUT_HEADER 0x00080000
#define SEC_INPUT_FOOTER 0x00100000
#define SEC_HEADING 0x00200000
//#define SEC_UNUSED 0x00400000
//#define SEC_UNUSED 0x00800000
#define SEC_SENTENCE 0x01000000 // made by a sentence?
#define SEC_PLAIN_TEXT 0x02000000
//#define SEC_UNUSED_1 0x04000000
//#define SEC_UNUSED 0x04000000
//#define SEC_UNUSED 0x00008000000LL
//#define SEC_UNUSED 0x00010000000LL
//#define SEC_UNUSED 0x00020000000LL
//#define SEC_UNUSED 0x00040000000LL
//#define SEC_UNUSED 0x00080000000LL

// . this is set in Dates.cpp and used by Dates.cpp and Events.cpp
// . we identify max tod sections and make it so brothers in a list of two
// or more such sections cannot telescope to each other's dates, and so we
// do not share each other's event descriptions. fixes abqtango.com
// and salsapower.com from grabbing event description text from "failed"
// event sections that are brothers to successful event sections.
#define SEC_TOD_EVENT 0x00008000000LL
#define SEC_NIXED_HEADING_CONTAINER 0x00010000000LL

#define SEC_SECOND_TITLE 0x00020000000LL
#define SEC_SPLIT_SENT 0x00040000000LL
#define SEC_HAS_REGISTRATION 0x00080000000LL

#define SEC_HAS_PARKING 0x00100000000LL
//#define SEC_UNUSED 0x00100000000LL
#define SEC_MENU_SENTENCE 0x00200000000LL
// fix for folkmads.org:
#define SEC_HR_CONTAINER 0x00400000000LL
#define SEC_HAS_DOM 0x00800000000LL
#define SEC_HAS_DOW 0x01000000000LL
#define SEC_EVENT_BROTHER 0x02000000000LL
#define SEC_DATE_LIST_CONTAINER 0x04000000000LL
#define SEC_TAIL_CRAP 0x08000000000LL

#define SEC_CONTROL 0x0000010000000000LL
#define SEC_STRIKE 0x0000020000000000LL
#define SEC_STRIKE2 0x0000040000000000LL
#define SEC_HAS_MONTH 0x0000080000000000LL
#define SEC_IGNOREEVENTBROTHER 0x0000100000000000LL
#define SEC_HASEVENTDOMDOW 0x0000200000000000LL
#define SEC_STOREHOURSCONTAINER 0x0000400000000000LL
#define SEC_PUBDATECONTAINER 0x0000800000000000LL

#define SEC_TABLE_HEADER 0x0001000000000000LL
#define SEC_HASDATEHEADERROW 0x0002000000000000LL
#define SEC_HASDATEHEADERCOL 0x0004000000000000LL
#define SEC_MULTIDIMS 0x0008000000000000LL
#define SEC_HASHXPATH 0x0010000000000000LL
//#define SEC_UNUSED 0x00400000000LL
//#define SEC_UNUSED 0x00800000000LL

// . some random-y numbers for Section::m_baseHash
// . used by splitSection() function
@@ -114,174 +75,10 @@
#define BH_SENTENCE 4590649
#define BH_IMPLIED 95468323

// values for Section::m_sentFlags (sentence flags)
#define SENT_HAS_COLON 0x00000001
//#define SENT_UNUSED_1 0x00000002
#define SENT_BAD_FIRST_WORD 0x00000004
#define SENT_MIXED_CASE 0x00000008
#define SENT_POWERED_BY 0x00000010
#define SENT_MULT_EVENTS 0x00000020
#define SENT_PAGE_REPEAT 0x00000040
#define SENT_NUMBERS_ONLY 0x00000080
//#define SENT_UNUSED_6 0x00000100
#define SENT_SECOND_TITLE 0x00000200
#define SENT_IS_DATE 0x00000400
#define SENT_LAST_STOP 0x00000800
#define SENT_NUMBER_START 0x00001000
#define SENT_TAG_INDICATOR 0x00002000
#define SENT_PRETTY 0x00004000
#define SENT_IN_HEADER 0x00008000
#define SENT_MIXED_CASE_STRICT 0x00010000
#define SENT_IN_LIST 0x00020000
#define SENT_COLON_ENDS 0x00040000
//#define SENT_UNUSED_7 0x00080000
#define SENT_IN_TITLEY_TAG 0x00100000
#define SENT_CITY_STATE 0x00200000
#define SENT_PRICEY 0x00400000
#define SENT_PERIOD_ENDS 0x00800000
#define SENT_HAS_PHONE 0x01000000
#define SENT_IN_MENU 0x02000000
#define SENT_MIXED_TEXT 0x04000000
#define SENT_TAGS 0x08000000
#define SENT_INTITLEFIELD 0x10000000
#define SENT_STRANGE_PUNCT 0x20000000
#define SENT_INPLACEFIELD 0x40000000
#define SENT_INNONTITLEFIELD 0x80000000

//#define SENT_UNUSED_2 0x0000000100000000LL
#define SENT_HASNOSPACE 0x0000000200000000LL
#define SENT_IS_BYLINE 0x0000000400000000LL
#define SENT_NON_TITLE_FIELD 0x0000000800000000LL
#define SENT_TITLE_FIELD 0x0000001000000000LL
#define SENT_UNIQUE_TAG_HASH 0x0000002000000000LL
#define SENT_AFTER_SENTENCE 0x0000004000000000LL
#define SENT_WORD_SANDWICH 0x0000008000000000LL
//#define SENT_UNUSED_3 0x0000010000000000LL
#define SENT_NUKE_FIRST_WORD 0x0000020000000000LL
#define SENT_FIELD_NAME 0x0000040000000000LL
#define SENT_PERIOD_ENDS_HARD 0x0000080000000000LL
#define SENT_PARENS_START 0x0000100000000000LL
#define SENT_IN_MENU_HEADER 0x0000200000000000LL
#define SENT_IN_TRUMBA_TITLE 0x0000400000000000LL
//#define SENT_UNUSED_8 0x0000800000000000LL
|
||||
#define SENT_FORMTABLE_FIELD 0x0001000000000000LL
|
||||
#define SENT_FORMTABLE_VALUE 0x0002000000000000LL
|
||||
#define SENT_IN_TAG 0x0004000000000000LL
|
||||
#define SENT_AFTER_SPACER 0x0008000000000000LL
|
||||
#define SENT_BEFORE_SPACER 0x0010000000000000LL
|
||||
#define SENT_OBVIOUS_PLACE 0x0020000000000000LL
|
||||
//#define SENT_UNUSED_4 0x0040000000000000LL
|
||||
#define SENT_HASSOMEEVENTSDATE 0x0080000000000000LL
|
||||
#define SENT_AFTER_COLON 0x0100000000000000LL
|
||||
#define SENT_HASTITLEWORDS 0x0200000000000000LL
|
||||
//#define SENT_UNUSED_5 0x0400000000000000LL
|
||||
//#define SENT_UNUSED_9 0x0800000000000000LL
|
||||
#define SENT_IN_BIG_LIST 0x1000000000000000LL
|
||||
#define SENT_BADEVENTSTART 0x2000000000000000LL
|
||||
#define SENT_MENU_SENTENCE 0x4000000000000000LL
|
||||
#define SENT_HAS_PRICE 0x8000000000000000ULL
|
||||
|
||||
#define NOINDEXFLAGS (SEC_SCRIPT|SEC_STYLE|SEC_SELECT)
|
||||
|
||||
// the section type (bit flag vector for SEC_*) is currently 32 bits
|
||||
typedef int64_t sec_t;
|
||||
//typedef int64_t titleflags_t;
|
||||
typedef int64_t sentflags_t;
|
||||
typedef uint32_t turkbits_t;
|
||||
|
||||
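Since the flag values above run well past 32 bits, sec_t has to be an int64_t. As a minimal sketch of how these masks are meant to be combined and tested (the helper below is an illustration, not a function in this header; only NOINDEXFLAGS and SEC_HIDDEN come from the definitions above):

// Illustrative only: true if a section's text should be kept out of
// the index. NOINDEXFLAGS covers <script>/<style>/<select> sections,
// which carry no indexable text; display:none divs are parsed but
// usually unwanted as well.
static bool isNoIndexSection ( sec_t flags ) {
	if ( flags & NOINDEXFLAGS ) return true;
	if ( flags & SEC_HIDDEN   ) return true;
	return false;
}
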
bool isPlaceIndicator ( int64_t *widp ) ;
char *getSentBitLabel ( sentflags_t sf ) ;
sentflags_t getMixedCaseFlags ( class Words *words ,
				wbit_t *bits ,
				int32_t senta ,
				int32_t sentb ,
				int32_t niceness ) ;
int32_t hasTitleWords ( sentflags_t sflags ,
			int32_t senta,
			int32_t sentb,
			int32_t alnumCount,
			class Bits *bits ,
			class Words *words ,
			bool useAsterisk ,
			int32_t niceness );


class Sectiondb {

public:

	// reset rdb
	void reset();

	bool verify ( char *coll );

	bool addColl ( char *coll, bool doVerify = true );

	// init m_rdb
	bool init ();

	// init secondary/rebuild sectiondb
	bool init2 ( int32_t treeMem ) ;

	Rdb *getRdb() { return &m_rdb; }

	uint64_t getSiteHash ( void *k ) {
		return ((*(uint64_t *)(((char *)k)+8))) >> 16;};


	uint32_t getSectionHash ( void *k ) {
		return (*(uint32_t *)(((char *)k)+6)); }


	int64_t getDocId ( void *k ) {
		return ((*(uint64_t *)k) >> 2) & DOCID_MASK; }


	uint8_t getSectionType ( void *k ) {
		return ((unsigned char *)k)[5]; };

	// holds binary format section entries
	Rdb m_rdb;

	//DiskPageCache *getDiskPageCache ( ) { return &m_pc; };

	//DiskPageCache m_pc;
};

extern class Sectiondb g_sectiondb;
extern class Sectiondb g_sectiondb2;


// this is only needed for sections, not facets in general, I don't think.
// facets has the whole QueryTerm::m_facetHashTable array with more info
//
// . for gbfacet:gbxpathsite1234567 posdb query stats compilation to
//   show how many pages duplicate your section's content on your site
//   at the same xpath. the hash of the innerHTML for that xpath is
//   embedded into the posdb key like a number in a number key, so the
//   wordpos bits etc are sacrificed to hold that 32-bit number.
// . used by XmlDoc::getSectionsWithDupStats() for display in
//   XmlDoc::printRainbowSections()
// . these are in QueryTerm::m_facetStats and computed from
//   QueryTerm::m_facetHashTable
class SectionStats {
public:
	SectionStats() { reset(); }
	void reset ( ) {
		m_totalMatches  = 0; // posdb key "val" matches ours
		m_totalEntries  = 0; // total posdb keys
		m_numUniqueVals = 0; // # of unique "vals"
		m_totalDocIds   = 0;
	};
	// # of times xpath innerhtml matched ours. 1 count per docid max.
	int64_t m_totalMatches;
	// # of times this xpath occurred. a doc can count multiple times.
	int64_t m_totalEntries;
	// # of unique vals this xpath had. a doc can have multiple counts.
	int64_t m_numUniqueVals;
	int64_t m_totalDocIds;
};

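A hedged sketch of how these counters combine in practice (the helper below is invented for illustration; only the four m_* fields come from the class above):

// Illustration only: fraction of sampled docs on this site whose
// innerHTML at this xpath matches ours. m_totalMatches is capped at
// one count per docid, so the ratio stays in [0,1].
static float getDupFraction ( const SectionStats &s ) {
	if ( s.m_totalDocIds <= 0 ) return 0.0;
	return (float)s.m_totalMatches / (float)s.m_totalDocIds;
}
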
class Section {
public:
@ -295,9 +92,6 @@ public:
	class Section *m_next;
	class Section *m_prev;

	// used by Events.cpp to count # of timeofdays in section
	//class Event *m_event;

	// . if we are an element in a list, what is the list container section
	// . a containing section is a section containing MULTIPLE
	//   smaller sections
@ -314,24 +108,6 @@ public:
	// are a sentence section then this points to itself.
	class Section *m_sentenceSection;

	// . set in XmlDoc::getSectionsWithDupStats()
	// . voting info for this section over all indexed pages from this site
	SectionStats m_stats;

	int32_t m_votesForDup;
	int32_t m_votesForNotDup;
	float getSectiondbVoteFactor ( ) {
		// now punish if repeated on many pages on the site
		float a = (float)m_votesForNotDup;
		float b = (float)m_votesForDup;
		if ( a == 0 && b == 0 ) return 1.0;
		// use that as a modifier
		float factor = a / ( a + b );
		// minimum so we do not completely nuke the title i guess
		if ( factor < .10 ) factor = .10;
		return factor;
	};

	// position of the first and last alnum word contained directly OR
	// indirectly in this section. use -1 if no text contained...
	int32_t m_firstWordPos;
@ -348,32 +124,11 @@ public:
	int32_t m_senta;
	int32_t m_sentb;

	// each sentence is numbered
	//int32_t m_sentNum;

	class Section *m_prevSent;
	class Section *m_nextSent;

	// . if we are in a table, what position are we
	// . starts at 1 and goes upwards
	// . we start it at 1 so that way we know that 0 is invalid!
	int32_t m_rowNum;
	int32_t m_colNum;
	class Section *m_tableSec;

	class Section *m_headColSection;
	class Section *m_headRowSection;

	class Section *m_leftCell;
	class Section *m_aboveCell;

	// hash of this tag's baseHash and all its parents' baseHashes combined
	uint32_t m_tagHash;

	// like above but for turk voting. includes hash of the class tag attr
	// from m_turkBaseHash, whereas m_tagHash uses m_baseHash of parent.
	uint32_t m_turkTagHash32;

	// for debug output display of color coded nested sections
	uint32_t m_colorHash;

@ -384,35 +139,13 @@ public:
	// div and span tags, etc. to make them unique
	uint32_t m_baseHash;

	// just hash the "class=" value along with the tagid
	uint32_t m_turkBaseHash;

	// kinda like m_baseHash but for xml tags and only hashes the
	// tag name and none of the fields
	uint32_t m_xmlNameHash;

	// these deal with enumerated tags and are used by Events.cpp
	int32_t m_occNum;
	int32_t m_numOccurences;

	// used by XmlDoc.cpp to set a topological distance
	int32_t m_topDist;

	// hash of all the alnum words DIRECTLY in this section
	uint64_t m_contentHash64;

	uint64_t m_sentenceContentHash64;

	// . used by the SEC_EVENTBROTHER algo in Dates.cpp to detect
	//   [more] or [details] links that indicate distinct items
	// . sometimes the "(more)" link is combined into the last sentence
	//   so we have to treat the last link kinda like its own sentence too!
	uint32_t m_lastLinkContentHash32;

	// hash of all sentences contained indirectly or directly.
	// uses m_sentenceContentHash64 (for sentences)
	uint64_t m_indirectSentHash64;

	// . range of words in Words class we encompass
	// . m_wordStart and m_wordEnd are the tag word #'s
	// . ACTUALLY it is a half-closed interval [a,b) like all else
@ -422,45 +155,27 @@ public:
	int32_t m_a;//wordStart;
	int32_t m_b;//wordEnd;

	// for event titles and descriptions
	sentflags_t m_sentFlags;

	// . # alnum words only in this and only this section
	// . if we have none, we are SEC_NOTEXT
	int32_t m_exclusive;

	// our depth. # of tags in the hash
	int32_t m_depth;

	// container for the #define'd SEC_* values above
	sec_t m_flags;

	// used to mark it in Dates.cpp like a breadcrumb trail
	int32_t m_mark;

	// Events.cpp assigns a date to each section
	int32_t m_firstDate;

	char m_used;

	// used in Sections::splitSections() function
	int32_t m_processedHash;

	int32_t m_gbFrameNum;

	// do we contain section "arg"?
	bool contains ( class Section *arg ) {
		return ( m_a <= arg->m_a && m_b >= arg->m_b ); };
	bool contains( class Section *arg ) {
		return ( m_a <= arg->m_a && m_b >= arg->m_b );
	}

	// do we contain section "arg"?
	bool strictlyContains ( class Section *arg ) {
		if ( m_a < arg->m_a && m_b >= arg->m_b ) return true;
		if ( m_a <= arg->m_a && m_b > arg->m_b ) return true;
		return false;
	};

	// does this section contain the word #a?
	bool contains2 ( int32_t a ) { return ( m_a <= a && m_b > a ); };
	}

	bool isVirtualSection ( ) ;
};
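To make getSectiondbVoteFactor() concrete, a worked fragment with invented vote counts (this snippet is not repo code and assumes it sits inside some function body):

// Invented numbers: 3 sitemates voted "not dup", 7 voted "dup" for
// this section, so factor = 3.0 / (3.0 + 7.0) = 0.3, which is above
// the 0.10 floor that keeps titles from being nuked entirely.
Section s;
s.m_votesForDup    = 7;
s.m_votesForNotDup = 3;
float factor = s.getSectiondbVoteFactor(); // 0.3
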
@ -474,84 +189,48 @@
#define FMT_JSON 3

class Sections {
public:
	Sections();
	~Sections();

public:

	Sections ( ) ;
	void reset() ;
	~Sections ( ) ;
	void reset();

	// . returns false if blocked, true otherwise
	// . returns true and sets g_errno on error
	// . sets m_sections[] array, 1-1 with words array "w"
	bool set(class Words *w, class Phrases *phrases, class Bits *bits, class Url *url,
	bool set(class Words *w, class Bits *bits, class Url *url,
		 int64_t siteHash64, char *coll, int32_t niceness, uint8_t contentType );

	bool addVotes(class SectionVotingTable *nsvt, uint32_t tagPairHash );

	bool verifySections ( ) ;

	int32_t getStoredSize ( ) ;
	static int32_t getStoredSize ( char *p ) ;
	int32_t serialize ( char *p ) ;

	bool growSections ( );

	bool getSectiondbList ( );
	bool gotSectiondbList ( bool *needsRecall ) ;

	void setNextBrotherPtrs ( bool setContainer ) ;

	// this is used by Events.cpp Section::m_nextSent
	void setNextSentPtrs();

	bool print ( SafeBuf *sbuf ,
		     class HashTableX *pt ,
		     class HashTableX *et ,
		     class HashTableX *st ,
		     class HashTableX *at ,
		     class HashTableX *tt ,
		     //class HashTableX *rt ,
		     class HashTableX *priceTable ) ;
	bool print( SafeBuf *sbuf, class HashTableX *pt, class HashTableX *et, class HashTableX *st,
		    class HashTableX *at, class HashTableX *tt, class HashTableX *priceTable );

	void printFlags ( class SafeBuf *sbuf , class Section *sn ) ;

	bool printVotingInfoInJSON ( SafeBuf *sb ) ;
	bool print2(SafeBuf *sbuf, int32_t hiPos, int32_t *wposVec, char *densityVec,
		    char *wordSpamVec, char *fragVec, char format = FMT_HTML );

	bool print2 ( SafeBuf *sbuf ,
		      int32_t hiPos,
		      int32_t *wposVec,
		      char *densityVec,
		      char *diversityVec,
		      char *wordSpamVec,
		      char *fragVec,
		      char format = FMT_HTML );
	bool printSectionDiv ( class Section *sk , char format = FMT_HTML );
	bool printSectionDiv ( Section *sk , char format = FMT_HTML );
	class SafeBuf *m_sbuf;

	char *getSectionsReply ( int32_t *size );
	char *getSectionsVotes ( int32_t *size );

	bool isHardSection ( class Section *sn );
	bool isHardSection ( Section *sn );

	bool setMenus ( );

	bool setFormTableBits ( ) ;
	bool setTableRowsAndCols ( class Section *tableSec ) ;
	bool setTableHeaderBits ( class Section *table );
	bool setTableScanPtrs ( class Section *ts ) ;

	void setHeader ( int32_t r , class Section *first , sec_t flag ) ;

	bool setHeadingBit ( ) ;

	void setTagHashes ( ) ;

	bool setRegistrationBits ( ) ;
	bool m_setRegBits ;

	bool m_alnumPosValid;

	// save it
	class Words *m_words ;
	class Bits *m_bits ;
@ -564,39 +243,15 @@ class Sections {

	int32_t *m_wposVec;
	char *m_densityVec;
	char *m_diversityVec;
	char *m_wordSpamVec;
	char *m_fragVec;

	// url ends in .rss or .xml ?
	bool m_isRSSExt;

	bool m_isFacebook ;
	bool m_isEventBrite ;
	bool m_isStubHub ;

	Msg0 m_msg0;
	key128_t m_startKey;
	int32_t m_recall;
	IndexList m_list;
	int64_t m_termId;

	int32_t m_numLineWaiters;
	bool m_waitInLine;
	int32_t m_articleStartWord;
	int32_t m_articleEndWord;
	bool m_hadArticle;
	int32_t m_numInvalids;
	int32_t m_totalSiteVoters;

	int32_t m_numAlnumWordsInArticle;

	// word #'s (-1 means invalid)
	int32_t m_titleStart;
	int32_t m_titleEnd;
	int32_t m_titleStartAlnumPos;

	int32_t m_numVotes;

	// these are 1-1 with the Words::m_words[] array
	class Section **m_sectionPtrs;
@ -604,25 +259,8 @@ class Sections {
	// save this too
	int32_t m_nw ;

	// new stuff
	HashTableX m_ot;
	HashTableX m_vt;

	// for caching partition scores
	HashTableX m_ct;

	// buf for serializing m_osvt into
	char *m_buf;
	int32_t m_bufSize;


	// buf for serializing m_nsvt into
	char *m_buf2;
	int32_t m_bufSize2;

	// allocate m_sections[] buffer
	class Section *m_sections;
	//int32_t m_sectionsBufSize;
	int32_t m_numSections;
	int32_t m_maxNumSections;

@ -633,71 +271,25 @@ class Sections {
	// see what section a word is in.
	SafeBuf m_sectionPtrBuf;

	int32_t m_numSentenceSections;

	bool m_isTestColl;

	// assume no malloc
	bool m_needsFree;
	char m_localBuf [ SECTIONS_LOCALBUFSIZE ];

	// set a flag
	bool m_badHtml;

	int64_t *m_wids;
	int64_t *m_pids;
	int32_t *m_wlens;
	char **m_wptrs;
	nodeid_t *m_tids;

	// the new way
	bool addImpliedSections ( );

	bool setSentFlagsPart1 ( );
	bool setSentFlagsPart2 ( );
	sentflags_t getSentEventEndingOrBeginningFlags ( sentflags_t sflags ,
							 int32_t senta ,
							 int32_t sentb ,
							 int32_t alnumCount) ;
	void setSentPrettyFlag ( class Section *si ) ;
	int32_t m_hiPos;
	bool m_sentFlagsAreSet;
	bool m_addedImpliedSections;

	int32_t addImpliedSections3 ();
	int32_t getDelimScore ( class Section *bro,
				char method,
				class Section *delim ,
				class Partition *part );
	int32_t getDelimHash ( char method , class Section *bro ) ;

	bool addImpliedLists ( ) ;
	int32_t getDelimScore2 ( class Section *bro,
				 char method,
				 class Section *delim ,
				 int32_t *a ,
				 int32_t *b );

	bool hashSentBits ( class Section *sx ,
			    class HashTableX *vht ,
			    class Section *container ,
			    uint32_t mod ,
			    class HashTableX *labelTable,
			    char *modLabel );

	bool hashSentPairs ( Section *sx ,
			     Section *sb ,
			     HashTableX *vht ,
			     Section *container ,
			     HashTableX *labelTable );

	bool addSentenceSections ( ) ;

	class Section *insertSubSection ( int32_t a, int32_t b, int32_t newBaseHash ) ;

	int32_t splitSectionsByTag ( nodeid_t tagid ) ;
	bool splitSections ( char *delimeter , int32_t dh );

	class Section *m_rootSection; // the first section, aka m_firstSection
	class Section *m_lastSection;

@ -706,72 +298,8 @@ class Sections {
	// kinda like m_rootSection, the first sentence section that occurs
	// in the document, is NULL iff no sentences in document
	class Section *m_firstSent;
	class Section *m_lastSent;

	bool containsTagId ( class Section *si, nodeid_t tagId ) ;

	bool isTagDelimeter ( class Section *si , nodeid_t tagId ) ;

	bool isDelimeter ( int32_t i , char *delimeter , int32_t *delimEnd ) {

		// . HACK: special case when delimeter is 0x01
		// . that means we are back-to-back br tags
		if ( delimeter == (char *)0x01 ) {
			// must be a br tag
			if ( m_tids[i] != TAG_BR ) return false;
			// assume that
			int32_t k = i + 1;
			// bad if end
			if ( k >= m_nw ) return false;
			// bad if a wid
			if ( m_wids[k] ) return false;
			// inc if punct
			if ( ! m_tids[k] ) k++;
			// bad if end
			if ( k >= m_nw ) return false;
			// must be another br tag
			if ( m_tids[k] != TAG_BR ) return false;
			// mark as end i guess
			*delimEnd = k + 1;
			return true;
		}

		// no word is a delimeter
		if ( m_wids[i] ) return false;
		// tags "<hr" and "<br"
		if ( m_wptrs[i][0] == delimeter[0] &&
		     m_wptrs[i][1] == delimeter[1] &&
		     m_wptrs[i][2] == delimeter[2] )
			return true;
		// if no match above, forget it
		if ( m_tids[i] ) return false;
		// otherwise, we are a punctuation "word"
		// the bullet is 3 bytes long
		if ( m_wlens[i] < 3 ) return false;
		// if not a bullet, skip it (&bull)
		char *p = m_wptrs[i];
		char *pend = p + m_wlens[i];
		for ( ; p < pend ; p++ ) {
			if ( p[0] != delimeter[0] ) continue;
			if ( p[1] != delimeter[1] ) continue;
			if ( p[2] != delimeter[2] ) continue;
			return true;
		}
		return false;
	};


};
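The (char *)0x01 sentinel handled at the top of isDelimeter() stands for back-to-back <br> tags. A hedged usage sketch (the loop and the "sections" instance are invented; it assumes Sections::set() already populated the word arrays):

// Illustrative scan, not repo code: find the implied section breaks
// that the 0x01 back-to-back-<br> rule recognizes.
for ( int32_t i = 0 ; i < sections.m_nw ; i++ ) {
	int32_t delimEnd;
	if ( ! sections.isDelimeter ( i , (char *)0x01 , &delimEnd ) )
		continue;
	// words [i, delimEnd) cover "<br>", optional punct, "<br>"
	i = delimEnd - 1;
}
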

// convert sectionType to a string
char *getSectionTypeAsStr ( int32_t sectionType );

// hash of the last 3 parent tagids
//uint32_t getSectionContentTagHash3 ( class Section *sn ) ;

// only allow this many urls per site to add sectiondb info
#define MAX_SITE_VOTERS 32

// . the key in sectiondb is basically the Section::m_tagHash
//   (with a docId) and the data portion of the Rdb record is this SectionVote
// . the Sections::m_nsvt and m_osvt hash tables contain SectionVotes
@ -790,99 +318,4 @@ public:
	float m_numSampled;
};


class SectionVotingTable {
public:

	SectionVotingTable ( ) ;

	//bool set ( Sections *sections , class RdbList *sectiondbList );
	void reset () { m_svt.reset(); }

	bool print ( SafeBuf *sbuf , char *title ) ;

	// stock table from a sectiondb rdblist
	bool addListOfVotes ( RdbList *list,
			      key128_t **lastKey ,
			      int64_t docId ,
			      int32_t niceness ) ;

	// index our sections as flag|tagHash pairs using a termId which
	// is basically our sitehash. this allows us to "vote" on what
	// sections are static, dynamic, "texty" by indexing our votes into
	// datedb.
	bool hash ( int64_t docId ,
		    class HashTableX *dt ,
		    uint64_t siteHash64 ,
		    int32_t niceness ) ;

	bool addVote2 ( int32_t tagHash, int32_t sectionType , float score ) {
		return addVote3 ( tagHash,sectionType,score,1);};

	bool addVote3 ( //class HashTableX *ttt ,
			int32_t tagHash ,
			int32_t sectionType ,
			float score ,
			float numSampled ,
			bool hackFix = false ) ;

	// return -1.0 if no voters!
	float getScore ( Section *sn , int32_t sectionType ) {
		if ( ! sn ) return -1.0;
		return getScore ( sn->m_tagHash , sectionType ); };

	float getScore ( int32_t tagHash , int32_t sectionType ) ;


	float getNumSampled ( Section *sn , int32_t sectionType ) {
		if ( ! sn ) return 0.0;
		return getNumSampled ( sn->m_tagHash , sectionType ); };

	float getNumSampled ( int32_t tagHash , int32_t sectionType ) ;

	int32_t getNumVotes ( ) { return m_svt.getNumSlotsUsed(); };

	bool init ( int32_t numSlots , char *name , int32_t niceness ) {
		return m_svt.set(8,sizeof(SectionVote),numSlots,
				 NULL,0,false,niceness,name); };

	HashTableX m_svt;

	int32_t m_totalSiteVoters;
	//int32_t m_totalSimilarLayouts;
};
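A small usage sketch of the voting flow this class exposes (the tag hash and table size are invented; SV_CLOCK is one of the real section types defined just below):

// Invented values: record that tag hash 0x1234 looked like a clock on
// this page, then ask how strongly the site's pages agree. getScore()
// returns -1.0 when no page voted on that tag hash.
SectionVotingTable svt;
svt.init ( 1024 , "sectvotes" , 0 );
svt.addVote2 ( 0x1234 , SV_CLOCK , 1.0 );
float score = svt.getScore ( 0x1234 , SV_CLOCK );
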


//
// BEGIN SECTION TYPES
//

// . these are the core section types
// . these are not to be confused with the section bit flags below
// . we put these into sectiondb in the form of a SectionVote
// . the SectionVote is the data portion of the rdb record, and the key
//   of the rdb record contains the url site hash and the section m_tagHash
// . in this way, a page can vote on what type of section a tag hash describes
//#define SV_TEXTY 1 // section has mostly non-hypertext words
#define SV_CLOCK 2 // DateParse2.cpp. section contains a clock
#define SV_EURDATEFMT 3 // DateParse2.cpp. contains european date fmt
#define SV_EVENT 4 // used in Events.cpp to indicate event container
#define SV_ADDRESS 5 // used in Events.cpp to indicate address container

// . HACK: the "date" is not the enum tag hash, but is the tagPairHash for this
// . every doc has just one of these describing the entire layout of the page
// . basically looking for these is same as doing a gbtaghash: query
#define SV_TAGPAIRHASH 20
// . HACK: the "date" is not the enum tag hash, but is the contentHash!
// . this allows us to detect a duplicate section even though the layout
//   of the web page is not quite the same, but is from the same site
#define SV_TAGCONTENTHASH 21

// now Dates.cpp sets these too
#define SV_FUTURE_DATE 24
#define SV_PAST_DATE 25
#define SV_CURRENT_DATE 26
#define SV_SITE_VOTER 29
#define SV_TURKTAGHASH 30

#endif
24
Spider.h
@ -697,8 +697,6 @@ class SpiderRequest {
	unsigned m_reserved3n :1;
	unsigned m_reserved3k :1;
	unsigned m_reserved3e :1;
	//unsigned m_matchesUrlCrawlPattern :1;
	//unsigned m_matchesUrlProcessPattern:1;
	unsigned m_reserved3f :1;
	unsigned m_reserved3g :1;
	unsigned m_siteNumInlinksValid :1;
@ -711,30 +709,16 @@ class SpiderRequest {
	// want the url's to have their links spidered. default is to make
	// this 0 and to not avoid spidering the links.
	unsigned m_avoidSpiderLinks:1;
	// for identifying address heavy sites...
	//unsigned m_tagYellowPages:1;
	// when indexing urls for dmoz, i.e. the urls outputted from
	// 'dmozparse urldump -s' we need to index them even if there
	// was an ETCPTIMEDOUT because we have to have indexed the same
	// urls that dmoz has in it in order to be identical to dmoz.
	unsigned m_ignoreExternalErrors:1;

	// called XmlDoc::set4() from PageSubmit.cpp?
	//unsigned m_isPageSubmit:1;

	//
	// INTERNAL USE ONLY
	//

	// are we in the m_orderTree/m_doleTables/m_ipTree
	//unsigned m_inOrderTree:1;
	// are we doled out?
	//unsigned m_doled:1;
	// are we a re-add of a spiderrequest already in spiderdb added
	// from xmldoc.cpp when done spidering so that the spider request
	// gets back in the cache quickly?
	//unsigned m_readd:1;

	// . what url filter num do we match in the url filters table?
	// . determines our spider priority and wait time
	int16_t m_ufn;
@ -772,14 +756,6 @@ class SpiderRequest {

	int32_t getRecSize () { return m_dataSize + 4 + sizeof(key128_t); }

	// how much buf will we need to serialize ourselves?
	//int32_t getRecSize () {
	//	//return m_dataSize + 4 + sizeof(key128_t); }
	//	return (m_url - (char *)this) + gbstrlen(m_url) + 1
	//	// subtract m_key and m_dataSize
	//	       - sizeof(key_t) - 4 ;
	//};

	int32_t getUrlLen() { return m_dataSize -
		// subtract the \0
		((char *)m_url-(char *)&m_firstIp) - 1;};
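As a worked example of the two size helpers above (all numbers invented for illustration):

// Invented example for getRecSize()/getUrlLen(): if m_dataSize is 120
// and m_url begins 96 bytes past m_firstIp, then the NUL-terminated
// url text is 120 - 96 - 1 = 23 bytes long, and the full record takes
// 120 + 4 (the dataSize field) + sizeof(key128_t) = 140 bytes on disk.
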
17
Statsdb.cpp
@ -88,23 +88,6 @@ static Label s_labels[] = {
	// eventually
	{GRAPH_QUANTITY,-1,"docs_indexed", .1,"%.0f docs" , -1, 0x00cc0099,"docs indexed" }


	//{ "termlist_intersect",0x0000ff00},
	//{ "termlist_intersect_soft",0x00008000}, // rat=0
	//{ "transmit_data_nice",0x00aa00aa },
	//{ "transmit_data", 0x00ff00ff },
	//{ "zak_ref_1a", 0x00ccffcc },
	//{ "zak_ref_1b",0x00fffacd },
	//{ "get_summary", 0x0000ff},
	//{ "get_summary_nice", 0x0000b0},
	//{ "get_gigabits",0x00d1e1ff },
	//{ "get_termlists_nice", 0x00aaaa00},
	//{ "get_termlists",0x00ffff00 },
	//{ "get_all_summaries", 0x008220ff},
	//{ "rdb_list_merge",0x0000ffff },
	//{ "titlerec_compress",0x00ffffff },
	//{ "titlerec_uncompress", 0x00ffffff} ,
	//{ "parm_change",0xffc0c0} // pink?
};

void drawLine3 ( SafeBuf &sb ,
478
StopWords.cpp
@ -2014,23 +2014,6 @@ bool isQueryStopWord ( char *s , int32_t len , int64_t h , int32_t langId ) {
	s_queryStopWords2[langEnglish] = s_queryStopWordsEnglish;
	s_queryStopWords2[langGerman ] = s_queryStopWordsGerman;
	// set up the hash table
	// if ( ! s_queryStopWordTable.set ( sizeof(s_queryStopWords) * 2 ) )
	//	return log(LOG_INIT,"query: Could not init query "
	//		   "stop words table.");
	// // now add in all the stop words
	// int32_t n = (int32_t)sizeof(s_queryStopWords)/ sizeof(char *);
	// for ( int32_t i = 0 ; i < n ; i++ ) {
	//	char *sw = s_queryStopWords[i];
	//	int32_t swlen = gbstrlen ( sw );
	//	int64_t swh = hash64Lower ( sw , swlen );
	//	s_queryStopWordTable.addTerm (swh,i+1,i+1,true);
	//	// . add w/o accent marks too!
	//	// . skip "f<>r" though because fur is an eng. word
	//	//if ( *sw=='f' && *(sw+1)=='<27>' &&
	//	//     *(sw+2)=='r' && swlen == 3 ) continue;
	//	//swh = hash64AsciiLower ( sw , swlen );
	//	//s_queryStopWordTable.addTerm (swh,i+1,i+1,true);
	// }
	for ( int32_t i = 0 ; i <= MAXLANGID ; i++ ) {
		HashTableX *ht = &s_queryStopWordTables[i];
		char **words = s_queryStopWords2[i];
@ -3844,9 +3827,6 @@ static char *s_commonWords[] = {
static HashTableX s_commonWordTable;
static bool s_commonWordsInitialized = false;

static HashTableX s_commonQueryWordTable;
static bool s_commonQueryWordsInitialized = false;


// for Process.cpp::resetAll() to call when exiting to free all mem
void resetStopWordTables() {
@ -3854,7 +3834,6 @@ void resetStopWordTables() {
	for ( int i = 0 ; i <= MAXLANGID ; i++ )
		s_queryStopWordTables[i].reset();
	s_commonWordTable.reset();
	s_commonQueryWordTable.reset();
}

// used by Msg24.cpp for gigabits generation
@ -3896,467 +3875,10 @@ int32_t isCommonWord ( int64_t h ) {
	return s_commonWordTable.getScore ( &h );
}

static char *s_verbs[] = {
	"runs",
	"run",
	"go",
	"goes",
	"going"
};

static HashTableX s_verbTable;
static bool s_verbsInitialized = false;

// used by Msg24.cpp for gigabits generation
bool isVerb ( int64_t *hp ) {
	// include a bunch of foreign prepositions so they don't get required
	// by the bitScores in IndexTable.cpp
	if ( ! s_verbsInitialized ) {
		// set up the hash table
		if ( ! s_verbTable.set (8,0,sizeof(s_verbs)*2,
					NULL,0,false,0,"verbs") )
			return log(LOG_INIT,
				   "query: Could not init verbs table.");
		// now add in all the verbs
		int32_t n = (int32_t)sizeof(s_verbs)/ sizeof(char *);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			char *sw = s_verbs[i];
			int32_t swlen = gbstrlen ( sw );
			// use the same algo that Words.cpp computeWordIds does
			int64_t swh = hash64Lower_utf8 ( sw , swlen );
			if ( ! s_verbTable.addKey ( &swh ) ) {
				char *xx=NULL;*xx=0; }
		}
		s_verbsInitialized = true;
	}

	// get from table
	return (bool)s_verbTable.isInTable ( hp );
}
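A usage sketch of the hashing contract (hypothetical caller, not repo code): the table is keyed on the same hash64Lower_utf8() word id that Words.cpp computes, so probes must use it too.

// Hypothetical caller of isVerb(): hash the lowercase utf-8 word the
// same way computeWordIds does, then probe the table.
int64_t h = hash64Lower_utf8 ( "goes" , 4 );
bool isV = isVerb ( &h ); // true: "goes" is in s_verbs[]
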

void resetStopWords ( ) {
	s_stopWordTable.reset();
	for ( int i = 0 ; i <= MAXLANGID ; i++ )
		s_queryStopWordTables[i].reset();
	s_commonWordTable.reset();
	s_verbTable.reset();
	s_commonQueryWordTable.reset();
}


static char *s_commonQueryWords[] = {
	"to",
	"and",
	"ands",
	"anding",
	"anded",
	"be", // "be fine" for fatboyshouseofbbq.com matching queries
	"thereof",
	"of",
	"the",
	"this",
	"between",
	"onto",
	"too",

	"every",
	"always",
	"more", // fix "more more" bringing up whitehouse.gov

	"of",
	"the",
	"this",
	"one",
	"two",
	"three",
	"four",

	"01",
	"02",
	"03",
	"04",
	"05",
	"06",
	"07",
	"08",
	"09",

	"1",
	"2",
	"3",
	"4",
	"5",
	"6",
	"7",
	"8",
	"9",
	"10",
	"11",
	"12",
	"13",
	"14",
	"15",
	"16",
	"17",
	"18",
	"19",
	"20",
	"21",
	"22",
	"23",
	"24",
	"25",
	"26",
	"27",
	"28",
	"29",
	"30",
	"31",

	"i","ii","iii","iv","vi","vii","viii","ix","x","xi",
	"xii","xiii","xiv","xv","xvi","xvii","xviii","xix",
	"xx","xxi","xxii","xxiii","xxiv","xxv","xxvi","xxvii",
	"xxviii","xxix","xxx","xxxi",

	"january",
	"february",
	"march",
	"april",
	"may",
	"june",
	"july",
	"august",
	"september",
	"october",
	"november",
	"december",

	"jan",
	"feb",
	"mar",
	"apr",
	"may",
	"jun",
	"jul",
	"aug",
	"sep",
	"oct",
	"nov",
	"dec",

	"2010",
	"2011",
	"2012",
	"2013",
	"2014",
	"2015",

	"a",

	"over", // fix 'over site' for www.espn.com competitor pages

	"am", // 'am so' for voyageofattraction.com
	"be",
	"being",
	"been",
	"so",
	"soh",

	"moar",
	"more",
	"most",
	"than",
	"much",

	"los", // fix 'los dos y com' for www.espn.com competitor pages
	"dos",

	"view", // fix for jezebelgallery.com 'view homepage'
	"viewed",
	"views",
	"viewing",
	"homepage",
	"homepages",
	"webpage",
	"webpages",
	"home",
	"homed",
	"homing",
	"wit", // wtf?
	"homes",
	"house",
	"houses",
	"housed",
	"housing",
	"page",
	// fix getting 'green web' and 'green pages' for gigablast.com
	// as two independent queries for a competitor
	"pages",
	// damn, paged is a synonym of pages
	"paged",
	"paging",
	"info",
	"infos",
	"informative",
	"information", // 'the information' for wcnews.com
	"site",
	"sites",
	"sited",
	"siting",

	"is", // fix 'is website'

	"welcome", // whitehouse.gov fix
	"online",

	"am", // 'am web' query

	"y", // spanish for "and"

	"at",
	"be",
	"by",
	"on",
	"or",
	"do",
	"doesn't",
	"in",
	"into",

	"i",
	"an",
	"or",
	"as",
	"at",
	"by",
	"for",
	"with",
	"about",
	"from",

	"any", // stop 'any web' for diffbot.com

	// german is messing us up so that two queries that should
	// be basically the same "dos code" and "codes" are not! they
	// should have the same synbasehash64! fix for cheatcc.com
	// competitor pages from getting legal sites.
	// because it matches "dos codes"
	"dos",
	"de",
	"die",
	"del",
	"via",
	"e",

	// spanish. messing up ibm.com competitor pages.
	// because it matches "es international"
	"es",

	// fix newser.com 'more of you' 'know you' 'know more'
	"you", // "where do you" "you but" "but you"
	"your",
	"what",
	"wat",
	"where", // "and where you"
	"who",
	"when",
	"what's",
	"where's",
	"who's", // 'who's who' for www.fudwatch.co.uk
	"when's",
	"which",
	"wich",
	"but", // "and but"

	"ver", // fix ver ver related query everyone matches for some reason
	"click", // click here is so popular!
	"clicked",
	"clicks",
	"clicking",
	"klick",
	"klicked",
	"klicks",
	"klicking",
	"here",
	"per",

	"a",
	"b",
	"c",
	"d",
	"e",
	"f",
	"g",
	"h",
	"i",
	"j",
	"k",
	"l",
	"m",
	"n",
	"o",
	"p",
	"q",
	"r",
	"s",
	"t",
	"u",
	"v",
	"w",
	"x",
	"y",
	"z",

	"innen", // wtf is this?


	// fix matching queries for yahoo.com:
	"inc",
	"go",
	"goes",
	"going",
	"gone",
	"went",
	"link",
	"links",
	"linked",
	"hyperlinking",
	"hyperlink",
	"hyperlinks",
	"hyperlinked",
	"hyperlinking",
	"exit",
	"ing", // wtf?
	"ed", // wtf?
	"om",
	"por",

	"their",
	"theirs",
	"doh", // syn of do!
	"do",
	"don't",
	"doesn't",
	"did",
	"does",
	"done",
	"do's",
	"doing",
	"hame", // wtf?
	"were",
	"was",
	"can",
	"cans",
	"canning",
	"canned",
	"are",
	"if",
	"his",
	"hers",
	"him",
	"her",
	"fand", // wtf?
	"s's",
	"a's",
	"he",
	"she",
	"that",
	"en", // spanish?
	"le", // french?
	"will",
	"willy",


	"www",
	"w3", // synonym for www
	"com",
	"coms", // synonym for com
	"org",
	"orgs",
	"net", // .net
	"nets",
	"edu",
	"gov",

	"no", // fix 'no no' missing term for army-list.com
	"my", // fix 'my' missing term for army-list.com

	//"no", // 'no http' seems common. because we were ignoring "no"
	// because it was a query stop word in portuguese!!

	"it", // this hurts I.T. i guess...

	"http",
	"https",
	"web",
	"webs",
	"below",
	"site",
	"website",
	"sites",
	"websites",

	// until we fix it right! this shows up so much
	"lincoln",
	"lincolns"
};

// . used by Msg24.cpp for gigabits generation
// . h is the full wordid, not 48-bit termid
// . you can now pass in a 32-bit word hash instead of 64 and it should
//   still work!!!
int32_t isCommonQueryWordInEnglish ( int64_t h64 ) {

	// include a bunch of foreign prepositions so they don't get required
	// by the bitScores in IndexTable.cpp
	if ( ! s_commonQueryWordsInitialized ) {
		// set up the hash table
		int32_t ss = sizeof(s_commonQueryWords);
		if ( ! s_commonQueryWordTable.set (8,4,ss*2,
						   NULL,0,false,0,
						   "commonwrds") )
			return log(LOG_INIT,
				   "query: Could not init common words "
				   "table.");
		// now add in all the common query words
		int32_t n = (int32_t)sizeof(s_commonQueryWords)/ sizeof(char *);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			char *sw = s_commonQueryWords[i];
			int32_t swlen = gbstrlen ( sw );
			// use the same algo that Words.cpp computeWordIds does
			int64_t swh64 = hash64Lower_utf8 ( sw , swlen );
			if ( ! s_commonQueryWordTable.addTerm ( &swh64,i+1 ) )
				return false;
			// if you pass in a 32-bit "h64" from hash32n()
			// you must make sure it is UNSIGNED so the top
			// 32 bits of the h64 are not set to 0xffffffff
			// two's complement
			swh64 &= 0x00000000ffffffffLL;
			if ( ! s_commonQueryWordTable.addTerm ( &swh64,i+1 ) )
				return false;
			swh64 |= 0xffffffff00000000LL;
			if ( ! s_commonQueryWordTable.addTerm ( &swh64,i+1 ) )
				return false;
			// . add w/o accent marks too!
			// . skip "f<>r" though because fur is an eng. word
			//if ( *sw=='f' && *(sw+1)=='<27>' &&
			//     *(sw+2)=='r' && swlen == 3 ) continue;
			//swh = hash64AsciiLower ( sw , swlen );
			//s_commonQueryWordTable.addTerm (swh,i+1,i+1,true);
		}
		s_commonQueryWordsInitialized = true;
		// sanity test
		int32_t tid32 = hash32n("on");
		if ( !isCommonQueryWordInEnglish(tid32)){char *xx=NULL;*xx=0;}
		tid32 = hash32n("web");
		if ( !isCommonQueryWordInEnglish(tid32)){char *xx=NULL;*xx=0;}
	}

	// . all 1 char letter words are stop words
	// . good for initials and some contractions
	//if ( len == 1 && is_alpha_a(*s) ) return true;

	// get from table
	return (int32_t)s_commonQueryWordTable.getScore ( &h64 );
}

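The three addTerm() calls in the loop above exist so that a 32-bit hash32n() id resolves however it was widened to 64 bits. A hedged sketch (the probe word is invented) mirroring the function's own sanity test:

// hash32n() ids may arrive sign-extended (via int32_t, as in the
// sanity test) or zero-extended; both widenings were added to the
// table, so both probes hit.
int32_t h32 = hash32n ( "web" );
int64_t signExt = (int64_t)h32;                   // top bits may be 1s
int64_t zeroExt = signExt & 0x00000000ffffffffLL; // top bits zeroed
bool a = isCommonQueryWordInEnglish ( signExt ) != 0;
bool b = isCommonQueryWordInEnglish ( zeroExt ) != 0;
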
@ -13,27 +13,18 @@ bool isStopWord ( char *s , int32_t len , int64_t h ) ;
// used by Synonyms.cpp
bool isStopWord2 ( int64_t *h ) ;

//just a stub for now
//bool isStopWord ( UChar *s , int32_t len , int64_t h );


// . damn i forgot to include these above
// . i need these so m_bitScores in IndexTable.cpp doesn't have to require
//   them! Otherwise, it's like all queries have quotes around them again...
bool isQueryStopWord ( char *s , int32_t len , int64_t h , int32_t langId ) ;
//bool isQueryStopWord ( UChar *s , int32_t len , int64_t h ) ;

// is it a COMMON word?
int32_t isCommonWord ( int64_t h ) ;

int32_t isCommonQueryWordInEnglish ( int64_t h ) ;

bool initWordTable(class HashTableX *table, char* words[],
		   //int32_t size ,
		   char *label);

bool isVerb ( int64_t *hp ) ;

// for Process.cpp::resetAll() to call when exiting to free all mem
void resetStopWordTables();

@ -661,7 +661,6 @@ int64_t Summary::getBestWindow ( Matches *matches, int32_t mm, int32_t *lasta,
	}

	// . we NULLify the section ptrs if we already used the word in another summary.
	// . google seems to index SEC_MARQUEE, so i took that out of here
	int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;
	if ( (bb[matchWordNum] & D_USED) || ( sp && (sp[matchWordNum]->m_flags & badFlags) ) ) {
		// assume no best window
@ -1059,7 +1058,6 @@ bool Summary::getDefaultSummary ( Xml *xml, Words *words, Sections *sections, Po
	int32_t bestEnd = -1;
	int32_t longestConsecutive = 0;
	int32_t lastAlnum = -1;
	// google seems to index SEC_MARQUEE, so i took that out of here
	int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;
	// shortcut
	nodeid_t *tids = words->m_tagIds;
6
Title.h
@ -26,12 +26,10 @@ public:

	// . set m_title to the title of the document represented by "xd"
	// . if getHardTitle is true will always use the title in the <title>
	//   tag, but if that is not present, will try dmoz titles before
	//   resorting to trying to guess a title from the document content
	//   or incoming link text.
	//   tag, but if that is not present, will resort to trying to guess
	//   a title from the document content or incoming link text.
	// . uses the following:
	//   . title tag
	//   . dmoz title
	//   . meta title tag
	//   . incoming link text
	//   . <hX> tags at the top of the scored content
88
Words.cpp
@ -1,7 +1,6 @@
#include "gb-include.h"

#include "Words.h"
#include "Phrases.h" // for isInPhrase() for hashWordIffNotInPhrase
#include "Unicode.h" // getUtf8CharSize()
#include "StopWords.h"
#include "Speller.h"
@ -108,7 +107,9 @@ int32_t countWords ( char *p ) {
bool Words::set( Xml *xml, bool computeWordIds, int32_t niceness, int32_t node1, int32_t node2 ) {
	// prevent setting with the same string
	if ( m_xml == xml ) { char *xx=NULL;*xx=0; }

	reset();

	m_xml = xml;

	// if xml is empty, bail
@ -171,12 +172,6 @@ bool Words::set( Xml *xml, bool computeWordIds, int32_t niceness, int32_t node1,
			m_tagIds[m_numWords] |= BACKBIT;
		}

		//log(LOG_DEBUG, "Words: Word %"INT32": got tag %s%s (%d)",
		//    m_numWords,
		//    isBackTag(m_numWords)?"/":"",
		//    g_nodes[getTagId(m_numWords)].m_nodeName,
		//    getTagId(m_numWords));

		m_numWords++;

		// used by XmlDoc.cpp
@ -188,41 +183,6 @@ bool Words::set( Xml *xml, bool computeWordIds, int32_t niceness, int32_t node1,
	return true;
}

bool Words::set11 ( char *s , char *send , int32_t niceness ) {
	reset();

	// this will make addWords() scan for tags
	m_hasTags = true;

	// save it
	char saved = *send;

	// null term
	*send = '\0';

	// determine rough upper bound on number of words by counting
	// punct/alnum boundaries
	m_preCount = countWords ( s );

	// true = tagIds
	bool status = allocateWordBuffers(m_preCount,true);

	// deal with error now
	if ( !status ) {
		*send = saved;
		return false;
	}

	// and set the words
	status = addWords(s,0x7fffffff, true, niceness );

	// bring it back
	*send = saved;

	// return error?
	return status;
}
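set11() leans on the temporary-NUL idiom: briefly terminate the span so C-string scanners can run over it, then restore the saved byte. A reduced standalone sketch of the same pattern (the helper name is invented; countWords() is the real function used above):

// Same save/restore idiom as set11(): NUL-terminate the substring,
// run the scanner, then put the original byte back.
static int32_t countWordsInSpan ( char *s , char *send ) {
	char saved = *send;
	*send = '\0';
	int32_t n = countWords ( s ); // expects a NUL-terminated buffer
	*send = saved;
	return n;
}
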

// . set words from a string
// . assume no HTML entities in the string "s"
// . s must be NULL terminated
@ -249,10 +209,7 @@ bool Words::set( char *s, bool computeWordIds, int32_t niceness ) {
bool Words::addWords( char *s, int32_t nodeLen, bool computeWordIds, int32_t niceness ) {
	int32_t i = 0;
	int32_t j;
	//int32_t k = 0;
	int32_t wlen;
	//uint32_t e;
	//int32_t skip;
	int32_t badCount = 0;

	bool hadApostrophe = false;
@ -453,21 +410,11 @@ bool Words::addWords( char *s, int32_t nodeLen, bool computeWordIds, int32_t nic
		m_words [ m_numWords ] = &s[j];
		m_wordLens[ m_numWords ] = wlen;

		// . Lars says it's better to leave the accented chars intact
		// . google agrees
		// . but what about "re'sume"?
		if ( computeWordIds ) {
			int64_t h = hash64Lower_utf8(&s[j],wlen);
			m_wordIds [m_numWords] = h;

			// until we get an accent removal algo, comment this
			// out and possibly use the query synonym pipeline
			// to search without accents. MDW
			//int64_t h2 = hash64AsciiLowerE(&s[j],wlen);
			//if ( h2 != h ) m_stripWordIds [m_numWords] = h2;
			//else           m_stripWordIds [m_numWords] = 0LL;
			//m_stripWordIds[m_numWords] = 0;
		}

		m_nodes[m_numWords] = 0;
		if (m_tagIds) m_tagIds[m_numWords] = 0;
		m_numWords++;
@ -658,7 +605,6 @@ int32_t Words::getLanguage( Sections *sections ,
		return -1;

	// . avoid words in these bad sections
	// . google seems to index SEC_MARQUEE so i took that out of badFlags
	int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT;
	// shortcuts
	int64_t *wids = m_wordIds;
@ -798,34 +744,6 @@ int32_t Words::getLanguage( Sections *sections ,
	return l;
}

// get the word index at the given character position
int32_t Words::getWordAt ( char *p ) { // int32_t charPos ) {
	if ( ! p ) { char *xx=NULL;*xx=0; }
	if ( p < m_words[0] ) { char *xx=NULL;*xx=0; }
	if ( p >= getContentEnd() ) { char *xx=NULL;*xx=0; }

	int32_t step = m_numWords / 2;
	int32_t i    = m_numWords / 2 ;

	for (;;) {
		// divide it by 2 each time
		step >>= 1;
		// always at least one
		if ( step <= 0 )
			step = 1;
		// is it a hit?
		if ( p >= m_words[i] && p < m_words[i] + m_wordLens[i] )
			return i;
		// compare
		if ( m_words[i] < p )
			i += step;
		else
			i -= step;
	}
	return -1;
}
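getWordAt() is a binary search over the word start pointers. A hedged usage sketch (the "words" instance and offset are invented; it assumes the document buffer is at least that long, since the sanity checks intentionally fault on out-of-range pointers):

// Hypothetical lookup: which word covers the 100th byte of the
// document this Words object was set() on?
char *p = words.m_words[0] + 100;
int32_t n = words.getWordAt ( p );
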


// . return the value of the specified "field" within this html tag, "s"
// . the case of "field" does not matter
char *getFieldValue ( char *s ,
21
Words.h
@ -56,8 +56,6 @@ class Words {
	// . html tags are NOT parsed out
	bool set( char *s, bool computeIds, int32_t niceness );

	bool set11 ( char *s , char *send , int32_t niceness ) ;

	// . similar to above
	// . but we temporarily stick a \0 @ s[slen] for parsing purposes
	bool set( char *s, int32_t slen, bool computeIds, int32_t niceness = 0 );
@ -112,8 +110,6 @@ class Words {
		return size;
	}

	int32_t getWordAt ( char *charPos );

	// . CAUTION: don't call this for punct "words"... it's bogus for them
	// . this is only for alnum "words"
	int64_t getWordId( int32_t n ) const {
@ -121,16 +117,11 @@ class Words {
	}

	bool isStopWord ( int32_t n ) {
		return ::isStopWord(m_words [n],
				    m_wordLens[n],
				    m_wordIds [n]);
		return ::isStopWord( m_words[n], m_wordLens[n], m_wordIds[n] );
	}

	bool isQueryStopWord ( int32_t n , int32_t langId ) {
		return ::isQueryStopWord(m_words [n],
					 m_wordLens[n],
					 m_wordIds [n],
					 langId);
		return ::isQueryStopWord( m_words[n], m_wordLens[n], m_wordIds[n], langId );
	}


@ -180,13 +171,7 @@ class Words {
		return false;
	}

	bool isSpaces ( int32_t n ) {
		for ( int32_t i = 0 ; i < m_wordLens[n] ; i++ )
			if ( ! is_wspace_utf8(&m_words[n][i]) ) return false;
		return true;
	}

	bool isSpaces2 ( int32_t n , int32_t starti ) {
	bool isSpaces ( int32_t n , int32_t starti = 0 ) {
		for ( int32_t i = starti ; i < m_wordLens[n] ; i++ )
			if ( ! is_wspace_utf8(&m_words[n][i]) ) return false;
		return true;
4034
XmlDoc.cpp
File diff suppressed because it is too large
355
XmlDoc.h
@ -18,38 +18,30 @@
#ifndef _XMLDOC_H_
#define _XMLDOC_H_

//#include "HashTableX.h"
#include "Lang.h"
#include "Words.h"
#include "Bits.h"
#include "Pos.h"
#include "Phrases.h"
//#include "Synonyms.h"
//#include "Weights.h"
#include "Xml.h"
#include "SafeBuf.h"
#include "Images.h"
#include "Sections.h"
#include "Msge0.h"
#include "Msge1.h"
//#include "Msge2.h"
#include "Msg4.h"

#include "SearchInput.h"
#include "Msg40.h"
//#include "IndexList.h"
#include "Msg0.h"
#include "Msg22.h"
#include "Tagdb.h"
#include "Url.h"
#include "Linkdb.h"
//#include "LinkInfo.h"
//#include "Msg25.h"
#include "MsgC.h"
#include "Msg13.h"
#include "RdbList.h"
#include "SiteGetter.h"
//#include "CollectionRec.h"
#include "Msg20.h"
#include "Matches.h"
#include "Query.h"
@ -62,24 +54,15 @@
#include "PingServer.h"
#include "Json.h"

//#define XMLDOC_MAX_AD_IDS 4
//#define XMLDOC_ADLEN 64

#define MAXFRAGWORDS 80000

#define MAX_WIKI_DOCIDS 20

#define MAX_TAG_PAIR_HASHES 100

#include "Msg40.h"
//#define SAMPLE_VECTOR_SIZE (32*4)

#define POST_VECTOR_SIZE (32*4)

#define XD_GQ_MAX_SIZE 1000
#define XD_MAX_GIGABIT_HASHES 48

#define XD_MAX_AD_IDS 5

#define MAX_LINK_TEXT_LEN 512
#define MAX_SURROUNDING_TEXT_WIDTH 600
@ -280,11 +263,11 @@ public:
	char *ptr_firstUrl;
	char *ptr_redirUrl;
	char *ptr_rootTitleBuf;
	int32_t *ptr_gigabitHashes;
	int32_t *ptr_gigabitScores;
	int32_t *ptr_unused12;
	int32_t *ptr_unused13;
	void *ptr_unused8;
	int64_t *ptr_wikiDocIds;
	rscore_t *ptr_wikiScores;
	int64_t *ptr_unused10;
	rscore_t *ptr_unused11;
	char *ptr_imageData;
	int32_t *ptr_unused6;
	int32_t *ptr_unused7;
@ -305,11 +288,11 @@ public:
	int32_t size_firstUrl;
	int32_t size_redirUrl;
	int32_t size_rootTitleBuf;
	int32_t size_gigabitHashes;
	int32_t size_gigabitScores;
	int32_t size_unused12;
	int32_t size_unused13;
	int32_t size_unused8;
	int32_t size_wikiDocIds;
	int32_t size_wikiScores;
	int32_t size_unused10;
	int32_t size_unused11;
	int32_t size_imageData;
	int32_t size_unused6;
	int32_t size_unused7;
@ -404,9 +387,6 @@ public:
	SafeBuf m_spiderStatusDocMetaList;
	char *getIsAdult ( ) ;

	int64_t **getWikiDocIds ( ) ;
	void gotWikiResults ( class UdpSlot *slot );
	//class HashTableX *getClockCandidatesTable();
	int32_t getOutlinkAge ( int32_t outlinkNum ) ;
	char *getIsPermalink ( ) ;
	char *getIsUrlPermalinkFormat ( ) ;
@ -421,19 +401,7 @@ public:
	class Bits *getBitsForSummary ( ) ;
	class Pos *getPos ( );
	class Phrases *getPhrases ( ) ;
	//class Synonyms *getSynonyms ( );
	class Sections *getExplicitSections ( ) ;
	class Sections *getImpliedSections ( ) ;
	class Sections *getSections ( ) ;
	class Sections *getSectionsWithDupStats ( );
	//BR 20160106 removed: class SafeBuf *getInlineSectionVotingBuf();
	bool gotSectionFacets( class Multicast *mcast );
	class SectionStats *getSectionStats ( uint32_t secHash32, uint32_t sentHash32, bool cacheOnly );
	class SectionVotingTable *getOldSectionVotingTable();
	class SectionVotingTable *getNewSectionVotingTable();
	char **getSectionsReply ( ) ;
	char **getSectionsVotes ( ) ;
	HashTableX *getSectionVotingTable();
	int32_t *getLinkSiteHashes ( );
	class Links *getLinks ( bool doQuickSet = false ) ;
	class HashTableX *getCountTable ( ) ;
@ -442,36 +410,21 @@ public:
	int32_t *getSummaryVector ( ) ;
	int32_t *getPageSampleVector ( ) ;
	int32_t *getPostLinkTextVector ( int32_t linkNode ) ;
	int32_t computeVector ( class Sections *sections, class Words *words,
				uint32_t *vec , int32_t start = 0 , int32_t end = -1 );
	int32_t computeVector ( class Words *words, uint32_t *vec , int32_t start = 0 , int32_t end = -1 );
	float *getTagSimilarity ( class XmlDoc *xd2 ) ;
	float *getGigabitSimilarity ( class XmlDoc *xd2 ) ;
	float *getPageSimilarity ( class XmlDoc *xd2 ) ;
	float *getPercentChanged ( );
	uint64_t *getFuzzyDupHash ( );
	int64_t *getExactContentHash64();
	int64_t *getLooseContentHash64();
	class RdbList *getDupList ( ) ;
	class RdbList *getLikedbListForReq ( );
	class RdbList *getLikedbListForIndexing ( );
	char *getIsDup ( ) ;
	char *isDupOfUs ( int64_t d ) ;
	uint32_t *getGigabitVectorScorelessHash ( ) ;
	int32_t **getGigabitHashes ( );
	char *getGigabitQuery ( ) ;
	char *getMetaDescription( int32_t *mdlen ) ;
	char *getMetaSummary ( int32_t *mslen ) ;
	char *getMetaKeywords( int32_t *mklen ) ;
	char *getMetaGeoPlacename( int32_t *mgplen );

	bool addGigabits ( char *s , int64_t docId , uint8_t langId ) ;
	bool addGigabits2 ( char *s,int32_t slen,int64_t docId,uint8_t langId);
	bool addGigabits ( class Words *ww ,
			   int64_t docId,
			   class Sections *sections,
			   //class Weights *we ,
			   uint8_t langId );

	int32_t *getSiteSpiderQuota ( ) ;
	class Url *getCurrentUrl ( ) ;
	class Url *getFirstUrl() ;
@ -626,10 +579,6 @@ public:

	char *addOutlinkSpiderRecsToMetaList ( );

	//bool addTable96 ( class HashTableX *tt1 ,
	//		  int32_t date1 ,
	//		  bool nosplit ) ;

	int32_t getSiteRank ();
	bool addTable144 ( class HashTableX *tt1 ,
			   int64_t docId ,
@ -637,11 +586,6 @@ public:

	bool addTable224 ( HashTableX *tt1 ) ;

	//bool addTableDate ( class HashTableX *tt1 , //T<key128_t,char> *tt1
	//		    uint64_t docId ,
	//		    uint8_t rdbId ,
	//		    bool nosplit ) ;

	bool addTable128 ( class HashTableX *tt1 , // T <key128_t,char>*tt1
			   uint8_t rdbId ,
			   bool forDelete ) ;
@ -662,10 +606,7 @@ public:
	bool hashUrl ( class HashTableX *table, bool urlOnly );
	bool hashDateNumbers ( class HashTableX *tt );
	bool hashSections ( class HashTableX *table ) ;
	bool hashIncomingLinkText ( class HashTableX *table ,
				    bool hashAnomalies ,
				    bool hashNonAnomalies ) ;

	bool hashIncomingLinkText( class HashTableX *table, bool hashAnomalies, bool hashNonAnomalies );
	bool hashLinksForLinkdb ( class HashTableX *table ) ;
	bool hashNeighborhoods ( class HashTableX *table ) ;
	bool hashRSSInfo ( class HashTableX *table ) ;
@ -683,11 +624,8 @@ public:
	bool hashTagRec ( class HashTableX *table ) ;
	bool hashPermalink ( class HashTableX *table ) ;
	bool hashVectors(class HashTableX *table ) ;
	// BR 20160106 removed: bool hashAds(class HashTableX *table ) ;

	class Url *getBaseUrl ( ) ;
	// BR 20160106 removed: bool hashSubmitUrls ( class HashTableX *table ) ;
	// BR 20160106 removed: bool hashImageStuff ( class HashTableX *table ) ;
	bool hashIsAdult ( class HashTableX *table ) ;

	void set20 ( Msg20Request *req ) ;
@ -700,8 +638,6 @@ public:
	class Title *getTitle ();
	class Summary *getSummary () ;
	char *getHighlightedSummary ();
	SafeBuf *getSampleForGigabits ( ) ;
	SafeBuf *getSampleForGigabitsJSON ( ) ;
	char *getIsNoArchive ( ) ;
	int32_t *getUrlFilterNum();
	char *getIsLinkSpam ( ) ;
@ -709,64 +645,21 @@ public:
	char *getIsErrorPage ( ) ;
	char* matchErrorMsg(char* p, char* pend );

	bool hashWords ( //int32_t wordStart ,
			 //int32_t wordEnd ,
			 class HashInfo *hi ) ;
	bool hashSingleTerm ( int64_t termId ,
			      class HashInfo *hi ) ;
	bool hashSingleTerm ( char *s ,
			      int32_t slen ,
			      class HashInfo *hi );
	bool hashString ( class HashTableX *ht ,
			  //class Weights *we ,
			  class Bits *bits ,
			  char *s ,
			  int32_t slen ) ;
	bool hashString ( char *s ,
			  int32_t slen ,
			  class HashInfo *hi ) ;
bool hashString ( char *s ,
|
||||
class HashInfo *hi ) ;
|
||||
bool hashWords( class HashInfo *hi );
|
||||
bool hashSingleTerm( int64_t termId, class HashInfo *hi );
|
||||
bool hashSingleTerm( char *s, int32_t slen, class HashInfo *hi );
|
||||
bool hashString( class HashTableX *ht, class Bits *bits, char *s, int32_t slen );
|
||||
bool hashString( char *s, int32_t slen, class HashInfo *hi );
|
||||
bool hashString( char *s, class HashInfo *hi );
|
||||
|
||||
bool hashWords3( class HashInfo *hi, class Words *words, class Phrases *phrases, class Synonyms *synonyms,
|
||||
class Sections *sections, class HashTableX *countTable, char *fragVec, char *wordSpamVec,
|
||||
char *langVec, char docLangId, class SafeBuf *pbuf, class HashTableX *wts,
|
||||
class SafeBuf *wbuf, int32_t niceness );
|
||||
|
||||
|
||||
bool hashWords3 ( //int32_t wordStart ,
|
||||
//int32_t wordEnd ,
|
||||
class HashInfo *hi ,
|
||||
class Words *words ,
|
||||
class Phrases *phrases ,
|
||||
class Synonyms *synonyms ,
|
||||
class Sections *sections ,
|
||||
class HashTableX *countTable ,
|
||||
char *fragVec ,
|
||||
char *wordSpamVec ,
|
||||
char *langVec ,
|
||||
char docLangId , // default lang id
|
||||
class SafeBuf *pbuf ,
|
||||
class HashTableX *wts ,
|
||||
class SafeBuf *wbuf ,
|
||||
int32_t niceness );
|
||||
|
||||
bool hashString3 ( char *s ,
|
||||
int32_t slen ,
|
||||
class HashInfo *hi ,
|
||||
class HashTableX *countTable ,
|
||||
class SafeBuf *pbuf ,
|
||||
class HashTableX *wts ,
|
||||
class SafeBuf *wbuf ,
|
||||
int32_t version ,
|
||||
int32_t siteNumInlinks ,
|
||||
int32_t niceness );
|
||||
|
||||
|
||||
//bool hashSectionTerm ( char *term ,
|
||||
// class HashInfo *hi ,
|
||||
// int32_t sentHash32 ) ;
|
||||
|
||||
bool hashFacet1 ( char *term, class Words *words , HashTableX *dt) ;
|
||||
|
||||
bool hashFacet2 ( char *prefix,char *term,int32_t val32, HashTableX *dt,
|
||||
bool shardByTermId = false ) ;
|
||||
bool hashString3( char *s, int32_t slen, class HashInfo *hi, class HashTableX *countTable,
|
||||
class SafeBuf *pbuf, class HashTableX *wts, class SafeBuf *wbuf, int32_t version,
|
||||
int32_t siteNumInlinks, int32_t niceness );
|
||||
|
||||
// gbfieldmatch:
|
||||
bool hashFieldMatchTerm ( char *val, int32_t vlen, class HashInfo *hi);
|
||||
@ -788,8 +681,6 @@ public:
|
||||
FacetValHash_t fvh ) ;
|
||||
bool storeFacetValuesSite ( char *qs , SafeBuf *sb ,
|
||||
FacetValHash_t fvh );
|
||||
bool storeFacetValuesSections ( char *qs , class SafeBuf *sb ,
|
||||
FacetValHash_t fvh ) ;
|
||||
bool storeFacetValuesHtml ( char *qs , class SafeBuf *sb ,
|
||||
FacetValHash_t fvh ) ;
|
||||
bool storeFacetValuesXml ( char *qs , class SafeBuf *sb ,
|
||||
@ -819,16 +710,12 @@ public:
|
||||
public:
|
||||
|
||||
// stuff set from the key of the titleRec, above the compression area
|
||||
//key_t m_key;
|
||||
int64_t m_docId;
|
||||
|
||||
char *m_ubuf;
|
||||
int32_t m_ubufSize;
|
||||
int32_t m_ubufAlloc;
|
||||
|
||||
// does this page link to gigablast, or has a search form to it?
|
||||
//bool searchboxToGigablast();
|
||||
|
||||
// private:
|
||||
|
||||
// we we started spidering it, in milliseconds since the epoch
|
||||
@ -843,16 +730,6 @@ public:
|
||||
int64_t m_setTime;
|
||||
int64_t m_cpuSummaryStartTime;
|
||||
|
||||
// timers
|
||||
int64_t m_beginSEOTime;
|
||||
int64_t m_beginTimeAllMatch;
|
||||
int64_t m_beginTimeMatchUrl;
|
||||
int64_t m_beginTimeFullQueries;
|
||||
int64_t m_beginTimeLinks;
|
||||
//int64_t m_beginMsg98s;
|
||||
int64_t m_beginRelatedQueries;
|
||||
int64_t m_beginMsg95s;
|
||||
|
||||
// . these should all be set using set*() function calls so their
|
||||
// individual validity flags can bet set to true, and successive
|
||||
// calls to their corresponding get*() functions will not core
|
||||
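The comment above describes the class's central caching discipline: every computed member has a get*() accessor plus a parallel m_*Valid flag (declared in the blocks below) recording whether the cached member can be trusted. A minimal sketch of the accessor pattern, using the m_hopCountValid flag declared below; the member type and the computeHopCount() helper are hypothetical stand-ins, not the real signatures:

int32_t *XmlDoc::getHopCount ( ) {
	// cached and valid? return the member directly
	if ( m_hopCountValid ) return &m_hopCount;
	// otherwise compute it. by convention getters return NULL on
	// error (with g_errno set) or (void *)-1 if they must block.
	m_hopCount      = computeHopCount ( ); // hypothetical helper
	m_hopCountValid = true;
	return &m_hopCount;
}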
@ -873,8 +750,6 @@ public:
int64_t m_firstUrlHash64;
Url m_currentUrl;

//char *m_coll;
//char m_collBuf[MAX_COLL_LEN+1]; // include \0
CollectionRec *m_lastcr;
collnum_t m_collnum;
int32_t m_lastCollRecResetCount;
@ -908,91 +783,24 @@ public:
Bits m_bits2;
Pos m_pos;
Phrases m_phrases;
//Synonyms m_synonyms;
SafeBuf m_synBuf;
//Weights m_weights;
Sections m_sections;

// a hack storage thing used by Msg13.cpp
class Msg13Request *m_hsr;

Section *m_si;
//Section *m_nextSection;
//Section *m_lastSection;
int32_t m_mcastRequestsOut;
int32_t m_mcastRequestsIn;
int32_t m_secStatsErrno;
char *m_queryBuf;
Msg39Request *m_msg39RequestArray;
SafeBuf m_mcastBuf;
Multicast *m_mcastArray;
//char *m_inUse;
//Query *m_queryArray;
//Query *m_sharedQuery;
bool m_gotDupStats;
//Query m_q4;
//Msg3a m_msg3a;
//Msg39Request m_r39;
Msg39Request m_mr2;
SectionStats m_sectionStats;
HashTableX m_sectionStatsTable;
//char m_sectionHashQueryBuf[128];

// also set in getSections()
int32_t m_maxVotesForDup;

// . for rebuild logging of what's changed
// . Repair.cpp sets these based on titlerec
char m_logLangId;
int32_t m_logSiteNumInlinks;

SectionVotingTable m_nsvt;

SectionVotingTable m_osvt;
int32_t m_numSectiondbReads;
int32_t m_numSectiondbNeeds;
key128_t m_sectiondbStartKey;
RdbList m_secdbList;
int32_t m_sectiondbRecall;

bool m_gotFacets;
SafeBuf m_tmpBuf2;

SafeBuf m_inlineSectionVotingBuf;

//HashTableX m_rvt;
//Msg17 m_msg17;
//char *m_cachedRootVoteRec;
//int32_t m_cachedRootVoteRecSize;
//bool m_triedVoteCache;
//bool m_storedVoteCache;
//SafeBuf m_cacheRecBuf;

SafeBuf m_timeAxisUrl;

HashTableX m_turkVotingTable;
HashTableX m_turkBitsTable;
uint32_t m_confirmedTitleContentHash ;
uint32_t m_confirmedTitleTagHash ;

// turk voting tag rec
TagRec m_vtr;
// tagrec of banned turks
TagRec m_bannedTurkRec;
// and the table of the hashed banned turk users
HashTableX m_turkBanTable;

// used for displaying turk votes...
HashTableX m_vctab;
HashTableX m_vcduptab;

Images m_images;
HashTableX m_countTable;
HttpMime m_mime;
TagRec m_tagRec;
SafeBuf m_tagRecBuf;
// copy of m_oldTagRec but with our modifications, if any
//TagRec m_newTagRec;
SafeBuf m_newTagBuf;
SafeBuf m_fragBuf;
SafeBuf m_wordSpamBuf;
@ -1002,9 +810,6 @@ public:
class SafeBuf *m_savedSb;
class HttpRequest *m_savedHr;

char m_savedChar;


// validity flags. on reset() all these are set to false.
char m_VALIDSTART;
// DO NOT add validity flags above this line!
@ -1013,7 +818,6 @@ public:
char m_addedSpiderReplySizeValid;
char m_addedStatusDocSizeValid;
char m_downloadStartTimeValid;
//char m_docQualityValid;
char m_siteValid;
char m_startTimeValid;
char m_currentUrlValid;
@ -1025,7 +829,6 @@ public:
char m_lastUrlValid;
char m_docIdValid;
char m_availDocIdValid;
//char m_collValid;
char m_tagRecValid;
char m_robotsTxtLenValid;
char m_tagRecDataValid;
@ -1034,7 +837,6 @@ public:
char m_filteredRootTitleBufValid;
char m_titleBufValid;
char m_fragBufValid;
char m_inlineSectionVotingBufValid;
char m_wordSpamBufValid;
char m_finalSummaryBufValid;
char m_matchingQueryBufValid;
@ -1042,32 +844,24 @@ public:
char m_relatedQueryBufValid;
char m_queryLinkBufValid;
char m_redirSpiderRequestValid;
//char m_queryPtrsValid;
char m_queryOffsetsValid;
//char m_queryPtrsSortedValid;
char m_queryPtrsWholeValid;
char m_relatedDocIdBufValid;
char m_topMatchingQueryBufValid;
char m_relatedDocIdsScoredBufValid;
char m_relatedDocIdsWithTitlesValid;
char m_relatedTitleBufValid;
//char m_queryLinkBufValid;
char m_missingTermBufValid;
char m_matchingTermBufValid;
//char m_relPtrsValid;
char m_sortedPosdbListBufValid;
char m_wpSortedPosdbListBufValid;
char m_termListBufValid;
char m_insertableTermsBufValid;
char m_scoredInsertableTermsBufValid;
//char m_iwfiBufValid; // for holding WordFreqInfo instances
char m_wordPosInfoBufValid;
char m_recommendedLinksBufValid;

//char m_queryHashTableValid;
char m_queryOffsetTableValid;
//char m_socketWriteBufValid;
//char m_numBannedOutlinksValid;
char m_hopCountValid;
char m_isInjectingValid;
char m_isImportingValid;
@ -1091,35 +885,19 @@ public:
char m_posValid;
char m_isUrlBadYearValid;
char m_phrasesValid;
//char m_synonymsValid;
//char m_weightsValid;
char m_sectionsValid;
char m_subSentsValid;
char m_osvtValid;
char m_nsvtValid;
//char m_rvtValid;
char m_turkVotingTableValid;
char m_turkBitsTableValid;
char m_turkBanTableValid;
char m_vctabValid;
char m_explicitSectionsValid;
char m_impliedSectionsValid;
char m_sectionVotingTableValid;

char m_imageDataValid;
char m_imagesValid;
char m_msge0Valid;
char m_msge1Valid;
//char m_msge2Valid;
//char m_sampleVectorValid;
char m_gigabitHashesValid;
//char m_oldsrValid;
char m_sreqValid;
char m_srepValid;

bool m_ipValid;
bool m_firstIpValid;
bool m_spideredTimeValid;
//bool m_nextSpiderTimeValid;
bool m_indexedTimeValid;
bool m_firstIndexedValid;
bool m_isInIndexValid;
@ -1127,26 +905,16 @@ public:
bool m_outlinksAddedDateValid;
bool m_countryIdValid;
bool m_bodyStartPosValid;
/*
bool m_titleWeightValid;
bool m_headerWeightValid;
bool m_urlPathWeightValid;
bool m_externalLinkTextWeightValid;
bool m_internalLinkTextWeightValid;
bool m_conceptWeightValid;
*/

bool m_httpStatusValid;
bool m_crawlDelayValid;
bool m_finalCrawlDelayValid;
bool m_titleRecKeyValid;
bool m_wikiDocIdsValid;
bool m_versionValid;
bool m_rawUtf8ContentValid;
bool m_expandedUtf8ContentValid;
bool m_utf8ContentValid;
bool m_isAllowedValid;
//bool m_tryAgainTimeDeltaValid;
//bool m_eliminateMenusValid;
bool m_redirUrlValid;
bool m_redirCookieBufValid;
bool m_metaRedirUrlValid;
@ -1163,11 +931,9 @@ public:
bool m_redirErrorValid;
bool m_domHash32Valid;
bool m_contentHash32Valid;
//bool m_tagHash32Valid;
bool m_tagPairHash32Valid;

bool m_spiderLinksValid;
//bool m_nextSpiderPriorityValid;
bool m_firstIndexedDateValid;
bool m_isPermalinkValid;

@ -1186,8 +952,6 @@ public:
bool m_dupListValid;
bool m_likedbListValid;
bool m_isDupValid;
bool m_gigabitVectorHashValid;
bool m_gigabitQueryValid;
bool m_metaDescValid;
bool m_metaSummaryValid;
bool m_metaKeywordsValid;
@ -1196,23 +960,16 @@ public:
bool m_oldDocValid;
bool m_extraDocValid;
bool m_rootDocValid;
//bool m_gatewayDocValid;
bool m_oldMetaListValid;
bool m_oldTitleRecValid;
bool m_rootTitleRecValid;
bool m_isIndexedValid;
bool m_siteNumInlinksValid;
//bool m_siteNumInlinksUniqueIpValid;//FreshValid;
//bool m_siteNumInlinksUniqueCBlockValid;//sitePopValid
//bool m_siteNumInlinksTotalValid;
bool m_siteNumInlinks8Valid;
bool m_siteLinkInfoValid;
bool m_isWWWDupValid;
bool m_linkInfo1Valid;
bool m_linkSiteHashesValid;
bool m_sectionsReplyValid;
bool m_sectionsVotesValid;
bool m_sectiondbDataValid;
bool m_placedbDataValid;
bool m_siteHash64Valid;
bool m_siteHash32Valid;
@ -1228,7 +985,6 @@ public:
bool m_isSiteRootValid;
bool m_wasContentInjectedValid;
bool m_outlinkHopCountVectorValid;
//bool m_isSpamValid;
bool m_isFilteredValid;
bool m_urlFilterNumValid;
bool m_numOutlinksAddedValid;
@ -1245,7 +1001,6 @@ public:
bool m_titleValid;
bool m_htbValid;
bool m_collnumValid;
//bool m_twidsValid;
bool m_termId32BufValid;
bool m_termInfoBufValid;
bool m_newTermInfoBufValid;
@ -1254,9 +1009,6 @@ public:
bool m_spiderStatusDocMetaListValid;
bool m_isCompromisedValid;
bool m_isNoArchiveValid;
//bool m_isVisibleValid;
//bool m_clockCandidatesTableValid;
//bool m_clockCandidatesDataValid;
bool m_titleRecBufValid;
bool m_isLinkSpamValid;
bool m_isErrorPageValid;
@ -1280,19 +1032,9 @@ public:
// DO NOT add validity flags below this line!
char m_VALIDEND;

// more stuff
//char *m_utf8Content;
//int32_t m_utf8ContentLen;

// use this stuff for getting wiki docids that match our doc's gigabits
//Query m_wq;
//SearchInput m_si;
//Msg40 m_msg40;
bool m_printedMenu;
//HashTableX m_clockCandidatesTable;
//SafeBuf m_cctbuf;
int32_t m_urlPubDate;
//int32_t m_urlAge;
char m_isUrlPermalinkFormat;
uint8_t m_summaryLangId;
int32_t m_tagPairHashVec[MAX_TAG_PAIR_HASHES];
@ -1306,7 +1048,6 @@ public:
int32_t m_postVec[POST_VECTOR_SIZE/4];
int32_t m_postVecSize;
float m_tagSimilarity;
float m_gigabitSimilarity;
float m_pageSimilarity;
float m_percentChanged;
bool m_unchanged;
@ -1330,17 +1071,6 @@ public:
Msg22 m_msg22d;
Msg22 m_msg22e;
Msg22 m_msg22f;
//int32_t m_collLen;
uint32_t m_gigabitVectorHash;
char m_gigabitQuery [XD_GQ_MAX_SIZE];
int32_t m_gigabitHashes [XD_MAX_GIGABIT_HASHES];
int32_t m_gigabitScores [XD_MAX_GIGABIT_HASHES];
char *m_gigabitPtrs [XD_MAX_GIGABIT_HASHES];
// for debug printing really
class GigabitInfo *m_top[100];
int32_t m_numTop;
//char m_metaDesc[1025];
//char m_metaKeywords[1025];
// these now reference directly into the html src so our
// WordPosInfo::m_wordPtr algo works in seo.cpp
char *m_metaDesc;
@ -1355,11 +1085,9 @@ public:


int32_t m_siteSpiderQuota;
//int32_t m_numBannedOutlinks;
class XmlDoc *m_oldDoc;
class XmlDoc *m_extraDoc;
class XmlDoc *m_rootDoc;
//class XmlDoc *m_gatewayDoc;
RdbList m_oldMetaList;
char *m_oldTitleRec;
int32_t m_oldTitleRecSize;
@ -1377,10 +1105,7 @@ public:
int32_t m_tagdbCollLen;

Url m_extraUrl;
//int32_t m_siteNumInlinksFresh;
//int32_t m_sitePop;
uint8_t m_siteNumInlinks8;
//int32_t m_siteNumInlinks;
LinkInfo m_siteLinkInfo;
SafeBuf m_mySiteLinkInfoBuf;
SafeBuf m_myPageLinkInfoBuf;
@ -1391,7 +1116,6 @@ public:
char m_useSiteLinkBuf;
char m_usePageLinkBuf;
char m_printInXml;
//Msg25 m_msg25;
SafeBuf m_tmpBuf11;
SafeBuf m_tmpBuf12;
Multicast m_mcast11;
@ -1399,7 +1123,6 @@ public:
// lists from cachedb for msg25's msg20 replies serialized
RdbList m_siteReplyList;
RdbList m_pageReplyList;
//void (* m_masterLoopWrapper) (void *state);
MsgC m_msgc;
bool m_isAllowed;
bool m_forwardDownloadRequest;
@ -1410,10 +1133,6 @@ public:
// for limiting # of iframe tag expansions
int32_t m_numExpansions;
char m_newOnly;
//int32_t m_tryAgainTimeDelta;
//int32_t m_sameIpWait;
//int32_t m_sameDomainWait;
//int32_t m_maxSpidersPerDomain;
char m_isWWWDup;
char m_calledMsg0b;

@ -1424,24 +1143,14 @@ public:
class RdbList *m_ulist;
void *m_hack;
class XmlDoc *m_hackxd;
//class LinkInfo *m_linkInfo1Ptr;
char *m_linkInfoColl;
//char m_injectedReply;
//int32_t m_minInlinkerHopCount;
//class LinkInfo *m_linkInfo2Ptr;
SiteGetter m_siteGetter;
int64_t m_siteHash64;
//char *m_site;
//int32_t m_siteLen;
//Url m_siteUrl;
int32_t m_siteHash32;
char *m_httpReply;
//char m_downloadAttempted;
char m_incrementedAttemptsCount;
char m_incrementedDownloadCount;
char m_redirectFlag;
//char m_isScraping;
//char m_throttleDownload;
char m_spamCheckDisabled;
char m_useRobotsTxt;
int32_t m_robotsTxtLen;
@ -1455,15 +1164,12 @@ public:
int32_t m_filteredContentMaxSize;
char m_calledThread;
int32_t m_errno;
//class CollectionRec *m_cr;
//int32_t m_utf8ContentAllocSize;
int32_t m_hostHash32a;
int32_t m_hostHash32b;
int32_t m_domHash32;
int32_t m_priorityQueueNum;

// this points into m_msge0 i guess
//class TagRec **m_outlinkTagRecVector;
Msge0 m_msge0;

// this points into m_msge1 i guess
@ -1729,8 +1435,6 @@ public:

char *m_wikiqbuf;
int32_t m_wikiqbufSize;
int64_t m_wikiDocIds [ MAX_WIKI_DOCIDS ];
rscore_t m_wikiScores [ MAX_WIKI_DOCIDS ];

bool m_registeredSleepCallback;
bool m_addedNegativeDoledbRec;
@ -1741,16 +1445,12 @@ public:
int32_t m_niceness;

bool m_usePosdb ;
//bool m_useDatedb ;
bool m_useClusterdb ;
bool m_useLinkdb ;
bool m_useSpiderdb ;
bool m_useTitledb ;
bool m_useTagdb ;
bool m_usePlacedb ;
//bool m_useTimedb ;
bool m_useSectiondb ;
//bool m_useRevdb ;
bool m_useSecondaryRdbs ;

int32_t m_linkeeQualityBoost;
@ -1762,10 +1462,7 @@ public:
bool m_storeTermListInfo;
char m_sortTermListBy;

SafeBuf m_sectiondbData;
//char *m_sectiondbData;
char *m_placedbData;
//int32_t m_sectiondbDataSize;
int32_t m_placedbDataSize;

// we now have HashInfo to replace this
@ -1861,6 +1558,8 @@ public:
void *finalState ,
void (* finalCallback)(void *));

void logQueryTiming(const char* function, int64_t startTime);

bool doInjectLoop ( );
void doneInjecting ( class XmlDoc *xd );
int32_t m_i;

@ -184,24 +184,13 @@ static bool storeTerm ( char *s ,
// we know the termlist is small, or the termlist is being used for spidering
// or parsing purposes and is usually not sent across the network.
bool XmlDoc::hashNoSplit ( HashTableX *tt ) {

//if ( m_pbuf )
// m_pbuf->safePrintf("<h3>Terms which are immune to indexdb "
// "splitting:</h3>");

//if ( m_skipIndexing ) return true;

// this should be ready to go and not block!
int64_t *pch64 = getExactContentHash64();
//int64_t *pch64 = getLooseContentHash64();
if ( ! pch64 || pch64 == (void *)-1 ) { char *xx=NULL;*xx=0; }

// shortcut
Url *fu = getFirstUrl();

//BR 20160117: removed: if ( ! hashVectors ( tt ) ) return false;


// constructor should set to defaults automatically
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/////////////
|
||||
//
|
||||
// CHROME DETECTION
|
||||
//
|
||||
// we search for these terms we hash here in getSectionsWithDupStats()
|
||||
// so we can remove chrome.
|
||||
//
|
||||
/////////////
|
||||
|
||||
// . returns false and sets g_errno on error
|
||||
// . copied Url2.cpp into here basically, so we can now dump Url2.cpp
|
||||
bool XmlDoc::hashSections ( HashTableX *tt ) {
|
||||
// BR 20160106: No longer store xpath-hashes in posdb as we do not use them.
|
||||
return true;
|
||||
@ -2706,134 +2684,14 @@ bool XmlDoc::hashPermalink ( HashTableX *tt ) {
|
||||
}
|
||||
|
||||
|
||||
//hash the tag pair vector, the gigabit vector and the sample vector
|
||||
bool XmlDoc::hashVectors ( HashTableX *tt ) {
|
||||
|
||||
setStatus ( "hashing vectors" );
|
||||
|
||||
int32_t blen;
|
||||
char buf[32];
|
||||
HashInfo hi;
|
||||
hi.m_tt = tt;
|
||||
hi.m_shardByTermId = true;
|
||||
|
||||
/*
|
||||
BR 20160117 removed
|
||||
|
||||
int32_t score = *getSiteNumInlinks8() * 256;
|
||||
if ( score <= 0 ) score = 1;
|
||||
//char *field;
|
||||
//char *descr;
|
||||
//h = m_tagVector.getVectorHash();
|
||||
uint32_t tph = *getTagPairHash32();
|
||||
blen = sprintf(buf,"%"UINT32"", tph);
|
||||
//field = "gbtagvector";
|
||||
//descr = "tag vector hash";
|
||||
|
||||
// update hash parms
|
||||
HashInfo hi;
|
||||
hi.m_tt = tt;
|
||||
hi.m_hashGroup = HASHGROUP_INTAG;
|
||||
hi.m_prefix = "gbtagvector";
|
||||
hi.m_desc = "tag vector hash";
|
||||
hi.m_shardByTermId = true;
|
||||
|
||||
// this returns false on failure
|
||||
if ( ! hashString ( buf,blen, &hi ) ) return false;
|
||||
*/
|
||||
|
||||
/*
|
||||
BR 20160106 removed
|
||||
uint32_t h = *getGigabitVectorScorelessHash();
|
||||
blen = sprintf(buf,"%"UINT32"",(uint32_t)h);
|
||||
// udpate hash parms
|
||||
hi.m_prefix = "gbgigabitvector";
|
||||
hi.m_desc = "gigabit vector hash";
|
||||
// this returns false on failure
|
||||
if ( ! hashString ( buf,blen,&hi) ) return false;
|
||||
*/
|
||||
|
||||
// . dup checking uses the two hashes above, not this hash!!! MDW
|
||||
// . i think this vector is just used to see if the page changed
|
||||
// significantly since last spidering
|
||||
// . it is used by getPercentChanged() and by Dates.cpp
|
||||
// . sanity check
|
||||
//if ( ! m_pageSampleVecValid ) { char *xx=NULL;*xx=0; }
|
||||
//int32_t *pc = m_pageSampleVec;
|
||||
//h = hash32((char *)m_pageSampleVec, SAMPLE_VECTOR_SIZE);
|
||||
//blen = sprintf(buf,"%"UINT32"",(int32_t unsigned int)h);
|
||||
//field = "gbsamplevector";
|
||||
//descr = "sample vector hash";
|
||||
// this returns false on failure
|
||||
//if ( ! hashString ( tt,buf,blen,score,field,descr) )
|
||||
// return false;
|
||||
|
||||
// . hash combined for Dup Dectection
|
||||
// . must match XmlDoc::getDupList ( );
|
||||
//uint64_t h1 = m_tagVector.getVectorHash();
|
||||
//uint64_t h2 = getGigabitVectorScorelessHash(gigabitVec);
|
||||
//uint64_t h64 = hash64 ( h1 , h2 );
|
||||
|
||||
// take this out for now
|
||||
/*
|
||||
uint64_t *dh = getDupHash ( );
|
||||
blen = sprintf(buf,"%"UINT64"", *dh );//h64);
|
||||
//field = "gbduphash";
|
||||
//descr = "dup vector hash";
|
||||
// update hash parms
|
||||
hi.m_prefix = "gbduphash";
|
||||
hi.m_desc = "dup vector hash";
|
||||
// this returns false on failure
|
||||
if ( ! hashString ( buf,blen,&hi ) ) return false;
|
||||
*/
|
||||
|
||||
// hash the wikipedia docids we match
|
||||
if ( ! m_wikiDocIdsValid ) { char *xx=NULL;*xx=0; }
|
||||
for ( int32_t i = 0 ; i < size_wikiDocIds/8 ; i++ ) {
|
||||
blen = sprintf(buf,"%"UINT64"",ptr_wikiDocIds[i]);
|
||||
// convert to int32_t
|
||||
//int32_t convScore = (int32_t)ptr_wikiScores[i];
|
||||
// get score
|
||||
//uint32_t ws = score8to32 ( convScore );
|
||||
// update hash parms
|
||||
hi.m_prefix = "gbwikidocid";
|
||||
hi.m_desc = "wiki docid";
|
||||
hi.m_hashGroup = HASHGROUP_INTAG;
|
||||
|
||||
// this returns false on failure
|
||||
if ( ! hashString ( buf,blen,&hi ) ) return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
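The surviving loop in hashVectors() above turns each matching Wikipedia docid into a prefixed index term: the docid is printed as decimal text and hashed under the "gbwikidocid" prefix, so a term of the form gbwikidocid:<docid> becomes searchable. The bound is size_wikiDocIds/8 because ptr_wikiDocIds is a packed array of 8-byte int64_t values. A condensed sketch of a single iteration, using the same calls as above:

char buf[32];
// one packed 8-byte docid printed as decimal text
int32_t blen = sprintf ( buf , "%"UINT64"" , ptr_wikiDocIds[0] );
HashInfo hi;
hi.m_tt        = tt;
hi.m_prefix    = "gbwikidocid";   // query side: gbwikidocid:<docid>
hi.m_desc      = "wiki docid";
hi.m_hashGroup = HASHGROUP_INTAG;
if ( ! hashString ( buf , blen , &hi ) ) return false;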


/*
BR 20160106 removed.
// hash gbhasthumbnail:0|1
bool XmlDoc::hashImageStuff ( HashTableX *tt ) {

setStatus ("hashing image stuff");

char *val = "0";
char **td = getThumbnailData();
if ( *td ) val = "1";

// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbhasthumbnail";
hi.m_desc = "has a thumbnail";

// this returns false on failure
if ( ! hashString ( val,1,&hi ) ) return false;

return true;
}
*/


// returns false and sets g_errno on error
bool XmlDoc::hashIsAdult ( HashTableX *tt ) {

@ -3080,7 +2938,7 @@ bool XmlDoc::hashString3( char *s ,
return false;
if ( ! bits.set ( &words , version , niceness ) )
return false;
if ( ! phrases.set(&words,&bits,true,false,version,niceness ) )
if ( !phrases.set( &words, &bits, version, niceness ) )
return false;

// use primary langid of doc
@ -3348,15 +3206,15 @@ bool XmlDoc::hashWords3 ( //int32_t wordStart ,
// hashTitle we count all the words in the title
// towards the density rank even if they are
// in different sentences
if ( sx->m_flags & SEC_IN_TITLE )
//hashGroup = HASHGROUP_TITLE;
if ( sx->m_flags & SEC_IN_TITLE ) {
continue;
if ( sx->m_flags & SEC_IN_HEADER )
}
if ( sx->m_flags & SEC_IN_HEADER ) {
hashGroup = HASHGROUP_HEADING;
if ( sx->m_flags & ( SEC_MENU |
SEC_MENU_SENTENCE |
SEC_MENU_HEADER ) )
}
if ( sx->m_flags & ( SEC_MENU | SEC_MENU_SENTENCE | SEC_MENU_HEADER ) ) {
hashGroup = HASHGROUP_INMENU;
}
}

// this is for link text and meta tags mostly
@ -3381,10 +3239,6 @@ bool XmlDoc::hashWords3 ( //int32_t wordStart ,
// otherwise it will be the document's primary language.
char langId = langUnknown;
if ( m_wts && langVec ) langId = langVec[i];
// keep it as the original vector. i'm not sure we use
// this for anything but for display, so show the user
// how we made our calculation of the document's primary lang
//if ( langId == langUnknown ) langId = docLangId;

char wd;
if ( hi->m_useCountTable ) wd = wdv[i];
@ -3458,8 +3312,7 @@ bool XmlDoc::hashWords3 ( //int32_t wordStart ,

// if using posdb
key144_t k;
// if ( i == 11429 )
// log("foo");

g_posdb.makeKey ( &k ,
h ,
0LL,//docid
@ -3476,16 +3329,10 @@ bool XmlDoc::hashWords3 ( //int32_t wordStart ,
false , // delkey?
hi->m_shardByTermId );

// get the one we lost
// char *kstr = KEYSTR ( &k , sizeof(POSDBKEY) );
// if (!strcmp(kstr,"0x0ca3417544e400000000000032b96bf8aa01"))
// log("got lost key");

// key should NEVER collide since we are always incrementing
// the distance cursor, m_dist
dt->addTerm144 ( &k );


// add to wts for PageParser.cpp display
if ( wts ) {
if ( ! storeTerm ( wptrs[i],wlens[i],h,hi,i,
@ -3494,7 +3341,6 @@ bool XmlDoc::hashWords3 ( //int32_t wordStart ,
wd,//v[i],
ws,
hashGroup,
//false, // is phrase?
wbuf,
wts,
SOURCE_NONE, // synsrc
@ -3567,7 +3413,6 @@ skipsingleword:
////////

int64_t npid = pids2[i];
int32_t npw = 2;
uint64_t ph2 = 0;

// repeat for the two word hash if different!
@ -3599,7 +3444,7 @@ skipsingleword:
if ( wts && npid ) {
// get phrase as a string
int32_t plen;
char *phr=phrases->getPhrase(i,&plen,npw);
char *phr=phrases->getPhrase(i,&plen,2);
// store it
if ( ! storeTerm ( phr,plen,ph2,hi,i,
wposvec[i], // wordPos
@ -3647,190 +3492,12 @@ skipsingleword:
return false;
}


#ifdef SUPPORT_FACETS
//BR 20160108 - facets DISABLED AS TEST. Don't think we will use them.
//https://gigablast.com/syntax.html?c=main

#ifdef PRIVACORE_SAFE_VERSION
#error Oops? Do not enable SUPPORT_FACETS with PRIVACORE_SAFE_VERSION. Stores too much unused data in posdb.
#endif

// hash a single term so they can do gbfacet:ext or
// gbfacet:siterank or gbfacet:price. a field on a field.
if ( prefixHash && words->m_numWords )
{
// hash gbfacet:price and store the price in the key
hashFacet1 ( hi->m_prefix, words ,hi->m_tt);//, hi );
}
#endif


// between calls? i.e. hashTitle() and hashBody()
//if ( wc > 0 ) m_dist = wposvec[wc-1] + 100;
if ( i > 0 ) m_dist = wposvec[i-1] + 100;

return true;
}
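The last statement above advances m_dist, the word-position cursor, 100 positions past the final word so that the next hashing pass (e.g. hashBody() after hashTitle()) starts well clear of this one and the two passes' words can never appear phrase-adjacent. A toy, self-contained illustration of that convention (not the real XmlDoc code; real passes also space words by varying amounts):

#include <cstdint>
#include <vector>

struct PositionCursor {
	int32_t m_dist = 0;
	// assign sequential positions to a block of n words, then leave
	// a gap of 100 before the next block begins
	std::vector<int32_t> assign ( int32_t n ) {
		std::vector<int32_t> pos ( n );
		for ( int32_t i = 0 ; i < n ; i++ ) pos[i] = m_dist + i;
		if ( n > 0 ) m_dist = pos[n-1] + 100; // the gap from above
		return pos;
	}
};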

// just like hashNumber*() functions but we use "gbfacet" as the
// primary prefix, NOT gbminint, gbmin, gbmax, gbmaxint, gbsortby,
// gbsortbyint, gbrevsortby, gbrevsortbyint
bool XmlDoc::hashFacet1 ( char *term ,
Words *words ,
HashTableX *tt ) {

// need a prefix
//if ( ! hi->m_prefix ) return true;

// hash the ENTIRE content, all words as one blob
int32_t nw = words->getNumWords();
char *a = words->m_words[0];
char *b = words->m_words[nw-1]+words->m_wordLens[nw-1];
// hash the whole string as one value, the value of the facet
int32_t val32 = hash32 ( a , b - a );

if ( ! hashFacet2 ( "gbfacetstr",term, val32 , tt ) ) return false;

return true;
}
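hashFacet1() derives the facet's 32-bit value by hashing the raw byte span from the first word's start through the last word's end, so a multi-word value collapses to one val32, and then delegates to hashFacet2() under the "gbfacetstr" prefix. A hypothetical usage sketch for a numeric facet; the "gbfacetint" prefix is an assumption modeled on the gbfacetstr/gbfacetfloat prefixes visible in this file, with the raw integer passed as val32 instead of a hash:

// index the site rank so results could be bucketed by a
// gbfacetint:siterank query (prefix name is an assumption)
int32_t siteRank = getSiteRank ( );
if ( ! hashFacet2 ( "gbfacetint" , "siterank" , siteRank , tt ) )
	return false;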


bool XmlDoc::hashFacet2 ( char *prefix,
char *term ,
int32_t val32 ,
HashTableX *tt ,
// we only use this for gbxpathsitehash terms:
bool shardByTermId ) {

// need a prefix
//if ( ! hi->m_prefix ) return true;
//int32_t plen = gbstrlen ( hi->m_prefix );
//if ( plen <= 0 ) return true;
// we gotta make this case insensitive, and skip spaces
// because if it is 'focal length' we can't search
// 'focal length:10' because that comes across as TWO terms.
//int64_t prefixHash =hash64Lower_utf8_nospaces ( hi->m_prefix,plen);

// now any field has to support gbfacet:thatfield
// and store the 32-bit termid into where we normally put
// the word position bits, etc.
//static int64_t s_facetPrefixHash = 0LL;
//if ( ! s_facetPrefixHash )
// s_facetPrefixHash = hash64n ( "gbfacet" );

// this is case-sensitive
int64_t prefixHash = hash64n ( prefix );

// term is something like "object.price" or whatever.
// it is the json field itself, or the meta tag name, etc.
int64_t termId64 = hash64n ( term );

// combine with the "gbfacet" prefix. old prefix hash on right.
// like "price" on right and "gbfacetfloat" on left... see Query.cpp.
int64_t ph2 = hash64 ( termId64, prefixHash );

// . now store it
// . use field hash as the termid. normally this would just be
// a prefix hash
// . use mostly fake value otherwise
key144_t k;
g_posdb.makeKey ( &k ,
ph2 ,
0,//docid
0,// word pos #
0,// densityRank , // 0-15
0 , // MAXDIVERSITYRANK
0 , // wordSpamRank ,
0 , //siterank
0 , // hashGroup,
// we set to docLang final hash loop
//langUnknown, // langid
// unless already set. so set to english here
// so it will not be set to something else
// otherwise our floats would be ordered by langid!
// somehow we have to indicate that this is a float
// termlist so it will not be mangled any more.
//langEnglish,
langUnknown,
0 , // multiplier
false, // syn?
false , // delkey?
shardByTermId );

//int64_t final = hash64n("products.offerprice",0);
//int64_t prefix = hash64n("gbsortby",0);
//int64_t h64 = hash64 ( final , prefix);
//if ( ph2 == h64 )
// log("hey: got offer price");

// now set the float in that key
g_posdb.setInt ( &k , val32 );

// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
// so that we can b-step into a posdb list and make sure
// we are aligned on a 6 byte or 12 byte key, since they come
// in both sizes. but for this, hack it off to tell
// addTable144() that we are a special posdb key, a "numeric"
// key that has a float stored in it. then it will NOT
// set the siterank and langid bits which throw our sorting
// off!!
g_posdb.setAlignmentBit ( &k , 0 );

HashTableX *dt = tt;//hi->m_tt;

// the key may indeed collide, but that's ok for this application
if ( ! dt->addTerm144 ( &k ) )
return false;

if ( ! m_wts )
return true;

bool isFloat = false;
if ( strcmp(prefix,"gbfacetfloat")==0 ) isFloat = true;

// store in buffer for display on pageparser.cpp output
char buf[130];
if ( isFloat )
snprintf(buf,128,"facetField=%s facetVal32=%f",term,
*(float *)&val32);
else
snprintf(buf,128,"facetField=%s facetVal32=%"UINT32"",
term,(uint32_t)val32);
int32_t bufLen = gbstrlen(buf);

// make a special hashinfo for this facet
HashInfo hi;
hi.m_tt = tt;
// the full prefix
char fullPrefix[66];
snprintf(fullPrefix,64,"%s:%s",prefix,term);
hi.m_prefix = fullPrefix;//"gbfacet";

// add to wts for PageParser.cpp display
// store it
if ( ! storeTerm ( buf,
bufLen,
ph2, // prefixHash, // s_facetPrefixHash,
&hi,
0, // word#, i,
0, // wordPos
0,// densityRank , // 0-15
0, // MAXDIVERSITYRANK,//phrase
0, // ws,
0, // hashGroup,
//true,
&m_wbuf,
m_wts,
// a hack for display in wts:
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
langUnknown ,
k) )
return false;

return true;
}
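hashFacet2() is unusual in that the value travels inside the posdb key itself: the combined prefix+field hash serves as the termid, g_posdb.setInt() overwrites the position/rank bits with val32, and clearing the alignment bit marks the key as a special numeric key so addTable144() leaves those bits alone. A condensed sketch of just the key construction, assuming the helpers behave as the comments above describe:

key144_t k;
int64_t fieldHash = hash64n ( term );                // e.g. "price"
int64_t termId64  = hash64 ( fieldHash ,
                             hash64n ( prefix ) );   // e.g. "gbfacetstr"
g_posdb.makeKey ( &k , termId64 ,
                  0 , 0 , 0 , 0 , 0 , 0 , 0 ,        // docid/pos/ranks: fake
                  langUnknown , 0 , false , false ,
                  shardByTermId );
g_posdb.setInt          ( &k , val32 ); // the value lives in the key
g_posdb.setAlignmentBit ( &k , 0 );     // flag it as a numeric key
tt->addTerm144 ( &k );                  // collisions are acceptable here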

bool XmlDoc::hashFieldMatchTerm ( char *val , int32_t vlen , HashInfo *hi ) {

HashTableX *tt = hi->m_tt;
@ -4346,27 +4013,6 @@ char *XmlDoc::hashJSONFields2 ( HashTableX *table ,
}
}


//
// for deduping search results we set m_contentHash32 here for
// diffbot json objects.
// we can't do this here anymore, we have to set the
// contenthash in ::getContentHash32() because we need it to
// set EDOCUNCHANGED in ::getIndexCode() above.
//
/*
if ( hi->m_hashGroup != HASHGROUP_INURL ) {
// make the content hash so we can set m_contentHash32
// for deduping
int32_t nh32 = hash32n ( name );
// do an exact hash for now...
int32_t vh32 = hash32 ( val , vlen , m_niceness );
// accumulate, order independently
totalHash32 ^= nh32;
totalHash32 ^= vh32;
}
*/

// index like "title:whatever"
hi->m_prefix = name;
hashString ( val , vlen , hi );
@ -4384,24 +4030,8 @@ char *XmlDoc::hashJSONFields2 ( HashTableX *table ,
hi->m_prefix = NULL;
hashString ( val , vlen , hi );

/*
// a number? hash special then as well
if ( ji->m_type != JT_NUMBER ) continue;

// use prefix for this though
hi->m_prefix = name;

// hash as a number so we can sort search results by
// this number and do range constraints
float f = ji->m_valueDouble;
if ( ! hashNumberForSortingAsFloat ( f , hi ) )
return NULL;
*/
}

//m_contentHash32 = totalHash32;
//m_contentHash32Valid = true;

return (char *)0x01;
}


@ -8,10 +8,6 @@
//
#define PRIVACORE_SAFE_VERSION

// Facet support disabled by default to save space in posdb
#undef SUPPORT_FACETS


// fix on 64-bit architectures so sizeof(uint96_t) is 12, not 16!
//#pragma pack(0)


396
main.cpp
@ -19,7 +19,6 @@
#include "Posdb.h"
#include "Datedb.h"
#include "Titledb.h"
#include "Revdb.h"
#include "Tagdb.h"
#include "Spider.h"
#include "SpiderColl.h"
@ -95,8 +94,6 @@ static void dumpTitledb (char *coll, int32_t sfn, int32_t numFiles, bool includ
int64_t docId , bool justPrintDups );
static int32_t dumpSpiderdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree,
char printStats , int32_t firstIp );
static void dumpSectiondb( char *coll,int32_t sfn,int32_t numFiles,bool includeTree);
static void dumpRevdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree);

static void dumpTagdb( char *coll, int32_t sfn, int32_t numFiles, bool includeTree, char rec = 0,
int32_t rdbId = RDB_TAGDB, char *site = NULL );
@ -653,16 +650,6 @@ int main2 ( int argc , char *argv[] ) {
"all events as if the time is UTCtimestamp.\n\n"
*/

/*
#ifdef _CLIENT_
//there was <hostId> in this command but it
// wasn't used in the program, so deleting it from
// here
"dump <V> [C [X [Y [Z]]]]\n\tdump a db in "
#else
*/

//"dump <db> <collection> [T]\n\tDump a db from disk. "
"dump <db> <collection>\n\tDump a db from disk. "
"Example: gb dump t main\n"
"\t<collection> is the name of the collection.\n"
@ -687,7 +674,6 @@ int main2 ( int argc , char *argv[] ) {
"\t<db> is W to dump tagdb for wget.\n"
"\t<db> is x to dump doledb.\n"
"\t<db> is w to dump waiting tree.\n"
"\t<db> is B to dump sectiondb.\n"
"\t<db> is C to dump catdb.\n"
"\t<db> is l to dump clusterdb.\n"
"\t<db> is z to dump statsdb all keys.\n"
@ -2239,10 +2225,6 @@ int main2 ( int argc , char *argv[] ) {
fprintf(stdout,"error dumping spiderdb\n");
}
}
else if ( argv[cmdarg+1][0] == 'B' )
dumpSectiondb(coll,startFileNum,numFiles,includeTree);
else if ( argv[cmdarg+1][0] == 'V' )
dumpRevdb(coll,startFileNum,numFiles,includeTree);
else if ( argv[cmdarg+1][0] == 'S' ) {
char *site = NULL;
if ( cmdarg+6 < argc ) {
@ -2638,61 +2620,16 @@ int main2 ( int argc , char *argv[] ) {
if ( ! g_linkdb.init() ) {
log("db: Linkdb init failed." ); return 1; }

// use sectiondb again for its immense voting power for detecting and
// removing web page chrome, categories, etc. only use if
// CollectionRec::m_isCustomCrawl perhaps to save space.
if ( ! g_sectiondb.init() ) {
log("db: Sectiondb init failed." ); return 1; }

// now clean the trees since all rdbs have loaded their rdb trees
// from disk, we need to remove bogus collection data from the trees
// like if a collection was deleted but the tree never saved right it'll
// still have the collection's data in it
if ( ! g_collectiondb.addRdbBaseToAllRdbsForEachCollRec ( ) ) {
log("db: Collectiondb init failed." ); return 1; }
// . now read in a little bit of each db and make sure the contained
// records belong in our group
// . only do this if we have more than one group
// . we may have records from other groups if we are scaling, but
// if we cannot find *any* records in our group we probably have
// the wrong data files.
//if ( ! checkDataParity() ) return 1;

//Load the high-frequency term shortcuts (if they exist)
g_hfts.load();

// init the vector cache
/*
if ( ! g_vectorCache.init ( g_conf.m_maxVectorCacheMem,
VECTOR_REC_SIZE-sizeof(key_t),
true,
g_conf.m_maxVectorCacheMem /
( sizeof(collnum_t) + 20 +
VECTOR_REC_SIZE ) ,
true,
"vector",
false,
12,
12 ) ) {
log("db: Vector Cache init failed." ); return 1; }
*/
// . gb gendbs
// . hostId should have already been picked up above, so it could be
// used to initialize all the rdbs
//if ( strcmp ( cmd , "gendbs" ) == 0 ) {
// char *coll = argv[cmdarg+1];
// // generate the dbs
// genDbs ( coll ); // coll
// g_log.m_disabled = true;
// return 0;
//}
//if ( strcmp ( cmd, "genclusterdb" ) == 0 ) {
// char *coll = argv[cmdarg+1];
// makeClusterdb ( coll );
// g_log.m_disabled = true;
// return 0;
//}

// test all collection dirs for write permission -- metalincs' request
int32_t pcount = 0;
for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
@ -2709,16 +2646,6 @@ int main2 ( int argc , char *argv[] ) {
checkDirPerms ( tt ) ;
}

// and now that all rdbs have loaded let's count the gbeventcount
// keys we have in datedb. those represent the # of events we
// have indexed.
//g_collectiondb.countEvents();

//if (!ucInit(g_hostdb.m_dir, true)) {
// log("Unicode initialization failed!");
// return 1;
//}

//
// NOTE: ANYTHING THAT USES THE PARSER SHOULD GO BELOW HERE, UCINIT!
//
@ -2728,20 +2655,6 @@ int main2 ( int argc , char *argv[] ) {
return 1;
}

// have to test after unified dict is loaded because if word is
// of unknown langid we try to get syns for it anyway if it has
// only one possible lang according to unified dict
//if ( ! g_wiktionary.test2() ) return 1;

/*
if ( strcmp ( cmd, "gendaterange" ) == 0 ) {
char *coll = argv[cmdarg+1];
genDateRange ( coll );
g_log.m_disabled = true;
return 0;
}
*/

// Load the category language table
g_countryCode.loadHashTable();
int32_t nce = g_countryCode.getNumEntries();
@ -2765,64 +2678,6 @@ int main2 ( int argc , char *argv[] ) {
log("db: ResultsCache: %s",mstrerror(g_errno));
return 1;
}
/*
maxMem = 40000000;
int32_t maxNodes2 = maxMem/(8+8+50*(8+4+4));
if ( ! g_genericCache[SEORESULTS_CACHEID].init (
maxMem , // max cache mem
-1 , // fixedDataSize
false , // support lists of recs?
maxNodes2 , // max cache nodes
false , // use half keys?
"seoresults" , // filename
true)){ // save to disk?
log("db: ResultsCache: %s",mstrerror(g_errno));
return 1;
}
*/
/*
int32_t maxMem1 = g_conf.m_siteLinkInfoMaxCacheMem;
if ( ! g_genericCache[SITELINKINFO_CACHEID].init (
maxMem1 , // max cache mem
4 , // fixedDataSize
false , // support lists of recs?
maxMem1/36 , // max cache nodes
false , // use half keys?
"sitelinkinfo" , // filename
//g_conf.m_siteLinkInfoSaveCache ) ) {
true)){
log("db: SiteLinkInfoCache: %s",mstrerror(g_errno));
return 1;
}
int32_t maxMem2a = g_conf.m_siteQualityMaxCacheMem;
if ( ! g_genericCache[SITEQUALITY_CACHEID].init (
maxMem2a , // max cache mem
1 , // fixedDataSize
false , // support lists of recs?
maxMem2a/36 , // max cache nodes
false , // use half keys?
"sitequality" , // filename
//g_conf.m_siteQualitySaveCache ) ) {
true)) {
log("db: SiteQualityCache: %s",mstrerror(g_errno));
return 1;
}
*/
/*
int32_t maxMem2b = g_conf.m_siteQualityMaxCacheMem * .10 ;
if ( ! g_genericCacheSmallLocal[SITEQUALITY_CACHEID].init (
maxMem2b , // max cache mem
1 , // fixedDataSize
false , // support lists of recs?
maxMem2b/36 , // max cache nodes
false , // use half keys?
"sitequality" , // filename
//g_conf.m_siteQualitySaveCache ) ) {
false)) {
log("db: SiteQualityCacheSmallLocal: %s",mstrerror(g_errno));
return 1;
}
*/

// init minsitenuminlinks buffer
if ( ! g_tagdb.loadMinSiteInlinksBuffer() ) {
@ -7836,223 +7691,6 @@ void *startUp ( void *state , ThreadEntry *t ) {
return 0; //NULL;
}

void dumpSectiondb(char *coll,int32_t startFileNum,int32_t numFiles,
bool includeTree) {
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
g_sectiondb.init ();
//g_collectiondb.init(true);
g_sectiondb.getRdb()->addRdbBase1(coll );
key128_t startKey ;
key128_t endKey ;
startKey.setMin();
endKey.setMax();
// turn off threads
g_threads.disableThreads();
// get a meg at a time
int32_t minRecSizes = 1024*1024;
Msg5 msg5;
RdbList list;
char tmpBuf[1024];
SafeBuf sb(tmpBuf, 1024);
bool firstKey = true;
CollectionRec *cr = g_collectiondb.getRec(coll);
loop:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_SECTIONDB ,
cr->m_collnum ,
&list ,
(char *)&startKey ,
(char *)&endKey ,
minRecSizes ,
includeTree ,
false , // add to cache?
0 , // max cache age
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false )){// err correction?
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() ) return;

key128_t lastk;

// loop over entries in list
for(list.resetListPtr();!list.isExhausted(); list.skipCurrentRecord()){
char *rec = list.getCurrentRec();
key128_t *k = (key128_t *)rec;
char *data = list.getCurrentData();
int32_t size = list.getCurrentDataSize();
// is it a delete?
if ( (k->n0 & 0x01) == 0 ) {
printf("k.n1=%016"XINT64" k.n0=%016"XINT64" (delete)\n",
k->n1 , k->n0 | 0x01 ); // fix it!
continue;
}
if ( size != sizeof(SectionVote) ) { char *xx=NULL;*xx=0; }
// sanity check
if ( ! firstKey ) {
if ( k->n1 < lastk.n1 ) { char *xx=NULL;*xx=0; }
if ( k->n1 == lastk.n1 && k->n0 < lastk.n0 ) {
char *xx=NULL;*xx=0; }
}
// no longer a first key
firstKey = false;
// copy it
gbmemcpy ( &lastk , k , sizeof(key128_t) );
int32_t shardNum = getShardNum (RDB_SECTIONDB,k);
//int32_t groupNum = g_hostdb.getGroupNum ( gid );
// point to the data
char *p = data;
char *pend = data + size;
// breach check
if ( p >= pend ) {
printf("corrupt sectiondb rec k.n0=%"UINT64"",k->n0);
continue;
}
// parse it up
SectionVote *sv = (SectionVote *)data;
int64_t termId = g_datedb.getTermId ( k );
// score is the section type
unsigned char score2 = g_datedb.getScore(k);
char *stype = "unknown";
if ( score2 == SV_CLOCK ) stype = "clock ";
if ( score2 == SV_EURDATEFMT ) stype = "eurdatefmt ";
if ( score2 == SV_EVENT ) stype = "event ";
if ( score2 == SV_ADDRESS ) stype = "address ";
if ( score2 == SV_TAGPAIRHASH ) stype = "tagpairhash ";
if ( score2 == SV_TAGCONTENTHASH ) stype = "tagcontenthash";
if ( score2 == SV_FUTURE_DATE ) stype = "futuredate ";
if ( score2 == SV_PAST_DATE ) stype = "pastdate ";
if ( score2 == SV_CURRENT_DATE ) stype = "currentdate ";
if ( score2 == SV_SITE_VOTER ) stype = "sitevoter ";
if ( score2 == SV_TURKTAGHASH ) stype = "turktaghash ";
int64_t d = g_datedb.getDocId(k);
int32_t date = g_datedb.getDate(k);
// dump it
printf("k=%s "
"sh48=%"XINT64" " // sitehash is the termid
"date=%010"UINT32" "
"%s (%"UINT32") "
"d=%012"UINT64" "
"score=%f samples=%f "
"shardnum=%"INT32""
"\n",
//k->n1,
//k->n0,
KEYSTR(k,sizeof(key128_t)),
termId,
date,
stype,(uint32_t)score2,
d,
sv->m_score,
sv->m_numSampled,
shardNum);
}

startKey = *(key128_t *)list.getLastKey();
startKey += (uint32_t) 1;
// watch out for wrap around
if ( startKey < *(key128_t *)list.getLastKey() ){ printf("\n"); return;}
goto loop;
}
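The tail of dumpSectiondb() above is the standard Rdb paging idiom: read up to minRecSizes bytes, step startKey one past the last key returned, and loop, with the wrap-around test catching the case where the last key read was already the maximum. A generic restatement of the idiom, as a sketch (key128_t's operators are assumed to act as 128-bit unsigned arithmetic, which is exactly what the wrap test relies on):

key128_t next = *(key128_t *)list.getLastKey();
next += (uint32_t)1;            // step just past the last record read
if ( next < *(key128_t *)list.getLastKey() ) {
	// the increment wrapped, so the scan already reached the
	// maximum key and is complete
	printf("\n");
	return;
}
startKey = next;                // otherwise resume the scan from here
goto loop;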
|
||||
void dumpRevdb(char *coll,int32_t startFileNum,int32_t numFiles, bool includeTree) {
|
||||
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
|
||||
g_revdb.init ();
|
||||
//g_collectiondb.init(true);
|
||||
g_revdb.getRdb()->addRdbBase1(coll );
|
||||
key_t startKey ;
|
||||
key_t endKey ;
|
||||
startKey.setMin();
|
||||
endKey.setMax();
|
||||
// turn off threads
|
||||
g_threads.disableThreads();
|
||||
// get a meg at a time
|
||||
int32_t minRecSizes = 1024*1024;
|
||||
Msg5 msg5;
|
||||
RdbList list;
|
||||
char tmpBuf[1024];
|
||||
SafeBuf sb(tmpBuf, 1024);
|
||||
bool firstKey = true;
|
||||
CollectionRec *cr = g_collectiondb.getRec(coll);
|
||||
loop:
|
||||
// use msg5 to get the list, should ALWAYS block since no threads
|
||||
if ( ! msg5.getList ( RDB_REVDB ,
|
||||
cr->m_collnum ,
|
||||
&list ,
|
||||
(char *)&startKey ,
|
||||
(char *)&endKey ,
|
||||
minRecSizes ,
|
||||
includeTree ,
|
||||
false , // add to cache?
|
||||
0 , // max cache age
|
||||
startFileNum ,
|
||||
numFiles ,
|
||||
NULL , // state
|
||||
NULL , // callback
|
||||
0 , // niceness
|
||||
false )){// err correction?
|
||||
log(LOG_LOGIC,"db: getList did not block.");
|
||||
return;
|
||||
}
|
||||
// all done if empty
|
||||
if ( list.isEmpty() ) return;
|
||||
|
||||
key_t lastk;
|
||||
|
||||
// loop over entries in list
|
||||
for(list.resetListPtr();!list.isExhausted(); list.skipCurrentRecord()){
|
||||
char *rec = list.getCurrentRec();
|
||||
key_t *k = (key_t *)rec;
|
||||
char *data = list.getCurrentData();
|
||||
int32_t size = list.getCurrentDataSize();
|
||||
// get docid from key
|
||||
int64_t d = g_revdb.getDocId(k);
|
||||
// is it a delete?
|
||||
if ( (k->n0 & 0x01) == 0 ) {
|
||||
printf("k.n1=%08"XINT32" k.n0=%016"XINT64" d=%"UINT64" (delete)\n",
|
||||
k->n1 , k->n0 | 0x01 , d ); // fix it!
|
||||
continue;
|
||||
}
|
||||
//if ( size != sizeof(SectionVote) ) { char *xx=NULL;*xx=0; }
|
||||
// sanity check
|
||||
if ( ! firstKey ) {
|
||||
if ( k->n1 < lastk.n1 ) { char *xx=NULL;*xx=0; }
|
||||
if ( k->n1 == lastk.n1 && k->n0 < lastk.n0 ) {
|
||||
char *xx=NULL;*xx=0; }
|
||||
}
|
||||
// no longer a first key
|
||||
firstKey = false;
|
||||
// copy it
|
||||
gbmemcpy ( &lastk , k , sizeof(key_t) );
|
||||
// point to the data
|
||||
char *p = data;
|
||||
char *pend = data + size;
|
||||
// breach check
|
||||
if ( p > pend ) {
|
||||
printf("corrupt revdb rec k.n1=0x%08"XINT32" d=%"UINT64"\n",
|
||||
k->n1,d);
|
||||
continue;
|
||||
}
|
||||
// parse it up
|
||||
//SectionVote *sv = (SectionVote *)data;
|
||||
// dump it
|
||||
printf("k.n1=%08"XINT32" k.n0=%016"XINT64" ds=%06"INT32" d=%"UINT64"\n",
|
||||
k->n1,k->n0,size,d);
|
||||
}
|
||||
|
||||
startKey = *(key_t *)list.getLastKey();
|
||||
startKey += (uint32_t) 1;
|
||||
// watch out for wrap around
|
||||
if ( startKey < *(key_t *)list.getLastKey() ){ printf("\n"); return;}
|
||||
goto loop;
|
||||
}
|
||||
|
||||
void dumpTagdb( char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree, char req, int32_t rdbId,
|
||||
char *siteArg ) {
|
||||
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
|
||||
@ -8473,13 +8111,11 @@ bool parseTest ( char *coll , int64_t docId , char *query ) {
|
||||
// computeWordIds from xml
|
||||
words.set ( &xml , true , true ) ;
|
||||
bits.set ( &words ,TITLEREC_CURRENT_VERSION, 0);
|
||||
Phrases phrases;
|
||||
phrases.set ( &words,&bits,true,true,TITLEREC_CURRENT_VERSION,0);
|
||||
t = gettimeofdayInMilliseconds_force();
|
||||
for ( int32_t i = 0 ; i < 100 ; i++ )
|
||||
//if ( ! words.set ( &xml , true , true ) )
|
||||
// do not supply xd so it will be set from scratch
|
||||
if ( !sections.set( &words, &phrases, &bits, NULL, 0, NULL, 0, 0 ) )
|
||||
if ( !sections.set( &words, &bits, NULL, 0, NULL, 0, 0 ) )
|
||||
return log("build: speedtestxml: sections set: %s",
|
||||
mstrerror(g_errno));
|
||||
|
||||
@ -8493,14 +8129,10 @@ bool parseTest ( char *coll , int64_t docId , char *query ) {

	//Phrases phrases;
	Phrases phrases;
	t = gettimeofdayInMilliseconds_force();
	for ( int32_t i = 0 ; i < 100 ; i++ )
		if ( ! phrases.set ( &words ,
				     &bits ,
				     true , // use stop words
				     false , // use stems
				     TITLEREC_CURRENT_VERSION ,
				     0 ) ) // niceness
	for ( int32_t i = 0 ; i < 100 ; i++ )
		if ( !phrases.set( &words, &bits, TITLEREC_CURRENT_VERSION, 0 ) )
			return log("build: speedtestxml: Phrases set: %s",
				   mstrerror(g_errno));
	// print time it took
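Both parseTest hunks track a single API change: Sections::set() no longer takes a Phrases*, and Phrases::set() loses its use-stop-words and use-stems flags. Stitched together from the added lines only, the post-change benchmark body reads roughly as follows (loop and timing code omitted):

// post-change call order in parseTest(), assembled from the added
// diff lines above; not a verbatim excerpt
words.set ( &xml , true , true );
bits.set ( &words , TITLEREC_CURRENT_VERSION , 0 );
// sections no longer consumes phrases
if ( ! sections.set ( &words , &bits , NULL , 0 , NULL , 0 , 0 ) )
	return log("build: speedtestxml: sections set: %s",
		   mstrerror(g_errno));
// phrases now takes just words, bits, version, niceness
Phrases phrases;
if ( ! phrases.set ( &words , &bits , TITLEREC_CURRENT_VERSION , 0 ) )
	return log("build: speedtestxml: Phrases set: %s",
		   mstrerror(g_errno));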
@ -8597,22 +8229,6 @@ bool summaryTest1 ( char *rec , int32_t listSize, char *coll , int64_t docId ,
		xml.set( content, contentLen, xd.m_version, 0, CT_HTML );

		xd.getSummary();

		//Summary s;
		// bool status;
		/*
		status = s.set ( &xml ,
				 &q ,
				 NULL , // termFreqs
				 false , // doStemming?
				 summaryMaxLen ,
				 numSummaryLines ,
				 summaryMaxNumCharsPerLine ,
				 bigSampleRadius ,
				 bigSampleMaxLen ,
				 ratInSummary ,
				 &tr );
		*/
	}

	// print time it took
@ -8641,8 +8257,6 @@ bool summaryTest2 ( char *rec , int32_t listSize, char *coll , int64_t docId ,
	int32_t numSummaryLines = cr->m_summaryMaxNumLines;
	int32_t summaryMaxNumCharsPerLine = cr->m_summaryMaxNumCharsPerLine;
	// these are arbitrary (taken from Msg24.cpp)
	int32_t bigSampleRadius = 100;
	int32_t bigSampleMaxLen = 4000;
	bool ratInSummary = false;

	Query q;
@ -8731,8 +8345,6 @@ bool summaryTest2 ( char *rec , int32_t listSize, char *coll , int64_t docId ,
			 summaryMaxLen ,
			 numSummaryLines ,
			 summaryMaxNumCharsPerLine ,
			 bigSampleRadius ,
			 bigSampleMaxLen ,
			 ratInSummary ,
			 &tr );
	// time it
22
qa.cpp
@ -745,8 +745,7 @@ bool qainject1 ( ) {

	if ( ! s_flags[16] ) {
		s_flags[16] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe"
				"&dsrt=500",
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
				702467314 ) )
			return false;
	}
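The qa.cpp hunks here and below all make the same edit, dropping the "&dsrt=500" parameter from the test query URL. The surrounding code is qa.cpp's once-only latch idiom, abstracted below (a sketch; STEP and the reading of getUrl()'s numeric argument are assumptions, the flag evidently keeps a step from reissuing its request when the handler re-enters):

// qa.cpp step latch, abstracted from the hunks above (illustrative;
// STEP is a hypothetical index, not repo code)
if ( ! s_flags[STEP] ) {      // step not yet issued?
	s_flags[STEP] = true; // latch first, so re-entry skips it
	// second arg looks like an expected checksum of the response
	if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
			702467314 ) )
		return false; // wait for the fetch to complete
}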
@ -1573,8 +1572,7 @@ bool qaWarcFiles ( ) {
	}
	if ( s_flags[EXAMINE_RESULTS1] == 0) {
		s_flags[EXAMINE_RESULTS1]++;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe"
				"&dsrt=500",
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
				702467314 ) )
			return false;
	}
@ -1596,8 +1594,7 @@ bool qaWarcFiles ( ) {

	if ( s_flags[EXAMINE_RESULTS2] == 0) {
		s_flags[EXAMINE_RESULTS2]++;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe"
				"&dsrt=500",
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
				702467314 ) )
			return false;
	}
@ -1790,14 +1787,6 @@ bool qaMetadataFacetSearch ( ) {
		return false;
	}

	// if ( ! s_flags[EXAMINE_RESULTS] ) {
	// 	s_flags[16] = true;
	// 	if ( ! getUrl ( "/search?c=qatest123&qa=1&q=%2Bthe"
	// 			"&dsrt=500",
	// 			702467314 ) )
	// 		return false;
	// }

	return true;
}

@ -1876,8 +1865,7 @@ bool qaimport () {
	// test query
	if ( ! s_flags[16] ) {
		s_flags[16] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe"
				"&dsrt=500",
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
				702467314 ) )
			return false;
	}
@ -1887,7 +1875,7 @@ bool qaimport () {
	if ( ! s_flags[29] ) {
		s_flags[29] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=mediapost&dsrt=0&sc=1",
				"q=mediapost&sc=1",
				702467314 ) )
			return false;
	}
@ -16,7 +16,10 @@ def verify_file(gb_api, httpserver, filename, custom_filename, content_type, exp
    # add url
    assert gb_api.add_url(file_url) == True

    result = gb_api.search('url:' + file_url)
    payload = {}
    payload.update({'showerrors': '1'})

    result = gb_api.search('url:' + file_url, payload)
    assert len(result['results']) == 1

    assert result['results'][0]['contentType'] == expected_content_type
@ -28,14 +28,11 @@ static void generateSummary(Summary &summary, char *htmlInput, char *queryStr, c
	Bits bits;
	ASSERT_TRUE(bits.set(&words, TITLEREC_CURRENT_VERSION, 0));

	Phrases phrases;
	ASSERT_TRUE(phrases.set(&words, &bits, true, false, TITLEREC_CURRENT_VERSION, 0));

	Url url;
	url.set(urlStr);

	Sections sections;
	ASSERT_TRUE(sections.set(&words, &phrases, &bits, &url, 0, "", 0, CT_HTML));
	ASSERT_TRUE(sections.set(&words, &bits, &url, 0, "", 0, CT_HTML));

	Query query;
	ASSERT_TRUE(query.set2(queryStr, langEnglish, true));
@ -53,6 +50,9 @@ static void generateSummary(Summary &summary, char *htmlInput, char *queryStr, c
	Bits bitsForSummary;
	ASSERT_TRUE(bitsForSummary.setForSummary(&words));

	Phrases phrases;
	ASSERT_TRUE(phrases.set(&words, &bits, TITLEREC_CURRENT_VERSION, 0));

	Matches matches;
	matches.setQuery(&query);
	ASSERT_TRUE(matches.set(&words, &phrases, &sections, &bitsForSummary, &pos, &xml, &title, &url, &linkInfo, 0));
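The two unit-test hunks make the same move as parseTest: Phrases drops out of Sections::set() and is now built later, with the four-argument Phrases::set(), only where Matches needs it. The resulting setup order in generateSummary(), stitched from the added lines (fixture construction for words, xml, pos, title and linkInfo omitted):

// post-change order in the summary unit test, per the added lines
Bits bits;
ASSERT_TRUE(bits.set(&words, TITLEREC_CURRENT_VERSION, 0));

Sections sections;            // no longer takes a Phrases*
ASSERT_TRUE(sections.set(&words, &bits, &url, 0, "", 0, CT_HTML));

Query query;
ASSERT_TRUE(query.set2(queryStr, langEnglish, true));

Bits bitsForSummary;
ASSERT_TRUE(bitsForSummary.setForSummary(&words));

Phrases phrases;              // built late, with the new 4-arg set()
ASSERT_TRUE(phrases.set(&words, &bits, TITLEREC_CURRENT_VERSION, 0));

Matches matches;
matches.setQuery(&query);
ASSERT_TRUE(matches.set(&words, &phrases, &sections, &bitsForSummary,
			&pos, &xml, &title, &url, &linkInfo, 0));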