Merge branch 'master' of github.com:privacore/open-source-search-engine

Ivan Skytte Jørgensen committed 2016-02-25 11:14:35 +01:00
77 changed files with 789 additions and 22071 deletions

Bits.h (19 changed lines)

@ -47,24 +47,21 @@
// set by Sections.cpp::setMenu() function
#define D_IN_LINK 0x0400
// in the place name part of an address?
//#define D_UNUSED_2 0x0800
//#define D_UNUSED 0x0800
// allow for DOWs (days of week) for texasdrums.org, so TUESDAYS is set with this and
// we can keep it as part of the sentence and not split on the colon
//#define D_IS_IN_DATE_2 0x1000
// this is so we can still set EV_HASTITLEBYVOTES if a tod (time-of-day) date is in the
// title, all other dates are no-no!
#define D_IS_DAYNUM 0x1000
// for setting event titles in Events.cpp
#define D_GENERIC_WORD 0x2000
#define D_CRUFTY 0x4000
#define D_IS_NUM 0x00008000
//#define D_UNUSED_3 0x00010000
#define D_IS_IN_URL 0x00020000
//#define D_UNUSED 0x4000
#define D_IS_NUM 0x00008000
//#define D_UNUSED 0x00010000
#define D_IS_IN_URL 0x00020000
// like D_IS_TOD above
#define D_IS_MONTH 0x00040000
#define D_IS_HEX_NUM 0x00080000
#define D_IS_MONTH 0x00040000
#define D_IS_HEX_NUM 0x00080000
//
// the bits below here are used for Summary.cpp when calling
// Bits::setForSummary()
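Since this hunk renumbers several of the D_* flags, the one invariant worth checking is that every flag keeps exactly one bit and no two flags collide. A minimal standalone sketch of that check (flag values copied from the hunk above; the test harness itself is hypothetical):

#include <cstdint>
#include <cassert>

int main() {
	// flag values as they stand after this change
	const uint32_t flags[] = {
		0x0400,     // D_IN_LINK
		0x1000,     // D_IS_DAYNUM
		0x2000,     // D_GENERIC_WORD
		0x00008000, // D_IS_NUM
		0x00020000, // D_IS_IN_URL
		0x00040000, // D_IS_MONTH
		0x00080000  // D_IS_HEX_NUM
	};
	uint32_t seen = 0;
	for ( uint32_t f : flags ) {
		assert ( (f & (f - 1)) == 0 ); // exactly one bit set
		assert ( (seen & f) == 0 );    // no overlap with an earlier flag
		seen |= f;
	}
	return 0;
}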

@ -36,12 +36,9 @@
// these are now just TitleRec keys
#define CLUSTER_REC_SIZE (sizeof(key_t))
// this now includes the gigabit vector
#define VECTOR_REC_SIZE (sizeof(key_t)+SAMPLE_VECTOR_SIZE+GIGABIT_VECTOR_SIZE)
class Clusterdb {
public:
public:
// reset rdb
void reset();
@ -85,16 +82,6 @@ class Clusterdb {
// convert a titlerec key into a clusterec key
key_t convertTitleRecKey ( key_t titleKey );
/*
uint32_t getGroupId ( int64_t docId ) {
return g_titledb.getGroupId ( docId ); };
// cluster rec should be stored on same host as titleRec with the
// same docId that this key contains
uint32_t getGroupIdFromKey ( key_t *key ) {
return g_titledb.getGroupId ( getDocId ( *key ) ); };
*/
// NOTE: THESE NOW USE THE REAL CLUSTERDB REC
// // docId occupies the most significant bytes of the key
// now docId occupies the bits after the first 23
@ -106,10 +93,6 @@ class Clusterdb {
return docId;
};
//int64_t getDocId ( char *r ) {
// return getDocId(*(key_t*)r);
//}
uint32_t getSiteHash26 ( const char *r ) {
//return g_titledb.getSiteHash ( (key_t *)r ); };
return ((uint32_t)(((const key_t*)r)->n0 >> 2) & 0x03FFFFFF);
@ -124,52 +107,16 @@ class Clusterdb {
return ((unsigned char)(((const key_t*)r)->n0 >> 28) & 0x0000003F);
}
// NOTE: THESE USE THE OLD "CLUSTERDB" REC GENERATED BY MSG22 (VECTOR)
//uint32_t getContentHash ( char *r ) {
// return g_titledb.getContentHash ( *(key_t *)r ); };
char getFamilyFilter ( const char *r ) {
if ( (*(const int64_t *)r) & 0x0000000400000000LL ) return 1;
return 0;
};
//uint32_t hasAdultWords ( char *r ) {
// return g_titledb.hasAdultWords ( *(key_t *)r ); };
//uint32_t hasAdultCategory ( char *r ) {
// return g_titledb.hasAdultCategory ( *(key_t *)r ); };
//unsigned char getLanguageFromVector ( char *r ) {
// return 0;
//}
// the random sample vector
/*
void getSampleVector ( char *vec ,
class Doc *doc,
char *coll ,
int32_t collLen ,
int32_t niceness = 0 );
*/
//void getSampleVector ( char *vec , class TermTable *table );
char getSampleSimilarity ( char *vec0 , char *vec1 , int32_t size );
// get the content vector from a cluster rec (used by Msg38.cpp)
//char *getSampleVector ( char *rec ) { return rec + sizeof(key_t); };
//char *getGigabitVector ( char *rec ) {
// return rec + sizeof(key_t) + SAMPLE_VECTOR_SIZE ; };
//char getGigabitSimilarity ( char *vec0 , char *vec1 ,
// int32_t *qtable , int32_t numSlots ) ;
//DiskPageCache *getDiskPageCache() { return &m_pc; };
private:
private:
// this rdb holds urls waiting to be spidered or being spidered
Rdb m_rdb;
//DiskPageCache m_pc;
};
extern class Clusterdb g_clusterdb;
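The accessors kept above all unpack fixed-width fields straight out of the low 64 bits (n0) of the clusterdb key. A minimal sketch of the same bit-slicing on a plain uint64_t, with the field positions taken from getSiteHash26() and getFamilyFilter() above (everything else is assumed):

#include <cstdint>

// n0 is the low 64 bits of a clusterdb key (key_t::n0 above).
// bits [27..2] hold the 26-bit site hash; the >> 2 skips the
// low housekeeping bits at the bottom of the key.
static uint32_t siteHash26 ( uint64_t n0 ) {
	return (uint32_t)(n0 >> 2) & 0x03FFFFFF;
}

// the family-filter flag is the single bit 0x0000000400000000 (bit 34)
static bool familyFilter ( uint64_t n0 ) {
	return ( n0 & 0x0000000400000000ULL ) != 0;
}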

@ -456,8 +456,6 @@ bool Collectiondb::addNewColl ( char *coll ,
cr->m_collectiveRespiderFrequency = 0.0;
//cr->m_restrictDomain = true;
// reset the crawl stats
// always turn off gigabits so &s=1000 can do summary skipping
cr->m_docsToScanForTopics = 0;
// turn off link voting, etc. to speed up
cr->m_getLinkInfo = false;
cr->m_computeSiteNumInlinks = false;
@ -1283,12 +1281,12 @@ char *Collectiondb::getDefaultColl ( HttpRequest *r ) {
// . get collectionRec from name
// . returns NULL if not available
CollectionRec *Collectiondb::getRec ( char *coll ) {
CollectionRec *Collectiondb::getRec ( const char *coll ) {
if ( ! coll ) coll = "";
return getRec ( coll , gbstrlen(coll) );
}
CollectionRec *Collectiondb::getRec ( char *coll , int32_t collLen ) {
CollectionRec *Collectiondb::getRec ( const char *coll , int32_t collLen ) {
if ( ! coll ) coll = "";
collnum_t collnum = getCollnum ( coll , collLen );
if ( collnum < 0 ) return NULL;
@ -1333,14 +1331,14 @@ char *Collectiondb::getCollName ( collnum_t collnum ) {
return m_recs[collnum]->m_coll;
}
collnum_t Collectiondb::getCollnum ( char *coll ) {
collnum_t Collectiondb::getCollnum ( const char *coll ) {
int32_t clen = 0;
if ( coll ) clen = gbstrlen(coll );
return getCollnum ( coll , clen );
}
collnum_t Collectiondb::getCollnum ( char *coll , int32_t clen ) {
collnum_t Collectiondb::getCollnum ( const char *coll , int32_t clen ) {
// default empty collection names
if ( coll && ! coll[0] ) coll = NULL;
@ -1674,9 +1672,6 @@ bool CollectionRec::load ( char *coll , int32_t i ) {
// fix for diffbot, spider time deduping
if ( m_isCustomCrawl ) m_dedupingEnabled = true;
// always turn off gigabits so &s=1000 can do summary skipping
if ( m_isCustomCrawl ) m_docsToScanForTopics = 0;
// make min to merge smaller than normal since most collections are
// small and we want to reduce the # of vfds (files) we have
if ( m_isCustomCrawl ) {

@ -61,8 +61,8 @@ class Collectiondb {
bool m_needsSave;
// returns i so that m_recs[i].m_coll = coll
collnum_t getCollnum ( char *coll , int32_t collLen );
collnum_t getCollnum ( char *coll ); // coll is NULL terminated here
collnum_t getCollnum ( const char *coll , int32_t collLen );
collnum_t getCollnum ( const char *coll ); // coll is NULL terminated here
char *getCollName ( collnum_t collnum );
char *getColl ( collnum_t collnum ) {return getCollName(collnum);};
@ -79,9 +79,9 @@ class Collectiondb {
// . get collectionRec from name
// returns NULL if not available
class CollectionRec *getRec ( char *coll );
class CollectionRec *getRec ( const char *coll );
class CollectionRec *getRec ( char *coll , int32_t collLen );
class CollectionRec *getRec ( const char *coll , int32_t collLen );
class CollectionRec *getRec ( collnum_t collnum);
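The signature changes above are a straight const-correctness migration: lookups that never modify the collection name now take const char*, and the NUL-terminated overload just measures the string and forwards. A minimal sketch of the pattern (strlen stands in for gbstrlen; the class name is hypothetical):

#include <cstring>
#include <cstdint>

class CollectionRec;

class CollectiondbSketch {
public:
	// NUL-terminated convenience overload forwards to the
	// length-taking worker, as getRec() does above
	CollectionRec *getRec ( const char *coll ) {
		if ( ! coll ) coll = "";
		return getRec ( coll , (int32_t)strlen(coll) );
	}
	CollectionRec *getRec ( const char *coll , int32_t collLen );
};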
@ -501,8 +501,7 @@ class CollectionRec {
float m_updateVotesFreq ; // in days. replaced m_recycleVotes
float m_sortByDateWeight ;
char m_dedupURLDefault ;
int32_t m_topicSimilarCutoffDefault ;
char m_dedupURLDefault ;
char m_useNewDeduping ;
char m_doTierJumping ;
float m_numDocsMultiplier ;
@ -716,20 +715,6 @@ class CollectionRec {
int32_t m_compoundListMaxSize;
// . related topics control
// . this can all be overridden by passing in your own cgi parms
// for the query request
int32_t m_numTopics; // how many do they want by default?
int32_t m_minTopicScore;
int32_t m_docsToScanForTopics; // how many to scan by default?
int32_t m_maxWordsPerTopic;
int32_t m_minDocCount; // min docs that must contain topic
char m_ipRestrict;
int32_t m_dedupSamplePercent;
char m_topicRemoveOverlaps; // this is generally a good thing
int32_t m_topicSampleSize; // sample about 5k per document
int32_t m_topicMaxPunctLen; // keep it set to 1 for speed
// SPELL CHECK
char m_spellCheck;
@ -887,26 +872,15 @@ class CollectionRec {
// post query reranking
int32_t m_pqr_docsToScan; // also for # docs for language
float m_pqr_demFactCountry; // demotion for foreign countries
float m_pqr_demFactQTTopicsInUrl; // demotion factor fewer for query terms or gigabits in the url
int32_t m_pqr_maxValQTTopicsInUrl; // max value for fewer query terms or gigabits in the url
float m_pqr_demFactPaths; // demotion factor for more paths
int32_t m_pqr_maxValPaths; // max value for more paths
float m_pqr_demFactCatidHasSupers; // demotion factor for catids with many super topics
int32_t m_pqr_maxValCatidHasSupers; // max value for catids with many super topics
float m_pqr_demFactPageSize; // demotion factor for higher page sizes
int32_t m_pqr_maxValPageSize; // max value for higher page sizes
float m_pqr_demFactLocTitle; // demotion factor for non-location specific queries with location specific results
float m_pqr_demFactLocSummary; // demotion factor for non-location specific queries with location specific results
bool m_pqr_demInTopics; // true to demote if location is in the gigabits, otherwise these locs won't be demoted
int32_t m_pqr_maxValLoc; // max value for non-location specific queries with location specific results
float m_pqr_demFactNonHtml; // demotion factor for non-html content type
float m_pqr_demFactXml; // demotion factor for xml content type
float m_pqr_demFactOthFromHost; // demotion factor for no other pages from same host
int32_t m_pqr_maxValOthFromHost; // max value for no other pages from same host
float m_pqr_demFactDmozCatNmNoQT; // demotion factor for dmoz category names that don't contain a query term
int32_t m_pqr_maxValDmozCatNmNoQT; // max value for dmoz category names that don't contain a query term
float m_pqr_demFactDmozCatNmNoGigabits; // demotion factor for dmoz category names that don't contain a gigabit
int32_t m_pqr_maxValDmozCatNmNoGigabits; // max value for dmoz category names that don't contain a gigabit
float m_pqr_demFactDatedbDate; // demotion for datedb date
int32_t m_pqr_minValDatedbDate; // dates earlier than this will be demoted to the max
int32_t m_pqr_maxValDatedbDate; // dates later than this will not be demoted

Conf.h (47 changed lines)

@ -147,7 +147,6 @@ class Conf {
// tagdb parameters
int32_t m_tagdbMaxTreeMem;
int32_t m_revdbMaxTreeMem;
int32_t m_timedbMaxTreeMem;
// clusterdb for site clustering, each rec is 16 bytes
@ -173,6 +172,9 @@ class Conf {
int32_t m_sendEmailTimeout;
int32_t m_pingSpacer;
int64_t m_msg40_msg39_timeout; //timeout for entire get-docid-list phase, in milliseconds.
int64_t m_msg3a_msg39_network_overhead; //additional latency/overhead of sending request+response over network.
// the spiderdb holds url records for spidering, when to spider, etc..
int32_t m_maxWriteThreads ;
int32_t m_spiderMaxDiskThreads ;
@ -184,7 +186,6 @@ class Conf {
bool m_useStatsdb;
bool m_spideringEnabled ;
bool m_turkingEnabled ;
bool m_injectionsEnabled ;
bool m_queryingEnabled ;
bool m_returnResultsAnyway;
@ -385,8 +386,6 @@ class Conf {
bool m_detectMemLeaks;
// . if false we will not keep spelling information in memory
// . we will keep the popularity info from dict though, since related
// topics requires that
bool m_doSpellChecking;
// are we running in Matt Wells's private data center? if so we
@ -395,23 +394,6 @@ class Conf {
bool m_forceIt;
// maximum number of synonyms/stems to expand a word into
//int32_t m_maxSynonyms;
// default affinity for spelling suggestions/numbers
//float m_defaultAffinity;
// threshold for synonym usage
//float m_frequencyThreshold;
// thesaurus configuration
//int32_t m_maxAffinityRequests;
//int32_t m_maxAffinityErrors;
//int32_t m_maxAffinityAge;
//int32_t m_affinityTimeout;
//char m_affinityServer[MAX_URL_LEN];
//char m_affinityParms[MAX_URL_LEN];
// new syncing information
bool m_syncEnabled;
bool m_syncIndexdb;
@ -561,7 +543,6 @@ class Conf {
bool m_logDebugThread ;
bool m_logDebugTimedb ;
bool m_logDebugTitle ;
bool m_logDebugTopics ;
bool m_logDebugTopDocs ;
bool m_logDebugUdp ;
bool m_logDebugUnicode ;
@ -586,7 +567,6 @@ class Conf {
bool m_logTimingNet;
bool m_logTimingQuery;
bool m_logTimingSpcache;
bool m_logTimingTopics;
// programmer reminders.
bool m_logReminders;
@ -653,17 +633,6 @@ class Conf {
int32_t m_maxHeartbeatDelay;
int32_t m_maxCallbackDelay;
// balance value for Msg6, each host can have this many ready domains
// per global host
//int32_t m_distributedSpiderBalance;
//int32_t m_distributedIpWait;
// parameters for indexdb spitting and tfndb extension bits
//int32_t m_indexdbSplit;
//char m_fullSplit;
//char m_legacyIndexdbSplit;
//int32_t m_tfndbExtBits;
// used by Repair.cpp
char m_repairingEnabled ;
int32_t m_maxRepairSpiders ;
@ -673,23 +642,13 @@ class Conf {
char m_fullRebuild ;
char m_rebuildAddOutlinks;
char m_rebuildRecycleLinkInfo ;
//char m_rebuildRecycleLinkInfo2 ;
//char m_removeBadPages ;
char m_rebuildTitledb ;
//char m_rebuildTfndb ;
//char m_rebuildIndexdb ;
char m_rebuildPosdb ;
//char m_rebuildNoSplits ;
//char m_rebuildDatedb ;
char m_rebuildClusterdb ;
char m_rebuildSpiderdb ;
//char m_rebuildSitedb ;
char m_rebuildLinkdb ;
//char m_rebuildTagdb ;
//char m_rebuildPlacedb ;
char m_rebuildTimedb ;
char m_rebuildSectiondb ;
//char m_rebuildRevdb ;
char m_rebuildRoots ;
char m_rebuildNonRoots ;

@ -71,7 +71,7 @@ int32_t Highlight::set( SafeBuf *sb, char *content, int32_t contentLen, Query *q
}
Phrases phrases;
if ( !phrases.set( &words, &bits, true, false, version, niceness ) ) {
if ( !phrases.set( &words, &bits, version, niceness ) ) {
return -1;
}

@ -16,7 +16,6 @@
#include "Clusterdb.h"
#include "Datedb.h"
#include "Dns.h"
#include "Revdb.h"
// a global class extern'd in .h file
Hostdb g_hostdb;
@ -1760,7 +1759,7 @@ int64_t Hostdb::getNumGlobalEvents ( ) {
return n / m_numHostsPerShard;
}
bool Hostdb::setNote ( int32_t hostId, char *note, int32_t noteLen ) {
bool Hostdb::setNote ( int32_t hostId, const char *note, int32_t noteLen ) {
// replace the note on the host
if ( noteLen > 125 ) noteLen = 125;
Host *h = getHost ( hostId );
@ -1773,7 +1772,7 @@ bool Hostdb::setNote ( int32_t hostId, char *note, int32_t noteLen ) {
return saveHostsConf();
}
bool Hostdb::setSpareNote ( int32_t spareId, char *note, int32_t noteLen ) {
bool Hostdb::setSpareNote ( int32_t spareId, const char *note, int32_t noteLen ) {
// replace the note on the host
if ( noteLen > 125 ) noteLen = 125;
Host *h = getSpare ( spareId );
@ -2354,8 +2353,7 @@ int32_t getShardNumFromTermId ( int64_t termId ) {
// . this allows us to have any # of groups in a stripe, not just power of 2
// . now we can use 3 stripes of 96 hosts each so spiders will almost never
// go down
//uint32_t Hostdb::getGroupId ( char rdbId,void *k,bool split ) {
uint32_t Hostdb::getShardNum ( char rdbId, const void *k ) { // ,bool split ) {
uint32_t Hostdb::getShardNum ( char rdbId, const void *k ) {
if ( (rdbId == RDB_POSDB || rdbId == RDB2_POSDB2) &&
// split by termid and not docid?
@ -2372,10 +2370,6 @@ uint32_t Hostdb::getShardNum ( char rdbId, const void *k ) { // ,bool split ) {
uint64_t d = g_posdb.getDocId ( k );
return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
}
//if ( rdbId == RDB_INDEXDB || rdbId == RDB2_INDEXDB2 ) {
// uint64_t d = g_indexdb.getDocId ( (key_t *)k );
// return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
//}
else if ( rdbId == RDB_DATEDB || rdbId == RDB2_DATEDB2 ) {
uint64_t d = g_datedb.getDocId ( k );
return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
@ -2383,10 +2377,6 @@ uint32_t Hostdb::getShardNum ( char rdbId, const void *k ) { // ,bool split ) {
else if ( rdbId == RDB_LINKDB || rdbId == RDB2_LINKDB2 ) {
return m_map [(*(uint16_t *)((char *)k + 26))>>3];
}
//else if ( rdbId == RDB_TFNDB || rdbId == RDB2_TFNDB2 ) {
// uint64_t d = g_tfndb.getDocId ( (key_t *)k );
// return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
//}
else if ( rdbId == RDB_TITLEDB || rdbId == RDB2_TITLEDB2 ) {
uint64_t d = g_titledb.getDocId ( (key_t *)k );
return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
@ -2416,23 +2406,11 @@ uint32_t Hostdb::getShardNum ( char rdbId, const void *k ) { // ,bool split ) {
rdbId == RDB2_TAGDB2 ) {
return m_map [(*(uint16_t *)((char *)k + 10))>>3];
}
else if ( rdbId == RDB_DOLEDB ) { // || rdbId == RDB2_DOLEDB2 ) {
else if ( rdbId == RDB_DOLEDB ) {
// HACK:!!!!!! this is a trick!!! it is us!!!
//return g_hostdb.m_myHost->m_groupId;
return g_hostdb.m_myHost->m_shardNum;
}
else if ( rdbId == RDB_SECTIONDB || rdbId == RDB2_SECTIONDB2 ) {
// use top 13 bits of key
return m_map [(*(uint16_t *)((char *)k + 14))>>3];
//uint64_t d = g_datedb.getDocId ( k );
//return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
}
else if ( rdbId == RDB_REVDB || rdbId == RDB2_REVDB2 ) {
// key is formed like title key is
//int64_t d = g_titledb.getDocId ( (key_t *)k );
uint64_t d = g_revdb.getDocId( (key_t *)k );
return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
}
// core -- must be provided
char *xx = NULL; *xx = 0;
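Every docid-keyed case in getShardNum() above reduces to the same mapping: fold the docid into a slot index, then look the owning shard up in m_map. A minimal sketch of that scheme (the fold and the mask are as in the hunk; the MAX_KSLOTS value and table setup are assumed):

#include <cstdint>

#define MAX_KSLOTS 8192 // assumed; must be a power of two for the mask below

// filled in at startup so the slots are spread evenly across shards
static uint16_t s_map [ MAX_KSLOTS ];

static uint32_t shardForDocId ( uint64_t d ) {
	// xor two shifted copies of the docid to mix its bits, then
	// mask down to a slot index -- the same fold used above
	return s_map [ ((d >> 14) ^ (d >> 7)) & (MAX_KSLOTS - 1) ];
}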

@ -593,8 +593,8 @@ class Hostdb {
// sets the note for a host
bool setNote ( int32_t hostId, char *note, int32_t noteLen );
bool setSpareNote ( int32_t spareId, char *note, int32_t noteLen );
bool setNote ( int32_t hostId, const char *note, int32_t noteLen );
bool setSpareNote ( int32_t spareId, const char *note, int32_t noteLen );
// replace a host with a spare
bool replaceHost ( int32_t origHostId, int32_t spareHostId );

@ -10,6 +10,7 @@
#include "Proxy.h"
#include "PageCrawlBot.h"
#include "Parms.h"
#include "PageRoot.h"
#ifdef _VALGRIND_
#include <valgrind/memcheck.h>
#endif
@ -471,76 +472,6 @@ void HttpServer::requestHandler ( TcpSocket *s ) {
// parse the http request
HttpRequest r;
// debug
/*
unsigned char foo[1024];
unsigned char *pp = foo;
pp += sprintf ( (char *)pp,"GET /search?qcs=iso-8859-1&k0c=107207&code=1M9VNT6&spell=1&ns=2&nrt=0&rat=0&sc=1&DR=1&qh=0&bq2&q=");
//pp += sprintf ( (char *)pp,"GET /search?k0c=107207&code=1M9VNT6&spell=1&ns=2&nrt=0&rat=0&sc=1&DR=1&qh=0&bq2&q=");
static char ddd[] = {
0xc3, 0x83, 0xc6, 0x92, 0xc3, 0xa2, 0xe2, 0x80, 0x9e, 0xc2,
0xa2, 0xc3, 0x83, 0xc2, 0xa2, 0xc3, 0xa2, 0xe2, 0x80, 0x9a,
0xc2, 0xac, 0xc3, 0x82, 0xc2, 0xa6, 0xc3, 0x83, 0xc6, 0x92,
0xc3, 0xa2, 0xe2, 0x80, 0x9e, 0xc2, 0xa2, 0xc3, 0x83, 0xe2,
0x80, 0x9a, 0xc3, 0x82, 0xc2, 0x81, 0xc3, 0x83, 0xc6, 0x92,
0xc3, 0xa2, 0xe2, 0x80, 0x9e, 0xc2, 0xa2, 0xc3, 0x83, 0xc2,
0xa2, 0xc3, 0xa2, 0xe2, 0x80, 0x9a, 0xc2, 0xac, 0xc3, 0x82,
0xc2, 0xa1, 0xc3, 0x83, 0xc6, 0x92, 0xc3, 0xa2, 0xe2, 0x80,
0x9e, 0xc2, 0xa2, 0xc3, 0x83, 0xe2, 0x80, 0xb9, 0xc3, 0xa2,
0xe2, 0x82, 0xac, 0xc2, 0xa0, 0xc3, 0x83, 0xc6, 0x92, 0xc3,
0xa2, 0xe2, 0x80, 0x9e, 0xc2, 0xa2, 0xc3, 0x83, 0xc2, 0xa2,
0xc3, 0xa2, 0xe2, 0x80, 0x9a, 0xc2, 0xac, 0xc3, 0x82, 0xc2,
0xa6, 0x20, 0xc3, 0x83, 0xc6, 0x92, 0xc3, 0x8b, 0xc5, 0x93,
0xc3, 0x83, 0xe2, 0x80, 0x9a, 0xc3, 0x82, 0xc2, 0xa7, 0xc3,
0x83, 0xc6, 0x92, 0xc3, 0xa2, 0xe2, 0x80, 0x9e, 0xc2, 0xa2,
0xc3, 0x83, 0xc2, 0xa2, 0xc3, 0xa2, 0xe2, 0x80, 0x9a, 0xc2,
0xac, 0xc3, 0x85, 0xc2, 0xbe, 0xc3, 0x83, 0xc6, 0x92, 0xc3,
0xa2, 0xe2, 0x80, 0x9e, 0xc2, 0xa2, 0xc3, 0x83, 0xc2, 0xa2,
0xc3, 0xa2, 0xe2, 0x80, 0x9a, 0xc2, 0xac, 0xc3, 0x82, 0xc2,
0xa6, 0xc3, 0x83, 0xc6, 0x92, 0xc3, 0xa2, 0xe2, 0x80, 0x9e,
0xc2, 0xa2, 0xc3, 0x83, 0xc2, 0xa2, 0xc3, 0xa2, 0xe2, 0x80,
0x9a, 0xc2, 0xac, 0xc3, 0x82, 0xc2, 0xa0, 0xc3, 0x83, 0xc6,
0x92, 0xc3, 0x8b, 0xc5, 0x93, 0xc3, 0x83, 0xe2, 0x80, 0x9a,
0xc3, 0x82, 0xc2, 0xb8, 0xc3, 0x83, 0xc6, 0x92, 0xc3, 0xa2,
0xe2, 0x80, 0x9e, 0xc2, 0xa2, 0xc3, 0x83, 0xe2, 0x80, 0xb9,
0xc3, 0xa2, 0xe2, 0x82, 0xac, 0xc2, 0xa0, 0xc3, 0x83, 0xc6,
0x92, 0xc3, 0xa2, 0xe2, 0x80, 0x9e, 0xc2, 0xa2, 0xc3, 0x83,
0xc2, 0xa2, 0xc3, 0xa2, 0xe2, 0x80, 0x9a, 0xc2, 0xac, 0xc3,
0x82, 0xc2, 0xa6, 0xc3, 0x83, 0xc6, 0x92, 0xc3, 0x8b, 0xc5,
0x93, 0xc3, 0x83, 0xe2, 0x80, 0x9a, 0xc3, 0x82, 0xc2, 0xa9,
0x20, 0xc3, 0x83, 0xc6, 0x92, 0xc3, 0x8b, 0xc5, 0x93, 0xc3,
0x83, 0xe2, 0x80, 0x9a, 0xc3, 0x82, 0xc2, 0xa7, 0xc3, 0x83,
0xc6, 0x92, 0xc3, 0xa2, 0xe2, 0x80, 0x9e, 0xc2, 0xa2, 0xc3,
0x83, 0xc2, 0xa2, 0xc3, 0xa2, 0xe2, 0x80, 0x9a, 0xc2, 0xac,
0xc3, 0x85, 0xc2, 0xbe, 0xc3, 0x83, 0xc6, 0x92, 0xc3, 0x8b,
0xc5, 0x93, 0xc3, 0x83, 0xe2, 0x80, 0x9a, 0xc3, 0x82, 0xc2,
0xa8, 0xc3, 0x83, 0xc6, 0x92, 0xc3, 0xa2, 0xe2, 0x80, 0x9e,
0xc2, 0xa2, 0xc3, 0x83, 0xe2, 0x80, 0xa6, 0xc3, 0x82, 0xc2,
0xa0, 0xc3, 0x83, 0xc6, 0x92, 0xc3, 0x8b, 0xc5, 0x93, 0xc3,
0x83, 0xe2, 0x80, 0x9a, 0xc3, 0x82, 0xc2, 0xa6, 0xc3, 0x83,
0xc6, 0x92, 0xc3, 0xa2, 0xe2, 0x80, 0x9e, 0xc2, 0xa2, 0xc3,
0x83, 0xe2, 0x80, 0xa6, 0xc3, 0x82, 0xc2, 0xa0, 0xc3, 0x83,
0xc6, 0x92, 0xc3, 0x8b, 0xc5, 0x93, 0xc3, 0x83, 0xe2, 0x80,
0x9a, 0xc3, 0x82, 0xc2, 0xa9, 0x00, 0x00, 0xda, 0xda, 0xda,
0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda,
0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda,
0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0xda, 0x74,
0x65, 0x73, 0x2c, 0x20, 0x68, 0x59, 0x00, 0x00, 0x00, 0xac,
0xed, 0x3b, 0x09, 0xac, 0xed, 0x3b, 0x09, 0x78, 0x51, 0xa7,
0x24, 0xf8, 0xd0, 0xa7, 0x24, 0x00, 0x00, 0x00, 0x00, 0x0a,
0x00};
for ( int32_t i = 0 ; i < 435 ; i++ ) {
// again:
*pp = ddd[i]; // rand() % 256;
//if ( *pp < 0x80 ) goto again;
pp++;
}
*pp = 0;
*/
// . since we own the data, we'll free readBuf on r's destruction
// . this returns false and sets g_errno on error
// . but it should still set m_request to the readBuf to delete it
@ -2592,9 +2523,6 @@ TcpSocket *HttpServer::unzipReply(TcpSocket* s) {
}
bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr ,
bool printGigablast );
bool sendPagePretty ( TcpSocket *s ,
HttpRequest *r ,
char *filename ,

@ -108,7 +108,7 @@ void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml ,
// the positive scored window
int32_t firstPosScore = -1;
int32_t lastPosScore = -1;
int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_MARQUEE;
int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT;
// find positive scoring window
for ( int32_t i = 0 ; i < nw ; i++ ) {
// skip if in bad section

@ -356,7 +356,6 @@ key224_t Linkdb::makeKey_uk ( uint32_t linkeeSiteHash32 ,
/////////
#include "Collectiondb.h"
//#include "CollectionRec.h"
#include "matches2.h"
// 1MB read size for now
@ -364,12 +363,8 @@ key224_t Linkdb::makeKey_uk ( uint32_t linkeeSiteHash32 ,
#define MAX_INTERNAL_INLINKS 10
//static void gotRootTitleRecWrapper25 ( void *state ) ;
//static void gotTermFreqWrapper ( void *state ) ;
static void gotListWrapper ( void *state ,RdbList *list,Msg5 *msg5);
static bool gotLinkTextWrapper ( void *state );
//static void sendLinkInfoReplyWrapper ( void *state );//, LinkInfo *info ) ;
//static void gotReplyWrapper25 ( void *state , void *state2 ) ;
Msg25::Msg25() {
m_numRequests = 0;
@ -391,12 +386,6 @@ void Msg25::reset() {
mfree ( m_replyPtrs[i], m_replySizes[i], "msg25r");
// reset array count to 0
m_numReplyPtrs = 0;
// . free the linkinfo if we are responsible for it
// . if someone "steals" it from us, they should set this to NULL
//if ( m_linkInfo )
// mfree ( m_linkInfo , m_linkInfo->getStoredSize(),"msg25s");
// this now points into m_linkInfoBuf safebuf, just NULL it
//m_linkInfo = NULL;
m_table.reset();
m_ipTable.reset();
@ -3359,7 +3348,6 @@ void Inlink::set ( Msg20Reply *r ) {
r->size_surroundingText +
r->size_rssItem +
r->size_categories +
r->size_gigabitQuery +
r->size_templateVector;
char *pend = p + need;
@ -3372,7 +3360,7 @@ void Inlink::set ( Msg20Reply *r ) {
size_surroundingText = r->size_surroundingText;
size_rssItem = r->size_rssItem;
size_categories = r->size_categories;
size_gigabitQuery = r->size_gigabitQuery;
size_gigabitQuery = 0;
size_templateVector = r->size_templateVector;
@ -3432,13 +3420,8 @@ void Inlink::set ( Msg20Reply *r ) {
/////////////
off_gigabitQuery = poff;
if ( p + r->size_gigabitQuery < pend ) {
gbmemcpy ( p , r->ptr_gigabitQuery , size_gigabitQuery );
}
else {
size_gigabitQuery = 1;
*p = '\0';
}
size_gigabitQuery = 1;
*p = '\0';
poff += size_gigabitQuery;
p += size_gigabitQuery;
@ -3468,37 +3451,27 @@ void Inlink::setMsg20Reply ( Msg20Reply *r ) {
r->m_firstSpidered = m_firstSpidered;
r->m_lastSpidered = m_lastSpidered;
//r->m_nextSpiderTime = m_nextSpiderDate;
r->m_datedbDate = m_datedbDate;
r->m_firstIndexedDate = m_firstIndexedDate;
r->m_numOutlinks = m_numOutlinks;
//r->m_linkTextBaseScore = m_baseScore;
//r->m_pagePop = m_pagePop;
//r->m_sitePop = m_sitePop;
//r->m_siteNumInlinks = m_siteNumInlinks;
r->m_isPermalink = m_isPermalink;
r->m_outlinkInContent = m_outlinkInContent;
r->m_outlinkInComment = m_outlinkInComment;
r->m_isLinkSpam = m_isLinkSpam;
//r->m_isAnomaly = m_isAnomaly;
r->m_hasAllQueryTerms = m_hasAllQueryTerms;
r->m_country = m_country;
r->m_language = m_language;
//r->m_docQuality = m_docQuality;
r->m_siteRank = m_siteRank;
//r->m_ruleset = m_ruleset;
r->m_hopcount = m_hopcount;
//r->m_linkTextScoreWeight = m_linkTextScoreWeight;
r->ptr_ubuf = getUrl();//ptr_urlBuf;
r->ptr_linkText = getLinkText();//ptr_linkText;
r->ptr_surroundingText = getSurroundingText();//ptr_surroundingText;
r->ptr_rssItem = getRSSItem();//ptr_rssItem;
r->ptr_categories = getCategories();//ptr_categories;
r->ptr_gigabitQuery = getGigabitQuery();//ptr_gigabitQuery;
r->ptr_templateVector = getTemplateVector();//ptr_templateVector;
r->size_ubuf = size_urlBuf;
@ -3506,7 +3479,6 @@ void Inlink::setMsg20Reply ( Msg20Reply *r ) {
r->size_surroundingText = size_surroundingText;
r->size_rssItem = size_rssItem;
r->size_categories = size_categories;
r->size_gigabitQuery = size_gigabitQuery;
r->size_templateVector = size_templateVector;
}
@ -3583,7 +3555,7 @@ bool LinkInfo::print ( SafeBuf *sb , char *coll ) {
int32_t dlen = k->size_surroundingText - 1;
char *r = k->getRSSItem();//ptr_rssItem;
int32_t rlen = k->size_rssItem - 1;
char *g = k->getGigabitQuery();//ptr_gigabitQuery;
char *g = k->getGigabitQuery();
int32_t glen = k->size_gigabitQuery - 1;
char *c = k->getCategories();//ptr_categories;
int32_t clen = k->size_categories - 1;
@ -4068,12 +4040,6 @@ bool Links::addLink ( char *link , int32_t linkLen , int32_t nodeNum ,
// don't add 0 length links
if ( linkLen <= 0 ) return true;
// ensure buf has enough room
// if (titleRecVersion < 72){
// if ( m_bufPtr-m_buf + linkLen + 1 > LINK_BUF_SIZE ){
// return true;
// }
// }
// do we need to alloc more link space?
if (m_numLinks >= m_allocLinks) {
@ -4250,8 +4216,6 @@ bool Links::addLink ( char *link , int32_t linkLen , int32_t nodeNum ,
else bufSpace = 0;
// allocate dynamic buffer for lotsa links
if ( url.getUrlLen() + 1 > bufSpace ) {
//if (titleRecVersion < 72 && m_allocSize >= LINK_BUF_SIZE)
// return true;
// grow by 100K
int32_t newAllocSize;// = m_allocSize+LINK_BUF_SIZE;
if ( ! m_allocSize ) newAllocSize = LINK_BUF_SIZE;

@ -165,7 +165,6 @@ bool Log::shouldLog ( int32_t type , const char *msg ) {
if ( msg[0] == 'n' ) return g_conf.m_logTimingNet;
if ( msg[0] == 'q' ) return g_conf.m_logTimingQuery;
if ( msg[0] == 's' ) return g_conf.m_logTimingSpcache;
if ( msg[0] == 't' ) return g_conf.m_logTimingTopics;
return false;
}
if ( type != LOG_DEBUG ) return true;
@ -205,8 +204,6 @@ bool Log::shouldLog ( int32_t type , const char *msg ) {
if (msg[0]=='u'&&msg[1]=='n' ) return g_conf.m_logDebugUnicode;
if (msg[0]=='t'&&msg[1]=='o'&&msg[3]=='D' )
return g_conf.m_logDebugTopDocs;
if (msg[0]=='t'&&msg[1]=='o'&&msg[3]!='D' )
return g_conf.m_logDebugTopics;
if (msg[0]=='d'&&msg[1]=='a' ) return g_conf.m_logDebugDate;
if (msg[0]=='d'&&msg[1]=='d' ) return g_conf.m_logDebugDetailed;

Log.h (6 changed lines)

@ -69,7 +69,6 @@
// spcache related to determining what urls to spider next
// speller query spell checking
// thread calling threads
// topics related topics
// udp udp networking
// example log:
@ -87,11 +86,6 @@
#define MAX_LOG_MSGS 1024 // in memory
// this is for printing out how a page is parsed by PageParser.cpp
/* extern char *g_pbuf ; */
/* extern char *g_pbufPtr ; */
/* extern char *g_pterms ; */
/* extern char *g_ptermPtr ; */
/* extern char *g_pend; */
extern char *g_dbuf;
extern int32_t g_dbufSize;

@ -35,7 +35,7 @@ OBJS = UdpSlot.o Rebalance.o \
Msg1.o \
Msg0.o Mem.o Matches.o Loop.o \
Log.o Lang.o \
Indexdb.o Posdb.o Clusterdb.o IndexList.o Revdb.o \
Indexdb.o Posdb.o Clusterdb.o IndexList.o \
HttpServer.o HttpRequest.o \
HttpMime.o Hostdb.o \
Highlight.o File.o Errno.o Entities.o \
@ -75,6 +75,7 @@ CPPFLAGS = -g -Wall -fno-stack-protector -DPTHREADS -Wstrict-aliasing=0
ifeq ($(CXX), g++)
CPPFLAGS += -Wno-write-strings -Wno-uninitialized -Wno-unused-but-set-variable
CPPFLAGS += -Wno-invalid-offsetof
else ifeq ($(CXX), clang++)
CPPFLAGS += -Weverything -Wno-cast-align -Wno-reserved-id-macro -Wno-padded -Wno-c++11-long-long -Wno-tautological-undefined-compare -Wno-c++11-compat-reserved-user-defined-literal -Wno-zero-length-array -Wno-float-equal -Wno-c99-extensions -Wno-weak-vtables -Wno-global-constructors -Wno-exit-time-destructors
CPPFLAGS += -Wno-shadow -Wno-conversion -Wno-extra-semi -Wno-sign-conversion -Wno-old-style-cast -Wno-shorten-64-to-32 -Wno-unused-parameter -Wno-missing-prototypes -Wno-c++11-compat-deprecated-writable-strings
@ -114,6 +115,9 @@ GIT_VERSION=$(shell git rev-parse HEAD)$(DIRTY)
all: gb
debug: DEFS += -D_VALGRIND_
debug: all
utils: blaster2 hashtest monitor seektest urlinfo treetest dnstest gbtitletest
# third party libraries

@ -508,7 +508,6 @@ bool Matches::addMatches(Words *words, Phrases *phrases, Sections *sections, Bit
if ( getNumXmlNodes() > 512 ) { char *xx=NULL;*xx=0; }
// google seems to index SEC_MARQUEE so i took that out of here
int32_t badFlags =SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;
int32_t qwn;
@ -627,10 +626,7 @@ bool Matches::addMatches(Words *words, Phrases *phrases, Sections *sections, Bit
// this is 0 if we were an unmatched quote
if ( numWords <= 0 ) continue;
// we matched a bigram in the document
//numWords = 3;
// i guess we matched the query phrase bigram
//numQWords = 3;
// got a match
goto gotMatch2;
}
@ -641,7 +637,6 @@ bool Matches::addMatches(Words *words, Phrases *phrases, Sections *sections, Bit
numWords = 1;
numQWords = 1;
goto gotMatch2;
//char *xx=NULL;*xx=0;
}
//

@ -1164,105 +1164,6 @@ void gotListWrapper ( void *state , RdbList *listb , Msg5 *msg5xx )
log(LOG_LOGIC,"net: msg0: Sending more data than what was "
"requested. Ineffcient. Bad engineer. dataSize=%"INT32" "
"minRecSizes=%"INT32".",dataSize,oldSize);
/*
// always compress these lists
if ( st0->m_rdbId == RDB_SECTIONDB ) { // && 1 == 3) {
// get sh48, the sitehash
key128_t *startKey = (key128_t *)msg5->m_startKey ;
int64_t sh48 = g_datedb.getTermId(startKey);
// debug
//log("msg0: got sectiondblist from disk listsize=%"INT32"",
// list->getListSize());
if ( dataSize > 50000 )
log("msg0: sending back list rdb=%"INT32" "
"listsize=%"INT32" sh48=0x%"XINT64"",
(int32_t)st0->m_rdbId,
dataSize,
sh48);
// save it
int32_t origDataSize = dataSize;
// store compressed list on itself
char *dst = list->m_list;
// warn if niceness is 0!
if ( st0->m_niceness == 0 )
log("msg0: compressing sectiondb list at niceness 0!");
// compress the list
uint32_t lastVoteHash32 = 0LL;
SectionVote *lastVote = NULL;
for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
// breathe
QUICKPOLL ( st0->m_niceness );
// get rec
char *rec = list->getCurrentRec();
// for here
key128_t *key = (key128_t *)rec;
// the score is the bit which was set in
// Section::m_flags for that docid
int32_t secType = g_indexdb.getScore ( (char *)key );
// 0 means it probably used to count # of voters
// from this site, so i don't think xmldoc uses
// that any more
if ( secType == SV_SITE_VOTER ) continue;
// treat key like a datedb key and get the taghash
uint32_t h32 = g_datedb.getDate ( key );
// get data/vote from the current record in the
// sectiondb list
SectionVote *sv=(SectionVote *)list->getCurrentData ();
// get the average score for this doc
float avg = sv->m_score ;
if ( sv->m_numSampled > 0.0 ) avg /= sv->m_numSampled;
// if same as last guy, add to it
if ( lastVoteHash32 == h32 && lastVote ) {
// turn a possible multi-vote from a single docid
// into a single vote, with the score averaged.
lastVote->m_score += avg;
lastVote->m_numSampled++;
continue;
}
// otherwise, add in a new guy!
*(key128_t *)dst = *key;
dst += sizeof(key128_t);
// the new vote
SectionVote *dsv = (SectionVote *)dst;
dsv->m_score = avg;
dsv->m_numSampled = 1;
// set this
lastVote = dsv;
lastVoteHash32 = h32;
// skip over
dst += sizeof(SectionVote);
}
// update the list size now for sending back
dataSize = dst - data;
// if the list was over the requested minrecsizes we need
// to set a flag so that the caller will do a re-call.
// so making the entire list size odd will be the flag.
if ( origDataSize > msg5->m_minRecSizes &&
dataSize < origDataSize ) {
*dst++ = '\0';
dataSize++;
}
// debug
//log("msg0: compressed sectiondblist from disk "
// "newlistsize=%"INT32"", dataSize);
// use this timestamp
int32_t now = getTimeLocal();//Global();
// finally, cache this sucker
s_sectiondbCache.addRecord ( msg5->m_coll,
(char *)startKey,//(char *)&sh48
data,
dataSize ,
now );
// ignore errors
g_errno = 0;
}
*/
//
// for linkdb lists, remove all the keys that have the same IP32

@ -446,10 +446,7 @@ bool Msg2::gotList ( RdbList *list ) {
if ( m_lists[i].m_listSize < m_minRecSizes[i] ) continue;
if ( m_minRecSizes[i] == 0 ) continue;
if ( m_minRecSizes[i] == -1 ) continue;
// do not print this if compiling section xpathsitehash stats
// because we only need like 10k of list to get a decent
// reading
if ( m_req->m_forSectionStats ) break;
log("msg2: read termlist #%"INT32" size=%"INT32" "
"maxSize=%"INT32". losing docIds!",
i,m_lists[i].m_listSize,m_minRecSizes[i]);

@ -691,8 +691,6 @@ int32_t Msg20Reply::serialize ( char *buf , int32_t bufSize ) {
VALGRIND_CHECK_MEM_IS_DEFINED(ptr_displaySum,size_displaySum);
if(ptr_dbuf)
VALGRIND_CHECK_MEM_IS_DEFINED(ptr_dbuf,size_dbuf);
if(ptr_gigabitSample)
VALGRIND_CHECK_MEM_IS_DEFINED(ptr_gigabitSample,size_gigabitSample);
if(ptr_mbuf)
VALGRIND_CHECK_MEM_IS_DEFINED(ptr_mbuf,size_mbuf);
if(ptr_vbuf)
@ -723,12 +721,8 @@ int32_t Msg20Reply::serialize ( char *buf , int32_t bufSize ) {
VALGRIND_CHECK_MEM_IS_DEFINED(ptr_rssItem,size_rssItem);
if(ptr_categories)
VALGRIND_CHECK_MEM_IS_DEFINED(ptr_categories,size_categories);
if(ptr_gigabitQuery)
VALGRIND_CHECK_MEM_IS_DEFINED(ptr_gigabitQuery,size_gigabitQuery);
if(ptr_content)
VALGRIND_CHECK_MEM_IS_DEFINED(ptr_content,size_content);
if(ptr_sectionVotingInfo)
VALGRIND_CHECK_MEM_IS_DEFINED(ptr_sectionVotingInfo,size_sectionVotingInfo);
if(ptr_tr)
VALGRIND_CHECK_MEM_IS_DEFINED(ptr_tr,size_tr);
if(ptr_tlistBuf)

Msg20.h (10 changed lines)

@ -58,8 +58,6 @@ class Msg20Request {
int32_t m_summaryMaxLen ;
int32_t m_summaryMaxNumCharsPerLine ;
int32_t m_maxNumCharsPerLine ;
int32_t m_bigSampleRadius ;
int32_t m_bigSampleMaxLen ;
int32_t m_maxCacheAge ;
int32_t m_discoveryDate ;
@ -83,12 +81,10 @@ class Msg20Request {
unsigned char m_getSummaryVector :1;
unsigned char m_showBanned :1;
unsigned char m_includeCachedCopy :1;
unsigned char m_getSectionVotingInfo :1; // in JSON for now
unsigned char m_getMatches :1;
unsigned char m_getTermListBuf :1;
unsigned char m_getOutlinks :1;
unsigned char m_getTitleRec :1; // sets ptr_tr in reply
unsigned char m_getGigabitVector :1;
unsigned char m_doLinkSpamCheck :1;
unsigned char m_isLinkSpam :1; // Msg25 uses for storage
unsigned char m_isSiteLinkInfo :1; // site link info?
@ -237,7 +233,6 @@ public:
char *ptr_rubuf ; // redirect url buffer
char *ptr_displaySum ; // summary for displaying
char *ptr_dbuf ; // display metas \0 separated
char *ptr_gigabitSample ;
char *ptr_mbuf ; // match offsets
char *ptr_vbuf ; // summary vector
char *ptr_imgData ; // for encoded images
@ -263,9 +258,7 @@ public:
char *ptr_linkUrl ; // what we link to
char *ptr_rssItem ; // set for m_getLinkText
char *ptr_categories ;
char *ptr_gigabitQuery ; // , separated list of gigabits
char *ptr_content ; // page content in utf8
char *ptr_sectionVotingInfo ; // in JSON
char *ptr_tr ; // like just using msg22
char *ptr_tlistBuf ;
char *ptr_tiBuf ; // terminfobuf
@ -285,7 +278,6 @@ public:
int32_t size_rubuf ;
int32_t size_displaySum ;
int32_t size_dbuf ;
int32_t size_gigabitSample ; // includes \0
int32_t size_mbuf ;
int32_t size_vbuf ;
int32_t size_imgData ;
@ -304,9 +296,7 @@ public:
int32_t size_linkUrl ;
int32_t size_rssItem ;
int32_t size_categories ;
int32_t size_gigabitQuery ;
int32_t size_content ; // page content in utf8
int32_t size_sectionVotingInfo ; // in json, includes \0
int32_t size_tr ;
int32_t size_tlistBuf ;
int32_t size_tiBuf ;
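Each ptr_xxx buffer in the reply is paired with a size_xxx field, which is why this commit can drop the gigabit pairs wholesale: serialization just walks the pairs in order. A minimal two-field sketch of that wire layout (struct and function names are hypothetical):

#include <cstdint>
#include <cstring>

struct ReplySketch {
	char   *ptr_linkText;
	int32_t size_linkText; // includes the trailing NUL
};

// write the size, then the bytes; the receiver re-points
// ptr_linkText at its spot inside the received blob
static int32_t serializeSketch ( const ReplySketch &r , char *buf ) {
	char *p = buf;
	memcpy ( p , &r.size_linkText , sizeof(int32_t) );
	p += sizeof(int32_t);
	memcpy ( p , r.ptr_linkText , r.size_linkText );
	p += r.size_linkText;
	return (int32_t)(p - buf);
}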

@ -327,17 +327,15 @@ bool Msg39::controlLoop ( ) {
// fix it
m_r->m_minDocId = d0;
m_r->m_maxDocId = d1; // -1; // exclude d1
// allow posdbtable re-initialization each time to set
// the msg2 termlist ptrs anew, otherwise we core in
// call to PosdbTable::init() below
//m_posdbTable.m_initialized = false;
// reset ourselves, partially, anyway, not tmpq etc.
reset2();
// debug log
if ( ! m_r->m_forSectionStats && m_debug )
if ( m_debug ) {
log("msg39: docid split %d/%d range %"INT64"-%"INT64"", m_docIdSplitNumber-1, m_r->m_numDocIdSplits, d0,d1);
// wtf?
//if ( d0 >= d1 ) break;
}
// load termlists for these docid ranges using msg2 from posdb
if ( ! getLists() ) return false;
}
@ -1068,7 +1066,7 @@ void Msg39::estimateHitsAndSendReply ( ) {
mr.size_pairScoreBuf = 0;
mr.size_singleScoreBuf = 0;
}
//mr.m_sectionStats = pt->m_sectionStats;
// reserve space for these guys, we fill them in below
mr.ptr_docIds = NULL;
mr.ptr_scores = NULL;

@ -53,14 +53,11 @@ class Msg39Request {
ptr_readSizes = NULL;
ptr_query = NULL; // in utf8?
ptr_whiteList = NULL;
//ptr_coll = NULL;
m_forSectionStats = false;
size_readSizes = 0;
size_query = 0;
size_whiteList = 0;
m_sameLangWeight = 20.0;
m_maxFacets = -1;
//size_coll = 0;
m_getDocIdScoringInfo = 1;
@ -115,11 +112,6 @@ class Msg39Request {
char m_useQueryStopWords;
char m_doMaxScoreAlgo;
char m_forSectionStats;
// Msg3a still uses this
//int32_t m_myFacetVal32; // for gbfacet:xpathsite really sectionstats
collnum_t m_collnum;
int64_t m_minDocId;

@ -131,13 +131,6 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
log(LOG_LOGIC,"net: bad collection. msg3a. %"INT32"",
(int32_t)m_r->m_collnum);
//m_indexdbSplit = g_hostdb.m_indexSplits;
// certain query term, like, gbdom:xyz.com, are NOT split
// at all in order to keep performance high because such
// terms are looked up by the spider. if a query contains
// multiple "no split" terms, then it becomes split unfortunately...
//if ( ! m_q->isSplit() ) m_indexdbSplit = 1;
// for a sanity check in Msg39.cpp
r->m_nqt = m_q->getNumTerms();
@ -154,10 +147,7 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
// . return now if query empty, no docids, or none wanted...
// . if query terms = 0, might have been "x AND NOT x"
if ( m_q->getNumTerms() <= 0 ) return true;
// sometimes we want to get section stats from the hacked
// sectionhash: posdb termlists
//if ( m_docsToGet <= 0 && ! m_r->m_getSectionStats )
// return true;
// . set g_errno if not found and return true
// . coll is null terminated
CollectionRec *cr = g_collectiondb.getRec(r->m_collnum);
@ -234,24 +224,17 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
// update our read info
for ( int32_t j = 0; j < n ; j++ ) {
// the read size for THIS query term
int32_t rs = 300000000; // toRead; 300MB i guess...
// limit to 50MB man! this was 30MB but the
// 'time enough for love' query was hitting 30MB termlists.
//rs = 50000000;
rs = DEFAULT_POSDB_READSIZE;//90000000; // 90MB!
// it is better to go oom than leave users scratching their
// heads as to why some results are not being returned.
// no, because we are going out of mem for queries like
// 'www.disney.nl' etc.
//rs = -1;
// if section stats, limit to 1MB
//if ( m_r->m_getSectionStats ) rs = 1000000;
int32_t rs = DEFAULT_POSDB_READSIZE;//90000000; // 90MB!
// get the jth query term
QueryTerm *qt = &m_q->m_qterms[j];
// if query term is ignored, skip it
if ( qt->m_ignored ) rs = 0;
// set it
readSizes[j] = rs;
// serialize these too
tfw[j] = qt->m_termFreqWeight;
}
@ -265,8 +248,7 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
// Query::expandQuery() above
m_r->ptr_query = m_q->m_orig;
m_r->size_query = m_q->m_origLen+1;
// the white list now too...
//m_r->ptr_whiteList = si->m_whiteListBuf.getBufStart();
// free us?
if ( m_rbufPtr && m_rbufPtr != m_rbuf ) {
mfree ( m_rbufPtr , m_rbufSize, "Msg3a" );
@ -314,7 +296,7 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
// high because it is a spider time thing.
if ( m_r->m_timeout > 0 ) {
timeout = m_r->m_timeout;
timeout += 250; //add 250ms for general overhead
timeout += g_conf.m_msg3a_msg39_network_overhead;
}
if ( timeout > multicast_msg3a_maximum_timeout )
timeout = multicast_msg3a_maximum_timeout;
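The hardcoded 250ms fudge becomes the configurable g_conf.m_msg3a_msg39_network_overhead added in the Conf.h hunk above, still clamped to the multicast maximum. A minimal sketch of the resulting computation (timeout constants from the Multicast hunk below; the standalone function is hypothetical):

#include <cstdint>

static const int64_t multicast_msg3a_default_timeout = 10000; // ms
static const int64_t multicast_msg3a_maximum_timeout = 60000; // ms

// requestTimeout <= 0 means "not set"; overheadMs is
// g_conf.m_msg3a_msg39_network_overhead
static int64_t msg3aTimeout ( int64_t requestTimeout , int64_t overheadMs ) {
	int64_t timeout = multicast_msg3a_default_timeout;
	if ( requestTimeout > 0 )
		timeout = requestTimeout + overheadMs;
	if ( timeout > multicast_msg3a_maximum_timeout )
		timeout = multicast_msg3a_maximum_timeout;
	return timeout;
}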
@ -774,64 +756,6 @@ bool Msg3a::mergeLists ( ) {
//m_totalDocCount = 0; // int32_t docCount = 0;
m_moreDocIdsAvail = true;
/*
this version is too simple. now each query term can be a
gbfacet:price or gbfacet:type term and each has a
list in the Msg39Reply::ptr_facetHashList for its termid
//
// compile facet stats
//
for ( int32_t j = 0; j < m_numHosts ; j++ ) {
Msg39Reply *mr =m_reply[j];
// one table for each query term
char *p = mr->ptr_facetHashList;
// loop over all query terms
int32_t n = m_q->getNumTerms();
// use this
HashTableX tmp;
// do the loop
for ( int32_t i = 0 ; i < n ; i++ ) {
// size of it
int32_t psize = *(int32_t *)p;
p += 4;
tmp.deserialize ( p , psize );
p += psize;
// now compile the stats into a master table
for ( int32_t k = 0 ; k < tmp.m_numSlots ; k++ ) {
if ( ! tmp.m_flags[k] ) continue;
// get the value
int32_t v32 = *(int32_t *)tmp.getKeyFromSlot(k);
// and how many of them there where
int32_t count = *(int32_t *)tmp.getValueFromSlot(k);
// add to master
master.addScore32 ( v32 , count );
}
}
}
////////
//
// now set m_facetStats
//
////////
// add up all counts
int64_t count = 0LL;
for ( int32_t i = 0 ; i < master.getNumSlots() ; i++ ) {
if ( ! master.m_flags[i] ) continue;
int64_t slotCount = *(int32_t *)master.getValueFromSlot(i);
int32_t h32 = *(int32_t *)master.getKeyFromSlot(i);
if ( h32 == m_r->m_myFacetVal32 )
m_facetStats.m_myValCount = slotCount;
count += slotCount;
}
m_facetStats.m_totalUniqueValues = master.getNumUsedSlots();
m_facetStats.m_totalValues = count;
*/
// shortcut
//int32_t numSplits = m_numHosts;//indexdbSplit;
// . point to the various docids, etc. in each shard reply
// . tcPtr = term count. how many required query terms does the doc
@ -920,11 +844,6 @@ bool Msg3a::mergeLists ( ) {
for ( int32_t j = 0; j < m_numQueriedHosts ; j++ ) {
Msg39Reply *mr =m_reply[j];
if ( ! mr ) continue;
//SectionStats *src = &mr->m_sectionStats;
//dst->m_onSiteDocIds += src->m_onSiteDocIds;
//dst->m_offSiteDocIds += src->m_offSiteDocIds;
//dst->m_totalMatches += src->m_totalMatches;
//dst->m_totalEntries += src->m_totalEntries;
// now the list should be the unique site hashes that
// had the section hash. we need to uniquify them again
// here.
@ -1036,7 +955,6 @@ bool Msg3a::mergeLists ( ) {
if ( ! sortFacetEntries() )
return true;
//if ( m_r->m_getSectionStats ) return true;
//
// HACK: END section stats merge
//

Msg3a.h (11 changed lines)

@ -145,17 +145,6 @@ public:
// when merging this list of docids into a final list keep
// track of the cursor into m_docIds[]
int32_t m_cursor;
// what collection # are these docids from if m_collnums[] is NULL
//collnum_t m_collnum;
// we don't have FacetStats because we have the actual
// Msg39Reply::ptr_facetHashList from each shard which contains
// all the facet hash lists for each gbfacet: query term we had
// and the query "Msg3a::m_q.m_qterms[].m_dt" is the hash table
// where each key is a facethash for that gbfacet:xxxx term and
// the value if the # of occurences.
//SectionStats m_sectionStats;
};
#endif

Msg40.cpp (2164 changed lines)

File diff suppressed because it is too large.

Msg40.h (109 changed lines)

@ -14,78 +14,12 @@
#include "Msg39.h" // getTermFreqs()
#include "Msg20.h" // for getting summary from docId
#include "Msg17.h" // a distributed cache of serialized/compressed Msg40s
//#include "Msg2b.h" // for generating directories
//#include "IndexReadInfo.h" // STAGE0,...
#include "Msg3a.h"
#include "PostQueryRerank.h"
// replace CollectionRec::m_maxDocIdsToCompute with this
//#define MAXDOCIDSTOCOMPUTE 500000
// make it 2B now. no reason to limit it so low.
#define MAXDOCIDSTOCOMPUTE 2000000000
#define MAX_GIGABIT_WORDS 10
class Gigabit {
public:
char *m_term;
int32_t m_termLen;
int64_t m_termId64;
float m_gbscore;
int32_t m_minPop;
int32_t m_numWords;
int32_t m_numPages;
int64_t m_lastDocId;
// the wordids of the words in the gigabit (m_numWords of them)
int64_t m_wordIds[MAX_GIGABIT_WORDS];
};
//
// TODO: add Gigabit::m_firstFastFactOffset..
//
#define MAX_GIGABIT_PTRS 10
class Fact {
public:
// offset of the gigabit in m_gigabitBuf we belong to
int32_t m_gigabitOffset;
// . the sentence contaning the gigabit and a lot of the query terms
// . ptr refrences into Msg20Reply::ptr_gigabitSample buffers
char *m_fact;
int32_t m_factLen;
float m_gigabitModScore;
float m_queryScore;
float m_maxGigabitModScore; // gigabitscore * #pagesItIsOn
int32_t m_numGigabits;
char m_printed;
class Gigabit *m_gigabitPtrs[MAX_GIGABIT_PTRS];
int32_t m_numQTerms;
int64_t m_docId; // from where it came
Msg20Reply *m_reply; // reply from where it came
// for deduping sentences
char m_dedupVector[SAMPLE_VECTOR_SIZE]; // 128
};
class GigabitInfo {
public:
int32_t m_pts;
uint32_t m_hash;
int32_t m_pop;
int32_t m_count;
int32_t m_numDocs;
int64_t m_lastDocId;
int32_t m_currentDocCount;
char *m_ptr;
int32_t m_len;
};
static const int64_t msg40_msg39_timeout = 5000; //timeout for entire get-docid-list phase, in milliseconds.
class Msg40 {
public:
@ -116,21 +50,6 @@ class Msg40 {
bool prepareToGetDocIds ( );
bool getDocIds ( bool recall );
bool computeGigabits( class TopicGroup *tg );
SafeBuf m_gigabitBuf;
// nuggabits...
bool computeFastFacts ( );
bool addFacts ( HashTableX *queryTable,
HashTableX *gbitTable ,
char *pstart,
char *pend,
bool debugGigabits ,
class Msg20Reply *reply,
SafeBuf *factBuf ) ;
SafeBuf m_factBuf;
// keep these public since called by wrapper functions
bool federatedLoop ( ) ;
bool gotDocIds ( ) ;
@ -181,14 +100,7 @@ class Msg40 {
bool moreResultsFollow ( ) {return m_moreToCome; };
time_t getCachedTime ( ) {return m_cachedTime; };
int32_t getNumGigabits (){return m_gigabitBuf.length()/sizeof(Gigabit);};
Gigabit *getGigabit ( int32_t i ) {
Gigabit *gbs = (Gigabit *)m_gigabitBuf.getBufStart();
return &gbs[i];
};
int64_t *getDocIdPtr() { return m_msg3a.m_docIds; }
int64_t *getDocIdPtr() { return m_msg3a.m_docIds; }
bool printSearchResult9 ( int32_t ix , int32_t *numPrintedSoFar ,
class Msg20Reply *mr ) ;
@ -277,15 +189,10 @@ class Msg40 {
char *m_cachePtr;
int32_t m_cacheSize;
//int32_t m_maxDocIdsToCompute;
// count summary replies (msg20 replies) we get
int32_t m_numRequests;
int32_t m_numReplies;
// we launched all docids from 0 to m_maxiLaunched
//int32_t m_maxiLaunched;
// true if more results follow these
bool m_moreToCome;
@ -303,12 +210,6 @@ class Msg40 {
bool m_cachedResults;
time_t m_cachedTime;
// gigabits
//Msg24 m_msg24;
// references
//Msg1a m_msg1a;
int32_t m_tasksRemaining;
int32_t m_printCount;
@ -334,14 +235,6 @@ class Msg40 {
SearchInput *m_si;
// for topic clustering, saved from CollectionRec
int32_t m_topicSimilarCutoff;
int32_t m_docsToScanForTopics;
// Msg2b for generating a directory
//Msg2b m_msg2b;
bool mergeDocIdsIntoBaseMsg3a();
int32_t m_numCollsToSearch;
class Msg3a **m_msg3aPtrs;

@ -1178,29 +1178,6 @@ bool Msg5::gotList2 ( ) {
// . why???
if ( m_totalSize < 32*1024 ) goto skipThread;
// if we are an interruptible niceness 1, do not use a thread,
// we can be interrupted by the alarm callback and serve niceness
// 0 requests, that is probably better! although the resolution is
// on like 10ms on those alarms... BUT if you use a smaller
// mergeBufSize of like 100k, that might make it responsive enough!
// allow it to do a thread again so we can take advantage of
// multiple cores, or hyperthreads i guess because i am seeing
// some missed quickpoll log msgs, i suppose because we did not
// insert QUICKPOLL() statements in the RdbList::merge_r() code
//if ( m_niceness >= 1 ) goto skipThread;
// super duper hack!
//if ( m_rdbId == RDB_REVDB ) goto skipThread;
// i'm not sure why we core in Msg5's call to RdbList::merge_r().
// the list appears to be corrupt...
//if ( m_rdbId == RDB_FACEBOOKDB ) goto skipThread;
// skip it for now
//goto skipThread;
//m_waitingForMerge = true;
// . if size is big, make a thread
// . let's always make niceness 0 since it wasn't being very
// aggressive before

@ -854,8 +854,6 @@ void sleepWrapper1 ( int bogusfd , void *state ) {
case 0x20: if ( elapsed < 5000 ) return; break;
// msg 0x20 calls this to get the title rec
case 0x22: if ( elapsed < 1000 ) return; break;
// Msg23 niceness 0 is only for doing &rerank=X queries
//case 0x23: if ( elapsed < 100000 ) return; break;
// a request to get the score of a docid, can be *very* intensive
case 0x3b: if ( elapsed < 500000 ) return; break;
// related topics request, calls many Msg22 to get titlerecs...
@ -868,21 +866,6 @@ void sleepWrapper1 ( int bogusfd , void *state ) {
// performance reasons, cuz we do pretty good load balancing
// and when things get saturated, rerouting exacerbates it
if ( elapsed < 8000 ) return; break;
// how many bytes were requested?
/*
if ( THIS->m_msg ) nb=*(int32_t *)(THIS->m_msg + sizeof(key_t)*2);
else nb=2000000;
// . givem 300ms + 1ms per 5000 bytes
// . a 6M read would be allowed 1500ms before re-routing
// . a 1M read would be allowed 500ms
// . a 100k read would be allowed 320ms
ta = 300 + nb / 5000;
// limit it
if ( ta < 100 ) ta = 100;
if ( ta > 9000 ) ta = 9000; // could this hurt us?
if ( elapsed < ta ) return;
break;
*/
// msg to get a clusterdb rec
case 0x38: if ( elapsed < 2000 ) return; break;
// msg to get docIds from a query, may take a while

@ -30,7 +30,6 @@ static const int64_t multicast_msg20_summary_timeout = 1500;
static const int64_t multicast_msg1_senddata_timeout = 60000;
static const int64_t multicast_msg3a_default_timeout = 10000;
static const int64_t multicast_msg3a_maximum_timeout = 60000;
static const int64_t multicast_xmldoc_sectionstats_timeout = 30000;
static const int64_t multicast_msg1c_getip_default_timeout = 60000;

@ -8,9 +8,8 @@
#include "SpiderLoop.h"
#include "PageResults.h" // for RESULT_HEIGHT
#include "Stats.h"
#include "PageRoot.h"
bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr ,
bool printGigablast ) ;
// 5 seconds
#define DEFAULT_WIDGET_RELOAD 1000

@ -242,8 +242,6 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
// no summary similarity dedup, only exact
// doc content hash. otherwise too slow!!
"pss=0&"
// no gigabits
"dsrt=0&"
// do not compute summary. 0 lines.
"ns=0&"
"q=gbsortby%%3Agbspiderdate&"
@ -282,8 +280,6 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
// no summary similarity dedup, only exact
// doc content hash. otherwise too slow!!
"pss=0&"
// no gigabits
"dsrt=0&"
// do not compute summary. 0 lines.
"ns=0&"
//"q=gbsortby%%3Agbspiderdate&"
@ -321,8 +317,6 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
// no summary similarity dedup, only exact
// doc content hash. otherwise too slow!!
"pss=0&"
// no gigabits
"dsrt=0&"
// do not compute summary. 0 lines.
"ns=0&"
"q=gbsortby%%3Agbspiderdate&"
@ -372,8 +366,6 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
// no summary similarity dedup, only exact
// doc content hash. otherwise too slow!!
"pss=0&"
// no gigabits
"dsrt=0&"
// do not compute summary. 0 lines.
//"ns=0&"
"q=gbrevsortbyint%%3AgbssSpiderTime+"

@ -5,6 +5,86 @@
//#include "IndexTable2.h"
//#include "XmlDoc.h" // addCheckboxSpan()
class State8 {
public:
TopTree m_topTree;
//Msg16 m_msg16;
//Msg14 m_msg14;
//Msg15 m_msg15;
Msg22 m_msg22;
SafeBuf m_dbuf;
//XmlDoc m_doc;
//Url m_url;
//Url m_rootUrl;
char *m_u;
int32_t m_ulen;
bool m_applyRulesetToRoot;
char m_rootQuality;
int32_t m_reparseRootRetries;
char m_coll[MAX_COLL_LEN];
int32_t m_collLen;
//int32_t m_sfn;
//int32_t m_urlLen;
TcpSocket *m_s;
bool m_isLocal;
char m_pwd[32];
HttpRequest m_r;
int32_t m_old;
// recycle the link info from the title rec?
int32_t m_recycle;
// recycle the link info that was imported from another coll?
int32_t m_recycle2;
int32_t m_render;
char m_recompute;
int32_t m_oips;
char m_linkInfoColl[11];
// char m_buf[16384 * 1024];
//int32_t m_page;
// m_pbuf now points to m_sbuf if we are showing the parsing junk
SafeBuf m_xbuf;
SafeBuf m_wbuf;
bool m_donePrinting;
//SafeBuf m_sbuf;
// this is a buffer which cats m_sbuf into it
//SafeBuf m_sbuf2;
// new state vars for Msg3b.cpp
int64_t m_docId;
void *m_state ;
void (* m_callback) (void *state);
Query m_tq;
Query *m_q;
int64_t *m_termFreqs;
float *m_termFreqWeights;
float *m_affWeights;
//score_t m_total;
bool m_freeIt;
bool m_blocked;
// these are from rearranging the code
int32_t m_indexCode;
//uint64_t m_chksum1;
int64_t m_took1;
int64_t m_took1b;
int64_t m_took2;
int64_t m_took3;
char m_didRootDom;
char m_didRootWWW;
char m_wasRootDom;
// call Msg16 with a version of title rec to do
int32_t m_titleRecVersion;
char m_hopCount;
//TitleRec m_tr;
//XmlDoc m_oldDoc;
XmlDoc m_xd;
};
bool g_inPageParser = false;
bool g_inPageInject = false;
@ -12,6 +92,17 @@ bool g_inPageInject = false;
static bool processLoop ( void *state ) ;
static bool gotXmlDoc ( void *state ) ;
static bool sendErrorReply ( void *state , int32_t err ) ;
static bool sendPageParser2 ( TcpSocket *s ,
HttpRequest *r ,
class State8 *st ,
int64_t docId ,
Query *q ,
int64_t *termFreqs ,
float *termFreqWeights ,
float *affWeights ,
void *state ,
void (* callback)(void *state) ) ;
// . returns false if blocked, true otherwise
// . sets g_errno on error
@ -26,19 +117,19 @@ bool sendPageParser ( TcpSocket *s , HttpRequest *r ) {
// . a new interface so Msg3b can call this with "s" set to NULL
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageParser2 ( TcpSocket *s ,
HttpRequest *r ,
State8 *st ,
int64_t docId ,
Query *q ,
// in query term space, not imap space
int64_t *termFreqs ,
// in imap space
float *termFreqWeights ,
// in imap space
float *affWeights ,
void *state ,
void (* callback)(void *state) ) {
static bool sendPageParser2 ( TcpSocket *s ,
HttpRequest *r ,
State8 *st ,
int64_t docId ,
Query *q ,
// in query term space, not imap space
int64_t *termFreqs ,
// in imap space
float *termFreqWeights ,
// in imap space
float *affWeights ,
void *state ,
void (* callback)(void *state) ) {
//log("parser: read sock=%"INT32"",s->m_sd);

@ -16,95 +16,4 @@ extern bool g_inPageInject ;
bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) ;
bool sendPageParser2 ( TcpSocket *s ,
HttpRequest *r ,
class State8 *st ,
int64_t docId ,
Query *q ,
int64_t *termFreqs ,
float *termFreqWeights ,
float *affWeights ,
void *state ,
void (* callback)(void *state) ) ;
class State8 {
public:
TopTree m_topTree;
//Msg16 m_msg16;
//Msg14 m_msg14;
//Msg15 m_msg15;
Msg22 m_msg22;
SafeBuf m_dbuf;
//XmlDoc m_doc;
//Url m_url;
//Url m_rootUrl;
char *m_u;
int32_t m_ulen;
bool m_applyRulesetToRoot;
char m_rootQuality;
int32_t m_reparseRootRetries;
char m_coll[MAX_COLL_LEN];
int32_t m_collLen;
//int32_t m_sfn;
//int32_t m_urlLen;
TcpSocket *m_s;
bool m_isLocal;
char m_pwd[32];
HttpRequest m_r;
int32_t m_old;
// recycle the link info from the title rec?
int32_t m_recycle;
// recycle the link info that was imported from another coll?
int32_t m_recycle2;
int32_t m_render;
char m_recompute;
int32_t m_oips;
char m_linkInfoColl[11];
// char m_buf[16384 * 1024];
//int32_t m_page;
// m_pbuf now points to m_sbuf if we are showing the parsing junk
SafeBuf m_xbuf;
SafeBuf m_wbuf;
bool m_donePrinting;
//SafeBuf m_sbuf;
// this is a buffer which cats m_sbuf into it
//SafeBuf m_sbuf2;
// new state vars for Msg3b.cpp
int64_t m_docId;
void *m_state ;
void (* m_callback) (void *state);
Query m_tq;
Query *m_q;
int64_t *m_termFreqs;
float *m_termFreqWeights;
float *m_affWeights;
//score_t m_total;
bool m_freeIt;
bool m_blocked;
// these are from rearranging the code
int32_t m_indexCode;
//uint64_t m_chksum1;
int64_t m_took1;
int64_t m_took1b;
int64_t m_took2;
int64_t m_took3;
char m_didRootDom;
char m_didRootWWW;
char m_wasRootDom;
// call Msg16 with a version of title rec to do
int32_t m_titleRecVersion;
char m_hopCount;
//TitleRec m_tr;
//XmlDoc m_oldDoc;
XmlDoc m_xd;
};
#endif
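Net effect of this hunk plus the PageParser.cpp hunk above: State8 and sendPageParser2() leave the header and become private to PageParser.cpp, with the function gaining internal linkage via static. The before/after shape, as a sketch with types elided:

// PageParser.h -- public surface only
class TcpSocket; class HttpRequest; class Query;
bool sendPageParser ( TcpSocket *s , HttpRequest *r );
bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r );

// PageParser.cpp -- implementation details, no longer visible
// to other translation units
#include <cstdint>
class State8 { /* ...fields moved here from the header... */ };
static bool sendPageParser2 ( TcpSocket *s , HttpRequest *r , State8 *st ,
                              int64_t docId , Query *q ,
                              int64_t *termFreqs , float *termFreqWeights ,
                              float *affWeights , void *state ,
                              void (* callback)(void *state) );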

@ -15,7 +15,6 @@
#include "Msg40.h"
#include "sort.h"
#include "Spider.h"
#include "Revdb.h"
#include "XmlDoc.h"
#include "PageInject.h" // Msg7
#include "PageReindex.h"

@ -42,64 +42,4 @@ public:
Query m_qq;
};
/*
// . for indexing tags for events after you add to tagdb
// . created so zak can very quickly tag eventids that are already indexed
// . will just add the tag terms directly to datedb for the eventid
class Msg1d {
public:
bool updateQuery ( char *query ,
HttpRequest *r,
TcpSocket *sock,
char *coll ,
int32_t startNum ,
int32_t endNum ,
void *state ,
void (* callback) (void *state ) ) ;
bool updateTagTerms ( ) ;
bool getMetaList ( int64_t docId ,
int32_t eventId ,
TagRec *egr ,
RdbList *oldList ,
int32_t niceness ,
SafeBuf *addBuf ) ;
void *m_state;
void (* m_callback) (void *state);
Msg40 m_msg40;
SearchInput m_si;
int32_t m_startNum;
int32_t m_endNum;
int32_t m_numDocIds;
int32_t m_i;
Msg12 m_msg12;
Msg8a m_msg8a;
Msg0 m_msg0;
char *m_coll;
int32_t m_niceness;
TagRec m_tagRec;
RdbList m_revdbList;
SafeBuf m_addBuf;
SafeBuf m_rr;
char *m_metaList;
int32_t m_metaListSize;
Msg4 m_msg4;
Query m_qq;
Url m_fakeUrl;
int32_t m_gotLock;
int32_t m_gotTagRec;
int32_t m_gotRevdbRec;
int32_t m_madeList;
int32_t m_addedList;
int32_t m_removeLock;
int32_t m_flushedList;
};
*/
#endif

@ -19,17 +19,15 @@
#include "LanguageIdentifier.h"
#include "CountryCode.h"
#include "Unicode.h"
#include "XmlDoc.h" // GigabitInfo class
#include "Posdb.h" // MAX_TOP definition
#include "PageResults.h"
#include "PageRoot.h"
#include "Proxy.h"
static bool printSearchFiltersBar ( SafeBuf *sb , HttpRequest *hr ) ;
static bool printMenu ( SafeBuf *sb , int32_t menuNum , HttpRequest *hr ) ;
//static void gotSpellingWrapper ( void *state ) ;
static void gotResultsWrapper ( void *state ) ;
//static void gotAdsWrapper ( void *state ) ;
static void gotState ( void *state ) ;
static bool gotResults ( void *state ) ;
@ -163,34 +161,6 @@ bool sendReply ( State0 *st , char *reply ) {
mdelete(st, sizeof(State0), "PageResults2");
delete st;
/*
if ( format == FORMAT_XML ) {
SafeBuf sb;
sb.safePrintf("<?xml version=\"1.0\" "
"encoding=\"UTF-8\" ?>\n"
"<response>\n"
"\t<errno>%"INT32"</errno>\n"
"\t<errmsg>%s</errmsg>\n"
"</response>\n"
,(int32_t)savedErr
,mstrerror(savedErr)
);
// clear it for sending back
g_errno = 0;
// send back as normal reply
g_httpServer.sendDynamicPage(s,
sb.getBufStart(),
sb.length(),
0, // cachetime in secs
false, // POSTReply?
ct,
-1, // httpstatus -1 -> 200
NULL, // cookieptr
charset );
return true;
}
*/
// if we had a broken pipe from the browser while sending
// them the search results, then we end up closing the socket fd
// in TcpServer::sendChunk() > sendMsg() > destroySocket()
@ -332,15 +302,6 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
// propagate "n"
int32_t n = hr->getLong("n",-1);
if ( n >= 0 ) sb.safePrintf("&n=%"INT32"",n);
// Docs to Scan for Related Topics
int32_t dsrt = hr->getLong("dsrt",-1);
if ( dsrt >= 0 ) sb.safePrintf("&dsrt=%"INT32"",dsrt);
// debug gigabits?
int32_t dg = hr->getLong("dg",-1);
if ( dg >= 0 ) sb.safePrintf("&dg=%"INT32"",dg);
// show gigabits?
//int32_t gb = hr->getLong("gigabits",1);
//if ( gb >= 1 ) sb.safePrintf("&gigabits=%"INT32"",gb);
// show banned results?
int32_t showBanned = hr->getLong("sb",0);
if ( showBanned ) sb.safePrintf("&sb=1");
@ -367,12 +328,6 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
, h32
, rand64
);
//
// . login bar
// . proxy will replace it byte by byte with a login/logout
// link etc.
//
//g_proxy.insertLoginBarDirective(&sb);
//
// logo header
@ -392,23 +347,6 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
"}}\n"
// gigabit unhide function
"function ccc ( gn ) {\n"
"var e = document.getElementById('fd'+gn);\n"
"var f = document.getElementById('sd'+gn);\n"
"if ( e.style.display == 'none' ){\n"
"e.style.display = '';\n"
"f.style.display = 'none';\n"
"}\n"
"else {\n"
"e.style.display = 'none';\n"
"f.style.display = '';\n"
"}\n"
"}\n"
"</script>\n"
// put search results into this div
"<div id=results>"
"<img height=50 width=50 "
@ -623,259 +561,6 @@ void gotState ( void *state ){
gotResults ( state );
}
// print all sentences containing this gigabit (fast facts) (nuggabits)
static bool printGigabitContainingSentences ( State0 *st,
SafeBuf *sb ,
Msg40 *msg40 ,
Gigabit *gi ,
SearchInput *si ,
Query *gigabitQuery ,
int32_t gigabitId ) {
char format = si->m_format;
HttpRequest *hr = &st->m_hr;
CollectionRec *cr = si->m_cr;//g_collectiondb.getRec(collnum );
int32_t numOff;
int32_t revert;
int32_t spaceOutOff;
if ( format == FORMAT_HTML ) {
sb->safePrintf("<nobr><b>");
// make a new query
sb->safePrintf("<a href=\"/search?c=%s&q=",cr->m_coll);
sb->urlEncode(gi->m_term,gi->m_termLen);
sb->safeMemcpy("+|+",3);
char *q = hr->getString("q",NULL,"");
sb->urlEncode(q);
sb->safePrintf("\">");
sb->safeMemcpy(gi->m_term,gi->m_termLen);
sb->safePrintf("</a></b>");
sb->safePrintf(" <font color=gray size=-1>");
numOff = sb->m_length;
sb->safePrintf(" ");//,gi->m_numPages);
sb->safePrintf("</font>");
sb->safePrintf("</b>");
revert = sb->length();
sb->safePrintf("<font color=blue style=align:right;>"
"<a style=cursor:hand;cursor:pointer; "
"onclick=ccc(%"INT32");>"
, gigabitId // s_gigabitCount
);
spaceOutOff = sb->length();
sb->safePrintf( "%c%c%c",
0xe2,
0x87,
0x93);
sb->safePrintf(//"[more]"
"</a></font>");
sb->safePrintf("</nobr>"); // <br>
}
if ( format == FORMAT_XML ) {
sb->safePrintf("\t\t<gigabit>\n");
sb->safePrintf("\t\t\t<term><![CDATA[");
sb->cdataEncode(gi->m_term,gi->m_termLen);
sb->safePrintf("]]></term>\n");
sb->safePrintf("\t\t\t<score>%f</score>\n",gi->m_gbscore);
sb->safePrintf("\t\t\t<minPop>%"INT32"</minPop>\n",gi->m_minPop);
}
if ( format == FORMAT_JSON ) {
sb->safePrintf("\t{\n");
//sb->safePrintf("\t\"gigabit\":{\n");
sb->safePrintf("\t\t\"term\":\"");
sb->jsonEncode(gi->m_term,gi->m_termLen);
sb->safePrintf("\",\n");
sb->safePrintf("\t\t\"score\":%f,\n",gi->m_gbscore);
sb->safePrintf("\t\t\"minPop\":%"INT32",\n",gi->m_minPop);
}
// get facts
int32_t numNuggets = 0;
int32_t numFacts = msg40->m_factBuf.length() / sizeof(Fact);
Fact *facts = (Fact *)msg40->m_factBuf.getBufStart();
bool first = true;
bool second = false;
bool printedSecond = false;
//int64_t lastDocId = -1LL;
int32_t saveOffset = 0;
for ( int32_t i = 0 ; i < numFacts ; i++ ) {
Fact *fi = &facts[i];
// if printed for a higher scoring gigabit, skip
if ( fi->m_printed ) continue;
// check gigabit match
int32_t k; for ( k = 0 ; k < fi->m_numGigabits ; k++ )
if ( fi->m_gigabitPtrs[k] == gi ) break;
// skip this fact/sentence if it does not contain the gigabit
if ( k >= fi->m_numGigabits ) continue;
// do not print unless it ends with the '*' sentence marker
char *s = fi->m_fact;
char *e = s + fi->m_factLen;
if ( e[-1] != '*' ) continue;
e--;
again:
// first time, print in the single fact div
if ( first && format == FORMAT_HTML ) {
sb->safePrintf("<div "
//"style=\"border:1px lightgray solid;\"
"id=fd%"INT32">",gigabitId);//s_gigabitCount);
}
if ( second && format == FORMAT_HTML ) {
sb->safePrintf("<div style=\"max-height:300px;"
"display:none;"
"overflow-x:hidden;"
"overflow-y:auto;"//scroll;"
//"border:1px lightgray solid; "
"\" "
"id=sd%"INT32">",gigabitId);//s_gigabitCount);
printedSecond = true;
}
Msg20Reply *reply = fi->m_reply;
// ok, print it out
if ( ! first && ! second && format == FORMAT_HTML ) {
sb->safePrintf("<br><br>\n");
}
numNuggets++;
// let's highlight with gigabits and query terms
SafeBuf tmpBuf;
Highlight h;
h.set ( &tmpBuf , s , e - s , gigabitQuery, "<u>", "</u>", 0 );
// now highlight the original query as well but in black bold
SafeBuf tmpBuf2;
h.set ( &tmpBuf2, tmpBuf.getBufStart(), tmpBuf.length(), &si->m_q, "<b>", "</b>", 0 );
int32_t dlen; char *dom = getDomFast(reply->ptr_ubuf,&dlen);
// print the sentence
if ( format == FORMAT_HTML )
sb->safeStrcpy(tmpBuf2.getBufStart());
if ( format == FORMAT_XML ) {
sb->safePrintf("\t\t\t<instance>\n"
"\t\t\t\t<sentence><![CDATA[");
sb->cdataEncode(tmpBuf2.getBufStart());
sb->safePrintf("]]></sentence>\n");
sb->safePrintf("\t\t\t\t<url><![CDATA[");
sb->cdataEncode(reply->ptr_ubuf);
sb->safePrintf("]]></url>\n");
sb->safePrintf("\t\t\t\t<domain><![CDATA[");
sb->cdataEncode(dom,dlen);
sb->safePrintf("]]></domain>\n");
sb->safePrintf("\t\t\t</instance>\n");
}
if ( format == FORMAT_JSON ) {
sb->safePrintf("\t\t\"instance\":{\n"
"\t\t\t\"sentence\":\"");
sb->jsonEncode(tmpBuf2.getBufStart());
sb->safePrintf("\",\n");
sb->safePrintf("\t\t\t\"url\":\"");
sb->jsonEncode(reply->ptr_ubuf);
sb->safePrintf("\",\n");
sb->safePrintf("\t\t\t\"domain\":\"");
sb->jsonEncode(dom,dlen);
sb->safePrintf("\"\n");
sb->safePrintf("\t\t},\n");
}
fi->m_printed = 1;
saveOffset = sb->length();
if ( format == FORMAT_HTML ) {
sb->safePrintf(" <a href=/get?c=%s&cnsp=0&"
"strip=0&d=%"INT64">",
cr->m_coll,reply->m_docId);
sb->safeMemcpy(dom,dlen);
sb->safePrintf("</a>\n");
sb->safePrintf("</div>");
}
if ( second ) {
second = false;
}
if ( first ) {
first = false;
second = true;
// print first gigabit all over again but in 2nd div
goto again;
}
}
if ( format == FORMAT_XML )
sb->safePrintf("\t\t</gigabit>\n");
if ( format == FORMAT_JSON ) {
// remove last ,\n
sb->m_length -= 2;
// replace with just \n
// end the gigabit
sb->safePrintf("\n\t},\n");
}
// all done if not html
if ( format != FORMAT_HTML )
return true;
// we counted the first one twice since we had to throw it into
// the hidden div too!
if ( numNuggets > 1 ) numNuggets--;
// do not print the double down arrow if no nuggets printed
if ( numNuggets <= 0 ) {
sb->m_length = revert;
sb->safePrintf("</nobr>");
}
// just remove down arrow if only 1...
else if ( numNuggets == 1 ) {
char *dst = sb->getBufStart()+spaceOutOff;
dst[0] = ' ';
dst[1] = ' ';
dst[2] = ' ';
}
// store the # of nuggets in ()'s like (10 )
else {
char tmp[10];
sprintf(tmp,"(%"INT32")",numNuggets);
char *src = tmp;
// start storing the digits where the placeholder spaces were
char *dst = sb->getBufStart()+numOff;
int32_t srcLen = gbstrlen(tmp);
if ( srcLen > 5 ) srcLen = 5;
for ( int32_t k = 0 ; k < srcLen ; k++ )
dst[k] = src[k];
}
if ( printedSecond ) {
sb->safePrintf("</div>");
}
return true;
}
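// [Editor's sketch, not part of the patch] the reveal pattern the loop
// above emits: the first matching sentence is printed twice, once in the
// always-visible div (id=fd<N>) and again at the top of the hidden
// scrollable div (id=sd<N>), so the ccc(N) toggle can swap a one-sentence
// teaser for the full list. SafeBuf calls are the same ones used above.
static void sketchRevealDivs ( SafeBuf *sb , int32_t gigabitId ,
			       char *firstSentence ) {
	// visible teaser, shown until ccc(gigabitId) is clicked
	sb->safePrintf("<div id=fd%"INT32">",gigabitId);
	sb->safeStrcpy(firstSentence);
	sb->safePrintf("</div>");
	// hidden div repeats the first sentence; later ones append here
	sb->safePrintf("<div style=\"display:none;\" id=sd%"INT32">",
		       gigabitId);
	sb->safeStrcpy(firstSentence);
	sb->safePrintf("</div>");
}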
// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
@ -1045,22 +730,6 @@ bool gotResults ( void *state ) {
return true;
}
// defined in PageRoot.cpp
bool expandHtml ( SafeBuf& sb,
char *head ,
int32_t hlen ,
char *q ,
int32_t qlen ,
HttpRequest *r ,
SearchInput *si,
char *method ,
CollectionRec *cr ) ;
bool printLeftColumnRocketAndTabs ( SafeBuf *sb,
bool isSearchResultsPage ,
CollectionRec *cr ,
char *tabName );
bool printLeftNavColumn ( SafeBuf &sb, State0 *st ) {
@ -1135,271 +804,26 @@ bool printLeftNavColumn ( SafeBuf &sb, State0 *st ) {
// . tabName = "search"
printLeftColumnRocketAndTabs ( &sb , true , cr , "search" );
}
//
// BEGIN FACET PRINTING
//
//
// . print out one table for each gbfacet: term in the query
// . LATER: show the text string corresponding to the hash
// by looking it up in the titleRec
//
msg40->printFacetTables ( &sb );
//
// END FACET PRINTING
//
//
// BEGIN FACET PRINTING
//
//
// . print out one table for each gbfacet: term in the query
// . LATER: show the text string corresponding to the hash
// by looking it up in the titleRec
//
if ( format == FORMAT_HTML ) msg40->printFacetTables ( &sb );
//
// END FACET PRINTING
//
//
// BEGIN PRINT GIGABITS
//
SafeBuf *gbuf = &msg40->m_gigabitBuf;
int32_t numGigabits = gbuf->length()/sizeof(Gigabit);
if ( ! st->m_header )
numGigabits = 0;
// print gigabits
Gigabit *gigabits = (Gigabit *)gbuf->getBufStart();
if ( numGigabits && format == FORMAT_XML )
sb.safePrintf("\t<gigabits>\n");
if ( numGigabits && format == FORMAT_JSON )
sb.safePrintf("\"gigabits\":[\n");
if ( numGigabits && format == FORMAT_HTML )
// gigabit unhide function
sb.safePrintf (
"<script>"
"function ccc ( gn ) {\n"
"var e = document.getElementById('fd'+gn);\n"
"var f = document.getElementById('sd'+gn);\n"
"if ( e.style.display == 'none' ){\n"
"e.style.display = '';\n"
"f.style.display = 'none';\n"
"}\n"
"else {\n"
"e.style.display = 'none';\n"
"f.style.display = '';\n"
"}\n"
"}\n"
"</script>\n"
);
if ( numGigabits && format == FORMAT_HTML )
sb.safePrintf("<div id=gigabits "
"style="
"padding:5px;"
"position:relative;"
"border-width:3px;"
"border-right-width:0px;"
"border-style:solid;"
"margin-left:10px;"
"border-top-left-radius:10px;"
"border-bottom-left-radius:10px;"
"border-color:blue;"
"background-color:white;"
"border-right-color:white;"
"margin-right:-3px;"
">"
"<table cellspacing=7>"
"<tr><td width=200px; valign=top>"
"<center><img src=/gigabits40.jpg></center>"
"<br>"
"<br>"
);
Query gigabitQuery;
char tmp[1024];
SafeBuf ttt(tmp, 1024);
// limit it to 40 gigabits for now
for ( int32_t i = 0 ; i < numGigabits && i < 40 ; i++ ) {
Gigabit *gi = &gigabits[i];
ttt.pushChar('\"');
ttt.safeMemcpy(gi->m_term,gi->m_termLen);
ttt.pushChar('\"');
ttt.pushChar(' ');
}
// null terminate it
ttt.nullTerm();
if ( numGigabits > 0 )
gigabitQuery.set2 ( ttt.getBufStart() ,
si->m_queryLangId ,
true , // queryexpansion?
true ); // usestopwords?
for ( int32_t i = 0 ; i < numGigabits ; i++ ) {
//if ( i > 0 && format == FORMAT_HTML )
// sb.safePrintf("<hr>");
//if ( perRow && (i % perRow == 0) )
// sb.safePrintf("</td><td valign=top>");
// print all sentences containing this gigabit
Gigabit *gi = &gigabits[i];
// after the first one, hide the rest behind a "Show more" link
if ( i == 1 && format == FORMAT_HTML ) {
sb.safePrintf("</span><a onclick="
"\""
"var e = "
"document.getElementById('hidegbits');"
"if ( e.style.display == 'none' ){\n"
"e.style.display = '';\n"
"this.innerHtml='Show less';"
"}"
"else {\n"
"e.style.display = 'none';\n"
"this.innerHtml='Show more';\n"
"}\n"
"\" style=cursor:hand;cursor:pointer;>"
"Show more</a>");
sb.safePrintf("<span id=hidegbits "
"style=display:none;>"
"<br><br>");
}
printGigabitContainingSentences( st, &sb, msg40, gi, si, &gigabitQuery, i );
if ( format == FORMAT_HTML )
sb.safePrintf("<br><br>");
}
//if ( numGigabits >= 1 && format == FORMAT_HTML )
if ( numGigabits && format == FORMAT_HTML )
sb.safePrintf("</td></tr></table></div><br>");
if ( numGigabits && format == FORMAT_XML )
sb.safePrintf("\t</gigabits>\n");
if ( numGigabits && format == FORMAT_JSON ) {
// remove ,\n
sb.m_length -=2;
// add back just \n
// end the gigabits array
sb.safePrintf("\n],\n");
}
//
// now print various knobs
//
//
// print date constraint functions now
//
if ( format == FORMAT_HTML && 1 == 2)
sb.safePrintf(
"<div id=best "
"style="
"font-size:14px;"
"padding:5px;"
"position:relative;"
"border-width:3px;"
"border-right-width:0px;"
"border-style:solid;"
"margin-left:10px;"
"border-top-left-radius:10px;"
"border-bottom-left-radius:10px;"
"border-color:blue;"
"background-color:white;"
"border-right-color:white;"
"margin-right:-3px;"
"text-align:right;"
">"
"<b>"
"ANYTIME &nbsp; &nbsp;"
"</b>"
"</div>"
"<br>"
"<div id=newsest "
"style="
"font-size:14px;"
"padding:5px;"
"position:relative;"
"border-width:3px;"
"border-right-width:0px;"
"border-style:solid;"
"margin-left:10px;"
"border-top-left-radius:10px;"
"border-bottom-left-radius:10px;"
"border-color:white;"
"background-color:blue;"
"border-right-color:blue;"
"margin-right:0px;"
"text-align:right;"
"color:white;"
">"
"<b>"
"LAST 24 HOURS &nbsp; &nbsp;"
"</b>"
"</div>"
"<br>"
"<div id=newsest "
"style="
"font-size:14px;"
"padding:5px;"
"position:relative;"
"border-width:3px;"
"border-right-width:0px;"
"border-style:solid;"
"margin-left:10px;"
"border-top-left-radius:10px;"
"border-bottom-left-radius:10px;"
"border-color:white;"
"background-color:blue;"
"border-right-color:blue;"
"margin-right:0px;"
"text-align:right;"
"color:white;"
">"
"<b>"
"LAST 7 DAYS &nbsp; &nbsp;"
"</b>"
"</div>"
"<br>"
"<div id=newsest "
"style="
"font-size:14px;"
"padding:5px;"
"position:relative;"
"border-width:3px;"
"border-right-width:0px;"
"border-style:solid;"
"margin-left:10px;"
"border-top-left-radius:10px;"
"border-bottom-left-radius:10px;"
"border-color:white;"
"background-color:blue;"
"border-right-color:blue;"
"margin-right:0px;"
"text-align:right;"
"color:white;"
">"
"<b>"
"LAST 30 DAYS &nbsp; &nbsp;"
"</b>"
"</div>"
"<br>"
);
//
// now the MAIN column
//
if ( format == FORMAT_HTML )
//
// now the MAIN column
//
sb.safePrintf("\n</TD>"
"<TD valign=top style=padding-left:30px;>\n");
}
return true;
}
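// [Editor's sketch, not part of the patch] equivalent of the "ttt" loop
// above for three placeholder gigabits: quote each term, space-separate,
// then parse the whole thing as one query so that
// printGigabitContainingSentences() can underline any gigabit term.
static void sketchGigabitQuery ( SearchInput *si , Query *gigabitQuery ) {
	SafeBuf ttt;
	// quoted, space-separated gigabit terms, as built above
	ttt.safePrintf("\"solar power\" \"grid storage\" \"subsidies\" ");
	ttt.nullTerm();
	gigabitQuery->set2 ( ttt.getBufStart() ,
			     si->m_queryLangId ,
			     true ,    // query expansion?
			     true );   // use stop words?
}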
@ -1525,7 +949,6 @@ bool printSearchResultsHeader ( State0 *st ) {
if ( header ) sb->safeStrcpy ( header );
}
// this also prints gigabits and nuggabits
// if we are xml/json we call this below otherwise we lose
// the header of <?xml...> or whatever
if ( ! g_conf.m_isMattWells && si->m_format == FORMAT_HTML ) {
@ -1962,12 +1385,6 @@ bool printSearchResultsHeader ( State0 *st ) {
st->m_header )
msg40->printFacetTables ( sb );
// now print gigabits if we are xml/json
if ( si->m_format != FORMAT_HTML ) {
// this will print gigabits
printLeftNavColumn ( *sb,st );
}
// global-index is not a custom crawl but we should use "objects"
bool isDiffbot = cr->m_isCustomCrawl;
if ( strcmp(cr->m_coll,"GLOBAL-INDEX") == 0 ) isDiffbot = true;
@ -2822,12 +2239,6 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
return true;
}
// . if section voting info was requested, display now, it's in json
// . so if in csv it will mess things up!!!
if ( mr->ptr_sectionVotingInfo )
// it is possible this is just "\0"
sb->safeStrcpy ( mr->ptr_sectionVotingInfo );
// each "result" is the actual cached page, in this case, a json
// object, because we were called with &icc=1. in that situation
// ptr_content is set in the msg20reply.
@ -5202,7 +4613,6 @@ bool printSingleScore ( SafeBuf *sb, SearchInput *si, SingleScore *ss, Msg20Repl
wbw = WIKI_BIGRAM_WEIGHT;
}
float hgw = getHashGroupWeight(ss->m_hashGroup);
//float dvw = getDiversityWeight(ss->m_diversityRank);
float dnw = getDensityWeight(ss->m_densityRank);
float wsw = getWordSpamWeight(ss->m_wordSpamRank);
// HACK for inlink text!
@ -5508,13 +4918,6 @@ bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr,
// if catId >= 1 then print the dmoz radio button
bool printLogoAndSearchBox ( SafeBuf *sb, HttpRequest *hr, SearchInput *si ) {
char *root = "";
if ( g_conf.m_isMattWells )
root = "http://www.gigablast.com";
// now make a TABLE, left PANE contains gigabits and stuff
char *coll = hr->getString("c");
if ( ! coll ) coll = "";
@ -6323,53 +5726,6 @@ bool printSearchFiltersBar ( SafeBuf *sb , HttpRequest *hr ) {
s_mi[n].m_icon = NULL;
n++;
#ifdef SUPPORT_FACETS
// BR 20160801: Disabled by default
s_mi[n].m_menuNum = 4;
s_mi[n].m_title = "Language facet";
s_mi[n].m_cgi = "facet=gbfacetint:gblang";
s_mi[n].m_icon = NULL;
n++;
s_mi[n].m_menuNum = 4;
s_mi[n].m_title = "Content type facet";
s_mi[n].m_cgi = "facet=gbfacetstr:type";
s_mi[n].m_icon = NULL;
n++;
s_mi[n].m_menuNum = 4;
s_mi[n].m_title = "Url path depth";
s_mi[n].m_cgi = "facet=gbfacetint:gbpathdepth";
s_mi[n].m_icon = NULL;
n++;
s_mi[n].m_menuNum = 4;
s_mi[n].m_title = "Spider date facet";
s_mi[n].m_cgi = "facet=gbfacetint:gbspiderdate";
s_mi[n].m_icon = NULL;
n++;
// everything in tagdb is hashed
s_mi[n].m_menuNum = 4;
s_mi[n].m_title = "Site num inlinks facet";
s_mi[n].m_cgi = "facet=gbfacetint:gbtagsitenuminlinks";
s_mi[n].m_icon = NULL;
n++;
// s_mi[n].m_menuNum = 4;
// s_mi[n].m_title = "Domains facet";
// s_mi[n].m_cgi = "facet=gbfacetint:gbdomhash";
// n++;
s_mi[n].m_menuNum = 4;
s_mi[n].m_title = "Hopcount facet";
s_mi[n].m_cgi = "facet=gbfacetint:gbhopcount";
s_mi[n].m_icon = NULL;
n++;
#endif
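// [Editor's note, not part of the patch] each facet entry above simply
// appends its m_cgi string to the search URL, e.g. the language facet
// turns into (placeholder collection and query):
//   /search?c=main&q=test&facet=gbfacetint:gblang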
// output
s_mi[n].m_menuNum = 5;
s_mi[n].m_title = "Output HTML";
@ -6600,10 +5956,8 @@ bool printSearchFiltersBar ( SafeBuf *sb , HttpRequest *hr ) {
// after 4 make a new line
if ( i == 5 ) sb->safePrintf("<br><br>");
if ( i == 9 ) sb->safePrintf("<br><br>");
#ifndef SUPPORT_FACETS
if( i == 4 ) continue;
#endif
printMenu ( sb , i , hr );
}

@ -1,5 +1,6 @@
#include "gb-include.h"
#include "PageRoot.h"
#include "Indexdb.h" // makeKey(int64_t docId)
#include "Titledb.h"
#include "Spider.h"
@ -21,7 +22,7 @@ bool sendPageRoot ( TcpSocket *s, HttpRequest *r ){
return sendPageRoot ( s, r, NULL );
}
bool printNav ( SafeBuf &sb , HttpRequest *r ) {
static bool printNav ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("</TD></TR></TABLE>"
"</body></html>");
return true;
@ -33,7 +34,7 @@ bool printNav ( SafeBuf &sb , HttpRequest *r ) {
//
//////////////
bool printFamilyFilter ( SafeBuf& sb , bool familyFilterOn ) {
static bool printFamilyFilter ( SafeBuf& sb , bool familyFilterOn ) {
char *s1 = "";
char *s2 = "";
if ( familyFilterOn ) s1 = " checked";
@ -49,7 +50,7 @@ bool printFamilyFilter ( SafeBuf& sb , bool familyFilterOn ) {
#include "SearchInput.h"
bool printRadioButtons ( SafeBuf& sb , SearchInput *si ) {
static bool printRadioButtons ( SafeBuf& sb , SearchInput *si ) {
// don't display this for directory search
// look it up. returns catId <= 0 if dmoz not setup yet.
// From PageDirectory.cpp
@ -127,7 +128,7 @@ bool printRadioButtons ( SafeBuf& sb , SearchInput *si ) {
return true;
}
bool printLogo ( SafeBuf& sb , SearchInput *si ) {
static bool printLogo ( SafeBuf& sb , SearchInput *si ) {
// if an image was provided...
if ( ! si->m_imgUrl || ! si->m_imgUrl[0] ) {
// no, now we default to our logo
@ -172,7 +173,7 @@ bool printLogo ( SafeBuf& sb , SearchInput *si ) {
bool expandHtml ( SafeBuf& sb,
char *head ,
const char *head ,
int32_t hlen ,
char *q ,
int32_t qlen ,
@ -433,7 +434,7 @@ bool expandHtml ( SafeBuf& sb,
bool printLeftColumnRocketAndTabs ( SafeBuf *sb ,
bool isSearchResultsPage ,
CollectionRec *cr ,
char *tabName ) {
const char *tabName ) {
class MenuItem {
public:
@ -661,7 +662,7 @@ bool printLeftColumnRocketAndTabs ( SafeBuf *sb ,
return true;
}
bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr ,
bool printFrontPageShell ( SafeBuf *sb , const char *tabName , CollectionRec *cr ,
bool printGigablast ) {
sb->safePrintf("<html>\n");
@ -670,7 +671,7 @@ bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr ,
sb->safePrintf("<meta name=\"description\" content=\"A powerful, new search engine that does real-time indexing!\">\n");
sb->safePrintf("<meta name=\"keywords\" content=\"search, search engine, search engines, search the web, fresh index, green search engine, green search, clean search engine, clean search\">\n");
char *title = "An Alternative Open Source Search Engine";
const char *title = "An Alternative Open Source Search Engine";
if ( strcasecmp(tabName,"search") ) {
title = tabName;
}
@ -736,7 +737,7 @@ bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr ,
return true;
}
bool printWebHomePage ( SafeBuf &sb , HttpRequest *r , TcpSocket *sock ) {
static bool printWebHomePage ( SafeBuf &sb , HttpRequest *r , TcpSocket *sock ) {
SearchInput si;
si.set ( sock , r );
@ -870,7 +871,7 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r , TcpSocket *sock ) {
return true;
}
bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
static bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
CollectionRec *cr = g_collectiondb.getRec ( r );
@ -1167,7 +1168,7 @@ public:
// only allow up to 1 Msg10's to be in progress at a time
static bool s_inprogress = false;
void doneInjectingWrapper3 ( void *st ) ;
static void doneInjectingWrapper3 ( void *st ) ;
// . returns false if blocked, true otherwise
// . sets g_errno on error
@ -1438,7 +1439,7 @@ bool sendPageAddUrl ( TcpSocket *sock , HttpRequest *hr ) {
}
void doneInjectingWrapper3 ( void *st ) {
static void doneInjectingWrapper3 ( void *st ) {
State1i *st1 = (State1i *)st;
// allow others to add now
s_inprogress = false;
@ -1645,7 +1646,7 @@ void doneInjectingWrapper3 ( void *st ) {
static HashTable s_htable;
static bool s_init = false;
static int32_t s_lastTime = 0;
bool canSubmit ( uint32_t h , int32_t now , int32_t maxAddUrlsPerIpDomPerDay ) {
static bool canSubmit ( uint32_t h , int32_t now , int32_t maxAddUrlsPerIpDomPerDay ) {
// . sometimes no limit
// . 0 means no limit because if they don't want any submission they
// can just turn off add url and we want to avoid excess
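// [Editor's sketch, not part of the patch] the daily quota idea in
// standalone form, using std::unordered_map instead of the project's
// HashTable; all names here are illustrative.
#include <unordered_map>
static bool canSubmitSketch ( uint32_t h , int32_t now , int32_t maxPerDay ) {
	static std::unordered_map<uint32_t,int32_t> s_counts;
	static int32_t s_lastReset = 0;
	// 0 means no limit; turning add url off entirely blocks submissions
	if ( maxPerDay <= 0 ) return true;
	// wipe the per-IP-domain counts once a day
	if ( now - s_lastReset >= 24*60*60 ) {
		s_counts.clear();
		s_lastReset = now;
	}
	// count this submission and enforce the per-day ceiling
	return ++s_counts[h] <= maxPerDay;
}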

29
PageRoot.h Normal file

@ -0,0 +1,29 @@
#ifndef PAGEROOT_H_
#define PAGEROOT_H_
#include "SafeBuf.h"
#include "Collectiondb.h"
class SearchInput;
bool printFrontPageShell ( SafeBuf *sb,
const char *tabName,
CollectionRec *cr,
bool printGigablast );
bool expandHtml ( SafeBuf& sb,
const char *head ,
int32_t hlen ,
char *q ,
int32_t qlen ,
HttpRequest *r ,
SearchInput *si,
char *method ,
CollectionRec *cr );
bool printLeftColumnRocketAndTabs ( SafeBuf *sb ,
bool isSearchResultsPage ,
CollectionRec *cr ,
const char *tabName );
#endif
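// [Editor's usage sketch, not part of the patch] the new header in use;
// the two calls match the declarations above, and the collection record
// is assumed to come from g_collectiondb.
static void sketchFrontPage ( SafeBuf *sb , CollectionRec *cr ) {
	// front-page shell with the "search" tab active and the logo shown
	printFrontPageShell ( sb , "search" , cr , true );
	// left-hand rocket and tabs as drawn on a search results page
	printLeftColumnRocketAndTabs ( sb , true , cr , "search" );
}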

@ -21,27 +21,12 @@ bool sendPageSockets ( TcpSocket *s , HttpRequest *r ) {
// don't allow pages bigger than 128k in cache
char buf [ 128*1024 ];
SafeBuf p(buf, 128*1024);
//char *bufEnd = buf + 256*1024;
// a ptr into "buf"
// password, too
//int32_t pwdLen = 0;
//char *pwd = r->getString ( "pwd" , &pwdLen );
//if ( pwdLen > 31 ) pwdLen = 31;
//if ( pwd ) pwd[pwdLen]='\0';
int32_t collLen = 0;
char *coll = r->getString( "c", &collLen );
if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN;
if ( coll ) coll[collLen] = '\0';
//char pbuf [32];
//if ( pwdLen > 0 ) strncpy ( pbuf , pwd , pwdLen );
//pbuf[pwdLen]='\0';
// print standard header
// char *ss = p.getBuf();
// char *ssend = p.getBufEnd();
g_pages.printAdminTop ( &p, s , r );
//p.incrementLength(sss - ss);
// now print out the sockets table for each tcp server we have
printTcpTable(&p,"HTTP Server" ,g_httpServer.getTcp());
@ -62,66 +47,6 @@ bool sendPageSockets ( TcpSocket *s , HttpRequest *r ) {
if ( m == 0 ) count++;
}
/*
sprintf ( p , "<table width=100%% bgcolor=#d0d0f0 border=1>"
"<tr><td bgcolor=#c0c0f0 colspan=%"INT32">"
"<center><font size=+1><b>Wait Times</b></font>"
"</td></tr>\n" , 3 + count );
p += gbstrlen ( p );
// print columns
sprintf ( p ,
"<tr>"
"<td><b>machine #</b></td>"
"<td><b>send wait</b></td>"
"<td><b>read wait</b></td>" );
p += gbstrlen ( p );
// print disk columns
for ( int32_t i = 0 ; i < count ; i++ ) {
sprintf ( p , "<td><b>disk %"INT32" wait</b></td>",i);
p += gbstrlen ( p );
}
// end the top row
sprintf ( p , "</tr>\n" );
p += gbstrlen ( p );
// print rows
for ( int32_t i = 0 ; i < g_hostdb.getNumMachines() ; i++ ) {
// print machine #
sprintf ( p , "<tr><td><b>%"INT32"</b></td>",i);
p += gbstrlen ( p );
// then net send
float x = (float)g_queryRouter.m_sendWaits[i] / 1000;
sprintf ( p , "<td>%.1fms</td>", x );
p += gbstrlen ( p );
// then net read
x = (float)g_queryRouter.m_readWaits[i] / 1000;
sprintf ( p , "<td>%.1fms</td>", x );
p += gbstrlen ( p );
// print disk wait in milliseconds (it's in microseconds)
// find any host that matches this machine
for ( int32_t j = 0 ; j < g_hostdb.getNumHosts() ; j++ ) {
// use in order of ip
int32_t hid = g_hostdb.m_hostPtrs[j]->m_hostId;
// get machine #
int32_t m = g_hostdb.getMachineNum(hid);
// skip if no match
if ( m != i ) continue;
// otherwise print
x = (float)g_queryRouter.m_diskWaits[hid] / 1000;
sprintf ( p , "<td>%.1fms</td>", x );
p += gbstrlen ( p );
}
// end row
sprintf ( p , "</tr>\n");
p += gbstrlen ( p );
}
// end table
sprintf ( p , "</table>");
p += gbstrlen ( p );
*/
// print the final tail
//p += g_httpServer.printTail ( p , pend - p );
// calculate buffer length
int32_t bufLen = p.length();
// . send this page

@ -8,6 +8,7 @@
#include "PageParser.h" // g_inPageParser
#include "Rebalance.h"
#include "Profiler.h"
#include "PageRoot.h"
// a global class extern'd in Pages.h
Pages g_pages;
@ -1518,8 +1519,6 @@ bool sendPageReportSpam ( TcpSocket *s , HttpRequest *r ) {
return retval;
}
bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr ,
bool printGigablast ) ;
// let's use a separate section for each "page"
// then have 3 tables, the input parms,
@ -2200,81 +2199,6 @@ bool printApiForPage ( SafeBuf *sb , int32_t PAGENUM , CollectionRec *cr ) {
sb->safePrintf("<b>\t},\n</b>\n");
// gigabits
sb->brify2 (
"\t# The start of the gigabits array. Each gigabit "
"is mined from the content of the search results. "
"The top "
"N results are mined, and you can control N with the "
"&dsrt input parameter described above.\n"
, cols , "\n\t# " , false );
sb->safePrintf("<b>\t\"gigabits\":[\n\n</b>");
// print gigabit #0
sb->brify2 ( "\t\t# The first gigabit in the array.\n"
, cols , "\n\t\t# " , false );
sb->safePrintf("<b>\t\t{\n\n</b>");
sb->brify2 ( "\t\t# The gigabit as a string in utf8.\n"
, cols , "\n\t\t# " , false );
sb->safePrintf("<b>\t\t\"term\":\"Membership\",\n\n</b>");
sb->brify2 ( "\t\t# The numeric score of the gigabit.\n"
, cols , "\n\t\t# " , false );
sb->safePrintf("<b>\t\t\"score\":240,\n\n</b>");
sb->brify2 ( "\t\t# The popularity ranking of the gigabit. "
"Out of 10000 random documents, how many "
"documents contain it?\n"
, cols , "\n\t\t# " , false );
sb->safePrintf("<b>\t\t\"minPop\":480,\n\n</b>");
sb->brify2 ( "\t\t# The gigabit in the context of a "
"document.\n"
, cols , "\n\t\t# " , false );
sb->safePrintf("<b>\t\t\"instance\":{\n\n</b>");
sb->brify2 ( "\t\t\t"
"# A sentence, if it exists, "
"from one of the search results "
"which also contains the gigabit and as many "
"significant query terms as possible. In UTF-8.\n"
, cols , "\n\t\t\t# " , false );
sb->brify2("<b>\t\t\t\"sentence\":"
"\"Get a free "
"<b>Tested</b> Premium Membership here!\","
"\n\n</b>"
, 80 , "\n\t\t\t " , false );
sb->brify2 ( "\t\t\t"
"# The url that contained that sentence. Always "
"starts with http.\n"
, cols , "\n\t\t\t# " , false );
sb->safePrintf("<b>\t\t\t\"url\":"
"\"http://www.tested.com/\","
"\n\n</b>");
sb->brify2 ( "\t\t\t"
"# The domain of that url.\n"
, cols , "\n\t\t\t# " , false );
sb->safePrintf("<b>\t\t\t\"domain\":"
"\"tested.com\""
"\n</b>");
// end instance
sb->safePrintf("<b>\t\t}\n\n</b>");
// end gigabit
sb->safePrintf("\t\t# End of the first gigabit\n"
"<b>\t\t},\n\n</b>");
sb->safePrintf("\t\t...\n\n");
sb->brify2 (
"\t# End of the JSON gigabits array.\n"
, cols , "\n\t# " , false );
sb->safePrintf("<b>\t],\n\n</b>");
// BEGIN FACETS
sb->safePrintf( "\t# Start of the facets array, if any.\n");
sb->safePrintf("<b>\t\"facets\":[\n</b>\n");

463
Parms.cpp

@ -2345,13 +2345,6 @@ bool Parms::setFromRequest ( HttpRequest *r ,
char *xx=NULL;*xx=0;
}
// need this for searchInput which takes default from "cr"
//CollectionRec *cr = g_collectiondb.getRec ( r , true );
// no SearchInput.cpp does this and then overrides if xml feed
// to set m_docsToScanForTopics
//setToDefault ( THIS , objType , cr );
// loop through cgi parms
for ( int32_t i = 0 ; i < r->getNumFields() ; i++ ) {
// get cgi parm name
@ -4655,59 +4648,6 @@ void Parms::init ( ) {
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demotion for query terms or gigabits in url";
m->m_desc = "Demotion factor for query terms or gigabits "
"in a result's url. "
"Score will be penalized by this factor times the number "
"of query terms or gigabits in the url divided by "
"the max value below such that fewer "
"query terms or gigabits in the url causes the result "
"to be demoted more heavily, depending on the factor. "
"Higher factors demote more per query term or gigabit "
"in the page's url. "
"Generally, a page may not be demoted more than this "
"factor as a percent. Also, how it is demoted is "
"dependant on the max value. For example, "
"a factor of 0.2 will demote the page 20% if it has no "
"query terms or gigabits in its url. And if the max value is "
"10, then a page with 5 query terms or gigabits in its "
"url will be demoted 10%; and 10 or more query terms or "
"gigabits in the url will not be demoted at all. "
"0 means no demotion. "
"A safe range is from 0 to 0.35. ";
m->m_cgi = "pqrqttiu";
m->m_off = (char *)&cr.m_pqr_demFactQTTopicsInUrl - x;
m->m_type = TYPE_FLOAT;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max value for pages with query terms or gigabits "
"in url";
m->m_desc = "Max number of query terms or gigabits in a url. "
"Pages with a number of query terms or gigabits in their "
"urls greater than or equal to this value will not be "
"demoted. "
"This controls the range of values expected to represent "
"the number of query terms or gigabits in a url. It should "
"be set to or near the estimated max number of query terms "
"or topics that can be in a url. Setting to a lower value "
"increases the penalty per query term or gigabit that is "
"not in a url, but decreases the range of values that "
"will be demoted.";
m->m_cgi = "pqrqttium";
m->m_off = (char *)&cr.m_pqr_maxValQTTopicsInUrl - x;
m->m_type = TYPE_LONG;
m->m_def = "10";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demotion for pages that are not "
"root or have many paths in the url";
m->m_desc = "Demotion factor each path in the url. "
@ -4775,60 +4715,6 @@ void Parms::init ( ) {
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demotion for non-location specific queries "
"with a location specific title";
m->m_desc = "Demotion factor for non-location specific queries "
"with a location specific title. "
"Pages which contain a location in their title which is "
"not in the query or the gigabits will be demoted by their "
"population multiplied by this factor divided by the max "
"place population specified below. "
"Generally, a page will not be demoted more than this "
"value as a percent. "
"0 means no demotion. ";
m->m_cgi = "pqrloct";
m->m_off = (char *)&cr.m_pqr_demFactLocTitle - x;
m->m_type = TYPE_FLOAT;
m->m_def = "0.99";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demotion for non-location specific queries "
"with a location specific summary";
m->m_desc = "Demotion factor for non-location specific queries "
"with a location specific summary. "
"Pages which contain a location in their summary which is "
"not in the query or the gigabits will be demoted by their "
"population multiplied by this factor divided by the max "
"place population specified below. "
"Generally, a page will not be demoted more than this "
"value as a percent. "
"0 means no demotion. ";
m->m_cgi = "pqrlocs";
m->m_off = (char *)&cr.m_pqr_demFactLocSummary - x;
m->m_type = TYPE_FLOAT;
m->m_def = "0.95";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "demote locations that appear in gigabits";
m->m_desc = "Demote locations that appear in gigabits.";
m->m_cgi = "pqrlocg";
m->m_off = (char *)&cr.m_pqr_demInTopics - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max value for non-location specific queries "
"with location specific results";
m->m_desc = "Max place population. "
@ -5093,19 +4979,6 @@ void Parms::init ( ) {
m->m_obj = OBJ_COLL;
m++;
m->m_title = "percent topic similar default";
m->m_desc = "Like above, but used for deciding when to cluster "
"results by topic for the news collection.";
m->m_cgi = "ptcd";
m->m_off = (char *)&cr.m_topicSimilarCutoffDefault - x;
m->m_type = TYPE_LONG;
m->m_def = "50";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max query terms";
m->m_desc = "Do not allow more than this many query terms. Helps "
"prevent big queries from resource hogging.";
@ -5844,97 +5717,6 @@ void Parms::init ( ) {
m->m_obj = OBJ_SI;
m++;
m->m_title = "results to scan for gigabits generation";
m->m_desc = "How many search results should we "
"scan for gigabit (related topics) generation. Set this to "
"zero to disable gigabits!";
m->m_cgi = "dsrt";
m->m_off = (char *)&si.m_docsToScanForTopics - y;
m->m_type = TYPE_LONG;
m->m_defOff= (char *)&cr.m_docsToScanForTopics - x;
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "ip restriction for gigabits";
m->m_desc = "Should Gigablast only get one document per IP domain "
"and per domain for gigabits (related topics) generation?";
m->m_cgi = "ipr";
m->m_off = (char *)&si.m_ipRestrictForTopics - y;
m->m_defOff= (char *)&cr.m_ipRestrict - x;
m->m_type = TYPE_BOOL;
m->m_group = 0;
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "number of gigabits to show";
m->m_desc = "What is the number of gigabits (related topics) "
"displayed per query? Set to 0 to save a little CPU time.";
m->m_cgi = "nrt";
m->m_defOff= (char *)&cr.m_numTopics - x;
m->m_off = (char *)&si.m_numTopicsToDisplay - y;
m->m_type = TYPE_LONG;
m->m_def = "11";
m->m_group = 0;
m->m_sprpg = 0; // do not propagate
m->m_sprpp = 0; // do not propagate
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "min topics score";
m->m_desc = "Gigabits (related topics) with scores below this "
"will be excluded. Scores range from 0% to over 100%.";
m->m_cgi = "mts";
m->m_defOff= (char *)&cr.m_minTopicScore - x;
m->m_off = (char *)&si.m_minTopicScore - y;
m->m_type = TYPE_LONG;
m->m_group = 0;
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "min gigabit doc count by default";
m->m_desc = "How many documents must contain the gigabit "
"(related topic) in order for it to be displayed.";
m->m_cgi = "mdc";
m->m_defOff= (char *)&cr.m_minDocCount - x;
m->m_off = (char *)&si.m_minDocCount - y;
m->m_type = TYPE_LONG;
m->m_def = "2";
m->m_group = 0;
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "dedup doc percent for gigabits (related topics)";
m->m_desc = "If a document is this percent similar to another "
"document with a higher score, then it will not contribute "
"to the gigabit generation.";
m->m_cgi = "dsp";
m->m_defOff= (char *)&cr.m_dedupSamplePercent - x;
m->m_off = (char *)&si.m_dedupSamplePercent - y;
m->m_type = TYPE_LONG;
m->m_def = "80";
m->m_group = 0;
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
///////////////////////////////////////////
// SPIDER PROXY CONTROLS
@ -6050,19 +5832,6 @@ void Parms::init ( ) {
m->m_obj = OBJ_CONF;
m++;
m->m_title = "max words per gigabit (related topic) by default";
m->m_desc = "Maximum number of words a gigabit (related topic) "
"can have. Affects xml feeds, too.";
m->m_cgi = "mwpt";
m->m_defOff= (char *)&cr.m_maxWordsPerTopic - x;
m->m_off = (char *)&si.m_maxWordsPerTopic - y;
m->m_type = TYPE_LONG;
m->m_def = "6";
m->m_group = 0;
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "show images";
m->m_desc = "Should we return or show the thumbnail images in the "
@ -6364,52 +6133,6 @@ void Parms::init ( ) {
m->m_obj = OBJ_SI;
m++;
m->m_title = "return number of docs per topic";
m->m_desc = "Use 1 if you want Gigablast to return the number of "
"documents in the search results that contained each topic "
"(gigabit).";
m->m_def = "1";
m->m_off = (char *)&si.m_returnDocIdCount - y;
m->m_type = TYPE_BOOL;
m->m_cgi = "rdc";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "return docids per topic";
m->m_desc = "Use 1 if you want Gigablast to return the list of "
"docIds from the search results that contained each topic "
"(gigabit).";
m->m_def = "0";
m->m_off = (char *)&si.m_returnDocIds - y;
m->m_type = TYPE_BOOL;
m->m_cgi = "rd";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "return popularity per topic";
m->m_desc = "Use 1 if you want Gigablast to return the popularity "
"of each topic (gigabit).";
m->m_def = "0";
m->m_off = (char *)&si.m_returnPops - y;
m->m_type = TYPE_BOOL;
m->m_cgi = "rp";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "debug gigabits flag";
m->m_desc = "Is 1 to log gigabits debug information, 0 otherwise.";
m->m_def = "0";
m->m_off = (char *)&si.m_debugGigabits - y;
m->m_type = TYPE_BOOL;
m->m_cgi = "debuggigabits";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "return docids only";
m->m_desc = "Is 1 to return only docids as query results.";
m->m_def = "0";
@ -9864,147 +9587,6 @@ void Parms::init ( ) {
m->m_obj = OBJ_COLL;
m++;
m->m_title = "results to scan for gigabits generation by default";
m->m_desc = "How many search results should we "
"scan for gigabit (related topics) generation. Set this to "
"zero to disable gigabits generation by default.";
m->m_cgi = "dsrt";
m->m_off = (char *)&cr.m_docsToScanForTopics - x;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "ip restriction for gigabits by default";
m->m_desc = "Should Gigablast only get one document per IP domain "
"and per domain for gigabits (related topics) generation?";
m->m_cgi = "ipr";
m->m_off = (char *)&cr.m_ipRestrict - x;
m->m_type = TYPE_BOOL;
// default to 0 since newspaperarchive only has docs from same IP dom
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "remove overlapping topics";
m->m_desc = "Should Gigablast remove overlapping topics (gigabits)?";
m->m_cgi = "rot";
m->m_off = (char *)&cr.m_topicRemoveOverlaps - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "number of gigabits to show by default";
m->m_desc = "What is the number of "
"related topics (gigabits) "
"displayed per query? Set to 0 to save "
"CPU time.";
m->m_cgi = "nrt";
m->m_off = (char *)&cr.m_numTopics - x;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_group = 0;
m->m_sprpg = 0; // do not propagate
m->m_sprpp = 0; // do not propagate
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "min gigabit score by default";
m->m_desc = "Gigabits (related topics) with scores below this "
"will be excluded. Scores range from 0% to over 100%.";
m->m_cgi = "mts";
m->m_off = (char *)&cr.m_minTopicScore - x;
m->m_type = TYPE_LONG;
m->m_def = "5";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "min gigabit doc count by default";
m->m_desc = "How many documents must contain the gigabit "
"(related topic) in order for it to be displayed.";
m->m_cgi = "mdc";
m->m_off = (char *)&cr.m_minDocCount - x;
m->m_type = TYPE_LONG;
m->m_def = "2";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "dedup doc percent for gigabits (related topics)";
m->m_desc = "If a document is this percent similar to another "
"document with a higher score, then it will not contribute "
"to the gigabit generation.";
m->m_cgi = "dsp";
m->m_off = (char *)&cr.m_dedupSamplePercent - x;
m->m_type = TYPE_LONG;
m->m_def = "80";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "max words per gigabit (related topic) by default";
m->m_desc = "Maximum number of words a gigabit (related topic) "
"can have. Affects xml feeds, too.";
m->m_cgi = "mwpt";
m->m_off = (char *)&cr.m_maxWordsPerTopic - x;
m->m_type = TYPE_LONG;
m->m_def = "6";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "gigabit max sample size";
m->m_desc = "Max chars to sample from each doc for gigabits "
"(related topics).";
m->m_cgi = "tmss";
m->m_off = (char *)&cr.m_topicSampleSize - x;
m->m_type = TYPE_LONG;
m->m_def = "4096";
m->m_group = 0;
m->m_flags = PF_API | PF_CLONE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "gigabit max punct len";
m->m_desc = "Max sequential punct chars allowed in a gigabit "
"(related topic). "
" Set to 1 for speed, 5 or more for best topics but twice as "
"slow.";
m->m_cgi = "tmpl";
m->m_off = (char *)&cr.m_topicMaxPunctLen - x;
m->m_type = TYPE_LONG;
m->m_def = "1";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "display indexed date";
m->m_desc = "Display the indexed date along with results.";
m->m_cgi = "didt";
@ -10331,6 +9913,31 @@ void Parms::init ( ) {
m++;
m->m_title = "msg40->39 timeout";
m->m_desc = "Timeout for Msg40/Msg3a to collect candidate docids with Msg39. In milliseconds";
m->m_cgi = "msgfourty_msgthirtynine_timeout";
m->m_off = offsetof(Conf,m_msg40_msg39_timeout);
m->m_xml = "msg40_msg39_timeout";
m->m_type = TYPE_LONG_LONG;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_CONF;
m->m_def = "5000";
m->m_flags = 0;
m++;
m->m_title = "msg3a->39 network overhead";
m->m_desc = "Additional overhead/latecny for msg39 request+response over the network";
m->m_cgi = "msgthreea_msgthirtynine_network_overhead";
m->m_off = offsetof(Conf,m_msg3a_msg39_network_overhead);
m->m_xml = "msg3a_msg39_network_overhead";
m->m_type = TYPE_LONG_LONG;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_CONF;
m->m_def = "250";
m->m_flags = 0;
m++;
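// [Editor's sketch, not part of the patch] one way the two settings above
// could combine into a Msg3a-side budget; whether they are summed like
// this is an assumption, only the defaults (5000ms and 250ms) are from
// the patch.
static int64_t sketchMsg39BudgetMs ( ) {
	return g_conf.m_msg40_msg39_timeout
	     + g_conf.m_msg3a_msg39_network_overhead;
}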
///////////////////////////////////////////
// PAGE SPIDER CONTROLS
///////////////////////////////////////////
@ -12108,16 +11715,6 @@ void Parms::init ( ) {
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug topic messages";
m->m_cgi = "ldto";
m->m_off = (char *)&g_conf.m_logDebugTopics - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug topDoc messages";
m->m_cgi = "ldtopd";
m->m_off = (char *)&g_conf.m_logDebugTopDocs - g;
@ -12334,16 +11931,6 @@ void Parms::init ( ) {
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log timing messages for related topics";
m->m_cgi = "ltt";
m->m_off = (char *)&g_conf.m_logTimingTopics - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log reminder messages";
m->m_desc = "Log reminders to the programmer. You do not need this.";
m->m_cgi = "lr";

@ -5,7 +5,6 @@
Phrases::Phrases ( ) {
m_buf = NULL;
m_phraseSpam = NULL;
}
Phrases::~Phrases ( ) {
@ -18,29 +17,13 @@ void Phrases::reset() {
}
m_buf = NULL;
m_phraseSpam = NULL;
}
// initialize the phrases from the "words" token array
bool Phrases::set( Words *words,
Bits *bits ,
bool useStopWords ,
bool useStems ,
int32_t titleRecVersion,
int32_t niceness) {
bool Phrases::set( Words *words, Bits *bits, int32_t titleRecVersion, int32_t niceness ) {
// reset in case being re-used
reset();
// now we never use stop words and we just index two-word phrases
// so that a search for "get a" in quotes will match a doc that has
// the phrase "get a clue". it might impact performance, but it should
// be insignificant... but we need to have this level of precision.
// ok -- but what about 'kick a ball'. we might not have that phrase
// in the results for "kick a" AND "a ball"!! so we really need to
// index "kick a ball" as well as "kick a" and "a ball". i don't think
// that will cause too much bloat.
//useStopWords = false;
// ensure we have words
if ( ! words ) return true;
@ -49,7 +32,7 @@ bool Phrases::set( Words *words,
m_numPhrases = words->getNumWords();
// how much mem do we need?
int32_t need = m_numPhrases * (8+8+1+1+1);
int32_t need = m_numPhrases * (8+1);
// alloc if we need to
if ( need > PHRASE_BUF_SIZE )
@ -65,26 +48,17 @@ bool Phrases::set( Words *words,
// phrase not using stop words
m_phraseIds2 = (int64_t *)p ; p += m_numPhrases * 8;
m_phraseIds3 = (int64_t *)p ; p += m_numPhrases * 8;
m_phraseSpam = (unsigned char *)p ; p += m_numPhrases * 1;
m_numWordsTotal2= (unsigned char *)p ; p += m_numPhrases * 1;
m_numWordsTotal3= (unsigned char *)p ; p += m_numPhrases * 1;
// sanity
if ( p != m_buf + need ) { char *xx=NULL;*xx=0; }
// clear this
memset ( m_numWordsTotal2 , 0 , m_numPhrases );
memset ( m_numWordsTotal3 , 0 , m_numPhrases );
// point to this info while we parse
m_words = words;
m_wptrs = words->getWords();
m_wlens = words->getWordLens();
m_wids = words->getWordIds();
m_bits = bits;
m_useStopWords = useStopWords;
m_useStems = useStems;
// we now are dependent on this
m_titleRecVersion = titleRecVersion;
@ -93,7 +67,10 @@ bool Phrases::set( Words *words,
// . sets m_phraseIds [i]
// . sets m_phraseSpam[i] to PSKIP if NO phrase exists
for ( int32_t i = 0 ; i < words->getNumWords() ; i++ ) {
if ( ! m_wids[i] ) continue;
if ( ! m_wids[i] ) {
continue;
}
setPhrase ( i , niceness);
}
// success
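// [Editor's sketch, not part of the patch] what the trimmed set() now
// produces: only two-word phrase ids. For the words "get a clue",
// m_phraseIds2 holds hash("get a") at word 0 and hash("a clue") at
// word 1, so the quoted query "get a" matches directly. The Words/Bits
// setup calls are elided because their signatures are not shown here.
//	Words words;   // words.set(...) over the document text
//	Bits bits;     // bits.set(...) over those words
//	Phrases phrases;
//	phrases.set2 ( &words , &bits , 0 /*niceness*/ );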
@ -109,16 +86,15 @@ void Phrases::setPhrase ( int32_t i, int32_t niceness ) {
// hash of the phrase
int64_t h = 0LL;
// the hash of the two-word phrase (now we do 3,4 and 5 word phrases)
// the hash of the two-word phrase
int64_t h2 = 0LL;
int64_t h3 = 0LL;
// reset
unsigned char pos = 0;
// now look for other tokens that should follow the ith token
int32_t nw = m_words->getNumWords();
int32_t numWordsInPhrase = 1;
int32_t nw = m_words->getNumWords();
int32_t numWordsInPhrase = 1;
// use the min spam from all words in the phrase as the spam for phrase
char minSpam = -1;
@ -142,9 +118,10 @@ void Phrases::setPhrase ( int32_t i, int32_t niceness ) {
// a phrase but not be in it, then the phrase id ends up just
// being the following word's id, causing the synonyms code to
// give a synonym which it should not in Synonyms::set()
if ( ! m_bits->canBeInPhrase(i) )
if ( ! m_bits->canBeInPhrase(i) ) {
// so indeed, skip it then
goto nophrase;
}
h = m_wids[i];
@ -160,14 +137,21 @@ void Phrases::setPhrase ( int32_t i, int32_t niceness ) {
// . do not allow more than 32 alnum/punct "words" in a phrase
// . this prevents phrases with 100,000 words from slowing
// us down. would put us in a huge double-nested for loop
if ( j > i + 32 ) goto nophrase;
if ( j > i + 32 ) {
goto nophrase;
}
// deal with punct words
if ( ! m_wids[j] ) {
// if we cannot pair across word j then break
if ( ! m_bits->canPairAcross (j) ) break;
if ( !m_bits->canPairAcross( j ) ) {
break;
}
// does it have a hyphen?
if (j==i+1 && m_words->hasChar(j,'-')) hasHyphen=true;
if ( j == i + 1 && m_words->hasChar( j, '-' ) ) {
hasHyphen = true;
}
continue;
}
@ -180,51 +164,35 @@ void Phrases::setPhrase ( int32_t i, int32_t niceness ) {
int32_t conti = pos;
// hash the jth word into the hash
h = hash64Lower_utf8_cont(m_wptrs[j],
m_wlens[j],
h,
&conti );
h = hash64Lower_utf8_cont( m_wptrs[j], m_wlens[j], h, &conti );
pos = conti;
numWordsInPhrase++;
++numWordsInPhrase;
// N-word phrases?
if ( numWordsInPhrase == 2 ) {
h2 = h;
m_numWordsTotal2[i] = j-i+1;
if ( m_bits->isStopWord(j) )
hasStopWord2 = true;
continue;
}
if ( numWordsInPhrase == 3 ) {
h3 = h;
m_numWordsTotal3[i] = j-i+1;
//continue;
m_numWordsTotal2[i] = j - i + 1;
hasStopWord2 = m_bits->isStopWord(j);
break;
}
}
// if we cannot pair across word j then break
if ( ! m_bits->canPairAcross (j) ) break;
// keep chugging?
if ( numWordsInPhrase >= 5 ) {
// if we're not using stop words then break
if ( ! m_useStopWords ) break;
// if it's not a stop word then break
if ( ! m_bits->isStopWord (j) ) break;
if ( ! m_bits->canPairAcross (j) ) {
break;
}
// otherwise, get the next word
}
// if we had no phrase then use 0 as id (need 2+ words to be a phrase)
if ( numWordsInPhrase <= 1 ) {
nophrase:
m_phraseSpam[i] = PSKIP;
m_phraseIds2[i] = 0LL;
m_phraseIds3[i] = 0LL;
m_numWordsTotal2[i] = 0;
m_numWordsTotal3[i] = 0;
return;
}
@ -236,7 +204,6 @@ void Phrases::setPhrase ( int32_t i, int32_t niceness ) {
// set the phrase spam
if ( minSpam == -1 ) minSpam = 0;
m_phraseSpam[i] = minSpam;
// hyphen between numbers does not count (so 1-2 != 12)
if ( isNum ) hasHyphen = false;
@ -247,25 +214,23 @@ void Phrases::setPhrase ( int32_t i, int32_t niceness ) {
// . "i-phone" -> iphone
// . "e-mail" -> email
if ( hasHyphen || ! hasStopWord2 ) {
//m_phraseIds [i] = h;
m_phraseIds2[i] = h2;
}
// . "st. and" !-> stand
// . "the rapist" !-> therapist
else {
//m_phraseIds [i] = h ^ 0x768867;
m_phraseIds2[i] = h2 ^ 0x768867;
}
// forget hyphen logic for these
m_phraseIds3[i] = h3;
}
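// [Editor's sketch, not part of the patch] the bigram-id rule above in
// isolation: hyphenated or stop-word-free pairs keep the raw hash, so
// "i-phone" matches "iphone", while pairs glued together by a stop word
// get perturbed so "the rapist" cannot collide with "therapist". Note
// isNum clears hasHyphen first, so "1-2" never becomes "12".
static int64_t sketchPhraseId2 ( int64_t h2 , bool hasHyphen ,
				 bool hasStopWord2 ) {
	if ( hasHyphen || ! hasStopWord2 ) return h2;
	return h2 ^ 0x768867;
}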
// . return a pointer to the phrase that starts with word #i
// . the phrase length is returned through "phrLen"
char *Phrases::getPhrase ( int32_t i , int32_t *phrLen , int32_t npw ) {
// return NULL if no phrase
if ( m_phraseSpam[i] == PSKIP ) return NULL;
if ( m_phraseIds2[i] == 0LL ) {
return NULL;
}
// store the phrase in here
static char buf[256];
// . how many words, including punct words, are in phrase?
@ -273,7 +238,6 @@ char *Phrases::getPhrase ( int32_t i , int32_t *phrLen , int32_t npw ) {
//int32_t n = m_numWordsTotal[i] ;
int32_t n ;
if ( npw == 2 ) n = m_numWordsTotal2[i] ;
else if ( npw == 3 ) n = m_numWordsTotal3[i] ;
else { char *xx=NULL; *xx=0; }
char *s = buf;
@ -303,42 +267,6 @@ char *Phrases::getPhrase ( int32_t i , int32_t *phrLen , int32_t npw ) {
return buf;
}
// . word #n is in a phrase if he has [word][punct] or [punct][word]
// before/after him and you can pair across the punct and include both
// in a phrase
// . used by SimpleQuery class to see if a word is in a phrase or not
// . if it is then the query may choose not to represent the word by itself
bool Phrases::isInPhrase ( int32_t n ) {
// returns true if we started a phrase (our phraseSpam is not PSKIP)
if ( m_phraseSpam[n] != PSKIP ) return true;
// . see if we were in a phrase started by a word before us
// . this only works since stop words - whose previous word cannot be
// paired across - are able to start phrases
if ( n < 2 ) return false;
if ( ! m_bits->canPairAcross(n-1) ) return false;
if ( ! m_bits->canBeInPhrase(n-2) ) return false;
return true;
}
int32_t Phrases::getMaxWordsInPhrase ( int32_t i , int64_t *pid ) {
*pid = 0LL;
if ( m_numWordsTotal3[i] ) {
*pid = m_phraseIds3[i];
return m_numWordsTotal3[i];
}
if ( m_numWordsTotal2[i] ) {
*pid = m_phraseIds2[i];
return m_numWordsTotal2[i];
}
return 0;
}
int32_t Phrases::getMinWordsInPhrase ( int32_t i , int64_t *pid ) {
*pid = 0LL;

@ -8,17 +8,11 @@
#ifndef _PHRASES_H_
#define _PHRASES_H_
//#include "TermTable.h"
#include "Bits.h"
//#include "Spam.h"
//#include "Scores.h"
#include "Words.h"
//#include "Weights.h"
#define PHRASE_BUF_SIZE (MAX_WORDS * 14)
#define PSKIP 201
class Phrases {
public:
@ -27,82 +21,32 @@ class Phrases {
~Phrases();
void reset() ;
bool set2 ( Words *words, Bits *bits , int32_t niceness ) {
return set ( words,bits,true,false,TITLEREC_CURRENT_VERSION,
niceness); };
bool set2( Words *words, Bits *bits, int32_t niceness ) {
return set( words, bits, TITLEREC_CURRENT_VERSION, niceness );
}
// . set the hashes (m_phraseIds) of the phrases for these words
// . a phraseSpam of PSKIP means word is not in a phrase
// . "bits" describes the words in a phrasing context
// . "spam" is % spam of each word (spam may be NULL)
bool set ( Words *words,
Bits *bits ,
//Spam *spam ,
//Scores *scores ,
bool useStopWords ,
bool useStems ,
int32_t titleRecVersion,
int32_t niceness);
bool set( Words *words, Bits *bits, int32_t titleRecVersion, int32_t niceness );
//int64_t getPhraseId ( int32_t n ) { return m_phraseIds [n]; };
int64_t getPhraseId2 ( int32_t n ) { return m_phraseIds2[n]; };
//int64_t *getPhraseIds ( ) { return m_phraseIds ; };
int64_t *getPhraseIds2( ) { return m_phraseIds2; };
int64_t *getPhraseIds3( ) { return m_phraseIds3; };
//int64_t *getPhraseIds4( ) { return m_phraseIds4; };
//int64_t *getPhraseIds5( ) { return m_phraseIds5; };
//int64_t *getStripPhraseIds ( ) { return m_stripPhraseIds ; };
//int64_t getStripPhraseId ( int32_t n )
//{ return m_stripPhraseIds [n]; };
int32_t getPhraseSpam ( int32_t n ) { return m_phraseSpam[n]; };
bool hasPhraseId ( int32_t n ) { return (m_phraseSpam[n]!=PSKIP);};
bool startsAPhrase ( int32_t n ) { return (m_phraseSpam[n]!=PSKIP);};
bool isInPhrase ( int32_t n ) ;
// . often word #i is involved in 2 phrases
// . m_phraseIds[i] only holds the one he starts
// . this gets the one he's in the middle of or on the right of
// . used by Query.cpp for phrase-forcing
//int64_t getLeftPhraseId ( int32_t i ) ;
//int64_t getLeftStripPhraseId ( int32_t i ) ;
//int32_t getLeftPhraseIndex ( int32_t i ) ;
// . each non-spammy occurrence of phrase adds "baseScore" to its score
/*
bool hash ( TermTable *table ,
Weights *weightsPtr ,
uint32_t baseScore ,
uint32_t maxScore ,
int64_t startHash ,
char *prefix1 ,
int32_t prefixLen1 ,
char *prefix2 ,
int32_t prefixLen2 ,
bool hashUniqueOnly ,
int32_t titleRecVersion,
int32_t niceness = 0);
*/
int64_t *getPhraseIds2( ) { return m_phraseIds2; }
// . return a pointer to the NUL-terminated phrase that starts with word #i
// . its length is returned through "phrLen"
char *getPhrase ( int32_t i , int32_t *phrLen , int32_t npw );
//char *getNWordPhrase ( int32_t i , int32_t *phrLen , int32_t npw ) ;
//char *getStripPhrase ( int32_t i , int32_t *phrLen );
//int32_t getNumWords ( int32_t i ) { return m_numWordsTotal[i]; };
//int32_t getNumWordsInPhrase ( int32_t i ) { return m_numWordsTotal [i]; };
int32_t getNumWordsInPhrase2( int32_t i ) { return m_numWordsTotal2[i]; };
int32_t getNumWordsInPhrase2( int32_t i ) { return m_numWordsTotal2[i]; }
int32_t getMaxWordsInPhrase( int32_t i , int64_t *pid ) ;
int32_t getMinWordsInPhrase( int32_t i , int64_t *pid ) ;
// . leave this public so SimpleQuery.cpp can mess with it
// . called by Phrases::set() above for each i
// . we set phraseSpam to 0 to 100% typically
// . we set phraseSpam to PSKIP if word #i cannot start a phrase
void setPhrase ( int32_t i ,
int32_t niceness);
void setPhrase( int32_t i, int32_t niceness );
// private:
@ -111,26 +55,10 @@ class Phrases {
char *m_buf;
int32_t m_bufSize;
// . these are 1-1 with the words in the Words class
// . phraseSpam is PSKIP if the phraseId is invalid
//int64_t *m_phraseIds ;
// the two word hash
int64_t *m_phraseIds2 ;
int64_t *m_phraseIds3 ;
//int64_t *m_phraseIds4 ;
//int64_t *m_phraseIds5 ;
//int64_t *m_stripPhraseIds ;
unsigned char *m_phraseSpam ;
// . # words in phrase TOTAL (including punct words)
// . used for printing
// . used by SimpleQuery::getTermIds() for setting word ranges
// for phrases
//unsigned char *m_numWordsTotal ;
// for the two word phrases:
unsigned char *m_numWordsTotal2 ;
unsigned char *m_numWordsTotal3 ;
//unsigned char *m_numWordsTotal4 ;
//unsigned char *m_numWordsTotal5 ;
int32_t m_numPhrases; // should equal the # of words
// placeholders to avoid passing to subroutine
@ -140,19 +68,7 @@ class Phrases {
int32_t *m_wlens;
Bits *m_bits;
bool m_useStems;
bool m_useStopWords;
int32_t m_titleRecVersion;
// replaces Scores
//class Sections *m_sections;
//class Section *m_sectionPtrs;
// word scores, set in Scores.cpp
//int32_t *m_wordScores;
// the score of the phrase is the min of the scores of the words that
// make up the phrase
//int32_t *m_phraseScores ;
};
#endif
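For orientation, the whole phrase-hashing pipeline after this cleanup is three calls. A minimal sketch, assuming Words::set and Bits::set keep the shapes they have elsewhere in the tree (only the Phrases::set signature is confirmed by the Query.cpp hunk further down):

// sketch: compute bigram/trigram phrase ids for a text buffer
bool sketchHashPhrases ( char *content ) {
	Words words;
	if ( ! words.set ( content , true , 0 ) ) return false;    // assumed sig
	Bits bits;
	if ( ! bits.set ( &words , TITLEREC_CURRENT_VERSION , 0 ) ) // assumed sig
		return false;
	Phrases phrases;
	// useStopWords/useStems are no longer parms; the old true/false
	// defaults are presumably baked into set() now
	if ( ! phrases.set ( &words , &bits , TITLEREC_CURRENT_VERSION , 0 ) )
		return false;
	int64_t *bigramIds  = phrases.getPhraseIds2(); // two-word hashes
	int64_t *trigramIds = phrases.getPhraseIds3(); // three-word hashes
	return bigramIds && trigramIds;
}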

2
Pos.h

@ -4,7 +4,7 @@
#define _POS_H_
#include <stdint.h>
#include <Titledb.h>
#include "Titledb.h"
// this class is used to measure the number of characters between two "words"
// (as defined in the Words.cpp class) in units of "characters". A utf8

@ -3839,95 +3839,6 @@ void PosdbTable::intersectLists10_r ( ) {
if( g_conf.m_logTracePosdb ) log(LOG_TRACE,"%s:%s:%d: seoHack: %s, numTerms: %"INT32"", __FILE__,__func__, __LINE__, seoHack?"true":"false", m_q->m_numTerms);
// if we are just a sitehash:xxxxx list and m_getSectionStats is
// true then assume the list is one of hacked posdb keys where
// the wordposition bits and others are really a 32-bit site hash
// and we have to see how many different docids and sites have
// this term. and we compare to our site hash,
// m_r->m_sectionSiteHash32 to determine if the posdb key is
// onsite or offsite. then XmlDoc::printRainbowSections()
// can print out how many pages/sites duplicate your section's content.
// MDW: TODO: for the facet terms just compile the stats and do not
// send to intersecting. they are ignored for those purposes. send
// the hashtable back so msg3a can integrate the stats. keep in mind
// we have multiple docid ranges sometimes for one query!!!!
/*
MDW: take this out. now treat as a normal termlist but
do not use for scoring. so it is kinda like gbmin: gbmax:
query operators but it will just add the facet values to
QueryTerm::m_facetHashList for transmission back to the aggregator
node. however, it is only for docids in the final result set!
if ( m_r->m_getFacetStats ) {
// reset
m_facetStats.m_totalMatches = 0;
m_facetStats.m_totalEntries = 0;
m_dt.clear();
// scan the posdb keys
//for ( int32_t i = 0 ; i < m_msg2->getNumListsInGroup(0); i++) {
// get the sublist
RdbList *list = m_msg2->getList(0);//Group(0)[i];
char *p = list->getList ();
char *pend = p + list->getListSize();
// test
//int64_t final = 5663137686803656554LL;
//final &= TERMID_MASK;
//if ( p<pend && g_posdb.getTermId(p) == final )
// log("boo");
// scan it
for ( ; p < pend ; ) {
// . first key is the full size
// . uses the w,G,s,v and F bits to hold this
// . this is no longer necessarily sitehash, but
// can be any val, like now FacetStats is using
// it for the innerHtml sentence content hash32
int32_t sh32 = g_posdb.getFacetVal32 ( p );
//int64_t d = g_posdb.getDocId(p);
//int32_t rs = list->getRecSize(p);
// this will not update listptrlo, watch out!
p += list->getRecSize ( p );
// does this xpath from another docid have the
// same inner html as us?
if ( sh32 == m_r->m_myFacetVal32 ) // m_siteHash32 )
m_facetStats.m_totalMatches++;
// always this
m_facetStats.m_totalEntries++;
// unique site count
if ( m_dt.isInTable ( &sh32 ) ) continue;
// count it
m_facetStats.m_numUniqueVals++;
// only once
m_dt.addKey ( &sh32 );
// log it
//log("usite: %08"XINT32" %"INT64" rs=%"INT32"",sh32,d,rs);
// stop if too much so we do not try to
// re-alloc in a thread!
if ( m_dt.m_numSlotsUsed >= 1000000 ) break;
}
// and return the list for merging
int32_t *s = (int32_t *)m_facetHashList.getBufStart();
int32_t *send = (int32_t *)m_facetHashList.getBufEnd();
//if ( m_facetStats.m_numUniqueSites == 17 ) {
// log("q=%s",m_r->ptr_query);
// log("hey");
// //char *xx = NULL;*xx=0;
//}
//if(!strcmp(m_r->ptr_query,"gbsectionhash:3335323672699668766"
// log("boo");
int32_t *orig = s;
for ( int32_t i = 0 ; i < m_dt.m_numSlots ; i++ ) {
if ( ! m_dt.m_flags[i] ) continue;
*s++ = *(int32_t *)m_dt.getKeyFromSlot(i);
if ( s >= send ) break;
}
m_facetHashList.setLength((char *)s-(char *)orig);
return;
}
*/
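// Distilled, the removed pass above is one linear scan plus one
// hash-set dedup. A standalone sketch with hypothetical names
// (std::unordered_set standing in for the HashTableX m_dt):
/*
#include <unordered_set>
struct FacetTally { int32_t total; int32_t matches; int32_t unique; };
static FacetTally tallyFacets ( const int32_t *vals , int32_t n ,
                                int32_t myVal ) {
	FacetTally t = { 0, 0, 0 };
	std::unordered_set<int32_t> seen;
	for ( int32_t i = 0 ; i < n ; i++ ) {
		t.total++;                            // every posdb key counts
		if ( vals[i] == myVal ) t.matches++;  // same facet val as ours
		if ( seen.insert ( vals[i] ).second ) // first sighting only
			t.unique++;
	}
	return t;
}
*/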
//
// hash the docids in the whitelist termlists into a hashtable.
// every docid in the search results must be in there. the
@ -5826,9 +5737,7 @@ void PosdbTable::intersectLists10_r ( ) {
// . first key is the full size
// . uses the w,G,s,v and F bits to hold this
// . this is no longer necessarily sitehash,but
// can be any val, like now SectionStats is
// using it for the innerHtml sentence
// content hash32
// can be any val
int32_t val32 = g_posdb.getFacetVal32 ( p2 );
// PREADVANCE "p"
@ -5967,12 +5876,6 @@ void PosdbTable::intersectLists10_r ( ) {
skipFacetCheck:
// if only one term like gbfacetstr:gbxpathsitehash123456
// then do not bother adding to top tree
if ( m_r->m_forSectionStats ) goto advance;
// . seoDebug hack so we can set "dcs"
// . we only come here if we actually made it into m_topTree
if ( secondPass ) {

115
Posdb.h

@ -132,15 +132,6 @@ class Posdb {
bool addColl ( char *coll, bool doVerify = true );
// . xmldoc.cpp should call this
// . store all posdb keys from revdbList into one hashtable
// and only add to new list if not in there
//bool makeList ( class RdbList *revdbList ,
// int64_t docId ,
// class Words *words );
// . make a 16-byte key from all these components
// . since it is 16 bytes, the big bit will be set
void makeKey ( void *kp ,
@ -440,80 +431,8 @@ public:
int32_t m_quotedStartId;
};
/*
#include "RdbList.h"
class PosdbList : public RdbList {
public:
// why do i have to repeat this for LinkInfo::set() calling our set()??
void set ( char *list , int32_t listSize , bool ownData ) {
RdbList::set ( list ,
listSize ,
list , // alloc
listSize , // alloc size
0 , // fixed data size
ownData ,
true , // use half keys?
sizeof(key_t));// 12 bytes per key
};
// clear the low bits on the keys so terms are DELETED
void clearDelBits ( );
void print();
// . these are made for special IndexLists, too
// . getTermId() assumes as 12 byte key
int64_t getCurrentTermId12 ( ) {
return getTermId12 ( m_listPtr ); };
int64_t getTermId12 ( char *rec ) {
return (*(uint64_t *)(&rec[4])) >> 16 ;
};
int64_t getTermId16 ( char *rec ) {
return (*(uint64_t *)(&rec[8])) >> 16 ;
};
// these 2 assume 12 and 6 byte keys respectively
int64_t getCurrentDocId () {
if ( isHalfBitOn ( m_listPtr ) ) return getDocId6 (m_listPtr);
else return getDocId12(m_listPtr);
};
int64_t getDocId ( char *rec ) {
if ( isHalfBitOn ( rec ) ) return getDocId6 (rec);
else return getDocId12(rec);
};
int64_t getCurrentDocId12 ( ) {
return getDocId12 ( m_listPtr ); };
int64_t getDocId12 ( char *rec ) {
return ((*(uint64_t *)(rec)) >> 2) & DOCID_MASK; };
int64_t getDocId6 ( char *rec ) {
int64_t docid;
*(int32_t *)(&docid) = *(int32_t *)rec;
((char *)&docid)[4] = rec[4];
docid >>= 2;
return docid & DOCID_MASK;
};
// this works with either 12 or 6 byte keys
unsigned char getCurrentScore ( ) {
return getScore(m_listPtr); };
unsigned char getScore ( char *rec ) { return ~rec[5]; };
// uncomplemented...
void setScore ( char *rec , char score ) { rec[5] = score; };
// for date lists only...
int32_t getCurrentDate ( ) { return ~*(int32_t *)(m_listPtr+6); };
};
*/
#include "Query.h" // MAX_QUERY_TERMS, qvec_t
// max # search results that can be viewed without using TopTree
//#define MAX_RESULTS 1000
class PosdbTable {
public:
@ -525,10 +444,7 @@ class PosdbTable {
char debug ,
void *logstate ,
class TopTree *topTree ,
//char *coll ,
collnum_t collnum ,
//IndexList *lists ,
//int32_t numLists ,
class Msg2 *msg2,
class Msg39Request *r );
@ -538,12 +454,6 @@ class PosdbTable {
// pre-allocate memory since intersection runs in a thread
bool allocTopTree ( );
// . returns false on error and sets errno
// . we assume there are "m_numTerms" lists passed in (see set() above)
//void intersectLists_r ( );
//void intersectLists9_r ( );
void getTermPairScoreForNonBody ( int32_t i, int32_t j,
char *wpi, char *wpj,
char *endi, char *endj,
@ -580,7 +490,9 @@ class PosdbTable {
void freeMem ( ) ;
// has init already been called?
bool isInitialized ( ) { return m_initialized; };
bool isInitialized() {
return m_initialized;
}
uint64_t m_docId;
@ -609,56 +521,37 @@ class PosdbTable {
int32_t m_maxScores;
//char *m_coll;
collnum_t m_collnum;
int32_t *m_qpos;
int32_t *m_wikiPhraseIds;
int32_t *m_quotedStartIds;
//class DocIdScore *m_ds;
int32_t m_qdist;
float *m_freqWeights;
//int64_t *m_freqs;
char *m_bflags;
int32_t *m_qtermNums;
float m_bestWindowScore;
//char **m_finalWinners1;
//char **m_finalWinners2;
//float *m_finalScores;
char **m_windowTermPtrs;
// how many docs in the collection?
int64_t m_docsInColl;
//SectionStats m_sectionStats;
//SafeBuf m_facetHashList;
//HashTableX m_dt;
class Msg2 *m_msg2;
// if getting more than MAX_RESULTS results, use this top tree to hold
// them rather than the m_top*[] arrays above
class TopTree *m_topTree;
//HashTableX m_docIdTable;
SafeBuf m_scoreInfoBuf;
SafeBuf m_pairScoreBuf;
SafeBuf m_singleScoreBuf;
SafeBuf m_stackBuf;
//SafeBuf m_mergeBuf;
// a reference to the query
Query *m_q;
int32_t m_nqt;
// these are NOT in imap space, but in query term space, 1-1 with
// Query::m_qterms[]
//IndexList *m_lists;
//int32_t m_numLists;
// has init() been called?
bool m_initialized;
@ -668,8 +561,6 @@ class PosdbTable {
// for debug msgs
void *m_logstate;
//int64_t m_numDocsInColl;
class Msg39Request *m_r;
// for gbsortby:item.price ...

@ -341,7 +341,7 @@ bool PostQueryRerank::preRerank ( ) {
return false;
// . calculate maximum url length in pages for reranking
// by query terms or topics in a url
// by query terms in a url
int32_t urlLen = mr->size_ubuf - 1;//msg20->getUrlLen();
if ( urlLen > m_maxUrlLen )
m_maxUrlLen = urlLen;
@ -379,7 +379,7 @@ bool PostQueryRerank::preRerank ( ) {
}
// . setup reranking for query terms or topics in url (pqrqttiu)
// . setup reranking for query terms in url (pqrqttiu)
// . add space to max url length for terminating NULL and allocate
// room for max length
m_maxUrlLen++;

@ -266,7 +266,6 @@ bool Process::init ( ) {
// . let's try to save tfndb first, that is the most important,
// followed by titledb perhaps...
m_rdbs[m_numRdbs++] = g_titledb.getRdb ();
m_rdbs[m_numRdbs++] = g_sectiondb.getRdb ();
m_rdbs[m_numRdbs++] = g_posdb.getRdb ();
m_rdbs[m_numRdbs++] = g_spiderdb.getRdb ();
m_rdbs[m_numRdbs++] = g_clusterdb.getRdb ();
@ -277,7 +276,6 @@ bool Process::init ( ) {
// save what urls we have been doled
m_rdbs[m_numRdbs++] = g_doledb.getRdb ();
m_rdbs[m_numRdbs++] = g_titledb2.getRdb ();
m_rdbs[m_numRdbs++] = g_sectiondb2.getRdb ();
m_rdbs[m_numRdbs++] = g_posdb2.getRdb ();
m_rdbs[m_numRdbs++] = g_spiderdb2.getRdb ();
m_rdbs[m_numRdbs++] = g_clusterdb2.getRdb ();

@ -1,6 +1,4 @@
#define MAX_TOPICS_PER_TERM 28
#define MAX_ALLOWED_TOPICS 100
#define EI_NIDENT 16
#ifndef _PROFILER_H_
#define _PROFILER_H_

740
Query.cpp

@ -2211,7 +2211,7 @@ bool Query::setQWords ( char boolFlag ,
else if ( wp[0]=='-' && wplen==1 )
posNum += 0;
// 'mr. x'
else if ( wp[0]=='.' && words.isSpaces2(i,1))
else if ( wp[0]=='.' && words.isSpaces(i,1))
posNum += 0;
// animal (dog)
else
@ -3242,14 +3242,7 @@ bool Query::setQWords ( char boolFlag ,
// make the phrases from the words and the tweaked Bits class
//Phrases phrases;
if ( ! phrases.set ( &words ,
&bits ,
//NULL ,
true , // use stop words?
false , // use stems?
TITLEREC_CURRENT_VERSION,
0 /*niceness*/))//disallows HUGE phrases
if ( !phrases.set( &words, &bits, TITLEREC_CURRENT_VERSION, 0 ) )
return false;
int64_t *wids = words.getWordIds();
@ -3258,17 +3251,7 @@ bool Query::setQWords ( char boolFlag ,
for ( int32_t i = 0 ; i < numWords ; i++ ) {
// get the ith QueryWord
QueryWord *qw = &m_qwords[i];
// if word is ignored because it is opcode, or whatever,
// it cannot start a phrase
// THIS IS BROKEN
//if ( qw->m_queryOp && qw->m_opcode == OP_PIPE){
// for (int32_t j = i-1;j>=0;j--){
// if (!m_qwords[j].m_phraseId) continue;
// m_qwords[j].m_ignorePhrase = IGNORE_BOOLOP;
// break;
// }
//
//}
if ( qw->m_ignoreWord ) continue;
if ( qw->m_fieldCode && qw->m_quoteStart < 0) continue;
// get the first word # to our left that starts a phrase
@ -3280,8 +3263,7 @@ bool Query::setQWords ( char boolFlag ,
if ( ! bits.canPairAcross(j+1) ) break;
//if ( ! bits.canStartPhrase(j) ) continue;
if ( ! wids[j] ) continue;
// phrases.getNumWordsInPhrase()
//if( j + phrases.getMaxWordsInPhrase(j,&tmp)<i) break;
qw->m_leftPhraseStart = j;
// we can't pair across alnum words now, we just want bigrams
if ( wids[j] ) break;
@ -3335,8 +3317,7 @@ bool Query::setQWords ( char boolFlag ,
else qw->m_phraseId = pid;
// how many regular words long is the bigram?
int32_t plen2; phrases.getPhrase ( i , &plen2 ,2);
// the trigram?
int32_t plen3; phrases.getPhrase ( i , &plen3 ,3);
// get just the bigram for now
qw->m_phraseLen = plen2;
// do not ignore the phrase, it's valid
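// The left-scan above, isolated: skip punctuation "words"
// (wids[j] == 0) while pairing is still legal, and stop at the
// first alnum word, which is the bigram's left edge. Sketch with a
// hypothetical helper, same control flow:
/*
static int32_t findLeftPhraseStart ( const int64_t *wids ,
                                     Bits *bits , int32_t i ) {
	for ( int32_t j = i - 1 ; j >= 0 ; j-- ) {
		// stop if phrasing cannot continue across this boundary
		if ( ! bits->canPairAcross ( j + 1 ) ) return -1;
		// skip punct/space words
		if ( ! wids[j] ) continue;
		// first alnum word to our left starts the bigram
		return j;
	}
	return -1;
}
*/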
@ -3736,22 +3717,6 @@ static bool s_isInitialized = false;
// 3rd field = m_hasColon
struct QueryField g_fields[] = {
/*
BR 20160117: No longer hashed
{"gbfieldmatch",
FIELD_GBFIELDMATCH,
true,
"gbfieldmatch:strings.vendor:\"My Vendor Inc.\"",
"Matches all the meta tag or JSON or XML fields that have "
"the name \"strings.vendor\" and contain the exactly provided "
"value, in this case, <i>My Vendor Inc.</i>. This is CASE "
"SENSITIVE and includes punctuation, so it's exact match. In "
"general, it should be a very short termlist, so it should be fast.",
"Advanced Query Operators",
QTF_BEGINNEWTABLE },
*/
{"url",
FIELD_URL,
true,
@ -3779,10 +3744,6 @@ struct QueryField g_fields[] = {
NULL,
0 },
//{"links", FIELD_LINKS, true,"Same as link:."},
//{"ilink", FIELD_ILINK, true,"Similar to above."},
{"sitelink",
FIELD_SITELINK,
true,
@ -3809,8 +3770,6 @@ struct QueryField g_fields[] = {
NULL,
QTF_DUP },
//{"coll", FIELD_COLL, true,"Not sure if this works."},
{"ip",
FIELD_IP,
true,
@ -3877,22 +3836,6 @@ struct QueryField g_fields[] = {
NULL,
0},
//{"isclean", FIELD_ISCLEAN, true,"Matches all pages that are deemed non-offensive and safe for children."},
/*
BR 20160108: No longer stored in our posdb as we don't plan to use it
{"gbinrss",
FIELD_GBRSS,
true,
"gbinrss:1",
"Matches all documents that are in RSS feeds. Likewise, use "
"<i>gbinrss:0</i> to match all documents that are NOT in RSS feeds.",
NULL,
0},
*/
{"type",
FIELD_TYPE,
false,
@ -3925,44 +3868,6 @@ struct QueryField g_fields[] = {
NULL,
0},
/*
BR 20160117: No longer hash image info
{"gbimage",
FIELD_URL,
false,
"gbimage:site.com/image.jpg",
"Matches all documents that contain the specified image.",
NULL,
0},
{"gbhasthumbnail",
FIELD_GENERIC,
false,
"gbhasthumbnail:1",
"Matches all documents for which Gigablast detected a thumbnail. "
"Likewise use <i>gbhasthumbnail:0</i> to match all documents that "
"do not have thumbnails.",
NULL,
0},
*/
/*
BR 20160117: No longer hash tags
{"gbtag*",
FIELD_TAG,
false,
"gbtag*",
"Matches all documents whose tag named * have the specified value "
"in the tagdb entry for the url. Example: gbtagsitenuminlinks:2 "
"matches all documents that have 2 qualified "
"inlinks pointing to their site "
"based on the tagdb record. You can also provide your own "
"tags in addition to the tags already present. See the <i>tagdb</i> "
"menu for more information.",
NULL,
0},
*/
{"gbzipcode",
FIELD_ZIP,
false,
@ -3972,25 +3877,6 @@ struct QueryField g_fields[] = {
NULL,
0},
/*
BR 20160108: No longer stored in our posdb as we don't plan to use it
{"gbcharset",
FIELD_CHARSET,
false,
"gbcharset:windows-1252",
"Matches all documents originally in the Windows-1252 charset. "
"Available character sets are listed in the <i>iana_charset.cpp</i> "
"file in the open source distribution. There are a lot. Some "
"more popular ones are: <i>us, latin1, iso-8859-1, csascii, ascii, "
"latin2, latin3, latin4, greek, utf-8, shift_jis.",
NULL,
0},
*/
// this just complicates things for now, so comment out
//{"urlhash",FIELD_URLHASH, false,""},
{"gblang",
FIELD_GBLANG,
false,
@ -4005,91 +3891,6 @@ struct QueryField g_fields[] = {
NULL,
0},
//{"gbquality",FIELD_GBQUALITY,true,""},
//{"gblinktextin",FIELD_LINKTEXTIN,true,""},
//{"gblinktextout",FIELD_LINKTEXTOUT,true,""},
//{"gbkeyword",FIELD_KEYWORD,true,""},
//{"gbcharset", FIELD_CHARSET, false,""},
/*
// BR 20160106: No longer stored in our posdb as we don't use it
{"gbpathdepth",
FIELD_GBOTHER,
false,
"gbpathdepth:3",
"Matches all documents whose url has 3 path components to it like "
"http://somedomain.com/dir1/dir2/dir3/foo.html",
NULL,
0},
*/
/*
// BR 20160108: No longer stored in our posdb as we don't use it
{"gbhopcount",
FIELD_GBOTHER,
false,
"gbhopcount:2",
"Matches all documents that are a minimum of two link hops away "
"from a root url.",
NULL,
0},
*/
/*
// BR 20160108: No longer stored in our posdb as we don't use it
{"gbhasfilename",
FIELD_GBOTHER,
false,
"gbhasfilename:1",
"Matches all documents whose url ends in a filename like "
"<i>http://somedomain.com/dir1/myfile</i> and not "
"<i>http://somedomain.com/dir1/dir2/</i>. Likewise, use "
"<i>gbhasfilename:0</i> to match all the documents that do not "
"have a filename in their url.",
NULL,
0},
*/
/*
BR 20160108: No longer stored in our posdb as we don't plan to use it
{"gbiscgi",
FIELD_GBOTHER,
false,
"gbiscgi:1",
"Matches all documents that have a question mark in their url. "
"Likewise gbiscgi:0 matches all documents that do not.",
NULL,
0},
*/
/*
BR 20160108: No longer stored in our posdb as we don't use it
{"gbhasext",
FIELD_GBOTHER,
false,
"gbhasext:1",
"Matches all documents that have a file extension in their url. "
"Likewise, <i>gbhasext:0</i> matches all documents that do not have "
"a file extension in their url.",
NULL,
0},
*/
/*
BR 20160106 removed
{"gbsubmiturl",
FIELD_GBOTHER,
false,
"gbsubmiturl:domain.com/process.php",
"Matches all documents that have a form that submits to the "
"specified url.",
NULL,
0},
*/
// diffbot only
{"gbparenturl",
FIELD_GBPARENTURL,
@ -4131,92 +3932,10 @@ struct QueryField g_fields[] = {
NULL,
0},
//
// for content type CT_STATUS documents (Spider status docs)
//
//{"qdom", FIELD_QUOTA, false,""},
//{"qhost", FIELD_QUOTA, false,""},
/*
// BR 20160117: No longer supported
{"gbsortbyfloat",
FIELD_GBSORTBYFLOAT,
false,
"cameras gbsortbyfloat:price",
"Sort all documents that "
"contain 'camera' by price. <i>price</i> can be a root JSON field or "
"in a meta tag, or in an xml &lt;price&gt; tag.",
"Numeric Field Query Operators",
QTF_BEGINNEWTABLE },
{"gbsortbyfloat",
FIELD_GBSORTBYFLOAT,
false,
"cameras gbsortbyfloat:product.price",
"Sort all documents that "
"contain 'camera' by price. <i>price</i> can be in a JSON document "
"like "
"<i>{ \"product\":{\"price\":1500.00}} "
"</i> or, alternatively, an XML document like <i>"
"&lt;product&gt;&lt;price&gt;1500.00&lt;/price&gt;&lt;/product&gt;"
"</i>",
NULL,
0 },
{"gbrevsortbyfloat",
FIELD_GBREVSORTBYFLOAT,
false,
"cameras gbrevsortbyfloat:product.price",
"Like above example but sorted with highest prices on top.",
NULL,
0 },
{"gbsortby",
FIELD_GBSORTBYFLOAT,
false,
"dog gbsortbyint:gbdocspiderdate",
"Sort the documents that contain 'dog' by "
"the date they were last spidered, with the newest "
"on top.",
NULL,
QTF_HIDE},
{"gbrevsortby",
FIELD_GBREVSORTBYFLOAT,
false,
"dog gbrevsortbyint:gbdocspiderdate",
"Sort the documents that contain 'dog' by "
"the date they were last spidered, but with the "
"oldest on top.",
NULL,
QTF_HIDE},
*/
/*
// BR 20160117: No longer supported
{"gbsortbyint",
FIELD_GBSORTBYINT,
false,
"pilots gbsortbyint:employees",
"Sort all documents that "
"contain 'pilots' by employees. "
"<i>employees</i> can be a root JSON field or "
"in a meta tag, or in an xml &lt;price&gt; tag. The value it "
"contains is interpreted as a 32-bit integer.",
NULL,
0 },
*/
{"gbsortbyint",
FIELD_GBSORTBYINT,
false,
@ -4225,33 +3944,6 @@ struct QueryField g_fields[] = {
NULL,
0},
/*
// BR 20160117: No longer supported
{"gbsortbyint",
FIELD_GBSORTBYINT,
false,
"gbsortbyint:company.employees",
"Sort all documents by employees. Documents can contain "
"<i>employees</i> in a JSON document "
"like "
"<i>{ \"product\":{\"price\":1500.00}} "
"</i> or, alternatively, an XML document like <i>"
"&lt;product&gt;&lt;price&gt;1500.00&lt;/price&gt;&lt;/product&gt;"
"</i>",
NULL,
0 },
{"gbsortbyint",
FIELD_GBSORTBYINT,
false,
"gbsortbyint:gbsitenuminlinks",
"Sort all documents by the number of distinct inlinks the "
"document's site has.",
NULL,
0 },
*/
{"gbrevsortbyint",
FIELD_GBREVSORTBYINT,
false,
@ -4261,114 +3953,6 @@ struct QueryField g_fields[] = {
NULL,
0},
/*
// BR 20160117: No longer supported
// gbmin:price:1.23
{"gbminfloat",
FIELD_GBNUMBERMIN,
false,
"cameras gbminfloat:price:109.99",
"Matches all documents that "
"contain 'camera' or 'cameras' and have a price of at least 109.99. "
"<i>price</i> can be a root JSON field or "
"in a meta tag name <i>price</i>, or in an xml &lt;price&gt; tag.",
NULL,
0 },
{"gbminfloat",
FIELD_GBNUMBERMIN,
false,
"cameras gbminfloat:product.price:109.99",
"Matches all documents that "
"contain 'camera' or 'cameras' and have a price of at least 109.99 "
"in a JSON document like "
"<i>{ \"product\":{\"price\":1500.00}} "
"</i> or, alternatively, an XML document like <i>"
"&lt;product&gt;&lt;price&gt;1500.00&lt;/price&gt;&lt;/product&gt;"
"</i>",
NULL,
0 },
// alias we need to bury
{"gbmin",
FIELD_GBNUMBERMIN,
false,
"",
"",
NULL,
QTF_HIDE},
{"gbmaxfloat",
FIELD_GBNUMBERMAX,
false,
"cameras gbmaxfloat:price:109.99",
"Like the gbminfloat examples above, but is an upper bound.",
NULL,
0 },
{"gbequalfloat",
FIELD_GBNUMBEREQUALFLOAT,
false,
"gbequalfloat:product.price:1.23",
"Similar to gbminfloat and gbmaxfloat but is an equality constraint.",
NULL,
0 },
{"gbmax",
FIELD_GBNUMBERMAX,
false,
"",
"",
NULL,
QTF_HIDE},
{"gbminint",
FIELD_GBNUMBERMININT,
false,
"gbminint:gbspiderdate:1391749680",
"Matches all documents with a spider timestamp of at least "
"1391749680. Use this as opposed th gbminfloat when you need "
"32 bits of integer precision.",
NULL,
0},
{"gbmaxint",
FIELD_GBNUMBERMAXINT,
false,
"gbmaxint:company.employees:20",
"Matches all companies with 20 or less employees "
"in a JSON document like "
"<i>{ \"company\":{\"employees\":13}} "
"</i> or, alternatively, an XML document like <i>"
"&lt;company&gt;&lt;employees&gt;13&lt;/employees&gt;"
"&lt;/company&gt;"
"</i>",
NULL,
0},
{"gbequalint",
FIELD_GBNUMBEREQUALINT,
false,
"gbequalint:company.employees:13",
"Similar to gbminint and gbmaxint but is an equality constraint.",
NULL,
0},
*/
{"gbdocspiderdate",
FIELD_GENERIC,
false,
@ -4413,114 +3997,6 @@ struct QueryField g_fields[] = {
NULL,
0},
// {"gbreplyspiderdate",FIELD_GENERIC,false,
// "Example: gbspiderdate:1400081479 will return spider log "
// "results that have "
// "that spider date timestamp (UTC)"},
/* BR 20160108: All facets disabled as test. Don't think we will need any of them */
#ifdef SUPPORT_FACETS
{"gbfacetstr",
FIELD_GBFACETSTR,
false,
"gbfacetstr:color",
"Returns facets in "
"the search results "
"by their color field. <i>color</i> is case INsensitive.",
"Facet Related Query Operators",
QTF_BEGINNEWTABLE},
{"gbfacetstr",
FIELD_GBFACETSTR,
false,
"gbfacetstr:product.color",
"Returns facets in "
"the color field in a JSON document like "
"<i>{ \"product\":{\"color\":\"red\"}} "
"</i> or, alternatively, an XML document like <i>"
"&lt;product&gt;&lt;color&gt;red&lt;/price&gt;&lt;/product&gt;"
"</i>. <i>product.color</i> is case INsensitive.",
NULL,
0},
{"gbfacetstr",
FIELD_GBFACETSTR,
false,
"gbfacetstr:gbtagsite cat",
"Returns facets from the site names of all pages "
"that contain the word 'cat' or 'cats', etc. <i>gbtagsite</i> is case insensitive."
,
NULL,
0},
{"gbfacetint", FIELD_GBFACETINT, false,
"gbfacetint:product.cores",
"Returns facets in "
"of the <i>cores</i> field in a JSON document like "
"<i>{ \"product\":{\"cores\":10}} "
"</i> or, alternatively, an XML document like <i>"
"&lt;product&gt;&lt;cores&gt;10&lt;/price&gt;&lt;/product&gt;"
"</i>. <i>product.cores</i> is case INsensitive.",
NULL,
0},
{"gbfacetint", FIELD_GBFACETINT, false,
"gbfacetint:gbhopcount",
"Returns facets in "
"of the <i>gbhopcount</i> field over the documents so you can "
"search the distribution of hopcounts over the index. <i>gbhopcount</i> is "
"case INsensitive.",
NULL,
0},
{"gbfacetint", FIELD_GBFACETINT, false,
"gbfacetint:gbtagsitenuminlinks",
"Returns facets in "
"of the <i>sitenuminlinks</i> field for the tag <i>sitenuminlinks</i>"
"in the tag for each site. Any numeric tag in tagdb can be "
"facetizeed "
"in this manner so you can add your own facets this way on a per "
"site or per url basis by making tagdb entries. Case Insensitive.",
NULL,
0},
{"gbfacetint", FIELD_GBFACETINT, false,
"gbfacetint:size,0-10,10-20,30-100,100-200,200-1000,1000-10000",
"Returns facets in "
"of the <i>size</i> field (either in json, field or a meta tag) "
"and cluster the results into the specified ranges. <i>size</i> is "
"case INsensitive.",
NULL,
0},
{"gbfacetint", FIELD_GBFACETINT, false,
"gbfacetint:gbsitenuminlinks",
"Returns facets based on # of site inlinks the site of each "
"result has. <i>gbsitenuminlinks</i> is case INsensitive.",
NULL,
0},
{"gbfacetfloat", FIELD_GBFACETFLOAT, false,
"gbfacetfloat:product.weight",
"Returns facets "
"of the <i>weight</i> field in a JSON document like "
"<i>{ \"product\":{\"weight\":1.45}} "
"</i> or, alternatively, an XML document like <i>"
"&lt;product&gt;&lt;weight&gt;1.45&lt;/price&gt;&lt;/product&gt;"
"</i>. <i>product.weight</i> is case INsensitive.",
NULL,
0},
{"gbfacetfloat", FIELD_GBFACETFLOAT, false,
"gbfacetfloat:product.price,0-1.5,1.5-5,5.0-20,20-100.0",
"Similar to above but cluster the pricess into the specified ranges. "
"<i>product.price</i> is case insensitive.",
NULL,
0},
#endif
//
// spider status docs queries
//
@ -4610,17 +4086,6 @@ struct QueryField g_fields[] = {
NULL,
0},
#ifdef SUPPORT_FACETS
{"gbssNumRedirects",
FIELD_GENERIC,
false,
"gbfacetint:gbssNumRedirects",
"Query on the number of times the url redirect when attempting to "
"index it.",
NULL,
0},
#endif
{"gbssDocId",
FIELD_GENERIC,
false,
@ -4629,26 +4094,6 @@ struct QueryField g_fields[] = {
NULL,
0},
#ifdef SUPPORT_FACETS
{"gbssHopCount",
FIELD_GENERIC,
false,
"gbfacetint:gbssHopCount",
"Query on the hop count of the document.",
NULL,
0},
{"gbssCrawlRound",
FIELD_GENERIC,
false,
"gbfacetint:gbssCrawlRound",
"Query on the crawl round number.",
NULL,
0},
#endif
{"gbssDupOfDocId",
FIELD_GENERIC,
false,
@ -4689,17 +4134,6 @@ struct QueryField g_fields[] = {
NULL,
0},
#ifdef SUPPORT_FACETS
{"gbssContentHash32",
FIELD_GENERIC,
false,
"gbfacetint:gbssContentHash32",
"The hash of the document content, excluding dates and times. Used "
"internally for deduping.",
NULL,
0},
#endif
{"gbssDownloadDurationMS",
FIELD_GENERIC,
false,
@ -4724,25 +4158,6 @@ struct QueryField g_fields[] = {
NULL,
0},
#ifdef SUPPORT_FACETS
{"gbssUsedRobotsTxt",
FIELD_GENERIC,
false,
"gbfacetint:gbssUsedRobotsTxt",
"This is 0 or 1 depending on if robots.txt was not obeyed or obeyed, "
"respectively.",
NULL,
0},
{"gbssConsecutiveErrors",
FIELD_GENERIC,
false,
"gbfacetint:gbssConsecutiveErrors",
"For the last set of indexing attempts how many were errors?",
NULL,
0},
#endif
{"gbssIp",
FIELD_GENERIC,
false,
@ -4778,65 +4193,6 @@ struct QueryField g_fields[] = {
NULL,
0},
#ifdef SUPPORT_FACETS
{"gbssContentInjected",
FIELD_GENERIC,
false,
"gbfacetint:gbssContentInjected",
"This is 0 or 1 if the content was not injected or injected, "
"respectively.",
NULL,
0},
{"gbssPercentContentChanged",
FIELD_GENERIC,
false,
"gbfacetfloat:gbssPercentContentChanged",
"A float between 0 and 100, inclusive. Represents how much "
"the document has changed since the last time we indexed it. This is "
"only valid if the document was successfully indexed this time."
"respectively.",
NULL,
0},
{"gbssSpiderPriority",
FIELD_GENERIC,
false,
"gbfacetint:gbssSpiderPriority",
"The spider priority, from 0 to 127, inclusive, of the document "
"according to the url filters table.",
NULL,
0},
{"gbssMatchingUrlFilter",
FIELD_GENERIC,
false,
"gbfacetstr:gbssMatchingUrlFilter",
"The url filter expression the document matched.",
NULL,
0},
{"gbssLanguage",
FIELD_GENERIC,
false,
"gbfacetstr:gbssLanguage",
"The language of the document. If document was empty or not "
"downloaded then this will not be present. Uses xx to mean "
"unknown language. Uses the language abbreviations found at the "
"bottom of the url filters page.",
NULL,
0},
{"gbssContentType",
FIELD_GENERIC,
false,
"gbfacetstr:gbssContentType",
"The content type of the document. Like html, xml, json, pdf, etc. "
"This field is not present if unknown.",
NULL,
0},
#endif
{"gbssContentLen",
FIELD_GENERIC,
false,
@ -4845,93 +4201,8 @@ struct QueryField g_fields[] = {
NULL,
0},
#ifdef SUPPORT_FACETS
{"gbssCrawlDelayMS",
FIELD_GENERIC,
false,
"gbfacetint:gbssCrawlDelay",
"The crawl delay according to the robots.txt of the document. "
"This is -1 if not specified in the robots.txt or not found.",
NULL,
0},
#endif
/*
{"gbssSentToDiffbotThisTime",
FIELD_GENERIC,
false,
"gbssSentToDiffbotThisTime:1",
"Was the document's url sent to diffbot for processing this time "
"of spidering the url?",
NULL,
0},
{"gbssSentToDiffbotAtSomeTime",
FIELD_GENERIC,
false,
"gbssSentToDiffbotAtSomeTime:1",
"Was the document's url sent to diffbot for processing, either this "
"time or some time before?",
NULL,
0},
{"gbssDiffbotReplyCode",
FIELD_GENERIC,
false,
"gbssDiffbotReplyCode:0",
"The reply received from diffbot. 0 means success, otherwise, it "
"indicates an error code.",
NULL,
0},
{"gbssDiffbotReplyMsg",
FIELD_GENERIC,
false,
"gbfacetstr:gbssDiffbotReplyMsg:0",
"The reply received from diffbot represented in text.",
NULL,
0},
{"gbssDiffbotReplyLen",
FIELD_GENERIC,
false,
"gbsortbyint:gbssDiffbotReplyLen",
"The length of the reply received from diffbot.",
NULL,
0},
{"gbssDiffbotReplyResponseTimeMS",
FIELD_GENERIC,
false,
"gbsortbyint:gbssDiffbotReplyResponseTimeMS",
"The time in milliseconds it took to get a reply from diffbot.",
NULL,
0},
{"gbssDiffbotReplyRetries",
FIELD_GENERIC,
false,
"gbfacetint:gbssDiffbotReplyRetries",
"The number of times we had to resend the request to diffbot "
"because diffbot returned a 504 gateway timed out error.",
NULL,
0},
{"gbssDiffbotReplyNumObjects",
FIELD_GENERIC,
false,
"gbfacetint:gbssDiffbotReplyNumObjects",
"The number of JSON objects diffbot excavated from the provided url.",
NULL,
0},
*/
// they don't need to know about this
{"gbad",FIELD_GBAD,false,"","",NULL,QTF_HIDE},
//BR 20160117 removed: {"gbtagvector", FIELD_GBTAGVECTOR, false,"","",NULL,QTF_HIDE},
{"gbsamplevector", FIELD_GBSAMPLEVECTOR, false,"","",NULL,QTF_HIDE},
{"gbcontenthash", FIELD_GBCONTENTHASH, false,"","",NULL,QTF_HIDE},
{"gbduphash" ,FIELD_GBOTHER,false,"","",NULL,QTF_HIDE},
@ -5606,7 +4877,6 @@ bool QueryTerm::isSplit() {
if(!m_fieldCode) return true;
if(m_fieldCode == FIELD_QUOTA) return false;
//BR 20160117 removed: if(m_fieldCode == FIELD_GBTAGVECTOR) return false;
//BR 20160106 removed: if(m_fieldCode == FIELD_GBGIGABITVECTOR) return false;
if(m_fieldCode == FIELD_GBSAMPLEVECTOR) return false;
if(m_fieldCode == FIELD_GBSECTIONHASH) return false;
if(m_fieldCode == FIELD_GBCONTENTHASH) return false;

25
Query.h

@ -569,15 +569,9 @@ class QueryTerm {
char m_endKey [MAX_KEY_BYTES];
char m_ks;
// used by Msg40.cpp for gigabits generation
int64_t m_hash64d;
int32_t m_popWeight;
uint64_t m_numDocsThatHaveFacet;
};
//#define MAX_OPSLOTS 256
#define MAX_EXPRESSIONS 100
// operand1 AND operand2 OR ...
@ -646,26 +640,14 @@ class Query {
int32_t serialize(char *buf, int32_t bufLen);
int32_t deserialize(char *buf, int32_t bufLen);
// . if a term is truncated in indexdb, change its '+' sign to a '*'
// . will recompute m_bitScores to fix bit #7
//void softenTruncatedTerms ( );
bool setQueryTermScores ( int64_t *termFreqsArg ) ;
// about how many hits for this query?
//int64_t getEstimatedTotalHits ( );
char *getQuery ( ) { return m_orig ; };
int32_t getQueryLen ( ) { return m_origLen; };
//int32_t getNumIgnored ( ) { return m_numIgnored; };
//int32_t getNumNotIgnored ( ) { return m_numTerms ; };
int32_t getNumTerms ( ) { return m_numTerms; };
char getTermSign ( int32_t i ) { return m_qterms[i].m_termSign; };
bool isPhrase ( int32_t i ) { return m_qterms[i].m_isPhrase; };
bool isInPhrase ( int32_t i ) { return m_qterms[i].m_inPhrase; };
bool isInQuotes ( int32_t i ) { return m_qterms[i].m_inQuotes; };
int64_t getTermId ( int32_t i ) { return m_qterms[i].m_termId; };
char getFieldCode2( int32_t i ) { return m_qterms[i].m_fieldCode; };
int64_t getRawTermId ( int32_t i ) { return m_qterms[i].m_rawTermId; };
@ -687,13 +669,6 @@ class Query {
bool isSplit(int32_t i) { return m_qterms[i].isSplit(); };
// . Msg39 calls this to get our vector so it can pass it to Msg37
// . the signs and ids are dupped in the QueryTerm classes, too
//int64_t *getTermFreqs ( ) { return m_termFreqs ; };
//int64_t getTermFreq ( int32_t i ) { return m_termFreqs[i]; };
//int64_t *getTermIds ( ) { return m_termIds ; };
//char *getTermSigns ( ) { return m_termSigns ; };
//int32_t *getComponentCodes ( ) { return m_componentCodes; };
int64_t getRawWordId ( int32_t i ) { return m_qwords[i].m_rawWordId;};
int32_t getNumComponentTerms ( ) { return m_numComponents; };

85
Rdb.cpp

@ -16,7 +16,6 @@
#include "Spider.h"
#include "SpiderColl.h"
#include "Doledb.h"
#include "Revdb.h"
#include "hash.h"
void attemptMergeAll ( int fd , void *state ) ;
@ -168,10 +167,6 @@ bool Rdb::init ( char *dir ,
if ( m_rdbId == RDB2_INDEXDB2 ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
if ( m_rdbId == RDB_POSDB ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
if ( m_rdbId == RDB2_POSDB2 ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
//if ( m_rdbId == RDB_DATEDB ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
//if ( m_rdbId == RDB2_DATEDB2 ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
if ( m_rdbId == RDB_SECTIONDB ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
if ( m_rdbId == RDB2_SECTIONDB2) m_pageSize = GB_INDEXDB_PAGE_SIZE;
if ( m_rdbId == RDB_TITLEDB ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
if ( m_rdbId == RDB2_TITLEDB2 ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
if ( m_rdbId == RDB_SPIDERDB ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
@ -180,30 +175,7 @@ bool Rdb::init ( char *dir ,
if ( m_rdbId == RDB_SERPDB ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
if ( m_rdbId == RDB_LINKDB ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
if ( m_rdbId == RDB2_LINKDB2 ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
if ( m_rdbId == RDB_REVDB ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
if ( m_rdbId == RDB2_REVDB2 ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
// let's obsolete this rec/list cache because using the
// disk page cache cleverly is usually better than this,
// because this ignores newly added data (it is not realtime),
// and it really only saves us from having to intersect a
// bunch of indexdb/datedb lists.
/*
loadCacheFromDisk = false;
maxCacheMem = 0;
maxCacheNodes = 0;
// . set up our cache
// . we could be adding lists so keep fixedDataSize -1 for cache
if ( ! m_cache.init ( maxCacheMem ,
fixedDataSize ,
true , // support lists
maxCacheNodes ,
m_useHalfKeys ,
m_dbname ,
loadCacheFromDisk ,
m_ks , // cache key size
m_ks ) ) // data key size
return false;
*/
// we can't merge more than MAX_RDB_FILES files at a time
if ( minToMerge > MAX_RDB_FILES ) minToMerge = MAX_RDB_FILES;
m_minToMerge = minToMerge;
@ -1736,17 +1708,14 @@ bool Rdb::addList ( collnum_t collnum , RdbList *list,
//! g_conf.m_rebuildNoSplits &&
//! g_conf.m_removeBadPages &&
( m_rdbId == RDB_TITLEDB ||
//m_rdbId == RDB_SECTIONDB ||
m_rdbId == RDB_PLACEDB ||
m_rdbId == RDB_TFNDB ||
m_rdbId == RDB_INDEXDB ||
m_rdbId == RDB_POSDB ||
//m_rdbId == RDB_DATEDB ||
m_rdbId == RDB_POSDB ||
m_rdbId == RDB_CLUSTERDB ||
m_rdbId == RDB_LINKDB ||
m_rdbId == RDB_DOLEDB ||
m_rdbId == RDB_SPIDERDB ||
m_rdbId == RDB_REVDB ) ) {
m_rdbId == RDB_SPIDERDB ) ) {
// exception, spider status docs can be deleted from titledb
// if user turns off 'index spider replies' before doing
@ -1765,20 +1734,6 @@ bool Rdb::addList ( collnum_t collnum , RdbList *list,
exception:
/*
if ( g_repair.isRepairActive() &&
g_repair.m_fullRebuild &&
collnum != g_repair.m_newCollnum &&
m_rdbId != RDB_TAGDB ) {
log("db: How did an add come in while in full repair mode?"
" addCollnum=%"INT32" repairCollnum=%"INT32" db=%s",
(int32_t)collnum , (int32_t)g_repair.m_newCollnum ,
m_dbname );
g_errno = EREPAIRING;
return false;
}
*/
// if we are currently in a quickpoll, make sure we are not in
// RdbTree::getList(), because we could mess that loop up by adding
// or deleting a record into/from the tree now
@ -2811,23 +2766,19 @@ Rdb *getRdbFromId ( uint8_t rdbId ) {
s_table9 [ RDB_INDEXDB ] = g_indexdb.getRdb();
s_table9 [ RDB_POSDB ] = g_posdb.getRdb();
s_table9 [ RDB_TITLEDB ] = g_titledb.getRdb();
s_table9 [ RDB_SECTIONDB ] = g_sectiondb.getRdb();
s_table9 [ RDB_SPIDERDB ] = g_spiderdb.getRdb();
s_table9 [ RDB_DOLEDB ] = g_doledb.getRdb();
s_table9 [ RDB_CLUSTERDB ] = g_clusterdb.getRdb();
s_table9 [ RDB_LINKDB ] = g_linkdb.getRdb();
s_table9 [ RDB_STATSDB ] = g_statsdb.getRdb();
s_table9 [ RDB_REVDB ] = g_revdb.getRdb();
s_table9 [ RDB_PARMDB ] = NULL;
s_table9 [ RDB2_INDEXDB2 ] = g_indexdb2.getRdb();
s_table9 [ RDB2_POSDB2 ] = g_posdb2.getRdb();
s_table9 [ RDB2_TITLEDB2 ] = g_titledb2.getRdb();
s_table9 [ RDB2_SECTIONDB2 ] = g_sectiondb2.getRdb();
s_table9 [ RDB2_SPIDERDB2 ] = g_spiderdb2.getRdb();
s_table9 [ RDB2_CLUSTERDB2 ] = g_clusterdb2.getRdb();
s_table9 [ RDB2_LINKDB2 ] = g_linkdb2.getRdb();
s_table9 [ RDB2_REVDB2 ] = g_revdb2.getRdb();
s_table9 [ RDB2_TAGDB2 ] = g_tagdb2.getRdb();
}
if ( rdbId >= RDB_END ) return NULL;
@ -2840,22 +2791,18 @@ char getIdFromRdb ( Rdb *rdb ) {
if ( rdb == g_indexdb.getRdb () ) return RDB_INDEXDB;
if ( rdb == g_posdb.getRdb () ) return RDB_POSDB;
if ( rdb == g_titledb.getRdb () ) return RDB_TITLEDB;
if ( rdb == g_sectiondb.getRdb () ) return RDB_SECTIONDB;
if ( rdb == g_spiderdb.getRdb () ) return RDB_SPIDERDB;
if ( rdb == g_doledb.getRdb () ) return RDB_DOLEDB;
if ( rdb == g_clusterdb.getRdb () ) return RDB_CLUSTERDB;
if ( rdb == g_statsdb.getRdb () ) return RDB_STATSDB;
if ( rdb == g_linkdb.getRdb () ) return RDB_LINKDB;
if ( rdb == g_revdb.getRdb () ) return RDB_REVDB;
if ( rdb == g_indexdb2.getRdb () ) return RDB2_INDEXDB2;
if ( rdb == g_posdb2.getRdb () ) return RDB2_POSDB2;
if ( rdb == g_tagdb2.getRdb () ) return RDB2_TAGDB2;
if ( rdb == g_titledb2.getRdb () ) return RDB2_TITLEDB2;
if ( rdb == g_sectiondb2.getRdb () ) return RDB2_SECTIONDB2;
if ( rdb == g_spiderdb2.getRdb () ) return RDB2_SPIDERDB2;
if ( rdb == g_clusterdb2.getRdb () ) return RDB2_CLUSTERDB2;
if ( rdb == g_linkdb2.getRdb () ) return RDB2_LINKDB2;
if ( rdb == g_revdb2.getRdb () ) return RDB2_REVDB2;
log(LOG_LOGIC,"db: getIdFromRdb: no rdbId for %s.",rdb->m_dbname);
return 0;
@ -2868,12 +2815,10 @@ char isSecondaryRdb ( uint8_t rdbId ) {
case RDB2_POSDB2 : return true;
case RDB2_TAGDB2 : return true;
case RDB2_TITLEDB2 : return true;
case RDB2_SECTIONDB2 : return true;
case RDB2_PLACEDB2 : return true;
case RDB2_SPIDERDB2 : return true;
case RDB2_TFNDB2 : return true;
case RDB2_CLUSTERDB2 : return true;
case RDB2_REVDB2 : return true;
case RDB2_LINKDB2 : return true;
}
return false;
@ -2898,13 +2843,9 @@ char getKeySizeFromRdbId ( uint8_t rdbId ) {
i == RDB_SPIDERDB ||
i == RDB_TAGDB ||
i == RDB_SYNCDB ||
i == RDB_SECTIONDB ||
i == RDB_PLACEDB ||
//i == RDB2_DATEDB2 ||
i == RDB2_SPIDERDB2 ||
i == RDB2_TAGDB2 ||
i == RDB2_SECTIONDB2 ||
i == RDB2_PLACEDB2 )
ks = 16;
if ( i == RDB_POSDB || i == RDB2_POSDB2 )
@ -2942,11 +2883,9 @@ int32_t getDataSizeFromRdbId ( uint8_t rdbId ) {
i == RDB_TFNDB ||
i == RDB_CLUSTERDB ||
i == RDB_DATEDB ||
//i == RDB_FAKEDB ||
i == RDB_LINKDB )
ds = 0;
else if ( i == RDB_TITLEDB ||
i == RDB_REVDB ||
i == RDB_SYNCDB ||
i == RDB_CACHEDB ||
i == RDB_SERPDB ||
@ -2960,8 +2899,6 @@ int32_t getDataSizeFromRdbId ( uint8_t rdbId ) {
ds = -1;
else if ( i == RDB_STATSDB )
ds = sizeof(StatData);
else if ( i == RDB_SECTIONDB )
ds = sizeof(SectionVote);
else if ( i == RDB2_POSDB2 ||
i == RDB2_INDEXDB2 ||
i == RDB2_TFNDB2 ||
@ -2970,23 +2907,17 @@ int32_t getDataSizeFromRdbId ( uint8_t rdbId ) {
i == RDB2_DATEDB2 )
ds = 0;
else if ( i == RDB2_TITLEDB2 ||
i == RDB2_REVDB2 ||
i == RDB2_TAGDB2 ||
i == RDB2_CATDB2 ||
i == RDB2_SPIDERDB2 ||
i == RDB2_PLACEDB2 )
ds = -1;
else if ( i == RDB2_SECTIONDB2 )
ds = sizeof(SectionVote);
else { char *xx=NULL;*xx=0; }
// get the rdb for this rdbId
//Rdb *rdb = getRdbFromId ( i );
// sanity check
//if ( ! rdb ) continue;//{ char *xx=NULL;*xx=0; }
// sanity!
//if ( rdb->m_ks == 0 ) { char *xx=NULL;*xx=0; }
else {
continue;
}
// set the table
s_table2[i] = ds;//rdb->m_fixedDataSize;
s_table2[i] = ds;
}
}
return s_table2[rdbId];

File diff suppressed because it is too large

@ -80,40 +80,27 @@ public:
Msg5 m_msg5b;
Msg4 m_msg4;
bool m_needsCallback;
//Msg50 m_msg50;
char m_docQuality;
//Msg14 m_msg14;
//RdbList m_scanList;
RdbList m_titleRecList;
int64_t m_docId;
char m_isDelete;
RdbList m_ulist;
RdbList m_addlist;
//int32_t m_ruleset;
//LinkTextReply m_rootLinkText;
int64_t m_totalMem;
int32_t m_stage ;
int32_t m_tfn;
int32_t m_count;
bool m_updated;
//key_t m_currentTitleRecKey; // for tfndb
// titledb scan vars
//key_t m_nextRevdbKey;
key_t m_nextTitledbKey;
key_t m_nextSpiderdbKey;
//key_t m_nextIndexdbKey;
key_t m_nextPosdbKey;
//key_t m_nextDatedbKey;
key128_t m_nextLinkdbKey;
//key128_t m_nextPlacedbKey;
key_t m_endKey;
int64_t m_uh48;
//TitleRec m_tr;
//Msg8a m_msg8a;
int32_t m_priority;
uint64_t m_contentHash;
//key_t m_tfndbKey;
key_t m_clusterdbKey ;
key_t m_spiderdbKey;
char m_srBuf[SR_BUFSIZE];
@ -127,8 +114,6 @@ public:
// spiderdb scan vars
bool m_isNew;
//SpiderRec m_sr;
//SiteRec m_siteRec;
TagRec m_tagRec;
@ -139,8 +124,6 @@ public:
int64_t m_prevDocId;
bool m_completedFirstScan ;
bool m_completedSpiderdbScan ;
//bool m_completedIndexdbScan ;
//key_t m_lastRevdbKey;
key_t m_lastTitledbKey;
key_t m_lastSpiderdbKey;
@ -158,7 +141,6 @@ public:
int64_t m_recsRoot;
int64_t m_recsNonRoot;
int64_t m_recsInjected;
//int32_t m_fn;
// spiderdb scan stats
int32_t m_spiderRecsScanned ;
@ -168,21 +150,13 @@ public:
// generic scan parms
char m_rebuildTitledb ;
//char m_rebuildIndexdb ;
char m_rebuildPosdb ;
//char m_rebuildNoSplits ;
//char m_rebuildDatedb ;
//char m_rebuildTfndb ;
char m_rebuildClusterdb ;
char m_rebuildSpiderdb ;
char m_rebuildSitedb ;
char m_rebuildLinkdb ;
char m_rebuildTagdb ;
//char m_rebuildPlacedb ;
//char m_rebuildSectiondb ;
//char m_rebuildRevdb ;
char m_fullRebuild ;
//char m_removeBadPages ;
char m_rebuildRoots ;
char m_rebuildNonRoots ;
@ -208,7 +182,6 @@ public:
char m_SAVE_END;
// i'd like to save these but they are ptrs
//char *m_coll;
CollectionRec *m_cr;
//for timing a repair process

169
Revdb.cpp

@ -1,169 +0,0 @@
#include "gb-include.h"
#include "Revdb.h"
#include "Threads.h"
Revdb g_revdb;
Revdb g_revdb2;
// reset rdb
void Revdb::reset() { m_rdb.reset(); }
// init our rdb
bool Revdb::init ( ) {
int64_t maxTreeMem = 200000000;
// . what's max # of tree nodes?
// . assume avg RevRec size (compressed html doc) is about 1k, we get:
// . NOTE: overhead is about 32 bytes per node
int32_t maxTreeNodes = maxTreeMem / (1*1024);
// each entry in the cache is usually just a single record, no lists
int32_t maxCacheNodes = 0;//g_conf.m_revdbMaxCacheMem / (10*1024);
// initialize our own internal rdb
if ( ! m_rdb.init ( g_hostdb.m_dir ,
"revdb" ,
true , // dedup same keys?
-1 , // fixed record size
// this should not really be changed...
2 , // min files to merge
maxTreeMem,//g_conf.m_revdbMaxTreeMem ,
maxTreeNodes ,
// now we balance so Sync.cpp can order huge lists
true , // balance tree?
0 , // cache mem
maxCacheNodes ,
false ,// half keys?
false ,// g_conf.m_revdbSav
NULL , // page cache ptr
false ) )// is titledb?
return false;
return true;
}
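// worked out: 200000000 bytes of tree mem at ~1KB per record gives
// maxTreeNodes = 200000000 / 1024 = 195312; the ~32 bytes/node of
// overhead is only ~6MB total, noise next to the record data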
// init the rebuild/secondary rdb, used by PageRepair.cpp
bool Revdb::init2 ( int32_t treeMem ) {
// . what's max # of tree nodes?
// . assume avg RevRec size (compressed html doc) is about 1k, we get:
// . NOTE: overhead is about 32 bytes per node
int32_t maxTreeNodes = treeMem / (1*1024);
// initialize our own internal rdb
if ( ! m_rdb.init ( g_hostdb.m_dir ,
"revdbRebuild" ,
true , // dedup same keys?
-1 , // fixed record size
240 , // MinFilesToMerge
treeMem ,
maxTreeNodes ,
// now we balance so Sync.cpp can order huge lists
true , // balance tree?
0 , // MaxCacheMem ,
0 , // maxCacheNodes
false , // half keys?
false , // revdbSaveCache
NULL , // page cache ptr
false ) )// is titledb?
return false;
return true;
}
/*
bool Revdb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;
// verify
if ( verify(coll) ) return true;
// if not allowing scale, return false
if ( ! g_conf.m_allowScale ) return false;
// otherwise let it go
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}
*/
bool Revdb::verify ( char *coll ) {
log ( LOG_INFO, "db: Verifying Revdb for coll %s...", coll );
g_threads.disableThreads();
Msg5 msg5;
Msg5 msg5b;
RdbList list;
key_t startKey;
key_t endKey;
startKey.setMin();
endKey.setMax();
//int32_t minRecSizes = 64000;
CollectionRec *cr = g_collectiondb.getRec(coll);
if ( ! msg5.getList ( RDB_REVDB ,
cr->m_collnum ,
&list ,
startKey ,
endKey ,
1024*1024 , // minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
NULL , // cache key ptr
0 , // retry num
-1 , // maxRetries
true , // compensate for merge
-1LL , // sync point
&msg5b ,
false )) {
g_threads.enableThreads();
return log("db: HEY! it did not block");
}
int32_t count = 0;
int32_t got = 0;
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
key_t k = list.getCurrentKey();
count++;
//uint32_t groupId = getGroupId ( RDB_REVDB , &k );
//if ( groupId == g_hostdb.m_groupId ) got++;
uint32_t shardNum = getShardNum( RDB_REVDB , &k );
if ( shardNum == getMyShardNum() ) got++;
}
if ( got != count ) {
log ("db: Out of first %"INT32" records in revdb, "
"only %"INT32" belong to our group.",count,got);
// exit if NONE, we probably got the wrong data
if ( count > 10 && got == 0 )
log("db: Are you sure you have the right "
"data in the right directory? "
"Exiting.");
log ( "db: Exiting due to Revdb inconsistency." );
g_threads.enableThreads();
return g_conf.m_bypassValidation;
}
log ( LOG_INFO, "db: Revdb passed verification successfully for %"INT32""
" recs.", count );
// DONE
g_threads.enableThreads();
return true;
}
// . make the key of a RevRec from a docId
// . remember to set the low bit so it's not a delete
// . hi bits are set in the key
key_t Revdb::makeKey ( int64_t docId, bool isDel ){
key_t key ;
key.n1 = 0;
// shift up for delbit
key.n0 = ((uint64_t)docId) << 1;
// final del bit
if ( ! isDel ) key.n0 |= 0x01;
return key;
};
int64_t Revdb::getDocId ( key_t *k ) {
return (k->n0 >> 1);
}
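// round trip, worked: makeKey(5,false) packs n0 = (5<<1)|1 = 11, and
// getDocId recovers 11>>1 = 5; makeKey(5,true) leaves the low bit
// clear (n0 = 10), marking the record as a delete, and getDocId
// still recovers 10>>1 = 5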

52
Revdb.h

@ -1,52 +0,0 @@
// Matt Wells, copyright Jun 2001
// . db of metalists used to delete a doc now
#ifndef _REVDB_H_
#define _REVDB_H_
#include "Rdb.h"
#include "Url.h"
#include "Conf.h"
#include "Xml.h"
#include "Titledb.h"
// new key format:
// . <docId> - 38 bits
// . <delBit> - 1 bit
// data format:
// . a metalist that is passed in to Msg4
class Revdb {
public:
// reset rdb
void reset();
bool verify ( char *coll );
bool addColl ( char *coll, bool doVerify = true );
// init m_rdb
bool init ();
// init secondary/rebuild revdb
bool init2 ( int32_t treeMem ) ;
// like titledb basically
key_t makeKey ( int64_t docId , bool del ) ;
int64_t getDocId ( key_t *k );
Rdb *getRdb() { return &m_rdb; };
// holds binary format rev entries
Rdb m_rdb;
};
extern class Revdb g_revdb;
extern class Revdb g_revdb2;
#endif

@ -7,7 +7,7 @@
#include "Words.h"
#include "Sections.h"
SafeBuf::SafeBuf(int32_t initSize, char *label ) {
SafeBuf::SafeBuf(int32_t initSize, const char *label ) {
if(initSize <= 0) initSize = 1;
m_capacity = initSize;
m_length = 0;
@ -36,11 +36,11 @@ SafeBuf::SafeBuf() {
m_label = NULL;
}
void SafeBuf::setLabel ( char *label ) {
void SafeBuf::setLabel ( const char *label ) {
m_label = label;
}
SafeBuf::SafeBuf(char* stackBuf, int32_t cap, char* label) {
SafeBuf::SafeBuf(char* stackBuf, int32_t cap, const char* label) {
m_usingStack = true;
m_capacity = cap;
m_buf = stackBuf;
@ -133,7 +133,7 @@ bool SafeBuf::safeMemcpy(const char *s, int32_t len) {
return true;
}
bool SafeBuf::safeMemcpy_nospaces(char *s, int32_t len) {
bool SafeBuf::safeMemcpy_nospaces(const char *s, int32_t len) {
// put a silent \0 at the end
int32_t tmp = len + m_length+1;
if(tmp >= m_capacity ) {
@ -158,7 +158,7 @@ bool SafeBuf::safeMemcpy ( Words *w , int32_t a , int32_t b ) {
return safeMemcpy ( p , pend - p );
}
char* SafeBuf::pushStr (char* str, uint32_t len) {
char* SafeBuf::pushStr (const char* str, uint32_t len) {
int32_t initLen = m_length;
bool status = safeMemcpy ( str , len );
status &= nullTerm();
@ -273,7 +273,7 @@ bool SafeBuf::cat(SafeBuf& c) {
return safeMemcpy(c.getBufStart(), c.length());
}
bool SafeBuf::reserve(int32_t i , char *label, bool clearIt ) {
bool SafeBuf::reserve(int32_t i , const char *label, bool clearIt ) {
// if we don't already have a label and they provided one, use it
if ( ! m_label ) {
@ -333,7 +333,7 @@ bool SafeBuf::reserve(int32_t i , char *label, bool clearIt ) {
//reserve this many bytes, if we need to alloc, we double the
//buffer size.
bool SafeBuf::reserve2x(int32_t i, char *label) {
bool SafeBuf::reserve2x(int32_t i, const char *label) {
//watch out for overflow!
if((m_capacity << 1) + i < m_capacity) return false;
if(i + m_length >= m_capacity)
@ -433,7 +433,7 @@ int32_t SafeBuf::safeSave (char *filename ) {
}
int32_t SafeBuf::fillFromFile(char *dir,char *filename,char *label) {
int32_t SafeBuf::fillFromFile(const char *dir, const char *filename, const char *label) {
m_label = label;
char buf[1024];
if ( dir ) snprintf(buf,1024,"%s/%s",dir,filename);
@ -451,7 +451,7 @@ char *SafeBuf::getNextLine ( char *p ) {
}
// returns -1 on error
int32_t SafeBuf::catFile(char *filename) {
int32_t SafeBuf::catFile(const char *filename) {
SafeBuf sb2;
if ( sb2.fillFromFile(filename) < 0 ) return -1;
// add 1 for a null
@ -462,7 +462,7 @@ int32_t SafeBuf::catFile(char *filename) {
// returns -1 on error
int32_t SafeBuf::fillFromFile(char *filename) {
int32_t SafeBuf::fillFromFile(const char *filename) {
struct stat results;
if (stat(filename, &results) != 0) {
// An error occurred
@ -1135,7 +1135,7 @@ bool SafeBuf::addTag ( Tag *tag ) {
}
// this puts a \0 at the end but does not update m_length for the \0
bool SafeBuf::safeStrcpy ( char *s ) {
bool SafeBuf::safeStrcpy ( const char *s ) {
if ( ! s ) return true;
int32_t slen = gbstrlen(s);
if ( ! reserve ( slen+1 ) ) return false;
@ -1565,7 +1565,7 @@ void SafeBuf::replaceChar ( char src , char dst ) {
// encode a double quote char to two double quote chars
bool SafeBuf::csvEncode ( char *s , int32_t len , int32_t niceness ) {
bool SafeBuf::csvEncode ( const char *s , int32_t len , int32_t niceness ) {
if ( ! s ) return true;
@ -1578,7 +1578,7 @@ bool SafeBuf::csvEncode ( char *s , int32_t len , int32_t niceness ) {
//char *dstEnd = m_buf + m_capacity;
// scan through all
char *send = s + len;
const char *send = s + len;
for ( ; s < send ; s++ ) {
// breathe
QUICKPOLL ( niceness );
@ -1603,9 +1603,9 @@ bool SafeBuf::csvEncode ( char *s , int32_t len , int32_t niceness ) {
return true;
}
bool SafeBuf::base64Encode ( char *sx , int32_t len , int32_t niceness ) {
bool SafeBuf::base64Encode ( const char *sx , int32_t len , int32_t niceness ) {
unsigned char *s = (unsigned char *)sx;
const unsigned char *s = (const unsigned char *)sx;
if ( ! s ) return true;
@ -1630,7 +1630,7 @@ bool SafeBuf::base64Encode ( char *sx , int32_t len , int32_t niceness ) {
unsigned char val;
// scan through all
unsigned char *send = s + len;
const unsigned char *send = s + len;
for ( ; s < send ; ) {
// breathe
QUICKPOLL ( niceness );
@ -1696,7 +1696,7 @@ bool SafeBuf::base64Encode( char *s ) {
return base64Encode(s,gbstrlen(s));
}
bool SafeBuf::base64Decode ( char *src , int32_t srcLen , int32_t niceness ) {
bool SafeBuf::base64Decode ( const char *src , int32_t srcLen , int32_t niceness ) {
// make the map
static unsigned char s_bmap[256];

@ -17,17 +17,17 @@ class SafeBuf {
public:
//*TRUCTORS
SafeBuf();
SafeBuf(int32_t initSize, char *label);
SafeBuf(int32_t initSize, const char *label);
void constructor();
//be careful with passing in a stackBuf! it could go out
//of scope independently of the safebuf.
SafeBuf(char* stackBuf, int32_t cap, char* label = NULL);
SafeBuf(char* stackBuf, int32_t cap, const char* label = NULL);
SafeBuf(char *heapBuf, int32_t bufMax, int32_t bytesInUse, bool ownData);
~SafeBuf();
void setLabel ( char *label );
void setLabel ( const char *label );
// CAUTION: BE CAREFUL WHEN USING THE FOLLOWING TWO FUNCTIONS!!
// setBuf() allows you to reset the contents of the SafeBuf to either
@ -68,11 +68,11 @@ public:
// saves to tmp file and if that succeeds then renames to orig filename
int32_t safeSave (char *filename );
int32_t fillFromFile(char *filename);
int32_t fillFromFile(char *dir,char *filename, char *label=NULL);
int32_t load(char *dir,char *fname,char *label = NULL) {
int32_t fillFromFile(const char *filename);
int32_t fillFromFile(const char *dir, const char *filename, const char *label=NULL);
int32_t load(const char *dir, const char *fname, const char *label = NULL) {
return fillFromFile(dir,fname,label);};
int32_t load(char *fname) { return fillFromFile(fname);};
int32_t load(const char *fname) { return fillFromFile(fname);};
bool safeTruncateEllipsis ( char *src , int32_t maxLen );
bool safeTruncateEllipsis ( char *src , int32_t srcLen, int32_t maxLen );
@ -103,21 +103,21 @@ public:
#else
bool safePrintf(const char *formatString, ...);
#endif
bool safeMemcpy(void *s, int32_t len){return safeMemcpy((char *)s,len);}
bool safeMemcpy(const void *s, int32_t len){return safeMemcpy((const char*)s,len);}
bool safeMemcpy(const char *s, int32_t len);
bool safeMemcpy_nospaces(char *s, int32_t len);
bool safeMemcpy_nospaces(const char *s, int32_t len);
bool safeMemcpy(SafeBuf *c){return safeMemcpy(c->m_buf,c->m_length);}
bool safeMemcpy ( class Words *w , int32_t a , int32_t b ) ;
bool safeStrcpy ( char *s ) ;
bool safeStrcpy ( const char *s ) ;
//bool safeStrcpyPrettyJSON ( char *decodedJson ) ;
bool safeUtf8ToJSON ( const char *utf8 ) ;
bool jsonEncode ( const char *utf8 ) { return safeUtf8ToJSON(utf8); }
bool jsonEncode ( char *utf8 , int32_t utf8Len );
bool csvEncode ( char *s , int32_t len , int32_t niceness = 0 );
bool csvEncode ( const char *s , int32_t len , int32_t niceness = 0 );
bool base64Encode ( char *s , int32_t len , int32_t niceness = 0 );
bool base64Decode ( char *src , int32_t srcLen , int32_t niceness = 0 ) ;
bool base64Encode ( const char *s , int32_t len , int32_t niceness = 0 );
bool base64Decode ( const char *src , int32_t srcLen , int32_t niceness = 0 ) ;
bool base64Encode( char *s ) ;
@ -132,8 +132,8 @@ public:
// . if clearIt is true we init the new buffer space to zeroes
// . used by Collectiondb.cpp
bool reserve(int32_t i, char *label=NULL , bool clearIt = false );
bool reserve2x(int32_t i, char *label = NULL );
bool reserve(int32_t i, const char *label=NULL , bool clearIt = false );
bool reserve2x(int32_t i, const char *label = NULL );
char *makeSpace ( int32_t size ) {
if ( ! reserve ( size ) ) return NULL;
@ -147,7 +147,7 @@ public:
};
void setLength(int32_t i) { m_length = i; }
char *getNextLine ( char *p ) ;
int32_t catFile(char *filename) ;
int32_t catFile(const char *filename) ;
void detachBuf();
bool insert ( class SafeBuf *c , int32_t insertPos ) ;
@ -266,7 +266,7 @@ public:
// hack off trailing 0's
bool printFloatPretty ( float f ) ;
char* pushStr (char* str, uint32_t len);
char* pushStr (const char* str, uint32_t len);
bool pushPtr ( void *ptr );
bool pushLong (int32_t i);
bool pushLongLong (int64_t i);
@ -307,7 +307,7 @@ public:
protected:
char *m_buf;
public:
char *m_label;
const char *m_label;
bool m_usingStack;
int16_t m_encoding; // output charset
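The net effect of the const sweep through SafeBuf: string literals now pass cleanly as labels and payloads, with no casts and no -Wwrite-strings noise. A minimal usage sketch against the API as declared above:

#include "SafeBuf.h"

bool sketchBuildGreeting ( SafeBuf *out ) {
	SafeBuf sb ( 256 , "greetbuf" );       // literal label, now const-clean
	if ( ! sb.safeStrcpy ( "hello, " ) ) return false;
	if ( ! sb.safeMemcpy ( "world" , 5 ) ) return false;
	if ( ! sb.nullTerm ( ) ) return false; // nullTerm as used in pushStr
	return out->safeMemcpy ( &sb );        // SafeBuf* overload above
}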

@ -61,59 +61,13 @@ key_t SearchInput::makeKey ( ) {
// space separated, NULL terminated, list of meta tag names to display
if ( m_displayMetas )
k.n0 = hash64b ( m_displayMetas , k.n0 );
// name of collection in external cluster to get titleRecs for
// related pages from
//if ( m_rp_getExternalPages && m_rp_externalColl )
// k.n0 = hash64b ( m_rp_externalColl , k.n0 );
// collection we import from
//if ( m_importColl )
// k.n0 = hash64b ( m_importColl , k.n0 );
// the special query parm
//if ( m_sq && m_sqLen > 0 )
// k.n0 = hash64 ( m_sq , m_sqLen , k.n0 );
//if ( m_noDocIds && m_noDocIdsLen )
// k.n0 = hash64 ( m_noDocIds , m_noDocIdsLen , k.n0 );
//if ( m_noSiteIds && m_noSiteIdsLen )
// k.n0 = hash64 ( m_noSiteIds , m_noSiteIdsLen , k.n0 );
// no need to hash these again separately, they are in between
// m_START and m_END_HASH
// language
//if ( m_language )
// k.n0 = hash64 ( m_language , k.n0 );
//if ( m_gblang )
// k.n0 = hash64 ( m_gblang , k.n0 );
// . now include the hash of the search parameters
// . not including m_docsToScanForTopics since we got TopicGroups
char *a = ((char *)&m_START) + 4 ; // msg40->m_dpf;
char *b = (char *)&m_END_HASH ; // msg40->m_topicGroups;
int32_t size = b - a;
// push and flush some parms that should not contribute
//int32_t save1 = m_refs_numToDisplay;
//int32_t save2 = m_rp_numToDisplay;
//int32_t save3 = m_numTopicsToDisplay;
//m_refs_numToDisplay = 0;
//m_rp_numToDisplay = 0;
//m_numTopicsToDisplay = 0;
// and hash it all up
k.n0 = hash64 ( a , size , k.n0 );
// and pop out the parms that did not contribute
//m_refs_numToDisplay = save1;
//m_rp_numToDisplay = save2;
//m_numTopicsToDisplay = save3;
// hash each topic group
for ( int32_t i = 0 ; i < 1 ; i++ ) {
TopicGroup *t = &m_topicGroups[i];
//k.n0 = hash64 ( t->m_numTopics , k.n0 );
k.n0 = hash64 ( t->m_maxTopics , k.n0 );
k.n0 = hash64 ( t->m_docsToScanForTopics , k.n0 );
k.n0 = hash64 ( t->m_minTopicScore , k.n0 );
k.n0 = hash64 ( t->m_maxWordsPerTopic , k.n0 );
k.n0 = hash64b( t->m_meta , k.n0 );
k.n0 = hash64 ( t->m_delimeter , k.n0 );
k.n0 = hash64 ( t->m_useIdfForTopics , k.n0 );
k.n0 = hash64 ( t->m_dedup , k.n0 );
}
// . boolean queries have operators (AND OR NOT ( ) ) that we need
// to consider in this hash as well
// . so just hash the whole damn query
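To make the m_START/m_END_HASH marker trick above concrete, here is a self-contained sketch; the parm names and the FNV-style stand-in for hash64() are illustrative assumptions, not the real SearchInput layout:

#include <cstdint>

// stand-in for the codebase's hash64(buf,len,seed), just so this compiles
static uint64_t hash64 ( const char *buf , int32_t len , uint64_t h ) {
	for ( int32_t i = 0 ; i < len ; i++ ) {
		h ^= (unsigned char)buf[i];
		h *= 0x100000001b3ULL;
	}
	return h;
}

struct KeyedParms {
	int32_t m_START;        // marker: hashing starts just past this
	int32_t m_familyFilter; // illustrative parms in between...
	int32_t m_docsToScan;
	int32_t m_END_HASH;     // marker: hashing stops here

	uint64_t makeKey ( uint64_t seed ) const {
		const char *a = ((const char *)&m_START) + sizeof(m_START);
		const char *b =  (const char *)&m_END_HASH;
		// one call covers every parm between the markers; this
		// relies on there being no uninitialized padding bytes
		return hash64 ( a , (int32_t)(b - a) , seed );
	}
};

The design choice is that adding a parm between the two markers automatically changes the key; only parms outside the bracketed range need hashing by hand, as with m_displayMetas above.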
@ -313,18 +267,13 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
// now override automatic defaults for special cases
if ( tmpFormat != FORMAT_HTML ) {
m_familyFilter = 0;
m_numTopicsToDisplay = 0;
m_doQueryHighlighting = 0;
//m_spellCheck = 0;
m_getDocIdScoringInfo = false;
// turn gigabits off by default if not html
//m_docsToScanForTopics = 0;
}
// if they have a list of sites...
if ( m_sites && m_sites[0] ) {
m_doSiteClustering = false;
m_ipRestrictForTopics = false;
}
@ -576,18 +525,10 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
m_doSiteClustering = true;
// turn off some parms
if ( m_q.m_hasUrlField )
m_ipRestrictForTopics = false;
if ( m_q.m_hasIpField )
m_ipRestrictForTopics = false;
if ( m_q.m_hasPositiveSiteField ) {
m_ipRestrictForTopics = false;
m_doSiteClustering = false;
}
if ( cr && ! cr->m_ipRestrict )
m_ipRestrictForTopics = false;
if ( m_q.m_hasQuotaField ) {
m_doSiteClustering = false;
m_doDupContentRemoval = false;
@ -629,36 +570,6 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
// save it
m_rcache = readFromCache;
//
// TODO: use Parms.cpp defaults
//
TopicGroup *tg = &m_topicGroups[0];
//
//
// gigabits
//
//
tg->m_numTopics = 50;
tg->m_maxTopics = 50;
tg->m_docsToScanForTopics = m_docsToScanForTopics;
tg->m_minTopicScore = 0;
tg->m_maxWordsPerTopic = 6;
tg->m_meta[0] = '\0';
tg->m_delimeter = '\0';
tg->m_useIdfForTopics = false;
tg->m_dedup = true;
// need to be on at least 2 pages!
tg->m_minDocCount = 2;
tg->m_ipRestrict = m_ipRestrictForTopics;
tg->m_dedupSamplePercent = 80;
tg->m_topicRemoveOverlaps = true;
tg->m_topicSampleSize = 4096;
// max sequential punct chars allowed in a topic
tg->m_topicMaxPunctLen = 1;
return true;
}

@ -22,27 +22,6 @@
#define MAX_TOPIC_GROUPS 1
// . parameters used to generate a set of related topics (gigabits)
// . you can have Msg24 generate multiple sets of related topics in one call
class TopicGroup {
public:
int32_t m_numTopics;
int32_t m_maxTopics;
int32_t m_docsToScanForTopics;
int32_t m_minTopicScore;
int32_t m_maxWordsPerTopic;
char m_meta[32];
char m_delimeter;
bool m_useIdfForTopics;
bool m_dedup;
int32_t m_minDocCount ;
bool m_ipRestrict ;
char m_dedupSamplePercent; // -1 means no deduping
bool m_topicRemoveOverlaps;
int32_t m_topicSampleSize;
int32_t m_topicMaxPunctLen;
};
class SearchInput {
public:
@ -53,9 +32,6 @@ class SearchInput {
void test ( );
key_t makeKey ( ) ;
// private
void setTopicGroups ( class HttpRequest *r ,
class CollectionRec *cr ) ;
bool setQueryBuffers ( class HttpRequest *hr ) ;
//void setToDefaults ( class CollectionRec *cr , int32_t niceness ) ;
@ -110,7 +86,6 @@ class SearchInput {
char m_isCollAdmin;
// these are set from things above
TopicGroup m_topicGroups [ MAX_TOPIC_GROUPS ];// msg40
SafeBuf m_sbuf1;
SafeBuf m_sbuf2;
@ -146,7 +121,6 @@ class SearchInput {
char m_wcache; // msg40
char m_debug; // msg40
char m_debugGigabits;
char m_spiderResults;
char m_spiderResultRoots;
@ -157,7 +131,6 @@ class SearchInput {
// do not include these in makeKey()
int32_t m_numTopicsToDisplay;
int32_t m_refs_numToDisplay;
int32_t m_rp_numToDisplay;
@ -204,7 +177,6 @@ class SearchInput {
char m_excludeMetaText;
char m_doBotDetection;
int32_t m_includeCachedCopy;
char m_getSectionVotingInfo;
char m_familyFilter; // msg40
char m_showErrors;
char m_doSiteClustering; // msg40
@ -228,18 +200,6 @@ class SearchInput {
char *m_filetype;
// . related topic (gigabits) parameters
// . TODO: prepend m_top_ to these var names
int32_t m_docsToScanForTopics; // msg40
int32_t m_minTopicScore; // msg40
int32_t m_minDocCount; // msg40
int32_t m_dedupSamplePercent; // msg40
int32_t m_maxWordsPerTopic; // msg40
int32_t m_ipRestrictForTopics; // msg40
char m_returnDocIdCount; // msg40
char m_returnDocIds; // msg40
char m_returnPops; // msg40
// . reference page parameters
// . copied from CollectionRec.h
int32_t m_refs_numToGenerate; // msg40
@ -306,12 +266,9 @@ class SearchInput {
int32_t m_docsToScanForReranking;
float m_pqr_demFactSubPhrase;
float m_pqr_demFactCommonInlinks;
float m_pqr_demFactLocTitle;
float m_pqr_demFactLocSummary;
float m_pqr_demFactProximity;
float m_pqr_demFactInSection;
float m_pqr_demFactOrigScore;
bool m_pqr_demInTopics;
// . buzz stuff (buzz)
// . these controls the set of results, so should be in the makeKey()
// as it is, in between the start and end hash vars
@ -348,15 +305,9 @@ class SearchInput {
////////
// . end the section we hash in SearchInput::makeKey()
// . we also hash displayMetas, TopicGroups and Query into the key
// . we also hash displayMetas and Query into the key
int32_t m_END_HASH;
//////
//
// STUFF NOT REALLY USED NOW
//
//////
// a marker for SearchInput::test()
int32_t m_END_TEST;

File diff suppressed because it is too large

@ -7,8 +7,6 @@
#include "Bits.h"
#include "Words.h"
#include "Rdb.h"
//#include "DiskPageCache.h"
// KEY:
// ssssssss ssssssss ssssssss ssssssss s = 48 bit site hash
@ -21,7 +19,7 @@
// NNNNNNNN NNNNNNNN NNNNNNNN NNNNNNNN N = SectionVote::m_numSampled
// h: hash value. typically the lower 32 bits of the
// Section::m_sentenceContentHash64 or the Section::m_contentHash64 vars. we
// Section::m_contentHash64 vars. we
// do not need the full 64 bits because we have the 48 bit site hash included
// to reduce collisions substantially.
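A one-liner makes the truncation argument above concrete; this is illustrative only, the real key packing lives in the sectiondb key code:

#include <cstdint>

// keep only the low 32 bits of the 64-bit content hash; the 48-bit
// site hash already in the key makes collisions on these 32 bits
// rare enough in practice
static uint32_t truncatedSectionHash ( uint64_t contentHash64 ) {
	return (uint32_t)(contentHash64 & 0xffffffffULL);
}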
@ -33,80 +31,43 @@
// . these are descriptive flags, they are computed when Sections is set
// . SEC_NOTEXT sections do not vote, i.e. they are not stored in Sectiondb
#define SEC_NOTEXT 0x0001 // implies section has no alnum words
// . Weights.cpp zeroes out the weights for these types of sections
// . is section delimited by the <script> tag, <marquee> tag, etc.
//#define SEC_UNUSED 0x0002
//#define SEC_UNUSED 0x0004
#define SEC_SCRIPT 0x0008
#define SEC_STYLE 0x0010
#define SEC_SELECT 0x0020
#define SEC_MARQUEE 0x0040
#define SEC_CONTAINER 0x0080
// . in title/header. for gigabits in XmlDoc.cpp
// . is section delimited by <title> or <hN> tags?
#define SEC_IN_TITLE 0x0100
#define SEC_IN_HEADER 0x0200
// used by Events.cpp to indicate if section contains a TimeOfDay ("7 p.m.")
#define SEC_HAS_TOD 0x0400
//#define SEC_UNUSED 0x0040
//#define SEC_UNUSED 0x0080
#define SEC_IN_TITLE 0x0100 // in title
#define SEC_IN_HEADER 0x0200 // in <hN> tags
//#define SEC_UNUSED 0x0400
#define SEC_HIDDEN 0x0800 // <div style="display: none">
#define SEC_IN_TABLE 0x1000
//#define SEC_UNUSED 0x1000
#define SEC_FAKE 0x2000 // <hr>/<br>/sentence based faux section
#define SEC_NOSCRIPT 0x4000
//#define SEC_UNUSED 0x8000
#define SEC_HEADING_CONTAINER 0x8000
#define SEC_MENU 0x010000
#define SEC_LINK_TEXT 0x020000
#define SEC_MENU_HEADER 0x040000
#define SEC_INPUT_HEADER 0x080000
#define SEC_INPUT_FOOTER 0x100000
#define SEC_HEADING 0x200000
// reasons why a section is not an event
#define SEC_UNBALANCED 0x00400000 // interlaced section/tags
#define SEC_OPEN_ENDED 0x00800000 // no closing tag found
#define SEC_MENU 0x00010000
#define SEC_LINK_TEXT 0x00020000
#define SEC_MENU_HEADER 0x00040000
#define SEC_INPUT_HEADER 0x00080000
#define SEC_INPUT_FOOTER 0x00100000
#define SEC_HEADING 0x00200000
//#define SEC_UNUSED 0x00400000
//#define SEC_UNUSED 0x00800000
#define SEC_SENTENCE 0x01000000 // made by a sentence?
#define SEC_PLAIN_TEXT 0x02000000
//#define SEC_UNUSED_1 0x04000000
//#define SEC_UNUSED 0x04000000
//#define SEC_UNUSED 0x00008000000LL
//#define SEC_UNUSED 0x00010000000LL
//#define SEC_UNUSED 0x00020000000LL
//#define SEC_UNUSED 0x00040000000LL
//#define SEC_UNUSED 0x00080000000LL
// . this is set in Dates.cpp and used by Dates.cpp and Events.cpp
// . we identify max tod sections and make it so brothers in a list of two
// or more such sections cannot telescope to each other's dates, and so we
// do not share each other's event descriptions. fixes abqtango.com
// and salsapower.com from grabbing event description text from "failed"
// event sections that are brothers to successful event sections.
#define SEC_TOD_EVENT 0x00008000000LL
#define SEC_NIXED_HEADING_CONTAINER 0x00010000000LL
#define SEC_SECOND_TITLE 0x00020000000LL
#define SEC_SPLIT_SENT 0x00040000000LL
#define SEC_HAS_REGISTRATION 0x00080000000LL
#define SEC_HAS_PARKING 0x00100000000LL
//#define SEC_UNUSED 0x00100000000LL
#define SEC_MENU_SENTENCE 0x00200000000LL
// fix for folkmads.org:
#define SEC_HR_CONTAINER 0x00400000000LL
#define SEC_HAS_DOM 0x00800000000LL
#define SEC_HAS_DOW 0x01000000000LL
#define SEC_EVENT_BROTHER 0x02000000000LL
#define SEC_DATE_LIST_CONTAINER 0x04000000000LL
#define SEC_TAIL_CRAP 0x08000000000LL
#define SEC_CONTROL 0x0000010000000000LL
#define SEC_STRIKE 0x0000020000000000LL
#define SEC_STRIKE2 0x0000040000000000LL
#define SEC_HAS_MONTH 0x0000080000000000LL
#define SEC_IGNOREEVENTBROTHER 0x0000100000000000LL
#define SEC_HASEVENTDOMDOW 0x0000200000000000LL
#define SEC_STOREHOURSCONTAINER 0x0000400000000000LL
#define SEC_PUBDATECONTAINER 0x0000800000000000LL
#define SEC_TABLE_HEADER 0x0001000000000000LL
#define SEC_HASDATEHEADERROW 0x0002000000000000LL
#define SEC_HASDATEHEADERCOL 0x0004000000000000LL
#define SEC_MULTIDIMS 0x0008000000000000LL
#define SEC_HASHXPATH 0x0010000000000000LL
//#define SEC_UNUSED 0x00400000000LL
//#define SEC_UNUSED 0x00800000000LL
// . some random-y numbers for Section::m_baseHash
// . used by splitSection() function
@ -114,174 +75,10 @@
#define BH_SENTENCE 4590649
#define BH_IMPLIED 95468323
// values for Section::m_sentFlags (sentence flags)
#define SENT_HAS_COLON 0x00000001
//#define SENT_UNUSED_1 0x00000002
#define SENT_BAD_FIRST_WORD 0x00000004
#define SENT_MIXED_CASE 0x00000008
#define SENT_POWERED_BY 0x00000010
#define SENT_MULT_EVENTS 0x00000020
#define SENT_PAGE_REPEAT 0x00000040
#define SENT_NUMBERS_ONLY 0x00000080
//#define SENT_UNUSED_6 0x00000100
#define SENT_SECOND_TITLE 0x00000200
#define SENT_IS_DATE 0x00000400
#define SENT_LAST_STOP 0x00000800
#define SENT_NUMBER_START 0x00001000
#define SENT_TAG_INDICATOR 0x00002000
#define SENT_PRETTY 0x00004000
#define SENT_IN_HEADER 0x00008000
#define SENT_MIXED_CASE_STRICT 0x00010000
#define SENT_IN_LIST 0x00020000
#define SENT_COLON_ENDS 0x00040000
//#define SENT_UNUSED_7 0x00080000
#define SENT_IN_TITLEY_TAG 0x00100000
#define SENT_CITY_STATE 0x00200000
#define SENT_PRICEY 0x00400000
#define SENT_PERIOD_ENDS 0x00800000
#define SENT_HAS_PHONE 0x01000000
#define SENT_IN_MENU 0x02000000
#define SENT_MIXED_TEXT 0x04000000
#define SENT_TAGS 0x08000000
#define SENT_INTITLEFIELD 0x10000000
#define SENT_STRANGE_PUNCT 0x20000000
#define SENT_INPLACEFIELD 0x40000000
#define SENT_INNONTITLEFIELD 0x80000000
//#define SENT_UNUSED_2 0x0000000100000000LL
#define SENT_HASNOSPACE 0x0000000200000000LL
#define SENT_IS_BYLINE 0x0000000400000000LL
#define SENT_NON_TITLE_FIELD 0x0000000800000000LL
#define SENT_TITLE_FIELD 0x0000001000000000LL
#define SENT_UNIQUE_TAG_HASH 0x0000002000000000LL
#define SENT_AFTER_SENTENCE 0x0000004000000000LL
#define SENT_WORD_SANDWICH 0x0000008000000000LL
//#define SENT_UNUSED_3 0x0000010000000000LL
#define SENT_NUKE_FIRST_WORD 0x0000020000000000LL
#define SENT_FIELD_NAME 0x0000040000000000LL
#define SENT_PERIOD_ENDS_HARD 0x0000080000000000LL
#define SENT_PARENS_START 0x0000100000000000LL
#define SENT_IN_MENU_HEADER 0x0000200000000000LL
#define SENT_IN_TRUMBA_TITLE 0x0000400000000000LL
//#define SENT_UNUSED_8 0x0000800000000000LL
#define SENT_FORMTABLE_FIELD 0x0001000000000000LL
#define SENT_FORMTABLE_VALUE 0x0002000000000000LL
#define SENT_IN_TAG 0x0004000000000000LL
#define SENT_AFTER_SPACER 0x0008000000000000LL
#define SENT_BEFORE_SPACER 0x0010000000000000LL
#define SENT_OBVIOUS_PLACE 0x0020000000000000LL
//#define SENT_UNUSED_4 0x0040000000000000LL
#define SENT_HASSOMEEVENTSDATE 0x0080000000000000LL
#define SENT_AFTER_COLON 0x0100000000000000LL
#define SENT_HASTITLEWORDS 0x0200000000000000LL
//#define SENT_UNUSED_5 0x0400000000000000LL
//#define SENT_UNUSED_9 0x0800000000000000LL
#define SENT_IN_BIG_LIST 0x1000000000000000LL
#define SENT_BADEVENTSTART 0x2000000000000000LL
#define SENT_MENU_SENTENCE 0x4000000000000000LL
#define SENT_HAS_PRICE 0x8000000000000000ULL
#define NOINDEXFLAGS (SEC_SCRIPT|SEC_STYLE|SEC_SELECT)
// the section type (bit flag vector for SEC_*) is currently 32 bits
typedef int64_t sec_t;
//typedef int64_t titleflags_t;
typedef int64_t sentflags_t;
typedef uint32_t turkbits_t;
bool isPlaceIndicator ( int64_t *widp ) ;
char *getSentBitLabel ( sentflags_t sf ) ;
sentflags_t getMixedCaseFlags ( class Words *words ,
wbit_t *bits ,
int32_t senta ,
int32_t sentb ,
int32_t niceness ) ;
int32_t hasTitleWords ( sentflags_t sflags ,
int32_t senta,
int32_t sentb,
int32_t alnumCount,
class Bits *bits ,
class Words *words ,
bool useAsterisk ,
int32_t niceness );
class Sectiondb {
public:
// reset rdb
void reset();
bool verify ( char *coll );
bool addColl ( char *coll, bool doVerify = true );
// init m_rdb
bool init ();
// init secondary/rebuild sectiondb
bool init2 ( int32_t treeMem ) ;
Rdb *getRdb() { return &m_rdb; }
uint64_t getSiteHash ( void *k ) {
return ((*(uint64_t *)(((char *)k)+8))) >> 16;};
uint32_t getSectionHash ( void *k ) {
return (*(uint32_t *)(((char *)k)+6)); }
int64_t getDocId ( void *k ) {
return ((*(uint64_t *)k) >> 2) & DOCID_MASK; }
uint8_t getSectionType ( void *k ) {
return ((unsigned char *)k)[5]; };
// holds binary format title entries
Rdb m_rdb;
//DiskPageCache *getDiskPageCache ( ) { return &m_pc; };
//DiskPageCache m_pc;
};
extern class Sectiondb g_sectiondb;
extern class Sectiondb g_sectiondb2;
// this is only needed for sections, not facets in general, I don't think.
// facets has the whole QueryTerm::m_facetHashTable array with more info
//
// . for gbfacet:gbxpathsite1234567 posdb query stats compilation to
// show how many pages duplicate your section's content on your site
// at the same xpath. the hash of the innerHTML for that xpath is
// embedded into the posdb key like a number in a number key, so the
// wordpos bits etc are sacrificed to hold that 32-bit number.
// . used by XmlDoc::getSectionsWithDupStats() for display in
// XmlDoc::printRainbowSections()
// . these are in QueryTerm::m_facetStats and computed from
// QueryTerm::m_facetHashTable
class SectionStats {
public:
SectionStats() { reset(); }
void reset ( ) {
m_totalMatches = 0; // posdb key "val" matches ours
m_totalEntries = 0; // total posdb keys
m_numUniqueVals = 0; // # of unique "vals"
m_totalDocIds = 0;
};
// # of times xpath innerhtml matched ours. 1 count per docid max.
int64_t m_totalMatches;
// # of times this xpath occurred. doc can have multiple times.
int64_t m_totalEntries;
// # of unique vals this xpath had. doc can have multiple counts.
int64_t m_numUniqueVals;
int64_t m_totalDocIds;
};
class Section {
public:
@ -295,9 +92,6 @@ public:
class Section *m_next;
class Section *m_prev;
// used by Events.cpp to count # of timeofdays in section
//class Event *m_event;
// . if we are an element in a list, what is the list container section
// . a containing section is a section containing MULTIPLE
// smaller sections
@ -314,24 +108,6 @@ public:
// are a sentence section then this points to itself.
class Section *m_sentenceSection;
// . set in XmlDoc::getSectionsWithDupStats()
// . voting info for this section over all indexed pages from this site
SectionStats m_stats;
int32_t m_votesForDup;
int32_t m_votesForNotDup;
float getSectiondbVoteFactor ( ) {
// now punish if repeated on many page on the site
float a = (float)m_votesForNotDup;
float b = (float)m_votesForDup;
if ( a == 0 && b == 0 ) return 1.0;
// use that as a modifier
float factor = a / ( a + b);
// minimum so we do not completely nuke the title, I guess
if ( factor < .10 ) factor = .10;
return factor;
};
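// Worked example (added for illustration): 3 not-dup votes and 7 dup
// votes give 3/(3+7) = 0.30; 0 and 20 would give 0.0 but is floored
// to 0.10; no votes at all returns the neutral factor 1.0.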
// position of the first and last alnum word contained directly OR
// indirectly in this section. use -1 if no text contained...
int32_t m_firstWordPos;
@ -348,32 +124,11 @@ public:
int32_t m_senta;
int32_t m_sentb;
// each sentence is numbered
//int32_t m_sentNum;
class Section *m_prevSent;
class Section *m_nextSent;
// . if we are in a table, what position are we
// . starts at 1 and goes upwards
// . we start it at 1 so that way we know that 0 is invalid!
int32_t m_rowNum;
int32_t m_colNum;
class Section *m_tableSec;
class Section *m_headColSection;
class Section *m_headRowSection;
class Section *m_leftCell;
class Section *m_aboveCell;
// hash of this tag's baseHash and all its parents baseHashes combined
uint32_t m_tagHash;
// like above but for turk voting. includes hash of the class tag attr
// from m_turkBaseHash, whereas m_tagHash uses m_baseHash of parent.
uint32_t m_turkTagHash32;
// for debug output display of color coded nested sections
uint32_t m_colorHash;
@ -384,35 +139,13 @@ public:
// div and span tags, etc. to make them unique
uint32_t m_baseHash;
// just hash the "class=" value along with the tagid
uint32_t m_turkBaseHash;
// kinda like m_baseHash but for xml tags and only hashes the
// tag name and none of the fields
uint32_t m_xmlNameHash;
// these deal with enumerated tags and are used by Events.cpp
int32_t m_occNum;
int32_t m_numOccurences;
// used by XmlDoc.cpp to set a topological distance
int32_t m_topDist;
// hash of all the alnum words DIRECTLY in this section
uint64_t m_contentHash64;
uint64_t m_sentenceContentHash64;
// . used by the SEC_EVENTBROTHER algo in Dates.cpp to detect
// [more] or [details] links that indicate distinct items
// . sometimes the "(more)" link is combined into the last sentence
// so we have to treat the last link kinda like its own sentence too!
uint32_t m_lastLinkContentHash32;
// hash of all sentences contained indirectly or directly.
// uses m_sentenceContentHash64 (for sentences)
uint64_t m_indirectSentHash64;
// . range of words in Words class we encompass
// . m_wordStart and m_wordEnd are the tag word #'s
// . ACTUALLY it is a half-closed interval [a,b) like all else
@ -422,45 +155,27 @@ public:
int32_t m_a;//wordStart;
int32_t m_b;//wordEnd;
// for event titles and descriptions
sentflags_t m_sentFlags;
// . # alnum words only in this and only this section
// . if we have none, we are SEC_NOTEXT
int32_t m_exclusive;
// our depth. # of tags in the hash
int32_t m_depth;
// container for the #define'd SEC_* values above
sec_t m_flags;
// used to mark it in Dates.cpp like a breadcrumb trail
int32_t m_mark;
// Events.cpp assigns a date to each section
int32_t m_firstDate;
char m_used;
// used in Sections::splitSections() function
int32_t m_processedHash;
int32_t m_gbFrameNum;
// do we contain section "arg"?
bool contains ( class Section *arg ) {
return ( m_a <= arg->m_a && m_b >= arg->m_b ); };
bool contains( class Section *arg ) {
return ( m_a <= arg->m_a && m_b >= arg->m_b );
}
// do we contain section "arg"?
bool strictlyContains ( class Section *arg ) {
if ( m_a < arg->m_a && m_b >= arg->m_b ) return true;
if ( m_a <= arg->m_a && m_b > arg->m_b ) return true;
return false;
};
// does this section contain the word #a?
bool contains2 ( int32_t a ) { return ( m_a <= a && m_b > a ); };
}
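// Note (added): these use the half-closed convention [m_a,m_b), so
// contains2() tests m_b > a and a section never claims the word at
// its own end bound.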
bool isVirtualSection ( ) ;
};
@ -474,84 +189,48 @@ public:
#define FMT_JSON 3
class Sections {
public:
Sections();
~Sections();
public:
Sections ( ) ;
void reset() ;
~Sections ( ) ;
void reset();
// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error
// . sets m_sections[] array, 1-1 with words array "w"
bool set(class Words *w, class Phrases *phrases, class Bits *bits, class Url *url,
bool set(class Words *w, class Bits *bits, class Url *url,
int64_t siteHash64, char *coll, int32_t niceness, uint8_t contentType );
bool addVotes(class SectionVotingTable *nsvt, uint32_t tagPairHash );
bool verifySections ( ) ;
int32_t getStoredSize ( ) ;
static int32_t getStoredSize ( char *p ) ;
int32_t serialize ( char *p ) ;
bool growSections ( );
bool getSectiondbList ( );
bool gotSectiondbList ( bool *needsRecall ) ;
void setNextBrotherPtrs ( bool setContainer ) ;
// this is used by Events.cpp Section::m_nextSent
void setNextSentPtrs();
bool print ( SafeBuf *sbuf ,
class HashTableX *pt ,
class HashTableX *et ,
class HashTableX *st ,
class HashTableX *at ,
class HashTableX *tt ,
//class HashTableX *rt ,
class HashTableX *priceTable ) ;
bool print( SafeBuf *sbuf, class HashTableX *pt, class HashTableX *et, class HashTableX *st,
class HashTableX *at, class HashTableX *tt, class HashTableX *priceTable );
void printFlags ( class SafeBuf *sbuf , class Section *sn ) ;
bool printVotingInfoInJSON ( SafeBuf *sb ) ;
bool print2(SafeBuf *sbuf, int32_t hiPos, int32_t *wposVec, char *densityVec,
char *wordSpamVec, char *fragVec, char format = FMT_HTML );
bool print2 ( SafeBuf *sbuf ,
int32_t hiPos,
int32_t *wposVec,
char *densityVec,
char *diversityVec,
char *wordSpamVec,
char *fragVec,
char format = FMT_HTML );
bool printSectionDiv ( class Section *sk , char format = FMT_HTML );
bool printSectionDiv ( Section *sk , char format = FMT_HTML );
class SafeBuf *m_sbuf;
char *getSectionsReply ( int32_t *size );
char *getSectionsVotes ( int32_t *size );
bool isHardSection ( class Section *sn );
bool isHardSection ( Section *sn );
bool setMenus ( );
bool setFormTableBits ( ) ;
bool setTableRowsAndCols ( class Section *tableSec ) ;
bool setTableHeaderBits ( class Section *table );
bool setTableScanPtrs ( class Section *ts ) ;
void setHeader ( int32_t r , class Section *first , sec_t flag ) ;
bool setHeadingBit ( ) ;
void setTagHashes ( ) ;
bool setRegistrationBits ( ) ;
bool m_setRegBits ;
bool m_alnumPosValid;
// save it
class Words *m_words ;
class Bits *m_bits ;
@ -564,39 +243,15 @@ class Sections {
int32_t *m_wposVec;
char *m_densityVec;
char *m_diversityVec;
char *m_wordSpamVec;
char *m_fragVec;
// url ends in .rss or .xml ?
bool m_isRSSExt;
bool m_isFacebook ;
bool m_isEventBrite ;
bool m_isStubHub ;
Msg0 m_msg0;
key128_t m_startKey;
int32_t m_recall;
IndexList m_list;
int64_t m_termId;
int32_t m_numLineWaiters;
bool m_waitInLine;
int32_t m_articleStartWord;
int32_t m_articleEndWord;
bool m_hadArticle;
int32_t m_numInvalids;
int32_t m_totalSiteVoters;
int32_t m_numAlnumWordsInArticle;
// word #'s (-1 means invalid)
int32_t m_titleStart;
int32_t m_titleEnd;
int32_t m_titleStartAlnumPos;
int32_t m_numVotes;
// these are 1-1 with the Words::m_words[] array
class Section **m_sectionPtrs;
@ -604,25 +259,8 @@ class Sections {
// save this too
int32_t m_nw ;
// new stuff
HashTableX m_ot;
HashTableX m_vt;
// for caching partition scores
HashTableX m_ct;
// buf for serializing m_osvt into
char *m_buf;
int32_t m_bufSize;
// buf for serializing m_nsvt into
char *m_buf2;
int32_t m_bufSize2;
// allocate m_sections[] buffer
class Section *m_sections;
//int32_t m_sectionsBufSize;
int32_t m_numSections;
int32_t m_maxNumSections;
@ -633,71 +271,25 @@ class Sections {
// see what section a word is in.
SafeBuf m_sectionPtrBuf;
int32_t m_numSentenceSections;
bool m_isTestColl;
// assume no malloc
bool m_needsFree;
char m_localBuf [ SECTIONS_LOCALBUFSIZE ];
// set a flag
bool m_badHtml;
int64_t *m_wids;
int64_t *m_pids;
int32_t *m_wlens;
char **m_wptrs;
nodeid_t *m_tids;
// the new way
bool addImpliedSections ( );
bool setSentFlagsPart1 ( );
bool setSentFlagsPart2 ( );
sentflags_t getSentEventEndingOrBeginningFlags ( sentflags_t sflags ,
int32_t senta ,
int32_t sentb ,
int32_t alnumCount) ;
void setSentPrettyFlag ( class Section *si ) ;
int32_t m_hiPos;
bool m_sentFlagsAreSet;
bool m_addedImpliedSections;
int32_t addImpliedSections3 ();
int32_t getDelimScore ( class Section *bro,
char method,
class Section *delim ,
class Partition *part );
int32_t getDelimHash ( char method , class Section *bro ) ;
bool addImpliedLists ( ) ;
int32_t getDelimScore2 ( class Section *bro,
char method,
class Section *delim ,
int32_t *a ,
int32_t *b );
bool hashSentBits ( class Section *sx ,
class HashTableX *vht ,
class Section *container ,
uint32_t mod ,
class HashTableX *labelTable,
char *modLabel );
bool hashSentPairs ( Section *sx ,
Section *sb ,
HashTableX *vht ,
Section *container ,
HashTableX *labelTable );
bool addSentenceSections ( ) ;
class Section *insertSubSection ( int32_t a, int32_t b, int32_t newBaseHash ) ;
int32_t splitSectionsByTag ( nodeid_t tagid ) ;
bool splitSections ( char *delimeter , int32_t dh );
class Section *m_rootSection; // the first section, aka m_firstSection
class Section *m_lastSection;
@ -706,72 +298,8 @@ class Sections {
// kinda like m_rootSection, the first sentence section that occurs
// in the document, is NULL iff no sentences in document
class Section *m_firstSent;
class Section *m_lastSent;
bool containsTagId ( class Section *si, nodeid_t tagId ) ;
bool isTagDelimeter ( class Section *si , nodeid_t tagId ) ;
bool isDelimeter ( int32_t i , char *delimeter , int32_t *delimEnd ) {
// . HACK: special case when delimeter is 0x01
// . that means we are back-to-back br tags
if ( delimeter == (char *)0x01 ) {
// must be a br tag
if ( m_tids[i] != TAG_BR ) return false;
// assume that
int32_t k = i + 1;
// bad if end
if ( k >= m_nw ) return false;
// bad if a wid
if ( m_wids[k] ) return false;
// inc if punct
if ( ! m_tids[k] ) k++;
// bad if end
if ( k >= m_nw ) return false;
// must be another br tag
if ( m_tids[k] != TAG_BR ) return false;
// mark as end i guess
*delimEnd = k + 1;
return true;
}
// no word is a delimeter
if ( m_wids[i] ) return false;
// tags "<hr" and "<br"
if ( m_wptrs[i][0] == delimeter[0] &&
m_wptrs[i][1] == delimeter[1] &&
m_wptrs[i][2] == delimeter[2] )
return true;
// if no match above, forget it
if ( m_tids[i] ) return false;
// otherwise, we are a punctuation "word"
// the bullet is 3 bytes long
if ( m_wlens[i] < 3 ) return false;
// if not a bullet, skip it (&bull)
char *p = m_wptrs[i];
char *pend = p + m_wlens[i];
for ( ; p < pend ; p++ ) {
if ( p[0] != delimeter[0] ) continue;
if ( p[1] != delimeter[1] ) continue;
if ( p[2] != delimeter[2] ) continue;
return true;
}
return false;
};
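// Worked example (added) for the 0x01 hack above: with
// m_wptrs[i]="<br>", m_wptrs[i+1]=" \n " (punct) and
// m_wptrs[i+2]="<br>", the call accepts i as a delimiter and sets
// *delimEnd = i+3, so back-to-back br tags separated only by
// whitespace count as one split point.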
};
// convert sectionType to a string
char *getSectionTypeAsStr ( int32_t sectionType );
// hash of the last 3 parent tagids
//uint32_t getSectionContentTagHash3 ( class Section *sn ) ;
// only allow this many urls per site to add sectiondb info
#define MAX_SITE_VOTERS 32
// . the key in sectiondb is basically the Section::m_tagHash
// (with a docId) and the data portion of the Rdb record is this SectionVote
// . the Sections::m_nsvt and m_osvt hash tables contain SectionVotes
@ -790,99 +318,4 @@ public:
float m_numSampled;
};
class SectionVotingTable {
public:
SectionVotingTable ( ) ;
//bool set ( Sections *sections , class RdbList *sectiondbList );
void reset () { m_svt.reset(); }
bool print ( SafeBuf *sbuf , char *title ) ;
// stock table from a sectiondb rdblist
bool addListOfVotes ( RdbList *list,
key128_t **lastKey ,
int64_t docId ,
int32_t niceness ) ;
// index our sections as flag|tagHash pairs using a termId which
// is basically our sitehash. this allows us to "vote" on what
// sections are static, dynamic, "texty" by indexing our votes into
// datedb.
bool hash ( int64_t docId ,
class HashTableX *dt ,
uint64_t siteHash64 ,
int32_t niceness ) ;
bool addVote2 ( int32_t tagHash, int32_t sectionType , float score ) {
return addVote3 ( tagHash,sectionType,score,1);};
bool addVote3 ( //class HashTableX *ttt ,
int32_t tagHash ,
int32_t sectionType ,
float score ,
float numSampled ,
bool hackFix = false ) ;
// return -1.0 if no voters!
float getScore ( Section *sn , int32_t sectionType ) {
if ( ! sn ) return -1.0;
return getScore ( sn->m_tagHash , sectionType ); };
float getScore ( int32_t tagHash , int32_t sectionType ) ;
float getNumSampled ( Section *sn , int32_t sectionType ) {
if ( ! sn ) return 0.0;
return getNumSampled ( sn->m_tagHash , sectionType ); };
float getNumSampled ( int32_t tagHash , int32_t sectionType ) ;
int32_t getNumVotes ( ) { return m_svt.getNumSlotsUsed(); };
bool init ( int32_t numSlots , char *name , int32_t niceness ) {
return m_svt.set(8,sizeof(SectionVote),numSlots,
NULL,0,false,niceness,name); };
HashTableX m_svt;
int32_t m_totalSiteVoters;
//int32_t m_totalSimilarLayouts;
};
//
// BEGIN SECTION TYPES
//
// . these are the core section types
// . these are not to be confused with the section bit flags below
// . we put these into sectiondb in the form of a SectionVote
// . the SectionVote is the data portion of the rdb record, and the key
// of the rdb record contains the url site hash and the section m_tagHash
// . in this way, a page can vote on what type of section a tag hash describes
//#define SV_TEXTY 1 // section has mostly non-hypertext words
#define SV_CLOCK 2 // DateParse2.cpp. section contains a clock
#define SV_EURDATEFMT 3 // DateParse2.cpp. contains european date fmt
#define SV_EVENT 4 // used in Events.cpp to indicate event container
#define SV_ADDRESS 5 // used in Events.cpp to indicate address container
// . HACK: the "date" is not the enum tag hash, but is the tagPairHash for this
// . every doc has just one of these describing the entire layout of the page
// . basically looking for these is same as doing a gbtaghash: query
#define SV_TAGPAIRHASH 20
// . HACK: the "date" is not the enum tag hash, but is the contentHash!
// . this allows us to detect a duplicate section even though the layout
// of the web page is not quite the same, but is from the same site
#define SV_TAGCONTENTHASH 21
// now Dates.cpp sets these too
#define SV_FUTURE_DATE 24
#define SV_PAST_DATE 25
#define SV_CURRENT_DATE 26
#define SV_SITE_VOTER 29
#define SV_TURKTAGHASH 30
#endif

@ -697,8 +697,6 @@ class SpiderRequest {
unsigned m_reserved3n :1;
unsigned m_reserved3k :1;
unsigned m_reserved3e :1;
//unsigned m_matchesUrlCrawlPattern :1;
//unsigned m_matchesUrlProcessPattern:1;
unsigned m_reserved3f :1;
unsigned m_reserved3g :1;
unsigned m_siteNumInlinksValid :1;
@ -711,30 +709,16 @@ class SpiderRequest {
// want the url's to have their links spidered. default is to make
// this 0 and to not avoid spidering the links.
unsigned m_avoidSpiderLinks:1;
// for identifying address heavy sites...
//unsigned m_tagYellowPages:1;
// when indexing urls for dmoz, i.e. the urls outputted from
// 'dmozparse urldump -s' we need to index them even if there
// was an ETCPTIMEDOUT because we have to have indexed the same
// urls that dmoz has in it in order to be identical to dmoz.
unsigned m_ignoreExternalErrors:1;
// called XmlDoc::set4() from PageSubmit.cpp?
//unsigned m_isPageSubmit:1;
//
// INTERNAL USE ONLY
//
// are we in the m_orderTree/m_doleTables/m_ipTree
//unsigned m_inOrderTree:1;
// are we doled out?
//unsigned m_doled:1;
// are we a re-add of a spiderrequest already in spiderdb added
// from xmldoc.cpp when done spidering so that the spider request
// gets back in the cache quickly?
//unsigned m_readd:1;
// . what url filter num do we match in the url filters table?
// . determines our spider priority and wait time
int16_t m_ufn;
@ -772,14 +756,6 @@ class SpiderRequest {
int32_t getRecSize () { return m_dataSize + 4 + sizeof(key128_t); }
// how much buf will we need to serialize ourselves?
//int32_t getRecSize () {
// //return m_dataSize + 4 + sizeof(key128_t); }
// return (m_url - (char *)this) + gbstrlen(m_url) + 1
// // subtract m_key and m_dataSize
// - sizeof(key_t) - 4 ;
//};
int32_t getUrlLen() { return m_dataSize -
// subtract the \0
((char *)m_url-(char *)&m_firstIp) - 1;};
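// Worked example (added, with made-up sizes): if m_dataSize is 120,
// the fixed fields from m_firstIp up to m_url span 60 bytes, and the
// url is NUL-terminated, getUrlLen() = 120 - 60 - 1 = 59.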

@ -88,23 +88,6 @@ static Label s_labels[] = {
// eventually
{GRAPH_QUANTITY,-1,"docs_indexed", .1,"%.0f docs" , -1, 0x00cc0099,"docs indexed" }
//{ "termlist_intersect",0x0000ff00},
//{ "termlist_intersect_soft",0x00008000}, // rat=0
//{ "transmit_data_nice",0x00aa00aa },
//{ "transmit_data", 0x00ff00ff },
//{ "zak_ref_1a", 0x00ccffcc },
//{ "zak_ref_1b",0x00fffacd },
//{ "get_summary", 0x0000ff},
//{ "get_summary_nice", 0x0000b0},
//{ "get_gigabits",0x00d1e1ff },
//{ "get_termlists_nice", 0x00aaaa00},
//{ "get_termlists",0x00ffff00 },
//{ "get_all_summaries", 0x008220ff},
//{ "rdb_list_merge",0x0000ffff },
//{ "titlerec_compress",0x00ffffff },
//{ "titlerec_uncompress", 0x00ffffff} ,
//{ "parm_change",0xffc0c0} // pink?
};
void drawLine3 ( SafeBuf &sb ,

@ -2014,23 +2014,6 @@ bool isQueryStopWord ( char *s , int32_t len , int64_t h , int32_t langId ) {
s_queryStopWords2[langEnglish] = s_queryStopWordsEnglish;
s_queryStopWords2[langGerman ] = s_queryStopWordsGerman;
// set up the hash table
// if ( ! s_queryStopWordTable.set ( sizeof(s_queryStopWords) * 2 ) )
// return log(LOG_INIT,"query: Could not init query "
// "stop words table.");
// // now add in all the stop words
// int32_t n = (int32_t)sizeof(s_queryStopWords)/ sizeof(char *);
// for ( int32_t i = 0 ; i < n ; i++ ) {
// char *sw = s_queryStopWords[i];
// int32_t swlen = gbstrlen ( sw );
// int64_t swh = hash64Lower ( sw , swlen );
// s_queryStopWordTable.addTerm (swh,i+1,i+1,true);
// // . add w/o accent marks too!
// // . skip "für" though because fur is an eng. word
// //if ( *sw=='f' && *(sw+1)=='ü' &&
// // *(sw+2)=='r' && swlen == 3 ) continue;
// //swh = hash64AsciiLower ( sw , swlen );
// //s_queryStopWordTable.addTerm (swh,i+1,i+1,true);
// }
for ( int32_t i = 0 ; i <= MAXLANGID ; i++ ) {
HashTableX *ht = &s_queryStopWordTables[i];
char **words = s_queryStopWords2[i];
@ -3844,9 +3827,6 @@ static char *s_commonWords[] = {
static HashTableX s_commonWordTable;
static bool s_commonWordsInitialized = false;
static HashTableX s_commonQueryWordTable;
static bool s_commonQueryWordsInitialized = false;
// for Process.cpp::resetAll() to call when exiting to free all mem
void resetStopWordTables() {
@ -3854,7 +3834,6 @@ void resetStopWordTables() {
for ( int i = 0 ; i <= MAXLANGID ; i++ )
s_queryStopWordTables[i].reset();
s_commonWordTable.reset();
s_commonQueryWordTable.reset();
}
// used by Msg24.cpp for gigabits generation
@ -3896,467 +3875,10 @@ int32_t isCommonWord ( int64_t h ) {
return s_commonWordTable.getScore ( &h );
}
static char *s_verbs[] = {
"runs",
"run",
"go",
"goes",
"going"
};
static HashTableX s_verbTable;
static bool s_verbsInitialized = false;
// used by Msg24.cpp for gigabits generation
bool isVerb ( int64_t *hp ) {
// include a bunch of foreign prepositions so they don't get required
// by the bitScores in IndexTable.cpp
if ( ! s_verbsInitialized ) {
// set up the hash table
if ( ! s_verbTable.set (8,0,sizeof(s_verbs)*2,
NULL,0,false,0,"verbs") )
return log(LOG_INIT,
"query: Could not init verbs table.");
// now add in all the stop words
int32_t n = (int32_t)sizeof(s_verbs)/ sizeof(char *);
for ( int32_t i = 0 ; i < n ; i++ ) {
char *sw = s_verbs[i];
int32_t swlen = gbstrlen ( sw );
// use the same algo that Words.cpp computeWordIds does
int64_t swh = hash64Lower_utf8 ( sw , swlen );
if ( ! s_verbTable.addKey ( &swh ) ) {
char *xx=NULL;*xx=0; }
}
s_verbsInitialized = true;
}
// get from table
return (bool)s_verbTable.isInTable ( hp );
}
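// Usage sketch (added): callers are expected to pass the same word id
// Words.cpp computes, e.g.
//   int64_t h = hash64Lower_utf8 ( "goes" , 4 );
//   bool isV = isVerb ( &h ); // true for any entry in s_verbs[]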
void resetStopWords ( ) {
s_stopWordTable.reset();
for ( int i = 0 ; i <= MAXLANGID ; i++ )
s_queryStopWordTables[i].reset();
s_commonWordTable.reset();
s_verbTable.reset();
s_commonQueryWordTable.reset();
}
static char *s_commonQueryWords[] = {
"to",
"and",
"ands",
"anding",
"anded",
"be", // "be fine" for fatboyshouseofbbq.com matching queries
"thereof",
"of",
"the",
"this",
"between",
"onto",
"too",
"every",
"always",
"more", // fix "more more" bringing up whitehouse.gov
"of",
"the",
"this",
"one",
"two",
"three",
"four",
"01",
"02",
"03",
"04",
"05",
"06",
"07",
"08",
"09",
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"17",
"18",
"19",
"20",
"21",
"22",
"23",
"24",
"25",
"26",
"27",
"28",
"29",
"30",
"31",
"i","ii","iii","iv","vi","vii","viii","ix","x","xi",
"xii","xiii","xiv","xv","xvi","xvii","xviii","xix",
"xx","xxi","xxii","xxiii","xxiv","xxv","xxvi","xxvii",
"xxviii","xxix","xxx","xxxi",
"january",
"february",
"march",
"april",
"may",
"june",
"july",
"august",
"september",
"october",
"november",
"december",
"jan",
"feb",
"mar",
"apr",
"may",
"jun",
"jul",
"aug",
"sep",
"oct",
"nov",
"dec",
"2010",
"2011",
"2012",
"2013",
"2014",
"2015",
"a",
"over", // fix 'over site' for www.espn.com comeptitor pages
"am", // 'am so' for voyageofattraction.com
"be",
"being",
"been",
"so",
"soh",
"moar",
"more",
"most",
"than",
"much",
"los", // fix 'los dos y com' for www.espn.com comeptitor pages
"dos",
"view", // fix for jezebelgallery.com 'view homepage'
"viewed",
"views",
"viewing",
"homepage",
"homepages",
"webpage",
"webpages",
"home",
"homed",
"homing",
"wit", // wtf?
"homes",
"house",
"houses",
"housed",
"housing",
"page",
// fix getting 'green web' and 'green pages' for gigablast.com
// as two independent queries for a competitor
"pages",
// damn, paged is a synonym of pages
"paged",
"paging",
"info",
"infos",
"informative",
"information", // 'the information' for wcnews.com
"site",
"sites",
"sited",
"siting",
"is", // fix 'is website'
"welcome", // whitehouse.gov fix
"online",
"am", // 'am web' query
"y", // spanish for "and"
"at",
"be",
"by",
"on",
"or",
"do",
"doesn't",
"in",
"into",
"i",
"an",
"or",
"as",
"at",
"by",
"for",
"with",
"about",
"from",
"any", // stop 'any web' for diffbot.com
// german is messing us up so that two queries that should
// be basically the same "dos code" and "codes" are not! they
// should have the same synbasehash64! fix for cheatcc.com
// competitor pages from getting legal sites.
// because it matches "dos codes"
"dos",
"de",
"die",
"del",
"via",
"e",
// spanish. messing up ibm.com competitor pages.
// because it matches "es international"
"es",
// fix newser.com 'more of you' 'know you' 'know more'
"you", // "where do you" "you but" "but you"
"your",
"what",
"wat",
"where", // "and where you"
"who",
"when",
"what's",
"where's",
"who's", // 'who's who' for www.fudwatch.co.uk
"when's",
"which",
"wich",
"but", // "and but"
"ver", // fix ver ver related query everyone matches for some reason
"click", // click here is so popular!
"clicked",
"clicks",
"clicking",
"klick",
"klicked"
"klicks",
"klicking",
"here",
"per",
"a",
"b",
"c",
"d",
"e",
"f",
"g",
"h",
"i",
"j",
"k",
"l",
"m",
"n",
"o",
"p",
"q",
"r",
"s",
"t",
"u",
"v",
"w",
"x",
"y",
"z",
"innen", // wtf is this?
// fix matching queries for yahoo.com:
"inc",
"go",
"goes",
"going",
"gone",
"went",
"link",
"links",
"linked",
"hyperlinking",
"hyperlink",
"hyperlinks",
"hyperlinked",
"hyperlinking",
"exit",
"ing", // wtf?
"ed", // wtf?
"om",
"por",
"their",
"theirs",
"doh", // syn of do!
"do",
"don't",
"doesn't",
"did",
"does",
"done",
"do's",
"doing",
"hame", // wtf?
"were",
"was",
"can",
"cans",
"canning",
"canned",
"are",
"if",
"his",
"hers",
"him",
"her",
"fand", // wtf?
"s's",
"a's",
"he",
"she",
"that",
"en", // spanish?
"le", // french?
"will",
"willy",
"www",
"w3", // synonym for www
"com",
"coms", // synonym for com
"org",
"orgs",
"net", // .net
"nets",
"edu",
"gov",
"no", // fix 'no no' missing term for army-list.com
"my", // fix 'my' missing term for army-list.com
//"no", // 'no http' seems common. because we were ignoring "no"
// because it was a query stop word in portuguese!!
"it", // this hurts I.T. i guess...
"http",
"https",
"web",
"webs",
"below",
"site",
"website",
"sites",
"websites",
// until we fix it right! this shows up so much
"lincoln",
"lincolns"
};
// . used by Msg24.cpp for gigabits generation
// . h is the full wordid, not 48-bit termid
// . you can now pass in a 32-bit word hash instead of 64 and it should
// still work!!!
int32_t isCommonQueryWordInEnglish ( int64_t h64 ) {
// include a bunch of foreign prepositions so they don't get required
// by the bitScores in IndexTable.cpp
if ( ! s_commonQueryWordsInitialized ) {
// set up the hash table
int32_t ss = sizeof(s_commonQueryWords);
if ( ! s_commonQueryWordTable.set (8,4,ss*2,
NULL,0,false,0,
"commonwrds") )
return log(LOG_INIT,
"query: Could not init common words "
"table.");
// now add in all the stop words
int32_t n = (int32_t)sizeof(s_commonQueryWords)/ sizeof(char *);
for ( int32_t i = 0 ; i < n ; i++ ) {
char *sw = s_commonQueryWords[i];
int32_t swlen = gbstrlen ( sw );
// use the same algo that Words.cpp computeWordIds does
int64_t swh64 = hash64Lower_utf8 ( sw , swlen );
if ( ! s_commonQueryWordTable.addTerm ( &swh64,i+1 ) )
return false;
// if you pass in a 32-bit "h64" from hash32n()
// you must make sure it is UNSIGNED so the top
// 32 bits of the h64 are not set to 0xffffffff
// two's complement
swh64 &= 0x00000000ffffffffLL;
if ( ! s_commonQueryWordTable.addTerm ( &swh64,i+1 ) )
return false;
swh64 |= 0xffffffff00000000LL;
if ( ! s_commonQueryWordTable.addTerm ( &swh64,i+1 ) )
return false;
// . add w/o accent marks too!
// . skip "für" though because fur is an eng. word
//if ( *sw=='f' && *(sw+1)=='ü' &&
// *(sw+2)=='r' && swlen == 3 ) continue;
//swh = hash64AsciiLower ( sw , swlen );
//s_commonQueryWordTable.addTerm (swh,i+1,i+1,true);
}
s_commonQueryWordsInitialized = true;
// sanity test
int32_t tid32 = hash32n("on");
if ( !isCommonQueryWordInEnglish(tid32)){char *xx=NULL;*xx=0;}
tid32 = hash32n("web");
if ( !isCommonQueryWordInEnglish(tid32)){char *xx=NULL;*xx=0;}
}
// . all 1 char letter words are stop words
// . good for initials and some contractions
//if ( len == 1 && is_alpha_a(*s) ) return true;
// get from table
return (int32_t)s_commonQueryWordTable.getScore ( &h64 );
}
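The masking above is the whole trick: each word is stored under three keys so a lookup succeeds whether the caller holds the full 64-bit word id or a 32-bit hash that was zero- or sign-extended to 64 bits. A self-contained sketch of just that computation (the helper name is illustrative):

#include <cstdint>

static void variantsOf ( int64_t h64 , int64_t out[3] ) {
	out[0] = h64;                            // full 64-bit word id
	out[1] = h64 & 0x00000000ffffffffLL;     // zero-extended low 32
	out[2] = out[1] | 0xffffffff00000000LL;  // sign-extended low 32
}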

@ -13,27 +13,18 @@ bool isStopWord ( char *s , int32_t len , int64_t h ) ;
// used by Synonyms.cpp
bool isStopWord2 ( int64_t *h ) ;
//just a stub for now
//bool isStopWord ( UChar *s , int32_t len , int64_t h );
// . damn i forgot to include these above
// . i need these so m_bitScores in IndexTable.cpp doesn't have to require
// them! Otherwise, it's like all queries have quotes around them again...
bool isQueryStopWord ( char *s , int32_t len , int64_t h , int32_t langId ) ;
//bool isQueryStopWord ( UChar *s , int32_t len , int64_t h ) ;
// is it a COMMON word?
int32_t isCommonWord ( int64_t h ) ;
int32_t isCommonQueryWordInEnglish ( int64_t h ) ;
bool initWordTable(class HashTableX *table, char* words[],
//int32_t size ,
char *label);
bool isVerb ( int64_t *hp ) ;
// for Process.cpp::resetAll() to call when exiting to free all mem
void resetStopWordTables();

@ -661,7 +661,6 @@ int64_t Summary::getBestWindow ( Matches *matches, int32_t mm, int32_t *lasta,
}
// . we NULLify the section ptrs if we already used the word in another summary.
// . google seems to index SEC_MARQUEE, so i took that out of here
int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;
if ( (bb[matchWordNum] & D_USED) || ( sp && (sp[matchWordNum]->m_flags & badFlags) ) ) {
// assume no best window
@ -1059,7 +1058,6 @@ bool Summary::getDefaultSummary ( Xml *xml, Words *words, Sections *sections, Po
int32_t bestEnd = -1;
int32_t longestConsecutive = 0;
int32_t lastAlnum = -1;
// google seems to index SEC_MARQUEE, so i took that out of here
int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;
// shortcut
nodeid_t *tids = words->m_tagIds;

@ -26,12 +26,10 @@ public:
// . set m_title to the title of the document represented by "xd"
// . if getHardTitle is true will always use the title in the <title>
// tag, but if that is not present, will try dmoz titles before
// resorting to trying to guess a title from the document content
// or incoming link text.
// tag, but if that is not present, will resort to trying to guess
// a title from the document content or incoming link text.
// . uses the following:
// . title tag
// . dmoz title
// . meta title tag
// . incoming link text
// . <hX> tags at the top of the scored content

@ -1,7 +1,6 @@
#include "gb-include.h"
#include "Words.h"
#include "Phrases.h" // for isInPhrase() for hashWordIffNotInPhrase
#include "Unicode.h" // getUtf8CharSize()
#include "StopWords.h"
#include "Speller.h"
@ -108,7 +107,9 @@ int32_t countWords ( char *p ) {
bool Words::set( Xml *xml, bool computeWordIds, int32_t niceness, int32_t node1, int32_t node2 ) {
// prevent setting with the same string
if ( m_xml == xml ) { char *xx=NULL;*xx=0; }
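// Note (added): "char *xx=NULL;*xx=0;" is this codebase's deliberate
// crash-assert idiom; it segfaults (and cores) on a programming error
// instead of continuing with bad state.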
reset();
m_xml = xml;
// if xml is empty, bail
@ -171,12 +172,6 @@ bool Words::set( Xml *xml, bool computeWordIds, int32_t niceness, int32_t node1,
m_tagIds[m_numWords] |= BACKBIT;
}
//log(LOG_DEBUG, "Words: Word %"INT32": got tag %s%s (%d)",
// m_numWords,
// isBackTag(m_numWords)?"/":"",
// g_nodes[getTagId(m_numWords)].m_nodeName,
// getTagId(m_numWords));
m_numWords++;
// used by XmlDoc.cpp
@ -188,41 +183,6 @@ bool Words::set( Xml *xml, bool computeWordIds, int32_t niceness, int32_t node1,
return true;
}
bool Words::set11 ( char *s , char *send , int32_t niceness ) {
reset();
// this will make addWords() scan for tags
m_hasTags = true;
// save it
char saved = *send;
// null term
*send = '\0';
// determine rough upper bound on number of words by counting
// punct/alnum boundaries
m_preCount = countWords ( s );
// true = tagIds
bool status = allocateWordBuffers(m_preCount,true);
// deal with error now
if ( !status ) {
*send = saved;
return false;
}
// and set the words
status = addWords(s,0x7fffffff, true, niceness );
// bring it back
*send = saved;
// return error?
return status;
}
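// Note (added): because of the *send save/restore above, "send" must
// point into a writable buffer; passing the end of a string literal
// here would fault on the temporary NUL write.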
// . set words from a string
// . assume no HTML entities in the string "s"
// . s must be NULL terminated
@ -249,10 +209,7 @@ bool Words::set( char *s, bool computeWordIds, int32_t niceness ) {
bool Words::addWords( char *s, int32_t nodeLen, bool computeWordIds, int32_t niceness ) {
int32_t i = 0;
int32_t j;
//int32_t k = 0;
int32_t wlen;
//uint32_t e;
//int32_t skip;
int32_t badCount = 0;
bool hadApostrophe = false;
@ -453,21 +410,11 @@ bool Words::addWords( char *s, int32_t nodeLen, bool computeWordIds, int32_t nic
m_words [ m_numWords ] = &s[j];
m_wordLens[ m_numWords ] = wlen;
// . Lars says it's better to leave the accented chars intact
// . google agrees
// . but what about "re'sume"?
if ( computeWordIds ) {
int64_t h = hash64Lower_utf8(&s[j],wlen);
m_wordIds [m_numWords] = h;
// until we get an accent removal algo, comment this
// out and possibly use the query synonym pipeline
// to search without accents. MDW
//int64_t h2 = hash64AsciiLowerE(&s[j],wlen);
//if ( h2 != h ) m_stripWordIds [m_numWords] = h2;
//else m_stripWordIds [m_numWords] = 0LL;
//m_stripWordIds[m_numWords] = 0;
}
m_nodes[m_numWords] = 0;
if (m_tagIds) m_tagIds[m_numWords] = 0;
m_numWords++;
@ -658,7 +605,6 @@ int32_t Words::getLanguage( Sections *sections ,
return -1;
// . avoid words in these bad sections
// . google seems to index SEC_MARQUEE so i took that out of badFlags
int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT;
// shortcuts
int64_t *wids = m_wordIds;
@ -798,34 +744,6 @@ int32_t Words::getLanguage( Sections *sections ,
return l;
}
// get the word index at the given character position
int32_t Words::getWordAt ( char *p ) { // int32_t charPos ) {
if ( ! p ) { char *xx=NULL;*xx=0; }
if ( p < m_words[0] ) { char *xx=NULL;*xx=0; }
if ( p >= getContentEnd() ) { char *xx=NULL;*xx=0; }
int32_t step = m_numWords / 2;
int32_t i = m_numWords / 2 ;
for (;;) {
// divide it by 2 each time
step >>= 1;
// always at least one
if ( step <= 0 )
step = 1;
// is it a hit?
if ( p >= m_words[i] && p < m_words[i] + m_wordLens[i] )
return i;
// compare
if ( m_words[i] < p )
i += step;
else
i -= step;
}
return -1;
}
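// Note (added): the loop always terminates because Words covers the
// content contiguously (alnum, punct and tag "words"), so any p that
// passes the checks above lies in exactly one m_words[i] span; once
// step decays to 1 the search walks linearly onto it, which makes the
// trailing return -1 unreachable.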
// . return the value of the specified "field" within this html tag, "s"
// . the case of "field" does not matter
char *getFieldValue ( char *s ,

21
Words.h

@ -56,8 +56,6 @@ class Words {
// . html tags are NOT parsed out
bool set( char *s, bool computeIds, int32_t niceness );
bool set11 ( char *s , char *send , int32_t niceness ) ;
// . similar to above
// . but we temporarily stick a \0 @ s[slen] for parsing purposes
bool set( char *s, int32_t slen, bool computeIds, int32_t niceness = 0 );
@ -112,8 +110,6 @@ class Words {
return size;
}
int32_t getWordAt ( char *charPos );
// . CAUTION: don't call this for punct "words"... it's bogus for them
// . this is only for alnum "words"
int64_t getWordId( int32_t n ) const {
@ -121,16 +117,11 @@ class Words {
}
bool isStopWord ( int32_t n ) {
return ::isStopWord(m_words [n],
m_wordLens[n],
m_wordIds [n]);
return ::isStopWord( m_words[n], m_wordLens[n], m_wordIds[n] );
}
bool isQueryStopWord ( int32_t n , int32_t langId ) {
return ::isQueryStopWord(m_words [n],
m_wordLens[n],
m_wordIds [n],
langId);
return ::isQueryStopWord( m_words[n], m_wordLens[n], m_wordIds[n], langId );
}
@ -180,13 +171,7 @@ class Words {
return false;
}
bool isSpaces ( int32_t n ) {
for ( int32_t i = 0 ; i < m_wordLens[n] ; i++ )
if ( ! is_wspace_utf8(&m_words[n][i]) ) return false;
return true;
}
bool isSpaces2 ( int32_t n , int32_t starti ) {
bool isSpaces ( int32_t n , int32_t starti = 0 ) {
for ( int32_t i = starti ; i < m_wordLens[n] ; i++ )
if ( ! is_wspace_utf8(&m_words[n][i]) ) return false;
return true;

4034
XmlDoc.cpp

File diff suppressed because it is too large

355
XmlDoc.h

@ -18,38 +18,30 @@
#ifndef _XMLDOC_H_
#define _XMLDOC_H_
//#include "HashTableX.h"
#include "Lang.h"
#include "Words.h"
#include "Bits.h"
#include "Pos.h"
#include "Phrases.h"
//#include "Synonyms.h"
//#include "Weights.h"
#include "Xml.h"
#include "SafeBuf.h"
#include "Images.h"
#include "Sections.h"
#include "Msge0.h"
#include "Msge1.h"
//#include "Msge2.h"
#include "Msg4.h"
#include "SearchInput.h"
#include "Msg40.h"
//#include "IndexList.h"
#include "Msg0.h"
#include "Msg22.h"
#include "Tagdb.h"
#include "Url.h"
#include "Linkdb.h"
//#include "LinkInfo.h"
//#include "Msg25.h"
#include "MsgC.h"
#include "Msg13.h"
#include "RdbList.h"
#include "SiteGetter.h"
//#include "CollectionRec.h"
#include "Msg20.h"
#include "Matches.h"
#include "Query.h"
@ -62,24 +54,15 @@
#include "PingServer.h"
#include "Json.h"
//#define XMLDOC_MAX_AD_IDS 4
//#define XMLDOC_ADLEN 64
#define MAXFRAGWORDS 80000
#define MAX_WIKI_DOCIDS 20
#define MAX_TAG_PAIR_HASHES 100
#include "Msg40.h"
//#define SAMPLE_VECTOR_SIZE (32*4)
#define POST_VECTOR_SIZE (32*4)
#define XD_GQ_MAX_SIZE 1000
#define XD_MAX_GIGABIT_HASHES 48
#define XD_MAX_AD_IDS 5
#define MAX_LINK_TEXT_LEN 512
#define MAX_SURROUNDING_TEXT_WIDTH 600
@ -280,11 +263,11 @@ public:
char *ptr_firstUrl;
char *ptr_redirUrl;
char *ptr_rootTitleBuf;
int32_t *ptr_gigabitHashes;
int32_t *ptr_gigabitScores;
int32_t *ptr_unused12;
int32_t *ptr_unused13;
void *ptr_unused8;
int64_t *ptr_wikiDocIds;
rscore_t *ptr_wikiScores;
int64_t *ptr_unused10;
rscore_t *ptr_unused11;
char *ptr_imageData;
int32_t *ptr_unused6;
int32_t *ptr_unused7;
@ -305,11 +288,11 @@ public:
int32_t size_firstUrl;
int32_t size_redirUrl;
int32_t size_rootTitleBuf;
int32_t size_gigabitHashes;
int32_t size_gigabitScores;
int32_t size_unused12;
int32_t size_unused13;
int32_t size_unused8;
int32_t size_wikiDocIds;
int32_t size_wikiScores;
int32_t size_unused10;
int32_t size_unused11;
int32_t size_imageData;
int32_t size_unused6;
int32_t size_unused7;
@ -404,9 +387,6 @@ public:
SafeBuf m_spiderStatusDocMetaList;
char *getIsAdult ( ) ;
int64_t **getWikiDocIds ( ) ;
void gotWikiResults ( class UdpSlot *slot );
//class HashTableX *getClockCandidatesTable();
int32_t getOutlinkAge ( int32_t outlinkNum ) ;
char *getIsPermalink ( ) ;
char *getIsUrlPermalinkFormat ( ) ;
@ -421,19 +401,7 @@ public:
class Bits *getBitsForSummary ( ) ;
class Pos *getPos ( );
class Phrases *getPhrases ( ) ;
//class Synonyms *getSynonyms ( );
class Sections *getExplicitSections ( ) ;
class Sections *getImpliedSections ( ) ;
class Sections *getSections ( ) ;
class Sections *getSectionsWithDupStats ( );
//BR 20160106 removed: class SafeBuf *getInlineSectionVotingBuf();
bool gotSectionFacets( class Multicast *mcast );
class SectionStats *getSectionStats ( uint32_t secHash32, uint32_t sentHash32, bool cacheOnly );
class SectionVotingTable *getOldSectionVotingTable();
class SectionVotingTable *getNewSectionVotingTable();
char **getSectionsReply ( ) ;
char **getSectionsVotes ( ) ;
HashTableX *getSectionVotingTable();
int32_t *getLinkSiteHashes ( );
class Links *getLinks ( bool doQuickSet = false ) ;
class HashTableX *getCountTable ( ) ;
@ -442,36 +410,21 @@ public:
int32_t *getSummaryVector ( ) ;
int32_t *getPageSampleVector ( ) ;
int32_t *getPostLinkTextVector ( int32_t linkNode ) ;
int32_t computeVector ( class Sections *sections, class Words *words,
uint32_t *vec , int32_t start = 0 , int32_t end = -1 );
int32_t computeVector ( class Words *words, uint32_t *vec , int32_t start = 0 , int32_t end = -1 );
float *getTagSimilarity ( class XmlDoc *xd2 ) ;
float *getGigabitSimilarity ( class XmlDoc *xd2 ) ;
float *getPageSimilarity ( class XmlDoc *xd2 ) ;
float *getPercentChanged ( );
uint64_t *getFuzzyDupHash ( );
int64_t *getExactContentHash64();
int64_t *getLooseContentHash64();
class RdbList *getDupList ( ) ;
class RdbList *getLikedbListForReq ( );
class RdbList *getLikedbListForIndexing ( );
char *getIsDup ( ) ;
char *isDupOfUs ( int64_t d ) ;
uint32_t *getGigabitVectorScorelessHash ( ) ;
int32_t **getGigabitHashes ( );
char *getGigabitQuery ( ) ;
char *getMetaDescription( int32_t *mdlen ) ;
char *getMetaSummary ( int32_t *mslen ) ;
char *getMetaKeywords( int32_t *mklen ) ;
char *getMetaGeoPlacename( int32_t *mgplen );
bool addGigabits ( char *s , int64_t docId , uint8_t langId ) ;
bool addGigabits2 ( char *s,int32_t slen,int64_t docId,uint8_t langId);
bool addGigabits ( class Words *ww ,
int64_t docId,
class Sections *sections,
//class Weights *we ,
uint8_t langId );
int32_t *getSiteSpiderQuota ( ) ;
class Url *getCurrentUrl ( ) ;
class Url *getFirstUrl() ;
@ -626,10 +579,6 @@ public:
char *addOutlinkSpiderRecsToMetaList ( );
//bool addTable96 ( class HashTableX *tt1 ,
// int32_t date1 ,
// bool nosplit ) ;
int32_t getSiteRank ();
bool addTable144 ( class HashTableX *tt1 ,
int64_t docId ,
@ -637,11 +586,6 @@ public:
bool addTable224 ( HashTableX *tt1 ) ;
//bool addTableDate ( class HashTableX *tt1 , //T<key128_t,char> *tt1
// uint64_t docId ,
// uint8_t rdbId ,
// bool nosplit ) ;
bool addTable128 ( class HashTableX *tt1 , // T <key128_t,char>*tt1
uint8_t rdbId ,
bool forDelete ) ;
@ -662,10 +606,7 @@ public:
bool hashUrl ( class HashTableX *table, bool urlOnly );
bool hashDateNumbers ( class HashTableX *tt );
bool hashSections ( class HashTableX *table ) ;
bool hashIncomingLinkText ( class HashTableX *table ,
bool hashAnomalies ,
bool hashNonAnomalies ) ;
bool hashIncomingLinkText( class HashTableX *table, bool hashAnomalies, bool hashNonAnomalies );
bool hashLinksForLinkdb ( class HashTableX *table ) ;
bool hashNeighborhoods ( class HashTableX *table ) ;
bool hashRSSInfo ( class HashTableX *table ) ;
@ -683,11 +624,8 @@ public:
bool hashTagRec ( class HashTableX *table ) ;
bool hashPermalink ( class HashTableX *table ) ;
bool hashVectors(class HashTableX *table ) ;
// BR 20160106 removed: bool hashAds(class HashTableX *table ) ;
class Url *getBaseUrl ( ) ;
// BR 20160106 removed: bool hashSubmitUrls ( class HashTableX *table ) ;
// BR 20160106 removed: bool hashImageStuff ( class HashTableX *table ) ;
bool hashIsAdult ( class HashTableX *table ) ;
void set20 ( Msg20Request *req ) ;
@ -700,8 +638,6 @@ public:
class Title *getTitle ();
class Summary *getSummary () ;
char *getHighlightedSummary ();
SafeBuf *getSampleForGigabits ( ) ;
SafeBuf *getSampleForGigabitsJSON ( ) ;
char *getIsNoArchive ( ) ;
int32_t *getUrlFilterNum();
char *getIsLinkSpam ( ) ;
@ -709,64 +645,21 @@ public:
char *getIsErrorPage ( ) ;
char* matchErrorMsg(char* p, char* pend );
bool hashWords ( //int32_t wordStart ,
//int32_t wordEnd ,
class HashInfo *hi ) ;
bool hashSingleTerm ( int64_t termId ,
class HashInfo *hi ) ;
bool hashSingleTerm ( char *s ,
int32_t slen ,
class HashInfo *hi );
bool hashString ( class HashTableX *ht ,
//class Weights *we ,
class Bits *bits ,
char *s ,
int32_t slen ) ;
bool hashString ( char *s ,
int32_t slen ,
class HashInfo *hi ) ;
bool hashString ( char *s ,
class HashInfo *hi ) ;
bool hashWords( class HashInfo *hi );
bool hashSingleTerm( int64_t termId, class HashInfo *hi );
bool hashSingleTerm( char *s, int32_t slen, class HashInfo *hi );
bool hashString( class HashTableX *ht, class Bits *bits, char *s, int32_t slen );
bool hashString( char *s, int32_t slen, class HashInfo *hi );
bool hashString( char *s, class HashInfo *hi );
bool hashWords3( class HashInfo *hi, class Words *words, class Phrases *phrases, class Synonyms *synonyms,
class Sections *sections, class HashTableX *countTable, char *fragVec, char *wordSpamVec,
char *langVec, char docLangId, class SafeBuf *pbuf, class HashTableX *wts,
class SafeBuf *wbuf, int32_t niceness );
bool hashWords3 ( //int32_t wordStart ,
//int32_t wordEnd ,
class HashInfo *hi ,
class Words *words ,
class Phrases *phrases ,
class Synonyms *synonyms ,
class Sections *sections ,
class HashTableX *countTable ,
char *fragVec ,
char *wordSpamVec ,
char *langVec ,
char docLangId , // default lang id
class SafeBuf *pbuf ,
class HashTableX *wts ,
class SafeBuf *wbuf ,
int32_t niceness );
bool hashString3 ( char *s ,
int32_t slen ,
class HashInfo *hi ,
class HashTableX *countTable ,
class SafeBuf *pbuf ,
class HashTableX *wts ,
class SafeBuf *wbuf ,
int32_t version ,
int32_t siteNumInlinks ,
int32_t niceness );
//bool hashSectionTerm ( char *term ,
// class HashInfo *hi ,
// int32_t sentHash32 ) ;
bool hashFacet1 ( char *term, class Words *words , HashTableX *dt) ;
bool hashFacet2 ( char *prefix,char *term,int32_t val32, HashTableX *dt,
bool shardByTermId = false ) ;
bool hashString3( char *s, int32_t slen, class HashInfo *hi, class HashTableX *countTable,
class SafeBuf *pbuf, class HashTableX *wts, class SafeBuf *wbuf, int32_t version,
int32_t siteNumInlinks, int32_t niceness );
// gbfieldmatch:
bool hashFieldMatchTerm ( char *val, int32_t vlen, class HashInfo *hi);
@ -788,8 +681,6 @@ public:
FacetValHash_t fvh ) ;
bool storeFacetValuesSite ( char *qs , SafeBuf *sb ,
FacetValHash_t fvh );
bool storeFacetValuesSections ( char *qs , class SafeBuf *sb ,
FacetValHash_t fvh ) ;
bool storeFacetValuesHtml ( char *qs , class SafeBuf *sb ,
FacetValHash_t fvh ) ;
bool storeFacetValuesXml ( char *qs , class SafeBuf *sb ,
@ -819,16 +710,12 @@ public:
public:
// stuff set from the key of the titleRec, above the compression area
//key_t m_key;
int64_t m_docId;
char *m_ubuf;
int32_t m_ubufSize;
int32_t m_ubufAlloc;
// does this page link to gigablast, or have a search form to it?
//bool searchboxToGigablast();
// private:
// when we started spidering it, in milliseconds since the epoch
@ -843,16 +730,6 @@ public:
int64_t m_setTime;
int64_t m_cpuSummaryStartTime;
// timers
int64_t m_beginSEOTime;
int64_t m_beginTimeAllMatch;
int64_t m_beginTimeMatchUrl;
int64_t m_beginTimeFullQueries;
int64_t m_beginTimeLinks;
//int64_t m_beginMsg98s;
int64_t m_beginRelatedQueries;
int64_t m_beginMsg95s;
// . these should all be set using set*() function calls so their
// individual validity flags can be set to true, and successive
// calls to their corresponding get*() functions will not core
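// a minimal sketch of that convention (member names illustrative, not
// actual members of this class):
//
// void XmlDoc::setFoo ( int32_t foo ) {
//	m_foo = foo;
//	m_fooValid = true;
// }
// int32_t *XmlDoc::getFoo ( ) {
//	if ( ! m_fooValid ) { char *xx=NULL;*xx=0; } // deliberate core
//	return &m_foo;
// }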
@ -873,8 +750,6 @@ public:
int64_t m_firstUrlHash64;
Url m_currentUrl;
//char *m_coll;
//char m_collBuf[MAX_COLL_LEN+1]; // include \0
CollectionRec *m_lastcr;
collnum_t m_collnum;
int32_t m_lastCollRecResetCount;
@ -908,91 +783,24 @@ public:
Bits m_bits2;
Pos m_pos;
Phrases m_phrases;
//Synonyms m_synonyms;
SafeBuf m_synBuf;
//Weights m_weights;
Sections m_sections;
// a hack storage thing used by Msg13.cpp
class Msg13Request *m_hsr;
Section *m_si;
//Section *m_nextSection;
//Section *m_lastSection;
int32_t m_mcastRequestsOut;
int32_t m_mcastRequestsIn;
int32_t m_secStatsErrno;
char *m_queryBuf;
Msg39Request *m_msg39RequestArray;
SafeBuf m_mcastBuf;
Multicast *m_mcastArray;
//char *m_inUse;
//Query *m_queryArray;
//Query *m_sharedQuery;
bool m_gotDupStats;
//Query m_q4;
//Msg3a m_msg3a;
//Msg39Request m_r39;
Msg39Request m_mr2;
SectionStats m_sectionStats;
HashTableX m_sectionStatsTable;
//char m_sectionHashQueryBuf[128];
// also set in getSections()
int32_t m_maxVotesForDup;
// . for rebuild logging of what's changed
// . Repair.cpp sets these based on titlerec
char m_logLangId;
int32_t m_logSiteNumInlinks;
SectionVotingTable m_nsvt;
SectionVotingTable m_osvt;
int32_t m_numSectiondbReads;
int32_t m_numSectiondbNeeds;
key128_t m_sectiondbStartKey;
RdbList m_secdbList;
int32_t m_sectiondbRecall;
bool m_gotFacets;
SafeBuf m_tmpBuf2;
SafeBuf m_inlineSectionVotingBuf;
//HashTableX m_rvt;
//Msg17 m_msg17;
//char *m_cachedRootVoteRec;
//int32_t m_cachedRootVoteRecSize;
//bool m_triedVoteCache;
//bool m_storedVoteCache;
//SafeBuf m_cacheRecBuf;
SafeBuf m_timeAxisUrl;
HashTableX m_turkVotingTable;
HashTableX m_turkBitsTable;
uint32_t m_confirmedTitleContentHash ;
uint32_t m_confirmedTitleTagHash ;
// turk voting tag rec
TagRec m_vtr;
// tagrec of banned turks
TagRec m_bannedTurkRec;
// and the table of the hashed banned turk users
HashTableX m_turkBanTable;
// used for displaying turk votes...
HashTableX m_vctab;
HashTableX m_vcduptab;
Images m_images;
HashTableX m_countTable;
HttpMime m_mime;
TagRec m_tagRec;
SafeBuf m_tagRecBuf;
// copy of m_oldTagRec but with our modifications, if any
//TagRec m_newTagRec;
SafeBuf m_newTagBuf;
SafeBuf m_fragBuf;
SafeBuf m_wordSpamBuf;
@ -1002,9 +810,6 @@ public:
class SafeBuf *m_savedSb;
class HttpRequest *m_savedHr;
char m_savedChar;
// validity flags. on reset() all these are set to false.
char m_VALIDSTART;
// DO NOT add validity flags above this line!
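// (bracketing every flag between m_VALIDSTART and m_VALIDEND presumably
// lets reset() wipe them all with a single call, something like
// memset ( &m_VALIDSTART , 0 , &m_VALIDEND - &m_VALIDSTART + 1 );
// which is why nothing may be added outside the two markers)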
@ -1013,7 +818,6 @@ public:
char m_addedSpiderReplySizeValid;
char m_addedStatusDocSizeValid;
char m_downloadStartTimeValid;
//char m_docQualityValid;
char m_siteValid;
char m_startTimeValid;
char m_currentUrlValid;
@ -1025,7 +829,6 @@ public:
char m_lastUrlValid;
char m_docIdValid;
char m_availDocIdValid;
//char m_collValid;
char m_tagRecValid;
char m_robotsTxtLenValid;
char m_tagRecDataValid;
@ -1034,7 +837,6 @@ public:
char m_filteredRootTitleBufValid;
char m_titleBufValid;
char m_fragBufValid;
char m_inlineSectionVotingBufValid;
char m_wordSpamBufValid;
char m_finalSummaryBufValid;
char m_matchingQueryBufValid;
@ -1042,32 +844,24 @@ public:
char m_relatedQueryBufValid;
char m_queryLinkBufValid;
char m_redirSpiderRequestValid;
//char m_queryPtrsValid;
char m_queryOffsetsValid;
//char m_queryPtrsSortedValid;
char m_queryPtrsWholeValid;
char m_relatedDocIdBufValid;
char m_topMatchingQueryBufValid;
char m_relatedDocIdsScoredBufValid;
char m_relatedDocIdsWithTitlesValid;
char m_relatedTitleBufValid;
//char m_queryLinkBufValid;
char m_missingTermBufValid;
char m_matchingTermBufValid;
//char m_relPtrsValid;
char m_sortedPosdbListBufValid;
char m_wpSortedPosdbListBufValid;
char m_termListBufValid;
char m_insertableTermsBufValid;
char m_scoredInsertableTermsBufValid;
//char m_iwfiBufValid; // for holding WordFreqInfo instances
char m_wordPosInfoBufValid;
char m_recommendedLinksBufValid;
//char m_queryHashTableValid;
char m_queryOffsetTableValid;
//char m_socketWriteBufValid;
//char m_numBannedOutlinksValid;
char m_hopCountValid;
char m_isInjectingValid;
char m_isImportingValid;
@ -1091,35 +885,19 @@ public:
char m_posValid;
char m_isUrlBadYearValid;
char m_phrasesValid;
//char m_synonymsValid;
//char m_weightsValid;
char m_sectionsValid;
char m_subSentsValid;
char m_osvtValid;
char m_nsvtValid;
//char m_rvtValid;
char m_turkVotingTableValid;
char m_turkBitsTableValid;
char m_turkBanTableValid;
char m_vctabValid;
char m_explicitSectionsValid;
char m_impliedSectionsValid;
char m_sectionVotingTableValid;
char m_imageDataValid;
char m_imagesValid;
char m_msge0Valid;
char m_msge1Valid;
//char m_msge2Valid;
//char m_sampleVectorValid;
char m_gigabitHashesValid;
//char m_oldsrValid;
char m_sreqValid;
char m_srepValid;
bool m_ipValid;
bool m_firstIpValid;
bool m_spideredTimeValid;
//bool m_nextSpiderTimeValid;
bool m_indexedTimeValid;
bool m_firstIndexedValid;
bool m_isInIndexValid;
@ -1127,26 +905,16 @@ public:
bool m_outlinksAddedDateValid;
bool m_countryIdValid;
bool m_bodyStartPosValid;
/*
bool m_titleWeightValid;
bool m_headerWeightValid;
bool m_urlPathWeightValid;
bool m_externalLinkTextWeightValid;
bool m_internalLinkTextWeightValid;
bool m_conceptWeightValid;
*/
bool m_httpStatusValid;
bool m_crawlDelayValid;
bool m_finalCrawlDelayValid;
bool m_titleRecKeyValid;
bool m_wikiDocIdsValid;
bool m_versionValid;
bool m_rawUtf8ContentValid;
bool m_expandedUtf8ContentValid;
bool m_utf8ContentValid;
bool m_isAllowedValid;
//bool m_tryAgainTimeDeltaValid;
//bool m_eliminateMenusValid;
bool m_redirUrlValid;
bool m_redirCookieBufValid;
bool m_metaRedirUrlValid;
@ -1163,11 +931,9 @@ public:
bool m_redirErrorValid;
bool m_domHash32Valid;
bool m_contentHash32Valid;
//bool m_tagHash32Valid;
bool m_tagPairHash32Valid;
bool m_spiderLinksValid;
//bool m_nextSpiderPriorityValid;
bool m_firstIndexedDateValid;
bool m_isPermalinkValid;
@ -1186,8 +952,6 @@ public:
bool m_dupListValid;
bool m_likedbListValid;
bool m_isDupValid;
bool m_gigabitVectorHashValid;
bool m_gigabitQueryValid;
bool m_metaDescValid;
bool m_metaSummaryValid;
bool m_metaKeywordsValid;
@ -1196,23 +960,16 @@ public:
bool m_oldDocValid;
bool m_extraDocValid;
bool m_rootDocValid;
//bool m_gatewayDocValid;
bool m_oldMetaListValid;
bool m_oldTitleRecValid;
bool m_rootTitleRecValid;
bool m_isIndexedValid;
bool m_siteNumInlinksValid;
//bool m_siteNumInlinksUniqueIpValid;//FreshValid;
//bool m_siteNumInlinksUniqueCBlockValid;//sitePopValid
//bool m_siteNumInlinksTotalValid;
bool m_siteNumInlinks8Valid;
bool m_siteLinkInfoValid;
bool m_isWWWDupValid;
bool m_linkInfo1Valid;
bool m_linkSiteHashesValid;
bool m_sectionsReplyValid;
bool m_sectionsVotesValid;
bool m_sectiondbDataValid;
bool m_placedbDataValid;
bool m_siteHash64Valid;
bool m_siteHash32Valid;
@ -1228,7 +985,6 @@ public:
bool m_isSiteRootValid;
bool m_wasContentInjectedValid;
bool m_outlinkHopCountVectorValid;
//bool m_isSpamValid;
bool m_isFilteredValid;
bool m_urlFilterNumValid;
bool m_numOutlinksAddedValid;
@ -1245,7 +1001,6 @@ public:
bool m_titleValid;
bool m_htbValid;
bool m_collnumValid;
//bool m_twidsValid;
bool m_termId32BufValid;
bool m_termInfoBufValid;
bool m_newTermInfoBufValid;
@ -1254,9 +1009,6 @@ public:
bool m_spiderStatusDocMetaListValid;
bool m_isCompromisedValid;
bool m_isNoArchiveValid;
//bool m_isVisibleValid;
//bool m_clockCandidatesTableValid;
//bool m_clockCandidatesDataValid;
bool m_titleRecBufValid;
bool m_isLinkSpamValid;
bool m_isErrorPageValid;
@ -1280,19 +1032,9 @@ public:
// DO NOT add validity flags below this line!
char m_VALIDEND;
// more stuff
//char *m_utf8Content;
//int32_t m_utf8ContentLen;
// use this stuff for getting wiki docids that match our doc's gigabits
//Query m_wq;
//SearchInput m_si;
//Msg40 m_msg40;
bool m_printedMenu;
//HashTableX m_clockCandidatesTable;
//SafeBuf m_cctbuf;
int32_t m_urlPubDate;
//int32_t m_urlAge;
char m_isUrlPermalinkFormat;
uint8_t m_summaryLangId;
int32_t m_tagPairHashVec[MAX_TAG_PAIR_HASHES];
@ -1306,7 +1048,6 @@ public:
int32_t m_postVec[POST_VECTOR_SIZE/4];
int32_t m_postVecSize;
float m_tagSimilarity;
float m_gigabitSimilarity;
float m_pageSimilarity;
float m_percentChanged;
bool m_unchanged;
@ -1330,17 +1071,6 @@ public:
Msg22 m_msg22d;
Msg22 m_msg22e;
Msg22 m_msg22f;
//int32_t m_collLen;
uint32_t m_gigabitVectorHash;
char m_gigabitQuery [XD_GQ_MAX_SIZE];
int32_t m_gigabitHashes [XD_MAX_GIGABIT_HASHES];
int32_t m_gigabitScores [XD_MAX_GIGABIT_HASHES];
char *m_gigabitPtrs [XD_MAX_GIGABIT_HASHES];
// for debug printing really
class GigabitInfo *m_top[100];
int32_t m_numTop;
//char m_metaDesc[1025];
//char m_metaKeywords[1025];
// these now reference directly into the html src so our
// WordPosInfo::m_wordPtr algo works in seo.cpp
char *m_metaDesc;
@ -1355,11 +1085,9 @@ public:
int32_t m_siteSpiderQuota;
//int32_t m_numBannedOutlinks;
class XmlDoc *m_oldDoc;
class XmlDoc *m_extraDoc;
class XmlDoc *m_rootDoc;
//class XmlDoc *m_gatewayDoc;
RdbList m_oldMetaList;
char *m_oldTitleRec;
int32_t m_oldTitleRecSize;
@ -1377,10 +1105,7 @@ public:
int32_t m_tagdbCollLen;
Url m_extraUrl;
//int32_t m_siteNumInlinksFresh;
//int32_t m_sitePop;
uint8_t m_siteNumInlinks8;
//int32_t m_siteNumInlinks;
LinkInfo m_siteLinkInfo;
SafeBuf m_mySiteLinkInfoBuf;
SafeBuf m_myPageLinkInfoBuf;
@ -1391,7 +1116,6 @@ public:
char m_useSiteLinkBuf;
char m_usePageLinkBuf;
char m_printInXml;
//Msg25 m_msg25;
SafeBuf m_tmpBuf11;
SafeBuf m_tmpBuf12;
Multicast m_mcast11;
@ -1399,7 +1123,6 @@ public:
// lists from cachedb for msg25's msg20 replies serialized
RdbList m_siteReplyList;
RdbList m_pageReplyList;
//void (* m_masterLoopWrapper) (void *state);
MsgC m_msgc;
bool m_isAllowed;
bool m_forwardDownloadRequest;
@ -1410,10 +1133,6 @@ public:
// for limiting # of iframe tag expansions
int32_t m_numExpansions;
char m_newOnly;
//int32_t m_tryAgainTimeDelta;
//int32_t m_sameIpWait;
//int32_t m_sameDomainWait;
//int32_t m_maxSpidersPerDomain;
char m_isWWWDup;
char m_calledMsg0b;
@ -1424,24 +1143,14 @@ public:
class RdbList *m_ulist;
void *m_hack;
class XmlDoc *m_hackxd;
//class LinkInfo *m_linkInfo1Ptr;
char *m_linkInfoColl;
//char m_injectedReply;
//int32_t m_minInlinkerHopCount;
//class LinkInfo *m_linkInfo2Ptr;
SiteGetter m_siteGetter;
int64_t m_siteHash64;
//char *m_site;
//int32_t m_siteLen;
//Url m_siteUrl;
int32_t m_siteHash32;
char *m_httpReply;
//char m_downloadAttempted;
char m_incrementedAttemptsCount;
char m_incrementedDownloadCount;
char m_redirectFlag;
//char m_isScraping;
//char m_throttleDownload;
char m_spamCheckDisabled;
char m_useRobotsTxt;
int32_t m_robotsTxtLen;
@ -1455,15 +1164,12 @@ public:
int32_t m_filteredContentMaxSize;
char m_calledThread;
int32_t m_errno;
//class CollectionRec *m_cr;
//int32_t m_utf8ContentAllocSize;
int32_t m_hostHash32a;
int32_t m_hostHash32b;
int32_t m_domHash32;
int32_t m_priorityQueueNum;
// this points into m_msge0 i guess
//class TagRec **m_outlinkTagRecVector;
Msge0 m_msge0;
// this points into m_msge1 i guess
@ -1729,8 +1435,6 @@ public:
char *m_wikiqbuf;
int32_t m_wikiqbufSize;
int64_t m_wikiDocIds [ MAX_WIKI_DOCIDS ];
rscore_t m_wikiScores [ MAX_WIKI_DOCIDS ];
bool m_registeredSleepCallback;
bool m_addedNegativeDoledbRec;
@ -1741,16 +1445,12 @@ public:
int32_t m_niceness;
bool m_usePosdb ;
//bool m_useDatedb ;
bool m_useClusterdb ;
bool m_useLinkdb ;
bool m_useSpiderdb ;
bool m_useTitledb ;
bool m_useTagdb ;
bool m_usePlacedb ;
//bool m_useTimedb ;
bool m_useSectiondb ;
//bool m_useRevdb ;
bool m_useSecondaryRdbs ;
int32_t m_linkeeQualityBoost;
@ -1762,10 +1462,7 @@ public:
bool m_storeTermListInfo;
char m_sortTermListBy;
SafeBuf m_sectiondbData;
//char *m_sectiondbData;
char *m_placedbData;
//int32_t m_sectiondbDataSize;
int32_t m_placedbDataSize;
// we now have HashInfo to replace this
@ -1861,6 +1558,8 @@ public:
void *finalState ,
void (* finalCallback)(void *));
void logQueryTiming(const char* function, int64_t startTime);
bool doInjectLoop ( );
void doneInjecting ( class XmlDoc *xd );
int32_t m_i;

@ -184,24 +184,13 @@ static bool storeTerm ( char *s ,
// we know the termlist is small, or the termlist is being used for spidering
// or parsing purposes and is usually not sent across the network.
bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
//if ( m_pbuf )
// m_pbuf->safePrintf("<h3>Terms which are immune to indexdb "
// "splitting:</h3>");
//if ( m_skipIndexing ) return true;
// this should be ready to go and not block!
int64_t *pch64 = getExactContentHash64();
//int64_t *pch64 = getLooseContentHash64();
if ( ! pch64 || pch64 == (void *)-1 ) { char *xx=NULL;*xx=0; }
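// ( "char *xx=NULL;*xx=0;" is this codebase's hard assert: a deliberate
// NULL write that segfaults on an impossible state so the core dump
// points straight at the violated invariant )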
// shortcut
Url *fu = getFirstUrl();
//BR 20160117: removed: if ( ! hashVectors ( tt ) ) return false;
// constructor should set to defaults automatically
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
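// a hedged sketch of the step this hunk leads into (the exact prefix is
// an assumption, it is not shown in this diff): the content hash gets
// indexed as a single no-split term, sharded by termid so the whole
// termlist stays on one host:
//
// char buf[32];
// int32_t blen = sprintf ( buf , "%"UINT64"" , (uint64_t)*pch64 );
// hi.m_tt = tt;
// hi.m_prefix = "gbcontenthash"; // assumed prefix, for illustration
// hi.m_shardByTermId = true; // assumed, per the no-split comment above
// if ( ! hashSingleTerm ( buf , blen , &hi ) ) return false;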
@ -1869,18 +1858,7 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
}
/////////////
//
// CHROME DETECTION
//
// we search for these terms we hash here in getSectionsWithDupStats()
// so we can remove chrome.
//
/////////////
// . returns false and sets g_errno on error
// . copied Url2.cpp into here basically, so we can now dump Url2.cpp
bool XmlDoc::hashSections ( HashTableX *tt ) {
// BR 20160106: No longer store xpath-hashes in posdb as we do not use them.
return true;
@ -2706,134 +2684,14 @@ bool XmlDoc::hashPermalink ( HashTableX *tt ) {
}
//hash the vectors; the tag pair, gigabit and sample vector hashing has
//been removed, so only the wikipedia docid terms below remain
bool XmlDoc::hashVectors ( HashTableX *tt ) {
setStatus ( "hashing vectors" );
int32_t blen;
char buf[32];
HashInfo hi;
hi.m_tt = tt;
hi.m_shardByTermId = true;
/*
BR 20160117 removed
int32_t score = *getSiteNumInlinks8() * 256;
if ( score <= 0 ) score = 1;
//char *field;
//char *descr;
//h = m_tagVector.getVectorHash();
uint32_t tph = *getTagPairHash32();
blen = sprintf(buf,"%"UINT32"", tph);
//field = "gbtagvector";
//descr = "tag vector hash";
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbtagvector";
hi.m_desc = "tag vector hash";
hi.m_shardByTermId = true;
// this returns false on failure
if ( ! hashString ( buf,blen, &hi ) ) return false;
*/
/*
BR 20160106 removed
uint32_t h = *getGigabitVectorScorelessHash();
blen = sprintf(buf,"%"UINT32"",(uint32_t)h);
// udpate hash parms
hi.m_prefix = "gbgigabitvector";
hi.m_desc = "gigabit vector hash";
// this returns false on failure
if ( ! hashString ( buf,blen,&hi) ) return false;
*/
// . dup checking uses the two hashes above, not this hash!!! MDW
// . i think this vector is just used to see if the page changed
// significantly since last spidering
// . it is used by getPercentChanged() and by Dates.cpp
// . sanity check
//if ( ! m_pageSampleVecValid ) { char *xx=NULL;*xx=0; }
//int32_t *pc = m_pageSampleVec;
//h = hash32((char *)m_pageSampleVec, SAMPLE_VECTOR_SIZE);
//blen = sprintf(buf,"%"UINT32"",(int32_t unsigned int)h);
//field = "gbsamplevector";
//descr = "sample vector hash";
// this returns false on failure
//if ( ! hashString ( tt,buf,blen,score,field,descr) )
// return false;
// . hash combined for Dup Detection
// . must match XmlDoc::getDupList ( );
//uint64_t h1 = m_tagVector.getVectorHash();
//uint64_t h2 = getGigabitVectorScorelessHash(gigabitVec);
//uint64_t h64 = hash64 ( h1 , h2 );
// take this out for now
/*
uint64_t *dh = getDupHash ( );
blen = sprintf(buf,"%"UINT64"", *dh );//h64);
//field = "gbduphash";
//descr = "dup vector hash";
// update hash parms
hi.m_prefix = "gbduphash";
hi.m_desc = "dup vector hash";
// this returns false on failure
if ( ! hashString ( buf,blen,&hi ) ) return false;
*/
// hash the wikipedia docids we match
if ( ! m_wikiDocIdsValid ) { char *xx=NULL;*xx=0; }
for ( int32_t i = 0 ; i < size_wikiDocIds/8 ; i++ ) {
blen = sprintf(buf,"%"UINT64"",ptr_wikiDocIds[i]);
// convert to int32_t
//int32_t convScore = (int32_t)ptr_wikiScores[i];
// get score
//uint32_t ws = score8to32 ( convScore );
// update hash parms
hi.m_prefix = "gbwikidocid";
hi.m_desc = "wiki docid";
hi.m_hashGroup = HASHGROUP_INTAG;
// this returns false on failure
if ( ! hashString ( buf,blen,&hi ) ) return false;
}
return true;
}
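// usage sketch for the terms indexed above (query form assumed from the
// "gbwikidocid" prefix): docs matching wikipedia page 1234567 come back
// from an ordinary fielded term query, e.g.
//
// Query q;
// q.set2 ( "gbwikidocid:1234567" , langUnknown , true );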
/*
BR 20160106 removed.
// hash gbhasthumbnail:0|1
bool XmlDoc::hashImageStuff ( HashTableX *tt ) {
setStatus ("hashing image stuff");
char *val = "0";
char **td = getThumbnailData();
if ( *td ) val = "1";
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbhasthumbnail";
hi.m_desc = "has a thumbnail";
// this returns false on failure
if ( ! hashString ( val,1,&hi ) ) return false;
return true;
}
*/
// returns false and sets g_errno on error
bool XmlDoc::hashIsAdult ( HashTableX *tt ) {
@ -3080,7 +2938,7 @@ bool XmlDoc::hashString3( char *s ,
return false;
if ( ! bits.set ( &words , version , niceness ) )
return false;
if ( ! phrases.set(&words,&bits,true,false,version,niceness ) )
if ( !phrases.set( &words, &bits, version, niceness ) )
return false;
// use primary langid of doc
@ -3348,15 +3206,15 @@ bool XmlDoc::hashWords3 ( //int32_t wordStart ,
// hashTitle we count all the words in the title
// towards the density rank even if they are
// in different sentences
if ( sx->m_flags & SEC_IN_TITLE )
//hashGroup = HASHGROUP_TITLE;
if ( sx->m_flags & SEC_IN_TITLE ) {
continue;
if ( sx->m_flags & SEC_IN_HEADER )
}
if ( sx->m_flags & SEC_IN_HEADER ) {
hashGroup = HASHGROUP_HEADING;
if ( sx->m_flags & ( SEC_MENU |
SEC_MENU_SENTENCE |
SEC_MENU_HEADER ) )
}
if ( sx->m_flags & ( SEC_MENU | SEC_MENU_SENTENCE | SEC_MENU_HEADER ) ) {
hashGroup = HASHGROUP_INMENU;
}
}
// this is for link text and meta tags mostly
@ -3381,10 +3239,6 @@ bool XmlDoc::hashWords3 ( //int32_t wordStart ,
// otherwise it will be the document's primary language.
char langId = langUnknown;
if ( m_wts && langVec ) langId = langVec[i];
// keep it as the original vector. i'm not sure we use
// this for anything but for display, so show the user
// how we made our calculation of the document's primary lang
//if ( langId == langUnknown ) langId = docLangId;
char wd;
if ( hi->m_useCountTable ) wd = wdv[i];
@ -3458,8 +3312,7 @@ bool XmlDoc::hashWords3 ( //int32_t wordStart ,
// if using posdb
key144_t k;
// if ( i == 11429 )
// log("foo");
g_posdb.makeKey ( &k ,
h ,
0LL,//docid
@ -3476,16 +3329,10 @@ bool XmlDoc::hashWords3 ( //int32_t wordStart ,
false , // delkey?
hi->m_shardByTermId );
// get the one we lost
// char *kstr = KEYSTR ( &k , sizeof(POSDBKEY) );
// if (!strcmp(kstr,"0x0ca3417544e400000000000032b96bf8aa01"))
// log("got lost key");
// key should NEVER collide since we are always incrementing
// the distance cursor, m_dist
dt->addTerm144 ( &k );
// add to wts for PageParser.cpp display
if ( wts ) {
if ( ! storeTerm ( wptrs[i],wlens[i],h,hi,i,
@ -3494,7 +3341,6 @@ bool XmlDoc::hashWords3 ( //int32_t wordStart ,
wd,//v[i],
ws,
hashGroup,
//false, // is phrase?
wbuf,
wts,
SOURCE_NONE, // synsrc
@ -3567,7 +3413,6 @@ skipsingleword:
////////
int64_t npid = pids2[i];
int32_t npw = 2;
uint64_t ph2 = 0;
// repeat for the two word hash if different!
@ -3599,7 +3444,7 @@ skipsingleword:
if ( wts && npid ) {
// get phrase as a string
int32_t plen;
char *phr=phrases->getPhrase(i,&plen,npw);
char *phr=phrases->getPhrase(i,&plen,2);
// store it
if ( ! storeTerm ( phr,plen,ph2,hi,i,
wposvec[i], // wordPos
@ -3647,190 +3492,12 @@ skipsingleword:
return false;
}
#ifdef SUPPORT_FACETS
//BR 20160108 - facets DISABLED AS TEST. Don't think we will use them.
//https://gigablast.com/syntax.html?c=main
#ifdef PRIVACORE_SAFE_VERSION
#error Oops? Do not enable SUPPORT_FACETS with PRIVACORE_SAFE_VERSION. Stores too much unused data in posdb.
#endif
// hash a single term so they can do gbfacet:ext or
// gbfacet:siterank or gbfacet:price. a field on a field.
if ( prefixHash && words->m_numWords )
{
// hash gbfacet:price with and store the price in the key
hashFacet1 ( hi->m_prefix, words ,hi->m_tt);//, hi );
}
#endif
// between calls? i.e. hashTitle() and hashBody()
//if ( wc > 0 ) m_dist = wposvec[wc-1] + 100;
if ( i > 0 ) m_dist = wposvec[i-1] + 100;
return true;
}
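// (note on the "+ 100" above: m_dist carries the word-position cursor
// across successive hashing passes, e.g. hashTitle() then hashBody(), and
// the gap presumably keeps terms from different passes from scoring as
// adjacent in proximity ranking)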
// just like hashNumber*() functions but we use "gbfacet" as the
// primary prefix, NOT gbminint, gbmin, gbmax, gbmaxint, gbsortby,
// gbsortbyint, gbrevsortby, gbrevsortbyint
bool XmlDoc::hashFacet1 ( char *term ,
Words *words ,
HashTableX *tt ) {
// need a prefix
//if ( ! hi->m_prefix ) return true;
// hash the ENTIRE content, all words as one blob
int32_t nw = words->getNumWords();
char *a = words->m_words[0];
char *b = words->m_words[nw-1]+words->m_wordLens[nw-1];
// hash the whole string as one value, the value of the facet
int32_t val32 = hash32 ( a , b - a );
if ( ! hashFacet2 ( "gbfacetstr",term, val32 , tt ) ) return false;
return true;
}
bool XmlDoc::hashFacet2 ( char *prefix,
char *term ,
int32_t val32 ,
HashTableX *tt ,
// we only use this for gbxpathsitehash terms:
bool shardByTermId ) {
// need a prefix
//if ( ! hi->m_prefix ) return true;
//int32_t plen = gbstrlen ( hi->m_prefix );
//if ( plen <= 0 ) return true;
// we gotta make this case insensitive, and skip spaces
// because if it is 'focal length' we can't search
// 'focal length:10' because that comes across as TWO terms.
//int64_t prefixHash =hash64Lower_utf8_nospaces ( hi->m_prefix,plen);
// now any field has to support gbfacet:thatfield
// and store the 32-bit termid into where we normally put
// the word position bits, etc.
//static int64_t s_facetPrefixHash = 0LL;
//if ( ! s_facetPrefixHash )
// s_facetPrefixHash = hash64n ( "gbfacet" );
// this is case-sensitive
int64_t prefixHash = hash64n ( prefix );
// term is like something like "object.price" or whatever.
// it is the json field itself, or the meta tag name, etc.
int64_t termId64 = hash64n ( term );
// combine with the "gbfacet" prefix. old prefix hash on right.
// like "price" on right and "gbfacetfloat" on left... see Query.cpp.
int64_t ph2 = hash64 ( termId64, prefixHash );
// . now store it
// . use field hash as the termid. normally this would just be
// a prefix hash
// . use mostly fake value otherwise
key144_t k;
g_posdb.makeKey ( &k ,
ph2 ,
0,//docid
0,// word pos #
0,// densityRank , // 0-15
0 , // MAXDIVERSITYRANK
0 , // wordSpamRank ,
0 , //siterank
0 , // hashGroup,
// we set to docLang final hash loop
//langUnknown, // langid
// unless already set. so set to english here
// so it will not be set to something else
// otherwise our floats would be ordered by langid!
// somehow we have to indicate that this is a float
// termlist so it will not be mangled any more.
//langEnglish,
langUnknown,
0 , // multiplier
false, // syn?
false , // delkey?
shardByTermId );
//int64_t final = hash64n("products.offerprice",0);
//int64_t prefix = hash64n("gbsortby",0);
//int64_t h64 = hash64 ( final , prefix);
//if ( ph2 == h64 )
// log("hey: got offer price");
// now set the float in that key
g_posdb.setInt ( &k , val32 );
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
// so that we can b-step into a posdb list and make sure
// we are aligned on a 6 byte or 12 byte key, since they come
// in both sizes. but for this, hack it off to tell
// addTable144() that we are a special posdb key, a "numeric"
// key that has a float stored in it. then it will NOT
// set the siterank and langid bits which throw our sorting
// off!!
g_posdb.setAlignmentBit ( &k , 0 );
HashTableX *dt = tt;//hi->m_tt;
// the key may indeed collide, but that's ok for this application
if ( ! dt->addTerm144 ( &k ) )
return false;
if ( ! m_wts )
return true;
bool isFloat = false;
if ( strcmp(prefix,"gbfacetfloat")==0 ) isFloat = true;
// store in buffer for display on pageparser.cpp output
char buf[130];
if ( isFloat )
snprintf(buf,128,"facetField=%s facetVal32=%f",term,
*(float *)&val32);
else
snprintf(buf,128,"facetField=%s facetVal32=%"UINT32"",
term,(uint32_t)val32);
int32_t bufLen = gbstrlen(buf);
// make a special hashinfo for this facet
HashInfo hi;
hi.m_tt = tt;
// the full prefix
char fullPrefix[66];
snprintf(fullPrefix,64,"%s:%s",prefix,term);
hi.m_prefix = fullPrefix;//"gbfacet";
// add to wts for PageParser.cpp display
// store it
if ( ! storeTerm ( buf,
bufLen,
ph2, // prefixHash, // s_facetPrefixHash,
&hi,
0, // word#, i,
0, // wordPos
0,// densityRank , // 0-15
0, // MAXDIVERSITYRANK,//phrase
0, // ws,
0, // hashGroup,
//true,
&m_wbuf,
m_wts,
// a hack for display in wts:
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
langUnknown ,
k) )
return false;
return true;
}
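// worked example of the composition above (the field name "ext" is just
// for illustration): indexing gbfacetstr:ext whose whole value hashed to
// val32:
//
// int64_t prefixHash = hash64n ( "gbfacetstr" );
// int64_t termId64 = hash64n ( "ext" );
// int64_t ph2 = hash64 ( termId64 , prefixHash ); // must match Query.cpp
// key144_t k;
// g_posdb.makeKey ( &k , ph2 , 0,0,0,0,0,0,0 , langUnknown , 0 ,
// false , false , false );
// g_posdb.setInt ( &k , val32 ); // facet value rides in the key
// g_posdb.setAlignmentBit ( &k , 0 ); // mark it as a "numeric" posdb key
// tt->addTerm144 ( &k );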
bool XmlDoc::hashFieldMatchTerm ( char *val , int32_t vlen , HashInfo *hi ) {
HashTableX *tt = hi->m_tt;
@ -4346,27 +4013,6 @@ char *XmlDoc::hashJSONFields2 ( HashTableX *table ,
}
}
//
// for deduping search results we set m_contentHash32 here for
// diffbot json objects.
// we can't do this here anymore, we have to set the
// contenthash in ::getContentHash32() because we need it to
// set EDOCUNCHANGED in ::getIndexCode() above.
//
/*
if ( hi->m_hashGroup != HASHGROUP_INURL ) {
// make the content hash so we can set m_contentHash32
// for deduping
int32_t nh32 = hash32n ( name );
// do an exact hash for now...
int32_t vh32 = hash32 ( val , vlen , m_niceness );
// accumulate, order independently
totalHash32 ^= nh32;
totalHash32 ^= vh32;
}
*/
// index like "title:whatever"
hi->m_prefix = name;
hashString ( val , vlen , hi );
@ -4384,24 +4030,8 @@ char *XmlDoc::hashJSONFields2 ( HashTableX *table ,
hi->m_prefix = NULL;
hashString ( val , vlen , hi );
/*
// a number? hash special then as well
if ( ji->m_type != JT_NUMBER ) continue;
// use prefix for this though
hi->m_prefix = name;
// hash as a number so we can sort search results by
// this number and do range constraints
float f = ji->m_valueDouble;
if ( ! hashNumberForSortingAsFloat ( f , hi ) )
return NULL;
*/
}
//m_contentHash32 = totalHash32;
//m_contentHash32Valid = true;
return (char *)0x01;
}

@ -8,10 +8,6 @@
//
#define PRIVACORE_SAFE_VERSION
// Facet support disabled by default to save space in posdb
#undef SUPPORT_FACETS
// fix on 64-bit architectures so sizeof(uint96_t) is 12, not 16!
//#pragma pack(0)
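// (how the two defines above interact, mirroring the SUPPORT_FACETS block
// in XmlDoc.cpp: facet hashing only compiles when SUPPORT_FACETS is
// defined, and defining it together with PRIVACORE_SAFE_VERSION trips the
// #error there, so the safe build can never bloat posdb with facet keys)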

396
main.cpp

@ -19,7 +19,6 @@
#include "Posdb.h"
#include "Datedb.h"
#include "Titledb.h"
#include "Revdb.h"
#include "Tagdb.h"
#include "Spider.h"
#include "SpiderColl.h"
@ -95,8 +94,6 @@ static void dumpTitledb (char *coll, int32_t sfn, int32_t numFiles, bool includ
int64_t docId , bool justPrintDups );
static int32_t dumpSpiderdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree,
char printStats , int32_t firstIp );
static void dumpSectiondb( char *coll,int32_t sfn,int32_t numFiles,bool includeTree);
static void dumpRevdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree);
static void dumpTagdb( char *coll, int32_t sfn, int32_t numFiles, bool includeTree, char rec = 0,
int32_t rdbId = RDB_TAGDB, char *site = NULL );
@ -653,16 +650,6 @@ int main2 ( int argc , char *argv[] ) {
"all events as if the time is UTCtimestamp.\n\n"
*/
/*
#ifdef _CLIENT_
//there was <hostId> in this command but it
// wasn't used in the program, so deleting it from
// here
"dump <V> [C [X [Y [Z]]]]\n\tdump a db in "
#else
*/
//"dump <db> <collection> [T]\n\tDump a db from disk. "
"dump <db> <collection>\n\tDump a db from disk. "
"Example: gb dump t main\n"
"\t<collection> is the name of the collection.\n"
@ -687,7 +674,6 @@ int main2 ( int argc , char *argv[] ) {
"\t<db> is W to dump tagdb for wget.\n"
"\t<db> is x to dump doledb.\n"
"\t<db> is w to dump waiting tree.\n"
"\t<db> is B to dump sectiondb.\n"
"\t<db> is C to dump catdb.\n"
"\t<db> is l to dump clusterdb.\n"
"\t<db> is z to dump statsdb all keys.\n"
@ -2239,10 +2225,6 @@ int main2 ( int argc , char *argv[] ) {
fprintf(stdout,"error dumping spiderdb\n");
}
}
else if ( argv[cmdarg+1][0] == 'B' )
dumpSectiondb(coll,startFileNum,numFiles,includeTree);
else if ( argv[cmdarg+1][0] == 'V' )
dumpRevdb(coll,startFileNum,numFiles,includeTree);
else if ( argv[cmdarg+1][0] == 'S' ) {
char *site = NULL;
if ( cmdarg+6 < argc ) {
@ -2638,61 +2620,16 @@ int main2 ( int argc , char *argv[] ) {
if ( ! g_linkdb.init() ) {
log("db: Linkdb init failed." ); return 1; }
// use sectiondb again for its immense voting power for detecting and
// removing web page chrome, categories, etc. only use if
// CollectionRec::m_isCustomCrawl perhaps to save space.
if ( ! g_sectiondb.init() ) {
log("db: Sectiondb init failed." ); return 1; }
// now clean the trees since all rdbs have loaded their rdb trees
// from disk, we need to remove bogus collection data from the trees
// like if a collection was deleted but the tree never saved right it'll
// still have the collection's data in it
if ( ! g_collectiondb.addRdbBaseToAllRdbsForEachCollRec ( ) ) {
log("db: Collectiondb init failed." ); return 1; }
// . now read in a little bit of each db and make sure the contained
// records belong in our group
// . only do this if we have more than one group
// . we may have records from other groups if we are scaling, but
// if we cannot find *any* records in our group we probably have
// the wrong data files.
//if ( ! checkDataParity() ) return 1;
//Load the high-frequency term shortcuts (if they exist)
g_hfts.load();
// init the vector cache
/*
if ( ! g_vectorCache.init ( g_conf.m_maxVectorCacheMem,
VECTOR_REC_SIZE-sizeof(key_t),
true,
g_conf.m_maxVectorCacheMem /
( sizeof(collnum_t) + 20 +
VECTOR_REC_SIZE ) ,
true,
"vector",
false,
12,
12 ) ) {
log("db: Vector Cache init failed." ); return 1; }
*/
// . gb gendbs
// . hostId should have already been picked up above, so it could be
// used to initialize all the rdbs
//if ( strcmp ( cmd , "gendbs" ) == 0 ) {
// char *coll = argv[cmdarg+1];
// // generate the dbs
// genDbs ( coll ); // coll
// g_log.m_disabled = true;
// return 0;
//}
//if ( strcmp ( cmd, "genclusterdb" ) == 0 ) {
// char *coll = argv[cmdarg+1];
// makeClusterdb ( coll );
// g_log.m_disabled = true;
// return 0;
//}
// test all collection dirs for write permission -- metalincs' request
int32_t pcount = 0;
for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
@ -2709,16 +2646,6 @@ int main2 ( int argc , char *argv[] ) {
checkDirPerms ( tt ) ;
}
// and now that all rdbs have loaded lets count the gbeventcount
// keys we have in datedb. those represent the # of events we
// have indexed.
//g_collectiondb.countEvents();
//if (!ucInit(g_hostdb.m_dir, true)) {
// log("Unicode initialization failed!");
// return 1;
//}
//
// NOTE: ANYTHING THAT USES THE PARSER SHOULD GO BELOW HERE, UCINIT!
//
@ -2728,20 +2655,6 @@ int main2 ( int argc , char *argv[] ) {
return 1;
}
// have to test after the unified dict is loaded because if a word is
// of unknown langid we try to get syns for it anyway if it has
// only one possible lang according to the unified dict
//if ( ! g_wiktionary.test2() ) return 1;
/*
if ( strcmp ( cmd, "gendaterange" ) == 0 ) {
char *coll = argv[cmdarg+1];
genDateRange ( coll );
g_log.m_disabled = true;
return 0;
}
*/
// Load the country code table
g_countryCode.loadHashTable();
int32_t nce = g_countryCode.getNumEntries();
@ -2765,64 +2678,6 @@ int main2 ( int argc , char *argv[] ) {
log("db: ResultsCache: %s",mstrerror(g_errno));
return 1;
}
/*
maxMem = 40000000;
int32_t maxNodes2 = maxMem/(8+8+50*(8+4+4));
if ( ! g_genericCache[SEORESULTS_CACHEID].init (
maxMem , // max cache mem
-1 , // fixedDataSize
false , // support lists of recs?
maxNodes2 , // max cache nodes
false , // use half keys?
"seoresults" , // filename
true)){ // save to disk?
log("db: ResultsCache: %s",mstrerror(g_errno));
return 1;
}
*/
/*
int32_t maxMem1 = g_conf.m_siteLinkInfoMaxCacheMem;
if ( ! g_genericCache[SITELINKINFO_CACHEID].init (
maxMem1 , // max cache mem
4 , // fixedDataSize
false , // support lists of recs?
maxMem1/36 , // max cache nodes
false , // use half keys?
"sitelinkinfo" , // filename
//g_conf.m_siteLinkInfoSaveCache ) ) {
true)){
log("db: SiteLinkInfoCache: %s",mstrerror(g_errno));
return 1;
}
int32_t maxMem2a = g_conf.m_siteQualityMaxCacheMem;
if ( ! g_genericCache[SITEQUALITY_CACHEID].init (
maxMem2a , // max cache mem
1 , // fixedDataSize
false , // support lists of recs?
maxMem2a/36 , // max cache nodes
false , // use half keys?
"sitequality" , // filename
//g_conf.m_siteQualitySaveCache ) ) {
true)) {
log("db: SiteQualityCache: %s",mstrerror(g_errno));
return 1;
}
*/
/*
int32_t maxMem2b = g_conf.m_siteQualityMaxCacheMem * .10 ;
if ( ! g_genericCacheSmallLocal[SITEQUALITY_CACHEID].init (
maxMem2b , // max cache mem
1 , // fixedDataSize
false , // support lists of recs?
maxMem2b/36 , // max cache nodes
false , // use half keys?
"sitequality" , // filename
//g_conf.m_siteQualitySaveCache ) ) {
false)) {
log("db: SiteQualityCacheSmallLocal: %s",mstrerror(g_errno));
return 1;
}
*/
// init minsitenuminlinks buffer
if ( ! g_tagdb.loadMinSiteInlinksBuffer() ) {
@ -7836,223 +7691,6 @@ void *startUp ( void *state , ThreadEntry *t ) {
return 0; //NULL;
}
void dumpSectiondb(char *coll,int32_t startFileNum,int32_t numFiles,
bool includeTree) {
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
g_sectiondb.init ();
//g_collectiondb.init(true);
g_sectiondb.getRdb()->addRdbBase1(coll );
key128_t startKey ;
key128_t endKey ;
startKey.setMin();
endKey.setMax();
// turn off threads
g_threads.disableThreads();
// get a meg at a time
int32_t minRecSizes = 1024*1024;
Msg5 msg5;
RdbList list;
char tmpBuf[1024];
SafeBuf sb(tmpBuf, 1024);
bool firstKey = true;
CollectionRec *cr = g_collectiondb.getRec(coll);
loop:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_SECTIONDB ,
cr->m_collnum ,
&list ,
(char *)&startKey ,
(char *)&endKey ,
minRecSizes ,
includeTree ,
false , // add to cache?
0 , // max cache age
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false )){// err correction?
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() ) return;
key128_t lastk;
// loop over entries in list
for(list.resetListPtr();!list.isExhausted(); list.skipCurrentRecord()){
char *rec = list.getCurrentRec();
key128_t *k = (key128_t *)rec;
char *data = list.getCurrentData();
int32_t size = list.getCurrentDataSize();
// is it a delete?
if ( (k->n0 & 0x01) == 0 ) {
printf("k.n1=%016"XINT64" k.n0=%016"XINT64" (delete)\n",
k->n1 , k->n0 | 0x01 ); // fix it!
continue;
}
if ( size != sizeof(SectionVote) ) { char *xx=NULL;*xx=0; }
// sanity check
if ( ! firstKey ) {
if ( k->n1 < lastk.n1 ) { char *xx=NULL;*xx=0; }
if ( k->n1 == lastk.n1 && k->n0 < lastk.n0 ) {
char *xx=NULL;*xx=0; }
}
// no longer a first key
firstKey = false;
// copy it
gbmemcpy ( &lastk , k , sizeof(key128_t) );
int32_t shardNum = getShardNum (RDB_SECTIONDB,k);
//int32_t groupNum = g_hostdb.getGroupNum ( gid );
// point to the data
char *p = data;
char *pend = data + size;
// breach check
if ( p >= pend ) {
printf("corrupt sectiondb rec k.n0=%"UINT64"",k->n0);
continue;
}
// parse it up
SectionVote *sv = (SectionVote *)data;
int64_t termId = g_datedb.getTermId ( k );
// score is the section type
unsigned char score2 = g_datedb.getScore(k);
char *stype = "unknown";
if ( score2 == SV_CLOCK ) stype = "clock ";
if ( score2 == SV_EURDATEFMT ) stype = "eurdatefmt ";
if ( score2 == SV_EVENT ) stype = "event ";
if ( score2 == SV_ADDRESS ) stype = "address ";
if ( score2 == SV_TAGPAIRHASH ) stype = "tagpairhash ";
if ( score2 == SV_TAGCONTENTHASH ) stype = "tagcontenthash";
if ( score2 == SV_FUTURE_DATE ) stype = "futuredate ";
if ( score2 == SV_PAST_DATE ) stype = "pastdate ";
if ( score2 == SV_CURRENT_DATE ) stype = "currentdate ";
if ( score2 == SV_SITE_VOTER ) stype = "sitevoter ";
if ( score2 == SV_TURKTAGHASH ) stype = "turktaghash ";
int64_t d = g_datedb.getDocId(k);
int32_t date = g_datedb.getDate(k);
// dump it
printf("k=%s "
"sh48=%"XINT64" " // sitehash is the termid
"date=%010"UINT32" "
"%s (%"UINT32") "
"d=%012"UINT64" "
"score=%f samples=%f "
"shardnum=%"INT32""
"\n",
//k->n1,
//k->n0,
KEYSTR(k,sizeof(key128_t)),
termId,
date,
stype,(uint32_t)score2,
d,
sv->m_score,
sv->m_numSampled,
shardNum);
}
startKey = *(key128_t *)list.getLastKey();
startKey += (uint32_t) 1;
// watch out for wrap around
if ( startKey < *(key128_t *)list.getLastKey() ){ printf("\n"); return;}
goto loop;
}
void dumpRevdb(char *coll,int32_t startFileNum,int32_t numFiles, bool includeTree) {
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
g_revdb.init ();
//g_collectiondb.init(true);
g_revdb.getRdb()->addRdbBase1(coll );
key_t startKey ;
key_t endKey ;
startKey.setMin();
endKey.setMax();
// turn off threads
g_threads.disableThreads();
// get a meg at a time
int32_t minRecSizes = 1024*1024;
Msg5 msg5;
RdbList list;
char tmpBuf[1024];
SafeBuf sb(tmpBuf, 1024);
bool firstKey = true;
CollectionRec *cr = g_collectiondb.getRec(coll);
loop:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_REVDB ,
cr->m_collnum ,
&list ,
(char *)&startKey ,
(char *)&endKey ,
minRecSizes ,
includeTree ,
false , // add to cache?
0 , // max cache age
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false )){// err correction?
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() ) return;
key_t lastk;
// loop over entries in list
for(list.resetListPtr();!list.isExhausted(); list.skipCurrentRecord()){
char *rec = list.getCurrentRec();
key_t *k = (key_t *)rec;
char *data = list.getCurrentData();
int32_t size = list.getCurrentDataSize();
// get docid from key
int64_t d = g_revdb.getDocId(k);
// is it a delete?
if ( (k->n0 & 0x01) == 0 ) {
printf("k.n1=%08"XINT32" k.n0=%016"XINT64" d=%"UINT64" (delete)\n",
k->n1 , k->n0 | 0x01 , d ); // fix it!
continue;
}
//if ( size != sizeof(SectionVote) ) { char *xx=NULL;*xx=0; }
// sanity check
if ( ! firstKey ) {
if ( k->n1 < lastk.n1 ) { char *xx=NULL;*xx=0; }
if ( k->n1 == lastk.n1 && k->n0 < lastk.n0 ) {
char *xx=NULL;*xx=0; }
}
// no longer a first key
firstKey = false;
// copy it
gbmemcpy ( &lastk , k , sizeof(key_t) );
// point to the data
char *p = data;
char *pend = data + size;
// breach check
if ( p > pend ) {
printf("corrupt revdb rec k.n1=0x%08"XINT32" d=%"UINT64"\n",
k->n1,d);
continue;
}
// parse it up
//SectionVote *sv = (SectionVote *)data;
// dump it
printf("k.n1=%08"XINT32" k.n0=%016"XINT64" ds=%06"INT32" d=%"UINT64"\n",
k->n1,k->n0,size,d);
}
startKey = *(key_t *)list.getLastKey();
startKey += (uint32_t) 1;
// watch out for wrap around
if ( startKey < *(key_t *)list.getLastKey() ){ printf("\n"); return;}
goto loop;
}
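// (both dump loops above page through their rdb with the same idiom: pull
// up to minRecSizes bytes via Msg5, print the list, set startKey to
// lastKey + 1 and loop; the "startKey < lastKey" re-check catches key
// wrap-around at the very end of the keyspace and bails out instead of
// looping forever)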
void dumpTagdb( char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree, char req, int32_t rdbId,
char *siteArg ) {
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
@ -8473,13 +8111,11 @@ bool parseTest ( char *coll , int64_t docId , char *query ) {
// computeWordIds from xml
words.set ( &xml , true , true ) ;
bits.set ( &words ,TITLEREC_CURRENT_VERSION, 0);
Phrases phrases;
phrases.set ( &words,&bits,true,true,TITLEREC_CURRENT_VERSION,0);
t = gettimeofdayInMilliseconds_force();
for ( int32_t i = 0 ; i < 100 ; i++ )
//if ( ! words.set ( &xml , true , true ) )
// do not supply xd so it will be set from scratch
if ( !sections.set( &words, &phrases, &bits, NULL, 0, NULL, 0, 0 ) )
if ( !sections.set( &words, &bits, NULL, 0, NULL, 0, 0 ) )
return log("build: speedtestxml: sections set: %s",
mstrerror(g_errno));
@ -8493,14 +8129,10 @@ bool parseTest ( char *coll , int64_t docId , char *query ) {
//Phrases phrases;
Phrases phrases;
t = gettimeofdayInMilliseconds_force();
for ( int32_t i = 0 ; i < 100 ; i++ )
if ( ! phrases.set ( &words ,
&bits ,
true , // use stop words
false , // use stems
TITLEREC_CURRENT_VERSION ,
0 ) ) // niceness
for ( int32_t i = 0 ; i < 100 ; i++ )
if ( !phrases.set( &words, &bits, TITLEREC_CURRENT_VERSION, 0 ) )
return log("build: speedtestxml: Phrases set: %s",
mstrerror(g_errno));
// print time it took
@ -8597,22 +8229,6 @@ bool summaryTest1 ( char *rec , int32_t listSize, char *coll , int64_t docId ,
xml.set( content, contentLen, xd.m_version, 0, CT_HTML );
xd.getSummary();
//Summary s;
// bool status;
/*
status = s.set ( &xml ,
&q ,
NULL , // termFreqs
false , // doStemming?
summaryMaxLen ,
numSummaryLines ,
summaryMaxNumCharsPerLine ,
bigSampleRadius ,
bigSampleMaxLen ,
ratInSummary ,
&tr );
*/
}
// print time it took
@ -8641,8 +8257,6 @@ bool summaryTest2 ( char *rec , int32_t listSize, char *coll , int64_t docId ,
int32_t numSummaryLines = cr->m_summaryMaxNumLines;
int32_t summaryMaxNumCharsPerLine = cr->m_summaryMaxNumCharsPerLine;
// these are arbitrary (taken from Msg24.cpp)
int32_t bigSampleRadius = 100;
int32_t bigSampleMaxLen = 4000;
bool ratInSummary = false;
Query q;
@ -8731,8 +8345,6 @@ bool summaryTest2 ( char *rec , int32_t listSize, char *coll , int64_t docId ,
summaryMaxLen ,
numSummaryLines ,
summaryMaxNumCharsPerLine ,
bigSampleRadius ,
bigSampleMaxLen ,
ratInSummary ,
&tr );
// time it

22
qa.cpp

@ -745,8 +745,7 @@ bool qainject1 ( ) {
if ( ! s_flags[16] ) {
s_flags[16] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe"
"&dsrt=500",
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
702467314 ) )
return false;
}
@ -1573,8 +1572,7 @@ bool qaWarcFiles ( ) {
}
if ( s_flags[EXAMINE_RESULTS1] == 0) {
s_flags[EXAMINE_RESULTS1]++;
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe"
"&dsrt=500",
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
702467314 ) )
return false;
}
@ -1596,8 +1594,7 @@ bool qaWarcFiles ( ) {
if ( s_flags[EXAMINE_RESULTS2] == 0) {
s_flags[EXAMINE_RESULTS2]++;
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe"
"&dsrt=500",
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
702467314 ) )
return false;
}
@ -1790,14 +1787,6 @@ bool qaMetadataFacetSearch ( ) {
return false;
}
// if ( ! s_flags[EXAMINE_RESULTS] ) {
// s_flags[16] = true;
// if ( ! getUrl ( "/search?c=qatest123&qa=1&q=%2Bthe"
// "&dsrt=500",
// 702467314 ) )
// return false;
// }
return true;
}
@ -1876,8 +1865,7 @@ bool qaimport () {
// test query
if ( ! s_flags[16] ) {
s_flags[16] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe"
"&dsrt=500",
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
702467314 ) )
return false;
}
@ -1887,7 +1875,7 @@ bool qaimport () {
if ( ! s_flags[29] ) {
s_flags[29] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
"q=mediapost&dsrt=0&sc=1",
"q=mediapost&sc=1",
702467314 ) )
return false;
}

@ -16,7 +16,10 @@ def verify_file(gb_api, httpserver, filename, custom_filename, content_type, exp
# add url
assert gb_api.add_url(file_url) == True
result = gb_api.search('url:' + file_url)
payload = {}
payload.update({'showerrors': '1'})
result = gb_api.search('url:' + file_url, payload)
assert len(result['results']) == 1
assert result['results'][0]['contentType'] == expected_content_type

@ -28,14 +28,11 @@ static void generateSummary(Summary &summary, char *htmlInput, char *queryStr, c
Bits bits;
ASSERT_TRUE(bits.set(&words, TITLEREC_CURRENT_VERSION, 0));
Phrases phrases;
ASSERT_TRUE(phrases.set(&words, &bits, true, false, TITLEREC_CURRENT_VERSION, 0));
Url url;
url.set(urlStr);
Sections sections;
ASSERT_TRUE(sections.set(&words, &phrases, &bits, &url, 0, "", 0, CT_HTML));
ASSERT_TRUE(sections.set(&words, &bits, &url, 0, "", 0, CT_HTML));
Query query;
ASSERT_TRUE(query.set2(queryStr, langEnglish, true));
@ -53,6 +50,9 @@ static void generateSummary(Summary &summary, char *htmlInput, char *queryStr, c
Bits bitsForSummary;
ASSERT_TRUE(bitsForSummary.setForSummary(&words));
Phrases phrases;
ASSERT_TRUE(phrases.set(&words, &bits, TITLEREC_CURRENT_VERSION, 0));
Matches matches;
matches.setQuery(&query);
ASSERT_TRUE(matches.set(&words, &phrases, &sections, &bitsForSummary, &pos, &xml, &title, &url, &linkInfo, 0));