#include "gb-include.h"
|
|
|
|
#include "Msg51.h"
|
|
//#include "Msg24.h"
|
|
#include "Query.h"
|
|
#include "Msg20.h"
|
|
//#include "TermTable.h"
|
|
#include "Words.h"
|
|
#include "Speller.h"
|
|
#include <math.h>
|
|
#include "StopWords.h"
|
|
#include "HashTable.h"
|
|
#include "Clusterdb.h"
|
|
#include "Scores.h"
|
|
#include "Stats.h"
|
|
#include "Words.h"

// here's the knobs:

// sample radius in chars around each query term : 600  (line 212)
// max sample size, all excerpts, per document   : 100k (line 213)
// map from distance to query term in words to score:   (line 855)
// map from popularity to score weight :                (lines 950 et al)
// the comments above are way out of date (aac, Jan 2008)
//
// QPOP multiplier params
#define QPOP_ZONE_0 10
#define QPOP_ZONE_1 30
#define QPOP_ZONE_2 80
#define QPOP_ZONE_3 100
#define QPOP_ZONE_4 300
#define QPOP_MULT_0 10
#define QPOP_MULT_1 8
#define QPOP_MULT_2 6
#define QPOP_MULT_3 4
#define QPOP_MULT_4 2

// QTR scoring params
#define MAX_SCORE_MULTIPLIER 3000  // orig: 3000
#define ALT_MAX_SCORE       12000  // orig: 12000
#define ALT_START_SCORE      1000
#define QTR_ZONE_0  4
#define QTR_ZONE_1  8
#define QTR_ZONE_2 12
#define QTR_ZONE_3 20
#define QTR_BONUS_0 1000
#define QTR_BONUS_1  800
#define QTR_BONUS_2  500
#define QTR_BONUS_3  200
#define QTR_BONUS_CW   1
#define MULTIPLE_HIT_BOOST 1000    // orig: 1000

// gigabit phrase scoring params
#define SPARSE_MARK    0.34
#define SPARSE_PENALTY 1000
#define FWC_PENALTY     500 // penalty for beginning with a common word
#define POP_ZONE_0 0.00001
#define POP_ZONE_1 0.0001
#define POP_ZONE_2 0.001
#define POP_ZONE_3 0.01
#define POP_BOOST_0 3.0
#define POP_BOOST_1 1.5
#define POP_BOOST_2 1.0
#define POP_BOOST_3 0.3
#define POP_BOOST_4 0.1
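
// A sketch of how the tables above are presumably applied (the scoring
// code that reads them is not in this file): the *_ZONE_* defines look
// like ascending thresholds and the parallel *_MULT_*/*_BOOST_* defines
// the multiplier for each zone. E.g. a term whose popularity fraction
// falls below POP_ZONE_0 would presumably get the POP_BOOST_0 (3.0)
// weight and one above POP_ZONE_3 the POP_BOOST_4 (0.1) weight, so
// rare terms are boosted and very common ones are damped.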

//static bool onSamePages ( int32_t i , int32_t j , int32_t *slots ,
//			    int32_t *heads , int32_t *pages );

static void handleRequest24 ( UdpSlot *slot , int32_t netnice ) ;

static void setRepeatScores ( char    *repeatScores ,
			      int64_t *wids ,
			      int32_t  nw ,
			      char    *repeatTable ,
			      int32_t  repeatTableNumSlots ,
			      Words   *words ) ;

Msg24::Msg24 ( ) {
	m_numTopics = 0;
	m_request   = NULL;
	m_reply     = NULL;

	m_topicPtrs      = NULL;
	m_topicLens      = NULL;
	m_topicScores    = NULL;
	m_topicGids      = NULL;
	m_topicPops      = NULL;
	m_topicDocIds    = NULL;
	m_topicNumDocIds = NULL;
	m_isUnicode      = false;
}

Msg24::~Msg24 ( ) { reset(); }

void Msg24::reset ( ) {
	if ( m_request && m_request != m_requestBuf )
		mfree ( m_request , m_requestSize , "Msg24" );
	m_request = NULL;
	// free reply if we should
	if ( m_reply ) mfree ( m_reply , m_replySize , "Msg24" );
	m_reply = NULL;
	m_isUnicode = false;
}

bool Msg24::registerHandler ( ) {
	// . register ourselves with the udp server
	// . it calls our callback when it receives a msg of type 0x24
	if ( ! g_udpServer.registerHandler ( 0x24, handleRequest24 ))
		return false;
	return true;
}

static void gotReplyWrapper24 ( void *state1 , void *state2 ) ;

bool Msg24::generateTopics ( char       *coll ,
			     int32_t     collLen ,
			     char       *query ,
			     int32_t     queryLen ,
			     //float     termFreqWeights ,
			     //float     phraseAffWeights ,
			     int64_t    *docIds ,
			     char       *clusterLevels ,
			     int32_t     numDocIds ,
			     TopicGroup *topicGroups ,
			     int32_t     numTopicGroups ,
			     //int32_t   docsToScanForTopics ,
			     //int32_t   minTopicScore ,
			     //int32_t   maxTopics ,
			     //int32_t   maxWordsPerPhrase ,
			     int32_t     maxCacheAge ,
			     bool        addToCache ,
			     bool        returnDocIdCount ,
			     bool        returnDocIds ,
			     bool        returnPops ,
			     void       *state ,
			     void      (* callback) (void *state ),
			     int32_t     niceness ) {
	// force it to be true, since hi bit is set in pops if topic is unicode
	returnPops = true;
	// warning
	if ( ! coll ) log(LOG_LOGIC,"net: NULL collection. msg24.");
	// force it
	returnDocIdCount = true;
	// if we don't get docids, then deserialize doesn't work because it
	// expects the docids to be valid.
	returnDocIds = true;
	// reset
	m_numTopics = 0;
	//m_docsToScanForTopics = docsToScanForTopics;
	//m_minTopicScore       = minTopicScore;
	//m_maxTopics           = maxTopics;
	m_numDocIds        = numDocIds;
	m_coll             = coll;
	m_collLen          = collLen;
	m_returnDocIdCount = returnDocIdCount;
	m_returnDocIds     = returnDocIds;
	m_returnPops       = returnPops;
	// bail if no operations to do
	if ( numTopicGroups <= 0 ) return true;
	if ( numDocIds      <= 0 ) return true;

	int32_t numTopicsToGen = topicGroups->m_numTopics;
	// get the min we have to scan
	int32_t docsToScanForTopics = topicGroups[0].m_docsToScanForTopics;

	for ( int32_t i = 1 ; i < numTopicGroups ; i++ ) {
		int32_t x = topicGroups[i].m_docsToScanForTopics ;
		if ( x > docsToScanForTopics ) docsToScanForTopics = x;

		if ( topicGroups[i].m_numTopics > numTopicsToGen )
			numTopicsToGen = topicGroups[i].m_numTopics;
	}
	// bail if none
	if ( docsToScanForTopics <= 0 ) return true;
	if ( numTopicsToGen      == 0 ) return true;

	m_state    = state;
	m_callback = callback;

	m_startTime = gettimeofdayInMilliseconds();

	// save, caller should not delete this!
	m_topicGroups    = topicGroups;
	m_numTopicGroups = numTopicGroups;
	// truncate
	//if ( maxTopics > MAX_TOPICS ) maxTopics = MAX_TOPICS;
	// truncate
	//if ( numDocIds > MAX_DOCIDS_TO_SCAN )
	//	numDocIds = MAX_DOCIDS_TO_SCAN ;
	if ( numDocIds > docsToScanForTopics )
		numDocIds = docsToScanForTopics ;

	int32_t size = sizeof(TopicGroup) * numTopicGroups ;
	if ( queryLen > MAX_QUERY_LEN ) queryLen = MAX_QUERY_LEN;

	// how much space do we need?
	int32_t need = 4+4+4+size+
		queryLen+1+
		numDocIds*8 +
		numDocIds +collLen+1 + sizeof(niceness);
	m_requestSize = need;

	// make enough room for the request
	if ( need < MSG24_REQUEST_SIZE ) m_request = m_requestBuf;
	else {
		m_request = (char *)mmalloc ( need , "Msg24a" );
		if ( ! m_request ) {
			log("topics: Failed to allocate %"INT32" bytes.",need);
			return true;
		}
	}

	char *p = m_request;
	// store the cache parms
	*(int32_t *)p = maxCacheAge      ; p += 4;
	*(char    *)p = addToCache       ; p += 1;
	*(char    *)p = returnDocIdCount ; p += 1;
	*(char    *)p = returnDocIds     ; p += 1;
	*(char    *)p = returnPops       ; p += 1;
	*(int32_t *)p = niceness         ; p += sizeof(int32_t);
	// store minTopicScore
	//*(int32_t *)p = minTopicScore     ; p += 4;
	//*(int32_t *)p = maxTopics         ; p += 4;
	//*(int32_t *)p = maxWordsPerPhrase ; p += 4;
	// store topic group information
	*(int32_t *)p = numTopicGroups; p += 4;
	gbmemcpy ( p , topicGroups , size ); p += size;
	// then coll
	gbmemcpy ( p , coll , collLen ); p += collLen ;
	*p++ = '\0';
	// then query
	gbmemcpy ( p , query , queryLen ); p += queryLen;
	*p++ = '\0';
	// then docids
	gbmemcpy ( p , docIds , numDocIds * 8 ); p += numDocIds * 8;
	// then cluster levels
	gbmemcpy ( p , clusterLevels , numDocIds ); p += numDocIds ;
	// how big is it?
	//m_requestSize = p - m_request;
	// sanity check
	//if ( m_requestSize > 5+MAX_QUERY_LEN + 1 + MAX_DOCIDS_TO_SCAN * 9){
	//	char *xx = NULL ; *xx = 0; }
	if ( p - m_request != m_requestSize ) {
		log("Bad msg24 request size");
		char *xx = NULL ; *xx = 0;
	}
	// . the groupId to handle... just pick randomly
	int32_t groupId = ((uint32_t)docIds[0]) & g_hostdb.m_groupMask;
	// . returns false and sets g_errno on error
	// . reply should be stored in UdpSlot::m_tmpBuf
	if ( ! m_mcast.send ( m_request ,
			      m_requestSize ,
			      0x24               , // msgType 0x24
			      false              , // does m_mcast own m_request?
			      groupId            , // send to group (groupKey)
			      false              , // send to whole group?
			      (int32_t)docIds[0] , // key is lower bits of docId
			      this               , // state data
			      NULL               , // state data
			      gotReplyWrapper24  ,
			      30                 , // 30 second timeout
			      niceness           , // niceness
			      false              , // realtime?
			      -1                 , // first hostid
			      NULL               , //m_reply, store reply in here
			      0                  , //MAX_REPLY_LEN, how big it can be
			      false              , // free reply buf?
			      false              , // do disk load balancing?
			      0                  , // maxCacheAge
			      (key_t)0           , // cacheKey
			      RDB_NONE           , // TITLEDB // rdbId of titledb
			      0                  ) ){ // minRecSizes avg
		log("topics: Had error sending request for topics to host in "
		    "group #%"INT32": %s.",groupId,mstrerror(g_errno));
		return true;
	}
	// otherwise, we blocked and gotReplyWrapper will be called
	return false;
}
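
// For reference, the serialized 0x24 request built above is laid out as:
//   int32_t maxCacheAge , char addToCache , char returnDocIdCount ,
//   char returnDocIds , char returnPops , int32_t niceness ,
//   int32_t numTopicGroups , TopicGroup[numTopicGroups] ,
//   coll '\0' , query '\0' (at most MAX_QUERY_LEN bytes) ,
//   int64_t docIds[numDocIds] , char clusterLevels[numDocIds]
// handleRequest24() below recovers numDocIds from the trailing bytes as
// (requestEnd - p) / 9, since each docid contributes 8 bytes plus one
// cluster level byte.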

void gotReplyWrapper24 ( void *state1 , void *state2 ) {
	Msg24 *THIS = (Msg24 *)state1;
	THIS->gotReply();
	THIS->m_callback ( THIS->m_state );
}

void Msg24::gotReply ( ) {
	// bail on error, multicast will free the reply buffer if it should
	if ( g_errno ) {
		log("topics: Had error getting topics: %s.",
		    mstrerror(g_errno));
		return;
	}
	// get the reply
	int32_t maxSize ;
	bool    freeIt  ;
	m_reply = m_mcast.getBestReply (&m_replySize, &maxSize, &freeIt);
	relabel( m_reply, m_replySize, "Msg24-GBR" );
	// sanity check
	//if ( reply != m_reply ) { char *xx = NULL ; *xx = 0 ; }
	// . parse the reply, it should be our m_reply buffer
	// . topics are NULL terminated
	deserialize ( m_reply , m_replySize );

	int64_t now = gettimeofdayInMilliseconds();
	g_stats.addStat_r ( 0 ,
			    m_startTime ,
			    now ,
			    "get_gigabits" ,
			    0x00d1e1ff ,
			    STAT_QUERY );
	/*
	int32_t i = 0;
	while ( p < pend && i < MAX_TOPICS ) {
		m_topicScores[i] = *(int32_t *)p ; p += 4;
		m_topicLens  [i] = *(int32_t *)p ; p += 4;
		m_topicGids  [i] = *(char    *)p ; p += 1;
		m_topicPtrs  [i] = p ; p += m_topicLens[i] + 1;
		i++;
	}
	m_numTopics = i;
	*/
}

// if this is too big we can run out of sockets to use to launch
#define MAX_OUTSTANDING 50
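
// Note: each reply that arrives via gotSampleWrapper() below bumps
// m_numReplies and calls launchMsg20s() again, so the number of Msg20
// requests in flight stays capped at MAX_OUTSTANDING until all docids
// have been requested.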

State24::State24 ( ) {
	m_msg20  = NULL;
	m_mem    = NULL;
	m_memPtr = NULL;
	m_memEnd = NULL;
}

State24::~State24 ( ) {
	// note: if m_msg20 is the static m_buf20 we return before freeing
	// m_mem below; this appears to rely on m_mem being freed elsewhere
	// (getTopics()'s error paths do free it)
	if ( m_msg20 == m_buf20 ) return;
	for ( int32_t i = 0 ; i < m_numDocIds ; i++ ) m_msg20[i].destructor();
	mfree ( m_msg20 , sizeof(Msg20) * m_numDocIds , "Msg24" );
	m_msg20 = NULL;
	if ( m_mem ) {
		mfree ( m_mem, m_memEnd - m_mem, "Msg24" );
		m_mem    = NULL;
		m_memEnd = NULL;
		m_memPtr = NULL;
	}
}

static void launchMsg20s     ( State24 *st, bool callsample, int32_t sampleSize );
static void gotSampleWrapper ( void *state ) ;

void handleRequest24 ( UdpSlot *slot , int32_t netnice ) {
	// if niceness is 0, use the higher priority udpServer
	UdpServer *us = &g_udpServer;
	//if ( niceness == 0 ) us = &g_udpServer2;
	// make the state
	State24 *st ;
	try { st = new (State24); }
	catch ( ... ) {
		g_errno = ENOMEM;
		log("topics: Could not allocate %i bytes for generating "
		    "topics. Replying with error.",(int)sizeof(State24));
		us->sendErrorReply ( slot , EBADREQUESTSIZE );
		return;
	}
	mnew ( st , sizeof(State24) , "Msg24b" );
	// get the request
	char   *request     = slot->m_readBuf;
	int32_t requestSize = slot->m_readBufSize;
	char   *requestEnd  = request + requestSize;
	// parse the request
	char *p = request;
	// get cache parms
	//int32_t maxCacheAge = *(int32_t *)p ; p += 4;
	//char    addToCache  = *(char    *)p ; p += 1;
	st->m_maxCacheAge      = *(int32_t *)p ; p += 4;
	st->m_addToCache       = *(char    *)p ; p += 1;
	st->m_returnDocIdCount = *(char    *)p ; p += 1;
	st->m_returnDocIds     = *(char    *)p ; p += 1;
	st->m_returnPops       = *(char    *)p ; p += 1;
	st->m_niceness         = *(int32_t *)p ; p += sizeof(int32_t);
	// first is minTopicScore
	//int32_t minTopicScore = *(int32_t *)p ; p += 4;
	// until we roll to all hosts, let's keep the protocol standard
	//int32_t maxTopics         = *(int32_t *)p ; p += 4;
	//int32_t maxWordsPerPhrase = *(int32_t *)p ; p += 4;
	//int32_t maxTopics         = 100;
	//int32_t maxWordsPerPhrase = 6;
	//st->m_minTopicScore     = minTopicScore;
	//st->m_maxTopics         = maxTopics;
	//st->m_maxWordsPerPhrase = maxWordsPerPhrase;
	// get topic group information
	st->m_numTopicGroups = *(int32_t *)p ; p += 4;
	int32_t size = sizeof(TopicGroup) * st->m_numTopicGroups ;
	gbmemcpy ( st->m_topicGroups , p , size ); p += size;
	// then coll
	st->m_coll = p; p += strlen(p) + 1;
	// . then the query, a NULL terminated string
	// . store it in state
	int32_t qlen = strlen ( p );
	if ( qlen > MAX_QUERY_LEN ) qlen = MAX_QUERY_LEN;
	gbmemcpy ( st->m_query , p , qlen );
	st->m_query [ qlen ] = '\0';
	st->m_queryLen = qlen;
	p += qlen + 1;
	// then the docids
	//int64_t *docIds   = (int64_t *)p;
	//int32_t  numDocIds = (requestEnd - p) / 9;
	//p += numDocIds * 8;
	// cluster levels
	//char *clusterLevels = p;
	st->m_docIds    = (int64_t *)p;
	st->m_numDocIds = (requestEnd - p) / 9;
	p += st->m_numDocIds * 8;
	// cluster levels
	st->m_clusterLevels = p;

	// truncate
	//if ( numDocIds > MAX_DOCIDS_TO_SCAN )
	//	numDocIds = MAX_DOCIDS_TO_SCAN ;
	// see if anyone blocks at all
	//bool noBlock = true;
	// we haven't got any responses as of yet or sent any requests
	st->m_slot = slot;
	//st->m_niceness = 0; // niceness;
	st->m_numReplies  = 0;
	st->m_numRequests = 0;

	// allocate enough msg20s
	if ( st->m_numDocIds <= 50 )
		st->m_msg20 = st->m_buf20;
	else {
		st->m_msg20 = (Msg20 *)mmalloc ( sizeof(Msg20) *
						 st->m_numDocIds , "Msg24c" );
		if ( ! st->m_msg20 ) {
			log("Msg24: alloc of msg20s for %"INT32" bytes failed",
			    (int32_t)sizeof(Msg20)*st->m_numDocIds);
			// prevent a core dump in Msg24::~Msg24
			st->m_numDocIds = 0;
			mdelete ( st , sizeof(State24) , "Msg24" );
			delete ( st );
			us->sendErrorReply ( slot , g_errno );
			return;
		}
		for ( int32_t i = 0 ; i < st->m_numDocIds ; i++ )
			st->m_msg20[i].constructor();
	}

	// set query if need be
	//Query qq;
	st->m_qq.set ( st->m_query , st->m_queryLen , NULL , 0 , 2 , true );
	// make a display metas string to get content for our TopicGroups
	//char dbuf[1024];
	p = st->m_dbuf;
	char *pend = st->m_dbuf + 1024;
	for ( int32_t i = 0 ; i < st->m_numTopicGroups ; i++ ) {
		TopicGroup *t = &st->m_topicGroups [ i ];
		int32_t tlen = strlen ( t->m_meta );
		if ( p + tlen + 1 >= pend ) break;
		if ( i > 0 ) *p++ = ' ';
		gbmemcpy ( p , t->m_meta , tlen );
		p += tlen;
	}
	//int32_t dbufLen = p - dbuf;
	st->m_dbufLen = p - st->m_dbuf;
	*p = '\0';
	st->m_n = 0;
	st->m_i = 0;
	launchMsg20s ( st , true , st->m_topicGroups[0].m_topicSampleSize );
}

void launchMsg20s ( State24 *st , bool callsample , int32_t sampleSize ) {
	// launch all the msg20s to get big samples of each doc
	//int32_t n = 0;
	for ( ; st->m_i < st->m_numDocIds ; st->m_i++ ) {
		// skip if clustered out
		if ( st->m_clusterLevels[st->m_i] != CR_OK )
			continue;
		// wait for later if too many outstanding
		if ( st->m_numRequests - st->m_numReplies >=
		     MAX_OUTSTANDING ) return;
		// use the jth slot if we should
		//if ( j >= 0 ) n = j;
		// save the msg index
		//st->m_msg20[n].m_n      = n;
		//st->m_msg20[n].m_parent = st;
		// supply the display metas as the meta in our TopicGroups
		// . start up a Msg20 to get the relevant doc text
		// . this will return false if it blocks
		// . a 32k sample takes 11ms to hash in hashSample() and
		//   most samples are below 5k anyway...
		Msg20 *mm = &st->m_msg20[st->m_n++];
		// set the summary request then get it!
		Msg20Request req;
		Query *q = &st->m_qq;
		//int32_t nt = q->m_numTerms;
		req.ptr_qbuf  = q->getQuery();
		req.size_qbuf = q->getQueryLen()+1;
		//req.ptr_termFreqs   = (char *)m_msg3a.m_termFreqs;
		//req.size_termFreqs  = 8 * nt;
		//req.ptr_affWeights  = (char *)m_msg3a.m_affWeights;
		//req.size_affWeights = 4 * nt; // 4 = sizeof(float)
		req.ptr_coll  = st->m_coll;
		req.size_coll = strlen(st->m_coll)+1;
		if ( st->m_dbufLen > 0 ) {
			req.ptr_displayMetas  = st->m_dbuf ;
			req.size_displayMetas = st->m_dbufLen+1;
		}
		req.m_docId           = st->m_docIds[st->m_i];
		req.m_numSummaryLines = 0;
		req.m_maxCacheAge     = st->m_maxCacheAge;
		req.m_wcache          = st->m_addToCache;
		req.m_state           = st;
		req.m_callback        = gotSampleWrapper;
		req.m_niceness        = st->m_niceness;
		//req.m_summaryMode   = m_si->m_summaryMode;
		req.m_boolFlag        = q->m_isBoolean; // 2 means auto?
		//req.m_allowPunctInPhrase = m_si->m_allowPunctInPhrase;
		//req.m_showBanned         = m_si->m_showBanned;
		//req.m_excludeLinkText    = m_si->m_excludeLinkText ;
		//req.m_hackFixWords       = m_si->m_hackFixWords ;
		//req.m_hackFixPhrases     = m_si->m_hackFixPhrases ;
		//req.m_includeCachedCopy  = m_si->m_includeCachedCopy;//bigsm
		req.m_bigSampleRadius = 100;
		req.m_bigSampleMaxLen = sampleSize;
		if ( ! mm->getSummary ( &req ) ) {
			st->m_numRequests++;
			continue;
		}
#ifdef _OLDMSG20_
		if ( ! mm->getSummary ( &st->m_qq ,
					NULL ,                 // term freqs
					NULL ,                 // aff weights
					st->m_docIds[st->m_i] ,
					1 ,                    // clusterLevel
					0 ,                    // # sum lines
					st->m_maxCacheAge ,
					st->m_addToCache ,
					st->m_coll ,           // coll
					strlen(st->m_coll) ,
					st ,                   // state
					gotSampleWrapper ,
					st->m_niceness ,
					false ,                // root?
					st->m_dbuf ,           // dt metas
					st->m_dbufLen ,        // dtmetalen
					100 ,                  // smpl radius
					sampleSize )){         // smpl max
			st->m_numRequests++;
			// if just launching one, bail if this blocked
			//if ( j >= 0 ) return;
			continue;
		}
#endif
		// deal with an error
		if ( g_errno ) {
			// log it
			log("topics: Received error when getting "
			    "document with docId %"INT64": %s. Document will "
			    "not contribute to the topics generation.",
			    st->m_docIds[st->m_i],mstrerror(g_errno));
			// reset g_errno
			g_errno = 0;
		}
		// . otherwise we got the summary without blocking
		// . increment # of replies (instant reply) and results
		st->m_numReplies++;
		st->m_numRequests++;
		// if we were just launching one and it did not block, return
		//if ( j >= 0 ) return;
	}
	// did anyone block? if so, return for now
	if ( st->m_numReplies < st->m_numRequests ) return ;
	// . otherwise, we got everyone, so go right to the merge routine
	// . returns false if not all replies have been received
	// . returns true if done
	// . sets g_errno on error
	if ( callsample ) gotSampleWrapper ( st );
}
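
// hashSample() (declared below; its definition is further along in this
// file) hashes the words and phrases of one sample into the "master"
// TermTable, accumulating per-term scores, popularity and docid links.
// "vecs"/"numVecs" hold per-sample vectors used to dedup samples that
// are too similar, and "repeatTable" lets repeated fragments across
// samples be down-weighted (see setRepeatScores() above).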
static bool hashSample ( Query *q , char *sample , int32_t sampleLen ,
			 TermTable *master , int32_t *nqiPtr ,
			 TopicGroup *t ,
			 State24 *st ,
			 int64_t docId ,
			 char *vecs , int32_t *numVecs ,
			 class Words *wordsPtr , class Scores *scoresPtr ,
			 bool isUnicode ,
			 char *repeatTable , int32_t repeatTableNumSlots ,
			 char language );

void gotSampleWrapper ( void *state ) {
	// get ptr to our state 24 class
	State24 *st = (State24 *)state;
	// if niceness is 0, use the higher priority udpServer
	UdpServer *us = &g_udpServer;
	//if ( st->m_niceness == 0 ) us = &g_udpServer2;
	//else                       us = &g_udpServer ;
	UdpSlot *slot = st->m_slot;
	// just bitch if there was an error, then ignore it
	if ( g_errno ) {
		log("topics: Had error getting document: %s. Document will "
		    "not contribute to the topics generation.",
		    mstrerror(g_errno));
		g_errno = 0;
	}
	// we got one
	st->m_numReplies++;
	// launch another request if we can
	// return if all done
	launchMsg20s ( st , false , st->m_topicGroups[0].m_topicSampleSize ) ;
	// wait for all replies to get here
	if ( st->m_numReplies < st->m_numRequests ) return;
	// get time now
	//int64_t now = gettimeofdayInMilliseconds();
	// . add the stat
	// . use purple for time to get all summaries
	//g_stats.addStat_r ( 0 ,
	//		      m_startTime ,
	//		      now ,
	//		      0x008220ff );
	// timestamp log
	//int64_t startTime = gettimeofdayInMilliseconds();
	log(LOG_DEBUG,"topics: msg24: Got %"INT32" titleRecs.",
	    st->m_numReplies ); //, now - m_startTime );

	// set query
	//Query q;
	//q.set ( st->m_query , st->m_queryLen , NULL , 0 , 2/*auto*/, true);

	// . init table for up to about 5k total distinct pronouns & phrases
	// . it automatically grows by like 20% if it runs out of space
	// . only alloc space for linked lists if docid info is wanted
	TermTable master;
	if ( ! master.set ( 20000 , true , true ,
			    st->m_returnDocIdCount | st->m_returnDocIds ,
			    st->m_returnPops , true, false, NULL ) ) {
		mdelete ( st , sizeof(State24) , "Msg24" );
		delete ( st );
		log("topics: Could not allocate memory for topic generation.");
		us->sendErrorReply ( slot , ENOMEM );
		return ;
	}

	// timestamp log
	int64_t startTime = gettimeofdayInMilliseconds();

	// debug
	//char *pp = (char *)mmalloc ( 4 , "foo");
	//*(int32_t *)pp = 0;
	//us->sendReply_ass ( pp , 4 , pp , 4 , slot );
	//delete(st);
	//return;

	// store all topics (scores/gids) in this buffer
	//char  buf [ 128*1024 ];
	//char *p    = buf;
	//char *pend = buf + 128*1024;
	char *buf = NULL;
	int32_t bufSize = 0;
	//for ( int32_t yyy = 0 ; yyy < 100 ; yyy++ ) { master.clear();//mdw
	// loop over all topic groups
	for ( int32_t i = 0 ; i < st->m_numTopicGroups ; i++ ) {
		// get the ith topic group descriptor
		TopicGroup *t = &st->m_topicGroups[i];
		// . generate topics for this topic group
		// . serialize them into "p"
		// . getTopics will realloc() this "buf" to exactly the size
		//   it needs
		getTopics ( st , t , &master , &st->m_qq , i ,
			    // getTopics will realloc this buffer
			    &buf , &bufSize , NULL , NULL , NULL );
		// clear the master table each time
		if ( i + 1 < st->m_numTopicGroups ) master.clear();
	}
	//}

	// free mem now to avoid fragmentation
	master.reset();

	// if small enough, copy into slot's tmp buffer
	char *reply = buf;
	int32_t replySize = bufSize;
	// launch it
	us->sendReply_ass ( reply , replySize , reply , replySize , slot );
	mdelete ( st , sizeof(State24) , "Msg24" );
	delete ( st );

	// . on host0, this is 21.3 ms with a std.dev. of 17.5 using dsrt=30
	//   measured on log[b-d] with the limit of 4 words per "giga bit".
	// . now time with our new 6 word phrase maximum:
	//   sum = 1294.0 avg = 16.0 sdev = 10.8 ... our rewrite was faster!!
	//if ( g_conf.m_timingDebugEnabled )
	// timing debug
	int64_t took = gettimeofdayInMilliseconds() - startTime ;
	log(LOG_TIMING,"topics: Took %"INT64" ms to parse out topics.", took);
}

class DocIdLink {
public:
	int64_t m_docId;
	int32_t m_next; // offset into st->m_mem to the next DocIdLink
};
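
// Each term in the master TermTable can carry a singly linked list of
// the docids it was seen in. Links live in the st->m_mem pool and are
// addressed by byte offset rather than raw pointer, presumably so the
// pool can be grown with realloc without invalidating the lists.
// master->getHeads() gives the offset of the head link for each term,
// and an offset of -1 marks the end of a list, which is why the
// traversals below loop while (char *)link >= st->m_mem.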

// returns false and sets g_errno on error, true otherwise
bool getTopics ( State24    *st ,
		 TopicGroup *t ,
		 TermTable  *master ,
		 Query      *q ,
		 char        gid ,
		 char      **buf ,
		 int32_t    *bufSize ,
		 // these ptrs are supplied by the spider when trying to
		 // generate the gigabit vector for a document it is indexing
		 class Words  *wordsPtr ,
		 class Scores *scoresPtr ,
		 int32_t      *hashes ,
		 unsigned char language ,
		 int32_t       niceness ,
		 LinkInfo     *linkInfo ,
		 LinkInfo     *linkInfo2 ) {

	////////////////////////////////////////////
	//
	// GENERATE THE TOPICS
	//
	////////////////////////////////////////////

	//int64_t start = gettimeofdayInMilliseconds();

	// only allow one vote per ip
	HashTable iptable;
	// return false and set g_errno if this alloc fails
	if ( t->m_ipRestrict && ! iptable.set ( st->m_numRequests * 4 ) )
		return false;

	// space for all vectors for deduping samples that are 80% similar
	char vbuf [ 64*1024 ];
	char *vecs = vbuf;
	int32_t numVecs = 0;
	int32_t vneed = st->m_numRequests * SAMPLE_VECTOR_SIZE;
	if ( t->m_dedupSamplePercent >= 0 && vneed > 64*1024 )
		vecs = (char *)mmalloc ( vneed , "Msg24d" );
	if ( ! vecs ) return false;

	// hack, if words supplied, treat as one request
	if ( wordsPtr ) st->m_numRequests = 1;

	//
	// . make the hash table used for repeated fragment detection
	// . one slot per word, over all samples
	//

	// for every sample estimate the number of words so we know how big
	// to make our repeat hash table
	int32_t maxWords = 0;
	Words tmpw;
	// if getting a gigabit vector for a single doc, we know the # of words
	if ( wordsPtr ) maxWords += wordsPtr->getNumWords();
	// otherwise, get max # of words for each big sample via Msg20
	int32_t numMsg20Used = 0;
	for ( int32_t i = 0 ; ! wordsPtr && i < st->m_numRequests ; i++ ) {
		Msg20 *thisMsg20 = NULL;
		if      ( wordsPtr    ) {}
		else if ( st->m_msg20 ) thisMsg20 = &st->m_msg20[i];
		else {
			thisMsg20 = st->m_msg20Ptrs[i];
			if ( st->m_clusterLevels[i] != CR_OK ) continue;
		}
		// continue if we've gotten no content
		if ( ! wordsPtr &&
		     ( ! thisMsg20 || ( thisMsg20 && thisMsg20->m_errno ) ) )
			continue;
		// make sure the summary is not in a foreign language (aac)
		if ( thisMsg20 ) {
			unsigned char sLang;
			sLang = thisMsg20->m_r->m_summaryLanguage;
			if ( language != langUnknown && sLang != language )
				continue;
		}
		// get the ith big sample
		char   *sample = NULL;
		int32_t slen   = 0;
		// but if doing metas, get the display content
		char *next = NULL;
		if ( thisMsg20 ) next = thisMsg20->getDisplayBuf();
		if ( t->m_meta[0] && next )
			sample = thisMsg20->getNextDisplayBuf(&slen,&next);
		// XmlDoc::getGigabitVector() provides us with the Words/Scores
		// classes for the whole document. that is the "sample"
		else {
			sample = thisMsg20->getBigSampleBuf();
			slen   = thisMsg20->getBigSampleLen();
		}
		// are we unicode?
		bool isUnicode = thisMsg20->isUnicode();
		// set parser vars
		char *p    = sample;
		char *pend = sample + slen;
		// each sample consists of multiple \0 terminated excerpts
		int32_t sampleWords = 0;
#ifdef DEBUG_MSG24
		int32_t numExcerpts = 0;
#endif
		while ( p < pend ) {
			int32_t plen ;
			if ( isUnicode ) plen = ucStrNLen (p,pend-p);
			else             plen = strlen    (p);
			if ( isUnicode ) sampleWords += countWords((UChar *)p,plen);
			else             sampleWords += countWords(         p,plen);
			// advance to next excerpt
			p += plen + 1;
#ifdef DEBUG_MSG24
			numExcerpts++;
#endif
		}
#ifdef DEBUG_MSG24
		if ( sampleWords > 2048 ) {
			char *dbgBuf = NULL;
			log("topics: Unusually long sample in Msg24: "
			    "sampleWords=%"INT32" numExcerpts=%"INT32"",
			    sampleWords, numExcerpts);
			if ( (dbgBuf = (char *)mmalloc(slen+1, "DEBUG_MSG24")) ) {
				int jjStep = 1;
				if ( isUnicode ) jjStep = 2;
				int kk = 0;
				for ( int jj = 0 ; jj < slen ; jj += jjStep ) {
					if ( sample[jj] ) dbgBuf[kk++] = sample[jj];
					else              dbgBuf[kk++] = '#';
				}
				dbgBuf[kk++] = '\0';
				log("topics: \tsample was: %s", dbgBuf);
			}
		}
		else {
			log("topics: Reasonable sample in Msg24: "
			    "sampleWords=%"INT32" numExcerpts=%"INT32"",
			    sampleWords, numExcerpts);
		}
#endif
		if ( maxWords + sampleWords > 0x08000000 ) {
			log("topics: too many words in samples. "
			    "Discarding the remaining samples "
			    "(maxWords=%"INT32")", maxWords);
			break;
		}
		else {
			maxWords += sampleWords;
			numMsg20Used++;
		}
	}
	// make it big enough so there are gaps, so chains are not too long
	int32_t minBuckets = (int32_t)(maxWords * 1.5);
	if ( minBuckets < 512 ) minBuckets = 512;
	int32_t numSlots = 2 * getHighestLitBitValue ( minBuckets ) ;
	int32_t need2 = numSlots * (8+4);
	char *rbuf = NULL;
	char tmpBuf2[13000];
	// sanity check
	if ( need2 < 0 ) {
		g_errno = EBADENGINEER;
		return log("query: bad engineer in Msg24.cpp. need2=%"INT32" "
			   "numSlots=%"INT32" maxWords=%"INT32" q=%s",
			   need2,numSlots,maxWords,q->m_orig);
	}
	if ( need2 < 13000 ) rbuf = tmpBuf2;
	else rbuf = (char *)mmalloc ( need2 , "WeightsSet3");
	if ( ! rbuf ) return false;
	// sanity check
	if ( numSlots * 8 > need2 || numSlots * 8 < 0 ) {
		g_errno = EBADENGINEER;
		return log("query: bad engineer in Msg24.cpp. need2=%"INT32" "
			   "numSlots=%"INT32" q=%s", need2,numSlots,q->m_orig);
	}
	// clear the keys in the hash table (empty it out)
	memset ( rbuf , 0 , numSlots * 8 );
	// set the member var to this
	char *repeatTable = rbuf;
	int32_t repeatTableNumSlots = numSlots;
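	// (getHighestLitBitValue() appears to return the value of the
	// highest set bit, so numSlots comes out a power of two greater
	// than minBuckets, i.e. at least ~1.5x the estimated word count;
	// the extra slack keeps hash chains short, per the comment above.)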

	//
	// end making the hash table for repeated fragment detection
	//

	// now combine all the pronouns and pronoun phrases into one big hash
	// table and collect the top 10 topics
	int32_t nqi    = 0; // how many query terms actually used? for normalizing.
	int32_t tcount = 0; // how many title recs did we process?
	QUICKPOLL(niceness);

	for ( int32_t i = 0 ; i < numMsg20Used ; i++ ) {
		Msg20 *thisMsg20 = NULL;
		if      ( wordsPtr    ) {}
		else if ( st->m_msg20 ) thisMsg20 = &st->m_msg20[i];
		else {
			thisMsg20 = st->m_msg20Ptrs[i];
			if ( st->m_clusterLevels[i] != CR_OK ) continue;
		}
		// make sure the summary is not in a foreign language (aac)
		if ( thisMsg20 ) {
			unsigned char sLang;
			sLang = thisMsg20->m_r->m_summaryLanguage;
			if ( language != langUnknown && sLang != language )
				continue;
		}
		// continue if we've gotten no content
		if ( ! wordsPtr &&
		     ( ! thisMsg20 || ( thisMsg20 && thisMsg20->m_errno ) ) )
			continue;
		// skip if from an ip we already did
		if ( t->m_ipRestrict ) {
			int32_t ipd = ipdom ( thisMsg20->getIp() );
			// zero is invalid!
			if ( ! ipd ) continue;
			//log("url=%s",thisMsg20->getUrl());
			if ( iptable.getValue(ipd) ) {
				//log("dup=%s",thisMsg20->getUrl());
				continue;
			}
			// now we also check domain
			Url uu;
			uu.set ( thisMsg20->getUrl() ,
				 thisMsg20->getUrlLen() );
			// "mid dom" is the "ibm" part of ibm.com or ibm.de
			char   *dom  = uu.getMidDomain();
			int32_t dlen = uu.getMidDomainLen();
			if ( dom && dlen > 0 ) {
				int32_t h = hash32 ( dom , dlen );
				if ( iptable.getValue(h) ) continue;
				iptable.addKey (h,1);
			}
			// add ip
			iptable.addKey (ipd,1);
		}
		// get the ith big sample
		char   *bigSampleBuf = NULL;
		int32_t bigSampleLen = 0;
		// but if doing metas, get the display content
		char *next = NULL;
		if ( thisMsg20 ) next = thisMsg20->getDisplayBuf();
		if ( t->m_meta[0] && next ) {
			bigSampleBuf =
				thisMsg20->getNextDisplayBuf(&bigSampleLen,&next);
		}
		// XmlDoc::getGigabitVector() provides us with the Words/Scores
		// classes for the whole document. that is the "sample"
		else if ( ! wordsPtr ) {
			bigSampleBuf = thisMsg20->getBigSampleBuf();
			bigSampleLen = thisMsg20->getBigSampleLen();
		}
		// skip if empty
		if ( ! wordsPtr && ( bigSampleLen <= 0 || ! bigSampleBuf ) )
			continue;
		// otherwise count it
		tcount++;
		// the docid
		int64_t docId = 0;
		if ( ! wordsPtr ) docId = thisMsg20->getDocId();
		// are we unicode?
		bool isUnicode;
		if ( ! wordsPtr ) isUnicode = thisMsg20->isUnicode();
		else              isUnicode = wordsPtr->isUnicode();
		unsigned char lang = language;
		if ( ! wordsPtr ) lang = thisMsg20->getLanguage();
		// continue; // mdw
		QUICKPOLL(niceness);
		// . hash it into the master table
		// . this may alloc st->m_mem, so be sure to free below
		hashSample ( q, bigSampleBuf, bigSampleLen, master, &nqi , t ,
			     st, docId ,
			     vecs , &numVecs ,
			     wordsPtr , scoresPtr , isUnicode ,
			     repeatTable , repeatTableNumSlots , lang );
		// ignore errors
		g_errno = 0;

		// hash the inlink texts and neighborhoods
		for ( Inlink *k = NULL ;
		      linkInfo && (k = linkInfo->getNextInlink(k)) ; ) {
			char   *s   = k->ptr_linkText;
			int32_t len = k->size_linkText - 1;
			hashSample ( q, s, len, master, &nqi , t ,
				     st, docId , // 0
				     vecs , &numVecs ,
				     NULL , NULL , k->m_isUnicode ,
				     repeatTable , repeatTableNumSlots ,
				     lang );
			// and surrounding text
			s   = k->ptr_surroundingText;
			len = k->size_surroundingText - 1;
			hashSample ( q, s, len, master, &nqi , t ,
				     st, docId , // 0
				     vecs , &numVecs ,
				     NULL , NULL , k->m_isUnicode ,
				     repeatTable , repeatTableNumSlots ,
				     lang );
		}
		for ( Inlink *k = NULL ;
		      linkInfo2 && (k = linkInfo2->getNextInlink(k)) ; ) {
			char   *s   = k->ptr_linkText;
			int32_t len = k->size_linkText - 1;
			hashSample ( q, s, len, master, &nqi, t ,
				     st, docId , // docId
				     vecs , &numVecs ,
				     NULL , NULL, isUnicode ,
				     repeatTable, repeatTableNumSlots,
				     lang );
		}
		// ignore errors
		g_errno = 0;
	}

	// hash meta keywords and meta description when generating the
	// gigabit vector, mainly useful for docs which have all of their
	// content in frames
	if ( st->m_dbufLen > 0 && wordsPtr ) {
		hashSample ( q, st->m_dbuf, st->m_dbufLen, master, &nqi , t ,
			     st, 0/*docId*/ ,
			     vecs , &numVecs ,
			     NULL , NULL , wordsPtr->isUnicode() ,
			     repeatTable , repeatTableNumSlots , language );
	}

	//log("did samples in %"INT64" ",gettimeofdayInMilliseconds()-start);

	int32_t nt = master->getNumTerms();

	// debug msg
	/*
	for ( int32_t i = 0 ; i < nt ; i++ ) {
		int32_t score = master->getScoreFromTermNum(i) ;
		if ( ! score ) continue;
		char   *ptr = master->getTermPtr(i) ;
		int32_t len = master->getTermLen(i);
		char ff[1024];
		if ( len > 1020 ) len = 1020;
		gbmemcpy ( ff , ptr , len );
		ff[len] = '\0';
		// we can have html entities in here now
		//if ( ! is_alnum(ff[0]) ) { char *xx = NULL; *xx = 0; }
		log("%08"INT32" %s",score,ff);
	}
	*/

	// how many do we need?
	int32_t need = t->m_maxTopics ;
	// get this many winners
	int32_t maxWinners = need;
	// double it in case some get deduped
	if ( t->m_dedup ) maxWinners *= 2; // mdw
	// count how many get removed, might have to recompute
	int32_t removed ;
	int32_t got = 0;

	// now get the top MAX_TOPICS or maxWinners pronouns or pronoun phrases
	//int32_t        scores [ MAX_TOPICS ];
	//char          *ptrs   [ MAX_TOPICS ];
	//unsigned char  lens   [ MAX_TOPICS ];
	int32_t  *scores = NULL;
	char    **ptrs   = NULL;
	int32_t  *lens   = NULL;
	char     *isunis = NULL;
	int32_t  *slots  = NULL;
	int32_t  *pages  = NULL;
	// these vars are used below
	//char *ptrs2  [ MAX_TOPICS ];
	//int32_t lens2 [ MAX_TOPICS ];
	char    **ptrs2  = NULL;
	int32_t  *lens2  = NULL;

	char   *tmpBuf  = NULL;
	int32_t tmpSize = 0;
	//bool triedLinkInfo = false;
 redo:
	// ensure maxWinners not too big
	//if ( maxWinners > MAX_TOPICS ) maxWinners = MAX_TOPICS;

	// allocate enough space
	int32_t newSize = maxWinners *
		(sizeof(char *)+4+4+4+4+sizeof(char *)+4+1);
	char *newBuf = (char *)mrealloc ( tmpBuf , tmpSize , newSize ,
					  "Msg24e" );
	if ( ! newBuf ) {
		if ( tmpBuf ) mfree ( tmpBuf , tmpSize , "Msg24" );
		// free the links in the linked list, if any
		if ( st->m_mem ) {
			mfree ( st->m_mem, st->m_memEnd - st->m_mem, "Msg24" );
			st->m_mem    = NULL;
			st->m_memEnd = NULL;
			st->m_memPtr = NULL;
		}
		if ( vecs != vbuf ) mfree ( vecs , vneed , "Msg24" );
		return log("topics: realloc to %"INT32" failed.",newSize);
	}
	tmpBuf  = newBuf;
	tmpSize = newSize;
	char *pp = tmpBuf;
	ptrs   = (char   **)pp ; pp += sizeof(char *) * maxWinners;
	scores = (int32_t *)pp ; pp += 4 * maxWinners;
	lens   = (int32_t *)pp ; pp += 4 * maxWinners;
	isunis =            pp ; pp += maxWinners;
	slots  = (int32_t *)pp ; pp += 4 * maxWinners;
	pages  = (int32_t *)pp ; pp += 4 * maxWinners;
	ptrs2  = (char   **)pp ; pp += sizeof(char *) * maxWinners;
	lens2  = (int32_t *)pp ; pp += 4 * maxWinners;

	int32_t *pops = master->m_pops;

	QUICKPOLL(niceness);

	int32_t np = 0;
	int32_t minScore = 0x7fffffff;
	int32_t minj = -1;
	int32_t i ;
	int32_t *heads = master->getHeads();
	bool callRedo = true;
	// total # of pages sampled
	int32_t sampled = numMsg20Used;
	for ( i = 0 ; i < nt && np < maxWinners ; i++ ) {
		// skip term #i from "table" if it has 0 score
		int32_t score = master->m_scores[i]; // getScoreFromTermNum(i)
		if ( ! score ) continue;

		// . make it higher the more popular a term is
		// . these are based on a MAXPOP of 10000
		int32_t mdc = (int32_t)((((double)sampled * 3.0 *
			(double)(pops[i]&0x7fffffff))+0.5)/MAXPOP);
		if ( mdc < t->m_minDocCount ) mdc = t->m_minDocCount;

		// skip if it does not meet the min doc count
		int32_t count = 0;
		//if ( mdc > 1 || st->m_returnDocIds ) {
		if ( t->m_minDocCount > 1 || st->m_returnDocIds ) {
			DocIdLink *link = (DocIdLink *)(st->m_mem+heads[i]);
			while ( (char *)link >= st->m_mem ) {
				count++;
				link = (DocIdLink*)(st->m_mem + link->m_next);
			}
			if ( count < mdc ) continue;
		}

		// set the min of all in our list
		if ( score < minScore ) { minScore = score; minj = np; }
		// i've seen this become NULL at line 753 on gb1 below for
		// /search?code=mammaXbG&uip=12.41.126.39&n=15&raw=8&q=
		// manhattan,+ny
		// so let's try it again and try to find out why maybe
		if ( master->m_termLens[i] <= 0 ) {
			char *orig = "";
			if ( q ) orig = q->m_orig;
			log (LOG_LOGIC,"query: Got 0 length gigabit. q=%s",
			     orig);
			continue;
		}
		// recalc the score
		//double frac1 = ((MAXPOP-(pops[i]&0x7fffffff))*100.0)/MAXPOP;
		//double frac2 = ((double)count * 100.0) / (double)sampled;
		//score = (int32_t)((frac1 * frac2) / 100.0);
		// we got a winner
		scores [ np ] = score;
		ptrs   [ np ] = master->m_termPtrs[i]; // getTermPtr(i)
		lens   [ np ] = master->m_termLens[i]; // getTermLen(i)
		isunis [ np ] = master->m_isunis[i];
		slots  [ np ] = i;
		pages  [ np ] = count;
		np++;
	}

	QUICKPOLL(niceness);
	// if not enough no matter what, do not redo
	if ( np < maxWinners ) callRedo = false;
	// now do the rest
	for ( ; i < nt ; i++ ) {
		// skip term #i from "table" if it has 0 score
		int32_t score = master->m_scores[i]; // getScoreFromTermNum(i)
		// bail if empty
		if ( score <= 0 ) continue;
		// ignore if not a winner
		if ( score <= minScore ) continue;
		// . make it higher the more popular a term is
		// . these are based on a MAXPOP of 10000
		int32_t mdc = (int32_t)((((double)sampled * 3.0 *
			(double)(pops[i]&0x7fffffff))+0.5)/MAXPOP);
		if ( mdc < t->m_minDocCount ) mdc = t->m_minDocCount;

		// skip if it does not meet the min doc count
		int32_t count = 0;
		if ( t->m_minDocCount > 1 || st->m_returnDocIds ) {
			DocIdLink *link = (DocIdLink *)(st->m_mem+heads[i]);
			// m_next is -1 to indicate end
			while ( (char *)link >= st->m_mem ) {
				count++;
				link = (DocIdLink *)(st->m_mem + link->m_next);
			}
			if ( count < mdc ) continue;
		}
		// find the score it will replace, the min one
		//int32_t j ;
		//for ( j = 0 ; j < np ; j++ )
		//	if ( scores [ j ] == minScore ) break;
		// bad engineer?
		//if ( j == np ) { char *xx = NULL; *xx = 0; }
		// recalc the score
		//double frac1 = ((MAXPOP-(pops[i]&0x7fffffff))*100.0)/MAXPOP;
		//double frac2 = ((double)count * 100.0) / (double)sampled;
		//int32_t newScore = (int32_t)((frac1 * frac2) / 100.0);
		//int32_t oldminj = minj;
		// replace the min (jth) guy
		scores [ minj ] = score;
		ptrs   [ minj ] = master->m_termPtrs[i]; // getTermPtr(i)
		lens   [ minj ] = master->m_termLens[i]; // getTermLen(i)
		isunis [ minj ] = master->m_isunis[i];
		pages  [ minj ] = count;
		slots  [ minj ] = i;
		//log("ptrs[%"INT32"]=%"XINT32"",j,ptrs[j]);
		// hopefully we increased the min score in our top set now
		minScore = 0x7fffffff;
		for ( int32_t j = 0 ; j < np ; j++ ) {
			if ( scores[j] < minScore ) {
				minScore = scores[j];
				minj     = j;
			}
		}
		//scores [oldminj] = newScore;
	}

	// bubble sort the top winners
 again:
	bool flag = false;
	for ( int32_t i = 1 ; i < np ; i++ ) {
		if ( scores[i-1] >= scores[i] ) continue;
		int32_t ts = scores[i];
		char   *tp = ptrs  [i];
		int32_t tl = lens  [i];
		char    tu = isunis[i];
		int32_t tc = pages [i];
		int32_t tt = slots [i];
		scores [i  ] = scores[i-1];
		ptrs   [i  ] = ptrs  [i-1];
		lens   [i  ] = lens  [i-1];
		isunis [i  ] = isunis[i-1];
		pages  [i  ] = pages [i-1];
		slots  [i  ] = slots [i-1];
		scores [i-1] = ts;
		ptrs   [i-1] = tp;
		lens   [i-1] = tl;
		isunis [i-1] = tu;
		pages  [i-1] = tc;
		slots  [i-1] = tt;
		flag = true;
	}
	if ( flag ) goto again;

	QUICKPOLL(niceness);

	// . normalize all scores
	// . assume 20000 points per query term per page
	// . a topic term will get 20000 points for each query term it is
	//   close to
	int32_t max = nqi * tcount * MAX_SCORE_MULTIPLIER ; //10000;
	if ( nqi == 0 ) max = tcount * ALT_MAX_SCORE;
	if ( max == 0 ) max = 1;
	for ( i = 0 ; i < np ; i++ ) {
		// skip if length is 0, it was a dup from above
		//if ( lens[i] <= 0 ) continue;
		scores[i] = (scores[i] * 100) / max;
		if ( scores[i] <=   0 ) scores[i] = 1;
		if ( scores[i] >= 100 ) scores[i] = 100; // add a log statement here? (aac)
	}
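	// e.g. with nqi=2 query terms and tcount=30 samples processed,
	// max = 2 * 30 * MAX_SCORE_MULTIPLIER = 180000 raw points, so a
	// raw score of 90000 normalizes to 50; results are then clamped
	// to the 1..100 range.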

	// . now set ptrs2/lens2 to point to the comparison string in each
	//   topic
	// . skip it over stop words, don't compare those
	// . this way we can do a more flexible strcasestr and ignore common
	//   words when comparing, they don't add much beyond repetition
	// . "super bowl" + "the super bowl" --> "super bowl"
	//char *ptrs2  [ MAX_TOPICS ];
	//int32_t lens2 [ MAX_TOPICS ];
	for ( i = 0 ; i < np ; i++ ) {
		/*
		Words w;
		w.set ( false , ptrs[i] , lens[i] , false );
		int32_t nw = w.getNumWords();
		// skip if none
		if ( nw <= 0 ) continue;
		*/
		// establish our new ptrs
		ptrs2 [ i ] = ptrs[i];
		lens2 [ i ] = lens[i];
		// skip initial common words
		//----> not if capitalized!! leave those intact. like
		//      Michael Jackson's "Beat It"
		/*
		int32_t h;
		int32_t j = 0;
		if ( w.isPunct(j) ) j++;
		for ( ; j < nw ; j += 2 ) {
			char   *ww    = w.getWord    (j);
			int32_t wwlen = w.getWordLen (j);
			// if capitalized, leave it
			if ( is_upper(ww[0]) ) break;
			// single letter lower case is a common word
			if ( wwlen <= 1 && is_alpha(ww[0]) ) goto gotone;
			// leave it if not common
			h = hash64d(w.getWord(j),w.getWordLen(j));
			if ( ! isCommonWord ( h ) ) break;
			// otherwise, scrub it off
		gotone:
			ptrs2 [i] = w.getWord(j+2);
		}
		// skip trailing common words
		int32_t k = nw - 1 ;
		if ( w.isPunct(k) ) k--;
		for ( ; k >= j ; k -= 2 ) {
			char   *ww    = w.getWord    (k);
			int32_t wwlen = w.getWordLen (k);
			// if capitalized, leave it
			if ( is_upper(ww[k]) ) break;
			// single letter lower case is a common word
			if ( wwlen <= 1 && is_alpha(ww[0]) ) goto gotone;
			// left off here!!
			if ( w.getWordLen(j) <= 1 && is_alpha(w.getWord(j)[0]) )
				continue;
			h = hash64d(w.getWord(j),w.getWordLen(j));
			if ( ! isCommonWord ( h ) ) break;
		}
		// set new length
		char *end2 = w.getWord(k) + w.getWordLen(k);
		lens2[i] = end2 - ptrs2[i];
		*/
	}

	if ( ! t->m_dedup ) goto skipdedup;
	//goto skipdedup; // mdw

	removed = 0;
	// now remove similar terms from the top topics
	for ( int32_t i = 0 ; i < np - 1 ; i++ ) {
		// skip if nuked already
		if ( lens[i] == 0 ) continue;
		// scan down to this score, but not below
		//int32_t minScore = (scores[i] * 75) / 100 ;
		int32_t minScore = scores[i] - 25;
		// if we get replaced by a longer guy, remember him
		int32_t replacerj = -1;
		// . a longer term that encapsulates us can eliminate us
		// . or, if we're the longer, we eliminate the shorter
		for ( int32_t j = i + 1 ; j < np ; j++ ) {
			// skip if nuked already
			if ( lens[j] == 0 ) continue;
			// null term both
			char c1 = ptrs2[i][lens2[i]];
			char c2 = ptrs2[j][lens2[j]];
			ptrs2[i][lens2[i]] = '\0';
			ptrs2[j][lens2[j]] = '\0';
			// if we are the shorter, and the longer contains us
			// then it nukes us... unless his score is too low
			if ( lens2[i] < lens2[j] ) {
				// is the shorter contained?
				char *s;
				if      (isunis[j] == 0 && isunis[i] == 0)
					s = gb_strcasestr (ptrs2[j],ptrs2[i]) ;
				else if (isunis[j] == 0 && isunis[i] == 1)
					s = ucStrNCaseStr(
						ptrs2[j],
						(UChar*)ptrs2[i], lens2[i]>>1);
				else if (isunis[j] == 1 && isunis[i] == 0)
					s = (char*)ucStrNCaseStr(
						(UChar*)ptrs2[j], lens2[j]>>1,
						ptrs2[i]);
				else
					s = (char*)ucStrNCaseStr(
						(UChar*)ptrs2[j], lens2[j]>>1,
						(UChar*)ptrs2[i], lens2[i]>>1);
				// un-null term both
				ptrs2[i][lens2[i]] = c1;
				ptrs2[j][lens2[j]] = c2;
				// even if he's longer, if his score is too
				// low then he cannot nuke us
				if ( scores[j] < minScore ) continue;
				// if we were NOT contained by someone below...
				if ( ! s ) continue;
				// he's gotta be on all of our pages, too
				//if ( ! onSamePages(i,j,slots,heads,pages) )
				//	continue;
				// shorter gets our score (we need to sort)
				// not yet! let him finish, then replace him!!
				replacerj = j;
				// see if we can nuke other guys at least
				continue;
			}
			// . otherwise, we are the longer
			// . we can nuke any shorter below us, all scores
			char *s;
			if      (isunis[i] == 0 && isunis[j] == 0)
				s = gb_strcasestr (ptrs2[i],ptrs2[j]) ;
			else if (isunis[i] == 0 && isunis[j] == 1)
				s = ucStrNCaseStr(
					ptrs2[i],
					(UChar*)ptrs2[j], lens2[j]>>1);
			else if (isunis[i] == 1 && isunis[j] == 0)
				s = (char*)ucStrNCaseStr(
					(UChar*)ptrs2[i], lens2[i]>>1,
					ptrs2[j]);
			else
				s = (char*)ucStrNCaseStr(
					(UChar*)ptrs2[i], lens2[i]>>1,
					(UChar*)ptrs2[j], lens2[j]>>1);
			// un-null term both
			ptrs2[i][lens2[i]] = c1;
			ptrs2[j][lens2[j]] = c2;

			QUICKPOLL(niceness);

			// keep going if no match
			if ( ! s ) continue;
			// remove him if we contain him
			lens[j] = 0;
			// count him
			removed++;
			// the redo flag
			//rflag = 1;
		}
		// if we got replaced by a longer guy, he replaces us
		// and takes our score
		if ( replacerj >= 0 ) {
			ptrs  [i] = ptrs  [replacerj];
			lens  [i] = lens  [replacerj];
			pages [i] = pages [replacerj];
			slots [i] = slots [replacerj];
			ptrs2 [i] = ptrs2 [replacerj];
			lens2 [i] = lens2 [replacerj];
			//scores[i] = scores[replacerj];
			lens [replacerj] = 0;
			i--;
			// count him
			removed++;
			// the redo flag
			//rflag = 1;
		}
	}
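	// (Recap of the pass above: a lower-ranked topic j whose comparison
	// string is contained in topic i's is nuked outright; if instead a
	// longer j contains i and scores within 25 points of it, j is moved
	// up into i's slot, so the longer phrase inherits the better
	// position and score.)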

	// . PROBLEM #2: often a phrase and the next phrase, +1, are in
	//   there... how to fix? the higher scoring one should swallow
	//   up the lower scoring one, even if only 3 of the 4 words match
	//   (do not count common words)

	// . #3 or when all non-query, non-common terms match... pick the
	//   longer and remove the common words, but keep query words.

	// again2:
	//char rflag = 0;
	// if two terms are close in score, and one is a longer version
	// of the other, choose it and remove the shorter
	for ( int32_t i = 0 ; i < np - 1 ; i++ ) {
		// skip if nuked already
		if ( lens[i] == 0 ) continue;
		// scan down to this score, but not below
		//int32_t minScore = (scores[i] * 75) / 100 ;
		int32_t minScore = scores[i] - 15;
		// if we get replaced by a longer guy, remember him
		int32_t replacerj = -1;
		// . a longer term that encapsulates us can eliminate us
		// . or, if we're the longer, we eliminate the shorter
		for ( int32_t j = i + 1 ; j < np ; j++ ) {
			// skip if nuked already
			if ( lens[j] == 0 ) continue;
			// null term both
			char c1 = ptrs[i][lens[i]];
			char c2 = ptrs[j][lens[j]];
			ptrs[i][lens[i]] = '\0';
			ptrs[j][lens[j]] = '\0';
			// if we are the shorter, and the longer contains us
			// then it nukes us... unless his score is too low
			if ( lens[i] < lens[j] ) {
				// is the shorter contained?
				char *s;
				if      (isunis[j] == 0 && isunis[i] == 0)
					s = gb_strcasestr (ptrs2[j],ptrs2[i]) ;
				else if (isunis[j] == 0 && isunis[i] == 1)
					s = ucStrNCaseStr(
						ptrs2[j],
						(UChar*)ptrs2[i], lens2[i]>>1);
				else if (isunis[j] == 1 && isunis[i] == 0)
					s = (char*)ucStrNCaseStr(
						(UChar*)ptrs2[j], lens2[j]>>1,
						ptrs2[i]);
				else
					s = (char*)ucStrNCaseStr(
						(UChar*)ptrs2[j], lens2[j]>>1,
						(UChar*)ptrs2[i], lens2[i]>>1);
				// un-null term both
				ptrs[i][lens[i]] = c1;
				ptrs[j][lens[j]] = c2;
				// even if he's longer, if his score is too
				// low then he cannot nuke us
				if ( scores[j] < minScore ) continue;
				// if we were NOT contained by someone below...
				if ( ! s ) continue;
				// if we are not on the same pages as the
				// shorter one, then we cannot absorb him
				//if ( ! onSamePages(i,j,slots,heads,pages))
				//	continue;
				// shorter gets our score (we need to sort)
				// not yet! let him finish, then replace him!!
				replacerj = j;
				// see if we can nuke other guys at least
				continue;
			}
			// . otherwise, we are the longer
			// . we can nuke any shorter below us, all scores
			char *s;
			if      (isunis[i] == 0 && isunis[j] == 0)
				s = gb_strcasestr (ptrs2[i],ptrs2[j]) ;
			else if (isunis[i] == 0 && isunis[j] == 1)
				s = ucStrNCaseStr(
					ptrs2[i],
					(UChar*)ptrs2[j], lens2[j]>>1);
			else if (isunis[i] == 1 && isunis[j] == 0)
				s = (char*)ucStrNCaseStr(
					(UChar*)ptrs2[i], lens2[i]>>1,
					ptrs2[j]);
			else
				s = (char*)ucStrNCaseStr(
					(UChar*)ptrs2[i], lens2[i]>>1,
					(UChar*)ptrs2[j], lens2[j]>>1);
			// un-null term both
			ptrs[i][lens[i]] = c1;
			ptrs[j][lens[j]] = c2;

			QUICKPOLL(niceness);

			// keep going if no match
			if ( ! s ) continue;
			// if we are not on the same pages as the
			// shorter one, then we cannot absorb him
			//if ( ! onSamePages(i,j,slots,heads,pages))
			//	continue;
			// remove him if we contain him
			lens[j] = 0;
			// count him
			removed++;
			// the redo flag
			//rflag = 1;
		}
		// if we got replaced by a longer guy, he replaces us
		// and takes our score
		if ( replacerj >= 0 ) {
			ptrs  [i] = ptrs  [replacerj];
			lens  [i] = lens  [replacerj];
			pages [i] = pages [replacerj];
			slots [i] = slots [replacerj];
			//scores[i] = scores[replacerj];
			lens [replacerj] = 0;
			i--;
			// count him
			removed++;
			// the redo flag
			//rflag = 1;
		}
	}
	// if someone got replaced, loop more
	//if ( rflag ) goto again2;

	// remove common phrases
	for ( int32_t i = 0 ; i < np ; i++ ) {
		// skip if nuked already
		if ( lens[i] == 0 ) continue;
		// compare
		bool remove = false;
		if ( isunis[i] == 0 ) { //com org dom xhtml html dtd
			if (!strncasecmp(ptrs[i], "all rights reserved" ,lens[i]) ||
			    !strncasecmp(ptrs[i], "rights reserved"     ,lens[i]) ||
			    !strncasecmp(ptrs[i], "in addition"         ,lens[i]) ||
			    !strncasecmp(ptrs[i], "for example"         ,lens[i]) ||
			    !strncasecmp(ptrs[i], "in order"            ,lens[i]) ||
			    !strncasecmp(ptrs[i], "in fact"             ,lens[i]) ||
			    !strncasecmp(ptrs[i], "in general"          ,lens[i]) ||
			    !strncasecmp(ptrs[i], "contact us"          ,lens[i]) ||
			    !strncasecmp(ptrs[i], "at the same time"    ,lens[i]) ||
			    !strncasecmp(ptrs[i], "http"                ,lens[i]) ||
			    !strncasecmp(ptrs[i], "html"                ,lens[i]) ||
			    !strncasecmp(ptrs[i], "s "                  ,lens[i]) ||
			    !strncasecmp(ptrs[i], "for more information",lens[i]))
				remove = true;
		}
		else {
			if ( !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "all rights reserved", 19) ||
			     !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "rights reserved", 15) ||
			     !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "in addition", 11) ||
			     !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "for example", 11) ||
			     !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "in order", 8) ||
			     !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "in fact", 7) ||
			     !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "in general", 10) ||
			     !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "contact us", 10) ||
			     !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "at the same time", 16) ||
			     !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "http", 4) ||
			     !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "s ", 2) ||
			     !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "for more information", 20) )
				remove = true;
		}
		if ( remove ) {
			lens[i] = 0;
			// count him
			removed++;
		}
	}
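	// (Note: the 8-bit and UCS-2 stoplists above are presumably meant
	// to match, but the UCS-2 branch checks one fewer phrase -- "html"
	// is only filtered in the 8-bit case.)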
	QUICKPOLL(niceness);
	// now after longer topics replaced the shorter topics which they
	// contained, remove the longer topics if they have too many words
	// or bad punctuation
	for ( int32_t i = 0 ; i < np ; i++ ) {
		// skip if nuked already
		if ( lens[i] == 0 ) continue;
		if ( ! ptrs[i]   ) continue;

		Words w;
		w.set ( false , false, ptrs[i] , lens[i] ,
			TITLEREC_CURRENT_VERSION, false, false, niceness );
		int32_t nw = w.getNumWords();
		// . does it have a comma? or other punct besides an apostrophe?
		// . we allow gigabit phrases to incorporate a long stretch
		//   of punct... only before the LAST word in the phrase,
		//   that way our overlap removal still works well.
		bool hasPunct = false;
		for ( int32_t k = 0 ; k < lens[i] ; k++ ) {
			if ( ! is_punct(ptrs[i][k]) ) continue;
			// apostrophe is ok as long as alnum follows
			if ( ptrs[i][k] == '\'' &&
			     is_alnum(ptrs[i][k+1]) ) continue;
			// . period ok, as long as space or alnum follows
			// . if space follows, then an alnum must follow that
			// . same goes for colon
			QUICKPOLL(niceness);

			// . for now, until we get abbreviations working,
			//   alnum must follow the period
			if ( (ptrs[i][k] == '.' || ptrs[i][k] == ':' ) &&
			     ( is_alnum(ptrs[i][k+1]) ||
			       // accept single initial before the period, too
			       (ptrs[i][k+1] == ' ' && is_alnum(ptrs[i][k+2])
				&& k >= 2 && ptrs[i][k-2] == ' ')))
				continue;
			// comma is ok if surrounded by digits
			if ( (ptrs[i][k] == ',' &&
			      is_digit(ptrs[i][k-1]) &&
			      is_digit(ptrs[i][k+1]) )) continue;
			// percent is ok
			if ( ptrs[i][k] == '%' ) continue;
			if ( ptrs[i][k] == '&' ) continue;
			if ( ptrs[i][k] == '@' ) continue;
			if ( ptrs[i][k] == '-' ) continue;
			//if ( ptrs[i][k] == '(' ) continue;
			//if ( ptrs[i][k] == ')' ) continue;
			hasPunct = true;
			break;
		}
		// keep it if words are under the limit
		// and it has no bad punctuation
		if ( nw <= 2*t->m_maxWordsPerTopic - 1 && ! hasPunct )
			continue;
		lens[i] = 0;
		removed++;
	}

	QUICKPOLL(niceness);
	// if we removed enough to fall below what we need, redo
	got = np - removed;
	if ( got >= need ) goto skipdedup;
	// if we already did all from "master", no more left!
	if ( np >= master->getNumTermsUsed() ) goto skipdedup;
	// if we didn't have enough raw results, do not redo it
	if ( ! callRedo ) goto skipdedup;
	// or if we already hit MAX_TOPICS
	//if ( maxWinners >= MAX_TOPICS ) goto skipdedup; mdw
	if ( got == 0 ) maxWinners = maxWinners * 2;
	else            maxWinners = ((int64_t)maxWinners *
				      (int64_t)need * 110LL) /
				     ((int64_t)got * 100LL) + 10;
	goto redo; // mdw
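	// (The redo path regrows maxWinners proportionally: if dedup kept
	// only "got" of the "need" topics, the next pass asks for roughly
	// maxWinners * need / got with 10% slack plus 10 extra, so a pass
	// or two is normally enough.)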

 skipdedup:

	// free the repeat table if it allocated mem
	if ( repeatTable != tmpBuf2 ) {
		mfree ( repeatTable , need2 , "Msg24" );
		repeatTable = NULL;
	}

	// how much space do we need for the reply?
	int32_t size = 0;
	// 4 bytes for number of topics
	size += 4;
	// then how much for each topic?
	int32_t ntp = 0;
	for ( i = 0 ; i < np ; i++ ) {
		// cutoff at min score
		if ( scores[i] < t->m_minTopicScore ) continue;
		// skip if length is 0, it was a dup from above
		if ( lens[i] <= 0 ) continue;
		// we always get the count now
		if ( st->m_returnDocIds ) {
			int32_t count = 0;
			DocIdLink *link = (DocIdLink *)(st->m_mem+heads[slots[i]]);
			while ( (char *)link >= st->m_mem ) {
				count++;
				link = (DocIdLink *)(st->m_mem + link->m_next);
			}
			// space for the docids if they want them
			size += 8 * count;
			// sanity check
			if ( count != pages[i] ) { char *xx = NULL; *xx = 0; }
		}
		// ptr, score, length, gid and the topic string itself
		// (include \0 for null termination)
		size += 4 + 4 + 4 + 1 + lens[i] + 1;
		// . do we send back docid info?
		// . each termId can have a linked list of docids
		// . how many are in that list? (0 if none)
		size += 4;
		// 4 bytes for the dummy place holder. each one of these
		// can be a ptr to the list of docids, but it will be NULL
		// if we do not have a list of docids for this gigabit.
		size += 4;
		// the popularity... topic pop
		size += 4;
		// count the number of topics we'll store
		ntp++;
	}
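	// illustrative sketch, not from the original source, of the reply
	// layout built below for ntp topics:
	//   int32_t ntp
	//   int32_t pptrs  [ntp]  (offsets into the text area)
	//   int32_t pscores[ntp]
	//   int32_t plens  [ntp]
	//   int32_t ndocids[ntp]
	//   int32_t dptrs  [ntp]  (NULL placeholders, fixed up on deserialize)
	//   int32_t ppops  [ntp]
	//   char    pgids  [ntp]
	//   char    text   []     (ntp \0-terminated topic strings)
	//   int64_t docids []     (ndocids[j] docids per topic, if returned)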

	// realloc reply
	newSize = *bufSize + size;
	char *s = (char *) mrealloc ( *buf , *bufSize , newSize , "Msg24f" );
	if ( ! s ) {
		if ( tmpBuf ) mfree ( tmpBuf , tmpSize , "Msg24" );
		if ( *buf   ) mfree ( *buf   , *bufSize , "Msg24" );
		*buf     = NULL;
		*bufSize = 0;
		// free the links in the linked list, if any
		if ( st->m_mem ) {
			mfree ( st->m_mem, st->m_memEnd - st->m_mem, "Msg24" );
			st->m_mem    = NULL;
			st->m_memEnd = NULL;
			st->m_memPtr = NULL;
		}
		if ( vecs != vbuf ) mfree ( vecs , vneed , "Msg24" );
		return log("topics: Realloc reply buf to %"INT32" failed.",newSize);
	}
	// we realloc'd successfully, use it
	*buf = s;
	// copy into reply after previous topic groups
	char *p = *buf + *bufSize;
	// serialize ourselves into the buffer
	//serialize2 ( p , ptrs , scores , lens , gids );
	// store number of topics first
	*(int32_t *)p = ntp; p += 4;
	// arrays first
	char    **pptrs   = (char    **)p; p += ntp * 4;
	int32_t  *pscores = (int32_t  *)p; p += ntp * 4;
	int32_t  *plens   = (int32_t  *)p; p += ntp * 4;
	int32_t  *ndocids = (int32_t  *)p; p += ntp * 4;
	int64_t **dptrs   = (int64_t **)p; p += ntp * 4; // place holder
	int32_t  *ppops   = (int32_t  *)p; p += ntp * 4;
	char     *pgids   = (char     *)p; p += ntp ;
	char     *ptext   = p;
	int32_t j = 0;
	for ( i = 0 ; i < np ; i++ ) {
		// cutoff at min score
		if ( scores[i] < t->m_minTopicScore ) continue;
		// skip if length is 0, it was a dup from above
		if ( lens[i] <= 0 ) continue;
		// store it. the ptr is an offset into the text area so the
		// reply stays position-independent.
		pptrs   [j] = (char *)(ptext - p);
		pscores [j] = scores [i];
		plens   [j] = lens   [i];
		pgids   [j] = gid;
		if ( pops ) ppops [j] = pops[slots[i]];
		else        ppops [j] = 0;
		ndocids [j] = 0;
		dptrs   [j] = NULL; // dummy placeholder
		gbmemcpy ( ptext , ptrs[i] , lens[i] ); ptext += lens[i];
		//if ( hashes && j < GIGABITS_IN_VECTOR )
		//	hashes[j] = hash32Lower (ptrs[i],lens[i]);
		*ptext++ = '\0';
		j++;
	}

	QUICKPOLL(niceness);

	// fill in docid info
	if ( st->m_returnDocIdCount || st->m_returnDocIds ) {
		// reset j for this repeat loop
		j = 0;
		// this loop header is the same as above
		for ( i = 0 ; i < np ; i++ ) {
			// cutoff at min score
			if ( scores[i] < t->m_minTopicScore ) continue;
			// skip if length is 0, it was a dup from above
			if ( lens[i] <= 0 ) continue;
			// count em
			int32_t count = 0;
			DocIdLink *link = (DocIdLink *)(st->m_mem+heads[slots[i]]);
			while ( (char *)link >= st->m_mem ) {
				count++;
				if ( st->m_returnDocIds ) {
					*(int64_t *)ptext = link->m_docId;
					ptext += 8;
				}
				link = (DocIdLink *)(st->m_mem + link->m_next);
			}
			ndocids[j] = count;
			j++;
		}
	}
	//skipd:
	// update buf parms for re-calls
	*bufSize = newSize;

	// free tmp buf
	mfree ( tmpBuf , tmpSize , "Msg24" );
	// free the links in the linked list, if any
	if ( st->m_mem ) {
		mfree ( st->m_mem , st->m_memEnd - st->m_mem , "Msg24" );
		st->m_mem    = NULL;
		st->m_memEnd = NULL;
		st->m_memPtr = NULL;
	}
	if ( vecs != vbuf ) mfree ( vecs , vneed , "Msg24" );
	// copy into reply topic buf
	//char *start = slot->m_tmpBuf;
	//char *p     = slot->m_tmpBuf;
	//char *pend  = p + TMPBUFSIZE;
	/*
	for ( i = 0 ; i < np ; i++ ) {
		// cutoff at min score
		if ( scores[i] < t->m_minTopicScore ) continue;
		// skip if length is 0, it was a dup from above
		if ( lens[i] <= 0 ) continue;
		if ( p + lens[i] + 9 >= pend ) break;
		*(int32_t *)p = scores[i]; p += 4;
		*(int32_t *)p = lens  [i]; p += 4;
		*(char    *)p = gid     ; p += 1;
		gbmemcpy ( p , ptrs[i] , lens[i] ); p += lens[i];
		*p++ = '\0';
	}
	*/
	return true;
}

/*
bool onSamePages ( int32_t i, int32_t j, int32_t *slots, int32_t *heads,
		   int32_t *pages ) {
	if ( pages[i] != pages[j] ) return false;
	DocIdLink *link1 = (DocIdLink *)(st->m_mem+heads[slots[i]]);
	DocIdLink *link2 = (DocIdLink *)(st->m_mem+heads[slots[j]]);
	while ( (char *)link1 >= st->m_mem ) {
		if ( link1->m_docId != link2->m_docId ) return false;
		link1 = (DocIdLink *)(st->m_mem + link1->m_next);
		link2 = (DocIdLink *)(st->m_mem + link2->m_next);
	}
	return true;
}
*/

void hashExcerpt ( Query *q , uint64_t *qids , int32_t *qpops ,
		   int32_t nqi , TermTable *tt , char *buf , int32_t bufLen ,
		   Words *w , TopicGroup *t , Scores *scoresPtr ,
		   bool isUnicode , char *repeatTable ,
		   int32_t repeatTableNumSlots , char language );

// . returns false and sets g_errno on error
// . here's the tricky part
// . *nqiPtr is how many query terms we used - so caller can normalize scores
bool hashSample ( Query *q, char *bigSampleBuf , int32_t bigSampleLen ,
		  TermTable *master, int32_t *nqiPtr , TopicGroup *t ,
		  State24 *st, int64_t docId ,
		  char *vecs , int32_t *numVecs ,
		  Words *wordsPtr , Scores *scoresPtr , bool isUnicode ,
		  char *repeatTable , int32_t repeatTableNumSlots ,
		  char language ) {
	// numTerms must be less than this
	//if ( q && q->getNumTerms() > MAX_QUERY_TERMS ) (aac)
	if ( q && q->m_numWords > MAX_QUERY_TERMS )
		return log("topics: Too many query terms for "
			   "topic generation.");

	//bool returnDocIdCount = st->m_returnDocIdCount;
	//bool returnDocIds     = st->m_returnDocIds;
	bool returnPops = st->m_returnPops;

	// this is the pure content now
	char *content      = bigSampleBuf;
	int32_t contentLen = bigSampleLen;
	// we could truncate it to 50k, that's enough
	//if ( contentLen > 50*1024 ) contentLen = 50*1024;
	// bail if empty!
	if ( ! wordsPtr && (! content || contentLen <= 0) ) {
		log("topics: Got empty document for topic generation.");
		return true;
	}
	// make buf point to the available space
	char *buf = content;
	// get length of the buffer
	int32_t bufLen = contentLen;

#ifdef DEBUG_MSG24
	if (q) {
		log("topics: Query stats in hashSample");
		int32_t numQT = q->getNumTerms();
		int32_t numQW = q->m_numWords;
		log("topics: \tnumQueryTerms = %"INT32"", numQT);
		log("topics: \tnumQueryWords = %"INT32"", numQW);
		char *thisQT, *thisQW, iCode, tmpBuf[1024];
		int32_t qtLen, qwLen, i, j, k;
		for (i = 0; i < numQT; i++) {
			thisQT = q->getTerm(i);
			qtLen  = q->getTermLen(i);
			k = 0;
			for (j = 0; j < qtLen && k < 1023; j++) {
				if (thisQT[j]) tmpBuf[k++] = thisQT[j];
			};
			tmpBuf[k] = '\0';
			log ("topics: \tQT[%"INT32"] = %s", i, &tmpBuf[0]);
		};
		for (i = 0; i < numQW; i++) {
			thisQW = q->m_qwords[i].m_word;
			qwLen  = q->m_qwords[i].m_wordLen;
			iCode  = q->m_qwords[i].m_ignoreWord;
			k = 0;
			for (j = 0; j < qwLen && k < 1023; j++) {
				if (thisQW[j]) tmpBuf[k++] = thisQW[j];
			};
			tmpBuf[k] = '\0';
			log ("topics: \tQW[%"INT32"] = %s,\tignore = %i",
			     i, &tmpBuf[0], iCode);
		};
	};
#endif

	// get query hashes/ids, 64 bit, skip phrases
	uint64_t qids [MAX_QUERY_TERMS];
	int32_t  qpops[MAX_QUERY_TERMS];
	int32_t  nqi = 0;
	//for ( int32_t i=0 ; q && i<q->getNumTerms() && nqi<MAX_QUERY_TERMS; i++){ (aac)
	for ( int32_t i=0 ; q && i < q->m_numWords && nqi<MAX_QUERY_TERMS; i++){
		//if ( q->isPhrase (i) ) continue; (aac)
		//if ( q->isQueryStopWord(i) ) continue; (aac)
		char ignCode = q->m_qwords[i].m_ignoreWord;
		if ( ignCode && ignCode != 8 ) continue;
		char *s      = q->m_qwords[i].m_word;    // q->getTerm(i); (aac)
		int32_t slen = q->m_qwords[i].m_wordLen; // q->getTermLen(i); (aac)
		int32_t qpop;
		int32_t encodeType = csISOLatin1;
		if ( q->isUnicode() ) encodeType = csUTF16;
		qids[nqi] = hash64d(s, slen, encodeType);
		qpop = g_speller.getPhrasePopularity(s, qids[nqi], true,
						     language);
		// map the raw popularity into a score multiplier: the rarer
		// the query word, the bigger the multiplier
		if      ( qpop < QPOP_ZONE_0 ) qpop = QPOP_MULT_0;
		else if ( qpop < QPOP_ZONE_1 ) qpop = QPOP_MULT_1;
		else if ( qpop < QPOP_ZONE_2 ) qpop = QPOP_MULT_2;
		else if ( qpop < QPOP_ZONE_3 ) qpop = QPOP_MULT_3;
		else if ( qpop < QPOP_ZONE_4 ) qpop = QPOP_MULT_4;
		else                           qpop = 1;
		// qpop = 1; // this makes no sense here (aac)
		qpops[nqi] = qpop;
		nqi++;
	}
	// tell caller how many query terms we used so he can normalize scores
	*nqiPtr = nqi;
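	// illustrative note, not from the original source: the zone mapping
	// above gives rarer query words bigger multipliers, e.g. a word with
	// raw phrase popularity 5 falls under QPOP_ZONE_0 and gets the 10x
	// QPOP_MULT_0, popularity 200 falls under QPOP_ZONE_4 and gets only
	// 2x, and anything at 300 or above gets no boost at all (1x).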

	//int64_t start = gettimeofdayInMilliseconds();

	TermTable tt;
	if ( ! tt.set(20000,true,true, false , returnPops, false, false,NULL)){
		log("topics: Had error allocating a table for topic "
		    "generation: %s.",mstrerror(g_errno));
		//mfree ( buf , bufMaxLen , "Msg24" );
		return true;
	}

	Words w;

	// ---> TODO: a word next to both query terms should not be beaten
	//      by a word next to just one....
	// ---> TODO: weight by query popularity too!

	//log("******** hashing doc *********");

	// hash each excerpt
	char *p = buf;
	// most samples are under 5k, i've seen a 32k sample take 11ms!
	char *pend = buf + bufLen;
	while ( p < pend ) {
		// debug
		//log("docId=%"INT64" EXCERPT=%s",docId,p);
		int32_t plen ;
		if ( isUnicode ) plen = ucStrNLen(p,pend-p);
		else             plen = strlen(p);
		// p is only non-NULL if we are doing it the old way
		hashExcerpt ( q, qids, qpops, nqi, &tt, p, plen, &w, t , NULL,
			      isUnicode , repeatTable , repeatTableNumSlots ,
			      language );
		// advance to next excerpt
		if ( isUnicode ) p += plen + 2;
		else             p += plen + 1;
	}

	// hash the provided wordsPtr as one excerpt if there
	if ( wordsPtr )
		hashExcerpt ( q, qids, qpops, nqi, &tt, NULL,0, wordsPtr, t ,
			      scoresPtr , isUnicode ,
			      repeatTable , repeatTableNumSlots ,
			      language );

	// . compute the fingerprint/similarityVector from this table
	//   the same way we do for documents for deduping them at query time
	// . or we could just wait for our dedup algo to kick in... (mdw)
	//   then comment this stuff out ...
	if ( t->m_dedupSamplePercent >= 0 ) {
		char *v1 = vecs + (*numVecs * SAMPLE_VECTOR_SIZE);
		g_clusterdb.getSampleVector ( v1 , &tt );
		// compare to others done so far
		char *v2 = vecs ;
		for ( int32_t i = 0 ; i < *numVecs ; i++,v2+=SAMPLE_VECTOR_SIZE){
			char ss = g_clusterdb.getSampleSimilarity(v1,v2,
							SAMPLE_VECTOR_SIZE);
			// return true if too similar to another sample we did
			if ( ss >= t->m_dedupSamplePercent ) { // 80 ) {
				log(LOG_DEBUG,"topics: removed dup sample.");
				return true;
			}
		}
		// we have another vector to contend with for next time
		*numVecs = *numVecs + 1;
	}

	//log("TOOK %"INT64" ms plen=%"INT32"",gettimeofdayInMilliseconds()-start,
	//    bufLen);

	// . this termtable carries two special buckets per slot in order
	//   to hold a linked list of docids with each termid in the hash table
	// . heads is NULL if returnDocIdCount and returnDocIds are false
	int32_t *heads = master->getHeads();
	// . now hash the entries of this table, tt, into the master
	// . the master contains entries from all the other tables
	//log("have %"INT32" terms in termtable. adding to master.",
	//    tt.getNumTermsUsed());
	int32_t nt  = tt.getNumTerms();
	int32_t pop = 0 ;
	for ( int32_t i = 0 ; i < nt ; i++ ) {
		// skip empty buckets (use the raw score array for speed)
		//if ( ! tt.getScoreFromTermNum(i) ) continue;
		if ( ! tt.m_scores[i] ) continue;
		//int32_t ii = (int32_t)tt.getTermPtr(i);
		// then divide by that
		int32_t score = tt.getScoreFromTermNum(i) ;
		// watch out for 0
		if ( score <= 0 ) continue;
		// . get the bucket
		// . may or may not be full (score is 0 if empty)
		int32_t n = master->getTermNum ( tt.getTermId(i) );
		// skip if 0, i've seen this happen before
		if ( tt.getTermId(i) == 0 ) continue;
		// . but now we add one more thing to the termtable,
		//   a linked list field for keeping track of the docids
		//   of the documents that contain each termid
		// . grab some mem for the link
		// . "heads" is NULL if we should not do this...
		if ( heads ) {
			if ( st->m_memPtr + sizeof(DocIdLink) > st->m_memEnd ) {
				int32_t oldSize = st->m_memEnd - st->m_mem;
				int32_t newSize = oldSize + 256*1024;
				char *s = (char *)mrealloc(st->m_mem,oldSize,
							   newSize,"Msg24g");
				if ( !s )
					return log("Msg24: realloc failed.");
				int32_t off = st->m_memPtr - st->m_mem;
				st->m_mem    = s;
				st->m_memEnd = s + newSize;
				st->m_memPtr = s + off;
			}
			DocIdLink *link = (DocIdLink *)st->m_memPtr;
			st->m_memPtr += sizeof(DocIdLink);
			link->m_docId = docId;
			// if empty... make new head
			if ( master->m_scores[n] == 0 ) {
				link->m_next = -1;
				master->m_heads[n] = (char *)link - st->m_mem;
			}
			// otherwise, prepend the link as the new head of
			// this bucket's chain
			else {
				link->m_next = master->m_heads[n];
				master->m_heads[n] = (char *)link - st->m_mem;
			}
		}
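			// illustrative note, not from the original source:
			// the docid list is threaded through the st->m_mem
			// arena by byte offset rather than by raw pointer, so
			// the arena can be mrealloc'd above without fixing up
			// every link; m_next == -1 resolves to an address
			// below st->m_mem and so terminates the
			// "(char *)link >= st->m_mem" walks used elsewhere.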
		if ( returnPops ) pop = tt.m_pops[i];
		// set hi bit of "pop" if in unicode
		if ( isUnicode ) pop |= 0x80000000;
		else             pop &= 0x7fffffff;
		// . add term to master table
		// . don't keep filling it up if we failed to alloc more space
		//   because that causes getTermNum() above to crash if the
		//   table is 100% full.
		if ( ! master->addTerm ( tt.getTermId(i) ,
					 // divide by the AVG score used
					 //tt.getScoreFromTermNum(i)+30000/pop,
					 score ,
					 //tt.getScoreFromTermNum(i)+30000,
					 0x7fffffff ,
					 false ,
					 TITLEREC_CURRENT_VERSION ,
					 tt.getTermPtr(i) ,
					 tt.getTermLen(i) ,
					 n ,// termNum
					 NULL ,// dummy(char *)link
					 pop,
					 isUnicode ) )
			break;
		// debug msg
		if ( g_conf.m_logDebugQuery ) {
			char *ww      = tt.getTermPtr(i);
			int32_t wwlen = tt.getTermLen(i);
			char c = ww[wwlen];
			ww[wwlen]='\0';
			log(LOG_DEBUG,"topics: master termId=%"UINT32" "
			    "score=%"INT32" cumscore=%"INT32" len=%"INT32" term=%s\n",
			    (int32_t)tt.getTermId(i),
			    score,master->getScoreFromTermId(tt.getTermId(i)),
			    wwlen,ww);
			ww[wwlen]=c;
		}
	}

	//log("master has %"INT32" terms",master->getNumTermsUsed());
	// clear any error
	if ( g_errno ) {
		log("topics: Had error getting topic candidates from document: "
		    "%s.",mstrerror(g_errno));
		g_errno = 0;
	}
	//mfree ( buf , bufMaxLen , "Msg24" );
	return true;
}


void hashExcerpt ( Query *q , uint64_t *qids , int32_t *qpops, int32_t nqi,
		   TermTable *tt , char *buf , int32_t bufLen ,
		   Words *w , TopicGroup *t , Scores *scoresPtr ,
		   bool isUnicode , char *repeatTable ,
		   int32_t repeatTableNumSlots , char language ) {
	// . bring it out
	// . allow one more word per gigabit, then remove gigabits that
	//   are that length. this fixes the problem of having the same
	//   sentence repeated in different documents, which are fairly
	//   different as a whole, but have the same repeated sentence or
	//   paragraph.
	// . by only adding one, if the next word is a common word then
	//   we would fail to make a larger gigabit, that's why i added
	//   the maxjend code below this.
	int32_t maxWordsPerPhrase = t->m_maxWordsPerTopic ;
	if ( t->m_topicRemoveOverlaps ) maxWordsPerPhrase += 2;
	char enforceQueryRadius = ! t->m_meta[0];
	char delimeter          = t->m_delimeter; // 0 means none (default)
	char idf                = t->m_useIdfForTopics;
	// or if no query, no query radius
	if ( ! q || q->getNumNonFieldedSingletonTerms() == 0 )
		enforceQueryRadius = false;
	// . now all the data is in buf/bufLen
	// . parse it up into Words
	// . now XmlDoc::getGigabitVector() calls us and it already has the
	//   Words passed up, so it will use a NULL buf
	if ( buf ) w->set ( isUnicode , // isUnicode?
			    false ,    // isNormalized?
			    buf ,
			    bufLen ,
			    TITLEREC_CURRENT_VERSION,
			    true ,     // compute word ids?
			    true );    // has html entities?
	int32_t nw = w->getNumWords();
	// don't breach our arrays man
	if ( nw > 10000 ) nw = 10000;
	void *lrgBuf;
	int32_t lrgBufSize = 0;
	lrgBufSize += 1002 * MAX_QUERY_TERMS * sizeof(int32_t);
	lrgBufSize += 2 * nw * sizeof(int32_t);
	lrgBufSize += 3 * nw * sizeof(char);
	lrgBufSize += nw * sizeof(uint64_t);
	lrgBuf = (char *)mmalloc(lrgBufSize, "hashExcerpt (Msg24)");
	if (! lrgBuf) {
		// on failure, retry with a quarter of the words
		nw >>= 2;
		lrgBufSize = 0;
		lrgBufSize += 1002 * MAX_QUERY_TERMS * sizeof(int32_t);
		lrgBufSize += 2 * nw * sizeof(int32_t);
		lrgBufSize += 3 * nw * sizeof(char);
		lrgBufSize += nw * sizeof(uint64_t);
		lrgBuf = (char *)mmalloc(lrgBufSize, "hashExcerpt (Msg24)");
	};
	if (! lrgBuf) {
		log("topics: could not allocate local buffer "
		    "(%"INT32" bytes required)", lrgBufSize);
		return;
	};
	char *lrgBufPtr = (char *)lrgBuf;
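	// illustrative note, not from the original source: lrgBuf is carved
	// up below into parallel per-word arrays (pops, iqt, icw, qtrs, wids
	// and repeatScores) plus the per-query-term position lists (pos,
	// posLen, posPtr); those pieces sum to exactly the lrgBufSize
	// computed above.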

	// . the popularity of word #i is pops[i]
	// . but we only set it below if we need to
	int32_t *pops = (int32_t *) lrgBufPtr; // popularity, 1-1 with the
	lrgBufPtr += nw * sizeof(int32_t);     // first 10000 words
	char *iqt = lrgBufPtr; // is query term? 1-1 with words
	lrgBufPtr += nw * sizeof(char);
	char *icw = lrgBufPtr; // do not let frags end in these words
	lrgBufPtr += nw * sizeof(char);
	int32_t *qtrs = (int32_t *)lrgBufPtr; // the raw QTR scores (aac)
	lrgBufPtr += nw * sizeof(int32_t);

	// record list of word positions for each query term
	int32_t *pos = (int32_t *)lrgBufPtr;
	lrgBufPtr += MAX_QUERY_TERMS * 1000 * sizeof(int32_t);
	int32_t *posLen = (int32_t *)lrgBufPtr;
	lrgBufPtr += MAX_QUERY_TERMS * sizeof(int32_t);
	int32_t *posPtr = (int32_t *)lrgBufPtr;
	lrgBufPtr += MAX_QUERY_TERMS * sizeof(int32_t);
	//for ( int32_t i = 0 ; q && i < q->getNumTerms() ; i++ ) { (aac)
	for (int32_t i = 0; q && i < q->m_numWords && i < MAX_QUERY_TERMS; i++) {
		posLen[i] = 0; posPtr[i] = 0; }

	// skip punct
	int32_t i = 0;
	if ( i < nw && w->isPunct(i) ) i++;
	qtrs[i] = 0;
	uint64_t *wids = (uint64_t *)lrgBufPtr;
	lrgBufPtr += nw * sizeof(uint64_t);
	// record the positions of all query words
	char **wp       = w->m_words;
	int32_t *wlen   = w->m_wordLens;
	int32_t step    = 2;
	int64_t *rwids  = w->getWordIds();
	int32_t *scores = NULL;

	// . now we keep a hash table to zero out repeated fragments
	// . it uses a sliding window of 5 words
	// . it stores the hash of those 5 words in the hash table
	// . it sees how many 5-word matches it gets in a row
	// . the more matches it gets, the more it demotes the word scores
	// . these are stored in the weights class
	// . a repeatScore of 0 means to demote it out completely, 100 means
	//   it is not repeated at all
	// . multiply the final gigabit score by the repeatScore/100.
	char *repeatScores = lrgBufPtr;
	lrgBufPtr += nw * sizeof(char);
	setRepeatScores ( repeatScores , rwids , nw , repeatTable ,
			  repeatTableNumSlots , w );

	QUICKPOLL(0);
	// single char length in bytes, etc.
	char oneChar    = 1;
	char twoChars   = 2;
	char threeChars = 3;
	if ( isUnicode ) {
		oneChar    = 2;
		twoChars   = 4;
		threeChars = 6;
	}
	// . advance one word at a time if doing it the new way
	// . also, the word ids will already be set, so use those to see what
	//   is indexable and what isn't
	if ( ! buf ) {
		step   = 1;
		scores = scoresPtr->m_scores;
	}
	// loop over the words in our sample
	//for ( ; i < nw ; i += 2 ) {
	for ( ; i < nw ; i += step ) {
		qtrs[i] = 0;
		// do we have pre-supplied words and scores from XmlDoc.cpp?
		//if ( rwids ) {
		// skip if not indexable
		if ( ! rwids[i] ) continue;
		// or if score is <= 0
		if ( scores && scores[i] <= 0 ) continue;
		// or repeated too much
		if ( repeatScores[i] <= 20 ) continue;
		//}
		// reset popularity
		if ( idf ) pops[i] = -1;
		else       pops[i] = 1; // assume all same if not using idf
		// reset "is query term" array
		iqt[i] = 0;
		// store the id
		int32_t encodeType = csISOLatin1;
		if ( isUnicode ) encodeType = csUTF16;
		wids[i] = hash64d(wp[i], wlen[i], encodeType);
		// . is it a common word?
		// . it is if it is just one letter
		// . what about X-windows coming up for a 'windows' query?
		//   or e-mail coming up for a query?
		// . METALINCS likes to have 1 digit topics
		if ( wlen[i] <= oneChar && is_lower(wp[i][0]) ) icw[i] = 1;
		// unicode ~equivalent
		//if ( isUnicode && wlen[i] == 2 ) icw[i] = 1;
		// 2004 is common here but if it makes it in, don't remove it
		// in the top topics list... no. loses 'atari 2600' then!
		//else if ( is_digit(w->getWord(i)[0]) )
		//	icw[i] = 1;
#ifndef _METALINCS_
		else icw[i] = isCommonWord ( (int32_t)rwids[i] );
#else
		// always allow gigabits that start with numbers for metalincs
		else if ( ! is_digit(wp[i][0]))
			icw[i] = isCommonWord ( (int32_t)rwids[i] );
		else
			icw[i] = 0;
#endif
		// debug msg
		/*
		char *s = w->getWord(i);
		int32_t slen = w->getWordLen(i);
		char c = s[slen];
		s[slen]='\0';
		log("icw=%"INT32" %s",icw[i],s);
		s[slen]=c;
		*/
		// is it a query term? if so, record its word # in "pos" array
		for ( int32_t j = 0 ; j < nqi ; j++ ) {
			if ( wids[i] != qids[j] ) continue;
			if ( posLen[j] >= 1000 ) continue;
			pos    [ 1000 * j + posLen[j] ] = i;
			posLen [ j ]++;
			// mark this word so if a phrase only has
			// all query terms we do not hash it
			iqt[i] = 1;
			break;
		}
	}

	QUICKPOLL(0);
	// max score -- ONE max scoring hit per doc
	int32_t maxScore = nqi * MAX_SCORE_MULTIPLIER;
	// this happens when generating the gigabit vector for a single doc
	// so don't hamper it with such a small ceiling
	if ( nqi == 0 ) maxScore = ALT_MAX_SCORE;

	// skip punct
	i = 0;
	if ( i < nw && w->isPunct(i) ) i++;
	// score each word based on distance to query terms
	int32_t score;
	// loop through all the words
	//for ( ; i < nw ; i += 2 ) {x
	for ( ; i < nw ; i += step ) {
		// debug point
		//if ( strncasecmp( wp[i],"Microsoft",9) == 0 )
		//	log("hey");
		// do we have pre-supplied words and scores from XmlDoc.cpp?
		//if ( rwids ) {
		// skip if not indexable
		if ( ! rwids[i] ) continue;
		// or if score is <= 0
		if ( scores && scores[i] <= 0 ) continue;
		//}
		// skip if in a repeat chunk of doc
		if ( repeatScores[i] <= 20 ) continue;
		// protect against misspelled html entities (aac)
		if ( (wp[i][-oneChar] == '&' && is_alnum(wp[i][0])) ||
		     (wp[i][0] == '&' && is_alnum(wp[i][oneChar])) ) continue;
		// no more one or two letter gigabits (aac)
		if ( wlen[i] < threeChars && (! is_digit(wp[i][0])) ) continue;
		//continue; //mdw
		// if we had a delimeter, previous word must have it
		// or be the first punct word
		if ( delimeter && i >= 2 && ! w->hasChar(i-1,delimeter) )
			continue;
		// skip if a query term, it's ineligible
		//if ( w->getWordLen(i) == 0 ) continue;
		// if query is NULL, assume we are restricting to meta tags
		// and query is not necessary
		if ( enforceQueryRadius ) score = 0;
		else                      score = ALT_START_SCORE;
		int32_t j ;
		int32_t nm = 0; // number of matches
		for ( j = 0 ; j < nqi ; j++ ) {
			// skip if no query terms in doc for query term #j
			if ( posLen[j] <= 0 ) continue;
			// get distance in words
			int32_t d1 = i - pos[ 1000 * j + posPtr[j] ] ;
			if ( d1 < 0 ) d1 = d1 * -1;
			if ( posPtr[j] + 1 >= posLen[j] ) {
				if (d1 >= QTR_ZONE_3) continue;
				if (iqt[i] || icw[i] ||
				    wlen[i] <= threeChars) {
					// common words, query terms and short
					// words are all second class citizens
					// when it comes to scoring: they get
					// a small bonus, to ensure that they
					// are considered in the next stage,
					// but do not benefit from QPOP and
					// multiple hit bonuses (aac)
					score += QTR_BONUS_CW;
					continue;
				};
				if (d1 < QTR_ZONE_0)
					score += QTR_BONUS_0;
				else if (d1 < QTR_ZONE_1)
					score += QTR_BONUS_1;
				else if (d1 < QTR_ZONE_2)
					score += QTR_BONUS_2;
				else
					score += QTR_BONUS_3;
				nm++;
				score *= qpops[j];
				continue;
			}
			int32_t d2 = pos[ 1000 * j + posPtr[j] + 1 ] - i ;
			if ( d2 < 0 ) d2 = d2 * -1;
			if ( d2 > d1 ) {
				// if ( d1 >=20 ) continue;
				// if ( d1 < 4 ) score += 1000;
				// else if ( d1 < 8 ) score += 800;
				// else if ( d1 < 12 ) score += 500;
				// else score += 200;
				// nm++;
				// score *= qpops[j];
				// continue;
				if (d1 >= QTR_ZONE_3) continue;
				if (iqt[i] || icw[i] ||
				    wlen[i] <= threeChars) {
					// common words, query terms and short
					// words are all second class citizens
					// when it comes to scoring: they get
					// a small bonus, to ensure that they
					// are considered in the next stage,
					// but do not benefit from QPOP and
					// multiple hit bonuses (aac)
					score += QTR_BONUS_CW;
					continue;
				};
				if (d1 < QTR_ZONE_0)
					score += QTR_BONUS_0;
				else if (d1 < QTR_ZONE_1)
					score += QTR_BONUS_1;
				else if (d1 < QTR_ZONE_2)
					score += QTR_BONUS_2;
				else
					score += QTR_BONUS_3;
				nm++;
				score *= qpops[j];
				continue;
			}
			// if ( d2 >=20 ) { posPtr[j]++; continue; }
			// if ( d2 < 4 ) score += 1000;
			// else if ( d2 < 8 ) score += 800;
			// else if ( d2 < 12 ) score += 500;
			// else score += 200;
			// nm++;
			// score *= qpops[j];
			if (d2 >= QTR_ZONE_3) { posPtr[j]++; continue; };
			if (iqt[i] || icw[i] || wlen[i] <= threeChars) {
				// common words, query terms and short words
				// are all second class citizens when it
				// comes to scoring: they get a small
				// bonus, to ensure that they are
				// considered in the next stage, but do not
				// benefit from QPOP and multiple hit
				// bonuses (aac)
				score += QTR_BONUS_CW;
				continue;
			};
			if      (d2 < QTR_ZONE_0) score += QTR_BONUS_0;
			else if (d2 < QTR_ZONE_1) score += QTR_BONUS_1;
			else if (d2 < QTR_ZONE_2) score += QTR_BONUS_2;
			else                      score += QTR_BONUS_3;
			nm++;
			score *= qpops[j];
			continue;
			// note: unreachable, the continue above always fires
			posPtr[j]++;
		}

		// skip if too far away from all query terms
		if ( score <= 0 ) continue;

		// no longer count closeness to query terms for score,
		// just use # times topic is in doc(s) and popularity
		//score = 1000;

		// set pop if it is -1
		if ( pops[i] == -1 ) {
			pops[i] = g_speller.
				getPhrasePopularity( wp[i],wids[i], true,
						     language );
			// decrease popularity by half if
			// capitalized so Jack does not have
			// same pop as "jack"
			if ( is_upper (wp[i][0]) ) pops[i] >>= 1;
			if ( pops[i] == 0 ) pops[i] = 1;
			QUICKPOLL(0);
		}

		// give a boost for multiple hits
		// the more terms in range, the bigger the boost
		if ( nm > 1 ) {
			//log("nm=%"INT32"",nm);
			score += MULTIPLE_HIT_BOOST * nm;
		};

		// save the raw QTR score
		qtrs[i] = score;
	};
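	// illustrative note, not from the original source: with the default
	// zones a word 3 positions from the nearest occurrence of query term
	// j earns QTR_BONUS_0 (1000), at distance 10 it earns QTR_BONUS_2
	// (500), and at QTR_ZONE_3 (20) or more it earns nothing; the running
	// score is then multiplied by qpops[j], so proximity to rare query
	// terms dominates the raw QTR score.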

	QUICKPOLL(0);
	int32_t mm = 0;
	// skip punct
	i = 0;
	if ( i < nw && w->isPunct(i) ) i++;
	for ( ; i < nw ; i += step ) {
		float pop;
		int32_t score;
		int32_t bonus;
		// must start with a QTR-scoring word
		if (qtrs[i] <= 0) continue;
		// add it to table
		// init for debug here
		char *ww;
		int32_t wwlen;
		//char c;
		int32_t ss;
		ww    = wp  [i]; // w->getWord(i);
		wwlen = wlen[i]; // w->getWordLen(i);
		if ( icw[i] ) {
			// . skip this and all phrases if we're "to"
			// . avoid "to use..." "to do..." "to make..." annoying
			// . "to" has score 1, "and" has score 2, "of" is 3,
			// . "the" is 4, "this" is 5
			if ( icw[i] <= 5 ) continue;
			// cannot start with any common word, unless capitalized
			if ( is_lower(wp[i][0]) ) continue;
		}
		// if a hyphen is immediately before us, we cannot start
		// a phrase... fu-ture, preven-tion
		if ( i > 0 && wp[i][-oneChar]=='-' ) continue;
		// same for colon
		if ( i > 0 && wp[i][-oneChar]==':' ) continue;
		// . if a "'s " is before us, we cannot start either
		// . "valentine's day cards"
		if ( i >= 3 &&
		     wp[i][-threeChars]=='\'' &&
		     wp[i][-twoChars  ]=='s'  &&
		     is_space(wp[i][-oneChar]) ) continue;
		// or if our first char is a digit and a "digit," is before us
		// because we don't want to break numbers with commas in them
		if ( is_digit(wp[i][0]) && i >= 2 && wp[i][-oneChar]==',' &&
		     is_digit(wp[i][-twoChars]) ) continue;
		// set initial popularity
		if (pops[i] > 0) {
			pop = ((float) pops[i]) / MAXPOP;
		}
		else {
			pop = 1.0 / MAXPOP;
		};
		// set initial score and bonus
		score = qtrs[i];
		bonus = 0;
		uint64_t h = wids[i]; // hash value
		// if first letter is upper case, double the score
		//if ( is_upper (w->getWord(i)[0]) ) score <<= 1;

		// . loop through all phrases that start with this word
		// . up to 6 real words per phrase
		// . 'j' counts our 'words', which count a run of punct as
		//   a word
		int32_t jend = i + maxWordsPerPhrase * 2; // 12;
		int32_t maxjend = jend ;
		if ( t->m_topicRemoveOverlaps ) maxjend += 8;
		if ( jend    > nw ) jend    = nw;
		if ( maxjend > nw ) maxjend = nw;

		QUICKPOLL(0);

		int32_t count = 0;
		int32_t nqc = 0; // # common/query words in our phrase
		int32_t nhw = 0; // # of "hot words" (contribute to score)
		if ( scores ) mm = scores[i];
		//for ( int32_t j = i ; j < jend ; j += 2 ) {
		for ( int32_t j = i ; j < jend ; j += step ) {
			// skip if not indexable
			if ( ! rwids[j] ) continue;
			// or if score is <= 0
			if ( scores && scores[j] <= 0 ) continue;
			if ( repeatScores[j] <= 20 ) continue;
			// no ending in "ing" unless capitalized
			if ( wlen[j] > threeChars &&
			     wp[j][wlen[j]-oneChar   ]=='g' &&
			     wp[j][wlen[j]-twoChars  ]=='n' &&
			     wp[j][wlen[j]-threeChars]=='i' &&
			     is_lower(wp[j][0]) )
				continue;
			if (j == i) {
				if (icw[j] || wlen[j] < threeChars)
					bonus -= FWC_PENALTY;
				// if word is 4 letters or more and ends in
				// "ed", do not allow it to be its own gigabit
				if ( wlen[j] > threeChars &&
				     wp[j][wlen[j]-oneChar ]=='d' &&
				     wp[j][wlen[j]-twoChars]=='e' )
					continue;
				// no more "com" gigabits, please! (aac)
				if ( wlen[j] == threeChars &&
				     wp[j][0       ]=='c' &&
				     wp[j][oneChar ]=='o' &&
				     wp[j][twoChars]=='m') continue;
			};
			// let's generalize even more! do not allow common
			// single words as gigabits, with 250+ pop
			//if ( pop > 100 && j == i && is_lower(wp[j][0]) ) continue;
			// the above assumes a MAX_POP of 10k (sanity check)
			//if ( MAXPOP != 10000 ) { char *xx = NULL; *xx = 0; }
			// are we past the first word in the phrase?
			if ( j > i ) {
				// advance phrase length
				wwlen += wlen[j-1] + wlen[j];
				// . cut phrase short if too much punct between
				//   the current word, j, and the last one, j-2
				// . but allow for abbreviations or initials
				//   of single letters, like 'harry s. truman'.
				//   we do not want to break before 's.'
				// . because the phrase "s. doesn't stand for
				//   anything." was unable to form. we only
				//   got "s." and "doesn't stand for anything."
				//   as possible gigabit candidates.
				//if ( wlen[j-1] > 1 ) {
				//	if ( wlen[j-1] != 2 ) break;
				//	if ( wp  [j-1][0] != '.' ) break;
				//	if ( wlen[j-2] > 1 ) break;
				//}
				// . we now allow most punct since it is
				//   filtered out above w/ hasPunct variable
				// . this a little more than doubles the
				//   processing overhead going from 1 to 3
				// . going from 1 to 2 we see that we take 60ms
				//   instead of 50ms *when removing overlaps*
				// . at 1 we take about 48/45ms, not much
				//   different when removing overlaps
				// . increasing this totally wipes out our
				//   overlap problem, but it is very expensive,
				//   so now i just halt after jumping one big
				//   string of punct below, and filter out
				//   those gigabits above with hasPunct.
				// . i'd really like to NOT have this here
				//   because we get much better gigabits, but
				//   we need it as a speed saver...
				if (wlen[j-1]>t->m_topicMaxPunctLen) break;
				// no phrasing across commas, etc.
				/*
				if ( wlen[j-1] == 2 ) {
					// only allow " " or ": " or ". "
					if ( wp[j-1][1]!=' ' ) break;
					if ( wp[j-1][0]!=' '  &&
					     wp[j-1][0]!=':'  &&
					     wp[j-1][0]!='\'' && // beatles'
					     // allow commas here, but we
					     // remove any gigabits with commas
					     // because we just use them to
					     // cancel out bad gigabits.
					     wp[j-1][0]!=','  &&
					     wp[j-1][0]!='.' ) break;
					// . TODO: add in sgt. col. so that
					//   stuff can be in a gigabit
					// . only allow ". " if prev word was
					//   abbreviation.
					if ( wp[j-1][0]=='.' &&
					     j >= 2 &&
					     wlen[j-2] > 3) break; // != 1
				}
				*/
				// or if we just skipped the delimeter,
				// we are not allowed to phrase across that
				// if one was provided
				if ( delimeter && w->hasChar(j-1,delimeter))
					break;
				// make sure we could phrase across last word
				//if ( wlen[j-1] > 1 &&
				//     bits.getPunctuationBits(wp[j-1],wlen[j-1])
				//     == 0 ) break;
				// accumulate the phrase's hash
				h = hash64 ( h , wids[j] );
				// set pop if it is -1
				if ( pops[j] == -1 ) {
					pops[j]= g_speller.
						getPhrasePopularity( wp[j],
						     wids[j], true, language );
					// decrease popularity by half if
					// capitalized so Jack does not have
					// same pop as "jack"
					if ( is_upper (wp[j][0]) )
						pops[j] >>= 1;
					// why was this in there?
					if ( pops[j] <= 0 ) pops[j] = 1;
					QUICKPOLL(0);
				}
				// adjust popularity
				pop = (pop * pops[j])/MAXPOP;
				// watch out for underflow
				if ( pop <= 0.0 ) pop = 1.0/MAXPOP;
				// keep the highest of the word scores
				if ( scores && scores[j] > mm ) mm = scores[j];
			}

			// keep track of words
			count++;
			if ( iqt[j] || icw[j] ) {
				nqc++; // increment number of query/commoners
			}
			else if (qtrs[j] > 0) {
				score += qtrs[j];
				nhw++; // increment "hot word" counter
			};
			// keep phrasing until next punct word is delimeter
			// or the end
			if ( delimeter ) {
				// if we end on a punct word, then hash
				// our phrase, otherwise, only hash it if
				// the next word has the delimeter
				if ( j+2 < jend && ! w->hasChar(j+1,delimeter))
					continue;
			}
			// otherwise, ensure phrase is not ALL query terms
			else {
				// if phrase is all commoners & query skip it
				if ( nqc == count ) {
#ifdef DEBUG_MSG24
					char saveChar = ww[wwlen];
					ww[wwlen] = '\0';
					log("topics: phrase is all QT or CW; "
					    "skipping phrase %s", ww);
					ww[wwlen] = saveChar;
#endif
					continue;
				};
			}
			// . skip if we're common, pair across common words
			// . BUT it is common for a meta tag to end in ".com"
			//   so we should not count that one as common
			if ( icw[j] ) {
				// allow for more words only for purposes
				// of fixing the ABCD and BCDE overlap bug
				// without having to raise jend for all cases
				if ( jend < maxjend ) jend++;
				continue;
			}
			// do not stop if - . or @ follows us right b4 alnum
			if ( j+1 < nw && is_alnum(wp[j+1][oneChar]) ) {
				if ( wp[j+1][0]=='-' ) continue;
				if ( wp[j+1][0]=='.' ) continue;
				if ( wp[j+1][0]=='\'') continue;
				if ( wp[j+1][0]=='@' ) continue;
				// . do not split phrases between capitalized
				//   words
				// . this should fix the Costa Rica, Costa Blah
				//   bug
				// . it may decrease score of Belkin for query
				//   'Belkin Omni Cube' but that's ok because if
				//   Belkin is important it will be used
				//   independently.
				if ( is_upper(wp[j][0]) &&
				     j + 2 < nw &&
				     wp[j+1][0]==' ' &&
				     is_upper(wp[j+2][0]) &&
				     wlen[j+1] == oneChar &&
				     t->m_maxWordsPerTopic > 1 )
					continue;
			}
			// do not mix caps
			if ( is_upper(wp[i][0]) != is_upper(wp[j][0]) )
				continue;
			// . do not stop on a single capital letter
			// . so we don't stop on "George W->" (george w. bush)
			// . i added the " && j > i" so METALINCS can have
			//   single digit gigabits
			if ( wlen[j] == oneChar && j > i ) continue;
			// . do not split after Mr. or St. or Ms. or Mt. ...
			// . fixes 'st. valentines day'
			if ( wlen[j] == twoChars && is_upper(wp[j][0]) &&
			     wp[j][twoChars]=='.' ) continue;
			// sgt. or col.
			if ( wlen[j] == threeChars && wp[j][threeChars]=='.' ){
				if ( to_lower(wp[j][0       ])=='s' &&
				     to_lower(wp[j][oneChar ])=='g' &&
				     to_lower(wp[j][twoChars])=='t' ) continue;
				if ( to_lower(wp[j][0       ])=='c' &&
				     to_lower(wp[j][oneChar ])=='o' &&
				     to_lower(wp[j][twoChars])=='l' ) continue;
				if ( to_lower(wp[j][0       ])=='m' &&
				     to_lower(wp[j][oneChar ])=='r' &&
				     to_lower(wp[j][twoChars])=='s' ) continue;
			}
			// . do not split commas in numbers
			// . like 1,000,000,000
			if ( j >= 2 &&
			     wp[j][-oneChar ]==',' &&
			     is_digit(wp[j][-twoChars]) &&
			     wp[j][wlen[j]]==',' &&
			     is_digit(wp[j][wlen[j]+oneChar]))
				continue;
			/*
			if      ( pop <  1 ) ;
			else if ( pop <  2 ) ss = (score * 90) / 100;
			else if ( pop <  5 ) ss = (score * 85) / 100;
			else if ( pop < 10 ) ss = (score * 80) / 100;
			else if ( pop < 20 ) ss = (score * 75) / 100;
			else if ( pop < 30 ) ss = (score * 70) / 100;
			else if ( pop < 40 ) ss = (score * 65) / 100;
			else if ( pop < 50 ) ss = (score * 60) / 100;
			else                 ss = (score * 40) / 100;
			*/

			//if ( tt->getScoreFromTermId((int64_t)h) > 0 )
			//	continue;
			// debug msg
			//char c = ww[wwlen];
			//ww[wwlen]='\0';
			//fprintf(stderr,"tid=%"UINT32" score=%"INT32" "
			//	"pop=%"INT32" len=%"INT32" "
			//	"repeat=%"INT32" term=%s\n",h,ss,pop,wwlen,
			//	repeatScores[i],ww);
			//ww[wwlen]=c;
			// include any ending or starting ( or )
			if ( i > 0 && ww[-oneChar] == '(' ) {
				// ensure we got a ')' somewhere before adding (
				for ( int32_t r = 0 ; r <= wwlen ; r++ )
					if ( ww[r]==')' ) {
						ww--; wwlen++; break; }
			}
			if ( i < nw && ww[wwlen] == ')' ) {
				// we need a '(' somewhere before adding the )
				for ( int32_t r = 0 ; r <= wwlen ; r++ )
					if ( ww[r]=='(' ) {
						wwlen++; break; }
			}
			// now remove ('s if we begin AND end in them
			if ( ww[0] == '(' && ww[wwlen-oneChar] == ')' ) {
				ww++; wwlen -= twoChars; }
			// now double score if capitalized, we need more
			// proper nouns for topic clustering to work better,
			// but it doesn't count if start of a sentence, so
			// there must be some alnum word right before it.
			//if (is_upper(ww[0]) && !isUnicode && wwlen>=2 &&
			// note: ss is overwritten by "ss = score" below, so
			// this doubling currently has no effect
			if ( is_upper(ww[0]) && wwlen>=twoChars &&
			     is_alnum(ww[-twoChars]))
				ss <<= 1; // 1;
			// adjust the gigabit score using the new scores array
			//if ( scores && mm != NORM_WORD_SCORE )
			//	ss = (ss * mm) / NORM_WORD_SCORE;
			// only count the highest scoring guy once per page
			//int32_t tn = tt->getTermNum((int64_t)h);
			//maxScore = ss;
			//if ( tn >= 0 ) {
			//	int32_t sc = tt->getScoreFromTermNum(tn);
			//	if ( sc > maxScore ) maxScore = sc;
			//}
			// . add it
			// . now store the popularity, too, so we can display
			//   it for the winning gigabits
			//if ( ! tt->addTerm ((int64_t)h,ss,maxScore,false,
			//	ww,wwlen,tn,NULL,pop) )
			// . weight score by pop
			// . lets try weighting more popular phrases more!
			ss = score;
			if (nhw > 0) ss /= nhw;
			ss += bonus;
			float boost;
			if ( ((float)nhw) / count < SPARSE_MARK)
				ss -= SPARSE_PENALTY;
			if      (pop < POP_ZONE_0) boost = POP_BOOST_0;
			else if (pop < POP_ZONE_1) boost = POP_BOOST_1;
			else if (pop < POP_ZONE_2) boost = POP_BOOST_2;
			else if (pop < POP_ZONE_3) boost = POP_BOOST_3;
			else                       boost = POP_BOOST_4;
			ss = (int32_t)(boost *ss);
			if ( ss <= 0 ) ss = 1;
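			// illustrative note, not from the original source:
			// ss starts as the summed QTR score divided by the
			// hot-word count, takes the FWC/sparse penalties, then
			// gets the pop boost: e.g. a phrase pop of 0.00005
			// falls under POP_ZONE_1 for a 1.5x boost, while a
			// very common phrase (pop >= POP_ZONE_3) is cut to
			// 0.1x via POP_BOOST_4.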
			// store it
			int32_t ipop = (int32_t)(pop * MAXPOP);
			if ( ! tt->addTerm ((int64_t)h,ss,maxScore,false,
					    TITLEREC_CURRENT_VERSION ,
					    ww,wwlen,-1,NULL,ipop) ) {
				log("topics: No memory to grow table.");
				return;
			}

			// stop after indexing a word after a long string of
			// punct, this is the overlap bug fix without taking
			// a performance hit. hasPunct above will remove it.
			if ( j > i && wlen[j-1] > twoChars ) break;
		}
	}
	// clear any error
	if ( g_errno ) {
		log("topics: Had error getting topic candidates from "
		    "document: %s.",mstrerror(g_errno));
		g_errno = 0;
	}
	mfree(lrgBuf, lrgBufSize, "hashExcerpt (Msg24)");
}


// taken from Weights.cpp's set3() function
void setRepeatScores ( char *repeatScores ,
		       int64_t *wids ,
		       int32_t nw ,
		       char *repeatTable ,
		       int32_t repeatTableNumSlots ,
		       Words *words ) {
	// if no words, nothing to do
	if ( nw == 0 ) return;

	char *ptr = repeatTable;
	int32_t numSlots = repeatTableNumSlots;
	int64_t *hashes = (int64_t *)ptr; ptr += numSlots * 8;
	int32_t *vals   = (int32_t *)ptr; ptr += numSlots * 4;

	int64_t ringWids [ 5 ];
	int32_t ringPos  [ 5 ];
	int32_t ringi = 0;
	int32_t count = 0;
	int64_t h     = 0;

	// make the mask (assumes numSlots is a power of two)
	uint32_t mask = numSlots - 1;

	// clear ring of hashes
	memset ( ringWids , 0 , 5 * sizeof(int64_t) );

	// for sanity check
	//int32_t lastStart = -1;

	// count how many 5-word sequences we match in a row
	int32_t matched    = 0;
	int32_t matchStart = -1;

	// reset
	memset ( repeatScores , 100 , nw );

	// return until we fix the infinite loop bug
	//return;

	// . hash EVERY 5-word sequence in the document
	// . if we get a match look and see what sequences it matches
	// . we allow multiple instances of the same hash to be stored in
	//   the hash table, so keep checking for a matching hash until you
	//   chain to a 0 hash, indicating the chain ends
	// . check each matching hash to see if more than 5 words match
	// . get the max words that matched from all of the candidates
	// . demote the word and phrase weights based on the total/max
	//   number of words matching
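	// illustrative example, not from the original source: if a 20-word
	// run repeats an earlier passage verbatim, its successive 5-word
	// hashes keep hitting the table; once "matched" reaches 3, the whole
	// matching stretch gets repeatScores of 0, and hashExcerpt() above
	// ignores any word with a repeatScore <= 20.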
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// skip if not alnum word
		if ( ! wids[i] ) continue;
		// reset
		//repeatScores[i] = 100;
		// add new to the 5 word hash
		h ^= wids[i];
		// . remove old from 5 word hash before adding new...
		// . initial ring wids are 0, so should be benign at startup
		h ^= ringWids[ringi];
		// add to ring
		ringWids[ringi] = wids[i];
		// save our position
		ringPos[ringi] = i;
		// wrap the ring ptr if we need to, that is why we are a ring
		if ( ++ringi >= 5 ) ringi = 0;
		// this 5-word sequence starts with word # "start"
		int32_t start = ringPos[ringi];
		// need at least 5 words in the ring buffer to do analysis
		if ( ++count < 5 ) continue;
		// sanity check
		//if ( start <= lastStart ) { char *xx = NULL; *xx = 0; }
		// look up in the hash table
		int32_t n = h & mask;
		// stop at new york times - debug
		/*
		if ( words->m_words[i][0] == 'A' &&
		     words->m_words[i][1] == 's' &&
		     words->m_words[i][2] == 'k' &&
		     words->m_words[i][3] == 'e' &&
		     words->m_words[i][4] == 'd' &&
		     words->m_words[i][5] == ' ' &&
		     words->m_words[i][6] == 'Q' &&
		     words->m_words[i][7] == 'u' )
			log("hey");
		*/
	loop:
		// all done if empty
		if ( ! hashes[n] ) {
			// add ourselves to the hash table now
			hashes[n] = h;
			// this is where the 5-word sequence starts
			vals  [n] = matchStart+1;
			// do not demote any words if fewer than 3 matched
			if ( matched < 3 ) { matched = 0; continue; }
			// reset
			matched = 0;
			// . how much should we demote?
			// . 10 matching words pretty much means 0 weights
			//float demote = 1.0 - ((matched-5)*.10);
			//if ( demote >= 1.0 ) continue;
			//if ( demote <  0.0 ) demote = 0.0;
			// demote the words involved
			for ( int32_t j = matchStart ; j < i ; j++ )
				repeatScores[j] = 0;
			// get next word
			continue;
		}
		// get next in chain if hash does not match
		if ( hashes[n] != h ) {
			// wrap around the hash table if we hit the end
			if ( ++n >= numSlots ) n = 0;
			// check out bucket #n now
			goto loop;
		}
		// save start of matching sequence for demote loop
		if ( matched == 0 ) matchStart = start;
		// inc the match count
		matched++;
	}
	// if we ended without nulling out some matches
	if ( matched < 3 ) return;
	for ( int32_t j = matchStart ; j < nw ; j++ ) repeatScores[j] = 0;

}


/*
// is it a stop phrase?
char isCommonPhrase ( int32_t h ) {
	static TermTable s_table;
	static bool s_isInitialized = false;
	// . these have the stop words above plus some foreign stop words
	// . these aren't
	// . i shrunk this list a lot
	// . see backups for the old list
	static char *s_stopPhrases[] = {
		"all rights reserved" ,
		"in addition" ,
		"for example" ,
		"for more information"
	};
	// include a bunch of foreign prepositions so they don't get required
	// by the bitScores in IndexTable.cpp
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( sizeof(s_stopPhrases) * 2 ) )
			return log("Msg24::isCommonPhrase: error set table");
		// now add in all the stop words
		int32_t n = (int32_t)sizeof(s_stopPhrases)/ sizeof(char *);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			// set the phrases
			char *sw = s_stopPhrases[i];
			int32_t swlen = strlen ( sw );
			Words w;
			w->set ( false , sw , swlen );
			int32_t h = hash64d ( w->getWord (0),
					      w->getWordLen(0));
			for ( int32_t j = 1 ; j < w->getNumWords() ; j++ )
				int32_t h2 =

			int32_t swh = hash64d ( sw , swlen );
			s_table.addTerm ((int32_t)swh,i+1,0x7fffffff,true);
		}
		s_isInitialized = true;
	}

	// . all 1 char letter words are stop words
	// . good for initials and some contractions
	//if ( len == 1 && is_alpha(*s) ) return true;

	// get from table
	return (char)s_table.getScoreFromTermId ( h );
}
*/


int32_t Msg24::getStoredSize ( ) {
	// store number of topics into 4 bytes
	int32_t size = 4;
	// store number of topics we have
	// all related topics that have scores >= m_minTopicScore
	for ( int32_t i = 0 ; i < m_numTopics ; i++ ) {
		// get group info
		//TopicGroup *t = &m_topicGroups[m_topicGids[i]];
		// break if buf is too small
		//if ( size + m_topicLens[i] + 2 + 8 > MAX_REPLY_LEN ) break;
		// include \0 and 4 byte score and 4 byte topic length
		size +=
			4 +                     // topic ptr
			4 +                     // topicScore
			4 +                     // topicLen
			4 +                     // numDocIds
			4 +                     // ptr to docids
			4 +                     // topic pop
			1 +                     // topic gid
			m_topicLens[i] + 1 +    // topic string with \0
			m_topicNumDocIds[i]*8;  // actual docids
	}
	return size;
}
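
// illustrative note, not from the original source: getStoredSize() must stay
// in lockstep with serialize() below -- per topic it accounts for six 4-byte
// array slots (ptr, score, len, numDocIds, docid-ptr placeholder, pop), one
// gid byte, the \0-terminated topic text, and 8 bytes per docid.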


// . serialize ourselves for the cache
// . returns bytes written
// . returns -1 and sets g_errno on error
// . just like serializing the reply
int32_t Msg24::serialize ( char *buf , int32_t bufLen ) {
	char *p = buf;
	// store number of topics
	*(int32_t *)p = m_numTopics; p += 4;
	// if no topics, bail
	if ( m_numTopics <= 0 ) return 4;
	// then the ptrs, with offset relative to m_topicPtrs[0] so
	// deserialize works
	char *base = m_topicPtrs[0];
	for ( int32_t i = 0 ; i < m_numTopics ; i++ ) {
		*(int32_t *)p = m_topicPtrs[i] - base; p += 4; }
	// then the scores
	gbmemcpy ( p , m_topicScores   , m_numTopics * 4 ); p += m_numTopics * 4;
	gbmemcpy ( p , m_topicLens     , m_numTopics * 4 ); p += m_numTopics * 4;
	gbmemcpy ( p , m_topicNumDocIds, m_numTopics * 4 ); p += m_numTopics * 4;
	// these m_topicDocIds are essentially just placeholders for ptrs
	// to the docids, just like the topic ptrs above, but these can all
	// be NULL if we didn't get back the list of docids for each gigabit
	p += m_numTopics * 4;
	// then the popularity rating of each topic
	gbmemcpy ( p , m_topicPops , m_numTopics * 4 ); p += m_numTopics * 4;
	gbmemcpy ( p , m_topicGids , m_numTopics     ); p += m_numTopics;
	// then the text
	for ( int32_t i = 0 ; i < m_numTopics ; i++ ) {
		gbmemcpy ( p , m_topicPtrs[i] , m_topicLens[i] ) ;
		p += m_topicLens[i];
		*p++ = '\0';
	}
	// and one array of docids per topic
	for ( int32_t i = 0 ; i < m_numTopics ; i++ ) {
		gbmemcpy ( p , m_topicDocIds[i] , m_topicNumDocIds[i] * 8 );
		p += m_topicNumDocIds[i] * 8;
		// sanity check
		//for ( int32_t k = 0 ; k < m_topicNumDocIds[i] ; k++ )
		//	if ( m_topicDocIds[i][k] & ~((int64_t)DOCID_MASK) ) {
		//		log("query: Msg24 bad docid in serialize.");
		//		char *xx = NULL; *xx = 0;
		//	}
	}
	// debug msg
	//log("in nt=%"INT32"",*nt);
	if ( p - buf > bufLen ) {
		log("query: Msg24 serialize overflow.");
		char *xx = NULL; *xx = 0;
	}
	return p - buf;
}
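
// illustrative note, not from the original source: serialize() stores each
// topic ptr as an offset from m_topicPtrs[0] so the buffer is position
// independent, and deserialize() below rebases those offsets onto wherever
// the text area lands in the new buffer. this assumes the topic strings are
// laid out contiguously, each followed by its \0, which is how getTopics()
// and serialize() itself write them.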


// . deserialize ourselves from the cache
// . returns bytes read
// . returns -1 and sets g_errno on error
// . Msg40 owns the buffer, so we can reference it without having to copy
int32_t Msg24::deserialize ( char *buf , int32_t bufLen ) {
	// sanity check, i've seen this happen before when the handler of
	// the Msg24 runs out of memory at a certain place and ends up
	// sending back a 0 length reply
	if ( bufLen < 4 ) {
		g_errno = EBADREPLY;
		log("query: Msg24::deserialize: bad reply.");
		return -1;
	}
	char *p = buf;
	m_numTopics = *(int32_t *)p; p += 4;
	// another sanity check, just in case
	if ( bufLen < m_numTopics * (6*4+1) ) {
		g_errno = EBADREPLY;
		log("query: Msg24::deserialize: bad reply 2.");
		return -1;
	}
	m_topicPtrs      = (char   **)p; p += m_numTopics * 4;
	m_topicScores    = (int32_t *)p; p += m_numTopics * 4;
	m_topicLens      = (int32_t *)p; p += m_numTopics * 4;
	m_topicNumDocIds = (int32_t *)p; p += m_numTopics * 4; //voters
	m_topicDocIds    = (int64_t**)p; p += m_numTopics * 4; //placehldrs
	m_topicPops      = (int32_t *)p; p += m_numTopics * 4;
	m_topicGids      = p;            p += m_numTopics;
	// . make ptrs to topic text
	// . we were just provided with offsets to make it portable
	char *off = p;
	for ( int32_t i = 0 ; i < m_numTopics ; i++ ) {
		m_topicPtrs[i] = (int32_t)m_topicPtrs[i] + off;
		p += m_topicLens[i] + 1;
	}
	// now for the array of docids per topic
	for ( int32_t i = 0 ; i < m_numTopics ; i++ ) {
		m_topicDocIds[i] = (int64_t *)p;
		p += m_topicNumDocIds[i] * 8;
		// sanity check
		//for ( int32_t k = 0 ; k < m_topicNumDocIds[i] ; k++ )
		//	if ( m_topicDocIds[i][k] & ~((int64_t)DOCID_MASK) ) {
		//		log("query: Msg24 bad docid in deserialize.");
		//		char *xx = NULL; *xx = 0;
		//	}
	}
	if ( p - buf > bufLen ) {
		log("query: Msg24 deserialize overflow.");
		char *xx = NULL; *xx = 0;
	}
	return p - buf;
}


// if we already have the msg20s, just generate the gigabits from those.
bool Msg24::generateTopicsLocal ( char *coll ,
				  int32_t collLen ,
				  char *query ,
				  int32_t queryLen ,
				  Msg20** msg20Ptrs ,
				  int32_t numMsg20s ,
				  char *clusterLevels ,
				  TopicGroup *topicGroups ,
				  int32_t numTopicGroups ,
				  unsigned char lang ) { // (aac)
	// force it to be true, since hi bit is set in pops if topic is unicode
	m_returnPops = true;
	// warning
	if ( ! coll ) log(LOG_LOGIC,"net: NULL collection. msg24.");
	// force it
	m_returnDocIdCount = true;
	// if we don't get docids, then deserialize doesn't work because it
	// expects the docids to be valid.
	m_returnDocIds = true;
	// reset
	m_numTopics = 0;
	//m_docsToScanForTopics = docsToScanForTopics;
	//m_minTopicScore       = minTopicScore;
	//m_maxTopics           = maxTopics;
	m_numDocIds = 0;
	m_coll      = coll;
	m_collLen   = collLen;

	// bail if no operations to do
	int32_t numTopicsToGen = topicGroups->m_numTopics;
	// get the most we have to scan over all the topic groups
	int32_t docsToScanForTopics = topicGroups[0].m_docsToScanForTopics;

	for ( int32_t i = 1 ; i < numTopicGroups ; i++ ) {
		int32_t x = topicGroups[i].m_docsToScanForTopics ;
		if ( x > docsToScanForTopics ) docsToScanForTopics = x;

		if ( topicGroups[i].m_numTopics > numTopicsToGen )
			numTopicsToGen = topicGroups[i].m_numTopics;
	}
	// bail if none
	if ( docsToScanForTopics <= 0 ) return true;
	if ( numTopicsToGen == 0 ) return true;

	m_startTime = gettimeofdayInMilliseconds();

	// save, caller should not delete this!
	m_topicGroups    = topicGroups;
	m_numTopicGroups = numTopicGroups;
	// truncate
	//if ( maxTopics > MAX_TOPICS ) maxTopics = MAX_TOPICS;
	// truncate
	//if ( numDocIds > MAX_DOCIDS_TO_SCAN )
	//	numDocIds = MAX_DOCIDS_TO_SCAN ;
	//if ( numDocIds > docsToScanForTopics )
	//	numDocIds = docsToScanForTopics ;

	State24 st;
	st.m_slot        = NULL;
	st.m_niceness    = 0;
	st.m_numRequests = numMsg20s;
	st.m_numReplies  = numMsg20s;

	gbmemcpy ( st.m_query , query , queryLen );
	st.m_query [ queryLen ] = '\0';
	st.m_queryLen = queryLen;
	st.m_qq.set ( st.m_query , st.m_queryLen , NULL , 0, 2 , true );

	st.m_numTopicGroups = m_numTopicGroups;
	gbmemcpy(st.m_topicGroups, m_topicGroups,
		 sizeof(TopicGroup) * m_numTopicGroups);
	st.m_maxCacheAge      = 0;
	st.m_addToCache       = false;
	st.m_returnDocIdCount = m_returnDocIdCount;
	st.m_returnDocIds     = m_returnDocIds;
	st.m_returnPops       = true; // ??? use this in dedup vector?
	st.m_docIds           = NULL;
	st.m_numDocIds        = 0;
	st.m_clusterLevels    = clusterLevels;
	st.m_n                = 0;
	st.m_i                = 0;
	st.m_coll             = coll;
	st.m_msg20Ptrs        = msg20Ptrs;
	st.m_msg20            = NULL;

	TermTable master;
	if ( ! master.set ( 20000 , true , true ,
			    st.m_returnDocIdCount | st.m_returnDocIds ,
			    st.m_returnPops , true, false, NULL ) ) {
		log("topics: Could not allocate memory for topic generation.");
		return true;
	}

	char *buf = NULL;
	int32_t bufSize = 0;
	for ( int32_t i = 0 ; i < st.m_numTopicGroups ; i++ ) {
		// get ith topic group descriptor
		TopicGroup *t = &st.m_topicGroups[i];
		// . generate topics for this topic group
		// . serialize them into "p"
		// . getTopics will realloc() this "buf" to exactly the size
		//   it needs
		getTopics ( &st , t , &master , &st.m_qq , i ,
			    // getTopics will realloc this buffer
			    &buf , &bufSize , NULL , NULL , NULL, lang ); // (aac)
		// clear master table each time
		if ( i + 1 < st.m_numTopicGroups ) master.clear();
	}
	//}

	// free mem now to avoid fragmentation
	master.reset();
	deserialize ( buf , bufSize );

	// we are pointing into buf, but we want to make sure it gets freed
	// when we are done with it, so we make it our m_reply
	m_reply     = buf;
	m_replySize = bufSize;
	g_stats.addStat_r ( 0 ,
			    m_startTime ,
			    gettimeofdayInMilliseconds(),
			    "get_gigabits",
			    0x00d1e1ff ,
			    STAT_QUERY );
	return true;
}