get mike's super long query working
This commit is contained in:
20
Matches.cpp
20
Matches.cpp
@ -24,6 +24,8 @@
|
||||
// Default constructor: puts every member that reset() inspects into a known
// state and then calls reset() to finish initialization.
Matches::Matches ( ) {
|
||||
m_detectSubPhrases = false;
|
||||
m_numMatchGroups = 0;
|
||||
// must be NULL before the reset() call below: reset() tests this pointer
// to decide whether there is a flag array to hand to mfree()
m_qwordFlags = NULL;
|
||||
// size passed to mfree() together with m_qwordFlags; nothing allocated yet
m_qwordAllocSize = 0;
|
||||
reset();
|
||||
}
|
||||
// Destructor: delegates all cleanup to reset().
Matches::~Matches( ) { reset(); }
|
||||
@ -39,6 +41,10 @@ void Matches::reset ( ) {
|
||||
m_bitsArray [i].reset();
|
||||
}
|
||||
m_numMatchGroups = 0;
|
||||
if ( m_qwordFlags && m_qwordFlags != (mf_t *)m_tmpBuf ) {
|
||||
mfree ( m_qwordFlags , m_qwordAllocSize , "mmqw" );
|
||||
m_qwordFlags = NULL;
|
||||
}
|
||||
//m_explicitsMatched = 0;
|
||||
//m_matchableRequiredBits = 0;
|
||||
//m_hasAllQueryTerms = false;
|
||||
@ -103,6 +109,20 @@ void Matches::setQuery ( Query *q ) {
|
||||
|
||||
//memset ( m_foundNegTermVector, 0, m_q->getNumTerms() );
|
||||
|
||||
if ( m_qwordFlags ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
int32_t need = m_q->m_numWords * sizeof(mf_t) ;
|
||||
m_qwordAllocSize = need;
|
||||
if ( need < 128 )
|
||||
m_qwordFlags = (mf_t *)m_tmpBuf;
|
||||
else
|
||||
m_qwordFlags = (mf_t *)mmalloc ( need , "mmqf" );
|
||||
|
||||
if ( ! m_qwordFlags ) {
|
||||
log("matches: alloc failed for query %s",q->m_orig);
|
||||
return;
|
||||
}
|
||||
|
||||
// this is word based. these are each 1 byte
|
||||
memset ( m_qwordFlags , 0 , m_q->m_numWords * sizeof(mf_t));
|
||||
|
||||
|
@ -183,7 +183,10 @@ class Matches {
|
||||
|
||||
// . 1-1 with Query::m_qwords[] array of QWords
|
||||
// . shows the match flags for that query word
|
||||
mf_t m_qwordFlags[MAX_QUERY_WORDS];
|
||||
//mf_t m_qwordFlags[MAX_QUERY_WORDS];
|
||||
mf_t *m_qwordFlags;
|
||||
int32_t m_qwordAllocSize;
|
||||
char m_tmpBuf[128];
|
||||
|
||||
//stuff for detecting whether a match is part of a larger phrase
|
||||
void setSubPhraseDetection();
|
||||
|
22
PageGet.cpp
22
PageGet.cpp
@ -40,7 +40,9 @@ public:
|
||||
bool m_isLocal;
|
||||
//bool m_seq;
|
||||
bool m_rtq;
|
||||
char m_q[MAX_QUERY_LEN+1];
|
||||
//char m_q[MAX_QUERY_LEN+1];
|
||||
SafeBuf m_qsb;
|
||||
char m_qtmpBuf[128];
|
||||
int32_t m_qlen;
|
||||
char m_boolFlag;
|
||||
bool m_printed;
|
||||
@ -98,7 +100,7 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
|
||||
int32_t qlen = 0;
|
||||
char *q = r->getString ( "q" , &qlen , NULL /*default*/);
|
||||
// ensure query not too big
|
||||
if ( qlen >= MAX_QUERY_LEN-1 ) {
|
||||
if ( qlen >= ABS_MAX_QUERY_LEN-1 ) {
|
||||
g_errno=EQUERYTOOBIG;
|
||||
return g_httpServer.sendErrorReply (s,500 ,mstrerror(g_errno));
|
||||
}
|
||||
@ -156,8 +158,16 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
|
||||
// delete ( st );
|
||||
// return sendPageNetResult( s );
|
||||
//}
|
||||
if ( q && qlen > 0 ) strcpy ( st->m_q , q );
|
||||
else st->m_q[0] = '\0';
|
||||
//if ( q && qlen > 0 ) strcpy ( st->m_q , q );
|
||||
//else st->m_q[0] = '\0';
|
||||
|
||||
st->m_qsb.setBuf ( st->m_qtmpBuf,128,0,false );
|
||||
st->m_qsb.setLabel ( "qsbpg" );
|
||||
|
||||
// save the query
|
||||
if ( q && qlen > 0 )
|
||||
st->m_qsb.safeStrcpy ( q );
|
||||
|
||||
st->m_qlen = qlen;
|
||||
//st->m_seq = seq;
|
||||
st->m_rtq = rtq;
|
||||
@ -415,8 +425,8 @@ bool processLoop ( void *state ) {
|
||||
int32_t startLen2 = sb->length();//p;
|
||||
|
||||
// query should be NULL terminated
|
||||
char *q = st->m_q;
|
||||
int32_t qlen = st->m_qlen;
|
||||
char *q = st->m_qsb.getBufStart();
|
||||
int32_t qlen = st->m_qsb.getLength(); // m_qlen;
|
||||
|
||||
char styleTitle[128] = "font-size:14px;font-weight:600;"
|
||||
"color:#000000;";
|
||||
|
@ -2795,13 +2795,14 @@ bool printSearchResultsHeader ( State0 *st ) {
|
||||
|
||||
//Highlight h;
|
||||
|
||||
st->m_qe[0] = '\0';
|
||||
//st->m_qe[0] = '\0';
|
||||
st->m_qesb.nullTerm();
|
||||
|
||||
// encode query buf
|
||||
//char qe[MAX_QUERY_LEN+1];
|
||||
char *dq = si->m_displayQuery;
|
||||
//int32_t dqlen = si->m_displayQueryLen;
|
||||
if ( dq ) urlEncode(st->m_qe,MAX_QUERY_LEN*2,dq,gbstrlen(dq));
|
||||
if ( dq ) st->m_qesb.urlEncode(dq);
|
||||
|
||||
// how many results were requested?
|
||||
//int32_t docsWanted = msg40->getDocsWanted();
|
||||
@ -5187,7 +5188,7 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
"get?"
|
||||
"q=%s&c=%s&d=%"INT64">"
|
||||
"cached</a>\n",
|
||||
st->m_qe , coll ,
|
||||
st->m_qesb.getBufStart() , coll ,
|
||||
mr->m_docId );
|
||||
else if ( printCached )
|
||||
sb->safePrintf ( "<a href=\""
|
||||
@ -5196,7 +5197,7 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
"qlang=%s&"
|
||||
"c=%s&d=%"INT64"&cnsp=0\">"
|
||||
"cached</a>\n",
|
||||
st->m_qe ,
|
||||
st->m_qesb.getBufStart() ,
|
||||
// "qlang" parm
|
||||
si->m_defaultSortLang,
|
||||
coll ,
|
||||
@ -5336,7 +5337,7 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
"d=%"INT64"&"
|
||||
"cnsp=0\">"
|
||||
"sections</a>\n",
|
||||
st->m_qe ,
|
||||
st->m_qesb.getBufStart() ,
|
||||
// "qlang" parm
|
||||
si->m_defaultSortLang,
|
||||
coll ,
|
||||
@ -5449,7 +5450,7 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
qq.urlEncode("site:");
|
||||
qq.urlEncode (hbuf);
|
||||
qq.urlEncode(" | ");
|
||||
qq.safeStrcpy(st->m_qe);
|
||||
qq.safeStrcpy(st->m_qesb.getBufStart());
|
||||
qq.nullTerm();
|
||||
// get the original url and add/replace in query
|
||||
char tmp2[512];
|
||||
|
@ -52,7 +52,8 @@ public:
|
||||
int64_t m_took; // how long it took to get the results
|
||||
HttpRequest m_hr;
|
||||
bool m_printedHeaderRow;
|
||||
char m_qe[MAX_QUERY_LEN+1];
|
||||
//char m_qe[MAX_QUERY_LEN+1];
|
||||
SafeBuf m_qesb;
|
||||
|
||||
// for printing our search result json items in csv:
|
||||
HashTableX m_columnTable;
|
||||
|
@ -1858,11 +1858,11 @@ bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) {
|
||||
// . call g_httpServer.sendDynamicPage() to send it
|
||||
bool sendPageRoot ( TcpSocket *s , HttpRequest *r, char *cookie ) {
|
||||
// don't allow pages bigger than 128k in cache
|
||||
char buf [ 10*1024 + MAX_QUERY_LEN ];
|
||||
char buf [ 10*1024 ];//+ MAX_QUERY_LEN ];
|
||||
// a ptr into "buf"
|
||||
//char *p = buf;
|
||||
//char *pend = buf + 10*1024 + MAX_QUERY_LEN - 100 ;
|
||||
SafeBuf sb(buf, 10*1024 + MAX_QUERY_LEN);
|
||||
SafeBuf sb(buf, 10*1024 );//+ MAX_QUERY_LEN);
|
||||
// print bgcolors, set focus, set font style
|
||||
//p = g_httpServer.printFocus ( p , pend );
|
||||
//p = g_httpServer.printColors ( p , pend );
|
||||
|
@ -29,7 +29,9 @@ typedef float rscore_t;
|
||||
|
||||
#define MINSCORE 1
|
||||
#define MIN_SAVE_SIZE 100
|
||||
#define PQR_BUF_SIZE MAX_QUERY_LEN
|
||||
// we don't use this any more so make it compile
|
||||
//#define PQR_BUF_SIZE MAX_QUERY_LEN
|
||||
#define PQR_BUF_SIZE 64
|
||||
|
||||
class PostQueryRerank {
|
||||
public:
|
||||
|
113
Query.cpp
113
Query.cpp
@ -68,9 +68,11 @@ void Query::reset ( ) {
|
||||
qt->m_facetIndexBuf.purge();
|
||||
}
|
||||
|
||||
m_sb.purge();
|
||||
m_osb.purge();
|
||||
m_docIdRestriction = 0LL;
|
||||
m_groupThatHasDocId = NULL;
|
||||
m_bufLen = 0;
|
||||
//m_bufLen = 0;
|
||||
m_origLen = 0;
|
||||
m_numWords = 0;
|
||||
//m_numOperands = 0;
|
||||
@ -160,17 +162,26 @@ bool Query::set2 ( char *query ,
|
||||
//m_coll = coll;
|
||||
//m_collLen = collLen;
|
||||
// truncate query if too big
|
||||
if ( queryLen >= MAX_QUERY_LEN ) {
|
||||
log("query: Query length of %"INT32" must be less than %"INT32". "
|
||||
"Truncating.",queryLen,(int32_t)MAX_QUERY_LEN);
|
||||
queryLen = MAX_QUERY_LEN - 1;
|
||||
if ( queryLen >= ABS_MAX_QUERY_LEN ) {
|
||||
log("query: Query length of %"INT32" must be "
|
||||
"less than %"INT32". "
|
||||
"Truncating.",queryLen,(int32_t)ABS_MAX_QUERY_LEN);
|
||||
queryLen = ABS_MAX_QUERY_LEN - 1;
|
||||
m_truncated = true;
|
||||
}
|
||||
// save original query
|
||||
m_osb.setBuf ( m_otmpBuf , 128 , 0 , false );
|
||||
m_osb.setLabel ("oqbuf" );
|
||||
m_osb.reserve ( queryLen + 1 );
|
||||
m_osb.safeMemcpy ( query , queryLen );
|
||||
m_osb.nullTerm ();
|
||||
|
||||
m_origLen = queryLen;
|
||||
gbmemcpy ( m_orig , query , queryLen );
|
||||
m_orig [ m_origLen ] = '\0';
|
||||
//m_origLen = queryLen;
|
||||
//gbmemcpy ( m_orig , query , queryLen );
|
||||
//m_orig [ m_origLen ] = '\0';
|
||||
|
||||
m_orig = m_osb.getBufStart();
|
||||
m_origLen = m_osb.getLength();
|
||||
|
||||
log(LOG_DEBUG, "query: set called = %s", m_orig);
|
||||
|
||||
@ -204,9 +215,16 @@ bool Query::set2 ( char *query ,
|
||||
// that were set somewhere above!!! i moved top: label above!
|
||||
//reset();
|
||||
|
||||
// reserve some space, guessing how much we'd need
|
||||
m_sb.setBuf(m_tmpBuf3,128,0,false);
|
||||
m_sb.setLabel("qrystk");
|
||||
int32_t need = queryLen * 2 + 32;
|
||||
if ( ! m_sb.reserve ( need ) )
|
||||
return false;
|
||||
|
||||
// convenience ptr
|
||||
char *p = m_buf;
|
||||
char *pend = m_buf + MAX_QUERY_LEN;
|
||||
//char *p = m_buf;
|
||||
//char *pend = m_buf + MAX_QUERY_LEN;
|
||||
bool inQuotesFlag = false;
|
||||
// . copy query into m_buf
|
||||
// . translate ( and ) to special query operators so Words class
|
||||
@ -219,27 +237,31 @@ bool Query::set2 ( char *query ,
|
||||
if ( query[i] == '\"' ) inQuotesFlag = !inQuotesFlag;
|
||||
|
||||
if ( inQuotesFlag ) {
|
||||
*p = query [i];
|
||||
p++;
|
||||
//*p = query [i];
|
||||
//p++;
|
||||
m_sb.pushChar(query[i]);
|
||||
continue;
|
||||
}
|
||||
|
||||
// dst buf must be big enough
|
||||
if ( p + 8 >= pend ) {
|
||||
g_errno = EBUFTOOSMALL;
|
||||
return log(LOG_LOGIC,"query: query: query too big.");
|
||||
}
|
||||
// if ( p + 8 >= pend ) {
|
||||
// g_errno = EBUFTOOSMALL;
|
||||
// return log(LOG_LOGIC,"query: query: query too big.");
|
||||
// }
|
||||
// translate ( and )
|
||||
if ( boolFlag == 1 && query[i] == '(' ) {
|
||||
gbmemcpy ( p , " LeFtP " , 7 ); p += 7;
|
||||
//gbmemcpy ( p , " LeFtP " , 7 ); p += 7;
|
||||
m_sb.safeMemcpy ( " LeFtP " , 7 );
|
||||
continue;
|
||||
}
|
||||
if ( boolFlag == 1 && query[i] == ')' ) {
|
||||
gbmemcpy ( p , " RiGhP " , 7 ); p += 7;
|
||||
//gbmemcpy ( p , " RiGhP " , 7 ); p += 7;
|
||||
m_sb.safeMemcpy ( " RiGhP " , 7 );
|
||||
continue;
|
||||
}
|
||||
if ( query[i] == '|' ) {
|
||||
gbmemcpy ( p , " PiiPE " , 7 ); p += 7;
|
||||
//gbmemcpy ( p , " PiiPE " , 7 ); p += 7;
|
||||
m_sb.safeMemcpy ( " PiiPE " , 7 );
|
||||
continue;
|
||||
}
|
||||
// translate [#a] [#r] [#ap] [#rp] [] [p] to operators
|
||||
@ -249,28 +271,34 @@ bool Query::set2 ( char *query ,
|
||||
while ( is_digit(query[j]) ) j++;
|
||||
char c = query[j];
|
||||
if ( (c == 'a' || c == 'r') && query[j+1]==']' ) {
|
||||
sprintf ( p , " LeFtB %"INT32" %c RiGhB ",val,c);
|
||||
p += gbstrlen(p);
|
||||
//sprintf ( p , " LeFtB %"INT32" %c RiGhB ",
|
||||
m_sb.safePrintf(" LeFtB %"INT32" %c RiGhB ",
|
||||
val,c);
|
||||
//p += gbstrlen(p);
|
||||
i = j + 1;
|
||||
continue;
|
||||
}
|
||||
else if ( (c == 'a' || c == 'r') &&
|
||||
query[j+1]=='p' && query[j+2]==']') {
|
||||
sprintf ( p , " LeFtB %"INT32" %cp RiGhB ",val,c);
|
||||
p += gbstrlen(p);
|
||||
//sprintf ( p , " LeFtB %"INT32" %cp RiGhB ",
|
||||
m_sb.safePrintf(" LeFtB %"INT32" %cp RiGhB ",
|
||||
val,c);
|
||||
//p += gbstrlen(p);
|
||||
i = j + 2;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if ( query[i] == '[' && query[i+1] == ']' ) {
|
||||
sprintf ( p , " LeFtB RiGhB ");
|
||||
p += gbstrlen(p);
|
||||
//sprintf ( p , " LeFtB RiGhB ");
|
||||
//p += gbstrlen(p);
|
||||
m_sb.safePrintf ( " LeFtB RiGhB ");
|
||||
i = i + 1;
|
||||
continue;
|
||||
}
|
||||
if ( query[i] == '[' && query[i+1] == 'p' && query[i+2]==']') {
|
||||
sprintf ( p , " LeFtB RiGhB ");
|
||||
p += gbstrlen(p);
|
||||
//sprintf ( p , " LeFtB RiGhB ");
|
||||
//p += gbstrlen(p);
|
||||
m_sb.safePrintf ( " LeFtB RiGhB ");
|
||||
i = i + 2;
|
||||
continue;
|
||||
}
|
||||
@ -306,17 +334,22 @@ bool Query::set2 ( char *query ,
|
||||
|
||||
// TODO: copy altavista's operators here? & | !
|
||||
// otherwise, just a plain copy
|
||||
*p = query [i];
|
||||
p++;
|
||||
// *p = query [i];
|
||||
// p++;
|
||||
m_sb.pushChar ( query[i] );
|
||||
}
|
||||
// NULL terminate
|
||||
*p = '\0';
|
||||
//*p = '\0';
|
||||
m_sb.nullTerm();
|
||||
// debug statement
|
||||
//log(LOG_DEBUG,"Query: Got new query=%s",tempBuf);
|
||||
//printf("query: query: Got new query=%s\n",tempBuf);
|
||||
|
||||
// set length
|
||||
m_bufLen = p - m_buf;
|
||||
//m_bufLen = p - m_buf;
|
||||
|
||||
//m_buf = m_sb.getBufStart();
|
||||
//m_bufLen = m_sb.length();
|
||||
|
||||
Words words;
|
||||
Phrases phrases;
|
||||
@ -1991,16 +2024,17 @@ bool Query::setQWords ( char boolFlag ,
|
||||
// . because we now deal with boolean queries, we make parentheses
|
||||
// their own separate Word, so tell "words" we're setting a query
|
||||
//Words words;
|
||||
if ( ! words.set ( m_buf , m_bufLen,
|
||||
if ( ! words.set ( m_sb.getBufStart() , m_sb.length() ,
|
||||
//buf , m_bufLen,
|
||||
TITLEREC_CURRENT_VERSION, true, true ) )
|
||||
return log("query: Had error parsing query: %s.",
|
||||
mstrerror(g_errno));
|
||||
int32_t numWords = words.getNumWords();
|
||||
// truncate it
|
||||
if ( numWords > MAX_QUERY_WORDS ) {
|
||||
if ( numWords > ABS_MAX_QUERY_WORDS ) {
|
||||
log("query: Had %"INT32" words. Max is %"INT32". Truncating.",
|
||||
numWords,(int32_t)MAX_QUERY_WORDS);
|
||||
numWords = MAX_QUERY_WORDS;
|
||||
numWords,(int32_t)ABS_MAX_QUERY_WORDS);
|
||||
numWords = ABS_MAX_QUERY_WORDS;
|
||||
m_truncated = true;
|
||||
}
|
||||
m_numWords = numWords;
|
||||
@ -2026,8 +2060,8 @@ bool Query::setQWords ( char boolFlag ,
|
||||
|
||||
// is all alpha chars in query in upper case? caps lock on?
|
||||
bool allUpper = true;
|
||||
char *p = m_buf;
|
||||
char *pend = m_buf + m_bufLen;
|
||||
char *p = m_sb.getBufStart();//m_buf;
|
||||
char *pend = m_sb.getBuf(); // m_buf + m_bufLen;
|
||||
for ( ; p < pend ; p += getUtf8CharSize(p) )
|
||||
if ( is_alpha_utf8 ( p ) && ! is_upper_utf8 ( p ) ) {
|
||||
allUpper = false; break; }
|
||||
@ -2127,7 +2161,7 @@ bool Query::setQWords ( char boolFlag ,
|
||||
char *ignoreTill = NULL;
|
||||
|
||||
// loop over all words, these QueryWords are 1-1 with "words"
|
||||
for ( int32_t i = 0 ; i < numWords && i < MAX_QUERY_WORDS ; i++ ) {
|
||||
for ( int32_t i = 0 ; i < numWords && i < ABS_MAX_QUERY_WORDS ; i++ ) {
|
||||
// convenience var, these are 1-1 with "words"
|
||||
QueryWord *qw = &m_qwords[i];
|
||||
// set to defaults?
|
||||
@ -3328,7 +3362,8 @@ bool Query::setQWords ( char boolFlag ,
|
||||
// search up to this far
|
||||
int32_t maxj = i + nw;
|
||||
// but not past our truncated limit
|
||||
if ( maxj > MAX_QUERY_WORDS ) maxj = MAX_QUERY_WORDS;
|
||||
if ( maxj > ABS_MAX_QUERY_WORDS )
|
||||
maxj = ABS_MAX_QUERY_WORDS;
|
||||
|
||||
for ( j = i ; j < maxj ; j++ ) {
|
||||
// skip punct
|
||||
|
26
Query.h
26
Query.h
@ -10,7 +10,9 @@
|
||||
|
||||
// keep these down to save memory
|
||||
//#define MAX_QUERY_LEN 8000 // url:XXX can be quite long! (MAX_URL_LEN)
|
||||
#define MAX_QUERY_LEN 3200
|
||||
//#define MAX_QUERY_LEN 3200
|
||||
// support big OR queries for image shingles
|
||||
#define ABS_MAX_QUERY_LEN 32000
|
||||
// . words need to deal with a long list of sites!
|
||||
// . remember, words can be string of punctuation, too
|
||||
//#define MAX_QUERY_WORDS 5000
|
||||
@ -21,7 +23,8 @@
|
||||
// seems like we alloc just enough to hold our words now so that this
|
||||
// is really a performance capper but it is used in Summary.cpp
|
||||
// and Matches.h so don't go too big just yet
|
||||
#define MAX_QUERY_WORDS 800
|
||||
//#define MAX_QUERY_WORDS 800
|
||||
#define ABS_MAX_QUERY_WORDS 9000
|
||||
|
||||
// . how many IndexLists might we get/intersect
|
||||
// . we now use an int64_t to hold the query term bits for non-boolean queries
|
||||
@ -986,18 +989,27 @@ class Query {
|
||||
class Host *m_groupThatHasDocId;
|
||||
|
||||
// for holding the filtered query, in utf8
|
||||
char m_buf [ MAX_QUERY_LEN ];
|
||||
int32_t m_bufLen;
|
||||
//char m_buf [ MAX_QUERY_LEN ];
|
||||
//int32_t m_bufLen;
|
||||
|
||||
// for holding the filtered query, in utf8
|
||||
SafeBuf m_sb;
|
||||
char m_tmpBuf3[128];
|
||||
|
||||
// for holding the filtered/NULL-terminated query for doing
|
||||
// matching. basically store phrases in here without punct
|
||||
// so we can point a needle to them for matching in XmlDoc.cpp.
|
||||
char m_needleBuf [ MAX_QUERY_LEN + 1 ];
|
||||
int32_t m_needleBufLen;
|
||||
//char m_needleBuf [ MAX_QUERY_LEN + 1 ];
|
||||
//int32_t m_needleBufLen;
|
||||
|
||||
// the original query
|
||||
char m_orig [ MAX_QUERY_LEN ];
|
||||
//char m_orig [ MAX_QUERY_LEN ];
|
||||
//int32_t m_origLen;
|
||||
|
||||
char *m_orig;
|
||||
int32_t m_origLen;
|
||||
SafeBuf m_osb;
|
||||
char m_otmpBuf[128];
|
||||
|
||||
// we just have a ptr to this so don't pull the rug out
|
||||
//char *m_coll;
|
||||
|
10
SafeBuf.h
10
SafeBuf.h
@ -10,6 +10,9 @@
|
||||
* (for java programmers, very similar to the StringBuffer class, with all the speed that c++ allows).
|
||||
* Most of strings in Gigablast are handled by those.
|
||||
*/
|
||||
|
||||
#include "iana_charset.h"
|
||||
|
||||
class SafeBuf {
|
||||
public:
|
||||
//*TRUCTORS
|
||||
@ -33,8 +36,11 @@ public:
|
||||
// want SafeBuf to free the data for you. Keep in mind, all
|
||||
// previous content in SafeBuf will be cleared when you pass it
|
||||
// a new buffer.
|
||||
bool setBuf(char *newBuf, int32_t bufMax, int32_t bytesInUse, bool ownData,
|
||||
int16_t encoding );
|
||||
bool setBuf(char *newBuf,
|
||||
int32_t bufMax,
|
||||
int32_t bytesInUse,
|
||||
bool ownData,
|
||||
int16_t encoding = csUTF8 );
|
||||
// yieldBuf() allows you to take over the buffer in SafeBuf.
|
||||
// You may only free the data if it was originally owned by
|
||||
// the SafeBuf.
|
||||
|
44
Summary.cpp
44
Summary.cpp
@ -12,6 +12,7 @@ Summary::Summary()
|
||||
//m_buf = NULL;
|
||||
m_bitScoresBuf = NULL;
|
||||
m_bitScoresBufSize = 0;
|
||||
m_wordWeights = NULL;
|
||||
reset();
|
||||
}
|
||||
|
||||
@ -36,6 +37,14 @@ void Summary::reset() {
|
||||
m_numExcerpts = 0;
|
||||
m_summaryLocs.reset();
|
||||
m_summaryLocsPops.reset();
|
||||
if ( m_wordWeights && m_wordWeights != (float *)m_tmpBuf ) {
|
||||
mfree ( m_wordWeights , m_wordWeightSize , "sumww");
|
||||
m_wordWeights = NULL;
|
||||
}
|
||||
m_wordWeights = NULL;
|
||||
if ( m_buf && m_buf != m_tmpBuf2 )
|
||||
mfree ( m_buf , m_bufSize , "ssstkb" );
|
||||
m_buf = NULL;
|
||||
}
|
||||
|
||||
|
||||
@ -151,6 +160,15 @@ bool Summary::set2 ( Xml *xml ,
|
||||
end - start );
|
||||
start = gettimeofdayInMilliseconds();*/
|
||||
//
|
||||
int32_t need1 = q->m_numWords * sizeof(float);
|
||||
m_wordWeightSize = need1;
|
||||
if ( need1 < 128 )
|
||||
m_wordWeights = (float *)m_tmpBuf;
|
||||
else
|
||||
m_wordWeights = (float *)mmalloc ( need1 , "wwsum" );
|
||||
if ( ! m_wordWeights ) return false;
|
||||
|
||||
|
||||
|
||||
// zero out all word weights
|
||||
for ( int32_t i = 0 ; i < q->m_numWords; i++ )
|
||||
@ -229,11 +247,25 @@ bool Summary::set2 ( Xml *xml ,
|
||||
pend = m_summary + maxSummaryLen;
|
||||
m_numExcerpts = 0;
|
||||
|
||||
int32_t need2 = (1+1+1) * m_q->m_numWords;
|
||||
m_bufSize = need2;
|
||||
if ( need2 < 128 )
|
||||
m_buf = m_tmpBuf2;
|
||||
else
|
||||
m_buf = (char *)mmalloc ( need2 , "stkbuf" );
|
||||
if ( ! m_buf ) return false;
|
||||
char *x = m_buf;
|
||||
char *retired = x;
|
||||
x += m_q->m_numWords;
|
||||
char *maxGotIt = x;
|
||||
x += m_q->m_numWords;
|
||||
char *gotIt = x;
|
||||
|
||||
// . the "maxGotIt" count vector accumulates into "retired"
|
||||
// . that is how we keep track of what query words we used for previous
|
||||
// summary excerpts so we try to get diversified excerpts with
|
||||
// different query terms/words in them
|
||||
char retired [ MAX_QUERY_WORDS ];
|
||||
//char retired [ MAX_QUERY_WORDS ];
|
||||
memset ( retired, 0, m_q->m_numWords * sizeof(char) );
|
||||
|
||||
// some query words are already matched in the title
|
||||
@ -260,7 +292,7 @@ bool Summary::set2 ( Xml *xml ,
|
||||
int32_t maxb = 0;
|
||||
int32_t maxi = -1;
|
||||
int32_t lasta = -1;
|
||||
char maxGotIt [ MAX_QUERY_WORDS ];
|
||||
//char maxGotIt [ MAX_QUERY_WORDS ];
|
||||
|
||||
if(lastNumFinal == numFinal) {
|
||||
if(maxLoops-- <= 0) {
|
||||
@ -296,7 +328,7 @@ bool Summary::set2 ( Xml *xml ,
|
||||
if ( skip ) continue;
|
||||
|
||||
// ask him for the query words he matched
|
||||
char gotIt [ MAX_QUERY_WORDS ];
|
||||
//char gotIt [ MAX_QUERY_WORDS ];
|
||||
// clear it for him
|
||||
memset ( gotIt, 0, m_q->m_numWords * sizeof(char) );
|
||||
|
||||
@ -558,6 +590,11 @@ bool Summary::set2 ( Xml *xml ,
|
||||
m_displayLen = p - m_summary;
|
||||
}
|
||||
|
||||
// free the mem we used if we allocated it
|
||||
if ( m_buf && m_buf != m_tmpBuf2 )
|
||||
mfree ( m_buf , m_bufSize , "ssstkb" );
|
||||
m_buf = NULL;
|
||||
|
||||
|
||||
// If we still didn't find a summary, get the default summary
|
||||
if ( p == m_summary ) {
|
||||
@ -570,6 +607,7 @@ bool Summary::set2 ( Xml *xml ,
|
||||
maxSummaryLen );
|
||||
if ( m_numDisplayLines > 0 )
|
||||
m_displayLen = m_summaryLen;
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
|
@ -266,7 +266,14 @@ class Summary {
|
||||
|
||||
char *m_bitScoresBuf;
|
||||
int32_t m_bitScoresBufSize;
|
||||
float m_wordWeights[MAX_QUERY_WORDS];
|
||||
//float m_wordWeights[MAX_QUERY_WORDS];
|
||||
float *m_wordWeights;
|
||||
int32_t m_wordWeightSize;
|
||||
char m_tmpBuf[128];
|
||||
|
||||
char *m_buf;
|
||||
int32_t m_bufSize;
|
||||
char m_tmpBuf2[128];
|
||||
|
||||
char m_summaryLocBuf[MAX_SUMMARY_LOCS*sizeof(uint64_t)];
|
||||
SafeBuf m_summaryLocs;
|
||||
|
@ -45266,7 +45266,7 @@ SafeBuf *XmlDoc::getMatchingQueriesScoredForFullQuery ( ) {
|
||||
// prepend to the query?
|
||||
int32_t ulen = m_firstUrl.m_ulen;
|
||||
// go to next guy if this query is too big already
|
||||
if ( ulen + qlen + 10 > MAX_QUERY_LEN ) {
|
||||
if ( ulen + qlen + 10 > ABS_MAX_QUERY_LEN ) {
|
||||
m_queryNum++;
|
||||
goto loop;
|
||||
}
|
||||
|
Reference in New Issue
Block a user