get mike's super long query working

This commit is contained in:
Matt
2015-07-13 14:59:44 -06:00
parent 0e009fa6bc
commit 34ec49e804
13 changed files with 205 additions and 70 deletions

@ -24,6 +24,8 @@
// Default constructor: starts with no match groups and no query-word
// flags buffer, then lets reset() put the rest into its initial state.
Matches::Matches ( ) {
	// reset() frees m_qwordFlags when it is set and is not m_tmpBuf, so
	// the pointer and its alloc size must be defined before that call
	m_qwordFlags       = NULL;
	m_qwordAllocSize   = 0;
	m_numMatchGroups   = 0;
	m_detectSubPhrases = false;
	reset();
}
// Destructor: all cleanup, including freeing a heap-allocated
// m_qwordFlags, is delegated to reset().
Matches::~Matches( ) {
	reset();
}
@ -39,6 +41,10 @@ void Matches::reset ( ) {
m_bitsArray [i].reset();
}
m_numMatchGroups = 0;
if ( m_qwordFlags && m_qwordFlags != (mf_t *)m_tmpBuf ) {
mfree ( m_qwordFlags , m_qwordAllocSize , "mmqw" );
m_qwordFlags = NULL;
}
//m_explicitsMatched = 0;
//m_matchableRequiredBits = 0;
//m_hasAllQueryTerms = false;
@ -103,6 +109,20 @@ void Matches::setQuery ( Query *q ) {
//memset ( m_foundNegTermVector, 0, m_q->getNumTerms() );
if ( m_qwordFlags ) { char *xx=NULL;*xx=0; }
int32_t need = m_q->m_numWords * sizeof(mf_t) ;
m_qwordAllocSize = need;
if ( need < 128 )
m_qwordFlags = (mf_t *)m_tmpBuf;
else
m_qwordFlags = (mf_t *)mmalloc ( need , "mmqf" );
if ( ! m_qwordFlags ) {
log("matches: alloc failed for query %s",q->m_orig);
return;
}
// this is word based. these are each 1 byte
memset ( m_qwordFlags , 0 , m_q->m_numWords * sizeof(mf_t));

@ -183,7 +183,10 @@ class Matches {
// . 1-1 with Query::m_qwords[] array of QWords
// . shows the match flags for that query word
mf_t m_qwordFlags[MAX_QUERY_WORDS];
//mf_t m_qwordFlags[MAX_QUERY_WORDS];
mf_t *m_qwordFlags;
int32_t m_qwordAllocSize;
char m_tmpBuf[128];
//stuff for detecting whether a match is part of a larger phrase
void setSubPhraseDetection();

@ -40,7 +40,9 @@ public:
bool m_isLocal;
//bool m_seq;
bool m_rtq;
char m_q[MAX_QUERY_LEN+1];
//char m_q[MAX_QUERY_LEN+1];
SafeBuf m_qsb;
char m_qtmpBuf[128];
int32_t m_qlen;
char m_boolFlag;
bool m_printed;
@ -98,7 +100,7 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
int32_t qlen = 0;
char *q = r->getString ( "q" , &qlen , NULL /*default*/);
// ensure query not too big
if ( qlen >= MAX_QUERY_LEN-1 ) {
if ( qlen >= ABS_MAX_QUERY_LEN-1 ) {
g_errno=EQUERYTOOBIG;
return g_httpServer.sendErrorReply (s,500 ,mstrerror(g_errno));
}
@ -156,8 +158,16 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
// delete ( st );
// return sendPageNetResult( s );
//}
if ( q && qlen > 0 ) strcpy ( st->m_q , q );
else st->m_q[0] = '\0';
//if ( q && qlen > 0 ) strcpy ( st->m_q , q );
//else st->m_q[0] = '\0';
// Back the query SafeBuf with the small inline buffer in the state.
// Presumably SafeBuf switches to heap memory once the query outgrows
// those 128 bytes -- TODO confirm against SafeBuf::setBuf()/reserve().
st->m_qsb.setBuf ( st->m_qtmpBuf,128,0,false );
st->m_qsb.setLabel ( "qsbpg" );
// save the query
if ( q && qlen > 0 )
	st->m_qsb.safeStrcpy ( q );
// processLoop() reads this buffer back as a NUL-terminated string. The
// fixed-size array this replaces was explicitly set to "" when there
// was no query, so terminate here too; otherwise an empty query would
// leave the reader looking at whatever m_qtmpBuf happens to contain.
st->m_qsb.nullTerm();
st->m_qlen = qlen;
//st->m_seq = seq;
st->m_rtq = rtq;
@ -415,8 +425,8 @@ bool processLoop ( void *state ) {
int32_t startLen2 = sb->length();//p;
// query should be NULL terminated
char *q = st->m_q;
int32_t qlen = st->m_qlen;
char *q = st->m_qsb.getBufStart();
int32_t qlen = st->m_qsb.getLength(); // m_qlen;
char styleTitle[128] = "font-size:14px;font-weight:600;"
"color:#000000;";

@ -2795,13 +2795,14 @@ bool printSearchResultsHeader ( State0 *st ) {
//Highlight h;
st->m_qe[0] = '\0';
//st->m_qe[0] = '\0';
st->m_qesb.nullTerm();
// encode query buf
//char qe[MAX_QUERY_LEN+1];
char *dq = si->m_displayQuery;
//int32_t dqlen = si->m_displayQueryLen;
if ( dq ) urlEncode(st->m_qe,MAX_QUERY_LEN*2,dq,gbstrlen(dq));
if ( dq ) st->m_qesb.urlEncode(dq);
// how many results were requested?
//int32_t docsWanted = msg40->getDocsWanted();
@ -5187,7 +5188,7 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
"get?"
"q=%s&c=%s&d=%"INT64">"
"cached</a>\n",
st->m_qe , coll ,
st->m_qesb.getBufStart() , coll ,
mr->m_docId );
else if ( printCached )
sb->safePrintf ( "<a href=\""
@ -5196,7 +5197,7 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
"qlang=%s&"
"c=%s&d=%"INT64"&cnsp=0\">"
"cached</a>\n",
st->m_qe ,
st->m_qesb.getBufStart() ,
// "qlang" parm
si->m_defaultSortLang,
coll ,
@ -5336,7 +5337,7 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
"d=%"INT64"&"
"cnsp=0\">"
"sections</a>\n",
st->m_qe ,
st->m_qesb.getBufStart() ,
// "qlang" parm
si->m_defaultSortLang,
coll ,
@ -5449,7 +5450,7 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
qq.urlEncode("site:");
qq.urlEncode (hbuf);
qq.urlEncode(" | ");
qq.safeStrcpy(st->m_qe);
qq.safeStrcpy(st->m_qesb.getBufStart());
qq.nullTerm();
// get the original url and add/replace in query
char tmp2[512];

@ -52,7 +52,8 @@ public:
int64_t m_took; // how long it took to get the results
HttpRequest m_hr;
bool m_printedHeaderRow;
char m_qe[MAX_QUERY_LEN+1];
//char m_qe[MAX_QUERY_LEN+1];
SafeBuf m_qesb;
// for printing our search result json items in csv:
HashTableX m_columnTable;

@ -1858,11 +1858,11 @@ bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) {
// . call g_httpServer.sendDynamicPage() to send it
bool sendPageRoot ( TcpSocket *s , HttpRequest *r, char *cookie ) {
// don't allow pages bigger than 128k in cache
char buf [ 10*1024 + MAX_QUERY_LEN ];
char buf [ 10*1024 ];//+ MAX_QUERY_LEN ];
// a ptr into "buf"
//char *p = buf;
//char *pend = buf + 10*1024 + MAX_QUERY_LEN - 100 ;
SafeBuf sb(buf, 10*1024 + MAX_QUERY_LEN);
SafeBuf sb(buf, 10*1024 );//+ MAX_QUERY_LEN);
// print bgcolors, set focus, set font style
//p = g_httpServer.printFocus ( p , pend );
//p = g_httpServer.printColors ( p , pend );

@ -29,7 +29,9 @@ typedef float rscore_t;
#define MINSCORE 1
#define MIN_SAVE_SIZE 100
#define PQR_BUF_SIZE MAX_QUERY_LEN
// we don't use this any more so make it compile
//#define PQR_BUF_SIZE MAX_QUERY_LEN
#define PQR_BUF_SIZE 64
class PostQueryRerank {
public:

113
Query.cpp

@ -68,9 +68,11 @@ void Query::reset ( ) {
qt->m_facetIndexBuf.purge();
}
m_sb.purge();
m_osb.purge();
m_docIdRestriction = 0LL;
m_groupThatHasDocId = NULL;
m_bufLen = 0;
//m_bufLen = 0;
m_origLen = 0;
m_numWords = 0;
//m_numOperands = 0;
@ -160,17 +162,26 @@ bool Query::set2 ( char *query ,
//m_coll = coll;
//m_collLen = collLen;
// truncate query if too big
if ( queryLen >= MAX_QUERY_LEN ) {
log("query: Query length of %"INT32" must be less than %"INT32". "
"Truncating.",queryLen,(int32_t)MAX_QUERY_LEN);
queryLen = MAX_QUERY_LEN - 1;
if ( queryLen >= ABS_MAX_QUERY_LEN ) {
log("query: Query length of %"INT32" must be "
"less than %"INT32". "
"Truncating.",queryLen,(int32_t)ABS_MAX_QUERY_LEN);
queryLen = ABS_MAX_QUERY_LEN - 1;
m_truncated = true;
}
// save original query
m_osb.setBuf ( m_otmpBuf , 128 , 0 , false );
m_osb.setLabel ("oqbuf" );
m_osb.reserve ( queryLen + 1 );
m_osb.safeMemcpy ( query , queryLen );
m_osb.nullTerm ();
m_origLen = queryLen;
gbmemcpy ( m_orig , query , queryLen );
m_orig [ m_origLen ] = '\0';
//m_origLen = queryLen;
//gbmemcpy ( m_orig , query , queryLen );
//m_orig [ m_origLen ] = '\0';
m_orig = m_osb.getBufStart();
m_origLen = m_osb.getLength();
log(LOG_DEBUG, "query: set called = %s", m_orig);
@ -204,9 +215,16 @@ bool Query::set2 ( char *query ,
// that were set somewhere above!!! i moved top: label above!
//reset();
// reserve some space, guessing how much we'd need
m_sb.setBuf(m_tmpBuf3,128,0,false);
m_sb.setLabel("qrystk");
int32_t need = queryLen * 2 + 32;
if ( ! m_sb.reserve ( need ) )
return false;
// convenience ptr
char *p = m_buf;
char *pend = m_buf + MAX_QUERY_LEN;
//char *p = m_buf;
//char *pend = m_buf + MAX_QUERY_LEN;
bool inQuotesFlag = false;
// . copy query into m_buf
// . translate ( and ) to special query operators so Words class
@ -219,27 +237,31 @@ bool Query::set2 ( char *query ,
if ( query[i] == '\"' ) inQuotesFlag = !inQuotesFlag;
if ( inQuotesFlag ) {
*p = query [i];
p++;
//*p = query [i];
//p++;
m_sb.pushChar(query[i]);
continue;
}
// dst buf must be big enough
if ( p + 8 >= pend ) {
g_errno = EBUFTOOSMALL;
return log(LOG_LOGIC,"query: query: query too big.");
}
// if ( p + 8 >= pend ) {
// g_errno = EBUFTOOSMALL;
// return log(LOG_LOGIC,"query: query: query too big.");
// }
// translate ( and )
if ( boolFlag == 1 && query[i] == '(' ) {
gbmemcpy ( p , " LeFtP " , 7 ); p += 7;
//gbmemcpy ( p , " LeFtP " , 7 ); p += 7;
m_sb.safeMemcpy ( " LeFtP " , 7 );
continue;
}
if ( boolFlag == 1 && query[i] == ')' ) {
gbmemcpy ( p , " RiGhP " , 7 ); p += 7;
//gbmemcpy ( p , " RiGhP " , 7 ); p += 7;
m_sb.safeMemcpy ( " RiGhP " , 7 );
continue;
}
if ( query[i] == '|' ) {
gbmemcpy ( p , " PiiPE " , 7 ); p += 7;
//gbmemcpy ( p , " PiiPE " , 7 ); p += 7;
m_sb.safeMemcpy ( " PiiPE " , 7 );
continue;
}
// translate [#a] [#r] [#ap] [#rp] [] [p] to operators
@ -249,28 +271,34 @@ bool Query::set2 ( char *query ,
while ( is_digit(query[j]) ) j++;
char c = query[j];
if ( (c == 'a' || c == 'r') && query[j+1]==']' ) {
sprintf ( p , " LeFtB %"INT32" %c RiGhB ",val,c);
p += gbstrlen(p);
//sprintf ( p , " LeFtB %"INT32" %c RiGhB ",
m_sb.safePrintf(" LeFtB %"INT32" %c RiGhB ",
val,c);
//p += gbstrlen(p);
i = j + 1;
continue;
}
else if ( (c == 'a' || c == 'r') &&
query[j+1]=='p' && query[j+2]==']') {
sprintf ( p , " LeFtB %"INT32" %cp RiGhB ",val,c);
p += gbstrlen(p);
//sprintf ( p , " LeFtB %"INT32" %cp RiGhB ",
m_sb.safePrintf(" LeFtB %"INT32" %cp RiGhB ",
val,c);
//p += gbstrlen(p);
i = j + 2;
continue;
}
}
if ( query[i] == '[' && query[i+1] == ']' ) {
sprintf ( p , " LeFtB RiGhB ");
p += gbstrlen(p);
//sprintf ( p , " LeFtB RiGhB ");
//p += gbstrlen(p);
m_sb.safePrintf ( " LeFtB RiGhB ");
i = i + 1;
continue;
}
if ( query[i] == '[' && query[i+1] == 'p' && query[i+2]==']') {
sprintf ( p , " LeFtB RiGhB ");
p += gbstrlen(p);
//sprintf ( p , " LeFtB RiGhB ");
//p += gbstrlen(p);
m_sb.safePrintf ( " LeFtB RiGhB ");
i = i + 2;
continue;
}
@ -306,17 +334,22 @@ bool Query::set2 ( char *query ,
// TODO: copy altavista's operators here? & | !
// otherwise, just a plain copy
*p = query [i];
p++;
// *p = query [i];
// p++;
m_sb.pushChar ( query[i] );
}
// NULL terminate
*p = '\0';
//*p = '\0';
m_sb.nullTerm();
// debug statement
//log(LOG_DEBUG,"Query: Got new query=%s",tempBuf);
//printf("query: query: Got new query=%s\n",tempBuf);
// set length
m_bufLen = p - m_buf;
//m_bufLen = p - m_buf;
//m_buf = m_sb.getBufStart();
//m_bufLen = m_sb.length();
Words words;
Phrases phrases;
@ -1991,16 +2024,17 @@ bool Query::setQWords ( char boolFlag ,
// . because we now deal with boolean queries, we make parentheses
// their own separate Word, so tell "words" we're setting a query
//Words words;
if ( ! words.set ( m_buf , m_bufLen,
if ( ! words.set ( m_sb.getBufStart() , m_sb.length() ,
//buf , m_bufLen,
TITLEREC_CURRENT_VERSION, true, true ) )
return log("query: Had error parsing query: %s.",
mstrerror(g_errno));
int32_t numWords = words.getNumWords();
// truncate it
if ( numWords > MAX_QUERY_WORDS ) {
if ( numWords > ABS_MAX_QUERY_WORDS ) {
log("query: Had %"INT32" words. Max is %"INT32". Truncating.",
numWords,(int32_t)MAX_QUERY_WORDS);
numWords = MAX_QUERY_WORDS;
numWords,(int32_t)ABS_MAX_QUERY_WORDS);
numWords = ABS_MAX_QUERY_WORDS;
m_truncated = true;
}
m_numWords = numWords;
@ -2026,8 +2060,8 @@ bool Query::setQWords ( char boolFlag ,
// is all alpha chars in query in upper case? caps lock on?
bool allUpper = true;
char *p = m_buf;
char *pend = m_buf + m_bufLen;
char *p = m_sb.getBufStart();//m_buf;
char *pend = m_sb.getBuf(); // m_buf + m_bufLen;
for ( ; p < pend ; p += getUtf8CharSize(p) )
if ( is_alpha_utf8 ( p ) && ! is_upper_utf8 ( p ) ) {
allUpper = false; break; }
@ -2127,7 +2161,7 @@ bool Query::setQWords ( char boolFlag ,
char *ignoreTill = NULL;
// loop over all words, these QueryWords are 1-1 with "words"
for ( int32_t i = 0 ; i < numWords && i < MAX_QUERY_WORDS ; i++ ) {
for ( int32_t i = 0 ; i < numWords && i < ABS_MAX_QUERY_WORDS ; i++ ) {
// convenience var, these are 1-1 with "words"
QueryWord *qw = &m_qwords[i];
// set to defaults?
@ -3328,7 +3362,8 @@ bool Query::setQWords ( char boolFlag ,
// search up to this far
int32_t maxj = i + nw;
// but not past our truncated limit
if ( maxj > MAX_QUERY_WORDS ) maxj = MAX_QUERY_WORDS;
if ( maxj > ABS_MAX_QUERY_WORDS )
maxj = ABS_MAX_QUERY_WORDS;
for ( j = i ; j < maxj ; j++ ) {
// skip punct

26
Query.h

@ -10,7 +10,9 @@
// keep these down to save memory
//#define MAX_QUERY_LEN 8000 // url:XXX can be quite long! (MAX_URL_LEN)
#define MAX_QUERY_LEN 3200
//#define MAX_QUERY_LEN 3200
// support big OR queries for image shingles
#define ABS_MAX_QUERY_LEN 32000
// . words need to deal with int32_t list of sites!
// . remember, words can be string of punctuation, too
//#define MAX_QUERY_WORDS 5000
@ -21,7 +23,8 @@
// seems like we alloc just enough to hold our words now so that this
// is really a performance capper but it is used in Summary.cpp
// and Matches.h so don't go too big just yet
#define MAX_QUERY_WORDS 800
//#define MAX_QUERY_WORDS 800
#define ABS_MAX_QUERY_WORDS 9000
// . how many IndexLists might we get/intersect
// . we now use a int64_t to hold the query term bits for non-boolean queries
@ -986,18 +989,27 @@ class Query {
class Host *m_groupThatHasDocId;
// for holding the filtered query, in utf8
char m_buf [ MAX_QUERY_LEN ];
int32_t m_bufLen;
//char m_buf [ MAX_QUERY_LEN ];
//int32_t m_bufLen;
// for holding the filtered query, in utf8
SafeBuf m_sb;
char m_tmpBuf3[128];
// for holding the filtered/NULL-terminated query for doing
// matching. basically store phrases in here without punct
// so we can point a needle to them for matching in XmlDoc.cpp.
char m_needleBuf [ MAX_QUERY_LEN + 1 ];
int32_t m_needleBufLen;
//char m_needleBuf [ MAX_QUERY_LEN + 1 ];
//int32_t m_needleBufLen;
// the original query
char m_orig [ MAX_QUERY_LEN ];
//char m_orig [ MAX_QUERY_LEN ];
//int32_t m_origLen;
char *m_orig;
int32_t m_origLen;
SafeBuf m_osb;
char m_otmpBuf[128];
// we just have a ptr to this so don't pull the rug out
//char *m_coll;

@ -10,6 +10,9 @@
* (for java programmers, very similar to the StringBuffer class, with all the speed that c++ allows).
* Most of strings in Gigablast are handled by those.
*/
#include "iana_charset.h"
class SafeBuf {
public:
//*TRUCTORS
@ -33,8 +36,11 @@ public:
// want SafeBuf to free the data for you. Keep in mind, all
// previous content in SafeBuf will be cleared when you pass it
// a new buffer.
bool setBuf(char *newBuf, int32_t bufMax, int32_t bytesInUse, bool ownData,
int16_t encoding );
bool setBuf(char *newBuf,
int32_t bufMax,
int32_t bytesInUse,
bool ownData,
int16_t encoding = csUTF8 );
// yieldBuf() allows you to take over the buffer in SafeBuf.
// You may only free the data if it was originally owned by
// the SafeBuf.

@ -12,6 +12,7 @@ Summary::Summary()
//m_buf = NULL;
m_bitScoresBuf = NULL;
m_bitScoresBufSize = 0;
m_wordWeights = NULL;
reset();
}
@ -36,6 +37,14 @@ void Summary::reset() {
m_numExcerpts = 0;
m_summaryLocs.reset();
m_summaryLocsPops.reset();
// drop the word weights: only a heap block goes back to mfree(), the
// inline m_tmpBuf is just forgotten
if ( m_wordWeights ) {
	if ( m_wordWeights != (float *)m_tmpBuf )
		mfree ( m_wordWeights , m_wordWeightSize , "sumww");
	m_wordWeights = NULL;
}
// same treatment for the scratch buffer, which may alias m_tmpBuf2
if ( m_buf ) {
	if ( m_buf != m_tmpBuf2 )
		mfree ( m_buf , m_bufSize , "ssstkb" );
	m_buf = NULL;
}
}
@ -151,6 +160,15 @@ bool Summary::set2 ( Xml *xml ,
end - start );
start = gettimeofdayInMilliseconds();*/
//
int32_t need1 = q->m_numWords * sizeof(float);
m_wordWeightSize = need1;
if ( need1 < 128 )
m_wordWeights = (float *)m_tmpBuf;
else
m_wordWeights = (float *)mmalloc ( need1 , "wwsum" );
if ( ! m_wordWeights ) return false;
// zero out all word weights
for ( int32_t i = 0 ; i < q->m_numWords; i++ )
@ -229,11 +247,25 @@ bool Summary::set2 ( Xml *xml ,
pend = m_summary + maxSummaryLen;
m_numExcerpts = 0;
int32_t need2 = (1+1+1) * m_q->m_numWords;
m_bufSize = need2;
if ( need2 < 128 )
m_buf = m_tmpBuf2;
else
m_buf = (char *)mmalloc ( need2 , "stkbuf" );
if ( ! m_buf ) return false;
char *x = m_buf;
char *retired = x;
x += m_q->m_numWords;
char *maxGotIt = x;
x += m_q->m_numWords;
char *gotIt = x;
// . the "maxGotIt" count vector accumulates into "retired"
// . that is how we keep track of what query words we used for previous
// summary excerpts so we try to get diversified excerpts with
// different query terms/words in them
char retired [ MAX_QUERY_WORDS ];
//char retired [ MAX_QUERY_WORDS ];
memset ( retired, 0, m_q->m_numWords * sizeof(char) );
// some query words are already matched in the title
@ -260,7 +292,7 @@ bool Summary::set2 ( Xml *xml ,
int32_t maxb = 0;
int32_t maxi = -1;
int32_t lasta = -1;
char maxGotIt [ MAX_QUERY_WORDS ];
//char maxGotIt [ MAX_QUERY_WORDS ];
if(lastNumFinal == numFinal) {
if(maxLoops-- <= 0) {
@ -296,7 +328,7 @@ bool Summary::set2 ( Xml *xml ,
if ( skip ) continue;
// ask him for the query words he matched
char gotIt [ MAX_QUERY_WORDS ];
//char gotIt [ MAX_QUERY_WORDS ];
// clear it for him
memset ( gotIt, 0, m_q->m_numWords * sizeof(char) );
@ -558,6 +590,11 @@ bool Summary::set2 ( Xml *xml ,
m_displayLen = p - m_summary;
}
// free the mem we used if we allocated it
if ( m_buf && m_buf != m_tmpBuf2 )
mfree ( m_buf , m_bufSize , "ssstkb" );
m_buf = NULL;
// If we still didn't find a summary, get the default summary
if ( p == m_summary ) {
@ -570,6 +607,7 @@ bool Summary::set2 ( Xml *xml ,
maxSummaryLen );
if ( m_numDisplayLines > 0 )
m_displayLen = m_summaryLen;
return status;
}

@ -266,7 +266,14 @@ class Summary {
char *m_bitScoresBuf;
int32_t m_bitScoresBufSize;
float m_wordWeights[MAX_QUERY_WORDS];
//float m_wordWeights[MAX_QUERY_WORDS];
float *m_wordWeights;
int32_t m_wordWeightSize;
char m_tmpBuf[128];
char *m_buf;
int32_t m_bufSize;
char m_tmpBuf2[128];
char m_summaryLocBuf[MAX_SUMMARY_LOCS*sizeof(uint64_t)];
SafeBuf m_summaryLocs;

@ -45266,7 +45266,7 @@ SafeBuf *XmlDoc::getMatchingQueriesScoredForFullQuery ( ) {
// prepend to the query?
int32_t ulen = m_firstUrl.m_ulen;
// go to next guy if this query is too big already
if ( ulen + qlen + 10 > MAX_QUERY_LEN ) {
if ( ulen + qlen + 10 > ABS_MAX_QUERY_LEN ) {
m_queryNum++;
goto loop;
}