Merge branch 'diffbot' of github.com:gigablast/open-source-search-engine into diffbot

This commit is contained in:
Matt Wells 2013-10-24 17:59:22 -07:00
commit 129937168d
8 changed files with 227 additions and 133 deletions

@ -73,8 +73,9 @@ char s_termList[1024];
// . content must be NULL terminated
// . if "useAnchors" is true we do click and scroll
// . if "isQueryTerms" is true, we do typical anchors in a special way
long Highlight::set ( char *buf ,
long bufLen ,
long Highlight::set ( SafeBuf *sb,
//char *buf ,
//long bufLen ,
char *content ,
long contentLen ,
// primary language of the document (for synonyms)
@ -119,8 +120,9 @@ long Highlight::set ( char *buf ,
// store
m_numMatches = matches.getNumMatches();
return set ( buf ,
bufLen ,
return set ( sb ,
//buf ,
//bufLen ,
&words ,
&matches ,
doStemming ,
@ -133,8 +135,9 @@ long Highlight::set ( char *buf ,
}
// New version
long Highlight::set ( char *buf ,
long bufLen ,
long Highlight::set ( SafeBuf *sb ,
//char *buf ,
//long bufLen ,
Words *words ,
Matches *matches ,
bool doStemming ,
@ -162,18 +165,20 @@ long Highlight::set ( char *buf ,
if ( m_frontTag ) m_frontTagLen = gbstrlen ( frontTag );
if ( m_backTag ) m_backTagLen = gbstrlen ( backTag );
// point to buffer to store highlighted text into
m_buf = buf;
m_bufLen = bufLen;
m_bufPtr = buf;
//m_buf = buf;
//m_bufLen = bufLen;
//m_bufPtr = buf;
m_sb = sb;
// save room for terminating \0
m_bufEnd = m_buf + m_bufLen - 1;
//m_bufEnd = m_buf + m_bufLen - 1;
if ( ! highlightWords ( words, matches, q ) ) return 0;
// null terminate
*m_bufPtr = '\0';
//*m_bufPtr = '\0';
m_sb->nullTerm();
// return the length
return m_bufPtr - m_buf;
return m_sb->length();//m_bufPtr - m_buf;
}
bool Highlight::highlightWords ( Words *words , Matches *m, Query *q ) {
@ -228,6 +233,7 @@ bool Highlight::highlightWords ( Words *words , Matches *m, Query *q ) {
endHead = false;
endHtml = false;
// bail now if out of room
/*
if ( m_bufPtr + MAX_URL_LEN + 1024 + wlen >= m_bufEnd ) {
// don't spam the logs
static long long s_lastTime = 0;
@ -238,6 +244,7 @@ bool Highlight::highlightWords ( Words *words , Matches *m, Query *q ) {
s_lastTime = now;
return true;
}
*/
if ( (words->getTagId(i) ) == TAG_TITLE ) { //<TITLE>
if ( words->isBackTag(i) ) inTitle = false;
else inTitle = true;
@ -282,8 +289,9 @@ bool Highlight::highlightWords ( Words *words , Matches *m, Query *q ) {
//else frontTag = s_frontTags [ p[i] % 10];
else frontTag =s_frontTags[mat->m_colorNum%10];
// OK...this is UTF-8 output, and ASCII Text
strcpy ( m_bufPtr , frontTag );
m_bufPtr += frontTagLen;
//strcpy ( m_bufPtr , frontTag );
//m_bufPtr += frontTagLen;
m_sb->safeStrcpy ( (char *)frontTag );
//log(LOG_DEBUG,
// "Highlight: starting phrase %d at word %d\n",
// p[i], i);
@ -296,8 +304,9 @@ bool Highlight::highlightWords ( Words *words , Matches *m, Query *q ) {
else if ( endHead ) {
// include the tags style sheet immediately before
// the closing </TITLE> tag
memcpy( m_bufPtr, s_styleSheet, s_styleSheetLen );
m_bufPtr += s_styleSheetLen;
//memcpy( m_bufPtr, s_styleSheet, s_styleSheetLen );
m_sb->safeMemcpy( s_styleSheet , s_styleSheetLen );
//m_bufPtr += s_styleSheetLen;
}
//else if ( endHtml ) {
// ;
@ -326,14 +335,16 @@ bool Highlight::highlightWords ( Words *words , Matches *m, Query *q ) {
// write the alnum word
//m_bufPtr +=latin1ToUtf8(m_bufPtr, m_bufEnd-m_bufPtr,w, wlen);
// everything is utf8 now
memcpy ( m_bufPtr, w , wlen );
m_bufPtr += wlen;
//memcpy ( m_bufPtr, w , wlen );
//m_bufPtr += wlen;
m_sb->safeMemcpy ( w , wlen );
// back tag
if ( i == backTagi-1 ) {
// store the back tag
memcpy ( m_bufPtr , backTag , backTagLen );
m_bufPtr += backTagLen ;
//memcpy ( m_bufPtr , backTag , backTagLen );
//m_bufPtr += backTagLen ;
m_sb->safeMemcpy ( (char *)backTag , backTagLen );
//log(LOG_DEBUG,
// "Highlight: ending phrase %d at word %d\n",
// p[i], i);

@ -19,8 +19,9 @@ class Highlight {
// . we highlight Query "q" in "xml" as best as we can
// . store highlighted text into "buf"
// . return length stored into "buf"
long set ( char *buf ,
long bufLen ,
long set ( //char *buf ,
//long bufLen ,
SafeBuf *sb,
char *content ,
long contentLen ,
char docLangId ,
@ -33,8 +34,9 @@ class Highlight {
long fieldCode , // = 0 ,
long niceness ) ;
long set ( char *buf ,
long bufLen ,
long set ( //char *buf ,
//long bufLen ,
SafeBuf *sb ,
Words *words ,
Matches *matches ,
bool doStemming ,
@ -52,10 +54,11 @@ class Highlight {
bool highlightWords ( Words *words , Matches *m , Query *q=NULL );
// null terminate and store the highlighted content in m_buf
char *m_buf ;
long m_bufLen;
char *m_bufPtr;
char *m_bufEnd;
//char *m_buf ;
//long m_bufLen;
//char *m_bufPtr;
//char *m_bufEnd;
class SafeBuf *m_sb;
//Words m_words;
Matches m_matches;

@ -194,6 +194,7 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
// skip known directives
if ( ! strncmp(p,"port-offset:",12) ||
! strncmp(p,"index-splits:",13) ||
! strncmp(p,"num-mirrors:",12) ||
! strncmp(p,"working-dir:",12) )
p = p;
// check if this is a spare host
@ -243,13 +244,14 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
if ( ! m_hosts ) return log(
"conf: Memory allocation failed.");
unsigned long maxShard = 0;
//unsigned long maxShard = 0;
long numGrunts = 0;
// now fill up m_hosts
p = m_buf;
i = 0;
long line = 1;
unsigned long lastShard = 0;
//unsigned long lastShard = 0;
long proxyNum = 0;
// assume defaults
@ -257,6 +259,7 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
long indexSplits = 0;
char *wdir2 = NULL;
long wdirlen2 = 0;
long numMirrors = -1;
for ( ; *p ; p++ , line++ ) {
if ( is_wspace_a (*p) ) continue;
@ -273,6 +276,15 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
continue;
}
if ( ! strncmp(p,"num-mirrors:",12) ) {
p += 12;
// skip spaces after the colon
while ( is_wspace_a(*p) ) p++;
numMirrors = atol(p);
while ( *p && *p != '\n' ) p++;
continue;
}
// does the line say "working-dir: xxxx" ?
if ( ! strncmp(p,"working-dir:",12) ) {
p += 12;
@ -351,13 +363,6 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
// skip numeric hostid or "proxy" keyword
while ( ! is_wspace_a(*p) ) p++;
if ( indexSplits == 0 ) {
g_errno = EBADENGINEER;
log("admin: need index-splits: xxx directive "
"in hosts.conf");
return false;
}
// read in switch id
//h->m_switchId = atoi(p);
@ -590,7 +595,7 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
// our group is based on our split!
//h->m_group = i % g_hostdb.m_indexSplits; // # grps
//h->m_group = i % indexSplits; // # grps
h->m_shardNum = i % indexSplits;
//h->m_shardNum = i % indexSplits;
// i guess proxy and spares don't count
if ( h->m_type != HT_GRUNT ) h->m_shardNum = 0;
@ -665,9 +670,12 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
h->m_externalHttpsPort = h->m_httpsPort;
// get max group number
if ( h->m_shardNum > maxShard && h->m_type==HT_GRUNT )
maxShard = h->m_shardNum;
//if ( h->m_shardNum > maxShard && h->m_type==HT_GRUNT )
// maxShard = h->m_shardNum;
if ( h->m_type == HT_GRUNT )
numGrunts++;
/*
if ( h->m_shardNum <= lastShard && h->m_shardNum != 0
&& !(h->m_type&(HT_ALL_PROXIES)) ) {
g_errno = EBADENGINEER;
@ -678,6 +686,7 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
filename,line);
}
lastShard = h->m_shardNum;
*/
// skip line now
while ( *p && *p != '\n' )
@ -742,10 +751,46 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
//m_numHosts = i;
m_numTotalHosts = i;
// how many shards are we configured for?
m_numShards = maxShard + 1; // g_conf.m_numGroups;
//m_numShards = maxShard + 1; // g_conf.m_numGroups;
// # of mirrors is zero if no mirrors,
// if it is 1 then each host has ONE MIRROR host
if ( numMirrors == 0 )
indexSplits = numGrunts;
if ( numMirrors > 0 )
indexSplits = numGrunts / (numMirrors+1);
if ( indexSplits == 0 ) {
g_errno = EBADENGINEER;
log("admin: need num-mirrors: xxx or "
"index-splits: xxx directive "
"in hosts.conf");
return false;
}
numMirrors = (numGrunts / indexSplits) - 1 ;
if ( numMirrors < 0 ) {
g_errno = EBADENGINEER;
log("admin: need num-mirrors: xxx or "
"index-splits: xxx directive "
"in hosts.conf (2)");
return false;
}
m_indexSplits = indexSplits;
m_numShards = numGrunts / (numMirrors+1);
//
// set Host::m_shardNum
//
for ( long i = 0 ; i < numGrunts ; i++ ) {
Host *h = &m_hosts[i];
h->m_shardNum = i % indexSplits;
}
// assign spare hosts
if ( m_numSpareHosts > MAX_SPARES ) {
log ( "conf: Number of spares (%li) exceeds max of %i, "

@ -305,23 +305,24 @@ bool processLoop ( void *state ) {
SafeBuf sb;
// alloc buffer now
char *buf = NULL;
long bufMaxSize = 0;
//char *buf = NULL;
//long bufMaxSize = 0;
//bufMaxSize = len + ( 32 * 1024 ) ;
bufMaxSize = contentLen + ( 32 * 1024 ) ;
buf = (char *)mmalloc ( bufMaxSize , "PageGet2" );
char *p = buf;
char *bufEnd = buf + bufMaxSize;
if ( ! buf ) {
return sendErrorReply ( st , g_errno );
}
//bufMaxSize = contentLen + ( 32 * 1024 ) ;
//buf = (char *)mmalloc ( bufMaxSize , "PageGet2" );
//char *p = buf;
//char *bufEnd = buf + bufMaxSize;
//if ( ! buf ) {
// return sendErrorReply ( st , g_errno );
//}
// for undoing the header
char *start1 = p;
//char *start1 = p;
long startLen1 = sb.length();
// we are always utf8
if ( strip != 2 )
p += sprintf(p, "<meta http-equiv=\"Content-Type\" "
sb.safePrintf( "<meta http-equiv=\"Content-Type\" "
"content=\"text/html;charset=utf8\">\n");
// base href
@ -332,20 +333,21 @@ bool processLoop ( void *state ) {
if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl;
//Url *redir = *xd->getRedirUrl();
if ( strip != 2 ) {
sprintf ( p , "<BASE HREF=\"%s\">" , base );
p += gbstrlen ( p );
sb.safePrintf ( "<BASE HREF=\"%s\">" , base );
//p += gbstrlen ( p );
}
// default colors in case css files missing
if ( strip != 2 ) {
sprintf ( p , "\n<style type=\"text/css\">\n"
sb.safePrintf( "\n<style type=\"text/css\">\n"
"body{background-color:white;color:black;}\n"
"</style>\n");
p += gbstrlen ( p );
//p += gbstrlen ( p );
}
char *start2 = p;
// for undoing the stuff below
long startLen2 = sb.length();//p;
// query should be NULL terminated
char *q = st->m_q;
@ -376,7 +378,7 @@ bool processLoop ( void *state ) {
// CNS: if ( ! st->m_clickNScroll ) {
if ( printDisclaimer ) {
sprintf ( p ,
sb.safePrintf(//sprintf ( p ,
//"<BASE HREF=\"%s\">"
//"<table border=1 width=100%%>"
//"<tr><td>"
@ -394,28 +396,31 @@ bool processLoop ( void *state ) {
"<a href=\"%s\" style=\"%s\">%s</a>"
"" , styleTitle, f->getUrl(), styleLink,
f->getUrl() );
p += gbstrlen ( p );
//p += gbstrlen ( p );
// then the rest
sprintf(p ,
//sprintf(p ,
sb.safePrintf(
"<span style=\"%s\">. "
"Gigablast is not responsible for the content of "
"this page.</span>", styleTitle );
p += gbstrlen ( p );
//p += gbstrlen ( p );
sprintf ( p , "<br/><span style=\"%s\">"
sb.safePrintf ( "<br/><span style=\"%s\">"
"Cached: </span>"
"<span style=\"%s\">",
styleTitle, styleText );
p += gbstrlen ( p );
//p += gbstrlen ( p );
// then the spider date in GMT
time_t lastSpiderDate = xd->m_spideredTime;
struct tm *timeStruct = gmtime ( &lastSpiderDate );
strftime ( p, 100,"%b %d, %Y UTC", timeStruct);
p += gbstrlen ( p );
char tbuf[100];
strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
//p += gbstrlen ( p );
sb.safeStrcpy(tbuf);
// Moved over from PageResults.cpp
p += sprintf (p, "</span> - <a href=\""
sb.safePrintf( "</span> - <a href=\""
"/get?"
"q=%s&amp;c=%s&amp;rtq=%li&amp;"
"d=%lli&amp;strip=1\""
@ -427,7 +432,7 @@ bool processLoop ( void *state ) {
// a link to alexa
if ( f->getUrlLen() > 5 ) {
p += sprintf (p, " - <a href=\"http:"
sb.safePrintf( " - <a href=\"http:"
"//web.archive.org/web/*/%s\""
" style=\"%s\">"
"[older copies]</a>" ,
@ -435,29 +440,29 @@ bool processLoop ( void *state ) {
}
if (st->m_noArchive){
p += sprintf(p, " - <span style=\"%s\"><b>"
sb.safePrintf( " - <span style=\"%s\"><b>"
"[NOARCHIVE]</b></span>",
styleTell );
}
if (st->m_isBanned){
p += sprintf(p, " - <span style=\"%s\"><b>"
sb.safePrintf(" - <span style=\"%s\"><b>"
"[BANNED]</b></span>",
styleTell );
}
// only print this if we got a query
if ( qlen > 0 ) {
sprintf (p,"<br/><br/><span style=\"%s\"> "
sb.safePrintf("<br/><br/><span style=\"%s\"> "
"These search terms have been "
"highlighted: ",
styleText );
p += gbstrlen ( p );
//p += gbstrlen ( p );
}
}
// how much space left in p?
long avail = bufEnd - p;
//long avail = bufEnd - p;
// . make the url that we're outputting for (like in PageResults.cpp)
// . "thisUrl" is the baseUrl for click & scroll
char thisUrl[MAX_URL_LEN];
@ -487,7 +492,7 @@ bool processLoop ( void *state ) {
//sprintf ( x, "&seq=%li&rtq=%lid=%lli",
// (long)st->m_seq,(long)st->m_rtq,st->m_msg22.getDocId());
sprintf ( x, "&d=%lli",st->m_docId );
x += gbstrlen(p);
x += gbstrlen(x);
// set our query for highlighting
Query qq;
qq.set2 ( q, st->m_langId , true );
@ -523,16 +528,18 @@ bool processLoop ( void *state ) {
// CNS: if ( ! st->m_clickNScroll ) {
// and highlight the matches
if ( printDisclaimer ) {
hilen = hi.set ( p ,
avail ,
hilen = hi.set ( //p ,
//avail ,
&sb ,
&qw , // words to highlight
&m , // matches relative to qw
false , // doSteming
false , // st->m_clickAndScroll ,
(char *)thisUrl );// base url for ClcknScrll
p += hilen;
//p += hilen;
// now an hr
memcpy ( p , "</span></table></table>\n" , 24 ); p += 24;
//memcpy ( p , "</span></table></table>\n" , 24 ); p += 24;
sb.safeStrcpy("</span></table></table>\n");
}
@ -547,8 +554,8 @@ bool processLoop ( void *state ) {
if ( ! includeHeader ) {
// including base href is off by default when not including
// the header, so the caller must explicitly turn it back on
if ( st->m_includeBaseHref ) p = start2;
else p = start1;
if ( st->m_includeBaseHref ) sb.m_length=startLen2;//p=start2;
else sb.m_length=startLen1;//p=start1;
}
// identify start of <title> tag we wrote out
@ -596,7 +603,8 @@ bool processLoop ( void *state ) {
char *ebuf = st->m_r.getString("eb");
if ( ! ebuf ) ebuf = "";
p += sprintf ( p ,
//p += sprintf ( p ,
sb.safePrintf(
"<table border=1 "
"cellpadding=10 "
"cellspacing=0 "
@ -606,7 +614,7 @@ bool processLoop ( void *state ) {
long printLinks = st->m_r.getLong("links",0);
if ( ! printDisclaimer && printLinks )
p += sprintf ( p ,
sb.safePrintf(//p += sprintf ( p ,
// first put cached and live link
"<tr>"
"<td bgcolor=lightyellow>"
@ -637,7 +645,7 @@ bool processLoop ( void *state ) {
);
if ( printLinks ) {
p += sprintf ( p ,
sb.safePrintf(//p += sprintf ( p ,
"<tr><td bgcolor=pink>"
"<span style=\"font-size:18px;"
"font-weight:600;"
@ -646,11 +654,11 @@ bool processLoop ( void *state ) {
"<b>PAGE TITLE:</b> "
);
long tlen = titleEnd - titleStart;
memcpy ( p , titleStart , tlen ); p += tlen;
p += sprintf ( p , "</span></td></tr>" );
sb.safeMemcpy ( titleStart , tlen );
sb.safePrintf ( "</span></td></tr>" );
}
p += sprintf ( p , "</table><br>\n" );
sb.safePrintf( "</table><br>\n" );
}
@ -661,9 +669,9 @@ bool processLoop ( void *state ) {
if ( ctype == CT_DOC ) pre = true ; // filtered msword
if ( ctype == CT_PS ) pre = true ; // filtered postscript
// if it is content-type text, add a <pre>
if ( p + 5 < bufEnd && pre ) {
memcpy ( p , "<pre>" , 5 );
p += 5;
if ( pre ) {//p + 5 < bufEnd && pre ) {
sb.safePrintf("<pre>");
//p += 5;
}
if ( st->m_strip == 1 )
@ -671,7 +679,7 @@ bool processLoop ( void *state ) {
(long)xd->m_version, st->m_strip );
// it returns -1 and sets g_errno on error, like OOM
if ( contentLen == -1 ) {
if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
return sendErrorReply ( st , g_errno );
}
@ -688,8 +696,8 @@ bool processLoop ( void *state ) {
if ( ! queryHighlighting ) {
memcpy ( p , content , contentLen );
p += contentLen ;
sb.safeMemcpy ( content , contentLen );
//p += contentLen ;
}
else {
// get the content as xhtml (should be NULL terminated)
@ -697,37 +705,38 @@ bool processLoop ( void *state ) {
if ( ! xml.set ( content , contentLen , false ,
0 , false , TITLEREC_CURRENT_VERSION ,
false , 0 ) ) { // niceness is 0
if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
return sendErrorReply ( st , g_errno );
}
if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0
if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
return sendErrorReply ( st , g_errno );
}
// sanity check
//if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; }
// how much space left in p?
avail = bufEnd - p;
//avail = bufEnd - p;
Matches m;
m.setQuery ( &qq );
m.addMatches ( &ww );
hilen = hi.set ( p , avail , &ww , &m ,
hilen = hi.set ( &sb , // p , avail ,
&ww , &m ,
false /*doStemming?*/ ,
st->m_clickAndScroll ,
thisUrl /*base url for click & scroll*/);
p += hilen;
//p += hilen;
log(LOG_DEBUG, "query: Done highlighting cached page content");
}
// if it is content-type text, add a </pre>
if ( p + 6 < bufEnd && pre ) {
memcpy ( p , "</pre>" , 6 );
p += 6;
if ( pre ) { // p + 6 < bufEnd && pre ) {
sb.safeMemcpy ( "</pre>" , 6 );
//p += 6;
}
// calculate bufLen
long bufLen = p - buf;
//long bufLen = p - buf;
long ct = xd->m_contentType;
@ -737,16 +746,19 @@ bool processLoop ( void *state ) {
if ( ct == CT_XML ) {
// encode the xml tags into &lt;tagname&gt; sequences
if ( !newbuf.htmlEncodeXmlTags ( buf , p-buf,0)){// niceness=0
if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
if ( !newbuf.htmlEncodeXmlTags ( sb.getBufStart() ,
sb.getLength(),
0)){// niceness=0
//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
return sendErrorReply ( st , g_errno );
}
// free out buffer that we alloc'd before returning since this
// should have copied it into another buffer
if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
// reassign
buf = newbuf.getBufStart();
bufLen = newbuf.length();
//buf = newbuf.getBufStart();
//bufLen = newbuf.length();
sb.stealBuf ( &newbuf );
}
// now encapsulate it in html head/tail and send it off
@ -763,14 +775,18 @@ bool processLoop ( void *state ) {
mdelete ( st , sizeof(State2) , "PageGet1" );
delete (st);
bool status = g_httpServer.sendDynamicPage (s,buf,bufLen,-1,false,
bool status = g_httpServer.sendDynamicPage (s,
//buf,bufLen,
sb.getBufStart(),
sb.getLength(),
-1,false,
contentType,
-1, NULL, "utf8" );
// free out buffer that we alloc'd before returning since this
// should have copied it into another buffer
if ( ct == CT_XML ) newbuf.purge();
else if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
//if ( ct == CT_XML ) newbuf.purge();
//else if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
// and convey the status
return status;

@ -1515,8 +1515,8 @@ bool printInlinkText ( SafeBuf &sb , Msg20Reply *mr , SearchInput *si ,
if ( ! si->m_doQueryHighlighting && ! si->m_xml ) continue;
char *str = k-> ptr_linkText;
long strLen = k->size_linkText;
char tt[1024*3];
char *ttend = tt + 1024*3;
//char tt[1024*3];
//char *ttend = tt + 1024*3;
char *frontTag =
"<font style=\"color:black;background-color:yellow\">" ;
char *backTag = "</font>";
@ -1525,8 +1525,9 @@ bool printInlinkText ( SafeBuf &sb , Msg20Reply *mr , SearchInput *si ,
backTag = "</b>";
}
Highlight hi;
long hlen = hi.set ( tt ,
ttend - tt ,
SafeBuf hb;
long hlen = hi.set ( &hb,//tt ,
//ttend - tt ,
str,
strLen ,
mr->m_language, // docLangId
@ -1573,7 +1574,10 @@ bool printInlinkText ( SafeBuf &sb , Msg20Reply *mr , SearchInput *si ,
// inc it
inlinkId++;
// encode it for xml
if ( !sb.htmlEncode ( tt,hlen,false)) return false;
if ( !sb.htmlEncode ( hb.getBufStart(),
hb.length(),
false))
return false;
sb.safePrintf("\"/>\n");
continue;
}
@ -1604,7 +1608,7 @@ bool printInlinkText ( SafeBuf &sb , Msg20Reply *mr , SearchInput *si ,
//k->ptr_urlBuf);
,si->m_cr->m_coll
,k->m_docId);
if ( ! sb.safeMemcpy(tt,hlen) ) return false;
if ( ! sb.safeMemcpy(&hb) ) return false;
long hostLen = 0;
char *host = getHostFast(k->ptr_urlBuf,&hostLen,NULL);
sb.safePrintf("</td><td>");
@ -1871,8 +1875,8 @@ static int printResult ( SafeBuf &sb,
long hlen;
//copy all summary and title excerpts for this result into here
char tt[1024*32];
char *ttend = tt + 1024*32;
//char tt[1024*32];
//char *ttend = tt + 1024*32;
char *frontTag =
"<font style=\"color:black;background-color:yellow\">" ;
char *backTag = "</font>";
@ -1882,9 +1886,11 @@ static int printResult ( SafeBuf &sb,
}
long cols = 80;
if ( si->m_xml ) sb.safePrintf("\t\t<title><![CDATA[");
SafeBuf hb;
if ( str && strLen && si->m_doQueryHighlighting ) {
hlen = hi.set ( tt ,
ttend - tt ,
hlen = hi.set ( &hb,
//tt ,
//ttend - tt ,
str,
strLen ,
mr->m_language, // docLangId
@ -1897,7 +1903,10 @@ static int printResult ( SafeBuf &sb,
0,
0 ); // niceness
//if (!sb.utf8Encode2(tt, hlen)) return false;
if ( ! sb.brify ( tt,hlen,0,cols) ) return false;
if ( ! sb.brify ( hb.getBufStart(),
hb.getLength(),
0,
cols) ) return false;
}
else if ( str && strLen ) {
// determine if TiTle wraps, if it does add a <br> count for

@ -2514,6 +2514,9 @@ bool SafeBuf::decodeJSON ( long niceness ) {
// . this should support the case when the src and dst buffers are the same!
// so decodeJSONToUtf8() function below will work
// . this is used by xmldoc.cpp to PARTIALLY decode a json buf so we do not
// index letters in escapes like \n \r \f \t \uxxxx \\ \/
// . SO we do keep \"
bool SafeBuf::safeDecodeJSONToUtf8 ( char *json, long jsonLen, long niceness) {
// how much space to reserve for the copy?
@ -2687,11 +2690,13 @@ bool SafeBuf::safeStrcpyPrettyJSON ( char *decodedJson ) {
*dst++ = '\\';
continue;
}
//if ( *src == '\/' ) {
// *dst++ = '\\';
// *dst++ = '/';
// continue;
//}
// mdw: why was this commented out?
// we converted '\/' above to a single / so we must undo here
if ( *src == '/' ) {
*dst++ = '\\';
*dst++ = '/';
continue;
}
*dst++ = *src;

@ -26441,11 +26441,13 @@ char *XmlDoc::getHighlightedSummary ( ) {
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
char tt[5000];
//char tt[5000];
Highlight hi;
SafeBuf hb;
// highlight the query in it
long hlen = hi.set ( tt ,
4999 ,
hi.set ( &hb,
//tt ,
//4999 ,
sum,
sumLen,
m_langId,
@ -26459,8 +26461,9 @@ char *XmlDoc::getHighlightedSummary ( ) {
m_niceness );
// store into our safebuf then
m_finalSummaryBuf.safeMemcpy ( tt , hlen + 1 );
m_finalSummaryBuf.safeMemcpy ( &hb );//tt , hlen + 1 );
m_finalSummaryBufValid = true;
m_finalSummaryBuf.nullTerm();
char *fsum = m_finalSummaryBuf.getBufStart();
if ( ! fsum ) fsum = (char *)0x01;

@ -2,12 +2,14 @@
# Tells us what hosts are participating in the distributed search engine.
# This is how many pieces you want the index split into.
# So if you have 64 machines, and you want a unique piece of index on
# each machine, then make this 64. But if you have 64 machines and you
# want one level of redundancy then make this 32.
# How many mirrors do you want? If this is 0 then your data
# will NOT be replicated. If it is 1 then each host listed
# below will have one host that mirrors it, thereby decreasing
# total index capacity, but increasing redundancy. If this is
# 1 then the first half of hosts will be replicated by the
# second half of the hosts listed below.
index-splits: 1
num-mirrors: 0