mirror of
				https://github.com/privacore/open-source-search-engine.git
				synced 2025-10-30 16:36:11 -04:00 
			
		
		
		
	
		
			
				
	
	
		
			183 lines
		
	
	
		
			4.7 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			183 lines
		
	
	
		
			4.7 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| #include "Highlight.h"
 | |
| #include "tokenizer.h"
 | |
| #include "Query.h"
 | |
| #include "Matches.h"
 | |
| #include "Xml.h"
 | |
| #include "Url.h"
 | |
| 
 | |
| #include "Phrases.h"
 | |
| 
 | |
// Built-in opening tags, one CSS class per slot, so that matches of different
// query words can be given different colors. The classes referenced here are
// the ones defined by the style sheet in this file.
// Both the characters and the pointers are const: the table is never
// modified at run time.
static const char * const s_frontTags[] = {
	"<span class='gbcnst00'>",
	"<span class='gbcnst01'>",
	"<span class='gbcnst02'>",
	"<span class='gbcnst03'>",
	"<span class='gbcnst04'>",
	"<span class='gbcnst05'>",
	"<span class='gbcnst06'>",
	"<span class='gbcnst07'>",
	"<span class='gbcnst08'>",
	"<span class='gbcnst09'>"
};
 | |
| 
 | |
// Inline style sheet defining the ten "gbcnstNN" span classes.
// The adjacent literals below are concatenated by the compiler into a single
// string that contains no whitespace between the rules.
static const char s_styleSheet[] =
	"<style type='text/css'>"
	// black text on a light background
	"span.gbcnst00{color:black;background-color:#ffff66}"
	"span.gbcnst01{color:black;background-color:#a0ffff}"
	"span.gbcnst02{color:black;background-color:#99ff99}"
	"span.gbcnst03{color:black;background-color:#ff9999}"
	"span.gbcnst04{color:black;background-color:#ff66ff}"
	// white text on a dark background
	"span.gbcnst05{color:white;background-color:#880000}"
	"span.gbcnst06{color:white;background-color:#00aa00}"
	"span.gbcnst07{color:white;background-color:#886800}"
	"span.gbcnst08{color:white;background-color:#004699}"
	"span.gbcnst09{color:white;background-color:#990099}"
	"</style>";
 | |
| 
 | |
| // . return length stored into "buf"
 | |
| // . content must be NULL terminated
 | |
| // . if "useAnchors" is true we do click and scroll
 | |
| // . if "isQueryTerms" is true, we do typical anchors in a special way
 | |
| int32_t Highlight::set( SafeBuf *sb, const char *content, int32_t contentLen, Query *q, const char *frontTag,
 | |
| 			const char *backTag ) {
 | |
| 	TokenizerResult tr;
 | |
| 	plain_tokenizer_phase_1(content, contentLen, &tr);
 | |
| 	calculate_tokens_hashes(&tr);
 | |
| 
 | |
| 	Bits bits;
 | |
| 	if ( !bits.set(&tr)) {
 | |
| 		return -1;
 | |
| 	}
 | |
| 
 | |
| 	Phrases phrases;
 | |
| 	if ( !phrases.set(tr,bits) ) {
 | |
| 		return -1;
 | |
| 	}
 | |
| 
 | |
| 	Matches matches;
 | |
| 	matches.setQuery ( q );
 | |
| 
 | |
| 	if ( !matches.addMatches( &tr, &phrases ) ) {
 | |
| 		return -1;
 | |
| 	}
 | |
| 
 | |
| 	// store
 | |
| 	m_numMatches = matches.getNumMatches();
 | |
| 
 | |
| 	return set ( sb, &tr, &matches, frontTag, backTag, q);
 | |
| }
 | |
| 
 | |
| // New version
 | |
| int32_t Highlight::set( SafeBuf *sb, const TokenizerResult *tr, const Matches *matches, const char *frontTag,
 | |
| 			const char *backTag, const Query *q ) {
 | |
| 	// save stuff
 | |
| 	m_frontTag    = frontTag;
 | |
| 	m_backTag     = backTag;
 | |
| 
 | |
| 	// set lengths of provided front/back highlight tags
 | |
| 	if ( m_frontTag ) {
 | |
| 		m_frontTagLen = strlen ( frontTag );
 | |
| 	}
 | |
| 	if ( m_backTag  ) {
 | |
| 		m_backTagLen  = strlen ( backTag  );
 | |
| 	}
 | |
| 
 | |
| 	m_sb = sb;
 | |
| 
 | |
| 	// label it
 | |
| 	m_sb->setLabel ("highw");
 | |
| 
 | |
| 	if ( ! highlightWords ( tr, matches, q ) ) {
 | |
| 		return -1;
 | |
| 	}
 | |
| 
 | |
| 	// null terminate
 | |
| 	m_sb->nullTerm();
 | |
| 
 | |
| 	// return the length
 | |
| 	return m_sb->length();
 | |
| }
 | |
| 
 | |
| bool Highlight::highlightWords ( const TokenizerResult *tr, const Matches *m, const Query *q ) {
 | |
| 	// set nexti to the word # of the first word that matches a query word
 | |
| 	int32_t nextm = -1;
 | |
| 	int32_t nexti = -1;
 | |
| 	if ( m->getNumMatches() > 0 ) {
 | |
| 		nextm = 0;
 | |
| 		nexti = m->getMatch(0).m_wordNum;
 | |
| 	}
 | |
| 
 | |
| 	int32_t backTagi = -1;
 | |
| 	bool inTitle  = false;
 | |
| 
 | |
| 	for ( int32_t i = 0 ; (size_t)i < tr->size(); i++ ) {
 | |
| 		const auto &token = (*tr)[i];
 | |
| 		// set word's info
 | |
| 		bool endHead = false;
 | |
| 		bool endHtml = false;
 | |
| 
 | |
| 		nodeid_t base_nodeid = token.nodeid&BACKBITCOMP;
 | |
| 		if ( base_nodeid == TAG_TITLE ) {
 | |
| 			inTitle = (token.nodeid&BACKBIT)==0;
 | |
| 		} else if ( base_nodeid == TAG_HTML ) {
 | |
| 			endHtml = (token.nodeid&BACKBIT)!=0;
 | |
| 		} else if ( base_nodeid == TAG_HEAD ) {
 | |
| 			endHead = (token.nodeid&BACKBIT)!=0;
 | |
| 		}
 | |
| 
 | |
| 		// This word is a match...see if we're gonna tag it
 | |
| 		// dont put the same tags around consecutive matches
 | |
| 		if ( i == nexti && ! inTitle && ! endHead && ! endHtml) {
 | |
| 			// get the match class for the match
 | |
| 			const Match *mat = &m->getMatch(nextm);
 | |
| 			// discontinue any current font tag we are in
 | |
| 			if ( i < backTagi ) {
 | |
| 				// push backtag ahead if needed
 | |
| 				if ( i + mat->m_numWords > backTagi ) {
 | |
| 					backTagi = i + mat->m_numWords;
 | |
| 				}
 | |
| 			}
 | |
| 			else {
 | |
| 				// now each match is the entire quote, so write the front tag right now
 | |
| 				if ( m_frontTag ) {
 | |
| 					m_sb->safeStrcpy ( m_frontTag );
 | |
| 				} else {
 | |
| 					m_sb->safeStrcpy( s_frontTags[(mat->m_qwordNum % 10)] );
 | |
| 				}
 | |
| 
 | |
| 				// when to write the back tag? add the number of
 | |
| 				// words in the match to i.
 | |
| 				backTagi = i + mat->m_numWords;
 | |
| 			}
 | |
| 		} else if ( endHead ) {
 | |
| 			// include the tags style sheet immediately before the closing </TITLE> tag
 | |
| 			m_sb->safeStrcpy( s_styleSheet );
 | |
| 		}
 | |
| 
 | |
| 		if ( i == nexti ) {
 | |
| 			// advance match
 | |
| 			nextm++;
 | |
| 			// set nexti to the word # of the next match
 | |
| 			if ( nextm < m->getNumMatches() ) 
 | |
| 				nexti = m->getMatch(nextm).m_wordNum;
 | |
| 			else
 | |
| 				nexti = -1;
 | |
| 		}
 | |
| 
 | |
| 		m_sb->safeMemcpy ( token.token_start, token.token_len );
 | |
| 
 | |
| 		// back tag
 | |
| 		if ( i == backTagi-1 ) {
 | |
| 			// store the back tag
 | |
| 			if ( m_backTag ) {
 | |
| 				m_sb->safeMemcpy( m_backTag, m_backTagLen );
 | |
| 			} else {
 | |
| 				m_sb->safeStrcpy("</span>");
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return true;
 | |
| }
 |