1176 lines
		
	
	
		
			32 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			1176 lines
		
	
	
		
			32 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| #include "gb-include.h"
 | |
| 
 | |
| #include "SearchInput.h"
 | |
| #include "Parms.h"         // g_parms
 | |
| //#include "CollectionRec.h" // cr
 | |
| #include "Pages.h"         // g_msg
 | |
| #include "LanguageIdentifier.h"
 | |
| #include "CountryCode.h"
 | |
| #include "geo_ip_table.h"
 | |
| #include "Users.h"
 | |
| #include "Address.h" // getLatLonFromUserInput
 | |
| #include "Timedb.h"
 | |
| #include "PageResults.h"
 | |
| 
 | |
| //char getFormatFromRequest ( class HttpRequest *hr ) ;
 | |
| 
 | |
| SearchInput::SearchInput() {
 | |
| 	reset();
 | |
| }
 | |
| SearchInput::~SearchInput() {
 | |
| 	reset();
 | |
| }
 | |
| void SearchInput::reset ( ) {
 | |
| }
 | |
| 
 | |
| //void SearchInput::setToDefaults ( CollectionRec *cr , int32_t niceness ) {
 | |
| void SearchInput::clear ( int32_t niceness ) {
 | |
| 	// reset it first
 | |
| 	reset();
 | |
| 	// set all to 0 just to avoid any inconsistencies
 | |
| 	int32_t size = (char *)&m_END_TEST - (char *)&m_START;
 | |
| 	memset ( &m_START , 0x00 , size );
 | |
| 	m_sbuf1.reset();
 | |
| 	m_sbuf2.reset();
 | |
| 	m_sbuf3.reset();
 | |
| 
 | |
| 	// set these
 | |
| 	m_numLinesInSummary  = 2;
 | |
| 	m_docsWanted         = 10;
 | |
| 	m_boolFlag           = 2;
 | |
| 	m_maxQueryTerms      = 1000;
 | |
| 	m_niceness           = niceness;
 | |
| 
 | |
| 	//m_defaultSortLanguageLen = 0;
 | |
| }
 | |
| 
 | |
| 
 | |
| // . make a key for caching the search results page based on this input
 | |
| // . do not use all vars, like the m_*ToDisplay should not be included
 | |
| key_t SearchInput::makeKey ( ) {
 | |
| 	// hash the query
 | |
| 	int32_t       n       = m_q.getNumTerms  ();
 | |
| 	//int64_t *termIds = m_q.getTermIds   ();
 | |
| 	//char      *signs   = m_q.getTermSigns ();
 | |
| 	key_t k;
 | |
| 	k.n1 = 0;
 | |
| 	//k.n0 = hash64 ( (char *)termIds , n * sizeof(int64_t) );
 | |
| 	//k.n0 = hash64 ( (char *)signs   , n , k.n0 );
 | |
| 	// user defined weights, for weighting each query term separately
 | |
| 	for ( int32_t i = 0 ; i < n ; i++ ) {
 | |
| 		k.n0 = hash64 ((char *)&m_q.m_qterms[i].m_termId    ,4, k.n0);
 | |
| 		k.n0 = hash64 ((char *)&m_q.m_qterms[i].m_termSign  ,1, k.n0);
 | |
| 		k.n0 = hash64 ((char *)&m_q.m_qterms[i].m_userWeight,4, k.n0);
 | |
| 		k.n0 = hash64 ((char *)&m_q.m_qterms[i].m_userType  ,1, k.n0);
 | |
| 	}
 | |
| 	// space separated, NULL terminated, list of meta tag names to display
 | |
| 	if ( m_displayMetas          ) 
 | |
| 		k.n0 = hash64b ( m_displayMetas          , k.n0 );
 | |
| 	// name of collection in external cluster to get titleRecs for 
 | |
| 	// related pages from
 | |
| 	//if ( m_rp_getExternalPages && m_rp_externalColl )
 | |
| 	//	k.n0 = hash64b ( m_rp_externalColl , k.n0 );
 | |
| 	// collection e import from
 | |
| 	//if ( m_importColl )
 | |
| 	//	k.n0 = hash64b ( m_importColl , k.n0 );
 | |
| 	// the special query parm
 | |
| 	//if ( m_sq && m_sqLen > 0 )
 | |
| 	//	k.n0 = hash64 ( m_sq , m_sqLen , k.n0 );
 | |
| 	//if ( m_noDocIds && m_noDocIdsLen )
 | |
| 	//	k.n0 = hash64 ( m_noDocIds , m_noDocIdsLen , k.n0 );
 | |
| 	//if ( m_noSiteIds && m_noSiteIdsLen )
 | |
| 	//	k.n0 = hash64 ( m_noSiteIds , m_noSiteIdsLen , k.n0 );
 | |
| 
 | |
| 	// no need to hash these again separately, they are in between 
 | |
| 	// m_START and m_END_HASH
 | |
| 	// language
 | |
| 	//if ( m_language )
 | |
| 	//	k.n0 = hash64 ( m_language , k.n0 );
 | |
| 	//if ( m_gblang )
 | |
| 	//	k.n0 = hash64 ( m_gblang , k.n0 );
 | |
| 	// . now include the hash of the search parameters
 | |
| 	// . nnot including m_docsToScanForTopics since since we got TopicGroups
 | |
| 	char *a = ((char *)&m_START) + 4 ; // msg40->m_dpf;
 | |
| 	char *b =  (char *)&m_END_HASH   ; // msg40->m_topicGroups;
 | |
| 	int32_t size = b - a; 
 | |
| 	// push and flush some parms that should not contribute
 | |
| 	//int32_t save1 = m_refs_numToDisplay;
 | |
| 	//int32_t save2 = m_rp_numToDisplay;
 | |
| 	//int32_t save3 = m_numTopicsToDisplay;
 | |
| 	//m_refs_numToDisplay  = 0;
 | |
| 	//m_rp_numToDisplay    = 0;
 | |
| 	//m_numTopicsToDisplay = 0;
 | |
| 	// and hash it all up
 | |
| 	k.n0 = hash64 ( a , size , k.n0 );
 | |
| 	// and pop out the parms that did not contribute
 | |
| 	//m_refs_numToDisplay  = save1;
 | |
| 	//m_rp_numToDisplay    = save2;
 | |
| 	//m_numTopicsToDisplay = save3;
 | |
| 	// hash each topic group
 | |
| 	for ( int32_t i = 0 ; i < m_numTopicGroups ; i++ ) {
 | |
| 		TopicGroup *t = &m_topicGroups[i];
 | |
| 		//k.n0 = hash64 ( t->m_numTopics           , k.n0 );
 | |
| 		k.n0 = hash64 ( t->m_maxTopics           , k.n0 );
 | |
| 		k.n0 = hash64 ( t->m_docsToScanForTopics , k.n0 );
 | |
| 		k.n0 = hash64 ( t->m_minTopicScore       , k.n0 );
 | |
| 		k.n0 = hash64 ( t->m_maxWordsPerTopic    , k.n0 );
 | |
| 		k.n0 = hash64b( t->m_meta                , k.n0 );
 | |
| 		k.n0 = hash64 ( t->m_delimeter           , k.n0 );
 | |
| 		k.n0 = hash64 ( t->m_useIdfForTopics     , k.n0 );
 | |
| 		k.n0 = hash64 ( t->m_dedup               , k.n0 );
 | |
| 	}
 | |
| 	// . boolean queries have operators (AND OR NOT ( ) ) that we need
 | |
| 	//   to consider in this hash as well. so
 | |
| 	// . so just hash the whole damn query
 | |
| 	if ( m_q.m_isBoolean ) {
 | |
| 		char *q    = m_q.getQuery();
 | |
| 		int32_t  qlen = m_q.getQueryLen();
 | |
| 		k.n0 = hash64 ( q , qlen , k.n0 );
 | |
| 	}
 | |
| 
 | |
| 	// Language stuff
 | |
| 	//k.n0 = hash64(m_defaultSortLanguage, m_defaultSortLanguageLen, k.n0);
 | |
| 	//k.n0 = hash64(m_defaultSortCountry , m_defaultSortCountryLen , k.n0);
 | |
| 
 | |
| 	// debug
 | |
| 	//logf(LOG_DEBUG,"query: q=%s k.n0=%"UINT64"",m_q.getQuery(),k.n0);
 | |
| 
 | |
| 	//Msg1aParms* m1p = msg40->getReferenceParms();
 | |
| 	//if( m1p ) {
 | |
| 	//	k.n0=hash64(((char*)m1p)+sizeof(int32_t), 
 | |
| 	//		    sizeof(Msg1aParms)-8,k.n0);
 | |
| 	//}
 | |
| 	return k;
 | |
| }
 | |
| 
 | |
| void SearchInput::test ( ) {
 | |
| 	// set all to 0 just to avoid any inconsistencies
 | |
| 	char *a = ((char *)&m_START) + 4 ; // msg40->m_dpf;
 | |
| 	char *b =  (char *)&m_END_TEST;
 | |
| 	int32_t size = b - a;
 | |
| 	memset ( a , 0x00 , size );
 | |
| 	// loop through all possible cgi parms to set SearchInput
 | |
| 	for ( int32_t i = 0 ; i < g_parms.m_numSearchParms ; i++ ) {
 | |
| 		Parm *m = g_parms.m_searchParms[i];
 | |
| 		char *x = (char *)this + m->m_off;
 | |
| 		if ( m->m_type != TYPE_BOOL ) *(int32_t *)x = 0xffffffff;
 | |
| 		else                          *(char *)x = 0xff;
 | |
| 	}
 | |
| 	// ensure we're all zeros now!
 | |
| 	int32_t fix = a - (char *)this;
 | |
| 	unsigned char *p = (unsigned char *)a;
 | |
| 	for ( int32_t i = 0 ; i < size ; i++ ) {
 | |
| 		if ( p[i] == 0xff ) continue;
 | |
| 		// find it
 | |
| 		int32_t off = i + fix;
 | |
| 		char *name = NULL; // "unknown";
 | |
| 		for ( int32_t k = 0 ; k < g_parms.m_numSearchParms ; k++ ) {
 | |
| 			Parm *m = g_parms.m_searchParms[k];
 | |
| 			if ( m->m_off != off ) continue;
 | |
| 			name = m->m_title;
 | |
| 			break;
 | |
| 		}
 | |
| 		if ( ! name ) continue;
 | |
| 		log("query: Got uncovered SearchInput parm at offset "
 | |
| 		    "%" INT32 " in SearchInput. name=%s.",off,name);
 | |
| 	}
 | |
| }
 | |
| 
 | |
| void SearchInput::copy ( class SearchInput *si ) {
 | |
| 	gbmemcpy ( (char *)this , (char *)si , sizeof(SearchInput) );
 | |
| }
 | |
| 
 | |
| class SearchInput *g_si = NULL;
 | |
| 
 | |
| bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
 | |
| 
 | |
| 	// store list of collection #'s to search here. usually just one.
 | |
| 	m_collnumBuf.reset();
 | |
| 
 | |
| 	m_q.reset();
 | |
| 
 | |
| 	// zero out everything, set niceness to 0
 | |
| 	clear ( 0 ) ;
 | |
| 
 | |
| 	// save it now
 | |
| 	m_sock = sock;
 | |
| 
 | |
| 	// still his buffer. m_hr will free the stuff, but "r" can
 | |
| 	// still access it for the time being, and not free it
 | |
| 	m_hr.stealBuf ( r );
 | |
| 
 | |
| 	char *coll = g_collectiondb.getDefaultColl ( r );
 | |
| 
 | |
| 	//////
 | |
| 	//
 | |
| 	// build "m_collnumBuf" to consist of all the collnums we should
 | |
| 	// be searching.
 | |
| 	//
 | |
| 	///////
 | |
| 
 | |
| 	m_firstCollnum = -1;
 | |
| 	// set this to the collrec of the first valid collnum we encounter
 | |
| 	CollectionRec *cr = NULL;
 | |
| 	// now convert list of space-separated coll names into list of collnums
 | |
| 	char *p = r->getString("c",NULL);
 | |
| 	// if no collection list was specified look for "token=" and
 | |
| 	// use those to make collections. hack for diffbot.
 | |
| 	char *token = r->getString("token",NULL);
 | |
| 	// find all collections under this token
 | |
| 	for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
 | |
| 		// must not have a "&c="
 | |
| 		if ( p ) break;
 | |
| 		// must have a "&token="
 | |
| 		if ( ! token ) break;
 | |
| 		// skip if empty
 | |
| 		CollectionRec *tmpcr = g_collectiondb.m_recs[i];
 | |
| 		if ( ! tmpcr ) continue;
 | |
| 		// skip if does not match token
 | |
| 		if ( strcmp(token,tmpcr->m_diffbotToken.getBufStart()) ) 
 | |
| 			continue;
 | |
| 		// . we got a match
 | |
| 		// . set initial junk
 | |
| 		if ( ! cr ) {
 | |
| 			cr = tmpcr;
 | |
| 			m_firstCollnum = tmpcr->m_collnum;
 | |
| 		}
 | |
| 		// save the collection #
 | |
| 		if ( ! m_collnumBuf.safeMemcpy ( &tmpcr->m_collnum, 
 | |
| 						 sizeof(collnum_t) ) )
 | |
| 			return false;
 | |
| 	}
 | |
| 
 | |
| 	// if we had a "&c=..." in the GET request process that
 | |
| 	if ( p ) {
 | |
| 	loop:
 | |
| 		char *end = p;
 | |
| 		for ( ; *end && ! is_wspace_a(*end) ; end++ );
 | |
| 		// temp null
 | |
| 		char c = *end;
 | |
| 		*end = '\0';
 | |
| 		CollectionRec *tmpcr = g_collectiondb.getRec ( p );
 | |
| 		// set defaults from the FIRST one
 | |
| 		if ( tmpcr && ! cr ) {
 | |
| 			cr = tmpcr;
 | |
| 			m_firstCollnum = tmpcr->m_collnum;
 | |
| 		}
 | |
| 		if ( ! tmpcr ) { 
 | |
| 			g_errno = ENOCOLLREC;
 | |
| 			log("query: missing collection %s",p);
 | |
| 			g_msg = " (error: no such collection)";		
 | |
| 			return false;
 | |
| 		}
 | |
| 		// add to our list
 | |
| 		if (!m_collnumBuf.safeMemcpy(&tmpcr->m_collnum,
 | |
| 					     sizeof(collnum_t)))
 | |
| 			return false;
 | |
| 		// restore the \0 character we wrote in there
 | |
| 		*end = c;
 | |
| 		// advance
 | |
| 		p = end;
 | |
| 		// skip to next collection name if there is one
 | |
| 		while ( *p && is_wspace_a(*p) ) p++; 
 | |
| 		// now add it's collection # to m_collnumBuf if there
 | |
| 		if ( *p ) goto loop;
 | |
| 	}
 | |
| 
 | |
| 	// use default collection if none provided
 | |
| 	if ( ! p && ! token && m_collnumBuf.length() <= 0 ) {
 | |
| 		// get default collection rec
 | |
| 		cr = g_collectiondb.getRec (coll);
 | |
| 		// add to our list
 | |
| 		if ( cr &&
 | |
| 		     !m_collnumBuf.safeMemcpy(&cr->m_collnum,
 | |
| 					      sizeof(collnum_t)))
 | |
| 			return false;
 | |
| 	}
 | |
| 		
 | |
| 
 | |
| 
 | |
| 	/////
 | |
| 	//
 | |
| 	// END BUILDING m_collnumBuf
 | |
| 	//
 | |
| 	/////
 | |
| 
 | |
| 
 | |
| 	// save the collrec
 | |
| 	m_cr = cr;
 | |
| 
 | |
| 	// must have had one
 | |
| 	if ( ! cr ) {
 | |
| 		log("si: si. collection does not exist");
 | |
| 		// if we comment the below out then it cores in setToDefault!
 | |
| 		g_errno = ENOCOLLREC;
 | |
| 		return false;
 | |
| 	}
 | |
| 
 | |
| 	// and set from the http request. will set m_coll, etc.
 | |
| 	g_parms.setToDefault ( (char *)this , OBJ_SI , cr );
 | |
| 
 | |
| 
 | |
| 	///////
 | |
| 	//
 | |
| 	// set defaults of some things based on format language
 | |
| 	//
 | |
| 	//////
 | |
| 
 | |
| 	// get the format. "xml" "html" "json" --> FORMAT_HTML, FORMAT_CSV ...
 | |
| 	char tmpFormat = m_hr.getReplyFormat();//getFormatFromRequest ( &m_hr);
 | |
| 	// now override automatic defaults for special cases
 | |
| 	if ( tmpFormat != FORMAT_HTML ) {
 | |
| 		m_familyFilter            = 0;
 | |
| 		m_numTopicsToDisplay      = 0;
 | |
| 		m_doQueryHighlighting     = 0;
 | |
| 		//m_spellCheck              = 0;
 | |
| 		m_getDocIdScoringInfo = false;
 | |
| 		// turn gigabits off by default if not html
 | |
| 		//m_docsToScanForTopics = 0;
 | |
| 	}
 | |
| 
 | |
| 	// if they have a list of sites...
 | |
| 	if ( m_sites && m_sites[0] ) {
 | |
| 		m_doSiteClustering        = false;
 | |
| 		m_ipRestrictForTopics     = false;
 | |
| 	}
 | |
| 
 | |
| 
 | |
| 	
 | |
| 
 | |
| 
 | |
| 	// and set from the http request. will set m_coll, etc.
 | |
| 	g_parms.setFromRequest ( &m_hr , sock , cr , (char *)this , OBJ_SI );
 | |
| 
 | |
| 
 | |
| 	if ( m_streamResults &&
 | |
| 	     tmpFormat != FORMAT_XML &&
 | |
| 	     tmpFormat != FORMAT_CSV &&
 | |
| 	     tmpFormat != FORMAT_JSON ) {
 | |
| 		log("si: streamResults only supported for "
 | |
| 		    "xml/csv/json. disabling");
 | |
| 		m_streamResults = false;
 | |
| 	}
 | |
| 
 | |
| 	m_coll = coll;
 | |
| 
 | |
| 	// it sets m_formatStr above, but we gotta set this...
 | |
| 	m_format = tmpFormat;
 | |
| 
 | |
| 
 | |
| 	//////
 | |
| 	//
 | |
| 	// fix some parms
 | |
| 	//
 | |
| 	//////
 | |
| 
 | |
| 	// set m_isMasterAdmin to zero if no correct ip or password
 | |
| 	if ( ! g_conf.isMasterAdmin ( sock , &m_hr ) ) m_isMasterAdmin = 0;
 | |
| 
 | |
| 	// collection admin?
 | |
| 	m_isCollAdmin = g_conf.isCollAdmin ( sock , &m_hr );
 | |
| 
 | |
| 	//////////////////////////////////////
 | |
| 	//
 | |
| 	// transform input into classes
 | |
| 	//
 | |
| 	//////////////////////////////////////
 | |
| 
 | |
| 	// allow for "qlang" if still don't have it
 | |
| 	//int32_t gglen2;
 | |
| 	//char *gg2 = r->getString ( "qlang" , &gglen2 , NULL );
 | |
| 	//if ( m_gblang == 0 && gg2 && gglen2 > 1 )
 | |
| 	//	m_gblang = getLanguageFromAbbr(gg2);
 | |
| 	
 | |
| 	// fix query by removing lang:xx from ask.com queries
 | |
| 	//char *end = m_query + m_queryLen -8;
 | |
| 	//if ( m_queryLen > 8 && m_query && end > m_query && 
 | |
| 	//     strncmp(end," lang:",6)==0 ) {
 | |
| 	//	char *asklang = m_query+m_queryLen - 2;
 | |
| 	//	m_gblang = getLanguageFromAbbr(asklang);
 | |
| 	//	m_queryLen -= 8;
 | |
| 	//	m_query[m_queryLen] = 0;
 | |
| 	//	
 | |
| 	//}
 | |
| 
 | |
| 	// . returns false and sets g_errno on error
 | |
| 	// . sets m_qbuf1 and m_qbuf2
 | |
| 	// . sets:
 | |
| 	//   m_sbuf1
 | |
| 	//   m_sbuf2
 | |
| 	//   m_sbuf3
 | |
| 	//   m_displayQuery
 | |
| 	//   m_qe (encoded query)
 | |
| 	//   m_rtl (right to left like hebrew)
 | |
| 	//   m_highlightQuery
 | |
| 	if ( ! setQueryBuffers (r) )
 | |
| 		return log("query: setQueryBuffers: %s",mstrerror(g_errno));
 | |
| 
 | |
| 	/* --- Virtual host language detection --- */
 | |
| 	/*
 | |
| 	if(r->getHost()) {
 | |
| 		bool langset = getLanguageFromAbbr(m_defaultSortLanguage);
 | |
| 		char *cp;
 | |
| 		if(!langset && (cp = strrchr(r->getHost(), '.'))) {
 | |
| 			uint8_t lang = getLanguageFromUserAgent(++cp);
 | |
| 			if(lang) {
 | |
| 				// char langbuf[128];
 | |
| 		// sprintf(langbuf, "qlang=%s\0", getLanguageAbbr(lang));
 | |
| 			//m_defaultSortLanguage = getLanguageAbbr(lang);
 | |
|                                 char *tmp = getLanguageAbbr(lang);
 | |
|                                 strncpy(m_defaultSortLanguage, tmp, 6);
 | |
| 				// log(LOG_INFO,
 | |
| 			//	getLanguageString(lang), r->getHost(), this);
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 	*/
 | |
| 	/* --- End Virtual host language detection --- */
 | |
| 
 | |
| 
 | |
| 	//char *qs1 = m_defaultSortLanguage;
 | |
| 
 | |
| 	// this overrides though
 | |
| 	//int32_t qlen2;
 | |
| 	//char *qs2 = r->getString ("qlang",&qlen2,NULL);
 | |
| 	//if ( qs2 ) qs1 = qs2;
 | |
| 	
 | |
| 	//m_queryLang = getLanguageFromAbbr ( qs1 );
 | |
| 
 | |
| 	//m_queryLang = detectQueryLanguage();
 | |
| 
 | |
| 	//char *qs1 = getLangAbbr(m_queryLang);
 | |
| 
 | |
| 	// this parm is in Parms.cpp and should be set
 | |
| 	char *langAbbr = m_defaultSortLang;
 | |
| 
 | |
| 	// Parms.cpp sets it to an empty string, so make that null
 | |
| 	// if Parms.cpp set it to NULL it seems it comes out as "(null)"
 | |
| 	// i guess because we sprintf it or something.
 | |
| 	if ( langAbbr && langAbbr[0] == '\0' )
 | |
| 		langAbbr = NULL;
 | |
| 
 | |
| 	// if &qlang was not given explicitly fall back to coll rec
 | |
| 	if ( cr && ! langAbbr )
 | |
| 		langAbbr = cr->m_defaultSortLanguage2;
 | |
| 
 | |
| 	// if no coll rec use language unknown
 | |
| 	if ( ! langAbbr )
 | |
| 		langAbbr = "xx";
 | |
| 
 | |
| 	log(LOG_INFO,"query: using default lang of %s", langAbbr );
 | |
| 
 | |
| 	// get code
 | |
| 	m_queryLangId = getLangIdFromAbbr ( langAbbr );
 | |
| 
 | |
| 	// allow for 'xx', which means langUnknown
 | |
| 	if ( m_queryLangId == langUnknown &&
 | |
| 	     langAbbr &&
 | |
| 	     langAbbr[0] &&
 | |
| 	     langAbbr[0]!='x' )
 | |
| 		log("query: qlang of \"%s\" is NOT SUPPORTED. using "
 | |
| 		    "langUnknown, \"xx\".",langAbbr);
 | |
| 
 | |
| 	int32_t maxQueryTerms = cr->m_maxQueryTerms;
 | |
| 
 | |
| 	// . the query to use for highlighting... can be overridden with "hq"
 | |
| 	// . we need the language id for doing synonyms
 | |
| 	if ( m_prepend && m_prepend[0] )
 | |
| 		m_hqq.set2 ( m_prepend , m_queryLangId , true ,maxQueryTerms);
 | |
| 	else if ( m_highlightQuery && m_highlightQuery[0] )
 | |
| 		m_hqq.set2 (m_highlightQuery,m_queryLangId,true,maxQueryTerms);
 | |
| 	else if ( m_query && m_query[0] )
 | |
| 		m_hqq.set2 ( m_query , m_queryLangId , true,maxQueryTerms);
 | |
| 
 | |
| 	// log it here
 | |
| 	log(LOG_INFO,
 | |
| 	    "query: got query %s (len=%i)"
 | |
| 	    ,m_sbuf1.getBufStart()
 | |
| 	    ,m_sbuf1.length());
 | |
| 
 | |
| 	// . now set from m_qbuf1, the advanced/composite query buffer
 | |
| 	// . returns false and sets g_errno on error (ETOOMANYOPERANDS)
 | |
| 	if ( ! m_q.set2 ( m_sbuf1.getBufStart(), 
 | |
| 			  m_queryLangId , 
 | |
| 			  m_queryExpansion ,
 | |
| 			  true , // use QUERY stopwords?
 | |
| 			  maxQueryTerms ) ) {
 | |
| 		g_msg = " (error: query has too many operands)";
 | |
| 		return false;
 | |
| 	}
 | |
| 
 | |
| 	m_q.m_containingParent = (void *)this;
 | |
| 
 | |
| 	if ( m_q.m_truncated && m_q.m_isBoolean ) {
 | |
| 		g_errno = EQUERYTOOBIG;
 | |
| 		g_msg = " (error: query is too long)";
 | |
| 		return false;
 | |
| 	}
 | |
| 
 | |
| 
 | |
| 	if ( m_hideAllClustered )
 | |
| 		m_doSiteClustering = true;
 | |
| 
 | |
| 	// turn off some parms
 | |
| 	if ( m_q.m_hasUrlField  ) 
 | |
| 		m_ipRestrictForTopics = false;
 | |
| 	if ( m_q.m_hasIpField   )
 | |
| 		m_ipRestrictForTopics = false;
 | |
| 	if ( m_q.m_hasPositiveSiteField ) {
 | |
| 		m_ipRestrictForTopics = false;
 | |
| 		m_doSiteClustering    = false;
 | |
| 	}
 | |
| 
 | |
| 	if ( cr && ! cr->m_ipRestrict )
 | |
| 		m_ipRestrictForTopics = false;
 | |
| 
 | |
| 	if ( m_q.m_hasQuotaField ) {
 | |
| 		m_doSiteClustering    = false;
 | |
| 		m_doDupContentRemoval = false;
 | |
| 	}
 | |
| 
 | |
| 	if ( ! m_doSiteClustering )
 | |
| 		m_hideAllClustered = false;
 | |
| 
 | |
| 	// sanity check
 | |
| 	if(m_firstResultNum < 0) m_firstResultNum = 0;
 | |
| 
 | |
| 	// DEBUG: temp hack
 | |
| 	// static bool first = true;
 | |
| 	//  if ( first ) { 
 | |
| 	//  	first = false;
 | |
| 	//  	m_firstResultNum = 10;
 | |
| 	//  }
 | |
| 
 | |
| 
 | |
| 	// . if query has url: or site: term do NOT use cache by def.
 | |
| 	// . however, if spider is off then use the cache by default
 | |
| 	if ( m_useCache == -1 && g_conf.m_spideringEnabled ) {
 | |
| 		if      ( m_q.m_hasPositiveSiteField ) m_useCache = 0;
 | |
| 		else if ( m_q.m_hasIpField   ) m_useCache = 0;
 | |
| 		else if ( m_q.m_hasUrlField  ) m_useCache = 0;
 | |
| 		else if ( m_sites && m_sites[0] ) m_useCache = 0;
 | |
| 		//else if ( m_whiteListBuf.length() ) m_useCache = 0;
 | |
| 		else if ( m_url && m_url[0]   ) m_useCache = 0;
 | |
| 	}
 | |
| 
 | |
| 	// if useCache is still -1 then turn it on
 | |
| 	if ( m_useCache == -1 ) m_useCache = 1;
 | |
| 
 | |
| 	// never use cache if doing a rerank (msg3b)
 | |
| 	//if ( m_rerankRuleset >= 0 ) m_useCache = 0;
 | |
| 	bool readFromCache = false;
 | |
| 	if ( m_useCache ==  1  ) readFromCache = true;
 | |
| 	if ( m_rcache   ==  0  ) readFromCache = false;
 | |
| 	if ( m_useCache ==  0  ) readFromCache = false;
 | |
| 	// if useCache is false, don't write to cache if it was not specified
 | |
| 	if ( m_wcache == -1 ) {
 | |
| 		if ( m_useCache ==  0 ) m_wcache = 0;
 | |
| 		else                    m_wcache = 1;
 | |
| 	}
 | |
| 	// save it
 | |
| 	m_rcache = readFromCache;
 | |
| 
 | |
| 
 | |
| 	//
 | |
| 	// TODO: use Parms.cpp defaults
 | |
| 	//
 | |
| 	TopicGroup *tg = &m_topicGroups[0];
 | |
| 
 | |
| 	//
 | |
| 	//
 | |
| 	// gigabits
 | |
| 	//
 | |
| 	//
 | |
| 	tg->m_numTopics = 50;
 | |
| 	tg->m_maxTopics = 50;
 | |
| 	tg->m_docsToScanForTopics = m_docsToScanForTopics;
 | |
| 	tg->m_minTopicScore = 0;
 | |
| 	tg->m_maxWordsPerTopic = 6;
 | |
| 	tg->m_meta[0] = '\0';
 | |
| 	tg->m_delimeter = '\0';
 | |
| 	tg->m_useIdfForTopics = false;
 | |
| 	tg->m_dedup = true;
 | |
| 	// need to be on at least 2 pages!
 | |
| 	tg->m_minDocCount = 2;
 | |
| 	tg->m_ipRestrict = m_ipRestrictForTopics;
 | |
| 	tg->m_dedupSamplePercent = 80;
 | |
| 	tg->m_topicRemoveOverlaps = true;
 | |
| 	tg->m_topicSampleSize = 4096;
 | |
| 	// max sequential punct chars allowedin a topic
 | |
| 	tg->m_topicMaxPunctLen = 1;
 | |
| 	m_numTopicGroups = 1;
 | |
| 
 | |
| 
 | |
| 	return true;
 | |
| }
 | |
| 
 | |
// . sets m_sbuf1 and m_sbuf2 (formerly m_qbuf1[] and m_qbuf2[])
// . m_sbuf1 is the advanced query
// . m_sbuf2 is the query to be used for spell checking
// . returns false and sets g_errno on error
 | |
| bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
 | |
| 
 | |
| 	m_sbuf1.reset();
 | |
| 	m_sbuf2.reset();
 | |
| 	m_sbuf3.reset();
 | |
| 
 | |
| 	int16_t qcs = csUTF8;
 | |
| 	if (m_queryCharset && m_queryCharset[0]){
 | |
| 		// we need to convert the query string to utf-8
 | |
| 		int32_t qclen = gbstrlen(m_queryCharset);
 | |
| 		qcs = get_iana_charset(m_queryCharset, qclen );
 | |
| 		if (qcs == csUnknown) {
 | |
| 			//g_errno = EBADCHARSET;
 | |
| 			//g_msg = "(error: unknown query charset)";
 | |
| 			//return false;
 | |
| 			qcs = csUTF8;
 | |
| 		}
 | |
| 	}
 | |
| 	// prepend sites terms
 | |
| 	int32_t numSites = 0;
 | |
| 	char *csStr = NULL;
 | |
| 	numSites = 0;
 | |
| 	csStr = get_charset_str(qcs);
 | |
| 
 | |
| 	/*
 | |
| 	if ( m_sites && m_sites[0] ) {
 | |
| 		char *s = m_sites;
 | |
| 		char *t;
 | |
| 		int32_t  len;
 | |
| 		m_sbuf1.pushChar('(');// *p++ = '(';
 | |
| 	loop:
 | |
| 		// skip white space
 | |
| 		while ( *s && ! is_alnum_a(*s) ) s++;
 | |
| 		// bail if done
 | |
| 		if ( ! *s ) goto done;
 | |
| 		// get length of it
 | |
| 		t = s;
 | |
| 		while ( *t && ! is_wspace_a(*t) ) t++;
 | |
| 		len = t - s;
 | |
| 		// add site: term
 | |
| 		//if ( p + 12 + len >= pend ) goto toobig;
 | |
| 		if ( numSites > 0 ) m_sbuf1.safeStrcpy ( " UOR " );
 | |
| 		m_sbuf1.safeStrcpy ( "site:" );
 | |
| 		//p += ucToUtf8(p, pend-p,s, len, csStr, 0,0);
 | |
| 		m_sbuf1.safeMemcpy ( s , len );
 | |
| 		//gbmemcpy ( p , s , len     ); p += len;
 | |
| 		// *p++ = ' ';
 | |
| 		m_sbuf1.pushChar(' ');
 | |
| 		s = t;
 | |
| 		numSites++;
 | |
| 		goto loop;
 | |
| 	done:
 | |
| 		m_sbuf1.safePrintf(") | ");
 | |
| 		// inc totalLen
 | |
| 		m_sitesQueryLen = m_sitesLen + (numSites * 10);
 | |
| 	}
 | |
| 	*/
 | |
| 
 | |
| 	// prepend
 | |
| 	char *qp = hr->getString("prepend",NULL,NULL);
 | |
| 	if( qp && qp[0] ) {
 | |
| 		//if( p > pstart ) *p++ =  ' ';
 | |
| 		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
 | |
| 		//p += sprintf( p, "+gblang:%"INT32" |", m_gblang );
 | |
| 		m_sbuf1.safePrintf( "%s", qp );
 | |
| 	}
 | |
| 
 | |
| 	// boolean OR terms
 | |
| 	bool boolq = false;
 | |
| 	char *any = hr->getString("any",NULL);
 | |
| 	bool first = true;
 | |
| 	if ( any ) {
 | |
| 		char *s = any;
 | |
| 		char *send = any + gbstrlen(any);
 | |
| 	 	if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
 | |
| 	 	if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
 | |
| 		while (s < send) {
 | |
| 			while (isspace(*s) && s < send) s++;
 | |
| 			char *s2 = s+1;
 | |
| 			if (*s == '\"') {
 | |
| 				// if there's no closing quote just treat
 | |
| 				// the end of the line as such
 | |
| 				while (*s2 != '\"' && s2 < send) s2++;
 | |
| 				if (s2 < send) s2++;
 | |
| 			} else {
 | |
| 				while (!isspace(*s2) && s2 < send) s2++;
 | |
| 			}
 | |
| 			if ( first ) m_sbuf1.safeStrcpy("(");
 | |
| 			if ( first ) m_sbuf2.safeStrcpy("(");
 | |
| 			if ( ! first ) m_sbuf1.safeStrcpy(" OR ");
 | |
| 			if ( ! first ) m_sbuf2.safeStrcpy(" OR ");
 | |
| 			first = false;
 | |
| 			m_sbuf1.safeMemcpy ( s , s2 - s );
 | |
| 			m_sbuf2.safeMemcpy ( s , s2 - s );
 | |
| 			s = s2 + 1;
 | |
| 		}
 | |
| 	}
 | |
| 	if ( ! first ) m_sbuf1.safeStrcpy(") AND ");
 | |
| 	if ( ! first ) m_sbuf2.safeStrcpy(") AND ");
 | |
| 	if ( ! first ) boolq = true;
 | |
| 
 | |
| 
 | |
| 
 | |
| 	// and this
 | |
| 	if ( m_secsBack > 0 ) {
 | |
| 		int32_t timestamp = getTimeGlobalNoCore();
 | |
| 		timestamp -= m_secsBack;
 | |
| 		if ( timestamp <= 0 ) timestamp = 0;
 | |
| 		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
 | |
| 		m_sbuf1.safePrintf("gbminint:gbspiderdate:%" UINT32 "",timestamp);
 | |
| 	}
 | |
| 
 | |
| 	if ( m_sortBy == 1 ) {
 | |
| 		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
 | |
| 		m_sbuf1.safePrintf("gbsortbyint:gbspiderdate");
 | |
| 	}
 | |
| 
 | |
| 	if ( m_sortBy == 2 ) {
 | |
| 		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
 | |
| 		m_sbuf1.safePrintf("gbrevsortbyint:gbspiderdate");
 | |
| 	}
 | |
| 
 | |
| 	if ( m_sortBy == 3 ) {
 | |
| 		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
 | |
| 		m_sbuf1.safePrintf("gbsortbyint:gbsitenuminlinks");
 | |
| 	}
 | |
| 
 | |
| 	char *ft = m_filetype;
 | |
| 	if ( ft && strcasecmp(ft,"any")==0 ) ft = NULL;
 | |
| 	if ( ft && ! ft[0] ) ft = NULL;
 | |
| 	if ( ft ) {
 | |
| 		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
 | |
| 		m_sbuf1.safePrintf("filetype:%s",ft);
 | |
| 	}
 | |
| 
 | |
| 	// facet prepend en masse
 | |
| 	// for ( int32_t i = 1 ; i <= 6 ; i++ ) {
 | |
| 	// 	char tmp[12];
 | |
| 	// 	sprintf(tmp,"facet%"INT32"",i);
 | |
| 	// 	char *ff = hr->getString(tmp,NULL);
 | |
| 	// 	if ( ! ff ) continue;
 | |
| 	// 	if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
 | |
| 	// 	m_sbuf1.safePrintf("%s",ff);
 | |
| 	// }
 | |
| 
 | |
| 	// one at a time for now
 | |
| 	char *ff = hr->getString("facet",NULL);
 | |
| 	if ( ff ) {
 | |
| 	 	if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
 | |
| 	 	m_sbuf1.safePrintf("%s",ff);
 | |
| 	 }
 | |
| 
 | |
| 
 | |
| 	// append site: term
 | |
| 	// if ( m_sites && m_sites[0] ) {
 | |
| 	// 	if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
 | |
| 	// 	m_sbuf1.safePrintf("+site:");
 | |
| 	// 	m_sbuf1.safeStrcpy(m_sites);
 | |
| 	// }
 | |
| 
 | |
| 	if ( m_familyFilter ) {
 | |
| 	 	if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
 | |
| 	 	//if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
 | |
| 	 	m_sbuf1.safePrintf( "+gbisadult:0");
 | |
| 	 	//m_sbuf2.safePrintf( "+gbisadult:0");
 | |
| 		if ( ! boolq ) {
 | |
| 			m_sbuf1.safeStrcpy(" |");
 | |
| 			//m_sbuf2.safeStrcpy(" |");
 | |
| 		}
 | |
| 		else {
 | |
| 			m_sbuf1.safeStrcpy(" AND ");
 | |
| 			//m_sbuf2.safeStrcpy(" AND ");
 | |
| 		}
 | |
| 
 | |
| 	}
 | |
| 
 | |
| 	// PRE-pend gblang: term
 | |
| 	int32_t gblang = hr->getLong("gblang",-1);
 | |
| 	if( gblang >= 0 ) {
 | |
| 	 	if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
 | |
| 	 	if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
 | |
| 	 	m_sbuf1.safePrintf( "+gblang:%" INT32 "", gblang );
 | |
| 	 	m_sbuf2.safePrintf( "+gblang:%" INT32 "", gblang );
 | |
| 		if ( ! boolq ) {
 | |
| 			m_sbuf1.safeStrcpy(" |");
 | |
| 			m_sbuf2.safeStrcpy(" |");
 | |
| 		}
 | |
| 		else {
 | |
| 			m_sbuf1.safeStrcpy(" AND ");
 | |
| 			m_sbuf2.safeStrcpy(" AND ");
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// bookmark here so we can copy into st->m_displayQuery below
 | |
| 	//int32_t displayQueryOffset = m_sbuf1.length();
 | |
| 	// append url: term
 | |
| 	// if ( m_url && m_url[0] ) {
 | |
| 	// 	//if ( p > pstart ) *p++ = ' ';
 | |
| 	// 	if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
 | |
| 	// 	//gbmemcpy ( p , "+url:" , 5 ); p += 5;
 | |
| 	// 	m_sbuf1.safeStrcpy ( "+url:");
 | |
| 	// 	//gbmemcpy ( p , m_url , m_urlLen ); p += m_urlLen;
 | |
| 	// 	m_sbuf1.safeStrcpy ( m_url );
 | |
| 	// }
 | |
| 	// append url: term
 | |
| 	if ( m_link && m_link[0] ) {
 | |
| 	 	if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
 | |
| 	 	if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
 | |
| 		m_sbuf1.safeStrcpy ( "+link:");
 | |
| 		m_sbuf2.safeStrcpy ( "+link:");
 | |
| 		m_sbuf1.safeStrcpy ( m_link );
 | |
| 		m_sbuf2.safeStrcpy ( m_link );
 | |
| 		if ( ! boolq ) {
 | |
| 			m_sbuf1.safeStrcpy(" |");
 | |
| 			m_sbuf2.safeStrcpy(" |");
 | |
| 		}
 | |
| 		else {
 | |
| 			m_sbuf1.safeStrcpy(" AND ");
 | |
| 			m_sbuf2.safeStrcpy(" AND ");
 | |
| 		}
 | |
| 	}
 | |
| 	m_sbuf1.setLabel("sisbuf1");
 | |
| 	m_sbuf2.setLabel("sisbuf2");
 | |
| 	m_sbuf3.setLabel("sisbuf3");
 | |
| 	// append the natural query
 | |
| 	if ( m_query && m_query[0] ) {
 | |
| 		//if ( p  > pstart  ) *p++  = ' ';
 | |
| 		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
 | |
| 		//p += ucToUtf8(p, pend-p, m_query, m_queryLen, csStr, 0,0);
 | |
| 		m_sbuf1.safeStrcpy ( m_query );
 | |
| 		//gbmemcpy ( p  , m_query , m_queryLen ); p  += m_queryLen;
 | |
| 		// add to spell checked buf, too		
 | |
| 		//if ( p2 > pstart2 ) *p2++ = ' ';
 | |
| 		if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
 | |
| 		//p2 +=ucToUtf8(p2, pend2-p2, m_query, m_queryLen, csStr, 0,0);
 | |
| 		m_sbuf2.safeStrcpy ( m_query );
 | |
| 		//gbmemcpy ( p2 , m_query , m_queryLen ); p2 += m_queryLen;
 | |
| 	}
 | |
| 	// if ( m_query2 && m_query2[0] ) {
 | |
| 	// 	//if ( p3 > pstart3 ) *p3++ = ' ';
 | |
| 	// 	if ( m_sbuf3.length() ) m_sbuf3.pushChar(' ');
 | |
| 	// 	//p3+=ucToUtf8(p3, pend3-p3, m_query2, m_query2Len, csStr,0,0);
 | |
| 	// 	m_sbuf3.safeStrcpy ( m_query2 );
 | |
| 	// }
 | |
| 	//if (g_errno == EILSEQ){ // illegal character seq
 | |
| 	//	log("query: bad char set");
 | |
| 	//	g_errno = 0;
 | |
| 	//	if (qcs == csUTF8) {qcs = csISOLatin1;goto doOver;}
 | |
| 	//	if (qcs != csISOLatin1) {qcs = csUTF8;goto doOver;}
 | |
| 	//}
 | |
| 	// append quoted phrases to query
 | |
| 	if ( m_quote1 && m_quote1[0] ) {
 | |
| 		//if ( p  > pstart  ) *p++  = ' ';
 | |
| 		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
 | |
| 		//*p++ = '+';
 | |
| 		//*p++ = '\"';
 | |
| 		if ( ! boolq ) {
 | |
| 			m_sbuf1.safeStrcpy(" +\"");
 | |
| 			m_sbuf2.safeStrcpy(" +\"");
 | |
| 		}
 | |
| 		else {
 | |
| 			m_sbuf1.safeStrcpy(" AND \"");
 | |
| 			m_sbuf2.safeStrcpy(" AND \"");
 | |
| 		}
 | |
| 		//p += ucToUtf8(p, pend-p, m_quote1, m_quoteLen1, csStr, 0,0);
 | |
| 		m_sbuf1.safeStrcpy ( m_quote1 );
 | |
| 		//gbmemcpy ( p , m_quote1 , m_quoteLen1 ); p += m_quoteLen1 ;
 | |
| 		//*p++ = '\"';
 | |
| 		m_sbuf1.safeStrcpy("\"");
 | |
| 		// add to spell checked buf, too
 | |
| 		//if ( p2 > pstart2 ) *p2++ = ' ';
 | |
| 		if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
 | |
| 		//*p2++ = '+';
 | |
| 		//*p2++ = '\"';
 | |
| 		//p2+=ucToUtf8(p2, pend2-p2, m_quote1, m_quoteLen1, csStr,0,0);
 | |
| 		m_sbuf2.safeStrcpy ( m_quote1 );
 | |
| 		//gbmemcpy ( p2 , m_quote1 , m_quoteLen1 ); p2 += m_quoteLen1 ;
 | |
| 		//*p2++ = '\"';
 | |
| 		m_sbuf2.safeStrcpy("\"");
 | |
| 	}
 | |
| 	//if (g_errno == EILSEQ){ // illegal character seq
 | |
| 	//	g_errno = 0;
 | |
| 	//	if (qcs == csUTF8) {qcs = csISOLatin1;goto doOver;}
 | |
| 	//	if (qcs != csISOLatin1) {qcs = csUTF8;goto doOver;}
 | |
| 	//}
 | |
| 	if ( m_quote2 && m_quote2[0] ) {
 | |
| 		//if ( p  > pstart  ) *p++  = ' ';
 | |
| 		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
 | |
| 		//*p++ = '+';
 | |
| 		//*p++ = '\"';
 | |
| 
 | |
| 		if ( ! boolq ) {
 | |
| 			m_sbuf1.safeStrcpy(" +\"");
 | |
| 			m_sbuf2.safeStrcpy(" +\"");
 | |
| 		}
 | |
| 		else {
 | |
| 			m_sbuf1.safeStrcpy(" AND \"");
 | |
| 			m_sbuf2.safeStrcpy(" AND \"");
 | |
| 		}
 | |
| 
 | |
| 		//m_sbuf1.safeStrcpy("+\"");
 | |
| 		//p += ucToUtf8(p, pend-p, m_quote2, m_quoteLen2, csStr, 0,0);
 | |
| 		m_sbuf1.safeStrcpy ( m_quote2 );
 | |
| 		//gbmemcpy ( p , m_quote2 , m_quoteLen2 ); p += m_quoteLen2 ;
 | |
| 		//*p++ = '\"';
 | |
| 		m_sbuf1.safeStrcpy("\"");
 | |
| 		// add to spell checked buf, too
 | |
| 		//if ( p2 > pstart2 ) *p2++ = ' ';
 | |
| 		if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
 | |
| 		//*p2++ = '+';
 | |
| 		//*p2++ = '\"';
 | |
| 		//m_sbuf2.safeStrcpy("+\"");
 | |
| 		//p2+=ucToUtf8(p2, pend2-p2, m_quote2, m_quoteLen2, csStr,0,0);
 | |
| 		m_sbuf2.safeStrcpy ( m_quote2 );
 | |
| 		//gbmemcpy ( p2 , m_quote2 , m_quoteLen2 ); p2 += m_quoteLen2 ;
 | |
| 		//*p2++ = '\"';
 | |
| 		m_sbuf2.safeStrcpy("\"");
 | |
| 	}
 | |
| 	//if (g_errno == EILSEQ){ // illegal character seq
 | |
| 	//	g_errno = 0;
 | |
| 	//	if (qcs == csUTF8) {qcs = csISOLatin1;goto doOver;}
 | |
| 	//	if (qcs != csISOLatin1) {qcs = csUTF8;goto doOver;}
 | |
| 	//}
 | |
| 
 | |
| 	// append plus terms
 | |
| 	if ( m_plus && m_plus[0] ) {
 | |
| 		char *s = m_plus;
 | |
| 		char *send = m_plus + gbstrlen(m_plus);
 | |
| 		//if ( p > pstart && p < pend ) *p++  = ' ';
 | |
| 		//if ( p2 > pstart2 && p2 < pend2) *p2++ = ' ';
 | |
| 		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
 | |
| 		if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
 | |
| 		while (s < send) {
 | |
| 			while (isspace(*s) && s < send) s++;
 | |
| 			char *s2 = s+1;
 | |
| 			if (*s == '\"') {
 | |
| 				// if there's no closing quote just treat
 | |
| 				// the end of the line as such
 | |
| 				while (*s2 != '\"' && s2 < send) s2++;
 | |
| 				if (s2 < send) s2++;
 | |
| 			} else {
 | |
| 				while (!isspace(*s2) && s2 < send) s2++;
 | |
| 			}
 | |
| 			//if (s2 < send) break;
 | |
| 			//if (p < pend) *p++ = '+';
 | |
| 			//if (p2 < pend2) *p2++ = '+';
 | |
| 			//m_sbuf1.pushChar('+');
 | |
| 			//m_sbuf2.pushChar('+');
 | |
| 			if ( ! boolq ) {
 | |
| 				m_sbuf1.safeStrcpy("+");
 | |
| 				m_sbuf2.safeStrcpy("+");
 | |
| 			}
 | |
| 			else {
 | |
| 				m_sbuf1.safeStrcpy(" AND ");
 | |
| 				m_sbuf2.safeStrcpy(" AND ");
 | |
| 			}
 | |
| 
 | |
| 			//p += ucToUtf8(p, pend-p, s, s2-s, csStr, 0,0);
 | |
| 			//p2 += ucToUtf8(p2, pend2-p2, s, s2-s, csStr, 0,0);
 | |
| 			m_sbuf1.safeMemcpy ( s , s2 - s );
 | |
| 			m_sbuf2.safeMemcpy ( s , s2 - s );
 | |
| 			/*
 | |
| 			if (g_errno == EILSEQ) { // illegal character seq
 | |
| 				g_errno = 0;
 | |
| 				if (qcs == csUTF8) {
 | |
| 					qcs = csISOLatin1;
 | |
| 					goto doOver;
 | |
| 				}
 | |
| 				if (qcs != csISOLatin1) {
 | |
| 					qcs = csUTF8;
 | |
| 					goto doOver;
 | |
| 				}
 | |
| 			}
 | |
| 			*/
 | |
| 			s = s2 + 1;
 | |
| 			if (s < send) {
 | |
| 				//if (p < pend) *p++ = ' ';
 | |
| 				//if (p2 < pend2) *p2++ = ' ';
 | |
| 				if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
 | |
| 				if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 	}  
 | |
| 	// append minus terms
 | |
| 	if ( m_minus && m_minus[0] ) {
 | |
| 		char *s = m_minus;
 | |
| 		char *send = m_minus + gbstrlen(m_minus);
 | |
| 		//if ( p > pstart && p < pend ) *p++  = ' ';
 | |
| 		//if ( p2 > pstart2 && p2 < pend2) *p2++ = ' ';
 | |
| 		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
 | |
| 		if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
 | |
| 		while (s < send) {
 | |
| 			while (isspace(*s) && s < send) s++;
 | |
| 			char *s2 = s+1;
 | |
| 			if (*s == '\"') {
 | |
| 				// if there's no closing quote just treat
 | |
| 				// the end of the line as such
 | |
| 				while (*s2 != '\"' && s2 < send) s2++;
 | |
| 				if (s2 < send) s2++;
 | |
| 			} else {
 | |
| 				while (!isspace(*s2) && s2 < send) s2++;
 | |
| 			}
 | |
| 			if (s2 < send) break;
 | |
| 			//if (p < pend) *p++ = '-';
 | |
| 			//if (p2 < pend2) *p2++ = '-';
 | |
| 			// m_sbuf1.pushChar('-');
 | |
| 			// m_sbuf2.pushChar('-');
 | |
| 
 | |
| 			if ( ! boolq ) {
 | |
| 				m_sbuf1.safeStrcpy("-");
 | |
| 				m_sbuf2.safeStrcpy("-");
 | |
| 			}
 | |
| 			else {
 | |
| 				m_sbuf1.safeStrcpy(" AND NOT ");
 | |
| 				m_sbuf2.safeStrcpy(" AND NOT ");
 | |
| 			}
 | |
| 
 | |
| 			//p += ucToUtf8(p, pend-p, s, s2-s, csStr, 0,0);
 | |
| 			//p2 += ucToUtf8(p2, pend2-p2, s, s2-s, csStr, 0,0);
 | |
| 			m_sbuf1.safeMemcpy ( s , s2 - s );
 | |
| 			m_sbuf2.safeMemcpy ( s , s2 - s );
 | |
| 			/*
 | |
| 			if (g_errno == EILSEQ) { // illegal character seq
 | |
| 				g_errno = 0;
 | |
| 				if (qcs == csUTF8) {
 | |
| 					qcs = csISOLatin1;
 | |
| 					goto doOver;
 | |
| 				}
 | |
| 				if (qcs != csISOLatin1) {
 | |
| 					qcs = csUTF8;
 | |
| 					goto doOver;
 | |
| 				}
 | |
| 			}
 | |
| 			*/
 | |
| 			s = s2 + 1;
 | |
| 			if (s < send) {
 | |
| 				//if (p < pend) *p++ = ' ';
 | |
| 				//if (p2 < pend2) *p2++ = ' ';
 | |
| 				if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
 | |
| 				if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 	// append gbkeyword:numinlinks if they have &mininlinks=X, X>0
 | |
| 	int32_t minInlinks = m_hr.getLong("mininlinks",0);
 | |
| 	if ( minInlinks > 0 ) {
 | |
| 		//if ( p > pstart ) *p++ = ' ';
 | |
| 		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
 | |
| 		//char *str = "gbkeyword:numinlinks";
 | |
| 		//int32_t  len = gbstrlen(str);
 | |
| 		//gbmemcpy ( p , str , len );
 | |
| 		//p += len;
 | |
| 		m_sbuf1.safePrintf ( "gbkeyword:numinlinks");
 | |
| 	}
 | |
| 
 | |
| 	// NUL-terminate the three query buffers; return false if any call fails
 | |
| 	if ( ! m_sbuf1.nullTerm() ) return false;
 | |
| 	if ( ! m_sbuf2.nullTerm() ) return false;
 | |
| 	if ( ! m_sbuf3.nullTerm() ) return false;
 | |
| 
 | |
| 	// the natural query
 | |
| 	m_displayQuery = m_sbuf2.getBufStart();// + displayQueryOffset;
 | |
| 
 | |
| 	if ( ! m_displayQuery ) m_displayQuery = "";
 | |
| 
 | |
| 	while ( *m_displayQuery == ' ' ) m_displayQuery++;
 | |
| 
 | |
| 	//m_displayQueryLen = gbstrlen(m_displayQuery);//p-m_displayQuery
 | |
| 
 | |
| 
 | |
| 	//log("query: got query %s",m_sbuf1.getBufStart());
 | |
| 	//log("query: got display query %s",m_displayQuery);
 | |
| 
 | |
| 	// urlencoded display query
 | |
| 	m_qe.urlEncode ( m_displayQuery );
 | |
| 
 | |
| 	// urlEncode(m_qe,
 | |
| 	// 	  MAX_QUERY_LEN*2,
 | |
| 	// 	  m_displayQuery,
 | |
| 	// 	  gbstrlen(m_displayQuery));
 | |
| 	
 | |
| 
 | |
| 
 | |
| 
 | |
| 	//////////
 | |
| 	//
 | |
| 	// show DMOZ BREADCRUMB if doing a 
 | |
| 	// "gbpcatid:<catid> |" (Search restricted to category)
 | |
| 	// "gbcatid:<catid>"    (DMOZ urls in that topic, c=dmoz3)
 | |
| 	//
 | |
| 	//////////
 | |
| 	int32_t pcatId = -1;
 | |
| 	int32_t dcatId  = -1;
 | |
| 	// get the final query
 | |
| 	char *q =m_sbuf1.getBufStart();
 | |
| 
 | |
| 	if ( q ) sscanf(q,"gbpcatid:%" INT32 "",&pcatId);
 | |
| 	if ( q ) sscanf(q,"gbcatid:%" INT32 "",&dcatId);
 | |
| 	// pick the one that is valid
 | |
| 	int32_t catId = -1;
 | |
| 	if ( pcatId >= 0 ) catId = pcatId;
 | |
| 	if ( dcatId >= 0 ) catId = dcatId;
 | |
| 	
 | |
| 	//////
 | |
| 	//
 | |
| 	// save catid into the state
 | |
| 	m_catId = catId;
 | |
| 	//
 | |
| 	///////
 | |
| 
 | |
| 	// is this category in a right-to-left language, like Hebrew?
 | |
| 	if ( catId > 0 && g_categories->isIdRTL(catId) )
 | |
| 		m_isRTL = true;
 | |
| 	else
 | |
| 		m_isRTL = false;
 | |
| 
 | |
| 	return true;
 | |
| }
 | |
| 
 | |
| /*
 | |
| uint8_t SearchInput::detectQueryLanguage(void) {
 | |
| 	uint8_t lang = 0;
 | |
| 	// Check to see if default language is set.
 | |
| 	// This should override everything else.
 | |
| 	//if(m_defaultSortLanguage)
 | |
| 	//		lang = getLanguageFromAbbr(m_defaultSortLanguage);
 | |
| 
 | |
| 	// Set query language from User Agent string, if possible
 | |
| 	if(!lang && m_hr.getUserAgent())
 | |
| 		lang= g_langId.guessLanguageFromUserAgent(m_hr.getUserAgent());
 | |
| 
 | |
| 	// guess from query terms
 | |
| 	if(!lang && m_q)
 | |
| 		lang = g_langId.guessLanguageFromQuery(m_q);
 | |
| 
 | |
| 	// guess from IP addr of the requester
 | |
| 	if(!lang && m_queryIP)
 | |
| 		lang = g_langId.guessLanguageFromIP(m_queryIP);
 | |
| 
 | |
| 	// Save for later
 | |
| 	m_langHint = lang;
 | |
| 
 | |
| 	if(m_gbcountry && m_gbcountryLen > 0)
 | |
| 		m_country = g_countryCode.getIndexOfAbbr(m_gbcountry);
 | |
| 
 | |
| 	if(!m_country) {
 | |
| 		// Now guess country of the query.
 | |
| 		char *codep = g_langId.findGeoIP(m_queryIP, geoIPNumRows - 1, 0);
 | |
| 		if(codep) m_country = g_countryCode.getIndexOfAbbr(codep);
 | |
| 
 | |
| 		// Many doofuses just download firefox and don't set it
 | |
| 		// up properly, so this takes second place to the IP search.
 | |
| 		if(!m_country)
 | |
| 			m_country = g_langId.guessCountryFromUserAgent(m_hr.getUserAgent());
 | |
| 
 | |
| 	}
 | |
| 
 | |
| 	return(lang);
 | |
| }
 | |
| */
 | |
| 
 | |
| //char getFormatFromRequest ( HttpRequest *r ) {
 | |
| //
 | |
| //}
 |