// Words.cpp — tokenizes document text into alnum words, punctuation runs
// and tags for indexing.
#include "gb-include.h"
|
|
|
|
#include "Words.h"
|
|
#include "Phrases.h" // for isInPhrase() for hashWordIffNotInPhrase
|
|
#include "Unicode.h" // getUtf8CharSize()
|
|
#include "StopWords.h"
|
|
#include "Speller.h"
|
|
#include "HashTableX.h"
|
|
#include "Sections.h"
|
|
#include "XmlNode.h" // getTagLen()
|
|
|
|
//static int32_t printstring ( char *s , int32_t len ) ;
|
|
|
|
// Constructor. m_buf/m_bufSize must be cleared BEFORE calling reset(),
// because reset() conditionally frees m_buf when it is non-NULL and not
// one of the local buffers.
Words::Words ( ) {
	m_buf = NULL;
	m_bufSize = 0;
	reset();
}
|
|
// Destructor. reset() releases the word-array buffer if it was
// heap-allocated by allocateWordBuffers().
Words::~Words ( ) {
	reset();
}
|
|
// Return the object to its freshly-constructed state, releasing the
// word-array buffer if it was heap-allocated (i.e. not one of the
// local/caller-supplied buffers).
void Words::reset ( ) {
	m_numWords = 0;
	m_numAlnumWords = 0;
	m_xml = NULL;
	m_preCount = 0;
	// only free if allocateWordBuffers() actually mmalloc'd it;
	// m_localBuf is on-object, m_localBuf2 belongs to the caller
	if ( m_buf && m_buf != m_localBuf && m_buf != m_localBuf2 )
		mfree ( m_buf , m_bufSize , "Words" );
	m_buf = NULL;
	m_bufSize = 0;
	m_tagIds = NULL;
	m_s = NULL;
	m_numTags = 0;
	m_hasTags = false;
	// forget the caller-supplied buffer from setxi(); it is not ours
	m_localBuf2 = NULL;
	m_localBufSize2 = 0;
}
|
|
|
|
bool Words::set ( char *s, int32_t slen, int32_t version,
|
|
bool computeWordIds,
|
|
int32_t niceness) {
|
|
// bail if nothing
|
|
if ( ! s || slen == 0 ) {
|
|
m_numWords = 0;
|
|
m_numAlnumWords = 0;
|
|
return true;
|
|
}
|
|
|
|
char c = s[slen];
|
|
if ( c != '\0' ) s[slen]='\0';
|
|
bool status = set ( s , version, computeWordIds , niceness );
|
|
if ( c != '\0' ) s[slen] = c;
|
|
return status;
|
|
}
|
|
|
|
// a quickie
|
|
// this url gives a m_preCount that is too low. why?
|
|
// http://go.tfol.com/163/speed.asp
|
|
// Return a rough upper bound on the number of tokens (alnum runs,
// punctuation runs, and '<' tags) in the first "plen" bytes of "p".
// Used to pre-size the word arrays before the real tokenization pass.
int32_t countWords ( char *p , int32_t plen , int32_t niceness ) {
	char *pend = p + plen;
	int32_t count = 1;
	do {
		// a run of non-alnum bytes is one punctuation "word";
		// every '<' inside it may become a tag word, so count those
		while ( p < pend && ! is_alnum_utf8 (p) ) {
			// breathe
			QUICKPOLL ( niceness );
			if ( *p == '<' ) count++;
			p += getUtf8CharSize(p);
		}
		count++;
		// a run of alnum bytes is one word
		while ( p < pend && is_alnum_utf8 (p) ) {
			// breathe
			QUICKPOLL ( niceness );
			p += getUtf8CharSize(p);
		}
		count++;
	} while ( p < pend );
	// some extra for good measure
	return count + 10;
}
|
|
|
|
// Same as the length-delimited countWords() above, but for a
// NUL-terminated string: rough upper bound on token count.
int32_t countWords ( char *p , int32_t niceness ) {
	int32_t count = 1;
	do {
		// punctuation run counts as one word; '<' chars inside it
		// may become tag words, so count each of those too
		while ( *p && ! is_alnum_utf8 (p) ) {
			// breathe
			QUICKPOLL ( niceness );
			if ( *p == '<' ) count++;
			p += getUtf8CharSize(p);
		}
		count++;
		// alnum run counts as one word
		while ( *p && is_alnum_utf8 (p) ) {
			// breathe
			QUICKPOLL ( niceness );
			p += getUtf8CharSize(p);
		}
		count++;
	} while ( *p );
	// some extra for good measure
	return count + 10;
}
|
|
|
|
static bool s_tested = false;
|
|
|
|
// Tokenize a parsed XML document into words/tags, optionally over the
// half-open node range [node1,node2) (node2 == -1 means "to the end").
// Text nodes are split by addWords(); each tag node becomes a single
// zero-wordId "word" carrying its tagId (back tags get BACKBIT or'd in).
bool Words::set ( Xml *xml,
		  bool computeWordIds ,
		  int32_t niceness ,
		  int32_t node1 ,
		  int32_t node2 ) {
	// prevent setting with the same string
	if ( m_xml == xml ) { char *xx=NULL;*xx=0; }
	reset();
	m_xml = xml;
	m_version = xml->getVersion();

	// one-time sanity test of the utf8 helpers
	if ( ! s_tested ) {
		// only do once
		s_tested = true;
		// set c to a curling quote in unicode
		int32_t c = 0x201c;
		// encode it into utf8
		char dst[5];
		// point to it
		char *p = dst;
		// put space in there
		*p++ = ' ';
		// "numBytes" is how many bytes it stored into "dst"
		int32_t numBytes = utf8Encode ( c , p );
		// U+201C must encode to exactly 3 bytes
		if ( numBytes != 3 ) { char *xx=NULL; *xx=0; }
		// decoder must agree on the size
		int32_t size = getUtf8CharSize(p);
		if ( size != 3 ) { char *xx=NULL; *xx=0; }
		// and it must classify as punctuation
		if ( ! is_punct_utf8 ( p ) ) { char *xx=NULL;*xx=0; }
	}

	// if xml is empty, bail
	if ( ! xml->getContent() ) return true;

	int32_t numNodes = xml->getNumNodes();
	if ( numNodes <= 0 ) return true;

	// . can be given a range, if node2 is -1 that means all!
	// . range is half-open: [node1, node2)
	if ( node2 < 0 ) node2 = numNodes;
	// sanity check
	if ( node1 > node2 ) { char *xx=NULL;*xx=0; }
	char *start = xml->getNode(node1);
	char *end = xml->getNode(node2-1) + xml->getNodeLen(node2-1);
	int32_t size = end - start;

	// rough upper bound on token count, used to size the arrays
	m_preCount = countWords( start , size , niceness );

	// allocate based on the approximate count (true = with tagIds)
	if ( ! allocateWordBuffers(m_preCount, true)) return false;

	// walk the nodes, stopping early if the estimate runs out
	for ( int32_t k = node1 ; k < node2 && m_numWords < m_preCount ; k++ ){
		// get the kth node
		char *node = xml->getNode (k);
		int32_t nodeLen = xml->getNodeLen(k);
		// is the kth node a tag?
		if ( ! xml->isTag(k) ) {
			// text node: temporarily NUL-terminate and tokenize
			char c = node[nodeLen];
			node[nodeLen] = '\0';
			addWords(node,nodeLen,computeWordIds,niceness);
			node[nodeLen] = c;
			continue;
		}
		// it is a tag: record as one zero-id "word"
		m_words [m_numWords] = node;
		m_wordLens [m_numWords] = nodeLen;
		m_tagIds [m_numWords] = xml->getNodeId(k);
		m_wordIds [m_numWords] = 0LL;
		m_nodes [m_numWords] = k;
		// we have less than 127 HTML tags, so set
		// the high bit for back tags
		if ( xml->isBackTag(k)) {
			m_tagIds[m_numWords] |= BACKBIT;
		}
		m_numWords++;
		// used by XmlDoc.cpp
		m_numTags++;
		continue;
	}
	return true;
}
|
|
|
|
bool Words::set11 ( char *s , char *send , int32_t niceness ) {
|
|
reset();
|
|
m_version = TITLEREC_CURRENT_VERSION;
|
|
m_s = s;
|
|
// this will make addWords() scan for tags
|
|
m_hasTags = true;
|
|
// save it
|
|
char saved = *send;
|
|
// null term
|
|
*send = '\0';
|
|
// determine rough upper bound on number of words by counting
|
|
// punct/alnum boundaries
|
|
m_preCount = countWords ( s , niceness );
|
|
// true = tagIds
|
|
bool status = allocateWordBuffers(m_preCount,true);
|
|
// deal with error now
|
|
if ( ! status ) { *send = saved; return false; }
|
|
// and set the words
|
|
status = addWords(s,0x7fffffff, true, niceness );
|
|
// bring it back
|
|
*send = saved;
|
|
// return error?
|
|
return status;
|
|
}
|
|
|
|
bool Words::setxi ( char *s , char *buf, int32_t bufSize, int32_t niceness ) {
|
|
// prevent setting with the same string
|
|
if ( m_s == s ) { char *xx=NULL;*xx=0; }
|
|
reset();
|
|
m_version = TITLEREC_CURRENT_VERSION;
|
|
// save for sanity check
|
|
m_s = s;
|
|
m_localBuf2 = buf;
|
|
m_localBufSize2 = bufSize;
|
|
// determine rough upper bound on number of words by counting
|
|
// punct/alnum boundaries
|
|
m_preCount = countWords ( s , niceness );
|
|
if (!allocateWordBuffers(m_preCount)) return false;
|
|
bool computeWordIds = true;
|
|
return addWords(s,0x7fffffff, computeWordIds, niceness );
|
|
}
|
|
|
|
// . set words from a string
|
|
// . assume no HTML entities in the string "s"
|
|
// . s must be NULL terminated
|
|
// . NOTE: do not free "s" from under us cuz we reference it
|
|
// . break up the string ,"s", into "words".
|
|
// . doesn't do tags, only text nodes in "xml"
|
|
// . our definition of a word is as close to English as we can get it
|
|
// . BUT we also consider a string of punctuation characters to be a word
|
|
bool Words::set ( char *s , int32_t version,
|
|
bool computeWordIds ,
|
|
int32_t niceness ) {
|
|
|
|
// prevent setting with the same string
|
|
if ( m_s == s ) { char *xx=NULL;*xx=0; }
|
|
|
|
reset();
|
|
m_version = version;
|
|
// save for sanity check
|
|
m_s = s;
|
|
|
|
m_version = version;
|
|
// determine rough upper bound on number of words by counting
|
|
// punct/alnum boundaries
|
|
m_preCount = countWords ( s , niceness );
|
|
if (!allocateWordBuffers(m_preCount)) return false;
|
|
|
|
return addWords(s,0x7fffffff, computeWordIds, niceness );
|
|
}
|
|
|
|
#include "XmlNode.h"
|
|
|
|
// Core tokenizer. Scans NUL-terminated (or nodeLen-bounded) text "s",
// appending entries to the parallel arrays m_words/m_wordLens/m_wordIds
// (and m_tagIds when allocated). Tokens are:
//   - tags   ("<...>", only when m_hasTags): wordId 0, tagId set
//   - punct  (runs of non-word chars):       wordId 0
//   - alnum  (words, incl. 1,000 / 3.14 / C++ / c# / dave's forms):
//            wordId = hash64Lower_utf8 when computeWordIds is true
// Stops at nodeLen, at the NUL, or when m_preCount entries are filled.
// Always returns true.
bool Words::addWords(char *s,int32_t nodeLen,bool computeWordIds, int32_t niceness) {
	int32_t i = 0;
	int32_t j;
	int32_t wlen;
	// NOTE(review): badCount is never incremented in the live code any
	// more (only in code that was commented out), so the log at the
	// bottom is currently dead
	int32_t badCount = 0;

	bool hadApostrophe = false;

	// script tracking: a script change splits a word (latin is folded
	// into common so latin1 accents do not break words)
	UCScript oldScript = ucScriptCommon;
	UCScript saved;
	UCProps props;

 uptop:
	// bad utf8 can cause a breach
	if ( i >= nodeLen ) goto done;
	if ( ! s[i] ) goto done;

	// non-alnum: emit a tag word or a punctuation word
	if ( ! is_alnum_utf8(s+i) ) {
		if ( m_numWords >= m_preCount ) goto done;
		// tag?
		if ( s[i]=='<' && m_hasTags && isTagStart(s+i) ) {
			// get the tag id
			if ( s[i+1]=='/' ) {
				// skip over / and mark as a back tag
				m_tagIds [m_numWords] = ::getTagId(s+i+2);
				m_tagIds [m_numWords] |= BACKBIT;
			}
			else
				m_tagIds [m_numWords] = ::getTagId(s+i+1);
			m_words [m_numWords] = s + i;
			m_wordIds [m_numWords] = 0LL;
			// skip till end
			int32_t tagLen = getTagLen(s+i);
			m_wordLens [m_numWords] = tagLen;
			m_numWords++;
			// advance
			i += tagLen;
			goto uptop;
		}
		// it is a punct word, find end of it
		char *start = s+i;
		for ( ; s[i] ; i += getUtf8CharSize(s+i)){
			// stop on < if we got tags
			if ( s[i] == '<' && m_hasTags ) break;
			// breathe
			QUICKPOLL(niceness);
			// if we are simple ascii, skip quickly
			if ( is_ascii(s[i]) ) {
				// accumulate NON-alnum chars
				if ( ! is_alnum_a(s[i]) ) continue;
				// update
				oldScript = ucScriptCommon;
				// otherwise, stop we got alnum
				break;
			}
			// if we are utf8 we stop on special props
			UChar32 c = utf8Decode ( s+i );
			// keep going while NOT a word char
			if ( ! ucIsWordChar ( c ) ) continue;
			// update first though
			oldScript = ucGetScript ( c );
			// then stop
			break;
		}
		m_words [ m_numWords ] = start;
		m_wordLens [ m_numWords ] = s+i - start;
		m_wordIds [ m_numWords ] = 0LL;
		if (m_tagIds) m_tagIds[m_numWords] = 0;
		m_numWords++;
		goto uptop;
	}

	// get an alnum word; j marks the word start
	j = i;
 again:
	for ( ; s[i] ; i += getUtf8CharSize(s+i) ) {
		// breathe
		QUICKPOLL(niceness);
		// simple ascii?
		if ( is_ascii(s[i]) ) {
			// accumulate alnum chars
			if ( is_alnum_a(s[i]) ) continue;
			// update
			oldScript = ucScriptCommon;
			// otherwise, stop we got punct
			break;
		}
		// get the code point of the utf8 char
		UChar32 c = utf8Decode ( s+i );
		// get props
		props = ucProperties ( c );
		// combining/ignorable marks stay inside the word
		if ( props & (UC_IGNORABLE|UC_EXTEND) ) continue;
		// if UC_WORDCHAR is not set, it is not alnum: end of word
		if ( ! ( props & UC_WORDCHAR ) ) {
			// reset script between words
			oldScript = ucScriptCommon;
			break;
		}
		// save it
		saved = oldScript;
		// update here
		oldScript = ucGetScript(c);
		// treat ucScriptLatin (30) as common so we can have latin1
		// like char without breaking the word!
		if ( oldScript == ucScriptLatin ) oldScript = ucScriptCommon;
		// ideographic/hiragana/thai chars are one-char words:
		// include this char, then stop
		if ( props & ( UC_IDEOGRAPH | UC_HIRAGANA | UC_THAI ) ) {
			// include it
			i += getUtf8CharSize(s+i);
			// but stop
			break;
		}
		// script change ends the word
		if ( saved != oldScript ) break;
	}

	// . java++, A++, C++ exception
	// . A+, C+, exception
	if ( s[i]=='+' ) {
		if ( s[i+1]=='+' && !is_alnum_utf8(&s[i+2]) ) i += 2;
		else if ( !is_alnum_utf8(&s[i+1]) ) i++;
	}
	// . c#, j#, ...
	if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++;

	// comma is ok if like 1,000 (word so far is 1-3 digits)
	if ( s[i]==',' &&
	     i-j <= 3 &&
	     is_digit(s[i-1]) ) {
		// if word so far is 2 or 3 chars, make sure digits
		if ( i-j >= 2 && ! is_digit(s[i-2]) ) goto nogo;
		if ( i-j >= 3 && ! is_digit(s[i-3]) ) goto nogo;
		// scan forward over each ",ddd" group
	subloop:
		if ( s[i] == ',' &&
		     is_digit(s[i+1]) &&
		     is_digit(s[i+2]) &&
		     is_digit(s[i+3]) &&
		     ! is_digit(s[i+4]) ) {
			i += 4;
			goto subloop;
		}
	}

	// decimal point?
	if ( s[i] == '.' &&
	     is_digit(s[i-1]) &&
	     is_digit(s[i+1]) ) {
		// allow the decimal point
		i++;
		// skip over string of digits
		while ( is_digit(s[i]) ) i++;
	}

 nogo:

	// allow for words like we're dave's and i'm (one apostrophe max)
	if(s[i]=='\''&&s[i+1]&&is_alnum_utf8(&s[i+1])&&!hadApostrophe){
		i++;
		hadApostrophe = true;
		goto again;
	}
	hadApostrophe = false;

	// get word length
	wlen = i - j;
	if ( m_numWords >= m_preCount ) goto done;
	m_words [ m_numWords ] = &s[j];
	m_wordLens[ m_numWords ] = wlen;

	// . Lars says it's better to leave the accented chars intact
	// . google agrees
	// . but what about "re'sume"?
	if ( computeWordIds ) {
		int64_t h = hash64Lower_utf8(&s[j],wlen);
		m_wordIds [m_numWords] = h;
	}
	if (m_tagIds) m_tagIds[m_numWords] = 0;
	m_numWords++;
	m_numAlnumWords++;
	// get the next punct/tag/alnum token
	goto uptop;

 done:
	// bad programming warning: estimate from countWords() was too low
	if ( m_numWords > m_preCount ) {
		log(LOG_LOGIC,
		    "build: words: set: Fix counting routine.");
		char *xx = NULL; *xx = 0;
	}
	// compute total length
	if ( m_numWords <= 0 ) m_totalLen = 0;
	else m_totalLen = m_words[m_numWords-1] - s + m_wordLens[m_numWords-1];

	if ( badCount )
		log("words: had %" INT32 " bad utf8 chars",badCount);

	return true;
}
|
|
|
|
// common to Unicode and ISO-8859-1
|
|
// Carve the parallel word arrays (ptrs, lens, ids, node indices and,
// optionally, tagIds) for "count" words out of one contiguous buffer.
// Buffer source preference: caller-supplied m_localBuf2, then the small
// on-object m_localBuf, then mmalloc. Returns false on overflow or OOM.
bool Words::allocateWordBuffers(int32_t count, bool tagIds) {
	// per-word footprint across all parallel arrays
	int32_t wordSize = 0;
	wordSize += sizeof(char *);	// m_words
	wordSize += sizeof(int32_t);	// m_wordLens
	wordSize += sizeof(int64_t);	// m_wordIds
	wordSize += sizeof(int32_t);	// m_nodes
	if ( tagIds ) wordSize += sizeof(nodeid_t);
	m_bufSize = wordSize * count;
	// the int32 multiply above can wrap negative on huge documents
	if(m_bufSize < 0) return log("build: word count overflow %" INT32 " "
				     "bytes wordSize=%" INT32 " count=%" INT32 ".",
				     m_bufSize, wordSize, count);
	if ( m_bufSize <= m_localBufSize2 && m_localBuf2 ) {
		m_buf = m_localBuf2;
	}
	else if ( m_bufSize <= WORDS_LOCALBUFSIZE ) {
		m_buf = m_localBuf;
	}
	else {
		m_buf = (char *)mmalloc ( m_bufSize , "Words" );
		if ( ! m_buf ) return log("build: Could not allocate %" INT32 " "
					  "bytes for parsing document.",
					  m_bufSize);
	}

	// set ptrs: carve the buffer into the parallel arrays in the same
	// order the sizes were summed above
	char *p = m_buf;
	m_words = (char **)p ;
	p += sizeof(char*) * count;
	m_wordLens = (int32_t *)p ;
	p += sizeof(int32_t)* count;
	m_wordIds = (int64_t *)p ;
	p += sizeof (int64_t) * count;
	m_nodes = (int32_t *)p;
	p += sizeof(int32_t) * count;

	if (tagIds) {
		m_tagIds = (nodeid_t*) p;
		p += sizeof(nodeid_t) * count;
	}

	// sanity: must not have carved past the end of the buffer
	if ( p > m_buf + m_bufSize ) { char *xx=NULL;*xx=0; }

	return true;
}
|
|
|
|
void Words::print( ) {
|
|
for (int32_t i=0;i<m_numWords;i++) {
|
|
printWord(i);
|
|
printf("\n");
|
|
}
|
|
}
|
|
|
|
// Debug helper: print one word's index, id and (truncated) text to stderr.
void Words::printWord ( int32_t i ) {
	fprintf(stderr,"#%05" INT32 " ",i);
	// NOTE(review): m_wordIds[i] is int64_t but is printed with the
	// UINT64 format macro — presumably intentional for display, verify
	fprintf(stderr,"%020" UINT64 " ",m_wordIds[i]);
	// print the word (printstring truncates after 17 chars)
	printstring(m_words[i],m_wordLens[i]);
}
|
|
|
|
// Print up to 17 characters of s[0..len) to stderr, skipping '\n' and
// '\r', and append "..." when truncated at 17. Returns the number of
// characters actually printed (excluding the "...").
int32_t printstring ( char *s , int32_t len ) {
	int32_t printed = 0;
	for ( int32_t k = 0 ; k < len ; k++ ) {
		char ch = s[k];
		// filter out newlines and carriage returns
		if ( ch == '\n' || ch == '\r' ) continue;
		// cap the visible output at 17 chars
		if ( printed >= 17 ) break;
		fprintf(stderr,"%c",ch);
		printed++;
	}
	if ( printed == 17 ) fprintf(stderr,"...");
	return printed;
}
|
|
|
|
/*
|
|
// for g_indexdb.getTermId()
|
|
#include "Indexdb.h"
|
|
|
|
// . hash all the words into "table"
|
|
// . NOTE: we append ":" to the prefixes for you, if one is not there already
|
|
bool Words::hash ( TermTable *table ,
|
|
Spam *spam ,
|
|
//Scores *scores ,
|
|
Weights *weights ,
|
|
uint32_t baseScore ,
|
|
uint32_t maxScore ,
|
|
int64_t startHash ,
|
|
char *prefix1 ,
|
|
int32_t prefixLen1 ,
|
|
char *prefix2 ,
|
|
int32_t prefixLen2 ,
|
|
bool useStems ,
|
|
bool hashUniqueOnly ,
|
|
int32_t version , // titleRecVersion ,
|
|
class Phrases *phrases ,
|
|
bool hashWordIffNotInPhrase ,
|
|
int32_t niceness ) {
|
|
//if (g_pbuf) g_pbufPtr+=sprintf(g_pbufPtr,"<b>Words::hash()</b><br>");
|
|
// don't hash if score is 0 or less.
|
|
if ( baseScore <= 0 ) return true;
|
|
|
|
// is the table storing the terms as strings, too?
|
|
// used by PageParser.cpp
|
|
SafeBuf *pbuf = table->getParserBuf();
|
|
|
|
// each word has a score (spam modified)
|
|
int32_t score;
|
|
int32_t score2;
|
|
// the score from the Scores class
|
|
int32_t *wscores = NULL;
|
|
int32_t norm = DW; // NORM_WORD_SCORE;
|
|
//if ( scores ) wscores = scores->m_scores;
|
|
// point to word weights over score if we got them
|
|
if ( weights ) {
|
|
wscores = weights->m_ww;
|
|
// set to default weight, DW, defined in Weights.h
|
|
norm = DW;
|
|
}
|
|
// the hash of each word
|
|
int64_t h;
|
|
// now hash each form of each word
|
|
for (int32_t i = 0 ; i < m_numWords; i++ ) {
|
|
// don't hash punct words
|
|
//if (m_isUnicode || m_version >= 67){
|
|
//if (!ucIsWordChar(((UChar*)m_words[i])[0])) continue;
|
|
if (!m_wordIds[i]) continue;
|
|
|
|
// . if the word is not in a phrase and
|
|
// "hashWordOnlyIfNotInPhrase" is true then don't hash it
|
|
// . this is used in LinkInfo::hash() to hash link text
|
|
if ( hashWordIffNotInPhrase && phrases->isInPhrase(i) )
|
|
continue;
|
|
// assume words has the baseScore
|
|
score = baseScore;
|
|
// modify score based on score vector... like Spam class
|
|
// but top score is XXX. the score vector weights words in
|
|
// different sections of the documents differently. sections
|
|
// that have lots of unhyperlinked text weight highly. this
|
|
// is used to strip out menus, etc. used to get articles for
|
|
// the news collection.
|
|
if ( wscores ) {
|
|
// ignore word completely if score is 0
|
|
if ( wscores[i] == 0 ) continue;
|
|
// scale the final score if we should
|
|
if ( wscores[i] != norm ) { // NORM_WORD_SCORE ) {
|
|
// . we use -1 to mean to index with minimal
|
|
// score but also to mean that it is not
|
|
// visible
|
|
// . used for things in <marquee> and <select>
|
|
// . see Scores.cpp
|
|
//if ( wscores[i] == -1 ) score = 1;
|
|
//score = (score * wscores[i]) >> 10;
|
|
// TODO: can this wrap?
|
|
score = (score * wscores[i]) / norm;
|
|
// never decrease all the way to 0
|
|
if ( score <= 0 ) score = 1;
|
|
}
|
|
}
|
|
QUICKPOLL(niceness);
|
|
// . hash the startHash with the wordId for this word
|
|
// . we must mask it before adding it to the table because
|
|
// this table is also used to hash IndexLists into that come
|
|
// from LinkInfo classes (incoming link text). And when
|
|
// those IndexLists are hashed they used masked termIds.
|
|
// So we should too...
|
|
//h = hash64 ( startHash , m_wordIds[i] ) & TERMID_MASK;
|
|
h = g_indexdb.getTermId ( startHash , m_wordIds[i] ) ;
|
|
//if (m_isUnicode &&
|
|
// (((UChar*)m_words[i])[0] == '1' ||
|
|
// ((UChar*)m_words[i])[0] == 's')){
|
|
// printf("Words::hash: starthash %"INT64" prefix2 \"
|
|
// %10s\" wordId "
|
|
// "(%"INT64") termId: (%"INT64") ",
|
|
// startHash, prefix2, m_wordIds[i], h);
|
|
// ucDebug(m_words[i], m_wordLens[i]);
|
|
//}
|
|
// . modify word's score based on the spam probability
|
|
// . don't hash it if it's heavily spammed (spam of 100%)
|
|
if ( spam && spam->getSpam(i) ) {
|
|
score = score - (score*spam->getSpam(i)) / 100;
|
|
if ( score <= 0 ) continue;
|
|
}
|
|
//if ( version >= 36 ) {
|
|
score2 = score >> 1;
|
|
if (score2 <= 0) score2 = 1;
|
|
//}
|
|
//else
|
|
// score2 = score;
|
|
// debug, show the score for 'york'
|
|
//if ( h == 25718418790376LL ) {
|
|
// int32_t ww = -1;
|
|
// if ( wscores ) ww = wscores[i];
|
|
// logf(LOG_DEBUG,"build: adding %" INT32 " for sex, wscore=%" INT32 " "
|
|
// "baseScore=%" INT32 "",
|
|
// score,ww,baseScore);
|
|
//}
|
|
|
|
// if we don't have to print out the parser info then
|
|
// do not supply the term string to the table
|
|
if ( ! pbuf ) {
|
|
if ( ! table->addTerm(h,score,maxScore,hashUniqueOnly,
|
|
m_version ))
|
|
return false;
|
|
continue;
|
|
}
|
|
|
|
// . keep tabs on what we hash into the table if we need to
|
|
// . store the term into term table
|
|
int32_t slen;
|
|
char *s = table->storeTerm ( m_words[i],
|
|
m_wordLens[i] ,
|
|
prefix1 , prefixLen1 ,
|
|
prefix2 , prefixLen2 ,
|
|
true, &slen);
|
|
if(s == NULL) {
|
|
g_errno = ENOMEM;
|
|
return false;
|
|
}
|
|
if ( ! table->addTerm(h,score,maxScore,hashUniqueOnly,
|
|
m_version,s,slen))
|
|
return false;
|
|
|
|
// sanity check
|
|
//if ( h == 262515731587173LL ) {
|
|
// int32_t nn = table->getScoreFromTermId ( h );
|
|
// logf(LOG_DEBUG,"build: score now %" INT32 "",nn);
|
|
//}
|
|
}
|
|
// return now if we don't have to print out spam info to parser buf
|
|
if ( ! pbuf ) return true;
|
|
if ( ! spam && ! weights ) return true;//scores ) return true;
|
|
// new line for parser buf
|
|
*pbuf += '\n';
|
|
// print page as normal
|
|
//char m_printTags = false;
|
|
// print out each word and it's spam value, if we have spammed words!
|
|
int32_t i;
|
|
for ( i = 0 ; i < m_numWords; i++ ) {
|
|
// get the score, default it to 100
|
|
int32_t score = 100;
|
|
// phrase weight
|
|
int32_t pscore = 100;
|
|
// NORM_WORD_SCORE is 128 last time i checked, this allows for
|
|
// us to do fast integer operations with the resolution of a
|
|
// float
|
|
//if ( scores )
|
|
// score = (100 * scores->getScore(i)) / NORM_WORD_SCORE;
|
|
if ( weights ) {
|
|
// DW is the default word weight
|
|
score = (100 *weights->m_ww[i] ) / DW;
|
|
pscore = (100 *weights->m_pw[i] ) / DW;
|
|
}
|
|
|
|
//if (m_wordIds[i] && (!scores || scores->getScore(i) > 0)){
|
|
// show tags unrendered
|
|
if ( ! pbuf->m_renderHtml && m_wordIds[i] ) {
|
|
if (spam->getSpam(i) ) {
|
|
pbuf->safePrintf("<span class=\"spam\">"
|
|
"<strike>");
|
|
}
|
|
else{
|
|
pbuf->safePrintf("<span class=\"token\">");
|
|
}
|
|
}
|
|
else if ( ! pbuf->m_renderHtml && m_tagIds && m_tagIds[i] ) {
|
|
if (m_tagIds[i] == TAG_COMMENT)
|
|
pbuf->safePrintf("<span class=\"gbcomment\">");
|
|
else
|
|
pbuf->safePrintf("<span class=\"gbtag\">");
|
|
}
|
|
|
|
for ( int32_t j = 0 ; j < m_wordLens[i] ; j++ ) {
|
|
UChar32 c = (unsigned char)m_words[i][j];
|
|
// print the tag au natural if we should
|
|
if ( pbuf->m_renderHtml ) { // ! m_printTags ) {
|
|
c = fixWindows1252(c);
|
|
pbuf->utf32Encode(c);
|
|
continue;
|
|
}
|
|
if (c == '<'){
|
|
pbuf->safePrintf("<");
|
|
}
|
|
else if (c == '>'){
|
|
pbuf->safePrintf(">");
|
|
}
|
|
else if (c == '&'){
|
|
pbuf->safePrintf("&");
|
|
}
|
|
else{
|
|
c = fixWindows1252(c);
|
|
pbuf->utf32Encode(c);
|
|
}
|
|
}
|
|
|
|
|
|
if ((m_tagIds && m_tagIds[i]) || ! m_wordIds[i] ) {
|
|
if ( pscore != 0 ) {
|
|
//int32_t tt=((int32_t)scores->getScore(i)*100)/
|
|
//NORM_WORD_SCORE;
|
|
//int32_t tt = 0;
|
|
//if(scores) tt = scores->getScore(i);
|
|
//else tt = score;
|
|
//tt = score;
|
|
//if ( tt == 0 ) tt = 1;
|
|
//pbuf->safePrintf("<font size=-7 color=red>"
|
|
// "%" INT32 "</font>",
|
|
// pscore);
|
|
//if ( scores )
|
|
// pbuf->safePrintf(
|
|
// "<font size=-7 color=green>"
|
|
// "%" INT32 "</font>",
|
|
// scores->m_scores[i]);
|
|
pbuf->safePrintf("<font size=-7>#%" INT32 "</font>",i);
|
|
}
|
|
if ( ! pbuf->m_renderHtml ) // ! m_printTags )
|
|
pbuf->safePrintf("</span>\n");
|
|
}
|
|
//if (m_wordIds[i] && (!scores || scores->getScore(i) > 0) ){
|
|
if (m_wordIds[i] ) { // && score ) {
|
|
if ( m_wordIds[i] && spam->getSpam(i) ) {
|
|
pbuf->safePrintf("</strike>[%" INT32 "]",
|
|
(int32_t)spam->getSpam(i));
|
|
}
|
|
//if (m_wordIds[i] && (!scores || scores->getScore(i)
|
|
// > 0) ){
|
|
//if(scores && scores->getScore(i) != NORM_WORD_SCORE){
|
|
//if ( score != 0 || pscore != 0 ) {
|
|
//int32_t tt=((int32_t)scores->getScore(i)*100)/
|
|
//NORM_WORD_SCORE;
|
|
int32_t tt = 0;
|
|
//if(scores) tt = scores->getScore(i);
|
|
//else tt = score;
|
|
tt = score;
|
|
if ( tt == 0 ) tt = 1;
|
|
pbuf->safePrintf("<font size=-7 color=red>"
|
|
"%" INT32 "/%" INT32 "</font>",
|
|
score,pscore);
|
|
//if ( scores )
|
|
// pbuf->safePrintf("<font size=-7 color=green>"
|
|
// "%" INT32 "</font>",
|
|
// (int32_t)scores->m_scores[i]);
|
|
pbuf->safePrintf("<font size=-7>#%" INT32 "</font>",i);
|
|
//}
|
|
if ( ! pbuf->m_renderHtml ) // ! m_printTags )
|
|
pbuf->safePrintf("</span>\n");
|
|
}
|
|
}
|
|
// end with a <br>
|
|
pbuf->safePrintf ( "<br><br><br>" );
|
|
if ( i >= m_numWords ) return true;
|
|
// otherwise print a msg if breaked out
|
|
pbuf->safePrintf("<br><b>... out of memory</b><br>");
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
////////////////////////////////////////////////////////////
|
|
//
|
|
// the new faster words setter.
|
|
// old was taking 346 cycles per word
|
|
//
|
|
////////////////////////////////////////////////////////////
|
|
|
|
// "Fast" XML words setter: walks all parsed nodes, recording tags as
// single zero-id words and tokenizing text nodes via addWords().
// Estimates the word count as contentLen/2 rather than calling
// countWords().
bool Words::set2 ( Xml *xml,
		   bool computeWordIds ,
		   int32_t niceness) {
	reset();
	m_xml = xml;
	m_version = xml->getVersion();
	// NOTE(review): duplicate assignment — harmless but redundant
	m_version = xml->getVersion();
	// NOTE(review): this alnum-counting scan computes "x", but x is
	// overwritten by the contentLen/2 estimate below, so the whole
	// loop is effectively dead work (left over from benchmarking?)
	register char *p = (char *)xml->getContent();
	if ( *p ) p++;
	register int32_t x = 0;
 ploop:
	if ( is_alnum_utf8(p) ) x++;
	p++;
	if ( *p ) goto ploop;

	m_preCount = x;
	// actual estimate used: one word per two content bytes
	m_preCount = xml->getContentLen() / 2;

	if (!allocateWordBuffers(m_preCount, true)) return false;

	int32_t numNodes = xml->getNumNodes();
	// walk the nodes, stopping early if the estimate runs out
	for ( int32_t k = 0 ; k < numNodes && m_numWords < m_preCount ; k++ ) {
		// get the kth node
		char *node = xml->getNode (k);
		int32_t nodeLen = xml->getNodeLen(k);
		// is the kth node a tag?
		if ( xml->isTag(k) ) {
			m_words [m_numWords] = node;
			m_wordLens [m_numWords] = nodeLen;
			m_tagIds [m_numWords] = xml->getNodeId(k);
			m_wordIds [m_numWords] = 0LL;
			m_nodes [m_numWords] = k;
			// we have less than 127 HTML tags, so set
			// the high bit for back tags
			if ( xml->isBackTag(k)) {
				m_tagIds[m_numWords] |= BACKBIT;
			}
			m_numWords++;
			// used by XmlDoc.cpp
			m_numTags++;
			continue;
		}
		// otherwise it's a text node: temporarily NUL-terminate
		// and tokenize it in place
		char c = node[nodeLen];
		node[nodeLen] = '\0';
		addWords(node, nodeLen,computeWordIds, niceness);
		node[nodeLen] = c;
	}
	return true;
}
|
|
|
|
int32_t Words::isFloat ( int32_t n, float& f) {
|
|
char buf[128];
|
|
char *p = buf;
|
|
int32_t offset = 0;
|
|
while(isPunct(n+offset) &&
|
|
!(m_words[n+offset][0] == '.' ||
|
|
m_words[n+offset][0] == '-')) offset++;
|
|
|
|
while(isPunct(n+offset) &&
|
|
!(m_words[n+offset][0] == '.' ||
|
|
m_words[n+offset][0] == '-')) offset++;
|
|
|
|
|
|
gbmemcpy(buf, getWord(n), getWordLen(n));
|
|
buf[getWordLen(n)] = '\0';
|
|
log(LOG_WARN, "trying to get %s %" INT32 "", buf, offset);
|
|
|
|
|
|
if(isNum(n)) {
|
|
if(1 + n < m_numWords &&
|
|
isPunct(n+1) && m_words[n+1][0] == '.') {
|
|
if(2 + n < m_numWords && isNum(n+2)) {
|
|
gbmemcpy(p, m_words[n], m_wordLens[n]);
|
|
p += m_wordLens[n];
|
|
gbmemcpy(p, ".", 1);
|
|
p++;
|
|
gbmemcpy(p, m_words[n+2], m_wordLens[n+2]);
|
|
f = atof(buf);
|
|
return 3 + offset;
|
|
}
|
|
else {
|
|
return offset;
|
|
}
|
|
} else if(n > 0 && isPunct(n-1) && m_wordLens[n-1] > 0 &&
|
|
(m_words[n-1][m_wordLens[n-1]-1] == '.' ||
|
|
m_words[n-1][m_wordLens[n-1]-1] == '-')) {
|
|
//hmm, we just skipped the period as punct?
|
|
sprintf(buf, "0.%s",m_words[n]);
|
|
f = atof(buf);
|
|
return 1 + offset;
|
|
}
|
|
else {
|
|
f = atof(m_words[n]);
|
|
return 1 + offset;
|
|
}
|
|
}
|
|
|
|
//does this have a period in front?
|
|
if(isPunct(n) && (m_words[n][0] == '.' || m_words[n][0] == '-')) {
|
|
if(1 + n < m_numWords && isNum(n+1)) {
|
|
gbmemcpy(p, m_words[n], m_wordLens[n]);
|
|
p += m_wordLens[n];
|
|
gbmemcpy(p, m_words[n+1], m_wordLens[n+1]);
|
|
f = atof(buf);
|
|
return 2 + offset;
|
|
}
|
|
}
|
|
return offset;
|
|
}
|
|
|
|
// Return the index of the strictly-largest element of "array", or 0 on
// bad input, when the max is 0, or when the max is tied. Optionally
// returns the max value through "wantmax" (truncated to int, as in the
// original interface).
static uint8_t s_findMaxIndex(int64_t *array, int num, int *wantmax = NULL) {
	if(!array || num < 2 || num > 255) return(0);
	int64_t max, oldmax;
	int idx = 0;
	// fix: was INT_MIN, which made int64_t values below INT_MIN
	// unselectable (the comparison below could never pass for them)
	max = oldmax = LLONG_MIN;
	for(int x = 0; x < num; x++) {
		// ">=" so a duplicate of the current max leaves
		// oldmax == max, which is the tie signal checked below
		if(array[x] >= max) {
			oldmax = max;
			max = array[x];
			idx = x;
		}
	}
	// a max of 0 means "no signal"
	if(max == 0) return(0);
	// tie for the max -> no clear winner
	if(max == oldmax) return(0);
	if(wantmax) *wantmax = (int)max;
	return((uint8_t)idx);
}
|
|
|
|
//static bool s_isWordCap ( char *word , int len ) {
|
|
// if ( ! is_upper_utf8 ( word ) ) return false;
|
|
// int32_t cs = getUtf8CharSize ( word );
|
|
// if ( is_lower_utf8 ( &word[cs] ) ) return true;
|
|
// return false;
|
|
//}
|
|
|
|
unsigned char Words::isBounded(int wordi) {
|
|
if(wordi+1 < m_numWords &&
|
|
getWord(wordi)[getWordLen(wordi)] == '/' //||
|
|
//getWord(wordi)[getWordLen(wordi)] == '?'
|
|
)
|
|
return(true);
|
|
if(wordi+1 < m_numWords &&
|
|
(getWord(wordi)[getWordLen(wordi)] == '.' ||
|
|
getWord(wordi)[getWordLen(wordi)] == '?') &&
|
|
is_alnum_a(getWord(wordi)[getWordLen(wordi)+1]) )
|
|
return(true);
|
|
if(wordi > 0 &&
|
|
(getWord(wordi)[-1] == '/' ||
|
|
getWord(wordi)[-1] == '?'))
|
|
return(true);
|
|
|
|
return(false);
|
|
}
|
|
|
|
unsigned char getCharacterLanguage ( char *utf8Char ) {
|
|
// romantic?
|
|
char cs = getUtf8CharSize ( utf8Char );
|
|
// can't say what language it is
|
|
if ( cs == 1 ) return langUnknown;
|
|
// convert to 32 bit unicode
|
|
UChar32 c = utf8Decode ( utf8Char );
|
|
UCScript us = ucGetScript ( c );
|
|
// arabic? this also returns for persian!! fix?
|
|
if ( us == ucScriptArabic )
|
|
return langArabic;
|
|
if ( us == ucScriptCyrillic )
|
|
return langRussian;
|
|
if ( us == ucScriptHebrew )
|
|
return langHebrew;
|
|
if ( us == ucScriptGreek )
|
|
return langGreek;
|
|
|
|
return langUnknown;
|
|
}
|
|
|
|
// . guess the document's language by sampling its words
// . votes are tallied into langCount[]: first per-character script
//   detection (getCharacterLanguage: arabic/cyrillic/hebrew/greek),
//   then dictionary lookups via g_speller.getPhraseLanguages()
// . each distinct word id is only dictionary-counted once (deduped
//   through the local hash table "ht")
// . words in script/style/select sections and words that look like url
//   fragments (isBounded()) are skipped
// . "maxSamples" only sizes the dedup hash table here; the hard sample
//   cap is the local maxCount (1000 dictionary-counted words)
// . "langScore", if non-NULL, receives the winning language's vote count
// returns -1 and sets g_errno on error, because 0 means langUnknown
int32_t Words::getLanguage( Sections *sections ,
			    int32_t maxSamples,
			    int32_t niceness,
			    int32_t *langScore) {
	// calculate scores if not given
	//Scores calcdScores;
	//if ( ! scores ) {
	//	if ( ! calcdScores.set( this,m_version,false ) )
	//		return -1;
	//	scores = &calcdScores;
	//}

	// . take a random sample of words and look them up in the
	// language dictionary
	//HashTableT<int64_t, char> ht;
	HashTableX ht;
	// one vote counter per language id; indexed by langXXX constants
	int64_t langCount[MAX_LANGUAGES];
	// scratch buffer filled in by g_speller.getPhraseLanguages()
	int64_t langWorkArea[MAX_LANGUAGES];
	int32_t numWords = m_numWords;
	//int32_t skip = numWords/maxSamples;
	//if ( skip == 0 ) skip = 1;
	// reset the language count
	memset(langCount, 0, sizeof(int64_t)*MAX_LANGUAGES);
	// sample the words
	//int32_t wordBase = 0;
	int32_t wordi = 0;
	//if ( ! ht.set(maxSamples*1.5) ) return -1;
	// dedup table: 8-byte keys (word ids), 1-byte values
	if ( ! ht.set(8,1,(int32_t)(maxSamples*8.0),NULL,0,false,
		      niceness,"wordslang"))
		return -1;

	// . avoid words in these bad sections
	// . google seems to index SEC_MARQUEE so i took that out of badFlags
	int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT;

	// int16_tcuts
	int64_t *wids  = m_wordIds;
	int32_t *wlens = m_wordLens;
	char   **wptrs = m_words;

	//int32_t langTotal = 0;
	// log ( LOG_WARN, "xmldoc: Picking language from %" INT32 " words with %" INT32 " skip",
	//       numWords, skip );
	// dummy 1-byte value stored with every key we add to "ht"
	char numOne = 1;
	Section **sp = NULL;
	if ( sections ) sp = sections->m_sectionPtrs;
	// this means null too
	if ( sections && sections->m_numSections == 0 ) sp = NULL;

	// hard cap on dictionary-counted samples (unknown words don't count,
	// see the maxCount++ below)
	int32_t maxCount = 1000;

	while ( wordi < numWords ) {
		// breathe
		QUICKPOLL( niceness );
		// move to the next valid word (wids[] is 0 for punct/tags)
		if ( ! wids [wordi] ) { wordi++; continue; }
		// single-char words carry almost no language signal
		if ( wlens[wordi] < 2 ) { wordi++; continue; }
		// skip if in a bad section
		//int32_t flags = sections->m_sectionPtrs[i]->m_flags;
		// meaning script section ,etc
		if ( sp && ( sp[wordi]->m_flags & badFlags ) ) {
			wordi++; continue; }
		// check the language
		//unsigned char lang = 0;

		// Skip if word is capitalized and not preceded by a tag
		//if(s_isWordCap(getWord(wordi), getWordLen(wordi)) &&
		//   wordi > 0 && !getTagId(wordi - 1)) {
		//	wordi++;
		//	continue;
		//}

		// Skip word if bounded by '/' or '?' might be in a URL
		if(isBounded(wordi)) {
			wordi++;
			continue;
		}

		// is it arabic? sometimes they are spammy pages and repeat
		// a few arabic words over and over again, so don't do deduping
		// with "ht" before checking this.
		// NOTE(review): assumes the langXXX id returned here is
		// always < MAX_LANGUAGES -- confirm against Lang.h
		char cl = getCharacterLanguage ( wptrs[wordi] );
		if ( cl ) {
			langCount[(unsigned char)cl]++;
			wordi++;
			continue;
		}

		// already sampled this exact word id? skip it (dedup)
		//if(ht.getSlot(m_wordIds[wordi]) !=-1) {
		if(!ht.isEmpty(&m_wordIds[wordi]) ) {
			wordi++;
			continue;
		}

		// If we can't add the word, it's not that bad.
		// Just gripe about it in the log.
		// (despite the comment, we do bail out with ENOMEM here)
		if(!ht.addKey(&m_wordIds[wordi], &numOne)) {
			log(LOG_WARN, "build: Could not add word to temporary "
			    "table, memory error?\n");
			g_errno = ENOMEM;
			return -1;
		}

		// stop once we have dictionary-counted enough samples
		if ( maxCount-- <= 0 ) break;

		// No lang from charset, got a phrase, and 0 language does not have
		// a score Order is very important!
		int foundone = 0;
		if ( // lang == 0 &&
		     // we seem to be missing hungarian and thai
		     g_speller.getPhraseLanguages(getWord(wordi),
						  getWordLen(wordi),
						  langWorkArea) &&
		     // why must it have an "unknown score" of 0?
		     // allow -1... i don't know what that means!!
		     langWorkArea[0] <= 0) {

			// remember the last language that scored, in case
			// it turns out to be the only one (boosted below)
			int lasty = -1;
			for(int y = 1; y < MAX_LANGUAGES; y++) {
				if(langWorkArea[y] == 0) continue;
				langCount[y]++;
				int32_t pop = langWorkArea[y];
				// negative means in an official dictionary
				if ( pop < 0 ) {
					pop *= -1;
					langCount[y] += 1;
				}
				// extra votes for popular words
				if ( pop > 1000 )
					langCount[y] += 2;
				if ( pop > 10000 )
					langCount[y] += 2;
				lasty = y;
				foundone++;
			}
			// . if it can only belong to one language
			// . helps fix that fact that our unifiedDict is crummy
			// and identifes some words as being in a lot of languages
			// like "Pronto" as being in english and not giving
			// the popularities correctly.
			if ( foundone == 1 )
				// give massive boost
				langCount[lasty] += 10;
		}
		// . try to skip unknown words without killing sample size
		// . we lack russian, hungarian and arabic in the unified
		// dict, so try to do character detection for those langs.
		// . should prevent them from being detected as unknown
		// langs and coming up for english search 'gigablast'
		if ( ! foundone ) {
			langCount[langUnknown]++;
			// do not count towards sample size
			maxCount++;
		}

		// skip to the next word
		//wordBase += skip;
		//if ( wordi < wordBase )
		//	wordi = wordBase;
		//else
		wordi++;
	}
	// punish unknown count in case a doc has a lot of proper names
	// or something
	//langCount[langUnknown] /= 2;
	// get the lang with the max score then
	int l = s_findMaxIndex(langCount, MAX_LANGUAGES);
	// if(langCount[l] < 15) return(langUnknown);
	// NOTE(review): langCount[] is int64_t but *langScore is int32_t --
	// this narrows; fine for realistic counts but worth confirming
	if(langScore) *langScore = langCount[l];
	// return if known now
	return l;
}
|
|
|
|
// get the word index at the given character position
|
|
int32_t Words::getWordAt ( char *p ) { // int32_t charPos ) {
|
|
if ( ! p ) { char *xx=NULL;*xx=0; }
|
|
if ( p < m_words[0] ) { char *xx=NULL;*xx=0; }
|
|
if ( p >= getContentEnd() ) { char *xx=NULL;*xx=0; }
|
|
|
|
int32_t step = m_numWords / 2;
|
|
int32_t i = m_numWords / 2 ;
|
|
|
|
loop:
|
|
|
|
// divide it by 2 each time
|
|
step >>= 1;
|
|
// always at least one
|
|
if ( step <= 0 ) step = 1;
|
|
// is it a hit?
|
|
if ( p >= m_words[i] && p < m_words[i] + m_wordLens[i] )
|
|
return i;
|
|
// compare
|
|
if ( m_words[i] < p ) i += step;
|
|
else i -= step;
|
|
goto loop;
|
|
return -1;
|
|
}
|
|
|
|
|
|
// . return the value of the specified "field" within this html tag, "s"
|
|
// . the case of "field" does not matter
|
|
char *getFieldValue ( char *s ,
|
|
int32_t slen ,
|
|
char *field ,
|
|
int32_t *valueLen ) {
|
|
// reset this to 0
|
|
*valueLen = 0;
|
|
// scan for the field name in our node
|
|
int32_t flen = gbstrlen(field);
|
|
char inQuotes = '\0';
|
|
int32_t i;
|
|
|
|
// make it sane
|
|
if ( slen > 2000 ) slen = 2000;
|
|
|
|
for ( i = 1; i + flen < slen ; i++ ) {
|
|
// skip the field if it's quoted
|
|
if ( inQuotes) {
|
|
if (s[i] == inQuotes ) inQuotes = 0;
|
|
continue;
|
|
}
|
|
// set inQuotes to the quote if we're in quotes
|
|
if ( (s[i]=='\"' || s[i]=='\'')){
|
|
inQuotes = s[i];
|
|
continue;
|
|
}
|
|
// if not in quote tag might end
|
|
if ( s[i] == '>' && ! inQuotes ) return NULL;
|
|
// a field name must be preceeded by non-alnum
|
|
if ( is_alnum_a ( s[i-1] ) ) continue;
|
|
// the first character of this field shout match field[0]
|
|
if ( to_lower_a (s[i]) != to_lower_a(field[0] )) continue;
|
|
// field just be immediately followed by an = or space
|
|
if (s[i+flen]!='='&&!is_wspace_a(s[i+flen]))continue;
|
|
// field names must match
|
|
if ( strncasecmp ( &s[i], field, flen ) != 0 ) continue;
|
|
// break cuz we got a match for our field name
|
|
break;
|
|
}
|
|
|
|
|
|
// return NULL if no matching field
|
|
if ( i + flen >= slen ) return NULL;
|
|
|
|
// advance i over the fieldname so it pts to = or space
|
|
i += flen;
|
|
|
|
// advance i over spaces
|
|
while ( i < slen && is_wspace_a ( s[i] ) ) i++;
|
|
|
|
// advance over the equal sign, return NULL if does not exist
|
|
if ( i < slen && s[i++] != '=' ) return NULL;
|
|
|
|
// advance i over spaces after the equal sign
|
|
while ( i < slen && is_wspace_a ( s[i] ) ) i++;
|
|
|
|
// now parse out the value of this field (could be in quotes)
|
|
inQuotes = '\0';
|
|
|
|
// set inQuotes to the quote if we're in quotes
|
|
if ( s[i]=='\"' || s[i]=='\'') inQuotes = s[i++];
|
|
|
|
// mark this as the start of the value
|
|
int start=i;
|
|
|
|
// advance i until we hit a space, or we hit a that quote if inQuotes
|
|
if (inQuotes) while (i<slen && s[i] != inQuotes ) i++;
|
|
else while ( i<slen &&!is_wspace_a(s[i])&&s[i]!='>')i++;
|
|
|
|
// set the length of the value
|
|
*valueLen = i - start;
|
|
|
|
// return a ptr to the value
|
|
return s + start;
|
|
}
|