// Forked from Mirrors/privacore-open-source-search-engine
// (source listing metadata: 580 lines, 13 KiB, C++)
//#include "gb-include.h"
|
|
|
|
#include "Words.h"
|
|
#include "Xml.h"
|
|
#include "Unicode.h" // getUtf8CharSize()
|
|
#include "StopWords.h"
|
|
#include "Speller.h"
|
|
#include "HashTableX.h"
|
|
#include "Sections.h"
|
|
#include "XmlNode.h" // getTagLen()
|
|
#include "Mem.h"
|
|
#include "Sanity.h"
|
|
|
|
|
|
// Builds an empty Words object: zeroes the built-in buffer, clears the
// buffer pointer/size and then lets reset() initialize everything else.
Words::Words ( ) {
	// m_buf must be cleared before reset() runs because reset() looks at
	// m_buf to decide whether there is anything to free
	memset( m_localBuf, 0, sizeof(m_localBuf) );
	m_bufSize = 0;
	m_buf     = NULL;

	reset();
}
|
|
// Releases whatever reset() releases (the word buffer, if it was allocated
// with mmalloc()).
Words::~Words ( ) {
	reset();
}
|
|
void Words::reset ( ) {
|
|
m_numWords = 0;
|
|
m_numAlnumWords = 0;
|
|
m_xml = NULL;
|
|
m_preCount = 0;
|
|
if ( m_buf && m_buf != m_localBuf && m_buf != m_localBuf2 )
|
|
mfree ( m_buf , m_bufSize , "Words" );
|
|
m_buf = NULL;
|
|
m_bufSize = 0;
|
|
m_nodes = NULL;
|
|
m_tagIds = NULL;
|
|
m_numTags = 0;
|
|
m_hasTags = false;
|
|
m_localBuf2 = NULL;
|
|
m_localBufSize2 = 0;
|
|
|
|
// Coverity
|
|
m_words = NULL;
|
|
m_wordLens = 0;
|
|
m_wordIds = NULL;
|
|
}
|
|
|
|
bool Words::set( char *s, int32_t slen, bool computeWordIds ) {
|
|
// bail if nothing
|
|
if ( ! s || slen == 0 ) {
|
|
m_numWords = 0;
|
|
m_numAlnumWords = 0;
|
|
return true;
|
|
}
|
|
|
|
char c = s[slen];
|
|
if ( c != '\0' ) {
|
|
s[slen] = '\0';
|
|
}
|
|
|
|
bool status = set( s, computeWordIds );
|
|
if ( c != '\0' ) {
|
|
s[slen] = c;
|
|
}
|
|
|
|
return status;
|
|
}
|
|
|
|
// a quickie
|
|
// this url gives a m_preCount that is too low. why?
|
|
// http://go.tfol.com/163/speed.asp
|
|
static int32_t countWords ( const char *p , int32_t plen ) {
|
|
const char *pend = p + plen;
|
|
int32_t count = 1;
|
|
|
|
while ( p < pend ) {
|
|
|
|
// sequence of punct
|
|
for ( ; p < pend && ! is_alnum_utf8 (p) ; p += getUtf8CharSize(p) ) {
|
|
// in case being set from xml tags, count as words now
|
|
if ( *p == '<' ) {
|
|
count++;
|
|
}
|
|
}
|
|
count++;
|
|
|
|
// sequence of alnum
|
|
for ( ; p < pend && is_alnum_utf8 (p) ; p += getUtf8CharSize(p) )
|
|
;
|
|
|
|
count++;
|
|
|
|
};
|
|
// some extra for good meaure
|
|
return count+10;
|
|
}
|
|
|
|
static int32_t countWords ( const char *p ) {
|
|
int32_t count = 1;
|
|
|
|
while ( *p ) {
|
|
// sequence of punct
|
|
for ( ; *p && ! is_alnum_utf8 (p) ; p += getUtf8CharSize(p) ) {
|
|
// in case being set from xml tags, count as words now
|
|
if ( *p=='<') count++;
|
|
}
|
|
count++;
|
|
|
|
// sequence of alnum
|
|
for ( ; *p && is_alnum_utf8 (p) ; p += getUtf8CharSize(p) )
|
|
;
|
|
|
|
count++;
|
|
|
|
}
|
|
// some extra for good meaure
|
|
return count+10;
|
|
}
|
|
|
|
// . set words from the nodes of "xml" in the half-open range [node1, node2)
// . a negative node2 means "through the last node"
// . tag nodes become one word each (with a tag id); text nodes are broken
//   into words by addWords()
// . an empty document or an empty node range yields zero words
// . returns false only if the word buffers could not be allocated
bool Words::set( Xml *xml, bool computeWordIds, int32_t node1, int32_t node2 ) {
	// prevent setting with the same string
	if ( m_xml == xml ) gbshutdownLogicError();

	reset();

	m_xml = xml;

	// if xml is empty, bail
	if ( !xml->getContent() ) {
		return true;
	}

	const int32_t numNodes = xml->getNumNodes();
	if ( numNodes <= 0 ) {
		return true;
	}

	// . can be given a range, if node2 is negative that means all!
	// . range is half-open: [node1, node2)
	if ( node2 < 0 ) {
		node2 = numNodes;
	}

	// sanity check: the range must be ordered and lie within the node array,
	// otherwise the getNode()/getNodeLen() calls below would index outside it
	if ( node1 < 0 || node2 > numNodes || node1 > node2 ) gbshutdownLogicError();

	// an empty range has no words. without this, "node2 - 1" below would
	// refer to a node in front of the range (index -1 when node1 is 0).
	if ( node1 == node2 ) {
		return true;
	}

	// the span of document text covered by the requested nodes
	char *start = xml->getNode( node1 );
	char *end   = xml->getNode( node2 - 1 ) + xml->getNodeLen( node2 - 1 );
	int32_t size = end - start;

	m_preCount = countWords( start , size );

	// allocate based on the approximate count, including the tag id array
	if ( !allocateWordBuffers( m_preCount, true ) ) {
		return false;
	}

	// stop early if the estimated capacity is used up
	for ( int32_t k = node1; k < node2 && m_numWords < m_preCount; ++k ) {
		// get the kth node
		char *node = xml->getNode( k );
		int32_t nodeLen = xml->getNodeLen( k );

		// text node: tokenize it
		if ( !xml->isTag( k ) ) {
			/// @todo ALC why are we adding NULL and restoring it after?
			/// addWords should be changed to use nodeLen and not a null terminated string
			char c = node[nodeLen];
			node[nodeLen] = '\0';
			addWords( node, nodeLen, computeWordIds );
			node[nodeLen] = c;
			continue;
		}

		// it is a tag: the whole tag is a single word with no word id
		m_words   [m_numWords] = node;
		m_wordLens[m_numWords] = nodeLen;
		m_tagIds  [m_numWords] = xml->getNodeId( k );
		m_wordIds [m_numWords] = 0LL;
		m_nodes   [m_numWords] = k;

		// we have less than 127 HTML tags, so set
		// the high bit for back tags
		if ( xml->isBackTag( k ) ) {
			m_tagIds[m_numWords] |= BACKBIT;
		}

		m_numWords++;

		// used by XmlDoc.cpp
		m_numTags++;
	}

	return true;
}
|
|
|
|
// . set words from a string
|
|
// . assume no HTML entities in the string "s"
|
|
// . s must be NULL terminated
|
|
// . NOTE: do not free "s" from under us cuz we reference it
|
|
// . break up the string ,"s", into "words".
|
|
// . doesn't do tags, only text nodes in "xml"
|
|
// . our definition of a word is as close to English as we can get it
|
|
// . BUT we also consider a string of punctuation characters to be a word
|
|
bool Words::set( char *s, bool computeWordIds ) {
|
|
reset();
|
|
|
|
// determine rough upper bound on number of words by counting
|
|
// punct/alnum boundaries
|
|
m_preCount = countWords ( s );
|
|
if ( !allocateWordBuffers( m_preCount ) ) {
|
|
return false;
|
|
}
|
|
|
|
return addWords( s, 0x7fffffff, computeWordIds );
|
|
}
|
|
|
|
bool Words::addWords( char *s, int32_t nodeLen, bool computeWordIds ) {
|
|
int32_t i = 0;
|
|
int32_t j;
|
|
int32_t wlen;
|
|
|
|
bool hadApostrophe = false;
|
|
|
|
UCScript oldScript = ucScriptCommon;
|
|
UCScript saved;
|
|
UCProps props;
|
|
|
|
uptop:
|
|
|
|
// bad utf8 can cause a breach
|
|
if ( i >= nodeLen ) {
|
|
goto done;
|
|
}
|
|
|
|
if ( ! s[i] ) {
|
|
goto done;
|
|
}
|
|
|
|
if ( !is_alnum_utf8( s + i ) ) {
|
|
if ( m_numWords >= m_preCount ) {
|
|
goto done;
|
|
}
|
|
|
|
// tag?
|
|
if ( s[i]=='<' && m_hasTags && isTagStart(s+i) ) {
|
|
// get the tag id
|
|
if( m_tagIds ) {
|
|
if ( s[i + 1] == '/' ) {
|
|
// skip over /
|
|
m_tagIds[m_numWords] = ::getTagId( s + i + 2 );
|
|
m_tagIds[m_numWords] |= BACKBIT;
|
|
} else {
|
|
m_tagIds[m_numWords] = ::getTagId( s + i + 1 );
|
|
}
|
|
}
|
|
|
|
m_words[m_numWords] = s + i;
|
|
m_wordIds[m_numWords] = 0LL;
|
|
|
|
// skip till end
|
|
int32_t tagLen = getTagLen( s + i );
|
|
m_wordLens[m_numWords] = tagLen;
|
|
m_nodes[m_numWords] = 0;
|
|
m_numWords++;
|
|
|
|
// advance
|
|
i += tagLen;
|
|
goto uptop;
|
|
}
|
|
|
|
// it is a punct word, find end of it
|
|
char *start = s+i;
|
|
for ( ; s[i] ; i += getUtf8CharSize(s+i)) {
|
|
// stop on < if we got tags
|
|
if ( s[i] == '<' && m_hasTags ) {
|
|
break;
|
|
}
|
|
|
|
// if we are simple ascii, skip quickly
|
|
if ( is_ascii(s[i]) ) {
|
|
// accumulate NON-alnum chars
|
|
if ( ! is_alnum_a(s[i]) ) {
|
|
continue;
|
|
}
|
|
|
|
// update
|
|
oldScript = ucScriptCommon;
|
|
|
|
// otherwise, stop we got alnum
|
|
break;
|
|
}
|
|
|
|
// if we are utf8 we stop on special props
|
|
UChar32 c = utf8Decode ( s+i );
|
|
|
|
// stop if word char
|
|
if ( ! ucIsWordChar ( c ) ) {
|
|
continue;
|
|
}
|
|
|
|
// update first though
|
|
oldScript = ucGetScript ( c );
|
|
|
|
// then stop
|
|
break;
|
|
}
|
|
m_words [ m_numWords ] = start;
|
|
m_wordLens [ m_numWords ] = s+i - start;
|
|
m_wordIds [ m_numWords ] = 0LL;
|
|
m_nodes [ m_numWords ] = 0;
|
|
|
|
if (m_tagIds) {
|
|
m_tagIds[m_numWords] = 0;
|
|
}
|
|
|
|
m_numWords++;
|
|
goto uptop;
|
|
}
|
|
|
|
// get an alnum word
|
|
j = i;
|
|
again:
|
|
for ( ; s[i] ; i += getUtf8CharSize(s+i) ) {
|
|
// simple ascii?
|
|
if ( is_ascii(s[i]) ) {
|
|
// accumulate alnum chars
|
|
if ( is_alnum_a(s[i]) ) continue;
|
|
// update
|
|
oldScript = ucScriptCommon;
|
|
// otherwise, stop we got punct
|
|
break;
|
|
}
|
|
// get the code point of the utf8 char
|
|
UChar32 c = utf8Decode ( s+i );
|
|
// get props
|
|
props = ucProperties ( c );
|
|
// good stuff?
|
|
if ( props & (UC_IGNORABLE|UC_EXTEND) ) continue;
|
|
// stop? if UC_WORCHAR is set, that means its an alnum
|
|
if ( ! ( props & UC_WORDCHAR ) ) {
|
|
// reset script between words
|
|
oldScript = ucScriptCommon;
|
|
break;
|
|
}
|
|
// save it
|
|
saved = oldScript;
|
|
// update here
|
|
oldScript = ucGetScript(c);
|
|
// treat ucScriptLatin (30) as common so we can have latin1
|
|
// like char without breaking the word!
|
|
if ( oldScript == ucScriptLatin ) oldScript = ucScriptCommon;
|
|
// stop on this crap too i guess. like japanes chars?
|
|
if ( props & ( UC_IDEOGRAPH | UC_HIRAGANA | UC_THAI ) ) {
|
|
// include it
|
|
i += getUtf8CharSize(s+i);
|
|
// but stop
|
|
break;
|
|
}
|
|
// script change?
|
|
if ( saved != oldScript ) break;
|
|
}
|
|
|
|
// . java++, A++, C++ exception
|
|
// . A+, C+, exception
|
|
// . TODO: consider putting in Bits.cpp w/ D_CAN_BE_IN_PHRASE
|
|
if ( s[i]=='+' ) {
|
|
if ( s[i+1]=='+' && !is_alnum_utf8(&s[i+2]) ) i += 2;
|
|
else if ( !is_alnum_utf8(&s[i+1]) ) i++;
|
|
}
|
|
// . c#, j#, ...
|
|
if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++;
|
|
|
|
// comma is ok if like ,ddd!d
|
|
if ( s[i]==',' &&
|
|
i-j <= 3 &&
|
|
is_digit(s[i-1]) ) {
|
|
// if word so far is 2 or 3 chars, make sure digits
|
|
if ( i-j >= 2 && ! is_digit(s[i-2]) ) goto nogo;
|
|
if ( i-j >= 3 && ! is_digit(s[i-3]) ) goto nogo;
|
|
// scan forward
|
|
while ( s[i] == ',' &&
|
|
is_digit(s[i+1]) &&
|
|
is_digit(s[i+2]) &&
|
|
is_digit(s[i+3]) &&
|
|
! is_digit(s[i+4]) ) {
|
|
i += 4;
|
|
}
|
|
}
|
|
|
|
// decimal point?
|
|
if ( s[i] == '.' &&
|
|
is_digit(s[i-1]) &&
|
|
is_digit(s[i+1]) ) {
|
|
// allow the decimal point
|
|
i++;
|
|
// skip over string of digits
|
|
while ( is_digit(s[i]) ) i++;
|
|
}
|
|
|
|
nogo:
|
|
|
|
// allow for words like we're dave's and i'm
|
|
if ( s[i] == '\'' && s[i + 1] && is_alnum_utf8( &s[i + 1] ) && !hadApostrophe ) {
|
|
i++;
|
|
hadApostrophe = true;
|
|
goto again;
|
|
}
|
|
hadApostrophe = false;
|
|
|
|
// get word length
|
|
wlen = i - j;
|
|
if ( m_numWords >= m_preCount ) goto done;
|
|
m_words [ m_numWords ] = &s[j];
|
|
m_wordLens[ m_numWords ] = wlen;
|
|
|
|
if ( computeWordIds ) {
|
|
int64_t h = hash64Lower_utf8(&s[j],wlen);
|
|
m_wordIds [m_numWords] = h;
|
|
}
|
|
|
|
m_nodes[m_numWords] = 0;
|
|
if (m_tagIds) m_tagIds[m_numWords] = 0;
|
|
m_numWords++;
|
|
m_numAlnumWords++;
|
|
// get a punct word
|
|
goto uptop;
|
|
|
|
done:
|
|
// bad programming warning
|
|
if ( m_numWords > m_preCount ) {
|
|
log(LOG_LOGIC, "build: words: set: Fix counting routine.");
|
|
gbshutdownLogicError();
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// common to Unicode and ISO-8859-1
|
|
bool Words::allocateWordBuffers(int32_t count, bool tagIds) {
|
|
// alloc if we need to (added 4 more for m_nodes[])
|
|
int32_t wordSize = 0;
|
|
wordSize += sizeof(char *);
|
|
wordSize += sizeof(int32_t);
|
|
wordSize += sizeof(int64_t);
|
|
wordSize += sizeof(int32_t);
|
|
if ( tagIds ) wordSize += sizeof(nodeid_t);
|
|
m_bufSize = wordSize * count;
|
|
if(m_bufSize < 0) {
|
|
log(LOG_WARN, "build: word count overflow %" PRId32" bytes wordSize=%" PRId32" count=%" PRId32".",
|
|
m_bufSize, wordSize, count);
|
|
return false;
|
|
}
|
|
if ( m_bufSize <= m_localBufSize2 && m_localBuf2 ) {
|
|
m_buf = m_localBuf2;
|
|
}
|
|
else if ( m_bufSize <= WORDS_LOCALBUFSIZE ) {
|
|
m_buf = m_localBuf;
|
|
}
|
|
else {
|
|
m_buf = (char *)mmalloc ( m_bufSize , "Words" );
|
|
if ( ! m_buf ) {
|
|
log(LOG_WARN, "build: Could not allocate %" PRId32" bytes for parsing document.", m_bufSize);
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// set ptrs
|
|
char *p = m_buf;
|
|
m_words = (char **)p ;
|
|
p += sizeof(char*) * count;
|
|
m_wordLens = (int32_t *)p ;
|
|
p += sizeof(int32_t)* count;
|
|
m_wordIds = (int64_t *)p ;
|
|
p += sizeof (int64_t) * count;
|
|
m_nodes = (int32_t *)p;
|
|
p += sizeof(int32_t) * count;
|
|
|
|
if (tagIds) {
|
|
m_tagIds = (nodeid_t*) p;
|
|
p += sizeof(nodeid_t) * count;
|
|
}
|
|
|
|
if ( p > m_buf + m_bufSize ) gbshutdownLogicError();
|
|
|
|
return true;
|
|
}
|
|
|
|
unsigned char getCharacterLanguage ( const char *utf8Char ) {
|
|
// romantic?
|
|
char cs = getUtf8CharSize ( utf8Char );
|
|
// can't say what language it is
|
|
if ( cs == 1 ) return langUnknown;
|
|
// convert to 32 bit unicode
|
|
UChar32 c = utf8Decode ( utf8Char );
|
|
UCScript us = ucGetScript ( c );
|
|
// arabic? this also returns for persian!! fix?
|
|
if ( us == ucScriptArabic )
|
|
return langArabic;
|
|
if ( us == ucScriptCyrillic )
|
|
return langRussian;
|
|
if ( us == ucScriptHebrew )
|
|
return langHebrew;
|
|
if ( us == ucScriptGreek )
|
|
return langGreek;
|
|
|
|
return langUnknown;
|
|
}
|
|
|
|
// . return the value of the specified "field" within this html tag, "s"
|
|
// . the case of "field" does not matter
|
|
char *getFieldValue( char *s, int32_t slen, const char *field, int32_t *valueLen ) {
|
|
// reset this to 0
|
|
*valueLen = 0;
|
|
// scan for the field name in our node
|
|
int32_t flen = strlen(field);
|
|
char inQuotes = '\0';
|
|
int32_t i;
|
|
|
|
// make it sane
|
|
if ( slen > 2000 ) slen = 2000;
|
|
|
|
for ( i = 1; i + flen < slen ; i++ ) {
|
|
// skip the field if it's quoted
|
|
if ( inQuotes) {
|
|
if (s[i] == inQuotes ) inQuotes = 0;
|
|
continue;
|
|
}
|
|
// set inQuotes to the quote if we're in quotes
|
|
if ( (s[i]=='\"' || s[i]=='\'')){
|
|
inQuotes = s[i];
|
|
continue;
|
|
}
|
|
// if not in quote tag might end
|
|
if ( s[i] == '>' && ! inQuotes ) return NULL;
|
|
// a field name must be preceeded by non-alnum
|
|
if ( is_alnum_a ( s[i-1] ) ) continue;
|
|
// the first character of this field shout match field[0]
|
|
if ( to_lower_a (s[i]) != to_lower_a(field[0] )) continue;
|
|
// field just be immediately followed by an = or space
|
|
if (s[i+flen]!='='&&!is_wspace_a(s[i+flen]))continue;
|
|
// field names must match
|
|
if ( strncasecmp ( &s[i], field, flen ) != 0 ) continue;
|
|
// break cuz we got a match for our field name
|
|
break;
|
|
}
|
|
|
|
// return NULL if no matching field
|
|
if ( i + flen >= slen ) return NULL;
|
|
|
|
// advance i over the fieldname so it pts to = or space
|
|
i += flen;
|
|
|
|
// advance i over spaces
|
|
while ( i < slen && is_wspace_a ( s[i] ) ) i++;
|
|
|
|
// advance over the equal sign, return NULL if does not exist
|
|
if ( i < slen && s[i++] != '=' ) return NULL;
|
|
|
|
// advance i over spaces after the equal sign
|
|
while ( i < slen && is_wspace_a ( s[i] ) ) i++;
|
|
|
|
// now parse out the value of this field (could be in quotes)
|
|
inQuotes = '\0';
|
|
|
|
// set inQuotes to the quote if we're in quotes
|
|
if ( s[i]=='\"' || s[i]=='\'') inQuotes = s[i++];
|
|
|
|
// mark this as the start of the value
|
|
int start=i;
|
|
|
|
// advance i until we hit a space, or we hit a that quote if inQuotes
|
|
if (inQuotes) while (i<slen && s[i] != inQuotes ) i++;
|
|
else while ( i<slen &&!is_wspace_a(s[i])&&s[i]!='>')i++;
|
|
|
|
// set the length of the value
|
|
*valueLen = i - start;
|
|
|
|
// return a ptr to the value
|
|
return s + start;
|
|
}
|