// Forked from Mirrors/privacore-open-source-search-engine
// (C++ source, 299 lines, 7.4 KiB)
// Matt Wells, copyright Jul 2001

// . used to parse XML/HTML (romantic char set) into words
// . TODO: ensure WordType m_types[] array is only 1 byte per entry
// . ??? a word should end at any non-alnum ??? then using phrasing for "tim's"

#ifndef GB_WORDS_H
|
|
#define GB_WORDS_H
|
|
|
|
#include "max_words.h"
|
|
#include "StopWords.h"
|
|
#include "XmlNode.h"
|
|
#include <inttypes.h>
|
|
|
|
// . size, in bytes, of the m_localBuf array embedded in every Words object
// . Matches.h now keeps 300 Words objects around, so this is kept small;
//   the goal is to get rid of it eventually without slowing things down
#define WORDS_LOCALBUFSIZE 80
|
|
|
|
// . declared here, defined elsewhere
// . NOTE(review): presumably looks up "field" inside the first "slen" bytes
//   of "s" and reports the value's length through "valueLen" -- confirm
//   against the definition
char *getFieldValue( char *s, int32_t slen, const char *field, int32_t *valueLen );

// . declared here, defined elsewhere
// . NOTE(review): presumably maps one utf8 character to a language id --
//   confirm against the definition
unsigned char getCharacterLanguage( const char *utf8Char );

// forward declaration; Words only refers to Xml through pointers
class Xml;
|
|
|
|
|
|
// NOTE(review): not referenced anywhere in this header; presumably a
// sample count used by language detection code elsewhere -- confirm
#define NUM_LANGUAGE_SAMPLES 1000

// . this bit is set in a tag id to mark it as a back tag (</a> vs. <a>)
// . BACKBITCOMP is the complementary mask: "& BACKBITCOMP" strips the
//   back-tag flag and leaves the plain tag id
#define BACKBIT ((nodeid_t)0x8000)
#define BACKBITCOMP ((nodeid_t)0x7fff)
|
|
|
|
class Words {
|
|
|
|
public:
|
|
|
|
// . set words from a string
|
|
// . s must be NULL terminated
|
|
// . NOTE: we never own the data
|
|
// . there is typically no html in "s"
|
|
// . html tags are NOT parsed out
|
|
bool set( char *s, bool computeIds );
|
|
|
|
// . similar to above
|
|
// . but we temporarily stick a \0 @ s[slen] for parsing purposes
|
|
bool set( char *s, int32_t slen, bool computeIds );
|
|
|
|
// . new function to set directly from an Xml, rather than extracting
|
|
// text first
|
|
// . use range (node1,node2] and if node2 is -1 that means the last one
|
|
bool set( Xml *xml, bool computeIds, int32_t node1 = 0, int32_t node2 = -1 );
|
|
|
|
bool addWords( char *s, int32_t nodeLen, bool computeIds );
|
|
|
|
// get the spam modified score of the ith word (baseScore is the
|
|
// score if the word is not spammed)
|
|
int32_t getNumWords() const {
|
|
return m_numWords;
|
|
}
|
|
int32_t getNumAlnumWords() const {
|
|
return m_numAlnumWords;
|
|
}
|
|
char *getWord( int32_t n ) {
|
|
return m_words[n];
|
|
}
|
|
const char *getWord( int32_t n ) const {
|
|
return m_words[n];
|
|
}
|
|
int32_t getWordLen( int32_t n ) const {
|
|
return m_wordLens[n];
|
|
}
|
|
int32_t getNumTags() const {
|
|
return m_numTags;
|
|
}
|
|
|
|
// . size of string from word #a up to and NOT including word #b
|
|
// . "b" can be m_numWords to mean up to the end of the doc
|
|
int32_t getStringSize( int32_t a, int32_t b ) const {
|
|
// do not let it exceed this
|
|
if ( b >= m_numWords ) {
|
|
b = m_numWords;
|
|
}
|
|
|
|
// pedal it back. we might equal a then. which is ok, that
|
|
// means to just return the length of word #a then
|
|
b--;
|
|
|
|
if ( b < a ) {
|
|
return 0;
|
|
}
|
|
|
|
if ( a < 0 ) {
|
|
return 0;
|
|
}
|
|
|
|
int32_t size = m_words[b] - m_words[a];
|
|
|
|
// add in size of word #b
|
|
size += m_wordLens[b];
|
|
|
|
return size;
|
|
}
|
|
|
|
// . CAUTION: don't call this for punct "words"... it's bogus for them
|
|
// . this is only for alnum "words"
|
|
int64_t getWordId( int32_t n ) const {
|
|
return m_wordIds [n];
|
|
}
|
|
|
|
bool isStopWord ( int32_t n ) const {
|
|
return ::isStopWord( m_words[n], m_wordLens[n], m_wordIds[n] );
|
|
}
|
|
|
|
bool isQueryStopWord ( int32_t n , int32_t langId ) const {
|
|
return ::isQueryStopWord( m_words[n], m_wordLens[n], m_wordIds[n], langId );
|
|
}
|
|
|
|
// . how many quotes in the nth word?
|
|
// . how many plusses in the nth word?
|
|
// . used exclusively by Query class for parsing query syntax
|
|
int32_t getNumQuotes( int32_t n ) const {
|
|
int32_t count = 0;
|
|
for ( int32_t i = 0; i < m_wordLens[n]; i++ ) {
|
|
if ( m_words[n][i] == '\"' ) {
|
|
count++;
|
|
}
|
|
}
|
|
|
|
return count;
|
|
}
|
|
|
|
// . do we have a ' ' 't' '\n' or '\r' in this word?
|
|
// . caller should not call this is isPunct(n) is false, pointless.
|
|
|
|
bool hasSpace( int32_t n ) const {
|
|
for ( int32_t i = 0; i < m_wordLens[n]; i++ ) {
|
|
if ( is_wspace_utf8( &m_words[n][i] ) ) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool hasChar( int32_t n, char c ) const {
|
|
for ( int32_t i = 0; i < m_wordLens[n]; i++ ) {
|
|
if ( m_words[n][i] == c ) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool isSpaces( int32_t n, int32_t starti = 0 ) const {
|
|
for ( int32_t i = starti; i < m_wordLens[n]; i++ ) {
|
|
if ( !is_wspace_utf8( &m_words[n][i] ) ) {
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// if this is set from xml, every word is either a word or an xml node
|
|
nodeid_t getTagId( int32_t n ) const {
|
|
if ( !m_tagIds ) {
|
|
return 0;
|
|
}
|
|
|
|
return ( m_tagIds[n] & BACKBITCOMP );
|
|
}
|
|
|
|
bool isBackTag( int32_t n ) const {
|
|
if ( !m_tagIds ) {
|
|
return false;
|
|
}
|
|
|
|
if ( m_tagIds[n] & BACKBIT ) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
// CAUTION!!!
|
|
//
|
|
// "BACKBIT" is set in the tagid of m_tagIds[] to indicate the tag is
|
|
// a "back tag" as opposed to a "front tag". i.e. </a> vs. <a>
|
|
// respectively. so mask it out by doing "& BACKBITCOMP" if you just
|
|
// want the pure tagid!!!!
|
|
//
|
|
// CAUTION!!!
|
|
nodeid_t *getTagIds() { return m_tagIds; }
|
|
const nodeid_t *getTagIds() const { return m_tagIds; }
|
|
char * *getWordPtrs() { return m_words; }
|
|
const char * const *getWordPtrs() const { return (const char*const*)m_words; }
|
|
int32_t *getWordLens() { return m_wordLens; }
|
|
const int32_t *getWordLens() const { return m_wordLens; }
|
|
int64_t *getWordIds() { return m_wordIds; }
|
|
const int64_t *getWordIds() const { return m_wordIds; }
|
|
const int32_t *getNodes() const { return m_nodes; }
|
|
|
|
// 2 types of "words": punctuation and alnum
|
|
// isPunct() will return true on tags, too, so they are "punct"
|
|
bool isPunct ( int32_t n ) const { return m_wordIds[n] == 0;}
|
|
bool isAlnum ( int32_t n ) const { return m_wordIds[n] != 0;}
|
|
bool isAlpha ( int32_t n ) const {
|
|
if ( m_wordIds[n] == 0LL ) return false;
|
|
if ( isNum ( n ) ) return false;
|
|
return true;
|
|
}
|
|
|
|
int32_t getAsLong ( int32_t n ) const {
|
|
// skip if no digit
|
|
if ( ! is_digit ( m_words[n][0] ) ) return -1;
|
|
return atol2(m_words[n],m_wordLens[n]);
|
|
}
|
|
|
|
bool isNum ( int32_t n ) const {
|
|
if ( ! is_digit(m_words[n][0]) ) return false;
|
|
char *p = m_words[n];
|
|
char *pend = p + m_wordLens[n];
|
|
for ( ; p < pend ; p++ )
|
|
if ( ! is_digit(*p) ) return false;
|
|
return true;
|
|
}
|
|
|
|
// . are all alpha char capitalized?
|
|
bool isUpper ( int32_t n ) const {
|
|
// skip if not alnum...
|
|
if ( m_wordIds[n] == 0LL ) {
|
|
return false;
|
|
}
|
|
|
|
char *p = m_words[n];
|
|
char *pend = p + m_wordLens[n];
|
|
char cs;
|
|
for ( ; p < pend; p += cs ) {
|
|
cs = getUtf8CharSize( p );
|
|
if ( is_digit( *p ) ) {
|
|
continue;
|
|
}
|
|
|
|
if ( is_lower_utf8( p ) ) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool isCapitalized( int32_t n ) const {
|
|
if ( !is_alpha_utf8( m_words[n] ) ) {
|
|
return false;
|
|
}
|
|
|
|
return is_upper_utf8( m_words[n] );
|
|
}
|
|
|
|
Words ( );
|
|
~Words ( );
|
|
void reset ( );
|
|
|
|
char *getContent() {
|
|
if ( m_numWords == 0 ) return NULL;
|
|
return m_words[0];
|
|
}
|
|
|
|
int32_t getPreCount() const { return m_preCount; }
|
|
|
|
private:
|
|
|
|
bool allocateWordBuffers(int32_t count, bool tagIds = false);
|
|
|
|
char m_localBuf [ WORDS_LOCALBUFSIZE ];
|
|
|
|
char *m_localBuf2;
|
|
int32_t m_localBufSize2;
|
|
|
|
char *m_buf;
|
|
int32_t m_bufSize;
|
|
Xml *m_xml ; // if the class is set from xml, rather than a string
|
|
|
|
int32_t m_preCount ; // estimate of number of words in the doc
|
|
char **m_words ; // pointers to the word
|
|
int32_t *m_wordLens ; // length of each word
|
|
int64_t *m_wordIds ; // lower ascii hash of word
|
|
int32_t *m_nodes ; // Xml.cpp node # (for tags only)
|
|
nodeid_t *m_tagIds ; // tag for xml "words"
|
|
|
|
int32_t m_numWords; // # of words we have
|
|
int32_t m_numAlnumWords;
|
|
|
|
bool m_hasTags;
|
|
|
|
int32_t m_numTags;
|
|
};
|
|
|
|
#endif // GB_WORDS_H
|