mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-01-22 02:18:42 -05:00
48 lines
1.2 KiB
C++
48 lines
1.2 KiB
C++
// Matt Wells, copyright Jul 2005
|
|
|
|
#ifndef GB_POS_H
|
|
#define GB_POS_H
|
|
|
|
#include <stdint.h>
|
|
#include "TitleRecVersion.h"
|
|
|
|
// This class measures the distance between two "words" (as defined in the
// Words.cpp class) in units of characters, not bytes. A UTF-8 character can
// be 1, 2, 3 or 4 bytes long, so be careful not to confuse the two.
|
|
|
|
// Size, in bytes, of the char array m_localBuf embedded in every Pos object.
#define POS_LOCALBUFSIZE 20
|
|
|
|
// Forward declaration only: this header refers to TokenizerResult solely
// through pointers, so its full definition is not required here.
class TokenizerResult;
|
|
|
|
class Pos {
|
|
|
|
public:
|
|
|
|
Pos();
|
|
~Pos();
|
|
void reset();
|
|
|
|
bool set(const TokenizerResult *tr, int32_t a = 0, int32_t b = -1);
|
|
|
|
// . filter out xml words [a,b] into plain text, stores into "f"
|
|
// . will not exceed "fend"
|
|
// . returns number of BYTES stored into "f"
|
|
unsigned filter(const TokenizerResult *tr, int32_t a, int32_t b, bool addEllipsis, char *f, char *fend,
|
|
int32_t version = TITLEREC_CURRENT_VERSION);
|
|
|
|
// . the position in CHARACTERS of word i is given by m_pos[i]
|
|
// . this is NOT the byte position. you can have 2, 3 or even 4
|
|
// byte characters in utf8. the purpose here is for counting
|
|
// "letters" or "characters" for formatting purposes.
|
|
int32_t *m_pos;
|
|
|
|
private:
|
|
char m_localBuf [ POS_LOCALBUFSIZE ];
|
|
char *m_buf;
|
|
int32_t m_bufSize;
|
|
|
|
bool m_needsFree;
|
|
};
|
|
|
|
#endif // GB_POS_H
|