forked from Mirrors/privacore-open-source-search-engine
Merge branch 'master' into dev-acceptlang
This commit is contained in:
2
Makefile
2
Makefile
@ -54,7 +54,7 @@ OBJS_O2 = \
|
||||
Rdb.o RdbBase.o \
|
||||
Sections.o Spider.o SpiderCache.o SpiderColl.o SpiderLoop.o StopWords.o Summary.o \
|
||||
Title.o \
|
||||
UCPropTable.o UdpServer.o \
|
||||
UdpServer.o \
|
||||
Xml.o XmlDoc.o XmlDoc_Indexing.o XmlNode.o \
|
||||
|
||||
|
||||
|
167
UCPropTable.cpp
167
UCPropTable.cpp
@ -1,167 +0,0 @@
|
||||
#include "gb-include.h"
|
||||
|
||||
#include "Mem.h"
|
||||
#include "UCPropTable.h"
|
||||
|
||||
// Creates an empty two-level property table.
//   valueSize - number of bytes stored per code point
//   tableBits - number of low-order code point bits that index into a
//               second-level table; the remaining high bits pick the table
// Nothing is allocated here: m_data stays NULL until setValue() or
// deserialize() needs storage.
UCPropTable::UCPropTable(u_char valueSize, u_char tableBits)
	: m_data(NULL),
	  m_valueSize(valueSize),
	  m_tableBits(tableBits),
	  // bytes in one second-level table
	  m_tableSize((1 << tableBits) * valueSize),
	  // mask extracting the in-table index from a code point
	  m_tableMask((1 << tableBits) - 1),
	  // number of second-level tables needed to cover code points below 0xF0000
	  m_numTables(0xF0000 >> tableBits) {
}
|
||||
|
||||
// Releases all memory held by the table.
UCPropTable::~UCPropTable() {
	// reset() frees every second-level table and the top-level array
	this->reset();
}
|
||||
|
||||
void UCPropTable::reset() {
|
||||
if (m_data) {
|
||||
for (u_int32_t i=0;i<m_numTables;i++) {
|
||||
if (m_data[i])
|
||||
mfree(m_data[i], m_tableSize , "UCPropTable");
|
||||
}
|
||||
mfree(m_data, m_numTables*sizeof(m_data[0]),
|
||||
"UCPropTable");
|
||||
m_data = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
void *UCPropTable::getValue(uint32_t c){
|
||||
uint32_t prefix = c >> m_tableBits;
|
||||
uint32_t key = c & m_tableMask;
|
||||
if (prefix >= m_numTables) return NULL;
|
||||
if (m_data[prefix] == NULL) return NULL;
|
||||
return (void*) (m_data[prefix] + key*m_valueSize);
|
||||
}
|
||||
*/
|
||||
|
||||
bool UCPropTable::setValue(u_int32_t c, const void* value) {
|
||||
u_int32_t prefix = c >> m_tableBits;
|
||||
uint16_t key = c & m_tableMask;
|
||||
if (prefix >= m_numTables) return false; // invalid plane
|
||||
if (m_data == NULL){
|
||||
m_data = (u_char**)
|
||||
mmalloc(m_numTables * sizeof(m_data[0]),
|
||||
"UCPropTable");
|
||||
if (m_data == NULL) {
|
||||
log(LOG_WARN, "UCPropTable: out of memory");
|
||||
return false;
|
||||
}
|
||||
memset(m_data, '\0', m_numTables*sizeof(m_data[0]));
|
||||
}
|
||||
if (m_data[prefix] == NULL){
|
||||
m_data[prefix] = (u_char*)
|
||||
mmalloc(m_tableSize, "UCPropTable");
|
||||
if (m_data[prefix] == NULL) {
|
||||
log(LOG_WARN, "UCPropTable: out of memory");
|
||||
return false;
|
||||
}
|
||||
|
||||
memset(m_data[prefix], '\0', m_tableSize);
|
||||
}
|
||||
gbmemcpy(m_data[prefix] +key*m_valueSize, value, m_valueSize);
|
||||
return true;
|
||||
|
||||
}
|
||||
|
||||
size_t UCPropTable::getStoredSize() const {
|
||||
// Header
|
||||
u_int32_t size = sizeof(u_int32_t) // record size
|
||||
+ sizeof(u_char) // value size
|
||||
+ sizeof(u_char); // number of table bits
|
||||
|
||||
if (m_data)
|
||||
for (u_int32_t i=0 ; i < m_numTables ; i++) {
|
||||
if (m_data[i]) {
|
||||
size += sizeof(int32_t) + // table #
|
||||
m_tableSize;
|
||||
|
||||
}
|
||||
}
|
||||
size += sizeof (u_int32_t);
|
||||
return size;
|
||||
}
|
||||
// 32-bit marker that serialize() writes after the last table and that
// deserialize() recognizes, in place of a table number, as the end of the record.
#define RECORD_END (u_int32_t)0xdeadbeef
|
||||
|
||||
size_t UCPropTable::serialize(char *buf, size_t bufSize) const {
|
||||
uint32_t size = getStoredSize();
|
||||
if (bufSize < size) return 0;
|
||||
char *p = buf;
|
||||
// Header
|
||||
*(uint32_t*)p = size; p += sizeof(u_int32_t);
|
||||
*(u_char*)p = m_valueSize; p += sizeof(u_char);
|
||||
*(u_char*)p = m_tableBits; p += sizeof(u_char);
|
||||
if (m_data)
|
||||
for (u_int32_t i=0; i<m_numTables ; i++) {
|
||||
if (m_data[i]) {
|
||||
*(u_int32_t*)p = i; p += sizeof(u_int32_t);
|
||||
gbmemcpy(p, m_data[i], m_tableSize);
|
||||
p += m_tableSize;
|
||||
}
|
||||
}
|
||||
*(uint32_t*)p = RECORD_END; p += sizeof(u_int32_t);
|
||||
// sanity check
|
||||
if (p != buf + size) {
|
||||
log(LOG_WARN, "UCPropTable: size mismatch: expected %" PRId32" bytes, "
|
||||
"but wrote %" PRId32" instead", (int32_t) size, (int32_t) (p - buf));
|
||||
return 0;
|
||||
}
|
||||
return p-buf;
|
||||
}
|
||||
|
||||
size_t UCPropTable::deserialize(const void *buf, size_t bufSize) {
|
||||
reset();
|
||||
const char * const bufStart = (const char*)buf;
|
||||
const char * const bufEnd = bufStart+bufSize;
|
||||
const char *p = bufStart;
|
||||
if(bufSize < 4+1+1)
|
||||
return 0;
|
||||
u_int32_t size = *(u_int32_t*)p; p+=sizeof(u_int32_t);
|
||||
if(size > bufSize)
|
||||
return 0;
|
||||
|
||||
m_valueSize = *(u_char*)p++;
|
||||
m_tableBits = *(u_char*)p++;
|
||||
//printf ("Read %d bytes after header\n", p-bufStart);
|
||||
m_tableSize = (1 << m_tableBits) * m_valueSize;
|
||||
m_tableMask = (1 << m_tableBits) - 1;
|
||||
|
||||
m_numTables = 0xF0000 >> m_tableBits;
|
||||
// allocate main table
|
||||
m_data = (u_char**) mmalloc(m_numTables * sizeof(m_data[0]), "UCPropTable");
|
||||
if (m_data == NULL) {
|
||||
log(LOG_WARN, "UCPropTable: out of memory");
|
||||
return 0;
|
||||
}
|
||||
memset(m_data, 0, m_numTables*sizeof(m_data[0]));
|
||||
|
||||
//load tables
|
||||
while (p < bufEnd) {
|
||||
u_int32_t prefix = *(u_int32_t*)p; p += sizeof(u_int32_t);
|
||||
if ( prefix == RECORD_END ){
|
||||
if (p != bufEnd ) {
|
||||
log(LOG_WARN, "UCPropTable: unexpected end of record");
|
||||
return 0;
|
||||
}
|
||||
//printf ("Read %d bytes after footer\n", p-bufStart);
|
||||
return size;
|
||||
|
||||
}
|
||||
if(prefix<m_numTables) {
|
||||
m_data[prefix] = (u_char*) mmalloc(m_tableSize, "UCPropTable");
|
||||
if (m_data[prefix] == NULL) {
|
||||
log(LOG_WARN, "UCPropTable: out of memory");
|
||||
return 0;
|
||||
}
|
||||
memcpy(m_data[prefix], p, m_tableSize); p += m_tableSize;
|
||||
} else
|
||||
log(LOG_WARN,"UCPropTable: got invalid prefix %u at offset %tu", prefix, p-bufStart);
|
||||
}
|
||||
// shouldn't get here
|
||||
log("UCPropTable: No RECORD_END found");
|
||||
return 0;
|
||||
}
|
@ -1,41 +0,0 @@
|
||||
#ifndef GB_UCPROPTABLE_H
|
||||
#define GB_UCPROPTABLE_H
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <stdlib.h> //NULL
|
||||
|
||||
// Two-level lookup table mapping a code point to a fixed-size value.
//
// The high bits of a code point (c >> tableBits) select a second-level
// table, the low bits (c & tableMask) select the entry within it.
// Second-level tables are allocated only when a value is stored in them.
//
// NOTE: the class owns raw allocations and declares no copy operations;
// copying an instance would make two objects free the same tables.
class UCPropTable {
public:
	// valueSize: bytes stored per code point.
	// tableBits: low-order code point bits indexing one second-level table.
	UCPropTable(unsigned char valueSize = 1,
		    unsigned char tableBits = 16) ;

	virtual ~UCPropTable() ;

	// Frees all tables and returns to the empty state.
	void reset();

	// Returns a pointer to the m_valueSize bytes stored for code point 'c',
	// or NULL if 'c' is outside the covered range or nothing has been
	// stored in the table covering it.
	const void *getValue(uint32_t c) const {
		uint32_t prefix = c >> m_tableBits;
		uint32_t key = c & m_tableMask;
		if (prefix >= m_numTables) return NULL;
		// m_data is NULL until the first setValue()/deserialize(), so it
		// must be checked before it is indexed
		if (m_data == NULL) return NULL;
		if (m_data[prefix] == NULL) return NULL;
		return (const void*) (m_data[prefix] + key*m_valueSize);
	}

	// Stores m_valueSize bytes from 'value' for code point 'c'.
	// Returns false if 'c' is out of range or memory could not be allocated.
	bool setValue(uint32_t c, const void *value);

	// Serialized size plus the size of the top-level pointer array.
	size_t getSize() const { return getStoredSize() + m_numTables*sizeof(char*); }

	// Number of bytes serialize() writes for the current contents.
	size_t getStoredSize() const;

	// Writes the table to 'buf'; returns bytes written, or 0 on failure.
	size_t serialize(char *buf, size_t bufSize) const;

	// Replaces the contents from 'buf'; returns the record size, or 0 on failure.
	size_t deserialize(const void *buf, size_t bufSize);

private:
	// top-level array of m_numTables second-level tables (entries may be NULL);
	// NULL itself while the table is empty
	unsigned char **m_data;

	unsigned char m_valueSize;  // bytes per stored value
	unsigned char m_tableBits;  // low-order bits indexing a second-level table
	uint32_t m_tableSize;       // bytes in one second-level table
	uint32_t m_tableMask;       // (1 << m_tableBits) - 1
	uint32_t m_numTables;       // 0xF0000 >> m_tableBits
};
|
||||
|
||||
#endif // GB_UCPROPTABLE_H
|
@ -1,7 +1,7 @@
|
||||
File maps and how they are used.
|
||||
|
||||
The data comes from unicode.com and are in text format.
|
||||
To speed up loading the filees are converted into binary format. The format depends on the data and how it is used.
|
||||
To speed up loading the files are converted into binary format. The format depends on the data and how it is used.
|
||||
|
||||
==== Lookups
|
||||
We are only interested in the following lookups:
|
||||
@ -17,7 +17,7 @@ UnicodeData.txt uses 30+ categories.
|
||||
For hashing consistently we need to map from uppercase/titlecase to lowercase.
|
||||
|
||||
==== Optimal map types
|
||||
Most of the codepoints we process are in the 0..128 range, so optimizing for that can be worth it. Some properties are in long ranges, so that may be worth optimizing for too. Some properties are sparse i.e. appleis only to a small subset of the available codepoints. So the optimal map type isn't clear-cut.
|
||||
Most of the codepoints we process are in the 0..128 range, so optimizing for that can be worth it. Some properties are in long ranges, so that may be worth optimizing for too. Some properties are sparse, i.e. they apply only to a small subset of the available codepoints. So the optimal map type isn't clear-cut.
|
||||
|
||||
=== codepoint->script
|
||||
Almost all codepoints belong to a script (the exceptions are some codepoints in the private-use blocks, which don't have one).
|
||||
@ -28,7 +28,7 @@ Codepoints can have a multiple of properties. UnicodeData.txt uses 34 distinct p
|
||||
We ignore 2 of those properties and map the rest into a 32-bit bitmask. One 32-bit bitmask per codepoint.
|
||||
Optimal map type: full map, 32 bit per entry.
|
||||
|
||||
Additional, testing if a codepoint can be in a word/identifier (a-z, 0-9, <20>, greek letters, ...) is the most used lookup so we make a specialized map for that: full map, 1-byte boolean per entry.
|
||||
Additionally, testing if a codepoint can be in a word/identifier (a-z, 0-9, <20>, Greek letters, ...) is the most used lookup, so we make a specialized map for that: full map, 1-byte boolean per entry. It may be worth squeezing down to 1 bit per codepoint.
|
||||
|
||||
=== upper/lowercase
|
||||
We only need a mapping to lowercase. Lowercase only applies to a tiny fraction (~1300) of the codepoints. We currently only need the mapping to lowercase so we only dump that.
|
||||
|
Reference in New Issue
Block a user