Merge branch 'master' into dev-acceptlang

This commit is contained in:
Ai Lin Chia
2018-07-26 11:09:47 +02:00
4 changed files with 4 additions and 212 deletions

@ -54,7 +54,7 @@ OBJS_O2 = \
Rdb.o RdbBase.o \
Sections.o Spider.o SpiderCache.o SpiderColl.o SpiderLoop.o StopWords.o Summary.o \
Title.o \
UCPropTable.o UdpServer.o \
UdpServer.o \
Xml.o XmlDoc.o XmlDoc_Indexing.o XmlNode.o \

@ -1,167 +0,0 @@
#include "gb-include.h"
#include "Mem.h"
#include "UCPropTable.h"
// Creates an empty table.
//   valueSize - number of bytes stored per codepoint
//   tableBits - number of low codepoint bits that index into one sub-table;
//               the remaining high bits select the sub-table
// Nothing is allocated here: m_data stays NULL until setValue() or
// deserialize() needs storage.
UCPropTable::UCPropTable(u_char valueSize,
			 u_char tableBits) {
	m_data      = NULL;
	m_valueSize = valueSize;
	m_tableBits = tableBits;

	// number of entries held by a single sub-table
	const int entriesPerTable = 1 << m_tableBits;

	m_tableMask = entriesPerTable - 1;
	m_tableSize = entriesPerTable * m_valueSize;
	m_numTables = 0xF0000 >> m_tableBits;
}
// Destructor: hands off to reset(), which frees every allocated sub-table
// and the top-level pointer array.
UCPropTable::~UCPropTable() {
reset();
}
// Frees every allocated sub-table and then the top-level pointer array,
// leaving m_data NULL. Does nothing when no storage has been allocated.
void UCPropTable::reset() {
	if (m_data == NULL)
		return;

	for (u_int32_t t = 0; t < m_numTables; ++t) {
		// sub-tables are allocated on demand, so many slots are empty
		if (m_data[t] == NULL)
			continue;
		mfree(m_data[t], m_tableSize, "UCPropTable");
	}

	mfree(m_data, m_numTables * sizeof(m_data[0]), "UCPropTable");
	m_data = NULL;
}
/*
void *UCPropTable::getValue(uint32_t c){
uint32_t prefix = c >> m_tableBits;
uint32_t key = c & m_tableMask;
if (prefix >= m_numTables) return NULL;
if (m_data[prefix] == NULL) return NULL;
return (void*) (m_data[prefix] + key*m_valueSize);
}
*/
bool UCPropTable::setValue(u_int32_t c, const void* value) {
u_int32_t prefix = c >> m_tableBits;
uint16_t key = c & m_tableMask;
if (prefix >= m_numTables) return false; // invalid plane
if (m_data == NULL){
m_data = (u_char**)
mmalloc(m_numTables * sizeof(m_data[0]),
"UCPropTable");
if (m_data == NULL) {
log(LOG_WARN, "UCPropTable: out of memory");
return false;
}
memset(m_data, '\0', m_numTables*sizeof(m_data[0]));
}
if (m_data[prefix] == NULL){
m_data[prefix] = (u_char*)
mmalloc(m_tableSize, "UCPropTable");
if (m_data[prefix] == NULL) {
log(LOG_WARN, "UCPropTable: out of memory");
return false;
}
memset(m_data[prefix], '\0', m_tableSize);
}
gbmemcpy(m_data[prefix] +key*m_valueSize, value, m_valueSize);
return true;
}
// Returns the number of bytes serialize() needs: a fixed header, one
// (index + payload) record per allocated sub-table, and a trailing
// end-of-record marker.
size_t UCPropTable::getStoredSize() const {
	// header: record size, value size, number of table bits
	u_int32_t total = sizeof(u_int32_t)
			+ sizeof(u_char)
			+ sizeof(u_char);

	if (m_data != NULL) {
		for (u_int32_t t = 0; t < m_numTables; ++t) {
			if (m_data[t] == NULL)
				continue;
			// table number followed by the table contents
			total += sizeof(int32_t) + m_tableSize;
		}
	}

	// end-of-record marker
	total += sizeof (u_int32_t);
	return total;
}
// Sentinel written by serialize() after the last sub-table record.
// deserialize() stops when it reads this value where a table index is expected.
#define RECORD_END (u_int32_t)0xdeadbeef
// Writes the table into 'buf' in the layout described by getStoredSize():
//   u_int32_t record size, u_char value size, u_char table bits,
//   then for every allocated sub-table: u_int32_t index + m_tableSize bytes,
//   then the RECORD_END marker.
// Returns the number of bytes written, or 0 if 'buf' is too small or the
// written size does not match the computed size.
size_t UCPropTable::serialize(char *buf, size_t bufSize) const {
	const uint32_t size = getStoredSize();
	if (bufSize < size) return 0;
	char *p = buf;

	// Header. The 32-bit fields are stored with memcpy: after the 6-byte
	// header nothing in the stream is 4-byte aligned, so writing through a
	// cast u_int32_t pointer would be a misaligned access.
	memcpy(p, &size, sizeof(size)); p += sizeof(size);
	*(u_char*)p = m_valueSize; p += sizeof(u_char);
	*(u_char*)p = m_tableBits; p += sizeof(u_char);

	if (m_data) {
		for (u_int32_t i = 0; i < m_numTables; i++) {
			if (!m_data[i]) continue;
			memcpy(p, &i, sizeof(i)); p += sizeof(i);
			gbmemcpy(p, m_data[i], m_tableSize);
			p += m_tableSize;
		}
	}

	const u_int32_t endMarker = RECORD_END;
	memcpy(p, &endMarker, sizeof(endMarker)); p += sizeof(endMarker);

	// sanity check
	if (p != buf + size) {
		log(LOG_WARN, "UCPropTable: size mismatch: expected %" PRId32" bytes, "
		    "but wrote %" PRId32" instead", (int32_t) size, (int32_t) (p - buf));
		return 0;
	}
	return p-buf;
}
size_t UCPropTable::deserialize(const void *buf, size_t bufSize) {
reset();
const char * const bufStart = (const char*)buf;
const char * const bufEnd = bufStart+bufSize;
const char *p = bufStart;
if(bufSize < 4+1+1)
return 0;
u_int32_t size = *(u_int32_t*)p; p+=sizeof(u_int32_t);
if(size > bufSize)
return 0;
m_valueSize = *(u_char*)p++;
m_tableBits = *(u_char*)p++;
//printf ("Read %d bytes after header\n", p-bufStart);
m_tableSize = (1 << m_tableBits) * m_valueSize;
m_tableMask = (1 << m_tableBits) - 1;
m_numTables = 0xF0000 >> m_tableBits;
// allocate main table
m_data = (u_char**) mmalloc(m_numTables * sizeof(m_data[0]), "UCPropTable");
if (m_data == NULL) {
log(LOG_WARN, "UCPropTable: out of memory");
return 0;
}
memset(m_data, 0, m_numTables*sizeof(m_data[0]));
//load tables
while (p < bufEnd) {
u_int32_t prefix = *(u_int32_t*)p; p += sizeof(u_int32_t);
if ( prefix == RECORD_END ){
if (p != bufEnd ) {
log(LOG_WARN, "UCPropTable: unexpected end of record");
return 0;
}
//printf ("Read %d bytes after footer\n", p-bufStart);
return size;
}
if(prefix<m_numTables) {
m_data[prefix] = (u_char*) mmalloc(m_tableSize, "UCPropTable");
if (m_data[prefix] == NULL) {
log(LOG_WARN, "UCPropTable: out of memory");
return 0;
}
memcpy(m_data[prefix], p, m_tableSize); p += m_tableSize;
} else
log(LOG_WARN,"UCPropTable: got invalid prefix %u at offset %tu", prefix, p-bufStart);
}
// shouldn't get here
log("UCPropTable: No RECORD_END found");
return 0;
}

@ -1,41 +0,0 @@
#ifndef GB_UCPROPTABLE_H
#define GB_UCPROPTABLE_H
#include <sys/types.h>
#include <stdlib.h> //NULL
// Sparse two-level table mapping a codepoint to a fixed-size value.
// The high bits of the codepoint (c >> m_tableBits) select a sub-table and
// the low bits (c & m_tableMask) select the entry inside it. Sub-tables are
// only allocated for ranges that have had a value set or loaded.
//
// NOTE(review): the class frees m_data in its destructor but declares no
// copy constructor or assignment operator, so a copied instance would free
// the same memory twice. Do not copy instances.
class UCPropTable {
public:
	// valueSize: bytes stored per codepoint.
	// tableBits: number of low codepoint bits indexing one sub-table.
	UCPropTable(unsigned char valueSize = 1,
		    unsigned char tableBits = 16) ;

	virtual ~UCPropTable() ;

	// Frees all storage; the table becomes empty.
	void reset();

	// Returns a pointer to the m_valueSize bytes stored for codepoint c,
	// or NULL if the table is empty, c is out of range, or the sub-table
	// covering c was never allocated.
	const void *getValue(uint32_t c) const {
		// m_data is NULL until the first setValue()/deserialize() and
		// again after reset(); it must not be indexed in that state
		if (m_data == NULL) return NULL;
		uint32_t prefix = c >> m_tableBits;
		uint32_t key = c & m_tableMask;
		if (prefix >= m_numTables) return NULL;
		if (m_data[prefix] == NULL) return NULL;
		return (const void*) (m_data[prefix] + key*m_valueSize);
	}

	// Copies m_valueSize bytes from value into the slot for codepoint c.
	// Returns false if c is out of range or memory could not be allocated.
	bool setValue(uint32_t c, const void *value);

	// Serialized size plus the size of the top-level pointer array.
	size_t getSize() const { return getStoredSize() + m_numTables*sizeof(char*); }

	// Number of bytes serialize() will write.
	size_t getStoredSize() const;

	// Writes the table to buf; returns bytes written or 0 on failure.
	size_t serialize(char *buf, size_t bufSize) const;

	// Loads the table from buf; returns the record size or 0 on failure.
	size_t deserialize(const void *buf, size_t bufSize);

private:
	unsigned char **m_data;      // m_numTables sub-table pointers, or NULL
	unsigned char m_valueSize;   // bytes per entry
	unsigned char m_tableBits;   // low bits indexing within a sub-table
	uint32_t m_tableSize;        // bytes per sub-table
	uint32_t m_tableMask;        // (1 << m_tableBits) - 1
	uint32_t m_numTables;        // 0xF0000 >> m_tableBits
};
#endif // GB_UCPROPTABLE_H

@ -1,7 +1,7 @@
File maps and how they are used.
The data comes from unicode.com and are in text format.
To speed up loading the filees are converted into binary format. The format depends on the data and how it is used.
To speed up loading the files are converted into binary format. The format depends on the data and how it is used.
==== Lookups
We are only interested in the following lookups:
@ -17,7 +17,7 @@ UnicodeData.txt uses 30+ categories.
For hashing consistently we need to map from uppercase/titlecase to lowercase.
==== Optimal map types
Most of the codepoints we process are in the 0..128 range, so optimizing for that can be worth it. Some properties are in long ranges, so that may be worth optimizing for too. Some properties are sparse i.e. appleis only to a small subset of the available codepoints. So the optimal map type isn't clear-cut.
Most of the codepoints we process are in the 0..128 range, so optimizing for that can be worth it. Some properties are in long ranges, so that may be worth optimizing for too. Some properties are sparse i.e. applies only to a small subset of the available codepoints. So the optimal map type isn't clear-cut.
=== codepoint->script
Almost all codepoints belong to a script (the exceptions are some codepoints in the private-use blocks, which don't have one).
@ -28,7 +28,7 @@ Codepoints can have a multiple of properties. UnicodeData.txt uses 34 distinct p
We ignore 2 of those properties and map the rest into a 32-bit bitmask. One 32-bit bitmask per codepoint.
Optimal map type: full map, 32 bit per entry.
Additional, testing if a codepoint can be in a word/identifier (a-z, 0-9, <20>, greek letters, ...) is the most used lookup so we make a specialized map for that: full map, 1-byte boolean per entry.
Additional, testing if a codepoint can be in a word/identifier (a-z, 0-9, <20>, greek letters, ...) is the most used lookup so we make a specialized map for that: full map, 1-byte boolean per entry. It may be worth squeezing down to 1 bit per codepoint.
=== upper/lowercase
We only need a mapping to lowercase. Lowercase only applies to a tiny fraction (~1300) of the codepoints, so we only dump that mapping.