privacore-open-source-searc.../Entities.cpp
Ivan Skytte Jørgensen beeddcf35d Got rid of gb-include.h
2018-07-26 17:29:51 +02:00

259 lines
7.3 KiB
C++

#include "Entities.h"
#include "HashTableX.h"
#include "Process.h"
#include "GbMutex.h"
#include "ScopedLock.h"
// maps the 64-bit hash of an entity name to (index into s_entities)+1;
// populated by initEntityTable()
static HashTableX s_table;
// set to true by initEntityTable() once every entity has been added to s_table
static bool s_isInitialized = false;
// held by initEntityTable() while it checks s_isInitialized and builds s_table
static GbMutex s_tableMutex;
// One named HTML character reference and what it translates to.
// The utf8/utf8Len members are not static data: initEntityTable() computes
// them from codepoint[] the first time the lookup table is built.
// NOTE(review): member order presumably matches the initializers in
// entities.inc - confirm before reordering.
struct Entity {
const char *entity; //entity name with leading ampersand but without trailing semicolon, like "&nbsp"
int codepoints; //number of unicode codepoints this entity translates to (codepoint[] has room for 2)
int codepoint[2]; //unicode codepoints
size_t utf8Len; //number of bytes used in utf8[]; set by initEntityTable()
char utf8[2*4]; //utf8 encoding of the codepoints: 2 codepoints of 4 bytes each; filled in by initEntityTable()
};
#include "entities.inc"
void resetEntities ( ) {
s_table.reset();
}
static bool initEntityTable(){
ScopedLock sl(s_tableMutex);
if ( ! s_isInitialized ) {
// set up the hash table
if ( ! s_table.set ( 8,4,8192,NULL,0,false,"enttbl" ) ) {
log("build: Could not init table of HTML entities.");
return false;
}
// now add in all the html entities
const int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
for ( int32_t i = 0 ; i < n ; i++ ) {
int64_t h = hash64b ( s_entities[i].entity );
// convert the unicode codepoints to an utf8 string
char *buf = (char *)s_entities[i].utf8;
for(int j=0; j<s_entities[i].codepoints; j++) {
UChar32 codepoint = s_entities[i].codepoint[j];
int32_t len = utf8Encode(codepoint,buf);
if ( len == 0 ) { g_process.shutdownAbort(true); }
// make modification to make parsing easier
if ( codepoint == 160 ) { // nbsp
buf[0] = ' ';
len = 1;
}
buf += len;
}
s_entities[i].utf8Len = (size_t)(buf-s_entities[i].utf8);
// must not exist!
if ( s_table.isInTable(&h) ) { g_process.shutdownAbort(true);}
// store the entity index in the hash table as score
if ( ! s_table.addTerm(h, i+1) ) return false;
}
s_isInitialized = true;
}
return true;
}
// . is "s" a named HTML entity such as "&amp" or "&amp;"?
// . returns a pointer to its entry in s_entities
// . returns NULL if there is no such entity or the table could not be built
// . len must be at least 1 (s[len-1] is inspected)
static const Entity *getTextEntity ( const char *s , int32_t len ) {
	// without the lookup table nothing can be resolved
	if ( !initEntityTable() )
		return NULL;

	// the table is keyed on the name including '&' but excluding ';'
	const int32_t nameLen = ( s[len-1] == ';' ) ? len - 1 : len;
	int64_t key = hash64 ( s , nameLen );

	// the score field holds index+1; 0 means the name is unknown
	const int32_t slot = (int32_t) s_table.getScore(key);
	if ( slot == 0 )
		return NULL;

	return &s_entities[slot-1];
}
// . get a decimal encoded entity
// . s/len is the whole thing: "&#" + 1..7 decimal digits + optional ';'
// . returns 0 if it is not a well-formed decimal reference
// . returns ' ' if the value is below 32 or above U+10FFFF
// . otherwise returns the code point
static uint32_t getDecimalEntity ( const char *s , int32_t len ) {
	// take the ; off, if any (guard len so s[-1] is never read)
	if ( len > 0 && s[len-1] == ';' ) len--;
	// . &#1 is smallest it can be
	// . &#1114111 is biggest
	if ( len < 3 || len > 9 ) return 0;
	// . must start with &#
	if ( s[0] !='&' || s[1] != '#' ) return 0;
	// accumulate digit by digit; every position must be a decimal digit.
	// at most 7 digits, so the value cannot overflow 32 bits.
	uint32_t v = 0;
	for ( int32_t i = 2 ; i < len ; i++ ) {
		const char c = s[i];
		if ( c < '0' || c > '9' ) return 0;
		v = v * 10 + (uint32_t)(c - '0');
	}
	// control characters and out-of-range values become a space
	if ( v < 32 || v > 0x10ffff ) return (uint32_t)' ';
	return v;
}
// . get a hexadecimal encoded entity
// . s/len is the whole thing: "&#x" + 1..6 hex digits + optional ';'
//   (only a lowercase 'x' prefix is recognized; digits may be either case)
// . returns 0 if it is not a well-formed hexadecimal reference
// . returns ' ' if the value is below 32 or above U+10FFFF
// . otherwise returns the code point
static uint32_t getHexadecimalEntity ( const char *s , int32_t len ) {
	// take the ; off, if any (guard len so s[-1] is never read)
	if ( len > 0 && s[len-1] == ';' ) len--;
	// . &#x1 is smallest it can be
	// . &#x10FFFF is biggest
	if ( len < 4 || len > 9 ) return 0;
	// . must start with &#x
	if ( s[0] !='&' || s[1] != '#' || s[2] !='x' ) return 0;
	// accumulate nibble by nibble; every position must be a hex digit.
	// at most 6 digits, so the value cannot overflow 32 bits.
	uint32_t v = 0;
	for ( int32_t i = 3 ; i < len ; i++ ) {
		const char c = s[i];
		uint32_t nibble;
		if      ( c >= '0' && c <= '9' ) nibble = (uint32_t)(c - '0');
		else if ( c >= 'a' && c <= 'f' ) nibble = (uint32_t)(c - 'a') + 10;
		else if ( c >= 'A' && c <= 'F' ) nibble = (uint32_t)(c - 'A') + 10;
		else return 0;
		v = (v << 4) | nibble;
	}
	// control characters and out-of-range values become a space
	if ( v < 32 || v > 0x10ffff ) return (uint32_t)' ';
	return v;
}
// . s[maxLen] should be the NULL
// . returns full length of entity @ "s" if there is a valid one, 0 otherwise
// . sets *c to the iso character the entity represents (if there is one)
// JAB: const-ness for optimizer...
int32_t getHtmlEntity(const char *s, int32_t maxLen, uint32_t codepoint[2], int32_t *codepointCount, int32_t *utf8Len) {
//TODO: handle multi-codepoint entitites
*utf8Len=0;
// ensure there's an & as first char
if ( s[0] != '&' ) {
return 0;
}
// compute maximum length of entity, if it's indeed an entity
int32_t len = 1;
if ( s[len] == '#' ) {
len++;
}
// cut it off after <32> chars to save time and also to avoid parsing
// obscenely long incorrect entitites (eg an ampersand followed by 2MB of letters)
while ( len < maxLen && len < max_entity_name_len && is_alnum_a( s[len] ) ) {
len++;
}
// character entity reference must end with a semicolon.
// some browsers have lenient parsing, but we don't accept invalid
// references.
if ( len == maxLen || s[len] != ';' ) {
//not a valid character entity reference
return 0;
}
len++;
// we don't have entities longer than what w3c specified
if ( len > max_entity_name_len+1 ) {
return 0;
}
// all entites are 3 or more chars (&gt)
if ( len < 3 ) {
return 0;
}
// . if it's a numeric entity like &#123 use this routine
// . pass in the whole she-bang: "&#12...;" or "&acute...;
if ( s[1] == '#' ) {
if ( s[2] == 'x' ) {
codepoint[0] = getHexadecimalEntity( s, len );
*codepointCount = 1;
} else {
codepoint[0] = getDecimalEntity( s, len );
*codepointCount = 1;
}
} else {
// otherwise, it's a named entity
const Entity *entity = getTextEntity( s, len );
if(entity) {
memcpy(codepoint, entity->codepoint, entity->codepoints*sizeof(int32_t));
*codepointCount = entity->codepoints;
*utf8Len = (int32_t)entity->utf8Len;
return len;
} else {
return 0; //unknown named entity
}
}
// return 0 if not an entity, length of entity if it is an entity
if ( codepoint[0] ) {
return len;
} else {
return 0;
}
}