mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-07-14 02:36:06 -04:00
Fix incorrect use of cp-1257 with utf8 content
For short UTF-8 documents, CED incorrectly guesses that they are windows-1257 encoded. We now override that guess back to UTF-8 in specific cases so that such documents don't get mangled.
This commit is contained in:
committed by
Ai Lin Chia
parent
b95017ef4d
commit
626d4afa08
@ -328,9 +328,26 @@ uint16_t GbEncoding::getCharset(HttpMime *mime, const char *url, const char *s,
|
||||
int16_t cedCharset = convertEncodingCED(encoding);
|
||||
const char *cedCharsetStr = EncodingName(encoding);
|
||||
|
||||
// we'll always use cedCharset when it's reliable or when charset is not already known
|
||||
if (cedCharset != csUnknown && (is_reliable || charset == csUnknown)) {
|
||||
charset = cedCharset;
|
||||
if(charset==csUTF8 && !invalidUtf8Encoding &&
|
||||
(cedCharset==cswindows1250 ||
|
||||
cedCharset==cswindows1251 ||
|
||||
cedCharset==cswindows1252 ||
|
||||
cedCharset==cswindows1253 ||
|
||||
cedCharset==cswindows1254 ||
|
||||
cedCharset==cswindows1255 ||
|
||||
cedCharset==cswindows1256 ||
|
||||
cedCharset==cswindows1257))
|
||||
{
|
||||
//For short documents the ced library has a weird builtin bias for classifying utf8 as windows-xxxx codepages.
|
||||
//Passing the http-header-char and meta-charset hints to ced makes it no longer recognize big5 reliably on
|
||||
//incorrectly configured webservers. So if the webserver claimed it was utf8, and we could not find any
|
||||
//illegal utf8 sequences in it and ced thinks it is a windows-xxxx codepage then we treat it as utf8.
|
||||
charset = csUTF8;
|
||||
} else {
|
||||
// we'll always use cedCharset when it's reliable or when charset is not already known
|
||||
charset = cedCharset;
|
||||
}
|
||||
}
|
||||
|
||||
// alias these charsets so iconv understands
|
||||
|
Reference in New Issue
Block a user