Fix incorrect use of cp-1257 with utf8 content

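For short UTF-8 documents, CED incorrectly guesses that the content is windows-1257 encoded. We now override that guess back to UTF-8 in specific cases (the charset was already determined to be UTF-8, no invalid UTF-8 sequences were found, and CED picked one of the windows-1250 to windows-1257 codepages) so that documents don't get mangled.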
2025-07-14 02:36:06 -04:00 · 2018-03-15 15:34:52 +01:00
parent b95017ef4d
commit 626d4afa08
1 changed file with 19 additions and 2 deletions
--- a/GbEncoding.cpp
+++ b/GbEncoding.cpp
@@ -328,9 +328,26 @@ uint16_t GbEncoding::getCharset(HttpMime *mime, const char *url, const char *s,
 	int16_t cedCharset = convertEncodingCED(encoding);
 	const char *cedCharsetStr = EncodingName(encoding);

-	// we'll always use cedCharset when it's reliable or when charset is not already known
 	if (cedCharset != csUnknown && (is_reliable || charset == csUnknown)) {
-		charset = cedCharset;
+		if(charset==csUTF8 && !invalidUtf8Encoding &&
+		   (cedCharset==cswindows1250 ||
+		    cedCharset==cswindows1251 ||
+		    cedCharset==cswindows1252 ||
+		    cedCharset==cswindows1253 ||
+		    cedCharset==cswindows1254 ||
+		    cedCharset==cswindows1255 ||
+		    cedCharset==cswindows1256 ||
+		    cedCharset==cswindows1257))
+		{
+			//For short documents the ced library has a weird builtin bias for classifying utf8 as windows-xxxx codepages.
+			//Passing the http-header-char and meta-charset hints to ced makes it no longer recognize big5 reliably on
+			//incorrectly configured webservers. So if the webserver claimed it was utf8, and we could not find any
+			//illegal utf8 sequences in it and ced thinks it is a windows-xxxx codepage then we treat it as utf8.
+			charset = csUTF8;
+		} else {
+			// we'll always use cedCharset when it's reliable or when charset is not already known
+			charset = cedCharset;
+		}
 	}

 	// alias these charsets so iconv understands