#!/usr/bin/perl
#
# Reads the official IANA charset list on stdin
#   http://www.iana.org/assignments/character-sets
# and generates iana_charset.h and iana_charset.cpp in the current directory.
#
# A "supported" flag is set on the charsets we recognise and that iconv will
# convert for us.  The list of supported charsets is read from
# supported_charsets.txt (one "<MIBenum> <name>" pair per line), which must
# exist in the current directory.
#
# The generated files identify this script as parse_iana_charsets.pl.

use strict;
use warnings;

# ---------------------------------------------------------------------------
# 1. Load the set of supported MIBenum values.
# ---------------------------------------------------------------------------

my %supported_charsets;    # MIBenum => 1

open(my $supported_fh, '<', 'supported_charsets.txt')
    or die "Couldn't open supported_charsets.txt: $!";
while (my $line = <$supported_fh>) {
    $line =~ s/[\r\n]+\z//;    # tolerate both LF and CRLF line endings
    if ($line =~ /^\s*(\d+)\s+([\w-]+)/) {
        my ($mib_enum, $name) = ($1, $2);
        print "Supported charset: $name ($mib_enum)\n";

        # "+ 0" normalises the key so "003" and "3" name the same charset.
        $supported_charsets{ $mib_enum + 0 } = 1;
    }
}
close($supported_fh);

# ---------------------------------------------------------------------------
# 2. Parse the IANA registry from stdin (or the files named on the command
#    line).  Each record starts with a "Name:" line and ends at a blank line.
# ---------------------------------------------------------------------------

# primary name => {
#     ref       => text inside [...] on the Name: line (may be undef),
#     names     => [ primary name, alias, alias, ... ],
#     preferred => index into names of the preferred MIME name,
#     enum_name => index into names of the name used for the C enum,
#     enum_val  => MIBenum (absent if the record had none),
# }
my %charsets;
my $curname;    # primary name of the record being parsed, undef between records

while (my $line = <>) {
    $line =~ s/[\r\n]+\z//;

    if ($line =~ /^Name:\s+([^\s]+)[^\[]*(\[([^\]]*)\])?/) {
        # Start of a new charset record.
        my ($name, $ref) = ($1, $3);
        $curname = $name;
        $charsets{$curname} = {
            ref       => $ref,
            names     => [$curname],
            preferred => 0,
            enum_name => 0,
        };
        next;
    }

    next unless defined $curname;

    if ($line =~ /^\s*$/) {
        # A blank line ends the current record.
        undef $curname;
        next;
    }

    if ($line =~ /MIBenum:\s*(\d+)/) {
        # "+ 0" strips leading zeros so the value is never emitted as a C
        # octal literal.
        $charsets{$curname}{enum_val} = $1 + 0;
        next;
    }

    if ($line =~ /Alias:\s+([^\s]+)(\s+\(preferred MIME name\))?/) {
        my ($alias, $preferred_marker) = ($1, $2);
        next if $alias eq 'None';

        my $names = $charsets{$curname}{names};
        push @{$names}, $alias;

        # Both indices refer to the alias that was just appended.
        $charsets{$curname}{preferred} = $#{$names} if defined $preferred_marker;
        $charsets{$curname}{enum_name} = $#{$names} if $alias =~ /^cs/;
    }
}

# Additional aliases: not listed in the IANA registry, but found "in the wild".
my %extra_alias_for = (
    'TIS-620'   => 'windows-874',
    'Shift_JIS' => 'x-sjis',
);
for my $primary (sort keys %extra_alias_for) {
    # Guard against autovivifying a bogus record when the registry input does
    # not contain the primary name.
    if (exists $charsets{$primary}) {
        push @{ $charsets{$primary}{names} }, $extra_alias_for{$primary};
    }
    else {
        warn "No IANA entry for $primary; "
            . "skipping extra alias $extra_alias_for{$primary}\n";
    }
}

# Only charsets that have a MIBenum are emitted; they are ordered by MIBenum
# because the generated lookup functions binary-search the table.
my @sorted_charsets =
    sort { $a->{enum_val} <=> $b->{enum_val} }
    grep { defined $_->{enum_val} } values %charsets;

my $timestamp = scalar gmtime();

# ---------------------------------------------------------------------------
# 3. Write iana_charset.h: the eIANACharset enum plus function prototypes.
# ---------------------------------------------------------------------------

open(my $h_fh, '>', 'iana_charset.h')
    or die "Can't open iana_charset.h for writing: $!";

print {$h_fh}
    "// iana_charset.h\n",
    "// Generated automatically by parse_iana_charsets.pl " . $timestamp . "\n",
    "// DO NOT EDIT!!!\n\n",
    "#ifndef IANA_CHARSET_H__\n",
    "#define IANA_CHARSET_H__\n",
    "enum eIANACharset{\n",
    "\tcsOther = 1, // unregistered character set\n",
    "\tcsUnknown = 2, // used as a default value\n";

for my $cs (@sorted_charsets) {
    # Build a C identifier from the chosen name: drop '-', '_' and ':' and
    # make sure it carries the "cs" prefix.
    my $enum_name = $cs->{names}[ $cs->{enum_name} ];
    $enum_name =~ s/[\-\_\:]+//sg;
    $enum_name = "cs" . $enum_name if $enum_name !~ /^cs/;
    print {$h_fh} "\t$enum_name = $cs->{enum_val},\n";
}

print {$h_fh}
    "\tcsReserved = 3000\n};\n\n",
    "short get_iana_charset(char *cs, int len); \n",
    "char *get_charset_str(short cs); \n",
    "bool supportedCharset(short cs); \n",
    "void setSupportedCharsets(short *cs, int numCharsets);\n",
    "#endif\n";

close($h_fh) or die "Error writing iana_charset.h: $!";

# ---------------------------------------------------------------------------
# 4. Write iana_charset.cpp: the name table and the lookup functions.
# ---------------------------------------------------------------------------

open(my $cpp_fh, '>', 'iana_charset.cpp')
    or die "Can't open iana_charset.cpp for writing: $!";

print {$cpp_fh}
    "// iana_charset.cpp\n",
    "// Generated automatically by parse_iana_charsets.pl " . $timestamp . "\n",
    "// DO NOT EDIT!!!\n\n",
    "#include \"gb-include.h\"\n",
    "#include \"iana_charset.h\"\n",
    "#include \"HashTableX.h\"\n",
    "#include \"Conf.h\"\n",
    "#include \"hash.h\"\n";

print {$cpp_fh} <<'EOL';
typedef struct {
	char *name;
	char *mime;
	short mib_enum;
	char supported;
} IANACharset;
EOL

# One table row per name (primary name and every alias), all rows of a charset
# sharing its preferred MIME name, MIBenum and supported flag.
my @rows;
for my $cs (@sorted_charsets) {
    my $mime_name = $cs->{names}[ $cs->{preferred} ];

    # Report KS_C_5601-1987 as x-windows-949.
    # NOTE(review): presumably because content carrying this label is really
    # Windows code page 949 -- confirm.
    $mime_name = 'x-windows-949' if $mime_name eq 'KS_C_5601-1987';

    my $supported = $supported_charsets{ $cs->{enum_val} } ? "1" : "0";

    for my $name (@{ $cs->{names} }) {
        push @rows,
            "\t{\"$name\", \"$mime_name\", $cs->{enum_val}, $supported}";
    }
}

# join() puts ",\n" between rows only, so no trailing comma has to be trimmed.
print {$cpp_fh}
    "static IANACharset s_charsets[] = {\n",
    join(",\n", @rows),
    "\n};\n\n";

# Single-quoted heredoc: the C++ below is emitted verbatim, never interpolated.
print {$cpp_fh} <<'EOL';
static HashTableX s_table;
static bool s_isInitialized = false;

void reset_iana_charset ( ) {
	s_table.reset();
}

// Slightly modified from getTextEntity
short get_iana_charset(char *cs, int len)
{
	if (!s_isInitialized){
		// set up the hash table
		if ( ! s_table.set ( 8,4,4096,NULL,0,false,0,"ianatbl") )
			return log("build: Could not init table of "
				   "IANA Charsets.");
		// now add in all the charset entries
		long n = (long)sizeof(s_charsets) / (long)sizeof(IANACharset);
		// turn off quickpolling
		char saved = g_conf.m_useQuickpoll;
		g_conf.m_useQuickpoll = false;
		for ( long i = 0 ; i < n ; i++ ) {
			long long h = hash64Lower_a ( s_charsets[i].name,
						      strlen(s_charsets[i].name) );
			// store the charset index in the hash table as score
			if ( ! s_table.addTerm(&h, i+1) )
				return log("build: add term failed");
		}
		g_conf.m_useQuickpoll = saved;
		s_isInitialized = true;
	}
	long long h = hash64Lower_a ( cs , len );
	// get the entity index from table (stored in the score field)
	long i = (long) s_table.getScore ( &h );
	// return csUnknown if no match
	if ( i == 0 ) return csUnknown;
	// return the iso character
	return (short)s_charsets[i-1].mib_enum;
}

char *get_charset_str(short cs)
{
	int s=0;
	// index of the last table row
	int e=sizeof(s_charsets)/sizeof(IANACharset)-1;
	int i;
	if (cs < s_charsets[s].mib_enum) return NULL;
	if (cs > s_charsets[e].mib_enum) return NULL;

	// Binary search
	while (1){
		// Check endpoints
		if (cs == s_charsets[s].mib_enum) return s_charsets[s].mime;
		if (cs ==s_charsets[e].mib_enum) return s_charsets[e].mime;
		// check midpoint
		i = (s+e)/2;
		if (cs ==s_charsets[i].mib_enum) return s_charsets[i].mime;
		// end of search
		if ((e-s)<3) return NULL;
		// reset either endpoint
		if (cs < s_charsets[i].mib_enum){e = i-1;continue;}
		if (cs > s_charsets[i].mib_enum){s = i+1;continue;}
	}
}

// is this charset supported?
bool supportedCharset(short cs)
{
	int s=0;
	// index of the last table row
	int e=sizeof(s_charsets)/sizeof(IANACharset)-1;
	int i;
	if (cs < s_charsets[s].mib_enum) return false;
	if (cs > s_charsets[e].mib_enum) return false;

	// Binary search
	while (1){
		// Check endpoints
		if (cs == s_charsets[s].mib_enum) return s_charsets[s].supported;
		if (cs ==s_charsets[e].mib_enum) return s_charsets[e].supported;
		// check midpoint
		i = (s+e)/2;
		if (cs ==s_charsets[i].mib_enum) return s_charsets[i].supported;
		// end of search
		if ((e-s)<3) return false;
		// reset either endpoint
		if (cs < s_charsets[i].mib_enum){e = i-1;continue;}
		if (cs > s_charsets[i].mib_enum){s = i+1;continue;}
	}
}
EOL

close($cpp_fh) or die "Error writing iana_charset.cpp: $!";