On 2002-03-26 06:58-0800 Pedro Ferreira wrote: > Please, what is the best tool to convert an ascii file > with unicode character codes like this: > U+3400 > U+3405 > to another UTF-8 file with the corresponding unicode > characters?
This Perl script should do the job:

========== CUT HERE ==========
#!/usr/bin/perl -w
#=============================================================================
# $Id: h2u,v 1.3.2.1 2002/03/26 16:16:53 sunny Exp $
# Converts text files with Unicode characters on the U+nnnn format into UTF-8.
# References with five or six hex digits (U+nnnnn, U+nnnnnn) are accepted
# too, as long as the digits are not directly followed by a letter or digit.
#
# Created by Øyvind A. Holm <[EMAIL PROTECTED]>. License: GNU GPL.
#=============================================================================

use strict;

# Filter: read the files named on the command line (or stdin) line by line
# and write the converted text to stdout.
while (my $line = <>) {
    print expand_codepoints($line);
}

# expand_codepoints($text)
#
# Returns a copy of $text in which every "U+" reference (case-insensitive)
# has been replaced by the UTF-8 byte sequence of that code point.
#
# The first alternative takes 4-6 hex digits, but only when the run is not
# immediately followed by another ASCII letter or digit. Otherwise the
# second alternative takes exactly four digits, so "U+00C9cole" still
# yields U+00C9 followed by "cole", while "U+20000" is read as one code
# point instead of U+2000 plus a stray "0".
sub expand_codepoints {
    my ($text) = @_;
    $text =~ s/U\+([0-9A-F]{4,6}(?![0-9A-Z])|[0-9A-F]{4})/widechar(hex($1))/gei;
    return $text;
} # expand_codepoints()

# widechar($Val)
#
# Returns the UTF-8 encoding of the integer code point $Val as a string of
# bytes. Values below 0x80000000 are encoded with one to six bytes (the
# original 31-bit UTF-8 scheme); anything larger is replaced by the encoding
# of U+FFFD, the replacement character. No check is made for surrogates or
# for values above U+10FFFF.
sub widechar {
    my ($Val) = @_;

    # ASCII is passed through as a single byte.
    return chr($Val) if $Val < 0x80;

    # [ exclusive upper limit, lead-byte marker, continuation byte count ]
    my @forms = (
        [ 0x800,      0xC0, 1 ],
        [ 0x10000,    0xE0, 2 ],
        [ 0x200000,   0xF0, 3 ],
        [ 0x4000000,  0xF8, 4 ],
        [ 0x80000000, 0xFC, 5 ],
    );

    for my $form (@forms) {
        my ($limit, $marker, $trailing) = @{$form};
        next if $Val >= $limit;

        # The lead byte carries the top bits, every continuation byte the
        # next six bits, most significant group first.
        my @bytes = ($marker | ($Val >> (6 * $trailing)));
        for my $group (reverse 0 .. $trailing - 1) {
            push @bytes, 0x80 | (($Val >> (6 * $group)) & 0x3F);
        }
        return pack('C*', @bytes);
    }

    return widechar(0xFFFD);
} # widechar()

__END__
========== CUT HERE ==========

This is a modified version of a script I use very often when converting to/from UTF-8. This version converts U+xxxx into UTF-8, the two scripts attached to this mail are the versions I use, they convert numeric HTML entities (&#x1234; and &#4660;) into UTF-8 and vice versa. Nice thing to have around when editing UTF-8 files in an editor that is not UTF-8 aware.

Øyvind
+-------------------------------------------------------------------+
| OpenPGP: 0x629022EB 2002-02-24 Øyvind A. 
Holm <[EMAIL PROTECTED]> | | Fingerprint: DBE9 8D44 67F7 42AC 2CA1 7651 724E 9D53 6290 22EB | +----------------| http://www.sunbase.org[/sunny] |-----------------+
#!/usr/bin/perl -w
#=============================================================================
# $Id: u2h,v 1.4 2002/03/16 20:44:23 sunny Exp $
# Converts from UTF-8 charset to HTML numeric entities (&#x263A; and &#9786;).
#
# Options:
#   -a  convert Ampersand into entity
#   -d  use Decimal values
#   -l  also convert Latin-1 characters.
#
# Created by Øyvind A. Holm <[EMAIL PROTECTED]>. License: GNU GPL.
#=============================================================================

use strict;
use warnings;
use Getopt::Std;    # replaces the Perl 4 "getopts.pl", gone from core since 5.16

# Command line switches; all default to off.
my %opt = (a => 0, d => 0, l => 0);
getopts('adl', \%opt);

# Multi-byte UTF-8 sequences, longest form first. These must be set up
# before the main loop runs because the loop uses them.
my @sequence_patterns = (
    qr/[\xFC-\xFD][\x80-\xBF]{5}/,
    qr/[\xF8-\xFB][\x80-\xBF]{4}/,
    qr/[\xF0-\xF7][\x80-\xBF]{3}/,
    qr/[\xE0-\xEF][\x80-\xBF]{2}/,
    qr/[\xC0-\xDF][\x80-\xBF]/,
);

# Filter: read the files named on the command line (or stdin) line by line
# and write the converted text to stdout.
while (my $line = <>) {
    print utf8_to_entities($line);
}

# utf8_to_entities($text)
#
# Returns a copy of $text with every multi-byte UTF-8 sequence replaced by
# the output of decode_char(). With -a, literal ampersands are turned into a
# numeric entity first, so that the ampersands of the entities generated
# afterwards are not escaped a second time.
sub utf8_to_entities {
    my ($text) = @_;

    if ($opt{a}) {
        my $amp_ent = numeric_entity(ord('&'), $opt{d});
        $text =~ s/&/$amp_ent/g;
    }
    for my $pattern (@sequence_patterns) {
        $text =~ s/($pattern)/decode_char($1)/ge;
    }
    return $text;
} # utf8_to_entities()

# numeric_entity($codepoint, $decimal)
#
# Returns the numeric character reference for $codepoint: decimal when
# $decimal is true, upper-case hexadecimal otherwise. The text is built with
# sprintf so that no literal entity appears in this source where an HTML
# renderer could silently decode it.
sub numeric_entity {
    my ($codepoint, $decimal) = @_;
    return $decimal
        ? sprintf('&#%u;',  $codepoint)
        : sprintf('&#x%X;', $codepoint);
} # numeric_entity()

# decode_char($Msg)
#
# Takes a string starting with one multi-byte UTF-8 sequence and returns its
# numeric entity (decimal with -d, hexadecimal otherwise). With -l, a
# two-byte sequence whose value is 0xFF or less is returned as the single
# character with that value instead of an entity. Returns the empty string
# when the argument does not start with a complete multi-byte sequence.
#
# Warning: decode_char() accepts overlong sequences.
sub decode_char {
    my ($Msg) = @_;

    my ($lead, @tail) = unpack('C*', $Msg);
    return "" unless defined $lead;

    # [ lowest lead byte, highest lead byte, payload mask, continuation count ]
    my @forms = (
        [ 0xC0, 0xDF, 0x1F, 1 ],
        [ 0xE0, 0xEF, 0x0F, 2 ],
        [ 0xF0, 0xF7, 0x07, 3 ],
        [ 0xF8, 0xFB, 0x03, 4 ],
        [ 0xFC, 0xFD, 0x01, 5 ],
    );

    for my $form (@forms) {
        my ($low, $high, $mask, $count) = @{$form};
        next if $lead < $low || $lead > $high;
        return "" if @tail < $count;

        # Each continuation byte contributes its low six bits.
        my $value = $lead & $mask;
        for my $byte (@tail[0 .. $count - 1]) {
            return "" if $byte < 0x80 || $byte > 0xBF;
            $value = ($value << 6) | ($byte & 0x3F);
        }

        # Only the two-byte form can hold values in the Latin-1 range.
        return chr($value) if $count == 1 && $opt{l} && $value <= 0xFF;
        return numeric_entity($value, $opt{d});
    }

    return "";
} # decode_char()

__END__
#!/usr/bin/perl -w
#=============================================================================
# $Id: h2u,v 1.3 2002/03/16 20:44:53 sunny Exp $
# Converts from numeric entities in HTML/SGML (&#x263A; and &#9786;) to UTF-8.
#
# Options:
#   -l  also convert Latin-1 characters.
#
# Created by Øyvind A. Holm <[EMAIL PROTECTED]>. License: GNU GPL.
#=============================================================================

use strict;
use warnings;
use Getopt::Std;    # replaces the Perl 4 "getopts.pl", gone from core since 5.16

# Command line switches; -l defaults to off.
my %opt = (l => 0);
getopts('l', \%opt);

# Filter: read the files named on the command line (or stdin) line by line
# and write the converted text to stdout.
while (my $line = <>) {
    print entities_to_utf8($line, $opt{l});
}

# entities_to_utf8($text, $latin1)
#
# Returns a copy of $text in which decimal and hexadecimal numeric entities
# are replaced by the UTF-8 encoding of their value. When $latin1 is true,
# every byte in the range 0x80-0xFF is first re-encoded as UTF-8, i.e. the
# input is treated as Latin-1. That step has to run before the entities are
# expanded, otherwise the freshly generated UTF-8 bytes would be encoded a
# second time.
sub entities_to_utf8 {
    my ($text, $latin1) = @_;

    $text =~ s/([\x80-\xFF])/widechar(ord($1))/ge if $latin1;
    $text =~ s/&#(\d{1,10});/widechar($1)/ge;
    $text =~ s/&#x([0-9a-f]{1,8});/widechar(hex($1))/gei;
    return $text;
} # entities_to_utf8()

# widechar($Val)
#
# Returns the UTF-8 encoding of the integer code point $Val as a string of
# bytes. Values below 0x80000000 are encoded with one to six bytes (the
# original 31-bit UTF-8 scheme); anything larger is replaced by the encoding
# of U+FFFD, the replacement character. No check is made for surrogates or
# for values above U+10FFFF.
sub widechar {
    my ($Val) = @_;

    # ASCII is passed through as a single byte.
    return chr($Val) if $Val < 0x80;

    # [ exclusive upper limit, lead-byte marker, continuation byte count ]
    my @forms = (
        [ 0x800,      0xC0, 1 ],
        [ 0x10000,    0xE0, 2 ],
        [ 0x200000,   0xF0, 3 ],
        [ 0x4000000,  0xF8, 4 ],
        [ 0x80000000, 0xFC, 5 ],
    );

    for my $form (@forms) {
        my ($limit, $marker, $trailing) = @{$form};
        next if $Val >= $limit;

        # The lead byte carries the top bits, every continuation byte the
        # next six bits, most significant group first.
        my @bytes = ($marker | ($Val >> (6 * $trailing)));
        for my $group (reverse 0 .. $trailing - 1) {
            push @bytes, 0x80 | (($Val >> (6 * $group)) & 0x3F);
        }
        return pack('C*', @bytes);
    }

    return widechar(0xFFFD);
} # widechar()

__END__