On 2002-03-26 06:58-0800 Pedro Ferreira wrote:
> Please, what is the best tool to convert an ascii file
> with unicode character codes like this:
> U+3400
> U+3405
> to another UTF-8 file with the corresponding unicode
> characters?
This Perl script should do the job:
========== CUT HERE ==========
#!/usr/bin/perl -w
#=============================================================================
# $Id: h2u,v 1.3.2.1 2002/03/26 16:16:53 sunny Exp $
# Converts text files with Unicode characters in the U+nnnn format into UTF-8.
#
# Created by Øyvind A. Holm <[EMAIL PROTECTED]>. License: GNU GPL.
#=============================================================================
use strict;
# Main loop: copy every input line (files named on the command line, or
# STDIN) to STDOUT, replacing each "U+" followed by hex digits with the
# UTF-8 encoding of that code point.
#
# The conventional U+ notation uses four to six hex digits, so accept that
# whole range; matching exactly four digits would turn e.g. "U+1F600" into
# U+1F60 followed by a literal "0".  The quantifier is greedy, so the
# longest run of up to six hex digits after "U+" is taken as the code
# point.  The match is case-insensitive ("u+3400" works as well).
while (my $Line = <>) {
    $Line =~ s/U\+([0-9A-F]{4,6})/widechar(hex($1))/gei;
    print $Line;
}
# widechar($Val)
#
# Encode the Unicode code point $Val as a string of UTF-8 bytes.
#
# Parameters:
#   $Val - the code point as a number (a numeric string works too).
#
# Returns:
#   A string of one to four bytes holding the UTF-8 form of $Val.
#   Values that are not Unicode scalar values -- negative numbers, the
#   surrogates U+D800..U+DFFF and everything above U+10FFFF -- have no
#   well-formed UTF-8 representation (RFC 3629), so they are encoded as
#   U+FFFD REPLACEMENT CHARACTER instead of being written out as
#   ill-formed surrogate or 4-6 byte sequences.
sub widechar {
    my $Val = shift;

    if ($Val < 0 || $Val > 0x10FFFF || ($Val >= 0xD800 && $Val <= 0xDFFF)) {
        $Val = 0xFFFD;
    }

    if ($Val < 0x80) {
        # ASCII is encoded as itself.
        return chr($Val);
    }
    if ($Val < 0x800) {
        # 110xxxxx 10xxxxxx
        return pack("C*", 0xC0 | ($Val >> 6),
                          0x80 | ($Val & 0x3F));
    }
    if ($Val < 0x10000) {
        # 1110xxxx 10xxxxxx 10xxxxxx
        return pack("C*", 0xE0 | ($Val >> 12),
                          0x80 | (($Val >> 6) & 0x3F),
                          0x80 | ($Val & 0x3F));
    }
    # 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return pack("C*", 0xF0 | ($Val >> 18),
                      0x80 | (($Val >> 12) & 0x3F),
                      0x80 | (($Val >> 6) & 0x3F),
                      0x80 | ($Val & 0x3F));
} # widechar()
__END__
========== CUT HERE ==========
This is a modified version of a script I use very often when converting
to/from UTF-8. This version converts U+xxxx into UTF-8; the two
scripts attached to this mail are the versions I use. They convert
numeric HTML entities (&#4660; and &#x1234;) into UTF-8 and vice versa.
Nice thing to have around when editing UTF-8 files in an editor that is
not UTF-8 aware.
Øyvind
+-------------------------------------------------------------------+
| OpenPGP: 0x629022EB 2002-02-24 Øyvind A. Holm <[EMAIL PROTECTED]> |
| Fingerprint: DBE9 8D44 67F7 42AC 2CA1 7651 724E 9D53 6290 22EB |
+----------------| http://www.sunbase.org[/sunny] |-----------------+
#!/usr/bin/perl -w
#=============================================================================
# $Id: u2h,v 1.4 2002/03/16 20:44:23 sunny Exp $
# Converts from UTF-8 charset to HTML numeric entities (&#9786; and &#x263A;).
#
# Options:
# -a convert Ampersand into entity
# -d use Decimal values
# -l also convert Latin-1 characters.
#
# Created by Øyvind A. Holm <[EMAIL PROTECTED]>. License: GNU GPL.
#=============================================================================
use strict;
# Getopt::Std replaces the Perl 4 library getopts.pl, which is no longer
# shipped with perl.  Called without a hash reference, getopts() stores
# each switch in the package variable $main::opt_<letter>, which is what
# the rest of this script reads.
use Getopt::Std;

# Defaults for the switches: -a (ampersand), -d (decimal), -l (Latin-1).
($main::opt_a, $main::opt_d, $main::opt_l) = (0, 0, 0);
getopts('adl');

# Entity written for a literal "&" when -a is given; decimal form with -d,
# hexadecimal otherwise.
my $amp_ent = $main::opt_d ? "&#38;" : "&#x26;";

# Main loop: copy every input line to STDOUT with the multi-byte UTF-8
# sequences replaced by whatever decode_char() returns for them.
while (my $Line = <>) {
    # Escape ampersands first, so the "&" of the entities generated
    # below is not escaped a second time.
    $Line =~ s/&/$amp_ent/g if $main::opt_a;

    # One pass per sequence length, longest first: a lead byte followed
    # by the number of continuation bytes (0x80-0xBF) it announces.
    $Line =~ s/([\xFC-\xFD][\x80-\xBF]{5})/decode_char($1)/ge;
    $Line =~ s/([\xF8-\xFB][\x80-\xBF]{4})/decode_char($1)/ge;
    $Line =~ s/([\xF0-\xF7][\x80-\xBF]{3})/decode_char($1)/ge;
    $Line =~ s/([\xE0-\xEF][\x80-\xBF]{2})/decode_char($1)/ge;
    $Line =~ s/([\xC0-\xDF][\x80-\xBF])/decode_char($1)/ge;

    print $Line;
}
# decode_char($Msg)
#
# Turn one multi-byte UTF-8 sequence into its replacement text.
#
# Parameters:
#   $Msg - byte string that starts with a lead byte (0xC0-0xFD) followed by
#          at least as many continuation bytes (0x80-0xBF) as the lead byte
#          announces.  Anything after that sequence is ignored.
#
# Reads the option flags $main::opt_l and $main::opt_d.
#
# Returns:
#   ""       if $Msg does not start with such a sequence;
#   one byte if $main::opt_l is set and the decoded value is 0x80..0xFF
#            (the Latin-1 character itself);
#   "&#N;"   (decimal) if $main::opt_d is set;
#   "&#xH;"  (upper-case hexadecimal) otherwise.
#
# Ill-formed input is not passed through: overlong encodings, surrogates
# (U+D800..U+DFFF) and values above U+10FFFF are reported as U+FFFD
# REPLACEMENT CHARACTER, as RFC 3629 requires decoders to reject them.
sub decode_char {
    my $Msg = shift;

    my ($Lead, $Rest) = $Msg =~ /^([\xC0-\xFD])([\x80-\xBF]*)/
        or return "";
    my $Byte = ord($Lead);

    # For each lead-byte range: the payload bits carried by the lead byte,
    # the number of continuation bytes, and the smallest value that
    # legitimately needs a sequence of this length.
    my ($Mask, $Count, $Min) =
          $Byte < 0xE0 ? (0x1F, 1, 0x80)
        : $Byte < 0xF0 ? (0x0F, 2, 0x800)
        : $Byte < 0xF8 ? (0x07, 3, 0x10000)
        : $Byte < 0xFC ? (0x03, 4, 0x200000)
        :                (0x01, 5, 0x4000000);
    return "" if length($Rest) < $Count;

    # Each continuation byte contributes its low six bits.
    my $Val = $Byte & $Mask;
    for my $Cont (unpack("C$Count", $Rest)) {
        $Val = ($Val << 6) | ($Cont & 0x3F);
    }

    if ($Val < $Min || $Val > 0x10FFFF
        || ($Val >= 0xD800 && $Val <= 0xDFFF)) {
        $Val = 0xFFFD;
    }

    return chr($Val) if $main::opt_l && $Val <= 0xFF;
    return $main::opt_d ? sprintf("&#%u;", $Val)
                        : sprintf("&#x%X;", $Val);
} # decode_char()
__END__
#!/usr/bin/perl -w
#=============================================================================
# $Id: h2u,v 1.3 2002/03/16 20:44:53 sunny Exp $
# Converts from numeric entities in HTML/SGML (&#9786; and &#x263A;) to UTF-8.
#
# Options:
# -l also convert Latin-1 characters.
#
# Created by Øyvind A. Holm <[EMAIL PROTECTED]>. License: GNU GPL.
#=============================================================================
use strict;
# Getopt::Std replaces the Perl 4 library getopts.pl, which is no longer
# shipped with perl.  Called without a hash reference, getopts() stores
# the switch in the package variable $main::opt_l.
use Getopt::Std;

# Default for the -l (Latin-1) switch.
$main::opt_l = 0;
getopts('l');

# Main loop: copy every input line to STDOUT with numeric character
# references replaced by the bytes widechar() returns for them.
while (my $Line = <>) {
    # With -l, every byte 0x80-0xFF is first re-encoded via widechar().
    # This runs before the entity passes so the bytes they generate are
    # not encoded a second time.
    $Line =~ s/([\x80-\xFF])/widechar(ord($1))/ge if $main::opt_l;

    # Decimal references, then hexadecimal ones (case-insensitive).
    $Line =~ s/&#(\d{1,10});/widechar($1)/ge;
    $Line =~ s/&#x([0-9a-f]{1,8});/widechar(hex($1))/gei;

    print $Line;
}
# widechar($Val)
#
# Encode the Unicode code point $Val as a string of UTF-8 bytes.
#
# Parameters:
#   $Val - the code point as a number (a numeric string such as the
#          digits of a decimal character reference works too).
#
# Returns:
#   A string of one to four bytes holding the UTF-8 form of $Val.
#   Values that are not Unicode scalar values -- negative numbers, the
#   surrogates U+D800..U+DFFF and everything above U+10FFFF -- have no
#   well-formed UTF-8 representation (RFC 3629), so they are encoded as
#   U+FFFD REPLACEMENT CHARACTER instead of being written out as
#   ill-formed surrogate or 4-6 byte sequences.
sub widechar {
    my $Val = shift;

    if ($Val < 0 || $Val > 0x10FFFF || ($Val >= 0xD800 && $Val <= 0xDFFF)) {
        $Val = 0xFFFD;
    }

    if ($Val < 0x80) {
        # ASCII is encoded as itself.
        return chr($Val);
    }
    if ($Val < 0x800) {
        # 110xxxxx 10xxxxxx
        return pack("C*", 0xC0 | ($Val >> 6),
                          0x80 | ($Val & 0x3F));
    }
    if ($Val < 0x10000) {
        # 1110xxxx 10xxxxxx 10xxxxxx
        return pack("C*", 0xE0 | ($Val >> 12),
                          0x80 | (($Val >> 6) & 0x3F),
                          0x80 | ($Val & 0x3F));
    }
    # 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return pack("C*", 0xF0 | ($Val >> 18),
                      0x80 | (($Val >> 12) & 0x3F),
                      0x80 | (($Val >> 6) & 0x3F),
                      0x80 | ($Val & 0x3F));
} # widechar()
__END__