On 2002-03-26 06:58-0800 Pedro Ferreira wrote:

> Please, what is the best tool to convert an ascii file
> with unicode character codes like this:
> U+3400
> U+3405
> to another UTF-8 file with the corresponding unicode
> characters?

This Perl script should do the job:

========== CUT HERE ==========

#!/usr/bin/perl -w

#=============================================================================
# $Id: h2u,v 1.3.2.1 2002/03/26 16:16:53 sunny Exp $
# Converts text files with Unicode characters on the U+nnnn format into UTF-8.
#
# Created by &#xD8;yvind A. Holm <[EMAIL PROTECTED]>. License: GNU GPL.
#=============================================================================

use strict;

# Filter: copy every input line (named files, or stdin) to stdout, replacing
# each U+nnnn reference with the UTF-8 bytes for that code point. The pattern
# takes exactly four hex digits (either case), so of a longer reference such
# as U+12345 only "U+1234" is converted and the trailing digit is left as is.
while (defined(my $Line = <>)) {
        $Line =~ s/U\+([0-9A-F]{4})/widechar(hex($1))/gei;
        print $Line;
}

# widechar($Val)
#
# Returns the UTF-8 byte sequence for the code point $Val as a string of one
# to six bytes. Values below 0x80 yield a single byte; values up to 0x7FFFFFFF
# use the two- to six-byte forms; anything from 0x80000000 upwards is replaced
# by the encoding of U+FFFD.
sub widechar {
        my $Val = shift;

        # 7-bit values are their own single-byte encoding.
        return sprintf("%c", $Val) if $Val < 0x80;

        # One row per multi-byte form:
        #   [ first value too large for the form, lead byte marker, trail bytes ]
        my @Forms = (
                [      0x800, 0xC0, 1 ],
                [    0x10000, 0xE0, 2 ],
                [   0x200000, 0xF0, 3 ],
                [  0x4000000, 0xF8, 4 ],
                [ 0x80000000, 0xFC, 5 ],
        );

        foreach my $Form (@Forms) {
                my ($Limit, $Marker, $Trail) = @$Form;
                next unless $Val < $Limit;

                # The lead byte carries the bits above all the 6-bit groups ...
                my @Bytes = ($Marker | ($Val >> (6 * $Trail)));

                # ... and every trail byte carries one group, highest first.
                foreach my $Group (reverse 0 .. $Trail - 1) {
                        push @Bytes, 0x80 | (($Val >> (6 * $Group)) & 0x3F);
                }
                return sprintf("%c" x @Bytes, @Bytes);
        }

        # No form is wide enough for this value: emit U+FFFD instead.
        return widechar(0xFFFD);
} # widechar()

__END__

========== CUT HERE ==========

This is a modified version of a script I use very often when converting
to/from UTF-8. This version converts U+xxxx into UTF-8; the two
scripts attached to this mail are the versions I use. They convert
numeric HTML entities (&#x1234; and &#4660;) into UTF-8 and vice versa.
A nice thing to have around when editing UTF-8 files in an editor that is
not UTF-8 aware.

Øyvind

+-------------------------------------------------------------------+
| OpenPGP: 0x629022EB 2002-02-24 Øyvind A. Holm <[EMAIL PROTECTED]> |
| Fingerprint: DBE9 8D44 67F7 42AC 2CA1  7651 724E 9D53 6290 22EB   |
+----------------| http://www.sunbase.org[/sunny] |-----------------+
#!/usr/bin/perl -w

#=============================================================================
# $Id: u2h,v 1.4 2002/03/16 20:44:23 sunny Exp $
# Converts from UTF-8 charset to HTML numeric entities (&#x263A; and &#9786;).
#
# Options:
#   -a  convert Ampersand into entity
#   -d  use Decimal values
#   -l  also convert Latin-1 characters.
#
# Created by &#xD8;yvind A. Holm <[EMAIL PROTECTED]>. License: GNU GPL.
#=============================================================================

use strict;
# getopts.pl is a Perl 4 library that is no longer shipped with Perl (it was
# removed from the core distribution in 5.16), so requiring it aborts the
# script on current installations. Getopt::Std is the core replacement; its
# getopts() sets $main::opt_X in the same way.
use Getopt::Std;

# Preset every switch so the tests below never see an undefined value;
# getopts() sets $main::opt_X for each -X switch given on the command line.
($main::opt_a, $main::opt_d, $main::opt_l) = (0, 0, 0);
getopts('adl');

# Entity written for a literal ampersand, in the numeric base selected by -d.
my $amp_ent = $main::opt_d ? "&#38;" : "&#x26;";

# Filter: copy every input line (named files, or stdin) to stdout with the
# UTF-8 sequences replaced by whatever decode_char() returns for them.
while (<>) {
        # Ampersands first, so the "&" of the entities generated below is
        # not escaped a second time.
        s/&/$amp_ent/g if $main::opt_a;

        # One pass per sequence length, six bytes down to two. The two-byte
        # pass runs last: with -l it can emit raw bytes >= 0x80, which the
        # other passes must not get to see.
        s/([\xFC-\xFD][\x80-\xBF]{5})/decode_char($1)/ge;
        s/([\xF8-\xFB][\x80-\xBF]{4})/decode_char($1)/ge;
        s/([\xF0-\xF7][\x80-\xBF]{3})/decode_char($1)/ge;
        s/([\xE0-\xEF][\x80-\xBF]{2})/decode_char($1)/ge;
        s/([\xC0-\xDF][\x80-\xBF])/decode_char($1)/ge;
        print;
}

# Warning: decode_char() accepts overlong sequences.

# decode_char($Msg)
#
# $Msg must start with one UTF-8 sequence of two to six bytes; bytes after
# that sequence are ignored. Returns the numeric entity for the encoded value:
# hexadecimal ("&#x263A;") by default, decimal ("&#9786;") when $main::opt_d
# is set. When $main::opt_l is set, a two-byte sequence whose value is 0xFF or
# less is returned as that single character instead of an entity. Returns the
# empty string if $Msg does not start with such a sequence. Overlong
# sequences are decoded like any other.
sub decode_char {
        my $Msg = shift;
        my @Octets;             # captured bytes, lead byte first
        my $Lead_Bits;          # payload bits of the lead byte
        my $Two_Byte = 0;       # true for the two-byte form only

        if (@Octets = ($Msg =~ /^([\xC0-\xDF])([\x80-\xBF])/)) {
                ($Lead_Bits, $Two_Byte) = (0x1F, 1);
        } elsif (@Octets = ($Msg =~ /^([\xE0-\xEF])([\x80-\xBF])([\x80-\xBF])/)) {
                $Lead_Bits = 0x0F;
        } elsif (@Octets = ($Msg =~ /^([\xF0-\xF7])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])/)) {
                $Lead_Bits = 0x07;
        } elsif (@Octets = ($Msg =~ /^([\xF8-\xFB])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])/)) {
                $Lead_Bits = 0x03;
        } elsif (@Octets = ($Msg =~ /^([\xFC-\xFD])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])/)) {
                $Lead_Bits = 0x01;
        } else {
                return "";
        }

        # Start from the payload of the lead byte and append the low six
        # bits of every trail byte in turn.
        my $Val = ord(shift @Octets) & $Lead_Bits;
        foreach my $Octet (@Octets) {
                $Val = ($Val << 6) | (ord($Octet) & 0x3F);
        }

        # -l only affects the two-byte form; longer sequences always
        # become entities, whatever their value.
        if ($Two_Byte && $main::opt_l && ($Val <= 0xFF)) {
                return chr($Val);
        }

        return $main::opt_d ? sprintf("&#%u;", $Val)
                            : sprintf("&#x%X;", $Val);
} # decode_char()

__END__
#!/usr/bin/perl -w

#=============================================================================
# $Id: h2u,v 1.3 2002/03/16 20:44:53 sunny Exp $
# Converts from numeric entities in HTML/SGML (&#x263A; and &#9786;) to UTF-8.
#
# Options:
#   -l  also convert Latin-1 characters.
#
# Created by &#xD8;yvind A. Holm <[EMAIL PROTECTED]>. License: GNU GPL.
#=============================================================================

use strict;
# getopts.pl is a Perl 4 library that is no longer shipped with Perl (it was
# removed from the core distribution in 5.16), so requiring it aborts the
# script on current installations. Getopt::Std is the core replacement; its
# getopts() sets $main::opt_l in the same way.
use Getopt::Std;

# Preset the switch so the test below never sees an undefined value.
$main::opt_l = 0;

getopts('l');

# Filter: copy every input line (named files, or stdin) to stdout with the
# numeric entities replaced by the UTF-8 bytes of the character they name.
while (<>) {
        # -l: re-encode every byte >= 0x80 as UTF-8. This has to happen
        # before the entity passes, which themselves emit such bytes.
        s/([\x80-\xFF])/widechar(ord($1))/ge if $main::opt_l;

        # Decimal entities (&#9786;), then hexadecimal ones (&#x263A;).
        s/&#(\d{1,10});/widechar($1)/ge;
        s/&#x([0-9a-f]{1,8});/widechar(hex($1))/gei;
        print;
}

# widechar($Val)
#
# Returns the UTF-8 byte sequence for the code point $Val as a string of one
# to six bytes. Values below 0x80 yield a single byte; values up to 0x7FFFFFFF
# use the two- to six-byte forms; anything from 0x80000000 upwards is replaced
# by the encoding of U+FFFD.
sub widechar {
        my $Val = shift;

        # 7-bit values are their own single-byte encoding.
        return sprintf("%c", $Val) if $Val < 0x80;

        # One row per multi-byte form:
        #   [ first value too large for the form, lead byte marker, trail bytes ]
        my @Forms = (
                [      0x800, 0xC0, 1 ],
                [    0x10000, 0xE0, 2 ],
                [   0x200000, 0xF0, 3 ],
                [  0x4000000, 0xF8, 4 ],
                [ 0x80000000, 0xFC, 5 ],
        );

        foreach my $Form (@Forms) {
                my ($Limit, $Marker, $Trail) = @$Form;
                next unless $Val < $Limit;

                # The lead byte carries the bits above all the 6-bit groups ...
                my @Bytes = ($Marker | ($Val >> (6 * $Trail)));

                # ... and every trail byte carries one group, highest first.
                foreach my $Group (reverse 0 .. $Trail - 1) {
                        push @Bytes, 0x80 | (($Val >> (6 * $Group)) & 0x3F);
                }
                return sprintf("%c" x @Bytes, @Bytes);
        }

        # No form is wide enough for this value: emit U+FFFD instead.
        return widechar(0xFFFD);
} # widechar()

__END__

Reply via email to