In perl.git, the branch khw/ebcdic has been updated <http://perl5.git.perl.org/perl.git/commitdiff/11e66312239bc787b3f0096923f554ef5edc64c6?hp=f10c945af3d902f93b141af04a0dadeccc7e26b3>
- Log ----------------------------------------------------------------- commit 11e66312239bc787b3f0096923f554ef5edc64c6 Author: Karl Williamson <[email protected]> Date: Fri Apr 5 16:22:36 2013 -0600 XXX rebase, document M dist/IO/t/io_utf8argv.t M t/op/pack.t M t/re/pat.t M t/test.pl commit 6d22a7f4cbd5d7e21bbe6715273a69daa263a663 Author: Karl Williamson <[email protected]> Date: Fri Apr 5 16:20:20 2013 -0600 lib/utf8.pm: Fix pod verbatim line wrap M lib/utf8.pm M t/porting/known_pod_issues.dat ----------------------------------------------------------------------- Summary of changes: dist/IO/t/io_utf8argv.t | 2 +- lib/utf8.pm | 28 +++++++++++++++------------- t/op/pack.t | 4 ++-- t/porting/known_pod_issues.dat | 1 - t/re/pat.t | 2 +- t/test.pl | 18 +++++++++++++----- 6 files changed, 32 insertions(+), 23 deletions(-) diff --git a/dist/IO/t/io_utf8argv.t b/dist/IO/t/io_utf8argv.t index d106d4d..ec3e415 100644 --- a/dist/IO/t/io_utf8argv.t +++ b/dist/IO/t/io_utf8argv.t @@ -16,7 +16,7 @@ plan(tests => 2); open my $fh, ">:raw", 'io_utf8argv'; # Needs a function utf8_bytes_to_native -print $fh utf8a_to_utf8n( +print $fh byte_utf8a_to_utf8n( "\xce\x9c\xe1\xbd\xb7\xce\xb1\x20\xcf\x80\xe1\xbd\xb1\xcf\x80\xce". "\xb9\xce\xb1\x2c\x20\xce\xbc\xe1\xbd\xb0\x20\xcf\x80\xce\xbf\xce". "\xb9\xe1\xbd\xb0\x20\xcf\x80\xe1\xbd\xb1\xcf\x80\xce\xb9\xce\xb1". diff --git a/lib/utf8.pm b/lib/utf8.pm index 1d6992c..90492fe 100644 --- a/lib/utf8.pm +++ b/lib/utf8.pm @@ -29,22 +29,22 @@ utf8 - Perl pragma to enable/disable UTF-8 (or UTF-EBCDIC) in source code =head1 SYNOPSIS - use utf8; - no utf8; + use utf8; + no utf8; - # Convert the internal representation of a Perl scalar to/from UTF-8. + # Convert the internal representation of a Perl scalar to/from UTF-8. - $num_octets = utf8::upgrade($string); - $success = utf8::downgrade($string[, FAIL_OK]); + $num_octets = utf8::upgrade($string); + $success = utf8::downgrade($string[, FAIL_OK]); - # Change each character of a Perl scalar to/from a series of - # characters that represent the UTF-8 bytes of each original character. + # Change each character of a Perl scalar to/from a series of + # characters that represent the UTF-8 bytes of each original character. - utf8::encode($string); # "\x{100}" becomes "\xc4\x80" - utf8::decode($string); # "\xc4\x80" becomes "\x{100}" + utf8::encode($string); # "\x{100}" becomes "\xc4\x80" + utf8::decode($string); # "\xc4\x80" becomes "\x{100}" - $flag = utf8::is_utf8(STRING); # since Perl 5.8.1 - $flag = utf8::valid(STRING); + $flag = utf8::is_utf8(STRING); # since Perl 5.8.1 + $flag = utf8::valid(STRING); =head1 DESCRIPTION @@ -145,7 +145,8 @@ individual I<UTF-X> bytes of the character. The UTF8 flag is turned off. Returns nothing. my $a = "\x{100}"; # $a contains one character, with ord 0x100 - utf8::encode($a); # $a contains two characters, with ords 0xc4 and 0x80 + utf8::encode($a); # $a contains two characters, with ords 0xc4 + # and 0x80 B<Note that this function does not handle arbitrary encodings.> Therefore Encode is recommended for the general purposes; see also @@ -161,7 +162,8 @@ turned on only if the source string contains multiple-byte I<UTF-X> characters. If I<$string> is invalid as I<UTF-X>, returns false; otherwise returns true. - my $a = "\xc4\x80"; # $a contains two characters, with ords 0xc4 and 0x80 + my $a = "\xc4\x80"; # $a contains two characters, with ords 0xc4 + # and 0x80 utf8::decode($a); # $a contains one character, with ord 0x100 B<Note that this function does not handle arbitrary encodings.> diff --git a/t/op/pack.t b/t/op/pack.t index 858fe3b..4853be9 100644 --- a/t/op/pack.t +++ b/t/op/pack.t @@ -1530,7 +1530,7 @@ is(unpack('c'), 65, "one-arg unpack (change #18751)"); # defaulting to $_ # } } -my $U_1FFC_utf8 = utf8a_to_utf8n("\341\277\274"); +my $U_1FFC_utf8 = byte_utf8a_to_utf8n("\341\277\274"); { # U0 and C0 must be scoped my (@x) = unpack("a(U0)U", "b$U_1FFC_utf8"); @@ -1786,7 +1786,7 @@ my $U_1FFC_utf8 = utf8a_to_utf8n("\341\277\274"); is(pack("A*", $high), "\xfeb"); is(pack("Z*", $high), "\xfeb\x00"); - utf8::upgrade($high = utf8a_to_utf8n("\xc3\xbe") . "b"); + utf8::upgrade($high = byte_utf8a_to_utf8n("\xc3\xbe") . "b"); is(pack("U0a2", $high), latin1_to_native("\xfe")); is(pack("U0A2", $high), latin1_to_native("\xfe")); is(pack("U0Z1", $high), latin1_to_native("\x00")); diff --git a/t/porting/known_pod_issues.dat b/t/porting/known_pod_issues.dat index 46790cb..efd1c11 100644 --- a/t/porting/known_pod_issues.dat +++ b/t/porting/known_pod_issues.dat @@ -318,7 +318,6 @@ lib/strict.pm Verbatim line length including indents exceeds 79 by 1 lib/tie/array.pm Verbatim line length including indents exceeds 79 by 1 lib/tie/hash.pm Verbatim line length including indents exceeds 79 by 3 lib/tie/scalar.pm Verbatim line length including indents exceeds 79 by 1 -lib/utf8.pm Verbatim line length including indents exceeds 79 by 4 lib/version.pod Verbatim line length including indents exceeds 79 by 1 lib/version/internals.pod Verbatim line length including indents exceeds 79 by 2 lib/vmsish.pm Verbatim line length including indents exceeds 79 by 1 diff --git a/t/re/pat.t b/t/re/pat.t index 1d378a6..73e6ce2 100644 --- a/t/re/pat.t +++ b/t/re/pat.t @@ -1246,7 +1246,7 @@ EOP # consider the UTF8ness of the previous and current pattern # string, as well as the physical bytes of the pattern string - for my $s (utf8a_to_utf8n("\xc4\x80"), "\x{100}") { + for my $s (byte_utf8a_to_utf8n("\xc4\x80"), "\x{100}") { ok($s =~ /^$s$/, "re-compile check is UTF8-aware"); } } diff --git a/t/test.pl b/t/test.pl index 027bea8..f2bdbd3 100644 --- a/t/test.pl +++ b/t/test.pl @@ -1560,7 +1560,7 @@ sub ord_native_to_latin1 { return utf8::native_to_unicode($ord); } -sub utf8a_to_utf8n { +sub byte_utf8a_to_utf8n { # Convert a UTF-8 byte sequence into the platform's native UTF-8 # equivalent, like UTF-EBCDIC. @@ -1588,15 +1588,19 @@ sub utf8a_to_utf8n { ); my $string = shift; - die "Input to utf8a-to_utf8n() must not be flagged UTF-8" + die "Input to byte_utf8a-to_utf8n() must not be flagged UTF-8" if utf8::is_utf8($string); - return $string if ord('^') == 94; # ASCII, Latin1 + #return $string if ord('^') == 94; # ASCII, Latin1 my $length = length($string); + #diag($string); + #diag($length); my $out = ""; for ($i = 0; $i < $length; $i++) { - my $byte = ord substr($in, $i, 1); + my $byte = ord substr($string, $i, 1); my $byte_count = $utf8_skip[$byte]; + #diag($byte); + #diag($byte_count); die "Illegal start byte" if $byte_count < 0; if ($i + $byte_count > $length) { @@ -1615,13 +1619,15 @@ sub utf8a_to_utf8n { my $ord = $byte & (0x1F >> ($byte_count - 2)); my $bytes_remaining = $byte_count - 1; while ($bytes_remaining > 0) { - $byte = ord substr($in, ++$i, 1); + $byte = ord substr($string, ++$i, 1); unless (($byte & 0xC0) == 0x80) { die sprintf "byte '%X' is not a valid continuation", $byte; } $ord = $ord << 6 | ($byte & 0x3f); $bytes_remaining--; } + #diag($byte); + #diag($ord); my $expected_bytes = $ord < 0x80 ? 1 @@ -1647,6 +1653,8 @@ sub utf8a_to_utf8n { $out .= chr utf8::unicode_to_native($ord); } + utf8::encode($out); # Turn off utf8 flag. + #diag($out); return $out; } -- Perl5 Master Repository
