In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/112284f43082e616b936428a3b2d4877ef232864?hp=ed996a54e9e136efd3812a9f2d00d7102ed0ef1a>
- Log ----------------------------------------------------------------- commit 112284f43082e616b936428a3b2d4877ef232864 Author: Karl Williamson <[email protected]> Date: Mon Nov 24 10:41:23 2014 -0700 perlvar: Add info M pod/perlvar.pod commit 27c3afbd6068ac83b49a11df3e33758ef059027e Author: Karl Williamson <[email protected]> Date: Mon Nov 24 10:34:27 2014 -0700 charnames: Generalize to work on non-ASCII platforms This includes the tests. The character names are now stored in native order. This means that pack('U') no longer works on non-ASCII platforms. Use chr instead, mostly, and pack('W*') for a sequence. These changes required the 'encoding' pragma to no longer affect e.g., chr() outside its scope, which was recently done by 3e669301f0a6fa34269f0e1eaf1fbbd72cae498a. M lib/_charnames.pm M lib/charnames.t M t/lib/charnames/alias commit 374aed2060f1b45b00ae9ad47fb986b4619492c7 Author: Karl Williamson <[email protected]> Date: Sun Nov 23 21:44:17 2014 -0700 charnames: Don't return UTF-8 unless have to. A Latin-1 range character doesn't require UTF-8 to be represented. Not using UTF-8 speeds things up a lot. This commit changes the return to be UTF-8 only if necessary. This could be because 1) the character isn't representable except in UTF-8; or 2) we are being called at runtime and the character isn't ASCII-range. The only way we can currently guarantee that an upper-Latin1 character will be treated with Unicode rules is to make it UTF-8 (the Unicode bug). The compile-time code (in toke.c) knows how to properly deal with non-UTF-8 upper Latin1 range characters. The utf8::downgrade introduced in this commit is temporary. Future commits will change things so that there is no need for it. M lib/_charnames.pm commit 896eff3c48ae9560ce70fb8a9e729558e65592da Author: Karl Williamson <[email protected]> Date: Sun Nov 23 21:50:41 2014 -0700 toke.c: Ignore 'use encoding' on \N{} The encoding pragma converts from a specified encoding into Unicode. \N{} already returns the Unicode form, so the encoding pragma should not operate on them. This commit ensures that. The only reason things have appeared to work prior to this commit is that \N{} has generally returned its value in UTF-8, which 'encoding' knows enough to not disturb. However, a custom name translator installed in the program need not return in UTF-8, so this is a bug that just hasn't yet been exposed. However, the next commit is about to change things so that a regular \N{} only returns UTF-8 if it has to, so this bug would come up a lot more often. There is no need for adding a test case, because, without this commit existing tests would fail in t/uni/greek.t. M toke.c commit b64912e99f61d14bb5e6ec9388cee8affde67198 Author: Karl Williamson <[email protected]> Date: Sun Nov 23 10:18:32 2014 -0700 lib/_charnames.pm: Change variable name This lexical variable's use is about to change so it no longer will always be in UTF-8 encoding. So the name would become misleading. M lib/_charnames.pm commit 055bf491ea6f4dcf5702e3c09236b562e2943219 Author: Karl Williamson <[email protected]> Date: Sun Nov 23 10:14:22 2014 -0700 charnames: Nit in comments, pod M lib/_charnames.pm M lib/charnames.pm commit fa1e80ba41f52550cc193f33fda5d48962169788 Author: Karl Williamson <[email protected]> Date: Sun Nov 23 10:13:28 2014 -0700 charnames: Bump version to 1.43 M lib/_charnames.pm M lib/charnames.pm commit 716d5ef3fce3654c265dfdfbdc7112c38271beba Author: Karl Williamson <[email protected]> Date: Sat Nov 22 09:47:02 2014 -0700 toke.c: Typo in comment M toke.c ----------------------------------------------------------------------- Summary of changes: lib/_charnames.pm | 76 +++++++++++++++++++++++++++++---------------------- lib/charnames.pm | 4 +-- lib/charnames.t | 56 ++++++++++++++++++------------------- pod/perlvar.pod | 4 ++- t/lib/charnames/alias | 35 ++++++++++++++++++------ toke.c | 4 +-- 6 files changed, 103 insertions(+), 76 deletions(-) diff --git a/lib/_charnames.pm b/lib/_charnames.pm index 729d849..bb7d7c6 100644 --- a/lib/_charnames.pm +++ b/lib/_charnames.pm @@ -6,7 +6,7 @@ package _charnames; use strict; use warnings; -our $VERSION = '1.42'; +our $VERSION = '1.43'; use unicore::Name; # mktables-generated algorithmically-defined names use bytes (); # for $bytes::hint_bits @@ -65,10 +65,10 @@ $Carp::Internal{ (__PACKAGE__) } = 1; my %system_aliases = ( - 'SINGLE-SHIFT 2' => pack("U", utf8::unicode_to_native(0x8E)), - 'SINGLE-SHIFT 3' => pack("U", utf8::unicode_to_native(0x8F)), - 'PRIVATE USE 1' => pack("U", utf8::unicode_to_native(0x91)), - 'PRIVATE USE 2' => pack("U", utf8::unicode_to_native(0x92)), + 'SINGLE-SHIFT 2' => chr utf8::unicode_to_native(0x8E), + 'SINGLE-SHIFT 3' => chr utf8::unicode_to_native(0x8F), + 'PRIVATE USE 1' => chr utf8::unicode_to_native(0x91), + 'PRIVATE USE 2' => chr utf8::unicode_to_native(0x92), ); # These are the aliases above that differ under :loose and :full matching @@ -77,15 +77,15 @@ my %system_aliases = ( #); #my %deprecated_aliases; -#$deprecated_aliases{'BELL'} = pack("U", utf8::unicode_to_native(0x07)) if $^V lt v5.17.0; +#$deprecated_aliases{'BELL'} = chr utf8::unicode_to_native(0x07) if $^V lt v5.17.0; #my %loose_deprecated_aliases = ( #); # These are special cased in :loose matching, differing only in a medial # hyphen -my $HANGUL_JUNGSEONG_O_E_utf8 = pack("U", 0x1180); -my $HANGUL_JUNGSEONG_OE_utf8 = pack("U", 0x116C); +my $HANGUL_JUNGSEONG_O_E_utf8 = chr 0x1180; +my $HANGUL_JUNGSEONG_OE_utf8 = chr 0x116C; my $txt; # The table of official character names @@ -163,7 +163,7 @@ sub alias (@) # Set up a single alias } if ($value =~ $decimal_qr) { no warnings qw(non_unicode surrogate nonchar); # Allow any of these - $^H{charnames_ord_aliases}{$name} = pack("U", $value); + $^H{charnames_ord_aliases}{$name} = chr $value; # Use a canonical form. $^H{charnames_inverse_ords}{sprintf("%05X", $value)} = $name; @@ -285,7 +285,7 @@ sub lookup_name ($$$) { # It looks first in the aliases, then in the large table of official Unicode # names. - my $utf8; # The string result + my $result; # The string result my $save_input; if ($runtime) { @@ -298,7 +298,7 @@ sub lookup_name ($$$) { || (! defined $hints_ref->{charnames_full} && ! defined $hints_ref->{charnames_loose}); - # At runtime, but currently not at compile time, $^H gets + # At runtime, but currently not at compile time, %^H gets # stringified, so un-stringify back to the original data structures. # These get thrown away by perl before the next invocation # Also fill in the hash with the non-stringified data. @@ -321,7 +321,7 @@ sub lookup_name ($$$) { # User alias should be checked first or else can't override ours, and if we # were to add any, could conflict with theirs. if (exists $^H{charnames_ord_aliases}{$name}) { - $utf8 = $^H{charnames_ord_aliases}{$name}; + $result = $^H{charnames_ord_aliases}{$name}; } elsif (exists $^H{charnames_name_aliases}{$name}) { $name = $^H{charnames_name_aliases}{$name}; @@ -362,13 +362,13 @@ sub lookup_name ($$$) { # interested in convenience over speed, and the time for this second check # is miniscule compared to the rest of the routine. if (exists $system_aliases{$lookup_name}) { - $utf8 = $system_aliases{$lookup_name}; + $result = $system_aliases{$lookup_name}; } # There are currently no entries in this hash, so don't waste time looking # for them. But the code is retained for the unlikely possibility that # some will be added in the future. # elsif ($loose && exists $loose_system_aliases{$lookup_name}) { -# $utf8 = $loose_system_aliases{$lookup_name}; +# $result = $loose_system_aliases{$lookup_name}; # } # if (exists $deprecated_aliases{$lookup_name}) { # require warnings; @@ -376,7 +376,7 @@ sub lookup_name ($$$) { # "Unicode character name \"$name\" is deprecated, use \"" # . viacode(ord $deprecated_aliases{$lookup_name}) # . "\" instead"); -# $utf8 = $deprecated_aliases{$lookup_name}; +# $result = $deprecated_aliases{$lookup_name}; # } # There are currently no entries in this hash, so don't waste time looking # for them. But the code is retained for the unlikely possibility that @@ -387,21 +387,21 @@ sub lookup_name ($$$) { # "Unicode character name \"$name\" is deprecated, use \"" # . viacode(ord $loose_deprecated_aliases{$lookup_name}) # . "\" instead"); -# $utf8 = $loose_deprecated_aliases{$lookup_name}; +# $result = $loose_deprecated_aliases{$lookup_name}; # } } my @off; # Offsets into table of pattern match begin and end # If haven't found it yet... - if (! defined $utf8) { + if (! defined $result) { # See if has looked this input up earlier. if (! $loose && $^H{charnames_full} && exists $full_names_cache{$name}) { - $utf8 = $full_names_cache{$name}; + $result = $full_names_cache{$name}; } elsif ($loose && exists $loose_names_cache{$name}) { - $utf8 = $loose_names_cache{$name}; + $result = $loose_names_cache{$name}; } else { # Here, must do a look-up @@ -432,7 +432,7 @@ sub lookup_name ($$$) { if (($loose || $^H{charnames_full}) && (defined (my $ord = charnames::name_to_code_point_special($lookup_name, $loose)))) { - $utf8 = pack("U", $ord); + $result = chr $ord; } else { @@ -525,14 +525,14 @@ sub lookup_name ($$$) { # therefore yield the very last character in the table, which should # also be a \n, so the statement works anyway.) if (substr($txt, $off[0] - 7, 1) eq "\n") { - $utf8 = pack("U", CORE::hex substr($txt, $off[0] - 6, 5)); + $result = chr CORE::hex substr($txt, $off[0] - 6, 5); # Handle the single loose matching special case, in which two names # differ only by a single medial hyphen. If the original had a # hyphen (or more) in the right place, then it is that one. - $utf8 = $HANGUL_JUNGSEONG_O_E_utf8 + $result = $HANGUL_JUNGSEONG_O_E_utf8 if $loose - && $utf8 eq $HANGUL_JUNGSEONG_OE_utf8 + && $result eq $HANGUL_JUNGSEONG_OE_utf8 && $name =~ m/O \s* - [-\s]* E/ix; # Note that this wouldn't work if there were a 2nd # OE in the name @@ -544,7 +544,7 @@ sub lookup_name ($$$) { # The +1 skips past that newline, or, if the rindex() fails, to put # us to an offset of zero. my $charstart = rindex($txt, "\n", $off[0] - 7) + 1; - $utf8 = pack("U*", map { CORE::hex } + $result = pack("W*", map { CORE::hex } split " ", substr($txt, $charstart, $off[0] - $charstart - 1)); } } @@ -553,15 +553,27 @@ sub lookup_name ($$$) { # again, but only if it came from the one search that we cache. # (Haven't bothered with the pain of sorting out scoping issues for the # scripts searches.) - $cache_ref->{$name} = $utf8 if defined $cache_ref; + $cache_ref->{$name} = $result if defined $cache_ref; } } - - # Here, have the utf8. If the return is to be an ord, must be any single - # character. + # Here, have the result character. If the return is to be an ord, must be + # any single character. if ($wants_ord) { - return ord($utf8) if length $utf8 == 1; + return ord($result) if length $result == 1; + } + elsif (! utf8::is_utf8($result)) { + + # Here isn't UTF-8. That's OK if it is all ASCII, or we are being called + # at compile time where we know we can guarantee that Unicode rules are + # correctly imposed on the result, or under 'bytes' where we don't want + # those rules. But otherwise we have to make it UTF8 to guarantee Unicode + # rules on the returned string. + return $result if ! $runtime + || (caller $runtime)[8] & $bytes::hint_bits + || $result !~ /[[:^ascii:]]/; + utf8::upgrade($result); + return $result; } else { @@ -570,7 +582,7 @@ sub lookup_name ($$$) { my $in_bytes = ($runtime) ? (caller $runtime)[8] & $bytes::hint_bits : $^H & $bytes::hint_bits; - return $utf8 if (! $in_bytes || utf8::downgrade($utf8, 1)) # The 1 arg + return $result if (! $in_bytes || utf8::downgrade($result, 1)) # The 1 arg # means don't die on failure } @@ -594,10 +606,10 @@ sub lookup_name ($$$) { # Only other possible failure here is from use bytes. if ($runtime) { - carp not_legal_use_bytes_msg($name, $utf8); + carp not_legal_use_bytes_msg($name, $result); return; } else { - croak not_legal_use_bytes_msg($name, $utf8); + croak not_legal_use_bytes_msg($name, $result); } } # lookup_name diff --git a/lib/charnames.pm b/lib/charnames.pm index 6f5d51d..28e0282 100644 --- a/lib/charnames.pm +++ b/lib/charnames.pm @@ -1,7 +1,7 @@ package charnames; use strict; use warnings; -our $VERSION = '1.41'; +our $VERSION = '1.43'; use unicore::Name; # mktables-generated algorithmically-defined names use _charnames (); # The submodule for this where most of the work gets done @@ -297,7 +297,7 @@ matched name) or to a numeric code point (ordinal). The latter is useful for assigning names to code points in Unicode private use areas such as U+E800 through U+F8FF. -A numeric code point must be a non-negative integer or a string beginning +A numeric code point must be a non-negative integer, or a string beginning with C<"U+"> or C<"0x"> with the remainder considered to be a hexadecimal integer. A literal numeric constant must be unsigned; it will be interpreted as hex if it has a leading zero or contains diff --git a/lib/charnames.t b/lib/charnames.t index bd0c21e..e115811 100644 --- a/lib/charnames.t +++ b/lib/charnames.t @@ -65,7 +65,7 @@ EOE use charnames ":alias" => { mychar1 => "0xE8000", mychar2 => 983040, # U+F0000 mychar3 => "U+100000", - myctrl => 0x80, + myctrl => utf8::unicode_to_native(0x80), mylarge => "U+111000", }; is ("\N{PILE OF POO}", chr(0x1F4A9), "Verify :alias alone implies :full"); @@ -77,29 +77,14 @@ EOE is (charnames::viacode(0x100000), "mychar3", "And that can get the alias back"); is ("\N{mylarge}", chr(0x111000), "Verify that can define alias beyond Unicode"); is (charnames::viacode(0x111000), "mylarge", "And that can get the alias back"); - is (charnames::viacode(0x80), "myctrl", "Verify that can name a nameless control"); + is (charnames::viacode(utf8::unicode_to_native(0x80)), "myctrl", "Verify that can name a nameless control"); } -my $encoded_be; -my $encoded_alpha; -my $encoded_bet; -my $encoded_deseng; - -# If octal representation of unicode char is \0xyzt, then the utf8 is \3xy\2zt -if (ord('A') == 65) { # as on ASCII or UTF-8 machines - $encoded_be = "\320\261"; - $encoded_alpha = "\316\261"; - $encoded_bet = "\327\221"; - $encoded_deseng = "\360\220\221\215"; -} -else { # EBCDIC where UTF-EBCDIC may be used (this may be 1047 specific since - # UTF-EBCDIC is codepage specific) - $encoded_be = "\270\102\130"; - $encoded_alpha = "\264\130"; - $encoded_bet = "\270\125\130"; - $encoded_deseng = "\336\102\103\124"; -} +my $encoded_be = byte_utf8a_to_utf8n("\320\261"); +my $encoded_alpha = byte_utf8a_to_utf8n("\316\261"); +my $encoded_bet = byte_utf8a_to_utf8n("\327\221"); +my $encoded_deseng = byte_utf8a_to_utf8n("\360\220\221\215"); sub to_bytes { unpack"U0a*", shift; @@ -235,7 +220,7 @@ sub test_vianame ($$$) { use bytes; is(charnames::vianame("GOTHIC LETTER AHSA"), 0x10330, "Verify vianame \\N{name} is unaffected by 'use bytes'"); - is(charnames::vianame("U+FF"), chr(0xFF), "Verify vianame \\N{U+FF} is unaffected by 'use bytes'"); + is(charnames::vianame("U+FF"), chr(utf8::unicode_to_native(0xFF)), "Verify vianame \\N{U+FF} is unaffected by 'use bytes'"); cmp_ok($warning_count, '==', scalar @WARN, "Verify vianame doesn't warn on legal inputs under 'use bytes'"); ok(! defined charnames::vianame("U+100"), "Verify vianame \\N{U+100} is undef under 'use bytes'"); ok($warning_count == scalar @WARN - 1 && $WARN[-1] =~ /above 0xFF/, "Verify vianame gives appropriate warning for previous test"); @@ -244,9 +229,9 @@ sub test_vianame ($$$) { ok(! defined charnames::string_vianame("GOTHIC LETTER AHSA"), "Verify string_vianame(\"GOTHIC LETTER AHSA\") is undefined under 'use bytes'"); ok($warning_count == scalar @WARN - 1 && $WARN[-1] =~ /above 0xFF/, "Verify string_vianame gives appropriate warning for previous test"); $warning_count = @WARN; - is(charnames::string_vianame("U+FF"), chr(0xFF), "Verify string_vianame(\"U+FF\") is chr(0xFF) under 'use bytes'"); + is(charnames::string_vianame("U+FF"), chr(utf8::unicode_to_native(0xFF)), "Verify string_vianame(\"U+FF\") is chr(0xFF) under 'use bytes'"); cmp_ok($warning_count, '==', scalar @WARN, "Verify string_vianame doesn't warn on legal inputs under 'use bytes'"); - is(charnames::string_vianame("LATIN SMALL LETTER Y WITH DIAERESIS"), chr(0xFF), "Verify string_vianame(\"LATIN SMALL LETTER Y WITH DIAERESIS\") is chr(0xFF) under 'use bytes'"); + is(charnames::string_vianame("LATIN SMALL LETTER Y WITH DIAERESIS"), chr(utf8::unicode_to_native(0xFF)), "Verify string_vianame(\"LATIN SMALL LETTER Y WITH DIAERESIS\") is chr(native 0xFF) under ... [14 chars truncated] cmp_ok($warning_count, '==', scalar @WARN, "Verify string_vianame doesn't warn on legal inputs under 'use bytes'"); ok(! defined charnames::string_vianame("U+100"), "Verify string_vianame \\N{U+100} is undef under 'use bytes'"); ok($warning_count == scalar @WARN - 1 && $WARN[-1] =~ /above 0xFF/, "Verify string_vianame gives appropriate warning for previous test"); @@ -350,7 +335,8 @@ ok(! defined charnames::viacode(0x110000), ok((grep { /\Qyou asked for U+110000/ } @WARN), '... and gives warning'); is(charnames::viacode(0), "NULL", 'Verify charnames::viacode(0) eq "NULL"'); -is(charnames::viacode("BE"), "VULGAR FRACTION THREE QUARTERS", 'Verify charnames::viacode("BE") eq "VULGAR FRACTION THREE QUARTERS"'); +my $three_quarters = sprintf("%2X", utf8::unicode_to_native(0xBE)); +is(charnames::viacode("$three_quarters"), "VULGAR FRACTION THREE QUARTERS", 'Verify charnames::viacode(native "BE") eq "VULGAR FRACTION THREE QUARTERS"'); is(charnames::viacode("U+00000000000FEED"), "ARABIC LETTER WAW ISOLATED FORM", 'Verify charnames::viacode("U+00000000000FEED") eq "ARABIC LETTER WAW ISOLATED FORM"'); { @@ -1024,7 +1010,10 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}", 'V while (<$fh>) { chomp; my ($code, $name, undef, undef, undef, undef, undef, undef, undef, undef, $u1name) = split ";"; - my $decimal = hex $code; + my $decimal = utf8::unicode_to_native(hex $code); + $code = sprintf("%04X", $decimal) unless $::IS_ASCII; + + $decimal = hex $code; # The Unicode version 1 name is used instead of any that are # marked <control>. @@ -1034,7 +1023,7 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}", 'V $name = "" if $^V lt v5.17.0 && $decimal == 0x1F514; # ALERT overrides BELL - $name = 'ALERT' if $decimal == 7; + $name = 'ALERT' if $decimal == utf8::unicode_to_native(7); # Some don't have names, leave those array elements undefined next unless $name; @@ -1209,7 +1198,10 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}", 'V # These four code points now have names, from NameAlias, but # aren't listed as having names in UnicodeData.txt, so viacode # returns their alias names, not undef - next if $i == 0x80 || $i == 0x81 || $i == 0x84 || $i == 0x99; + next if $i == utf8::unicode_to_native(0x80) + || $i == utf8::unicode_to_native(0x81) + || $i == utf8::unicode_to_native(0x84) + || $i == utf8::unicode_to_native(0x99); # If there is no name for this code point, all we can # test is that. @@ -1223,7 +1215,11 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}", 'V # These four code points have a different Unicode1 name than # regular name, and viacode has already specifically tested # for the regular name - if ($i != 0x0a && $i != 0x0c && $i != 0x0d && $i != 0x85) { + if ($i != utf8::unicode_to_native(0x0a) + && $i != utf8::unicode_to_native(0x0c) + && $i != utf8::unicode_to_native(0x0d) + && $i != utf8::unicode_to_native(0x85)) + { $all_pass &= is(charnames::viacode($i), $names[$i], "Verify viacode(0x$hex) is \"$names[$i]\""); } @@ -1249,7 +1245,7 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}", 'V s/^\s*#.*//; next unless $_; my ($name, $codes) = split ";"; - my $utf8 = pack("U*", map { hex } split " ", $codes); + my $utf8 = pack("W*", map { hex } split " ", $codes); is(charnames::string_vianame($name), $utf8, "Verify string_vianame(\"$name\") is the proper utf8"); my $loose_name = get_loose_name($name); use charnames ":loose"; diff --git a/pod/perlvar.pod b/pod/perlvar.pod index 5b9f433..8561eb8 100644 --- a/pod/perlvar.pod +++ b/pod/perlvar.pod @@ -2063,7 +2063,9 @@ X<%^H> The C<%^H> hash provides the same scoping semantic as C<$^H>. This makes it useful for implementation of lexically scoped pragmas. See -L<perlpragma>. +L<perlpragma>. All the entries are stringified when accessed at +runtime, so only simple values can be accommodated. This means no +pointers to objects, for example. When putting items into C<%^H>, in order to avoid conflicting with other users of the hash there is a convention regarding which keys to use. diff --git a/t/lib/charnames/alias b/t/lib/charnames/alias index 8227860..33ccff4 100644 --- a/t/lib/charnames/alias +++ b/t/lib/charnames/alias @@ -18,7 +18,7 @@ Here: 1 ######## # NAME autoload doesn't get viacode print "Here: \N{DIGIT THREE}\n"; -charnames::viacode(0x34); +charnames::viacode(utf8::unicode_to_native(0x34)); EXPECT OPTIONS regex Undefined subroutine &charnames::viacode called at - line \d+. @@ -327,7 +327,7 @@ use warnings; no warnings 'void'; use charnames (); charnames::vianame('SPACE'); -charnames::viacode(0x41); +charnames::viacode(utf8::unicode_to_native(0x41)); EXPECT OPTIONS regex $ @@ -335,24 +335,41 @@ $ # NAME no extraneous warning [perl #11560] use warnings; use charnames (); -print charnames::viacode(0x80), "\n"; +print charnames::viacode(utf8::unicode_to_native(0x80)), "\n"; EXPECT OPTIONS regex PADDING CHARACTER ######## -# NAME various wrong characters in :alias are errors -# Below, one of the EXPECT regexes matches both the UTF-8 and non-UTF-8 form. -# This is because under some circumstances the message gets output as UTF-8. +# NAME A wrong character in :alias is an error +# These next tests could be combined, but the messages can come out in +# different orders on EBCDIC vs ASCII, and can't have both 'random' and 'regex' +# options, and need 'regex' to avoid 'at line X' getting in the way. use charnames ":full", ":alias" => { "4e_ACUTE" => "LATIN SMALL LETTER E WITH ACUTE", - "e_A,CUTE" => "LATIN SMALL LETTER E WITH ACUTE", - "e_ACUT\x{d7}E" => "LATIN SMALL LETTER E WITH ACUTE", }; EXPECT OPTIONS regex Invalid character in charnames alias definition; marked by <-- HERE in '4<-- HERE e_ACUTE' +######## +# NAME Another wrong character in :alias is an error +use charnames ":full", ":alias" => { + "e_A,CUTE" => "LATIN SMALL LETTER E WITH ACUTE", + }; +EXPECT +OPTIONS regex Invalid character in charnames alias definition; marked by <-- HERE in 'e_A,<-- HERE CUTE' -Invalid character in charnames alias definition; marked by <-- HERE in 'e_ACUT(?:\x{d7}|\x{C3}\x{97})<-- HERE E' +######## +# NAME Another wrong character in :alias is an error +# The EXPECT regex matches both the UTF-8 and non-UTF-8 form. +# This is because under some circumstances the message gets output as UTF-8. +# We use \xab, as that is invalid in both ASCII and EBCDIC platforms, and we +# accept both UTF-8 and 1047 UTF-EBCDIC. +use charnames ":full", ":alias" => { + "e_ACUT\x{ab}E" => "LATIN SMALL LETTER E WITH ACUTE", + }; +EXPECT +OPTIONS regex +Invalid character in charnames alias definition; marked by <-- HERE in 'e_ACUT(?:\x{ab}|\x{C2}\x{AB}|\x{80\x{73})<-- HERE E' ######## # RT#73022 # NAME \N{...} interprets ... as octets rather than UTF-8 diff --git a/toke.c b/toke.c index e20c93f..059c463 100644 --- a/toke.c +++ b/toke.c @@ -3475,8 +3475,8 @@ S_scan_const(pTHX_ char *start) const STRLEN off = d - SvPVX_const(sv); d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1); } - if (! SvUTF8(res)) { /* Make sure is \N{} return is UTF-8 */ - sv_utf8_upgrade(res); + if (! SvUTF8(res)) { /* Make sure \N{} return is UTF-8 */ + sv_utf8_upgrade_flags(res, SV_UTF8_NO_ENCODING); str = SvPV_const(res, len); } Copy(str, d, len, char); -- Perl5 Master Repository
