[perl.git] branch blead, updated. v5.21.6-71-g112284f

Karl Williamson Mon, 24 Nov 2014 09:51:20 -0800

In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/112284f43082e616b936428a3b2d4877ef232864?hp=ed996a54e9e136efd3812a9f2d00d7102ed0ef1a>


- Log -----------------------------------------------------------------
commit 112284f43082e616b936428a3b2d4877ef232864
Author: Karl Williamson <[email protected]>
Date:   Mon Nov 24 10:41:23 2014 -0700

    perlvar: Add info

M       pod/perlvar.pod

commit 27c3afbd6068ac83b49a11df3e33758ef059027e
Author: Karl Williamson <[email protected]>
Date:   Mon Nov 24 10:34:27 2014 -0700

    charnames: Generalize to work on non-ASCII platforms
    
    This includes the tests.
    
    The character names are now stored in native order.  This means that
    pack('U') no longer works on non-ASCII platforms.  Use chr instead,
    mostly, and pack('W*') for a sequence.
    
    These changes required the 'encoding' pragma to no longer affect e.g.,
    chr() outside its scope, which was recently done by
    3e669301f0a6fa34269f0e1eaf1fbbd72cae498a.

M       lib/_charnames.pm
M       lib/charnames.t
M       t/lib/charnames/alias

commit 374aed2060f1b45b00ae9ad47fb986b4619492c7
Author: Karl Williamson <[email protected]>
Date:   Sun Nov 23 21:44:17 2014 -0700

    charnames: Don't return UTF-8 unless have to.
    
    A Latin-1 range character doesn't require UTF-8 to be represented.  Not
    using UTF-8 speeds things up a lot.  This commit changes the return to
    be UTF-8 only if necessary.  This could be because
     1) the character isn't representable except in UTF-8; or
     2) we are being called at runtime and the character isn't ASCII-range.
        The only way we can currently guarantee that an upper-Latin1
        character will be treated with Unicode rules is to make it UTF-8
        (the Unicode bug).  The compile-time code (in toke.c) knows how to
        properly deal with non-UTF-8 upper Latin1 range characters.
    
    The utf8::downgrade introduced in this commit is temporary.  Future
    commits will change things so that there is no need for it.

M       lib/_charnames.pm

commit 896eff3c48ae9560ce70fb8a9e729558e65592da
Author: Karl Williamson <[email protected]>
Date:   Sun Nov 23 21:50:41 2014 -0700

    toke.c: Ignore 'use encoding' on \N{}
    
    The encoding pragma converts from a specified encoding into Unicode.
    \N{} already returns the Unicode form, so the encoding pragma
    should not operate on them.  This commit ensures that.  The only reason
    things have appeared to work prior to this commit is that \N{} has
    generally returned its value in UTF-8, which 'encoding' knows enough to
    not disturb.  However, a custom name translator installed in the program
    need not return in UTF-8, so this is a bug that just hasn't yet been
    exposed.
    
    However, the next commit is about to change things so that a regular
    \N{} only returns UTF-8 if it has to, so this bug would come up a lot
    more often.  There is no need for adding a test case, because, without
    this commit existing tests would fail in t/uni/greek.t.

M       toke.c

commit b64912e99f61d14bb5e6ec9388cee8affde67198
Author: Karl Williamson <[email protected]>
Date:   Sun Nov 23 10:18:32 2014 -0700

    lib/_charnames.pm: Change variable name
    
    This lexical variable's use is about to change so it no longer will
    always be in UTF-8 encoding.  So the name would become misleading.

M       lib/_charnames.pm

commit 055bf491ea6f4dcf5702e3c09236b562e2943219
Author: Karl Williamson <[email protected]>
Date:   Sun Nov 23 10:14:22 2014 -0700

    charnames: Nit in comments, pod

M       lib/_charnames.pm
M       lib/charnames.pm

commit fa1e80ba41f52550cc193f33fda5d48962169788
Author: Karl Williamson <[email protected]>
Date:   Sun Nov 23 10:13:28 2014 -0700

    charnames: Bump version to 1.43

M       lib/_charnames.pm
M       lib/charnames.pm

commit 716d5ef3fce3654c265dfdfbdc7112c38271beba
Author: Karl Williamson <[email protected]>
Date:   Sat Nov 22 09:47:02 2014 -0700

    toke.c: Typo in comment

M       toke.c
-----------------------------------------------------------------------

Summary of changes:
 lib/_charnames.pm     | 76 +++++++++++++++++++++++++++++----------------------
 lib/charnames.pm      |  4 +--
 lib/charnames.t       | 56 ++++++++++++++++++-------------------
 pod/perlvar.pod       |  4 ++-
 t/lib/charnames/alias | 35 ++++++++++++++++++------
 toke.c                |  4 +--
 6 files changed, 103 insertions(+), 76 deletions(-)

diff --git a/lib/_charnames.pm b/lib/_charnames.pm
index 729d849..bb7d7c6 100644
--- a/lib/_charnames.pm
+++ b/lib/_charnames.pm
@@ -6,7 +6,7 @@
 package _charnames;
 use strict;
 use warnings;
-our $VERSION = '1.42';
+our $VERSION = '1.43';
 use unicore::Name;    # mktables-generated algorithmically-defined names
 
 use bytes ();          # for $bytes::hint_bits
@@ -65,10 +65,10 @@ $Carp::Internal{ (__PACKAGE__) } = 1;
 
 my %system_aliases = (
 
-    'SINGLE-SHIFT 2'                => pack("U", 
utf8::unicode_to_native(0x8E)),
-    'SINGLE-SHIFT 3'                => pack("U", 
utf8::unicode_to_native(0x8F)),
-    'PRIVATE USE 1'                 => pack("U", 
utf8::unicode_to_native(0x91)),
-    'PRIVATE USE 2'                 => pack("U", 
utf8::unicode_to_native(0x92)),
+    'SINGLE-SHIFT 2'                => chr utf8::unicode_to_native(0x8E),
+    'SINGLE-SHIFT 3'                => chr utf8::unicode_to_native(0x8F),
+    'PRIVATE USE 1'                 => chr utf8::unicode_to_native(0x91),
+    'PRIVATE USE 2'                 => chr utf8::unicode_to_native(0x92),
 );
 
 # These are the aliases above that differ under :loose and :full matching
@@ -77,15 +77,15 @@ my %system_aliases = (
 #);
 
 #my %deprecated_aliases;
-#$deprecated_aliases{'BELL'} = pack("U", utf8::unicode_to_native(0x07)) if $^V 
lt v5.17.0;
+#$deprecated_aliases{'BELL'} = chr utf8::unicode_to_native(0x07) if $^V lt 
v5.17.0;
 
 #my %loose_deprecated_aliases = (
 #);
 
 # These are special cased in :loose matching, differing only in a medial
 # hyphen
-my $HANGUL_JUNGSEONG_O_E_utf8 = pack("U", 0x1180);
-my $HANGUL_JUNGSEONG_OE_utf8 = pack("U", 0x116C);
+my $HANGUL_JUNGSEONG_O_E_utf8 = chr 0x1180;
+my $HANGUL_JUNGSEONG_OE_utf8 = chr 0x116C;
 
 
 my $txt;  # The table of official character names
@@ -163,7 +163,7 @@ sub alias (@) # Set up a single alias
     }
     if ($value =~ $decimal_qr) {
         no warnings qw(non_unicode surrogate nonchar); # Allow any of these
-        $^H{charnames_ord_aliases}{$name} = pack("U", $value);
+        $^H{charnames_ord_aliases}{$name} = chr $value;
 
         # Use a canonical form.
         $^H{charnames_inverse_ords}{sprintf("%05X", $value)} = $name;
@@ -285,7 +285,7 @@ sub lookup_name ($$$) {
   # It looks first in the aliases, then in the large table of official Unicode
   # names.
 
-  my $utf8;       # The string result
+  my $result;       # The string result
   my $save_input;
 
   if ($runtime) {
@@ -298,7 +298,7 @@ sub lookup_name ($$$) {
                               || (! defined $hints_ref->{charnames_full}
                                   && ! defined $hints_ref->{charnames_loose});
 
-    # At runtime, but currently not at compile time, $^H gets
+    # At runtime, but currently not at compile time, %^H gets
     # stringified, so un-stringify back to the original data structures.
     # These get thrown away by perl before the next invocation
     # Also fill in the hash with the non-stringified data.
@@ -321,7 +321,7 @@ sub lookup_name ($$$) {
   # User alias should be checked first or else can't override ours, and if we
   # were to add any, could conflict with theirs.
   if (exists $^H{charnames_ord_aliases}{$name}) {
-    $utf8 = $^H{charnames_ord_aliases}{$name};
+    $result = $^H{charnames_ord_aliases}{$name};
   }
   elsif (exists $^H{charnames_name_aliases}{$name}) {
     $name = $^H{charnames_name_aliases}{$name};
@@ -362,13 +362,13 @@ sub lookup_name ($$$) {
     # interested in convenience over speed, and the time for this second check
     # is miniscule compared to the rest of the routine.
     if (exists $system_aliases{$lookup_name}) {
-      $utf8 = $system_aliases{$lookup_name};
+      $result = $system_aliases{$lookup_name};
     }
     # There are currently no entries in this hash, so don't waste time looking
     # for them.  But the code is retained for the unlikely possibility that
     # some will be added in the future.
 #    elsif ($loose && exists $loose_system_aliases{$lookup_name}) {
-#      $utf8 = $loose_system_aliases{$lookup_name};
+#      $result = $loose_system_aliases{$lookup_name};
 #    }
 #    if (exists $deprecated_aliases{$lookup_name}) {
 #      require warnings;
@@ -376,7 +376,7 @@ sub lookup_name ($$$) {
 #                       "Unicode character name \"$name\" is deprecated, use 
\""
 #                       . viacode(ord $deprecated_aliases{$lookup_name})
 #                       . "\" instead");
-#      $utf8 = $deprecated_aliases{$lookup_name};
+#      $result = $deprecated_aliases{$lookup_name};
 #    }
     # There are currently no entries in this hash, so don't waste time looking
     # for them.  But the code is retained for the unlikely possibility that
@@ -387,21 +387,21 @@ sub lookup_name ($$$) {
 #                       "Unicode character name \"$name\" is deprecated, use 
\""
 #                       . viacode(ord $loose_deprecated_aliases{$lookup_name})
 #                       . "\" instead");
-#      $utf8 = $loose_deprecated_aliases{$lookup_name};
+#      $result = $loose_deprecated_aliases{$lookup_name};
 #    }
   }
 
   my @off;  # Offsets into table of pattern match begin and end
 
   # If haven't found it yet...
-  if (! defined $utf8) {
+  if (! defined $result) {
 
     # See if has looked this input up earlier.
     if (! $loose && $^H{charnames_full} && exists $full_names_cache{$name}) {
-      $utf8 = $full_names_cache{$name};
+      $result = $full_names_cache{$name};
     }
     elsif ($loose && exists $loose_names_cache{$name}) {
-      $utf8 = $loose_names_cache{$name};
+      $result = $loose_names_cache{$name};
     }
     else { # Here, must do a look-up
 
@@ -432,7 +432,7 @@ sub lookup_name ($$$) {
       if (($loose || $^H{charnames_full})
           && (defined (my $ord = 
charnames::name_to_code_point_special($lookup_name, $loose))))
       {
-        $utf8 = pack("U", $ord);
+        $result = chr $ord;
       }
       else {
 
@@ -525,14 +525,14 @@ sub lookup_name ($$$) {
         # therefore yield the very last character in the table, which should
         # also be a \n, so the statement works anyway.)
         if (substr($txt, $off[0] - 7, 1) eq "\n") {
-          $utf8 = pack("U", CORE::hex substr($txt, $off[0] - 6, 5));
+          $result = chr CORE::hex substr($txt, $off[0] - 6, 5);
 
           # Handle the single loose matching special case, in which two names
           # differ only by a single medial hyphen.  If the original had a
           # hyphen (or more) in the right place, then it is that one.
-          $utf8 = $HANGUL_JUNGSEONG_O_E_utf8
+          $result = $HANGUL_JUNGSEONG_O_E_utf8
                   if $loose
-                     && $utf8 eq $HANGUL_JUNGSEONG_OE_utf8
+                     && $result eq $HANGUL_JUNGSEONG_OE_utf8
                      && $name =~ m/O \s* - [-\s]* E/ix;
                      # Note that this wouldn't work if there were a 2nd
                      # OE in the name
@@ -544,7 +544,7 @@ sub lookup_name ($$$) {
           # The +1 skips past that newline, or, if the rindex() fails, to put
           # us to an offset of zero.
           my $charstart = rindex($txt, "\n", $off[0] - 7) + 1;
-          $utf8 = pack("U*", map { CORE::hex }
+          $result = pack("W*", map { CORE::hex }
               split " ", substr($txt, $charstart, $off[0] - $charstart - 1));
         }
       }
@@ -553,15 +553,27 @@ sub lookup_name ($$$) {
       # again, but only if it came from the one search that we cache.
       # (Haven't bothered with the pain of sorting out scoping issues for the
       # scripts searches.)
-      $cache_ref->{$name} = $utf8 if defined $cache_ref;
+      $cache_ref->{$name} = $result if defined $cache_ref;
     }
   }
 
-
-  # Here, have the utf8.  If the return is to be an ord, must be any single
-  # character.
+  # Here, have the result character.  If the return is to be an ord, must be
+  # any single character.
   if ($wants_ord) {
-    return ord($utf8) if length $utf8 == 1;
+    return ord($result) if length $result == 1;
+  }
+  elsif (! utf8::is_utf8($result)) {
+
+    # Here isn't UTF-8.  That's OK if it is all ASCII, or we are being called
+    # at compile time where we know we can guarantee that Unicode rules are
+    # correctly imposed on the result, or under 'bytes' where we don't want
+    # those rules.  But otherwise we have to make it UTF8 to guarantee Unicode
+    # rules on the returned string.
+    return $result if ! $runtime
+                      || (caller $runtime)[8] & $bytes::hint_bits
+                      || $result !~ /[[:^ascii:]]/;
+    utf8::upgrade($result);
+    return $result;
   }
   else {
 
@@ -570,7 +582,7 @@ sub lookup_name ($$$) {
     my $in_bytes = ($runtime)
                    ? (caller $runtime)[8] & $bytes::hint_bits
                    : $^H & $bytes::hint_bits;
-    return $utf8 if (! $in_bytes || utf8::downgrade($utf8, 1)) # The 1 arg
+    return $result if (! $in_bytes || utf8::downgrade($result, 1)) # The 1 arg
                                                   # means don't die on failure
   }
 
@@ -594,10 +606,10 @@ sub lookup_name ($$$) {
 
   # Only other possible failure here is from use bytes.
   if ($runtime) {
-    carp not_legal_use_bytes_msg($name, $utf8);
+    carp not_legal_use_bytes_msg($name, $result);
     return;
   } else {
-    croak not_legal_use_bytes_msg($name, $utf8);
+    croak not_legal_use_bytes_msg($name, $result);
   }
 
 } # lookup_name
diff --git a/lib/charnames.pm b/lib/charnames.pm
index 6f5d51d..28e0282 100644
--- a/lib/charnames.pm
+++ b/lib/charnames.pm
@@ -1,7 +1,7 @@
 package charnames;
 use strict;
 use warnings;
-our $VERSION = '1.41';
+our $VERSION = '1.43';
 use unicore::Name;    # mktables-generated algorithmically-defined names
 use _charnames ();    # The submodule for this where most of the work gets done
 
@@ -297,7 +297,7 @@ matched name) or to a
 numeric code point (ordinal).  The latter is useful for assigning names
 to code points in Unicode private use areas such as U+E800 through
 U+F8FF.
-A numeric code point must be a non-negative integer or a string beginning
+A numeric code point must be a non-negative integer, or a string beginning
 with C<"U+"> or C<"0x"> with the remainder considered to be a
 hexadecimal integer.  A literal numeric constant must be unsigned; it
 will be interpreted as hex if it has a leading zero or contains
diff --git a/lib/charnames.t b/lib/charnames.t
index bd0c21e..e115811 100644
--- a/lib/charnames.t
+++ b/lib/charnames.t
@@ -65,7 +65,7 @@ EOE
     use charnames ":alias" => { mychar1 => "0xE8000",
                                 mychar2 => 983040,  # U+F0000
                                 mychar3 => "U+100000",
-                                myctrl => 0x80,
+                                myctrl => utf8::unicode_to_native(0x80),
                                 mylarge => "U+111000",
                               };
     is ("\N{PILE OF POO}", chr(0x1F4A9), "Verify :alias alone implies :full");
@@ -77,29 +77,14 @@ EOE
     is (charnames::viacode(0x100000), "mychar3", "And that can get the alias 
back");
     is ("\N{mylarge}", chr(0x111000), "Verify that can define alias beyond 
Unicode");
     is (charnames::viacode(0x111000), "mylarge", "And that can get the alias 
back");
-    is (charnames::viacode(0x80), "myctrl", "Verify that can name a nameless 
control");
+    is (charnames::viacode(utf8::unicode_to_native(0x80)), "myctrl", "Verify 
that can name a nameless control");
 
 }
 
-my $encoded_be;
-my $encoded_alpha;
-my $encoded_bet;
-my $encoded_deseng;
-
-# If octal representation of unicode char is \0xyzt, then the utf8 is \3xy\2zt
-if (ord('A') == 65) { # as on ASCII or UTF-8 machines
-    $encoded_be = "\320\261";
-    $encoded_alpha = "\316\261";
-    $encoded_bet = "\327\221";
-    $encoded_deseng = "\360\220\221\215";
-}
-else { # EBCDIC where UTF-EBCDIC may be used (this may be 1047 specific since
-       # UTF-EBCDIC is codepage specific)
-    $encoded_be = "\270\102\130";
-    $encoded_alpha = "\264\130";
-    $encoded_bet = "\270\125\130";
-    $encoded_deseng = "\336\102\103\124";
-}
+my $encoded_be = byte_utf8a_to_utf8n("\320\261");
+my $encoded_alpha = byte_utf8a_to_utf8n("\316\261");
+my $encoded_bet = byte_utf8a_to_utf8n("\327\221");
+my $encoded_deseng = byte_utf8a_to_utf8n("\360\220\221\215");
 
 sub to_bytes {
     unpack"U0a*", shift;
@@ -235,7 +220,7 @@ sub test_vianame ($$$) {
 
     use bytes;
     is(charnames::vianame("GOTHIC LETTER AHSA"), 0x10330, "Verify vianame 
\\N{name} is unaffected by 'use bytes'");
-    is(charnames::vianame("U+FF"), chr(0xFF), "Verify vianame \\N{U+FF} is 
unaffected by 'use bytes'");
+    is(charnames::vianame("U+FF"), chr(utf8::unicode_to_native(0xFF)), "Verify 
vianame \\N{U+FF} is unaffected by 'use bytes'");
     cmp_ok($warning_count, '==', scalar @WARN, "Verify vianame doesn't warn on 
legal inputs under 'use bytes'");
     ok(! defined charnames::vianame("U+100"), "Verify vianame \\N{U+100} is 
undef under 'use bytes'");
     ok($warning_count == scalar @WARN - 1 && $WARN[-1] =~ /above 0xFF/, 
"Verify vianame gives appropriate warning for previous test");
@@ -244,9 +229,9 @@ sub test_vianame ($$$) {
     ok(! defined charnames::string_vianame("GOTHIC LETTER AHSA"), "Verify 
string_vianame(\"GOTHIC LETTER AHSA\") is undefined under 'use bytes'");
     ok($warning_count == scalar @WARN - 1 && $WARN[-1] =~ /above 0xFF/, 
"Verify string_vianame gives appropriate warning for previous test");
     $warning_count = @WARN;
-    is(charnames::string_vianame("U+FF"), chr(0xFF), "Verify 
string_vianame(\"U+FF\") is chr(0xFF) under 'use bytes'");
+    is(charnames::string_vianame("U+FF"), chr(utf8::unicode_to_native(0xFF)), 
"Verify string_vianame(\"U+FF\") is chr(0xFF) under 'use bytes'");
     cmp_ok($warning_count, '==', scalar @WARN, "Verify string_vianame doesn't 
warn on legal inputs under 'use bytes'");
-    is(charnames::string_vianame("LATIN SMALL LETTER Y WITH DIAERESIS"), 
chr(0xFF), "Verify string_vianame(\"LATIN SMALL LETTER Y WITH DIAERESIS\") is 
chr(0xFF) under 'use bytes'");
+    is(charnames::string_vianame("LATIN SMALL LETTER Y WITH DIAERESIS"), 
chr(utf8::unicode_to_native(0xFF)), "Verify string_vianame(\"LATIN SMALL LETTER 
Y WITH DIAERESIS\") is chr(native 0xFF) under  ... [14 chars truncated]
     cmp_ok($warning_count, '==', scalar @WARN, "Verify string_vianame doesn't 
warn on legal inputs under 'use bytes'");
     ok(! defined charnames::string_vianame("U+100"), "Verify string_vianame 
\\N{U+100} is undef under 'use bytes'");
     ok($warning_count == scalar @WARN - 1 && $WARN[-1] =~ /above 0xFF/, 
"Verify string_vianame gives appropriate warning for previous test");
@@ -350,7 +335,8 @@ ok(! defined charnames::viacode(0x110000),
 ok((grep { /\Qyou asked for U+110000/ } @WARN), '... and gives warning');
 
 is(charnames::viacode(0), "NULL", 'Verify charnames::viacode(0) eq "NULL"');
-is(charnames::viacode("BE"), "VULGAR FRACTION THREE QUARTERS", 'Verify 
charnames::viacode("BE") eq "VULGAR FRACTION THREE QUARTERS"');
+my $three_quarters = sprintf("%2X", utf8::unicode_to_native(0xBE));
+is(charnames::viacode("$three_quarters"), "VULGAR FRACTION THREE QUARTERS", 
'Verify charnames::viacode(native "BE") eq "VULGAR FRACTION THREE QUARTERS"');
 is(charnames::viacode("U+00000000000FEED"), "ARABIC LETTER WAW ISOLATED FORM", 
'Verify charnames::viacode("U+00000000000FEED") eq "ARABIC LETTER WAW ISOLATED 
FORM"');
 
 {
@@ -1024,7 +1010,10 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA 
SKLIRON CHROMA VASIS}", 'V
     while (<$fh>) {
         chomp;
         my ($code, $name, undef, undef, undef, undef, undef, undef, undef, 
undef, $u1name) = split ";";
-        my $decimal = hex $code;
+        my $decimal = utf8::unicode_to_native(hex $code);
+        $code = sprintf("%04X", $decimal) unless $::IS_ASCII;
+
+        $decimal = hex $code;
 
         # The Unicode version 1 name is used instead of any that are
         # marked <control>.
@@ -1034,7 +1023,7 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA 
SKLIRON CHROMA VASIS}", 'V
         $name = "" if $^V lt v5.17.0 && $decimal == 0x1F514;
 
         # ALERT overrides BELL
-        $name = 'ALERT' if $decimal == 7;
+        $name = 'ALERT' if $decimal == utf8::unicode_to_native(7);
 
         # Some don't have names, leave those array elements undefined
         next unless $name;
@@ -1209,7 +1198,10 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA 
SKLIRON CHROMA VASIS}", 'V
                 # These four code points now have names, from NameAlias, but
                 # aren't listed as having names in UnicodeData.txt, so viacode
                 # returns their alias names, not undef
-                next if $i == 0x80 || $i == 0x81 || $i == 0x84 || $i == 0x99;
+                next if $i == utf8::unicode_to_native(0x80)
+                              || $i == utf8::unicode_to_native(0x81)
+                              || $i == utf8::unicode_to_native(0x84)
+                              || $i == utf8::unicode_to_native(0x99);
 
                 # If there is no name for this code point, all we can
                 # test is that.
@@ -1223,7 +1215,11 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA 
SKLIRON CHROMA VASIS}", 'V
                 # These four code points have a different Unicode1 name than
                 # regular name, and viacode has already specifically tested
                 # for the regular name
-                if ($i != 0x0a && $i != 0x0c && $i != 0x0d && $i != 0x85) {
+                if ($i != utf8::unicode_to_native(0x0a)
+                    && $i != utf8::unicode_to_native(0x0c)
+                    && $i != utf8::unicode_to_native(0x0d)
+                    && $i != utf8::unicode_to_native(0x85))
+                {
                     $all_pass &= is(charnames::viacode($i), $names[$i], 
"Verify viacode(0x$hex) is \"$names[$i]\"");
                 }
 
@@ -1249,7 +1245,7 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA 
SKLIRON CHROMA VASIS}", 'V
             s/^\s*#.*//;
             next unless $_;
             my ($name, $codes) = split ";";
-            my $utf8 = pack("U*", map { hex } split " ", $codes);
+            my $utf8 = pack("W*", map { hex } split " ", $codes);
             is(charnames::string_vianame($name), $utf8, "Verify 
string_vianame(\"$name\") is the proper utf8");
             my $loose_name = get_loose_name($name);
             use charnames ":loose";
diff --git a/pod/perlvar.pod b/pod/perlvar.pod
index 5b9f433..8561eb8 100644
--- a/pod/perlvar.pod
+++ b/pod/perlvar.pod
@@ -2063,7 +2063,9 @@ X<%^H>
 
 The C<%^H> hash provides the same scoping semantic as C<$^H>.  This makes
 it useful for implementation of lexically scoped pragmas.  See
-L<perlpragma>.
+L<perlpragma>.   All the entries are stringified when accessed at
+runtime, so only simple values can be accommodated.  This means no
+pointers to objects, for example.
 
 When putting items into C<%^H>, in order to avoid conflicting with other
 users of the hash there is a convention regarding which keys to use.
diff --git a/t/lib/charnames/alias b/t/lib/charnames/alias
index 8227860..33ccff4 100644
--- a/t/lib/charnames/alias
+++ b/t/lib/charnames/alias
@@ -18,7 +18,7 @@ Here: 1
 ########
 # NAME autoload doesn't get viacode
 print "Here: \N{DIGIT THREE}\n";
-charnames::viacode(0x34);
+charnames::viacode(utf8::unicode_to_native(0x34));
 EXPECT
 OPTIONS regex
 Undefined subroutine &charnames::viacode called at - line \d+.
@@ -327,7 +327,7 @@ use warnings;
 no warnings 'void';
 use charnames ();
 charnames::vianame('SPACE');
-charnames::viacode(0x41);
+charnames::viacode(utf8::unicode_to_native(0x41));
 EXPECT
 OPTIONS regex
 $
@@ -335,24 +335,41 @@ $
 # NAME no extraneous warning [perl #11560]
 use warnings;
 use charnames ();
-print charnames::viacode(0x80), "\n";
+print charnames::viacode(utf8::unicode_to_native(0x80)), "\n";
 EXPECT
 OPTIONS regex
 PADDING CHARACTER
 ########
-# NAME various wrong characters in :alias are errors
-# Below, one of the EXPECT regexes matches both the UTF-8 and non-UTF-8 form.
-# This is because under some circumstances the message gets output as UTF-8.
+# NAME A wrong character in :alias is an error
+# These next tests could be combined, but the messages can come out in
+# different orders on EBCDIC vs ASCII, and can't have both 'random' and 'regex'
+# options, and need 'regex' to avoid 'at line X' getting in the way.
 use charnames ":full", ":alias" => {
                             "4e_ACUTE" => "LATIN SMALL LETTER E WITH ACUTE",
-                            "e_A,CUTE" => "LATIN SMALL LETTER E WITH ACUTE",
-                            "e_ACUT\x{d7}E" => "LATIN SMALL LETTER E WITH 
ACUTE",
                     };
 EXPECT
 OPTIONS regex
 Invalid character in charnames alias definition; marked by <-- HERE in '4<-- 
HERE e_ACUTE'
+########
+# NAME Another wrong character in :alias is an error
+use charnames ":full", ":alias" => {
+                            "e_A,CUTE" => "LATIN SMALL LETTER E WITH ACUTE",
+                    };
+EXPECT
+OPTIONS regex
 Invalid character in charnames alias definition; marked by <-- HERE in 
'e_A,<-- HERE CUTE'
-Invalid character in charnames alias definition; marked by <-- HERE in 
'e_ACUT(?:\x{d7}|\x{C3}\x{97})<-- HERE E'
+########
+# NAME Another wrong character in :alias is an error
+# The EXPECT regex matches both the UTF-8 and non-UTF-8 form.
+# This is because under some circumstances the message gets output as UTF-8.
+# We use \xab, as that is invalid in both ASCII and EBCDIC platforms, and we
+# accept both UTF-8 and 1047 UTF-EBCDIC.
+use charnames ":full", ":alias" => {
+                            "e_ACUT\x{ab}E" => "LATIN SMALL LETTER E WITH 
ACUTE",
+                    };
+EXPECT
+OPTIONS regex
+Invalid character in charnames alias definition; marked by <-- HERE in 
'e_ACUT(?:\x{ab}|\x{C2}\x{AB}|\x{80\x{73})<-- HERE E'
 ########
 # RT#73022
 # NAME \N{...} interprets ... as octets rather than UTF-8
diff --git a/toke.c b/toke.c
index e20c93f..059c463 100644
--- a/toke.c
+++ b/toke.c
@@ -3475,8 +3475,8 @@ S_scan_const(pTHX_ char *start)
                            const STRLEN off = d - SvPVX_const(sv);
                            d = off + SvGROW(sv, off + len + (STRLEN)(send - s) 
+ 1);
                        }
-                        if (! SvUTF8(res)) {    /* Make sure is \N{} return is 
UTF-8 */
-                            sv_utf8_upgrade(res);
+                        if (! SvUTF8(res)) {    /* Make sure \N{} return is 
UTF-8 */
+                            sv_utf8_upgrade_flags(res, SV_UTF8_NO_ENCODING);
                             str = SvPV_const(res, len);
                         }
                        Copy(str, d, len, char);

--
Perl5 Master Repository

[perl.git] branch blead, updated. v5.21.6-71-g112284f

Reply via email to