In perl.git, the branch khw/ebcdic has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/0128fafebafca15870d74ca022eac637863ba58c?hp=ecf15a197c5bc8ff69f29c4fff6ee2ae87aa9a4e>

- Log -----------------------------------------------------------------
commit 0128fafebafca15870d74ca022eac637863ba58c
Author: Karl Williamson <[email protected]>
Date:   Tue Mar 19 11:32:55 2013 -0600

    t/re/fold_grind.t: Fixes for EBCDIC

M       t/re/fold_grind.t

commit da80bbd4edde671e68dda585cb60465182f5641d
Author: Karl Williamson <[email protected]>
Date:   Tue Mar 19 11:21:09 2013 -0600

    t/lib/charnames/alias: Fix some EBCDIC problems

M       t/lib/charnames/alias

commit a30bf40f287af872fdb286898e057f9294b05877
Author: Karl Williamson <[email protected]>
Date:   Tue Mar 19 11:20:24 2013 -0600

    XXX reset

M       t/uni/class.t

commit 11cde60a6eba986da6f8e61f32eed1f538647775
Author: Karl Williamson <[email protected]>
Date:   Tue Mar 19 11:16:12 2013 -0600

    Unicode::UCD: Fix EBCDIC bug

M       lib/Unicode/UCD.pm

commit c57248eb68abf0f85f5dcacd415d820be091b727
Author: Karl Williamson <[email protected]>
Date:   Tue Mar 19 11:01:57 2013 -0600

    feature/unicode_strings.t: Fix to work on EBCDIC

M       lib/feature/unicode_strings.t

commit 6725081ca771fe186f38e1235788931d6884fd55
Author: Karl Williamson <[email protected]>
Date:   Tue Mar 19 10:33:59 2013 -0600

    XXX lib/strict.t: Skip for now
    
    this is to see if this is what is causing the tests to not finish.

M       lib/strict.t

commit f1efbdb96393ff6aa6e234ae862cbe9d7037b77b
Author: Karl Williamson <[email protected]>
Date:   Tue Mar 19 10:12:30 2013 -0600

    regen/regcharclass.pl: make more EBCDIC friendly
    
    One of the possible inputs to this process is a string.  This clarifies
    that it must be specified in Unicode characters, and adds code to
    translate it to native, if necessary.

M       regen/regcharclass.pl

commit 3c0dc065e8962ec33758df1b68f52568ca3d7cfa
Author: Karl Williamson <[email protected]>
Date:   Tue Mar 19 10:10:46 2013 -0600

    XXX regen/regcharclass.pl: maybe temp comment out utf8_char

M       regen/regcharclass.pl

commit a9a844a5828f1853a45d029a67331fccffe26a08
Author: Karl Williamson <[email protected]>
Date:   Tue Mar 19 10:09:53 2013 -0600

    XXX temporary comment out multi-char folds

M       regcomp.c
M       regen/regcharclass.pl
-----------------------------------------------------------------------

Summary of changes:
 lib/Unicode/UCD.pm            |    6 +---
 lib/feature/unicode_strings.t |    4 +-
 lib/strict.t                  |    2 +
 regcomp.c                     |    3 ++
 regen/regcharclass.pl         |   42 ++++++++++++++++++++++++----------------
 t/lib/charnames/alias         |    6 ++--
 t/re/fold_grind.t             |   22 ++++++++++----------
 t/uni/class.t                 |    3 +-
 8 files changed, 50 insertions(+), 38 deletions(-)

diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index c9600df..777e6e1 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -2976,10 +2976,8 @@ RETRY:
                 my $code_point = hex $hex_code_point;
 
                 # The name of all controls is the default: the empty string.
-                # The set of controls is immutable, so these hard-coded
-                # constants work.
-                next if $code_point <= 0x9F
-                        && ($code_point <= 0x1F || $code_point >= 0x7F);
+                # The set of controls is immutable
+                next if chr $code_point =~ /[[:cntrl:]]/u;
 
                 # If this is a name_alias, it isn't a name
                 next if grep { $_ eq $name } @{$aliases{$code_point}};
diff --git a/lib/feature/unicode_strings.t b/lib/feature/unicode_strings.t
index 8bd536f..c9d1279 100644
--- a/lib/feature/unicode_strings.t
+++ b/lib/feature/unicode_strings.t
@@ -225,7 +225,7 @@ for my $i (0 .. 255) {
 
                     # With the legacy, nothing above 128 should be in the
                     # class
-                    if ($i >= 128) {
+                    if (ord_native_to_latin1($i) >= 128) {
                         $expect_success = 0;
                         $expect_success = ! $expect_success if $complement;
                         $expect_success = ! $expect_success if 
$complement_class;
@@ -259,7 +259,7 @@ for my $i (0 .. 255) {
 
         no feature 'unicode_strings';
         $prefix = "no uni8bit; Verify $string";
-        if ($i >= 128) {
+        if (ord_native_to_latin1($i) >= 128) {
             $expect_success = 1;
             $expect_success = ! $expect_success if $complement;
         }
diff --git a/lib/strict.t b/lib/strict.t
index e067793..612f99c 100644
--- a/lib/strict.t
+++ b/lib/strict.t
@@ -2,6 +2,8 @@
 
 chdir 't' if -d 't';
 @INC = '../lib';
+    print "1..0 # Skip: for now\n";
+    exit 0;
 
 our $local_tests = 4;
 require "../t/lib/common.pl";
diff --git a/regcomp.c b/regcomp.c
index 2bc5108..aa9ea82 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2821,6 +2821,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode 
*scan, UV *min_subtract, b
                                      length sequence we are looking for is 2 */
            {
                 int count = 0;
+#define is_MULTI_CHAR_FOLD_utf8_safe(s, s_end) (0)
                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
                 if (! len) {    /* Not a multi-char fold: get next char */
                     s += UTF8SKIP(s);
@@ -2918,6 +2919,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode 
*scan, UV *min_subtract, b
             const U8 s_masked = 's' & S_or_s_mask;
 
            while (s < upper) {
+#define is_MULTI_CHAR_FOLD_latin1_safe(s, s_end) (0)
                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
                 if (! len) {    /* Not a multi-char fold. */
                     if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
@@ -11980,6 +11982,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth,
     /* In a range, counts how many 0-2 of the ends of it came from literals,
      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
     UV literal_endpoint = 0;
+    allow_multi_folds = FALSE; /* XXX */
 #endif
     bool invert = FALSE;    /* Is this class to be complemented */
 
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl
index aafee3a..a2efc35 100755
--- a/regen/regcharclass.pl
+++ b/regen/regcharclass.pl
@@ -329,6 +329,13 @@ sub new {
         my $str= $txt;
         if ( $str =~ /^[""]/ ) {
             $str= eval $str;
+            if (! ASCII_PLATFORM) { # Convert string to native
+                my $native_str = "";
+                for my $ch ( split //, $str ) {
+                    $native_str .= chr utf8::unicode_to_native(ord $ch);
+                }
+                $str = $native_str;
+            }
         } elsif ($str =~ / - /x ) { # A range:  Replace this element on the
                                     # list with its expansion
             my ($lower, $upper) = $str =~ / 0x (.+?) \s* - \s* 0x (.+) /x;
@@ -1316,7 +1323,8 @@ if ( !caller ) {
 #
 # The subsequent lines give what code points go into the class defined by the
 # macro.  Multiple characters may be specified via a string like "\x0D\x0A",
-# enclosed in quotes.  Otherwise the lines consist of one of:
+# enclosed in quotes.  The characters are considered to be Unicode, and will
+# be translated into native.  Otherwise the lines consist of one of:
 #   1)  a single Unicode code point, prefaced by 0x
 #   2)  a single range of Unicode code points separated by a minus (and
 #       optional space)
@@ -1491,27 +1499,27 @@ GCB_V: Grapheme_Cluster_Break=V
 # This hasn't been commented out, because we haven't an EBCDIC platform to run
 # it on, and the 3 types of EBCDIC allegedly supported by Perl would have
 # different results
-UTF8_CHAR: Matches utf8 from 1 to 5 bytes
-=> UTF8 :safe only_ebcdic_platform
-0x0 - 0x3FFFFF:
+#UTF8_CHAR: Matches utf8 from 1 to 5 bytes
+#=> UTF8 :safe only_ebcdic_platform
+#0x0 - 0x3FFFFF:
 
 QUOTEMETA: Meta-characters that \Q should quote
 => high :fast
 \p{_Perl_Quotemeta}
 
-MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
-=> UTF8 :safe
-do regen/regcharclass_multi_char_folds.pl
-
-# 1 => All folds
-&regcharclass_multi_char_folds::multi_char_folds(1)
-
-MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
-=> LATIN1 :safe
-
-&regcharclass_multi_char_folds::multi_char_folds(0)
-# 0 => Latin1-only
-
 PATWS: pattern white space
 => generic generic_non_low cp : fast safe
 \p{PatWS}
+
+#MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
+#=> UTF8 :safe
+#do regen/regcharclass_multi_char_folds.pl
+#
+## 1 => All folds
+#&regcharclass_multi_char_folds::multi_char_folds(1)
+#
+#MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
+#=> LATIN1 :safe
+#
+#&regcharclass_multi_char_folds::multi_char_folds(0)
+## 0 => Latin1-only
diff --git a/t/lib/charnames/alias b/t/lib/charnames/alias
index b8786db..cffac3a 100644
--- a/t/lib/charnames/alias
+++ b/t/lib/charnames/alias
@@ -18,7 +18,7 @@ Here: 1
 ########
 # NAME autoload doesn't get viacode
 print "Here: \N{DIGIT THREE}\n";
-charnames::viacode(0x34);
+charnames::viacode(utf8::unicode_to_native(0x34));
 EXPECT
 OPTIONS regex
 Undefined subroutine &charnames::viacode called at - line \d+.
@@ -327,7 +327,7 @@ use warnings;
 no warnings 'void';
 use charnames ();
 charnames::vianame('SPACE');
-charnames::viacode(0x41);
+charnames::viacode(utf8::unicode_to_native(0x41));
 EXPECT
 OPTIONS regex
 $
@@ -335,7 +335,7 @@ $
 # NAME no extraneous warning [perl #11560]
 use warnings;
 use charnames ();
-print charnames::viacode(0x80), "\n";
+print charnames::viacode(utf8::unicode_to_native(0x80)), "\n";
 EXPECT
 OPTIONS regex
 PADDING CHARACTER
diff --git a/t/re/fold_grind.t b/t/re/fold_grind.t
index 3267336..ebbbdc5 100644
--- a/t/re/fold_grind.t
+++ b/t/re/fold_grind.t
@@ -66,7 +66,7 @@ my $skip_apparently_redundant = ! $ENV{PERL_RUN_SLOW_TESTS};
 sub range_type {
     my $ord = ord shift;
 
-    return $ASCII if $ord < 128;
+    return $ASCII if ord_native_to_latin1($ord) < 128;
     return $Latin1 if $ord < 256;
     return $Unicode;
 }
@@ -373,8 +373,8 @@ foreach my $to (sort { (length $a == length $b)
 }
 
 # For each range type, test additionally a character that folds to itself
-add_test(chr 0x3A, chr 0x3A);
-add_test(chr 0xF7, chr 0xF7);
+add_test(":", chr ":");
+add_test(chr ord_latin1_to_native(0xF7), chr ord_latin1_to_native(0xF7));
 add_test(chr 0x2C7, chr 0x2C7);
 
 # To cut down on the number of tests
@@ -416,7 +416,7 @@ if($Config{d_setlocale}) {
         # legal, but since we don't know what the right answers should be,
         # skip the locale tests in that situation.
         for my $i (128 .. 255) {
-            my $char = chr($i);
+            my $char = chr(ord_latin1_to_native($i));
             goto untestable_locale if uc($char) ne $char || lc($char) ne $char;
         }
         push @charsets, 'l';
@@ -466,13 +466,13 @@ foreach my $test (sort { numerically } keys %tests) {
 
     my $target_above_latin1 = grep { $_ > 255 } @target;
     my $pattern_above_latin1 = grep { $_ > 255 } @pattern;
-    my $target_has_ascii = grep { $_ < 128 } @target;
-    my $pattern_has_ascii = grep { $_ < 128 } @pattern;
-    my $target_only_ascii = ! grep { $_ > 127 } @target;
-    my $pattern_only_ascii = ! grep { $_ > 127 } @pattern;
+    my $target_has_ascii = grep { ord_native_to_latin1($_ < 128) } @target;
+    my $pattern_has_ascii = grep { ord_native_to_latin1($_ < 128) } @pattern;
+    my $target_only_ascii = ! grep { ord_native_to_latin1($_ > 127) } @target;
+    my $pattern_only_ascii = ! grep { ord_native_to_latin1($_ > 127) } 
@pattern;
     my $target_has_latin1 = grep { $_ < 256 } @target;
-    my $target_has_upper_latin1 = grep { $_ < 256 && $_ > 127 } @target;
-    my $pattern_has_upper_latin1 = grep { $_ < 256 && $_ > 127 } @pattern;
+    my $target_has_upper_latin1 = grep { $_ < 256 && $_ > 
ord_native_to_latin1(127) } @target;
+    my $pattern_has_upper_latin1 = grep { $_ < 256 && $_ > 
ord_native_to_latin1(127) } @pattern;
     my $pattern_has_latin1 = grep { $_ < 256 } @pattern;
     my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0];
 
@@ -614,7 +614,7 @@ foreach my $test (sort { numerically } keys %tests) {
           my $lhs_str = eval qq{"$lhs"}; fail($@) if $@;
           my @rhs = @x_pattern;
           my $rhs = join "", @rhs;
-          my $should_fail = (! $uni_semantics && $ord >= 128 && $ord < 256 && 
! $is_self)
+          my $should_fail = (! $uni_semantics && ord_native_to_latin1($ord) >= 
128 && $ord < 256 && ! $is_self)
                             || ($charset eq 'aa' && $target_has_ascii != 
$pattern_has_ascii)
                             || ($charset eq 'l' && $target_has_latin1 != 
$pattern_has_latin1);
 
diff --git a/t/uni/class.t b/t/uni/class.t
index 144ae43..03dfb16 100644
--- a/t/uni/class.t
+++ b/t/uni/class.t
@@ -6,7 +6,8 @@ BEGIN {
 
 plan tests => 11;
 
-my $str = join "", map latin1_to_native(chr($_)), 0x20 .. 0x6F;
+# subs are wrong
+my $str = join "", map { chr $_ }, 0x20 .. 0x6F;
 
 is(($str =~ /(\p{IsMyUniClass}+)/)[0], '0123456789:;<=>?@ABCDEFGHIJKLMNO',
                                 'user-defined class compiled before defined');

--
Perl5 Master Repository

Reply via email to