In perl.git, the branch khw/ebcdic has been updated <http://perl5.git.perl.org/perl.git/commitdiff/0128fafebafca15870d74ca022eac637863ba58c?hp=ecf15a197c5bc8ff69f29c4fff6ee2ae87aa9a4e>
- Log ----------------------------------------------------------------- commit 0128fafebafca15870d74ca022eac637863ba58c Author: Karl Williamson <[email protected]> Date: Tue Mar 19 11:32:55 2013 -0600 t/re/fold_grind.t: Fixes for EBCDIC M t/re/fold_grind.t commit da80bbd4edde671e68dda585cb60465182f5641d Author: Karl Williamson <[email protected]> Date: Tue Mar 19 11:21:09 2013 -0600 t/lib/charnames/alias: Fix some EBCDIC problems M t/lib/charnames/alias commit a30bf40f287af872fdb286898e057f9294b05877 Author: Karl Williamson <[email protected]> Date: Tue Mar 19 11:20:24 2013 -0600 XXX reset M t/uni/class.t commit 11cde60a6eba986da6f8e61f32eed1f538647775 Author: Karl Williamson <[email protected]> Date: Tue Mar 19 11:16:12 2013 -0600 Unicode::UCD: Fix EBCDIC bug M lib/Unicode/UCD.pm commit c57248eb68abf0f85f5dcacd415d820be091b727 Author: Karl Williamson <[email protected]> Date: Tue Mar 19 11:01:57 2013 -0600 feature/unicode_strings.t: Fix to work on EBCDIC M lib/feature/unicode_strings.t commit 6725081ca771fe186f38e1235788931d6884fd55 Author: Karl Williamson <[email protected]> Date: Tue Mar 19 10:33:59 2013 -0600 XXX lib/strict.t: Skip for now this is to see if this is what is causing the tests to not finish. M lib/strict.t commit f1efbdb96393ff6aa6e234ae862cbe9d7037b77b Author: Karl Williamson <[email protected]> Date: Tue Mar 19 10:12:30 2013 -0600 regen/regcharclass.pl: make more EBCDIC friendly One of the possible inputs to this process is a string. This clarifies that it must be specified in Unicode characters, and adds code to translate it to native, if necessary. M regen/regcharclass.pl commit 3c0dc065e8962ec33758df1b68f52568ca3d7cfa Author: Karl Williamson <[email protected]> Date: Tue Mar 19 10:10:46 2013 -0600 XXX regen/regcharclass.pl: maybe temp comment out utf8_char M regen/regcharclass.pl commit a9a844a5828f1853a45d029a67331fccffe26a08 Author: Karl Williamson <[email protected]> Date: Tue Mar 19 10:09:53 2013 -0600 XXX temporary comment out multi-char folds M regcomp.c M regen/regcharclass.pl ----------------------------------------------------------------------- Summary of changes: lib/Unicode/UCD.pm | 6 +--- lib/feature/unicode_strings.t | 4 +- lib/strict.t | 2 + regcomp.c | 3 ++ regen/regcharclass.pl | 42 ++++++++++++++++++++++++---------------- t/lib/charnames/alias | 6 ++-- t/re/fold_grind.t | 22 ++++++++++---------- t/uni/class.t | 3 +- 8 files changed, 50 insertions(+), 38 deletions(-) diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm index c9600df..777e6e1 100644 --- a/lib/Unicode/UCD.pm +++ b/lib/Unicode/UCD.pm @@ -2976,10 +2976,8 @@ RETRY: my $code_point = hex $hex_code_point; # The name of all controls is the default: the empty string. - # The set of controls is immutable, so these hard-coded - # constants work. - next if $code_point <= 0x9F - && ($code_point <= 0x1F || $code_point >= 0x7F); + # The set of controls is immutable + next if chr $code_point =~ /[[:cntrl:]]/u; # If this is a name_alias, it isn't a name next if grep { $_ eq $name } @{$aliases{$code_point}}; diff --git a/lib/feature/unicode_strings.t b/lib/feature/unicode_strings.t index 8bd536f..c9d1279 100644 --- a/lib/feature/unicode_strings.t +++ b/lib/feature/unicode_strings.t @@ -225,7 +225,7 @@ for my $i (0 .. 255) { # With the legacy, nothing above 128 should be in the # class - if ($i >= 128) { + if (ord_native_to_latin1($i) >= 128) { $expect_success = 0; $expect_success = ! $expect_success if $complement; $expect_success = ! $expect_success if $complement_class; @@ -259,7 +259,7 @@ for my $i (0 .. 255) { no feature 'unicode_strings'; $prefix = "no uni8bit; Verify $string"; - if ($i >= 128) { + if (ord_native_to_latin1($i) >= 128) { $expect_success = 1; $expect_success = ! $expect_success if $complement; } diff --git a/lib/strict.t b/lib/strict.t index e067793..612f99c 100644 --- a/lib/strict.t +++ b/lib/strict.t @@ -2,6 +2,8 @@ chdir 't' if -d 't'; @INC = '../lib'; + print "1..0 # Skip: for now\n"; + exit 0; our $local_tests = 4; require "../t/lib/common.pl"; diff --git a/regcomp.c b/regcomp.c index 2bc5108..aa9ea82 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2821,6 +2821,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b length sequence we are looking for is 2 */ { int count = 0; +#define is_MULTI_CHAR_FOLD_utf8_safe(s, s_end) (0) int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); if (! len) { /* Not a multi-char fold: get next char */ s += UTF8SKIP(s); @@ -2918,6 +2919,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b const U8 s_masked = 's' & S_or_s_mask; while (s < upper) { +#define is_MULTI_CHAR_FOLD_latin1_safe(s, s_end) (0) int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); if (! len) { /* Not a multi-char fold. */ if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF) @@ -11980,6 +11982,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* In a range, counts how many 0-2 of the ends of it came from literals, * not escapes. Thus we can tell if 'A' was input vs \x{C1} */ UV literal_endpoint = 0; + allow_multi_folds = FALSE; /* XXX */ #endif bool invert = FALSE; /* Is this class to be complemented */ diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index aafee3a..a2efc35 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -329,6 +329,13 @@ sub new { my $str= $txt; if ( $str =~ /^[""]/ ) { $str= eval $str; + if (! ASCII_PLATFORM) { # Convert string to native + my $native_str = ""; + for my $ch ( split //, $str ) { + $native_str .= chr utf8::unicode_to_native(ord $ch); + } + $str = $native_str; + } } elsif ($str =~ / - /x ) { # A range: Replace this element on the # list with its expansion my ($lower, $upper) = $str =~ / 0x (.+?) \s* - \s* 0x (.+) /x; @@ -1316,7 +1323,8 @@ if ( !caller ) { # # The subsequent lines give what code points go into the class defined by the # macro. Multiple characters may be specified via a string like "\x0D\x0A", -# enclosed in quotes. Otherwise the lines consist of one of: +# enclosed in quotes. The characters are considered to be Unicode, and will +# be translated into native. Otherwise the lines consist of one of: # 1) a single Unicode code point, prefaced by 0x # 2) a single range of Unicode code points separated by a minus (and # optional space) @@ -1491,27 +1499,27 @@ GCB_V: Grapheme_Cluster_Break=V # This hasn't been commented out, because we haven't an EBCDIC platform to run # it on, and the 3 types of EBCDIC allegedly supported by Perl would have # different results -UTF8_CHAR: Matches utf8 from 1 to 5 bytes -=> UTF8 :safe only_ebcdic_platform -0x0 - 0x3FFFFF: +#UTF8_CHAR: Matches utf8 from 1 to 5 bytes +#=> UTF8 :safe only_ebcdic_platform +#0x0 - 0x3FFFFF: QUOTEMETA: Meta-characters that \Q should quote => high :fast \p{_Perl_Quotemeta} -MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character -=> UTF8 :safe -do regen/regcharclass_multi_char_folds.pl - -# 1 => All folds -®charclass_multi_char_folds::multi_char_folds(1) - -MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character -=> LATIN1 :safe - -®charclass_multi_char_folds::multi_char_folds(0) -# 0 => Latin1-only - PATWS: pattern white space => generic generic_non_low cp : fast safe \p{PatWS} + +#MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character +#=> UTF8 :safe +#do regen/regcharclass_multi_char_folds.pl +# +## 1 => All folds +#®charclass_multi_char_folds::multi_char_folds(1) +# +#MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character +#=> LATIN1 :safe +# +#®charclass_multi_char_folds::multi_char_folds(0) +## 0 => Latin1-only diff --git a/t/lib/charnames/alias b/t/lib/charnames/alias index b8786db..cffac3a 100644 --- a/t/lib/charnames/alias +++ b/t/lib/charnames/alias @@ -18,7 +18,7 @@ Here: 1 ######## # NAME autoload doesn't get viacode print "Here: \N{DIGIT THREE}\n"; -charnames::viacode(0x34); +charnames::viacode(utf8::unicode_to_native(0x34)); EXPECT OPTIONS regex Undefined subroutine &charnames::viacode called at - line \d+. @@ -327,7 +327,7 @@ use warnings; no warnings 'void'; use charnames (); charnames::vianame('SPACE'); -charnames::viacode(0x41); +charnames::viacode(utf8::unicode_to_native(0x41)); EXPECT OPTIONS regex $ @@ -335,7 +335,7 @@ $ # NAME no extraneous warning [perl #11560] use warnings; use charnames (); -print charnames::viacode(0x80), "\n"; +print charnames::viacode(utf8::unicode_to_native(0x80)), "\n"; EXPECT OPTIONS regex PADDING CHARACTER diff --git a/t/re/fold_grind.t b/t/re/fold_grind.t index 3267336..ebbbdc5 100644 --- a/t/re/fold_grind.t +++ b/t/re/fold_grind.t @@ -66,7 +66,7 @@ my $skip_apparently_redundant = ! $ENV{PERL_RUN_SLOW_TESTS}; sub range_type { my $ord = ord shift; - return $ASCII if $ord < 128; + return $ASCII if ord_native_to_latin1($ord) < 128; return $Latin1 if $ord < 256; return $Unicode; } @@ -373,8 +373,8 @@ foreach my $to (sort { (length $a == length $b) } # For each range type, test additionally a character that folds to itself -add_test(chr 0x3A, chr 0x3A); -add_test(chr 0xF7, chr 0xF7); +add_test(":", chr ":"); +add_test(chr ord_latin1_to_native(0xF7), chr ord_latin1_to_native(0xF7)); add_test(chr 0x2C7, chr 0x2C7); # To cut down on the number of tests @@ -416,7 +416,7 @@ if($Config{d_setlocale}) { # legal, but since we don't know what the right answers should be, # skip the locale tests in that situation. for my $i (128 .. 255) { - my $char = chr($i); + my $char = chr(ord_latin1_to_native($i)); goto untestable_locale if uc($char) ne $char || lc($char) ne $char; } push @charsets, 'l'; @@ -466,13 +466,13 @@ foreach my $test (sort { numerically } keys %tests) { my $target_above_latin1 = grep { $_ > 255 } @target; my $pattern_above_latin1 = grep { $_ > 255 } @pattern; - my $target_has_ascii = grep { $_ < 128 } @target; - my $pattern_has_ascii = grep { $_ < 128 } @pattern; - my $target_only_ascii = ! grep { $_ > 127 } @target; - my $pattern_only_ascii = ! grep { $_ > 127 } @pattern; + my $target_has_ascii = grep { ord_native_to_latin1($_ < 128) } @target; + my $pattern_has_ascii = grep { ord_native_to_latin1($_ < 128) } @pattern; + my $target_only_ascii = ! grep { ord_native_to_latin1($_ > 127) } @target; + my $pattern_only_ascii = ! grep { ord_native_to_latin1($_ > 127) } @pattern; my $target_has_latin1 = grep { $_ < 256 } @target; - my $target_has_upper_latin1 = grep { $_ < 256 && $_ > 127 } @target; - my $pattern_has_upper_latin1 = grep { $_ < 256 && $_ > 127 } @pattern; + my $target_has_upper_latin1 = grep { $_ < 256 && $_ > ord_native_to_latin1(127) } @target; + my $pattern_has_upper_latin1 = grep { $_ < 256 && $_ > ord_native_to_latin1(127) } @pattern; my $pattern_has_latin1 = grep { $_ < 256 } @pattern; my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0]; @@ -614,7 +614,7 @@ foreach my $test (sort { numerically } keys %tests) { my $lhs_str = eval qq{"$lhs"}; fail($@) if $@; my @rhs = @x_pattern; my $rhs = join "", @rhs; - my $should_fail = (! $uni_semantics && $ord >= 128 && $ord < 256 && ! $is_self) + my $should_fail = (! $uni_semantics && ord_native_to_latin1($ord) >= 128 && $ord < 256 && ! $is_self) || ($charset eq 'aa' && $target_has_ascii != $pattern_has_ascii) || ($charset eq 'l' && $target_has_latin1 != $pattern_has_latin1); diff --git a/t/uni/class.t b/t/uni/class.t index 144ae43..03dfb16 100644 --- a/t/uni/class.t +++ b/t/uni/class.t @@ -6,7 +6,8 @@ BEGIN { plan tests => 11; -my $str = join "", map latin1_to_native(chr($_)), 0x20 .. 0x6F; +# subs are wrong +my $str = join "", map { chr $_ }, 0x20 .. 0x6F; is(($str =~ /(\p{IsMyUniClass}+)/)[0], '0123456789:;<=>?@ABCDEFGHIJKLMNO', 'user-defined class compiled before defined'); -- Perl5 Master Repository
