In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/fc962064cacbf4393def110b51a7bac805d9c3be?hp=49c4aee9770d41c2fd7866800ef51cfa28e02b58>
- Log ----------------------------------------------------------------- commit fc962064cacbf4393def110b51a7bac805d9c3be Author: Karl Williamson <[email protected]> Date: Mon Mar 2 22:07:45 2015 -0700 DBM_Filter/t/encode.t: temporarily skip until Encode fixed M lib/DBM_Filter/t/encode.t commit f5b27708c2015c319d0178eda79bef9baeaa22a1 Author: Karl Williamson <[email protected]> Date: Mon Mar 2 21:31:07 2015 -0700 porting/readme.t: TODO failing EBCDIC test This depends on Unicode::Collate, which is not yet working properly in EBCDIC M t/porting/readme.t commit 8944ce7878105d0b5c56b840db0a95a593e01244 Author: Karl Williamson <[email protected]> Date: Sat Dec 6 23:08:38 2014 -0700 ext/SDBM_File/sdbm/dbu.c Generalize for EBCDIC platforms This also fixed a bug which hasn't shown up in the tests, in that it uses 'char' where it should be 'U8'. M ext/SDBM_File/dbu.c commit 89ad707a5b059d12b8c7715313147fabda58d12f Author: Karl Williamson <[email protected]> Date: Tue Mar 17 22:03:16 2015 -0600 regexec.c: Fix improper warning. \b{} and \B{} are valid in UTF-8 locales, as all the Unicode rules apply. Prior to this patch a warning was raised under some circumstances. The warning text was generalized to handle both \b and \B cases. The original text was only just added, in 5.21.9. M regexec.c M t/lib/warnings/regexec commit a78e2a979dade2d426dbeef8214a0f27676be887 Author: Karl Williamson <[email protected]> Date: Mon Mar 16 15:52:18 2015 -0600 re/pat_advanced.t: Tighten test This adds anchors to a pattern. I discovered while changing things that it still passed when broken M t/re/pat_advanced.t commit c440a570f986f52b764752007c070e8549b2bf7e Author: Karl Williamson <[email protected]> Date: Tue Mar 17 16:56:34 2015 -0600 regcomp.sym: Update \b descriptions M pod/perldebguts.pod M regcomp.sym M regnodes.h commit b8cae652c696ff805cc1c46872c4aa89444dd1e8 Author: Karl Williamson <[email protected]> Date: Tue Mar 17 15:44:03 2015 -0600 PATCH: [perl #124091] PP Data::Dumper fails on \n isolate Commit 31ac59b61698e704b64192de74793793f4b5b0c0 inadvertently changed the behavior of the pure perl version of Data::Dumper. If a newline is the sole character in something being dumped with useqq, it no longer got translated into a \n sequence and was output raw. This was due to the regex matching of \n at beginning and ends of strings. M dist/Data-Dumper/Dumper.pm M dist/Data-Dumper/t/dumper.t ----------------------------------------------------------------------- Summary of changes: dist/Data-Dumper/Dumper.pm | 2 +- dist/Data-Dumper/t/dumper.t | 11 ++++++++++- ext/SDBM_File/dbu.c | 26 ++++++++++++++++++-------- lib/DBM_Filter/t/encode.t | 5 +++++ pod/perldebguts.pod | 28 +++++++++++++--------------- regcomp.sym | 14 +++++++------- regexec.c | 10 +++++++--- regnodes.h | 14 +++++++------- t/lib/warnings/regexec | 31 +++++++++++++++++++++++++++++-- t/porting/readme.t | 2 ++ t/re/pat_advanced.t | 2 +- 11 files changed, 100 insertions(+), 45 deletions(-) diff --git a/dist/Data-Dumper/Dumper.pm b/dist/Data-Dumper/Dumper.pm index 0ea2e77..e884298 100644 --- a/dist/Data-Dumper/Dumper.pm +++ b/dist/Data-Dumper/Dumper.pm @@ -761,7 +761,7 @@ sub qquote { # this. || (! $IS_ASCII && $] ge 5.008_001 && utf8::is_utf8($_)); - return qq("$_") if / ^ [[:print:]]* $ /x; # fast exit + return qq("$_") unless /[[:^print:]]/; # fast exit if only printables # Here, there is at least one non-printable to output. First, translate the # escapes. diff --git a/dist/Data-Dumper/t/dumper.t b/dist/Data-Dumper/t/dumper.t index fa3ce97..14f92dd 100644 --- a/dist/Data-Dumper/t/dumper.t +++ b/dist/Data-Dumper/t/dumper.t @@ -108,7 +108,7 @@ sub SKIP_TEST { ++$TNUM; print "ok $TNUM # skip $reason\n"; } -$TMAX = 444; +$TMAX = 450; # Force Data::Dumper::Dump to use perl. We test Dumpxs explicitly by calling # it direct. Out here it lets us knobble the next if to test that the perl @@ -1746,3 +1746,12 @@ EOT TEST (q(Data::Dumper::DumperX($foo)), 'EBCDIC outlier control: DumperX') if $XS; } } +############# [perl #124091] +{ + $WANT = <<'EOT'; +#$VAR1 = "\n"; +EOT + local $Data::Dumper::Useqq = 1; + TEST (qq(Dumper("\n")), '\n alone'); + TEST (qq(Data::Dumper::DumperX("\n")), '\n alone') if $XS; +} diff --git a/ext/SDBM_File/dbu.c b/ext/SDBM_File/dbu.c index d861c0f..4631d40 100644 --- a/ext/SDBM_File/dbu.c +++ b/ext/SDBM_File/dbu.c @@ -224,19 +224,29 @@ static void prdatum(FILE *stream, datum d) { int c; - char *p = d.dptr; + U8 *p = (U8 *) d.dptr; int n = d.dsize; while (n--) { - c = *p++ & 0377; + c = *p++; +#ifndef EBCDIC /* Meta notation doesn't make sense on EBCDIC systems*/ if (c & 0200) { - fprintf(stream, "M-"); - c &= 0177; + fprintf(stream, "M-"); + c &= 0177; } - if (c == 0177 || c < ' ') - fprintf(stream, "^%c", (c == 0177) ? '?' : c + '@'); - else - putc(c, stream); +#endif + /* \c notation applies for \0 . \x1f, plus \c? */ + if (c <= 0x1F || c == QUESTION_MARK_CTRL) { + fprintf(stream, "^%c", toCTRL(c)); + } +#ifdef EBCDIC /* Instead of meta, use \x{} for non-printables */ + else if (! isPRINT_A(c)) { + fprintf(stream, "\\x{%02x}", c); + } +#endif + else { /* must be an ASCII printable */ + putc(c, stream); + } } } diff --git a/lib/DBM_Filter/t/encode.t b/lib/DBM_Filter/t/encode.t index 35f501a..37a58ac 100644 --- a/lib/DBM_Filter/t/encode.t +++ b/lib/DBM_Filter/t/encode.t @@ -76,6 +76,10 @@ VerifyData(\%h1, eval { $db1->Filter_Pop() }; is $@, '', "pop the 'utf8' filter" ; +SKIP: { + skip "Encode doesn't currently work for most filters on EBCDIC, including 8859-16", 11 if $::IS_EBCDIC || $::IS_EBCDIC; + # Actually the only thing failing below is the euro, because that's the + # only thing that's added in 8859-16. eval { $db1->Filter_Push('encode' => 'iso-8859-16') }; is $@, '', "push an 'encode' filter (specify iso-8859-16)" ; @@ -114,3 +118,4 @@ undef $db2; is $@, '', "untie without inner references" ; } +} diff --git a/pod/perldebguts.pod b/pod/perldebguts.pod index 591e69b..2b5561d 100644 --- a/pod/perldebguts.pod +++ b/pod/perldebguts.pod @@ -572,24 +572,22 @@ will be lost. GPOS no Matches where last m//g left off. # Word Boundary Opcodes: - BOUND no Match "" at any word boundary using native - charset rules for non-utf8, otherwise - Unicode rules - BOUNDL no Match "" at any boundary of a given type - using locale rules + BOUND no Like BOUNDA for non-utf8, otherwise match "" + between any Unicode \w\W or \W\w + BOUNDL no Like BOUND/BOUNDU, but \w and \W are defined + by current locale BOUNDU no Match "" at any boundary of a given type using Unicode rules - BOUNDA no Match "" at any boundary of a given type - using ASCII rules - NBOUND no Match "" at any word non-boundary using - native charset rules for non-utf8, otherwise - Unicode rules - NBOUNDL no Match "" at any boundary of a given type - using locale rules - NBOUNDU no Match "" at any boundary of a given type + BOUNDA no Match "" at any boundary between \w\W or + \W\w, where \w is [_a-zA-Z0-9] + NBOUND no Like NBOUNDA for non-utf8, otherwise match + "" between any Unicode \w\w or \W\W + NBOUNDL no Like NBOUND/NBOUNDU, but \w and \W are + defined by current locale + NBOUNDU no Match "" at any non-boundary of a given type using using Unicode rules - NBOUNDA no Match "" at any boundary of a given type - using using ASCII rules + NBOUNDA no Match "" betweeen any \w\w or \W\W, where \w + is [_a-zA-Z0-9] # [Special] alternatives: REG_ANY no Match any one character (except newline). diff --git a/regcomp.sym b/regcomp.sym index 7daa241..f79b874 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -43,15 +43,15 @@ GPOS GPOS, no ; Matches where last m//g left off. # in regcomp.c uses the enum value of the modifier as an offset from the /d # version. The complements must come after the non-complements. # BOUND, POSIX and their complements are affected, as well as EXACTF. -BOUND BOUND, no ; Match "" at any word boundary using native charset rules for non-utf8, otherwise Unicode rules -BOUNDL BOUND, no ; Match "" at any boundary of a given type using locale rules +BOUND BOUND, no ; Like BOUNDA for non-utf8, otherwise match "" between any Unicode \w\W or \W\w +BOUNDL BOUND, no ; Like BOUND/BOUNDU, but \w and \W are defined by current locale BOUNDU BOUND, no ; Match "" at any boundary of a given type using Unicode rules -BOUNDA BOUND, no ; Match "" at any boundary of a given type using ASCII rules +BOUNDA BOUND, no ; Match "" at any boundary between \w\W or \W\w, where \w is [_a-zA-Z0-9] # All NBOUND nodes are required by code in regexec.c to be greater than all BOUND ones -NBOUND NBOUND, no ; Match "" at any word non-boundary using native charset rules for non-utf8, otherwise Unicode rules -NBOUNDL NBOUND, no ; Match "" at any boundary of a given type using locale rules -NBOUNDU NBOUND, no ; Match "" at any boundary of a given type using using Unicode rules -NBOUNDA NBOUND, no ; Match "" at any boundary of a given type using using ASCII rules +NBOUND NBOUND, no ; Like NBOUNDA for non-utf8, otherwise match "" between any Unicode \w\w or \W\W +NBOUNDL NBOUND, no ; Like NBOUND/NBOUNDU, but \w and \W are defined by current locale +NBOUNDU NBOUND, no ; Match "" at any non-boundary of a given type using using Unicode rules +NBOUNDA NBOUND, no ; Match "" betweeen any \w\w or \W\W, where \w is [_a-zA-Z0-9] #* [Special] alternatives: REG_ANY REG_ANY, no 0 S ; Match any one character (except newline). diff --git a/regexec.c b/regexec.c index 5fb7288..cd03a4a 100644 --- a/regexec.c +++ b/regexec.c @@ -38,7 +38,7 @@ #endif #define B_ON_NON_UTF8_LOCALE_IS_WRONG \ - "Use of \\b{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale" + "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale" /* * pregcomp and pregexec -- regsub and regerror are not used in perl @@ -2004,8 +2004,10 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case BOUNDL: _CHECK_AND_WARN_PROBLEMATIC_LOCALE; if (FLAGS(c) != TRADITIONAL_BOUND) { - Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), + if (! IN_UTF8_CTYPE_LOCALE) { + Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), B_ON_NON_UTF8_LOCALE_IS_WRONG); + } goto do_boundu; } @@ -2015,8 +2017,10 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case NBOUNDL: _CHECK_AND_WARN_PROBLEMATIC_LOCALE; if (FLAGS(c) != TRADITIONAL_BOUND) { - Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), + if (! IN_UTF8_CTYPE_LOCALE) { + Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), B_ON_NON_UTF8_LOCALE_IS_WRONG); + } goto do_nboundu; } diff --git a/regnodes.h b/regnodes.h index 144d6f6..3c9b991 100644 --- a/regnodes.h +++ b/regnodes.h @@ -19,14 +19,14 @@ #define MEOL 5 /* 0x05 Same, assuming multiline: /$/m */ #define EOS 6 /* 0x06 Match "" at end of string: /\z/ */ #define GPOS 7 /* 0x07 Matches where last m//g left off. */ -#define BOUND 8 /* 0x08 Match "" at any word boundary using native charset rules for non-utf8, otherwise Unicode rules */ -#define BOUNDL 9 /* 0x09 Match "" at any boundary of a given type using locale rules */ +#define BOUND 8 /* 0x08 Like BOUNDA for non-utf8, otherwise match "" between any Unicode \w\W or \W\w */ +#define BOUNDL 9 /* 0x09 Like BOUND/BOUNDU, but \w and \W are defined by current locale */ #define BOUNDU 10 /* 0x0a Match "" at any boundary of a given type using Unicode rules */ -#define BOUNDA 11 /* 0x0b Match "" at any boundary of a given type using ASCII rules */ -#define NBOUND 12 /* 0x0c Match "" at any word non-boundary using native charset rules for non-utf8, otherwise Unicode rules */ -#define NBOUNDL 13 /* 0x0d Match "" at any boundary of a given type using locale rules */ -#define NBOUNDU 14 /* 0x0e Match "" at any boundary of a given type using using Unicode rules */ -#define NBOUNDA 15 /* 0x0f Match "" at any boundary of a given type using using ASCII rules */ +#define BOUNDA 11 /* 0x0b Match "" at any boundary between \w\W or \W\w, where \w is [_a-zA-Z0-9] */ +#define NBOUND 12 /* 0x0c Like NBOUNDA for non-utf8, otherwise match "" between any Unicode \w\w or \W\W */ +#define NBOUNDL 13 /* 0x0d Like NBOUND/NBOUNDU, but \w and \W are defined by current locale */ +#define NBOUNDU 14 /* 0x0e Match "" at any non-boundary of a given type using using Unicode rules */ +#define NBOUNDA 15 /* 0x0f Match "" betweeen any \w\w or \W\W, where \w is [_a-zA-Z0-9] */ #define REG_ANY 16 /* 0x10 Match any one character (except newline). */ #define SANY 17 /* 0x11 Match any one character. */ #define CANY 18 /* 0x12 Match any one byte. */ diff --git a/t/lib/warnings/regexec b/t/lib/warnings/regexec index d956cb8..b62ff6e 100644 --- a/t/lib/warnings/regexec +++ b/t/lib/warnings/regexec @@ -160,5 +160,32 @@ setlocale(&POSIX::LC_CTYPE, "C"); no warnings 'locale'; "a" =~ /\b{gcb}/l; EXPECT -Use of \b{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 8. -Use of \b{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 8. +Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 8. +Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 8. +######## +# NAME \b{} in UTF-8 locale +require '../loc_tools.pl'; +unless (locales_enabled()) { + print("SKIPPED\n# locales not available\n"),exit; +} +eval { require POSIX; POSIX->import("locale_h") }; +if ($@) { + print("SKIPPED\n# no POSIX\n"),exit; +} +my $utf8_locale = find_utf8_ctype_locale(); +unless ($utf8_locale) { + print("SKIPPED\n# No UTF-8 locale available\n"),exit; +} +use warnings 'locale'; +use locale; +setlocale(&POSIX::LC_CTYPE, "C"); + "abc def" =~ /\b{wb}.*?/; + "abc def" =~ /\B{wb}.*?/; +setlocale(&POSIX::LC_CTYPE, $utf8_locale); + "abc def" =~ /\b{wb}.*?/; + "abc def" =~ /\B{wb}.*?/; +EXPECT +Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 16. +Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 16. +Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 17. +Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 17. diff --git a/t/porting/readme.t b/t/porting/readme.t index 85d044e..e127920 100644 --- a/t/porting/readme.t +++ b/t/porting/readme.t @@ -52,6 +52,8 @@ eval { }; if(@sorted_order) { + local $::TODO; + $::TODO = "Unicode::Collate not working on EBCDIC" if $::IS_EBCDIC || $::IS_EBCDIC; ok(eq_array(\@current_order, \@sorted_order), "Files are referenced in order") or print_right_order(); } diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t index fa324fd..3eaad63 100644 --- a/t/re/pat_advanced.t +++ b/t/re/pat_advanced.t @@ -995,7 +995,7 @@ sub run_tests { # my $w; local $SIG {__WARN__} = sub {$w .= "@_"}; - $result = eval 'q(WARN) =~ /[\N{WARN}]/'; + $result = eval 'q(WARN) =~ /^[\N{WARN}]$/'; ok !$@ && $result && ! $w, '\N{} returning multi-char works'; undef $w; -- Perl5 Master Repository
