In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/75697d6e4ef98ece405210de48e7529d01b619bf?hp=6dd6e9f9592f7349aa8ad652821e3faf61292a83>
- Log ----------------------------------------------------------------- commit 75697d6e4ef98ece405210de48e7529d01b619bf Author: Karl Williamson <[email protected]> Date: Sun Dec 21 22:02:30 2014 -0700 Empty \N{} in regex pattern should force /d to /u \N{} is for Unicode names, even if the name is actually omitted. (Accepting an empty name is, I believe, an accident, and now is supported only for backwards compatibility.) M regcomp.c M t/re/re_tests commit ec98ebe8f6cc848416a16d83adfc2b850b3348b3 Author: Karl Williamson <[email protected]> Date: Sun Dec 21 21:47:04 2014 -0700 regcomp.c: comment and white-space changes only M regcomp.c commit 2f3cbe108ec853d7b2a6f7baace652b3fc6926e7 Author: Karl Williamson <[email protected]> Date: Tue Dec 30 20:49:25 2014 -0700 warnings.pm: Fix too long verbatim lines By not indentins verbatim text so much, we don't run over 79 columns. M lib/warnings.pm M regen/warnings.pl commit bdafd784ddf8cb8a42f22ebdd4e7424673d02f24 Author: Karl Williamson <[email protected]> Date: Tue Dec 30 20:48:26 2014 -0700 perlre: Fix too long verbatim line M pod/perlre.pod commit dd405ed7fddaa1954823f2ea01dd54e7f3730882 Author: Karl Williamson <[email protected]> Date: Tue Dec 30 20:50:39 2014 -0700 lib/B/Deparse.pm: refactor a hash slightly Two of the three uses of this hash want the result to be of the form "\cX". The other wants "^X". This changes the hash to be the common substring to all three, and then the proper prefix is added to each. M lib/B/Deparse.pm commit 784251768d925d81ed2e47a2e091b1fb0bf1324e Author: Karl Williamson <[email protected]> Date: Tue Dec 30 14:13:34 2014 -0700 lib/B/Deparse.pm: Add comment M lib/B/Deparse.pm commit 02d26cb6032ab676fc80db99d9d278d0ca9eb7b8 Author: Karl Williamson <[email protected]> Date: Tue Dec 30 14:04:10 2014 -0700 lib/B/Deparse.pm: Generalize for non-ASCII platforms This makes ASCII platform-specific code generalized to non-ASCII. M lib/B/Deparse.pm commit 74b899ae3030e8dddc4750668c41c5be0a84e832 Author: Karl Williamson <[email protected]> Date: Tue Dec 30 14:09:40 2014 -0700 lib/B/Deparse.pm: Output WARNING_BITS in binary This binary value was being output as just another string, which would cause the bit patterns that coincidentally coincided with letters to be output as those. This is not portable to EBCDIC, but outputting it as \xXX is, which this commit does. I chose to output in hex instead of octal, as I think that is the more modern thing to do, and it's easier for me to grok the larger values when they are in hex. M lib/B/Deparse.pm M lib/B/Deparse.t commit 93e453e286b6102e2a477a52fb042e665fcfecf7 Author: Karl Williamson <[email protected]> Date: Tue Dec 30 13:55:42 2014 -0700 lib/B/Deparse.pm: Move hash to earlier in file No other change besides the move is done. This is so the hash can be used from another place than currently. M lib/B/Deparse.pm commit a0879bf8a3066bd055931dcc51f2c4f80ec01ffb Author: Karl Williamson <[email protected]> Date: Mon Dec 29 13:57:10 2014 -0700 perlpod: Latin1 pods need an =encoding M pod/perlpod.pod ----------------------------------------------------------------------- Summary of changes: lib/B/Deparse.pm | 89 +++++++++--------- lib/B/Deparse.t | 12 +-- lib/warnings.pm | 270 +++++++++++++++++++++++++++--------------------------- pod/perlpod.pod | 10 +- pod/perlre.pod | 3 +- regcomp.c | 19 ++-- regen/warnings.pl | 4 +- t/re/re_tests | 1 + 8 files changed, 210 insertions(+), 198 deletions(-) diff --git a/lib/B/Deparse.pm b/lib/B/Deparse.pm index 703c1e0..c496c8a 100644 --- a/lib/B/Deparse.pm +++ b/lib/B/Deparse.pm @@ -1720,6 +1720,41 @@ sub stash_variable { return $prefix . $self->maybe_qualify($prefix, $name); } +my %unctrl = # portable to EBCDIC + ( + "\c@" => '@', # unused + "\cA" => 'A', + "\cB" => 'B', + "\cC" => 'C', + "\cD" => 'D', + "\cE" => 'E', + "\cF" => 'F', + "\cG" => 'G', + "\cH" => 'H', + "\cI" => 'I', + "\cJ" => 'J', + "\cK" => 'K', + "\cL" => 'L', + "\cM" => 'M', + "\cN" => 'N', + "\cO" => 'O', + "\cP" => 'P', + "\cQ" => 'Q', + "\cR" => 'R', + "\cS" => 'S', + "\cT" => 'T', + "\cU" => 'U', + "\cV" => 'V', + "\cW" => 'W', + "\cX" => 'X', + "\cY" => 'Y', + "\cZ" => 'Z', + "\c[" => '[', # unused + "\c\\" => '\\', # unused + "\c]" => ']', # unused + "\c_" => '_', # unused + ); + # Return just the name, without the prefix. It may be returned as a quoted # string. The second return value is a boolean indicating that. sub stash_variable_name { @@ -1727,7 +1762,7 @@ sub stash_variable_name { my $name = $self->gv_name($gv, 1); $name = $self->maybe_qualify($prefix,$name); if ($name =~ /^(?:\S|(?!\d)[\ca-\cz]?(?:\w|::)*|\d+)\z/) { - $name =~ s/^([\ca-\cz])/'^'.($1|'@')/e; + $name =~ s/^([\ca-\cz])/'^' . $unctrl{$1}/e; $name =~ /^(\^..|{)/ and $name = "{$name}"; return $name, 0; # not quoted } @@ -1991,7 +2026,9 @@ sub declare_warnings { elsif (($to & WARN_MASK) eq ("\0"x length($to) & WARN_MASK)) { return $self->keyword("no") . " warnings;\n"; } - return "BEGIN {\${^WARNING_BITS} = ".perlstring($to)."}\n\cK"; + return "BEGIN {\${^WARNING_BITS} = \"" + . join("", map { sprintf("\\x%02x", ord $_) } split "", $to) + . "\"}\n\cK"; } sub declare_hints { @@ -4553,53 +4590,19 @@ sub re_uninterp { } } -my %unctrl = # portable to EBCDIC - ( - "\c@" => '\c@', # unused - "\cA" => '\cA', - "\cB" => '\cB', - "\cC" => '\cC', - "\cD" => '\cD', - "\cE" => '\cE', - "\cF" => '\cF', - "\cG" => '\cG', - "\cH" => '\cH', - "\cI" => '\cI', - "\cJ" => '\cJ', - "\cK" => '\cK', - "\cL" => '\cL', - "\cM" => '\cM', - "\cN" => '\cN', - "\cO" => '\cO', - "\cP" => '\cP', - "\cQ" => '\cQ', - "\cR" => '\cR', - "\cS" => '\cS', - "\cT" => '\cT', - "\cU" => '\cU', - "\cV" => '\cV', - "\cW" => '\cW', - "\cX" => '\cX', - "\cY" => '\cY', - "\cZ" => '\cZ', - "\c[" => '\c[', # unused - "\c\\" => '\c\\', # unused - "\c]" => '\c]', # unused - "\c_" => '\c_', # unused - ); - # character escapes, but not delimiters that might need to be escaped sub escape_str { # ASCII, UTF8 my($str) = @_; $str =~ s/(.)/ord($1) > 255 ? sprintf("\\x{%x}", ord($1)) : $1/eg; $str =~ s/\a/\\a/g; -# $str =~ s/\cH/\\b/g; # \b means something different in a regex +# $str =~ s/\cH/\\b/g; # \b means something different in a regex; and \cH + # isn't a backspace in EBCDIC $str =~ s/\t/\\t/g; $str =~ s/\n/\\n/g; $str =~ s/\e/\\e/g; $str =~ s/\f/\\f/g; $str =~ s/\r/\\r/g; - $str =~ s/([\cA-\cZ])/$unctrl{$1}/ge; + $str =~ s/([\cA-\cZ])/'\\c' . $unctrl{$1}/ge; $str =~ s/([[:^print:]])/sprintf("\\%03o", ord($1))/age; return $str; } @@ -4993,7 +4996,11 @@ sub pchr { # ASCII return '\\\\'; } elsif ($n == ord "-") { return "\\-"; - } elsif ($n >= ord(' ') and $n <= ord('~')) { + } elsif (utf8::native_to_unicode($n) >= utf8::native_to_unicode(ord(' ')) + and utf8::native_to_unicode($n) <= utf8::native_to_unicode(ord('~'))) + { + # I'm presuming a regex is not ok here, otherwise we could have used + # /[[:print:]]/a to get here return chr($n); } elsif ($n == ord "\a") { return '\\a'; @@ -5010,7 +5017,7 @@ sub pchr { # ASCII } elsif ($n == ord "\r") { return '\\r'; } elsif ($n >= ord("\cA") and $n <= ord("\cZ")) { - return '\\c' . chr(ord("@") + $n); + return '\\c' . unctrl{chr $n}; } else { # return '\x' . sprintf("%02x", $n); return '\\' . sprintf("%03o", $n); diff --git a/lib/B/Deparse.t b/lib/B/Deparse.t index 74f37a7..d7b19c1 100644 --- a/lib/B/Deparse.t +++ b/lib/B/Deparse.t @@ -1857,12 +1857,12 @@ my sub f {} print f(); >>>> use feature 'lexical_subs'; -BEGIN {${^WARNING_BITS} = "TUUUUUUUUUUUUTUT\005U\001"} +BEGIN {${^WARNING_BITS} = "\x54\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x54\x55\x54\x05\x55\x01"} my sub f { - BEGIN {${^WARNING_BITS} = "TUUUUUUUUUUUUTUT\005\001"} + BEGIN {${^WARNING_BITS} = "\x54\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x54\x55\x54\x05\x01"} } -BEGIN {${^WARNING_BITS} = "TUUUUUUUUUUUUTUT\005\001"} +BEGIN {${^WARNING_BITS} = "\x54\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x54\x55\x54\x05\x01"} print f(); #### # SKIP ?$] < 5.017004 && "lexical subs not implemented on this Perl version" @@ -1873,13 +1873,13 @@ state sub f {} print f(); >>>> use feature 'lexical_subs'; -BEGIN {${^WARNING_BITS} = "TUUUUUUUUUUUUTUT\005U\001"} +BEGIN {${^WARNING_BITS} = "\x54\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x54\x55\x54\x05\x55\x01"} CORE::state sub f { - BEGIN {${^WARNING_BITS} = "TUUUUUUUUUUUUTUT\005\001"} + BEGIN {${^WARNING_BITS} = "\x54\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x54\x55\x54\x05\x01"} use feature 'state'; } -BEGIN {${^WARNING_BITS} = "TUUUUUUUUUUUUTUT\005\001"} +BEGIN {${^WARNING_BITS} = "\x54\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x54\x55\x54\x05\x01"} use feature 'state'; print f(); #### diff --git a/lib/warnings.pm b/lib/warnings.pm index 5f18b40..685c036 100644 --- a/lib/warnings.pm +++ b/lib/warnings.pm @@ -797,141 +797,141 @@ to be enabled/disabled in isolation. The current hierarchy is: - everything -+ - | - +- all ---+ - | | - | +- closure - | | - | +- deprecated - | | - | +- exiting - | | - | +- experimental --+ - | | | - | | +- experimental::autoderef - | | | - | | +- experimental::lexical_subs - | | | - | | +- experimental::lexical_topic - | | | - | | +- experimental::postderef - | | | - | | +- experimental::refaliasing - | | | - | | +- experimental::regex_sets - | | | - | | +- experimental::signatures - | | | - | | +- experimental::smartmatch - | | | - | | +- experimental::win32_perlio - | | - | +- glob - | | - | +- imprecision - | | - | +- io ------------+ - | | | - | | +- closed - | | | - | | +- exec - | | | - | | +- layer - | | | - | | +- newline - | | | - | | +- pipe - | | | - | | +- syscalls - | | | - | | +- unopened - | | - | +- locale - | | - | +- misc - | | - | +- missing - | | - | +- numeric - | | - | +- once - | | - | +- overflow - | | - | +- pack - | | - | +- portable - | | - | +- recursion - | | - | +- redefine - | | - | +- redundant - | | - | +- regexp - | | - | +- severe --------+ - | | | - | | +- debugging - | | | - | | +- inplace - | | | - | | +- internal - | | | - | | +- malloc - | | - | +- signal - | | - | +- substr - | | - | +- syntax --------+ - | | | - | | +- ambiguous - | | | - | | +- bareword - | | | - | | +- digit - | | | - | | +- illegalproto - | | | - | | +- parenthesis - | | | - | | +- precedence - | | | - | | +- printf - | | | - | | +- prototype - | | | - | | +- qw - | | | - | | +- reserved - | | | - | | +- semicolon - | | - | +- taint - | | - | +- threads - | | - | +- uninitialized - | | - | +- unpack - | | - | +- untie - | | - | +- utf8 ----------+ - | | | - | | +- non_unicode - | | | - | | +- nonchar - | | | - | | +- surrogate - | | - | +- void - | - +- extra -+ - | - +- void_unusual + everything -+ + | + +- all ---+ + | | + | +- closure + | | + | +- deprecated + | | + | +- exiting + | | + | +- experimental --+ + | | | + | | +- experimental::autoderef + | | | + | | +- experimental::lexical_subs + | | | + | | +- experimental::lexical_topic + | | | + | | +- experimental::postderef + | | | + | | +- experimental::refaliasing + | | | + | | +- experimental::regex_sets + | | | + | | +- experimental::signatures + | | | + | | +- experimental::smartmatch + | | | + | | +- experimental::win32_perlio + | | + | +- glob + | | + | +- imprecision + | | + | +- io ------------+ + | | | + | | +- closed + | | | + | | +- exec + | | | + | | +- layer + | | | + | | +- newline + | | | + | | +- pipe + | | | + | | +- syscalls + | | | + | | +- unopened + | | + | +- locale + | | + | +- misc + | | + | +- missing + | | + | +- numeric + | | + | +- once + | | + | +- overflow + | | + | +- pack + | | + | +- portable + | | + | +- recursion + | | + | +- redefine + | | + | +- redundant + | | + | +- regexp + | | + | +- severe --------+ + | | | + | | +- debugging + | | | + | | +- inplace + | | | + | | +- internal + | | | + | | +- malloc + | | + | +- signal + | | + | +- substr + | | + | +- syntax --------+ + | | | + | | +- ambiguous + | | | + | | +- bareword + | | | + | | +- digit + | | | + | | +- illegalproto + | | | + | | +- parenthesis + | | | + | | +- precedence + | | | + | | +- printf + | | | + | | +- prototype + | | | + | | +- qw + | | | + | | +- reserved + | | | + | | +- semicolon + | | + | +- taint + | | + | +- threads + | | + | +- uninitialized + | | + | +- unpack + | | + | +- untie + | | + | +- utf8 ----------+ + | | | + | | +- non_unicode + | | | + | | +- nonchar + | | | + | | +- surrogate + | | + | +- void + | + +- extra -+ + | + +- void_unusual Just like the "strict" pragma any of these categories can be combined diff --git a/pod/perlpod.pod b/pod/perlpod.pod index 2f531f9..12b156b 100644 --- a/pod/perlpod.pod +++ b/pod/perlpod.pod @@ -282,11 +282,15 @@ be for formatting as a footnote). X<=encoding> X<encoding> This command is used for declaring the encoding of a document. Most -users won't need this; but if your encoding isn't US-ASCII or Latin-1, -then put a C<=encoding I<encodingname>> command early in the document so +users won't need this; but if your encoding isn't US-ASCII, +then put a C<=encoding I<encodingname>> command very early in the document so that pod formatters will know how to decode the document. For I<encodingname>, use a name recognized by the L<Encode::Supported> -module. Examples: +module. Some pod formatters may try to guess between a Latin-1 versus +UTF-8 encoding, but they may guess wrong. It's best to be explicit if +you use anything besides strict ASCII. Examples: + + =encoding latin1 =encoding utf8 diff --git a/pod/perlre.pod b/pod/perlre.pod index ff8cb18..8ed5eca 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -120,7 +120,8 @@ C</n> can be negated on a per-group basis. Alternatively, named captures may still be used. "hello" =~ /(?-n:(hi|hello))/n; # $1 is "hello" - "hello" =~ /(?<greet>hi|hello)/n; # $1 is "hello", $+{greet} is "hello" + "hello" =~ /(?<greet>hi|hello)/n; # $1 is "hello", $+{greet} is + # "hello" =item Other Modifiers diff --git a/regcomp.c b/regcomp.c index 30a94dc..c2521a9 100644 --- a/regcomp.c +++ b/regcomp.c @@ -11083,8 +11083,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, <substitute_parse> on success. If <valuep> is non-null, it means the caller can accept an input sequence - consisting of a just a single code point; <*valuep> is set to the value - of the only or first code point in the input. + consisting of just a single code point; <*valuep> is set to the value of the + only or first code point in the input. If <substitute_parse> is non-null, it means the caller can accept an input sequence consisting of one or more code points; <*substitute_parse> is a @@ -11164,17 +11164,18 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, RExC_parse++; /* Skip past the '{' */ - if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */ + if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */ || ! (endbrace == RExC_parse /* nothing between the {} */ - || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below - */ - && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) - */ + || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked... */ + && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better + error msg) */ { if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */ vFAIL("\\N{NAME} must be resolved by the lexer"); } + RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */ + if (endbrace == RExC_parse) { /* empty: \N{} */ if (node_p) { *node_p = reg_node(pRExC_state,NOTHING); @@ -11186,7 +11187,6 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, return 0; } - RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */ RExC_parse += 2; /* Skip past the 'U+' */ endchar = RExC_parse + strcspn(RExC_parse, ".}"); @@ -11196,7 +11196,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, has_multiple_chars = (endchar < endbrace); /* We get the first code point if we want it, and either there is only one, - * or we can accept both cases of one and more than one */ + * or we can accept both cases of one and there is more than one */ if (valuep && (substitute_parse || ! has_multiple_chars)) { STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse); I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES @@ -11245,7 +11245,6 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, } { - /* What is done here is to convert this to a sub-pattern of the form * \x{char1}\x{char2}... * and then either return it in <*substitute_parse> if non-null; or diff --git a/regen/warnings.pl b/regen/warnings.pl index 949617f..a6b8c02 100644 --- a/regen/warnings.pl +++ b/regen/warnings.pl @@ -295,7 +295,7 @@ sub mkOct if (@ARGV && $ARGV[0] eq "tree") { - print warningsTree($tree, " ") ; + print warningsTree($tree, " ") ; exit ; } @@ -475,7 +475,7 @@ print $pm '$LAST_BIT = ' . "$index ;\n" ; print $pm '$BYTES = ' . "$warn_size ;\n" ; while (<DATA>) { if ($_ eq "=for warnings.pl tree-goes-here\n") { - print $pm warningsTree($tree, " "); + print $pm warningsTree($tree, " "); next; } print $pm $_ ; diff --git a/t/re/re_tests b/t/re/re_tests index 0341f77..dcac974 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -1433,6 +1433,7 @@ foo(\h)bar foo\tbar y $1 \t # Verify that \N{U+...} forces Unicode rules /\N{U+41}\x{c1}/i a\x{e1} y $& a\x{e1} /[\N{U+41}\x{c1}]/i \x{e1} y $& \x{e1} +/\N{}\xe4/i \xc4 y $& \xc4 # Empty \N{} should change /d to /u [\s][\S] \x{a0}\x{a0} n - - # Unicode complements should not match same character -- Perl5 Master Repository
