In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/00c0cb6d254eaba165c8445a6e68686b8285b5a3?hp=0425eb330e75375b9a51ab41ef59e000b5e2df67>
- Log ----------------------------------------------------------------- commit 00c0cb6d254eaba165c8445a6e68686b8285b5a3 Author: David Golden <[email protected]> Date: Sat Jul 17 21:50:40 2010 -0400 Fix address of register error from 26a0cb8 M regcomp.c commit 043415659c1f373d9be6971c179dec7515bf9b21 Author: David Golden <[email protected]> Date: Sat Jul 17 16:27:21 2010 -0400 perlop.pod: document \o{} escape This is a merge resolution based on original work by Karl Williamson. M pod/perlop.pod commit f0a2b745ce6c03aec6412d79ce0b782f20eddce4 Author: Karl Williamson <k...@khw-desktop.(none)> Date: Thu Jul 15 17:28:28 2010 -0600 Add \o{} escape This commit adds the new construct \o{} to express a character constant by its octal ordinal value, along with ancillary tests and documentation. A function to handle this is added to util.c, and it is called from the 3 parsing places it could occur. The function is a candidate for in-lining, though I doubt that it will ever be used frequently. M embed.fnc M embed.h M global.sym M pod/perl5133delta.pod M pod/perldiag.pod M pod/perlre.pod M pod/perlrebackslash.pod M pod/perlretut.pod M proto.h M regcomp.c M t/lib/warnings/regcomp M t/lib/warnings/toke M t/op/qq.t M t/re/re_tests M toke.c M util.c commit 8e4698ef1ed0da722532bfcc769ba22fe85c4b47 Author: Karl Williamson <k...@khw-desktop.(none)> Date: Thu Jul 15 13:34:50 2010 -0600 perlrebackslash: Nits Signed-off-by: David Golden <[email protected]> M pod/perlrebackslash.pod commit e54859e6420b46f93358493ae1650071cd354eee Author: Karl Williamson <k...@khw-desktop.(none)> Date: Thu Jul 15 12:25:13 2010 -0600 perlreref.pod: Nits Signed-off-by: David Golden <[email protected]> M pod/perlreref.pod commit dc0d9c48f0899df34860778d88daab1e33365e30 Author: Karl Williamson <k...@khw-desktop.(none)> Date: Thu Jul 15 11:40:14 2010 -0600 perlre.pod: Nits Signed-off-by: David Golden <[email protected]> M pod/perlre.pod commit f6993e9e54e2b280f46496a9b43bee752047ce7e Author: Karl Williamson 
<k...@khw-desktop.(none)> Date: Thu Jun 24 14:41:28 2010 -0600 Nits in perlrebackslash Signed-off-by: David Golden <[email protected]> M pod/perlrebackslash.pod commit 84bb2957796edcfae3987d615d1b8f0f6495a3cf Author: Karl Williamson <k...@khw-desktop.(none)> Date: Thu Jun 24 11:00:06 2010 -0600 Add tests for \400 for "" strings, s//replacement/ Signed-off-by: David Golden <[email protected]> M t/op/qq.t M t/re/subst.t commit fa1639c581be6a27f090adf217f82a3e86ba3446 Author: Karl Williamson <k...@khw-desktop.(none)> Date: Thu Jun 24 08:21:27 2010 -0600 \400 -\777 now means the same thing in all d-quote Prior to this patch, \400 - \777 meant something different in some circumstances in regexes outside bracketed character classes. A deprecated warning message has been in place since 5.10.1 when this happens. Remove the warning, and bring the behavior into line with the other double-quotish contexts. \400 - \777 now always means the same thing as \x{100} - \x{1FF} (except when the octal forms are taken as backreferences.) Signed-off-by: David Golden <[email protected]> M pod/perl5133delta.pod M pod/perldiag.pod M pod/perlrebackslash.pod M regcomp.c M t/re/pat_rt_report.t M t/re/re_tests commit 9d8606788fa6ee1bda9ff32c9ae6693c93631733 Author: Karl Williamson <k...@khw-desktop.(none)> Date: Thu Jun 24 08:06:27 2010 -0600 Add examples to perlre on perils of not using \g{} These come from Abigail. Signed-off-by: David Golden <[email protected]> M pod/perlre.pod commit 0b70ce6f951c268ebdde6747ec43ca06875ec918 Author: Karl Williamson <k...@khw-desktop.(none)> Date: Wed Jun 23 18:53:56 2010 -0600 Remove extra blanks from t/re/re_tests, vim My vim settings were causing extra blanks in this file; remove the ones I added recently and set the vim options to disable the offending feature. 
Signed-off-by: David Golden <[email protected]> M t/re/re_tests ----------------------------------------------------------------------- Summary of changes: embed.fnc | 1 + embed.h | 2 + global.sym | 1 + pod/perl5133delta.pod | 23 ++++++++++ pod/perldiag.pod | 23 ++++++---- pod/perlop.pod | 46 ++++++++++++++------- pod/perlre.pod | 26 +++++++++--- pod/perlrebackslash.pod | 103 ++++++++++++++++++++++++++++++---------------- pod/perlreref.pod | 9 ++-- pod/perlretut.pod | 9 ++-- proto.h | 8 ++++ regcomp.c | 55 ++++++++++++++++++++----- t/lib/warnings/regcomp | 30 ++++++++++++++ t/lib/warnings/toke | 23 ++++++++++ t/op/qq.t | 8 +++- t/re/pat_rt_report.t | 11 +----- t/re/re_tests | 27 +++++++++--- t/re/subst.t | 13 +++++- toke.c | 14 ++++++ util.c | 68 ++++++++++++++++++++++++++++++- 20 files changed, 394 insertions(+), 106 deletions(-) diff --git a/embed.fnc b/embed.fnc index 8493dd7..37c7f2b 100644 --- a/embed.fnc +++ b/embed.fnc @@ -639,6 +639,7 @@ p |OP* |localize |NN OP *o|I32 lex ApdR |I32 |looks_like_number|NN SV *const sv Apd |UV |grok_bin |NN const char* start|NN STRLEN* len_p|NN I32* flags|NULLOK NV *result EXpR |char |grok_bslash_c |const char source|const bool output_warning +EXpR |char* |grok_bslash_o |NN const char* s|NN UV* uv|NN STRLEN* len|const bool output_warning Apd |UV |grok_hex |NN const char* start|NN STRLEN* len_p|NN I32* flags|NULLOK NV *result Apd |int |grok_number |NN const char *pv|STRLEN len|NULLOK UV *valuep ApdR |bool |grok_numeric_radix|NN const char **sp|NN const char *send diff --git a/embed.h b/embed.h index 8fb3cbe..fffdede 100644 --- a/embed.h +++ b/embed.h @@ -463,6 +463,7 @@ #define grok_bin Perl_grok_bin #if defined(PERL_CORE) || defined(PERL_EXT) #define grok_bslash_c Perl_grok_bslash_c +#define grok_bslash_o Perl_grok_bslash_o #endif #define grok_hex Perl_grok_hex #define grok_number Perl_grok_number @@ -2909,6 +2910,7 @@ #define grok_bin(a,b,c,d) Perl_grok_bin(aTHX_ a,b,c,d) #if defined(PERL_CORE) || defined(PERL_EXT) #define 
grok_bslash_c(a,b) Perl_grok_bslash_c(aTHX_ a,b) +#define grok_bslash_o(a,b,c,d) Perl_grok_bslash_o(aTHX_ a,b,c,d) #endif #define grok_hex(a,b,c,d) Perl_grok_hex(aTHX_ a,b,c,d) #define grok_number(a,b,c) Perl_grok_number(aTHX_ a,b,c) diff --git a/global.sym b/global.sym index aa61a69..3323815 100644 --- a/global.sym +++ b/global.sym @@ -283,6 +283,7 @@ Perl_vload_module Perl_looks_like_number Perl_grok_bin Perl_grok_bslash_c +Perl_grok_bslash_o Perl_grok_hex Perl_grok_number Perl_grok_numeric_radix diff --git a/pod/perl5133delta.pod b/pod/perl5133delta.pod index 049a78f..d4db338 100644 --- a/pod/perl5133delta.pod +++ b/pod/perl5133delta.pod @@ -28,6 +28,17 @@ here, but most should go in the L</Performance Enhancements> section. [ List each enhancement as a =head2 entry ] +=head2 \o{...} + +The escape sequence C<"\o"> in double-quotish contexts is now defined. It +must be followed by braces enclosing an octal number of at least one digit. It +means the character whose ordinal value is that octal number. This construct +allows large octal ordinals beyond the current max of 0777 to be represented. +It also allows you to specify a character in octal which can safely be +concatenated with other regex snippets without danger of changing its meaning, +and one which won't ever be confused with being a backreference to a regex +capture group. See L<perlre/Capture groups> + =head2 C<\N{I<name>}> and C<charnames> enhancements C<\N{}> and C<charnames::vianame> now know about the abbreviated character @@ -76,6 +87,18 @@ XXX For a release on a stable branch, this section aspires to be: [ List each incompatible change as a =head2 entry ] +=head2 \400 - \777 + +Use of C<\400> - C<\777> in regexes in certain circumstances has given different, +anomalous behavior than their use in all other double-quotish contexts. Since +5.10.1, a deprecated warning message has been raised when this happens. 
Now, +all double-quotish contexts have the same behavior, namely to be equivalent to +C<\x{100}> - C<\x{1FF}>, with no deprecation warning. Use of these values in the +command line option C<"-0"> retains the current meaning to slurp input files +whole; previously, this was documented only for C<"-0777">. It is recommended, +however, because of various ambiguities, to use the new L</\o{...}> construct +to represent characters in octal. + =head1 Deprecations XXX Any deprecated features, syntax, modules etc. should be listed here. diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 26c35a0..9f9fe4b 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -2510,6 +2510,10 @@ comment) between the C<\N> and the C<{> in a regex with the C</x> modifier. This modifier does not change the requirement that the brace immediately follow the C<\N>. +=item Missing braces on \o{} + +(F) A C<\o> must be followed immediately by a C<{> in double-quotish context. + =item Missing comma after first argument to %s function (F) While certain functions allow you to specify a filehandle or an @@ -2978,6 +2982,11 @@ to UTC. If it's not, define the logical name F<SYS$TIMEZONE_DIFFERENTIAL> to translate to the number of seconds which need to be added to UTC to get local time. +=item Non-octal character '%c'. Resolved as "%s" + +(W digit) In parsing an octal numeric constant, a character was unexpectedly +encountered that isn't octal. The resulting value is as indicated. + =item Non-string passed as bitmask (W misc) A number has been passed as a bitmask argument to select(). @@ -3020,6 +3029,11 @@ versions of Perl are likely to eliminate this arbitrary limitation. In the meantime, try using scientific notation (e.g. "1e6" instead of "1_000_000"). +=item Number with no digits + +(F) Perl was looking for a number but found nothing that looked like a number. +This happens, for example with C<\o{}>, with no number between the braces. 
+ =item Octal number in vector unsupported (F) Numbers with a leading C<0> are not currently allowed in vectors. @@ -4961,15 +4975,6 @@ In code that currently says C<use AutoLoader; @ISA = qw(AutoLoader);> you should remove AutoLoader from @ISA and change C<use AutoLoader;> to C<use AutoLoader 'AUTOLOAD';>. -=item Use of octal value above 377 is deprecated - -(D deprecated, W regexp) There is a constant in the regular expression whose -value is interpeted by Perl as octal and larger than 377 (255 decimal, 0xFF -hex). Perl may take this to mean different things depending on the rest of -the regular expression. If you meant such an octal value, convert it to -hexadecimal and use C<\xHH> or C<\x{HH}> instead. If you meant to have -part of it mean a backreference, use C<\g> for that. See L<perlre>. - =item Use of %s in printf format not supported (F) You attempted to use a feature of printf that is accessible from diff --git a/pod/perlop.pod b/pod/perlop.pod index c51afc3..73b83f9 100644 --- a/pod/perlop.pod +++ b/pod/perlop.pod @@ -1017,21 +1017,23 @@ from the next line. This allows you to write: The following escape sequences are available in constructs that interpolate and in transliterations. 
X<\t> X<\n> X<\r> X<\f> X<\b> X<\a> X<\e> X<\x> X<\0> X<\c> X<\N> X<\N{}> +X<\o{}> Sequence Note Description - \t tab (HT, TAB) - \n newline (NL) - \r return (CR) - \f form feed (FF) - \b backspace (BS) - \a alarm (bell) (BEL) - \e escape (ESC) - \x{263a} [1] hex char (example: SMILEY) - \x1b [2] restricted hex char (example: ESC) + \t tab (HT, TAB) + \n newline (NL) + \r return (CR) + \f form feed (FF) + \b backspace (BS) + \a alarm (bell) (BEL) + \e escape (ESC) + \x{263a} [1] hex char (example: SMILEY) + \x1b [2] restricted range hex char (example: ESC) \N{name} [3] named Unicode character - \N{U+263D} [4] Unicode character (example: FIRST QUARTER MOON) - \c[ [5] control char (example: chr(27)) - \033 [6] octal char (example: ESC) + \N{U+263D} [4] Unicode character (example: FIRST QUARTER MOON) + \c[ [5] control char (example: chr(27)) + \o{23072} [6] octal char (example: SMILEY) + \033 [7] restricted range octal char (example: ESC) =over 4 @@ -1136,13 +1138,25 @@ To get platform independent controls, you can use C<\N{...}>. =item [6] +The result is the character whose ordinal is the octal number between the +braces. + +If a character that isn't an octal digit is encountered, a warning is raised, +and the value is based on the octal digits before it, discarding it and all +following characters up to the closing brace. It is a fatal error if there are +no octal digits at all. + +=item [7] + The result is the character whose ordinal is the given three digit octal number. Some contexts allow 2 or even 1 digit, but any usage without exactly three digits, the first being a zero, may give unintended results. (For -example, see L<perlrebackslash/Octal escapes>.) It is best therefore to use -this construct only for ordinals C<\077> and below, remembering to pad to the -left with zeros to make three digits. For larger ordinals, it's best to -convert to some other construct, such as to hex and use C<\x{}> instead. +example, see L<perlrebackslash/Octal escapes>.) 
Starting in Perl 5.14, you may +use C<\o{}> instead which avoids all these problems. Otherwise, it is best to +use this construct only for ordinals C<\077> and below, remembering to pad to +the left with zeros to make three digits. For larger ordinals, either use +C<\o{}>, or convert to something else, such as to hex and use C<\x{}> +instead. A backslash followed by a non-octal digit in a bracketed character class (C<[\8]> or C<[\9]>) will be interpreted as a NULL character and the digit. diff --git a/pod/perlre.pod b/pod/perlre.pod index 7048787..2e00f0b 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -229,12 +229,11 @@ also work: \f form feed (FF) \a alarm (bell) (BEL) \e escape (think troff) (ESC) - \033 octal char (example: ESC) - \x1B hex char (example: ESC) - \x{263a} long hex char (example: Unicode SMILEY) \cK control char (example: VT) + \x{}, \x00 character whose ordinal is the given hexadecimal number \N{name} named Unicode character \N{U+263D} Unicode character (example: FIRST QUARTER MOON) + \o{}, \000 character whose ordinal is the given octal number \l lowercase next char (think vi) \u uppercase next char (think vi) \L lowercase till \E (think vi) @@ -454,7 +453,8 @@ a backreference only if at least 11 left parentheses have opened before it. And so on. C<\1> through C<\9> are always interpreted as backreferences. You can minimize the ambiguity by always using C<\g> if you mean capturing groups; and always using 3 digits for octal constants, with the first always "0" (which -works if there are 63 (= \077) or fewer capture groups). +works if there are 63 (= \077) or fewer capture groups). There are several +examples below that illustrate these perils. The C<\I<digit>> notation also works in certain circumstances outside the pattern. See L</Warning on \1 Instead of $1> below for details.) 
@@ -478,6 +478,20 @@ Examples: $seconds = $3; } + /(.)(.)(.)(.)(.)(.)(.)(.)(.)\g10/ # \g10 is a backreference + /(.)(.)(.)(.)(.)(.)(.)(.)(.)\10/ # \10 is octal + /((.)(.)(.)(.)(.)(.)(.)(.)(.))\10/ # \10 is a backreference + /((.)(.)(.)(.)(.)(.)(.)(.)(.))\010/ # \010 is octal + + $a = '(.)\1'; # Creates problems when concatenated. + $b = '(.)\g{1}'; # Avoids the problems. + "aa" =~ /${a}/; # True + "aa" =~ /${b}/; # True + "aa0" =~ /${a}0/; # False! + "aa0" =~ /${b}0/; # True + "aa\x08" =~ /${a}0/; # True! + "aa\x08" =~ /${b}0/; # False + Several special variables also refer back to portions of the previous match. C<$+> returns whatever the last bracket match matched. C<$&> returns the entire matched string. (At one point C<$0> did @@ -1744,9 +1758,9 @@ spell out the character sets in full. Characters may be specified using a metacharacter syntax much like that used in C: "\n" matches a newline, "\t" a tab, "\r" a carriage return, "\f" a form feed, etc. More generally, \I<nnn>, where I<nnn> is a string -of octal digits, matches the character whose coded character set value +of three octal digits, matches the character whose coded character set value is I<nnn>. Similarly, \xI<nn>, where I<nn> are hexadecimal digits, -matches the character whose numeric value is I<nn>. The expression \cI<x> +matches the character whose ordinal is I<nn>. The expression \cI<x> matches the character control-I<x>. Finally, the "." metacharacter matches any character except "\n" (unless you use C</s>). diff --git a/pod/perlrebackslash.pod b/pod/perlrebackslash.pod index 5728e7d..d460f7f 100644 --- a/pod/perlrebackslash.pod +++ b/pod/perlrebackslash.pod @@ -62,7 +62,7 @@ quoted constructs>. Those not usable within a bracketed character class (like C<[\da-z]>) are marked as C<Not in [].> - \000 Octal escape sequence. + \000 Octal escape sequence. See also \o{}. \1 Absolute backreference. Not in []. \a Alarm or bell. \A Beginning of string. Not in []. 
@@ -86,6 +86,7 @@ as C<Not in [].> \n (Logical) newline character. \N Any character but newline. Experimental. Not in []. \N{} Named or numbered (Unicode) character. + \o{} Octal escape sequence. \p{}, \pP Character with the given Unicode property. \P{}, \PP Character without the given Unicode property. \Q Quotemeta till \E. Not in []. @@ -134,7 +135,7 @@ character class, C<\b> is a word/non-word boundary. =item [2] C<\n> matches a logical newline. Perl will convert between C<\n> and your -OSses native newline character when reading from or writing to text files. +OS's native newline character when reading from or writing to text files. =back @@ -166,7 +167,7 @@ Mnemonic: I<c>ontrol character. =head3 Named or numbered characters -All Unicode characters have a Unicode name and numeric ordinal value. Use the +Unicode characters have a Unicode name and numeric ordinal value. Use the C<\N{}> construct to specify a character by either of these values. To specify by name, the name of the character goes between the curly braces. @@ -179,8 +180,8 @@ hexadecimal that gives the ordinal number that Unicode has assigned to the desired character. It is customary (but not required) to use leading zeros to pad the number to 4 digits. Thus C<\N{U+0041}> means C<Latin Capital Letter A>, and you will rarely see it written without the two -leading zeros. C<\N{U+0041}> means C<A> even on EBCDIC machines (where the -ordinal value of C<A> is not 0x41). +leading zeros. C<\N{U+0041}> means "A" even on EBCDIC machines (where the +ordinal value of "A" is not 0x41). It is even possible to give your own names to characters, and even to short sequences of characters. For details, see L<charnames>. @@ -207,33 +208,57 @@ match "as is". =head3 Octal escapes -Octal escapes consist of a backslash followed by three octal digits -matching the code point of the character you want to use. (In some contexts, -two or even one octal digits are also accepted, sometimes with a warning.) 
This -allows for 512 characters (C<\000> up to C<\777>) that can be expressed this -way (but anything above C<\377> is deprecated). Enough in pre-Unicode days, -but most Unicode characters cannot be escaped this way. +There are two forms of octal escapes. Each is used to specify a character by +its ordinal, specified in octal notation. + +One form, available starting in Perl 5.14 looks like C<\o{...}>, where the dots +represent one or more octal digits. It can be used for any Unicode character. + +It was introduced to avoid the potential problems with the other form, +available in all Perls. That form consists of a backslash followed by three +octal digits. One problem with this form is that it can look exactly like an +old-style backreference (see +L</Disambiguation rules between old-style octal escapes and backreferences> +below.) You can avoid this by making the first of the three digits always a +zero, but that makes \077 the largest ordinal unambiguously specifiable by this +form. + +In some contexts, a backslash followed by two or even one octal digits may be +interpreted as an octal escape, sometimes with a warning, and because of some +bugs, sometimes with surprising results. Also, if you are creating a regex +out of smaller snippets concatenated together, and you use fewer than three +digits, the beginning of one snippet may be interpreted as adding digits to the +ending of the snippet before it. See L</Absolute referencing> for more +discussion and examples of the snippet problem. Note that a character that is expressed as an octal escape is considered as a character without special meaning by the regex engine, and will match "as is". -=head4 Examples (assuming an ASCII platform) +To summarize, the C<\o{}> form is always safe to use, and the other form is +safe to use for ordinals up through \077 when you use exactly three digits to +specify them. - $str = "Perl"; - $str =~ /\120/; # Match, "\120" is "P". 
- $str =~ /\120+/; # Match, "\120" is "P", it is repeated at least once - $str =~ /P\053/; # No match, "\053" is "+" and taken literally. +Mnemonic: I<0>ctal or I<o>ctal. -=head4 Caveat +=head4 Examples (assuming an ASCII platform) -Octal escapes potentially clash with old-style backreferences (see L</Absolute -referencing> below). They both consist of a backslash followed by numbers. So -Perl has to use heuristics to determine whether it is a backreference or an -octal escape. You can avoid ambiguity by using the C<\g> form for -backreferences, and by beginning octal escapes with a "0". (Since octal -escapes are 3 digits, this latter method works only up to C<\077>.) In the -absence of C<\g>, Perl uses the following rules: + $str = "Perl"; + $str =~ /\o{120}/; # Match, "\120" is "P". + $str =~ /\120/; # Same. + $str =~ /\o{120}+/; # Match, "\120" is "P", it's repeated at least once + $str =~ /\120+/; # Same. + $str =~ /P\053/; # No match, "\053" is "+" and taken literally. + /\o{23073}/ # Black foreground, white background smiling face. + /\o{4801234567}/ # Raises a warning, and yields chr(4) + +=head4 Disambiguation rules between old-style octal escapes and backreferences + +Octal escapes of the C<\000> form outside of bracketed character classes +potentially clash with old-style backreferences. (see L</Absolute referencing> +below). They both consist of a backslash followed by numbers. So Perl has to +use heuristics to determine whether it is a backreference or an octal escape. +Perl uses the following rules to disambiguate: =over 4 @@ -247,29 +272,35 @@ If the first digit following the backslash is a 0, it's an octal escape. =item 3 -If the number following the backslash is N (decimal), and Perl already has +If the number following the backslash is N (in decimal), and Perl already has seen N capture groups, Perl will consider this to be a backreference. -Otherwise, it will consider it to be an octal escape. 
Note that if N > 999, -Perl only takes the first three digits for the octal escape; the rest is -matched as is. +Otherwise, it will consider it to be an octal escape. Note that if N has more +than three digits, Perl only takes the first three for the octal escape; +the rest are matched as is. my $pat = "(" x 999; $pat .= "a"; $pat .= ")" x 999; /^($pat)\1000$/; # Matches 'aa'; there are 1000 capture groups. /^$pat\1000$/; # Matches 'a@0'; there are 999 capture groups - # and \1000 is seen as \100 (a '@') and a '0'. + # and \1000 is seen as \100 (a '@') and a '0' =back +You can force a backreference interpretation always by using the C<\g{...}> +form. You can force an octal interpretation always by using the C<\o{...}> +form, or for numbers up through \077 (= 63 decimal), by using three digits, +beginning with a "0". + =head3 Hexadecimal escapes -Hexadecimal escapes start with C<\x> and are then either followed by a -two digit hexadecimal number, or a hexadecimal number of arbitrary length -surrounded by curly braces. The hexadecimal number is the code point of -the character you want to express. +Like octal escapes, there are two forms of hexadecimal escapes, but both start +with the same thing, C<\x>. This is followed by either exactly two hexadecimal +digits forming a number, or a hexadecimal number of arbitrary length surrounded +by curly braces. The hexadecimal number is the code point of the character you +want to express. -Note that a character that is expressed as a hexadecimal escape is considered +Note that a character that is expressed as one of these escapes is considered as a character without special meaning by the regex engine, and will match "as is". @@ -366,8 +397,8 @@ Either C<\gI<N>> (starting in Perl 5.10.0), or C<\I<N>> (old-style) where I<N> is a positive (unsigned) decimal number of any length is an absolute reference to a capturing group. 
-I<N> refers to the Nth set of parentheses - so C<\gI<N>> refers to whatever has -been matched by that set of parenthesis. Thus C<\g1> refers to the first +I<N> refers to the Nth set of parentheses, so C<\gI<N>> refers to whatever has +been matched by that set of parentheses. Thus C<\g1> refers to the first capture group in the regex. The C<\gI<N>> form can be equivalently written as C<\g{I<N>}> diff --git a/pod/perlreref.pod b/pod/perlreref.pod index c6e0178..01d57cc 100644 --- a/pod/perlreref.pod +++ b/pod/perlreref.pod @@ -89,11 +89,12 @@ These work as in normal strings. \n Newline \r Carriage return \t Tab - \037 Any octal ASCII value - \x7f Any hexadecimal ASCII value - \x{263a} A wide hexadecimal value + \037 Char whose ordinal is the 3 octal digits, max \777 + \o{2307} Char whose ordinal is the octal number, unrestricted + \x7f Char whose ordinal is the 2 hex digits, max \xFF + \x{263a} Char whose ordinal is the hex number, unrestricted \cx Control-x - \N{name} A named character + \N{name} A named Unicode character \N{U+263D} A Unicode character by hex ordinal \l Lowercase next character diff --git a/pod/perlretut.pod b/pod/perlretut.pod index eae266a..f218717 100644 --- a/pod/perlretut.pod +++ b/pod/perlretut.pod @@ -184,7 +184,8 @@ bytes. Here are some examples of escapes: "1000\t2000" =~ m(0\t2) # matches "1000\n2000" =~ /0\n20/ # matches "1000\t2000" =~ /\000\t2/ # doesn't match, "0" ne "\000" - "cat" =~ /\143\x61\x74/ # matches in ASCII, but a weird way to spell cat + "cat" =~ /\o{143}\x61\x74/ # matches in ASCII, but a weird way + # to spell cat If you've been around Perl a while, all this talk of escape sequences may seem familiar. Similar escape sequences are used in double-quoted @@ -1876,9 +1877,9 @@ much about Perl's internal representation of strings. But they do need to know 1) how to represent Unicode characters in a regexp and 2) that a matching operation will treat the string to be searched as a sequence of characters, not bytes. 
The answer to 1) is that Unicode characters -greater than C<chr(255)> are represented using the C<\x{hex}> notation, -because the \0 octal and \x hex (without curly braces) don't go further -than 255. +greater than C<chr(255)> are represented using the C<\x{hex}> notation, because +\x hex (without curly braces) doesn't go further than 255. (Starting in Perl +5.14) if you're an octal fan, you can also use C<\o{oct}>. /\x{263a}/; # match a Unicode smiley face :) diff --git a/proto.h b/proto.h index 6a5110e..1fc1180 100644 --- a/proto.h +++ b/proto.h @@ -1633,6 +1633,14 @@ PERL_CALLCONV UV Perl_grok_bin(pTHX_ const char* start, STRLEN* len_p, I32* flag PERL_CALLCONV char Perl_grok_bslash_c(pTHX_ const char source, const bool output_warning) __attribute__warn_unused_result__; +PERL_CALLCONV char* Perl_grok_bslash_o(pTHX_ const char* s, UV* uv, STRLEN* len, const bool output_warning) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1) + __attribute__nonnull__(pTHX_2) + __attribute__nonnull__(pTHX_3); +#define PERL_ARGS_ASSERT_GROK_BSLASH_O \ + assert(s); assert(uv); assert(len) + PERL_CALLCONV UV Perl_grok_hex(pTHX_ const char* start, STRLEN* len_p, I32* flags, NV *result) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2) diff --git a/regcomp.c b/regcomp.c index 49651b2..1b22d90 100644 --- a/regcomp.c +++ b/regcomp.c @@ -7362,6 +7362,7 @@ tryagain: register UV ender; register char *p; char *s; + char *error_msg; STRLEN foldlen; U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; @@ -7462,6 +7463,31 @@ tryagain: ender = ASCII_TO_NATIVE('\007'); p++; break; + case 'o': + { + STRLEN brace_len = len; + UV result; + if ((error_msg = grok_bslash_o(p, + &result, + &brace_len, + SIZE_ONLY)) + != NULL) + { + vFAIL(error_msg); + } + else + { + ender = result; + } + p += brace_len; + if (PL_encoding && ender < 0x100) { + goto recode_encoding; + } + if (ender > 0xff) { + RExC_utf8 = 1; + } + break; + } case 'x': if (*++p == '{') { char* const e = strchr(p, '}'); 
@@ -7500,17 +7526,8 @@ tryagain: I32 flags = 0; STRLEN numlen = 3; ender = grok_oct(p, &numlen, &flags, NULL); - - /* An octal above 0xff is interpreted differently - * depending on if the re is in utf8 or not. If it - * is in utf8, the value will be itself, otherwise - * it is interpreted as modulo 0x100. It has been - * decided to discourage the use of octal above the - * single-byte range. For now, warn only when - * it ends up modulo */ - if (SIZE_ONLY && ender >= 0x100 - && ! UTF && ! PL_encoding) { - ckWARNregdep(p, "Use of octal value above 377 is deprecated"); + if (ender > 0xff) { + RExC_utf8 = 1; } p += numlen; } @@ -7980,6 +7997,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) parseit: while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') { + char* error_msg; charclassloop: @@ -8086,6 +8104,21 @@ parseit: case 'b': value = '\b'; break; case 'e': value = ASCII_TO_NATIVE('\033');break; case 'a': value = ASCII_TO_NATIVE('\007');break; + case 'o': + RExC_parse--; /* function expects to be pointed at the 'o' */ + if ((error_msg = grok_bslash_o(RExC_parse, + &value, + &numlen, + SIZE_ONLY)) + != NULL) + { + vFAIL(error_msg); + } + RExC_parse += numlen; + if (PL_encoding && value < 0x100) { + goto recode_encoding; + } + break; case 'x': if (*RExC_parse == '{') { I32 flags = PERL_SCAN_ALLOW_UNDERSCORES diff --git a/t/lib/warnings/regcomp b/t/lib/warnings/regcomp index f85aa44..3f80ccc 100644 --- a/t/lib/warnings/regcomp +++ b/t/lib/warnings/regcomp @@ -207,3 +207,33 @@ Useless (?-c) - don't use /gc modifier in regex; marked by <-- HERE in m/(?o-c < Useless (?o) - use /o modifier in regex; marked by <-- HERE in m/(?o <-- HERE gc)/ at - line 12. Useless (?g) - use /g modifier in regex; marked by <-- HERE in m/(?og <-- HERE c)/ at - line 12. Useless (?c) - use /gc modifier in regex; marked by <-- HERE in m/(?ogc <-- HERE )/ at - line 12. 
+######## +# regcomp.c [S_regatom] +$a = qr/\o{/; +EXPECT +Missing right brace on \o{ in regex; marked by <-- HERE in m/\ <-- HERE o{/ at - line 2. +######## +# regcomp.c [S_regatom] +$a = qr/\o/; +EXPECT +Missing braces on \o{} in regex; marked by <-- HERE in m/\ <-- HERE o/ at - line 2. +######## +# regcomp.c [S_regatom] +$a = qr/\o{}/; +EXPECT +Number with no digits in regex; marked by <-- HERE in m/\ <-- HERE o{}/ at - line 2. +######## +# regcomp.c [S_regclass] +$a = qr/[\o{]/; +EXPECT +Missing right brace on \o{ in regex; marked by <-- HERE in m/[\ <-- HERE o{]/ at - line 2. +######## +# regcomp.c [S_regclass] +$a = qr/[\o]/; +EXPECT +Missing braces on \o{} in regex; marked by <-- HERE in m/[\ <-- HERE o]/ at - line 2. +######## +# regcomp.c [S_regclass] +$a = qr/[\o{}]/; +EXPECT +Number with no digits in regex; marked by <-- HERE in m/[\ <-- HERE o{}]/ at - line 2. diff --git a/t/lib/warnings/toke b/t/lib/warnings/toke index 4bb131f..076270c 100644 --- a/t/lib/warnings/toke +++ b/t/lib/warnings/toke @@ -966,3 +966,26 @@ Use of := for an empty attribute list is deprecated at - line 36. Use of := for an empty attribute list is deprecated at - line 38. Use of := for an empty attribute list is deprecated at - line 41. Use of := for an empty attribute list is deprecated at - line 42. +######## +# toke.c +use warnings 'syntax' ; +my $a = "\o"; +my $a = "\o{"; +my $a = "\o{}"; +no warnings 'syntax' ; +my $a = "\o"; +my $a = "\o{"; +my $a = "\o{}"; +EXPECT +Missing braces on \o{} at - line 3, within string +Missing right brace on \o{ at - line 4, within string +Number with no digits at - line 5, within string +BEGIN not safe after errors--compilation aborted at - line 6. +######## +# toke.c +use warnings 'digit' ; +my $a = "\o{1238456}"; +no warnings 'digit' ; +my $a = "\o{1238456}"; +EXPECT +Non-octal character '8'. Resolved as "\o{123}" at - line 3. 
diff --git a/t/op/qq.t b/t/op/qq.t index 3a3108e..0136608 100644 --- a/t/op/qq.t +++ b/t/op/qq.t @@ -5,7 +5,7 @@ BEGIN { @INC = '../lib'; } -print q(1..23 +print q(1..29 ); # This is() function is written to avoid "" @@ -61,6 +61,12 @@ is ("\x{000000000000000000000000000000000000000000000000000000000000000072}", is ("\x{0_06_5}", chr 101); is ("\x{1234}", chr 4660); is ("\x{10FFFD}", chr 1114109); +is ("\400", chr 0x100); +is ("\600", chr 0x180); +is ("\777", chr 0x1FF); +is ("a\o{120}b", "a" . chr(0x50) . "b"); +is ("a\o{400}b", "a" . chr(0x100) . "b"); +is ("a\o{1000}b", "a" . chr(0x200) . "b"); # These kludged tests should change when we remove the temporary fatal error # in util.c for "\c{". And, the warning there should probably not be diff --git a/t/re/pat_rt_report.t b/t/re/pat_rt_report.t index efbbe8f..33b6f7c 100644 --- a/t/re/pat_rt_report.t +++ b/t/re/pat_rt_report.t @@ -21,7 +21,7 @@ BEGIN { } -plan tests => 2511; # Update this when adding/deleting tests. +plan tests => 2510; # Update this when adding/deleting tests. run_tests() unless caller; @@ -1053,15 +1053,6 @@ sub run_tests { iseq $te [0], '../'; } - # This currently has to come before any "use encoding" in this file. - { - local $Message; - local $BugId = '59342'; - must_warn 'qr/\400/', '^Use of octal value above 377'; - } - - - { local $BugId = '60034'; my $a = "xyzt" x 8192; diff --git a/t/re/re_tests b/t/re/re_tests index 7cf5a80..36a2f4c 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -1444,14 +1444,27 @@ abc\N{def - c - \\N{NAME} must be resolved by the lexer /abc\N {SPACE}/x - c - Missing braces # Verifies catches hex errors, and doesn't expose our . 
notation to the outside -/\N{U+0xBEEF}/ - c - Illegal hexadecimal digit -/\N{U+BEEF.BEAD}/ - c - Illegal hexadecimal digit +/\N{U+0xBEEF}/ - c - Illegal hexadecimal digit +/\N{U+BEEF.BEAD}/ - c - Illegal hexadecimal digit # Verify works in single quotish context; regex compiler delivers slightly different msg # \N{U+BEEF.BEAD} succeeds here, because can't completely hide it from the outside. -\N{U+0xBEEF} - c - Invalid hexadecimal number -\c` - c - \"\\c`\" more clearly written simply as \"\\ \" -\c1 - c - \"\\c1\" more clearly written simply as \"q\" -\cA \001 y $& \1 +\N{U+0xBEEF} - c - Invalid hexadecimal number +\c` - c - \"\\c`\" more clearly written simply as \"\\ \" +\c1 - c - \"\\c1\" more clearly written simply as \"q\" +\cA \001 y $& \1 -# vim: set noexpandtab +\400 \x{100} y $& \x{100} +\600 \x{180} y $& \x{180} +\777 \x{1FF} y $& \x{1FF} +[a\400] \x{100} y $& \x{100} +[b\600] \x{180} y $& \x{180} +[c\777] \x{1FF} y $& \x{1FF} +\o{120} \x{50} y $& \x{50} +\o{400} \x{100} y $& \x{100} +\o{1000} \x{200} y $& \x{200} +[a\o{120}] \x{50} y $& \x{50} +[a\o{400}] \x{100} y $& \x{100} +[a\o{1000}] \x{200} y $& \x{200} + +# vim: softtabstop=0 noexpandtab diff --git a/t/re/subst.t b/t/re/subst.t index 73c7ac0..de6284a 100644 --- a/t/re/subst.t +++ b/t/re/subst.t @@ -7,7 +7,7 @@ BEGIN { } require './test.pl'; -plan( tests => 167 ); +plan( tests => 170 ); # Stolen from re/ReTest.pl. Can't just use the file since it doesn't support # like() and it conflicts with test.pl @@ -713,3 +713,14 @@ fresh_perl_is( '$_="abcef"; s/bc|(.)\G(.)/$1 ? 
"[$1-$2]" : "XX"/ge; print' => 'a is($non_sub_string, $string, "Verify that failed substitute doesn't change string"); } + +{ # Verify largish octal in replacement pattern + + my $string = "a"; + $string =~ s/a/\400/; + is($string, chr 0x100, "Verify that handles s/foo/\\400/"); + $string =~ s/./\600/; + is($string, chr 0x180, "Verify that handles s/foo/\\600/"); + $string =~ s/./\777/; + is($string, chr 0x1FF, "Verify that handles s/foo/\\777/"); +} diff --git a/toke.c b/toke.c index b7b33e8..75fb327 100644 --- a/toke.c +++ b/toke.c @@ -2879,6 +2879,20 @@ S_scan_const(pTHX_ char *start) } goto NUM_ESCAPE_INSERT; + /* eg. \o{24} indicates the octal constant \024 */ + case 'o': + { + STRLEN len; + + char* error = grok_bslash_o(s, &uv, &len, 1); + s += len; + if (error) { + yyerror(error); + continue; + } + goto NUM_ESCAPE_INSERT; + } + /* eg. \x24 indicates the hex constant 0x24 */ case 'x': ++s; diff --git a/util.c b/util.c index b3b385e..6fdc653 100644 --- a/util.c +++ b/util.c @@ -3904,7 +3904,7 @@ Perl_report_evil_fh(pTHX_ const GV *gv, const IO *io, I32 op) char Perl_grok_bslash_c(pTHX_ const char source, const bool output_warning) { - + U8 result; if (! isASCII(source)) { @@ -3935,6 +3935,72 @@ Perl_grok_bslash_c(pTHX_ const char source, const bool output_warning) return result; } +char * +Perl_grok_bslash_o(pTHX_ const char *s, UV *uv, STRLEN *len, const bool output_warning) +{ + +/* Documentation to be supplied when interface nailed down finally + * This returns NULL on success, otherwise a pointer to an internal constant + * error message. On input: + * s points to a string that begins with o, and the previous character was + * a backslash. 
+ * uv points to a UV that receives the value; it is not set on error + * len points to a STRLEN that receives how far to advance from the 'o' to + * get past this construct (just past the o or o{ if a brace is missing) + * output_warning says whether to output any warning messages, or suppress + * them + */ + char* e; + STRLEN numbers_len; + I32 flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX + /* XXX Until the message is improved in grok_oct, handle errors + * ourselves */ + | PERL_SCAN_SILENT_ILLDIGIT; + + PERL_ARGS_ASSERT_GROK_BSLASH_O; + + + assert(*s == 'o'); + s++; + + if (*s != '{') { + *len = 1; /* Move past the o */ + return "Missing braces on \\o{}"; + } + + e = strchr(s, '}'); + if (!e) { + *len = 2; /* Move past the o{ */ + return "Missing right brace on \\o{"; + } + + /* Return past the '}' no matter what is inside the braces */ + *len = e - s + 2; /* 2 = 1 for the o + 1 for the '}' */ + + s++; /* Point to first digit */ + + numbers_len = e - s; + if (numbers_len == 0) { + return "Number with no digits"; + } + + *uv = NATIVE_TO_UNI(grok_oct(s, &numbers_len, &flags, NULL)); + /* Note that if there is a non-octal character, everything from it up to + * the '}' is ignored */ + + if (output_warning && numbers_len != (STRLEN) (e - s)) { + Perl_ck_warner(aTHX_ packWARN(WARN_DIGIT), + /* diag_listed_as: Non-octal character '%c'. Resolved as "%s" */ + "Non-octal character '%c'. Resolved as \"\\o{%.*s}\"", + *(s + numbers_len), + (int) numbers_len, + s); + } + + return NULL; +} + /* To workaround core dumps from the uninitialised tm_zone we get the * system to give us a reasonable struct to copy. This fix means that * strftime uses the tm_zone and tm_gmtoff values returned by -- Perl5 Master Repository
