In perl.git, the branch blead has been updated <https://perl5.git.perl.org/perl.git/commitdiff/14f657d436dd5738712c1d294e7d5f7898336ba4?hp=823c3b2daca3409863f10ec5e1c6d416d2614a5a>
- Log ----------------------------------------------------------------- commit 14f657d436dd5738712c1d294e7d5f7898336ba4 Author: Karl Williamson <[email protected]> Date: Wed Mar 13 15:21:39 2019 -0600 regexec.c: We know the end ptr; don't need to recalc commit 2892a27e931e4ba534dd20dc9c94542eda19afbf Author: Karl Williamson <[email protected]> Date: Wed Mar 13 15:20:50 2019 -0600 regexec.c: Add assertion commit 1f25ceb1dbba05ad62ee0a371c12863528fc4a9f Author: Karl Williamson <[email protected]> Date: Wed Mar 13 14:23:03 2019 -0600 regcomp.c: Add assertion commit ee2223a54bee07e3f5e8a63ce5bd71c29f9a2e85 Author: Karl Williamson <[email protected]> Date: Wed Mar 13 13:36:00 2019 -0600 regcomp.c: Rmv unnecessary branch The function memchr() seems to get inlined so it is very fast, and it's legal to call it with a 0 length, so let it figure out that it's zero. commit 135226faaef3671e917c2e1d253e89a47c2b64f0 Author: Karl Williamson <[email protected]> Date: Wed Mar 13 13:23:24 2019 -0600 perlvar: Fix broken link commit 67d5c462b11923ef2f2c3b6dc5834d982347e17b Author: Karl Williamson <[email protected]> Date: Wed Mar 13 13:18:28 2019 -0600 perlrecharclass: Minor wording improvements commit 447fcf49ce69125df8ec9c1b46a7e6f24df98683 Author: Karl Williamson <[email protected]> Date: Wed Mar 13 13:17:39 2019 -0600 perlre: Minor wording improvements commit 407fecf1ecbd5b45621badd1485c91ddf95256e1 Author: Karl Williamson <[email protected]> Date: Wed Mar 13 13:16:43 2019 -0600 perlre: Italicize a bunch of stuff These are not meant to be written literally. commit 6dd641e14cd2675068749eeea8c8aabee158595e Author: Karl Williamson <[email protected]> Date: Wed Mar 13 11:42:15 2019 -0600 dquote.c: Use UTF8_SAFE_SKIP Otherwise malformed input could cause this to return a pointer outside its buffer commit 85fcc8f2234ce65ebd31480efc38dc4a3ec8ad13 Author: Karl Williamson <[email protected]> Date: Wed Mar 13 11:41:09 2019 -0600 Add UTF8_SAFE_SKIP API macro This version of UTF8SKIP refuses to advance beyond the end pointer ----------------------------------------------------------------------- Summary of changes: dquote.c | 6 +- pod/perlre.pod | 196 ++++++++++++++++++++++++------------------------ pod/perlrecharclass.pod | 7 +- pod/perlvar.pod | 3 +- regcomp.c | 5 +- regexec.c | 4 +- utf8.h | 11 +++ 7 files changed, 124 insertions(+), 108 deletions(-) diff --git a/dquote.c b/dquote.c index 6913ca5ce4..10fb2b5df0 100644 --- a/dquote.c +++ b/dquote.c @@ -141,7 +141,7 @@ Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv, if (numbers_len != (STRLEN) (e - *s)) { if (strict) { *s += numbers_len; - *s += (UTF) ? UTF8SKIP(*s) : (STRLEN) 1; + *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1; *error_msg = "Non-octal character"; return FALSE; } @@ -223,7 +223,7 @@ Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv, *s += len; if (strict && len != 2) { if (len < 2) { - *s += (UTF) ? UTF8SKIP(*s) : 1; + *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1; *error_msg = "Non-hex character"; } else { @@ -272,7 +272,7 @@ Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv, if (strict && numbers_len != (STRLEN) (e - *s)) { *s += numbers_len; - *s += (UTF) ? UTF8SKIP(*s) : 1; + *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1; *error_msg = "Non-hex character"; return FALSE; } diff --git a/pod/perlre.pod b/pod/perlre.pod index 900c28497a..209cac7f8d 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -563,7 +563,8 @@ At any given time, exactly one of these modifiers is in effect. Their existence allows Perl to keep the originally compiled behavior of a regular expression, regardless of what rules are in effect when it is actually executed. And if it is interpolated into a larger regex, the -original's rules continue to apply to it, and only it. +original's rules continue to apply to it, and don't affect the other +parts. The C</l> and C</u> modifiers are automatically selected for regular expressions compiled within the scope of various pragmas, @@ -720,8 +721,8 @@ the pattern uses L<C<(*script_run: ...)>|/Script Runs> Another mnemonic for this modifier is "Depends", as the rules actually used depend on various things, and as a result you can get unexpected results. See L<perlunicode/The "Unicode Bug">. The Unicode Bug has -become rather infamous, leading to yet another (printable) name for this -modifier, "Dodgy". +become rather infamous, leading to yet another (without swearing) name +for this modifier, "Dodgy". Unless the pattern or string are encoded in UTF-8, only ASCII characters can match positively. @@ -925,7 +926,7 @@ string" problem can be most efficiently performed when written as: as we know that if the final quote does not match, backtracking will not help. See the independent subexpression -L</C<< (?>pattern) >>> for more details; +L</C<< (?>I<pattern>) >>> for more details; possessive quantifiers are just syntactic sugar for that construct. For instance the above example could also be written as follows: @@ -1035,8 +1036,9 @@ See L</Extended Patterns> below for details. =item [7] -Note that C<\N> has two meanings. When of the form C<\N{NAME}>, it matches the -character or character sequence whose name is C<NAME>; and similarly +Note that C<\N> has two meanings. When of the form C<\N{I<NAME>}>, it +matches the character or character sequence whose name is I<NAME>; and +similarly when of the form C<\N{U+I<hex>}>, it matches the character whose Unicode code point is I<hex>. Otherwise it matches any character but C<\n>. @@ -1337,10 +1339,10 @@ expressions, and 2) whenever you see one, you should stop and =over 4 -=item C<(?#text)> +=item C<(?#I<text>)> X<(?#)> -A comment. The text is ignored. +A comment. The I<text> is ignored. Note that Perl closes the comment as soon as it sees a C<")">, so there is no way to put a literal C<")"> in the comment. The pattern's closing delimiter must be escaped by @@ -1402,8 +1404,8 @@ repetition of the previous word, assuming the C</x> modifier, and no C</i> modifier outside this group. These modifiers do not carry over into named subpatterns called in the -enclosing group. In other words, a pattern such as C<((?i)(?&NAME))> does not -change the case-sensitivity of the C<"NAME"> pattern. +enclosing group. In other words, a pattern such as C<((?i)(?&I<NAME>))> does not +change the case-sensitivity of the I<NAME> pattern. A modifier is overridden by later occurrences of this construct in the same scope containing the same modifier, so that @@ -1448,12 +1450,12 @@ C<(?-d:...)> and C<(?dl:...)> are fatal errors. Note also that the C<"p"> modifier is special in that its presence anywhere in a pattern has a global effect. -=item C<(?:pattern)> +=item C<(?:I<pattern>)> X<(?:)> -=item C<(?adluimnsx-imnsx:pattern)> +=item C<(?adluimnsx-imnsx:I<pattern>)> -=item C<(?^aluimnsx:pattern)> +=item C<(?^aluimnsx:I<pattern>)> X<(?^:)> This is for clustering, not capturing; it groups subexpressions like @@ -1518,7 +1520,7 @@ redundant. Mnemonic for C<(?^...)>: A fresh beginning since the usual use of a caret is to match at the beginning. -=item C<(?|pattern)> +=item C<(?|I<pattern>)> X<(?|)> X<Branch reset> This is the "branch reset" pattern, which has the special property @@ -1574,11 +1576,11 @@ lookahead matches text following the current match position. =over 4 -=item C<(?=pattern)> +=item C<(?=I<pattern>)> -=item C<(*pla:pattern)> +=item C<(*pla:I<pattern>)> -=item C<(*positive_lookahead:pattern)> +=item C<(*positive_lookahead:I<pattern>)> X<(?=)> X<(*pla> X<(*positive_lookahead> @@ -1590,11 +1592,11 @@ matches a word followed by a tab, without including the tab in C<$&>. The alphabetic forms are experimental; using them yields a warning in the C<experimental::alpha_assertions> category. -=item C<(?!pattern)> +=item C<(?!I<pattern>)> -=item C<(*nla:pattern)> +=item C<(*nla:I<pattern>)> -=item C<(*negative_lookahead:pattern)> +=item C<(*negative_lookahead:I<pattern>)> X<(?!)> X<(*nla> X<(*negative_lookahead> @@ -1613,13 +1615,13 @@ match. Use lookbehind instead (see below). The alphabetic forms are experimental; using them yields a warning in the C<experimental::alpha_assertions> category. -=item C<(?<=pattern)> +=item C<(?<=I<pattern>)> =item C<\K> -=item C<(*plb:pattern)> +=item C<(*plb:I<pattern>)> -=item C<(*positive_lookbehind:pattern)> +=item C<(*positive_lookbehind:I<pattern>)> X<(?<=)> X<(*plb> X<(*positive_lookbehind> @@ -1654,11 +1656,11 @@ can be rewritten as the much more efficient The alphabetic forms (not including C<\K> are experimental; using them yields a warning in the C<experimental::alpha_assertions> category. -=item C<(?<!pattern)> +=item C<(?<!I<pattern>)> -=item C<(*nlb:pattern)> +=item C<(*nlb:I<pattern>)> -=item C<(*negative_lookbehind:pattern)> +=item C<(*negative_lookbehind:I<pattern>)> X<(?<!)> X<(*nlb> X<(*negative_lookbehind> @@ -1677,22 +1679,23 @@ C<experimental::alpha_assertions> category. =back -=item C<< (?<NAME>pattern) >> +=item C<< (?<I<NAME>>I<pattern>) >> -=item C<(?'NAME'pattern)> +=item C<(?'I<NAME>'I<pattern>)> X<< (?<NAME>) >> X<(?'NAME')> X<named capture> X<capture> A named capture group. Identical in every respect to normal capturing parentheses C<()> but for the additional fact that the group can be referred to by name in various regular expression -constructs (like C<\g{NAME}>) and can be accessed by name +constructs (like C<\g{I<NAME>}>) and can be accessed by name after a successful match via C<%+> or C<%->. See L<perlvar> for more details on the C<%+> and C<%-> hashes. If multiple distinct capture groups have the same name, then -C<$+{NAME}> will refer to the leftmost defined group in the match. +C<$+{I<NAME>}> will refer to the leftmost defined group in the match. -The forms C<(?'NAME'pattern)> and C<< (?<NAME>pattern) >> are equivalent. +The forms C<(?'I<NAME>'I<pattern>)> and C<< (?<I<NAME>>I<pattern>) >> +are equivalent. B<NOTE:> While the notation of this construct is the same as the similar function in .NET regexes, the behavior is not. In Perl the groups are @@ -1701,7 +1704,7 @@ pattern /(x)(?<foo>y)(z)/ -C<$+{I<foo>}> will be the same as C<$2>, and C<$3> will contain 'z' instead of +C<$+{foo}> will be the same as C<$2>, and C<$3> will contain 'z' instead of the opposite which is what a .NET regex hacker might expect. Currently I<NAME> is restricted to simple identifiers only. @@ -1710,29 +1713,30 @@ its Unicode extension (see L<utf8>), though it isn't extended by the locale (see L<perllocale>). B<NOTE:> In order to make things easier for programmers with experience -with the Python or PCRE regex engines, the pattern C<< (?PE<lt>NAMEE<gt>pattern) >> -may be used instead of C<< (?<NAME>pattern) >>; however this form does not +with the Python or PCRE regex engines, the pattern C<< +(?PE<lt>I<NAME>E<gt>I<pattern>) >> +may be used instead of C<< (?<I<NAME>>I<pattern>) >>; however this form does not support the use of single quotes as a delimiter for the name. -=item C<< \k<NAME> >> +=item C<< \k<I<NAME>> >> -=item C<< \k'NAME' >> +=item C<< \k'I<NAME>' >> Named backreference. Similar to numeric backreferences, except that the group is designated by name and not number. If multiple groups have the same name then it refers to the leftmost defined group in the current match. -It is an error to refer to a name not defined by a C<< (?<NAME>) >> +It is an error to refer to a name not defined by a C<< (?<I<NAME>>) >> earlier in the pattern. Both forms are equivalent. B<NOTE:> In order to make things easier for programmers with experience -with the Python or PCRE regex engines, the pattern C<< (?P=NAME) >> -may be used instead of C<< \k<NAME> >>. +with the Python or PCRE regex engines, the pattern C<< (?P=I<NAME>) >> +may be used instead of C<< \k<I<NAME>> >>. -=item C<(?{ code })> +=item C<(?{ I<code> })> X<(?{})> X<regex, code in> X<regexp, code in> X<regular expression, code in> B<WARNING>: Using this feature safely requires that you understand its @@ -1836,9 +1840,9 @@ This assertion may be used as the condition in a (?(condition)yes-pattern|no-pattern) -switch. If I<not> used in this way, the result of evaluation of C<code> +switch. If I<not> used in this way, the result of evaluation of I<code> is put into the special variable C<$^R>. This happens immediately, so -C<$^R> can be used from other C<(?{ code })> assertions inside the same +C<$^R> can be used from other C<(?{ I<code> })> assertions inside the same regular expression. The assignment to C<$^R> above is properly localized, so the old @@ -1854,7 +1858,7 @@ keep track of the number of nested parentheses. For example: print "color = $color, animal = $animal\n"; -=item C<(??{ code })> +=item C<(??{ I<code> })> X<(??{})> X<regex, postponed> X<regexp, postponed> X<regular expression, postponed> @@ -1865,7 +1869,7 @@ optimisations in the regex engine. For more information on this, see L</Embedded Code Execution Frequency>. This is a "postponed" regular subexpression. It behaves in I<exactly> the -same way as a C<(?{ code })> code block as described above, except that +same way as a C<(?{ I<code> })> code block as described above, except that its return value, rather than being assigned to C<$^R>, is treated as a pattern, compiled if it's a string (or used as-is if its a qr// object), then matched as if it were inserted instead of this construct. @@ -1901,7 +1905,7 @@ The following pattern matches a parenthesized group: }x; See also -L<C<(?I<PARNO>)>|/(?PARNO) (?-PARNO) (?+PARNO) (?R) (?0)> +L<C<(?I<PARNO>)>|/(?I<PARNO>) (?-I<PARNO>) (?+I<PARNO>) (?R) (?0)> for a different, more efficient way to accomplish the same task. @@ -1921,11 +1925,11 @@ the current position in the string. Information about capture state from the caller for things like backreferences is available to the subpattern, but capture buffers set by the subpattern are not visible to the caller. -Similar to C<(??{ code })> except that it does not involve executing any +Similar to C<(??{ I<code> })> except that it does not involve executing any code or potentially compiling a returned pattern string; instead it treats the part of the current pattern contained within a specified capture group as an independent pattern that must match at the current position. Also -different is the treatment of capture buffers, unlike C<(??{ code })> +different is the treatment of capture buffers, unlike C<(??{ I<code> })> recursive patterns have access to their caller's match state, so one can use backreferences safely. @@ -1993,7 +1997,7 @@ as atomic. Also, modifiers are resolved at compile time, so constructs like C<(?i:(?1))> or C<(?:(?i)(?1))> do not affect how the sub-pattern will be processed. -=item C<(?&NAME)> +=item C<(?&I<NAME>)> X<(?&NAME)> Recurse to a named subpattern. Identical to C<(?I<PARNO>)> except that the @@ -2004,19 +2008,19 @@ It is an error to refer to a name that is not declared somewhere in the pattern. B<NOTE:> In order to make things easier for programmers with experience -with the Python or PCRE regex engines the pattern C<< (?P>NAME) >> -may be used instead of C<< (?&NAME) >>. +with the Python or PCRE regex engines the pattern C<< (?P>I<NAME>) >> +may be used instead of C<< (?&I<NAME>) >>. -=item C<(?(condition)yes-pattern|no-pattern)> +=item C<(?(I<condition>)I<yes-pattern>|I<no-pattern>)> X<(?()> -=item C<(?(condition)yes-pattern)> +=item C<(?(I<condition>)I<yes-pattern>)> -Conditional expression. Matches C<yes-pattern> if C<condition> yields -a true value, matches C<no-pattern> otherwise. A missing pattern always +Conditional expression. Matches I<yes-pattern> if I<condition> yields +a true value, matches I<no-pattern> otherwise. A missing pattern always matches. -C<(condition)> should be one of: +C<(I<condition>)> should be one of: =over 4 @@ -2036,7 +2040,7 @@ matched); (true when evaluated inside of recursion or eval). Additionally the C<"R"> may be followed by a number, (which will be true when evaluated when recursing -inside of the appropriate group), or by C<&NAME>, in which case it will +inside of the appropriate group), or by C<&I<NAME>>, in which case it will be true only when evaluated during recursion in the named group. =back @@ -2064,12 +2068,12 @@ Full syntax: C<< (?(?=I<lookahead>)I<then>|I<else>) >> =item C<(?{ I<CODE> })> Treats the return value of the code block as the condition. -Full syntax: C<< (?(?{ code })then|else) >> +Full syntax: C<< (?(?{ I<code> })I<then>|I<else>) >> =item C<(R)> Checks if the expression has been evaluated inside of recursion. -Full syntax: C<< (?(R)then|else) >> +Full syntax: C<< (?(R)I<then>|I<else>) >> =item C<(R1)> C<(R2)> ... @@ -2080,7 +2084,7 @@ inside of the n-th capture group. This check is the regex equivalent of In other words, it does not check the full recursion stack. -Full syntax: C<< (?(R1)then|else) >> +Full syntax: C<< (?(R1)I<then>|I<else>) >> =item C<(R&I<NAME>)> @@ -2088,14 +2092,14 @@ Similar to C<(R1)>, this predicate checks to see if we're executing directly inside of the leftmost group with a given name (this is the same logic used by C<(?&I<NAME>)> to disambiguate). It does not check the full stack, but only the name of the innermost active recursion. -Full syntax: C<< (?(R&name)then|else) >> +Full syntax: C<< (?(R&I<name>)I<then>|I<else>) >> =item C<(DEFINE)> In this case, the yes-pattern is never directly executed, and no no-pattern is allowed. Similar in spirit to C<(?{0})> but more efficient. See below for details. -Full syntax: C<< (?(DEFINE)definitions...) >> +Full syntax: C<< (?(DEFINE)I<definitions>...) >> =back @@ -2148,15 +2152,15 @@ Will output 2, not 1. This is particularly important if you intend to compile the definitions with the C<qr//> operator, and later interpolate them in another pattern. -=item C<< (?>pattern) >> +=item C<< (?>I<pattern>) >> -=item C<< (*atomic:pattern) >> +=item C<< (*atomic:I<pattern>) >> X<(?E<gt>pattern)> X<(*atomic> X<backtrack> X<backtracking> X<atomic> X<possessive> An "independent" subexpression, one which matches the substring -that a I<standalone> C<pattern> would match if anchored at the given +that a standalone I<pattern> would match if anchored at the given position, and it matches I<nothing other than this substring>. This construct is useful for optimizations of what would otherwise be "eternal" matches, because it will not backtrack (see L</"Backtracking">). @@ -2172,12 +2176,12 @@ group C<ab> (see L</"Backtracking">). In particular, C<a*> inside C<a*ab> will match fewer characters than a standalone C<a*>, since this makes the tail match. -C<< (?>pattern) >> does not disable backtracking altogether once it has +C<< (?>I<pattern>) >> does not disable backtracking altogether once it has matched. It is still possible to backtrack past the construct, but not into it. So C<< ((?>a*)|(?>b*))ar >> will still match "bar". -An effect similar to C<< (?>pattern) >> may be achieved by writing -C<(?=(pattern))\g{-1}>. This matches the same substring as a standalone +An effect similar to C<< (?>I<pattern>) >> may be achieved by writing +C<(?=(I<pattern>))\g{-1}>. This matches the same substring as a standalone C<a+>, and the following C<\g{-1}> eats the matched string; it therefore makes a zero-length assertion into an analogue of C<< (?>...) >>. (The difference between these two constructs is that the second one @@ -2536,7 +2540,7 @@ you can write either of these: (*atomic_script_run:pattern) (*asr:pattern) -(See L</C<(?E<gt>pattern)>>.) +(See L</C<(?E<gt>I<pattern>)>>.) In Taiwan, Japan, and Korea, it is common for text to have a mixture of characters from their native scripts and base Chinese. Perl follows @@ -2652,13 +2656,13 @@ rules apply: On failure, the C<$REGERROR> variable will be set to the I<ARG> value of the verb pattern, if the verb was involved in the failure of the match. If the I<ARG> part of the pattern was omitted, then C<$REGERROR> will be set to the -name of the last C<(*MARK:NAME)> pattern executed, or to TRUE if there was +name of the last C<(*MARK:I<NAME>)> pattern executed, or to TRUE if there was none. Also, the C<$REGMARK> variable will be set to FALSE. On a successful match, the C<$REGERROR> variable will be set to FALSE, and the C<$REGMARK> variable will be set to the name of the last -C<(*MARK:NAME)> pattern executed. See the explanation for the -C<(*MARK:NAME)> verb below for more details. +C<(*MARK:I<NAME>)> pattern executed. See the explanation for the +C<(*MARK:I<NAME>)> verb below for more details. B<NOTE:> C<$REGERROR> and C<$REGMARK> are not magic variables like C<$1> and most other regex-related variables. They are not local to a scope, nor @@ -2677,7 +2681,7 @@ argument, then C<$REGERROR> and C<$REGMARK> are not touched at all. =over 4 -=item C<(*PRUNE)> C<(*PRUNE:NAME)> +=item C<(*PRUNE)> C<(*PRUNE:I<NAME>)> X<(*PRUNE)> X<(*PRUNE:NAME)> This zero-width pattern prunes the backtracking tree at the current point @@ -2722,14 +2726,14 @@ at each matching starting point like so: Any number of C<(*PRUNE)> assertions may be used in a pattern. -See also C<<< L<< /(?>pattern) >> >>> and possessive quantifiers for +See also C<<< L<< /(?>I<pattern>) >> >>> and possessive quantifiers for other ways to control backtracking. In some cases, the use of C<(*PRUNE)> can be replaced with a C<< (?>pattern) >> with no functional difference; however, C<(*PRUNE)> can be used to handle cases that cannot be expressed using a C<< (?>pattern) >> alone. -=item C<(*SKIP)> C<(*SKIP:NAME)> +=item C<(*SKIP)> C<(*SKIP:I<NAME>)> X<(*SKIP)> This zero-width pattern is similar to C<(*PRUNE)>, except that on @@ -2739,8 +2743,8 @@ of this pattern. This effectively means that the regex engine "skips" forward to this position on failure and tries to match again, (assuming that there is sufficient room to match). -The name of the C<(*SKIP:NAME)> pattern has special significance. If a -C<(*MARK:NAME)> was encountered while matching, then it is that position +The name of the C<(*SKIP:I<NAME>)> pattern has special significance. If a +C<(*MARK:I<NAME>)> was encountered while matching, then it is that position which is used as the "skip point". If no C<(*MARK)> of that name was encountered, then the C<(*SKIP)> operator has no effect. When used without a name the "skip point" is where the match point was when @@ -2762,7 +2766,7 @@ Once the 'aaab' at the start of the string has matched, and the C<(*SKIP)> executed, the next starting point will be where the cursor was when the C<(*SKIP)> was executed. -=item C<(*MARK:NAME)> C<(*:NAME)> +=item C<(*MARK:I<NAME>)> C<(*:I<NAME>)> X<(*MARK)> X<(*MARK:NAME)> X<(*:NAME)> This zero-width pattern can be used to mark the point reached in a string @@ -2771,13 +2775,13 @@ mark may be given a name. A later C<(*SKIP)> pattern will then skip forward to that point if backtracked into on failure. Any number of C<(*MARK)> patterns are allowed, and the I<NAME> portion may be duplicated. -In addition to interacting with the C<(*SKIP)> pattern, C<(*MARK:NAME)> +In addition to interacting with the C<(*SKIP)> pattern, C<(*MARK:I<NAME>)> can be used to "label" a pattern branch, so that after matching, the program can determine which branches of the pattern were involved in the match. When a match is successful, the C<$REGMARK> variable will be set to the -name of the most recently executed C<(*MARK:NAME)> that was involved +name of the most recently executed C<(*MARK:I<NAME>)> that was involved in the match. This can be used to determine which branch of a pattern was matched @@ -2789,19 +2793,19 @@ C</(?:x(*MARK:x)|y(*MARK:y)|z(*MARK:z))/>. When a match has failed, and unless another verb has been involved in failing the match and has provided its own name to use, the C<$REGERROR> variable will be set to the name of the most recently executed -C<(*MARK:NAME)>. +C<(*MARK:I<NAME>)>. See L</(*SKIP)> for more details. -As a shortcut C<(*MARK:NAME)> can be written C<(*:NAME)>. +As a shortcut C<(*MARK:I<NAME>)> can be written C<(*:I<NAME>)>. -=item C<(*THEN)> C<(*THEN:NAME)> +=item C<(*THEN)> C<(*THEN:I<NAME>)> This is similar to the "cut group" operator C<::> from Perl 6. Like C<(*PRUNE)>, this verb always matches, and when backtracked into on failure, it causes the regex engine to try the next alternation in the innermost enclosing group (capturing or otherwise) that has alternations. -The two branches of a C<(?(condition)yes-pattern|no-pattern)> do not +The two branches of a C<(?(I<condition>)I<yes-pattern>|I<no-pattern>)> do not count as an alternation, as far as C<(*THEN)> is concerned. Its name comes from the observation that this operation combined with the @@ -2830,7 +2834,7 @@ is not the same as as after matching the I<A> but failing on the I<B> the C<(*THEN)> verb will backtrack and try I<C>; but the C<(*PRUNE)> verb will simply fail. -=item C<(*COMMIT)> C<(*COMMIT:args)> +=item C<(*COMMIT)> C<(*COMMIT:I<args>)> X<(*COMMIT)> This is the Perl 6 "commit pattern" C<< <commit> >> or C<:::>. It's a @@ -2851,7 +2855,7 @@ In other words, once the C<(*COMMIT)> has been entered, and if the pattern does not match, the regex engine will not try any further matching on the rest of the string. -=item C<(*FAIL)> C<(*F)> C<(*FAIL:arg)> +=item C<(*FAIL)> C<(*F)> C<(*FAIL:I<arg>)> X<(*FAIL)> X<(*F)> This pattern matches nothing and always fails. It can be used to force the @@ -2862,7 +2866,7 @@ the argument can be obtained from C<$REGERROR>. It is probably useful only when combined with C<(?{})> or C<(??{})>. -=item C<(*ACCEPT)> C<(*ACCEPT:arg)> +=item C<(*ACCEPT)> C<(*ACCEPT:I<arg>)> X<(*ACCEPT)> This pattern matches nothing and causes the end of successful matching at @@ -3095,14 +3099,14 @@ else in the whole regular expression.) For this grouping operator there is no need to describe the ordering, since only whether or not C<"S"> can match is important. -=item C<(??{ EXPR })>, C<(?I<PARNO>)> +=item C<(??{ I<EXPR> })>, C<(?I<PARNO>)> The ordering is the same as for the regular expression which is -the result of EXPR, or the pattern contained by capture group I<PARNO>. +the result of I<EXPR>, or the pattern contained by capture group I<PARNO>. -=item C<(?(condition)yes-pattern|no-pattern)> +=item C<(?(I<condition>)I<yes-pattern>|I<no-pattern>)> -Recall that which of C<yes-pattern> or C<no-pattern> actually matches is +Recall that which of I<yes-pattern> or I<no-pattern> actually matches is already determined. The ordering of the matches is the same as for the chosen subexpression. @@ -3210,17 +3214,17 @@ Perl-specific syntax, the following are also accepted: =over 4 -=item C<< (?PE<lt>NAMEE<gt>pattern) >> +=item C<< (?PE<lt>I<NAME>E<gt>I<pattern>) >> -Define a named capture group. Equivalent to C<< (?<NAME>pattern) >>. +Define a named capture group. Equivalent to C<< (?<I<NAME>>I<pattern>) >>. -=item C<< (?P=NAME) >> +=item C<< (?P=I<NAME>) >> -Backreference to a named capture group. Equivalent to C<< \g{NAME} >>. +Backreference to a named capture group. Equivalent to C<< \g{I<NAME>} >>. -=item C<< (?P>NAME) >> +=item C<< (?P>I<NAME>) >> -Subroutine call to a named capture group. Equivalent to C<< (?&NAME) >>. +Subroutine call to a named capture group. Equivalent to C<< (?&I<NAME>) >>. =back diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod index 0f6a624e85..bda60cd49e 100644 --- a/pod/perlrecharclass.pod +++ b/pod/perlrecharclass.pod @@ -359,9 +359,10 @@ C</\pLl/> is valid, but means something different. It matches a two character string: a letter (Unicode property C<\pL>), followed by a lowercase C<l>. -If locale rules are not in effect, the use of -a Unicode property will force the regular expression into using Unicode -rules, if it isn't already. +What a Unicode property matches is never subject to locale rules, and +if locale rules are not otherwise in effect, the use of a Unicode +property will force the regular expression into using Unicode rules, if +it isn't already. Note that almost all properties are immune to case-insensitive matching. That is, adding a C</i> regular expression modifier does not change what diff --git a/pod/perlvar.pod b/pod/perlvar.pod index 03b2215b66..d67d4cd8b1 100644 --- a/pod/perlvar.pod +++ b/pod/perlvar.pod @@ -931,7 +931,8 @@ is equivalent to $2, etc. should output "f-o-a-l". -See also L</$I<digits>>, L</%{^CAPTURE}> and L</%{^CAPTURE_ALL}>. +See also L<<< /$<I<digits>> ($1, $2, ...) >>>, L</%{^CAPTURE}> and +L</%{^CAPTURE_ALL}>. Note that unlike most other regex magic variables there is no single letter equivalent to C<@{^CAPTURE}>. diff --git a/regcomp.c b/regcomp.c index b5903bf8df..3b269466ee 100644 --- a/regcomp.c +++ b/regcomp.c @@ -13269,9 +13269,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) char name = *RExC_parse; char * endbrace = NULL; RExC_parse += 2; - if (RExC_parse < RExC_end) { - endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse); - } + endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse); if (! endbrace) { vFAIL2("Missing right brace on \\%c{}", name); @@ -16796,6 +16794,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, "Ignoring zero length \\N{} in character class"); } else { /* cp_count > 1 */ + assert(cp_count > 1); if (! RExC_in_multi_char_class) { if (invert || range || *RExC_parse == '-') { if (strict) { diff --git a/regexec.c b/regexec.c index e50145d449..64a65462b5 100644 --- a/regexec.c +++ b/regexec.c @@ -155,7 +155,7 @@ static const char* const non_utf8_target_but_utf8_required #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */ #define NEXTCHR_IS_EOS (nextchr < 0) -#define SET_nextchr \ +#define SET_nextchr __ASSERT_(locinput <= reginfo->strend) \ nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS) #define SET_locinput(p) \ @@ -1760,7 +1760,7 @@ STMT_START { case trie_utf8l: \ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \ if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \ - _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \ + _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \ } \ /* FALLTHROUGH */ \ case trie_utf8: \ diff --git a/utf8.h b/utf8.h index 99e795d3a4..7773007e49 100644 --- a/utf8.h +++ b/utf8.h @@ -498,6 +498,17 @@ only) byte is pointed to by C<s>. #define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)] #define UTF8_SKIP(s) UTF8SKIP(s) +/* + +=for apidoc Am|STRLEN|UTF8_SAFE_SKIP|char* s|char* e +returns the number of bytes in the UTF-8 encoded character whose first (perhaps +only) byte is pointed to by C<s>. But never returns beyond C<e>. + +=cut + */ +#define UTF8_SAFE_SKIP(s, e) (__ASSERT_((e) > (s)) \ + MIN(((e) - (s)), UTF8_SKIP(s))) + /* Most code that says 'UNI_' really means the native value for code points up * through 255 */ #define UNI_IS_INVARIANT(cp) UVCHR_IS_INVARIANT(cp) -- Perl5 Master Repository
