In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/d56b1f57fd32d77a6c2b437d81c206ad3905e15a?hp=ba474e876da44d462e1da4f95365622b59a8d402>
- Log ----------------------------------------------------------------- commit d56b1f57fd32d77a6c2b437d81c206ad3905e15a Author: Karl Williamson <[email protected]> Date: Sat Sep 27 17:13:42 2014 -0600 regcomp.c: Properly dereference a ptr UTF8_IS_INVARIANT takes a byte as its argument, not a ptr. This bug was introduced by 62a59291 in 5.21.4. It doesn't probably cause bugs because the ptr is always too big to be an invariant, but it slows things down somewhat by not taking a shortcut it could. It turns out that there is a discrepency between the APIs of the UTF8-foo macros. Some take a byte, and some a string ptr. M regcomp.c commit 9cba692be9578e72e0f03f616e61b7fa7a2fb79d Author: Karl Williamson <[email protected]> Date: Sun Sep 21 22:07:58 2014 -0600 Suppress some Solaris warnings We get an integer overflow message when we left shift a 1 into the highest bit of a word. This changes the 1's into 1U's to indicate unsigned. This is done for all the flag bits in the affected word, as they could get reorderd by someone in the future, unintentionally reintroducing this problem again. M op.h M op_reg_common.h M regexp.h commit bb62883ea327df0c836748360635ed9394e2264c Author: Karl Williamson <[email protected]> Date: Wed Sep 24 12:51:46 2014 -0600 op_reg_common.h: Update comment The PL file previously referred to has been deleted, and replaced by a different one. M op_reg_common.h commit ad51dc19e7823fed4e15215d91471becffd8a416 Author: Karl Williamson <[email protected]> Date: Wed Sep 24 12:49:13 2014 -0600 B/Makefile.PL: Allow constants to be long/unsigned This looks for numerical constants, but failed to see long and/or unsigned ones, which have suffixes U and/or L, or lowercase u and/or l. M ext/B/Makefile.PL commit cc4d09e1f55f1a823a9b410aa3e43aab0df8147a Author: Karl Williamson <[email protected]> Date: Mon Sep 22 11:56:48 2014 -0600 Deprecate multiple "x" in "/xx" It is planned for a future Perl release to have /xx mean something different from just /x. To prepare for this, this commit raises a deprecation warning if someone currently has this usage. A grep of CPAN did not turn up any instances of this, but this is to be safe anyway. The added code is more general than actually needed, in case we want to do this for another flag. M ext/re/re.pm M ext/re/t/reflags.t M pod/perldelta.pod M pod/perldiag.pod M regcomp.c M regexp.h M t/re/reg_mesg.t M toke.c commit 477afe810ce7934cb37983108672d43de207e0c5 Author: Karl Williamson <[email protected]> Date: Mon Sep 22 11:54:37 2014 -0600 toke.c: Clarify comment M toke.c commit a297f62776814496b3d468e7406a4c3069097f29 Author: Karl Williamson <[email protected]> Date: Wed Sep 17 17:59:39 2014 -0600 op_reg_common.h: White-space only Align columns vertically M op_reg_common.h commit 334afb3ef43aa91a8701c4ca7e753a5ec8e71f88 Author: Karl Williamson <[email protected]> Date: Wed Sep 17 17:55:16 2014 -0600 Make space for /xx flag This doesn't actually use the flag yet. We no longer have to make version-dependent changes to ext/Devel-Peek/t/Peek.t, (it being in /ext) so this doesn't M dump.c M ext/B/t/concise-xs.t M ext/Devel-Peek/t/Peek.t M ext/re/re.pm M lib/B/Deparse.pm M op.h M op_reg_common.h M regexp.h M regnodes.h commit 9a7d4f5f5c4be53dfbc55ca52953e3794a5ef68a Author: Karl Williamson <[email protected]> Date: Wed Sep 17 17:57:17 2014 -0600 op_reg_common.h: #define in terms of more basic one The mask to copy bits should always include at least the compile-time bits. By defining it in terms of the compile-time bits, we make it easier to change and understand. M op_reg_common.h commit 13f27704e67ca16eefc14851c1cbf02be39509c6 Author: Karl Williamson <[email protected]> Date: Wed Sep 17 12:16:12 2014 -0600 Up regex flags limit for (??{}) Previously the regex pattern compilation flags needed for this construct would fit into an 8-bit byte. This conveniently fits into the flags structure element of a regnode. There are changes coming that require more than 8 bits, so in preparation, this commit adds an argument to the node that implements (??{}) (31-bits usable for flags), and moves the storage to that. M op_reg_common.h M pod/perldebguts.pod M regcomp.c M regcomp.sym M regexec.c M regnodes.h commit 81a9a93bd8cf15be6db1942f8e44c590ecbace15 Author: Karl Williamson <[email protected]> Date: Tue Sep 16 17:19:52 2014 -0600 regcomp.c: Add assertions This changes from assuming that certain things are constant to using a variable instead, and then asserting that the variable matches the constant. M regcomp.c commit 9990d5ab621b83dea3adb80c343aff125fe9764b Author: Karl Williamson <[email protected]> Date: Tue Sep 16 17:16:47 2014 -0600 regcomp.c: Add a function and use it This adds a function to allocate a regnode with 2 32-bit arguments, and uses it, rather than the ad-hoc code that did the same thing previously. This is in preparation for this code being used in a 2nd place in a future commit. M embed.fnc M embed.h M proto.h M regcomp.c M regcomp.h commit 975a06f7f899d1dd89d8a66defc0b3ee41013d1f Author: Karl Williamson <[email protected]> Date: Tue Sep 16 16:26:36 2014 -0600 regcomp.sym: ANYOF nodes have an argument Plus a bitmap, but they always have an argument besides, contrary to what was specified here. Future commits rely on this, whereas heretofore this error was harmless. M pod/perldebguts.pod M regcomp.sym M regnodes.h commit 9da99c5e4f931efdaa065a5f0101b2b992a303ef Author: Karl Williamson <[email protected]> Date: Tue Sep 16 10:00:32 2014 -0600 regcomp.h: Add comment M regcomp.h commit 1b05bdbbd1fc8444b0d5367d119682d1e85509c1 Author: Karl Williamson <[email protected]> Date: Mon Sep 15 12:14:17 2014 -0600 regcomp.c: Outdent line to align with neighbors M regcomp.c commit cf0c7a70bd5aa9bb2c3310813c46536d8de98aa8 Author: Karl Williamson <[email protected]> Date: Mon Sep 15 11:30:25 2014 -0600 regcomp.h: Remove obsolete #defines These internal definitions are no longer used. M regcomp.h commit 888ee784309e6a9850865ba42c979d9b01ddcab1 Author: Karl Williamson <[email protected]> Date: Mon Sep 15 11:29:43 2014 -0600 regcomp.h: Use existing macro instead of reinventing M regcomp.h commit f917fa87667f8af2b4046abdd570ba6f00843f81 Author: Karl Williamson <[email protected]> Date: Mon Sep 15 08:46:35 2014 -0600 regcomp.c: Extract duplicated code to fcn This causes the nearly-duplicate code of S_reg_node and S_reganode to be placed into a single function, S_regnode_guts. There is one place where it might not be obvious that this doesn't change things. And that is under DEBUGGING, reg_node() called Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); and reganode called Set_Cur_Node_Offset; However Set_Cur_Node_Offset is defined to be Set_Node_Offset(RExC_emit, RExC_parse) and since op will never be END for reganode, the two statements are equivalent. M embed.fnc M embed.h M proto.h M regcomp.c commit ab4ad2b039e3f3ef88c73577588316a24010ffec Author: Karl Williamson <[email protected]> Date: Sun Sep 14 22:25:47 2014 -0600 regcomp.c: Move some statements These statements are moved to after the debugging code so that the two functions are essentially identical before them. This will allow the two to be combined in a future commit. I verified by testing that the debugging info was not affected. M regcomp.c commit a6cd0feae31ef21ba4a856dce379d30fceab3849 Author: Karl Williamson <[email protected]> Date: Wed Sep 17 12:11:21 2014 -0600 op.h: Move flag bits; comment shared-bit scheme This changes op.h to correspond with regexp.h. It moves all the used bits up in the word so that if a new shared bit is added, the #error will be triggered, alerting the person doing it that things need adjusting so binary compatibility is preserved. M op.h commit 0bd51918c654be8e8daeef6cad6a5fe6b435bca2 Author: Karl Williamson <[email protected]> Date: Wed Sep 17 12:08:41 2014 -0600 regexp.h: Comment shared-pool free bits scheme M regexp.h commit 3f6e6b29f65727e5ed5b4a317f6eac62edbc5f86 Author: Karl Williamson <[email protected]> Date: Fri Sep 12 14:04:12 2014 -0600 regexp.h: Make tentative division of free-bit space This sets a #define to point in the middle of the free-space, so that bits at either end can be added without having to adjust many other defines. M regexp.h commit 32f62247ca81d31e1ed69dd4be73c90f5184ff47 Author: Karl Williamson <[email protected]> Date: Fri Sep 12 13:59:13 2014 -0600 regexp.h: Define flag bit directly, not indirectly This #defined a symbol then did a compile time check that it was the same as another symbol. This commit simply defines it as the other symbol directly, and moves it to above the other definitions, which it no longer is part of. This prepares for the next commit. M regexp.h commit 1d32d911228f87e28c3611f90fd3274717ee6b68 Author: Karl Williamson <[email protected]> Date: Fri Sep 12 14:40:03 2014 -0600 regexp.h Remove unused bit placeholders We do not need a placeholder for unused flag bits. And removing them makes the generated regnodes.h more accurate as to what bits are available. M regexp.h M regnodes.h commit a3b51d37a1c354a57994f7ada55bd0a05849903d Author: Karl Williamson <[email protected]> Date: Thu Sep 11 22:45:06 2014 -0600 regexp.h: Move regex flag bit positions. This moves three bits to create a block of unused bits at the beginning. The first bit had to be moved to make space for other uses that are coming in future commits. This breaks binary compatibility, so might as well move the other two bits so that all the unused bits are consolidated at the beginning. This pool of unused bits is the boundary between the bits that are common to op.h and regexp.h (and in op_reg_common.h) and those that are separate. It's best to have all the unused bits there, so when we need to use one, it can be taken from either side, as needed, without us being trapped into having an available bit, but of the wrong kind. M regexp.h M regnodes.h ----------------------------------------------------------------------- Summary of changes: dump.c | 1 + embed.fnc | 8 +++ embed.h | 2 + ext/B/Makefile.PL | 12 ++-- ext/B/t/concise-xs.t | 3 +- ext/Devel-Peek/t/Peek.t | 4 +- ext/re/re.pm | 18 +++++- ext/re/t/reflags.t | 10 +++- lib/B/Deparse.pm | 7 ++- op.h | 68 +++++++++++++++++------ op_reg_common.h | 56 ++++++++++--------- pod/perldebguts.pod | 5 +- pod/perldelta.pod | 15 ++++- pod/perldiag.pod | 7 +++ proto.h | 11 ++++ regcomp.c | 143 ++++++++++++++++++++++++++---------------------- regcomp.h | 18 +++--- regcomp.sym | 4 +- regexec.c | 2 +- regexp.h | 113 +++++++++++++++++++++++++------------- regnodes.h | 36 ++++++------ t/re/reg_mesg.t | 4 ++ toke.c | 31 +++++++---- 23 files changed, 373 insertions(+), 205 deletions(-) diff --git a/dump.c b/dump.c index 16ac581..bfc176d 100644 --- a/dump.c +++ b/dump.c @@ -1363,6 +1363,7 @@ const struct flag_to_name regexp_extflags_names[] = { {RXf_PMf_SINGLELINE, "PMf_SINGLELINE,"}, {RXf_PMf_FOLD, "PMf_FOLD,"}, {RXf_PMf_EXTENDED, "PMf_EXTENDED,"}, + {RXf_PMf_EXTENDED_MORE, "PMf_EXTENDED_MORE,"}, {RXf_PMf_KEEPCOPY, "PMf_KEEPCOPY,"}, {RXf_IS_ANCHORED, "IS_ANCHORED,"}, {RXf_NO_INPLACE_SUBST, "NO_INPLACE_SUBST,"}, diff --git a/embed.fnc b/embed.fnc index 8764e7d..ee5f115 100644 --- a/embed.fnc +++ b/embed.fnc @@ -2086,8 +2086,16 @@ p |OP * |tied_method|NN SV *methname|NN SV **sp \ #if defined(PERL_IN_REGCOMP_C) Es |regnode*|reg |NN RExC_state_t *pRExC_state \ |I32 paren|NN I32 *flagp|U32 depth +Es |regnode*|regnode_guts |NN RExC_state_t *pRExC_state \ + |const U8 op \ + |const STRLEN extra_len \ + |NN const char* const name Es |regnode*|reganode |NN RExC_state_t *pRExC_state|U8 op \ |U32 arg +Es |regnode*|reg2Lanode |NN RExC_state_t *pRExC_state \ + |const U8 op \ + |const U32 arg1 \ + |const I32 arg2 Es |regnode*|regatom |NN RExC_state_t *pRExC_state \ |NN I32 *flagp|U32 depth Es |regnode*|regbranch |NN RExC_state_t *pRExC_state \ diff --git a/embed.h b/embed.h index 300f55f..e0df072 100644 --- a/embed.h +++ b/embed.h @@ -977,6 +977,7 @@ #define parse_lparen_question_flags(a) S_parse_lparen_question_flags(aTHX_ a) #define populate_ANYOF_from_invlist(a,b) S_populate_ANYOF_from_invlist(aTHX_ a,b) #define reg(a,b,c,d) S_reg(aTHX_ a,b,c,d) +#define reg2Lanode(a,b,c,d) S_reg2Lanode(aTHX_ a,b,c,d) #define reg_node(a,b) S_reg_node(aTHX_ a,b) #define reg_recode(a,b) S_reg_recode(aTHX_ a,b) #define reg_scan_name(a,b) S_reg_scan_name(aTHX_ a,b) @@ -986,6 +987,7 @@ #define regbranch(a,b,c,d) S_regbranch(aTHX_ a,b,c,d) #define regclass(a,b,c,d,e,f,g) S_regclass(aTHX_ a,b,c,d,e,f,g) #define reginsert(a,b,c,d) S_reginsert(aTHX_ a,b,c,d) +#define regnode_guts(a,b,c,d) S_regnode_guts(aTHX_ a,b,c,d) #define regpatws S_regpatws #define regpiece(a,b,c) S_regpiece(aTHX_ a,b,c) #define regpposixcc(a,b,c) S_regpposixcc(aTHX_ a,b,c) diff --git a/ext/B/Makefile.PL b/ext/B/Makefile.PL index 81f6826..e70e8fd 100644 --- a/ext/B/Makefile.PL +++ b/ext/B/Makefile.PL @@ -43,10 +43,14 @@ foreach my $tuple (['cop.h'], open my $fh, '<', $path or die "Cannot open $path: $!"; while (<$fh>) { push @names, $1 if (/ \#define \s+ ( $pfx \w+ ) \s+ - ( [()|\dx]+ # Parens, '|', digits, 'x' - | \(? \d+ \s* << .*? # digits left shifted by anything - ) \s* (?: $| \/ \* ) # ending at comment or $ - /x); + ( [()|\dx]+ [UuLl]{0,2} # Parens, '|', digits, 'x', + # followed by optional long, + # unsigned qualifiers + | \(? \d+ [UuLl]{0,2} \s* << .*? # digits left shifted by anything + # followed by optional + # long, unsigned qualifiers + ) \s* (?: $| \/ \* ) # ending at comment or $ + /x); } close $fh; } diff --git a/ext/B/t/concise-xs.t b/ext/B/t/concise-xs.t index c2258f7..c1d7f18 100644 --- a/ext/B/t/concise-xs.t +++ b/ext/B/t/concise-xs.t @@ -166,7 +166,8 @@ my $testpkgs = { OPpSORT_INPLACE OPpSORT_INTEGER OPpSORT_NUMERIC OPpSORT_REVERSE OPpREVERSE_INPLACE OPpTARGET_MY OPpTRANS_COMPLEMENT OPpTRANS_DELETE OPpTRANS_SQUASH - PMf_CONTINUE PMf_EVAL PMf_EXTENDED PMf_FOLD PMf_GLOBAL + PMf_CONTINUE PMf_EVAL PMf_EXTENDED PMf_EXTENDED_MORE + PMf_FOLD PMf_GLOBAL PMf_KEEP PMf_NONDESTRUCT PMf_MULTILINE PMf_ONCE PMf_SINGLELINE POSTFIX SVf_FAKE SVf_IOK SVf_NOK SVf_POK SVf_ROK diff --git a/ext/Devel-Peek/t/Peek.t b/ext/Devel-Peek/t/Peek.t index dd14862..f321e18 100644 --- a/ext/Devel-Peek/t/Peek.t +++ b/ext/Devel-Peek/t/Peek.t @@ -1189,7 +1189,7 @@ do_test('UTF-8 in a regular expression', CUR = 13 STASH = $ADDR "Regexp" COMPFLAGS = 0x0 \(\) - EXTFLAGS = 0x680040 \(CHECK_ALL,USE_INTUIT_NOML,USE_INTUIT_ML\) + EXTFLAGS = 0x680080 \(CHECK_ALL,USE_INTUIT_NOML,USE_INTUIT_ML\) (?: ENGINE = $ADDR \(STANDARD\) )? INTFLAGS = 0x0(?: \(\))? NPARENS = 0 @@ -1212,7 +1212,7 @@ do_test('UTF-8 in a regular expression', PV = $ADDR "\(\?\^u:\\\\\\\\x\{100\}\)" \[UTF8 "\(\?\^u:\\\\\\\\x\{100\}\)"\] CUR = 13 COMPFLAGS = 0x0 \(\) - EXTFLAGS = 0x680040 \(CHECK_ALL,USE_INTUIT_NOML,USE_INTUIT_ML\) + EXTFLAGS = 0x680080 \(CHECK_ALL,USE_INTUIT_NOML,USE_INTUIT_ML\) (?: ENGINE = $ADDR \(STANDARD\) )? INTFLAGS = 0x0(?: \(\))? NPARENS = 0 diff --git a/ext/re/re.pm b/ext/re/re.pm index c2d6eed..511c1c4 100644 --- a/ext/re/re.pm +++ b/ext/re/re.pm @@ -23,7 +23,7 @@ my %reflags = ( s => 1 << ($PMMOD_SHIFT + 1), i => 1 << ($PMMOD_SHIFT + 2), x => 1 << ($PMMOD_SHIFT + 3), - p => 1 << ($PMMOD_SHIFT + 4), + p => 1 << ($PMMOD_SHIFT + 5), # special cases: d => 0, l => 1, @@ -109,6 +109,7 @@ sub _load_unload { sub bits { my $on = shift; my $bits = 0; + my %seen; # Has flag already been seen? ARG: foreach my $idx (0..$#_){ my $s=$_[$idx]; @@ -187,7 +188,8 @@ sub bits { && $^H{reflags_charset} == $reflags{$_}; } } elsif (exists $reflags{$_}) { - $on + $seen{$_}++; + $on ? $reflags |= $reflags{$_} : ($reflags &= ~$reflags{$_}); } else { @@ -208,6 +210,18 @@ sub bits { ")"); } } + if (exists $seen{'x'} && $seen{'x'} > 1 + && (warnings::enabled("deprecated") + || warnings::enabled("regexp"))) + { + my $message = "Having more than one /x regexp modifier is deprecated"; + if (warnings::enabled("deprecated")) { + warnings::warn("deprecated", $message); + } + else { + warnings::warn("regexp", $message); + } + } $bits; } diff --git a/ext/re/t/reflags.t b/ext/re/t/reflags.t index b2cbf80..e90a712 100644 --- a/ext/re/t/reflags.t +++ b/ext/re/t/reflags.t @@ -10,7 +10,7 @@ BEGIN { use strict; -use Test::More tests => 62; +use Test::More tests => 63; my @flags = qw( a d l u ); @@ -165,9 +165,13 @@ is qr//, '(?^:)', 'no re "/aai"'; } $w = ""; - eval "use re '/axaa'"; + eval "use re '/amaa'"; like $w, qr/The "a" flag may only appear a maximum of twice/, - "warning with eval \"use re \"/axaa\""; + "warning with eval \"use re \"/amaa\""; + $w = ""; + eval "use re '/xamax'"; + like $w, qr/Having more than one \/x regexp modifier is deprecated/, + "warning with eval \"use re \"/xamax\""; } diff --git a/lib/B/Deparse.pm b/lib/B/Deparse.pm index ab85acf..a4cb5a3 100644 --- a/lib/B/Deparse.pm +++ b/lib/B/Deparse.pm @@ -20,8 +20,8 @@ use B qw(class main_root main_start main_cv svref_2object opnumber perlstring SVpad_TYPED CVf_METHOD CVf_LVALUE PMf_KEEP PMf_GLOBAL PMf_CONTINUE PMf_EVAL PMf_ONCE - PMf_MULTILINE PMf_SINGLELINE PMf_FOLD PMf_EXTENDED); -$VERSION = '1.28'; + PMf_MULTILINE PMf_SINGLELINE PMf_FOLD PMf_EXTENDED PMf_EXTENDED_MORE); +$VERSION = '1.29'; use strict; use vars qw/$AUTOLOAD/; use warnings (); @@ -4756,11 +4756,12 @@ sub re_flags { $flags .= "o" if $pmflags & PMf_KEEP; $flags .= "s" if $pmflags & PMf_SINGLELINE; $flags .= "x" if $pmflags & PMf_EXTENDED; + $flags .= "x" if $pmflags & PMf_EXTENDED_MORE; $flags .= "p" if $pmflags & RXf_PMf_KEEPCOPY; if (my $charset = $pmflags & RXf_PMf_CHARSET) { # Hardcoding this is fragile, but B does not yet export the # constants we need. - $flags .= qw(d l u a aa)[$charset >> 5] + $flags .= qw(d l u a aa)[$charset >> 6] } # The /d flag is indicated by 0; only show it if necessary. elsif ($self->{hinthash} and diff --git a/op.h b/op.h index 139375d..df39d33 100644 --- a/op.h +++ b/op.h @@ -245,52 +245,84 @@ struct pmop { #define PM_SETRE(o,r) ((o)->op_pmregexp = (r)) #endif -/* Leave some space, so future bit allocations can go either in the shared or - * unshared area without affecting binary compatibility */ -#define PMf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT+6) +/* Currently these PMf flags occupy a single 32-bit word. Not all bits are + * currently used. The lower bits are shared with their corresponding RXf flag + * bits, up to but not including _RXf_PMf_SHIFT_NEXT. The unused bits + * immediately follow; finally the used Pmf-only (unshared) bits, so that the + * highest bit in the word is used. This gathers all the unused bits as a pool + * in the middle, like so: 11111111111111110000001111111111 + * where the '1's represent used bits, and the '0's unused. This design allows + * us to allocate off one end of the pool if we need to add a shared bit, and + * off the other end if we need a non-shared bit, without disturbing the other + * bits. This maximizes the likelihood of being able to change things without + * breaking binary compatibility. + * + * To add shared bits, do so in op_reg_common.h. This should change + * _RXf_PMf_SHIFT_NEXT so that things won't compile. Then come to regexp.h and + * op.h and adjust the constant adders in the definitions of PMf_BASE_SHIFT and + * Pmf_BASE_SHIFT down by the number of shared bits you added. That's it. + * Things should be binary compatible. But if either of these gets to having + * to subtract rather than add, leave at 0 and adjust all the entries below + * that are in terms of this according. But if the first one of those is + * already PMf_BASE_SHIFT+0, there are no bits left, and a redesign is in + * order. + * + * To remove unshared bits, just delete its entry. If you're where breaking + * binary compatibility is ok to do, you might want to adjust things to move + * the newly opened space so that it gets absorbed into the common pool. + * + * To add unshared bits, first use up any gaps in the middle. Otherwise, + * allocate off the low end until you get to PMf_BASE_SHIFT+0. If that isn't + * enough, move PMf_BASE_SHIFT down (if possible) and add the new bit at the + * other end instead; this preserves binary compatibility. */ +#define PMf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT+4) /* 'use re "taint"' in scope: taint $1 etc. if target tainted */ -#define PMf_RETAINT (1<<(PMf_BASE_SHIFT+0)) +#define PMf_RETAINT (1U<<(PMf_BASE_SHIFT+5)) /* match successfully only once per reset, with related flag RXf_USED in * re->extflags holding state. This is used only for ?? matches, and only on * OP_MATCH and OP_QR */ -#define PMf_ONCE (1<<(PMf_BASE_SHIFT+1)) +#define PMf_ONCE (1U<<(PMf_BASE_SHIFT+6)) /* PMf_ONCE, i.e. ?pat?, has matched successfully. Not used under threading. */ -#define PMf_USED (1<<(PMf_BASE_SHIFT+3)) +#define PMf_USED (1U<<(PMf_BASE_SHIFT+7)) /* subst replacement is constant */ -#define PMf_CONST (1<<(PMf_BASE_SHIFT+4)) +#define PMf_CONST (1U<<(PMf_BASE_SHIFT+8)) /* keep 1st runtime pattern forever */ -#define PMf_KEEP (1<<(PMf_BASE_SHIFT+5)) +#define PMf_KEEP (1U<<(PMf_BASE_SHIFT+9)) -#define PMf_GLOBAL (1<<(PMf_BASE_SHIFT+6)) /* pattern had a g modifier */ +#define PMf_GLOBAL (1U<<(PMf_BASE_SHIFT+10)) /* pattern had a g modifier */ /* don't reset pos() if //g fails */ -#define PMf_CONTINUE (1<<(PMf_BASE_SHIFT+7)) +#define PMf_CONTINUE (1U<<(PMf_BASE_SHIFT+11)) /* evaluating replacement as expr */ -#define PMf_EVAL (1<<(PMf_BASE_SHIFT+8)) +#define PMf_EVAL (1U<<(PMf_BASE_SHIFT+12)) /* Return substituted string instead of modifying it. */ -#define PMf_NONDESTRUCT (1<<(PMf_BASE_SHIFT+9)) +#define PMf_NONDESTRUCT (1U<<(PMf_BASE_SHIFT+13)) /* the pattern has a CV attached (currently only under qr/...(?{}).../) */ -#define PMf_HAS_CV (1<<(PMf_BASE_SHIFT+10)) +#define PMf_HAS_CV (1U<<(PMf_BASE_SHIFT+14)) /* op_code_list is private; don't free it etc. It may well point to * code within another sub, with different pad etc */ -#define PMf_CODELIST_PRIVATE (1<<(PMf_BASE_SHIFT+11)) +#define PMf_CODELIST_PRIVATE (1U<<(PMf_BASE_SHIFT+15)) /* the PMOP is a QR (we should be able to detect that from the op type, * but the regex compilation API passes just the pm flags, not the op * itself */ -#define PMf_IS_QR (1<<(PMf_BASE_SHIFT+12)) -#define PMf_USE_RE_EVAL (1<<(PMf_BASE_SHIFT+13)) /* use re'eval' in scope */ - -#if PMf_BASE_SHIFT+13 > 31 +#define PMf_IS_QR (1U<<(PMf_BASE_SHIFT+16)) +#define PMf_USE_RE_EVAL (1U<<(PMf_BASE_SHIFT+17)) /* use re'eval' in scope */ + +/* See comments at the beginning of these defines about adding bits. The + * highest bit position should be used, so that if PMf_BASE_SHIFT gets + * increased, the #error below will be triggered so that you will be reminded + * to adjust things at the other end to keep the bit positions unchanged */ +#if PMf_BASE_SHIFT+17 > 31 # error Too many PMf_ bits used. See above and regnodes.h for any spare in middle #endif diff --git a/op_reg_common.h b/op_reg_common.h index 9dcdaae..9f7227d 100644 --- a/op_reg_common.h +++ b/op_reg_common.h @@ -10,7 +10,7 @@ */ /* These defines are used in both op.h and regexp.h The definitions use the - * shift form so that ext/B/defsubs_h.PL will pick them up. + * shift form so that ext/B/Makefile.PL will pick them up. * * Data structures used in the two headers have common fields, and in fact one * is copied onto the other. This makes it easy to keep them in sync */ @@ -28,11 +28,12 @@ * INT_PAT_MODS in regexp.h for the reason contiguity is needed */ /* Make sure to update lib/re.pm when changing these! */ /* Make sure you keep the pure PMf_ versions below in sync */ -#define RXf_PMf_MULTILINE (1 << (RXf_PMf_STD_PMMOD_SHIFT+0)) /* /m */ -#define RXf_PMf_SINGLELINE (1 << (RXf_PMf_STD_PMMOD_SHIFT+1)) /* /s */ -#define RXf_PMf_FOLD (1 << (RXf_PMf_STD_PMMOD_SHIFT+2)) /* /i */ -#define RXf_PMf_EXTENDED (1 << (RXf_PMf_STD_PMMOD_SHIFT+3)) /* /x */ -#define RXf_PMf_KEEPCOPY (1 << (RXf_PMf_STD_PMMOD_SHIFT+4)) /* /p */ +#define RXf_PMf_MULTILINE (1U << (RXf_PMf_STD_PMMOD_SHIFT+0)) /* /m */ +#define RXf_PMf_SINGLELINE (1U << (RXf_PMf_STD_PMMOD_SHIFT+1)) /* /s */ +#define RXf_PMf_FOLD (1U << (RXf_PMf_STD_PMMOD_SHIFT+2)) /* /i */ +#define RXf_PMf_EXTENDED (1U << (RXf_PMf_STD_PMMOD_SHIFT+3)) /* /x */ +#define RXf_PMf_EXTENDED_MORE (1U << (RXf_PMf_STD_PMMOD_SHIFT+4)) /* /xx */ +#define RXf_PMf_KEEPCOPY (1U << (RXf_PMf_STD_PMMOD_SHIFT+5)) /* /p */ /* The character set for the regex is stored in a field of more than one bit * using an enum, for reasons of compactness and to ensure that the options are @@ -48,7 +49,7 @@ typedef enum { REGEX_ASCII_MORE_RESTRICTED_CHARSET } regex_charset; -#define _RXf_PMf_CHARSET_SHIFT ((RXf_PMf_STD_PMMOD_SHIFT)+5) +#define _RXf_PMf_CHARSET_SHIFT ((RXf_PMf_STD_PMMOD_SHIFT)+6) #define RXf_PMf_CHARSET (7 << (_RXf_PMf_CHARSET_SHIFT)) /* 3 bits */ /* Manually decorate these functions here with gcc-style attributes just to @@ -80,38 +81,43 @@ get_regex_charset(const U32 flags) return (regex_charset) ((flags & RXf_PMf_CHARSET) >> _RXf_PMf_CHARSET_SHIFT); } -#define _RXf_PMf_SHIFT_COMPILETIME (RXf_PMf_STD_PMMOD_SHIFT+8) +#define _RXf_PMf_SHIFT_COMPILETIME (RXf_PMf_STD_PMMOD_SHIFT+9) /* Set in Perl_pmruntime if op_flags & OPf_SPECIAL, i.e. split. Will be used by regex engines to check whether they should set RXf_SKIPWHITE */ -#define RXf_PMf_SPLIT (1<<(RXf_PMf_STD_PMMOD_SHIFT+8)) +#define RXf_PMf_SPLIT (1U<<(RXf_PMf_STD_PMMOD_SHIFT+9)) /* Next available bit after the above. Name begins with '_' so won't be * exported by B */ -#define _RXf_PMf_SHIFT_NEXT (RXf_PMf_STD_PMMOD_SHIFT+9) +#define _RXf_PMf_SHIFT_NEXT (RXf_PMf_STD_PMMOD_SHIFT+10) /* Mask of the above bits. These need to be transferred from op_pmflags to * re->extflags during compilation */ -#define RXf_PMf_COMPILETIME (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_KEEPCOPY|RXf_PMf_CHARSET) -#define RXf_PMf_FLAGCOPYMASK (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_KEEPCOPY|RXf_PMf_CHARSET|RXf_PMf_SPLIT) - -#if RXf_PMf_COMPILETIME > 255 -# error RXf_PMf_COMPILETIME wont fit in U8 flags field of eval node +#define RXf_PMf_COMPILETIME (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE|RXf_PMf_KEEPCOPY|RXf_PMf_CHARSET) +#define RXf_PMf_FLAGCOPYMASK (RXf_PMf_COMPILETIME|RXf_PMf_SPLIT) + + /* Exclude win32 because it can't cope with I32_MAX definition */ +#ifndef WIN32 +# if RXf_PMf_COMPILETIME > I32_MAX +# error RXf_PMf_COMPILETIME wont fit in arg2 field of eval node +# endif #endif -/* These copies need to be numerical or defsubs_h.PL won't know about them. */ -#define PMf_MULTILINE 1<<0 -#define PMf_SINGLELINE 1<<1 -#define PMf_FOLD 1<<2 -#define PMf_EXTENDED 1<<3 -#define PMf_KEEPCOPY 1<<4 -#define PMf_CHARSET 7<<5 -#define PMf_SPLIT 1<<8 - -#if PMf_MULTILINE != RXf_PMf_MULTILINE || PMf_SINGLELINE != RXf_PMf_SINGLELINE || PMf_FOLD != RXf_PMf_FOLD || PMf_EXTENDED != RXf_PMf_EXTENDED || PMf_KEEPCOPY != RXf_PMf_KEEPCOPY || PMf_SPLIT != RXf_ ... [43 chars truncated] +/* These copies need to be numerical or ext/B/Makefile.PL won't think they are + * constants */ +#define PMf_MULTILINE 1U<<0 +#define PMf_SINGLELINE 1U<<1 +#define PMf_FOLD 1U<<2 +#define PMf_EXTENDED 1U<<3 +#define PMf_EXTENDED_MORE 1U<<4 +#define PMf_KEEPCOPY 1U<<5 +#define PMf_CHARSET 7U<<6 +#define PMf_SPLIT 1U<<9 + +#if PMf_MULTILINE != RXf_PMf_MULTILINE || PMf_SINGLELINE != RXf_PMf_SINGLELINE || PMf_FOLD != RXf_PMf_FOLD || PMf_EXTENDED != RXf_PMf_EXTENDED || PMf_EXTENDED_MORE != RXf_PMf_EXTENDED_MORE || PMf_KEE ... [89 chars truncated] # error RXf_PMf defines are wrong #endif diff --git a/pod/perldebguts.pod b/pod/perldebguts.pod index 54644d7..59d27b0 100644 --- a/pod/perldebguts.pod +++ b/pod/perldebguts.pod @@ -591,7 +591,7 @@ will be lost. REG_ANY no Match any one character (except newline). SANY no Match any one character. CANY no Match any one byte. - ANYOF sv Match character in (or not in) this class, + ANYOF sv 1 Match character in (or not in) this class, single char match only # POSIX Character Classes: @@ -716,7 +716,8 @@ will be lost. # The heavy worker - EVAL evl 1 Execute some Perl code. + EVAL evl/flags Execute some Perl code. + 2L # Modifiers diff --git a/pod/perldelta.pod b/pod/perldelta.pod index a6d28f4..4ea0a09 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -94,7 +94,20 @@ as an updated module in the L</Modules and Pragmata> section. =back -[ List each other deprecation as a =head2 entry ] +=head2 Use of multiple /x regexp modifiers + +It is now deprecated to say something like any of the following: + + qr/foo/xx; + /(?xax:foo)/; + use re qw(/amxx); + +That is, now C<x> should only occur once in any string of contiguous +regular expression pattern modifiers. We do not believe there are any +occurrences of this in all of CPAN. This is in preparation for a future +Perl release having C</xx> mean to allow white-space for readability in +bracketed character classes (those enclosed in square brackets: +C<[...]>). =head1 Performance Enhancements diff --git a/pod/perldiag.pod b/pod/perldiag.pod index abfa50d..b4559ce 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -2207,6 +2207,13 @@ created on an emergency basis to prevent a core dump. (F) The parser has given up trying to parse the program after 10 errors. Further error messages would likely be uninformative. +=item Having more than one /%c regexp modifier is deprecated + +(D deprecated, regexp) You used the indicated regular expression pattern +modifier at least twice in a string of modifiers. It is deprecated to +do this with this particular modifier, to allow future extensions to the +Perl language. + =item Hexadecimal float: exponent overflow (W overflow) The hexadecimal floating point has a larger exponent diff --git a/proto.h b/proto.h index 78b107b..b4ab3df 100644 --- a/proto.h +++ b/proto.h @@ -6918,6 +6918,11 @@ STATIC regnode* S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U3 #define PERL_ARGS_ASSERT_REG \ assert(pRExC_state); assert(flagp) +STATIC regnode* S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2) + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_REG2LANODE \ + assert(pRExC_state) + STATIC regnode* S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) __attribute__nonnull__(pTHX_1); #define PERL_ARGS_ASSERT_REG_NODE \ @@ -6968,6 +6973,12 @@ STATIC void S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U #define PERL_ARGS_ASSERT_REGINSERT \ assert(pRExC_state); assert(opnd) +STATIC regnode* S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_len, const char* const name) + __attribute__nonnull__(pTHX_1) + __attribute__nonnull__(pTHX_4); +#define PERL_ARGS_ASSERT_REGNODE_GUTS \ + assert(pRExC_state); assert(name) + STATIC char * S_regpatws(RExC_state_t *pRExC_state, char *p, const bool recognize_comment) __attribute__warn_unused_result__ __attribute__nonnull__(1) diff --git a/regcomp.c b/regcomp.c index 2b69938..049d6e7 100644 --- a/regcomp.c +++ b/regcomp.c @@ -9324,6 +9324,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) regex_charset cs; bool has_use_defaults = FALSE; const char* const seqstart = RExC_parse - 1; /* Point to the '?' */ + int x_mod_count = 0; PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS; @@ -9351,7 +9352,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) switch (*RExC_parse) { /* Code for the imsx flags */ - CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); + CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count); case LOCALE_PAT_MOD: if (has_charset_modifier) { @@ -9488,6 +9489,9 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) if (RExC_flags & RXf_PMf_FOLD) { RExC_contains_i = 1; } + if (PASS2) { + STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count); + } return; /*NOTREACHED*/ default: @@ -9501,6 +9505,10 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) ++RExC_parse; } + + if (PASS2) { + STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count); + } } /* @@ -9931,21 +9939,18 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) num = RExC_npar + num - 1; } - ret = reganode(pRExC_state, GOSUB, num); + ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count); if (!SIZE_ONLY) { if (num > (I32)RExC_rx->nparens) { RExC_parse++; vFAIL("Reference to nonexistent group"); } - ARG2L_SET( ret, RExC_recurse_count++); - RExC_emit++; + RExC_recurse_count++; DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret))); - } else { - RExC_size++; - } - RExC_seen |= REG_RECURSE_SEEN; + } + RExC_seen |= REG_RECURSE_SEEN; Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ Set_Node_Offset(ret, parse_start); /* MJD */ @@ -10008,17 +10013,22 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (is_logical) { regnode *eval; ret = reg_node(pRExC_state, LOGICAL); - eval = reganode(pRExC_state, EVAL, n); + + eval = reg2Lanode(pRExC_state, EVAL, + n, + + /* for later propagation into (??{}) + * return value */ + RExC_flags & RXf_PMf_COMPILETIME + ); if (!SIZE_ONLY) { ret->flags = 2; - /* for later propagation into (??{}) return value */ - eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME); } REGTAIL(pRExC_state, ret, eval); /* deal with the length of this later - MJD */ return ret; } - ret = reganode(pRExC_state, EVAL, n); + ret = reg2Lanode(pRExC_state, EVAL, n, 0); Set_Node_Length(ret, RExC_parse - parse_start + 1); Set_Node_Offset(ret, parse_start); return ret; @@ -13657,7 +13667,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } else { /* Is a backslash; get the code point of the char after it */ - if (UTF && ! UTF8_IS_INVARIANT(RExC_parse)) { + if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) { value = utf8n_to_uvchr((U8*)RExC_parse, RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT); @@ -15416,21 +15426,23 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) } } -/* -- reg_node - emit a node -*/ -STATIC regnode * /* Location. */ -S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) +STATIC regnode * +S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name) { - regnode *ptr; + /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra + * space. In pass1, it aligns and increments RExC_size; in pass2, + * RExC_emit */ + regnode * const ret = RExC_emit; GET_RE_DEBUG_FLAGS_DECL; - PERL_ARGS_ASSERT_REG_NODE; + PERL_ARGS_ASSERT_REGNODE_GUTS; + + assert(extra_size >= regarglen[op]); if (SIZE_ONLY) { SIZE_ALIGN(RExC_size); - RExC_size += 1; + RExC_size += 1 + extra_size; return(ret); } if (RExC_emit >= RExC_emit_bound) @@ -15438,13 +15450,13 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) op, (void*)RExC_emit, (void*)RExC_emit_bound); NODE_ALIGN_FILL(ret); - ptr = ret; - FILL_ADVANCE_NODE(ptr, op); -#ifdef RE_TRACK_PATTERN_OFFSETS +#ifndef RE_TRACK_PATTERN_OFFSETS + PERL_UNUSED_ARG(name); +#else if (RExC_offsets) { /* MJD */ MJD_OFFSET_DEBUG( ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", - "reg_node", __LINE__, + name, __LINE__, PL_reg_name[op], (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", @@ -15454,7 +15466,26 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); } #endif - RExC_emit = ptr; + return(ret); +} + +/* +- reg_node - emit a node +*/ +STATIC regnode * /* Location. */ +S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) +{ + regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node"); + + PERL_ARGS_ASSERT_REG_NODE; + + assert(regarglen[op] == 0); + + if (PASS2) { + regnode *ptr = ret; + FILL_ADVANCE_NODE(ptr, op); + RExC_emit = ptr; + } return(ret); } @@ -15464,54 +15495,36 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) STATIC regnode * /* Location. */ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) { - regnode *ptr; - regnode * const ret = RExC_emit; - GET_RE_DEBUG_FLAGS_DECL; + regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode"); PERL_ARGS_ASSERT_REGANODE; - if (SIZE_ONLY) { - SIZE_ALIGN(RExC_size); - RExC_size += 2; - /* - We can't do this: + assert(regarglen[op] == 1); - assert(2==regarglen[op]+1); + if (PASS2) { + regnode *ptr = ret; + FILL_ADVANCE_NODE_ARG(ptr, op, arg); + RExC_emit = ptr; + } + return(ret); +} - Anything larger than this has to allocate the extra amount. - If we changed this to be: +STATIC regnode * +S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2) +{ + /* emit a node with U32 and I32 arguments */ - RExC_size += (1 + regarglen[op]); + regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode"); - then it wouldn't matter. Its not clear what side effect - might come from that so its not done so far. - -- dmq - */ - return(ret); - } - if (RExC_emit >= RExC_emit_bound) - Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", - op, (void*)RExC_emit, (void*)RExC_emit_bound); + PERL_ARGS_ASSERT_REG2LANODE; - NODE_ALIGN_FILL(ret); - ptr = ret; - FILL_ADVANCE_NODE_ARG(ptr, op, arg); -#ifdef RE_TRACK_PATTERN_OFFSETS - if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG( - ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", - "reganode", - __LINE__, - PL_reg_name[op], - (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? - "Overwriting end of array!\n" : "OK", - (UV)(RExC_emit - RExC_emit_start), - (UV)(RExC_parse - RExC_start), - (UV)RExC_offsets[0])); - Set_Cur_Node_Offset; + assert(regarglen[op] == 2); + + if (PASS2) { + regnode *ptr = ret; + FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2); + RExC_emit = ptr; } -#endif - RExC_emit = ptr; return(ret); } diff --git a/regcomp.h b/regcomp.h index 1d41d6e..c08e5ad 100644 --- a/regcomp.h +++ b/regcomp.h @@ -216,7 +216,7 @@ struct regnode_charclass { U8 flags; U8 type; U16 next_off; - U32 arg1; + U32 arg1; /* set by set_ANYOF_arg() */ char bitmap[ANYOF_BITMAP_SIZE]; /* only compile-time */ }; @@ -361,6 +361,13 @@ struct regnode_ssc { (ptr)->type = op; (ptr)->next_off = 0; (ptr)++; } STMT_END #define FILL_ADVANCE_NODE_ARG(ptr, op, arg) STMT_START { \ ARG_SET(ptr, arg); FILL_ADVANCE_NODE(ptr, op); (ptr) += 1; } STMT_END +#define FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2) \ + STMT_START { \ + ARG_SET(ptr, arg1); \ + ARG2L_SET(ptr, arg2); \ + FILL_ADVANCE_NODE(ptr, op); \ + (ptr) += 2; \ + } STMT_END #define REG_MAGIC 0234 @@ -513,10 +520,6 @@ struct regnode_ssc { /* Utility macros for the bitmap and classes of ANYOF */ -#define ANYOF_SIZE (sizeof(struct regnode_charclass)) -#define ANYOF_POSIXL_SIZE (sizeof(regnode_charclass_posixl)) -#define ANYOF_CLASS_SIZE ANYOF_POSIXL_SIZE - #define ANYOF_FLAGS(p) ((p)->flags) #define ANYOF_BIT(c) (1U << ((c) & 7)) @@ -572,9 +575,8 @@ struct regnode_ssc { #define ANYOF_BITMAP_CLEARALL(p) \ Zero (ANYOF_BITMAP(p), ANYOF_BITMAP_SIZE) -#define ANYOF_SKIP ((ANYOF_SIZE - 1)/sizeof(regnode)) -#define ANYOF_POSIXL_SKIP ((ANYOF_POSIXL_SIZE - 1)/sizeof(regnode)) -#define ANYOF_CLASS_SKIP ANYOF_POSIXL_SKIP +#define ANYOF_SKIP EXTRA_SIZE(struct regnode_charclass) +#define ANYOF_POSIXL_SKIP EXTRA_SIZE(regnode_charclass_posixl) /* * Utility definitions. diff --git a/regcomp.sym b/regcomp.sym index 6908712..2178503 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -57,7 +57,7 @@ NBOUNDA NBOUND, no ; Match "" at any word non-boundary using ASCI REG_ANY REG_ANY, no 0 S ; Match any one character (except newline). SANY REG_ANY, no 0 S ; Match any one character. CANY REG_ANY, no 0 S ; Match any one byte. -ANYOF ANYOF, sv 0 S ; Match character in (or not in) this class, single char match only +ANYOF ANYOF, sv 1 S ; Match character in (or not in) this class, single char match only #* POSIX Character Classes: # Order of the below is important. See ordering comment above. @@ -165,7 +165,7 @@ GROUPP GROUPP, num 1 ; Whether the group matched. #*The heavy worker -EVAL EVAL, evl 1 ; Execute some Perl code. +EVAL EVAL, evl/flags 2L ; Execute some Perl code. #*Modifiers diff --git a/regexec.c b/regexec.c index e44691e..3af3e37 100644 --- a/regexec.c +++ b/regexec.c @@ -5448,7 +5448,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL, rex->engine, NULL, NULL, /* copy /msix etc to inner pattern */ - scan->flags, + ARG2L(scan), pm_flags); if (!(SvFLAGS(ret) diff --git a/regexp.h b/regexp.h index 54bae12..ff16410 100644 --- a/regexp.h +++ b/regexp.h @@ -274,11 +274,18 @@ and check for NULL. #define RXf_PMf_STD_PMMOD (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_FOLD|RXf_PMf_EXTENDED) -#define CASE_STD_PMMOD_FLAGS_PARSE_SET(pmfl) \ - case IGNORE_PAT_MOD: *(pmfl) |= RXf_PMf_FOLD; break; \ - case MULTILINE_PAT_MOD: *(pmfl) |= RXf_PMf_MULTILINE; break; \ - case SINGLE_PAT_MOD: *(pmfl) |= RXf_PMf_SINGLELINE; break; \ - case XTENDED_PAT_MOD: *(pmfl) |= RXf_PMf_EXTENDED; break +#define CASE_STD_PMMOD_FLAGS_PARSE_SET(pmfl, x_count) \ + case IGNORE_PAT_MOD: *(pmfl) |= RXf_PMf_FOLD; break; \ + case MULTILINE_PAT_MOD: *(pmfl) |= RXf_PMf_MULTILINE; break; \ + case SINGLE_PAT_MOD: *(pmfl) |= RXf_PMf_SINGLELINE; break; \ + case XTENDED_PAT_MOD: *(pmfl) |= RXf_PMf_EXTENDED; (x_count)++; break; + +#define STD_PMMOD_FLAGS_PARSE_X_WARN(x_count) \ + if (UNLIKELY((x_count) > 1)) { \ + Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ + "Having more than one /%c regexp modifier is deprecated", \ + XTENDED_PAT_MOD); \ + } /* Note, includes charset ones, assumes 0 is the default for them */ #define STD_PMMOD_FLAGS_CLEAR(pmfl) \ @@ -321,7 +328,7 @@ and check for NULL. /* This string is expected by regcomp.c to be ordered so that the first * character is the flag in bit RXf_PMf_STD_PMMOD_SHIFT of extflags; the next * character is bit +1, etc. */ -#define STD_PAT_MODS "msix" +#define STD_PAT_MODS "msixx" #define CHARSET_PAT_MODS ASCII_RESTRICT_PAT_MODS DEPENDS_PAT_MODS LOCALE_PAT_MODS UNICODE_PAT_MODS @@ -341,61 +348,89 @@ and check for NULL. * */ -/* Leave some space, so future bit allocations can go either in the shared or - * unshared area without affecting binary compatibility */ -#define RXf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT) - /* Set in Perl_pmruntime if op_flags & OPf_SPECIAL, i.e. split. Will be used by regex engines to check whether they should set RXf_SKIPWHITE */ -#define RXf_SPLIT (1<<(RXf_BASE_SHIFT-1)) -#if RXf_SPLIT != RXf_PMf_SPLIT -# error "RXf_SPLIT does not match RXf_PMf_SPLIT" -#endif +#define RXf_SPLIT RXf_PMf_SPLIT + +/* Currently the regex flags occupy a single 32-bit word. Not all bits are + * currently used. The lower bits are shared with their corresponding PMf flag + * bits, up to but not including _RXf_PMf_SHIFT_NEXT. The unused bits + * immediately follow; finally the used RXf-only (unshared) bits, so that the + * highest bit in the word is used. This gathers all the unused bits as a pool + * in the middle, like so: 11111111111111110000001111111111 + * where the '1's represent used bits, and the '0's unused. This design allows + * us to allocate off one end of the pool if we need to add a shared bit, and + * off the other end if we need a non-shared bit, without disturbing the other + * bits. This maximizes the likelihood of being able to change things without + * breaking binary compatibility. + * + * To add shared bits, do so in op_reg_common.h. This should change + * _RXf_PMf_SHIFT_NEXT so that things won't compile. Then come to regexp.h and + * op.h and adjust the constant adders in the definitions of RXf_BASE_SHIFT and + * Pmf_BASE_SHIFT down by the number of shared bits you added. That's it. + * Things should be binary compatible. But if either of these gets to having + * to subtract rather than add, leave at 0 and instead adjust all the entries + * that are in terms of it. But if the first one of those is already + * RXf_BASE_SHIFT+0, there are no bits left, and a redesign is in order. + * + * To remove unshared bits, just delete its entry. If you're where breaking + * binary compatibility is ok to do, you might want to adjust things to move + * the newly opened space so that it gets absorbed into the common pool. + * + * To add unshared bits, first use up any gaps in the middle. Otherwise, + * allocate off the low end until you get to RXf_BASE_SHIFT+0. If that isn't + * enough, move RXf_BASE_SHIFT down (if possible) and add the new bit at the + * other end instead; this preserves binary compatibility. + * + * For the regexp bits, PL_reg_extflags_name[] in regnodes.h has a comment + * giving which bits are used/unused */ -/* Do we have some sort of anchor? */ -#define RXf_IS_ANCHORED (1<<(RXf_BASE_SHIFT+0)) -#define RXf_UNUSED1 (1<<(RXf_BASE_SHIFT+1)) -#define RXf_UNUSED2 (1<<(RXf_BASE_SHIFT+2)) -#define RXf_UNUSED3 (1<<(RXf_BASE_SHIFT+3)) -#define RXf_UNUSED4 (1<<(RXf_BASE_SHIFT+4)) -#define RXf_UNUSED5 (1<<(RXf_BASE_SHIFT+5)) +#define RXf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT + 4) /* What we have seen */ -#define RXf_NO_INPLACE_SUBST (1<<(RXf_BASE_SHIFT+6)) -#define RXf_EVAL_SEEN (1<<(RXf_BASE_SHIFT+7)) -#define RXf_UNUSED8 (1<<(RXf_BASE_SHIFT+8)) +#define RXf_NO_INPLACE_SUBST (1U<<(RXf_BASE_SHIFT+2)) +#define RXf_EVAL_SEEN (1U<<(RXf_BASE_SHIFT+3)) /* Special */ -#define RXf_UNBOUNDED_QUANTIFIER_SEEN (1<<(RXf_BASE_SHIFT+9)) -#define RXf_CHECK_ALL (1<<(RXf_BASE_SHIFT+10)) +#define RXf_UNBOUNDED_QUANTIFIER_SEEN (1U<<(RXf_BASE_SHIFT+4)) +#define RXf_CHECK_ALL (1U<<(RXf_BASE_SHIFT+5)) /* UTF8 related */ -#define RXf_MATCH_UTF8 (1<<(RXf_BASE_SHIFT+11)) /* $1 etc are utf8 */ +#define RXf_MATCH_UTF8 (1U<<(RXf_BASE_SHIFT+6)) /* $1 etc are utf8 */ /* Intuit related */ -#define RXf_USE_INTUIT_NOML (1<<(RXf_BASE_SHIFT+12)) -#define RXf_USE_INTUIT_ML (1<<(RXf_BASE_SHIFT+13)) -#define RXf_INTUIT_TAIL (1<<(RXf_BASE_SHIFT+14)) +#define RXf_USE_INTUIT_NOML (1U<<(RXf_BASE_SHIFT+7)) +#define RXf_USE_INTUIT_ML (1U<<(RXf_BASE_SHIFT+8)) +#define RXf_INTUIT_TAIL (1U<<(RXf_BASE_SHIFT+9)) #define RXf_USE_INTUIT (RXf_USE_INTUIT_NOML|RXf_USE_INTUIT_ML) +/* Do we have some sort of anchor? */ +#define RXf_IS_ANCHORED (1U<<(RXf_BASE_SHIFT+10)) + /* Copy and tainted info */ -#define RXf_COPY_DONE (1<<(RXf_BASE_SHIFT+16)) +#define RXf_COPY_DONE (1U<<(RXf_BASE_SHIFT+11)) /* post-execution: $1 et al are tainted */ -#define RXf_TAINTED_SEEN (1<<(RXf_BASE_SHIFT+17)) +#define RXf_TAINTED_SEEN (1U<<(RXf_BASE_SHIFT+12)) /* this pattern was tainted during compilation */ -#define RXf_TAINTED (1<<(RXf_BASE_SHIFT+18)) +#define RXf_TAINTED (1U<<(RXf_BASE_SHIFT+13)) /* Flags indicating special patterns */ -#define RXf_START_ONLY (1<<(RXf_BASE_SHIFT+19)) /* Pattern is /^/ */ -#define RXf_SKIPWHITE (1<<(RXf_BASE_SHIFT+20)) /* Pattern is for a split " " */ -#define RXf_WHITE (1<<(RXf_BASE_SHIFT+21)) /* Pattern is /\s+/ */ -#define RXf_NULL (1U<<(RXf_BASE_SHIFT+22)) /* Pattern is // */ -#if RXf_BASE_SHIFT+22 > 31 -# error Too many RXf_PMf bits used. See regnodes.h for any spare in middle +#define RXf_START_ONLY (1U<<(RXf_BASE_SHIFT+14)) /* Pattern is /^/ */ +#define RXf_SKIPWHITE (1U<<(RXf_BASE_SHIFT+15)) /* Pattern is for a */ + /* split " " */ +#define RXf_WHITE (1U<<(RXf_BASE_SHIFT+16)) /* Pattern is /\s+/ */ +#define RXf_NULL (1U<<(RXf_BASE_SHIFT+17)) /* Pattern is // */ + +/* See comments at the beginning of these defines about adding bits. The + * highest bit position should be used, so that if RXf_BASE_SHIFT gets + * increased, the #error below will be triggered so that you will be reminded + * to adjust things at the other end to keep the bit positions unchanged */ +#if RXf_BASE_SHIFT+17 > 31 +# error Too many RXf_PMf bits used. See comments at beginning of these for what to do #endif /* diff --git a/regnodes.h b/regnodes.h index 133ad65..7bc75c9 100644 --- a/regnodes.h +++ b/regnodes.h @@ -310,7 +310,7 @@ static const U8 regarglen[] = { 0, /* REG_ANY */ 0, /* SANY */ 0, /* CANY */ - 0, /* ANYOF */ + EXTRA_SIZE(struct regnode_1), /* ANYOF */ 0, /* POSIXD */ 0, /* POSIXL */ 0, /* POSIXU */ @@ -357,7 +357,7 @@ static const U8 regarglen[] = { EXTRA_SIZE(struct regnode_1), /* SUSPEND */ EXTRA_SIZE(struct regnode_1), /* IFTHEN */ EXTRA_SIZE(struct regnode_1), /* GROUPP */ - EXTRA_SIZE(struct regnode_1), /* EVAL */ + EXTRA_SIZE(struct regnode_2L), /* EVAL */ 0, /* MINMOD */ 0, /* LOGICAL */ EXTRA_SIZE(struct regnode_1), /* RENUM */ @@ -632,32 +632,32 @@ EXTCONST char * const PL_reg_name[] = { EXTCONST char * PL_reg_extflags_name[]; #else EXTCONST char * const PL_reg_extflags_name[] = { - /* Bits in extflags defined: 11111110111111111111111111111111 */ + /* Bits in extflags defined: 11111111111111110000001111111111 */ "MULTILINE", /* 0x00000001 */ "SINGLELINE", /* 0x00000002 */ "FOLD", /* 0x00000004 */ "EXTENDED", /* 0x00000008 */ - "KEEPCOPY", /* 0x00000010 */ - "CHARSET0", /* 0x00000020 : "CHARSET" - 0x000000e0 */ - "CHARSET1", /* 0x00000040 : "CHARSET" - 0x000000e0 */ - "CHARSET2", /* 0x00000080 : "CHARSET" - 0x000000e0 */ - "SPLIT", /* 0x00000100 */ - "IS_ANCHORED", /* 0x00000200 */ - "UNUSED1", /* 0x00000400 */ - "UNUSED2", /* 0x00000800 */ - "UNUSED3", /* 0x00001000 */ - "UNUSED4", /* 0x00002000 */ - "UNUSED5", /* 0x00004000 */ - "NO_INPLACE_SUBST", /* 0x00008000 */ - "EVAL_SEEN", /* 0x00010000 */ - "UNUSED8", /* 0x00020000 */ + "EXTENDED_MORE", /* 0x00000010 */ + "KEEPCOPY", /* 0x00000020 */ + "CHARSET0", /* 0x00000040 : "CHARSET" - 0x000001c0 */ + "CHARSET1", /* 0x00000080 : "CHARSET" - 0x000001c0 */ + "CHARSET2", /* 0x00000100 : "CHARSET" - 0x000001c0 */ + "SPLIT", /* 0x00000200 */ + "UNUSED_BIT_10", /* 0x00000400 */ + "UNUSED_BIT_11", /* 0x00000800 */ + "UNUSED_BIT_12", /* 0x00001000 */ + "UNUSED_BIT_13", /* 0x00002000 */ + "UNUSED_BIT_14", /* 0x00004000 */ + "UNUSED_BIT_15", /* 0x00008000 */ + "NO_INPLACE_SUBST", /* 0x00010000 */ + "EVAL_SEEN", /* 0x00020000 */ "UNBOUNDED_QUANTIFIER_SEEN",/* 0x00040000 */ "CHECK_ALL", /* 0x00080000 */ "MATCH_UTF8", /* 0x00100000 */ "USE_INTUIT_NOML", /* 0x00200000 */ "USE_INTUIT_ML", /* 0x00400000 */ "INTUIT_TAIL", /* 0x00800000 */ - "UNUSED_BIT_24", /* 0x01000000 */ + "IS_ANCHORED", /* 0x01000000 */ "COPY_DONE", /* 0x02000000 */ "TAINTED_SEEN", /* 0x04000000 */ "TAINTED", /* 0x08000000 */ diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t index 347234f..5162aac 100644 --- a/t/re/reg_mesg.t +++ b/t/re/reg_mesg.t @@ -439,6 +439,10 @@ my @deprecated = ( 'Unescaped left brace in regex is deprecated, passed through {#} m/\q{{#}/' ], '/:{4,a}/' => 'Unescaped left brace in regex is deprecated, passed through {#} m/:{{#}4,a}/', + '/abc/xix' => 'Having more than one /x regexp modifier is deprecated', + '/(?xmsixp:abc)/' => 'Having more than one /x regexp modifier is deprecated', + '/(?xmsixp)abc/' => 'Having more than one /x regexp modifier is deprecated', + '/(?xxxx:abc)/' => 'Having more than one /x regexp modifier is deprecated', ); while (my ($regex, $expect) = splice @death, 0, 2) { diff --git a/toke.c b/toke.c index aeaeb46..33a68c6 100644 --- a/toke.c +++ b/toke.c @@ -8668,14 +8668,14 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni) } static bool -S_pmflag(pTHX_ const char* const valid_flags, U32 * pmfl, char** s, char* charset) { - - /* Adds, subtracts to/from 'pmfl' based on regex modifier flags found in - * the parse starting at 's', based on the subset that are valid in this - * context input to this routine in 'valid_flags'. Advances s. Returns - * TRUE if the input should be treated as a valid flag, so the next char - * may be as well; otherwise FALSE. 'charset' should point to a NUL upon - * first call on the current regex. This routine will set it to any +S_pmflag(pTHX_ const char* const valid_flags, U32 * pmfl, char** s, char* charset, unsigned int * x_mod_count) { + + /* Adds, subtracts to/from 'pmfl' based on the next regex modifier flag + * found in the parse starting at 's', based on the subset that are valid + * in this context input to this routine in 'valid_flags'. Advances s. + * Returns TRUE if the input should be treated as a valid flag, so the next + * char may be as well; otherwise FALSE. 'charset' should point to a NUL + * upon first call on the current regex. This routine will set it to any * charset modifier found. The caller shouldn't change it. This way, * another charset modifier encountered in the parse can be detected as an * error, as we have decided to allow only one */ @@ -8697,7 +8697,7 @@ S_pmflag(pTHX_ const char* const valid_flags, U32 * pmfl, char** s, char* charse switch (c) { - CASE_STD_PMMOD_FLAGS_PARSE_SET(pmfl); + CASE_STD_PMMOD_FLAGS_PARSE_SET(pmfl, *x_mod_count); case GLOBAL_PAT_MOD: *pmfl |= PMf_GLOBAL; break; case CONTINUE_PAT_MOD: *pmfl |= PMf_CONTINUE; break; case ONCE_PAT_MOD: *pmfl |= PMf_KEEP; break; @@ -8772,6 +8772,7 @@ S_scan_pat(pTHX_ char *start, I32 type) const char * const valid_flags = (const char *)((type == OP_QR) ? QR_PAT_MODS : M_PAT_MODS); char charset = '\0'; /* character set modifier */ + unsigned int x_mod_count = 0; PERL_ARGS_ASSERT_SCAN_PAT; @@ -8821,7 +8822,9 @@ S_scan_pat(pTHX_ char *start, I32 type) pm->op_pmflags |= PMf_IS_QR; } - while (*s && S_pmflag(aTHX_ valid_flags, &(pm->op_pmflags), &s, &charset)) {}; + while (*s && S_pmflag(aTHX_ valid_flags, &(pm->op_pmflags), + &s, &charset, &x_mod_count)) + {}; /* issue a warning if /c is specified,but /g is not */ if ((pm->op_pmflags & PMf_CONTINUE) && !(pm->op_pmflags & PMf_GLOBAL)) { @@ -8829,6 +8832,8 @@ S_scan_pat(pTHX_ char *start, I32 type) "Use of /c modifier is meaningless without /g" ); } + STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count); + PL_lex_op = (OP*)pm; pl_yylval.ival = OP_MATCH; return s; @@ -8843,6 +8848,7 @@ S_scan_subst(pTHX_ char *start) line_t first_line; I32 es = 0; char charset = '\0'; /* character set modifier */ + unsigned int x_mod_count = 0; char *t; PERL_ARGS_ASSERT_SCAN_SUBST; @@ -8876,12 +8882,15 @@ S_scan_subst(pTHX_ char *start) s++; es++; } - else if (! S_pmflag(aTHX_ S_PAT_MODS, &(pm->op_pmflags), &s, &charset)) + else if (! S_pmflag(aTHX_ S_PAT_MODS, &(pm->op_pmflags), + &s, &charset, &x_mod_count)) { break; } } + STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count); + if ((pm->op_pmflags & PMf_CONTINUE)) { Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), "Use of /c modifier is meaningless in s///" ); } -- Perl5 Master Repository
