In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/cf052a6bbe9dd79a5a36f3d2a83f1028c0cfc7a8?hp=4d25f022879459bbce407d5dff37fc799b7dcd68>
- Log ----------------------------------------------------------------- commit cf052a6bbe9dd79a5a36f3d2a83f1028c0cfc7a8 Author: Matthew Horsfall <[email protected]> Date: Wed Dec 17 16:28:34 2014 -0500 Add tests for stringification of regexps containing /n M t/re/reg_nocapture.t commit cf4f23c735c1f139a2be59157d1761ab7a58520f Author: Matthew Horsfall <[email protected]> Date: Wed Dec 17 16:20:46 2014 -0500 Bump re.pm version for changes M ext/re/re.pm commit 41d7c59e80e58de2b82e3759c8de14ee8aedb564 Author: Matthew Horsfall (alh) <[email protected]> Date: Wed Oct 22 20:56:47 2014 -0400 Support for nocapture regexp flag /n This flag will prevent () from capturing and filling in $1, $2, etc... Named captures will still work though, and if used will cause $1, $2, etc... to be filled in *only* within named groups. The motivation behind this is to allow the common construct of: /(?:b|c)a(?:t|n)/ To be rewritten more cleanly as: /(b|c)a(t|n)/n When you want grouping but no memory penalty on captures. You can also use ?n inside of a () directly to avoid capturing, and ?-n inside of a () to negate its effects if you want to capture. M MANIFEST M dump.c M ext/re/re.pm M regcomp.c M regexp.h A t/re/reg_nocapture.t commit fde14af1e494660628714463be39d015ab768ff4 Author: Matthew Horsfall (alh) <[email protected]> Date: Wed Oct 22 20:50:13 2014 -0400 Fixes to make test pass for regexp nocapture bit addition. * Make Devel-Peek/t/Peek.t less sensitive to regexp flag changes. Devel-Peek had flag names and binary representation hardcoded. Flag names *should* be enough. Otherwise we have to update bits of this test every time we muck with flags that don't affect the flags being tested. * Let B::Deparse know about the new RXf_PMf_CHARSET shift value. M ext/Devel-Peek/t/Peek.t M lib/B/Deparse.pm commit e3b64d84f2a2d0eace596457dba25ca4974384e5 Author: Karl Williamson <[email protected]> Date: Wed Oct 22 12:38:20 2014 -0600 Create bit for /n. M op.h M op_reg_common.h M regexp.h M regnodes.h ----------------------------------------------------------------------- Summary of changes: MANIFEST | 1 + dump.c | 1 + ext/Devel-Peek/t/Peek.t | 4 ++-- ext/re/re.pm | 5 ++-- lib/B/Deparse.pm | 2 +- op.h | 2 +- op_reg_common.h | 23 +++++++++++-------- regcomp.c | 6 +++-- regexp.h | 15 +++++++----- regnodes.h | 14 ++++++------ t/re/reg_nocapture.t | 61 +++++++++++++++++++++++++++++++++++++++++++++++++ 11 files changed, 103 insertions(+), 31 deletions(-) create mode 100644 t/re/reg_nocapture.t diff --git a/MANIFEST b/MANIFEST index 6eb0ed8..73d8788 100644 --- a/MANIFEST +++ b/MANIFEST @@ -5462,6 +5462,7 @@ t/re/reg_fold.t See if case folding works properly t/re/reg_mesg.t See if one can get regular expression errors t/re/reg_namedcapture.t Make sure glob assignment doesn't break named capture t/re/reg_nc_tie.t Test the tied methods of Tie::Hash::NamedCapture +t/re/reg_nocapture.t Test the /n flag for regexps t/re/reg_pmod.t See if regexp /p modifier works as expected t/re/reg_posixcc.t See if posix character classes behave consistently t/re/re_tests Regular expressions for regexp.t diff --git a/dump.c b/dump.c index f888a48..0ed7962 100644 --- a/dump.c +++ b/dump.c @@ -1382,6 +1382,7 @@ const struct flag_to_name regexp_extflags_names[] = { {RXf_PMf_EXTENDED, "PMf_EXTENDED,"}, {RXf_PMf_EXTENDED_MORE, "PMf_EXTENDED_MORE,"}, {RXf_PMf_KEEPCOPY, "PMf_KEEPCOPY,"}, + {RXf_PMf_NOCAPTURE, "PMf_NOCAPURE,"}, {RXf_IS_ANCHORED, "IS_ANCHORED,"}, {RXf_NO_INPLACE_SUBST, "NO_INPLACE_SUBST,"}, {RXf_EVAL_SEEN, "EVAL_SEEN,"}, diff --git a/ext/Devel-Peek/t/Peek.t b/ext/Devel-Peek/t/Peek.t index 118b35e..062aa2e 100644 --- a/ext/Devel-Peek/t/Peek.t +++ b/ext/Devel-Peek/t/Peek.t @@ -1190,7 +1190,7 @@ do_test('UTF-8 in a regular expression', CUR = 13 STASH = $ADDR "Regexp" COMPFLAGS = 0x0 \(\) - EXTFLAGS = 0x680080 \(CHECK_ALL,USE_INTUIT_NOML,USE_INTUIT_ML\) + EXTFLAGS = $ADDR \(CHECK_ALL,USE_INTUIT_NOML,USE_INTUIT_ML\) (?: ENGINE = $ADDR \(STANDARD\) )? INTFLAGS = 0x0(?: \(\))? NPARENS = 0 @@ -1213,7 +1213,7 @@ do_test('UTF-8 in a regular expression', PV = $ADDR "\(\?\^u:\\\\\\\\x\{100\}\)" \[UTF8 "\(\?\^u:\\\\\\\\x\{100\}\)"\] CUR = 13 COMPFLAGS = 0x0 \(\) - EXTFLAGS = 0x680080 \(CHECK_ALL,USE_INTUIT_NOML,USE_INTUIT_ML\) + EXTFLAGS = $ADDR \(CHECK_ALL,USE_INTUIT_NOML,USE_INTUIT_ML\) (?: ENGINE = $ADDR \(STANDARD\) )? INTFLAGS = 0x0(?: \(\))? NPARENS = 0 diff --git a/ext/re/re.pm b/ext/re/re.pm index 7c2044e..bee65d2 100644 --- a/ext/re/re.pm +++ b/ext/re/re.pm @@ -4,7 +4,7 @@ package re; use strict; use warnings; -our $VERSION = "0.28"; +our $VERSION = "0.29"; our @ISA = qw(Exporter); our @EXPORT_OK = ('regmust', qw(is_regexp regexp_pattern @@ -23,7 +23,8 @@ my %reflags = ( s => 1 << ($PMMOD_SHIFT + 1), i => 1 << ($PMMOD_SHIFT + 2), x => 1 << ($PMMOD_SHIFT + 3), - p => 1 << ($PMMOD_SHIFT + 5), + n => 1 << ($PMMOD_SHIFT + 5), + p => 1 << ($PMMOD_SHIFT + 6), # special cases: d => 0, l => 1, diff --git a/lib/B/Deparse.pm b/lib/B/Deparse.pm index ebb0285..ed47097 100644 --- a/lib/B/Deparse.pm +++ b/lib/B/Deparse.pm @@ -5396,7 +5396,7 @@ sub re_flags { if (my $charset = $pmflags & RXf_PMf_CHARSET) { # Hardcoding this is fragile, but B does not yet export the # constants we need. - $flags .= qw(d l u a aa)[$charset >> 6] + $flags .= qw(d l u a aa)[$charset >> 7] } # The /d flag is indicated by 0; only show it if necessary. elsif ($self->{hinthash} and diff --git a/op.h b/op.h index 80aac56..1e4c9d9 100644 --- a/op.h +++ b/op.h @@ -319,7 +319,7 @@ struct pmop { * allocate off the low end until you get to PMf_BASE_SHIFT+0. If that isn't * enough, move PMf_BASE_SHIFT down (if possible) and add the new bit at the * other end instead; this preserves binary compatibility. */ -#define PMf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT+4) +#define PMf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT+3) /* 'use re "taint"' in scope: taint $1 etc. if target tainted */ #define PMf_RETAINT (1U<<(PMf_BASE_SHIFT+5)) diff --git a/op_reg_common.h b/op_reg_common.h index c3f3e7f..3edd4d8 100644 --- a/op_reg_common.h +++ b/op_reg_common.h @@ -33,7 +33,9 @@ #define RXf_PMf_FOLD (1U << (RXf_PMf_STD_PMMOD_SHIFT+2)) /* /i */ #define RXf_PMf_EXTENDED (1U << (RXf_PMf_STD_PMMOD_SHIFT+3)) /* /x */ #define RXf_PMf_EXTENDED_MORE (1U << (RXf_PMf_STD_PMMOD_SHIFT+4)) /* /xx */ -#define RXf_PMf_KEEPCOPY (1U << (RXf_PMf_STD_PMMOD_SHIFT+5)) /* /p */ +#define RXf_PMf_NOCAPTURE (1U << (RXf_PMf_STD_PMMOD_SHIFT+5)) /* /n */ + +#define RXf_PMf_KEEPCOPY (1U << (RXf_PMf_STD_PMMOD_SHIFT+6)) /* /p */ /* The character set for the regex is stored in a field of more than one bit * using an enum, for reasons of compactness and to ensure that the options are @@ -49,7 +51,7 @@ typedef enum { REGEX_ASCII_MORE_RESTRICTED_CHARSET } regex_charset; -#define _RXf_PMf_CHARSET_SHIFT ((RXf_PMf_STD_PMMOD_SHIFT)+6) +#define _RXf_PMf_CHARSET_SHIFT ((RXf_PMf_STD_PMMOD_SHIFT)+7) #define RXf_PMf_CHARSET (7U << (_RXf_PMf_CHARSET_SHIFT)) /* 3 bits */ /* Manually decorate these functions here with gcc-style attributes just to @@ -81,22 +83,22 @@ get_regex_charset(const U32 flags) return (regex_charset) ((flags & RXf_PMf_CHARSET) >> _RXf_PMf_CHARSET_SHIFT); } -#define _RXf_PMf_SHIFT_COMPILETIME (RXf_PMf_STD_PMMOD_SHIFT+9) +#define _RXf_PMf_SHIFT_COMPILETIME (RXf_PMf_STD_PMMOD_SHIFT+10) /* Set in Perl_pmruntime if op_flags & OPf_SPECIAL, i.e. split. Will be used by regex engines to check whether they should set RXf_SKIPWHITE */ -#define RXf_PMf_SPLIT (1U<<(RXf_PMf_STD_PMMOD_SHIFT+9)) +#define RXf_PMf_SPLIT (1U<<(RXf_PMf_STD_PMMOD_SHIFT+10)) /* Next available bit after the above. Name begins with '_' so won't be * exported by B */ -#define _RXf_PMf_SHIFT_NEXT (RXf_PMf_STD_PMMOD_SHIFT+10) +#define _RXf_PMf_SHIFT_NEXT (RXf_PMf_STD_PMMOD_SHIFT+11) /* Mask of the above bits. These need to be transferred from op_pmflags to * re->extflags during compilation */ -#define RXf_PMf_COMPILETIME (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE|RXf_PMf_KEEPCOPY|RXf_PMf_CHARSET) +#define RXf_PMf_COMPILETIME (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE|RXf_PMf_KEEPCOPY|RXf_PMf_NOCAPTURE|RXf_PMf_CHARSET) #define RXf_PMf_FLAGCOPYMASK (RXf_PMf_COMPILETIME|RXf_PMf_SPLIT) #if 0 /* Temporary to get Jenkins happy again */ @@ -115,11 +117,12 @@ get_regex_charset(const U32 flags) #define PMf_FOLD (1U<<2) #define PMf_EXTENDED (1U<<3) #define PMf_EXTENDED_MORE (1U<<4) -#define PMf_KEEPCOPY (1U<<5) -#define PMf_CHARSET (7U<<6) -#define PMf_SPLIT (1U<<9) +#define PMf_NOCAPTURE (1U<<5) +#define PMf_KEEPCOPY (1U<<6) +#define PMf_CHARSET (7U<<7) +#define PMf_SPLIT (1U<<10) -#if PMf_MULTILINE != RXf_PMf_MULTILINE || PMf_SINGLELINE != RXf_PMf_SINGLELINE || PMf_FOLD != RXf_PMf_FOLD || PMf_EXTENDED != RXf_PMf_EXTENDED || PMf_EXTENDED_MORE != RXf_PMf_EXTENDED_MORE || PMf_KEE ... [89 chars truncated] +#if PMf_MULTILINE != RXf_PMf_MULTILINE || PMf_SINGLELINE != RXf_PMf_SINGLELINE || PMf_FOLD != RXf_PMf_FOLD || PMf_EXTENDED != RXf_PMf_EXTENDED || PMf_EXTENDED_MORE != RXf_PMf_EXTENDED_MORE || PMf_KEE ... [127 chars truncated] # error RXf_PMf defines are wrong #endif diff --git a/regcomp.c b/regcomp.c index 4556d1a..a58080b 100644 --- a/regcomp.c +++ b/regcomp.c @@ -9587,7 +9587,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) and must be globally applied -- japhy */ switch (*RExC_parse) { - /* Code for the imsx flags */ + /* Code for the imsxn flags */ CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count); case LOCALE_PAT_MOD: @@ -10443,7 +10443,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) goto parse_rest; } /* end switch */ } - else { /* (...) */ + else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */ capturing_parens: parno = RExC_npar; RExC_npar++; @@ -10465,6 +10465,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) Set_Node_Length(ret, 1); /* MJD */ Set_Node_Offset(ret, RExC_parse); /* MJD */ is_open = 1; + } else { + ret = NULL; } } else /* ! paren */ diff --git a/regexp.h b/regexp.h index 81ae0a6..3348e17 100644 --- a/regexp.h +++ b/regexp.h @@ -272,13 +272,14 @@ and check for NULL. #include "op_reg_common.h" -#define RXf_PMf_STD_PMMOD (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_FOLD|RXf_PMf_EXTENDED) +#define RXf_PMf_STD_PMMOD (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_NOCAPTURE) #define CASE_STD_PMMOD_FLAGS_PARSE_SET(pmfl, x_count) \ case IGNORE_PAT_MOD: *(pmfl) |= RXf_PMf_FOLD; break; \ case MULTILINE_PAT_MOD: *(pmfl) |= RXf_PMf_MULTILINE; break; \ case SINGLE_PAT_MOD: *(pmfl) |= RXf_PMf_SINGLELINE; break; \ - case XTENDED_PAT_MOD: *(pmfl) |= RXf_PMf_EXTENDED; (x_count)++; break; + case XTENDED_PAT_MOD: *(pmfl) |= RXf_PMf_EXTENDED; (x_count)++; break;\ + case NOCAPTURE_PAT_MOD: *(pmfl) |= RXf_PMf_NOCAPTURE; break; #define STD_PMMOD_FLAGS_PARSE_X_WARN(x_count) \ if (UNLIKELY((x_count) > 1)) { \ @@ -289,7 +290,7 @@ and check for NULL. /* Note, includes charset ones, assumes 0 is the default for them */ #define STD_PMMOD_FLAGS_CLEAR(pmfl) \ - *(pmfl) &= ~(RXf_PMf_FOLD|RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_EXTENDED|RXf_PMf_CHARSET) + *(pmfl) &= ~(RXf_PMf_FOLD|RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_EXTENDED|RXf_PMf_CHARSET|RXf_PMf_NOCAPTURE) /* chars and strings used as regex pattern modifiers * Singular is a 'c'har, plural is a "string" @@ -301,6 +302,7 @@ and check for NULL. #define DEFAULT_PAT_MOD '^' /* Short for all the default modifiers */ #define EXEC_PAT_MOD 'e' #define KEEPCOPY_PAT_MOD 'p' +#define NOCAPTURE_PAT_MOD 'n' #define ONCE_PAT_MOD 'o' #define GLOBAL_PAT_MOD 'g' #define CONTINUE_PAT_MOD 'c' @@ -316,6 +318,7 @@ and check for NULL. #define ONCE_PAT_MODS "o" #define KEEPCOPY_PAT_MODS "p" +#define NOCAPTURE_PAT_MODS "n" #define EXEC_PAT_MODS "e" #define LOOP_PAT_MODS "gc" #define NONDESTRUCT_PAT_MODS "r" @@ -328,7 +331,7 @@ and check for NULL. /* This string is expected by regcomp.c to be ordered so that the first * character is the flag in bit RXf_PMf_STD_PMMOD_SHIFT of extflags; the next * character is bit +1, etc. */ -#define STD_PAT_MODS "msixx" +#define STD_PAT_MODS "msixxn" #define CHARSET_PAT_MODS ASCII_RESTRICT_PAT_MODS DEPENDS_PAT_MODS LOCALE_PAT_MODS UNICODE_PAT_MODS @@ -337,7 +340,7 @@ and check for NULL. * extflags; the next character is in bit +1, etc. */ #define INT_PAT_MODS STD_PAT_MODS KEEPCOPY_PAT_MODS -#define EXT_PAT_MODS ONCE_PAT_MODS KEEPCOPY_PAT_MODS +#define EXT_PAT_MODS ONCE_PAT_MODS KEEPCOPY_PAT_MODS NOCAPTURE_PAT_MODS #define QR_PAT_MODS STD_PAT_MODS EXT_PAT_MODS CHARSET_PAT_MODS #define M_PAT_MODS QR_PAT_MODS LOOP_PAT_MODS #define S_PAT_MODS M_PAT_MODS EXEC_PAT_MODS NONDESTRUCT_PAT_MODS @@ -388,7 +391,7 @@ and check for NULL. * For the regexp bits, PL_reg_extflags_name[] in regnodes.h has a comment * giving which bits are used/unused */ -#define RXf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT + 4) +#define RXf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT + 3) /* What we have seen */ #define RXf_NO_INPLACE_SUBST (1U<<(RXf_BASE_SHIFT+2)) diff --git a/regnodes.h b/regnodes.h index 937dd57..41662a0 100644 --- a/regnodes.h +++ b/regnodes.h @@ -627,18 +627,18 @@ EXTCONST char * const PL_reg_name[] = { EXTCONST char * PL_reg_extflags_name[]; #else EXTCONST char * const PL_reg_extflags_name[] = { - /* Bits in extflags defined: 11111111111111110000001111111111 */ + /* Bits in extflags defined: 11111111111111110000011111111111 */ "MULTILINE", /* 0x00000001 */ "SINGLELINE", /* 0x00000002 */ "FOLD", /* 0x00000004 */ "EXTENDED", /* 0x00000008 */ "EXTENDED_MORE", /* 0x00000010 */ - "KEEPCOPY", /* 0x00000020 */ - "CHARSET0", /* 0x00000040 : "CHARSET" - 0x000001c0 */ - "CHARSET1", /* 0x00000080 : "CHARSET" - 0x000001c0 */ - "CHARSET2", /* 0x00000100 : "CHARSET" - 0x000001c0 */ - "SPLIT", /* 0x00000200 */ - "UNUSED_BIT_10", /* 0x00000400 */ + "NOCAPTURE", /* 0x00000020 */ + "KEEPCOPY", /* 0x00000040 */ + "CHARSET0", /* 0x00000080 : "CHARSET" - 0x00000380 */ + "CHARSET1", /* 0x00000100 : "CHARSET" - 0x00000380 */ + "CHARSET2", /* 0x00000200 : "CHARSET" - 0x00000380 */ + "SPLIT", /* 0x00000400 */ "UNUSED_BIT_11", /* 0x00000800 */ "UNUSED_BIT_12", /* 0x00001000 */ "UNUSED_BIT_13", /* 0x00002000 */ diff --git a/t/re/reg_nocapture.t b/t/re/reg_nocapture.t new file mode 100644 index 0000000..40c3080 --- /dev/null +++ b/t/re/reg_nocapture.t @@ -0,0 +1,61 @@ +#!./perl + +BEGIN { + chdir 't' if -d 't'; + @INC = '../lib'; + require './test.pl'; +} + +use strict; +use warnings; + +plan tests => 25; + +# Some /qr/ tests +my $re = qr/(.*) b c d/; +ok("a b c d" =~ /$re/n, "/n still matches"); +is($1, "a", "Outer /n doesn't affect inner qr//"); + +$re = qr/(.*) b c d/n; +ok("a b c d" =~ /$re/, "qr//n matches"); +is($1, undef, "qr//n prevents capturing"); + +ok("a b c d" =~ $re, "qr// out of // matches"); +is($1, undef, "qr//n prevents capturing"); + +# Some // tests +ok("a b c d" =~ /(a) b c d/n, "//n matches"); +is($1, undef, "/n prevents capture"); + +ok("a b c d" =~ /(a) (b) c d/n, "//n matches with multiple ()"); +is($1, undef, "/n prevents capture in \$1"); +is($2, undef, "/n prevents capture in \$2"); + +# ?n +ok("a b c d" =~ /(?n:a) b c (d)/, "?n matches"); +is($1, 'd', "?n: blocked capture"); + +# ?-n:() +ok("a b c d" =~ /(?-n:(a)) b c (d)/n, "?-n matches"); +is($1, 'a', "?-n:() disabled nocapture"); + +ok("a b c d" =~ /(?<a>.) (?<b>.) (.*)/n, "named capture..."); +is($1, 'a', "named capture allows $1 with /n"); +is($2, 'b', "named capture allows $2 with /n"); +is($3, undef, "(.*) didn't capture with /n"); + +is($+{a}, 'a', "\$+{a} is correct"); +is($+{b}, 'b', "\$+{b} is correct"); + +is(qr/(what)/n, '(?^n:(what))', + 'qr//n stringified is correct'); + +is(qr/(?n:what)/, '(?^:(?n:what))', + 'qr/(?n:...)/ stringified is correct'); + +is(qr/(?-n:what)/, '(?^:(?-n:what))', + 'qr/(?-n:...)/ stringified is correct'); + +is(qr/(?-n:what)/n, '(?^n:(?-n:what))', + 'qr/(?-n:...)/n stringified is correct'); + -- Perl5 Master Repository
