In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/3a1b438d8754210a3357e3be5ccec788e9be1c0a?hp=3f5e3f2f07f57709accec3bfe9c57bc97f48246c>

- Log -----------------------------------------------------------------
commit 3a1b438d8754210a3357e3be5ccec788e9be1c0a
Author: Yves Orton <[email protected]>
Date:   Mon Mar 25 23:15:00 2013 +0100

    fix comment, reindent and add parenthesis for clarity
    
    I had to stare at this expression and make sure there wasn't
    anything tricky for too long, so I added parens, and reindented
    it.

M       dist/B-Deparse/Deparse.pm

commit dbc200c5a1d3ae1d9360435a384c19883bf5f4f6
Author: Yves Orton <[email protected]>
Date:   Mon Mar 25 23:23:40 2013 +0100

    rework split() special case interaction with regex engine
    
    This patch resolves several issues at once. The parts are
    sufficiently interconnected that it is hard to break it down
    into smaller commits. The tickets open for these issues are:
    
      RT #94490  - split and constant folding
      RT #116086 - split "\x20" doesn't work as documented
    
    It additionally corrects some issues with cached regexes that
    were exposed by the split changes (and applied to them).
    
    It effectively reverts 5255171e6cd0accee6f76ea2980e32b3b5b8e171
    and cccd1425414e6518c1fc8b7bcaccfb119320c513.
    
    Prior to this patch the special RXf_SKIPWHITE behavior of
    
        split(" ", $thing)
    
    was only available if Perl could resolve the first argument to
    split at compile time, meaning under various arcane situations.
    
    This manifested as oddities like
    
        my $delim = $cond ? " " : qr/\s+/;
        split $delim, $string;
    
    and
    
        split $cond ? " ", qr/\s+/, $string
    
    not behaving the same as:
    
        ($cond ? split(" ", $string) : split(/\s+/, $string))
    
    which isn't very convenient.
    
    This patch changes this by adding a new flag to the op_pmflags,
    PMf_SPLIT which enables pp_regcomp() to know whether it was called
    as part of split, which allows the RXf_SPLIT to be passed into run
    time regex compilation. We also preserve the original flags so
    pattern caching works properly, by adding a new property to the
    regexp structure, "compflags", and related macros for accessing it.
    We preserve the original flags passed into the compilation process,
    so we can compare when we are trying to decide if we need to
    recompile.
    
    Note that this essentially the opposite fix from the one applied
    originally to fix #94490 in 5255171e6cd0accee6f76ea2980e32b3b5b8e171.
    The reverted patch was meant to make:
    
            split( 0 || " ", $thing )            #1
    
    consistent with
    
            my $x=0; split( $x || " ", $thing )  #2
    
    and not with
    
            split( " ", $thing )                 #3
    
    This was reverted because it broke C<split("\x{20}", $thing)>, and
    because one might argue that is not that #1 does the wrong thing,
    but rather that the behavior of #2 that is wrong. In other words
    we might expect that all three should behave the same as #3, and
    that instead of "fixing" the behavior of #1 to be like #2, we should
    really fix the behavior of #2 to behave like #3. (Which is what we did.)
    
    Also, it doesn't make sense to move the special case detection logic
    further from the regex engine. We really want the regex engine to decide
    this stuff itself, otherwise split " ", ... wouldn't work properly with
    an alternate engine. (Imagine we add a special regexp meta pattern that 
behaves
    the same as " " does in a split /.../. For instance we might make
    split /(*SPLITWHITE)/ trigger the same behavior as split " ".
    
    The other major change as result of this patch is it effectively
    reverts commit cccd1425414e6518c1fc8b7bcaccfb119320c513, which
    was intended to get rid of RXf_SPLIT and RXf_SKIPWHITE, which
    and free up bits in the regex flags structure.
    
    But we dont want to get rid of these vars, and it turns out that
    RXf_SEEN_LOOKBEHIND is used only in the same situation as the new
    RXf_MODIFIES_VARS. So I have renamed RXf_SEEN_LOOKBEHIND to
    RXf_NO_INPLACE_SUBST, and then instead of using two vars we use
    only the one. Which in turn allows RXf_SPLIT and RXf_SKIPWHITE to
    have their bits back.

M       dump.c
M       ext/Devel-Peek/t/Peek.t
M       op.c
M       op.h
M       op_reg_common.h
M       pod/perlreapi.pod
M       pp.c
M       pp_ctl.c
M       pp_hot.c
M       regcomp.c
M       regexp.h
M       regnodes.h
M       t/op/split.t
M       t/re/recompile.t

commit c9d98c4e542a0779fb34f107a15def6ed7ff3f98
Author: Yves Orton <[email protected]>
Date:   Mon Mar 25 23:06:22 2013 +0100

    simplify regcomp.c by using vars to avoid repeated macros
    
    Use two temporary variables to simplify the logic, and maybe
    speed up a nanosecond or two.
    
    Also chainsaw some long dead logic. (I #ifdef'ed it out years ago)

M       regcomp.c

commit 6976c98688aafe67a4a8b88a77e48e96b8ea0fb9
Author: Yves Orton <[email protected]>
Date:   Mon Mar 25 20:08:56 2013 +0100

    Improve how regcomp.pl handles multibits
    
    In preparation for future changes.

M       regen/regcomp.pl
M       regnodes.h
-----------------------------------------------------------------------

Summary of changes:
 dist/B-Deparse/Deparse.pm |   15 ++++++++-----
 dump.c                    |   21 ++++++++++++++++--
 ext/Devel-Peek/t/Peek.t   |    2 +
 op.c                      |   13 ++++++-----
 op.h                      |    2 +-
 op_reg_common.h           |   23 +++++++++++++++-----
 pod/perlreapi.pod         |    8 ++++--
 pp.c                      |    7 ++---
 pp_ctl.c                  |   19 +++++++++++++++-
 pp_hot.c                  |    2 +-
 regcomp.c                 |   49 ++++++++++++++++++++++++++++----------------
 regen/regcomp.pl          |   23 ++++++++++++++------
 regexp.h                  |   35 ++++++++++++++++---------------
 regnodes.h                |   16 +++++++-------
 t/op/split.t              |   35 +++++++++++++++++++++++---------
 t/re/recompile.t          |    8 ++++++-
 16 files changed, 185 insertions(+), 93 deletions(-)

diff --git a/dist/B-Deparse/Deparse.pm b/dist/B-Deparse/Deparse.pm
index 1771f01..0241c14 100644
--- a/dist/B-Deparse/Deparse.pm
+++ b/dist/B-Deparse/Deparse.pm
@@ -20,7 +20,7 @@ use B qw(class main_root main_start main_cv svref_2object 
opnumber perlstring
          CVf_METHOD CVf_LVALUE
         PMf_KEEP PMf_GLOBAL PMf_CONTINUE PMf_EVAL PMf_ONCE
         PMf_MULTILINE PMf_SINGLELINE PMf_FOLD PMf_EXTENDED);
-$VERSION = '1.19';
+$VERSION = '1.20';
 use strict;
 use vars qw/$AUTOLOAD/;
 use warnings ();
@@ -4803,13 +4803,16 @@ sub pp_split {
 
     # handle special case of split(), and split(' ') that compiles to /\s+/
     # Under 5.10, the reflags may be undef if the split regexp isn't a constant
-    # Under 5.17.5+, the special flag is on split itself.
+    # Under 5.17.5-5.17.9, the special flag is on split itself.
     $kid = $op->first;
     if ( $op->flags & OPf_SPECIAL
-       or
-        $kid->flags & OPf_SPECIAL
-        and ( $] < 5.009 ? $kid->pmflags & PMf_SKIPWHITE()
-             : ($kid->reflags || 0) & RXf_SKIPWHITE() ) ) {
+         or (
+            $kid->flags & OPf_SPECIAL
+            and ( $] < 5.009 ? $kid->pmflags & PMf_SKIPWHITE()
+                             : ($kid->reflags || 0) & RXf_SKIPWHITE()
+            )
+         )
+    ) {
        $exprs[0] = "' '";
     }
 
diff --git a/dump.c b/dump.c
index dd0e305..eaf6674 100644
--- a/dump.c
+++ b/dump.c
@@ -672,6 +672,8 @@ S_pm_description(pTHX_ const PMOP *pm)
             if (RX_EXTFLAGS(regex) & RXf_CHECK_ALL)
                 sv_catpv(desc, ",ALL");
         }
+        if (RX_EXTFLAGS(regex) & RXf_SKIPWHITE)
+            sv_catpv(desc, ",SKIPWHITE");
     }
 
     append_flags(desc, pmflags, pmflags_flags_names);
@@ -1449,7 +1451,7 @@ const struct flag_to_name regexp_flags_names[] = {
     {RXf_ANCH_GPOS,       "ANCH_GPOS,"},
     {RXf_GPOS_SEEN,       "GPOS_SEEN,"},
     {RXf_GPOS_FLOAT,      "GPOS_FLOAT,"},
-    {RXf_LOOKBEHIND_SEEN, "LOOKBEHIND_SEEN,"},
+    {RXf_NO_INPLACE_SUBST, "NO_INPLACE_SUBST,"},
     {RXf_EVAL_SEEN,       "EVAL_SEEN,"},
     {RXf_CANY_SEEN,       "CANY_SEEN,"},
     {RXf_NOSCAN,          "NOSCAN,"},
@@ -1458,10 +1460,12 @@ const struct flag_to_name regexp_flags_names[] = {
     {RXf_USE_INTUIT_NOML, "USE_INTUIT_NOML,"},
     {RXf_USE_INTUIT_ML,   "USE_INTUIT_ML,"},
     {RXf_INTUIT_TAIL,     "INTUIT_TAIL,"},
+    {RXf_SPLIT,           "SPLIT,"},
     {RXf_COPY_DONE,       "COPY_DONE,"},
     {RXf_TAINTED_SEEN,    "TAINTED_SEEN,"},
     {RXf_TAINTED,         "TAINTED,"},
     {RXf_START_ONLY,      "START_ONLY,"},
+    {RXf_SKIPWHITE,       "SKIPWHITE,"},
     {RXf_WHITE,           "WHITE,"},
     {RXf_NULL,            "NULL,"},
 };
@@ -2091,7 +2095,17 @@ Perl_do_sv_dump(pTHX_ I32 level, PerlIO *file, SV *sv, 
I32 nest, I32 maxnest, bo
       dumpregexp:
        {
            struct regexp * const r = ReANY((REGEXP*)sv);
-           flags = RX_EXTFLAGS((REGEXP*)sv);
+            flags = r->compflags;
+            sv_setpv(d,"");
+            append_flags(d, flags, regexp_flags_names);
+            if (*(SvEND(d) - 1) == ',') {
+                SvCUR_set(d, SvCUR(d) - 1);
+                SvPVX(d)[SvCUR(d)] = '\0';
+            }
+            Perl_dump_indent(aTHX_ level, file, "  COMPFLAGS = 0x%"UVxf" 
(%s)\n",
+                                (UV)(r->compflags), SvPVX_const(d));
+
+            flags = r->extflags;
            sv_setpv(d,"");
            append_flags(d, flags, regexp_flags_names);
            if (*(SvEND(d) - 1) == ',') {
@@ -2099,7 +2113,8 @@ Perl_do_sv_dump(pTHX_ I32 level, PerlIO *file, SV *sv, 
I32 nest, I32 maxnest, bo
                SvPVX(d)[SvCUR(d)] = '\0';
            }
            Perl_dump_indent(aTHX_ level, file, "  EXTFLAGS = 0x%"UVxf" (%s)\n",
-                               (UV)flags, SvPVX_const(d));
+                                (UV)(r->extflags), SvPVX_const(d));
+
            Perl_dump_indent(aTHX_ level, file, "  INTFLAGS = 0x%"UVxf"\n",
                                (UV)(r->intflags));
            Perl_dump_indent(aTHX_ level, file, "  NPARENS = %"UVuf"\n",
diff --git a/ext/Devel-Peek/t/Peek.t b/ext/Devel-Peek/t/Peek.t
index 1debcb5..3de4600 100644
--- a/ext/Devel-Peek/t/Peek.t
+++ b/ext/Devel-Peek/t/Peek.t
@@ -339,6 +339,7 @@ do_test('reference to regexp',
     STASH = $ADDR\\t"Regexp"'
 . ($] < 5.013 ? '' :
 '
+    COMPFLAGS = 0x0 \(\)
     EXTFLAGS = 0x680000 \(CHECK_ALL,USE_INTUIT_NOML,USE_INTUIT_ML\)
     INTFLAGS = 0x0
     NPARENS = 0
@@ -943,6 +944,7 @@ do_test('UTF-8 in a regular expression',
     PV = $ADDR "\(\?\^u:\\\\\\\\x\{100\}\)" \[UTF8 
"\(\?\^u:\\\\\\\\x\{100\}\)"\]
     CUR = 13
     STASH = $ADDR      "Regexp"
+    COMPFLAGS = 0x0 \(\)
     EXTFLAGS = 0x680040 \(CHECK_ALL,USE_INTUIT_NOML,USE_INTUIT_ML\)
     INTFLAGS = 0x0
     NPARENS = 0
diff --git a/op.c b/op.c
index 2b83188..c502d3f 100644
--- a/op.c
+++ b/op.c
@@ -4592,6 +4592,9 @@ Perl_pmruntime(pTHX_ OP *o, OP *expr, bool isreg, I32 
floor)
        U32 rx_flags = pm->op_pmflags & RXf_PMf_COMPILETIME;
        regexp_engine const *eng = current_re_engine();
 
+        if (o->op_flags & OPf_SPECIAL)
+            rx_flags |= RXf_SPLIT;
+
        if (!has_code || !eng->op_comp) {
            /* compile-time simple constant pattern */
 
@@ -4668,6 +4671,9 @@ Perl_pmruntime(pTHX_ OP *o, OP *expr, bool isreg, I32 
floor)
            pm->op_pmflags |= PMf_CODELIST_PRIVATE;
        }
 
+        if (o->op_flags & OPf_SPECIAL)
+            pm->op_pmflags |= PMf_SPLIT;
+
        /* the OP_REGCMAYBE is a placeholder in the non-threaded case
         * to allow its op_next to be pointed past the regcomp and
         * preceding stacking ops;
@@ -9755,15 +9761,10 @@ Perl_ck_split(pTHX_ OP *o)
        cLISTOPo->op_last = kid; /* There was only one element previously */
     }
 
-    if (kid->op_type == OP_CONST && !(kid->op_private & OPpCONST_FOLDED)) {
-       SV * const sv = kSVOP->op_sv;
-       if (SvPOK(sv) && SvCUR(sv) == 1 && *SvPVX(sv) == ' ')
-           o->op_flags |= OPf_SPECIAL;
-    }
     if (kid->op_type != OP_MATCH || kid->op_flags & OPf_STACKED) {
        OP * const sibl = kid->op_sibling;
        kid->op_sibling = 0;
-       kid = pmruntime( newPMOP(OP_MATCH, 0), kid, 0, 0);
+        kid = pmruntime( newPMOP(OP_MATCH, OPf_SPECIAL), kid, 0, 0); /* 
OPf_SPECIAL is used to trigger split " " behavior */
        if (cLISTOPo->op_first == cLISTOPo->op_last)
            cLISTOPo->op_last = kid;
        cLISTOPo->op_first = kid;
diff --git a/op.h b/op.h
index 88703da..8b87a9c 100644
--- a/op.h
+++ b/op.h
@@ -115,7 +115,7 @@ Deprecated.  Use C<GIMME_V> instead.
                                /*  On OP_ENTERSUB || OP_NULL, saw a "do". */
                                /*  On OP_EXISTS, treat av as av, not avhv.  */
                                /*  On OP_(ENTER|LEAVE)EVAL, don't clear $@ */
-                               /*  On OP_SPLIT, special split " " */
+                                /*  On pushre, rx is used as part of split, 
e.g. split " " */
                                /*  On regcomp, "use re 'eval'" was in scope */
                                /*  On OP_READLINE, was <$filehandle> */
                                /*  On RV2[ACGHS]V, don't create GV--in
diff --git a/op_reg_common.h b/op_reg_common.h
index eed483b..9dcdaae 100644
--- a/op_reg_common.h
+++ b/op_reg_common.h
@@ -27,6 +27,7 @@
  * RXf_PMf_STD_PMMOD_SHIFT, followed by the p.  See STD_PAT_MODS and
  * INT_PAT_MODS in regexp.h for the reason contiguity is needed */
 /* Make sure to update lib/re.pm when changing these! */
+/* Make sure you keep the pure PMf_ versions below in sync */
 #define RXf_PMf_MULTILINE      (1 << (RXf_PMf_STD_PMMOD_SHIFT+0))    /* /m */
 #define RXf_PMf_SINGLELINE     (1 << (RXf_PMf_STD_PMMOD_SHIFT+1))    /* /s */
 #define RXf_PMf_FOLD           (1 << (RXf_PMf_STD_PMMOD_SHIFT+2))    /* /i */
@@ -79,13 +80,23 @@ get_regex_charset(const U32 flags)
     return (regex_charset) ((flags & RXf_PMf_CHARSET) >> 
_RXf_PMf_CHARSET_SHIFT);
 }
 
+#define _RXf_PMf_SHIFT_COMPILETIME (RXf_PMf_STD_PMMOD_SHIFT+8)
+
+/*
+  Set in Perl_pmruntime if op_flags & OPf_SPECIAL, i.e. split. Will
+  be used by regex engines to check whether they should set
+  RXf_SKIPWHITE
+*/
+#define RXf_PMf_SPLIT (1<<(RXf_PMf_STD_PMMOD_SHIFT+8))
+
 /* Next available bit after the above.  Name begins with '_' so won't be
  * exported by B */
-#define _RXf_PMf_SHIFT_NEXT (RXf_PMf_STD_PMMOD_SHIFT+8)
+#define _RXf_PMf_SHIFT_NEXT (RXf_PMf_STD_PMMOD_SHIFT+9)
 
 /* Mask of the above bits.  These need to be transferred from op_pmflags to
  * re->extflags during compilation */
-#define RXf_PMf_COMPILETIME    
(RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_CHARSET|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_KEEPCOPY)
+#define RXf_PMf_COMPILETIME    
(RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_KEEPCOPY|RXf_PMf_CHARSET)
+#define RXf_PMf_FLAGCOPYMASK   
(RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_KEEPCOPY|RXf_PMf_CHARSET|RXf_PMf_SPLIT)
 
 #if RXf_PMf_COMPILETIME > 255
 #  error RXf_PMf_COMPILETIME wont fit in U8 flags field of eval node
@@ -97,18 +108,18 @@ get_regex_charset(const U32 flags)
 #define PMf_FOLD         1<<2
 #define PMf_EXTENDED     1<<3
 #define PMf_KEEPCOPY     1<<4
+#define PMf_CHARSET      7<<5
+#define PMf_SPLIT        1<<8
 
-#if PMf_MULTILINE != RXf_PMf_MULTILINE || PMf_SINGLELINE != RXf_PMf_SINGLELINE 
|| PMf_FOLD != RXf_PMf_FOLD || PMf_EXTENDED != RXf_PMf_EXTENDED || PMf_KEEPCOPY 
!= RXf_PMf_KEEPCOPY
+#if PMf_MULTILINE != RXf_PMf_MULTILINE || PMf_SINGLELINE != RXf_PMf_SINGLELINE 
|| PMf_FOLD != RXf_PMf_FOLD || PMf_EXTENDED != RXf_PMf_EXTENDED || PMf_KEEPCOPY 
!= RXf_PMf_KEEPCOPY || PMf_SPLIT != RXf_ ... [43 chars truncated]
 #   error RXf_PMf defines are wrong
 #endif
 
-#define PMf_COMPILETIME RXf_PMf_COMPILETIME
-
 /*  Error check that haven't left something out of this.  This isn't done
  *  directly in the #define because doing so confuses regcomp.pl.
  *  (2**n - 1) is n 1 bits, so the below gets the contiguous bits between the
  *  beginning and ending shifts */
-#if RXf_PMf_COMPILETIME != (((1 << (_RXf_PMf_SHIFT_NEXT))-1) \
+#if RXf_PMf_COMPILETIME != (((1 << (_RXf_PMf_SHIFT_COMPILETIME))-1) \
                             & (~((1 << RXf_PMf_STD_PMMOD_SHIFT)-1)))
 #   error RXf_PMf_COMPILETIME is invalid
 #endif
diff --git a/pod/perlreapi.pod b/pod/perlreapi.pod
index f1b6fdcf..eaaa179 100644
--- a/pod/perlreapi.pod
+++ b/pod/perlreapi.pod
@@ -223,11 +223,13 @@ Perl's engine sets this flag on empty patterns, this 
optimization
 makes C<split //> much faster than it would otherwise be.  It's even
 faster than C<unpack>.
 
-=item RXf_MODIFIES_VARS
+=item RXf_NO_INPLACE_SUBST
 
 Added in perl 5.18.0, this flag indicates that a regular expression might
-assign to non-magical variables (such as $REGMARK and $REGERROR) during
-matching.  C<s///> will skip certain optimisations when this is set.
+perform an operation that would interfere with inplace substituion. For
+instance it might contain lookbehind, or assign to non-magical variables
+(such as $REGMARK and $REGERROR) during matching.  C<s///> will skip
+certain optimisations when this is set.
 
 =back
 
diff --git a/pp.c b/pp.c
index 14ba91d..6da9970 100644
--- a/pp.c
+++ b/pp.c
@@ -5313,7 +5313,6 @@ PP(pp_split)
     STRLEN len;
     const char *s = SvPV_const(sv, len);
     const bool do_utf8 = DO_UTF8(sv);
-    const bool skipwhite = PL_op->op_flags & OPf_SPECIAL;
     const char *strend = s + len;
     PMOP *pm;
     REGEXP *rx;
@@ -5346,7 +5345,7 @@ PP(pp_split)
     rx = PM_GETRE(pm);
 
     TAINT_IF(get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET &&
-            (RX_EXTFLAGS(rx) & RXf_WHITE || skipwhite));
+             (RX_EXTFLAGS(rx) & (RXf_WHITE | RXf_SKIPWHITE)));
 
     RX_MATCH_UTF8_set(rx, do_utf8);
 
@@ -5386,7 +5385,7 @@ PP(pp_split)
     }
     base = SP - PL_stack_base;
     orig = s;
-    if (skipwhite) {
+    if (RX_EXTFLAGS(rx) & RXf_SKIPWHITE) {
        if (do_utf8) {
            while (isSPACE_utf8(s))
                s += UTF8SKIP(s);
@@ -5408,7 +5407,7 @@ PP(pp_split)
 
     if (!limit)
        limit = maxiters + 2;
-    if (RX_EXTFLAGS(rx) & RXf_WHITE || skipwhite) {
+    if (RX_EXTFLAGS(rx) & RXf_WHITE) {
        while (--limit) {
            m = s;
            /* this one uses 'm' and is a negative test */
diff --git a/pp_ctl.c b/pp_ctl.c
index c9d833f..f518bc2 100644
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -83,7 +83,7 @@ PP(pp_regcomp)
     REGEXP *re = NULL;
     REGEXP *new_re;
     const regexp_engine *eng;
-    bool is_bare_re;
+    bool is_bare_re= FALSE;
 
     if (PL_op->op_flags & OPf_STACKED) {
        dMARK;
@@ -107,14 +107,27 @@ PP(pp_regcomp)
     assert (re != (REGEXP*) &PL_sv_undef);
     eng = re ? RX_ENGINE(re) : current_re_engine();
 
+    /*
+     In the below logic: these are basically the same - check if this regcomp 
is part of a split.
+
+    (PL_op->op_pmflags & PMf_split )
+    (PL_op->op_next->op_type == OP_PUSHRE)
+
+    We could add a new mask for this and copy the PMf_split, if we did
+    some bit definition fiddling first.
+
+    For now we leave this
+    */
+
     new_re = (eng->op_comp
                    ? eng->op_comp
                    : &Perl_re_op_compile
            )(aTHX_ args, nargs, pm->op_code_list, eng, re,
                &is_bare_re,
-               (pm->op_pmflags & RXf_PMf_COMPILETIME),
+                (pm->op_pmflags & RXf_PMf_FLAGCOPYMASK),
                pm->op_pmflags |
                    (PL_op->op_flags & OPf_SPECIAL ? PMf_USE_RE_EVAL : 0));
+
     if (pm->op_pmflags & PMf_HAS_CV)
        ReANY(new_re)->qr_anoncv
                        = (CV*) SvREFCNT_inc(PAD_SV(PL_op->op_targ));
@@ -145,11 +158,13 @@ PP(pp_regcomp)
        ReREFCNT_dec(new_re);
        new_re = tmp;
     }
+
     if (re != new_re) {
        ReREFCNT_dec(re);
        PM_SETRE(pm, new_re);
     }
 
+
 #ifndef INCOMPLETE_TAINTS
     if (TAINTING_get && TAINT_get) {
        SvTAINTED_on((SV*)new_re);
diff --git a/pp_hot.c b/pp_hot.c
index 8871593..7eb0c61 100644
--- a/pp_hot.c
+++ b/pp_hot.c
@@ -2308,7 +2308,7 @@ PP(pp_subst)
 #endif
         && (I32)clen <= RX_MINLENRET(rx)
         && (once || !(r_flags & REXEC_COPY_STR))
-       && !(RX_EXTFLAGS(rx) & (RXf_LOOKBEHIND_SEEN|RXf_MODIFIES_VARS))
+        && !(RX_EXTFLAGS(rx) & RXf_NO_INPLACE_SUBST)
        && (!doutf8 || SvUTF8(TARG))
        && !(rpm->op_pmflags & PMf_NONDESTRUCT))
     {
diff --git a/regcomp.c b/regcomp.c
index 6686d8b..34a4e9f 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -5338,6 +5338,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int 
pat_count,
 
        SV **svp;
 
+        DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+            "Compiling List of SVs %d elements%s\n",pat_count, orig_rx_flags & 
RXf_SPLIT ? " for split" : ""));
        /* apply magic and RE overloading to each arg */
        for (svp = patternp; svp < patternp + pat_count; svp++) {
            SV *rx = *svp;
@@ -5506,6 +5508,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int 
pat_count,
                    *is_bare_re = TRUE;
                SvREFCNT_inc(re);
                Safefree(pRExC_state->code_blocks);
+                DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+                    "Precompiled pattern%s\n", orig_rx_flags & RXf_SPLIT ? " 
for split" : ""));
+
                return (REGEXP*)re;
            }
        }
@@ -5518,6 +5523,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int 
pat_count,
            bool is_code = 0;
            OP *o;
 
+            DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+                "Compiling OP_LIST%s\n", orig_rx_flags & RXf_SPLIT ? " for 
split" : ""));
+
            pat = newSVpvn("", 0);
            SAVEFREESV(pat);
            if (code_is_utf8)
@@ -5548,6 +5556,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int 
pat_count,
        }
        else {
            assert(expr->op_type == OP_CONST);
+            DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+                "Compiling OP_CONST%s\n", orig_rx_flags & RXf_SPLIT ? " for 
split" : ""));
            pat = cSVOPx_sv(expr);
        }
     }
@@ -5641,13 +5651,19 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int 
pat_count,
     }
 
     /* return old regex if pattern hasn't changed */
+    /* XXX: note in the below we have to check the flags as well as the 
pattern.
+     *
+     * Things get a touch tricky as we have to compare the utf8 flag 
independently
+     * from the compile flags.
+     */
 
     if (   old_re
         && !recompile
-       && !!RX_UTF8(old_re) == !!RExC_utf8
+        && !!RX_UTF8(old_re) == !!RExC_utf8
+        && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
        && RX_PRECOMP(old_re)
        && RX_PRELEN(old_re) == plen
-       && memEQ(RX_PRECOMP(old_re), exp, plen))
+        && memEQ(RX_PRECOMP(old_re), exp, plen))
     {
        /* with runtime code, always recompile */
        runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
@@ -5799,6 +5815,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int 
pat_count,
     RXi_SET( r, ri );
     r->engine= eng;
     r->extflags = rx_flags;
+    RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
+
     if (pm_flags & PMf_IS_QR) {
        ri->code_blocks = pRExC_state->code_blocks;
        ri->num_code_blocks = pRExC_state->num_code_blocks;
@@ -6302,7 +6320,7 @@ reStudy:
     if (RExC_seen & REG_SEEN_GPOS)
        r->extflags |= RXf_GPOS_SEEN;
     if (RExC_seen & REG_SEEN_LOOKBEHIND)
-       r->extflags |= RXf_LOOKBEHIND_SEEN;
+        r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the 
lookbehind */
     if (pRExC_state->num_code_blocks)
        r->extflags |= RXf_EVAL_SEEN;
     if (RExC_seen & REG_SEEN_CANY)
@@ -6310,7 +6328,7 @@ reStudy:
     if (RExC_seen & REG_SEEN_VERBARG)
     {
        r->intflags |= PREGf_VERBARG_SEEN;
-       r->extflags |= RXf_MODIFIES_VARS;
+        r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
     }
     if (RExC_seen & REG_SEEN_CUTGROUP)
        r->intflags |= PREGf_CUTGROUP_SEEN;
@@ -6321,27 +6339,22 @@ reStudy:
     else
         RXp_PAREN_NAMES(r) = NULL;
 
-#ifdef STUPID_PATTERN_CHECKS            
-    if (RX_PRELEN(rx) == 0)
-        r->extflags |= RXf_NULL;
-    if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
-        r->extflags |= RXf_WHITE;
-    else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
-        r->extflags |= RXf_START_ONLY;
-#else
     {
         regnode *first = ri->program + 1;
         U8 fop = OP(first);
+        regnode *next = NEXTOPER(first);
+        U8 nop = OP(next);
 
-        if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
+        if (PL_regkind[fop] == NOTHING && nop == END)
             r->extflags |= RXf_NULL;
-        else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
+        else if (PL_regkind[fop] == BOL && nop == END)
             r->extflags |= RXf_START_ONLY;
-        else if (fop == PLUS && PL_regkind[OP(NEXTOPER(first))] == POSIXD && 
FLAGS(NEXTOPER(first)) == _CC_SPACE
-                            && OP(regnext(first)) == END)
-            r->extflags |= RXf_WHITE;    
+        else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == 
_CC_SPACE && OP(regnext(first)) == END)
+            r->extflags |= RXf_WHITE;
+        else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 
1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END )
+            r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
+
     }
-#endif
 #ifdef DEBUGGING
     if (RExC_paren_names) {
         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
diff --git a/regen/regcomp.pl b/regen/regcomp.pl
index 5e1abfb..97719b0 100644
--- a/regen/regcomp.pl
+++ b/regen/regcomp.pl
@@ -266,7 +266,8 @@ foreach my $file ("op_reg_common.h", "regexp.h") {
         if (s/ \# \s* define \s+ ( _? RXf_ \w+ ) \s+ //xi) {
             chomp;
             my $define = $1;
-            s: / \s* \* .*? \* \s* / : :x;    # Replace comments by a blank
+            my $orig= $_;
+            s{ /\* .*? \*/ }{ }x;    # Replace comments by a blank
 
             # Replace any prior defined symbols by their values
             foreach my $key (keys %definitions) {
@@ -282,7 +283,12 @@ foreach my $file ("op_reg_common.h", "regexp.h") {
 
             next unless $_ =~ /<</; # Bit defines use left shift
             if($val & $newval) {
-                die sprintf "Both $define and $reverse{$newval} use %08X", 
$newval;
+                my @names=($define, $reverse{$newval});
+                s/PMf_// for @names;
+                if ($names[0] ne $names[1]) {
+                    die sprintf "ERROR: both $define and $reverse{$newval} use 
0x%08X (%s:%s)", $newval, $orig, $_;
+                }
+                next;
             }
             $val|=$newval;
             $rxfv{$define}= $newval;
@@ -292,9 +298,11 @@ foreach my $file ("op_reg_common.h", "regexp.h") {
 }
 my %vrxf=reverse %rxfv;
 printf $out "\t/* Bits in extflags defined: %s */\n", unpack 'B*', pack 'N', 
$val;
+my %multibits;
 for (0..31) {
     my $power_of_2 = 2**$_;
     my $n=$vrxf{$power_of_2};
+    my $extra = "";
     if (! $n) {
 
         # Here, there was no name that matched exactly the bit.  It could be
@@ -309,16 +317,17 @@ for (0..31) {
             # that name, and all the bits it matches
             foreach my $name (keys %rxfv) {
                 if ($rxfv{$name} & $power_of_2) {
-                    $n = $name;
-                    $power_of_2 = $rxfv{$name};
+                    $n = $name . ( $multibits{$name}++ );
+                    $extra= sprintf qq{ : "%s" - 0x%08x}, $name, $rxfv{$name}
+                        if $power_of_2 != $rxfv{$name};
                     last;
                 }
             }
         }
     }
-    $n=~s/^RXf_(PMf_)?//;
-    printf $out qq(\t%-20s/* 0x%08x */\n), 
-        qq("$n",),$power_of_2;
+    s/\bRXf_(PMf_)?// for $n, $extra;
+    printf $out qq(\t%-20s/* 0x%08x%s */\n),
+        qq("$n",),$power_of_2, $extra;
 }  
  
 print $out <<EOP;
diff --git a/regexp.h b/regexp.h
index 51630e4..6b16d14 100644
--- a/regexp.h
+++ b/regexp.h
@@ -130,6 +130,9 @@ struct reg_code_block {
        /* Information about the match that isn't often used */         \
        /* offset from wrapped to the start of precomp */               \
        PERL_BITFIELD32 pre_prefix:4;                                   \
+        /* original flags used to compile the pattern, may differ */    \
+        /* from extflags in various ways */                             \
+        PERL_BITFIELD32 compflags:9;                                    \
        CV *qr_anoncv   /* the anon sub wrapped round qr/(?{..})/ */
 
 typedef struct regexp {
@@ -333,7 +336,17 @@ and check for NULL.
 
 /* Leave some space, so future bit allocations can go either in the shared or
  * unshared area without affecting binary compatibility */
-#define RXf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT+1)
+#define RXf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT)
+
+/*
+  Set in Perl_pmruntime if op_flags & OPf_SPECIAL, i.e. split. Will
+  be used by regex engines to check whether they should set
+  RXf_SKIPWHITE
+*/
+#define RXf_SPLIT                (1<<(RXf_BASE_SHIFT-1))
+#if RXf_SPLIT != RXf_PMf_SPLIT
+#   error "RXf_SPLIT does not match RXf_PMf_SPLIT"
+#endif
 
 /* Manually decorate this function with gcc-style attributes just to
  * avoid having to restructure the header files and their called order,
@@ -366,19 +379,6 @@ get_regex_charset_name(const U32 flags, STRLEN* const lenp)
     }
 }
 
-/*
-  Two flags no longer used.
-  RXf_SPLIT used to be set in Perl_pmruntime if op_flags & OPf_SPECIAL,
-  i.e., split.  It was used by the regex engine to check whether it should
-  set RXf_SKIPWHITE.  Regexp plugins on CPAN also have done the same thing
-  historically, so we leave these flags defined.
-*/
-#ifndef PERL_CORE
-# define RXf_SPLIT             0
-# define RXf_SKIPWHITE         0
-#endif
-
-
 /* Anchor and GPOS related stuff */
 #define RXf_ANCH_BOL           (1<<(RXf_BASE_SHIFT+0))
 #define RXf_ANCH_MBOL          (1<<(RXf_BASE_SHIFT+1))
@@ -392,7 +392,7 @@ get_regex_charset_name(const U32 flags, STRLEN* const lenp)
 #define RXf_ANCH_SINGLE         (RXf_ANCH_SBOL|RXf_ANCH_GPOS)
 
 /* What we have seen */
-#define RXf_LOOKBEHIND_SEEN    (1<<(RXf_BASE_SHIFT+6))
+#define RXf_NO_INPLACE_SUBST    (1<<(RXf_BASE_SHIFT+6))
 #define RXf_EVAL_SEEN          (1<<(RXf_BASE_SHIFT+7))
 #define RXf_CANY_SEEN          (1<<(RXf_BASE_SHIFT+8))
 
@@ -409,8 +409,6 @@ get_regex_charset_name(const U32 flags, STRLEN* const lenp)
 #define RXf_INTUIT_TAIL        (1<<(RXf_BASE_SHIFT+14))
 #define RXf_USE_INTUIT         (RXf_USE_INTUIT_NOML|RXf_USE_INTUIT_ML)
 
-#define RXf_MODIFIES_VARS      (1<<(RXf_BASE_SHIFT+15))
-
 /* Copy and tainted info */
 #define RXf_COPY_DONE          (1<<(RXf_BASE_SHIFT+16))
 
@@ -422,6 +420,7 @@ get_regex_charset_name(const U32 flags, STRLEN* const lenp)
 
 /* Flags indicating special patterns */
 #define RXf_START_ONLY         (1<<(RXf_BASE_SHIFT+19)) /* Pattern is /^/ */
+#define RXf_SKIPWHITE                (1<<(RXf_BASE_SHIFT+20)) /* Pattern is 
for a split " " */
 #define RXf_WHITE              (1<<(RXf_BASE_SHIFT+21)) /* Pattern is /\s+/ */
 #define RXf_NULL               (1U<<(RXf_BASE_SHIFT+22)) /* Pattern is // */
 #if RXf_BASE_SHIFT+22 > 31
@@ -468,6 +467,7 @@ get_regex_charset_name(const U32 flags, STRLEN* const lenp)
                                         : RX_MATCH_COPIED_off(prog))
 
 #define RXp_EXTFLAGS(rx)       ((rx)->extflags)
+#define RXp_COMPFLAGS(rx)        ((rx)->compflags)
 
 /* For source compatibility. We used to store these explicitly.  */
 #define RX_PRECOMP(prog)       (RX_WRAPPED(prog) + ReANY(prog)->pre_prefix)
@@ -482,6 +482,7 @@ get_regex_charset_name(const U32 flags, STRLEN* const lenp)
 #define RX_CHECK_SUBSTR(prog)  (ReANY(prog)->check_substr)
 #define RX_REFCNT(prog)                SvREFCNT(prog)
 #define RX_EXTFLAGS(prog)      RXp_EXTFLAGS(ReANY(prog))
+#define RX_COMPFLAGS(prog)        RXp_COMPFLAGS(ReANY(prog))
 #define RX_ENGINE(prog)                (ReANY(prog)->engine)
 #define RX_SUBBEG(prog)                (ReANY(prog)->subbeg)
 #define RX_SUBOFFSET(prog)     (ReANY(prog)->suboffset)
diff --git a/regnodes.h b/regnodes.h
index 433460d..0caf86d 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -650,23 +650,23 @@ EXTCONST char * const PL_reg_name[] = {
 EXTCONST char * PL_reg_extflags_name[];
 #else
 EXTCONST char * const PL_reg_extflags_name[] = {
-       /* Bits in extflags defined: 11011111111111111111111011111111 */
+       /* Bits in extflags defined: 11111110111111111111111111111111 */
        "MULTILINE",        /* 0x00000001 */
        "SINGLELINE",       /* 0x00000002 */
        "FOLD",             /* 0x00000004 */
        "EXTENDED",         /* 0x00000008 */
        "KEEPCOPY",         /* 0x00000010 */
-       "CHARSET",          /* 0x000000e0 */
-       "CHARSET",          /* 0x000000e0 */
-       "CHARSET",          /* 0x000000e0 */
-       "UNUSED_BIT_8",     /* 0x00000100 */
+       "CHARSET0",         /* 0x00000020 : "CHARSET" - 0x000000e0 */
+       "CHARSET1",         /* 0x00000040 : "CHARSET" - 0x000000e0 */
+       "CHARSET2",         /* 0x00000080 : "CHARSET" - 0x000000e0 */
+       "SPLIT",            /* 0x00000100 */
        "ANCH_BOL",         /* 0x00000200 */
        "ANCH_MBOL",        /* 0x00000400 */
        "ANCH_SBOL",        /* 0x00000800 */
        "ANCH_GPOS",        /* 0x00001000 */
        "GPOS_SEEN",        /* 0x00002000 */
        "GPOS_FLOAT",       /* 0x00004000 */
-       "LOOKBEHIND_SEEN",  /* 0x00008000 */
+       "NO_INPLACE_SUBST", /* 0x00008000 */
        "EVAL_SEEN",        /* 0x00010000 */
        "CANY_SEEN",        /* 0x00020000 */
        "NOSCAN",           /* 0x00040000 */
@@ -675,12 +675,12 @@ EXTCONST char * const PL_reg_extflags_name[] = {
        "USE_INTUIT_NOML",  /* 0x00200000 */
        "USE_INTUIT_ML",    /* 0x00400000 */
        "INTUIT_TAIL",      /* 0x00800000 */
-       "MODIFIES_VARS",    /* 0x01000000 */
+       "UNUSED_BIT_24",    /* 0x01000000 */
        "COPY_DONE",        /* 0x02000000 */
        "TAINTED_SEEN",     /* 0x04000000 */
        "TAINTED",          /* 0x08000000 */
        "START_ONLY",       /* 0x10000000 */
-       "UNUSED_BIT_29",    /* 0x20000000 */
+       "SKIPWHITE",        /* 0x20000000 */
        "WHITE",            /* 0x40000000 */
        "NULL",             /* 0x80000000 */
 };
diff --git a/t/op/split.t b/t/op/split.t
index c198737..5e01159 100644
--- a/t/op/split.t
+++ b/t/op/split.t
@@ -6,7 +6,7 @@ BEGIN {
     require './test.pl';
 }
 
-plan tests => 115;
+plan tests => 118;
 
 $FS = ':';
 
@@ -418,16 +418,22 @@ is($cnt, scalar(@ary));
     is "@PATH", "Font GlyphNames", "hybrid scalar-and-array context";
 }
 
-# [perl #94490] constant folding should not invoke special split " "
-# behaviour.
-@_=split(0||" ","foo  bar");
-is @_, 3, 'split(0||" ") is not treated like split(" ")'; #'
-
 {
     my @results;
-    my $expr;
-    $expr = ' a b c ';
+    my $expr= "foo  bar";
+    my $cond;
+
+    @results= split(0||" ", $expr);
+    is @results, 2, 'split(0||" ") is treated like split(" ")'; #'
+
+    $cond= 0;
+    @results= split $cond ? " " : qr/ /, $expr;
+    is @results, 3, 'split($cond ? " " : qr/ /, $expr) works as expected (like 
qr/ /)';
+    $cond= 1;
+    @results= split $cond ? " " : qr/ /, $expr;
+    is @results, 2, 'split($cond ? " " : qr/ /, $expr) works as expected (like 
" ")';
 
+    $expr = ' a b c ';
     @results = split /\s/, $expr;
     is @results, 4,
         "split on regex of single space metacharacter: captured 4 elements";
@@ -452,10 +458,19 @@ is @_, 3, 'split(0||" ") is not treated like split(" ")'; 
#'
         "split on string of single whitespace: captured 3 elements";
     is $results[0], 'a',
         "split on string of single whitespace: first element is non-empty; 
multiple contiguous space characters";
+
+    my @seq;
+    for my $cond (0,1,0,1,0) {
+        $expr = "  foo  ";
+        @results = split $cond ? qr/ / : " ", $expr;
+        push @seq, scalar(@results) . ":" . $results[-1];
+    }
+    is join(" ", @seq), "1:foo 3:foo 1:foo 3:foo 1:foo",
+        qq{split(\$cond ? qr/ / : " ", "$exp") behaves as expected over 
repeated similar patterns};
 }
 
-TODO: {
-    local $::TODO = 'RT #116086: split "\x20" does not work as documented';
+{
+    # 'RT #116086: split "\x20" does not work as documented';
     my @results;
     my $expr;
     $expr = ' a b c ';
diff --git a/t/re/recompile.t b/t/re/recompile.t
index ad00df8..63a7068 100644
--- a/t/re/recompile.t
+++ b/t/re/recompile.t
@@ -22,7 +22,7 @@ BEGIN {
 }
 
 
-plan tests => 36;
+plan tests => 38;
 
 my $results = runperl(
                        switches => [ '-Dr' ],
@@ -187,3 +187,9 @@ my $y = '(?{1})';
 BEGIN { $^H |= 0x00200000 } # lightweight "use re 'eval'"
 "a" =~ qr/a$x$_/ for $y, $y, $y;
 CODE
+
+comp_n(6, <<'CODE', 'embedded code qr');
+my $x = qr/a/i;
+my $y = qr/a/;
+"a" =~ qr/a$_/ for $x, $y, $x, $y;
+CODE

--
Perl5 Master Repository

Reply via email to