In perl.git, the branch blead has been updated

<https://perl5.git.perl.org/perl.git/commitdiff/2813d4adc971fbaa124b5322d4bccaa73e9df8e2?hp=9c13cd3cdfa6ab6920882a355869287a277989c3>

- Log -----------------------------------------------------------------
commit 2813d4adc971fbaa124b5322d4bccaa73e9df8e2
Author: Karl Williamson <[email protected]>
Date:   Mon Jan 29 20:47:56 2018 -0700

    Add ANYOFM regnode
    
    This is a specialized ANYOF node for use when the code points in it
    have characteristics that allow them to be matched with a mask instead
    of a bit map.  When this happens, the speed up is pretty spectacular:
    
    Key:
        Ir   Instruction read
        Dr   Data read
        Dw   Data write
        COND conditional branches
        IND  indirect branches
    
    The numbers represent raw counts per loop iteration.
    
    Results of ('b' x 10000) . 'a' =~ /[Aa]/
    
              blead    mask Ratio %
           -------- ------- -------
        Ir 153132.0 25636.0   597.3
        Dr  40909.0  2155.0  1898.3
        Dw  20593.0   593.0  3472.7
      COND  20529.0  3028.0   678.0
       IND     22.0    22.0   100.0
    
    See the comments in regcomp.c or
    http://nntp.perl.org/group/perl.perl5.porters/249001 for a description
    of the cases that this new technique can handle.  But several common
    ones include the C0 controls (on ASCII platforms), [01], [0-7], [Aa] and
    any other ASCII case pair.
    
    The set of ASCII characters also could be done with this node instead of
    having the special ASCII regnode, reducing code size and complexity.
    I haven't investigated the speed loss of doing so.
    
    A NANYOFM node could be created for matching the complement of what
    this one matches.
    
    A pattern like /A/i is not affected by this commit, but the regex
    optimizer could be changed to take advantage of this commit.  What would
    need to be done is for it to look at the first byte of an EXACTFish node
    and if it's one of the case pairs this handles, to generate a synthetic
    start class for it.  This would automatically invoke the sped up code.

commit 67a1b5f935fc7a39d75e1cafb06a0cea10871612
Author: Karl Williamson <[email protected]>
Date:   Mon Jan 22 13:55:03 2018 -0700

    regcomp.sym: Add ANYOFM regnode
    
    This uses a mask instead of a bitmap, and is restricted to representing
    invariant characters under UTF-8 that meet particular bit patterns.

commit 2b7ee0568e8c163f9205a7bcb178d69ef88571ce
Author: Karl Williamson <[email protected]>
Date:   Thu Jan 25 13:35:09 2018 -0700

    regcomp.c: White-space only
    
    Indent code that the previous commit created a block around

commit 06a83acd6aa23a9d19f925cc3232ff18fe7deee2
Author: Karl Williamson <[email protected]>
Date:   Thu Jan 25 13:26:16 2018 -0700

    regcomp.c: Allow a fcn param to be NULL
    
    In which case handling is skipped.  This is in preparation for a future
    commit which will use this function in a slightly different manner

commit 070e8b2ef4f827a7e0d3199f7b37883a09545802
Author: Karl Williamson <[email protected]>
Date:   Fri Dec 29 15:45:38 2017 -0700

    regexec.c: Use word-at-a-time to repeat /i single byte pattern
    
    For most of the case folding pairs, like [Aa], it is possible to use a
    mask to match them word-at-a-time in regrepeat(), so that long sequences
    of them are handled with significantly better performance.

commit ab1efbdc1f74b2f4db076efa0b4d54f387d74efe
Author: Karl Williamson <[email protected]>
Date:   Fri Dec 29 15:17:41 2017 -0700

    regexec.c: Use word-at-a-time to repeat a single byte pattern
    
    There is special code in the function regrepeat() to handle instances
    where the pattern to repeat is a single byte.  These all can be done
    word-at-a-time to significantly increase the performance of long
    repeats.

commit 6a40c2e4e2dc26eb6ad39caf87cebef0743b90e7
Author: Karl Williamson <[email protected]>
Date:   Tue Dec 26 18:25:26 2017 -0700

    regexec.c: Replace loop by memchr()
    
    This can be called on a potentially long string.

commit 56dd984bdb8056d778b964ab6a46cb7dfaef915c
Author: Karl Williamson <[email protected]>
Date:   Mon Jan 29 20:33:14 2018 -0700

    Compile variant_byte_number() for EBCDIC
    
    Future commits will use this without regard to platform.

commit 597ee3f45b478da1456092f63d3ac698ee812786
Author: Karl Williamson <[email protected]>
Date:   Mon Jan 29 20:07:51 2018 -0700

    Use different scheme to handle MSVC6
    
    Recent commit 0b08cab0fc46a5f381ca18a451f55cf12c81d966 caused a function
    to not be compiled when running on MSVC6, and hence its callers needed
    to use an alternative mechanism there.  This is easy enough, it turns
    out, but it also turns out that there are more opportunities to call
    this function.  Rather than having each caller have to know about the
    MSVC6 problem, this current commit reimplements the function on that
    platform to use a slow, dumb method, so knowing about the issue is
    confined to just this one function.

-----------------------------------------------------------------------

Summary of changes:
 embed.fnc           |  11 +-
 embed.h             |   8 +-
 inline.h            |  28 +++--
 pod/perldebguts.pod |   2 +
 proto.h             |  34 ++++--
 regcomp.c           | 203 +++++++++++++++++++++++++++++----
 regcomp.sym         |   1 +
 regexec.c           | 251 ++++++++++++++++++++++++++++++++++++----
 regnodes.h          | 321 ++++++++++++++++++++++++++--------------------------
 t/re/anyof.t        |   2 +-
 10 files changed, 636 insertions(+), 225 deletions(-)

diff --git a/embed.fnc b/embed.fnc
index 35202e8d7c..02546ffb3f 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -806,9 +806,7 @@ AndmoR      |bool   |is_utf8_invariant_string|NN const U8* 
const s              \
 AnidR  |bool   |is_utf8_invariant_string_loc|NN const U8* const s          \
                |STRLEN len                                                 \
                |NULLOK const U8 ** ep
-#if ! defined(EBCDIC) && ! defined USING_MSVC6
 AniR   |unsigned int|_variant_byte_number|PERL_UINTMAX_T word
-#endif
 #if defined(PERL_CORE) || defined(PERL_EXT)
 EinR   |Size_t |variant_under_utf8_count|NN const U8* const s              \
                |NN const U8* const e
@@ -2459,6 +2457,7 @@ Es        |SSize_t|study_chunk    |NN RExC_state_t 
*pRExC_state \
                                 |I32 stopparen|U32 recursed_depth \
                                |NULLOK regnode_ssc *and_withp \
                                |U32 flags|U32 depth
+EsR    |SV *   |get_ANYOFM_contents|NN const regnode * n
 EsRn   |U32    |add_data       |NN RExC_state_t* const pRExC_state \
                                |NN const char* const s|const U32 n
 rs     |void   |re_croak2      |bool utf8|NN const char* pat1|NN const char* 
pat2|...
@@ -2491,7 +2490,7 @@ Es        |const regnode*|dumpuntil|NN const regexp *r|NN 
const regnode *start \
                                |NN SV* sv|I32 indent|U32 depth
 Es     |void   |put_code_point |NN SV* sv|UV c
 Es     |bool   |put_charclass_bitmap_innards|NN SV* sv             \
-                               |NN char* bitmap                    \
+                               |NULLOK char* bitmap                \
                                |NULLOK SV* nonbitmap_invlist       \
                                |NULLOK SV* only_utf8_locale_invlist\
                                |NULLOK const regnode * const node  \
@@ -2534,6 +2533,12 @@ ERp      |bool   |_is_grapheme   |NN const U8 * 
strbeg|NN const U8 * s|NN const U8 *stren
 ERs    |bool   |isFOO_utf8_lc  |const U8 classnum|NN const U8* character
 ERns   |char *|find_next_ascii|NN char* s|NN const char * send|const bool 
is_utf8
 ERns   |char *|find_next_non_ascii|NN char* s|NN const char * send|const bool 
is_utf8
+ERns   |char * |find_next_masked|NN char * s                           \
+                                |NN const char * send                  \
+                                |const U8 byte|const U8 mask
+ERns   |char *|find_span_end   |NN char* s|NN const char * send|const char 
span_byte
+ERns   |U8 *|find_span_end_mask|NN U8 * s|NN const U8 * send   \
+                               |const U8 span_byte|const U8 mask
 ERs    |SSize_t|regmatch       |NN regmatch_info *reginfo|NN char *startpos|NN 
regnode *prog
 WERs   |I32    |regrepeat      |NN regexp *prog|NN char **startposp \
                                |NN const regnode *p \
diff --git a/embed.h b/embed.h
index 334c6063fb..d53dff9123 100644
--- a/embed.h
+++ b/embed.h
@@ -46,6 +46,7 @@
 #define _to_utf8_lower_flags(a,b,c,d,e,f,g)    Perl__to_utf8_lower_flags(aTHX_ 
a,b,c,d,e,f,g)
 #define _to_utf8_title_flags(a,b,c,d,e,f,g)    Perl__to_utf8_title_flags(aTHX_ 
a,b,c,d,e,f,g)
 #define _to_utf8_upper_flags(a,b,c,d,e,f,g)    Perl__to_utf8_upper_flags(aTHX_ 
a,b,c,d,e,f,g)
+#define _variant_byte_number   S__variant_byte_number
 #define amagic_call(a,b,c,d)   Perl_amagic_call(aTHX_ a,b,c,d)
 #define amagic_deref_call(a,b) Perl_amagic_deref_call(aTHX_ a,b)
 #define apply_attrs_string(a,b,c,d)    Perl_apply_attrs_string(aTHX_ a,b,c,d)
@@ -768,9 +769,6 @@
 #define whichsig_sv(a)         Perl_whichsig_sv(aTHX_ a)
 #define wrap_keyword_plugin(a,b)       Perl_wrap_keyword_plugin(aTHX_ a,b)
 #define wrap_op_checker(a,b,c) Perl_wrap_op_checker(aTHX_ a,b,c)
-#if ! defined(EBCDIC) && ! defined USING_MSVC6
-#define _variant_byte_number   S__variant_byte_number
-#endif
 #if !(defined(HAS_MEMMEM))
 #define ninstr                 Perl_ninstr
 #endif
@@ -1012,6 +1010,7 @@
 #define compute_EXACTish       S_compute_EXACTish
 #define construct_ahocorasick_from_trie(a,b,c) 
S_construct_ahocorasick_from_trie(aTHX_ a,b,c)
 #define edit_distance          S_edit_distance
+#define get_ANYOFM_contents(a) S_get_ANYOFM_contents(aTHX_ a)
 #define get_ANYOF_cp_list_for_ssc(a,b) S_get_ANYOF_cp_list_for_ssc(aTHX_ a,b)
 #define get_invlist_iter_addr  S_get_invlist_iter_addr
 #define grok_bslash_N(a,b,c,d,e,f,g)   S_grok_bslash_N(aTHX_ a,b,c,d,e,f,g)
@@ -1119,7 +1118,10 @@
 #define backup_one_WB(a,b,c,d) S_backup_one_WB(aTHX_ a,b,c,d)
 #define find_byclass(a,b,c,d,e)        S_find_byclass(aTHX_ a,b,c,d,e)
 #define find_next_ascii                S_find_next_ascii
+#define find_next_masked       S_find_next_masked
 #define find_next_non_ascii    S_find_next_non_ascii
+#define find_span_end          S_find_span_end
+#define find_span_end_mask     S_find_span_end_mask
 #define isFOO_utf8_lc(a,b)     S_isFOO_utf8_lc(aTHX_ a,b)
 #define isGCB(a,b,c,d,e)       S_isGCB(aTHX_ a,b,c,d,e)
 #define isLB(a,b,c,d,e,f)      S_isLB(aTHX_ a,b,c,d,e,f)
diff --git a/inline.h b/inline.h
index 769e0532ac..3cd90e5712 100644
--- a/inline.h
+++ b/inline.h
@@ -401,8 +401,6 @@ S_is_utf8_invariant_string_loc(const U8* const s, STRLEN 
len, const U8 ** ep)
                                       | ( ( (PTR2nat(x)                       \
                                            & PERL_WORD_BOUNDARY_MASK) >> 2))))
 
-#  ifndef USING_MSVC6
-
     /* Do the word-at-a-time iff there is at least one usable full word.  That
      * means that after advancing to a word boundary, there still is at least a
      * full word left.  The number of bytes needed to advance is 'wordsize -
@@ -460,7 +458,6 @@ S_is_utf8_invariant_string_loc(const U8* const s, STRLEN 
len, const U8 ** ep)
         } while (x + PERL_WORDSIZE <= send);
     }
 
-#  endif    /* End of ! MSVC6 */
 #endif      /* End of ! EBCDIC */
 
     /* Process per-byte */
@@ -479,11 +476,6 @@ S_is_utf8_invariant_string_loc(const U8* const s, STRLEN 
len, const U8 ** ep)
     return TRUE;
 }
 
-#if ! defined(EBCDIC) && ! defined(USING_MSVC6)
-
-/* Apparent compiler error with MSVC6, so can't use this function.  All callers
- * to it must be compiled to use the EBCDIC fallback on MSVC6 */
-
 PERL_STATIC_INLINE unsigned int
 S__variant_byte_number(PERL_UINTMAX_T word)
 {
@@ -496,7 +488,24 @@ S__variant_byte_number(PERL_UINTMAX_T word)
     /* Get just the msb bits of each byte */
     word &= PERL_VARIANTS_WORD_MASK;
 
-#  if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
+#  ifdef USING_MSVC6    /* VC6 has some issues with the normal code, and the
+                           easiest thing is to hide that from the callers */
+    {
+        unsigned int i;
+        const U8 * s = (U8 *) &word;
+        dTHX;
+
+        for (i = 0; i < sizeof(word); i++ ) {
+            if (s[i]) {
+                return i;
+            }
+        }
+
+        Perl_croak(aTHX_ "panic: %s: %d: unexpected zero word\n",
+                                 __FILE__, __LINE__);
+    }
+
+#  elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
 
     /* Bytes are stored like
      *  Byte8 ... Byte2 Byte1
@@ -574,7 +583,6 @@ S__variant_byte_number(PERL_UINTMAX_T word)
     return (unsigned int) word;
 }
 
-#endif /* ! EBCDIC */
 #if defined(PERL_CORE) || defined(PERL_EXT)
 
 /*
diff --git a/pod/perldebguts.pod b/pod/perldebguts.pod
index 3a66f24a20..b1c01ca2d6 100644
--- a/pod/perldebguts.pod
+++ b/pod/perldebguts.pod
@@ -605,6 +605,8 @@ will be lost.
                             single char match only
  ANYOFD          sv 1       Like ANYOF, but /d is in effect
  ANYOFL          sv 1       Like ANYOF, but /l is in effect
+ ANYOFM          byte 1     Like ANYOF, but matches an invariant byte as
+                            determined by the mask and arg
 
  # POSIX Character Classes:
  POSIXD          none       Some [[:class:]] under /d; the FLAGS field
diff --git a/proto.h b/proto.h
index eadfc976db..0755630a94 100644
--- a/proto.h
+++ b/proto.h
@@ -137,6 +137,11 @@ PERL_CALLCONV UV   Perl__to_utf8_title_flags(pTHX_ const 
U8 *p, const U8* e, U8* u
 PERL_CALLCONV UV       Perl__to_utf8_upper_flags(pTHX_ const U8 *p, const U8 
*e, U8* ustrp, STRLEN *lenp, bool flags, const char * const file, const int 
line);
 #define PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS  \
        assert(p); assert(ustrp); assert(file)
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE unsigned int        S__variant_byte_number(PERL_UINTMAX_T 
word)
+                       __attribute__warn_unused_result__;
+#endif
+
 PERL_CALLCONV void     Perl__warn_problematic_locale(void);
 PERL_CALLCONV_NO_RET void      Perl_abort_execution(pTHX_ const char * const 
msg, const char * const name)
                        __attribute__noreturn__;
@@ -3806,13 +3811,6 @@ PERL_CALLCONV int        Perl_yylex(pTHX);
 PERL_CALLCONV int      Perl_yyparse(pTHX_ int gramtype);
 PERL_CALLCONV void     Perl_yyquit(pTHX);
 PERL_CALLCONV void     Perl_yyunlex(pTHX);
-#if ! defined(EBCDIC) && ! defined USING_MSVC6
-#ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE unsigned int        S__variant_byte_number(PERL_UINTMAX_T 
word)
-                       __attribute__warn_unused_result__;
-#endif
-
-#endif
 #if ! defined(HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
 #ifndef PERL_NO_INLINE_FUNCTIONS
 PERL_STATIC_INLINE void *      S_my_memrchr(const char * s, const char c, 
const STRLEN len);
@@ -4190,7 +4188,7 @@ STATIC const regnode*     S_dumpuntil(pTHX_ const regexp 
*r, const regnode *start, c
        assert(r); assert(start); assert(node); assert(sv)
 STATIC bool    S_put_charclass_bitmap_innards(pTHX_ SV* sv, char* bitmap, SV* 
nonbitmap_invlist, SV* only_utf8_locale_invlist, const regnode * const node, 
const bool force_as_is_display);
 #define PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS  \
-       assert(sv); assert(bitmap)
+       assert(sv)
 STATIC SV*     S_put_charclass_bitmap_innards_common(pTHX_ SV* invlist, SV* 
posixes, SV* only_utf8, SV* not_utf8, SV* only_utf8_locale, const bool invert);
 #define PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON   \
        assert(invlist)
@@ -5181,6 +5179,11 @@ STATIC int       S_edit_distance(const UV *src, const UV 
*tgt, const STRLEN x, const S
 #define PERL_ARGS_ASSERT_EDIT_DISTANCE \
        assert(src); assert(tgt)
 
+STATIC SV *    S_get_ANYOFM_contents(pTHX_ const regnode * n)
+                       __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS   \
+       assert(n)
+
 STATIC SV*     S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t 
*pRExC_state, const regnode_charclass* const node);
 #define PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC     \
        assert(pRExC_state); assert(node)
@@ -5574,11 +5577,26 @@ STATIC char *   S_find_next_ascii(char* s, const char * 
send, const bool is_utf8)
 #define PERL_ARGS_ASSERT_FIND_NEXT_ASCII       \
        assert(s); assert(send)
 
+STATIC char *  S_find_next_masked(char * s, const char * send, const U8 byte, 
const U8 mask)
+                       __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_FIND_NEXT_MASKED      \
+       assert(s); assert(send)
+
 STATIC char *  S_find_next_non_ascii(char* s, const char * send, const bool 
is_utf8)
                        __attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII   \
        assert(s); assert(send)
 
+STATIC char *  S_find_span_end(char* s, const char * send, const char 
span_byte)
+                       __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_FIND_SPAN_END \
+       assert(s); assert(send)
+
+STATIC U8 *    S_find_span_end_mask(U8 * s, const U8 * send, const U8 
span_byte, const U8 mask)
+                       __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_FIND_SPAN_END_MASK    \
+       assert(s); assert(send)
+
 STATIC bool    S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
                        __attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT_ISFOO_UTF8_LC \
diff --git a/regcomp.c b/regcomp.c
index 198f291f06..1cd5329f10 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -5516,6 +5516,27 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                                                           (regnode_charclass 
*) scan);
                    break;
 
+                case ANYOFM:
+                  {
+                    SV* cp_list = get_ANYOFM_contents(scan);
+
+                    if (flags & SCF_DO_STCLASS_OR) {
+                        ssc_union(data->start_class,
+                                  cp_list,
+                                  FALSE /* don't invert */
+                                  );
+                    }
+                    else if (flags & SCF_DO_STCLASS_AND) {
+                        ssc_intersection(data->start_class,
+                                         cp_list,
+                                         FALSE /* don't invert */
+                                         );
+                    }
+
+                    SvREFCNT_dec_NN(cp_list);
+                    break;
+                  }
+
                case NPOSIXL:
                     invert = 1;
                     /* FALLTHROUGH */
@@ -17999,25 +18020,20 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 
*flagp, U32 depth,
      * certain common classes that are easy to test.  Getting to this point in
      * the code means that the class didn't get optimized there.  Since this
      * code is only executed in Pass 2, it is too late to save space--it has
-     * been allocated in Pass 1, and currently isn't given back.  But turning
-     * things into an EXACTish node can allow the optimizer to join it to any
-     * adjacent such nodes.  And if the class is equivalent to things like /./,
-     * expensive run-time swashes can be avoided.  Now that we have more
-     * complete information, we can find things necessarily missed by the
-     * earlier code.  Another possible "optimization" that isn't done is that
-     * something like [Ee] could be changed into an EXACTFU.  khw tried this
-     * and found that the ANYOF is faster, including for code points not in the
-     * bitmap.  This still might make sense to do, provided it got joined with
-     * an adjacent node(s) to create a longer EXACTFU one.  This could be
-     * accomplished by creating a pseudo ANYOF_EXACTFU node type that the join
-     * routine would know is joinable.  If that didn't happen, the node type
-     * could then be made a straight ANYOF */
+     * been allocated in Pass 1, and currently isn't given back.  XXX Why not?
+     * But turning things into an EXACTish node can allow the optimizer to join
+     * it to any adjacent such nodes.  And if the class is equivalent to things
+     * like /./, expensive run-time swashes can be avoided.  Now that we have
+     * more complete information, we can find things necessarily missed by the
+     * earlier code. */
 
     if (optimizable && cp_list && ! invert) {
         UV start, end;
         U8 op = END;  /* The optimzation node-type */
         int posix_class = -1;   /* Illegal value */
         const char * cur_parse= RExC_parse;
+        U8 ANYOFM_mask;
+        U32 anode_arg = 0;
 
         invlist_iterinit(cp_list);
         if (! invlist_iternext(cp_list, &start, &end)) {
@@ -18156,6 +18172,106 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 
*flagp, U32 depth,
                 }
               found_posix: ;
             }
+
+            /* If it didn't match a POSIX class, it might be able to be turned
+             * into an ANYOFM node.  Compare two different bytes, bit-by-bit.
+             * In some positions, the bits in each will be 1; and in other
+             * positions both will be 0; and in some positions the bit will be
+             * 1 in one byte, and 0 in the other.  Let 'n' be the number of
+             * positions where the bits differ.  We create a mask which has
+             * exactly 'n' 0 bits, each in a position where the two bytes
+             * differ.  Now take the set of all bytes that when ANDed with the
+             * mask yield the same result.  That set has 2**n elements, and is
+             * representable by just two 8 bit numbers: the result and the
+             * mask.  Importantly, matching the set can be vectorized by
+             * creating a word full of the result bytes, and a word full of the
+             * mask bytes, yielding a significant speed up.  Here, see if this
+             * node matches such a set.  As a concrete example consider [01],
+             * and the byte representing '0' which is 0x30 on ASCII machines.
+             * It has the bits 0011 0000.  Take the mask 1111 1110.  If we AND
+             * 0x31 and 0x30 with that mask we get 0x30.  Any other bytes ANDed
+             * yield something else.  So [01], which is a common usage, is
+             * optimizable into ANYOFM, and can benefit from the speed up.  We
+             * can only do this on UTF-8 invariant bytes, because the variance
+             * would throw this off.  */
+            if (   op == END
+                && invlist_highest(cp_list) <=
+#ifdef EBCDIC
+                                               0xFF
+#else
+                                               0x7F
+#endif
+            ) {
+                Size_t cp_count = 0;
+                bool first_time = TRUE;
+                unsigned int lowest_cp;
+                U8 bits_differing = 0;
+
+                /* Only needed on EBCDIC, as there, variants and non- are mixed
+                 * together.  Could #ifdef it out on ASCII, but probably the
+                 * compiler will optimize it out */
+                bool has_variant = FALSE;
+
+                /* Go through the bytes and find the bit positions that differ 
*/
+                invlist_iterinit(cp_list);
+                while (invlist_iternext(cp_list, &start, &end)) {
+                    unsigned int i = start;
+
+                    cp_count += end - start + 1;
+
+                    if (first_time) {
+                        if (! UVCHR_IS_INVARIANT(i)) {
+                            has_variant = TRUE;
+                            continue;
+                        }
+
+                        first_time = FALSE;
+                        lowest_cp = start;
+
+                        i++;
+                    }
+
+                    /* Find the bit positions that differ from the lowest code
+                     * point in the node.  Keep track of all such positions by
+                     * OR'ing */
+                    for (; i <= end; i++) {
+                        if (! UVCHR_IS_INVARIANT(i)) {
+                            has_variant = TRUE;
+                            continue;
+                        }
+
+                        bits_differing  |= i ^ lowest_cp;
+                    }
+                }
+                invlist_iterfinish(cp_list);
+
+                /* At the end of the loop, we count how many bits differ from
+                 * the bits in lowest code point, call the count 'd'.  If the
+                 * set we found contains 2**d elements, it is the closure of
+                 * all code points that differ only in those bit positions.  To
+                 * convince yourself of that, first note that the number in the
+                 * closure must be a power of 2, which we test for.  The only
+                 * way we could have that count and it be some differing set,
+                 * is if we got some code points that don't differ from the
+                 * lowest code point in any position, but do differ from each
+                 * other in some other position.  That means one code point has
+                 * a 1 in that position, and another has a 0.  But that would
+                 * mean that one of them differs from the lowest code point in
+                 * that position, which possibility we've already excluded. */
+                if ( ! has_variant
+                    && cp_count == 1U << PL_bitcount[bits_differing])
+                {
+                    assert(cp_count > 1);
+                    op = ANYOFM;
+
+                    /* We need to make the bits that differ be 0's */
+                    ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
+
+                    /* The argument is the lowest code point */
+                    anode_arg = lowest_cp;
+                    *flagp |= HASWIDTH|SIMPLE;
+                }
+            }
         }
 
         if (op != END) {
@@ -18163,7 +18279,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth,
             RExC_emit = (regnode *)orig_emit;
 
             if (regarglen[op]) {
-                ret = reganode(pRExC_state, op, 0);
+                ret = reganode(pRExC_state, op, anode_arg);
             } else {
                 ret = reg_node(pRExC_state, op);
             }
@@ -18178,6 +18294,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth,
             else if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
                 FLAGS(ret) = posix_class;
             }
+            else if (PL_regkind[op] == ANYOFM) {
+                FLAGS(ret) = ANYOFM_mask;
+            }
 
             SvREFCNT_dec_NN(cp_list);
             return ret;
@@ -19030,6 +19149,36 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, 
regnode *p,
 }
 #endif
 
+STATIC SV*
+S_get_ANYOFM_contents(pTHX_ const regnode * n) {
+
+    /* Returns an inversion list of all the code points matched by the ANYOFM
+     * node 'n' */
+
+    SV * cp_list = _new_invlist(-1);
+    const U8 lowest = ARG(n);
+    unsigned int i;
+    U8 count = 0;
+    U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
+
+    PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
+
+    /* Starting with the lowest code point, any code point that ANDed with the
+     * mask yields the lowest code point is in the set */
+    for (i = lowest; i <= 0xFF; i++) {
+        if ((i & FLAGS(n)) == ARG(n)) {
+            cp_list = add_cp_to_invlist(cp_list, i);
+            count++;
+
+            /* We know how many code points (a power of two) that are in the
+             * set.  No use looking once we've got that number */
+            if (count >= needed) break;
+        }
+    }
+
+    return cp_list;
+}
+
 /*
  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
  */
@@ -19556,6 +19705,15 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const 
regnode *o, const regmatch_
 
         SvREFCNT_dec(unresolved);
     }
+    else if (k == ANYOFM) {
+        SV * cp_list = get_ANYOFM_contents(o);
+
+       Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
+        put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, TRUE);
+       Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
+
+        SvREFCNT_dec(cp_list);
+    }
     else if (k == POSIXD || k == NPOSIXD) {
         U8 index = FLAGS(o) * 2;
         if (index < C_ARRAY_LENGTH(anyofs)) {
@@ -20595,7 +20753,7 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv,
 {
     /* Appends to 'sv' a displayable version of the innards of the bracketed
      * character class defined by the other arguments:
-     *  'bitmap' points to the bitmap.
+     *  'bitmap' points to the bitmap, or NULL if to ignore that.
      *  'nonbitmap_invlist' is an inversion list of the code points that are in
      *      the bitmap range, but for some reason aren't in the bitmap; NULL if
      *      none.  The reasons for this could be that they require some
@@ -20706,13 +20864,16 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv,
     }
 
     /* Accumulate the bit map into the unconditional match list */
-    for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
-        if (BITMAP_TEST(bitmap, i)) {
-            int start = i++;
-            for (; i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); i++) {
-                /* empty */
+    if (bitmap) {
+        for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
+            if (BITMAP_TEST(bitmap, i)) {
+                int start = i++;
+                for (;
+                     i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
+                     i++)
+                { /* empty */ }
+                invlist = _add_range_to_invlist(invlist, start, i-1);
             }
-            invlist = _add_range_to_invlist(invlist, start, i-1);
         }
     }
 
diff --git a/regcomp.sym b/regcomp.sym
index cddf84c24d..14840b5845 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -59,6 +59,7 @@ SANY        REG_ANY,    no 0 S    ; Match any one character.
 ANYOF       ANYOF,      sv 1 S    ; Match character in (or not in) this class, 
single char match only
 ANYOFD      ANYOF,      sv 1 S    ; Like ANYOF, but /d is in effect
 ANYOFL      ANYOF,      sv 1 S    ; Like ANYOF, but /l is in effect
+ANYOFM      ANYOFM      byte 1 S  ; Like ANYOF, but matches an invariant byte 
as determined by the mask and arg
 
 #* POSIX Character Classes:
 # Order of the below is important.  See ordering comment above.
diff --git a/regexec.c b/regexec.c
index 530f1d6250..31a133f20b 100644
--- a/regexec.c
+++ b/regexec.c
@@ -560,7 +560,7 @@ S_find_next_ascii(char * s, const char * send, const bool 
utf8_target)
 
     PERL_ARGS_ASSERT_FIND_NEXT_ASCII;
 
-#if ! defined(EBCDIC) && ! defined(USING_MSVC6)
+#ifndef EBCDIC
 
     if ((STRLEN) (send - s) >= PERL_WORDSIZE
 
@@ -676,6 +676,200 @@ S_find_next_non_ascii(char * s, const char * send, const 
bool utf8_target)
 
 }
 
+STATIC char *
+S_find_span_end(char * s, const char * send, const char span_byte)
+{
+    /* Returns the position of the first byte in the sequence between 's' and
+     * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
+     * Where enough bytes are available, whole words are compared at a time;
+     * per-byte loops handle the unaligned head and the short tail. */
+
+    PERL_ARGS_ASSERT_FIND_SPAN_END;
+
+    assert(send >= s);
+
+    if ((STRLEN) (send - s) >= PERL_WORDSIZE
+                          + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
+                          - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
+    {
+        PERL_UINTMAX_T span_word;
+
+        /* Process per-byte until reach word boundary.  XXX This loop could be
+         * eliminated if we knew that this platform had fast unaligned reads */
+        while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
+            if (*s != span_byte) {
+                return s;
+            }
+            s++;
+        }
+
+        /* Create a word filled with the bytes we are spanning.  'span_byte'
+         * is a plain 'char', which may be a signed type; reduce it to its
+         * unsigned 8-bit value before multiplying.  Otherwise a byte above
+         * 0x7F would be sign-extended when widened, and the product would no
+         * longer be 'span_byte' repeated in every byte of the word */
+        span_word = PERL_COUNT_MULTIPLIER * (U8) span_byte;
+
+        /* Process per-word as long as we have at least a full word left */
+        do {
+
+            /* Keep going if the whole word is composed of 'span_byte's */
+            if ((* (PERL_UINTMAX_T *) s) == span_word)  {
+                s += PERL_WORDSIZE;
+                continue;
+            }
+
+            /* Here, at least one byte in the word isn't 'span_byte'.  This xor
+             * leaves 1 bits only in those non-matching bytes.  ('span_word' is
+             * clobbered, which is fine as we return just below.) */
+            span_word ^= * (PERL_UINTMAX_T *) s;
+
+            /* Make sure the upper bit of each non-matching byte is set.  This
+             * makes each such byte look like an ASCII platform variant byte.
+             * The shifts total 7 bits, so a bit can reach at most bit 6 of
+             * the next byte up, never that byte's upper bit */
+            span_word |= span_word << 1;
+            span_word |= span_word << 2;
+            span_word |= span_word << 4;
+
+            /* That reduces the problem to what this function solves */
+            return s + _variant_byte_number(span_word);
+
+        } while (s + PERL_WORDSIZE <= send);
+    }
+
+    /* Process the straggler bytes beyond the final word boundary */
+    while (s < send) {
+        if (*s != span_byte) {
+            return s;
+        }
+        s++;
+    }
+
+    return s;
+}
+
+STATIC char *
+S_find_next_masked(char * s, const char * send, const U8 byte, const U8 mask)
+{
+    /* Returns the position of the first byte in the sequence between 's'
+     * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
+     * returns 'send' if none found.  It uses word-level operations instead of
+     * byte to speed up the process.  'byte' must already have 'mask' applied
+     * to it (asserted below). */
+
+    PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
+
+    assert(send >= s);
+    assert((byte & mask) == byte);
+
+    if ((STRLEN) (send - s) >= PERL_WORDSIZE
+                          + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
+                          - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
+    {
+        PERL_UINTMAX_T byte_word, mask_word;
+
+        /* Process per-byte until reaching a word boundary */
+        while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
+            if (((* (U8 *) s) & mask) == byte) {
+                return s;
+            }
+            s++;
+        }
+
+        /* Words with 'byte' and 'mask' repeated in every byte position */
+        byte_word = PERL_COUNT_MULTIPLIER * byte;
+        mask_word = PERL_COUNT_MULTIPLIER * mask;
+
+        do {
+            PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
+
+            /* Xoring with 'byte_word' leaves a byte 0 exactly when its masked
+             * value equals 'byte'.  (ANDing with the complement of 'byte'
+             * instead would also zero any byte whose masked bits are merely a
+             * subset of those in 'byte', giving false hits that the per-byte
+             * loops in this function would not report.) */
+            masked ^= byte_word;
+
+            /* This causes the most significant bit to be set to 1 for any
+             * bytes in the word that aren't completely 0.  The shifts total 7
+             * bits, so they never reach the msbit of the next byte up */
+            masked |= masked << 1;
+            masked |= masked << 2;
+            masked |= masked << 4;
+
+            /* The msbits are the same as what marks a byte as variant, so we
+             * can use this mask.  If all msbits are 1, the word doesn't
+             * contain 'byte' */
+            if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
+                s += PERL_WORDSIZE;
+                continue;
+            }
+
+            /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
+             * and any that are, are 0.  Complement and re-AND to swap that */
+            masked = ~ masked;
+            masked &= PERL_VARIANTS_WORD_MASK;
+
+            /* This reduces the problem to that solved by this function */
+            s += _variant_byte_number(masked);
+            return s;
+
+        } while (s + PERL_WORDSIZE <= send);
+    }
+
+    /* Process the straggler bytes beyond the final word boundary */
+    while (s < send) {
+        if (((* (U8 *) s) & mask) == byte) {
+            return s;
+        }
+        s++;
+    }
+
+    return s;
+}
+
+STATIC U8 *
+S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte,
+                     const U8 mask)
+{
+    /* Scans forward from 's' for as long as each byte, once ANDed with
+     * 'mask', equals 'span_byte'.  Returns the address of the first byte for
+     * which that fails, or 'send' if every byte up through 'send-1'
+     * qualifies.  The caller must pass a 'span_byte' that already has 'mask'
+     * applied to it.  This is the masked counterpart of find_span_end() */
+
+    PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
+
+    assert(send >= s);
+    assert((span_byte & mask) == span_byte);
+
+    if ((STRLEN) (send - s) >= PERL_WORDSIZE
+                          + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
+                          - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
+    {
+        PERL_UINTMAX_T want_word, mask_word;
+
+        /* Go a byte at a time until 's' sits on a word boundary */
+        for (; PTR2nat(s) & PERL_WORD_BOUNDARY_MASK; s++) {
+            if ((*s & mask) != span_byte) {
+                return s;
+            }
+        }
+
+        /* Replicate each of the two bytes across a whole word */
+        mask_word = PERL_COUNT_MULTIPLIER * mask;
+        want_word = PERL_COUNT_MULTIPLIER * span_byte;
+
+        /* Now a word at a time, while at least a full word remains */
+        do {
+
+            /* Any 1 bits left here lie in bytes that fail the test */
+            PERL_UINTMAX_T diff = ((* (PERL_UINTMAX_T *) s) & mask_word)
+                                  ^ want_word;
+
+            if (diff) {
+
+                /* Set the upper bit of every failing byte, then let the
+                 * helper locate the first such byte in the word */
+                diff |= diff << 1;
+                diff |= diff << 2;
+                diff |= diff << 4;
+                return s + _variant_byte_number(diff);
+            }
+
+            /* The entire word qualifies; move on to the next one */
+            s += PERL_WORDSIZE;
+
+        } while (s + PERL_WORDSIZE <= send);
+    }
+
+    /* Finish off whatever bytes remain that don't fill a word */
+    while (s < send && (*s & mask) == span_byte) {
+        s++;
+    }
+
+    return s;
+}
+
 /*
  * pregexec and friends
  */
@@ -2062,6 +2256,12 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, 
char *s,
         }
         break;
 
+    case ANYOFM:    /* ARG() is the base byte; FLAGS() the mask byte */
+        /* UTF-8ness doesn't matter, so use 0 */
+        REXEC_FBC_FIND_NEXT_SCAN(0,
+                                 find_next_masked(s, strend, ARG(c), 
FLAGS(c)));
+        break;
+
     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns 
*/
         assert(! is_utf8_pat);
        /* FALLTHROUGH */
@@ -6537,6 +6737,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, 
regnode *prog)
            }
            break;
 
+        case ANYOFM:
+            if (NEXTCHR_IS_EOS || (UCHARAT(locinput) & FLAGS(scan)) != 
ARG(scan)) {
+                sayNO;
+            }
+            locinput++;
+            break;
+
         case ASCII:
             if (NEXTCHR_IS_EOS || ! isASCII(UCHARAT(locinput))) {
                 sayNO;
@@ -9003,8 +9210,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const 
regnode *p,
                hardcount++;
            }
        } else {
-           while (scan < loceol && *scan != '\n')
-               scan++;
+            scan = (char *) memchr(scan, '\n', loceol - scan);
+            if (! scan) {
+                scan = loceol;
+            }
        }
        break;
     case SANY:
@@ -9028,7 +9237,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const 
regnode *p,
 
        c = (U8)*STRING(p);
 
-        /* Can use a simple loop if the pattern char to match on is invariant
+        /* Can use a simple find if the pattern char to match on is invariant
          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
          * true iff it doesn't matter if the argument is in UTF-8 or not */
@@ -9038,9 +9247,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const 
regnode *p,
                  * since here, to match at all, 1 char == 1 byte */
                 loceol = scan + max;
             }
-           while (scan < loceol && UCHARAT(scan) == c) {
-               scan++;
-           }
+            scan = find_span_end(scan, loceol, (U8) c);
        }
        else if (reginfo->is_utf8_pat) {
             if (utf8_target) {
@@ -9060,11 +9267,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const 
regnode *p,
             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
 
                 /* Target isn't utf8; convert the character in the UTF-8
-                 * pattern to non-UTF8, and do a simple loop */
+                 * pattern to non-UTF8, and do a simple find */
                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
-                while (scan < loceol && UCHARAT(scan) == c) {
-                    scan++;
-                }
+                scan = find_span_end(scan, loceol, (U8) c);
             } /* else pattern char is above Latin1, can't possibly match the
                  non-UTF-8 target */
         }
@@ -9162,9 +9367,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const 
regnode *p,
                 }
             }
             else if (c1 == c2) {
-                while (scan < loceol && UCHARAT(scan) == c1) {
-                    scan++;
-                }
+                scan = find_span_end(scan, loceol, c1);
             }
             else {
                 /* See comments in regmatch() CURLY_B_min_known_fail.  We avoid
@@ -9173,14 +9376,12 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const 
regnode *p,
                 U8 c1_c2_bits_differing = c1 ^ c2;
 
                 if (isPOWER_OF_2(c1_c2_bits_differing)) {
-                    U8 c1_masked = c1 & ~ c1_c2_bits_differing;
                     U8 c1_c2_mask = ~ c1_c2_bits_differing;
 
-                    while (   scan < loceol
-                           && (UCHARAT(scan) & c1_c2_mask) == c1_masked)
-                    {
-                        scan++;
-                    }
+                    scan = (char *) find_span_end_mask((U8 *) scan,
+                                                       (U8 *) loceol,
+                                                       c1 & c1_c2_mask,
+                                                       c1_c2_mask);
                 }
                 else {
                     while (    scan < loceol
@@ -9222,7 +9423,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const 
regnode *p,
        }
        break;
 
-    case ASCII:
+    case ANYOFM:
         if (utf8_target && loceol - scan > max) {
 
             /* We didn't adjust <loceol> at the beginning of this routine
@@ -9231,6 +9432,14 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const 
regnode *p,
             loceol = scan + max;
         }
 
+        scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) loceol, (U8) 
ARG(p), FLAGS(p));
+        break;
+
+    case ASCII:
+        if (utf8_target && loceol - scan > max) {
+            loceol = scan + max;
+        }
+
         scan = find_next_non_ascii(scan, loceol, utf8_target);
        break;
 
diff --git a/regnodes.h b/regnodes.h
index f76aab4cc0..855a215650 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -6,8 +6,8 @@
 
 /* Regops and State definitions */
 
-#define REGNODE_MAX            96
-#define REGMATCH_STATE_MAX     138
+#define REGNODE_MAX            97
+#define REGMATCH_STATE_MAX     139
 
 #define        END                     0       /* 0000 End of program. */
 #define        SUCCEED                 1       /* 0x01 Return from a 
subroutine, basically. */
@@ -32,82 +32,83 @@
 #define        ANYOF                   18      /* 0x12 Match character in (or 
not in) this class, single char match only */
 #define        ANYOFD                  19      /* 0x13 Like ANYOF, but /d is 
in effect */
 #define        ANYOFL                  20      /* 0x14 Like ANYOF, but /l is 
in effect */
-#define        POSIXD                  21      /* 0x15 Some [[:class:]] under 
/d; the FLAGS field gives which one */
-#define        POSIXL                  22      /* 0x16 Some [[:class:]] under 
/l; the FLAGS field gives which one */
-#define        POSIXU                  23      /* 0x17 Some [[:class:]] under 
/u; the FLAGS field gives which one */
-#define        POSIXA                  24      /* 0x18 Some [[:class:]] under 
/a; the FLAGS field gives which one */
-#define        NPOSIXD                 25      /* 0x19 complement of POSIXD, 
[[:^class:]] */
-#define        NPOSIXL                 26      /* 0x1a complement of POSIXL, 
[[:^class:]] */
-#define        NPOSIXU                 27      /* 0x1b complement of POSIXU, 
[[:^class:]] */
-#define        NPOSIXA                 28      /* 0x1c complement of POSIXA, 
[[:^class:]] */
-#define        ASCII                   29      /* 0x1d [[:ascii:]] */
-#define        NASCII                  30      /* 0x1e [[:^ascii:]] */
-#define        CLUMP                   31      /* 0x1f Match any extended 
grapheme cluster sequence */
-#define        BRANCH                  32      /* 0x20 Match this alternative, 
or the next... */
-#define        EXACT                   33      /* 0x21 Match this string 
(preceded by length). */
-#define        EXACTL                  34      /* 0x22 Like EXACT, but /l is 
in effect (used so locale-related warnings can be checked for). */
-#define        EXACTF                  35      /* 0x23 Match this non-UTF-8 
string (not guaranteed to be folded) using /id rules (w/len). */
-#define        EXACTFL                 36      /* 0x24 Match this string (not 
guaranteed to be folded) using /il rules (w/len). */
-#define        EXACTFU                 37      /* 0x25 Match this string 
(folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using 
/iu rules (w/len). */
-#define        EXACTFA                 38      /* 0x26 Match this string (not 
guaranteed to be folded) using /iaa rules (w/len). */
-#define        EXACTFU_SS              39      /* 0x27 Match this string 
(folded iff in UTF-8, length in folding may change even if not in UTF-8) using 
/iu rules (w/len). */
-#define        EXACTFLU8               40      /* 0x28 Rare cirucmstances: 
like EXACTFU, but is under /l, UTF-8, folded, and everything in it is above 
255. */
-#define        EXACTFA_NO_TRIE         41      /* 0x29 Match this string 
(which is not trie-able; not guaranteed to be folded) using /iaa rules (w/len). 
*/
-#define        NOTHING                 42      /* 0x2a Match empty string. */
-#define        TAIL                    43      /* 0x2b Match empty string. Can 
jump here from outside. */
-#define        STAR                    44      /* 0x2c Match this (simple) 
thing 0 or more times. */
-#define        PLUS                    45      /* 0x2d Match this (simple) 
thing 1 or more times. */
-#define        CURLY                   46      /* 0x2e Match this simple thing 
{n,m} times. */
-#define        CURLYN                  47      /* 0x2f Capture next-after-this 
simple thing */
-#define        CURLYM                  48      /* 0x30 Capture this 
medium-complex thing {n,m} times. */
-#define        CURLYX                  49      /* 0x31 Match this complex 
thing {n,m} times. */
-#define        WHILEM                  50      /* 0x32 Do curly processing and 
see if rest matches. */
-#define        OPEN                    51      /* 0x33 Mark this point in 
input as start of #n. */
-#define        CLOSE                   52      /* 0x34 Close corresponding 
OPEN of #n. */
-#define        SROPEN                  53      /* 0x35 Same as OPEN, but for 
script run */
-#define        SRCLOSE                 54      /* 0x36 Close preceding SROPEN 
*/
-#define        REF                     55      /* 0x37 Match some already 
matched string */
-#define        REFF                    56      /* 0x38 Match already matched 
string, folded using native charset rules for non-utf8 */
-#define        REFFL                   57      /* 0x39 Match already matched 
string, folded in loc. */
-#define        REFFU                   58      /* 0x3a Match already matched 
string, folded using unicode rules for non-utf8 */
-#define        REFFA                   59      /* 0x3b Match already matched 
string, folded using unicode rules for non-utf8, no mixing ASCII, non-ASCII */
-#define        NREF                    60      /* 0x3c Match some already 
matched string */
-#define        NREFF                   61      /* 0x3d Match already matched 
string, folded using native charset rules for non-utf8 */
-#define        NREFFL                  62      /* 0x3e Match already matched 
string, folded in loc. */
-#define        NREFFU                  63      /* 0x3f Match already matched 
string, folded using unicode rules for non-utf8 */
-#define        NREFFA                  64      /* 0x40 Match already matched 
string, folded using unicode rules for non-utf8, no mixing ASCII, non-ASCII */
-#define        LONGJMP                 65      /* 0x41 Jump far away. */
-#define        BRANCHJ                 66      /* 0x42 BRANCH with long 
offset. */
-#define        IFMATCH                 67      /* 0x43 Succeeds if the 
following matches. */
-#define        UNLESSM                 68      /* 0x44 Fails if the following 
matches. */
-#define        SUSPEND                 69      /* 0x45 "Independent" sub-RE. */
-#define        IFTHEN                  70      /* 0x46 Switch, should be 
preceded by switcher. */
-#define        GROUPP                  71      /* 0x47 Whether the group 
matched. */
-#define        EVAL                    72      /* 0x48 Execute some Perl code. 
*/
-#define        MINMOD                  73      /* 0x49 Next operator is not 
greedy. */
-#define        LOGICAL                 74      /* 0x4a Next opcode should set 
the flag only. */
-#define        RENUM                   75      /* 0x4b Group with 
independently numbered parens. */
-#define        TRIE                    76      /* 0x4c Match many 
EXACT(F[ALU]?)? at once. flags==type */
-#define        TRIEC                   77      /* 0x4d Same as TRIE, but with 
embedded charclass data */
-#define        AHOCORASICK             78      /* 0x4e Aho Corasick stclass. 
flags==type */
-#define        AHOCORASICKC            79      /* 0x4f Same as AHOCORASICK, 
but with embedded charclass data */
-#define        GOSUB                   80      /* 0x50 recurse to paren arg1 
at (signed) ofs arg2 */
-#define        NGROUPP                 81      /* 0x51 Whether the group 
matched. */
-#define        INSUBP                  82      /* 0x52 Whether we are in a 
specific recurse. */
-#define        DEFINEP                 83      /* 0x53 Never execute directly. 
*/
-#define        ENDLIKE                 84      /* 0x54 Used only for the type 
field of verbs */
-#define        OPFAIL                  85      /* 0x55 Same as (?!), but with 
verb arg */
-#define        ACCEPT                  86      /* 0x56 Accepts the current 
matched string, with verbar */
-#define        VERB                    87      /* 0x57 Used only for the type 
field of verbs */
-#define        PRUNE                   88      /* 0x58 Pattern fails at this 
startpoint if no-backtracking through this */
-#define        MARKPOINT               89      /* 0x59 Push the current 
location for rollback by cut. */
-#define        SKIP                    90      /* 0x5a On failure skip forward 
(to the mark) before retrying */
-#define        COMMIT                  91      /* 0x5b Pattern fails outright 
if backtracking through this */
-#define        CUTGROUP                92      /* 0x5c On failure go to the 
next alternation in the group */
-#define        KEEPS                   93      /* 0x5d $& begins here. */
-#define        LNBREAK                 94      /* 0x5e generic newline pattern 
*/
-#define        OPTIMIZED               95      /* 0x5f Placeholder for dump. */
-#define        PSEUDO                  96      /* 0x60 Pseudo opcode for 
internal use. */
+#define        ANYOFM                  21      /* 0x15 Like ANYOF, but matches 
an invariant byte as determined by the mask and arg */
+#define        POSIXD                  22      /* 0x16 Some [[:class:]] under 
/d; the FLAGS field gives which one */
+#define        POSIXL                  23      /* 0x17 Some [[:class:]] under 
/l; the FLAGS field gives which one */
+#define        POSIXU                  24      /* 0x18 Some [[:class:]] under 
/u; the FLAGS field gives which one */
+#define        POSIXA                  25      /* 0x19 Some [[:class:]] under 
/a; the FLAGS field gives which one */
+#define        NPOSIXD                 26      /* 0x1a complement of POSIXD, 
[[:^class:]] */
+#define        NPOSIXL                 27      /* 0x1b complement of POSIXL, 
[[:^class:]] */
+#define        NPOSIXU                 28      /* 0x1c complement of POSIXU, 
[[:^class:]] */
+#define        NPOSIXA                 29      /* 0x1d complement of POSIXA, 
[[:^class:]] */
+#define        ASCII                   30      /* 0x1e [[:ascii:]] */
+#define        NASCII                  31      /* 0x1f [[:^ascii:]] */
+#define        CLUMP                   32      /* 0x20 Match any extended 
grapheme cluster sequence */
+#define        BRANCH                  33      /* 0x21 Match this alternative, 
or the next... */
+#define        EXACT                   34      /* 0x22 Match this string 
(preceded by length). */
+#define        EXACTL                  35      /* 0x23 Like EXACT, but /l is 
in effect (used so locale-related warnings can be checked for). */
+#define        EXACTF                  36      /* 0x24 Match this non-UTF-8 
string (not guaranteed to be folded) using /id rules (w/len). */
+#define        EXACTFL                 37      /* 0x25 Match this string (not 
guaranteed to be folded) using /il rules (w/len). */
+#define        EXACTFU                 38      /* 0x26 Match this string 
(folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using 
/iu rules (w/len). */
+#define        EXACTFA                 39      /* 0x27 Match this string (not 
guaranteed to be folded) using /iaa rules (w/len). */
+#define        EXACTFU_SS              40      /* 0x28 Match this string 
(folded iff in UTF-8, length in folding may change even if not in UTF-8) using 
/iu rules (w/len). */
+#define        EXACTFLU8               41      /* 0x29 Rare cirucmstances: 
like EXACTFU, but is under /l, UTF-8, folded, and everything in it is above 
255. */
+#define        EXACTFA_NO_TRIE         42      /* 0x2a Match this string 
(which is not trie-able; not guaranteed to be folded) using /iaa rules (w/len). 
*/
+#define        NOTHING                 43      /* 0x2b Match empty string. */
+#define        TAIL                    44      /* 0x2c Match empty string. Can 
jump here from outside. */
+#define        STAR                    45      /* 0x2d Match this (simple) 
thing 0 or more times. */
+#define        PLUS                    46      /* 0x2e Match this (simple) 
thing 1 or more times. */
+#define        CURLY                   47      /* 0x2f Match this simple thing 
{n,m} times. */
+#define        CURLYN                  48      /* 0x30 Capture next-after-this 
simple thing */
+#define        CURLYM                  49      /* 0x31 Capture this 
medium-complex thing {n,m} times. */
+#define        CURLYX                  50      /* 0x32 Match this complex 
thing {n,m} times. */
+#define        WHILEM                  51      /* 0x33 Do curly processing and 
see if rest matches. */
+#define        OPEN                    52      /* 0x34 Mark this point in 
input as start of #n. */
+#define        CLOSE                   53      /* 0x35 Close corresponding 
OPEN of #n. */
+#define        SROPEN                  54      /* 0x36 Same as OPEN, but for 
script run */
+#define        SRCLOSE                 55      /* 0x37 Close preceding SROPEN 
*/
+#define        REF                     56      /* 0x38 Match some already 
matched string */
+#define        REFF                    57      /* 0x39 Match already matched 
string, folded using native charset rules for non-utf8 */
+#define        REFFL                   58      /* 0x3a Match already matched 
string, folded in loc. */
+#define        REFFU                   59      /* 0x3b Match already matched 
string, folded using unicode rules for non-utf8 */
+#define        REFFA                   60      /* 0x3c Match already matched 
string, folded using unicode rules for non-utf8, no mixing ASCII, non-ASCII */
+#define        NREF                    61      /* 0x3d Match some already 
matched string */
+#define        NREFF                   62      /* 0x3e Match already matched 
string, folded using native charset rules for non-utf8 */
+#define        NREFFL                  63      /* 0x3f Match already matched 
string, folded in loc. */
+#define        NREFFU                  64      /* 0x40 Match already matched 
string, folded using unicode rules for non-utf8 */
+#define        NREFFA                  65      /* 0x41 Match already matched 
string, folded using unicode rules for non-utf8, no mixing ASCII, non-ASCII */
+#define        LONGJMP                 66      /* 0x42 Jump far away. */
+#define        BRANCHJ                 67      /* 0x43 BRANCH with long 
offset. */
+#define        IFMATCH                 68      /* 0x44 Succeeds if the 
following matches. */
+#define        UNLESSM                 69      /* 0x45 Fails if the following 
matches. */
+#define        SUSPEND                 70      /* 0x46 "Independent" sub-RE. */
+#define        IFTHEN                  71      /* 0x47 Switch, should be 
preceded by switcher. */
+#define        GROUPP                  72      /* 0x48 Whether the group 
matched. */
+#define        EVAL                    73      /* 0x49 Execute some Perl code. 
*/
+#define        MINMOD                  74      /* 0x4a Next operator is not 
greedy. */
+#define        LOGICAL                 75      /* 0x4b Next opcode should set 
the flag only. */
+#define        RENUM                   76      /* 0x4c Group with 
independently numbered parens. */
+#define        TRIE                    77      /* 0x4d Match many 
EXACT(F[ALU]?)? at once. flags==type */
+#define        TRIEC                   78      /* 0x4e Same as TRIE, but with 
embedded charclass data */
+#define        AHOCORASICK             79      /* 0x4f Aho Corasick stclass. 
flags==type */
+#define        AHOCORASICKC            80      /* 0x50 Same as AHOCORASICK, 
but with embedded charclass data */
+#define        GOSUB                   81      /* 0x51 recurse to paren arg1 
at (signed) ofs arg2 */
+#define        NGROUPP                 82      /* 0x52 Whether the group 
matched. */
+#define        INSUBP                  83      /* 0x53 Whether we are in a 
specific recurse. */
+#define        DEFINEP                 84      /* 0x54 Never execute directly. 
*/
+#define        ENDLIKE                 85      /* 0x55 Used only for the type 
field of verbs */
+#define        OPFAIL                  86      /* 0x56 Same as (?!), but with 
verb arg */
+#define        ACCEPT                  87      /* 0x57 Accepts the current 
matched string, with verbar */
+#define        VERB                    88      /* 0x58 Used only for the type 
field of verbs */
+#define        PRUNE                   89      /* 0x59 Pattern fails at this 
startpoint if no-backtracking through this */
+#define        MARKPOINT               90      /* 0x5a Push the current 
location for rollback by cut. */
+#define        SKIP                    91      /* 0x5b On failure skip forward 
(to the mark) before retrying */
+#define        COMMIT                  92      /* 0x5c Pattern fails outright 
if backtracking through this */
+#define        CUTGROUP                93      /* 0x5d On failure go to the 
next alternation in the group */
+#define        KEEPS                   94      /* 0x5e $& begins here. */
+#define        LNBREAK                 95      /* 0x5f generic newline pattern 
*/
+#define        OPTIMIZED               96      /* 0x60 Placeholder for dump. */
+#define        PSEUDO                  97      /* 0x61 Pseudo opcode for 
internal use. */
        /* ------------ States ------------- */
 #define        TRIE_next               (REGNODE_MAX + 1)       /* state for 
TRIE */
 #define        TRIE_next_fail          (REGNODE_MAX + 2)       /* state for 
TRIE */
@@ -179,6 +180,7 @@ EXTCONST U8 PL_regkind[] = {
        ANYOF,          /* ANYOF                  */
        ANYOF,          /* ANYOFD                 */
        ANYOF,          /* ANYOFL                 */
+       ANYOFM,         /* ANYOFM                 */
        POSIXD,         /* POSIXD                 */
        POSIXD,         /* POSIXL                 */
        POSIXD,         /* POSIXU                 */
@@ -327,6 +329,7 @@ static const U8 regarglen[] = {
        EXTRA_SIZE(struct regnode_1),           /* ANYOF        */
        EXTRA_SIZE(struct regnode_1),           /* ANYOFD       */
        EXTRA_SIZE(struct regnode_1),           /* ANYOFL       */
+       EXTRA_SIZE(struct regnode_1),           /* ANYOFM       */
        0,                                      /* POSIXD       */
        0,                                      /* POSIXL       */
        0,                                      /* POSIXU       */
@@ -429,6 +432,7 @@ static const char reg_off_by_arg[] = {
        0,      /* ANYOF        */
        0,      /* ANYOFD       */
        0,      /* ANYOFL       */
+       0,      /* ANYOFM       */
        0,      /* POSIXD       */
        0,      /* POSIXL       */
        0,      /* POSIXU       */
@@ -537,82 +541,83 @@ EXTCONST char * const PL_reg_name[] = {
        "ANYOF",                        /* 0x12 */
        "ANYOFD",                       /* 0x13 */
        "ANYOFL",                       /* 0x14 */
-       "POSIXD",                       /* 0x15 */
-       "POSIXL",                       /* 0x16 */
-       "POSIXU",                       /* 0x17 */
-       "POSIXA",                       /* 0x18 */
-       "NPOSIXD",                      /* 0x19 */
-       "NPOSIXL",                      /* 0x1a */
-       "NPOSIXU",                      /* 0x1b */
-       "NPOSIXA",                      /* 0x1c */
-       "ASCII",                        /* 0x1d */
-       "NASCII",                       /* 0x1e */
-       "CLUMP",                        /* 0x1f */
-       "BRANCH",                       /* 0x20 */
-       "EXACT",                        /* 0x21 */
-       "EXACTL",                       /* 0x22 */
-       "EXACTF",                       /* 0x23 */
-       "EXACTFL",                      /* 0x24 */
-       "EXACTFU",                      /* 0x25 */
-       "EXACTFA",                      /* 0x26 */
-       "EXACTFU_SS",                   /* 0x27 */
-       "EXACTFLU8",                    /* 0x28 */
-       "EXACTFA_NO_TRIE",              /* 0x29 */
-       "NOTHING",                      /* 0x2a */
-       "TAIL",                         /* 0x2b */
-       "STAR",                         /* 0x2c */
-       "PLUS",                         /* 0x2d */
-       "CURLY",                        /* 0x2e */
-       "CURLYN",                       /* 0x2f */
-       "CURLYM",                       /* 0x30 */
-       "CURLYX",                       /* 0x31 */
-       "WHILEM",                       /* 0x32 */
-       "OPEN",                         /* 0x33 */
-       "CLOSE",                        /* 0x34 */
-       "SROPEN",                       /* 0x35 */
-       "SRCLOSE",                      /* 0x36 */
-       "REF",                          /* 0x37 */
-       "REFF",                         /* 0x38 */
-       "REFFL",                        /* 0x39 */
-       "REFFU",                        /* 0x3a */
-       "REFFA",                        /* 0x3b */
-       "NREF",                         /* 0x3c */
-       "NREFF",                        /* 0x3d */
-       "NREFFL",                       /* 0x3e */
-       "NREFFU",                       /* 0x3f */
-       "NREFFA",                       /* 0x40 */
-       "LONGJMP",                      /* 0x41 */
-       "BRANCHJ",                      /* 0x42 */
-       "IFMATCH",                      /* 0x43 */
-       "UNLESSM",                      /* 0x44 */
-       "SUSPEND",                      /* 0x45 */
-       "IFTHEN",                       /* 0x46 */
-       "GROUPP",                       /* 0x47 */
-       "EVAL",                         /* 0x48 */
-       "MINMOD",                       /* 0x49 */
-       "LOGICAL",                      /* 0x4a */
-       "RENUM",                        /* 0x4b */
-       "TRIE",                         /* 0x4c */
-       "TRIEC",                        /* 0x4d */
-       "AHOCORASICK",                  /* 0x4e */
-       "AHOCORASICKC",                 /* 0x4f */
-       "GOSUB",                        /* 0x50 */
-       "NGROUPP",                      /* 0x51 */
-       "INSUBP",                       /* 0x52 */
-       "DEFINEP",                      /* 0x53 */
-       "ENDLIKE",                      /* 0x54 */
-       "OPFAIL",                       /* 0x55 */
-       "ACCEPT",                       /* 0x56 */
-       "VERB",                         /* 0x57 */
-       "PRUNE",                        /* 0x58 */
-       "MARKPOINT",                    /* 0x59 */
-       "SKIP",                         /* 0x5a */
-       "COMMIT",                       /* 0x5b */
-       "CUTGROUP",                     /* 0x5c */
-       "KEEPS",                        /* 0x5d */
-       "LNBREAK",                      /* 0x5e */
-       "OPTIMIZED",                    /* 0x5f */
-       "PSEUDO",                       /* 0x60 */
+       "ANYOFM",                       /* 0x15 */
+       "POSIXD",                       /* 0x16 */
+       "POSIXL",                       /* 0x17 */
+       "POSIXU",                       /* 0x18 */
+       "POSIXA",                       /* 0x19 */
+       "NPOSIXD",                      /* 0x1a */
+       "NPOSIXL",                      /* 0x1b */
+       "NPOSIXU",                      /* 0x1c */
+       "NPOSIXA",                      /* 0x1d */
+       "ASCII",                        /* 0x1e */
+       "NASCII",                       /* 0x1f */
+       "CLUMP",                        /* 0x20 */
+       "BRANCH",                       /* 0x21 */
+       "EXACT",                        /* 0x22 */
+       "EXACTL",                       /* 0x23 */
+       "EXACTF",                       /* 0x24 */
+       "EXACTFL",                      /* 0x25 */
+       "EXACTFU",                      /* 0x26 */
+       "EXACTFA",                      /* 0x27 */
+       "EXACTFU_SS",                   /* 0x28 */
+       "EXACTFLU8",                    /* 0x29 */
+       "EXACTFA_NO_TRIE",              /* 0x2a */
+       "NOTHING",                      /* 0x2b */
+       "TAIL",                         /* 0x2c */
+       "STAR",                         /* 0x2d */
+       "PLUS",                         /* 0x2e */
+       "CURLY",                        /* 0x2f */
+       "CURLYN",                       /* 0x30 */
+       "CURLYM",                       /* 0x31 */
+       "CURLYX",                       /* 0x32 */
+       "WHILEM",                       /* 0x33 */
+       "OPEN",                         /* 0x34 */
+       "CLOSE",                        /* 0x35 */
+       "SROPEN",                       /* 0x36 */
+       "SRCLOSE",                      /* 0x37 */
+       "REF",                          /* 0x38 */
+       "REFF",                         /* 0x39 */
+       "REFFL",                        /* 0x3a */
+       "REFFU",                        /* 0x3b */
+       "REFFA",                        /* 0x3c */
+       "NREF",                         /* 0x3d */
+       "NREFF",                        /* 0x3e */
+       "NREFFL",                       /* 0x3f */
+       "NREFFU",                       /* 0x40 */
+       "NREFFA",                       /* 0x41 */
+       "LONGJMP",                      /* 0x42 */
+       "BRANCHJ",                      /* 0x43 */
+       "IFMATCH",                      /* 0x44 */
+       "UNLESSM",                      /* 0x45 */
+       "SUSPEND",                      /* 0x46 */
+       "IFTHEN",                       /* 0x47 */
+       "GROUPP",                       /* 0x48 */
+       "EVAL",                         /* 0x49 */
+       "MINMOD",                       /* 0x4a */
+       "LOGICAL",                      /* 0x4b */
+       "RENUM",                        /* 0x4c */
+       "TRIE",                         /* 0x4d */
+       "TRIEC",                        /* 0x4e */
+       "AHOCORASICK",                  /* 0x4f */
+       "AHOCORASICKC",                 /* 0x50 */
+       "GOSUB",                        /* 0x51 */
+       "NGROUPP",                      /* 0x52 */
+       "INSUBP",                       /* 0x53 */
+       "DEFINEP",                      /* 0x54 */
+       "ENDLIKE",                      /* 0x55 */
+       "OPFAIL",                       /* 0x56 */
+       "ACCEPT",                       /* 0x57 */
+       "VERB",                         /* 0x58 */
+       "PRUNE",                        /* 0x59 */
+       "MARKPOINT",                    /* 0x5a */
+       "SKIP",                         /* 0x5b */
+       "COMMIT",                       /* 0x5c */
+       "CUTGROUP",                     /* 0x5d */
+       "KEEPS",                        /* 0x5e */
+       "LNBREAK",                      /* 0x5f */
+       "OPTIMIZED",                    /* 0x60 */
+       "PSEUDO",                       /* 0x61 */
        /* ------------ States ------------- */
        "TRIE_next",                    /* REGNODE_MAX +0x01 */
        "TRIE_next_fail",               /* REGNODE_MAX +0x02 */
@@ -749,7 +754,7 @@ EXTCONST U8 PL_varies[] __attribute__deprecated__ = {
 EXTCONST U8 PL_varies_bitmask[];
 #else
 EXTCONST U8 PL_varies_bitmask[] = {
-    0x00, 0x00, 0x00, 0x80, 0x01, 0xF0, 0x87, 0xFF, 0x65, 0x00, 0x00, 0x00, 0x00
+    0x00, 0x00, 0x00, 0x00, 0x03, 0xE0, 0x0F, 0xFF, 0xCB, 0x00, 0x00, 0x00, 0x00
 };
 #endif /* DOINIT */
 
@@ -761,8 +766,8 @@ EXTCONST U8 PL_varies_bitmask[] = {
 EXTCONST U8 PL_simple[] __attribute__deprecated__;
 #else
 EXTCONST U8 PL_simple[] __attribute__deprecated__ = {
-    REG_ANY, SANY, ANYOF, ANYOFD, ANYOFL, POSIXD, POSIXL, POSIXU, POSIXA,
-    NPOSIXD, NPOSIXL, NPOSIXU, NPOSIXA, ASCII, NASCII,
+    REG_ANY, SANY, ANYOF, ANYOFD, ANYOFL, ANYOFM, POSIXD, POSIXL, POSIXU,
+    POSIXA, NPOSIXD, NPOSIXL, NPOSIXU, NPOSIXA, ASCII, NASCII,
     0
 };
 #endif /* DOINIT */
@@ -771,7 +776,7 @@ EXTCONST U8 PL_simple[] __attribute__deprecated__ = {
 EXTCONST U8 PL_simple_bitmask[];
 #else
 EXTCONST U8 PL_simple_bitmask[] = {
-    0x00, 0x00, 0xFF, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
 };
 #endif /* DOINIT */
 
diff --git a/t/re/anyof.t b/t/re/anyof.t
index d24e4a71a8..12fb9b3a8c 100644
--- a/t/re/anyof.t
+++ b/t/re/anyof.t
@@ -31,7 +31,7 @@ BEGIN {
 # skipped and not skipped.
 
 my @tests = (
-    '[[{]' => 'ANYOF[\[\{]',
+    '[[{]' => 'ANYOFM[\[\{]',
     '[^\S ]' => 'ANYOFD[\t\n\x0B\f\r{utf8}\x85\xA0][1680 2000-200A 2028-2029 202F 205F 3000]',
     '[^\n\r]' => 'ANYOF[^\n\r][0100-INFINITY]',
     '[^\/\|,\$\%%\@\ \%"\<\>\:\#\&\*\{\}\[\]\(\)]' => 'ANYOF[^ "#$%&()*,/:<>@\[\]\{|\}][0100-INFINITY]',

-- 
Perl5 Master Repository

Reply via email to