[perl.git] branch blead, updated. v5.21.4-186-gb35552d

Karl Williamson Mon, 29 Sep 2014 12:08:11 -0700

In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/b35552de5cea8eb47ccb046284ecb9a099430255?hp=ba27e2b14916d249d8db56bdf41409034871a54a>


- Log -----------------------------------------------------------------
commit b35552de5cea8eb47ccb046284ecb9a099430255
Author: Karl Williamson <[email protected]>
Date:   Mon Sep 22 13:59:39 2014 -0600

    Tighten uses of regex synthetic start class
    
    A synthetic start class (SSC) is generated by the regular expression
    pattern compiler to give a consolidation of all the possible things that
    can match at the beginning of where a pattern can possibly match.
    For example
        qr/a?bfoo/;
    requires the match to begin with either an 'a' or a 'b'.  There are no
    other possibilities.  We can set things up to quickly scan for either of
    these in the target string, and only when one of these is found do we
    need to look for 'foo'.
    
    There is an overhead associated with using SSCs.  If the number of
    possibilities that the SSC excludes is relatively small, it can be
    counter-productive to use them.
    
    This patch creates a crude sieve to decide whether to use an SSC or not.
    If the SSC doesn't exclude at least half the "likely" possiblities, it
    is discarded.  This patch is a starting point, and can be refined if
    necessary as we gain experience.
    
    See thread beginning with
    http://nntp.perl.org/group/perl.perl5.porters/212644
    
    In many patterns, no SSC is generated; and with the advent of tries,
    SSC's have become less important, so whatever we do is not terribly
    critical.

M       embed.fnc
M       embed.h
M       ext/re/t/regop.t
M       proto.h
M       regcomp.c
M       regen/unicode_constants.pl
M       t/re/pat.t
M       unicode_constants.h

commit dea37815c59831b7e586fa51968348fbb8009e1a
Author: Karl Williamson <[email protected]>
Date:   Tue Sep 23 16:54:34 2014 -0600

    regcomp.c: Move macro definition earlier in file
    
    This is to prepare it to be used in earlier places than it is now.

M       regcomp.c

commit f17de6c9c06da0b13fe4faadfb3b9aeb778ad0cd
Author: Karl Williamson <[email protected]>
Date:   Sat Sep 20 10:20:11 2014 -0600

    regcomp.c: Use strnEQ instead of series of tests
    
    At some point it is more efficient and certainly clearer to call a
    library function to do a strcmp than to test each individual byte.
    Based on irc discussion with the original coder of this, I changed this
    sequence of 6 comparisions to a strnEQ.  At the same time, this adds an
    buffer overlflow check.  I was unable to cause the previous code to
    overflow, but I believe it was possible.  And this changes the magic
    number 6 in the code to a mnemonic giving its meaning.

M       regcomp.c
-----------------------------------------------------------------------

Summary of changes:
 embed.fnc                  |  2 ++
 embed.h                    |  1 +
 ext/re/t/regop.t           |  5 +--
 proto.h                    |  6 ++++
 regcomp.c                  | 89 ++++++++++++++++++++++++++++++++++++++--------
 regen/unicode_constants.pl | 15 ++++++++
 t/re/pat.t                 |  2 +-
 unicode_constants.h        |  3 ++
 8 files changed, 104 insertions(+), 19 deletions(-)

diff --git a/embed.fnc b/embed.fnc
index ee5f115..5fa38e8 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -2180,6 +2180,8 @@ Ei        |void   |ssc_add_range  |NN regnode_ssc *ssc \
 Ei     |void   |ssc_cp_and     |NN regnode_ssc *ssc \
                                |UV const cp
 Ein    |void   |ssc_clear_locale|NN regnode_ssc *ssc
+Ens    |bool   |is_ssc_worth_it|NN const RExC_state_t * pRExC_state \
+                               |NN const regnode_ssc * ssc
 Es     |void   |ssc_finalize   |NN RExC_state_t *pRExC_state \
                                |NN regnode_ssc *ssc
 Es     |SSize_t|study_chunk    |NN RExC_state_t *pRExC_state \
diff --git a/embed.h b/embed.h
index e0df072..1fe7076 100644
--- a/embed.h
+++ b/embed.h
@@ -971,6 +971,7 @@
 #define invlist_set_len(a,b,c) S_invlist_set_len(aTHX_ a,b,c)
 #define invlist_set_previous_index     S_invlist_set_previous_index
 #define invlist_trim           S_invlist_trim
+#define is_ssc_worth_it                S_is_ssc_worth_it
 #define join_exact(a,b,c,d,e,f,g)      S_join_exact(aTHX_ a,b,c,d,e,f,g)
 #define make_trie(a,b,c,d,e,f,g,h)     S_make_trie(aTHX_ a,b,c,d,e,f,g,h)
 #define nextchar(a)            S_nextchar(aTHX_ a)
diff --git a/ext/re/t/regop.t b/ext/re/t/regop.t
index 6397d4e..60e4c02 100644
--- a/ext/re/t/regop.t
+++ b/ext/re/t/regop.t
@@ -261,7 +261,6 @@ Offsets: [3]
 Freeing REx: "[q]"
 ---
 #Compiling REx "^(\S{1,9}):\s*(\d+)$"
-#synthetic stclass 
"ANYOF[\x{00}-\x{06}\a\b\x{0E}-\x{1F}\x{21}-\x{FF}][{utf8}0100-167F 1681-1FFF 
200B-2027 202A-202E 2030-205E 2060-2FFF 3001-INFINITY]".
 #Final program:
 #   1: SBOL (2)
 #   2: OPEN1 (4)
@@ -277,11 +276,9 @@ Freeing REx: "[q]"
 #  17: CLOSE2 (19)
 #  19: EOL (20)
 #  20: END (0)
-#floating ":" at 1..9 (checking floating) stclass 
ANYOF[\x{00}-\x{06}\a\b\x{0E}-\x{1F}\x{21}-\x{FF}][{utf8}0100-167F 1681-1FFF 
200B-2027 202A-202E 2030-205E 2060-2FFF 3001-INFINITY] anchored(SBOL) mi ... [6 
chars truncated]
 #Freeing REx: "^(\S{1,9}):\s*(\d+)$"
-floating ":" at 1..9 (checking floating) stclass 
ANYOF[\x{00}-\x{06}\a\b\x{0E}-\x{1F}\x{21}-\x{FF}][{utf8}0100-167F 1681-1FFF 
200B-2027 202A-202E 2030-205E 2060-2FFF 3001-INFINITY] anchored(SBOL) min ... 
[5 chars truncated]
 %MATCHED%
-synthetic stclass
+Freeing REx: "^(\S{1,9}):\s*(\d+)$"
 ---
 #Compiling REx 
"(?(DEFINE)(?<foo>foo))(?(DEFINE)(?<bar>(?&foo)bar))(?(DEFINE"...
 #Got 532 bytes for offset annotations.
diff --git a/proto.h b/proto.h
index b4ab3df..4f36b27 100644
--- a/proto.h
+++ b/proto.h
@@ -6872,6 +6872,12 @@ PERL_STATIC_INLINE void  S_invlist_trim(SV* const 
invlist)
 #define PERL_ARGS_ASSERT_INVLIST_TRIM  \
        assert(invlist)
 
+STATIC bool    S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const 
regnode_ssc * ssc)
+                       __attribute__nonnull__(1)
+                       __attribute__nonnull__(2);
+#define PERL_ARGS_ASSERT_IS_SSC_WORTH_IT       \
+       assert(pRExC_state); assert(ssc)
+
 STATIC U32     S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV 
*min_subtract, bool *unfolded_multi_char, U32 flags, regnode *val, U32 depth)
                        __attribute__nonnull__(pTHX_1)
                        __attribute__nonnull__(pTHX_2)
diff --git a/regcomp.c b/regcomp.c
index 049d6e7..c8df348 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -102,6 +102,9 @@ EXTERN_C const struct regexp_engine my_reg_engine;
 #define        STATIC  static
 #endif
 
+#ifndef MIN
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#endif
 
 struct RExC_state_t {
     U32                flags;                  /* RXf_* are we folding, 
multilining? */
@@ -1441,6 +1444,71 @@ S_ssc_clear_locale(regnode_ssc *ssc)
     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
 }
 
+#define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
+
+STATIC bool
+S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
+{
+    /* The synthetic start class is used to hopefully quickly winnow down
+     * places where a pattern could start a match in the target string.  If it
+     * doesn't really narrow things down that much, there isn't much point to
+     * having the overhead of using it.  This function uses some very crude
+     * heuristics to decide if to use the ssc or not.
+     *
+     * It returns TRUE if 'ssc' rules out more than half what it considers to
+     * be the "likely" possible matches, but of course it doesn't know what the
+     * actual things being matched are going to be; these are only guesses
+     *
+     * For /l matches, it assumes that the only likely matches are going to be
+     *      in the 0-255 range, uniformly distributed, so half of that is 127
+     * For /a and /d matches, it assumes that the likely matches will be just
+     *      the ASCII range, so half of that is 63
+     * For /u and there isn't anything matching above the Latin1 range, it
+     *      assumes that that is the only range likely to be matched, and uses
+     *      half that as the cut-off: 127.  If anything matches above Latin1,
+     *      it assumes that all of Unicode could match (uniformly), except for
+     *      non-Unicode code points and things in the General Category "Other"
+     *      (unassigned, private use, surrogates, controls and formats).  This
+     *      is a much large number. */
+
+    const U32 max_match = (LOC)
+                          ? 127
+                          : (! UNI_SEMANTICS)
+                            ? 63
+                            : (invlist_highest(ssc->invlist) < 256)
+                              ? 127
+                              : ((NON_OTHER_COUNT + 1) / 2) - 1;
+    U32 count = 0;      /* Running total of number of code points matched by
+                           'ssc' */
+    UV start, end;      /* Start and end points of current range in inversion
+                           list */
+
+    PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
+
+    invlist_iterinit(ssc->invlist);
+    while (invlist_iternext(ssc->invlist, &start, &end)) {
+
+        /* /u is the only thing that we expect to match above 255; so if not /u
+         * and even if there are matches above 255, ignore them.  This catches
+         * things like \d under /d which does match the digits above 255, but
+         * since the pattern is /d, it is not likely to be expecting them */
+        if (! UNI_SEMANTICS) {
+            if (start > 255) {
+                break;
+            }
+            end = MIN(end, 255);
+        }
+        count += end - start + 1;
+        if (count > max_match) {
+            invlist_iterfinish(ssc->invlist);
+            return FALSE;
+        }
+    }
+
+    return TRUE;
+}
+
+
 STATIC void
 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
 {
@@ -7069,7 +7137,7 @@ reStudy:
        if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
            && stclass_flag
             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
-           && !ssc_is_anything(data.start_class))
+           && is_ssc_worth_it(pRExC_state, data.start_class))
        {
            const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
 
@@ -7149,7 +7217,7 @@ reStudy:
                = r->float_substr = r->float_utf8 = NULL;
 
         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
-            && ! ssc_is_anything(data.start_class))
+           && is_ssc_worth_it(pRExC_state, data.start_class))
         {
            const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
 
@@ -10036,6 +10104,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 
*flagp,U32 depth)
            case '(':           /* (?(?{...})...) and (?(?=...)...) */
            {
                int is_define= 0;
+                const int DEFINE_len = sizeof("DEFINE") - 1;
                if (RExC_parse[0] == '?') {        /* (?(?...)) */
                    if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
                        || RExC_parse[1] == '<'
@@ -10078,15 +10147,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 
*flagp,U32 depth)
                     ret = reganode(pRExC_state,NGROUPP,num);
                     goto insert_if_check_paren;
                }
-               else if (RExC_parse[0] == 'D' &&
-                        RExC_parse[1] == 'E' &&
-                        RExC_parse[2] == 'F' &&
-                        RExC_parse[3] == 'I' &&
-                        RExC_parse[4] == 'N' &&
-                        RExC_parse[5] == 'E')
-               {
+               else if (strnEQ(RExC_parse, "DEFINE",
+                                       MIN(DEFINE_len, RExC_end - RExC_parse)))
+                {
                    ret = reganode(pRExC_state,DEFINEP,0);
-                   RExC_parse +=6 ;
+                   RExC_parse += DEFINE_len;
                    is_define = 1;
                    goto insert_if_check_paren;
                }
@@ -16886,10 +16951,6 @@ S_put_code_point(pTHX_ SV *sv, UV c)
 
 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
 
-#ifndef MIN
-#define MIN(a,b) ((a) < (b) ? (a) : (b))
-#endif
-
 STATIC void
 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
 {
diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl
index c81f767..936c1a8 100644
--- a/regen/unicode_constants.pl
+++ b/regen/unicode_constants.pl
@@ -155,7 +155,22 @@ foreach my $charset (get_supported_code_pages()) {
     printf $out_fh "#   define MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C   
0x%02X   /* The max code point that isPRINT_A */\n", $max_PRINT_A;
 
     print $out_fh "\n" . get_conditional_compile_line_end();
+
+}
+
+use Unicode::UCD 'prop_invlist';
+
+my $count = 0;
+my @other_invlist = prop_invlist("Other");
+for (my $i = 0; $i < @other_invlist; $i += 2) {
+    $count += ((defined $other_invlist[$i+1])
+              ? $other_invlist[$i+1]
+              : 0x110000)
+              - $other_invlist[$i];
 }
+printf $out_fh "\n/* The number of code points not matching \\pC */\n"
+             . "#define NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C  %d\n",
+            0x110000 - $count;
 
 print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n";
 
diff --git a/t/re/pat.t b/t/re/pat.t
index 926b67a..ac6bb3f 100644
--- a/t/re/pat.t
+++ b/t/re/pat.t
@@ -1495,7 +1495,7 @@ EOP
                             qr/\d?c/d
                             qr/\w?c/l
                             qr/\s?c/a
-                            qr/[[:alpha:]]?c/u
+                            qr/[[:lower:]]?c/u
     )) {
       SKIP: {
         skip "no re-debug under miniperl" if is_miniperl;
diff --git a/unicode_constants.h b/unicode_constants.h
index 6cd8cc6..a7ddfeb 100644
--- a/unicode_constants.h
+++ b/unicode_constants.h
@@ -170,6 +170,9 @@
 
 #endif /* EBCDIC POSIX-BC */
 
+/* The number of code points not matching \pC */
+#define NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C  112806
+
 #endif /* H_UNICODE_CONSTANTS */
 
 /* ex: set ro: */

--
Perl5 Master Repository

[perl.git] branch blead, updated. v5.21.4-186-gb35552d

Reply via email to