In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/b35552de5cea8eb47ccb046284ecb9a099430255?hp=ba27e2b14916d249d8db56bdf41409034871a54a>
- Log ----------------------------------------------------------------- commit b35552de5cea8eb47ccb046284ecb9a099430255 Author: Karl Williamson <[email protected]> Date: Mon Sep 22 13:59:39 2014 -0600 Tighten uses of regex synthetic start class A synthetic start class (SSC) is generated by the regular expression pattern compiler to give a consolidation of all the possible things that can match at the beginning of where a pattern can possibly match. For example qr/a?bfoo/; requires the match to begin with either an 'a' or a 'b'. There are no other possibilities. We can set things up to quickly scan for either of these in the target string, and only when one of these is found do we need to look for 'foo'. There is an overhead associated with using SSCs. If the number of possibilities that the SSC excludes is relatively small, it can be counter-productive to use them. This patch creates a crude sieve to decide whether to use an SSC or not. If the SSC doesn't exclude at least half the "likely" possiblities, it is discarded. This patch is a starting point, and can be refined if necessary as we gain experience. See thread beginning with http://nntp.perl.org/group/perl.perl5.porters/212644 In many patterns, no SSC is generated; and with the advent of tries, SSC's have become less important, so whatever we do is not terribly critical. M embed.fnc M embed.h M ext/re/t/regop.t M proto.h M regcomp.c M regen/unicode_constants.pl M t/re/pat.t M unicode_constants.h commit dea37815c59831b7e586fa51968348fbb8009e1a Author: Karl Williamson <[email protected]> Date: Tue Sep 23 16:54:34 2014 -0600 regcomp.c: Move macro definition earlier in file This is to prepare it to be used in earlier places than it is now. M regcomp.c commit f17de6c9c06da0b13fe4faadfb3b9aeb778ad0cd Author: Karl Williamson <[email protected]> Date: Sat Sep 20 10:20:11 2014 -0600 regcomp.c: Use strnEQ instead of series of tests At some point it is more efficient and certainly clearer to call a library function to do a strcmp than to test each individual byte. Based on irc discussion with the original coder of this, I changed this sequence of 6 comparisions to a strnEQ. At the same time, this adds an buffer overlflow check. I was unable to cause the previous code to overflow, but I believe it was possible. And this changes the magic number 6 in the code to a mnemonic giving its meaning. M regcomp.c ----------------------------------------------------------------------- Summary of changes: embed.fnc | 2 ++ embed.h | 1 + ext/re/t/regop.t | 5 +-- proto.h | 6 ++++ regcomp.c | 89 ++++++++++++++++++++++++++++++++++++++-------- regen/unicode_constants.pl | 15 ++++++++ t/re/pat.t | 2 +- unicode_constants.h | 3 ++ 8 files changed, 104 insertions(+), 19 deletions(-) diff --git a/embed.fnc b/embed.fnc index ee5f115..5fa38e8 100644 --- a/embed.fnc +++ b/embed.fnc @@ -2180,6 +2180,8 @@ Ei |void |ssc_add_range |NN regnode_ssc *ssc \ Ei |void |ssc_cp_and |NN regnode_ssc *ssc \ |UV const cp Ein |void |ssc_clear_locale|NN regnode_ssc *ssc +Ens |bool |is_ssc_worth_it|NN const RExC_state_t * pRExC_state \ + |NN const regnode_ssc * ssc Es |void |ssc_finalize |NN RExC_state_t *pRExC_state \ |NN regnode_ssc *ssc Es |SSize_t|study_chunk |NN RExC_state_t *pRExC_state \ diff --git a/embed.h b/embed.h index e0df072..1fe7076 100644 --- a/embed.h +++ b/embed.h @@ -971,6 +971,7 @@ #define invlist_set_len(a,b,c) S_invlist_set_len(aTHX_ a,b,c) #define invlist_set_previous_index S_invlist_set_previous_index #define invlist_trim S_invlist_trim +#define is_ssc_worth_it S_is_ssc_worth_it #define join_exact(a,b,c,d,e,f,g) S_join_exact(aTHX_ a,b,c,d,e,f,g) #define make_trie(a,b,c,d,e,f,g,h) S_make_trie(aTHX_ a,b,c,d,e,f,g,h) #define nextchar(a) S_nextchar(aTHX_ a) diff --git a/ext/re/t/regop.t b/ext/re/t/regop.t index 6397d4e..60e4c02 100644 --- a/ext/re/t/regop.t +++ b/ext/re/t/regop.t @@ -261,7 +261,6 @@ Offsets: [3] Freeing REx: "[q]" --- #Compiling REx "^(\S{1,9}):\s*(\d+)$" -#synthetic stclass "ANYOF[\x{00}-\x{06}\a\b\x{0E}-\x{1F}\x{21}-\x{FF}][{utf8}0100-167F 1681-1FFF 200B-2027 202A-202E 2030-205E 2060-2FFF 3001-INFINITY]". #Final program: # 1: SBOL (2) # 2: OPEN1 (4) @@ -277,11 +276,9 @@ Freeing REx: "[q]" # 17: CLOSE2 (19) # 19: EOL (20) # 20: END (0) -#floating ":" at 1..9 (checking floating) stclass ANYOF[\x{00}-\x{06}\a\b\x{0E}-\x{1F}\x{21}-\x{FF}][{utf8}0100-167F 1681-1FFF 200B-2027 202A-202E 2030-205E 2060-2FFF 3001-INFINITY] anchored(SBOL) mi ... [6 chars truncated] #Freeing REx: "^(\S{1,9}):\s*(\d+)$" -floating ":" at 1..9 (checking floating) stclass ANYOF[\x{00}-\x{06}\a\b\x{0E}-\x{1F}\x{21}-\x{FF}][{utf8}0100-167F 1681-1FFF 200B-2027 202A-202E 2030-205E 2060-2FFF 3001-INFINITY] anchored(SBOL) min ... [5 chars truncated] %MATCHED% -synthetic stclass +Freeing REx: "^(\S{1,9}):\s*(\d+)$" --- #Compiling REx "(?(DEFINE)(?<foo>foo))(?(DEFINE)(?<bar>(?&foo)bar))(?(DEFINE"... #Got 532 bytes for offset annotations. diff --git a/proto.h b/proto.h index b4ab3df..4f36b27 100644 --- a/proto.h +++ b/proto.h @@ -6872,6 +6872,12 @@ PERL_STATIC_INLINE void S_invlist_trim(SV* const invlist) #define PERL_ARGS_ASSERT_INVLIST_TRIM \ assert(invlist) +STATIC bool S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc) + __attribute__nonnull__(1) + __attribute__nonnull__(2); +#define PERL_ARGS_ASSERT_IS_SSC_WORTH_IT \ + assert(pRExC_state); assert(ssc) + STATIC U32 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *unfolded_multi_char, U32 flags, regnode *val, U32 depth) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2) diff --git a/regcomp.c b/regcomp.c index 049d6e7..c8df348 100644 --- a/regcomp.c +++ b/regcomp.c @@ -102,6 +102,9 @@ EXTERN_C const struct regexp_engine my_reg_engine; #define STATIC static #endif +#ifndef MIN +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#endif struct RExC_state_t { U32 flags; /* RXf_* are we folding, multilining? */ @@ -1441,6 +1444,71 @@ S_ssc_clear_locale(regnode_ssc *ssc) ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS; } +#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C + +STATIC bool +S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc) +{ + /* The synthetic start class is used to hopefully quickly winnow down + * places where a pattern could start a match in the target string. If it + * doesn't really narrow things down that much, there isn't much point to + * having the overhead of using it. This function uses some very crude + * heuristics to decide if to use the ssc or not. + * + * It returns TRUE if 'ssc' rules out more than half what it considers to + * be the "likely" possible matches, but of course it doesn't know what the + * actual things being matched are going to be; these are only guesses + * + * For /l matches, it assumes that the only likely matches are going to be + * in the 0-255 range, uniformly distributed, so half of that is 127 + * For /a and /d matches, it assumes that the likely matches will be just + * the ASCII range, so half of that is 63 + * For /u and there isn't anything matching above the Latin1 range, it + * assumes that that is the only range likely to be matched, and uses + * half that as the cut-off: 127. If anything matches above Latin1, + * it assumes that all of Unicode could match (uniformly), except for + * non-Unicode code points and things in the General Category "Other" + * (unassigned, private use, surrogates, controls and formats). This + * is a much large number. */ + + const U32 max_match = (LOC) + ? 127 + : (! UNI_SEMANTICS) + ? 63 + : (invlist_highest(ssc->invlist) < 256) + ? 127 + : ((NON_OTHER_COUNT + 1) / 2) - 1; + U32 count = 0; /* Running total of number of code points matched by + 'ssc' */ + UV start, end; /* Start and end points of current range in inversion + list */ + + PERL_ARGS_ASSERT_IS_SSC_WORTH_IT; + + invlist_iterinit(ssc->invlist); + while (invlist_iternext(ssc->invlist, &start, &end)) { + + /* /u is the only thing that we expect to match above 255; so if not /u + * and even if there are matches above 255, ignore them. This catches + * things like \d under /d which does match the digits above 255, but + * since the pattern is /d, it is not likely to be expecting them */ + if (! UNI_SEMANTICS) { + if (start > 255) { + break; + } + end = MIN(end, 255); + } + count += end - start + 1; + if (count > max_match) { + invlist_iterfinish(ssc->invlist); + return FALSE; + } + } + + return TRUE; +} + + STATIC void S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) { @@ -7069,7 +7137,7 @@ reStudy: if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) && stclass_flag && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING) - && !ssc_is_anything(data.start_class)) + && is_ssc_worth_it(pRExC_state, data.start_class)) { const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); @@ -7149,7 +7217,7 @@ reStudy: = r->float_substr = r->float_utf8 = NULL; if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING) - && ! ssc_is_anything(data.start_class)) + && is_ssc_worth_it(pRExC_state, data.start_class)) { const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); @@ -10036,6 +10104,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '(': /* (?(?{...})...) and (?(?=...)...) */ { int is_define= 0; + const int DEFINE_len = sizeof("DEFINE") - 1; if (RExC_parse[0] == '?') { /* (?(?...)) */ if (RExC_parse[1] == '=' || RExC_parse[1] == '!' || RExC_parse[1] == '<' @@ -10078,15 +10147,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret = reganode(pRExC_state,NGROUPP,num); goto insert_if_check_paren; } - else if (RExC_parse[0] == 'D' && - RExC_parse[1] == 'E' && - RExC_parse[2] == 'F' && - RExC_parse[3] == 'I' && - RExC_parse[4] == 'N' && - RExC_parse[5] == 'E') - { + else if (strnEQ(RExC_parse, "DEFINE", + MIN(DEFINE_len, RExC_end - RExC_parse))) + { ret = reganode(pRExC_state,DEFINEP,0); - RExC_parse +=6 ; + RExC_parse += DEFINE_len; is_define = 1; goto insert_if_check_paren; } @@ -16886,10 +16951,6 @@ S_put_code_point(pTHX_ SV *sv, UV c) #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C -#ifndef MIN -#define MIN(a,b) ((a) < (b) ? (a) : (b)) -#endif - STATIC void S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) { diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl index c81f767..936c1a8 100644 --- a/regen/unicode_constants.pl +++ b/regen/unicode_constants.pl @@ -155,7 +155,22 @@ foreach my $charset (get_supported_code_pages()) { printf $out_fh "# define MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C 0x%02X /* The max code point that isPRINT_A */\n", $max_PRINT_A; print $out_fh "\n" . get_conditional_compile_line_end(); + +} + +use Unicode::UCD 'prop_invlist'; + +my $count = 0; +my @other_invlist = prop_invlist("Other"); +for (my $i = 0; $i < @other_invlist; $i += 2) { + $count += ((defined $other_invlist[$i+1]) + ? $other_invlist[$i+1] + : 0x110000) + - $other_invlist[$i]; } +printf $out_fh "\n/* The number of code points not matching \\pC */\n" + . "#define NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C %d\n", + 0x110000 - $count; print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n"; diff --git a/t/re/pat.t b/t/re/pat.t index 926b67a..ac6bb3f 100644 --- a/t/re/pat.t +++ b/t/re/pat.t @@ -1495,7 +1495,7 @@ EOP qr/\d?c/d qr/\w?c/l qr/\s?c/a - qr/[[:alpha:]]?c/u + qr/[[:lower:]]?c/u )) { SKIP: { skip "no re-debug under miniperl" if is_miniperl; diff --git a/unicode_constants.h b/unicode_constants.h index 6cd8cc6..a7ddfeb 100644 --- a/unicode_constants.h +++ b/unicode_constants.h @@ -170,6 +170,9 @@ #endif /* EBCDIC POSIX-BC */ +/* The number of code points not matching \pC */ +#define NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C 112806 + #endif /* H_UNICODE_CONSTANTS */ /* ex: set ro: */ -- Perl5 Master Repository
