In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/bc1644f1ea23213883e5ccb511a8df05762b7e39?hp=d1666a42c346bc434b8bf06c75a90c0ba3681dd0>
- Log ----------------------------------------------------------------- commit bc1644f1ea23213883e5ccb511a8df05762b7e39 Author: Karl Williamson <[email protected]> Date: Thu Jan 15 20:13:45 2015 -0700 regcomp.c: White-space only Indent inside a newly formed block M regcomp.c commit 7475a24fb1bcc3d031d46c3a83616671479634f5 Author: Karl Williamson <[email protected]> Date: Thu Jan 15 20:03:09 2015 -0700 regcomp.c: Fix bug in /[A-Z]/i This also fixes /[a-z]/i. When not under /i, these two ranges alone in a bracketed character class can be optimized into qr/[[:upper:]]/a and qr/[[:lower:]]/a respectively. This optimization saves space in the pattern (as no bitmap is needed), and I think it executes faster. But this optimization has to be forgone under /i (unless /a is also present) because otherwise certain non-ASCII characters such as the \N{KELVIN SIGN} don't match, and they should. M regcomp.c M t/re/re_tests commit 1cc8089a1939c02111513be3f1f49631ccb84757 Author: Karl Williamson <[email protected]> Date: Thu Jan 15 19:40:24 2015 -0700 regcomp.c: Fix comment M regcomp.c commit 92811740b3931484900a149574c1babaa3cbca16 Author: Karl Williamson <[email protected]> Date: Thu Jan 15 18:52:35 2015 -0700 regcomp.c: Improve generated code for some [:posix:] classes For regexes compiled not under /l, [:posix:] classes in general require special handling, as 1) under /d they can match two different sets of code points, depending on whether or not the target string is in UTF-8; and 2) under /a, the matches above the ASCII range need to be filtered out. But note that nothing special is needed for /u, and for [:ascii:], there is nothing to be filtered out, and the UTF-8ness of the target doesn't matter either. And since [:digit:] and [:xdigit:] don't have matches in the upper Latin1 range, under /d the UTF-8ness doesn't matter for them either. This commit skips the special handling for the above conditions. 
This leads to less work during pattern compilation, and can also mean faster run-time code, because function calls that would otherwise be made are skipped. M regcomp.c commit db5a595d4367084062eac3c7e0dab9e0a1c1d51d Author: Karl Williamson <[email protected]> Date: Thu Jan 15 18:35:40 2015 -0700 regcomp.c: Rmv unneeded temporary M regcomp.c commit 4182a68d9d329b7e832d7cd002f1d5461a9029c2 Author: Karl Williamson <[email protected]> Date: Thu Jan 15 11:32:22 2015 -0700 Fix regex pattern dump of ANYOF nodes An ANYOF node is used to handle bracketed character classes and Unicode properties. It is implemented in part as a bit vector of code points that match for smallish ordinals, plus other means to specify larger code points, or code points that match only under certain conditions. For example, under /d, whether [\w] matches word characters in the upper Latin1 range depends on whether the target string is in UTF-8 or not, so the characters which are conditionally matched are saved outside the bitmap. For performance, there is a flag that gets set if all code points too large for the bitmap match. This is a common occurrence for complemented classes, such as [^a-z]. Prior to this commit, if this flag was set, those things that are conditionally matched were not output when dumping the regex pattern (under the command line option -Dr, for example). 
M regcomp.c ----------------------------------------------------------------------- Summary of changes: regcomp.c | 71 ++++++++++++++++++++++++++++++++++++++++++----------------- t/re/re_tests | 4 ++++ 2 files changed, 55 insertions(+), 20 deletions(-) diff --git a/regcomp.c b/regcomp.c index 0b2abf7..b62c30d 100644 --- a/regcomp.c +++ b/regcomp.c @@ -13769,6 +13769,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, separate for a while from the non-complemented versions because of complications with /d matching */ + SV* simple_posixes = NULL; /* But under some conditions, the classes can be + treated more simply than the general case, + leading to less compilation and execution + work */ UV element_count = 0; /* Number of distinct elements in the class. Optimizations may be possible if this is tiny */ AV * multi_char_matches = NULL; /* Code points that fold to more than one @@ -14458,15 +14462,33 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, &cp_list); } } - else { /* Garden variety class. If is NASCII, NDIGIT, ... + else if (UNI_SEMANTICS + || classnum == _CC_ASCII + || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT + || classnum == _CC_XDIGIT))) + { + /* We usually have to worry about /d and /a affecting what + * POSIX classes match, with special code needed for /d + * because we won't know until runtime what all matches. + * But there is no extra work needed under /u, and + * [:ascii:] is unaffected by /a and /d; and :digit: and + * :xdigit: don't have runtime differences under /d. So we + * can special case these, and avoid some extra work below, + * and at runtime. */ + _invlist_union_maybe_complement_2nd( + simple_posixes, + PL_XPosix_ptrs[classnum], + namedclass % 2 != 0, + &simple_posixes); + } + else { /* Garden variety class. If is NUPPER, NALPHA, ... complement and use nposixes */ SV** posixes_ptr = namedclass % 2 == 0 ? 
&posixes : &nposixes; - SV** source_ptr = &PL_XPosix_ptrs[classnum]; _invlist_union_maybe_complement_2nd( *posixes_ptr, - *source_ptr, + PL_XPosix_ptrs[classnum], namedclass % 2 != 0, posixes_ptr); } @@ -14873,24 +14895,29 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, op = POSIXA; } } - else if (prevvalue == 'A') { - if (value == 'Z' + else if (AT_LEAST_ASCII_RESTRICTED || ! FOLD) { + /* We can optimize A-Z or a-z, but not if they could match + * something like the KELVIN SIGN under /i (/a means they + * can't) */ + if (prevvalue == 'A') { + if (value == 'Z' #ifdef EBCDIC - && literal_endpoint == 2 + && literal_endpoint == 2 #endif - ) { - arg = (FOLD) ? _CC_ALPHA : _CC_UPPER; - op = POSIXA; + ) { + arg = (FOLD) ? _CC_ALPHA : _CC_UPPER; + op = POSIXA; + } } - } - else if (prevvalue == 'a') { - if (value == 'z' + else if (prevvalue == 'a') { + if (value == 'z' #ifdef EBCDIC - && literal_endpoint == 2 + && literal_endpoint == 2 #endif - ) { - arg = (FOLD) ? _CC_ALPHA : _CC_LOWER; - op = POSIXA; + ) { + arg = (FOLD) ? _CC_ALPHA : _CC_LOWER; + op = POSIXA; + } } } } @@ -14944,6 +14971,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, SvREFCNT_dec(posixes); SvREFCNT_dec(nposixes); + SvREFCNT_dec(simple_posixes); SvREFCNT_dec(cp_list); SvREFCNT_dec(cp_foldable_list); return ret; @@ -15101,6 +15129,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * classes. 
The lists are kept separate up to now because we don't want to * fold the classes (folding of those is automatically handled by the swash * fetching code) */ + if (simple_posixes) { + _invlist_union(cp_list, simple_posixes, &cp_list); + SvREFCNT_dec_NN(simple_posixes); + } if (posixes || nposixes) { if (posixes && AT_LEAST_ASCII_RESTRICTED) { /* Under /a and /aa, nothing above ASCII matches these */ @@ -16485,13 +16517,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ sv_catpvs(sv, "{non-utf8-latin1-all}"); } - /* output information about the unicode matching */ if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) sv_catpvs(sv, "{above_bitmap_all}"); - else if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) { + + if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) { SV *lv; /* Set if there is something outside the bit map. */ - bool byte_output = FALSE; /* If something in the bitmap has - been output */ + bool byte_output = FALSE; /* If something has been output */ SV *only_utf8_locale; /* Get the stuff that wasn't in the bitmap. 'bitmap_invlist' diff --git a/t/re/re_tests b/t/re/re_tests index dcac974..ce8d0cf 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -1898,5 +1898,9 @@ A+(*PRUNE)BC(?{}) AAABC y $& AAABC /( (?&solution) | % ) \Z (?(DEFINE) (?<solution>7\%\ solution) )/x 7% solution y $1 7% solution # [perl #122890] (.)(?{$~=$^N}) \x{100} y $~ \x{100} # [perl #123135] +# pat string y/n/etc expr expected-expr skip-reason comment +/[a-z]/i \N{KELVIN SIGN} y $& \N{KELVIN SIGN} +/[A-Z]/i \N{LATIN SMALL LETTER LONG S} y $& \N{LATIN SMALL LETTER LONG S} + # Keep these lines at the end of the file # vim: softtabstop=0 noexpandtab -- Perl5 Master Repository
