In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/8a776697be4d3151d33fc5fefefbc88f290a7c29?hp=1059e5b21f7e16d18b3df847d2e5d3f343f44f1c>
- Log ----------------------------------------------------------------- commit 8a776697be4d3151d33fc5fefefbc88f290a7c29 Author: Karl Williamson <[email protected]> Date: Sat Dec 19 11:22:04 2015 -0700 regcomp.c: Skip some work We can optimize ANYOF nodes that are equivalent to POSIX character classes. Discovering if they are equivalent takes work, which can be skipped with a simple test that will rule out many run-of-the-mill character classes. M regcomp.c commit d113e221247bf39615d19b490f672c4d47380294 Author: Karl Williamson <[email protected]> Date: Sat Dec 19 11:19:35 2015 -0700 regcomp.c: White space only Indent a section of code in preparation for the next commit which will make it into a block. M regcomp.c commit 0e7784f31c0885f80e7369182fdad8861ad9508a Author: Karl Williamson <[email protected]> Date: Sat Dec 19 11:14:07 2015 -0700 regcomp.c: Add comments M regcomp.c commit 13f7995862ecd2f387b4b447464429871c52b1a3 Author: Karl Williamson <[email protected]> Date: Sat Dec 19 09:49:00 2015 -0700 mktables: Add "$0:" to its first output So in a make, it is abundantly clear where the messages are coming from M charclass_invlists.h M lib/unicore/mktables M regcharclass.h ----------------------------------------------------------------------- Summary of changes: charclass_invlists.h | 2 +- lib/unicore/mktables | 4 +-- regcharclass.h | 2 +- regcomp.c | 78 ++++++++++++++++++++++++++++++++++++---------------- 4 files changed, 59 insertions(+), 27 deletions(-) diff --git a/charclass_invlists.h b/charclass_invlists.h index 91ef063..8a37ab6 100644 --- a/charclass_invlists.h +++ b/charclass_invlists.h @@ -99537,7 +99537,7 @@ static const UV XPosixXDigit_invlist[] = { /* for EBCDIC POSIX-BC */ * 1a0687fb9c6c4567e853913549df0944fe40821279a3e9cdaa6ab8679bc286fd lib/unicore/extracted/DLineBreak.txt * 40bcfed3ca727c19e1331f6c33806231d5f7eeeabd2e6a9e06a3740c85d0c250 lib/unicore/extracted/DNumType.txt * a18d502bad39d527ac5586d7bc93e29f565859e3bcc24ada627eff606d6f5fed 
lib/unicore/extracted/DNumValues.txt - * 4d44b51567e796f3021824c071a3d73fea1a664b59a064956be17850d976631e lib/unicore/mktables + * 8e23f7adafce8ef1aadbbb3f1e942c14f5d5c8318599cae7ed0ad555e60d4639 lib/unicore/mktables * 462c9aaa608fb2014cd9649af1c5c009485c60b9c8b15b89401fdc10cf6161c6 lib/unicore/version * 996abda3c0fbc2bfd575092af09e3b9b0331e624eb2e969a268457f8fd31ecbb regen/charset_translations.pl * 8a097f8f726bb1619af2f27f149ab87e60a1602f790147e3a561358be16abd27 regen/mk_invlists.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 57e05f3..be66780 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -19332,10 +19332,10 @@ if (! $rebuild) { } } if (! $rebuild) { - print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n"; + print "$0: Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n"; exit(0); } -print "Must rebuild tables.\n" if $verbosity >= $VERBOSE; +print "$0: Must rebuild tables.\n" if $verbosity >= $VERBOSE; # Ready to do the major processing. First create the perl pseudo-property. 
$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1); diff --git a/regcharclass.h b/regcharclass.h index 66064d1..30e9133 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -2514,7 +2514,7 @@ * 1a0687fb9c6c4567e853913549df0944fe40821279a3e9cdaa6ab8679bc286fd lib/unicore/extracted/DLineBreak.txt * 40bcfed3ca727c19e1331f6c33806231d5f7eeeabd2e6a9e06a3740c85d0c250 lib/unicore/extracted/DNumType.txt * a18d502bad39d527ac5586d7bc93e29f565859e3bcc24ada627eff606d6f5fed lib/unicore/extracted/DNumValues.txt - * 4d44b51567e796f3021824c071a3d73fea1a664b59a064956be17850d976631e lib/unicore/mktables + * 8e23f7adafce8ef1aadbbb3f1e942c14f5d5c8318599cae7ed0ad555e60d4639 lib/unicore/mktables * 462c9aaa608fb2014cd9649af1c5c009485c60b9c8b15b89401fdc10cf6161c6 lib/unicore/version * 996abda3c0fbc2bfd575092af09e3b9b0331e624eb2e969a268457f8fd31ecbb regen/charset_translations.pl * d9c04ac46bdd81bb3e26519f2b8eb6242cb12337205add3f7cf092b0c58dccc4 regen/regcharclass.pl diff --git a/regcomp.c b/regcomp.c index 7a028fd..f11fda1 100644 --- a/regcomp.c +++ b/regcomp.c @@ -3367,6 +3367,14 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *sour * The adjacent nodes actually may be separated by NOTHING-kind nodes, and * these get optimized out * + * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full + * as possible, even if that means splitting an existing node so that its first + * part is moved to the preceding node. This would maximise the efficiency of + * memEQ during matching. Elsewhere in this file, khw proposes splitting + * EXACTFish nodes into portions that don't change under folding vs those that + * do. Those portions that don't change may be the only things in the pattern that + * could be used to find fixed and floating strings. + * * If a node is to match under /i (folded), the number of characters it matches * can be different than its character length if it contains a multi-character * fold. 
*min_subtract is set to the total delta number of characters of the @@ -12360,7 +12368,12 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reparse: /* We look for the EXACTFish to EXACT node optimization only if - * folding. (And we don't need to figure this out until pass 2) */ + * folding. (And we don't need to figure this out until pass 2). + * XXX It might actually make sense to split the node into portions + * that are exact and ones that aren't, so that we could later use + * the exact ones to find the longest fixed and floating strings. + * One would want to join them back into a larger node. One could + * use a pseudo regnode like 'EXACT_ORIG_FOLD' */ maybe_exact = FOLD && PASS2; /* XXX The node can hold up to 255 bytes, yet this only goes to @@ -16127,7 +16140,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * adjacent such nodes. And if the class is equivalent to things like /./, * expensive run-time swashes can be avoided. Now that we have more * complete information, we can find things necessarily missed by the - * earlier code. */ + * earlier code. Another possible "optimization" that isn't done is that + * something like [Ee] could be changed into an EXACTFU. khw tried this + * and found that the ANYOF is faster, including for code points not in the + * bitmap. This still might make sense to do, provided it got joined with + * an adjacent node(s) to create a longer EXACTFU one. This could be + * accomplished by creating a pseudo ANYOF_EXACTFU node type that the join + * routine would know is joinable. If that didn't happen, the node type + * could then be made a straight ANYOF */ if (optimizable && cp_list && ! invert) { UV start, end; @@ -16216,36 +16236,48 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, invlist_iterfinish(cp_list); if (op == END) { + const UV cp_list_len = _invlist_len(cp_list); + const UV* cp_list_array = invlist_array(cp_list); /* Here, didn't find an optimization. 
See if this matches any of * the POSIX classes. These run slightly faster for above-Unicode * code points, so don't bother with POSIXA ones nor the 2 that - * have no above-Unicode matches */ - for (posix_class = 0; - posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC; - posix_class++) - { - int try_inverted; - if (posix_class == _CC_ASCII || posix_class == _CC_CNTRL) { - continue; - } - for (try_inverted = 0; try_inverted < 2; try_inverted++) { + * have no above-Unicode matches. We can avoid these checks unless + * the ANYOF matches at least as high as the lowest POSIX one + * (which was manually found to be \v. The actual code point may + * increase in later Unicode releases, if a higher code point is + * assigned to be \v, but this code will never break. It would + * just mean we could execute the checks for posix optimizations + * unnecessarily) */ + + if (cp_list_array[cp_list_len-1] > 0x2029) { + for (posix_class = 0; + posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC; + posix_class++) + { + int try_inverted; + if (posix_class == _CC_ASCII || posix_class == _CC_CNTRL) { + continue; + } + for (try_inverted = 0; try_inverted < 2; try_inverted++) { - /* Check if matches normal or inverted */ - if (_invlistEQ(cp_list, - PL_XPosix_ptrs[posix_class], - try_inverted)) - { - op = (try_inverted) - ? NPOSIXU - : POSIXU; - *flagp |= HASWIDTH|SIMPLE; - goto found_posix; + /* Check if matches normal or inverted */ + if (_invlistEQ(cp_list, + PL_XPosix_ptrs[posix_class], + try_inverted)) + { + op = (try_inverted) + ? NPOSIXU + : POSIXU; + *flagp |= HASWIDTH|SIMPLE; + goto found_posix; + } } } + found_posix: ; } - found_posix: ; } + if (op != END) { RExC_parse = (char *)orig_parse; RExC_emit = (regnode *)orig_emit; -- Perl5 Master Repository
