In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/2786be71c69a3e244009b94145ca66f2326aadb9?hp=234242ca5bbd2e073a9d2c4dc7103a0dcea61a54>
- Log ----------------------------------------------------------------- commit 2786be71c69a3e244009b94145ca66f2326aadb9 Author: Karl Williamson <[email protected]> Date: Tue Dec 7 22:51:34 2010 -0700 regcomp.c: Clean up optimization for 1-char [] A single character character class can be optimized into an EXACT node. The changes elsewhere allow this to no longer be constrained to ASCII-only when the pattern isn't UTF-8. Also, the optimization shouldn't have happened for FOLDED characters, as explained in the comments, when they participate in multi-char folds; so that is removed. Also, a locale node with folded characters can be optimized. M regcomp.c commit 3a15e693385b1ab6186ad77f2fc208db1d0e05ea Author: Karl Williamson <[email protected]> Date: Tue Dec 7 17:04:02 2010 -0700 regcomp: Allow freeing up bit in ANYOF flags The flags field is fully used, and until the ANYOF node is split later in development, the CLASS bit will need to be freed up to give the space for other uses. This patch allows for this to easily be toggled. M regcomp.c M regcomp.h M regexec.c commit 40c7855679f2fac8010393d53040d01a1eecb050 Author: Karl Williamson <[email protected]> Date: Tue Dec 7 16:23:07 2010 -0700 regcomp.c: Move [] inversion optimization The optimization to do inversion a compile time is moved to earlier. This doesn't help today, but it may someday when we start keeping better track of Unicode characters, and is the more logical place for it. M regcomp.c commit fed836b3bfd7f84c2054616037d002570c7f407c Author: Karl Williamson <[email protected]> Date: Tue Dec 7 15:21:55 2010 -0700 regcomp.c: When inverting a [], adjust bit count When one complements every bit, the count of those that are set should be complemented as well. M regcomp.c commit f1d6d3cd22e129a9bf34f86ebe89183aad01d216 Author: Karl Williamson <[email protected]> Date: Tue Dec 7 15:16:07 2010 -0700 Subject: [PATCH] regcomp.c: adjust flag When something matches above Latin1, it should have the ANYOF_UTF8 bit set. M regcomp.c commit 30e9bc90ff255ffc2e220a3b10772597a8d8423a Author: Karl Williamson <[email protected]> Date: Tue Dec 7 15:13:29 2010 -0700 regcomp.c: Change constants for clarity. Oddly, it is clearer to use 0xFF as an exclusive-or target instead of an unrelated #define that happens to have that value. M regcomp.c commit a1f3213ba6ba505766fbd1a7636a29ffe462cfe9 Author: Karl Williamson <[email protected]> Date: Tue Dec 7 15:06:35 2010 -0700 regcomp.c: fix indent M regcomp.c commit b3a66ba06366992e90140792ae0fb1480093d249 Author: Karl Williamson <[email protected]> Date: Tue Dec 7 15:04:20 2010 -0700 regcomp.c: remove no longer needed test optimize_invert is no longer needed given the changes already made, as now if there is something not in the bitmap, a flag will be set, and the optimization doesn't take place unless the only flag is inversion. And, the bitmap is setup completely now for anything that doesn't have to be deferred to runtime, and such deferrals are marked with other flags. M regcomp.c commit ce1c68b24a719356e8e7724a2c8a963159f3c18d Author: Karl Williamson <[email protected]> Date: Tue Dec 7 14:50:13 2010 -0700 regcomp.c: isASCII doesn't match outside ANYOF bitmap So there is no need to tell regexec that it does, and then can combine two other statements M regcomp.c commit 41b0f1c13ae23011b37421cd90820bc780b74f30 Author: Karl Williamson <[email protected]> Date: Tue Dec 7 14:36:30 2010 -0700 regcomp.c: Fix compiler warning One smoke is warning about truncated results. This should fix that. It may be that other compilers will now complain, and we'll need to add casts, but I'm waiting to see. M regcomp.c commit b8e7c664739512d97c3d65f44d80a53754521f15 Author: Karl Williamson <[email protected]> Date: Mon Dec 6 15:11:59 2010 -0700 regcomp.c: Remove no longer necessary loop Recent changes to this cause the bitmap to be populated where possible with the all folding taken into consideration. Therefore, the FOLD flag isn't necessary except when that wasn't possible, and the loop that went through looking for folds is also not necessary, as the bitmap is now completely populated before it gets to where the loop was. M regcomp.c commit ffc130aa464ab42cfe8e8a19205f5b398675d98f Author: Karl Williamson <[email protected]> Date: Sat Dec 11 15:02:49 2010 -0700 regcomp.c: remove unncessary counting stored now contains the number of 1 bits in the ANYOF node, and is no longer needed to be arbitrarily set. Part of this is because there is now a flag if there is any match outside the bitmap, which prohibits optimization if so. M regcomp.c ----------------------------------------------------------------------- Summary of changes: regcomp.c | 147 +++++++++++++++++++++++++++++++++++++------------------------ regcomp.h | 30 ++++++++++--- regexec.c | 4 +- 3 files changed, 114 insertions(+), 67 deletions(-) diff --git a/regcomp.c b/regcomp.c index 34c4e95..66cadcf 100644 --- a/regcomp.c +++ b/regcomp.c @@ -3609,8 +3609,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, goto do_default; if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */ value = (ANYOF_BITMAP_TEST(data->start_class,'\n') - || ((data->start_class->flags & ANYOF_CLASS) - && ANYOF_CLASS_TEST_ANY_SET(data->start_class))); + || ANYOF_CLASS_TEST_ANY_SET(data->start_class)); cl_anything(pRExC_state, data->start_class); } if (flags & SCF_DO_STCLASS_AND || !value) @@ -8300,7 +8299,7 @@ S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 PERL_STATIC_INLINE U8 -S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U32 value) +S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value) { /* This inline function sets a bit in the bitmap if not already set, and if * appropriate, its fold, returning the number of bits that actually @@ -8344,12 +8343,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) bool need_class = 0; SV *listsv = NULL; UV n; - bool optimize_invert = TRUE; AV* unicode_alternate = NULL; #ifdef EBCDIC UV literal_endpoint = 0; #endif - UV stored = 0; /* 0, 1, or more than 1 chars stored in the class */ + UV stored = 0; /* how many chars stored in the bitmap */ regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in case we need to change the emitted regop to an EXACT. */ @@ -8378,14 +8376,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) if (SIZE_ONLY) { RExC_size += ANYOF_SKIP; +#ifdef ANYOF_ADD_LOC_SKIP + if (LOC) { + RExC_size += ANYOF_ADD_LOC_SKIP; + } +#endif listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */ } else { RExC_emit += ANYOF_SKIP; - if (FOLD) - ANYOF_FLAGS(ret) |= ANYOF_FOLD; - if (LOC) + if (LOC) { ANYOF_FLAGS(ret) |= ANYOF_LOCALE; +#ifdef ANYOF_ADD_LOC_SKIP + RExC_emit += ANYOF_ADD_LOC_SKIP; +#endif + } ANYOF_BITMAP_ZERO(ret); listsv = newSVpvs("# comment\n"); } @@ -8599,10 +8604,14 @@ parseit: if (LOC && namedclass < ANYOF_MAX && ! need_class) { need_class = 1; if (SIZE_ONLY) { +#ifdef ANYOF_CLASS_ADD_SKIP RExC_size += ANYOF_CLASS_ADD_SKIP; +#endif } else { +#ifdef ANYOF_CLASS_ADD_SKIP RExC_emit += ANYOF_CLASS_ADD_SKIP; +#endif ANYOF_CLASS_ZERO(ret); } ANYOF_FLAGS(ret) |= ANYOF_CLASS; @@ -8627,7 +8636,7 @@ parseit: else { ANYOF_FLAGS(ret) |= ANYOF_UTF8; Perl_sv_catpvf(aTHX_ listsv, - "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-'); + "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-'); } } @@ -8640,8 +8649,6 @@ parseit: const char *what = NULL; char yesno = 0; - if (namedclass > OOB_NAMEDCLASS) - optimize_invert = FALSE; /* Possible truncation here but in some 64-bit environments * the compiler gets heartburn about switch on 64-bit values. * A similar issue a little earlier when switching on value. @@ -8679,7 +8686,8 @@ parseit: S_set_regclass_bit(aTHX_ pRExC_state, ret, ASCII_TO_NATIVE(value)); } yesno = '+'; - what = "ASCII"; + what = NULL; /* Doesn't match outside ascii, so + don't want to add +utf8:: */ break; case ANYOF_NASCII: if (LOC) @@ -8729,13 +8737,9 @@ parseit: if (what) { /* Strings such as "+utf8::isWord\n" */ Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what); - } - stored+=2; /* can't optimize this class */ - - /* All but ASCII can match characters storable only in utf8 */ - if (namedclass != ANYOF_ASCII) { ANYOF_FLAGS(ret) |= ANYOF_UTF8; } + continue; } } /* end of namedclass \blah */ @@ -8773,7 +8777,6 @@ parseit: } /* now is the next time */ - /*stored += (value - prevvalue + 1);*/ if (!SIZE_ONLY) { if (prevvalue < 256) { const IV ceilvalue = value < 256 ? value : 255; @@ -8808,7 +8811,6 @@ parseit: if (value > 255 || UTF) { const UV prevnatvalue = NATIVE_TO_UNI(prevvalue); const UV natvalue = NATIVE_TO_UNI(value); - stored+=2; /* can't optimize this class */ /* If the code point requires utf8 to represent, and we are not * folding, it can't match unless the target is in utf8. Only @@ -8906,52 +8908,81 @@ parseit: return ret; /****** !SIZE_ONLY AFTER HERE *********/ - if( stored == 1 && (value < 128 || (value < 256 && !UTF)) - && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) ) - ) { - /* optimize single char class to an EXACT node but *only* when its not - * a UTF/high char. Note that the information needed to decide to do - * this optimization is not currently available until the 2nd pass, and - * that the actually used EXACT node takes less space than the - * calculated ANYOF node, and hence the amount of space calculated in - * the first pass is larger than actually used. Currently we don't - * keep track of enough information to do this for nodes which contain - * matches outside the bitmap */ + /* Folding in the bitmap is taken care of above, but not for locale, for + * which we have to wait to see what folding is in effect at runtime, and + * for things not in the bitmap */ + if (FOLD && (LOC || ANYOF_FLAGS(ret) & ANYOF_NONBITMAP)) { + ANYOF_FLAGS(ret) |= ANYOF_FOLD; + } + + /* Optimize inverted simple patterns (e.g. [^a-z]). Note that this doesn't + * optimize locale. Doing so perhaps could be done as long as there is + * nothing like \w in it; some thought also would have to be given to the + * interaction with above 0x100 chars */ + if ((ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) { + for (value = 0; value < ANYOF_BITMAP_SIZE; ++value) + ANYOF_BITMAP(ret)[value] ^= 0xFF; + stored = 256 - stored; + + /* The inversion means that everything above 255 is matched */ + ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL; + } + + /* A single character class can be "optimized" into an EXACTish node. + * Note that since we don't currently count how many characters there are + * outside the bitmap, we are XXX missing optimization possibilities for + * them. This optimization can't happen unless this is a truly single + * character class, which means that it can't be an inversion into a + * many-character class, and there must be no possibility of there being + * things outside the bitmap. 'stored' (only) for locales doesn't include + * \w, etc, so have to make a special test that they aren't present */ + if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL)) + && ((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE)) + || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))) + { + /* Note that the information needed to decide to do this optimization + * is not currently available until the 2nd pass, and that the actually + * used EXACT node takes less space than the calculated ANYOF node, and + * hence the amount of space calculated in the first pass is larger + * than actually used, so this optimization doesn't gain us any space. + * But an EXACT node is faster than an ANYOF node, and can be combined + * with any adjacent EXACT nodes later by the optimizer for further + * gains. */ + const char * cur_parse= RExC_parse; RExC_emit = (regnode *)orig_emit; RExC_parse = (char *)orig_parse; - ret = reg_node(pRExC_state, - (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT)); + + /* (A locale node can have 1 point and be folded; all the other folds + * will include the fold, hence will have 2 points, so we won't get + * here with FOLD set unless it is also locale) */ + ret = reg_node(pRExC_state, (U8) (! FOLD) + ? EXACT + : EXACTFL + ); RExC_parse = (char *)cur_parse; - *STRING(ret)= (char)value; - STR_LEN(ret)= 1; - RExC_emit += STR_SZ(1); + if (UTF && ! NATIVE_IS_INVARIANT(value)) { + *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value); + *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value); + STR_LEN(ret)= 2; + RExC_emit += STR_SZ(2); + } + else { + *STRING(ret)= (char)value; + STR_LEN(ret)= 1; + RExC_emit += STR_SZ(1); + } SvREFCNT_dec(listsv); return ret; - } - /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */ - if ( /* If the only flag is folding (plus possibly inversion). */ - ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD) - ) { - for (value = 0; value < 256; ++value) { - if (ANYOF_BITMAP_TEST(ret, value)) { - UV fold = PL_fold[value]; - if (fold != value) - ANYOF_BITMAP_SET(ret, fold); - } - } - ANYOF_FLAGS(ret) &= ~ANYOF_FOLD; + /* (A 2-character class of the very special form like [aA] could be + * optimized into an EXACTFish node, but only for non-locales, and for + * characters which only have the two folds; so things like 'fF' and + * 'Ii' wouldn't work because of the fold of 'LATIN SMALL LIGATURE FI'. + * Since we don't have that information currently conveniently + * available, skip the optimization) */ } - /* optimize inverted simple patterns (e.g. [^a-z]) */ - if (optimize_invert && - /* If the only flag is inversion. */ - (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) { - for (value = 0; value < ANYOF_BITMAP_SIZE; ++value) - ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL; - ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL; - } { AV * const av = newAV(); SV *rv; @@ -9712,8 +9743,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); - /* output any special charclass tests (used mostly under use locale) */ - if (o->flags & ANYOF_CLASS && ANYOF_CLASS_TEST_ANY_SET(o)) + /* output any special charclass tests (used entirely under use locale) */ + if (ANYOF_CLASS_TEST_ANY_SET(o)) for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) if (ANYOF_CLASS_TEST(o,i)) { sv_catpv(sv, anyofs[i]); diff --git a/regcomp.h b/regcomp.h index 6dc05f5..c140089 100644 --- a/regcomp.h +++ b/regcomp.h @@ -406,12 +406,6 @@ struct regnode_charclass_class { #define ANYOF_CLASS_CLEAR(p, c) (ANYOF_CLASS_BYTE(p, c) &= ~ANYOF_BIT(c)) #define ANYOF_CLASS_TEST(p, c) (ANYOF_CLASS_BYTE(p, c) & ANYOF_BIT(c)) -/* Quicker way to see if there are actually any tests. This is because - * currently the set of tests can be empty even when the class bitmap is - * allocated */ -#define ANYOF_CLASS_TEST_ANY_SET(p) /* assumes sizeof(p) = 4 */ \ - memNE (((struct regnode_charclass_class*)(p))->classflags, "0000", ANYOF_CLASS_SIZE) - #define ANYOF_CLASS_ZERO(ret) Zero(((struct regnode_charclass_class*)(ret))->classflags, ANYOF_CLASSBITMAP_SIZE, char) #define ANYOF_BITMAP_ZERO(ret) Zero(((struct regnode_charclass*)(ret))->bitmap, ANYOF_BITMAP_SIZE, char) @@ -431,7 +425,29 @@ struct regnode_charclass_class { #define ANYOF_SKIP ((ANYOF_SIZE - 1)/sizeof(regnode)) #define ANYOF_CLASS_SKIP ((ANYOF_CLASS_SIZE - 1)/sizeof(regnode)) -#define ANYOF_CLASS_ADD_SKIP (ANYOF_CLASS_SKIP - ANYOF_SKIP) + +/* The class bit can be set to the locale one if necessary to save bits at the + * expense of having locale ANYOF nodes always have a class bit map, and hence + * take up extra space. This allows convenient changing it as development + * proceeds on this */ +#if ANYOF_CLASS == ANYOF_LOCALE +# undef ANYOF_CLASS_ADD_SKIP +# define ANYOF_ADD_LOC_SKIP (ANYOF_CLASS_SKIP - ANYOF_SKIP) + + /* Quicker way to see if there are actually any tests. This is because + * currently the set of tests can be empty even when the class bitmap is + * allocated */ +# if ANYOF_CLASSBITMAP_SIZE != 4 +# error ANYOF_CLASSBITMAP_SIZE is expected to be 4 +# endif +# define ANYOF_CLASS_TEST_ANY_SET(p) /* assumes sizeof(p) = 4 */ \ + memNE (((struct regnode_charclass_class*)(p))->classflags, \ + "\0\0\0\0", ANYOF_CLASSBITMAP_SIZE) +#else +# define ANYOF_CLASS_ADD_SKIP (ANYOF_CLASS_SKIP - ANYOF_SKIP) +# undef ANYOF_ADD_LOC_SKIP +# define ANYOF_CLASS_TEST_ANY_SET(p) (ANYOF_FLAGS(p) & ANYOF_CLASS) +#endif /* diff --git a/regexec.c b/regexec.c index c1f1ae2..f2723e4 100644 --- a/regexec.c +++ b/regexec.c @@ -6356,8 +6356,8 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, match = TRUE; } - if (!match && (flags & ANYOF_CLASS) && ANYOF_CLASS_TEST_ANY_SET(n)) { - PL_reg_flags |= RF_tainted; + if (!match && ANYOF_CLASS_TEST_ANY_SET(n)) { + PL_reg_flags |= RF_tainted; /* CLASS implies LOCALE */ if ( (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) || (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) || -- Perl5 Master Repository
