In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/040aea3a0d449d98f0f858032aa9eba11c90d19d?hp=f53580fec42f3b12264ee27b756dec257c0bb77a>
- Log ----------------------------------------------------------------- commit 040aea3a0d449d98f0f858032aa9eba11c90d19d Author: Karl Williamson <[email protected]> Date: Thu Dec 16 08:44:59 2010 -0700 regexec.c: white-space only Commit 9e2c615305806d76433db342e5659ffeccc3746a didn't adjust the white space for the changes that it introduced. This patch does that. M regexec.c commit 78969a9808d226f8ac2f0c992aa6a67140a56ea6 Author: Karl Williamson <[email protected]> Date: Wed Dec 15 19:22:37 2010 -0700 regexec.c: Remove unreached code The new name of ANYOF_LOC_NONBITMAP_FOLD makes it clear that the only way folding can be applicable here is if it is under locale. M regexec.c commit 390656605358790e356331349a2f922ae36ae5df Author: Karl Williamson <[email protected]> Date: Wed Dec 15 18:34:59 2010 -0700 Change name of regex internal macro to new meaning ANYOF_FOLD is now used only under fewer conditions. Otherwise the bitmap of characters 0-255 is fully calculated with the folds, and the flag is not set. One condition is under locale, where the folds aren't known at compile time; the other is for things accessible through a swash. By changing the name to its new meaning, certain optimizations become more obvious. 
M regcomp.c M regcomp.h M regexec.c M utf8.h ----------------------------------------------------------------------- Summary of changes: regcomp.c | 34 +++++++++++----------- regcomp.h | 15 ++++++++-- regexec.c | 93 +++++++++++++++++++++++++++++-------------------------------- utf8.h | 2 +- 4 files changed, 74 insertions(+), 70 deletions(-) diff --git a/regcomp.c b/regcomp.c index 122c560..d52bf13 100644 --- a/regcomp.c +++ b/regcomp.c @@ -710,7 +710,7 @@ S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *c cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL; if (LOC) cl->flags |= ANYOF_LOCALE; - cl->flags |= ANYOF_FOLD; + cl->flags |= ANYOF_LOC_NONBITMAP_FOLD; } /* Can match anything (initialization) */ @@ -767,8 +767,8 @@ S_cl_and(struct regnode_charclass_class *cl, if (!(ANYOF_CLASS_TEST_ANY_SET(and_with)) && !(ANYOF_CLASS_TEST_ANY_SET(cl)) && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && !(and_with->flags & ANYOF_FOLD) - && !(cl->flags & ANYOF_FOLD)) { + && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD) + && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) { int i; if (and_with->flags & ANYOF_INVERT) @@ -781,8 +781,8 @@ S_cl_and(struct regnode_charclass_class *cl, if (!(and_with->flags & ANYOF_EOS)) cl->flags &= ~ANYOF_EOS; - if (!(and_with->flags & ANYOF_FOLD)) - cl->flags &= ~ANYOF_FOLD; + if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)) + cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD; if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP && !(and_with->flags & ANYOF_INVERT)) { @@ -818,8 +818,8 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i')) */ if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && !(or_with->flags & ANYOF_FOLD) - && !(cl->flags & ANYOF_FOLD) ) { + && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD) + && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) { int i; for (i = 0; i < ANYOF_BITMAP_SIZE; i++) @@ -831,8 +831,8 
@@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con } else { /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */ if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && (!(or_with->flags & ANYOF_FOLD) - || (cl->flags & ANYOF_FOLD)) ) { + && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD) + || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) { int i; /* OR char bitmap and class bitmap separately */ @@ -851,8 +851,8 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con if (or_with->flags & ANYOF_EOS) cl->flags |= ANYOF_EOS; - if (or_with->flags & ANYOF_FOLD) - cl->flags |= ANYOF_FOLD; + if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD) + cl->flags |= ANYOF_LOC_NONBITMAP_FOLD; /* If both nodes match something outside the bitmap, but what they match * outside is not the same pointer, and hence not easily compared, give up @@ -3085,7 +3085,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (uc >= 0x100 || (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) && !ANYOF_BITMAP_TEST(data->start_class, uc) - && (!(data->start_class->flags & ANYOF_FOLD) + && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc]))) ) compat = 0; @@ -3140,7 +3140,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (compat) { ANYOF_BITMAP_SET(data->start_class, uc); data->start_class->flags &= ~ANYOF_EOS; - data->start_class->flags |= ANYOF_FOLD; + data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD; if (OP(scan) == EXACTFL) { data->start_class->flags |= ANYOF_LOCALE; } @@ -3155,7 +3155,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } else if (flags & SCF_DO_STCLASS_OR) { - if (data->start_class->flags & ANYOF_FOLD) { + if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) { /* false positive possible if the class is case-folded. Assume that the locale settings are the same... 
*/ if (uc < 0x100) { @@ -8913,7 +8913,7 @@ parseit: * which we have to wait to see what folding is in effect at runtime, and * for things not in the bitmap */ if (FOLD && (LOC || ANYOF_FLAGS(ret) & ANYOF_NONBITMAP)) { - ANYOF_FLAGS(ret) |= ANYOF_FOLD; + ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD; } /* Optimize inverted simple patterns (e.g. [^a-z]). Note that this doesn't @@ -8974,7 +8974,7 @@ parseit: /* A locale node with one point can be folded; all the other cases * with folding will have two points, since we calculate them above */ - if (ANYOF_FLAGS(ret) & ANYOF_FOLD) { + if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) { op = EXACTFL; } else { @@ -9745,7 +9745,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) if (flags & ANYOF_LOCALE) sv_catpvs(sv, "{loc}"); - if (flags & ANYOF_FOLD) + if (flags & ANYOF_LOC_NONBITMAP_FOLD) sv_catpvs(sv, "{i}"); Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); if (flags & ANYOF_INVERT) diff --git a/regcomp.h b/regcomp.h index 00fd945..3e87aa9 100644 --- a/regcomp.h +++ b/regcomp.h @@ -314,9 +314,18 @@ struct regnode_charclass_class { /* Flags for node->flags of ANYOF */ -#define ANYOF_LOCALE 0x01 -#define ANYOF_FOLD 0x02 -#define ANYOF_INVERT 0x04 +#define ANYOF_LOCALE 0x01 + +/* The fold is calculated and stored in the bitmap where possible at compile + * time. However there are two cases where it isn't possible. These share + * this bit: 1) under locale, where the actual folding varies depending on + * what the locale is at the time of execution; and 2) where the folding is + * specified in a swash, not the bitmap, such as characters which aren't + * specified in the bitmap, or properties that aren't looked at at compile time + */ +#define ANYOF_LOC_NONBITMAP_FOLD 0x02 + +#define ANYOF_INVERT 0x04 /* CLASS is never set unless LOCALE is too: has runtime \d, \w, [:posix:], ... 
*/ #define ANYOF_CLASS 0x08 diff --git a/regexec.c b/regexec.c index 7778992..512be67 100644 --- a/regexec.c +++ b/regexec.c @@ -6339,56 +6339,50 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, if (c < 256) { if (ANYOF_BITMAP_TEST(n, c)) match = TRUE; - else if (flags & ANYOF_FOLD) { - U8 f; - if (flags & ANYOF_LOCALE) { - PL_reg_flags |= RF_tainted; - f = PL_fold_locale[c]; - } - else - f = PL_fold[c]; - if (f != c && ANYOF_BITMAP_TEST(n, f)) - match = TRUE; - } - - if (!match && ANYOF_CLASS_TEST_ANY_SET(n)) { - PL_reg_flags |= RF_tainted; /* CLASS implies LOCALE */ - if ( - (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && 
!isUPPER_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c)) - ) /* How's that for a conditional? */ + else if (flags & ANYOF_LOCALE) { + PL_reg_flags |= RF_tainted; + + if ((flags & ANYOF_LOC_NONBITMAP_FOLD) + && ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) { match = TRUE; } + else if (ANYOF_CLASS_TEST_ANY_SET(n) && + ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) || 
+ (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c)) + ) /* How's that for a conditional? */ + ) { + match = TRUE; + } } } @@ -6397,8 +6391,9 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, if (!match) { if (utf8_target && (flags & ANYOF_UNICODE_ALL)) { if (c >= 256 - || ((flags & ANYOF_FOLD) /* Latin1 1 that has a non-Latin1 fold - should match */ + || ((flags & ANYOF_LOC_NONBITMAP_FOLD) /* Latin1 1 that has a + non-Latin1 fold + should match */ && _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c))) { match = TRUE; @@ -6420,7 +6415,7 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, } if (swash_fetch(sw, utf8_p, 1)) match = TRUE; - else if (flags & ANYOF_FOLD) { + else if (flags & ANYOF_LOC_NONBITMAP_FOLD) { if (!match && lenp && av) { I32 i; for (i = 0; i <= av_len(av); i++) { diff --git a/utf8.h b/utf8.h index 405b8b4..a162114 100644 --- a/utf8.h +++ b/utf8.h @@ -291,7 +291,7 @@ Perl's extended UTF-8 means we can have start bytes up to FF. #define ANYOF_FOLD_SHARP_S(node, input, end) \ (ANYOF_BITMAP_TEST(node, LATIN_SMALL_LETTER_SHARP_S) && \ (ANYOF_FLAGS(node) & ANYOF_NONBITMAP) && \ - (ANYOF_FLAGS(node) & ANYOF_FOLD) && \ + (ANYOF_FLAGS(node) & ANYOF_LOC_NONBITMAP_FOLD) && \ ((end) > (input) + 1) && \ toLOWER((input)[0]) == 's' && \ toLOWER((input)[1]) == 's') -- Perl5 Master Repository
