In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/61c8799482f9e533904bfe832138c24064709f7d?hp=0f43fd573c94446b795d95875cb722dd3f61d1fd>
- Log ----------------------------------------------------------------- commit 61c8799482f9e533904bfe832138c24064709f7d Author: Karl Williamson <[email protected]> Date: Tue Oct 16 21:20:47 2012 -0600 regcomp.c: Silence compiler warning I didn't notice that I had introduced this warning until now. M regcomp.c commit 565fc1bb88638c2490cdab7a1055007f6b2d577c Author: Karl Williamson <[email protected]> Date: Tue Oct 16 12:09:04 2012 -0600 regex: \R can match either 1 or 2 chars Therefore it is not "simple", and should not be compiled as such, causing things like the test added herein to fail. M regcomp.c M regexec.c M t/re/re_tests commit 89fd9d0b0c00e75d034243e463ba1dcb825d9136 Author: Karl Williamson <[email protected]> Date: Tue Oct 16 11:12:22 2012 -0600 regcomp.c: Pass NULL instead of &dummy to function This saves the function from setting a throw-away value M regcomp.c commit 2db97d4168d61ce2b1f393bd6bba984467bb4bf9 Author: Karl Williamson <[email protected]> Date: Tue Oct 16 11:11:11 2012 -0600 regcomp.c, regexec.c: Comments only; no code changes M regcomp.c M regexec.c commit 49b95fadfe8c5c974cc7538ea3f53702cdf705ff Author: Karl Williamson <[email protected]> Date: Tue Oct 16 11:09:52 2012 -0600 regexec.c: White-space only; no code changes This indents a newly-formed block correctly M regexec.c commit 613a425dda0dc9e3f838151a5d796c902cfd922e Author: Karl Williamson <[email protected]> Date: Tue Oct 16 10:56:28 2012 -0600 regexec.c: Tighten loops in regrepeat() regrepeat() is used to match some simple thing repeatedly in a row. In the case of EXACTFish nodes, it will repeat a single character (and its fold). Prior to this commit, it was using the full generality of foldEQ_utf8() whenever the target was encoded in UTF-8. This full generality requires quite a bit of processing. 
However, most Unicode folds are of the simple variety containing just a character and its upper- or lower-cased equivalent, and so the full generality of foldEQ_utf8() is needed only comparatively infrequently. This commit takes advantage of the newly added and enhanced S_setup_EXACTISH_ST_c1_c2() to look at the character being repeated and decide what level of generality is needed. regrepeat() then uses a loop that is only as complicated as needed. This also adds some asserts that the nodes contain exactly 1 character M regexec.c commit 79a2a0e89816b80870df1f9b9e7bb5fb1edcd556 Author: Karl Williamson <[email protected]> Date: Tue Oct 16 10:17:01 2012 -0600 regexec: Do less work on quantified UTF-8 Consider the regexes /A*B/ and /A*?B/ where A and B are arbitrary, except that B begins with an EXACTish node. Prior to this patch, as a shortcut, the loop for accumulating A* would look for the first character of B to help it decide if B is a possibility for the next thing. It did not test for all of B unless testing showed that the next thing could be the beginning of B. If the target string was UTF-8, it converted each new sequence of bytes to the code point they represented, and then did the comparison. This is a relatively expensive process. This commit avoids that conversion by just doing a memEQ at the current input position. To do this, it revamps S_setup_EXACTISH_ST_c1_c2() to output the UTF-8 sequences to compare against. The function also has been tightened up so that there are fewer false positives. 
M regexec.c M regexp.h M utf8.c commit 57f0e7e230d864f5b78d28bb89545ef671c101a0 Author: Karl Williamson <[email protected]> Date: Tue Oct 16 09:58:24 2012 -0600 utf8.h: Add guard against recursive #include A future commit will #include this from another header M utf8.h commit 40b1ba4ffc62ae8198d69e8e3b33cf8201c6a18f Author: Karl Williamson <[email protected]> Date: Tue Oct 16 10:45:44 2012 -0600 regen/regcharclass.pl: Change name of generated macro This changes the macro isMULTI_CHAR_FOLD() (non-utf8 version) from just generating ascii-range code points to generating the full Latin1 range. However there are no such non-ASCII values, so the macro expansion is unchanged. By changing the name, it becomes clearer in future commits that we aren't excluding things that we should be considering. M regcharclass.h M regcomp.c M regen/regcharclass.pl M regen/regcharclass_multi_char_folds.pl commit b4291290926312792a6bfb115da2883d6c9c433d Author: Karl Williamson <[email protected]> Date: Tue Oct 9 13:34:08 2012 -0600 regexec.c: Change variable name This actually is a pointer to the pattern string, not to a byte. M regexec.c commit 9d1714db8368fab113c4f12da2ea29e7926e2c62 Author: Karl Williamson <[email protected]> Date: Tue Oct 9 13:32:12 2012 -0600 regexp.h: Update comments These comments should have been changed in commit c74f6de970ef0f0eb8ba43b1840fde0cf5a45497, but were mistakenly omitted. 
M regexp.h ----------------------------------------------------------------------- Summary of changes: regcharclass.h | 4 +- regcomp.c | 18 +- regen/regcharclass.pl | 6 +- regen/regcharclass_multi_char_folds.pl | 22 +- regexec.c | 555 ++++++++++++++++++-------------- regexp.h | 14 +- t/re/re_tests | 2 + utf8.c | 2 +- utf8.h | 5 + 9 files changed, 365 insertions(+), 263 deletions(-) diff --git a/regcharclass.h b/regcharclass.h index 37f57ba..b631cd1 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -694,12 +694,12 @@ : 0 ) /* - MULTI_CHAR_FOLD: multi-char ascii strings that are folded to by a single character + MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character ®charclass_multi_char_folds::multi_char_folds(0) */ /*** GENERATED CODE ***/ -#define is_MULTI_CHAR_FOLD_low_safe(s,e) \ +#define is_MULTI_CHAR_FOLD_latin1_safe(s,e) \ ( ((e)-(s) > 2) ? \ ( ( ( ((U8*)s)[0] & 0xDF ) == 0x46 ) ? \ ( ( ( ((U8*)s)[1] & 0xDF ) == 0x46 ) ? \ diff --git a/regcomp.c b/regcomp.c index bf2be2d..163351b 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2866,7 +2866,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b /* Here, the pattern is not UTF-8. Look for the multi-char folds * that are all ASCII. As in the above case, EXACTFL and EXACTFA * nodes can't have multi-char folds to this range (and there are - * no existing ones to the upper latin1 range). In the EXACTF + * no existing ones in the upper latin1 range). In the EXACTF * case we look also for the sharp s, which can be in the final * position. Otherwise we can stop looking 1 byte earlier because * have to find at least two characters for a multi-fold */ @@ -2883,7 +2883,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b const U8 s_masked = 's' & S_or_s_mask; while (s < upper) { - int len = is_MULTI_CHAR_FOLD_low_safe(s, s_end); + int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); if (! len) { /* Not a multi-char fold. 
*/ if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF) { @@ -10274,7 +10274,7 @@ tryagain: goto finish_meta_pat; case 'R': ret = reg_node(pRExC_state, LNBREAK); - *flagp |= HASWIDTH|SIMPLE; + *flagp |= HASWIDTH; goto finish_meta_pat; case 'h': ret = reg_node(pRExC_state, HORIZWS); @@ -11440,7 +11440,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) dVAR; UV nextvalue; - UV prevvalue, save_prevvalue = OOB_UNICODE; + UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE; IV range = 0; UV value, save_value = 0; regnode *ret; @@ -12618,10 +12618,9 @@ parseit: * to force that */ if (! PL_utf8_tofold) { U8 dummy[UTF8_MAXBYTES+1]; - STRLEN dummy_len; /* This string is just a short named one above \xff */ - to_utf8_fold((U8*) HYPHEN_UTF8, dummy, &dummy_len); + to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); assert(PL_utf8_tofold); /* Verify that worked */ } PL_utf8_foldclosures = @@ -12762,9 +12761,10 @@ parseit: /* Single character fold of above Latin1. Add everything in * its fold closure to the list that this node should match. * The fold closures data structure is a hash with the keys - * being every character that is folded to, like 'k', and the - * values each an array of everything that folds to its key. - * e.g. [ 'k', 'K', KELVIN_SIGN ] */ + * being the UTF-8 of every character that is folded to, like + * 'k', and the values each an array of all code points that + * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. 
+ * Multi-character folds are not included */ if ((listp = hv_fetch(PL_utf8_foldclosures, (char *) foldbuf, foldlen, FALSE))) { diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index 37a8682..461192b 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -1253,8 +1253,8 @@ do regen/regcharclass_multi_char_folds.pl # 1 => All folds &regcharclass_multi_char_folds::multi_char_folds(1) -MULTI_CHAR_FOLD: multi-char ascii strings that are folded to by a single character -=> low : safe +MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character +=> LATIN1 :safe -# 0 => ASCII-only &regcharclass_multi_char_folds::multi_char_folds(0) +# 0 => Latin1-only diff --git a/regen/regcharclass_multi_char_folds.pl b/regen/regcharclass_multi_char_folds.pl index ce2d781..f0fd6b3 100644 --- a/regen/regcharclass_multi_char_folds.pl +++ b/regen/regcharclass_multi_char_folds.pl @@ -9,15 +9,19 @@ use Unicode::UCD "prop_invmap"; # of the sequences of code points that are multi-character folds in the # current Unicode version. If the parameter is 1, all such folds are # returned. If the parameters is 0, only the ones containing exclusively -# ASCII characters are returned. In the latter case all combinations of ASCII -# characters that can fold to the base one are returned. Thus for 'ss', it -# would return in addition, 'Ss', 'sS', and 'SS'. This is because this code -# is designed to help regcomp.c, and EXACTFish regnodes. For non-UTF-8 -# patterns, the strings are not folded, so we need to check for the upper and -# lower case versions. For UTF-8 patterns, the strings are folded, so we only -# need to worry about the fold version. There are no non-ASCII Latin1 -# multi-char folds currently, and none likely to be ever added, so this -# doesn't worry about that case, except to croak should it happen. +# Latin1 characters are returned. In the latter case all combinations of +# Latin1 characters that can fold to the base one are returned. 
Thus for +# 'ss', it would return in addition, 'Ss', 'sS', and 'SS'. This is because +# this code is designed to help regcomp.c, and EXACTFish regnodes. For +# non-UTF-8 patterns, the strings are not folded, so we need to check for the +# upper and lower case versions. For UTF-8 patterns, the strings are folded, +# so we only need to worry about the fold version. There are no non-ASCII +# Latin1 multi-char folds currently, and none likely to be ever added. Thus +# the output is the same as if it were just asking for ASCII characters, not +# full Latin1. Hence, it is suitable for generating things that match +# EXACTFA. It does check for and croak if there ever were to be an upper +# Latin1 range multi-character fold. +# # This is designed for input to regen/regcharlass.pl. sub gen_combinations ($;) { diff --git a/regexec.c b/regexec.c index 7aa1e71..f25bce1 100644 --- a/regexec.c +++ b/regexec.c @@ -2936,6 +2936,8 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */ #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */ +#define CHRTEST_NOT_A_CP_1 -999 +#define CHRTEST_NOT_A_CP_2 -998 #define SLAB_FIRST(s) (&(s)->states[0]) #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1]) @@ -3273,175 +3275,252 @@ S_clear_backtrack_stack(pTHX_ void *p) } } static bool -S_setup_EXACTISH_ST_c1_c2(pTHX_ regnode *text_node, I32 *c1, I32 *c2) +S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8) { - /* This sets up a relatively quick check for the initial part of what must - * match after a CURLY-type operation condition (the "B" in A*B), where B - * starts with an EXACTish node, <text_node>. If this check is not met, - * the caller knows that it should continue with the loop. If the check is - * met, the caller must see if all of B is met, before making the decision. 
+ /* This function determines if there are one or two characters that match + * the first character of the passed-in EXACTish node <text_node>, and if + * so, returns them in the passed-in pointers. * - * This function sets *<c1> and *<c2> to be the first code point of B. If - * there are two possible such code points (as when the text_node is - * folded), *<c2> is set to the second. If there are more than two (which - * happens for some folds), or there is some other complication, these - * parameters are set to CHRTEST_VOID, to indicate not to do a quick check: - * just try all of B after every time through the loop. + * If it determines that no possible character in the target string can + * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if + * the first character in <text_node> requires UTF-8 to represent, and the + * target string isn't in UTF-8.) * - * If the routine determines that there is no possible way for there to be - * a match, it returns FALSE. - * */ + * If there are more than two characters that could match the beginning of + * <text_node>, or if more context is required to determine a match or not, + * it sets both *<c1p> and *<c2p> to CHRTEST_VOID. + * + * The motiviation behind this function is to allow the caller to set up + * tight loops for matching. If <text_node> is of type EXACT, there is + * only one possible character that can match its first character, and so + * the situation is quite simple. But things get much more complicated if + * folding is involved. It may be that the first character of an EXACTFish + * node doesn't participate in any possible fold, e.g., punctuation, so it + * can be matched only by itself. The vast majority of characters that are + * in folds match just two things, their lower and upper-case equivalents. + * But not all are like that; some have multiple possible matches, or match + * sequences of more than one character. This function sorts all that out. 
+ * + * Consider the patterns A*B or A*?B where A and B are arbitrary. In a + * loop of trying to match A*, we know we can't exit where the thing + * following it isn't a B. And something can't be a B unless it is the + * beginning of B. By putting a quick test for that beginning in a tight + * loop, we can rule out things that can't possibly be B without having to + * break out of the loop, thus avoiding work. Similarly, if A is a single + * character, we can make a tight loop matching A*, using the outputs of + * this function. + * + * If the target string to match isn't in UTF-8, and there aren't + * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to + * the one or two possible octets (which are characters in this situation) + * that can match. In all cases, if there is only one character that can + * match, *<c1p> and *<c2p> will be identical. + * + * If the target string is in UTF-8, the buffers pointed to by <c1_utf8> + * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that + * can match the beginning of <text_node>. They should be declared with at + * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is + * undefined what these contain.) If one or both of the buffers are + * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the + * corresponding invariant. If variant, the corresponding *<c1p> and/or + * *<c2p> will be set to a negative number(s) that shouldn't match any code + * point (unless inappropriately coerced to unsigned). *<c1p> will equal + * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. 
*/ const bool utf8_target = PL_reg_match_utf8; - const U32 uniflags = UTF8_ALLOW_DEFAULT; + + UV c1, c2; + bool use_chrtest_void = FALSE; + + /* Used when we have both utf8 input and utf8 output, to avoid converting + * to/from code points */ + bool utf8_has_been_setup = FALSE; + dVAR; - /* First byte from the EXACTish node */ - U8 *pat_byte = (U8*)STRING(text_node); + U8 *pat = (U8*)STRING(text_node); + + if (OP(text_node) == EXACT) { + + /* In an exact node, only one thing can be matched, that first + * character. If both the pat and the target are UTF-8, we can just + * copy the input to the output, avoiding finding the code point of + * that character */ + if (! UTF_PATTERN) { + c2 = c1 = *pat; + } + else if (utf8_target) { + Copy(pat, c1_utf8, UTF8SKIP(pat), U8); + Copy(pat, c2_utf8, UTF8SKIP(pat), U8); + utf8_has_been_setup = TRUE; + } + else { + c2 = c1 = valid_utf8_to_uvchr(pat, NULL); + } + } + else /* an EXACTFish node */ + if ((UTF_PATTERN + && is_MULTI_CHAR_FOLD_utf8_safe(pat, + pat + STR_LEN(text_node))) + || (! UTF_PATTERN + && is_MULTI_CHAR_FOLD_latin1_safe(pat, + pat + STR_LEN(text_node)))) + { + /* Multi-character folds require more context to sort out. Also + * PL_utf8_foldclosures used below doesn't handle them, so have to be + * handled outside this routine */ + use_chrtest_void = TRUE; + } + else { /* an EXACTFish node which doesn't begin with a multi-char fold */ + c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat; + if (c1 > 256) { + /* Load the folds hash, if not already done */ + SV** listp; + if (! PL_utf8_foldclosures) { + if (! 
PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES+1]; + + /* Force loading this by folding an above-Latin1 char */ + to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); + assert(PL_utf8_tofold); /* Verify that worked */ + } + PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); + } + + /* The fold closures data structure is a hash with the keys being + * the UTF-8 of every character that is folded to, like 'k', and + * the values each an array of all code points that fold to its + * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are + * not included */ + if ((! (listp = hv_fetch(PL_utf8_foldclosures, + (char *) pat, + UTF8SKIP(pat), + FALSE)))) + { + /* Not found in the hash, therefore there are no folds + * containing it, so there is only a single character that + * could match */ + c2 = c1; + } + else { /* Does participate in folds */ + AV* list = (AV*) *listp; + if (av_len(list) != 1) { - if (! UTF_PATTERN) { /* Not UTF-8: the code point is the byte */ - *c1 = *pat_byte; - if (OP(text_node) == EXACT) { - *c2 = *c1; + /* If there aren't exactly two folds to this, it is outside + * the scope of this function */ + use_chrtest_void = TRUE; + } + else { /* There are two. Get them */ + SV** c_p = av_fetch(list, 0, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c1 = SvUV(*c_p); + + c_p = av_fetch(list, 1, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c2 = SvUV(*c_p); + + /* Folds that cross the 255/256 boundary are forbidden if + * EXACTFL, or EXACTFA and one is ASCIII. Since the + * pattern character is above 256, and its only other match + * is below 256, the only legal match will be to itself. 
+ * We have thrown away the original, so have to compute + * which is the one above 255 */ + if ((c1 < 256) != (c2 < 256)) { + if (OP(text_node) == EXACTFL + || (OP(text_node) == EXACTFA + && (isASCII(c1) || isASCII(c2)))) + { + if (c1 < 256) { + c1 = c2; + } + else { + c2 = c1; + } + } + } + } + } } - else if (utf8_target - && HAS_NONLATIN1_FOLD_CLOSURE(*c1) - && (OP(text_node) != EXACTFA || ! isASCII(*c1))) + else /* Here, c1 is < 255 */ + if (utf8_target + && HAS_NONLATIN1_FOLD_CLOSURE(c1) + && OP(text_node) != EXACTFL + && (OP(text_node) != EXACTFA || ! isASCII(c1))) { /* Here, there could be something above Latin1 in the target which - * folds to this character in the pattern, which means there are - * more than two possible beginnings of B. */ - *c1 = *c2 = CHRTEST_VOID; + * folds to this character in the pattern. All such cases except + * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters + * involved in their folds, so are outside the scope of this + * function */ + if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { + c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; + } + else { + use_chrtest_void = TRUE; + } } else { /* Here nothing above Latin1 can fold to the pattern character */ switch (OP(text_node)) { case EXACTFL: /* /l rules */ - *c2 = PL_fold_locale[*c1]; - break; - - case EXACTFU_SS: /* This requires special handling: Don't - shortcut */ - *c1 = *c2 = CHRTEST_VOID; + c2 = PL_fold_locale[c1]; break; case EXACTF: if (! utf8_target) { /* /d rules */ - *c2 = PL_fold[*c1]; + c2 = PL_fold[c1]; break; } /* FALLTHROUGH */ /* /u rules for all these. 
This happens to work for - * EXACTFA in the ASCII range as nothing in Latin1 folds to - * ASCII */ + * EXACTFA as nothing in Latin1 folds to ASCII */ case EXACTFA: case EXACTFU_TRICKYFOLD: + case EXACTFU_SS: case EXACTFU: - *c2 = PL_fold_latin1[*c1]; + c2 = PL_fold_latin1[c1]; break; default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node)); } } } - else { /* UTF_PATTERN */ - if (OP(text_node) == EXACT) { - *c2 = *c1 = utf8n_to_uvchr(pat_byte, UTF8_MAXBYTES, 0, uniflags); - if (*c1 < 0) { /* Overflowed what we can handle */ - *c1 = *c2 = CHRTEST_VOID; - } - else if (*c1 > 255 && ! utf8_target) { - return FALSE; /* Can't possibly match */ - } + + /* Here have figured things out. Set up the returns */ + if (use_chrtest_void) { + *c2p = *c1p = CHRTEST_VOID; + } + else if (utf8_target) { + if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */ + uvchr_to_utf8(c1_utf8, c1); + uvchr_to_utf8(c2_utf8, c2); } - else { - if (UTF8_IS_ABOVE_LATIN1(*pat_byte)) { - /* A multi-character fold is complicated, probably has more - * than two possibilities */ - if (is_MULTI_CHAR_FOLD_utf8_safe((char*) pat_byte, - (char*) pat_byte + STR_LEN(text_node))) - { - *c1 = *c2 = CHRTEST_VOID; - } - else { /* Not a multi-char fold */ - - /* Load the folds hash, if not already done */ - SV** listp; - if (! PL_utf8_foldclosures) { - if (! PL_utf8_tofold) { - U8 dummy[UTF8_MAXBYTES+1]; - STRLEN dummy_len; - - /* Force loading this by folding an above-Latin1 - * char */ - to_utf8_fold((U8*) HYPHEN_UTF8, dummy, &dummy_len); - assert(PL_utf8_tofold); /* Verify that worked */ - } - PL_utf8_foldclosures = - _swash_inversion_hash(PL_utf8_tofold); - } + /* Invariants are stored in both the utf8 and byte outputs; Use + * negative numbers otherwise for the byte ones. Make sure that the + * byte ones are the same iff the utf8 ones are the same */ + *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1; + *c2p = (UTF8_IS_INVARIANT(*c2_utf8)) + ? 
*c2_utf8 + : (c1 == c2) + ? CHRTEST_NOT_A_CP_1 + : CHRTEST_NOT_A_CP_2; + } + else if (c1 > 255) { + if (c2 > 255) { /* both possibilities are above what a non-utf8 string + can represent */ + return FALSE; + } - /* The fold closures data structure is a hash with the keys - * being every character that is folded to, like 'k', and - * the values each an array of everything that folds to its - * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */ - if ((! (listp = hv_fetch(PL_utf8_foldclosures, - (char *) pat_byte, - UTF8SKIP(pat_byte), - FALSE)))) - { - /* Not found in the hash, therefore there are no folds - * containing it, so there is only a single char - * possible for beginning B */ - *c2 = *c1 = utf8n_to_uvchr(pat_byte, STR_LEN(text_node), - 0, uniflags); - if (*c1 < 0) { /* Overflowed what we can handle */ - *c1 = *c2 = CHRTEST_VOID; - } - } - else { - AV* list = (AV*) *listp; - if (av_len(list) != 1) { /* If there aren't exactly - two folds to this, have - to test B completely */ - *c1 = *c2 = CHRTEST_VOID; - } - else { /* There are two. Set *c1 and *c2 to them */ - SV** c_p = av_fetch(list, 0, FALSE); - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); - } - *c1 = SvUV(*c_p); - c_p = av_fetch(list, 1, FALSE); - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); - } - *c2 = SvUV(*c_p); - } - } - } - } - else { - /* Get the character represented by the UTF-8-encoded byte */ - U8 c = (UTF8_IS_INVARIANT(*pat_byte)) - ? *pat_byte - : TWO_BYTE_UTF8_TO_UNI(*pat_byte, *(pat_byte+1)); - - if (HAS_NONLATIN1_FOLD_CLOSURE(c) - && (OP(text_node) != EXACTFA || ! isASCII(c))) - { /* Something above Latin1 folds to this; hence there are - more than 2 possibilities for B to begin with */ - *c1 = *c2 = CHRTEST_VOID; - } - else { - *c1 = c; - *c2 = (OP(text_node) == EXACTFL) - ? 
PL_fold_locale[*c1] - : PL_fold_latin1[*c1]; - } - } - } + *c1p = *c2p = c2; /* c2 is the only representable value */ + } + else { /* c1 is representable; see about c2 */ + *c1p = c1; + *c2p = (c2 < 256) ? c2 : c1; } return TRUE; @@ -4569,7 +4648,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* This call case insensitively compares the entire buffer * at s, with the current input starting at locinput, but * not going off the end given by PL_regeol, and returns in - * limit upon success, how much of the current input was + * <limit> upon success, how much of the current input was * matched */ if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target, locinput, &limit, 0, utf8_target, utf8_fold_flags)) @@ -5574,8 +5653,8 @@ NULL IS_TEXT and friends need to change. */ if (PL_regkind[OP(text_node)] == EXACT) { - if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ text_node, - &ST.c1, &ST.c2)) + if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ + text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8)) { sayNO; } @@ -5590,19 +5669,31 @@ NULL "", (IV)ST.count) ); if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) { - const UV c = (utf8_target) - ? utf8n_to_uvchr((U8*)locinput, - UTF8_MAXBYTES, NULL, - uniflags) - : nextchr; - if (c != (UV) ST.c1 && c != (UV) ST.c2) { + if (! 
UTF8_IS_INVARIANT(nextchr) && utf8_target) { + if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)) + && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput))) + { + /* simulate B failing */ + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n", + (int)(REPORT_CODE_OFF+(depth*2)),"", + valid_utf8_to_uvchr((U8 *) locinput, NULL), + valid_utf8_to_uvchr(ST.c1_utf8, NULL), + valid_utf8_to_uvchr(ST.c2_utf8, NULL)) + ); + state_num = CURLYM_B_fail; + goto reenter_switch; + } + } + else if (nextchr != ST.c1 && nextchr != ST.c2) { /* simulate B failing */ DEBUG_OPTIMISE_r( PerlIO_printf(Perl_debug_log, - "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n", + "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n", (int)(REPORT_CODE_OFF+(depth*2)),"", - (IV)ST.c1,(IV)ST.c2 - )); + (int) nextchr, ST.c1, ST.c2) + ); state_num = CURLYM_B_fail; goto reenter_switch; } @@ -5738,8 +5829,8 @@ NULL if this changes back then the macro for IS_TEXT and friends need to change. */ - if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ text_node, - &ST.c1, &ST.c2)) + if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ + text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8)) { sayNO; } @@ -5831,26 +5922,21 @@ NULL if (utf8_target) { n = (ST.oldloc == locinput) ? 
0 : 1; if (ST.c1 == ST.c2) { - STRLEN len; /* set n to utf8_distance(oldloc, locinput) */ - while (locinput <= ST.maxpos && - utf8n_to_uvchr((U8*)locinput, - UTF8_MAXBYTES, &len, - uniflags) != (UV)ST.c1) { - locinput += len; + while (locinput <= ST.maxpos + && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))) + { + locinput += UTF8SKIP(locinput); n++; } } else { /* set n to utf8_distance(oldloc, locinput) */ - while (locinput <= ST.maxpos) { - STRLEN len; - const UV c = utf8n_to_uvchr((U8*)locinput, - UTF8_MAXBYTES, &len, - uniflags); - if (c == (UV)ST.c1 || c == (UV)ST.c2) - break; - locinput += len; + while (locinput <= ST.maxpos + && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)) + && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput))) + { + locinput += UTF8SKIP(locinput); n++; } } @@ -5931,16 +6017,25 @@ NULL goto fake_end; } { - UV c = 0; - if (ST.c1 != CHRTEST_VOID && locinput < PL_regeol) - c = utf8_target ? utf8n_to_uvchr((U8*)locinput, - UTF8_MAXBYTES, 0, uniflags) - : (UV) UCHARAT(locinput); + bool could_match = locinput < PL_regeol; + /* If it could work, try it. */ - if (ST.c1 == CHRTEST_VOID - || (locinput < PL_regeol && - (c == (UV)ST.c1 || c == (UV)ST.c2))) - { + if (ST.c1 != CHRTEST_VOID && could_match) { + if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target) + { + could_match = memEQ(locinput, + ST.c1_utf8, + UTF8SKIP(locinput)) + || memEQ(locinput, + ST.c2_utf8, + UTF8SKIP(locinput)); + } + else { + could_match = UCHARAT(locinput) == ST.c1 + || UCHARAT(locinput) == ST.c2; + } + } + if (ST.c1 == CHRTEST_VOID || could_match) { CURLY_SETPAREN(ST.paren, ST.count); PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput); assert(0); /* NOTREACHED */ @@ -6484,6 +6579,8 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma scan = loceol; break; case EXACT: + assert(STR_LEN(p) == (UTF_PATTERN) ? 
UTF8SKIP(STRING(p)) : 1); + c = (U8)*STRING(p); /* Can use a simple loop if the pattern char to match on is invariant @@ -6561,56 +6658,61 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma case EXACTFU: utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0; - do_exactf: - c = (U8)*STRING(p); + do_exactf: { + int c1, c2; + U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1]; - if (utf8_target - || OP(p) == EXACTFU_SS - || (UTF_PATTERN && ! UTF8_IS_INVARIANT(c))) - { - /* Use full Unicode fold matching */ - char *tmpeol = loceol; - STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1; - while (hardcount < max - && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target, - STRING(p), NULL, pat_len, cBOOL(UTF_PATTERN), utf8_flags)) - { - scan = tmpeol; - tmpeol = loceol; - hardcount++; - } + assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1); - /* XXX Note that the above handles properly the German sharp s in - * the pattern matching ss in the string. But it doesn't handle - * properly cases where the string contains say 'LIGATURE ff' and - * the pattern is 'f+'. This would require, say, a new function or - * revised interface to foldEQ_utf8(), in which the maximum number - * of characters to match could be passed and it would return how - * many actually did. This is just one of many cases where - * multi-char folds don't work properly, and so the fix is being - * deferred */ - } - else { - U8 folded; - - /* Here, the string isn't utf8; and either the pattern isn't utf8 - * or c is an invariant, so its utf8ness doesn't affect c. Can - * just do simple comparisons for exact or fold matching. 
*/ - switch (OP(p)) { - case EXACTF: folded = PL_fold[c]; break; - case EXACTFA: - case EXACTFU_TRICKYFOLD: - case EXACTFU: folded = PL_fold_latin1[c]; break; - case EXACTFL: folded = PL_fold_locale[c]; break; - default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p)); - } - while (scan < loceol && - (UCHARAT(scan) == c || UCHARAT(scan) == folded)) - { - scan++; - } + if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) { + if (c1 == CHRTEST_VOID) { + /* Use full Unicode fold matching */ + char *tmpeol = loceol; + STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1; + while (hardcount < max + && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target, + STRING(p), NULL, pat_len, + cBOOL(UTF_PATTERN), utf8_flags)) + { + scan = tmpeol; + tmpeol = loceol; + hardcount++; + } + } + else if (utf8_target) { + if (c1 == c2) { + while (hardcount < max + && memEQ(scan, c1_utf8, UTF8SKIP(scan))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + else { + while (hardcount < max + && (memEQ(scan, c1_utf8, UTF8SKIP(scan)) + || memEQ(scan, c2_utf8, UTF8SKIP(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + } + else if (c1 == c2) { + while (scan < loceol && UCHARAT(scan) == c1) { + scan++; + } + } + else { + while (scan < loceol && + (UCHARAT(scan) == c1 || UCHARAT(scan) == c2)) + { + scan++; + } + } } break; + } case ANYOF: if (utf8_target) { STRLEN inclasslen; @@ -6920,25 +7022,8 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma } break; case LNBREAK: - if (utf8_target) { - loceol = PL_regeol; - while (hardcount < max && scan < loceol && - (c=is_LNBREAK_utf8_safe(scan, loceol))) { - scan += c; - hardcount++; - } - } else { - /* - LNBREAK can match two latin chars, which is ok, - because we have a null terminated string, but we - have to use hardcount in this situation - */ - while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) { - scan+=c; - hardcount++; - } - } - break; + Perl_croak(aTHX_ "panic: 
regrepeat() should not be called with non-simple: LNBREAK"); + assert(0); /* NOTREACHED */ case HORIZWS: if (utf8_target) { loceol = PL_regeol; diff --git a/regexp.h b/regexp.h index f631db9..e1d5906 100644 --- a/regexp.h +++ b/regexp.h @@ -18,6 +18,8 @@ /* we don't want to include this stuff if we are inside of an external regex engine based on the core one - like re 'debug'*/ +#include "utf8.h" + struct regnode { U8 flags; U8 type; @@ -102,8 +104,8 @@ struct reg_code_block { /* Information about the match that the perl core uses to */ \ /* manage things */ \ U32 extflags; /* Flags used both externally and internally */ \ - I32 minlen; /* mininum possible length of string to match */\ - I32 minlenret; /* mininum possible length of $& */ \ + I32 minlen; /* mininum possible number of chars in string to match */\ + I32 minlenret; /* mininum possible number of chars in $& */ \ U32 gofs; /* chars left of pos that we search from */ \ /* substring data about strings that must appear in the */ \ /* final match, used for optimisations */ \ @@ -740,7 +742,7 @@ typedef struct regmatch_state { struct { /* this first element must match u.yes */ struct regmatch_state *prev_yes_state; - I32 c1, c2; /* case fold search */ + int c1, c2; /* case fold search */ CHECKPOINT cp; U32 lastparen; U32 lastcloseparen; @@ -749,6 +751,8 @@ typedef struct regmatch_state { bool minmod; regnode *A, *B; /* the nodes corresponding to /A*B/ */ regnode *me; /* the curlym node */ + U8 c1_utf8[UTF8_MAXBYTES+1]; /* */ + U8 c2_utf8[UTF8_MAXBYTES+1]; } curlym; struct { @@ -756,12 +760,14 @@ typedef struct regmatch_state { CHECKPOINT cp; U32 lastparen; U32 lastcloseparen; - I32 c1, c2; /* case fold search */ + int c1, c2; /* case fold search */ char *maxpos; /* highest possible point in string to match */ char *oldloc; /* the previous locinput */ int count; int min, max; /* {m,n} */ regnode *A, *B; /* the nodes corresponding to /A*B/ */ + U8 c1_utf8[UTF8_MAXBYTES+1]; /* */ + U8 c2_utf8[UTF8_MAXBYTES+1]; } 
curly; /* and CURLYN/PLUS/STAR */ } u; diff --git a/t/re/re_tests b/t/re/re_tests index a59a6ab..387a74b 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -1709,4 +1709,6 @@ ab[c\\\](??{"x"})]{3}d ab\\](d y - - (\x{100}) \x{2000}\x{2000}\x{2000}\x{100} y $-[0]:$-[1]:$+[0]:$+[1] 3:3:4:4 +^\R{2}$ \r\n\r\n y $& \r\n\r\n + # vim: softtabstop=0 noexpandtab diff --git a/utf8.c b/utf8.c index 13fb689..8ad0478 100644 --- a/utf8.c +++ b/utf8.c @@ -3606,7 +3606,7 @@ HV* Perl__swash_inversion_hash(pTHX_ SV* const swash) { - /* Subject to change or removal. For use only in one place in regcomp.c. + /* Subject to change or removal. For use only in regcomp.c and regexec.c * Can't be used on a property that is subject to user override, as it * relies on the value of SPECIALS in the swash which would be set by * utf8_heavy.pl to the hash in the non-overriden file, and hence is not set diff --git a/utf8.h b/utf8.h index 5330e21..7472de1 100644 --- a/utf8.h +++ b/utf8.h @@ -8,6 +8,9 @@ * */ +#ifndef H_UTF8 /* Guard against recursive inclusion */ +#define H_UTF8 1 + /* Use UTF-8 as the default script encoding? * Turning this on will break scripts having non-UTF-8 binary * data (such as Latin-1) in string literals. */ @@ -514,6 +517,8 @@ Perl's extended UTF-8 means we can have start bytes up to FF. # define IS_UTF8_CHAR_FAST(n) ((n) <= 4) #endif +#endif /* H_UTF8 */ + /* * Local variables: * c-indentation-style: bsd -- Perl5 Master Repository
