In perl.git, the branch smoke-me/khw-lexact has been updated <https://perl5.git.perl.org/perl.git/commitdiff/eb8f0ab482110da549e2c58a1c05a6ce1c0b6246?hp=8224bce37e0350a2b529bcff668f074a310b9971>
- Log ----------------------------------------------------------------- commit eb8f0ab482110da549e2c58a1c05a6ce1c0b6246 Author: Karl Williamson <[email protected]> Date: Sun Sep 15 17:25:08 2019 -0600 f ----------------------------------------------------------------------- Summary of changes: regcomp.h | 8 ++++---- regexec.c | 64 ++++++++++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 47 insertions(+), 25 deletions(-) diff --git a/regcomp.h b/regcomp.h index ed16f2491d..0b6844ad11 100644 --- a/regcomp.h +++ b/regcomp.h @@ -331,12 +331,12 @@ struct regnode_ssc { #define FLAGS(p) ((p)->flags) /* Caution: Doesn't apply to all \ regnode types. For some, it's the \ character set of the regnode */ -#define STR_LENs(p) (((struct regnode_string *)p)->str_len) -#define STRINGs(p) (((struct regnode_string *)p)->string) +#define STR_LENs(p) (__ASSERT_(OP(p) != LEXACT) ((struct regnode_string *)p)->str_len) +#define STRINGs(p) (__ASSERT_(OP(p) != LEXACT) ((struct regnode_string *)p)->string) #define OPERANDs(p) STRINGs(p) -#define STR_LENl(p) ((U16)((((struct regnode_string *)p)->str_len) | ((((struct regnode_string *)p)->string[0]) << 8))) -#define STRINGl(p) ((((struct regnode_string *)p)->string) + 1) +#define STR_LENl(p) (__ASSERT_(OP(p) == LEXACT) (U16)((((struct regnode_string *)p)->str_len) | ((((struct regnode_string *)p)->string[0]) << 8))) +#define STRINGl(p) (__ASSERT_(OP(p) == LEXACT) (((struct regnode_string *)p)->string) + 1) #define OPERANDl(p) STRINGl(p) #define OPERAND(p) ((OP(p) == LEXACT) ? OPERANDl(p) : OPERANDs(p)) diff --git a/regexec.c b/regexec.c index 62e473e6a9..973b9e93a1 100644 --- a/regexec.c +++ b/regexec.c @@ -2298,8 +2298,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * first character. c2 is its fold. This logic will not work for * Unicode semantics and the german sharp ss, which hence should * not be compiled into a node that gets here. */ - pat_string = STRING(c); - ln = STR_LEN(c); /* length to match in octets/bytes */ + pat_string = STRINGs(c); + ln = STR_LENs(c); /* length to match in octets/bytes */ /* We know that we have to match at least 'ln' bytes (which is the * same as characters, since not utf8). If we have to match 3 @@ -2374,8 +2374,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* If one of the operands is in utf8, we can't use the simpler folding * above, due to the fact that many different characters can have the * same fold, or portion of a fold, or different- length fold */ - pat_string = STRING(c); - ln = STR_LEN(c); /* length to match in octets/bytes */ + pat_string = STRINGs(c); + ln = STR_LENs(c); /* length to match in octets/bytes */ pat_end = pat_string + ln; lnc = is_utf8_pat /* length to match in characters */ ? utf8_length((U8 *) pat_string, (U8 *) pat_end) @@ -4263,7 +4263,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, } } else { /* an EXACTFish node */ - U8 *pat_end = pat + STR_LEN(text_node); + U8 *pat_end = pat + STR_LENs(text_node); /* An EXACTFL node has at least some characters unfolded, because what * they match is not known until now. So, now is the time to fold @@ -6275,6 +6275,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } #undef ST + case LEXACT: + { + char *s; + + s = STRINGl(scan); + ln = STR_LENl(scan); + goto join_short_long_exact; + case EXACTL: /* /abc/l */ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; @@ -6293,13 +6301,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) sayNO; } /* FALLTHROUGH */ + case EXACT: /* /abc/ */ - case LEXACT: - { - char *s; do_exact: - s = STRING(scan); - ln = STR_LEN(scan); + s = STRINGs(scan); + ln = STR_LENs(scan); + + join_short_long_exact: if (utf8_target != is_utf8_pat) { /* The target and the pattern have differing utf8ness. */ char *l = locinput; @@ -6451,8 +6459,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) fold_utf8_flags = 0; do_exactf: - s = STRING(scan); - ln = STR_LEN(scan); + s = STRINGs(scan); + ln = STR_LENs(scan); if ( utf8_target || is_utf8_pat @@ -9366,6 +9374,16 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, else scan = this_eol; break; + + case LEXACT: + { + U8 * string; + Size_t str_len; + + string = (U8 *) STRINGl(p); + str_len = STR_LENl(p); + goto join_short_long_exact; + case EXACTL: _CHECK_AND_WARN_PROBLEMATIC_LOCALE; if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) { @@ -9378,12 +9396,15 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, break; } /* FALLTHROUGH */ - case LEXACT: case EXACT: do_exact: - assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); + string = (U8 *) STRINGs(p); + str_len = STR_LENs(p); + + join_short_long_exact: + assert(str_len == reginfo->is_utf8_pat ? UTF8SKIP(string) : 1); - c = (U8)*STRING(p); + c = *string; /* Can use a simple find if the pattern char to match on is invariant * under UTF-8, or both target and pattern aren't UTF-8. Note that we @@ -9405,8 +9426,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, * string EQ */ while (hardcount < max && scan < this_eol - && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p) - && memEQ(scan, STRING(p), scan_char_len)) + && (scan_char_len = UTF8SKIP(scan)) <= str_len + && memEQ(scan, string, scan_char_len)) { scan += scan_char_len; hardcount++; @@ -9416,7 +9437,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* Target isn't utf8; convert the character in the UTF-8 * pattern to non-UTF8, and do a simple find */ - c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); + c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(string + 1)); scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c); } /* else pattern char is above Latin1, can't possibly match the non-UTF-8 target */ @@ -9440,6 +9461,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } } break; + } case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */ assert(! reginfo->is_utf8_pat); @@ -9490,7 +9512,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, int c1, c2; U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1]; - assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); + assert(STR_LENs(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRINGs(p)) : 1); if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8, reginfo)) @@ -9498,10 +9520,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, if (c1 == CHRTEST_VOID) { /* Use full Unicode fold matching */ char *tmpeol = loceol; - STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1; + STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRINGs(p)) : 1; while (hardcount < max && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target, - STRING(p), NULL, pat_len, + STRINGs(p), NULL, pat_len, reginfo->is_utf8_pat, utf8_flags)) { scan = tmpeol; -- Perl5 Master Repository
