In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/2b2266b19c5f9a3d80040e731c02927e6be1dace?hp=10a51195f4a059336d372c2e632f5556281f3806>
- Log ----------------------------------------------------------------- commit 2b2266b19c5f9a3d80040e731c02927e6be1dace Author: Karl Williamson <[email protected]> Date: Fri Jan 23 12:26:42 2015 -0700 regcomp.c: Clarify comment M regcomp.c commit 6635f04f27bfaa7296536aad8659044e8bed8df3 Author: Karl Williamson <[email protected]> Date: Fri Jan 23 12:00:49 2015 -0700 handy.h: EXTERN_C-ize PL_charclass See thread http://nntp.perl.org/group/perl.perl5.porters/224999 M handy.h commit 7aa20b428680dffac746f9c0947535a222940bda Author: Karl Williamson <[email protected]> Date: Fri Jan 23 11:57:14 2015 -0700 regcomp.c: Another minor optimization The [:cased:] internal class now handles [:upper:] and/or [:lower:] under /i matching. This code skipped possible optimizations because it didn't think to use this. M regcomp.c commit f59fa626ecf8377ec531b277e5bd1c0f5958916d Author: Karl Williamson <[email protected]> Date: Fri Jan 23 11:20:30 2015 -0700 regcomp.c: Minor optimizations \d, [:digit:], and [:xdigit:] don't match anything in the upper Latin1 range. Therefore whether or not the target string is UTF-8 doesn't change what they match, hence the /d modifier acts exactly like the /u modifier for them. At run-time /u executes fewer branches because it doesn't have to test if the target string is in UTF-8 or not, so treating these as if /u had instead been specified saves some runtime. 
M regcomp.c commit c52b8b12af74c72a617744f9e9dfef7ea49a16c7 Author: Karl Williamson <[email protected]> Date: Fri Jan 23 11:19:23 2015 -0700 regexec.c, regcomp.c: White-space only This changes some labels to be outdented 2 spaces from surrounding code M regcomp.c M regexec.c ----------------------------------------------------------------------- Summary of changes: handy.h | 2 ++ regcomp.c | 75 +++++++++++++++++++++++++++++++++++++++------------------------ regexec.c | 26 +++++++++++----------- 3 files changed, 61 insertions(+), 42 deletions(-) diff --git a/handy.h b/handy.h index 1256ea1..faa9f7a 100644 --- a/handy.h +++ b/handy.h @@ -1020,6 +1020,7 @@ static const char* const swash_property_names[] = { }; #endif +START_EXTERN_C # ifdef DOINIT EXTCONST U32 PL_charclass[] = { # include "l1_char_class_tab.h" @@ -1028,6 +1029,7 @@ EXTCONST U32 PL_charclass[] = { # else /* ! DOINIT */ EXTCONST U32 PL_charclass[]; # endif +END_EXTERN_C /* The 1U keeps Solaris from griping when shifting sets the uppermost bit */ # define _CC_mask(classnum) (1U << (classnum)) diff --git a/regcomp.c b/regcomp.c index b0256b7..3851d34 100644 --- a/regcomp.c +++ b/regcomp.c @@ -8447,7 +8447,7 @@ Perl__invlist_populate_swatch(SV* const invlist, swatch[offset >> 3] |= 1 << (offset & 7); } - join_end_of_list: + join_end_of_list: /* Quit if at the end of the list */ if (i >= len) { @@ -9619,7 +9619,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) : REGEX_DEPENDS_CHARSET; has_charset_modifier = DEPENDS_PAT_MOD; break; - excess_modifier: + excess_modifier: RExC_parse++; if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) { vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD); @@ -9632,7 +9632,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1)); } NOT_REACHED; /*NOTREACHED*/ - neg_modifier: + neg_modifier: RExC_parse++; 
vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1)); @@ -9704,7 +9704,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) return; /*NOTREACHED*/ default: - fail_modifiers: + fail_modifiers: RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized", @@ -10849,7 +10849,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) nextchar(pRExC_state); } - do_curly: + do_curly: if ((flags&SIMPLE)) { MARK_NAUGHTY_EXP(2, 2); reginsert(pRExC_state, CURLY, ret, depth+1); @@ -11791,7 +11791,15 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* FALLTHROUGH */ case 'd': arg = ANYOF_DIGIT; - goto join_posix; + if (! DEPENDS_SEMANTICS) { + goto join_posix; + } + + /* \d doesn't have any matches in the upper Latin1 range, hence /d + * is equivalent to /u. Changing to /u saves some branches at + * runtime */ + op = POSIXU; + goto join_posix_op_known; case 'R': ret = reg_node(pRExC_state, LNBREAK); @@ -11820,7 +11828,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) case 's': arg = ANYOF_SPACE; - join_posix: + join_posix: op = POSIXD + get_regex_charset(RExC_flags); if (op > POSIXA) { /* /aa is same as /a */ @@ -11830,7 +11838,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) RExC_contains_locale = 1; } - join_posix_op_known: + join_posix_op_known: if (invert) { op += NPOSIXD - POSIXD; @@ -11844,7 +11852,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) *flagp |= HASWIDTH|SIMPLE; /* FALLTHROUGH */ - finish_meta_pat: + finish_meta_pat: nextchar(pRExC_state); Set_Node_Length(ret, 2); /* MJD */ break; @@ -11900,7 +11908,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } break; case 'k': /* Handle \k<NAME> and \k'NAME' */ - parse_named_seq: + parse_named_seq: { char ch= RExC_parse[1]; if (ch 
!= '<' && ch != '\'' && ch != '{') { @@ -12066,7 +12074,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) RExC_parse++; - defchar: { + defchar: { STRLEN len = 0; UV ender = 0; char *p; @@ -12105,7 +12113,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) s0 = s; - reparse: + reparse: /* We do the EXACTFish to EXACT node only if folding. (And we * don't need to figure this out until pass 2) */ @@ -12341,7 +12349,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (IN_ENCODING && ender < 0x100) goto recode_encoding; break; - recode_encoding: + recode_encoding: if (! RExC_override_recoding) { SV* enc = _get_encoding(); ender = reg_recode((const char)(U8)ender, &enc); @@ -12724,8 +12732,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } } /* End of verifying node ends with an appropriate char */ - loopdone: /* Jumped to when encounters something that shouldn't be in - the node */ + loopdone: /* Jumped to when encounters something that shouldn't be + in the node */ /* I (khw) don't know if you can get here with zero length, but the * old code handled this situation by creating a zero-length EXACT @@ -13193,7 +13201,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, RExC_parse++; } - no_close: + no_close: FAIL("Syntax error in (?[...])"); } @@ -13891,7 +13899,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, break; } - charclassloop: + charclassloop: namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ save_value = value; @@ -14258,7 +14266,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, goto recode_encoding; break; } - recode_encoding: + recode_encoding: if (! 
RExC_override_recoding) { SV* enc = _get_encoding(); value = reg_recode((const char)(U8)value, &enc); @@ -14886,7 +14894,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* The actual POSIXish node for all the rest depends on the * charset modifier. The ones in the first set depend only on - * ASCII or, if available on this platform, locale */ + * ASCII or, if available on this platform, also locale */ case ANYOF_ASCII: case ANYOF_NASCII: #ifdef HAS_ISASCII @@ -14896,19 +14904,27 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, #endif goto join_posix; - case ANYOF_NCASED: + /* The following don't have any matches in the upper Latin1 + * range, hence /d is equivalent to /u for them. Making it /u + * saves some branches at runtime */ + case ANYOF_DIGIT: + case ANYOF_NDIGIT: + case ANYOF_XDIGIT: + case ANYOF_NXDIGIT: + if (! DEPENDS_SEMANTICS) { + goto treat_as_default; + } + + op = POSIXU; + goto join_posix; + + /* The following change to CASED under /i */ case ANYOF_LOWER: case ANYOF_NLOWER: case ANYOF_UPPER: case ANYOF_NUPPER: - /* under /a could be alpha */ if (FOLD) { - if (ASCII_RESTRICTED) { - namedclass = ANYOF_ALPHA + (namedclass % 2); - } - else if (! 
LOC) { - break; - } + namedclass = ANYOF_CASED + (namedclass % 2); } /* FALLTHROUGH */ @@ -14916,12 +14932,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * We take advantage of the enum ordering of the charset * modifiers to get the exact node type, */ default: + treat_as_default: op = POSIXD + get_regex_charset(RExC_flags); if (op > POSIXA) { /* /aa is same as /a */ op = POSIXA; } - join_posix: + join_posix: /* The odd numbered ones are the complements of the * next-lower even number one */ if (namedclass % 2 == 1) { @@ -16638,7 +16655,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ sv_catpv(sv, t); } - out_dump: + out_dump: Safefree(origs); SvREFCNT_dec_NN(lv); diff --git a/regexec.c b/regexec.c index 6a209ad..dc940c3 100644 --- a/regexec.c +++ b/regexec.c @@ -1845,7 +1845,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* FALLTHROUGH */ - do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there + do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there are no glitches with fold-length differences between the target string and pattern */ @@ -1879,8 +1879,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } break; - do_exactf_utf8: - { + do_exactf_utf8: + { unsigned expansion; /* If one of the operands is in utf8, we can't use the simpler folding @@ -2021,7 +2021,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } else { - posix_utf8: + posix_utf8: classnum = (_char_class_number) FLAGS(c); if (classnum < _FIRST_NON_SWASH_CC) { while (s < strend) { @@ -4704,7 +4704,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case BOUNDA: /* /\b/a */ - bound_ascii_match_only: + bound_ascii_match_only: /* Here the string isn't utf8, or is utf8 and only ascii characters * are to match \w. 
In the latter case looking at the byte just * prior to the current one may be just the final byte of a @@ -4887,7 +4887,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) locinput += 2; } else { /* Handle above Latin-1 code points */ - utf8_posix_above_latin1: + utf8_posix_above_latin1: classnum = (_char_class_number) FLAGS(scan); if (classnum < _FIRST_NON_SWASH_CC) { @@ -5143,7 +5143,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) locinput += UTF8SKIP(locinput); } } - exit_utf8: + exit_utf8: if (locinput > reginfo->strend) sayNO; } break; @@ -5580,7 +5580,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) maxopenparen = 0; /* run the pattern returned from (??{...}) */ - eval_recurse_doit: /* Share code with GOSUB below this line + eval_recurse_doit: /* Share code with GOSUB below this line * At this point we expect the stack context to be * set up correctly */ @@ -6682,7 +6682,7 @@ NULL /* NOTREACHED */ NOT_REACHED; - curly_try_B_max: + curly_try_B_max: /* a successful greedy match: now try to match B */ if (cur_eval && cur_eval->u.eval.close_paren && cur_eval->u.eval.close_paren == (U32)ST.paren) { @@ -6732,7 +6732,7 @@ NULL #undef ST case END: /* last op of main pattern */ - fake_end: + fake_end: if (cur_eval) { /* we've just finished A in /(??{A})B/; now continue with B */ @@ -6983,7 +6983,7 @@ NULL /* this is a point to jump to in order to increment * locinput by one character */ - increment_locinput: + increment_locinput: assert(!NEXTCHR_IS_EOS); if (utf8_target) { locinput += PL_utf8skip[nextchr]; @@ -7376,7 +7376,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, case EXACTFU: utf8_flags = reginfo->is_utf8_pat ? 
FOLDEQ_S2_ALREADY_FOLDED : 0; - do_exactf: { + do_exactf: { int c1, c2; U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1]; @@ -7537,7 +7537,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } } else { - utf8_posix: + utf8_posix: classnum = (_char_class_number) FLAGS(p); if (classnum < _FIRST_NON_SWASH_CC) { -- Perl5 Master Repository
