In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/5ee57374e52ac64a2ab7681c2e0fa832e1c3a601?hp=055ae706a0637a77ba440aedf596d2ebac642051>
- Log ----------------------------------------------------------------- commit 5ee57374e52ac64a2ab7681c2e0fa832e1c3a601 Author: Yves Orton <[email protected]> Date: Thu Oct 27 22:35:21 2016 +0200 regcomp.c: document the trie common prefix logic I wrote this code some time ago. It is somewhat of a state machine with some interesting implicit assumptions which took me a while to remember. While I do it seems reasonable to document them so the next guy (maybe/probably me) doesn't have to think so hard. M regcomp.c commit d3d91c7ce972aa4a787621573eaa9f43187c1447 Author: Yves Orton <[email protected]> Date: Thu Oct 27 22:34:49 2016 +0200 regcomp.c: avoid some unnecessary work when it wont be used M regcomp.c commit ee0dfd0b018301d385524a3a56b30f5bbf76ac7d Author: Yves Orton <[email protected]> Date: Thu Oct 27 22:32:09 2016 +0200 regcomp.c: in trie common prefix logic rename idx to first_ofs Using 'idx' and 'ofs' interchangably is confusing, calling this first_ofs makes it more obvious what it is used for. M regcomp.c commit dca5fc4c983ec7d36d46ec5224844cb8a7a91e5b Author: Yves Orton <[email protected]> Date: Thu Oct 27 22:28:55 2016 +0200 regcomp.c: whitespace only change, break up dense code/long line M regcomp.c commit 20ed8c886149743bda473079228bf6caf641752a Author: Yves Orton <[email protected]> Date: Thu Oct 27 22:28:30 2016 +0200 regcomp.c: add a comment about the trie logic M regcomp.c commit 8bcafbf4d45903d8b322259c2c47d651129cddbe Author: Yves Orton <[email protected]> Date: Thu Oct 27 22:26:39 2016 +0200 regcomp.c: refactor TRIE bitmap logic to a macro In four places we use the same logic, so refactor it to one macro called from four places and avoid any future oversights missing one. M regcomp.c commit 3092ee0cc2fd064f94e1ba8dc866b0ba8efc3cb8 Author: Yves Orton <[email protected]> Date: Thu Oct 27 20:58:37 2016 +0200 optimise gv.c a bit (we could do better) We save a few ops for package vars starting with 'E' by checking the second char as well. We could probably be much smarter with this switch, but we would have to generate it, which involves its own issues. M gv.c ----------------------------------------------------------------------- Summary of changes: gv.c | 12 +++++--- regcomp.c | 102 ++++++++++++++++++++++++++++++++++++-------------------------- 2 files changed, 68 insertions(+), 46 deletions(-) diff --git a/gv.c b/gv.c index 3b09b8a398..1cf0d8dd74 100644 --- a/gv.c +++ b/gv.c @@ -1856,10 +1856,12 @@ S_gv_magicalize(pTHX_ GV *gv, HV *stash, const char *name, STRLEN len, if (len) { switch (*name) { case 'E': - if (memEQs(name, len, "EXPORT") + if ( + len >= 6 && name[1] == 'X' && + (memEQs(name, len, "EXPORT") ||memEQs(name, len, "EXPORT_OK") ||memEQs(name, len, "EXPORT_FAIL") - ||memEQs(name, len, "EXPORT_TAGS") + ||memEQs(name, len, "EXPORT_TAGS")) ) GvMULTI_on(gv); break; @@ -1921,10 +1923,12 @@ S_gv_magicalize(pTHX_ GV *gv, HV *stash, const char *name, STRLEN len, } break; case 'E': - if (memEQs(name, len, "EXPORT") + if ( + len >= 6 && name[1] == 'X' && + (memEQs(name, len, "EXPORT") ||memEQs(name, len, "EXPORT_OK") ||memEQs(name, len, "EXPORT_FAIL") - ||memEQs(name, len, "EXPORT_TAGS") + ||memEQs(name, len, "EXPORT_TAGS")) ) GvMULTI_on(gv); break; diff --git a/regcomp.c b/regcomp.c index a0782d3ff1..e9c7972e0c 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2426,6 +2426,21 @@ is the recommended Unicode-aware way of saying : ( state==1 ? special : 0 ) \ ) +#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \ +STMT_START { \ + TRIE_BITMAP_SET(trie, uvc); \ + /* store the folded codepoint */ \ + if ( folder ) \ + TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \ + \ + if ( !UTF ) { \ + /* store first byte of utf8 representation of */ \ + /* variant codepoints */ \ + if (! UVCHR_IS_INVARIANT(uvc)) { \ + TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \ + } \ + } \ +} STMT_END #define MADE_TRIE 1 #define MADE_JUMP_TRIE 2 #define MADE_EXACT_TRIE 4 @@ -2551,12 +2566,23 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, bitmap?*/ if (OP(noper) == NOTHING) { + /* skip past a NOTHING at the start of an alternation + * eg, /(?:)a|(?:b)/ should be the same as /a|b/ + */ regnode *noper_next= regnext(noper); if (noper_next < tail) noper= noper_next; } - if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) { + if ( noper < tail && + ( + OP(noper) == flags || + ( + flags == EXACTFU && + OP(noper) == EXACTFU_SS + ) + ) + ) { uc= (U8*)STRING(noper); e= uc + STR_LEN(noper); } else { @@ -2573,6 +2599,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S); } } + for ( ; uc < e ; uc += len ) { /* Look at each char in the current branch */ TRIE_CHARCOUNT(trie)++; @@ -2656,18 +2683,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, if ( set_bit ) { /* store the codepoint in the bitmap, and its folded * equivalent. */ - TRIE_BITMAP_SET(trie, uvc); - - /* store the folded codepoint */ - if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); - - if ( !UTF ) { - /* store first byte of utf8 representation of - variant codepoints */ - if (! UVCHR_IS_INVARIANT(uvc)) { - TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); - } - } + TRIE_BITMAP_SET_FOLDED(trie, uvc, folder); set_bit = 0; /* We've done our bit :-) */ } } else { @@ -3232,13 +3248,19 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, split out as an EXACT and put in front of the TRIE node. */ trie->startstate= 1; if ( trie->bitmap && !widecharmap && !trie->jump ) { + /* we want to find the first state that has more than + * one transition, if that state is not the first state + * then we have a common prefix which we can remove. + */ U32 state; for ( state = 1 ; state < trie->statecount-1 ; state++ ) { U32 ofs = 0; - I32 idx = -1; + I32 first_ofs = -1; /* keeps track of the ofs of the first + transition, -1 means none */ U32 count = 0; const U32 base = trie->states[ state ].trans.base; + /* does this state terminate an alternation? */ if ( trie->states[state].wordnum ) count = 1; @@ -3248,58 +3270,54 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) { if ( ++count > 1 ) { - SV **tmp = av_fetch( revcharmap, ofs, 0); - const U8 *ch = (U8*)SvPV_nolen_const( *tmp ); + /* we have more than one transition */ + SV **tmp; + U8 *ch; + /* if this is the first state there is no common prefix + * to extract, so we can exit */ if ( state == 1 ) break; + tmp = av_fetch( revcharmap, ofs, 0); + ch = (U8*)SvPV_nolen_const( *tmp ); + + /* if we are on count 2 then we need to initialize the + * bitmap, and store the previous char if there was one + * in it*/ if ( count == 2 ) { + /* clear the bitmap */ Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char); DEBUG_OPTIMISE_r( Perl_re_indentf( aTHX_ "New Start State=%"UVuf" Class: [", depth+1, (UV)state)); - if (idx >= 0) { - SV ** const tmp = av_fetch( revcharmap, idx, 0); + if (first_ofs >= 0) { + SV ** const tmp = av_fetch( revcharmap, first_ofs, 0); const U8 * const ch = (U8*)SvPV_nolen_const( *tmp ); - TRIE_BITMAP_SET(trie,*ch); - if ( folder ) - TRIE_BITMAP_SET(trie, folder[ *ch ]); - if ( !UTF ) { - /* store first byte of utf8 representation of - variant codepoints */ - if (! UVCHR_IS_INVARIANT(*ch)) { - TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(*ch)); - } - } + TRIE_BITMAP_SET_FOLDED(trie,*ch,folder); DEBUG_OPTIMISE_r( Perl_re_printf( aTHX_ "%s", (char*)ch) ); } } - TRIE_BITMAP_SET(trie,*ch); - if ( folder ) - TRIE_BITMAP_SET(trie,folder[ *ch ]); - if ( !UTF ) { - /* store first byte of utf8 representation of - variant codepoints */ - if (! UVCHR_IS_INVARIANT(*ch)) { - TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(*ch)); - } - } + /* store the current firstchar in the bitmap */ + TRIE_BITMAP_SET_FOLDED(trie,*ch,folder); DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch)); } - idx = ofs; + first_ofs = ofs; } } if ( count == 1 ) { - SV **tmp = av_fetch( revcharmap, idx, 0); + /* This state has only one transition, its transition is part + * of a common prefix - we need to concatenate the char it + * represents to what we have so far. */ + SV **tmp = av_fetch( revcharmap, first_ofs, 0); STRLEN len; char *ch = SvPV( *tmp, len ); DEBUG_OPTIMISE_r({ SV *sv=sv_newmortal(); - Perl_re_indentf( aTHX_ "Prefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n", + Perl_re_indentf( aTHX_ "Prefix State: %"UVuf" Ofs:%"UVuf" Char='%s'\n", depth+1, - (UV)state, (UV)idx, + (UV)state, (UV)first_ofs, pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, PL_colors[0], PL_colors[1], (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | -- Perl5 Master Repository
