In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/632c9f80dfaf91e6a695c9a916ab6136110e4ac7?hp=d050f2859975c1e0a222bc3689e73a4e39c58b87>
- Log ----------------------------------------------------------------- commit 632c9f80dfaf91e6a695c9a916ab6136110e4ac7 Author: Karl Williamson <pub...@khwilliamson.com> Date: Sun Dec 16 08:56:28 2012 -0700 regen/unicode_constants.pl: Add option to skip if undef I thought I would need this new functionality in this regen script, but ended up going a different route. But just in case someone might find this useful in the future, here it is. M regen/unicode_constants.pl commit 7d43c479c5220d368a2e5d94341c40f8d4cb1769 Author: Karl Williamson <pub...@khwilliamson.com> Date: Sat Dec 15 09:53:19 2012 -0700 regexec.c: More efficient Korean \X processing This refactors the code slightly that checks for Korean precomposed syllables in \X. It eliminates the PL_variable formerly used to keep track of things. M embed.fnc M embed.h M embedvar.h M intrpvar.h M proto.h M regexec.c M sv.c commit 22913b96d35efdf1a58eddd0cfba7640c55fbcc7 Author: Karl Williamson <pub...@khwilliamson.com> Date: Sat Dec 15 09:42:36 2012 -0700 regexec.c: Move #defines to earlier in the file They will be used in a later commit earlier. This also changes the wording of the comment slightly to give more explanation, since the context they are now found in is different M regexec.c ----------------------------------------------------------------------- Summary of changes: embed.fnc | 2 - embed.h | 1 - embedvar.h | 1 - intrpvar.h | 1 - proto.h | 6 --- regen/unicode_constants.pl | 19 ++++++--- regexec.c | 88 ++++++++----------------------------------- sv.c | 1 - 8 files changed, 30 insertions(+), 89 deletions(-) diff --git a/embed.fnc b/embed.fnc index 0a382f6..2be18ad 100644 --- a/embed.fnc +++ b/embed.fnc @@ -2037,8 +2037,6 @@ ERsn |U8* |reghop3 |NN U8 *s|I32 off|NN const U8 *lim ERsM |SV* |core_regclass_swash|NULLOK const regexp *prog \ |NN const struct regnode *node|bool doinit \ |NULLOK SV **listsvp -:not currently used EiR |bool |is_utf8_X_LV |NN const U8 *p -EiR |bool |is_utf8_X_LVT |NN const U8 *p #ifdef XXX_dmq ERsn |U8* |reghop4 |NN U8 *s|I32 off|NN const U8 *llim \ |NN const U8 *rlim diff --git a/embed.h b/embed.h index d6b1c2f..4ae36e3 100644 --- a/embed.h +++ b/embed.h @@ -970,7 +970,6 @@ #define core_regclass_swash(a,b,c,d) S_core_regclass_swash(aTHX_ a,b,c,d) #define find_byclass(a,b,c,d,e) S_find_byclass(aTHX_ a,b,c,d,e) #define isFOO_lc(a,b) S_isFOO_lc(aTHX_ a,b) -#define is_utf8_X_LVT(a) S_is_utf8_X_LVT(aTHX_ a) #define reg_check_named_buff_matched(a,b) S_reg_check_named_buff_matched(aTHX_ a,b) #define regcppop(a) S_regcppop(aTHX_ a) #define regcppush(a,b) S_regcppush(aTHX_ a,b) diff --git a/embedvar.h b/embedvar.h index 9fc6709..87791b4 100644 --- a/embedvar.h +++ b/embedvar.h @@ -350,7 +350,6 @@ #define PL_unitcheckav_save (vTHX->Iunitcheckav_save) #define PL_unlockhook (vTHX->Iunlockhook) #define PL_unsafe (vTHX->Iunsafe) -#define PL_utf8_X_LVT (vTHX->Iutf8_X_LVT) #define PL_utf8_X_extend (vTHX->Iutf8_X_extend) #define PL_utf8_X_regular_begin (vTHX->Iutf8_X_regular_begin) #define PL_utf8_alnum (vTHX->Iutf8_alnum) diff --git a/intrpvar.h b/intrpvar.h index 004989c..b513d22 100644 --- a/intrpvar.h +++ b/intrpvar.h @@ -625,7 +625,6 @@ PERLVAR(I, utf8_punct, SV *) PERLVAR(I, utf8_mark, SV *) PERLVAR(I, utf8_X_regular_begin, SV *) PERLVAR(I, utf8_X_extend, SV *) -PERLVAR(I, utf8_X_LVT, SV *) PERLVAR(I, utf8_toupper, SV *) PERLVAR(I, utf8_totitle, SV *) PERLVAR(I, utf8_tolower, SV *) diff --git a/proto.h b/proto.h index 7f4942e..2ab4429 100644 --- a/proto.h +++ b/proto.h @@ -6790,12 +6790,6 @@ STATIC char* S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, cons STATIC bool S_isFOO_lc(pTHX_ const U8 classnum, const U8 character) __attribute__warn_unused_result__; -PERL_STATIC_INLINE bool S_is_utf8_X_LVT(pTHX_ const U8 *p) - __attribute__warn_unused_result__ - __attribute__nonnull__(pTHX_1); -#define PERL_ARGS_ASSERT_IS_UTF8_X_LVT \ - assert(p) - STATIC I32 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1) diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl index 729bde8..48b43f4 100644 --- a/regen/unicode_constants.pl +++ b/regen/unicode_constants.pl @@ -39,6 +39,9 @@ END # white space from the initial token. # string indicates that the output is to be of the string form # described in the comments above that are placed in the file. +# string_skip_ifundef is the same as 'string', but instead of dying if the +# code point doesn't exist, the line is just skipped: no output is +# generated for it # first indicates that the output is to be of the FIRST_BYTE form. # tail indicates that the output is of the _TAIL form. # native indicates that the output is the code point, converted to the @@ -72,6 +75,7 @@ while ( <DATA> ) { my $name; my $cp; + my $undef_ok = $desired_name || $flag =~ /skip_if_undef/; if ($name_or_cp =~ /[^[:xdigit:]]/) { @@ -82,20 +86,23 @@ while ( <DATA> ) { } else { $cp = $name_or_cp; - $name = charnames::viacode("0$cp") // ""; # viacode requires a leading - # zero to be sure that the - # argument is hex - die "Unknown code point '$cp' at line $.: $_\n" unless defined $cp; + $name = charnames::viacode("0$cp"); # viacode requires a leading zero + # to be sure that the argument is + # hex + if (! defined $name) { + die "Unknown code point '$cp' at line $.: $_\n" unless $undef_ok; + $name = ""; + } } - $name = $desired_name if $name eq ""; + $name = $desired_name if $name eq "" && $desired_name; $name =~ s/ /_/g; # The macro name can have no blanks in it my $str = join "", map { sprintf "\\x%02X", $_ } unpack("U0C*", pack("U", hex $cp)); my $suffix = '_UTF8'; - if (! defined $flag || $flag eq 'string') { + if (! defined $flag || $flag =~ /^ string (_skip_if_undef)? $/x) { $str = "\"$str\""; # Will be a string constant } elsif ($flag eq 'tail') { $str =~ s/\\x..//; # Remove the first byte diff --git a/regexec.c b/regexec.c index c5ae04d..c4b949b 100644 --- a/regexec.c +++ b/regexec.c @@ -327,6 +327,15 @@ static const char* const non_utf8_target_but_utf8_required } \ } STMT_END +/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode. + * These are for the pre-composed Hangul syllables, which are all in a + * contiguous block and arranged there in such a way so as to facilitate + * alorithmic determination of their characteristics. As such, they don't need + * a swash, but can be determined by simple arithmetic. Almost all are + * GCB=LVT, but every 28th one is a GCB=LV */ +#define SBASE 0xAC00 /* Start of block */ +#define SCount 11172 /* Length of block */ +#define TCount 28 static void restore_pos(pTHX_ void *arg); @@ -4592,10 +4601,15 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if (locinput < PL_regeol && is_GCB_LV_LVT_V_utf8(locinput)) { - /* Otherwise keep going. Must be LV, LVT or V. - * See if LVT */ - if (is_utf8_X_LVT((U8*)locinput)) { + * See if LVT, by first ruling out V, then LV */ + if (! is_GCB_V_utf8(locinput) + /* All but every TCount one is LV */ + && (valid_utf8_to_uvchr((U8 *) locinput, + NULL) + - SBASE) + % TCount != 0) + { locinput += UTF8SKIP(locinput); } else { @@ -7732,74 +7746,6 @@ S_to_byte_substr(pTHX_ regexp *prog) return TRUE; } -/* These constants are for finding GCB=LV and GCB=LVT. These are for the - * pre-composed Hangul syllables, which are all in a contiguous block and - * arranged there in such a way so as to facilitate alorithmic determination of - * their characteristics. As such, they don't need a swash, but can be - * determined by simple arithmetic. Almost all are GCB=LVT, but every 28th one - * is a GCB=LV */ -#define SBASE 0xAC00 /* Start of block */ -#define SCount 11172 /* Length of block */ -#define TCount 28 - -#if 0 /* This routine is not currently used */ -PERL_STATIC_INLINE bool -S_is_utf8_X_LV(pTHX_ const U8 *p) -{ - /* Unlike most other similarly named routines here, this does not create a - * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */ - - dVAR; - - UV cp = valid_utf8_to_uvchr(p, NULL); - - PERL_ARGS_ASSERT_IS_UTF8_X_LV; - - /* The earliest Unicode releases did not have these precomposed Hangul - * syllables. Set to point to undef in that case, so will return false on - * every call */ - if (! PL_utf8_X_LV) { /* Set up if this is the first time called */ - PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0); - if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) { - SvREFCNT_dec(PL_utf8_X_LV); - PL_utf8_X_LV = &PL_sv_undef; - } - } - - return (PL_utf8_X_LV != &PL_sv_undef - && cp >= SBASE && cp < SBASE + SCount - && (cp - SBASE) % TCount == 0); /* Only every TCount one is LV */ -} -#endif - -PERL_STATIC_INLINE bool -S_is_utf8_X_LVT(pTHX_ const U8 *p) -{ - /* Unlike most other similarly named routines here, this does not create a - * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */ - - dVAR; - - UV cp = valid_utf8_to_uvchr(p, NULL); - - PERL_ARGS_ASSERT_IS_UTF8_X_LVT; - - /* The earliest Unicode releases did not have these precomposed Hangul - * syllables. Set to point to undef in that case, so will return false on - * every call */ - if (! PL_utf8_X_LVT) { /* Set up if this is the first time called */ - PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0); - if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) { - SvREFCNT_dec(PL_utf8_X_LVT); - PL_utf8_X_LVT = &PL_sv_undef; - } - } - - return (PL_utf8_X_LVT != &PL_sv_undef - && cp >= SBASE && cp < SBASE + SCount - && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */ -} - /* * Local variables: * c-indentation-style: bsd diff --git a/sv.c b/sv.c index 73fa710..50f8e66 100644 --- a/sv.c +++ b/sv.c @@ -13649,7 +13649,6 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags, PL_utf8_mark = sv_dup_inc(proto_perl->Iutf8_mark, param); PL_utf8_X_regular_begin = sv_dup_inc(proto_perl->Iutf8_X_regular_begin, param); PL_utf8_X_extend = sv_dup_inc(proto_perl->Iutf8_X_extend, param); - PL_utf8_X_LVT = sv_dup_inc(proto_perl->Iutf8_X_LVT, param); PL_utf8_toupper = sv_dup_inc(proto_perl->Iutf8_toupper, param); PL_utf8_totitle = sv_dup_inc(proto_perl->Iutf8_totitle, param); PL_utf8_tolower = sv_dup_inc(proto_perl->Iutf8_tolower, param); -- Perl5 Master Repository