In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/632c9f80dfaf91e6a695c9a916ab6136110e4ac7?hp=d050f2859975c1e0a222bc3689e73a4e39c58b87>

- Log -----------------------------------------------------------------
commit 632c9f80dfaf91e6a695c9a916ab6136110e4ac7
Author: Karl Williamson <pub...@khwilliamson.com>
Date:   Sun Dec 16 08:56:28 2012 -0700

    regen/unicode_constants.pl: Add option to skip if undef
    
    I thought I would need this new functionality in this regen script, but
    ended up going a different route.  But just in case someone might find
    this useful in the future, here it is.

M       regen/unicode_constants.pl

commit 7d43c479c5220d368a2e5d94341c40f8d4cb1769
Author: Karl Williamson <pub...@khwilliamson.com>
Date:   Sat Dec 15 09:53:19 2012 -0700

    regexec.c: More efficient Korean \X processing
    
    This refactors the code slightly that checks for Korean precomposed
    syllables in \X.  It eliminates the PL_variable formerly used to keep
    track of things.

M       embed.fnc
M       embed.h
M       embedvar.h
M       intrpvar.h
M       proto.h
M       regexec.c
M       sv.c

commit 22913b96d35efdf1a58eddd0cfba7640c55fbcc7
Author: Karl Williamson <pub...@khwilliamson.com>
Date:   Sat Dec 15 09:42:36 2012 -0700

    regexec.c: Move #defines to earlier in the file
    
    They will be used in a later commit earlier.  This also changes the
    wording of the comment slightly to give more explanation, since the
    context they are now found in is different

M       regexec.c
-----------------------------------------------------------------------

Summary of changes:
 embed.fnc                  |    2 -
 embed.h                    |    1 -
 embedvar.h                 |    1 -
 intrpvar.h                 |    1 -
 proto.h                    |    6 ---
 regen/unicode_constants.pl |   19 ++++++---
 regexec.c                  |   88 ++++++++-----------------------------------
 sv.c                       |    1 -
 8 files changed, 30 insertions(+), 89 deletions(-)

diff --git a/embed.fnc b/embed.fnc
index 0a382f6..2be18ad 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -2037,8 +2037,6 @@ ERsn      |U8*    |reghop3        |NN U8 *s|I32 off|NN 
const U8 *lim
 ERsM   |SV*    |core_regclass_swash|NULLOK const regexp *prog \
                                |NN const struct regnode *node|bool doinit \
                                |NULLOK SV **listsvp
-:not currently used EiR        |bool   |is_utf8_X_LV           |NN const U8 *p
-EiR    |bool   |is_utf8_X_LVT          |NN const U8 *p
 #ifdef XXX_dmq
 ERsn   |U8*    |reghop4        |NN U8 *s|I32 off|NN const U8 *llim \
                                |NN const U8 *rlim
diff --git a/embed.h b/embed.h
index d6b1c2f..4ae36e3 100644
--- a/embed.h
+++ b/embed.h
@@ -970,7 +970,6 @@
 #define core_regclass_swash(a,b,c,d)   S_core_regclass_swash(aTHX_ a,b,c,d)
 #define find_byclass(a,b,c,d,e)        S_find_byclass(aTHX_ a,b,c,d,e)
 #define isFOO_lc(a,b)          S_isFOO_lc(aTHX_ a,b)
-#define is_utf8_X_LVT(a)       S_is_utf8_X_LVT(aTHX_ a)
 #define reg_check_named_buff_matched(a,b)      
S_reg_check_named_buff_matched(aTHX_ a,b)
 #define regcppop(a)            S_regcppop(aTHX_ a)
 #define regcppush(a,b)         S_regcppush(aTHX_ a,b)
diff --git a/embedvar.h b/embedvar.h
index 9fc6709..87791b4 100644
--- a/embedvar.h
+++ b/embedvar.h
@@ -350,7 +350,6 @@
 #define PL_unitcheckav_save    (vTHX->Iunitcheckav_save)
 #define PL_unlockhook          (vTHX->Iunlockhook)
 #define PL_unsafe              (vTHX->Iunsafe)
-#define PL_utf8_X_LVT          (vTHX->Iutf8_X_LVT)
 #define PL_utf8_X_extend       (vTHX->Iutf8_X_extend)
 #define PL_utf8_X_regular_begin        (vTHX->Iutf8_X_regular_begin)
 #define PL_utf8_alnum          (vTHX->Iutf8_alnum)
diff --git a/intrpvar.h b/intrpvar.h
index 004989c..b513d22 100644
--- a/intrpvar.h
+++ b/intrpvar.h
@@ -625,7 +625,6 @@ PERLVAR(I, utf8_punct,      SV *)
 PERLVAR(I, utf8_mark,  SV *)
 PERLVAR(I, utf8_X_regular_begin, SV *)
 PERLVAR(I, utf8_X_extend, SV *)
-PERLVAR(I, utf8_X_LVT, SV *)
 PERLVAR(I, utf8_toupper, SV *)
 PERLVAR(I, utf8_totitle, SV *)
 PERLVAR(I, utf8_tolower, SV *)
diff --git a/proto.h b/proto.h
index 7f4942e..2ab4429 100644
--- a/proto.h
+++ b/proto.h
@@ -6790,12 +6790,6 @@ STATIC char*     S_find_byclass(pTHX_ regexp * prog, 
const regnode *c, char *s, cons
 STATIC bool    S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
                        __attribute__warn_unused_result__;
 
-PERL_STATIC_INLINE bool        S_is_utf8_X_LVT(pTHX_ const U8 *p)
-                       __attribute__warn_unused_result__
-                       __attribute__nonnull__(pTHX_1);
-#define PERL_ARGS_ASSERT_IS_UTF8_X_LVT \
-       assert(p)
-
 STATIC I32     S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const 
regnode *scan)
                        __attribute__warn_unused_result__
                        __attribute__nonnull__(pTHX_1)
diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl
index 729bde8..48b43f4 100644
--- a/regen/unicode_constants.pl
+++ b/regen/unicode_constants.pl
@@ -39,6 +39,9 @@ END
 # white space from the initial token.
 #   string  indicates that the output is to be of the string form
 #           described in the comments above that are placed in the file.
+#   string_skip_ifundef  is the same as 'string', but instead of dying if the
+#           code point doesn't exist, the line is just skipped: no output is
+#           generated for it
 #   first   indicates that the output is to be of the FIRST_BYTE form.
 #   tail    indicates that the output is of the _TAIL form.
 #   native  indicates that the output is the code point, converted to the
@@ -72,6 +75,7 @@ while ( <DATA> ) {
 
     my $name;
     my $cp;
+    my $undef_ok = $desired_name || $flag =~ /skip_if_undef/;
 
     if ($name_or_cp =~ /[^[:xdigit:]]/) {
 
@@ -82,20 +86,23 @@ while ( <DATA> ) {
     }
     else {
         $cp = $name_or_cp;
-        $name = charnames::viacode("0$cp") // ""; # viacode requires a leading
-                                                  # zero to be sure that the
-                                                  # argument is hex
-        die "Unknown code point '$cp' at line $.: $_\n" unless defined $cp;
+        $name = charnames::viacode("0$cp"); # viacode requires a leading zero
+                                            # to be sure that the argument is
+                                            # hex
+        if (! defined $name) {
+            die "Unknown code point '$cp' at line $.: $_\n" unless $undef_ok;
+            $name = "";
+        }
     }
 
-    $name = $desired_name if $name eq "";
+    $name = $desired_name if $name eq "" && $desired_name;
     $name =~ s/ /_/g;   # The macro name can have no blanks in it
 
     my $str = join "", map { sprintf "\\x%02X", $_ }
                        unpack("U0C*", pack("U", hex $cp));
 
     my $suffix = '_UTF8';
-    if (! defined $flag  || $flag eq 'string') {
+    if (! defined $flag  || $flag =~ /^ string (_skip_if_undef)? $/x) {
         $str = "\"$str\"";  # Will be a string constant
     } elsif ($flag eq 'tail') {
             $str =~ s/\\x..//;  # Remove the first byte
diff --git a/regexec.c b/regexec.c
index c5ae04d..c4b949b 100644
--- a/regexec.c
+++ b/regexec.c
@@ -327,6 +327,15 @@ static const char* const non_utf8_target_but_utf8_required
     } \
 } STMT_END 
 
+/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
+ * These are for the pre-composed Hangul syllables, which are all in a
+ * contiguous block and arranged there in such a way so as to facilitate
+ * alorithmic determination of their characteristics.  As such, they don't need
+ * a swash, but can be determined by simple arithmetic.  Almost all are
+ * GCB=LVT, but every 28th one is a GCB=LV */
+#define SBASE 0xAC00    /* Start of block */
+#define SCount 11172    /* Length of block */
+#define TCount 28
 
 static void restore_pos(pTHX_ void *arg);
 
@@ -4592,10 +4601,15 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char 
*startpos, regnode *prog)
                             if (locinput < PL_regeol
                                 && is_GCB_LV_LVT_V_utf8(locinput))
                             {
-
                                 /* Otherwise keep going.  Must be LV, LVT or V.
-                                 * See if LVT */
-                                if (is_utf8_X_LVT((U8*)locinput)) {
+                                 * See if LVT, by first ruling out V, then LV 
*/
+                                if (! is_GCB_V_utf8(locinput)
+                                        /* All but every TCount one is LV */
+                                    && (valid_utf8_to_uvchr((U8 *) locinput,
+                                                                         NULL)
+                                                                        - 
SBASE)
+                                        % TCount != 0)
+                                {
                                     locinput += UTF8SKIP(locinput);
                                 } else {
 
@@ -7732,74 +7746,6 @@ S_to_byte_substr(pTHX_ regexp *prog)
     return TRUE;
 }
 
-/* These constants are for finding GCB=LV and GCB=LVT.  These are for the
- * pre-composed Hangul syllables, which are all in a contiguous block and
- * arranged there in such a way so as to facilitate alorithmic determination of
- * their characteristics.  As such, they don't need a swash, but can be
- * determined by simple arithmetic.  Almost all are GCB=LVT, but every 28th one
- * is a GCB=LV */
-#define SBASE 0xAC00    /* Start of block */
-#define SCount 11172    /* Length of block */
-#define TCount 28
-
-#if 0   /* This routine is not currently used */
-PERL_STATIC_INLINE bool
-S_is_utf8_X_LV(pTHX_ const U8 *p)
-{
-    /* Unlike most other similarly named routines here, this does not create a
-     * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */
-
-    dVAR;
-
-    UV cp = valid_utf8_to_uvchr(p, NULL);
-
-    PERL_ARGS_ASSERT_IS_UTF8_X_LV;
-
-    /* The earliest Unicode releases did not have these precomposed Hangul
-     * syllables.  Set to point to undef in that case, so will return false on
-     * every call */
-    if (! PL_utf8_X_LV) {   /* Set up if this is the first time called */
-        PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
-        if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
-            SvREFCNT_dec(PL_utf8_X_LV);
-            PL_utf8_X_LV = &PL_sv_undef;
-        }
-    }
-
-    return (PL_utf8_X_LV != &PL_sv_undef
-            && cp >= SBASE && cp < SBASE + SCount
-            && (cp - SBASE) % TCount == 0); /* Only every TCount one is LV */
-}
-#endif
-
-PERL_STATIC_INLINE bool
-S_is_utf8_X_LVT(pTHX_ const U8 *p)
-{
-    /* Unlike most other similarly named routines here, this does not create a
-     * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */
-
-    dVAR;
-
-    UV cp = valid_utf8_to_uvchr(p, NULL);
-
-    PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
-
-    /* The earliest Unicode releases did not have these precomposed Hangul
-     * syllables.  Set to point to undef in that case, so will return false on
-     * every call */
-    if (! PL_utf8_X_LVT) {   /* Set up if this is the first time called */
-        PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
-        if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
-            SvREFCNT_dec(PL_utf8_X_LVT);
-            PL_utf8_X_LVT = &PL_sv_undef;
-        }
-    }
-
-    return (PL_utf8_X_LVT != &PL_sv_undef
-            && cp >= SBASE && cp < SBASE + SCount
-            && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV 
*/
-}
-
 /*
  * Local variables:
  * c-indentation-style: bsd
diff --git a/sv.c b/sv.c
index 73fa710..50f8e66 100644
--- a/sv.c
+++ b/sv.c
@@ -13649,7 +13649,6 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
     PL_utf8_mark       = sv_dup_inc(proto_perl->Iutf8_mark, param);
     PL_utf8_X_regular_begin    = sv_dup_inc(proto_perl->Iutf8_X_regular_begin, 
param);
     PL_utf8_X_extend   = sv_dup_inc(proto_perl->Iutf8_X_extend, param);
-    PL_utf8_X_LVT      = sv_dup_inc(proto_perl->Iutf8_X_LVT, param);
     PL_utf8_toupper    = sv_dup_inc(proto_perl->Iutf8_toupper, param);
     PL_utf8_totitle    = sv_dup_inc(proto_perl->Iutf8_totitle, param);
     PL_utf8_tolower    = sv_dup_inc(proto_perl->Iutf8_tolower, param);

--
Perl5 Master Repository

Reply via email to