[perl.git] branch blead updated. v5.29.3-33-g107b598f38

Karl Williamson Sun, 30 Sep 2018 09:47:06 -0700

In perl.git, the branch blead has been updated

<https://perl5.git.perl.org/perl.git/commitdiff/107b598f388e4da5b3fe8967f1af8a3083b91294?hp=5133224233bb01d0fecad317741666793a0a15e1>


- Log -----------------------------------------------------------------
commit 107b598f388e4da5b3fe8967f1af8a3083b91294
Author: Karl Williamson <k...@cpan.org>
Date:   Sat Sep 29 09:13:03 2018 -0600

    regexec.c: Comments, White-space only
    
    A few clarifications

commit a49e18618eb19b48f03f4057fda8848c109e2371
Author: Karl Williamson <k...@cpan.org>
Date:   Sat Sep 29 09:07:39 2018 -0600

    regexec.c: Remove obsolete comments
    
    The first comment listed an item as a TODO that was recommended by
    Unicode.  That recommendation is being rescinded in Unicode 12.0 based
    on a ticket I filed against Unicode, which in turn was based on feedback
    from Asmus Freitag.
    
    The second comment was obsoleted by later code changes.

commit debce029c15e140a130915fddf3799a92814a061
Author: Karl Williamson <k...@cpan.org>
Date:   Sat Sep 29 09:03:12 2018 -0600

    regexec.c: Remove macro use for further clarity
    
    Commit 4c83fb55d7096a1d0e6a7a8e25d20b186be3281d added a macro for
    clarity.  I have since realized that it is even clearer to spell things
    as this commit now does.

commit e9c7e9d5f2fc347241a1e059223cbb02398b19bf
Author: Karl Williamson <k...@cpan.org>
Date:   Sun Sep 30 10:41:04 2018 -0600

    re/script_run.t: White-space only

commit 393e5a4585b92e635cfc4eee34da8f86f3bfd2af
Author: Karl Williamson <k...@cpan.org>
Date:   Sun Sep 30 10:38:02 2018 -0600

    PATCH: [perl #133547]: script run broken
    
    All scripts can have the ASCII digits for their numbers.  Scripts with
    their own digits can alternatively use those.  Only one of these two
    sets can be used in a script run.  The decision as to which set to use
    must be deferred until the first digit is encountered, as otherwise we
    don't know which set will be used.  Prior to this commit, the decision
    was being made prematurely in some cases.  As a result of this change,
    the non-ASCII-digits in the Common script need to be special-cased, and
    different criteria are used to decide if we need to look up whether a
    character is a digit or not.

commit 81ec018c6daca2b4c8c87eb335a371b4c90753f3
Author: Karl Williamson <k...@cpan.org>
Date:   Sun Sep 30 10:33:22 2018 -0600

    regexec.c: Rename variable
    
    The new name is clearer as to its meaning, more so after the next
    commit.

-----------------------------------------------------------------------

Summary of changes:
 regexec.c         | 146 +++++++++++++++++++++++++-----------------------------
 t/re/script_run.t |  13 +++--
 2 files changed, 77 insertions(+), 82 deletions(-)

diff --git a/regexec.c b/regexec.c
index 16a230997e..7dbc8400e6 100644
--- a/regexec.c
+++ b/regexec.c
@@ -10280,17 +10280,6 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
      * These are all defined in charclass_invlists.h */
 
     /* XXX Here are the additional things UTS 39 says could be done:
-     * Mark Chinese strings as “mixed script” if they contain both simplified
-     * (S) and traditional (T) Chinese characters, using the Unihan data in the
-     * Unicode Character Database [UCD].  The criterion can only be applied if
-     * the language of the string is known to be Chinese. So, for example, the
-     * string “写真だけの結婚式 ” is Japanese, and should not be marked as
-     * mixed script because of a mixture of S and T characters.  Testing for
-     * whether a character is S or T needs to be based not on whether the
-     * character has a S or T variant , but whether the character is an S or T
-     * variant. khw notes that the sample contains a Hiragana character, and it
-     * is unclear if absence of any foreign script marks the script as
-     * "Chinese"
      *
      * Forbid sequences of the same nonspacing mark
      *
@@ -10303,9 +10292,9 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
     SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
     UV * decimals_array = invlist_array(decimals_invlist);
 
-    /* What code point is the digit '0' of the script run? */
+    /* What code point is the digit '0' of the script run? (0 meaning FALSE if
+     * not currently known) */
     UV zero_of_run = 0;
-#define SEEN_A_DIGIT (zero_of_run != 0)
 
     SCX_enum script_of_run  = SCX_INVALID;   /* Illegal value */
     SCX_enum script_of_char = SCX_INVALID;
@@ -10316,8 +10305,6 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
     PERL_UINT_FAST8_T intersection_len = 0;
 
     bool retval = TRUE;
-
-    /* This is supposed to be a return parameter, but currently unused */
     SCX_enum * ret_script = NULL;
 
     assert(send >= s);
@@ -10325,7 +10312,8 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
     PERL_ARGS_ASSERT_ISSCRIPT_RUN;
 
     /* All code points in 0..255 are either Common or Latin, so must be a
-     * script run.  We can special case it */
+     * script run.  We can return immediately unless we need to know which
+     * script it is. */
     if (! utf8_target && LIKELY(send > s)) {
         if (ret_script == NULL) {
             return TRUE;
@@ -10339,26 +10327,30 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
             }
         }
 
-        /* If all are Common ... */
+        /* Here, all are Common */
         *ret_script = SCX_Common;
         return TRUE;
     }
 
     /* Look at each character in the sequence */
     while (s < send) {
+        /* If the current character being examined is a digit, this is the code
+         * point of the zero for its sequence of 10 */
+        UV zero_of_char;
+
         UV cp;
 
         /* The code allows all scripts to use the ASCII digits.  This is
          * because they are used in commerce even in scripts that have their
-         * own set.  Hence any ASCII ones found are ok, unless a digit from
-         * another set has already been encountered.  (The other digit ranges
-         * in Common are not similarly blessed) */
+         * own set.  Hence any ASCII ones found are ok, unless and until a
+         * digit from another set has already been encountered.  (The other
+         * digit ranges in Common are not similarly blessed) */
         if (UNLIKELY(isDIGIT(*s))) {
             if (UNLIKELY(script_of_run == SCX_Unknown)) {
                 retval = FALSE;
                 break;
             }
-            if (SEEN_A_DIGIT) {
+            if (zero_of_run) {
                 if (zero_of_run != '0') {
                     retval = FALSE;
                     break;
@@ -10384,7 +10376,7 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
         /* If is within the range [+0 .. +9] of the script's zero, it also is a
          * digit in that script.  We can skip the rest of this code for this
          * character. */
-        if (UNLIKELY(   SEEN_A_DIGIT
+        if (UNLIKELY(   zero_of_run
                      && cp >= zero_of_run
                      && cp - zero_of_run <= 9))
         {
@@ -10449,7 +10441,7 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
 
             /* But Common contains several sets of digits.  Only the '0' set
              * can be part of another script. */
-            if (SEEN_A_DIGIT && zero_of_run != '0') {
+            if (zero_of_run && zero_of_run != '0') {
                 retval = FALSE;
                 break;
             }
@@ -10457,16 +10449,6 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
             script_of_run = script_of_char;
         }
 
-        /* All decimal digits must be from the same sequence of 10.  Above, we
-         * handled any ASCII digits without descending to here.  We also
-         * handled the case where we already knew what digit sequence is the
-         * one to use, and the character is in that sequence.  Now that we know
-         * the script, we can use script_zeros[] to directly find which
-         * sequence the script uses, except in a few cases it returns 0 */
-        if (UNLIKELY(zero_of_run == 0 && script_of_char >= 0)) {
-            zero_of_run = script_zeros[script_of_char];
-        }
-
         /* Now we can see if the script of the character is the same as that of
          * the run */
         if (LIKELY(script_of_char == script_of_run)) {
@@ -10474,7 +10456,6 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
             goto scripts_match;
         }
 
-
         /* Here, the script of the run isn't Common.  But characters in Common
          * match any script */
         if (script_of_char == SCX_Common) {
@@ -10624,54 +10605,61 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
         /* Here, the script of the character is compatible with that of the
          * run.  That means that in most cases, it continues the script run.
          * Either it and the run match exactly, or one or both can be in any of
-         * several scripts, and the intersection is not empty.  But if the
-         * character is a decimal digit, we need further handling.  If we
-         * haven't seen a digit before, it would establish what set of 10 all
-         * must come from; and if we have established a set, we need to check
-         * that this is in it.
-         *
-         * But there are cases we can rule out without having to look up if
-         * this is a digit:
-         *   a.  All instances of [0-9] have been dealt with earlier.
-         *   b.  The next digit encoded by Unicode is 1600 code points further
-         *       on, so if the code point in this loop iteration is less than
-         *       that, it isn't a digit.
-         *   c.  Most scripts that have digits have a single set of 10.  If
-         *       we've encountered a digit in such a script, 'zero_of_run' is
-         *       set to the code point (call it z) whose numeric value is 0.
-         *       If the code point in this loop iteration is in the range
-         *       z..z+9, it is in the script's set of 10, and we've actually
-         *       handled it earlier in this function and won't reach this
-         *       point.  But, code points in that script that aren't in that
-         *       range can't be digits, so we don't have to look any such up.
-         *       We can tell if this script is such a one by looking at
-         *       'script_zeros[]' for it.  It is non-zero iff it has a single
-         *       set of digits.  This rule doesn't apply if we haven't narrowed
-         *       down the possible scripts to a single one yet.  Nor if the
-         *       zero of the run is '0', as that also hasn't narrowed things
-         *       down completely */
-        if (    cp >= FIRST_NON_ASCII_DECIMAL_DIGIT
-            && (   intersection
-                || script_of_char < 0   /* Also implies an intersection */
-                || zero_of_run == '0'
-                || script_zeros[script_of_char] == 0))
+         * several scripts, and the intersection is not empty.  However, if the
+         * character is a decimal digit, it could still mean failure if it is
+         * from the wrong sequence of 10.  So, we need to look at if it's a
+         * digit.  We've already handled the 10 decimal digits, and the next
+         * lowest one is this one: */
+        if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
+            continue;   /* Not a digit; this character is part of the run */
+        }
+
+        /* If we have a definitive '0' for the script of this character, we
+         * know that for this to be a digit, it must be in the range of +0..+9
+         * of that zero. */
+        if (   script_of_char >= 0
+            && (zero_of_char = script_zeros[script_of_char]))
         {
-            SSize_t range_zero_index;
-            range_zero_index = _invlist_search(decimals_invlist, cp);
-            if (   LIKELY(range_zero_index >= 0)
-                && ELEMENT_RANGE_MATCHES_INVLIST(range_zero_index))
+            if (   cp < zero_of_char
+                || cp > zero_of_char + 9)
             {
-                UV range_zero = decimals_array[range_zero_index];
-                if (SEEN_A_DIGIT) {
-                    if (zero_of_run != range_zero) {
-                        retval = FALSE;
-                        break;
-                    }
-                }
-                else {
-                    zero_of_run = range_zero;
-                }
+                continue;   /* Not a digit; this character is part of the run
+                             */
+            }
+
+        }
+        else {  /* Need to look up if this character is a digit or not */
+            SSize_t index_of_zero_of_char;
+            index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
+            if (     UNLIKELY(index_of_zero_of_char < 0)
+                || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
+            {
+                continue;   /* Not a digit; this character is part of the run.
+                             */
             }
+
+            zero_of_char = decimals_array[index_of_zero_of_char];
+        }
+
+        /* Here, the character is a decimal digit, and the zero of its sequence
+         * of 10 is in 'zero_of_char'.  If we already have a zero for this run,
+         * they better be the same. */
+        if (zero_of_run) {
+            if (zero_of_run != zero_of_char) {
+                retval = FALSE;
+                break;
+            }
+        }
+        else if (script_of_char == SCX_Common && script_of_run != SCX_Common) {
+
+            /* Here, the script run isn't Common, but the current digit is in
+             * Common, and isn't '0'-'9' (those were handled earlier).   Only
+             * '0'-'9' are acceptable in non-Common scripts. */
+            retval = FALSE;
+            break;
+        }
+        else {  /* Otherwise we now have a zero for this run */
+            zero_of_run = zero_of_char;
         }
     } /* end of looping through CLOSESR text */
 
diff --git a/t/re/script_run.t b/t/re/script_run.t
index 10c71034c4..035a9104aa 100644
--- a/t/re/script_run.t
+++ b/t/re/script_run.t
@@ -92,9 +92,16 @@ foreach my $type ('script_run', 'sr', 'atomic_script_run', 'asr') {
 }
 
     # Until fixed, this was skipping the '['
-    unlike("abc]c", qr/^ (*sr:a(*sr:[bc]*)c) $/x, "Doesn't skip parts of exact matches");
+    unlike("abc]c", qr/^ (*sr:a(*sr:[bc]*)c) $/x,
+           "Doesn't skip parts of exact matches");
 
-      like("abc", qr/(*asr:a[bc]*c)/, "Outer asr works on a run");
-    unlike("abc", qr/(*asr:a(*asr:[bc]*)c)/, "Nested asr works to exclude some things");
+    like("abc", qr/(*asr:a[bc]*c)/, "Outer asr works on a run");
+    unlike("abc", qr/(*asr:a(*asr:[bc]*)c)/,
+           "Nested asr works to exclude some things");
+
+    like("\x{0980}12\x{0993}", qr/^(*sr:.{4})/,
+         "Script with own zero works with ASCII digits"); # perl #133547
+    like("\x{3041}12\x{3041}", qr/^(*sr:.{4})/,
+         "Script without own zero works with ASCII digits");
 
 done_testing();

-- 
Perl5 Master Repository

[perl.git] branch blead updated. v5.29.3-33-g107b598f38

Reply via email to