[perl.git] branch blead, updated. v5.25.8-285-g323e4ec46d

Karl Williamson Thu, 19 Jan 2017 13:10:01 -0800

In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/323e4ec46db0dc8d22d9eae846bc5f0fe9d642ec?hp=92ff660bc8c29480a311c0b95430f16c175961db>


- Log -----------------------------------------------------------------
commit 323e4ec46db0dc8d22d9eae846bc5f0fe9d642ec
Author: Karl Williamson <[email protected]>
Date:   Mon Jan 16 17:52:55 2017 -0700

    perlebcdic: Rmv obsolete text about 'use encoding'

M       pod/perlebcdic.pod

commit 2dc9bc8419687eac4909c664fdbbb1c6a2a8c683
Author: Karl Williamson <[email protected]>
Date:   Mon Jan 16 17:51:20 2017 -0700

    perlop: Remove obsolete text
    
    'use encoding' no longer works.  Don't say it does.

M       pod/perlop.pod

commit 8bafd282a6fa128ebec02d04e12692c4b75a7d3f
Author: Karl Williamson <[email protected]>
Date:   Fri Jan 13 13:53:17 2017 -0700

    Fix bug with a digit range under re 'strict'
    
    "use re 'strict'" is supposed to warn if the start and end points of a
    range are both digits but aren't from the same group of 10; for example,
    if you mix Bengali and Thai digits.  It wasn't working properly for the
    5 groups of mathematical digits starting at U+1D7CE.  This commit fixes
    that, and refactors the code to bail out as soon as it discovers that no
    warning is warranted, instead of doing unnecessary work.

M       charclass_invlists.h
M       lib/unicore/mktables
M       regcharclass.h
M       regcomp.c
M       t/re/reg_mesg.t
-----------------------------------------------------------------------

Summary of changes:
 charclass_invlists.h |   2 +-
 lib/unicore/mktables |  12 ++++++
 pod/perlebcdic.pod   |   3 --
 pod/perlop.pod       |   7 +---
 regcharclass.h       |   2 +-
 regcomp.c            | 110 +++++++++++++++++++++++++++++++++------------------
 t/re/reg_mesg.t      |   3 ++
 7 files changed, 90 insertions(+), 49 deletions(-)

diff --git a/charclass_invlists.h b/charclass_invlists.h
index 038336db20..732b6d0a8a 100644
--- a/charclass_invlists.h
+++ b/charclass_invlists.h
@@ -95407,7 +95407,7 @@ static const U8 WB_table[24][24] = {
  * 37f6186253da9824bdb27f4ad867bfe8c25d4dc6bdb2f05585e40a034675a348 
lib/unicore/extracted/DLineBreak.txt
  * ef24061b5a5dc93d7e90c2e34530ec757180ee75d872cba65ffc946e52624ae8 
lib/unicore/extracted/DNumType.txt
  * a197371fec9a1b517058b440841f60f9378d81682084eef8db22a88cb2f96e90 
lib/unicore/extracted/DNumValues.txt
- * 066d6e75f95cf6794161c8ac0b1a40990277de90eefb913be2e675a7cba38d59 
lib/unicore/mktables
+ * 4bcfb4545be21663ca38a2acbfcbf2b0f3252652a34b50f1a56ef76cb959861b 
lib/unicore/mktables
  * cdecb300baad839a6f62791229f551a4fa33f3cbdca08e378dc976466354e778 
lib/unicore/version
  * 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c 
regen/charset_translations.pl
  * 9534d0cc3914fa1f5d574332c3199605c3d14f8691a0729d68d8498ac2b36280 
regen/mk_invlists.pl
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index fa1f1f4445..542461742d 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -13699,6 +13699,18 @@ numerals.
 END
     ));
 
+    # Make sure this assumption in perl core code is valid in this Unicode
+    # release, with known exceptions
+    foreach my $range (property_ref('Numeric-Type')->table('Decimal')->ranges) 
{
+        next if $range->end - $range->start == 9;
+        next if $range->start == 0x1D7CE;   # This whole range was added in 3.1
+        next if $range->end == 0x19DA && $v_version eq v5.2.0;
+        next if $range->end - $range->start < 9 && $v_version le 4.0.0;
+        Carp::my_carp("Range $range unexpectedly doesn't contain 10"
+                    . " decimal digits.  Code in regcomp.c assumes it does,"
+                    . " and will have to be fixed.  Proceeding anyway.");
+    }
+
     Property->new('Legacy_Case_Folding',
                     File => "Fold",
                     Directory => $map_directory,
diff --git a/pod/perlebcdic.pod b/pod/perlebcdic.pod
index 6dd8e10c4a..288a71f877 100644
--- a/pod/perlebcdic.pod
+++ b/pod/perlebcdic.pod
@@ -1855,9 +1855,6 @@ EBCDIC platforms.  And some of the failures are real 
bugs.  If you
 compile and do a C<make test> on Perl, all tests on the C</cpan>
 directory are skipped.
 
-In particular, the (now deprecated) L<encoding> pragma is not supported
-under EBCDIC.
-
 L<Encode> partially works.
 
 =item *
diff --git a/pod/perlop.pod b/pod/perlop.pod
index 3cf9db67e6..6550133284 100644
--- a/pod/perlop.pod
+++ b/pod/perlop.pod
@@ -1569,12 +1569,9 @@ as a Unicode code point no matter what the native 
encoding is.  The name of the
 character in the 256th position (indexed by 0) in Unicode is
 C<LATIN CAPITAL LETTER A WITH MACRON>.
 
-There are a couple of exceptions to the above rule.  S<C<\N{U+I<hex number>}>> 
is
+An exception to the above rule is that S<C<\N{U+I<hex number>}>> is
 always interpreted as a Unicode code point, so that C<\N{U+0050}> is C<"P"> 
even
-on EBCDIC platforms.  And if C<S<L<use encoding|encoding>>> is in effect, the
-number is considered to be in that encoding, and is translated from that into
-the platform's native encoding if there is a corresponding native character;
-otherwise to Unicode.
+on EBCDIC platforms.
 
 =back
 
diff --git a/regcharclass.h b/regcharclass.h
index bb44e8224c..4be75bcac7 100644
--- a/regcharclass.h
+++ b/regcharclass.h
@@ -1897,7 +1897,7 @@
  * 37f6186253da9824bdb27f4ad867bfe8c25d4dc6bdb2f05585e40a034675a348 
lib/unicore/extracted/DLineBreak.txt
  * ef24061b5a5dc93d7e90c2e34530ec757180ee75d872cba65ffc946e52624ae8 
lib/unicore/extracted/DNumType.txt
  * a197371fec9a1b517058b440841f60f9378d81682084eef8db22a88cb2f96e90 
lib/unicore/extracted/DNumValues.txt
- * 066d6e75f95cf6794161c8ac0b1a40990277de90eefb913be2e675a7cba38d59 
lib/unicore/mktables
+ * 4bcfb4545be21663ca38a2acbfcbf2b0f3252652a34b50f1a56ef76cb959861b 
lib/unicore/mktables
  * cdecb300baad839a6f62791229f551a4fa33f3cbdca08e378dc976466354e778 
lib/unicore/version
  * 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c 
regen/charset_translations.pl
  * 1d27ae8b75d81a082b1fc594673e08540280f8169309a7b5047015c8091a2bfb 
regen/regcharclass.pl
diff --git a/regcomp.c b/regcomp.c
index d865c73c06..97888ca7ec 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -16809,15 +16809,19 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 
*flagp, U32 depth,
                      * must be be all digits or all letters of the same case.
                      * Otherwise, the range is non-portable and unclear as to
                      * what it contains */
-                    if ((isPRINT_A(prevvalue) || isPRINT_A(value))
-                        && (non_portable_endpoint
-                            || ! ((isDIGIT_A(prevvalue) && isDIGIT_A(value))
-                                   || (isLOWER_A(prevvalue) && 
isLOWER_A(value))
-                                   || (isUPPER_A(prevvalue) && 
isUPPER_A(value)))))
-                    {
-                        vWARN(RExC_parse, "Ranges of ASCII printables should 
be some subset of \"0-9\", \"A-Z\", or \"a-z\"");
+                    if (             (isPRINT_A(prevvalue) || isPRINT_A(value))
+                        && (          non_portable_endpoint
+                            || ! (   (isDIGIT_A(prevvalue) && isDIGIT_A(value))
+                                  || (isLOWER_A(prevvalue) && isLOWER_A(value))
+                                  || (isUPPER_A(prevvalue) && isUPPER_A(value))
+                    ))) {
+                        vWARN(RExC_parse, "Ranges of ASCII printables should"
+                                          " be some subset of \"0-9\","
+                                          " \"A-Z\", or \"a-z\"");
                     }
                     else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO 
*/
+                        SSize_t index_start;
+                        SSize_t index_final;
 
                         /* But the nature of Unicode and languages mean we
                          * can't do the same checks for above-ASCII ranges,
@@ -16825,40 +16829,68 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 
*flagp, U32 depth,
                          * contain only digits from the same group of 10.  The
                          * ASCII case is handled just above.  0x660 is the
                          * first digit character beyond ASCII.  Hence here, the
-                         * range could be a range of digits.  Find out.  */
-                        IV index_start = 
_invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
-                                                         prevvalue);
-                        IV index_final = 
_invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
-                                                         value);
-
-                        /* If the range start and final points are in the same
-                         * inversion list element, it means that either both
-                         * are not digits, or both are digits in a consecutive
-                         * sequence of digits.  (So far, Unicode has kept all
-                         * such sequences as distinct groups of 10, but assert
-                         * to make sure).  If the end points are not in the
-                         * same element, neither should be a digit. */
-                        if (index_start == index_final) {
-                            assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start)
-                            || 
(invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
-                               - 
invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
-                               == 10)
-                               /* But actually Unicode did have one group of 11
-                                * 'digits' in 5.2, so in case we are operating
-                                * on that version, let that pass */
-                            || 
(invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
-                               - 
invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
-                                == 11
-                               && 
invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
-                                == 0x19D0)
-                            );
+                         * range could be a range of digits.  First some
+                         * unlikely special cases.  Grandfather in that a range
+                         * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
+                         * if its starting value is one of the 10 digits prior
+                         * to it.  This is because it is an alternate way of
+                         * writing 19D1, and some people may expect it to be in
+                         * that group.  But it is bad, because it won't give
+                         * the expected results.  In Unicode 5.2 it was
+                         * considered to be in that group (of 11, hence), but
+                         * this was fixed in the next version */
+
+                        if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
+                            goto warn_bad_digit_range;
                         }
-                        else if ((index_start >= 0
-                                  && 
ELEMENT_RANGE_MATCHES_INVLIST(index_start))
-                                 || (index_final >= 0
-                                     && 
ELEMENT_RANGE_MATCHES_INVLIST(index_final)))
+                        else if (UNLIKELY(   prevvalue >= 0x1D7CE
+                                          &&     value <= 0x1D7FF))
                         {
-                            vWARN(RExC_parse, "Ranges of digits should be from 
the same group of 10");
+                            /* This is the only other case currently in Unicode
+                             * where the algorithm below fails.  The code
+                             * points just above are the end points of a single
+                             * range containing only decimal digits.  It is 5
+                             * different series of 0-9.  All other ranges of
+                             * digits currently in Unicode are just a single
+                             * series.  (And mktables will notify us if a later
+                             * Unicode version breaks this.)
+                             *
+                             * If the range being checked is at most 9 long,
+                             * and the digit values represented are in
+                             * numerical order, they are from the same series.
+                             * */
+                            if (         value - prevvalue > 9
+                                ||    (((    value - 0x1D7CE) % 10)
+                                     <= (prevvalue - 0x1D7CE) % 10))
+                            {
+                                goto warn_bad_digit_range;
+                            }
+                        }
+                        else {
+
+                            /* For all other ranges of digits in Unicode, the
+                             * algorithm is just to check if both end points
+                             * are in the same series, which is the same range.
+                             * */
+                            index_start = _invlist_search(
+                                                    PL_XPosix_ptrs[_CC_DIGIT],
+                                                    prevvalue);
+
+                            /* Warn if the range starts and ends with a digit,
+                             * and they are not in the same group of 10. */
+                            if (   index_start >= 0
+                                && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
+                                && (index_final =
+                                    _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
+                                                    value)) != index_start
+                                && index_final >= 0
+                                && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
+                            {
+                              warn_bad_digit_range:
+                                vWARN(RExC_parse, "Ranges of digits should be"
+                                                  " from the same group of"
+                                                  " 10");
+                            }
                         }
                     }
                 }
diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t
index 050448b66c..22711d5f62 100644
--- a/t/re/reg_mesg.t
+++ b/t/re/reg_mesg.t
@@ -628,6 +628,9 @@ my @warning_utf8_only_under_strict = mark_as_utf8(
  '/ã(?[ [ áª - áª ] ])/; #no latin1' => "Ranges of digits should be from 
the same group of 10 {#} m/ã(?[ [ áª - áª {#}] ])/",
  '/ã[á§-á§]/; #no latin1' => "Ranges of digits should be from the same 
group of 10 {#} m/ã[á§-á§{#}]/",
  '/ã(?[ [ á§ - á§ ] ])/; #no latin1' => "Ranges of digits should be from 
the same group of 10 {#} m/ã(?[ [ á§ - á§ {#}] ])/",
+ '/ã(?[ [ ð - ð¡ ] ])/; #no latin1' => "",
+ '/ã(?[ [ ð§ - ð± ] ])/; #no latin1' => "Ranges of digits should be from 
the same group of 10 {#} m/ã(?[ [ ð§ - ð± {#}] ])/",
+ '/ã(?[ [ ð§ - ð° ] ])/; #no latin1' => "Ranges of digits should be from 
the same group of 10 {#} m/ã(?[ [ ð§ - ð° {#}] ])/",
 );
 
 push @warning_only_under_strict, @warning_utf8_only_under_strict;

--
Perl5 Master Repository

[perl.git] branch blead, updated. v5.25.8-285-g323e4ec46d

Reply via email to