In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/323e4ec46db0dc8d22d9eae846bc5f0fe9d642ec?hp=92ff660bc8c29480a311c0b95430f16c175961db>
- Log ----------------------------------------------------------------- commit 323e4ec46db0dc8d22d9eae846bc5f0fe9d642ec Author: Karl Williamson <[email protected]> Date: Mon Jan 16 17:52:55 2017 -0700 perlebcdic: Rmv obsolete text about 'use encoding' M pod/perlebcdic.pod commit 2dc9bc8419687eac4909c664fdbbb1c6a2a8c683 Author: Karl Williamson <[email protected]> Date: Mon Jan 16 17:51:20 2017 -0700 perlop: Remove obsolete text 'use encoding' no longer works. Don't say it does. M pod/perlop.pod commit 8bafd282a6fa128ebec02d04e12692c4b75a7d3f Author: Karl Williamson <[email protected]> Date: Fri Jan 13 13:53:17 2017 -0700 Fix bug with a digit range under re 'strict' "use re 'strict'" is supposed to warn if the start and end points of a range are digits that aren't from the same group of 10. For example, if you mix Bengali and Thai digits. It wasn't working properly for 5 groups of mathematical digits starting at U+1D7CE. This commit fixes that, and refactors the code to bail out as soon as it discovers that no warning is warranted, instead of doing unnecessary work. 
M charclass_invlists.h M lib/unicore/mktables M regcharclass.h M regcomp.c M t/re/reg_mesg.t ----------------------------------------------------------------------- Summary of changes: charclass_invlists.h | 2 +- lib/unicore/mktables | 12 ++++++ pod/perlebcdic.pod | 3 -- pod/perlop.pod | 7 +--- regcharclass.h | 2 +- regcomp.c | 110 +++++++++++++++++++++++++++++++++------------------ t/re/reg_mesg.t | 3 ++ 7 files changed, 90 insertions(+), 49 deletions(-) diff --git a/charclass_invlists.h b/charclass_invlists.h index 038336db20..732b6d0a8a 100644 --- a/charclass_invlists.h +++ b/charclass_invlists.h @@ -95407,7 +95407,7 @@ static const U8 WB_table[24][24] = { * 37f6186253da9824bdb27f4ad867bfe8c25d4dc6bdb2f05585e40a034675a348 lib/unicore/extracted/DLineBreak.txt * ef24061b5a5dc93d7e90c2e34530ec757180ee75d872cba65ffc946e52624ae8 lib/unicore/extracted/DNumType.txt * a197371fec9a1b517058b440841f60f9378d81682084eef8db22a88cb2f96e90 lib/unicore/extracted/DNumValues.txt - * 066d6e75f95cf6794161c8ac0b1a40990277de90eefb913be2e675a7cba38d59 lib/unicore/mktables + * 4bcfb4545be21663ca38a2acbfcbf2b0f3252652a34b50f1a56ef76cb959861b lib/unicore/mktables * cdecb300baad839a6f62791229f551a4fa33f3cbdca08e378dc976466354e778 lib/unicore/version * 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c regen/charset_translations.pl * 9534d0cc3914fa1f5d574332c3199605c3d14f8691a0729d68d8498ac2b36280 regen/mk_invlists.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index fa1f1f4445..542461742d 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -13699,6 +13699,18 @@ numerals. 
END )); + # Make sure this assumption in perl core code is valid in this Unicode + # release, with known exceptions + foreach my $range (property_ref('Numeric-Type')->table('Decimal')->ranges) { + next if $range->end - $range->start == 9; + next if $range->start == 0x1D7CE; # This whole range was added in 3.1 + next if $range->end == 0x19DA && $v_version eq v5.2.0; + next if $range->end - $range->start < 9 && $v_version le 4.0.0; + Carp::my_carp("Range $range unexpectedly doesn't contain 10" + . " decimal digits. Code in regcomp.c assumes it does," + . " and will have to be fixed. Proceeding anyway."); + } + Property->new('Legacy_Case_Folding', File => "Fold", Directory => $map_directory, diff --git a/pod/perlebcdic.pod b/pod/perlebcdic.pod index 6dd8e10c4a..288a71f877 100644 --- a/pod/perlebcdic.pod +++ b/pod/perlebcdic.pod @@ -1855,9 +1855,6 @@ EBCDIC platforms. And some of the failures are real bugs. If you compile and do a C<make test> on Perl, all tests on the C</cpan> directory are skipped. -In particular, the (now deprecated) L<encoding> pragma is not supported -under EBCDIC. - L<Encode> partially works. =item * diff --git a/pod/perlop.pod b/pod/perlop.pod index 3cf9db67e6..6550133284 100644 --- a/pod/perlop.pod +++ b/pod/perlop.pod @@ -1569,12 +1569,9 @@ as a Unicode code point no matter what the native encoding is. The name of the character in the 256th position (indexed by 0) in Unicode is C<LATIN CAPITAL LETTER A WITH MACRON>. -There are a couple of exceptions to the above rule. S<C<\N{U+I<hex number>}>> is +An exception to the above rule is that S<C<\N{U+I<hex number>}>> is always interpreted as a Unicode code point, so that C<\N{U+0050}> is C<"P"> even -on EBCDIC platforms. And if C<S<L<use encoding|encoding>>> is in effect, the -number is considered to be in that encoding, and is translated from that into -the platform's native encoding if there is a corresponding native character; -otherwise to Unicode. +on EBCDIC platforms. 
=back diff --git a/regcharclass.h b/regcharclass.h index bb44e8224c..4be75bcac7 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -1897,7 +1897,7 @@ * 37f6186253da9824bdb27f4ad867bfe8c25d4dc6bdb2f05585e40a034675a348 lib/unicore/extracted/DLineBreak.txt * ef24061b5a5dc93d7e90c2e34530ec757180ee75d872cba65ffc946e52624ae8 lib/unicore/extracted/DNumType.txt * a197371fec9a1b517058b440841f60f9378d81682084eef8db22a88cb2f96e90 lib/unicore/extracted/DNumValues.txt - * 066d6e75f95cf6794161c8ac0b1a40990277de90eefb913be2e675a7cba38d59 lib/unicore/mktables + * 4bcfb4545be21663ca38a2acbfcbf2b0f3252652a34b50f1a56ef76cb959861b lib/unicore/mktables * cdecb300baad839a6f62791229f551a4fa33f3cbdca08e378dc976466354e778 lib/unicore/version * 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c regen/charset_translations.pl * 1d27ae8b75d81a082b1fc594673e08540280f8169309a7b5047015c8091a2bfb regen/regcharclass.pl diff --git a/regcomp.c b/regcomp.c index d865c73c06..97888ca7ec 100644 --- a/regcomp.c +++ b/regcomp.c @@ -16809,15 +16809,19 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * must be be all digits or all letters of the same case. * Otherwise, the range is non-portable and unclear as to * what it contains */ - if ((isPRINT_A(prevvalue) || isPRINT_A(value)) - && (non_portable_endpoint - || ! ((isDIGIT_A(prevvalue) && isDIGIT_A(value)) - || (isLOWER_A(prevvalue) && isLOWER_A(value)) - || (isUPPER_A(prevvalue) && isUPPER_A(value))))) - { - vWARN(RExC_parse, "Ranges of ASCII printables should be some subset of \"0-9\", \"A-Z\", or \"a-z\""); + if ( (isPRINT_A(prevvalue) || isPRINT_A(value)) + && ( non_portable_endpoint + || ! 
( (isDIGIT_A(prevvalue) && isDIGIT_A(value)) + || (isLOWER_A(prevvalue) && isLOWER_A(value)) + || (isUPPER_A(prevvalue) && isUPPER_A(value)) + ))) { + vWARN(RExC_parse, "Ranges of ASCII printables should" + " be some subset of \"0-9\"," + " \"A-Z\", or \"a-z\""); } else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */ + SSize_t index_start; + SSize_t index_final; /* But the nature of Unicode and languages mean we * can't do the same checks for above-ASCII ranges, @@ -16825,40 +16829,68 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * contain only digits from the same group of 10. The * ASCII case is handled just above. 0x660 is the * first digit character beyond ASCII. Hence here, the - * range could be a range of digits. Find out. */ - IV index_start = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT], - prevvalue); - IV index_final = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT], - value); - - /* If the range start and final points are in the same - * inversion list element, it means that either both - * are not digits, or both are digits in a consecutive - * sequence of digits. (So far, Unicode has kept all - * such sequences as distinct groups of 10, but assert - * to make sure). If the end points are not in the - * same element, neither should be a digit. */ - if (index_start == index_final) { - assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start) - || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1] - - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start] - == 10) - /* But actually Unicode did have one group of 11 - * 'digits' in 5.2, so in case we are operating - * on that version, let that pass */ - || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1] - - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start] - == 11 - && invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start] - == 0x19D0) - ); + * range could be a range of digits. First some + * unlikely special cases. 
Grandfather in that a range + * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad + * if its starting value is one of the 10 digits prior + * to it. This is because it is an alternate way of + * writing 19D1, and some people may expect it to be in + * that group. But it is bad, because it won't give + * the expected results. In Unicode 5.2 it was + * considered to be in that group (of 11, hence), but + * this was fixed in the next version */ + + if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) { + goto warn_bad_digit_range; } - else if ((index_start >= 0 - && ELEMENT_RANGE_MATCHES_INVLIST(index_start)) - || (index_final >= 0 - && ELEMENT_RANGE_MATCHES_INVLIST(index_final))) + else if (UNLIKELY( prevvalue >= 0x1D7CE + && value <= 0x1D7FF)) { - vWARN(RExC_parse, "Ranges of digits should be from the same group of 10"); + /* This is the only other case currently in Unicode + * where the algorithm below fails. The code + * points just above are the end points of a single + * range containing only decimal digits. It is 5 + * different series of 0-9. All other ranges of + * digits currently in Unicode are just a single + * series. (And mktables will notify us if a later + * Unicode version breaks this.) + * + * If the range being checked is at most 9 long, + * and the digit values represented are in + * numerical order, they are from the same series. + * */ + if ( value - prevvalue > 9 + || ((( value - 0x1D7CE) % 10) + <= (prevvalue - 0x1D7CE) % 10)) + { + goto warn_bad_digit_range; + } + } + else { + + /* For all other ranges of digits in Unicode, the + * algorithm is just to check if both end points + * are in the same series, which is the same range. + * */ + index_start = _invlist_search( + PL_XPosix_ptrs[_CC_DIGIT], + prevvalue); + + /* Warn if the range starts and ends with a digit, + * and they are not in the same group of 10. 
*/ + if ( index_start >= 0 + && ELEMENT_RANGE_MATCHES_INVLIST(index_start) + && (index_final = + _invlist_search(PL_XPosix_ptrs[_CC_DIGIT], + value)) != index_start + && index_final >= 0 + && ELEMENT_RANGE_MATCHES_INVLIST(index_final)) + { + warn_bad_digit_range: + vWARN(RExC_parse, "Ranges of digits should be" + " from the same group of" + " 10"); + } } } } diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t index 050448b66c..22711d5f62 100644 --- a/t/re/reg_mesg.t +++ b/t/re/reg_mesg.t @@ -628,6 +628,9 @@ my @warning_utf8_only_under_strict = mark_as_utf8( '/ã(?[ [ ᪠- ᪠] ])/; #no latin1' => "Ranges of digits should be from the same group of 10 {#} m/ã(?[ [ ᪠- ᪠{#}] ])/", '/ã[á§-á§]/; #no latin1' => "Ranges of digits should be from the same group of 10 {#} m/ã[á§-á§{#}]/", '/ã(?[ [ á§ - á§ ] ])/; #no latin1' => "Ranges of digits should be from the same group of 10 {#} m/ã(?[ [ á§ - á§ {#}] ])/", + '/ã(?[ [ ð - ð¡ ] ])/; #no latin1' => "", + '/ã(?[ [ ð§ - ð± ] ])/; #no latin1' => "Ranges of digits should be from the same group of 10 {#} m/ã(?[ [ ð§ - ð± {#}] ])/", + '/ã(?[ [ ð§ - ð° ] ])/; #no latin1' => "Ranges of digits should be from the same group of 10 {#} m/ã(?[ [ ð§ - ð° {#}] ])/", ); push @warning_only_under_strict, @warning_utf8_only_under_strict; -- Perl5 Master Repository
