[perl.git] branch blead, updated. v5.19.7-134-gea5ced4

Karl Williamson Wed, 01 Jan 2014 19:32:23 -0800

In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/ea5ced44af8b967bfce3763b11ba4714d4fcd154?hp=23dfa30831199ff0adfa3c42488e59e3df455e2c>


- Log -----------------------------------------------------------------
commit ea5ced44af8b967bfce3763b11ba4714d4fcd154
Author: Karl Williamson <[email protected]>
Date:   Wed Jan 1 20:08:02 2014 -0700

    Change some warnings in utf8n_to_uvchr()
    
    This bottom level function decodes the first character of a UTF-8 string
    into a code point.  It is discouraged from using it directly.  This
    commit cleans up some of the warnings it can raise.  Now, tests for
    malformations are done before any tests for other potential issues.  One
    of those issues involves code points so large that they have never
    appeared in any official standard (the current standard has scaled back
    the highest acceptable code point from earlier versions).  It is
    possible (though not done in CPAN) to warn and/or forbid these code
    points, while accepting smaller code points that are still above the
    legal Unicode maximum.  The warning message for this now includes the
    code point if representable on the machine.  Previously it always
    displayed raw bytes, which is what it still does for non-representable
    code points.
-----------------------------------------------------------------------

Summary of changes:
 ext/XS-APItest/t/utf8.t | 11 +++++++----
 pod/perldelta.pod       | 20 ++++++++++++++++---
 utf8.c                  | 52 ++++++++++++++++++++++++-------------------------
 utf8.h                  |  4 +++-
 4 files changed, 53 insertions(+), 34 deletions(-)

diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t
index 5f1c9c9..bc5a7ed 100644
--- a/ext/XS-APItest/t/utf8.t
+++ b/ext/XS-APItest/t/utf8.t
@@ -166,7 +166,7 @@ my @tests = (
         # This code point is chosen so that it is representable in a UV on
         # 32-bit machines
         $UTF8_WARN_FE_FF, $UTF8_DISALLOW_FE_FF, 'utf8', 0x80000000, 7,
-        qr/Code point beginning with byte .* is not Unicode, and not portable/
+        qr/Code point 0x80000000 is not Unicode, and not portable/
     ],
     [ "overflow with FE/FF",
         # This tests the interaction of WARN_FE_FF/DISALLOW_FE_FF with
@@ -178,9 +178,12 @@ my @tests = (
         ($has_quad)
             ? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
             : "\xfe\x86\x80\x80\x80\x80\x80",
-        $UTF8_WARN_FE_FF, $UTF8_DISALLOW_FE_FF, 'utf8', 0,
+
+        # We include both warning categories to make sure the FE_FF one has
+        # precedence
+        "$UTF8_WARN_FE_FF|$UTF8_WARN_SUPER", "$UTF8_DISALLOW_FE_FF", 'utf8', 0,
         ($has_quad) ? 13 : 7,
-        qr/Code point beginning with byte .* is not Unicode, and not portable/
+        qr/overflow at byte .*, after start byte 0xf/
     ],
 );
 
@@ -188,7 +191,7 @@ if ($has_quad) {    # All FF's will overflow on 32 bit
     push @tests,
         [ "begins with FF", 
"\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
             $UTF8_WARN_FE_FF, $UTF8_DISALLOW_FE_FF, 'utf8', $FF_ret, 13,
-            qr/Code point beginning with byte .* is not Unicode, and not 
portable/
+            qr/Code point 0x.* is not Unicode, and not portable/
         ];
 }
 
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index e36ae85..86f0570 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -376,9 +376,23 @@ The public API newATTRSUB was previously a macro to the 
private
 function Perl_newATTRSUB. Function Perl_newATTRSUB has been removed. newATTRSUB
 is now macro to a different internal function.
 
-=item XXX
-
-XXX
+=item Changes in warnings raised by C<utf8n_to_uvchr()>
+
+This bottom level function decodes the first character of a UTF-8 string
+into a code point.  It is accessible to C<XS> level code, but it's
+discouraged from using it directly.  There are higher level functions
+that call this that should be used instead, such as
+L<perlapi/utf8_to_uvchr_buf>.  For completeness though, this documents
+some changes to it.  Now, tests for malformations are done before any
+tests for other potential issues.  One of those issues involves code
+points so large that they have never appeared in any official standard
+(the current standard has scaled back the highest acceptable code point
+from earlier versions).  It is possible (though not done in CPAN) to
+warn and/or forbid these code points, while accepting smaller code
+points that are still above the legal Unicode maximum.  The warning
+message for this now includes the code point if representable on the
+machine.  Previously it always displayed raw bytes, which is what it
+still does for non-representable code points.
 
 =back
 
diff --git a/utf8.c b/utf8.c
index 41e2c4c..8dc69bb 100644
--- a/utf8.c
+++ b/utf8.c
@@ -778,32 +778,8 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, 
STRLEN *retlen, U32 flags)
        }
     }
 
-#ifndef EBCDIC /* EBCDIC allows FE, FF, can't overflow */
-    if ((*s0 & 0xFE) == 0xFE   /* matches both FE, FF */
-       && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
-    {
-       /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
-        * generation of the sv, since no warnings are raised under CHECK */
-       if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY)) == UTF8_WARN_FE_FF
-           && ckWARN_d(WARN_UTF8))
-       {
-           /* This message is deliberately not of the same syntax as the other
-            * messages for malformations, for backwards compatibility in the
-            * unlikely event that code is relying on its precise earlier text
-            */
-           sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s Code point beginning with 
byte 0x%02X is not Unicode, and not portable", malformed_text, *s0));
-           pack_warn = packWARN(WARN_UTF8);
-       }
-       if (flags & UTF8_DISALLOW_FE_FF) {
-           goto malformed;
-       }
-    }
+#ifndef EBCDIC /* EBCDIC can't overflow */
     if (UNLIKELY(overflowed)) {
-
-       /* If the first byte is FF, it will overflow a 32-bit word.  If the
-        * first byte is FE, it will overflow a signed 32-bit word.  The
-        * above preserves backward compatibility, since its message was used
-        * in earlier versions of this code in preference to overflow */
        sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after 
start byte 0x%02x)", malformed_text, overflow_byte, *s0));
        goto malformed;
     }
@@ -830,6 +806,9 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, 
STRLEN *retlen, U32 flags)
                     |UTF8_WARN_ILLEGAL_INTERCHANGE)))
     {
        if (UNICODE_IS_SURROGATE(uv)) {
+
+            /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
+             * generation of the sv, since no warnings are raised under CHECK 
*/
            if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == 
UTF8_WARN_SURROGATE
                && ckWARN_d(WARN_SURROGATE))
            {
@@ -842,11 +821,32 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, 
STRLEN *retlen, U32 flags)
        }
        else if ((uv > PERL_UNICODE_MAX)) {
            if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
-               && ckWARN_d(WARN_NON_UNICODE))
+                && ckWARN_d(WARN_NON_UNICODE))
            {
                sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is 
not Unicode, may not be portable", uv));
                pack_warn = packWARN(WARN_NON_UNICODE);
            }
+#ifndef EBCDIC /* EBCDIC always allows FE, FF */
+
+            /* The first byte being 0xFE or 0xFF is a subset of the SUPER code
+             * points.  We test for these after the regular SUPER ones, and
+             * before possibly bailing out, so that the more dire warning
+             * overrides the regular one, if applicable */
+            if ((*s0 & 0xFE) == 0xFE   /* matches both FE, FF */
+                && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
+            {
+                if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY))
+                                                            == UTF8_WARN_FE_FF
+                    && ckWARN_d(WARN_UTF8))
+                {
+                    sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%"UVXf" 
is not Unicode, and not portable", uv));
+                    pack_warn = packWARN(WARN_UTF8);
+                }
+                if (flags & UTF8_DISALLOW_FE_FF) {
+                    goto disallowed;
+                }
+            }
+#endif
            if (flags & UTF8_DISALLOW_SUPER) {
                goto disallowed;
            }
diff --git a/utf8.h b/utf8.h
index 2d48775..f72a243 100644
--- a/utf8.h
+++ b/utf8.h
@@ -447,7 +447,9 @@ Perl's extended UTF-8 means we can have start bytes up to 
FF.
 #define UTF8_WARN_SUPER                        0x0400  /* points above the 
legal max */
 
 /* Code points which never were part of the original UTF-8 standard, the first
- * byte of which is a FE or FF on ASCII platforms. */
+ * byte of which is a FE or FF on ASCII platforms. If the first byte is FF, it
+ * will overflow a 32-bit word.  If the first byte is FE, it will overflow a
+ * signed 32-bit word. */
 #define UTF8_DISALLOW_FE_FF            0x0800
 #define UTF8_WARN_FE_FF                        0x1000
 

--
Perl5 Master Repository

[perl.git] branch blead, updated. v5.19.7-134-gea5ced4

Reply via email to