In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/b33122be550de502b2a2c0da46023b7ef4e00466?hp=cc25b9e2d291fedac80973dc493e321d31d3cb1a>

- Log -----------------------------------------------------------------
commit b33122be550de502b2a2c0da46023b7ef4e00466
Author: Karl Williamson <[email protected]>
Date:   Tue Nov 22 20:30:56 2016 -0700

    APItest/t/utf8.t: White space only
    
    This indents the new block formed by the previous commit.  However,
    since the indentation is getting too much, it also changes the indents
    for all the nested for loops to 2 spaces to allow room on the line.

M       ext/XS-APItest/t/utf8.t

commit e308b348b63b8c65648ae3d340ce96b3ec19f1a2
Author: Karl Williamson <[email protected]>
Date:   Tue Nov 22 17:47:35 2016 -0700

    Split diagnostics for two UTF-8 malformations
    
    Some UTF-8 sequences may have multiple malformations.  Commit
    2b5e7bc2e60b4c4b5d87aa66e066363d9dce7930 tried to make sure that all
    possible ones are raised, instead of abandoning searching after one is
    found.  Since, I realized that there was yet another case of two
    malformations that it returned only one or the other of.
    
    An input buffer may be too short to fully express the code point it
    purports to.  This can be determined by the first byte of the UTF-8
    sequence indicating a longer sequence is requred than the space
    available.  But also, that shortened sequence can have a premature
    beginning of another character earlier than the shortness.  This commit
    causes these to be both raised, instead of the previous behavior of
    noting just one.

M       ext/XS-APItest/t/utf8.t
M       t/op/utf8decode.t
M       utf8.c

commit 2d0a280183c8525ba909db81b0007830c2f3a118
Author: Karl Williamson <[email protected]>
Date:   Tue Nov 22 18:14:45 2016 -0700

    APItest/t/utf8.t: Partially refactor to use table data
    
    This removes kludgy code that was trying, given a partial
    character, to determine if there enough bytes present to guarantee that
    the whole character must belong to a class of characters or not.  Now
    the necessary length to make that determination has instead manually
    been placed in a table, so it can be looked up.  In doing so, I
    corrected one length that was failing on EBCDIC.

M       ext/XS-APItest/t/utf8.t

commit 926e918e37f4148f962b9ac8d18e5e25ec847aa2
Author: Karl Williamson <[email protected]>
Date:   Tue Nov 22 18:07:31 2016 -0700

    APItest/t/utf8.t: Fix test
    
    It turns out that this test has two malformations, and should only have
    one; a future commit will remove the masking of the 2nd one.

M       ext/XS-APItest/t/utf8.t

commit 4b986740a596a745ad405eeffe2379d6e3ff7318
Author: Karl Williamson <[email protected]>
Date:   Tue Nov 22 18:01:21 2016 -0700

    APItest/t/utf8.t: Comments only

M       ext/XS-APItest/t/utf8.t

commit 5d03eac4dea7335a15adb10d0eaf0eda32a1949e
Author: Karl Williamson <[email protected]>
Date:   Tue Nov 22 17:55:10 2016 -0700

    APItest/t/utf8.t: Add some indentation to diagnositcs
    
    This is so they don't interrupt reading the output when there are
    errors.

M       ext/XS-APItest/t/utf8.t

commit 9a6c9c819a93cfcbc7497801aa9539a522b04291
Author: Karl Williamson <[email protected]>
Date:   Tue Nov 22 13:15:18 2016 -0700

    utf8.c: Clarify warning message.
    
    This warning was changed recently in the 5.25 series, and has not been
    in a stable release.

M       ext/XS-APItest/t/utf8.t
M       lib/utf8.t
M       t/op/lex.t
M       t/op/utf8decode.t
M       utf8.c

commit d8954c68c70d0ebe0fa2b888b6f5b72f7c6f23d9
Author: Karl Williamson <[email protected]>
Date:   Mon Nov 21 14:59:47 2016 -0700

    APItest/t/utf8.t: Simplify expression slightly

M       ext/XS-APItest/t/utf8.t

commit 5c3356666e0c63b7bf654d7aa91c59d9c230ecba
Author: Karl Williamson <[email protected]>
Date:   Sun Nov 20 07:56:40 2016 -0700

    APItest/t/handy.t: Output details if test fails
    
    There should be no warnings generated, but if there are, we want to see
    what they were.

M       ext/XS-APItest/t/handy.t
-----------------------------------------------------------------------

Summary of changes:
 ext/XS-APItest/t/handy.t |    4 +-
 ext/XS-APItest/t/utf8.t  | 1017 +++++++++++++++++++++++-----------------------
 lib/utf8.t               |    2 +-
 t/op/lex.t               |    4 +-
 t/op/utf8decode.t        |   55 ++-
 utf8.c                   |   77 ++--
 6 files changed, 599 insertions(+), 560 deletions(-)

diff --git a/ext/XS-APItest/t/handy.t b/ext/XS-APItest/t/handy.t
index b6eaa3e..a85f701 100644
--- a/ext/XS-APItest/t/handy.t
+++ b/ext/XS-APItest/t/handy.t
@@ -453,6 +453,8 @@ foreach my $name (sort keys %to_properties) {
 }
 
 # This is primarily to make sure that no non-Unicode warnings get generated
-is(scalar @warnings, 0, "No warnings were generated " . join ", ", @warnings);
+unless (is(scalar @warnings, 0, "No warnings were generated")) {
+    diag @warnings;
+}
 
 done_testing;
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t
index 49a6fba..50e1359 100644
--- a/ext/XS-APItest/t/utf8.t
+++ b/ext/XS-APItest/t/utf8.t
@@ -980,7 +980,7 @@ for my $restriction (sort keys %restriction_types) {
                                   "Verify $this_name is TRUE for valid input"
                                 . "$test_name_suffix"))
                         {
-                            diag("The bytes starting at offset"
+                            diag("    The bytes starting at offset"
                                . " $error_offset are"
                                . display_bytes(substr(
                                           $restriction_types{$restriction}
@@ -997,7 +997,8 @@ for my $restriction (sort keys %restriction_types) {
                             my $min = ($error_offset < $expected_offset)
                                     ? $error_offset
                                     : $expected_offset;
-                            diag display_bytes(substr($bytes, $min));
+                            diag("    The bytes starting at offset" . $min
+                              . " are " . display_bytes(substr($bytes, $min)));
                         }
 
                         if ($function eq '_loclen') {
@@ -1015,39 +1016,43 @@ my $REPLACEMENT = 0xFFFD;
 
 # Now test the malformations.  All these raise category utf8 warnings.
 my @malformations = (
+    # ($testname, $bytes, $length, $allow_flags, $expected_error_flags,
+    #  $allowed_uv, $expected_len, $needed_to_discern_len, $message )
     [ "zero length string malformation", "", 0,
-        $UTF8_ALLOW_EMPTY, $UTF8_GOT_EMPTY, 0, 0,
+        $UTF8_ALLOW_EMPTY, $UTF8_GOT_EMPTY, 0, 0, 0,
         qr/empty string/
     ],
-    [ "orphan continuation byte malformation", I8_to_native("${I8c}a"),
-        2,
-        $UTF8_ALLOW_CONTINUATION, $UTF8_GOT_CONTINUATION, $REPLACEMENT, 1,
+    [ "orphan continuation byte malformation", I8_to_native("${I8c}a"), 2,
+        $UTF8_ALLOW_CONTINUATION, $UTF8_GOT_CONTINUATION, $REPLACEMENT,
+        1, 1,
         qr/unexpected continuation byte/
     ],
     [ "premature next character malformation (immediate)",
         (isASCII) ? "\xc2\xc2\x80" : I8_to_native("\xc5\xc5\xa0"),
         3,
-        $UTF8_ALLOW_NON_CONTINUATION, $UTF8_GOT_NON_CONTINUATION, 
$REPLACEMENT, 1,
+        $UTF8_ALLOW_NON_CONTINUATION, $UTF8_GOT_NON_CONTINUATION, $REPLACEMENT,
+        1, 2,
         qr/unexpected non-continuation byte.*immediately after start byte/
     ],
     [ "premature next character malformation (non-immediate)",
-        I8_to_native("\xf1${I8c}a"),
-        3,
-        $UTF8_ALLOW_NON_CONTINUATION, $UTF8_GOT_NON_CONTINUATION, 
$REPLACEMENT, 2,
+        I8_to_native("\xef${I8c}a"), 3,
+        $UTF8_ALLOW_NON_CONTINUATION, $UTF8_GOT_NON_CONTINUATION, $REPLACEMENT,
+        2, 3,
         qr/unexpected non-continuation byte .* 2 bytes after start byte/
     ],
     [ "too short malformation", I8_to_native("\xf1${I8c}a"), 2,
         # Having the 'a' after this, but saying there are only 2 bytes also
         # tests that we pay attention to the passed in length
-        $UTF8_ALLOW_SHORT, $UTF8_GOT_SHORT, $REPLACEMENT, 2,
-        qr/2 bytes, need 4/
+        $UTF8_ALLOW_SHORT, $UTF8_GOT_SHORT, $REPLACEMENT,
+        2, 2,
+        qr/2 bytes available, need 4/
     ],
     [ "overlong malformation, lowest 2-byte",
         (isASCII) ? "\xc0\x80" : I8_to_native("\xc0\xa0"),
         2,
         $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         0,   # NUL
-        2,
+        2, 1,
         qr/overlong/
     ],
     [ "overlong malformation, highest 2-byte",
@@ -1055,7 +1060,7 @@ my @malformations = (
         2,
         $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         (isASCII) ? 0x7F : utf8::unicode_to_native(0x9F),
-        2,
+        2, 1,
         qr/overlong/
     ],
     [ "overlong malformation, lowest 3-byte",
@@ -1063,7 +1068,7 @@ my @malformations = (
         3,
         $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         0,   # NUL
-        3,
+        3, (isASCII) ? 2 : 1,
         qr/overlong/
     ],
     [ "overlong malformation, highest 3-byte",
@@ -1071,7 +1076,7 @@ my @malformations = (
         3,
         $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         (isASCII) ? 0x7FF : 0x3FF,
-        3,
+        3, (isASCII) ? 2 : 1,
         qr/overlong/
     ],
     [ "overlong malformation, lowest 4-byte",
@@ -1079,7 +1084,7 @@ my @malformations = (
         4,
         $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         0,   # NUL
-        4,
+        4, 2,
         qr/overlong/
     ],
     [ "overlong malformation, highest 4-byte",
@@ -1087,7 +1092,7 @@ my @malformations = (
         4,
         $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         (isASCII) ? 0xFFFF : 0x3FFF,
-        4,
+        4, 2,
         qr/overlong/
     ],
     [ "overlong malformation, lowest 5-byte",
@@ -1097,7 +1102,7 @@ my @malformations = (
         5,
         $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         0,   # NUL
-        5,
+        5, 2,
         qr/overlong/
     ],
     [ "overlong malformation, highest 5-byte",
@@ -1107,7 +1112,7 @@ my @malformations = (
         5,
         $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         (isASCII) ? 0x1FFFFF : 0x3FFFF,
-        5,
+        5, 2,
         qr/overlong/
     ],
     [ "overlong malformation, lowest 6-byte",
@@ -1117,7 +1122,7 @@ my @malformations = (
         6,
         $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         0,   # NUL
-        6,
+        6, 2,
         qr/overlong/
     ],
     [ "overlong malformation, highest 6-byte",
@@ -1127,7 +1132,7 @@ my @malformations = (
         6,
         $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         (isASCII) ? 0x3FFFFFF : 0x3FFFFF,
-        6,
+        6, 2,
         qr/overlong/
     ],
     [ "overlong malformation, lowest 7-byte",
@@ -1137,7 +1142,7 @@ my @malformations = (
         7,
         $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         0,   # NUL
-        7,
+        7, 2,
         qr/overlong/
     ],
     [ "overlong malformation, highest 7-byte",
@@ -1147,7 +1152,7 @@ my @malformations = (
         7,
         $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         (isASCII) ? 0x7FFFFFFF : 0x3FFFFFF,
-        7,
+        7, 2,
         qr/overlong/
     ],
 );
@@ -1161,20 +1166,20 @@ if (isASCII && ! $is64bit) {    # 32-bit ASCII platform
             0,  # There is no way to allow this malformation
             $UTF8_GOT_OVERFLOW,
             $REPLACEMENT,
-            7,
+            7, 2,
             qr/overflows/
         ],
-        [ "overflow malformation, can tell on first byte",
+        [ "overflow malformation",
             "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
             $max_bytes,
             0,  # There is no way to allow this malformation
             $UTF8_GOT_OVERFLOW,
             $REPLACEMENT,
-            $max_bytes,
+            $max_bytes, 1,
             qr/overflows/
         ];
 }
-else {
+else { # 64-bit ASCII, or EBCDIC of any size.
     # On EBCDIC platforms, another overlong test is needed even on 32-bit
     # systems, whereas it doesn't happen on ASCII except on 64-bit ones.
 
@@ -1188,7 +1193,7 @@ else {
             $max_bytes,
             $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
             0,   # NUL
-            $max_bytes,
+            $max_bytes, (isASCII) ? 7 : 8,
             qr/overlong/,
         ],
         [ "overlong malformation, highest max-byte",
@@ -1198,7 +1203,7 @@ else {
             $max_bytes,
             $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
             (isASCII) ? 0xFFFFFFFFF : 0x3FFFFFFF,
-            $max_bytes,
+            $max_bytes, (isASCII) ? 7 : 8,
             qr/overlong/,
         ];
 
@@ -1210,11 +1215,11 @@ else {
             0,  # There is no way to allow this malformation
             $UTF8_GOT_OVERFLOW,
             $REPLACEMENT,
-            $max_bytes,
+            $max_bytes, 8,
             qr/overflows/
         ];
     }
-    else {  # 64-bit
+    else {  # 64-bit, either ASCII or EBCDIC
         push @malformations,
             [ "overflow malformation",
                (isASCII)
@@ -1224,7 +1229,7 @@ else {
                 0,  # There is no way to allow this malformation
                 $UTF8_GOT_OVERFLOW,
                 $REPLACEMENT,
-                $max_bytes,
+                $max_bytes, (isASCII) ? 3 : 2,
                 qr/overflows/
             ];
     }
@@ -1232,7 +1237,7 @@ else {
 
 foreach my $test (@malformations) {
     my ($testname, $bytes, $length, $allow_flags, $expected_error_flags,
-        $allowed_uv, $expected_len, $message ) = @$test;
+        $allowed_uv, $expected_len, $needed_to_discern_len, $message ) = 
@$test;
 
     if (length($bytes) < $length) {
         fail("Internal test error: actual buffer length (" . length($bytes)
@@ -1284,45 +1289,14 @@ foreach my $test (@malformations) {
         undef @warnings;
 
         $ret = test_is_utf8_valid_partial_char_flags($bytes, $j, 0);
+
         my $ret_should_be = 0;
         my $comment = "";
-        if ($testname =~ /premature|short/ && $j < 3) {
-
-            # The tests are hard-coded so these relationships hold
-            my $cut_off = 2;
-            $cut_off = 3 if $testname =~ /non-immediate/;
-            if ($j < $cut_off) {
-                $ret_should_be = 1;
-                $comment = ", but need $cut_off bytes to discern:";
-            }
-        }
-        elsif ($testname =~ /overlong/ && ! isASCII && $length == 3) {
-            # 3-byte overlongs on EBCDIC are determinable on the first byte
-        }
-        elsif ($testname =~ /overlong/ && $length > 2) {
-            if ($length <= 7 && $j < 2) {
-                $ret_should_be = 1;
-                $comment = ", but need 2 bytes to discern:";
-            }
-            elsif ($length > 7 && $j < 7) {
-                $ret_should_be = 1;
-                $comment = ", but need 7 bytes to discern:";
-            }
-        }
-        elsif ($testname =~ /overflow/ && $testname !~ /first byte/) {
-            if (isASCII) {
-                if ($j < (($is64bit) ? 3 : 2)) {
-                    $comment = ", but need $j bytes to discern:";
-                    $ret_should_be = 1;
-                }
-            }
-            else {
-                if ($j < (($is64bit) ? 2 : 8)) {
-                    $comment = ", but need $j bytes to discern:";
-                    $ret_should_be = 1;
-                }
-            }
+        if ($j < $needed_to_discern_len) {
+            $ret_should_be = 1;
+            $comment = ", but need $needed_to_discern_len bytes to discern:";
         }
+
         is($ret, $ret_should_be, "$testname: is_utf8_valid_partial_char_flags("
                                 . display_bytes($partial)
                                 . ")$comment returns $ret_should_be");
@@ -1431,11 +1405,14 @@ sub nonportable_regex ($) {
 # Now test the cases where a legal code point is generated, but may or may not
 # be allowed/warned on.
 my @tests = (
+     # ($testname, $bytes, $warn_flags, $disallow_flags, $expected_error_flags,
+     #  $category, $allowed_uv, $expected_len, $needed_to_discern_len, 
$message )
     [ "lowest surrogate",
         (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
         $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE, $UTF8_GOT_SURROGATE,
         'surrogate', 0xD800,
         (isASCII) ? 3 : 4,
+        2,
         qr/surrogate/
     ],
     [ "a middle surrogate",
@@ -1443,6 +1420,7 @@ my @tests = (
         $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE, $UTF8_GOT_SURROGATE,
         'surrogate', 0xD90D,
         (isASCII) ? 3 : 4,
+        2,
         qr/surrogate/
     ],
     [ "highest surrogate",
@@ -1450,6 +1428,7 @@ my @tests = (
         $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE, $UTF8_GOT_SURROGATE,
         'surrogate', 0xDFFF,
         (isASCII) ? 3 : 4,
+        2,
         qr/surrogate/
     ],
     [ "first non_unicode",
@@ -1457,6 +1436,7 @@ my @tests = (
         $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER, $UTF8_GOT_SUPER,
         'non_unicode', 0x110000,
         (isASCII) ? 4 : 5,
+        2,
         qr/(not Unicode|for a non-Unicode code point).* may not be portable/
     ],
     [ "non_unicode whose first byte tells that",
@@ -1465,6 +1445,7 @@ my @tests = (
         'non_unicode',
         (isASCII) ? 0x140000 : 0x200000,
         (isASCII) ? 4 : 5,
+        1,
         qr/(not Unicode|for a non-Unicode code point).* may not be portable/
     ],
     [ "first of 32 consecutive non-character code points",
@@ -1472,6 +1453,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xFDD0,
         (isASCII) ? 3 : 4,
+        (isASCII) ? 3 : 4,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "a mid non-character code point of the 32 consecutive ones",
@@ -1479,6 +1461,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xFDE0,
         (isASCII) ? 3 : 4,
+        (isASCII) ? 3 : 4,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "final of 32 consecutive non-character code points",
@@ -1486,6 +1469,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xFDEF,
         (isASCII) ? 3 : 4,
+        (isASCII) ? 3 : 4,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+FFFE",
@@ -1493,6 +1477,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xFFFE,
         (isASCII) ? 3 : 4,
+        (isASCII) ? 3 : 4,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+FFFF",
@@ -1500,42 +1485,49 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xFFFF,
         (isASCII) ? 3 : 4,
+        (isASCII) ? 3 : 4,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+1FFFE",
         (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
-        'nonchar', 0x1FFFE, 4,
+        'nonchar', 0x1FFFE,
+        4, 4,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+1FFFF",
         (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
-        'nonchar', 0x1FFFF, 4,
+        'nonchar', 0x1FFFF,
+        4, 4,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+2FFFE",
         (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
-        'nonchar', 0x2FFFE, 4,
+        'nonchar', 0x2FFFE,
+        4, 4,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+2FFFF",
         (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
-        'nonchar', 0x2FFFF, 4,
+        'nonchar', 0x2FFFF,
+        4, 4,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+3FFFE",
         (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
-        'nonchar', 0x3FFFE, 4,
+        'nonchar', 0x3FFFE,
+        4, 4,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+3FFFF",
         (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
-        'nonchar', 0x3FFFF, 4,
+        'nonchar', 0x3FFFF,
+        4, 4,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+4FFFE",
@@ -1543,6 +1535,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x4FFFE,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+4FFFF",
@@ -1550,6 +1543,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x4FFFF,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+5FFFE",
@@ -1557,6 +1551,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x5FFFE,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+5FFFF",
@@ -1564,6 +1559,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x5FFFF,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+6FFFE",
@@ -1571,6 +1567,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x6FFFE,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+6FFFF",
@@ -1578,6 +1575,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x6FFFF,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+7FFFE",
@@ -1585,6 +1583,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x7FFFE,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+7FFFF",
@@ -1592,6 +1591,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x7FFFF,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+8FFFE",
@@ -1599,6 +1599,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x8FFFE,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+8FFFF",
@@ -1606,6 +1607,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x8FFFF,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+9FFFE",
@@ -1613,6 +1615,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x9FFFE,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+9FFFF",
@@ -1620,6 +1623,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x9FFFF,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+AFFFE",
@@ -1627,6 +1631,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xAFFFE,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+AFFFF",
@@ -1634,6 +1639,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xAFFFF,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+BFFFE",
@@ -1641,6 +1647,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xBFFFE,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+BFFFF",
@@ -1648,6 +1655,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xBFFFF,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+CFFFE",
@@ -1655,6 +1663,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xCFFFE,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+CFFFF",
@@ -1662,6 +1671,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xCFFFF,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+DFFFE",
@@ -1669,6 +1679,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xDFFFE,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+DFFFF",
@@ -1676,6 +1687,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xDFFFF,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+EFFFE",
@@ -1683,6 +1695,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xEFFFE,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+EFFFF",
@@ -1690,6 +1703,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xEFFFF,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+FFFFE",
@@ -1697,6 +1711,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xFFFFE,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+FFFFF",
@@ -1704,6 +1719,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xFFFFF,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+10FFFE",
@@ -1711,6 +1727,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x10FFFE,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+10FFFF",
@@ -1718,6 +1735,7 @@ my @tests = (
         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x10FFFF,
         (isASCII) ? 4 : 5,
+        (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "requires at least 32 bits",
@@ -1728,7 +1746,9 @@ my @tests = (
         # 32-bit machines
         $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
         $UTF8_GOT_ABOVE_31_BIT,
-        'utf8', 0x80000000, (isASCII) ? 7 : $max_bytes,
+        'utf8', 0x80000000,
+        (isASCII) ? 7 : $max_bytes,
+        (isASCII) ? 1 : 8,
         nonportable_regex(0x80000000)
     ],
     [ "highest 32 bit code point",
@@ -1737,7 +1757,9 @@ my @tests = (
          : 
I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
         $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
         $UTF8_GOT_ABOVE_31_BIT,
-        'utf8', 0xFFFFFFFF, (isASCII) ? 7 : $max_bytes,
+        'utf8', 0xFFFFFFFF,
+        (isASCII) ? 7 : $max_bytes,
+        (isASCII) ? 1 : 8,
         nonportable_regex(0xffffffff)
     ],
     [ "requires at least 32 bits, and use SUPER-type flags, instead of 
ABOVE_31_BIT",
@@ -1745,7 +1767,9 @@ my @tests = (
          ? "\xfe\x82\x80\x80\x80\x80\x80"
          : 
I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
         $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER, $UTF8_GOT_SUPER,
-        'utf8', 0x80000000, (isASCII) ? 7 : $max_bytes,
+        'utf8', 0x80000000,
+        (isASCII) ? 7 : $max_bytes,
+        (isASCII) ? 1 : 8,
         nonportable_regex(0x80000000)
     ],
     [ "overflow with warnings/disallow for more than 31 bits",
@@ -1768,7 +1792,8 @@ my @tests = (
         $UTF8_DISALLOW_ABOVE_31_BIT,
         $UTF8_GOT_ABOVE_31_BIT,
         'utf8', 0,
-        (! isASCII) ? $max_bytes : ($is64bit) ? $max_bytes : 7,   # XXX
+        (! isASCII || $is64bit) ? $max_bytes : 7,
+        (isASCII || $is64bit) ? 2 : 8,
         qr/overflows/
     ],
 );
@@ -1781,7 +1806,8 @@ if (! $is64bit) {
                 "\xFE\x84\x80\x80\x80\x80\x80",
                 $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
                 $UTF8_GOT_ABOVE_31_BIT,
-                'utf8', 0x100000000, 7,
+                'utf8', 0x100000000,
+                7, 1,
                 qr/and( is)? not portable/
             ];
     }
@@ -1795,7 +1821,8 @@ else {
             : 
I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
             $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
             $UTF8_GOT_ABOVE_31_BIT,
-            'utf8', 0x1000000000, $max_bytes,
+            'utf8', 0x1000000000,
+            $max_bytes, (isASCII) ? 1 : 7,
             qr/and( is)? not portable/
         ];
     if (! isASCII) {
@@ -1804,35 +1831,40 @@ else {
                 
I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
                 $UTF8_GOT_ABOVE_31_BIT,
-                'utf8', 0x800000000, $max_bytes,
+                'utf8', 0x800000000,
+                $max_bytes, 7,
                 nonportable_regex(0x80000000)
             ],
             [ "requires at least 32 bits",
                 
I8_to_native("\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
                 $UTF8_GOT_ABOVE_31_BIT,
-                'utf8', 0x10000000000, $max_bytes,
+                'utf8', 0x10000000000,
+                $max_bytes, 6,
                 nonportable_regex(0x10000000000)
             ],
             [ "requires at least 32 bits",
                 
I8_to_native("\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
                 $UTF8_GOT_ABOVE_31_BIT,
-                'utf8', 0x200000000000, $max_bytes,
+                'utf8', 0x200000000000,
+                $max_bytes, 5,
                 nonportable_regex(0x20000000000)
             ],
             [ "requires at least 32 bits",
                 
I8_to_native("\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
                 $UTF8_GOT_ABOVE_31_BIT,
-                'utf8', 0x4000000000000, $max_bytes,
+                'utf8', 0x4000000000000,
+                $max_bytes, 4,
                 nonportable_regex(0x4000000000000)
             ],
             [ "requires at least 32 bits",
                 
I8_to_native("\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
                 $UTF8_GOT_ABOVE_31_BIT,
-                'utf8', 0x80000000000000, $max_bytes,
+                'utf8', 0x80000000000000,
+                $max_bytes, 3,
                 nonportable_regex(0x80000000000000)
             ],
             [ "requires at least 32 bits",
@@ -1840,7 +1872,8 @@ else {
                    #IBM-1047  
\xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
                 $UTF8_GOT_ABOVE_31_BIT,
-                'utf8', 0x1000000000000000, $max_bytes,
+                'utf8', 0x1000000000000000,
+                $max_bytes, 2,
                 nonportable_regex(0x1000000000000000)
             ];
     }
@@ -1848,7 +1881,7 @@ else {
 
 foreach my $test (@tests) {
     my ($testname, $bytes, $warn_flags, $disallow_flags, $expected_error_flags,
-        $category, $allowed_uv, $expected_len, $message ) = @$test;
+        $category, $allowed_uv, $expected_len, $needed_to_discern_len, 
$message ) = @$test;
 
     my $length = length $bytes;
     my $will_overflow = $testname =~ /overflow/ ? 'overflow' : "";
@@ -1936,41 +1969,16 @@ foreach my $test (@tests) {
                 if ($disallow_flag) {
                     $ret_should_be = 0;
                     $comment = "disallowed";
+                    if ($j < $needed_to_discern_len) {
+                        $ret_should_be = 1;
+                        $comment .= ", but need $needed_to_discern_len bytes 
to discern:";
+                    }
                 }
                 else {
                     $ret_should_be = 1;
                     $comment = "allowed";
                 }
 
-                if ($disallow_flag) {
-                    if ($testname =~ /non-character/) {
-                        $ret_should_be = 1;
-                        $comment .= ", but but need full char to discern";
-                    }
-                    elsif ($testname =~ /surrogate/) {
-                        if ($j < 2) {
-                            $ret_should_be = 1;
-                            $comment .= ", but need 2 bytes to discern";
-                        }
-                    }
-                    elsif (   ($disallow_flags & $UTF8_DISALLOW_SUPER)
-                           && $j < 2
-                           && ord(native_to_I8(substr($bytes, 0, 1)))
-                               lt ((isASCII) ? 0xF5 : 0xFA))
-                    {
-                        $ret_should_be = 1;
-                        $comment .= ", but need 2 bytes to discern";
-                    }
-                    elsif (   ! isASCII
-                           && $testname =~ /requires at least 32 bits/)
-                    {
-                        # On EBCDIC, the boundary between 31 and 32 bits is
-                        # more complicated.
-                        $ret_should_be = 1 if native_to_I8($partial) le
-                     
"\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF";
-                    }
-                }
-
                 undef @warnings;
 
                 $ret = test_is_utf8_valid_partial_char_flags($partial, $j, 
$disallow_flag);
@@ -1991,413 +1999,424 @@ foreach my $test (@tests) {
     # of utf8 warnings to verify they work with and without the utf8 class,
     # and don't have effects on other sublass warnings
     foreach my $warning ('utf8', 'surrogate', 'nonchar', 'non_unicode') {
-        foreach my $warn_flag (0, $warn_flags) {
-            foreach my $disallow_flag (0, $disallow_flags) {
-                foreach my $do_warning (0, 1) {
+      foreach my $warn_flag (0, $warn_flags) {
+        foreach my $disallow_flag (0, $disallow_flags) {
+          foreach my $do_warning (0, 1) {
+
+            # We try each of the above with various combinations of
+            # malformations that can occur on the same input sequence.
+            foreach my $short ("", "short") {
+              foreach my $unexpected_noncont ("",
+                                              "unexpected non-continuation")
+              {
+                foreach my $overlong ("", "overlong") {
+
+                    # If we're already at the longest possible, we
+                    # can't create an overlong (which would be longer)
+                    # can't handle anything larger.
+                    next if $overlong && $expected_len >= $max_bytes;
+
+                    my @malformations;
+                    my @expected_errors;
+                    push @malformations, $short if $short;
+                    push @malformations, $unexpected_noncont
+                                                      if $unexpected_noncont;
+                    push @malformations, $overlong if $overlong;
+
+                    # The overflow malformation test in the input
+                    # array is coerced into being treated like one of
+                    # the others.
+                    if ($will_overflow) {
+                        push @malformations, 'overflow';
+                        push @expected_errors, $UTF8_GOT_OVERFLOW;
+                    }
 
-                    # We try each of the above with various combinations of
-                    # malformations that can occur on the same input sequence.
-                    foreach my $short ("",
-                                       "short",
-                                       "unexpected non-continuation")
-                    {
-                        # The non-characters can't be discerned with a short
+                    my $malformations_name = join "/", @malformations;
+                    $malformations_name .= " malformation"
+                                                if $malformations_name;
+                    $malformations_name .= "s" if @malformations > 1;
+                    my $this_bytes = $bytes;
+                    my $this_length = $length;
+                    my $expected_uv = $allowed_uv;
+                    my $this_expected_len = $expected_len;
+                    my $this_needed_to_discern_len = $needed_to_discern_len;
+                    if ($malformations_name) {
+                        $expected_uv = 0;
+
+                        # Coerce the input into the desired
                         # malformation
-                        next if $short && $testname =~ /non-character/;
-
-                        foreach my $overlong ("", "overlong") {
-
-                            # If we're already at the longest possible, we
-                            # can't create an overlong (which would be longer)
-                            # can't handle anything larger.
-                            next if $overlong && $expected_len >= $max_bytes;
-
-                            my @malformations;
-                            my @expected_errors;
-                            push @malformations, $short if $short;
-                            push @malformations, $overlong if $overlong;
-
-                            # The overflow malformation test in the input
-                            # array is coerced into being treated like one of
-                            # the others.
-                            if ($will_overflow) {
-                                push @malformations, 'overflow';
-                                push @expected_errors, $UTF8_GOT_OVERFLOW;
-                            }
-
-                            my $malformations_name = join "/", @malformations;
-                            $malformations_name .= " malformation"
-                                                        if $malformations_name;
-                            $malformations_name .= "s" if @malformations > 1;
-                            my $this_bytes = $bytes;
-                            my $this_length = $length;
-                            my $expected_uv = $allowed_uv;
-                            my $this_expected_len = $expected_len;
-                            if ($malformations_name) {
-                                $expected_uv = 0;
-
-                                # Coerce the input into the desired
-                                # malformation
-                                if ($malformations_name =~ /overlong/) {
-
-                                    # For an overlong, we convert the original
-                                    # start byte into a continuation byte with
-                                    # the same data bits as originally. ...
-                                    substr($this_bytes, 0, 1)
-                                        = 
start_byte_to_cont(substr($this_bytes,
-                                                                    0, 1));
-
-                                    # ... Then we prepend it with a known
-                                    # overlong sequence.  This should evaluate
-                                    # to the exact same code point as the
-                                    # original.
-                                    $this_bytes
-                                    = I8_to_native("\xff")
-                                    . (I8_to_native(chr $first_continuation)
-                                       x ( $max_bytes - 1 - 
length($this_bytes)))
-                                    . $this_bytes;
-                                    $this_length = length($this_bytes);
-                                    $this_expected_len = $max_bytes;
-                                    push @expected_errors, $UTF8_GOT_LONG;
-                                }
-                                if ($malformations_name =~ /short/) {
-
-                                    # Just tell the test to not look far
-                                    # enough into the input.
-                                    $this_length--;
-                                    $this_expected_len--;
-                                    push @expected_errors, $UTF8_GOT_SHORT;
-                                }
-                                elsif ($malformations_name
-                                                        =~ /non-continuation/)
-                                {
-                                    # Change the final continuation byte into
-                                    # a non one.
-                                    substr($this_bytes, -1, 1) = '?';
-                                    $this_expected_len--;
-                                    push @expected_errors,
-                                                    $UTF8_GOT_NON_CONTINUATION;
-                                }
-                            }
+                        if ($malformations_name =~ /overlong/) {
+
+                            # For an overlong, we convert the original
+                            # start byte into a continuation byte with
+                            # the same data bits as originally. ...
+                            substr($this_bytes, 0, 1)
+                                = start_byte_to_cont(substr($this_bytes,
+                                                            0, 1));
+
+                            # ... Then we prepend it with a known
+                            # overlong sequence.  This should evaluate
+                            # to the exact same code point as the
+                            # original.
+                            $this_bytes
+                            = I8_to_native("\xff")
+                            . (I8_to_native(chr $first_continuation)
+                            x ( $max_bytes - 1 - length($this_bytes)))
+                            . $this_bytes;
+                            $this_length = length($this_bytes);
+                            $this_needed_to_discern_len
+                                 = $max_bytes - ($this_expected_len
+                                               - $this_needed_to_discern_len);
+                            $this_expected_len = $max_bytes;
+                            push @expected_errors, $UTF8_GOT_LONG;
+                        }
+                        if ($malformations_name =~ /short/) {
 
-                            my $eval_warn = $do_warning
-                                        ? "use warnings '$warning'"
-                                        : $warning eq "utf8"
-                                            ? "no warnings 'utf8'"
-                                            : ( "use warnings 'utf8';"
-                                              . " no warnings '$warning'");
-
-                            # Is effectively disallowed if we've set up a
-                            # malformation, even if the flag indicates it is
-                            # allowed.  Fix up test name to indicate this as
-                            # well
-                            my $disallowed = $disallow_flag
-                                          || $malformations_name;
-                            my $this_name = "utf8n_to_uvchr_error() $testname: 
"
-                                                        . (($disallow_flag)
-                                                           ? 'disallowed'
-                                                           : $disallowed
-                                                             ? $disallowed
-                                                             : 'allowed');
-                            $this_name .= ", $eval_warn";
-                            $this_name .= ", " . (($warn_flag)
-                                                ? 'with warning flag'
-                                                : 'no warning flag');
-
-                            undef @warnings;
-                            my $ret_ref;
-                            my $display_bytes = display_bytes($this_bytes);
-                            my $call = "Call was: $eval_warn; \$ret_ref"
-                                     . " = test_utf8n_to_uvchr_error("
-                                     . "'$display_bytes', $this_length,"
-                                     . "$warn_flag"
-                                     . "|$disallow_flag)";
-                            my $eval_text =      "$eval_warn; \$ret_ref"
-                                     . " = test_utf8n_to_uvchr_error("
-                                     . "'$this_bytes',"
-                                     . " $this_length, $warn_flag"
-                                     . "|$disallow_flag)";
-                            eval "$eval_text";
-                            if (! ok ("$@ eq ''",
-                                "$this_name: eval succeeded"))
-                            {
-                                diag "\$!='$!'; eval'd=\"$call\"";
-                                next;
-                            }
-                            if ($disallowed) {
-                                unless (is($ret_ref->[0], 0,
-                                           "$this_name: Returns 0"))
-                                {
-                                    diag $call;
-                                }
-                            }
-                            else {
-                                unless (is($ret_ref->[0], $expected_uv,
-                                        "$this_name: Returns expected uv: "
-                                        . sprintf("0x%04X", $expected_uv)))
-                                {
-                                    diag $call;
-                                }
-                            }
-                            unless (is($ret_ref->[1], $this_expected_len,
-                                "$this_name: Returns expected length:"
-                              . " $this_expected_len"))
-                            {
-                                diag $call;
-                            }
+                            # Just tell the test to not look far
+                            # enough into the input.
+                            $this_length--;
+                            $this_expected_len--;
+                            push @expected_errors, $UTF8_GOT_SHORT;
+                        }
+                        if ($malformations_name
+                                                =~ /non-continuation/)
+                        {
+                            # Change the final continuation byte into
+                            # a non one.
+                            my $pos = ($short) ? -2 : -1;
+                            substr($this_bytes, $pos, 1) = '?';
+                            $this_expected_len--;
+                            push @expected_errors,
+                                            $UTF8_GOT_NON_CONTINUATION;
+                        }
+                    }
 
-                            my $errors = $ret_ref->[2];
+                    my $eval_warn = $do_warning
+                                ? "use warnings '$warning'"
+                                : $warning eq "utf8"
+                                    ? "no warnings 'utf8'"
+                                    : ( "use warnings 'utf8';"
+                                    . " no warnings '$warning'");
+
+                    # Is effectively disallowed if we've set up a
+                    # malformation, even if the flag indicates it is
+                    # allowed.  Fix up test name to indicate this as
+                    # well
+                    my $disallowed = $disallow_flag
+                                || $malformations_name;
+                    my $this_name = "utf8n_to_uvchr_error() $testname: "
+                                                . (($disallow_flag)
+                                                ? 'disallowed'
+                                                : $disallowed
+                                                    ? $disallowed
+                                                    : 'allowed');
+                    $this_name .= ", $eval_warn";
+                    $this_name .= ", " . (($warn_flag)
+                                        ? 'with warning flag'
+                                        : 'no warning flag');
+
+                    undef @warnings;
+                    my $ret_ref;
+                    my $display_bytes = display_bytes($this_bytes);
+                    my $call = "    Call was: $eval_warn; \$ret_ref"
+                            . " = test_utf8n_to_uvchr_error("
+                            . "'$display_bytes', $this_length,"
+                            . "$warn_flag"
+                            . "|$disallow_flag)";
+                    my $eval_text =      "$eval_warn; \$ret_ref"
+                            . " = test_utf8n_to_uvchr_error("
+                            . "'$this_bytes',"
+                            . " $this_length, $warn_flag"
+                            . "|$disallow_flag)";
+                    eval "$eval_text";
+                    if (! ok ("$@ eq ''",
+                        "$this_name: eval succeeded"))
+                    {
+                        diag "\$!='$!'; eval'd=\"$call\"";
+                        next;
+                    }
+                    if ($disallowed) {
+                        unless (is($ret_ref->[0], 0,
+                                "$this_name: Returns 0"))
+                        {
+                            diag $call;
+                        }
+                    }
+                    else {
+                        unless (is($ret_ref->[0], $expected_uv,
+                                "$this_name: Returns expected uv: "
+                                . sprintf("0x%04X", $expected_uv)))
+                        {
+                            diag $call;
+                        }
+                    }
+                    unless (is($ret_ref->[1], $this_expected_len,
+                        "$this_name: Returns expected length:"
+                    . " $this_expected_len"))
+                    {
+                        diag $call;
+                    }
 
-                            for (my $i = @expected_errors - 1; $i >= 0; $i--) {
-                                if (ok($expected_errors[$i] & $errors,
-                                       "Expected and got error bit return"
-                                     . " for $malformations[$i] malformation"))
-                                {
-                                    $errors &= ~$expected_errors[$i];
-                                }
-                                splice @expected_errors, $i, 1;
-                            }
-                            unless (is(scalar @expected_errors, 0,
-                                    "Got all the expected malformation 
errors"))
-                            {
-                                diag Dumper \@expected_errors;
-                            }
+                    my $errors = $ret_ref->[2];
 
-                            if ($warn_flag || $disallow_flag) {
-                                is($errors, $expected_error_flags,
-                                   "Got the correct error flag");
-                            }
-                            else {
-                                is($errors, 0, "Got no other error flag");
-                            }
+                    for (my $i = @expected_errors - 1; $i >= 0; $i--) {
+                        if (ok($expected_errors[$i] & $errors,
+                            "Expected and got error bit return"
+                            . " for $malformations[$i] malformation"))
+                        {
+                            $errors &= ~$expected_errors[$i];
+                        }
+                        splice @expected_errors, $i, 1;
+                    }
+                    unless (is(scalar @expected_errors, 0,
+                            "Got all the expected malformation errors"))
+                    {
+                        diag Dumper \@expected_errors;
+                    }
 
-                            if (@malformations) {
-                                if (! $do_warning && $warning eq 'utf8') {
-                                    goto no_warnings_expected;
-                                }
+                    if (   $this_expected_len >= $this_needed_to_discern_len
+                        && ($warn_flag || $disallow_flag))
+                    {
+                        unless (is($errors, $expected_error_flags,
+                                "Got the correct error flag"))
+                        {
+                            diag $call;
+                        }
+                    }
+                    else {
+                        is($errors, 0, "Got no other error flag");
+                    }
 
-                                # Check that each malformation generates a
-                                # warning, removing that warning if found
-                              MALFORMATION:
-                                foreach my $malformation (@malformations) {
-                                    foreach (my $i = 0; $i < @warnings; $i++) {
-                                        if ($warnings[$i] =~ /$malformation/) {
-                                            pass("Expected and got"
-                                               . "'$malformation' warning");
-                                            splice @warnings, $i, 1;
-                                            next MALFORMATION;
-                                        }
-                                    }
-                                    fail("Expected '$malformation' warning"
-                                       . " but didn't get it");
+                    if (@malformations) {
+                        if (! $do_warning && $warning eq 'utf8') {
+                            goto no_warnings_expected;
+                        }
 
+                        # Check that each malformation generates a
+                        # warning, removing that warning if found
+                    MALFORMATION:
+                        foreach my $malformation (@malformations) {
+                            foreach (my $i = 0; $i < @warnings; $i++) {
+                                if ($warnings[$i] =~ /$malformation/) {
+                                    pass("Expected and got"
+                                    . "'$malformation' warning");
+                                    splice @warnings, $i, 1;
+                                    next MALFORMATION;
                                 }
                             }
+                            fail("Expected '$malformation' warning"
+                            . " but didn't get it");
 
-                            # Any overflow will override any super or above-31
-                            # warnings.
-                            goto no_warnings_expected if $will_overflow;
+                        }
+                    }
 
-                            if (    ! $do_warning
-                                && (   $warning eq 'utf8'
-                                    || $warning eq $category))
+                    # Any overflow will override any super or above-31
+                    # warnings.
+                    goto no_warnings_expected
+                                if $will_overflow || $this_expected_len
+                                        < $this_needed_to_discern_len;
+
+                    if (    ! $do_warning
+                        && (   $warning eq 'utf8'
+                            || $warning eq $category))
+                    {
+                        goto no_warnings_expected;
+                    }
+                    elsif ($warn_flag) {
+                        if (is(scalar @warnings, 1,
+                            "$this_name: Got a single warning "))
+                        {
+                            unless (like($warnings[0], $message,
+                                    "$this_name: Got expected warning"))
                             {
-                                goto no_warnings_expected;
-                            }
-                            elsif ($warn_flag) {
-                                if (is(scalar @warnings, 1,
-                                    "$this_name: Got a single warning "))
-                                {
-                                    unless (like($warnings[0], $message,
-                                            "$this_name: Got expected 
warning"))
-                                    {
-                                        diag $call;
-                                    }
-                                }
-                                else {
-                                    diag $call;
-                                    if (scalar @warnings) {
-                                        output_warnings(@warnings);
-                                    }
-                                }
+                                diag $call;
                             }
-                            else {
-                              no_warnings_expected:
-                                unless (is(scalar @warnings, 0,
-                                        "$this_name: Got no warnings"))
-                                {
-                                    diag $call;
-                                    output_warnings(@warnings);
-                                }
+                        }
+                        else {
+                            diag $call;
+                            if (scalar @warnings) {
+                                output_warnings(@warnings);
                             }
+                        }
+                    }
+                    else {
+                    no_warnings_expected:
+                        unless (is(scalar @warnings, 0,
+                                "$this_name: Got no warnings"))
+                        {
+                            diag $call;
+                            output_warnings(@warnings);
+                        }
+                    }
 
-                            # Check CHECK_ONLY results when the input is
-                            # disallowed.  Do this when actually disallowed,
-                            # not just when the $disallow_flag is set
-                            if ($disallowed) {
-                                undef @warnings;
-                                $ret_ref = test_utf8n_to_uvchr_error(
-                                               $this_bytes, $this_length,
-                                               
$disallow_flag|$UTF8_CHECK_ONLY);
-                                unless (is($ret_ref->[0], 0,
-                                        "$this_name, CHECK_ONLY: Returns 0"))
-                                {
-                                    diag $call;
-                                }
-                                unless (is($ret_ref->[1], -1,
-                                    "$this_name: CHECK_ONLY: returns -1 for"
-                                  . " length"))
-                                {
-                                    diag $call;
-                                }
-                                if (! is(scalar @warnings, 0,
-                                    "$this_name, CHECK_ONLY: no warnings"
-                                  . " generated"))
-                                {
-                                    diag $call;
-                                    output_warnings(@warnings);
-                                }
-                            }
+                    # Check CHECK_ONLY results when the input is
+                    # disallowed.  Do this when actually disallowed,
+                    # not just when the $disallow_flag is set
+                    if ($disallowed) {
+                        undef @warnings;
+                        $ret_ref = test_utf8n_to_uvchr_error(
+                                    $this_bytes, $this_length,
+                                    $disallow_flag|$UTF8_CHECK_ONLY);
+                        unless (is($ret_ref->[0], 0,
+                                "$this_name, CHECK_ONLY: Returns 0"))
+                        {
+                            diag $call;
+                        }
+                        unless (is($ret_ref->[1], -1,
+                            "$this_name: CHECK_ONLY: returns -1 for"
+                        . " length"))
+                        {
+                            diag $call;
+                        }
+                        if (! is(scalar @warnings, 0,
+                            "$this_name, CHECK_ONLY: no warnings"
+                        . " generated"))
+                        {
+                            diag $call;
+                            output_warnings(@warnings);
+                        }
+                    }
 
-                            # Now repeat some of the above, but for
-                            # uvchr_to_utf8_flags().  Since this comes from an
-                            # existing code point, it hasn't overflowed, and
-                            # isn't malformed.
-                            next if @malformations;
-
-                            # The warning and disallow flags passed in are for
-                            # utf8n_to_uvchr_error().  Convert them for
-                            # uvchr_to_utf8_flags().
-                            my $uvchr_warn_flag = 0;
-                            my $uvchr_disallow_flag = 0;
-                            if ($warn_flag) {
-                                if ($warn_flag == $UTF8_WARN_SURROGATE) {
-                                    $uvchr_warn_flag = $UNICODE_WARN_SURROGATE
-                                }
-                                elsif ($warn_flag == $UTF8_WARN_NONCHAR) {
-                                    $uvchr_warn_flag = $UNICODE_WARN_NONCHAR
-                                }
-                                elsif ($warn_flag == $UTF8_WARN_SUPER) {
-                                    $uvchr_warn_flag = $UNICODE_WARN_SUPER
-                                }
-                                elsif ($warn_flag == $UTF8_WARN_ABOVE_31_BIT) {
-                                    $uvchr_warn_flag
-                                                   = 
$UNICODE_WARN_ABOVE_31_BIT;
-                                }
-                                else {
-                                    fail(sprintf "Unexpected warn flag: %x",
-                                        $warn_flag);
-                                    next;
-                                }
-                            }
-                            if ($disallow_flag) {
-                                if ($disallow_flag == $UTF8_DISALLOW_SURROGATE)
-                                {
-                                    $uvchr_disallow_flag
-                                                = $UNICODE_DISALLOW_SURROGATE;
-                                }
-                                elsif ($disallow_flag == 
$UTF8_DISALLOW_NONCHAR)
-                                {
-                                    $uvchr_disallow_flag
-                                                = $UNICODE_DISALLOW_NONCHAR;
-                                }
-                                elsif ($disallow_flag == $UTF8_DISALLOW_SUPER) 
{
-                                    $uvchr_disallow_flag
-                                                  = $UNICODE_DISALLOW_SUPER;
-                                }
-                                elsif ($disallow_flag
-                                                == $UTF8_DISALLOW_ABOVE_31_BIT)
-                                {
-                                    $uvchr_disallow_flag =
-                                                $UNICODE_DISALLOW_ABOVE_31_BIT;
-                                }
-                                else {
-                                    fail(sprintf "Unexpected disallow flag: 
%x",
-                                        $disallow_flag);
-                                    next;
-                                }
-                            }
+                    # Now repeat some of the above, but for
+                    # uvchr_to_utf8_flags().  Since this comes from an
+                    # existing code point, it hasn't overflowed, and
+                    # isn't malformed.
+                    next if @malformations;
+
+                    # The warning and disallow flags passed in are for
+                    # utf8n_to_uvchr_error().  Convert them for
+                    # uvchr_to_utf8_flags().
+                    my $uvchr_warn_flag = 0;
+                    my $uvchr_disallow_flag = 0;
+                    if ($warn_flag) {
+                        if ($warn_flag == $UTF8_WARN_SURROGATE) {
+                            $uvchr_warn_flag = $UNICODE_WARN_SURROGATE
+                        }
+                        elsif ($warn_flag == $UTF8_WARN_NONCHAR) {
+                            $uvchr_warn_flag = $UNICODE_WARN_NONCHAR
+                        }
+                        elsif ($warn_flag == $UTF8_WARN_SUPER) {
+                            $uvchr_warn_flag = $UNICODE_WARN_SUPER
+                        }
+                        elsif ($warn_flag == $UTF8_WARN_ABOVE_31_BIT) {
+                            $uvchr_warn_flag
+                                        = $UNICODE_WARN_ABOVE_31_BIT;
+                        }
+                        else {
+                            fail(sprintf "Unexpected warn flag: %x",
+                                $warn_flag);
+                            next;
+                        }
+                    }
+                    if ($disallow_flag) {
+                        if ($disallow_flag == $UTF8_DISALLOW_SURROGATE)
+                        {
+                            $uvchr_disallow_flag
+                                        = $UNICODE_DISALLOW_SURROGATE;
+                        }
+                        elsif ($disallow_flag == $UTF8_DISALLOW_NONCHAR)
+                        {
+                            $uvchr_disallow_flag
+                                        = $UNICODE_DISALLOW_NONCHAR;
+                        }
+                        elsif ($disallow_flag == $UTF8_DISALLOW_SUPER) {
+                            $uvchr_disallow_flag
+                                        = $UNICODE_DISALLOW_SUPER;
+                        }
+                        elsif ($disallow_flag
+                                        == $UTF8_DISALLOW_ABOVE_31_BIT)
+                        {
+                            $uvchr_disallow_flag =
+                                        $UNICODE_DISALLOW_ABOVE_31_BIT;
+                        }
+                        else {
+                            fail(sprintf "Unexpected disallow flag: %x",
+                                $disallow_flag);
+                            next;
+                        }
+                    }
 
-                            $disallowed = $uvchr_disallow_flag;
-
-                            $this_name = "uvchr_to_utf8_flags() $testname: "
-                                                    . (($uvchr_disallow_flag)
-                                                        ? 'disallowed'
-                                                        : ($disallowed)
-                                                        ? 'ABOVE_31_BIT 
allowed'
-                                                        : 'allowed');
-                            $this_name .= ", $eval_warn";
-                            $this_name .= ", " . (($uvchr_warn_flag)
-                                                ? 'with warning flag'
-                                                : 'no warning flag');
-
-                            undef @warnings;
-                            my $ret;
-                            my $warn_flag = sprintf "0x%x", $uvchr_warn_flag;
-                            my $disallow_flag = sprintf "0x%x",
-                                                        $uvchr_disallow_flag;
-                            $call = sprintf("call was: $eval_warn; \$ret"
-                                          . " = test_uvchr_to_utf8_flags("
-                                          . " 0x%x, 
$warn_flag|$disallow_flag)",
-                                        $allowed_uv);
-                            $eval_text = "$eval_warn; \$ret ="
-                                       . " test_uvchr_to_utf8_flags("
-                                       . "$allowed_uv, $warn_flag|"
-                                       . "$disallow_flag)";
-                            eval "$eval_text";
-                            if (! ok ("$@ eq ''", "$this_name: eval 
succeeded"))
-                            {
-                                diag "\$!='$!'; eval'd=\"$eval_text\"";
-                                next;
-                            }
-                            if ($disallowed) {
-                                unless (is($ret, undef,
-                                        "$this_name: Returns undef"))
-                                {
-                                    diag $call;
-                                }
-                            }
-                            else {
-                                unless (is($ret, $bytes,
-                                        "$this_name: Returns expected string"))
-                                {
-                                    diag $call;
-                                }
-                            }
-                            if (! $do_warning
-                                && ($warning eq 'utf8' || $warning eq 
$category))
-                            {
-                                if (!is(scalar @warnings, 0,
-                                        "$this_name: No warnings generated"))
-                                {
-                                    diag $call;
-                                    output_warnings(@warnings);
-                                }
-                            }
-                            elsif (       $uvchr_warn_flag
-                                   && (   $warning eq 'utf8'
-                                       || $warning eq $category))
+                    $disallowed = $uvchr_disallow_flag;
+
+                    $this_name = "uvchr_to_utf8_flags() $testname: "
+                                            . (($uvchr_disallow_flag)
+                                                ? 'disallowed'
+                                                : ($disallowed)
+                                                ? 'ABOVE_31_BIT allowed'
+                                                : 'allowed');
+                    $this_name .= ", $eval_warn";
+                    $this_name .= ", " . (($uvchr_warn_flag)
+                                        ? 'with warning flag'
+                                        : 'no warning flag');
+
+                    undef @warnings;
+                    my $ret;
+                    my $warn_flag = sprintf "0x%x", $uvchr_warn_flag;
+                    my $disallow_flag = sprintf "0x%x",
+                                                $uvchr_disallow_flag;
+                    $call = sprintf("    Call was: $eval_warn; \$ret"
+                                . " = test_uvchr_to_utf8_flags("
+                                . " 0x%x, $warn_flag|$disallow_flag)",
+                                $allowed_uv);
+                    $eval_text = "$eval_warn; \$ret ="
+                            . " test_uvchr_to_utf8_flags("
+                            . "$allowed_uv, $warn_flag|"
+                            . "$disallow_flag)";
+                    eval "$eval_text";
+                    if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
+                    {
+                        diag "\$!='$!'; eval'd=\"$eval_text\"";
+                        next;
+                    }
+                    if ($disallowed) {
+                        unless (is($ret, undef,
+                                "$this_name: Returns undef"))
+                        {
+                            diag $call;
+                        }
+                    }
+                    else {
+                        unless (is($ret, $bytes,
+                                "$this_name: Returns expected string"))
+                        {
+                            diag $call;
+                        }
+                    }
+                    if (! $do_warning
+                        && ($warning eq 'utf8' || $warning eq $category))
+                    {
+                        if (!is(scalar @warnings, 0,
+                                "$this_name: No warnings generated"))
+                        {
+                            diag $call;
+                            output_warnings(@warnings);
+                        }
+                    }
+                    elsif (       $uvchr_warn_flag
+                        && (   $warning eq 'utf8'
+                            || $warning eq $category))
+                    {
+                        if (is(scalar @warnings, 1,
+                            "$this_name: Got a single warning "))
+                        {
+                            unless (like($warnings[0], $message,
+                                    "$this_name: Got expected warning"))
                             {
-                                if (is(scalar @warnings, 1,
-                                    "$this_name: Got a single warning "))
-                                {
-                                    unless (like($warnings[0], $message,
-                                            "$this_name: Got expected 
warning"))
-                                    {
-                                        diag $call;
-                                    }
-                                }
-                                else {
-                                    diag $call;
-                                    output_warnings(@warnings)
-                                                        if scalar @warnings;
-                                }
+                                diag $call;
                             }
                         }
+                        else {
+                            diag $call;
+                            output_warnings(@warnings)
+                                                if scalar @warnings;
+                        }
                     }
                 }
+              }
             }
+          }
         }
+      }
     }
 }
 
diff --git a/lib/utf8.t b/lib/utf8.t
index 0530faf..06d9a84 100644
--- a/lib/utf8.t
+++ b/lib/utf8.t
@@ -165,7 +165,7 @@ no utf8; # Ironic, no?
     use utf8; %a = ("\xE1\xA0"=>"sterling");
     print 'start'; printf '%x,', ord \$_ foreach keys %a; print "end\n";
 BANG
-             qr/^Malformed UTF-8 character: .*? \(too short; got \d bytes?, 
need \d\).*start\d+,end$/sm
+             qr/^Malformed UTF-8 character: .*? \(too short; \d bytes? 
available, need \d\).*start\d+,end$/sm
             ],
             );
     foreach (@tests) {
diff --git a/t/op/lex.t b/t/op/lex.t
index df96ed7..bd6bb0f 100644
--- a/t/op/lex.t
+++ b/t/op/lex.t
@@ -255,13 +255,13 @@ SKIP:
     fresh_perl_is(
         "BEGIN{\$^H=hex ~0}\xF3",
         "Integer overflow in hexadecimal number at - line 1.\n" .
-        "Malformed UTF-8 character: \\xf3 (too short; got 1 byte, need 4) at - 
line 1.",
+        "Malformed UTF-8 character: \\xf3 (too short; 1 byte available, need 
4) at - line 1.",
         {},
         '[perl #128996] - use of PL_op after op is freed'
     );
     fresh_perl_like(
         qq(BEGIN{\$0="";\$^H=-hex join""=>1}""\xFF),
-        qr/Malformed UTF-8 character: \\xff \(too short; got 1 byte, need 13\) 
at - line 1\./,
+        qr/Malformed UTF-8 character: \\xff \(too short; 1 byte available, 
need 13\) at - line 1\./,
         {},
         '[perl #128997] - buffer read overflow'
     );
diff --git a/t/op/utf8decode.t b/t/op/utf8decode.t
index 90c233a..b56c38b 100644
--- a/t/op/utf8decode.t
+++ b/t/op/utf8decode.t
@@ -83,15 +83,28 @@ foreach (<DATA>) {
            isnt($experr, '', "Expected first warning for $id provided");
 
             my $message;
+            my $after = "";
             if ($expect64 != $expect32 && ! $is64bit) {
                 like($warnings[0], qr/overflow/, "overflow warning for $id 
seen");
                 shift @warnings;
-                $message = "Expected first warning after overflow for $id 
seen";
+                $after .= "overflow";
             }
-            else {
-                $message = "Expected first warning for $id seen";
+
+            # The data below assumes that if there is both a 'short' and
+            # 'non-continuation' malformation, the latter has precedence.  But
+            # that has changed, and rather than mess with the data, this works
+            # around that.
+            if (   @warnings > 1
+                && $warnings[0] =~ /short/
+                && $warnings[1] =~ /unexpected non-continuation/)
+            {
+                $after .= " and " if $after;
+                $after .= "short";
+                shift @warnings;
             }
-           like($warnings[0], qr/$experr/, $message);
+            $after = "after $after " if $after;
+
+           like($warnings[0], qr/$experr/, "Expected first warning ${after}for 
$id seen");
            local $::TODO;
            if ($expect < 0) {
                $expect = -$expect;
@@ -150,27 +163,27 @@ __DATA__
 3.1.9 N64 -    64      
80:81:82:83:84:85:86:87:88:89:8a:8b:8c:8d:8e:8f:90:91:92:93:94:95:96:97:98:99:9a:9b:9c:9d:9e:9f:a0:a1:a2:a3:a4:a5:a6:a7:a8:a9:aa:ab:ac:ad:ae:af:b0:b1:b2:b3:b4:b5:b6:b7:b8:b9:ba:bb:bc:b
 ... [43 chars truncated]
 3.2    Lonely start characters
 3.2.1 N34 -    64      
c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:
 ... [83 chars truncated]
-3.2.2 N16 -    32      
e0:20:e1:20:e2:20:e3:20:e4:20:e5:20:e6:20:e7:20:e8:20:e9:20:ea:20:eb:20:ec:20:ed:20:ee:20:ef:20
 -       unexpected non-continuation byte 0x20, immediately after start byte 
0xe0
-3.2.3 N8 -     16      f0:20:f1:20:f2:20:f3:20:f4:20:f5:20:f6:20:f7:20 -       
unexpected non-continuation byte 0x20, immediately after start byte 0xf0
-3.2.4 N4 -     8       f8:20:f9:20:fa:20:fb:20 -       unexpected 
non-continuation byte 0x20, immediately after start byte 0xf8
-3.2.5 N2 -     4       fc:20:fd:20     -       unexpected non-continuation 
byte 0x20, immediately after start byte 0xfc
+3.2.2 N17 -    32      
e0:20:e1:20:e2:20:e3:20:e4:20:e5:20:e6:20:e7:20:e8:20:e9:20:ea:20:eb:20:ec:20:ed:20:ee:20:ef:20
 -       unexpected non-continuation byte 0x20, immediately after start byte 
0xe0
+3.2.3 N9 -     16      f0:20:f1:20:f2:20:f3:20:f4:20:f5:20:f6:20:f7:20 -       
unexpected non-continuation byte 0x20, immediately after start byte 0xf0
+3.2.4 N6 -     8       f8:20:f9:20:fa:20:fb:20 -       unexpected 
non-continuation byte 0x20, immediately after start byte 0xf8
+3.2.5 N4 -     4       fc:20:fd:20     -       unexpected non-continuation 
byte 0x20, immediately after start byte 0xfc
 3.3    Sequences with last continuation byte missing
-3.3.1 N2 -     1       c0      -       1 byte, need 2
-3.3.2 N2 -     2       e0:80   -       2 bytes, need 3
-3.3.3 N2 -     3       f0:80:80        -       3 bytes, need 4
-3.3.4 N2 -     4       f8:80:80:80     -       4 bytes, need 5
-3.3.5 N2 -     5       fc:80:80:80:80  -       5 bytes, need 6
-3.3.6 n -      1       df      -       1 byte, need 2
-3.3.7 n -      2       ef:bf   -       2 bytes, need 3
-3.3.8 n -      3       f7:bf:bf        -       3 bytes, need 4
-3.3.9 n -      4       fb:bf:bf:bf     -       4 bytes, need 5
-3.3.10 n -     5       fd:bf:bf:bf:bf  -       5 bytes, need 6
+3.3.1 N2 -     1       c0      -       1 byte available, need 2
+3.3.2 N2 -     2       e0:80   -       2 bytes available, need 3
+3.3.3 N2 -     3       f0:80:80        -       3 bytes available, need 4
+3.3.4 N2 -     4       f8:80:80:80     -       4 bytes available, need 5
+3.3.5 N2 -     5       fc:80:80:80:80  -       5 bytes available, need 6
+3.3.6 n -      1       df      -       1 byte available, need 2
+3.3.7 n -      2       ef:bf   -       2 bytes available, need 3
+3.3.8 n -      3       f7:bf:bf        -       3 bytes available, need 4
+3.3.9 n -      4       fb:bf:bf:bf     -       4 bytes available, need 5
+3.3.10 n -     5       fd:bf:bf:bf:bf  -       5 bytes available, need 6
 3.4    Concatenation of incomplete sequences
 3.4.1 N15 -    30      
c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf
       -       unexpected non-continuation byte 0xe0, immediately after start 
byte 0xc0
 3.5    Impossible bytes (but not with Perl's extended UTF-8)
-3.5.1 n -      1       fe      -       1 byte, need 7
-3.5.2 N2,1 -   1       ff      -       1 byte, need 13
-3.5.3 N8,5 -   4       fe:fe:ff:ff     -       byte 0xfe
+3.5.1 n -      1       fe      -       1 byte available, need 7
+3.5.2 N2,1 -   1       ff      -       1 byte available, need 13
+3.5.3 N11,8 -  4       fe:fe:ff:ff     -       byte 0xfe
 4      Overlong sequences
 4.1    Examples of an overlong ASCII character
 4.1.1 n -      2       c0:af   -       overlong
diff --git a/utf8.c b/utf8.c
index 4146361..da575a9 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1015,6 +1015,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
     STRLEN expectlen   = 0;     /* How long should this sequence be?
                                    (initialized to silence compilers' wrong
                                    warning) */
+    STRLEN avail_len   = 0;     /* When input is too short, gives what that is 
*/
     U32 discard_errors = 0;     /* Used to save branches when 'errors' is NULL;
                                    this gets set and discarded */
 
@@ -1101,12 +1102,21 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
      * sequence, leaving just the bits that are part of the value.  */
     uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
 
+    /* Setup the loop end point, making sure to not look past the end of the
+     * input string, and flag it as too short if the size isn't big enough. */
+    send = (U8*) s0;
+    if (UNLIKELY(curlen < expectlen)) {
+        possible_problems |= UTF8_GOT_SHORT;
+        avail_len = curlen;
+        send += curlen;
+    }
+    else {
+        send += expectlen;
+    }
+    adjusted_send = send;
+
     /* Now, loop through the remaining bytes in the character's sequence,
-     * accumulating each into the working value as we go.  Be sure to not look
-     * past the end of the input string */
-    send = adjusted_send = (U8*) s0 + ((expectlen <= curlen)
-                                       ? expectlen
-                                       : curlen);
+     * accumulating each into the working value as we go. */
     for (s = s0 + 1; s < send; s++) {
        if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
            uv = UTF8_ACCUMULATE(uv, *s);
@@ -1116,21 +1126,17 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
         /* Here, found a non-continuation before processing all expected bytes.
          * This byte indicates the beginning of a new character, so quit, even
          * if allowing this malformation. */
-        curlen = s - s0;    /* Save how many bytes we actually got */
         possible_problems |= UTF8_GOT_NON_CONTINUATION;
-        goto finish_short;
+        break;
     } /* End of loop through the character's bytes */
 
     /* Save how many bytes were actually in the character */
     curlen = s - s0;
 
-    /* Did we get all the continuation bytes that were expected?  Note that we
-     * know this result even without executing the loop above.  But we had to
-     * do the loop to see if there are unexpected non-continuations. */
-    if (UNLIKELY(curlen < expectlen)) {
-       possible_problems |= UTF8_GOT_SHORT;
+    /* A convenience macro that matches either of the too-short conditions.  */
+#   define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
 
-      finish_short:
+    if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
         uv_so_far = uv;
         uv = UNICODE_REPLACEMENT;
     }
@@ -1164,10 +1170,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
     {
         possible_problems |= UTF8_GOT_LONG;
 
-        /* A convenience macro that matches either of the too-short conditions.
-         * */
-#       define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
-
         if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
             UV min_uv = uv_so_far;
             STRLEN i;
@@ -1264,6 +1266,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
     /* At this point:
      * curlen               contains the number of bytes in the sequence that
      *                      this call should advance the input by.
+     * avail_len            gives the available number of bytes passed in, but
+     *                      only if this is less than the expected number of
+     *                      bytes, based on the code point's start byte.
      * possible_problems'   is 0 if there weren't any problems; otherwise a bit
      *                      is set in it for each potential problem found.
      * uv                   contains the code point the input sequence
@@ -1360,22 +1365,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                     }
                 }
             }
-            else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
-                possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
-                *errors |= UTF8_GOT_NON_CONTINUATION;
-
-                if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
-                    disallowed = TRUE;
-                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
-                        pack_warn = packWARN(WARN_UTF8);
-                        message = Perl_form(aTHX_ "%s",
-                            unexpected_non_continuation_text(s0,
-                                                            send - s0,
-                                                            s - s0,
-                                                            (int) expectlen));
-                    }
-                }
-            }
             else if (possible_problems & UTF8_GOT_SHORT) {
                 possible_problems &= ~UTF8_GOT_SHORT;
                 *errors |= UTF8_GOT_SHORT;
@@ -1385,16 +1374,32 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                     if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
                         pack_warn = packWARN(WARN_UTF8);
                         message = Perl_form(aTHX_
-                                "%s: %s (too short; got %d byte%s, need %d)",
+                                "%s: %s (too short; %d byte%s available, need 
%d)",
                                 malformed_text,
                                 _byte_dump_string(s0, send - s0),
-                                (int)curlen,
-                                curlen == 1 ? "" : "s",
+                                (int)avail_len,
+                                avail_len == 1 ? "" : "s",
                                 (int)expectlen);
                     }
                 }
 
             }
+            else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
+                possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
+                *errors |= UTF8_GOT_NON_CONTINUATION;
+
+                if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
+                    disallowed = TRUE;
+                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                        pack_warn = packWARN(WARN_UTF8);
+                        message = Perl_form(aTHX_ "%s",
+                            unexpected_non_continuation_text(s0,
+                                                            send - s0,
+                                                            s - s0,
+                                                            (int) expectlen));
+                    }
+                }
+            }
             else if (possible_problems & UTF8_GOT_LONG) {
                 possible_problems &= ~UTF8_GOT_LONG;
                 *errors |= UTF8_GOT_LONG;

--
Perl5 Master Repository

Reply via email to