In perl.git, the branch blead has been updated <https://perl5.git.perl.org/perl.git/commitdiff/9c13cd3cdfa6ab6920882a355869287a277989c3?hp=0b08cab0fc46a5f381ca18a451f55cf12c81d966>
- Log ----------------------------------------------------------------- commit 9c13cd3cdfa6ab6920882a355869287a277989c3 Author: Karl Williamson <[email protected]> Date: Sun Jan 28 14:48:53 2018 -0700 APItest/APItest.xs: Simplify mappings Instead of using SVs, use the underlying C type, and so the code here doesn't have to deal with the SV conversions commit e08037291c2f611062f5eb94bf15c8607efe5bcc Author: Karl Williamson <[email protected]> Date: Sun Jan 28 14:47:16 2018 -0700 APItest/t/utf8_warn_base.pl: White-space only This outdents a bunch of code to make it a shift width of 2 instead of 4 because the nesting was getting too deep, making the space available on a line too short. commit 23038144c235075a2b8963ddcd94b9f94de1996f Author: Karl Williamson <[email protected]> Date: Sun Jan 28 14:43:00 2018 -0700 APItest/t/utf8_warn_base.pl: Improve diagnostics commit 37657a5b6c74c2e0dea5f3efa1407aaf51790d35 Author: Karl Williamson <[email protected]> Date: Sat Jan 27 17:43:00 2018 -0700 Add utf8n_to_uvchr_msgs() This UTF-8 to code point translator variant is to meet the needs of Encode, and provides XS authors with more general capability than the other decoders. ----------------------------------------------------------------------- Summary of changes: embed.fnc | 8 +- embed.h | 2 +- ext/XS-APItest/APItest.xs | 53 +- ext/XS-APItest/t/utf8_warn_base.pl | 1400 +++++++++++++++++++----------------- proto.h | 3 + utf8.c | 147 +++- utf8.h | 2 + 7 files changed, 929 insertions(+), 686 deletions(-) diff --git a/embed.fnc b/embed.fnc index e16c8a65f9..35202e8d7c 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1860,11 +1860,17 @@ Adop |UV |utf8n_to_uvchr |NN const U8 *s \ |STRLEN curlen \ |NULLOK STRLEN *retlen \ |const U32 flags -Adp |UV |utf8n_to_uvchr_error|NN const U8 *s \ +Adop |UV |utf8n_to_uvchr_error|NN const U8 *s \ |STRLEN curlen \ |NULLOK STRLEN *retlen \ |const U32 flags \ |NULLOK U32 * errors +Adp |UV |utf8n_to_uvchr_msgs|NN const U8 *s \ + |STRLEN curlen \ + |NULLOK STRLEN *retlen \ + |const U32 flags \ + |NULLOK U32 * errors \ + |NULLOK AV ** msgs AipnR |UV |valid_utf8_to_uvchr |NN const U8 *s|NULLOK STRLEN *retlen Ap |UV |utf8n_to_uvuni|NN const U8 *s|STRLEN curlen|NULLOK STRLEN *retlen|U32 flags diff --git a/embed.h b/embed.h index 008b8067b7..334c6063fb 100644 --- a/embed.h +++ b/embed.h @@ -736,7 +736,7 @@ #define utf8_to_uvchr(a,b) Perl_utf8_to_uvchr(aTHX_ a,b) #define utf8_to_uvuni(a,b) Perl_utf8_to_uvuni(aTHX_ a,b) #define utf8_to_uvuni_buf(a,b,c) Perl_utf8_to_uvuni_buf(aTHX_ a,b,c) -#define utf8n_to_uvchr_error(a,b,c,d,e) Perl_utf8n_to_uvchr_error(aTHX_ a,b,c,d,e) +#define utf8n_to_uvchr_msgs(a,b,c,d,e,f) Perl_utf8n_to_uvchr_msgs(aTHX_ a,b,c,d,e,f) #define utf8n_to_uvuni(a,b,c,d) Perl_utf8n_to_uvuni(aTHX_ a,b,c,d) #define uvoffuni_to_utf8_flags(a,b,c) Perl_uvoffuni_to_utf8_flags(aTHX_ a,b,c) #define uvuni_to_utf8(a,b) Perl_uvuni_to_utf8(aTHX_ a,b) diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs index 0ad08237af..0be5d95310 100644 --- a/ext/XS-APItest/APItest.xs +++ b/ext/XS-APItest/APItest.xs @@ -1379,16 +1379,55 @@ bytes_cmp_utf8(bytes, utf8) OUTPUT: RETVAL +AV * +test_utf8n_to_uvchr_msgs(s, len, flags) + char *s + STRLEN len + U32 flags + PREINIT: + STRLEN retlen; + UV ret; + U32 errors; + AV *msgs = NULL; + + CODE: + RETVAL = newAV(); + sv_2mortal((SV*)RETVAL); + + ret = utf8n_to_uvchr_msgs((U8*) s, + len, + &retlen, + flags, + &errors, + &msgs); + + /* Returns the return value in [0]; <retlen> in [1], <errors> in [2] */ + av_push(RETVAL, newSVuv(ret)); + if (retlen == (STRLEN) -1) { + av_push(RETVAL, newSViv(-1)); + } + else { + av_push(RETVAL, newSVuv(retlen)); + } + av_push(RETVAL, newSVuv(errors)); + + /* And any messages in [3] */ + if (msgs) { + av_push(RETVAL, newRV_noinc((SV*)msgs)); + } + + OUTPUT: + RETVAL + AV * test_utf8n_to_uvchr_error(s, len, flags) - SV *s - SV *len - SV *flags + char *s + STRLEN len + U32 flags PREINIT: STRLEN retlen; UV ret; - STRLEN slen; U32 errors; CODE: @@ -1401,10 +1440,10 @@ test_utf8n_to_uvchr_error(s, len, flags) RETVAL = newAV(); sv_2mortal((SV*)RETVAL); - ret = utf8n_to_uvchr_error((U8*) SvPV(s, slen), - SvUV(len), + ret = utf8n_to_uvchr_error((U8*) s, + len, &retlen, - SvUV(flags), + flags, &errors); /* Returns the return value in [0]; <retlen> in [1], <errors> in [2] */ diff --git a/ext/XS-APItest/t/utf8_warn_base.pl b/ext/XS-APItest/t/utf8_warn_base.pl index 91de8a8711..6c3b04afeb 100644 --- a/ext/XS-APItest/t/utf8_warn_base.pl +++ b/ext/XS-APItest/t/utf8_warn_base.pl @@ -702,653 +702,673 @@ sub do_warnings_test(@) my $num_test_files = $ENV{TEST_JOBS} || 1; $num_test_files = 10 if $num_test_files > 10; +# We only really need to test utf8n_to_uvchr_msgs() once with this flag. +my $tested_CHECK_ONLY = 0; + my $test_count = -1; foreach my $test (@tests) { - $test_count++; - next if $test_count % $num_test_files != $::TEST_CHUNK; - - my ($testname, $bytes, $allowed_uv, $needed_to_discern_len) = @$test; - - my $length = length $bytes; - my $initially_overlong = $testname =~ /overlong/; - my $initially_orphan = $testname =~ /orphan/; - my $will_overflow = $allowed_uv < 0; - - my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv); - my $display_bytes = display_bytes($bytes); - - my $controlling_warning_category; - my $utf8n_flag_to_warn; - my $utf8n_flag_to_disallow; - my $uvchr_flag_to_warn; - my $uvchr_flag_to_disallow; - - # We want to test that the independent flags are actually independent. - # For example, that a surrogate doesn't trigger a non-character warning, - # and conversely, turning off an above-Unicode flag doesn't suppress a - # surrogate warning. Earlier versions of this file used nested loops to - # test all possible combinations. But that creates lots of tests, making - # this run too long. What is now done instead is to use the complement of - # the category we are testing to greatly reduce the combinatorial - # explosion. For example, if we have a surrogate and we aren't expecting - # a warning about it, we set all the flags for non-surrogates to raise - # warnings. If one shows up, it indicates the flags aren't independent. - my $utf8n_flag_to_warn_complement; - my $utf8n_flag_to_disallow_complement; - my $uvchr_flag_to_warn_complement; - my $uvchr_flag_to_disallow_complement; - - # Many of the code points being tested are middling in that if code point - # edge cases work, these are very likely to as well. Because this test - # file takes a while to execute, we skip testing the edge effects of code - # points deemed middling, while testing their basics and continuing to - # fully test the non-middling code points. - my $skip_most_tests = 0; - - my $cp_message_qr; # Pattern that matches the message raised when - # that message contains the problematic code - # point. The message is the same (currently) both - # when going from/to utf8. - my $non_cp_trailing_text; # The suffix text when the message doesn't - # contain a code point. (This is a result of - # some sort of malformation that means we - # can't get an exact code poin - my $extended_cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E - \Q requires a Perl extension, and so is not\E - \Q portable\E/x; - my $extended_non_cp_trailing_text - = "is a Perl extension, and so is not portable"; - - # What bytes should have been used to specify a code point that has been - # specified as an overlong. - my $correct_bytes_for_overlong; - - # Is this test malformed from the beginning? If so, we know to generally - # expect that the tests will show it isn't valid. - my $initially_malformed = 0; - - if ($initially_overlong || $initially_orphan) { - $non_cp_trailing_text = "if you see this, there is an error"; - $cp_message_qr = qr/\Q$non_cp_trailing_text\E/; - $initially_malformed = 1; - $utf8n_flag_to_warn = 0; - $utf8n_flag_to_disallow = 0; - - $utf8n_flag_to_warn_complement = $::UTF8_WARN_SURROGATE; - $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE; - if (! $will_overflow && $allowed_uv <= 0x10FFFF) { - $utf8n_flag_to_warn_complement |= $::UTF8_WARN_SUPER; - $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_SUPER; - if (($allowed_uv & 0xFFFF) != 0xFFFF) { - $utf8n_flag_to_warn_complement |= $::UTF8_WARN_NONCHAR; - $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_NONCHAR; - } - } - if (! is_extended_utf8($bytes)) { - $utf8n_flag_to_warn_complement |= $::UTF8_WARN_PERL_EXTENDED; - $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_PERL_EXTENDED; - } - - $controlling_warning_category = 'utf8'; - - if ($initially_overlong) { - if (! defined $needed_to_discern_len) { - $needed_to_discern_len = overlong_discern_len($bytes); - } - $correct_bytes_for_overlong = display_bytes_no_quotes(chr $allowed_uv); - } - } - elsif($will_overflow || $allowed_uv > 0x10FFFF) { - - # Set the SUPER flags; later, we test for PERL_EXTENDED as well. - $utf8n_flag_to_warn = $::UTF8_WARN_SUPER; - $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER; - $uvchr_flag_to_warn = $::UNICODE_WARN_SUPER; - $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;; - - # Below, we add the flags for non-perl_extended to the code points - # that don't fit that category. Special tests are done for this - # category in the inner loop. - $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR - |$::UTF8_WARN_SURROGATE; - $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR - |$::UTF8_DISALLOW_SURROGATE; - $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR - |$::UNICODE_WARN_SURROGATE; - $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR - |$::UNICODE_DISALLOW_SURROGATE; - $controlling_warning_category = 'non_unicode'; - - if ($will_overflow) { # This is realy a malformation - $non_cp_trailing_text = "if you see this, there is an error"; - $cp_message_qr = qr/\Q$non_cp_trailing_text\E/; - $initially_malformed = 1; - if (! defined $needed_to_discern_len) { - $needed_to_discern_len = overflow_discern_len($length); - } - } - elsif (requires_extended_utf8($allowed_uv)) { - $cp_message_qr = $extended_cp_message_qr; - $non_cp_trailing_text = $extended_non_cp_trailing_text; - $needed_to_discern_len = 1 unless defined $needed_to_discern_len; - } - else { - $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E - \Q may not be portable\E/x; - $non_cp_trailing_text = "is for a non-Unicode code point, may not" - . " be portable"; - $utf8n_flag_to_warn_complement |= $::UTF8_WARN_PERL_EXTENDED; - $utf8n_flag_to_disallow_complement - |= $::UTF8_DISALLOW_PERL_EXTENDED; - $uvchr_flag_to_warn_complement |= $::UNICODE_WARN_PERL_EXTENDED; - $uvchr_flag_to_disallow_complement - |= $::UNICODE_DISALLOW_PERL_EXTENDED; - } - } - elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) { - $cp_message_qr = qr/UTF-16 surrogate U\+$uv_string/; - $non_cp_trailing_text = "is for a surrogate"; - $needed_to_discern_len = 2 unless defined $needed_to_discern_len; - $skip_most_tests = 1 if $allowed_uv > 0xD800 && $allowed_uv < 0xDFFF; - - $utf8n_flag_to_warn = $::UTF8_WARN_SURROGATE; - $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SURROGATE; - $uvchr_flag_to_warn = $::UNICODE_WARN_SURROGATE; - $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SURROGATE;; - - $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR - |$::UTF8_WARN_SUPER - |$::UTF8_WARN_PERL_EXTENDED; - $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR - |$::UTF8_DISALLOW_SUPER - |$::UTF8_DISALLOW_PERL_EXTENDED; - $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR - |$::UNICODE_WARN_SUPER - |$::UNICODE_WARN_PERL_EXTENDED; - $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR - |$::UNICODE_DISALLOW_SUPER - |$::UNICODE_DISALLOW_PERL_EXTENDED; - $controlling_warning_category = 'surrogate'; - } - elsif ( ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF) - || ($allowed_uv & 0xFFFE) == 0xFFFE) - { - $cp_message_qr = qr/\QUnicode non-character U+$uv_string\E - \Q is not recommended for open interchange\E/x; - $non_cp_trailing_text = "if you see this, there is an error"; - $needed_to_discern_len = $length unless defined $needed_to_discern_len; - if ( ($allowed_uv > 0xFDD0 && $allowed_uv < 0xFDEF) - || ($allowed_uv > 0xFFFF && $allowed_uv < 0x10FFFE)) - { - $skip_most_tests = 1; - } - - $utf8n_flag_to_warn = $::UTF8_WARN_NONCHAR; - $utf8n_flag_to_disallow = $::UTF8_DISALLOW_NONCHAR; - $uvchr_flag_to_warn = $::UNICODE_WARN_NONCHAR; - $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_NONCHAR;; - - $utf8n_flag_to_warn_complement = $::UTF8_WARN_SURROGATE - |$::UTF8_WARN_SUPER - |$::UTF8_WARN_PERL_EXTENDED; - $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE - |$::UTF8_DISALLOW_SUPER - |$::UTF8_DISALLOW_PERL_EXTENDED; - $uvchr_flag_to_warn_complement = $::UNICODE_WARN_SURROGATE - |$::UNICODE_WARN_SUPER - |$::UNICODE_WARN_PERL_EXTENDED; - $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_SURROGATE - |$::UNICODE_DISALLOW_SUPER - |$::UNICODE_DISALLOW_PERL_EXTENDED; - - $controlling_warning_category = 'nonchar'; - } - else { - die "Can't figure out what type of warning to test for $testname" - } - - die 'Didn\'t set $needed_to_discern_len for ' . $testname - unless defined $needed_to_discern_len; - - # We try various combinations of malformations that can occur - foreach my $short (0, 1) { - next if $skip_most_tests && $short; - foreach my $unexpected_noncont (0, 1) { - next if $skip_most_tests && $unexpected_noncont; - foreach my $overlong (0, 1) { - next if $overlong && $skip_most_tests; - next if $initially_overlong && ! $overlong; - - # If we're creating an overlong, it can't be longer than the - # maximum length, so skip if we're already at that length. - next if (! $initially_overlong && $overlong) - && $length >= $::max_bytes; - - my $this_cp_message_qr = $cp_message_qr; - my $this_non_cp_trailing_text = $non_cp_trailing_text; - - foreach my $malformed_allow_type (0..2) { - # 0 don't allow this malformation; ignored if no malformation - # 1 allow, with REPLACEMENT CHARACTER returned - # 2 allow, with intended code point returned. All malformations - # other than overlong can't determine the intended code point, - # so this isn't valid for them. - next if $malformed_allow_type == 2 - && ($will_overflow || $short || $unexpected_noncont); - next if $skip_most_tests && $malformed_allow_type; - - # Here we are in the innermost loop for malformations. So we - # know which ones are in effect. Can now change the input to be - # appropriately malformed. We also can set up certain other - # things now, like whether we expect a return flag from this - # malformation, and which flag. - - my $this_bytes = $bytes; - my $this_length = $length; - my $this_expected_len = $length; - my $this_needed_to_discern_len = $needed_to_discern_len; - - my @malformation_names; - my @expected_malformation_warnings; - my @expected_malformation_return_flags; - - # Contains the flags for any allowed malformations. Currently no - # combinations of on/off are tested for. It's either all are - # allowed, or none are. - my $allow_flags = 0; - my $overlong_is_in_perl_extended_utf8 = 0; - my $dont_use_overlong_cp = 0; - - if ($initially_orphan) { - next if $overlong || $short || $unexpected_noncont; - } - - if ($overlong) { - if (! $initially_overlong) { - my $new_expected_len; - - # To force this malformation, we convert the original start - # byte into a continuation byte with the same data bits as - # originally. ... - my $start_byte = substr($this_bytes, 0, 1); - my $converted_to_continuation_byte - = start_byte_to_cont($start_byte); - - # ... Then we prepend it with a known overlong sequence. - # This should evaluate to the exact same code point as the - # original. We try to avoid an overlong using Perl - # extended UTF-8. The code points are the highest - # representable as overlongs on the respective platform - # without using extended UTF-8. - if (native_to_I8($start_byte) lt "\xFC") { - $start_byte = I8_to_native("\xFC"); - $new_expected_len = 6; - } - elsif (! isASCII && native_to_I8($start_byte) lt "\xFE") { - - # FE is not extended UTF-8 on EBCDIC - $start_byte = I8_to_native("\xFE"); - $new_expected_len = 7; - } - else { # Must use extended UTF-8. On ASCII platforms, we - # could express some overlongs here starting with - # \xFE, but there's no real reason to do so. - $overlong_is_in_perl_extended_utf8 = 1; - $start_byte = I8_to_native("\xFF"); - $new_expected_len = $::max_bytes; - $this_cp_message_qr = $extended_cp_message_qr; - - # The warning that gets raised doesn't include the - # code point in the message if the code point can be - # expressed without using extended UTF-8, but the - # particular overlong sequence used is in extended - # UTF-8. To do otherwise would be confusing to the - # user, as it would claim the code point requires - # extended, when it doesn't. - $dont_use_overlong_cp = 1 - unless requires_extended_utf8($allowed_uv); - $this_non_cp_trailing_text - = $extended_non_cp_trailing_text; - } - - # Splice in the revise continuation byte, preceded by the - # start byte and the proper number of the lowest - # continuation bytes. - $this_bytes = $start_byte - . ($native_lowest_continuation_chr - x ( $new_expected_len - - 1 - - length($this_bytes))) - . $converted_to_continuation_byte - . substr($this_bytes, 1); - $this_length = length($this_bytes); - $this_needed_to_discern_len = $new_expected_len - - ( $this_expected_len - - $this_needed_to_discern_len); - $this_expected_len = $new_expected_len; - } - } - - if ($short) { - - # To force this malformation, just tell the test to not look - # as far as it should into the input. - $this_length--; - $this_expected_len--; - - $allow_flags |= $::UTF8_ALLOW_SHORT if $malformed_allow_type; - } + $test_count++; + next if $test_count % $num_test_files != $::TEST_CHUNK; + + my ($testname, $bytes, $allowed_uv, $needed_to_discern_len) = @$test; + + my $length = length $bytes; + my $initially_overlong = $testname =~ /overlong/; + my $initially_orphan = $testname =~ /orphan/; + my $will_overflow = $allowed_uv < 0; + + my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv); + my $display_bytes = display_bytes($bytes); + + my $controlling_warning_category; + my $utf8n_flag_to_warn; + my $utf8n_flag_to_disallow; + my $uvchr_flag_to_warn; + my $uvchr_flag_to_disallow; + + # We want to test that the independent flags are actually independent. + # For example, that a surrogate doesn't trigger a non-character warning, + # and conversely, turning off an above-Unicode flag doesn't suppress a + # surrogate warning. Earlier versions of this file used nested loops to + # test all possible combinations. But that creates lots of tests, making + # this run too long. What is now done instead is to use the complement of + # the category we are testing to greatly reduce the combinatorial + # explosion. For example, if we have a surrogate and we aren't expecting + # a warning about it, we set all the flags for non-surrogates to raise + # warnings. If one shows up, it indicates the flags aren't independent. + my $utf8n_flag_to_warn_complement; + my $utf8n_flag_to_disallow_complement; + my $uvchr_flag_to_warn_complement; + my $uvchr_flag_to_disallow_complement; + + # Many of the code points being tested are middling in that if code point + # edge cases work, these are very likely to as well. Because this test + # file takes a while to execute, we skip testing the edge effects of code + # points deemed middling, while testing their basics and continuing to + # fully test the non-middling code points. + my $skip_most_tests = 0; + + my $cp_message_qr; # Pattern that matches the message raised when + # that message contains the problematic code + # point. The message is the same (currently) both + # when going from/to utf8. + my $non_cp_trailing_text; # The suffix text when the message doesn't + # contain a code point. (This is a result of + # some sort of malformation that means we + # can't get an exact code poin + my $extended_cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E + \Q requires a Perl extension, and so is not\E + \Q portable\E/x; + my $extended_non_cp_trailing_text + = "is a Perl extension, and so is not portable"; + + # What bytes should have been used to specify a code point that has been + # specified as an overlong. + my $correct_bytes_for_overlong; + + # Is this test malformed from the beginning? If so, we know to generally + # expect that the tests will show it isn't valid. + my $initially_malformed = 0; + + if ($initially_overlong || $initially_orphan) { + $non_cp_trailing_text = "if you see this, there is an error"; + $cp_message_qr = qr/\Q$non_cp_trailing_text\E/; + $initially_malformed = 1; + $utf8n_flag_to_warn = 0; + $utf8n_flag_to_disallow = 0; + + $utf8n_flag_to_warn_complement = $::UTF8_WARN_SURROGATE; + $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE; + if (! $will_overflow && $allowed_uv <= 0x10FFFF) { + $utf8n_flag_to_warn_complement |= $::UTF8_WARN_SUPER; + $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_SUPER; + if (($allowed_uv & 0xFFFF) != 0xFFFF) { + $utf8n_flag_to_warn_complement |= $::UTF8_WARN_NONCHAR; + $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_NONCHAR; + } + } + if (! is_extended_utf8($bytes)) { + $utf8n_flag_to_warn_complement |= $::UTF8_WARN_PERL_EXTENDED; + $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_PERL_EXTENDED; + } - if ($unexpected_noncont) { + $controlling_warning_category = 'utf8'; - # To force this malformation, change the final continuation - # byte into a start byte. - my $pos = ($short) ? -2 : -1; - substr($this_bytes, $pos, 1) = $known_start_byte; - $this_expected_len--; - } + if ($initially_overlong) { + if (! defined $needed_to_discern_len) { + $needed_to_discern_len = overlong_discern_len($bytes); + } + $correct_bytes_for_overlong = display_bytes_no_quotes(chr $allowed_uv); + } + } + elsif($will_overflow || $allowed_uv > 0x10FFFF) { + + # Set the SUPER flags; later, we test for PERL_EXTENDED as well. + $utf8n_flag_to_warn = $::UTF8_WARN_SUPER; + $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER; + $uvchr_flag_to_warn = $::UNICODE_WARN_SUPER; + $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;; + + # Below, we add the flags for non-perl_extended to the code points + # that don't fit that category. Special tests are done for this + # category in the inner loop. + $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR + |$::UTF8_WARN_SURROGATE; + $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR + |$::UTF8_DISALLOW_SURROGATE; + $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR + |$::UNICODE_WARN_SURROGATE; + $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR + |$::UNICODE_DISALLOW_SURROGATE; + $controlling_warning_category = 'non_unicode'; + + if ($will_overflow) { # This is realy a malformation + $non_cp_trailing_text = "if you see this, there is an error"; + $cp_message_qr = qr/\Q$non_cp_trailing_text\E/; + $initially_malformed = 1; + if (! defined $needed_to_discern_len) { + $needed_to_discern_len = overflow_discern_len($length); + } + } + elsif (requires_extended_utf8($allowed_uv)) { + $cp_message_qr = $extended_cp_message_qr; + $non_cp_trailing_text = $extended_non_cp_trailing_text; + $needed_to_discern_len = 1 unless defined $needed_to_discern_len; + } + else { + $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E + \Q may not be portable\E/x; + $non_cp_trailing_text = "is for a non-Unicode code point, may not" + . " be portable"; + $utf8n_flag_to_warn_complement |= $::UTF8_WARN_PERL_EXTENDED; + $utf8n_flag_to_disallow_complement + |= $::UTF8_DISALLOW_PERL_EXTENDED; + $uvchr_flag_to_warn_complement |= $::UNICODE_WARN_PERL_EXTENDED; + $uvchr_flag_to_disallow_complement + |= $::UNICODE_DISALLOW_PERL_EXTENDED; + } + } + elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) { + $cp_message_qr = qr/UTF-16 surrogate U\+$uv_string/; + $non_cp_trailing_text = "is for a surrogate"; + $needed_to_discern_len = 2 unless defined $needed_to_discern_len; + $skip_most_tests = 1 if $allowed_uv > 0xD800 && $allowed_uv < 0xDFFF; + + $utf8n_flag_to_warn = $::UTF8_WARN_SURROGATE; + $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SURROGATE; + $uvchr_flag_to_warn = $::UNICODE_WARN_SURROGATE; + $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SURROGATE;; + + $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR + |$::UTF8_WARN_SUPER + |$::UTF8_WARN_PERL_EXTENDED; + $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR + |$::UTF8_DISALLOW_SUPER + |$::UTF8_DISALLOW_PERL_EXTENDED; + $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR + |$::UNICODE_WARN_SUPER + |$::UNICODE_WARN_PERL_EXTENDED; + $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR + |$::UNICODE_DISALLOW_SUPER + |$::UNICODE_DISALLOW_PERL_EXTENDED; + $controlling_warning_category = 'surrogate'; + } + elsif ( ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF) + || ($allowed_uv & 0xFFFE) == 0xFFFE) + { + $cp_message_qr = qr/\QUnicode non-character U+$uv_string\E + \Q is not recommended for open interchange\E/x; + $non_cp_trailing_text = "if you see this, there is an error"; + $needed_to_discern_len = $length unless defined $needed_to_discern_len; + if ( ($allowed_uv > 0xFDD0 && $allowed_uv < 0xFDEF) + || ($allowed_uv > 0xFFFF && $allowed_uv < 0x10FFFE)) + { + $skip_most_tests = 1; + } - # The whole point of a test that is malformed from the beginning - # is to test for that malformation. If we've modified things so - # much that we don't have enough information to detect that - # malformation, there's no point in testing. - next if $initially_malformed - && $this_expected_len < $this_needed_to_discern_len; - - # Here, we've transformed the input with all of the desired - # non-overflow malformations. We are now in a position to - # construct any potential warnings for those malformations. But - # it's a pain to get the detailed messages exactly right, so for - # now XXX, only do so for those that return an explicit code - # point. - - if ($initially_orphan) { - push @malformation_names, "orphan continuation"; - push @expected_malformation_return_flags, - $::UTF8_GOT_CONTINUATION; - $allow_flags |= $::UTF8_ALLOW_CONTINUATION - if $malformed_allow_type; - push @expected_malformation_warnings, qr/unexpected continuation/; - } + $utf8n_flag_to_warn = $::UTF8_WARN_NONCHAR; + $utf8n_flag_to_disallow = $::UTF8_DISALLOW_NONCHAR; + $uvchr_flag_to_warn = $::UNICODE_WARN_NONCHAR; + $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_NONCHAR;; + + $utf8n_flag_to_warn_complement = $::UTF8_WARN_SURROGATE + |$::UTF8_WARN_SUPER + |$::UTF8_WARN_PERL_EXTENDED; + $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE + |$::UTF8_DISALLOW_SUPER + |$::UTF8_DISALLOW_PERL_EXTENDED; + $uvchr_flag_to_warn_complement = $::UNICODE_WARN_SURROGATE + |$::UNICODE_WARN_SUPER + |$::UNICODE_WARN_PERL_EXTENDED; + $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_SURROGATE + |$::UNICODE_DISALLOW_SUPER + |$::UNICODE_DISALLOW_PERL_EXTENDED; + + $controlling_warning_category = 'nonchar'; + } + else { + die "Can't figure out what type of warning to test for $testname" + } + + die 'Didn\'t set $needed_to_discern_len for ' . $testname + unless defined $needed_to_discern_len; + + # We try various combinations of malformations that can occur + foreach my $short (0, 1) { + next if $skip_most_tests && $short; + foreach my $unexpected_noncont (0, 1) { + next if $skip_most_tests && $unexpected_noncont; + foreach my $overlong (0, 1) { + next if $overlong && $skip_most_tests; + next if $initially_overlong && ! $overlong; + + # If we're creating an overlong, it can't be longer than the + # maximum length, so skip if we're already at that length. + next if (! $initially_overlong && $overlong) + && $length >= $::max_bytes; + + my $this_cp_message_qr = $cp_message_qr; + my $this_non_cp_trailing_text = $non_cp_trailing_text; + + foreach my $malformed_allow_type (0..2) { + # 0 don't allow this malformation; ignored if no malformation + # 1 allow, with REPLACEMENT CHARACTER returned + # 2 allow, with intended code point returned. All malformations + # other than overlong can't determine the intended code point, + # so this isn't valid for them. + next if $malformed_allow_type == 2 + && ($will_overflow || $short || $unexpected_noncont); + next if $skip_most_tests && $malformed_allow_type; + + # Here we are in the innermost loop for malformations. So we + # know which ones are in effect. Can now change the input to be + # appropriately malformed. We also can set up certain other + # things now, like whether we expect a return flag from this + # malformation, and which flag. + + my $this_bytes = $bytes; + my $this_length = $length; + my $this_expected_len = $length; + my $this_needed_to_discern_len = $needed_to_discern_len; + + my @malformation_names; + my @expected_malformation_warnings; + my @expected_malformation_return_flags; + + # Contains the flags for any allowed malformations. Currently no + # combinations of on/off are tested for. It's either all are + # allowed, or none are. + my $allow_flags = 0; + my $overlong_is_in_perl_extended_utf8 = 0; + my $dont_use_overlong_cp = 0; + + if ($initially_orphan) { + next if $overlong || $short || $unexpected_noncont; + } - if ($overlong) { - push @malformation_names, 'overlong'; - push @expected_malformation_return_flags, $::UTF8_GOT_LONG; + if ($overlong) { + if (! $initially_overlong) { + my $new_expected_len; + + # To force this malformation, we convert the original start + # byte into a continuation byte with the same data bits as + # originally. ... + my $start_byte = substr($this_bytes, 0, 1); + my $converted_to_continuation_byte + = start_byte_to_cont($start_byte); + + # ... Then we prepend it with a known overlong sequence. + # This should evaluate to the exact same code point as the + # original. We try to avoid an overlong using Perl + # extended UTF-8. The code points are the highest + # representable as overlongs on the respective platform + # without using extended UTF-8. + if (native_to_I8($start_byte) lt "\xFC") { + $start_byte = I8_to_native("\xFC"); + $new_expected_len = 6; + } + elsif (! isASCII && native_to_I8($start_byte) lt "\xFE") { + + # FE is not extended UTF-8 on EBCDIC + $start_byte = I8_to_native("\xFE"); + $new_expected_len = 7; + } + else { # Must use extended UTF-8. On ASCII platforms, we + # could express some overlongs here starting with + # \xFE, but there's no real reason to do so. + $overlong_is_in_perl_extended_utf8 = 1; + $start_byte = I8_to_native("\xFF"); + $new_expected_len = $::max_bytes; + $this_cp_message_qr = $extended_cp_message_qr; + + # The warning that gets raised doesn't include the + # code point in the message if the code point can be + # expressed without using extended UTF-8, but the + # particular overlong sequence used is in extended + # UTF-8. To do otherwise would be confusing to the + # user, as it would claim the code point requires + # extended, when it doesn't. + $dont_use_overlong_cp = 1 + unless requires_extended_utf8($allowed_uv); + $this_non_cp_trailing_text + = $extended_non_cp_trailing_text; + } + + # Splice in the revise continuation byte, preceded by the + # start byte and the proper number of the lowest + # continuation bytes. + $this_bytes = $start_byte + . ($native_lowest_continuation_chr + x ( $new_expected_len + - 1 + - length($this_bytes))) + . $converted_to_continuation_byte + . substr($this_bytes, 1); + $this_length = length($this_bytes); + $this_needed_to_discern_len = $new_expected_len + - ( $this_expected_len + - $this_needed_to_discern_len); + $this_expected_len = $new_expected_len; + } + } - # If one of the other malformation types is also in effect, we - # don't know what the intended code point was. - if ($short || $unexpected_noncont || $will_overflow) { - push @expected_malformation_warnings, qr/overlong/; - } - else { - my $wrong_bytes = display_bytes_no_quotes( - substr($this_bytes, 0, $this_length)); - if (! defined $correct_bytes_for_overlong) { - $correct_bytes_for_overlong - = display_bytes_no_quotes($bytes); - } - my $prefix = ( $allowed_uv > 0x10FFFF - || ! isASCII && $allowed_uv < 256) - ? "0x" - : "U+"; - push @expected_malformation_warnings, - qr/\QMalformed UTF-8 character: $wrong_bytes\E - \Q (overlong; instead use\E - \Q $correct_bytes_for_overlong to\E - \Q represent $prefix$uv_string)/x; - } + if ($short) { - if ($malformed_allow_type == 2) { - $allow_flags |= $::UTF8_ALLOW_LONG_AND_ITS_VALUE; - } - elsif ($malformed_allow_type) { - $allow_flags |= $::UTF8_ALLOW_LONG; - } - } - if ($short) { - push @malformation_names, 'short'; - push @expected_malformation_return_flags, $::UTF8_GOT_SHORT; - push @expected_malformation_warnings, qr/too short/; - } - if ($unexpected_noncont) { - push @malformation_names, 'unexpected non-continuation'; - push @expected_malformation_return_flags, - $::UTF8_GOT_NON_CONTINUATION; - $allow_flags |= $::UTF8_ALLOW_NON_CONTINUATION - if $malformed_allow_type; - push @expected_malformation_warnings, - qr/unexpected non-continuation byte/; - } + # To force this malformation, just tell the test to not look + # as far as it should into the input. + $this_length--; + $this_expected_len--; - # The overflow malformation is done differently than other - # malformations. It comes from manually typed tests in the test - # array. We now make it be treated like one of the other - # malformations. But some has to be deferred until the inner loop - my $overflow_msg_pattern; - if ($will_overflow) { - push @malformation_names, 'overflow'; + $allow_flags |= $::UTF8_ALLOW_SHORT if $malformed_allow_type; + } - $overflow_msg_pattern = display_bytes_no_quotes( - substr($this_bytes, 0, $this_expected_len)); - $overflow_msg_pattern = qr/\QMalformed UTF-8 character:\E - \Q $overflow_msg_pattern\E - \Q (overflows)\E/x; - push @expected_malformation_return_flags, $::UTF8_GOT_OVERFLOW; - $allow_flags |= $::UTF8_ALLOW_OVERFLOW if $malformed_allow_type; - } + if ($unexpected_noncont) { - # And we can create the malformation-related text for the the test - # names we eventually will generate. - my $malformations_name = ""; - if (@malformation_names) { - $malformations_name .= "dis" unless $malformed_allow_type; - $malformations_name .= "allowed "; - $malformations_name .= "malformation"; - $malformations_name .= "s" if @malformation_names > 1; - $malformations_name .= ": "; - $malformations_name .= join "/", @malformation_names; - $malformations_name = " ($malformations_name)"; - } + # To force this malformation, change the final continuation + # byte into a start byte. + my $pos = ($short) ? -2 : -1; + substr($this_bytes, $pos, 1) = $known_start_byte; + $this_expected_len--; + } - # Done setting up the malformation related stuff + # The whole point of a test that is malformed from the beginning + # is to test for that malformation. If we've modified things so + # much that we don't have enough information to detect that + # malformation, there's no point in testing. + next if $initially_malformed + && $this_expected_len < $this_needed_to_discern_len; + + # Here, we've transformed the input with all of the desired + # non-overflow malformations. We are now in a position to + # construct any potential warnings for those malformations. But + # it's a pain to get the detailed messages exactly right, so for + # now XXX, only do so for those that return an explicit code + # point. + + if ($initially_orphan) { + push @malformation_names, "orphan continuation"; + push @expected_malformation_return_flags, + $::UTF8_GOT_CONTINUATION; + $allow_flags |= $::UTF8_ALLOW_CONTINUATION + if $malformed_allow_type; + push @expected_malformation_warnings, qr/unexpected continuation/; + } - { # First test the isFOO calls - use warnings; # XXX no warnings 'deprecated'; # Make sure these don't raise warnings - undef @warnings_gotten; + if ($overlong) { + push @malformation_names, 'overlong'; + push @expected_malformation_return_flags, $::UTF8_GOT_LONG; - my $ret = test_isUTF8_CHAR($this_bytes, $this_length); - my $ret_flags - = test_isUTF8_CHAR_flags($this_bytes, $this_length, 0); - if ($malformations_name) { - is($ret, 0, "For $testname$malformations_name: isUTF8_CHAR() returns 0"); - is($ret_flags, 0, " And isUTF8_CHAR_flags() returns 0"); - } - else { - is($ret, $this_length, "For $testname: isUTF8_CHAR() returns" - . " expected length: $this_length"); - is($ret_flags, $this_length, - " And isUTF8_CHAR_flags(...,0) returns expected" - . " length: $this_length"); - } - is(scalar @warnings_gotten, 0, - " And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags" - . " generated any warnings") - or output_warnings(@warnings_gotten); - - undef @warnings_gotten; - $ret = test_isSTRICT_UTF8_CHAR($this_bytes, $this_length); - if ($malformations_name) { - is($ret, 0, " And isSTRICT_UTF8_CHAR() returns 0"); - } - else { - my $expected_ret - = ( $testname =~ /surrogate|non-character/ - || $allowed_uv > 0x10FFFF) - ? 0 - : $this_length; - is($ret, $expected_ret, - " And isSTRICT_UTF8_CHAR() returns expected" - . " length: $expected_ret"); - $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length, - $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE); - is($ret, $expected_ret, - " And isUTF8_CHAR_flags('" - . "DISALLOW_ILLEGAL_INTERCHANGE') acts like" - . " isSTRICT_UTF8_CHAR"); - } - is(scalar @warnings_gotten, 0, - " And neither isSTRICT_UTF8_CHAR() nor" - . " isUTF8_CHAR_flags generated any warnings") - or output_warnings(@warnings_gotten); - - undef @warnings_gotten; - $ret = test_isC9_STRICT_UTF8_CHAR($this_bytes, $this_length); - if ($malformations_name) { - is($ret, 0, " And isC9_STRICT_UTF8_CHAR() returns 0"); - } - else { - my $expected_ret = ( $testname =~ /surrogate/ - || $allowed_uv > 0x10FFFF) - ? 0 - : $this_expected_len; - is($ret, $expected_ret, " And isC9_STRICT_UTF8_CHAR()" - . " returns expected length:" - . " $expected_ret"); - $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length, - $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE); - is($ret, $expected_ret, - " And isUTF8_CHAR_flags('" - . "DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like" - . " isC9_STRICT_UTF8_CHAR"); - } - is(scalar @warnings_gotten, 0, - " And neither isC9_STRICT_UTF8_CHAR() nor" - . " isUTF8_CHAR_flags generated any warnings") - or output_warnings(@warnings_gotten); + # If one of the other malformation types is also in effect, we + # don't know what the intended code point was. + if ($short || $unexpected_noncont || $will_overflow) { + push @expected_malformation_warnings, qr/overlong/; + } + else { + my $wrong_bytes = display_bytes_no_quotes( + substr($this_bytes, 0, $this_length)); + if (! defined $correct_bytes_for_overlong) { + $correct_bytes_for_overlong + = display_bytes_no_quotes($bytes); + } + my $prefix = ( $allowed_uv > 0x10FFFF + || ! isASCII && $allowed_uv < 256) + ? "0x" + : "U+"; + push @expected_malformation_warnings, + qr/\QMalformed UTF-8 character: $wrong_bytes\E + \Q (overlong; instead use\E + \Q $correct_bytes_for_overlong to\E + \Q represent $prefix$uv_string)/x; + } - foreach my $disallow_type (0..2) { - # 0 is don't disallow this type of code point - # 1 is do disallow - # 2 is do disallow, but only code points requiring - # perl-extended-UTF8 + if ($malformed_allow_type == 2) { + $allow_flags |= $::UTF8_ALLOW_LONG_AND_ITS_VALUE; + } + elsif ($malformed_allow_type) { + $allow_flags |= $::UTF8_ALLOW_LONG; + } + } + if ($short) { + push @malformation_names, 'short'; + push @expected_malformation_return_flags, $::UTF8_GOT_SHORT; + push @expected_malformation_warnings, qr/too short/; + } + if ($unexpected_noncont) { + push @malformation_names, 'unexpected non-continuation'; + push @expected_malformation_return_flags, + $::UTF8_GOT_NON_CONTINUATION; + $allow_flags |= $::UTF8_ALLOW_NON_CONTINUATION + if $malformed_allow_type; + push @expected_malformation_warnings, + qr/unexpected non-continuation byte/; + } - my $disallow_flags; - my $expected_ret; + # The overflow malformation is done differently than other + # malformations. It comes from manually typed tests in the test + # array. We now make it be treated like one of the other + # malformations. But some has to be deferred until the inner loop + my $overflow_msg_pattern; + if ($will_overflow) { + push @malformation_names, 'overflow'; + + $overflow_msg_pattern = display_bytes_no_quotes( + substr($this_bytes, 0, $this_expected_len)); + $overflow_msg_pattern = qr/\QMalformed UTF-8 character:\E + \Q $overflow_msg_pattern\E + \Q (overflows)\E/x; + push @expected_malformation_return_flags, $::UTF8_GOT_OVERFLOW; + $allow_flags |= $::UTF8_ALLOW_OVERFLOW if $malformed_allow_type; + } - if ($malformations_name) { + # And we can create the malformation-related text for the the test + # names we eventually will generate. + my $malformations_name = ""; + if (@malformation_names) { + $malformations_name .= "dis" unless $malformed_allow_type; + $malformations_name .= "allowed "; + $malformations_name .= "malformation"; + $malformations_name .= "s" if @malformation_names > 1; + $malformations_name .= ": "; + $malformations_name .= join "/", @malformation_names; + $malformations_name = " ($malformations_name)"; + } - # Malformations are by default disallowed, so testing - # with $disallow_type equal to 0 is sufficicient. - next if $disallow_type; + # Done setting up the malformation related stuff - $disallow_flags = 0; - $expected_ret = 0; - } - elsif ($disallow_type == 1) { - $disallow_flags = $utf8n_flag_to_disallow; - $expected_ret = 0; - } - elsif ($disallow_type == 2) { - next if ! requires_extended_utf8($allowed_uv); - $disallow_flags = $::UTF8_DISALLOW_PERL_EXTENDED; - $expected_ret = 0; - } - else { # type is 0 - $disallow_flags = $utf8n_flag_to_disallow_complement; - $expected_ret = $this_length; - } + { # First test the isFOO calls + use warnings; # XXX no warnings 'deprecated'; # Make sure these don't raise warnings + undef @warnings_gotten; - $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length, - $disallow_flags); - is($ret, $expected_ret, - " And isUTF8_CHAR_flags($display_bytes," - . " $disallow_flags) returns $expected_ret") + my $ret = test_isUTF8_CHAR($this_bytes, $this_length); + my $ret_flags + = test_isUTF8_CHAR_flags($this_bytes, $this_length, 0); + if ($malformations_name) { + is($ret, 0, "For $testname$malformations_name: isUTF8_CHAR() returns 0"); + is($ret_flags, 0, " And isUTF8_CHAR_flags() returns 0"); + } + else { + is($ret, $this_length, "For $testname: isUTF8_CHAR() returns" + . " expected length: $this_length"); + is($ret_flags, $this_length, + " And isUTF8_CHAR_flags(...,0) returns expected" + . " length: $this_length"); + } + is(scalar @warnings_gotten, 0, + " And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags" + . " generated any warnings") + or output_warnings(@warnings_gotten); + + undef @warnings_gotten; + $ret = test_isSTRICT_UTF8_CHAR($this_bytes, $this_length); + if ($malformations_name) { + is($ret, 0, " And isSTRICT_UTF8_CHAR() returns 0"); + } + else { + my $expected_ret + = ( $testname =~ /surrogate|non-character/ + || $allowed_uv > 0x10FFFF) + ? 0 + : $this_length; + is($ret, $expected_ret, + " And isSTRICT_UTF8_CHAR() returns expected" + . " length: $expected_ret"); + $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length, + $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE); + is($ret, $expected_ret, + " And isUTF8_CHAR_flags('" + . "DISALLOW_ILLEGAL_INTERCHANGE') acts like" + . " isSTRICT_UTF8_CHAR"); + } + is(scalar @warnings_gotten, 0, + " And neither isSTRICT_UTF8_CHAR() nor" + . " isUTF8_CHAR_flags generated any warnings") + or output_warnings(@warnings_gotten); + + undef @warnings_gotten; + $ret = test_isC9_STRICT_UTF8_CHAR($this_bytes, $this_length); + if ($malformations_name) { + is($ret, 0, " And isC9_STRICT_UTF8_CHAR() returns 0"); + } + else { + my $expected_ret = ( $testname =~ /surrogate/ + || $allowed_uv > 0x10FFFF) + ? 0 + : $this_expected_len; + is($ret, $expected_ret, " And isC9_STRICT_UTF8_CHAR()" + . " returns expected length:" + . " $expected_ret"); + $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length, + $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE); + is($ret, $expected_ret, + " And isUTF8_CHAR_flags('" + . "DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like" + . " isC9_STRICT_UTF8_CHAR"); + } + is(scalar @warnings_gotten, 0, + " And neither isC9_STRICT_UTF8_CHAR() nor" + . " isUTF8_CHAR_flags generated any warnings") + or output_warnings(@warnings_gotten); + + foreach my $disallow_type (0..2) { + # 0 is don't disallow this type of code point + # 1 is do disallow + # 2 is do disallow, but only code points requiring + # perl-extended-UTF8 + + my $disallow_flags; + my $expected_ret; + + if ($malformations_name) { + + # Malformations are by default disallowed, so testing + # with $disallow_type equal to 0 is sufficicient. + next if $disallow_type; + + $disallow_flags = 0; + $expected_ret = 0; + } + elsif ($disallow_type == 1) { + $disallow_flags = $utf8n_flag_to_disallow; + $expected_ret = 0; + } + elsif ($disallow_type == 2) { + next if ! requires_extended_utf8($allowed_uv); + $disallow_flags = $::UTF8_DISALLOW_PERL_EXTENDED; + $expected_ret = 0; + } + else { # type is 0 + $disallow_flags = $utf8n_flag_to_disallow_complement; + $expected_ret = $this_length; + } + + $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length, + $disallow_flags); + is($ret, $expected_ret, + " And isUTF8_CHAR_flags($display_bytes," + . " $disallow_flags) returns $expected_ret") + or diag "The flags mean " + . flags_to_text($disallow_flags, + \@utf8n_flags_to_text); + is(scalar @warnings_gotten, 0, + " And isUTF8_CHAR_flags(...) generated" + . " no warnings") + or output_warnings(@warnings_gotten); + + # Test partial character handling, for each byte not a + # full character + my $did_test_partial = 0; + for (my $j = 1; $j < $this_length - 1; $j++) { + $did_test_partial = 1; + my $partial = substr($this_bytes, 0, $j); + my $ret_should_be; + my $comment; + if ($disallow_type || $malformations_name) { + $ret_should_be = 0; + $comment = "disallowed"; + + # The number of bytes required to tell if a + # sequence has something wrong is the smallest of + # all the things wrong with it. We start with the + # number for this type of code point, if that is + # disallowed; or the whole length if not. The + # latter is what a couple of the malformations + # require. + my $needed_to_tell = ($disallow_type) + ? $this_needed_to_discern_len + : $this_expected_len; + + # Then we see if the malformations that are + # detectable early in the string are present. + if ($overlong) { + my $dl = overlong_discern_len($this_bytes); + $needed_to_tell = $dl if $dl < $needed_to_tell; + } + if ($will_overflow) { + my $dl = overflow_discern_len($length); + $needed_to_tell = $dl if $dl < $needed_to_tell; + } + + if ($j < $needed_to_tell) { + $ret_should_be = 1; + $comment .= ", but need $needed_to_tell" + . " bytes to discern:"; + } + } + else { + $ret_should_be = 1; + $comment = "allowed"; + } + + undef @warnings_gotten; + + $ret = test_is_utf8_valid_partial_char_flags($partial, + $j, $disallow_flags); + is($ret, $ret_should_be, + " And is_utf8_valid_partial_char_flags(" + . display_bytes($partial) + . ", $disallow_flags), $comment: returns" + . " $ret_should_be") or diag "The flags mean " - . flags_to_text($disallow_flags, - \@utf8n_flags_to_text); - is(scalar @warnings_gotten, 0, - " And isUTF8_CHAR_flags(...) generated" - . " no warnings") - or output_warnings(@warnings_gotten); - - # Test partial character handling, for each byte not a - # full character - my $did_test_partial = 0; - for (my $j = 1; $j < $this_length - 1; $j++) { - $did_test_partial = 1; - my $partial = substr($this_bytes, 0, $j); - my $ret_should_be; - my $comment; - if ($disallow_type || $malformations_name) { - $ret_should_be = 0; - $comment = "disallowed"; - - # The number of bytes required to tell if a - # sequence has something wrong is the smallest of - # all the things wrong with it. We start with the - # number for this type of code point, if that is - # disallowed; or the whole length if not. The - # latter is what a couple of the malformations - # require. - my $needed_to_tell = ($disallow_type) - ? $this_needed_to_discern_len - : $this_expected_len; - - # Then we see if the malformations that are - # detectable early in the string are present. - if ($overlong) { - my $dl = overlong_discern_len($this_bytes); - $needed_to_tell = $dl if $dl < $needed_to_tell; - } - if ($will_overflow) { - my $dl = overflow_discern_len($length); - $needed_to_tell = $dl if $dl < $needed_to_tell; - } - - if ($j < $needed_to_tell) { - $ret_should_be = 1; - $comment .= ", but need $needed_to_tell" - . " bytes to discern:"; - } - } - else { - $ret_should_be = 1; - $comment = "allowed"; - } + . flags_to_text($disallow_flags, \@utf8n_flags_to_text); + } + + if ($did_test_partial) { + is(scalar @warnings_gotten, 0, + " And is_utf8_valid_partial_char_flags()" + . " generated no warnings for any of the lengths") + or output_warnings(@warnings_gotten); + } + } + } - undef @warnings_gotten; - - $ret = test_is_utf8_valid_partial_char_flags($partial, - $j, $disallow_flags); - is($ret, $ret_should_be, - " And is_utf8_valid_partial_char_flags(" - . display_bytes($partial) - . ", $disallow_flags), $comment: returns" - . " $ret_should_be") - or diag "The flags mean " - . flags_to_text($disallow_flags, \@utf8n_flags_to_text); - } + # Now test the to/from UTF-8 calls. There are several orthogonal + # variables involved. We test most possible combinations - if ($did_test_partial) { - is(scalar @warnings_gotten, 0, - " And is_utf8_valid_partial_char_flags()" - . " generated no warnings for any of the lengths") - or output_warnings(@warnings_gotten); - } - } + foreach my $do_disallow (0, 1) { + if ($do_disallow) { + next if $initially_overlong || $initially_orphan; } - - # Now test the to/from UTF-8 calls. There are several orthogonal - # variables involved. We test most possible combinations - - foreach my $do_disallow (0, 1) { - if ($do_disallow) { - next if $initially_overlong || $initially_orphan; - } - else { - next if $skip_most_tests; + else { + next if $skip_most_tests; } + # This tests three functions. utf8n_to_uvchr_error, + # utf8n_to_uvchr_msgs, and uvchr_to_utf8_flags. But only the + # first two are variants of each other. We use a loop + # 'which_func' to determine which of these. uvchr_to_utf8_flags + # is done separately at the end of each iteration, only when + # which_func is 0. which_func is numeric in part so we don't + # have to type in the function name and risk misspelling it + # somewhere, and also it sets whether we are expecting warnings + # or not in certain places. The _msgs() version of the function + # expects warnings even if lexical ones are turned off, so by + # making its which_func == 1, we can say we want warnings; + # whereas the other one with the value 0, doesn't get them. + for my $which_func (0, 1) { + my $func = ($which_func) + ? 'utf8n_to_uvchr_msgs' + : 'utf8n_to_uvchr_error'; + # We classify the warnings into certain "interesting" types, # described later foreach my $warning_type (0..4) { @@ -1356,6 +1376,12 @@ foreach my $test (@tests) { foreach my $use_warn_flag (0, 1) { if ($use_warn_flag) { next if $initially_overlong || $initially_orphan; + + # Since utf8n_to_uvchr_msgs() expects warnings even + # when lexical ones are turned off, we can skip + # testing it when they are turned on, with little + # likelihood of missing an error case. + next if $which_func; } else { next if $skip_most_tests; @@ -1390,9 +1416,9 @@ foreach my $test (@tests) { } elsif ($warning_type == 1) { $eval_warn = "no warnings"; - $expect_regular_warnings = 0; - $expect_warnings_for_overflow = 0; - $expect_warnings_for_malformed = 0; + $expect_regular_warnings = $which_func; + $expect_warnings_for_overflow = $which_func; + $expect_warnings_for_malformed = $which_func; } elsif ($warning_type == 2) { $eval_warn = "no warnings; use warnings 'utf8'"; @@ -1407,7 +1433,7 @@ foreach my $test (@tests) { $expect_regular_warnings = $use_warn_flag; $expect_warnings_for_overflow = $controlling_warning_category eq 'non_unicode'; - $expect_warnings_for_malformed = 0; + $expect_warnings_for_malformed = $which_func; } elsif ($warning_type == 4) { # Like type 3, but uses the # PERL_EXTENDED flags @@ -1567,7 +1593,8 @@ foreach my $test (@tests) { } } - my $this_name = "utf8n_to_uvchr_error() $testname: "; + my $this_name = "$func() $testname: "; + my @scratch_expected_return_flags = @expected_return_flags; if (! $initially_malformed) { $this_name .= ($disallowed) ? 'disallowed, ' @@ -1586,7 +1613,7 @@ foreach my $test (@tests) { my $this_flags = $allow_flags|$this_warning_flags|$this_disallow_flags; my $eval_text = "$eval_warn; \$ret_ref" - . " = test_utf8n_to_uvchr_error(" + . " = test_$func(" . "'$this_bytes', $this_length, $this_flags)"; eval "$eval_text"; if (! ok ("$@ eq ''", "$this_name: eval succeeded")) @@ -1595,6 +1622,7 @@ foreach my $test (@tests) { . utf8n_display_call($eval_text); next; } + if ($disallowed) { is($ret_ref->[0], 0, " And returns 0") or diag "Call was: " . utf8n_display_call($eval_text); @@ -1612,47 +1640,95 @@ foreach my $test (@tests) { my $returned_flags = $ret_ref->[2]; - for (my $i = @expected_return_flags - 1; $i >= 0; $i--) { - if ($expected_return_flags[$i] & $returned_flags) { - if ($expected_return_flags[$i] - == $::UTF8_GOT_PERL_EXTENDED) - { - pass(" Expected and got return flag for" - . " PERL_EXTENDED"); - } - # The first entries in this are - # malformations - elsif ($i > @malformation_names - 1) { - pass(" Expected and got return flag" - . " for " . $controlling_warning_category); - } - else { - pass(" Expected and got return flag for " - . $malformation_names[$i] - . " malformation"); - } - $returned_flags &= ~$expected_return_flags[$i]; - splice @expected_return_flags, $i, 1; - } + for (my $i = @scratch_expected_return_flags - 1; + $i >= 0; + $i--) + { + if ($scratch_expected_return_flags[$i] & $returned_flags) + { + if ($scratch_expected_return_flags[$i] + == $::UTF8_GOT_PERL_EXTENDED) + { + pass(" Expected and got return flag for" + . " PERL_EXTENDED"); + } + # The first entries in this are + # malformations + elsif ($i > @malformation_names - 1) { + pass(" Expected and got return flag" + . " for " . $controlling_warning_category); + } + else { + pass(" Expected and got return flag for " + . $malformation_names[$i] + . " malformation"); + } + $returned_flags + &= ~$scratch_expected_return_flags[$i]; + splice @scratch_expected_return_flags, $i, 1; + } } - is($returned_flags, 0, - " Got no unexpected return flags") - or diag "The unexpected flags gotten were: " + if (! is($returned_flags, 0, + " Got no unexpected return flags")) + { + diag "The unexpected flags gotten were: " . (flags_to_text($returned_flags, \@utf8n_flags_to_text) # We strip off any prefixes from the flag # names =~ s/ \b [A-Z] _ //xgr); - is (scalar @expected_return_flags, 0, - " Got all expected return flags") - or diag "The expected flags not gotten were: " + diag "Call was: " . utf8n_display_call($eval_text); + } + + if (! is (scalar @scratch_expected_return_flags, 0, + " Got all expected return flags")) + { + diag "The expected flags not gotten were: " . (flags_to_text(eval join("|", - @expected_return_flags), + @scratch_expected_return_flags), \@utf8n_flags_to_text) # We strip off any prefixes from the flag # names =~ s/ \b [A-Z] _ //xgr); + diag "Call was: " . utf8n_display_call($eval_text); + } + + if ($which_func) { + my @returned_warnings; + for my $element_ref (@{$ret_ref->[3]}) { + push @returned_warnings, $element_ref->{'text'}; + my $text = $element_ref->{'text'}; + my $flag = $element_ref->{'flag_bit'}; + my $category = $element_ref->{'warning_category'}; + + if (! ok(($flag & ($flag-1)) == 0, + "flag for returned msg is a single bit")) + { + diag sprintf("flags are %x; msg=%s", $flag, $text); + } + else { + if (grep { $_ == $flag } @expected_return_flags) { + pass("flag for returned msg is expected"); + } + else { + fail("flag for returned msg is expected: " + . flags_to_text($flag, \@utf8n_flags_to_text)); + } + } + + # In perl space, don't know the category numbers + isnt($category, 0, + "returned category for msg isn't 0"); + } + + ok(@warnings_gotten == 0, "$func raised no warnings;" + . " the next tests are for ones in the returned" + . " variable") + or diag join "\n", "The unexpected warnings were:", + @warnings_gotten; + @warnings_gotten = @returned_warnings; + } do_warnings_test(@expected_warnings) or diag "Call was: " . utf8n_display_call($eval_text); @@ -1660,11 +1736,15 @@ foreach my $test (@tests) { # Check CHECK_ONLY results when the input is # disallowed. Do this when actually disallowed, - # not just when the $this_disallow_flags is set - if ($disallowed) { + # not just when the $this_disallow_flags is set. We only + # test once utf8n_to_uvchr_msgs() with this. + if ( $disallowed + && ($which_func == 0 || ! $tested_CHECK_ONLY)) + { + $tested_CHECK_ONLY = 1; my $this_flags = $this_disallow_flags|$::UTF8_CHECK_ONLY; my $eval_text = "use warnings; \$ret_ref =" - . " test_utf8n_to_uvchr_error('" + . " test_$func('" . "$this_bytes', $this_length," . " $this_flags)"; eval $eval_text; @@ -1693,6 +1773,7 @@ foreach my $test (@tests) { # existing code point, it hasn't overflowed, and isn't # malformed. next if @malformation_names; + next if $which_func; $this_warning_flags = ($use_warn_flag) ? $this_uvchr_flag_to_warn @@ -1749,6 +1830,7 @@ foreach my $test (@tests) { } } } + } } done_testing; diff --git a/proto.h b/proto.h index 911b96156c..eadfc976db 100644 --- a/proto.h +++ b/proto.h @@ -3666,6 +3666,9 @@ PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *r PERL_CALLCONV UV Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags, U32 * errors); #define PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR \ assert(s) +PERL_CALLCONV UV Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags, U32 * errors, AV ** msgs); +#define PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS \ + assert(s) PERL_CALLCONV UV Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags); #define PERL_ARGS_ASSERT_UTF8N_TO_UVUNI \ assert(s) diff --git a/utf8.c b/utf8.c index 3123bd0182..34e47f3389 100644 --- a/utf8.c +++ b/utf8.c @@ -1167,7 +1167,8 @@ THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. Most code should use L</utf8_to_uvchr_buf>() rather than call this directly. This function is for code that needs to know what the precise malformation(s) -are when an error is found. +are when an error is found. If you also need to know the generated warning +messages, use L</utf8n_to_uvchr_msgs>() instead. It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after all the others, C<errors>. If this parameter is 0, this function behaves @@ -1272,14 +1273,81 @@ To do your own error handling, call this function with the C<UTF8_CHECK_ONLY> flag to suppress any warnings, and then examine the C<*errors> return. =cut + +Also implemented as a macro in utf8.h */ UV Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, - STRLEN curlen, - STRLEN *retlen, - const U32 flags, - U32 * errors) + STRLEN curlen, + STRLEN *retlen, + const U32 flags, + U32 * errors) +{ + PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR; + + return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL); +} + +/* + +=for apidoc utf8n_to_uvchr_msgs + +THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. +Most code should use L</utf8_to_uvchr_buf>() rather than call this directly. + +This function is for code that needs to know what the precise malformation(s) +are when an error is found, and wants the corresponding warning and/or error +messages to be returned to the caller rather than be displayed. All messages +that would have been displayed if all lexcial warnings are enabled will be +returned. + +It is just like C<L</utf8n_to_uvchr_error>> but it takes an extra parameter +placed after all the others, C<msgs>. If this parameter is 0, this function +behaves identically to C<L</utf8n_to_uvchr_error>>. Otherwise, C<msgs> should +be a pointer to an C<AV *> variable, in which this function creates a new AV to +contain any appropriate messages. The elements of the array are ordered so +that the first message that would have been displayed is in the 0th element, +and so on. Each element is a hash with three key-value pairs, as follows: + +=over 4 + +=item C<text> + +The text of the message as a C<SVpv>. + +=item C<warn_categories> + +The warning category (or categories) packed into a C<SVuv>. + +=item C<flag> + +A single flag bit associated with this message, in a C<SVuv>. +The bit corresponds to some bit in the C<*errors> return value, +such as C<UTF8_GOT_LONG>. + +=back + +It's important to note that specifying this parameter as non-null will cause +any warnings this function would otherwise generate to be suppressed, and +instead be placed in C<*msgs>. The caller can check the lexical warnings state +(or not) when choosing what to do with the returned messages. + +If the flag C<UTF8_CHECK_ONLY> is passed, no warnings are generated, and hence +no AV is created. + +The caller, of course, is responsible for freeing any returned AV. + +=cut +*/ + +UV +Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s, + STRLEN curlen, + STRLEN *retlen, + const U32 flags, + U32 * errors, + AV ** msgs) { const U8 * const s0 = s; U8 * send = NULL; /* (initialized to silence compilers' wrong @@ -1302,7 +1370,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, routine; see [perl #130921] */ UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */ - PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR; + PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS; if (errors) { *errors = 0; @@ -1576,9 +1644,14 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, bool disallowed = FALSE; const U32 orig_problems = possible_problems; + if (msgs) { + *msgs = NULL; + } + while (possible_problems) { /* Handle each possible problem */ UV pack_warn = 0; char * message = NULL; + U32 this_flag_bit = 0; /* Each 'if' clause handles one problem. They are ordered so that * the first ones' messages will be displayed before the later @@ -1623,16 +1696,17 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, * necessarily do so in the future. We output (only) the * most dire warning */ if (! (flags & UTF8_CHECK_ONLY)) { - if (ckWARN_d(WARN_UTF8)) { + if (msgs || ckWARN_d(WARN_UTF8)) { pack_warn = packWARN(WARN_UTF8); } - else if (ckWARN_d(WARN_NON_UNICODE)) { + else if (msgs || ckWARN_d(WARN_NON_UNICODE)) { pack_warn = packWARN(WARN_NON_UNICODE); } if (pack_warn) { message = Perl_form(aTHX_ "%s: %s (overflows)", malformed_text, _byte_dump_string(s0, curlen, 0)); + this_flag_bit = UTF8_GOT_OVERFLOW; } } } @@ -1649,10 +1723,13 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, assert(0); disallowed = TRUE; - if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) { + if ( (msgs + || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) + { pack_warn = packWARN(WARN_UTF8); message = Perl_form(aTHX_ "%s (empty string)", malformed_text); + this_flag_bit = UTF8_GOT_EMPTY; } } } @@ -1662,13 +1739,16 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, if (! (flags & UTF8_ALLOW_CONTINUATION)) { disallowed = TRUE; - if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) { + if (( msgs + || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) + { pack_warn = packWARN(WARN_UTF8); message = Perl_form(aTHX_ "%s: %s (unexpected continuation byte 0x%02x," " with no preceding start byte)", malformed_text, _byte_dump_string(s0, 1, 0), *s0); + this_flag_bit = UTF8_GOT_CONTINUATION; } } } @@ -1678,7 +1758,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, if (! (flags & UTF8_ALLOW_SHORT)) { disallowed = TRUE; - if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) { + if (( msgs + || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) + { pack_warn = packWARN(WARN_UTF8); message = Perl_form(aTHX_ "%s: %s (too short; %d byte%s available, need %d)", @@ -1687,6 +1769,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, (int)avail_len, avail_len == 1 ? "" : "s", (int)expectlen); + this_flag_bit = UTF8_GOT_SHORT; } } @@ -1697,7 +1780,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) { disallowed = TRUE; - if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) { + if (( msgs + || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) + { /* If we don't know for sure that the input length is * valid, avoid as much as possible reading past the @@ -1711,6 +1796,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, printlen, s - s0, (int) expectlen)); + this_flag_bit = UTF8_GOT_NON_CONTINUATION; } } } @@ -1721,7 +1807,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, *errors |= UTF8_GOT_SURROGATE; if ( ! (flags & UTF8_CHECK_ONLY) - && ckWARN_d(WARN_SURROGATE)) + && (msgs || ckWARN_d(WARN_SURROGATE))) { pack_warn = packWARN(WARN_SURROGATE); @@ -1736,6 +1822,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, else { message = Perl_form(aTHX_ surrogate_cp_format, uv); } + this_flag_bit = UTF8_GOT_SURROGATE; } } @@ -1751,7 +1838,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, *errors |= UTF8_GOT_SUPER; if ( ! (flags & UTF8_CHECK_ONLY) - && ckWARN_d(WARN_NON_UNICODE)) + && (msgs || ckWARN_d(WARN_NON_UNICODE))) { pack_warn = packWARN(WARN_NON_UNICODE); @@ -1765,6 +1852,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, else { message = Perl_form(aTHX_ super_cp_format, uv); } + this_flag_bit = UTF8_GOT_SUPER; } } @@ -1774,7 +1862,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) { if ( ! (flags & UTF8_CHECK_ONLY) && (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER)) - && ckWARN_d(WARN_NON_UNICODE)) + && (msgs || ckWARN_d(WARN_NON_UNICODE))) { pack_warn = packWARN(WARN_NON_UNICODE); @@ -1798,6 +1886,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, " so is not portable", _byte_dump_string(s0, curlen, 0)); } + this_flag_bit = UTF8_GOT_PERL_EXTENDED; } if (flags & ( UTF8_WARN_PERL_EXTENDED @@ -1823,7 +1912,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, *errors |= UTF8_GOT_NONCHAR; if ( ! (flags & UTF8_CHECK_ONLY) - && ckWARN_d(WARN_NONCHAR)) + && (msgs || ckWARN_d(WARN_NONCHAR))) { /* The code above should have guaranteed that we don't * get here with errors other than overlong */ @@ -1832,6 +1921,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, pack_warn = packWARN(WARN_NONCHAR); message = Perl_form(aTHX_ nonchar_cp_format, uv); + this_flag_bit = UTF8_GOT_NONCHAR; } } @@ -1857,7 +1947,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, else { disallowed = TRUE; - if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) { + if (( msgs + || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) + { pack_warn = packWARN(WARN_UTF8); /* These error types cause 'uv' to be something that @@ -1900,6 +1992,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, small code points */ UNI_TO_NATIVE(uv)); } + this_flag_bit = UTF8_GOT_LONG; } } } /* End of looking through the possible flags */ @@ -1907,7 +2000,25 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, ... 39 lines suppressed ... -- Perl5 Master Repository
