In perl.git, the branch blead has been updated <https://perl5.git.perl.org/perl.git/commitdiff/ae315a0a3c51e68887704d4907bb6a502a6d4e3f?hp=0b13e5291ebd9c786dea21905e17886c5a310454>
- Log ----------------------------------------------------------------- commit ae315a0a3c51e68887704d4907bb6a502a6d4e3f Author: Karl Williamson <k...@cpan.org> Date: Sun Feb 4 21:47:09 2018 -0700 APItest: Add tests for utf8_to_bytes() commit 8132136a878b27b9619d552278dd329a2f289bd4 Author: Karl Williamson <k...@cpan.org> Date: Sun Feb 4 21:44:17 2018 -0700 APItest:t/utf8_setup.pl: Display printables as themselves Instead of the harder to read \xXX ----------------------------------------------------------------------- Summary of changes: MANIFEST | 1 + ext/XS-APItest/APItest.xs | 18 +++++++++++ ext/XS-APItest/t/utf8_setup.pl | 6 +++- ext/XS-APItest/t/utf8_to_bytes.t | 68 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 ext/XS-APItest/t/utf8_to_bytes.t diff --git a/MANIFEST b/MANIFEST index 96c8da5b5e..4a5c649e45 100644 --- a/MANIFEST +++ b/MANIFEST @@ -4394,6 +4394,7 @@ ext/XS-APItest/t/underscore_length.t Test find_rundefsv() ext/XS-APItest/t/utf16_to_utf8.t Test behaviour of utf16_to_utf8{,reversed} ext/XS-APItest/t/utf8.t Tests for code in utf8.c ext/XS-APItest/t/utf8_setup.pl Tests for code in utf8.c +ext/XS-APItest/t/utf8_to_bytes.t Tests for code in utf8.c ext/XS-APItest/t/utf8_warn00.t Tests for code in utf8.c ext/XS-APItest/t/utf8_warn01.t Tests for code in utf8.c ext/XS-APItest/t/utf8_warn02.t Tests for code in utf8.c diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs index 0be5d95310..5e67e7fa40 100644 --- a/ext/XS-APItest/APItest.xs +++ b/ext/XS-APItest/APItest.xs @@ -1379,6 +1379,24 @@ bytes_cmp_utf8(bytes, utf8) OUTPUT: RETVAL +AV * +test_utf8_to_bytes(bytes, lenp) + unsigned char * bytes + STRLEN lenp + PREINIT: + char * ret; + CODE: + RETVAL = newAV(); + sv_2mortal((SV*)RETVAL); + + ret = (char *) utf8_to_bytes(bytes, &lenp); + av_push(RETVAL, newSVpv(ret, 0)); + av_push(RETVAL, newSViv(lenp)); + av_push(RETVAL, newSVpv((const char *) bytes, 0)); + + OUTPUT: + RETVAL + AV * test_utf8n_to_uvchr_msgs(s, len, flags) char *s diff --git a/ext/XS-APItest/t/utf8_setup.pl b/ext/XS-APItest/t/utf8_setup.pl index ec7a5ce3d1..231b4d9494 100644 --- a/ext/XS-APItest/t/utf8_setup.pl +++ b/ext/XS-APItest/t/utf8_setup.pl @@ -11,7 +11,11 @@ sub isASCII { ord "A" == 65 } sub display_bytes_no_quotes { use bytes; my $string = shift; - return join("", map { sprintf("\\x%02x", ord $_) } split "", $string) + return join("", map { + ($_ =~ /[[:print:]]/) + ? $_ + : sprintf("\\x%02x", ord $_) + } split "", $string) } sub display_bytes { diff --git a/ext/XS-APItest/t/utf8_to_bytes.t b/ext/XS-APItest/t/utf8_to_bytes.t new file mode 100644 index 0000000000..4c03f842f5 --- /dev/null +++ b/ext/XS-APItest/t/utf8_to_bytes.t @@ -0,0 +1,68 @@ +#!perl -w + +# This is a base file to be used by various .t's in its directory +# It tests various malformed UTF-8 sequences and some code points that are +# "problematic", and verifies that the correct warnings/flags etc are +# generated when using them. For the code points, it also takes the UTF-8 and +# perturbs it to be malformed in various ways, and tests that this gets +# appropriately detected. + +use strict; +use Test::More; + +BEGIN { + require './t/utf8_setup.pl'; + use_ok('XS::APItest'); +}; + +$|=1; + +use Data::Dumper; + +my @well_formed = ( + "\xE1", + "The quick brown fox jumped over the lazy dog", + "Ces systèmes de codage sont souvent incompatibles entre eux. Ainsi, deux systèmes peuvent utiliser le même nombre pour deux caractères différents ou utiliser différents nombres pour le même caractère.", + "Kelimelerin m\xC3\xAAme caract\xC3\xA8re ve yaz\xC3\xB1abc", +); + +my @malformed = ( + "Kelimelerin m\xC3\xAAme caract\xC3\xA8re ve yaz\xC4\xB1abc", + "Kelimelerin m\xC3\xAAme caract\xC3\xA8re ve yaz\xC4\xB1\xC3\xA8abc", + "Kelimelerin m\xC3\xAAme caract\xC3re ve yazi\xC3\xA8abc", + "Kelimelerin m\xC3\xAAme caract\xA8 ve yazi\xC3\xA8abc", + "Kelimelerin m\xC3\xAAme caract\xC3\xA8\xC3re ve yazi\xC3\xA8abc", +); + +for my $test (@well_formed) { + my $utf8 = $test; + utf8::upgrade($utf8); + my $utf8_length; + my $byte_length = length $test; + + { + use bytes; + $utf8_length = length $utf8; + } + + my $ret_ref = test_utf8_to_bytes($utf8, $utf8_length); + + is ($ret_ref->[0], $test, "Successfully downgraded " + . display_bytes($utf8)); + is ($ret_ref->[1], $byte_length, "... And returned correct length(" + . $byte_length . ")"); +} + +for my $test (@malformed) { + my $utf8 = $test; + my $utf8_length = length $test; + + my $ret_ref = test_utf8_to_bytes($utf8, $utf8_length); + + ok (! defined $ret_ref->[0], "Returned undef for malformed " + . display_bytes($utf8)); + is ($ret_ref->[1], -1, "... And returned length -1"); + is ($ret_ref->[2], $utf8, "... And left the input unchanged"); +} + +done_testing(); -- Perl5 Master Repository