This is an automated email from the ASF dual-hosted git repository.

mgrigorov pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/avro.git


The following commit(s) were added to refs/heads/main by this push:
     new 677e9829b AVRO-1517: [Perl] Encode UTF-8 strings as bytes (#2979)
677e9829b is described below

commit 677e9829bae30cc76527c6f5702f8c2384be61c5
Author: José Joaquín Atria <[email protected]>
AuthorDate: Wed Jun 26 16:12:45 2024 +0100

    AVRO-1517: [Perl] Encode UTF-8 strings as bytes (#2979)
    
    From John Karp's original description of [the issue]:
    
    > By default in Perl, a string is a sequence of bytes, values 0-255.
    > However, if a Unicode character is included that cannot be represented
    > with a single byte, the string gets 'upgraded' to a non-byte-based
    > Unicode string allowing ordinals outside that range. When string
    > operations are done with byte and non-byte Unicode strings, the result
    > is always non-byte, with the byte string first 'upgraded'. Upgrading
    > consists of utf8 encoding and setting a utf8 flag on the string. ('utf8'
    > is a variant of UTF-8 used by Perl)
    >
    > The Perl Avro API is accepting these Unicode strings as-is for the
    > 'bytes' type. This is a problem because
    >
    >   1. values >255 are not valid as bytes, and any encoding is their job
    >
    >   2. As Avro assembles the serialized data, Perl 'upgrades' all the data,
    >      having the effect of utf8 encoding our serialized binary data.
    >
    > The correct behavior is for the Avro Perl API is to attempt to downgrade
    > the string, and if this fails because it contained values >255 then to
    > raise an error. (The behavior of 'string' won't change, it will still
    > take Unicode strings as expected.)
    
    This change, based on the one submitted for that ticket, adds these
    behaviours and tests to exercise them.
    
    [the issue]: https://issues.apache.org/jira/browse/AVRO-1517
---
 lang/perl/Changes                   |  4 ++++
 lang/perl/lib/Avro/BinaryEncoder.pm | 21 ++++++++++++++-------
 lang/perl/lib/Avro/Schema.pm        | 15 +++++++++------
 lang/perl/t/01_schema.t             | 28 +++++++++++++++++++++++++++-
 4 files changed, 54 insertions(+), 14 deletions(-)

diff --git a/lang/perl/Changes b/lang/perl/Changes
index 678056028..c1551566f 100644
--- a/lang/perl/Changes
+++ b/lang/perl/Changes
@@ -11,6 +11,10 @@ Revision history for Perl extension Avro
           for int and long types were off by one
         - Silenced a spurious warning that was raised when
           validating an undefined value for some data types
+        - Make sure Unicode strings are downgraded when they
+          are encoded in fields of type 'byte' or 'fixed'.
+          Errors resulting from this process will be raised as
+          Avro::BinaryEncoder::Error exceptions
 
 1.00  Fri Jan 17 15:00:00 2014
         - Relicense under apache license 2.0
diff --git a/lang/perl/lib/Avro/BinaryEncoder.pm 
b/lang/perl/lib/Avro/BinaryEncoder.pm
index 18a25813e..47eb93116 100644
--- a/lang/perl/lib/Avro/BinaryEncoder.pm
+++ b/lang/perl/lib/Avro/BinaryEncoder.pm
@@ -138,7 +138,9 @@ sub encode_double {
 sub encode_bytes {
     my $class = shift;
     my ($schema, $data, $cb) = @_;
-    encode_long($class, undef, bytes::length($data), $cb);
+    throw Avro::BinaryEncoder::Error("Invalid data given for 'bytes': Contains 
values >255")
+        unless utf8::downgrade($data, 1);
+    encode_long($class, undef, length($data), $cb);
     $cb->(\$data);
 }
 
@@ -146,7 +148,7 @@ sub encode_string {
     my $class = shift;
     my ($schema, $data, $cb) = @_;
     my $bytes = Encode::encode_utf8($data);
-    encode_long($class, undef, bytes::length($bytes), $cb);
+    encode_long($class, undef, length($bytes), $cb);
     $cb->(\$bytes);
 }
 
@@ -270,11 +272,16 @@ sub encode_union {
 sub encode_fixed {
     my $class = shift;
     my ($schema, $data, $cb) = @_;
-    if (bytes::length $data != $schema->size) {
-        my $s1 = bytes::length $data;
-        my $s2 = $schema->size;
-        throw Avro::BinaryEncoder::Error("Fixed size doesn't match $s1!=$s2");
-    }
+
+    throw Avro::BinaryEncoder::Error("Invalid data given for 'fixed': Contains 
values >255")
+        unless utf8::downgrade($data, 1);
+
+    my $length = length $data;
+    my $size   = $schema->size;
+
+    throw Avro::BinaryEncoder::Error("Fixed size doesn't match $length!=$size")
+        unless $length == $size;
+
     $cb->(\$data);
 }
 
diff --git a/lang/perl/lib/Avro/Schema.pm b/lang/perl/lib/Avro/Schema.pm
index 3451b0b4d..200d2947c 100644
--- a/lang/perl/lib/Avro/Schema.pm
+++ b/lang/perl/lib/Avro/Schema.pm
@@ -313,7 +313,11 @@ sub is_data_valid {
     if ($type eq 'float' or $type eq 'double') {
         $data =~ /^$RE{num}{real}$/ ? return 1 : 0;
     }
-    if ($type eq "bytes" or $type eq "string") {
+    if ($type eq 'bytes') {
+        return 0 if ref $data;
+        return 1 unless utf8::is_utf8($data) and $data =~ /[^\x00-\xFF]/;
+    }
+    if ($type eq 'string') {
         return 1 unless ref $data;
     }
     if ($type eq 'boolean') {
@@ -807,11 +811,10 @@ sub new {
 }
 
 sub is_data_valid {
-    my $schema = shift;
-    my $default = shift;
-    my $size = $schema->{size};
-    return 1 if $default && bytes::length $default == $size;
-    return 0;
+    my ( $schema, $data ) = @_;
+
+    return 0 if utf8::is_utf8($data) && $data =~ /[^\x00-\xFF]/;
+    return $data && length($data) == $schema->{size};
 }
 
 sub size {
diff --git a/lang/perl/t/01_schema.t b/lang/perl/t/01_schema.t
index f844ef0f6..44a580929 100644
--- a/lang/perl/t/01_schema.t
+++ b/lang/perl/t/01_schema.t
@@ -19,7 +19,7 @@ use strict;
 use warnings;
 
 use Test::More;
-plan tests => 137;
+plan tests => 145;
 use Test::Exception;
 use_ok 'Avro::Schema';
 
@@ -42,6 +42,32 @@ isa_ok $s2, 'Avro::Schema::Primitive';
 is $s2->type, "string", "type is string";
 is $s, $s2, "string Schematas are singletons";
 
+## Perl strings as bytes
+{
+    my $schema = Avro::Schema->parse(q({"type": "bytes"}));
+    ok $schema->is_data_valid(''), 'Empty string is valid as bytes';
+    ok $schema->is_data_valid("\0"), 'Zero byte is valid as bytes';
+    ok !$schema->is_data_valid("\x{100}"), 'Values > 255 not valid as bytes';
+
+    my $bytes = '';
+    utf8::upgrade($bytes);
+
+    ok $schema->is_data_valid($bytes), 'Upgraded string valid as bytes';
+}
+
+## Perl strings as fixed
+{
+    my $schema = Avro::Schema->parse(q({"type": "fixed", "name": "foo", 
"size": 1 }));
+    ok !$schema->is_data_valid(''), 'Too few bytes vs. schema';
+    ok $schema->is_data_valid("\0"), 'Zero byte is valid as fixed';
+    ok !$schema->is_data_valid("\x{100}"), 'Values > 255 not valid as fixed';
+
+    my $bytes = "\xff";
+    utf8::upgrade($bytes);
+
+    ok $schema->is_data_valid($bytes), 'Upgraded string valid as fixed';
+}
+
 ## Records
 {
     my $s3 = Avro::Schema::Record->new(

Reply via email to