This is an automated email from the ASF dual-hosted git repository.
mgrigorov pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/avro.git
The following commit(s) were added to refs/heads/main by this push:
new 677e9829b AVRO-1517: [Perl] Encode UTF-8 strings as bytes (#2979)
677e9829b is described below
commit 677e9829bae30cc76527c6f5702f8c2384be61c5
Author: José Joaquín Atria <[email protected]>
AuthorDate: Wed Jun 26 16:12:45 2024 +0100
AVRO-1517: [Perl] Encode UTF-8 strings as bytes (#2979)
From John Karp's original description of [the issue]:
> By default in Perl, a string is a sequence of bytes, values 0-255.
> However, if a Unicode character is included that cannot be represented
> with a single byte, the string gets 'upgraded' to a non-byte-based
> Unicode string allowing ordinals outside that range. When string
> operations are done with byte and non-byte Unicode strings, the result
> is always non-byte, with the byte string first 'upgraded'. Upgrading
> consists of utf8 encoding and setting a utf8 flag on the string. ('utf8'
> is a variant of UTF-8 used by Perl)
>
> The Perl Avro API is accepting these Unicode strings as-is for the
> 'bytes' type. This is a problem because
>
> 1. values >255 are not valid as bytes, and any encoding is their job
>
> 2. As Avro assembles the serialized data, Perl 'upgrades' all the data,
> having the effect of utf8 encoding our serialized binary data.
>
> The correct behavior is for the Avro Perl API is to attempt to downgrade
> the string, and if this fails because it contained values >255 then to
> raise an error. (The behavior of 'string' won't change, it will still
> take Unicode strings as expected.)
This change, based on the one submitted for that ticket, adds these
behaviours and tests to exercise them.
[the issue]: https://issues.apache.org/jira/browse/AVRO-1517
---
lang/perl/Changes | 4 ++++
lang/perl/lib/Avro/BinaryEncoder.pm | 21 ++++++++++++++-------
lang/perl/lib/Avro/Schema.pm | 15 +++++++++------
lang/perl/t/01_schema.t | 28 +++++++++++++++++++++++++++-
4 files changed, 54 insertions(+), 14 deletions(-)
diff --git a/lang/perl/Changes b/lang/perl/Changes
index 678056028..c1551566f 100644
--- a/lang/perl/Changes
+++ b/lang/perl/Changes
@@ -11,6 +11,10 @@ Revision history for Perl extension Avro
for int and long types were off by one
- Silenced a spurious warning that was raised when
validating an undefined value for some data types
+ - Make sure Unicode strings are downgraded when they
+ are encoded in fields of type 'byte' or 'fixed'.
+ Errors resulting from this process will be raised as
+ Avro::BinaryEncoder::Error exceptions
1.00 Fri Jan 17 15:00:00 2014
- Relicense under apache license 2.0
diff --git a/lang/perl/lib/Avro/BinaryEncoder.pm
b/lang/perl/lib/Avro/BinaryEncoder.pm
index 18a25813e..47eb93116 100644
--- a/lang/perl/lib/Avro/BinaryEncoder.pm
+++ b/lang/perl/lib/Avro/BinaryEncoder.pm
@@ -138,7 +138,9 @@ sub encode_double {
sub encode_bytes {
my $class = shift;
my ($schema, $data, $cb) = @_;
- encode_long($class, undef, bytes::length($data), $cb);
+ throw Avro::BinaryEncoder::Error("Invalid data given for 'bytes': Contains
values >255")
+ unless utf8::downgrade($data, 1);
+ encode_long($class, undef, length($data), $cb);
$cb->(\$data);
}
@@ -146,7 +148,7 @@ sub encode_string {
my $class = shift;
my ($schema, $data, $cb) = @_;
my $bytes = Encode::encode_utf8($data);
- encode_long($class, undef, bytes::length($bytes), $cb);
+ encode_long($class, undef, length($bytes), $cb);
$cb->(\$bytes);
}
@@ -270,11 +272,16 @@ sub encode_union {
sub encode_fixed {
my $class = shift;
my ($schema, $data, $cb) = @_;
- if (bytes::length $data != $schema->size) {
- my $s1 = bytes::length $data;
- my $s2 = $schema->size;
- throw Avro::BinaryEncoder::Error("Fixed size doesn't match $s1!=$s2");
- }
+
+ throw Avro::BinaryEncoder::Error("Invalid data given for 'fixed': Contains
values >255")
+ unless utf8::downgrade($data, 1);
+
+ my $length = length $data;
+ my $size = $schema->size;
+
+ throw Avro::BinaryEncoder::Error("Fixed size doesn't match $length!=$size")
+ unless $length == $size;
+
$cb->(\$data);
}
diff --git a/lang/perl/lib/Avro/Schema.pm b/lang/perl/lib/Avro/Schema.pm
index 3451b0b4d..200d2947c 100644
--- a/lang/perl/lib/Avro/Schema.pm
+++ b/lang/perl/lib/Avro/Schema.pm
@@ -313,7 +313,11 @@ sub is_data_valid {
if ($type eq 'float' or $type eq 'double') {
$data =~ /^$RE{num}{real}$/ ? return 1 : 0;
}
- if ($type eq "bytes" or $type eq "string") {
+ if ($type eq 'bytes') {
+ return 0 if ref $data;
+ return 1 unless utf8::is_utf8($data) and $data =~ /[^\x00-\xFF]/;
+ }
+ if ($type eq 'string') {
return 1 unless ref $data;
}
if ($type eq 'boolean') {
@@ -807,11 +811,10 @@ sub new {
}
sub is_data_valid {
- my $schema = shift;
- my $default = shift;
- my $size = $schema->{size};
- return 1 if $default && bytes::length $default == $size;
- return 0;
+ my ( $schema, $data ) = @_;
+
+ return 0 if utf8::is_utf8($data) && $data =~ /[^\x00-\xFF]/;
+ return $data && length($data) == $schema->{size};
}
sub size {
diff --git a/lang/perl/t/01_schema.t b/lang/perl/t/01_schema.t
index f844ef0f6..44a580929 100644
--- a/lang/perl/t/01_schema.t
+++ b/lang/perl/t/01_schema.t
@@ -19,7 +19,7 @@ use strict;
use warnings;
use Test::More;
-plan tests => 137;
+plan tests => 145;
use Test::Exception;
use_ok 'Avro::Schema';
@@ -42,6 +42,32 @@ isa_ok $s2, 'Avro::Schema::Primitive';
is $s2->type, "string", "type is string";
is $s, $s2, "string Schematas are singletons";
+## Perl strings as bytes
+{
+ my $schema = Avro::Schema->parse(q({"type": "bytes"}));
+ ok $schema->is_data_valid(''), 'Empty string is valid as bytes';
+ ok $schema->is_data_valid("\0"), 'Zero byte is valid as bytes';
+ ok !$schema->is_data_valid("\x{100}"), 'Values > 255 not valid as bytes';
+
+ my $bytes = '';
+ utf8::upgrade($bytes);
+
+ ok $schema->is_data_valid($bytes), 'Upgraded string valid as bytes';
+}
+
+## Perl strings as fixed
+{
+ my $schema = Avro::Schema->parse(q({"type": "fixed", "name": "foo",
"size": 1 }));
+ ok !$schema->is_data_valid(''), 'Too few bytes vs. schema';
+ ok $schema->is_data_valid("\0"), 'Zero byte is valid as fixed';
+ ok !$schema->is_data_valid("\x{100}"), 'Values > 255 not valid as fixed';
+
+ my $bytes = "\xff";
+ utf8::upgrade($bytes);
+
+ ok $schema->is_data_valid($bytes), 'Upgraded string valid as fixed';
+}
+
## Records
{
my $s3 = Avro::Schema::Record->new(