This is an automated email from the ASF dual-hosted git repository.
mgrigorov pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/avro.git
The following commit(s) were added to refs/heads/main by this push:
new 82d864fd3 AVRO-1521 [Perl] Fix boolean encoding errors (#2986)
82d864fd3 is described below
commit 82d864fd3751e77ecd255b6b28914926d72916f9
Author: José Joaquín Atria <[email protected]>
AuthorDate: Fri Jun 28 12:20:41 2024 +0100
AVRO-1521 [Perl] Fix boolean encoding errors (#2986)
This change fixes a long-standing issue with the binary encoding
of boolean values. In particular, that while several "smart" values
were accepted as valid boolean values by Avro::Schema (eg. "true"
and "no"), Avro::BinaryEncoder encoded them as true or false depending
on their truth value for Perl. This resulted in both of those examples
being encoded as true, because for Perl any non-empty string is true.
This change makes it so that those values are accepted and properly
handled, and handles other values that represent boolean values
like JSON::PP::Boolean references and native Perl booleans (those
that would be returned by eg. builtin::true).
This also includes a small but possibly breaking bugfix for the
detection of valid boolean values in Avro::Schema, which was using
a non-anchored regular expression to filter values, meaning that
eg. any value that had an "n" anywhere would be considered valid.
This was most likely an involuntary error, so while breaking, it
feels like we have to fix it.
---
lang/perl/Changes | 12 +++++
lang/perl/lib/Avro/BinaryEncoder.pm | 24 +++++++++-
lang/perl/lib/Avro/Schema.pm | 4 +-
lang/perl/t/02_bin_encode.t | 91 +++++++++++++++++++++++++++----------
4 files changed, 105 insertions(+), 26 deletions(-)
diff --git a/lang/perl/Changes b/lang/perl/Changes
index c1551566f..462f90e90 100644
--- a/lang/perl/Changes
+++ b/lang/perl/Changes
@@ -15,6 +15,18 @@ Revision history for Perl extension Avro
are encoded in fields of type 'byte' or 'fixed'.
Errors resulting from this process will be raised as
Avro::BinaryEncoder::Error exceptions
+ - Fixed a bug with the detection of valid boolean values
+ which was not using anchors in a regular expression.
+ Valid values are (case insensitively) 'yes', 'y', 'no',
+ 'n', 'true', 't', 'false', 'f', 0, 1, and anything that
+ is accepted by JSON::PP::is_bool. References rejected by
+ this last function are not valid
+ - Fixed some issues around the binary encoding of boolean
+ values. In particular, Avro::BinaryEncoder now correctly
+ encodes all the values that Avro::Schema accepts as
+ valid boolean values, which before were blindly evaluated
+ in boolean context resulting in strings like 'false'
+ being serialised as if they were true
1.00 Fri Jan 17 15:00:00 2014
- Relicense under apache license 2.0
diff --git a/lang/perl/lib/Avro/BinaryEncoder.pm
b/lang/perl/lib/Avro/BinaryEncoder.pm
index 47eb93116..51399a64c 100644
--- a/lang/perl/lib/Avro/BinaryEncoder.pm
+++ b/lang/perl/lib/Avro/BinaryEncoder.pm
@@ -23,6 +23,7 @@ use Config;
use Encode();
use Error::Simple;
use Regexp::Common qw(number);
+use JSON::PP; # For is_bool
our $VERSION = '++MODULE_VERSION++';
@@ -91,7 +92,28 @@ sub encode_null {
sub encode_boolean {
my $class = shift;
my ($schema, $data, $cb) = @_;
- $cb->( $data ? \"\x1" : \"\x0" );
+
+ throw Avro::BinaryEncoder::Error( "<UNDEF> is not a valid boolean value")
+ unless defined $data;
+
+ if ( my $type = ref $data ) {
+ throw Avro::BinaryEncoder::Error("cannot encode a '$type' reference as
boolean")
+ unless $type eq 'JSON::PP::Boolean';
+ }
+ else {
+ throw Avro::BinaryEncoder::Error( "'$data' is not a valid boolean
value")
+ unless $data eq '' # For Perl versions without builtin::is_bool
+ || JSON::PP::is_bool($data)
+ || $data =~ /^(?:true|t|false|f|yes|y|no|n|0|1)$/i;
+ }
+
+ # Some values might be false but evaluate to "truthy" for Perl,
+ # like the string 'false'. For these known exceptions, which
+ # would otherwise be false, we read a false value. For everything
+ # else, we rely on their value evaluated in a boolean context.
+ my $true = $data =~ /^(?:no|n|false|f)$/i ? 0 : !!$data;
+
+ $cb->( $true ? \"\x1" : \"\x0" );
}
sub encode_int {
diff --git a/lang/perl/lib/Avro/Schema.pm b/lang/perl/lib/Avro/Schema.pm
index 200d2947c..0543a90f0 100644
--- a/lang/perl/lib/Avro/Schema.pm
+++ b/lang/perl/lib/Avro/Schema.pm
@@ -321,9 +321,9 @@ sub is_data_valid {
return 1 unless ref $data;
}
if ($type eq 'boolean') {
+ return 1 if JSON::PP::is_bool($data);
return 0 if ref $data; # sometimes risky
- return 1 if $data =~ m{yes|no|y|n|t|f|true|false}i;
- return 0;
+ return $data =~ m{^(?:yes|no|y|n|t|f|true|false|0|1)$}i;
}
return 0;
}
diff --git a/lang/perl/t/02_bin_encode.t b/lang/perl/t/02_bin_encode.t
index 4a3001fe9..5be10e7b7 100644
--- a/lang/perl/t/02_bin_encode.t
+++ b/lang/perl/t/02_bin_encode.t
@@ -24,18 +24,26 @@ use Config;
use Test::More;
use Test::Exception;
use Math::BigInt;
+use JSON::PP; # For booleans
use_ok 'Avro::BinaryEncoder';
-sub primitive_ok {
- my ($primitive_type, $primitive_val, $expected_enc) = @_;
+sub primitive {
+ my ($type, $data) = @_;
- my $data;
- my $meth = "encode_$primitive_type";
- Avro::BinaryEncoder->$meth(
- undef, $primitive_val, sub { $data = ${$_[0]} }
+ my $encoded;
+ my $method = "encode_$type";
+ Avro::BinaryEncoder->$method(
+ undef, $data, sub { $encoded = ${$_[0]} }
);
- is $data, $expected_enc, "primitive $primitive_type encoded correctly";
+ return $encoded;
+}
+
+sub primitive_ok {
+ my ($type, $have, $want) = @_;
+ my $data = primitive( $type => $have );
+ is primitive( $type => $have ), $want,
+ "primitive $type encoded @{[ $have // '<UNDEF>' ]} correctly";
return $data;
}
@@ -44,8 +52,48 @@ sub primitive_ok {
primitive_ok null => undef, '';
primitive_ok null => 'whatev', '';
- primitive_ok boolean => 0, "\x0";
- primitive_ok boolean => 1, "\x1";
+ primitive_ok boolean => 0, "\x0";
+ primitive_ok boolean => 1, "\x1";
+ primitive_ok boolean => 'false', "\x0";
+ primitive_ok boolean => 'true', "\x1";
+ primitive_ok boolean => 'f', "\x0";
+ primitive_ok boolean => 't', "\x1";
+ primitive_ok boolean => 'no', "\x0";
+ primitive_ok boolean => 'yes', "\x1";
+ primitive_ok boolean => 'n', "\x0";
+ primitive_ok boolean => 'y', "\x1";
+ primitive_ok boolean => 'FALSE', "\x0";
+ primitive_ok boolean => 'TRUE', "\x1";
+ primitive_ok boolean => 'F', "\x0";
+ primitive_ok boolean => 'T', "\x1";
+ primitive_ok boolean => 'NO', "\x0";
+ primitive_ok boolean => 'YES', "\x1";
+ primitive_ok boolean => 'N', "\x0";
+ primitive_ok boolean => 'Y', "\x1";
+ primitive_ok boolean => !!0, "\x0"; # Native false
+ primitive_ok boolean => !!1, "\x1"; # Native true
+ primitive_ok boolean => $JSON::PP::false, "\x0";
+ primitive_ok boolean => $JSON::PP::true, "\x1";
+
+ throws_ok {
+ primitive boolean => undef;
+ } "Avro::BinaryEncoder::Error", "<UNDEF> is not a valid boolean value";
+
+ throws_ok {
+ primitive boolean => 2;
+ } "Avro::BinaryEncoder::Error", "'2' is not a valid boolean value";
+
+ throws_ok {
+ primitive boolean => -1;
+ } "Avro::BinaryEncoder::Error", "'-1' is not a valid boolean value";
+
+ throws_ok {
+ primitive boolean => 'anything truthy';
+ } "Avro::BinaryEncoder::Error", "'anything truthy' is not a valid boolean
value";
+
+ throws_ok {
+ primitive boolean => {};
+ } "Avro::BinaryEncoder::Error", "cannot encode a 'HASH' reference as
boolean";
## - high-bit of each byte should be set except for last one
## - rest of bits are:
@@ -74,33 +122,30 @@ sub primitive_ok {
primitive_ok long => -Math::BigInt->new(2)**63,
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\x01";
throws_ok {
- primitive_ok int => Math::BigInt->new(2)**31;
+ primitive int => Math::BigInt->new(2)**31;
} "Avro::BinaryEncoder::Error", "32-bit signed int overflow";
throws_ok {
- primitive_ok int => -Math::BigInt->new(2)**31 - 1;
+ primitive int => -Math::BigInt->new(2)**31 - 1;
} "Avro::BinaryEncoder::Error", "32-bit signed int underflow";
throws_ok {
- primitive_ok int => Math::BigInt->new(2)**63;
+ primitive int => Math::BigInt->new(2)**63;
} "Avro::BinaryEncoder::Error", "64-bit signed int overflow";
throws_ok {
- primitive_ok long => -Math::BigInt->new(2)**63 - 1;
+ primitive long => -Math::BigInt->new(2)**63 - 1;
} "Avro::BinaryEncoder::Error", "64-bit signed int underflow";
- for (qw(long int)) {
- throws_ok {
- primitive_ok $_ => "x", undef;
- } 'Avro::BinaryEncoder::Error', 'numeric values only';
- };
+ throws_ok {
+ primitive $_ => "x";
+ } 'Avro::BinaryEncoder::Error', 'numeric values only' for qw(long int);
+
# In Unicode, there are decimals that aren't 0-9.
# Make sure we handle non-ascii decimals cleanly.
- for (qw(long int)) {
- throws_ok {
- primitive_ok $_ => "\N{U+0661}", undef;
- } 'Avro::BinaryEncoder::Error', 'ascii decimals only';
- };
+ throws_ok {
+ primitive $_ => "\N{U+0661}";
+ } 'Avro::BinaryEncoder::Error', 'ascii decimals only' for qw(long int);
}
## spec examples