This is an automated email from the ASF dual-hosted git repository.
emkornfield pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 423ca163a2 PARQUET-2163: Handle decimal schemas with large
fixed_len_byte_arrays
423ca163a2 is described below
commit 423ca163a26781ef6a8229af22b7e6e2d7423a54
Author: William Butler <[email protected]>
AuthorDate: Wed Jul 6 09:02:00 2022 -0700
PARQUET-2163: Handle decimal schemas with large fixed_len_byte_arrays
The precision calculation had been overflowing to infinity when the
length of the fixed_len_byte_array > 128, triggering an error when then
trying to convert infinity to an int32. We can actually simplify the
logic by noting that log_b(a^(x)) = log_b(a)*x. This avoids the
intermediate infinity. We also added a check for extremely large value
sizes implying a max precision that cannot fit in int32. Even 129 byte
decimal seems extreme.
The formula Parquet C++ was using is technically incorrect vs the
Parquet specification. The specification says that the max precision is
floor(log_10(2^(B*8 -1) - 1)), where the C++ implementation was omitting the
outer -1. However, this is okay as it is easy to prove that these values
will always be the same (ignoring the realities of FP arithmetic) & in
practice all three formulas agree through 128 when using FP.
Bug found through fuzzing.
Closes #13456 from tachyonwill/float_overflow
Authored-by: William Butler <[email protected]>
Signed-off-by: Micah Kornfield <[email protected]>
---
cpp/src/parquet/schema_test.cc | 28 ++++++++++++++++++++++++++++
cpp/src/parquet/types.cc | 8 +++++++-
2 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/cpp/src/parquet/schema_test.cc b/cpp/src/parquet/schema_test.cc
index 703bac8108..603d9ed8e2 100644
--- a/cpp/src/parquet/schema_test.cc
+++ b/cpp/src/parquet/schema_test.cc
@@ -1688,9 +1688,26 @@ TEST(TestSchemaNodeCreation, FactoryExceptions) {
ASSERT_ANY_THROW(PrimitiveNode::Make("interval", Repetition::REQUIRED,
IntervalLogicalType::Make(),
Type::FIXED_LEN_BYTE_ARRAY, 11));
+ // Scale is greater than precision.
+ ASSERT_ANY_THROW(PrimitiveNode::Make("decimal", Repetition::REQUIRED,
+ DecimalLogicalType::Make(10, 11),
Type::INT64));
+ ASSERT_ANY_THROW(PrimitiveNode::Make("decimal", Repetition::REQUIRED,
+ DecimalLogicalType::Make(17, 18),
Type::INT64));
// Primitive too small for given precision ...
ASSERT_ANY_THROW(PrimitiveNode::Make("decimal", Repetition::REQUIRED,
DecimalLogicalType::Make(16, 6),
Type::INT32));
+ ASSERT_ANY_THROW(PrimitiveNode::Make("decimal", Repetition::REQUIRED,
+ DecimalLogicalType::Make(10, 9),
Type::INT32));
+ ASSERT_ANY_THROW(PrimitiveNode::Make("decimal", Repetition::REQUIRED,
+ DecimalLogicalType::Make(19, 17),
Type::INT64));
+ ASSERT_ANY_THROW(PrimitiveNode::Make("decimal", Repetition::REQUIRED,
+ DecimalLogicalType::Make(308, 6),
+ Type::FIXED_LEN_BYTE_ARRAY, 128));
+ // Length is too long
+ ASSERT_ANY_THROW(PrimitiveNode::Make("decimal", Repetition::REQUIRED,
+ DecimalLogicalType::Make(10, 6),
+ Type::FIXED_LEN_BYTE_ARRAY, 891723283));
+
// Incompatible primitive length ...
ASSERT_ANY_THROW(PrimitiveNode::Make("uuid", Repetition::REQUIRED,
UUIDLogicalType::Make(),
@@ -1942,6 +1959,17 @@ TEST_F(TestDecimalSchemaElementConstruction,
DecimalCases) {
true, check_DECIMAL},
{"decimal", LogicalType::Decimal(11, 11), Type::INT64, -1, true,
ConvertedType::DECIMAL, true, check_DECIMAL},
+ {"decimal", LogicalType::Decimal(9, 9), Type::INT32, -1, true,
+ ConvertedType::DECIMAL, true, check_DECIMAL},
+ {"decimal", LogicalType::Decimal(18, 18), Type::INT64, -1, true,
+ ConvertedType::DECIMAL, true, check_DECIMAL},
+ {"decimal", LogicalType::Decimal(307, 7), Type::FIXED_LEN_BYTE_ARRAY,
128, true,
+ ConvertedType::DECIMAL, true, check_DECIMAL},
+ {"decimal", LogicalType::Decimal(310, 32), Type::FIXED_LEN_BYTE_ARRAY,
129, true,
+ ConvertedType::DECIMAL, true, check_DECIMAL},
+ {"decimal", LogicalType::Decimal(2147483645, 2147483645),
+ Type::FIXED_LEN_BYTE_ARRAY, 891723282, true, ConvertedType::DECIMAL,
true,
+ check_DECIMAL},
};
for (const SchemaElementConstructionArguments& c : cases) {
diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc
index ef23c40662..349fc682aa 100644
--- a/cpp/src/parquet/types.cc
+++ b/cpp/src/parquet/types.cc
@@ -909,8 +909,14 @@ bool
LogicalType::Impl::Decimal::is_applicable(parquet::Type::type primitive_typ
}
} break;
case parquet::Type::FIXED_LEN_BYTE_ARRAY: {
+ // If the primitive length is larger than this we will overflow int32
when
+ // calculating precision.
+ if (primitive_length <= 0 || primitive_length > 891723282) {
+ ok = false;
+ break;
+ }
ok = precision_ <= static_cast<int32_t>(std::floor(
- std::log10(std::pow(2.0, (8.0 * primitive_length)
- 1.0))));
+ std::log10(2) * ((8.0 * primitive_length) -
1.0)));
} break;
case parquet::Type::BYTE_ARRAY: {
ok = true;