This is an automated email from the ASF dual-hosted git repository. apitrou pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push: new caf4f70247 GH-46739: [C++] Fix Float16 signed zero/NaN equality comparisons (#46973) caf4f70247 is described below commit caf4f702479503b2da4f1a606248107e71a8d772 Author: Ben Harkins <60872452+beni...@users.noreply.github.com> AuthorDate: Thu Sep 4 05:23:52 2025 -0400 GH-46739: [C++] Fix Float16 signed zero/NaN equality comparisons (#46973) ### Rationale for this change Equality comparisons between half-floats (used in their scalar/array `Equals` methods) do not properly handle `EqualOptions::nans_equal` and `EqualOptions::signed_zeros_equal`. ### What changes are included in this PR? - Internal fixes to the current comparison behavior and additional tests as needed - Prevents Float16 NaNs from being randomly generated by test utilities by default (matching behavior for float/double) ### Are these changes tested? Yes ### Are there any user-facing changes? No * GitHub Issue: #46739 Authored-by: Benjamin Harkins <benphark...@gmail.com> Signed-off-by: Antoine Pitrou <anto...@python.org> --- cpp/src/arrow/array/array_test.cc | 50 +++++++++------- cpp/src/arrow/compare.cc | 5 +- cpp/src/arrow/scalar.h | 19 ++++++ cpp/src/arrow/scalar_test.cc | 84 +++++++++++++++----------- cpp/src/arrow/testing/random.cc | 112 ++++++++++++++++++++++++++++------- cpp/src/arrow/testing/random.h | 30 +++++++++- cpp/src/arrow/testing/random_test.cc | 30 +++++++++- cpp/src/arrow/type_fwd.h | 1 + cpp/src/arrow/type_traits.h | 5 ++ 9 files changed, 253 insertions(+), 83 deletions(-) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 85885ded14..4db76512d2 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -2120,16 +2120,21 @@ void CheckSliceApproxEquals() { ASSERT_TRUE(slice1->ApproxEquals(slice2)); } +template <typename ArrowType> +using NumericArgType = std::conditional_t<is_half_float_type<ArrowType>::value, Float16, + typename ArrowType::c_type>; + template <typename TYPE> void CheckFloatingNanEquality() { + using V = NumericArgType<TYPE>; std::shared_ptr<Array> a, b; std::shared_ptr<DataType> type = TypeTraits<TYPE>::type_singleton(); - const auto nan_value = static_cast<typename TYPE::c_type>(NAN); + const auto nan_value = std::numeric_limits<V>::quiet_NaN(); // NaN in a null entry - ArrayFromVector<TYPE>(type, {true, false}, {0.5, nan_value}, &a); - ArrayFromVector<TYPE>(type, {true, false}, {0.5, nan_value}, &b); + ArrayFromVector<TYPE, V>(type, {true, false}, {V(0.5), nan_value}, &a); + ArrayFromVector<TYPE, V>(type, {true, false}, {V(0.5), nan_value}, &b); ASSERT_TRUE(a->Equals(b)); ASSERT_TRUE(b->Equals(a)); ASSERT_TRUE(a->ApproxEquals(b)); @@ -2140,8 +2145,8 @@ void CheckFloatingNanEquality() { ASSERT_TRUE(b->RangeEquals(a, 1, 2, 1)); // NaN in a valid entry - ArrayFromVector<TYPE>(type, {false, true}, {0.5, nan_value}, &a); - ArrayFromVector<TYPE>(type, {false, true}, {0.5, nan_value}, &b); + ArrayFromVector<TYPE, V>(type, {false, true}, {V(0.5), nan_value}, &a); + ArrayFromVector<TYPE, V>(type, {false, true}, {V(0.5), nan_value}, &b); ASSERT_FALSE(a->Equals(b)); ASSERT_FALSE(b->Equals(a)); ASSERT_TRUE(a->Equals(b, EqualOptions().nans_equal(true))); @@ -2160,8 +2165,8 @@ void CheckFloatingNanEquality() { ASSERT_TRUE(b->RangeEquals(a, 0, 1, 0)); // NaN != non-NaN - ArrayFromVector<TYPE>(type, {false, true}, {0.5, nan_value}, &a); - ArrayFromVector<TYPE>(type, {false, true}, {0.5, 0.0}, &b); + ArrayFromVector<TYPE, V>(type, {false, true}, {V(0.5), nan_value}, &a); + ArrayFromVector<TYPE, V>(type, {false, true}, {V(0.5), V(0.0)}, &b); ASSERT_FALSE(a->Equals(b)); ASSERT_FALSE(b->Equals(a)); ASSERT_FALSE(a->Equals(b, EqualOptions().nans_equal(true))); @@ -2182,15 +2187,16 @@ void CheckFloatingNanEquality() { template <typename TYPE> void CheckFloatingInfinityEquality() { + using V = NumericArgType<TYPE>; std::shared_ptr<Array> a, b; std::shared_ptr<DataType> type = TypeTraits<TYPE>::type_singleton(); - const auto infinity = std::numeric_limits<typename TYPE::c_type>::infinity(); + const auto infinity = std::numeric_limits<V>::infinity(); for (auto nans_equal : {false, true}) { // Infinity in a null entry - ArrayFromVector<TYPE>(type, {true, false}, {0.5, infinity}, &a); - ArrayFromVector<TYPE>(type, {true, false}, {0.5, -infinity}, &b); + ArrayFromVector<TYPE, V>(type, {true, false}, {V(0.5), infinity}, &a); + ArrayFromVector<TYPE, V>(type, {true, false}, {V(0.5), -infinity}, &b); ASSERT_TRUE(a->Equals(b)); ASSERT_TRUE(b->Equals(a)); ASSERT_TRUE(a->ApproxEquals(b, EqualOptions().atol(1e-5).nans_equal(nans_equal))); @@ -2201,8 +2207,8 @@ void CheckFloatingInfinityEquality() { ASSERT_TRUE(b->RangeEquals(a, 1, 2, 1)); // Infinity in a valid entry - ArrayFromVector<TYPE>(type, {false, true}, {0.5, infinity}, &a); - ArrayFromVector<TYPE>(type, {false, true}, {0.5, infinity}, &b); + ArrayFromVector<TYPE, V>(type, {false, true}, {V(0.5), infinity}, &a); + ArrayFromVector<TYPE, V>(type, {false, true}, {V(0.5), infinity}, &b); ASSERT_TRUE(a->Equals(b)); ASSERT_TRUE(b->Equals(a)); ASSERT_TRUE(a->ApproxEquals(b, EqualOptions().atol(1e-5).nans_equal(nans_equal))); @@ -2219,8 +2225,8 @@ void CheckFloatingInfinityEquality() { ASSERT_TRUE(b->RangeEquals(a, 0, 1, 0)); // Infinity != non-infinity - ArrayFromVector<TYPE>(type, {false, true}, {0.5, -infinity}, &a); - ArrayFromVector<TYPE>(type, {false, true}, {0.5, 0.0}, &b); + ArrayFromVector<TYPE, V>(type, {false, true}, {V(0.5), -infinity}, &a); + ArrayFromVector<TYPE, V>(type, {false, true}, {V(0.5), V(0.0)}, &b); ASSERT_FALSE(a->Equals(b)); ASSERT_FALSE(b->Equals(a)); ASSERT_FALSE(a->ApproxEquals(b, EqualOptions().atol(1e-5).nans_equal(nans_equal))); @@ -2228,8 +2234,8 @@ void CheckFloatingInfinityEquality() { ASSERT_FALSE(a->ApproxEquals(b, EqualOptions().atol(1e-5).nans_equal(nans_equal))); ASSERT_FALSE(b->ApproxEquals(a, EqualOptions().atol(1e-5).nans_equal(nans_equal))); // Infinity != Negative infinity - ArrayFromVector<TYPE>(type, {true, true}, {0.5, -infinity}, &a); - ArrayFromVector<TYPE>(type, {true, true}, {0.5, infinity}, &b); + ArrayFromVector<TYPE, V>(type, {true, true}, {V(0.5), -infinity}, &a); + ArrayFromVector<TYPE, V>(type, {true, true}, {V(0.5), infinity}, &b); ASSERT_FALSE(a->Equals(b)); ASSERT_FALSE(b->Equals(a)); ASSERT_FALSE(a->ApproxEquals(b)); @@ -2249,11 +2255,12 @@ void CheckFloatingInfinityEquality() { template <typename TYPE> void CheckFloatingZeroEquality() { + using V = NumericArgType<TYPE>; std::shared_ptr<Array> a, b; std::shared_ptr<DataType> type = TypeTraits<TYPE>::type_singleton(); - ArrayFromVector<TYPE>(type, {true, false}, {0.0, 1.0}, &a); - ArrayFromVector<TYPE>(type, {true, false}, {0.0, 1.0}, &b); + ArrayFromVector<TYPE, V>(type, {true, false}, {V(0.0), V(1.0)}, &a); + ArrayFromVector<TYPE, V>(type, {true, false}, {V(0.0), V(1.0)}, &b); ASSERT_TRUE(a->Equals(b)); ASSERT_TRUE(b->Equals(a)); for (auto nans_equal : {false, true}) { @@ -2269,8 +2276,8 @@ void CheckFloatingZeroEquality() { } } - ArrayFromVector<TYPE>(type, {true, false}, {0.0, 1.0}, &a); - ArrayFromVector<TYPE>(type, {true, false}, {-0.0, 1.0}, &b); + ArrayFromVector<TYPE, V>(type, {true, false}, {V(0.0), V(1.0)}, &a); + ArrayFromVector<TYPE, V>(type, {true, false}, {V(-0.0), V(1.0)}, &b); for (auto nans_equal : {false, true}) { auto opts = EqualOptions().nans_equal(nans_equal); ASSERT_TRUE(a->Equals(b, opts)); @@ -2306,16 +2313,19 @@ TEST(TestPrimitiveAdHoc, FloatingSliceApproxEquals) { TEST(TestPrimitiveAdHoc, FloatingNanEquality) { CheckFloatingNanEquality<FloatType>(); CheckFloatingNanEquality<DoubleType>(); + CheckFloatingNanEquality<HalfFloatType>(); } TEST(TestPrimitiveAdHoc, FloatingInfinityEquality) { CheckFloatingInfinityEquality<FloatType>(); CheckFloatingInfinityEquality<DoubleType>(); + CheckFloatingInfinityEquality<HalfFloatType>(); } TEST(TestPrimitiveAdHoc, FloatingZeroEquality) { CheckFloatingZeroEquality<FloatType>(); CheckFloatingZeroEquality<DoubleType>(); + CheckFloatingZeroEquality<HalfFloatType>(); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 76fd47119e..d37325fa1a 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -110,7 +110,7 @@ struct FloatingEquality<uint16_t, Flags> { bool operator()(uint16_t x, uint16_t y) const { Float16 f_x = Float16::FromBits(x); Float16 f_y = Float16::FromBits(y); - if (x == y) { + if (f_x == f_y) { return Flags::signed_zeros_equal || (f_x.signbit() == f_y.signbit()); } if (Flags::nans_equal && f_x.is_nan() && f_y.is_nan()) { @@ -171,7 +171,8 @@ void VisitFloatingEquality(const EqualOptions& options, bool floating_approximat } inline bool IdentityImpliesEqualityNansNotEqual(const DataType& type) { - if (type.id() == Type::FLOAT || type.id() == Type::DOUBLE) { + if (type.id() == Type::FLOAT || type.id() == Type::DOUBLE || + type.id() == Type::HALF_FLOAT) { return false; } for (const auto& child : type.fields()) { diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index 7ef3730120..b96d930a44 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -37,6 +37,7 @@ #include "arrow/type_traits.h" #include "arrow/util/compare.h" #include "arrow/util/decimal.h" +#include "arrow/util/float16.h" #include "arrow/util/visibility.h" #include "arrow/visit_type_inline.h" @@ -245,6 +246,12 @@ struct ARROW_EXPORT UInt64Scalar : public NumericScalar<UInt64Type> { struct ARROW_EXPORT HalfFloatScalar : public NumericScalar<HalfFloatType> { using NumericScalar<HalfFloatType>::NumericScalar; + + explicit HalfFloatScalar(util::Float16 value) + : NumericScalar(value.bits(), float16()) {} + + HalfFloatScalar(util::Float16 value, std::shared_ptr<DataType> type) + : NumericScalar(value.bits(), std::move(type)) {} }; struct ARROW_EXPORT FloatScalar : public NumericScalar<FloatType> { @@ -969,6 +976,18 @@ struct MakeScalarImpl { return Status::OK(); } + // This isn't captured by the generic case above because `util::Float16` isn't implicity + // convertible to `uint16_t` (HalfFloat's ValueType) + template <typename T> + std::enable_if_t<std::is_same_v<std::decay_t<ValueRef>, util::Float16> && + is_half_float_type<T>::value, + Status> + Visit(const T& t) { + out_ = std::make_shared<HalfFloatScalar>(static_cast<ValueRef>(value_), + std::move(type_)); + return Status::OK(); + } + Status Visit(const ExtensionType& t) { ARROW_ASSIGN_OR_RAISE(auto storage, MakeScalar(t.storage_type(), static_cast<ValueRef>(value_))); diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index 422f688957..4a34e5d13c 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -39,6 +39,7 @@ #include "arrow/testing/random.h" #include "arrow/testing/util.h" #include "arrow/type_traits.h" +#include "arrow/util/float16.h" namespace arrow { @@ -46,6 +47,7 @@ using compute::Cast; using compute::CastOptions; using internal::checked_cast; using internal::checked_pointer_cast; +using util::Float16; std::shared_ptr<Scalar> CheckMakeNullScalar(const std::shared_ptr<DataType>& type) { const auto scalar = MakeNullScalar(type); @@ -201,22 +203,33 @@ TEST(TestScalar, IdentityCast) { */ } +template <typename ArrowType> +using NumericArgType = std::conditional_t<is_half_float_type<ArrowType>::value, Float16, + typename ArrowType::c_type>; + template <typename T> class TestNumericScalar : public ::testing::Test { public: TestNumericScalar() = default; }; -TYPED_TEST_SUITE(TestNumericScalar, NumericArrowTypes); +using NumericArrowTypesPlusHalfFloat = + testing::Types<UInt8Type, UInt16Type, UInt32Type, UInt64Type, Int8Type, Int16Type, + Int32Type, Int64Type, FloatType, DoubleType, HalfFloatType>; +TYPED_TEST_SUITE(TestNumericScalar, NumericArrowTypesPlusHalfFloat); TYPED_TEST(TestNumericScalar, Basics) { - using T = typename TypeParam::c_type; + using T = NumericArgType<TypeParam>; using ScalarType = typename TypeTraits<TypeParam>::ScalarType; T value = static_cast<T>(1); auto scalar_val = std::make_shared<ScalarType>(value); - ASSERT_EQ(value, scalar_val->value); + if constexpr (is_half_float_type<TypeParam>::value) { + ASSERT_EQ(value, Float16::FromBits(scalar_val->value)); + } else { + ASSERT_EQ(value, scalar_val->value); + } ASSERT_TRUE(scalar_val->is_valid); ASSERT_OK(scalar_val->ValidateFull()); @@ -227,8 +240,13 @@ TYPED_TEST(TestNumericScalar, Basics) { auto scalar_other = std::make_shared<ScalarType>(other_value); ASSERT_NE(*scalar_other, *scalar_val); - scalar_val->value = other_value; - ASSERT_EQ(other_value, scalar_val->value); + if constexpr (is_half_float_type<TypeParam>::value) { + scalar_val->value = other_value.bits(); + ASSERT_EQ(other_value, Float16::FromBits(scalar_val->value)); + } else { + scalar_val->value = other_value; + ASSERT_EQ(other_value, scalar_val->value); + } ASSERT_EQ(*scalar_other, *scalar_val); ScalarType stack_val; @@ -255,72 +273,72 @@ TYPED_TEST(TestNumericScalar, Basics) { ASSERT_OK(two->ValidateFull()); ASSERT_TRUE(null->Equals(*null_value)); - ASSERT_TRUE(one->Equals(ScalarType(1))); - ASSERT_FALSE(one->Equals(ScalarType(2))); - ASSERT_TRUE(two->Equals(ScalarType(2))); - ASSERT_FALSE(two->Equals(ScalarType(3))); + ASSERT_TRUE(one->Equals(ScalarType(static_cast<T>(1)))); + ASSERT_FALSE(one->Equals(ScalarType(static_cast<T>(2)))); + ASSERT_TRUE(two->Equals(ScalarType(static_cast<T>(2)))); + ASSERT_FALSE(two->Equals(ScalarType(static_cast<T>(3)))); ASSERT_TRUE(null->ApproxEquals(*null_value)); - ASSERT_TRUE(one->ApproxEquals(ScalarType(1))); - ASSERT_FALSE(one->ApproxEquals(ScalarType(2))); - ASSERT_TRUE(two->ApproxEquals(ScalarType(2))); - ASSERT_FALSE(two->ApproxEquals(ScalarType(3))); + ASSERT_TRUE(one->ApproxEquals(ScalarType(static_cast<T>(1)))); + ASSERT_FALSE(one->ApproxEquals(ScalarType(static_cast<T>(2)))); + ASSERT_TRUE(two->ApproxEquals(ScalarType(static_cast<T>(2)))); + ASSERT_FALSE(two->ApproxEquals(ScalarType(static_cast<T>(3)))); } TYPED_TEST(TestNumericScalar, Hashing) { - using T = typename TypeParam::c_type; + using T = NumericArgType<TypeParam>; using ScalarType = typename TypeTraits<TypeParam>::ScalarType; std::unordered_set<std::shared_ptr<Scalar>, Scalar::Hash, Scalar::PtrsEqual> set; set.emplace(std::make_shared<ScalarType>()); - for (T i = 0; i < 10; ++i) { - set.emplace(std::make_shared<ScalarType>(i)); + for (int i = 0; i < 10; ++i) { + ASSERT_TRUE(set.emplace(std::make_shared<ScalarType>(static_cast<T>(i))).second); } ASSERT_FALSE(set.emplace(std::make_shared<ScalarType>()).second); - for (T i = 0; i < 10; ++i) { - ASSERT_FALSE(set.emplace(std::make_shared<ScalarType>(i)).second); + for (int i = 0; i < 10; ++i) { + ASSERT_FALSE(set.emplace(std::make_shared<ScalarType>(static_cast<T>(i))).second); } } TYPED_TEST(TestNumericScalar, MakeScalar) { - using T = typename TypeParam::c_type; + using T = NumericArgType<TypeParam>; using ScalarType = typename TypeTraits<TypeParam>::ScalarType; auto type = TypeTraits<TypeParam>::type_singleton(); std::shared_ptr<Scalar> three = MakeScalar(static_cast<T>(3)); ASSERT_OK(three->ValidateFull()); - ASSERT_EQ(ScalarType(3), *three); + ASSERT_EQ(ScalarType(static_cast<T>(3)), *three); - AssertMakeScalar(ScalarType(3), type, static_cast<T>(3)); + AssertMakeScalar(ScalarType(static_cast<T>(3)), type, static_cast<T>(3)); - AssertParseScalar(type, "3", ScalarType(3)); + AssertParseScalar(type, "3", ScalarType(static_cast<T>(3))); } template <typename T> class TestRealScalar : public ::testing::Test { public: - using CType = typename T::c_type; + using ValueType = NumericArgType<T>; using ScalarType = typename TypeTraits<T>::ScalarType; void SetUp() { type_ = TypeTraits<T>::type_singleton(); - scalar_val_ = std::make_shared<ScalarType>(static_cast<CType>(1)); + scalar_val_ = std::make_shared<ScalarType>(static_cast<ValueType>(1)); ASSERT_TRUE(scalar_val_->is_valid); - scalar_other_ = std::make_shared<ScalarType>(static_cast<CType>(1.1)); + scalar_other_ = std::make_shared<ScalarType>(static_cast<ValueType>(1.1)); ASSERT_TRUE(scalar_other_->is_valid); - scalar_zero_ = std::make_shared<ScalarType>(static_cast<CType>(0.0)); - scalar_other_zero_ = std::make_shared<ScalarType>(static_cast<CType>(0.0)); - scalar_neg_zero_ = std::make_shared<ScalarType>(static_cast<CType>(-0.0)); + scalar_zero_ = std::make_shared<ScalarType>(static_cast<ValueType>(0.0)); + scalar_other_zero_ = std::make_shared<ScalarType>(static_cast<ValueType>(0.0)); + scalar_neg_zero_ = std::make_shared<ScalarType>(static_cast<ValueType>(-0.0)); - const CType nan_value = std::numeric_limits<CType>::quiet_NaN(); + const auto nan_value = std::numeric_limits<ValueType>::quiet_NaN(); scalar_nan_ = std::make_shared<ScalarType>(nan_value); ASSERT_TRUE(scalar_nan_->is_valid); - const CType other_nan_value = std::numeric_limits<CType>::quiet_NaN(); + const auto other_nan_value = std::numeric_limits<ValueType>::quiet_NaN(); scalar_other_nan_ = std::make_shared<ScalarType>(other_nan_value); ASSERT_TRUE(scalar_other_nan_->is_valid); } @@ -522,7 +540,9 @@ class TestRealScalar : public ::testing::Test { scalar_zero_, scalar_other_zero_, scalar_neg_zero_; }; -TYPED_TEST_SUITE(TestRealScalar, RealArrowTypes); +using RealArrowTypesPlusHalfFloat = + ::testing::Types<FloatType, DoubleType, HalfFloatType>; +TYPED_TEST_SUITE(TestRealScalar, RealArrowTypesPlusHalfFloat); TYPED_TEST(TestRealScalar, NanEquals) { this->TestNanEquals(); } @@ -1181,8 +1201,6 @@ TEST(TestDayTimeIntervalScalars, Basics) { ASSERT_TRUE(first->Equals(ts_val2)); } -// TODO test HalfFloatScalar - TYPED_TEST(TestNumericScalar, Cast) { auto type = TypeTraits<TypeParam>::type_singleton(); diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index 2d6ba44d7e..5f95638b7d 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -43,6 +43,7 @@ #include "arrow/util/bitmap_reader.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" +#include "arrow/util/float16.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging_internal.h" #include "arrow/util/pcg_random.h" @@ -54,55 +55,85 @@ namespace arrow { using internal::checked_cast; using internal::checked_pointer_cast; using internal::ToChars; +using util::Float16; namespace random { namespace { +template <typename ValueType, typename DistributionType> +struct GeneratorFactory { + GeneratorFactory(ValueType min, ValueType max) : min_(min), max_(max) {} + + auto operator()(pcg32_fast* rng) const { + return [dist = DistributionType(min_, max_), rng]() mutable { + return static_cast<ValueType>(dist(*rng)); + }; + } + + private: + ValueType min_; + ValueType max_; +}; + +template <typename DistributionType> +struct GeneratorFactory<Float16, DistributionType> { + GeneratorFactory(Float16 min, Float16 max) : min_(min.ToFloat()), max_(max.ToFloat()) {} + + auto operator()(pcg32_fast* rng) const { + return [dist = DistributionType(min_, max_), rng]() mutable { + return Float16(dist(*rng)).bits(); + }; + } + + private: + float min_; + float max_; +}; + template <typename ValueType, typename DistributionType> struct GenerateOptions { + static constexpr bool kIsHalfFloat = std::is_same_v<ValueType, Float16>; + using PhysicalType = std::conditional_t<kIsHalfFloat, HalfFloatType::c_type, ValueType>; + using FactoryType = GeneratorFactory<ValueType, DistributionType>; + GenerateOptions(SeedType seed, ValueType min, ValueType max, double probability, double nan_probability = 0.0) - : min_(min), - max_(max), + : generator_factory_(FactoryType(min, max)), seed_(seed), probability_(probability), nan_probability_(nan_probability) {} void GenerateData(uint8_t* buffer, size_t n) { - GenerateTypedData(reinterpret_cast<ValueType*>(buffer), n); + GenerateTypedData(reinterpret_cast<PhysicalType*>(buffer), n); } template <typename V> - typename std::enable_if<!std::is_floating_point<V>::value>::type GenerateTypedData( - V* data, size_t n) { + typename std::enable_if<!std::is_floating_point_v<V> && !kIsHalfFloat>::type + GenerateTypedData(V* data, size_t n) { GenerateTypedDataNoNan(data, n); } template <typename V> - typename std::enable_if<std::is_floating_point<V>::value>::type GenerateTypedData( - V* data, size_t n) { + typename std::enable_if<std::is_floating_point_v<V> || kIsHalfFloat>::type + GenerateTypedData(V* data, size_t n) { if (nan_probability_ == 0.0) { GenerateTypedDataNoNan(data, n); return; } pcg32_fast rng(seed_++); - DistributionType dist(min_, max_); + auto gen = generator_factory_(&rng); ::arrow::random::bernoulli_distribution nan_dist(nan_probability_); - const ValueType nan_value = std::numeric_limits<ValueType>::quiet_NaN(); + const PhysicalType nan_value = get_nan(); - // A static cast is required due to the int16 -> int8 handling. - std::generate(data, data + n, [&] { - return nan_dist(rng) ? nan_value : static_cast<ValueType>(dist(rng)); - }); + std::generate(data, data + n, [&] { return nan_dist(rng) ? nan_value : gen(); }); } - void GenerateTypedDataNoNan(ValueType* data, size_t n) { + void GenerateTypedDataNoNan(PhysicalType* data, size_t n) { pcg32_fast rng(seed_++); - DistributionType dist(min_, max_); + auto gen = generator_factory_(&rng); - // A static cast is required due to the int16 -> int8 handling. - std::generate(data, data + n, [&] { return static_cast<ValueType>(dist(rng)); }); + std::generate(data, data + n, [&] { return gen(); }); } void GenerateBitmap(uint8_t* buffer, size_t n, int64_t* null_count) { @@ -121,8 +152,15 @@ struct GenerateOptions { if (null_count != nullptr) *null_count = count; } - ValueType min_; - ValueType max_; + static constexpr PhysicalType get_nan() { + if constexpr (kIsHalfFloat) { + return std::numeric_limits<ValueType>::quiet_NaN().bits(); + } else { + return std::numeric_limits<ValueType>::quiet_NaN(); + } + } + + FactoryType generator_factory_; SeedType seed_; double probability_; double nan_probability_; @@ -228,8 +266,6 @@ PRIMITIVE_RAND_INTEGER_IMPL(UInt32, uint32_t, UInt32Type) PRIMITIVE_RAND_INTEGER_IMPL(Int32, int32_t, Int32Type) PRIMITIVE_RAND_INTEGER_IMPL(UInt64, uint64_t, UInt64Type) PRIMITIVE_RAND_INTEGER_IMPL(Int64, int64_t, Int64Type) -// Generate 16bit values for half-float -PRIMITIVE_RAND_INTEGER_IMPL(Float16, int16_t, HalfFloatType) std::shared_ptr<Array> RandomArrayGenerator::Date64(int64_t size, int64_t min, int64_t max, double null_probability, @@ -241,6 +277,25 @@ std::shared_ptr<Array> RandomArrayGenerator::Date64(int64_t size, int64_t min, memory_pool); } +std::shared_ptr<Array> RandomArrayGenerator::Float16(int64_t size, uint16_t min, + uint16_t max, + double null_probability, + int64_t alignment, + MemoryPool* memory_pool) { + return this->Float16(size, Float16::FromBits(min), Float16::FromBits(max), + null_probability, /*nan_probability=*/0, alignment, memory_pool); +} + +std::shared_ptr<Array> RandomArrayGenerator::Float16( + int64_t size, util::Float16 min, util::Float16 max, double null_probability, + double nan_probability, int64_t alignment, MemoryPool* memory_pool) { + using OptionType = + GenerateOptions<util::Float16, ::arrow::random::uniform_real_distribution<float>>; + OptionType options(seed(), min, max, null_probability, nan_probability); + return GenerateNumericArray<HalfFloatType, OptionType>(size, options, alignment, + memory_pool); +} + std::shared_ptr<Array> RandomArrayGenerator::Float32(int64_t size, float min, float max, double null_probability, double nan_probability, @@ -1089,10 +1144,23 @@ std::shared_ptr<Array> RandomArrayGenerator::ArrayOf(const Field& field, int64_t GENERATE_INTEGRAL_CASE(Int32Type); GENERATE_INTEGRAL_CASE(UInt64Type); GENERATE_INTEGRAL_CASE(Int64Type); - GENERATE_INTEGRAL_CASE_VIEW(Int16Type, HalfFloatType); GENERATE_FLOATING_CASE(FloatType, Float32); GENERATE_FLOATING_CASE(DoubleType, Float64); + case Type::type::HALF_FLOAT: { + using ValueType = util::Float16; + const ValueType min_value = GetMetadata<ValueType>( + field.metadata().get(), "min", std::numeric_limits<ValueType>::min()); + const ValueType max_value = GetMetadata<ValueType>( + field.metadata().get(), "max", std::numeric_limits<ValueType>::max()); + const double nan_probability = + GetMetadata<double>(field.metadata().get(), "nan_probability", 0); + VALIDATE_MIN_MAX(min_value, max_value); + VALIDATE_RANGE(nan_probability, 0.0, 1.0); + return Float16(length, min_value, max_value, null_probability, nan_probability, + alignment, memory_pool); + } + case Type::type::STRING: case Type::type::BINARY: { const auto min_length = diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index ad87b12105..d9122915a0 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -28,6 +28,7 @@ #include "arrow/testing/uniform_real.h" #include "arrow/testing/visibility.h" #include "arrow/type.h" +#include "arrow/util/float16.h" namespace arrow { @@ -198,11 +199,33 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { /// \param[in] memory_pool memory pool to allocate memory from /// /// \return a generated Array - std::shared_ptr<Array> Float16(int64_t size, int16_t min, int16_t max, + /// + /// \deprecated Deprecated in 22.0.0. Use the other Float16() method that accepts + /// nan_probability as a parameter + ARROW_DEPRECATED( + "Deprecated in 22.0.0. Use the other Float16() method that accepts nan_probability " + "as a parameter") + std::shared_ptr<Array> Float16(int64_t size, uint16_t min, uint16_t max, double null_probability = 0, int64_t alignment = kDefaultBufferAlignment, MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random HalfFloatArray + /// + /// \param[in] size the size of the array to generate + /// \param[in] min the lower bound of the uniform distribution + /// \param[in] max the upper bound of the uniform distribution + /// \param[in] null_probability the probability of a value being null + /// \param[in] nan_probability the probability of a value being NaN + /// \param[in] alignment alignment for memory allocations (in bytes) + /// \param[in] memory_pool memory pool to allocate memory from + /// + /// \return a generated Array + std::shared_ptr<Array> Float16(int64_t size, util::Float16 min, util::Float16 max, + double null_probability = 0, double nan_probability = 0, + int64_t alignment = kDefaultBufferAlignment, + MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random FloatArray /// /// \param[in] size the size of the array to generate @@ -281,8 +304,9 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { return Int64(size, static_cast<int64_t>(min), static_cast<int64_t>(max), null_probability, alignment, memory_pool); case Type::HALF_FLOAT: - return Float16(size, static_cast<int16_t>(min), static_cast<int16_t>(max), - null_probability, alignment, memory_pool); + return Float16(size, util::Float16::FromBits(static_cast<uint16_t>(min)), + util::Float16::FromBits(static_cast<uint16_t>(max)), + null_probability, /*nan_probability=*/0, alignment, memory_pool); case Type::FLOAT: return Float32(size, static_cast<float>(min), static_cast<float>(max), null_probability, /*nan_probability=*/0, alignment, memory_pool); diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc index 6f8621f8e9..279fb6dc91 100644 --- a/cpp/src/arrow/testing/random_test.cc +++ b/cpp/src/arrow/testing/random_test.cc @@ -26,12 +26,14 @@ #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" +#include "arrow/util/float16.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/pcg_random.h" namespace arrow { using internal::checked_cast; +using util::Float16; namespace random { @@ -242,8 +244,14 @@ TYPED_TEST(RandomNumericArrayTest, GenerateMinMax) { auto array = this->Downcast(batch->column(0)); for (auto slot : *array) { if (!slot.has_value()) continue; - ASSERT_GE(slot, typename TypeParam::c_type(0)); - ASSERT_LE(slot, typename TypeParam::c_type(127)); + if constexpr (is_half_float_type<TypeParam>::value) { + const auto f16_slot = Float16::FromBits(*slot); + ASSERT_GE(f16_slot, Float16(0)); + ASSERT_LE(f16_slot, Float16(127)); + } else { + ASSERT_GE(slot, typename TypeParam::c_type(0)); + ASSERT_LE(slot, typename TypeParam::c_type(127)); + } } } @@ -256,7 +264,11 @@ TYPED_TEST(RandomNumericArrayTest, EmptyRange) { auto array = this->Downcast(batch->column(0)); for (auto slot : *array) { if (!slot.has_value()) continue; - ASSERT_EQ(slot, typename TypeParam::c_type(42)); + if constexpr (is_half_float_type<TypeParam>::value) { + ASSERT_EQ(Float16::FromBits(*slot), Float16(42)); + } else { + ASSERT_EQ(slot, typename TypeParam::c_type(42)); + } } } @@ -359,6 +371,18 @@ TEST(TypeSpecificTests, DictionaryValues) { ASSERT_EQ(16, array->dictionary()->length()); } +TEST(TypeSpecificTests, Float16Nan) { + auto field = arrow::field("float16", float16(), + key_value_metadata({{"nan_probability", "1.0"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast<NumericArray<HalfFloatType>>(base_array); + ASSERT_OK(array->ValidateFull()); + for (const auto& value : *array) { + ASSERT_TRUE(!value.has_value() || Float16::FromBits(*value).is_nan()); + } +} + TEST(TypeSpecificTests, Float32Nan) { auto field = arrow::field("float32", float32(), key_value_metadata({{"nan_probability", "1.0"}})); diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index dc290cd327..be26c40dc1 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -46,6 +46,7 @@ class Future; namespace util { class Codec; class CodecOptions; +class Float16; } // namespace util class Buffer; diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 90c110a96b..1b7a02e108 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -316,6 +316,11 @@ struct TypeTraits<HalfFloatType> { static inline std::shared_ptr<DataType> type_singleton() { return float16(); } }; +template <> +struct CTypeTraits<util::Float16> : public TypeTraits<HalfFloatType> { + using ArrowType = HalfFloatType; +}; + template <> struct TypeTraits<Decimal32Type> { using ArrayType = Decimal32Array;