(arrow) branch main updated: GH-46739: [C++] Fix Float16 signed zero/NaN equality comparisons (#46973)

apitrou Thu, 04 Sep 2025 03:00:25 -0700

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/main by this push:
     new caf4f70247 GH-46739: [C++] Fix Float16 signed zero/NaN equality 
comparisons (#46973)
caf4f70247 is described below

commit caf4f702479503b2da4f1a606248107e71a8d772
Author: Ben Harkins <60872452+beni...@users.noreply.github.com>
AuthorDate: Thu Sep 4 05:23:52 2025 -0400

    GH-46739: [C++] Fix Float16 signed zero/NaN equality comparisons (#46973)
    
    ### Rationale for this change
    
    Equality comparisons between half-floats (used in their scalar/array 
`Equals` methods) do not properly handle `EqualOptions::nans_equal` and 
`EqualOptions::signed_zeros_equal`.
    
    ### What changes are included in this PR?
    
    - Internal fixes to the current comparison behavior and additional tests as 
needed
    - Prevents Float16 NaNs from being randomly generated by test utilities by 
default (matching behavior for float/double)
    
    ### Are these changes tested?
    
    Yes
    
    ### Are there any user-facing changes?
    
    No
    
    * GitHub Issue: #46739
    
    Authored-by: Benjamin Harkins <benphark...@gmail.com>
    Signed-off-by: Antoine Pitrou <anto...@python.org>
---
 cpp/src/arrow/array/array_test.cc    |  50 +++++++++-------
 cpp/src/arrow/compare.cc             |   5 +-
 cpp/src/arrow/scalar.h               |  19 ++++++
 cpp/src/arrow/scalar_test.cc         |  84 +++++++++++++++-----------
 cpp/src/arrow/testing/random.cc      | 112 ++++++++++++++++++++++++++++-------
 cpp/src/arrow/testing/random.h       |  30 +++++++++-
 cpp/src/arrow/testing/random_test.cc |  30 +++++++++-
 cpp/src/arrow/type_fwd.h             |   1 +
 cpp/src/arrow/type_traits.h          |   5 ++
 9 files changed, 253 insertions(+), 83 deletions(-)

diff --git a/cpp/src/arrow/array/array_test.cc 
b/cpp/src/arrow/array/array_test.cc
index 85885ded14..4db76512d2 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -2120,16 +2120,21 @@ void CheckSliceApproxEquals() {
   ASSERT_TRUE(slice1->ApproxEquals(slice2));
 }
 
+template <typename ArrowType>
+using NumericArgType = 
std::conditional_t<is_half_float_type<ArrowType>::value, Float16,
+                                          typename ArrowType::c_type>;
+
 template <typename TYPE>
 void CheckFloatingNanEquality() {
+  using V = NumericArgType<TYPE>;
   std::shared_ptr<Array> a, b;
   std::shared_ptr<DataType> type = TypeTraits<TYPE>::type_singleton();
 
-  const auto nan_value = static_cast<typename TYPE::c_type>(NAN);
+  const auto nan_value = std::numeric_limits<V>::quiet_NaN();
 
   // NaN in a null entry
-  ArrayFromVector<TYPE>(type, {true, false}, {0.5, nan_value}, &a);
-  ArrayFromVector<TYPE>(type, {true, false}, {0.5, nan_value}, &b);
+  ArrayFromVector<TYPE, V>(type, {true, false}, {V(0.5), nan_value}, &a);
+  ArrayFromVector<TYPE, V>(type, {true, false}, {V(0.5), nan_value}, &b);
   ASSERT_TRUE(a->Equals(b));
   ASSERT_TRUE(b->Equals(a));
   ASSERT_TRUE(a->ApproxEquals(b));
@@ -2140,8 +2145,8 @@ void CheckFloatingNanEquality() {
   ASSERT_TRUE(b->RangeEquals(a, 1, 2, 1));
 
   // NaN in a valid entry
-  ArrayFromVector<TYPE>(type, {false, true}, {0.5, nan_value}, &a);
-  ArrayFromVector<TYPE>(type, {false, true}, {0.5, nan_value}, &b);
+  ArrayFromVector<TYPE, V>(type, {false, true}, {V(0.5), nan_value}, &a);
+  ArrayFromVector<TYPE, V>(type, {false, true}, {V(0.5), nan_value}, &b);
   ASSERT_FALSE(a->Equals(b));
   ASSERT_FALSE(b->Equals(a));
   ASSERT_TRUE(a->Equals(b, EqualOptions().nans_equal(true)));
@@ -2160,8 +2165,8 @@ void CheckFloatingNanEquality() {
   ASSERT_TRUE(b->RangeEquals(a, 0, 1, 0));
 
   // NaN != non-NaN
-  ArrayFromVector<TYPE>(type, {false, true}, {0.5, nan_value}, &a);
-  ArrayFromVector<TYPE>(type, {false, true}, {0.5, 0.0}, &b);
+  ArrayFromVector<TYPE, V>(type, {false, true}, {V(0.5), nan_value}, &a);
+  ArrayFromVector<TYPE, V>(type, {false, true}, {V(0.5), V(0.0)}, &b);
   ASSERT_FALSE(a->Equals(b));
   ASSERT_FALSE(b->Equals(a));
   ASSERT_FALSE(a->Equals(b, EqualOptions().nans_equal(true)));
@@ -2182,15 +2187,16 @@ void CheckFloatingNanEquality() {
 
 template <typename TYPE>
 void CheckFloatingInfinityEquality() {
+  using V = NumericArgType<TYPE>;
   std::shared_ptr<Array> a, b;
   std::shared_ptr<DataType> type = TypeTraits<TYPE>::type_singleton();
 
-  const auto infinity = std::numeric_limits<typename TYPE::c_type>::infinity();
+  const auto infinity = std::numeric_limits<V>::infinity();
 
   for (auto nans_equal : {false, true}) {
     // Infinity in a null entry
-    ArrayFromVector<TYPE>(type, {true, false}, {0.5, infinity}, &a);
-    ArrayFromVector<TYPE>(type, {true, false}, {0.5, -infinity}, &b);
+    ArrayFromVector<TYPE, V>(type, {true, false}, {V(0.5), infinity}, &a);
+    ArrayFromVector<TYPE, V>(type, {true, false}, {V(0.5), -infinity}, &b);
     ASSERT_TRUE(a->Equals(b));
     ASSERT_TRUE(b->Equals(a));
     ASSERT_TRUE(a->ApproxEquals(b, 
EqualOptions().atol(1e-5).nans_equal(nans_equal)));
@@ -2201,8 +2207,8 @@ void CheckFloatingInfinityEquality() {
     ASSERT_TRUE(b->RangeEquals(a, 1, 2, 1));
 
     // Infinity in a valid entry
-    ArrayFromVector<TYPE>(type, {false, true}, {0.5, infinity}, &a);
-    ArrayFromVector<TYPE>(type, {false, true}, {0.5, infinity}, &b);
+    ArrayFromVector<TYPE, V>(type, {false, true}, {V(0.5), infinity}, &a);
+    ArrayFromVector<TYPE, V>(type, {false, true}, {V(0.5), infinity}, &b);
     ASSERT_TRUE(a->Equals(b));
     ASSERT_TRUE(b->Equals(a));
     ASSERT_TRUE(a->ApproxEquals(b, 
EqualOptions().atol(1e-5).nans_equal(nans_equal)));
@@ -2219,8 +2225,8 @@ void CheckFloatingInfinityEquality() {
     ASSERT_TRUE(b->RangeEquals(a, 0, 1, 0));
 
     // Infinity != non-infinity
-    ArrayFromVector<TYPE>(type, {false, true}, {0.5, -infinity}, &a);
-    ArrayFromVector<TYPE>(type, {false, true}, {0.5, 0.0}, &b);
+    ArrayFromVector<TYPE, V>(type, {false, true}, {V(0.5), -infinity}, &a);
+    ArrayFromVector<TYPE, V>(type, {false, true}, {V(0.5), V(0.0)}, &b);
     ASSERT_FALSE(a->Equals(b));
     ASSERT_FALSE(b->Equals(a));
     ASSERT_FALSE(a->ApproxEquals(b, 
EqualOptions().atol(1e-5).nans_equal(nans_equal)));
@@ -2228,8 +2234,8 @@ void CheckFloatingInfinityEquality() {
     ASSERT_FALSE(a->ApproxEquals(b, 
EqualOptions().atol(1e-5).nans_equal(nans_equal)));
     ASSERT_FALSE(b->ApproxEquals(a, 
EqualOptions().atol(1e-5).nans_equal(nans_equal)));
     // Infinity != Negative infinity
-    ArrayFromVector<TYPE>(type, {true, true}, {0.5, -infinity}, &a);
-    ArrayFromVector<TYPE>(type, {true, true}, {0.5, infinity}, &b);
+    ArrayFromVector<TYPE, V>(type, {true, true}, {V(0.5), -infinity}, &a);
+    ArrayFromVector<TYPE, V>(type, {true, true}, {V(0.5), infinity}, &b);
     ASSERT_FALSE(a->Equals(b));
     ASSERT_FALSE(b->Equals(a));
     ASSERT_FALSE(a->ApproxEquals(b));
@@ -2249,11 +2255,12 @@ void CheckFloatingInfinityEquality() {
 
 template <typename TYPE>
 void CheckFloatingZeroEquality() {
+  using V = NumericArgType<TYPE>;
   std::shared_ptr<Array> a, b;
   std::shared_ptr<DataType> type = TypeTraits<TYPE>::type_singleton();
 
-  ArrayFromVector<TYPE>(type, {true, false}, {0.0, 1.0}, &a);
-  ArrayFromVector<TYPE>(type, {true, false}, {0.0, 1.0}, &b);
+  ArrayFromVector<TYPE, V>(type, {true, false}, {V(0.0), V(1.0)}, &a);
+  ArrayFromVector<TYPE, V>(type, {true, false}, {V(0.0), V(1.0)}, &b);
   ASSERT_TRUE(a->Equals(b));
   ASSERT_TRUE(b->Equals(a));
   for (auto nans_equal : {false, true}) {
@@ -2269,8 +2276,8 @@ void CheckFloatingZeroEquality() {
     }
   }
 
-  ArrayFromVector<TYPE>(type, {true, false}, {0.0, 1.0}, &a);
-  ArrayFromVector<TYPE>(type, {true, false}, {-0.0, 1.0}, &b);
+  ArrayFromVector<TYPE, V>(type, {true, false}, {V(0.0), V(1.0)}, &a);
+  ArrayFromVector<TYPE, V>(type, {true, false}, {V(-0.0), V(1.0)}, &b);
   for (auto nans_equal : {false, true}) {
     auto opts = EqualOptions().nans_equal(nans_equal);
     ASSERT_TRUE(a->Equals(b, opts));
@@ -2306,16 +2313,19 @@ TEST(TestPrimitiveAdHoc, FloatingSliceApproxEquals) {
 TEST(TestPrimitiveAdHoc, FloatingNanEquality) {
   CheckFloatingNanEquality<FloatType>();
   CheckFloatingNanEquality<DoubleType>();
+  CheckFloatingNanEquality<HalfFloatType>();
 }
 
 TEST(TestPrimitiveAdHoc, FloatingInfinityEquality) {
   CheckFloatingInfinityEquality<FloatType>();
   CheckFloatingInfinityEquality<DoubleType>();
+  CheckFloatingInfinityEquality<HalfFloatType>();
 }
 
 TEST(TestPrimitiveAdHoc, FloatingZeroEquality) {
   CheckFloatingZeroEquality<FloatType>();
   CheckFloatingZeroEquality<DoubleType>();
+  CheckFloatingZeroEquality<HalfFloatType>();
 }
 
 // ----------------------------------------------------------------------
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index 76fd47119e..d37325fa1a 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -110,7 +110,7 @@ struct FloatingEquality<uint16_t, Flags> {
   bool operator()(uint16_t x, uint16_t y) const {
     Float16 f_x = Float16::FromBits(x);
     Float16 f_y = Float16::FromBits(y);
-    if (x == y) {
+    if (f_x == f_y) {
       return Flags::signed_zeros_equal || (f_x.signbit() == f_y.signbit());
     }
     if (Flags::nans_equal && f_x.is_nan() && f_y.is_nan()) {
@@ -171,7 +171,8 @@ void VisitFloatingEquality(const EqualOptions& options, 
bool floating_approximat
 }
 
 inline bool IdentityImpliesEqualityNansNotEqual(const DataType& type) {
-  if (type.id() == Type::FLOAT || type.id() == Type::DOUBLE) {
+  if (type.id() == Type::FLOAT || type.id() == Type::DOUBLE ||
+      type.id() == Type::HALF_FLOAT) {
     return false;
   }
   for (const auto& child : type.fields()) {
diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h
index 7ef3730120..b96d930a44 100644
--- a/cpp/src/arrow/scalar.h
+++ b/cpp/src/arrow/scalar.h
@@ -37,6 +37,7 @@
 #include "arrow/type_traits.h"
 #include "arrow/util/compare.h"
 #include "arrow/util/decimal.h"
+#include "arrow/util/float16.h"
 #include "arrow/util/visibility.h"
 #include "arrow/visit_type_inline.h"
 
@@ -245,6 +246,12 @@ struct ARROW_EXPORT UInt64Scalar : public 
NumericScalar<UInt64Type> {
 
 struct ARROW_EXPORT HalfFloatScalar : public NumericScalar<HalfFloatType> {
   using NumericScalar<HalfFloatType>::NumericScalar;
+
+  explicit HalfFloatScalar(util::Float16 value)
+      : NumericScalar(value.bits(), float16()) {}
+
+  HalfFloatScalar(util::Float16 value, std::shared_ptr<DataType> type)
+      : NumericScalar(value.bits(), std::move(type)) {}
 };
 
 struct ARROW_EXPORT FloatScalar : public NumericScalar<FloatType> {
@@ -969,6 +976,18 @@ struct MakeScalarImpl {
     return Status::OK();
   }
 
+  // This isn't captured by the generic case above because `util::Float16` 
isn't implicity
+  // convertible to `uint16_t` (HalfFloat's ValueType)
+  template <typename T>
+  std::enable_if_t<std::is_same_v<std::decay_t<ValueRef>, util::Float16> &&
+                       is_half_float_type<T>::value,
+                   Status>
+  Visit(const T& t) {
+    out_ = std::make_shared<HalfFloatScalar>(static_cast<ValueRef>(value_),
+                                             std::move(type_));
+    return Status::OK();
+  }
+
   Status Visit(const ExtensionType& t) {
     ARROW_ASSIGN_OR_RAISE(auto storage,
                           MakeScalar(t.storage_type(), 
static_cast<ValueRef>(value_)));
diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc
index 422f688957..4a34e5d13c 100644
--- a/cpp/src/arrow/scalar_test.cc
+++ b/cpp/src/arrow/scalar_test.cc
@@ -39,6 +39,7 @@
 #include "arrow/testing/random.h"
 #include "arrow/testing/util.h"
 #include "arrow/type_traits.h"
+#include "arrow/util/float16.h"
 
 namespace arrow {
 
@@ -46,6 +47,7 @@ using compute::Cast;
 using compute::CastOptions;
 using internal::checked_cast;
 using internal::checked_pointer_cast;
+using util::Float16;
 
 std::shared_ptr<Scalar> CheckMakeNullScalar(const std::shared_ptr<DataType>& 
type) {
   const auto scalar = MakeNullScalar(type);
@@ -201,22 +203,33 @@ TEST(TestScalar, IdentityCast) {
   */
 }
 
+template <typename ArrowType>
+using NumericArgType = 
std::conditional_t<is_half_float_type<ArrowType>::value, Float16,
+                                          typename ArrowType::c_type>;
+
 template <typename T>
 class TestNumericScalar : public ::testing::Test {
  public:
   TestNumericScalar() = default;
 };
 
-TYPED_TEST_SUITE(TestNumericScalar, NumericArrowTypes);
+using NumericArrowTypesPlusHalfFloat =
+    testing::Types<UInt8Type, UInt16Type, UInt32Type, UInt64Type, Int8Type, 
Int16Type,
+                   Int32Type, Int64Type, FloatType, DoubleType, HalfFloatType>;
+TYPED_TEST_SUITE(TestNumericScalar, NumericArrowTypesPlusHalfFloat);
 
 TYPED_TEST(TestNumericScalar, Basics) {
-  using T = typename TypeParam::c_type;
+  using T = NumericArgType<TypeParam>;
   using ScalarType = typename TypeTraits<TypeParam>::ScalarType;
 
   T value = static_cast<T>(1);
 
   auto scalar_val = std::make_shared<ScalarType>(value);
-  ASSERT_EQ(value, scalar_val->value);
+  if constexpr (is_half_float_type<TypeParam>::value) {
+    ASSERT_EQ(value, Float16::FromBits(scalar_val->value));
+  } else {
+    ASSERT_EQ(value, scalar_val->value);
+  }
   ASSERT_TRUE(scalar_val->is_valid);
   ASSERT_OK(scalar_val->ValidateFull());
 
@@ -227,8 +240,13 @@ TYPED_TEST(TestNumericScalar, Basics) {
   auto scalar_other = std::make_shared<ScalarType>(other_value);
   ASSERT_NE(*scalar_other, *scalar_val);
 
-  scalar_val->value = other_value;
-  ASSERT_EQ(other_value, scalar_val->value);
+  if constexpr (is_half_float_type<TypeParam>::value) {
+    scalar_val->value = other_value.bits();
+    ASSERT_EQ(other_value, Float16::FromBits(scalar_val->value));
+  } else {
+    scalar_val->value = other_value;
+    ASSERT_EQ(other_value, scalar_val->value);
+  }
   ASSERT_EQ(*scalar_other, *scalar_val);
 
   ScalarType stack_val;
@@ -255,72 +273,72 @@ TYPED_TEST(TestNumericScalar, Basics) {
   ASSERT_OK(two->ValidateFull());
 
   ASSERT_TRUE(null->Equals(*null_value));
-  ASSERT_TRUE(one->Equals(ScalarType(1)));
-  ASSERT_FALSE(one->Equals(ScalarType(2)));
-  ASSERT_TRUE(two->Equals(ScalarType(2)));
-  ASSERT_FALSE(two->Equals(ScalarType(3)));
+  ASSERT_TRUE(one->Equals(ScalarType(static_cast<T>(1))));
+  ASSERT_FALSE(one->Equals(ScalarType(static_cast<T>(2))));
+  ASSERT_TRUE(two->Equals(ScalarType(static_cast<T>(2))));
+  ASSERT_FALSE(two->Equals(ScalarType(static_cast<T>(3))));
 
   ASSERT_TRUE(null->ApproxEquals(*null_value));
-  ASSERT_TRUE(one->ApproxEquals(ScalarType(1)));
-  ASSERT_FALSE(one->ApproxEquals(ScalarType(2)));
-  ASSERT_TRUE(two->ApproxEquals(ScalarType(2)));
-  ASSERT_FALSE(two->ApproxEquals(ScalarType(3)));
+  ASSERT_TRUE(one->ApproxEquals(ScalarType(static_cast<T>(1))));
+  ASSERT_FALSE(one->ApproxEquals(ScalarType(static_cast<T>(2))));
+  ASSERT_TRUE(two->ApproxEquals(ScalarType(static_cast<T>(2))));
+  ASSERT_FALSE(two->ApproxEquals(ScalarType(static_cast<T>(3))));
 }
 
 TYPED_TEST(TestNumericScalar, Hashing) {
-  using T = typename TypeParam::c_type;
+  using T = NumericArgType<TypeParam>;
   using ScalarType = typename TypeTraits<TypeParam>::ScalarType;
 
   std::unordered_set<std::shared_ptr<Scalar>, Scalar::Hash, Scalar::PtrsEqual> 
set;
   set.emplace(std::make_shared<ScalarType>());
-  for (T i = 0; i < 10; ++i) {
-    set.emplace(std::make_shared<ScalarType>(i));
+  for (int i = 0; i < 10; ++i) {
+    
ASSERT_TRUE(set.emplace(std::make_shared<ScalarType>(static_cast<T>(i))).second);
   }
 
   ASSERT_FALSE(set.emplace(std::make_shared<ScalarType>()).second);
-  for (T i = 0; i < 10; ++i) {
-    ASSERT_FALSE(set.emplace(std::make_shared<ScalarType>(i)).second);
+  for (int i = 0; i < 10; ++i) {
+    
ASSERT_FALSE(set.emplace(std::make_shared<ScalarType>(static_cast<T>(i))).second);
   }
 }
 
 TYPED_TEST(TestNumericScalar, MakeScalar) {
-  using T = typename TypeParam::c_type;
+  using T = NumericArgType<TypeParam>;
   using ScalarType = typename TypeTraits<TypeParam>::ScalarType;
   auto type = TypeTraits<TypeParam>::type_singleton();
 
   std::shared_ptr<Scalar> three = MakeScalar(static_cast<T>(3));
   ASSERT_OK(three->ValidateFull());
-  ASSERT_EQ(ScalarType(3), *three);
+  ASSERT_EQ(ScalarType(static_cast<T>(3)), *three);
 
-  AssertMakeScalar(ScalarType(3), type, static_cast<T>(3));
+  AssertMakeScalar(ScalarType(static_cast<T>(3)), type, static_cast<T>(3));
 
-  AssertParseScalar(type, "3", ScalarType(3));
+  AssertParseScalar(type, "3", ScalarType(static_cast<T>(3)));
 }
 
 template <typename T>
 class TestRealScalar : public ::testing::Test {
  public:
-  using CType = typename T::c_type;
+  using ValueType = NumericArgType<T>;
   using ScalarType = typename TypeTraits<T>::ScalarType;
 
   void SetUp() {
     type_ = TypeTraits<T>::type_singleton();
 
-    scalar_val_ = std::make_shared<ScalarType>(static_cast<CType>(1));
+    scalar_val_ = std::make_shared<ScalarType>(static_cast<ValueType>(1));
     ASSERT_TRUE(scalar_val_->is_valid);
 
-    scalar_other_ = std::make_shared<ScalarType>(static_cast<CType>(1.1));
+    scalar_other_ = std::make_shared<ScalarType>(static_cast<ValueType>(1.1));
     ASSERT_TRUE(scalar_other_->is_valid);
 
-    scalar_zero_ = std::make_shared<ScalarType>(static_cast<CType>(0.0));
-    scalar_other_zero_ = std::make_shared<ScalarType>(static_cast<CType>(0.0));
-    scalar_neg_zero_ = std::make_shared<ScalarType>(static_cast<CType>(-0.0));
+    scalar_zero_ = std::make_shared<ScalarType>(static_cast<ValueType>(0.0));
+    scalar_other_zero_ = 
std::make_shared<ScalarType>(static_cast<ValueType>(0.0));
+    scalar_neg_zero_ = 
std::make_shared<ScalarType>(static_cast<ValueType>(-0.0));
 
-    const CType nan_value = std::numeric_limits<CType>::quiet_NaN();
+    const auto nan_value = std::numeric_limits<ValueType>::quiet_NaN();
     scalar_nan_ = std::make_shared<ScalarType>(nan_value);
     ASSERT_TRUE(scalar_nan_->is_valid);
 
-    const CType other_nan_value = std::numeric_limits<CType>::quiet_NaN();
+    const auto other_nan_value = std::numeric_limits<ValueType>::quiet_NaN();
     scalar_other_nan_ = std::make_shared<ScalarType>(other_nan_value);
     ASSERT_TRUE(scalar_other_nan_->is_valid);
   }
@@ -522,7 +540,9 @@ class TestRealScalar : public ::testing::Test {
       scalar_zero_, scalar_other_zero_, scalar_neg_zero_;
 };
 
-TYPED_TEST_SUITE(TestRealScalar, RealArrowTypes);
+using RealArrowTypesPlusHalfFloat =
+    ::testing::Types<FloatType, DoubleType, HalfFloatType>;
+TYPED_TEST_SUITE(TestRealScalar, RealArrowTypesPlusHalfFloat);
 
 TYPED_TEST(TestRealScalar, NanEquals) { this->TestNanEquals(); }
 
@@ -1181,8 +1201,6 @@ TEST(TestDayTimeIntervalScalars, Basics) {
   ASSERT_TRUE(first->Equals(ts_val2));
 }
 
-// TODO test HalfFloatScalar
-
 TYPED_TEST(TestNumericScalar, Cast) {
   auto type = TypeTraits<TypeParam>::type_singleton();
 
diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc
index 2d6ba44d7e..5f95638b7d 100644
--- a/cpp/src/arrow/testing/random.cc
+++ b/cpp/src/arrow/testing/random.cc
@@ -43,6 +43,7 @@
 #include "arrow/util/bitmap_reader.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/decimal.h"
+#include "arrow/util/float16.h"
 #include "arrow/util/key_value_metadata.h"
 #include "arrow/util/logging_internal.h"
 #include "arrow/util/pcg_random.h"
@@ -54,55 +55,85 @@ namespace arrow {
 using internal::checked_cast;
 using internal::checked_pointer_cast;
 using internal::ToChars;
+using util::Float16;
 
 namespace random {
 
 namespace {
 
+template <typename ValueType, typename DistributionType>
+struct GeneratorFactory {
+  GeneratorFactory(ValueType min, ValueType max) : min_(min), max_(max) {}
+
+  auto operator()(pcg32_fast* rng) const {
+    return [dist = DistributionType(min_, max_), rng]() mutable {
+      return static_cast<ValueType>(dist(*rng));
+    };
+  }
+
+ private:
+  ValueType min_;
+  ValueType max_;
+};
+
+template <typename DistributionType>
+struct GeneratorFactory<Float16, DistributionType> {
+  GeneratorFactory(Float16 min, Float16 max) : min_(min.ToFloat()), 
max_(max.ToFloat()) {}
+
+  auto operator()(pcg32_fast* rng) const {
+    return [dist = DistributionType(min_, max_), rng]() mutable {
+      return Float16(dist(*rng)).bits();
+    };
+  }
+
+ private:
+  float min_;
+  float max_;
+};
+
 template <typename ValueType, typename DistributionType>
 struct GenerateOptions {
+  static constexpr bool kIsHalfFloat = std::is_same_v<ValueType, Float16>;
+  using PhysicalType = std::conditional_t<kIsHalfFloat, HalfFloatType::c_type, 
ValueType>;
+  using FactoryType = GeneratorFactory<ValueType, DistributionType>;
+
   GenerateOptions(SeedType seed, ValueType min, ValueType max, double 
probability,
                   double nan_probability = 0.0)
-      : min_(min),
-        max_(max),
+      : generator_factory_(FactoryType(min, max)),
         seed_(seed),
         probability_(probability),
         nan_probability_(nan_probability) {}
 
   void GenerateData(uint8_t* buffer, size_t n) {
-    GenerateTypedData(reinterpret_cast<ValueType*>(buffer), n);
+    GenerateTypedData(reinterpret_cast<PhysicalType*>(buffer), n);
   }
 
   template <typename V>
-  typename std::enable_if<!std::is_floating_point<V>::value>::type 
GenerateTypedData(
-      V* data, size_t n) {
+  typename std::enable_if<!std::is_floating_point_v<V> && !kIsHalfFloat>::type
+  GenerateTypedData(V* data, size_t n) {
     GenerateTypedDataNoNan(data, n);
   }
 
   template <typename V>
-  typename std::enable_if<std::is_floating_point<V>::value>::type 
GenerateTypedData(
-      V* data, size_t n) {
+  typename std::enable_if<std::is_floating_point_v<V> || kIsHalfFloat>::type
+  GenerateTypedData(V* data, size_t n) {
     if (nan_probability_ == 0.0) {
       GenerateTypedDataNoNan(data, n);
       return;
     }
     pcg32_fast rng(seed_++);
-    DistributionType dist(min_, max_);
+    auto gen = generator_factory_(&rng);
     ::arrow::random::bernoulli_distribution nan_dist(nan_probability_);
-    const ValueType nan_value = std::numeric_limits<ValueType>::quiet_NaN();
+    const PhysicalType nan_value = get_nan();
 
-    // A static cast is required due to the int16 -> int8 handling.
-    std::generate(data, data + n, [&] {
-      return nan_dist(rng) ? nan_value : static_cast<ValueType>(dist(rng));
-    });
+    std::generate(data, data + n, [&] { return nan_dist(rng) ? nan_value : 
gen(); });
   }
 
-  void GenerateTypedDataNoNan(ValueType* data, size_t n) {
+  void GenerateTypedDataNoNan(PhysicalType* data, size_t n) {
     pcg32_fast rng(seed_++);
-    DistributionType dist(min_, max_);
+    auto gen = generator_factory_(&rng);
 
-    // A static cast is required due to the int16 -> int8 handling.
-    std::generate(data, data + n, [&] { return 
static_cast<ValueType>(dist(rng)); });
+    std::generate(data, data + n, [&] { return gen(); });
   }
 
   void GenerateBitmap(uint8_t* buffer, size_t n, int64_t* null_count) {
@@ -121,8 +152,15 @@ struct GenerateOptions {
     if (null_count != nullptr) *null_count = count;
   }
 
-  ValueType min_;
-  ValueType max_;
+  static constexpr PhysicalType get_nan() {
+    if constexpr (kIsHalfFloat) {
+      return std::numeric_limits<ValueType>::quiet_NaN().bits();
+    } else {
+      return std::numeric_limits<ValueType>::quiet_NaN();
+    }
+  }
+
+  FactoryType generator_factory_;
   SeedType seed_;
   double probability_;
   double nan_probability_;
@@ -228,8 +266,6 @@ PRIMITIVE_RAND_INTEGER_IMPL(UInt32, uint32_t, UInt32Type)
 PRIMITIVE_RAND_INTEGER_IMPL(Int32, int32_t, Int32Type)
 PRIMITIVE_RAND_INTEGER_IMPL(UInt64, uint64_t, UInt64Type)
 PRIMITIVE_RAND_INTEGER_IMPL(Int64, int64_t, Int64Type)
-// Generate 16bit values for half-float
-PRIMITIVE_RAND_INTEGER_IMPL(Float16, int16_t, HalfFloatType)
 
 std::shared_ptr<Array> RandomArrayGenerator::Date64(int64_t size, int64_t min,
                                                     int64_t max, double 
null_probability,
@@ -241,6 +277,25 @@ std::shared_ptr<Array> 
RandomArrayGenerator::Date64(int64_t size, int64_t min,
                                                       memory_pool);
 }
 
+std::shared_ptr<Array> RandomArrayGenerator::Float16(int64_t size, uint16_t 
min,
+                                                     uint16_t max,
+                                                     double null_probability,
+                                                     int64_t alignment,
+                                                     MemoryPool* memory_pool) {
+  return this->Float16(size, Float16::FromBits(min), Float16::FromBits(max),
+                       null_probability, /*nan_probability=*/0, alignment, 
memory_pool);
+}
+
+std::shared_ptr<Array> RandomArrayGenerator::Float16(
+    int64_t size, util::Float16 min, util::Float16 max, double 
null_probability,
+    double nan_probability, int64_t alignment, MemoryPool* memory_pool) {
+  using OptionType =
+      GenerateOptions<util::Float16, 
::arrow::random::uniform_real_distribution<float>>;
+  OptionType options(seed(), min, max, null_probability, nan_probability);
+  return GenerateNumericArray<HalfFloatType, OptionType>(size, options, 
alignment,
+                                                         memory_pool);
+}
+
 std::shared_ptr<Array> RandomArrayGenerator::Float32(int64_t size, float min, 
float max,
                                                      double null_probability,
                                                      double nan_probability,
@@ -1089,10 +1144,23 @@ std::shared_ptr<Array> 
RandomArrayGenerator::ArrayOf(const Field& field, int64_t
       GENERATE_INTEGRAL_CASE(Int32Type);
       GENERATE_INTEGRAL_CASE(UInt64Type);
       GENERATE_INTEGRAL_CASE(Int64Type);
-      GENERATE_INTEGRAL_CASE_VIEW(Int16Type, HalfFloatType);
       GENERATE_FLOATING_CASE(FloatType, Float32);
       GENERATE_FLOATING_CASE(DoubleType, Float64);
 
+    case Type::type::HALF_FLOAT: {
+      using ValueType = util::Float16;
+      const ValueType min_value = GetMetadata<ValueType>(
+          field.metadata().get(), "min", 
std::numeric_limits<ValueType>::min());
+      const ValueType max_value = GetMetadata<ValueType>(
+          field.metadata().get(), "max", 
std::numeric_limits<ValueType>::max());
+      const double nan_probability =
+          GetMetadata<double>(field.metadata().get(), "nan_probability", 0);
+      VALIDATE_MIN_MAX(min_value, max_value);
+      VALIDATE_RANGE(nan_probability, 0.0, 1.0);
+      return Float16(length, min_value, max_value, null_probability, 
nan_probability,
+                     alignment, memory_pool);
+    }
+
     case Type::type::STRING:
     case Type::type::BINARY: {
       const auto min_length =
diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h
index ad87b12105..d9122915a0 100644
--- a/cpp/src/arrow/testing/random.h
+++ b/cpp/src/arrow/testing/random.h
@@ -28,6 +28,7 @@
 #include "arrow/testing/uniform_real.h"
 #include "arrow/testing/visibility.h"
 #include "arrow/type.h"
+#include "arrow/util/float16.h"
 
 namespace arrow {
 
@@ -198,11 +199,33 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
   /// \param[in] memory_pool memory pool to allocate memory from
   ///
   /// \return a generated Array
-  std::shared_ptr<Array> Float16(int64_t size, int16_t min, int16_t max,
+  ///
+  /// \deprecated Deprecated in 22.0.0. Use the other Float16() method that 
accepts
+  /// nan_probability as a parameter
+  ARROW_DEPRECATED(
+      "Deprecated in 22.0.0. Use the other Float16() method that accepts 
nan_probability "
+      "as a parameter")
+  std::shared_ptr<Array> Float16(int64_t size, uint16_t min, uint16_t max,
                                  double null_probability = 0,
                                  int64_t alignment = kDefaultBufferAlignment,
                                  MemoryPool* memory_pool = 
default_memory_pool());
 
+  /// \brief Generate a random HalfFloatArray
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] min the lower bound of the uniform distribution
+  /// \param[in] max the upper bound of the uniform distribution
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] nan_probability the probability of a value being NaN
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> Float16(int64_t size, util::Float16 min, 
util::Float16 max,
+                                 double null_probability = 0, double 
nan_probability = 0,
+                                 int64_t alignment = kDefaultBufferAlignment,
+                                 MemoryPool* memory_pool = 
default_memory_pool());
+
   /// \brief Generate a random FloatArray
   ///
   /// \param[in] size the size of the array to generate
@@ -281,8 +304,9 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
         return Int64(size, static_cast<int64_t>(min), 
static_cast<int64_t>(max),
                      null_probability, alignment, memory_pool);
       case Type::HALF_FLOAT:
-        return Float16(size, static_cast<int16_t>(min), 
static_cast<int16_t>(max),
-                       null_probability, alignment, memory_pool);
+        return Float16(size, 
util::Float16::FromBits(static_cast<uint16_t>(min)),
+                       util::Float16::FromBits(static_cast<uint16_t>(max)),
+                       null_probability, /*nan_probability=*/0, alignment, 
memory_pool);
       case Type::FLOAT:
         return Float32(size, static_cast<float>(min), static_cast<float>(max),
                        null_probability, /*nan_probability=*/0, alignment, 
memory_pool);
diff --git a/cpp/src/arrow/testing/random_test.cc 
b/cpp/src/arrow/testing/random_test.cc
index 6f8621f8e9..279fb6dc91 100644
--- a/cpp/src/arrow/testing/random_test.cc
+++ b/cpp/src/arrow/testing/random_test.cc
@@ -26,12 +26,14 @@
 #include "arrow/type_traits.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/decimal.h"
+#include "arrow/util/float16.h"
 #include "arrow/util/key_value_metadata.h"
 #include "arrow/util/pcg_random.h"
 
 namespace arrow {
 
 using internal::checked_cast;
+using util::Float16;
 
 namespace random {
 
@@ -242,8 +244,14 @@ TYPED_TEST(RandomNumericArrayTest, GenerateMinMax) {
   auto array = this->Downcast(batch->column(0));
   for (auto slot : *array) {
     if (!slot.has_value()) continue;
-    ASSERT_GE(slot, typename TypeParam::c_type(0));
-    ASSERT_LE(slot, typename TypeParam::c_type(127));
+    if constexpr (is_half_float_type<TypeParam>::value) {
+      const auto f16_slot = Float16::FromBits(*slot);
+      ASSERT_GE(f16_slot, Float16(0));
+      ASSERT_LE(f16_slot, Float16(127));
+    } else {
+      ASSERT_GE(slot, typename TypeParam::c_type(0));
+      ASSERT_LE(slot, typename TypeParam::c_type(127));
+    }
   }
 }
 
@@ -256,7 +264,11 @@ TYPED_TEST(RandomNumericArrayTest, EmptyRange) {
   auto array = this->Downcast(batch->column(0));
   for (auto slot : *array) {
     if (!slot.has_value()) continue;
-    ASSERT_EQ(slot, typename TypeParam::c_type(42));
+    if constexpr (is_half_float_type<TypeParam>::value) {
+      ASSERT_EQ(Float16::FromBits(*slot), Float16(42));
+    } else {
+      ASSERT_EQ(slot, typename TypeParam::c_type(42));
+    }
   }
 }
 
@@ -359,6 +371,18 @@ TEST(TypeSpecificTests, DictionaryValues) {
   ASSERT_EQ(16, array->dictionary()->length());
 }
 
+TEST(TypeSpecificTests, Float16Nan) {
+  auto field = arrow::field("float16", float16(),
+                            key_value_metadata({{"nan_probability", "1.0"}}));
+  auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF);
+  AssertTypeEqual(field->type(), base_array->type());
+  auto array = 
internal::checked_pointer_cast<NumericArray<HalfFloatType>>(base_array);
+  ASSERT_OK(array->ValidateFull());
+  for (const auto& value : *array) {
+    ASSERT_TRUE(!value.has_value() || Float16::FromBits(*value).is_nan());
+  }
+}
+
 TEST(TypeSpecificTests, Float32Nan) {
   auto field = arrow::field("float32", float32(),
                             key_value_metadata({{"nan_probability", "1.0"}}));
diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h
index dc290cd327..be26c40dc1 100644
--- a/cpp/src/arrow/type_fwd.h
+++ b/cpp/src/arrow/type_fwd.h
@@ -46,6 +46,7 @@ class Future;
 namespace util {
 class Codec;
 class CodecOptions;
+class Float16;
 }  // namespace util
 
 class Buffer;
diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h
index 90c110a96b..1b7a02e108 100644
--- a/cpp/src/arrow/type_traits.h
+++ b/cpp/src/arrow/type_traits.h
@@ -316,6 +316,11 @@ struct TypeTraits<HalfFloatType> {
   static inline std::shared_ptr<DataType> type_singleton() { return float16(); 
}
 };
 
+template <>
+struct CTypeTraits<util::Float16> : public TypeTraits<HalfFloatType> {
+  using ArrowType = HalfFloatType;
+};
+
 template <>
 struct TypeTraits<Decimal32Type> {
   using ArrayType = Decimal32Array;

(arrow) branch main updated: GH-46739: [C++] Fix Float16 signed zero/NaN equality comparisons (#46973)

Reply via email to