pitrou commented on code in PR #49290:
URL: https://github.com/apache/arrow/pull/49290#discussion_r2851616393


##########
cpp/src/arrow/testing/math.h:
##########
@@ -23,17 +23,17 @@
 namespace arrow {
 
 ARROW_TESTING_EXPORT
-bool WithinUlp(util::Float16 left, util::Float16 right, int n_ulps);
+bool WithinUlp(util::Float16 left, util::Float16 right, uint16_t n_ulps);
 ARROW_TESTING_EXPORT
-bool WithinUlp(float left, float right, int n_ulps);
+bool WithinUlp(float left, float right, uint16_t n_ulps);
 ARROW_TESTING_EXPORT
-bool WithinUlp(double left, double right, int n_ulps);
+bool WithinUlp(double left, double right, uint16_t n_ulps);

Review Comment:
   Is there a need for keeping these functions here?



##########
cpp/src/arrow/util/ulp_distance.cc:
##########
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/ulp_distance.h"
+
+#include <algorithm>
+#include <bit>
+#include <cinttypes>
+#include <cmath>
+#include <type_traits>
+
+#include "arrow/util/float16.h"
+
+namespace arrow {
+namespace {
+
+template <typename Float>
+struct FloatToUInt;
+
+template <>
+struct FloatToUInt<double> {
+  using Type = uint64_t;
+};
+
+template <>
+struct FloatToUInt<float> {
+  using Type = uint32_t;
+};
+
+template <>
+struct FloatToUInt<util::Float16> {
+  using Type = uint16_t;
+};
+
+template <typename Float>
+struct UlpDistanceUtil {
+ public:
+  using UIntType = FloatToUInt<Float>::Type;
+  static constexpr UIntType kNumberOfBits = sizeof(Float) * 8;
+  static constexpr UIntType kSignMask = static_cast<UIntType>(1) << 
(kNumberOfBits - 1);
+
+  // This implementation is inspired by:
+  // 
https://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
+  static UIntType UlpDistance(Float left, Float right) {
+    auto unsigned_left = std::bit_cast<UIntType>(left);
+    auto unsigned_right = std::bit_cast<UIntType>(right);
+    auto biased_left = ConvertSignAndMagnitudeToBiased(unsigned_left);
+    auto biased_right = ConvertSignAndMagnitudeToBiased(unsigned_right);
+    if (biased_left > biased_right) {
+      std::swap(biased_left, biased_right);
+    }
+
+    // Handling of NaN should be determined by the comparison policy.

Review Comment:
   That's a bit cryptic. I think the helper APIs should have well-defined 
behavior on NaNs (presumably returning false?).



##########
cpp/src/arrow/util/ulp_distance.h:
##########


Review Comment:
   Can we make this internal by calling it `ulp_distance_internal.h`? I don't 
think we need it to expose it to third-party code for now.



##########
cpp/src/arrow/util/ulp_distance.h:
##########
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+ARROW_EXPORT
+bool UlpDistance(util::Float16 left, util::Float16 right, uint16_t n_ulps);

Review Comment:
   We should call this `WithinUlp` like the former testing functions, IMHO.



##########
cpp/src/arrow/scalar_test.cc:
##########
@@ -562,6 +572,134 @@ TYPED_TEST(TestRealScalar, ListViewOf) { 
this->TestListViewOf(); }
 
 TYPED_TEST(TestRealScalar, LargeListViewOf) { this->TestLargeListViewOf(); }
 
+namespace {
+template <typename ScalarType, typename CType>
+std::shared_ptr<ScalarType> CreateScalar(CType value) {
+  return std::make_shared<ScalarType>(value);
+}
+
+template <typename CType>
+bool IsScalarEqual(CType left, CType right, const EqualOptions& options) {
+  std::shared_ptr<Scalar> scalar_left =
+      CreateScalar<typename CTypeTraits<CType>::ScalarType>(left);
+  std::shared_ptr<Scalar> scalar_right =
+      CreateScalar<typename CTypeTraits<CType>::ScalarType>(right);
+  return scalar_left->Equals(*scalar_right, options);
+}
+
+template <typename CType>
+void AssertScalarEquals(CType left, CType right, const EqualOptions& options) {
+  ASSERT_TRUE(IsScalarEqual(left, right, options));
+}
+
+template <typename CType>
+void AssertScalarNotEquals(CType left, CType right, const EqualOptions& 
options) {
+  ASSERT_FALSE(IsScalarEqual(left, right, options));
+}
+
+}  // namespace
+
+TEST(TestRealScalarUlpDistance, Double) {
+  auto options = EqualOptions::Defaults().use_ulp_distance(true);
+
+  // Check for different value
+  AssertScalarEquals(1.0, 1.0000000000000002, options.ulp_distance(1));

Review Comment:
   I would say we want comprehensive tests for the new math helpers in 
`arrow/util`, but can limit ourselves to a few tests for scalar equality 
(basically, to check that the options are properly taken into account).



##########
cpp/src/arrow/scalar_test.cc:
##########
@@ -562,6 +572,134 @@ TYPED_TEST(TestRealScalar, ListViewOf) { 
this->TestListViewOf(); }
 
 TYPED_TEST(TestRealScalar, LargeListViewOf) { this->TestLargeListViewOf(); }
 
+namespace {
+template <typename ScalarType, typename CType>
+std::shared_ptr<ScalarType> CreateScalar(CType value) {
+  return std::make_shared<ScalarType>(value);
+}
+
+template <typename CType>
+bool IsScalarEqual(CType left, CType right, const EqualOptions& options) {
+  std::shared_ptr<Scalar> scalar_left =
+      CreateScalar<typename CTypeTraits<CType>::ScalarType>(left);
+  std::shared_ptr<Scalar> scalar_right =
+      CreateScalar<typename CTypeTraits<CType>::ScalarType>(right);
+  return scalar_left->Equals(*scalar_right, options);
+}
+
+template <typename CType>
+void AssertScalarEquals(CType left, CType right, const EqualOptions& options) {

Review Comment:
   We already have `AssertScalarsEqual` that will emit a nice error message if 
the scalars compare unequal. These additional helpers seem a bit confusing 
(because of the similarity) and pointless at the same time.



##########
cpp/src/arrow/util/ulp_distance.h:
##########
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+ARROW_EXPORT
+bool UlpDistance(util::Float16 left, util::Float16 right, uint16_t n_ulps);

Review Comment:
   And we can *also* expose a function `int UlpDistance(X left, X right)` if we 
want.



##########
cpp/src/arrow/scalar_test.cc:
##########
@@ -562,6 +572,134 @@ TYPED_TEST(TestRealScalar, ListViewOf) { 
this->TestListViewOf(); }
 
 TYPED_TEST(TestRealScalar, LargeListViewOf) { this->TestLargeListViewOf(); }
 
+namespace {
+template <typename ScalarType, typename CType>
+std::shared_ptr<ScalarType> CreateScalar(CType value) {
+  return std::make_shared<ScalarType>(value);
+}
+
+template <typename CType>
+bool IsScalarEqual(CType left, CType right, const EqualOptions& options) {
+  std::shared_ptr<Scalar> scalar_left =
+      CreateScalar<typename CTypeTraits<CType>::ScalarType>(left);
+  std::shared_ptr<Scalar> scalar_right =
+      CreateScalar<typename CTypeTraits<CType>::ScalarType>(right);
+  return scalar_left->Equals(*scalar_right, options);
+}
+
+template <typename CType>
+void AssertScalarEquals(CType left, CType right, const EqualOptions& options) {
+  ASSERT_TRUE(IsScalarEqual(left, right, options));
+}
+
+template <typename CType>
+void AssertScalarNotEquals(CType left, CType right, const EqualOptions& 
options) {
+  ASSERT_FALSE(IsScalarEqual(left, right, options));
+}
+
+}  // namespace
+
+TEST(TestRealScalarUlpDistance, Double) {
+  auto options = EqualOptions::Defaults().use_ulp_distance(true);
+
+  // Check for different value
+  AssertScalarEquals(1.0, 1.0000000000000002, options.ulp_distance(1));

Review Comment:
   Also, we want to add more tests for array equality in `array_test.cc`.



##########
cpp/src/arrow/util/ulp_distance.h:
##########
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {

Review Comment:
   Put this in the `arrow::internal` namespace perhaps?



##########
cpp/src/arrow/compare.h:
##########
@@ -115,6 +118,31 @@ class EqualOptions {
     return res;
   }
 
+  /// Whether the "ulp_distance" property is used in the comparison.
+  ///
+  /// This option only affects the Equals methods
+  /// and has no effect on ApproxEquals methods.
+  /// If both "ulp_distance" and "atol" are specified, the comparison
+  /// succeeds when either condition is satisfied.
+  bool use_ulp_distance() const { return use_ulp_distance_; }
+
+  /// Return a new EqualOptions object with the "use_ulp_distance" property 
changed.
+  EqualOptions use_ulp_distance(bool v) const {
+    auto res = EqualOptions(*this);
+    res.use_ulp_distance_ = v;
+    return res;
+  }
+
+  /// The ulp distance for approximate comparisons of floating-point values.
+  /// Note that this option is ignored if "use_ulp_distance" is set to false.
+  uint16_t ulp_distance() const { return ulp_distance_; }
+
+  /// Return a new EqualOptions object with the "ulp_distance" property 
changed.
+  EqualOptions ulp_distance(uint16_t v) {
+    auto res = EqualOptions(*this);
+    res.ulp_distance_ = v;
+    return res;
+  }

Review Comment:
   This is starting to be a lot of methods just to customize FP comparison. 
Also, usually you know the number of ULPs you want, so you have to call two 
methods (`use_ulp_distance` and `ulp_distance`) which feels pointlessly 
complicated.
   
   I wonder if we could have a single method to encompass all these usages, for 
example:
   ```c++
     auto options = EqualOptions().float_comparison(EqualOptions::Atol(1e-5));
     auto options2 = EqualOptions().float_comparison(EqualOptions::Ulps(2));
   ```
   



##########
cpp/src/arrow/compare.h:
##########
@@ -132,11 +160,13 @@ class EqualOptions {
 
  protected:
   double atol_ = kDefaultAbsoluteTolerance;
+  uint16_t ulp_distance_ = kDefaultUlpDistance;
   bool nans_equal_ = false;
   bool signed_zeros_equal_ = true;
   bool use_atol_ = false;
   bool use_schema_ = true;
   bool use_metadata_ = false;
+  bool use_ulp_distance_ = false;

Review Comment:
   We could have a enum to simplify those flags slightly, for example:
   ```c++
     enum FloatComparison { Exact, Atol, Ulps };
     FloatComparison float_comparison_ = Exact;
   ```
   
   Alternatively, we could use a std::variant to also encompass the numeric 
parameters:
   ```c++
     struct ExactComparison {};
     struct UlpComparison { int max_ulps; }
     struct AtolComparison { double atol; }
   
     // Defaults to ExactComparison
     std::variant<ExactComparison, UlpComparison, AtolComparison> 
float_comparison_ = {};
   ```
   



##########
cpp/src/arrow/compare.h:
##########
@@ -66,6 +67,8 @@ class EqualOptions {
   bool use_atol() const { return use_atol_; }
 
   /// Return a new EqualOptions object with the "use_atol" property changed.
+  /// If both "ulp_distance" and "atol" are specified, the comparison
+  /// succeeds when either condition is satisfied.

Review Comment:
   That sounds slightly weird and more complicated to maintain. Why not just 
have it either/or?



##########
cpp/src/arrow/compare.cc:
##########
@@ -71,86 +72,118 @@ using util::Float16;
 
 namespace {
 
-template <bool Approximate, bool NansEqual, bool SignedZerosEqual>
+template <bool AbsoluteTolerance, bool NansEqual, bool SignedZerosEqual,
+          bool UlpDistanceEqual>
 struct FloatingEqualityFlags {
-  static constexpr bool approximate = Approximate;
+  static constexpr bool absolute_tolerance = AbsoluteTolerance;
   static constexpr bool nans_equal = NansEqual;
   static constexpr bool signed_zeros_equal = SignedZerosEqual;
+  static constexpr bool ulp_distance_equal = UlpDistanceEqual;
 };
 
 template <typename T, typename Flags>
 struct FloatingEquality {
   explicit FloatingEquality(const EqualOptions& options)
-      : epsilon(static_cast<T>(options.atol())) {}
+      : epsilon(static_cast<T>(options.atol())), 
ulp_distance(options.ulp_distance()) {}
 
   bool operator()(T x, T y) const {
     if (x == y) {
       return Flags::signed_zeros_equal || (std::signbit(x) == std::signbit(y));
     }
-    if (Flags::nans_equal && std::isnan(x) && std::isnan(y)) {
+
+    if constexpr (Flags::nans_equal) {
+      if (std::isnan(x) && std::isnan(y)) {
+        return true;
+      }
+    } else if (std::isnan(x) || std::isnan(y)) {
+      return false;
+    }
+
+    if (Flags::absolute_tolerance && (std::fabs(x - y) <= epsilon)) {
       return true;
     }
-    if (Flags::approximate && (fabs(x - y) <= epsilon)) {
+    if (Flags::ulp_distance_equal && UlpDistance(x, y, ulp_distance)) {
       return true;
     }
     return false;
   }
 
   const T epsilon;
+  const uint16_t ulp_distance;
 };
 
 // For half-float equality.
 template <typename Flags>
 struct FloatingEquality<uint16_t, Flags> {
   explicit FloatingEquality(const EqualOptions& options)
-      : epsilon(static_cast<float>(options.atol())) {}
+      : epsilon(static_cast<float>(options.atol())),
+        ulp_distance(options.ulp_distance()) {}
 
   bool operator()(uint16_t x, uint16_t y) const {
     Float16 f_x = Float16::FromBits(x);
     Float16 f_y = Float16::FromBits(y);
     if (f_x == f_y) {
       return Flags::signed_zeros_equal || (f_x.signbit() == f_y.signbit());
     }
-    if (Flags::nans_equal && f_x.is_nan() && f_y.is_nan()) {
+    if constexpr (Flags::nans_equal) {
+      if (f_x.is_nan() && f_y.is_nan()) {
+        return true;
+      }
+    } else if (f_x.is_nan() || f_y.is_nan()) {
+      return false;
+    }
+    if (Flags::absolute_tolerance &&
+        (std::fabs(f_x.ToFloat() - f_y.ToFloat()) <= epsilon)) {
       return true;
     }
-    if (Flags::approximate && (fabs(f_x.ToFloat() - f_y.ToFloat()) <= 
epsilon)) {
+    if (Flags::ulp_distance_equal && UlpDistance(f_x, f_y, ulp_distance)) {
       return true;
     }
     return false;
   }
 
   const float epsilon;
+  const uint16_t ulp_distance;
 };
 
 template <typename T, typename Visitor>
 struct FloatingEqualityDispatcher {
   const EqualOptions& options;
-  bool floating_approximate;
   Visitor&& visit;
 
-  template <bool Approximate, bool NansEqual>
-  void DispatchL3() {
-    if (options.signed_zeros_equal()) {
-      visit(FloatingEquality<T, FloatingEqualityFlags<Approximate, NansEqual, 
true>>{
+  template <bool AbsoluteTolerance, bool NansEqual, bool SingedZero>

Review Comment:
   Typo: SignedZero



##########
cpp/src/arrow/util/ulp_distance.h:
##########
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+ARROW_EXPORT
+bool UlpDistance(util::Float16 left, util::Float16 right, uint16_t n_ulps);

Review Comment:
   Also let's use `int` for `n_ulps`? There's no need to make it unsigned 
16-bits.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to