[GitHub] [arrow] pitrou commented on a diff in pull request #36073: GH-36036: [C++][Parquet] Implement Float16 logical type

via GitHub Mon, 19 Jun 2023 08:56:08 -0700


pitrou commented on code in PR #36073:
URL: https://github.com/apache/arrow/pull/36073#discussion_r1234187461



##########
cpp/src/arrow/util/float16.h:
##########
@@ -0,0 +1,179 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <type_traits>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+/// \brief Base class for an IEEE half-precision float, encoded as a `uint16_t`
+///
+/// The exact format is as follows (from MSB to LSB):
+/// - bit 0:     sign
+/// - bits 1-5:  exponent
+/// - bits 6-15: mantissa
+///
+/// NOTE: Methods in the class should not mutate the unerlying value or 
produce copies.
+/// Such functionality is delegated to subclasses.
+class Float16Base {
+ public:
+  Float16Base() = default;
+  constexpr explicit Float16Base(uint16_t value) : value_(value) {}
+
+  constexpr uint16_t bits() const { return value_; }
+  constexpr explicit operator uint16_t() const { return bits(); }
+
+  constexpr bool signbit() const { return (value_ & 0x8000) != 0; }
+
+  constexpr bool is_nan() const {
+    return (value_ & 0x7c00) == 0x7c00 && (value_ & 0x03ff) != 0;
+  }
+  constexpr bool is_infinity() const { return (value_ & 0x7fff) == 0x7c00; }
+  constexpr bool is_zero() const { return (value_ & 0x7fff) == 0; }
+
+  /// \brief Copy the value's bytes in native-endian byte order
+  void ToBytes(uint8_t* dest) const { std::memcpy(dest, &value_, 
sizeof(value_)); }
+  /// \brief Return the value's bytes in native-endian byte order
+  std::array<uint8_t, 2> ToBytes() const {
+    std::array<uint8_t, 2> bytes;
+    ToBytes(bytes.data());
+    return bytes;
+  }
+
+  void ToLittleEndian(uint8_t* dest) const {
+    Float16Base{bit_util::ToLittleEndian(value_)}.ToBytes(dest);
+  }
+  std::array<uint8_t, 2> ToLittleEndian() const {
+    return Float16Base{bit_util::ToLittleEndian(value_)}.ToBytes();
+  }
+
+  void ToBigEndian(uint8_t* dest) const {
+    Float16Base{bit_util::ToBigEndian(value_)}.ToBytes(dest);
+  }
+  std::array<uint8_t, 2> ToBigEndian() const {
+    return Float16Base{bit_util::ToBigEndian(value_)}.ToBytes();
+  }
+
+  friend constexpr bool operator==(Float16Base lhs, Float16Base rhs) {
+    if (lhs.is_nan() || rhs.is_nan()) return false;
+    return Float16Base::CompareEq(lhs, rhs);
+  }
+  friend constexpr bool operator!=(Float16Base lhs, Float16Base rhs) {
+    return !(lhs == rhs);
+  }
+
+  friend constexpr bool operator<(Float16Base lhs, Float16Base rhs) {
+    if (lhs.is_nan() || rhs.is_nan()) return false;
+    return Float16Base::CompareLt(lhs, rhs);
+  }
+  friend constexpr bool operator>(Float16Base lhs, Float16Base rhs) { return 
rhs < lhs; }

Review Comment:
   Nit, but it's a bit weird to have this one delegate to its counterpart, 
while `operator>=` doesn't.



##########
cpp/src/arrow/util/float16.h:
##########
@@ -0,0 +1,179 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <type_traits>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+/// \brief Base class for an IEEE half-precision float, encoded as a `uint16_t`
+///
+/// The exact format is as follows (from MSB to LSB):
+/// - bit 0:     sign
+/// - bits 1-5:  exponent
+/// - bits 6-15: mantissa
+///
+/// NOTE: Methods in the class should not mutate the unerlying value or 
produce copies.
+/// Such functionality is delegated to subclasses.
+class Float16Base {
+ public:
+  Float16Base() = default;
+  constexpr explicit Float16Base(uint16_t value) : value_(value) {}
+
+  constexpr uint16_t bits() const { return value_; }
+  constexpr explicit operator uint16_t() const { return bits(); }
+
+  constexpr bool signbit() const { return (value_ & 0x8000) != 0; }
+
+  constexpr bool is_nan() const {
+    return (value_ & 0x7c00) == 0x7c00 && (value_ & 0x03ff) != 0;
+  }
+  constexpr bool is_infinity() const { return (value_ & 0x7fff) == 0x7c00; }
+  constexpr bool is_zero() const { return (value_ & 0x7fff) == 0; }
+
+  /// \brief Copy the value's bytes in native-endian byte order
+  void ToBytes(uint8_t* dest) const { std::memcpy(dest, &value_, 
sizeof(value_)); }
+  /// \brief Return the value's bytes in native-endian byte order
+  std::array<uint8_t, 2> ToBytes() const {
+    std::array<uint8_t, 2> bytes;
+    ToBytes(bytes.data());
+    return bytes;
+  }
+
+  void ToLittleEndian(uint8_t* dest) const {
+    Float16Base{bit_util::ToLittleEndian(value_)}.ToBytes(dest);
+  }
+  std::array<uint8_t, 2> ToLittleEndian() const {
+    return Float16Base{bit_util::ToLittleEndian(value_)}.ToBytes();
+  }
+
+  void ToBigEndian(uint8_t* dest) const {

Review Comment:
   Converting to big-endian is probably not useful? (though harmless as well)



##########
cpp/src/arrow/util/float16.h:
##########
@@ -0,0 +1,179 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <type_traits>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+/// \brief Base class for an IEEE half-precision float, encoded as a `uint16_t`
+///
+/// The exact format is as follows (from MSB to LSB):
+/// - bit 0:     sign
+/// - bits 1-5:  exponent
+/// - bits 6-15: mantissa
+///
+/// NOTE: Methods in the class should not mutate the unerlying value or 
produce copies.
+/// Such functionality is delegated to subclasses.
+class Float16Base {
+ public:
+  Float16Base() = default;
+  constexpr explicit Float16Base(uint16_t value) : value_(value) {}
+
+  constexpr uint16_t bits() const { return value_; }
+  constexpr explicit operator uint16_t() const { return bits(); }
+
+  constexpr bool signbit() const { return (value_ & 0x8000) != 0; }
+
+  constexpr bool is_nan() const {
+    return (value_ & 0x7c00) == 0x7c00 && (value_ & 0x03ff) != 0;
+  }
+  constexpr bool is_infinity() const { return (value_ & 0x7fff) == 0x7c00; }
+  constexpr bool is_zero() const { return (value_ & 0x7fff) == 0; }
+
+  /// \brief Copy the value's bytes in native-endian byte order
+  void ToBytes(uint8_t* dest) const { std::memcpy(dest, &value_, 
sizeof(value_)); }
+  /// \brief Return the value's bytes in native-endian byte order
+  std::array<uint8_t, 2> ToBytes() const {
+    std::array<uint8_t, 2> bytes;
+    ToBytes(bytes.data());
+    return bytes;
+  }
+
+  void ToLittleEndian(uint8_t* dest) const {
+    Float16Base{bit_util::ToLittleEndian(value_)}.ToBytes(dest);
+  }
+  std::array<uint8_t, 2> ToLittleEndian() const {
+    return Float16Base{bit_util::ToLittleEndian(value_)}.ToBytes();
+  }
+
+  void ToBigEndian(uint8_t* dest) const {
+    Float16Base{bit_util::ToBigEndian(value_)}.ToBytes(dest);
+  }
+  std::array<uint8_t, 2> ToBigEndian() const {
+    return Float16Base{bit_util::ToBigEndian(value_)}.ToBytes();
+  }
+
+  friend constexpr bool operator==(Float16Base lhs, Float16Base rhs) {
+    if (lhs.is_nan() || rhs.is_nan()) return false;
+    return Float16Base::CompareEq(lhs, rhs);
+  }
+  friend constexpr bool operator!=(Float16Base lhs, Float16Base rhs) {
+    return !(lhs == rhs);

Review Comment:
   If either value is NaN, we should return false but this will return true.



##########
cpp/src/parquet/statistics.cc:
##########
@@ -53,6 +55,25 @@ namespace {
 constexpr int value_length(int value_length, const ByteArray& value) { return 
value.len; }
 constexpr int value_length(int type_length, const FLBA& value) { return 
type_length; }
 
+// Static "constants" for normalizing float16 min/max values. These need to be 
expressed
+// as pointers because `Float16LogicalType` represents an FLBA.
+const uint8_t* float16_lowest() {
+  static const auto bytes = 
std::numeric_limits<Float16>::lowest().ToLittleEndian();

Review Comment:
   This could probably be made `constexpr` by spelling out the `#if 
ARROW_LITTLE_ENDIAN` dance explicitly in `ToLittleEndian` and friends.
   Not sure that's useful here, but worth remembering.



##########
cpp/src/parquet/statistics.cc:
##########
@@ -277,11 +298,42 @@ template <bool is_signed>
 struct CompareHelper<FLBAType, is_signed>
     : public BinaryLikeCompareHelperBase<FLBAType, is_signed> {};
 
+struct Float16CompareHelper {
+  using T = FLBA;
+
+  static T DefaultMin() { return T{float16_max()}; }
+  static T DefaultMax() { return T{float16_lowest()}; }
+
+  static T Coalesce(T val, T fallback) {
+    return val.ptr != nullptr && Float16::FromLittleEndian(val.ptr).is_nan() ? 
fallback
+                                                                             : 
val;

Review Comment:
   Shouldn't this be, rather (also adding parentheses for clarity):
   ```suggestion
       return (val.ptr == nullptr || 
Float16::FromLittleEndian(val.ptr).is_nan()) ? fallback
                                                                                
  : val;
   ```



##########
cpp/src/parquet/statistics.cc:
##########
@@ -277,11 +298,42 @@ template <bool is_signed>
 struct CompareHelper<FLBAType, is_signed>
     : public BinaryLikeCompareHelperBase<FLBAType, is_signed> {};
 
+struct Float16CompareHelper {
+  using T = FLBA;
+
+  static T DefaultMin() { return T{float16_max()}; }
+  static T DefaultMax() { return T{float16_lowest()}; }
+
+  static T Coalesce(T val, T fallback) {
+    return val.ptr != nullptr && Float16::FromLittleEndian(val.ptr).is_nan() ? 
fallback
+                                                                             : 
val;

Review Comment:
   That said, I don't think `Coalesce` can be called with a null FLBA, which 
should simplify this.



##########
cpp/src/parquet/statistics.cc:
##########
@@ -458,6 +540,16 @@ std::pair<ByteArray, ByteArray> TypedComparatorImpl<false, 
ByteArrayType>::GetMi
   return GetMinMaxBinaryHelper<false>(*this, values);
 }
 
+static LogicalType::Type::type LogicalTypeId(const ColumnDescriptor* descr) {

Review Comment:
   Not sure the `static` is required (aren't we already in the anonymous 
namespace?).



##########
cpp/src/arrow/util/float16.h:
##########
@@ -0,0 +1,179 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <type_traits>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+/// \brief Base class for an IEEE half-precision float, encoded as a `uint16_t`
+///
+/// The exact format is as follows (from MSB to LSB):
+/// - bit 0:     sign
+/// - bits 1-5:  exponent
+/// - bits 6-15: mantissa
+///
+/// NOTE: Methods in the class should not mutate the unerlying value or 
produce copies.
+/// Such functionality is delegated to subclasses.
+class Float16Base {
+ public:
+  Float16Base() = default;
+  constexpr explicit Float16Base(uint16_t value) : value_(value) {}
+
+  constexpr uint16_t bits() const { return value_; }
+  constexpr explicit operator uint16_t() const { return bits(); }
+
+  constexpr bool signbit() const { return (value_ & 0x8000) != 0; }
+
+  constexpr bool is_nan() const {
+    return (value_ & 0x7c00) == 0x7c00 && (value_ & 0x03ff) != 0;
+  }
+  constexpr bool is_infinity() const { return (value_ & 0x7fff) == 0x7c00; }
+  constexpr bool is_zero() const { return (value_ & 0x7fff) == 0; }
+
+  /// \brief Copy the value's bytes in native-endian byte order
+  void ToBytes(uint8_t* dest) const { std::memcpy(dest, &value_, 
sizeof(value_)); }
+  /// \brief Return the value's bytes in native-endian byte order
+  std::array<uint8_t, 2> ToBytes() const {
+    std::array<uint8_t, 2> bytes;
+    ToBytes(bytes.data());
+    return bytes;
+  }
+
+  void ToLittleEndian(uint8_t* dest) const {
+    Float16Base{bit_util::ToLittleEndian(value_)}.ToBytes(dest);
+  }
+  std::array<uint8_t, 2> ToLittleEndian() const {
+    return Float16Base{bit_util::ToLittleEndian(value_)}.ToBytes();
+  }
+
+  void ToBigEndian(uint8_t* dest) const {
+    Float16Base{bit_util::ToBigEndian(value_)}.ToBytes(dest);
+  }
+  std::array<uint8_t, 2> ToBigEndian() const {
+    return Float16Base{bit_util::ToBigEndian(value_)}.ToBytes();
+  }
+
+  friend constexpr bool operator==(Float16Base lhs, Float16Base rhs) {
+    if (lhs.is_nan() || rhs.is_nan()) return false;
+    return Float16Base::CompareEq(lhs, rhs);
+  }
+  friend constexpr bool operator!=(Float16Base lhs, Float16Base rhs) {
+    return !(lhs == rhs);
+  }
+
+  friend constexpr bool operator<(Float16Base lhs, Float16Base rhs) {
+    if (lhs.is_nan() || rhs.is_nan()) return false;
+    return Float16Base::CompareLt(lhs, rhs);
+  }
+  friend constexpr bool operator>(Float16Base lhs, Float16Base rhs) { return 
rhs < lhs; }
+
+  friend constexpr bool operator<=(Float16Base lhs, Float16Base rhs) {
+    if (lhs.is_nan() || rhs.is_nan()) return false;
+    return !Float16Base::CompareLt(rhs, lhs);
+  }
+  friend constexpr bool operator>=(Float16Base lhs, Float16Base rhs) {
+    if (lhs.is_nan() || rhs.is_nan()) return false;
+    return !Float16Base::CompareLt(lhs, rhs);
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, Float16Base arg) {
+    return (os << arg.bits());
+  }
+
+ protected:
+  uint16_t value_;
+
+ private:
+  // Comparison helpers that assume neither operand is NaN
+  static constexpr bool CompareEq(Float16Base lhs, Float16Base rhs) {
+    return (lhs.bits() == rhs.bits()) || (lhs.is_zero() && rhs.is_zero());
+  }
+  static constexpr bool CompareLt(Float16Base lhs, Float16Base rhs) {
+    if (lhs.signbit()) {
+      if (rhs.signbit()) {
+        // Both are negative
+        return (lhs.bits() & 0x7fff) > (rhs.bits() & 0x7fff);
+      } else {
+        // Handle +/-0
+        return !lhs.is_zero() || rhs.bits() != 0;
+      }
+    } else if (rhs.signbit()) {
+      return false;
+    } else {
+      // Both are positive
+      return (lhs.bits() & 0x7fff) < (rhs.bits() & 0x7fff);

Review Comment:
   If they're both positive, then why do we need to AND the bits?



##########
cpp/src/arrow/util/float16_test.cc:
##########
@@ -0,0 +1,160 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <array>
+#include <utility>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "arrow/testing/gtest_util.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/float16.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace util {
+namespace {
+
+template <typename T>
+using Limits = std::numeric_limits<T>;
+
+// Holds a float16 and its equivalent float32
+struct TestValue {
+  TestValue(Float16 f16, float f32) : f16(f16), f32(f32) {}
+  TestValue(uint16_t u16, float f32) : TestValue(Float16(u16), f32) {}
+
+  Float16 f16;
+  float f32;
+};
+
+#define GENERATE_OPERATOR(NAME, OP)                              \
+  struct NAME {                                                  \
+    std::pair<bool, bool> operator()(TestValue l, TestValue r) { \
+      return std::make_pair((l.f32 OP r.f32), (l.f16 OP r.f16)); \
+    }                                                            \
+  }
+
+GENERATE_OPERATOR(CompareEq, ==);
+GENERATE_OPERATOR(CompareNe, !=);
+GENERATE_OPERATOR(CompareLt, <);
+GENERATE_OPERATOR(CompareGt, >);
+GENERATE_OPERATOR(CompareLe, <=);
+GENERATE_OPERATOR(CompareGe, >=);
+
+#undef GENERATE_OPERATOR
+
+const std::vector<TestValue> g_test_values = {
+    TestValue(Limits<Float16>::min(), +0.00006104f),
+    TestValue(Limits<Float16>::max(), +65504.0f),
+    TestValue(Limits<Float16>::lowest(), -65504.0f),
+    TestValue(+Limits<Float16>::infinity(), +Limits<float>::infinity()),
+    TestValue(-Limits<Float16>::infinity(), -Limits<float>::infinity()),
+    // Multiple (semantically equivalent) NaN representations
+    TestValue(0x7fff, Limits<float>::quiet_NaN()),
+    TestValue(0xffff, Limits<float>::quiet_NaN()),
+    TestValue(0x7e00, Limits<float>::quiet_NaN()),
+    TestValue(0xfe00, Limits<float>::quiet_NaN()),
+    // Positive/negative zeroes
+    TestValue(0x0000, +0.0f),
+    TestValue(0x8000, -0.0f),
+    // Miscellaneous values. In general, they're chosen to test the 
sign/exponent and
+    // exponent/mantissa boundaries
+    TestValue(0x101c, +0.000502f),
+    TestValue(0x901c, -0.000502f),
+    TestValue(0x101d, +0.0005022f),
+    TestValue(0x901d, -0.0005022f),
+    TestValue(0x121c, +0.000746f),
+    TestValue(0x921c, -0.000746f),
+    TestValue(0x141c, +0.001004f),
+    TestValue(0x941c, -0.001004f),
+    TestValue(0x501c, +32.9f),
+    TestValue(0xd01c, -32.9f),
+    // A few subnormals for good measure
+    TestValue(0x001c, +0.0000017f),
+    TestValue(0x801c, -0.0000017f),
+    TestValue(0x021c, +0.0000332f),
+    TestValue(0x821c, -0.0000332f),
+};
+
+template <typename Operator>
+class Float16OperatorTest : public ::testing::Test {
+ public:
+  void TestCompare(const std::vector<TestValue>& test_values) {
+    const auto num_values = static_cast<int>(test_values.size());
+
+    // Check all combinations of operands in both directions
+    for (int offset = 0; offset < num_values; ++offset) {

Review Comment:
   This is a strange way to write the double loop. Why not simply:
   ```c++
   for (int i = 0; i < num_values; ++i) {
     for (int j = 0; j < num_values; ++j) {
   ```



##########
cpp/src/parquet/statistics.cc:
##########
@@ -525,6 +616,19 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
   bool Equals(const Statistics& raw_other) const override {
     if (physical_type() != raw_other.physical_type()) return false;
 
+    const auto logical_id = LogicalTypeId(*this);
+    switch (logical_id) {
+      // Only compare against logical types that influence the interpretation 
of the
+      // physical type
+      case LogicalType::Type::FLOAT16:
+        if (LogicalTypeId(raw_other) != logical_id) {
+          return false;

Review Comment:
   What if `this` is not a float16 but `raw_other` is?



##########
cpp/src/arrow/util/float16.h:
##########
@@ -0,0 +1,179 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <type_traits>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+/// \brief Base class for an IEEE half-precision float, encoded as a `uint16_t`
+///
+/// The exact format is as follows (from MSB to LSB):
+/// - bit 0:     sign
+/// - bits 1-5:  exponent
+/// - bits 6-15: mantissa
+///
+/// NOTE: Methods in the class should not mutate the unerlying value or 
produce copies.
+/// Such functionality is delegated to subclasses.
+class Float16Base {
+ public:
+  Float16Base() = default;
+  constexpr explicit Float16Base(uint16_t value) : value_(value) {}
+
+  constexpr uint16_t bits() const { return value_; }
+  constexpr explicit operator uint16_t() const { return bits(); }
+
+  constexpr bool signbit() const { return (value_ & 0x8000) != 0; }
+
+  constexpr bool is_nan() const {
+    return (value_ & 0x7c00) == 0x7c00 && (value_ & 0x03ff) != 0;
+  }
+  constexpr bool is_infinity() const { return (value_ & 0x7fff) == 0x7c00; }
+  constexpr bool is_zero() const { return (value_ & 0x7fff) == 0; }
+
+  /// \brief Copy the value's bytes in native-endian byte order
+  void ToBytes(uint8_t* dest) const { std::memcpy(dest, &value_, 
sizeof(value_)); }
+  /// \brief Return the value's bytes in native-endian byte order
+  std::array<uint8_t, 2> ToBytes() const {
+    std::array<uint8_t, 2> bytes;
+    ToBytes(bytes.data());
+    return bytes;
+  }
+
+  void ToLittleEndian(uint8_t* dest) const {
+    Float16Base{bit_util::ToLittleEndian(value_)}.ToBytes(dest);
+  }
+  std::array<uint8_t, 2> ToLittleEndian() const {
+    return Float16Base{bit_util::ToLittleEndian(value_)}.ToBytes();
+  }
+
+  void ToBigEndian(uint8_t* dest) const {
+    Float16Base{bit_util::ToBigEndian(value_)}.ToBytes(dest);
+  }
+  std::array<uint8_t, 2> ToBigEndian() const {
+    return Float16Base{bit_util::ToBigEndian(value_)}.ToBytes();
+  }
+
+  friend constexpr bool operator==(Float16Base lhs, Float16Base rhs) {
+    if (lhs.is_nan() || rhs.is_nan()) return false;
+    return Float16Base::CompareEq(lhs, rhs);
+  }
+  friend constexpr bool operator!=(Float16Base lhs, Float16Base rhs) {
+    return !(lhs == rhs);
+  }
+
+  friend constexpr bool operator<(Float16Base lhs, Float16Base rhs) {
+    if (lhs.is_nan() || rhs.is_nan()) return false;
+    return Float16Base::CompareLt(lhs, rhs);
+  }
+  friend constexpr bool operator>(Float16Base lhs, Float16Base rhs) { return 
rhs < lhs; }
+
+  friend constexpr bool operator<=(Float16Base lhs, Float16Base rhs) {
+    if (lhs.is_nan() || rhs.is_nan()) return false;
+    return !Float16Base::CompareLt(rhs, lhs);
+  }
+  friend constexpr bool operator>=(Float16Base lhs, Float16Base rhs) {
+    if (lhs.is_nan() || rhs.is_nan()) return false;
+    return !Float16Base::CompareLt(lhs, rhs);
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, Float16Base arg) {
+    return (os << arg.bits());
+  }
+
+ protected:
+  uint16_t value_;
+
+ private:
+  // Comparison helpers that assume neither operand is NaN
+  static constexpr bool CompareEq(Float16Base lhs, Float16Base rhs) {
+    return (lhs.bits() == rhs.bits()) || (lhs.is_zero() && rhs.is_zero());
+  }
+  static constexpr bool CompareLt(Float16Base lhs, Float16Base rhs) {
+    if (lhs.signbit()) {
+      if (rhs.signbit()) {
+        // Both are negative
+        return (lhs.bits() & 0x7fff) > (rhs.bits() & 0x7fff);

Review Comment:
   I don't think ANDing changes anything here?



##########
cpp/src/arrow/util/float16.h:
##########
@@ -0,0 +1,179 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <type_traits>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+/// \brief Base class for an IEEE half-precision float, encoded as a `uint16_t`
+///
+/// The exact format is as follows (from MSB to LSB):
+/// - bit 0:     sign
+/// - bits 1-5:  exponent
+/// - bits 6-15: mantissa
+///
+/// NOTE: Methods in the class should not mutate the unerlying value or 
produce copies.
+/// Such functionality is delegated to subclasses.
+class Float16Base {
+ public:
+  Float16Base() = default;
+  constexpr explicit Float16Base(uint16_t value) : value_(value) {}
+
+  constexpr uint16_t bits() const { return value_; }
+  constexpr explicit operator uint16_t() const { return bits(); }
+
+  constexpr bool signbit() const { return (value_ & 0x8000) != 0; }
+
+  constexpr bool is_nan() const {
+    return (value_ & 0x7c00) == 0x7c00 && (value_ & 0x03ff) != 0;
+  }
+  constexpr bool is_infinity() const { return (value_ & 0x7fff) == 0x7c00; }
+  constexpr bool is_zero() const { return (value_ & 0x7fff) == 0; }
+
+  /// \brief Copy the value's bytes in native-endian byte order
+  void ToBytes(uint8_t* dest) const { std::memcpy(dest, &value_, 
sizeof(value_)); }
+  /// \brief Return the value's bytes in native-endian byte order
+  std::array<uint8_t, 2> ToBytes() const {
+    std::array<uint8_t, 2> bytes;
+    ToBytes(bytes.data());
+    return bytes;
+  }
+
+  void ToLittleEndian(uint8_t* dest) const {
+    Float16Base{bit_util::ToLittleEndian(value_)}.ToBytes(dest);
+  }
+  std::array<uint8_t, 2> ToLittleEndian() const {
+    return Float16Base{bit_util::ToLittleEndian(value_)}.ToBytes();
+  }
+
+  void ToBigEndian(uint8_t* dest) const {
+    Float16Base{bit_util::ToBigEndian(value_)}.ToBytes(dest);
+  }
+  std::array<uint8_t, 2> ToBigEndian() const {
+    return Float16Base{bit_util::ToBigEndian(value_)}.ToBytes();
+  }
+
+  friend constexpr bool operator==(Float16Base lhs, Float16Base rhs) {
+    if (lhs.is_nan() || rhs.is_nan()) return false;
+    return Float16Base::CompareEq(lhs, rhs);
+  }
+  friend constexpr bool operator!=(Float16Base lhs, Float16Base rhs) {
+    return !(lhs == rhs);
+  }
+
+  friend constexpr bool operator<(Float16Base lhs, Float16Base rhs) {
+    if (lhs.is_nan() || rhs.is_nan()) return false;
+    return Float16Base::CompareLt(lhs, rhs);
+  }
+  friend constexpr bool operator>(Float16Base lhs, Float16Base rhs) { return 
rhs < lhs; }
+
+  friend constexpr bool operator<=(Float16Base lhs, Float16Base rhs) {
+    if (lhs.is_nan() || rhs.is_nan()) return false;
+    return !Float16Base::CompareLt(rhs, lhs);
+  }
+  friend constexpr bool operator>=(Float16Base lhs, Float16Base rhs) {
+    if (lhs.is_nan() || rhs.is_nan()) return false;
+    return !Float16Base::CompareLt(lhs, rhs);
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, Float16Base arg) {

Review Comment:
   This means we should include `<iosfwd>` (and by not defining the method in 
this header, we avoid including the entire iostream library).



##########
cpp/src/arrow/util/float16.h:
##########
@@ -0,0 +1,179 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <type_traits>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+/// \brief Base class for an IEEE half-precision float, encoded as a `uint16_t`
+///
+/// The exact format is as follows (from MSB to LSB):
+/// - bit 0:     sign
+/// - bits 1-5:  exponent
+/// - bits 6-15: mantissa
+///
+/// NOTE: Methods in the class should not mutate the unerlying value or 
produce copies.
+/// Such functionality is delegated to subclasses.
+class Float16Base {
+ public:
+  Float16Base() = default;
+  constexpr explicit Float16Base(uint16_t value) : value_(value) {}
+
+  constexpr uint16_t bits() const { return value_; }
+  constexpr explicit operator uint16_t() const { return bits(); }
+
+  constexpr bool signbit() const { return (value_ & 0x8000) != 0; }
+
+  constexpr bool is_nan() const {
+    return (value_ & 0x7c00) == 0x7c00 && (value_ & 0x03ff) != 0;
+  }
+  constexpr bool is_infinity() const { return (value_ & 0x7fff) == 0x7c00; }
+  constexpr bool is_zero() const { return (value_ & 0x7fff) == 0; }
+
+  /// \brief Copy the value's bytes in native-endian byte order
+  void ToBytes(uint8_t* dest) const { std::memcpy(dest, &value_, 
sizeof(value_)); }
+  /// \brief Return the value's bytes in native-endian byte order
+  std::array<uint8_t, 2> ToBytes() const {
+    std::array<uint8_t, 2> bytes;
+    ToBytes(bytes.data());
+    return bytes;
+  }
+
+  void ToLittleEndian(uint8_t* dest) const {

Review Comment:
   It would be nice to add docstrings for all public methods.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] pitrou commented on a diff in pull request #36073: GH-36036: [C++][Parquet] Implement Float16 logical type

Reply via email to