[GitHub] [arrow] bkietz commented on a diff in pull request #36073: GH-36036: [C++][Python][Parquet] Implement Float16 logical type

via GitHub Thu, 03 Aug 2023 12:53:20 -0700


bkietz commented on code in PR #36073:
URL: https://github.com/apache/arrow/pull/36073#discussion_r1283641666



##########
cpp/src/arrow/util/float16.cc:
##########
@@ -0,0 +1,28 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <ostream>
+
+#include "arrow/util/float16.h"
+
+namespace arrow {
+namespace util {
+
+std::ostream& operator<<(std::ostream& os, Float16Base arg) { return (os << 
arg.bits()); }

Review Comment:
   I think a missing piece from this PR is conversion to/from `float`. With 
that defined, we could print the equivalent `float` value instead and test that 
each `TestValue` in `float16_test.cc` is internally consistent.



##########
cpp/src/parquet/arrow/reader_internal.cc:
##########
@@ -713,6 +715,77 @@ Status TransferDecimal(RecordReader* reader, MemoryPool* 
pool,
   return Status::OK();
 }
 
+static inline Status ConvertToHalfFloat(const Array& array,
+                                        const std::shared_ptr<DataType>& type,
+                                        MemoryPool* pool, 
std::shared_ptr<Array>* out) {
+  constexpr int32_t byte_width = sizeof(uint16_t);
+  DCHECK_EQ(checked_cast<const ::arrow::HalfFloatType&>(*type).byte_width(), 
byte_width);
+
+  // We read the halffloat (uint16_t) bytes from a raw binary array, in which 
they're
+  // assumed to be little-endian.
+  const auto& binary_array = checked_cast<const 
::arrow::FixedSizeBinaryArray&>(array);
+  DCHECK_EQ(checked_cast<const 
::arrow::FixedSizeBinaryType&>(*binary_array.type())
+                .byte_width(),
+            byte_width);
+
+  // Number of elements in the halffloat array
+  const int64_t length = binary_array.length();
+  // Allocate data for the output halffloat array
+  ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(length * 
byte_width, pool));
+  uint8_t* out_ptr = data->mutable_data();
+
+  const int64_t null_count = binary_array.null_count();
+  // Copy the values to the output array in native-endian format
+  if (null_count > 0) {
+    for (int64_t i = 0; i < length; ++i, out_ptr += byte_width) {
+      Float16 f16{0};
+      if (binary_array.IsValid(i)) {
+        const uint8_t* in_ptr = binary_array.GetValue(i);
+        f16 = Float16::FromLittleEndian(in_ptr);
+      }
+      f16.ToBytes(out_ptr);
+    }
+  } else {
+#if ARROW_LITTLE_ENDIAN
+    // No need to byte-swap, so do a simple copy
+    std::memcpy(out_ptr, binary_array.raw_values(), length * byte_width);
+#else
+    for (int64_t i = 0; i < length; ++i, out_ptr += byte_width) {
+      const uint8_t* in_ptr = binary_array.GetValue(i);
+      Float16::FromLittleEndian(in_ptr).ToBytes(out_ptr);
+    }
+#endif
+  }
+
+  *out = std::make_shared<::arrow::HalfFloatArray>(
+      type, length, std::move(data), binary_array.null_bitmap(), null_count);
+  return Status::OK();
+}
+
+/// \brief Convert an arrow::BinaryArray to an arrow::HalfFloatArray
+/// We do this by:
+/// 1. Creating an arrow::BinaryArray from the RecordReader's builder
+/// 2. Allocating a buffer for the arrow::HalfFloatArray
+/// 3. Converting the little-endian bytes in each BinaryArray entry to 
native-endian
+/// halffloat (uint16_t) values
+Status TransferHalfFloat(RecordReader* reader, MemoryPool* pool,
+                         const std::shared_ptr<Field>& field, Datum* out) {
+  auto binary_reader = dynamic_cast<BinaryRecordReader*>(reader);
+  DCHECK(binary_reader);
+  ::arrow::ArrayVector chunks = binary_reader->GetBuilderChunks();

Review Comment:
   Maybe I'm missing something, but I think this should be specifically a fixed 
size binary array:
   ```suggestion
   /// 1. Creating an arrow::FixedSizeBinaryArray from the RecordReader's 
builder
   /// 2. Allocating a buffer for the arrow::HalfFloatArray
   /// 3. Converting the little-endian bytes in each FixedSizeBinaryArray entry 
to native-endian
   /// halffloat (uint16_t) values
   Status TransferHalfFloat(RecordReader* reader, MemoryPool* pool,
                            const std::shared_ptr<Field>& field, Datum* out) {
     auto* binary_reader = dynamic_cast<FLBARecordReader*>(reader);
     DCHECK(binary_reader);
     ::arrow::ArrayVector chunks = binary_reader->GetBuilderChunks();
   ```



##########
cpp/src/arrow/util/float16.h:
##########
@@ -0,0 +1,198 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <iosfwd>
+#include <limits>
+#include <type_traits>
+
+#include "arrow/util/endian.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+/// \brief Base class for an IEEE half-precision float, encoded as a `uint16_t`
+///
+/// The exact format is as follows (from MSB to LSB):
+/// - bit 0:     sign
+/// - bits 1-5:  exponent
+/// - bits 6-15: mantissa
+///
+/// NOTE: Methods in the class should not mutate the unerlying value or 
produce copies.

Review Comment:
   ```suggestion
   /// NOTE: Methods in the class should not mutate the underlying value or 
produce copies.
   ```



##########
cpp/src/arrow/util/float16.h:
##########
@@ -0,0 +1,198 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <iosfwd>
+#include <limits>
+#include <type_traits>
+
+#include "arrow/util/endian.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+/// \brief Base class for an IEEE half-precision float, encoded as a `uint16_t`
+///
+/// The exact format is as follows (from MSB to LSB):
+/// - bit 0:     sign
+/// - bits 1-5:  exponent
+/// - bits 6-15: mantissa
+///
+/// NOTE: Methods in the class should not mutate the unerlying value or 
produce copies.

Review Comment:
   I'm not sure why this division is beneficial. Is this implementation 
inspired by another which makes this distinction? If possible, I think it'd be 
preferable to be simpler and have a single class.
   
   If you're looking at BasicDecimal/Decimal, those classes are kept separate 
so that Gandiva can include them in generated LLVM IR (so methods which 
reference std:: types and functions are kept out). Similar support for float16 
is out of scope here, I would think.



##########
cpp/src/parquet/arrow/reader_internal.cc:
##########
@@ -713,6 +715,77 @@ Status TransferDecimal(RecordReader* reader, MemoryPool* 
pool,
   return Status::OK();
 }
 
+static inline Status ConvertToHalfFloat(const Array& array,

Review Comment:
   Could this return Result<std::shared_ptr<Array>> instead?



##########
cpp/src/arrow/util/float16_test.cc:
##########
@@ -0,0 +1,168 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <array>
+#include <utility>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "arrow/testing/gtest_util.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/float16.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace util {
+namespace {
+
+template <typename T>
+using Limits = std::numeric_limits<T>;
+
+// Holds a float16 and its equivalent float32
+struct TestValue {
+  TestValue(Float16 f16, float f32) : f16(f16), f32(f32) {}
+  TestValue(uint16_t u16, float f32) : TestValue(Float16(u16), f32) {}
+
+  Float16 f16;
+  float f32;
+};
+
+#define GENERATE_OPERATOR(NAME, OP)                              \
+  struct NAME {                                                  \
+    std::pair<bool, bool> operator()(TestValue l, TestValue r) { \
+      return std::make_pair((l.f32 OP r.f32), (l.f16 OP r.f16)); \
+    }                                                            \
+  }
+
+GENERATE_OPERATOR(CompareEq, ==);
+GENERATE_OPERATOR(CompareNe, !=);
+GENERATE_OPERATOR(CompareLt, <);
+GENERATE_OPERATOR(CompareGt, >);
+GENERATE_OPERATOR(CompareLe, <=);
+GENERATE_OPERATOR(CompareGe, >=);
+
+#undef GENERATE_OPERATOR
+
+const std::vector<TestValue> g_test_values = {
+    TestValue(Limits<Float16>::min(), +0.00006104f),
+    TestValue(Limits<Float16>::max(), +65504.0f),
+    TestValue(Limits<Float16>::lowest(), -65504.0f),
+    TestValue(+Limits<Float16>::infinity(), +Limits<float>::infinity()),
+    TestValue(-Limits<Float16>::infinity(), -Limits<float>::infinity()),
+    // Multiple (semantically equivalent) NaN representations
+    TestValue(0x7fff, Limits<float>::quiet_NaN()),
+    TestValue(0xffff, Limits<float>::quiet_NaN()),
+    TestValue(0x7e00, Limits<float>::quiet_NaN()),
+    TestValue(0xfe00, Limits<float>::quiet_NaN()),
+    // Positive/negative zeroes
+    TestValue(0x0000, +0.0f),
+    TestValue(0x8000, -0.0f),
+    // Miscellaneous values. In general, they're chosen to test the 
sign/exponent and
+    // exponent/mantissa boundaries
+    TestValue(0x101c, +0.000502f),
+    TestValue(0x901c, -0.000502f),
+    TestValue(0x101d, +0.0005022f),
+    TestValue(0x901d, -0.0005022f),
+    TestValue(0x121c, +0.000746f),
+    TestValue(0x921c, -0.000746f),
+    TestValue(0x141c, +0.001004f),
+    TestValue(0x941c, -0.001004f),
+    TestValue(0x501c, +32.9f),
+    TestValue(0xd01c, -32.9f),
+    // A few subnormals for good measure
+    TestValue(0x001c, +0.0000017f),
+    TestValue(0x801c, -0.0000017f),
+    TestValue(0x021c, +0.0000332f),
+    TestValue(0x821c, -0.0000332f),
+};
+
+template <typename Operator>
+class Float16OperatorTest : public ::testing::Test {
+ public:
+  void TestCompare(const std::vector<TestValue>& test_values) {
+    const auto num_values = static_cast<int>(test_values.size());
+
+    // Check all combinations of operands in both directions
+    for (int i = 0; i < num_values; ++i) {
+      for (int j = 0; j < num_values; ++j) {
+        ARROW_SCOPED_TRACE(i, ",", j);
+
+        auto a = test_values[i];
+        auto b = test_values[j];
+
+        // Results for float16 and float32 should be the same
+        auto ret = Operator{}(a, b);
+        ASSERT_EQ(ret.first, ret.second);
+      }
+    }
+  }
+};
+
+using OperatorTypes =
+    ::testing::Types<CompareEq, CompareNe, CompareLt, CompareGt, CompareLe, 
CompareGe>;
+
+TYPED_TEST_SUITE(Float16OperatorTest, OperatorTypes);
+
+TYPED_TEST(Float16OperatorTest, Compare) { this->TestCompare(g_test_values); }
+

Review Comment:
   Instead of going through TYPED_TEST etc, I think it'd be more readable to 
use:
   
   ```c++
   TEST(Float16Test, Compare) {
     auto ExpectOp = [](std::string op_name, auto op) {
       ARROW_SCOPED_TRACE(op_name);
       const auto num_values = static_cast<int>(g_test_values.size());
   
       // Check all combinations of operands in both directions
       for (int i = 0; i < num_values; ++i) {
         for (int j = 0; j < num_values; ++j) {
           ARROW_SCOPED_TRACE(i, ",", j);
   
           auto [a16, a32] = g_test_values[i];
           auto [b16, b32] = g_test_values[j];
   
           // Results for float16 and float32 should be the same
           ASSERT_EQ(op(a16, b16), op(a32, b32));
         }
       }
     };
   
     ExpectOp("equal", [](auto l, auto r) { return l == r; });
     // ...
   }
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] bkietz commented on a diff in pull request #36073: GH-36036: [C++][Python][Parquet] Implement Float16 logical type

Reply via email to