[GitHub] [arrow] pitrou commented on a diff in pull request #36073: GH-36036: [C++][Python][Parquet] Implement Float16 logical type

via GitHub Wed, 23 Aug 2023 03:45:56 -0700


pitrou commented on code in PR #36073:
URL: https://github.com/apache/arrow/pull/36073#discussion_r1302751308



##########
cpp/src/arrow/util/float16.cc:
##########
@@ -0,0 +1,187 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <ostream>
+
+#include "arrow/util/float16.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace util {
+
+namespace {
+
+// --------------------------------------------------------
+// Binary conversions
+// --------------------------------------------------------
+// These routines are partially adapted from Numpy's C implementation
+//
+// Some useful metrics for conversions between different precisions:
+// |-----------------------------------------|
+// | precision | half    | single  | double  |
+// |-----------------------------------------|
+// | mantissa  | 10 bits | 23 bits | 53 bits |

Review Comment:
   Should it be "52 bits" for double? In these three cases, a leading 1 is 
implied without being materialized AFAIU.



##########
cpp/src/arrow/util/float16.h:
##########
@@ -0,0 +1,192 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <iosfwd>
+#include <limits>
+#include <type_traits>
+
+#include "arrow/util/endian.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+/// \brief Class representing an IEEE half-precision float, encoded as a 
`uint16_t`
+///
+/// The exact format is as follows (from LSB to MSB):
+/// - bits 0-10:  mantissa
+/// - bits 10-15: exponent
+/// - bit 15:     sign
+///
+class ARROW_EXPORT Float16 {
+ public:
+  Float16() = default;
+  constexpr explicit Float16(uint16_t value) : value_(value) {}
+
+  /// \brief Create a `Float16` from a 32-bit float (may lose precision)
+  static Float16 FromFloat(float f);
+
+  /// \brief Read a `Float16` from memory in native-endian byte order
+  static Float16 FromBytes(const uint8_t* src) {
+    return Float16(SafeLoadAs<uint16_t>(src));
+  }
+
+  /// \brief Read a `Float16` from memory in little-endian byte order
+  static Float16 FromLittleEndian(const uint8_t* src) {
+    return Float16(bit_util::FromLittleEndian(SafeLoadAs<uint16_t>(src)));
+  }
+
+  /// \brief Read a `Float16` from memory in big-endian byte order
+  static Float16 FromBigEndian(const uint8_t* src) {
+    return Float16(bit_util::FromBigEndian(SafeLoadAs<uint16_t>(src)));
+  }
+
+  /// \brief Return the value's integer representation
+  constexpr uint16_t bits() const { return value_; }
+  constexpr explicit operator uint16_t() const { return bits(); }
+
+  /// \brief Return true if the value is negative (sign bit is set)
+  constexpr bool signbit() const { return (value_ & 0x8000) != 0; }
+
+  /// \brief Return true if the value is NaN
+  constexpr bool is_nan() const {
+    return (value_ & 0x7c00) == 0x7c00 && (value_ & 0x03ff) != 0;
+  }
+  /// \brief Return true if the value is positive/negative infinity
+  constexpr bool is_infinity() const { return (value_ & 0x7fff) == 0x7c00; }
+  /// \brief Return true if the value is positive/negative zero
+  constexpr bool is_zero() const { return (value_ & 0x7fff) == 0; }

Review Comment:
   For completeness, add an `is_finite` method? (meaning neither inf nor NaN)



##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -61,6 +63,18 @@ using schema::PrimitiveNode;
 
 namespace test {
 
+struct BufferedFloat16 {
+  explicit BufferedFloat16(Float16 f16)
+      : f16(f16), buffer(*::arrow::AllocateBuffer(sizeof(uint16_t))) {
+    this->f16.ToLittleEndian(buffer->mutable_data());
+  }
+  explicit BufferedFloat16(uint16_t bits) : BufferedFloat16(Float16(bits)) {}
+  const uint8_t* bytes() const { return buffer->data(); }
+
+  Float16 f16;
+  std::shared_ptr<::arrow::Buffer> buffer;

Review Comment:
   Perhaps this can simply be a `std::array<uint8_t, 2>`? The buffer itself 
doesn't seem to be used below, unless I'm missing something.



##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -1082,6 +1109,45 @@ void TestStatisticsSortOrder<FLBAType>::SetValues() {
       .set_max(std::string(reinterpret_cast<const char*>(&vals[8][0]), 
FLBA_LENGTH));
 }
 
+template <>
+void TestStatisticsSortOrder<Float16LogicalType>::AddNodes(std::string name) {
+  auto node =
+      schema::PrimitiveNode::Make(name, Repetition::REQUIRED, 
LogicalType::Float16(),
+                                  Type::FIXED_LEN_BYTE_ARRAY, 
sizeof(uint16_t));
+  fields_.push_back(std::move(node));
+}
+
+template <>
+void TestStatisticsSortOrder<Float16LogicalType>::SetValues() {
+  constexpr int kValueLen = 2;
+  constexpr int kNumBytes = NUM_VALUES * kValueLen;
+
+  const uint16_t u16_vals[NUM_VALUES] = {
+      0b1100010100000000,  // -5.0
+      0b1100010000000000,  // -4.0
+      0b1100001000000000,  // -3.0
+      0b1100000000000000,  // -2.0
+      0b1011110000000000,  // -1.0
+      0b0000000000000000,  // +0.0
+      0b0011110000000000,  // +1.0
+      0b0100000000000000,  // +2.0
+      0b0100001000000000,  // +3.0
+      0b0100010000000000,  // +4.0

Review Comment:
   Can you reorder the values so that they are not pre-sorted? This would 
exercise the minmax routines a bit better.



##########
cpp/src/parquet/arrow/reader_internal.cc:
##########
@@ -713,6 +715,77 @@ Status TransferDecimal(RecordReader* reader, MemoryPool* 
pool,
   return Status::OK();
 }
 
+static inline Status ConvertToHalfFloat(const Array& array,
+                                        const std::shared_ptr<DataType>& type,
+                                        MemoryPool* pool, 
std::shared_ptr<Array>* out) {
+  constexpr int32_t byte_width = sizeof(uint16_t);
+  DCHECK_EQ(checked_cast<const ::arrow::HalfFloatType&>(*type).byte_width(), 
byte_width);
+
+  // We read the halffloat (uint16_t) bytes from a raw binary array, in which 
they're
+  // assumed to be little-endian.
+  const auto& binary_array = checked_cast<const 
::arrow::FixedSizeBinaryArray&>(array);
+  DCHECK_EQ(checked_cast<const 
::arrow::FixedSizeBinaryType&>(*binary_array.type())
+                .byte_width(),
+            byte_width);
+
+  // Number of elements in the halffloat array
+  const int64_t length = binary_array.length();
+  // Allocate data for the output halffloat array
+  ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(length * 
byte_width, pool));
+  uint8_t* out_ptr = data->mutable_data();
+
+  const int64_t null_count = binary_array.null_count();
+  // Copy the values to the output array in native-endian format
+  if (null_count > 0) {
+    for (int64_t i = 0; i < length; ++i, out_ptr += byte_width) {
+      Float16 f16{0};
+      if (binary_array.IsValid(i)) {
+        const uint8_t* in_ptr = binary_array.GetValue(i);
+        f16 = Float16::FromLittleEndian(in_ptr);
+      }
+      f16.ToBytes(out_ptr);
+    }
+  } else {
+#if ARROW_LITTLE_ENDIAN
+    // No need to byte-swap, so do a simple copy
+    std::memcpy(out_ptr, binary_array.raw_values(), length * byte_width);
+#else
+    for (int64_t i = 0; i < length; ++i, out_ptr += byte_width) {
+      const uint8_t* in_ptr = binary_array.GetValue(i);
+      Float16::FromLittleEndian(in_ptr).ToBytes(out_ptr);
+    }
+#endif
+  }
+
+  *out = std::make_shared<::arrow::HalfFloatArray>(
+      type, length, std::move(data), binary_array.null_bitmap(), null_count);
+  return Status::OK();
+}
+
+/// \brief Convert an arrow::BinaryArray to an arrow::HalfFloatArray
+/// We do this by:
+/// 1. Creating an arrow::BinaryArray from the RecordReader's builder
+/// 2. Allocating a buffer for the arrow::HalfFloatArray
+/// 3. Converting the little-endian bytes in each BinaryArray entry to 
native-endian
+/// halffloat (uint16_t) values
+Status TransferHalfFloat(RecordReader* reader, MemoryPool* pool,
+                         const std::shared_ptr<Field>& field, Datum* out) {
+  auto binary_reader = dynamic_cast<BinaryRecordReader*>(reader);
+  DCHECK(binary_reader);
+  ::arrow::ArrayVector chunks = binary_reader->GetBuilderChunks();

Review Comment:
   One could just reuse `TransferBinary` here, and then view the resulting 
`FixedSizeBinaryArray` as `float16`.



##########
cpp/src/parquet/arrow/reader_internal.cc:
##########
@@ -772,6 +845,13 @@ Status TransferColumnData(RecordReader* reader, const 
std::shared_ptr<Field>& va
       RETURN_NOT_OK(TransferBinary(reader, pool, value_field, 
&chunked_result));
       result = chunked_result;
     } break;
+    case ::arrow::Type::HALF_FLOAT: {
+      if (descr->physical_type() != ::parquet::Type::FIXED_LEN_BYTE_ARRAY) {
+        return Status::Invalid("Physical type for ", 
value_field->type()->ToString(),
+                               " must be fixed length binary");
+      }

Review Comment:
   Do you also want to check the FLBA width?



##########
cpp/src/parquet/arrow/test_util.h:
##########
@@ -65,12 +65,44 @@ struct Decimal256WithPrecisionAndScale {
   static constexpr int32_t scale = PRECISION - 1;
 };
 
+inline std::vector<uint16_t> RandomHalfFloatValues(size_t size, uint16_t min,

Review Comment:
   And perhaps `random_real` can be enhanced to accept a `Float16` type 
parameter?



##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -916,9 +925,15 @@ TYPED_TEST(TestParquetIO, SingleColumnOptionalReadWrite) {
 }
 
 TYPED_TEST(TestParquetIO, SingleColumnOptionalDictionaryWrite) {
-  // Skip tests for BOOL as we don't create dictionaries for it.
-  if (TypeParam::type_id == ::arrow::Type::BOOL) {
-    return;
+  switch (TypeParam::type_id) {
+    // Skip tests for BOOL as we don't create dictionaries for it.
+    case ::arrow::Type::BOOL:
+    // Skip tests for HALF_FLOAT as it's not currently supported by 
`dictionary_encode`
+    case ::arrow::Type::HALF_FLOAT:
+      GTEST_SKIP();

Review Comment:
   Let's add a skip message for clarity (though I'm not sure GTest always 
displays it).
   ```suggestion
         GTEST_SKIP() << "dictionary_encode not supported for float16" ;
   ```



##########
cpp/src/arrow/util/float16.h:
##########
@@ -0,0 +1,192 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <iosfwd>
+#include <limits>
+#include <type_traits>
+
+#include "arrow/util/endian.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+/// \brief Class representing an IEEE half-precision float, encoded as a 
`uint16_t`
+///
+/// The exact format is as follows (from LSB to MSB):
+/// - bits 0-10:  mantissa
+/// - bits 10-15: exponent
+/// - bit 15:     sign
+///
+class ARROW_EXPORT Float16 {
+ public:
+  Float16() = default;
+  constexpr explicit Float16(uint16_t value) : value_(value) {}
+
+  /// \brief Create a `Float16` from a 32-bit float (may lose precision)
+  static Float16 FromFloat(float f);
+
+  /// \brief Read a `Float16` from memory in native-endian byte order
+  static Float16 FromBytes(const uint8_t* src) {
+    return Float16(SafeLoadAs<uint16_t>(src));
+  }
+
+  /// \brief Read a `Float16` from memory in little-endian byte order
+  static Float16 FromLittleEndian(const uint8_t* src) {
+    return Float16(bit_util::FromLittleEndian(SafeLoadAs<uint16_t>(src)));
+  }
+
+  /// \brief Read a `Float16` from memory in big-endian byte order
+  static Float16 FromBigEndian(const uint8_t* src) {
+    return Float16(bit_util::FromBigEndian(SafeLoadAs<uint16_t>(src)));
+  }
+
+  /// \brief Return the value's integer representation
+  constexpr uint16_t bits() const { return value_; }
+  constexpr explicit operator uint16_t() const { return bits(); }
+
+  /// \brief Return true if the value is negative (sign bit is set)
+  constexpr bool signbit() const { return (value_ & 0x8000) != 0; }
+
+  /// \brief Return true if the value is NaN
+  constexpr bool is_nan() const {
+    return (value_ & 0x7c00) == 0x7c00 && (value_ & 0x03ff) != 0;
+  }
+  /// \brief Return true if the value is positive/negative infinity
+  constexpr bool is_infinity() const { return (value_ & 0x7fff) == 0x7c00; }
+  /// \brief Return true if the value is positive/negative zero
+  constexpr bool is_zero() const { return (value_ & 0x7fff) == 0; }
+
+  /// \brief Convert to a 32-bit float
+  float ToFloat() const;

Review Comment:
   Also a `ToDouble` for completeness?



##########
cpp/src/arrow/util/float16_test.cc:
##########
@@ -0,0 +1,246 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <array>
+#include <cmath>
+#include <utility>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "arrow/testing/gtest_util.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/float16.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace util {
+namespace {
+
+template <typename T>
+using Limits = std::numeric_limits<T>;
+
+float F32(uint32_t bits) { return SafeCopy<float>(bits); }
+
+TEST(Float16Test, RoundTripFromFloat32) {
+  struct TestCase {
+    float f32;
+    uint16_t b16;
+    float f16_as_f32;
+  };
+  // Expected values were also manually validated with numpy-1.24.3
+  const TestCase test_cases[] = {
+      // +/-0.0f
+      {F32(0x80000000u), 0b1000000000000000u, -0.0f},
+      {F32(0x00000000u), 0b0000000000000000u, +0.0f},
+      // 32-bit exp is 102 => 2^-25. Rounding to nearest.
+      {F32(0xb3000001u), 0b1000000000000001u, -5.96046447754e-8f},
+      // 32-bit exp is 102 => 2^-25. Rounding to even.
+      {F32(0xb3000000u), 0b1000000000000000u, -0.0f},
+      // 32-bit exp is 101 => 2^-26. Underflow to zero.
+      {F32(0xb2800001u), 0b1000000000000000u, -0.0f},
+      // 32-bit exp is 108 => 2^-19.
+      {F32(0xb61a0000u), 0b1000000000100110u, -2.26497650146e-6f},
+      // 32-bit exp is 108 => 2^-19.
+      {F32(0xb61e0000u), 0b1000000000101000u, -2.38418579102e-6f},
+      // 32-bit exp is 112 => 2^-15. Rounding to nearest.
+      {F32(0xb87fa001u), 0b1000001111111111u, -6.09755516052e-5f},
+      // 32-bit exp is 112 => 2^-15. Rounds to 16-bit exp of 1 => 2^-14
+      {F32(0xb87fe001u), 0b1000010000000000u, -6.103515625e-5f},
+      // 32-bit exp is 142 => 2^15. Rounding to nearest.
+      {F32(0xc7001001u), 0b1111100000000001u, -32800.0f},
+      // 32-bit exp is 142 => 2^15. Rounding to even.
+      {F32(0xc7001000u), 0b1111100000000000u, -32768.0f},
+      // 65520.0f rounds to inf
+      {F32(0x477ff000u), 0b0111110000000000u, Limits<float>::infinity()},
+      // 65488.0039062f rounds to 65504.0 (float16 max)
+      {F32(0x477fd001u), 0b0111101111111111u, 65504.0f},
+      // 32-bit exp is 127 => 2^0, rounds to 16-bit exp of 16 => 2^1.
+      {F32(0xbffff000u), 0b1100000000000000u, -2.0f},
+  };
+
+  for (size_t index = 0; index < std::size(test_cases); ++index) {
+    ARROW_SCOPED_TRACE("index=", index);
+    const auto& tc = test_cases[index];
+    const auto f16 = Float16::FromFloat(tc.f32);
+    EXPECT_EQ(tc.b16, f16.bits());
+    EXPECT_EQ(tc.f16_as_f32, f16.ToFloat());

Review Comment:
   Also test predicates? For example:
   ```suggestion
       EXPECT_EQ(tc.f16_as_f32, f16.ToFloat());
       EXPECT_EQ(std::signbit(tc.f16_as_f32), f16.signbit());
       EXPECT_EQ(std::isnan(tc.f16_as_f32), f16.is_nan());
       EXPECT_EQ(std::isinf(tc.f16_as_f32), f16.is_infinity());
   ```
   



##########
cpp/src/arrow/util/float16.h:
##########
@@ -0,0 +1,192 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <iosfwd>
+#include <limits>
+#include <type_traits>
+
+#include "arrow/util/endian.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+/// \brief Class representing an IEEE half-precision float, encoded as a 
`uint16_t`
+///
+/// The exact format is as follows (from LSB to MSB):
+/// - bits 0-10:  mantissa
+/// - bits 10-15: exponent
+/// - bit 15:     sign
+///
+class ARROW_EXPORT Float16 {
+ public:
+  Float16() = default;
+  constexpr explicit Float16(uint16_t value) : value_(value) {}
+
+  /// \brief Create a `Float16` from a 32-bit float (may lose precision)
+  static Float16 FromFloat(float f);
+
+  /// \brief Read a `Float16` from memory in native-endian byte order
+  static Float16 FromBytes(const uint8_t* src) {
+    return Float16(SafeLoadAs<uint16_t>(src));
+  }
+
+  /// \brief Read a `Float16` from memory in little-endian byte order
+  static Float16 FromLittleEndian(const uint8_t* src) {
+    return Float16(bit_util::FromLittleEndian(SafeLoadAs<uint16_t>(src)));
+  }
+
+  /// \brief Read a `Float16` from memory in big-endian byte order
+  static Float16 FromBigEndian(const uint8_t* src) {
+    return Float16(bit_util::FromBigEndian(SafeLoadAs<uint16_t>(src)));
+  }
+
+  /// \brief Return the value's integer representation
+  constexpr uint16_t bits() const { return value_; }
+  constexpr explicit operator uint16_t() const { return bits(); }
+
+  /// \brief Return true if the value is negative (sign bit is set)
+  constexpr bool signbit() const { return (value_ & 0x8000) != 0; }
+
+  /// \brief Return true if the value is NaN
+  constexpr bool is_nan() const {
+    return (value_ & 0x7c00) == 0x7c00 && (value_ & 0x03ff) != 0;
+  }
+  /// \brief Return true if the value is positive/negative infinity
+  constexpr bool is_infinity() const { return (value_ & 0x7fff) == 0x7c00; }
+  /// \brief Return true if the value is positive/negative zero
+  constexpr bool is_zero() const { return (value_ & 0x7fff) == 0; }
+
+  /// \brief Convert to a 32-bit float
+  float ToFloat() const;
+
+  /// \brief Copy the value's bytes in native-endian byte order
+  void ToBytes(uint8_t* dest) const { std::memcpy(dest, &value_, 
sizeof(value_)); }
+  /// \brief Return the value's bytes in native-endian byte order
+  constexpr std::array<uint8_t, 2> ToBytes() const {
+#if ARROW_LITTLE_ENDIAN
+    return ToLittleEndian();
+#else
+    return ToBigEndian();
+#endif
+  }
+
+  /// \brief Copy the value's bytes in little-endian byte order
+  void ToLittleEndian(uint8_t* dest) const {
+    Float16{bit_util::ToLittleEndian(value_)}.ToBytes(dest);
+  }
+  /// \brief Return the value's bytes in little-endian byte order
+  constexpr std::array<uint8_t, 2> ToLittleEndian() const {
+#if ARROW_LITTLE_ENDIAN
+    return {uint8_t(value_ & 0xff), uint8_t(value_ >> 8)};
+#else
+    return {uint8_t(value_ >> 8), uint8_t(value_ & 0xff)};
+#endif
+  }
+
+  /// \brief Copy the value's bytes in big-endian byte order
+  void ToBigEndian(uint8_t* dest) const {
+    Float16{bit_util::ToBigEndian(value_)}.ToBytes(dest);
+  }
+  /// \brief Return the value's bytes in big-endian byte order
+  constexpr std::array<uint8_t, 2> ToBigEndian() const {
+#if ARROW_LITTLE_ENDIAN
+    return {uint8_t(value_ >> 8), uint8_t(value_ & 0xff)};
+#else
+    return {uint8_t(value_ & 0xff), uint8_t(value_ >> 8)};
+#endif
+  }
+
+  constexpr Float16 operator-() const { return Float16(value_ ^ 0x8000); }
+  constexpr Float16 operator+() const { return Float16(value_); }
+
+  friend constexpr bool operator==(Float16 lhs, Float16 rhs) {
+    if (lhs.is_nan() || rhs.is_nan()) return false;
+    return Float16::CompareEq(lhs, rhs);
+  }
+  friend constexpr bool operator!=(Float16 lhs, Float16 rhs) { return !(lhs == 
rhs); }
+
+  friend constexpr bool operator<(Float16 lhs, Float16 rhs) {
+    if (lhs.is_nan() || rhs.is_nan()) return false;
+    return Float16::CompareLt(lhs, rhs);
+  }
+  friend constexpr bool operator>(Float16 lhs, Float16 rhs) { return rhs < 
lhs; }
+
+  friend constexpr bool operator<=(Float16 lhs, Float16 rhs) {
+    if (lhs.is_nan() || rhs.is_nan()) return false;
+    return !Float16::CompareLt(rhs, lhs);
+  }
+  friend constexpr bool operator>=(Float16 lhs, Float16 rhs) { return rhs <= 
lhs; }
+
+  ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream& os, 
Float16 arg);
+
+ protected:
+  uint16_t value_;
+
+ private:
+  // Comparison helpers that assume neither operand is NaN
+  static constexpr bool CompareEq(Float16 lhs, Float16 rhs) {
+    return (lhs.bits() == rhs.bits()) || (lhs.is_zero() && rhs.is_zero());
+  }
+  static constexpr bool CompareLt(Float16 lhs, Float16 rhs) {
+    if (lhs.signbit()) {
+      if (rhs.signbit()) {
+        // Both are negative
+        return lhs.bits() > rhs.bits();
+      } else {
+        // Handle +/-0
+        return !lhs.is_zero() || rhs.bits() != 0;
+      }
+    } else if (rhs.signbit()) {
+      return false;
+    } else {
+      // Both are positive
+      return lhs.bits() < rhs.bits();
+    }
+  }
+};
+
+static_assert(std::is_trivial_v<Float16>);
+
+}  // namespace util
+}  // namespace arrow
+
+// TODO: Not complete

Review Comment:
   By the way, do you think it would be admissible to specialize `std::isnan` 
and friends for `Float16`? @bkietz 



##########
cpp/src/arrow/util/float16.cc:
##########
@@ -0,0 +1,187 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <ostream>
+
+#include "arrow/util/float16.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace util {
+
+namespace {
+
+// --------------------------------------------------------
+// Binary conversions
+// --------------------------------------------------------
+// These routines are partially adapted from Numpy's C implementation

Review Comment:
   (note for another issue/PR: interestingly Numpy has started adding HW 
accelerations for these: 
https://github.com/numpy/numpy/blob/main/numpy/core/src/common/half.hpp#L55)



##########
cpp/src/parquet/arrow/arrow_schema_test.cc:
##########
@@ -851,6 +851,8 @@ TEST_F(TestConvertArrowSchema, ArrowFields) {
        ParquetType::FIXED_LEN_BYTE_ARRAY, 7},
       {"decimal(32, 8)", ::arrow::decimal(32, 8), LogicalType::Decimal(32, 8),
        ParquetType::FIXED_LEN_BYTE_ARRAY, 14},
+      {"float16", ::arrow::float16(), LogicalType::Float16(),
+       ParquetType::FIXED_LEN_BYTE_ARRAY, 2},

Review Comment:
   This tests the Arrow-to-Parquet direction; can you also add a test for the 
other way round in `TestConvertParquetSchema`?



##########
cpp/src/parquet/arrow/test_util.h:
##########
@@ -65,12 +65,44 @@ struct Decimal256WithPrecisionAndScale {
   static constexpr int32_t scale = PRECISION - 1;
 };
 
+inline std::vector<uint16_t> RandomHalfFloatValues(size_t size, uint16_t min,

Review Comment:
   Hmm, this function is quite complicated and also won't generate a uniform 
distribution of values like `random_real` does, will it?
   How about simply generating random floats with `random_real` and then 
casting them to `Float16`?



##########
cpp/src/parquet/arrow/reader_internal.cc:
##########
@@ -713,6 +715,77 @@ Status TransferDecimal(RecordReader* reader, MemoryPool* 
pool,
   return Status::OK();
 }
 
+static inline Status ConvertToHalfFloat(const Array& array,
+                                        const std::shared_ptr<DataType>& type,
+                                        MemoryPool* pool, 
std::shared_ptr<Array>* out) {
+  constexpr int32_t byte_width = sizeof(uint16_t);
+  DCHECK_EQ(checked_cast<const ::arrow::HalfFloatType&>(*type).byte_width(), 
byte_width);
+
+  // We read the halffloat (uint16_t) bytes from a raw binary array, in which 
they're
+  // assumed to be little-endian.
+  const auto& binary_array = checked_cast<const 
::arrow::FixedSizeBinaryArray&>(array);
+  DCHECK_EQ(checked_cast<const 
::arrow::FixedSizeBinaryType&>(*binary_array.type())
+                .byte_width(),
+            byte_width);
+
+  // Number of elements in the halffloat array
+  const int64_t length = binary_array.length();
+  // Allocate data for the output halffloat array
+  ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(length * 
byte_width, pool));
+  uint8_t* out_ptr = data->mutable_data();
+
+  const int64_t null_count = binary_array.null_count();
+  // Copy the values to the output array in native-endian format
+  if (null_count > 0) {
+    for (int64_t i = 0; i < length; ++i, out_ptr += byte_width) {
+      Float16 f16{0};

Review Comment:
   Also, Parquet C++ has no big-endian compatibility currently. 



##########
cpp/src/parquet/arrow/arrow_schema_test.cc:
##########
@@ -906,22 +908,6 @@ TEST_F(TestConvertArrowSchema, ArrowFields) {
   // ASSERT_NO_FATAL_FAILURE();
 }
 
-TEST_F(TestConvertArrowSchema, ArrowNonconvertibleFields) {
-  struct FieldConstructionArguments {
-    std::string name;
-    std::shared_ptr<::arrow::DataType> datatype;
-  };
-
-  std::vector<FieldConstructionArguments> cases = {
-      {"float16", ::arrow::float16()},

Review Comment:
   Instead of removing this test, can we use another Arrow datatype whose 
conversion to Parquet isn't supported?



##########
cpp/src/parquet/column_writer.cc:
##########
@@ -2305,6 +2307,74 @@ struct SerializeFunctor<
   int64_t* scratch;
 };
 
+// ----------------------------------------------------------------------
+// Write Arrow to Float16
+
+// Requires a custom serializer because Float16s in Parquet are stored as a 
2-byte
+// (little-endian) FLBA, whereas in Arrow they're a native `uint16_t`. Also, a 
temporary
+// buffer is needed if there's an endian mismatch.
+template <>
+struct SerializeFunctor<::parquet::FLBAType, ::arrow::HalfFloatType> {
+  Status Serialize(const ::arrow::HalfFloatArray& array, ArrowWriteContext* 
ctx,
+                   FLBA* out) {
+#if ARROW_LITTLE_ENDIAN

Review Comment:
   None of the Parquet C++ implementation is big-endian compatible, so I'm not 
sure you should bother specifically here. We also don't have any CI for this.



##########
cpp/src/parquet/statistics.cc:
##########
@@ -527,9 +616,28 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
 
   void IncrementNumValues(int64_t n) override { num_values_ += n; }
 
+  static bool IsMeaningfulLogicalType(LogicalType::Type::type type) {
+    switch (type) {
+      case LogicalType::Type::FLOAT16:
+        return true;
+      default:
+        return false;
+    }
+  }
+
   bool Equals(const Statistics& raw_other) const override {
     if (physical_type() != raw_other.physical_type()) return false;
 
+    const auto logical_id = LogicalTypeId(*this);

Review Comment:
   Perhaps store the logical type id in the `TypedStatisticsImpl` constructor, 
instead of calling `LogicalTypeId` from multiple places?



##########
cpp/src/arrow/util/float16_test.cc:
##########
@@ -0,0 +1,246 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <array>
+#include <cmath>
+#include <utility>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "arrow/testing/gtest_util.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/float16.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace util {
+namespace {
+
+template <typename T>
+using Limits = std::numeric_limits<T>;
+
+float F32(uint32_t bits) { return SafeCopy<float>(bits); }
+
+TEST(Float16Test, RoundTripFromFloat32) {
+  struct TestCase {
+    float f32;
+    uint16_t b16;
+    float f16_as_f32;
+  };
+  // Expected values were also manually validated with numpy-1.24.3
+  const TestCase test_cases[] = {
+      // +/-0.0f
+      {F32(0x80000000u), 0b1000000000000000u, -0.0f},
+      {F32(0x00000000u), 0b0000000000000000u, +0.0f},
+      // 32-bit exp is 102 => 2^-25. Rounding to nearest.
+      {F32(0xb3000001u), 0b1000000000000001u, -5.96046447754e-8f},
+      // 32-bit exp is 102 => 2^-25. Rounding to even.
+      {F32(0xb3000000u), 0b1000000000000000u, -0.0f},
+      // 32-bit exp is 101 => 2^-26. Underflow to zero.
+      {F32(0xb2800001u), 0b1000000000000000u, -0.0f},
+      // 32-bit exp is 108 => 2^-19.
+      {F32(0xb61a0000u), 0b1000000000100110u, -2.26497650146e-6f},
+      // 32-bit exp is 108 => 2^-19.
+      {F32(0xb61e0000u), 0b1000000000101000u, -2.38418579102e-6f},
+      // 32-bit exp is 112 => 2^-15. Rounding to nearest.
+      {F32(0xb87fa001u), 0b1000001111111111u, -6.09755516052e-5f},
+      // 32-bit exp is 112 => 2^-15. Rounds to 16-bit exp of 1 => 2^-14
+      {F32(0xb87fe001u), 0b1000010000000000u, -6.103515625e-5f},
+      // 32-bit exp is 142 => 2^15. Rounding to nearest.
+      {F32(0xc7001001u), 0b1111100000000001u, -32800.0f},
+      // 32-bit exp is 142 => 2^15. Rounding to even.
+      {F32(0xc7001000u), 0b1111100000000000u, -32768.0f},
+      // 65520.0f rounds to inf
+      {F32(0x477ff000u), 0b0111110000000000u, Limits<float>::infinity()},
+      // 65488.0039062f rounds to 65504.0 (float16 max)
+      {F32(0x477fd001u), 0b0111101111111111u, 65504.0f},
+      // 32-bit exp is 127 => 2^0, rounds to 16-bit exp of 16 => 2^1.
+      {F32(0xbffff000u), 0b1100000000000000u, -2.0f},
+  };
+
+  for (size_t index = 0; index < std::size(test_cases); ++index) {
+    ARROW_SCOPED_TRACE("index=", index);
+    const auto& tc = test_cases[index];
+    const auto f16 = Float16::FromFloat(tc.f32);
+    EXPECT_EQ(tc.b16, f16.bits());
+    EXPECT_EQ(tc.f16_as_f32, f16.ToFloat());

Review Comment:
   Similar suggestion in other tests below.



##########
cpp/src/parquet/arrow/reader_internal.cc:
##########
@@ -713,6 +715,77 @@ Status TransferDecimal(RecordReader* reader, MemoryPool* pool,
   return Status::OK();
 }
 
+static inline Status ConvertToHalfFloat(const Array& array,
+                                        const std::shared_ptr<DataType>& type,
+                                        MemoryPool* pool, std::shared_ptr<Array>* out) {
+  constexpr int32_t byte_width = sizeof(uint16_t);
+  DCHECK_EQ(checked_cast<const ::arrow::HalfFloatType&>(*type).byte_width(), byte_width);
+
+  // We read the halffloat (uint16_t) bytes from a raw binary array, in which they're
+  // assumed to be little-endian.
+  const auto& binary_array = checked_cast<const ::arrow::FixedSizeBinaryArray&>(array);
+  DCHECK_EQ(checked_cast<const ::arrow::FixedSizeBinaryType&>(*binary_array.type())
+                .byte_width(),
+            byte_width);
+
+  // Number of elements in the halffloat array
+  const int64_t length = binary_array.length();
+  // Allocate data for the output halffloat array
+  ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(length * byte_width, pool));
+  uint8_t* out_ptr = data->mutable_data();
+
+  const int64_t null_count = binary_array.null_count();
+  // Copy the values to the output array in native-endian format
+  if (null_count > 0) {
+    for (int64_t i = 0; i < length; ++i, out_ptr += byte_width) {
+      Float16 f16{0};

Review Comment:
   memcpy isn't even needed. You can (or should be able to) just reuse the 
buffers. For example by calling `array.View(type)`.



##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -916,9 +925,15 @@ TYPED_TEST(TestParquetIO, SingleColumnOptionalReadWrite) {
 }
 
 TYPED_TEST(TestParquetIO, SingleColumnOptionalDictionaryWrite) {
-  // Skip tests for BOOL as we don't create dictionaries for it.
-  if (TypeParam::type_id == ::arrow::Type::BOOL) {
-    return;
+  switch (TypeParam::type_id) {
+    // Skip tests for BOOL as we don't create dictionaries for it.
+    case ::arrow::Type::BOOL:
+    // Skip tests for HALF_FLOAT as it's not currently supported by `dictionary_encode`
+    case ::arrow::Type::HALF_FLOAT:
+      GTEST_SKIP();

Review Comment:
   Also do the same for bool above.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] pitrou commented on a diff in pull request #36073: GH-36036: [C++][Python][Parquet] Implement Float16 logical type

Reply via email to