mapleFU commented on code in PR #36073:
URL: https://github.com/apache/arrow/pull/36073#discussion_r1302427543
##########
cpp/src/parquet/column_writer.cc:
##########
@@ -2305,6 +2307,74 @@ struct SerializeFunctor<
int64_t* scratch;
};
+// ----------------------------------------------------------------------
+// Write Arrow to Float16
+
+// Requires a custom serializer because Float16s in Parquet are stored as a
2-byte
+// (little-endian) FLBA, whereas in Arrow they're a native `uint16_t`. Also, a
temporary
+// buffer is needed if there's an endian mismatch.
+template <>
+struct SerializeFunctor<::parquet::FLBAType, ::arrow::HalfFloatType> {
+ Status Serialize(const ::arrow::HalfFloatArray& array, ArrowWriteContext*
ctx,
+ FLBA* out) {
+#if ARROW_LITTLE_ENDIAN
+ return SerializeInPlace(array, ctx, out);
+#else
+ return SerializeWithScratch(array, ctx, out);
+#endif
+ }
+
+ Status SerializeInPlace(const ::arrow::HalfFloatArray& array,
ArrowWriteContext*,
+ FLBA* out) {
+ const uint16_t* values = array.raw_values();
+ if (array.null_count() == 0) {
+ for (int64_t i = 0; i < array.length(); ++i) {
+ out[i] = ToFLBA(&values[i]);
+ }
+ } else {
+ for (int64_t i = 0; i < array.length(); ++i) {
+ out[i] = array.IsValid(i) ? ToFLBA(&values[i]) : FLBA{};
+ }
+ }
+ return Status::OK();
+ }
+
+ Status SerializeWithScratch(const ::arrow::HalfFloatArray& array,
+ ArrowWriteContext* ctx, FLBA* out) {
+ AllocateScratch(array, ctx);
+ if (array.null_count() == 0) {
+ for (int64_t i = 0; i < array.length(); ++i) {
+ out[i] = ToFLBA(array.Value(i));
+ }
+ } else {
+ for (int64_t i = 0; i < array.length(); ++i) {
+ out[i] = array.IsValid(i) ? ToFLBA(array.Value(i)) : FLBA{};
+ }
+ }
+ return Status::OK();
+ }
+
+ private:
+ FLBA ToFLBA(const uint16_t* value_ptr) const {
+ return FLBA{reinterpret_cast<const uint8_t*>(value_ptr)};
+ }
+ FLBA ToFLBA(uint16_t value) {
Review Comment:
The code look good to me, but the two mode here is a bit confusing, would
different name for these `ToFLBA` better? (avoid mis-use the `ToFLBA` and touch
`scratch_` )
##########
cpp/src/parquet/arrow/reader_internal.cc:
##########
@@ -713,6 +715,77 @@ Status TransferDecimal(RecordReader* reader, MemoryPool*
pool,
return Status::OK();
}
+static inline Status ConvertToHalfFloat(const Array& array,
+ const std::shared_ptr<DataType>& type,
+ MemoryPool* pool,
std::shared_ptr<Array>* out) {
+ constexpr int32_t byte_width = sizeof(uint16_t);
+ DCHECK_EQ(checked_cast<const ::arrow::HalfFloatType&>(*type).byte_width(),
byte_width);
+
+ // We read the halffloat (uint16_t) bytes from a raw binary array, in which
they're
+ // assumed to be little-endian.
+ const auto& binary_array = checked_cast<const
::arrow::FixedSizeBinaryArray&>(array);
+ DCHECK_EQ(checked_cast<const
::arrow::FixedSizeBinaryType&>(*binary_array.type())
+ .byte_width(),
+ byte_width);
+
+ // Number of elements in the halffloat array
+ const int64_t length = binary_array.length();
+ // Allocate data for the output halffloat array
+ ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(length *
byte_width, pool));
+ uint8_t* out_ptr = data->mutable_data();
+
+ const int64_t null_count = binary_array.null_count();
+ // Copy the values to the output array in native-endian format
+ if (null_count > 0) {
+ for (int64_t i = 0; i < length; ++i, out_ptr += byte_width) {
+ Float16 f16{0};
Review Comment:
The code looks good to me, but why can't we blink memcpy when it's
`LITTLE_ENDIAN`? Since if it's null, the value is undefined?
##########
cpp/src/parquet/arrow/reader_internal.cc:
##########
@@ -713,6 +715,77 @@ Status TransferDecimal(RecordReader* reader, MemoryPool*
pool,
return Status::OK();
}
+static inline Status ConvertToHalfFloat(const Array& array,
+ const std::shared_ptr<DataType>& type,
+ MemoryPool* pool,
std::shared_ptr<Array>* out) {
+ constexpr int32_t byte_width = sizeof(uint16_t);
+ DCHECK_EQ(checked_cast<const ::arrow::HalfFloatType&>(*type).byte_width(),
byte_width);
+
+ // We read the halffloat (uint16_t) bytes from a raw binary array, in which
they're
+ // assumed to be little-endian.
+ const auto& binary_array = checked_cast<const
::arrow::FixedSizeBinaryArray&>(array);
+ DCHECK_EQ(checked_cast<const
::arrow::FixedSizeBinaryType&>(*binary_array.type())
+ .byte_width(),
+ byte_width);
+
+ // Number of elements in the halffloat array
+ const int64_t length = binary_array.length();
+ // Allocate data for the output halffloat array
+ ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(length *
byte_width, pool));
+ uint8_t* out_ptr = data->mutable_data();
+
+ const int64_t null_count = binary_array.null_count();
+ // Copy the values to the output array in native-endian format
+ if (null_count > 0) {
+ for (int64_t i = 0; i < length; ++i, out_ptr += byte_width) {
+ Float16 f16{0};
+ if (binary_array.IsValid(i)) {
+ const uint8_t* in_ptr = binary_array.GetValue(i);
+ f16 = Float16::FromLittleEndian(in_ptr);
+ }
+ f16.ToBytes(out_ptr);
+ }
+ } else {
+#if ARROW_LITTLE_ENDIAN
+ // No need to byte-swap, so do a simple copy
+ std::memcpy(out_ptr, binary_array.raw_values(), length * byte_width);
+#else
+ for (int64_t i = 0; i < length; ++i, out_ptr += byte_width) {
+ const uint8_t* in_ptr = binary_array.GetValue(i);
+ Float16::FromLittleEndian(in_ptr).ToBytes(out_ptr);
+ }
+#endif
+ }
+
+ *out = std::make_shared<::arrow::HalfFloatArray>(
+ type, length, std::move(data), binary_array.null_bitmap(), null_count);
+ return Status::OK();
+}
+
+/// \brief Convert an arrow::BinaryArray to an arrow::HalfFloatArray
+/// We do this by:
+/// 1. Creating an arrow::BinaryArray from the RecordReader's builder
+/// 2. Allocating a buffer for the arrow::HalfFloatArray
+/// 3. Converting the little-endian bytes in each BinaryArray entry to
native-endian
+/// halffloat (uint16_t) values
+Status TransferHalfFloat(RecordReader* reader, MemoryPool* pool,
+ const std::shared_ptr<Field>& field, Datum* out) {
+ auto binary_reader = dynamic_cast<BinaryRecordReader*>(reader);
+ DCHECK(binary_reader);
+ ::arrow::ArrayVector chunks = binary_reader->GetBuilderChunks();
Review Comment:
`BinaryRecordReader` can be regard as a base class, so it would work here.
however I think it would be more clear with @bkietz 's advice
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]