Copilot commented on code in PR #41870:
URL: https://github.com/apache/arrow/pull/41870#discussion_r3334577890
##########
cpp/src/arrow/tensor.cc:
##########
@@ -224,7 +224,7 @@ Status ValidateTensorParameters(const
std::shared_ptr<DataType>& type,
}
template <typename Out>
-struct ConvertColumnsToTensorVisitor {
+struct ConvertArrayToTensorVisitor {
Out*& out_values;
const ArrayData& in_data;
Review Comment:
In the null-handling path this visitor uses `static_cast<Out>(NAN)` when an
input slot is null. That conversion is undefined for integral `Out` types and
also doesn't produce a correct float16 NaN bit pattern when the tensor output
type is `HALF_FLOAT` (which is currently written via a `uint16_t` buffer).
To make `null_to_nan=true` safe for float16 output, write an IEEE-754
binary16 quiet-NaN bit pattern for `HALF_FLOAT` instead of casting `NAN` to
`uint16_t`.
##########
cpp/src/arrow/tensor.cc:
##########
@@ -256,25 +256,28 @@ struct ConvertColumnsToTensorVisitor {
};
template <typename Out>
-struct ConvertColumnsToTensorRowMajorVisitor {
+struct ConvertArrayToTensorRowMajorVisitor {
Out*& out_values;
const ArrayData& in_data;
- int num_cols;
- int col_idx;
+ int64_t num_cols;
+ int64_t col_idx;
+ int64_t chunk_idx;
template <typename T>
Status Visit(const T&) {
if constexpr (is_numeric(T::type_id)) {
using In = typename T::c_type;
auto in_values = ArraySpan(in_data).GetSpan<In>(1, in_data.length);
+ const int64_t base = chunk_idx * num_cols + col_idx;
+
if (in_data.null_count == 0) {
for (int64_t i = 0; i < in_data.length; ++i) {
- out_values[i * num_cols + col_idx] = static_cast<Out>(in_values[i]);
+ out_values[base + i * num_cols] = static_cast<Out>(in_values[i]);
}
} else {
for (int64_t i = 0; i < in_data.length; ++i) {
- out_values[i * num_cols + col_idx] =
+ out_values[base + i * num_cols] =
in_data.IsNull(i) ? static_cast<Out>(NAN) :
static_cast<Out>(in_values[i]);
Review Comment:
Same issue in the row-major conversion visitor: nulls are written using
`static_cast<Out>(NAN)`, which is undefined if `Out` is integral and is
incorrect for float16 output (written through a `uint16_t` buffer). This can
lead to UB and wrong values when converting float16 data with
`null_to_nan=true`.
Write a float16 quiet-NaN bit pattern for `HALF_FLOAT` instead.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]