felipecrv commented on code in PR #43302:
URL: https://github.com/apache/arrow/pull/43302#discussion_r1729006308
##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -305,19 +310,242 @@ BinaryToBinaryCastExec(KernelContext* ctx, const
ExecSpan& batch, ExecResult* ou
ctx, input, out->array_data().get());
}
+// String View -> Offset String
+template <typename O, typename I>
+enable_if_t<is_binary_view_like_type<I>::value &&
is_base_binary_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
out) {
+ using OutputBuilderType = typename TypeTraits<O>::BuilderType;
+ const CastOptions& options = checked_cast<const
CastState&>(*ctx->state()).options;
+ const ArraySpan& input = batch[0].array;
+
+ if constexpr (!I::is_utf8 && O::is_utf8) {
+ if (!options.allow_invalid_utf8) {
+ InitializeUTF8();
+ ArraySpanVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+ }
+
+ const int64_t sum_of_binary_view_sizes = util::SumOfBinaryViewSizes(
+ input.GetValues<BinaryViewType::c_type>(1), input.length);
+
+ // TODO(GH-43573): A more efficient implementation that copies the validity
+ // bitmap all at once is possible, but would mean we don't delegate all the
+ // building logic to the ArrayBuilder implementation for the output type.
+ OutputBuilderType builder(options.to_type.GetSharedPtr(),
ctx->memory_pool());
+ RETURN_NOT_OK(builder.Resize(input.length));
+ RETURN_NOT_OK(builder.ReserveData(sum_of_binary_view_sizes));
+ arrow::internal::ArraySpanInlineVisitor<I> visitor;
+ RETURN_NOT_OK(visitor.VisitStatus(
+ input,
+ [&](std::string_view v) {
+ // Append valid string view
+ return builder.Append(v);
+ },
+ [&]() {
+ // Append null
+ builder.UnsafeAppendNull();
+ return Status::OK();
+ }));
+
+ std::shared_ptr<ArrayData> output_array;
+ RETURN_NOT_OK(builder.FinishInternal(&output_array));
+ out->value = std::move(output_array);
+ return Status::OK();
+}
+
+// Offset String -> String View
+template <typename O, typename I>
+enable_if_t<is_base_binary_type<I>::value &&
is_binary_view_like_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
out) {
+ using offset_type = typename I::offset_type;
+ const CastOptions& options = checked_cast<const
CastState&>(*ctx->state()).options;
+ const ArraySpan& input = batch[0].array;
+
+ if constexpr (!I::is_utf8 && O::is_utf8) {
+ if (!options.allow_invalid_utf8) {
+ InitializeUTF8();
+ ArraySpanVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+ }
+
+ // Start with a zero-copy cast, then reconfigure the view and data buffers
+ RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
+ ArrayData* output = out->array_data().get();
+
+ const int64_t total_length = input.offset + input.length;
+ const auto* validity = input.GetValues<uint8_t>(0, 0);
+ const auto* input_offsets = input.GetValues<offset_type>(1);
+ const auto* input_data = input.GetValues<uint8_t>(2, 0);
+
+ // Turn buffers[1] into a buffer of empty BinaryViewType::c_type entries.
+ ARROW_ASSIGN_OR_RAISE(output->buffers[1],
+ ctx->Allocate(total_length * BinaryViewType::kSize));
+ memset(output->buffers[1]->mutable_data(), 0, total_length *
BinaryViewType::kSize);
+
+ // Check against offset overflow
+ if (total_length > 0) {
+ // Offsets must be monotonically increasing, that is offsets[j+1] >=
offsets[j] for
+ // 0 <= j < length, even for null slots. This property ensures the
location for all
+ // values is valid and well defined.
Review Comment:
```suggestion
// 0 <= j < length, even for null slots. So we only need to check the
last
// offset.
```
##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -305,19 +310,242 @@ BinaryToBinaryCastExec(KernelContext* ctx, const
ExecSpan& batch, ExecResult* ou
ctx, input, out->array_data().get());
}
+// String View -> Offset String
+template <typename O, typename I>
+enable_if_t<is_binary_view_like_type<I>::value &&
is_base_binary_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
out) {
+ using OutputBuilderType = typename TypeTraits<O>::BuilderType;
+ const CastOptions& options = checked_cast<const
CastState&>(*ctx->state()).options;
+ const ArraySpan& input = batch[0].array;
+
+ if constexpr (!I::is_utf8 && O::is_utf8) {
+ if (!options.allow_invalid_utf8) {
+ InitializeUTF8();
+ ArraySpanVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+ }
+
+ const int64_t sum_of_binary_view_sizes = util::SumOfBinaryViewSizes(
+ input.GetValues<BinaryViewType::c_type>(1), input.length);
+
+ // TODO(GH-43573): A more efficient implementation that copies the validity
+ // bitmap all at once is possible, but would mean we don't delegate all the
+ // building logic to the ArrayBuilder implementation for the output type.
+ OutputBuilderType builder(options.to_type.GetSharedPtr(),
ctx->memory_pool());
+ RETURN_NOT_OK(builder.Resize(input.length));
+ RETURN_NOT_OK(builder.ReserveData(sum_of_binary_view_sizes));
+ arrow::internal::ArraySpanInlineVisitor<I> visitor;
+ RETURN_NOT_OK(visitor.VisitStatus(
+ input,
+ [&](std::string_view v) {
+ // Append valid string view
+ return builder.Append(v);
+ },
+ [&]() {
+ // Append null
+ builder.UnsafeAppendNull();
+ return Status::OK();
+ }));
+
+ std::shared_ptr<ArrayData> output_array;
+ RETURN_NOT_OK(builder.FinishInternal(&output_array));
+ out->value = std::move(output_array);
+ return Status::OK();
+}
+
+// Offset String -> String View
+template <typename O, typename I>
+enable_if_t<is_base_binary_type<I>::value &&
is_binary_view_like_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
out) {
+ using offset_type = typename I::offset_type;
+ const CastOptions& options = checked_cast<const
CastState&>(*ctx->state()).options;
+ const ArraySpan& input = batch[0].array;
+
+ if constexpr (!I::is_utf8 && O::is_utf8) {
+ if (!options.allow_invalid_utf8) {
+ InitializeUTF8();
+ ArraySpanVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+ }
+
+ // Start with a zero-copy cast, then reconfigure the view and data buffers
+ RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
+ ArrayData* output = out->array_data().get();
+
+ const int64_t total_length = input.offset + input.length;
+ const auto* validity = input.GetValues<uint8_t>(0, 0);
+ const auto* input_offsets = input.GetValues<offset_type>(1);
+ const auto* input_data = input.GetValues<uint8_t>(2, 0);
+
+ // Turn buffers[1] into a buffer of empty BinaryViewType::c_type entries.
+ ARROW_ASSIGN_OR_RAISE(output->buffers[1],
+ ctx->Allocate(total_length * BinaryViewType::kSize));
+ memset(output->buffers[1]->mutable_data(), 0, total_length *
BinaryViewType::kSize);
+
+ // Check against offset overflow
+ if (total_length > 0) {
+ // Offsets must be monotonically increasing, that is offsets[j+1] >=
offsets[j] for
Review Comment:
```suggestion
// Offsets are monotonically increasing, that is offsets[j] <=
offsets[j+1] for
```
this is necessary in order to say `for all 0 <= j < length`
##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -305,19 +310,198 @@ BinaryToBinaryCastExec(KernelContext* ctx, const
ExecSpan& batch, ExecResult* ou
ctx, input, out->array_data().get());
}
+// View -> Span
+template <typename O, typename I>
+enable_if_t<is_binary_view_like_type<I>::value &&
is_base_binary_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
out) {
+ using OutputBuilderType = typename TypeTraits<O>::BuilderType;
+ const CastOptions& options = checked_cast<const
CastState&>(*ctx->state()).options;
+ const ArraySpan& input = batch[0].array;
+
+ if constexpr (!I::is_utf8 && O::is_utf8) {
+ if (!options.allow_invalid_utf8) {
+ InitializeUTF8();
+ ArraySpanVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+ }
+
+ // XXX: a more efficient implementation that zero-copies the validity bitmap
+ // is possible, but requires a more complex implementation for building the
+ // offsets and data buffers
+ OutputBuilderType builder(options.to_type.GetSharedPtr(),
ctx->memory_pool());
+ RETURN_NOT_OK(builder.Resize(input.length));
+ arrow::internal::ArraySpanInlineVisitor<I> visitor;
+ RETURN_NOT_OK(visitor.VisitStatus(
+ input, [&](std::string_view v) { return builder.Append(v); },
+ [&]() {
+ builder.UnsafeAppendNull();
+ return Status::OK();
+ }));
+
+ std::shared_ptr<ArrayData> output_array;
+ RETURN_NOT_OK(builder.FinishInternal(&output_array));
+ out->value = std::move(output_array);
+ return Status::OK();
+}
+
+// Span -> View
+template <typename O, typename I>
+enable_if_t<is_base_binary_type<I>::value &&
is_binary_view_like_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
out) {
+ using offset_type = typename I::offset_type;
+ const CastOptions& options = checked_cast<const
CastState&>(*ctx->state()).options;
+ const ArraySpan& input = batch[0].array;
+
+ if constexpr (!I::is_utf8 && O::is_utf8) {
+ if (!options.allow_invalid_utf8) {
+ InitializeUTF8();
+ ArraySpanVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+ }
+
+ // Start with a zero-copy cast, then reconfigure the view and data buffers
+ RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
+ ArrayData* output = out->array_data().get();
+ auto offsets_buffer = std::move(output->buffers[1]);
+ auto data_buffer = std::move(output->buffers[2]);
+
+ const int64_t total_length = input.offset + input.length;
+ const auto* validity = input.GetValues<uint8_t>(0, 0);
+ const auto* input_offsets = input.GetValues<offset_type>(1);
+ const auto* input_data = input.GetValues<uint8_t>(2, 0);
+
+ // Turn buffers[1] into a buffer of empty BinaryViewType::c_type entries.
+ ARROW_ASSIGN_OR_RAISE(output->buffers[1],
+ ctx->Allocate(total_length * BinaryViewType::kSize));
+ memset(output->buffers[1]->mutable_data(), 0, total_length *
BinaryViewType::kSize);
+ auto* out_views = output->GetMutableValues<BinaryViewType::c_type>(1);
+
+ bool all_entries_are_inline = true;
+ VisitSetBitRunsVoid(
+ validity, output->offset, output->length,
+ [&](int64_t start_offset, int64_t run_length) {
+ for (int64_t i = start_offset; i < start_offset + run_length; i++) {
+ const offset_type data_offset = input_offsets[i];
+ const offset_type data_length = input_offsets[i + 1] - data_offset;
+ auto& out_view = out_views[i];
+ if (data_length <= BinaryViewType::kInlineSize) {
+ out_view.inlined.size = static_cast<int32_t>(data_length);
+ memcpy(out_view.inlined.data.data(), input_data + data_offset,
data_length);
+ } else {
+ out_view.ref.size = static_cast<int32_t>(data_length);
+ memcpy(out_view.ref.prefix.data(), input_data + data_offset,
+ BinaryViewType::kPrefixSize);
+ // out_view.ref.buffer_index = 0;
Review Comment:
I added some comments on the code. I think the idea works well in the sense
that it's correct and only wasteful if the input is already wasteful (like an
all-null array with long strings behind it). No need to worry about this case.
##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -305,19 +310,242 @@ BinaryToBinaryCastExec(KernelContext* ctx, const
ExecSpan& batch, ExecResult* ou
ctx, input, out->array_data().get());
}
+// String View -> Offset String
+template <typename O, typename I>
+enable_if_t<is_binary_view_like_type<I>::value &&
is_base_binary_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
out) {
+ using OutputBuilderType = typename TypeTraits<O>::BuilderType;
+ const CastOptions& options = checked_cast<const
CastState&>(*ctx->state()).options;
+ const ArraySpan& input = batch[0].array;
+
+ if constexpr (!I::is_utf8 && O::is_utf8) {
+ if (!options.allow_invalid_utf8) {
+ InitializeUTF8();
+ ArraySpanVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+ }
+
+ const int64_t sum_of_binary_view_sizes = util::SumOfBinaryViewSizes(
+ input.GetValues<BinaryViewType::c_type>(1), input.length);
+
+ // TODO(GH-43573): A more efficient implementation that copies the validity
+ // bitmap all at once is possible, but would mean we don't delegate all the
+ // building logic to the ArrayBuilder implementation for the output type.
+ OutputBuilderType builder(options.to_type.GetSharedPtr(),
ctx->memory_pool());
+ RETURN_NOT_OK(builder.Resize(input.length));
+ RETURN_NOT_OK(builder.ReserveData(sum_of_binary_view_sizes));
+ arrow::internal::ArraySpanInlineVisitor<I> visitor;
+ RETURN_NOT_OK(visitor.VisitStatus(
+ input,
+ [&](std::string_view v) {
+ // Append valid string view
+ return builder.Append(v);
+ },
+ [&]() {
+ // Append null
+ builder.UnsafeAppendNull();
+ return Status::OK();
+ }));
+
+ std::shared_ptr<ArrayData> output_array;
+ RETURN_NOT_OK(builder.FinishInternal(&output_array));
+ out->value = std::move(output_array);
+ return Status::OK();
+}
+
+// Offset String -> String View
+template <typename O, typename I>
+enable_if_t<is_base_binary_type<I>::value &&
is_binary_view_like_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
out) {
+ using offset_type = typename I::offset_type;
+ const CastOptions& options = checked_cast<const
CastState&>(*ctx->state()).options;
+ const ArraySpan& input = batch[0].array;
+
+ if constexpr (!I::is_utf8 && O::is_utf8) {
+ if (!options.allow_invalid_utf8) {
+ InitializeUTF8();
+ ArraySpanVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+ }
+
+ // Start with a zero-copy cast, then reconfigure the view and data buffers
+ RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
+ ArrayData* output = out->array_data().get();
+
+ const int64_t total_length = input.offset + input.length;
+ const auto* validity = input.GetValues<uint8_t>(0, 0);
+ const auto* input_offsets = input.GetValues<offset_type>(1);
+ const auto* input_data = input.GetValues<uint8_t>(2, 0);
+
+ // Turn buffers[1] into a buffer of empty BinaryViewType::c_type entries.
+ ARROW_ASSIGN_OR_RAISE(output->buffers[1],
+ ctx->Allocate(total_length * BinaryViewType::kSize));
+ memset(output->buffers[1]->mutable_data(), 0, total_length *
BinaryViewType::kSize);
+
+ // Check against offset overflow
+ if (total_length > 0) {
+ // Offsets must be monotonically increasing, that is offsets[j+1] >=
offsets[j] for
+ // 0 <= j < length, even for null slots. This property ensures the
location for all
+ // values is valid and well defined.
+ const int64_t max_data_offset = input_offsets[input.length];
+ if (ARROW_PREDICT_FALSE(max_data_offset >
std::numeric_limits<int32_t>::max())) {
+ // A more complicated loop could work by slicing the data buffer into
+ // more than one variadic buffer, but this is probably overkill for now
+ // before someone hits this problem in practice.
+ return Status::Invalid("Failed casting from ", input.type->ToString(), "
to ",
+ output->type->ToString(),
+ ": input array too large for efficient
conversion.");
+ }
+ }
Review Comment:
You can kill this whole block with an `if constexpr` against
`sizeof(offset_type)` -- if the `offset_type` is only 32 bits wide, overflow is
guaranteed not to happen by construction.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]