mapleFU commented on code in PR #43302:
URL: https://github.com/apache/arrow/pull/43302#discussion_r1728199107
##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -305,19 +310,198 @@ BinaryToBinaryCastExec(KernelContext* ctx, const
ExecSpan& batch, ExecResult* ou
ctx, input, out->array_data().get());
}
+// View -> Span
+template <typename O, typename I>
+enable_if_t<is_binary_view_like_type<I>::value &&
is_base_binary_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
out) {
+ using OutputBuilderType = typename TypeTraits<O>::BuilderType;
+ const CastOptions& options = checked_cast<const
CastState&>(*ctx->state()).options;
+ const ArraySpan& input = batch[0].array;
+
+ if constexpr (!I::is_utf8 && O::is_utf8) {
+ if (!options.allow_invalid_utf8) {
+ InitializeUTF8();
+ ArraySpanVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+ }
+
+ // XXX: a more efficient implementation that zero-copies the validity bitmap
+ // is possible, but requires a more complex implementation for building the
+ // offsets and data buffers
+ OutputBuilderType builder(options.to_type.GetSharedPtr(),
ctx->memory_pool());
+ RETURN_NOT_OK(builder.Resize(input.length));
+ arrow::internal::ArraySpanInlineVisitor<I> visitor;
+ RETURN_NOT_OK(visitor.VisitStatus(
+ input, [&](std::string_view v) { return builder.Append(v); },
+ [&]() {
+ builder.UnsafeAppendNull();
+ return Status::OK();
+ }));
+
+ std::shared_ptr<ArrayData> output_array;
+ RETURN_NOT_OK(builder.FinishInternal(&output_array));
+ out->value = std::move(output_array);
+ return Status::OK();
+}
+
+// Span -> View
+template <typename O, typename I>
+enable_if_t<is_base_binary_type<I>::value &&
is_binary_view_like_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
out) {
+ using offset_type = typename I::offset_type;
+ const CastOptions& options = checked_cast<const
CastState&>(*ctx->state()).options;
+ const ArraySpan& input = batch[0].array;
+
+ if constexpr (!I::is_utf8 && O::is_utf8) {
+ if (!options.allow_invalid_utf8) {
+ InitializeUTF8();
+ ArraySpanVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+ }
+
+ // Start with a zero-copy cast, then reconfigure the view and data buffers
+ RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
+ ArrayData* output = out->array_data().get();
+ auto offsets_buffer = std::move(output->buffers[1]);
+ auto data_buffer = std::move(output->buffers[2]);
+
+ const int64_t total_length = input.offset + input.length;
+ const auto* validity = input.GetValues<uint8_t>(0, 0);
+ const auto* input_offsets = input.GetValues<offset_type>(1);
+ const auto* input_data = input.GetValues<uint8_t>(2, 0);
+
+ // Turn buffers[1] into a buffer of empty BinaryViewType::c_type entries.
+ ARROW_ASSIGN_OR_RAISE(output->buffers[1],
+ ctx->Allocate(total_length * BinaryViewType::kSize));
+ memset(output->buffers[1]->mutable_data(), 0, total_length *
BinaryViewType::kSize);
+ auto* out_views = output->GetMutableValues<BinaryViewType::c_type>(1);
+
+ bool all_entries_are_inline = true;
+ VisitSetBitRunsVoid(
+ validity, output->offset, output->length,
+ [&](int64_t start_offset, int64_t run_length) {
+ for (int64_t i = start_offset; i < start_offset + run_length; i++) {
+ const offset_type data_offset = input_offsets[i];
+ const offset_type data_length = input_offsets[i + 1] - data_offset;
+ auto& out_view = out_views[i];
+ if (data_length <= BinaryViewType::kInlineSize) {
+ out_view.inlined.size = static_cast<int32_t>(data_length);
+ memcpy(out_view.inlined.data.data(), input_data + data_offset,
data_length);
+ } else {
+ out_view.ref.size = static_cast<int32_t>(data_length);
+ memcpy(out_view.ref.prefix.data(), input_data + data_offset,
+ BinaryViewType::kPrefixSize);
+ // out_view.ref.buffer_index = 0;
Review Comment:
I'm using this:
```c++
// Check against offset overflow
if (total_length > 0) {
// Offsets must be monotonically increasing, that is offsets[j+1] >=
offsets[j] for
// 0 <= j < length, even for null slots. This property ensures the
location for all
// values is valid and well defined.
const int64_t max_data_offset = input_offsets[input.length];
if (ARROW_PREDICT_FALSE(max_data_offset >
std::numeric_limits<int32_t>::max())) {
// A more complicated loop could work by slicing the data buffer into
// more than one variadic buffer, but this is probably overkill for now
// before someone hits this problem in practice.
return Status::Invalid("Failed casting from ", input.type->ToString(),
" to ",
output->type->ToString(),
": input array too large for efficient
conversion.");
}
}
```
Pro:
1. No extra loop is needed, and it is as simple as the "FIXED->Offset" one
Cons:
1. For the all-null case, if the null strings have a large data buffer, this
might not work well
Do you think this is ok?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]