pitrou commented on code in PR #43302:
URL: https://github.com/apache/arrow/pull/43302#discussion_r1686273056
##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -305,19 +310,198 @@ BinaryToBinaryCastExec(KernelContext* ctx, const
ExecSpan& batch, ExecResult* ou
ctx, input, out->array_data().get());
}
+// View -> Span
+template <typename O, typename I>
+enable_if_t<is_binary_view_like_type<I>::value &&
is_base_binary_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
out) {
+ using OutputBuilderType = typename TypeTraits<O>::BuilderType;
+ const CastOptions& options = checked_cast<const
CastState&>(*ctx->state()).options;
+ const ArraySpan& input = batch[0].array;
+
+ if constexpr (!I::is_utf8 && O::is_utf8) {
+ if (!options.allow_invalid_utf8) {
+ InitializeUTF8();
+ ArraySpanVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+ }
+
+ // XXX: a more efficient implementation that zero-copies the validity bitmap
+ // is possible, but requires a more complex implementation for building the
+ // offsets and data buffers
+ OutputBuilderType builder(options.to_type.GetSharedPtr(),
ctx->memory_pool());
+ RETURN_NOT_OK(builder.Resize(input.length));
Review Comment:
Is it possible to presize the data as well, in addition to the offsets?
##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -305,19 +310,198 @@ BinaryToBinaryCastExec(KernelContext* ctx, const
ExecSpan& batch, ExecResult* ou
ctx, input, out->array_data().get());
}
+// View -> Span
+template <typename O, typename I>
+enable_if_t<is_binary_view_like_type<I>::value &&
is_base_binary_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
out) {
+ using OutputBuilderType = typename TypeTraits<O>::BuilderType;
+ const CastOptions& options = checked_cast<const
CastState&>(*ctx->state()).options;
+ const ArraySpan& input = batch[0].array;
+
+ if constexpr (!I::is_utf8 && O::is_utf8) {
+ if (!options.allow_invalid_utf8) {
+ InitializeUTF8();
+ ArraySpanVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+ }
+
+ // XXX: a more efficient implementation that zero-copies the validity bitmap
+ // is possible, but requires a more complex implementation for building the
+ // offsets and data buffers
+ OutputBuilderType builder(options.to_type.GetSharedPtr(),
ctx->memory_pool());
+ RETURN_NOT_OK(builder.Resize(input.length));
+ arrow::internal::ArraySpanInlineVisitor<I> visitor;
+ RETURN_NOT_OK(visitor.VisitStatus(
+ input, [&](std::string_view v) { return builder.Append(v); },
+ [&]() {
+ builder.UnsafeAppendNull();
+ return Status::OK();
+ }));
+
+ std::shared_ptr<ArrayData> output_array;
+ RETURN_NOT_OK(builder.FinishInternal(&output_array));
+ out->value = std::move(output_array);
+ return Status::OK();
+}
+
+// Span -> View
+template <typename O, typename I>
+enable_if_t<is_base_binary_type<I>::value &&
is_binary_view_like_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
out) {
+ using offset_type = typename I::offset_type;
+ const CastOptions& options = checked_cast<const
CastState&>(*ctx->state()).options;
+ const ArraySpan& input = batch[0].array;
+
+ if constexpr (!I::is_utf8 && O::is_utf8) {
+ if (!options.allow_invalid_utf8) {
+ InitializeUTF8();
+ ArraySpanVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+ }
+
+ // Start with a zero-copy cast, then reconfigure the view and data buffers
+ RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
+ ArrayData* output = out->array_data().get();
+ auto offsets_buffer = std::move(output->buffers[1]);
+ auto data_buffer = std::move(output->buffers[2]);
+
+ const int64_t total_length = input.offset + input.length;
+ const auto* validity = input.GetValues<uint8_t>(0, 0);
+ const auto* input_offsets = input.GetValues<offset_type>(1);
+ const auto* input_data = input.GetValues<uint8_t>(2, 0);
+
+ // Turn buffers[1] into a buffer of empty BinaryViewType::c_type entries.
+ ARROW_ASSIGN_OR_RAISE(output->buffers[1],
+ ctx->Allocate(total_length * BinaryViewType::kSize));
+ memset(output->buffers[1]->mutable_data(), 0, total_length *
BinaryViewType::kSize);
+ auto* out_views = output->GetMutableValues<BinaryViewType::c_type>(1);
+
+ bool all_entries_are_inline = true;
+ VisitSetBitRunsVoid(
+ validity, output->offset, output->length,
+ [&](int64_t start_offset, int64_t run_length) {
+ for (int64_t i = start_offset; i < start_offset + run_length; i++) {
+ const offset_type data_offset = input_offsets[i];
+ const offset_type data_length = input_offsets[i + 1] - data_offset;
+ auto& out_view = out_views[i];
+ if (data_length <= BinaryViewType::kInlineSize) {
+ out_view.inlined.size = static_cast<int32_t>(data_length);
+ memcpy(out_view.inlined.data.data(), input_data + data_offset,
data_length);
+ } else {
+ out_view.ref.size = static_cast<int32_t>(data_length);
+ memcpy(out_view.ref.prefix.data(), input_data + data_offset,
+ BinaryViewType::kPrefixSize);
+ // out_view.ref.buffer_index = 0;
+ out_view.ref.offset = static_cast<int32_t>(data_offset);
+ // TODO(felipecrv): validate data_offsets can't overflow
Review Comment:
Is this a TODO for this PR? Otherwise, perhaps create a GH issue for it.
##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -286,17 +288,20 @@ Status CastBinaryToBinaryOffsets<int64_t,
int32_t>(KernelContext* ctx,
}
}
+// Span -> Span
Review Comment:
Why "span"? Did you mean "string"? Or perhaps "Non-view"?
##########
cpp/src/arrow/compute/kernels/scalar_cast_test.cc:
##########
@@ -97,7 +97,10 @@ static std::vector<std::shared_ptr<DataType>>
kDictionaryIndexTypes = kIntegerTy
static std::vector<std::shared_ptr<DataType>> kBaseBinaryTypes = {
Review Comment:
It seems you should also change `StringToBoolean` and other similar
functions to include view types as inputs?
##########
cpp/src/arrow/type_traits.h:
##########
@@ -1624,6 +1639,16 @@ static inline bool is_binary(const DataType& type) {
return is_binary(type.id())
/// Convenience for checking using the type's id
static inline bool is_string(const DataType& type) { return
is_string(type.id()); }
+/// \brief Check for a binary-view-like type
+///
+/// \param[in] type the type to check
+/// \return whether type is a binary-view-like type
+///
+/// Convenience for checking using the type's id
+static inline bool is_binary_view_like(const DataType& type) {
Review Comment:
By the way, should all the `static inline bool is_XXX(const DataType& type)`
predicates be turned into `constexpr`? This probably needs `DataType::id()` to
be made `constexpr` as well.
##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -305,19 +310,198 @@ BinaryToBinaryCastExec(KernelContext* ctx, const
ExecSpan& batch, ExecResult* ou
ctx, input, out->array_data().get());
}
+// View -> Span
+template <typename O, typename I>
+enable_if_t<is_binary_view_like_type<I>::value &&
is_base_binary_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
out) {
+ using OutputBuilderType = typename TypeTraits<O>::BuilderType;
+ const CastOptions& options = checked_cast<const
CastState&>(*ctx->state()).options;
+ const ArraySpan& input = batch[0].array;
+
+ if constexpr (!I::is_utf8 && O::is_utf8) {
+ if (!options.allow_invalid_utf8) {
+ InitializeUTF8();
+ ArraySpanVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+ }
+
+ // XXX: a more efficient implementation that zero-copies the validity bitmap
+ // is possible, but requires a more complex implementation for building the
+ // offsets and data buffers
Review Comment:
Because of a possible input offset? Even without zero-copying, `CopyBitmap`
would be much faster than appending validity bits one by one. That said, this
can be left to another GH issue (if we care at all).
##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -305,19 +310,198 @@ BinaryToBinaryCastExec(KernelContext* ctx, const
ExecSpan& batch, ExecResult* ou
ctx, input, out->array_data().get());
}
+// View -> Span
+template <typename O, typename I>
+enable_if_t<is_binary_view_like_type<I>::value &&
is_base_binary_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
out) {
+ using OutputBuilderType = typename TypeTraits<O>::BuilderType;
+ const CastOptions& options = checked_cast<const
CastState&>(*ctx->state()).options;
+ const ArraySpan& input = batch[0].array;
+
+ if constexpr (!I::is_utf8 && O::is_utf8) {
+ if (!options.allow_invalid_utf8) {
+ InitializeUTF8();
+ ArraySpanVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+ }
+
+ // XXX: a more efficient implementation that zero-copies the validity bitmap
+ // is possible, but requires a more complex implementation for building the
+ // offsets and data buffers
+ OutputBuilderType builder(options.to_type.GetSharedPtr(),
ctx->memory_pool());
+ RETURN_NOT_OK(builder.Resize(input.length));
+ arrow::internal::ArraySpanInlineVisitor<I> visitor;
+ RETURN_NOT_OK(visitor.VisitStatus(
+ input, [&](std::string_view v) { return builder.Append(v); },
+ [&]() {
+ builder.UnsafeAppendNull();
+ return Status::OK();
+ }));
+
+ std::shared_ptr<ArrayData> output_array;
+ RETURN_NOT_OK(builder.FinishInternal(&output_array));
+ out->value = std::move(output_array);
+ return Status::OK();
+}
+
+// Span -> View
+template <typename O, typename I>
+enable_if_t<is_base_binary_type<I>::value &&
is_binary_view_like_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
out) {
+ using offset_type = typename I::offset_type;
+ const CastOptions& options = checked_cast<const
CastState&>(*ctx->state()).options;
+ const ArraySpan& input = batch[0].array;
+
+ if constexpr (!I::is_utf8 && O::is_utf8) {
+ if (!options.allow_invalid_utf8) {
+ InitializeUTF8();
+ ArraySpanVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+ }
+
+ // Start with a zero-copy cast, then reconfigure the view and data buffers
+ RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
+ ArrayData* output = out->array_data().get();
+ auto offsets_buffer = std::move(output->buffers[1]);
+ auto data_buffer = std::move(output->buffers[2]);
+
+ const int64_t total_length = input.offset + input.length;
+ const auto* validity = input.GetValues<uint8_t>(0, 0);
+ const auto* input_offsets = input.GetValues<offset_type>(1);
+ const auto* input_data = input.GetValues<uint8_t>(2, 0);
+
+ // Turn buffers[1] into a buffer of empty BinaryViewType::c_type entries.
+ ARROW_ASSIGN_OR_RAISE(output->buffers[1],
+ ctx->Allocate(total_length * BinaryViewType::kSize));
+ memset(output->buffers[1]->mutable_data(), 0, total_length *
BinaryViewType::kSize);
+ auto* out_views = output->GetMutableValues<BinaryViewType::c_type>(1);
+
+ bool all_entries_are_inline = true;
+ VisitSetBitRunsVoid(
+ validity, output->offset, output->length,
+ [&](int64_t start_offset, int64_t run_length) {
+ for (int64_t i = start_offset; i < start_offset + run_length; i++) {
+ const offset_type data_offset = input_offsets[i];
+ const offset_type data_length = input_offsets[i + 1] - data_offset;
+ auto& out_view = out_views[i];
+ if (data_length <= BinaryViewType::kInlineSize) {
+ out_view.inlined.size = static_cast<int32_t>(data_length);
+ memcpy(out_view.inlined.data.data(), input_data + data_offset,
data_length);
+ } else {
+ out_view.ref.size = static_cast<int32_t>(data_length);
+ memcpy(out_view.ref.prefix.data(), input_data + data_offset,
+ BinaryViewType::kPrefixSize);
+ // out_view.ref.buffer_index = 0;
Review Comment:
Is it meant to be left commented out?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]