felipecrv commented on code in PR #43302:
URL: https://github.com/apache/arrow/pull/43302#discussion_r1704737082


##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -305,19 +310,198 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou
       ctx, input, out->array_data().get());
 }
 
+// View -> Span
+template <typename O, typename I>
+enable_if_t<is_binary_view_like_type<I>::value && is_base_binary_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
+  using OutputBuilderType = typename TypeTraits<O>::BuilderType;
+  const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
+  const ArraySpan& input = batch[0].array;
+
+  if constexpr (!I::is_utf8 && O::is_utf8) {
+    if (!options.allow_invalid_utf8) {
+      InitializeUTF8();
+      ArraySpanVisitor<I> visitor;
+      Utf8Validator validator;
+      RETURN_NOT_OK(visitor.Visit(input, &validator));
+    }
+  }
+
+  // XXX: a more efficient implementation that zero-copies the validity bitmap
+  // is possible, but requires a more complex implementation for building the
+  // offsets and data buffers
+  OutputBuilderType builder(options.to_type.GetSharedPtr(), ctx->memory_pool());
+  RETURN_NOT_OK(builder.Resize(input.length));
+  arrow::internal::ArraySpanInlineVisitor<I> visitor;
+  RETURN_NOT_OK(visitor.VisitStatus(
+      input, [&](std::string_view v) { return builder.Append(v); },
+      [&]() {
+        builder.UnsafeAppendNull();
+        return Status::OK();
+      }));
+
+  std::shared_ptr<ArrayData> output_array;
+  RETURN_NOT_OK(builder.FinishInternal(&output_array));
+  out->value = std::move(output_array);
+  return Status::OK();
+}
+
+// Span -> View
+template <typename O, typename I>
+enable_if_t<is_base_binary_type<I>::value && is_binary_view_like_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
+  using offset_type = typename I::offset_type;
+  const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
+  const ArraySpan& input = batch[0].array;
+
+  if constexpr (!I::is_utf8 && O::is_utf8) {
+    if (!options.allow_invalid_utf8) {
+      InitializeUTF8();
+      ArraySpanVisitor<I> visitor;
+      Utf8Validator validator;
+      RETURN_NOT_OK(visitor.Visit(input, &validator));
+    }
+  }
+
+  // Start with a zero-copy cast, then reconfigure the view and data buffers
+  RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
+  ArrayData* output = out->array_data().get();
+  auto offsets_buffer = std::move(output->buffers[1]);
+  auto data_buffer = std::move(output->buffers[2]);
+
+  const int64_t total_length = input.offset + input.length;
+  const auto* validity = input.GetValues<uint8_t>(0, 0);
+  const auto* input_offsets = input.GetValues<offset_type>(1);
+  const auto* input_data = input.GetValues<uint8_t>(2, 0);
+
+  // Turn buffers[1] into a buffer of empty BinaryViewType::c_type entries.
+  ARROW_ASSIGN_OR_RAISE(output->buffers[1],
+                        ctx->Allocate(total_length * BinaryViewType::kSize));
+  memset(output->buffers[1]->mutable_data(), 0, total_length * BinaryViewType::kSize);
+  auto* out_views = output->GetMutableValues<BinaryViewType::c_type>(1);
+
+  bool all_entries_are_inline = true;
+  VisitSetBitRunsVoid(
+      validity, output->offset, output->length,
+      [&](int64_t start_offset, int64_t run_length) {
+        for (int64_t i = start_offset; i < start_offset + run_length; i++) {
+          const offset_type data_offset = input_offsets[i];
+          const offset_type data_length = input_offsets[i + 1] - data_offset;
+          auto& out_view = out_views[i];
+          if (data_length <= BinaryViewType::kInlineSize) {
+            out_view.inlined.size = static_cast<int32_t>(data_length);
+            memcpy(out_view.inlined.data.data(), input_data + data_offset, data_length);
+          } else {
+            out_view.ref.size = static_cast<int32_t>(data_length);
+            memcpy(out_view.ref.prefix.data(), input_data + data_offset,
+                   BinaryViewType::kPrefixSize);
+            // out_view.ref.buffer_index = 0;
+            out_view.ref.offset = static_cast<int32_t>(data_offset);
+            // TODO(felipecrv): validate data_offsets can't overflow
+            all_entries_are_inline = false;
+          }

Review Comment:
   I check the offset overflow outside of the loop now, and I'm leveraging the fact that I can memset-0 the views and then write only the offset fields. This seems too specific to the fixed-size-string conversion *loop* to be worth extracting into a helper.
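
   For context, a minimal sketch of what such an out-of-loop overflow check could look like; `CheckViewOffsetFit` is a hypothetical name and this is not the PR's actual code, only `arrow::Status` and the 32-bit view offset field come from the discussion above:

   ```cpp
   // Hypothetical helper, not part of this PR: illustrates checking, outside
   // the conversion loop, that all source offsets fit in the 32-bit `offset`
   // field of BinaryViewType::c_type.
   #include <cstdint>
   #include <limits>

   #include "arrow/status.h"

   template <typename offset_type>
   arrow::Status CheckViewOffsetFit(const offset_type* offsets, int64_t length) {
     // Only 64-bit offsets (large_binary/large_utf8 inputs) can exceed int32_t;
     // for 32-bit offset types this branch compiles away entirely.
     if constexpr (sizeof(offset_type) > sizeof(int32_t)) {
       // Offsets are non-decreasing, so the final end offset bounds all of them.
       if (length > 0 &&
           offsets[length] >
               static_cast<offset_type>(std::numeric_limits<int32_t>::max())) {
         return arrow::Status::Invalid(
             "Data offsets do not fit in 32-bit binary-view offsets");
       }
     }
     return arrow::Status::OK();
   }
   ```

   A check like this could run once before the `VisitSetBitRunsVoid` loop rather than per element.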


