lidavidm commented on a change in pull request #12248:
URL: https://github.com/apache/arrow/pull/12248#discussion_r803250366
##########
File path: cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
##########
@@ -150,6 +150,104 @@ void AddListCast(CastFunction* func) {
DCHECK_OK(func->AddKernel(SrcType::type_id, std::move(kernel)));
}
+struct CastStruct {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const CastOptions& options = CastState::Get(ctx);
+ const auto in_field_count =
+ checked_cast<const StructType&>(*batch[0].type()).num_fields();
+ const auto out_field_count =
+ checked_cast<const StructType&>(*out->type()).num_fields();
+
+    if (in_field_count != out_field_count) {
+      return Status::TypeError("struct field sizes do not match: ",
+                               batch[0].type()->ToString(), " ", out->type()->ToString());
+    }
+
+ for (int i = 0; i < in_field_count; ++i) {
+ const auto in_field_name =
+ checked_cast<const StructType&>(*batch[0].type()).field(i)->name();
+ const auto out_field_name =
+ checked_cast<const StructType&>(*out->type()).field(i)->name();
+      if (in_field_name != out_field_name) {
+        return Status::TypeError(
+            "struct field names do not match: ", batch[0].type()->ToString(), " ",
+            out->type()->ToString());
+      }
+
+      const auto in_field_nullable =
+          checked_cast<const StructType&>(*batch[0].type()).field(i)->nullable();
+      const auto out_field_nullable =
+          checked_cast<const StructType&>(*out->type()).field(i)->nullable();
+
+      if (in_field_nullable && !out_field_nullable) {
+        return Status::TypeError("cannot cast nullable struct to non-nullable struct: ",
+                                 batch[0].type()->ToString(), " ",
+                                 out->type()->ToString());
+      }
+ }
+
+ for (int i = 0; i < in_field_count; ++i) {
+ const auto in_field_name =
+ checked_cast<const StructType&>(*batch[0].type()).field(i)->name();
+ const auto out_field_name =
+ checked_cast<const StructType&>(*out->type()).field(i)->name();
+      if (in_field_name != out_field_name) {
+        return Status::TypeError(
+            "struct field names do not match: ", batch[0].type()->ToString(), " ",
+            out->type()->ToString());
+      }
+ }
+
+ if (out->kind() == Datum::SCALAR) {
+      const auto& in_scalar = checked_cast<const StructScalar&>(*batch[0].scalar());
+      auto out_scalar = checked_cast<StructScalar*>(out->scalar().get());
+
+ DCHECK(!out_scalar->is_valid);
+ if (in_scalar.is_valid) {
+ for (int i = 0; i < in_field_count; i++) {
+ auto values = in_scalar.value[i];
+ auto target_type = out->type()->field(i)->type();
+          ARROW_ASSIGN_OR_RAISE(Datum cast_values,
+                                Cast(values, target_type, options, ctx->exec_context()));
+ DCHECK_EQ(Datum::SCALAR, cast_values.kind());
+ out_scalar->value.push_back(cast_values.scalar());
+ }
+ out_scalar->is_valid = true;
+ }
+ return Status::OK();
+ }
+
+ const ArrayData& in_array = *batch[0].array();
+ ArrayData* out_array = out->mutable_array();
+ out_array->buffers = in_array.buffers;
Review comment:
Though — sorry — it's probably better to slice the children rather than
copy the offset, and to slice the bitmap in `in_array.buffers` when copying. The
reason is that if we `Cast` a large array that has been sliced, the current code
will cast the _entire_ child array, even though we only care about the sliced
part. Does that make sense?
The null bitmap can be copied with CopyBitmap:
https://github.com/apache/arrow/blob/ec38aebb36e99e54e69089cbc6a623a616575dde/cpp/src/arrow/util/bitmap_ops.h#L44-L46
It would be something like
```
if (in_array.buffers[0]) {
  ARROW_ASSIGN_OR_RAISE(out_array->buffers[0],
                        CopyBitmap(ctx->memory_pool(), in_array.buffers[0]->data(),
                                   in_array.offset, in_array.length));
}
```
Though, frankly, it _should_ work below to set `kernel.null_handling` to
`INTERSECTION` and then the compute infra will compute the null bitmap for you.
(And you shouldn't have to mess with `out_array->buffers` or
`out_array->offset`.)
##########
File path: cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
##########
@@ -150,6 +150,104 @@ void AddListCast(CastFunction* func) {
DCHECK_OK(func->AddKernel(SrcType::type_id, std::move(kernel)));
}
+struct CastStruct {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const CastOptions& options = CastState::Get(ctx);
+ const auto in_field_count =
+ checked_cast<const StructType&>(*batch[0].type()).num_fields();
+ const auto out_field_count =
+ checked_cast<const StructType&>(*out->type()).num_fields();
+
+    if (in_field_count != out_field_count) {
+      return Status::TypeError("struct field sizes do not match: ",
+                               batch[0].type()->ToString(), " ", out->type()->ToString());
+    }
+
+ for (int i = 0; i < in_field_count; ++i) {
+ const auto in_field_name =
+ checked_cast<const StructType&>(*batch[0].type()).field(i)->name();
+ const auto out_field_name =
+ checked_cast<const StructType&>(*out->type()).field(i)->name();
+      if (in_field_name != out_field_name) {
+        return Status::TypeError(
+            "struct field names do not match: ", batch[0].type()->ToString(), " ",
+            out->type()->ToString());
+      }
+
+      const auto in_field_nullable =
+          checked_cast<const StructType&>(*batch[0].type()).field(i)->nullable();
+      const auto out_field_nullable =
+          checked_cast<const StructType&>(*out->type()).field(i)->nullable();
+
+      if (in_field_nullable && !out_field_nullable) {
+        return Status::TypeError("cannot cast nullable struct to non-nullable struct: ",
+                                 batch[0].type()->ToString(), " ",
+                                 out->type()->ToString());
+      }
+ }
+
+ for (int i = 0; i < in_field_count; ++i) {
+ const auto in_field_name =
+ checked_cast<const StructType&>(*batch[0].type()).field(i)->name();
+ const auto out_field_name =
+ checked_cast<const StructType&>(*out->type()).field(i)->name();
+      if (in_field_name != out_field_name) {
+        return Status::TypeError(
+            "struct field names do not match: ", batch[0].type()->ToString(), " ",
+            out->type()->ToString());
+      }
+ }
+
+ if (out->kind() == Datum::SCALAR) {
+      const auto& in_scalar = checked_cast<const StructScalar&>(*batch[0].scalar());
+      auto out_scalar = checked_cast<StructScalar*>(out->scalar().get());
+
+ DCHECK(!out_scalar->is_valid);
+ if (in_scalar.is_valid) {
+ for (int i = 0; i < in_field_count; i++) {
+ auto values = in_scalar.value[i];
+ auto target_type = out->type()->field(i)->type();
+          ARROW_ASSIGN_OR_RAISE(Datum cast_values,
+                                Cast(values, target_type, options, ctx->exec_context()));
+ DCHECK_EQ(Datum::SCALAR, cast_values.kind());
+ out_scalar->value.push_back(cast_values.scalar());
+ }
+ out_scalar->is_valid = true;
+ }
+ return Status::OK();
+ }
+
+ const ArrayData& in_array = *batch[0].array();
+ ArrayData* out_array = out->mutable_array();
+ out_array->buffers = in_array.buffers;
Review comment:
This looks good now — since we're no longer slicing the child and copying
over the out offset, it should pass.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]