pitrou commented on code in PR #14198:
URL: https://github.com/apache/arrow/pull/14198#discussion_r980924631


##########
cpp/src/arrow/compute/cast.cc:
##########
@@ -96,8 +96,11 @@ class CastMetaFunction : public MetaFunction {
     ARROW_ASSIGN_OR_RAISE(auto cast_options, ValidateOptions(options));
     // args[0].type() could be a nullptr so check for that before
     // we do anything with it.
+    // TODO: if types are equal except for field names of list types, we can

Review Comment:
   Do you want to open a JIRA for this TODO?



##########
cpp/src/arrow/compute/kernels/scalar_cast_nested.cc:
##########
@@ -40,25 +40,22 @@ namespace {
 
 // (Large)List<T> -> (Large)List<U>
 
-template <typename SrcType, typename DestType>
-typename std::enable_if<SrcType::type_id == DestType::type_id, Status>::type
-CastListOffsets(KernelContext* ctx, const ArraySpan& in_array, ArrayData* 
out_array) {
-  return Status::OK();
-}
-
 // TODO(wesm): memory could be preallocated here and it would make
 // things simpler
 template <typename SrcType, typename DestType>
-typename std::enable_if<SrcType::type_id != DestType::type_id, Status>::type
-CastListOffsets(KernelContext* ctx, const ArraySpan& in_array, ArrayData* 
out_array) {
+Status CastListOffsets(KernelContext* ctx, const ArraySpan& in_array,
+                       ArrayData* out_array) {
   using src_offset_type = typename SrcType::offset_type;
   using dest_offset_type = typename DestType::offset_type;
 
-  ARROW_ASSIGN_OR_RAISE(out_array->buffers[1],
-                        ctx->Allocate(sizeof(dest_offset_type) * 
(in_array.length + 1)));
-  ::arrow::internal::CastInts(in_array.GetValues<src_offset_type>(1),
-                              out_array->GetMutableValues<dest_offset_type>(1),
-                              in_array.length + 1);
+  if constexpr (!std::is_same<src_offset_type, dest_offset_type>::value) {

Review Comment:
   Woohoo!



##########
cpp/src/arrow/datum.cc:
##########
@@ -177,6 +177,30 @@ bool Datum::Equals(const Datum& other) const {
   }
 }
 
+Result<Datum> Datum::View(const std::shared_ptr<DataType>& out_type) const {
+  if (this->is_scalar()) {
+    std::shared_ptr<Array> tmp_array;
+    ARROW_ASSIGN_OR_RAISE(
+        tmp_array, MakeArrayFromScalar(*this->scalar().get(), 1, 
default_memory_pool()));
+    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrayData> array,
+                          ::arrow::internal::GetArrayView(tmp_array->data(), 
out_type));
+    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> out, 
MakeArray(array)->GetScalar(0));
+    return Datum(out);
+  } else if (this->is_array()) {
+    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrayData> array,
+                          ::arrow::internal::GetArrayView(this->array(), 
out_type));
+    return Datum(array);
+  } else if (this->is_chunked_array()) {
+    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ChunkedArray> array,
+                          this->chunked_array()->View(out_type));
+    return Datum(array);
+  } else if (this->kind() == Datum::NONE) {
+    return Datum();
+  } else {
+    return Status::Invalid("Cannot cast tabular structure to a single data 
type.");

Review Comment:
   `TypeError` instead?



##########
cpp/src/arrow/compute/kernels/scalar_cast_nested.cc:
##########
@@ -206,6 +211,75 @@ void AddStructToStructCast(CastFunction* func) {
   DCHECK_OK(func->AddKernel(StructType::type_id, std::move(kernel)));
 }
 
+template <typename DestType>
+struct CastMap {
+  using CastListImpl = CastList<MapType, DestType>;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) {
+    const CastOptions& options = CastState::Get(ctx);
+
+    std::shared_ptr<DataType> entry_type =
+        checked_cast<const DestType&>(*out->type()).value_type();
+    // Assert is struct with two fields
+    if (!(entry_type->id() == Type::STRUCT && entry_type->num_fields() == 2)) {
+      return Status::TypeError(
+          "Map type must be cast to a list<struct> with exactly two fields.");
+    }
+    std::shared_ptr<DataType> key_type = entry_type->field(0)->type();
+    std::shared_ptr<DataType> value_type = entry_type->field(1)->type();
+
+    const ArraySpan& in_array = batch[0].array;
+
+    ArrayData* out_array = out->array_data().get();
+    out_array->buffers[0] = in_array.GetBuffer(0);
+    out_array->buffers[1] = in_array.GetBuffer(1);
+
+    std::shared_ptr<ArrayData> entries = in_array.child_data[0].ToArrayData();
+
+    RETURN_NOT_OK(CastListImpl::HandleOffsets(ctx, in_array, out_array, 
&entries));
+
+    // Handle keys
+    const std::shared_ptr<ArrayData>& keys =
+        entries->child_data[0]->Slice(entries->offset, entries->length);
+    ARROW_ASSIGN_OR_RAISE(Datum cast_keys,
+                          Cast(keys, key_type, options, ctx->exec_context()));
+    DCHECK(cast_keys.is_array());
+
+    // Handle values
+    const std::shared_ptr<ArrayData>& values =
+        entries->child_data[1]->Slice(entries->offset, entries->length);
+    ARROW_ASSIGN_OR_RAISE(Datum cast_values,
+                          Cast(values, value_type, options, 
ctx->exec_context()));
+    DCHECK(cast_values.is_array());
+
+    // Copy null bitmap
+    std::shared_ptr<Buffer> null_bitmap;
+    if (in_array.buffers[0].data != nullptr) {

Review Comment:
   Or perhaps you mean to use `entries.buffers[0].data` instead?



##########
cpp/src/arrow/compute/kernels/scalar_cast_nested.cc:
##########
@@ -206,6 +211,75 @@ void AddStructToStructCast(CastFunction* func) {
   DCHECK_OK(func->AddKernel(StructType::type_id, std::move(kernel)));
 }
 
+template <typename DestType>
+struct CastMap {
+  using CastListImpl = CastList<MapType, DestType>;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) {
+    const CastOptions& options = CastState::Get(ctx);
+
+    std::shared_ptr<DataType> entry_type =
+        checked_cast<const DestType&>(*out->type()).value_type();
+    // Assert is struct with two fields
+    if (!(entry_type->id() == Type::STRUCT && entry_type->num_fields() == 2)) {
+      return Status::TypeError(
+          "Map type must be cast to a list<struct> with exactly two fields.");
+    }
+    std::shared_ptr<DataType> key_type = entry_type->field(0)->type();
+    std::shared_ptr<DataType> value_type = entry_type->field(1)->type();
+
+    const ArraySpan& in_array = batch[0].array;
+
+    ArrayData* out_array = out->array_data().get();
+    out_array->buffers[0] = in_array.GetBuffer(0);
+    out_array->buffers[1] = in_array.GetBuffer(1);
+
+    std::shared_ptr<ArrayData> entries = in_array.child_data[0].ToArrayData();
+
+    RETURN_NOT_OK(CastListImpl::HandleOffsets(ctx, in_array, out_array, 
&entries));
+
+    // Handle keys
+    const std::shared_ptr<ArrayData>& keys =
+        entries->child_data[0]->Slice(entries->offset, entries->length);
+    ARROW_ASSIGN_OR_RAISE(Datum cast_keys,
+                          Cast(keys, key_type, options, ctx->exec_context()));
+    DCHECK(cast_keys.is_array());
+
+    // Handle values
+    const std::shared_ptr<ArrayData>& values =
+        entries->child_data[1]->Slice(entries->offset, entries->length);
+    ARROW_ASSIGN_OR_RAISE(Datum cast_values,
+                          Cast(values, value_type, options, 
ctx->exec_context()));
+    DCHECK(cast_values.is_array());
+
+    // Copy null bitmap
+    std::shared_ptr<Buffer> null_bitmap;
+    if (in_array.buffers[0].data != nullptr) {
+      ARROW_ASSIGN_OR_RAISE(null_bitmap,
+                            CopyBitmap(ctx->memory_pool(), 
in_array.buffers[0].data,
+                                       in_array.offset, in_array.length));

Review Comment:
   Is copying necessary if the offset is 0?



##########
cpp/src/arrow/compute/kernels/scalar_cast_nested.cc:
##########
@@ -206,6 +211,75 @@ void AddStructToStructCast(CastFunction* func) {
   DCHECK_OK(func->AddKernel(StructType::type_id, std::move(kernel)));
 }
 
+template <typename DestType>
+struct CastMap {
+  using CastListImpl = CastList<MapType, DestType>;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) {
+    const CastOptions& options = CastState::Get(ctx);
+
+    std::shared_ptr<DataType> entry_type =
+        checked_cast<const DestType&>(*out->type()).value_type();
+    // Assert is struct with two fields
+    if (!(entry_type->id() == Type::STRUCT && entry_type->num_fields() == 2)) {
+      return Status::TypeError(
+          "Map type must be cast to a list<struct> with exactly two fields.");
+    }
+    std::shared_ptr<DataType> key_type = entry_type->field(0)->type();
+    std::shared_ptr<DataType> value_type = entry_type->field(1)->type();
+
+    const ArraySpan& in_array = batch[0].array;
+
+    ArrayData* out_array = out->array_data().get();
+    out_array->buffers[0] = in_array.GetBuffer(0);
+    out_array->buffers[1] = in_array.GetBuffer(1);
+
+    std::shared_ptr<ArrayData> entries = in_array.child_data[0].ToArrayData();
+
+    RETURN_NOT_OK(CastListImpl::HandleOffsets(ctx, in_array, out_array, 
&entries));
+
+    // Handle keys
+    const std::shared_ptr<ArrayData>& keys =
+        entries->child_data[0]->Slice(entries->offset, entries->length);
+    ARROW_ASSIGN_OR_RAISE(Datum cast_keys,
+                          Cast(keys, key_type, options, ctx->exec_context()));
+    DCHECK(cast_keys.is_array());
+
+    // Handle values
+    const std::shared_ptr<ArrayData>& values =
+        entries->child_data[1]->Slice(entries->offset, entries->length);
+    ARROW_ASSIGN_OR_RAISE(Datum cast_values,
+                          Cast(values, value_type, options, 
ctx->exec_context()));
+    DCHECK(cast_values.is_array());
+
+    // Copy null bitmap
+    std::shared_ptr<Buffer> null_bitmap;
+    if (in_array.buffers[0].data != nullptr) {

Review Comment:
   I'm curious: why do this after you assigned to `out_array->buffers` above?



##########
cpp/src/arrow/compute/kernels/scalar_cast_test.cc:
##########
@@ -2209,6 +2209,41 @@ TEST(Cast, ListToListOptionsPassthru) {
   }
 }
 
+TEST(Cast, MapToMapFieldNames) {
+  std::shared_ptr<DataType> src_type = map(utf8(), field("x", list(field("a", 
int64()))));
+  std::shared_ptr<DataType> dst_type = map(utf8(), field("y", list(field("b", 
int64()))));
+
+  std::shared_ptr<Array> src =
+      ArrayFromJSON(src_type, "[[[\"1\", [1,2,3]]], [[\"2\", [4,5,6]]]]");
+  std::shared_ptr<Array> dst =
+      ArrayFromJSON(dst_type, "[[[\"1\", [1,2,3]]], [[\"2\", [4,5,6]]]]");
+
+  CheckCast(src, dst);
+
+  src_type = map(utf8(), field("x", int64()));
+  dst_type = map(utf8(), field("y", int64()));
+
+  src = ArrayFromJSON(src_type, "[[[\"1\", 1]], [[\"2\", 6]], [[\"3\", 36]]]");
+  dst = ArrayFromJSON(dst_type, "[[[\"1\", 1]], [[\"2\", 6]], [[\"3\", 36]]]");
+
+  CheckCast(src, dst);
+
+  src_type = map(int32(), int32());
+  dst_type = list(struct_({field("a", int32()), field("b", int64())}));
+
+  src = ArrayFromJSON(src_type, "[[[1, 1]], [[1, 6]], [[1, 36]]]");
+  dst = ArrayFromJSON(dst_type, "[[[1, 1]], [[1, 6]], [[1, 36]]]");
+
+  CheckCast(src, dst);
+
+  dst_type = list(
+      struct_({field("key", int32()), field("value", int64()), field("extra", 
int64())}));
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      TypeError,
+      ::testing::HasSubstr("must be cast to a list<struct> with exactly two 
fields"),
+      Cast(src, dst_type));
+}

Review Comment:
   Could you add tests:
   * with large_list
   * with nulls
   * with non-zero list offset
   ?



##########
cpp/src/arrow/datum.cc:
##########
@@ -177,6 +177,30 @@ bool Datum::Equals(const Datum& other) const {
   }
 }
 
+Result<Datum> Datum::View(const std::shared_ptr<DataType>& out_type) const {
+  if (this->is_scalar()) {
+    std::shared_ptr<Array> tmp_array;
+    ARROW_ASSIGN_OR_RAISE(
+        tmp_array, MakeArrayFromScalar(*this->scalar().get(), 1, 
default_memory_pool()));
+    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrayData> array,
+                          ::arrow::internal::GetArrayView(tmp_array->data(), 
out_type));
+    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> out, 
MakeArray(array)->GetScalar(0));

Review Comment:
   Can we instead not support this? This implementation is very heavy and isn't 
really "zero-copy" either.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to