This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 6a47e4d28c GH-33592: [C++] support casting nullable fields to 
non-nullable if there are no null values (#43782)
6a47e4d28c is described below

commit 6a47e4d28cdc4592fe6a458dbe5efe3b17a090e5
Author: Nick Crews <[email protected]>
AuthorDate: Thu Feb 13 05:49:21 2025 -0900

    GH-33592: [C++] support casting nullable fields to non-nullable if there 
are no null values (#43782)
    
    * GitHub Issue: #33592
    
    Notes for myself/fixer:
    - [tests that need to get 
updated](https://github.com/search?q=repo%3Aapache%2Farrow%20%22cannot%20cast%20nullable%20field%22&type=code)
 (almost definitely not a complete list)
    - [update: actually we should handle the go implementation in the go 
repository.] hmm, looks like [go wrapper does its own nullability 
checks](https://github.com/apache/arrow/blob/f078942ce2df68de8f48c3b4233132133601ca53/go/arrow/compute/cast.go#L283-L286).
 I assume this is just an optimization to not have to go into the C++ and hit 
the error down there. So if we just delete this check then I think the C++ 
logic will handle all of it???
    - [how the plain column handles this 
cast](https://github.com/apache/arrow/blob/f078942ce2df68de8f48c3b4233132133601ca53/python/pyarrow/table.pxi#L3238-L3243),
 some logic like this probably needs to get ported over to the struct 
implementation
    - (running from /cpp/build) `cmake .. --preset ninja-debug-basic`, then 
`cmake --build . && PYTHON=python ctest -R 'arrow-compute-scalar-cast-test' 
--output-on-failure` to run the specific test
    -  to format: `uvx pre-commit run --all-files clang-format`
    
    Lead-authored-by: Nick Crews <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 .../arrow/compute/kernels/scalar_cast_nested.cc    |  30 +++--
 cpp/src/arrow/compute/kernels/scalar_cast_test.cc  | 141 +++++++++++----------
 2 files changed, 90 insertions(+), 81 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc 
b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
index 0033d89fc4..a50dcdead2 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
@@ -363,10 +363,6 @@ struct CastStruct {
         const auto& in_field = in_type.field(in_field_index);
         // If there are more in_fields check if they match the out_field.
         if (in_field->name() == out_field->name()) {
-          if (in_field->nullable() && !out_field->nullable()) {
-            return Status::TypeError("cannot cast nullable field to 
non-nullable field: ",
-                                     in_type.ToString(), " ", 
out_type.ToString());
-          }
           // Found matching in_field and out_field.
           fields_to_select[out_field_index++] = in_field_index;
           // Using the same in_field for multiple out_fields is not allowed.
@@ -403,17 +399,27 @@ struct CastStruct {
     }
 
     int out_field_index = 0;
-    for (int field_index : fields_to_select) {
-      const auto& target_type = out->type()->field(out_field_index++)->type();
-      if (field_index == kFillNullSentinel) {
-        ARROW_ASSIGN_OR_RAISE(auto nulls,
-                              MakeArrayOfNull(target_type->GetSharedPtr(), 
batch.length));
+    for (int in_field_index : fields_to_select) {
+      const auto& out_field = out_type.field(out_field_index++);
+      const auto& out_field_type = out_field->type();
+      if (in_field_index == kFillNullSentinel) {
+        ARROW_ASSIGN_OR_RAISE(
+            auto nulls, MakeArrayOfNull(out_field_type->GetSharedPtr(), 
batch.length,
+                                        ctx->memory_pool()));
         out_array->child_data.push_back(nulls->data());
       } else {
-        const auto& values = 
(in_array.child_data[field_index].ToArrayData()->Slice(
+        const auto& in_field = in_type.field(in_field_index);
+        const auto& in_values = 
(in_array.child_data[in_field_index].ToArrayData()->Slice(
             in_array.offset, in_array.length));
-        ARROW_ASSIGN_OR_RAISE(Datum cast_values,
-                              Cast(values, target_type, options, 
ctx->exec_context()));
+        if (in_field->nullable() && !out_field->nullable() &&
+            in_values->GetNullCount() > 0) {
+          return Status::Invalid(
+              "field '", in_field->name(), "' of type ", 
in_field->type()->ToString(),
+              " has nulls. Can't cast to non-nullable field '", 
out_field->name(),
+              "' of type ", out_field_type->ToString());
+        }
+        ARROW_ASSIGN_OR_RAISE(Datum cast_values, Cast(in_values, 
out_field_type, options,
+                                                      ctx->exec_context()));
         DCHECK(cast_values.is_array());
         out_array->child_data.push_back(cast_values.array());
       }
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc 
b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
index d7f73e2bb7..528e6acded 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
@@ -4076,92 +4076,95 @@ TEST(Cast, StructToBiggerNullableStruct) {
 TEST(Cast, StructToDifferentNullabilityStruct) {
   {
     // OK to go from non-nullable to nullable...
-    std::vector<std::shared_ptr<Field>> fields_src_non_nullable = {
+    std::vector<std::shared_ptr<Field>> fields_src = {
         std::make_shared<Field>("a", int8(), false),
         std::make_shared<Field>("b", int8(), false),
         std::make_shared<Field>("c", int8(), false)};
-    std::shared_ptr<Array> a_src_non_nullable, b_src_non_nullable, 
c_src_non_nullable;
-    a_src_non_nullable = ArrayFromJSON(int8(), "[11, 23, 56]");
-    b_src_non_nullable = ArrayFromJSON(int8(), "[32, 46, 37]");
-    c_src_non_nullable = ArrayFromJSON(int8(), "[95, 11, 44]");
-    ASSERT_OK_AND_ASSIGN(
-        auto src_non_nullable,
-        StructArray::Make({a_src_non_nullable, b_src_non_nullable, 
c_src_non_nullable},
-                          fields_src_non_nullable));
-
-    std::shared_ptr<Array> a_dest_nullable, b_dest_nullable, c_dest_nullable;
-    a_dest_nullable = ArrayFromJSON(int64(), "[11, 23, 56]");
-    b_dest_nullable = ArrayFromJSON(int64(), "[32, 46, 37]");
-    c_dest_nullable = ArrayFromJSON(int64(), "[95, 11, 44]");
-
-    std::vector<std::shared_ptr<Field>> fields_dest1_nullable = {
+    std::vector<std::shared_ptr<Array>> arrays_src = {
+        ArrayFromJSON(int8(), "[11, 23, 56]"),
+        ArrayFromJSON(int8(), "[32, 46, 37]"),
+        ArrayFromJSON(int8(), "[95, 11, 44]"),
+    };
+    ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make(arrays_src, fields_src));
+
+    std::vector<std::shared_ptr<Field>> fields_dest = {
         std::make_shared<Field>("a", int64(), true),
         std::make_shared<Field>("b", int64(), true),
-        std::make_shared<Field>("c", int64(), true)};
-    ASSERT_OK_AND_ASSIGN(
-        auto dest1_nullable,
-        StructArray::Make({a_dest_nullable, b_dest_nullable, c_dest_nullable},
-                          fields_dest1_nullable));
-    CheckCast(src_non_nullable, dest1_nullable);
-
-    std::vector<std::shared_ptr<Field>> fields_dest2_nullable = {
-        std::make_shared<Field>("a", int64(), true),
-        std::make_shared<Field>("c", int64(), true)};
-    ASSERT_OK_AND_ASSIGN(
-        auto dest2_nullable,
-        StructArray::Make({a_dest_nullable, c_dest_nullable}, 
fields_dest2_nullable));
-    CheckCast(src_non_nullable, dest2_nullable);
-
-    std::vector<std::shared_ptr<Field>> fields_dest3_nullable = {
-        std::make_shared<Field>("b", int64(), true)};
-    ASSERT_OK_AND_ASSIGN(auto dest3_nullable,
-                         StructArray::Make({b_dest_nullable}, 
fields_dest3_nullable));
-    CheckCast(src_non_nullable, dest3_nullable);
+        std::make_shared<Field>("c", int64(), true),
+    };
+    std::vector<std::shared_ptr<Array>> arrays_dest = {
+        ArrayFromJSON(int64(), "[11, 23, 56]"),
+        ArrayFromJSON(int64(), "[32, 46, 37]"),
+        ArrayFromJSON(int64(), "[95, 11, 44]"),
+    };
+    ASSERT_OK_AND_ASSIGN(auto dest, StructArray::Make(arrays_dest, 
fields_dest));
+    CheckCast(src, dest);
+
+    std::vector<std::shared_ptr<Field>> fields_dest_ac = {fields_dest[0], 
fields_dest[2]};
+    std::vector<std::shared_ptr<Array>> arrays_dest_ac = {arrays_dest[0], 
arrays_dest[2]};
+    ASSERT_OK_AND_ASSIGN(auto dest_ac, StructArray::Make(arrays_dest_ac, 
fields_dest_ac));
+    CheckCast(src, dest_ac);
+
+    std::vector<std::shared_ptr<Field>> fields_dest_b = {fields_dest[1]};
+    std::vector<std::shared_ptr<Array>> arrays_dest_b = {arrays_dest[1]};
+    ASSERT_OK_AND_ASSIGN(auto dest_b, StructArray::Make(arrays_dest_b, 
fields_dest_b));
+    CheckCast(src, dest_b);
   }
   {
-    // But NOT OK to go from nullable to non-nullable...
-    std::vector<std::shared_ptr<Field>> fields_src_nullable = {
+    // But when going from nullable to non-nullable, all data must be 
non-null...
+    std::vector<std::shared_ptr<Field>> fields_src = {
         std::make_shared<Field>("a", int8(), true),
         std::make_shared<Field>("b", int8(), true),
         std::make_shared<Field>("c", int8(), true)};
-    std::shared_ptr<Array> a_src_nullable, b_src_nullable, c_src_nullable;
-    a_src_nullable = ArrayFromJSON(int8(), "[1, null, 5]");
-    b_src_nullable = ArrayFromJSON(int8(), "[3, 4, null]");
-    c_src_nullable = ArrayFromJSON(int8(), "[9, 11, 44]");
-    ASSERT_OK_AND_ASSIGN(
-        auto src_nullable,
-        StructArray::Make({a_src_nullable, b_src_nullable, c_src_nullable},
-                          fields_src_nullable));
-
-    std::vector<std::shared_ptr<Field>> fields_dest1_non_nullable = {
+    std::vector<std::shared_ptr<Array>> arrays_src = {
+        ArrayFromJSON(int8(), "[1, null, 5]"),
+        ArrayFromJSON(int8(), "[3, 4, null]"),
+        ArrayFromJSON(int8(), "[9, 11, 44]"),
+    };
+    ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make(arrays_src, fields_src));
+
+    std::vector<std::shared_ptr<Field>> fields_dest = {
         std::make_shared<Field>("a", int64(), false),
         std::make_shared<Field>("b", int64(), false),
         std::make_shared<Field>("c", int64(), false)};
-    const auto dest1_non_nullable = arrow::struct_(fields_dest1_non_nullable);
-    const auto options1_non_nullable = CastOptions::Safe(dest1_non_nullable);
     EXPECT_RAISES_WITH_MESSAGE_THAT(
-        TypeError,
-        ::testing::HasSubstr("cannot cast nullable field to non-nullable 
field"),
-        Cast(src_nullable, options1_non_nullable));
+        Invalid,
+        ::testing::HasSubstr(
+            "field 'a' of type int8 has nulls. Can't cast to non-nullable 
field 'a' "
+            "of type int64"),
+        Cast(src, CastOptions::Safe(arrow::struct_(fields_dest))));
 
-    std::vector<std::shared_ptr<Field>> fields_dest2_non_nullable = {
-        std::make_shared<Field>("a", int64(), false),
-        std::make_shared<Field>("c", int64(), false)};
-    const auto dest2_non_nullable = arrow::struct_(fields_dest2_non_nullable);
-    const auto options2_non_nullable = CastOptions::Safe(dest2_non_nullable);
+    std::vector<std::shared_ptr<Field>> fields_dest_ac = {fields_dest[0], 
fields_dest[2]};
     EXPECT_RAISES_WITH_MESSAGE_THAT(
-        TypeError,
-        ::testing::HasSubstr("cannot cast nullable field to non-nullable 
field"),
-        Cast(src_nullable, options2_non_nullable));
-
-    std::vector<std::shared_ptr<Field>> fields_dest3_non_nullable = {
-        std::make_shared<Field>("c", int64(), false)};
-    const auto dest3_non_nullable = arrow::struct_(fields_dest3_non_nullable);
-    const auto options3_non_nullable = CastOptions::Safe(dest3_non_nullable);
+        Invalid,
+        ::testing::HasSubstr(
+            "field 'a' of type int8 has nulls. Can't cast to non-nullable 
field 'a' "
+            "of type int64"),
+        Cast(src, CastOptions::Safe(arrow::struct_(fields_dest_ac))));
+
+    // if we only select a field with no nulls, it should be fine:
+    std::vector<std::shared_ptr<Field>> fields_dest_c = {fields_dest[2]};
+    std::vector<std::shared_ptr<Array>> arrays_dest_c = {
+        ArrayFromJSON(int64(), "[9, 11, 44]")};
+    ASSERT_OK_AND_ASSIGN(auto dest_c, StructArray::Make(arrays_dest_c, 
fields_dest_c));
+    CheckCast(src, dest_c);
+
+    // A slice that doesn't contain nulls is castable...
+    std::vector<std::shared_ptr<Array>> arrays_dest_0 = {
+        ArrayFromJSON(int64(), "[1]"),
+        ArrayFromJSON(int64(), "[3]"),
+        ArrayFromJSON(int64(), "[9]"),
+    };
+    ASSERT_OK_AND_ASSIGN(auto dest_0, StructArray::Make(arrays_dest_0, 
fields_dest));
+    CheckCast(src->Slice(0, 1), dest_0);
+
+    // ...but a slice that contains nulls will error.
     EXPECT_RAISES_WITH_MESSAGE_THAT(
-        TypeError,
-        ::testing::HasSubstr("cannot cast nullable field to non-nullable 
field"),
-        Cast(src_nullable, options3_non_nullable));
+        Invalid,
+        ::testing::HasSubstr(
+            "field 'a' of type int8 has nulls. Can't cast to non-nullable 
field 'a' "
+            "of type int64"),
+        Cast(src->Slice(1, 3), 
CastOptions::Safe(arrow::struct_(fields_dest))));
   }
 }
 

Reply via email to