This is an automated email from the ASF dual-hosted git repository.
lidavidm pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new c14d55d99f GH-44555: [C++][Compute] Allow casting struct to bigger
nullable struct (#44587)
c14d55d99f is described below
commit c14d55d99f3e26414eb47ba3fac5585b7f66a20d
Author: Thomas Newton <[email protected]>
AuthorDate: Mon Nov 25 00:59:20 2024 +0000
GH-44555: [C++][Compute] Allow casting struct to bigger nullable struct
(#44587)
### Rationale for this change
Sometimes its useful to add a column full of nulls in a cast. Currently
this is supported in top level columns but not inside structs. Example where
this is important: https://github.com/delta-io/delta-rs/issues/1610
### What changes are included in this PR?
Add support for filling in columns full of null for nullable struct fields.
I've gone for a fairly minimal change that achieves what I needed but I
wonder if there should be a more significant change so that this casting is
done by field name and ignore the field order.
### Are these changes tested?
Yes. The expected behaviour in several existing tests has been altered and
a couple of new tests have been added.
I also rolled out a custom build with this change internally because it
suddenly became a critical problem.
### Are there any user-facing changes?
Yes. There are scenarios that previously failed with `struct fields don't
match or are in the wrong order` but now succeed after filling in `null`s.
* GitHub Issue: #44555
Lead-authored-by: Thomas Newton <[email protected]>
Co-authored-by: David Li <[email protected]>
Signed-off-by: David Li <[email protected]>
---
.../arrow/compute/kernels/scalar_cast_nested.cc | 78 ++++++++++-----
cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 111 +++++++++++++++------
cpp/src/arrow/dataset/scanner_test.cc | 25 +++--
3 files changed, 148 insertions(+), 66 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
index ec5291ef60..0033d89fc4 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
@@ -18,6 +18,7 @@
// Implementation of casting to (or between) list types
#include <limits>
+#include <set>
#include <utility>
#include <vector>
@@ -340,6 +341,8 @@ struct CastFixedList {
struct CastStruct {
static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
out) {
+ static constexpr int kFillNullSentinel = -2;
+
const CastOptions& options = CastState::Get(ctx);
const auto& in_type = checked_cast<const StructType&>(*batch[0].type());
const auto& out_type = checked_cast<const StructType&>(*out->type());
@@ -348,25 +351,46 @@ struct CastStruct {
std::vector<int> fields_to_select(out_field_count, -1);
- int out_field_index = 0;
- for (int in_field_index = 0;
- in_field_index < in_field_count && out_field_index < out_field_count;
- ++in_field_index) {
- const auto& in_field = in_type.field(in_field_index);
+ std::set<std::string> all_in_field_names;
+ for (int in_field_index = 0; in_field_index < in_field_count;
++in_field_index) {
+ all_in_field_names.insert(in_type.field(in_field_index)->name());
+ }
+
+ for (int in_field_index = 0, out_field_index = 0;
+ out_field_index < out_field_count;) {
const auto& out_field = out_type.field(out_field_index);
- if (in_field->name() == out_field->name()) {
- if (in_field->nullable() && !out_field->nullable()) {
- return Status::TypeError("cannot cast nullable field to non-nullable
field: ",
- in_type.ToString(), " ",
out_type.ToString());
+ if (in_field_index < in_field_count) {
+ const auto& in_field = in_type.field(in_field_index);
+ // If there are more in_fields check if they match the out_field.
+ if (in_field->name() == out_field->name()) {
+ if (in_field->nullable() && !out_field->nullable()) {
+ return Status::TypeError("cannot cast nullable field to
non-nullable field: ",
+ in_type.ToString(), " ",
out_type.ToString());
+ }
+ // Found matching in_field and out_field.
+ fields_to_select[out_field_index++] = in_field_index;
+ // Using the same in_field for multiple out_fields is not allowed.
+ in_field_index++;
+ continue;
}
- fields_to_select[out_field_index++] = in_field_index;
}
- }
-
- if (out_field_index < out_field_count) {
- return Status::TypeError(
- "struct fields don't match or are in the wrong order: Input fields:
",
- in_type.ToString(), " output fields: ", out_type.ToString());
+ if (all_in_field_names.count(out_field->name()) == 0 &&
out_field->nullable()) {
+ // Didn't match current in_field, but we can fill with null.
+ // Filling with null is only acceptable on nullable fields when there
+ // is definitely no in_field with matching name.
+
+ fields_to_select[out_field_index++] = kFillNullSentinel;
+ } else if (in_field_index < in_field_count) {
+ // Didn't match current in_field, and the we cannot fill with null, so
+ // try next in_field.
+ in_field_index++;
+ } else {
+ // Didn't match current in_field, we cannot fill with null, and there
+ // are no more in_fields to try, so fail.
+ return Status::TypeError(
+ "struct fields don't match or are in the wrong order: Input
fields: ",
+ in_type.ToString(), " output fields: ", out_type.ToString());
+ }
}
const ArraySpan& in_array = batch[0].array;
@@ -378,17 +402,21 @@ struct CastStruct {
in_array.offset, in_array.length));
}
- out_field_index = 0;
+ int out_field_index = 0;
for (int field_index : fields_to_select) {
- const auto& values =
(in_array.child_data[field_index].ToArrayData()->Slice(
- in_array.offset, in_array.length));
const auto& target_type = out->type()->field(out_field_index++)->type();
-
- ARROW_ASSIGN_OR_RAISE(Datum cast_values,
- Cast(values, target_type, options,
ctx->exec_context()));
-
- DCHECK(cast_values.is_array());
- out_array->child_data.push_back(cast_values.array());
+ if (field_index == kFillNullSentinel) {
+ ARROW_ASSIGN_OR_RAISE(auto nulls,
+ MakeArrayOfNull(target_type->GetSharedPtr(),
batch.length));
+ out_array->child_data.push_back(nulls->data());
+ } else {
+ const auto& values =
(in_array.child_data[field_index].ToArrayData()->Slice(
+ in_array.offset, in_array.length));
+ ARROW_ASSIGN_OR_RAISE(Datum cast_values,
+ Cast(values, target_type, options,
ctx->exec_context()));
+ DCHECK(cast_values.is_array());
+ out_array->child_data.push_back(cast_values.array());
+ }
}
return Status::OK();
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
index 6315044a1b..33a0142550 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
@@ -2621,6 +2621,8 @@ static void CheckStructToStructSubset(
d2 = ArrayFromJSON(dest_value_type, "[6, 51, 49]");
e2 = ArrayFromJSON(dest_value_type, "[19, 17, 74]");
+ auto nulls = ArrayFromJSON(dest_value_type, "[null, null, null]");
+
ASSERT_OK_AND_ASSIGN(auto src,
StructArray::Make({a1, b1, c1, d1, e1},
field_names));
ASSERT_OK_AND_ASSIGN(auto dest1,
@@ -2646,34 +2648,38 @@ static void CheckStructToStructSubset(
CheckCast(src, dest5);
// field does not exist
- const auto dest6 = arrow::struct_({std::make_shared<Field>("a", int8()),
- std::make_shared<Field>("d", int16()),
- std::make_shared<Field>("f",
int64())});
- const auto options6 = CastOptions::Safe(dest6);
+ ASSERT_OK_AND_ASSIGN(auto dest6,
+ StructArray::Make({a1, d1, nulls}, {"a", "d",
"f"}));
+ CheckCast(src, dest6);
+
+ const auto dest7 = arrow::struct_(
+ {std::make_shared<Field>("a", int8()), std::make_shared<Field>("d",
int16()),
+ std::make_shared<Field>("f", int64(), /*nullable=*/false)});
+ const auto options7 = CastOptions::Safe(dest7);
EXPECT_RAISES_WITH_MESSAGE_THAT(
TypeError,
::testing::HasSubstr("struct fields don't match or are in the wrong
order"),
- Cast(src, options6));
+ Cast(src, options7));
// fields in wrong order
- const auto dest7 = arrow::struct_({std::make_shared<Field>("a", int8()),
+ const auto dest8 = arrow::struct_({std::make_shared<Field>("a", int8()),
std::make_shared<Field>("c", int16()),
std::make_shared<Field>("b",
int64())});
- const auto options7 = CastOptions::Safe(dest7);
+ const auto options8 = CastOptions::Safe(dest8);
EXPECT_RAISES_WITH_MESSAGE_THAT(
TypeError,
::testing::HasSubstr("struct fields don't match or are in the wrong
order"),
- Cast(src, options7));
+ Cast(src, options8));
// duplicate missing field names
- const auto dest8 = arrow::struct_(
+ const auto dest9 = arrow::struct_(
{std::make_shared<Field>("a", int8()), std::make_shared<Field>("c",
int16()),
std::make_shared<Field>("d", int32()), std::make_shared<Field>("a",
int64())});
- const auto options8 = CastOptions::Safe(dest8);
+ const auto options9 = CastOptions::Safe(dest9);
EXPECT_RAISES_WITH_MESSAGE_THAT(
TypeError,
::testing::HasSubstr("struct fields don't match or are in the wrong
order"),
- Cast(src, options8));
+ Cast(src, options9));
// duplicate present field names
ASSERT_OK_AND_ASSIGN(
@@ -2720,6 +2726,8 @@ static void CheckStructToStructSubsetWithNulls(
d2 = ArrayFromJSON(dest_value_type, "[6, 51, null]");
e2 = ArrayFromJSON(dest_value_type, "[null, 17, 74]");
+ auto nulls = ArrayFromJSON(dest_value_type, "[null, null, null]");
+
std::shared_ptr<Buffer> null_bitmap;
BitmapFromVector<int>({0, 1, 0}, &null_bitmap);
@@ -2755,34 +2763,39 @@ static void CheckStructToStructSubsetWithNulls(
CheckCast(src_null, dest5_null);
// field does not exist
- const auto dest6_null = arrow::struct_({std::make_shared<Field>("a",
int8()),
- std::make_shared<Field>("d",
int16()),
- std::make_shared<Field>("f",
int64())});
- const auto options6_null = CastOptions::Safe(dest6_null);
+ ASSERT_OK_AND_ASSIGN(
+ auto dest6_null,
+ StructArray::Make({a1, d1, nulls}, {"a", "d", "f"}, null_bitmap));
+ CheckCast(src_null, dest6_null);
+
+ const auto dest7_null = arrow::struct_(
+ {std::make_shared<Field>("a", int8()), std::make_shared<Field>("d",
int16()),
+ std::make_shared<Field>("f", int64(), /*nullable=*/false)});
+ const auto options7_null = CastOptions::Safe(dest7_null);
EXPECT_RAISES_WITH_MESSAGE_THAT(
TypeError,
::testing::HasSubstr("struct fields don't match or are in the wrong
order"),
- Cast(src_null, options6_null));
+ Cast(src_null, options7_null));
// fields in wrong order
- const auto dest7_null = arrow::struct_({std::make_shared<Field>("a",
int8()),
+ const auto dest8_null = arrow::struct_({std::make_shared<Field>("a",
int8()),
std::make_shared<Field>("c",
int16()),
std::make_shared<Field>("b",
int64())});
- const auto options7_null = CastOptions::Safe(dest7_null);
+ const auto options8_null = CastOptions::Safe(dest8_null);
EXPECT_RAISES_WITH_MESSAGE_THAT(
TypeError,
::testing::HasSubstr("struct fields don't match or are in the wrong
order"),
- Cast(src_null, options7_null));
+ Cast(src_null, options8_null));
// duplicate missing field names
- const auto dest8_null = arrow::struct_(
+ const auto dest9_null = arrow::struct_(
{std::make_shared<Field>("a", int8()), std::make_shared<Field>("c",
int16()),
std::make_shared<Field>("d", int32()), std::make_shared<Field>("a",
int64())});
- const auto options8_null = CastOptions::Safe(dest8_null);
+ const auto options9_null = CastOptions::Safe(dest9_null);
EXPECT_RAISES_WITH_MESSAGE_THAT(
TypeError,
::testing::HasSubstr("struct fields don't match or are in the wrong
order"),
- Cast(src_null, options8_null));
+ Cast(src_null, options9_null));
// duplicate present field values
ASSERT_OK_AND_ASSIGN(
@@ -2818,20 +2831,26 @@ TEST(Cast, StructToStructSubsetWithNulls) {
}
TEST(Cast, StructToSameSizedButDifferentNamedStruct) {
- std::vector<std::string> field_names = {"a", "b"};
+ std::vector<std::string> src_field_names = {"a", "b"};
std::shared_ptr<Array> a, b;
a = ArrayFromJSON(int8(), "[1, 2]");
b = ArrayFromJSON(int8(), "[3, 4]");
- ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make({a, b}, field_names));
+ auto nulls = ArrayFromJSON(int8(), "[null, null]");
+ ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make({a, b}, src_field_names));
+
+ std::vector<std::string> dest1_field_names = {"c", "d"};
+ ASSERT_OK_AND_ASSIGN(auto dest1, StructArray::Make({nulls, nulls},
dest1_field_names));
+ CheckCast(src, dest1);
- const auto dest = arrow::struct_(
- {std::make_shared<Field>("c", int8()), std::make_shared<Field>("d",
int8())});
- const auto options = CastOptions::Safe(dest);
+ const auto dest2 =
+ arrow::struct_({std::make_shared<Field>("c", int8(), /*nullable=*/false),
+ std::make_shared<Field>("d", int8())});
+ const auto options2 = CastOptions::Safe(dest2);
EXPECT_RAISES_WITH_MESSAGE_THAT(
TypeError,
::testing::HasSubstr("struct fields don't match or are in the wrong
order"),
- Cast(src, options));
+ Cast(src, options2));
}
TEST(Cast, StructToBiggerStruct) {
@@ -2841,15 +2860,41 @@ TEST(Cast, StructToBiggerStruct) {
b = ArrayFromJSON(int8(), "[3, 4]");
ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make({a, b}, field_names));
- const auto dest = arrow::struct_({std::make_shared<Field>("a", int8()),
- std::make_shared<Field>("b", int8()),
- std::make_shared<Field>("c", int8())});
- const auto options = CastOptions::Safe(dest);
+ const auto dest1 = arrow::struct_(
+ {std::make_shared<Field>("a", int8()), std::make_shared<Field>("b",
int8()),
+ std::make_shared<Field>("c", int8(), /*nullable=*/false)});
+ const auto options1 = CastOptions::Safe(dest1);
+
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ TypeError,
+ ::testing::HasSubstr("struct fields don't match or are in the wrong
order"),
+ Cast(src, options1));
+
+ const auto dest2 =
+ arrow::struct_({std::make_shared<Field>("a", int8()),
+ std::make_shared<Field>("c", int8(), /*nullable=*/false),
+ std::make_shared<Field>("b", int8())});
+ const auto options2 = CastOptions::Safe(dest2);
EXPECT_RAISES_WITH_MESSAGE_THAT(
TypeError,
::testing::HasSubstr("struct fields don't match or are in the wrong
order"),
- Cast(src, options));
+ Cast(src, options2));
+}
+
+TEST(Cast, StructToBiggerNullableStruct) {
+ std::vector<std::string> field_names = {"a", "b"};
+ std::shared_ptr<Array> a, b, c;
+ a = ArrayFromJSON(int8(), "[1, 2]");
+ b = ArrayFromJSON(int8(), "[3, 4]");
+ ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make({a, b}, field_names));
+
+ c = ArrayFromJSON(int8(), "[null, null]");
+ ASSERT_OK_AND_ASSIGN(auto dest, StructArray::Make({a, b, c}, {"a", "b",
"c"}));
+ CheckCast(src, dest);
+
+ ASSERT_OK_AND_ASSIGN(auto dest2, StructArray::Make({a, c, b}, {"a", "c",
"b"}));
+ CheckCast(src, dest2);
}
TEST(Cast, StructToDifferentNullabilityStruct) {
diff --git a/cpp/src/arrow/dataset/scanner_test.cc
b/cpp/src/arrow/dataset/scanner_test.cc
index 58bc9c8c0e..bf4e3248e5 100644
--- a/cpp/src/arrow/dataset/scanner_test.cc
+++ b/cpp/src/arrow/dataset/scanner_test.cc
@@ -2332,7 +2332,7 @@ DatasetAndBatches MakeNestedDataset() {
field("b", boolean()),
field("c", struct_({
field("d", int64()),
- field("e", float64()),
+ field("e", int64()),
})),
});
const auto physical_schema = ::arrow::schema({
@@ -2535,14 +2535,14 @@ TEST(ScanNode, MaterializationOfVirtualColumn) {
TEST(ScanNode, MaterializationOfNestedVirtualColumn) {
TestPlan plan;
- auto basic = MakeNestedDataset();
+ auto nested = MakeNestedDataset();
auto options = std::make_shared<ScanOptions>();
options->projection = Materialize({"a", "b", "c"},
/*include_aug_fields=*/true);
ASSERT_OK(acero::Declaration::Sequence(
{
- {"scan", ScanNodeOptions{basic.dataset, options}},
+ {"scan", ScanNodeOptions{nested.dataset, options}},
{"augmented_project",
acero::ProjectNodeOptions{
{field_ref("a"), field_ref("b"), field_ref("c")}}},
@@ -2550,11 +2550,20 @@ TEST(ScanNode, MaterializationOfNestedVirtualColumn) {
})
.AddToPlan(plan.get()));
- // TODO(ARROW-1888): allow scanner to "patch up" structs with casts
- EXPECT_FINISHES_AND_RAISES_WITH_MESSAGE_THAT(
- TypeError,
- ::testing::HasSubstr("struct fields don't match or are in the wrong
order"),
- plan.Run());
+ auto expected = nested.batches;
+
+ for (auto& batch : expected) {
+ // Scan will fill in "c.d" with nulls.
+ ASSERT_OK_AND_ASSIGN(auto nulls,
+ MakeArrayOfNull(int64()->GetSharedPtr(),
batch.length));
+ auto c_data = batch.values[2].array()->Copy();
+ c_data->child_data.insert(c_data->child_data.begin(), nulls->data());
+ c_data->type = nested.dataset->schema()->field(2)->type();
+ auto c_array = std::make_shared<StructArray>(c_data);
+ batch.values[2] = c_array;
+ }
+
+ ASSERT_THAT(plan.Run(),
Finishes(ResultWith(UnorderedElementsAreArray(expected))));
}
TEST(ScanNode, MinimalEndToEnd) {