This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 6a47e4d28c GH-33592: [C++] support casting nullable fields to
non-nullable if there are no null values (#43782)
6a47e4d28c is described below
commit 6a47e4d28cdc4592fe6a458dbe5efe3b17a090e5
Author: Nick Crews <[email protected]>
AuthorDate: Thu Feb 13 05:49:21 2025 -0900
GH-33592: [C++] support casting nullable fields to non-nullable if there
are no null values (#43782)
* GitHub Issue: #33592
Notes for myself/fixer:
- [tests that need to get
updated](https://github.com/search?q=repo%3Aapache%2Farrow%20%22cannot%20cast%20nullable%20field%22&type=code)
(almost definitely not a complete list)
- [update: actually we should handle the go implementation in the go
repository.] hmm, looks like [go wrapper does its own nullability
checks](https://github.com/apache/arrow/blob/f078942ce2df68de8f48c3b4233132133601ca53/go/arrow/compute/cast.go#L283-L286).
I assume this is just an optimization to not have to go into the C++ and hit
the error down there. So if we just delete this check then I think the C++
logic will handle all of it???
- [how the plain column handles this
cast](https://github.com/apache/arrow/blob/f078942ce2df68de8f48c3b4233132133601ca53/python/pyarrow/table.pxi#L3238-L3243),
some logic like this probably needs to get ported over to the struct
implementation
- (running from /cpp/build) `cmake .. --preset ninja-debug-basic`, then
`cmake --build . && PYTHON=python ctest -R 'arrow-compute-scalar-cast-test'
--output-on-failure` to run the specific test
- to format: `uvx pre-commit run --all-files clang-format`
Lead-authored-by: Nick Crews <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
.../arrow/compute/kernels/scalar_cast_nested.cc | 30 +++--
cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 141 +++++++++++----------
2 files changed, 90 insertions(+), 81 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
index 0033d89fc4..a50dcdead2 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
@@ -363,10 +363,6 @@ struct CastStruct {
const auto& in_field = in_type.field(in_field_index);
// If there are more in_fields check if they match the out_field.
if (in_field->name() == out_field->name()) {
- if (in_field->nullable() && !out_field->nullable()) {
- return Status::TypeError("cannot cast nullable field to
non-nullable field: ",
- in_type.ToString(), " ",
out_type.ToString());
- }
// Found matching in_field and out_field.
fields_to_select[out_field_index++] = in_field_index;
// Using the same in_field for multiple out_fields is not allowed.
@@ -403,17 +399,27 @@ struct CastStruct {
}
int out_field_index = 0;
- for (int field_index : fields_to_select) {
- const auto& target_type = out->type()->field(out_field_index++)->type();
- if (field_index == kFillNullSentinel) {
- ARROW_ASSIGN_OR_RAISE(auto nulls,
- MakeArrayOfNull(target_type->GetSharedPtr(),
batch.length));
+ for (int in_field_index : fields_to_select) {
+ const auto& out_field = out_type.field(out_field_index++);
+ const auto& out_field_type = out_field->type();
+ if (in_field_index == kFillNullSentinel) {
+ ARROW_ASSIGN_OR_RAISE(
+ auto nulls, MakeArrayOfNull(out_field_type->GetSharedPtr(),
batch.length,
+ ctx->memory_pool()));
out_array->child_data.push_back(nulls->data());
} else {
- const auto& values =
(in_array.child_data[field_index].ToArrayData()->Slice(
+ const auto& in_field = in_type.field(in_field_index);
+ const auto& in_values =
(in_array.child_data[in_field_index].ToArrayData()->Slice(
in_array.offset, in_array.length));
- ARROW_ASSIGN_OR_RAISE(Datum cast_values,
- Cast(values, target_type, options,
ctx->exec_context()));
+ if (in_field->nullable() && !out_field->nullable() &&
+ in_values->GetNullCount() > 0) {
+ return Status::Invalid(
+ "field '", in_field->name(), "' of type ",
in_field->type()->ToString(),
+ " has nulls. Can't cast to non-nullable field '",
out_field->name(),
+ "' of type ", out_field_type->ToString());
+ }
+ ARROW_ASSIGN_OR_RAISE(Datum cast_values, Cast(in_values,
out_field_type, options,
+ ctx->exec_context()));
DCHECK(cast_values.is_array());
out_array->child_data.push_back(cast_values.array());
}
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
index d7f73e2bb7..528e6acded 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
@@ -4076,92 +4076,95 @@ TEST(Cast, StructToBiggerNullableStruct) {
TEST(Cast, StructToDifferentNullabilityStruct) {
{
// OK to go from non-nullable to nullable...
- std::vector<std::shared_ptr<Field>> fields_src_non_nullable = {
+ std::vector<std::shared_ptr<Field>> fields_src = {
std::make_shared<Field>("a", int8(), false),
std::make_shared<Field>("b", int8(), false),
std::make_shared<Field>("c", int8(), false)};
- std::shared_ptr<Array> a_src_non_nullable, b_src_non_nullable,
c_src_non_nullable;
- a_src_non_nullable = ArrayFromJSON(int8(), "[11, 23, 56]");
- b_src_non_nullable = ArrayFromJSON(int8(), "[32, 46, 37]");
- c_src_non_nullable = ArrayFromJSON(int8(), "[95, 11, 44]");
- ASSERT_OK_AND_ASSIGN(
- auto src_non_nullable,
- StructArray::Make({a_src_non_nullable, b_src_non_nullable,
c_src_non_nullable},
- fields_src_non_nullable));
-
- std::shared_ptr<Array> a_dest_nullable, b_dest_nullable, c_dest_nullable;
- a_dest_nullable = ArrayFromJSON(int64(), "[11, 23, 56]");
- b_dest_nullable = ArrayFromJSON(int64(), "[32, 46, 37]");
- c_dest_nullable = ArrayFromJSON(int64(), "[95, 11, 44]");
-
- std::vector<std::shared_ptr<Field>> fields_dest1_nullable = {
+ std::vector<std::shared_ptr<Array>> arrays_src = {
+ ArrayFromJSON(int8(), "[11, 23, 56]"),
+ ArrayFromJSON(int8(), "[32, 46, 37]"),
+ ArrayFromJSON(int8(), "[95, 11, 44]"),
+ };
+ ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make(arrays_src, fields_src));
+
+ std::vector<std::shared_ptr<Field>> fields_dest = {
std::make_shared<Field>("a", int64(), true),
std::make_shared<Field>("b", int64(), true),
- std::make_shared<Field>("c", int64(), true)};
- ASSERT_OK_AND_ASSIGN(
- auto dest1_nullable,
- StructArray::Make({a_dest_nullable, b_dest_nullable, c_dest_nullable},
- fields_dest1_nullable));
- CheckCast(src_non_nullable, dest1_nullable);
-
- std::vector<std::shared_ptr<Field>> fields_dest2_nullable = {
- std::make_shared<Field>("a", int64(), true),
- std::make_shared<Field>("c", int64(), true)};
- ASSERT_OK_AND_ASSIGN(
- auto dest2_nullable,
- StructArray::Make({a_dest_nullable, c_dest_nullable},
fields_dest2_nullable));
- CheckCast(src_non_nullable, dest2_nullable);
-
- std::vector<std::shared_ptr<Field>> fields_dest3_nullable = {
- std::make_shared<Field>("b", int64(), true)};
- ASSERT_OK_AND_ASSIGN(auto dest3_nullable,
- StructArray::Make({b_dest_nullable},
fields_dest3_nullable));
- CheckCast(src_non_nullable, dest3_nullable);
+ std::make_shared<Field>("c", int64(), true),
+ };
+ std::vector<std::shared_ptr<Array>> arrays_dest = {
+ ArrayFromJSON(int64(), "[11, 23, 56]"),
+ ArrayFromJSON(int64(), "[32, 46, 37]"),
+ ArrayFromJSON(int64(), "[95, 11, 44]"),
+ };
+ ASSERT_OK_AND_ASSIGN(auto dest, StructArray::Make(arrays_dest,
fields_dest));
+ CheckCast(src, dest);
+
+ std::vector<std::shared_ptr<Field>> fields_dest_ac = {fields_dest[0],
fields_dest[2]};
+ std::vector<std::shared_ptr<Array>> arrays_dest_ac = {arrays_dest[0],
arrays_dest[2]};
+ ASSERT_OK_AND_ASSIGN(auto dest_ac, StructArray::Make(arrays_dest_ac,
fields_dest_ac));
+ CheckCast(src, dest_ac);
+
+ std::vector<std::shared_ptr<Field>> fields_dest_b = {fields_dest[1]};
+ std::vector<std::shared_ptr<Array>> arrays_dest_b = {arrays_dest[1]};
+ ASSERT_OK_AND_ASSIGN(auto dest_b, StructArray::Make(arrays_dest_b,
fields_dest_b));
+ CheckCast(src, dest_b);
}
{
- // But NOT OK to go from nullable to non-nullable...
- std::vector<std::shared_ptr<Field>> fields_src_nullable = {
+ // But when going from nullable to non-nullable, all data must be
non-null...
+ std::vector<std::shared_ptr<Field>> fields_src = {
std::make_shared<Field>("a", int8(), true),
std::make_shared<Field>("b", int8(), true),
std::make_shared<Field>("c", int8(), true)};
- std::shared_ptr<Array> a_src_nullable, b_src_nullable, c_src_nullable;
- a_src_nullable = ArrayFromJSON(int8(), "[1, null, 5]");
- b_src_nullable = ArrayFromJSON(int8(), "[3, 4, null]");
- c_src_nullable = ArrayFromJSON(int8(), "[9, 11, 44]");
- ASSERT_OK_AND_ASSIGN(
- auto src_nullable,
- StructArray::Make({a_src_nullable, b_src_nullable, c_src_nullable},
- fields_src_nullable));
-
- std::vector<std::shared_ptr<Field>> fields_dest1_non_nullable = {
+ std::vector<std::shared_ptr<Array>> arrays_src = {
+ ArrayFromJSON(int8(), "[1, null, 5]"),
+ ArrayFromJSON(int8(), "[3, 4, null]"),
+ ArrayFromJSON(int8(), "[9, 11, 44]"),
+ };
+ ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make(arrays_src, fields_src));
+
+ std::vector<std::shared_ptr<Field>> fields_dest = {
std::make_shared<Field>("a", int64(), false),
std::make_shared<Field>("b", int64(), false),
std::make_shared<Field>("c", int64(), false)};
- const auto dest1_non_nullable = arrow::struct_(fields_dest1_non_nullable);
- const auto options1_non_nullable = CastOptions::Safe(dest1_non_nullable);
EXPECT_RAISES_WITH_MESSAGE_THAT(
- TypeError,
- ::testing::HasSubstr("cannot cast nullable field to non-nullable
field"),
- Cast(src_nullable, options1_non_nullable));
+ Invalid,
+ ::testing::HasSubstr(
+ "field 'a' of type int8 has nulls. Can't cast to non-nullable
field 'a' "
+ "of type int64"),
+ Cast(src, CastOptions::Safe(arrow::struct_(fields_dest))));
- std::vector<std::shared_ptr<Field>> fields_dest2_non_nullable = {
- std::make_shared<Field>("a", int64(), false),
- std::make_shared<Field>("c", int64(), false)};
- const auto dest2_non_nullable = arrow::struct_(fields_dest2_non_nullable);
- const auto options2_non_nullable = CastOptions::Safe(dest2_non_nullable);
+ std::vector<std::shared_ptr<Field>> fields_dest_ac = {fields_dest[0],
fields_dest[2]};
EXPECT_RAISES_WITH_MESSAGE_THAT(
- TypeError,
- ::testing::HasSubstr("cannot cast nullable field to non-nullable
field"),
- Cast(src_nullable, options2_non_nullable));
-
- std::vector<std::shared_ptr<Field>> fields_dest3_non_nullable = {
- std::make_shared<Field>("c", int64(), false)};
- const auto dest3_non_nullable = arrow::struct_(fields_dest3_non_nullable);
- const auto options3_non_nullable = CastOptions::Safe(dest3_non_nullable);
+ Invalid,
+ ::testing::HasSubstr(
+ "field 'a' of type int8 has nulls. Can't cast to non-nullable
field 'a' "
+ "of type int64"),
+ Cast(src, CastOptions::Safe(arrow::struct_(fields_dest_ac))));
+
+ // if we only select a field with no nulls, it should be fine:
+ std::vector<std::shared_ptr<Field>> fields_dest_c = {fields_dest[2]};
+ std::vector<std::shared_ptr<Array>> arrays_dest_c = {
+ ArrayFromJSON(int64(), "[9, 11, 44]")};
+ ASSERT_OK_AND_ASSIGN(auto dest_c, StructArray::Make(arrays_dest_c,
fields_dest_c));
+ CheckCast(src, dest_c);
+
+ // A slice that doesn't contain nulls is castable...
+ std::vector<std::shared_ptr<Array>> arrays_dest_0 = {
+ ArrayFromJSON(int64(), "[1]"),
+ ArrayFromJSON(int64(), "[3]"),
+ ArrayFromJSON(int64(), "[9]"),
+ };
+ ASSERT_OK_AND_ASSIGN(auto dest_0, StructArray::Make(arrays_dest_0,
fields_dest));
+ CheckCast(src->Slice(0, 1), dest_0);
+
+ // ...but a slice that contains nulls will error.
EXPECT_RAISES_WITH_MESSAGE_THAT(
- TypeError,
- ::testing::HasSubstr("cannot cast nullable field to non-nullable
field"),
- Cast(src_nullable, options3_non_nullable));
+ Invalid,
+ ::testing::HasSubstr(
+ "field 'a' of type int8 has nulls. Can't cast to non-nullable
field 'a' "
+ "of type int64"),
+ Cast(src->Slice(1, 3),
CastOptions::Safe(arrow::struct_(fields_dest))));
}
}