This is an automated email from the ASF dual-hosted git repository.
westonpace pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 65202d0178 GH-34619: [C++] Add extension array handling to ArraySpan
conversion (#34684)
65202d0178 is described below
commit 65202d0178d74e84f5478e669470c9bc7d712c59
Author: Weston Pace <[email protected]>
AuthorDate: Fri Mar 24 06:13:14 2023 -0700
GH-34619: [C++] Add extension array handling to ArraySpan conversion
(#34684)
### Rationale for this change
The take kernel would generate a segmentation fault when used on an
extension array with a dictionary storage type.
### What changes are included in this PR?
Conversion to/from ArraySpan now uses the storage type to determine if it
needs to assign a dictionary
### Are these changes tested?
I added unit tests that verify ArraySpan round trip for most types,
including an extension dictionary type (and verified this test fails without
the fix).
### Are there any user-facing changes?
No
* Closes: #34619
Authored-by: Weston Pace <[email protected]>
Signed-off-by: Weston Pace <[email protected]>
---
cpp/src/arrow/array/array_test.cc | 16 ++++++++++++++++
cpp/src/arrow/array/data.cc | 19 ++++++++++++++-----
2 files changed, 30 insertions(+), 5 deletions(-)
diff --git a/cpp/src/arrow/array/array_test.cc
b/cpp/src/arrow/array/array_test.cc
index a434a75374..d18b16ba78 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -661,6 +661,13 @@ TEST_F(TestArray, TestMakeArrayFromMapScalar) {
AssertAppendScalar(pool_, std::make_shared<MapScalar>(scalar));
}
+void CheckSpanRoundTrip(const Array& array) {
+ ArraySpan span;
+ span.SetMembers(*array.data());
+ std::shared_ptr<Array> array2 = span.ToArray();
+ AssertArraysEqual(array, *array2);
+}
+
TEST_F(TestArray, TestMakeEmptyArray) {
FieldVector union_fields1({field("a", utf8()), field("b", int32())});
FieldVector union_fields2({field("a", null()), field("b",
list(large_utf8()))});
@@ -696,9 +703,18 @@ TEST_F(TestArray, TestMakeEmptyArray) {
ASSERT_OK_AND_ASSIGN(auto array, MakeEmptyArray(type));
ASSERT_OK(array->ValidateFull());
ASSERT_EQ(array->length(), 0);
+ CheckSpanRoundTrip(*array);
}
}
+TEST_F(TestArray, ExtensionSpanRoundTrip) {
+ // Other types are checked in MakeEmptyArray but MakeEmptyArray doesn't
+ // work for extension types so we check that here
+ ASSERT_OK_AND_ASSIGN(auto array, MakeEmptyArray(dictionary(int8(), utf8())));
+ auto ext_array = ExtensionArray(dict_extension_type(), std::move(array));
+ CheckSpanRoundTrip(ext_array);
+}
+
TEST_F(TestArray, TestAppendArraySlice) {
auto scalars = GetScalars();
ArraySpan span;
diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc
index b13022f9c3..e23facce63 100644
--- a/cpp/src/arrow/array/data.cc
+++ b/cpp/src/arrow/array/data.cc
@@ -157,6 +157,11 @@ void ArraySpan::SetMembers(const ArrayData& data) {
}
Type::type type_id = this->type->id();
+ if (type_id == Type::EXTENSION) {
+ const ExtensionType* ext_type = checked_cast<const
ExtensionType*>(this->type);
+ type_id = ext_type->storage_type()->id();
+ }
+
if (data.buffers[0] == nullptr && type_id != Type::NA &&
type_id != Type::SPARSE_UNION && type_id != Type::DENSE_UNION) {
// This should already be zero but we make for sure
@@ -168,7 +173,7 @@ void ArraySpan::SetMembers(const ArrayData& data) {
this->buffers[i] = {};
}
- if (this->type->id() == Type::DICTIONARY) {
+ if (type_id == Type::DICTIONARY) {
this->child_data.resize(1);
this->child_data[0].SetMembers(*data.dictionary);
} else {
@@ -412,16 +417,20 @@ std::shared_ptr<ArrayData> ArraySpan::ToArrayData() const
{
result->buffers.emplace_back(this->GetBuffer(i));
}
- if (this->type->id() == Type::NA) {
+ Type::type type_id = this->type->id();
+ if (type_id == Type::EXTENSION) {
+ const ExtensionType* ext_type = checked_cast<const
ExtensionType*>(this->type);
+ type_id = ext_type->storage_type()->id();
+ }
+
+ if (type_id == Type::NA) {
result->null_count = this->length;
} else if (this->buffers[0].data == nullptr) {
// No validity bitmap, so the null count is 0
result->null_count = 0;
}
- // TODO(wesm): what about extension arrays?
-
- if (this->type->id() == Type::DICTIONARY) {
+ if (type_id == Type::DICTIONARY) {
result->dictionary = this->dictionary().ToArrayData();
} else {
// Emit children, too