This is an automated email from the ASF dual-hosted git repository.
wjones127 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 541647a6cf GH-37669: [C++][Python] Fix casting to extension type with
fixed size list storage type (#42219)
541647a6cf is described below
commit 541647a6cf219c8c4fbe8f53d8e4fe7c3740cb66
Author: Joris Van den Bossche <[email protected]>
AuthorDate: Fri Jun 21 20:53:52 2024 +0200
GH-37669: [C++][Python] Fix casting to extension type with fixed size list
storage type (#42219)
### Rationale for this change
Casting to an extension type with a fixed-size list storage type was
segfaulting. The underlying issue was a debug check in the casting kernel, in
the code path that handles pre-allocated data. In this case we shouldn't be
pre-allocating anything, because "cast to extension type" _can_ be zero-copy,
and we should let that be handled by the underlying cast to the storage type.
### What changes are included in this PR?
Specifically, mark the cast-to-extension-type kernels as
`NullHandling::COMPUTED_NO_PREALLOCATE` and `MemAllocation::NO_PREALLOCATE`.
### Are these changes tested?
Yes
### Are there any user-facing changes?
No
* GitHub Issue: #37669
Authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Will Jones <[email protected]>
---
.../arrow/compute/kernels/scalar_cast_extension.cc | 5 +-
python/pyarrow/tests/test_extension_type.py | 60 ++++++++++++++++++++++
2 files changed, 63 insertions(+), 2 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_extension.cc
b/cpp/src/arrow/compute/kernels/scalar_cast_extension.cc
index c32a6ef6de..2a54d28c6f 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_extension.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_extension.cc
@@ -56,8 +56,9 @@ Status CastToExtension(KernelContext* ctx, const ExecSpan&
batch, ExecResult* ou
std::shared_ptr<CastFunction> GetCastToExtension(std::string name) {
auto func = std::make_shared<CastFunction>(std::move(name), Type::EXTENSION);
for (Type::type in_ty : AllTypeIds()) {
- DCHECK_OK(
- func->AddKernel(in_ty, {InputType(in_ty)}, kOutputTargetType,
CastToExtension));
+ DCHECK_OK(func->AddKernel(in_ty, {InputType(in_ty)}, kOutputTargetType,
+ CastToExtension,
NullHandling::COMPUTED_NO_PREALLOCATE,
+ MemAllocation::NO_PREALLOCATE));
}
return func;
}
diff --git a/python/pyarrow/tests/test_extension_type.py
b/python/pyarrow/tests/test_extension_type.py
index 9863d96058..1c4d0175a2 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -194,6 +194,21 @@ class MyListType(pa.ExtensionType):
return cls(storage_type)
+class MyFixedListType(pa.ExtensionType):
+
+ def __init__(self, storage_type):
+ assert isinstance(storage_type, pa.FixedSizeListType)
+ super().__init__(storage_type, 'pyarrow.tests.MyFixedListType')
+
+ def __arrow_ext_serialize__(self):
+ return b''
+
+ @classmethod
+ def __arrow_ext_deserialize__(cls, storage_type, serialized):
+ assert serialized == b''
+ return cls(storage_type)
+
+
class AnnotatedType(pa.ExtensionType):
"""
Generic extension type that can store any storage type.
@@ -738,6 +753,36 @@ def test_casting_dict_array_to_extension_type():
UUID('30313233-3435-3637-3839-616263646566')]
+def test_cast_to_extension_with_nested_storage():
+ # https://github.com/apache/arrow/issues/37669
+
+ # With fixed-size list
+ array = pa.array([[1, 2], [3, 4], [5, 6]], pa.list_(pa.float64(), 2))
+ result = array.cast(MyFixedListType(pa.list_(pa.float64(), 2)))
+ expected = pa.ExtensionArray.from_storage(MyFixedListType(array.type),
array)
+ assert result.equals(expected)
+
+ ext_type = MyFixedListType(pa.list_(pa.float32(), 2))
+ result = array.cast(ext_type)
+ expected = pa.ExtensionArray.from_storage(
+ ext_type, array.cast(ext_type.storage_type)
+ )
+ assert result.equals(expected)
+
+ # With variable-size list
+ array = pa.array([[1, 2], [3], [4, 5, 6]], pa.list_(pa.float64()))
+ result = array.cast(MyListType(pa.list_(pa.float64())))
+ expected = pa.ExtensionArray.from_storage(MyListType(array.type), array)
+ assert result.equals(expected)
+
+ ext_type = MyListType(pa.list_(pa.float32()))
+ result = array.cast(ext_type)
+ expected = pa.ExtensionArray.from_storage(
+ ext_type, array.cast(ext_type.storage_type)
+ )
+ assert result.equals(expected)
+
+
def test_concat():
arr1 = pa.array([1, 2, 3], IntegerType())
arr2 = pa.array([4, 5, 6], IntegerType())
@@ -1500,6 +1545,21 @@ def test_tensor_type_equality():
assert not tensor_type == tensor_type3
+def test_tensor_type_cast():
+ tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 3])
+ inner = pa.array(range(18), pa.int8())
+ storage = pa.FixedSizeListArray.from_arrays(inner, 6)
+
+ # cast storage -> extension type
+ result = storage.cast(tensor_type)
+ expected = pa.ExtensionArray.from_storage(tensor_type, storage)
+ assert result.equals(expected)
+
+ # cast extension type -> storage type
+ storage_result = result.cast(storage.type)
+ assert storage_result.equals(storage)
+
+
@pytest.mark.pandas
def test_extension_to_pandas_storage_type(registered_period_type):
period_type, _ = registered_period_type