This is an automated email from the ASF dual-hosted git repository.
rok pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new d08d5e64fc GH-48470: [Python] Construct UuidArray from list of
UuidScalars (#48746)
d08d5e64fc is described below
commit d08d5e64fcfd8759d3a7089eced3e9a2d7a17f20
Author: tadeja <[email protected]>
AuthorDate: Wed Mar 18 13:11:39 2026 +0100
GH-48470: [Python] Construct UuidArray from list of UuidScalars (#48746)
### Rationale for this change
Fixes #48470. The fix applies to all extension types, not just UUID.
### What changes are included in this PR?
When building arrays, an extension scalar is unwrapped to its storage scalar
before it is appended to the builder.
### Are these changes tested?
Yes, new `test_array_from_extension_scalars` covers builtin (uuid, bool8,
json_, opaque) and custom types across all storage types (int, float, bool,
string, binary, large string/binary, decimal, fixed-size binary, struct,
timestamp, duration, date).
### Are there any user-facing changes?
Users can now run the example below and get the output shown, instead of an
`ArrowInvalid` error.
This now works for any extension type, not just UUID.
```python
import pyarrow as pa
pa.array([pa.scalar(b'1'*16, type=pa.uuid())], type=pa.uuid())
```
```
<pyarrow.lib.UuidArray object at 0x128186970>
[
31313131313131313131313131313131
]
```
* GitHub Issue: #48470
Lead-authored-by: Tadeja Kadunc <[email protected]>
Co-authored-by: tadeja <[email protected]>
Co-authored-by: Rok Mihevc <[email protected]>
Signed-off-by: Rok Mihevc <[email protected]>
---
python/pyarrow/src/arrow/python/python_to_arrow.cc | 26 +++++++---
python/pyarrow/tests/test_extension_type.py | 55 ++++++++++++++++++++++
2 files changed, 74 insertions(+), 7 deletions(-)
diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc
b/python/pyarrow/src/arrow/python/python_to_arrow.cc
index c70510a480..e7ce54abcd 100644
--- a/python/pyarrow/src/arrow/python/python_to_arrow.cc
+++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc
@@ -584,6 +584,14 @@ class PyConverter : public Converter<PyObject*,
PyConversionOptions> {
}
};
+// Helper function to unwrap extension scalar to its storage scalar
+const Scalar& GetStorageScalar(const Scalar& scalar) {
+ if (scalar.type->id() == Type::EXTENSION) {
+ return *checked_cast<const ExtensionScalar&>(scalar).value;
+ }
+ return scalar;
+}
+
template <typename T, typename Enable = void>
class PyPrimitiveConverter;
@@ -663,7 +671,8 @@ class PyPrimitiveConverter<
} else if (arrow::py::is_scalar(value)) {
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
arrow::py::unwrap_scalar(value));
- ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar));
+ ARROW_RETURN_NOT_OK(
+ this->primitive_builder_->AppendScalar(GetStorageScalar(*scalar)));
} else {
ARROW_ASSIGN_OR_RAISE(
auto converted, PyValue::Convert(this->primitive_type_,
this->options_, value));
@@ -684,7 +693,8 @@ class PyPrimitiveConverter<
} else if (arrow::py::is_scalar(value)) {
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
arrow::py::unwrap_scalar(value));
- ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar));
+ ARROW_RETURN_NOT_OK(
+ this->primitive_builder_->AppendScalar(GetStorageScalar(*scalar)));
} else {
ARROW_ASSIGN_OR_RAISE(
auto converted, PyValue::Convert(this->primitive_type_,
this->options_, value));
@@ -710,7 +720,8 @@ class PyPrimitiveConverter<T, enable_if_t<std::is_same<T,
FixedSizeBinaryType>::
} else if (arrow::py::is_scalar(value)) {
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
arrow::py::unwrap_scalar(value));
- ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar));
+ ARROW_RETURN_NOT_OK(
+ this->primitive_builder_->AppendScalar(GetStorageScalar(*scalar)));
} else {
ARROW_RETURN_NOT_OK(
PyValue::Convert(this->primitive_type_, this->options_, value,
view_));
@@ -747,7 +758,8 @@ class PyPrimitiveConverter<
} else if (arrow::py::is_scalar(value)) {
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
arrow::py::unwrap_scalar(value));
- ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar));
+ ARROW_RETURN_NOT_OK(
+ this->primitive_builder_->AppendScalar(GetStorageScalar(*scalar)));
} else {
ARROW_RETURN_NOT_OK(
PyValue::Convert(this->primitive_type_, this->options_, value,
view_));
@@ -791,7 +803,7 @@ class PyDictionaryConverter<U, enable_if_has_c_type<U>>
} else if (arrow::py::is_scalar(value)) {
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
arrow::py::unwrap_scalar(value));
- return this->value_builder_->AppendScalar(*scalar, 1);
+ return this->value_builder_->AppendScalar(GetStorageScalar(*scalar), 1);
} else {
ARROW_ASSIGN_OR_RAISE(auto converted,
PyValue::Convert(this->value_type_,
this->options_, value));
@@ -810,7 +822,7 @@ class PyDictionaryConverter<U, enable_if_has_string_view<U>>
} else if (arrow::py::is_scalar(value)) {
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
arrow::py::unwrap_scalar(value));
- return this->value_builder_->AppendScalar(*scalar, 1);
+ return this->value_builder_->AppendScalar(GetStorageScalar(*scalar), 1);
} else {
ARROW_RETURN_NOT_OK(
PyValue::Convert(this->value_type_, this->options_, value, view_));
@@ -983,7 +995,7 @@ class PyStructConverter : public
StructConverter<PyConverter, PyConverterTrait>
} else if (arrow::py::is_scalar(value)) {
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
arrow::py::unwrap_scalar(value));
- return this->struct_builder_->AppendScalar(*scalar);
+ return this->struct_builder_->AppendScalar(GetStorageScalar(*scalar));
}
switch (input_kind_) {
case InputKind::DICT:
diff --git a/python/pyarrow/tests/test_extension_type.py
b/python/pyarrow/tests/test_extension_type.py
index 66fcfc0556..465b556876 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -16,6 +16,7 @@
# under the License.
import contextlib
+import datetime
import os
import shutil
import subprocess
@@ -1486,6 +1487,60 @@ def test_uuid_bytes_property_raises():
pa.scalar(bad)
+def test_array_from_extension_scalars():
+ # One case per C++ converter: FixedSizeBinary, Binary/String
+ builtin_cases = [
+ (pa.uuid(), [b"0123456789abcdef"]),
+ (pa.opaque(pa.binary(), "t", "v"), [b"x", b"y"]),
+ ]
+ for ext_type, values in builtin_cases:
+ scalars = [pa.scalar(v, type=ext_type) for v in values]
+ result = pa.array(scalars, type=ext_type)
+ assert result.equals(pa.array(values, type=ext_type))
+
+ # One case per C++ converter: Numeric, Timestamp/Duration, Struct
+ custom_cases = [
+ (IntegerType(), [100, 200]),
+ (AnnotatedType(pa.timestamp("us"), "ts"),
+ [datetime.datetime(2023, 1, 1)]),
+ (MyStructType(), [{"left": 1, "right": 2}]),
+ ]
+ for ext_type, values in custom_cases:
+ with registered_extension_type(ext_type):
+ scalars = [pa.scalar(v, type=ext_type) for v in values]
+ result = pa.array(scalars, type=ext_type)
+ assert result.equals(pa.array(values, type=ext_type))
+
+ # Null handling
+ uuid_type = pa.uuid()
+ scalars = [pa.scalar(b"0123456789abcdef", type=uuid_type),
+ pa.scalar(None, type=uuid_type)]
+ result = pa.array(scalars, type=uuid_type)
+ assert result[0].is_valid and not result[1].is_valid
+
+ # ExtensionScalar.from_storage path
+ scalars = [
+ pa.ExtensionScalar.from_storage(uuid_type, b"0123456789abcdef"),
+ pa.ExtensionScalar.from_storage(uuid_type, None),
+ ]
+ result = pa.array(scalars, type=uuid_type)
+ expected = pa.array([b"0123456789abcdef", None], type=uuid_type)
+ assert result.equals(expected)
+
+ # Type inference without explicit type
+ u = uuid4()
+ scalars = [pa.scalar(u, type=pa.uuid()), None]
+ result = pa.array(scalars)
+ assert result.type == pa.uuid()
+ assert result[0].as_py() == u
+ assert not result[1].is_valid
+
+ # Mixed extension scalars and raw Python objects
+ u1, u2 = uuid4(), uuid4()
+ result = pa.array([pa.scalar(u1, type=pa.uuid()), u2], type=pa.uuid())
+ assert result.equals(pa.array([u1, u2], type=pa.uuid()))
+
+
def test_tensor_type():
tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 3])
assert tensor_type.extension_name == "arrow.fixed_shape_tensor"