This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 9d414bd53e30 [SPARK-54942][PYTHON][TESTS][FOLLOWUP] Add tests for
pa.Array.cast of nested/container types
9d414bd53e30 is described below
commit 9d414bd53e300617158bae95a14da332319cd981
Author: Yicong-Huang <[email protected]>
AuthorDate: Sat Jan 24 07:34:10 2026 +0800
[SPARK-54942][PYTHON][TESTS][FOLLOWUP] Add tests for pa.Array.cast of
nested/container types
### What changes were proposed in this pull request?
Add `PyArrowNestedTypeCastTests` to test `pa.Array.cast()` for container
types (`list`, `large_list`, `fixed_size_list`, `map`, `struct`) and their
nested variants.
- 11 test methods, 123 test cases total
- Covers successful casts (e.g., list↔large_list, map→list<struct>)
- Covers expected failures (e.g., list→map, container→scalar)
### Why are the changes needed?
Part of [SPARK-54936](https://issues.apache.org/jira/browse/SPARK-54936) to
monitor upstream PyArrow behavior changes. Detects breaking changes when
upgrading PyArrow versions.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
`pytest python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_cast.py::PyArrowNestedTypeCastTests -v`
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #53939 from Yicong-Huang/SPARK-54942/feat/add-nested-type-cast-support.
Lead-authored-by: Yicong-Huang <[email protected]>
Co-authored-by: Yicong Huang <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
.../upstream/pyarrow/test_pyarrow_array_cast.py | 899 ++++++++++++++++++++-
1 file changed, 894 insertions(+), 5 deletions(-)
diff --git a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_cast.py b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_cast.py
index e01b3b6dd81c..6e74ec9231d9 100644
--- a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_cast.py
+++ b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_cast.py
@@ -16,14 +16,28 @@
#
"""
-Tests for PyArrow's pa.Array.cast() method with default parameters only.
+Tests for PyArrow's pa.Array.cast() method with safe=True (default).
-## Numerical Type Conversion Matrix (pa.Array.cast with default safe=True)
+## Scalar Type Cast Tests (PyArrowScalarTypeCastTests)
-### Covered Types:
-- **Signed Integers**: int8, int16, int32, int64
-- **Unsigned Integers**: uint8, uint16, uint32, uint64
+Tests all scalar-to-scalar type conversions:
+- **Integers**: int8, int16, int32, int64, uint8, uint16, uint32, uint64
- **Floats**: float16, float32, float64
+- **Boolean**: bool
+- **Strings**: string, large_string
+- **Binary**: binary, large_binary, fixed_size_binary
+- **Decimal**: decimal128, decimal256
+- **Date**: date32, date64
+- **Time**: time32(s/ms), time64(us/ns)
+- **Timestamp**: timestamp(s/ms/us/ns), with/without timezone
+- **Duration**: duration(s/ms/us/ns)
+
+## Nested/Container Type Cast Tests (PyArrowNestedTypeCastTests)
+
+Tests container-to-container type conversions:
+- **List variants**: list, large_list, fixed_size_list
+- **Map**: map<key, value>
+- **Struct**: struct<fields...>
"""
import platform
@@ -7142,6 +7156,881 @@ class PyArrowNumericalCastTests(unittest.TestCase):
self._run_cast_tests(casts, "time64_ns")
+@unittest.skipIf(not have_pyarrow, pyarrow_requirement_message)
+class PyArrowNestedTypeCastTests(unittest.TestCase):
+ """
+ Tests for PyArrow nested type casts (list, map, struct) with safe=True.
+
+ This class tests container-level type conversions:
+ 1. List variants: list <-> large_list <-> fixed_size_list
+ 2. Map <-> List<Struct>: map<K,V> -> list<struct<key:K, value:V>>
+ 3. Cross-container failures: list<->map, list<->struct, map<->struct
+ 4. Struct -> Struct: field matching, reordering, adding/dropping fields
+ """
+
+ # Class-level storage for test results
+ _cast_results = {}
+ _all_targets = set()
+
+    @classmethod
+    def setUpClass(cls):
+        """Start each run of this class with an empty cast matrix.
+
+        Both class-level containers are rebound to fresh objects, so results
+        recorded by an earlier run of the class in the same process are not
+        reported again.
+        """
+        cls._cast_results, cls._all_targets = {}, set()
+
+    @classmethod
+    def tearDownClass(cls):
+        """Print the cast matrix after all tests complete.
+
+        Nothing is printed when no result was recorded. Otherwise every source
+        type is listed in sorted order with its sorted targets and their
+        status, followed by the number of "Y" and "N" entries.
+        """
+        if not cls._cast_results:
+            return
+
+        banner = "=" * 90
+        print("\n" + banner)
+        print("PyArrow Nested Type Cast Matrix (Y=success, N=expected failure)")
+        print(banner)
+
+        # One section per source type; target labels are unique per source, so
+        # sorting the items orders them by target label.
+        for src in sorted(cls._cast_results):
+            print(f"\n{src}:")
+            for tgt, status in sorted(cls._cast_results[src].items()):
+                print(f"  → {tgt:<45} [{status}]")
+
+        # Summary
+        statuses = [v for s in cls._cast_results.values() for v in s.values()]
+        total_y = statuses.count("Y")
+        total_n = statuses.count("N")
+        print(f"\n{banner}")
+        print(
+            f"Success (Y): {total_y}, Expected Failures (N): {total_n}, "
+            f"Total: {total_y + total_n}"
+        )
+        print(banner)
+
+    def _run_nested_cast_tests(self, casts_dict, source_type_name):
+        """Run cast tests for a nested source type using dictionary of test pairs.
+
+        Parameters
+        ----------
+        casts_dict : dict
+            Maps a target label to a list of test tuples. A tuple is either
+            ``(src_arr, expected_arr)``: ``src_arr.cast(expected_arr.type)``
+            must succeed and equal ``expected_arr`` in type and values, or
+            ``(src_arr, exception_cls, target_type)``:
+            ``src_arr.cast(target_type)`` must raise ``exception_cls``.
+        source_type_name : str
+            Label of the source type; it is the row key of the class-level
+            result matrix and the prefix of every failure message.
+
+        Every case is executed before the test is failed, so one run reports
+        all problems at once.
+        """
+        import pyarrow as pa
+
+        failed_cases = []
+
+        # Row of the class-level matrix that belongs to this source type.
+        results = self.__class__._cast_results.setdefault(source_type_name, {})
+
+        for tgt_type_name, test_pairs in casts_dict.items():
+            self.__class__._all_targets.add(tgt_type_name)
+            expects_failure = False
+            failures_before = len(failed_cases)
+
+            for i, pair in enumerate(test_pairs):
+                src_arr, expected = pair[0], pair[1]
+                # Optional: explicit target type, required when an exception
+                # is expected because there is no expected array to take it from.
+                tgt_type = pair[2] if len(pair) > 2 else None
+                case_id = f"{source_type_name}->{tgt_type_name}[{i}]"
+
+                if isinstance(expected, type) and issubclass(expected, Exception):
+                    expects_failure = True
+                    error = self._check_cast_raises(src_arr, tgt_type, expected)
+                elif isinstance(expected, pa.Array):
+                    error = self._check_cast_result(src_arr, expected)
+                else:
+                    # A malformed case must not be skipped silently.
+                    error = f"test error: unsupported expectation {expected!r}"
+
+                if error is not None:
+                    failed_cases.append(f"{case_id}: {error}")
+
+            # Record result: Y for success, N for expected failure. A target is
+            # labelled N as soon as any of its cases expects an exception, and
+            # nothing is recorded when one of its cases failed.
+            if len(failed_cases) == failures_before:
+                results[tgt_type_name] = "N" if expects_failure else "Y"
+
+        if failed_cases:
+            self.fail("\n".join(failed_cases))
+
+    @staticmethod
+    def _check_cast_raises(src_arr, tgt_type, expected):
+        """Return None if casting to ``tgt_type`` raises ``expected``, else a message."""
+        if tgt_type is None:
+            return f"test error: no target type given for expected {expected.__name__}"
+        try:
+            result = src_arr.cast(tgt_type)
+        except expected:
+            return None
+        except Exception as e:
+            return f"expected {expected.__name__}, got {type(e).__name__}: {e}"
+        return f"expected {expected.__name__}, got success: {result.to_pylist()}"
+
+    @staticmethod
+    def _check_cast_result(src_arr, expected):
+        """Return None if casting to ``expected.type`` yields ``expected``, else a message."""
+        try:
+            result = src_arr.cast(expected.type)
+        except Exception as e:
+            return f"expected success, got {type(e).__name__}: {e}"
+        # to_pylist() is identical for e.g. list and large_list, so the
+        # resulting type has to be compared explicitly.
+        if result.type != expected.type:
+            return f"type mismatch, expected {expected.type}, got {result.type}"
+        if result.to_pylist() != expected.to_pylist():
+            return (
+                f"mismatch, expected {expected.to_pylist()}, "
+                f"got {result.to_pylist()}"
+            )
+        return None
+
+    def test_list_casts(self):
+        """Test list<T> -> all nested types.
+
+        Covers list -> large_list and list -> fixed_size_list (with nulls,
+        empty lists, empty arrays and nested element types), nested list
+        variants, and the list -> map / list -> struct casts that are expected
+        to raise.
+        """
+        import pyarrow as pa
+
+        casts = {
+            "large_list": [
+                (
+                    pa.array([[1, 2], [3, 4]], type=pa.list_(pa.int32())),
+                    pa.array([[1, 2], [3, 4]], type=pa.large_list(pa.int32())),
+                ),
+                (
+                    pa.array([[1, None], None, [3]], type=pa.list_(pa.int32())),
+                    pa.array([[1, None], None, [3]], type=pa.large_list(pa.int32())),
+                ),
+                (
+                    pa.array([[], [1]], type=pa.list_(pa.int32())),
+                    pa.array([[], [1]], type=pa.large_list(pa.int32())),
+                ),
+                (
+                    pa.array([None, None], type=pa.list_(pa.int32())),
+                    pa.array([None, None], type=pa.large_list(pa.int32())),
+                ),
+                (
+                    pa.array([], type=pa.list_(pa.int32())),
+                    pa.array([], type=pa.large_list(pa.int32())),
+                ),
+                # Nested: list<list> -> large_list<list>
+                (
+                    pa.array([[[1, 2], [3]], [[4]]], type=pa.list_(pa.list_(pa.int32()))),
+                    pa.array([[[1, 2], [3]], [[4]]], type=pa.large_list(pa.list_(pa.int32()))),
+                ),
+                # Nested: list<struct> -> large_list<struct>
+                (
+                    pa.array(
+                        [[{"x": 1, "y": "a"}]],
+                        type=pa.list_(pa.struct([("x", pa.int32()), ("y", pa.string())])),
+                    ),
+                    pa.array(
+                        [[{"x": 1, "y": "a"}]],
+                        type=pa.large_list(pa.struct([("x", pa.int32()), ("y", pa.string())])),
+                    ),
+                ),
+            ],
+            "fixed_size_list": [
+                (
+                    pa.array([[1, 2], [3, 4]], type=pa.list_(pa.int32())),
+                    pa.array([[1, 2], [3, 4]], type=pa.list_(pa.int32(), 2)),
+                ),
+                (
+                    pa.array([[1, None], None], type=pa.list_(pa.int32())),
+                    pa.array([[1, None], None], type=pa.list_(pa.int32(), 2)),
+                ),
+                (
+                    pa.array([None, None], type=pa.list_(pa.int32())),
+                    pa.array([None, None], type=pa.list_(pa.int32(), 2)),
+                ),
+                # Mismatched size - failure
+                (
+                    pa.array([[1, 2], [3, 4]], type=pa.list_(pa.int32())),
+                    pa.lib.ArrowInvalid,
+                    pa.list_(pa.int32(), 3),
+                ),
+                # Variable length - failure
+                (
+                    pa.array([[1, 2], [3, 4, 5]], type=pa.list_(pa.int32())),
+                    pa.lib.ArrowInvalid,
+                    pa.list_(pa.int32(), 2),
+                ),
+            ],
+            "list_large_list": [
+                (
+                    pa.array([[[1, 2], [3]], [[4]]], type=pa.list_(pa.list_(pa.int32()))),
+                    pa.array([[[1, 2], [3]], [[4]]], type=pa.list_(pa.large_list(pa.int32()))),
+                ),
+            ],
+            "large_list_large_list": [
+                (
+                    pa.array([[[1, 2]], [[3]]], type=pa.list_(pa.list_(pa.int32()))),
+                    pa.array([[[1, 2]], [[3]]], type=pa.large_list(pa.large_list(pa.int32()))),
+                ),
+            ],
+            "map": [
+                (
+                    pa.array([[1, 2]], type=pa.list_(pa.int32())),
+                    pa.lib.ArrowNotImplementedError,
+                    pa.map_(pa.string(), pa.int32()),
+                ),
+            ],
+            "struct": [
+                (
+                    pa.array([[1, 2]], type=pa.list_(pa.int32())),
+                    pa.lib.ArrowNotImplementedError,
+                    pa.struct([("x", pa.int32()), ("y", pa.int32())]),
+                ),
+            ],
+        }
+        self._run_nested_cast_tests(casts, "list")
+
+    def test_nested_list_casts(self):
+        """Test list<nested> -> other types.
+
+        Source arrays are lists whose elements are lists, structs or maps.
+        The outer or inner list is switched to large_list, and casting
+        list<struct<key, value>> to map is expected to raise.
+        """
+        import pyarrow as pa
+
+        casts = {
+            "large_list<list<>>": [
+                (
+                    pa.array([[[1, 2], [3]], [[4]]], type=pa.list_(pa.list_(pa.int32()))),
+                    pa.array([[[1, 2], [3]], [[4]]], type=pa.large_list(pa.list_(pa.int32()))),
+                ),
+            ],
+            "list<large_list<>>": [
+                (
+                    pa.array([[["a", "b"]], [["c"]]], type=pa.list_(pa.list_(pa.string()))),
+                    pa.array([[["a", "b"]], [["c"]]], type=pa.list_(pa.large_list(pa.string()))),
+                ),
+            ],
+            "large_list<struct<>>": [
+                (
+                    pa.array(
+                        [[{"x": 1, "y": "a"}]],
+                        type=pa.list_(pa.struct([("x", pa.int32()), ("y", pa.string())])),
+                    ),
+                    pa.array(
+                        [[{"x": 1, "y": "a"}]],
+                        type=pa.large_list(pa.struct([("x", pa.int32()), ("y", pa.string())])),
+                    ),
+                ),
+            ],
+            "large_list<map<>>": [
+                (
+                    pa.array(
+                        [[[("a", 1), ("b", 2)]]],
+                        type=pa.list_(pa.map_(pa.string(), pa.int32())),
+                    ),
+                    pa.array(
+                        [[[("a", 1), ("b", 2)]]],
+                        type=pa.large_list(pa.map_(pa.string(), pa.int32())),
+                    ),
+                ),
+            ],
+            "map<>": [
+                (
+                    pa.array(
+                        [[{"key": "a", "value": 1}]],
+                        type=pa.list_(pa.struct([("key", pa.string()), ("value", pa.int32())])),
+                    ),
+                    pa.lib.ArrowNotImplementedError,
+                    pa.map_(pa.string(), pa.int32()),
+                ),
+            ],
+        }
+        self._run_nested_cast_tests(casts, "list<nested>")
+
+    def test_large_list_casts(self):
+        """Test large_list<T> -> all nested types.
+
+        Covers large_list -> list (with and without nulls) and
+        large_list -> fixed_size_list, where a size that does not match the
+        data is expected to raise ArrowInvalid.
+        """
+        import pyarrow as pa
+
+        casts = {
+            "list": [
+                (
+                    pa.array([[1, 2], [3, 4]], type=pa.large_list(pa.int32())),
+                    pa.array([[1, 2], [3, 4]], type=pa.list_(pa.int32())),
+                ),
+                (
+                    pa.array([[1, None], None, [3]], type=pa.large_list(pa.int32())),
+                    pa.array([[1, None], None, [3]], type=pa.list_(pa.int32())),
+                ),
+            ],
+            "fixed_size_list": [
+                (
+                    pa.array([[1, 2], [3, 4]], type=pa.large_list(pa.int32())),
+                    pa.array([[1, 2], [3, 4]], type=pa.list_(pa.int32(), 2)),
+                ),
+                # Mismatched size - failure
+                (
+                    pa.array([[1, 2], [3, 4]], type=pa.large_list(pa.int32())),
+                    pa.lib.ArrowInvalid,
+                    pa.list_(pa.int32(), 3),
+                ),
+            ],
+        }
+        self._run_nested_cast_tests(casts, "large_list")
+
+    def test_nested_large_list_casts(self):
+        """Test large_list<nested> -> other types.
+
+        Source arrays are large_lists whose elements are lists or structs;
+        the outer container is cast to list, or the inner list to large_list.
+        """
+        import pyarrow as pa
+
+        casts = {
+            "list<list<>>": [
+                (
+                    pa.array([[[1, 2]], [[3]]], type=pa.large_list(pa.list_(pa.int32()))),
+                    pa.array([[[1, 2]], [[3]]], type=pa.list_(pa.list_(pa.int32()))),
+                ),
+            ],
+            "large_list<large_list<>>": [
+                (
+                    pa.array([[["a"]], [["b", "c"]]], type=pa.large_list(pa.list_(pa.string()))),
+                    pa.array(
+                        [[["a"]], [["b", "c"]]], type=pa.large_list(pa.large_list(pa.string()))
+                    ),
+                ),
+            ],
+            "list<struct<>>": [
+                (
+                    pa.array(
+                        [[{"x": 1.0}]],
+                        type=pa.large_list(pa.struct([("x", pa.float64())])),
+                    ),
+                    pa.array(
+                        [[{"x": 1.0}]],
+                        type=pa.list_(pa.struct([("x", pa.float64())])),
+                    ),
+                ),
+            ],
+        }
+        self._run_nested_cast_tests(casts, "large_list<nested>")
+
+    def test_fixed_size_list_casts(self):
+        """Check casts whose source is a fixed_size_list<int32, 2> array.
+
+        Targets are list, large_list, a fixed_size_list of the same size with
+        another element type, and a fixed_size_list of another size, which is
+        expected to raise ArrowTypeError.
+        """
+        import pyarrow as pa
+
+        fixed_int32_pair = pa.list_(pa.int32(), 2)
+
+        def two_full_rows():
+            # A fresh source array per case, as each case builds its own.
+            return pa.array([[1, 2], [3, 4]], type=fixed_int32_pair)
+
+        to_list = [
+            (
+                two_full_rows(),
+                pa.array([[1, 2], [3, 4]], type=pa.list_(pa.int32())),
+            ),
+            (
+                pa.array([[1, None], None], type=pa.list_(pa.int32(), 2)),
+                pa.array([[1, None], None], type=pa.list_(pa.int32())),
+            ),
+        ]
+        to_large_list = [
+            (
+                two_full_rows(),
+                pa.array([[1, 2], [3, 4]], type=pa.large_list(pa.int32())),
+            ),
+        ]
+        to_same_size = [
+            (
+                two_full_rows(),
+                pa.array([[1, 2], [3, 4]], type=pa.list_(pa.int64(), 2)),
+            ),
+        ]
+        to_other_size = [
+            (
+                two_full_rows(),
+                pa.lib.ArrowTypeError,
+                pa.list_(pa.int32(), 3),
+            ),
+        ]
+
+        casts = {
+            "list": to_list,
+            "large_list": to_large_list,
+            "fixed_size_list_same_size": to_same_size,
+            "fixed_size_list_diff_size": to_other_size,
+        }
+        self._run_nested_cast_tests(casts, "fixed_size_list")
+
+    def test_nested_fixed_size_list_casts(self):
+        """Test fixed_size_list<nested> -> other types.
+
+        Source arrays are fixed_size_lists of size 2 whose elements are lists
+        or structs; they are cast to list, to large_list, and to a
+        fixed_size_list whose inner list becomes a large_list.
+        """
+        import pyarrow as pa
+
+        casts = {
+            "list<list<>>": [
+                (
+                    pa.array([[[1, 2], [3, 4]]], type=pa.list_(pa.list_(pa.int32()), 2)),
+                    pa.array([[[1, 2], [3, 4]]], type=pa.list_(pa.list_(pa.int32()))),
+                ),
+            ],
+            "large_list<list<>>": [
+                (
+                    pa.array([[["a"], ["b"]]], type=pa.list_(pa.list_(pa.string()), 2)),
+                    pa.array([[["a"], ["b"]]], type=pa.large_list(pa.list_(pa.string()))),
+                ),
+            ],
+            "list<struct<>>": [
+                (
+                    pa.array(
+                        [[{"x": 1.0}, {"x": 2.0}]],
+                        type=pa.list_(pa.struct([("x", pa.float64())]), 2),
+                    ),
+                    pa.array(
+                        [[{"x": 1.0}, {"x": 2.0}]],
+                        type=pa.list_(pa.struct([("x", pa.float64())])),
+                    ),
+                ),
+            ],
+            "fixed_size_list<large_list<>>": [
+                (
+                    pa.array([[[1], [2]]], type=pa.list_(pa.list_(pa.int32()), 2)),
+                    pa.array([[[1], [2]]], type=pa.list_(pa.large_list(pa.int32()), 2)),
+                ),
+            ],
+        }
+        self._run_nested_cast_tests(casts, "fixed_size_list<nested>")
+
+    def test_map_casts(self):
+        """Test map<K, V> -> all nested types.
+
+        Covers map -> list<struct> and large_list<struct> (with nulls, empty
+        maps, empty arrays, other key/value types and custom field names) and
+        the casts expected to raise: fixed_size_list<struct> with a size that
+        does not fit the data, a struct with the wrong number of fields, a
+        list of scalars, and a plain struct.
+        """
+        import pyarrow as pa
+
+        casts = {
+            "list_struct": [
+                (
+                    pa.array(
+                        [[("a", 1), ("b", 2)], [("c", 3)]],
+                        type=pa.map_(pa.string(), pa.int32()),
+                    ),
+                    pa.array(
+                        [
+                            [{"key": "a", "value": 1}, {"key": "b", "value": 2}],
+                            [{"key": "c", "value": 3}],
+                        ],
+                        type=pa.list_(pa.struct([("key", pa.string()), ("value", pa.int32())])),
+                    ),
+                ),
+                (
+                    pa.array(
+                        [[("a", 1), ("b", None)], None],
+                        type=pa.map_(pa.string(), pa.int32()),
+                    ),
+                    pa.array(
+                        [[{"key": "a", "value": 1}, {"key": "b", "value": None}], None],
+                        type=pa.list_(pa.struct([("key", pa.string()), ("value", pa.int32())])),
+                    ),
+                ),
+                (
+                    pa.array([[], [("a", 1)]], type=pa.map_(pa.string(), pa.int32())),
+                    pa.array(
+                        [[], [{"key": "a", "value": 1}]],
+                        type=pa.list_(pa.struct([("key", pa.string()), ("value", pa.int32())])),
+                    ),
+                ),
+                (
+                    pa.array([None, None], type=pa.map_(pa.string(), pa.int32())),
+                    pa.array(
+                        [None, None],
+                        type=pa.list_(pa.struct([("key", pa.string()), ("value", pa.int32())])),
+                    ),
+                ),
+                (
+                    pa.array([], type=pa.map_(pa.string(), pa.int32())),
+                    pa.array(
+                        [],
+                        type=pa.list_(pa.struct([("key", pa.string()), ("value", pa.int32())])),
+                    ),
+                ),
+                (
+                    pa.array([[(1, "x"), (2, "y")]], type=pa.map_(pa.int64(), pa.string())),
+                    pa.array(
+                        [[{"key": 1, "value": "x"}, {"key": 2, "value": "y"}]],
+                        type=pa.list_(pa.struct([("key", pa.int64()), ("value", pa.string())])),
+                    ),
+                ),
+                # Custom field names
+                (
+                    pa.array([[("a", 1), ("b", 2)]], type=pa.map_(pa.string(), pa.int32())),
+                    pa.array(
+                        [[{"k": "a", "v": 1}, {"k": "b", "v": 2}]],
+                        type=pa.list_(pa.struct([("k", pa.string()), ("v", pa.int32())])),
+                    ),
+                ),
+            ],
+            "large_list_struct": [
+                (
+                    pa.array([[("a", 1), ("b", 2)]], type=pa.map_(pa.string(), pa.int32())),
+                    pa.array(
+                        [[{"key": "a", "value": 1}, {"key": "b", "value": 2}]],
+                        type=pa.large_list(
+                            pa.struct([("key", pa.string()), ("value", pa.int32())])
+                        ),
+                    ),
+                ),
+            ],
+            "fixed_size_list_struct": [
+                (
+                    pa.array(
+                        [[("a", 1), ("b", 2)], [("c", 3)]],
+                        type=pa.map_(pa.string(), pa.int32()),
+                    ),
+                    pa.lib.ArrowInvalid,
+                    pa.list_(pa.struct([("key", pa.string()), ("value", pa.int32())]), 2),
+                ),
+            ],
+            "list_struct_wrong_count": [
+                (
+                    pa.array([[("a", 1)]], type=pa.map_(pa.string(), pa.int32())),
+                    pa.lib.ArrowTypeError,
+                    pa.list_(
+                        pa.struct(
+                            [("key", pa.string()), ("value", pa.int32()), ("extra", pa.int32())]
+                        )
+                    ),
+                ),
+                (
+                    pa.array([[("a", 1)]], type=pa.map_(pa.string(), pa.int32())),
+                    pa.lib.ArrowTypeError,
+                    pa.list_(pa.struct([("key", pa.string())])),
+                ),
+            ],
+            "list_scalar": [
+                (
+                    pa.array([[("a", 1)]], type=pa.map_(pa.string(), pa.int32())),
+                    pa.lib.ArrowTypeError,
+                    pa.list_(pa.string()),
+                ),
+            ],
+            "struct": [
+                (
+                    pa.array([[("x", 1), ("y", 2)]], type=pa.map_(pa.string(), pa.int32())),
+                    pa.lib.ArrowNotImplementedError,
+                    pa.struct([("x", pa.int32()), ("y", pa.int32())]),
+                ),
+            ],
+        }
+        self._run_nested_cast_tests(casts, "map")
+
+    def test_nested_map_casts(self):
+        """Test map<nested> -> other types (map with nested key or value).
+
+        Every case here uses a string key and a nested value (struct, list or
+        map) and casts the map to list<struct<key, value>> with the value
+        type kept as is.
+        """
+        import pyarrow as pa
+
+        casts = {
+            "list<struct<key,value:struct<>>>": [
+                (
+                    pa.array(
+                        [[("a", {"x": 1, "y": "hello"})]],
+                        type=pa.map_(
+                            pa.string(),
+                            pa.struct([("x", pa.int32()), ("y", pa.string())]),
+                        ),
+                    ),
+                    pa.array(
+                        [[{"key": "a", "value": {"x": 1, "y": "hello"}}]],
+                        type=pa.list_(
+                            pa.struct(
+                                [
+                                    ("key", pa.string()),
+                                    ("value", pa.struct([("x", pa.int32()), ("y", pa.string())])),
+                                ]
+                            )
+                        ),
+                    ),
+                ),
+            ],
+            "list<struct<key,value:list<>>>": [
+                (
+                    pa.array(
+                        [[("a", [1, 2, 3])]],
+                        type=pa.map_(pa.string(), pa.list_(pa.int32())),
+                    ),
+                    pa.array(
+                        [[{"key": "a", "value": [1, 2, 3]}]],
+                        type=pa.list_(
+                            pa.struct([("key", pa.string()), ("value", pa.list_(pa.int32()))])
+                        ),
+                    ),
+                ),
+                # With nulls
+                (
+                    pa.array(
+                        [[("x", [1.0, None]), ("y", None)], None],
+                        type=pa.map_(pa.string(), pa.list_(pa.float64())),
+                    ),
+                    pa.array(
+                        [[{"key": "x", "value": [1.0, None]}, {"key": "y", "value": None}], None],
+                        type=pa.list_(
+                            pa.struct([("key", pa.string()), ("value", pa.list_(pa.float64()))])
+                        ),
+                    ),
+                ),
+            ],
+            "list<struct<key,value:map<>>>": [
+                (
+                    pa.array(
+                        [[("outer", [("inner", 1)])]],
+                        type=pa.map_(pa.string(), pa.map_(pa.string(), pa.int32())),
+                    ),
+                    pa.array(
+                        [[{"key": "outer", "value": [("inner", 1)]}]],
+                        type=pa.list_(
+                            pa.struct(
+                                [
+                                    ("key", pa.string()),
+                                    ("value", pa.map_(pa.string(), pa.int32())),
+                                ]
+                            )
+                        ),
+                    ),
+                ),
+            ],
+        }
+        self._run_nested_cast_tests(casts, "map<nested>")
+
+    def test_struct_casts(self):
+        """Test struct<fields...> -> all nested types.
+
+        Covers struct -> struct with the same schema, with non-matching field
+        names, with more or fewer fields and with reordered fields, and the
+        struct -> list / struct -> map casts that are expected to raise.
+        """
+        import pyarrow as pa
+
+        casts = {
+            # struct -> struct (same schema)
+            "struct_same": [
+                (
+                    pa.array(
+                        [{"x": 1, "y": 2}, {"x": 3, "y": 4}],
+                        type=pa.struct([("x", pa.int32()), ("y", pa.int32())]),
+                    ),
+                    pa.array(
+                        [{"x": 1, "y": 2}, {"x": 3, "y": 4}],
+                        type=pa.struct([("x", pa.int32()), ("y", pa.int32())]),
+                    ),
+                ),
+                # With nulls
+                (
+                    pa.array(
+                        [{"x": 1, "y": None}, None],
+                        type=pa.struct([("x", pa.int32()), ("y", pa.int32())]),
+                    ),
+                    pa.array(
+                        [{"x": 1, "y": None}, None],
+                        type=pa.struct([("x", pa.int32()), ("y", pa.int32())]),
+                    ),
+                ),
+                # All nulls
+                (
+                    pa.array(
+                        [None, None],
+                        type=pa.struct([("x", pa.int32()), ("y", pa.int32())]),
+                    ),
+                    pa.array(
+                        [None, None],
+                        type=pa.struct([("x", pa.int32()), ("y", pa.int32())]),
+                    ),
+                ),
+                # Empty array
+                (
+                    pa.array([], type=pa.struct([("x", pa.int32())])),
+                    pa.array([], type=pa.struct([("x", pa.int32())])),
+                ),
+            ],
+            # struct -> struct (field name mismatch - missing become null)
+            "struct_name_mismatch": [
+                (
+                    pa.array(
+                        [{"x": 1, "y": 2}],
+                        type=pa.struct([("x", pa.int32()), ("y", pa.int32())]),
+                    ),
+                    pa.array(
+                        [{"a": None, "b": None}],
+                        type=pa.struct([("a", pa.int32()), ("b", pa.int32())]),
+                    ),
+                ),
+            ],
+            # struct -> struct (more fields - extra become null)
+            "struct_more_fields": [
+                (
+                    pa.array(
+                        [{"x": 1, "y": 2}],
+                        type=pa.struct([("x", pa.int32()), ("y", pa.int32())]),
+                    ),
+                    pa.array(
+                        [{"x": 1, "y": 2, "z": None}],
+                        type=pa.struct([("x", pa.int32()), ("y", pa.int32()), ("z", pa.int32())]),
+                    ),
+                ),
+            ],
+            # struct -> struct (fewer fields - drops extra)
+            "struct_fewer_fields": [
+                (
+                    pa.array(
+                        [{"x": 1, "y": 2}],
+                        type=pa.struct([("x", pa.int32()), ("y", pa.int32())]),
+                    ),
+                    pa.array([{"x": 1}], type=pa.struct([("x", pa.int32())])),
+                ),
+            ],
+            # struct -> struct (field reorder)
+            "struct_reorder": [
+                (
+                    pa.array(
+                        [{"x": 1, "y": 2}],
+                        type=pa.struct([("x", pa.int32()), ("y", pa.int32())]),
+                    ),
+                    pa.array(
+                        [{"y": 2, "x": 1}],
+                        type=pa.struct([("y", pa.int32()), ("x", pa.int32())]),
+                    ),
+                ),
+            ],
+            # struct -> list (not supported)
+            "list": [
+                (
+                    pa.array(
+                        [{"x": 1, "y": 2}],
+                        type=pa.struct([("x", pa.int32()), ("y", pa.int32())]),
+                    ),
+                    pa.lib.ArrowNotImplementedError,
+                    pa.list_(pa.int32()),
+                ),
+            ],
+            # struct -> map (not supported)
+            "map": [
+                (
+                    pa.array(
+                        [{"x": 1, "y": 2}],
+                        type=pa.struct([("x", pa.int32()), ("y", pa.int32())]),
+                    ),
+                    pa.lib.ArrowNotImplementedError,
+                    pa.map_(pa.string(), pa.int32()),
+                ),
+            ],
+        }
+        self._run_nested_cast_tests(casts, "struct")
+
+    def test_nested_struct_casts(self):
+        """Test struct<nested> -> other types.
+
+        Source structs hold a list, a struct or a map field. The list field is
+        cast to large_list, the int32 fields of the nested struct case are
+        cast to int64, and the remaining cases cast to an identical type.
+        """
+        import pyarrow as pa
+
+        casts = {
+            "struct<large_list<>>": [
+                (
+                    pa.array(
+                        [{"items": [1, 2, 3], "name": "test"}],
+                        type=pa.struct([("items", pa.list_(pa.int32())), ("name", pa.string())]),
+                    ),
+                    pa.array(
+                        [{"items": [1, 2, 3], "name": "test"}],
+                        type=pa.struct(
+                            [
+                                ("items", pa.large_list(pa.int32())),
+                                ("name", pa.string()),
+                            ]
+                        ),
+                    ),
+                ),
+            ],
+            "struct<struct<>>": [
+                (
+                    pa.array(
+                        [{"inner": {"a": 1, "b": "x"}, "id": 100}],
+                        type=pa.struct(
+                            [
+                                ("inner", pa.struct([("a", pa.int32()), ("b", pa.string())])),
+                                ("id", pa.int32()),
+                            ]
+                        ),
+                    ),
+                    pa.array(
+                        [{"inner": {"a": 1, "b": "x"}, "id": 100}],
+                        type=pa.struct(
+                            [
+                                ("inner", pa.struct([("a", pa.int64()), ("b", pa.string())])),
+                                ("id", pa.int64()),
+                            ]
+                        ),
+                    ),
+                ),
+                # With nulls
+                (
+                    pa.array(
+                        [{"inner": None, "id": 1}, None],
+                        type=pa.struct(
+                            [
+                                ("inner", pa.struct([("x", pa.float64())])),
+                                ("id", pa.int32()),
+                            ]
+                        ),
+                    ),
+                    pa.array(
+                        [{"inner": None, "id": 1}, None],
+                        type=pa.struct(
+                            [
+                                ("inner", pa.struct([("x", pa.float64())])),
+                                ("id", pa.int32()),
+                            ]
+                        ),
+                    ),
+                ),
+            ],
+            "struct<map<>>": [
+                (
+                    pa.array(
+                        [{"data": [("k1", 1), ("k2", 2)], "label": "test"}],
+                        type=pa.struct(
+                            [
+                                ("data", pa.map_(pa.string(), pa.int32())),
+                                ("label", pa.string()),
+                            ]
+                        ),
+                    ),
+                    pa.array(
+                        [{"data": [("k1", 1), ("k2", 2)], "label": "test"}],
+                        type=pa.struct(
+                            [
+                                ("data", pa.map_(pa.string(), pa.int32())),
+                                ("label", pa.string()),
+                            ]
+                        ),
+                    ),
+                ),
+            ],
+        }
+        self._run_nested_cast_tests(casts, "struct<nested>")
+
+    def test_container_to_scalar_fails(self):
+        """Verify all container types fail when casting to scalar types.
+
+        Each sample container array is cast to each listed scalar type with
+        ``safe=True``; every cast must raise ``ArrowNotImplementedError``. All
+        combinations are tried before the test is failed, so one run reports
+        every unexpected outcome.
+        """
+        import pyarrow as pa
+
+        # All container type sample arrays
+        container_arrays = [
+            ("list<int32>", pa.array([[1, 2], [3]], pa.list_(pa.int32()))),
+            ("list<string>", pa.array([["a", "b"]], pa.list_(pa.string()))),
+            ("large_list<int32>", pa.array([[1, 2]], pa.large_list(pa.int32()))),
+            ("large_list<float64>", pa.array([[1.5, 2.5]], pa.large_list(pa.float64()))),
+            ("fixed_size_list<int32,2>", pa.array([[1, 2]], pa.list_(pa.int32(), 2))),
+            ("fixed_size_list<string,2>", pa.array([["a", "b"]], pa.list_(pa.string(), 2))),
+            ("map<string,int32>", pa.array([[("a", 1)]], pa.map_(pa.string(), pa.int32()))),
+            ("map<int64,string>", pa.array([[(1, "x")]], pa.map_(pa.int64(), pa.string()))),
+            ("struct<x:int32>", pa.array([{"x": 1}], pa.struct([("x", pa.int32())]))),
+            (
+                "struct<a:string,b:float64>",
+                pa.array(
+                    [{"a": "hello", "b": 1.5}], pa.struct([("a", pa.string()), ("b", pa.float64())])
+                ),
+            ),
+        ]
+
+        # All scalar types to test
+        scalar_types = [
+            # String/Binary
+            ("string", pa.string()),
+            ("binary", pa.binary()),
+            # Numeric
+            ("int64", pa.int64()),
+            ("float64", pa.float64()),
+            # Boolean
+            ("bool", pa.bool_()),
+            # Temporal
+            ("date32", pa.date32()),
+            ("timestamp_us", pa.timestamp("us")),
+            ("duration_ns", pa.duration("ns")),
+        ]
+
+        failed_cases = []
+
+        for container_name, container_arr in container_arrays:
+            for scalar_name, scalar_type in scalar_types:
+                case_id = f"{container_name} -> {scalar_name}"
+                try:
+                    result = container_arr.cast(scalar_type, safe=True)
+                except pa.lib.ArrowNotImplementedError:
+                    continue  # Expected
+                except Exception as e:
+                    failed_cases.append(
+                        f"{case_id}: expected ArrowNotImplementedError, "
+                        f"got {type(e).__name__}: {e}"
+                    )
+                else:
+                    # The cast went through although it should be unsupported.
+                    failed_cases.append(
+                        f"{case_id}: expected ArrowNotImplementedError, "
+                        f"got success: {result.to_pylist()}"
+                    )
+
+        if failed_cases:
+            self.fail("\n".join(failed_cases))
+
+
if __name__ == "__main__":
from pyspark.testing import main
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]