This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 20d928ee7581 [SPARK-55462][PYTHON] Support VariantType in convert_numpy
20d928ee7581 is described below
commit 20d928ee75817705d966c26914ecaded7bcd625d
Author: Fangchen Li <[email protected]>
AuthorDate: Mon Feb 23 07:22:33 2026 +0900
[SPARK-55462][PYTHON] Support VariantType in convert_numpy
### What changes were proposed in this pull request?
Support VariantType in convert_numpy
### Why are the changes needed?
Part of the new Arrow-to-pandas converter.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Unit tests added in `python/pyspark/sql/tests/test_conversion.py`.
### Was this patch authored or co-authored using generative AI tooling?
Generated-by: Claude Opus 4.6
Closes #54312 from fangchenli/SPARK-55462-convert-numpy-variant.
Authored-by: Fangchen Li <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/sql/conversion.py | 7 +++++-
python/pyspark/sql/tests/test_conversion.py | 36 +++++++++++++++++++++++++++++
2 files changed, 42 insertions(+), 1 deletion(-)
diff --git a/python/pyspark/sql/conversion.py b/python/pyspark/sql/conversion.py
index 7b5fd9747a69..34e047ae52d5 100644
--- a/python/pyspark/sql/conversion.py
+++ b/python/pyspark/sql/conversion.py
@@ -1597,6 +1597,7 @@ class ArrowArrayToPandasConversion:
TimestampType,
TimestampNTZType,
UserDefinedType,
+ VariantType,
)
if df_for_struct and isinstance(spark_type, StructType):
return all(isinstance(f.dataType, supported_types) for f in
spark_type.fields)
@@ -1729,13 +1730,17 @@ class ArrowArrayToPandasConversion:
if v is not None
else None
)
+ elif isinstance(spark_type, VariantType):
+ series = arr.to_pandas(date_as_object=True)
+ series = series.map(
+ lambda v: VariantVal(v["value"], v["metadata"]) if v is not
None else None
+ )
# elif isinstance(
# spark_type,
# (
# ArrayType,
# MapType,
# StructType,
- # VariantType,
# GeographyType,
# GeometryType,
# ),
diff --git a/python/pyspark/sql/tests/test_conversion.py
b/python/pyspark/sql/tests/test_conversion.py
index 0560ac983250..c3ac461ca1d4 100644
--- a/python/pyspark/sql/tests/test_conversion.py
+++ b/python/pyspark/sql/tests/test_conversion.py
@@ -44,6 +44,8 @@ from pyspark.sql.types import (
StructType,
TimestampType,
UserDefinedType,
+ VariantType,
+ VariantVal,
)
from pyspark.testing.objects import ExamplePoint, ExamplePointUDT,
PythonOnlyPoint, PythonOnlyUDT
from pyspark.testing.utils import (
@@ -620,6 +622,40 @@ class ArrowArrayToPandasConversionTests(unittest.TestCase):
self.assertEqual(result.iloc[0], ExamplePoint(1.0, 2.0))
self.assertEqual(result.iloc[1], ExamplePoint(3.0, 4.0))
+ def test_variant_convert_numpy(self):
+ import pyarrow as pa
+
+ variant_type = pa.struct(
+ [
+ pa.field("value", pa.binary(), nullable=False),
+ pa.field("metadata", pa.binary(), nullable=False,
metadata={b"variant": b"true"}),
+ ]
+ )
+
+ # basic conversion with nulls
+ arr = pa.array(
+ [
+ {"value": b"\x01", "metadata": b"\x02"},
+ None,
+ {"value": b"\x03", "metadata": b"\x04"},
+ ],
+ type=variant_type,
+ )
+ result = ArrowArrayToPandasConversion.convert_numpy(arr,
VariantType(), ser_name="v")
+ self.assertIsInstance(result.iloc[0], VariantVal)
+ self.assertEqual(result.iloc[0].value, b"\x01")
+ self.assertEqual(result.iloc[0].metadata, b"\x02")
+ self.assertIsNone(result.iloc[1])
+ self.assertEqual(result.iloc[2].value, b"\x03")
+ self.assertEqual(result.iloc[2].metadata, b"\x04")
+ self.assertEqual(result.name, "v")
+
+ # empty
+ result = ArrowArrayToPandasConversion.convert_numpy(
+ pa.array([], type=variant_type), VariantType()
+ )
+ self.assertEqual(len(result), 0)
+
if __name__ == "__main__":
from pyspark.testing import main
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]