This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.5 by this push: new 98645a267511 [SPARK-52791][PS] Fix error when inferring a UDT with a null first element 98645a267511 is described below commit 98645a2675110d9f4aab3d6dca1fc4e89cdff053 Author: Peter Nguyen <petern0...@gmail.com> AuthorDate: Wed Jul 23 09:14:42 2025 +0900 [SPARK-52791][PS] Fix error when inferring a UDT with a null first element I modified the UDT condition to check the first non-null element instead of the first element (which might be null). ``` import pyspark.pandas as ps from pyspark.ml.linalg import SparseVector sparse_values = {0: 0.1, 1: 1.1} ps_series = ps.Series([None, SparseVector(1, {0: 1.2}), SparseVector(1, {0: 3})]) ``` Error: ``` pyarrow.lib.ArrowInvalid: Could not convert SparseVector(1, {0: 1.2}) with type SparseVector: did not recognize Python value type when inferring an Arrow data type ``` This should work as normal, but it fails because the first element is None. Yes, previously it would error, but now it works properly. This is a behavior change from all previous Spark versions, and should probably be backported. Added a test. No. Closes #51475 from petern48/fix_infer_spark_type. 
Authored-by: Peter Nguyen <petern0...@gmail.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> (cherry picked from commit 5182eb4c6a51989b37f054ef07173cd797611d2b) Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- .../pandas/tests/data_type_ops/test_udt_ops.py | 20 ++++++++++++++++++++ python/pyspark/pandas/typedef/typehints.py | 5 +++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py index 45f8cca56ee9..7b264582e044 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py @@ -129,6 +129,26 @@ class UDTOpsTestsMixin: self.assert_eq(pser, psser._to_pandas()) self.assert_eq(ps.from_pandas(pser), psser) + def test_with_first_null(self): + lst = [None, None, None, SparseVector(1, {0: 0.1})] + pser = pd.Series(lst) + psser = ps.Series(lst) + self.assert_eq(pser, psser._to_pandas()) + self.assert_eq(ps.from_pandas(pser), psser) + + lst2 = [SparseVector(1, {0: 0.1}), None, None, None] + pdf = pd.DataFrame({"a": lst, "b": lst2}) + psdf = ps.DataFrame({"a": lst, "b": lst2}) + self.assert_eq(pdf, psdf._to_pandas()) + self.assert_eq(ps.from_pandas(pdf), psdf) + + def test_with_all_null(self): + lst = [None, None, None, None] + pser = pd.Series(lst, dtype=object) + psser = ps.Series(lst, dtype=object) + self.assert_eq(pser, psser._to_pandas()) + self.assert_eq(ps.from_pandas(pser), psser) + def test_isnull(self): self.assert_eq(self.pser.isnull(), self.psser.isnull()) diff --git a/python/pyspark/pandas/typedef/typehints.py b/python/pyspark/pandas/typedef/typehints.py index 012eabf958eb..5c7b3e01686a 100644 --- a/python/pyspark/pandas/typedef/typehints.py +++ b/python/pyspark/pandas/typedef/typehints.py @@ -354,8 +354,9 @@ def infer_pd_series_spark_type( if dtype == np.dtype("object"): if len(pser) == 0 or pser.isnull().all(): return types.NullType() - elif 
hasattr(pser.iloc[0], "__UDT__"): - return pser.iloc[0].__UDT__ + notnull = pser[pser.notnull()] + if hasattr(notnull.iloc[0], "__UDT__"): + return notnull.iloc[0].__UDT__ else: return from_arrow_type(pa.Array.from_pandas(pser).type, prefer_timestamp_ntz) elif isinstance(dtype, CategoricalDtype): --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org