This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new f46f387cc2fb [SPARK-54608][PYTHON] Avoid double caching of type
converter in UDTF
f46f387cc2fb is described below
commit f46f387cc2fb1004d7f05e0a64998c454347f4f9
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Fri Dec 5 10:21:38 2025 -0800
[SPARK-54608][PYTHON] Avoid double caching of type converter in UDTF
### What changes were proposed in this pull request?
Avoid double caching of type converter in UDTF
### Why are the changes needed?
In
https://github.com/apache/spark/commit/38bc351fcbdd7fb86da665194ad510de7cc6f2ed,
`_create_converter_from_pandas` was cached with `functools.lru_cache`, so we
no longer need to cache it at call sites.
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
ci
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #53343 from zhengruifeng/py_cache_udtf.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
python/pyspark/sql/pandas/serializers.py | 20 ++++++--------------
1 file changed, 6 insertions(+), 14 deletions(-)
diff --git a/python/pyspark/sql/pandas/serializers.py
b/python/pyspark/sql/pandas/serializers.py
index 667af40c36bc..d46e0b2052cc 100644
--- a/python/pyspark/sql/pandas/serializers.py
+++ b/python/pyspark/sql/pandas/serializers.py
@@ -964,7 +964,6 @@ class
ArrowStreamPandasUDTFSerializer(ArrowStreamPandasUDFSerializer):
# Enable additional coercions for UDTF serialization
int_to_decimal_coercion_enabled=int_to_decimal_coercion_enabled,
)
- self._converter_map = dict()
def _create_batch(self, series):
"""
@@ -1012,18 +1011,6 @@ class
ArrowStreamPandasUDTFSerializer(ArrowStreamPandasUDFSerializer):
return pa.RecordBatch.from_arrays(arrs, ["_%d" % i for i in
range(len(arrs))])
- def _get_or_create_converter_from_pandas(self, dt):
- key = dt.json()
- if key not in self._converter_map:
- conv = _create_converter_from_pandas(
- dt,
- timezone=self._timezone,
- error_on_duplicated_field_names=False,
- ignore_unexpected_complex_type_values=True,
- )
- self._converter_map[key] = conv
- return self._converter_map[key]
-
def _create_array(self, series, arrow_type, spark_type=None,
arrow_cast=False):
"""
Override the `_create_array` method in the superclass to create an
Arrow Array
@@ -1055,7 +1042,12 @@ class
ArrowStreamPandasUDTFSerializer(ArrowStreamPandasUDFSerializer):
if arrow_type is not None:
dt = spark_type or from_arrow_type(arrow_type,
prefer_timestamp_ntz=True)
- conv = self._get_or_create_converter_from_pandas(dt)
+ conv = _create_converter_from_pandas(
+ dt,
+ timezone=self._timezone,
+ error_on_duplicated_field_names=False,
+ ignore_unexpected_complex_type_values=True,
+ )
series = conv(series)
if self._int_to_decimal_coercion_enabled:
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]