This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new dc9f55966093 Revert "[SPARK-54285][PYTHON] Cache timezone info to
avoid expensive timestamp conversion"
dc9f55966093 is described below
commit dc9f559660937098ceeded113c2318c5c14ba73f
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Mon Mar 9 15:53:13 2026 +0800
Revert "[SPARK-54285][PYTHON] Cache timezone info to avoid expensive
timestamp conversion"
### What changes were proposed in this pull request?
revert https://github.com/apache/spark/pull/52980
### Why are the changes needed?
after the winter->summer time shift, the test for timestamp coercion for
vanilla python udf `pyspark.sql.tests.coercion.test_python_udf_input_type`
starts
[failing](https://github.com/apache/spark/actions/runs/22837174577/job/66235976642),
it seems the datetime had 1 additional hour added after udf execution.
I found that the suspicious commit is the [cached
timezone](https://github.com/apache/spark/commit/5fb072e4f25f471e69e2b81ee0155cc24a20725a);
the test passes after reverting it.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
CI
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #54682 from zhengruifeng/fix_time_udf.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/sql/types.py | 13 +------------
python/pyspark/worker.py | 8 +-------
2 files changed, 2 insertions(+), 19 deletions(-)
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 943e0943ca48..3575f1f95398 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -443,12 +443,6 @@ class TimeType(AnyTimeType):
class TimestampType(DatetimeType, metaclass=DataTypeSingleton):
"""Timestamp (datetime.datetime) data type."""
- # We need to cache the timezone info for datetime.datetime.fromtimestamp
- # otherwise the forked process will be extremely slow to convert the
timestamp.
- # This is probably a glibc issue - the forked process will have a bad
cache/lock
- # status for the timezone info.
- tz_info = None
-
def needConversion(self) -> bool:
return True
@@ -462,12 +456,7 @@ class TimestampType(DatetimeType,
metaclass=DataTypeSingleton):
def fromInternal(self, ts: int) -> datetime.datetime:
if ts is not None:
# using int to avoid precision loss in float
- # If TimestampType.tz_info is not None, we need to use it to
convert the timestamp.
- # Otherwise, we need to use the default timezone.
- # We need to replace the tzinfo to None to keep backward
compatibility
- return datetime.datetime.fromtimestamp(ts // 1000000,
self.tz_info).replace(
- microsecond=ts % 1000000, tzinfo=None
- )
+ return datetime.datetime.fromtimestamp(ts //
1000000).replace(microsecond=ts % 1000000)
class TimestampNTZType(DatetimeType, metaclass=DataTypeSingleton):
diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py
index 84e275936fb8..7fbe0849ee63 100644
--- a/python/pyspark/worker.py
+++ b/python/pyspark/worker.py
@@ -18,7 +18,6 @@
"""
Worker that receives input from Piped RDD.
"""
-import datetime
import os
import sys
import dataclasses
@@ -73,7 +72,7 @@ from pyspark.sql.pandas.serializers import (
ArrowStreamUDTFSerializer,
ArrowStreamArrowUDTFSerializer,
)
-from pyspark.sql.pandas.types import to_arrow_type, TimestampType
+from pyspark.sql.pandas.types import to_arrow_type
from pyspark.sql.types import (
ArrayType,
BinaryType,
@@ -3392,11 +3391,6 @@ def main(infile, outfile):
if split_index == -1: # for unit tests
sys.exit(-1)
start_faulthandler_periodic_traceback()
-
- # Use the local timezone to convert the timestamp
- tz = datetime.datetime.now().astimezone().tzinfo
- TimestampType.tz_info = tz
-
check_python_version(infile)
memory_limit_mb = int(os.environ.get("PYSPARK_EXECUTOR_MEMORY_MB",
"-1"))
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]