This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 33d2dc274ed0 [SPARK-54577][PYTHON] Optimize Py4J calls in schema inference
33d2dc274ed0 is described below
commit 33d2dc274ed01696ab14d1877c8dc6065650da64
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Thu Dec 4 09:13:17 2025 +0800
[SPARK-54577][PYTHON] Optimize Py4J calls in schema inference
### What changes were proposed in this pull request?
Optimize Py4J calls in schema inference
### Why are the changes needed?
to fetch all configs in single py4j call
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
ci
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #53300 from zhengruifeng/py4j_infer_schema.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/sql/session.py | 46 ++++++++++++++++++++++++++++++++++---------
1 file changed, 37 insertions(+), 9 deletions(-)
diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py
index b59ed9f0a840..4e45972d79b3 100644
--- a/python/pyspark/sql/session.py
+++ b/python/pyspark/sql/session.py
@@ -62,7 +62,6 @@ from pyspark.sql.types import (
)
from pyspark.errors.exceptions.captured import install_exception_handler
from pyspark.sql.utils import (
- is_timestamp_ntz_preferred,
to_str,
try_remote_session_classmethod,
remote_only,
@@ -1048,10 +1047,25 @@ class SparkSession(SparkConversionMixin):
errorClass="CANNOT_INFER_EMPTY_SCHEMA",
messageParameters={},
)
- infer_dict_as_struct = self._jconf.inferDictAsStruct()
- infer_array_from_first_element = self._jconf.legacyInferArrayTypeFromFirstElement()
- infer_map_from_first_pair = self._jconf.legacyInferMapStructTypeFromFirstItem()
- prefer_timestamp_ntz = is_timestamp_ntz_preferred()
+
+ (
+ timestampType,
+ inferDictAsStruct,
+ legacyInferArrayTypeFromFirstElement,
+ legacyInferMapStructTypeFromFirstItem,
+ ) = self._jconf.getConfs(
+ [
+ "spark.sql.timestampType",
+ "spark.sql.pyspark.inferNestedDictAsStruct.enabled",
+ "spark.sql.pyspark.legacy.inferArrayTypeFromFirstElement.enabled",
+ "spark.sql.pyspark.legacy.inferMapTypeFromFirstPair.enabled",
+ ]
+ )
+ prefer_timestamp_ntz = timestampType == "TIMESTAMP_NTZ"
+ infer_dict_as_struct = inferDictAsStruct == "true"
+ infer_array_from_first_element = legacyInferArrayTypeFromFirstElement == "true"
+ infer_map_from_first_pair = legacyInferMapStructTypeFromFirstItem == "true"
+
schema = reduce(
_merge_type,
(
@@ -1101,10 +1115,24 @@ class SparkSession(SparkConversionMixin):
messageParameters={},
)
- infer_dict_as_struct = self._jconf.inferDictAsStruct()
- infer_array_from_first_element = self._jconf.legacyInferArrayTypeFromFirstElement()
- infer_map_from_first_pair = self._jconf.legacyInferMapStructTypeFromFirstItem()
- prefer_timestamp_ntz = is_timestamp_ntz_preferred()
+ (
+ timestampType,
+ inferDictAsStruct,
+ legacyInferArrayTypeFromFirstElement,
+ legacyInferMapStructTypeFromFirstItem,
+ ) = self._jconf.getConfs(
+ [
+ "spark.sql.timestampType",
+ "spark.sql.pyspark.inferNestedDictAsStruct.enabled",
+ "spark.sql.pyspark.legacy.inferArrayTypeFromFirstElement.enabled",
+ "spark.sql.pyspark.legacy.inferMapTypeFromFirstPair.enabled",
+ ]
+ )
+ prefer_timestamp_ntz = timestampType == "TIMESTAMP_NTZ"
+ infer_dict_as_struct = inferDictAsStruct == "true"
+ infer_array_from_first_element = legacyInferArrayTypeFromFirstElement == "true"
+ infer_map_from_first_pair = legacyInferMapStructTypeFromFirstItem == "true"
+
if samplingRatio is None:
schema = _infer_schema(
first,
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]