This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.5
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.5 by this push:
new e9865788787 [SPARK-44957][PYTHON][SQL][TESTS] Make PySpark (pyspark-sql module) tests passing without any dependency
e9865788787 is described below
commit e98657887871884501ff84c7b0412d1d2d345e67
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Fri Aug 25 15:39:37 2023 +0900
[SPARK-44957][PYTHON][SQL][TESTS] Make PySpark (pyspark-sql module) tests passing without any dependency
### What changes were proposed in this pull request?
This PR proposes to fix the tests so that they run properly, or are skipped, when optional dependencies are not installed.
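As a rough illustration (not code from this patch; the class and test names here are made up), the skip-when-missing pattern that PySpark test suites rely on looks roughly like this:
```python
# Sketch of the dependency-guard pattern: compute an availability flag and a
# requirement message at import time, then skip dependent tests instead of failing.
import unittest

try:
    import pandas  # noqa: F401

    have_pandas = True
    pandas_requirement_message = None
except ImportError as e:
    have_pandas = False
    pandas_requirement_message = str(e)


@unittest.skipIf(not have_pandas, pandas_requirement_message)
class PandasDependentTests(unittest.TestCase):  # hypothetical test class
    def test_frame_length(self):
        import pandas as pd  # safe: the whole class is skipped when pandas is absent

        self.assertEqual(len(pd.DataFrame({"a": [1, 2]})), 2)


if __name__ == "__main__":
    unittest.main()
```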
### Why are the changes needed?
Currently, running the tests without optional dependencies fails as below:
```
Running PySpark tests. Output is in /.../spark/python/unit-tests.log
Will test against the following Python executables: ['python3']
Will test the following Python modules: ['pyspark-sql']
python3 python_implementation is CPython
python3 version is: Python 3.10.12
Starting test(python3): pyspark.sql.tests.pandas.test_pandas_grouped_map_with_state (temp output: /.../spark/python/target/8e530108-4d5e-46e4-88fb-8f0dfb7b47e2/python3__pyspark.sql.tests.pandas.test_pandas_grouped_map_with_state__jggatex7.log)
Starting test(python3): pyspark.sql.tests.pandas.test_pandas_grouped_map (temp output: /.../spark/python/target/3b6e9e5a-c479-408c-9365-8286330e8e7c/python3__pyspark.sql.tests.pandas.test_pandas_grouped_map__1lrovmur.log)
Starting test(python3): pyspark.sql.tests.pandas.test_pandas_cogrouped_map (temp output: /.../spark/python/target/68c7cf56-ed7a-453e-8d6d-3a0eb519d997/python3__pyspark.sql.tests.pandas.test_pandas_cogrouped_map__sw2875dr.log)
Starting test(python3): pyspark.sql.tests.pandas.test_pandas_map (temp output: /.../spark/python/target/90712186-a104-4491-ae0d-2b5ab973991b/python3__pyspark.sql.tests.pandas.test_pandas_map__ysp4911q.log)
Traceback (most recent call last):
  File "/.../miniconda3/envs/vanilla-3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/.../miniconda3/envs/vanilla-3.10/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/.../workspace/forked/spark/python/pyspark/sql/tests/pandas/test_pandas_map.py", line 27, in <module>
    from pyspark.testing.sqlutils import (
  File "/.../workspace/forked/spark/python/pyspark/testing/__init__.py", line 19, in <module>
    from pyspark.testing.pandasutils import assertPandasOnSparkEqual
  File "/.../workspace/forked/spark/python/pyspark/testing/pandasutils.py", line 22, in <module>
    import pandas as pd
ModuleNotFoundError: No module named 'pandas'
```
PySpark tests should pass without optional dependencies.
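The change to `python/pyspark/testing/pandasutils.py` below follows the guarded-import idea sketched here; this is only an illustrative standalone example (the `have_pandas` flag and `describe_frame` helper are hypothetical, not names from the patch):
```python
# Guarded module-level import: the module stays importable without pandas, and
# callers can check the flag before touching `pd`.
try:
    import pandas as pd

    have_pandas = True
except ImportError:
    pd = None  # type: ignore[assignment]
    have_pandas = False


def describe_frame(data):
    """Summarize `data` with pandas when available; degrade gracefully otherwise."""
    if not have_pandas:
        return "pandas is not installed; skipping DataFrame summary"
    return str(pd.DataFrame(data).dtypes)
```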
### Does this PR introduce _any_ user-facing change?
No, test-only.
### How was this patch tested?
Manually ran as described above.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #42670 from HyukjinKwon/SPARK-44957.
Authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
(cherry picked from commit fb45476d58c7936518cea1b9510145ecd5ec6fd1)
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/sql/functions.py | 24 +++++++-----------------
python/pyspark/sql/tests/test_arrow.py | 6 ++++--
python/pyspark/testing/pandasutils.py | 10 ++++++++--
3 files changed, 19 insertions(+), 21 deletions(-)
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 398ad15676c..aaf4823b3e1 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -7778,9 +7778,6 @@ def to_unix_timestamp(
.. versionadded:: 3.5.0
- .. versionchanged:: 3.5.0
- Supports Spark Connect.
-
Parameters
----------
timestamp : :class:`~pyspark.sql.Column` or str
@@ -7819,9 +7816,6 @@ def to_timestamp_ltz(
.. versionadded:: 3.5.0
- .. versionchanged:: 3.5.0
- Supports Spark Connect.
-
Parameters
----------
timestamp : :class:`~pyspark.sql.Column` or str
@@ -7831,17 +7825,15 @@ def to_timestamp_ltz(
Examples
--------
- >>> spark.conf.set("spark.sql.session.timeZone", "UTC")
>>> df = spark.createDataFrame([("2016-12-31",)], ["e"])
>>> df.select(to_timestamp_ltz(df.e, lit("yyyy-MM-dd")).alias('r')).collect()
+ ... # doctest: +SKIP
[Row(r=datetime.datetime(2016, 12, 31, 0, 0))]
- >>> spark.conf.unset("spark.sql.session.timeZone")
- >>> spark.conf.set("spark.sql.session.timeZone", "UTC")
>>> df = spark.createDataFrame([("2016-12-31",)], ["e"])
>>> df.select(to_timestamp_ltz(df.e).alias('r')).collect()
+ ... # doctest: +SKIP
[Row(r=datetime.datetime(2016, 12, 31, 0, 0))]
- >>> spark.conf.unset("spark.sql.session.timeZone")
"""
if format is not None:
return _invoke_function_over_columns("to_timestamp_ltz", timestamp, format)
@@ -7860,9 +7852,6 @@ def to_timestamp_ntz(
.. versionadded:: 3.5.0
- .. versionchanged:: 3.5.0
- Supports Spark Connect.
-
Parameters
----------
timestamp : :class:`~pyspark.sql.Column` or str
@@ -7872,17 +7861,15 @@ def to_timestamp_ntz(
Examples
--------
- >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
>>> df = spark.createDataFrame([("2016-04-08",)], ["e"])
>>> df.select(to_timestamp_ntz(df.e, lit("yyyy-MM-dd")).alias('r')).collect()
+ ... # doctest: +SKIP
[Row(r=datetime.datetime(2016, 4, 8, 0, 0))]
- >>> spark.conf.unset("spark.sql.session.timeZone")
- >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
>>> df = spark.createDataFrame([("2016-04-08",)], ["e"])
>>> df.select(to_timestamp_ntz(df.e).alias('r')).collect()
+ ... # doctest: +SKIP
[Row(r=datetime.datetime(2016, 4, 8, 0, 0))]
- >>> spark.conf.unset("spark.sql.session.timeZone")
"""
if format is not None:
return _invoke_function_over_columns("to_timestamp_ntz", timestamp, format)
@@ -14447,13 +14434,16 @@ def call_function(funcName: str, *cols: "ColumnOrName") -> Column:
| 2.0|
+-------+
>>> _ = spark.sql("CREATE FUNCTION custom_avg AS 'test.org.apache.spark.sql.MyDoubleAvg'")
+ ... # doctest: +SKIP
>>> df.select(call_function("custom_avg", col("id"))).show()
+ ... # doctest: +SKIP
+------------------------------------+
|spark_catalog.default.custom_avg(id)|
+------------------------------------+
| 102.0|
+------------------------------------+
>>> df.select(call_function("spark_catalog.default.custom_avg", col("id"))).show()
+ ... # doctest: +SKIP
+------------------------------------+
|spark_catalog.default.custom_avg(id)|
+------------------------------------+
diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py
index ac45c4c565f..1b81ed72b22 100644
--- a/python/pyspark/sql/tests/test_arrow.py
+++ b/python/pyspark/sql/tests/test_arrow.py
@@ -24,8 +24,6 @@ import warnings
from distutils.version import LooseVersion
from typing import cast
-import numpy as np
-
from pyspark import SparkContext, SparkConf
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import rand, udf, assert_true, lit
@@ -183,6 +181,8 @@ class ArrowTestsMixin:
@property
def create_np_arrs(self):
+ import numpy as np
+
int_dtypes = ["int8", "int16", "int32", "int64"]
float_dtypes = ["float32", "float64"]
return (
@@ -584,6 +584,8 @@ class ArrowTestsMixin:
self.check_createDataFrame_with_ndarray(arrow_enabled)
def check_createDataFrame_with_ndarray(self, arrow_enabled):
+ import numpy as np
+
dtypes = ["tinyint", "smallint", "int", "bigint", "float", "double"]
expected_dtypes = (
[[("value", t)] for t in dtypes]
diff --git a/python/pyspark/testing/pandasutils.py b/python/pyspark/testing/pandasutils.py
index 1122944b2c0..c80ffb7ee53 100644
--- a/python/pyspark/testing/pandasutils.py
+++ b/python/pyspark/testing/pandasutils.py
@@ -19,11 +19,10 @@ import functools
import shutil
import tempfile
import warnings
-import pandas as pd
from contextlib import contextmanager
from distutils.version import LooseVersion
import decimal
-from typing import Any, Union
+from typing import Any, Union, TYPE_CHECKING
import pyspark.pandas as ps
from pyspark.pandas.frame import DataFrame
@@ -57,6 +56,13 @@ except ImportError as e:
plotly_requirement_message = str(e)
have_plotly = plotly_requirement_message is None
+try:
+ from pyspark.sql.pandas.utils import require_minimum_pandas_version
+
+ require_minimum_pandas_version()
+ import pandas as pd
+except ImportError:
+ pass
__all__ = ["assertPandasOnSparkEqual"]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]