This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new fb45476d58c  [SPARK-44957][PYTHON][SQL][TESTS] Make PySpark (pyspark-sql module) tests passing without any dependency
fb45476d58c is described below

commit fb45476d58c7936518cea1b9510145ecd5ec6fd1
Author: Hyukjin Kwon <gurwls...@apache.org>
AuthorDate: Fri Aug 25 15:39:37 2023 +0900

    [SPARK-44957][PYTHON][SQL][TESTS] Make PySpark (pyspark-sql module) tests passing without any dependency

    ### What changes were proposed in this pull request?

    This PR proposes to fix the tests so that they run properly, or are skipped, when the optional dependencies are not installed.

    ### Why are the changes needed?

    Currently, the tests fail as below:

    ```
    Running PySpark tests. Output is in /.../spark/python/unit-tests.log
    Will test against the following Python executables: ['python3']
    Will test the following Python modules: ['pyspark-sql']
    python3 python_implementation is CPython
    python3 version is: Python 3.10.12
    Starting test(python3): pyspark.sql.tests.pandas.test_pandas_grouped_map_with_state (temp output: /.../spark/python/target/8e530108-4d5e-46e4-88fb-8f0dfb7b47e2/python3__pyspark.sql.tests.pandas.test_pandas_grouped_map_with_state__jggatex7.log)
    Starting test(python3): pyspark.sql.tests.pandas.test_pandas_grouped_map (temp output: /.../spark/python/target/3b6e9e5a-c479-408c-9365-8286330e8e7c/python3__pyspark.sql.tests.pandas.test_pandas_grouped_map__1lrovmur.log)
    Starting test(python3): pyspark.sql.tests.pandas.test_pandas_cogrouped_map (temp output: /.../spark/python/target/68c7cf56-ed7a-453e-8d6d-3a0eb519d997/python3__pyspark.sql.tests.pandas.test_pandas_cogrouped_map__sw2875dr.log)
    Starting test(python3): pyspark.sql.tests.pandas.test_pandas_map (temp output: /.../spark/python/target/90712186-a104-4491-ae0d-2b5ab973991b/python3__pyspark.sql.tests.pandas.test_pandas_map__ysp4911q.log)
    Traceback (most recent call last):
      File "/.../miniconda3/envs/vanilla-3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main
        return _run_code(code, main_globals, None,
      File "/.../miniconda3/envs/vanilla-3.10/lib/python3.10/runpy.py", line 86, in _run_code
        exec(code, run_globals)
      File "/.../workspace/forked/spark/python/pyspark/sql/tests/pandas/test_pandas_map.py", line 27, in <module>
        from pyspark.testing.sqlutils import (
      File "/.../workspace/forked/spark/python/pyspark/testing/__init__.py", line 19, in <module>
        from pyspark.testing.pandasutils import assertPandasOnSparkEqual
      File "/.../workspace/forked/spark/python/pyspark/testing/pandasutils.py", line 22, in <module>
        import pandas as pd
    ModuleNotFoundError: No module named 'pandas'
    ```

    PySpark tests should pass without the optional dependencies.

    ### Does this PR introduce _any_ user-facing change?

    No, test-only.

    ### How was this patch tested?

    Manually ran the tests as described above.

    ### Was this patch authored or co-authored using generative AI tooling?

    No.

    Closes #42670 from HyukjinKwon/SPARK-44957.

    Authored-by: Hyukjin Kwon <gurwls...@apache.org>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 python/pyspark/sql/functions.py        | 24 +++++++-----------------
 python/pyspark/sql/tests/test_arrow.py |  6 ++++--
 python/pyspark/testing/pandasutils.py  | 10 ++++++++--
 3 files changed, 19 insertions(+), 21 deletions(-)
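For readers less familiar with how PySpark test modules stay importable when an optional package is missing: the usual pattern is to probe the import once at module load and gate the dependent test classes with `unittest.skipIf`. The sketch below shows only that general pattern; the names `have_pandas`, `pandas_requirement_message`, and `PandasDependentTests` are illustrative (modeled on `pyspark.testing.sqlutils`) and are not code from this commit.

```python
import unittest

# Probe the optional dependency once at import time; a missing package
# must record a skip reason instead of breaking module import, which is
# what the traceback above shows happening.
pandas_requirement_message = None
try:
    import pandas  # noqa: F401
except ImportError as e:
    pandas_requirement_message = str(e)
have_pandas = pandas_requirement_message is None


@unittest.skipIf(not have_pandas, pandas_requirement_message or "pandas not installed")
class PandasDependentTests(unittest.TestCase):
    def test_sum(self):
        import pandas as pd  # safe: the whole class is skipped without pandas

        self.assertEqual(pd.Series([1, 2]).sum(), 3)


if __name__ == "__main__":
    unittest.main()
```

The patch below applies the same principle in three places: doctests that need extra setup gain `# doctest: +SKIP`, and the numpy/pandas imports in the test utilities are deferred or guarded.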
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 535ac06530a..5d5557cb916 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -7826,9 +7826,6 @@ def to_unix_timestamp(
 
     .. versionadded:: 3.5.0
 
-    .. versionchanged:: 3.5.0
-        Supports Spark Connect.
-
     Parameters
     ----------
     timestamp : :class:`~pyspark.sql.Column` or str
@@ -7861,9 +7858,6 @@ def to_timestamp_ltz(
 
     .. versionadded:: 3.5.0
 
-    .. versionchanged:: 3.5.0
-        Supports Spark Connect.
-
     Parameters
     ----------
     timestamp : :class:`~pyspark.sql.Column` or str
@@ -7873,17 +7867,15 @@ def to_timestamp_ltz(
 
     Examples
     --------
-    >>> spark.conf.set("spark.sql.session.timeZone", "UTC")
     >>> df = spark.createDataFrame([("2016-12-31",)], ["e"])
     >>> df.select(to_timestamp_ltz(df.e, lit("yyyy-MM-dd")).alias('r')).collect()
+    ... # doctest: +SKIP
     [Row(r=datetime.datetime(2016, 12, 31, 0, 0))]
-    >>> spark.conf.unset("spark.sql.session.timeZone")
-
-    >>> spark.conf.set("spark.sql.session.timeZone", "UTC")
+
     >>> df = spark.createDataFrame([("2016-12-31",)], ["e"])
     >>> df.select(to_timestamp_ltz(df.e).alias('r')).collect()
+    ... # doctest: +SKIP
     [Row(r=datetime.datetime(2016, 12, 31, 0, 0))]
-    >>> spark.conf.unset("spark.sql.session.timeZone")
     """
     if format is not None:
         return _invoke_function_over_columns("to_timestamp_ltz", timestamp, format)
@@ -7902,9 +7894,6 @@ def to_timestamp_ntz(
 
     .. versionadded:: 3.5.0
 
-    .. versionchanged:: 3.5.0
-        Supports Spark Connect.
-
     Parameters
     ----------
     timestamp : :class:`~pyspark.sql.Column` or str
@@ -7914,17 +7903,15 @@ def to_timestamp_ntz(
 
     Examples
     --------
-    >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
     >>> df = spark.createDataFrame([("2016-04-08",)], ["e"])
     >>> df.select(to_timestamp_ntz(df.e, lit("yyyy-MM-dd")).alias('r')).collect()
+    ... # doctest: +SKIP
     [Row(r=datetime.datetime(2016, 4, 8, 0, 0))]
-    >>> spark.conf.unset("spark.sql.session.timeZone")
-
-    >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
+
     >>> df = spark.createDataFrame([("2016-04-08",)], ["e"])
     >>> df.select(to_timestamp_ntz(df.e).alias('r')).collect()
+    ... # doctest: +SKIP
     [Row(r=datetime.datetime(2016, 4, 8, 0, 0))]
-    >>> spark.conf.unset("spark.sql.session.timeZone")
     """
     if format is not None:
         return _invoke_function_over_columns("to_timestamp_ntz", timestamp, format)
@@ -14489,13 +14476,16 @@ def call_function(funcName: str, *cols: "ColumnOrName") -> Column:
     |    2.0|
     +-------+
     >>> _ = spark.sql("CREATE FUNCTION custom_avg AS 'test.org.apache.spark.sql.MyDoubleAvg'")
+    ... # doctest: +SKIP
     >>> df.select(call_function("custom_avg", col("id"))).show()
+    ... # doctest: +SKIP
     +------------------------------------+
     |spark_catalog.default.custom_avg(id)|
     +------------------------------------+
     |                               102.0|
     +------------------------------------+
     >>> df.select(call_function("spark_catalog.default.custom_avg", col("id"))).show()
+    ... # doctest: +SKIP
     +------------------------------------+
     |spark_catalog.default.custom_avg(id)|
     +------------------------------------+
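The `functions.py` hunks above rely on doctest's `+SKIP` directive. Because `...` begins a PS2 continuation line, a directive comment placed there attaches to the preceding example without changing its expected output, so the example is still rendered in the API docs but is never executed. A minimal, self-contained illustration (the `add` function and the unbound `spark` name are hypothetical, not Spark code):

```python
import doctest


def add(a, b):
    """Add two numbers.

    The first example runs normally; the second carries +SKIP on its
    continuation line, so the undefined name `spark` is never evaluated.

    >>> add(1, 2)
    3
    >>> spark.range(10).count()
    ... # doctest: +SKIP
    10
    """
    return a + b


if __name__ == "__main__":
    # Prints TestResults(failed=0, attempted=1): the skipped example
    # is not even attempted by the runner.
    print(doctest.testmod())
```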
diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py
index e26aabbea27..a97801f3af1 100644
--- a/python/pyspark/sql/tests/test_arrow.py
+++ b/python/pyspark/sql/tests/test_arrow.py
@@ -24,8 +24,6 @@ import warnings
 from distutils.version import LooseVersion
 from typing import cast
 
-import numpy as np
-
 from pyspark import SparkContext, SparkConf
 from pyspark.sql import Row, SparkSession
 from pyspark.sql.functions import rand, udf, assert_true, lit
@@ -183,6 +181,8 @@ class ArrowTestsMixin:
 
     @property
     def create_np_arrs(self):
+        import numpy as np
+
         int_dtypes = ["int8", "int16", "int32", "int64"]
         float_dtypes = ["float32", "float64"]
         return (
@@ -584,6 +584,8 @@ class ArrowTestsMixin:
             self.check_createDataFrame_with_ndarray(arrow_enabled)
 
     def check_createDataFrame_with_ndarray(self, arrow_enabled):
+        import numpy as np
+
         dtypes = ["tinyint", "smallint", "int", "bigint", "float", "double"]
         expected_dtypes = (
             [[("value", t)] for t in dtypes]
diff --git a/python/pyspark/testing/pandasutils.py b/python/pyspark/testing/pandasutils.py
index 1122944b2c0..c80ffb7ee53 100644
--- a/python/pyspark/testing/pandasutils.py
+++ b/python/pyspark/testing/pandasutils.py
@@ -19,11 +19,10 @@ import functools
 import shutil
 import tempfile
 import warnings
-import pandas as pd
 from contextlib import contextmanager
 from distutils.version import LooseVersion
 import decimal
-from typing import Any, Union
+from typing import Any, Union, TYPE_CHECKING
 
 import pyspark.pandas as ps
 from pyspark.pandas.frame import DataFrame
@@ -57,6 +56,13 @@ except ImportError as e:
     plotly_requirement_message = str(e)
 have_plotly = plotly_requirement_message is None
 
+try:
+    from pyspark.sql.pandas.utils import require_minimum_pandas_version
+
+    require_minimum_pandas_version()
+    import pandas as pd
+except ImportError:
+    pass
 
 __all__ = ["assertPandasOnSparkEqual"]
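The two test-side files above apply one defensive idea in two forms: `test_arrow.py` moves `import numpy as np` from module level into the methods that need it, and `pandasutils.py` wraps its pandas import in `try`/`except ImportError` so that merely importing the module can no longer raise `ModuleNotFoundError`. A condensed sketch of both idioms under assumed names (the module, `to_array`, and `have_pandas` are illustrative, not the actual PySpark sources):

```python
# guarded_imports.py - illustrative module, not the PySpark sources.

# Idiom 1 (as in pandasutils.py): module-level guarded import. Importing
# this module never raises; callers consult the flag before touching `pd`.
try:
    import pandas as pd
except ImportError:
    pd = None
have_pandas = pd is not None


# Idiom 2 (as in test_arrow.py): function-level lazy import. The module
# imports cleanly without numpy; an ImportError can only surface if the
# function that needs numpy is actually called.
def to_array(values):
    import numpy as np

    return np.asarray(values)


if __name__ == "__main__":
    if have_pandas:
        print(pd.DataFrame({"a": [1, 2]}).shape)  # (2, 1)
    else:
        print("pandas unavailable; pandas-dependent path skipped")
    print(to_array([1, 2, 3]))  # raises ModuleNotFoundError without numpy
```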