This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new daf260f74e12 [SPARK-47831][PS][CONNECT][TESTS] Run Pandas API on Spark for pyspark-connect package daf260f74e12 is described below commit daf260f74e12fc5e9fad6091f6230e71a9e6c9c1 Author: Hyukjin Kwon <gurwls...@apache.org> AuthorDate: Fri Apr 12 18:39:22 2024 +0900 [SPARK-47831][PS][CONNECT][TESTS] Run Pandas API on Spark for pyspark-connect package ### What changes were proposed in this pull request? This PR proposes to extend the `pyspark-connect` scheduled job to run Pandas API on Spark tests as well. ### Why are the changes needed? In order to make sure the pure Python library works with Pandas API on Spark. ### Does this PR introduce _any_ user-facing change? No, test-only. ### How was this patch tested? https://github.com/HyukjinKwon/spark/actions/runs/8659133747/job/23744381515 ### Was this patch authored or co-authored using generative AI tooling? No Closes #46001 from HyukjinKwon/test-ps-scheduledjob. Authored-by: Hyukjin Kwon <gurwls...@apache.org> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- .github/workflows/build_python_connect.yml | 12 +++++++--- python/packaging/connect/setup.py | 26 ++++++++++++++++++++++ .../tests/connect/test_parity_memory_profiler.py | 3 +++ 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_python_connect.yml b/.github/workflows/build_python_connect.yml index 8deee026131e..6bd1b4526b0d 100644 --- a/.github/workflows/build_python_connect.yml +++ b/.github/workflows/build_python_connect.yml @@ -72,18 +72,24 @@ jobs: python packaging/connect/setup.py sdist cd dist pip install pyspark-connect-*.tar.gz - pip install scikit-learn torch torchvision torcheval + pip install 'six==1.16.0' 'pandas<=2.2.2' scipy 'plotly>=4.8' 'mlflow>=2.8.1' coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2' torch torchvision torcheval deepspeed unittest-xml-reporting - name: Run tests env: - 
SPARK_CONNECT_TESTING_REMOTE: sc://localhost SPARK_TESTING: 1 + SPARK_CONNECT_TESTING_REMOTE: sc://localhost run: | + # Make less noisy + cp conf/log4j2.properties.template conf/log4j2.properties + sed -i 's/rootLogger.level = info/rootLogger.level = warn/g' conf/log4j2.properties # Start a Spark Connect server - ./sbin/start-connect-server.sh --jars `find connector/connect/server/target -name spark-connect*SNAPSHOT.jar` + ./sbin/start-connect-server.sh --driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" --jars `find connector/connect/server/target -name spark-connect*SNAPSHOT.jar` # Remove Py4J and PySpark zipped library to make sure there is no JVM connection rm python/lib/* rm -r python/pyspark + # Several tests related to the catalog require running them sequentially, e.g., writing a table in a listener. ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect,pyspark-ml-connect + # None of the tests are dependent on each other in Pandas API on Spark so run them in parallel + ./python/run-tests --parallelism=4 --python-executables=python3 --modules pyspark-pandas-connect-part0,pyspark-pandas-connect-part1,pyspark-pandas-connect-part2,pyspark-pandas-connect-part3 - name: Upload test results to report if: always() uses: actions/upload-artifact@v4 diff --git a/python/packaging/connect/setup.py b/python/packaging/connect/setup.py index 419ed36b4236..fe1e7486faa9 100755 --- a/python/packaging/connect/setup.py +++ b/python/packaging/connect/setup.py @@ -78,6 +78,32 @@ if "SPARK_TESTING" in os.environ: "pyspark.sql.tests.pandas", "pyspark.sql.tests.streaming", "pyspark.ml.tests.connect", + "pyspark.pandas.tests", + "pyspark.pandas.tests.computation", + "pyspark.pandas.tests.data_type_ops", + "pyspark.pandas.tests.diff_frames_ops", + "pyspark.pandas.tests.frame", + "pyspark.pandas.tests.groupby", + "pyspark.pandas.tests.indexes", + "pyspark.pandas.tests.io", + "pyspark.pandas.tests.plot", + 
"pyspark.pandas.tests.resample", + "pyspark.pandas.tests.reshape", + "pyspark.pandas.tests.series", + "pyspark.pandas.tests.window", + "pyspark.pandas.tests.connect", + "pyspark.pandas.tests.connect.computation", + "pyspark.pandas.tests.connect.data_type_ops", + "pyspark.pandas.tests.connect.diff_frames_ops", + "pyspark.pandas.tests.connect.frame", + "pyspark.pandas.tests.connect.groupby", + "pyspark.pandas.tests.connect.indexes", + "pyspark.pandas.tests.connect.io", + "pyspark.pandas.tests.connect.plot", + "pyspark.pandas.tests.connect.resample", + "pyspark.pandas.tests.connect.reshape", + "pyspark.pandas.tests.connect.series", + "pyspark.pandas.tests.connect.window", ] try: diff --git a/python/pyspark/sql/tests/connect/test_parity_memory_profiler.py b/python/pyspark/sql/tests/connect/test_parity_memory_profiler.py index 513e49a144e5..f95e0bfbf8d6 100644 --- a/python/pyspark/sql/tests/connect/test_parity_memory_profiler.py +++ b/python/pyspark/sql/tests/connect/test_parity_memory_profiler.py @@ -18,10 +18,13 @@ import inspect import os import unittest +from pyspark.util import is_remote_only from pyspark.tests.test_memory_profiler import MemoryProfiler2TestsMixin, _do_computation from pyspark.testing.connectutils import ReusedConnectTestCase +# TODO(SPARK-47830): Re-enable MemoryProfilerParityTests for pyspark-connect +@unittest.skipIf(is_remote_only(), "Skipped for now") class MemoryProfilerParityTests(MemoryProfiler2TestsMixin, ReusedConnectTestCase): def setUp(self) -> None: super().setUp() --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org