This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new c5a9bc79071d [SPARK-50761][INFRA] Make `pyspark-pandas` can be tested
alone
c5a9bc79071d is described below
commit c5a9bc79071d03d81047866f43f19f17fb5b5a1f
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Wed Jan 8 13:31:40 2025 +0900
[SPARK-50761][INFRA] Make `pyspark-pandas` can be tested alone
### What changes were proposed in this pull request?
Make the `pyspark-pandas` module no longer depend on the `pyspark` module
### Why are the changes needed?
In the existing framework, `pyspark-pandas` depends on `pyspark`, so:
1. a PS-only PR will trigger the tests of the whole of pyspark (both PS and
non-PS);
2. the daily test `Build / Python-only (master, Python PS with old
dependencies)` cannot test only the PS part
### Does this PR introduce _any_ user-facing change?
No, infra-only.
### How was this patch tested?
PR builder with
```
default: '{"pyspark-pandas": "true"}'
```
https://github.com/zhengruifeng/spark/actions/runs/12651324677/job/35253582924
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #49404 from zhengruifeng/infra_split_py.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.github/workflows/build_and_test.yml | 19 ++++++++++++++-----
1 file changed, 14 insertions(+), 5 deletions(-)
diff --git a/.github/workflows/build_and_test.yml
b/.github/workflows/build_and_test.yml
index 614fdd49d83d..14b9b87dd50a 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -84,10 +84,11 @@ jobs:
id: set-outputs
run: |
if [ -z "${{ inputs.jobs }}" ]; then
- pyspark_modules=`cd dev && python -c "import
sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if
m.name.startswith('pyspark')))"`
+ pyspark_modules=`cd dev && python -c "import
sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if
m.name.startswith('pyspark') and not m.name.startswith('pyspark-pandas')))"`
+ pyspark_pandas_modules=`cd dev && python -c "import
sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if
m.name.startswith('pyspark-pandas')))"`
pyspark=`./dev/is-changed.py -m $pyspark_modules`
+ pandas=`./dev/is-changed.py -m $pyspark_pandas_modules`
if [[ "${{ github.repository }}" != 'apache/spark' ]]; then
- pandas=$pyspark
yarn=`./dev/is-changed.py -m yarn`
kubernetes=`./dev/is-changed.py -m kubernetes`
sparkr=`./dev/is-changed.py -m sparkr`
@@ -139,7 +140,7 @@ jobs:
if: inputs.branch != 'branch-3.5'
env: ${{ fromJSON(inputs.envs) }}
run: |
- if [[ "${{ fromJson(steps.set-outputs.outputs.required).pyspark }}"
== 'true' ]]; then
+ if [[ "${{ fromJson(steps.set-outputs.outputs.required).pyspark }}"
== 'true' || "${{ fromJson(steps.set-outputs.outputs.required).pyspark-pandas
}}" == 'true' ]]; then
if [[ "${{ env.PYSPARK_IMAGE_TO_TEST }}" == "" ]]; then
echo "PYSPARK_IMAGE_TO_TEST is required when pyspark is enabled."
exit 1
@@ -392,6 +393,7 @@ jobs:
needs: precondition
if: >-
fromJson(needs.precondition.outputs.required).pyspark == 'true' ||
+ fromJson(needs.precondition.outputs.required).pyspark-pandas == 'true' ||
fromJson(needs.precondition.outputs.required).lint == 'true' ||
fromJson(needs.precondition.outputs.required).docs == 'true' ||
fromJson(needs.precondition.outputs.required).sparkr == 'true'
@@ -468,7 +470,7 @@ jobs:
# Use the infra image cache to speed up
cache-from:
type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{
inputs.branch }}
- name: Build and push (PySpark with ${{ env.PYSPARK_IMAGE_TO_TEST }})
- if: ${{ inputs.branch != 'branch-3.5' &&
fromJson(needs.precondition.outputs.required).pyspark == 'true' &&
env.PYSPARK_IMAGE_TO_TEST != '' }}
+ if: ${{ inputs.branch != 'branch-3.5' &&
(fromJson(needs.precondition.outputs.required).pyspark == 'true' ||
fromJson(needs.precondition.outputs.required).pyspark-pandas == 'true') &&
env.PYSPARK_IMAGE_TO_TEST != '' }}
id: docker_build_pyspark
env: ${{ fromJSON(inputs.envs) }}
uses: docker/build-push-action@v6
@@ -484,7 +486,7 @@ jobs:
pyspark:
needs: [precondition, infra-image]
# always run if pyspark == 'true', even infra-image is skip (such as
non-master job)
- if: (!cancelled()) &&
fromJson(needs.precondition.outputs.required).pyspark == 'true'
+ if: (!cancelled()) &&
(fromJson(needs.precondition.outputs.required).pyspark == 'true' ||
fromJson(needs.precondition.outputs.required).pyspark-pandas == 'true')
name: "Build modules: ${{ matrix.modules }}"
runs-on: ubuntu-latest
timeout-minutes: 180
@@ -517,6 +519,13 @@ jobs:
- >-
pyspark-pandas-connect-part3
exclude:
+ # Always run if pyspark == 'true', even infra-image is skip (such as
non-master job)
+ # In practice, the build will run in individual PR, but not against
the individual commit
+ # in Apache Spark repository.
+ - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark
!= 'true' && 'pyspark-sql, pyspark-resource, pyspark-testing' }}
+ - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark
!= 'true' && 'pyspark-core, pyspark-errors, pyspark-streaming, pyspark-logger'
}}
+ - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark
!= 'true' && 'pyspark-mllib, pyspark-ml, pyspark-ml-connect' }}
+ - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark
!= 'true' && 'pyspark-connect' }}
# Always run if pyspark-pandas == 'true', even infra-image is skip
(such as non-master job)
# In practice, the build will run in individual PR, but not against
the individual commit
# in Apache Spark repository.
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]