This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 45da6f66d155 [SPARK-50477][INFRA] Add a separate docker file for 
python 3.9 daily build
45da6f66d155 is described below

commit 45da6f66d155fff4024840f804d80f335e66360c
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Wed Dec 4 16:41:21 2024 +0800

    [SPARK-50477][INFRA] Add a separate docker file for python 3.9 daily build
    
    ### What changes were proposed in this pull request?
    Add a separate docker file for python 3.9 daily build
    
    ### Why are the changes needed?
    to isolate the environments
    
    ### Does this PR introduce _any_ user-facing change?
    no, infra-only
    
    ### How was this patch tested?
    CI; the second commit and the 4th commit tested this PR against the new image
    
    
https://github.com/zhengruifeng/spark/actions/runs/12135050296/job/33835846375
    
    
https://github.com/zhengruifeng/spark/actions/runs/12140138335/job/33850700922
    
    ### Was this patch authored or co-authored using generative AI tooling?
    no
    
    Closes #49042 from zhengruifeng/infra_py_images.
    
    Authored-by: Ruifeng Zheng <[email protected]>
    Signed-off-by: Ruifeng Zheng <[email protected]>
---
 .github/workflows/build_and_test.yml           | 36 ++++++++++-
 .github/workflows/build_infra_images_cache.yml | 14 +++++
 .github/workflows/build_python_3.9.yml         |  1 +
 dev/spark-test-image/python-309/Dockerfile     | 82 ++++++++++++++++++++++++++
 4 files changed, 132 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml 
b/.github/workflows/build_and_test.yml
index 3117872e2168..cf49316fafbb 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -55,6 +55,7 @@ jobs:
     runs-on: ubuntu-latest
     env:
       GITHUB_PREV_SHA: ${{ github.event.before }}
+      PYSPARK_IMAGE_TO_TEST: ''
     outputs:
       required: ${{ steps.set-outputs.outputs.required }}
       image_url: ${{ steps.infra-image-outputs.outputs.image_url }}
@@ -64,6 +65,8 @@ jobs:
       image_lint_url_link: ${{ 
steps.infra-image-link.outputs.image_lint_url_link }}
       image_sparkr_url: ${{ 
steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }}
       image_sparkr_url_link: ${{ 
steps.infra-image-link.outputs.image_sparkr_url_link }}
+      image_pyspark_url: ${{ 
steps.infra-image-pyspark-outputs.outputs.image_pyspark_url }}
+      image_pyspark_url_link: ${{ 
steps.infra-image-link.outputs.image_pyspark_url_link }}
     steps:
     - name: Checkout Spark repository
       uses: actions/checkout@v4
@@ -164,8 +167,19 @@ jobs:
         IMG_NAME="apache-spark-ci-image-sparkr:${{ inputs.branch }}-${{ 
github.run_id }}"
         IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
         echo "image_sparkr_url=$IMG_URL" >> $GITHUB_OUTPUT
+    - name: Generate infra image URL (PySpark ${{ env.PYSPARK_IMAGE_TO_TEST }})
+      id: infra-image-pyspark-outputs
+      if: ${{ env.PYSPARK_IMAGE_TO_TEST }}
+      env: ${{ fromJSON(inputs.envs) }}
+      run: |
+        # Convert to lowercase to meet Docker repo name requirement
+        REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' 
'[:lower:]')
+        IMG_NAME="apache-spark-ci-image-pyspark-${{ env.PYSPARK_IMAGE_TO_TEST 
}}:${{ inputs.branch }}-${{ github.run_id }}"
+        IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
+        echo "image_pyspark_url=$IMG_URL" >> $GITHUB_OUTPUT
     - name: Link the docker images
       id: infra-image-link
+      env: ${{ fromJSON(inputs.envs) }}
       run: |
         # Set the image URL for job "docs"
         # Should delete the link and directly use image_docs_url after SPARK 
3.x EOL
@@ -173,10 +187,16 @@ jobs:
           echo "image_docs_url_link=${{ 
steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
           echo "image_lint_url_link=${{ 
steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
           echo "image_sparkr_url_link=${{ 
steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
+          echo "image_pyspark_url_link=${{ 
steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
         else
           echo "image_docs_url_link=${{ 
steps.infra-image-docs-outputs.outputs.image_docs_url }}" >> $GITHUB_OUTPUT
           echo "image_lint_url_link=${{ 
steps.infra-image-lint-outputs.outputs.image_lint_url }}" >> $GITHUB_OUTPUT
           echo "image_sparkr_url_link=${{ 
steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }}" >> $GITHUB_OUTPUT
+          if [[ "${{ env.PYSPARK_IMAGE_TO_TEST }}" != "" ]]; then
+            echo "image_pyspark_url_link=${{ 
steps.infra-image-pyspark-outputs.outputs.image_pyspark_url }}" >> 
$GITHUB_OUTPUT
+          else
+            echo "image_pyspark_url_link=${{ 
steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
+          fi
         fi
 
   # Build: build Spark and run the tests for specified modules.
@@ -360,6 +380,8 @@ jobs:
     runs-on: ubuntu-latest
     permissions:
       packages: write
+    env:
+      PYSPARK_IMAGE_TO_TEST: ''
     steps:
       - name: Login to GitHub Container Registry
         uses: docker/login-action@v3
@@ -428,6 +450,18 @@ jobs:
             ${{ needs.precondition.outputs.image_sparkr_url }}
           # Use the infra image cache to speed up
           cache-from: 
type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{
 inputs.branch }}
+      - name: Build and push (PySpark ${{ env.PYSPARK_IMAGE_TO_TEST }})
+        if: ${{ env.PYSPARK_IMAGE_TO_TEST }}
+        id: docker_build_pyspark
+        env: ${{ fromJSON(inputs.envs) }}
+        uses: docker/build-push-action@v6
+        with:
+          context: ./dev/spark-test-image/${{ env.PYSPARK_IMAGE_TO_TEST }}/
+          push: true
+          tags: |
+            ${{ needs.precondition.outputs.image_pyspark_url }}
+          # Use the infra image cache to speed up
+          cache-from: 
type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-${{
 env.PYSPARK_IMAGE_TO_TEST }}-cache:${{ inputs.branch }}
 
 
   pyspark:
@@ -438,7 +472,7 @@ jobs:
     runs-on: ubuntu-latest
     timeout-minutes: 180
     container:
-      image: ${{ needs.precondition.outputs.image_url }}
+      image: ${{ needs.precondition.outputs.image_pyspark_url_link }}
     strategy:
       fail-fast: false
       matrix:
diff --git a/.github/workflows/build_infra_images_cache.yml 
b/.github/workflows/build_infra_images_cache.yml
index a6beacedeebd..3d5a8306aca8 100644
--- a/.github/workflows/build_infra_images_cache.yml
+++ b/.github/workflows/build_infra_images_cache.yml
@@ -30,6 +30,7 @@ on:
     - 'dev/spark-test-image/docs/Dockerfile'
     - 'dev/spark-test-image/lint/Dockerfile'
     - 'dev/spark-test-image/sparkr/Dockerfile'
+    - 'dev/spark-test-image/python-309/Dockerfile'
     - '.github/workflows/build_infra_images_cache.yml'
   # Create infra image when cutting down branches/tags
   create:
@@ -102,3 +103,16 @@ jobs:
       - name: Image digest (SparkR)
         if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
         run: echo ${{ steps.docker_build_sparkr.outputs.digest }}
+      - name: Build and push (PySpark with Python 3.9)
+        if: hashFiles('dev/spark-test-image/python-309/Dockerfile') != ''
+        id: docker_build_pyspark_python_309
+        uses: docker/build-push-action@v6
+        with:
+          context: ./dev/spark-test-image/python-309/
+          push: true
+          tags: 
ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-309-cache:${{
 github.ref_name }}-static
+          cache-from: 
type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-309-cache:${{
 github.ref_name }}
+          cache-to: 
type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-309-cache:${{
 github.ref_name }},mode=max
+      - name: Image digest (PySpark with Python 3.9)
+        if: hashFiles('dev/spark-test-image/python-309/Dockerfile') != ''
+        run: echo ${{ steps.docker_build_pyspark_python_309.outputs.digest }}
diff --git a/.github/workflows/build_python_3.9.yml 
b/.github/workflows/build_python_3.9.yml
index b2401fcf2aa1..744e18cc8db3 100644
--- a/.github/workflows/build_python_3.9.yml
+++ b/.github/workflows/build_python_3.9.yml
@@ -36,6 +36,7 @@ jobs:
       hadoop: hadoop3
       envs: >-
         {
+          "PYSPARK_IMAGE_TO_TEST": "python-309",
           "PYTHON_TO_TEST": "python3.9"
         }
       jobs: >-
diff --git a/dev/spark-test-image/python-309/Dockerfile 
b/dev/spark-test-image/python-309/Dockerfile
new file mode 100644
index 000000000000..dbab99c1441b
--- /dev/null
+++ b/dev/spark-test-image/python-309/Dockerfile
@@ -0,0 +1,82 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Image for building and testing Spark branches. Based on Ubuntu 22.04.
+# See also in https://hub.docker.com/_/ubuntu
+FROM ubuntu:jammy-20240911.1
+LABEL org.opencontainers.image.authors="Apache Spark project 
<[email protected]>"
+LABEL org.opencontainers.image.licenses="Apache-2.0"
+LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark 
with Python 3.09"
+# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
+LABEL org.opencontainers.image.version=""
+
+ENV FULL_REFRESH_DATE 20241119
+
+ENV DEBIAN_FRONTEND noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN true
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    ca-certificates \
+    curl \
+    gfortran \
+    git \
+    gnupg \
+    libcurl4-openssl-dev \
+    libfontconfig1-dev \
+    libfreetype6-dev \
+    libfribidi-dev \
+    libgit2-dev \
+    libharfbuzz-dev \
+    libjpeg-dev \
+    liblapack-dev \
+    libopenblas-dev \
+    libpng-dev \
+    libpython3-dev \
+    libssl-dev \
+    libtiff5-dev \
+    libxml2-dev \
+    openjdk-17-jdk-headless \
+    pandoc \
+    pkg-config \
+    qpdf \
+    software-properties-common \
+    wget \
+    zlib1g-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python 3.9
+RUN add-apt-repository ppa:deadsnakes/ppa
+RUN apt-get update && apt-get install -y \
+    python3.9 python3.9-distutils \
+    && rm -rf /var/lib/apt/lists/*
+
+ARG BASIC_PIP_PKGS="numpy pyarrow>=18.0.0 six==1.16.0 pandas==2.2.3 scipy 
plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 
scikit-learn>=1.3.2"
+# Python deps for Spark Connect
+ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 protobuf==5.28.3 
googleapis-common-protos==1.65.0 graphviz==0.20.3"
+
+# Install Python 3.9
+RUN add-apt-repository ppa:deadsnakes/ppa
+RUN apt-get update && apt-get install -y \
+    python3.9 python3.9-distutils \
+    && rm -rf /var/lib/apt/lists/*
+RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9
+RUN python3.9 -m pip install --ignore-installed blinker>=1.6.2 # mlflow needs 
this
+RUN python3.9 -m pip install --force $BASIC_PIP_PKGS unittest-xml-reporting 
$CONNECT_PIP_PKGS && \
+    python3.9 -m pip install torch torchvision --index-url 
https://download.pytorch.org/whl/cpu && \
+    python3.9 -m pip install torcheval && \
+    python3.9 -m pip cache purge


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to