This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new c85d41c558b6 [SPARK-46200][INFRA] re-org the testing dockerfile
c85d41c558b6 is described below

commit c85d41c558b671f815fca980c614a22ca267c28b
Author: Ruifeng Zheng <ruife...@apache.org>
AuthorDate: Fri Dec 1 12:26:32 2023 -0800

    [SPARK-46200][INFRA] re-org the testing dockerfile
    
    ### What changes were proposed in this pull request?
    Reorganize the testing Dockerfile:
    1. Move the R package installation before the Python part.
    2. Combine the pip install commands into a single command to make sure
       there are no conflicts (except for the torch-related packages, since
       `--index-url` cannot be applied to only a subset of packages in a
       single pip command). With separate commands such as:
    ```
    RUN python3.9 -m pip install pkg-a==x
    RUN python3.9 -m pip install pkg-b==y
    ```
    installing pkg-b can potentially break the `pkg-a==x` pin by pulling in
    a different version of pkg-a.
    
    ### Why are the changes needed?
    To make sure the pip-installed packages do not conflict with each other.
    
    ### Does this PR introduce _any_ user-facing change?
    no, test-only
    
    ### How was this patch tested?
    ci
    
    ### Was this patch authored or co-authored using generative AI tooling?
    no
    
    Closes #44107 from zhengruifeng/infra_docker_refactor.
    
    Authored-by: Ruifeng Zheng <ruife...@apache.org>
    Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
 dev/infra/Dockerfile | 50 +++++++++++++++++++++++---------------------------
 1 file changed, 23 insertions(+), 27 deletions(-)

diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile
index f0ca4a47d698..3e449bcb6c82 100644
--- a/dev/infra/Dockerfile
+++ b/dev/infra/Dockerfile
@@ -63,16 +63,6 @@ RUN apt-get update && apt-get install -y \
     zlib1g-dev \
     && rm -rf /var/lib/apt/lists/*
 
-RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9
-
-RUN add-apt-repository ppa:pypy/ppa
-
-RUN mkdir -p /usr/local/pypy/pypy3.8 && \
-    curl -sqL 
https://downloads.python.org/pypy/pypy3.8-v7.3.11-linux64.tar.bz2 | tar xjf - 
-C /usr/local/pypy/pypy3.8 --strip-components=1 && \
-    ln -sf /usr/local/pypy/pypy3.8/bin/pypy /usr/local/bin/pypy3.8 && \
-    ln -sf /usr/local/pypy/pypy3.8/bin/pypy /usr/local/bin/pypy3
-
-RUN curl -sS https://bootstrap.pypa.io/get-pip.py | pypy3
 
 RUN echo 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/' >> 
/etc/apt/sources.list
 RUN gpg --keyserver hkps://keyserver.ubuntu.com --recv-key 
E298A3A825C0D65DFD57CBB651716619E084DAB9
@@ -92,17 +82,28 @@ RUN Rscript -e "devtools::install_version('preferably', 
version='0.4', repos='ht
 # See more in SPARK-39735
 ENV R_LIBS_SITE 
"/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library"
 
+
+RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9
+
+
+RUN add-apt-repository ppa:pypy/ppa
+RUN mkdir -p /usr/local/pypy/pypy3.8 && \
+    curl -sqL 
https://downloads.python.org/pypy/pypy3.8-v7.3.11-linux64.tar.bz2 | tar xjf - 
-C /usr/local/pypy/pypy3.8 --strip-components=1 && \
+    ln -sf /usr/local/pypy/pypy3.8/bin/pypy /usr/local/bin/pypy3.8 && \
+    ln -sf /usr/local/pypy/pypy3.8/bin/pypy /usr/local/bin/pypy3
+RUN curl -sS https://bootstrap.pypa.io/get-pip.py | pypy3
 RUN pypy3 -m pip install numpy 'six==1.16.0' 'pandas<=2.1.3' scipy coverage 
matplotlib
-RUN python3.9 -m pip install numpy 'pyarrow>=14.0.0' 'six==1.16.0' 
'pandas<=2.1.3' scipy unittest-xml-reporting plotly>=4.8 'mlflow>=2.8.1' 
coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2'
 
-# Add Python deps for Spark Connect.
-RUN python3.9 -m pip install 'grpcio==1.59.3' 'grpcio-status==1.59.3' 
'protobuf==4.25.1' 'googleapis-common-protos==1.56.4'
 
-# Add torch as a testing dependency for TorchDistributor
+ARG BASIC_PIP_PKGS="numpy pyarrow>=14.0.0 six==1.16.0 pandas<=2.1.3 scipy 
unittest-xml-reporting plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl 
memory-profiler>=0.61.0 scikit-learn>=1.3.2"
+# Python deps for Spark Connect
+ARG CONNECT_PIP_PKGS="grpcio==1.59.3 grpcio-status==1.59.3 protobuf==4.25.1 
googleapis-common-protos==1.56.4"
+
+
+RUN python3.9 -m pip install $BASIC_PIP_PKGS $CONNECT_PIP_PKGS
+# Add torch as a testing dependency for TorchDistributor and 
DeepspeedTorchDistributor
 RUN python3.9 -m pip install 'torch<=2.0.1' torchvision --index-url 
https://download.pytorch.org/whl/cpu
-RUN python3.9 -m pip install torcheval
-# Add Deepspeed as a testing dependency for DeepspeedTorchDistributor
-RUN python3.9 -m pip install deepspeed
+RUN python3.9 -m pip install deepspeed torcheval
 
 # Install Python 3.10 at the last stage to avoid breaking Python 3.9
 RUN add-apt-repository ppa:deadsnakes/ppa
@@ -110,11 +111,9 @@ RUN apt-get update && apt-get install -y \
     python3.10 python3.10-distutils \
     && rm -rf /var/lib/apt/lists/*
 RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
-RUN python3.10 -m pip install numpy 'pyarrow>=14.0.0' 'six==1.16.0' 
'pandas<=2.1.3' scipy unittest-xml-reporting plotly>=4.8 'mlflow>=2.8.1' 
coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2'
-RUN python3.10 -m pip install 'grpcio==1.59.3' 'grpcio-status==1.59.3' 
'protobuf==4.25.1' 'googleapis-common-protos==1.56.4'
+RUN python3.10 -m pip install $BASIC_PIP_PKGS $CONNECT_PIP_PKGS
 RUN python3.10 -m pip install 'torch<=2.0.1' torchvision --index-url 
https://download.pytorch.org/whl/cpu
-RUN python3.10 -m pip install torcheval
-RUN python3.10 -m pip install deepspeed
+RUN python3.10 -m pip install deepspeed torcheval
 
 # Install Python 3.11 at the last stage to avoid breaking the existing Python 
installations
 RUN add-apt-repository ppa:deadsnakes/ppa
@@ -122,11 +121,9 @@ RUN apt-get update && apt-get install -y \
     python3.11 python3.11-distutils \
     && rm -rf /var/lib/apt/lists/*
 RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
-RUN python3.11 -m pip install numpy 'pyarrow>=14.0.0' 'six==1.16.0' 
'pandas<=2.1.3' scipy unittest-xml-reporting plotly>=4.8 'mlflow>=2.8.1' 
coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2'
-RUN python3.11 -m pip install 'grpcio==1.59.3' 'grpcio-status==1.59.3' 
'protobuf==4.25.1' 'googleapis-common-protos==1.56.4'
+RUN python3.11 -m pip install $BASIC_PIP_PKGS $CONNECT_PIP_PKGS
 RUN python3.11 -m pip install 'torch<=2.0.1' torchvision --index-url 
https://download.pytorch.org/whl/cpu
-RUN python3.11 -m pip install torcheval
-RUN python3.11 -m pip install deepspeed
+RUN python3.11 -m pip install deepspeed torcheval
 
 # Install Python 3.12 at the last stage to avoid breaking the existing Python 
installations
 RUN add-apt-repository ppa:deadsnakes/ppa
@@ -134,8 +131,7 @@ RUN apt-get update && apt-get install -y \
     python3.12 python3.12-distutils \
     && rm -rf /var/lib/apt/lists/*
 RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12
-RUN python3.12 -m pip install numpy 'pyarrow>=14.0.0' 'six==1.16.0' 
'pandas<=2.1.3' scipy unittest-xml-reporting plotly>=4.8 'mlflow>=2.8.1' 
coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2'
-RUN python3.12 -m pip install 'grpcio==1.59.3' 'grpcio-status==1.59.3' 
'protobuf==4.25.1' 'googleapis-common-protos==1.56.4'
+RUN python3.12 -m pip install $BASIC_PIP_PKGS $CONNECT_PIP_PKGS
 # TODO(SPARK-46078) Use official one instead of nightly build when it's ready
 RUN python3.12 -m pip install --pre torch --index-url 
https://download.pytorch.org/whl/nightly/cpu
 RUN python3.12 -m pip install torchvision --index-url 
https://download.pytorch.org/whl/cpu


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to