This is an automated email from the ASF dual-hosted git repository.
yikun pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark-docker.git
The following commit(s) were added to refs/heads/master by this push:
new 7f83637 [SPARK-43365] Refactor Dockerfile and workflow based on base image
7f83637 is described below
commit 7f836378d8bfe453b7e1dba304b54cb1cfacda49
Author: Yikun Jiang <[email protected]>
AuthorDate: Sat May 6 09:15:41 2023 +0800
[SPARK-43365] Refactor Dockerfile and workflow based on base image
### What changes were proposed in this pull request?
This PR refactors the Dockerfiles and the workflow to build on a shared base image, saving space by sharing layers between images (one image is built FROM another).
After this PR:
- The Spark / PySpark / SparkR related files are extracted into the base image (see the build sketch below).
- The PySpark / SparkR dependencies are installed in the PySpark / SparkR images.
- A base image build step is added to the workflow.
- The templates are updated; running `./add-dockerfiles.sh 3.4.0` regenerates the 3.4.0 Dockerfiles accordingly.
- This PR does not touch the 3.3.x Dockerfiles, to keep the diff focused; the 3.3.x changes will follow in a separate PR once all comments for 3.4.0 are addressed.
[1] https://github.com/docker-library/official-images/pull/13089#issuecomment-1533540388
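As a rough sketch of the resulting layering (the image tags and build contexts below simply follow the defaults in the templates and the workflow; the commands are illustrative, not the exact CI invocation):

    # Build the shared base image first
    docker build -t spark:3.4.0-scala2.12-java11-ubuntu 3.4.0/scala2.12-java11-ubuntu

    # Derived images then reuse its layers through the BASE_IMAGE build argument
    docker build \
      --build-arg BASE_IMAGE=spark:3.4.0-scala2.12-java11-ubuntu \
      -t spark:3.4.0-scala2.12-java11-python3-ubuntu \
      3.4.0/scala2.12-java11-python3-ubuntu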
### Why are the changes needed?
To address the DOI (Docker Official Images) review comments [1], and to save space by sharing layers between images (one image is built from another).
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
CI passed.
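For a local sanity check, one possible (hedged, illustrative) sequence is:

    # Regenerate the 3.4.0 Dockerfiles and entrypoint.sh from the templates
    ./add-dockerfiles.sh 3.4.0

    # The derived Dockerfiles should now start from the shared base image
    grep -n "BASE_IMAGE" 3.4.0/*/Dockerfile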
Closes #36 from Yikun/official.
Authored-by: Yikun Jiang <[email protected]>
Signed-off-by: Yikun Jiang <[email protected]>
---
.github/workflows/main.yml | 20 ++++
3.4.0/scala2.12-java11-python3-r-ubuntu/Dockerfile | 63 +-----------
.../entrypoint.sh | 114 ---------------------
3.4.0/scala2.12-java11-python3-ubuntu/Dockerfile | 63 +-----------
.../scala2.12-java11-python3-ubuntu/entrypoint.sh | 114 ---------------------
3.4.0/scala2.12-java11-r-ubuntu/Dockerfile | 60 +----------
3.4.0/scala2.12-java11-r-ubuntu/entrypoint.sh | 107 -------------------
3.4.0/scala2.12-java11-ubuntu/Dockerfile | 4 +
3.4.0/scala2.12-java11-ubuntu/entrypoint.sh | 7 ++
Dockerfile.template | 15 ---
add-dockerfiles.sh | 9 +-
entrypoint.sh.template | 2 -
add-dockerfiles.sh => r-python.template | 54 +++-------
tools/template.py | 16 +++
14 files changed, 77 insertions(+), 571 deletions(-)
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index fd37990..c1d0c56 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -91,10 +91,12 @@ jobs:
scala) SUFFIX=ubuntu
;;
esac
+ BASE_IMAGE_TAG=${{ inputs.spark }}-scala${{ inputs.scala }}-java${{ inputs.java }}-ubuntu
TAG=scala${{ inputs.scala }}-java${{ inputs.java }}-$SUFFIX
IMAGE_NAME=spark
IMAGE_PATH=${{ inputs.spark }}/$TAG
+ BASE_IMAGE_PATH=${{ inputs.spark }}/scala${{ inputs.scala }}-java${{ inputs.java }}-ubuntu
if [ "${{ inputs.build }}" == "true" ]; then
# Use the local registry to build and test
REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
@@ -105,6 +107,7 @@ jobs:
TEST_REPO=${{ inputs.repository }}
UNIQUE_IMAGE_TAG=${{ inputs.image-tag }}
fi
+ BASE_IMAGE_URL=$TEST_REPO/$IMAGE_NAME:$BASE_IMAGE_TAG
IMAGE_URL=$TEST_REPO/$IMAGE_NAME:$UNIQUE_IMAGE_TAG
PUBLISH_REPO=${{ inputs.repository }}
@@ -116,8 +119,12 @@ jobs:
echo "TEST_REPO=${TEST_REPO}" >> $GITHUB_ENV
# Image name: spark
echo "IMAGE_NAME=${IMAGE_NAME}" >> $GITHUB_ENV
+ # Base Image Dockerfile: 3.3.0/scala2.12-java11-ubuntu
+ echo "BASE_IMAGE_PATH=${BASE_IMAGE_PATH}" >> $GITHUB_ENV
# Image dockerfile path: 3.3.0/scala2.12-java11-python3-ubuntu
echo "IMAGE_PATH=${IMAGE_PATH}" >> $GITHUB_ENV
+ # Base Image URL: spark:3.3.0-scala2.12-java11-ubuntu
+ echo "BASE_IMAGE_URL=${BASE_IMAGE_URL}" >> $GITHUB_ENV
# Image URL: ghcr.io/apache/spark-docker/spark:3.3.0-scala2.12-java11-python3-ubuntu
echo "IMAGE_URL=${IMAGE_URL}" >> $GITHUB_ENV
@@ -132,6 +139,9 @@ jobs:
echo "IMAGE_PATH: "${IMAGE_PATH}
echo "IMAGE_URL: "${IMAGE_URL}
+ echo "BASE_IMAGE_PATH: "${BASE_IMAGE_PATH}
+ echo "BASE_IMAGE_URL: "${BASE_IMAGE_URL}
+
echo "PUBLISH_REPO:"${PUBLISH_REPO}
echo "PUBLISH_IMAGE_URL:"${PUBLISH_IMAGE_URL}
@@ -146,10 +156,20 @@ jobs:
# This is required by the local registry
driver-opts: network=host
+ - name: Build - Build the base image
+ if: ${{ inputs.build }}
+ uses: docker/build-push-action@v3
+ with:
+ context: ${{ env.BASE_IMAGE_PATH }}
+ tags: ${{ env.BASE_IMAGE_URL }}
+ platforms: linux/amd64,linux/arm64
+ push: true
+
- name: Build - Build and push test image
if: ${{ inputs.build }}
uses: docker/build-push-action@v3
with:
+ build-args: BASE_IMAGE=${{ env.BASE_IMAGE_URL }}
context: ${{ env.IMAGE_PATH }}
tags: ${{ env.IMAGE_URL }}
platforms: linux/amd64,linux/arm64
diff --git a/3.4.0/scala2.12-java11-python3-r-ubuntu/Dockerfile b/3.4.0/scala2.12-java11-python3-r-ubuntu/Dockerfile
index 4f62e8d..86337c5 100644
--- a/3.4.0/scala2.12-java11-python3-r-ubuntu/Dockerfile
+++ b/3.4.0/scala2.12-java11-python3-r-ubuntu/Dockerfile
@@ -14,73 +14,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
-FROM eclipse-temurin:11-jre-focal
-
-ARG spark_uid=185
-
-RUN groupadd --system --gid=${spark_uid} spark && \
- useradd --system --uid=${spark_uid} --gid=spark spark
+ARG BASE_IMAGE=spark:3.4.0-scala2.12-java11-ubuntu
+FROM $BASE_IMAGE
RUN set -ex && \
apt-get update && \
- ln -s /lib /lib64 && \
- apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \
apt install -y python3 python3-pip && \
apt install -y r-base r-base-dev && \
- mkdir -p /opt/spark && \
- mkdir /opt/spark/python && \
- mkdir -p /opt/spark/examples && \
- mkdir -p /opt/spark/work-dir && \
- touch /opt/spark/RELEASE && \
- chown -R spark:spark /opt/spark && \
- rm /bin/sh && \
- ln -sv /bin/bash /bin/sh && \
- echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \
- chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
-# Install Apache Spark
-# https://downloads.apache.org/spark/KEYS
-ENV SPARK_TGZ_URL=https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz \
-    SPARK_TGZ_ASC_URL=https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz.asc \
- GPG_KEY=CC68B3D16FE33A766705160BA7E57908C7A4E1B1
-
-RUN set -ex; \
- export SPARK_TMP="$(mktemp -d)"; \
- cd $SPARK_TMP; \
- wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \
- wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \
- export GNUPGHOME="$(mktemp -d)"; \
- gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \
- gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \
- gpg --batch --verify spark.tgz.asc spark.tgz; \
- gpgconf --kill all; \
- rm -rf "$GNUPGHOME" spark.tgz.asc; \
- \
- tar -xf spark.tgz --strip-components=1; \
- chown -R spark:spark .; \
- mv jars /opt/spark/; \
- mv bin /opt/spark/; \
- mv sbin /opt/spark/; \
- mv kubernetes/dockerfiles/spark/decom.sh /opt/; \
- mv examples /opt/spark/; \
- mv kubernetes/tests /opt/spark/; \
- mv data /opt/spark/; \
- mv python/pyspark /opt/spark/python/pyspark/; \
- mv python/lib /opt/spark/python/lib/; \
- mv R /opt/spark/; \
- cd ..; \
- rm -rf "$SPARK_TMP";
-
-COPY entrypoint.sh /opt/
-
-ENV SPARK_HOME /opt/spark
ENV R_HOME /usr/lib/R
-
-WORKDIR /opt/spark/work-dir
-RUN chmod g+w /opt/spark/work-dir
-RUN chmod a+x /opt/decom.sh
-RUN chmod a+x /opt/entrypoint.sh
-
-ENTRYPOINT [ "/opt/entrypoint.sh" ]
diff --git a/3.4.0/scala2.12-java11-python3-r-ubuntu/entrypoint.sh b/3.4.0/scala2.12-java11-python3-r-ubuntu/entrypoint.sh
deleted file mode 100644
index 4bb1557..0000000
--- a/3.4.0/scala2.12-java11-python3-r-ubuntu/entrypoint.sh
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# Check whether there is a passwd entry for the container UID
-myuid=$(id -u)
-mygid=$(id -g)
-# turn off -e for getent because it will return error code in anonymous uid case
-set +e
-uidentry=$(getent passwd $myuid)
-set -e
-
-# If there is no passwd entry for the container UID, attempt to create one
-if [ -z "$uidentry" ] ; then
- if [ -w /etc/passwd ] ; then
- echo "$myuid:x:$myuid:$mygid:${SPARK_USER_NAME:-anonymous
uid}:$SPARK_HOME:/bin/false" >> /etc/passwd
- else
- echo "Container ENTRYPOINT failed to add passwd entry for anonymous
UID"
- fi
-fi
-
-if [ -z "$JAVA_HOME" ]; then
- JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}')
-fi
-
-SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*"
-env | grep SPARK_JAVA_OPT_ | sort -t_ -k4 -n | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt
-readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt
-
-if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then
- SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH"
-fi
-
-if ! [ -z ${PYSPARK_PYTHON+x} ]; then
- export PYSPARK_PYTHON
-fi
-if ! [ -z ${PYSPARK_DRIVER_PYTHON+x} ]; then
- export PYSPARK_DRIVER_PYTHON
-fi
-
-# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor.
-# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s.
-if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then
- export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)"
-fi
-
-if ! [ -z ${HADOOP_CONF_DIR+x} ]; then
- SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH";
-fi
-
-if ! [ -z ${SPARK_CONF_DIR+x} ]; then
- SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH";
-elif ! [ -z ${SPARK_HOME+x} ]; then
- SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH";
-fi
-
-case "$1" in
- driver)
- shift 1
- CMD=(
- "$SPARK_HOME/bin/spark-submit"
- --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS"
- --deploy-mode client
- "$@"
- )
- ;;
- executor)
- shift 1
- CMD=(
- ${JAVA_HOME}/bin/java
- "${SPARK_EXECUTOR_JAVA_OPTS[@]}"
- -Xms$SPARK_EXECUTOR_MEMORY
- -Xmx$SPARK_EXECUTOR_MEMORY
- -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH"
- org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend
- --driver-url $SPARK_DRIVER_URL
- --executor-id $SPARK_EXECUTOR_ID
- --cores $SPARK_EXECUTOR_CORES
- --app-id $SPARK_APPLICATION_ID
- --hostname $SPARK_EXECUTOR_POD_IP
- --resourceProfileId $SPARK_RESOURCE_PROFILE_ID
- --podName $SPARK_EXECUTOR_POD_NAME
- )
- ;;
-
- *)
- # Non-spark-on-k8s command provided, proceeding in pass-through mode...
- CMD=("$@")
- ;;
-esac
-
-# Switch to spark if no USER specified (root by default) otherwise use USER directly
-switch_spark_if_root() {
- if [ $(id -u) -eq 0 ]; then
- echo gosu spark
- fi
-}
-
-# Execute the container CMD under tini for better hygiene
-exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}"
diff --git a/3.4.0/scala2.12-java11-python3-ubuntu/Dockerfile b/3.4.0/scala2.12-java11-python3-ubuntu/Dockerfile
index 2be0cb4..540805f 100644
--- a/3.4.0/scala2.12-java11-python3-ubuntu/Dockerfile
+++ b/3.4.0/scala2.12-java11-python3-ubuntu/Dockerfile
@@ -14,70 +14,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
-FROM eclipse-temurin:11-jre-focal
-
-ARG spark_uid=185
-
-RUN groupadd --system --gid=${spark_uid} spark && \
- useradd --system --uid=${spark_uid} --gid=spark spark
+ARG BASE_IMAGE=spark:3.4.0-scala2.12-java11-ubuntu
+FROM $BASE_IMAGE
RUN set -ex && \
apt-get update && \
- ln -s /lib /lib64 && \
- apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \
apt install -y python3 python3-pip && \
- mkdir -p /opt/spark && \
- mkdir /opt/spark/python && \
- mkdir -p /opt/spark/examples && \
- mkdir -p /opt/spark/work-dir && \
- touch /opt/spark/RELEASE && \
- chown -R spark:spark /opt/spark && \
- rm /bin/sh && \
- ln -sv /bin/bash /bin/sh && \
- echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \
- chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
-
-# Install Apache Spark
-# https://downloads.apache.org/spark/KEYS
-ENV SPARK_TGZ_URL=https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz \
-    SPARK_TGZ_ASC_URL=https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz.asc \
- GPG_KEY=CC68B3D16FE33A766705160BA7E57908C7A4E1B1
-
-RUN set -ex; \
- export SPARK_TMP="$(mktemp -d)"; \
- cd $SPARK_TMP; \
- wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \
- wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \
- export GNUPGHOME="$(mktemp -d)"; \
- gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \
- gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \
- gpg --batch --verify spark.tgz.asc spark.tgz; \
- gpgconf --kill all; \
- rm -rf "$GNUPGHOME" spark.tgz.asc; \
- \
- tar -xf spark.tgz --strip-components=1; \
- chown -R spark:spark .; \
- mv jars /opt/spark/; \
- mv bin /opt/spark/; \
- mv sbin /opt/spark/; \
- mv kubernetes/dockerfiles/spark/decom.sh /opt/; \
- mv examples /opt/spark/; \
- mv kubernetes/tests /opt/spark/; \
- mv data /opt/spark/; \
- mv python/pyspark /opt/spark/python/pyspark/; \
- mv python/lib /opt/spark/python/lib/; \
- cd ..; \
- rm -rf "$SPARK_TMP";
-
-COPY entrypoint.sh /opt/
-
-ENV SPARK_HOME /opt/spark
-
-WORKDIR /opt/spark/work-dir
-RUN chmod g+w /opt/spark/work-dir
-RUN chmod a+x /opt/decom.sh
-RUN chmod a+x /opt/entrypoint.sh
-
-ENTRYPOINT [ "/opt/entrypoint.sh" ]
diff --git a/3.4.0/scala2.12-java11-python3-ubuntu/entrypoint.sh b/3.4.0/scala2.12-java11-python3-ubuntu/entrypoint.sh
deleted file mode 100644
index 4bb1557..0000000
--- a/3.4.0/scala2.12-java11-python3-ubuntu/entrypoint.sh
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# Check whether there is a passwd entry for the container UID
-myuid=$(id -u)
-mygid=$(id -g)
-# turn off -e for getent because it will return error code in anonymous uid case
-set +e
-uidentry=$(getent passwd $myuid)
-set -e
-
-# If there is no passwd entry for the container UID, attempt to create one
-if [ -z "$uidentry" ] ; then
- if [ -w /etc/passwd ] ; then
- echo "$myuid:x:$myuid:$mygid:${SPARK_USER_NAME:-anonymous
uid}:$SPARK_HOME:/bin/false" >> /etc/passwd
- else
- echo "Container ENTRYPOINT failed to add passwd entry for anonymous
UID"
- fi
-fi
-
-if [ -z "$JAVA_HOME" ]; then
- JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}')
-fi
-
-SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*"
-env | grep SPARK_JAVA_OPT_ | sort -t_ -k4 -n | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt
-readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt
-
-if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then
- SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH"
-fi
-
-if ! [ -z ${PYSPARK_PYTHON+x} ]; then
- export PYSPARK_PYTHON
-fi
-if ! [ -z ${PYSPARK_DRIVER_PYTHON+x} ]; then
- export PYSPARK_DRIVER_PYTHON
-fi
-
-# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor.
-# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s.
-if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then
- export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)"
-fi
-
-if ! [ -z ${HADOOP_CONF_DIR+x} ]; then
- SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH";
-fi
-
-if ! [ -z ${SPARK_CONF_DIR+x} ]; then
- SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH";
-elif ! [ -z ${SPARK_HOME+x} ]; then
- SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH";
-fi
-
-case "$1" in
- driver)
- shift 1
- CMD=(
- "$SPARK_HOME/bin/spark-submit"
- --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS"
- --deploy-mode client
- "$@"
- )
- ;;
- executor)
- shift 1
- CMD=(
- ${JAVA_HOME}/bin/java
- "${SPARK_EXECUTOR_JAVA_OPTS[@]}"
- -Xms$SPARK_EXECUTOR_MEMORY
- -Xmx$SPARK_EXECUTOR_MEMORY
- -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH"
- org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend
- --driver-url $SPARK_DRIVER_URL
- --executor-id $SPARK_EXECUTOR_ID
- --cores $SPARK_EXECUTOR_CORES
- --app-id $SPARK_APPLICATION_ID
- --hostname $SPARK_EXECUTOR_POD_IP
- --resourceProfileId $SPARK_RESOURCE_PROFILE_ID
- --podName $SPARK_EXECUTOR_POD_NAME
- )
- ;;
-
- *)
- # Non-spark-on-k8s command provided, proceeding in pass-through mode...
- CMD=("$@")
- ;;
-esac
-
-# Switch to spark if no USER specified (root by default) otherwise use USER directly
-switch_spark_if_root() {
- if [ $(id -u) -eq 0 ]; then
- echo gosu spark
- fi
-}
-
-# Execute the container CMD under tini for better hygiene
-exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}"
diff --git a/3.4.0/scala2.12-java11-r-ubuntu/Dockerfile b/3.4.0/scala2.12-java11-r-ubuntu/Dockerfile
index 22fe82b..c65c2ce 100644
--- a/3.4.0/scala2.12-java11-r-ubuntu/Dockerfile
+++ b/3.4.0/scala2.12-java11-r-ubuntu/Dockerfile
@@ -14,69 +14,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
-FROM eclipse-temurin:11-jre-focal
-
-ARG spark_uid=185
-
-RUN groupadd --system --gid=${spark_uid} spark && \
- useradd --system --uid=${spark_uid} --gid=spark spark
+ARG BASE_IMAGE=spark:3.4.0-scala2.12-java11-ubuntu
+FROM $BASE_IMAGE
RUN set -ex && \
apt-get update && \
- ln -s /lib /lib64 && \
- apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \
apt install -y r-base r-base-dev && \
- mkdir -p /opt/spark && \
- mkdir -p /opt/spark/examples && \
- mkdir -p /opt/spark/work-dir && \
- touch /opt/spark/RELEASE && \
- chown -R spark:spark /opt/spark && \
- rm /bin/sh && \
- ln -sv /bin/bash /bin/sh && \
- echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \
- chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
-# Install Apache Spark
-# https://downloads.apache.org/spark/KEYS
-ENV SPARK_TGZ_URL=https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz \
-    SPARK_TGZ_ASC_URL=https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz.asc \
- GPG_KEY=CC68B3D16FE33A766705160BA7E57908C7A4E1B1
-
-RUN set -ex; \
- export SPARK_TMP="$(mktemp -d)"; \
- cd $SPARK_TMP; \
- wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \
- wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \
- export GNUPGHOME="$(mktemp -d)"; \
- gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \
- gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \
- gpg --batch --verify spark.tgz.asc spark.tgz; \
- gpgconf --kill all; \
- rm -rf "$GNUPGHOME" spark.tgz.asc; \
- \
- tar -xf spark.tgz --strip-components=1; \
- chown -R spark:spark .; \
- mv jars /opt/spark/; \
- mv bin /opt/spark/; \
- mv sbin /opt/spark/; \
- mv kubernetes/dockerfiles/spark/decom.sh /opt/; \
- mv examples /opt/spark/; \
- mv kubernetes/tests /opt/spark/; \
- mv data /opt/spark/; \
- mv R /opt/spark/; \
- cd ..; \
- rm -rf "$SPARK_TMP";
-
-COPY entrypoint.sh /opt/
-
-ENV SPARK_HOME /opt/spark
ENV R_HOME /usr/lib/R
-
-WORKDIR /opt/spark/work-dir
-RUN chmod g+w /opt/spark/work-dir
-RUN chmod a+x /opt/decom.sh
-RUN chmod a+x /opt/entrypoint.sh
-
-ENTRYPOINT [ "/opt/entrypoint.sh" ]
diff --git a/3.4.0/scala2.12-java11-r-ubuntu/entrypoint.sh b/3.4.0/scala2.12-java11-r-ubuntu/entrypoint.sh
deleted file mode 100644
index 159d539..0000000
--- a/3.4.0/scala2.12-java11-r-ubuntu/entrypoint.sh
+++ /dev/null
@@ -1,107 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# Check whether there is a passwd entry for the container UID
-myuid=$(id -u)
-mygid=$(id -g)
-# turn off -e for getent because it will return error code in anonymous uid case
-set +e
-uidentry=$(getent passwd $myuid)
-set -e
-
-# If there is no passwd entry for the container UID, attempt to create one
-if [ -z "$uidentry" ] ; then
- if [ -w /etc/passwd ] ; then
- echo "$myuid:x:$myuid:$mygid:${SPARK_USER_NAME:-anonymous
uid}:$SPARK_HOME:/bin/false" >> /etc/passwd
- else
- echo "Container ENTRYPOINT failed to add passwd entry for anonymous
UID"
- fi
-fi
-
-if [ -z "$JAVA_HOME" ]; then
- JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}')
-fi
-
-SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*"
-env | grep SPARK_JAVA_OPT_ | sort -t_ -k4 -n | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt
-readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt
-
-if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then
- SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH"
-fi
-
-# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor.
-# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s.
-if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then
- export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)"
-fi
-
-if ! [ -z ${HADOOP_CONF_DIR+x} ]; then
- SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH";
-fi
-
-if ! [ -z ${SPARK_CONF_DIR+x} ]; then
- SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH";
-elif ! [ -z ${SPARK_HOME+x} ]; then
- SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH";
-fi
-
-case "$1" in
- driver)
- shift 1
- CMD=(
- "$SPARK_HOME/bin/spark-submit"
- --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS"
- --deploy-mode client
- "$@"
- )
- ;;
- executor)
- shift 1
- CMD=(
- ${JAVA_HOME}/bin/java
- "${SPARK_EXECUTOR_JAVA_OPTS[@]}"
- -Xms$SPARK_EXECUTOR_MEMORY
- -Xmx$SPARK_EXECUTOR_MEMORY
- -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH"
- org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend
- --driver-url $SPARK_DRIVER_URL
- --executor-id $SPARK_EXECUTOR_ID
- --cores $SPARK_EXECUTOR_CORES
- --app-id $SPARK_APPLICATION_ID
- --hostname $SPARK_EXECUTOR_POD_IP
- --resourceProfileId $SPARK_RESOURCE_PROFILE_ID
- --podName $SPARK_EXECUTOR_POD_NAME
- )
- ;;
-
- *)
- # Non-spark-on-k8s command provided, proceeding in pass-through mode...
- CMD=("$@")
- ;;
-esac
-
-# Switch to spark if no USER specified (root by default) otherwise use USER directly
-switch_spark_if_root() {
- if [ $(id -u) -eq 0 ]; then
- echo gosu spark
- fi
-}
-
-# Execute the container CMD under tini for better hygiene
-exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}"
diff --git a/3.4.0/scala2.12-java11-ubuntu/Dockerfile b/3.4.0/scala2.12-java11-ubuntu/Dockerfile
index 4e3df64..997b8d3 100644
--- a/3.4.0/scala2.12-java11-ubuntu/Dockerfile
+++ b/3.4.0/scala2.12-java11-ubuntu/Dockerfile
@@ -26,6 +26,7 @@ RUN set -ex && \
ln -s /lib /lib64 && \
apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \
mkdir -p /opt/spark && \
+ mkdir /opt/spark/python && \
mkdir -p /opt/spark/examples && \
mkdir -p /opt/spark/work-dir && \
touch /opt/spark/RELEASE && \
@@ -64,6 +65,9 @@ RUN set -ex; \
mv examples /opt/spark/; \
mv kubernetes/tests /opt/spark/; \
mv data /opt/spark/; \
+ mv python/pyspark /opt/spark/python/pyspark/; \
+ mv python/lib /opt/spark/python/lib/; \
+ mv R /opt/spark/; \
cd ..; \
rm -rf "$SPARK_TMP";
diff --git a/3.4.0/scala2.12-java11-ubuntu/entrypoint.sh b/3.4.0/scala2.12-java11-ubuntu/entrypoint.sh
index 159d539..4bb1557 100644
--- a/3.4.0/scala2.12-java11-ubuntu/entrypoint.sh
+++ b/3.4.0/scala2.12-java11-ubuntu/entrypoint.sh
@@ -45,6 +45,13 @@ if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then
SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH"
fi
+if ! [ -z ${PYSPARK_PYTHON+x} ]; then
+ export PYSPARK_PYTHON
+fi
+if ! [ -z ${PYSPARK_DRIVER_PYTHON+x} ]; then
+ export PYSPARK_DRIVER_PYTHON
+fi
+
# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor.
# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s.
if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then
diff --git a/Dockerfile.template b/Dockerfile.template
index 4819cb2..5fe4f25 100644
--- a/Dockerfile.template
+++ b/Dockerfile.template
@@ -25,16 +25,8 @@ RUN set -ex && \
apt-get update && \
ln -s /lib /lib64 && \
apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \
- {%- if HAVE_PY %}
- apt install -y python3 python3-pip && \
- {%- endif %}
- {%- if HAVE_R %}
- apt install -y r-base r-base-dev && \
- {%- endif %}
mkdir -p /opt/spark && \
- {%- if HAVE_PY %}
mkdir /opt/spark/python && \
- {%- endif %}
mkdir -p /opt/spark/examples && \
mkdir -p /opt/spark/work-dir && \
touch /opt/spark/RELEASE && \
@@ -73,22 +65,15 @@ RUN set -ex; \
mv examples /opt/spark/; \
mv kubernetes/tests /opt/spark/; \
mv data /opt/spark/; \
- {%- if HAVE_PY %}
mv python/pyspark /opt/spark/python/pyspark/; \
mv python/lib /opt/spark/python/lib/; \
- {%- endif %}
- {%- if HAVE_R %}
mv R /opt/spark/; \
- {%- endif %}
cd ..; \
rm -rf "$SPARK_TMP";
COPY entrypoint.sh /opt/
ENV SPARK_HOME /opt/spark
-{%- if HAVE_R %}
-ENV R_HOME /usr/lib/R
-{%- endif %}
WORKDIR /opt/spark/work-dir
RUN chmod g+w /opt/spark/work-dir
diff --git a/add-dockerfiles.sh b/add-dockerfiles.sh
index 1683f33..7dcd7b0 100755
--- a/add-dockerfiles.sh
+++ b/add-dockerfiles.sh
@@ -48,6 +48,11 @@ for TAG in $TAGS; do
OPTS+=" --spark-version $VERSION"
mkdir -p $VERSION/$TAG
- python3 tools/template.py $OPTS -f entrypoint.sh.template > $VERSION/$TAG/entrypoint.sh
- python3 tools/template.py $OPTS > $VERSION/$TAG/Dockerfile
+
+ if [ "$TAG" == "scala2.12-java11-ubuntu" ]; then
+ python3 tools/template.py $OPTS > $VERSION/$TAG/Dockerfile
+ python3 tools/template.py $OPTS -f entrypoint.sh.template > $VERSION/$TAG/entrypoint.sh
+ else
+ python3 tools/template.py $OPTS -f r-python.template > $VERSION/$TAG/Dockerfile
+ fi
done
diff --git a/entrypoint.sh.template b/entrypoint.sh.template
index dd56d84..4bb1557 100644
--- a/entrypoint.sh.template
+++ b/entrypoint.sh.template
@@ -44,7 +44,6 @@ readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt
if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then
SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH"
fi
-{%- if HAVE_PY %}
if ! [ -z ${PYSPARK_PYTHON+x} ]; then
export PYSPARK_PYTHON
@@ -52,7 +51,6 @@ fi
if ! [ -z ${PYSPARK_DRIVER_PYTHON+x} ]; then
export PYSPARK_DRIVER_PYTHON
fi
-{%- endif %}
# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor.
# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s.
diff --git a/add-dockerfiles.sh b/r-python.template
old mode 100755
new mode 100644
similarity index 50%
copy from add-dockerfiles.sh
copy to r-python.template
index 1683f33..fec4e70
--- a/add-dockerfiles.sh
+++ b/r-python.template
@@ -1,5 +1,3 @@
-#!/usr/bin/env bash
-
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
@@ -16,38 +14,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
-
-# Usage: $0 [version]
-# Generate dockerfiles for specified spark version.
-#
-# Examples:
-# - Add 3.3.0 dockerfiles:
-# $ ./add-dockerfiles.sh
-# - Add 3.3.1 dockerfiles:
-# $ ./add-dockerfiles.sh 3.3.1
-
-VERSION=${1:-"3.3.0"}
-
-TAGS="
-scala2.12-java11-python3-r-ubuntu
-scala2.12-java11-python3-ubuntu
-scala2.12-java11-r-ubuntu
-scala2.12-java11-ubuntu
-"
-
-for TAG in $TAGS; do
- OPTS=""
- if echo $TAG | grep -q "python"; then
- OPTS+=" --pyspark"
- fi
-
- if echo $TAG | grep -q "r-"; then
- OPTS+=" --sparkr"
- fi
-
- OPTS+=" --spark-version $VERSION"
-
- mkdir -p $VERSION/$TAG
- python3 tools/template.py $OPTS -f entrypoint.sh.template > $VERSION/$TAG/entrypoint.sh
- python3 tools/template.py $OPTS > $VERSION/$TAG/Dockerfile
-done
+ARG BASE_IMAGE=spark:{{ SPARK_VERSION }}-scala{{ SCALA_VERSION }}-java{{ JAVA_VERSION }}-ubuntu
+FROM $BASE_IMAGE
+
+RUN set -ex && \
+ apt-get update && \
+ {%- if HAVE_PY %}
+ apt install -y python3 python3-pip && \
+ {%- endif %}
+ {%- if HAVE_R %}
+ apt install -y r-base r-base-dev && \
+ {%- endif %}
+ rm -rf /var/cache/apt/* && \
+ rm -rf /var/lib/apt/lists/*
+{%- if HAVE_R %}
+
+ENV R_HOME /usr/lib/R
+{%- endif %}
diff --git a/tools/template.py b/tools/template.py
index 693182b..cb74cc3 100755
--- a/tools/template.py
+++ b/tools/template.py
@@ -50,6 +50,20 @@ def parse_opts():
default="3.3.0",
)
+ parser.add_argument(
+ "-j",
+ "--java-version",
+ help="The Spark version of Dockerfile.",
+ default="11",
+ )
+
+ parser.add_argument(
+ "-s",
+ "--scala-version",
+ help="The Spark version of Dockerfile.",
+ default="2.12",
+ )
+
parser.add_argument(
"-i",
"--image",
@@ -88,6 +102,8 @@ def main():
HAVE_R=opts.sparkr,
SPARK_VERSION=opts.spark_version,
SPARK_GPG_KEY=GPG_KEY_DICT.get(opts.spark_version),
+ JAVA_VERSION=opts.java_version,
+ SCALA_VERSION=opts.scala_version,
)
)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]