This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new 414d272811 [GH-2489] Improve the docker image build process (#2528)
414d272811 is described below
commit 414d272811918430b1766f0e4c0b36a7f6693ff0
Author: Jia Yu <[email protected]>
AuthorDate: Tue Nov 25 01:42:33 2025 -0700
[GH-2489] Improve the docker image build process (#2528)
---
.github/workflows/docker-build.yml | 2 +-
docker/install-sedona.sh | 6 ++--
docker/install-spark.sh | 63 +++++++++++++++++++++++++++++++++++---
docker/install-zeppelin.sh | 60 +++++++++++++++++++++++++++++++++---
4 files changed, 117 insertions(+), 14 deletions(-)
diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml
index a34aaa8ade..fda7d7534b 100644
--- a/.github/workflows/docker-build.yml
+++ b/.github/workflows/docker-build.yml
@@ -45,7 +45,7 @@ jobs:
     strategy:
       fail-fast: true
       matrix:
-        os: ['ubuntu-latest']
+        os: ['ubuntu-latest', 'ubuntu-24.04-arm']
         spark: ['4.0.1']
         include:
           - spark: 4.0.1
diff --git a/docker/install-sedona.sh b/docker/install-sedona.sh
index 35507f2e57..d36e76e11c 100755
--- a/docker/install-sedona.sh
+++ b/docker/install-sedona.sh
@@ -39,17 +39,17 @@ if [ "$sedona_version" = "latest" ]; then
else
# Code to execute when SEDONA_VERSION is not "latest"
# Download Sedona
- curl --retry 5 --retry-delay 10 --retry-connrefused https://repo1.maven.org/maven2/org/apache/sedona/sedona-spark-shaded-"${sedona_spark_version}"_2.13/"${sedona_version}"/sedona-spark-shaded-"${sedona_spark_version}"_2.13-"${sedona_version}".jar -o "$SPARK_HOME"/jars/sedona-spark-shaded-"${sedona_spark_version}"_2.13-"${sedona_version}".jar
+ curl --progress-bar --show-error --retry 5 --retry-delay 10 --retry-connrefused https://repo1.maven.org/maven2/org/apache/sedona/sedona-spark-shaded-"${sedona_spark_version}"_2.13/"${sedona_version}"/sedona-spark-shaded-"${sedona_spark_version}"_2.13-"${sedona_version}".jar -o "$SPARK_HOME"/jars/sedona-spark-shaded-"${sedona_spark_version}"_2.13-"${sedona_version}".jar
# Install Sedona Python
pip3 install apache-sedona=="${sedona_version}" --break-system-packages
fi
# Download gresearch spark extension
-curl --retry 5 --retry-delay 10 --retry-connrefused https://repo1.maven.org/maven2/uk/co/gresearch/spark/spark-extension_2.13/"${spark_extension_version}"-"${spark_compat_version}"/spark-extension_2.13-"${spark_extension_version}"-"${spark_compat_version}".jar -o "$SPARK_HOME"/jars/spark-extension_2.13-"${spark_extension_version}"-"${spark_compat_version}".jar
+curl --progress-bar --show-error --retry 5 --retry-delay 10 --retry-connrefused https://repo1.maven.org/maven2/uk/co/gresearch/spark/spark-extension_2.13/"${spark_extension_version}"-"${spark_compat_version}"/spark-extension_2.13-"${spark_extension_version}"-"${spark_compat_version}".jar -o "$SPARK_HOME"/jars/spark-extension_2.13-"${spark_extension_version}"-"${spark_compat_version}".jar
# Install Spark extension Python
pip3 install pyspark-extension=="${spark_extension_version}"."${spark_compat_version}" --break-system-packages
# Download GeoTools jar
-curl --retry 5 --retry-delay 10 --retry-connrefused https://repo1.maven.org/maven2/org/datasyslab/geotools-wrapper/"${geotools_wrapper_version}"/geotools-wrapper-"${geotools_wrapper_version}".jar -o "$SPARK_HOME"/jars/geotools-wrapper-"${geotools_wrapper_version}".jar
+curl --progress-bar --show-error --retry 5 --retry-delay 10 --retry-connrefused https://repo1.maven.org/maven2/org/datasyslab/geotools-wrapper/"${geotools_wrapper_version}"/geotools-wrapper-"${geotools_wrapper_version}".jar -o "$SPARK_HOME"/jars/geotools-wrapper-"${geotools_wrapper_version}".jar
diff --git a/docker/install-spark.sh b/docker/install-spark.sh
index 6e3023233e..8756a006c2 100755
--- a/docker/install-spark.sh
+++ b/docker/install-spark.sh
@@ -24,16 +24,69 @@ spark_version=$1
hadoop_s3_version=$2
aws_sdk_version=$3
+# Helper function to download with throttled progress updates (every 5 seconds)
+download_with_progress() {
+    local url=$1
+    local output=$2
+    local description=${3:-"Downloading"}
+
+    # Start download in background, redirect progress to /dev/null
+    curl -L --silent --show-error --retry 5 --retry-delay 10 --retry-connrefused "${url}" -o "${output}" &
+    local curl_pid=$!
+
+    # Monitor progress every 5 seconds
+    while kill -0 $curl_pid 2>/dev/null; do
+        sleep 5
+        if [ -f "${output}" ]; then
+            # Use stat for portability (works on both Linux and macOS)
+            local size=$(stat -c%s "${output}" 2>/dev/null || stat -f%z "${output}" 2>/dev/null || echo 0)
+            local size_mb=$((size / 1024 / 1024))
+            echo "${description}... ${size_mb} MB downloaded"
+        fi
+    done
+
+    # Wait for curl to finish
+    wait $curl_pid
+    local exit_code=$?
+    if [ $exit_code -ne 0 ]; then
+        echo "Download failed with exit code $exit_code"
+        return $exit_code
+    fi
+
+    # Show final size
+    if [ -f "${output}" ]; then
+        local final_size=$(stat -c%s "${output}" 2>/dev/null || stat -f%z "${output}" 2>/dev/null || echo 0)
+        local final_size_mb=$((final_size / 1024 / 1024))
+        echo "${description} completed: ${final_size_mb} MB"
+    fi
+}
+
# Download Spark jar and set up PySpark
-curl --retry 5 --retry-delay 10 --retry-connrefused https://archive.apache.org/dist/spark/spark-"${spark_version}"/spark-"${spark_version}"-bin-hadoop3.tgz -o spark.tgz
-tar -xf spark.tgz && mv spark-"${spark_version}"-bin-hadoop3/* "${SPARK_HOME}"/
-rm spark.tgz && rm -rf spark-"${spark_version}"-bin-hadoop3
+# Download from Lyra Hosting mirror (faster) but verify checksum from Apache archive
+spark_filename="spark-${spark_version}-bin-hadoop3.tgz"
+spark_download_url="https://mirror.lyrahosting.com/apache/spark/spark-${spark_version}/${spark_filename}"
+checksum_url="https://archive.apache.org/dist/spark/spark-${spark_version}/${spark_filename}.sha512"
+
+echo "Downloading Spark ${spark_version} from Lyra Hosting mirror..."
+download_with_progress "${spark_download_url}" "${spark_filename}" "Downloading Spark"
+
+echo "Downloading checksum from Apache archive..."
+curl -L --silent --show-error --retry 5 --retry-delay 10 --retry-connrefused "${checksum_url}" -o "${spark_filename}.sha512"
+
+echo "Verifying checksum..."
+sha512sum -c "${spark_filename}.sha512" || { echo "Checksum verification failed!"; exit 1; }
+
+echo "Checksum verified successfully. Extracting Spark..."
+tar -xf "${spark_filename}" && mv spark-"${spark_version}"-bin-hadoop3/* "${SPARK_HOME}"/
+rm "${spark_filename}" "${spark_filename}.sha512" && rm -rf spark-"${spark_version}"-bin-hadoop3
# Add S3 jars
-curl --retry 5 --retry-delay 10 --retry-connrefused https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/"${hadoop_s3_version}"/hadoop-aws-"${hadoop_s3_version}".jar -o "${SPARK_HOME}"/jars/hadoop-aws-"${hadoop_s3_version}".jar
+echo "Downloading Hadoop AWS S3 jar..."
+download_with_progress "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${hadoop_s3_version}/hadoop-aws-${hadoop_s3_version}.jar" "${SPARK_HOME}/jars/hadoop-aws-${hadoop_s3_version}.jar" "Downloading Hadoop AWS"
# Add AWS SDK v2 bundle (required by spark-extension 2.14.2+)
-curl --retry 5 --retry-delay 10 --retry-connrefused https://repo1.maven.org/maven2/software/amazon/awssdk/bundle/"${aws_sdk_version}"/bundle-"${aws_sdk_version}".jar -o "${SPARK_HOME}"/jars/aws-sdk-v2-bundle-"${aws_sdk_version}".jar
+echo "Downloading AWS SDK v2 bundle..."
+download_with_progress "https://repo1.maven.org/maven2/software/amazon/awssdk/bundle/${aws_sdk_version}/bundle-${aws_sdk_version}.jar" "${SPARK_HOME}/jars/aws-sdk-v2-bundle-${aws_sdk_version}.jar" "Downloading AWS SDK"
# Set up master IP address and executor memory
cp "${SPARK_HOME}"/conf/spark-defaults.conf.template
"${SPARK_HOME}"/conf/spark-defaults.conf
diff --git a/docker/install-zeppelin.sh b/docker/install-zeppelin.sh
index 6bc9ce5380..3b0ca6d27f 100755
--- a/docker/install-zeppelin.sh
+++ b/docker/install-zeppelin.sh
@@ -23,9 +23,59 @@ set -e
ZEPPELIN_VERSION=${1:-0.9.0}
TARGET_DIR=${2:-/opt}
-# Download and extract Zeppelin using curl
-curl -L --retry 5 --retry-delay 10 --retry-connrefused https://archive.apache.org/dist/zeppelin/zeppelin-"${ZEPPELIN_VERSION}"/zeppelin-"${ZEPPELIN_VERSION}"-bin-netinst.tgz \
- -o zeppelin-"${ZEPPELIN_VERSION}"-bin-netinst.tgz
-tar -xzf zeppelin-"${ZEPPELIN_VERSION}"-bin-netinst.tgz -C "${TARGET_DIR}"
+# Helper function to download with throttled progress updates (every 5 seconds)
+download_with_progress() {
+    local url=$1
+    local output=$2
+    local description=${3:-"Downloading"}
+
+    # Start download in background
+    curl -L --silent --show-error --retry 5 --retry-delay 10 --retry-connrefused "${url}" -o "${output}" &
+    local curl_pid=$!
+
+    # Monitor progress every 5 seconds
+    while kill -0 $curl_pid 2>/dev/null; do
+        sleep 5
+        if [ -f "${output}" ]; then
+            # Use stat for portability (works on both Linux and macOS)
+            local size=$(stat -c%s "${output}" 2>/dev/null || stat -f%z "${output}" 2>/dev/null || echo 0)
+            local size_mb=$((size / 1024 / 1024))
+            echo "${description}... ${size_mb} MB downloaded"
+        fi
+    done
+
+    # Wait for curl to finish
+    wait $curl_pid
+    local exit_code=$?
+    if [ $exit_code -ne 0 ]; then
+        echo "Download failed with exit code $exit_code"
+        return $exit_code
+    fi
+
+    # Show final size
+    if [ -f "${output}" ]; then
+        local final_size=$(stat -c%s "${output}" 2>/dev/null || stat -f%z "${output}" 2>/dev/null || echo 0)
+        local final_size_mb=$((final_size / 1024 / 1024))
+        echo "${description} completed: ${final_size_mb} MB"
+    fi
+}
+
+# Download and extract Zeppelin
+# Download from Lyra Hosting mirror (faster) but verify checksum from Apache archive
+zeppelin_filename="zeppelin-${ZEPPELIN_VERSION}-bin-netinst.tgz"
+zeppelin_download_url="https://mirror.lyrahosting.com/apache/zeppelin/zeppelin-${ZEPPELIN_VERSION}/${zeppelin_filename}"
+checksum_url="https://archive.apache.org/dist/zeppelin/zeppelin-${ZEPPELIN_VERSION}/${zeppelin_filename}.sha512"
+
+echo "Downloading Zeppelin ${ZEPPELIN_VERSION} from Lyra Hosting mirror..."
+download_with_progress "${zeppelin_download_url}" "${zeppelin_filename}" "Downloading Zeppelin"
+
+echo "Downloading checksum from Apache archive..."
+curl -L --silent --show-error --retry 5 --retry-delay 10 --retry-connrefused "${checksum_url}" -o "${zeppelin_filename}.sha512"
+
+echo "Verifying checksum..."
+sha512sum -c "${zeppelin_filename}.sha512" || { echo "Checksum verification failed!"; exit 1; }
+
+echo "Checksum verified successfully. Extracting Zeppelin..."
+tar -xzf "${zeppelin_filename}" -C "${TARGET_DIR}"
mv "${TARGET_DIR}"/zeppelin-"${ZEPPELIN_VERSION}"-bin-netinst
"${ZEPPELIN_HOME}"
-rm zeppelin-"${ZEPPELIN_VERSION}"-bin-netinst.tgz
+rm "${zeppelin_filename}" "${zeppelin_filename}.sha512"