This is an automated email from the ASF dual-hosted git repository.

jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git


The following commit(s) were added to refs/heads/master by this push:
     new 414d272811 [GH-2489] Improve the docker image build process (#2528)
414d272811 is described below

commit 414d272811918430b1766f0e4c0b36a7f6693ff0
Author: Jia Yu <[email protected]>
AuthorDate: Tue Nov 25 01:42:33 2025 -0700

    [GH-2489] Improve the docker image build process (#2528)
---
 .github/workflows/docker-build.yml |  2 +-
 docker/install-sedona.sh           |  6 ++--
 docker/install-spark.sh            | 63 +++++++++++++++++++++++++++++++++++---
 docker/install-zeppelin.sh         | 60 +++++++++++++++++++++++++++++++++---
 4 files changed, 117 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/docker-build.yml 
b/.github/workflows/docker-build.yml
index a34aaa8ade..fda7d7534b 100644
--- a/.github/workflows/docker-build.yml
+++ b/.github/workflows/docker-build.yml
@@ -45,7 +45,7 @@ jobs:
     strategy:
       fail-fast: true
       matrix:
-        os: ['ubuntu-latest']
+        os: ['ubuntu-latest', 'ubuntu-24.04-arm']
         spark: ['4.0.1']
         include:
           - spark: 4.0.1
diff --git a/docker/install-sedona.sh b/docker/install-sedona.sh
index 35507f2e57..d36e76e11c 100755
--- a/docker/install-sedona.sh
+++ b/docker/install-sedona.sh
@@ -39,17 +39,17 @@ if [ "$sedona_version" = "latest" ]; then
 else
   # Code to execute when SEDONA_VERSION is not "latest"
   # Download Sedona
-  curl --retry 5 --retry-delay 10 --retry-connrefused 
https://repo1.maven.org/maven2/org/apache/sedona/sedona-spark-shaded-"${sedona_spark_version}"_2.13/"${sedona_version}"/sedona-spark-shaded-"${sedona_spark_version}"_2.13-"${sedona_version}".jar
 -o 
"$SPARK_HOME"/jars/sedona-spark-shaded-"${sedona_spark_version}"_2.13-"${sedona_version}".jar
+  curl --progress-bar --show-error --retry 5 --retry-delay 10 
--retry-connrefused 
https://repo1.maven.org/maven2/org/apache/sedona/sedona-spark-shaded-"${sedona_spark_version}"_2.13/"${sedona_version}"/sedona-spark-shaded-"${sedona_spark_version}"_2.13-"${sedona_version}".jar
 -o 
"$SPARK_HOME"/jars/sedona-spark-shaded-"${sedona_spark_version}"_2.13-"${sedona_version}".jar
 
   # Install Sedona Python
   pip3 install apache-sedona=="${sedona_version}" --break-system-packages
 fi
 
 # Download gresearch spark extension
-curl --retry 5 --retry-delay 10 --retry-connrefused 
https://repo1.maven.org/maven2/uk/co/gresearch/spark/spark-extension_2.13/"${spark_extension_version}"-"${spark_compat_version}"/spark-extension_2.13-"${spark_extension_version}"-"${spark_compat_version}".jar
 -o 
"$SPARK_HOME"/jars/spark-extension_2.13-"${spark_extension_version}"-"${spark_compat_version}".jar
+curl --progress-bar --show-error --retry 5 --retry-delay 10 
--retry-connrefused 
https://repo1.maven.org/maven2/uk/co/gresearch/spark/spark-extension_2.13/"${spark_extension_version}"-"${spark_compat_version}"/spark-extension_2.13-"${spark_extension_version}"-"${spark_compat_version}".jar
 -o 
"$SPARK_HOME"/jars/spark-extension_2.13-"${spark_extension_version}"-"${spark_compat_version}".jar
 
 # Install Spark extension Python
 pip3 install 
pyspark-extension=="${spark_extension_version}"."${spark_compat_version}" 
--break-system-packages
 
 # Download GeoTools jar
-curl --retry 5 --retry-delay 10 --retry-connrefused 
https://repo1.maven.org/maven2/org/datasyslab/geotools-wrapper/"${geotools_wrapper_version}"/geotools-wrapper-"${geotools_wrapper_version}".jar
 -o "$SPARK_HOME"/jars/geotools-wrapper-"${geotools_wrapper_version}".jar
+curl --progress-bar --show-error --retry 5 --retry-delay 10 
--retry-connrefused 
https://repo1.maven.org/maven2/org/datasyslab/geotools-wrapper/"${geotools_wrapper_version}"/geotools-wrapper-"${geotools_wrapper_version}".jar
 -o "$SPARK_HOME"/jars/geotools-wrapper-"${geotools_wrapper_version}".jar
diff --git a/docker/install-spark.sh b/docker/install-spark.sh
index 6e3023233e..8756a006c2 100755
--- a/docker/install-spark.sh
+++ b/docker/install-spark.sh
@@ -24,16 +24,69 @@ spark_version=$1
 hadoop_s3_version=$2
 aws_sdk_version=$3
 
+# Helper function to download with throttled progress updates (every 5 seconds)
+download_with_progress() {
+    local url=$1
+    local output=$2
+    local description=${3:-"Downloading"}
+
+    # Start download in background, redirect progress to /dev/null
+    curl -L --silent --show-error --retry 5 --retry-delay 10 
--retry-connrefused "${url}" -o "${output}" &
+    local curl_pid=$!
+
+    # Monitor progress every 5 seconds
+    while kill -0 $curl_pid 2>/dev/null; do
+        sleep 5
+        if [ -f "${output}" ]; then
+            # Use stat for portability (works on both Linux and macOS)
+            local size=$(stat -c%s "${output}" 2>/dev/null || stat -f%z 
"${output}" 2>/dev/null || echo 0)
+            local size_mb=$((size / 1024 / 1024))
+            echo "${description}... ${size_mb} MB downloaded"
+        fi
+    done
+
+    # Wait for curl to finish
+    wait $curl_pid
+    local exit_code=$?
+    if [ $exit_code -ne 0 ]; then
+        echo "Download failed with exit code $exit_code"
+        return $exit_code
+    fi
+
+    # Show final size
+    if [ -f "${output}" ]; then
+        local final_size=$(stat -c%s "${output}" 2>/dev/null || stat -f%z 
"${output}" 2>/dev/null || echo 0)
+        local final_size_mb=$((final_size / 1024 / 1024))
+        echo "${description} completed: ${final_size_mb} MB"
+    fi
+}
+
 # Download Spark jar and set up PySpark
-curl --retry 5 --retry-delay 10 --retry-connrefused 
https://archive.apache.org/dist/spark/spark-"${spark_version}"/spark-"${spark_version}"-bin-hadoop3.tgz
 -o spark.tgz
-tar -xf spark.tgz && mv spark-"${spark_version}"-bin-hadoop3/* "${SPARK_HOME}"/
-rm spark.tgz && rm -rf spark-"${spark_version}"-bin-hadoop3
+# Download from Lyra Hosting mirror (faster) but verify checksum from Apache 
archive
+spark_filename="spark-${spark_version}-bin-hadoop3.tgz"
+spark_download_url="https://mirror.lyrahosting.com/apache/spark/spark-${spark_version}/${spark_filename}"
+checksum_url="https://archive.apache.org/dist/spark/spark-${spark_version}/${spark_filename}.sha512"
+
+echo "Downloading Spark ${spark_version} from Lyra Hosting mirror..."
+download_with_progress "${spark_download_url}" "${spark_filename}" 
"Downloading Spark"
+
+echo "Downloading checksum from Apache archive..."
+curl -L --silent --show-error --retry 5 --retry-delay 10 --retry-connrefused 
"${checksum_url}" -o "${spark_filename}.sha512"
+
+echo "Verifying checksum..."
+sha512sum -c "${spark_filename}.sha512" || { echo "Checksum verification 
failed!"; exit 1; }
+
+echo "Checksum verified successfully. Extracting Spark..."
+tar -xf "${spark_filename}" && mv spark-"${spark_version}"-bin-hadoop3/* 
"${SPARK_HOME}"/
+rm "${spark_filename}" "${spark_filename}.sha512" && rm -rf 
spark-"${spark_version}"-bin-hadoop3
 
 # Add S3 jars
-curl --retry 5 --retry-delay 10 --retry-connrefused 
https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/"${hadoop_s3_version}"/hadoop-aws-"${hadoop_s3_version}".jar
 -o "${SPARK_HOME}"/jars/hadoop-aws-"${hadoop_s3_version}".jar
+echo "Downloading Hadoop AWS S3 jar..."
+download_with_progress 
"https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${hadoop_s3_version}/hadoop-aws-${hadoop_s3_version}.jar"
 "${SPARK_HOME}/jars/hadoop-aws-${hadoop_s3_version}.jar" "Downloading Hadoop 
AWS"
 
 # Add AWS SDK v2 bundle (required by spark-extension 2.14.2+)
-curl --retry 5 --retry-delay 10 --retry-connrefused 
https://repo1.maven.org/maven2/software/amazon/awssdk/bundle/"${aws_sdk_version}"/bundle-"${aws_sdk_version}".jar
 -o "${SPARK_HOME}"/jars/aws-sdk-v2-bundle-"${aws_sdk_version}".jar
+echo "Downloading AWS SDK v2 bundle..."
+download_with_progress 
"https://repo1.maven.org/maven2/software/amazon/awssdk/bundle/${aws_sdk_version}/bundle-${aws_sdk_version}.jar"
 "${SPARK_HOME}/jars/aws-sdk-v2-bundle-${aws_sdk_version}.jar" "Downloading AWS 
SDK"
 
 # Set up master IP address and executor memory
 cp "${SPARK_HOME}"/conf/spark-defaults.conf.template 
"${SPARK_HOME}"/conf/spark-defaults.conf
diff --git a/docker/install-zeppelin.sh b/docker/install-zeppelin.sh
index 6bc9ce5380..3b0ca6d27f 100755
--- a/docker/install-zeppelin.sh
+++ b/docker/install-zeppelin.sh
@@ -23,9 +23,59 @@ set -e
 ZEPPELIN_VERSION=${1:-0.9.0}
 TARGET_DIR=${2:-/opt}
 
-# Download and extract Zeppelin using curl
-curl -L --retry 5 --retry-delay 10 --retry-connrefused 
https://archive.apache.org/dist/zeppelin/zeppelin-"${ZEPPELIN_VERSION}"/zeppelin-"${ZEPPELIN_VERSION}"-bin-netinst.tgz
 \
-    -o zeppelin-"${ZEPPELIN_VERSION}"-bin-netinst.tgz
-tar -xzf zeppelin-"${ZEPPELIN_VERSION}"-bin-netinst.tgz -C "${TARGET_DIR}"
+# Helper function to download with throttled progress updates (every 5 seconds)
+download_with_progress() {
+    local url=$1
+    local output=$2
+    local description=${3:-"Downloading"}
+
+    # Start download in background
+    curl -L --silent --show-error --retry 5 --retry-delay 10 
--retry-connrefused "${url}" -o "${output}" &
+    local curl_pid=$!
+
+    # Monitor progress every 5 seconds
+    while kill -0 $curl_pid 2>/dev/null; do
+        sleep 5
+        if [ -f "${output}" ]; then
+            # Use stat for portability (works on both Linux and macOS)
+            local size=$(stat -c%s "${output}" 2>/dev/null || stat -f%z 
"${output}" 2>/dev/null || echo 0)
+            local size_mb=$((size / 1024 / 1024))
+            echo "${description}... ${size_mb} MB downloaded"
+        fi
+    done
+
+    # Wait for curl to finish
+    wait $curl_pid
+    local exit_code=$?
+    if [ $exit_code -ne 0 ]; then
+        echo "Download failed with exit code $exit_code"
+        return $exit_code
+    fi
+
+    # Show final size
+    if [ -f "${output}" ]; then
+        local final_size=$(stat -c%s "${output}" 2>/dev/null || stat -f%z 
"${output}" 2>/dev/null || echo 0)
+        local final_size_mb=$((final_size / 1024 / 1024))
+        echo "${description} completed: ${final_size_mb} MB"
+    fi
+}
+
+# Download and extract Zeppelin
+# Download from Lyra Hosting mirror (faster) but verify checksum from Apache 
archive
+zeppelin_filename="zeppelin-${ZEPPELIN_VERSION}-bin-netinst.tgz"
+zeppelin_download_url="https://mirror.lyrahosting.com/apache/zeppelin/zeppelin-${ZEPPELIN_VERSION}/${zeppelin_filename}"
+checksum_url="https://archive.apache.org/dist/zeppelin/zeppelin-${ZEPPELIN_VERSION}/${zeppelin_filename}.sha512"
+
+echo "Downloading Zeppelin ${ZEPPELIN_VERSION} from Lyra Hosting mirror..."
+download_with_progress "${zeppelin_download_url}" "${zeppelin_filename}" 
"Downloading Zeppelin"
+
+echo "Downloading checksum from Apache archive..."
+curl -L --silent --show-error --retry 5 --retry-delay 10 --retry-connrefused 
"${checksum_url}" -o "${zeppelin_filename}.sha512"
+
+echo "Verifying checksum..."
+sha512sum -c "${zeppelin_filename}.sha512" || { echo "Checksum verification 
failed!"; exit 1; }
+
+echo "Checksum verified successfully. Extracting Zeppelin..."
+tar -xzf "${zeppelin_filename}" -C "${TARGET_DIR}"
 mv "${TARGET_DIR}"/zeppelin-"${ZEPPELIN_VERSION}"-bin-netinst 
"${ZEPPELIN_HOME}"
-rm zeppelin-"${ZEPPELIN_VERSION}"-bin-netinst.tgz
+rm "${zeppelin_filename}" "${zeppelin_filename}.sha512"

Reply via email to