This is an automated email from the ASF dual-hosted git repository.

jiayu pushed a commit to branch improve-docker-build-speed
in repository https://gitbox.apache.org/repos/asf/sedona.git

commit 4dde9ed388a06b234a50b4ec2a5a6959009a6dd0
Author: Jia Yu <[email protected]>
AuthorDate: Mon Nov 24 23:37:35 2025 -0800

    Improve the build process
---
 docker/install-sedona.sh   |  6 ++---
 docker/install-spark.sh    | 63 ++++++++++++++++++++++++++++++++++++++++++----
 docker/install-zeppelin.sh | 48 +++++++++++++++++++++++++++++++----
 3 files changed, 104 insertions(+), 13 deletions(-)

diff --git a/docker/install-sedona.sh b/docker/install-sedona.sh
index 35507f2e57..d36e76e11c 100755
--- a/docker/install-sedona.sh
+++ b/docker/install-sedona.sh
@@ -39,17 +39,17 @@ if [ "$sedona_version" = "latest" ]; then
 else
   # Code to execute when SEDONA_VERSION is not "latest"
   # Download Sedona
-  curl --retry 5 --retry-delay 10 --retry-connrefused https://repo1.maven.org/maven2/org/apache/sedona/sedona-spark-shaded-"${sedona_spark_version}"_2.13/"${sedona_version}"/sedona-spark-shaded-"${sedona_spark_version}"_2.13-"${sedona_version}".jar -o "$SPARK_HOME"/jars/sedona-spark-shaded-"${sedona_spark_version}"_2.13-"${sedona_version}".jar
+  curl --progress-bar --show-error --retry 5 --retry-delay 10 --retry-connrefused https://repo1.maven.org/maven2/org/apache/sedona/sedona-spark-shaded-"${sedona_spark_version}"_2.13/"${sedona_version}"/sedona-spark-shaded-"${sedona_spark_version}"_2.13-"${sedona_version}".jar -o "$SPARK_HOME"/jars/sedona-spark-shaded-"${sedona_spark_version}"_2.13-"${sedona_version}".jar
   # Install Sedona Python
   pip3 install apache-sedona=="${sedona_version}" --break-system-packages
 fi
 
 # Download gresearch spark extension
-curl --retry 5 --retry-delay 10 --retry-connrefused https://repo1.maven.org/maven2/uk/co/gresearch/spark/spark-extension_2.13/"${spark_extension_version}"-"${spark_compat_version}"/spark-extension_2.13-"${spark_extension_version}"-"${spark_compat_version}".jar -o "$SPARK_HOME"/jars/spark-extension_2.13-"${spark_extension_version}"-"${spark_compat_version}".jar
+curl --progress-bar --show-error --retry 5 --retry-delay 10 --retry-connrefused https://repo1.maven.org/maven2/uk/co/gresearch/spark/spark-extension_2.13/"${spark_extension_version}"-"${spark_compat_version}"/spark-extension_2.13-"${spark_extension_version}"-"${spark_compat_version}".jar -o "$SPARK_HOME"/jars/spark-extension_2.13-"${spark_extension_version}"-"${spark_compat_version}".jar
 # Install Spark extension Python
 pip3 install pyspark-extension=="${spark_extension_version}"."${spark_compat_version}" --break-system-packages
 
 # Download GeoTools jar
-curl --retry 5 --retry-delay 10 --retry-connrefused https://repo1.maven.org/maven2/org/datasyslab/geotools-wrapper/"${geotools_wrapper_version}"/geotools-wrapper-"${geotools_wrapper_version}".jar -o "$SPARK_HOME"/jars/geotools-wrapper-"${geotools_wrapper_version}".jar
+curl --progress-bar --show-error --retry 5 --retry-delay 10 --retry-connrefused https://repo1.maven.org/maven2/org/datasyslab/geotools-wrapper/"${geotools_wrapper_version}"/geotools-wrapper-"${geotools_wrapper_version}".jar -o "$SPARK_HOME"/jars/geotools-wrapper-"${geotools_wrapper_version}".jar
diff --git a/docker/install-spark.sh b/docker/install-spark.sh
index 6e3023233e..560dfe5290 100755
--- a/docker/install-spark.sh
+++ b/docker/install-spark.sh
@@ -24,16 +24,69 @@ spark_version=$1
 hadoop_s3_version=$2
 aws_sdk_version=$3
 
+# Helper function to download with throttled progress updates (every 5 seconds)
+download_with_progress() {
+  local url=$1
+  local output=$2
+  local description=${3:-"Downloading"}
+
+  # Start download in background, redirect progress to /dev/null
+  curl --silent --show-error --retry 5 --retry-delay 10 --retry-connrefused "${url}" -o "${output}" &
+  local curl_pid=$!
+
+  # Monitor progress every 5 seconds
+  while kill -0 $curl_pid 2>/dev/null; do
+    sleep 5
+    if [ -f "${output}" ]; then
+      # Use stat for portability (works on both Linux and macOS)
+      local size=$(stat -c%s "${output}" 2>/dev/null || stat -f%z "${output}" 2>/dev/null || echo 0)
+      local size_mb=$((size / 1024 / 1024))
+      echo "${description}... ${size_mb} MB downloaded"
+    fi
+  done
+
+  # Wait for curl to finish
+  wait $curl_pid
+  local exit_code=$?
+  if [ $exit_code -ne 0 ]; then
+    echo "Download failed with exit code $exit_code"
+    return $exit_code
+  fi
+
+  # Show final size
+  if [ -f "${output}" ]; then
+    local final_size=$(stat -c%s "${output}" 2>/dev/null || stat -f%z "${output}" 2>/dev/null || echo 0)
+    local final_size_mb=$((final_size / 1024 / 1024))
+    echo "${description} completed: ${final_size_mb} MB"
+  fi
+}
+
 # Download Spark jar and set up PySpark
-curl --retry 5 --retry-delay 10 --retry-connrefused https://archive.apache.org/dist/spark/spark-"${spark_version}"/spark-"${spark_version}"-bin-hadoop3.tgz -o spark.tgz
-tar -xf spark.tgz && mv spark-"${spark_version}"-bin-hadoop3/* "${SPARK_HOME}"/
-rm spark.tgz && rm -rf spark-"${spark_version}"-bin-hadoop3
+# Download from Lyra Hosting mirror (faster) but verify checksum from Apache archive
+spark_filename="spark-${spark_version}-bin-hadoop3.tgz"
+spark_download_url="https://mirror.lyrahosting.com/apache/spark/spark-${spark_version}/${spark_filename}"
+checksum_url="https://archive.apache.org/dist/spark/spark-${spark_version}/${spark_filename}.sha512"
+
+echo "Downloading Spark ${spark_version} from Lyra Hosting mirror..."
+download_with_progress "${spark_download_url}" "${spark_filename}" "Downloading Spark"
+
+echo "Downloading checksum from Apache archive..."
+curl --silent --show-error --retry 5 --retry-delay 10 --retry-connrefused "${checksum_url}" -o "${spark_filename}.sha512"
+
+echo "Verifying checksum..."
+sha512sum -c "${spark_filename}.sha512" || (echo "Checksum verification failed!" && exit 1)
+
+echo "Checksum verified successfully. Extracting Spark..."
+tar -xf "${spark_filename}" && mv spark-"${spark_version}"-bin-hadoop3/* "${SPARK_HOME}"/
+rm "${spark_filename}" "${spark_filename}.sha512" && rm -rf spark-"${spark_version}"-bin-hadoop3
 
 # Add S3 jars
-curl --retry 5 --retry-delay 10 --retry-connrefused https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/"${hadoop_s3_version}"/hadoop-aws-"${hadoop_s3_version}".jar -o "${SPARK_HOME}"/jars/hadoop-aws-"${hadoop_s3_version}".jar
+echo "Downloading Hadoop AWS S3 jar..."
+download_with_progress "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${hadoop_s3_version}/hadoop-aws-${hadoop_s3_version}.jar" "${SPARK_HOME}/jars/hadoop-aws-${hadoop_s3_version}.jar" "Downloading Hadoop AWS"
 
 # Add AWS SDK v2 bundle (required by spark-extension 2.14.2+)
-curl --retry 5 --retry-delay 10 --retry-connrefused https://repo1.maven.org/maven2/software/amazon/awssdk/bundle/"${aws_sdk_version}"/bundle-"${aws_sdk_version}".jar -o "${SPARK_HOME}"/jars/aws-sdk-v2-bundle-"${aws_sdk_version}".jar
+echo "Downloading AWS SDK v2 bundle..."
+download_with_progress "https://repo1.maven.org/maven2/software/amazon/awssdk/bundle/${aws_sdk_version}/bundle-${aws_sdk_version}.jar" "${SPARK_HOME}/jars/aws-sdk-v2-bundle-${aws_sdk_version}.jar" "Downloading AWS SDK"
 
 # Set up master IP address and executor memory
 cp "${SPARK_HOME}"/conf/spark-defaults.conf.template "${SPARK_HOME}"/conf/spark-defaults.conf
diff --git a/docker/install-zeppelin.sh b/docker/install-zeppelin.sh
index 6bc9ce5380..4da2a44870 100755
--- a/docker/install-zeppelin.sh
+++ b/docker/install-zeppelin.sh
@@ -23,9 +23,47 @@ set -e
 ZEPPELIN_VERSION=${1:-0.9.0}
 TARGET_DIR=${2:-/opt}
 
-# Download and extract Zeppelin using curl
-curl -L --retry 5 --retry-delay 10 --retry-connrefused https://archive.apache.org/dist/zeppelin/zeppelin-"${ZEPPELIN_VERSION}"/zeppelin-"${ZEPPELIN_VERSION}"-bin-netinst.tgz \
-  -o zeppelin-"${ZEPPELIN_VERSION}"-bin-netinst.tgz
-tar -xzf zeppelin-"${ZEPPELIN_VERSION}"-bin-netinst.tgz -C "${TARGET_DIR}"
+# Helper function to download with throttled progress updates (every 5 seconds)
+download_with_progress() {
+  local url=$1
+  local output=$2
+  local description=${3:-"Downloading"}
+
+  # Start download in background
+  curl -L --silent --show-error --retry 5 --retry-delay 10 --retry-connrefused "${url}" -o "${output}" &
+  local curl_pid=$!
+
+  # Monitor progress every 5 seconds
+  while kill -0 $curl_pid 2>/dev/null; do
+    sleep 5
+    if [ -f "${output}" ]; then
+      # Use stat for portability (works on both Linux and macOS)
+      local size=$(stat -c%s "${output}" 2>/dev/null || stat -f%z "${output}" 2>/dev/null || echo 0)
+      local size_mb=$((size / 1024 / 1024))
+      echo "${description}... ${size_mb} MB downloaded"
+    fi
+  done
+
+  # Wait for curl to finish
+  wait $curl_pid
+  local exit_code=$?
+  if [ $exit_code -ne 0 ]; then
+    echo "Download failed with exit code $exit_code"
+    return $exit_code
+  fi
+
+  # Show final size
+  if [ -f "${output}" ]; then
+    local final_size=$(stat -c%s "${output}" 2>/dev/null || stat -f%z "${output}" 2>/dev/null || echo 0)
+    local final_size_mb=$((final_size / 1024 / 1024))
+    echo "${description} completed: ${final_size_mb} MB"
+  fi
+}
+
+# Download and extract Zeppelin
+echo "Downloading Zeppelin ${ZEPPELIN_VERSION}..."
+zeppelin_filename="zeppelin-${ZEPPELIN_VERSION}-bin-netinst.tgz"
+download_with_progress "https://archive.apache.org/dist/zeppelin/zeppelin-${ZEPPELIN_VERSION}/${zeppelin_filename}" "${zeppelin_filename}" "Downloading Zeppelin"
+tar -xzf "${zeppelin_filename}" -C "${TARGET_DIR}"
 mv "${TARGET_DIR}"/zeppelin-"${ZEPPELIN_VERSION}"-bin-netinst "${ZEPPELIN_HOME}"
-rm zeppelin-"${ZEPPELIN_VERSION}"-bin-netinst.tgz
+rm "${zeppelin_filename}"
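
For reviewers who want to exercise the new helper outside an image build, here is a minimal sketch. It assumes the download_with_progress function body from install-spark.sh has been pasted into the current shell (the commit defines it inline in each script rather than in a shared file); the artifact URL and output path below are illustrative and not taken from the commit.

    #!/usr/bin/env bash
    # Minimal sketch: run download_with_progress interactively.
    # Prerequisite: paste the function definition from install-spark.sh
    # above this point. The URL and destination are examples only.
    download_with_progress \
      "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar" \
      "/tmp/hadoop-aws-3.3.4.jar" \
      "Downloading Hadoop AWS"
    # While running, this prints one throttled status line roughly every
    # 5 seconds ("Downloading Hadoop AWS... N MB downloaded") and a final
    # completion line with the total size; sizes depend on the artifact.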

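The mirror-plus-archive checksum scheme in install-spark.sh can likewise be rehearsed on its own. Below is a minimal sketch, assuming GNU sha512sum is on PATH (on macOS, `shasum -a 512 -c` is the usual substitute) and using a placeholder Spark version; Apache mirrors typically carry only current releases, which is why the checksum is always fetched from the Apache archive.

    #!/usr/bin/env bash
    # Minimal sketch of the mirror + archive verification flow.
    # spark_version is a placeholder; pick a release the mirror still hosts.
    set -e
    spark_version="4.0.1"
    f="spark-${spark_version}-bin-hadoop3.tgz"
    curl --silent --show-error -o "${f}" \
      "https://mirror.lyrahosting.com/apache/spark/spark-${spark_version}/${f}"
    curl --silent --show-error -o "${f}.sha512" \
      "https://archive.apache.org/dist/spark/spark-${spark_version}/${f}.sha512"
    # Fails if the mirror's tarball does not match the checksum published
    # by Apache; with set -e the script aborts here, mirroring the guard
    # in install-spark.sh.
    sha512sum -c "${f}.sha512"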