Copilot commented on code in PR #2528:
URL: https://github.com/apache/sedona/pull/2528#discussion_r2558932560


##########
docker/install-zeppelin.sh:
##########
@@ -23,9 +23,59 @@ set -e
 ZEPPELIN_VERSION=${1:-0.9.0}
 TARGET_DIR=${2:-/opt}
 
-# Download and extract Zeppelin using curl
-curl -L --retry 5 --retry-delay 10 --retry-connrefused 
https://archive.apache.org/dist/zeppelin/zeppelin-"${ZEPPELIN_VERSION}"/zeppelin-"${ZEPPELIN_VERSION}"-bin-netinst.tgz
 \
-    -o zeppelin-"${ZEPPELIN_VERSION}"-bin-netinst.tgz
-tar -xzf zeppelin-"${ZEPPELIN_VERSION}"-bin-netinst.tgz -C "${TARGET_DIR}"
+# Helper function to download with throttled progress updates (every 5 seconds)
+download_with_progress() {
+    local url=$1
+    local output=$2
+    local description=${3:-"Downloading"}
+
+    # Start download in background
+    curl -L --silent --show-error --retry 5 --retry-delay 10 
--retry-connrefused "${url}" -o "${output}" &
+    local curl_pid=$!
+
+    # Monitor progress every 5 seconds
+    while kill -0 $curl_pid 2>/dev/null; do
+        sleep 5
+        if [ -f "${output}" ]; then
+            # Use stat for portability (works on both Linux and macOS)
+            local size=$(stat -c%s "${output}" 2>/dev/null || stat -f%z 
"${output}" 2>/dev/null || echo 0)
+            local size_mb=$((size / 1024 / 1024))
+            echo "${description}... ${size_mb} MB downloaded"
+        fi
+    done
+
+    # Wait for curl to finish
+    wait $curl_pid
+    local exit_code=$?
+    if [ $exit_code -ne 0 ]; then
+        echo "Download failed with exit code $exit_code"
+        return $exit_code
+    fi
+
+    # Show final size
+    if [ -f "${output}" ]; then
+        local final_size=$(stat -c%s "${output}" 2>/dev/null || stat -f%z 
"${output}" 2>/dev/null || echo 0)
+        local final_size_mb=$((final_size / 1024 / 1024))
+        echo "${description} completed: ${final_size_mb} MB"
+    fi
+}
+
+# Download and extract Zeppelin
+# Download from Lyra Hosting mirror (faster) but verify checksum from Apache 
archive
+zeppelin_filename="zeppelin-${ZEPPELIN_VERSION}-bin-netinst.tgz"
+zeppelin_download_url="https://mirror.lyrahosting.com/apache/zeppelin/zeppelin-${ZEPPELIN_VERSION}/${zeppelin_filename}";
+checksum_url="https://archive.apache.org/dist/zeppelin/zeppelin-${ZEPPELIN_VERSION}/${zeppelin_filename}.sha512";
+
+echo "Downloading Zeppelin ${ZEPPELIN_VERSION} from Lyra Hosting mirror..."
+download_with_progress "${zeppelin_download_url}" "${zeppelin_filename}" 
"Downloading Zeppelin"
+
+echo "Downloading checksum from Apache archive..."
+curl --silent --show-error --retry 5 --retry-delay 10 --retry-connrefused 
"${checksum_url}" -o "${zeppelin_filename}.sha512"
+
+echo "Verifying checksum..."
+sha512sum -c "${zeppelin_filename}.sha512" || (echo "Checksum verification 
failed!" && exit 1)

Review Comment:
   Using subshell with `exit 1` won't terminate the script due to `set -e` not 
propagating through the subshell. Replace with `|| { echo \"Checksum 
verification failed!\"; exit 1; }` to ensure the script exits on checksum 
failure.
   ```suggestion
   sha512sum -c "${zeppelin_filename}.sha512" || { echo "Checksum verification 
failed!"; exit 1; }
   ```



##########
docker/install-spark.sh:
##########
@@ -24,16 +24,69 @@ spark_version=$1
 hadoop_s3_version=$2
 aws_sdk_version=$3
 
+# Helper function to download with throttled progress updates (every 5 seconds)
+download_with_progress() {
+    local url=$1
+    local output=$2
+    local description=${3:-"Downloading"}
+
+    # Start download in background, redirect progress to /dev/null
+    curl --silent --show-error --retry 5 --retry-delay 10 --retry-connrefused 
"${url}" -o "${output}" &
+    local curl_pid=$!
+
+    # Monitor progress every 5 seconds
+    while kill -0 $curl_pid 2>/dev/null; do
+        sleep 5
+        if [ -f "${output}" ]; then
+            # Use stat for portability (works on both Linux and macOS)
+            local size=$(stat -c%s "${output}" 2>/dev/null || stat -f%z 
"${output}" 2>/dev/null || echo 0)
+            local size_mb=$((size / 1024 / 1024))
+            echo "${description}... ${size_mb} MB downloaded"
+        fi
+    done
+
+    # Wait for curl to finish
+    wait $curl_pid
+    local exit_code=$?
+    if [ $exit_code -ne 0 ]; then
+        echo "Download failed with exit code $exit_code"
+        return $exit_code
+    fi
+
+    # Show final size
+    if [ -f "${output}" ]; then
+        local final_size=$(stat -c%s "${output}" 2>/dev/null || stat -f%z 
"${output}" 2>/dev/null || echo 0)
+        local final_size_mb=$((final_size / 1024 / 1024))
+        echo "${description} completed: ${final_size_mb} MB"
+    fi
+}
+
 # Download Spark jar and set up PySpark
-curl --retry 5 --retry-delay 10 --retry-connrefused 
https://archive.apache.org/dist/spark/spark-"${spark_version}"/spark-"${spark_version}"-bin-hadoop3.tgz
 -o spark.tgz
-tar -xf spark.tgz && mv spark-"${spark_version}"-bin-hadoop3/* "${SPARK_HOME}"/
-rm spark.tgz && rm -rf spark-"${spark_version}"-bin-hadoop3
+# Download from Lyra Hosting mirror (faster) but verify checksum from Apache 
archive
+spark_filename="spark-${spark_version}-bin-hadoop3.tgz"
+spark_download_url="https://mirror.lyrahosting.com/apache/spark/spark-${spark_version}/${spark_filename}";
+checksum_url="https://archive.apache.org/dist/spark/spark-${spark_version}/${spark_filename}.sha512";
+
+echo "Downloading Spark ${spark_version} from Lyra Hosting mirror..."
+download_with_progress "${spark_download_url}" "${spark_filename}" 
"Downloading Spark"
+
+echo "Downloading checksum from Apache archive..."
+curl --silent --show-error --retry 5 --retry-delay 10 --retry-connrefused 
"${checksum_url}" -o "${spark_filename}.sha512"
+
+echo "Verifying checksum..."
+sha512sum -c "${spark_filename}.sha512" || (echo "Checksum verification 
failed!" && exit 1)

Review Comment:
   Using subshell with `exit 1` won't terminate the script due to `set -e` not 
propagating through the subshell. Replace with `|| { echo \"Checksum 
verification failed!\"; exit 1; }` to ensure the script exits on checksum 
failure.
   ```suggestion
   sha512sum -c "${spark_filename}.sha512" || { echo "Checksum verification 
failed!"; exit 1; }
   ```



##########
docker/install-spark.sh:
##########
@@ -24,16 +24,69 @@ spark_version=$1
 hadoop_s3_version=$2
 aws_sdk_version=$3
 
+# Helper function to download with throttled progress updates (every 5 seconds)
+download_with_progress() {
+    local url=$1
+    local output=$2
+    local description=${3:-"Downloading"}
+
+    # Start download in background, redirect progress to /dev/null
+    curl --silent --show-error --retry 5 --retry-delay 10 --retry-connrefused 
"${url}" -o "${output}" &

Review Comment:
   Missing `-L` flag for following redirects. The Zeppelin script includes `-L` 
in its curl command (line 33 of install-zeppelin.sh), but this Spark version 
omits it. Mirror URLs may redirect, causing downloads to fail without this flag.
   ```suggestion
       curl -L --silent --show-error --retry 5 --retry-delay 10 
--retry-connrefused "${url}" -o "${output}" &
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to