This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new 6c8f66cab0 [GH-2351] [CI] Fix R CI flakiness with Spark download from PySpark (#2352)
6c8f66cab0 is described below
commit 6c8f66cab0cfcfa7891d075af512656daf5b3bd8
Author: Feng Zhang <[email protected]>
AuthorDate: Tue Sep 16 22:39:21 2025 -0700
[GH-2351] [CI] Fix R CI flakiness with Spark download from PySpark (#2352)
* Fix R CI flakiness with Spark download retry logic and timeout
* download Spark the same way python.yml does
* add JAI version env variables
* revert R install script
* update R helper to check SPARK_HOME before downloading Spark from the Apache archive (see the local sketch after this list)
* clean up comments
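In short, the workflow now installs Spark via pip (PySpark), copies the JAI and Sedona jars into that installation, and exports SPARK_HOME so the R helper reuses it instead of downloading a tarball from the Apache archive. A minimal local sketch of the same idea; the PySpark version below is an assumption, not taken from the CI matrix:
# Sketch only: version and paths are illustrative; align them with the matrix in r.yml.
pip3 install pyspark==3.5.1
# Point SPARK_HOME at the Spark distribution bundled inside the PySpark package.
export SPARK_HOME=$(python3 -c "import pyspark; print(pyspark.__path__[0])")
# With SPARK_HOME set, helper-initialize.R skips sparklyr's spark_install() download.
cd R/tests
NOT_CRAN='true' Rscript testthat.R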
---
.github/workflows/r.yml | 36 +++++++++++++++++++++++++-----------
R/tests/testthat/helper-initialize.R | 27 +++++++++++++++++++++------
2 files changed, 46 insertions(+), 17 deletions(-)
diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml
index 55951c4036..236b30e7e3 100644
--- a/.github/workflows/r.yml
+++ b/.github/workflows/r.yml
@@ -41,6 +41,9 @@ on:
env:
MAVEN_OPTS: -Dmaven.wagon.httpconnectionManager.ttlSeconds=60
+ JAI_CORE_VERSION: '1.1.3'
+ JAI_CODEC_VERSION: '1.1.3'
+ JAI_IMAGEIO_VERSION: '1.1'
DO_NOT_TRACK: true
concurrency:
@@ -109,24 +112,34 @@ jobs:
distribution: 'temurin'
java-version: '11'
cache: 'maven'
- - name: Get OS name
- id: os-name
- run: |
- # `os_name` will be like "Ubuntu-20.04.1-LTS"
- OS_NAME=$(lsb_release -ds | sed 's/\s/-/g')
- echo "os-name=$OS_NAME" >> $GITHUB_OUTPUT
- - name: Cache Spark installations
- if: runner.os != 'Windows'
- uses: actions/cache@master
+ - uses: actions/setup-python@v5
with:
- path: ~/spark
- key: apache.sedona-apache-spark-${{ steps.os-name.outputs.os-name }}-${{ env.SPARK_VERSION }}
+ python-version: '3.11'
+ - name: Install PySpark
+ run: |
+ pip3 install pyspark==${SPARK_VERSION}
+ - name: Download JAI libraries
+ run: |
+ PYSPARK_PATH=$(python3 -c "import pyspark; print(pyspark.__path__[0])")
+ wget --retry-connrefused --waitretry=10 --read-timeout=20 --timeout=15 --tries=5 https://repo.osgeo.org/repository/release/javax/media/jai_core/${JAI_CORE_VERSION}/jai_core-${JAI_CORE_VERSION}.jar
+ wget --retry-connrefused --waitretry=10 --read-timeout=20 --timeout=15 --tries=5 https://repo.osgeo.org/repository/release/javax/media/jai_codec/${JAI_CODEC_VERSION}/jai_codec-${JAI_CODEC_VERSION}.jar
+ wget --retry-connrefused --waitretry=10 --read-timeout=20 --timeout=15 --tries=5 https://repo.osgeo.org/repository/release/javax/media/jai_imageio/${JAI_IMAGEIO_VERSION}/jai_imageio-${JAI_IMAGEIO_VERSION}.jar
+ mv -v jai_core-${JAI_CORE_VERSION}.jar ${PYSPARK_PATH}/jars
+ mv -v jai_codec-${JAI_CODEC_VERSION}.jar ${PYSPARK_PATH}/jars
+ mv -v jai_imageio-${JAI_IMAGEIO_VERSION}.jar ${PYSPARK_PATH}/jars
+ echo "PYSPARK_PATH=${PYSPARK_PATH}" >> $GITHUB_ENV
- name: Build Sedona libraries
run: |
SPARK_COMPAT_VERSION=${SPARK_VERSION:0:3}
mvn -q clean install -DskipTests -Dspark=${SPARK_COMPAT_VERSION} -Dscala=${SCALA_VERSION:0:4} -Dgeotools
+ - name: Copy Sedona JARs to PySpark
+ run: |
+ find spark-shaded/target -name sedona-*.jar -exec cp {} ${PYSPARK_PATH}/jars/ \;
- name: Run tests
run: |
+ # Set SPARK_HOME to PySpark path
+ export SPARK_HOME=${PYSPARK_PATH}
+
if [[ "${SPARK_VERSION:0:3}" < "3.3" ]]; then
case "$HADOOP_VERSION" in
3)
@@ -143,6 +156,7 @@ jobs:
cd ./R/tests
NOT_CRAN='true' Rscript testthat.R
shell: bash
+ timeout-minutes: 30
- uses: actions/upload-artifact@v4
if: failure()
with:
diff --git a/R/tests/testthat/helper-initialize.R b/R/tests/testthat/helper-initialize.R
index 84cadbff01..9d1143d93f 100644
--- a/R/tests/testthat/helper-initialize.R
+++ b/R/tests/testthat/helper-initialize.R
@@ -20,9 +20,16 @@ testthat_spark_connection <- function(conn_retry_interval_s = 2) {
if (!exists(conn_key, envir = .GlobalEnv)) {
version <- Sys.getenv("SPARK_VERSION")
hadoop_version <- Sys.getenv("HADOOP_VERSION")
- spark_installed <- spark_installed_versions()
- if (nrow(spark_installed[spark_installed$spark == version & spark_installed$hadoop == hadoop_version, ]) == 0) {
- spark_install(version, hadoop_version)
+ spark_home <- Sys.getenv("SPARK_HOME")
+
+ if (spark_home != "") {
+ message(sprintf("Using pre-installed Spark from: %s", spark_home))
+ } else {
+ spark_installed <- spark_installed_versions()
+ if (nrow(spark_installed[spark_installed$spark == version & spark_installed$hadoop == hadoop_version, ]) == 0) {
+ message("Installing Spark for local development...")
+ spark_install(version, hadoop_version)
+ }
}
conn_attempts <- 3
@@ -33,13 +40,21 @@ testthat_spark_connection <- function(conn_retry_interval_s = 2) {
config <- spark_config()
config[["sparklyr.connect.timeout"]] <- 300
- sc <- spark_connect(
+ # Use spark_home if set (CI), otherwise use version (local dev)
+ connect_args <- list(
master = "local",
method = "shell",
config = config,
- app_name = paste0("testthat-", uuid::UUIDgenerate()),
- version = version
+ app_name = paste0("testthat-", uuid::UUIDgenerate())
)
+
+ if (spark_home != "") {
+ connect_args$spark_home <- spark_home
+ } else {
+ connect_args$version <- version
+ }
+
+ sc <- do.call(spark_connect, connect_args)
assign(conn_key, sc, envir = .GlobalEnv)
TRUE
},