This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new 6c8f66cab0 [GH-2351] [CI] Fix R CI flakiness with Spark download from PySpark (#2352)
6c8f66cab0 is described below
commit 6c8f66cab0cfcfa7891d075af512656daf5b3bd8
Author: Feng Zhang <[email protected]>
AuthorDate: Tue Sep 16 22:39:21 2025 -0700
[GH-2351] [CI] Fix R CI flakiness with Spark download from PySpark (#2352)
* Fix R CI flakiness with Spark download retry logic and timeout
* download Spark the same way python.yml does
* add JAI version env variables
* revert R install script
* update R helper to check SPARK_HOME before downloading Spark from the Apache archive (see the local sketch after this list)
* clean up comments
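In short, the workflow now installs Spark via pip (PySpark), copies the JAI and Sedona jars into that installation, and exports SPARK_HOME so the R helper reuses it instead of downloading a tarball from the Apache archive. A minimal local sketch of the same idea; the PySpark version below is an assumption, not taken from the CI matrix:
# Sketch only: version and paths are illustrative; align them with the matrix in r.yml.
pip3 install pyspark==3.5.1
# Point SPARK_HOME at the Spark distribution bundled inside the PySpark package.
export SPARK_HOME=$(python3 -c "import pyspark; print(pyspark.__path__[0])")
# With SPARK_HOME set, helper-initialize.R skips sparklyr's spark_install() download.
cd R/tests
NOT_CRAN='true' Rscript testthat.R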
---
.github/workflows/r.yml | 36 +++++++++++++++++++++++++-----------
R/tests/testthat/helper-initialize.R | 27 +++++++++++++++++++++------
2 files changed, 46 insertions(+), 17 deletions(-)
diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml
index 55951c4036..236b30e7e3 100644
--- a/.github/workflows/r.yml
+++ b/.github/workflows/r.yml
@@ -41,6 +41,9 @@ on:
env:
MAVEN_OPTS: -Dmaven.wagon.httpconnectionManager.ttlSeconds=60
+ JAI_CORE_VERSION: '1.1.3'
+ JAI_CODEC_VERSION: '1.1.3'
+ JAI_IMAGEIO_VERSION: '1.1'
DO_NOT_TRACK: true
concurrency:
@@ -109,24 +112,34 @@ jobs:
distribution: 'temurin'
java-version: '11'
cache: 'maven'
- - name: Get OS name
- id: os-name
- run: |
- # `os_name` will be like "Ubuntu-20.04.1-LTS"
- OS_NAME=$(lsb_release -ds | sed 's/\s/-/g')
- echo "os-name=$OS_NAME" >> $GITHUB_OUTPUT
- - name: Cache Spark installations
- if: runner.os != 'Windows'
- uses: actions/cache@master
+ - uses: actions/setup-python@v5
with:
- path: ~/spark
- key: apache.sedona-apache-spark-${{ steps.os-name.outputs.os-name }}-${{ env.SPARK_VERSION }}
+ python-version: '3.11'
+ - name: Install PySpark
+ run: |
+ pip3 install pyspark==${SPARK_VERSION}
+ - name: Download JAI libraries
+ run: |
+ PYSPARK_PATH=$(python3 -c "import pyspark; print(pyspark.__path__[0])")
+ wget --retry-connrefused --waitretry=10 --read-timeout=20 --timeout=15 --tries=5 https://repo.osgeo.org/repository/release/javax/media/jai_core/${JAI_CORE_VERSION}/jai_core-${JAI_CORE_VERSION}.jar
+ wget --retry-connrefused --waitretry=10 --read-timeout=20 --timeout=15 --tries=5 https://repo.osgeo.org/repository/release/javax/media/jai_codec/${JAI_CODEC_VERSION}/jai_codec-${JAI_CODEC_VERSION}.jar
+ wget --retry-connrefused --waitretry=10 --read-timeout=20 --timeout=15 --tries=5 https://repo.osgeo.org/repository/release/javax/media/jai_imageio/${JAI_IMAGEIO_VERSION}/jai_imageio-${JAI_IMAGEIO_VERSION}.jar
+ mv -v jai_core-${JAI_CORE_VERSION}.jar ${PYSPARK_PATH}/jars
+ mv -v jai_codec-${JAI_CODEC_VERSION}.jar ${PYSPARK_PATH}/jars
+ mv -v jai_imageio-${JAI_IMAGEIO_VERSION}.jar ${PYSPARK_PATH}/jars
+ echo "PYSPARK_PATH=${PYSPARK_PATH}" >> $GITHUB_ENV
- name: Build Sedona libraries
run: |
SPARK_COMPAT_VERSION=${SPARK_VERSION:0:3}
mvn -q clean install -DskipTests -Dspark=${SPARK_COMPAT_VERSION} -Dscala=${SCALA_VERSION:0:4} -Dgeotools
+ - name: Copy Sedona JARs to PySpark
+ run: |
+ find spark-shaded/target -name sedona-*.jar -exec cp {} ${PYSPARK_PATH}/jars/ \;
- name: Run tests
run: |
+ # Set SPARK_HOME to PySpark path
+ export SPARK_HOME=${PYSPARK_PATH}
+
if [[ "${SPARK_VERSION:0:3}" < "3.3" ]]; then
case "$HADOOP_VERSION" in
3)
@@ -143,6 +156,7 @@ jobs:
cd ./R/tests
NOT_CRAN='true' Rscript testthat.R
shell: bash
+ timeout-minutes: 30
- uses: actions/upload-artifact@v4
if: failure()
with:
diff --git a/R/tests/testthat/helper-initialize.R b/R/tests/testthat/helper-initialize.R
index 84cadbff01..9d1143d93f 100644
--- a/R/tests/testthat/helper-initialize.R
+++ b/R/tests/testthat/helper-initialize.R
@@ -20,9 +20,16 @@ testthat_spark_connection <- function(conn_retry_interval_s = 2) {
if (!exists(conn_key, envir = .GlobalEnv)) {
version <- Sys.getenv("SPARK_VERSION")
hadoop_version <- Sys.getenv("HADOOP_VERSION")
- spark_installed <- spark_installed_versions()
- if (nrow(spark_installed[spark_installed$spark == version & spark_installed$hadoop == hadoop_version, ]) == 0) {
- spark_install(version, hadoop_version)
+ spark_home <- Sys.getenv("SPARK_HOME")
+
+ if (spark_home != "") {
+ message(sprintf("Using pre-installed Spark from: %s", spark_home))
+ } else {
+ spark_installed <- spark_installed_versions()
+ if (nrow(spark_installed[spark_installed$spark == version & spark_installed$hadoop == hadoop_version, ]) == 0) {
+ message("Installing Spark for local development...")
+ spark_install(version, hadoop_version)
+ }
}
conn_attempts <- 3
@@ -33,13 +40,21 @@ testthat_spark_connection <- function(conn_retry_interval_s = 2) {
config <- spark_config()
config[["sparklyr.connect.timeout"]] <- 300
- sc <- spark_connect(
+ # Use spark_home if set (CI), otherwise use version (local dev)
+ connect_args <- list(
master = "local",
method = "shell",
config = config,
- app_name = paste0("testthat-", uuid::UUIDgenerate()),
- version = version
+ app_name = paste0("testthat-", uuid::UUIDgenerate())
)
+
+ if (spark_home != "") {
+ connect_args$spark_home <- spark_home
+ } else {
+ connect_args$version <- version
+ }
+
+ sc <- do.call(spark_connect, connect_args)
assign(conn_key, sc, envir = .GlobalEnv)
TRUE
},