This is an automated email from the ASF dual-hosted git repository.
hongze pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new c22eb87092 [VL] Add S3 integration tests to gluten (#11516)
c22eb87092 is described below
commit c22eb87092388a06f32fab17fc09971e7b370e7b
Author: Mariam AlMesfer <[email protected]>
AuthorDate: Thu Feb 12 20:06:29 2026 +0300
[VL] Add S3 integration tests to gluten (#11516)
Co-authored-by: Mariam-Almesfer <[email protected]>
---
.github/workflows/util/install-resources.sh | 60 ++++++++++++++++++++++
.github/workflows/velox_backend_x86.yml | 24 +++++++++
.../apache/gluten/integration/QueryRunner.scala | 2 +-
.../apache/gluten/integration/ds/TpcdsSuite.scala | 2 +-
.../apache/gluten/integration/h/TpchSuite.scala | 2 +-
5 files changed, 87 insertions(+), 3 deletions(-)
diff --git a/.github/workflows/util/install-resources.sh b/.github/workflows/util/install-resources.sh
index 0a5240d132..e325457794 100755
--- a/.github/workflows/util/install-resources.sh
+++ b/.github/workflows/util/install-resources.sh
@@ -93,6 +93,66 @@ EOF
"$HADOOP_HOME/bin/hdfs" dfs -ls /
}
+function install_minio {
+ echo "Installing MinIO..."
+
+ apt-get update -y
+ apt-get install -y curl
+
+ curl -fsSL -o /usr/local/bin/minio https://dl.min.io/server/minio/release/linux-amd64/minio
+ chmod +x /usr/local/bin/minio
+
+ curl -fsSL -o /usr/local/bin/mc https://dl.min.io/client/mc/release/linux-amd64/mc
+ chmod +x /usr/local/bin/mc
+
+ echo "MinIO installed successfully"
+}
+
+function setup_minio {
+ local spark_version="${1:-3.5}"
+ local spark_version_short=$(echo "${spark_version}" | cut -d '.' -f 1,2 | tr -d '.')
+
+ case "$spark_version" in
+ 3.3) hadoop_aws_version="3.3.2"; aws_sdk_artifact="aws-java-sdk-bundle"; aws_sdk_version="1.12.262" ;;
+ 3.4|3.5*) hadoop_aws_version="3.3.4"; aws_sdk_artifact="aws-java-sdk-bundle"; aws_sdk_version="1.12.262" ;;
+ 4.0) hadoop_aws_version="3.4.0"; aws_sdk_artifact="bundle"; aws_sdk_version="2.25.11" ;;
+ 4.1) hadoop_aws_version="3.4.1"; aws_sdk_artifact="bundle"; aws_sdk_version="2.25.11" ;;
+ *) hadoop_aws_version="3.3.4"; aws_sdk_artifact="aws-java-sdk-bundle"; aws_sdk_version="1.12.262" ;;
+ esac
+
+ local spark_jars_dir="${GITHUB_WORKSPACE:-$PWD}/tools/gluten-it/package/target/lib"
+ mkdir -p "$spark_jars_dir"
+
+ wget -nv https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${hadoop_aws_version}/hadoop-aws-${hadoop_aws_version}.jar -P "$spark_jars_dir" || return 1
+
+ if [ "$aws_sdk_artifact" == "aws-java-sdk-bundle" ]; then
+ wget -nv https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${aws_sdk_version}/aws-java-sdk-bundle-${aws_sdk_version}.jar -P "$spark_jars_dir" || return 1
+ else
+ wget -nv https://repo1.maven.org/maven2/software/amazon/awssdk/bundle/${aws_sdk_version}/bundle-${aws_sdk_version}.jar -P "$spark_jars_dir" || return 1
+ fi
+
+ export MINIO_DATA_DIR="${RUNNER_TEMP:-/tmp}/minio-data"
+ mkdir -p "$MINIO_DATA_DIR"
+ export MINIO_ROOT_USER=admin
+ export MINIO_ROOT_PASSWORD=admin123
+
+ nohup minio server --address ":9100" --console-address ":9101" "$MINIO_DATA_DIR" > /tmp/minio.log 2>&1 &
+
+ for i in {1..60}; do
+ curl -sSf http://localhost:9100/minio/health/ready >/dev/null 2>&1 && break
+ sleep 1
+ done
+
+ if ! curl -sSf http://localhost:9100/minio/health/ready >/dev/null 2>&1; then
+ echo "MinIO failed to start"
+ cat /tmp/minio.log || true
+ exit 1
+ fi
+
+ mc alias set s3local http://localhost:9100 "$MINIO_ROOT_USER" "$MINIO_ROOT_PASSWORD"
+ mc mb -p s3local/gluten-it || true
+}
+
# Installs Spark binary and source releases with:
# 1 - spark version
# 2 - hadoop version
diff --git a/.github/workflows/velox_backend_x86.yml b/.github/workflows/velox_backend_x86.yml
index 545fe71ab3..9064600f46 100644
--- a/.github/workflows/velox_backend_x86.yml
+++ b/.github/workflows/velox_backend_x86.yml
@@ -183,7 +183,15 @@ jobs:
source .github/workflows/util/install-resources.sh
install_hadoop
setup_hdfs
+ - name: Install MinIO
+ if: matrix.os == 'ubuntu:22.04' && matrix.spark == 'spark-3.5' && matrix.java == 'java-8'
+ shell: bash
+ run: |
+ export JAVA_HOME=/usr/lib/jvm/${{ matrix.java }}-openjdk-amd64
+ source .github/workflows/util/install-resources.sh
+ install_minio
- name: Build and run TPC-H / TPC-DS
+ shell: bash
run: |
cd $GITHUB_WORKSPACE/
export JAVA_HOME=/usr/lib/jvm/${{ matrix.java }}-openjdk-amd64
@@ -198,6 +206,14 @@ jobs:
esac
cd $GITHUB_WORKSPACE/tools/gluten-it
$GITHUB_WORKSPACE/$MVN_CMD clean install -P${{ matrix.spark }} -P${{ matrix.java }}
+ # Setup S3 JARs after gluten-it build
+ if [ "${{ matrix.os }}" = "ubuntu:22.04" ] && \
+ [ "${{ matrix.spark }}" = "spark-3.5" ] && \
+ [ "${{ matrix.java }}" = "java-8" ]; then
+ source $GITHUB_WORKSPACE/.github/workflows/util/install-resources.sh
+ SPARK_VERSION=$(echo "${{ matrix.spark }}" | sed 's/spark-//')
+ setup_minio "$SPARK_VERSION"
+ fi
GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=h --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
&& GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries-compare \
@@ -208,6 +224,14 @@ jobs:
GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=h --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
--queries=q1 --data-dir="hdfs://localhost:9000/test"
+ GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries-compare \
+ --local --preset=velox --benchmark-type=h --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
+ --queries=q1 --data-dir="s3a://gluten-it/test" \
+ --extra-conf=spark.hadoop.fs.s3a.endpoint=http://localhost:9100 \
+ --extra-conf=spark.hadoop.fs.s3a.access.key=admin \
+ --extra-conf=spark.hadoop.fs.s3a.secret.key=admin123 \
+ --extra-conf=spark.hadoop.fs.s3a.path.style.access=true \
+ --extra-conf=spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem
fi
tpc-test-centos8:
diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/QueryRunner.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/QueryRunner.scala
index 3df38314fb..8f817f418a 100644
--- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/QueryRunner.scala
+++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/QueryRunner.scala
@@ -67,7 +67,7 @@ class QueryRunner(val source: String, val dataPath: String) {
}
private def fileExists(datapath: String): Boolean = {
- if (datapath.startsWith("hdfs:")) {
+ if (datapath.startsWith("hdfs:") || datapath.startsWith("s3a:")) {
val uri = URI.create(datapath)
FileSystem.get(uri, new Configuration()).exists(new Path(uri.getPath))
} else new File(datapath).exists()
diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala
index 66a83395f7..9293e1a09d 100644
--- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala
+++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala
@@ -81,7 +81,7 @@ class TpcdsSuite(
"non_partitioned"
}
val featureFlags = dataGenFeatures.map(feature => s"-$feature").mkString("")
- if (dataDir.startsWith("hdfs://")) {
+ if (dataDir.startsWith("hdfs://") || dataDir.startsWith("s3a://")) {
return s"$dataDir/$TPCDS_WRITE_RELATIVE_PATH-$dataScale-$dataSource-$partitionedFlag$featureFlags"
}
new File(dataDir).toPath
diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala
index e4a1104c48..af36cc4946 100644
--- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala
+++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala
@@ -76,7 +76,7 @@ class TpchSuite(
override private[integration] def dataWritePath(): String = {
val featureFlags = dataGenFeatures.map(feature => s"-$feature").mkString("")
- if (dataDir.startsWith("hdfs://")) {
+ if (dataDir.startsWith("hdfs://") || dataDir.startsWith("s3a://")) {
return s"$dataDir/$TPCH_WRITE_RELATIVE_PATH-$dataScale-$dataSource$featureFlags"
}
new File(dataDir).toPath
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]