This is an automated email from the ASF dual-hosted git repository.
philo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 9b63e3b06d [GLUTEN-9619][VL] Add HDFS integration tests to gluten-it (#11373)
9b63e3b06d is described below
commit 9b63e3b06d24c7f2e0d1b8750e152f2177ad042d
Author: inf <[email protected]>
AuthorDate: Mon Feb 2 16:22:39 2026 +0300
[GLUTEN-9619][VL] Add HDFS integration tests to gluten-it (#11373)
---
.github/workflows/util/setup-helper.sh | 90 ++++++++++++++++++++++
.github/workflows/velox_backend_x86.yml | 12 +++
.../apache/gluten/integration/QueryRunner.scala | 13 +++-
.../apache/gluten/integration/TableCreator.scala | 40 ++++++----
.../apache/gluten/integration/ds/TpcdsSuite.scala | 3 +
.../apache/gluten/integration/h/TpchSuite.scala | 3 +
6 files changed, 146 insertions(+), 15 deletions(-)
diff --git a/.github/workflows/util/setup-helper.sh b/.github/workflows/util/setup-helper.sh
index 6560c76078..338ae057c8 100644
--- a/.github/workflows/util/setup-helper.sh
+++ b/.github/workflows/util/setup-helper.sh
@@ -34,6 +34,96 @@ function install_maven {
fi
}
+function install_hadoop {
+ echo "Installing Hadoop..."
+
+ apt-get update -y
+ apt-get install -y curl tar gzip
+
+ local HADOOP_VERSION=3.3.6
+ curl -fsSL -o hadoop.tgz "https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz"
+ tar -xzf hadoop.tgz --no-same-owner --no-same-permissions
+
+ export HADOOP_HOME="$PWD/hadoop-${HADOOP_VERSION}"
+ export PATH="$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH"
+
+ export LD_LIBRARY_PATH="$HADOOP_HOME/lib/native:$LD_LIBRARY_PATH"
+
+ if [ -n "$GITHUB_ENV" ]; then
+ echo "HADOOP_HOME=$HADOOP_HOME" >> $GITHUB_ENV
+ echo "LD_LIBRARY_PATH=$HADOOP_HOME/lib/native:$LD_LIBRARY_PATH" >>
$GITHUB_ENV
+ echo "$HADOOP_HOME/bin" >> $GITHUB_PATH
+ fi
+}
+
+function setup_hdfs {
+ export HADOOP_CONF_DIR="$HADOOP_HOME/etc/hadoop"
+
+ cat > "$HADOOP_CONF_DIR/core-site.xml" <<'EOF'
+<configuration>
+ <property>
+ <name>fs.defaultFS</name>
+ <value>hdfs://localhost:9000</value>
+ </property>
+</configuration>
+EOF
+
+ cat > "$HADOOP_CONF_DIR/hdfs-site.xml" <<'EOF'
+<configuration>
+ <property>
+ <name>dfs.replication</name>
+ <value>1</value>
+ </property>
+
+ <property>
+ <name>dfs.namenode.rpc-address</name>
+ <value>localhost:9000</value>
+ </property>
+
+ <property>
+ <name>dfs.namenode.http-address</name>
+ <value>localhost:9870</value>
+ </property>
+
+ <property>
+ <name>dfs.datanode.address</name>
+ <value>localhost:9866</value>
+ </property>
+
+ <property>
+ <name>dfs.datanode.http.address</name>
+ <value>localhost:9864</value>
+ </property>
+
+ <property>
+ <name>dfs.permissions.enabled</name>
+ <value>false</value>
+ </property>
+</configuration>
+EOF
+
+ export HDFS_TMP="${RUNNER_TEMP:-/tmp}/hdfs"
+ mkdir -p "$HDFS_TMP/nn" "$HDFS_TMP/dn"
+
+ perl -0777 -i -pe 's#</configuration># <property>\n <name>dfs.namenode.name.dir</name>\n <value>file:'"$HDFS_TMP"'/nn</value>\n </property>\n <property>\n <name>dfs.datanode.data.dir</name>\n <value>file:'"$HDFS_TMP"'/dn</value>\n </property>\n</configuration>#s' \
+ "$HADOOP_CONF_DIR/hdfs-site.xml"
+
+ if [ -n "${GITHUB_ENV:-}" ]; then
+ echo "HADOOP_CONF_DIR=$HADOOP_CONF_DIR" >> "$GITHUB_ENV"
+ echo "HADOOP_HOME=$HADOOP_HOME" >> "$GITHUB_ENV"
+ fi
+
+ "$HADOOP_HOME/bin/hdfs" namenode -format -force -nonInteractive
+ "$HADOOP_HOME/sbin/hadoop-daemon.sh" start namenode
+ "$HADOOP_HOME/sbin/hadoop-daemon.sh" start datanode
+
+ for i in {1..60}; do
+ "$HADOOP_HOME/bin/hdfs" dfs -ls / >/dev/null 2>&1 && break
+ sleep 1
+ done
+
+ "$HADOOP_HOME/bin/hdfs" dfs -ls /
+}
for cmd in "$@"
do
echo "Running: $cmd"
diff --git a/.github/workflows/velox_backend_x86.yml b/.github/workflows/velox_backend_x86.yml
index f324ab5596..459efb730d 100644
--- a/.github/workflows/velox_backend_x86.yml
+++ b/.github/workflows/velox_backend_x86.yml
@@ -175,6 +175,11 @@ jobs:
apt remove openjdk-11* -y
fi
ls -l /root/.m2/repository/org/apache/arrow/arrow-dataset/15.0.0-gluten/
+ - name: Install Hadoop & Setup HDFS
+ if: matrix.os == 'ubuntu:22.04' && matrix.spark == 'spark-3.5' && matrix.java == 'java-8'
+ run: |
+ export JAVA_HOME=/usr/lib/jvm/${{ matrix.java }}-openjdk-amd64
+ bash .github/workflows/util/setup-helper.sh install_hadoop setup_hdfs
- name: Build and run TPC-H / TPC-DS
run: |
cd $GITHUB_WORKSPACE/
@@ -194,6 +199,13 @@ jobs:
--local --preset=velox --benchmark-type=h --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
&& GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1
+ if [ "${{ matrix.os }}" = "ubuntu:22.04" ] && \
+ [ "${{ matrix.spark }}" = "spark-3.5" ] && \
+ [ "${{ matrix.java }}" = "java-8" ]; then
+ GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries-compare \
+ --local --preset=velox --benchmark-type=h --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
+ --queries=q1 --data-dir="hdfs://localhost:9000/test"
+ fi
tpc-test-centos8:
needs: build-native-lib-centos-7
diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/QueryRunner.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/QueryRunner.scala
index 04685320a0..3df38314fb 100644
--- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/QueryRunner.scala
+++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/QueryRunner.scala
@@ -22,14 +22,16 @@ import org.apache.spark.sql.{RunResult, SparkQueryRunner, SparkSession}
import com.google.common.base.Preconditions
import org.apache.commons.lang3.exception.ExceptionUtils
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileSystem, Path}
import java.io.File
+import java.net.URI
class QueryRunner(val source: String, val dataPath: String) {
import QueryRunner._
-
Preconditions.checkState(
- new File(dataPath).exists(),
+ fileExists(dataPath),
s"Data not found at $dataPath, try using command `<gluten-it>
data-gen-only <options>` to generate it first.",
Array(): _*)
@@ -63,6 +65,13 @@ class QueryRunner(val source: String, val dataPath: String) {
Failure(query.id, e)
}
}
+
+ private def fileExists(datapath: String): Boolean = {
+ if (datapath.startsWith("hdfs:")) {
+ val uri = URI.create(datapath)
+ FileSystem.get(uri, new Configuration()).exists(new Path(uri.getPath))
+ } else new File(datapath).exists()
+ }
}
object QueryRunner {
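The new fileExists helper dispatches on the URI scheme, so the existing data-path precondition now also covers HDFS. A usage sketch with hypothetical paths (assumes default Hadoop configuration):

    // Local paths keep the old java.io.File semantics:
    fileExists("/tmp/tpch-generated")          // checked via java.io.File
    // hdfs:// paths are resolved against the NameNode: URI.create keeps the
    // scheme and authority for FileSystem.get, while uri.getPath ("/test")
    // is the part that exists() actually tests.
    fileExists("hdfs://localhost:9000/test")   // checked via Hadoop FileSystem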
diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/TableCreator.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/TableCreator.scala
index b4b3c203fd..f382f9aad7 100644
--- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/TableCreator.scala
+++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/TableCreator.scala
@@ -18,9 +18,13 @@ package org.apache.gluten.integration
import org.apache.spark.sql.{AnalysisException, SparkSession}
-import java.io.File
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileSystem, Path}
+
+import java.net.URI
import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
trait TableCreator {
def create(spark: SparkSession, source: String, dataPath: String): Unit
@@ -33,44 +37,54 @@ object TableCreator {
private object DiscoverSchema extends TableCreator {
override def create(spark: SparkSession, source: String, dataPath: String): Unit = {
- val files = new File(dataPath).listFiles()
- val tableNames = files.map(_.getName)
+ val uri = URI.create(dataPath)
+ val fs = FileSystem.get(uri, new Configuration())
+
+ val basePath = new Path(dataPath)
+ val statuses = fs.listStatus(basePath)
+
+ val tableDirs = statuses.filter(_.isDirectory).map(_.getPath)
+
+ val tableNames = ArrayBuffer[String]()
+
val existedTableNames = mutable.ArrayBuffer[String]()
val createdTableNames = mutable.ArrayBuffer[String]()
val recoveredPartitionTableNames = mutable.ArrayBuffer[String]()
- if (tableNames.isEmpty) {
- return
+ tableDirs.foreach {
+ tablePath =>
+ val tableName = tablePath.getName
+ tableNames += tableName
}
println("Creating catalog tables: " + tableNames.mkString(", "))
- files.foreach(
- file => {
- val tableName = file.getName
+ tableDirs.foreach {
+ tablePath =>
+ val tableName = tablePath.getName
if (spark.catalog.tableExists(tableName)) {
existedTableNames += tableName
} else {
- spark.catalog.createTable(tableName, file.getAbsolutePath, source)
+ spark.catalog.createTable(tableName, tablePath.toString, source)
createdTableNames += tableName
try {
spark.catalog.recoverPartitions(tableName)
recoveredPartitionTableNames += tableName
} catch {
case _: AnalysisException =>
- // Swallows analysis exceptions.
}
}
- })
+ }
+ if (tableNames.isEmpty) {
+ return
+ }
if (existedTableNames.nonEmpty) {
println("Tables already exists: " + existedTableNames.mkString(", "))
}
-
if (createdTableNames.nonEmpty) {
println("Tables created: " + createdTableNames.mkString(", "))
}
-
if (recoveredPartitionTableNames.nonEmpty) {
println("Recovered partition tables: " +
recoveredPartitionTableNames.mkString(", "))
}
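Replacing java.io.File.listFiles with FileSystem.listStatus is what lets DiscoverSchema enumerate table directories on either a local or an HDFS data dir. A minimal sketch, with a hypothetical data path:

    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.{FileSystem, Path}
    import java.net.URI

    // The URI scheme (file:/ or hdfs://) selects the FileSystem implementation.
    val dataPath = "hdfs://localhost:9000/test/tpch-generated"
    val fs = FileSystem.get(URI.create(dataPath), new Configuration())
    val tableDirs = fs.listStatus(new Path(dataPath)).filter(_.isDirectory).map(_.getPath)
    // Path.toString keeps the full URI, so createTable reads the table from HDFS.
    tableDirs.foreach(p => println(s"${p.getName} -> $p"))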
diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala
index e6cff8f8cf..66a83395f7 100644
--- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala
+++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala
@@ -81,6 +81,9 @@ class TpcdsSuite(
"non_partitioned"
}
val featureFlags = dataGenFeatures.map(feature => s"-$feature").mkString("")
+ if (dataDir.startsWith("hdfs://")) {
+ return s"$dataDir/$TPCDS_WRITE_RELATIVE_PATH-$dataScale-$dataSource-$partitionedFlag$featureFlags"
+ }
new File(dataDir).toPath
.resolve(s"$TPCDS_WRITE_RELATIVE_PATH-$dataScale-$dataSource-$partitionedFlag$featureFlags")
.toFile
diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala
index a0361e9c9f..e4a1104c48 100644
--- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala
+++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala
@@ -76,6 +76,9 @@ class TpchSuite(
override private[integration] def dataWritePath(): String = {
val featureFlags = dataGenFeatures.map(feature => s"-$feature").mkString("")
+ if (dataDir.startsWith("hdfs://")) {
+ return s"$dataDir/$TPCH_WRITE_RELATIVE_PATH-$dataScale-$dataSource$featureFlags"
+ }
new File(dataDir).toPath
.resolve(s"$TPCH_WRITE_RELATIVE_PATH-$dataScale-$dataSource$featureFlags")
.toFile
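Both suites branch on the hdfs:// prefix because java.io.File is local-only and normalizes the path, which would corrupt the URI; plain string interpolation keeps it intact. A small illustration (values hypothetical):

    import java.io.File

    // java.io.File collapses the scheme's double slash:
    new File("hdfs://localhost:9000/test").getPath   // "hdfs:/localhost:9000/test"

    // String interpolation, as in dataWritePath, preserves the URI:
    val dataDir = "hdfs://localhost:9000/test"
    s"$dataDir/tpch-sf1"                             // "hdfs://localhost:9000/test/tpch-sf1"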
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]