This is an automated email from the ASF dual-hosted git repository.
philo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 9b63e3b06d [GLUTEN-9619][VL] Add HDFS integration tests to gluten-it (#11373)
9b63e3b06d is described below
commit 9b63e3b06d24c7f2e0d1b8750e152f2177ad042d
Author: inf <[email protected]>
AuthorDate: Mon Feb 2 16:22:39 2026 +0300
[GLUTEN-9619][VL] Add HDFS integration tests to gluten-it (#11373)
---
.github/workflows/util/setup-helper.sh | 90 ++++++++++++++++++++++
.github/workflows/velox_backend_x86.yml | 12 +++
.../apache/gluten/integration/QueryRunner.scala | 13 +++-
.../apache/gluten/integration/TableCreator.scala | 40 ++++++----
.../apache/gluten/integration/ds/TpcdsSuite.scala | 3 +
.../apache/gluten/integration/h/TpchSuite.scala | 3 +
6 files changed, 146 insertions(+), 15 deletions(-)
diff --git a/.github/workflows/util/setup-helper.sh b/.github/workflows/util/setup-helper.sh
index 6560c76078..338ae057c8 100644
--- a/.github/workflows/util/setup-helper.sh
+++ b/.github/workflows/util/setup-helper.sh
@@ -34,6 +34,96 @@ function install_maven {
fi
}
+function install_hadoop {
+ echo "Installing Hadoop..."
+
+ apt-get update -y
+ apt-get install -y curl tar gzip
+
+ local HADOOP_VERSION=3.3.6
+ curl -fsSL -o hadoop.tgz "https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz"
+ tar -xzf hadoop.tgz --no-same-owner --no-same-permissions
+
+ export HADOOP_HOME="$PWD/hadoop-${HADOOP_VERSION}"
+ export PATH="$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH"
+
+ export LD_LIBRARY_PATH="$HADOOP_HOME/lib/native:$LD_LIBRARY_PATH"
+
+ if [ -n "$GITHUB_ENV" ]; then
+ echo "HADOOP_HOME=$HADOOP_HOME" >> $GITHUB_ENV
+ echo "LD_LIBRARY_PATH=$HADOOP_HOME/lib/native:$LD_LIBRARY_PATH" >>
$GITHUB_ENV
+ echo "$HADOOP_HOME/bin" >> $GITHUB_PATH
+ fi
+}
+
+function setup_hdfs {
+ export HADOOP_CONF_DIR="$HADOOP_HOME/etc/hadoop"
+
+ cat > "$HADOOP_CONF_DIR/core-site.xml" <<'EOF'
+<configuration>
+ <property>
+ <name>fs.defaultFS</name>
+ <value>hdfs://localhost:9000</value>
+ </property>
+</configuration>
+EOF
+
+ cat > "$HADOOP_CONF_DIR/hdfs-site.xml" <<'EOF'
+<configuration>
+ <property>
+ <name>dfs.replication</name>
+ <value>1</value>
+ </property>
+
+ <property>
+ <name>dfs.namenode.rpc-address</name>
+ <value>localhost:9000</value>
+ </property>
+
+ <property>
+ <name>dfs.namenode.http-address</name>
+ <value>localhost:9870</value>
+ </property>
+
+ <property>
+ <name>dfs.datanode.address</name>
+ <value>localhost:9866</value>
+ </property>
+
+ <property>
+ <name>dfs.datanode.http.address</name>
+ <value>localhost:9864</value>
+ </property>
+
+ <property>
+ <name>dfs.permissions.enabled</name>
+ <value>false</value>
+ </property>
+</configuration>
+EOF
+
+ export HDFS_TMP="${RUNNER_TEMP:-/tmp}/hdfs"
+ mkdir -p "$HDFS_TMP/nn" "$HDFS_TMP/dn"
+
+ perl -0777 -i -pe 's#</configuration># <property>\n <name>dfs.namenode.name.dir</name>\n <value>file:'"$HDFS_TMP"'/nn</value>\n </property>\n <property>\n <name>dfs.datanode.data.dir</name>\n <value>file:'"$HDFS_TMP"'/dn</value>\n </property>\n</configuration>#s' \
+ "$HADOOP_CONF_DIR/hdfs-site.xml"
+
+ if [ -n "${GITHUB_ENV:-}" ]; then
+ echo "HADOOP_CONF_DIR=$HADOOP_CONF_DIR" >> "$GITHUB_ENV"
+ echo "HADOOP_HOME=$HADOOP_HOME" >> "$GITHUB_ENV"
+ fi
+
+ "$HADOOP_HOME/bin/hdfs" namenode -format -force -nonInteractive
+ "$HADOOP_HOME/sbin/hadoop-daemon.sh" start namenode
+ "$HADOOP_HOME/sbin/hadoop-daemon.sh" start datanode
+
+ for i in {1..60}; do
+ "$HADOOP_HOME/bin/hdfs" dfs -ls / >/dev/null 2>&1 && break
+ sleep 1
+ done
+
+ "$HADOOP_HOME/bin/hdfs" dfs -ls /
+}
for cmd in "$@"
do
echo "Running: $cmd"
diff --git a/.github/workflows/velox_backend_x86.yml b/.github/workflows/velox_backend_x86.yml
index f324ab5596..459efb730d 100644
--- a/.github/workflows/velox_backend_x86.yml
+++ b/.github/workflows/velox_backend_x86.yml
@@ -175,6 +175,11 @@ jobs:
apt remove openjdk-11* -y
fi
ls -l /root/.m2/repository/org/apache/arrow/arrow-dataset/15.0.0-gluten/
+ - name: Install Hadoop & Setup HDFS
+ if: matrix.os == 'ubuntu:22.04' && matrix.spark == 'spark-3.5' && matrix.java == 'java-8'
+ run: |
+ export JAVA_HOME=/usr/lib/jvm/${{ matrix.java }}-openjdk-amd64
+ bash .github/workflows/util/setup-helper.sh install_hadoop setup_hdfs
- name: Build and run TPC-H / TPC-DS
run: |
cd $GITHUB_WORKSPACE/
@@ -194,6 +199,13 @@ jobs:
--local --preset=velox --benchmark-type=h --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
&& GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1
+ if [ "${{ matrix.os }}" = "ubuntu:22.04" ] && \
+ [ "${{ matrix.spark }}" = "spark-3.5" ] && \
+ [ "${{ matrix.java }}" = "java-8" ]; then
+ GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries-compare \
+ --local --preset=velox --benchmark-type=h --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
+ --queries=q1 --data-dir="hdfs://localhost:9000/test"
+ fi
tpc-test-centos8:
needs: build-native-lib-centos-7
diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/QueryRunner.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/QueryRunner.scala
index 04685320a0..3df38314fb 100644
--- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/QueryRunner.scala
+++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/QueryRunner.scala
@@ -22,14 +22,16 @@ import org.apache.spark.sql.{RunResult, SparkQueryRunner, SparkSession}
import com.google.common.base.Preconditions
import org.apache.commons.lang3.exception.ExceptionUtils
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileSystem, Path}
import java.io.File
+import java.net.URI
class QueryRunner(val source: String, val dataPath: String) {
import QueryRunner._
-
Preconditions.checkState(
- new File(dataPath).exists(),
+ fileExists(dataPath),
s"Data not found at $dataPath, try using command `<gluten-it>
data-gen-only <options>` to generate it first.",
Array(): _*)
@@ -63,6 +65,13 @@ class QueryRunner(val source: String, val dataPath: String) {
Failure(query.id, e)
}
}
+
+ private def fileExists(datapath: String): Boolean = {
+ if (datapath.startsWith("hdfs:")) {
+ val uri = URI.create(datapath)
+ FileSystem.get(uri, new Configuration()).exists(new Path(uri.getPath))
+ } else new File(datapath).exists()
+ }
}
object QueryRunner {
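The new fileExists helper dispatches on the URI scheme, so the existing data-path precondition now also covers HDFS. A usage sketch with hypothetical paths (assumes default Hadoop configuration):

    // Local paths keep the old java.io.File semantics:
    fileExists("/tmp/tpch-generated")          // checked via java.io.File
    // hdfs:// paths are resolved against the NameNode: URI.create keeps the
    // scheme and authority for FileSystem.get, while uri.getPath ("/test")
    // is the part that exists() actually tests.
    fileExists("hdfs://localhost:9000/test")   // checked via Hadoop FileSystem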
diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/TableCreator.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/TableCreator.scala
index b4b3c203fd..f382f9aad7 100644
--- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/TableCreator.scala
+++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/TableCreator.scala
@@ -18,9 +18,13 @@ package org.apache.gluten.integration
import org.apache.spark.sql.{AnalysisException, SparkSession}
-import java.io.File
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileSystem, Path}
+
+import java.net.URI
import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
trait TableCreator {
def create(spark: SparkSession, source: String, dataPath: String): Unit
@@ -33,44 +37,54 @@ object TableCreator {
private object DiscoverSchema extends TableCreator {
override def create(spark: SparkSession, source: String, dataPath: String): Unit = {
- val files = new File(dataPath).listFiles()
- val tableNames = files.map(_.getName)
+ val uri = URI.create(dataPath)
+ val fs = FileSystem.get(uri, new Configuration())
+
+ val basePath = new Path(dataPath)
+ val statuses = fs.listStatus(basePath)
+
+ val tableDirs = statuses.filter(_.isDirectory).map(_.getPath)
+
+ val tableNames = ArrayBuffer[String]()
+
val existedTableNames = mutable.ArrayBuffer[String]()
val createdTableNames = mutable.ArrayBuffer[String]()
val recoveredPartitionTableNames = mutable.ArrayBuffer[String]()
- if (tableNames.isEmpty) {
- return
+ tableDirs.foreach {
+ tablePath =>
+ val tableName = tablePath.getName
+ tableNames += tableName
}
println("Creating catalog tables: " + tableNames.mkString(", "))
- files.foreach(
- file => {
- val tableName = file.getName
+ tableDirs.foreach {
+ tablePath =>
+ val tableName = tablePath.getName
if (spark.catalog.tableExists(tableName)) {
existedTableNames += tableName
} else {
- spark.catalog.createTable(tableName, file.getAbsolutePath, source)
+ spark.catalog.createTable(tableName, tablePath.toString, source)
createdTableNames += tableName
try {
spark.catalog.recoverPartitions(tableName)
recoveredPartitionTableNames += tableName
} catch {
case _: AnalysisException =>
- // Swallows analysis exceptions.
}
}
- })
+ }
+ if (tableNames.isEmpty) {
+ return
+ }
if (existedTableNames.nonEmpty) {
println("Tables already exists: " + existedTableNames.mkString(", "))
}
-
if (createdTableNames.nonEmpty) {
println("Tables created: " + createdTableNames.mkString(", "))
}
-
if (recoveredPartitionTableNames.nonEmpty) {
println("Recovered partition tables: " +
recoveredPartitionTableNames.mkString(", "))
}
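Replacing java.io.File.listFiles with FileSystem.listStatus is what lets DiscoverSchema enumerate table directories on either a local or an HDFS data dir. A minimal sketch, with a hypothetical data path:

    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.{FileSystem, Path}
    import java.net.URI

    // The URI scheme (file:/ or hdfs://) selects the FileSystem implementation.
    val dataPath = "hdfs://localhost:9000/test/tpch-generated"
    val fs = FileSystem.get(URI.create(dataPath), new Configuration())
    val tableDirs = fs.listStatus(new Path(dataPath)).filter(_.isDirectory).map(_.getPath)
    // Path.toString keeps the full URI, so createTable reads the table from HDFS.
    tableDirs.foreach(p => println(s"${p.getName} -> $p"))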
diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala
index e6cff8f8cf..66a83395f7 100644
--- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala
+++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala
@@ -81,6 +81,9 @@ class TpcdsSuite(
"non_partitioned"
}
val featureFlags = dataGenFeatures.map(feature => s"-$feature").mkString("")
+ if (dataDir.startsWith("hdfs://")) {
+ return s"$dataDir/$TPCDS_WRITE_RELATIVE_PATH-$dataScale-$dataSource-$partitionedFlag$featureFlags"
+ }
new File(dataDir).toPath
.resolve(s"$TPCDS_WRITE_RELATIVE_PATH-$dataScale-$dataSource-$partitionedFlag$featureFlags")
.toFile
diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala
index a0361e9c9f..e4a1104c48 100644
--- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala
+++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala
@@ -76,6 +76,9 @@ class TpchSuite(
override private[integration] def dataWritePath(): String = {
val featureFlags = dataGenFeatures.map(feature => s"-$feature").mkString("")
+ if (dataDir.startsWith("hdfs://")) {
+ return s"$dataDir/$TPCH_WRITE_RELATIVE_PATH-$dataScale-$dataSource$featureFlags"
+ }
new File(dataDir).toPath
.resolve(s"$TPCH_WRITE_RELATIVE_PATH-$dataScale-$dataSource$featureFlags")
.toFile
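Both suites branch on the hdfs:// prefix because java.io.File is local-only and normalizes the path, which would corrupt the URI; plain string interpolation keeps it intact. A small illustration (values hypothetical):

    import java.io.File

    // java.io.File collapses the scheme's double slash:
    new File("hdfs://localhost:9000/test").getPath   // "hdfs:/localhost:9000/test"

    // String interpolation, as in dataWritePath, preserves the URI:
    val dataDir = "hdfs://localhost:9000/test"
    s"$dataDir/tpch-sf1"                             // "hdfs://localhost:9000/test/tpch-sf1"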
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]