(gluten) branch main updated: [VL] Respect orc.force.positional.evolution (#12234)

beliefer Sat, 06 Jun 2026 22:09:33 -0700

This is an automated email from the ASF dual-hosted git repository.

beliefer pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/gluten.git



The following commit(s) were added to refs/heads/main by this push:
     new fc90a7933a [VL] Respect orc.force.positional.evolution (#12234)
fc90a7933a is described below

commit fc90a7933afaf5518ad20a60a5d79d482cea5ef1
Author: Jiaan Geng <[email protected]>
AuthorDate: Sun Jun 7 13:08:19 2026 +0800

    [VL] Respect orc.force.positional.evolution (#12234)
    
    * [VL] Respect orc.force.positional.evolution
---
 .../org/apache/gluten/config/VeloxConfig.scala     |  3 ++-
 .../org/apache/gluten/config/GlutenConfig.scala    | 15 +++++++++++
 .../hive/execution/GlutenHiveSQLQuerySuite.scala   | 30 ++++++++++++++++++++++
 .../hive/execution/GlutenHiveSQLQuerySuite.scala   | 30 ++++++++++++++++++++++
 .../hive/execution/GlutenHiveSQLQuerySuite.scala   | 30 ++++++++++++++++++++++
 .../hive/execution/GlutenHiveSQLQuerySuite.scala   | 30 ++++++++++++++++++++++
 .../hive/execution/GlutenHiveSQLQuerySuite.scala   | 30 ++++++++++++++++++++++
 7 files changed, 167 insertions(+), 1 deletion(-)

diff --git a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala
index 0e9c22154e..e411c9d190 100644
--- a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala
+++ b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala
@@ -94,7 +94,8 @@ class VeloxConfig(conf: SQLConf) extends GlutenConfig(conf) {
 
   def cudfShuffleMaxPrefetchBytes: Long = getConf(CUDF_SHUFFLE_MAX_PREFETCH_BYTES)
 
-  def orcUseColumnNames: Boolean = getConf(ORC_USE_COLUMN_NAMES)
+  def orcUseColumnNames: Boolean = getConf(ORC_USE_COLUMN_NAMES) &&
+    !conf.getConfString(GlutenConfig.SPARK_ORC_FORCE_POSITIONAL_EVOLUTION, "false").toBoolean
 
   def parquetUseColumnNames: Boolean = getConf(PARQUET_USE_COLUMN_NAMES)
 
diff --git a/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala b/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
index 8656b3e2af..9c59097991 100644
--- a/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
+++ b/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
@@ -437,6 +437,8 @@ object GlutenConfig extends ConfigRegistry {
   val SPARK_S3_ENDPOINT_REGION: String = HADOOP_PREFIX + S3_ENDPOINT_REGION
   val S3_AWS_IMDS_ENABLED = "fs.s3a.aws.imds.enabled"
   val SPARK_S3_AWS_IMDS_ENABLED: String = HADOOP_PREFIX + S3_AWS_IMDS_ENABLED
+  val ORC_FORCE_POSITIONAL_EVOLUTION = "orc.force.positional.evolution"
+  val SPARK_ORC_FORCE_POSITIONAL_EVOLUTION = HADOOP_PREFIX + ORC_FORCE_POSITIONAL_EVOLUTION
 
   // ABFS config
   val ABFS_PREFIX = "fs.azure."
@@ -580,6 +582,19 @@ object GlutenConfig extends ConfigRegistry {
       }
       .foreach { case (k, v) => nativeConfMap.put(k, v) }
 
+    // When `orc.force.positional.evolution=true`, vanilla Spark maps ORC columns by
+    // position rather than by name (see OrcUtils.requestedColumnIds). The Velox ORC reader
+    // must do the same, otherwise name-based matching against a mismatched file schema
+    // reads columns back as null/empty. Override the (Velox) orcUseColumnNames session conf
+    // so native reads ORC by position too. Harmless for backends that ignore this key.
+    // String literal is used because gluten-substrait cannot depend on backends-velox.
+    if (
+      backendName == "velox" &&
+      conf.getOrElse(SPARK_ORC_FORCE_POSITIONAL_EVOLUTION, "false").toBoolean
+    ) {
+      nativeConfMap.put("spark.gluten.sql.columnar.backend.velox.orcUseColumnNames", "false")
+    }
+
     // Pass the latest tokens to native
     nativeConfMap.put(
       ReservedKeys.GLUTEN_UGI_TOKENS,
diff --git a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
index 04382e32ad..97ec1cfba8 100644
--- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
+++ b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
@@ -137,6 +137,36 @@ class GlutenHiveSQLQuerySuite extends GlutenHiveSQLQuerySuiteBase {
       purge = false)
   }
 
+  testGluten("orc.force.positional.evolution maps Hive ORC columns by position") {
+    val hiveClient: HiveClient =
+      spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
+
+    withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "false") {
+      withTempDir {
+        dir =>
+          val orcLoc = s"file:///$dir/test_orc_pos"
+          withTable("test_orc_pos", "test_orc_pos_renamed") {
+            // Write ORC files whose physical column names are c1, c2 (c1 = 1, c2 = 2).
+            hiveClient.runSqlHive(
+              s"create table test_orc_pos(c1 int, c2 int) stored as orc location '$orcLoc'")
+            hiveClient.runSqlHive("insert into test_orc_pos select 1, 2")
+
+            // A second table over the SAME files but with mismatched column names (x, y).
+            // By name, x/y are not present in the files; only position mapping can read them.
+            hiveClient.runSqlHive(
+              s"create table test_orc_pos_renamed(x int, y int) stored as orc location '$orcLoc'")
+
+            // orc.force.positional.evolution=true => read by position: x -> c1 (=1), y -> c2 (=2).
+            withSQLConf("spark.hadoop.orc.force.positional.evolution" -> "true") {
+              val df = sql("select x, y from test_orc_pos_renamed")
+              checkAnswer(df, Seq(Row(1, 2)))
+              checkOperatorMatch[HiveTableScanExecTransformer](df)
+            }
+          }
+      }
+    }
+  }
+
   test("GLUTEN-11062: Supports mixed input format for partitioned Hive table") {
     val hiveClient: HiveClient =
       spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
index b6e6c7b7aa..93b31a02e9 100644
--- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
+++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
@@ -49,6 +49,36 @@ class GlutenHiveSQLQuerySuite extends GlutenHiveSQLQuerySuiteBase {
       purge = false)
   }
 
+  testGluten("orc.force.positional.evolution maps Hive ORC columns by position") {
+    val hiveClient: HiveClient =
+      spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
+
+    withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "false") {
+      withTempDir {
+        dir =>
+          val orcLoc = s"file:///$dir/test_orc_pos"
+          withTable("test_orc_pos", "test_orc_pos_renamed") {
+            // Write ORC files whose physical column names are c1, c2 (c1 = 1, c2 = 2).
+            hiveClient.runSqlHive(
+              s"create table test_orc_pos(c1 int, c2 int) stored as orc location '$orcLoc'")
+            hiveClient.runSqlHive("insert into test_orc_pos select 1, 2")
+
+            // A second table over the SAME files but with mismatched column names (x, y).
+            // By name, x/y are not present in the files; only position mapping can read them.
+            hiveClient.runSqlHive(
+              s"create table test_orc_pos_renamed(x int, y int) stored as orc location '$orcLoc'")
+
+            // orc.force.positional.evolution=true => read by position: x -> c1 (=1), y -> c2 (=2).
+            withSQLConf("spark.hadoop.orc.force.positional.evolution" -> "true") {
+              val df = sql("select x, y from test_orc_pos_renamed")
+              checkAnswer(df, Seq(Row(1, 2)))
+              checkOperatorMatch[HiveTableScanExecTransformer](df)
+            }
+          }
+      }
+    }
+  }
+
   test("GLUTEN-11062: Supports mixed input format for partitioned Hive table") {
     val hiveClient: HiveClient =
       spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
index 2125d9f364..0938f0dd6c 100644
--- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
+++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
@@ -119,6 +119,36 @@ class GlutenHiveSQLQuerySuite extends GlutenHiveSQLQuerySuiteBase {
       purge = false)
   }
 
+  testGluten("orc.force.positional.evolution maps Hive ORC columns by position") {
+    val hiveClient: HiveClient =
+      spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
+
+    withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "false") {
+      withTempDir {
+        dir =>
+          val orcLoc = s"file:///$dir/test_orc_pos"
+          withTable("test_orc_pos", "test_orc_pos_renamed") {
+            // Write ORC files whose physical column names are c1, c2 (c1 = 1, c2 = 2).
+            hiveClient.runSqlHive(
+              s"create table test_orc_pos(c1 int, c2 int) stored as orc location '$orcLoc'")
+            hiveClient.runSqlHive("insert into test_orc_pos select 1, 2")
+
+            // A second table over the SAME files but with mismatched column names (x, y).
+            // By name, x/y are not present in the files; only position mapping can read them.
+            hiveClient.runSqlHive(
+              s"create table test_orc_pos_renamed(x int, y int) stored as orc location '$orcLoc'")
+
+            // orc.force.positional.evolution=true => read by position: x -> c1 (=1), y -> c2 (=2).
+            withSQLConf("spark.hadoop.orc.force.positional.evolution" -> "true") {
+              val df = sql("select x, y from test_orc_pos_renamed")
+              checkAnswer(df, Seq(Row(1, 2)))
+              checkOperatorMatch[HiveTableScanExecTransformer](df)
+            }
+          }
+      }
+    }
+  }
+
   test("GLUTEN-11062: Supports mixed input format for partitioned Hive table") {
     val hiveClient: HiveClient =
       spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
diff --git a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
index 2125d9f364..0938f0dd6c 100644
--- a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
+++ b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
@@ -119,6 +119,36 @@ class GlutenHiveSQLQuerySuite extends GlutenHiveSQLQuerySuiteBase {
       purge = false)
   }
 
+  testGluten("orc.force.positional.evolution maps Hive ORC columns by position") {
+    val hiveClient: HiveClient =
+      spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
+
+    withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "false") {
+      withTempDir {
+        dir =>
+          val orcLoc = s"file:///$dir/test_orc_pos"
+          withTable("test_orc_pos", "test_orc_pos_renamed") {
+            // Write ORC files whose physical column names are c1, c2 (c1 = 1, c2 = 2).
+            hiveClient.runSqlHive(
+              s"create table test_orc_pos(c1 int, c2 int) stored as orc location '$orcLoc'")
+            hiveClient.runSqlHive("insert into test_orc_pos select 1, 2")
+
+            // A second table over the SAME files but with mismatched column names (x, y).
+            // By name, x/y are not present in the files; only position mapping can read them.
+            hiveClient.runSqlHive(
+              s"create table test_orc_pos_renamed(x int, y int) stored as orc location '$orcLoc'")
+
+            // orc.force.positional.evolution=true => read by position: x -> c1 (=1), y -> c2 (=2).
+            withSQLConf("spark.hadoop.orc.force.positional.evolution" -> "true") {
+              val df = sql("select x, y from test_orc_pos_renamed")
+              checkAnswer(df, Seq(Row(1, 2)))
+              checkOperatorMatch[HiveTableScanExecTransformer](df)
+            }
+          }
+      }
+    }
+  }
+
   test("GLUTEN-11062: Supports mixed input format for partitioned Hive table") {
     val hiveClient: HiveClient =
       spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
diff --git a/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala b/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
index 2125d9f364..0938f0dd6c 100644
--- a/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
+++ b/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
@@ -119,6 +119,36 @@ class GlutenHiveSQLQuerySuite extends GlutenHiveSQLQuerySuiteBase {
       purge = false)
   }
 
+  testGluten("orc.force.positional.evolution maps Hive ORC columns by position") {
+    val hiveClient: HiveClient =
+      spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
+
+    withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "false") {
+      withTempDir {
+        dir =>
+          val orcLoc = s"file:///$dir/test_orc_pos"
+          withTable("test_orc_pos", "test_orc_pos_renamed") {
+            // Write ORC files whose physical column names are c1, c2 (c1 = 1, c2 = 2).
+            hiveClient.runSqlHive(
+              s"create table test_orc_pos(c1 int, c2 int) stored as orc location '$orcLoc'")
+            hiveClient.runSqlHive("insert into test_orc_pos select 1, 2")
+
+            // A second table over the SAME files but with mismatched column names (x, y).
+            // By name, x/y are not present in the files; only position mapping can read them.
+            hiveClient.runSqlHive(
+              s"create table test_orc_pos_renamed(x int, y int) stored as orc location '$orcLoc'")
+
+            // orc.force.positional.evolution=true => read by position: x -> c1 (=1), y -> c2 (=2).
+            withSQLConf("spark.hadoop.orc.force.positional.evolution" -> "true") {
+              val df = sql("select x, y from test_orc_pos_renamed")
+              checkAnswer(df, Seq(Row(1, 2)))
+              checkOperatorMatch[HiveTableScanExecTransformer](df)
+            }
+          }
+      }
+    }
+  }
+
   test("GLUTEN-11062: Supports mixed input format for partitioned Hive table") {
     val hiveClient: HiveClient =
       spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(gluten) branch main updated: [VL] Respect orc.force.positional.evolution (#12234)

Reply via email to