This is an automated email from the ASF dual-hosted git repository.
beliefer pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/gluten.git
The following commit(s) were added to refs/heads/main by this push:
new fc90a7933a [VL] Respect orc.force.positional.evolution (#12234)
fc90a7933a is described below
commit fc90a7933afaf5518ad20a60a5d79d482cea5ef1
Author: Jiaan Geng <[email protected]>
AuthorDate: Sun Jun 7 13:08:19 2026 +0800
[VL] Respect orc.force.positional.evolution (#12234)
* [VL] Respect orc.force.positional.evolution
---
.../org/apache/gluten/config/VeloxConfig.scala | 3 ++-
.../org/apache/gluten/config/GlutenConfig.scala | 15 +++++++++++
.../hive/execution/GlutenHiveSQLQuerySuite.scala | 30 ++++++++++++++++++++++
.../hive/execution/GlutenHiveSQLQuerySuite.scala | 30 ++++++++++++++++++++++
.../hive/execution/GlutenHiveSQLQuerySuite.scala | 30 ++++++++++++++++++++++
.../hive/execution/GlutenHiveSQLQuerySuite.scala | 30 ++++++++++++++++++++++
.../hive/execution/GlutenHiveSQLQuerySuite.scala | 30 ++++++++++++++++++++++
7 files changed, 167 insertions(+), 1 deletion(-)
diff --git a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala
index 0e9c22154e..e411c9d190 100644
--- a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala
+++ b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala
@@ -94,7 +94,8 @@ class VeloxConfig(conf: SQLConf) extends GlutenConfig(conf) {
def cudfShuffleMaxPrefetchBytes: Long =
getConf(CUDF_SHUFFLE_MAX_PREFETCH_BYTES)
- def orcUseColumnNames: Boolean = getConf(ORC_USE_COLUMN_NAMES)
+ def orcUseColumnNames: Boolean = getConf(ORC_USE_COLUMN_NAMES) &&
+ !conf.getConfString(GlutenConfig.SPARK_ORC_FORCE_POSITIONAL_EVOLUTION, "false").toBoolean
def parquetUseColumnNames: Boolean = getConf(PARQUET_USE_COLUMN_NAMES)
diff --git a/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala b/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
index 8656b3e2af..9c59097991 100644
--- a/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
+++ b/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
@@ -437,6 +437,8 @@ object GlutenConfig extends ConfigRegistry {
val SPARK_S3_ENDPOINT_REGION: String = HADOOP_PREFIX + S3_ENDPOINT_REGION
val S3_AWS_IMDS_ENABLED = "fs.s3a.aws.imds.enabled"
val SPARK_S3_AWS_IMDS_ENABLED: String = HADOOP_PREFIX + S3_AWS_IMDS_ENABLED
+ val ORC_FORCE_POSITIONAL_EVOLUTION = "orc.force.positional.evolution"
+ val SPARK_ORC_FORCE_POSITIONAL_EVOLUTION = HADOOP_PREFIX + ORC_FORCE_POSITIONAL_EVOLUTION
// ABFS config
val ABFS_PREFIX = "fs.azure."
@@ -580,6 +582,19 @@ object GlutenConfig extends ConfigRegistry {
}
.foreach { case (k, v) => nativeConfMap.put(k, v) }
+ // When `orc.force.positional.evolution=true`, vanilla Spark maps ORC columns by
+ // position rather than by name (see OrcUtils.requestedColumnIds). The Velox ORC reader
+ // must do the same, otherwise name-based matching against a mismatched file schema
+ // reads columns back as null/empty. Override the (Velox) orcUseColumnNames session conf
+ // so native reads ORC by position too. Harmless for backends that ignore this key.
+ // String literal is used because gluten-substrait cannot depend on backends-velox.
+ if (
+ backendName == "velox" &&
+ conf.getOrElse(SPARK_ORC_FORCE_POSITIONAL_EVOLUTION, "false").toBoolean
+ ) {
+ nativeConfMap.put("spark.gluten.sql.columnar.backend.velox.orcUseColumnNames", "false")
+ }
+
// Pass the latest tokens to native
nativeConfMap.put(
ReservedKeys.GLUTEN_UGI_TOKENS,
diff --git a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
index 04382e32ad..97ec1cfba8 100644
--- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
+++ b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
@@ -137,6 +137,36 @@ class GlutenHiveSQLQuerySuite extends GlutenHiveSQLQuerySuiteBase {
purge = false)
}
+ testGluten("orc.force.positional.evolution maps Hive ORC columns by position") {
+ val hiveClient: HiveClient =
+ spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
+
+ withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "false") {
+ withTempDir {
+ dir =>
+ val orcLoc = s"file:///$dir/test_orc_pos"
+ withTable("test_orc_pos", "test_orc_pos_renamed") {
+ // Write ORC files whose physical column names are c1, c2 (c1 = 1, c2 = 2).
+ hiveClient.runSqlHive(
+ s"create table test_orc_pos(c1 int, c2 int) stored as orc location '$orcLoc'")
+ hiveClient.runSqlHive("insert into test_orc_pos select 1, 2")
+
+ // A second table over the SAME files but with mismatched column names (x, y).
+ // By name, x/y are not present in the files; only position mapping can read them.
+ hiveClient.runSqlHive(
+ s"create table test_orc_pos_renamed(x int, y int) stored as orc location '$orcLoc'")
+
+ // orc.force.positional.evolution=true => read by position: x -> c1 (=1), y -> c2 (=2).
+ withSQLConf("spark.hadoop.orc.force.positional.evolution" -> "true") {
+ val df = sql("select x, y from test_orc_pos_renamed")
+ checkAnswer(df, Seq(Row(1, 2)))
+ checkOperatorMatch[HiveTableScanExecTransformer](df)
+ }
+ }
+ }
+ }
+ }
+
test("GLUTEN-11062: Supports mixed input format for partitioned Hive table") {
val hiveClient: HiveClient =
spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
index b6e6c7b7aa..93b31a02e9 100644
--- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
+++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
@@ -49,6 +49,36 @@ class GlutenHiveSQLQuerySuite extends GlutenHiveSQLQuerySuiteBase {
purge = false)
}
+ testGluten("orc.force.positional.evolution maps Hive ORC columns by position") {
+ val hiveClient: HiveClient =
+ spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
+
+ withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "false") {
+ withTempDir {
+ dir =>
+ val orcLoc = s"file:///$dir/test_orc_pos"
+ withTable("test_orc_pos", "test_orc_pos_renamed") {
+ // Write ORC files whose physical column names are c1, c2 (c1 = 1, c2 = 2).
+ hiveClient.runSqlHive(
+ s"create table test_orc_pos(c1 int, c2 int) stored as orc location '$orcLoc'")
+ hiveClient.runSqlHive("insert into test_orc_pos select 1, 2")
+
+ // A second table over the SAME files but with mismatched column names (x, y).
+ // By name, x/y are not present in the files; only position mapping can read them.
+ hiveClient.runSqlHive(
+ s"create table test_orc_pos_renamed(x int, y int) stored as orc location '$orcLoc'")
+
+ // orc.force.positional.evolution=true => read by position: x -> c1 (=1), y -> c2 (=2).
+ withSQLConf("spark.hadoop.orc.force.positional.evolution" -> "true") {
+ val df = sql("select x, y from test_orc_pos_renamed")
+ checkAnswer(df, Seq(Row(1, 2)))
+ checkOperatorMatch[HiveTableScanExecTransformer](df)
+ }
+ }
+ }
+ }
+ }
+
test("GLUTEN-11062: Supports mixed input format for partitioned Hive table") {
val hiveClient: HiveClient =
spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
index 2125d9f364..0938f0dd6c 100644
--- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
+++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
@@ -119,6 +119,36 @@ class GlutenHiveSQLQuerySuite extends GlutenHiveSQLQuerySuiteBase {
purge = false)
}
+ testGluten("orc.force.positional.evolution maps Hive ORC columns by position") {
+ val hiveClient: HiveClient =
+ spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
+
+ withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "false") {
+ withTempDir {
+ dir =>
+ val orcLoc = s"file:///$dir/test_orc_pos"
+ withTable("test_orc_pos", "test_orc_pos_renamed") {
+ // Write ORC files whose physical column names are c1, c2 (c1 = 1, c2 = 2).
+ hiveClient.runSqlHive(
+ s"create table test_orc_pos(c1 int, c2 int) stored as orc location '$orcLoc'")
+ hiveClient.runSqlHive("insert into test_orc_pos select 1, 2")
+
+ // A second table over the SAME files but with mismatched column names (x, y).
+ // By name, x/y are not present in the files; only position mapping can read them.
+ hiveClient.runSqlHive(
+ s"create table test_orc_pos_renamed(x int, y int) stored as orc location '$orcLoc'")
+
+ // orc.force.positional.evolution=true => read by position: x -> c1 (=1), y -> c2 (=2).
+ withSQLConf("spark.hadoop.orc.force.positional.evolution" -> "true") {
+ val df = sql("select x, y from test_orc_pos_renamed")
+ checkAnswer(df, Seq(Row(1, 2)))
+ checkOperatorMatch[HiveTableScanExecTransformer](df)
+ }
+ }
+ }
+ }
+ }
+
test("GLUTEN-11062: Supports mixed input format for partitioned Hive table") {
val hiveClient: HiveClient =
spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
diff --git a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
index 2125d9f364..0938f0dd6c 100644
--- a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
+++ b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
@@ -119,6 +119,36 @@ class GlutenHiveSQLQuerySuite extends GlutenHiveSQLQuerySuiteBase {
purge = false)
}
+ testGluten("orc.force.positional.evolution maps Hive ORC columns by position") {
+ val hiveClient: HiveClient =
+ spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
+
+ withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "false") {
+ withTempDir {
+ dir =>
+ val orcLoc = s"file:///$dir/test_orc_pos"
+ withTable("test_orc_pos", "test_orc_pos_renamed") {
+ // Write ORC files whose physical column names are c1, c2 (c1 = 1, c2 = 2).
+ hiveClient.runSqlHive(
+ s"create table test_orc_pos(c1 int, c2 int) stored as orc location '$orcLoc'")
+ hiveClient.runSqlHive("insert into test_orc_pos select 1, 2")
+
+ // A second table over the SAME files but with mismatched column names (x, y).
+ // By name, x/y are not present in the files; only position mapping can read them.
+ hiveClient.runSqlHive(
+ s"create table test_orc_pos_renamed(x int, y int) stored as orc location '$orcLoc'")
+
+ // orc.force.positional.evolution=true => read by position: x -> c1 (=1), y -> c2 (=2).
+ withSQLConf("spark.hadoop.orc.force.positional.evolution" -> "true") {
+ val df = sql("select x, y from test_orc_pos_renamed")
+ checkAnswer(df, Seq(Row(1, 2)))
+ checkOperatorMatch[HiveTableScanExecTransformer](df)
+ }
+ }
+ }
+ }
+ }
+
test("GLUTEN-11062: Supports mixed input format for partitioned Hive table") {
val hiveClient: HiveClient =
spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
diff --git a/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala b/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
index 2125d9f364..0938f0dd6c 100644
--- a/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
+++ b/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
@@ -119,6 +119,36 @@ class GlutenHiveSQLQuerySuite extends GlutenHiveSQLQuerySuiteBase {
purge = false)
}
+ testGluten("orc.force.positional.evolution maps Hive ORC columns by position") {
+ val hiveClient: HiveClient =
+ spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
+
+ withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "false") {
+ withTempDir {
+ dir =>
+ val orcLoc = s"file:///$dir/test_orc_pos"
+ withTable("test_orc_pos", "test_orc_pos_renamed") {
+ // Write ORC files whose physical column names are c1, c2 (c1 = 1, c2 = 2).
+ hiveClient.runSqlHive(
+ s"create table test_orc_pos(c1 int, c2 int) stored as orc location '$orcLoc'")
+ hiveClient.runSqlHive("insert into test_orc_pos select 1, 2")
+
+ // A second table over the SAME files but with mismatched column names (x, y).
+ // By name, x/y are not present in the files; only position mapping can read them.
+ hiveClient.runSqlHive(
+ s"create table test_orc_pos_renamed(x int, y int) stored as orc location '$orcLoc'")
+
+ // orc.force.positional.evolution=true => read by position: x -> c1 (=1), y -> c2 (=2).
+ withSQLConf("spark.hadoop.orc.force.positional.evolution" -> "true") {
+ val df = sql("select x, y from test_orc_pos_renamed")
+ checkAnswer(df, Seq(Row(1, 2)))
+ checkOperatorMatch[HiveTableScanExecTransformer](df)
+ }
+ }
+ }
+ }
+ }
+
test("GLUTEN-11062: Supports mixed input format for partitioned Hive table") {
val hiveClient: HiveClient =
spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]