Copilot commented on code in PR #12234:
URL: https://github.com/apache/gluten/pull/12234#discussion_r3366942768


##########
gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala:
##########
@@ -576,6 +578,19 @@ object GlutenConfig extends ConfigRegistry {
       }
       .foreach { case (k, v) => nativeConfMap.put(k, v) }
 
+    // When `orc.force.positional.evolution=true`, vanilla Spark maps ORC 
columns by
+    // position rather than by name (see OrcUtils.requestedColumnIds). The 
Velox ORC reader
+    // must do the same, otherwise name-based matching against a mismatched 
file schema
+    // reads columns back as null/empty. Override the (Velox) 
orcUseColumnNames session conf
+    // so native reads ORC by position too. Harmless for backends that ignore 
this key.
+    // String literal is used because gluten-substrait cannot depend on 
backends-velox.
+    if (
+      backendName == "velox" &&
+      conf.getOrElse(SPARK_ORC_FORCE_POSITIONAL_EVOLUTION, "false").toBoolean
+    ) {
+      
nativeConfMap.put("spark.gluten.sql.columnar.backend.velox.orcUseColumnNames", 
"false")
+    }

Review Comment:
   Using String#toBoolean here will throw IllegalArgumentException if the 
config value is anything other than a case-insensitive "true"/"false" (e.g., 
"1"), which can make native session conf construction unexpectedly fail. Prefer 
a non-throwing parse (e.g., java.lang.Boolean.parseBoolean or 
equalsIgnoreCase("true")).



##########
gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala:
##########
@@ -119,7 +119,38 @@ class GlutenHiveSQLQuerySuite extends 
GlutenHiveSQLQuerySuiteBase {
       purge = false)
   }
 
+  testGluten("orc.force.positional.evolution maps Hive ORC columns by 
position") {
+    val hiveClient: HiveClient =
+      
spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
+
+    withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "false") {
+      withTempDir {
+        dir =>
+          val orcLoc = s"file:///$dir/test_orc_pos"
+          withTable("test_orc_pos", "test_orc_pos_renamed") {
+            // Write ORC files whose physical column names are c1, c2 (c1 = 1, 
c2 = 2).
+            hiveClient.runSqlHive(
+              s"create table test_orc_pos(c1 int, c2 int) stored as orc 
location '$orcLoc'")

Review Comment:
   Both tables are created as managed tables while sharing the same explicit 
LOCATION. Hive may reject creating a second managed table on a non-empty 
location and/or may delete the underlying data when dropping the first table, 
which undermines the intent of “two tables over the SAME files”. Use EXTERNAL 
tables here to ensure the shared-location setup is reliable and the underlying 
files are preserved.



##########
backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala:
##########
@@ -94,7 +94,8 @@ class VeloxConfig(conf: SQLConf) extends GlutenConfig(conf) {
 
   def cudfShuffleMaxPrefetchBytes: Long = 
getConf(CUDF_SHUFFLE_MAX_PREFETCH_BYTES)
 
-  def orcUseColumnNames: Boolean = getConf(ORC_USE_COLUMN_NAMES)
+  def orcUseColumnNames: Boolean = getConf(ORC_USE_COLUMN_NAMES) &&
+    !conf.getConfString(GlutenConfig.SPARK_ORC_FORCE_POSITIONAL_EVOLUTION, 
"false").toBoolean
 

Review Comment:
   String#toBoolean throws on non "true"/"false" values. Since this key comes 
from a free-form Spark/Hadoop config string, consider using a non-throwing 
parse (e.g., java.lang.Boolean.parseBoolean) to avoid unexpected failures when 
users pass other truthy values.



##########
gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala:
##########
@@ -119,7 +119,38 @@ class GlutenHiveSQLQuerySuite extends 
GlutenHiveSQLQuerySuiteBase {
       purge = false)
   }
 
+  testGluten("orc.force.positional.evolution maps Hive ORC columns by 
position") {
+    val hiveClient: HiveClient =
+      
spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
+
+    withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "false") {
+      withTempDir {
+        dir =>
+          val orcLoc = s"file:///$dir/test_orc_pos"
+          withTable("test_orc_pos", "test_orc_pos_renamed") {
+            // Write ORC files whose physical column names are c1, c2 (c1 = 1, 
c2 = 2).
+            hiveClient.runSqlHive(
+              s"create table test_orc_pos(c1 int, c2 int) stored as orc 
location '$orcLoc'")
+            hiveClient.runSqlHive("insert into test_orc_pos select 1, 2")
+
+            // A second table over the SAME files but with mismatched column 
names (x, y).
+            // By name, x/y are not present in the files; only position 
mapping can read them.
+            hiveClient.runSqlHive(
+              s"create table test_orc_pos_renamed(x int, y int) stored as orc 
location '$orcLoc'")

Review Comment:
   This table is intended to point at the same ORC files as test_orc_pos. 
Creating it as a managed table at an existing/non-empty LOCATION can fail in 
Hive and/or lead to data deletion on drop. Declare it as an EXTERNAL table to 
make the shared-location behavior reliable.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to