This is an automated email from the ASF dual-hosted git repository.
rui pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new ab95d62ef3 [GLUTEN-11422][VL] Update ORC related Spark unit tests for
Spark 3.5+ (#11423)
ab95d62ef3 is described below
commit ab95d62ef34c341f87e92f84856f45997cce2755
Author: loudongfeng <[email protected]>
AuthorDate: Tue Feb 24 17:34:10 2026 +0800
[GLUTEN-11422][VL] Update ORC related Spark unit tests for Spark 3.5+
(#11423)
---
.../gluten/utils/velox/VeloxTestSettings.scala | 99 ++++------------------
.../datasources/orc/GlutenOrcEncryptionSuite.scala | 21 +++++
.../hive/execution/GlutenHiveSQLQuerySuite.scala | 70 +++++++++++++++
.../gluten/utils/velox/VeloxTestSettings.scala | 93 ++++----------------
.../hive/execution/GlutenHiveSQLQuerySuite.scala | 70 +++++++++++++++
.../gluten/utils/velox/VeloxTestSettings.scala | 93 ++++----------------
.../hive/execution/GlutenHiveSQLQuerySuite.scala | 70 +++++++++++++++
7 files changed, 278 insertions(+), 238 deletions(-)
diff --git
a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index b76a717e42..1207121da7 100644
---
a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++
b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -282,106 +282,36 @@ class VeloxTestSettings extends BackendTestSettings {
enableSuite[GlutenJsonLegacyTimeParserSuite]
enableSuite[GlutenValidateRequirementsSuite]
enableSuite[GlutenOrcColumnarBatchReaderSuite]
+ enableSuite[GlutenOrcEncryptionSuite]
+ // Orc encryption not supported yet
+ .exclude("Write and read an encrypted file")
+ .exclude("Write and read an encrypted table")
+ .exclude("SPARK-35325: Write and read encrypted nested columns")
+ .exclude("SPARK-35992: Write and read fully-encrypted columns with default
masking")
enableSuite[GlutenOrcFilterSuite]
.exclude("SPARK-32622: case sensitivity in predicate pushdown")
enableSuite[GlutenOrcPartitionDiscoverySuite]
- .exclude("read partitioned table - normal case")
- .exclude("read partitioned table - with nulls")
enableSuite[GlutenOrcV1PartitionDiscoverySuite]
- .exclude("read partitioned table - normal case")
- .exclude("read partitioned table - with nulls")
- .exclude("read partitioned table - partition key included in orc file")
- .exclude("read partitioned table - with nulls and partition keys are
included in Orc file")
enableSuite[GlutenOrcV1QuerySuite]
- // Rewrite to disable Spark's columnar reader.
- .exclude("Simple selection form ORC table")
- .exclude("simple select queries")
- .exclude("overwriting")
- .exclude("self-join")
- .exclude("columns only referenced by pushed down filters should remain")
- .exclude("SPARK-5309 strings stored using dictionary compression in orc")
- // For exception test.
+ // Expected exception org.apache.spark.sql.AnalysisException to be thrown
+ // , but no exception was thrown
.exclude("SPARK-20728 Make ORCFileFormat configurable between sql/hive and
sql/core")
- .exclude("Read/write binary data")
- .exclude("Read/write all types with non-primitive type")
- .exclude("Creating case class RDD table")
- .exclude("save and load case class RDD with `None`s as orc")
- .exclude("SPARK-16610: Respect orc.compress (i.e., OrcConf.COMPRESS) when"
+
- " compression is unset")
- .exclude("Compression options for writing to an ORC file (SNAPPY, ZLIB and
NONE)")
- .exclude("appending")
- .exclude("nested data - struct with array field")
- .exclude("nested data - array of struct")
- .exclude("SPARK-9170: Don't implicitly lowercase of user-provided columns")
- .exclude("SPARK-10623 Enable ORC PPD")
- .exclude("SPARK-14962 Produce correct results on array type with
isnotnull")
- .exclude("SPARK-15198 Support for pushing down filters for boolean types")
- .exclude("Support for pushing down filters for decimal types")
- .exclude("Support for pushing down filters for timestamp types")
- .exclude("column nullability and comment - write and then read")
- .exclude("Empty schema does not read data from ORC file")
- .exclude("read from multiple orc input paths")
+ // not supported ignoreCorruptFiles
.exclude("Enabling/disabling ignoreCorruptFiles")
- .exclude("SPARK-27160 Predicate pushdown correctness on DecimalType for
ORC")
- .exclude("LZO compression options for writing to an ORC file")
- .exclude("Schema discovery on empty ORC files")
- .exclude("SPARK-21791 ORC should support column names with dot")
- .exclude("SPARK-25579 ORC PPD should support column names with dot")
- .exclude("SPARK-34862: Support ORC vectorized reader for nested column")
- .exclude("SPARK-37728: Reading nested columns with ORC vectorized reader
should not")
- .exclude("SPARK-36594: ORC vectorized reader should properly check maximal
number of fields")
- .exclude("Read/write all timestamp types")
- .exclude("SPARK-37463: read/write Timestamp ntz to Orc with different time
zone")
- .exclude("SPARK-39381: Make vectorized orc columar writer batch size
configurable")
+ // Schema mismatch, From Kind: BIGINT, To Kind: VARCHAR
.exclude("SPARK-39830: Reading ORC table that requires type promotion may
throw AIOOBE")
enableSuite[GlutenOrcV2QuerySuite]
- .exclude("Read/write binary data")
- .exclude("Read/write all types with non-primitive type")
- // Rewrite to disable Spark's columnar reader.
- .exclude("Simple selection form ORC table")
- .exclude("Creating case class RDD table")
- .exclude("save and load case class RDD with `None`s as orc")
- .exclude("SPARK-16610: Respect orc.compress (i.e., OrcConf.COMPRESS) when
compression is unset")
- .exclude("Compression options for writing to an ORC file (SNAPPY, ZLIB and
NONE)")
- .exclude("appending")
- .exclude("nested data - struct with array field")
- .exclude("nested data - array of struct")
- .exclude("SPARK-9170: Don't implicitly lowercase of user-provided columns")
- .exclude("SPARK-10623 Enable ORC PPD")
- .exclude("SPARK-14962 Produce correct results on array type with
isnotnull")
- .exclude("SPARK-15198 Support for pushing down filters for boolean types")
- .exclude("Support for pushing down filters for decimal types")
- .exclude("Support for pushing down filters for timestamp types")
- .exclude("column nullability and comment - write and then read")
- .exclude("Empty schema does not read data from ORC file")
- .exclude("read from multiple orc input paths")
+// feature not supported
.exclude("Enabling/disabling ignoreCorruptFiles")
- .exclude("SPARK-27160 Predicate pushdown correctness on DecimalType for
ORC")
- .exclude("LZO compression options for writing to an ORC file")
- .exclude("Schema discovery on empty ORC files")
- .exclude("SPARK-21791 ORC should support column names with dot")
- .exclude("SPARK-25579 ORC PPD should support column names with dot")
- .exclude("SPARK-34862: Support ORC vectorized reader for nested column")
- .exclude("SPARK-37728: Reading nested columns with ORC vectorized reader
should not")
- .exclude("SPARK-36594: ORC vectorized reader should properly check maximal
number of fields")
- .exclude("Read/write all timestamp types")
- .exclude("SPARK-37463: read/write Timestamp ntz to Orc with different time
zone")
- .exclude("SPARK-39381: Make vectorized orc columar writer batch size
configurable")
+ // Schema mismatch, From Kind: BIGINT, To Kind: VARCHAR
.exclude("SPARK-39830: Reading ORC table that requires type promotion may
throw AIOOBE")
- .exclude("simple select queries")
- .exclude("overwriting")
- .exclude("self-join")
- .exclude("columns only referenced by pushed down filters should remain")
- .exclude("SPARK-5309 strings stored using dictionary compression in orc")
// For exception test.
.exclude("SPARK-20728 Make ORCFileFormat configurable between sql/hive and
sql/core")
enableSuite[GlutenOrcSourceSuite]
// Rewrite to disable Spark's columnar reader.
+ // date result mismatch
.exclude("SPARK-31238: compatibility with Spark 2.4 in reading dates")
.exclude("SPARK-31238, SPARK-31423: rebasing dates in write")
- .exclude("SPARK-31284: compatibility with Spark 2.4 in reading timestamps")
- .exclude("SPARK-31284, SPARK-31423: rebasing timestamps in write")
- .exclude("SPARK-34862: Support ORC vectorized reader for nested column")
// Ignored to disable vectorized reading check.
.exclude("SPARK-36594: ORC vectorized reader should properly check maximal
number of fields")
.exclude("create temporary orc table")
@@ -389,9 +319,9 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("appending insert")
.exclude("overwrite insert")
.exclude("SPARK-34897: Support reconcile schemas based on index after
nested column pruning")
+ // date result mismatch
.excludeGlutenTest("SPARK-31238: compatibility with Spark 2.4 in reading
dates")
.excludeGlutenTest("SPARK-31238, SPARK-31423: rebasing dates in write")
- .excludeGlutenTest("SPARK-34862: Support ORC vectorized reader for nested
column")
// exclude as struct not supported
.exclude("SPARK-36663: OrcUtils.toCatalystSchema should correctly handle a
column name which consists of only numbers")
.exclude("SPARK-37812: Reuse result row when deserializing a struct")
@@ -399,6 +329,7 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("SPARK-36931: Support reading and writing ANSI intervals
(spark.sql.orc.enableVectorizedReader=true,
spark.sql.orc.enableNestedColumnVectorizedReader=true)")
.exclude("SPARK-36931: Support reading and writing ANSI intervals
(spark.sql.orc.enableVectorizedReader=true,
spark.sql.orc.enableNestedColumnVectorizedReader=false)")
enableSuite[GlutenOrcV1FilterSuite]
+ // Expected exception org.apache.spark.SparkException to be thrown, but no
exception was thrown
.exclude("SPARK-32622: case sensitivity in predicate pushdown")
enableSuite[GlutenOrcV1SchemaPruningSuite]
enableSuite[GlutenOrcV2SchemaPruningSuite]
diff --git
a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/orc/GlutenOrcEncryptionSuite.scala
b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/orc/GlutenOrcEncryptionSuite.scala
new file mode 100644
index 0000000000..27664b02e3
--- /dev/null
+++
b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/orc/GlutenOrcEncryptionSuite.scala
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.execution.datasources.orc
+
+import org.apache.spark.sql.GlutenSQLTestsBaseTrait
+
+class GlutenOrcEncryptionSuite extends OrcEncryptionSuite with
GlutenSQLTestsBaseTrait {}
diff --git
a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
index f945b38ede..2c42fccd86 100644
---
a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
+++
b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
@@ -16,12 +16,18 @@
*/
package org.apache.spark.sql.hive.execution
+import org.apache.gluten.config.GlutenConfig
+import org.apache.gluten.execution.FileSourceScanExecTransformer
+
import org.apache.spark.SparkConf
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.hive.{HiveExternalCatalog,
HiveTableScanExecTransformer}
import org.apache.spark.sql.hive.client.HiveClient
+import scala.collection.immutable.Seq
+
class GlutenHiveSQLQuerySuite extends GlutenHiveSQLQuerySuiteBase {
override def sparkConf: SparkConf = {
@@ -49,6 +55,70 @@ class GlutenHiveSQLQuerySuite extends
GlutenHiveSQLQuerySuiteBase {
purge = false)
}
+ // copy from GLUTEN-4796, which only added to spark33
+ testGluten("Add orc char type validation") {
+ withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "false") {
+ sql("DROP TABLE IF EXISTS test_orc")
+ sql(
+ "CREATE TABLE test_orc (name char(10), id int)" +
+ " USING hive OPTIONS(fileFormat 'orc')")
+ sql("INSERT INTO test_orc VALUES('test', 1)")
+ }
+
+ def testExecPlan(
+ convertMetastoreOrc: String,
+ charTypeFallbackEnabled: String,
+ shouldFindTransformer: Boolean,
+ transformerClass: Class[_ <: SparkPlan]
+ ): Unit = {
+
+ withSQLConf(
+ "spark.sql.hive.convertMetastoreOrc" -> convertMetastoreOrc,
+ GlutenConfig.VELOX_FORCE_ORC_CHAR_TYPE_SCAN_FALLBACK.key ->
charTypeFallbackEnabled
+ ) {
+ val queries = Seq("select id from test_orc", "select name, id from
test_orc")
+
+ queries.foreach {
+ query =>
+ val executedPlan = getExecutedPlan(spark.sql(query))
+ val planCondition =
executedPlan.exists(_.find(transformerClass.isInstance).isDefined)
+
+ if (shouldFindTransformer) {
+ assert(planCondition)
+ } else {
+ assert(!planCondition)
+ }
+ }
+ }
+ }
+
+ testExecPlan(
+ "false",
+ "true",
+ shouldFindTransformer = false,
+ classOf[HiveTableScanExecTransformer])
+ testExecPlan(
+ "false",
+ "false",
+ shouldFindTransformer = true,
+ classOf[HiveTableScanExecTransformer])
+
+ testExecPlan(
+ "true",
+ "true",
+ shouldFindTransformer = false,
+ classOf[FileSourceScanExecTransformer])
+ testExecPlan(
+ "true",
+ "false",
+ shouldFindTransformer = true,
+ classOf[FileSourceScanExecTransformer])
+ spark.sessionState.catalog.dropTable(
+ TableIdentifier("test_orc"),
+ ignoreIfNotExists = true,
+ purge = false)
+ }
+
test("GLUTEN-11062: Supports mixed input format for partitioned Hive table")
{
val hiveClient: HiveClient =
spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
diff --git
a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 6e94d4cc0e..e94e958a0f 100644
---
a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++
b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -317,8 +317,6 @@ class VeloxTestSettings extends BackendTestSettings {
enableSuite[GlutenCSVParsingOptionsSuite]
// Generated suites for org.apache.spark.sql.execution.datasources.json
enableSuite[GlutenJsonParsingOptionsSuite]
- // Generated suites for org.apache.spark.sql.execution.datasources.orc
- enableSuite[GlutenOrcEncryptionSuite]
// Generated suites for org.apache.spark.sql.execution.datasources.parquet
enableSuite[GlutenParquetAvroCompatibilitySuite]
enableSuite[GlutenParquetCommitterSuite]
@@ -406,94 +404,34 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("SPARK-4228 DataFrame to JSON")
enableSuite[GlutenValidateRequirementsSuite]
enableSuite[GlutenOrcColumnarBatchReaderSuite]
+ enableSuite[GlutenOrcEncryptionSuite]
+ // Orc encryption not supported yet
+ .exclude("Write and read an encrypted file")
+ .exclude("Write and read an encrypted table")
+ .exclude("SPARK-35325: Write and read encrypted nested columns")
+ .exclude("SPARK-35992: Write and read fully-encrypted columns with default
masking")
enableSuite[GlutenOrcFilterSuite]
.exclude("SPARK-32622: case sensitivity in predicate pushdown")
enableSuite[GlutenOrcPartitionDiscoverySuite]
- .exclude("read partitioned table - normal case")
- .exclude("read partitioned table - with nulls")
enableSuite[GlutenOrcV1PartitionDiscoverySuite]
- .exclude("read partitioned table - normal case")
- .exclude("read partitioned table - with nulls")
- .exclude("read partitioned table - partition key included in orc file")
- .exclude("read partitioned table - with nulls and partition keys are
included in Orc file")
enableSuite[GlutenOrcV1QuerySuite]
- // For exception test.
+ // Expected exception org.apache.spark.sql.AnalysisException to be thrown
+ // , but no exception was thrown
.exclude("SPARK-20728 Make ORCFileFormat configurable between sql/hive and
sql/core")
- .exclude("Read/write binary data")
- .exclude("Read/write all types with non-primitive type")
- .exclude("Creating case class RDD table")
- .exclude("save and load case class RDD with `None`s as orc")
- .exclude("SPARK-16610: Respect orc.compress (i.e., OrcConf.COMPRESS) when"
+
- " compression is unset")
- .exclude("Compression options for writing to an ORC file (SNAPPY, ZLIB and
NONE)")
- .exclude("appending")
- .exclude("nested data - struct with array field")
- .exclude("nested data - array of struct")
- .exclude("SPARK-9170: Don't implicitly lowercase of user-provided columns")
- .exclude("SPARK-10623 Enable ORC PPD")
- .exclude("SPARK-14962 Produce correct results on array type with
isnotnull")
- .exclude("SPARK-15198 Support for pushing down filters for boolean types")
- .exclude("Support for pushing down filters for decimal types")
- .exclude("Support for pushing down filters for timestamp types")
- .exclude("column nullability and comment - write and then read")
- .exclude("Empty schema does not read data from ORC file")
- .exclude("read from multiple orc input paths")
+ // not supported ignoreCorruptFiles
.exclude("Enabling/disabling ignoreCorruptFiles")
- .exclude("SPARK-27160 Predicate pushdown correctness on DecimalType for
ORC")
- .exclude("LZO compression options for writing to an ORC file")
- .exclude("Schema discovery on empty ORC files")
- .exclude("SPARK-21791 ORC should support column names with dot")
- .exclude("SPARK-25579 ORC PPD should support column names with dot")
- .exclude("SPARK-34862: Support ORC vectorized reader for nested column")
- .exclude("SPARK-37728: Reading nested columns with ORC vectorized reader
should not")
- .exclude("SPARK-36594: ORC vectorized reader should properly check maximal
number of fields")
- .exclude("Read/write all timestamp types")
- .exclude("SPARK-37463: read/write Timestamp ntz to Orc with different time
zone")
- .exclude("SPARK-39381: Make vectorized orc columar writer batch size
configurable")
+ // Schema mismatch, From Kind: BIGINT, To Kind: VARCHAR
.exclude("SPARK-39830: Reading ORC table that requires type promotion may
throw AIOOBE")
enableSuite[GlutenOrcV2QuerySuite]
- .exclude("Read/write binary data")
- .exclude("Read/write all types with non-primitive type")
- // Rewrite to disable Spark's columnar reader.
- .exclude("Simple selection form ORC table")
- .exclude("Creating case class RDD table")
- .exclude("save and load case class RDD with `None`s as orc")
- .exclude("SPARK-16610: Respect orc.compress (i.e., OrcConf.COMPRESS) when
compression is unset")
- .exclude("Compression options for writing to an ORC file (SNAPPY, ZLIB and
NONE)")
- .exclude("appending")
- .exclude("nested data - struct with array field")
- .exclude("nested data - array of struct")
- .exclude("SPARK-9170: Don't implicitly lowercase of user-provided columns")
- .exclude("SPARK-10623 Enable ORC PPD")
- .exclude("SPARK-14962 Produce correct results on array type with
isnotnull")
- .exclude("SPARK-15198 Support for pushing down filters for boolean types")
- .exclude("Support for pushing down filters for decimal types")
- .exclude("Support for pushing down filters for timestamp types")
- .exclude("column nullability and comment - write and then read")
- .exclude("Empty schema does not read data from ORC file")
- .exclude("read from multiple orc input paths")
+ // feature not supported
.exclude("Enabling/disabling ignoreCorruptFiles")
- .exclude("SPARK-27160 Predicate pushdown correctness on DecimalType for
ORC")
- .exclude("LZO compression options for writing to an ORC file")
- .exclude("Schema discovery on empty ORC files")
- .exclude("SPARK-21791 ORC should support column names with dot")
- .exclude("SPARK-25579 ORC PPD should support column names with dot")
- .exclude("SPARK-34862: Support ORC vectorized reader for nested column")
- .exclude("SPARK-37728: Reading nested columns with ORC vectorized reader
should not")
- .exclude("SPARK-36594: ORC vectorized reader should properly check maximal
number of fields")
- .exclude("Read/write all timestamp types")
- .exclude("SPARK-37463: read/write Timestamp ntz to Orc with different time
zone")
- .exclude("SPARK-39381: Make vectorized orc columar writer batch size
configurable")
+ // Schema mismatch, From Kind: BIGINT, To Kind: VARCHAR
.exclude("SPARK-39830: Reading ORC table that requires type promotion may
throw AIOOBE")
- .exclude("simple select queries")
- .exclude("overwriting")
- .exclude("self-join")
- .exclude("columns only referenced by pushed down filters should remain")
- .exclude("SPARK-5309 strings stored using dictionary compression in orc")
// For exception test.
.exclude("SPARK-20728 Make ORCFileFormat configurable between sql/hive and
sql/core")
enableSuite[GlutenOrcSourceSuite]
- // https://github.com/apache/incubator-gluten/issues/11218
+ // Rewrite to disable Spark's columnar reader.
+ // date result mismatch
.exclude("SPARK-31238: compatibility with Spark 2.4 in reading dates")
.exclude("SPARK-31238, SPARK-31423: rebasing dates in write")
// Ignored to disable vectorized reading check.
@@ -503,9 +441,9 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("appending insert")
.exclude("overwrite insert")
.exclude("SPARK-34897: Support reconcile schemas based on index after
nested column pruning")
+ // date result mismatch
.excludeGlutenTest("SPARK-31238: compatibility with Spark 2.4 in reading
dates")
.excludeGlutenTest("SPARK-31238, SPARK-31423: rebasing dates in write")
- .excludeGlutenTest("SPARK-34862: Support ORC vectorized reader for nested
column")
// exclude as struct not supported
.exclude("SPARK-36663: OrcUtils.toCatalystSchema should correctly handle a
column name which consists of only numbers")
.exclude("SPARK-37812: Reuse result row when deserializing a struct")
@@ -513,6 +451,7 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("SPARK-36931: Support reading and writing ANSI intervals
(spark.sql.orc.enableVectorizedReader=true,
spark.sql.orc.enableNestedColumnVectorizedReader=true)")
.exclude("SPARK-36931: Support reading and writing ANSI intervals
(spark.sql.orc.enableVectorizedReader=true,
spark.sql.orc.enableNestedColumnVectorizedReader=false)")
enableSuite[GlutenOrcV1FilterSuite]
+ // Expected exception org.apache.spark.SparkException to be thrown, but no
exception was thrown
.exclude("SPARK-32622: case sensitivity in predicate pushdown")
enableSuite[GlutenOrcV1SchemaPruningSuite]
enableSuite[GlutenOrcV2SchemaPruningSuite]
diff --git
a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
index f945b38ede..2c42fccd86 100644
---
a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
+++
b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
@@ -16,12 +16,18 @@
*/
package org.apache.spark.sql.hive.execution
+import org.apache.gluten.config.GlutenConfig
+import org.apache.gluten.execution.FileSourceScanExecTransformer
+
import org.apache.spark.SparkConf
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.hive.{HiveExternalCatalog,
HiveTableScanExecTransformer}
import org.apache.spark.sql.hive.client.HiveClient
+import scala.collection.immutable.Seq
+
class GlutenHiveSQLQuerySuite extends GlutenHiveSQLQuerySuiteBase {
override def sparkConf: SparkConf = {
@@ -49,6 +55,70 @@ class GlutenHiveSQLQuerySuite extends
GlutenHiveSQLQuerySuiteBase {
purge = false)
}
+ // copy from GLUTEN-4796, which only added to spark33
+ testGluten("Add orc char type validation") {
+ withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "false") {
+ sql("DROP TABLE IF EXISTS test_orc")
+ sql(
+ "CREATE TABLE test_orc (name char(10), id int)" +
+ " USING hive OPTIONS(fileFormat 'orc')")
+ sql("INSERT INTO test_orc VALUES('test', 1)")
+ }
+
+ def testExecPlan(
+ convertMetastoreOrc: String,
+ charTypeFallbackEnabled: String,
+ shouldFindTransformer: Boolean,
+ transformerClass: Class[_ <: SparkPlan]
+ ): Unit = {
+
+ withSQLConf(
+ "spark.sql.hive.convertMetastoreOrc" -> convertMetastoreOrc,
+ GlutenConfig.VELOX_FORCE_ORC_CHAR_TYPE_SCAN_FALLBACK.key ->
charTypeFallbackEnabled
+ ) {
+ val queries = Seq("select id from test_orc", "select name, id from
test_orc")
+
+ queries.foreach {
+ query =>
+ val executedPlan = getExecutedPlan(spark.sql(query))
+ val planCondition =
executedPlan.exists(_.find(transformerClass.isInstance).isDefined)
+
+ if (shouldFindTransformer) {
+ assert(planCondition)
+ } else {
+ assert(!planCondition)
+ }
+ }
+ }
+ }
+
+ testExecPlan(
+ "false",
+ "true",
+ shouldFindTransformer = false,
+ classOf[HiveTableScanExecTransformer])
+ testExecPlan(
+ "false",
+ "false",
+ shouldFindTransformer = true,
+ classOf[HiveTableScanExecTransformer])
+
+ testExecPlan(
+ "true",
+ "true",
+ shouldFindTransformer = false,
+ classOf[FileSourceScanExecTransformer])
+ testExecPlan(
+ "true",
+ "false",
+ shouldFindTransformer = true,
+ classOf[FileSourceScanExecTransformer])
+ spark.sessionState.catalog.dropTable(
+ TableIdentifier("test_orc"),
+ ignoreIfNotExists = true,
+ purge = false)
+ }
+
test("GLUTEN-11062: Supports mixed input format for partitioned Hive table")
{
val hiveClient: HiveClient =
spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
diff --git
a/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
b/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 2b9e4555d7..0f2f10b24d 100644
---
a/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++
b/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -328,8 +328,6 @@ class VeloxTestSettings extends BackendTestSettings {
enableSuite[GlutenCSVParsingOptionsSuite]
// Generated suites for org.apache.spark.sql.execution.datasources.json
enableSuite[GlutenJsonParsingOptionsSuite]
- // Generated suites for org.apache.spark.sql.execution.datasources.orc
- enableSuite[GlutenOrcEncryptionSuite]
// Generated suites for org.apache.spark.sql.execution.datasources.parquet
enableSuite[GlutenParquetAvroCompatibilitySuite]
enableSuite[GlutenParquetCommitterSuite]
@@ -363,94 +361,34 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("SPARK-4228 DataFrame to JSON")
enableSuite[GlutenValidateRequirementsSuite]
enableSuite[GlutenOrcColumnarBatchReaderSuite]
+ enableSuite[GlutenOrcEncryptionSuite]
+ // Orc encryption not supported yet
+ .exclude("Write and read an encrypted file")
+ .exclude("Write and read an encrypted table")
+ .exclude("SPARK-35325: Write and read encrypted nested columns")
+ .exclude("SPARK-35992: Write and read fully-encrypted columns with default
masking")
enableSuite[GlutenOrcFilterSuite]
.exclude("SPARK-32622: case sensitivity in predicate pushdown")
enableSuite[GlutenOrcPartitionDiscoverySuite]
- .exclude("read partitioned table - normal case")
- .exclude("read partitioned table - with nulls")
enableSuite[GlutenOrcV1PartitionDiscoverySuite]
- .exclude("read partitioned table - normal case")
- .exclude("read partitioned table - with nulls")
- .exclude("read partitioned table - partition key included in orc file")
- .exclude("read partitioned table - with nulls and partition keys are
included in Orc file")
enableSuite[GlutenOrcV1QuerySuite]
- // For exception test.
+ // Expected exception org.apache.spark.sql.AnalysisException to be thrown
+ // , but no exception was thrown
.exclude("SPARK-20728 Make ORCFileFormat configurable between sql/hive and
sql/core")
- .exclude("Read/write binary data")
- .exclude("Read/write all types with non-primitive type")
- .exclude("Creating case class RDD table")
- .exclude("save and load case class RDD with `None`s as orc")
- .exclude("SPARK-16610: Respect orc.compress (i.e., OrcConf.COMPRESS) when"
+
- " compression is unset")
- .exclude("Compression options for writing to an ORC file (SNAPPY, ZLIB and
NONE)")
- .exclude("appending")
- .exclude("nested data - struct with array field")
- .exclude("nested data - array of struct")
- .exclude("SPARK-9170: Don't implicitly lowercase of user-provided columns")
- .exclude("SPARK-10623 Enable ORC PPD")
- .exclude("SPARK-14962 Produce correct results on array type with
isnotnull")
- .exclude("SPARK-15198 Support for pushing down filters for boolean types")
- .exclude("Support for pushing down filters for decimal types")
- .exclude("Support for pushing down filters for timestamp types")
- .exclude("column nullability and comment - write and then read")
- .exclude("Empty schema does not read data from ORC file")
- .exclude("read from multiple orc input paths")
+ // not supported ignoreCorruptFiles
.exclude("Enabling/disabling ignoreCorruptFiles")
- .exclude("SPARK-27160 Predicate pushdown correctness on DecimalType for
ORC")
- .exclude("LZO compression options for writing to an ORC file")
- .exclude("Schema discovery on empty ORC files")
- .exclude("SPARK-21791 ORC should support column names with dot")
- .exclude("SPARK-25579 ORC PPD should support column names with dot")
- .exclude("SPARK-34862: Support ORC vectorized reader for nested column")
- .exclude("SPARK-37728: Reading nested columns with ORC vectorized reader
should not")
- .exclude("SPARK-36594: ORC vectorized reader should properly check maximal
number of fields")
- .exclude("Read/write all timestamp types")
- .exclude("SPARK-37463: read/write Timestamp ntz to Orc with different time
zone")
- .exclude("SPARK-39381: Make vectorized orc columar writer batch size
configurable")
+ // Schema mismatch, From Kind: BIGINT, To Kind: VARCHAR
.exclude("SPARK-39830: Reading ORC table that requires type promotion may
throw AIOOBE")
enableSuite[GlutenOrcV2QuerySuite]
- .exclude("Read/write binary data")
- .exclude("Read/write all types with non-primitive type")
- // Rewrite to disable Spark's columnar reader.
- .exclude("Simple selection form ORC table")
- .exclude("Creating case class RDD table")
- .exclude("save and load case class RDD with `None`s as orc")
- .exclude("SPARK-16610: Respect orc.compress (i.e., OrcConf.COMPRESS) when
compression is unset")
- .exclude("Compression options for writing to an ORC file (SNAPPY, ZLIB and
NONE)")
- .exclude("appending")
- .exclude("nested data - struct with array field")
- .exclude("nested data - array of struct")
- .exclude("SPARK-9170: Don't implicitly lowercase of user-provided columns")
- .exclude("SPARK-10623 Enable ORC PPD")
- .exclude("SPARK-14962 Produce correct results on array type with
isnotnull")
- .exclude("SPARK-15198 Support for pushing down filters for boolean types")
- .exclude("Support for pushing down filters for decimal types")
- .exclude("Support for pushing down filters for timestamp types")
- .exclude("column nullability and comment - write and then read")
- .exclude("Empty schema does not read data from ORC file")
- .exclude("read from multiple orc input paths")
+ // feature not supported
.exclude("Enabling/disabling ignoreCorruptFiles")
- .exclude("SPARK-27160 Predicate pushdown correctness on DecimalType for
ORC")
- .exclude("LZO compression options for writing to an ORC file")
- .exclude("Schema discovery on empty ORC files")
- .exclude("SPARK-21791 ORC should support column names with dot")
- .exclude("SPARK-25579 ORC PPD should support column names with dot")
- .exclude("SPARK-34862: Support ORC vectorized reader for nested column")
- .exclude("SPARK-37728: Reading nested columns with ORC vectorized reader
should not")
- .exclude("SPARK-36594: ORC vectorized reader should properly check maximal
number of fields")
- .exclude("Read/write all timestamp types")
- .exclude("SPARK-37463: read/write Timestamp ntz to Orc with different time
zone")
- .exclude("SPARK-39381: Make vectorized orc columar writer batch size
configurable")
+ // Schema mismatch, From Kind: BIGINT, To Kind: VARCHAR
.exclude("SPARK-39830: Reading ORC table that requires type promotion may
throw AIOOBE")
- .exclude("simple select queries")
- .exclude("overwriting")
- .exclude("self-join")
- .exclude("columns only referenced by pushed down filters should remain")
- .exclude("SPARK-5309 strings stored using dictionary compression in orc")
// For exception test.
.exclude("SPARK-20728 Make ORCFileFormat configurable between sql/hive and
sql/core")
enableSuite[GlutenOrcSourceSuite]
- // https://github.com/apache/incubator-gluten/issues/11218
+ // Rewrite to disable Spark's columnar reader.
+ // date result mismatch
.exclude("SPARK-31238: compatibility with Spark 2.4 in reading dates")
.exclude("SPARK-31238, SPARK-31423: rebasing dates in write")
// Ignored to disable vectorized reading check.
@@ -460,9 +398,9 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("appending insert")
.exclude("overwrite insert")
.exclude("SPARK-34897: Support reconcile schemas based on index after
nested column pruning")
+ // date result mismatch
.excludeGlutenTest("SPARK-31238: compatibility with Spark 2.4 in reading
dates")
.excludeGlutenTest("SPARK-31238, SPARK-31423: rebasing dates in write")
- .excludeGlutenTest("SPARK-34862: Support ORC vectorized reader for nested
column")
// exclude as struct not supported
.exclude("SPARK-36663: OrcUtils.toCatalystSchema should correctly handle a
column name which consists of only numbers")
.exclude("SPARK-37812: Reuse result row when deserializing a struct")
@@ -470,6 +408,7 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("SPARK-36931: Support reading and writing ANSI intervals
(spark.sql.orc.enableVectorizedReader=true,
spark.sql.orc.enableNestedColumnVectorizedReader=true)")
.exclude("SPARK-36931: Support reading and writing ANSI intervals
(spark.sql.orc.enableVectorizedReader=true,
spark.sql.orc.enableNestedColumnVectorizedReader=false)")
enableSuite[GlutenOrcV1FilterSuite]
+ // Expected exception org.apache.spark.SparkException to be thrown, but no
exception was thrown
.exclude("SPARK-32622: case sensitivity in predicate pushdown")
enableSuite[GlutenOrcV1SchemaPruningSuite]
enableSuite[GlutenOrcV2SchemaPruningSuite]
diff --git
a/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
b/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
index f945b38ede..2c42fccd86 100644
---
a/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
+++
b/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
@@ -16,12 +16,18 @@
*/
package org.apache.spark.sql.hive.execution
+import org.apache.gluten.config.GlutenConfig
+import org.apache.gluten.execution.FileSourceScanExecTransformer
+
import org.apache.spark.SparkConf
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.hive.{HiveExternalCatalog,
HiveTableScanExecTransformer}
import org.apache.spark.sql.hive.client.HiveClient
+import scala.collection.immutable.Seq
+
class GlutenHiveSQLQuerySuite extends GlutenHiveSQLQuerySuiteBase {
override def sparkConf: SparkConf = {
@@ -49,6 +55,70 @@ class GlutenHiveSQLQuerySuite extends
GlutenHiveSQLQuerySuiteBase {
purge = false)
}
+ // copied from GLUTEN-4796, which was only added to spark33
+ testGluten("Add orc char type validation") {
+ withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "false") {
+ sql("DROP TABLE IF EXISTS test_orc")
+ sql(
+ "CREATE TABLE test_orc (name char(10), id int)" +
+ " USING hive OPTIONS(fileFormat 'orc')")
+ sql("INSERT INTO test_orc VALUES('test', 1)")
+ }
+
+ def testExecPlan(
+ convertMetastoreOrc: String,
+ charTypeFallbackEnabled: String,
+ shouldFindTransformer: Boolean,
+ transformerClass: Class[_ <: SparkPlan]
+ ): Unit = {
+
+ withSQLConf(
+ "spark.sql.hive.convertMetastoreOrc" -> convertMetastoreOrc,
+ GlutenConfig.VELOX_FORCE_ORC_CHAR_TYPE_SCAN_FALLBACK.key ->
charTypeFallbackEnabled
+ ) {
+ val queries = Seq("select id from test_orc", "select name, id from
test_orc")
+
+ queries.foreach {
+ query =>
+ val executedPlan = getExecutedPlan(spark.sql(query))
+ val planCondition =
executedPlan.exists(_.find(transformerClass.isInstance).isDefined)
+
+ if (shouldFindTransformer) {
+ assert(planCondition)
+ } else {
+ assert(!planCondition)
+ }
+ }
+ }
+ }
+
+ testExecPlan(
+ "false",
+ "true",
+ shouldFindTransformer = false,
+ classOf[HiveTableScanExecTransformer])
+ testExecPlan(
+ "false",
+ "false",
+ shouldFindTransformer = true,
+ classOf[HiveTableScanExecTransformer])
+
+ testExecPlan(
+ "true",
+ "true",
+ shouldFindTransformer = false,
+ classOf[FileSourceScanExecTransformer])
+ testExecPlan(
+ "true",
+ "false",
+ shouldFindTransformer = true,
+ classOf[FileSourceScanExecTransformer])
+ spark.sessionState.catalog.dropTable(
+ TableIdentifier("test_orc"),
+ ignoreIfNotExists = true,
+ purge = false)
+ }
+
test("GLUTEN-11062: Supports mixed input format for partitioned Hive table")
{
val hiveClient: HiveClient =
spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]