This is an automated email from the ASF dual-hosted git repository.
rui pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new ab95d62ef3 [GLUTEN-11422][VL] Update ORC related Spark unit tests for
Spark 3.5+ (#11423)
ab95d62ef3 is described below
commit ab95d62ef34c341f87e92f84856f45997cce2755
Author: loudongfeng <[email protected]>
AuthorDate: Tue Feb 24 17:34:10 2026 +0800
[GLUTEN-11422][VL] Update ORC related Spark unit tests for Spark 3.5+
(#11423)
---
.../gluten/utils/velox/VeloxTestSettings.scala | 99 ++++------------------
.../datasources/orc/GlutenOrcEncryptionSuite.scala | 21 +++++
.../hive/execution/GlutenHiveSQLQuerySuite.scala | 70 +++++++++++++++
.../gluten/utils/velox/VeloxTestSettings.scala | 93 ++++----------------
.../hive/execution/GlutenHiveSQLQuerySuite.scala | 70 +++++++++++++++
.../gluten/utils/velox/VeloxTestSettings.scala | 93 ++++----------------
.../hive/execution/GlutenHiveSQLQuerySuite.scala | 70 +++++++++++++++
7 files changed, 278 insertions(+), 238 deletions(-)
diff --git
a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index b76a717e42..1207121da7 100644
---
a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++
b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -282,106 +282,36 @@ class VeloxTestSettings extends BackendTestSettings {
enableSuite[GlutenJsonLegacyTimeParserSuite]
enableSuite[GlutenValidateRequirementsSuite]
enableSuite[GlutenOrcColumnarBatchReaderSuite]
+ enableSuite[GlutenOrcEncryptionSuite]
+ // Orc encryption not supported yet
+ .exclude("Write and read an encrypted file")
+ .exclude("Write and read an encrypted table")
+ .exclude("SPARK-35325: Write and read encrypted nested columns")
+ .exclude("SPARK-35992: Write and read fully-encrypted columns with default
masking")
enableSuite[GlutenOrcFilterSuite]
.exclude("SPARK-32622: case sensitivity in predicate pushdown")
enableSuite[GlutenOrcPartitionDiscoverySuite]
- .exclude("read partitioned table - normal case")
- .exclude("read partitioned table - with nulls")
enableSuite[GlutenOrcV1PartitionDiscoverySuite]
- .exclude("read partitioned table - normal case")
- .exclude("read partitioned table - with nulls")
- .exclude("read partitioned table - partition key included in orc file")
- .exclude("read partitioned table - with nulls and partition keys are
included in Orc file")
enableSuite[GlutenOrcV1QuerySuite]
- // Rewrite to disable Spark's columnar reader.
- .exclude("Simple selection form ORC table")
- .exclude("simple select queries")
- .exclude("overwriting")
- .exclude("self-join")
- .exclude("columns only referenced by pushed down filters should remain")
- .exclude("SPARK-5309 strings stored using dictionary compression in orc")
- // For exception test.
+ // Expected exception org.apache.spark.sql.AnalysisException to be thrown
+ // , but no exception was thrown
.exclude("SPARK-20728 Make ORCFileFormat configurable between sql/hive and
sql/core")
- .exclude("Read/write binary data")
- .exclude("Read/write all types with non-primitive type")
- .exclude("Creating case class RDD table")
- .exclude("save and load case class RDD with `None`s as orc")
- .exclude("SPARK-16610: Respect orc.compress (i.e., OrcConf.COMPRESS) when"
+
- " compression is unset")
- .exclude("Compression options for writing to an ORC file (SNAPPY, ZLIB and
NONE)")
- .exclude("appending")
- .exclude("nested data - struct with array field")
- .exclude("nested data - array of struct")
- .exclude("SPARK-9170: Don't implicitly lowercase of user-provided columns")
- .exclude("SPARK-10623 Enable ORC PPD")
- .exclude("SPARK-14962 Produce correct results on array type with
isnotnull")
- .exclude("SPARK-15198 Support for pushing down filters for boolean types")
- .exclude("Support for pushing down filters for decimal types")
- .exclude("Support for pushing down filters for timestamp types")
- .exclude("column nullability and comment - write and then read")
- .exclude("Empty schema does not read data from ORC file")
- .exclude("read from multiple orc input paths")
+ // not supported ignoreCorruptFiles
.exclude("Enabling/disabling ignoreCorruptFiles")
- .exclude("SPARK-27160 Predicate pushdown correctness on DecimalType for
ORC")
- .exclude("LZO compression options for writing to an ORC file")
- .exclude("Schema discovery on empty ORC files")
- .exclude("SPARK-21791 ORC should support column names with dot")
- .exclude("SPARK-25579 ORC PPD should support column names with dot")
- .exclude("SPARK-34862: Support ORC vectorized reader for nested column")
- .exclude("SPARK-37728: Reading nested columns with ORC vectorized reader
should not")
- .exclude("SPARK-36594: ORC vectorized reader should properly check maximal
number of fields")
- .exclude("Read/write all timestamp types")
- .exclude("SPARK-37463: read/write Timestamp ntz to Orc with different time
zone")
- .exclude("SPARK-39381: Make vectorized orc columar writer batch size
configurable")
+ // Schema mismatch, From Kind: BIGINT, To Kind: VARCHAR
.exclude("SPARK-39830: Reading ORC table that requires type promotion may
throw AIOOBE")
enableSuite[GlutenOrcV2QuerySuite]
- .exclude("Read/write binary data")
- .exclude("Read/write all types with non-primitive type")
- // Rewrite to disable Spark's columnar reader.
- .exclude("Simple selection form ORC table")
- .exclude("Creating case class RDD table")
- .exclude("save and load case class RDD with `None`s as orc")
- .exclude("SPARK-16610: Respect orc.compress (i.e., OrcConf.COMPRESS) when
compression is unset")
- .exclude("Compression options for writing to an ORC file (SNAPPY, ZLIB and
NONE)")
- .exclude("appending")
- .exclude("nested data - struct with array field")
- .exclude("nested data - array of struct")
- .exclude("SPARK-9170: Don't implicitly lowercase of user-provided columns")
- .exclude("SPARK-10623 Enable ORC PPD")
- .exclude("SPARK-14962 Produce correct results on array type with
isnotnull")
- .exclude("SPARK-15198 Support for pushing down filters for boolean types")
- .exclude("Support for pushing down filters for decimal types")
- .exclude("Support for pushing down filters for timestamp types")
- .exclude("column nullability and comment - write and then read")
- .exclude("Empty schema does not read data from ORC file")
- .exclude("read from multiple orc input paths")
+// feature not supported
.exclude("Enabling/disabling ignoreCorruptFiles")
- .exclude("SPARK-27160 Predicate pushdown correctness on DecimalType for
ORC")
- .exclude("LZO compression options for writing to an ORC file")
- .exclude("Schema discovery on empty ORC files")
- .exclude("SPARK-21791 ORC should support column names with dot")
- .exclude("SPARK-25579 ORC PPD should support column names with dot")
- .exclude("SPARK-34862: Support ORC vectorized reader for nested column")
- .exclude("SPARK-37728: Reading nested columns with ORC vectorized reader
should not")
- .exclude("SPARK-36594: ORC vectorized reader should properly check maximal
number of fields")
- .exclude("Read/write all timestamp types")
- .exclude("SPARK-37463: read/write Timestamp ntz to Orc with different time
zone")
- .exclude("SPARK-39381: Make vectorized orc columar writer batch size
configurable")
+ // Schema mismatch, From Kind: BIGINT, To Kind: VARCHAR
.exclude("SPARK-39830: Reading ORC table that requires type promotion may
throw AIOOBE")
- .exclude("simple select queries")
- .exclude("overwriting")
- .exclude("self-join")
- .exclude("columns only referenced by pushed down filters should remain")
- .exclude("SPARK-5309 strings stored using dictionary compression in orc")
// For exception test.
.exclude("SPARK-20728 Make ORCFileFormat configurable between sql/hive and
sql/core")
enableSuite[GlutenOrcSourceSuite]
// Rewrite to disable Spark's columnar reader.
+ // date result mismatch
.exclude("SPARK-31238: compatibility with Spark 2.4 in reading dates")
.exclude("SPARK-31238, SPARK-31423: rebasing dates in write")
- .exclude("SPARK-31284: compatibility with Spark 2.4 in reading timestamps")
- .exclude("SPARK-31284, SPARK-31423: rebasing timestamps in write")
- .exclude("SPARK-34862: Support ORC vectorized reader for nested column")
// Ignored to disable vectorized reading check.
.exclude("SPARK-36594: ORC vectorized reader should properly check maximal
number of fields")
.exclude("create temporary orc table")
@@ -389,9 +319,9 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("appending insert")
.exclude("overwrite insert")
.exclude("SPARK-34897: Support reconcile schemas based on index after
nested column pruning")
+ // date result mismatch
.excludeGlutenTest("SPARK-31238: compatibility with Spark 2.4 in reading
dates")
.excludeGlutenTest("SPARK-31238, SPARK-31423: rebasing dates in write")
- .excludeGlutenTest("SPARK-34862: Support ORC vectorized reader for nested
column")
// exclude as struct not supported
.exclude("SPARK-36663: OrcUtils.toCatalystSchema should correctly handle a
column name which consists of only numbers")
.exclude("SPARK-37812: Reuse result row when deserializing a struct")
@@ -399,6 +329,7 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("SPARK-36931: Support reading and writing ANSI intervals
(spark.sql.orc.enableVectorizedReader=true,
spark.sql.orc.enableNestedColumnVectorizedReader=true)")
.exclude("SPARK-36931: Support reading and writing ANSI intervals
(spark.sql.orc.enableVectorizedReader=true,
spark.sql.orc.enableNestedColumnVectorizedReader=false)")
enableSuite[GlutenOrcV1FilterSuite]
+ // Expected exception org.apache.spark.SparkException to be thrown, but no
exception was thrown
.exclude("SPARK-32622: case sensitivity in predicate pushdown")
enableSuite[GlutenOrcV1SchemaPruningSuite]
enableSuite[GlutenOrcV2SchemaPruningSuite]
diff --git
a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/orc/GlutenOrcEncryptionSuite.scala
b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/orc/GlutenOrcEncryptionSuite.scala
new file mode 100644
index 0000000000..27664b02e3
--- /dev/null
+++
b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/orc/GlutenOrcEncryptionSuite.scala
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.execution.datasources.orc
+
+import org.apache.spark.sql.GlutenSQLTestsBaseTrait
+
+class GlutenOrcEncryptionSuite extends OrcEncryptionSuite with
GlutenSQLTestsBaseTrait {}
diff --git
a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
index f945b38ede..2c42fccd86 100644
---
a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
+++
b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
@@ -16,12 +16,18 @@
*/
package org.apache.spark.sql.hive.execution
+import org.apache.gluten.config.GlutenConfig
+import org.apache.gluten.execution.FileSourceScanExecTransformer
+
import org.apache.spark.SparkConf
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.hive.{HiveExternalCatalog,
HiveTableScanExecTransformer}
import org.apache.spark.sql.hive.client.HiveClient
+import scala.collection.immutable.Seq
+
class GlutenHiveSQLQuerySuite extends GlutenHiveSQLQuerySuiteBase {
override def sparkConf: SparkConf = {
@@ -49,6 +55,70 @@ class GlutenHiveSQLQuerySuite extends
GlutenHiveSQLQuerySuiteBase {
purge = false)
}
+ // copy from GLUTEN-4796, which only added to spark33
+ testGluten("Add orc char type validation") {
+ withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "false") {
+ sql("DROP TABLE IF EXISTS test_orc")
+ sql(
+ "CREATE TABLE test_orc (name char(10), id int)" +
+ " USING hive OPTIONS(fileFormat 'orc')")
+ sql("INSERT INTO test_orc VALUES('test', 1)")
+ }
+
+ def testExecPlan(
+ convertMetastoreOrc: String,
+ charTypeFallbackEnabled: String,
+ shouldFindTransformer: Boolean,
+ transformerClass: Class[_ <: SparkPlan]
+ ): Unit = {
+
+ withSQLConf(
+ "spark.sql.hive.convertMetastoreOrc" -> convertMetastoreOrc,
+ GlutenConfig.VELOX_FORCE_ORC_CHAR_TYPE_SCAN_FALLBACK.key ->
charTypeFallbackEnabled
+ ) {
+ val queries = Seq("select id from test_orc", "select name, id from
test_orc")
+
+ queries.foreach {
+ query =>
+ val executedPlan = getExecutedPlan(spark.sql(query))
+ val planCondition =
executedPlan.exists(_.find(transformerClass.isInstance).isDefined)
+
+ if (shouldFindTransformer) {
+ assert(planCondition)
+ } else {
+ assert(!planCondition)
+ }
+ }
+ }
+ }
+
+ testExecPlan(
+ "false",
+ "true",
+ shouldFindTransformer = false,
+ classOf[HiveTableScanExecTransformer])
+ testExecPlan(
+ "false",
+ "false",
+ shouldFindTransformer = true,
+ classOf[HiveTableScanExecTransformer])
+
+ testExecPlan(
+ "true",
+ "true",
+ shouldFindTransformer = false,
+ classOf[FileSourceScanExecTransformer])
+ testExecPlan(
+ "true",
+ "false",
+ shouldFindTransformer = true,
+ classOf[FileSourceScanExecTransformer])
+ spark.sessionState.catalog.dropTable(
+ TableIdentifier("test_orc"),
+ ignoreIfNotExists = true,
+ purge = false)
+ }
+
test("GLUTEN-11062: Supports mixed input format for partitioned Hive table")
{
val hiveClient: HiveClient =
spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
diff --git
a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 6e94d4cc0e..e94e958a0f 100644
---
a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++
b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -317,8 +317,6 @@ class VeloxTestSettings extends BackendTestSettings {
enableSuite[GlutenCSVParsingOptionsSuite]
// Generated suites for org.apache.spark.sql.execution.datasources.json
enableSuite[GlutenJsonParsingOptionsSuite]
- // Generated suites for org.apache.spark.sql.execution.datasources.orc
- enableSuite[GlutenOrcEncryptionSuite]
// Generated suites for org.apache.spark.sql.execution.datasources.parquet
enableSuite[GlutenParquetAvroCompatibilitySuite]
enableSuite[GlutenParquetCommitterSuite]
@@ -406,94 +404,34 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("SPARK-4228 DataFrame to JSON")
enableSuite[GlutenValidateRequirementsSuite]
enableSuite[GlutenOrcColumnarBatchReaderSuite]
+ enableSuite[GlutenOrcEncryptionSuite]
+ // Orc encryption not supported yet
+ .exclude("Write and read an encrypted file")
+ .exclude("Write and read an encrypted table")
+ .exclude("SPARK-35325: Write and read encrypted nested columns")
+ .exclude("SPARK-35992: Write and read fully-encrypted columns with default
masking")
enableSuite[GlutenOrcFilterSuite]
.exclude("SPARK-32622: case sensitivity in predicate pushdown")
enableSuite[GlutenOrcPartitionDiscoverySuite]
- .exclude("read partitioned table - normal case")
- .exclude("read partitioned table - with nulls")
enableSuite[GlutenOrcV1PartitionDiscoverySuite]
- .exclude("read partitioned table - normal case")
- .exclude("read partitioned table - with nulls")
- .exclude("read partitioned table - partition key included in orc file")
- .exclude("read partitioned table - with nulls and partition keys are
included in Orc file")
enableSuite[GlutenOrcV1QuerySuite]
- // For exception test.
+ // Expected exception org.apache.spark.sql.AnalysisException to be thrown
+ // , but no exception was thrown
.exclude("SPARK-20728 Make ORCFileFormat configurable between sql/hive and
sql/core")
- .exclude("Read/write binary data")
- .exclude("Read/write all types with non-primitive type")
- .exclude("Creating case class RDD table")
- .exclude("save and load case class RDD with `None`s as orc")
- .exclude("SPARK-16610: Respect orc.compress (i.e., OrcConf.COMPRESS) when"
+
- " compression is unset")
- .exclude("Compression options for writing to an ORC file (SNAPPY, ZLIB and
NONE)")
- .exclude("appending")
- .exclude("nested data - struct with array field")
- .exclude("nested data - array of struct")
- .exclude("SPARK-9170: Don't implicitly lowercase of user-provided columns")
- .exclude("SPARK-10623 Enable ORC PPD")
- .exclude("SPARK-14962 Produce correct results on array type with
isnotnull")
- .exclude("SPARK-15198 Support for pushing down filters for boolean types")
- .exclude("Support for pushing down filters for decimal types")
- .exclude("Support for pushing down filters for timestamp types")
- .exclude("column nullability and comment - write and then read")
- .exclude("Empty schema does not read data from ORC file")
- .exclude("read from multiple orc input paths")
+ // not supported ignoreCorruptFiles
.exclude("Enabling/disabling ignoreCorruptFiles")
- .exclude("SPARK-27160 Predicate pushdown correctness on DecimalType for
ORC")
- .exclude("LZO compression options for writing to an ORC file")
- .exclude("Schema discovery on empty ORC files")
- .exclude("SPARK-21791 ORC should support column names with dot")
- .exclude("SPARK-25579 ORC PPD should support column names with dot")
- .exclude("SPARK-34862: Support ORC vectorized reader for nested column")
- .exclude("SPARK-37728: Reading nested columns with ORC vectorized reader
should not")
- .exclude("SPARK-36594: ORC vectorized reader should properly check maximal
number of fields")
- .exclude("Read/write all timestamp types")
- .exclude("SPARK-37463: read/write Timestamp ntz to Orc with different time
zone")
- .exclude("SPARK-39381: Make vectorized orc columar writer batch size
configurable")
+ // Schema mismatch, From Kind: BIGINT, To Kind: VARCHAR
.exclude("SPARK-39830: Reading ORC table that requires type promotion may
throw AIOOBE")
enableSuite[GlutenOrcV2QuerySuite]
- .exclude("Read/write binary data")
- .exclude("Read/write all types with non-primitive type")
- // Rewrite to disable Spark's columnar reader.
- .exclude("Simple selection form ORC table")
- .exclude("Creating case class RDD table")
- .exclude("save and load case class RDD with `None`s as orc")
- .exclude("SPARK-16610: Respect orc.compress (i.e., OrcConf.COMPRESS) when
compression is unset")
- .exclude("Compression options for writing to an ORC file (SNAPPY, ZLIB and
NONE)")
- .exclude("appending")
- .exclude("nested data - struct with array field")
- .exclude("nested data - array of struct")
- .exclude("SPARK-9170: Don't implicitly lowercase of user-provided columns")
- .exclude("SPARK-10623 Enable ORC PPD")
- .exclude("SPARK-14962 Produce correct results on array type with
isnotnull")
- .exclude("SPARK-15198 Support for pushing down filters for boolean types")
- .exclude("Support for pushing down filters for decimal types")
- .exclude("Support for pushing down filters for timestamp types")
- .exclude("column nullability and comment - write and then read")
- .exclude("Empty schema does not read data from ORC file")
- .exclude("read from multiple orc input paths")
+ // feature not supported
.exclude("Enabling/disabling ignoreCorruptFiles")
- .exclude("SPARK-27160 Predicate pushdown correctness on DecimalType for
ORC")
- .exclude("LZO compression options for writing to an ORC file")
- .exclude("Schema discovery on empty ORC files")
- .exclude("SPARK-21791 ORC should support column names with dot")
- .exclude("SPARK-25579 ORC PPD should support column names with dot")
- .exclude("SPARK-34862: Support ORC vectorized reader for nested column")
- .exclude("SPARK-37728: Reading nested columns with ORC vectorized reader
should not")
- .exclude("SPARK-36594: ORC vectorized reader should properly check maximal
number of fields")
- .exclude("Read/write all timestamp types")
- .exclude("SPARK-37463: read/write Timestamp ntz to Orc with different time
zone")
- .exclude("SPARK-39381: Make vectorized orc columar writer batch size
configurable")
+ // Schema mismatch, From Kind: BIGINT, To Kind: VARCHAR
.exclude("SPARK-39830: Reading ORC table that requires type promotion may
throw AIOOBE")
- .exclude("simple select queries")
- .exclude("overwriting")
- .exclude("self-join")
- .exclude("columns only referenced by pushed down filters should remain")
- .exclude("SPARK-5309 strings stored using dictionary compression in orc")
// For exception test.
.exclude("SPARK-20728 Make ORCFileFormat configurable between sql/hive and
sql/core")
enableSuite[GlutenOrcSourceSuite]
- // https://github.com/apache/incubator-gluten/issues/11218
+ // Rewrite to disable Spark's columnar reader.
+ // date result mismatch
.exclude("SPARK-31238: compatibility with Spark 2.4 in reading dates")
.exclude("SPARK-31238, SPARK-31423: rebasing dates in write")
// Ignored to disable vectorized reading check.
@@ -503,9 +441,9 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("appending insert")
.exclude("overwrite insert")
.exclude("SPARK-34897: Support reconcile schemas based on index after
nested column pruning")
+ // date result mismatch
.excludeGlutenTest("SPARK-31238: compatibility with Spark 2.4 in reading
dates")
.excludeGlutenTest("SPARK-31238, SPARK-31423: rebasing dates in write")
- .excludeGlutenTest("SPARK-34862: Support ORC vectorized reader for nested
column")
// exclude as struct not supported
.exclude("SPARK-36663: OrcUtils.toCatalystSchema should correctly handle a
column name which consists of only numbers")
.exclude("SPARK-37812: Reuse result row when deserializing a struct")
@@ -513,6 +451,7 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("SPARK-36931: Support reading and writing ANSI intervals
(spark.sql.orc.enableVectorizedReader=true,
spark.sql.orc.enableNestedColumnVectorizedReader=true)")
.exclude("SPARK-36931: Support reading and writing ANSI intervals
(spark.sql.orc.enableVectorizedReader=true,
spark.sql.orc.enableNestedColumnVectorizedReader=false)")
enableSuite[GlutenOrcV1FilterSuite]
+ // Expected exception org.apache.spark.SparkException to be thrown, but no
exception was thrown
.exclude("SPARK-32622: case sensitivity in predicate pushdown")
enableSuite[GlutenOrcV1SchemaPruningSuite]
enableSuite[GlutenOrcV2SchemaPruningSuite]
diff --git
a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
index f945b38ede..2c42fccd86 100644
---
a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
+++
b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
@@ -16,12 +16,18 @@
*/
package org.apache.spark.sql.hive.execution
+import org.apache.gluten.config.GlutenConfig
+import org.apache.gluten.execution.FileSourceScanExecTransformer
+
import org.apache.spark.SparkConf
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.hive.{HiveExternalCatalog,
HiveTableScanExecTransformer}
import org.apache.spark.sql.hive.client.HiveClient
+import scala.collection.immutable.Seq
+
class GlutenHiveSQLQuerySuite extends GlutenHiveSQLQuerySuiteBase {
override def sparkConf: SparkConf = {
@@ -49,6 +55,70 @@ class GlutenHiveSQLQuerySuite extends
GlutenHiveSQLQuerySuiteBase {
purge = false)
}
+ // copy from GLUTEN-4796, which only added to spark33
+ testGluten("Add orc char type validation") {
+ withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "false") {
+ sql("DROP TABLE IF EXISTS test_orc")
+ sql(
+ "CREATE TABLE test_orc (name char(10), id int)" +
+ " USING hive OPTIONS(fileFormat 'orc')")
+ sql("INSERT INTO test_orc VALUES('test', 1)")
+ }
+
+ def testExecPlan(
+ convertMetastoreOrc: String,
+ charTypeFallbackEnabled: String,
+ shouldFindTransformer: Boolean,
+ transformerClass: Class[_ <: SparkPlan]
+ ): Unit = {
+
+ withSQLConf(
+ "spark.sql.hive.convertMetastoreOrc" -> convertMetastoreOrc,
+ GlutenConfig.VELOX_FORCE_ORC_CHAR_TYPE_SCAN_FALLBACK.key ->
charTypeFallbackEnabled
+ ) {
+ val queries = Seq("select id from test_orc", "select name, id from
test_orc")
+
+ queries.foreach {
+ query =>
+ val executedPlan = getExecutedPlan(spark.sql(query))
+ val planCondition =
executedPlan.exists(_.find(transformerClass.isInstance).isDefined)
+
+ if (shouldFindTransformer) {
+ assert(planCondition)
+ } else {
+ assert(!planCondition)
+ }
+ }
+ }
+ }
+
+ testExecPlan(
+ "false",
+ "true",
+ shouldFindTransformer = false,
+ classOf[HiveTableScanExecTransformer])
+ testExecPlan(
+ "false",
+ "false",
+ shouldFindTransformer = true,
+ classOf[HiveTableScanExecTransformer])
+
+ testExecPlan(
+ "true",
+ "true",
+ shouldFindTransformer = false,
+ classOf[FileSourceScanExecTransformer])
+ testExecPlan(
+ "true",
+ "false",
+ shouldFindTransformer = true,
+ classOf[FileSourceScanExecTransformer])
+ spark.sessionState.catalog.dropTable(
+ TableIdentifier("test_orc"),
+ ignoreIfNotExists = true,
+ purge = false)
+ }
+
test("GLUTEN-11062: Supports mixed input format for partitioned Hive table")
{
val hiveClient: HiveClient =
spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
diff --git
a/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
b/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 2b9e4555d7..0f2f10b24d 100644
---
a/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++
b/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -328,8 +328,6 @@ class VeloxTestSettings extends BackendTestSettings {
enableSuite[GlutenCSVParsingOptionsSuite]
// Generated suites for org.apache.spark.sql.execution.datasources.json
enableSuite[GlutenJsonParsingOptionsSuite]
- // Generated suites for org.apache.spark.sql.execution.datasources.orc
- enableSuite[GlutenOrcEncryptionSuite]
// Generated suites for org.apache.spark.sql.execution.datasources.parquet
enableSuite[GlutenParquetAvroCompatibilitySuite]
enableSuite[GlutenParquetCommitterSuite]
@@ -363,94 +361,34 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("SPARK-4228 DataFrame to JSON")
enableSuite[GlutenValidateRequirementsSuite]
enableSuite[GlutenOrcColumnarBatchReaderSuite]
+ enableSuite[GlutenOrcEncryptionSuite]
+ // Orc encryption not supported yet
+ .exclude("Write and read an encrypted file")
+ .exclude("Write and read an encrypted table")
+ .exclude("SPARK-35325: Write and read encrypted nested columns")
+ .exclude("SPARK-35992: Write and read fully-encrypted columns with default
masking")
enableSuite[GlutenOrcFilterSuite]
.exclude("SPARK-32622: case sensitivity in predicate pushdown")
enableSuite[GlutenOrcPartitionDiscoverySuite]
- .exclude("read partitioned table - normal case")
- .exclude("read partitioned table - with nulls")
enableSuite[GlutenOrcV1PartitionDiscoverySuite]
- .exclude("read partitioned table - normal case")
- .exclude("read partitioned table - with nulls")
- .exclude("read partitioned table - partition key included in orc file")
- .exclude("read partitioned table - with nulls and partition keys are
included in Orc file")
enableSuite[GlutenOrcV1QuerySuite]
- // For exception test.
+ // Expected exception org.apache.spark.sql.AnalysisException to be thrown
+ // , but no exception was thrown
.exclude("SPARK-20728 Make ORCFileFormat configurable between sql/hive and
sql/core")
- .exclude("Read/write binary data")
- .exclude("Read/write all types with non-primitive type")
- .exclude("Creating case class RDD table")
- .exclude("save and load case class RDD with `None`s as orc")
- .exclude("SPARK-16610: Respect orc.compress (i.e., OrcConf.COMPRESS) when"
+
- " compression is unset")
- .exclude("Compression options for writing to an ORC file (SNAPPY, ZLIB and
NONE)")
- .exclude("appending")
- .exclude("nested data - struct with array field")
- .exclude("nested data - array of struct")
- .exclude("SPARK-9170: Don't implicitly lowercase of user-provided columns")
- .exclude("SPARK-10623 Enable ORC PPD")
- .exclude("SPARK-14962 Produce correct results on array type with
isnotnull")
- .exclude("SPARK-15198 Support for pushing down filters for boolean types")
- .exclude("Support for pushing down filters for decimal types")
- .exclude("Support for pushing down filters for timestamp types")
- .exclude("column nullability and comment - write and then read")
- .exclude("Empty schema does not read data from ORC file")
- .exclude("read from multiple orc input paths")
+ // not supported ignoreCorruptFiles
.exclude("Enabling/disabling ignoreCorruptFiles")
- .exclude("SPARK-27160 Predicate pushdown correctness on DecimalType for
ORC")
- .exclude("LZO compression options for writing to an ORC file")
- .exclude("Schema discovery on empty ORC files")
- .exclude("SPARK-21791 ORC should support column names with dot")
- .exclude("SPARK-25579 ORC PPD should support column names with dot")
- .exclude("SPARK-34862: Support ORC vectorized reader for nested column")
- .exclude("SPARK-37728: Reading nested columns with ORC vectorized reader
should not")
- .exclude("SPARK-36594: ORC vectorized reader should properly check maximal
number of fields")
- .exclude("Read/write all timestamp types")
- .exclude("SPARK-37463: read/write Timestamp ntz to Orc with different time
zone")
- .exclude("SPARK-39381: Make vectorized orc columar writer batch size
configurable")
+ // Schema mismatch, From Kind: BIGINT, To Kind: VARCHAR
.exclude("SPARK-39830: Reading ORC table that requires type promotion may
throw AIOOBE")
enableSuite[GlutenOrcV2QuerySuite]
- .exclude("Read/write binary data")
- .exclude("Read/write all types with non-primitive type")
- // Rewrite to disable Spark's columnar reader.
- .exclude("Simple selection form ORC table")
- .exclude("Creating case class RDD table")
- .exclude("save and load case class RDD with `None`s as orc")
- .exclude("SPARK-16610: Respect orc.compress (i.e., OrcConf.COMPRESS) when
compression is unset")
- .exclude("Compression options for writing to an ORC file (SNAPPY, ZLIB and
NONE)")
- .exclude("appending")
- .exclude("nested data - struct with array field")
- .exclude("nested data - array of struct")
- .exclude("SPARK-9170: Don't implicitly lowercase of user-provided columns")
- .exclude("SPARK-10623 Enable ORC PPD")
- .exclude("SPARK-14962 Produce correct results on array type with
isnotnull")
- .exclude("SPARK-15198 Support for pushing down filters for boolean types")
- .exclude("Support for pushing down filters for decimal types")
- .exclude("Support for pushing down filters for timestamp types")
- .exclude("column nullability and comment - write and then read")
- .exclude("Empty schema does not read data from ORC file")
- .exclude("read from multiple orc input paths")
+ // feature not supported
.exclude("Enabling/disabling ignoreCorruptFiles")
- .exclude("SPARK-27160 Predicate pushdown correctness on DecimalType for
ORC")
- .exclude("LZO compression options for writing to an ORC file")
- .exclude("Schema discovery on empty ORC files")
- .exclude("SPARK-21791 ORC should support column names with dot")
- .exclude("SPARK-25579 ORC PPD should support column names with dot")
- .exclude("SPARK-34862: Support ORC vectorized reader for nested column")
- .exclude("SPARK-37728: Reading nested columns with ORC vectorized reader
should not")
- .exclude("SPARK-36594: ORC vectorized reader should properly check maximal
number of fields")
- .exclude("Read/write all timestamp types")
- .exclude("SPARK-37463: read/write Timestamp ntz to Orc with different time
zone")
- .exclude("SPARK-39381: Make vectorized orc columar writer batch size
configurable")
+ // Schema mismatch, From Kind: BIGINT, To Kind: VARCHAR
.exclude("SPARK-39830: Reading ORC table that requires type promotion may
throw AIOOBE")
- .exclude("simple select queries")
- .exclude("overwriting")
- .exclude("self-join")
- .exclude("columns only referenced by pushed down filters should remain")
- .exclude("SPARK-5309 strings stored using dictionary compression in orc")
// For exception test.
.exclude("SPARK-20728 Make ORCFileFormat configurable between sql/hive and
sql/core")
enableSuite[GlutenOrcSourceSuite]
- // https://github.com/apache/incubator-gluten/issues/11218
+ // Rewrite to disable Spark's columnar reader.
+ // date result mismatch
.exclude("SPARK-31238: compatibility with Spark 2.4 in reading dates")
.exclude("SPARK-31238, SPARK-31423: rebasing dates in write")
// Ignored to disable vectorized reading check.
@@ -460,9 +398,9 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("appending insert")
.exclude("overwrite insert")
.exclude("SPARK-34897: Support reconcile schemas based on index after
nested column pruning")
+ // date result mismatch
.excludeGlutenTest("SPARK-31238: compatibility with Spark 2.4 in reading
dates")
.excludeGlutenTest("SPARK-31238, SPARK-31423: rebasing dates in write")
- .excludeGlutenTest("SPARK-34862: Support ORC vectorized reader for nested
column")
// exclude as struct not supported
.exclude("SPARK-36663: OrcUtils.toCatalystSchema should correctly handle a
column name which consists of only numbers")
.exclude("SPARK-37812: Reuse result row when deserializing a struct")
@@ -470,6 +408,7 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("SPARK-36931: Support reading and writing ANSI intervals
(spark.sql.orc.enableVectorizedReader=true,
spark.sql.orc.enableNestedColumnVectorizedReader=true)")
.exclude("SPARK-36931: Support reading and writing ANSI intervals
(spark.sql.orc.enableVectorizedReader=true,
spark.sql.orc.enableNestedColumnVectorizedReader=false)")
enableSuite[GlutenOrcV1FilterSuite]
+ // Expected exception org.apache.spark.SparkException to be thrown, but no
exception was thrown
.exclude("SPARK-32622: case sensitivity in predicate pushdown")
enableSuite[GlutenOrcV1SchemaPruningSuite]
enableSuite[GlutenOrcV2SchemaPruningSuite]
diff --git
a/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
b/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
index f945b38ede..2c42fccd86 100644
---
a/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
+++
b/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
@@ -16,12 +16,18 @@
*/
package org.apache.spark.sql.hive.execution
+import org.apache.gluten.config.GlutenConfig
+import org.apache.gluten.execution.FileSourceScanExecTransformer
+
import org.apache.spark.SparkConf
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.hive.{HiveExternalCatalog,
HiveTableScanExecTransformer}
import org.apache.spark.sql.hive.client.HiveClient
+import scala.collection.immutable.Seq
+
class GlutenHiveSQLQuerySuite extends GlutenHiveSQLQuerySuiteBase {
override def sparkConf: SparkConf = {
@@ -49,6 +55,70 @@ class GlutenHiveSQLQuerySuite extends
GlutenHiveSQLQuerySuiteBase {
purge = false)
}
+ // copied from GLUTEN-4796, which was only added to spark33
+ testGluten("Add orc char type validation") {
+ withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "false") {
+ sql("DROP TABLE IF EXISTS test_orc")
+ sql(
+ "CREATE TABLE test_orc (name char(10), id int)" +
+ " USING hive OPTIONS(fileFormat 'orc')")
+ sql("INSERT INTO test_orc VALUES('test', 1)")
+ }
+
+ def testExecPlan(
+ convertMetastoreOrc: String,
+ charTypeFallbackEnabled: String,
+ shouldFindTransformer: Boolean,
+ transformerClass: Class[_ <: SparkPlan]
+ ): Unit = {
+
+ withSQLConf(
+ "spark.sql.hive.convertMetastoreOrc" -> convertMetastoreOrc,
+ GlutenConfig.VELOX_FORCE_ORC_CHAR_TYPE_SCAN_FALLBACK.key ->
charTypeFallbackEnabled
+ ) {
+ val queries = Seq("select id from test_orc", "select name, id from
test_orc")
+
+ queries.foreach {
+ query =>
+ val executedPlan = getExecutedPlan(spark.sql(query))
+ val planCondition =
executedPlan.exists(_.find(transformerClass.isInstance).isDefined)
+
+ if (shouldFindTransformer) {
+ assert(planCondition)
+ } else {
+ assert(!planCondition)
+ }
+ }
+ }
+ }
+
+ testExecPlan(
+ "false",
+ "true",
+ shouldFindTransformer = false,
+ classOf[HiveTableScanExecTransformer])
+ testExecPlan(
+ "false",
+ "false",
+ shouldFindTransformer = true,
+ classOf[HiveTableScanExecTransformer])
+
+ testExecPlan(
+ "true",
+ "true",
+ shouldFindTransformer = false,
+ classOf[FileSourceScanExecTransformer])
+ testExecPlan(
+ "true",
+ "false",
+ shouldFindTransformer = true,
+ classOf[FileSourceScanExecTransformer])
+ spark.sessionState.catalog.dropTable(
+ TableIdentifier("test_orc"),
+ ignoreIfNotExists = true,
+ purge = false)
+ }
+
test("GLUTEN-11062: Supports mixed input format for partitioned Hive table")
{
val hiveClient: HiveClient =
spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]