This is an automated email from the ASF dual-hosted git repository.
rui pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 1b36edcb28 [GLUTEN-8307][VL] Enable Int64 Timestamp Parquet reader (#8308)
1b36edcb28 is described below
commit 1b36edcb2842d72ab8539c0f0c29d6fb1a86c422
Author: Mingliang Zhu <[email protected]>
AuthorDate: Tue Jan 7 13:46:57 2025 +0800
[GLUTEN-8307][VL] Enable Int64 Timestamp Parquet reader (#8308)
---
.../org/apache/gluten/backendsapi/velox/VeloxBackend.scala | 8 +-------
.../spark32/.../gluten/utils/velox/VeloxTestSettings.scala | 8 --------
.../spark33/.../gluten/utils/velox/VeloxTestSettings.scala | 11 +----------
.../spark34/.../gluten/utils/velox/VeloxTestSettings.scala | 12 +-----------
.../spark35/.../gluten/utils/velox/VeloxTestSettings.scala | 12 +-----------
.../main/scala/org/apache/gluten/config/GlutenConfig.scala | 10 ----------
6 files changed, 4 insertions(+), 57 deletions(-)
diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala
index 18c9efab39..4918a6eade 100644
--- a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala
+++ b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala
@@ -155,18 +155,12 @@ object VeloxBackendSettings extends BackendSettingsApi {
format match {
case ParquetReadFormat =>
- val typeValidator: PartialFunction[StructField, String] = {
- // Parquet timestamp is not fully supported yet
- case StructField(_, TimestampType, _, _)
- if GlutenConfig.get.forceParquetTimestampTypeScanFallbackEnabled =>
- "TimestampType(force fallback)"
- }
val parquetOptions = new ParquetOptions(CaseInsensitiveMap(properties), SQLConf.get)
if (parquetOptions.mergeSchema) {
// https://github.com/apache/incubator-gluten/issues/7174
Some(s"not support when merge schema is true")
} else {
- validateTypes(typeValidator)
+ None
}
case DwrfReadFormat => None
case OrcReadFormat =>
diff --git a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index baed98729b..f5071d2f3f 100644
--- a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -917,8 +917,6 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("SPARK-35640: read binary as timestamp should throw schema incompatible error")
// Exception msg.
.exclude("SPARK-35640: int as long should throw schema incompatible error")
- // Velox only support read Timestamp with INT96 for now.
- .exclude("read dictionary and plain encoded timestamp_millis written as INT64")
enableSuite[GlutenParquetV1PartitionDiscoverySuite]
enableSuite[GlutenParquetV2PartitionDiscoverySuite]
enableSuite[GlutenParquetProtobufCompatibilitySuite]
@@ -927,9 +925,6 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("Enabling/disabling ignoreCorruptFiles")
// decimal failed ut
.exclude("SPARK-34212 Parquet should read decimals correctly")
- // Timestamp is read as INT96.
- .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
- .exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
// Rewrite because the filter after datasource is not needed.
.exclude(
"SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
@@ -938,9 +933,6 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("Enabling/disabling ignoreCorruptFiles")
// decimal failed ut
.exclude("SPARK-34212 Parquet should read decimals correctly")
- // Timestamp is read as INT96.
- .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
- .exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
// Rewrite because the filter after datasource is not needed.
.exclude(
"SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index d1f8b5b0c4..d3bc3846d8 100644
--- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -720,8 +720,7 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("SPARK-35640: read binary as timestamp should throw schema incompatible error")
// Exception msg.
.exclude("SPARK-35640: int as long should throw schema incompatible error")
- // Velox only support read Timestamp with INT96 for now.
- .exclude("read dictionary and plain encoded timestamp_millis written as INT64")
+ // Velox parquet reader not allow offset zero.
.exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
enableSuite[GlutenParquetV1PartitionDiscoverySuite]
enableSuite[GlutenParquetV2PartitionDiscoverySuite]
@@ -731,10 +730,6 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("Enabling/disabling ignoreCorruptFiles")
// decimal failed ut
.exclude("SPARK-34212 Parquet should read decimals correctly")
- // Timestamp is read as INT96.
- .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
- .exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
- .exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
// new added in spark-3.3 and need fix later, random failure may caused by memory free
.exclude("SPARK-39833: pushed filters with project without filter columns")
.exclude("SPARK-39833: pushed filters with count()")
@@ -746,10 +741,6 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("Enabling/disabling ignoreCorruptFiles")
// decimal failed ut
.exclude("SPARK-34212 Parquet should read decimals correctly")
- // Timestamp is read as INT96.
- .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
- .exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
- .exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
// Rewrite because the filter after datasource is not needed.
.exclude(
"SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 1de6961192..cc9746dcdb 100644
--- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -715,9 +715,7 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("SPARK-35640: read binary as timestamp should throw schema incompatible error")
// Exception msg.
.exclude("SPARK-35640: int as long should throw schema incompatible error")
- // Velox only support read Timestamp with INT96 for now.
- .exclude("read dictionary and plain encoded timestamp_millis written as INT64")
- .exclude("Read TimestampNTZ and TimestampLTZ for various logical TIMESTAMP types")
+ // Velox parquet reader not allow offset zero.
.exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
enableSuite[GlutenParquetV1PartitionDiscoverySuite]
enableSuite[GlutenParquetV2PartitionDiscoverySuite]
@@ -728,10 +726,6 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("Enabling/disabling ignoreCorruptFiles")
// decimal failed ut
.exclude("SPARK-34212 Parquet should read decimals correctly")
- // Timestamp is read as INT96.
- .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
- .exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
- .exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
// new added in spark-3.3 and need fix later, random failure may caused by memory free
.exclude("SPARK-39833: pushed filters with project without filter columns")
.exclude("SPARK-39833: pushed filters with count()")
@@ -744,10 +738,6 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("Enabling/disabling ignoreCorruptFiles")
// decimal failed ut
.exclude("SPARK-34212 Parquet should read decimals correctly")
- // Timestamp is read as INT96.
- .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
- .exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
- .exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
// Rewrite because the filter after datasource is not needed.
.exclude(
"SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 2cf2f8ad31..71786c9132 100644
--- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -726,9 +726,7 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("SPARK-35640: read binary as timestamp should throw schema incompatible error")
// Exception msg.
.exclude("SPARK-35640: int as long should throw schema incompatible error")
- // Velox only support read Timestamp with INT96 for now.
- .exclude("read dictionary and plain encoded timestamp_millis written as INT64")
- .exclude("Read TimestampNTZ and TimestampLTZ for various logical TIMESTAMP types")
+ // Velox parquet reader not allow offset zero.
.exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
enableSuite[GlutenParquetV1PartitionDiscoverySuite]
enableSuite[GlutenParquetV2PartitionDiscoverySuite]
@@ -739,10 +737,6 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("Enabling/disabling ignoreCorruptFiles")
// decimal failed ut
.exclude("SPARK-34212 Parquet should read decimals correctly")
- // Timestamp is read as INT96.
- .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
- .exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
- .exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
// new added in spark-3.3 and need fix later, random failure may caused by memory free
.exclude("SPARK-39833: pushed filters with project without filter columns")
.exclude("SPARK-39833: pushed filters with count()")
@@ -755,10 +749,6 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("Enabling/disabling ignoreCorruptFiles")
// decimal failed ut
.exclude("SPARK-34212 Parquet should read decimals correctly")
- // Timestamp is read as INT96.
- .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
- .exclude("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS")
- .exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
// Rewrite because the filter after datasource is not needed.
.exclude(
"SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
diff --git a/shims/common/src/main/scala/org/apache/gluten/config/GlutenConfig.scala b/shims/common/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
index f6ed032734..cd01b1a42f 100644
--- a/shims/common/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
+++ b/shims/common/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
@@ -126,9 +126,6 @@ class GlutenConfig(conf: SQLConf) extends Logging {
def forceOrcCharTypeScanFallbackEnabled: Boolean =
conf.getConf(VELOX_FORCE_ORC_CHAR_TYPE_SCAN_FALLBACK)
- def forceParquetTimestampTypeScanFallbackEnabled: Boolean =
- conf.getConf(VELOX_FORCE_PARQUET_TIMESTAMP_TYPE_SCAN_FALLBACK)
-
def scanFileSchemeValidationEnabled: Boolean =
conf.getConf(VELOX_SCAN_FILE_SCHEME_VALIDATION_ENABLED)
@@ -2184,13 +2181,6 @@ object GlutenConfig {
.booleanConf
.createWithDefault(true)
- val VELOX_FORCE_PARQUET_TIMESTAMP_TYPE_SCAN_FALLBACK =
- buildConf("spark.gluten.sql.parquet.timestampType.scan.fallback.enabled")
- .internal()
- .doc("Force fallback for parquet timestamp type scan.")
- .booleanConf
- .createWithDefault(false)
-
val VELOX_SCAN_FILE_SCHEME_VALIDATION_ENABLED =
buildConf("spark.gluten.sql.scan.fileSchemeValidation.enabled")
.internal()
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]