(incubator-gluten) branch main updated: [GLUTEN-8307][VL] Enable Int64 Timestamp Parquet reader (#8308)

rui Mon, 06 Jan 2025 21:47:07 -0800

This is an automated email from the ASF dual-hosted git repository.

rui pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git



The following commit(s) were added to refs/heads/main by this push:
     new 1b36edcb28 [GLUTEN-8307][VL] Enable Int64 Timestamp Parquet reader (#8308)
1b36edcb28 is described below

commit 1b36edcb2842d72ab8539c0f0c29d6fb1a86c422
Author: Mingliang Zhu <[email protected]>
AuthorDate: Tue Jan 7 13:46:57 2025 +0800

    [GLUTEN-8307][VL] Enable Int64 Timestamp Parquet reader (#8308)
---
 .../org/apache/gluten/backendsapi/velox/VeloxBackend.scala   |  8 +-------
 .../org/apache/gluten/utils/velox/VeloxTestSettings.scala    |  8 --------
 .../org/apache/gluten/utils/velox/VeloxTestSettings.scala    | 11 +----------
 .../org/apache/gluten/utils/velox/VeloxTestSettings.scala    | 12 +-----------
 .../org/apache/gluten/utils/velox/VeloxTestSettings.scala    | 12 +-----------
 .../main/scala/org/apache/gluten/config/GlutenConfig.scala   | 10 ----------
 6 files changed, 4 insertions(+), 57 deletions(-)

diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala
index 18c9efab39..4918a6eade 100644
--- a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala
+++ b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala
@@ -155,18 +155,12 @@ object VeloxBackendSettings extends BackendSettingsApi {
 
       format match {
         case ParquetReadFormat =>
-          val typeValidator: PartialFunction[StructField, String] = {
-            // Parquet timestamp is not fully supported yet
-            case StructField(_, TimestampType, _, _)
-                if GlutenConfig.get.forceParquetTimestampTypeScanFallbackEnabled =>
-              "TimestampType(force fallback)"
-          }
           val parquetOptions = new ParquetOptions(CaseInsensitiveMap(properties), SQLConf.get)
           if (parquetOptions.mergeSchema) {
             // https://github.com/apache/incubator-gluten/issues/7174
             Some(s"not support when merge schema is true")
           } else {
-            validateTypes(typeValidator)
+            None
           }
         case DwrfReadFormat => None
         case OrcReadFormat =>
diff --git a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index baed98729b..f5071d2f3f 100644
--- a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -917,8 +917,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("SPARK-35640: read binary as timestamp should throw schema 
incompatible error")
     // Exception msg.
     .exclude("SPARK-35640: int as long should throw schema incompatible error")
-    // Velox only support read Timestamp with INT96 for now.
-    .exclude("read dictionary and plain encoded timestamp_millis written as 
INT64")
   enableSuite[GlutenParquetV1PartitionDiscoverySuite]
   enableSuite[GlutenParquetV2PartitionDiscoverySuite]
   enableSuite[GlutenParquetProtobufCompatibilitySuite]
@@ -927,9 +925,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("Enabling/disabling ignoreCorruptFiles")
     // decimal failed ut
     .exclude("SPARK-34212 Parquet should read decimals correctly")
-    // Timestamp is read as INT96.
-    .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
-    .exclude("SPARK-10365 timestamp written and read as INT64 - 
TIMESTAMP_MICROS")
     // Rewrite because the filter after datasource is not needed.
     .exclude(
       "SPARK-26677: negated null-safe equality comparison should not filter 
matched row groups")
@@ -938,9 +933,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("Enabling/disabling ignoreCorruptFiles")
     // decimal failed ut
     .exclude("SPARK-34212 Parquet should read decimals correctly")
-    // Timestamp is read as INT96.
-    .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
-    .exclude("SPARK-10365 timestamp written and read as INT64 - 
TIMESTAMP_MICROS")
     // Rewrite because the filter after datasource is not needed.
     .exclude(
       "SPARK-26677: negated null-safe equality comparison should not filter 
matched row groups")
diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index d1f8b5b0c4..d3bc3846d8 100644
--- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -720,8 +720,7 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("SPARK-35640: read binary as timestamp should throw schema 
incompatible error")
     // Exception msg.
     .exclude("SPARK-35640: int as long should throw schema incompatible error")
-    // Velox only support read Timestamp with INT96 for now.
-    .exclude("read dictionary and plain encoded timestamp_millis written as 
INT64")
+    // Velox parquet reader not allow offset zero.
     .exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
   enableSuite[GlutenParquetV1PartitionDiscoverySuite]
   enableSuite[GlutenParquetV2PartitionDiscoverySuite]
@@ -731,10 +730,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("Enabling/disabling ignoreCorruptFiles")
     // decimal failed ut
     .exclude("SPARK-34212 Parquet should read decimals correctly")
-    // Timestamp is read as INT96.
-    .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
-    .exclude("SPARK-10365 timestamp written and read as INT64 - 
TIMESTAMP_MICROS")
-    .exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
     // new added in spark-3.3 and need fix later, random failure may caused by 
memory free
     .exclude("SPARK-39833: pushed filters with project without filter columns")
     .exclude("SPARK-39833: pushed filters with count()")
@@ -746,10 +741,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("Enabling/disabling ignoreCorruptFiles")
     // decimal failed ut
     .exclude("SPARK-34212 Parquet should read decimals correctly")
-    // Timestamp is read as INT96.
-    .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
-    .exclude("SPARK-10365 timestamp written and read as INT64 - 
TIMESTAMP_MICROS")
-    .exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
     // Rewrite because the filter after datasource is not needed.
     .exclude(
       "SPARK-26677: negated null-safe equality comparison should not filter 
matched row groups")
diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 1de6961192..cc9746dcdb 100644
--- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -715,9 +715,7 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("SPARK-35640: read binary as timestamp should throw schema 
incompatible error")
     // Exception msg.
     .exclude("SPARK-35640: int as long should throw schema incompatible error")
-    // Velox only support read Timestamp with INT96 for now.
-    .exclude("read dictionary and plain encoded timestamp_millis written as 
INT64")
-    .exclude("Read TimestampNTZ and TimestampLTZ for various logical TIMESTAMP 
types")
+    // Velox parquet reader not allow offset zero.
     .exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
   enableSuite[GlutenParquetV1PartitionDiscoverySuite]
   enableSuite[GlutenParquetV2PartitionDiscoverySuite]
@@ -728,10 +726,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("Enabling/disabling ignoreCorruptFiles")
     // decimal failed ut
     .exclude("SPARK-34212 Parquet should read decimals correctly")
-    // Timestamp is read as INT96.
-    .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
-    .exclude("SPARK-10365 timestamp written and read as INT64 - 
TIMESTAMP_MICROS")
-    .exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
     // new added in spark-3.3 and need fix later, random failure may caused by 
memory free
     .exclude("SPARK-39833: pushed filters with project without filter columns")
     .exclude("SPARK-39833: pushed filters with count()")
@@ -744,10 +738,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("Enabling/disabling ignoreCorruptFiles")
     // decimal failed ut
     .exclude("SPARK-34212 Parquet should read decimals correctly")
-    // Timestamp is read as INT96.
-    .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
-    .exclude("SPARK-10365 timestamp written and read as INT64 - 
TIMESTAMP_MICROS")
-    .exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
     // Rewrite because the filter after datasource is not needed.
     .exclude(
       "SPARK-26677: negated null-safe equality comparison should not filter 
matched row groups")
diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 2cf2f8ad31..71786c9132 100644
--- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -726,9 +726,7 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("SPARK-35640: read binary as timestamp should throw schema 
incompatible error")
     // Exception msg.
     .exclude("SPARK-35640: int as long should throw schema incompatible error")
-    // Velox only support read Timestamp with INT96 for now.
-    .exclude("read dictionary and plain encoded timestamp_millis written as 
INT64")
-    .exclude("Read TimestampNTZ and TimestampLTZ for various logical TIMESTAMP 
types")
+    // Velox parquet reader not allow offset zero.
     .exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
   enableSuite[GlutenParquetV1PartitionDiscoverySuite]
   enableSuite[GlutenParquetV2PartitionDiscoverySuite]
@@ -739,10 +737,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("Enabling/disabling ignoreCorruptFiles")
     // decimal failed ut
     .exclude("SPARK-34212 Parquet should read decimals correctly")
-    // Timestamp is read as INT96.
-    .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
-    .exclude("SPARK-10365 timestamp written and read as INT64 - 
TIMESTAMP_MICROS")
-    .exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
     // new added in spark-3.3 and need fix later, random failure may caused by 
memory free
     .exclude("SPARK-39833: pushed filters with project without filter columns")
     .exclude("SPARK-39833: pushed filters with count()")
@@ -755,10 +749,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("Enabling/disabling ignoreCorruptFiles")
     // decimal failed ut
     .exclude("SPARK-34212 Parquet should read decimals correctly")
-    // Timestamp is read as INT96.
-    .exclude("Migration from INT96 to TIMESTAMP_MICROS timestamp type")
-    .exclude("SPARK-10365 timestamp written and read as INT64 - 
TIMESTAMP_MICROS")
-    .exclude("SPARK-36182: read TimestampNTZ as TimestampLTZ")
     // Rewrite because the filter after datasource is not needed.
     .exclude(
       "SPARK-26677: negated null-safe equality comparison should not filter 
matched row groups")
diff --git a/shims/common/src/main/scala/org/apache/gluten/config/GlutenConfig.scala b/shims/common/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
index f6ed032734..cd01b1a42f 100644
--- a/shims/common/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
+++ b/shims/common/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
@@ -126,9 +126,6 @@ class GlutenConfig(conf: SQLConf) extends Logging {
   def forceOrcCharTypeScanFallbackEnabled: Boolean =
     conf.getConf(VELOX_FORCE_ORC_CHAR_TYPE_SCAN_FALLBACK)
 
-  def forceParquetTimestampTypeScanFallbackEnabled: Boolean =
-    conf.getConf(VELOX_FORCE_PARQUET_TIMESTAMP_TYPE_SCAN_FALLBACK)
-
   def scanFileSchemeValidationEnabled: Boolean =
     conf.getConf(VELOX_SCAN_FILE_SCHEME_VALIDATION_ENABLED)
 
@@ -2184,13 +2181,6 @@ object GlutenConfig {
       .booleanConf
       .createWithDefault(true)
 
-  val VELOX_FORCE_PARQUET_TIMESTAMP_TYPE_SCAN_FALLBACK =
-    buildConf("spark.gluten.sql.parquet.timestampType.scan.fallback.enabled")
-      .internal()
-      .doc("Force fallback for parquet timestamp type scan.")
-      .booleanConf
-      .createWithDefault(false)
-
   val VELOX_SCAN_FILE_SCHEME_VALIDATION_ENABLED =
     buildConf("spark.gluten.sql.scan.fileSchemeValidation.enabled")
       .internal()


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(incubator-gluten) branch main updated: [GLUTEN-8307][VL] Enable Int64 Timestamp Parquet reader (#8308)

Reply via email to