This is an automated email from the ASF dual-hosted git repository.

liuneng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new 352b91e5d [GLUTEN-6506][CH]Fix ORC read wrong timestamp value (#6507)
352b91e5d is described below

commit 352b91e5d3980b4244beeb3b15a33667da4112b3
Author: kevinyhzou <[email protected]>
AuthorDate: Mon Aug 12 11:14:47 2024 +0800

    [GLUTEN-6506][CH]Fix ORC read wrong timestamp value (#6507)
    
    What changes were proposed in this pull request?
    Pass the configured session timezone through to the ClickHouse native ORC
    reader (format_settings.orc.reader_time_zone_name), so timestamp values in
    ORC files are read with the correct time zone, and re-enable the previously
    excluded timestamp_field coverage in GlutenClickHouseNativeWriteTableSuite.
    
    (Fixes: #6506)
    
    related to pr: ClickHouse/ClickHouse#67175
    
    How was this patch tested?
    Tested by the new unit test "GLUTEN-6506: Orc read time zone" in
    GlutenClickHouseHiveTableSuite.
---
 .../resources/orc-data/test_reader_time_zone.snappy.orc  | Bin 0 -> 427 bytes
 .../execution/GlutenClickHouseHiveTableSuite.scala       |  11 +++++++++++
 .../GlutenClickHouseNativeWriteTableSuite.scala          |  15 ++++++---------
 .../Storages/SubstraitSource/ORCFormatFile.cpp           |   8 +++++++-
 4 files changed, 24 insertions(+), 10 deletions(-)

diff --git 
a/backends-clickhouse/src/test/resources/orc-data/test_reader_time_zone.snappy.orc
 
b/backends-clickhouse/src/test/resources/orc-data/test_reader_time_zone.snappy.orc
new file mode 100644
index 000000000..ab1b785db
Binary files /dev/null and 
b/backends-clickhouse/src/test/resources/orc-data/test_reader_time_zone.snappy.orc
 differ
diff --git 
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala
 
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala
index fc686056f..57fda7714 100644
--- 
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala
+++ 
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala
@@ -1303,4 +1303,15 @@ class GlutenClickHouseHiveTableSuite
       .mode(SaveMode.Overwrite)
       .save(dataPath)
   }
+
+  test("GLUTEN-6506: Orc read time zone") {
+    val dataPath = s"$basePath/orc-data/test_reader_time_zone.snappy.orc"
+    val create_table_sql = ("create table test_tbl_6506(" +
+      "id bigint, t timestamp) stored as orc location '%s'")
+      .format(dataPath)
+    val select_sql = "select * from test_tbl_6506"
+    spark.sql(create_table_sql)
+    compareResultsAgainstVanillaSpark(select_sql, true, _ => {})
+    spark.sql("drop table test_tbl_6506")
+  }
 }
diff --git 
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseNativeWriteTableSuite.scala
 
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseNativeWriteTableSuite.scala
index 11710a758..9f5dc4d3c 100644
--- 
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseNativeWriteTableSuite.scala
+++ 
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseNativeWriteTableSuite.scala
@@ -42,6 +42,10 @@ class GlutenClickHouseNativeWriteTableSuite
   private var _hiveSpark: SparkSession = _
 
   override protected def sparkConf: SparkConf = {
+    var sessionTimeZone = "GMT"
+    if (isSparkVersionGE("3.5")) {
+      sessionTimeZone = java.util.TimeZone.getDefault.getID
+    }
     new SparkConf()
       .set("spark.plugins", "org.apache.gluten.GlutenPlugin")
       .set("spark.memory.offHeap.enabled", "true")
@@ -65,6 +69,7 @@ class GlutenClickHouseNativeWriteTableSuite
       // TODO: support default ANSI policy
       .set("spark.sql.storeAssignmentPolicy", "legacy")
       .set("spark.sql.warehouse.dir", getWarehouseDir)
+      .set("spark.sql.session.timeZone", sessionTimeZone)
       .set("spark.gluten.sql.columnar.backend.ch.runtime_config.logger.level", 
"error")
       .setMaster("local[1]")
   }
@@ -623,20 +628,12 @@ class GlutenClickHouseNativeWriteTableSuite
       ("date_field", "date"),
       ("timestamp_field", "timestamp")
     )
-    def excludeTimeFieldForORC(format: String): Seq[String] = {
-      if (format.equals("orc") && isSparkVersionGE("3.5")) {
-        // FIXME:https://github.com/apache/incubator-gluten/pull/6507
-        fields.keys.filterNot(_.equals("timestamp_field")).toSeq
-      } else {
-        fields.keys.toSeq
-      }
-    }
     val origin_table = "origin_table"
     withSource(genTestData(), origin_table) {
       nativeWrite {
         format =>
           val table_name = table_name_template.format(format)
-          val testFields = excludeTimeFieldForORC(format)
+          val testFields = fields.keys.toSeq
           writeAndCheckRead(origin_table, table_name, testFields, 
isSparkVersionLE("3.3")) {
             fields =>
               spark
diff --git a/cpp-ch/local-engine/Storages/SubstraitSource/ORCFormatFile.cpp 
b/cpp-ch/local-engine/Storages/SubstraitSource/ORCFormatFile.cpp
index 1c5701075..66556e237 100644
--- a/cpp-ch/local-engine/Storages/SubstraitSource/ORCFormatFile.cpp
+++ b/cpp-ch/local-engine/Storages/SubstraitSource/ORCFormatFile.cpp
@@ -24,6 +24,7 @@
 #    include <Processors/Formats/Impl/ArrowBufferedStreams.h>
 #    include <Processors/Formats/Impl/NativeORCBlockInputFormat.h>
 #    include <Storages/SubstraitSource/OrcUtil.h>
+#    include <Common/CHUtil.h>
 
 namespace local_engine
 {
@@ -67,7 +68,12 @@ FormatFile::InputFormatPtr 
ORCFormatFile::createInputFormat(const DB::Block & he
         std::back_inserter(skip_stripe_indices));
 
     format_settings.orc.skip_stripes = 
std::unordered_set<int>(skip_stripe_indices.begin(), skip_stripe_indices.end());
-
+    if (context->getConfigRef().has("timezone"))
+    {
+        const String config_timezone = 
context->getConfigRef().getString("timezone");
+        const String mapped_timezone = 
DateTimeUtil::convertTimeZone(config_timezone);
+        format_settings.orc.reader_time_zone_name = mapped_timezone;
+    }
     auto input_format = 
std::make_shared<DB::NativeORCBlockInputFormat>(*file_format->read_buffer, 
header, format_settings);
     file_format->input = input_format;
     return file_format;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to