This is an automated email from the ASF dual-hosted git repository.
liuneng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 352b91e5d [GLUTEN-6506][CH] Fix ORC read wrong timestamp value (#6507)
352b91e5d is described below
commit 352b91e5d3980b4244beeb3b15a33667da4112b3
Author: kevinyhzou <[email protected]>
AuthorDate: Mon Aug 12 11:14:47 2024 +0800
[GLUTEN-6506][CH] Fix ORC read wrong timestamp value (#6507)
What changes were proposed in this pull request?
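In ORCFormatFile::createInputFormat, when the context config has a "timezone" entry, map it through DateTimeUtil::convertTimeZone and pass it to the ORC reader as format_settings.orc.reader_time_zone_name. Also re-enable the timestamp field for ORC in GlutenClickHouseNativeWriteTableSuite.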
(Fixes: #6506)
Related to PR: ClickHouse/ClickHouse#67175
How was this patch tested?
Tested by unit tests (new test "GLUTEN-6506: Orc read time zone" in GlutenClickHouseHiveTableSuite).
---
.../resources/orc-data/test_reader_time_zone.snappy.orc | Bin 0 -> 427 bytes
.../execution/GlutenClickHouseHiveTableSuite.scala | 11 +++++++++++
.../GlutenClickHouseNativeWriteTableSuite.scala | 15 ++++++---------
.../Storages/SubstraitSource/ORCFormatFile.cpp | 8 +++++++-
4 files changed, 24 insertions(+), 10 deletions(-)
diff --git
a/backends-clickhouse/src/test/resources/orc-data/test_reader_time_zone.snappy.orc
b/backends-clickhouse/src/test/resources/orc-data/test_reader_time_zone.snappy.orc
new file mode 100644
index 000000000..ab1b785db
Binary files /dev/null and
b/backends-clickhouse/src/test/resources/orc-data/test_reader_time_zone.snappy.orc
differ
diff --git
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala
index fc686056f..57fda7714 100644
---
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala
+++
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala
@@ -1303,4 +1303,15 @@ class GlutenClickHouseHiveTableSuite
.mode(SaveMode.Overwrite)
.save(dataPath)
}
+
+ test("GLUTEN-6506: Orc read time zone") {
+ val dataPath = s"$basePath/orc-data/test_reader_time_zone.snappy.orc"
+ val create_table_sql = ("create table test_tbl_6506(" +
+ "id bigint, t timestamp) stored as orc location '%s'")
+ .format(dataPath)
+ val select_sql = "select * from test_tbl_6506"
+ spark.sql(create_table_sql)
+ compareResultsAgainstVanillaSpark(select_sql, true, _ => {})
+ spark.sql("drop table test_tbl_6506")
+ }
}
diff --git
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseNativeWriteTableSuite.scala
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseNativeWriteTableSuite.scala
index 11710a758..9f5dc4d3c 100644
---
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseNativeWriteTableSuite.scala
+++
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseNativeWriteTableSuite.scala
@@ -42,6 +42,10 @@ class GlutenClickHouseNativeWriteTableSuite
private var _hiveSpark: SparkSession = _
override protected def sparkConf: SparkConf = {
+ var sessionTimeZone = "GMT"
+ if (isSparkVersionGE("3.5")) {
+ sessionTimeZone = java.util.TimeZone.getDefault.getID
+ }
new SparkConf()
.set("spark.plugins", "org.apache.gluten.GlutenPlugin")
.set("spark.memory.offHeap.enabled", "true")
@@ -65,6 +69,7 @@ class GlutenClickHouseNativeWriteTableSuite
// TODO: support default ANSI policy
.set("spark.sql.storeAssignmentPolicy", "legacy")
.set("spark.sql.warehouse.dir", getWarehouseDir)
+ .set("spark.sql.session.timeZone", sessionTimeZone)
.set("spark.gluten.sql.columnar.backend.ch.runtime_config.logger.level",
"error")
.setMaster("local[1]")
}
@@ -623,20 +628,12 @@ class GlutenClickHouseNativeWriteTableSuite
("date_field", "date"),
("timestamp_field", "timestamp")
)
- def excludeTimeFieldForORC(format: String): Seq[String] = {
- if (format.equals("orc") && isSparkVersionGE("3.5")) {
- // FIXME:https://github.com/apache/incubator-gluten/pull/6507
- fields.keys.filterNot(_.equals("timestamp_field")).toSeq
- } else {
- fields.keys.toSeq
- }
- }
val origin_table = "origin_table"
withSource(genTestData(), origin_table) {
nativeWrite {
format =>
val table_name = table_name_template.format(format)
- val testFields = excludeTimeFieldForORC(format)
+ val testFields = fields.keys.toSeq
writeAndCheckRead(origin_table, table_name, testFields,
isSparkVersionLE("3.3")) {
fields =>
spark
diff --git a/cpp-ch/local-engine/Storages/SubstraitSource/ORCFormatFile.cpp
b/cpp-ch/local-engine/Storages/SubstraitSource/ORCFormatFile.cpp
index 1c5701075..66556e237 100644
--- a/cpp-ch/local-engine/Storages/SubstraitSource/ORCFormatFile.cpp
+++ b/cpp-ch/local-engine/Storages/SubstraitSource/ORCFormatFile.cpp
@@ -24,6 +24,7 @@
# include <Processors/Formats/Impl/ArrowBufferedStreams.h>
# include <Processors/Formats/Impl/NativeORCBlockInputFormat.h>
# include <Storages/SubstraitSource/OrcUtil.h>
+# include <Common/CHUtil.h>
namespace local_engine
{
@@ -67,7 +68,12 @@ FormatFile::InputFormatPtr
ORCFormatFile::createInputFormat(const DB::Block & he
std::back_inserter(skip_stripe_indices));
format_settings.orc.skip_stripes =
std::unordered_set<int>(skip_stripe_indices.begin(), skip_stripe_indices.end());
-
+ if (context->getConfigRef().has("timezone"))
+ {
+ const String config_timezone =
context->getConfigRef().getString("timezone");
+ const String mapped_timezone =
DateTimeUtil::convertTimeZone(config_timezone);
+ format_settings.orc.reader_time_zone_name = mapped_timezone;
+ }
auto input_format =
std::make_shared<DB::NativeORCBlockInputFormat>(*file_format->read_buffer,
header, format_settings);
file_format->input = input_format;
return file_format;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]