This is an automated email from the ASF dual-hosted git repository.
changchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new efd2cbde30 [GLUTEN-7896][CH]Fix to_date diff for time parser policy
config (#7923)
efd2cbde30 is described below
commit efd2cbde3064631179dede8ad2b7a0e96d426d06
Author: kevinyhzou <[email protected]>
AuthorDate: Tue Nov 26 19:53:13 2024 +0800
[GLUTEN-7896][CH]Fix to_date diff for time parser policy config (#7923)
* fix pre-projection not take effect
* Fix time_parser_plicy set legacy
* fix
* fix 11
* add test
* fix ci test
* Fix code bug
* fix review
* modify test
---
.../GlutenClickHouseTPCHSaltNullParquetSuite.scala | 18 +++-
cpp-ch/local-engine/Common/CHUtil.cpp | 5 +
cpp-ch/local-engine/Common/CHUtil.h | 1 +
.../CommonScalarFunctionParser.cpp | 1 -
.../Parser/scalar_function_parser/getTimestamp.cpp | 23 +++++
.../Parser/scalar_function_parser/getTimestamp.h | 106 +++++++++++++++++++++
.../scalar_function_parser/unixTimestamp.cpp | 14 +--
.../scala/org/apache/gluten/GlutenConfig.scala | 3 +-
8 files changed, 160 insertions(+), 11 deletions(-)
diff --git
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala
index 5d7bcf324a..4a2b7040fa 100644
---
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala
+++
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala
@@ -2192,7 +2192,7 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends
GlutenClickHouseTPCHAbstr
}
}
- test("GLUTEN-3135: Bug fix to_date") {
+ test("GLUTEN-3135/GLUTEN-7896: Bug fix to_date") {
val create_table_sql =
"""
| create table test_tbl_3135(id bigint, data string) using parquet
@@ -2209,13 +2209,27 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends
GlutenClickHouseTPCHAbstr
|(7, '1970-01-01 00:00:00'),
|(8, '2024-3-2'),
|(9, '2024-03-2'),
- |(10, '2024-03')
+ |(10, '2024-03'),
+ |(11, '2024-03-02 11:22:33')
|""".stripMargin
spark.sql(create_table_sql)
spark.sql(insert_data_sql)
val select_sql = "select id, to_date(data) from test_tbl_3135"
compareResultsAgainstVanillaSpark(select_sql, true, { _ => })
+
+ withSQLConf(("spark.sql.legacy.timeParserPolicy" -> "corrected")) {
+ compareResultsAgainstVanillaSpark(
+ "select id, to_date('2024-03-2 11:22:33', 'yyyy-MM-dd') from
test_tbl_3135 where id = 11",
+ true,
+ { _ => })
+ }
+ withSQLConf(("spark.sql.legacy.timeParserPolicy" -> "legacy")) {
+ compareResultsAgainstVanillaSpark(
+ "select id, to_date(data, 'yyyy-MM-dd') from test_tbl_3135 where id =
11",
+ true,
+ { _ => })
+ }
spark.sql("drop table test_tbl_3135")
}
diff --git a/cpp-ch/local-engine/Common/CHUtil.cpp
b/cpp-ch/local-engine/Common/CHUtil.cpp
index 03df93c851..8fef52e50a 100644
--- a/cpp-ch/local-engine/Common/CHUtil.cpp
+++ b/cpp-ch/local-engine/Common/CHUtil.cpp
@@ -762,6 +762,11 @@ void BackendInitializerUtil::initSettings(const
SparkConfigs::ConfigMap & spark_
settings.set(key, toField(key, value));
LOG_DEBUG(&Poco::Logger::get("CHUtil"), "Set settings key:{}
value:{}", key, value);
}
+ else if (key == TIMER_PARSER_POLICY)
+ {
+ settings.set(key, value);
+ LOG_DEBUG(&Poco::Logger::get("CHUtil"), "Set settings key:{}
value:{}", key, value);
+ }
}
/// Finally apply some fixed kvs to settings.
diff --git a/cpp-ch/local-engine/Common/CHUtil.h
b/cpp-ch/local-engine/Common/CHUtil.h
index cff69090ee..a5fb24f6af 100644
--- a/cpp-ch/local-engine/Common/CHUtil.h
+++ b/cpp-ch/local-engine/Common/CHUtil.h
@@ -40,6 +40,7 @@ namespace local_engine
static const String MERGETREE_INSERT_WITHOUT_LOCAL_STORAGE =
"mergetree.insert_without_local_storage";
static const String MERGETREE_MERGE_AFTER_INSERT =
"mergetree.merge_after_insert";
static const std::string DECIMAL_OPERATIONS_ALLOW_PREC_LOSS =
"spark.sql.decimalOperations.allowPrecisionLoss";
+static const std::string TIMER_PARSER_POLICY =
"spark.sql.legacy.timeParserPolicy";
static const std::unordered_set<String> BOOL_VALUE_SETTINGS{
MERGETREE_MERGE_AFTER_INSERT, MERGETREE_INSERT_WITHOUT_LOCAL_STORAGE,
DECIMAL_OPERATIONS_ALLOW_PREC_LOSS};
diff --git
a/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp
b/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp
index d658426745..ec8b4e0d12 100644
---
a/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp
+++
b/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp
@@ -57,7 +57,6 @@ REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Not, not, not );
REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Xor, xor, xor);
REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Cast, cast, CAST);
-REGISTER_COMMON_SCALAR_FUNCTION_PARSER(GetTimestamp, get_timestamp,
parseDateTime64InJodaSyntaxOrNull);
REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Quarter, quarter, toQuarter);
// math functions
diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.cpp
b/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.cpp
new file mode 100644
index 0000000000..4724f82009
--- /dev/null
+++ b/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.cpp
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <Parser/scalar_function_parser/getTimestamp.h>
+
+namespace local_engine
+{
+ static FunctionParserRegister<FunctionParserGetTimestamp>
register_get_timestamp;
+}
diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h
b/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h
new file mode 100644
index 0000000000..5e32e00569
--- /dev/null
+++ b/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <DataTypes/DataTypeNullable.h>
+#include <Parser/FunctionParser.h>
+#include <Core/Settings.h>
+#include <Core/Field.h>
+#include <Common/CHUtil.h>
+#include <boost/algorithm/string/case_conv.hpp>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
+ extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+}
+}
+
+
+namespace local_engine
+{
+class FunctionParserGetTimestamp : public FunctionParser
+{
+public:
+ explicit FunctionParserGetTimestamp(ParserContextPtr parser_context_) :
FunctionParser(parser_context_) {}
+ ~FunctionParserGetTimestamp() override = default;
+
+ static constexpr auto name = "get_timestamp";
+ String getName() const override { return name; }
+
+ const ActionsDAG::Node * parse(
+ const substrait::Expression_ScalarFunction & substrait_func,
+ ActionsDAG & actions_dag) const override
+ {
+ /*
+ spark function: get_timestamp(expr, fmt)
+ 1. If timeParserPolicy is LEGACY
+ 1) fmt has 0 'S', ch function =
parseDateTime64InJodaSyntaxOrNull(substr(expr,1,length(fmt)), fmt);
+ 2) fmt has 'S' more than 0, make the fmt has 3 'S', ch function =
parseDateTime64InJodaSyntaxOrNull(expr, fmt)
+ 2. Else ch function = parseDateTime64InJodaSyntaxOrNull(expr, fmt)
+ */
+ auto parsed_args = parseFunctionArguments(substrait_func, actions_dag);
+ if (parsed_args.size() != 2)
+ throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Function {} requires exactly two arguments", getName());
+ const auto * expr_arg = parsed_args[0];
+ const auto * fmt_arg = parsed_args[1];
+
+ const auto & args = substrait_func.arguments();
+ bool fmt_string_literal = args[1].value().has_literal();
+ String fmt;
+ if (fmt_string_literal)
+ {
+ const auto & literal_fmt_expr = args[1].value().literal();
+ fmt_string_literal = literal_fmt_expr.has_string();
+ fmt = fmt_string_literal ? literal_fmt_expr.string() : "";
+ }
+ if (!fmt_string_literal)
+ throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "The second
of function {} must be const String.", name);
+
+ UInt32 s_count = std::count(fmt.begin(), fmt.end(), 'S');
+ String time_parser_policy =
getContext()->getSettingsRef().has(TIMER_PARSER_POLICY) ?
toString(getContext()->getSettingsRef().get(TIMER_PARSER_POLICY)) : "";
+ boost::to_lower(time_parser_policy);
+ if (time_parser_policy == "legacy")
+ {
+ if (s_count == 0)
+ {
+ const auto * index_begin_node =
addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeUInt64>(), 1);
+ const auto * index_end_node =
addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeUInt64>(),
fmt.size());
+ const auto * substr_node = toFunctionNode(actions_dag,
"substringUTF8", {expr_arg, index_begin_node, index_end_node});
+ const auto * fmt_node = addColumnToActionsDAG(actions_dag,
std::make_shared<DataTypeString>(), fmt);
+ const auto * result_node = toFunctionNode(actions_dag,
"parseDateTime64InJodaSyntaxOrNull", {substr_node, fmt_node});
+ return convertNodeTypeIfNeeded(substrait_func, result_node,
actions_dag);
+ }
+ else if (s_count < 3)
+ fmt += String(3 - s_count, 'S');
+ else
+ fmt = fmt.substr(0, fmt.size() - (s_count - 3));
+
+ const auto * fmt_node = addColumnToActionsDAG(actions_dag,
std::make_shared<DataTypeString>(), fmt);
+ const auto * result_node = toFunctionNode(actions_dag,
"parseDateTime64InJodaSyntaxOrNull", {expr_arg, fmt_node});
+ return convertNodeTypeIfNeeded(substrait_func, result_node,
actions_dag);
+ }
+ else
+ {
+ const auto * result_node = toFunctionNode(actions_dag,
"parseDateTime64InJodaSyntaxOrNull", {expr_arg, fmt_arg});
+ return convertNodeTypeIfNeeded(substrait_func, result_node,
actions_dag);
+ }
+ }
+};
+}
diff --git
a/cpp-ch/local-engine/Parser/scalar_function_parser/unixTimestamp.cpp
b/cpp-ch/local-engine/Parser/scalar_function_parser/unixTimestamp.cpp
index 622237da97..33997734c5 100644
--- a/cpp-ch/local-engine/Parser/scalar_function_parser/unixTimestamp.cpp
+++ b/cpp-ch/local-engine/Parser/scalar_function_parser/unixTimestamp.cpp
@@ -17,7 +17,7 @@
#include <DataTypes/DataTypeNullable.h>
#include <Parser/FunctionParser.h>
-
+#include <Parser/scalar_function_parser/getTimestamp.h>
namespace DB
{
@@ -34,10 +34,10 @@ namespace local_engine
{
template<typename Name>
-class FunctionParserUnixTimestamp : public FunctionParser
+class FunctionParserUnixTimestamp : public FunctionParserGetTimestamp
{
public:
- explicit FunctionParserUnixTimestamp(ParserContextPtr parser_context_) :
FunctionParser(parser_context_) {}
+ explicit FunctionParserUnixTimestamp(ParserContextPtr parser_context_) :
FunctionParserGetTimestamp(parser_context_) {}
~FunctionParserUnixTimestamp() override = default;
static constexpr auto name = Name::name;
@@ -60,13 +60,13 @@ public:
const auto * expr_arg = parsed_args[0];
const auto * fmt_arg = parsed_args[1];
auto expr_type = removeNullable(expr_arg->result_type);
+ if (isString(expr_type))
+ return FunctionParserGetTimestamp::parse(substrait_func,
actions_dag);
+
const DateLUTImpl * date_lut = &DateLUT::instance();
const auto * time_zone_node = addColumnToActionsDAG(actions_dag,
std::make_shared<DataTypeString>(), date_lut->getTimeZone());
-
const DB::ActionsDAG::Node * result_node = nullptr;
- if (isString(expr_type))
- result_node = toFunctionNode(actions_dag,
"parseDateTime64InJodaSyntaxOrNull", {expr_arg, fmt_arg, time_zone_node});
- else if (isDateOrDate32(expr_type))
+ if (isDateOrDate32(expr_type))
result_node = toFunctionNode(actions_dag,
"sparkDateToUnixTimestamp", {expr_arg, time_zone_node});
else if (isDateTime(expr_type) || isDateTime64(expr_type))
result_node = toFunctionNode(actions_dag, "toUnixTimestamp",
{expr_arg, time_zone_node});
diff --git a/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala
b/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala
index e0d06ce6fc..2ccdcae99b 100644
--- a/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala
+++ b/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala
@@ -805,7 +805,8 @@ object GlutenConfig {
SPARK_OFFHEAP_ENABLED,
SESSION_LOCAL_TIMEZONE.key,
DECIMAL_OPERATIONS_ALLOW_PREC_LOSS.key,
- SPARK_REDACTION_REGEX
+ SPARK_REDACTION_REGEX,
+ LEGACY_TIME_PARSER_POLICY.key
)
nativeConfMap.putAll(conf.filter(e => keys.contains(e._1)).asJava)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]