This is an automated email from the ASF dual-hosted git repository.

changchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new efd2cbde30 [GLUTEN-7896][CH]Fix to_date diff for time parser policy 
config  (#7923)
efd2cbde30 is described below

commit efd2cbde3064631179dede8ad2b7a0e96d426d06
Author: kevinyhzou <[email protected]>
AuthorDate: Tue Nov 26 19:53:13 2024 +0800

    [GLUTEN-7896][CH]Fix to_date diff for time parser policy config  (#7923)
    
    * fix pre-projection not take effect
    
    * Fix time_parser_plicy set legacy
    
    * fix
    
    * fix 11
    
    * add test
    
    * fix ci test
    
    * Fix code bug
    
    * fix review
    
    * modify test
---
 .../GlutenClickHouseTPCHSaltNullParquetSuite.scala |  18 +++-
 cpp-ch/local-engine/Common/CHUtil.cpp              |   5 +
 cpp-ch/local-engine/Common/CHUtil.h                |   1 +
 .../CommonScalarFunctionParser.cpp                 |   1 -
 .../Parser/scalar_function_parser/getTimestamp.cpp |  23 +++++
 .../Parser/scalar_function_parser/getTimestamp.h   | 106 +++++++++++++++++++++
 .../scalar_function_parser/unixTimestamp.cpp       |  14 +--
 .../scala/org/apache/gluten/GlutenConfig.scala     |   3 +-
 8 files changed, 160 insertions(+), 11 deletions(-)

diff --git 
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala
 
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala
index 5d7bcf324a..4a2b7040fa 100644
--- 
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala
+++ 
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala
@@ -2192,7 +2192,7 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends 
GlutenClickHouseTPCHAbstr
     }
   }
 
-  test("GLUTEN-3135: Bug fix to_date") {
+  test("GLUTEN-3135/GLUTEN-7896: Bug fix to_date") {
     val create_table_sql =
       """
         | create table test_tbl_3135(id bigint, data string) using parquet
@@ -2209,13 +2209,27 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends 
GlutenClickHouseTPCHAbstr
         |(7, '1970-01-01 00:00:00'),
         |(8, '2024-3-2'),
         |(9, '2024-03-2'),
-        |(10, '2024-03')
+        |(10, '2024-03'),
+        |(11, '2024-03-02 11:22:33')
         |""".stripMargin
     spark.sql(create_table_sql)
     spark.sql(insert_data_sql)
 
     val select_sql = "select id, to_date(data) from test_tbl_3135"
     compareResultsAgainstVanillaSpark(select_sql, true, { _ => })
+
+    withSQLConf(("spark.sql.legacy.timeParserPolicy" -> "corrected")) {
+      compareResultsAgainstVanillaSpark(
+        "select id, to_date('2024-03-2 11:22:33', 'yyyy-MM-dd') from 
test_tbl_3135 where id = 11",
+        true,
+        { _ => })
+    }
+    withSQLConf(("spark.sql.legacy.timeParserPolicy" -> "legacy")) {
+      compareResultsAgainstVanillaSpark(
+        "select id, to_date(data, 'yyyy-MM-dd') from test_tbl_3135 where id = 
11",
+        true,
+        { _ => })
+    }
     spark.sql("drop table test_tbl_3135")
   }
 
diff --git a/cpp-ch/local-engine/Common/CHUtil.cpp 
b/cpp-ch/local-engine/Common/CHUtil.cpp
index 03df93c851..8fef52e50a 100644
--- a/cpp-ch/local-engine/Common/CHUtil.cpp
+++ b/cpp-ch/local-engine/Common/CHUtil.cpp
@@ -762,6 +762,11 @@ void BackendInitializerUtil::initSettings(const 
SparkConfigs::ConfigMap & spark_
             settings.set(key, toField(key, value));
             LOG_DEBUG(&Poco::Logger::get("CHUtil"), "Set settings key:{} 
value:{}", key, value);
         }
+        else if (key == TIMER_PARSER_POLICY)
+        {
+            settings.set(key, value);
+            LOG_DEBUG(&Poco::Logger::get("CHUtil"), "Set settings key:{} 
value:{}", key, value);
+        }
     }
 
     /// Finally apply some fixed kvs to settings.
diff --git a/cpp-ch/local-engine/Common/CHUtil.h 
b/cpp-ch/local-engine/Common/CHUtil.h
index cff69090ee..a5fb24f6af 100644
--- a/cpp-ch/local-engine/Common/CHUtil.h
+++ b/cpp-ch/local-engine/Common/CHUtil.h
@@ -40,6 +40,7 @@ namespace local_engine
 static const String MERGETREE_INSERT_WITHOUT_LOCAL_STORAGE = 
"mergetree.insert_without_local_storage";
 static const String MERGETREE_MERGE_AFTER_INSERT = 
"mergetree.merge_after_insert";
 static const std::string DECIMAL_OPERATIONS_ALLOW_PREC_LOSS = 
"spark.sql.decimalOperations.allowPrecisionLoss";
+static const std::string TIMER_PARSER_POLICY = 
"spark.sql.legacy.timeParserPolicy";
 
 static const std::unordered_set<String> BOOL_VALUE_SETTINGS{
     MERGETREE_MERGE_AFTER_INSERT, MERGETREE_INSERT_WITHOUT_LOCAL_STORAGE, 
DECIMAL_OPERATIONS_ALLOW_PREC_LOSS};
diff --git 
a/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp
 
b/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp
index d658426745..ec8b4e0d12 100644
--- 
a/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp
+++ 
b/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp
@@ -57,7 +57,6 @@ REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Not, not, not );
 REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Xor, xor, xor);
 
 REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Cast, cast, CAST);
-REGISTER_COMMON_SCALAR_FUNCTION_PARSER(GetTimestamp, get_timestamp, 
parseDateTime64InJodaSyntaxOrNull);
 REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Quarter, quarter, toQuarter);
 
 // math functions
diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.cpp 
b/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.cpp
new file mode 100644
index 0000000000..4724f82009
--- /dev/null
+++ b/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.cpp
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <Parser/scalar_function_parser/getTimestamp.h>
+
+namespace local_engine
+{
+    static FunctionParserRegister<FunctionParserGetTimestamp> 
register_get_timestamp;
+}
diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h 
b/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h
new file mode 100644
index 0000000000..5e32e00569
--- /dev/null
+++ b/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <DataTypes/DataTypeNullable.h>
+#include <Parser/FunctionParser.h>
+#include <Core/Settings.h>
+#include <Core/Field.h>
+#include <Common/CHUtil.h>
+#include <boost/algorithm/string/case_conv.hpp>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+}
+}
+
+
+namespace local_engine
+{
+class FunctionParserGetTimestamp : public FunctionParser
+{
+public:
+    explicit FunctionParserGetTimestamp(ParserContextPtr parser_context_) : 
FunctionParser(parser_context_) {}
+    ~FunctionParserGetTimestamp() override = default;
+
+    static constexpr auto name = "get_timestamp";
+    String getName() const override { return name; }
+
+    const ActionsDAG::Node * parse(
+        const substrait::Expression_ScalarFunction & substrait_func,
+        ActionsDAG & actions_dag) const override
+    {
+        /*
+        spark function: get_timestamp(expr, fmt)
+        1. If timeParserPolicy is LEGACY
+            1) fmt has 0 'S', ch function = 
parseDateTime64InJodaSyntaxOrNull(substr(expr,1,length(fmt)), fmt);
+            2) fmt has 'S' more than 0, make the fmt has 3 'S', ch function =  
parseDateTime64InJodaSyntaxOrNull(expr, fmt)
+        2. Else ch function = parseDateTime64InJodaSyntaxOrNull(expr, fmt)
+        */
+        auto parsed_args = parseFunctionArguments(substrait_func, actions_dag);
+        if (parsed_args.size() != 2)
+            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, 
"Function {} requires exactly two arguments", getName());
+        const auto * expr_arg = parsed_args[0];
+        const auto * fmt_arg = parsed_args[1];
+        
+        const auto & args = substrait_func.arguments();
+        bool fmt_string_literal = args[1].value().has_literal();
+        String fmt;
+        if (fmt_string_literal)
+        {
+            const auto & literal_fmt_expr = args[1].value().literal();
+            fmt_string_literal = literal_fmt_expr.has_string();
+            fmt = fmt_string_literal ? literal_fmt_expr.string() : "";
+        }
+        if (!fmt_string_literal)
+            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "The second 
of function {} must be const String.", name);
+
+        UInt32 s_count = std::count(fmt.begin(), fmt.end(), 'S');
+        String time_parser_policy = 
getContext()->getSettingsRef().has(TIMER_PARSER_POLICY) ? 
toString(getContext()->getSettingsRef().get(TIMER_PARSER_POLICY)) : "";
+        boost::to_lower(time_parser_policy);
+        if (time_parser_policy == "legacy")
+        {
+            if (s_count == 0)
+            {
+                const auto * index_begin_node = 
addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeUInt64>(), 1);
+                const auto * index_end_node = 
addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeUInt64>(), 
fmt.size());
+                const auto * substr_node = toFunctionNode(actions_dag, 
"substringUTF8", {expr_arg, index_begin_node, index_end_node});
+                const auto * fmt_node = addColumnToActionsDAG(actions_dag, 
std::make_shared<DataTypeString>(), fmt);
+                const auto * result_node = toFunctionNode(actions_dag, 
"parseDateTime64InJodaSyntaxOrNull", {substr_node, fmt_node});
+                return convertNodeTypeIfNeeded(substrait_func, result_node, 
actions_dag);
+            }
+            else if (s_count < 3)
+                fmt += String(3 - s_count, 'S');
+            else
+                fmt = fmt.substr(0, fmt.size() - (s_count - 3));
+
+            const auto * fmt_node = addColumnToActionsDAG(actions_dag, 
std::make_shared<DataTypeString>(), fmt);
+            const auto * result_node = toFunctionNode(actions_dag, 
"parseDateTime64InJodaSyntaxOrNull", {expr_arg, fmt_node});
+            return convertNodeTypeIfNeeded(substrait_func, result_node, 
actions_dag);
+        }
+        else
+        {
+            const auto * result_node = toFunctionNode(actions_dag, 
"parseDateTime64InJodaSyntaxOrNull", {expr_arg, fmt_arg});
+            return convertNodeTypeIfNeeded(substrait_func, result_node, 
actions_dag);
+        }
+    }
+};
+}
diff --git 
a/cpp-ch/local-engine/Parser/scalar_function_parser/unixTimestamp.cpp 
b/cpp-ch/local-engine/Parser/scalar_function_parser/unixTimestamp.cpp
index 622237da97..33997734c5 100644
--- a/cpp-ch/local-engine/Parser/scalar_function_parser/unixTimestamp.cpp
+++ b/cpp-ch/local-engine/Parser/scalar_function_parser/unixTimestamp.cpp
@@ -17,7 +17,7 @@
 
 #include <DataTypes/DataTypeNullable.h>
 #include <Parser/FunctionParser.h>
-
+#include <Parser/scalar_function_parser/getTimestamp.h>
 
 namespace DB
 {
@@ -34,10 +34,10 @@ namespace local_engine
 {
 
 template<typename Name>
-class FunctionParserUnixTimestamp : public FunctionParser
+class FunctionParserUnixTimestamp : public FunctionParserGetTimestamp
 {
 public:
-    explicit FunctionParserUnixTimestamp(ParserContextPtr parser_context_) : 
FunctionParser(parser_context_) {}
+    explicit FunctionParserUnixTimestamp(ParserContextPtr parser_context_) : 
FunctionParserGetTimestamp(parser_context_) {}
     ~FunctionParserUnixTimestamp() override = default;
 
     static constexpr auto name = Name::name;
@@ -60,13 +60,13 @@ public:
         const auto * expr_arg = parsed_args[0];
         const auto * fmt_arg = parsed_args[1];
         auto expr_type = removeNullable(expr_arg->result_type);
+        if (isString(expr_type))
+            return FunctionParserGetTimestamp::parse(substrait_func, 
actions_dag);
+
         const DateLUTImpl * date_lut = &DateLUT::instance();
         const auto * time_zone_node = addColumnToActionsDAG(actions_dag, 
std::make_shared<DataTypeString>(), date_lut->getTimeZone());
-
         const DB::ActionsDAG::Node * result_node = nullptr;
-        if (isString(expr_type))
-            result_node = toFunctionNode(actions_dag, 
"parseDateTime64InJodaSyntaxOrNull", {expr_arg, fmt_arg, time_zone_node});
-        else if (isDateOrDate32(expr_type))
+        if (isDateOrDate32(expr_type))
             result_node = toFunctionNode(actions_dag, 
"sparkDateToUnixTimestamp", {expr_arg, time_zone_node});
         else if (isDateTime(expr_type) || isDateTime64(expr_type))
             result_node = toFunctionNode(actions_dag, "toUnixTimestamp", 
{expr_arg, time_zone_node});
diff --git a/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala 
b/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala
index e0d06ce6fc..2ccdcae99b 100644
--- a/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala
+++ b/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala
@@ -805,7 +805,8 @@ object GlutenConfig {
       SPARK_OFFHEAP_ENABLED,
       SESSION_LOCAL_TIMEZONE.key,
       DECIMAL_OPERATIONS_ALLOW_PREC_LOSS.key,
-      SPARK_REDACTION_REGEX
+      SPARK_REDACTION_REGEX,
+      LEGACY_TIME_PARSER_POLICY.key
     )
     nativeConfMap.putAll(conf.filter(e => keys.contains(e._1)).asJava)
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to