This is an automated email from the ASF dual-hosted git repository.

taiyangli pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new a51f69310 [GLUTEN-6208][CH] Enable more uts in 
GlutenStringExpressionsSuite (#6218)
a51f69310 is described below

commit a51f6931007256eec10b1a7ae69ef39554ce52f4
Author: 李扬 <[email protected]>
AuthorDate: Wed Jun 26 15:51:18 2024 +0800

    [GLUTEN-6208][CH] Enable more uts in GlutenStringExpressionsSuite (#6218)
---
 .../local-engine/Parser/SerializedPlanParser.cpp   | 13 ----
 cpp-ch/local-engine/Parser/SerializedPlanParser.h  |  2 -
 .../Parser/scalar_function_parser/concat.cpp       | 79 ++++++++++++++++++++++
 .../utils/clickhouse/ClickHouseTestSettings.scala  |  9 ---
 .../utils/clickhouse/ClickHouseTestSettings.scala  | 61 -----------------
 .../utils/clickhouse/ClickHouseTestSettings.scala  | 12 ----
 .../utils/clickhouse/ClickHouseTestSettings.scala  | 12 ----
 7 files changed, 79 insertions(+), 109 deletions(-)

diff --git a/cpp-ch/local-engine/Parser/SerializedPlanParser.cpp 
b/cpp-ch/local-engine/Parser/SerializedPlanParser.cpp
index 3115950cd..325ec32dc 100644
--- a/cpp-ch/local-engine/Parser/SerializedPlanParser.cpp
+++ b/cpp-ch/local-engine/Parser/SerializedPlanParser.cpp
@@ -664,19 +664,6 @@ SerializedPlanParser::getFunctionName(const std::string & 
function_signature, co
         else
             ch_function_name = "reverseUTF8";
     }
-    else if (function_name == "concat")
-    {
-        /// 1. ConcatOverloadResolver cannot build arrayConcat for 
Nullable(Array) type which causes failures when using functions like 
concat(split()).
-        ///    So we use arrayConcat directly if the output type is array.
-        /// 2. CH ConcatImpl can only accept at least 2 arguments, but Spark 
concat can accept 1 argument, like concat('a')
-        ///    in such case we use identity function
-        if (function.output_type().has_list())
-            ch_function_name = "arrayConcat";
-        else if (args.size() == 1)
-            ch_function_name = "identity";
-        else
-            ch_function_name = "concat";
-    }
     else
         ch_function_name = SCALAR_FUNCTIONS.at(function_name);
 
diff --git a/cpp-ch/local-engine/Parser/SerializedPlanParser.h 
b/cpp-ch/local-engine/Parser/SerializedPlanParser.h
index aa18197e5..6ce92b558 100644
--- a/cpp-ch/local-engine/Parser/SerializedPlanParser.h
+++ b/cpp-ch/local-engine/Parser/SerializedPlanParser.h
@@ -127,13 +127,11 @@ static const std::map<std::string, std::string> 
SCALAR_FUNCTIONS
        {"trim", ""}, // trimLeft or trimLeftSpark, depends on argument size
        {"ltrim", ""}, // trimRight or trimRightSpark, depends on argument size
        {"rtrim", ""}, // trimBoth or trimBothSpark, depends on argument size
-       {"concat", ""}, /// dummy mapping
        {"strpos", "positionUTF8"},
        {"char_length",
         "char_length"}, /// Notice: when input argument is binary type, 
corresponding ch function is length instead of char_length
        {"replace", "replaceAll"},
        {"regexp_replace", "replaceRegexpAll"},
-       // {"regexp_extract", "regexpExtract"},
        {"regexp_extract_all", "regexpExtractAllSpark"},
        {"chr", "char"},
        {"rlike", "match"},
diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/concat.cpp 
b/cpp-ch/local-engine/Parser/scalar_function_parser/concat.cpp
new file mode 100644
index 000000000..416fe7741
--- /dev/null
+++ b/cpp-ch/local-engine/Parser/scalar_function_parser/concat.cpp
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <Parser/FunctionParser.h>
+#include <Common/CHUtil.h>
+#include <Core/Field.h>
+#include <DataTypes/DataTypeNullable.h>
+#include <DataTypes/IDataType.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int BAD_ARGUMENTS; // NOTE(review): declared but not referenced anywhere in this file
+    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; // NOTE(review): declared but not referenced anywhere in this file
+}
+}
+
+namespace local_engine
+{
+
+class FunctionParserConcat : public FunctionParser // Parser for the "concat" scalar function; picks arrayConcat / identity / concat.
+{
+public:
+    explicit FunctionParserConcat(SerializedPlanParser * plan_parser_) : 
FunctionParser(plan_parser_) {}
+    ~FunctionParserConcat() override = default;
+
+    static constexpr auto name = "concat"; // value returned by getName()
+
+    String getName() const override { return name; }
+
+    const ActionsDAG::Node * parse(
+        const substrait::Expression_ScalarFunction & substrait_func,
+        ActionsDAGPtr & actions_dag) const override
+    {
+        /*
+          Build the node for concat(args), choosing the function by output type and arity:
+            1. output type is a list (array)  -> arrayConcat(args)
+            2. otherwise:
+                1) no arguments   -> an empty-string column of type String added to the DAG
+                2) one argument   -> identity(args[0])
+                3) two or more    -> concat(args)
+        */
+        auto args = parseFunctionArguments(substrait_func, "", actions_dag);
+        const auto & output_type = substrait_func.output_type();
+        const ActionsDAG::Node * result_node = nullptr;
+        if (output_type.has_list()) // declared output is a list type, so concatenate as arrays
+        {
+             result_node = toFunctionNode(actions_dag, "arrayConcat", args);
+        }
+        else
+        {
+            if (args.empty()) // no inputs at all: result is the empty string
+                result_node = addColumnToActionsDAG(actions_dag, 
std::make_shared<DataTypeString>(), "");
+            else if (args.size() == 1) // single input is passed through unchanged
+                result_node = toFunctionNode(actions_dag, "identity", args);
+            else
+                result_node = toFunctionNode(actions_dag, "concat", args);
+        }
+        return convertNodeTypeIfNeeded(substrait_func, result_node, 
actions_dag); // NOTE(review): presumably casts to the substrait output type when it differs — confirm
+    }
+};
+
+static FunctionParserRegister<FunctionParserConcat> register_concat; // NOTE(review): presumably registers this parser during static initialization — confirm
+}
diff --git 
a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
 
b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 162671680..d12a40b76 100644
--- 
a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ 
b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -437,7 +437,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
     .exclude("string regex_replace / regex_extract")
     .exclude("string overlay function")
     .exclude("binary overlay function")
-    .exclude("string / binary substring function")
     .exclude("string parse_url function")
   enableSuite[GlutenSubquerySuite]
     .exclude("SPARK-15370: COUNT bug in subquery in subquery in subquery")
@@ -894,7 +893,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
     .exclude("SPARK-34814: LikeSimplification should handle NULL")
   enableSuite[GlutenSortOrderExpressionsSuite].exclude("SortPrefix")
   enableSuite[GlutenStringExpressionsSuite]
-    .exclude("concat")
     .exclude("StringComparison")
     .exclude("Substring")
     .exclude("string substring_index function")
@@ -902,22 +900,15 @@ class ClickHouseTestSettings extends BackendTestSettings {
     .exclude("string for ascii")
     .exclude("base64/unbase64 for string")
     .exclude("encode/decode for string")
-    .exclude("soundex unit test")
-    .exclude("replace")
     .exclude("overlay for string")
     .exclude("overlay for byte array")
     .exclude("translate")
-    .exclude("FORMAT")
-    .exclude("SPARK-22603: FormatString should not generate codes beyond 64KB")
-    .exclude("INSTR")
     .exclude("LOCATE")
     .exclude("LPAD/RPAD")
     .exclude("REPEAT")
     .exclude("length for string / binary")
-    .exclude("format_number / FormatNumber")
     .exclude("ParseUrl")
     .exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string 
is not a valid url")
-    .exclude("Sentences")
     .excludeGlutenTest("SPARK-40213: ascii for Latin-1 Supplement characters")
   enableSuite[GlutenTryCastSuite]
     .exclude("null cast")
diff --git 
a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
 
b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 3147c7c3d..52e7ebcbd 100644
--- 
a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ 
b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -458,7 +458,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
     .exclude("string regex_replace / regex_extract")
     .exclude("string overlay function")
     .exclude("binary overlay function")
-    .exclude("string / binary substring function")
     .exclude("string parse_url function")
   enableSuite[GlutenSubquerySuite]
     .exclude("SPARK-15370: COUNT bug in subquery in subquery in subquery")
@@ -474,58 +473,9 @@ class ClickHouseTestSettings extends BackendTestSettings {
   enableSuite[GlutenXPathFunctionsSuite]
   enableSuite[QueryTestSuite]
   enableSuite[GlutenAnsiCastSuiteWithAnsiModeOff]
-    .exclude("null cast")
     .exclude("cast string to date")
-    .exclude("cast string to timestamp")
-    .exclude("cast from boolean")
-    .exclude("cast from int")
-    .exclude("cast from long")
-    .exclude("cast from float")
-    .exclude("cast from double")
-    .exclude("cast from timestamp")
-    .exclude("data type casting")
-    .exclude("cast and add")
-    .exclude("from decimal")
-    .exclude("cast from array")
-    .exclude("cast from map")
-    .exclude("cast from struct")
-    .exclude("cast struct with a timestamp field")
-    .exclude("cast between string and interval")
-    .exclude("cast string to boolean")
-    .exclude("SPARK-20302 cast with same structure")
-    .exclude("SPARK-22500: cast for struct should not generate codes beyond 
64KB")
-    .exclude("SPARK-27671: cast from nested null type in struct")
-    .exclude("Process Infinity, -Infinity, NaN in case insensitive manner")
-    .exclude("SPARK-22825 Cast array to string")
-    .exclude("SPARK-33291: Cast array with null elements to string")
-    .exclude("SPARK-22973 Cast map to string")
-    .exclude("SPARK-22981 Cast struct to string")
-    .exclude("SPARK-33291: Cast struct with null elements to string")
-    .exclude("SPARK-34667: cast year-month interval to string")
-    .exclude("SPARK-34668: cast day-time interval to string")
-    .exclude("SPARK-35698: cast timestamp without time zone to string")
     .exclude("SPARK-35711: cast timestamp without time zone to timestamp with 
local time zone")
-    .exclude("SPARK-35716: cast timestamp without time zone to date type")
-    .exclude("SPARK-35718: cast date type to timestamp without timezone")
-    .exclude("SPARK-35719: cast timestamp with local time zone to timestamp 
without timezone")
-    .exclude("SPARK-35720: cast string to timestamp without timezone")
-    .exclude("SPARK-35112: Cast string to day-time interval")
-    .exclude("SPARK-35111: Cast string to year-month interval")
-    .exclude("SPARK-35820: Support cast DayTimeIntervalType in different 
fields")
     .exclude("SPARK-35819: Support cast YearMonthIntervalType in different 
fields")
-    .exclude("SPARK-35768: Take into account year-month interval fields in 
cast")
-    .exclude("SPARK-35735: Take into account day-time interval fields in cast")
-    .exclude("ANSI mode: Throw exception on casting out-of-range value to byte 
type")
-    .exclude("ANSI mode: Throw exception on casting out-of-range value to 
short type")
-    .exclude("ANSI mode: Throw exception on casting out-of-range value to int 
type")
-    .exclude("ANSI mode: Throw exception on casting out-of-range value to long 
type")
-    .exclude("Fast fail for cast string type to decimal type in ansi mode")
-    .exclude("cast a timestamp before the epoch 1970-01-01 00:00:00Z")
-    .exclude("cast from array III")
-    .exclude("cast from map II")
-    .exclude("cast from map III")
-    .exclude("cast from struct II")
-    .exclude("cast from struct III")
   enableSuite[GlutenAnsiCastSuiteWithAnsiModeOn]
     .exclude("null cast")
     .exclude("cast string to date")
@@ -902,7 +852,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
     .exclude("SPARK - 34814: LikeSimplification should handleNULL")
   enableSuite[GlutenSortOrderExpressionsSuite].exclude("SortPrefix")
   enableSuite[GlutenStringExpressionsSuite]
-    .exclude("concat")
     .exclude("StringComparison")
     .exclude("Substring")
     .exclude("string substring_index function")
@@ -911,24 +860,14 @@ class ClickHouseTestSettings extends BackendTestSettings {
     .exclude("string for ascii")
     .exclude("base64/unbase64 for string")
     .exclude("encode/decode for string")
-    .exclude("soundex unit test")
-    .exclude("replace")
     .exclude("overlay for string")
     .exclude("overlay for byte array")
     .exclude("translate")
-    .exclude("FORMAT")
-    .exclude("SPARK-22603: FormatString should not generate codes beyond 64KB")
-    .exclude("INSTR")
     .exclude("LOCATE")
-    .exclude("LPAD/RPAD")
     .exclude("REPEAT")
     .exclude("length for string / binary")
-    .exclude("format_number / FormatNumber")
-    .exclude("ToNumber: positive tests")
-    .exclude("ToNumber: negative tests (the input string does not match the 
format string)")
     .exclude("ParseUrl")
     .exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string 
is not a valid url")
-    .exclude("Sentences")
   enableSuite[GlutenTryCastSuite]
     .exclude("null cast")
     .exclude("cast string to date")
diff --git 
a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
 
b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 07af1fa84..38ed2c534 100644
--- 
a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ 
b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -457,7 +457,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
     .exclude("string regex_replace / regex_extract")
     .exclude("string overlay function")
     .exclude("binary overlay function")
-    .exclude("string / binary substring function")
     .exclude("string parse_url function")
   enableSuite[GlutenSubquerySuite]
     .exclude("SPARK-15370: COUNT bug in subquery in subquery in subquery")
@@ -756,7 +755,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
     .exclude("SPARK - 34814: LikeSimplification should handleNULL")
   enableSuite[GlutenSortOrderExpressionsSuite].exclude("SortPrefix")
   enableSuite[GlutenStringExpressionsSuite]
-    .exclude("concat")
     .exclude("StringComparison")
     .exclude("Substring")
     .exclude("string substring_index function")
@@ -766,24 +764,14 @@ class ClickHouseTestSettings extends BackendTestSettings {
     .exclude("base64/unbase64 for string")
     .exclude("encode/decode for string")
     .exclude("Levenshtein distance")
-    .exclude("soundex unit test")
-    .exclude("replace")
     .exclude("overlay for string")
     .exclude("overlay for byte array")
     .exclude("translate")
-    .exclude("FORMAT")
-    .exclude("SPARK-22603: FormatString should not generate codes beyond 64KB")
-    .exclude("INSTR")
     .exclude("LOCATE")
-    .exclude("LPAD/RPAD")
     .exclude("REPEAT")
     .exclude("length for string / binary")
-    .exclude("format_number / FormatNumber")
-    .exclude("ToNumber: positive tests")
-    .exclude("ToNumber: negative tests (the input string does not match the 
format string)")
     .exclude("ParseUrl")
     .exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string 
is not a valid url")
-    .exclude("Sentences")
   enableSuite[GlutenDataSourceV2DataFrameSessionCatalogSuite]
   enableSuite[GlutenDataSourceV2SQLSessionCatalogSuite]
   enableSuite[GlutenDataSourceV2SQLSuiteV1Filter]
diff --git 
a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
 
b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 07af1fa84..38ed2c534 100644
--- 
a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ 
b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -457,7 +457,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
     .exclude("string regex_replace / regex_extract")
     .exclude("string overlay function")
     .exclude("binary overlay function")
-    .exclude("string / binary substring function")
     .exclude("string parse_url function")
   enableSuite[GlutenSubquerySuite]
     .exclude("SPARK-15370: COUNT bug in subquery in subquery in subquery")
@@ -756,7 +755,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
     .exclude("SPARK - 34814: LikeSimplification should handleNULL")
   enableSuite[GlutenSortOrderExpressionsSuite].exclude("SortPrefix")
   enableSuite[GlutenStringExpressionsSuite]
-    .exclude("concat")
     .exclude("StringComparison")
     .exclude("Substring")
     .exclude("string substring_index function")
@@ -766,24 +764,14 @@ class ClickHouseTestSettings extends BackendTestSettings {
     .exclude("base64/unbase64 for string")
     .exclude("encode/decode for string")
     .exclude("Levenshtein distance")
-    .exclude("soundex unit test")
-    .exclude("replace")
     .exclude("overlay for string")
     .exclude("overlay for byte array")
     .exclude("translate")
-    .exclude("FORMAT")
-    .exclude("SPARK-22603: FormatString should not generate codes beyond 64KB")
-    .exclude("INSTR")
     .exclude("LOCATE")
-    .exclude("LPAD/RPAD")
     .exclude("REPEAT")
     .exclude("length for string / binary")
-    .exclude("format_number / FormatNumber")
-    .exclude("ToNumber: positive tests")
-    .exclude("ToNumber: negative tests (the input string does not match the 
format string)")
     .exclude("ParseUrl")
     .exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string 
is not a valid url")
-    .exclude("Sentences")
   enableSuite[GlutenDataSourceV2DataFrameSessionCatalogSuite]
   enableSuite[GlutenDataSourceV2SQLSessionCatalogSuite]
   enableSuite[GlutenDataSourceV2SQLSuiteV1Filter]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to