This is an automated email from the ASF dual-hosted git repository.
taiyangli pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new a51f69310 [GLUTEN-6208][CH] Enable more uts in GlutenStringExpressionsSuite (#6218)
a51f69310 is described below
commit a51f6931007256eec10b1a7ae69ef39554ce52f4
Author: 李扬 <[email protected]>
AuthorDate: Wed Jun 26 15:51:18 2024 +0800
[GLUTEN-6208][CH] Enable more uts in GlutenStringExpressionsSuite (#6218)
---
.../local-engine/Parser/SerializedPlanParser.cpp | 13 ----
cpp-ch/local-engine/Parser/SerializedPlanParser.h | 2 -
.../Parser/scalar_function_parser/concat.cpp | 79 ++++++++++++++++++++++
.../utils/clickhouse/ClickHouseTestSettings.scala | 9 ---
.../utils/clickhouse/ClickHouseTestSettings.scala | 61 -----------------
.../utils/clickhouse/ClickHouseTestSettings.scala | 12 ----
.../utils/clickhouse/ClickHouseTestSettings.scala | 12 ----
7 files changed, 79 insertions(+), 109 deletions(-)
diff --git a/cpp-ch/local-engine/Parser/SerializedPlanParser.cpp b/cpp-ch/local-engine/Parser/SerializedPlanParser.cpp
index 3115950cd..325ec32dc 100644
--- a/cpp-ch/local-engine/Parser/SerializedPlanParser.cpp
+++ b/cpp-ch/local-engine/Parser/SerializedPlanParser.cpp
@@ -664,19 +664,6 @@ SerializedPlanParser::getFunctionName(const std::string & function_signature, co
else
ch_function_name = "reverseUTF8";
}
- else if (function_name == "concat")
- {
- /// 1. ConcatOverloadResolver cannot build arrayConcat for Nullable(Array) type which causes failures when using functions like concat(split()).
- /// So we use arrayConcat directly if the output type is array.
- /// 2. CH ConcatImpl can only accept at least 2 arguments, but Spark concat can accept 1 argument, like concat('a')
- /// in such case we use identity function
- if (function.output_type().has_list())
- ch_function_name = "arrayConcat";
- else if (args.size() == 1)
- ch_function_name = "identity";
- else
- ch_function_name = "concat";
- }
else
ch_function_name = SCALAR_FUNCTIONS.at(function_name);
diff --git a/cpp-ch/local-engine/Parser/SerializedPlanParser.h b/cpp-ch/local-engine/Parser/SerializedPlanParser.h
index aa18197e5..6ce92b558 100644
--- a/cpp-ch/local-engine/Parser/SerializedPlanParser.h
+++ b/cpp-ch/local-engine/Parser/SerializedPlanParser.h
@@ -127,13 +127,11 @@ static const std::map<std::string, std::string> SCALAR_FUNCTIONS
{"trim", ""}, // trimLeft or trimLeftSpark, depends on argument size
{"ltrim", ""}, // trimRight or trimRightSpark, depends on argument size
{"rtrim", ""}, // trimBoth or trimBothSpark, depends on argument size
- {"concat", ""}, /// dummy mapping
{"strpos", "positionUTF8"},
{"char_length",
"char_length"}, /// Notice: when input argument is binary type, corresponding ch function is length instead of char_length
{"replace", "replaceAll"},
{"regexp_replace", "replaceRegexpAll"},
- // {"regexp_extract", "regexpExtract"},
{"regexp_extract_all", "regexpExtractAllSpark"},
{"chr", "char"},
{"rlike", "match"},
diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/concat.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/concat.cpp
new file mode 100644
index 000000000..416fe7741
--- /dev/null
+++ b/cpp-ch/local-engine/Parser/scalar_function_parser/concat.cpp
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <Parser/FunctionParser.h>
+#include <Common/CHUtil.h>
+#include <Core/Field.h>
+#include <DataTypes/DataTypeNullable.h>
+#include <DataTypes/IDataType.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int BAD_ARGUMENTS;
+ extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
+}
+}
+
+namespace local_engine
+{
+
+class FunctionParserConcat : public FunctionParser
+{
+public:
+ explicit FunctionParserConcat(SerializedPlanParser * plan_parser_) : FunctionParser(plan_parser_) {}
+ ~FunctionParserConcat() override = default;
+
+ static constexpr auto name = "concat";
+
+ String getName() const override { return name; }
+
+ const ActionsDAG::Node * parse(
+ const substrait::Expression_ScalarFunction & substrait_func,
+ ActionsDAGPtr & actions_dag) const override
+ {
+ /*
+ parse concat(args) as:
+ 1. if output type is array, return arrayConcat(args)
+ 2. otherwise:
+ 1) if args is empty, return empty string
+ 2) if args have size 1, return identity(args[0])
+ 3) otherwise return concat(args)
+ */
+ auto args = parseFunctionArguments(substrait_func, "", actions_dag);
+ const auto & output_type = substrait_func.output_type();
+ const ActionsDAG::Node * result_node = nullptr;
+ if (output_type.has_list())
+ {
+ result_node = toFunctionNode(actions_dag, "arrayConcat", args);
+ }
+ else
+ {
+ if (args.empty())
+ result_node = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeString>(), "");
+ else if (args.size() == 1)
+ result_node = toFunctionNode(actions_dag, "identity", args);
+ else
+ result_node = toFunctionNode(actions_dag, "concat", args);
+ }
+ return convertNodeTypeIfNeeded(substrait_func, result_node, actions_dag);
+ }
+};
+
+static FunctionParserRegister<FunctionParserConcat> register_concat;
+}
diff --git a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 162671680..d12a40b76 100644
--- a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -437,7 +437,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
.exclude("string regex_replace / regex_extract")
.exclude("string overlay function")
.exclude("binary overlay function")
- .exclude("string / binary substring function")
.exclude("string parse_url function")
enableSuite[GlutenSubquerySuite]
.exclude("SPARK-15370: COUNT bug in subquery in subquery in subquery")
@@ -894,7 +893,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
.exclude("SPARK-34814: LikeSimplification should handle NULL")
enableSuite[GlutenSortOrderExpressionsSuite].exclude("SortPrefix")
enableSuite[GlutenStringExpressionsSuite]
- .exclude("concat")
.exclude("StringComparison")
.exclude("Substring")
.exclude("string substring_index function")
@@ -902,22 +900,15 @@ class ClickHouseTestSettings extends BackendTestSettings {
.exclude("string for ascii")
.exclude("base64/unbase64 for string")
.exclude("encode/decode for string")
- .exclude("soundex unit test")
- .exclude("replace")
.exclude("overlay for string")
.exclude("overlay for byte array")
.exclude("translate")
- .exclude("FORMAT")
- .exclude("SPARK-22603: FormatString should not generate codes beyond 64KB")
- .exclude("INSTR")
.exclude("LOCATE")
.exclude("LPAD/RPAD")
.exclude("REPEAT")
.exclude("length for string / binary")
- .exclude("format_number / FormatNumber")
.exclude("ParseUrl")
.exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string is not a valid url")
- .exclude("Sentences")
.excludeGlutenTest("SPARK-40213: ascii for Latin-1 Supplement characters")
enableSuite[GlutenTryCastSuite]
.exclude("null cast")
diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 3147c7c3d..52e7ebcbd 100644
--- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -458,7 +458,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
.exclude("string regex_replace / regex_extract")
.exclude("string overlay function")
.exclude("binary overlay function")
- .exclude("string / binary substring function")
.exclude("string parse_url function")
enableSuite[GlutenSubquerySuite]
.exclude("SPARK-15370: COUNT bug in subquery in subquery in subquery")
@@ -474,58 +473,9 @@ class ClickHouseTestSettings extends BackendTestSettings {
enableSuite[GlutenXPathFunctionsSuite]
enableSuite[QueryTestSuite]
enableSuite[GlutenAnsiCastSuiteWithAnsiModeOff]
- .exclude("null cast")
.exclude("cast string to date")
- .exclude("cast string to timestamp")
- .exclude("cast from boolean")
- .exclude("cast from int")
- .exclude("cast from long")
- .exclude("cast from float")
- .exclude("cast from double")
- .exclude("cast from timestamp")
- .exclude("data type casting")
- .exclude("cast and add")
- .exclude("from decimal")
- .exclude("cast from array")
- .exclude("cast from map")
- .exclude("cast from struct")
- .exclude("cast struct with a timestamp field")
- .exclude("cast between string and interval")
- .exclude("cast string to boolean")
- .exclude("SPARK-20302 cast with same structure")
- .exclude("SPARK-22500: cast for struct should not generate codes beyond 64KB")
- .exclude("SPARK-27671: cast from nested null type in struct")
- .exclude("Process Infinity, -Infinity, NaN in case insensitive manner")
- .exclude("SPARK-22825 Cast array to string")
- .exclude("SPARK-33291: Cast array with null elements to string")
- .exclude("SPARK-22973 Cast map to string")
- .exclude("SPARK-22981 Cast struct to string")
- .exclude("SPARK-33291: Cast struct with null elements to string")
- .exclude("SPARK-34667: cast year-month interval to string")
- .exclude("SPARK-34668: cast day-time interval to string")
- .exclude("SPARK-35698: cast timestamp without time zone to string")
.exclude("SPARK-35711: cast timestamp without time zone to timestamp with local time zone")
- .exclude("SPARK-35716: cast timestamp without time zone to date type")
- .exclude("SPARK-35718: cast date type to timestamp without timezone")
- .exclude("SPARK-35719: cast timestamp with local time zone to timestamp without timezone")
- .exclude("SPARK-35720: cast string to timestamp without timezone")
- .exclude("SPARK-35112: Cast string to day-time interval")
- .exclude("SPARK-35111: Cast string to year-month interval")
- .exclude("SPARK-35820: Support cast DayTimeIntervalType in different fields")
.exclude("SPARK-35819: Support cast YearMonthIntervalType in different fields")
- .exclude("SPARK-35768: Take into account year-month interval fields in cast")
- .exclude("SPARK-35735: Take into account day-time interval fields in cast")
- .exclude("ANSI mode: Throw exception on casting out-of-range value to byte type")
- .exclude("ANSI mode: Throw exception on casting out-of-range value to short type")
- .exclude("ANSI mode: Throw exception on casting out-of-range value to int type")
- .exclude("ANSI mode: Throw exception on casting out-of-range value to long type")
- .exclude("Fast fail for cast string type to decimal type in ansi mode")
- .exclude("cast a timestamp before the epoch 1970-01-01 00:00:00Z")
- .exclude("cast from array III")
- .exclude("cast from map II")
- .exclude("cast from map III")
- .exclude("cast from struct II")
- .exclude("cast from struct III")
enableSuite[GlutenAnsiCastSuiteWithAnsiModeOn]
.exclude("null cast")
.exclude("cast string to date")
@@ -902,7 +852,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
.exclude("SPARK - 34814: LikeSimplification should handleNULL")
enableSuite[GlutenSortOrderExpressionsSuite].exclude("SortPrefix")
enableSuite[GlutenStringExpressionsSuite]
- .exclude("concat")
.exclude("StringComparison")
.exclude("Substring")
.exclude("string substring_index function")
@@ -911,24 +860,14 @@ class ClickHouseTestSettings extends BackendTestSettings {
.exclude("string for ascii")
.exclude("base64/unbase64 for string")
.exclude("encode/decode for string")
- .exclude("soundex unit test")
- .exclude("replace")
.exclude("overlay for string")
.exclude("overlay for byte array")
.exclude("translate")
- .exclude("FORMAT")
- .exclude("SPARK-22603: FormatString should not generate codes beyond 64KB")
- .exclude("INSTR")
.exclude("LOCATE")
- .exclude("LPAD/RPAD")
.exclude("REPEAT")
.exclude("length for string / binary")
- .exclude("format_number / FormatNumber")
- .exclude("ToNumber: positive tests")
- .exclude("ToNumber: negative tests (the input string does not match the format string)")
.exclude("ParseUrl")
.exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string is not a valid url")
- .exclude("Sentences")
enableSuite[GlutenTryCastSuite]
.exclude("null cast")
.exclude("cast string to date")
diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 07af1fa84..38ed2c534 100644
--- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -457,7 +457,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
.exclude("string regex_replace / regex_extract")
.exclude("string overlay function")
.exclude("binary overlay function")
- .exclude("string / binary substring function")
.exclude("string parse_url function")
enableSuite[GlutenSubquerySuite]
.exclude("SPARK-15370: COUNT bug in subquery in subquery in subquery")
@@ -756,7 +755,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
.exclude("SPARK - 34814: LikeSimplification should handleNULL")
enableSuite[GlutenSortOrderExpressionsSuite].exclude("SortPrefix")
enableSuite[GlutenStringExpressionsSuite]
- .exclude("concat")
.exclude("StringComparison")
.exclude("Substring")
.exclude("string substring_index function")
@@ -766,24 +764,14 @@ class ClickHouseTestSettings extends BackendTestSettings {
.exclude("base64/unbase64 for string")
.exclude("encode/decode for string")
.exclude("Levenshtein distance")
- .exclude("soundex unit test")
- .exclude("replace")
.exclude("overlay for string")
.exclude("overlay for byte array")
.exclude("translate")
- .exclude("FORMAT")
- .exclude("SPARK-22603: FormatString should not generate codes beyond 64KB")
- .exclude("INSTR")
.exclude("LOCATE")
- .exclude("LPAD/RPAD")
.exclude("REPEAT")
.exclude("length for string / binary")
- .exclude("format_number / FormatNumber")
- .exclude("ToNumber: positive tests")
- .exclude("ToNumber: negative tests (the input string does not match the format string)")
.exclude("ParseUrl")
.exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string is not a valid url")
- .exclude("Sentences")
enableSuite[GlutenDataSourceV2DataFrameSessionCatalogSuite]
enableSuite[GlutenDataSourceV2SQLSessionCatalogSuite]
enableSuite[GlutenDataSourceV2SQLSuiteV1Filter]
diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 07af1fa84..38ed2c534 100644
--- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -457,7 +457,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
.exclude("string regex_replace / regex_extract")
.exclude("string overlay function")
.exclude("binary overlay function")
- .exclude("string / binary substring function")
.exclude("string parse_url function")
enableSuite[GlutenSubquerySuite]
.exclude("SPARK-15370: COUNT bug in subquery in subquery in subquery")
@@ -756,7 +755,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
.exclude("SPARK - 34814: LikeSimplification should handleNULL")
enableSuite[GlutenSortOrderExpressionsSuite].exclude("SortPrefix")
enableSuite[GlutenStringExpressionsSuite]
- .exclude("concat")
.exclude("StringComparison")
.exclude("Substring")
.exclude("string substring_index function")
@@ -766,24 +764,14 @@ class ClickHouseTestSettings extends BackendTestSettings {
.exclude("base64/unbase64 for string")
.exclude("encode/decode for string")
.exclude("Levenshtein distance")
- .exclude("soundex unit test")
- .exclude("replace")
.exclude("overlay for string")
.exclude("overlay for byte array")
.exclude("translate")
- .exclude("FORMAT")
- .exclude("SPARK-22603: FormatString should not generate codes beyond 64KB")
- .exclude("INSTR")
.exclude("LOCATE")
- .exclude("LPAD/RPAD")
.exclude("REPEAT")
.exclude("length for string / binary")
- .exclude("format_number / FormatNumber")
- .exclude("ToNumber: positive tests")
- .exclude("ToNumber: negative tests (the input string does not match the format string)")
.exclude("ParseUrl")
.exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string is not a valid url")
- .exclude("Sentences")
enableSuite[GlutenDataSourceV2DataFrameSessionCatalogSuite]
enableSuite[GlutenDataSourceV2SQLSessionCatalogSuite]
enableSuite[GlutenDataSourceV2SQLSuiteV1Filter]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]