This is an automated email from the ASF dual-hosted git repository.
taiyangli pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 138931e78 [GLUTEN-6813][CH] Support soundex function (#7093)
138931e78 is described below
commit 138931e78e3a138f9c42a218de084d33123db088
Author: 李扬 <[email protected]>
AuthorDate: Thu Sep 5 10:25:24 2024 +0800
[GLUTEN-6813][CH] Support soundex function (#7093)
* support soundex function
* add uts
* fix style
* fix failed uts
---
.../src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala | 1 -
.../execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala | 6 ++++++
.../Parser/scalar_function_parser/CommonScalarFunctionParser.cpp | 1 +
.../org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala | 3 +++
.../org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala | 3 +++
.../org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala | 3 +++
.../org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala | 3 +++
7 files changed, 19 insertions(+), 1 deletion(-)
diff --git
a/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala
b/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala
index 1d0f13055..868e42a94 100644
---
a/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala
+++
b/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala
@@ -205,7 +205,6 @@ object CHExpressionUtil {
URL_ENCODE -> DefaultValidator(),
FORMAT_STRING -> FormatStringValidator(),
SKEWNESS -> DefaultValidator(),
- SOUNDEX -> DefaultValidator(),
MAKE_YM_INTERVAL -> DefaultValidator(),
MAP_ZIP_WITH -> DefaultValidator(),
ZIP_WITH -> DefaultValidator(),
diff --git
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala
index 1db37e00f..49697872e 100644
---
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala
+++
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala
@@ -2936,5 +2936,11 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends
GlutenClickHouseTPCHAbstr
checkBHJWithIsNullAwareAntiJoin(df)
})
}
+
+ test("soundex") {
+ runQueryAndCompare("select soundex(c_mktsegment) from customer limit 50") {
+ checkGlutenOperatorMatch[ProjectExecTransformer]
+ }
+ }
}
// scalastyle:on line.size.limit
diff --git
a/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp
b/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp
index ae654bd29..f7aea3157 100644
---
a/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp
+++
b/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp
@@ -134,6 +134,7 @@ REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Uuid, uuid,
generateUUIDv4);
REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Levenshtein, levenshtein,
editDistanceUTF8);
REGISTER_COMMON_SCALAR_FUNCTION_PARSER(FormatString, format_string, printf);
REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Concat, concat, concat);
+REGISTER_COMMON_SCALAR_FUNCTION_PARSER(SoundEx, soundex, soundex);
REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Crc32, crc32, CRC32);
REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Murmur3Hash, murmur3hash,
sparkMurmurHash3_32);
diff --git
a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index fb9ce5afb..6d5083dbe 100644
---
a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++
b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -908,6 +908,9 @@ class ClickHouseTestSettings extends BackendTestSettings {
.exclude("ParseUrl")
.exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string
is not a valid url")
.exclude("FORMAT") // refer
https://github.com/apache/incubator-gluten/issues/6765
+ .exclude(
+ "soundex unit test"
+ ) // CH and spark returns different results when input non-ASCII characters
.excludeGlutenTest("SPARK-40213: ascii for Latin-1 Supplement characters")
enableSuite[GlutenTryCastSuite]
.exclude("null cast")
diff --git
a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 705f5beaf..de979ac27 100644
---
a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++
b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -904,6 +904,9 @@ class ClickHouseTestSettings extends BackendTestSettings {
.exclude("ParseUrl")
.exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string
is not a valid url")
.exclude("FORMAT") // refer
https://github.com/apache/incubator-gluten/issues/6765
+ .exclude(
+ "soundex unit test"
+ ) // CH and spark returns different results when input non-ASCII characters
enableSuite[GlutenTryCastSuite]
.exclude("null cast")
.exclude("cast string to date")
diff --git
a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 5f30dea84..89a44c602 100644
---
a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++
b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -808,6 +808,9 @@ class ClickHouseTestSettings extends BackendTestSettings {
.exclude("ParseUrl")
.exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string
is not a valid url")
.exclude("FORMAT") // refer
https://github.com/apache/incubator-gluten/issues/6765
+ .exclude(
+ "soundex unit test"
+ ) // CH and spark returns different results when input non-ASCII characters
enableSuite[GlutenDataSourceV2DataFrameSessionCatalogSuite]
enableSuite[GlutenDataSourceV2SQLSessionCatalogSuite]
enableSuite[GlutenDataSourceV2SQLSuiteV1Filter]
diff --git
a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 6a2241f7e..388036c55 100644
---
a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++
b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -808,6 +808,9 @@ class ClickHouseTestSettings extends BackendTestSettings {
.exclude("ParseUrl")
.exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string
is not a valid url")
.exclude("FORMAT") // refer
https://github.com/apache/incubator-gluten/issues/6765
+ .exclude(
+ "soundex unit test"
+ ) // CH and spark returns different results when input non-ASCII characters
enableSuite[GlutenDataSourceV2DataFrameSessionCatalogSuite]
enableSuite[GlutenDataSourceV2SQLSessionCatalogSuite]
enableSuite[GlutenDataSourceV2SQLSuiteV1Filter]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]