This is an automated email from the ASF dual-hosted git repository.

taiyangli pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new 138931e78 [GLUTEN-6813][CH] Support soundex function (#7093)
138931e78 is described below

commit 138931e78e3a138f9c42a218de084d33123db088
Author: 李扬 <[email protected]>
AuthorDate: Thu Sep 5 10:25:24 2024 +0800

    [GLUTEN-6813][CH] Support soundex function (#7093)
    
    * support soundex function
    
    * add uts
    
    * fix style
    
    * fix failed uts
---
 .../src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala   | 1 -
 .../execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala   | 6 ++++++
 .../Parser/scalar_function_parser/CommonScalarFunctionParser.cpp    | 1 +
 .../org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala | 3 +++
 .../org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala | 3 +++
 .../org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala | 3 +++
 .../org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala | 3 +++
 7 files changed, 19 insertions(+), 1 deletion(-)

diff --git 
a/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala
 
b/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala
index 1d0f13055..868e42a94 100644
--- 
a/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala
+++ 
b/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala
@@ -205,7 +205,6 @@ object CHExpressionUtil {
     URL_ENCODE -> DefaultValidator(),
     FORMAT_STRING -> FormatStringValidator(),
     SKEWNESS -> DefaultValidator(),
-    SOUNDEX -> DefaultValidator(),
     MAKE_YM_INTERVAL -> DefaultValidator(),
     MAP_ZIP_WITH -> DefaultValidator(),
     ZIP_WITH -> DefaultValidator(),
diff --git 
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala
 
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala
index 1db37e00f..49697872e 100644
--- 
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala
+++ 
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala
@@ -2936,5 +2936,11 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends 
GlutenClickHouseTPCHAbstr
         checkBHJWithIsNullAwareAntiJoin(df)
       })
   }
+
+  test("soundex") {
+    runQueryAndCompare("select soundex(c_mktsegment) from customer limit 50") {
+      checkGlutenOperatorMatch[ProjectExecTransformer]
+    }
+  }
 }
 // scalastyle:on line.size.limit
diff --git 
a/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp
 
b/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp
index ae654bd29..f7aea3157 100644
--- 
a/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp
+++ 
b/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp
@@ -134,6 +134,7 @@ REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Uuid, uuid, 
generateUUIDv4);
 REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Levenshtein, levenshtein, 
editDistanceUTF8);
 REGISTER_COMMON_SCALAR_FUNCTION_PARSER(FormatString, format_string, printf);
 REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Concat, concat, concat);
+REGISTER_COMMON_SCALAR_FUNCTION_PARSER(SoundEx, soundex, soundex);
 
 REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Crc32, crc32, CRC32);
 REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Murmur3Hash, murmur3hash, 
sparkMurmurHash3_32);
diff --git 
a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
 
b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index fb9ce5afb..6d5083dbe 100644
--- 
a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ 
b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -908,6 +908,9 @@ class ClickHouseTestSettings extends BackendTestSettings {
     .exclude("ParseUrl")
     .exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string 
is not a valid url")
     .exclude("FORMAT") // refer 
https://github.com/apache/incubator-gluten/issues/6765
+    .exclude(
+      "soundex unit test"
+    ) // CH and spark returns different results when input non-ASCII characters
     .excludeGlutenTest("SPARK-40213: ascii for Latin-1 Supplement characters")
   enableSuite[GlutenTryCastSuite]
     .exclude("null cast")
diff --git 
a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
 
b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 705f5beaf..de979ac27 100644
--- 
a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ 
b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -904,6 +904,9 @@ class ClickHouseTestSettings extends BackendTestSettings {
     .exclude("ParseUrl")
     .exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string 
is not a valid url")
     .exclude("FORMAT") // refer 
https://github.com/apache/incubator-gluten/issues/6765
+    .exclude(
+      "soundex unit test"
+    ) // CH and spark returns different results when input non-ASCII characters
   enableSuite[GlutenTryCastSuite]
     .exclude("null cast")
     .exclude("cast string to date")
diff --git 
a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
 
b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 5f30dea84..89a44c602 100644
--- 
a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ 
b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -808,6 +808,9 @@ class ClickHouseTestSettings extends BackendTestSettings {
     .exclude("ParseUrl")
     .exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string 
is not a valid url")
     .exclude("FORMAT") // refer 
https://github.com/apache/incubator-gluten/issues/6765
+    .exclude(
+      "soundex unit test"
+    ) // CH and spark returns different results when input non-ASCII characters
   enableSuite[GlutenDataSourceV2DataFrameSessionCatalogSuite]
   enableSuite[GlutenDataSourceV2SQLSessionCatalogSuite]
   enableSuite[GlutenDataSourceV2SQLSuiteV1Filter]
diff --git 
a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
 
b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 6a2241f7e..388036c55 100644
--- 
a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ 
b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -808,6 +808,9 @@ class ClickHouseTestSettings extends BackendTestSettings {
     .exclude("ParseUrl")
     .exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string 
is not a valid url")
     .exclude("FORMAT") // refer 
https://github.com/apache/incubator-gluten/issues/6765
+    .exclude(
+      "soundex unit test"
+    ) // CH and spark returns different results when input non-ASCII characters
   enableSuite[GlutenDataSourceV2DataFrameSessionCatalogSuite]
   enableSuite[GlutenDataSourceV2SQLSessionCatalogSuite]
   enableSuite[GlutenDataSourceV2SQLSuiteV1Filter]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to