This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 583ab0500c79 [SPARK-47415][SQL] Add collation support for Levenshtein expression
583ab0500c79 is described below
commit 583ab0500c79bd3cf0146bc1f05f8a11ec37d9a5
Author: Uros Bojanic <[email protected]>
AuthorDate: Tue Jun 11 09:42:01 2024 -0700
[SPARK-47415][SQL] Add collation support for Levenshtein expression
### What changes were proposed in this pull request?
Introduce collation support for `levenshtein` string expression
(pass-through).
### Why are the changes needed?
Add collation support for Levenshtein expression in Spark.
### Does this PR introduce _any_ user-facing change?
Yes, users should now be able to use collated strings within arguments for the string function: levenshtein.
### How was this patch tested?
End-to-end SQL tests.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #46788 from uros-db/levenshtein.
Authored-by: Uros Bojanic <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../catalyst/expressions/stringExpressions.scala | 4 ++--
.../sql/CollationStringExpressionsSuite.scala | 22 ++++++++++++++++++++++
2 files changed, 24 insertions(+), 2 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 09ec501311ad..ac23962f41ed 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -2240,8 +2240,8 @@ case class Levenshtein(
}
override def inputTypes: Seq[AbstractDataType] = threshold match {
- case Some(_) => Seq(StringType, StringType, IntegerType)
- case _ => Seq(StringType, StringType)
+ case Some(_) => Seq(StringTypeAnyCollation, StringTypeAnyCollation, IntegerType)
+ case _ => Seq(StringTypeAnyCollation, StringTypeAnyCollation)
}
override def children: Seq[Expression] = threshold match {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
index db02946e3dfe..31be149b9c9c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
@@ -645,6 +645,28 @@ class CollationStringExpressionsSuite
})
}
+ test("Levenshtein string expression with collation") {
+ // Supported collations
+ case class LevenshteinTestCase(
+ left: String, right: String, collationName: String, threshold: Option[Int], result: Int
+ )
+ val testCases = Seq(
+ LevenshteinTestCase("kitten", "sitTing", "UTF8_BINARY", None, result = 4),
+ LevenshteinTestCase("kitten", "sitTing", "UTF8_BINARY_LCASE", None, result = 4),
+ LevenshteinTestCase("kitten", "sitTing", "UNICODE", Some(3), result = -1),
+ LevenshteinTestCase("kitten", "sitTing", "UNICODE_CI", Some(3), result = -1)
+ )
+ testCases.foreach(t => {
+ withSQLConf(SQLConf.DEFAULT_COLLATION.key -> t.collationName) {
+ val th = if (t.threshold.isDefined) s", ${t.threshold.get}" else ""
+ val query = s"select levenshtein('${t.left}', '${t.right}'$th)"
+ // Result & data type
+ checkAnswer(sql(query), Row(t.result))
+ assert(sql(query).schema.fields.head.dataType.sameType(IntegerType))
+ }
+ })
+ }
+
test("Support Left/Right/Substr with collation") {
case class SubstringTestCase(
method: String,
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]