This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 583ab0500c79 [SPARK-47415][SQL] Add collation support for Levenshtein expression
583ab0500c79 is described below
commit 583ab0500c79bd3cf0146bc1f05f8a11ec37d9a5
Author: Uros Bojanic <[email protected]>
AuthorDate: Tue Jun 11 09:42:01 2024 -0700
[SPARK-47415][SQL] Add collation support for Levenshtein expression
### What changes were proposed in this pull request?
Introduce collation support for `levenshtein` string expression
(pass-through).
### Why are the changes needed?
Add collation support for Levenshtein expression in Spark.
### Does this PR introduce _any_ user-facing change?
Yes, users should now be able to use collated strings within arguments for the string function: levenshtein.
### How was this patch tested?
End-to-end SQL tests.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #46788 from uros-db/levenshtein.
Authored-by: Uros Bojanic <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../catalyst/expressions/stringExpressions.scala | 4 ++--
.../sql/CollationStringExpressionsSuite.scala | 22 ++++++++++++++++++++++
2 files changed, 24 insertions(+), 2 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 09ec501311ad..ac23962f41ed 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -2240,8 +2240,8 @@ case class Levenshtein(
}
override def inputTypes: Seq[AbstractDataType] = threshold match {
- case Some(_) => Seq(StringType, StringType, IntegerType)
- case _ => Seq(StringType, StringType)
+ case Some(_) => Seq(StringTypeAnyCollation, StringTypeAnyCollation, IntegerType)
+ case _ => Seq(StringTypeAnyCollation, StringTypeAnyCollation)
}
override def children: Seq[Expression] = threshold match {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
index db02946e3dfe..31be149b9c9c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
@@ -645,6 +645,28 @@ class CollationStringExpressionsSuite
})
}
+ test("Levenshtein string expression with collation") {
+ // Supported collations
+ case class LevenshteinTestCase(
+ left: String, right: String, collationName: String, threshold: Option[Int], result: Int
+ )
+ val testCases = Seq(
+ LevenshteinTestCase("kitten", "sitTing", "UTF8_BINARY", None, result = 4),
+ LevenshteinTestCase("kitten", "sitTing", "UTF8_BINARY_LCASE", None, result = 4),
+ LevenshteinTestCase("kitten", "sitTing", "UNICODE", Some(3), result = -1),
+ LevenshteinTestCase("kitten", "sitTing", "UNICODE_CI", Some(3), result = -1)
+ )
+ testCases.foreach(t => {
+ withSQLConf(SQLConf.DEFAULT_COLLATION.key -> t.collationName) {
+ val th = if (t.threshold.isDefined) s", ${t.threshold.get}" else ""
+ val query = s"select levenshtein('${t.left}', '${t.right}'$th)"
+ // Result & data type
+ checkAnswer(sql(query), Row(t.result))
+ assert(sql(query).schema.fields.head.dataType.sameType(IntegerType))
+ }
+ })
+ }
+
test("Support Left/Right/Substr with collation") {
case class SubstringTestCase(
method: String,
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]