nikolamand-db commented on code in PR #45963:
URL: https://github.com/apache/spark/pull/45963#discussion_r1596897091
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##########
@@ -722,6 +722,65 @@ public static UTF8String execLowercase(
}
}
+ /**
+ * Utility class for collation aware Levenshtein function.
+ */
+ public static class Levenshtein{
+
+ /**
+ * Implementation of SubstringEquals interface for collation aware
comparison of two substrings.
+ */
+ private static class CollationSubstringEquals implements
UTF8String.SubstringEquals {
+ private final int collationId;
+ private final UTF8String left, right;
+
+ CollationSubstringEquals(int collationId) {
+ this.collationId = collationId;
+ this.left = new UTF8String();
+ this.right = new UTF8String();
+ }
+
+ @Override
+ public boolean equals(UTF8String left, UTF8String right, int posLeft,
int posRight,
+ int lenLeft, int lenRight) {
+ this.left.moveAddress(left, posLeft, lenLeft);
+ this.right.moveAddress(right, posRight, lenRight);
+ return CollationFactory.fetchCollation(collationId).equalsFunction
+ .apply(this.left, this.right);
+ }
+ }
+
+ public static Integer exec(final UTF8String left, final UTF8String right,
final int collationId){
+ CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
+
+ if (collation.supportsBinaryEquality){
+ return left.levenshteinDistance(right);
+ }
+ else{
+ return left.levenshteinDistance(right, new
CollationSubstringEquals(collationId));
+ }
+ }
+
+ public static Integer execWithThreshold(final UTF8String left, final
UTF8String right, final int threshold, final int collationId){
+ CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
+
+ if (collation.supportsBinaryEquality){
+ return left.levenshteinDistance(right, threshold);
+ }
+ else{
+ return left.levenshteinDistance(right, threshold, new
CollationSubstringEquals(collationId));
+ }
Review Comment:
This should also be done in the other places where it applies (both the
threshold and the non-threshold Levenshtein variants).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]