This is an automated email from the ASF dual-hosted git repository.
jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 3b1b33db5d2 Add String functions for regex and distance (#17372)
3b1b33db5d2 is described below
commit 3b1b33db5d250c5068123380f02e1d6a6c3b138e
Author: Akanksha kedia <[email protected]>
AuthorDate: Sat Dec 20 04:15:10 2025 +0530
Add String functions for regex and distance (#17372)
---
.../common/function/scalar/StringFunctions.java | 49 +++++++++++++++++
.../function/scalar/StringFunctionsTest.java | 64 ++++++++++++++++++++++
2 files changed, 113 insertions(+)
diff --git
a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
index 69565c4a007..d09debc6faf 100644
---
a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
+++
b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
@@ -632,6 +632,55 @@ public class StringFunctions {
return distance;
}
+ /**
+ * Calculates the Levenshtein edit distance between two strings.
+ * The Levenshtein distance is the minimum number of single-character edits
+ * (insertions, deletions, or substitutions) needed to transform one string
into another.
+ * This complements the existing hammingDistance function by handling
strings of different lengths.
+ *
+ * @param input1 First string
+ * @param input2 Second string
+ * @return The Levenshtein distance between the two strings
+ */
+ @ScalarFunction
+ public static int levenshteinDistance(String input1, String input2) {
+ int len1 = input1.length();
+ int len2 = input2.length();
+
+ // If one string is empty, return the length of the other
+ if (len1 == 0) {
+ return len2;
+ }
+ if (len2 == 0) {
+ return len1;
+ }
+
+ // Create a matrix to store distances
+ int[][] dp = new int[len1 + 1][len2 + 1];
+
+ // Initialize first row and column
+ for (int i = 0; i <= len1; i++) {
+ dp[i][0] = i;
+ }
+ for (int j = 0; j <= len2; j++) {
+ dp[0][j] = j;
+ }
+
+ // Fill the matrix using dynamic programming
+ for (int i = 1; i <= len1; i++) {
+ for (int j = 1; j <= len2; j++) {
+ int cost = (input1.charAt(i - 1) == input2.charAt(j - 1)) ? 0 : 1;
+ dp[i][j] = Math.min(
+ Math.min(dp[i - 1][j] + 1, // deletion
+ dp[i][j - 1] + 1), // insertion
+ dp[i - 1][j - 1] + cost // substitution
+ );
+ }
+ }
+
+ return dp[len1][len2];
+ }
+
/**
* @see String#contains(CharSequence)
* @param input
diff --git
a/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java
b/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java
index 37abf1e2cfc..9a4aadbd587 100644
---
a/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java
+++
b/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java
@@ -94,6 +94,51 @@ public class StringFunctionsTest {
};
}
+ @DataProvider(name = "levenshteinDistanceTestCases")
+ public static Object[][] levenshteinDistanceTestCases() {
+ return new Object[][]{
+ // Basic test cases
+ {"", "", 0},
+ {"a", "", 1},
+ {"", "a", 1},
+ {"a", "a", 0},
+
+ // Classic examples
+ {"kitten", "sitting", 3},
+ {"saturday", "sunday", 3},
+ {"intention", "execution", 5},
+
+ // Single character operations
+ {"cat", "bat", 1}, // substitution
+ {"cat", "cats", 1}, // insertion
+ {"cats", "cat", 1}, // deletion
+
+ // More complex cases
+ {"book", "back", 2},
+ {"hello", "world", 4},
+ {"algorithm", "altruistic", 6},
+
+ // Edge cases with repeated characters
+ {"aaa", "aa", 1},
+ {"aa", "aaa", 1},
+ {"abc", "def", 3},
+
+ // Longer strings
+ {"abcdefghijklmnop", "1234567890123456", 16},
+ {"programming", "grammar", 6},
+
+ // Case sensitivity
+ {"Hello", "hello", 1},
+ {"WORLD", "world", 5},
+
+ // Special characters and numbers
+ {"test123", "test456", 3},
+ {"hello!", "hello?", 1},
+ {"[email protected]", "[email protected]", 1}
+ };
+ }
+
+
@Test(dataProvider = "isJson")
public void testIsJson(String input, boolean expectedValue) {
assertEquals(StringFunctions.isJson(input), expectedValue);
@@ -115,6 +160,25 @@ public class StringFunctionsTest {
assertEquals(StringFunctions.suffixesWithSuffix(input, length, "$"),
expectedSuffixWithRegexChar);
}
+ @Test(dataProvider = "levenshteinDistanceTestCases")
+ public void testLevenshteinDistance(String input1, String input2, int
expectedDistance) {
+ assertEquals(StringFunctions.levenshteinDistance(input1, input2),
expectedDistance);
+ }
+
+
+ @Test
+ public void testHammingDistance() {
+ // Test existing hammingDistance function for comparison
+ assertEquals(StringFunctions.hammingDistance("abc", "abc"), 0);
+ assertEquals(StringFunctions.hammingDistance("abc", "def"), 3);
+ assertEquals(StringFunctions.hammingDistance("abc", "aef"), 2);
+ assertEquals(StringFunctions.hammingDistance("abc", "abcd"), -1); //
Different lengths
+
+ // Demonstrate the difference between hammingDistance and
levenshteinDistance
+ assertEquals(StringFunctions.hammingDistance("cat", "cats"), -1); //
Hamming can't handle different lengths
+ assertEquals(StringFunctions.levenshteinDistance("cat", "cats"), 1); //
Levenshtein can handle different lengths
+ }
+
@Test
public void encodeUrl() {
assertEquals(StringFunctions.encodeUrl(""), "");
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]