This is an automated email from the ASF dual-hosted git repository.

jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 3b1b33db5d2 Add String functions for regex and distance (#17372)
3b1b33db5d2 is described below

commit 3b1b33db5d250c5068123380f02e1d6a6c3b138e
Author: Akanksha kedia <[email protected]>
AuthorDate: Sat Dec 20 04:15:10 2025 +0530

    Add String functions for regex and distance (#17372)
---
 .../common/function/scalar/StringFunctions.java    | 49 +++++++++++++++++
 .../function/scalar/StringFunctionsTest.java       | 64 ++++++++++++++++++++++
 2 files changed, 113 insertions(+)

diff --git 
a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
 
b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
index 69565c4a007..d09debc6faf 100644
--- 
a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
+++ 
b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
@@ -632,6 +632,55 @@ public class StringFunctions {
     return distance;
   }
 
+  /**
+   * Calculates the Levenshtein edit distance between two strings.
+   * The Levenshtein distance is the minimum number of single-character edits
+   * (insertions, deletions, or substitutions) needed to transform one string 
into another.
+   * This complements the existing hammingDistance function by handling 
strings of different lengths.
+   *
+   * @param input1 First string
+   * @param input2 Second string
+   * @return The Levenshtein distance between the two strings
+   */
+  @ScalarFunction
+  public static int levenshteinDistance(String input1, String input2) {
+    int len1 = input1.length();
+    int len2 = input2.length();
+
+    // If one string is empty, return the length of the other
+    if (len1 == 0) {
+      return len2;
+    }
+    if (len2 == 0) {
+      return len1;
+    }
+
+    // Create a matrix to store distances
+    int[][] dp = new int[len1 + 1][len2 + 1];
+
+    // Initialize first row and column
+    for (int i = 0; i <= len1; i++) {
+      dp[i][0] = i;
+    }
+    for (int j = 0; j <= len2; j++) {
+      dp[0][j] = j;
+    }
+
+    // Fill the matrix using dynamic programming
+    for (int i = 1; i <= len1; i++) {
+      for (int j = 1; j <= len2; j++) {
+        int cost = (input1.charAt(i - 1) == input2.charAt(j - 1)) ? 0 : 1;
+        dp[i][j] = Math.min(
+            Math.min(dp[i - 1][j] + 1,      // deletion
+                     dp[i][j - 1] + 1),     // insertion
+            dp[i - 1][j - 1] + cost         // substitution
+        );
+      }
+    }
+
+    return dp[len1][len2];
+  }
+
   /**
    * @see String#contains(CharSequence)
    * @param input
diff --git 
a/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java
 
b/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java
index 37abf1e2cfc..9a4aadbd587 100644
--- 
a/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java
+++ 
b/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java
@@ -94,6 +94,51 @@ public class StringFunctionsTest {
     };
   }
 
+  @DataProvider(name = "levenshteinDistanceTestCases")
+  public static Object[][] levenshteinDistanceTestCases() {
+    return new Object[][]{
+        // Basic test cases
+        {"", "", 0},
+        {"a", "", 1},
+        {"", "a", 1},
+        {"a", "a", 0},
+
+        // Classic examples
+        {"kitten", "sitting", 3},
+        {"saturday", "sunday", 3},
+        {"intention", "execution", 5},
+
+        // Single character operations
+        {"cat", "bat", 1}, // substitution
+        {"cat", "cats", 1}, // insertion
+        {"cats", "cat", 1}, // deletion
+
+        // More complex cases
+        {"book", "back", 2},
+        {"hello", "world", 4},
+        {"algorithm", "altruistic", 6},
+
+        // Edge cases with repeated characters
+        {"aaa", "aa", 1},
+        {"aa", "aaa", 1},
+        {"abc", "def", 3},
+
+        // Longer strings
+        {"abcdefghijklmnop", "1234567890123456", 16},
+        {"programming", "grammar", 6},
+
+        // Case sensitivity
+        {"Hello", "hello", 1},
+        {"WORLD", "world", 5},
+
+        // Special characters and numbers
+        {"test123", "test456", 3},
+        {"hello!", "hello?", 1},
+        {"[email protected]", "[email protected]", 1}
+    };
+  }
+
+
   @Test(dataProvider = "isJson")
   public void testIsJson(String input, boolean expectedValue) {
     assertEquals(StringFunctions.isJson(input), expectedValue);
@@ -115,6 +160,25 @@ public class StringFunctionsTest {
     assertEquals(StringFunctions.suffixesWithSuffix(input, length, "$"), 
expectedSuffixWithRegexChar);
   }
 
+  @Test(dataProvider = "levenshteinDistanceTestCases")
+  public void testLevenshteinDistance(String input1, String input2, int 
expectedDistance) {
+    assertEquals(StringFunctions.levenshteinDistance(input1, input2), 
expectedDistance);
+  }
+
+
+  @Test
+  public void testHammingDistance() {
+    // Test existing hammingDistance function for comparison
+    assertEquals(StringFunctions.hammingDistance("abc", "abc"), 0);
+    assertEquals(StringFunctions.hammingDistance("abc", "def"), 3);
+    assertEquals(StringFunctions.hammingDistance("abc", "aef"), 2);
+    assertEquals(StringFunctions.hammingDistance("abc", "abcd"), -1); // 
Different lengths
+
+    // Demonstrate the difference between hammingDistance and 
levenshteinDistance
+    assertEquals(StringFunctions.hammingDistance("cat", "cats"), -1); // 
Hamming can't handle different lengths
+    assertEquals(StringFunctions.levenshteinDistance("cat", "cats"), 1); // 
Levenshtein can handle different lengths
+  }
+
   @Test
   public void encodeUrl() {
     assertEquals(StringFunctions.encodeUrl(""), "");


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to