This is an automated email from the ASF dual-hosted git repository.
hansva pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-hop.git
The following commit(s) were added to refs/heads/master by this push:
new 8b84906 HOP-3186 : Re-implement Damerau Levenshtein distance
calculation
new c16fb83 Merge pull request #1068 from mattcasters/master
8b84906 is described below
commit 8b84906224b252b1cb38b4a363f042445b369efa
Author: Matt Casters <[email protected]>
AuthorDate: Fri Sep 17 11:59:25 2021 +0200
HOP-3186 : Re-implement Damerau Levenshtein distance calculation
---
.../main/java/org/apache/hop/core/util/Utils.java | 152 +++++++++++----------
.../java/org/apache/hop/core/util/UtilsTest.java | 28 +++-
2 files changed, 107 insertions(+), 73 deletions(-)
diff --git a/core/src/main/java/org/apache/hop/core/util/Utils.java
b/core/src/main/java/org/apache/hop/core/util/Utils.java
index 0bc6da7..033e2b0 100644
--- a/core/src/main/java/org/apache/hop/core/util/Utils.java
+++ b/core/src/main/java/org/apache/hop/core/util/Utils.java
@@ -6,7 +6,7 @@
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -17,91 +17,99 @@
package org.apache.hop.core.util;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.lang3.math.NumberUtils;
import org.apache.hop.core.encryption.Encr;
import org.apache.hop.core.variables.IVariables;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
import java.util.concurrent.TimeUnit;
-/* Levenshtein in Java, originally from Josh Drew's code at
- * http://joshdrew.com/
- * Code from http://blog.lolyco.com
- *
- */
public class Utils {
- private static final int[] ZERO_LENGTH_INT_ARRAY = new int[0];
- private static int damerauLevenshteinDistance(String s, String t, int[]
workspace) {
- int lenS = s.length();
- int lenT = t.length();
- int lenS1 = lenS + 1;
- int lenT1 = lenT + 1;
- if (lenT1 == 1) {
- return lenS1 - 1;
- }
- if (lenS1 == 1) {
- return lenT1 - 1;
+ // Fairly simple algorithm implemented from pseudo-code algorithm documented
on Wikipedia:
+ //
+ // https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
+ //
+ public static int getDamerauLevenshteinDistance(String one, String two) {
+ // A few utility variables
+ //
+ int oneLength = one == null ? 0 : one.length();
+ int twoLength = two == null ? 0 : two.length();
+ int oneTwoLength = oneLength + twoLength;
+
+ // A bit of house-keeping for null values...
+ //
+ if (StringUtils.isEmpty(one)) {
+ if (StringUtils.isEmpty(two)) {
+ return 0;
+ } else {
+ return twoLength;
+ }
+ } else if (StringUtils.isEmpty(two)) {
+ return oneLength;
}
- int[] dl = workspace;
- int dlIndex = 0;
- int sPrevIndex = 0, tPrevIndex = 0, rowBefore = 0, min = 0, cost = 0, tmp
= 0;
- int tri = lenS1 + 2;
- // start row with constant
- dlIndex = 0;
- for (tmp = 0; tmp < lenT1; tmp++) {
- dl[dlIndex] = tmp;
- dlIndex += lenS1;
+
+ // A distancesMatrix to keep track of distances between characters.
+ // It explores all valid distances in the loop below
+ //
+ int distancesMatrix[][] = new int[oneLength + 2][twoLength + 2];
+
+ // Initialize the matrix
+ // The maximum possible length is the sum of both String lengths
+ // The minimum is obviously 0
+ //
+ distancesMatrix[0][0] = oneTwoLength;
+ for (int x = 0; x <= oneLength; x++) {
+ distancesMatrix[x + 1][1] = x;
+ distancesMatrix[x + 1][0] = oneTwoLength;
}
- for (int sIndex = 0; sIndex < lenS; sIndex++) {
- dlIndex = sIndex + 1;
- dl[dlIndex] = dlIndex; // start column with constant
- for (int tIndex = 0; tIndex < lenT; tIndex++) {
- rowBefore = dlIndex;
- dlIndex += lenS1;
- // deletion
- min = dl[rowBefore] + 1;
- // insertion
- tmp = dl[dlIndex - 1] + 1;
- if (tmp < min) {
- min = tmp;
- }
- cost = 1;
- if (s.charAt(sIndex) == t.charAt(tIndex)) {
- cost = 0;
- }
- if (sIndex > 0 && tIndex > 0) {
- if (s.charAt(sIndex) == t.charAt(tPrevIndex)
- && s.charAt(sPrevIndex) == t.charAt(tIndex)) {
- tmp = dl[rowBefore - tri] + cost;
- // transposition
- if (tmp < min) {
- min = tmp;
- }
- }
- }
- // substitution
- tmp = dl[rowBefore - 1] + cost;
- if (tmp < min) {
- min = tmp;
- }
- dl[dlIndex] = min;
- tPrevIndex = tIndex;
- }
- sPrevIndex = sIndex;
+ for (int y = 0; y <= twoLength; y++) {
+ distancesMatrix[1][y + 1] = y;
+ distancesMatrix[0][y + 1] = oneTwoLength;
}
- return dl[dlIndex];
- }
+ Map<Character, Integer> characterMap = new HashMap<>();
- private static int[] getWorkspace(int sl, int tl) {
- return new int[(sl + 1) * (tl + 1)];
- }
+ // Loop over string one and calculate distances with characters in string
two
+ //
+ for (int x = 1; x <= oneLength; x++) {
+ // A temporary index to have a reference point to compare with later
+ // to compare deletion, insertion, substitution and transposition
+ //
+ int tmpIndex = 0;
- public static int getDamerauLevenshteinDistance(String s, String t) {
- if (s != null && t != null) {
- return damerauLevenshteinDistance(s, t, getWorkspace(s.length(),
t.length()));
- } else {
- return damerauLevenshteinDistance(s, t, ZERO_LENGTH_INT_ARRAY);
+ // Loop over string two
+ //
+ for (int y = 1; y <= twoLength; y++) {
+ // 1-based index from pseudo-code
+ //
+ char charOne = one.charAt(x - 1);
+ char charTwo = two.charAt(y - 1);
+
+ int x1 = characterMap.getOrDefault(charTwo, 0);
+ int y1 = tmpIndex;
+
+ // Same characters: distance is 0
+ int distance = charOne == charTwo ? 0 : 1;
+ if (distance == 0) {
+ tmpIndex = y;
+ }
+
+ // Get the minimum of the 4 distances for deletion, insertion,
substitution and
+ // transposition
+ //
+ distancesMatrix[x + 1][y + 1] =
+ NumberUtils.min(
+ distancesMatrix[x][y] + distance,
+ distancesMatrix[x + 1][y] + 1,
+ distancesMatrix[x][y + 1] + 1,
+ distancesMatrix[x1][y1] + (x - x1 - 1) + 1 + (y - y1 - 1));
+ }
+ characterMap.put(one.charAt(x - 1), x);
}
+ return distancesMatrix[oneLength + 1][twoLength + 1];
}
/**
diff --git a/core/src/test/java/org/apache/hop/core/util/UtilsTest.java
b/core/src/test/java/org/apache/hop/core/util/UtilsTest.java
index be2e513..a6e20dc 100644
--- a/core/src/test/java/org/apache/hop/core/util/UtilsTest.java
+++ b/core/src/test/java/org/apache/hop/core/util/UtilsTest.java
@@ -6,7 +6,7 @@
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -203,4 +203,30 @@ public class UtilsTest {
assertArrayEquals(newI[0], i2);
assertTrue(newI[0] == i2); // If arrays are equal sized, it should return
original object
}
+
+ @Test
+ public void testDamerauLevenshteinDistance() throws Exception {
+ int distance = Utils.getDamerauLevenshteinDistance("Hello", "Hallow");
+ assertEquals(2, distance);
+ distance = Utils.getDamerauLevenshteinDistance("Green", "Grean");
+ assertEquals(1, distance);
+ distance = Utils.getDamerauLevenshteinDistance("Poker", "Powker");
+ assertEquals(1, distance);
+ distance = Utils.getDamerauLevenshteinDistance("Kettle", "Apache Hop");
+ assertEquals(9, distance);
+ distance = Utils.getDamerauLevenshteinDistance("A quick brown fox", "A
lazy dog");
+ assertEquals(13, distance);
+ distance = Utils.getDamerauLevenshteinDistance(null, "A string with length
23");
+ assertEquals(23, distance);
+ distance = Utils.getDamerauLevenshteinDistance("String length 16", null);
+ assertEquals(16, distance);
+ distance = Utils.getDamerauLevenshteinDistance(null, null);
+ assertEquals(0, distance);
+ distance = Utils.getDamerauLevenshteinDistance("", "A string with length
23");
+ assertEquals(23, distance);
+ distance = Utils.getDamerauLevenshteinDistance("String length 16", "");
+ assertEquals(16, distance);
+ distance = Utils.getDamerauLevenshteinDistance("", "");
+ assertEquals(0, distance);
+ }
}