Repository: commons-text Updated Branches: refs/heads/master 6d69377a2 -> a37bead30
TEXT-22: Incorporating https://github.com/apache/commons-lang/commit/103b64a373256feae6ca85f2bf220e7694e48fa4#diff-3f4c835875ddc8a211c0272e8be51394 Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/a37bead3 Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/a37bead3 Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/a37bead3 Branch: refs/heads/master Commit: a37bead303adb298f59f76389d5704bef6ff183d Parents: 6d69377 Author: Rob Tompkins <chtom...@gmail.com> Authored: Thu Dec 1 15:31:37 2016 -0800 Committer: Rob Tompkins <chtom...@gmail.com> Committed: Thu Dec 1 15:31:37 2016 -0800 ---------------------------------------------------------------------- .../text/similarity/LevenshteinDistance.java | 45 ++++++-------------- 1 file changed, 13 insertions(+), 32 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-text/blob/a37bead3/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java index e8628be..01ec3dc 100644 --- a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java +++ b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java @@ -307,12 +307,10 @@ public class LevenshteinDistance implements EditDistance<Integer> { * <p>A higher score indicates a greater distance.</p> * * <p>The previous implementation of the Levenshtein distance algorithm - * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> + * was from <a href="https://web.archive.org/web/20120526085419/http://www.merriampark.com/ldjava.htm"> + * https://web.archive.org/web/20120526085419/http://www.merriampark.com/ldjava.htm</a></p> * - * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError - * which can occur when my Java implementation is used with very large strings.<br> - * This implementation of the Levenshtein distance algorithm - * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> + * <p>This implementation only need one single-dimensional arrays of length s.length() + 1</p> * * <pre> * unlimitedCompare(null, *) = IllegalArgumentException @@ -339,20 +337,8 @@ public class LevenshteinDistance implements EditDistance<Integer> { } /* - The difference between this impl. and the previous is that, rather - than creating and retaining a matrix of size s.length() + 1 by t.length() + 1, - we maintain two single-dimensional arrays of length s.length() + 1. The first, d, - is the 'current working' distance array that maintains the newest distance cost - counts as we iterate through the characters of String s. Each time we increment - the index of String t we are comparing, d is copied to p, the second int[]. Doing so - allows us to retain the previous cost counts as required by the algorithm (taking - the minimum of the cost count to the left, up one, and diagonally up and to the left - of the current cost count being calculated). (Note that the arrays aren't really - copied anymore, just switched...this is clearly much better than cloning an array - or doing a System.arraycopy() each time through the outer loop.) - - Effectively, the difference between the two implementations is this one does not - cause an out of memory condition when calculating the LD over two very large strings. + This implementation use two variable to record the previous cost counts, + So this implementation use less memory than previous impl. */ int n = left.length(); // length of left @@ -373,16 +359,15 @@ public class LevenshteinDistance implements EditDistance<Integer> { m = right.length(); } - int[] p = new int[n + 1]; //'previous' cost array, horizontally - int[] d = new int[n + 1]; // cost array, horizontally - int[] tempD; //placeholder to assist in swapping p and d + int[] p = new int[n + 1]; // indexes into strings left and right int i; // iterates through left int j; // iterates through right + int upper_left; + int upper; char rightJ; // jth character of right - int cost; // cost for (i = 0; i <= n; i++) { @@ -390,23 +375,19 @@ public class LevenshteinDistance implements EditDistance<Integer> { } for (j = 1; j <= m; j++) { + upper_left = p[0]; rightJ = right.charAt(j - 1); - d[0] = j; + p[0] = j; for (i = 1; i <= n; i++) { + upper = p[i]; cost = left.charAt(i - 1) == rightJ ? 0 : 1; // minimum of cell to the left+1, to the top+1, diagonally left and up +cost - d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost); + p[i] = Math.min(Math.min(p[i - 1] + 1, p[i] + 1), upper_left + cost); + upper_left = upper; } - - // copy current distance counts to 'previous row' distance counts - tempD = p; - p = d; - d = tempD; } - // our last action in the above loop was to switch d and p, so p now - // actually has the most recent cost counts return p[n]; }