http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/similarity/LevenshteinDetailedDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/similarity/LevenshteinDetailedDistance.java b/src/main/java/org/apache/commons/text/beta/similarity/LevenshteinDetailedDistance.java deleted file mode 100644 index ec69f1d..0000000 --- a/src/main/java/org/apache/commons/text/beta/similarity/LevenshteinDetailedDistance.java +++ /dev/null @@ -1,519 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.beta.similarity; - -import java.util.Arrays; - -/** - * An algorithm for measuring the difference between two character sequences. - * - * <p> - * This is the number of changes needed to change one sequence into another, - * where each change is a single character modification (deletion, insertion - * or substitution). - * </p> - * - * @since 1.0 - */ -public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResults> { - - /** - * Default instance. - */ - private static final LevenshteinDetailedDistance DEFAULT_INSTANCE = new LevenshteinDetailedDistance(); - /** - * Threshold. - */ - private final Integer threshold; - - /** - * <p> - * This returns the default instance that uses a version - * of the algorithm that does not use a threshold parameter. - * </p> - * - * @see LevenshteinDetailedDistance#getDefaultInstance() - */ - public LevenshteinDetailedDistance() { - this(null); - } - - /** - * If the threshold is not null, distance calculations will be limited to a maximum length. - * - * <p>If the threshold is null, the unlimited version of the algorithm will be used.</p> - * - * @param threshold If this is null then distances calculations will not be limited. This may not be negative. - */ - public LevenshteinDetailedDistance(final Integer threshold) { - if (threshold != null && threshold < 0) { - throw new IllegalArgumentException("Threshold must not be negative"); - } - this.threshold = threshold; - } - - /** - * <p>Find the Levenshtein distance between two Strings.</p> - * - * <p>A higher score indicates a greater distance.</p> - * - * <p>The previous implementation of the Levenshtein distance algorithm - * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> - * - * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError - * which can occur when my Java implementation is used with very large strings.<br> - * This implementation of the Levenshtein distance algorithm - * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> - * - * <pre> - * distance.apply(null, *) = IllegalArgumentException - * distance.apply(*, null) = IllegalArgumentException - * distance.apply("","") = 0 - * distance.apply("","a") = 1 - * distance.apply("aaapppp", "") = 7 - * distance.apply("frog", "fog") = 1 - * distance.apply("fly", "ant") = 3 - * distance.apply("elephant", "hippo") = 7 - * distance.apply("hippo", "elephant") = 7 - * distance.apply("hippo", "zzzzzzzz") = 8 - * distance.apply("hello", "hallo") = 1 - * </pre> - * - * @param left the first string, must not be null - * @param right the second string, must not be null - * @return result distance, or -1 - * @throws IllegalArgumentException if either String input {@code null} - */ - @Override - public LevenshteinResults apply(final CharSequence left, final CharSequence right) { - if (threshold != null) { - return limitedCompare(left, right, threshold); - } - return unlimitedCompare(left, right); - } - - /** - * Gets the default instance. - * - * @return the default instace - */ - public static LevenshteinDetailedDistance getDefaultInstance() { - return DEFAULT_INSTANCE; - } - - /** - * Gets the distance threshold. - * - * @return the distance threshold - */ - public Integer getThreshold() { - return threshold; - } - - /** - * Find the Levenshtein distance between two CharSequences if it's less than or - * equal to a given threshold. - * - * <p> - * This implementation follows from Algorithms on Strings, Trees and - * Sequences by Dan Gusfield and Chas Emerick's implementation of the - * Levenshtein distance algorithm from <a - * href="http://www.merriampark.com/ld.htm" - * >http://www.merriampark.com/ld.htm</a> - * </p> - * - * <pre> - * limitedCompare(null, *, *) = IllegalArgumentException - * limitedCompare(*, null, *) = IllegalArgumentException - * limitedCompare(*, *, -1) = IllegalArgumentException - * limitedCompare("","", 0) = 0 - * limitedCompare("aaapppp", "", 8) = 7 - * limitedCompare("aaapppp", "", 7) = 7 - * limitedCompare("aaapppp", "", 6)) = -1 - * limitedCompare("elephant", "hippo", 7) = 7 - * limitedCompare("elephant", "hippo", 6) = -1 - * limitedCompare("hippo", "elephant", 7) = 7 - * limitedCompare("hippo", "elephant", 6) = -1 - * </pre> - * - * @param left the first string, must not be null - * @param right the second string, must not be null - * @param threshold the target threshold, must not be negative - * @return result distance, or -1 - */ - private static LevenshteinResults limitedCompare(CharSequence left, - CharSequence right, - final int threshold) { //NOPMD - if (left == null || right == null) { - throw new IllegalArgumentException("Strings must not be null"); - } - if (threshold < 0) { - throw new IllegalArgumentException("Threshold must not be negative"); - } - - /* - * This implementation only computes the distance if it's less than or - * equal to the threshold value, returning -1 if it's greater. The - * advantage is performance: unbounded distance is O(nm), but a bound of - * k allows us to reduce it to O(km) time by only computing a diagonal - * stripe of width 2k + 1 of the cost table. It is also possible to use - * this to compute the unbounded Levenshtein distance by starting the - * threshold at 1 and doubling each time until the distance is found; - * this is O(dm), where d is the distance. - * - * One subtlety comes from needing to ignore entries on the border of - * our stripe eg. p[] = |#|#|#|* d[] = *|#|#|#| We must ignore the entry - * to the left of the leftmost member We must ignore the entry above the - * rightmost member - * - * Another subtlety comes from our stripe running off the matrix if the - * strings aren't of the same size. Since string s is always swapped to - * be the shorter of the two, the stripe will always run off to the - * upper right instead of the lower left of the matrix. - * - * As a concrete example, suppose s is of length 5, t is of length 7, - * and our threshold is 1. In this case we're going to walk a stripe of - * length 3. The matrix would look like so: - * - * <pre> - * 1 2 3 4 5 - * 1 |#|#| | | | - * 2 |#|#|#| | | - * 3 | |#|#|#| | - * 4 | | |#|#|#| - * 5 | | | |#|#| - * 6 | | | | |#| - * 7 | | | | | | - * </pre> - * - * Note how the stripe leads off the table as there is no possible way - * to turn a string of length 5 into one of length 7 in edit distance of - * 1. - * - * Additionally, this implementation decreases memory usage by using two - * single-dimensional arrays and swapping them back and forth instead of - * allocating an entire n by m matrix. This requires a few minor - * changes, such as immediately returning when it's detected that the - * stripe has run off the matrix and initially filling the arrays with - * large values so that entries we don't compute are ignored. - * - * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for - * some discussion. - */ - - int n = left.length(); // length of left - int m = right.length(); // length of right - - // if one string is empty, the edit distance is necessarily the length of the other - if (n == 0) { - return m <= threshold ? new LevenshteinResults(m, m, 0, 0) : new LevenshteinResults(-1, 0, 0, 0); - } else if (m == 0) { - return n <= threshold ? new LevenshteinResults(n, 0, n, 0) : new LevenshteinResults(-1, 0, 0, 0); - } - - boolean swapped = false; - if (n > m) { - // swap the two strings to consume less memory - final CharSequence tmp = left; - left = right; - right = tmp; - n = m; - m = right.length(); - swapped = true; - } - - int[] p = new int[n + 1]; // 'previous' cost array, horizontally - int[] d = new int[n + 1]; // cost array, horizontally - int[] tempD; // placeholder to assist in swapping p and d - final int[][] matrix = new int[m + 1][n + 1]; - - //filling the first row and first column values in the matrix - for (int index = 0; index <= n; index++) { - matrix[0][index] = index; - } - for (int index = 0; index <= m; index++) { - matrix[index][0] = index; - } - - // fill in starting table values - final int boundary = Math.min(n, threshold) + 1; - for (int i = 0; i < boundary; i++) { - p[i] = i; - } - // these fills ensure that the value above the rightmost entry of our - // stripe will be ignored in following loop iterations - Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE); - Arrays.fill(d, Integer.MAX_VALUE); - - // iterates through t - for (int j = 1; j <= m; j++) { - final char rightJ = right.charAt(j - 1); // jth character of right - d[0] = j; - - // compute stripe indices, constrain to array size - final int min = Math.max(1, j - threshold); - final int max = j > Integer.MAX_VALUE - threshold ? n : Math.min( - n, j + threshold); - - // the stripe may lead off of the table if s and t are of different sizes - if (min > max) { - return new LevenshteinResults(-1, 0, 0, 0); - } - - // ignore entry left of leftmost - if (min > 1) { - d[min - 1] = Integer.MAX_VALUE; - } - - // iterates through [min, max] in s - for (int i = min; i <= max; i++) { - if (left.charAt(i - 1) == rightJ) { - // diagonally left and up - d[i] = p[i - 1]; - } else { - // 1 + minimum of cell to the left, to the top, diagonally left and up - d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i - 1]); - } - matrix[j][i] = d[i]; - } - - // copy current distance counts to 'previous row' distance counts - tempD = p; - p = d; - d = tempD; - } - - // if p[n] is greater than the threshold, there's no guarantee on it being the correct distance - if (p[n] <= threshold) { - return findDetailedResults(left, right, matrix, swapped); - } - return new LevenshteinResults(-1, 0, 0, 0); - } - - /** - * <p>Find the Levenshtein distance between two Strings.</p> - * - * <p>A higher score indicates a greater distance.</p> - * - * <p>The previous implementation of the Levenshtein distance algorithm - * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> - * - * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError - * which can occur when my Java implementation is used with very large strings.<br> - * This implementation of the Levenshtein distance algorithm - * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> - * - * <pre> - * unlimitedCompare(null, *) = IllegalArgumentException - * unlimitedCompare(*, null) = IllegalArgumentException - * unlimitedCompare("","") = 0 - * unlimitedCompare("","a") = 1 - * unlimitedCompare("aaapppp", "") = 7 - * unlimitedCompare("frog", "fog") = 1 - * unlimitedCompare("fly", "ant") = 3 - * unlimitedCompare("elephant", "hippo") = 7 - * unlimitedCompare("hippo", "elephant") = 7 - * unlimitedCompare("hippo", "zzzzzzzz") = 8 - * unlimitedCompare("hello", "hallo") = 1 - * </pre> - * - * @param left the first String, must not be null - * @param right the second String, must not be null - * @return result distance, or -1 - * @throws IllegalArgumentException if either String input {@code null} - */ - private static LevenshteinResults unlimitedCompare(CharSequence left, CharSequence right) { - if (left == null || right == null) { - throw new IllegalArgumentException("Strings must not be null"); - } - - /* - The difference between this impl. and the previous is that, rather - than creating and retaining a matrix of size s.length() + 1 by t.length() + 1, - we maintain two single-dimensional arrays of length s.length() + 1. The first, d, - is the 'current working' distance array that maintains the newest distance cost - counts as we iterate through the characters of String s. Each time we increment - the index of String t we are comparing, d is copied to p, the second int[]. Doing so - allows us to retain the previous cost counts as required by the algorithm (taking - the minimum of the cost count to the left, up one, and diagonally up and to the left - of the current cost count being calculated). (Note that the arrays aren't really - copied anymore, just switched...this is clearly much better than cloning an array - or doing a System.arraycopy() each time through the outer loop.) - - Effectively, the difference between the two implementations is this one does not - cause an out of memory condition when calculating the LD over two very large strings. - */ - - int n = left.length(); // length of left - int m = right.length(); // length of right - - if (n == 0) { - return new LevenshteinResults(m, m, 0, 0); - } else if (m == 0) { - return new LevenshteinResults(n, 0, n, 0); - } - boolean swapped = false; - if (n > m) { - // swap the input strings to consume less memory - final CharSequence tmp = left; - left = right; - right = tmp; - n = m; - m = right.length(); - swapped = true; - } - - int[] p = new int[n + 1]; // 'previous' cost array, horizontally - int[] d = new int[n + 1]; // cost array, horizontally - int[] tempD; //placeholder to assist in swapping p and d - final int[][] matrix = new int[m + 1][n + 1]; - - // filling the first row and first column values in the matrix - for (int index = 0; index <= n; index++) { - matrix[0][index] = index; - } - for (int index = 0; index <= m; index++) { - matrix[index][0] = index; - } - - // indexes into strings left and right - int i; // iterates through left - int j; // iterates through right - - char rightJ; // jth character of right - - int cost; // cost - for (i = 0; i <= n; i++) { - p[i] = i; - } - - for (j = 1; j <= m; j++) { - rightJ = right.charAt(j - 1); - d[0] = j; - - for (i = 1; i <= n; i++) { - cost = left.charAt(i - 1) == rightJ ? 0 : 1; - // minimum of cell to the left+1, to the top+1, diagonally left and up +cost - d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost); - //filling the matrix - matrix[j][i] = d[i]; - } - - // copy current distance counts to 'previous row' distance counts - tempD = p; - p = d; - d = tempD; - } - return findDetailedResults(left, right, matrix, swapped); - } - - /** - * Finds count for each of the three [insert, delete, substitute] operations - * needed. This is based on the matrix formed based on the two character - * sequence. - * - * @param left character sequence which need to be converted from - * @param right character sequence which need to be converted to - * @param matrix two dimensional array containing - * @param swapped tells whether the value for left character sequence and right - * character sequence were swapped to save memory - * @return result object containing the count of insert, delete and substitute and total count needed - */ - private static LevenshteinResults findDetailedResults(final CharSequence left, - final CharSequence right, - final int[][] matrix, - final boolean swapped) { - - int delCount = 0; - int addCount = 0; - int subCount = 0; - - int rowIndex = right.length(); - int columnIndex = left.length(); - - int dataAtLeft = 0; - int dataAtTop = 0; - int dataAtDiagonal = 0; - int data = 0; - boolean deleted = false; - boolean added = false; - - while (rowIndex >= 0 && columnIndex >= 0) { - - if (columnIndex == 0) { - dataAtLeft = -1; - } else { - dataAtLeft = matrix[rowIndex][columnIndex - 1]; - } - if (rowIndex == 0) { - dataAtTop = -1; - } else { - dataAtTop = matrix[rowIndex - 1][columnIndex]; - } - if (rowIndex > 0 && columnIndex > 0) { - dataAtDiagonal = matrix[rowIndex - 1][columnIndex - 1]; - } else { - dataAtDiagonal = -1; - } - if (dataAtLeft == -1 && dataAtTop == -1 && dataAtDiagonal == -1) { - break; - } - data = matrix[rowIndex][columnIndex]; - - // case in which the character at left and right are the same, - // in this case none of the counters will be incremented. - if (columnIndex > 0 && rowIndex > 0 && left.charAt(columnIndex - 1) == right.charAt(rowIndex - 1)) { - columnIndex--; - rowIndex--; - continue; - } - - // handling insert and delete cases. - deleted = false; - added = false; - if (data - 1 == dataAtLeft && (data <= dataAtDiagonal && data <= dataAtTop) - || (dataAtDiagonal == -1 && dataAtTop == -1)) { // NOPMD - columnIndex--; - if (swapped) { - addCount++; - added = true; - } else { - delCount++; - deleted = true; - } - } else if (data - 1 == dataAtTop && (data <= dataAtDiagonal && data <= dataAtLeft) - || (dataAtDiagonal == -1 && dataAtLeft == -1)) { // NOPMD - rowIndex--; - if (swapped) { - delCount++; - deleted = true; - } else { - addCount++; - added = true; - } - } - - // substituted case - if (!added && !deleted) { - subCount++; - columnIndex--; - rowIndex--; - } - } - return new LevenshteinResults(addCount + delCount + subCount, addCount, delCount, subCount); - } -}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/similarity/LevenshteinDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/similarity/LevenshteinDistance.java b/src/main/java/org/apache/commons/text/beta/similarity/LevenshteinDistance.java deleted file mode 100644 index 1f7d9a8..0000000 --- a/src/main/java/org/apache/commons/text/beta/similarity/LevenshteinDistance.java +++ /dev/null @@ -1,396 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.beta.similarity; - -import java.util.Arrays; - -/** - * An algorithm for measuring the difference between two character sequences. - * - * <p> - * This is the number of changes needed to change one sequence into another, - * where each change is a single character modification (deletion, insertion - * or substitution). - * </p> - * - * <p> - * This code has been adapted from Apache Commons Lang 3.3. - * </p> - * - * @since 1.0 - */ -public class LevenshteinDistance implements EditDistance<Integer> { - - /** - * Default instance. - */ - private static final LevenshteinDistance DEFAULT_INSTANCE = new LevenshteinDistance(); - - /** - * Threshold. - */ - private final Integer threshold; - - /** - * <p> - * This returns the default instance that uses a version - * of the algorithm that does not use a threshold parameter. - * </p> - * - * @see LevenshteinDistance#getDefaultInstance() - */ - public LevenshteinDistance() { - this(null); - } - - /** - * <p> - * If the threshold is not null, distance calculations will be limited to a maximum length. - * If the threshold is null, the unlimited version of the algorithm will be used. - * </p> - * - * @param threshold - * If this is null then distances calculations will not be limited. - * This may not be negative. - */ - public LevenshteinDistance(final Integer threshold) { - if (threshold != null && threshold < 0) { - throw new IllegalArgumentException("Threshold must not be negative"); - } - this.threshold = threshold; - } - - /** - * <p>Find the Levenshtein distance between two Strings.</p> - * - * <p>A higher score indicates a greater distance.</p> - * - * <p>The previous implementation of the Levenshtein distance algorithm - * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> - * - * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError - * which can occur when my Java implementation is used with very large strings.<br> - * This implementation of the Levenshtein distance algorithm - * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> - * - * <pre> - * distance.apply(null, *) = IllegalArgumentException - * distance.apply(*, null) = IllegalArgumentException - * distance.apply("","") = 0 - * distance.apply("","a") = 1 - * distance.apply("aaapppp", "") = 7 - * distance.apply("frog", "fog") = 1 - * distance.apply("fly", "ant") = 3 - * distance.apply("elephant", "hippo") = 7 - * distance.apply("hippo", "elephant") = 7 - * distance.apply("hippo", "zzzzzzzz") = 8 - * distance.apply("hello", "hallo") = 1 - * </pre> - * - * @param left the first string, must not be null - * @param right the second string, must not be null - * @return result distance, or -1 - * @throws IllegalArgumentException if either String input {@code null} - */ - @Override - public Integer apply(final CharSequence left, final CharSequence right) { - if (threshold != null) { - return limitedCompare(left, right, threshold); - } - return unlimitedCompare(left, right); - } - - /** - * Gets the default instance. - * - * @return the default instace - */ - public static LevenshteinDistance getDefaultInstance() { - return DEFAULT_INSTANCE; - } - - /** - * Gets the distance threshold. - * - * @return the distance threshold - */ - public Integer getThreshold() { - return threshold; - } - - /** - * Find the Levenshtein distance between two CharSequences if it's less than or - * equal to a given threshold. - * - * <p> - * This implementation follows from Algorithms on Strings, Trees and - * Sequences by Dan Gusfield and Chas Emerick's implementation of the - * Levenshtein distance algorithm from <a - * href="http://www.merriampark.com/ld.htm" - * >http://www.merriampark.com/ld.htm</a> - * </p> - * - * <pre> - * limitedCompare(null, *, *) = IllegalArgumentException - * limitedCompare(*, null, *) = IllegalArgumentException - * limitedCompare(*, *, -1) = IllegalArgumentException - * limitedCompare("","", 0) = 0 - * limitedCompare("aaapppp", "", 8) = 7 - * limitedCompare("aaapppp", "", 7) = 7 - * limitedCompare("aaapppp", "", 6)) = -1 - * limitedCompare("elephant", "hippo", 7) = 7 - * limitedCompare("elephant", "hippo", 6) = -1 - * limitedCompare("hippo", "elephant", 7) = 7 - * limitedCompare("hippo", "elephant", 6) = -1 - * </pre> - * - * @param left the first string, must not be null - * @param right the second string, must not be null - * @param threshold the target threshold, must not be negative - * @return result distance, or -1 - */ - private static int limitedCompare(CharSequence left, CharSequence right, final int threshold) { // NOPMD - if (left == null || right == null) { - throw new IllegalArgumentException("Strings must not be null"); - } - if (threshold < 0) { - throw new IllegalArgumentException("Threshold must not be negative"); - } - - /* - * This implementation only computes the distance if it's less than or - * equal to the threshold value, returning -1 if it's greater. The - * advantage is performance: unbounded distance is O(nm), but a bound of - * k allows us to reduce it to O(km) time by only computing a diagonal - * stripe of width 2k + 1 of the cost table. It is also possible to use - * this to compute the unbounded Levenshtein distance by starting the - * threshold at 1 and doubling each time until the distance is found; - * this is O(dm), where d is the distance. - * - * One subtlety comes from needing to ignore entries on the border of - * our stripe eg. p[] = |#|#|#|* d[] = *|#|#|#| We must ignore the entry - * to the left of the leftmost member We must ignore the entry above the - * rightmost member - * - * Another subtlety comes from our stripe running off the matrix if the - * strings aren't of the same size. Since string s is always swapped to - * be the shorter of the two, the stripe will always run off to the - * upper right instead of the lower left of the matrix. - * - * As a concrete example, suppose s is of length 5, t is of length 7, - * and our threshold is 1. In this case we're going to walk a stripe of - * length 3. The matrix would look like so: - * - * <pre> - * 1 2 3 4 5 - * 1 |#|#| | | | - * 2 |#|#|#| | | - * 3 | |#|#|#| | - * 4 | | |#|#|#| - * 5 | | | |#|#| - * 6 | | | | |#| - * 7 | | | | | | - * </pre> - * - * Note how the stripe leads off the table as there is no possible way - * to turn a string of length 5 into one of length 7 in edit distance of - * 1. - * - * Additionally, this implementation decreases memory usage by using two - * single-dimensional arrays and swapping them back and forth instead of - * allocating an entire n by m matrix. This requires a few minor - * changes, such as immediately returning when it's detected that the - * stripe has run off the matrix and initially filling the arrays with - * large values so that entries we don't compute are ignored. - * - * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for - * some discussion. - */ - - int n = left.length(); // length of left - int m = right.length(); // length of right - - // if one string is empty, the edit distance is necessarily the length - // of the other - if (n == 0) { - return m <= threshold ? m : -1; - } else if (m == 0) { - return n <= threshold ? n : -1; - } - - if (n > m) { - // swap the two strings to consume less memory - final CharSequence tmp = left; - left = right; - right = tmp; - n = m; - m = right.length(); - } - - int[] p = new int[n + 1]; // 'previous' cost array, horizontally - int[] d = new int[n + 1]; // cost array, horizontally - int[] tempD; // placeholder to assist in swapping p and d - - // fill in starting table values - final int boundary = Math.min(n, threshold) + 1; - for (int i = 0; i < boundary; i++) { - p[i] = i; - } - // these fills ensure that the value above the rightmost entry of our - // stripe will be ignored in following loop iterations - Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE); - Arrays.fill(d, Integer.MAX_VALUE); - - // iterates through t - for (int j = 1; j <= m; j++) { - final char rightJ = right.charAt(j - 1); // jth character of right - d[0] = j; - - // compute stripe indices, constrain to array size - final int min = Math.max(1, j - threshold); - final int max = j > Integer.MAX_VALUE - threshold ? n : Math.min( - n, j + threshold); - - // the stripe may lead off of the table if s and t are of different - // sizes - if (min > max) { - return -1; - } - - // ignore entry left of leftmost - if (min > 1) { - d[min - 1] = Integer.MAX_VALUE; - } - - // iterates through [min, max] in s - for (int i = min; i <= max; i++) { - if (left.charAt(i - 1) == rightJ) { - // diagonally left and up - d[i] = p[i - 1]; - } else { - // 1 + minimum of cell to the left, to the top, diagonally - // left and up - d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i - 1]); - } - } - - // copy current distance counts to 'previous row' distance counts - tempD = p; - p = d; - d = tempD; - } - - // if p[n] is greater than the threshold, there's no guarantee on it - // being the correct - // distance - if (p[n] <= threshold) { - return p[n]; - } - return -1; - } - - /** - * <p>Find the Levenshtein distance between two Strings.</p> - * - * <p>A higher score indicates a greater distance.</p> - * - * <p>The previous implementation of the Levenshtein distance algorithm - * was from <a href="https://web.archive.org/web/20120526085419/http://www.merriampark.com/ldjava.htm"> - * https://web.archive.org/web/20120526085419/http://www.merriampark.com/ldjava.htm</a></p> - * - * <p>This implementation only need one single-dimensional arrays of length s.length() + 1</p> - * - * <pre> - * unlimitedCompare(null, *) = IllegalArgumentException - * unlimitedCompare(*, null) = IllegalArgumentException - * unlimitedCompare("","") = 0 - * unlimitedCompare("","a") = 1 - * unlimitedCompare("aaapppp", "") = 7 - * unlimitedCompare("frog", "fog") = 1 - * unlimitedCompare("fly", "ant") = 3 - * unlimitedCompare("elephant", "hippo") = 7 - * unlimitedCompare("hippo", "elephant") = 7 - * unlimitedCompare("hippo", "zzzzzzzz") = 8 - * unlimitedCompare("hello", "hallo") = 1 - * </pre> - * - * @param left the first String, must not be null - * @param right the second String, must not be null - * @return result distance, or -1 - * @throws IllegalArgumentException if either String input {@code null} - */ - private static int unlimitedCompare(CharSequence left, CharSequence right) { - if (left == null || right == null) { - throw new IllegalArgumentException("Strings must not be null"); - } - - /* - This implementation use two variable to record the previous cost counts, - So this implementation use less memory than previous impl. - */ - - int n = left.length(); // length of left - int m = right.length(); // length of right - - if (n == 0) { - return m; - } else if (m == 0) { - return n; - } - - if (n > m) { - // swap the input strings to consume less memory - final CharSequence tmp = left; - left = right; - right = tmp; - n = m; - m = right.length(); - } - - int[] p = new int[n + 1]; - - // indexes into strings left and right - int i; // iterates through left - int j; // iterates through right - int upperLeft; - int upper; - - char rightJ; // jth character of right - int cost; // cost - - for (i = 0; i <= n; i++) { - p[i] = i; - } - - for (j = 1; j <= m; j++) { - upperLeft = p[0]; - rightJ = right.charAt(j - 1); - p[0] = j; - - for (i = 1; i <= n; i++) { - upper = p[i]; - cost = left.charAt(i - 1) == rightJ ? 0 : 1; - // minimum of cell to the left+1, to the top+1, diagonally left and up +cost - p[i] = Math.min(Math.min(p[i - 1] + 1, p[i] + 1), upperLeft + cost); - upperLeft = upper; - } - } - - return p[n]; - } - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/similarity/LevenshteinResults.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/similarity/LevenshteinResults.java b/src/main/java/org/apache/commons/text/beta/similarity/LevenshteinResults.java deleted file mode 100644 index 4e0c272..0000000 --- a/src/main/java/org/apache/commons/text/beta/similarity/LevenshteinResults.java +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.beta.similarity; - -import java.util.Objects; - -/** - * Container class to store Levenshtein distance between two character sequences. - * - * <p>Stores the count of insert, deletion and substitute operations needed to - * change one character sequence into another.</p> - * - * <p>This class is immutable.</p> - * - * @since 1.0 - */ -public class LevenshteinResults { - /** - * Edit distance. - */ - private final Integer distance; - /** - * Insert character count. - */ - private final Integer insertCount; - /** - * Delete character count. - */ - private final Integer deleteCount; - /** - * Substitute character count. - */ - private final Integer substituteCount; - - /** - * Create the results for a detailed Levenshtein distance. - * - * @param distance distance between two character sequences. - * @param insertCount insert character count - * @param deleteCount delete character count - * @param substituteCount substitute character count - */ - public LevenshteinResults(final Integer distance, final Integer insertCount, final Integer deleteCount, - final Integer substituteCount) { - this.distance = distance; - this.insertCount = insertCount; - this.deleteCount = deleteCount; - this.substituteCount = substituteCount; - } - - /** - * Get the distance between two character sequences. - * - * @return distance between two character sequence - */ - public Integer getDistance() { - return distance; - } - - /** - * Get the number of insertion needed to change one character sequence into another. - * - * @return insert character count - */ - public Integer getInsertCount() { - return insertCount; - } - - /** - * Get the number of character deletion needed to change one character sequence to other. - * - * @return delete character count - */ - public Integer getDeleteCount() { - return deleteCount; - } - - /** - * Get the number of character substitution needed to change one character sequence into another. - * - * @return substitute character count - */ - public Integer getSubstituteCount() { - return substituteCount; - } - - @Override - public boolean equals(final Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - final LevenshteinResults result = (LevenshteinResults) o; - return Objects.equals(distance, result.distance) && Objects.equals(insertCount, result.insertCount) - && Objects.equals(deleteCount, result.deleteCount) - && Objects.equals(substituteCount, result.substituteCount); - } - - @Override - public int hashCode() { - return Objects.hash(distance, insertCount, deleteCount, substituteCount); - } - - @Override - public String toString() { - return "Distance: " + distance + ", Insert: " + insertCount + ", Delete: " + deleteCount + ", Substitute: " - + substituteCount; - } -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/similarity/LongestCommonSubsequence.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/similarity/LongestCommonSubsequence.java b/src/main/java/org/apache/commons/text/beta/similarity/LongestCommonSubsequence.java deleted file mode 100644 index b985a67..0000000 --- a/src/main/java/org/apache/commons/text/beta/similarity/LongestCommonSubsequence.java +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.beta.similarity; - -/** - * A similarity algorithm indicating the length of the longest common subsequence between two strings. - * - * <p> - * The Longest common subsequence algorithm returns the length of the longest subsequence that two strings have in - * common. Two strings that are entirely different, return a value of 0, and two strings that return a value - * of the commonly shared length implies that the strings are completely the same in value and position. - * <i>Note.</i> Generally this algorithm is fairly inefficient, as for length <i>m</i>, <i>n</i> of the input - * <code>CharSequence</code>'s <code>left</code> and <code>right</code> respectively, the runtime of the - * algorithm is <i>O(m*n)</i>. - * </p> - * - * <p> - * This implementation is based on the Longest Commons Substring algorithm - * from <a href="https://en.wikipedia.org/wiki/Longest_common_subsequence_problem"> - * https://en.wikipedia.org/wiki/Longest_common_subsequence_problem</a>. - * </p> - * - * <p>For further reading see:</p> - * - * <p>Lothaire, M. <i>Applied combinatorics on words</i>. New York: Cambridge U Press, 2005. <b>12-13</b></p> - * - * @since 1.0 - */ -public class LongestCommonSubsequence implements SimilarityScore<Integer> { - - /** - * Calculates longestCommonSubsequence similarity score of two <code>CharSequence</code>'s passed as - * input. - * - * @param left first character sequence - * @param right second character sequence - * @return longestCommonSubsequenceLength - * @throws IllegalArgumentException - * if either String input {@code null} - */ - @Override - public Integer apply(final CharSequence left, final CharSequence right) { - // Quick return for invalid inputs - if (left == null || right == null) { - throw new IllegalArgumentException("Inputs must not be null"); - } - return logestCommonSubsequence(left, right).length(); - } - - /** - * - * Computes the longestCommonSubsequence between the two <code>CharSequence</code>'s passed as - * input. - * - * <p> - * Note, a substring and - * subsequence are not necessarily the same thing. Indeed, <code>abcxyzqrs</code> and - * <code>xyzghfm</code> have both the same common substring and subsequence, namely <code>xyz</code>. However, - * <code>axbyczqrs</code> and <code>abcxyzqtv</code> have the longest common subsequence <code>xyzq</code> because a - * subsequence need not have adjacent characters. - * </p> - * - * <p> - * For reference, we give the definition of a subsequence for the reader: a <i>subsequence</i> is a sequence that - * can be derived from another sequence by deleting some elements without changing the order of the remaining - * elements. - * </p> - * - * @param left first character sequence - * @param right second character sequence - * @return lcsLengthArray - * @throws IllegalArgumentException - * if either String input {@code null} - */ - public CharSequence logestCommonSubsequence(final CharSequence left, final CharSequence right) { - // Quick return - if (left == null || right == null) { - throw new IllegalArgumentException("Inputs must not be null"); - } - StringBuilder longestCommonSubstringArray = new StringBuilder(Math.max(left.length(), right.length())); - int[][] lcsLengthArray = longestCommonSubstringLengthArray(left, right); - int i = left.length() - 1; - int j = right.length() - 1; - int k = lcsLengthArray[left.length()][right.length()] - 1; - while (k >= 0) { - if (left.charAt(i) == right.charAt(j)) { - longestCommonSubstringArray.append(left.charAt(i)); - i = i - 1; - j = j - 1; - k = k - 1; - } else if (lcsLengthArray[i + 1][j] < lcsLengthArray[i][j + 1]) { - i = i - 1; - } else { - j = j - 1; - } - } - return longestCommonSubstringArray.reverse().toString(); - } - - /** - * - * Computes the lcsLengthArray for the sake of doing the actual lcs calculation. This is the - * dynamic programming portion of the algorithm, and is the reason for the runtime complexity being - * O(m*n), where m=left.length() and n=right.length(). - * - * @param left first character sequence - * @param right second character sequence - * @return lcsLengthArray - */ - public int[][] longestCommonSubstringLengthArray(final CharSequence left, final CharSequence right) { - int[][] lcsLengthArray = new int[left.length() + 1][right.length() + 1]; - for (int i = 0; i < left.length(); i++) { - for (int j = 0; j < right.length(); j++) { - if (i == 0) { - lcsLengthArray[i][j] = 0; - } - if (j == 0) { - lcsLengthArray[i][j] = 0; - } - if (left.charAt(i) == right.charAt(j)) { - lcsLengthArray[i + 1][j + 1] = lcsLengthArray[i][j] + 1; - } else { - lcsLengthArray[i + 1][j + 1] = Math.max(lcsLengthArray[i + 1][j], lcsLengthArray[i][j + 1]); - } - } - } - return lcsLengthArray; - } - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/similarity/LongestCommonSubsequenceDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/similarity/LongestCommonSubsequenceDistance.java b/src/main/java/org/apache/commons/text/beta/similarity/LongestCommonSubsequenceDistance.java deleted file mode 100644 index 46a73a3..0000000 --- a/src/main/java/org/apache/commons/text/beta/similarity/LongestCommonSubsequenceDistance.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.beta.similarity; - -/** - * An edit distance algorithm based on the length of the longest common subsequence between two strings. - * - * <p> - * This code is directly based upon the implementation in {@link LongestCommonSubsequence}. - * </p> - * - * <p> - * For reference see: <a href="https://en.wikipedia.org/wiki/Longest_common_subsequence_problem"> - * https://en.wikipedia.org/wiki/Longest_common_subsequence_problem</a>. - * </p> - * - * <p>For further reading see:</p> - * - * <p>Lothaire, M. <i>Applied combinatorics on words</i>. New York: Cambridge U Press, 2005. <b>12-13</b></p> - * - * @since 1.0 - */ -public class LongestCommonSubsequenceDistance implements EditDistance<Integer> { - - /** - * Object for calculating the longest common subsequence that we can then normalize in apply. - */ - private final LongestCommonSubsequence longestCommonSubsequence = new LongestCommonSubsequence(); - - /** - * Calculates an edit distance between two <code>CharSequence</code>'s <code>left</code> and - * <code>right</code> as: <code>left.length() + right.length() - 2 * LCS(left, right)</code>, where - * <code>LCS</code> is given in {@link LongestCommonSubsequence#apply(CharSequence, CharSequence)}. - * - * @param left first character sequence - * @param right second character sequence - * @return distance - * @throws IllegalArgumentException - * if either String input {@code null} - */ - @Override - public Integer apply(final CharSequence left, final CharSequence right) { - // Quick return for invalid inputs - if (left == null || right == null) { - throw new IllegalArgumentException("Inputs must not be null"); - } - return left.length() + right.length() - 2 * longestCommonSubsequence.apply(left, right); - } - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/similarity/RegexTokenizer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/similarity/RegexTokenizer.java b/src/main/java/org/apache/commons/text/beta/similarity/RegexTokenizer.java deleted file mode 100644 index 014ace8..0000000 --- a/src/main/java/org/apache/commons/text/beta/similarity/RegexTokenizer.java +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.beta.similarity; - -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * A simple word tokenizer that utilizes regex to find words. It applies a regex - * {@code}(\w)+{@code} over the input text to extract words from a given character - * sequence. - * - * @since 1.0 - */ -class RegexTokenizer implements Tokenizer<CharSequence> { - - /** - * {@inheritDoc} - * - * @throws IllegalArgumentException if the input text is blank - */ - @Override - public CharSequence[] tokenize(final CharSequence text) { - if (text == null || text.toString().trim().equals("")) { - throw new IllegalArgumentException("Invalid text"); - } - final Pattern pattern = Pattern.compile("(\\w)+"); - final Matcher matcher = pattern.matcher(text.toString()); - final List<String> tokens = new ArrayList<>(); - while (matcher.find()) { - tokens.add(matcher.group(0)); - } - return tokens.toArray(new String[0]); - } - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/similarity/SimilarityScore.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/similarity/SimilarityScore.java b/src/main/java/org/apache/commons/text/beta/similarity/SimilarityScore.java deleted file mode 100644 index f92aefb..0000000 --- a/src/main/java/org/apache/commons/text/beta/similarity/SimilarityScore.java +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.beta.similarity; - -/** - * Interface for the concept of a string similarity score. - * - * <p> - * A string similarity score is intended to have <i>some</i> of the properties of a metric, yet - * allowing for exceptions, namely the Jaro-Winkler similarity score. - * </p> - * <p> - * We Define a SimilarityScore to be a function <code>d: [X * X] -> [0, INFINITY)</code> with the - * following properties: - * </p> - * <ul> - * <li><code>d(x,y) >= 0</code>, non-negativity or separation axiom</li> - * <li><code>d(x,y) == d(y,x)</code>, symmetry.</li> - * </ul> - * - * <p> - * Notice, these are two of the properties that contribute to d being a metric. - * </p> - * - * - * <p> - * Further, this intended to be BiFunction<CharSequence, CharSequence, R>. - * The <code>apply</code> method - * accepts a pair of {@link CharSequence} parameters - * and returns an <code>R</code> type similarity score. We have ommitted the explicit - * statement of extending BiFunction due to it only being implemented in Java 1.8, and we - * wish to maintain Java 1.7 compatibility. - * </p> - * - * @param <R> The type of similarity score unit used by this EditDistance. - * @since 1.0 - */ -public interface SimilarityScore<R> { - - /** - * Compares two CharSequences. - * - * @param left the first CharSequence - * @param right the second CharSequence - * @return the similarity score between two CharSequences - */ - R apply(CharSequence left, CharSequence right); - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/similarity/SimilarityScoreFrom.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/similarity/SimilarityScoreFrom.java b/src/main/java/org/apache/commons/text/beta/similarity/SimilarityScoreFrom.java deleted file mode 100644 index dd66f73..0000000 --- a/src/main/java/org/apache/commons/text/beta/similarity/SimilarityScoreFrom.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.beta.similarity; - -/** - * <p> - * This stores a {@link SimilarityScore} implementation and a {@link CharSequence} "left" string. - * The {@link #apply(CharSequence right)} method accepts the "right" string and invokes the - * comparison function for the pair of strings. - * </p> - * - * <p> - * The following is an example which finds the most similar string: - * </p> - * <pre> - * SimilarityScore<Integer> similarityScore = new LevenshteinDistance(); - * String target = "Apache"; - * SimilarityScoreFrom<Integer> similarityScoreFrom = - * new SimilarityScoreFrom<Integer>(similarityScore, target); - * String mostSimilar = null; - * Integer shortestDistance = null; - * - * for (String test : new String[] { "Appaloosa", "a patchy", "apple" }) { - * Integer distance = similarityScoreFrom.apply(test); - * if (shortestDistance == null || distance < shortestDistance) { - * shortestDistance = distance; - * mostSimilar = test; - * } - * } - * - * System.out.println("The string most similar to \"" + target + "\" " - * + "is \"" + mostSimilar + "\" because " - * + "its distance is only " + shortestDistance + "."); - * </pre> - * - * @param <R> This is the type of similarity score used by the SimilarityScore function. - * @since 1.0 - */ -public class SimilarityScoreFrom<R> { - - /** - * Similarity score. - */ - private final SimilarityScore<R> similarityScore; - /** - * Left parameter used in distance function. - */ - private final CharSequence left; - - /** - * <p>This accepts the similarity score implementation and the "left" string.</p> - * - * @param similarityScore This may not be null. - * @param left This may be null here, - * but the SimilarityScore#compare(CharSequence left, CharSequence right) - * implementation may not accept nulls. - */ - public SimilarityScoreFrom(final SimilarityScore<R> similarityScore, final CharSequence left) { - if (similarityScore == null) { - throw new IllegalArgumentException("The edit distance may not be null."); - } - - this.similarityScore = similarityScore; - this.left = left; - } - - /** - * <p> - * This compares "left" field against the "right" parameter - * using the "similarity score" implementation. - * </p> - * - * @param right the second CharSequence - * @return the similarity score between two CharSequences - */ - public R apply(final CharSequence right) { - return similarityScore.apply(left, right); - } - - /** - * Gets the left parameter. - * - * @return the left parameter - */ - public CharSequence getLeft() { - return left; - } - - /** - * Gets the edit distance. - * - * @return the edit distance - */ - public SimilarityScore<R> getSimilarityScore() { - return similarityScore; - } - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/similarity/Tokenizer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/similarity/Tokenizer.java b/src/main/java/org/apache/commons/text/beta/similarity/Tokenizer.java deleted file mode 100644 index 08e1ecc..0000000 --- a/src/main/java/org/apache/commons/text/beta/similarity/Tokenizer.java +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.beta.similarity; - -/** - * A tokenizer. Can produce arrays of tokens from a given type. - * - * @param <T> given type - * @since 1.0 - */ -interface Tokenizer<T> { - - /** - * Returns an array of tokens. - * - * @param text input text - * @return array of tokens - */ - T[] tokenize(CharSequence text); - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/similarity/package-info.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/similarity/package-info.java b/src/main/java/org/apache/commons/text/beta/similarity/package-info.java deleted file mode 100644 index de5a313..0000000 --- a/src/main/java/org/apache/commons/text/beta/similarity/package-info.java +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * <p>Provides algorithms for string similarity.</p> - * - * <p>The algorithms that implement the EditDistance interface follow the same - * simple principle: the more similar (closer) strings are, lower is the distance. - * For example, the words house and hose are closer than house and trousers.</p> - * - * <p>The following algorithms are available at the moment:</p> - * - * <ul> - * <li>{@link org.apache.commons.text.beta.similarity.CosineDistance Cosine Distance}</li> - * <li>{@link org.apache.commons.text.beta.similarity.CosineSimilarity Cosine Similarity}</li> - * <li>{@link org.apache.commons.text.beta.similarity.FuzzyScore Fuzzy Score}</li> - * <li>{@link org.apache.commons.text.beta.similarity.HammingDistance Hamming Distance}</li> - * <li>{@link org.apache.commons.text.beta.similarity.JaroWinklerDistance Jaro-Winkler Distance}</li> - * <li>{@link org.apache.commons.text.beta.similarity.LevenshteinDistance Levenshtein Distance}</li> - * <li>{@link org.apache.commons.text.beta.similarity.LongestCommonSubsequenceDistance - * Longest Commons Subsequence Distance}</li> - * </ul> - * - * <p>The {@link org.apache.commons.text.beta.similarity.CosineDistance Cosine Distance} - * utilises a {@link org.apache.commons.text.beta.similarity.RegexTokenizer regular expression tokenizer (\w+)}. - * And the {@link org.apache.commons.text.beta.similarity.LevenshteinDistance Levenshtein Distance}'s - * behaviour can be changed to take into consideration a maximum throughput.</p> - * - * @since 1.0 - */ -package org.apache.commons.text.beta.similarity; http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/translate/AggregateTranslator.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/translate/AggregateTranslator.java b/src/main/java/org/apache/commons/text/beta/translate/AggregateTranslator.java deleted file mode 100644 index 897790b..0000000 --- a/src/main/java/org/apache/commons/text/beta/translate/AggregateTranslator.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.beta.translate; - -import java.io.IOException; -import java.io.Writer; -import java.util.ArrayList; -import java.util.List; - -/** - * Executes a sequence of translators one after the other. Execution ends whenever - * the first translator consumes codepoints from the input. - * - * @since 1.0 - */ -public class AggregateTranslator extends CharSequenceTranslator { - - private final List<CharSequenceTranslator> translators = new ArrayList<>(); - - /** - * Specify the translators to be used at creation time. - * - * @param translators CharSequenceTranslator array to aggregate - */ - public AggregateTranslator(final CharSequenceTranslator... translators) { - if (translators != null) { - for (CharSequenceTranslator translator : translators) { - if (translator != null) { - this.translators.add(translator); - } - } - } - } - - /** - * The first translator to consume codepoints from the input is the 'winner'. - * Execution stops with the number of consumed codepoints being returned. - * {@inheritDoc} - */ - @Override - public int translate(final CharSequence input, final int index, final Writer out) throws IOException { - for (final CharSequenceTranslator translator : translators) { - final int consumed = translator.translate(input, index, out); - if(consumed != 0) { - return consumed; - } - } - return 0; - } - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/translate/CharSequenceTranslator.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/translate/CharSequenceTranslator.java b/src/main/java/org/apache/commons/text/beta/translate/CharSequenceTranslator.java deleted file mode 100644 index 1025528..0000000 --- a/src/main/java/org/apache/commons/text/beta/translate/CharSequenceTranslator.java +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.beta.translate; - -import java.io.IOException; -import java.io.StringWriter; -import java.io.Writer; -import java.util.Locale; - -/** - * An API for translating text. - * Its core use is to escape and unescape text. Because escaping and unescaping - * is completely contextual, the API does not present two separate signatures. - * - * @since 1.0 - */ -public abstract class CharSequenceTranslator { - - static final char[] HEX_DIGITS = new char[] {'0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F'}; - - /** - * Translate a set of codepoints, represented by an int index into a CharSequence, - * into another set of codepoints. The number of codepoints consumed must be returned, - * and the only IOExceptions thrown must be from interacting with the Writer so that - * the top level API may reliably ignore StringWriter IOExceptions. - * - * @param input CharSequence that is being translated - * @param index int representing the current point of translation - * @param out Writer to translate the text to - * @return int count of codepoints consumed - * @throws IOException if and only if the Writer produces an IOException - */ - public abstract int translate(CharSequence input, int index, Writer out) throws IOException; - - /** - * Helper for non-Writer usage. - * @param input CharSequence to be translated - * @return String output of translation - */ - public final String translate(final CharSequence input) { - if (input == null) { - return null; - } - try { - final StringWriter writer = new StringWriter(input.length() * 2); - translate(input, writer); - return writer.toString(); - } catch (final IOException ioe) { - // this should never ever happen while writing to a StringWriter - throw new RuntimeException(ioe); - } - } - - /** - * Translate an input onto a Writer. This is intentionally final as its algorithm is - * tightly coupled with the abstract method of this class. - * - * @param input CharSequence that is being translated - * @param out Writer to translate the text to - * @throws IOException if and only if the Writer produces an IOException - */ - public final void translate(final CharSequence input, final Writer out) throws IOException { - if (out == null) { - throw new IllegalArgumentException("The Writer must not be null"); - } - if (input == null) { - return; - } - int pos = 0; - final int len = input.length(); - while (pos < len) { - final int consumed = translate(input, pos, out); - if (consumed == 0) { - // inlined implementation of Character.toChars(Character.codePointAt(input, pos)) - // avoids allocating temp char arrays and duplicate checks - final char c1 = input.charAt(pos); - out.write(c1); - pos++; - if (Character.isHighSurrogate(c1) && pos < len) { - final char c2 = input.charAt(pos); - if (Character.isLowSurrogate(c2)) { - out.write(c2); - pos++; - } - } - continue; - } - // contract with translators is that they have to understand codepoints - // and they just took care of a surrogate pair - for (int pt = 0; pt < consumed; pt++) { - pos += Character.charCount(Character.codePointAt(input, pos)); - } - } - } - - /** - * Helper method to create a merger of this translator with another set of - * translators. Useful in customizing the standard functionality. - * - * @param translators CharSequenceTranslator array of translators to merge with this one - * @return CharSequenceTranslator merging this translator with the others - */ - public final CharSequenceTranslator with(final CharSequenceTranslator... translators) { - final CharSequenceTranslator[] newArray = new CharSequenceTranslator[translators.length + 1]; - newArray[0] = this; - System.arraycopy(translators, 0, newArray, 1, translators.length); - return new AggregateTranslator(newArray); - } - - /** - * <p>Returns an upper case hexadecimal <code>String</code> for the given - * character.</p> - * - * @param codepoint The codepoint to convert. - * @return An upper case hexadecimal <code>String</code> - */ - public static String hex(final int codepoint) { - return Integer.toHexString(codepoint).toUpperCase(Locale.ENGLISH); - } - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/translate/CodePointTranslator.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/translate/CodePointTranslator.java b/src/main/java/org/apache/commons/text/beta/translate/CodePointTranslator.java deleted file mode 100644 index 83386db..0000000 --- a/src/main/java/org/apache/commons/text/beta/translate/CodePointTranslator.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.beta.translate; - -import java.io.IOException; -import java.io.Writer; - -/** - * Helper subclass to CharSequenceTranslator to allow for translations that - * will replace up to one character at a time. - * - * @since 1.0 - */ -public abstract class CodePointTranslator extends CharSequenceTranslator { - - /** - * Implementation of translate that maps onto the abstract translate(int, Writer) method. - * {@inheritDoc} - */ - @Override - public final int translate(final CharSequence input, final int index, final Writer out) throws IOException { - final int codepoint = Character.codePointAt(input, index); - final boolean consumed = translate(codepoint, out); - return consumed ? 1 : 0; - } - - /** - * Translate the specified codepoint into another. - * - * @param codepoint int character input to translate - * @param out Writer to optionally push the translated output to - * @return boolean as to whether translation occurred or not - * @throws IOException if and only if the Writer produces an IOException - */ - public abstract boolean translate(int codepoint, Writer out) throws IOException; - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/beta/translate/CsvTranslators.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/translate/CsvTranslators.java b/src/main/java/org/apache/commons/text/beta/translate/CsvTranslators.java deleted file mode 100644 index e1dbf83..0000000 --- a/src/main/java/org/apache/commons/text/beta/translate/CsvTranslators.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.beta.translate; - -import org.apache.commons.lang3.CharUtils; -import org.apache.commons.lang3.StringUtils; - -import java.io.IOException; -import java.io.Writer; - -/** - * This class holds inner classes for escaping/unescaping Comma Separated Values. - */ -public class CsvTranslators { - - private static final char CSV_DELIMITER = ','; - private static final char CSV_QUOTE = '"'; - private static final String CSV_QUOTE_STR = String.valueOf(CSV_QUOTE); - private static final String CSV_ESCAPED_QUOTE_STR = CSV_QUOTE_STR + CSV_QUOTE_STR; - private static final char[] CSV_SEARCH_CHARS = - new char[] {CSV_DELIMITER, CSV_QUOTE, CharUtils.CR, CharUtils.LF}; - - private CsvTranslators() { } - - /** - * Translator for escaping Comma Separated Values. - */ - public static class CsvEscaper extends SinglePassTranslator { - - @Override - void translateWhole(final CharSequence input, final Writer out) throws IOException { - final String inputSting = input.toString(); - if (StringUtils.containsNone(inputSting, CSV_SEARCH_CHARS)) { - out.write(inputSting); - } else { - // input needs quoting - out.write(CSV_QUOTE); - out.write(StringUtils.replace(inputSting, CSV_QUOTE_STR, CSV_ESCAPED_QUOTE_STR)); - out.write(CSV_QUOTE); - } - } - } - - /** - * Translator for unescaping escaped Comma Separated Value entries. - */ - public static class CsvUnescaper extends SinglePassTranslator { - - @Override - void translateWhole(final CharSequence input, final Writer out) throws IOException { - // is input not quoted? - if (input.charAt(0) != CSV_QUOTE || input.charAt(input.length() - 1) != CSV_QUOTE) { - out.write(input.toString()); - return; - } - - // strip quotes - final String quoteless = input.subSequence(1, input.length() - 1).toString(); - - if (StringUtils.containsAny(quoteless, CSV_SEARCH_CHARS)) { - // deal with escaped quotes; ie) "" - out.write(StringUtils.replace(quoteless, CSV_ESCAPED_QUOTE_STR, CSV_QUOTE_STR)); - } else { - out.write(input.toString()); - } - } - } -} \ No newline at end of file