TEXT-32: Initial pass as LCS, needs better tests

Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/676a1a70
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/676a1a70
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/676a1a70

Branch: refs/heads/master
Commit: 676a1a70557b60e849bef86ccf565faff49b45e9
Parents: 4c29a73
Author: Rob Tompkins <[email protected]>
Authored: Tue Dec 20 20:35:40 2016 -0500
Committer: Rob Tompkins <[email protected]>
Committed: Thu Dec 22 15:34:14 2016 -0500

----------------------------------------------------------------------
 .../similarity/LongestCommonSubsequence.java    | 160 +++++++++++++++++++
 .../LongestCommonSubsequenceTest.java           |  36 +++++
 2 files changed, 196 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/676a1a70/src/main/java/org/apache/commons/text/similarity/LongestCommonSubsequence.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/commons/text/similarity/LongestCommonSubsequence.java
 
b/src/main/java/org/apache/commons/text/similarity/LongestCommonSubsequence.java
new file mode 100644
index 0000000..4e809c2
--- /dev/null
+++ 
b/src/main/java/org/apache/commons/text/similarity/LongestCommonSubsequence.java
@@ -0,0 +1,160 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+/**
+ * A similarity algorithm indicating the length of the longest common 
subsequence between two strings..
+ *
+ * <p>
+ * The Longest common subsequence algorithm returns the length of the longest 
subsequence that two strings have in
+ * common. Two strings that are entirely different, return a value of 0, and 
two strings that return a value
+ * of the commonly shared length implies that the strings are completely the 
same in value and position.
+ * <i>Note.</i>  Generally this algorithm is fairly inefficient, as for length 
<i>m</i>, <i>n</i> of the input
+ * <code>CharSequence</code>'s <code>left</code> and <code>right</code> 
respectively, the runtime of the
+ * algoirthm is <i>O(m*n)</i>.
+ * </p>
+ *
+ * <p>
+ * This implementation is based on the Longest Commons Substring algorithm
+ * from <a 
href="https://en.wikipedia.org/wiki/Longest_common_subsequence_problem";>
+ * https://en.wikipedia.org/wiki/Longest_common_subsequence_problem</a>.
+ * </p>
+ *
+ * <p>For further reading see:</p>
+ *
+ * <p>Lothaire, M. <i>Applied combinatorics on words</i>. New York: Cambridge 
U Press, 2005. <b>12-13</b></p>
+ *
+ * @since 1.0
+ */
+public class LongestCommonSubsequence implements SimilarityScore<Integer> {
+
+    public Integer apply(final CharSequence left, final CharSequence right) {
+        // Quick return for invalid inputs
+        if (left == null || right == null) {
+            throw new IllegalArgumentException("Inputs must not be null");
+        }
+        return 
longestCommonSubstringLengthArray(left,right)[left.length()][right.length()];
+    }
+
+    /**
+     *
+     * Computes the longestCommonSubsequence between the two strings given.
+     *
+     * <p>
+     * Note, a substring and
+     * subsequence are not necessarily the same thing. Indeed, 
<code>abcxyzqrs</code> and
+     * <code>xyzghfm</code> have both the same common substring and 
subsequence, namely <code>xyz</code>. However,
+     * <code>axbyczqrs</code> and <code>abcxyzqtv</code> have the longest 
common subsequence <code>xyzq</code> because a
+     * subsequence need not have adjacent characters.
+     * </p>
+     *
+     * <p>
+     * For reference, we give the definition of a subsequence for the reader: 
a <i>subsequence</i> is a sequence that can be
+     * derived from another sequence by deleting some elements without 
changing the order of the remaining elements.
+     * </p>
+     *
+     * @param left
+     * @param right
+     * @return lcsLengthArray
+     */
+    public CharSequence logestCommonSubsequence(final CharSequence left, final 
CharSequence right) {
+        // Quick return
+        if (left == null || right == null) {
+            throw new IllegalArgumentException("Inputs must not be null");
+        }
+        StringBuffer longestCommonSubstringArray = new 
StringBuffer(Math.max(left.length(), right.length()));
+        int[][] lcsLengthArray = longestCommonSubstringLengthArray(left, 
right);
+        int i = left.length() - 1;
+        int j = right.length() - 1;
+        int k = lcsLengthArray[left.length()][right.length()] - 1;
+        while (k >= 0) {
+            if (left.charAt(i) == right.charAt(j)) {
+                longestCommonSubstringArray.append(left.charAt(i));
+                i = i - 1;
+                j = j - 1;
+                k = k - 1;
+            } else if (lcsLengthArray[i + 1][j] < lcsLengthArray[i][j + 1]) {
+                i = i - 1;
+            } else {
+                j = j - 1;
+            }
+        }
+        return longestCommonSubstringArray;
+    }
+
+    /**
+     *
+     * Computes the lcsLengthArray for the sake of doing the actual lcs 
calculation. This is the
+     * dynamic programming portion of the algorithm, and is the reason for the 
runtime complexity being
+     * O(m*n), where m=left.length() and n=right.length().
+     *
+     * @param left
+     * @param right
+     * @return lcsLengthArray
+     */
+    public int[][] longestCommonSubstringLengthArray(final CharSequence left, 
final CharSequence right) {
+        int[][] lcsLengthArray = new int[left.length() + 1][right.length() + 
1];
+        for (int i=0; i < left.length(); i++) {
+            for (int j=0; j < right.length(); j++) {
+                if (i == 0) {
+                    lcsLengthArray[i][j] = 0;
+                }
+                if (j == 0) {
+                    lcsLengthArray[i][j] = 0;
+                }
+                if (left.charAt(i) == right.charAt(j)) {
+                    lcsLengthArray[i + 1][j + 1] = lcsLengthArray[i][j] + 1;
+                } else {
+                    lcsLengthArray[i + 1][j + 1] = Math.max(lcsLengthArray[i + 
1][j], lcsLengthArray[i][j + 1]);
+                }
+            }
+        }
+        return lcsLengthArray;
+    }
+
+
+    public int wikiLcs(CharSequence left, CharSequence right) {
+        int m = left.length();
+        int n = right.length();
+        char[] x = left.toString().toCharArray();
+        char[] y = right.toString().toCharArray();
+
+        int[][] c = new int[m + 1][n + 1];
+
+        for (int i = 0; i <= m; i++) {
+            c[i][0] = 0;
+        }
+
+        for (int j = 0; j <= n; j++) {
+            c[0][j] = 0;
+        }
+
+        for (int i = 1; i <= m; i++) {
+            for (int j = 1; j <= n; j++) {
+                if (x[i - 1] == y[j - 1]) {
+                    c[i][j] = c[i - 1][j - 1] + 1;
+
+                } else {
+                    c[i][j] = Math.max(c[i][j - 1], c[i - 1][j]);
+                }
+            }
+        }
+
+        return c[m][n];
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/676a1a70/src/test/java/org/apache/commons/text/similarity/LongestCommonSubsequenceTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/commons/text/similarity/LongestCommonSubsequenceTest.java
 
b/src/test/java/org/apache/commons/text/similarity/LongestCommonSubsequenceTest.java
new file mode 100644
index 0000000..9c50412
--- /dev/null
+++ 
b/src/test/java/org/apache/commons/text/similarity/LongestCommonSubsequenceTest.java
@@ -0,0 +1,36 @@
+package org.apache.commons.text.similarity;
+
+import org.junit.Test;
+
+/**
+ * Created by tompkicr on 12/20/16.
+ */
+public class LongestCommonSubsequenceTest {
+
+    @Test
+    public void testLongestCommonSubsequence() {
+        LongestCommonSubsequence lcs = new LongestCommonSubsequence();
+        CharSequence retVal = lcs.logestCommonSubsequence("abab","abba");
+        System.out.println("testLongestCommonSubstring:");
+        System.out.println("retVal: " + retVal);
+        for (int n=0; n < retVal.length(); n++){
+            System.out.print("isISOControl: " + 
Character.isISOControl(retVal.charAt(n)) + " ");
+            System.out.println(retVal.charAt(n));
+        }
+    }
+
+    @Test
+    public void testLcsLengthArray() {
+        LongestCommonSubsequence lcs = new LongestCommonSubsequence();
+        String left = "abab";
+        String right = "abba";
+        int[][] retVal = lcs.longestCommonSubstringLengthArray(left, right);
+        System.out.println("testLcsLengthArray:");
+        for (int i=0; i <= left.length(); i++){
+            for (int j=0; j <= right.length(); j++){
+                System.out.print(retVal[i][j] + " ");
+            }
+            System.out.print("\n");
+        }
+    }
+}

Reply via email to