TEXT-32: LCS distance calculation
Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/288781ef Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/288781ef Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/288781ef Branch: refs/heads/master Commit: 288781eff3725fd55c1ddece95be9f22fddfd324 Parents: 46bb20a Author: Rob Tompkins <[email protected]> Authored: Thu Dec 22 15:55:27 2016 -0500 Committer: Rob Tompkins <[email protected]> Committed: Thu Dec 22 15:55:27 2016 -0500 ---------------------------------------------------------------------- .../LongestCommonSubsequenceDistance.java | 61 ++++++++++++++++++ .../LongestCommonSubsequenceDistanceTest.java | 68 ++++++++++++++++++++ .../LongestCommonSubsequenceTest.java | 26 ++++++-- 3 files changed, 150 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-text/blob/288781ef/src/main/java/org/apache/commons/text/similarity/LongestCommonSubsequenceDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/LongestCommonSubsequenceDistance.java b/src/main/java/org/apache/commons/text/similarity/LongestCommonSubsequenceDistance.java new file mode 100644 index 0000000..bac02de --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/LongestCommonSubsequenceDistance.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +/** + * An edit distance algorithm based on the length of the longest common subsequence between two strings. + * + * <p> + * This code is directly based upon the implementation in {@link LongestCommonSubsequence}. + * </p> + * + * <p> + * For reference see: <a href="https://en.wikipedia.org/wiki/Longest_common_subsequence_problem"> + * https://en.wikipedia.org/wiki/Longest_common_subsequence_problem</a>. + * </p> + * + * <p>For further reading see:</p> + * + * <p>Lothaire, M. <i>Applied combinatorics on words</i>. New York: Cambridge U Press, 2005. <b>12-13</b></p> + * + * @since 1.0 + */ +public class LongestCommonSubsequenceDistance implements EditDistance<Integer> { + + private final LongestCommonSubsequence longestCommonSubsequence = new LongestCommonSubsequence(); + + /** + * Calculates an edit distance between two <code>CharSequence</code>'s <code>left</code> and + * <code>right</code> as: <code>left.length() + right.length() - 2 * LCS(left, right)</code>, where + * <code>LCS</code> is given in {@link LongestCommonSubsequence#apply(CharSequence, CharSequence)}. + * + * @param left first character sequence + * @param right second character sequence + * @return distance + * @throws IllegalArgumentException + * if either String input {@code null} + */ + @Override + public Integer apply(final CharSequence left, final CharSequence right) { + // Quick return for invalid inputs + if (left == null || right == null) { + throw new IllegalArgumentException("Inputs must not be null"); + } + return left.length() + right.length() - 2 * longestCommonSubsequence.apply(left, right); + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/288781ef/src/test/java/org/apache/commons/text/similarity/LongestCommonSubsequenceDistanceTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/text/similarity/LongestCommonSubsequenceDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/LongestCommonSubsequenceDistanceTest.java new file mode 100644 index 0000000..a107b47 --- /dev/null +++ b/src/test/java/org/apache/commons/text/similarity/LongestCommonSubsequenceDistanceTest.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import org.junit.BeforeClass; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +/** + * Unit tests for {@link org.apache.commons.text.similarity.LongestCommonSubsequenceDistance}. + */ +public class LongestCommonSubsequenceDistanceTest { + + private static LongestCommonSubsequenceDistance subject; + + @BeforeClass + public static void setup() { + subject = new LongestCommonSubsequenceDistance(); + } + + @Test + public void testGettingLogestCommonSubsequenceDistacne() { + assertEquals(Integer.valueOf(0), subject.apply("", "")); + assertEquals(Integer.valueOf(4), subject.apply("left", "")); + assertEquals(Integer.valueOf(5), subject.apply("", "right")); + assertEquals(Integer.valueOf(1), subject.apply("frog", "fog")); + assertEquals(Integer.valueOf(6), subject.apply("fly", "ant")); + assertEquals(Integer.valueOf(11), subject.apply("elephant", "hippo")); + assertEquals(Integer.valueOf(7), subject.apply("ABC Corporation", "ABC Corp")); + assertEquals(Integer.valueOf(4), subject.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.")); + assertEquals(Integer.valueOf(9), subject.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness")); + assertEquals(Integer.valueOf(3), subject.apply("PENNSYLVANIA", "PENNCISYLVNIA")); + assertEquals(Integer.valueOf(7), subject.apply("left", "right")); + assertEquals(Integer.valueOf(9), subject.apply("leettteft", "ritttght")); + assertEquals(Integer.valueOf(0), subject.apply("the same string", "the same string")); + } + + @Test(expected = IllegalArgumentException.class) + public void testGettingLongestCommonSubsequenceDistanceNullNull() throws Exception { + subject.apply(null, null); + } + + @Test(expected = IllegalArgumentException.class) + public void testGettingLongestCommonSubsequenceDistanceStringNull() throws Exception { + subject.apply(" ", null); + } + + @Test(expected = IllegalArgumentException.class) + public void testGettingLongestCommonSubsequenceDistanceNullString() throws Exception { + subject.apply(null, "right"); + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/288781ef/src/test/java/org/apache/commons/text/similarity/LongestCommonSubsequenceTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/text/similarity/LongestCommonSubsequenceTest.java b/src/test/java/org/apache/commons/text/similarity/LongestCommonSubsequenceTest.java index e852884..c5f43ba 100644 --- a/src/test/java/org/apache/commons/text/similarity/LongestCommonSubsequenceTest.java +++ b/src/test/java/org/apache/commons/text/similarity/LongestCommonSubsequenceTest.java @@ -1,19 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.commons.text.similarity; -import org.junit.Before; +import org.junit.BeforeClass; import org.junit.Test; import static org.junit.Assert.assertEquals; /** - * Created by tompkicr on 12/20/16. + * Unit tests for {@link org.apache.commons.text.similarity.LongestCommonSubsequence}. */ public class LongestCommonSubsequenceTest { - private LongestCommonSubsequence subject; + private static LongestCommonSubsequence subject; - @Before - public void setup() { + @BeforeClass + public static void setup() { subject = new LongestCommonSubsequence(); }
