http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/diff/EditScript.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/diff/EditScript.java b/src/main/java/org/apache/commons/text/diff/EditScript.java deleted file mode 100644 index bf4b185..0000000 --- a/src/main/java/org/apache/commons/text/diff/EditScript.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.diff; - -import java.util.ArrayList; -import java.util.List; - -/** - * This class gathers all the {@link EditCommand commands} needed to transform - * one objects sequence into another objects sequence. - * <p> - * An edit script is the most general view of the differences between two - * sequences. It is built as the result of the comparison between two sequences - * by the {@link StringsComparator StringsComparator} class. The user can - * walk through it using the <em>visitor</em> design pattern. - * <p> - * It is guaranteed that the objects embedded in the {@link InsertCommand insert - * commands} come from the second sequence and that the objects embedded in - * either the {@link DeleteCommand delete commands} or {@link KeepCommand keep - * commands} come from the first sequence. This can be important if subclassing - * is used for some elements in the first sequence and the <code>equals</code> - * method is specialized. - * - * @see StringsComparator - * @see EditCommand - * @see CommandVisitor - * @see ReplacementsHandler - * - * @param <T> object type - * @since 1.0 - */ -public class EditScript<T> { - - /** Container for the commands. */ - private final List<EditCommand<T>> commands; - - /** Length of the longest common subsequence. */ - private int lcsLength; - - /** Number of modifications. */ - private int modifications; - - /** - * Simple constructor. Creates a new empty script. - */ - public EditScript() { - commands = new ArrayList<>(); - lcsLength = 0; - modifications = 0; - } - - /** - * Add a keep command to the script. - * - * @param command command to add - */ - public void append(final KeepCommand<T> command) { - commands.add(command); - ++lcsLength; - } - - /** - * Add an insert command to the script. - * - * @param command command to add - */ - public void append(final InsertCommand<T> command) { - commands.add(command); - ++modifications; - } - - /** - * Add a delete command to the script. - * - * @param command command to add - */ - public void append(final DeleteCommand<T> command) { - commands.add(command); - ++modifications; - } - - /** - * Visit the script. The script implements the <em>visitor</em> design - * pattern, this method is the entry point to which the user supplies its - * own visitor, the script will be responsible to drive it through the - * commands in order and call the appropriate method as each command is - * encountered. - * - * @param visitor the visitor that will visit all commands in turn - */ - public void visit(final CommandVisitor<T> visitor) { - for (final EditCommand<T> command : commands) { - command.accept(visitor); - } - } - - /** - * Get the length of the Longest Common Subsequence (LCS). The length of the - * longest common subsequence is the number of {@link KeepCommand keep - * commands} in the script. - * - * @return length of the Longest Common Subsequence - */ - public int getLCSLength() { - return lcsLength; - } - - /** - * Get the number of effective modifications. The number of effective - * modification is the number of {@link DeleteCommand delete} and - * {@link InsertCommand insert} commands in the script. - * - * @return number of effective modifications - */ - public int getModifications() { - return modifications; - } - -}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/diff/InsertCommand.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/diff/InsertCommand.java b/src/main/java/org/apache/commons/text/diff/InsertCommand.java deleted file mode 100644 index f0337dc..0000000 --- a/src/main/java/org/apache/commons/text/diff/InsertCommand.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.diff; - -/** - * Command representing the insertion of one object of the second sequence. - * <p> - * When one object of the second sequence has no corresponding object in the - * first sequence at the right place, the {@link EditScript edit script} - * transforming the first sequence into the second sequence uses an instance of - * this class to represent the insertion of this object. The objects embedded in - * these type of commands always come from the second sequence. - * </p> - * - * @see StringsComparator - * @see EditScript - * - * @param <T> object type - * @since 1.0 - */ -public class InsertCommand<T> extends EditCommand<T> { - - /** - * Simple constructor. Creates a new instance of InsertCommand - * - * @param object the object of the second sequence that should be inserted - */ - public InsertCommand(final T object) { - super(object); - } - - /** - * Accept a visitor. When an <code>InsertCommand</code> accepts a visitor, - * it calls its {@link CommandVisitor#visitInsertCommand visitInsertCommand} - * method. - * - * @param visitor the visitor to be accepted - */ - @Override - public void accept(final CommandVisitor<T> visitor) { - visitor.visitInsertCommand(getObject()); - } - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/diff/KeepCommand.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/diff/KeepCommand.java b/src/main/java/org/apache/commons/text/diff/KeepCommand.java deleted file mode 100644 index 34c6fe7..0000000 --- a/src/main/java/org/apache/commons/text/diff/KeepCommand.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.diff; - -/** - * Command representing the keeping of one object present in both sequences. - * <p> - * When one object of the first sequence <code>equals</code> another objects in - * the second sequence at the right place, the {@link EditScript edit script} - * transforming the first sequence into the second sequence uses an instance of - * this class to represent the keeping of this object. The objects embedded in - * these type of commands always come from the first sequence. - * </p> - * - * @see StringsComparator - * @see EditScript - * - * @param <T> object type - * @since 1.0 - */ -public class KeepCommand<T> extends EditCommand<T> { - - /** - * Simple constructor. Creates a new instance of KeepCommand - * - * @param object the object belonging to both sequences (the object is a - * reference to the instance in the first sequence which is known - * to be equal to an instance in the second sequence) - */ - public KeepCommand(final T object) { - super(object); - } - - /** - * Accept a visitor. When a <code>KeepCommand</code> accepts a visitor, it - * calls its {@link CommandVisitor#visitKeepCommand visitKeepCommand} method. - * - * @param visitor the visitor to be accepted - */ - @Override - public void accept(final CommandVisitor<T> visitor) { - visitor.visitKeepCommand(getObject()); - } -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/diff/ReplacementsFinder.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/diff/ReplacementsFinder.java b/src/main/java/org/apache/commons/text/diff/ReplacementsFinder.java deleted file mode 100644 index 46f1b88..0000000 --- a/src/main/java/org/apache/commons/text/diff/ReplacementsFinder.java +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.diff; - -import java.util.ArrayList; -import java.util.List; - -/** - * This class handles sequences of replacements resulting from a comparison. - * <p> - * The comparison of two objects sequences leads to the identification of common - * parts and parts which only belong to the first or to the second sequence. The - * common parts appear in the edit script in the form of <em>keep</em> commands, - * they can be considered as synchronization objects between the two sequences. - * These synchronization objects split the two sequences in synchronized - * sub-sequences. The first sequence can be transformed into the second one by - * replacing each synchronized sub-sequence of the first sequence by the - * corresponding sub-sequence of the second sequence. This is a synthetic way to - * see an {@link EditScript edit script}, replacing individual - * {@link DeleteCommand delete}, {@link KeepCommand keep} and - * {@link InsertCommand insert} commands by fewer replacements acting on - * complete sub-sequences. - * </p> - * <p> - * This class is devoted to perform this interpretation. It visits an - * {@link EditScript edit script} (because it implements the - * {@link CommandVisitor CommandVisitor} interface) and calls a user-supplied - * handler implementing the {@link ReplacementsHandler ReplacementsHandler} - * interface to process the sub-sequences. - * </p> - * - * @see ReplacementsHandler - * @see EditScript - * @see StringsComparator - * - * @param <T> object type - * @since 1.0 - */ -public class ReplacementsFinder<T> implements CommandVisitor<T> { - - /** - * List of pending insertions. - */ - private final List<T> pendingInsertions; - /** - * List of pending deletions. - */ - private final List<T> pendingDeletions; - /** - * Count of elements skipped. - */ - private int skipped; - - /** Handler to call when synchronized sequences are found. */ - private final ReplacementsHandler<T> handler; - - /** - * Simple constructor. Creates a new instance of {@link ReplacementsFinder}. - * - * @param handler handler to call when synchronized sequences are found - */ - public ReplacementsFinder(final ReplacementsHandler<T> handler) { - pendingInsertions = new ArrayList<>(); - pendingDeletions = new ArrayList<>(); - skipped = 0; - this.handler = handler; - } - - /** - * Add an object to the pending insertions set. - * - * @param object object to insert - */ - @Override - public void visitInsertCommand(final T object) { - pendingInsertions.add(object); - } - - /** - * Handle a synchronization object. - * <p> - * When a synchronization object is identified, the pending insertions and - * pending deletions sets are provided to the user handler as subsequences. - * </p> - * - * @param object synchronization object detected - */ - @Override - public void visitKeepCommand(final T object) { - if (pendingDeletions.isEmpty() && pendingInsertions.isEmpty()) { - ++skipped; - } else { - handler.handleReplacement(skipped, pendingDeletions, pendingInsertions); - pendingDeletions.clear(); - pendingInsertions.clear(); - skipped = 1; - } - } - - /** - * Add an object to the pending deletions set. - * - * @param object object to delete - */ - @Override - public void visitDeleteCommand(final T object) { - pendingDeletions.add(object); - } - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/diff/ReplacementsHandler.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/diff/ReplacementsHandler.java b/src/main/java/org/apache/commons/text/diff/ReplacementsHandler.java deleted file mode 100644 index 3beb716..0000000 --- a/src/main/java/org/apache/commons/text/diff/ReplacementsHandler.java +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.diff; - -import java.util.List; - -/** - * This interface is devoted to handle synchronized replacement sequences. - * - * @param <T> object type - * @see ReplacementsFinder - * @since 1.0 - */ -public interface ReplacementsHandler<T> { - - /** - * Handle two synchronized sequences. - * <p> - * This method is called by a {@link ReplacementsFinder ReplacementsFinder} - * instance when it has synchronized two sub-sequences of object arrays - * being compared, and at least one of the sequences is non-empty. Since the - * sequences are synchronized, the objects before the two sub-sequences are - * equals (if they exist). This property also holds for the objects after - * the two sub-sequences. - * <p> - * The replacement is defined as replacing the <code>from</code> - * sub-sequence into the <code>to</code> sub-sequence. - * - * @param skipped number of tokens skipped since the last call (i.e. number of - * tokens that were in both sequences), this number should be strictly positive - * except on the very first call where it can be zero (if the first object of - * the two sequences are different) - * @param from sub-sequence of objects coming from the first sequence - * @param to sub-sequence of objects coming from the second sequence - */ - void handleReplacement(int skipped, List<T> from, List<T> to); - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/diff/StringsComparator.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/diff/StringsComparator.java b/src/main/java/org/apache/commons/text/diff/StringsComparator.java deleted file mode 100644 index a286c4a..0000000 --- a/src/main/java/org/apache/commons/text/diff/StringsComparator.java +++ /dev/null @@ -1,331 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.diff; - -/** - * <p> - * It is guaranteed that the comparisons will always be done as - * <code>o1.equals(o2)</code> where <code>o1</code> belongs to the first - * sequence and <code>o2</code> belongs to the second sequence. This can - * be important if subclassing is used for some elements in the first - * sequence and the <code>equals</code> method is specialized. - * </p> - * <p> - * Comparison can be seen from two points of view: either as giving the smallest - * modification allowing to transform the first sequence into the second one, or - * as giving the longest sequence which is a subsequence of both initial - * sequences. The <code>equals</code> method is used to compare objects, so any - * object can be put into sequences. Modifications include deleting, inserting - * or keeping one object, starting from the beginning of the first sequence. - * </p> - * <p> - * This class implements the comparison algorithm, which is the very efficient - * algorithm from Eugene W. Myers - * <a href="http://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps"> - * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm produces - * the shortest possible {@link EditScript edit script} containing all the - * {@link EditCommand commands} needed to transform the first sequence into - * the second one. - * - * <p> - * This code has been adapted from Apache Commons Collections 4.0. - * </p> - * - * @see EditScript - * @see EditCommand - * @see CommandVisitor - * @since 1.0 - */ -public class StringsComparator { - - /** - * First character sequence. - */ - private final String left; - /** - * Second character sequence. - */ - private final String right; - /** - * Temporary array. - */ - private final int[] vDown; - /** - * Temporary array. - */ - private final int[] vUp; - - /** - * Simple constructor. - * <p> - * Creates a new instance of StringsComparator. - * </p> - * <p> - * It is <em>guaranteed</em> that the comparisons will always be done as - * <code>o1.equals(o2)</code> where <code>o1</code> belongs to the first - * sequence and <code>o2</code> belongs to the second sequence. This can be - * important if subclassing is used for some elements in the first sequence - * and the <code>equals</code> method is specialized. - * </p> - * - * @param left first character sequence to be compared - * @param right second character sequence to be compared - */ - public StringsComparator(final String left, final String right) { - this.left = left; - this.right = right; - - final int size = left.length() + right.length() + 2; - vDown = new int[size]; - vUp = new int[size]; - } - - /** - * Get the {@link EditScript} object. - * <p> - * It is guaranteed that the objects embedded in the {@link InsertCommand - * insert commands} come from the second sequence and that the objects - * embedded in either the {@link DeleteCommand delete commands} or - * {@link KeepCommand keep commands} come from the first sequence. This can - * be important if subclassing is used for some elements in the first - * sequence and the <code>equals</code> method is specialized. - * </p> - * - * @return the edit script resulting from the comparison of the two - * sequences - */ - public EditScript<Character> getScript() { - final EditScript<Character> script = new EditScript<>(); - buildScript(0, left.length(), 0, right.length(), script); - return script; - } - - /** - * Build an edit script. - * - * @param start1 the begin of the first sequence to be compared - * @param end1 the end of the first sequence to be compared - * @param start2 the begin of the second sequence to be compared - * @param end2 the end of the second sequence to be compared - * @param script the edited script - */ - private void buildScript(final int start1, final int end1, final int start2, final int end2, - final EditScript<Character> script) { - final Snake middle = getMiddleSnake(start1, end1, start2, end2); - - if (middle == null - || middle.getStart() == end1 && middle.getDiag() == end1 - end2 - || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) { - - int i = start1; - int j = start2; - while (i < end1 || j < end2) { - if (i < end1 && j < end2 && left.charAt(i) == right.charAt(j)) { - script.append(new KeepCommand<>(left.charAt(i))); - ++i; - ++j; - } else { - if (end1 - start1 > end2 - start2) { - script.append(new DeleteCommand<>(left.charAt(i))); - ++i; - } else { - script.append(new InsertCommand<>(right.charAt(j))); - ++j; - } - } - } - - } else { - - buildScript(start1, middle.getStart(), - start2, middle.getStart() - middle.getDiag(), - script); - for (int i = middle.getStart(); i < middle.getEnd(); ++i) { - script.append(new KeepCommand<>(left.charAt(i))); - } - buildScript(middle.getEnd(), end1, - middle.getEnd() - middle.getDiag(), end2, - script); - } - } - - /** - * Get the middle snake corresponding to two subsequences of the - * main sequences. - * <p> - * The snake is found using the MYERS Algorithm (this algorithms has - * also been implemented in the GNU diff program). This algorithm is - * explained in Eugene Myers article: - * <a href="http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps"> - * An O(ND) Difference Algorithm and Its Variations</a>. - * </p> - * - * @param start1 the begin of the first sequence to be compared - * @param end1 the end of the first sequence to be compared - * @param start2 the begin of the second sequence to be compared - * @param end2 the end of the second sequence to be compared - * @return the middle snake - */ - private Snake getMiddleSnake(final int start1, final int end1, final int start2, final int end2) { - // Myers Algorithm - // Initialisations - final int m = end1 - start1; - final int n = end2 - start2; - if (m == 0 || n == 0) { - return null; - } - - final int delta = m - n; - final int sum = n + m; - final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2; - vDown[1 + offset] = start1; - vUp[1 + offset] = end1 + 1; - - for (int d = 0; d <= offset; ++d) { - // Down - for (int k = -d; k <= d; k += 2) { - // First step - - final int i = k + offset; - if (k == -d || k != d && vDown[i - 1] < vDown[i + 1]) { - vDown[i] = vDown[i + 1]; - } else { - vDown[i] = vDown[i - 1] + 1; - } - - int x = vDown[i]; - int y = x - start1 + start2 - k; - - while (x < end1 && y < end2 && left.charAt(x) == right.charAt(y)) { - vDown[i] = ++x; - ++y; - } - // Second step - if (delta % 2 != 0 && delta - d <= k && k <= delta + d) { - if (vUp[i - delta] <= vDown[i]) { // NOPMD - return buildSnake(vUp[i - delta], k + start1 - start2, end1, end2); - } - } - } - - // Up - for (int k = delta - d; k <= delta + d; k += 2) { - // First step - final int i = k + offset - delta; - if (k == delta - d - || k != delta + d && vUp[i + 1] <= vUp[i - 1]) { - vUp[i] = vUp[i + 1] - 1; - } else { - vUp[i] = vUp[i - 1]; - } - - int x = vUp[i] - 1; - int y = x - start1 + start2 - k; - while (x >= start1 && y >= start2 - && left.charAt(x) == right.charAt(y)) { - vUp[i] = x--; - y--; - } - // Second step - if (delta % 2 == 0 && -d <= k && k <= d) { - if (vUp[i] <= vDown[i + delta]) { // NOPMD - return buildSnake(vUp[i], k + start1 - start2, end1, end2); - } - } - } - } - - // this should not happen - throw new RuntimeException("Internal Error"); - } - - /** - * Build a snake. - * - * @param start the value of the start of the snake - * @param diag the value of the diagonal of the snake - * @param end1 the value of the end of the first sequence to be compared - * @param end2 the value of the end of the second sequence to be compared - * @return the snake built - */ - private Snake buildSnake(final int start, final int diag, final int end1, final int end2) { - int end = start; - while (end - diag < end2 - && end < end1 - && left.charAt(end) == right.charAt(end - diag)) { - ++end; - } - return new Snake(start, end, diag); - } - - /** - * This class is a simple placeholder to hold the end part of a path - * under construction in a {@link StringsComparator StringsComparator}. - */ - private static class Snake { - - /** Start index. */ - private final int start; - - /** End index. */ - private final int end; - - /** Diagonal number. */ - private final int diag; - - /** - * Simple constructor. Creates a new instance of Snake with specified indices. - * - * @param start start index of the snake - * @param end end index of the snake - * @param diag diagonal number - */ - public Snake(final int start, final int end, final int diag) { - this.start = start; - this.end = end; - this.diag = diag; - } - - /** - * Get the start index of the snake. - * - * @return start index of the snake - */ - public int getStart() { - return start; - } - - /** - * Get the end index of the snake. - * - * @return end index of the snake - */ - public int getEnd() { - return end; - } - - /** - * Get the diagonal number of the snake. - * - * @return diagonal number of the snake - */ - public int getDiag() { - return diag; - } - } - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/diff/package-info.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/diff/package-info.java b/src/main/java/org/apache/commons/text/diff/package-info.java deleted file mode 100644 index 92fde86..0000000 --- a/src/main/java/org/apache/commons/text/diff/package-info.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * <p>Provides algorithms for diff between strings.</p> - * - * <p>The initial implementation of the Myers algorithm was adapted from the - * commons-collections sequence package.</p> - * - * @since 1.0 - */ -package org.apache.commons.text.diff; http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/package-info.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/package-info.java b/src/main/java/org/apache/commons/text/package-info.java deleted file mode 100644 index f36b6dc..0000000 --- a/src/main/java/org/apache/commons/text/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * <p>Basic classes for text handling.</p> - * - * @since 1.0 - */ -package org.apache.commons.text; http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/similarity/CosineDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/CosineDistance.java b/src/main/java/org/apache/commons/text/similarity/CosineDistance.java deleted file mode 100644 index 516c215..0000000 --- a/src/main/java/org/apache/commons/text/similarity/CosineDistance.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.similarity; - -import java.util.Map; - -/** - * Measures the cosine distance between two character sequences. - * - * <p>It utilizes the {@link CosineSimilarity} to compute the distance. Character sequences - * are converted into vectors through a simple tokenizer that works with a regular expression - * to split words in a sentence.</p> - * - * <p> - * For further explanation about Cosine Similarity and Cosine Distance, refer to - * http://en.wikipedia.org/wiki/Cosine_similarity. - * </p> - * - * @since 1.0 - * @see CosineSimilarity - */ -public class CosineDistance implements EditDistance<Double> { - /** - * Tokenizer used to convert the character sequence into a vector. - */ - private final Tokenizer<CharSequence> tokenizer = new RegexTokenizer(); - /** - * Cosine similarity. - */ - private final CosineSimilarity cosineSimilarity = new CosineSimilarity(); - - @Override - public Double apply(final CharSequence left, final CharSequence right) { - final CharSequence[] leftTokens = tokenizer.tokenize(left); - final CharSequence[] rightTokens = tokenizer.tokenize(right); - - final Map<CharSequence, Integer> leftVector = Counter.of(leftTokens); - final Map<CharSequence, Integer> rightVector = Counter.of(rightTokens); - final double similarity = cosineSimilarity.cosineSimilarity(leftVector, rightVector); - return 1.0 - similarity; - } - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java b/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java deleted file mode 100644 index 7288f3e..0000000 --- a/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.similarity; - -import java.util.HashSet; -import java.util.Map; -import java.util.Set; - -/** - * Measures the Cosine similarity of two vectors of an inner product space and - * compares the angle between them. - * - * <p> - * For further explanation about the Cosine Similarity, refer to - * http://en.wikipedia.org/wiki/Cosine_similarity. - * </p> - * - * @since 1.0 - */ -public class CosineSimilarity { - - /** - * Calculates the cosine similarity for two given vectors. - * - * @param leftVector left vector - * @param rightVector right vector - * @return cosine similarity between the two vectors - */ - public Double cosineSimilarity(final Map<CharSequence, Integer> leftVector, final Map<CharSequence, Integer> rightVector) { - if (leftVector == null || rightVector == null) { - throw new IllegalArgumentException("Vectors must not be null"); - } - - final Set<CharSequence> intersection = getIntersection(leftVector, rightVector); - - final double dotProduct = dot(leftVector, rightVector, intersection); - double d1 = 0.0d; - for (final Integer value : leftVector.values()) { - d1 += Math.pow(value, 2); - } - double d2 = 0.0d; - for (final Integer value : rightVector.values()) { - d2 += Math.pow(value, 2); - } - double cosineSimilarity; - if (d1 <= 0.0 || d2 <= 0.0) { - cosineSimilarity = 0.0; - } else { - cosineSimilarity = (double) (dotProduct / (double) (Math.sqrt(d1) * Math.sqrt(d2))); - } - return cosineSimilarity; - } - - /** - * Returns a set with strings common to the two given maps. - * - * @param leftVector left vector map - * @param rightVector right vector map - * @return common strings - */ - private Set<CharSequence> getIntersection(final Map<CharSequence, Integer> leftVector, - final Map<CharSequence, Integer> rightVector) { - final Set<CharSequence> intersection = new HashSet<>(leftVector.keySet()); - intersection.retainAll(rightVector.keySet()); - return intersection; - } - - /** - * Computes the dot product of two vectors. It ignores remaining elements. It means - * that if a vector is longer than other, then a smaller part of it will be used to compute - * the dot product. - * - * @param leftVector left vector - * @param rightVector right vector - * @param intersection common elements - * @return the dot product - */ - private double dot(final Map<CharSequence, Integer> leftVector, final Map<CharSequence, Integer> rightVector, - final Set<CharSequence> intersection) { - long dotProduct = 0; - for (final CharSequence key : intersection) { - dotProduct += leftVector.get(key) * rightVector.get(key); - } - return dotProduct; - } - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/similarity/Counter.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/Counter.java b/src/main/java/org/apache/commons/text/similarity/Counter.java deleted file mode 100644 index d259d6b..0000000 --- a/src/main/java/org/apache/commons/text/similarity/Counter.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.similarity; - -import java.util.HashMap; -import java.util.Map; - -/** - * Java implementation of Python's collections Counter module. - * - * <p>It counts how many times each element provided occurred in an array and - * returns a dict with the element as key and the count as value.</p> - * - * @see <a href="https://docs.python.org/dev/library/collections.html#collections.Counter"> - * https://docs.python.org/dev/library/collections.html#collections.Counter</a> - * - * @since 1.0 - */ -final class Counter { - - /** - * Hidden constructor. - */ - private Counter() { - super(); - } - - /** - * It counts how many times each element provided occurred in an array and - * returns a dict with the element as key and the count as value. - * - * @param tokens array of tokens - * @return dict, where the elements are key, and the count the value - */ - public static Map<CharSequence, Integer> of(final CharSequence[] tokens) { - final Map<CharSequence, Integer> innerCounter = new HashMap<>(); - for (final CharSequence token : tokens) { - if (innerCounter.containsKey(token)) { - int value = innerCounter.get(token); - innerCounter.put(token, ++value); - } else { - innerCounter.put(token, 1); - } - } - return innerCounter; - } - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/similarity/EditDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/EditDistance.java b/src/main/java/org/apache/commons/text/similarity/EditDistance.java deleted file mode 100644 index cf4e2c0..0000000 --- a/src/main/java/org/apache/commons/text/similarity/EditDistance.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.similarity; - -/** - * Interface for <a href="http://en.wikipedia.org/wiki/Edit_distance">Edit Distances</a>. - * - * <p> - * An edit distance is a formal metric on the Kleene closure (<code>X<sup>*</sup></code>) over an - * alphabet (<code>X</code>). Note, that a <a href="https://en.wikipedia.org/wiki/Metric_(mathematics)">metric</a> - * on a set <code>S</code> is a function <code>d: [S * S] -> [0, INFINITY)</code> such - * that the following hold for <code>x,y,z</code> in - * the set <code>S</code>: - * </p> - * <ul> - * <li><code>d(x,y) >= 0</code>, non-negativity or separation axiom</li> - * <li><code>d(x,y) == 0</code>, if and only if, <code>x == y</code></li> - * <li><code>d(x,y) == d(y,x)</code>, symmetry, and</li> - * <li><code>d(x,z) <= d(x,y) + d(y,z)</code>, the triangle inequality</li> - * </ul> - * - * - * <p> - * This is a BiFunction<CharSequence, CharSequence, R>. - * The <code>apply</code> method - * accepts a pair of {@link CharSequence} parameters - * and returns an <code>R</code> type similarity score. - * </p> - * - * @param <R> The type of similarity score unit used by this EditDistance. - * @since 1.0 - */ -public interface EditDistance<R> extends SimilarityScore<R> { - - /** - * Compares two CharSequences. - * - * @param left the first CharSequence - * @param right the second CharSequence - * @return the similarity score between two CharSequences - */ - @Override - R apply(CharSequence left, CharSequence right); - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/similarity/EditDistanceFrom.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/EditDistanceFrom.java b/src/main/java/org/apache/commons/text/similarity/EditDistanceFrom.java deleted file mode 100644 index b67a41f..0000000 --- a/src/main/java/org/apache/commons/text/similarity/EditDistanceFrom.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.similarity; - -/** - * <p> - * This stores a {@link EditDistance} implementation and a {@link CharSequence} "left" string. - * The {@link #apply(CharSequence right)} method accepts the "right" string and invokes the - * comparison function for the pair of strings. - * </p> - * - * <p> - * The following is an example which finds the most similar string: - * </p> - * <pre> - * EditDistance<Integer> editDistance = new LevenshteinDistance(); - * String target = "Apache"; - * EditDistanceFrom<Integer> editDistanceFrom = - * new EditDistanceFrom<Integer>(editDistance, target); - * String mostSimilar = null; - * Integer shortestDistance = null; - * - * for (String test : new String[] { "Appaloosa", "a patchy", "apple" }) { - * Integer distance = editDistanceFrom.apply(test); - * if (shortestDistance == null || distance < shortestDistance) { - * shortestDistance = distance; - * mostSimilar = test; - * } - * } - * - * System.out.println("The string most similar to \"" + target + "\" " - * + "is \"" + mostSimilar + "\" because " - * + "its distance is only " + shortestDistance + "."); - * </pre> - * - * @param <R> This is the type of similarity score used by the EditDistance function. - * @since 1.0 - */ -public class EditDistanceFrom<R> { - - /** - * Edit distance. - */ - private final EditDistance<R> editDistance; - /** - * Left parameter used in distance function. - */ - private final CharSequence left; - - /** - * <p>This accepts the edit distance implementation and the "left" string.</p> - * - * @param editDistance This may not be null. - * @param left This may be null here, - * but the EditDistance#compare(CharSequence left, CharSequence right) - * implementation may not accept nulls. - */ - public EditDistanceFrom(final EditDistance<R> editDistance, final CharSequence left) { - if (editDistance == null) { - throw new IllegalArgumentException("The edit distance may not be null."); - } - - this.editDistance = editDistance; - this.left = left; - } - - /** - * <p> - * This compares "left" field against the "right" parameter - * using the "edit distance" implementation. - * </p> - * - * @param right the second CharSequence - * @return the similarity score between two CharSequences - */ - public R apply(final CharSequence right) { - return editDistance.apply(left, right); - } - - /** - * Gets the left parameter. - * - * @return the left parameter - */ - public CharSequence getLeft() { - return left; - } - - /** - * Gets the edit distance. - * - * @return the edit distance - */ - public EditDistance<R> getEditDistance() { - return editDistance; - } - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java b/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java deleted file mode 100644 index 8356960..0000000 --- a/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.similarity; - -import java.util.Locale; - -/** - * A matching algorithm that is similar to the searching algorithms implemented in editors such - * as Sublime Text, TextMate, Atom and others. - * - * <p> - * One point is given for every matched character. Subsequent matches yield two bonus points. A higher score - * indicates a higher similarity. - * </p> - * - * <p> - * This code has been adapted from Apache Commons Lang 3.3. - * </p> - * - * @since 1.0 - */ -public class FuzzyScore { - - /** - * Locale used to change the case of text. - */ - private final Locale locale; - - - /** - * <p>This returns a {@link Locale}-specific {@link FuzzyScore}.</p> - * - * @param locale The string matching logic is case insensitive. - A {@link Locale} is necessary to normalize both Strings to lower case. - * @throws IllegalArgumentException - * This is thrown if the {@link Locale} parameter is {@code null}. - */ - public FuzzyScore(final Locale locale) { - if (locale == null) { - throw new IllegalArgumentException("Locale must not be null"); - } - this.locale = locale; - } - - /** - * <p> - * Find the Fuzzy Score which indicates the similarity score between two - * Strings. - * </p> - * - * <pre> - * score.fuzzyScore(null, null, null) = IllegalArgumentException - * score.fuzzyScore("", "", Locale.ENGLISH) = 0 - * score.fuzzyScore("Workshop", "b", Locale.ENGLISH) = 0 - * score.fuzzyScore("Room", "o", Locale.ENGLISH) = 1 - * score.fuzzyScore("Workshop", "w", Locale.ENGLISH) = 1 - * score.fuzzyScore("Workshop", "ws", Locale.ENGLISH) = 2 - * score.fuzzyScore("Workshop", "wo", Locale.ENGLISH) = 4 - * score.fuzzyScore("Apache Software Foundation", "asf", Locale.ENGLISH) = 3 - * </pre> - * - * @param term a full term that should be matched against, must not be null - * @param query the query that will be matched against a term, must not be - * null - * @return result score - * @throws IllegalArgumentException if either String input {@code null} or - * Locale input {@code null} - */ - public Integer fuzzyScore(final CharSequence term, final CharSequence query) { - if (term == null || query == null) { - throw new IllegalArgumentException("Strings must not be null"); - } - - // fuzzy logic is case insensitive. We normalize the Strings to lower - // case right from the start. Turning characters to lower case - // via Character.toLowerCase(char) is unfortunately insufficient - // as it does not accept a locale. - final String termLowerCase = term.toString().toLowerCase(locale); - final String queryLowerCase = query.toString().toLowerCase(locale); - - // the resulting score - int score = 0; - - // the position in the term which will be scanned next for potential - // query character matches - int termIndex = 0; - - // index of the previously matched character in the term - int previousMatchingCharacterIndex = Integer.MIN_VALUE; - - for (int queryIndex = 0; queryIndex < queryLowerCase.length(); queryIndex++) { - final char queryChar = queryLowerCase.charAt(queryIndex); - - boolean termCharacterMatchFound = false; - for (; termIndex < termLowerCase.length() - && !termCharacterMatchFound; termIndex++) { - final char termChar = termLowerCase.charAt(termIndex); - - if (queryChar == termChar) { - // simple character matches result in one point - score++; - - // subsequent character matches further improve - // the score. - if (previousMatchingCharacterIndex + 1 == termIndex) { - score += 2; - } - - previousMatchingCharacterIndex = termIndex; - - // we can leave the nested loop. Every character in the - // query can match at most one character in the term. - termCharacterMatchFound = true; - } - } - } - - return score; - } - - /** - * Gets the locale. - * - * @return the locale - */ - public Locale getLocale() { - return locale; - } - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/similarity/HammingDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/HammingDistance.java b/src/main/java/org/apache/commons/text/similarity/HammingDistance.java deleted file mode 100644 index 8d88fe8..0000000 --- a/src/main/java/org/apache/commons/text/similarity/HammingDistance.java +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.similarity; - -/** - * The hamming distance between two strings of equal length is the number of - * positions at which the corresponding symbols are different. - * - * <p> - * For further explanation about the Hamming Distance, take a look at its - * Wikipedia page at http://en.wikipedia.org/wiki/Hamming_distance. - * </p> - * - * @since 1.0 - */ -public class HammingDistance implements EditDistance<Integer> { - - /** - * Find the Hamming Distance between two strings with the same - * length. - * - * <p>The distance starts with zero, and for each occurrence of a - * different character in either String, it increments the distance - * by 1, and finally return its value.</p> - * - * <p>Since the Hamming Distance can only be calculated between strings of equal length, input of different lengths - * will throw IllegalArgumentException</p> - * - * <pre> - * distance.apply("", "") = 0 - * distance.apply("pappa", "pappa") = 0 - * distance.apply("1011101", "1011111") = 1 - * distance.apply("ATCG", "ACCC") = 2 - * distance.apply("karolin", "kerstin" = 3 - * </pre> - * - * @param left the first CharSequence, must not be null - * @param right the second CharSequence, must not be null - * @return distance - * @throws IllegalArgumentException if either input is {@code null} or - * if they do not have the same length - */ - @Override - public Integer apply(final CharSequence left, final CharSequence right) { - if (left == null || right == null) { - throw new IllegalArgumentException("Strings must not be null"); - } - - if (left.length() != right.length()) { - throw new IllegalArgumentException("Strings must have the same length"); - } - - int distance = 0; - - for (int i = 0; i < left.length(); i++) { - if (left.charAt(i) != right.charAt(i)) { - distance++; - } - } - - return distance; - } - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java b/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java deleted file mode 100644 index 4a3d5cc..0000000 --- a/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.similarity; - -/** - * Measures the Jaccard distance of two sets of character sequence. Jaccard - * distance is the dissimilarity between two sets. Its the complementary of - * Jaccard similarity. - * - * <p> - * For further explanation about Jaccard Distance, refer - * https://en.wikipedia.org/wiki/Jaccard_index - * </p> - * - * @since 1.0 - */ -public class JaccardDistance implements EditDistance<Double> { - - private final JaccardSimilarity jaccardSimilarity = new JaccardSimilarity(); - - /** - * Calculates Jaccard distance of two set character sequence passed as - * input. Calculates Jaccard similarity and returns the complement of it. - * - * @param left first character sequence - * @param right second character sequence - * @return index - * @throws IllegalArgumentException - * if either String input {@code null} - */ - @Override - public Double apply(CharSequence left, CharSequence right) { - if (left == null || right == null) { - throw new IllegalArgumentException("Input cannot be null"); - } - return Math.round((1 - jaccardSimilarity.apply(left, right)) * 100d) / 100d; - } -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java b/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java deleted file mode 100644 index ffacb3f..0000000 --- a/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.similarity; - -import java.util.HashSet; -import java.util.Set; - -/** - * Measures the Jaccard similarity (aka Jaccard index) of two sets of character - * sequence. Jaccard similarity is the size of the intersection divided by the - * size of the union of the two sets. - * - * <p> - * For further explanation about Jaccard Similarity, refer - * https://en.wikipedia.org/wiki/Jaccard_index - * </p> - * - * @since 1.0 - */ -public class JaccardSimilarity implements SimilarityScore<Double> { - - /** - * Calculates Jaccard Similarity of two set character sequence passed as - * input. - * - * @param left first character sequence - * @param right second character sequence - * @return index - * @throws IllegalArgumentException - * if either String input {@code null} - */ - @Override - public Double apply(CharSequence left, CharSequence right) { - if (left == null || right == null) { - throw new IllegalArgumentException("Input cannot be null"); - } - return Math.round(calculateJaccardSimilarity(left, right) * 100d) / 100d; - } - - /** - * Calculates Jaccard Similarity of two character sequences passed as - * input. Does the calculation by identifying the union (characters in at - * least one of the two sets) of the two sets and intersection (characters - * which are present in set one which are present in set two) - * - * @param left first character sequence - * @param right second character sequence - * @return index - */ - private Double calculateJaccardSimilarity(CharSequence left, CharSequence right) { - Set<String> intersectionSet = new HashSet<String>(); - Set<String> unionSet = new HashSet<String>(); - boolean unionFilled = false; - int leftLength = left.length(); - int rightLength = right.length(); - if (leftLength == 0 || rightLength == 0) { - return 0d; - } - - for (int leftIndex = 0; leftIndex < leftLength; leftIndex++) { - unionSet.add(String.valueOf(left.charAt(leftIndex))); - for (int rightIndex = 0; rightIndex < rightLength; rightIndex++) { - if (!unionFilled) { - unionSet.add(String.valueOf(right.charAt(rightIndex))); - } - if (left.charAt(leftIndex) == right.charAt(rightIndex)) { - intersectionSet.add(String.valueOf(left.charAt(leftIndex))); - } - } - unionFilled = true; - } - return Double.valueOf(intersectionSet.size()) / Double.valueOf(unionSet.size()); - } -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java deleted file mode 100644 index 33c6a2c..0000000 --- a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.similarity; - -import java.util.Arrays; - -/** - * A similarity algorithm indicating the percentage of matched characters between two character sequences. - * - * <p> - * The Jaro measure is the weighted sum of percentage of matched characters - * from each file and transposed characters. Winkler increased this measure - * for matching initial characters. - * </p> - * - * <p> - * This implementation is based on the Jaro Winkler similarity algorithm - * from <a href="http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance"> - * http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a>. - * </p> - * - * <p> - * This code has been adapted from Apache Commons Lang 3.3. - * </p> - * - * @since 1.0 - */ -public class JaroWinklerDistance implements SimilarityScore<Double> { - - /** - * The default prefix length limit set to four. - */ - private static final int PREFIX_LENGTH_LIMIT = 4; - /** - * Represents a failed index search. - */ - public static final int INDEX_NOT_FOUND = -1; - - /** - * Find the Jaro Winkler Distance which indicates the similarity score - * between two CharSequences. - * - * <pre> - * distance.apply(null, null) = IllegalArgumentException - * distance.apply("","") = 0.0 - * distance.apply("","a") = 0.0 - * distance.apply("aaapppp", "") = 0.0 - * distance.apply("frog", "fog") = 0.93 - * distance.apply("fly", "ant") = 0.0 - * distance.apply("elephant", "hippo") = 0.44 - * distance.apply("hippo", "elephant") = 0.44 - * distance.apply("hippo", "zzzzzzzz") = 0.0 - * distance.apply("hello", "hallo") = 0.88 - * distance.apply("ABC Corporation", "ABC Corp") = 0.93 - * distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.95 - * distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.92 - * distance.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.88 - * </pre> - * - * @param left the first String, must not be null - * @param right the second String, must not be null - * @return result distance - * @throws IllegalArgumentException if either String input {@code null} - */ - @Override - public Double apply(final CharSequence left, final CharSequence right) { - final double defaultScalingFactor = 0.1; - final double percentageRoundValue = 100.0; - - if (left == null || right == null) { - throw new IllegalArgumentException("Strings must not be null"); - } - - int[] mtp = matches(left, right); - double m = mtp[0]; - if (m == 0) { - return 0D; - } - double j = ((m / left.length() + m / right.length() + (m - mtp[1]) / m)) / 3; - double jw = j < 0.7D ? j : j + Math.min(defaultScalingFactor, 1D / mtp[3]) * mtp[2] * (1D - j); - return Math.round(jw * percentageRoundValue) / percentageRoundValue; - } - - /** - * This method returns the Jaro-Winkler string matches, transpositions, prefix, max array. - * - * @param first the first string to be matched - * @param second the second string to be machted - * @return mtp array containing: matches, transpositions, prefix, and max length - */ - protected static int[] matches(final CharSequence first, final CharSequence second) { - CharSequence max, min; - if (first.length() > second.length()) { - max = first; - min = second; - } else { - max = second; - min = first; - } - int range = Math.max(max.length() / 2 - 1, 0); - int[] matchIndexes = new int[min.length()]; - Arrays.fill(matchIndexes, -1); - boolean[] matchFlags = new boolean[max.length()]; - int matches = 0; - for (int mi = 0; mi < min.length(); mi++) { - char c1 = min.charAt(mi); - for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) { - if (!matchFlags[xi] && c1 == max.charAt(xi)) { - matchIndexes[mi] = xi; - matchFlags[xi] = true; - matches++; - break; - } - } - } - char[] ms1 = new char[matches]; - char[] ms2 = new char[matches]; - for (int i = 0, si = 0; i < min.length(); i++) { - if (matchIndexes[i] != -1) { - ms1[si] = min.charAt(i); - si++; - } - } - for (int i = 0, si = 0; i < max.length(); i++) { - if (matchFlags[i]) { - ms2[si] = max.charAt(i); - si++; - } - } - int transpositions = 0; - for (int mi = 0; mi < ms1.length; mi++) { - if (ms1[mi] != ms2[mi]) { - transpositions++; - } - } - int prefix = 0; - for (int mi = 0; mi < min.length(); mi++) { - if (first.charAt(mi) == second.charAt(mi)) { - prefix++; - } else { - break; - } - } - return new int[] { matches, transpositions / 2, prefix, max.length() }; - } - -}