http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/diff/EditCommand.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/diff/EditCommand.java b/src/main/java/org/apache/commons/text/beta/diff/EditCommand.java new file mode 100644 index 0000000..a98a719 --- /dev/null +++ b/src/main/java/org/apache/commons/text/beta/diff/EditCommand.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.beta.diff; + +/** + * Abstract base class for all commands used to transform an objects sequence + * into another one. + * <p> + * When two objects sequences are compared through the + * {@link StringsComparator#getScript StringsComparator.getScript} method, + * the result is provided has a {@link EditScript script} containing the commands + * that progressively transform the first sequence into the second one. + * </p> + * <p> + * There are only three types of commands, all of which are subclasses of this + * abstract class. Each command is associated with one object belonging to at + * least one of the sequences. These commands are {@link InsertCommand + * InsertCommand} which correspond to an object of the second sequence being + * inserted into the first sequence, {@link DeleteCommand DeleteCommand} which + * correspond to an object of the first sequence being removed and + * {@link KeepCommand KeepCommand} which correspond to an object of the first + * sequence which <code>equals</code> an object in the second sequence. It is + * guaranteed that comparison is always performed this way (i.e. the + * <code>equals</code> method of the object from the first sequence is used and + * the object passed as an argument comes from the second sequence) ; this can + * be important if subclassing is used for some elements in the first sequence + * and the <code>equals</code> method is specialized. + * </p> + * + * <p> + * This code has been adapted from Apache Commons Collections 4.0. + * </p> + * + * @see StringsComparator + * @see EditScript + * + * @param <T> object type + * @since 1.0 + */ +public abstract class EditCommand<T> { + + /** Object on which the command should be applied. */ + private final T object; + + /** + * Simple constructor. Creates a new instance of EditCommand + * + * @param object reference to the object associated with this command, this + * refers to an element of one of the sequences being compared + */ + protected EditCommand(final T object) { + this.object = object; + } + + /** + * Returns the object associated with this command. + * + * @return the object on which the command is applied + */ + protected T getObject() { + return object; + } + + /** + * Accept a visitor. + * <p> + * This method is invoked for each commands belonging to + * an {@link EditScript EditScript}, in order to implement the visitor design pattern + * + * @param visitor the visitor to be accepted + */ + public abstract void accept(CommandVisitor<T> visitor); + +}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/diff/EditScript.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/diff/EditScript.java b/src/main/java/org/apache/commons/text/beta/diff/EditScript.java new file mode 100644 index 0000000..ed18438 --- /dev/null +++ b/src/main/java/org/apache/commons/text/beta/diff/EditScript.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.beta.diff; + +import java.util.ArrayList; +import java.util.List; + +/** + * This class gathers all the {@link EditCommand commands} needed to transform + * one objects sequence into another objects sequence. + * <p> + * An edit script is the most general view of the differences between two + * sequences. It is built as the result of the comparison between two sequences + * by the {@link StringsComparator StringsComparator} class. The user can + * walk through it using the <em>visitor</em> design pattern. + * <p> + * It is guaranteed that the objects embedded in the {@link InsertCommand insert + * commands} come from the second sequence and that the objects embedded in + * either the {@link DeleteCommand delete commands} or {@link KeepCommand keep + * commands} come from the first sequence. This can be important if subclassing + * is used for some elements in the first sequence and the <code>equals</code> + * method is specialized. + * + * @see StringsComparator + * @see EditCommand + * @see CommandVisitor + * @see ReplacementsHandler + * + * @param <T> object type + * @since 1.0 + */ +public class EditScript<T> { + + /** Container for the commands. */ + private final List<EditCommand<T>> commands; + + /** Length of the longest common subsequence. */ + private int lcsLength; + + /** Number of modifications. */ + private int modifications; + + /** + * Simple constructor. Creates a new empty script. + */ + public EditScript() { + commands = new ArrayList<>(); + lcsLength = 0; + modifications = 0; + } + + /** + * Add a keep command to the script. + * + * @param command command to add + */ + public void append(final KeepCommand<T> command) { + commands.add(command); + ++lcsLength; + } + + /** + * Add an insert command to the script. + * + * @param command command to add + */ + public void append(final InsertCommand<T> command) { + commands.add(command); + ++modifications; + } + + /** + * Add a delete command to the script. + * + * @param command command to add + */ + public void append(final DeleteCommand<T> command) { + commands.add(command); + ++modifications; + } + + /** + * Visit the script. The script implements the <em>visitor</em> design + * pattern, this method is the entry point to which the user supplies its + * own visitor, the script will be responsible to drive it through the + * commands in order and call the appropriate method as each command is + * encountered. + * + * @param visitor the visitor that will visit all commands in turn + */ + public void visit(final CommandVisitor<T> visitor) { + for (final EditCommand<T> command : commands) { + command.accept(visitor); + } + } + + /** + * Get the length of the Longest Common Subsequence (LCS). The length of the + * longest common subsequence is the number of {@link KeepCommand keep + * commands} in the script. + * + * @return length of the Longest Common Subsequence + */ + public int getLCSLength() { + return lcsLength; + } + + /** + * Get the number of effective modifications. The number of effective + * modification is the number of {@link DeleteCommand delete} and + * {@link InsertCommand insert} commands in the script. + * + * @return number of effective modifications + */ + public int getModifications() { + return modifications; + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/diff/InsertCommand.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/diff/InsertCommand.java b/src/main/java/org/apache/commons/text/beta/diff/InsertCommand.java new file mode 100644 index 0000000..ecdde02 --- /dev/null +++ b/src/main/java/org/apache/commons/text/beta/diff/InsertCommand.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.beta.diff; + +/** + * Command representing the insertion of one object of the second sequence. + * <p> + * When one object of the second sequence has no corresponding object in the + * first sequence at the right place, the {@link EditScript edit script} + * transforming the first sequence into the second sequence uses an instance of + * this class to represent the insertion of this object. The objects embedded in + * these type of commands always come from the second sequence. + * </p> + * + * @see StringsComparator + * @see EditScript + * + * @param <T> object type + * @since 1.0 + */ +public class InsertCommand<T> extends EditCommand<T> { + + /** + * Simple constructor. Creates a new instance of InsertCommand + * + * @param object the object of the second sequence that should be inserted + */ + public InsertCommand(final T object) { + super(object); + } + + /** + * Accept a visitor. When an <code>InsertCommand</code> accepts a visitor, + * it calls its {@link CommandVisitor#visitInsertCommand visitInsertCommand} + * method. + * + * @param visitor the visitor to be accepted + */ + @Override + public void accept(final CommandVisitor<T> visitor) { + visitor.visitInsertCommand(getObject()); + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/diff/KeepCommand.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/diff/KeepCommand.java b/src/main/java/org/apache/commons/text/beta/diff/KeepCommand.java new file mode 100644 index 0000000..7f03ade --- /dev/null +++ b/src/main/java/org/apache/commons/text/beta/diff/KeepCommand.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.beta.diff; + +/** + * Command representing the keeping of one object present in both sequences. + * <p> + * When one object of the first sequence <code>equals</code> another objects in + * the second sequence at the right place, the {@link EditScript edit script} + * transforming the first sequence into the second sequence uses an instance of + * this class to represent the keeping of this object. The objects embedded in + * these type of commands always come from the first sequence. + * </p> + * + * @see StringsComparator + * @see EditScript + * + * @param <T> object type + * @since 1.0 + */ +public class KeepCommand<T> extends EditCommand<T> { + + /** + * Simple constructor. Creates a new instance of KeepCommand + * + * @param object the object belonging to both sequences (the object is a + * reference to the instance in the first sequence which is known + * to be equal to an instance in the second sequence) + */ + public KeepCommand(final T object) { + super(object); + } + + /** + * Accept a visitor. When a <code>KeepCommand</code> accepts a visitor, it + * calls its {@link CommandVisitor#visitKeepCommand visitKeepCommand} method. + * + * @param visitor the visitor to be accepted + */ + @Override + public void accept(final CommandVisitor<T> visitor) { + visitor.visitKeepCommand(getObject()); + } +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/diff/ReplacementsFinder.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/diff/ReplacementsFinder.java b/src/main/java/org/apache/commons/text/beta/diff/ReplacementsFinder.java new file mode 100644 index 0000000..a09c015 --- /dev/null +++ b/src/main/java/org/apache/commons/text/beta/diff/ReplacementsFinder.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.beta.diff; + +import java.util.ArrayList; +import java.util.List; + +/** + * This class handles sequences of replacements resulting from a comparison. + * <p> + * The comparison of two objects sequences leads to the identification of common + * parts and parts which only belong to the first or to the second sequence. The + * common parts appear in the edit script in the form of <em>keep</em> commands, + * they can be considered as synchronization objects between the two sequences. + * These synchronization objects split the two sequences in synchronized + * sub-sequences. The first sequence can be transformed into the second one by + * replacing each synchronized sub-sequence of the first sequence by the + * corresponding sub-sequence of the second sequence. This is a synthetic way to + * see an {@link EditScript edit script}, replacing individual + * {@link DeleteCommand delete}, {@link KeepCommand keep} and + * {@link InsertCommand insert} commands by fewer replacements acting on + * complete sub-sequences. + * </p> + * <p> + * This class is devoted to perform this interpretation. It visits an + * {@link EditScript edit script} (because it implements the + * {@link CommandVisitor CommandVisitor} interface) and calls a user-supplied + * handler implementing the {@link ReplacementsHandler ReplacementsHandler} + * interface to process the sub-sequences. + * </p> + * + * @see ReplacementsHandler + * @see EditScript + * @see StringsComparator + * + * @param <T> object type + * @since 1.0 + */ +public class ReplacementsFinder<T> implements CommandVisitor<T> { + + /** + * List of pending insertions. + */ + private final List<T> pendingInsertions; + /** + * List of pending deletions. + */ + private final List<T> pendingDeletions; + /** + * Count of elements skipped. + */ + private int skipped; + + /** Handler to call when synchronized sequences are found. */ + private final ReplacementsHandler<T> handler; + + /** + * Simple constructor. Creates a new instance of {@link ReplacementsFinder}. + * + * @param handler handler to call when synchronized sequences are found + */ + public ReplacementsFinder(final ReplacementsHandler<T> handler) { + pendingInsertions = new ArrayList<>(); + pendingDeletions = new ArrayList<>(); + skipped = 0; + this.handler = handler; + } + + /** + * Add an object to the pending insertions set. + * + * @param object object to insert + */ + @Override + public void visitInsertCommand(final T object) { + pendingInsertions.add(object); + } + + /** + * Handle a synchronization object. + * <p> + * When a synchronization object is identified, the pending insertions and + * pending deletions sets are provided to the user handler as subsequences. + * </p> + * + * @param object synchronization object detected + */ + @Override + public void visitKeepCommand(final T object) { + if (pendingDeletions.isEmpty() && pendingInsertions.isEmpty()) { + ++skipped; + } else { + handler.handleReplacement(skipped, pendingDeletions, pendingInsertions); + pendingDeletions.clear(); + pendingInsertions.clear(); + skipped = 1; + } + } + + /** + * Add an object to the pending deletions set. + * + * @param object object to delete + */ + @Override + public void visitDeleteCommand(final T object) { + pendingDeletions.add(object); + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/diff/ReplacementsHandler.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/diff/ReplacementsHandler.java b/src/main/java/org/apache/commons/text/beta/diff/ReplacementsHandler.java new file mode 100644 index 0000000..8744081 --- /dev/null +++ b/src/main/java/org/apache/commons/text/beta/diff/ReplacementsHandler.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.beta.diff; + +import java.util.List; + +/** + * This interface is devoted to handle synchronized replacement sequences. + * + * @param <T> object type + * @see ReplacementsFinder + * @since 1.0 + */ +public interface ReplacementsHandler<T> { + + /** + * Handle two synchronized sequences. + * <p> + * This method is called by a {@link ReplacementsFinder ReplacementsFinder} + * instance when it has synchronized two sub-sequences of object arrays + * being compared, and at least one of the sequences is non-empty. Since the + * sequences are synchronized, the objects before the two sub-sequences are + * equals (if they exist). This property also holds for the objects after + * the two sub-sequences. + * <p> + * The replacement is defined as replacing the <code>from</code> + * sub-sequence into the <code>to</code> sub-sequence. + * + * @param skipped number of tokens skipped since the last call (i.e. number of + * tokens that were in both sequences), this number should be strictly positive + * except on the very first call where it can be zero (if the first object of + * the two sequences are different) + * @param from sub-sequence of objects coming from the first sequence + * @param to sub-sequence of objects coming from the second sequence + */ + void handleReplacement(int skipped, List<T> from, List<T> to); + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/diff/StringsComparator.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/diff/StringsComparator.java b/src/main/java/org/apache/commons/text/beta/diff/StringsComparator.java new file mode 100644 index 0000000..3d39433 --- /dev/null +++ b/src/main/java/org/apache/commons/text/beta/diff/StringsComparator.java @@ -0,0 +1,331 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.beta.diff; + +/** + * <p> + * It is guaranteed that the comparisons will always be done as + * <code>o1.equals(o2)</code> where <code>o1</code> belongs to the first + * sequence and <code>o2</code> belongs to the second sequence. This can + * be important if subclassing is used for some elements in the first + * sequence and the <code>equals</code> method is specialized. + * </p> + * <p> + * Comparison can be seen from two points of view: either as giving the smallest + * modification allowing to transform the first sequence into the second one, or + * as giving the longest sequence which is a subsequence of both initial + * sequences. The <code>equals</code> method is used to compare objects, so any + * object can be put into sequences. Modifications include deleting, inserting + * or keeping one object, starting from the beginning of the first sequence. + * </p> + * <p> + * This class implements the comparison algorithm, which is the very efficient + * algorithm from Eugene W. Myers + * <a href="http://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps"> + * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm produces + * the shortest possible {@link EditScript edit script} containing all the + * {@link EditCommand commands} needed to transform the first sequence into + * the second one. + * + * <p> + * This code has been adapted from Apache Commons Collections 4.0. + * </p> + * + * @see EditScript + * @see EditCommand + * @see CommandVisitor + * @since 1.0 + */ +public class StringsComparator { + + /** + * First character sequence. + */ + private final String left; + /** + * Second character sequence. + */ + private final String right; + /** + * Temporary array. + */ + private final int[] vDown; + /** + * Temporary array. + */ + private final int[] vUp; + + /** + * Simple constructor. + * <p> + * Creates a new instance of StringsComparator. + * </p> + * <p> + * It is <em>guaranteed</em> that the comparisons will always be done as + * <code>o1.equals(o2)</code> where <code>o1</code> belongs to the first + * sequence and <code>o2</code> belongs to the second sequence. This can be + * important if subclassing is used for some elements in the first sequence + * and the <code>equals</code> method is specialized. + * </p> + * + * @param left first character sequence to be compared + * @param right second character sequence to be compared + */ + public StringsComparator(final String left, final String right) { + this.left = left; + this.right = right; + + final int size = left.length() + right.length() + 2; + vDown = new int[size]; + vUp = new int[size]; + } + + /** + * Get the {@link EditScript} object. + * <p> + * It is guaranteed that the objects embedded in the {@link InsertCommand + * insert commands} come from the second sequence and that the objects + * embedded in either the {@link DeleteCommand delete commands} or + * {@link KeepCommand keep commands} come from the first sequence. This can + * be important if subclassing is used for some elements in the first + * sequence and the <code>equals</code> method is specialized. + * </p> + * + * @return the edit script resulting from the comparison of the two + * sequences + */ + public EditScript<Character> getScript() { + final EditScript<Character> script = new EditScript<>(); + buildScript(0, left.length(), 0, right.length(), script); + return script; + } + + /** + * Build an edit script. + * + * @param start1 the begin of the first sequence to be compared + * @param end1 the end of the first sequence to be compared + * @param start2 the begin of the second sequence to be compared + * @param end2 the end of the second sequence to be compared + * @param script the edited script + */ + private void buildScript(final int start1, final int end1, final int start2, final int end2, + final EditScript<Character> script) { + final Snake middle = getMiddleSnake(start1, end1, start2, end2); + + if (middle == null + || middle.getStart() == end1 && middle.getDiag() == end1 - end2 + || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) { + + int i = start1; + int j = start2; + while (i < end1 || j < end2) { + if (i < end1 && j < end2 && left.charAt(i) == right.charAt(j)) { + script.append(new KeepCommand<>(left.charAt(i))); + ++i; + ++j; + } else { + if (end1 - start1 > end2 - start2) { + script.append(new DeleteCommand<>(left.charAt(i))); + ++i; + } else { + script.append(new InsertCommand<>(right.charAt(j))); + ++j; + } + } + } + + } else { + + buildScript(start1, middle.getStart(), + start2, middle.getStart() - middle.getDiag(), + script); + for (int i = middle.getStart(); i < middle.getEnd(); ++i) { + script.append(new KeepCommand<>(left.charAt(i))); + } + buildScript(middle.getEnd(), end1, + middle.getEnd() - middle.getDiag(), end2, + script); + } + } + + /** + * Get the middle snake corresponding to two subsequences of the + * main sequences. + * <p> + * The snake is found using the MYERS Algorithm (this algorithms has + * also been implemented in the GNU diff program). This algorithm is + * explained in Eugene Myers article: + * <a href="http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps"> + * An O(ND) Difference Algorithm and Its Variations</a>. + * </p> + * + * @param start1 the begin of the first sequence to be compared + * @param end1 the end of the first sequence to be compared + * @param start2 the begin of the second sequence to be compared + * @param end2 the end of the second sequence to be compared + * @return the middle snake + */ + private Snake getMiddleSnake(final int start1, final int end1, final int start2, final int end2) { + // Myers Algorithm + // Initialisations + final int m = end1 - start1; + final int n = end2 - start2; + if (m == 0 || n == 0) { + return null; + } + + final int delta = m - n; + final int sum = n + m; + final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2; + vDown[1 + offset] = start1; + vUp[1 + offset] = end1 + 1; + + for (int d = 0; d <= offset; ++d) { + // Down + for (int k = -d; k <= d; k += 2) { + // First step + + final int i = k + offset; + if (k == -d || k != d && vDown[i - 1] < vDown[i + 1]) { + vDown[i] = vDown[i + 1]; + } else { + vDown[i] = vDown[i - 1] + 1; + } + + int x = vDown[i]; + int y = x - start1 + start2 - k; + + while (x < end1 && y < end2 && left.charAt(x) == right.charAt(y)) { + vDown[i] = ++x; + ++y; + } + // Second step + if (delta % 2 != 0 && delta - d <= k && k <= delta + d) { + if (vUp[i - delta] <= vDown[i]) { // NOPMD + return buildSnake(vUp[i - delta], k + start1 - start2, end1, end2); + } + } + } + + // Up + for (int k = delta - d; k <= delta + d; k += 2) { + // First step + final int i = k + offset - delta; + if (k == delta - d + || k != delta + d && vUp[i + 1] <= vUp[i - 1]) { + vUp[i] = vUp[i + 1] - 1; + } else { + vUp[i] = vUp[i - 1]; + } + + int x = vUp[i] - 1; + int y = x - start1 + start2 - k; + while (x >= start1 && y >= start2 + && left.charAt(x) == right.charAt(y)) { + vUp[i] = x--; + y--; + } + // Second step + if (delta % 2 == 0 && -d <= k && k <= d) { + if (vUp[i] <= vDown[i + delta]) { // NOPMD + return buildSnake(vUp[i], k + start1 - start2, end1, end2); + } + } + } + } + + // this should not happen + throw new RuntimeException("Internal Error"); + } + + /** + * Build a snake. + * + * @param start the value of the start of the snake + * @param diag the value of the diagonal of the snake + * @param end1 the value of the end of the first sequence to be compared + * @param end2 the value of the end of the second sequence to be compared + * @return the snake built + */ + private Snake buildSnake(final int start, final int diag, final int end1, final int end2) { + int end = start; + while (end - diag < end2 + && end < end1 + && left.charAt(end) == right.charAt(end - diag)) { + ++end; + } + return new Snake(start, end, diag); + } + + /** + * This class is a simple placeholder to hold the end part of a path + * under construction in a {@link StringsComparator StringsComparator}. + */ + private static class Snake { + + /** Start index. */ + private final int start; + + /** End index. */ + private final int end; + + /** Diagonal number. */ + private final int diag; + + /** + * Simple constructor. Creates a new instance of Snake with specified indices. + * + * @param start start index of the snake + * @param end end index of the snake + * @param diag diagonal number + */ + public Snake(final int start, final int end, final int diag) { + this.start = start; + this.end = end; + this.diag = diag; + } + + /** + * Get the start index of the snake. + * + * @return start index of the snake + */ + public int getStart() { + return start; + } + + /** + * Get the end index of the snake. + * + * @return end index of the snake + */ + public int getEnd() { + return end; + } + + /** + * Get the diagonal number of the snake. + * + * @return diagonal number of the snake + */ + public int getDiag() { + return diag; + } + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/diff/package-info.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/diff/package-info.java b/src/main/java/org/apache/commons/text/beta/diff/package-info.java new file mode 100644 index 0000000..0d1323f --- /dev/null +++ b/src/main/java/org/apache/commons/text/beta/diff/package-info.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * <p>Provides algorithms for diff between strings.</p> + * + * <p>The initial implementation of the Myers algorithm was adapted from the + * commons-collections sequence package.</p> + * + * @since 1.0 + */ +package org.apache.commons.text.beta.diff; http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/package-info.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/package-info.java b/src/main/java/org/apache/commons/text/beta/package-info.java new file mode 100644 index 0000000..dc58111 --- /dev/null +++ b/src/main/java/org/apache/commons/text/beta/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * <p>Basic classes for text handling.</p> + * + * @since 1.0 + */ +package org.apache.commons.text.beta; http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/similarity/CosineDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/similarity/CosineDistance.java b/src/main/java/org/apache/commons/text/beta/similarity/CosineDistance.java new file mode 100644 index 0000000..a51fbee --- /dev/null +++ b/src/main/java/org/apache/commons/text/beta/similarity/CosineDistance.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.beta.similarity; + +import java.util.Map; + +/** + * Measures the cosine distance between two character sequences. + * + * <p>It utilizes the {@link CosineSimilarity} to compute the distance. Character sequences + * are converted into vectors through a simple tokenizer that works with a regular expression + * to split words in a sentence.</p> + * + * <p> + * For further explanation about Cosine Similarity and Cosine Distance, refer to + * http://en.wikipedia.org/wiki/Cosine_similarity. + * </p> + * + * @since 1.0 + * @see CosineSimilarity + */ +public class CosineDistance implements EditDistance<Double> { + /** + * Tokenizer used to convert the character sequence into a vector. + */ + private final Tokenizer<CharSequence> tokenizer = new RegexTokenizer(); + /** + * Cosine similarity. + */ + private final CosineSimilarity cosineSimilarity = new CosineSimilarity(); + + @Override + public Double apply(final CharSequence left, final CharSequence right) { + final CharSequence[] leftTokens = tokenizer.tokenize(left); + final CharSequence[] rightTokens = tokenizer.tokenize(right); + + final Map<CharSequence, Integer> leftVector = Counter.of(leftTokens); + final Map<CharSequence, Integer> rightVector = Counter.of(rightTokens); + final double similarity = cosineSimilarity.cosineSimilarity(leftVector, rightVector); + return 1.0 - similarity; + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/similarity/CosineSimilarity.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/similarity/CosineSimilarity.java b/src/main/java/org/apache/commons/text/beta/similarity/CosineSimilarity.java new file mode 100644 index 0000000..b8fe704 --- /dev/null +++ b/src/main/java/org/apache/commons/text/beta/similarity/CosineSimilarity.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.beta.similarity; + +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +/** + * Measures the Cosine similarity of two vectors of an inner product space and + * compares the angle between them. + * + * <p> + * For further explanation about the Cosine Similarity, refer to + * http://en.wikipedia.org/wiki/Cosine_similarity. + * </p> + * + * @since 1.0 + */ +public class CosineSimilarity { + + /** + * Calculates the cosine similarity for two given vectors. + * + * @param leftVector left vector + * @param rightVector right vector + * @return cosine similarity between the two vectors + */ + public Double cosineSimilarity(final Map<CharSequence, Integer> leftVector, final Map<CharSequence, Integer> rightVector) { + if (leftVector == null || rightVector == null) { + throw new IllegalArgumentException("Vectors must not be null"); + } + + final Set<CharSequence> intersection = getIntersection(leftVector, rightVector); + + final double dotProduct = dot(leftVector, rightVector, intersection); + double d1 = 0.0d; + for (final Integer value : leftVector.values()) { + d1 += Math.pow(value, 2); + } + double d2 = 0.0d; + for (final Integer value : rightVector.values()) { + d2 += Math.pow(value, 2); + } + double cosineSimilarity; + if (d1 <= 0.0 || d2 <= 0.0) { + cosineSimilarity = 0.0; + } else { + cosineSimilarity = (double) (dotProduct / (double) (Math.sqrt(d1) * Math.sqrt(d2))); + } + return cosineSimilarity; + } + + /** + * Returns a set with strings common to the two given maps. + * + * @param leftVector left vector map + * @param rightVector right vector map + * @return common strings + */ + private Set<CharSequence> getIntersection(final Map<CharSequence, Integer> leftVector, + final Map<CharSequence, Integer> rightVector) { + final Set<CharSequence> intersection = new HashSet<>(leftVector.keySet()); + intersection.retainAll(rightVector.keySet()); + return intersection; + } + + /** + * Computes the dot product of two vectors. It ignores remaining elements. It means + * that if a vector is longer than other, then a smaller part of it will be used to compute + * the dot product. + * + * @param leftVector left vector + * @param rightVector right vector + * @param intersection common elements + * @return the dot product + */ + private double dot(final Map<CharSequence, Integer> leftVector, final Map<CharSequence, Integer> rightVector, + final Set<CharSequence> intersection) { + long dotProduct = 0; + for (final CharSequence key : intersection) { + dotProduct += leftVector.get(key) * rightVector.get(key); + } + return dotProduct; + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/similarity/Counter.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/similarity/Counter.java b/src/main/java/org/apache/commons/text/beta/similarity/Counter.java new file mode 100644 index 0000000..5093cbf --- /dev/null +++ b/src/main/java/org/apache/commons/text/beta/similarity/Counter.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.beta.similarity; + +import java.util.HashMap; +import java.util.Map; + +/** + * Java implementation of Python's collections Counter module. + * + * <p>It counts how many times each element provided occurred in an array and + * returns a dict with the element as key and the count as value.</p> + * + * @see <a href="https://docs.python.org/dev/library/collections.html#collections.Counter"> + * https://docs.python.org/dev/library/collections.html#collections.Counter</a> + * + * @since 1.0 + */ +final class Counter { + + /** + * Hidden constructor. + */ + private Counter() { + super(); + } + + /** + * It counts how many times each element provided occurred in an array and + * returns a dict with the element as key and the count as value. + * + * @param tokens array of tokens + * @return dict, where the elements are key, and the count the value + */ + public static Map<CharSequence, Integer> of(final CharSequence[] tokens) { + final Map<CharSequence, Integer> innerCounter = new HashMap<>(); + for (final CharSequence token : tokens) { + if (innerCounter.containsKey(token)) { + int value = innerCounter.get(token); + innerCounter.put(token, ++value); + } else { + innerCounter.put(token, 1); + } + } + return innerCounter; + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/similarity/EditDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/similarity/EditDistance.java b/src/main/java/org/apache/commons/text/beta/similarity/EditDistance.java new file mode 100644 index 0000000..4de96fb --- /dev/null +++ b/src/main/java/org/apache/commons/text/beta/similarity/EditDistance.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.beta.similarity; + +/** + * Interface for <a href="http://en.wikipedia.org/wiki/Edit_distance">Edit Distances</a>. + * + * <p> + * An edit distance is a formal metric on the Kleene closure (<code>X<sup>*</sup></code>) over an + * alphabet (<code>X</code>). Note, that a <a href="https://en.wikipedia.org/wiki/Metric_(mathematics)">metric</a> + * on a set <code>S</code> is a function <code>d: [S * S] -> [0, INFINITY)</code> such + * that the following hold for <code>x,y,z</code> in + * the set <code>S</code>: + * </p> + * <ul> + * <li><code>d(x,y) >= 0</code>, non-negativity or separation axiom</li> + * <li><code>d(x,y) == 0</code>, if and only if, <code>x == y</code></li> + * <li><code>d(x,y) == d(y,x)</code>, symmetry, and</li> + * <li><code>d(x,z) <= d(x,y) + d(y,z)</code>, the triangle inequality</li> + * </ul> + * + * + * <p> + * This is a BiFunction<CharSequence, CharSequence, R>. + * The <code>apply</code> method + * accepts a pair of {@link CharSequence} parameters + * and returns an <code>R</code> type similarity score. + * </p> + * + * @param <R> The type of similarity score unit used by this EditDistance. + * @since 1.0 + */ +public interface EditDistance<R> extends SimilarityScore<R> { + + /** + * Compares two CharSequences. + * + * @param left the first CharSequence + * @param right the second CharSequence + * @return the similarity score between two CharSequences + */ + @Override + R apply(CharSequence left, CharSequence right); + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/similarity/EditDistanceFrom.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/similarity/EditDistanceFrom.java b/src/main/java/org/apache/commons/text/beta/similarity/EditDistanceFrom.java new file mode 100644 index 0000000..34452fb --- /dev/null +++ b/src/main/java/org/apache/commons/text/beta/similarity/EditDistanceFrom.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.beta.similarity; + +/** + * <p> + * This stores a {@link EditDistance} implementation and a {@link CharSequence} "left" string. + * The {@link #apply(CharSequence right)} method accepts the "right" string and invokes the + * comparison function for the pair of strings. + * </p> + * + * <p> + * The following is an example which finds the most similar string: + * </p> + * <pre> + * EditDistance<Integer> editDistance = new LevenshteinDistance(); + * String target = "Apache"; + * EditDistanceFrom<Integer> editDistanceFrom = + * new EditDistanceFrom<Integer>(editDistance, target); + * String mostSimilar = null; + * Integer shortestDistance = null; + * + * for (String test : new String[] { "Appaloosa", "a patchy", "apple" }) { + * Integer distance = editDistanceFrom.apply(test); + * if (shortestDistance == null || distance < shortestDistance) { + * shortestDistance = distance; + * mostSimilar = test; + * } + * } + * + * System.out.println("The string most similar to \"" + target + "\" " + * + "is \"" + mostSimilar + "\" because " + * + "its distance is only " + shortestDistance + "."); + * </pre> + * + * @param <R> This is the type of similarity score used by the EditDistance function. + * @since 1.0 + */ +public class EditDistanceFrom<R> { + + /** + * Edit distance. + */ + private final EditDistance<R> editDistance; + /** + * Left parameter used in distance function. + */ + private final CharSequence left; + + /** + * <p>This accepts the edit distance implementation and the "left" string.</p> + * + * @param editDistance This may not be null. + * @param left This may be null here, + * but the EditDistance#compare(CharSequence left, CharSequence right) + * implementation may not accept nulls. + */ + public EditDistanceFrom(final EditDistance<R> editDistance, final CharSequence left) { + if (editDistance == null) { + throw new IllegalArgumentException("The edit distance may not be null."); + } + + this.editDistance = editDistance; + this.left = left; + } + + /** + * <p> + * This compares "left" field against the "right" parameter + * using the "edit distance" implementation. + * </p> + * + * @param right the second CharSequence + * @return the similarity score between two CharSequences + */ + public R apply(final CharSequence right) { + return editDistance.apply(left, right); + } + + /** + * Gets the left parameter. + * + * @return the left parameter + */ + public CharSequence getLeft() { + return left; + } + + /** + * Gets the edit distance. + * + * @return the edit distance + */ + public EditDistance<R> getEditDistance() { + return editDistance; + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/similarity/FuzzyScore.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/similarity/FuzzyScore.java b/src/main/java/org/apache/commons/text/beta/similarity/FuzzyScore.java new file mode 100644 index 0000000..3647a49 --- /dev/null +++ b/src/main/java/org/apache/commons/text/beta/similarity/FuzzyScore.java @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.beta.similarity; + +import java.util.Locale; + +/** + * A matching algorithm that is similar to the searching algorithms implemented in editors such + * as Sublime Text, TextMate, Atom and others. + * + * <p> + * One point is given for every matched character. Subsequent matches yield two bonus points. A higher score + * indicates a higher similarity. + * </p> + * + * <p> + * This code has been adapted from Apache Commons Lang 3.3. + * </p> + * + * @since 1.0 + */ +public class FuzzyScore { + + /** + * Locale used to change the case of text. + */ + private final Locale locale; + + + /** + * <p>This returns a {@link Locale}-specific {@link FuzzyScore}.</p> + * + * @param locale The string matching logic is case insensitive. + A {@link Locale} is necessary to normalize both Strings to lower case. + * @throws IllegalArgumentException + * This is thrown if the {@link Locale} parameter is {@code null}. + */ + public FuzzyScore(final Locale locale) { + if (locale == null) { + throw new IllegalArgumentException("Locale must not be null"); + } + this.locale = locale; + } + + /** + * <p> + * Find the Fuzzy Score which indicates the similarity score between two + * Strings. + * </p> + * + * <pre> + * score.fuzzyScore(null, null, null) = IllegalArgumentException + * score.fuzzyScore("", "", Locale.ENGLISH) = 0 + * score.fuzzyScore("Workshop", "b", Locale.ENGLISH) = 0 + * score.fuzzyScore("Room", "o", Locale.ENGLISH) = 1 + * score.fuzzyScore("Workshop", "w", Locale.ENGLISH) = 1 + * score.fuzzyScore("Workshop", "ws", Locale.ENGLISH) = 2 + * score.fuzzyScore("Workshop", "wo", Locale.ENGLISH) = 4 + * score.fuzzyScore("Apache Software Foundation", "asf", Locale.ENGLISH) = 3 + * </pre> + * + * @param term a full term that should be matched against, must not be null + * @param query the query that will be matched against a term, must not be + * null + * @return result score + * @throws IllegalArgumentException if either String input {@code null} or + * Locale input {@code null} + */ + public Integer fuzzyScore(final CharSequence term, final CharSequence query) { + if (term == null || query == null) { + throw new IllegalArgumentException("Strings must not be null"); + } + + // fuzzy logic is case insensitive. We normalize the Strings to lower + // case right from the start. Turning characters to lower case + // via Character.toLowerCase(char) is unfortunately insufficient + // as it does not accept a locale. + final String termLowerCase = term.toString().toLowerCase(locale); + final String queryLowerCase = query.toString().toLowerCase(locale); + + // the resulting score + int score = 0; + + // the position in the term which will be scanned next for potential + // query character matches + int termIndex = 0; + + // index of the previously matched character in the term + int previousMatchingCharacterIndex = Integer.MIN_VALUE; + + for (int queryIndex = 0; queryIndex < queryLowerCase.length(); queryIndex++) { + final char queryChar = queryLowerCase.charAt(queryIndex); + + boolean termCharacterMatchFound = false; + for (; termIndex < termLowerCase.length() + && !termCharacterMatchFound; termIndex++) { + final char termChar = termLowerCase.charAt(termIndex); + + if (queryChar == termChar) { + // simple character matches result in one point + score++; + + // subsequent character matches further improve + // the score. + if (previousMatchingCharacterIndex + 1 == termIndex) { + score += 2; + } + + previousMatchingCharacterIndex = termIndex; + + // we can leave the nested loop. Every character in the + // query can match at most one character in the term. + termCharacterMatchFound = true; + } + } + } + + return score; + } + + /** + * Gets the locale. + * + * @return the locale + */ + public Locale getLocale() { + return locale; + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/similarity/HammingDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/similarity/HammingDistance.java b/src/main/java/org/apache/commons/text/beta/similarity/HammingDistance.java new file mode 100644 index 0000000..2ab73ea --- /dev/null +++ b/src/main/java/org/apache/commons/text/beta/similarity/HammingDistance.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.beta.similarity; + +/** + * The hamming distance between two strings of equal length is the number of + * positions at which the corresponding symbols are different. + * + * <p> + * For further explanation about the Hamming Distance, take a look at its + * Wikipedia page at http://en.wikipedia.org/wiki/Hamming_distance. + * </p> + * + * @since 1.0 + */ +public class HammingDistance implements EditDistance<Integer> { + + /** + * Find the Hamming Distance between two strings with the same + * length. + * + * <p>The distance starts with zero, and for each occurrence of a + * different character in either String, it increments the distance + * by 1, and finally return its value.</p> + * + * <p>Since the Hamming Distance can only be calculated between strings of equal length, input of different lengths + * will throw IllegalArgumentException</p> + * + * <pre> + * distance.apply("", "") = 0 + * distance.apply("pappa", "pappa") = 0 + * distance.apply("1011101", "1011111") = 1 + * distance.apply("ATCG", "ACCC") = 2 + * distance.apply("karolin", "kerstin" = 3 + * </pre> + * + * @param left the first CharSequence, must not be null + * @param right the second CharSequence, must not be null + * @return distance + * @throws IllegalArgumentException if either input is {@code null} or + * if they do not have the same length + */ + @Override + public Integer apply(final CharSequence left, final CharSequence right) { + if (left == null || right == null) { + throw new IllegalArgumentException("Strings must not be null"); + } + + if (left.length() != right.length()) { + throw new IllegalArgumentException("Strings must have the same length"); + } + + int distance = 0; + + for (int i = 0; i < left.length(); i++) { + if (left.charAt(i) != right.charAt(i)) { + distance++; + } + } + + return distance; + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/similarity/JaccardDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/similarity/JaccardDistance.java b/src/main/java/org/apache/commons/text/beta/similarity/JaccardDistance.java new file mode 100644 index 0000000..515830d --- /dev/null +++ b/src/main/java/org/apache/commons/text/beta/similarity/JaccardDistance.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.beta.similarity; + +/** + * Measures the Jaccard distance of two sets of character sequence. Jaccard + * distance is the dissimilarity between two sets. Its the complementary of + * Jaccard similarity. + * + * <p> + * For further explanation about Jaccard Distance, refer + * https://en.wikipedia.org/wiki/Jaccard_index + * </p> + * + * @since 1.0 + */ +public class JaccardDistance implements EditDistance<Double> { + + private final JaccardSimilarity jaccardSimilarity = new JaccardSimilarity(); + + /** + * Calculates Jaccard distance of two set character sequence passed as + * input. Calculates Jaccard similarity and returns the complement of it. + * + * @param left first character sequence + * @param right second character sequence + * @return index + * @throws IllegalArgumentException + * if either String input {@code null} + */ + @Override + public Double apply(CharSequence left, CharSequence right) { + if (left == null || right == null) { + throw new IllegalArgumentException("Input cannot be null"); + } + return Math.round((1 - jaccardSimilarity.apply(left, right)) * 100d) / 100d; + } +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/similarity/JaccardSimilarity.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/similarity/JaccardSimilarity.java b/src/main/java/org/apache/commons/text/beta/similarity/JaccardSimilarity.java new file mode 100644 index 0000000..db3e08d --- /dev/null +++ b/src/main/java/org/apache/commons/text/beta/similarity/JaccardSimilarity.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.beta.similarity; + +import java.util.HashSet; +import java.util.Set; + +/** + * Measures the Jaccard similarity (aka Jaccard index) of two sets of character + * sequence. Jaccard similarity is the size of the intersection divided by the + * size of the union of the two sets. + * + * <p> + * For further explanation about Jaccard Similarity, refer + * https://en.wikipedia.org/wiki/Jaccard_index + * </p> + * + * @since 1.0 + */ +public class JaccardSimilarity implements SimilarityScore<Double> { + + /** + * Calculates Jaccard Similarity of two set character sequence passed as + * input. + * + * @param left first character sequence + * @param right second character sequence + * @return index + * @throws IllegalArgumentException + * if either String input {@code null} + */ + @Override + public Double apply(CharSequence left, CharSequence right) { + if (left == null || right == null) { + throw new IllegalArgumentException("Input cannot be null"); + } + return Math.round(calculateJaccardSimilarity(left, right) * 100d) / 100d; + } + + /** + * Calculates Jaccard Similarity of two character sequences passed as + * input. Does the calculation by identifying the union (characters in at + * least one of the two sets) of the two sets and intersection (characters + * which are present in set one which are present in set two) + * + * @param left first character sequence + * @param right second character sequence + * @return index + */ + private Double calculateJaccardSimilarity(CharSequence left, CharSequence right) { + Set<String> intersectionSet = new HashSet<String>(); + Set<String> unionSet = new HashSet<String>(); + boolean unionFilled = false; + int leftLength = left.length(); + int rightLength = right.length(); + if (leftLength == 0 || rightLength == 0) { + return 0d; + } + + for (int leftIndex = 0; leftIndex < leftLength; leftIndex++) { + unionSet.add(String.valueOf(left.charAt(leftIndex))); + for (int rightIndex = 0; rightIndex < rightLength; rightIndex++) { + if (!unionFilled) { + unionSet.add(String.valueOf(right.charAt(rightIndex))); + } + if (left.charAt(leftIndex) == right.charAt(rightIndex)) { + intersectionSet.add(String.valueOf(left.charAt(leftIndex))); + } + } + unionFilled = true; + } + return Double.valueOf(intersectionSet.size()) / Double.valueOf(unionSet.size()); + } +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/similarity/JaroWinklerDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/beta/similarity/JaroWinklerDistance.java b/src/main/java/org/apache/commons/text/beta/similarity/JaroWinklerDistance.java new file mode 100644 index 0000000..6e81946 --- /dev/null +++ b/src/main/java/org/apache/commons/text/beta/similarity/JaroWinklerDistance.java @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.beta.similarity; + +import java.util.Arrays; + +/** + * A similarity algorithm indicating the percentage of matched characters between two character sequences. + * + * <p> + * The Jaro measure is the weighted sum of percentage of matched characters + * from each file and transposed characters. Winkler increased this measure + * for matching initial characters. + * </p> + * + * <p> + * This implementation is based on the Jaro Winkler similarity algorithm + * from <a href="http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance"> + * http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a>. + * </p> + * + * <p> + * This code has been adapted from Apache Commons Lang 3.3. + * </p> + * + * @since 1.0 + */ +public class JaroWinklerDistance implements SimilarityScore<Double> { + + /** + * The default prefix length limit set to four. + */ + private static final int PREFIX_LENGTH_LIMIT = 4; + /** + * Represents a failed index search. + */ + public static final int INDEX_NOT_FOUND = -1; + + /** + * Find the Jaro Winkler Distance which indicates the similarity score + * between two CharSequences. + * + * <pre> + * distance.apply(null, null) = IllegalArgumentException + * distance.apply("","") = 0.0 + * distance.apply("","a") = 0.0 + * distance.apply("aaapppp", "") = 0.0 + * distance.apply("frog", "fog") = 0.93 + * distance.apply("fly", "ant") = 0.0 + * distance.apply("elephant", "hippo") = 0.44 + * distance.apply("hippo", "elephant") = 0.44 + * distance.apply("hippo", "zzzzzzzz") = 0.0 + * distance.apply("hello", "hallo") = 0.88 + * distance.apply("ABC Corporation", "ABC Corp") = 0.93 + * distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.95 + * distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.92 + * distance.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.88 + * </pre> + * + * @param left the first String, must not be null + * @param right the second String, must not be null + * @return result distance + * @throws IllegalArgumentException if either String input {@code null} + */ + @Override + public Double apply(final CharSequence left, final CharSequence right) { + final double defaultScalingFactor = 0.1; + final double percentageRoundValue = 100.0; + + if (left == null || right == null) { + throw new IllegalArgumentException("Strings must not be null"); + } + + int[] mtp = matches(left, right); + double m = mtp[0]; + if (m == 0) { + return 0D; + } + double j = ((m / left.length() + m / right.length() + (m - mtp[1]) / m)) / 3; + double jw = j < 0.7D ? j : j + Math.min(defaultScalingFactor, 1D / mtp[3]) * mtp[2] * (1D - j); + return Math.round(jw * percentageRoundValue) / percentageRoundValue; + } + + /** + * This method returns the Jaro-Winkler string matches, transpositions, prefix, max array. + * + * @param first the first string to be matched + * @param second the second string to be machted + * @return mtp array containing: matches, transpositions, prefix, and max length + */ + protected static int[] matches(final CharSequence first, final CharSequence second) { + CharSequence max, min; + if (first.length() > second.length()) { + max = first; + min = second; + } else { + max = second; + min = first; + } + int range = Math.max(max.length() / 2 - 1, 0); + int[] matchIndexes = new int[min.length()]; + Arrays.fill(matchIndexes, -1); + boolean[] matchFlags = new boolean[max.length()]; + int matches = 0; + for (int mi = 0; mi < min.length(); mi++) { + char c1 = min.charAt(mi); + for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) { + if (!matchFlags[xi] && c1 == max.charAt(xi)) { + matchIndexes[mi] = xi; + matchFlags[xi] = true; + matches++; + break; + } + } + } + char[] ms1 = new char[matches]; + char[] ms2 = new char[matches]; + for (int i = 0, si = 0; i < min.length(); i++) { + if (matchIndexes[i] != -1) { + ms1[si] = min.charAt(i); + si++; + } + } + for (int i = 0, si = 0; i < max.length(); i++) { + if (matchFlags[i]) { + ms2[si] = max.charAt(i); + si++; + } + } + int transpositions = 0; + for (int mi = 0; mi < ms1.length; mi++) { + if (ms1[mi] != ms2[mi]) { + transpositions++; + } + } + int prefix = 0; + for (int mi = 0; mi < min.length(); mi++) { + if (first.charAt(mi) == second.charAt(mi)) { + prefix++; + } else { + break; + } + } + return new int[] { matches, transpositions / 2, prefix, max.length() }; + } + +}