http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/diff/EditCommand.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/diff/EditCommand.java 
b/src/main/java/org/apache/commons/text/beta/diff/EditCommand.java
new file mode 100644
index 0000000..a98a719
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/beta/diff/EditCommand.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.beta.diff;
+
+/**
+ * Abstract base class for all commands used to transform an objects sequence
+ * into another one.
+ * <p>
+ * When two objects sequences are compared through the
+ * {@link StringsComparator#getScript StringsComparator.getScript} method,
+ * the result is provided has a {@link EditScript script} containing the 
commands
+ * that progressively transform the first sequence into the second one.
+ * </p>
+ * <p>
+ * There are only three types of commands, all of which are subclasses of this
+ * abstract class. Each command is associated with one object belonging to at
+ * least one of the sequences. These commands are {@link InsertCommand
+ * InsertCommand} which correspond to an object of the second sequence being
+ * inserted into the first sequence, {@link DeleteCommand DeleteCommand} which
+ * correspond to an object of the first sequence being removed and
+ * {@link KeepCommand KeepCommand} which correspond to an object of the first
+ * sequence which <code>equals</code> an object in the second sequence. It is
+ * guaranteed that comparison is always performed this way (i.e. the
+ * <code>equals</code> method of the object from the first sequence is used and
+ * the object passed as an argument comes from the second sequence) ; this can
+ * be important if subclassing is used for some elements in the first sequence
+ * and the <code>equals</code> method is specialized.
+ * </p>
+ *
+ * <p>
+ * This code has been adapted from Apache Commons Collections 4.0.
+ * </p>
+ *
+ * @see StringsComparator
+ * @see EditScript
+ *
+ * @param <T> object type
+ * @since 1.0
+ */
+public abstract class EditCommand<T> {
+
+    /** Object on which the command should be applied. */
+    private final T object;
+
+    /**
+     * Simple constructor. Creates a new instance of EditCommand
+     *
+     * @param object  reference to the object associated with this command, 
this
+     *   refers to an element of one of the sequences being compared
+     */
+    protected EditCommand(final T object) {
+        this.object = object;
+    }
+
+    /**
+     * Returns the object associated with this command.
+     *
+     * @return the object on which the command is applied
+     */
+    protected T getObject() {
+        return object;
+    }
+
+    /**
+     * Accept a visitor.
+     * <p>
+     * This method is invoked for each commands belonging to
+     * an {@link EditScript EditScript}, in order to implement the visitor 
design pattern
+     *
+     * @param visitor  the visitor to be accepted
+     */
+    public abstract void accept(CommandVisitor<T> visitor);
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/diff/EditScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/diff/EditScript.java 
b/src/main/java/org/apache/commons/text/beta/diff/EditScript.java
new file mode 100644
index 0000000..ed18438
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/beta/diff/EditScript.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.beta.diff;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * This class gathers all the {@link EditCommand commands} needed to transform
+ * one objects sequence into another objects sequence.
+ * <p>
+ * An edit script is the most general view of the differences between two
+ * sequences. It is built as the result of the comparison between two sequences
+ * by the {@link StringsComparator StringsComparator} class. The user can
+ * walk through it using the <em>visitor</em> design pattern.
+ * <p>
+ * It is guaranteed that the objects embedded in the {@link InsertCommand 
insert
+ * commands} come from the second sequence and that the objects embedded in
+ * either the {@link DeleteCommand delete commands} or {@link KeepCommand keep
+ * commands} come from the first sequence. This can be important if subclassing
+ * is used for some elements in the first sequence and the <code>equals</code>
+ * method is specialized.
+ *
+ * @see StringsComparator
+ * @see EditCommand
+ * @see CommandVisitor
+ * @see ReplacementsHandler
+ *
+ * @param <T> object type
+ * @since 1.0
+ */
+public class EditScript<T> {
+
+    /** Container for the commands. */
+    private final List<EditCommand<T>> commands;
+
+    /** Length of the longest common subsequence. */
+    private int lcsLength;
+
+    /** Number of modifications. */
+    private int modifications;
+
+    /**
+     * Simple constructor. Creates a new empty script.
+     */
+    public EditScript() {
+        commands = new ArrayList<>();
+        lcsLength = 0;
+        modifications = 0;
+    }
+
+    /**
+     * Add a keep command to the script.
+     *
+     * @param command  command to add
+     */
+    public void append(final KeepCommand<T> command) {
+        commands.add(command);
+        ++lcsLength;
+    }
+
+    /**
+     * Add an insert command to the script.
+     *
+     * @param command  command to add
+     */
+    public void append(final InsertCommand<T> command) {
+        commands.add(command);
+        ++modifications;
+    }
+
+    /**
+     * Add a delete command to the script.
+     *
+     * @param command  command to add
+     */
+    public void append(final DeleteCommand<T> command) {
+        commands.add(command);
+        ++modifications;
+    }
+
+    /**
+     * Visit the script. The script implements the <em>visitor</em> design
+     * pattern, this method is the entry point to which the user supplies its
+     * own visitor, the script will be responsible to drive it through the
+     * commands in order and call the appropriate method as each command is
+     * encountered.
+     *
+     * @param visitor  the visitor that will visit all commands in turn
+     */
+    public void visit(final CommandVisitor<T> visitor) {
+        for (final EditCommand<T> command : commands) {
+            command.accept(visitor);
+        }
+    }
+
+    /**
+     * Get the length of the Longest Common Subsequence (LCS). The length of 
the
+     * longest common subsequence is the number of {@link KeepCommand keep
+     * commands} in the script.
+     *
+     * @return length of the Longest Common Subsequence
+     */
+    public int getLCSLength() {
+        return lcsLength;
+    }
+
+    /**
+     * Get the number of effective modifications. The number of effective
+     * modification is the number of {@link DeleteCommand delete} and
+     * {@link InsertCommand insert} commands in the script.
+     *
+     * @return number of effective modifications
+     */
+    public int getModifications() {
+        return modifications;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/diff/InsertCommand.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/diff/InsertCommand.java 
b/src/main/java/org/apache/commons/text/beta/diff/InsertCommand.java
new file mode 100644
index 0000000..ecdde02
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/beta/diff/InsertCommand.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.beta.diff;
+
+/**
+ * Command representing the insertion of one object of the second sequence.
+ * <p>
+ * When one object of the second sequence has no corresponding object in the
+ * first sequence at the right place, the {@link EditScript edit script}
+ * transforming the first sequence into the second sequence uses an instance of
+ * this class to represent the insertion of this object. The objects embedded 
in
+ * these type of commands always come from the second sequence.
+ * </p>
+ *
+ * @see StringsComparator
+ * @see EditScript
+ *
+ * @param <T> object type
+ * @since 1.0
+ */
+public class InsertCommand<T> extends EditCommand<T> {
+
+    /**
+     * Simple constructor. Creates a new instance of InsertCommand
+     *
+     * @param object  the object of the second sequence that should be inserted
+     */
+    public InsertCommand(final T object) {
+        super(object);
+    }
+
+    /**
+     * Accept a visitor. When an <code>InsertCommand</code> accepts a visitor,
+     * it calls its {@link CommandVisitor#visitInsertCommand 
visitInsertCommand}
+     * method.
+     *
+     * @param visitor  the visitor to be accepted
+     */
+    @Override
+    public void accept(final CommandVisitor<T> visitor) {
+        visitor.visitInsertCommand(getObject());
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/diff/KeepCommand.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/diff/KeepCommand.java 
b/src/main/java/org/apache/commons/text/beta/diff/KeepCommand.java
new file mode 100644
index 0000000..7f03ade
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/beta/diff/KeepCommand.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.beta.diff;
+
+/**
+ * Command representing the keeping of one object present in both sequences.
+ * <p>
+ * When one object of the first sequence <code>equals</code> another objects in
+ * the second sequence at the right place, the {@link EditScript edit script}
+ * transforming the first sequence into the second sequence uses an instance of
+ * this class to represent the keeping of this object. The objects embedded in
+ * these type of commands always come from the first sequence.
+ * </p>
+ *
+ * @see StringsComparator
+ * @see EditScript
+ *
+ * @param <T> object type
+ * @since 1.0
+ */
+public class KeepCommand<T> extends EditCommand<T> {
+
+    /**
+     * Simple constructor. Creates a new instance of KeepCommand
+     *
+     * @param object  the object belonging to both sequences (the object is a
+     *   reference to the instance in the first sequence which is known
+     *   to be equal to an instance in the second sequence)
+     */
+    public KeepCommand(final T object) {
+        super(object);
+    }
+
+    /**
+     * Accept a visitor. When a <code>KeepCommand</code> accepts a visitor, it
+     * calls its {@link CommandVisitor#visitKeepCommand visitKeepCommand} 
method.
+     *
+     * @param visitor  the visitor to be accepted
+     */
+    @Override
+    public void accept(final CommandVisitor<T> visitor) {
+        visitor.visitKeepCommand(getObject());
+    }
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/diff/ReplacementsFinder.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/commons/text/beta/diff/ReplacementsFinder.java 
b/src/main/java/org/apache/commons/text/beta/diff/ReplacementsFinder.java
new file mode 100644
index 0000000..a09c015
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/beta/diff/ReplacementsFinder.java
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.beta.diff;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * This class handles sequences of replacements resulting from a comparison.
+ * <p>
+ * The comparison of two objects sequences leads to the identification of 
common
+ * parts and parts which only belong to the first or to the second sequence. 
The
+ * common parts appear in the edit script in the form of <em>keep</em> 
commands,
+ * they can be considered as synchronization objects between the two sequences.
+ * These synchronization objects split the two sequences in synchronized
+ * sub-sequences. The first sequence can be transformed into the second one by
+ * replacing each synchronized sub-sequence of the first sequence by the
+ * corresponding sub-sequence of the second sequence. This is a synthetic way 
to
+ * see an {@link EditScript edit script}, replacing individual
+ * {@link DeleteCommand delete}, {@link KeepCommand keep} and
+ * {@link InsertCommand insert} commands by fewer replacements acting on
+ * complete sub-sequences.
+ * </p>
+ * <p>
+ * This class is devoted to perform this interpretation. It visits an
+ * {@link EditScript edit script} (because it implements the
+ * {@link CommandVisitor CommandVisitor} interface) and calls a user-supplied
+ * handler implementing the {@link ReplacementsHandler ReplacementsHandler}
+ * interface to process the sub-sequences.
+ * </p>
+ *
+ * @see ReplacementsHandler
+ * @see EditScript
+ * @see StringsComparator
+ *
+ * @param <T> object type
+ * @since 1.0
+ */
+public class ReplacementsFinder<T> implements CommandVisitor<T> {
+
+    /**
+     * List of pending insertions.
+     */
+    private final List<T> pendingInsertions;
+    /**
+     * List of pending deletions.
+     */
+    private final List<T> pendingDeletions;
+    /**
+     * Count of elements skipped.
+     */
+    private int skipped;
+
+    /** Handler to call when synchronized sequences are found. */
+    private final ReplacementsHandler<T> handler;
+
+    /**
+     * Simple constructor. Creates a new instance of {@link 
ReplacementsFinder}.
+     *
+     * @param handler  handler to call when synchronized sequences are found
+     */
+    public ReplacementsFinder(final ReplacementsHandler<T> handler) {
+        pendingInsertions = new ArrayList<>();
+        pendingDeletions  = new ArrayList<>();
+        skipped           = 0;
+        this.handler      = handler;
+    }
+
+    /**
+     * Add an object to the pending insertions set.
+     *
+     * @param object  object to insert
+     */
+    @Override
+    public void visitInsertCommand(final T object) {
+        pendingInsertions.add(object);
+    }
+
+    /**
+     * Handle a synchronization object.
+     * <p>
+     * When a synchronization object is identified, the pending insertions and
+     * pending deletions sets are provided to the user handler as subsequences.
+     * </p>
+     *
+     * @param object  synchronization object detected
+     */
+    @Override
+    public void visitKeepCommand(final T object) {
+        if (pendingDeletions.isEmpty() && pendingInsertions.isEmpty()) {
+            ++skipped;
+        } else {
+            handler.handleReplacement(skipped, pendingDeletions, 
pendingInsertions);
+            pendingDeletions.clear();
+            pendingInsertions.clear();
+            skipped = 1;
+        }
+    }
+
+    /**
+     * Add an object to the pending deletions set.
+     *
+     * @param object  object to delete
+     */
+    @Override
+    public void visitDeleteCommand(final T object) {
+        pendingDeletions.add(object);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/diff/ReplacementsHandler.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/commons/text/beta/diff/ReplacementsHandler.java 
b/src/main/java/org/apache/commons/text/beta/diff/ReplacementsHandler.java
new file mode 100644
index 0000000..8744081
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/beta/diff/ReplacementsHandler.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.beta.diff;
+
+import java.util.List;
+
+/**
+ * This interface is devoted to handle synchronized replacement sequences.
+ *
+ * @param <T> object type
+ * @see ReplacementsFinder
+ * @since 1.0
+ */
+public interface ReplacementsHandler<T> {
+
+    /**
+     * Handle two synchronized sequences.
+     * <p>
+     * This method is called by a {@link ReplacementsFinder ReplacementsFinder}
+     * instance when it has synchronized two sub-sequences of object arrays
+     * being compared, and at least one of the sequences is non-empty. Since 
the
+     * sequences are synchronized, the objects before the two sub-sequences are
+     * equals (if they exist). This property also holds for the objects after
+     * the two sub-sequences.
+     * <p>
+     * The replacement is defined as replacing the <code>from</code>
+     * sub-sequence into the <code>to</code> sub-sequence.
+     *
+     * @param skipped  number of tokens skipped since the last call (i.e. 
number of
+     *   tokens that were in both sequences), this number should be strictly 
positive
+     *   except on the very first call where it can be zero (if the first 
object of
+     *   the two sequences are different)
+     * @param from  sub-sequence of objects coming from the first sequence
+     * @param to  sub-sequence of objects coming from the second sequence
+     */
+    void handleReplacement(int skipped, List<T> from, List<T> to);
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/diff/StringsComparator.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/commons/text/beta/diff/StringsComparator.java 
b/src/main/java/org/apache/commons/text/beta/diff/StringsComparator.java
new file mode 100644
index 0000000..3d39433
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/beta/diff/StringsComparator.java
@@ -0,0 +1,331 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.beta.diff;
+
+/**
+ * <p>
+ * It is guaranteed that the comparisons will always be done as
+ * <code>o1.equals(o2)</code> where <code>o1</code> belongs to the first
+ * sequence and <code>o2</code> belongs to the second sequence. This can
+ * be important if subclassing is used for some elements in the first
+ * sequence and the <code>equals</code> method is specialized.
+ * </p>
+ * <p>
+ * Comparison can be seen from two points of view: either as giving the 
smallest
+ * modification allowing to transform the first sequence into the second one, 
or
+ * as giving the longest sequence which is a subsequence of both initial
+ * sequences. The <code>equals</code> method is used to compare objects, so any
+ * object can be put into sequences. Modifications include deleting, inserting
+ * or keeping one object, starting from the beginning of the first sequence.
+ * </p>
+ * <p>
+ * This class implements the comparison algorithm, which is the very efficient
+ * algorithm from Eugene W. Myers
+ * <a href="http://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps";>
+ * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm 
produces
+ * the shortest possible {@link EditScript edit script} containing all the
+ * {@link EditCommand commands} needed to transform the first sequence into
+ * the second one.
+ *
+ * <p>
+ * This code has been adapted from Apache Commons Collections 4.0.
+ * </p>
+ *
+ * @see EditScript
+ * @see EditCommand
+ * @see CommandVisitor
+ * @since 1.0
+ */
+public class StringsComparator {
+
+    /**
+     * First character sequence.
+     */
+    private final String left;
+    /**
+     * Second character sequence.
+     */
+    private final String right;
+    /**
+     * Temporary array.
+     */
+    private final int[] vDown;
+    /**
+     * Temporary array.
+     */
+    private final int[] vUp;
+
+    /**
+     * Simple constructor.
+     * <p>
+     * Creates a new instance of StringsComparator.
+     * </p>
+     * <p>
+     * It is <em>guaranteed</em> that the comparisons will always be done as
+     * <code>o1.equals(o2)</code> where <code>o1</code> belongs to the first
+     * sequence and <code>o2</code> belongs to the second sequence. This can be
+     * important if subclassing is used for some elements in the first sequence
+     * and the <code>equals</code> method is specialized.
+     * </p>
+     *
+     * @param left first character sequence to be compared
+     * @param right second character sequence to be compared
+     */
+    public StringsComparator(final String left, final String right) {
+        this.left = left;
+        this.right = right;
+
+        final int size = left.length() + right.length() + 2;
+        vDown = new int[size];
+        vUp   = new int[size];
+    }
+
+    /**
+     * Get the {@link EditScript} object.
+     * <p>
+     * It is guaranteed that the objects embedded in the {@link InsertCommand
+     * insert commands} come from the second sequence and that the objects
+     * embedded in either the {@link DeleteCommand delete commands} or
+     * {@link KeepCommand keep commands} come from the first sequence. This can
+     * be important if subclassing is used for some elements in the first
+     * sequence and the <code>equals</code> method is specialized.
+     * </p>
+     *
+     * @return the edit script resulting from the comparison of the two
+     *         sequences
+     */
+    public EditScript<Character> getScript() {
+        final EditScript<Character> script = new EditScript<>();
+        buildScript(0, left.length(), 0, right.length(), script);
+        return script;
+    }
+
+    /**
+     * Build an edit script.
+     *
+     * @param start1  the begin of the first sequence to be compared
+     * @param end1  the end of the first sequence to be compared
+     * @param start2  the begin of the second sequence to be compared
+     * @param end2  the end of the second sequence to be compared
+     * @param script the edited script
+     */
+    private void buildScript(final int start1, final int end1, final int 
start2, final int end2,
+            final EditScript<Character> script) {
+        final Snake middle = getMiddleSnake(start1, end1, start2, end2);
+
+        if (middle == null
+                || middle.getStart() == end1 && middle.getDiag() == end1 - end2
+                || middle.getEnd() == start1 && middle.getDiag() == start1 - 
start2) {
+
+            int i = start1;
+            int j = start2;
+            while (i < end1 || j < end2) {
+                if (i < end1 && j < end2 && left.charAt(i) == right.charAt(j)) 
{
+                    script.append(new KeepCommand<>(left.charAt(i)));
+                    ++i;
+                    ++j;
+                } else {
+                    if (end1 - start1 > end2 - start2) {
+                        script.append(new DeleteCommand<>(left.charAt(i)));
+                        ++i;
+                    } else {
+                        script.append(new InsertCommand<>(right.charAt(j)));
+                        ++j;
+                    }
+                }
+            }
+
+        } else {
+
+            buildScript(start1, middle.getStart(),
+                        start2, middle.getStart() - middle.getDiag(),
+                        script);
+            for (int i = middle.getStart(); i < middle.getEnd(); ++i) {
+                script.append(new KeepCommand<>(left.charAt(i)));
+            }
+            buildScript(middle.getEnd(), end1,
+                        middle.getEnd() - middle.getDiag(), end2,
+                        script);
+        }
+    }
+
+    /**
+     * Get the middle snake corresponding to two subsequences of the
+     * main sequences.
+     * <p>
+     * The snake is found using the MYERS Algorithm (this algorithms has
+     * also been implemented in the GNU diff program). This algorithm is
+     * explained in Eugene Myers article:
+     * <a href="http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps";>
+     * An O(ND) Difference Algorithm and Its Variations</a>.
+     * </p>
+     *
+     * @param start1  the begin of the first sequence to be compared
+     * @param end1  the end of the first sequence to be compared
+     * @param start2  the begin of the second sequence to be compared
+     * @param end2  the end of the second sequence to be compared
+     * @return the middle snake
+     */
+    private Snake getMiddleSnake(final int start1, final int end1, final int 
start2, final int end2) {
+        // Myers Algorithm
+        // Initialisations
+        final int m = end1 - start1;
+        final int n = end2 - start2;
+        if (m == 0 || n == 0) {
+            return null;
+        }
+
+        final int delta  = m - n;
+        final int sum    = n + m;
+        final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2;
+        vDown[1 + offset] = start1;
+        vUp[1 + offset]   = end1 + 1;
+
+        for (int d = 0; d <= offset; ++d) {
+            // Down
+            for (int k = -d; k <= d; k += 2) {
+                // First step
+
+                final int i = k + offset;
+                if (k == -d || k != d && vDown[i - 1] < vDown[i + 1]) {
+                    vDown[i] = vDown[i + 1];
+                } else {
+                    vDown[i] = vDown[i - 1] + 1;
+                }
+
+                int x = vDown[i];
+                int y = x - start1 + start2 - k;
+
+                while (x < end1 && y < end2 && left.charAt(x) == 
right.charAt(y)) {
+                    vDown[i] = ++x;
+                    ++y;
+                }
+                // Second step
+                if (delta % 2 != 0 && delta - d <= k && k <= delta + d) {
+                    if (vUp[i - delta] <= vDown[i]) { // NOPMD
+                        return buildSnake(vUp[i - delta], k + start1 - start2, 
end1, end2);
+                    }
+                }
+            }
+
+            // Up
+            for (int k = delta - d; k <= delta + d; k += 2) {
+                // First step
+                final int i = k + offset - delta;
+                if (k == delta - d
+                        || k != delta + d && vUp[i + 1] <= vUp[i - 1]) {
+                    vUp[i] = vUp[i + 1] - 1;
+                } else {
+                    vUp[i] = vUp[i - 1];
+                }
+
+                int x = vUp[i] - 1;
+                int y = x - start1 + start2 - k;
+                while (x >= start1 && y >= start2
+                        && left.charAt(x) == right.charAt(y)) {
+                    vUp[i] = x--;
+                    y--;
+                }
+                // Second step
+                if (delta % 2 == 0 && -d <= k && k <= d) {
+                    if (vUp[i] <= vDown[i + delta]) { // NOPMD
+                        return buildSnake(vUp[i], k + start1 - start2, end1, 
end2);
+                    }
+                }
+            }
+        }
+
+        // this should not happen
+        throw new RuntimeException("Internal Error");
+    }
+
+    /**
+     * Build a snake.
+     *
+     * @param start  the value of the start of the snake
+     * @param diag  the value of the diagonal of the snake
+     * @param end1  the value of the end of the first sequence to be compared
+     * @param end2  the value of the end of the second sequence to be compared
+     * @return the snake built
+     */
+    private Snake buildSnake(final int start, final int diag, final int end1, 
final int end2) {
+        int end = start;
+        while (end - diag < end2
+                && end < end1
+                && left.charAt(end) == right.charAt(end - diag)) {
+            ++end;
+        }
+        return new Snake(start, end, diag);
+    }
+
+    /**
+     * This class is a simple placeholder to hold the end part of a path
+     * under construction in a {@link StringsComparator StringsComparator}.
+     */
+    private static class Snake {
+
+        /** Start index. */
+        private final int start;
+
+        /** End index. */
+        private final int end;
+
+        /** Diagonal number. */
+        private final int diag;
+
+        /**
+         * Simple constructor. Creates a new instance of Snake with specified 
indices.
+         *
+         * @param start  start index of the snake
+         * @param end  end index of the snake
+         * @param diag  diagonal number
+         */
+        public Snake(final int start, final int end, final int diag) {
+            this.start = start;
+            this.end   = end;
+            this.diag  = diag;
+        }
+
+        /**
+         * Get the start index of the snake.
+         *
+         * @return start index of the snake
+         */
+        public int getStart() {
+            return start;
+        }
+
+        /**
+         * Get the end index of the snake.
+         *
+         * @return end index of the snake
+         */
+        public int getEnd() {
+            return end;
+        }
+
+        /**
+         * Get the diagonal number of the snake.
+         *
+         * @return diagonal number of the snake
+         */
+        public int getDiag() {
+            return diag;
+        }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/diff/package-info.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/diff/package-info.java 
b/src/main/java/org/apache/commons/text/beta/diff/package-info.java
new file mode 100644
index 0000000..0d1323f
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/beta/diff/package-info.java
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * <p>Provides algorithms for diff between strings.</p>
+ *
+ * <p>The initial implementation of the Myers algorithm was adapted from the
+ * commons-collections sequence package.</p>
+ *
+ * @since 1.0
+ */
+package org.apache.commons.text.beta.diff;

http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/package-info.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/package-info.java 
b/src/main/java/org/apache/commons/text/beta/package-info.java
new file mode 100644
index 0000000..dc58111
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/beta/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * <p>Basic classes for text handling.</p>
+ *
+ * @since 1.0
+ */
+package org.apache.commons.text.beta;

http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/similarity/CosineDistance.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/commons/text/beta/similarity/CosineDistance.java 
b/src/main/java/org/apache/commons/text/beta/similarity/CosineDistance.java
new file mode 100644
index 0000000..a51fbee
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/beta/similarity/CosineDistance.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.beta.similarity;
+
+import java.util.Map;
+
+/**
+ * Measures the cosine distance between two character sequences.
+ *
+ * <p>It utilizes the {@link CosineSimilarity} to compute the distance. 
Character sequences
+ * are converted into vectors through a simple tokenizer that works with a 
regular expression
+ * to split words in a sentence.</p>
+ *
+ * <p>
+ * For further explanation about Cosine Similarity and Cosine Distance, refer 
to
+ * http://en.wikipedia.org/wiki/Cosine_similarity.
+ * </p>
+ *
+ * @since 1.0
+ * @see CosineSimilarity
+ */
+public class CosineDistance implements EditDistance<Double> {
+    /**
+     * Tokenizer used to convert the character sequence into a vector.
+     */
+    private final Tokenizer<CharSequence> tokenizer = new RegexTokenizer();
+    /**
+     * Cosine similarity.
+     */
+    private final CosineSimilarity cosineSimilarity = new CosineSimilarity();
+
+    @Override
+    public Double apply(final CharSequence left, final CharSequence right) {
+        final CharSequence[] leftTokens = tokenizer.tokenize(left);
+        final CharSequence[] rightTokens = tokenizer.tokenize(right);
+
+        final Map<CharSequence, Integer> leftVector = Counter.of(leftTokens);
+        final Map<CharSequence, Integer> rightVector = Counter.of(rightTokens);
+        final double similarity = 
cosineSimilarity.cosineSimilarity(leftVector, rightVector);
+        return 1.0 - similarity;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/similarity/CosineSimilarity.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/commons/text/beta/similarity/CosineSimilarity.java 
b/src/main/java/org/apache/commons/text/beta/similarity/CosineSimilarity.java
new file mode 100644
index 0000000..b8fe704
--- /dev/null
+++ 
b/src/main/java/org/apache/commons/text/beta/similarity/CosineSimilarity.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.beta.similarity;
+
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Measures the Cosine similarity of two vectors of an inner product space and
+ * compares the angle between them.
+ *
+ * <p>
+ * For further explanation about the Cosine Similarity, refer to
+ * http://en.wikipedia.org/wiki/Cosine_similarity.
+ * </p>
+ *
+ * @since 1.0
+ */
+public class CosineSimilarity {
+
+    /**
+     * Calculates the cosine similarity for two given vectors.
+     *
+     * @param leftVector left vector
+     * @param rightVector right vector
+     * @return cosine similarity between the two vectors
+     */
+    public Double cosineSimilarity(final Map<CharSequence, Integer> 
leftVector, final Map<CharSequence, Integer> rightVector) {
+        if (leftVector == null || rightVector == null) {
+            throw new IllegalArgumentException("Vectors must not be null");
+        }
+
+        final Set<CharSequence> intersection = getIntersection(leftVector, 
rightVector);
+
+        final double dotProduct = dot(leftVector, rightVector, intersection);
+        double d1 = 0.0d;
+        for (final Integer value : leftVector.values()) {
+            d1 += Math.pow(value, 2);
+        }
+        double d2 = 0.0d;
+        for (final Integer value : rightVector.values()) {
+            d2 += Math.pow(value, 2);
+        }
+        double cosineSimilarity;
+        if (d1 <= 0.0 || d2 <= 0.0) {
+            cosineSimilarity = 0.0;
+        } else {
+            cosineSimilarity = (double) (dotProduct / (double) (Math.sqrt(d1) 
* Math.sqrt(d2)));
+        }
+        return cosineSimilarity;
+    }
+
+    /**
+     * Returns a set with strings common to the two given maps.
+     *
+     * @param leftVector left vector map
+     * @param rightVector right vector map
+     * @return common strings
+     */
+    private Set<CharSequence> getIntersection(final Map<CharSequence, Integer> 
leftVector,
+            final Map<CharSequence, Integer> rightVector) {
+        final Set<CharSequence> intersection = new 
HashSet<>(leftVector.keySet());
+        intersection.retainAll(rightVector.keySet());
+        return intersection;
+    }
+
+    /**
+     * Computes the dot product of two vectors. It ignores remaining elements. 
It means
+     * that if a vector is longer than other, then a smaller part of it will 
be used to compute
+     * the dot product.
+     *
+     * @param leftVector left vector
+     * @param rightVector right vector
+     * @param intersection common elements
+     * @return the dot product
+     */
+    private double dot(final Map<CharSequence, Integer> leftVector, final 
Map<CharSequence, Integer> rightVector,
+            final Set<CharSequence> intersection) {
+        long dotProduct = 0;
+        for (final CharSequence key : intersection) {
+            dotProduct += leftVector.get(key) * rightVector.get(key);
+        }
+        return dotProduct;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/similarity/Counter.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/beta/similarity/Counter.java 
b/src/main/java/org/apache/commons/text/beta/similarity/Counter.java
new file mode 100644
index 0000000..5093cbf
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/beta/similarity/Counter.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.beta.similarity;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Java implementation of Python's collections Counter module.
+ *
+ * <p>It counts how many times each element provided occurred in an array and
+ * returns a dict with the element as key and the count as value.</p>
+ *
+ * @see <a 
href="https://docs.python.org/dev/library/collections.html#collections.Counter";>
+ * https://docs.python.org/dev/library/collections.html#collections.Counter</a>
+ *
+ * @since 1.0
+ */
+final class Counter {
+
+    /**
+     * Hidden constructor.
+     */
+    private Counter() {
+        super();
+    }
+
+    /**
+     * It counts how many times each element provided occurred in an array and
+     * returns a dict with the element as key and the count as value.
+     *
+     * @param tokens array of tokens
+     * @return dict, where the elements are key, and the count the value
+     */
+    public static Map<CharSequence, Integer> of(final CharSequence[] tokens) {
+        final Map<CharSequence, Integer> innerCounter = new HashMap<>();
+        for (final CharSequence token : tokens) {
+            if (innerCounter.containsKey(token)) {
+                int value = innerCounter.get(token);
+                innerCounter.put(token, ++value);
+            } else {
+                innerCounter.put(token, 1);
+            }
+        }
+        return innerCounter;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/similarity/EditDistance.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/commons/text/beta/similarity/EditDistance.java 
b/src/main/java/org/apache/commons/text/beta/similarity/EditDistance.java
new file mode 100644
index 0000000..4de96fb
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/beta/similarity/EditDistance.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.beta.similarity;
+
+/**
+ * Interface for <a href="http://en.wikipedia.org/wiki/Edit_distance";>Edit 
Distances</a>.
+ *
+ * <p>
+ * An edit distance is a formal metric on the Kleene closure 
(<code>X<sup>*</sup></code>) over an
+ * alphabet (<code>X</code>). Note, that a <a 
href="https://en.wikipedia.org/wiki/Metric_(mathematics)">metric</a>
+ * on a set <code>S</code> is a function <code>d: [S * S] -&gt; [0, 
INFINITY)</code> such
+ * that the following hold for <code>x,y,z</code> in
+ * the set <code>S</code>:
+ * </p>
+ * <ul>
+ *     <li><code>d(x,y) &gt;= 0</code>, non-negativity or separation axiom</li>
+ *     <li><code>d(x,y) == 0</code>, if and only if, <code>x == y</code></li>
+ *     <li><code>d(x,y) == d(y,x)</code>, symmetry, and</li>
+ *     <li><code>d(x,z) &lt;=  d(x,y) + d(y,z)</code>, the triangle 
inequality</li>
+ * </ul>
+ *
+ *
+ * <p>
+ * This is a BiFunction&lt;CharSequence, CharSequence, R&gt;.
+ * The <code>apply</code> method
+ * accepts a pair of {@link CharSequence} parameters
+ * and returns an <code>R</code> type similarity score.
+ * </p>
+ *
+ * @param <R> The type of similarity score unit used by this EditDistance.
+ * @since 1.0
+ */
+public interface EditDistance<R> extends SimilarityScore<R> {
+
+    /**
+     * Compares two CharSequences.
+     *
+     * @param left the first CharSequence
+     * @param right the second CharSequence
+     * @return the similarity score between two CharSequences
+     */
+    @Override
+    R apply(CharSequence left, CharSequence right);
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/similarity/EditDistanceFrom.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/commons/text/beta/similarity/EditDistanceFrom.java 
b/src/main/java/org/apache/commons/text/beta/similarity/EditDistanceFrom.java
new file mode 100644
index 0000000..34452fb
--- /dev/null
+++ 
b/src/main/java/org/apache/commons/text/beta/similarity/EditDistanceFrom.java
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.beta.similarity;
+
+/**
+ * <p>
+ * This stores a {@link EditDistance} implementation and a {@link 
CharSequence} "left" string.
+ * The {@link #apply(CharSequence right)} method accepts the "right" string 
and invokes the
+ * comparison function for the pair of strings.
+ * </p>
+ *
+ * <p>
+ * The following is an example which finds the most similar string:
+ * </p>
+ * <pre>
+ * EditDistance&lt;Integer&gt; editDistance = new LevenshteinDistance();
+ * String target = "Apache";
+ * EditDistanceFrom&lt;Integer&gt; editDistanceFrom =
+ *     new EditDistanceFrom&lt;Integer&gt;(editDistance, target);
+ * String mostSimilar = null;
+ * Integer shortestDistance = null;
+ *
+ * for (String test : new String[] { "Appaloosa", "a patchy", "apple" }) {
+ *     Integer distance = editDistanceFrom.apply(test);
+ *     if (shortestDistance == null || distance &lt; shortestDistance) {
+ *         shortestDistance = distance;
+ *         mostSimilar = test;
+ *     }
+ * }
+ *
+ * System.out.println("The string most similar to \"" + target + "\" "
+ *     + "is \"" + mostSimilar + "\" because "
+ *     + "its distance is only " + shortestDistance + ".");
+ * </pre>
+ *
+ * @param <R> This is the type of similarity score used by the EditDistance 
function.
+ * @since 1.0
+ */
+public class EditDistanceFrom<R> {
+
+    /**
+     * Edit distance.
+     */
+    private final EditDistance<R> editDistance;
+    /**
+     * Left parameter used in distance function.
+     */
+    private final CharSequence left;
+
+    /**
+     * <p>This accepts the edit distance implementation and the "left" 
string.</p>
+     *
+     * @param editDistance This may not be null.
+     * @param left This may be null here,
+     *             but the EditDistance#compare(CharSequence left, 
CharSequence right)
+     *             implementation may not accept nulls.
+     */
+    public EditDistanceFrom(final EditDistance<R> editDistance, final 
CharSequence left) {
+        if (editDistance == null) {
+            throw new IllegalArgumentException("The edit distance may not be 
null.");
+        }
+
+        this.editDistance = editDistance;
+        this.left = left;
+    }
+
+    /**
+     * <p>
+     * This compares "left" field against the "right" parameter
+     * using the "edit distance" implementation.
+     * </p>
+     *
+     * @param right the second CharSequence
+     * @return the similarity score between two CharSequences
+     */
+    public R apply(final CharSequence right) {
+        return editDistance.apply(left, right);
+    }
+
+    /**
+     * Gets the left parameter.
+     *
+     * @return the left parameter
+     */
+    public CharSequence getLeft() {
+        return left;
+    }
+
+    /**
+     * Gets the edit distance.
+     *
+     * @return the edit distance
+     */
+    public EditDistance<R> getEditDistance() {
+        return editDistance;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/similarity/FuzzyScore.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/commons/text/beta/similarity/FuzzyScore.java 
b/src/main/java/org/apache/commons/text/beta/similarity/FuzzyScore.java
new file mode 100644
index 0000000..3647a49
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/beta/similarity/FuzzyScore.java
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.beta.similarity;
+
+import java.util.Locale;
+
+/**
+ * A matching algorithm that is similar to the searching algorithms 
implemented in editors such
+ * as Sublime Text, TextMate, Atom and others.
+ *
+ * <p>
+ * One point is given for every matched character. Subsequent matches yield 
two bonus points. A higher score
+ * indicates a higher similarity.
+ * </p>
+ *
+ * <p>
+ * This code has been adapted from Apache Commons Lang 3.3.
+ * </p>
+ *
+ * @since 1.0
+ */
+public class FuzzyScore {
+
+    /**
+     * Locale used to change the case of text.
+     */
+    private final Locale locale;
+
+
+    /**
+     * <p>This returns a {@link Locale}-specific {@link FuzzyScore}.</p>
+     *
+     * @param locale The string matching logic is case insensitive.
+                     A {@link Locale} is necessary to normalize both Strings 
to lower case.
+     * @throws IllegalArgumentException
+     *         This is thrown if the {@link Locale} parameter is {@code null}.
+     */
+    public FuzzyScore(final Locale locale) {
+        if (locale == null) {
+            throw new IllegalArgumentException("Locale must not be null");
+        }
+        this.locale = locale;
+    }
+
+    /**
+     * <p>
+     * Find the Fuzzy Score which indicates the similarity score between two
+     * Strings.
+     * </p>
+     *
+     * <pre>
+     * score.fuzzyScore(null, null, null)                                    = 
IllegalArgumentException
+     * score.fuzzyScore("", "", Locale.ENGLISH)                              = 0
+     * score.fuzzyScore("Workshop", "b", Locale.ENGLISH)                     = 0
+     * score.fuzzyScore("Room", "o", Locale.ENGLISH)                         = 
1
+     * score.fuzzyScore("Workshop", "w", Locale.ENGLISH)                     = 
1
+     * score.fuzzyScore("Workshop", "ws", Locale.ENGLISH)                    = 
2
+     * score.fuzzyScore("Workshop", "wo", Locale.ENGLISH)                    = 
4
+     * score.fuzzyScore("Apache Software Foundation", "asf", Locale.ENGLISH) = 
3
+     * </pre>
+     *
+     * @param term a full term that should be matched against, must not be null
+     * @param query the query that will be matched against a term, must not be
+     *            null
+     * @return result score
+     * @throws IllegalArgumentException if either String input {@code null} or
+     *             Locale input {@code null}
+     */
+    public Integer fuzzyScore(final CharSequence term, final CharSequence 
query) {
+        if (term == null || query == null) {
+            throw new IllegalArgumentException("Strings must not be null");
+        }
+
+        // fuzzy logic is case insensitive. We normalize the Strings to lower
+        // case right from the start. Turning characters to lower case
+        // via Character.toLowerCase(char) is unfortunately insufficient
+        // as it does not accept a locale.
+        final String termLowerCase = term.toString().toLowerCase(locale);
+        final String queryLowerCase = query.toString().toLowerCase(locale);
+
+        // the resulting score
+        int score = 0;
+
+        // the position in the term which will be scanned next for potential
+        // query character matches
+        int termIndex = 0;
+
+        // index of the previously matched character in the term
+        int previousMatchingCharacterIndex = Integer.MIN_VALUE;
+
+        for (int queryIndex = 0; queryIndex < queryLowerCase.length(); 
queryIndex++) {
+            final char queryChar = queryLowerCase.charAt(queryIndex);
+
+            boolean termCharacterMatchFound = false;
+            for (; termIndex < termLowerCase.length()
+                    && !termCharacterMatchFound; termIndex++) {
+                final char termChar = termLowerCase.charAt(termIndex);
+
+                if (queryChar == termChar) {
+                    // simple character matches result in one point
+                    score++;
+
+                    // subsequent character matches further improve
+                    // the score.
+                    if (previousMatchingCharacterIndex + 1 == termIndex) {
+                        score += 2;
+                    }
+
+                    previousMatchingCharacterIndex = termIndex;
+
+                    // we can leave the nested loop. Every character in the
+                    // query can match at most one character in the term.
+                    termCharacterMatchFound = true;
+                }
+            }
+        }
+
+        return score;
+    }
+
+    /**
+     * Gets the locale.
+     *
+     * @return the locale
+     */
+    public Locale getLocale() {
+        return locale;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/similarity/HammingDistance.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/commons/text/beta/similarity/HammingDistance.java 
b/src/main/java/org/apache/commons/text/beta/similarity/HammingDistance.java
new file mode 100644
index 0000000..2ab73ea
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/beta/similarity/HammingDistance.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.beta.similarity;
+
+/**
+ * The hamming distance between two strings of equal length is the number of
+ * positions at which the corresponding symbols are different.
+ *
+ * <p>
+ * For further explanation about the Hamming Distance, take a look at its
+ * Wikipedia page at http://en.wikipedia.org/wiki/Hamming_distance.
+ * </p>
+ *
+ * @since 1.0
+ */
+public class HammingDistance implements EditDistance<Integer> {
+
+    /**
+     * Find the Hamming Distance between two strings with the same
+     * length.
+     *
+     * <p>The distance starts with zero, and for each occurrence of a
+     * different character in either String, it increments the distance
+     * by 1, and finally return its value.</p>
+     *
+     * <p>Since the Hamming Distance can only be calculated between strings of 
equal length, input of different lengths
+     * will throw IllegalArgumentException</p>
+     *
+     * <pre>
+     * distance.apply("", "")               = 0
+     * distance.apply("pappa", "pappa")     = 0
+     * distance.apply("1011101", "1011111") = 1
+     * distance.apply("ATCG", "ACCC")       = 2
+     * distance.apply("karolin", "kerstin"  = 3
+     * </pre>
+     *
+     * @param left the first CharSequence, must not be null
+     * @param right the second CharSequence, must not be null
+     * @return distance
+     * @throws IllegalArgumentException if either input is {@code null} or
+     *             if they do not have the same length
+     */
+    @Override
+    public Integer apply(final CharSequence left, final CharSequence right) {
+        if (left == null || right == null) {
+            throw new IllegalArgumentException("Strings must not be null");
+        }
+
+        if (left.length() != right.length()) {
+            throw new IllegalArgumentException("Strings must have the same 
length");
+        }
+
+        int distance = 0;
+
+        for (int i = 0; i < left.length(); i++) {
+            if (left.charAt(i) != right.charAt(i)) {
+                distance++;
+            }
+        }
+
+        return distance;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/similarity/JaccardDistance.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/commons/text/beta/similarity/JaccardDistance.java 
b/src/main/java/org/apache/commons/text/beta/similarity/JaccardDistance.java
new file mode 100644
index 0000000..515830d
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/beta/similarity/JaccardDistance.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.beta.similarity;
+
+/**
+ * Measures the Jaccard distance of two sets of character sequence. Jaccard
+ * distance is the dissimilarity between two sets. Its the complementary of
+ * Jaccard similarity.
+ *
+ * <p>
+ * For further explanation about Jaccard Distance, refer
+ * https://en.wikipedia.org/wiki/Jaccard_index
+ * </p>
+ *
+ * @since 1.0
+ */
+public class JaccardDistance implements EditDistance<Double> {
+
+    private final JaccardSimilarity jaccardSimilarity = new 
JaccardSimilarity();
+
+    /**
+     * Calculates Jaccard distance of two set character sequence passed as
+     * input. Calculates Jaccard similarity and returns the complement of it.
+     * 
+     * @param left first character sequence
+     * @param right second character sequence
+     * @return index
+     * @throws IllegalArgumentException
+     *             if either String input {@code null}
+     */
+    @Override
+    public Double apply(CharSequence left, CharSequence right) {
+        if (left == null || right == null) {
+            throw new IllegalArgumentException("Input cannot be null");
+        }
+        return Math.round((1 - jaccardSimilarity.apply(left, right)) * 100d) / 
100d;
+    }
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/similarity/JaccardSimilarity.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/commons/text/beta/similarity/JaccardSimilarity.java 
b/src/main/java/org/apache/commons/text/beta/similarity/JaccardSimilarity.java
new file mode 100644
index 0000000..db3e08d
--- /dev/null
+++ 
b/src/main/java/org/apache/commons/text/beta/similarity/JaccardSimilarity.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.beta.similarity;
+
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Measures the Jaccard similarity (aka Jaccard index) of two sets of character
+ * sequence. Jaccard similarity is the size of the intersection divided by the
+ * size of the union of the two sets.
+ *
+ * <p>
+ * For further explanation about Jaccard Similarity, refer
+ * https://en.wikipedia.org/wiki/Jaccard_index
+ * </p>
+ *
+ * @since 1.0
+ */
+public class JaccardSimilarity implements SimilarityScore<Double> {
+
+    /**
+     * Calculates Jaccard Similarity of two set character sequence passed as
+     * input.
+     * 
+     * @param left first character sequence
+     * @param right second character sequence
+     * @return index
+     * @throws IllegalArgumentException
+     *             if either String input {@code null}
+     */
+    @Override
+    public Double apply(CharSequence left, CharSequence right) {
+        if (left == null || right == null) {
+            throw new IllegalArgumentException("Input cannot be null");
+        }
+        return Math.round(calculateJaccardSimilarity(left, right) * 100d) / 
100d;
+    }
+
+    /**
+     * Calculates Jaccard Similarity of two character sequences passed as
+     * input. Does the calculation by identifying the union (characters in at
+     * least one of the two sets) of the two sets and intersection (characters
+     * which are present in set one which are present in set two)
+     * 
+     * @param left first character sequence
+     * @param right second character sequence
+     * @return index
+     */
+    private Double calculateJaccardSimilarity(CharSequence left, CharSequence 
right) {
+        Set<String> intersectionSet = new HashSet<String>();
+        Set<String> unionSet = new HashSet<String>();
+        boolean unionFilled = false;
+        int leftLength = left.length();
+        int rightLength = right.length();
+        if (leftLength == 0 || rightLength == 0) {
+            return 0d;
+        }
+
+        for (int leftIndex = 0; leftIndex < leftLength; leftIndex++) {
+            unionSet.add(String.valueOf(left.charAt(leftIndex)));
+            for (int rightIndex = 0; rightIndex < rightLength; rightIndex++) {
+                if (!unionFilled) {
+                    unionSet.add(String.valueOf(right.charAt(rightIndex)));
+                }
+                if (left.charAt(leftIndex) == right.charAt(rightIndex)) {
+                    
intersectionSet.add(String.valueOf(left.charAt(leftIndex)));
+                }
+            }
+            unionFilled = true;
+        }
+        return Double.valueOf(intersectionSet.size()) / 
Double.valueOf(unionSet.size());
+    }
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d927351/src/main/java/org/apache/commons/text/beta/similarity/JaroWinklerDistance.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/commons/text/beta/similarity/JaroWinklerDistance.java
 
b/src/main/java/org/apache/commons/text/beta/similarity/JaroWinklerDistance.java
new file mode 100644
index 0000000..6e81946
--- /dev/null
+++ 
b/src/main/java/org/apache/commons/text/beta/similarity/JaroWinklerDistance.java
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.beta.similarity;
+
+import java.util.Arrays;
+
+/**
+ * A similarity algorithm indicating the percentage of matched characters 
between two character sequences.
+ *
+ * <p>
+ * The Jaro measure is the weighted sum of percentage of matched characters
+ * from each file and transposed characters. Winkler increased this measure
+ * for matching initial characters.
+ * </p>
+ *
+ * <p>
+ * This implementation is based on the Jaro Winkler similarity algorithm
+ * from <a href="http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance";>
+ * http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a>.
+ * </p>
+ *
+ * <p>
+ * This code has been adapted from Apache Commons Lang 3.3.
+ * </p>
+ *
+ * @since 1.0
+ */
+public class JaroWinklerDistance implements SimilarityScore<Double> {
+
+    /**
+     * The default prefix length limit set to four.
+     */
+    private static final int PREFIX_LENGTH_LIMIT = 4;
+    /**
+     * Represents a failed index search.
+     */
+    public static final int INDEX_NOT_FOUND = -1;
+
+    /**
+     * Find the Jaro Winkler Distance which indicates the similarity score
+     * between two CharSequences.
+     *
+     * <pre>
+     * distance.apply(null, null)          = IllegalArgumentException
+     * distance.apply("","")               = 0.0
+     * distance.apply("","a")              = 0.0
+     * distance.apply("aaapppp", "")       = 0.0
+     * distance.apply("frog", "fog")       = 0.93
+     * distance.apply("fly", "ant")        = 0.0
+     * distance.apply("elephant", "hippo") = 0.44
+     * distance.apply("hippo", "elephant") = 0.44
+     * distance.apply("hippo", "zzzzzzzz") = 0.0
+     * distance.apply("hello", "hallo")    = 0.88
+     * distance.apply("ABC Corporation", "ABC Corp") = 0.93
+     * distance.apply("D N H Enterprises Inc", "D &amp; H Enterprises, Inc.") 
= 0.95
+     * distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens 
Fitness") = 0.92
+     * distance.apply("PENNSYLVANIA", "PENNCISYLVNIA")    = 0.88
+     * </pre>
+     *
+     * @param left the first String, must not be null
+     * @param right the second String, must not be null
+     * @return result distance
+     * @throws IllegalArgumentException if either String input {@code null}
+     */
+    @Override
+    public Double apply(final CharSequence left, final CharSequence right) {
+        final double defaultScalingFactor = 0.1;
+        final double percentageRoundValue = 100.0;
+
+        if (left == null || right == null) {
+            throw new IllegalArgumentException("Strings must not be null");
+        }
+
+        int[] mtp = matches(left, right);
+        double m = mtp[0];
+        if (m == 0) {
+            return 0D;
+        }
+        double j = ((m / left.length() + m / right.length() + (m - mtp[1]) / 
m)) / 3;
+        double jw = j < 0.7D ? j : j + Math.min(defaultScalingFactor, 1D / 
mtp[3]) * mtp[2] * (1D - j);
+        return Math.round(jw * percentageRoundValue) / percentageRoundValue;
+    }
+
+    /**
+     * This method returns the Jaro-Winkler string matches, transpositions, 
prefix, max array.
+     *
+     * @param first the first string to be matched
+     * @param second the second string to be machted
+     * @return mtp array containing: matches, transpositions, prefix, and max 
length
+     */
+    protected static int[] matches(final CharSequence first, final 
CharSequence second) {
+        CharSequence max, min;
+        if (first.length() > second.length()) {
+            max = first;
+            min = second;
+        } else {
+            max = second;
+            min = first;
+        }
+        int range = Math.max(max.length() / 2 - 1, 0);
+        int[] matchIndexes = new int[min.length()];
+        Arrays.fill(matchIndexes, -1);
+        boolean[] matchFlags = new boolean[max.length()];
+        int matches = 0;
+        for (int mi = 0; mi < min.length(); mi++) {
+            char c1 = min.charAt(mi);
+            for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 
1, max.length()); xi < xn; xi++) {
+                if (!matchFlags[xi] && c1 == max.charAt(xi)) {
+                    matchIndexes[mi] = xi;
+                    matchFlags[xi] = true;
+                    matches++;
+                    break;
+                }
+            }
+        }
+        char[] ms1 = new char[matches];
+        char[] ms2 = new char[matches];
+        for (int i = 0, si = 0; i < min.length(); i++) {
+            if (matchIndexes[i] != -1) {
+                ms1[si] = min.charAt(i);
+                si++;
+            }
+        }
+        for (int i = 0, si = 0; i < max.length(); i++) {
+            if (matchFlags[i]) {
+                ms2[si] = max.charAt(i);
+                si++;
+            }
+        }
+        int transpositions = 0;
+        for (int mi = 0; mi < ms1.length; mi++) {
+            if (ms1[mi] != ms2[mi]) {
+                transpositions++;
+            }
+        }
+        int prefix = 0;
+        for (int mi = 0; mi < min.length(); mi++) {
+            if (first.charAt(mi) == second.charAt(mi)) {
+                prefix++;
+            } else {
+                break;
+            }
+        }
+        return new int[] { matches, transpositions / 2, prefix, max.length() };
+    }
+
+}

Reply via email to