Author: britter
Date: Wed May  7 18:42:33 2014
New Revision: 1593112

URL: http://svn.apache.org/r1593112
Log:
LANG-999: Add fuzzy String matching logic to StringUtils. This also closes #20 
from github. Thanks to Ben Ripkens.

Modified:
    commons/proper/lang/trunk/src/changes/changes.xml
    
commons/proper/lang/trunk/src/main/java/org/apache/commons/lang3/StringUtils.java
    
commons/proper/lang/trunk/src/test/java/org/apache/commons/lang3/StringUtilsTest.java

Modified: commons/proper/lang/trunk/src/changes/changes.xml
URL: 
http://svn.apache.org/viewvc/commons/proper/lang/trunk/src/changes/changes.xml?rev=1593112&r1=1593111&r2=1593112&view=diff
==============================================================================
--- commons/proper/lang/trunk/src/changes/changes.xml [utf-8] (original)
+++ commons/proper/lang/trunk/src/changes/changes.xml [utf-8] Wed May  7 
18:42:33 2014
@@ -22,6 +22,7 @@
   <body>
 
   <release version="3.4" date="tba" description="tba">
+    <action issue="LANG-999" type="add" dev="britter" due-to="Ben Ripkens">Add 
fuzzy String matching logic to StringUtils</action>
     <action issue="LANG-1006" type="update" dev="britter" due-to="Thiago 
Andrade">Add wrap (with String or char) to StringUtils</action>
     <action issue="LANG-1005" type="update" dev="britter" due-to="Michael 
Osipov">Extend DurationFormatUtils#formatDurationISO default pattern to match 
#formatDurationHMS</action>
     <action issue="LANG-1007" type="update" dev="britter" due-to="Thiago 
Andrade">Fixing NumberUtils JAVADoc comments for max methods</action>

Modified: 
commons/proper/lang/trunk/src/main/java/org/apache/commons/lang3/StringUtils.java
URL: 
http://svn.apache.org/viewvc/commons/proper/lang/trunk/src/main/java/org/apache/commons/lang3/StringUtils.java?rev=1593112&r1=1593111&r2=1593112&view=diff
==============================================================================
--- 
commons/proper/lang/trunk/src/main/java/org/apache/commons/lang3/StringUtils.java
 (original)
+++ 
commons/proper/lang/trunk/src/main/java/org/apache/commons/lang3/StringUtils.java
 Wed May  7 18:42:33 2014
@@ -7073,6 +7073,85 @@ public class StringUtils {
     }
 
     /**
+     * <p>Determine the fuzzy score which indicates the similarity between two 
Strings.</p>
+     *
+     * <p>This string matching algorithm is similar to the algorithms of 
editors such as Sublime Text,
+     * TextMate, Atom and others. One point is given for every matched 
character. Subsequent
+     * matches yield two bonus points. A higher score indicates a higher 
similarity.</p>
+     *
+     * <pre>
+     * StringUtils.getFuzzyDistance(null, null, null)                          
          = IllegalArgumentException
+     * StringUtils.getFuzzyDistance("", "", Locale.ENGLISH)                    
          = 0
+     * StringUtils.getFuzzyDistance("Workshop", "b", Locale.ENGLISH)           
          = 0
+     * StringUtils.getFuzzyDistance("Room", "o", Locale.ENGLISH)               
          = 1
+     * StringUtils.getFuzzyDistance("Workshop", "w", Locale.ENGLISH)           
          = 1
+     * StringUtils.getFuzzyDistance("Workshop", "ws", Locale.ENGLISH)          
          = 2
+     * StringUtils.getFuzzyDistance("Workshop", "wo", Locale.ENGLISH)          
          = 4
+     * StringUtils.getFuzzyDistance("Apache Software Foundation", "asf", 
Locale.ENGLISH) = 3
+     * </pre>
+     *
+     * @param term a full term that should be matched against, must not be null
+     * @param query the query that will be matched against a term, must not be 
null
+     * @param locale This string matching logic is case insensitive. A locale 
is necessary to normalize
+     *  both Strings to lower case.
+     * @return result score
+     * @throws IllegalArgumentException if either String input {@code null} or 
Locale input {@code null}
+     * @since 3.4
+     */
+    public static int getFuzzyDistance(final CharSequence term, final 
CharSequence query, final Locale locale) {
+        if (term == null || query == null) {
+            throw new IllegalArgumentException("Strings must not be null");
+        } else if (locale == null) {
+            throw new IllegalArgumentException("Locale must not be null");
+        }
+
+        // fuzzy logic is case insensitive. We normalize the Strings to lower
+        // case right from the start. Turning characters to lower case
+        // via Character.toLowerCase(char) is unfortunately insufficient
+        // as it does not accept a locale.
+        final String termLowerCase = term.toString().toLowerCase(locale);
+        final String queryLowerCase = query.toString().toLowerCase(locale);
+
+        // the resulting score
+        int score = 0;
+
+        // the position in the term which will be scanned next for potential
+        // query character matches
+        int termIndex = 0;
+
+        // index of the previously matched character in the term
+        int previousMatchingCharacterIndex = Integer.MIN_VALUE;
+
+        for (int queryIndex = 0; queryIndex < queryLowerCase.length(); 
queryIndex++) {
+            char queryChar = queryLowerCase.charAt(queryIndex);
+
+            boolean termCharacterMatchFound = false;
+            for (; termIndex < termLowerCase.length() && 
!termCharacterMatchFound; termIndex++) {
+                char termChar = termLowerCase.charAt(termIndex);
+
+                if (queryChar == termChar) {
+                    // simple character matches result in one point
+                    score++;
+
+                    // subsequent character matches further improve
+                    // the score.
+                    if (previousMatchingCharacterIndex + 1 == termIndex) {
+                        score += 2;
+                    }
+
+                    previousMatchingCharacterIndex = termIndex;
+
+                    // we can leave the nested loop. Every character in the
+                    // query can match at most one character in the term.
+                    termCharacterMatchFound = true;
+                }
+            }
+        }
+
+        return score;
+    }
+
+    /**
      * Gets a set of matching characters between two strings.
      *
      * <p><Two characters from the first string and the second string are 
considered matching if the character's

Modified: 
commons/proper/lang/trunk/src/test/java/org/apache/commons/lang3/StringUtilsTest.java
URL: 
http://svn.apache.org/viewvc/commons/proper/lang/trunk/src/test/java/org/apache/commons/lang3/StringUtilsTest.java?rev=1593112&r1=1593111&r2=1593112&view=diff
==============================================================================
--- 
commons/proper/lang/trunk/src/test/java/org/apache/commons/lang3/StringUtilsTest.java
 (original)
+++ 
commons/proper/lang/trunk/src/test/java/org/apache/commons/lang3/StringUtilsTest.java
 Wed May  7 18:42:33 2014
@@ -2018,6 +2018,37 @@ public class StringUtilsTest {
             StringUtils.getJaroWinklerDistance(null, "clear");
     }
 
+    @Test
+    public void testGetFuzzyDistance() throws Exception {
+        assertEquals(0, StringUtils.getFuzzyDistance("", "", Locale.ENGLISH));
+        assertEquals(0, StringUtils.getFuzzyDistance("Workshop", "b", 
Locale.ENGLISH));
+        assertEquals(1, StringUtils.getFuzzyDistance("Room", "o", 
Locale.ENGLISH));
+        assertEquals(1, StringUtils.getFuzzyDistance("Workshop", "w", 
Locale.ENGLISH));
+        assertEquals(2, StringUtils.getFuzzyDistance("Workshop", "ws", 
Locale.ENGLISH));
+        assertEquals(4, StringUtils.getFuzzyDistance("Workshop", "wo", 
Locale.ENGLISH));
+        assertEquals(3, StringUtils.getFuzzyDistance("Apache Software 
Foundation", "asf", Locale.ENGLISH));
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testGetFuzzyDistance_NullNullNull() throws Exception {
+        StringUtils.getFuzzyDistance(null, null, null);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testGetFuzzyDistance_StringNullLoclae() throws Exception {
+        StringUtils.getFuzzyDistance(" ", null, Locale.ENGLISH);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testGetFuzzyDistance_NullStringLocale() throws Exception {
+        StringUtils.getFuzzyDistance(null, "clear", Locale.ENGLISH);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testGetFuzzyDistance_StringStringNull() throws Exception {
+        StringUtils.getFuzzyDistance(" ", "clear", null);
+    }
+
     /**
      * A sanity check for {@link StringUtils#EMPTY}.
      */


Reply via email to