Author: sebb
Date: Sun Apr  2 20:41:29 2017
New Revision: 1789911

URL: http://svn.apache.org/viewvc?rev=1789911&view=rev
Log:
CODEC-233 Soundex should support more algorithm variants

Modified:
    commons/proper/codec/trunk/src/changes/changes.xml
    
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Soundex.java
    
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/SoundexTest.java

Modified: commons/proper/codec/trunk/src/changes/changes.xml
URL: 
http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/changes/changes.xml?rev=1789911&r1=1789910&r2=1789911&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/changes/changes.xml (original)
+++ commons/proper/codec/trunk/src/changes/changes.xml Sun Apr  2 20:41:29 2017
@@ -45,6 +45,7 @@ The <action> type attribute can be add,u
     <release version="1.11" date="2017-MM-DD" description="Feature and fix 
release.">
       <!-- The first attribute below should be the issue id; makes it easier 
to navigate in the IDE outline -->
 
+      <action issue="CODEC-233" dev="sebb" type="update" due-to="Yossi 
Tamari">Soundex should support more algorithm variants</action>
       <action issue="CODEC-145" dev="sebb" type="fix" due-to="Jesse 
Glick">Base64.encodeBase64String could better use newStringUsAscii (ditto 
encodeBase64URLSafeString)</action>
       <action issue="CODEC-144" dev="sebb" type="fix">BaseNCodec: 
encodeToString and encodeAsString methods are identical</action>
       <action issue="CODEC-232" dev="sebb" type="fix">URLCodec is neither 
immutable nor threadsafe</action>

Modified: 
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Soundex.java
URL: 
http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Soundex.java?rev=1789911&r1=1789910&r2=1789911&view=diff
==============================================================================
--- 
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Soundex.java
 (original)
+++ 
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Soundex.java
 Sun Apr  2 20:41:29 2017
@@ -32,15 +32,31 @@ import org.apache.commons.codec.StringEn
 public class Soundex implements StringEncoder {
 
     /**
+     * The marker character used to indicate a silent (ignored) character.
+     * These are ignored except when they appear as the first character.
+     * <p>
+     * Note: the {@link #US_ENGLISH_MAPPING_STRING} does not use this mechanism
+     * because changing it might break existing code. Mappings that don't 
contain
+     * a silent marker code are treated as though H and W are silent.
+     * <p>
+     * To override this, use the {@link #Soundex(String, boolean)} constructor.
+     * @since 1.11
+     */
+    public static final char SILENT_MARKER = '-';
+
+    /**
      * This is a default mapping of the 26 letters used in US English. A value 
of <code>0</code> for a letter position
-     * means do not encode.
+     * means do not encode, but treat as a separator when it occurs between 
consonants with the same code.
      * <p>
      * (This constant is provided as both an implementation convenience and to 
allow Javadoc to pick
      * up the value for the constant values page.)
-     * </p>
-     *
+     * <p>
+     * <b>Note that letters H and W are treated specially.</b>
+     * They are ignored (after the first letter) and don't act as separators
+     * between consonants with the same code.
      * @see #US_ENGLISH_MAPPING
      */
+    //                                                      
ABCDEFGHIJKLMNOPQRSTUVWXYZ
     public static final String US_ENGLISH_MAPPING_STRING = 
"01230120022455012623010202";
 
     /**
@@ -53,12 +69,45 @@ public class Soundex implements StringEn
 
     /**
      * An instance of Soundex using the US_ENGLISH_MAPPING mapping.
+     * This treats H and W as silent letters.
+     * Apart from when they appear as the first letter, they are ignored.
+     * They don't act as separators between duplicate codes.
      *
      * @see #US_ENGLISH_MAPPING
+     * @see #US_ENGLISH_MAPPING_STRING
      */
     public static final Soundex US_ENGLISH = new Soundex();
 
     /**
+     * An instance of Soundex using the Simplified Soundex mapping, as 
described here:
+     * http://west-penwith.org.uk/misc/soundex.htm
+     * <p>
+     * This treats H and W the same as vowels (AEIOUY).
+     * Such letters aren't encoded (after the first), but they do
+     * act as separators when dropping duplicate codes.
+     * The mapping is otherwise the same as for {@link #US_ENGLISH}.
+     *
+     * @since 1.11
+     */
+    public static final Soundex US_ENGLISH_SIMPLIFIED = new 
Soundex(US_ENGLISH_MAPPING_STRING, false);
+
+    /**
+     * An instance of Soundex using the mapping as per the Genealogy site:
+     * http://www.genealogy.com/articles/research/00000060.html
+     * <p>
+     * This treats vowels (AEIOUY), H and W as silent letters.
+     * Such letters are ignored (after the first) and do not
+     * act as separators when dropping duplicate codes.
+     * <p>
+     * The codes for consonants are otherwise the same as for 
+     * {@link #US_ENGLISH_MAPPING_STRING} and {@link #US_ENGLISH_SIMPLIFIED}
+     *
+     * @since 1.11
+     */
+    public static final Soundex US_ENGLISH_GENEALOGY = new 
Soundex("-123-12--22455-12623-1-2-2");
+    //                                                              
ABCDEFGHIJKLMNOPQRSTUVWXYZ
+
+    /**
      * The maximum length of a Soundex code - Soundex codes are only four 
characters by definition.
      *
      * @deprecated This feature is not needed since the encoding size must be 
constant. Will be removed in 2.0.
@@ -73,6 +122,15 @@ public class Soundex implements StringEn
     private final char[] soundexMapping;
 
     /**
+     * Should H and W be treated specially?
+     * <p>
+     * In versions of the code prior to 1.11,
+     * the code always treated H and W as silent (ignored) letters.
+     * If this field is false, H and W are no longer special-cased.
+     */
+    private final boolean specialCaseHW;
+
+    /**
      * Creates an instance using US_ENGLISH_MAPPING
      *
      * @see Soundex#Soundex(char[])
@@ -80,6 +138,7 @@ public class Soundex implements StringEn
      */
     public Soundex() {
         this.soundexMapping = US_ENGLISH_MAPPING;
+        this.specialCaseHW = true;
     }
 
     /**
@@ -88,6 +147,8 @@ public class Soundex implements StringEn
      *
      * Every letter of the alphabet is "mapped" to a numerical value. This 
char array holds the values to which each
      * letter is mapped. This implementation contains a default map for 
US_ENGLISH
+     * <p>
+     * If the mapping contains an instance of {@link #SILENT_MARKER} then H 
and W are not given special treatment.
      *
      * @param mapping
      *                  Mapping array to use when finding the corresponding 
code for a given character
@@ -95,11 +156,23 @@ public class Soundex implements StringEn
     public Soundex(final char[] mapping) {
         this.soundexMapping = new char[mapping.length];
         System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length);
+        this.specialCaseHW = !hasMarker(this.soundexMapping);
+    }
+
+    private boolean hasMarker(char[] mapping) {
+        for(char ch : mapping) {
+            if (ch == SILENT_MARKER) {
+                return true;
+            }
+        }
+        return false;
     }
 
     /**
      * Creates a refined soundex instance using a custom mapping. This 
constructor can be used to customize the mapping,
      * and/or possibly provide an internationalized mapping for a non-Western 
character set.
+     * <p>
+     * If the mapping contains an instance of {@link #SILENT_MARKER} then H 
and W are not given special treatment.
      *
      * @param mapping
      *            Mapping string to use when finding the corresponding code 
for a given character
@@ -107,6 +180,21 @@ public class Soundex implements StringEn
      */
     public Soundex(final String mapping) {
         this.soundexMapping = mapping.toCharArray();
+        this.specialCaseHW = !hasMarker(this.soundexMapping);
+    }
+
+    /**
+     * Creates a refined soundex instance using a custom mapping. This 
constructor can be used to customize the mapping,
+     * and/or possibly provide an internationalized mapping for a non-Western 
character set.
+     *
+     * @param mapping
+     *            Mapping string to use when finding the corresponding code 
for a given character
+     * @param specialCaseHW if true, then H and W are treated as silent (ignored) letters; if false, they are not special-cased
+     * @since 1.11
+     */
+    public Soundex(final String mapping, boolean specialCaseHW) {
+        this.soundexMapping = mapping.toCharArray();
+        this.specialCaseHW = specialCaseHW;
     }
 
     /**
@@ -190,7 +278,7 @@ public class Soundex implements StringEn
     private char map(final char ch) {
         final int index = ch - 'A';
         if (index < 0 || index >= this.soundexMapping.length) {
-            throw new IllegalArgumentException("The character is not mapped: " 
+ ch);
+            throw new IllegalArgumentException("The character is not mapped: " 
+ ch + " (index=" + index + ")");
         }
         return this.soundexMapping[index];
     }
@@ -231,10 +319,13 @@ public class Soundex implements StringEn
         char lastDigit = map(first); // previous digit
         for(int i = 1; i < str.length() && count < out.length ; i++) {
             char ch = str.charAt(i);
-            if (ch == 'H' || ch == 'W') { // these are ignored completely
+            if ((this.specialCaseHW) && (ch == 'H' || ch == 'W')) { // these 
are ignored completely
                 continue;
             }
             char digit = map(ch);
+            if (digit == SILENT_MARKER) {
+                continue;
+            }
             if (digit != '0' && digit != lastDigit) { // don't store vowels or 
repeats
                 out[count++] = digit;
             }

Modified: 
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/SoundexTest.java
URL: 
http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/SoundexTest.java?rev=1789911&r1=1789910&r2=1789911&view=diff
==============================================================================
--- 
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/SoundexTest.java
 (original)
+++ 
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/SoundexTest.java
 Sun Apr  2 20:41:29 2017
@@ -403,4 +403,33 @@ public class SoundexTest extends StringE
         Assert.assertEquals("T522", this.getStringEncoder().encode("Tymczak"));
         Assert.assertEquals("P236", this.getStringEncoder().encode("Pfister"));
     }
+
+    @Test
+// examples and algorithm rules from:  
http://www.genealogy.com/articles/research/00000060.html
+    public void testGenealogy() { // treat vowels and HW as silent
+        Soundex s = Soundex.US_ENGLISH_GENEALOGY;
+        Assert.assertEquals("H251", s.encode("Heggenburger"));
+        Assert.assertEquals("B425", s.encode("Blackman"));
+        Assert.assertEquals("S530", s.encode("Schmidt"));
+        Assert.assertEquals("L150", s.encode("Lippmann"));
+        // Additional local example
+        Assert.assertEquals("D200", s.encode("Dodds")); // 'o' is not a 
separator here - it is silent
+        Assert.assertEquals("D200", s.encode("Dhdds")); // 'h' is silent
+        Assert.assertEquals("D200", s.encode("Dwdds")); // 'w' is silent
+    }
+
+    @Test
+// examples and algorithm rules from:  
http://west-penwith.org.uk/misc/soundex.htm
+    public void testSimplifiedSoundex() { // treat vowels and HW as separators
+        Soundex s = Soundex.US_ENGLISH_SIMPLIFIED;
+        Assert.assertEquals("W452", s.encode("WILLIAMS"));
+        Assert.assertEquals("B625", s.encode("BARAGWANATH"));
+        Assert.assertEquals("D540", s.encode("DONNELL"));
+        Assert.assertEquals("L300", s.encode("LLOYD"));
+        Assert.assertEquals("W422", s.encode("WOOLCOCK"));
+        // Additional local examples
+        Assert.assertEquals("D320", s.encode("Dodds"));
+        Assert.assertEquals("D320", s.encode("Dwdds")); // w is a separator
+        Assert.assertEquals("D320", s.encode("Dhdds")); // h is a separator
+    }
 }


Reply via email to