Author: sebb
Date: Sun Apr 2 20:41:29 2017
New Revision: 1789911
URL: http://svn.apache.org/viewvc?rev=1789911&view=rev
Log:
CODEC-233 Soundex should support more algorithm variants
Modified:
commons/proper/codec/trunk/src/changes/changes.xml
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Soundex.java
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/SoundexTest.java
Modified: commons/proper/codec/trunk/src/changes/changes.xml
URL:
http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/changes/changes.xml?rev=1789911&r1=1789910&r2=1789911&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/changes/changes.xml (original)
+++ commons/proper/codec/trunk/src/changes/changes.xml Sun Apr 2 20:41:29 2017
@@ -45,6 +45,7 @@ The <action> type attribute can be add,u
<release version="1.11" date="2017-MM-DD" description="Feature and fix
release.">
<!-- The first attribute below should be the issue id; makes it easier
to navigate in the IDE outline -->
+ <action issue="CODEC-233" dev="sebb" type="update" due-to="Yossi
Tamari">Soundex should support more algorithm variants</action>
<action issue="CODEC-145" dev="sebb" type="fix" due-to="Jesse
Glick">Base64.encodeBase64String could better use newStringUsAscii (ditto
encodeBase64URLSafeString)</action>
<action issue="CODEC-144" dev="sebb" type="fix">BaseNCodec:
encodeToString and encodeAsString methods are identical</action>
<action issue="CODEC-232" dev="sebb" type="fix">URLCodec is neither
immutable nor threadsafe</action>
Modified:
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Soundex.java
URL:
http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Soundex.java?rev=1789911&r1=1789910&r2=1789911&view=diff
==============================================================================
---
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Soundex.java
(original)
+++
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Soundex.java
Sun Apr 2 20:41:29 2017
@@ -32,15 +32,31 @@ import org.apache.commons.codec.StringEn
public class Soundex implements StringEncoder {
/**
+ * The marker character used to indicate a silent (ignored) character.
+ * These are ignored except when they appear as the first character.
+ * <p>
+ * Note: the {@link #US_ENGLISH_MAPPING_STRING} does not use this mechanism
+ * because changing it might break existing code. Mappings that don't
contain
+ * a silent marker code are treated as though H and W are silent.
+ * <p>
+ * To override this, use the {@link #Soundex(String, boolean)} constructor.
+ * @since 1.11
+ */
+ public static final char SILENT_MARKER = '-';
+
+ /**
* This is a default mapping of the 26 letters used in US English. A value
of <code>0</code> for a letter position
- * means do not encode.
+ * means do not encode, but treat as a separator when it occurs between
consonants with the same code.
* <p>
* (This constant is provided as both an implementation convenience and to
allow Javadoc to pick
* up the value for the constant values page.)
- * </p>
- *
+ * <p>
+ * <b>Note that letters H and W are treated specially.</b>
+ * They are ignored (after the first letter) and don't act as separators
+ * between consonants with the same code.
* @see #US_ENGLISH_MAPPING
*/
+ //
ABCDEFGHIJKLMNOPQRSTUVWXYZ
public static final String US_ENGLISH_MAPPING_STRING =
"01230120022455012623010202";
/**
@@ -53,12 +69,45 @@ public class Soundex implements StringEn
/**
* An instance of Soundex using the US_ENGLISH_MAPPING mapping.
+ * This treats H and W as silent letters.
+ * Apart from when they appear as the first letter, they are ignored.
+ * They don't act as separators between duplicate codes.
*
* @see #US_ENGLISH_MAPPING
+ * @see #US_ENGLISH_MAPPING_STRING
*/
public static final Soundex US_ENGLISH = new Soundex();
/**
+ * An instance of Soundex using the Simplified Soundex mapping, as
described here:
+ * http://west-penwith.org.uk/misc/soundex.htm
+ * <p>
+ * This treats H and W the same as vowels (AEIOUY).
+ * Such letters aren't encoded (after the first), but they do
+ * act as separators when dropping duplicate codes.
+ * The mapping is otherwise the same as for {@link #US_ENGLISH}.
+ * <p>
+ * @since 1.11
+ */
+ public static final Soundex US_ENGLISH_SIMPLIFIED = new
Soundex(US_ENGLISH_MAPPING_STRING, false);
+
+ /**
+ * An instance of Soundex using the mapping as per the Genealogy site:
+ * http://www.genealogy.com/articles/research/00000060.html
+ * <p>
+ * This treats vowels (AEIOUY), H and W as silent letters.
+ * Such letters are ignored (after the first) and do not
+ * act as separators when dropping duplicate codes.
+ * <p>
+ * The codes for consonants are otherwise the same as for
+ * {@link #US_ENGLISH_MAPPING_STRING} and {@link #US_ENGLISH_SIMPLIFIED}
+ *
+ * @since 1.11
+ */
+ public static final Soundex US_ENGLISH_GENEALOGY = new
Soundex("-123-12--22455-12623-1-2-2");
+ //
ABCDEFGHIJKLMNOPQRSTUVWXYZ
+
+ /**
* The maximum length of a Soundex code - Soundex codes are only four
characters by definition.
*
* @deprecated This feature is not needed since the encoding size must be
constant. Will be removed in 2.0.
@@ -73,6 +122,15 @@ public class Soundex implements StringEn
private final char[] soundexMapping;
/**
+ * Should H and W be treated specially?
+ * <p>
+ * In versions of the code prior to 1.11,
+ * the code always treated H and W as silent (ignored) letters.
+ * If this field is false, H and W are no longer special-cased.
+ */
+ private final boolean specialCaseHW;
+
+ /**
* Creates an instance using US_ENGLISH_MAPPING
*
* @see Soundex#Soundex(char[])
@@ -80,6 +138,7 @@ public class Soundex implements StringEn
*/
public Soundex() {
this.soundexMapping = US_ENGLISH_MAPPING;
+ this.specialCaseHW = true;
}
/**
@@ -88,6 +147,8 @@ public class Soundex implements StringEn
*
* Every letter of the alphabet is "mapped" to a numerical value. This
char array holds the values to which each
* letter is mapped. This implementation contains a default map for
US_ENGLISH
+ * <p>
+ * If the mapping contains an instance of {@link #SILENT_MARKER} then H
and W are not given special treatment.
*
* @param mapping
* Mapping array to use when finding the corresponding
code for a given character
@@ -95,11 +156,23 @@ public class Soundex implements StringEn
public Soundex(final char[] mapping) {
this.soundexMapping = new char[mapping.length];
System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length);
+ this.specialCaseHW = !hasMarker(this.soundexMapping);
+ }
+
+ private boolean hasMarker(char[] mapping) {
+ for(char ch : mapping) {
+ if (ch == SILENT_MARKER) {
+ return true;
+ }
+ }
+ return false;
}
/**
* Creates a refined soundex instance using a custom mapping. This
constructor can be used to customize the mapping,
* and/or possibly provide an internationalized mapping for a non-Western
character set.
+ * <p>
+ * If the mapping contains an instance of {@link #SILENT_MARKER} then H
and W are not given special treatment.
*
* @param mapping
* Mapping string to use when finding the corresponding code
for a given character
@@ -107,6 +180,21 @@ public class Soundex implements StringEn
*/
public Soundex(final String mapping) {
this.soundexMapping = mapping.toCharArray();
+ this.specialCaseHW = !hasMarker(this.soundexMapping);
+ }
+
+ /**
+ * Creates a refined soundex instance using a custom mapping. This
constructor can be used to customize the mapping,
+ * and/or possibly provide an internationalized mapping for a non-Western
character set.
+ *
+ * @param mapping
+ * Mapping string to use when finding the corresponding code
for a given character
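+ * @param specialCaseHW if true, then H and W are treated as silent letters and ignored (after the first letter); if false, they are not special-cased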
+ * @since 1.11
+ */
+ public Soundex(final String mapping, boolean specialCaseHW) {
+ this.soundexMapping = mapping.toCharArray();
+ this.specialCaseHW = specialCaseHW;
}
/**
@@ -190,7 +278,7 @@ public class Soundex implements StringEn
private char map(final char ch) {
final int index = ch - 'A';
if (index < 0 || index >= this.soundexMapping.length) {
- throw new IllegalArgumentException("The character is not mapped: "
+ ch);
+ throw new IllegalArgumentException("The character is not mapped: "
+ ch + " (index=" + index + ")");
}
return this.soundexMapping[index];
}
@@ -231,10 +319,13 @@ public class Soundex implements StringEn
char lastDigit = map(first); // previous digit
for(int i = 1; i < str.length() && count < out.length ; i++) {
char ch = str.charAt(i);
- if (ch == 'H' || ch == 'W') { // these are ignored completely
+ if ((this.specialCaseHW) && (ch == 'H' || ch == 'W')) { // these
are ignored completely
continue;
}
char digit = map(ch);
+ if (digit == SILENT_MARKER) {
+ continue;
+ }
if (digit != '0' && digit != lastDigit) { // don't store vowels or
repeats
out[count++] = digit;
}
Modified:
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/SoundexTest.java
URL:
http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/SoundexTest.java?rev=1789911&r1=1789910&r2=1789911&view=diff
==============================================================================
---
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/SoundexTest.java
(original)
+++
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/SoundexTest.java
Sun Apr 2 20:41:29 2017
@@ -403,4 +403,33 @@ public class SoundexTest extends StringE
Assert.assertEquals("T522", this.getStringEncoder().encode("Tymczak"));
Assert.assertEquals("P236", this.getStringEncoder().encode("Pfister"));
}
+
+ @Test
+// examples and algorithm rules from:
http://www.genealogy.com/articles/research/00000060.html
+ public void testGenealogy() { // treat vowels and HW as silent
+ Soundex s = Soundex.US_ENGLISH_GENEALOGY;
+ Assert.assertEquals("H251", s.encode("Heggenburger"));
+ Assert.assertEquals("B425", s.encode("Blackman"));
+ Assert.assertEquals("S530", s.encode("Schmidt"));
+ Assert.assertEquals("L150", s.encode("Lippmann"));
+ // Additional local example
+ Assert.assertEquals("D200", s.encode("Dodds")); // 'o' is not a
separator here - it is silent
+ Assert.assertEquals("D200", s.encode("Dhdds")); // 'h' is silent
+ Assert.assertEquals("D200", s.encode("Dwdds")); // 'w' is silent
+ }
+
+ @Test
+// examples and algorithm rules from:
http://west-penwith.org.uk/misc/soundex.htm
+ public void testSimplifiedSoundex() { // treat vowels and HW as separators
+ Soundex s = Soundex.US_ENGLISH_SIMPLIFIED;
+ Assert.assertEquals("W452", s.encode("WILLIAMS"));
+ Assert.assertEquals("B625", s.encode("BARAGWANATH"));
+ Assert.assertEquals("D540", s.encode("DONNELL"));
+ Assert.assertEquals("L300", s.encode("LLOYD"));
+ Assert.assertEquals("W422", s.encode("WOOLCOCK"));
+ // Additional local examples
+ Assert.assertEquals("D320", s.encode("Dodds"));
+ Assert.assertEquals("D320", s.encode("Dwdds")); // w is a separator
+ Assert.assertEquals("D320", s.encode("Dhdds")); // h is a separator
+ }
}