ggregory 2003/11/06 08:31:47
Modified: codec/src/test/org/apache/commons/codec/language
SoundexTest.java
codec/src/java/org/apache/commons/codec/language
Soundex.java
Added: codec/src/test/org/apache/commons/codec/language
AllTests.java
Log:
Soundex encoding bugs.
http://issues.apache.org/bugzilla/show_bug.cgi?id=24471
Revision Changes Path
1.6 +250 -98
jakarta-commons/codec/src/test/org/apache/commons/codec/language/SoundexTest.java
Index: SoundexTest.java
===================================================================
RCS file:
/home/cvs/jakarta-commons/codec/src/test/org/apache/commons/codec/language/SoundexTest.java,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -r1.5 -r1.6
--- SoundexTest.java 4 Nov 2003 02:43:09 -0000 1.5
+++ SoundexTest.java 6 Nov 2003 16:31:47 -0000 1.6
@@ -2,68 +2,57 @@
* ====================================================================
*
* The Apache Software License, Version 1.1
- *
- * Copyright (c) 2001-2003 The Apache Software Foundation. All rights
- * reserved.
- *
+ *
+ * Copyright (c) 2001-2003 The Apache Software Foundation. All rights reserved.
+ *
* Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgement:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgement may appear in the software itself,
- * if and wherever such third-party acknowledgements normally appear.
- *
- * 4. The names "Apache", "The Jakarta Project", "Commons", and "Apache Software
- * Foundation" must not be used to endorse or promote products derived
- * from this software without prior written permission. For written
- * permission, please contact [EMAIL PROTECTED]
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache" nor may "Apache" appear in their name without prior
- * written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * modification, are permitted provided that the following conditions are met: 1.
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer. 2. Redistributions in
+ * binary form must reproduce the above copyright notice, this list of
+ * conditions and the following disclaimer in the documentation and/or other
+ * materials provided with the distribution. 3. The end-user documentation
+ * included with the redistribution, if any, must include the following
+ * acknowledgement: "This product includes software developed by the Apache
+ * Software Foundation (http://www.apache.org/)." Alternately, this
+ * acknowledgement may appear in the software itself, if and wherever such
+ * third-party acknowledgements normally appear. 4. The names "Apache", "The
+ * Jakarta Project", "Commons", and "Apache Software Foundation" must not be
+ * used to endorse or promote products derived from this software without prior
+ * written permission. For written permission, please contact
+ * [EMAIL PROTECTED] 5. Products derived from this software may not be called
+ * "Apache", "Apache" nor may "Apache" appear in their name without prior
+ * written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * APACHE SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- *
- */
+ *
+ * This software consists of voluntary contributions made by many individuals
+ * on behalf of the Apache Software Foundation. For more information on the
+ * Apache Software Foundation, please see <http://www.apache.org/> .
+ *
+ */
+// (FYI: Formatted and sorted with Eclipse)
package org.apache.commons.codec.language;
import junit.framework.Test;
import junit.framework.TestSuite;
-
import org.apache.commons.codec.StringEncoder;
import org.apache.commons.codec.StringEncoderAbstractTest;
/**
+ * Tests {@link Soundex}
+ *
* @version $Revision$ $Date$
* @author Rodney Waldhoff
* @author Gary Gregory
@@ -74,16 +63,17 @@
return (new TestSuite(SoundexTest.class));
}
- private Soundex _encoder = null;
+ private Soundex encoder = null;
public SoundexTest(String name) {
super(name);
}
+
/**
- * @return Returns the _encoder.
- */
+ * @return Returns the encoder.
+ */
public Soundex getEncoder() {
- return this._encoder;
+ return this.encoder;
}
protected StringEncoder makeEncoder() {
@@ -91,13 +81,14 @@
}
/**
- * @param _encoder The _encoder to set.
- */
+ * @param encoder
+ * The encoder to set.
+ */
public void setEncoder(Soundex encoder) {
- this._encoder = encoder;
+ this.encoder = encoder;
}
- public void setUp() throws Exception {
+ public void setUp() throws Exception {
super.setUp();
this.setEncoder(new Soundex());
}
@@ -107,51 +98,212 @@
this.setEncoder(null);
}
- // ------------------------------------------------------------------------
+ void encodeAll(String[] strings, String expectedEncoding) {
+ for (int i = 0; i < strings.length; i++) {
+ assertEquals(expectedEncoding, this.getEncoder().encode(strings[i]));
+ }
+ }
+
+ public void testB650() {
+ this.encodeAll(
+ new String[] {
+ "BARHAM",
+ "BARONE",
+ "BARRON",
+ "BERNA",
+ "BIRNEY",
+ "BIRNIE",
+ "BOOROM",
+ "BOREN",
+ "BORN",
+ "BOURN",
+ "BOURNE",
+ "BOWRON",
+ "BRAIN",
+ "BRAME",
+ "BRANN",
+ "BRAUN",
+ "BREEN",
+ "BRIEN",
+ "BRIM",
+ "BRIMM",
+ "BRINN",
+ "BRION",
+ "BROOM",
+ "BROOME",
+ "BROWN",
+ "BROWNE",
+ "BRUEN",
+ "BRUHN",
+ "BRUIN",
+ "BRUMM",
+ "BRUN",
+ "BRUNO",
+ "BRYAN",
+ "BURIAN",
+ "BURN",
+ "BURNEY",
+ "BYRAM",
+ "BYRNE",
+ "BYRON",
+ "BYRUM" },
+ "B650");
+ }
- public void testEncode() throws Exception {
- assertEquals("T235",this.getEncoder().encode("testing"));
- assertEquals("T000",this.getEncoder().encode("The"));
- assertEquals("Q200",this.getEncoder().encode("quick"));
- assertEquals("B650",this.getEncoder().encode("brown"));
- assertEquals("F200",this.getEncoder().encode("fox"));
- assertEquals("J513",this.getEncoder().encode("jumped"));
- assertEquals("O160",this.getEncoder().encode("over"));
- assertEquals("T000",this.getEncoder().encode("the"));
- assertEquals("L200",this.getEncoder().encode("lazy"));
- assertEquals("D200",this.getEncoder().encode("dogs"));
+ public void testEncodeBasic() {
+ assertEquals("T235", this.getEncoder().encode("testing"));
+ assertEquals("T000", this.getEncoder().encode("The"));
+ assertEquals("Q200", this.getEncoder().encode("quick"));
+ assertEquals("B650", this.getEncoder().encode("brown"));
+ assertEquals("F200", this.getEncoder().encode("fox"));
+ assertEquals("J513", this.getEncoder().encode("jumped"));
+ assertEquals("O160", this.getEncoder().encode("over"));
+ assertEquals("T000", this.getEncoder().encode("the"));
+ assertEquals("L200", this.getEncoder().encode("lazy"));
+ assertEquals("D200", this.getEncoder().encode("dogs"));
}
/**
- * Examples from
- * http://www.bradandkathy.com/genealogy/overviewofsoundex.html
- */
- public void testEncode2() throws Exception {
- assertEquals("A462",this.getEncoder().encode("Allricht"));
- assertEquals("E166",this.getEncoder().encode("Eberhard"));
- assertEquals("E521",this.getEncoder().encode("Engebrethson"));
- assertEquals("H512",this.getEncoder().encode("Heimbach"));
- assertEquals("H524",this.getEncoder().encode("Hanselmann"));
- assertEquals("H431",this.getEncoder().encode("Hildebrand"));
- assertEquals("K152",this.getEncoder().encode("Kavanagh"));
- assertEquals("L530",this.getEncoder().encode("Lind, Van"));
- assertEquals("L222",this.getEncoder().encode("Lukaschowsky"));
- assertEquals("M235",this.getEncoder().encode("McDonnell"));
- assertEquals("M200",this.getEncoder().encode("McGee"));
- // Fix me?
- //assertEquals("O165",this.getEncoder().encode("O'Brien"));
- assertEquals("O155",this.getEncoder().encode("Opnian"));
- assertEquals("O155",this.getEncoder().encode("Oppenheimer"));
- // Fix me?
- //assertEquals("S460",this.getEncoder().encode("Swhgler"));
- assertEquals("R355",this.getEncoder().encode("Riedemanas"));
- assertEquals("Z300",this.getEncoder().encode("Zita"));
- assertEquals("Z325",this.getEncoder().encode("Zitzmeinn"));
+ * Examples from
+ * http://www.bradandkathy.com/genealogy/overviewofsoundex.html
+ */
+ public void testEncodeBatch2() {
+ assertEquals("A462", this.getEncoder().encode("Allricht"));
+ assertEquals("E166", this.getEncoder().encode("Eberhard"));
+ assertEquals("E521", this.getEncoder().encode("Engebrethson"));
+ assertEquals("H512", this.getEncoder().encode("Heimbach"));
+ assertEquals("H524", this.getEncoder().encode("Hanselmann"));
+ assertEquals("H431", this.getEncoder().encode("Hildebrand"));
+ assertEquals("K152", this.getEncoder().encode("Kavanagh"));
+ assertEquals("L530", this.getEncoder().encode("Lind"));
+ assertEquals("L222", this.getEncoder().encode("Lukaschowsky"));
+ assertEquals("M235", this.getEncoder().encode("McDonnell"));
+ assertEquals("M200", this.getEncoder().encode("McGee"));
+ assertEquals("O155", this.getEncoder().encode("Opnian"));
+ assertEquals("O155", this.getEncoder().encode("Oppenheimer"));
+ assertEquals("R355", this.getEncoder().encode("Riedemanas"));
+ assertEquals("Z300", this.getEncoder().encode("Zita"));
+ assertEquals("Z325", this.getEncoder().encode("Zitzmeinn"));
}
-
+
+ /**
+ * Examples from
+ * http://www.archives.gov/research_room/genealogy/census/soundex.html
+ */
+ public void testEncodeBatch3() {
+ assertEquals("W252", this.getEncoder().encode("Washington"));
+ assertEquals("L000", this.getEncoder().encode("Lee"));
+ assertEquals("G362", this.getEncoder().encode("Gutierrez"));
+ assertEquals("P236", this.getEncoder().encode("Pfister"));
+ assertEquals("J250", this.getEncoder().encode("Jackson"));
+ assertEquals("T522", this.getEncoder().encode("Tymczak"));
+ // For VanDeusen: D-250 (D, 2 for the S, 5 for the N, 0 added) is also
+ // possible.
+ assertEquals("V532", this.getEncoder().encode("VanDeusen"));
+ }
+
+ /**
+ * Examples from: http://www.myatt.demon.co.uk/sxalg.htm
+ */
+ public void testEncodeBatch4() {
+ assertEquals("H452", this.getEncoder().encode("HOLMES"));
+ assertEquals("A355", this.getEncoder().encode("ADOMOMI"));
+ assertEquals("V536", this.getEncoder().encode("VONDERLEHR"));
+ assertEquals("B400", this.getEncoder().encode("BALL"));
+ assertEquals("S000", this.getEncoder().encode("SHAW"));
+ assertEquals("J250", this.getEncoder().encode("JACKSON"));
+ assertEquals("S545", this.getEncoder().encode("SCANLON"));
+ assertEquals("S532", this.getEncoder().encode("SAINTJOHN"));
+
+ }
+
+ public void testEncodeIgnoreApostrophes() {
+ this.encodeAll(new String[] { "OBrien", "'OBrien", "O'Brien", "OB'rien", "OBr'ien", "OBri'en", "OBrie'n", "OBrien'" }, "O165");
+ }
+
+ /**
+ * Test data from http://www.myatt.demon.co.uk/sxalg.htm
+ *
+ * @throws EncoderException
+ */
+ public void testEncodeIgnoreHyphens() {
+ this.encodeAll(
+ new String[] {
+ "KINGSMITH",
+ "-KINGSMITH",
+ "K-INGSMITH",
+ "KI-NGSMITH",
+ "KIN-GSMITH",
+ "KING-SMITH",
+ "KINGS-MITH",
+ "KINGSM-ITH",
+ "KINGSMI-TH",
+ "KINGSMIT-H",
+ "KINGSMITH-" },
+ "K525");
+ }
+
+ public void testEncodeIgnoreTrimmable() {
+ assertEquals("W252", this.getEncoder().encode(" \t\n\r Washington \t\n\r "));
+ }
+
+ /**
+ * Consonants from the same code group separated by W or H are treated as
+ * one.
+ */
+ public void testHWRuleEx1() {
+ // From
+ // http://www.archives.gov/research_room/genealogy/census/soundex.html:
+ // Ashcraft is coded A-261 (A, 2 for the S, C ignored, 6 for the R, 1
+ // for the F). It is not coded A-226.
+ assertEquals("A261", this.getEncoder().encode("Ashcraft"));
+ }
+
+ /**
+ * Consonants from the same code group separated by W or H are treated as
+ * one.
+ *
+ * Test data from http://www.myatt.demon.co.uk/sxalg.htm
+ */
+ public void testHWRuleEx2() {
+ assertEquals("B312", this.getEncoder().encode("BOOTHDAVIS"));
+ assertEquals("B312", this.getEncoder().encode("BOOTH-DAVIS"));
+ }
+
+ /**
+ * Consonants from the same code group separated by W or H are treated as
+ * one.
+ *
+ * Test data from http://www.myatt.demon.co.uk/sxalg.htm
+ */
+ public void testHWRuleEx3() {
+ assertEquals("S460", this.getEncoder().encode("Sgler"));
+ assertEquals("S460", this.getEncoder().encode("Swhgler"));
+ // Also S460:
+ this.encodeAll(
+ new String[] {
+ "SAILOR",
+ "SALYER",
+ "SAYLOR",
+ "SCHALLER",
+ "SCHELLER",
+ "SCHILLER",
+ "SCHOOLER",
+ "SCHULER",
+ "SCHUYLER",
+ "SEILER",
+ "SEYLER",
+ "SHOLAR",
+ "SHULER",
+ "SILAR",
+ "SILER",
+ "SILLER" },
+ "S460");
+ }
+
public void testMaxLength() throws Exception {
Soundex soundex = new Soundex();
- soundex.setMaxLength( soundex.getMaxLength() );
+ soundex.setMaxLength(soundex.getMaxLength());
}
}
1.1
jakarta-commons/codec/src/test/org/apache/commons/codec/language/AllTests.java
Index: AllTests.java
===================================================================
/*
* Copyright (C) 1993-2003 SEAGULL
*
* AllTests.java
* Created on Nov 5, 2003, 8:25:55 PM
*
*/
package org.apache.commons.codec.language;
import junit.framework.Test;
import junit.framework.TestSuite;
/**
* Tests all test cases in this package.
*
* @author <a href="mailto:[EMAIL PROTECTED]">Gary Gregory</a>
* @version $Id: AllTests.java,v 1.1 2003/11/06 16:31:47 ggregory Exp $
*/
public class AllTests {
public static Test suite() {
TestSuite suite = new TestSuite("Test for
org.apache.commons.codec.language");
//$JUnit-BEGIN$
suite.addTest(MetaphoneTest.suite());
suite.addTest(SoundexTest.suite());
suite.addTest(RefinedSoundexTest.suite());
suite.addTest(DoubleMetaphoneTest.suite());
//$JUnit-END$
return suite;
}
}
1.11 +173 -126
jakarta-commons/codec/src/java/org/apache/commons/codec/language/Soundex.java
Index: Soundex.java
===================================================================
RCS file:
/home/cvs/jakarta-commons/codec/src/java/org/apache/commons/codec/language/Soundex.java,v
retrieving revision 1.10
retrieving revision 1.11
diff -u -r1.10 -r1.11
--- Soundex.java 4 Nov 2003 02:43:09 -0000 1.10
+++ Soundex.java 6 Nov 2003 16:31:47 -0000 1.11
@@ -2,58 +2,45 @@
* ====================================================================
*
* The Apache Software License, Version 1.1
- *
- * Copyright (c) 2001-2003 The Apache Software Foundation. All rights
- * reserved.
- *
+ *
+ * Copyright (c) 2001-2003 The Apache Software Foundation. All rights reserved.
+ *
* Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgement:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgement may appear in the software itself,
- * if and wherever such third-party acknowledgements normally appear.
- *
- * 4. The names "Apache", "The Jakarta Project", "Commons", and "Apache Software
- * Foundation" must not be used to endorse or promote products derived
- * from this software without prior written permission. For written
- * permission, please contact [EMAIL PROTECTED]
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache" nor may "Apache" appear in their name without prior
- * written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * modification, are permitted provided that the following conditions are met: 1.
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer. 2. Redistributions in
+ * binary form must reproduce the above copyright notice, this list of
+ * conditions and the following disclaimer in the documentation and/or other
+ * materials provided with the distribution. 3. The end-user documentation
+ * included with the redistribution, if any, must include the following
+ * acknowledgement: "This product includes software developed by the Apache
+ * Software Foundation (http://www.apache.org/)." Alternately, this
+ * acknowledgement may appear in the software itself, if and wherever such
+ * third-party acknowledgements normally appear. 4. The names "Apache", "The
+ * Jakarta Project", "Commons", and "Apache Software Foundation" must not be
+ * used to endorse or promote products derived from this software without prior
+ * written permission. For written permission, please contact
+ * [EMAIL PROTECTED] 5. Products derived from this software may not be called
+ * "Apache", "Apache" nor may "Apache" appear in their name without prior
+ * written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * APACHE SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- *
- */
+ *
+ * This software consists of voluntary contributions made by many individuals
+ * on behalf of the Apache Software Foundation. For more information on the
+ * Apache Software Foundation, please see <http://www.apache.org/> .
+ *
+ */
package org.apache.commons.codec.language;
@@ -61,9 +48,9 @@
import org.apache.commons.codec.StringEncoder;
/**
- * Encodes a string into a soundex value. Soundex is an encoding used to
- * relate similar names, but can also be used as a general purpose
- * scheme to find word with similar phonemes.
+ * Encodes a string into a soundex value. Soundex is an encoding used to relate
+ * similar names, but can also be used as a general purpose scheme to find words
+ * with similar phonemes.
*
* @author [EMAIL PROTECTED]
* @author Tim O'Brien
@@ -73,71 +60,92 @@
public class Soundex implements StringEncoder {
/**
- * This static variable contains an instance of the
- * Soundex using the US_ENGLISH mapping.
- */
+ * This static variable contains an instance of the Soundex using the
+ * US_ENGLISH mapping.
+ */
public static final Soundex US_ENGLISH = new Soundex();
/**
- * This is a default mapping of the 26 letters used
- * in US english.
- */
- public static final char[] US_ENGLISH_MAPPING =
- "01230120022455012623010202".toCharArray();
+ * This is a default mapping of the 26 letters used in US english.
+ * A value of <code>0</code> for a letter position means do not encode.
+ */
+ public static final char[] US_ENGLISH_MAPPING = "01230120022455012623010202".toCharArray();
/**
- * The maximum length of a Soundex code - Soundex codes are
- * only four characters by definition.
- */
+ * The maximum length of a Soundex code - Soundex codes are only four
+ * characters by definition.
+ */
private int maxLength = 4;
-
+
/**
- * Every letter of the alphabet is "mapped" to a numerical
- * value. This char array holds the values to which each
- * letter is mapped. This implementation contains a default
- * map for US_ENGLISH
- */
+ * Every letter of the alphabet is "mapped" to a numerical value. This char
+ * array holds the values to which each letter is mapped. This
+ * implementation contains a default map for US_ENGLISH
+ */
private char[] soundexMapping;
/**
- * Creates an instance of the Soundex object using the default
- * US_ENGLISH mapping.
- */
+ * Creates an instance of the Soundex object using the default US_ENGLISH
+ * mapping.
+ */
public Soundex() {
this(US_ENGLISH_MAPPING);
}
/**
- * Creates a soundex instance using a custom mapping. This
- * constructor can be used to customize the mapping, and/or possibly
- * provide an internationalized mapping for a non-Western character
- * set.
- *
- * @param mapping Mapping array to use when finding the corresponding
- * code for a given character
- */
+ * Creates a soundex instance using a custom mapping. This constructor can
+ * be used to customize the mapping, and/or possibly provide an
+ * internationalized mapping for a non-Western character set.
+ *
+ * @param mapping
+ * Mapping array to use when finding the corresponding code for
+ * a given character
+ */
public Soundex(char[] mapping) {
this.setSoundexMapping(mapping);
}
/**
- * Encodes an Object using the soundex algorithm. This method
- * is provided in order to satisfy the requirements of the
- * Encoder interface, and will throw an EncoderException if the
- * supplied object is not of type java.lang.String.
- *
- * @param pObject Object to encode
- * @return An object (or type java.lang.String) containing the
- * soundex code which corresponds to the String supplied.
- * @throws EncoderException if the parameter supplied is not
- * of type java.lang.String
- */
+ * Cleans up the input string before Soundex processing by trimming and
+ * removing punctuation characters. The string is returned in upper-case.
+ */
+ private String clean(String str) {
+ if (str == null || str.length() == 0) {
+ return str;
+ }
+ int len = str.length();
+ char[] chars = new char[len];
+ int count = 0;
+ for (int i = 0; i < len; i++) {
+ if (Character.isLetter(str.charAt(i))) {
+ chars[count++] = str.charAt(i);
+ }
+ }
+ if (count == len) {
+ return str.toUpperCase();
+ }
+ return new String(chars, 0, count).toUpperCase();
+ }
+
+ /**
+ * Encodes an Object using the soundex algorithm. This method is provided
+ * in order to satisfy the requirements of the Encoder interface, and will
+ * throw an EncoderException if the supplied object is not of type
+ * java.lang.String.
+ *
+ * @param pObject
+ * Object to encode
+ * @return An object (of type java.lang.String) containing the soundex code
+ * which corresponds to the String supplied.
+ * @throws EncoderException
+ * if the parameter supplied is not of type java.lang.String
+ */
public Object encode(Object pObject) throws EncoderException {
Object result;
if (!(pObject instanceof java.lang.String)) {
- throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
+ throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
} else {
result = soundex((String) pObject);
}
@@ -147,79 +155,118 @@
}
/**
- * Encodes a String using the soundex algorithm.
- *
- * @param pString A String object to encode
- * @return A Soundex code corresponding to the String supplied
- */
+ * Encodes a String using the soundex algorithm.
+ *
+ * @param pString
+ * A String object to encode
+ * @return A Soundex code corresponding to the String supplied
+ */
public String encode(String pString) {
- return (soundex(pString));
+ return soundex(pString);
}
/**
- * Used internally by the SoundEx algorithm.
- *
- * @param c character to use to retrieve mapping code
- * @return Mapping code for a particular character
- */
- private char getMappingCode(char c) {
+ * Used internally by the SoundEx algorithm.
+ *
+ * Consonants from the same code group separated by W or H are treated as one.
+ *
+ * @param str
+ * the whole string
+ * @param index
+ * the character position to encode
+ * @return Mapping code for a particular character
+ */
+ private char getMappingCode(String str, int index) {
+ char c = str.charAt(index);
if (!Character.isLetter(c)) {
return 0;
} else {
- return this.getSoundexMapping()[Character.toUpperCase(c) - 'A'];
+ char mappedChar = this.map(c);
+ // HW rule check
+ if (index > 1 && mappedChar != '0') {
+ char hwChar = str.charAt(index-1);
+ if ('H' == hwChar || 'W' == hwChar) {
+ char preHWChar = str.charAt(index - 2);
+ char firstCode = this.map(preHWChar);
+ if (firstCode == mappedChar || 'H' == preHWChar || 'W' == preHWChar) {
+ return 0;
+ }
+ }
+ }
+ return mappedChar;
}
}
/**
- * Returns the maxLength. Standard Soundex
- * @return int
- */
+ * Returns the maxLength. Standard Soundex
+ *
+ * @return int
+ */
public int getMaxLength() {
return this.maxLength;
}
/**
- * @return Returns the soundexMapping.
- */
+ * @return Returns the soundexMapping.
+ */
private char[] getSoundexMapping() {
return this.soundexMapping;
}
/**
- * Sets the maxLength.
- * @param maxLength The maxLength to set
+ * Maps the given upper-case character to its Soundex code.
*/
+ private char map(char c) {
+ return this.getSoundexMapping()[c - 'A'];
+ }
+
+ /**
+ * Sets the maxLength.
+ *
+ * @param maxLength
+ * The maxLength to set
+ */
public void setMaxLength(int maxLength) {
this.maxLength = maxLength;
}
/**
- * @param soundexMapping The soundexMapping to set.
- */
+ * @param soundexMapping
+ * The soundexMapping to set.
+ */
private void setSoundexMapping(char[] soundexMapping) {
this.soundexMapping = soundexMapping;
}
/**
- * Retreives the Soundex code for a given String object.
- *
- * @param str String to encode using the Soundex algorithm
- * @return A soundex code for the String supplied
- */
+ * Retrieves the Soundex code for a given String object.
+ *
+ * @param str
+ * String to encode using the Soundex algorithm
+ * @return A soundex code for the String supplied
+ */
public String soundex(String str) {
- if (null == str || str.length() == 0) { return str; }
-
+ if (str == null) {
+ return null;
+ }
+ str = this.clean(str);
+ if (str.length() == 0) {
+ return str;
+ }
+
char out[] = { '0', '0', '0', '0' };
char last, mapped;
int incount = 1, count = 1;
- out[0] = Character.toUpperCase(str.charAt(0));
- last = getMappingCode(str.charAt(0));
- while ((incount < str.length()) && (mapped = getMappingCode(str.charAt(incount++))) != 0 && (count < this.getMaxLength())) {
+ out[0] = str.charAt(0);
+ last = getMappingCode(str, 0);
+ while ((incount < str.length()) && (count < this.getMaxLength())) {
+ if ((mapped = getMappingCode(str, incount++)) != 0) {
if ((mapped != '0') && (mapped != last)) {
out[count++] = mapped;
}
last = mapped;
}
+ }
return new String(out);
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]