ggregory 2003/12/09 16:04:46
Modified: codec/src/test/org/apache/commons/codec/language
SoundexTest.java
codec RELEASE-NOTES.txt default.properties
codec/src/java/org/apache/commons/codec/language
Soundex.java
Log:
Added an implementation of the Soundex DIFFERENCE algorithm, a feature requested in
http://nagoya.apache.org/bugzilla/show_bug.cgi?id=25243.
Note that this commit only covers adding the difference API to the Soundex class.
See the ticket for comments.
Revision Changes Path
1.10 +27 -8 jakarta-commons/codec/src/test/org/apache/commons/codec/language/SoundexTest.java
Index: SoundexTest.java
===================================================================
RCS file: /home/cvs/jakarta-commons/codec/src/test/org/apache/commons/codec/language/SoundexTest.java,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -r1.9 -r1.10
--- SoundexTest.java 4 Dec 2003 23:32:39 -0000 1.9
+++ SoundexTest.java 10 Dec 2003 00:04:46 -0000 1.10
@@ -60,6 +60,7 @@
import junit.framework.Test;
import junit.framework.TestSuite;
+import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;
import org.apache.commons.codec.StringEncoderAbstractTest;
@@ -81,6 +82,12 @@
public SoundexTest(String name) {
super(name);
}
+
+ void encodeAll(String[] strings, String expectedEncoding) {
+ for (int i = 0; i < strings.length; i++) {
+ assertEquals(expectedEncoding, this.getEncoder().encode(strings[i]));
+ }
+ }
/**
* @return Returns the _encoder.
@@ -111,12 +118,6 @@
this.setEncoder(null);
}
- void encodeAll(String[] strings, String expectedEncoding) {
- for (int i = 0; i < strings.length; i++) {
- assertEquals(expectedEncoding, this.getEncoder().encode(strings[i]));
- }
- }
-
public void testB650() {
this.encodeAll(
new String[] {
@@ -163,6 +164,25 @@
"B650");
}
+ public void testDifference() throws EncoderException {
+ // Edge cases
+ assertEquals(this.getEncoder().difference(null, null), 0);
+ assertEquals(this.getEncoder().difference("", ""), 0);
+ assertEquals(this.getEncoder().difference(" ", " "), 0);
+ // Normal cases
+ assertEquals(this.getEncoder().difference("Smith", "Smythe"), 4);
+ assertEquals(this.getEncoder().difference("Ann", "Andrew"), 2);
+ assertEquals(this.getEncoder().difference("Margaret", "Andrew"), 1);
+ assertEquals(this.getEncoder().difference("Janet", "Margaret"), 0);
+ // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp
+ assertEquals(this.getEncoder().difference("Green", "Greene"), 4);
+ assertEquals(this.getEncoder().difference("Blotchet-Halls", "Greene"), 0);
+ // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp
+ assertEquals(this.getEncoder().difference("Smith", "Smythe"), 4);
+ assertEquals(this.getEncoder().difference("Smithers", "Smythers"), 4);
+ assertEquals(this.getEncoder().difference("Anothers", "Brothers"), 2);
+ }
+
public void testEncodeBasic() {
assertEquals("T235", this.getEncoder().encode("testing"));
assertEquals("T000", this.getEncoder().encode("The"));
@@ -342,7 +362,6 @@
public void testMsSqlServer2() {
this.encodeAll(new String[]{"Erickson", "Erickson", "Erikson", "Ericson",
"Ericksen", "Ericsen"}, "E625");
}
-
/**
* Examples for MS SQLServer from
* http://databases.about.com/library/weekly/aa042901a.htm
1.13 +7 -26 jakarta-commons/codec/RELEASE-NOTES.txt
Index: RELEASE-NOTES.txt
===================================================================
RCS file: /home/cvs/jakarta-commons/codec/RELEASE-NOTES.txt,v
retrieving revision 1.12
retrieving revision 1.13
diff -u -r1.12 -r1.13
--- RELEASE-NOTES.txt 25 Nov 2003 05:03:29 -0000 1.12
+++ RELEASE-NOTES.txt 10 Dec 2003 00:04:46 -0000 1.13
@@ -1,7 +1,7 @@
$Id$
Commons Codec Package
- Version 1.2
+ Version 1.3-dev
Release Notes
@@ -9,34 +9,15 @@
This document contains the release notes for this version of the Commons
Codec package, and highlights changes since the previous version. Version
-1.2 contains bug fixes for the 1.1 release.
+1.3 contains bug fixes for the 1.2 release.
NEW FEATURES:
-* URLCodec - Implements the 'www-form-urlencoded' encoding scheme
-
-* DigestUtils - Simple utility class that provides static convenience
- methods for calculating md5 and hex digests.
-
+* Soundex - Implemented the DIFFERENCE algorithm.
BUG FIXES:
-* Fix for Bug 19860: Modified Base64 to remedy non-compliance with RFC
- 2045. Non-Base64 characters were not being discarded during the
- decode. RFC 2045 explicitly states that all characters outside of the
- base64 alphabet are to be ignored.
-
-* Fix for Bug 24360: Hex.decode(Object) throws a ClassCastException
- when a String argument is passed in.
-
-* Fix for Bug 24471: Soundex: The HW rule is not applied; hyphens and
- apostrophes are not ignored.
-
-* Fix for Bug 24484: Soundex.setMaxLength causes bugs and is not needed.
- Calling Soundex.setMaxLength() with a value of 2 or less causes the wrong
- answer to be returned. Since the encoding returned by Soundex is always
- of length 4 by definition (we do not use the '-' in as a letter-nnn
- separator) the need for a maxLength attribute is not needed. Deprecate
- the field and accessor methods.
+* Fix for Bug XXXX: ZZZZ
+
+* Fixes for missing tags in Javadoc comments.
-* Fix in Metaphone relating to the handling of the maximum code length.
1.9 +2 -2 jakarta-commons/codec/default.properties
Index: default.properties
===================================================================
RCS file: /home/cvs/jakarta-commons/codec/default.properties,v
retrieving revision 1.8
retrieving revision 1.9
diff -u -r1.8 -r1.9
--- default.properties 26 Nov 2003 03:18:28 -0000 1.8
+++ default.properties 10 Dec 2003 00:04:46 -0000 1.9
@@ -13,7 +13,7 @@
component.title = Encode/Decode Utilities
# The current version number of this component
-component.version = 1.2
+component.version = 1.3-dev
# The name that is used to create the jar file
final.name = ${component.name}-${component.version}
1.14 +136 -57 jakarta-commons/codec/src/java/org/apache/commons/codec/language/Soundex.java
Index: Soundex.java
===================================================================
RCS file: /home/cvs/jakarta-commons/codec/src/java/org/apache/commons/codec/language/Soundex.java,v
retrieving revision 1.13
retrieving revision 1.14
diff -u -r1.13 -r1.14
--- Soundex.java 12 Nov 2003 19:02:57 -0000 1.13
+++ Soundex.java 10 Dec 2003 00:04:46 -0000 1.14
@@ -2,58 +2,45 @@
* ====================================================================
*
* The Apache Software License, Version 1.1
- *
- * Copyright (c) 2001-2003 The Apache Software Foundation. All rights
- * reserved.
- *
+ *
+ * Copyright (c) 2001-2003 The Apache Software Foundation. All rights reserved.
+ *
* Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgement:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgement may appear in the software itself,
- * if and wherever such third-party acknowledgements normally appear.
- *
- * 4. The names "Apache", "The Jakarta Project", "Commons", and "Apache Software
- * Foundation" must not be used to endorse or promote products derived
- * from this software without prior written permission. For written
- * permission, please contact [EMAIL PROTECTED]
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache" nor may "Apache" appear in their name without prior
- * written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * modification, are permitted provided that the following conditions are met: 1.
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer. 2. Redistributions in
+ * binary form must reproduce the above copyright notice, this list of
+ * conditions and the following disclaimer in the documentation and/or other
+ * materials provided with the distribution. 3. The end-user documentation
+ * included with the redistribution, if any, must include the following
+ * acknowledgement: "This product includes software developed by the Apache
+ * Software Foundation (http://www.apache.org/)." Alternately, this
+ * acknowledgement may appear in the software itself, if and wherever such
+ * third-party acknowledgements normally appear. 4. The names "Apache", "The
+ * Jakarta Project", "Commons", and "Apache Software Foundation" must not be
+ * used to endorse or promote products derived from this software without prior
+ * written permission. For written permission, please contact
+ * [EMAIL PROTECTED] 5. Products derived from this software may not be called
+ * "Apache", "Apache" nor may "Apache" appear in their name without prior
+ * written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * APACHE SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- *
- */
+ *
+ * This software consists of voluntary contributions made by many individuals
+ * on behalf of the Apache Software Foundation. For more information on the
+ * Apache Software Foundation, please see <http://www.apache.org/> .
+ *
+ */
package org.apache.commons.codec.language;
@@ -85,10 +72,92 @@
public static final char[] US_ENGLISH_MAPPING =
"01230120022455012623010202".toCharArray();
/**
+ * Returns the difference between the Soundex values of two Strings. For
+ * Soundex, this return value ranges from 0 through 4: 0 indicates little or
+ * no similarity, and 4 indicates strong similarity or identical values.
+ *
+ * @param s1
+ * A String.
+ * @param s2
+ * A String.
+ * @return The return value ranges from 0 through 4: 0 indicates little or
+ * no similarity, and 4 indicates strong similarity or identical
+ * values.
+ *
+ * @see #difference(StringEncoder,String,String)
+ * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
+ * MS T-SQL DIFFERENCE</a>
+ *
+ * @throws EncoderException
+ * if an error occurs encoding one of the strings
+ */
+ public int difference(String s1, String s2) throws EncoderException {
+ return difference(this, s1, s2);
+ }
+
+ /**
+ * Returns the difference between the encoded values of two Strings. The
+ * higher the difference factor, the more similar the strings. For Soundex,
+ * this return value ranges from 0 through 4: 0 indicates little or no
+ * similarity, and 4 indicates strong similarity or identical values.
+ *
+ * @param encoder
+ * The encoder to use to encode the String parameters with.
+ * @param s1
+ * A String.
+ * @param s2
+ * A String.
+ * @return an integer from 0 to the length of the shorter string. The
+ * smaller the number, the more different the strings are.
+ *
+ * @see #differenceEncoded(String,String)
+ * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
+ * MS T-SQL DIFFERENCE</a>
+ *
+ * @throws EncoderException
+ * if an error occurs encoding one of the strings
+ */
+ public static int difference(StringEncoder encoder, String s1, String s2) throws EncoderException {
+ return differenceEncoded(encoder.encode(s1), encoder.encode(s2));
+ }
+
+ /**
+ * Returns the difference between the values of two encoded Strings. The
+ * higher the difference factor, the more similar the strings. For Soundex,
+ * this return value ranges from 0 through 4: 0 indicates little or no
+ * similarity, and 4 indicates strong similarity or identical values.
+ *
+ * @param es1
+ * An encoded String.
+ * @param es2
+ * An encoded String.
+ * @return an integer from 0 to the length of the shorter string. The
+ * smaller the number, the more different the strings are.
+ *
+ * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
+ * MS T-SQL DIFFERENCE</a>
+ */
+ public static int differenceEncoded(String es1, String es2) {
+
+ if (es1 == null || es2 == null) {
+ return 0;
+ }
+ int lengthToMatch = Math.min(es1.length(), es2.length());
+ int diff = 0;
+ for (int i = 0; i < lengthToMatch; i++) {
+ if (es1.charAt(i) == es2.charAt(i)) {
+ diff++;
+ }
+ }
+ return diff;
+ }
+
+ /**
* The maximum length of a Soundex code - Soundex codes are only four
* characters by definition.
- *
- * @deprecated This feature is not needed since the encoding size must be constant.
+ *
+ * @deprecated This feature is not needed since the encoding size must be
+ * constant.
*/
private int maxLength = 4;
@@ -123,6 +192,10 @@
/**
* Cleans up the input string before Soundex processing by only returning
* upper case letters.
+ *
+ * @param str
+ * The String to clean
+ * @return a clean String.
*/
private String clean(String str) {
if (str == null || str.length() == 0) {
@@ -211,7 +284,8 @@
/**
* Returns the maxLength. Standard Soundex
*
- * @deprecated This feature is not needed since the encoding size must be constant.
+ * @deprecated This feature is not needed since the encoding size must be
+ * constant.
* @return int
*/
public int getMaxLength() {
@@ -227,6 +301,10 @@
/**
* Maps the given upper-case character to it's Soudex code.
+ *
+ * @param c
+ * An upper-case character.
+ * @return A Soundex code.
*/
private char map(char c) {
return this.getSoundexMapping()[c - 'A'];
@@ -235,7 +313,8 @@
/**
* Sets the maxLength.
*
- * @deprecated This feature is not needed since the encoding size must be constant.
+ * @deprecated This feature is not needed since the encoding size must be
+ * constant.
* @param maxLength
* The maxLength to set
*/
@@ -266,7 +345,7 @@
if (str.length() == 0) {
return str;
}
- char out[] = { '0', '0', '0', '0' };
+ char out[] = {'0', '0', '0', '0'};
char last, mapped;
int incount = 1, count = 1;
out[0] = str.charAt(0);
@@ -283,4 +362,4 @@
return new String(out);
}
-}
+}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]