ggregory    2003/12/09 16:04:46

  Modified:    codec/src/test/org/apache/commons/codec/language
                        SoundexTest.java
               codec    RELEASE-NOTES.txt default.properties
               codec/src/java/org/apache/commons/codec/language
                        Soundex.java
  Log:
  Added an implementation of the Soundex DIFFERENCE algorithm, a feature requested in 
http://nagoya.apache.org/bugzilla/show_bug.cgi?id=25243.
  Note that this commit only covers adding the difference API to the Soundex class. 
See the ticket for comments.
  
  Revision  Changes    Path
  1.10      +27 -8     
jakarta-commons/codec/src/test/org/apache/commons/codec/language/SoundexTest.java
  
  Index: SoundexTest.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-commons/codec/src/test/org/apache/commons/codec/language/SoundexTest.java,v
  retrieving revision 1.9
  retrieving revision 1.10
  diff -u -r1.9 -r1.10
  --- SoundexTest.java  4 Dec 2003 23:32:39 -0000       1.9
  +++ SoundexTest.java  10 Dec 2003 00:04:46 -0000      1.10
  @@ -60,6 +60,7 @@
   
   import junit.framework.Test;
   import junit.framework.TestSuite;
  +import org.apache.commons.codec.EncoderException;
   import org.apache.commons.codec.StringEncoder;
   import org.apache.commons.codec.StringEncoderAbstractTest;
   
  @@ -81,6 +82,12 @@
       public SoundexTest(String name) {
           super(name);
       }
  +
  +    void encodeAll(String[] strings, String expectedEncoding) {
  +        for (int i = 0; i < strings.length; i++) {
  +            assertEquals(expectedEncoding, this.getEncoder().encode(strings[i]));
  +        }
  +    }
       
       /**
         * @return Returns the _encoder.
  @@ -111,12 +118,6 @@
           this.setEncoder(null);
       }
   
  -    void encodeAll(String[] strings, String expectedEncoding) {
  -        for (int i = 0; i < strings.length; i++) {
  -            assertEquals(expectedEncoding, this.getEncoder().encode(strings[i]));
  -        }
  -    }
  -
       public void testB650() {
           this.encodeAll(
               new String[] {
  @@ -163,6 +164,25 @@
               "B650");
       }
   
  +    public void testDifference() throws EncoderException {
  +        // Edge cases
  +        assertEquals(this.getEncoder().difference(null, null), 0);
  +        assertEquals(this.getEncoder().difference("", ""), 0);
  +        assertEquals(this.getEncoder().difference(" ", " "), 0);
  +        // Normal cases
  +        assertEquals(this.getEncoder().difference("Smith", "Smythe"), 4);
  +        assertEquals(this.getEncoder().difference("Ann", "Andrew"), 2);
  +        assertEquals(this.getEncoder().difference("Margaret", "Andrew"), 1);
  +        assertEquals(this.getEncoder().difference("Janet", "Margaret"), 0);
  +        // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp
  +        assertEquals(this.getEncoder().difference("Green", "Greene"), 4);
  +        assertEquals(this.getEncoder().difference("Blotchet-Halls", "Greene"), 0);
  +        // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp
  +        assertEquals(this.getEncoder().difference("Smith", "Smythe"), 4);
  +        assertEquals(this.getEncoder().difference("Smithers", "Smythers"), 4);
  +        assertEquals(this.getEncoder().difference("Anothers", "Brothers"), 2);      
  
  +    }
  +
       public void testEncodeBasic() {
           assertEquals("T235", this.getEncoder().encode("testing"));
           assertEquals("T000", this.getEncoder().encode("The"));
  @@ -342,7 +362,6 @@
       public void testMsSqlServer2() {
           this.encodeAll(new String[]{"Erickson", "Erickson", "Erikson", "Ericson", 
"Ericksen", "Ericsen"}, "E625");
       }
  -    
       /**
        * Examples for MS SQLServer from
        * http://databases.about.com/library/weekly/aa042901a.htm
  
  
  
  1.13      +7 -26     jakarta-commons/codec/RELEASE-NOTES.txt
  
  Index: RELEASE-NOTES.txt
  ===================================================================
  RCS file: /home/cvs/jakarta-commons/codec/RELEASE-NOTES.txt,v
  retrieving revision 1.12
  retrieving revision 1.13
  diff -u -r1.12 -r1.13
  --- RELEASE-NOTES.txt 25 Nov 2003 05:03:29 -0000      1.12
  +++ RELEASE-NOTES.txt 10 Dec 2003 00:04:46 -0000      1.13
  @@ -1,7 +1,7 @@
   $Id$
   
                        Commons Codec Package
  -                         Version 1.2
  +                         Version 1.3-dev
                            Release Notes
   
   
  @@ -9,34 +9,15 @@
   
   This document contains the release notes for this version of the Commons
   Codec package, and highlights changes since the previous version.  Version
  -1.2 contains bug fixes for the 1.1 release.
  +1.3 contains bug fixes for the 1.2 release.
   
   NEW FEATURES:
   
  -* URLCodec - Implements the 'www-form-urlencoded' encoding scheme
  -
  -* DigestUtils - Simple utility class that provides static convenience
  -  methods for calculating md5 and hex digests.
  -
  +* Soundex - Implemented the DIFFERENCE algorithm.
   
   BUG FIXES:
   
  -* Fix for Bug 19860: Modified Base64 to remedy non-compliance with RFC
  -  2045.  Non-Base64 characters were not being discarded during the
  -  decode.  RFC 2045 explicitly states that all characters outside of the
  -  base64 alphabet are to be ignored.  
  -
  -* Fix for Bug 24360: Hex.decode(Object) throws a ClassCastException 
  -  when a String argument is passed in.
  -  
  -* Fix for Bug 24471: Soundex: The HW rule is not applied; hyphens and 
  -  apostrophes are not ignored.
  -  
  -* Fix for Bug 24484: Soundex.setMaxLength causes bugs and is not needed.
  -  Calling Soundex.setMaxLength() with a value of 2 or less causes the wrong
  -  answer to be returned.  Since the encoding returned by Soundex is always
  -  of length 4 by definition (we do not use the '-' in as a letter-nnn 
  -  separator) the need for a maxLength attribute is not needed.  Deprecate 
  -  the field and accessor methods.
  +* Fix for Bug XXXX: ZZZZ
  +
  +* Fixes for missing tags in Javadoc comments.  
   
  -* Fix in Metaphone relating to the handling of the maximum code length.
  
  
  
  1.9       +2 -2      jakarta-commons/codec/default.properties
  
  Index: default.properties
  ===================================================================
  RCS file: /home/cvs/jakarta-commons/codec/default.properties,v
  retrieving revision 1.8
  retrieving revision 1.9
  diff -u -r1.8 -r1.9
  --- default.properties        26 Nov 2003 03:18:28 -0000      1.8
  +++ default.properties        10 Dec 2003 00:04:46 -0000      1.9
  @@ -13,7 +13,7 @@
   component.title = Encode/Decode Utilities
   
   # The current version number of this component
  -component.version = 1.2
  +component.version = 1.3-dev
   
   # The name that is used to create the jar file
   final.name = ${component.name}-${component.version}
  
  
  
  1.14      +136 -57   
jakarta-commons/codec/src/java/org/apache/commons/codec/language/Soundex.java
  
  Index: Soundex.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-commons/codec/src/java/org/apache/commons/codec/language/Soundex.java,v
  retrieving revision 1.13
  retrieving revision 1.14
  diff -u -r1.13 -r1.14
  --- Soundex.java      12 Nov 2003 19:02:57 -0000      1.13
  +++ Soundex.java      10 Dec 2003 00:04:46 -0000      1.14
  @@ -2,58 +2,45 @@
    * ====================================================================
    * 
    * The Apache Software License, Version 1.1
  - *
  - * Copyright (c) 2001-2003 The Apache Software Foundation.  All rights
  - * reserved.
  - *
  + * 
  + * Copyright (c) 2001-2003 The Apache Software Foundation. All rights reserved.
  + * 
    * Redistribution and use in source and binary forms, with or without
  - * modification, are permitted provided that the following conditions
  - * are met:
  - *
  - * 1. Redistributions of source code must retain the above copyright
  - *    notice, this list of conditions and the following disclaimer. 
  - *
  - * 2. Redistributions in binary form must reproduce the above copyright
  - *    notice, this list of conditions and the following disclaimer in
  - *    the documentation and/or other materials provided with the
  - *    distribution.
  - *
  - * 3. The end-user documentation included with the redistribution,
  - *    if any, must include the following acknowledgement:  
  - *       "This product includes software developed by the 
  - *        Apache Software Foundation (http://www.apache.org/)."
  - *    Alternately, this acknowledgement may appear in the software itself,
  - *    if and wherever such third-party acknowledgements normally appear.
  - *
  - * 4. The names "Apache", "The Jakarta Project", "Commons", and "Apache Software
  - *    Foundation" must not be used to endorse or promote products derived
  - *    from this software without prior written permission. For written 
  - *    permission, please contact [EMAIL PROTECTED]
  - *
  - * 5. Products derived from this software may not be called "Apache",
  - *    "Apache" nor may "Apache" appear in their name without prior 
  - *    written permission of the Apache Software Foundation.
  - *
  - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  - * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  - * SUCH DAMAGE.
  + * modification, are permitted provided that the following conditions are met: 1.
  + * Redistributions of source code must retain the above copyright notice, this
  + * list of conditions and the following disclaimer. 2. Redistributions in
  + * binary form must reproduce the above copyright notice, this list of
  + * conditions and the following disclaimer in the documentation and/or other
  + * materials provided with the distribution. 3. The end-user documentation
  + * included with the redistribution, if any, must include the following
  + * acknowledgement: "This product includes software developed by the Apache
  + * Software Foundation (http://www.apache.org/)." Alternately, this
  + * acknowledgement may appear in the software itself, if and wherever such
  + * third-party acknowledgements normally appear. 4. The names "Apache", "The
  + * Jakarta Project", "Commons", and "Apache Software Foundation" must not be
  + * used to endorse or promote products derived from this software without prior
  + * written permission. For written permission, please contact
  + * [EMAIL PROTECTED] 5. Products derived from this software may not be called
  + * "Apache", "Apache" nor may "Apache" appear in their name without prior
  + * written permission of the Apache Software Foundation.
  + * 
  + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
  + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  + * APACHE SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    * ====================================================================
  - *
  - * This software consists of voluntary contributions made by many
  - * individuals on behalf of the Apache Software Foundation.  For more
  - * information on the Apache Software Foundation, please see
  - * <http://www.apache.org/>.
  - *
  - */ 
  + * 
  + * This software consists of voluntary contributions made by many individuals
  + * on behalf of the Apache Software Foundation. For more information on the
  + * Apache Software Foundation, please see <http://www.apache.org/> .
  + *  
  + */
   
   package org.apache.commons.codec.language;
   
  @@ -85,10 +72,92 @@
       public static final char[] US_ENGLISH_MAPPING = 
"01230120022455012623010202".toCharArray();
   
       /**
  +      * Returns the difference between the Soundex values of two Strings. For
  +      * Soundex, this return value ranges from 0 through 4: 0 indicates little or
  +      * no similarity, and 4 indicates strong similarity or identical values.
  +      * 
  +      * @param s1
  +      *                  A String.
  +      * @param s2
  +      *                  A String.
  +      * @return The return value ranges from 0 through 4: 0 indicates little or
  +      *             no similarity, and 4 indicates strong similarity or identical
  +      *             values.
  +      * 
  +     * @see #difference(StringEncoder,String,String)
  +      * @see <a 
href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
  +      *          MS T-SQL DIFFERENCE</a>
  +      * 
  +      * @throws EncoderException
  +      *                  if an error occurs encoding one of the strings
  +      */
  +    public int difference(String s1, String s2) throws EncoderException {
  +        return difference(this, s1, s2);
  +    }
  +
  +    /**
  +      * Returns the difference between the encoded values of two Strings. The
  +      * higher the difference factor, the more similar the strings. For Soundex,
  +      * this return value ranges from 0 through 4: 0 indicates little or no
  +      * similarity, and 4 indicates strong similarity or identical values.
  +      * 
  +      * @param encoder
  +      *                  The encoder to use to encode the String parameters with.
  +      * @param s1
  +      *                  A String.
  +      * @param s2
  +      *                  A String.
  +      * @return an integer from 0 to the length of the shorter string. The
  +      *             smaller the number, the more different the strings are.
  +      * 
  +     * @see #differenceEncoded(String,String)
  +      * @see <a 
href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
  +      *          MS T-SQL DIFFERENCE</a>
  +      * 
  +      * @throws EncoderException
  +      *                  if an error occurs encoding one of the strings
  +      */
  +    public static int difference(StringEncoder encoder, String s1, String s2) 
throws EncoderException {
  +        return differenceEncoded(encoder.encode(s1), encoder.encode(s2));
  +    }
  +
  +    /**
  +      * Returns the difference between the values of two encoded Strings. The
  +      * higher the difference factor, the more similar the strings. For Soundex,
  +      * this return value ranges from 0 through 4: 0 indicates little or no
  +      * similarity, and 4 indicates strong similarity or identical values.
  +      * 
  +      * @param es1
  +      *                  An encoded String.
  +      * @param es2
  +      *                  An encoded String.
  +      * @return an integer from 0 to the length of the shorter string. The
  +      *             smaller the number, the more different the strings are.
  +      * 
  +      * @see <a 
href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
  +      *          MS T-SQL DIFFERENCE</a>
  +      */
  +    public static int differenceEncoded(String es1, String es2) {
  +
  +        if (es1 == null || es2 == null) {
  +            return 0;
  +        }
  +        int lengthToMatch = Math.min(es1.length(), es2.length());
  +        int diff = 0;
  +        for (int i = 0; i < lengthToMatch; i++) {
  +            if (es1.charAt(i) == es2.charAt(i)) {
  +                diff++;
  +            }
  +        }
  +        return diff;
  +    }
  +
  +    /**
         * The maximum length of a Soundex code - Soundex codes are only four
         * characters by definition.
  -     * 
  -     * @deprecated This feature is not needed since the encoding size must be 
constant.
  +      * 
  +      * @deprecated This feature is not needed since the encoding size must be
  +      *                     constant.
         */
       private int maxLength = 4;
   
  @@ -123,6 +192,10 @@
       /**
         * Cleans up the input string before Soundex processing by only returning
         * upper case letters.
  +      * 
  +      * @param str
  +      *                  The String to clean
  +      * @return a clean String.
         */
       private String clean(String str) {
           if (str == null || str.length() == 0) {
  @@ -211,7 +284,8 @@
       /**
         * Returns the maxLength. Standard Soundex
         * 
  -     * @deprecated This feature is not needed since the encoding size must be 
constant.
  +      * @deprecated This feature is not needed since the encoding size must be
  +      *                     constant.
         * @return int
         */
       public int getMaxLength() {
  @@ -227,6 +301,10 @@
   
       /**
         * Maps the given upper-case character to its Soundex code.
  +      * 
  +      * @param c
  +      *                  An upper-case character.
  +      * @return A Soundex code.
         */
       private char map(char c) {
           return this.getSoundexMapping()[c - 'A'];
  @@ -235,7 +313,8 @@
       /**
         * Sets the maxLength.
         * 
  -     * @deprecated This feature is not needed since the encoding size must be 
constant.
  +      * @deprecated This feature is not needed since the encoding size must be
  +      *                     constant.
         * @param maxLength
         *                  The maxLength to set
         */
  @@ -266,7 +345,7 @@
           if (str.length() == 0) {
               return str;
           }
  -        char out[] = { '0', '0', '0', '0' };
  +        char out[] = {'0', '0', '0', '0'};
           char last, mapped;
           int incount = 1, count = 1;
           out[0] = str.charAt(0);
  @@ -283,4 +362,4 @@
           return new String(out);
       }
   
  -}
  +}
  \ No newline at end of file
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to