bayard 2002/11/18 00:46:02
Modified: codec/src/java/org/apache/commons/codec Metaphone.java
Soundex.java
Added: codec/src/java/org/apache/commons/codec Encoder.java
EncoderComparator.java RefinedSoundex.java
Removed: codec/src/java/org/apache/commons/codec
SoundexComparator.java
Log:
Added a common interface (Encoder), added a new algorithm (refined soundex), and
introduced a generic comparator (EncoderComparator).
Submitted-by: "O'brien, Tim" <[EMAIL PROTECTED]>
Revision Changes Path
1.3 +320 -293
jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/Metaphone.java
Index: Metaphone.java
===================================================================
RCS file:
/home/cvs/jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/Metaphone.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- Metaphone.java 21 Oct 2002 19:05:19 -0000 1.2
+++ Metaphone.java 18 Nov 2002 08:46:02 -0000 1.3
@@ -1,293 +1,320 @@
-// Permission given by wbrogden for code to be used anywhere.
-package org.apache.commons.codec;
-
-
-/* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2001 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Commons" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact [EMAIL PROTECTED]
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Turbine", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
-
-/* Metaphone.java
- * A class to generate phonetic code and keep lists of objects
- * retrievable by a phonetic code.
- * reference: Computer Language of Dec. 1990, p 39
- * "Hanging on the Metaphone" by Lawrence Philips
- *
- * This Java implementation, William B. Brogden. December, 1997
- *
- * @author [EMAIL PROTECTED]
- */
-
-/*
- * List functionality removed: 2001-06-21 [EMAIL PROTECTED]
- */
-
-/*
- * Notes:
- * The static method metaphone converts an input String into a code.
- * All input is converted to upper case.
- * Limitations: Input format is expected to be a single ASCII word
- * with only characters in the A - Z range, no punctuation or numbers.
- *
- */
-
-class Metaphone {
-
- static String vowels = "AEIOU" ;
- static String frontv = "EIY" ;
- static String varson = "CSPTG" ;
-
- static final int maxCodeLen = 4 ;
-
- /**
- * Are the metaphones of two strings the same.
- */
- static public boolean isMetaphoneEqual(String str1, String str2) {
- return metaphone(str1).equals(metaphone(str2));
- }
-
- /**
- * Find the metaphone value of a String. This is similar to the
- * soundex algorithm, but better at finding similar sounding words.
- */
- static public String metaphone( String txt ){
- int mtsz = 0 ;
- boolean hard = false ;
- if(( txt == null ) ||
- ( txt.length() == 0 )) return "" ;
- // single character is itself
- if( txt.length() == 1 ) return txt.toUpperCase() ;
-
- char[] inwd = txt.toUpperCase().toCharArray() ;
-
- String tmpS ;
- StringBuffer local = new StringBuffer( 40 ); // manipulate
- StringBuffer code = new StringBuffer( 10 ) ; // output
- // handle initial 2 characters exceptions
- switch( inwd[0] ){
- case 'K': case 'G' : case 'P' : /* looking for KN, etc*/
- if( inwd[1] == 'N')local.append(inwd, 1, inwd.length - 1 );
- else local.append( inwd );
- break;
- case 'A': /* looking for AE */
- if( inwd[1] == 'E' )local.append(inwd, 1, inwd.length - 1 );
- else local.append( inwd );
- break;
- case 'W' : /* looking for WR or WH */
- if( inwd[1] == 'R' ){ // WR -> R
- local.append(inwd, 1, inwd.length - 1 ); break ;
- }
- if( inwd[1] == 'H'){
- local.append(inwd, 1, inwd.length - 1 );
- local.setCharAt( 0,'W'); // WH -> W
- }
- else local.append( inwd );
- break;
- case 'X' : /* initial X becomes S */
- inwd[0] = 'S' ;local.append( inwd );
- break ;
- default :
- local.append( inwd );
- } // now local has working string with initials fixed
- int wdsz = local.length();
- int n = 0 ;
- while((mtsz < maxCodeLen ) && // max code size of 4 works well
- (n < wdsz ) ){
- char symb = local.charAt(n) ;
- // remove duplicate letters except C
- if(( symb != 'C' ) &&
- (n > 0 ) && ( local.charAt(n - 1 ) == symb )) n++ ;
- else{ // not dup
- switch( symb ){
- case 'A' : case 'E' : case 'I' : case 'O' : case 'U' :
- if( n == 0 ) { code.append(symb );mtsz++;
- }
- break ; // only use vowel if leading char
- case 'B' :
- if( (n > 0 ) &&
- !(n + 1 == wdsz ) && // not MB at end of word
- ( local.charAt(n - 1) == 'M')) {
- code.append(symb);
- }
- else code.append(symb);
- mtsz++ ;
- break ;
- case 'C' : // lots of C special cases
- /* discard if SCI, SCE or SCY */
- if( ( n > 0 ) &&
- ( local.charAt(n-1) == 'S' ) &&
- ( n + 1 < wdsz ) &&
- ( frontv.indexOf( local.charAt(n + 1)) >= 0 )){ break ;}
- tmpS = local.toString();
- if( tmpS.indexOf("CIA", n ) == n ) { // "CIA" -> X
- code.append('X' ); mtsz++; break ;
- }
- if( ( n + 1 < wdsz ) &&
- (frontv.indexOf( local.charAt(n+1) )>= 0 )){
- code.append('S');mtsz++; break ; // CI,CE,CY -> S
- }
- if(( n > 0) &&
- ( tmpS.indexOf("SCH",n-1 )== n-1 )){ // SCH->sk
- code.append('K') ; mtsz++;break ;
- }
- if( tmpS.indexOf("CH", n ) == n ){ // detect CH
- if((n == 0 ) &&
- (wdsz >= 3 ) && // CH consonant -> K consonant
- (vowels.indexOf( local.charAt( 2) ) < 0 )){
- code.append('K');
- }
- else { code.append('X'); // CHvowel -> X
- }
- mtsz++;
- }
- else { code.append('K' );mtsz++;
- }
- break ;
- case 'D' :
- if(( n + 2 < wdsz )&& // DGE DGI DGY -> J
- ( local.charAt(n+1) == 'G' )&&
- (frontv.indexOf( local.charAt(n+2) )>= 0)){
- code.append('J' ); n += 2 ;
- }
- else { code.append( 'T' );
- }
- mtsz++;
- break ;
- case 'G' : // GH silent at end or before consonant
- if(( n + 2 == wdsz )&&
- (local.charAt(n+1) == 'H' )) break ;
- if(( n + 2 < wdsz ) &&
- (local.charAt(n+1) == 'H' )&&
- (vowels.indexOf( local.charAt(n+2)) < 0 )) break ;
- tmpS = local.toString();
- if((n > 0) &&
- ( tmpS.indexOf("GN", n ) == n)||
- ( tmpS.indexOf("GNED",n) == n )) break ; // silent G
- if(( n > 0 ) &&
- (local.charAt(n-1) == 'G')) hard = true ;
- else hard = false ;
- if((n+1 < wdsz) &&
- (frontv.indexOf( local.charAt(n+1) ) >= 0 )&&
- (!hard) ) code.append( 'J' );
- else code.append('K');
- mtsz++;
- break ;
- case 'H':
- if( n + 1 == wdsz ) break ; // terminal H
- if((n > 0) &&
- (varson.indexOf( local.charAt(n-1)) >= 0)) break ;
- if( vowels.indexOf( local.charAt(n+1)) >=0 ){
- code.append('H') ; mtsz++;// Hvowel
- }
- break;
- case 'F': case 'J' : case 'L' :
- case 'M': case 'N' : case 'R' :
- code.append( symb ); mtsz++; break ;
- case 'K' :
- if( n > 0 ){ // not initial
- if( local.charAt( n -1) != 'C' ) {
- code.append(symb );
- }
- }
- else code.append( symb ); // initial K
- mtsz++ ;
- break ;
- case 'P' :
- if((n + 1 < wdsz) && // PH -> F
- (local.charAt( n+1) == 'H'))code.append('F');
- else code.append( symb );
- mtsz++;
- break ;
- case 'Q' :
- code.append('K' );mtsz++; break ;
- case 'S' :
- tmpS = local.toString();
- if((tmpS.indexOf("SH", n )== n) ||
- (tmpS.indexOf("SIO",n )== n) ||
- (tmpS.indexOf("SIA",n )== n)) code.append('X');
- else code.append( 'S' );
- mtsz++ ;
- break ;
- case 'T' :
- tmpS = local.toString(); // TIA TIO -> X
- if((tmpS.indexOf("TIA",n )== n)||
- (tmpS.indexOf("TIO",n )== n) ){
- code.append('X'); mtsz++; break;
- }
- if( tmpS.indexOf("TCH",n )==n) break;
- // substitute numeral 0 for TH (resembles theta after all)
- if( tmpS.indexOf("TH", n )==n) code.append('0');
- else code.append( 'T' );
- mtsz++ ;
- break ;
- case 'V' :
- code.append('F'); mtsz++;break ;
- case 'W' : case 'Y' : // silent if not followed by vowel
- if((n+1 < wdsz) &&
- (vowels.indexOf( local.charAt(n+1))>=0)){
- code.append( symb );mtsz++;
- }
- break ;
- case 'X' :
- code.append('K'); code.append('S');mtsz += 2;
- break ;
- case 'Z' :
- code.append('S'); mtsz++; break ;
- } // end switch
- n++ ;
- } // end else from symb != 'C'
- if( mtsz > 4 )code.setLength( 4);
- }
- return code.toString();
- } // end static method metaPhone()
-
-}
+// Permission given by wbrogden for code to be used anywhere.
+package org.apache.commons.codec;
+
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Commons" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact [EMAIL PROTECTED]
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Turbine", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+/* Metaphone.java
+ * A class to generate phonetic code and keep lists of objects
+ * retrievable by a phonetic code.
+ * reference: Computer Language of Dec. 1990, p 39
+ * "Hanging on the Metaphone" by Lawrence Philips
+ *
+ * This Java implementation, William B. Brogden. December, 1997
+ *
+ * @author [EMAIL PROTECTED]
+ * @author [EMAIL PROTECTED]
+ */
+
+/*
+ * List functionality removed: 2001-06-21 [EMAIL PROTECTED]
+ */
+
+/*
+ * Notes:
+ * The static method metaphone converts an input String into a code.
+ * All input is converted to upper case.
+ * Limitations: Input format is expected to be a single ASCII word
+ * with only characters in the A - Z range, no punctuation or numbers.
+ *
+ */
+
+public class Metaphone implements Encoder {
+
+ private String vowels = "AEIOU" ;
+ private String frontv = "EIY" ;
+ private String varson = "CSPTG" ;
+
+ private int maxCodeLen = 4 ;
+
+ public Metaphone() {
+ super();
+ }
+
+
+ /**
+ * Find the metaphone value of a String. This is similar to the
+ * soundex algorithm, but better at finding similar sounding words.
+ */
+ public String metaphone( String txt ){
+ int mtsz = 0 ;
+ boolean hard = false ;
+ if(( txt == null ) ||
+ ( txt.length() == 0 )) return "" ;
+ // single character is itself
+ if( txt.length() == 1 ) return txt.toUpperCase() ;
+
+ char[] inwd = txt.toUpperCase().toCharArray() ;
+
+ String tmpS ;
+ StringBuffer local = new StringBuffer( 40 ); // manipulate
+ StringBuffer code = new StringBuffer( 10 ) ; // output
+ // handle initial 2 characters exceptions
+ switch( inwd[0] ){
+ case 'K': case 'G' : case 'P' : /* looking for KN, etc*/
+ if( inwd[1] == 'N')local.append(inwd, 1, inwd.length - 1 );
+ else local.append( inwd );
+ break;
+ case 'A': /* looking for AE */
+ if( inwd[1] == 'E' )local.append(inwd, 1, inwd.length - 1 );
+ else local.append( inwd );
+ break;
+ case 'W' : /* looking for WR or WH */
+ if( inwd[1] == 'R' ){ // WR -> R
+ local.append(inwd, 1, inwd.length - 1 ); break ;
+ }
+ if( inwd[1] == 'H'){
+ local.append(inwd, 1, inwd.length - 1 );
+ local.setCharAt( 0,'W'); // WH -> W
+ }
+ else local.append( inwd );
+ break;
+ case 'X' : /* initial X becomes S */
+ inwd[0] = 'S' ;local.append( inwd );
+ break ;
+ default :
+ local.append( inwd );
+ } // now local has working string with initials fixed
+ int wdsz = local.length();
+ int n = 0 ;
+ while((mtsz < maxCodeLen ) && // max code size of 4 works well
+ (n < wdsz ) ){
+ char symb = local.charAt(n) ;
+ // remove duplicate letters except C
+ if(( symb != 'C' ) &&
+ (n > 0 ) && ( local.charAt(n - 1 ) == symb )) n++ ;
+ else{ // not dup
+ switch( symb ){
+ case 'A' : case 'E' : case 'I' : case 'O' : case 'U' :
+ if( n == 0 ) { code.append(symb );mtsz++;
+ }
+ break ; // only use vowel if leading char
+ case 'B' :
+ if( (n > 0 ) &&
+ !(n + 1 == wdsz ) && // not MB at end of word
+ ( local.charAt(n - 1) == 'M')) {
+ code.append(symb);
+ }
+ else code.append(symb);
+ mtsz++ ;
+ break ;
+ case 'C' : // lots of C special cases
+ /* discard if SCI, SCE or SCY */
+ if( ( n > 0 ) &&
+ ( local.charAt(n-1) == 'S' ) &&
+ ( n + 1 < wdsz ) &&
+ ( frontv.indexOf( local.charAt(n + 1)) >= 0 )){ break ;}
+ tmpS = local.toString();
+ if( tmpS.indexOf("CIA", n ) == n ) { // "CIA" -> X
+ code.append('X' ); mtsz++; break ;
+ }
+ if( ( n + 1 < wdsz ) &&
+ (frontv.indexOf( local.charAt(n+1) )>= 0 )){
+ code.append('S');mtsz++; break ; // CI,CE,CY -> S
+ }
+ if(( n > 0) &&
+ ( tmpS.indexOf("SCH",n-1 )== n-1 )){ // SCH->sk
+ code.append('K') ; mtsz++;break ;
+ }
+ if( tmpS.indexOf("CH", n ) == n ){ // detect CH
+ if((n == 0 ) &&
+ (wdsz >= 3 ) && // CH consonant -> K consonant
+ (vowels.indexOf( local.charAt( 2) ) < 0 )){
+ code.append('K');
+ }
+ else { code.append('X'); // CHvowel -> X
+ }
+ mtsz++;
+ }
+ else { code.append('K' );mtsz++;
+ }
+ break ;
+ case 'D' :
+ if(( n + 2 < wdsz )&& // DGE DGI DGY -> J
+ ( local.charAt(n+1) == 'G' )&&
+ (frontv.indexOf( local.charAt(n+2) )>= 0)){
+ code.append('J' ); n += 2 ;
+ }
+ else { code.append( 'T' );
+ }
+ mtsz++;
+ break ;
+ case 'G' : // GH silent at end or before consonant
+ if(( n + 2 == wdsz )&&
+ (local.charAt(n+1) == 'H' )) break ;
+ if(( n + 2 < wdsz ) &&
+ (local.charAt(n+1) == 'H' )&&
+ (vowels.indexOf( local.charAt(n+2)) < 0 )) break ;
+ tmpS = local.toString();
+ if((n > 0) &&
+ ( tmpS.indexOf("GN", n ) == n)||
+ ( tmpS.indexOf("GNED",n) == n )) break ; // silent G
+ if(( n > 0 ) &&
+ (local.charAt(n-1) == 'G')) hard = true ;
+ else hard = false ;
+ if((n+1 < wdsz) &&
+ (frontv.indexOf( local.charAt(n+1) ) >= 0 )&&
+ (!hard) ) code.append( 'J' );
+ else code.append('K');
+ mtsz++;
+ break ;
+ case 'H':
+ if( n + 1 == wdsz ) break ; // terminal H
+ if((n > 0) &&
+ (varson.indexOf( local.charAt(n-1)) >= 0)) break ;
+ if( vowels.indexOf( local.charAt(n+1)) >=0 ){
+ code.append('H') ; mtsz++;// Hvowel
+ }
+ break;
+ case 'F': case 'J' : case 'L' :
+ case 'M': case 'N' : case 'R' :
+ code.append( symb ); mtsz++; break ;
+ case 'K' :
+ if( n > 0 ){ // not initial
+ if( local.charAt( n -1) != 'C' ) {
+ code.append(symb );
+ }
+ }
+ else code.append( symb ); // initial K
+ mtsz++ ;
+ break ;
+ case 'P' :
+ if((n + 1 < wdsz) && // PH -> F
+ (local.charAt( n+1) == 'H'))code.append('F');
+ else code.append( symb );
+ mtsz++;
+ break ;
+ case 'Q' :
+ code.append('K' );mtsz++; break ;
+ case 'S' :
+ tmpS = local.toString();
+ if((tmpS.indexOf("SH", n )== n) ||
+ (tmpS.indexOf("SIO",n )== n) ||
+ (tmpS.indexOf("SIA",n )== n)) code.append('X');
+ else code.append( 'S' );
+ mtsz++ ;
+ break ;
+ case 'T' :
+ tmpS = local.toString(); // TIA TIO -> X
+ if((tmpS.indexOf("TIA",n )== n)||
+ (tmpS.indexOf("TIO",n )== n) ){
+ code.append('X'); mtsz++; break;
+ }
+ if( tmpS.indexOf("TCH",n )==n) break;
+ // substitute numeral 0 for TH (resembles theta after all)
+ if( tmpS.indexOf("TH", n )==n) code.append('0');
+ else code.append( 'T' );
+ mtsz++ ;
+ break ;
+ case 'V' :
+ code.append('F'); mtsz++;break ;
+ case 'W' : case 'Y' : // silent if not followed by vowel
+ if((n+1 < wdsz) &&
+ (vowels.indexOf( local.charAt(n+1))>=0)){
+ code.append( symb );mtsz++;
+ }
+ break ;
+ case 'X' :
+ code.append('K'); code.append('S');mtsz += 2;
+ break ;
+ case 'Z' :
+ code.append('S'); mtsz++; break ;
+ } // end switch
+ n++ ;
+ } // end else from symb != 'C'
+ if( mtsz > 4 )code.setLength( 4);
+ }
+ return code.toString();
+ } // end static method metaPhone()
+
+ public String encode(String pString) {
+ return( metaphone( pString ) );
+ }
+
+ /**
+ * Are the metaphones of two strings the same.
+ */
+ public boolean isMetaphoneEqual(String str1, String str2) {
+ return metaphone(str1).equals(metaphone(str2));
+ }
+
+ /**
+ * Returns the maxCodeLen.
+ * @return int
+ */
+ public int getMaxCodeLen() {
+ return maxCodeLen;
+ }
+
+ /**
+ * Sets the maxCodeLen.
+ * @param maxCodeLen The maxCodeLen to set
+ */
+ public void setMaxCodeLen(int maxCodeLen) {
+ this.maxCodeLen = maxCodeLen;
+ }
+
+}
+
1.2 +143 -115
jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/Soundex.java
Index: Soundex.java
===================================================================
RCS file:
/home/cvs/jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/Soundex.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- Soundex.java 22 Feb 2002 05:20:00 -0000 1.1
+++ Soundex.java 18 Nov 2002 08:46:02 -0000 1.2
@@ -1,115 +1,143 @@
-package org.apache.commons.codec;
-
-/* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2001 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Commons" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact [EMAIL PROTECTED]
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Turbine", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
-
-/**
- * Find the Soundex of a string. Needs internationalisation in a
- * future release.
- *
- * @author [EMAIL PROTECTED]
- * @version $Id$
- */
-public class Soundex {
-
- static public final char[] US_ENGLISH_MAPPING =
- "01230120022455012623010202".toCharArray();
-
- static public final Soundex US_ENGLISH = new Soundex();
-
- private char[] soundexMapping;
-
- public Soundex() {
- this(US_ENGLISH_MAPPING);
- }
-
- public Soundex(char[] mapping) {
- this.soundexMapping = mapping;
- }
-
- /**
- * Get the SoundEx value of a string.
- * This implementation is taken from the code-snippers on
- * http://www.sourceforge.net/
- */
- public String soundex(String str) {
- char out[] = { '0', '0', '0', '0' };
- char last, mapped;
- int incount = 1, count = 1;
- out[0] = Character.toUpperCase( str.charAt(0) );
- last = getMappingCode( str.charAt(0) );
- while( (incount < str.length() ) &&
- (mapped = getMappingCode(str.charAt(incount++))) != 0 &&
- (count < 4) )
- {
- if( (mapped != '0') && (mapped != last) ) {
- out[count++] = mapped;
- }
- last = mapped;
- }
- return new String(out);
- }
-
- /**
- * Used internally by the SoundEx algorithm.
- */
- private char getMappingCode(char c) {
- if( !Character.isLetter(c) ) {
- return 0;
- } else {
- return soundexMapping[Character.toUpperCase(c) - 'A'];
- }
- }
-
-}
+package org.apache.commons.codec;
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Commons" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact [EMAIL PROTECTED]
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Turbine", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+/**
+ * Encodes a string into a refined soundex value.
+ * A refined soundex code is optimized for spell checking word.
+ * "Soundex" method originally developed by Margaret Odell and
+ * Robert Russell
+ *
+ * http://www.bluepoof.com/Soundex/info2.html
+ *
+ * @todo Needs internationalisation in a future release.
+ *
+ * @author [EMAIL PROTECTED]
+ * @author [EMAIL PROTECTED]
+ * @version $Id$
+ */
+public class Soundex implements Encoder {
+
+ static public final char[] US_ENGLISH_MAPPING =
+ "01230120022455012623010202".toCharArray();
+
+ static public final Soundex US_ENGLISH = new Soundex();
+
+ private char[] soundexMapping;
+ private int maxLength = 4;
+
+ public Soundex() {
+ this(US_ENGLISH_MAPPING);
+ }
+
+ public Soundex(char[] mapping) {
+ this.soundexMapping = mapping;
+ }
+
+ /**
+ * Get the SoundEx value of a string.
+ * This implementation is taken from the code-snippers on
+ * http://www.sourceforge.net/
+ */
+ public String soundex(String str) {
+ char out[] = { '0', '0', '0', '0' };
+ char last, mapped;
+ int incount = 1, count = 1;
+ out[0] = Character.toUpperCase( str.charAt(0) );
+ last = getMappingCode( str.charAt(0) );
+ while( (incount < str.length() ) &&
+ (mapped = getMappingCode(str.charAt(incount++))) != 0 &&
+ (count < maxLength) )
+ {
+ if( (mapped != '0') && (mapped != last) ) {
+ out[count++] = mapped;
+ }
+ last = mapped;
+ }
+ return new String(out);
+ }
+
+ public String encode(String pString) {
+ return( soundex( pString ) );
+ }
+
+ /**
+ * Used internally by the SoundEx algorithm.
+ */
+ private char getMappingCode(char c) {
+ if( !Character.isLetter(c) ) {
+ return 0;
+ } else {
+ return soundexMapping[Character.toUpperCase(c) - 'A'];
+ }
+ }
+
+ /**
+ * Returns the maxLength. Standard Soundex
+ * @return int
+ */
+ public int getMaxLength() {
+ return maxLength;
+ }
+
+ /**
+ * Sets the maxLength.
+ * @param maxLength The maxLength to set
+ */
+ public void setMaxLength(int maxLength) {
+ this.maxLength = maxLength;
+ }
+
+}
1.1
jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/Encoder.java
Index: Encoder.java
===================================================================
package org.apache.commons.codec;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Commons" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact [EMAIL PROTECTED]
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/**
 * Common contract for classes that turn a String into an encoded
 * String. Soundex and Metaphone are among the implementations.
 *
 * @author [EMAIL PROTECTED]
 * @version $Id: Encoder.java,v 1.1 2002/11/18 08:46:02 bayard Exp $
 */
public interface Encoder {

    /**
     * Encodes the supplied string.
     *
     * @param str the text to encode
     * @return the encoded form of <code>str</code>
     */
    String encode(String str);
}
1.1
jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/EncoderComparator.java
Index: EncoderComparator.java
===================================================================
package org.apache.commons.codec;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Commons" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact [EMAIL PROTECTED]
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.util.Comparator;
/**
 * Orders objects by the encoded form of their string representation.
 * Each argument is converted with <code>toString()</code>, run through
 * the configured {@link Encoder}, and the two encoded strings are then
 * compared with <code>String.compareTo</code>.
 *
 * @author [EMAIL PROTECTED]
 * @version $Id: EncoderComparator.java,v 1.1 2002/11/18 08:46:02 bayard Exp $
 */
public class EncoderComparator implements Comparator {

    /** Encoder applied to both operands before they are compared. */
    private final Encoder encoder;

    /**
     * Creates a comparator backed by the default encoder,
     * {@link RefinedSoundex#US_ENGLISH}.
     */
    public EncoderComparator() {
        this(RefinedSoundex.US_ENGLISH);
    }

    /**
     * Creates a comparator backed by the given encoder.
     *
     * @param en the encoder used to encode both operands
     */
    public EncoderComparator(Encoder en) {
        encoder = en;
    }

    /**
     * Compares two objects by their encoded string forms.
     *
     * @param o1 the first object; its <code>toString()</code> value is encoded
     * @param o2 the second object; its <code>toString()</code> value is encoded
     * @return the result of comparing the encoding of <code>o1</code>
     *         to the encoding of <code>o2</code> with <code>String.compareTo</code>
     */
    public int compare(Object o1, Object o2) {
        // Left operand is encoded first, then the right one, then compared.
        return encoder.encode(o1.toString())
                      .compareTo(encoder.encode(o2.toString()));
    }

}
1.1
jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/RefinedSoundex.java
Index: RefinedSoundex.java
===================================================================
package org.apache.commons.codec;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Commons" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact [EMAIL PROTECTED]
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/**
 * Encodes a string into a refined soundex value. Soundex is an encoding
 * used to relate similar names, but can also be used as a general purpose
 * scheme to find words with similar phonemes.
 * More information may be found at: http://www.bluepoof.com/Soundex/info2.html
 *
 * @todo Needs internationalisation in a future release.
 *
 * @author [EMAIL PROTECTED]
 * @version $Id: RefinedSoundex.java,v 1.1 2002/11/18 08:46:02 bayard Exp $
 */
public class RefinedSoundex implements Encoder {

    /**
     * Code table for US English: entry <code>i</code> is the code
     * character emitted for the letter <code>'A' + i</code>.
     */
    public static final char[] US_ENGLISH_MAPPING =
        "01360240043788015936020505".toCharArray();

    /** Shared instance that uses {@link #US_ENGLISH_MAPPING}. */
    public static final RefinedSoundex US_ENGLISH = new RefinedSoundex();

    /** Private copy of the code table, indexed by <code>letter - 'A'</code>. */
    private final char[] soundexMapping;

    /**
     * Creates an encoder that uses {@link #US_ENGLISH_MAPPING}.
     */
    public RefinedSoundex() {
        this(US_ENGLISH_MAPPING);
    }

    /**
     * Creates an encoder that uses the given code table.
     *
     * @param mapping code characters indexed by <code>letter - 'A'</code>;
     *                the array is copied, so later changes made by the
     *                caller do not affect this encoder
     */
    public RefinedSoundex(char[] mapping) {
        // Defensive copy: the table handed in may be a shared, publicly
        // mutable array (US_ENGLISH_MAPPING is one).
        this.soundexMapping = (char[]) mapping.clone();
    }

    /**
     * Get the refined soundex value of a string.
     * The result is the first character of the upper-cased input followed
     * by the mapped code of every character (the first one included).
     * A code equal to the code of the immediately preceding character is
     * dropped; characters without a code emit nothing but end such a run.
     * This implementation is taken from the code snippets on
     * http://www.sourceforge.net/
     *
     * @param str the string to encode, may be <code>null</code> or empty
     * @return the encoded value; <code>null</code> for <code>null</code>
     *         input and the empty string for empty input
     */
    public String soundex(String str) {
        if (str == null) {
            return null;
        }
        if (str.length() == 0) {
            return str;
        }
        // Fixed locale so that results do not depend on the JVM default
        // (e.g. Turkish dotted/dotless i).
        str = str.toUpperCase(java.util.Locale.ENGLISH);

        StringBuffer sBuf = new StringBuffer();
        sBuf.append(str.charAt(0));

        // '*' never occurs as a mapping result for the default table, so
        // the first character's code is always considered "new".
        char last = '*';
        for (int i = 0; i < str.length(); i++) {
            char current = getMappingCode(str.charAt(i));
            if (current == last) {
                continue;
            }
            if (current != 0) {
                sBuf.append(current);
            }
            last = current;
        }
        return sBuf.toString();
    }

    /**
     * Encodes a string by delegating to {@link #soundex(String)}.
     *
     * @param pString the string to encode
     * @return the refined soundex value of <code>pString</code>
     */
    public String encode(String pString) {
        return soundex(pString);
    }

    /**
     * Looks up the code for a single character.
     *
     * @param c the character to map
     * @return the code character from the mapping table, or <code>0</code>
     *         when <code>c</code> is not a letter or falls outside the table
     */
    private char getMappingCode(char c) {
        if (!Character.isLetter(c)) {
            return 0;
        }
        // Character.isLetter accepts letters beyond A-Z (accented, non-Latin),
        // so the index must be range-checked before it is used.
        int index = Character.toUpperCase(c) - 'A';
        if (index < 0 || index >= soundexMapping.length) {
            return 0;
        }
        return soundexMapping[index];
    }

}
--
To unsubscribe, e-mail: <mailto:[EMAIL PROTECTED]>
For additional commands, e-mail: <mailto:[EMAIL PROTECTED]>