tobrien 2003/05/29 16:03:29
Modified: codec checkstyle.properties project.properties
codec/src/java/org/apache/commons/codec/base64 Base64.java
codec/src/java/org/apache/commons/codec/binary Base64.java
Hex.java
codec/src/java/org/apache/commons/codec/language
Metaphone.java RefinedSoundex.java Soundex.java
Log:
Fixed a number of checkstyle problems - from around 270 checkstyle issues to 16
Revision Changes Path
1.2 +15 -10 jakarta-commons/codec/checkstyle.properties
Index: checkstyle.properties
===================================================================
RCS file: /home/cvs/jakarta-commons/codec/checkstyle.properties,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- checkstyle.properties 25 Apr 2003 17:50:55 -0000 1.1
+++ checkstyle.properties 29 May 2003 23:03:28 -0000 1.2
@@ -1,14 +1,19 @@
-#checkstyle.header.file=LICENSE.txt
+checkstyle.header.file=LICENSE.txt
# 2-5 = CVS Header in Commons license, 10 = copyright date, 32 = product name
-checkstyle.header.ignoreline=2,3,4,5,10,32
+#checkstyle.header.ignoreline=2,3,4,5,10,32
-checkstyle.ignore.maxlinelen=2
+# Ignore operator wrap, this has the effect of allowing
+# operators to appear at both the eol and the nl. This
+# setting should be eol, but checkstyle had problems
+# parsing this property when set to "eol". "ignore"
+# was selected as a fallback.
+checkstyle.wrap.operator = ignore
+
+# Ignore padding around parentheses; this allows for both
+# foo(a,b), and foo( a, b ).
+checkstyle.paren.pad = ignore
+
+# One should not be instantiating a java.lang.Boolean
+checkstyle.illegal.instantiations = java.lang.Boolean
-checkstyle.excludes=**/parser/*
-checkstyle.lcurly.type=eol
-checkstyle.lcurly.method=nlow
-checkstyle.lcurly.other=eol
-checkstyle.rcurly=alone
-checkstyle.javadoc.scope=nothing
-checkstyle.allow.protected=true
1.2 +0 -1 jakarta-commons/codec/project.properties
Index: project.properties
===================================================================
RCS file: /home/cvs/jakarta-commons/codec/project.properties,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- project.properties 25 Apr 2003 17:50:55 -0000 1.1
+++ project.properties 29 May 2003 23:03:28 -0000 1.2
@@ -3,7 +3,6 @@
##
maven.checkstyle.properties=${basedir}/checkstyle.properties
-maven.checkstyle.excludes=**/parser/*
maven.test.failure = false
maven.junit.fork=true
maven.linkcheck.enable=true
1.2 +95 -39
jakarta-commons/codec/src/java/org/apache/commons/codec/base64/Base64.java
Index: Base64.java
===================================================================
RCS file:
/home/cvs/jakarta-commons/codec/src/java/org/apache/commons/codec/base64/Base64.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- Base64.java 25 Apr 2003 17:50:56 -0000 1.1
+++ Base64.java 29 May 2003 23:03:28 -0000 1.2
@@ -84,17 +84,56 @@
public final class Base64 {
protected static final String DEFAULT_CHAR_ENCODING = "ISO-8859-1";
- private static final int BASELENGTH = 255;
- private static final int LOOKUPLENGTH = 64;
- private static final int TWENTYFOURBITGROUP = 24;
- private static final int EIGHTBIT = 8;
- private static final int SIXTEENBIT = 16;
- private static final int SIXBIT = 6;
- private static final int FOURBYTE = 4;
- private static final int SIGN = -128;
- private static final byte PAD = (byte) '=';
- private static final byte[] EMPTY_BYTE_ARRAY = new byte[0];
+
+ /**
+ * The base length
+ */
+ static final int BASELENGTH = 255;
+
+ /**
+ * Lookup length
+ */
+ static final int LOOKUPLENGTH = 64;
+
+ /**
+ * Used to calculate the number of bits in a byte.
+ */
+ static final int EIGHTBIT = 8;
+
+ /**
+ * Used when encoding something which has fewer than 24 bits
+ */
+ static final int SIXTEENBIT = 16;
+
+ /**
+ * Constant used to determine how many bits data contains
+ */
+ static final int TWENTYFOURBITGROUP = 24;
+
+ /**
+ * Used to get the number of Quadruples
+ */
+ static final int FOURBYTE = 4;
+
+ /**
+ * Used to test the sign of a byte
+ */
+ static final int SIGN = -128;
+
+ /**
+ * Byte used to pad output
+ */
+ static final byte PAD = (byte) '=';
+
+ // Create arrays to hold the base64 characters and a
+ // lookup for base64 chars
private static byte[] base64Alphabet = new byte[BASELENGTH];
+
+ private static final byte[] EMPTY_BYTE_ARRAY = new byte[0];
+
+ /**
+ * Lookup table
+ */
private static byte[] lookUpBase64Alphabet = new byte[LOOKUPLENGTH];
static {
@@ -133,17 +172,37 @@
}
+ /**
+ * Tests to see whether the bytes of this string are
+ * Base64
+ *
+ * @param isValidString String to test
+ * @return true if String is base64
+ */
public static boolean isBase64(String isValidString) {
return (isBase64(isValidString.getBytes()));
}
-
+ /**
+ * Tests a byte to see whether it falls within the Base64
+ * alphabet (or if it is a padding character).
+ *
+ * @param octect byte to test
+ * @return true if byte is in alphabet or padding
+ */
public static boolean isBase64(byte octect) {
// Should we ignore white space?
return (octect == PAD || base64Alphabet[octect] != -1);
}
-
+ /**
+ * Tests byte array to see if all characters are within the
+ * Base64 alphabet
+ *
+ * @param arrayOctect A byte[] to test
+ * @return true if all data falls within the Base64 alphabet OR if the
+ * array is empty.
+ */
public static boolean isBase64(byte[] arrayOctect) {
int length = arrayOctect.length;
if (length == 0) {
@@ -177,8 +236,7 @@
if (fewerThan24bits != 0) {
//data not divisible by 24 bit
encodedData = new byte[(numberTriplets + 1) * 4];
- }
- else {
+ } else {
// 16 or 8 bit
encodedData = new byte[numberTriplets * 4];
}
@@ -212,8 +270,10 @@
: (byte) ((b3) >> 6 ^ 0xfc);
encodedData[encodedIndex] = lookUpBase64Alphabet[val1];
- encodedData[encodedIndex + 1] = lookUpBase64Alphabet[val2 | (k << 4)];
- encodedData[encodedIndex + 2] = lookUpBase64Alphabet[(l << 2) | val3];
+ encodedData[encodedIndex + 1] = lookUpBase64Alphabet[val2
+ | (k << 4)];
+ encodedData[encodedIndex + 2] = lookUpBase64Alphabet[(l << 2)
+ | val3];
encodedData[encodedIndex + 3] = lookUpBase64Alphabet[b3 & 0x3f];
}
@@ -231,8 +291,7 @@
encodedData[encodedIndex + 1] = lookUpBase64Alphabet[k << 4];
encodedData[encodedIndex + 2] = PAD;
encodedData[encodedIndex + 3] = PAD;
- }
- else if (fewerThan24bits == SIXTEENBIT) {
+ } else if (fewerThan24bits == SIXTEENBIT) {
b1 = binaryData[dataIndex];
b2 = binaryData[dataIndex + 1];
l = (byte) (b2 & 0x0f);
@@ -247,7 +306,8 @@
: (byte) ((b2) >> 4 ^ 0xf0);
encodedData[encodedIndex] = lookUpBase64Alphabet[val1];
- encodedData[encodedIndex + 1] = lookUpBase64Alphabet[val2 | (k << 4)];
+ encodedData[encodedIndex + 1] = lookUpBase64Alphabet[val2
+ | (k << 4)];
encodedData[encodedIndex + 2] = lookUpBase64Alphabet[l << 2];
encodedData[encodedIndex + 3] = PAD;
}
@@ -266,8 +326,7 @@
public static String encode(String data) {
try {
return encode(data, DEFAULT_CHAR_ENCODING);
- }
- catch (UnsupportedEncodingException uee) {
+ } catch (UnsupportedEncodingException uee) {
throw new IllegalStateException(uee.toString());
}
}
@@ -281,12 +340,11 @@
*
* @param data String of data to convert
* @param charEncoding the character encoding to use when converting
- * a String to a byte[]
+ * a String to a byte[]
* @return Base64-encoded String
*/
public static String encode(String data, String charEncoding)
- throws UnsupportedEncodingException
- {
+ throws UnsupportedEncodingException {
// Check arguments
if (data == null) {
@@ -301,8 +359,7 @@
OutputStreamWriter osw = new OutputStreamWriter(bos, charEncoding);
try {
osw.write(data);
- }
- catch (IOException ioe) {
+ } catch (IOException ioe) {
throw new RuntimeException(ioe.toString());
}
@@ -316,8 +373,7 @@
bos = new ByteArrayOutputStream(encodedData.length);
try {
bos.write(encodedData);
- }
- catch (IOException ioe) {
+ } catch (IOException ioe) {
throw new RuntimeException(ioe.toString());
}
@@ -327,7 +383,7 @@
/**
* Decodes Base64 data into octects
*
- * @param binaryData Byte array containing Base64 data
+ * @param base64Data Byte array containing Base64 data
* @return Array containing decoded data.
*/
public static byte[] decode(byte[] base64Data) {
@@ -370,19 +426,19 @@
b4 = base64Alphabet[marker1];
decodedData[encodedIndex] = (byte) (b1 << 2 | b2 >> 4);
- decodedData[encodedIndex + 1] = (byte) (((b2 & 0xf) << 4) | ((b3 >> 2) & 0xf));
+ decodedData[encodedIndex + 1] = (byte) (((b2 & 0xf) << 4)
+ | ((b3 >> 2) & 0xf));
decodedData[encodedIndex + 2] = (byte) (b3 << 6 | b4);
- }
- else if (marker0 == PAD) {
+ } else if (marker0 == PAD) {
//Two PAD e.g. 3c[Pad][Pad]
decodedData[encodedIndex] = (byte) (b1 << 2 | b2 >> 4);
- }
- else if (marker1 == PAD) {
+ } else if (marker1 == PAD) {
//One PAD e.g. 3cQ[Pad]
b3 = base64Alphabet[marker0];
decodedData[encodedIndex] = (byte) (b1 << 2 | b2 >> 4);
- decodedData[encodedIndex + 1] = (byte) (((b2 & 0xf) << 4) | ((b3 >> 2) & 0xf));
+ decodedData[encodedIndex + 1] = (byte) (((b2 & 0xf) << 4)
+ | ((b3 >> 2) & 0xf));
}
encodedIndex += 3;
}
1.4 +153 -63
jakarta-commons/codec/src/java/org/apache/commons/codec/binary/Base64.java
Index: Base64.java
===================================================================
RCS file:
/home/cvs/jakarta-commons/codec/src/java/org/apache/commons/codec/binary/Base64.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- Base64.java 14 May 2003 02:40:18 -0000 1.3
+++ Base64.java 29 May 2003 23:03:28 -0000 1.4
@@ -78,25 +78,60 @@
* @author <a href="[EMAIL PROTECTED]">Daniel Rall</a>
* @author <a href="[EMAIL PROTECTED]">Martin Redington</a>
* @author <a href="mailto:[EMAIL PROTECTED]">Gary Gregory</a>
+ * @author <a href="mailto:[EMAIL PROTECTED]">Tim O'Brien</a>
* @since 1.0-dev
- *
- * @todo Add more documentation
*/
public class Base64 implements BinaryEncoder, BinaryDecoder {
- // Create constants pertaining to the chunk requirement
+ /**
+ * Chunk size according to RFC 2045
+ */
static final int CHUNK_SIZE = 76;
+
+ /**
+ * Chunk separator, we use a newline to separate chunks
+ * of encoded data (if you ask for it to be chunked)
+ */
static final byte[] CHUNK_SEPARATOR = "\n".getBytes();
- // Create numerical and byte constants
+ /**
+ * The base length
+ */
static final int BASELENGTH = 255;
+
+ /**
+ * Lookup length
+ */
static final int LOOKUPLENGTH = 64;
- static final int TWENTYFOURBITGROUP = 24;
+
+ /**
+ * Used to calculate the number of bits in a byte.
+ */
static final int EIGHTBIT = 8;
+
+ /**
+ * Used when encoding something which has fewer than 24 bits
+ */
static final int SIXTEENBIT = 16;
- static final int SIXBIT = 6;
+
+ /**
+ * Constant used to determine how many bits data contains
+ */
+ static final int TWENTYFOURBITGROUP = 24;
+
+ /**
+ * Used to get the number of Quadruples
+ */
static final int FOURBYTE = 4;
+
+ /**
+ * Used to test the sign of a byte
+ */
static final int SIGN = -128;
+
+ /**
+ * Byte used to pad output
+ */
static final byte PAD = (byte) '=';
// Create arrays to hold the base64 characters and a
@@ -141,15 +176,21 @@
private static boolean isBase64(byte octect) {
if (octect == PAD) {
return true;
- }
- else if (base64Alphabet[octect] == -1) {
+ } else if (base64Alphabet[octect] == -1) {
return false;
- }
- else {
+ } else {
return true;
}
}
+ /**
+ * This method tests a given byte array to see if it contains
+ * only valid characters within the Base64 alphabet.
+ *
+ * @param arrayOctect byte array to test
+ * @return true if all bytes are valid characters in the Base64
+ * alphabet or if the byte array is empty; false, otherwise
+ */
public static boolean isArrayByteBase64(byte[] arrayOctect) {
arrayOctect = discardWhitespace(arrayOctect);
@@ -168,15 +209,41 @@
return true;
}
-
+ /**
+ * Encodes binary data using the base64 algorithm (this
+ * does not "chunk" the output).
+ *
+ * @param binaryData binary data to encode
+ * @return Base64 characters
+ */
public static byte[] encodeBase64(byte[] binaryData) {
return (encodeBase64(binaryData, false));
}
+ /**
+ * Encodes binary data using the base64 algorithm and chunks
+ * the encoded output into 76 character blocks
+ *
+ * @param binaryData binary data to encode
+ * @return Base64 characters chunked in 76 character blocks
+ */
public static byte[] encodeBase64Chunked(byte[] binaryData) {
return (encodeBase64(binaryData, true));
}
+
+ /**
+ * Decodes an Object using the base64 algorithm. This method
+ * is provided in order to satisfy the requirements of the
+ * Decoder interface, and will throw a DecoderException if the
+ * supplied object is not of type byte[].
+ *
+ * @param pObject Object to decode
+ * @return An object (of type byte[]) containing the
+ * binary data which corresponds to the byte[] supplied.
+ * @throws DecoderException if the parameter supplied is not
+ * of type byte[]
+ */
public Object decode(Object pObject) throws DecoderException {
Object result;
@@ -186,8 +253,7 @@
"Parameter supplied to "
+ "Base64 "
+ "decode is not a byte[]");
- }
- else {
+ } else {
result = decode((byte[]) pObject);
}
@@ -195,6 +261,15 @@
}
+ /**
+ * Decodes a byte[] containing
+ * characters in the Base64 alphabet.
+ *
+ * @param pArray A byte array containing Base64 character data
+ * @return a byte array containing binary data
+ * @throws DecoderException if there is a Decoder-specific exception
+ * during the decoding process
+ */
public byte[] decode(byte[] pArray) throws DecoderException {
byte[] result;
result = decodeBase64((byte[]) pArray);
@@ -205,6 +280,8 @@
* Encodes hex octects into Base64.
*
* @param binaryData Array containing binary data to encode.
+ * @param isChunked if isChunked is true this encoder will chunk
+ * the base64 output into 76 character blocks
* @return Base64-encoded data.
*/
public static byte[] encodeBase64(byte[] binaryData, boolean isChunked) {
@@ -218,8 +295,7 @@
if (fewerThan24bits != 0) {
//data not divisible by 24 bit
encodedDataLength = (numberTriplets + 1) * 4;
- }
- else {
+ } else {
// 16 or 8 bit
encodedDataLength = numberTriplets * 4;
}
@@ -318,8 +394,7 @@
encodedData[encodedIndex + 1] = lookUpBase64Alphabet[k << 4];
encodedData[encodedIndex + 2] = PAD;
encodedData[encodedIndex + 3] = PAD;
- }
- else if (fewerThan24bits == SIXTEENBIT) {
+ } else if (fewerThan24bits == SIXTEENBIT) {
b1 = binaryData[dataIndex];
b2 = binaryData[dataIndex + 1];
@@ -360,7 +435,7 @@
/**
* Decodes Base64 data into octects
*
- * @param binaryData Byte array containing Base64 data
+ * @param base64Data Byte array containing Base64 data
* @return Array containing decoded data.
*/
public static byte[] decodeBase64(byte[] base64Data) {
@@ -391,33 +466,31 @@
}
decodedData = new byte[lastData - numberQuadruple];
}
-
+
for (int i = 0; i < numberQuadruple; i++) {
dataIndex = i * 4;
marker0 = base64Data[dataIndex + 2];
marker1 = base64Data[dataIndex + 3];
-
+
b1 = base64Alphabet[base64Data[dataIndex]];
b2 = base64Alphabet[base64Data[dataIndex + 1]];
-
+
if (marker0 != PAD && marker1 != PAD) {
//No PAD e.g 3cQl
b3 = base64Alphabet[marker0];
b4 = base64Alphabet[marker1];
-
+
decodedData[encodedIndex] = (byte) (b1 << 2 | b2 >> 4);
decodedData[encodedIndex + 1] =
(byte) (((b2 & 0xf) << 4) | ((b3 >> 2) & 0xf));
decodedData[encodedIndex + 2] = (byte) (b3 << 6 | b4);
- }
- else if (marker0 == PAD) {
+ } else if (marker0 == PAD) {
//Two PAD e.g. 3c[Pad][Pad]
decodedData[encodedIndex] = (byte) (b1 << 2 | b2 >> 4);
- }
- else if (marker1 == PAD) {
+ } else if (marker1 == PAD) {
//One PAD e.g. 3cQ[Pad]
b3 = base64Alphabet[marker0];
-
+
decodedData[encodedIndex] = (byte) (b1 << 2 | b2 >> 4);
decodedData[encodedIndex + 1] =
(byte) (((b2 & 0xf) << 4) | ((b3 >> 2) & 0xf));
@@ -426,7 +499,7 @@
}
return decodedData;
}
-
+
/**
* Discards any whitespace from a base-64 encoded block.
*
@@ -437,15 +510,15 @@
static byte[] discardWhitespace(byte[] data) {
byte groomedData[] = new byte[data.length];
int bytesCopied = 0;
-
+
for (int i = 0; i < data.length; i++) {
switch (data[i]) {
- case (byte) ' ' :
- case (byte) '\n' :
- case (byte) '\r' :
- case (byte) '\t' :
+ case (byte) ' ' :
+ case (byte) '\n' :
+ case (byte) '\r' :
+ case (byte) '\t' :
break;
- default:
+ default:
groomedData[bytesCopied++] = data[i];
}
}
@@ -456,38 +529,47 @@
return packedData;
}
-
- /**
- * Discards any characters outside of the base64 alphabet, per
- * the requirements on page 25 of RFC 2045 - "Any characters
- * outside of the base64 alphabet are to be ignored in base64
- * encoded data."
- *
- * @param data The base-64 encoded data to groom
- * @return The data, less non-base64 characters (see RFC 2045).
- */
- static byte[] discardNonBase64(byte[] data) {
- byte groomedData[] = new byte[data.length];
- int bytesCopied = 0;
-
- for (int i = 0; i < data.length; i++) {
- if( isBase64(data[i]) ) {
- groomedData[bytesCopied++] = data[i];
- }
- }
- byte packedData[] = new byte[bytesCopied];
+ /**
+ * Discards any characters outside of the base64 alphabet, per
+ * the requirements on page 25 of RFC 2045 - "Any characters
+ * outside of the base64 alphabet are to be ignored in base64
+ * encoded data."
+ *
+ * @param data The base-64 encoded data to groom
+ * @return The data, less non-base64 characters (see RFC 2045).
+ */
+ static byte[] discardNonBase64(byte[] data) {
+ byte groomedData[] = new byte[data.length];
+ int bytesCopied = 0;
+
+ for (int i = 0; i < data.length; i++) {
+ if (isBase64(data[i])) {
+ groomedData[bytesCopied++] = data[i];
+ }
+ }
- System.arraycopy(groomedData, 0, packedData, 0, bytesCopied);
+ byte packedData[] = new byte[bytesCopied];
+
+ System.arraycopy(groomedData, 0, packedData, 0, bytesCopied);
- return packedData;
- }
+ return packedData;
+ }
// Implementation of the Encoder Interface
/**
- * encode an Object
+ * Encodes an Object using the base64 algorithm. This method
+ * is provided in order to satisfy the requirements of the
+ * Encoder interface, and will throw an EncoderException if the
+ * supplied object is not of type byte[].
+ *
+ * @param pObject Object to encode
+ * @return An object (of type byte[]) containing the
+ * base64 encoded data which corresponds to the byte[] supplied.
+ * @throws EncoderException if the parameter supplied is not
+ * of type byte[]
*/
public Object encode(Object pObject) throws EncoderException {
@@ -498,8 +580,7 @@
"Parameter supplied to "
+ "Base64 "
+ "encode is not a byte[]");
- }
- else {
+ } else {
result = encode((byte[]) pObject);
}
@@ -507,6 +588,15 @@
}
+ /**
+ * Encodes a byte[] containing binary data, into a byte[] containing
+ * characters in the Base64 alphabet.
+ *
+ * @param pArray a byte array containing binary data
+ * @return A byte array containing only Base64 character data
+ * @throws EncoderException if there is an Encoder specific exception
+ * during the encoding process
+ */
public byte[] encode(byte[] pArray) throws EncoderException {
return (encodeBase64(pArray, false));
}
1.2 +20 -11
jakarta-commons/codec/src/java/org/apache/commons/codec/binary/Hex.java
Index: Hex.java
===================================================================
RCS file:
/home/cvs/jakarta-commons/codec/src/java/org/apache/commons/codec/binary/Hex.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- Hex.java 25 Apr 2003 17:50:56 -0000 1.1
+++ Hex.java 29 May 2003 23:03:28 -0000 1.2
@@ -72,11 +72,14 @@
/**
- Converts an array of bytes into an array of characters representing the
- hexidecimal values of each byte in order. The returned array will be
- double the length of the passed array, as it takes two characters to
- represent any given byte.
- */
+ * Converts an array of bytes into an array of characters representing the
+ * hexidecimal values of each byte in order. The returned array will be
+ * double the length of the passed array, as it takes two characters to
+ * represent any given byte.
+ *
+ * @param data array of byte to convert to Hex characters
+ * @return A char[] containing hexidecimal characters
+ */
public static char[] encodeHex(byte[] data) {
int l = data.length;
@@ -95,12 +98,18 @@
/**
- Converts an array of characters representing hexidecimal values into an
- array of bytes of those same values. The returned array will be half the
- length of the passed array, as it takes two characters to represent any
- given byte. An exception is thrown if the passed char array has an odd
- number of elements.
- */
+ * Converts an array of characters representing hexidecimal values into an
+ * array of bytes of those same values. The returned array will be half the
+ * length of the passed array, as it takes two characters to represent any
+ * given byte. An exception is thrown if the passed char array has an odd
+ * number of elements.
+ *
+ * @param data An array of characters containing hexidecimal digits
+ * @return A byte array containing binary data decoded from
+ * the supplied char array.
+ * @throws Exception Thrown if an odd number of characters is supplied
+ * to this function
+ */
public static byte[] decodeHex(char[] data) throws Exception {
int l = data.length;
1.2 +218 -111
jakarta-commons/codec/src/java/org/apache/commons/codec/language/Metaphone.java
Index: Metaphone.java
===================================================================
RCS file:
/home/cvs/jakarta-commons/codec/src/java/org/apache/commons/codec/language/Metaphone.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- Metaphone.java 25 Apr 2003 17:50:56 -0000 1.1
+++ Metaphone.java 29 May 2003 23:03:29 -0000 1.2
@@ -71,12 +71,29 @@
*/
public class Metaphone implements StringEncoder {
+ /**
+ * Five vowels in the English language
+ */
private String vowels = "AEIOU" ;
+
+ /**
+ * Variable used in Metaphone algorithm
+ */
private String frontv = "EIY" ;
+
+ /**
+ * Variable used in Metaphone algorithm
+ */
private String varson = "CSPTG" ;
+ /**
+ * The max code length for metaphone is 4
+ */
private int maxCodeLen = 4 ;
+ /**
+ * Creates an instance of the Metaphone encoder
+ */
public Metaphone() {
super();
}
@@ -87,14 +104,21 @@
* All input is converted to upper case.
* Limitations: Input format is expected to be a single ASCII word
* with only characters in the A - Z range, no punctuation or numbers.
+ *
+ * @param txt String to find the metaphone code for
+ * @return A metaphone code corresponding to the String supplied
*/
- public String metaphone(String txt){
+ public String metaphone(String txt) {
int mtsz = 0 ;
boolean hard = false ;
- if ((txt == null) ||
- (txt.length() == 0)) return "" ;
+ if ((txt == null)
+ || (txt.length() == 0)) {
+ return "" ;
+ }
// single character is itself
- if (txt.length() == 1) return txt.toUpperCase() ;
+ if (txt.length() == 1) {
+ return txt.toUpperCase() ;
+ }
char[] inwd = txt.toUpperCase().toCharArray() ;
@@ -102,167 +126,231 @@
StringBuffer local = new StringBuffer(40); // manipulate
StringBuffer code = new StringBuffer(10) ; // output
// handle initial 2 characters exceptions
- switch(inwd[0]){
- case 'K': case 'G' : case 'P' : /* looking for KN, etc*/
- if (inwd[1] == 'N')local.append(inwd, 1, inwd.length - 1);
- else local.append(inwd);
+ switch(inwd[0]) {
+ case 'K' :
+ case 'G' :
+ case 'P' : /* looking for KN, etc*/
+ if (inwd[1] == 'N') {
+ local.append(inwd, 1, inwd.length - 1);
+ } else {
+ local.append(inwd);
+ }
break;
case 'A': /* looking for AE */
- if (inwd[1] == 'E')local.append(inwd, 1, inwd.length - 1);
- else local.append(inwd);
+ if (inwd[1] == 'E') {
+ local.append(inwd, 1, inwd.length - 1);
+ } else {
+ local.append(inwd);
+ }
break;
case 'W' : /* looking for WR or WH */
- if (inwd[1] == 'R'){ // WR -> R
- local.append(inwd, 1, inwd.length - 1); break ;
+ if (inwd[1] == 'R') { // WR -> R
+ local.append(inwd, 1, inwd.length - 1);
+ break ;
}
- if (inwd[1] == 'H'){
+ if (inwd[1] == 'H') {
local.append(inwd, 1, inwd.length - 1);
- local.setCharAt(0,'W'); // WH -> W
+ local.setCharAt(0, 'W'); // WH -> W
+ } else {
+ local.append(inwd);
}
- else local.append(inwd);
break;
case 'X' : /* initial X becomes S */
- inwd[0] = 'S' ;local.append(inwd);
+ inwd[0] = 'S';
+ local.append(inwd);
break ;
default :
local.append(inwd);
} // now local has working string with initials fixed
+
int wdsz = local.length();
int n = 0 ;
- while((mtsz < maxCodeLen) && // max code size of 4 works well
- (n < wdsz)){
+
+ while ((mtsz < maxCodeLen) // max code size of 4 works well
+ && (n < wdsz)) {
char symb = local.charAt(n) ;
// remove duplicate letters except C
- if ((symb != 'C') &&
- (n > 0) && (local.charAt(n - 1) == symb)) n++ ;
- else{ // not dup
- switch(symb){
+ if ((symb != 'C')
+ && (n > 0) && (local.charAt(n - 1) == symb)) {
+ n++ ;
+ } else { // not dup
+ switch(symb) {
case 'A' : case 'E' : case 'I' : case 'O' : case 'U' :
- if (n == 0) { code.append(symb);mtsz++;
+ if (n == 0) {
+ code.append(symb);
+ mtsz++;
}
break ; // only use vowel if leading char
case 'B' :
- if ((n > 0) &&
- !(n + 1 == wdsz) && // not MB at end of word
- (local.charAt(n - 1) == 'M')) {
+ if ((n > 0)
+ && !(n + 1 == wdsz) // not MB at end of word
+ && (local.charAt(n - 1) == 'M')) {
+ code.append(symb);
+ } else {
code.append(symb);
}
- else code.append(symb);
- mtsz++ ;
- break ;
+ mtsz++;
+ break;
case 'C' : // lots of C special cases
/* discard if SCI, SCE or SCY */
- if ((n > 0) &&
- (local.charAt(n-1) == 'S') &&
- (n + 1 < wdsz) &&
- (frontv.indexOf(local.charAt(n + 1)) >= 0)){ break ;}
+ if ((n > 0)
+ && (local.charAt(n - 1) == 'S')
+ && (n + 1 < wdsz)
+ && (frontv.indexOf(local.charAt(n + 1)) >= 0)) {
+ break ;
+ }
tmpS = local.toString();
if (tmpS.indexOf("CIA", n) == n) { // "CIA" -> X
code.append('X'); mtsz++; break ;
}
- if ((n + 1 < wdsz) &&
- (frontv.indexOf(local.charAt(n+1))>= 0)){
- code.append('S');mtsz++; break ; // CI,CE,CY -> S
- }
- if ((n > 0) &&
- (tmpS.indexOf("SCH",n-1)== n-1)){ // SCH->sk
- code.append('K') ; mtsz++;break ;
- }
- if (tmpS.indexOf("CH", n) == n){ // detect CH
- if ((n == 0) &&
- (wdsz >= 3) && // CH consonant -> K consonant
- (vowels.indexOf(local.charAt(2)) < 0)){
+ if ((n + 1 < wdsz)
+ && (frontv.indexOf(local.charAt(n + 1)) >= 0)) {
+ code.append('S');
+ mtsz++;
+ break ; // CI,CE,CY -> S
+ }
+ if ((n > 0)
+ && (tmpS.indexOf("SCH", n - 1) == n - 1)) { // SCH->sk
+ code.append('K') ;
+ mtsz++;
+ break ;
+ }
+ if (tmpS.indexOf("CH", n) == n) { // detect CH
+ if ((n == 0)
+ && (wdsz >= 3) // CH consonant -> K consonant
+ && (vowels.indexOf(local.charAt(2)) < 0)) {
code.append('K');
- }
- else { code.append('X'); // CHvowel -> X
+ } else {
+ code.append('X'); // CHvowel -> X
}
mtsz++;
- }
- else { code.append('K');mtsz++;
+ } else {
+ code.append('K');
+ mtsz++;
}
break ;
case 'D' :
- if ((n + 2 < wdsz)&& // DGE DGI DGY -> J
- (local.charAt(n+1) == 'G')&&
- (frontv.indexOf(local.charAt(n+2))>= 0)){
+ if ((n + 2 < wdsz) // DGE DGI DGY -> J
+ && (local.charAt(n + 1) == 'G')
+ && (frontv.indexOf(local.charAt(n + 2)) >= 0)) {
code.append('J'); n += 2 ;
- }
- else { code.append('T');
+ } else {
+ code.append('T');
}
mtsz++;
break ;
case 'G' : // GH silent at end or before consonant
- if ((n + 2 == wdsz)&&
- (local.charAt(n+1) == 'H')) break ;
- if ((n + 2 < wdsz) &&
- (local.charAt(n+1) == 'H')&&
- (vowels.indexOf(local.charAt(n+2)) < 0)) break ;
+ if ((n + 2 == wdsz)
+ && (local.charAt(n + 1) == 'H')) {
+ break;
+ }
+ if ((n + 2 < wdsz)
+ && (local.charAt(n + 1) == 'H')
+ && (vowels.indexOf(local.charAt(n + 2)) < 0)) {
+ break;
+ }
tmpS = local.toString();
- if ((n > 0) &&
- (tmpS.indexOf("GN", n) == n)||
- (tmpS.indexOf("GNED",n) == n)) break ; // silent G
- if ((n > 0) &&
- (local.charAt(n-1) == 'G')) hard = true ;
- else hard = false ;
- if ((n+1 < wdsz) &&
- (frontv.indexOf(local.charAt(n+1)) >= 0)&&
- (!hard)) code.append('J');
- else code.append('K');
+ if ((n > 0)
+ && (tmpS.indexOf("GN", n) == n)
+ || (tmpS.indexOf("GNED", n) == n)) {
+ break; // silent G
+ }
+ if ((n > 0)
+ && (local.charAt(n - 1) == 'G')) {
+ hard = true ;
+ } else {
+ hard = false ;
+ }
+ if ((n + 1 < wdsz)
+ && (frontv.indexOf(local.charAt(n + 1)) >= 0)
+ && (!hard)) {
+ code.append('J');
+ } else {
+ code.append('K');
+ }
mtsz++;
break ;
case 'H':
- if (n + 1 == wdsz) break ; // terminal H
- if ((n > 0) &&
- (varson.indexOf(local.charAt(n-1)) >= 0)) break ;
- if (vowels.indexOf(local.charAt(n+1)) >=0){
- code.append('H') ; mtsz++;// Hvowel
+ if (n + 1 == wdsz) {
+ break ; // terminal H
}
+ if ((n > 0)
+ && (varson.indexOf(local.charAt(n - 1)) >= 0)) {
+ break;
+ }
+ if (vowels.indexOf(local.charAt(n + 1)) >= 0) {
+ code.append('H');
+ mtsz++;// Hvowel
+ }
+ break;
+ case 'F':
+ case 'J' :
+ case 'L' :
+ case 'M':
+ case 'N' :
+ case 'R' :
+ code.append(symb);
+ mtsz++;
break;
- case 'F': case 'J' : case 'L' :
- case 'M': case 'N' : case 'R' :
- code.append(symb); mtsz++; break ;
case 'K' :
- if (n > 0){ // not initial
- if (local.charAt(n -1) != 'C') {
+ if (n > 0) { // not initial
+ if (local.charAt(n - 1) != 'C') {
code.append(symb);
}
+ } else {
+ code.append(symb); // initial K
}
- else code.append(symb); // initial K
mtsz++ ;
break ;
case 'P' :
- if ((n + 1 < wdsz) && // PH -> F
- (local.charAt(n+1) == 'H'))code.append('F');
- else code.append(symb);
+ if ((n + 1 < wdsz)
+ && (local.charAt(n + 1) == 'H')) {
+ // PH -> F
+ code.append('F');
+ } else {
+ code.append(symb);
+ }
mtsz++;
break ;
case 'Q' :
- code.append('K');mtsz++; break ;
+ code.append('K');
+ mtsz++;
+ break;
case 'S' :
tmpS = local.toString();
- if ((tmpS.indexOf("SH", n)== n) ||
- (tmpS.indexOf("SIO",n)== n) ||
- (tmpS.indexOf("SIA",n)== n)) code.append('X');
- else code.append('S');
- mtsz++ ;
- break ;
+ if ((tmpS.indexOf("SH", n) == n)
+ || (tmpS.indexOf("SIO", n) == n)
+ || (tmpS.indexOf("SIA", n) == n)) {
+ code.append('X');
+ } else {
+ code.append('S');
+ }
+ mtsz++;
+ break;
case 'T' :
tmpS = local.toString(); // TIA TIO -> X
- if ((tmpS.indexOf("TIA",n)== n)||
- (tmpS.indexOf("TIO",n)== n)){
- code.append('X'); mtsz++; break;
+ if ((tmpS.indexOf("TIA", n) == n)
+ || (tmpS.indexOf("TIO", n) == n)) {
+ code.append('X');
+ mtsz++;
+ break;
+ }
+ if (tmpS.indexOf("TCH", n) == n) {
+ break;
}
- if (tmpS.indexOf("TCH",n)==n) break;
// substitute numeral 0 for TH (resembles theta after all)
- if (tmpS.indexOf("TH", n)==n) code.append('0');
- else code.append('T');
+ if (tmpS.indexOf("TH", n) == n) {
+ code.append('0');
+ } else {
+ code.append('T');
+ }
mtsz++ ;
break ;
case 'V' :
code.append('F'); mtsz++;break ;
case 'W' : case 'Y' : // silent if not followed by vowel
- if ((n+1 < wdsz) &&
- (vowels.indexOf(local.charAt(n + 1)) >= 0)) {
+ if ((n + 1 < wdsz)
+ && (vowels.indexOf(local.charAt(n + 1)) >= 0)) {
code.append(symb);
mtsz++;
}
@@ -278,29 +366,52 @@
if (mtsz > 4) { code.setLength(4); }
}
return code.toString();
- } // end static method metaPhone()
+ }
+
+ /**
+ * Encodes an Object using the metaphone algorithm. This method
+ * is provided in order to satisfy the requirements of the
+ * Encoder interface, and will throw an EncoderException if the
+ * supplied object is not of type java.lang.String.
+ *
+ * @param pObject Object to encode
+ * @return An object (of type java.lang.String) containing the
+ * metaphone code which corresponds to the String supplied.
+ * @throws EncoderException if the parameter supplied is not
+ * of type java.lang.String
+ */
public Object encode(Object pObject) throws EncoderException {
Object result;
-
if (!(pObject instanceof java.lang.String)) {
throw new EncoderException("Parameter supplied to Metaphone "
+ "encode is not of type "
+ "java.lang.String");
- }
- else {
+ } else {
result = metaphone((String) pObject);
}
-
return result;
}
+ /**
+ * Encodes a String using the Metaphone algorithm.
+ *
+ * @param pString String object to encode
+ * @return The metaphone code corresponding to the String supplied
+ * @throws EncoderException thrown if a Metaphone specific exception
+ * is encountered.
+ */
public String encode(String pString) throws EncoderException {
return (metaphone(pString));
}
/**
- * Are the metaphones of two strings the same.
+ * Tests if the metaphones of two strings are identical.
+ *
+ * @param str1 First of two strings to compare
+ * @param str2 Second of two strings to compare
+ * @return true if the metaphones of these strings are identical,
+ * false otherwise.
*/
public boolean isMetaphoneEqual(String str1, String str2) {
return metaphone(str1).equals(metaphone(str2));
@@ -310,16 +421,12 @@
* Returns the maxCodeLen.
* @return int
*/
- public int getMaxCodeLen() {
- return maxCodeLen;
- }
+ public int getMaxCodeLen() { return maxCodeLen; }
/**
* Sets the maxCodeLen.
* @param maxCodeLen The maxCodeLen to set
*/
- public void setMaxCodeLen(int maxCodeLen) {
- this.maxCodeLen = maxCodeLen;
- }
+ public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
}
1.5 +63 -16
jakarta-commons/codec/src/java/org/apache/commons/codec/language/RefinedSoundex.java
Index: RefinedSoundex.java
===================================================================
RCS file:
/home/cvs/jakarta-commons/codec/src/java/org/apache/commons/codec/language/RefinedSoundex.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- RefinedSoundex.java 29 May 2003 21:14:59 -0000 1.4
+++ RefinedSoundex.java 29 May 2003 23:03:29 -0000 1.5
@@ -68,25 +68,55 @@
*/
public class RefinedSoundex implements StringEncoder {
+ /**
+ * RefinedSoundex is *refined* for a number of
+ * reasons, one being that the mappings have been
+ * altered. This implementation contains default
+ * mappings for US English.
+ */
public static final char[] US_ENGLISH_MAPPING =
"01360240043788015936020505".toCharArray();
+ /**
+ * This static variable contains an instance of the
+ * RefinedSoundex using the US_ENGLISH mapping.
+ */
public static final RefinedSoundex US_ENGLISH = new RefinedSoundex();
+ /**
+ * Every letter of the alphabet is "mapped" to a numerical
+ * value. This char array holds the values to which each
+ * letter is mapped. This implementation contains a default
+ * map for US_ENGLISH
+ */
private char[] soundexMapping;
+ /**
+ * Creates an instance of the RefinedSoundex object using the
+ * default US English mapping.
+ */
public RefinedSoundex() {
this(US_ENGLISH_MAPPING);
}
+ /**
+ * Creates a refined soundex instance using a custom mapping. This
+ * constructor can be used to customize the mapping, and/or possibly
+ * provide an internationalized mapping for a non-Western character
+ * set.
+ *
+ * @param mapping Mapping array to use when finding the corresponding
+ * code for a given character
+ */
public RefinedSoundex(char[] mapping) {
this.soundexMapping = mapping;
}
/**
- * Get the SoundEx value of a string.
- * This implementation is taken from the code-snippers on
- * http://www.sourceforge.net/
+ * Retrieves the Refined Soundex code for a given String object.
+ *
+ * @param str String to encode using the Refined Soundex algorithm
+ * @return A soundex code for the String supplied
*/
public String soundex(String str) {
if (null == str || str.length() == 0) { return str; }
@@ -104,8 +134,7 @@
current = getMappingCode(str.charAt(i));
if (current == last) {
continue;
- }
- else if (current != 0) {
+ } else if (current != 0) {
sBuf.append(current);
}
@@ -116,37 +145,55 @@
return sBuf.toString();
}
+ /**
+ * Encodes a String using the refined soundex algorithm.
+ *
+ * @param pString A String object to encode
+ * @return A Soundex code corresponding to the String supplied
+ * @throws EncoderException throws exception if there is an
+ * encoding-specific problem
+ */
public String encode(String pString) throws EncoderException {
return (soundex(pString));
}
+ /**
+ * Encodes an Object using the refined soundex algorithm. This method
+ * is provided in order to satisfy the requirements of the
+ * Encoder interface, and will throw an EncoderException if the
+ * supplied object is not of type java.lang.String.
+ *
+ * @param pObject Object to encode
+ * @return An object (of type java.lang.String) containing the
+ * refined soundex code which corresponds to the String supplied.
+ * @throws EncoderException if the parameter supplied is not
+ * of type java.lang.String
+ */
public Object encode(Object pObject) throws EncoderException {
-
Object result;
-
if (!(pObject instanceof java.lang.String)) {
throw new EncoderException("Parameter supplied to "
+ "RefinedSoundex "
+ "encode is not of type "
+ "java.lang.String");
- }
- else {
+ } else {
result = soundex((String) pObject);
}
-
return result;
-
}
-
/**
- * Used internally by the SoundEx algorithm.
+ * Returns the mapping code for a given character. The mapping
+ * codes are maintained in an internal char array named soundexMapping,
+ * and the default values of these mappings are US English.
+ *
+ * @param c char to get mapping for
+ * @return A character (really a numeral) to return for the given char
*/
private char getMappingCode(char c) {
if (!Character.isLetter(c)) {
return 0;
- }
- else {
+ } else {
return soundexMapping[Character.toUpperCase(c) - 'A'];
}
}
1.4 +62 -15
jakarta-commons/codec/src/java/org/apache/commons/codec/language/Soundex.java
Index: Soundex.java
===================================================================
RCS file:
/home/cvs/jakarta-commons/codec/src/java/org/apache/commons/codec/language/Soundex.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- Soundex.java 12 May 2003 17:17:24 -0000 1.3
+++ Soundex.java 29 May 2003 23:03:29 -0000 1.4
@@ -61,39 +61,66 @@
* relate similar names, but can also be used as a general purpose
* scheme to find word with similar phonemes.
*
- * <!-- This link is broken: -->
- * <!-- @see <a
href="http://www.bluepoof.com/Soundex/info2.html">http://www.bluepoof.com/Soundex/info2.html</a>
-->
- *
* @author [EMAIL PROTECTED]
* @author [EMAIL PROTECTED]
* @author [EMAIL PROTECTED]
* @version $Revision$ $Date$
- *
- * @todo Internationalize Exception Messages
*/
public class Soundex implements StringEncoder {
+ /**
+ * This is a default mapping of the 26 letters used
+ * in US English.
+ */
public static final char[] US_ENGLISH_MAPPING =
"01230120022455012623010202".toCharArray();
+ /**
+ * This static variable contains an instance of the
+ * Soundex using the US_ENGLISH mapping.
+ */
public static final Soundex US_ENGLISH = new Soundex();
+ /**
+ * Every letter of the alphabet is "mapped" to a numerical
+ * value. This char array holds the values to which each
+ * letter is mapped. This implementation contains a default
+ * map for US_ENGLISH
+ */
private char[] soundexMapping;
- private int maxLength = 4;
+ /**
+ * The maximum length of a Soundex code - Soundex codes are
+ * only four characters by definition.
+ */
+ private int maxLength = 4;
+ /**
+ * Creates an instance of the Soundex object using the default
+ * US_ENGLISH mapping.
+ */
public Soundex() {
this(US_ENGLISH_MAPPING);
}
+ /**
+ * Creates a soundex instance using a custom mapping. This
+ * constructor can be used to customize the mapping, and/or possibly
+ * provide an internationalized mapping for a non-Western character
+ * set.
+ *
+ * @param mapping Mapping array to use when finding the corresponding
+ * code for a given character
+ */
public Soundex(char[] mapping) {
this.soundexMapping = mapping;
}
/**
- * Get the SoundEx value of a string.
- * This implementation is taken from the code-snippers on
- * http://www.sourceforge.net/
+ * Retrieves the Soundex code for a given String object.
+ *
+ * @param str String to encode using the Soundex algorithm
+ * @return A soundex code for the String supplied
*/
public String soundex(String str) {
if (null == str || str.length() == 0) { return str; }
@@ -114,6 +141,18 @@
return new String(out);
}
+ /**
+ * Encodes an Object using the soundex algorithm. This method
+ * is provided in order to satisfy the requirements of the
+ * Encoder interface, and will throw an EncoderException if the
+ * supplied object is not of type java.lang.String.
+ *
+ * @param pObject Object to encode
+ * @return An object (of type java.lang.String) containing the
+ * soundex code which corresponds to the String supplied.
+ * @throws EncoderException if the parameter supplied is not
+ * of type java.lang.String
+ */
public Object encode(Object pObject) throws EncoderException {
Object result;
@@ -123,8 +162,7 @@
+ "Soundex "
+ "encode is not of type "
+ "java.lang.String");
- }
- else {
+ } else {
result = soundex((String) pObject);
}
@@ -132,19 +170,28 @@
}
-
+ /**
+ * Encodes a String using the soundex algorithm.
+ *
+ * @param pString A String object to encode
+ * @return A Soundex code corresponding to the String supplied
+ * @throws EncoderException throws exception if there is an
+ * encoding-specific problem
+ */
public String encode(String pString) throws EncoderException {
return (soundex(pString));
}
/**
* Used internally by the SoundEx algorithm.
+ *
+ * @param c character to use to retrieve mapping code
+ * @return Mapping code for a particular character
*/
private char getMappingCode(char c) {
if (!Character.isLetter(c)) {
return 0;
- }
- else {
+ } else {
return soundexMapping[Character.toUpperCase(c) - 'A'];
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]