Author: tn
Date: Fri Mar 9 18:22:09 2012
New Revision: 1298958
URL: http://svn.apache.org/viewvc?rev=1298958&view=rev
Log:
[CODEC-63] Merged duplicate unit tests, added algorithm outline to class
description
Modified:
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Nysiis.java
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java
Modified:
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Nysiis.java
URL:
http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Nysiis.java?rev=1298958&r1=1298957&r2=1298958&view=diff
==============================================================================
---
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Nysiis.java
(original)
+++
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Nysiis.java
Fri Mar 9 18:22:09 2012
@@ -27,11 +27,42 @@ import org.apache.commons.codec.StringEn
*
* Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate
similar names, but can also be used as a
* general purpose scheme to find words with similar phonemes.
- *
+ *
* <p>
* NYSIIS features an accuracy increase of 2.7% over the traditional Soundex
algorithm.
* </p>
- *
+ *
+ * <p>Algorithm description:
+ * <pre>
+ * 1. Transcode first characters of name
+ * 1a. MAC -> MCC
+ * 1b. KN -> NN
+ * 1c. K -> C
+ * 1d. PH -> FF
+ * 1e. PF -> FF
+ * 1f. SCH -> SSS
+ * 2. Transcode last characters of name
+ * 2a. EE, IE -> Y
+ * 2b. DT,RT,RD,NT,ND -> D
+ * 3. First character of key = first character of name
+ * 4. Transcode remaining characters by following these rules, incrementing by
one character each time
+ * 4a. EV -> AF else A,E,I,O,U -> A
+ * 4b. Q -> G
+ * 4c. Z -> S
+ * 4d. M -> N
+ * 4e. KN -> N else K -> C
+ * 4f. SCH -> SSS
+ * 4g. PH -> FF
+ * 4h. H -> If previous or next is nonvowel, previous
+ * 4i. W -> If previous is vowel, previous
+ * 4j. Add current to key if current != last key character
+ * 5. If last character is S, remove it
+ * 6. If last characters are AY, replace with Y
+ * 7. If last character is A, remove it
+ * 8. Collapse all strings of repeated characters
+ * 9. Add original first character of name as first character of key
+ * </pre></p>
+ *
* @see <a
href="http://en.wikipedia.org/wiki/NYSIIS">http://en.wikipedia.org/wiki/NYSIIS</a>
* @see <a
href="http://www.dropby.com/NYSIIS.html">http://www.dropby.com/NYSIIS.html</a>
* @see Soundex
@@ -39,24 +70,24 @@ import org.apache.commons.codec.StringEn
*/
public class Nysiis implements StringEncoder {
- private static final char[] CHARS_A = new char[] { 'A' };
- private static final char[] CHARS_AF = new char[] { 'A', 'F' };
- private static final char[] CHARS_C = new char[] { 'C' };
- private static final char[] CHARS_FF = new char[] { 'F', 'F' };
- private static final char[] CHARS_G = new char[] { 'G' };
- private static final char[] CHARS_N = new char[] { 'N' };
- private static final char[] CHARS_NN = new char[] { 'N', 'N' };
- private static final char[] CHARS_S = new char[] { 'S' };
+ private static final char[] CHARS_A = new char[] { 'A' };
+ private static final char[] CHARS_AF = new char[] { 'A', 'F' };
+ private static final char[] CHARS_C = new char[] { 'C' };
+ private static final char[] CHARS_FF = new char[] { 'F', 'F' };
+ private static final char[] CHARS_G = new char[] { 'G' };
+ private static final char[] CHARS_N = new char[] { 'N' };
+ private static final char[] CHARS_NN = new char[] { 'N', 'N' };
+ private static final char[] CHARS_S = new char[] { 'S' };
private static final char[] CHARS_SSS = new char[] { 'S', 'S', 'S' };
-
- private static final Pattern PAT_MAC = Pattern.compile("^MAC");
- private static final Pattern PAT_KN = Pattern.compile("^KN");
- private static final Pattern PAT_K = Pattern.compile("^K");
- private static final Pattern PAT_PH_PF = Pattern.compile("^(PH|PF)");
- private static final Pattern PAT_SCH = Pattern.compile("^SCH");
- private static final Pattern PAT_EE_IE = Pattern.compile("(EE|IE)$");
+
+ private static final Pattern PAT_MAC = Pattern.compile("^MAC");
+ private static final Pattern PAT_KN = Pattern.compile("^KN");
+ private static final Pattern PAT_K = Pattern.compile("^K");
+ private static final Pattern PAT_PH_PF = Pattern.compile("^(PH|PF)");
+ private static final Pattern PAT_SCH = Pattern.compile("^SCH");
+ private static final Pattern PAT_EE_IE = Pattern.compile("(EE|IE)$");
private static final Pattern PAT_DT_ETC =
Pattern.compile("(DT|RT|RD|NT|ND)$");
-
+
private static final char SPACE = ' ';
private static final int TRUE_LENGTH = 6;
Modified:
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java
URL:
http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java?rev=1298958&r1=1298957&r2=1298958&view=diff
==============================================================================
---
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java
(original)
+++
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java
Fri Mar 9 18:22:09 2012
@@ -17,9 +17,6 @@
package org.apache.commons.codec.language;
-import java.util.Arrays;
-import java.util.List;
-
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;
import org.apache.commons.codec.StringEncoderAbstractTest;
@@ -83,100 +80,37 @@ public class NysiisTest extends StringEn
this.encodeAll(new String[] { "Dane", "Dean", "Dionne" }, "DAN");
}
- @Test
- public void testDropBy() throws EncoderException {
- this.assertEncodings(
- new String[] { "MACINTOSH", "MCANT" },
- new String[] { "KNUTH", "NAT" },
- new String[] { "KOEHN", "CAN" },
- new String[] { "PHILLIPSON", "FALAPSAN" },
- new String[] { "PFEISTER", "FASTAR" },
- new String[] { "MCKEE", "MCY" },
- new String[] { "MACKIE", "MCY" },
- new String[] { "HEITSCHMIDT", "HATSNAD" },
- new String[] { "BART", "BAD" },
- new String[] { "HURD", "HAD" },
- new String[] { "HUNT", "HAD" },
- new String[] { "WESTERLUND", "WASTARLAD" },
- new String[] { "CASSTEVENS", "CASTAFAN" },
- new String[] { "VASQUEZ", "VASG" },
- new String[] { "FRAZIER", "FRASAR" },
- new String[] { "BOWMAN", "BANAN" },
- new String[] { "RICKERT", "RACAD" },
- new String[] { "DEUTSCH", "DAT" },
- new String[] { "WESTPHAL", "WASTFAL" },
- new String[] { "SHRIVER", "SRAVAR" },
- new String[] { "KUHL", "CAL" },
- new String[] { "RAWSON", "RASAN" },
- new String[] { "JILES", "JAL" },
- new String[] { "CARRAWAY", "CARY" },
- new String[] { "YAMADA", "YANAD" });
- }
-
/**
- * Tests data gathered from around the internets.
+ * Tests data gathered from around the internet.
*
+ * @see <a
href="http://www.dropby.com/NYSIISTextStrings.html">http://www.dropby.com/NYSIISTextStrings.html</a>
* @throws EncoderException
*/
@Test
- public void testDropBy2() throws EncoderException {
- // Explanation of differences between this implementation and the one
at dropby.com.
- //
- // Algorithm (taken from www.dropby.com/NYSIIS.html):
- //
- // 1. Transcode first characters of name:
- // MAC > MCC
- // KN > NN
- // K > C
- // PH > FF
- // PF > FF
- // SCH > SSS
- //
- // 2. Transcode last characters of name:
- // EE, IE > Y
- // DT,RT,RD,NT,ND > D
- //
- // 3. First character of key = first character of name.
- //
- // 4. Transcode remaining characters by following these rules,
incrementing by one character each time:
- // 4a. EV > AF else A,E,I,O,U > A
- // 4b. Q > G
- // 4c. Z > S
- // 4d. M > N
- // 4e. KN > N else K > C
- // 4f. SCH > SSS
- // 4g. PH > FF
- // 4h. H > If previous or next is nonvowel, previous
- // 4i. W > If previous is vowel, previous
- // 4j. Add current to key if current != last key character
- //
- // 5. If last character is S, remove it
- // 6. If last characters are AY, replace with Y
- // 7. If last character is A, remove it
- // 8. Collapse all strings of repeated characters
- // 9. Add original first character of name as first character of key
+ public void testDropBy() throws EncoderException {
+ // Explanation of differences between this implementation and the one
at dropby.com is
+ // prepended to the test string. The referenced rules refer to the
steps outlined in the
+ // class description for Nysiis.
this.assertEncodings(
- // http://www.dropby.com/indexLF.html?content=/NYSIIS.html
// 1. Transcode first characters of name
new String[] { "MACINTOSH", "MCANT" },
// violates 4j: the second N should not be added, as the first
// key char is already a N
- new String[] { "KNUTH", "NAT" }, // Original: NNAT; modified:
NATH
+ new String[] { "KNUTH", "NAT" }, // Original: NNAT;
modified: NATH
// O and E are transcoded to A because of rule 4a
// H also to A because of rule 4h
// the N gets mysteriously lost, maybe because of a wrongly
implemented rule 4h
// that skips the next char in such a case?
// the remaining A is removed because of rule 7
- new String[] { "KOEHN", "CAN" }, // Original: C
+ new String[] { "KOEHN", "CAN" }, // Original: C
// violates 4j: see also KNUTH
new String[] { "PHILLIPSON", "FALAPSAN" }, // Original:
FFALAP[SAN]
// violates 4j: see also KNUTH
- new String[] { "PFEISTER", "FASTAR" }, // Original: FFASTA[R]
+ new String[] { "PFEISTER", "FASTAR" }, // Original:
FFASTA[R]
// violates 4j: see also KNUTH
- new String[] { "SCHOENHOEFT", "SANAFT" }, // Original:
SSANAF[T]
- // http://www.dropby.com/indexLF.html?content=/NYSIIS.html
- // 2.Transcode last characters of name:
+ new String[] { "SCHOENHOEFT", "SANAFT" }, // Original:
SSANAF[T]
+ // 2. Transcode last characters of name:
new String[] { "MCKEE", "MCY" },
new String[] { "MACKIE", "MCY" },
new String[] { "HEITSCHMIDT", "HATSNAD" },
@@ -184,8 +118,8 @@ public class NysiisTest extends StringEn
new String[] { "HURD", "HAD" },
new String[] { "HUNT", "HAD" },
new String[] { "WESTERLUND", "WASTARLAD" },
- // http://www.dropby.com/indexLF.html?content=/NYSIIS.html
- // 4. Transcode remaining characters by following these rules,
incrementing by one character each time:
+ // 4. Transcode remaining characters by following these rules,
+ // incrementing by one character each time:
new String[] { "CASSTEVENS", "CASTAFAN" },
new String[] { "VASQUEZ", "VASG" },
new String[] { "FRAZIER", "FRASAR" },
@@ -195,18 +129,18 @@ public class NysiisTest extends StringEn
// violates 5: the last S is not removed
// when comparing to DEUTS, which is phonetically similar
// the result is also DAT, which is correct for DEUTSCH too imo
- new String[] { "DEUTSCH", "DAT" }, // Original: DATS
+ new String[] { "DEUTSCH", "DAT" }, // Original: DATS
new String[] { "WESTPHAL", "WASTFAL" },
// violates 4h: the H should be transcoded to S and thus
ignored as
// the first key character is also S
- new String[] { "SHRIVER", "SRAVAR" }, // Original: SHRAVA[R]
+ new String[] { "SHRIVER", "SRAVAR" }, // Original:
SHRAVA[R]
// same as KOEHN, the L gets mysteriously lost
- new String[] { "KUHL", "CAL" }, // Original: C
+ new String[] { "KUHL", "CAL" }, // Original: C
new String[] { "RAWSON", "RASAN" },
// If last character is S, remove it
new String[] { "JILES", "JAL" },
// violates 6: if the last two characters are AY, remove A
- new String[] { "CARRAWAY", "CARY" }, // Original: CARAY
+ new String[] { "CARRAWAY", "CARY" }, // Original: CARAY
new String[] { "YAMADA", "YANAD" });
}