Author: tn
Date: Thu Mar 8 20:56:35 2012
New Revision: 1298576
URL: http://svn.apache.org/viewvc?rev=1298576&view=rev
Log:
[CODEC-63] Added explanation for different results to dropby.com, Raised CC to
100/100
Modified:
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java
Modified:
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java
URL:
http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java?rev=1298576&r1=1298575&r2=1298576&view=diff
==============================================================================
---
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java
(original)
+++
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java
Thu Mar 8 20:56:35 2012
@@ -49,6 +49,15 @@ public class NysiisTest extends StringEn
}
@Test
+ public void testTrueVariant() {
+ Nysiis encoder = new Nysiis(true);
+
+ String encoded = encoder.encode("WESTERLUND");
+ Assert.assertTrue(encoded.length() <= 6);
+ Assert.assertEquals("WASTAR", encoded);
+ }
+
+ @Test
public void testBran() throws EncoderException {
encodeAll(new String[] { "Brian", "Brown", "Brun" }, "BRAN");
}
@@ -71,6 +80,17 @@ public class NysiisTest extends StringEn
}
@Test
+ public void testSpecialBranches() throws EncoderException {
+ this.encodeAll(new String[] { "Kobwick" }, "CABWAC");
+ this.encodeAll(new String[] { "Kocher" }, "CACAR");
+ this.encodeAll(new String[] { "Fesca" }, "FASC");
+ this.encodeAll(new String[] { "Shom" }, "SAN");
+ this.encodeAll(new String[] { "Ohlo" }, "OL");
+ this.encodeAll(new String[] { "Uhu" }, "UH");
+ this.encodeAll(new String[] { "Um" }, "UN");
+ }
+
+ @Test
public void testDropBy() throws EncoderException {
List<String[]> testValues =
Arrays.asList(
@@ -112,16 +132,62 @@ public class NysiisTest extends StringEn
*/
@Test
public void testDropBy2() throws EncoderException {
+ // Explanation of differences between this implementation and the one
at dropby.com.
+ //
+ // Algorithm (taken from www.dropby.com/NYSIIS.html):
+ //
+ // 1. Transcode first characters of name:
+ // MAC » MCC
+ // KN » NN
+ // K » C
+ // PH » FF
+ // PF » FF
+ // SCH » SSS
+ //
+ // 2. Transcode last characters of name:
+ // EE, IE » Y
+ // DT,RT,RD,NT,ND » D
+ //
+ // 3. First character of key = first character of name.
+ //
+ // 4. Transcode remaining characters by following these rules,
incrementing by one character each time:
+ // 4a. EV » AF else A,E,I,O,U » A
+ // 4b. Q » G
+ // 4c. Z » S
+ // 4d. M » N
+ // 4e. KN » N else K » C
+ // 4f. SCH » SSS
+ // 4g. PH » FF
+ // 4h. H » If previous or next is nonvowel, previous
+ // 4i. W » If previous is vowel, previous
+ // 4j. Add current to key if current != last key character
+ //
+ // 5. If last character is S, remove it
+ // 6. If last characters are AY, replace with Y
+ // 7. If last character is A, remove it
+ // 8. Collapse all strings of repeated characters
+ // 9. Add original first character of name as first character of key
+
List<String[]> testValues =
Arrays.asList(
//
http://www.dropby.com/indexLF.html?content=/NYSIIS.html
// 1. Transcode first characters of name
new String[] { "MACINTOSH", "MCANT" },
- //new String[] { "KNUTH", "NNATH" }, // Original:
NNAT; modified: NATH
- //new String[] { "KOEHN", "C" },
- //new String[] { "PHILLIPSON", "FFALAP" },
- //new String[] { "PFEISTER", "FFASTA" },
- //new String[] { "SCHOENHOEFT", "SSANAF" },
+ // violates 4j: the second N should not be added, as
the first
+ // key char is already an N
+ new String[] { "KNUTH", "NAT" }, // Original: NNAT;
modified: NATH
+ // O and E are transcoded to A because of rule 4a
+ // H also to A because of rule 4h
+ // the N gets mysteriously lost, maybe because of a
wrongly implemented rule 4h
+ // that skips the next char in such a case?
+ // the remaining A is removed because of rule 7
+ new String[] { "KOEHN", "CAN" }, // Original: C
+ // violates 4j: see also KNUTH
+ new String[] { "PHILLIPSON", "FALAPSAN" }, //
Original: FFALAP[SAN]
+ // violates 4j: see also KNUTH
+ new String[] { "PFEISTER", "FASTAR" }, // Original:
FFASTA[R]
+ // violates 4j: see also KNUTH
+ new String[] { "SCHOENHOEFT", "SANAFT" }, // Original:
SSANAF[T]
//
http://www.dropby.com/indexLF.html?content=/NYSIIS.html
// 2.Transcode last characters of name:
new String[] { "MCKEE", "MCY" },
@@ -139,14 +205,21 @@ public class NysiisTest extends StringEn
new String[] { "BOWMAN", "BANAN" },
new String[] { "MCKNIGHT", "MCNAGT" },
new String[] { "RICKERT", "RACAD" },
- //new String[] { "DEUTSCH", "DATS" },
+ // violates 5: the last S is not removed
+ // when comparing to DEUTS, which is phonetically
similar
+ // the result is also DAT, which is correct for
DEUTSCH too imo
+ new String[] { "DEUTSCH", "DAT" }, // Original: DATS
new String[] { "WESTPHAL", "WASTFAL" },
- //new String[] { "SHRIVER", "SHRAVA" },
- //new String[] { "KUHL", "C" },
+ // violates 4h: the H should be transcoded to S and
thus ignored as
+ // the first key character is also S
+ new String[] { "SHRIVER", "SRAVAR" }, // Original:
SHRAVA[R]
+ // same as KOEHN, the L gets mysteriously lost; the
correct one is CAL
+ new String[] { "KUHL", "CAL" }, // Original: C
new String[] { "RAWSON", "RASAN" },
// If last character is S, remove it
new String[] { "JILES", "JAL" },
- //new String[] { "CARRAWAY", "CARAY" },
+ // violates 6: if the last two characters are AY,
remove A
+ new String[] { "CARRAWAY", "CARY" }, // Original: CARAY
new String[] { "YAMADA", "YANAD" });
for (String[] arr : testValues) {