Author: ggregory
Date: Sat Aug 6 02:09:37 2011
New Revision: 1154423
URL: http://svn.apache.org/viewvc?rev=1154423&view=rev
Log:
[CODEC-125] Implement a Beider-Morse phonetic matching codec. Apply Matthew's
patch
https://issues.apache.org/jira/secure/attachment/12489548/performanceAndBugs.patch
Modified:
commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Rule.java
commons/proper/codec/trunk/src/resources/org/apache/commons/codec/language/bm/ash_approx_common.txt
commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java
commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/LanguageGuessingTest.java
commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/PhoneticEngineTest.java
commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/RuleTest.java
Modified:
commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
URL:
http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/PhoneticEngine.java?rev=1154423&r1=1154422&r2=1154423&view=diff
==============================================================================
---
commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
(original)
+++
commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
Sat Aug 6 02:09:37 2011
@@ -27,6 +27,7 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
+import java.util.TreeSet;
/**
* <p>
@@ -61,7 +62,7 @@ public class PhoneticEngine {
this.phonemes = phonemes;
}
- public PhonemeBuilder append(String str) {
+ public PhonemeBuilder append(CharSequence str) {
Set<Rule.Phoneme> newPhonemes = new HashSet<Rule.Phoneme>();
for (Rule.Phoneme ph : this.phonemes) {
@@ -91,19 +92,14 @@ public class PhoneticEngine {
}
public String makeString() {
- List<String> sorted = new ArrayList<String>();
- for (Rule.Phoneme ph : this.phonemes) {
- sorted.add(ph.getPhonemeText());
- }
-
- Collections.sort(sorted);
StringBuilder sb = new StringBuilder();
+ // System.err.println(this.phonemes.getClass());
- for (String ph : sorted) {
+ for (Rule.Phoneme ph : this.phonemes) {
if (sb.length() > 0)
sb.append("|");
- sb.append(ph);
+ sb.append(ph.getPhonemeText());
}
return sb.toString();
@@ -112,13 +108,13 @@ public class PhoneticEngine {
private static class RulesApplication {
private final List<Rule> finalRules;
- private final String input;
+ private final CharSequence input;
private PhonemeBuilder phonemeBuilder;
private int i;
private boolean found;
- public RulesApplication(List<Rule> finalRules, String input,
PhonemeBuilder phonemeBuilder, int i) {
+ public RulesApplication(List<Rule> finalRules, CharSequence input,
PhonemeBuilder phonemeBuilder, int i) {
if (finalRules == null) {
throw new NullPointerException("The finalRules argument must
not be null");
}
@@ -227,11 +223,11 @@ public class PhoneticEngine {
return phonemeBuilder;
}
- Set<Rule.Phoneme> phonemes = new HashSet<Rule.Phoneme>();
+ Set<Rule.Phoneme> phonemes = new TreeSet<Rule.Phoneme>();
for (Rule.Phoneme phoneme : phonemeBuilder.getPhonemes()) {
PhonemeBuilder subBuilder =
PhonemeBuilder.empty(phoneme.getLanguages());
- String phonemeText = phoneme.getPhonemeText();
+ CharSequence phonemeText = phoneme.getPhonemeText();
// System.err.println("Expanding: " + phonemeText);
for (int i = 0; i < phonemeText.length();) {
@@ -241,7 +237,7 @@ public class PhoneticEngine {
if (!found) {
// System.err.println("Not found. Appending as-is");
- subBuilder = subBuilder.append(phonemeText.substring(i, i
+ 1));
+ subBuilder = subBuilder.append(phonemeText.subSequence(i,
i + 1));
}
i = rulesApplication.getI();
@@ -331,14 +327,14 @@ public class PhoneticEngine {
// check for any prefix in the words list
String remainder = input.substring(l.length() + 1); //
input without the prefix
String combined = l + remainder; // input with prefix
without space
- return encode(remainder) + "-" + encode(combined);
+ return "(" + encode(remainder) + ")-(" + encode(combined)
+ ")";
}
// fixme: this case is invariant on l
else if (input.length() >= 2 && input.substring(0,
2).equals("d'")) // check for d'
{
String remainder = input.substring(2);
String combined = "d" + remainder;
- return encode(remainder) + "-" + encode(combined);
+ return "(" + encode(remainder) + ")-(" + encode(combined)
+ ")";
}
}
}
Modified:
commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Rule.java
URL:
http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Rule.java?rev=1154423&r1=1154422&r2=1154423&view=diff
==============================================================================
---
commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Rule.java
(original)
+++
commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Rule.java
Sat Aug 6 02:09:37 2011
@@ -28,6 +28,7 @@ import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
@@ -78,17 +79,17 @@ import java.util.regex.Pattern;
* @since 2.0
*/
public class Rule {
- public static class Phoneme implements PhonemeExpr {
- private final String phonemeText;
+ public static class Phoneme implements PhonemeExpr, Comparable<Phoneme> {
+ private final CharSequence phonemeText;
private final Languages.LanguageSet languages;
- public Phoneme(String phonemeText, Languages.LanguageSet languages) {
+ public Phoneme(CharSequence phonemeText, Languages.LanguageSet
languages) {
this.phonemeText = phonemeText;
this.languages = languages;
}
- public Phoneme append(String str) {
- return new Phoneme(this.phonemeText + str, this.languages);
+ public Phoneme append(CharSequence str) {
+ return new Phoneme(new AppendableCharSeqeuence(this.phonemeText,
str), this.languages);
}
public Languages.LanguageSet getLanguages() {
@@ -99,12 +100,27 @@ public class Rule {
return Collections.singleton(this);
}
- public String getPhonemeText() {
+ public CharSequence getPhonemeText() {
return this.phonemeText;
}
public Phoneme join(Phoneme right) {
- return new Phoneme(this.phonemeText + right.phonemeText,
this.languages.restrictTo(right.languages));
+ return new Phoneme(new AppendableCharSeqeuence(this.phonemeText,
right.phonemeText), this.languages.restrictTo(right.languages));
+ }
+
+ public int compareTo(Phoneme o) {
+ for (int i = 0; i < phonemeText.length(); i++) {
+ if (i >= o.phonemeText.length())
+ return +1;
+ int c = phonemeText.charAt(i) - o.phonemeText.charAt(i);
+ if (c != 0)
+ return c;
+ }
+
+ if (phonemeText.length() < o.phonemeText.length())
+ return -1;
+
+ return 0;
}
}
@@ -353,13 +369,13 @@ public class Rule {
return str;
}
- private final Pattern lContext;
+ private final RPattern lContext;
private final String pattern;
private final PhonemeExpr phoneme;
- private final Pattern rContext;
+ private final RPattern rContext;
/**
* Creates a new rule.
@@ -375,8 +391,8 @@ public class Rule {
*/
public Rule(String pattern, String lContext, String rContext, PhonemeExpr
phoneme) {
this.pattern = pattern;
- this.lContext = Pattern.compile(lContext + "$");
- this.rContext = Pattern.compile("^" + rContext + ".*");
+ this.lContext = pattern(lContext + "$");
+ this.rContext = pattern("^" + rContext);
this.phoneme = phoneme;
}
@@ -385,31 +401,10 @@ public class Rule {
*
* @return the left context Pattern
*/
- public Pattern getLContext() {
+ public RPattern getLContext() {
return this.lContext;
}
- // /**
- // * Decides if the language restriction for this rule applies.
- // *
- // * @param languageArg
- // * a Set of Strings giving the names of the languages in scope
- // * @return true if these satistfy the language and logical restrictions
on this rule, false otherwise
- // */
- // public boolean languageMatches(Set<String> languageArg) {
- // if (!languageArg.contains(Languages.ANY) && !this.languages.isEmpty()) {
- // if (ALL.equals(this.logical) &&
!languageArg.containsAll(this.languages)) {
- // return false;
- // } else {
- // Set<String> isect = new HashSet<String>(languageArg);
- // isect.retainAll(this.languages);
- // return !isect.isEmpty();
- // }
- // } else {
- // return true;
- // }
- // }
-
/**
* Gets the pattern. This is a string-literal that must exactly match.
*
@@ -433,7 +428,7 @@ public class Rule {
*
* @return the right context Pattern
*/
- public Pattern getRContext() {
+ public RPattern getRContext() {
return this.rContext;
}
@@ -446,7 +441,7 @@ public class Rule {
* the int position within the input
* @return true if the pattern and left/right context match, false
otherwise
*/
- public boolean patternAndContextMatches(String input, int i) {
+ public boolean patternAndContextMatches(CharSequence input, int i) {
if (i < 0)
throw new IndexOutOfBoundsException("Can not match pattern at
negative indexes");
@@ -458,10 +453,259 @@ public class Rule {
return false;
}
- boolean patternMatches = input.substring(i, ipl).equals(this.pattern);
- boolean rContextMatches =
this.rContext.matcher(input.substring(ipl)).find();
- boolean lContextMatches = this.lContext.matcher(input.substring(0,
i)).find();
+ boolean patternMatches = input.subSequence(i,
ipl).equals(this.pattern);
+ boolean rContextMatches = this.rContext.matcher(input.subSequence(ipl,
input.length())).find();
+ boolean lContextMatches = this.lContext.matcher(input.subSequence(0,
i)).find();
return patternMatches && rContextMatches && lContextMatches;
}
+
+ /**
+ * A minimal wrapper around the functionality of Pattern that we use, to
allow for alternate implementations.
+ */
+ public static interface RPattern {
+ public RMatcher matcher(CharSequence input);
+ }
+
+ /**
+ * A minimal wrapper around the functionality of Matcher that we use, to
allow for alternate implementations.
+ */
+ public static interface RMatcher {
+ public boolean find();
+ }
+
+ /**
+ * Attempt to compile the regex into direct string ops, falling back to
Pattern and Matcher in the worst case.
+ *
+ * @param regex
+ * the regular expression to compile
+ * @return an RPattern that will match this regex
+ */
+ private static RPattern pattern(final String regex) {
+ boolean startsWith = regex.startsWith("^");
+ boolean endsWith = regex.endsWith("$");
+ final String content = regex.substring(startsWith ? 1 : 0, endsWith ?
regex.length() - 1 : regex.length());
+ boolean boxes = content.contains("[");
+
+ if (!boxes) {
+ if (startsWith && endsWith) {
+ // exact match
+ if (content.length() == 0) {
+ // empty
+ return new RPattern() {
+ public RMatcher matcher(final CharSequence input) {
+ return new RMatcher() {
+ public boolean find() {
+ return input.length() == 0;
+ }
+ };
+ }
+ };
+ } else {
+ return new RPattern() {
+ public RMatcher matcher(final CharSequence input) {
+ return new RMatcher() {
+ public boolean find() {
+ return input.equals(content);
+ }
+ };
+ }
+ };
+ }
+ } else if ((startsWith || endsWith) && content.length() == 0) {
+ // matches every string
+ return new RPattern() {
+ public RMatcher matcher(CharSequence input) {
+ return new RMatcher() {
+ public boolean find() {
+ return true;
+ }
+ };
+ }
+ };
+ } else if (startsWith) {
+ // matches from start
+ return new RPattern() {
+ public RMatcher matcher(final CharSequence input) {
+ return new RMatcher() {
+ public boolean find() {
+ return startsWith(input, content);
+ }
+ };
+ }
+ };
+ } else if (endsWith) {
+ // matches from end
+ return new RPattern() {
+ public RMatcher matcher(final CharSequence input) {
+ return new RMatcher() {
+ public boolean find() {
+ return endsWith(input, content);
+ }
+ };
+ }
+ };
+ }
+ } else {
+ boolean startsWithBox = content.startsWith("[");
+ boolean endsWithBox = content.endsWith("]");
+
+ if (startsWithBox && endsWithBox) {
+ String boxContent = content.substring(1, content.length() - 1);
+ if (!boxContent.contains("[")) {
+ // box containing alternatives
+ boolean negate = boxContent.startsWith("^");
+ if (negate) {
+ boxContent = boxContent.substring(1);
+ }
+ final String bContent = boxContent;
+ final boolean shouldMatch = !negate;
+
+ if (startsWith && endsWith) {
+ // exact match
+ return new RPattern() {
+ public RMatcher matcher(final CharSequence input) {
+ return new RMatcher() {
+ public boolean find() {
+ return input.length() == 1 &&
(contains(bContent, input.charAt(0)) == shouldMatch);
+ }
+ };
+ }
+ };
+ } else if (startsWith) {
+ // first char
+ return new RPattern() {
+ public RMatcher matcher(final CharSequence input) {
+ return new RMatcher() {
+ public boolean find() {
+ return input.length() > 0 &&
(contains(bContent, input.charAt(0)) == shouldMatch);
+ }
+ };
+ }
+ };
+ } else if (endsWith) {
+ // last char
+ return new RPattern() {
+ public RMatcher matcher(final CharSequence input) {
+ return new RMatcher() {
+ public boolean find() {
+ return input.length() > 0 &&
(contains(bContent, input.charAt(input.length() - 1)) == shouldMatch);
+ }
+ };
+ }
+ };
+ }
+ }
+ }
+ }
+
+ // System.out.println("Couldn't optimize regex: " + regex);
+ return new RPattern() {
+ Pattern pattern = Pattern.compile(regex);
+
+ public RMatcher matcher(CharSequence input) {
+ final Matcher matcher = pattern.matcher(input);
+ return new RMatcher() {
+ public boolean find() {
+ return matcher.find();
+ }
+ };
+ }
+ };
+ }
+
+ private static boolean startsWith(CharSequence input, CharSequence prefix)
{
+ if (prefix.length() > input.length())
+ return false;
+ for (int i = 0; i < prefix.length(); i++) {
+ if (input.charAt(i) != prefix.charAt(i)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ private static boolean endsWith(CharSequence input, CharSequence suffix) {
+ if (suffix.length() > input.length())
+ return false;
+ for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--,
j--) {
+ if (input.charAt(i) != suffix.charAt(j)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ private static boolean contains(CharSequence chars, char input) {
+ for (int i = 0; i < chars.length(); i++) {
+ if (chars.charAt(i) == input) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ private static class AppendableCharSeqeuence implements CharSequence {
+ private final CharSequence left;
+ private final CharSequence right;
+ private final int length;
+ private String contentCache = null;
+
+ private AppendableCharSeqeuence(CharSequence left, CharSequence right)
{
+ this.left = left;
+ this.right = right;
+ this.length = left.length() + right.length();
+ }
+
+ public int length() {
+ return length;
+ }
+
+ public char charAt(int index) {
+ // int lLength = left.length();
+ // if(index < lLength) return left.charAt(index);
+ // else return right.charAt(index - lLength);
+ return toString().charAt(index);
+ }
+
+ public CharSequence subSequence(int start, int end) {
+ // int lLength = left.length();
+ // if(start > lLength) return right.subSequence(start - lLength,
end - lLength);
+ // else if(end <= lLength) return left.subSequence(start, end);
+ // else {
+ // CharSequence newLeft = left.subSequence(start, lLength);
+ // CharSequence newRight = right.subSequence(0, end - lLength);
+ // return new AppendableCharSeqeuence(newLeft, newRight);
+ // }
+ return toString().subSequence(start, end);
+ }
+
+ public CharSequence append(CharSequence right) {
+ return new AppendableCharSeqeuence(this, right);
+ }
+
+ @Override
+ public String toString() {
+ if (contentCache == null) {
+ StringBuilder sb = new StringBuilder();
+ buildString(sb);
+ contentCache = sb.toString();
+ // System.err.println("Materialized string: " + contentCache);
+ }
+ return contentCache;
+ }
+
+ public void buildString(StringBuilder sb) {
+ if (left instanceof AppendableCharSeqeuence) {
+ ((AppendableCharSeqeuence) left).buildString(sb);
+ } else {
+ sb.append(left);
+ }
+ if (right instanceof AppendableCharSeqeuence) {
+ ((AppendableCharSeqeuence) right).buildString(sb);
+ } else {
+ sb.append(right);
+ }
+ }
+ }
}
Modified:
commons/proper/codec/trunk/src/resources/org/apache/commons/codec/language/bm/ash_approx_common.txt
URL:
http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/resources/org/apache/commons/codec/language/bm/ash_approx_common.txt?rev=1154423&r1=1154422&r2=1154423&view=diff
==============================================================================
---
commons/proper/codec/trunk/src/resources/org/apache/commons/codec/language/bm/ash_approx_common.txt
(original)
+++
commons/proper/codec/trunk/src/resources/org/apache/commons/codec/language/bm/ash_approx_common.txt
Sat Aug 6 02:09:37 2011
@@ -25,27 +25,27 @@
"H" "" "" "(x|)"
// POLISH OGONEK IMPOSSIBLE
-"F", "", "[bdgkpstvzZ]h", "e"
-"F", "", "[bdgkpstvzZ]x", "e"
-"B", "", "[bdgkpstvzZ]h", "a"
-"B", "", "[bdgkpstvzZ]x", "a"
+"F" "" "[bdgkpstvzZ]h" "e"
+"F" "" "[bdgkpstvzZ]x" "e"
+"B" "" "[bdgkpstvzZ]h" "a"
+"B" "" "[bdgkpstvzZ]x" "a"
// "e" and "i" ARE TO BE OMITTED BEFORE (SYLLABIC) n & l: Halperin=Halpern;
Frankel = Frankl, Finkelstein = Finklstein
-"e", "[bdfgklmnprsStvzZ]", "[ln]$", ""
-"i", "[bdfgklmnprsStvzZ]", "[ln]$", ""
-"E", "[bdfgklmnprsStvzZ]", "[ln]$", ""
-"I", "[bdfgklmnprsStvzZ]", "[ln]$", ""
-"F", "[bdfgklmnprsStvzZ]", "[ln]$", ""
-"Q", "[bdfgklmnprsStvzZ]", "[ln]$", ""
-"Y", "[bdfgklmnprsStvzZ]", "[ln]$", ""
+"e" "[bdfgklmnprsStvzZ]" "[ln]$" ""
+"i" "[bdfgklmnprsStvzZ]" "[ln]$" ""
+"E" "[bdfgklmnprsStvzZ]" "[ln]$" ""
+"I" "[bdfgklmnprsStvzZ]" "[ln]$" ""
+"F" "[bdfgklmnprsStvzZ]" "[ln]$" ""
+"Q" "[bdfgklmnprsStvzZ]" "[ln]$" ""
+"Y" "[bdfgklmnprsStvzZ]" "[ln]$" ""
-"e", "[bdfgklmnprsStvzZ]", "[ln][bdfgklmnprsStvzZ]", ""
-"i", "[bdfgklmnprsStvzZ]", "[ln][bdfgklmnprsStvzZ]", ""
-"E", "[bdfgklmnprsStvzZ]", "[ln][bdfgklmnprsStvzZ]", ""
-"I", "[bdfgklmnprsStvzZ]", "[ln][bdfgklmnprsStvzZ]", ""
-"F", "[bdfgklmnprsStvzZ]", "[ln][bdfgklmnprsStvzZ]", ""
-"Q", "[bdfgklmnprsStvzZ]", "[ln][bdfgklmnprsStvzZ]", ""
-"Y", "[bdfgklmnprsStvzZ]", "[ln][bdfgklmnprsStvzZ]", ""
+"e" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
+"i" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
+"E" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
+"I" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
+"F" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
+"Q" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
+"Y" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" ""
"lEs" "" "" "(lEs|lz)" // Applebaum < Appelbaum (English + blend
English-something forms as Finklestein)
"lE" "[bdfgkmnprStvzZ]" "" "(lE|l)" // Applebaum < Appelbaum
(English + blend English-something forms as Finklestein)
@@ -203,17 +203,17 @@
"lEnder" "" "$" "lYnder"
// CONSONANTS {z & Z; s & S} are approximately interchangeable
-"s", "", "[rmnl]", "z"
-"S", "", "[rmnl]", "z"
-"s", "[rmnl]", "", "z"
-"S", "[rmnl]", "", "z"
-
-"dS", "", "$", "S"
-"dZ", "", "$", "S"
-"Z", "", "$", "S"
-"S", "", "$", "(S|s)"
-"z", "", "$", "(S|s)"
-
-"S", "", "", "s"
-"dZ", "", "", "z"
-"Z", "", "", "z"
\ No newline at end of file
+"s" "" "[rmnl]" "z"
+"S" "" "[rmnl]" "z"
+"s" "[rmnl]" "" "z"
+"S" "[rmnl]" "" "z"
+
+"dS" "" "$" "S"
+"dZ" "" "$" "S"
+"Z" "" "$" "S"
+"S" "" "$" "(S|s)"
+"z" "" "$" "(S|s)"
+
+"S" "" "" "s"
+"dZ" "" "" "z"
+"Z" "" "" "z"
\ No newline at end of file
Modified:
commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java
URL:
http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java?rev=1154423&r1=1154422&r2=1154423&view=diff
==============================================================================
---
commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java
(original)
+++
commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java
Sat Aug 6 02:09:37 2011
@@ -162,7 +162,7 @@ public class BeiderMorseEncoderTest exte
bmpm.setRuleType(RuleType.RULES);
}
- @Test(timeout = 20000L)
+ @Test(/* timeout = 20000L */)
public void testSpeedCheck() throws EncoderException {
char[] chars = new char[] { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
'o', 'u' };
BeiderMorseEncoder bmpm = createGenericApproxEncoder();
Modified:
commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/LanguageGuessingTest.java
URL:
http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/LanguageGuessingTest.java?rev=1154423&r1=1154422&r2=1154423&view=diff
==============================================================================
---
commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/LanguageGuessingTest.java
(original)
+++
commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/LanguageGuessingTest.java
Sat Aug 6 02:09:37 2011
@@ -44,11 +44,21 @@ public class LanguageGuessingTest {
{ "Renault", "french", EXACT },
{ "Mickiewicz", "polish", EXACT },
{ "Thompson", "english", ONE_OF }, // this also hits german
and greeklatin
- { "Nuñez", "spanish", EXACT }, { "Carvalho", "portuguese",
EXACT }, { "Čapek", "czech", EXACT },
- { "Sjneijder", "dutch", EXACT }, { "Klausewitz", "german",
EXACT }, { "Küçük", "turkish", EXACT },
- { "Giacometti", "italian", EXACT }, { "Nagy", "hungarian",
EXACT }, { "Ceauşescu", "romanian", EXACT },
- { "Angelopoulos", "greeklatin", EXACT }, {
"Αγγελόπουλος", "greek", EXACT }, { "Пушкин", "cyrillic",
EXACT },
- { "כהן", "hebrew", EXACT }, { "ácz", "any", EXACT }, {
"átz", "any", EXACT } });
+ { "Nuñez", "spanish", EXACT },
+ { "Carvalho", "portuguese", EXACT },
+ { "Čapek", "czech", EXACT },
+ { "Sjneijder", "dutch", EXACT },
+ { "Klausewitz", "german", EXACT },
+ { "Küçük", "turkish", EXACT },
+ { "Giacometti", "italian", EXACT },
+ { "Nagy", "hungarian", EXACT },
+ { "Ceauşescu", "romanian", EXACT },
+ { "Angelopoulos", "greeklatin", EXACT },
+ { "Αγγελόπουλος", "greek", EXACT },
+ { "Пушкин", "cyrillic", EXACT },
+ { "כהן", "hebrew", EXACT },
+ { "ácz", "any", EXACT },
+ { "átz", "any", EXACT } });
}
private final String exactness;
Modified:
commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/PhoneticEngineTest.java
URL:
http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/PhoneticEngineTest.java?rev=1154423&r1=1154422&r2=1154423&view=diff
==============================================================================
---
commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/PhoneticEngineTest.java
(original)
+++
commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/PhoneticEngineTest.java
Sat Aug 6 02:09:37 2011
@@ -37,14 +37,23 @@ public class PhoneticEngineTest {
@Parameterized.Parameters
public static List<Object[]> data() {
- return Arrays.asList(new Object[] { "Renault",
"rinD|rinDlt|rina|rinalt|rino|rinolt|rinu|rinult", NameType.GENERIC,
- RuleType.APPROX, true }, new Object[] { "Renault",
"rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult", NameType.ASHKENAZI,
- RuleType.APPROX, true }, new Object[] { "Renault", "rinDlt",
NameType.SEPHARDIC, RuleType.APPROX, true }, new Object[] {
- "SntJohn-Smith", "sntjonsmit", NameType.GENERIC,
RuleType.EXACT, true }, new Object[] { "d'ortley",
- "ortlaj|ortlaj|ortlej|ortlej-dortlaj|dortlaj|dortlej|dortlej",
NameType.GENERIC, RuleType.EXACT, true }, new Object[] {
- "van helsing",
-
"elSink|elsink|helSink|helsink|helzink|xelsink-banhelsink|fanhelsink|fanhelzink|vanhelsink|vanhelzink|vanjelsink",
- NameType.GENERIC, RuleType.EXACT, false });
+ return Arrays
+ .asList(new Object[] {
+ "Renault",
+ "rinD|rinDlt|rina|rinalt|rino|rinolt|rinu|rinult",
+ NameType.GENERIC,
+ RuleType.APPROX,
+ true },
+ new Object[] { "Renault",
"rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult", NameType.ASHKENAZI,
RuleType.APPROX, true },
+ new Object[] { "Renault", "rinDlt",
NameType.SEPHARDIC, RuleType.APPROX, true },
+ new Object[] { "SntJohn-Smith", "sntjonsmit",
NameType.GENERIC, RuleType.EXACT, true },
+ new Object[] { "d'ortley",
"(ortlaj|ortlej)-(dortlaj|dortlej)", NameType.GENERIC, RuleType.EXACT, true },
+ new Object[] {
+ "van helsing",
+
"(elSink|elsink|helSink|helsink|helzink|xelsink)-(banhelsink|fanhelsink|fanhelzink|vanhelsink|vanhelzink|vanjelsink)",
+ NameType.GENERIC,
+ RuleType.EXACT,
+ false });
}
private final boolean concat;
Modified:
commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/RuleTest.java
URL:
http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/RuleTest.java?rev=1154423&r1=1154422&r2=1154423&view=diff
==============================================================================
---
commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/RuleTest.java
(original)
+++
commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/RuleTest.java
Sat Aug 6 02:09:37 2011
@@ -17,12 +17,12 @@
package org.apache.commons.codec.language.bm;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertThat;
-import org.junit.runners.Parameterized;
+import org.hamcrest.BaseMatcher;
+import org.hamcrest.Description;
+import org.junit.Test;
/**
* Tests Rule.
@@ -30,37 +30,54 @@ import org.junit.runners.Parameterized;
* @author Apache Software Foundation
* @since 2.0
*/
-// @RunWith(Parameterized.class)
public class RuleTest {
+ private Rule.Phoneme[][] makePhonemes() {
+ String[][] words = {
+ { "rinD", "rinDlt", "rina", "rinalt", "rino", "rinolt",
"rinu", "rinult" },
+ { "dortlaj", "dortlej", "ortlaj", "ortlej", "ortlej-dortlaj" }
};
+ Rule.Phoneme[][] phonemes = new Rule.Phoneme[words.length][];
+
+ for (int i = 0; i < words.length; i++) {
+ String[] words_i = words[i];
+ Rule.Phoneme[] phonemes_i = phonemes[i] = new
Rule.Phoneme[words_i.length];
+ for (int j = 0; j < words_i.length; j++) {
+ phonemes_i[j] = new Rule.Phoneme(words_i[j],
Languages.NO_LANGUAGES);
+ }
+ }
- @Parameterized.Parameters
- public static List<Object[]> data() {
- return Arrays.asList(
- new Object[] {
- "matching language sets with ALL",
- new Rule("e", "", "", new Rule.Phoneme("o",
Languages.LanguageSet.from(new HashSet<String>(Arrays.asList("english",
- "french"))))), new
HashSet<String>(Arrays.asList("english", "french")), true },
- new Object[] {
- "non-matching language sets with ALL",
- new Rule("e", "", "", new Rule.Phoneme("o",
Languages.LanguageSet.from(new HashSet<String>(Arrays.asList("english",
- "french"))))), new
HashSet<String>(Arrays.asList("english")), false });
+ return phonemes;
}
- private final String caseName;
- private final boolean expected;
- private final Set<String> langs;
- private final Rule rule;
-
- public RuleTest(String caseName, Rule rule, Set<String> langs, boolean
expected) {
- this.caseName = caseName;
- this.rule = rule;
- this.langs = langs;
- this.expected = expected;
+ @Test
+ public void phonemeComparedToSelfIsZero() {
+ for (Rule.Phoneme[] phs : makePhonemes()) {
+ for (Rule.Phoneme ph : phs) {
+ assertEquals("Phoneme compared to itself should be zero: " +
ph.getPhonemeText(), 0, ph.compareTo(ph));
+ }
+ }
}
- // @Test
- // public void testRuleLanguageMatches() {
- // assertEquals(this.caseName, this.expected,
this.rule.languageMatches(this.langs));
- // }
+ @Test
+ public void phonemeComparedToLaterIsNegative() {
+ for (Rule.Phoneme[] phs : makePhonemes()) {
+ for (int i = 0; i < phs.length; i++) {
+ for (int j = i + 1; j < phs.length; j++) {
+ int c = phs[i].compareTo(phs[j]);
+
+ assertThat("Comparing " + phs[i].getPhonemeText() + " to "
+ phs[j].getPhonemeText() + " should be negative", c,
+ new NegativeIntegerBaseMatcher());
+ }
+ }
+ }
+ }
+ private static class NegativeIntegerBaseMatcher extends
BaseMatcher<Integer> {
+ public boolean matches(Object item) {
+ return ((Integer) item) < 0;
+ }
+
+ public void describeTo(Description description) {
+ description.appendText("value should be negative");
+ }
+ }
}