Repository: opennlp Updated Branches: refs/heads/LangDetect 5f53fe610 -> b406dbe13 (forced update)
http://git-wip-us.apache.org/repos/asf/opennlp/blob/b406dbe1/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java new file mode 100644 index 0000000..7d12581 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.langdetect; + + +import org.junit.Assert; +import org.junit.Test; + + +public class LanguageSampleTest { + + @Test + public void testConstructor() { + Language lang = new Language("aLang"); + CharSequence context = "aContext"; + + LanguageSample sample = new LanguageSample(lang, context); + + Assert.assertEquals(lang, sample.getLanguage()); + Assert.assertEquals(context, sample.getContext()); + } + + @Test(expected = NullPointerException.class) + public void testNullLang() throws Exception { + CharSequence context = "aContext"; + + new LanguageSample(null, context); + } + + @Test(expected = NullPointerException.class) + public void testNullContext() { + Language lang = new Language("aLang"); + + new LanguageSample(lang, null); + } + + @Test + public void testToString() { + Language lang = new Language("aLang"); + CharSequence context = "aContext"; + + LanguageSample sample = new LanguageSample(lang, context); + + Assert.assertEquals(lang.getLang() + "\t" + context, sample.toString()); + } + + @Test + public void testHash() { + + int hashA = new LanguageSample(new Language("aLang"), "aContext").hashCode(); + int hashB = new LanguageSample(new Language("bLang"), "aContext").hashCode(); + int hashC = new LanguageSample(new Language("aLang"), "bContext").hashCode(); + + Assert.assertNotEquals(hashA, hashB); + Assert.assertNotEquals(hashA, hashC); + Assert.assertNotEquals(hashB, hashC); + } + + @Test + public void testEquals() throws Exception { + + LanguageSample sampleA = new LanguageSample(new Language("aLang"), "aContext"); + LanguageSample sampleA1 = new LanguageSample(new Language("aLang"), "aContext"); + LanguageSample sampleB = new LanguageSample(new Language("bLang"), "aContext"); + LanguageSample sampleC = new LanguageSample(new Language("aLang"), "bContext"); + + Assert.assertEquals(sampleA, sampleA); + Assert.assertEquals(sampleA, sampleA1); + Assert.assertNotEquals(sampleA, sampleB); + Assert.assertNotEquals(sampleA, sampleC); 
+ Assert.assertNotEquals(sampleB, sampleC); + Assert.assertFalse(sampleA.equals("something else")); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/b406dbe1/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java new file mode 100644 index 0000000..dc25bc6 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.langdetect; + +import org.junit.Assert; +import org.junit.Test; + + +public class LanguageTest { + + + @Test + public void emptyConfidence() throws Exception { + String languageCode = "aLanguage"; + Language lang = new Language(languageCode); + + Assert.assertEquals(languageCode, lang.getLang()); + Assert.assertEquals(0, lang.getConfidence(), 0); + } + + @Test + public void nonEmptyConfidence() throws Exception { + String languageCode = "aLanguage"; + double confidence = 0.05; + Language lang = new Language(languageCode, confidence); + + Assert.assertEquals(languageCode, lang.getLang()); + Assert.assertEquals(confidence, lang.getConfidence(), 0); + } + + @Test(expected = NullPointerException.class) + public void emptyLanguage() throws Exception { + new Language(null); + } + + @Test(expected = NullPointerException.class) + public void emptyLanguageConfidence() throws Exception { + new Language(null, 0.05); + } + + @Test + public void testToString() { + Language lang = new Language("aLang"); + + Assert.assertEquals("aLang (0.0)", lang.toString()); + + lang = new Language("aLang", 0.0886678); + + Assert.assertEquals("aLang (0.0886678)", lang.toString()); + } + + + @Test + public void testHash() { + int hashA = new Language("aLang").hashCode(); + int hashAA = new Language("aLang").hashCode(); + int hashB = new Language("BLang").hashCode(); + int hashA5 = new Language("aLang", 5.0).hashCode(); + int hashA6 = new Language("BLang", 6.0).hashCode(); + + Assert.assertEquals(hashA, hashAA); + + Assert.assertNotEquals(hashA, hashB); + Assert.assertNotEquals(hashA, hashA5); + Assert.assertNotEquals(hashB, hashA5); + Assert.assertNotEquals(hashA5, hashA6); + } + + @Test + public void testEquals() { + Language langA = new Language("langA"); + Language langB = new Language("langB"); + Language langA5 = new Language("langA5", 5.0); + Language langA6 = new Language("langA5", 6.0); + + Assert.assertEquals(langA, langA); + Assert.assertEquals(langA5, 
langA5); + + Assert.assertNotEquals(langA, langA5); + Assert.assertNotEquals(langA, langB); + + Assert.assertEquals(langA6, langA5); + + Assert.assertNotEquals(langA, "something else"); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/b406dbe1/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java new file mode 100644 index 0000000..0f8dfe7 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.util.normalizer; + +import org.junit.Assert; +import org.junit.Test; + + +public class EmojiCharSequenceNormalizerTest { + + public EmojiCharSequenceNormalizer normalizer = EmojiCharSequenceNormalizer.getInstance(); + + @Test + public void normalizeEmoji() throws Exception { + + String s = new StringBuilder() + .append("Any funny text goes here ") + .appendCodePoint(0x1F606) + .appendCodePoint(0x1F606) + .appendCodePoint(0x1F606) + .append(" ") + .appendCodePoint(0x1F61B) + .toString(); + Assert.assertEquals( + "Any funny text goes here ", normalizer.normalize(s)); + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/b406dbe1/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java new file mode 100644 index 0000000..50b1f0c --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import org.junit.Assert; +import org.junit.Test; + + +public class NumberCharSequenceNormalizerTest { + + public NumberCharSequenceNormalizer normalizer = NumberCharSequenceNormalizer.getInstance(); + + + @Test + public void normalize() throws Exception { + Assert.assertEquals("absc , abcd", normalizer.normalize("absc 123,0123 abcd")); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/b406dbe1/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java new file mode 100644 index 0000000..95cf300 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.util.normalizer; + +import org.junit.Assert; +import org.junit.Test; + + +public class ShrinkCharSequenceNormalizerTest { + + public ShrinkCharSequenceNormalizer normalizer = ShrinkCharSequenceNormalizer.getInstance(); + + @Test + public void normalizeSpace() throws Exception { + Assert.assertEquals( + "a text extra space", normalizer.normalize("a text extra space")); + } + + @Test + public void normalizeChar() throws Exception { + Assert.assertEquals("Helloo", normalizer.normalize("Helllllloooooo")); + Assert.assertEquals("Hello", normalizer.normalize("Hello")); + Assert.assertEquals("HHello", normalizer.normalize("HHello")); + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/b406dbe1/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java new file mode 100644 index 0000000..f0bd517 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.util.normalizer; + +import org.junit.Assert; +import org.junit.Test; + + +public class TwitterCharSequenceNormalizerTest { + + public TwitterCharSequenceNormalizer normalizer = TwitterCharSequenceNormalizer.getInstance(); + + @Test + public void normalizeHashtag() throws Exception { + Assert.assertEquals("asdf 2nnfdf", normalizer.normalize("asdf #hasdk23 2nnfdf")); + } + + @Test + public void normalizeUser() throws Exception { + Assert.assertEquals("asdf 2nnfdf", normalizer.normalize("asdf @hasdk23 2nnfdf")); + } + + @Test + public void normalizeRT() throws Exception { + Assert.assertEquals(" 2nnfdf", normalizer.normalize("RT RT RT 2nnfdf")); + } + + @Test + public void normalizeLaugh() throws Exception { + Assert.assertEquals("ahahah", normalizer.normalize("ahahahah")); + Assert.assertEquals("haha", normalizer.normalize("hahha")); + Assert.assertEquals("haha", normalizer.normalize("hahaa")); + Assert.assertEquals("ahaha", normalizer.normalize("ahahahahhahahhahahaaaa")); + Assert.assertEquals("jaja", normalizer.normalize("jajjajajaja")); + } + + + + @Test + public void normalizeFace() throws Exception { + Assert.assertEquals("hello hello", normalizer.normalize("hello :-) hello")); + Assert.assertEquals("hello hello", normalizer.normalize("hello ;) hello")); + Assert.assertEquals(" hello", normalizer.normalize(":) hello")); + Assert.assertEquals("hello ", normalizer.normalize("hello :P")); + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/b406dbe1/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UnicodeCharSequenceNormalizerTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UnicodeCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UnicodeCharSequenceNormalizerTest.java new 
file mode 100644 index 0000000..f654c74 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UnicodeCharSequenceNormalizerTest.java @@ -0,0 +1,263 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.util.normalizer; + +import org.junit.Assert; +import org.junit.Test; + +/** + * Tests for the @{@link UnicodeCharSequenceNormalizer} based on + * https://github.com/shuyo/language-detection + */ +public class UnicodeCharSequenceNormalizerTest { + + public UnicodeCharSequenceNormalizer normalizer = UnicodeCharSequenceNormalizer.getInstance(); + + @Test + public void getMessage() throws Exception { + Assert.assertEquals("\u4F7C\u6934", UnicodeCharSequenceNormalizer.getMessage("NGram.KANJI_1_0")); + Assert.assertEquals("!blah!", UnicodeCharSequenceNormalizer.getMessage("blah")); + } + + @Test + public final void testNormalize() { + Assert.assertEquals("a b c d á é à ó ú ã", + normalizer.normalize("a b c d á é à ó ú ã")); + + } + + /** + * Test method for {@link UnicodeCharSequenceNormalizer#normalize(char)} with Latin characters + */ + @Test + public final void testNormalizeWithLatin() { + Assert.assertEquals(' ', normalizer.normalize('\u0000')); + Assert.assertEquals(' ', 
normalizer.normalize('\u0020')); + Assert.assertEquals(' ', normalizer.normalize('\u0030')); + Assert.assertEquals(' ', normalizer.normalize('\u0040')); + Assert.assertEquals('\u0041', normalizer.normalize('\u0041')); + Assert.assertEquals('\u005a', normalizer.normalize('\u005a')); + Assert.assertEquals(' ', normalizer.normalize('\u005b')); + Assert.assertEquals(' ', normalizer.normalize('\u0060')); + Assert.assertEquals('\u0061', normalizer.normalize('\u0061')); + Assert.assertEquals('\u007a', normalizer.normalize('\u007a')); + Assert.assertEquals(' ', normalizer.normalize('\u007b')); + Assert.assertEquals(' ', normalizer.normalize('\u007f')); + Assert.assertEquals('\u0080', normalizer.normalize('\u0080')); + Assert.assertEquals(' ', normalizer.normalize('\u00a0')); + Assert.assertEquals('\u00a1', normalizer.normalize('\u00a1')); + // LATIN_EXTENDED_ADDITIONAL + Assert.assertEquals('\u1ec3', normalizer.normalize('\u1EA0')); + Assert.assertEquals('\u1ec3', normalizer.normalize('\u1EA1')); + + Assert.assertEquals(' ', normalizer.normalize('\u2012')); + // Arabic + Assert.assertEquals('\u064a', normalizer.normalize('\u06cc')); + // Hiragana + Assert.assertEquals('\u3042', normalizer.normalize('\u3041')); + // Katakana + Assert.assertEquals('\u30a2', normalizer.normalize('\u30A1')); + // Bopomofo + Assert.assertEquals('\u3105', normalizer.normalize('\u31A0')); + // Bopomofo Ex + Assert.assertEquals('\u3105', normalizer.normalize('\u3106')); + //HANGUL_SYLLABLES + Assert.assertEquals('\uac00', normalizer.normalize('\uAC01')); + } + + /** + * Test method for {@link UnicodeCharSequenceNormalizer#normalize(char)} with CJK Kanji characters + */ + @Test + public final void testNormalizeWithCJKKanji() { + Assert.assertEquals('\u4E00', normalizer.normalize('\u4E00')); + Assert.assertEquals('\u4E01', normalizer.normalize('\u4E01')); + Assert.assertEquals('\u4E02', normalizer.normalize('\u4E02')); + Assert.assertEquals('\u4E01', normalizer.normalize('\u4E03')); + 
Assert.assertEquals('\u4E04', normalizer.normalize('\u4E04')); + Assert.assertEquals('\u4E05', normalizer.normalize('\u4E05')); + Assert.assertEquals('\u4E06', normalizer.normalize('\u4E06')); + Assert.assertEquals('\u4E07', normalizer.normalize('\u4E07')); + Assert.assertEquals('\u4E08', normalizer.normalize('\u4E08')); + Assert.assertEquals('\u4E09', normalizer.normalize('\u4E09')); + Assert.assertEquals('\u4E10', normalizer.normalize('\u4E10')); + Assert.assertEquals('\u4E11', normalizer.normalize('\u4E11')); + Assert.assertEquals('\u4E12', normalizer.normalize('\u4E12')); + Assert.assertEquals('\u4E13', normalizer.normalize('\u4E13')); + Assert.assertEquals('\u4E14', normalizer.normalize('\u4E14')); + Assert.assertEquals('\u4E15', normalizer.normalize('\u4E15')); + Assert.assertEquals('\u4E1e', normalizer.normalize('\u4E1e')); + Assert.assertEquals('\u4E1f', normalizer.normalize('\u4E1f')); + Assert.assertEquals('\u4E20', normalizer.normalize('\u4E20')); + Assert.assertEquals('\u4E21', normalizer.normalize('\u4E21')); + Assert.assertEquals('\u4E22', normalizer.normalize('\u4E22')); + Assert.assertEquals('\u4E23', normalizer.normalize('\u4E23')); + Assert.assertEquals('\u4E13', normalizer.normalize('\u4E24')); + Assert.assertEquals('\u4E13', normalizer.normalize('\u4E25')); + Assert.assertEquals('\u4E30', normalizer.normalize('\u4E30')); + } + + + /** + * Test method for {@link UnicodeCharSequenceNormalizer#normalize(char)} for Romanian characters + */ + @Test + public final void testNormalizeForRomanian() { + Assert.assertEquals('\u015f', normalizer.normalize('\u015f')); + Assert.assertEquals('\u0163', normalizer.normalize('\u0163')); + Assert.assertEquals('\u015f', normalizer.normalize('\u0219')); + Assert.assertEquals('\u0163', normalizer.normalize('\u021b')); + } + + /** + * Test method for {@link UnicodeCharSequenceNormalizer#normalize_vi(CharSequence)} + */ + @Test + public final void testNormalizeVietnamese() { + Assert.assertEquals("", 
normalizer.normalize_vi("")); + Assert.assertEquals("ABC", normalizer.normalize_vi("ABC")); + Assert.assertEquals("012", normalizer.normalize_vi("012")); + Assert.assertEquals("\u00c0", normalizer.normalize_vi("\u00c0")); + + Assert.assertEquals("\u00C0", normalizer.normalize_vi("\u0041\u0300")); + Assert.assertEquals("\u00C8", normalizer.normalize_vi("\u0045\u0300")); + Assert.assertEquals("\u00CC", normalizer.normalize_vi("\u0049\u0300")); + Assert.assertEquals("\u00D2", normalizer.normalize_vi("\u004F\u0300")); + Assert.assertEquals("\u00D9", normalizer.normalize_vi("\u0055\u0300")); + Assert.assertEquals("\u1EF2", normalizer.normalize_vi("\u0059\u0300")); + Assert.assertEquals("\u00E0", normalizer.normalize_vi("\u0061\u0300")); + Assert.assertEquals("\u00E8", normalizer.normalize_vi("\u0065\u0300")); + Assert.assertEquals("\u00EC", normalizer.normalize_vi("\u0069\u0300")); + Assert.assertEquals("\u00F2", normalizer.normalize_vi("\u006F\u0300")); + Assert.assertEquals("\u00F9", normalizer.normalize_vi("\u0075\u0300")); + Assert.assertEquals("\u1EF3", normalizer.normalize_vi("\u0079\u0300")); + Assert.assertEquals("\u1EA6", normalizer.normalize_vi("\u00C2\u0300")); + Assert.assertEquals("\u1EC0", normalizer.normalize_vi("\u00CA\u0300")); + Assert.assertEquals("\u1ED2", normalizer.normalize_vi("\u00D4\u0300")); + Assert.assertEquals("\u1EA7", normalizer.normalize_vi("\u00E2\u0300")); + Assert.assertEquals("\u1EC1", normalizer.normalize_vi("\u00EA\u0300")); + Assert.assertEquals("\u1ED3", normalizer.normalize_vi("\u00F4\u0300")); + Assert.assertEquals("\u1EB0", normalizer.normalize_vi("\u0102\u0300")); + Assert.assertEquals("\u1EB1", normalizer.normalize_vi("\u0103\u0300")); + Assert.assertEquals("\u1EDC", normalizer.normalize_vi("\u01A0\u0300")); + Assert.assertEquals("\u1EDD", normalizer.normalize_vi("\u01A1\u0300")); + Assert.assertEquals("\u1EEA", normalizer.normalize_vi("\u01AF\u0300")); + Assert.assertEquals("\u1EEB", normalizer.normalize_vi("\u01B0\u0300")); 
+ + Assert.assertEquals("\u00C1", normalizer.normalize_vi("\u0041\u0301")); + Assert.assertEquals("\u00C9", normalizer.normalize_vi("\u0045\u0301")); + Assert.assertEquals("\u00CD", normalizer.normalize_vi("\u0049\u0301")); + Assert.assertEquals("\u00D3", normalizer.normalize_vi("\u004F\u0301")); + Assert.assertEquals("\u00DA", normalizer.normalize_vi("\u0055\u0301")); + Assert.assertEquals("\u00DD", normalizer.normalize_vi("\u0059\u0301")); + Assert.assertEquals("\u00E1", normalizer.normalize_vi("\u0061\u0301")); + Assert.assertEquals("\u00E9", normalizer.normalize_vi("\u0065\u0301")); + Assert.assertEquals("\u00ED", normalizer.normalize_vi("\u0069\u0301")); + Assert.assertEquals("\u00F3", normalizer.normalize_vi("\u006F\u0301")); + Assert.assertEquals("\u00FA", normalizer.normalize_vi("\u0075\u0301")); + Assert.assertEquals("\u00FD", normalizer.normalize_vi("\u0079\u0301")); + Assert.assertEquals("\u1EA4", normalizer.normalize_vi("\u00C2\u0301")); + Assert.assertEquals("\u1EBE", normalizer.normalize_vi("\u00CA\u0301")); + Assert.assertEquals("\u1ED0", normalizer.normalize_vi("\u00D4\u0301")); + Assert.assertEquals("\u1EA5", normalizer.normalize_vi("\u00E2\u0301")); + Assert.assertEquals("\u1EBF", normalizer.normalize_vi("\u00EA\u0301")); + Assert.assertEquals("\u1ED1", normalizer.normalize_vi("\u00F4\u0301")); + Assert.assertEquals("\u1EAE", normalizer.normalize_vi("\u0102\u0301")); + Assert.assertEquals("\u1EAF", normalizer.normalize_vi("\u0103\u0301")); + Assert.assertEquals("\u1EDA", normalizer.normalize_vi("\u01A0\u0301")); + Assert.assertEquals("\u1EDB", normalizer.normalize_vi("\u01A1\u0301")); + Assert.assertEquals("\u1EE8", normalizer.normalize_vi("\u01AF\u0301")); + Assert.assertEquals("\u1EE9", normalizer.normalize_vi("\u01B0\u0301")); + + Assert.assertEquals("\u00C3", normalizer.normalize_vi("\u0041\u0303")); + Assert.assertEquals("\u1EBC", normalizer.normalize_vi("\u0045\u0303")); + Assert.assertEquals("\u0128", 
normalizer.normalize_vi("\u0049\u0303")); + Assert.assertEquals("\u00D5", normalizer.normalize_vi("\u004F\u0303")); + Assert.assertEquals("\u0168", normalizer.normalize_vi("\u0055\u0303")); + Assert.assertEquals("\u1EF8", normalizer.normalize_vi("\u0059\u0303")); + Assert.assertEquals("\u00E3", normalizer.normalize_vi("\u0061\u0303")); + Assert.assertEquals("\u1EBD", normalizer.normalize_vi("\u0065\u0303")); + Assert.assertEquals("\u0129", normalizer.normalize_vi("\u0069\u0303")); + Assert.assertEquals("\u00F5", normalizer.normalize_vi("\u006F\u0303")); + Assert.assertEquals("\u0169", normalizer.normalize_vi("\u0075\u0303")); + Assert.assertEquals("\u1EF9", normalizer.normalize_vi("\u0079\u0303")); + Assert.assertEquals("\u1EAA", normalizer.normalize_vi("\u00C2\u0303")); + Assert.assertEquals("\u1EC4", normalizer.normalize_vi("\u00CA\u0303")); + Assert.assertEquals("\u1ED6", normalizer.normalize_vi("\u00D4\u0303")); + Assert.assertEquals("\u1EAB", normalizer.normalize_vi("\u00E2\u0303")); + Assert.assertEquals("\u1EC5", normalizer.normalize_vi("\u00EA\u0303")); + Assert.assertEquals("\u1ED7", normalizer.normalize_vi("\u00F4\u0303")); + Assert.assertEquals("\u1EB4", normalizer.normalize_vi("\u0102\u0303")); + Assert.assertEquals("\u1EB5", normalizer.normalize_vi("\u0103\u0303")); + Assert.assertEquals("\u1EE0", normalizer.normalize_vi("\u01A0\u0303")); + Assert.assertEquals("\u1EE1", normalizer.normalize_vi("\u01A1\u0303")); + Assert.assertEquals("\u1EEE", normalizer.normalize_vi("\u01AF\u0303")); + Assert.assertEquals("\u1EEF", normalizer.normalize_vi("\u01B0\u0303")); + + Assert.assertEquals("\u1EA2", normalizer.normalize_vi("\u0041\u0309")); + Assert.assertEquals("\u1EBA", normalizer.normalize_vi("\u0045\u0309")); + Assert.assertEquals("\u1EC8", normalizer.normalize_vi("\u0049\u0309")); + Assert.assertEquals("\u1ECE", normalizer.normalize_vi("\u004F\u0309")); + Assert.assertEquals("\u1EE6", normalizer.normalize_vi("\u0055\u0309")); + Assert.assertEquals("\u1EF6", 
normalizer.normalize_vi("\u0059\u0309")); + Assert.assertEquals("\u1EA3", normalizer.normalize_vi("\u0061\u0309")); + Assert.assertEquals("\u1EBB", normalizer.normalize_vi("\u0065\u0309")); + Assert.assertEquals("\u1EC9", normalizer.normalize_vi("\u0069\u0309")); + Assert.assertEquals("\u1ECF", normalizer.normalize_vi("\u006F\u0309")); + Assert.assertEquals("\u1EE7", normalizer.normalize_vi("\u0075\u0309")); + Assert.assertEquals("\u1EF7", normalizer.normalize_vi("\u0079\u0309")); + Assert.assertEquals("\u1EA8", normalizer.normalize_vi("\u00C2\u0309")); + Assert.assertEquals("\u1EC2", normalizer.normalize_vi("\u00CA\u0309")); + Assert.assertEquals("\u1ED4", normalizer.normalize_vi("\u00D4\u0309")); + Assert.assertEquals("\u1EA9", normalizer.normalize_vi("\u00E2\u0309")); + Assert.assertEquals("\u1EC3", normalizer.normalize_vi("\u00EA\u0309")); + Assert.assertEquals("\u1ED5", normalizer.normalize_vi("\u00F4\u0309")); + Assert.assertEquals("\u1EB2", normalizer.normalize_vi("\u0102\u0309")); + Assert.assertEquals("\u1EB3", normalizer.normalize_vi("\u0103\u0309")); + Assert.assertEquals("\u1EDE", normalizer.normalize_vi("\u01A0\u0309")); + Assert.assertEquals("\u1EDF", normalizer.normalize_vi("\u01A1\u0309")); + Assert.assertEquals("\u1EEC", normalizer.normalize_vi("\u01AF\u0309")); + Assert.assertEquals("\u1EED", normalizer.normalize_vi("\u01B0\u0309")); + + Assert.assertEquals("\u1EA0", normalizer.normalize_vi("\u0041\u0323")); + Assert.assertEquals("\u1EB8", normalizer.normalize_vi("\u0045\u0323")); + Assert.assertEquals("\u1ECA", normalizer.normalize_vi("\u0049\u0323")); + Assert.assertEquals("\u1ECC", normalizer.normalize_vi("\u004F\u0323")); + Assert.assertEquals("\u1EE4", normalizer.normalize_vi("\u0055\u0323")); + Assert.assertEquals("\u1EF4", normalizer.normalize_vi("\u0059\u0323")); + Assert.assertEquals("\u1EA1", normalizer.normalize_vi("\u0061\u0323")); + Assert.assertEquals("\u1EB9", normalizer.normalize_vi("\u0065\u0323")); + Assert.assertEquals("\u1ECB", 
normalizer.normalize_vi("\u0069\u0323")); + Assert.assertEquals("\u1ECD", normalizer.normalize_vi("\u006F\u0323")); + Assert.assertEquals("\u1EE5", normalizer.normalize_vi("\u0075\u0323")); + Assert.assertEquals("\u1EF5", normalizer.normalize_vi("\u0079\u0323")); + Assert.assertEquals("\u1EAC", normalizer.normalize_vi("\u00C2\u0323")); + Assert.assertEquals("\u1EC6", normalizer.normalize_vi("\u00CA\u0323")); + Assert.assertEquals("\u1ED8", normalizer.normalize_vi("\u00D4\u0323")); + Assert.assertEquals("\u1EAD", normalizer.normalize_vi("\u00E2\u0323")); + Assert.assertEquals("\u1EC7", normalizer.normalize_vi("\u00EA\u0323")); + Assert.assertEquals("\u1ED9", normalizer.normalize_vi("\u00F4\u0323")); + Assert.assertEquals("\u1EB6", normalizer.normalize_vi("\u0102\u0323")); + Assert.assertEquals("\u1EB7", normalizer.normalize_vi("\u0103\u0323")); + Assert.assertEquals("\u1EE2", normalizer.normalize_vi("\u01A0\u0323")); + Assert.assertEquals("\u1EE3", normalizer.normalize_vi("\u01A1\u0323")); + Assert.assertEquals("\u1EF0", normalizer.normalize_vi("\u01AF\u0323")); + Assert.assertEquals("\u1EF1", normalizer.normalize_vi("\u01B0\u0323")); + + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/b406dbe1/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java new file mode 100644 index 0000000..72eb83a --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.util.normalizer; + +import org.junit.Assert; +import org.junit.Test; + + +public class UrlCharSequenceNormalizerTest { + + public UrlCharSequenceNormalizer normalizer = UrlCharSequenceNormalizer.getInstance(); + + @Test + public void normalizeUrl() throws Exception { + Assert.assertEquals( + "asdf 2nnfdf", normalizer.normalize("asdf http://asdf.com/dfa/cxs 2nnfdf")); + + + Assert.assertEquals( + "asdf 2nnfdf ", normalizer.normalize("asdf http://asdf.com/dfa/cx" + + "s 2nnfdf http://asdf.com/dfa/cxs")); + } + + @Test + public void normalizeEmail() throws Exception { + Assert.assertEquals( + "asdf 2nnfdf", normalizer.normalize("asdf asd.fdfa@hasdk23.com.br 2nnfdf")); + Assert.assertEquals( + "asdf 2nnfdf ", normalizer.normalize("asdf asd.fdfa@hasdk23.com.br" + + " 2nnfdf asd.fdfa@hasdk23.com.br")); + } +}
