Repository: opennlp Updated Branches: refs/heads/LangDetect cffa3352a -> 839d2deab (forced update)
http://git-wip-us.apache.org/repos/asf/opennlp/blob/839d2dea/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java new file mode 100644 index 0000000..771be19 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package opennlp.tools.util.normalizer; + +public class AggregateCharSequenceNormalizer implements CharSequenceNormalizer { + + private final CharSequenceNormalizer[] normalizers; + + public AggregateCharSequenceNormalizer(CharSequenceNormalizer ... 
normalizers) { + this.normalizers = normalizers; + } + + public CharSequence normalize (CharSequence text) { + + for (CharSequenceNormalizer normalizers : + normalizers) { + text = normalizers.normalize(text); + } + + return text; + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/839d2dea/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java new file mode 100644 index 0000000..b5c1f3f --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Contract for components that turn a piece of text into a normalized form.
 */
public interface CharSequenceNormalizer {

  /**
   * Produces the normalized form of the given text.
   *
   * @param text the text to normalize
   * @return the normalized text
   */
  CharSequence normalize(CharSequence text);
}
+ */ + + +package opennlp.tools.util.normalizer; + +import java.util.regex.Pattern; + +public class EmojiCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final EmojiCharSequenceNormalizer INSTANCE = new EmojiCharSequenceNormalizer(); + + public static EmojiCharSequenceNormalizer getInstance() { + return INSTANCE; + } + + private static final Pattern EMOJI_REGEX = + Pattern.compile("[\\uD83C-\\uDBFF\\uDC00-\\uDFFF]+"); + + public CharSequence normalize (CharSequence text) { + String modified = EMOJI_REGEX.matcher(text).replaceAll(" "); + return modified; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/839d2dea/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java new file mode 100644 index 0000000..6b0452d --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +package opennlp.tools.util.normalizer; + +import java.util.regex.Pattern; + +public class NumberCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final Pattern NUMBER_REGEX = Pattern.compile("\\d+"); + + private static final NumberCharSequenceNormalizer INSTANCE = new NumberCharSequenceNormalizer(); + + public static NumberCharSequenceNormalizer getInstance() { + return INSTANCE; + } + + public CharSequence normalize (CharSequence text) { + return NUMBER_REGEX.matcher(text).replaceAll(" "); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/839d2dea/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java new file mode 100644 index 0000000..6183367 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +package opennlp.tools.util.normalizer; + +import java.util.regex.Pattern; + +public class ShrinkCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final Pattern REPEATED_CHAR_REGEX = Pattern.compile("(.)\\1{2,}", + Pattern.CASE_INSENSITIVE); + private static final Pattern SPACE_REGEX = Pattern.compile("\\s{2,}", + Pattern.CASE_INSENSITIVE); + + private static final ShrinkCharSequenceNormalizer INSTANCE = new ShrinkCharSequenceNormalizer(); + + public static ShrinkCharSequenceNormalizer getInstance() { + return INSTANCE; + } + + public CharSequence normalize (CharSequence text) { + text = SPACE_REGEX.matcher(text).replaceAll(" "); + return REPEATED_CHAR_REGEX.matcher(text).replaceAll("$1$1").trim(); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/839d2dea/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java new file mode 100644 index 0000000..b5a8625 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package opennlp.tools.util.normalizer; + +import java.util.regex.Pattern; + +public class TwitterCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final Pattern HASH_USER_REGEX = + Pattern.compile("[#@]\\S+"); + + private static final Pattern RT_REGEX = + Pattern.compile("\\b(rt[ :])+", Pattern.CASE_INSENSITIVE); + + private static final Pattern FACE_REGEX = + Pattern.compile("[:;x]-?[()dop]", Pattern.CASE_INSENSITIVE); + + private static final Pattern LAUGH_REGEX = + Pattern.compile("([hj])+([aieou])+(\\1+\\2+)+", Pattern.CASE_INSENSITIVE); + + private static final TwitterCharSequenceNormalizer INSTANCE = new TwitterCharSequenceNormalizer(); + + public static TwitterCharSequenceNormalizer getInstance() { + return INSTANCE; + } + + public CharSequence normalize (CharSequence text) { + String modified = HASH_USER_REGEX.matcher(text).replaceAll(" "); + modified = RT_REGEX.matcher(modified).replaceAll(" "); + modified = FACE_REGEX.matcher(modified).replaceAll(" "); + modified = LAUGH_REGEX.matcher(modified).replaceAll("$1$2$1$2"); + return modified; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/839d2dea/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java new file mode 100644 index 0000000..4be9b63 --- 
/dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package opennlp.tools.util.normalizer; + +import java.util.regex.Pattern; + +public class UrlCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final Pattern URL_REGEX = + Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]+"); + private static final Pattern MAIL_REGEX = + Pattern.compile("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+"); + + private static final UrlCharSequenceNormalizer INSTANCE = new UrlCharSequenceNormalizer(); + + public static UrlCharSequenceNormalizer getInstance() { + return INSTANCE; + } + + public CharSequence normalize (CharSequence text) { + String modified = URL_REGEX.matcher(text).replaceAll(" "); + return MAIL_REGEX.matcher(modified).replaceAll(" "); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/839d2dea/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java 
b/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java new file mode 100644 index 0000000..1aae887 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.langdetect; + + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +import opennlp.tools.ngram.NGramModel; +import opennlp.tools.tokenize.SimpleTokenizer; +import opennlp.tools.util.StringList; +import opennlp.tools.util.normalizer.CharSequenceNormalizer; + +public class DummyFactory extends LanguageDetectorFactory { + + + public DummyFactory() { + super(); + } + + @Override + public void init() { + super.init(); + } + + @Override + public LanguageDetectorContextGenerator getContextGenerator() { + return new DummyFactory.MyContectGenerator(2, 5, + new DummyFactory.UpperCaseNormalizer()); + } + + public class UpperCaseNormalizer implements CharSequenceNormalizer { + @Override + public CharSequence normalize(CharSequence text) { + return text.toString().toUpperCase(); + } + } + + public class MyContectGenerator extends LanguageDetectorContextGenerator { + + public MyContectGenerator(int min, int max, CharSequenceNormalizer... 
normalizers) { + super(min, max, normalizers); + } + + @Override + public String[] getContext(String document) { + String[] superContext = super.getContext(document); + + List<String> context = new ArrayList(Arrays.asList(superContext)); + + document = this.normalizer.normalize(document).toString(); + + SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE; + String[] words = tokenizer.tokenize(document); + NGramModel tokenNgramModel = new NGramModel(); + if (words.length > 0) { + tokenNgramModel.add(new StringList(words), 1, 3); + Iterator tokenNgramIterator = tokenNgramModel.iterator(); + + while (tokenNgramIterator.hasNext()) { + StringList tokenList = (StringList) tokenNgramIterator.next(); + if (tokenList.size() > 0) { + context.add("tg=" + tokenList.toString()); + } + } + } + + return context.toArray(new String[context.size()]); + } + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/839d2dea/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java new file mode 100644 index 0000000..dc6ca26 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + +import java.util.Arrays; +import java.util.Collection; + +import org.junit.Assert; +import org.junit.Test; + + +public class LanguageDetectorContextGeneratorTest { + + @Test + public void extractContext() throws Exception { + String doc = "abcde fghijk"; + + LanguageDetectorContextGenerator cg = new LanguageDetectorContextGenerator(1, 3); + + Collection<String> features = Arrays.asList(cg.getContext(doc)); + + Assert.assertEquals(33, features.size()); + Assert.assertTrue(features.contains("ab")); + Assert.assertTrue(features.contains("abc")); + Assert.assertTrue(features.contains("e f")); + Assert.assertTrue(features.contains(" fg")); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/839d2dea/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java new file mode 100644 index 0000000..520fc71 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + +import java.util.concurrent.atomic.AtomicInteger; + +import org.junit.Assert; +import org.junit.Test; + +import opennlp.tools.util.TrainingParameters; + +public class LanguageDetectorCrossValidatorTest { + + @Test + public void evaluate() throws Exception { + + TrainingParameters params = new TrainingParameters(); + params.put(TrainingParameters.ITERATIONS_PARAM, 100); + params.put(TrainingParameters.CUTOFF_PARAM, 5); + params.put("PrintMessages", false); + + + final AtomicInteger correctCount = new AtomicInteger(); + final AtomicInteger incorrectCount = new AtomicInteger(); + + LanguageDetectorCrossValidator cv = new LanguageDetectorCrossValidator(params, + new LanguageDetectorFactory(), new LanguageDetectorEvaluationMonitor() { + @Override + public void correctlyClassified(LanguageSample reference, + LanguageSample prediction) { + correctCount.incrementAndGet(); + } + + @Override + public void missclassified(LanguageSample reference, + LanguageSample prediction) { + incorrectCount.incrementAndGet(); + } + }); + + LanguageDetectorSampleStream sampleStream = LanguageDetectorMETest.createSampleStream(); + + cv.evaluate(sampleStream, 2); + + Assert.assertEquals(99, cv.getDocumentCount()); + Assert.assertEquals(0.98989898989899, cv.getDocumentAccuracy(), 0.01); + } + +} 
http://git-wip-us.apache.org/repos/asf/opennlp/blob/839d2dea/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java new file mode 100644 index 0000000..8bdd71b --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.langdetect; + +import java.util.concurrent.atomic.AtomicInteger; + +import org.junit.Assert; +import org.junit.Test; + + +public class LanguageDetectorEvaluatorTest { + + @Test + public void processSample() throws Exception { + LanguageDetectorModel model = LanguageDetectorMETest.trainModel(); + LanguageDetectorME langdetector = new LanguageDetectorME(model); + + final AtomicInteger correctCount = new AtomicInteger(); + final AtomicInteger incorrectCount = new AtomicInteger(); + + LanguageDetectorEvaluator evaluator = new LanguageDetectorEvaluator(langdetector, + new LanguageDetectorEvaluationMonitor() { + @Override + public void correctlyClassified(LanguageSample reference, + LanguageSample prediction) { + correctCount.incrementAndGet(); + } + + @Override + public void missclassified(LanguageSample reference, + LanguageSample prediction) { + incorrectCount.incrementAndGet(); + } + }); + + evaluator.evaluateSample(new LanguageSample(new Language("pob"), + "escreve e faz palestras pelo mundo inteiro sobre anjos")); + + evaluator.evaluateSample(new LanguageSample(new Language("fra"), + "escreve e faz palestras pelo mundo inteiro sobre anjos")); + + evaluator.evaluateSample(new LanguageSample(new Language("fra"), + "escreve e faz palestras pelo mundo inteiro sobre anjos")); + + + Assert.assertEquals(1, correctCount.get()); + Assert.assertEquals(2, incorrectCount.get()); + + Assert.assertEquals(3, evaluator.getDocumentCount()); + Assert.assertEquals(0.33, evaluator.getAccuracy(), 0.01); + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/839d2dea/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java new file mode 100644 index 0000000..60afef2 --- 
/dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import opennlp.tools.formats.ResourceAsStreamFactory; +import opennlp.tools.util.PlainTextByLineStream; +import opennlp.tools.util.TrainingParameters; + +public class LanguageDetectorFactoryTest { + + + private static LanguageDetectorModel model; + + @BeforeClass + public static void train() throws Exception { + + ResourceAsStreamFactory streamFactory = new ResourceAsStreamFactory( + LanguageDetectorMETest.class, "/opennlp/tools/doccat/DoccatSample.txt"); + + PlainTextByLineStream lineStream = new PlainTextByLineStream(streamFactory, "UTF-8"); + + LanguageDetectorSampleStream sampleStream = new LanguageDetectorSampleStream(lineStream); + + TrainingParameters params = new TrainingParameters(); + params.put(TrainingParameters.ITERATIONS_PARAM, "100"); + 
params.put(TrainingParameters.CUTOFF_PARAM, "5"); + params.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES"); + + model = LanguageDetectorME.train(sampleStream, params, new DummyFactory()); + } + + @Test + public void testCorrectFactory() throws IOException { + byte[] serialized = LanguageDetectorMETest.serializeModel(model); + + LanguageDetectorModel myModel = new LanguageDetectorModel(new ByteArrayInputStream(serialized)); + + Assert.assertTrue(myModel.getFactory() instanceof DummyFactory); + + } + + @Test + public void testDummyFactory() throws Exception { + byte[] serialized = LanguageDetectorMETest.serializeModel(model); + + LanguageDetectorModel myModel = new LanguageDetectorModel(new ByteArrayInputStream(serialized)); + + Assert.assertTrue(myModel.getFactory() instanceof DummyFactory); + } + + @Test + public void testDummyFactoryContextGenerator() throws Exception { + LanguageDetectorContextGenerator cg = model.getFactory().getContextGenerator(); + String[] context = cg.getContext( + "a dummy text phrase to test if the context generator works!!!!!!!!!!!!"); + + Set<String> set = new HashSet(Arrays.asList(context)); + + Assert.assertTrue(set.contains("!!!!!")); // default normalizer would remove the repeated ! 
+ Assert.assertTrue(set.contains("a dum")); + Assert.assertTrue(set.contains("tg=[THE,CONTEXT,GENERATOR]")); + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/839d2dea/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java new file mode 100644 index 0000000..acdffc1 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package opennlp.tools.langdetect; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import opennlp.tools.formats.ResourceAsStreamFactory; +import opennlp.tools.util.PlainTextByLineStream; +import opennlp.tools.util.TrainingParameters; + + +public class LanguageDetectorMETest { + + private LanguageDetectorModel model; + + @Before + public void init() throws Exception { + + this.model = trainModel(); + + } + + @Test + public void testPredictLanguages() { + LanguageDetector ld = new LanguageDetectorME(this.model); + Language[] languages = ld.predictLanguages("estava em uma marcenaria na Rua Bruno"); + + Assert.assertEquals(4, languages.length); + Assert.assertEquals("pob", languages[0].getLang()); + Assert.assertEquals("ita", languages[1].getLang()); + Assert.assertEquals("spa", languages[2].getLang()); + Assert.assertEquals("fra", languages[3].getLang()); + } + + @Test + public void testPredictLanguage() { + LanguageDetector ld = new LanguageDetectorME(this.model); + Language language = ld.predictLanguage("Dove è meglio che giochi"); + + Assert.assertEquals("ita", language.getLang()); + } + + @Test + public void testSupportedLanguages() { + + LanguageDetector ld = new LanguageDetectorME(this.model); + String[] supportedLanguages = ld.getSupportedLanguages(); + + Assert.assertEquals(4, supportedLanguages.length); + } + + @Test + public void testLoadFromSerialized() throws IOException { + byte[] serialized = serializeModel(model); + + LanguageDetectorModel myModel = new LanguageDetectorModel(new ByteArrayInputStream(serialized)); + + Assert.assertNotNull(myModel); + + } + + protected static byte[] serializeModel(LanguageDetectorModel model) throws IOException { + + ByteArrayOutputStream out = new ByteArrayOutputStream(); + model.serialize(out); + return out.toByteArray(); + } + + public static LanguageDetectorModel 
trainModel() throws Exception { + return trainModel(new LanguageDetectorFactory()); + } + + public static LanguageDetectorModel trainModel(LanguageDetectorFactory factory) throws Exception { + + + LanguageDetectorSampleStream sampleStream = createSampleStream(); + + TrainingParameters params = new TrainingParameters(); + params.put(TrainingParameters.ITERATIONS_PARAM, 100); + params.put(TrainingParameters.CUTOFF_PARAM, 15); + params.put("DataIndexer", "TwoPass"); + params.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES"); + + return LanguageDetectorME.train(sampleStream, params, factory); + } + + public static LanguageDetectorSampleStream createSampleStream() throws IOException { + + ResourceAsStreamFactory streamFactory = new ResourceAsStreamFactory( + LanguageDetectorMETest.class, "/opennlp/tools/doccat/DoccatSample.txt"); + + PlainTextByLineStream lineStream = new PlainTextByLineStream(streamFactory, "UTF-8"); + + return new LanguageDetectorSampleStream(lineStream); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/839d2dea/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java new file mode 100644 index 0000000..7d12581 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + + +import org.junit.Assert; +import org.junit.Test; + + +public class LanguageSampleTest { + + @Test + public void testConstructor() { + Language lang = new Language("aLang"); + CharSequence context = "aContext"; + + LanguageSample sample = new LanguageSample(lang, context); + + Assert.assertEquals(lang, sample.getLanguage()); + Assert.assertEquals(context, sample.getContext()); + } + + @Test(expected = NullPointerException.class) + public void testNullLang() throws Exception { + CharSequence context = "aContext"; + + new LanguageSample(null, context); + } + + @Test(expected = NullPointerException.class) + public void testNullContext() { + Language lang = new Language("aLang"); + + new LanguageSample(lang, null); + } + + @Test + public void testToString() { + Language lang = new Language("aLang"); + CharSequence context = "aContext"; + + LanguageSample sample = new LanguageSample(lang, context); + + Assert.assertEquals(lang.getLang() + "\t" + context, sample.toString()); + } + + @Test + public void testHash() { + + int hashA = new LanguageSample(new Language("aLang"), "aContext").hashCode(); + int hashB = new LanguageSample(new Language("bLang"), "aContext").hashCode(); + int hashC = new LanguageSample(new Language("aLang"), "bContext").hashCode(); + + Assert.assertNotEquals(hashA, hashB); + Assert.assertNotEquals(hashA, hashC); + Assert.assertNotEquals(hashB, hashC); + } + + @Test + public void testEquals() throws Exception { + + LanguageSample sampleA = new LanguageSample(new Language("aLang"), "aContext"); 
+ LanguageSample sampleA1 = new LanguageSample(new Language("aLang"), "aContext"); + LanguageSample sampleB = new LanguageSample(new Language("bLang"), "aContext"); + LanguageSample sampleC = new LanguageSample(new Language("aLang"), "bContext"); + + Assert.assertEquals(sampleA, sampleA); + Assert.assertEquals(sampleA, sampleA1); + Assert.assertNotEquals(sampleA, sampleB); + Assert.assertNotEquals(sampleA, sampleC); + Assert.assertNotEquals(sampleB, sampleC); + Assert.assertFalse(sampleA.equals("something else")); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/839d2dea/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java new file mode 100644 index 0000000..dc25bc6 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.langdetect; + +import org.junit.Assert; +import org.junit.Test; + + +public class LanguageTest { + + + @Test + public void emptyConfidence() throws Exception { + String languageCode = "aLanguage"; + Language lang = new Language(languageCode); + + Assert.assertEquals(languageCode, lang.getLang()); + Assert.assertEquals(0, lang.getConfidence(), 0); + } + + @Test + public void nonEmptyConfidence() throws Exception { + String languageCode = "aLanguage"; + double confidence = 0.05; + Language lang = new Language(languageCode, confidence); + + Assert.assertEquals(languageCode, lang.getLang()); + Assert.assertEquals(confidence, lang.getConfidence(), 0); + } + + @Test(expected = NullPointerException.class) + public void emptyLanguage() throws Exception { + new Language(null); + } + + @Test(expected = NullPointerException.class) + public void emptyLanguageConfidence() throws Exception { + new Language(null, 0.05); + } + + @Test + public void testToString() { + Language lang = new Language("aLang"); + + Assert.assertEquals("aLang (0.0)", lang.toString()); + + lang = new Language("aLang", 0.0886678); + + Assert.assertEquals("aLang (0.0886678)", lang.toString()); + } + + + @Test + public void testHash() { + int hashA = new Language("aLang").hashCode(); + int hashAA = new Language("aLang").hashCode(); + int hashB = new Language("BLang").hashCode(); + int hashA5 = new Language("aLang", 5.0).hashCode(); + int hashA6 = new Language("BLang", 6.0).hashCode(); + + Assert.assertEquals(hashA, hashAA); + + Assert.assertNotEquals(hashA, hashB); + Assert.assertNotEquals(hashA, hashA5); + Assert.assertNotEquals(hashB, hashA5); + Assert.assertNotEquals(hashA5, hashA6); + } + + @Test + public void testEquals() { + Language langA = new Language("langA"); + Language langB = new Language("langB"); + Language langA5 = new Language("langA5", 5.0); + Language langA6 = new Language("langA5", 6.0); + + Assert.assertEquals(langA, langA); + Assert.assertEquals(langA5, 
langA5); + + Assert.assertNotEquals(langA, langA5); + Assert.assertNotEquals(langA, langB); + + Assert.assertEquals(langA6, langA5); + + Assert.assertNotEquals(langA, "something else"); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/839d2dea/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java new file mode 100644 index 0000000..0f8dfe7 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.util.normalizer; + +import org.junit.Assert; +import org.junit.Test; + + +public class EmojiCharSequenceNormalizerTest { + + public EmojiCharSequenceNormalizer normalizer = EmojiCharSequenceNormalizer.getInstance(); + + @Test + public void normalizeEmoji() throws Exception { + + String s = new StringBuilder() + .append("Any funny text goes here ") + .appendCodePoint(0x1F606) + .appendCodePoint(0x1F606) + .appendCodePoint(0x1F606) + .append(" ") + .appendCodePoint(0x1F61B) + .toString(); + Assert.assertEquals( + "Any funny text goes here ", normalizer.normalize(s)); + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/839d2dea/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java new file mode 100644 index 0000000..50b1f0c --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import org.junit.Assert; +import org.junit.Test; + + +public class NumberCharSequenceNormalizerTest { + + public NumberCharSequenceNormalizer normalizer = NumberCharSequenceNormalizer.getInstance(); + + + @Test + public void normalize() throws Exception { + Assert.assertEquals("absc , abcd", normalizer.normalize("absc 123,0123 abcd")); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/839d2dea/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java new file mode 100644 index 0000000..95cf300 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.util.normalizer; + +import org.junit.Assert; +import org.junit.Test; + + +public class ShrinkCharSequenceNormalizerTest { + + public ShrinkCharSequenceNormalizer normalizer = ShrinkCharSequenceNormalizer.getInstance(); + + @Test + public void normalizeSpace() throws Exception { + Assert.assertEquals( + "a text extra space", normalizer.normalize("a text extra space")); + } + + @Test + public void normalizeChar() throws Exception { + Assert.assertEquals("Helloo", normalizer.normalize("Helllllloooooo")); + Assert.assertEquals("Hello", normalizer.normalize("Hello")); + Assert.assertEquals("HHello", normalizer.normalize("HHello")); + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/839d2dea/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java new file mode 100644 index 0000000..f0bd517 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.util.normalizer; + +import org.junit.Assert; +import org.junit.Test; + + +public class TwitterCharSequenceNormalizerTest { + + public TwitterCharSequenceNormalizer normalizer = TwitterCharSequenceNormalizer.getInstance(); + + @Test + public void normalizeHashtag() throws Exception { + Assert.assertEquals("asdf 2nnfdf", normalizer.normalize("asdf #hasdk23 2nnfdf")); + } + + @Test + public void normalizeUser() throws Exception { + Assert.assertEquals("asdf 2nnfdf", normalizer.normalize("asdf @hasdk23 2nnfdf")); + } + + @Test + public void normalizeRT() throws Exception { + Assert.assertEquals(" 2nnfdf", normalizer.normalize("RT RT RT 2nnfdf")); + } + + @Test + public void normalizeLaugh() throws Exception { + Assert.assertEquals("ahahah", normalizer.normalize("ahahahah")); + Assert.assertEquals("haha", normalizer.normalize("hahha")); + Assert.assertEquals("haha", normalizer.normalize("hahaa")); + Assert.assertEquals("ahaha", normalizer.normalize("ahahahahhahahhahahaaaa")); + Assert.assertEquals("jaja", normalizer.normalize("jajjajajaja")); + } + + + + @Test + public void normalizeFace() throws Exception { + Assert.assertEquals("hello hello", normalizer.normalize("hello :-) hello")); + Assert.assertEquals("hello hello", normalizer.normalize("hello ;) hello")); + Assert.assertEquals(" hello", normalizer.normalize(":) hello")); + Assert.assertEquals("hello ", normalizer.normalize("hello :P")); + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/839d2dea/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java new file mode 
100644 index 0000000..72eb83a --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.util.normalizer; + +import org.junit.Assert; +import org.junit.Test; + + +public class UrlCharSequenceNormalizerTest { + + public UrlCharSequenceNormalizer normalizer = UrlCharSequenceNormalizer.getInstance(); + + @Test + public void normalizeUrl() throws Exception { + Assert.assertEquals( + "asdf 2nnfdf", normalizer.normalize("asdf http://asdf.com/dfa/cxs 2nnfdf")); + + + Assert.assertEquals( + "asdf 2nnfdf ", normalizer.normalize("asdf http://asdf.com/dfa/cx" + + "s 2nnfdf http://asdf.com/dfa/cxs")); + } + + @Test + public void normalizeEmail() throws Exception { + Assert.assertEquals( + "asdf 2nnfdf", normalizer.normalize("asdf [email protected] 2nnfdf")); + Assert.assertEquals( + "asdf 2nnfdf ", normalizer.normalize("asdf [email protected]" + + " 2nnfdf [email protected]")); + } +}
