OPENNLP-788: Add LanguageDetector tool closes #143
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/a189d4ec Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/a189d4ec Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/a189d4ec Branch: refs/heads/LangDetect Commit: a189d4ecc7edd3639dd8cc68376fe04e05cdb265 Parents: 911d59f Author: William D C M SILVA <[email protected]> Authored: Wed May 17 13:34:21 2017 -0300 Committer: William D C M SILVA <[email protected]> Committed: Wed May 17 13:34:21 2017 -0300 ---------------------------------------------------------------------- NOTICE | 7 + .../main/java/opennlp/tools/cmdline/CLI.java | 10 + .../cmdline/FineGrainedReportListener.java | 13 +- .../tools/cmdline/StreamFactoryRegistry.java | 2 + .../LanguageDetectorConverterTool.java | 28 ++ .../LanguageDetectorCrossValidatorTool.java | 123 ++++++++ ...LanguageDetectorEvaluationErrorListener.java | 54 ++++ .../LanguageDetectorEvaluatorTool.java | 139 +++++++++ ...nguageDetectorFineGrainedReportListener.java | 70 +++++ .../langdetect/LanguageDetectorModelLoader.java | 42 +++ .../langdetect/LanguageDetectorTool.java | 88 ++++++ .../langdetect/LanguageDetectorTrainerTool.java | 83 ++++++ .../cmdline/langdetect/TrainingParams.java | 40 +++ .../LanguageDetectorSampleStreamFactory.java | 66 +++++ .../java/opennlp/tools/langdetect/Language.java | 73 +++++ .../tools/langdetect/LanguageDetector.java | 31 ++ .../LanguageDetectorContextGenerator.java | 79 +++++ .../LanguageDetectorCrossValidator.java | 107 +++++++ .../LanguageDetectorEvaluationMonitor.java | 28 ++ .../langdetect/LanguageDetectorEvaluator.java | 99 +++++++ .../langdetect/LanguageDetectorEventStream.java | 69 +++++ .../langdetect/LanguageDetectorFactory.java | 53 ++++ .../tools/langdetect/LanguageDetectorME.java | 97 ++++++ .../tools/langdetect/LanguageDetectorModel.java | 82 +++++ .../LanguageDetectorSampleStream.java | 56 ++++ .../tools/langdetect/LanguageSample.java | 75 +++++ 
.../AggregateCharSequenceNormalizer.java | 39 +++ .../util/normalizer/CharSequenceNormalizer.java | 23 ++ .../normalizer/EmojiCharSequenceNormalizer.java | 38 +++ .../NumberCharSequenceNormalizer.java | 36 +++ .../ShrinkCharSequenceNormalizer.java | 40 +++ .../TwitterCharSequenceNormalizer.java | 50 ++++ .../UnicodeCharSequenceNormalizer.java | 297 +++++++++++++++++++ .../normalizer/UrlCharSequenceNormalizer.java | 40 +++ .../normalizer/unicode_normalizer.properties | 154 ++++++++++ .../opennlp/tools/langdetect/DummyFactory.java | 33 +++ .../LanguageDetectorContextGeneratorTest.java | 50 ++++ .../LanguageDetectorCrossValidatorTest.java | 63 ++++ .../LanguageDetectorEvaluatorTest.java | 68 +++++ .../langdetect/LanguageDetectorFactoryTest.java | 75 +++++ .../langdetect/LanguageDetectorMETest.java | 114 +++++++ .../tools/langdetect/LanguageSampleTest.java | 89 ++++++ .../opennlp/tools/langdetect/LanguageTest.java | 101 +++++++ .../EmojiCharSequenceNormalizerTest.java | 43 +++ .../NumberCharSequenceNormalizerTest.java | 32 ++ .../ShrinkCharSequenceNormalizerTest.java | 41 +++ .../TwitterCharSequenceNormalizerTest.java | 62 ++++ .../UnicodeCharSequenceNormalizerTest.java | 263 ++++++++++++++++ .../UrlCharSequenceNormalizerTest.java | 47 +++ 49 files changed, 3410 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/NOTICE ---------------------------------------------------------------------- diff --git a/NOTICE b/NOTICE index c0b8394..36d90e2 100644 --- a/NOTICE +++ b/NOTICE @@ -10,3 +10,10 @@ opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball were developed by Martin Porter and Richard Boulton. The full snowball package is available from http://snowball.tartarus.org/ + + +The Language Detector normalizer in +opennlp.tools.util.normalizer.UnicodeCharSequenceNormalizer.java +and its tests and resources were developed by Shuyo Nakatani. 
+The full Language Detector package is available from +https://github.com/shuyo/language-detection http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java index b575f71..9cef28b 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java @@ -37,6 +37,10 @@ import opennlp.tools.cmdline.doccat.DoccatEvaluatorTool; import opennlp.tools.cmdline.doccat.DoccatTool; import opennlp.tools.cmdline.doccat.DoccatTrainerTool; import opennlp.tools.cmdline.entitylinker.EntityLinkerTool; +import opennlp.tools.cmdline.langdetect.LanguageDetectorCrossValidatorTool; +import opennlp.tools.cmdline.langdetect.LanguageDetectorEvaluatorTool; +import opennlp.tools.cmdline.langdetect.LanguageDetectorTool; +import opennlp.tools.cmdline.langdetect.LanguageDetectorTrainerTool; import opennlp.tools.cmdline.languagemodel.NGramLanguageModelTool; import opennlp.tools.cmdline.lemmatizer.LemmatizerEvaluatorTool; import opennlp.tools.cmdline.lemmatizer.LemmatizerMETool; @@ -90,6 +94,12 @@ public final class CLI { tools.add(new DoccatCrossValidatorTool()); tools.add(new DoccatConverterTool()); + // Language Detector + tools.add(new LanguageDetectorTool()); + tools.add(new LanguageDetectorTrainerTool()); + tools.add(new LanguageDetectorCrossValidatorTool()); + tools.add(new LanguageDetectorEvaluatorTool()); + // Dictionary Builder tools.add(new DictionaryBuilderTool()); http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java ---------------------------------------------------------------------- diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java index 714561a..75b84aa 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java @@ -802,8 +802,8 @@ public abstract class FineGrainedReportListener { } } - public void add(String[] text, String ref, String pred) { - int length = text.length; + public void add(int length, String ref, String pred) { + averageSentenceLength.add(length); if (minimalSentenceLength > length) { @@ -820,7 +820,16 @@ public abstract class FineGrainedReportListener { updateTagFMeasure(refs, preds); commit("", ref, pred); + } + + public void add(String[] text, String ref, String pred) { + int length = text.length; + this.add(length, ref, pred); + } + public void add(CharSequence text, String ref, String pred) { + int length = text.length(); + this.add(length, ref, pred); } http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java index 9977519..d1e8c89 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java @@ -29,6 +29,7 @@ import opennlp.tools.formats.ConllXSentenceSampleStreamFactory; import opennlp.tools.formats.ConllXTokenSampleStreamFactory; import opennlp.tools.formats.DocumentSampleStreamFactory; import opennlp.tools.formats.EvalitaNameSampleStreamFactory; +import opennlp.tools.formats.LanguageDetectorSampleStreamFactory; import 
opennlp.tools.formats.LeipzigDocumentSampleStreamFactory; import opennlp.tools.formats.LemmatizerSampleStreamFactory; import opennlp.tools.formats.NameSampleDataStreamFactory; @@ -75,6 +76,7 @@ public final class StreamFactoryRegistry { TokenSampleStreamFactory.registerFactory(); WordTagSampleStreamFactory.registerFactory(); LemmatizerSampleStreamFactory.registerFactory(); + LanguageDetectorSampleStreamFactory.registerFactory(); NameToSentenceSampleStreamFactory.registerFactory(); NameToTokenSampleStreamFactory.registerFactory(); http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java new file mode 100644 index 0000000..69d9db7 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.cmdline.langdetect; + +import opennlp.tools.cmdline.AbstractConverterTool; +import opennlp.tools.langdetect.LanguageSample; + +public class LanguageDetectorConverterTool extends AbstractConverterTool<LanguageSample> { + + public LanguageDetectorConverterTool() { + super(LanguageSample.class); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java new file mode 100644 index 0000000..bf68fbb --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.cmdline.langdetect; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.LinkedList; +import java.util.List; + +import opennlp.tools.cmdline.AbstractCrossValidatorTool; +import opennlp.tools.cmdline.CmdLineUtil; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.cmdline.params.CVParams; +import opennlp.tools.cmdline.params.FineGrainedEvaluatorParams; +import opennlp.tools.langdetect.LanguageDetectorCrossValidator; +import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor; +import opennlp.tools.langdetect.LanguageDetectorFactory; +import opennlp.tools.langdetect.LanguageSample; +import opennlp.tools.util.eval.EvaluationMonitor; +import opennlp.tools.util.model.ModelUtil; + +public final class LanguageDetectorCrossValidatorTool extends + AbstractCrossValidatorTool<LanguageSample, + LanguageDetectorCrossValidatorTool.CVToolParams> { + + interface CVToolParams extends CVParams, TrainingParams, FineGrainedEvaluatorParams { + } + + public LanguageDetectorCrossValidatorTool() { + super(LanguageSample.class, CVToolParams.class); + } + + public String getShortDescription() { + return "K-fold cross validator for the learnable Language Detector"; + } + + public void run(String format, String[] args) { + super.run(format, args); + + mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false); + if (mlParams == null) { + mlParams = ModelUtil.createDefaultTrainingParameters(); + } + + List<EvaluationMonitor<LanguageSample>> listeners = new LinkedList<>(); + if (params.getMisclassified()) { + listeners.add(new LanguageDetectorEvaluationErrorListener()); + } + + LanguageDetectorFineGrainedReportListener reportListener = null; + File reportFile = params.getReportOutputFile(); + OutputStream reportOutputStream = null; + if (reportFile != null) { + CmdLineUtil.checkOutputFile("Report 
Output File", reportFile); + try { + reportOutputStream = new FileOutputStream(reportFile); + reportListener = new LanguageDetectorFineGrainedReportListener(reportOutputStream); + listeners.add(reportListener); + } catch (FileNotFoundException e) { + throw createTerminationIOException(e); + } + } + + LanguageDetectorEvaluationMonitor[] listenersArr = listeners + .toArray(new LanguageDetectorEvaluationMonitor[listeners.size()]); + + LanguageDetectorCrossValidator validator; + try { + LanguageDetectorFactory factory = LanguageDetectorFactory.create(params.getFactory()); + validator = new LanguageDetectorCrossValidator(mlParams, + factory, listenersArr); + + validator.evaluate(sampleStream, params.getFolds()); + } catch (IOException e) { + throw new TerminateToolException(-1, + "IO error while reading training data or indexing data: " + e.getMessage(), e); + } finally { + try { + sampleStream.close(); + } catch (IOException e) { + // sorry that this can fail + } + } + + System.out.println("done"); + + if (reportListener != null) { + System.out.println("Writing fine-grained report to " + + params.getReportOutputFile().getAbsolutePath()); + reportListener.writeReport(); + + try { + // TODO: is it a problem to close the stream now? 
+ reportOutputStream.close(); + } catch (IOException e) { + // nothing to do + } + } + + System.out.println(); + + System.out.println("Accuracy: " + validator.getDocumentAccuracy() + "\n" + + "Number of documents: " + validator.getDocumentCount()); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java new file mode 100644 index 0000000..073ef31 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.cmdline.langdetect; + +import java.io.OutputStream; + +import opennlp.tools.cmdline.EvaluationErrorPrinter; +import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor; +import opennlp.tools.langdetect.LanguageSample; +import opennlp.tools.util.eval.EvaluationMonitor; + +/** + * A default implementation of {@link EvaluationMonitor} that prints to an + * output stream. + * + */ +public class LanguageDetectorEvaluationErrorListener extends + EvaluationErrorPrinter<LanguageSample> implements LanguageDetectorEvaluationMonitor { + + /** + * Creates a listener that will print to System.err + */ + public LanguageDetectorEvaluationErrorListener() { + super(System.err); + } + + /** + * Creates a listener that will print to a given {@link OutputStream} + */ + public LanguageDetectorEvaluationErrorListener(OutputStream outputStream) { + super(outputStream); + } + + @Override + public void missclassified(LanguageSample reference, LanguageSample prediction) { + printError(reference, prediction); + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java new file mode 100644 index 0000000..fb929bf --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.cmdline.langdetect; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.LinkedList; +import java.util.List; + +import opennlp.tools.cmdline.AbstractEvaluatorTool; +import opennlp.tools.cmdline.CmdLineUtil; +import opennlp.tools.cmdline.PerformanceMonitor; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.cmdline.params.EvaluatorParams; +import opennlp.tools.cmdline.params.FineGrainedEvaluatorParams; +import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor; +import opennlp.tools.langdetect.LanguageDetectorEvaluator; +import opennlp.tools.langdetect.LanguageDetectorME; +import opennlp.tools.langdetect.LanguageDetectorModel; +import opennlp.tools.langdetect.LanguageSample; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.eval.EvaluationMonitor; + +public final class LanguageDetectorEvaluatorTool extends + AbstractEvaluatorTool<LanguageSample, LanguageDetectorEvaluatorTool.EvalToolParams> { + + interface EvalToolParams extends EvaluatorParams, FineGrainedEvaluatorParams { + } + + public LanguageDetectorEvaluatorTool() { + super(LanguageSample.class, EvalToolParams.class); + } + + public String getShortDescription() { + return "Measures the performance of the Language Detector model with the 
reference data"; + } + + public void run(String format, String[] args) { + super.run(format, args); + + LanguageDetectorModel model = new LanguageDetectorModelLoader().load(params.getModel()); + + List<EvaluationMonitor<LanguageSample>> listeners = new LinkedList<>(); + if (params.getMisclassified()) { + listeners.add(new LanguageDetectorEvaluationErrorListener()); + } + + LanguageDetectorFineGrainedReportListener reportListener = null; + File reportFile = params.getReportOutputFile(); + OutputStream reportOutputStream = null; + if (reportFile != null) { + CmdLineUtil.checkOutputFile("Report Output File", reportFile); + try { + reportOutputStream = new FileOutputStream(reportFile); + reportListener = new LanguageDetectorFineGrainedReportListener(reportOutputStream); + listeners.add(reportListener); + } catch (FileNotFoundException e) { + throw new TerminateToolException(-1, + "IO error while creating LanguageDetector fine-grained report file: " + + e.getMessage()); + } + } + + LanguageDetectorEvaluator evaluator = new LanguageDetectorEvaluator( + new LanguageDetectorME(model), + listeners.toArray(new LanguageDetectorEvaluationMonitor[listeners.size()])); + + final PerformanceMonitor monitor = new PerformanceMonitor("doc"); + + ObjectStream<LanguageSample> measuredSampleStream = new ObjectStream<LanguageSample>() { + + public LanguageSample read() throws IOException { + monitor.incrementCounter(); + return sampleStream.read(); + } + + public void reset() throws IOException { + sampleStream.reset(); + } + + public void close() throws IOException { + sampleStream.close(); + } + }; + + monitor.startAndPrintThroughput(); + + try { + evaluator.evaluate(measuredSampleStream); + } catch (IOException e) { + System.err.println("failed"); + throw new TerminateToolException(-1, "IO error while reading test data: " + + e.getMessage(), e); + } finally { + try { + measuredSampleStream.close(); + } catch (IOException e) { + // sorry that this can fail + } + } + + 
monitor.stopAndPrintFinalResult(); + + System.out.println(); + + System.out.println(evaluator); + + if (reportListener != null) { + System.out.println("Writing fine-grained report to " + + params.getReportOutputFile().getAbsolutePath()); + reportListener.writeReport(); + + try { + // TODO: is it a problem to close the stream now? + reportOutputStream.close(); + } catch (IOException e) { + // nothing to do + } + } + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java new file mode 100644 index 0000000..70bf3eb --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.cmdline.langdetect; + +import java.io.OutputStream; + +import opennlp.tools.cmdline.FineGrainedReportListener; +import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor; +import opennlp.tools.langdetect.LanguageSample; + +/** + * Generates a detailed report for the Language Detector. + * <p> + * It is possible to use it from an API and access the statistics using the + * provided getters + */ +public class LanguageDetectorFineGrainedReportListener + extends FineGrainedReportListener implements LanguageDetectorEvaluationMonitor { + + /** + * Creates a listener that will print to {@link System#err} + */ + public LanguageDetectorFineGrainedReportListener() { + this(System.err); + } + + /** + * Creates a listener that prints to a given {@link OutputStream} + */ + public LanguageDetectorFineGrainedReportListener(OutputStream outputStream) { + super(outputStream); + } + + // methods inherited from EvaluationMonitor + + public void missclassified(LanguageSample reference, LanguageSample prediction) { + statsAdd(reference, prediction); + } + + public void correctlyClassified(LanguageSample reference, LanguageSample prediction) { + statsAdd(reference, prediction); + } + + private void statsAdd(LanguageSample reference, LanguageSample prediction) { + getStats().add(reference.getContext(), + reference.getLanguage().getLang(), prediction.getLanguage().getLang()); + } + + public void writeReport() { + printGeneralStatistics(); + printTagsErrorRank(); + printGeneralConfusionTable(); + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java new file mode 100644 index
0000000..c8700fd --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.cmdline.langdetect; + +import java.io.IOException; +import java.io.InputStream; + +import opennlp.tools.cmdline.ModelLoader; +import opennlp.tools.langdetect.LanguageDetectorModel; + +/** + * Loads a Language Detector Model for the command line tools. + * <p> + * <b>Note:</b> Do not use this class, internal use only! 
+ */ +public class LanguageDetectorModelLoader extends ModelLoader<LanguageDetectorModel> { + + public LanguageDetectorModelLoader() { + super("Language Detector"); + } + + @Override + protected LanguageDetectorModel loadModel(InputStream modelIn) throws IOException { + return new LanguageDetectorModel(modelIn); + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java new file mode 100644 index 0000000..6175fe3 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.cmdline.langdetect; + +import java.io.File; +import java.io.IOException; + +import opennlp.tools.cmdline.BasicCmdLineTool; +import opennlp.tools.cmdline.CLI; +import opennlp.tools.cmdline.CmdLineUtil; +import opennlp.tools.cmdline.PerformanceMonitor; +import opennlp.tools.cmdline.SystemInputStreamFactory; +import opennlp.tools.langdetect.Language; +import opennlp.tools.langdetect.LanguageDetector; +import opennlp.tools.langdetect.LanguageDetectorME; +import opennlp.tools.langdetect.LanguageDetectorModel; +import opennlp.tools.langdetect.LanguageSample; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.ParagraphStream; +import opennlp.tools.util.PlainTextByLineStream; + +public class LanguageDetectorTool extends BasicCmdLineTool { + + @Override + public String getShortDescription() { + return "learned language detector"; + } + + @Override + public String getHelp() { + return "Usage: " + CLI.CMD + " " + getName() + " model < documents"; + } + + @Override + public void run(String[] args) { + + if (0 == args.length) { + System.out.println(getHelp()); + } else { + + LanguageDetectorModel model = new LanguageDetectorModelLoader().load(new File(args[0])); + + LanguageDetector langDetectME = new LanguageDetectorME(model); + + /* + * moved initialization to the try block to catch new IOException + */ + ObjectStream<String> documentStream; + + PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "doc"); + perfMon.start(); + + try { + documentStream = new ParagraphStream(new PlainTextByLineStream( + new SystemInputStreamFactory(), SystemInputStreamFactory.encoding())); + String document; + while ((document = documentStream.read()) != null) { + + Language lang = langDetectME.predictLanguage(document); + + LanguageSample sample = new LanguageSample(lang, document); + System.out.println(sample.toString()); + + perfMon.incrementCounter(); + } + } catch (IOException e) { + CmdLineUtil.handleStdinIoError(e); + } + + 
perfMon.stopAndPrintFinalResult(); + } + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java new file mode 100644 index 0000000..6735293 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.cmdline.langdetect; + +import java.io.File; +import java.io.IOException; + +import opennlp.tools.cmdline.AbstractTrainerTool; +import opennlp.tools.cmdline.ArgumentParser; +import opennlp.tools.cmdline.CmdLineUtil; +import opennlp.tools.langdetect.LanguageDetectorFactory; +import opennlp.tools.langdetect.LanguageDetectorME; +import opennlp.tools.langdetect.LanguageDetectorModel; +import opennlp.tools.langdetect.LanguageSample; +import opennlp.tools.util.model.ModelUtil; + +public class LanguageDetectorTrainerTool + extends AbstractTrainerTool<LanguageSample, LanguageDetectorTrainerTool.TrainerToolParams> { + + interface TrainerToolParams extends TrainingParams { + @ArgumentParser.ParameterDescription(valueName = "modelFile", description = "output model file.") + File getModel(); + + @ArgumentParser.ParameterDescription(valueName = "paramsFile", description = "training parameters file.") + @ArgumentParser.OptionalParameter() + String getParams(); + } + + public LanguageDetectorTrainerTool() { + super(LanguageSample.class, TrainerToolParams.class); + } + + @Override + public String getShortDescription() { + return "trainer for the learnable language detector"; + } + + @Override + public void run(String format, String[] args) { + super.run(format, args); + + mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false); + if (mlParams == null) { + mlParams = ModelUtil.createDefaultTrainingParameters(); + } + + File modelOutFile = params.getModel(); + + CmdLineUtil.checkOutputFile("language detector model", modelOutFile); + + LanguageDetectorModel model; + try { + LanguageDetectorFactory factory = LanguageDetectorFactory.create(params.getFactory()); + model = LanguageDetectorME.train(sampleStream, mlParams, factory); + } catch (IOException e) { + throw createTerminationIOException(e); + } + finally { + try { + sampleStream.close(); + } catch (IOException e) { + // sorry that this can fail + } + } + + 
CmdLineUtil.writeModel("language detector", modelOutFile, model); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java new file mode 100644 index 0000000..2937c3d --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.cmdline.langdetect; + +import opennlp.tools.cmdline.ArgumentParser.OptionalParameter; +import opennlp.tools.cmdline.ArgumentParser.ParameterDescription; + +/** + * TrainingParams for Language Detector. + * + * Note: Do not use this class, internal use only! 
+ */ +interface TrainingParams { + + @ParameterDescription(valueName = "paramsFile", description = "training parameters file.") + @OptionalParameter() + String getParams(); + + @ParameterDescription(valueName = "factoryName", + description = "A sub-class of LanguageDetectorFactory" + + " where to get implementation and resources.") + @OptionalParameter + String getFactory(); + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java new file mode 100644 index 0000000..ef60063 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats; + +import java.io.IOException; + +import opennlp.tools.cmdline.ArgumentParser; +import opennlp.tools.cmdline.CmdLineUtil; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.params.BasicFormatParams; +import opennlp.tools.doccat.DocumentSampleStream; +import opennlp.tools.langdetect.LanguageDetectorSampleStream; +import opennlp.tools.langdetect.LanguageSample; +import opennlp.tools.util.InputStreamFactory; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.PlainTextByLineStream; + +/** + * Factory producing OpenNLP {@link DocumentSampleStream}s. + */ +public class LanguageDetectorSampleStreamFactory + extends AbstractSampleStreamFactory<LanguageSample> { + + interface Parameters extends BasicFormatParams { + } + + public static void registerFactory() { + StreamFactoryRegistry.registerFactory(LanguageSample.class, + StreamFactoryRegistry.DEFAULT_FORMAT, + new LanguageDetectorSampleStreamFactory(Parameters.class)); + } + + protected <P> LanguageDetectorSampleStreamFactory(Class<P> params) { + super(params); + } + + public ObjectStream<LanguageSample> create(String[] args) { + Parameters params = ArgumentParser.parse(args, Parameters.class); + + CmdLineUtil.checkInputFile("Data", params.getData()); + InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData()); + ObjectStream<String> lineStream = null; + try { + lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding()); + } catch (IOException ex) { + CmdLineUtil.handleCreateObjectStreamError(ex); + } + + return new LanguageDetectorSampleStream(lineStream); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java 
b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java new file mode 100644 index 0000000..f780759 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.langdetect; + +import java.util.Objects; + +/** + * Class for holding the document language and its confidence + */ +public class Language { + private final String lang; + private final double confidence; + + public Language(String lang) { + this(lang, 0); + } + + public Language(String lang, double confidence) { + Objects.requireNonNull(lang, "lang must not be null"); + this.lang = lang; + this.confidence = confidence; + } + + public String getLang() { + return lang; + } + + public double getConfidence() { + return confidence; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(getLang()).append(" (").append(this.confidence).append(")"); + return sb.toString(); + } + + @Override + public int hashCode() { + return Objects.hash(getLang(), getConfidence()); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + + if (obj instanceof Language) { + Language a = (Language) obj; + + return getLang().equals(a.getLang()); + } + + return false; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java new file mode 100644 index 0000000..0004494 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + +/** + * The interface for LanguageDetector which provide the @{@link Language} according to the context. + */ +public interface LanguageDetector { + + Language[] predictLanguages(CharSequence content); + + Language predictLanguage(CharSequence content); + + String[] getSupportedLanguages(); + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java new file mode 100644 index 0000000..c63ba76 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + +import java.util.Collection; +import java.util.LinkedList; + +import opennlp.tools.ngram.NGramModel; +import opennlp.tools.util.StringList; +import opennlp.tools.util.StringUtil; +import opennlp.tools.util.normalizer.AggregateCharSequenceNormalizer; +import opennlp.tools.util.normalizer.CharSequenceNormalizer; +import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer; +import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer; +import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer; +import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer; +import opennlp.tools.util.normalizer.UnicodeCharSequenceNormalizer; +import opennlp.tools.util.normalizer.UrlCharSequenceNormalizer; + +/** + * Context generator for document categorizer + */ +class LanguageDetectorContextGenerator { + + private final int minLength; + private final int maxLength; + private final CharSequenceNormalizer normalizer; + + LanguageDetectorContextGenerator(int minLength, int maxLength) { + this.minLength = minLength; + this.maxLength = maxLength; + + this.normalizer = new AggregateCharSequenceNormalizer( + EmojiCharSequenceNormalizer.getInstance(), + UrlCharSequenceNormalizer.getInstance(), + TwitterCharSequenceNormalizer.getInstance(), + NumberCharSequenceNormalizer.getInstance(), + UnicodeCharSequenceNormalizer.getInstance(), + ShrinkCharSequenceNormalizer.getInstance()); + } + + /** + * Initializes the current instance with min 2 length and max 5 length of ngrams. 
+ */ + LanguageDetectorContextGenerator() { + this(2, 5); + } + + public String[] getContext(String document) { + + Collection<String> context = new LinkedList<>(); + + NGramModel model = new NGramModel(); + String normalized = normalizer.normalize(document).toString(); + model.add(normalized, minLength, maxLength); + + for (StringList tokenList : model) { + if (tokenList.size() > 0) { + context.add("ng=" + StringUtil.toLowerCase(tokenList.getToken(0))); + } + } + return context.toArray(new String[context.size()]); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java new file mode 100644 index 0000000..ce1823a --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.langdetect; + +import java.io.IOException; + +import opennlp.tools.doccat.FeatureGenerator; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.TrainingParameters; +import opennlp.tools.util.eval.CrossValidationPartitioner; +import opennlp.tools.util.eval.Mean; + +/** + * Cross validator for language detector + */ +public class LanguageDetectorCrossValidator { + + private final TrainingParameters params; + + private Mean documentAccuracy = new Mean(); + + private LanguageDetectorEvaluationMonitor[] listeners; + + private LanguageDetectorFactory factory; + + + /** + * Creates a {@link LanguageDetectorCrossValidator} with the given + * {@link FeatureGenerator}s. + */ + public LanguageDetectorCrossValidator(TrainingParameters mlParams, + LanguageDetectorFactory factory, + LanguageDetectorEvaluationMonitor ... listeners) { + this.params = mlParams; + this.listeners = listeners; + this.factory = factory; + } + + /** + * Starts the evaluation. + * + * @param samples + * the data to train and test + * @param nFolds + * number of folds + * + * @throws IOException + */ + public void evaluate(ObjectStream<LanguageSample> samples, int nFolds) + throws IOException { + + CrossValidationPartitioner<LanguageSample> partitioner = + new CrossValidationPartitioner<>(samples, nFolds); + + while (partitioner.hasNext()) { + + CrossValidationPartitioner.TrainingSampleStream<LanguageSample> trainingSampleStream = + partitioner.next(); + + LanguageDetectorModel model = LanguageDetectorME.train( + trainingSampleStream, params, factory); + + LanguageDetectorEvaluator evaluator = new LanguageDetectorEvaluator( + new LanguageDetectorME(model), listeners); + + evaluator.evaluate(trainingSampleStream.getTestSampleStream()); + + documentAccuracy.add(evaluator.getAccuracy(), + evaluator.getDocumentCount()); + + } + } + + /** + * Retrieves the accuracy for all iterations. 
+ * + * @return the word accuracy + */ + public double getDocumentAccuracy() { + return documentAccuracy.mean(); + } + + /** + * Retrieves the number of words which where validated over all iterations. + * The result is the amount of folds multiplied by the total number of words. + * + * @return the word count + */ + public long getDocumentCount() { + return documentAccuracy.count(); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java new file mode 100644 index 0000000..30f3313 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + +import opennlp.tools.util.eval.EvaluationMonitor; + +/** + * {@link EvaluationMonitor} for Language Detector. 
+ */ +public interface LanguageDetectorEvaluationMonitor extends + EvaluationMonitor<LanguageSample> { + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java new file mode 100644 index 0000000..bbf73c3 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + +import opennlp.tools.doccat.DocumentCategorizer; +import opennlp.tools.util.eval.Evaluator; +import opennlp.tools.util.eval.Mean; + +/** + * The {@link LanguageDetectorEvaluator} measures the performance of + * the given {@link LanguageDetector} with the provided reference + * {@link LanguageSample}s. 
+ * + * @see LanguageDetector + * @see LanguageSample + */ +public class LanguageDetectorEvaluator extends Evaluator<LanguageSample> { + + private LanguageDetector languageDetector; + + private Mean accuracy = new Mean(); + + /** + * Initializes the current instance. + * + * @param langDetect the language detector instance + */ + public LanguageDetectorEvaluator(LanguageDetector langDetect, + LanguageDetectorEvaluationMonitor ... listeners) { + super(listeners); + this.languageDetector = langDetect; + } + + /** + * Evaluates the given reference {@link LanguageSample} object. + * + * This is done by categorizing the document from the provided + * {@link LanguageSample}. The detected language is then used + * to calculate and update the score. + * + * @param sample the reference {@link LanguageSample}. + */ + public LanguageSample processSample(LanguageSample sample) { + + CharSequence document = sample.getContext(); + + Language predicted = languageDetector.predictLanguage(document); + + + + if (sample.getLanguage().getLang().equals(predicted.getLang())) { + accuracy.add(1); + } + else { + accuracy.add(0); + } + + return new LanguageSample(predicted, sample.getContext()); + } + + /** + * Retrieves the accuracy of provided {@link DocumentCategorizer}. + * + * accuracy = correctly categorized documents / total documents + * + * @return the accuracy + */ + public double getAccuracy() { + return accuracy.mean(); + } + + public long getDocumentCount() { + return accuracy.count(); + } + + /** + * Represents this objects as human readable {@link String}. 
+ */ + @Override + public String toString() { + return "Accuracy: " + accuracy.mean() + "\n" + + "Number of documents: " + accuracy.count(); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java new file mode 100644 index 0000000..b556a4d --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + +import java.util.Iterator; + +import opennlp.tools.ml.model.Event; +import opennlp.tools.util.AbstractEventStream; +import opennlp.tools.util.ObjectStream; + +/** + * Iterator-like class for modeling language detector events. 
+ */ +public class LanguageDetectorEventStream extends AbstractEventStream<LanguageSample> { + + private LanguageDetectorContextGenerator mContextGenerator; + + /** + * Initializes the current instance via samples and feature generators. + * + * @param data {@link ObjectStream} of {@link LanguageSample}s + */ + public LanguageDetectorEventStream(ObjectStream<LanguageSample> data) { + super(data); + + mContextGenerator = + new LanguageDetectorContextGenerator(); + } + + @Override + protected Iterator<Event> createEvents(final LanguageSample sample) { + + return new Iterator<Event>() { + + private boolean isVirgin = true; + + public boolean hasNext() { + return isVirgin; + } + + public Event next() { + + isVirgin = false; + + return new Event(sample.getLanguage().getLang(), + mContextGenerator.getContext(sample.getContext().toString())); + } + + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java new file mode 100644 index 0000000..5cebbba --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + +import opennlp.tools.util.BaseToolFactory; +import opennlp.tools.util.InvalidFormatException; +import opennlp.tools.util.ext.ExtensionLoader; + + +public class LanguageDetectorFactory extends BaseToolFactory { + + public static LanguageDetectorFactory create(String subclassName) + throws InvalidFormatException { + if (subclassName == null) { + // will create the default factory + return new LanguageDetectorFactory(); + } + try { + LanguageDetectorFactory theFactory = ExtensionLoader.instantiateExtension( + LanguageDetectorFactory.class, subclassName); + theFactory.init(); + return theFactory; + } catch (Exception e) { + String msg = "Could not instantiate the " + subclassName + + ". 
The initialization throw an exception."; + throw new InvalidFormatException(msg, e); + } + } + + public void init() { + // nothing to do + } + + @Override + public void validateArtifactMap() throws InvalidFormatException { + // nothing to validate + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java new file mode 100644 index 0000000..74a1cea --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.langdetect; + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; + +import opennlp.tools.ml.AbstractEventTrainer; +import opennlp.tools.ml.EventTrainer; +import opennlp.tools.ml.TrainerFactory; +import opennlp.tools.ml.model.MaxentModel; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.TrainingParameters; + +/** + * Implements learnable Language Detector + */ +public class LanguageDetectorME implements LanguageDetector { + + private LanguageDetectorModel model; + private LanguageDetectorContextGenerator mContextGenerator; + + /** + * Initializes the current instance with a language detector model. Default feature + * generation is used. + * + * @param model the language detector model + */ + public LanguageDetectorME(LanguageDetectorModel model) { + this.model = model; + this.mContextGenerator = new LanguageDetectorContextGenerator(); + } + + @Override + public Language[] predictLanguages(CharSequence content) { + double[] eval = model.getMaxentModel().eval(mContextGenerator.getContext(content.toString())); + Language[] arr = new Language[eval.length]; + for (int i = 0; i < eval.length; i++) { + arr[i] = new Language(model.getMaxentModel().getOutcome(i), eval[i]); + } + + Arrays.sort(arr, (o1, o2) -> Double.compare(o2.getConfidence(), o1.getConfidence())); + return arr; + } + + @Override + public Language predictLanguage(CharSequence content) { + return predictLanguages(content)[0]; + } + + @Override + public String[] getSupportedLanguages() { + int numberLanguages = model.getMaxentModel().getNumOutcomes(); + String[] languages = new String[numberLanguages]; + for (int i = 0; i < numberLanguages; i++) { + languages[i] = model.getMaxentModel().getOutcome(i); + } + return languages; + } + + + public static LanguageDetectorModel train(ObjectStream<LanguageSample> samples, + TrainingParameters mlParams, + LanguageDetectorFactory factory) + throws IOException { + + 
Map<String, String> manifestInfoEntries = new HashMap<>(); + + mlParams.putIfAbsent(AbstractEventTrainer.DATA_INDEXER_PARAM, + AbstractEventTrainer.DATA_INDEXER_ONE_PASS_VALUE); + + EventTrainer trainer = TrainerFactory.getEventTrainer( + mlParams, manifestInfoEntries); + + MaxentModel model = trainer.train( + new LanguageDetectorEventStream(samples)); + + return new LanguageDetectorModel(model, manifestInfoEntries, factory); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java new file mode 100644 index 0000000..c0d9703 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.langdetect; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.util.Map; + +import opennlp.tools.ml.model.AbstractModel; +import opennlp.tools.ml.model.MaxentModel; +import opennlp.tools.util.BaseToolFactory; +import opennlp.tools.util.InvalidFormatException; +import opennlp.tools.util.model.BaseModel; + +/** + * A model for language detection + */ +public class LanguageDetectorModel extends BaseModel { + + private static final String COMPONENT_NAME = "LanguageDetectorME"; + private static final String LANGDETECT_MODEL_ENTRY_NAME = "langdetect.model"; + + public LanguageDetectorModel(MaxentModel langdetectModel, + Map<String, String> manifestInfoEntries, + LanguageDetectorFactory factory) { + super(COMPONENT_NAME, "und", manifestInfoEntries, factory); + + artifactMap.put(LANGDETECT_MODEL_ENTRY_NAME, langdetectModel); + checkArtifactMap(); + } + + public LanguageDetectorModel(InputStream in) throws IOException { + super(COMPONENT_NAME, in); + } + + public LanguageDetectorModel(File modelFile) throws IOException { + super(COMPONENT_NAME, modelFile); + } + + public LanguageDetectorModel(URL modelURL) throws IOException { + super(COMPONENT_NAME, modelURL); + } + + @Override + protected void validateArtifactMap() throws InvalidFormatException { + super.validateArtifactMap(); + + if (!(artifactMap.get(LANGDETECT_MODEL_ENTRY_NAME) instanceof AbstractModel)) { + throw new InvalidFormatException("Language detector model is incomplete!"); + } + } + + public LanguageDetectorFactory getFactory() { + return (LanguageDetectorFactory) this.toolFactory; + } + + @Override + protected Class<? 
extends BaseToolFactory> getDefaultFactory() { + return LanguageDetectorFactory.class; + } + + public MaxentModel getMaxentModel() { + return (MaxentModel) artifactMap.get(LANGDETECT_MODEL_ENTRY_NAME); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java new file mode 100644 index 0000000..bffb59b --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + +import java.io.IOException; + +import opennlp.tools.util.FilterObjectStream; +import opennlp.tools.util.ObjectStream; + +/** + * This class reads in string encoded training samples, parses them and + * outputs {@link LanguageSample} objects. 
+ * <p> + * Format:<br> + * Each line contains one sample document.<br> + * The language is the first string in the line followed by a tab and the document content.<br> + * Sample line: category-string tab-char document line-break-char(s)<br> + */ +public class LanguageDetectorSampleStream + extends FilterObjectStream<String, LanguageSample> { + + public LanguageDetectorSampleStream(ObjectStream<String> samples) { + super(samples); + } + + public LanguageSample read() throws IOException { + String sampleString = samples.read(); + + if (sampleString != null) { + + int tabIndex = sampleString.indexOf("\t"); + if (tabIndex > 0) { + String lang = sampleString.substring(0, tabIndex); + String context = sampleString.substring(tabIndex + 1); + + return new LanguageSample(new Language(lang), context); + } + } + return null; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java new file mode 100644 index 0000000..6f2fda7 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + +import java.util.Objects; + +/** + * Class which holds a classified document and its @{@link Language}. + */ +public class LanguageSample { + + private final Language language; + private final CharSequence context; + + public LanguageSample(Language language, CharSequence context) { + Objects.requireNonNull(context, "context must not be null"); + Objects.requireNonNull(language, "language must not be null"); + this.language = language; + this.context = context; + } + + public Language getLanguage() { + return language; + } + + public CharSequence getContext() { + return context; + } + + @Override + public String toString() { + + StringBuilder sampleString = new StringBuilder(); + + sampleString.append(language.getLang()).append('\t').append(context); + + return sampleString.toString(); + } + + @Override + public int hashCode() { + return Objects.hash(getContext(), getLanguage()); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + + if (obj instanceof LanguageSample) { + LanguageSample a = (LanguageSample) obj; + + return getLanguage().equals(a.getLanguage()) + && getContext().equals(a.getContext()); + } + + return false; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java 
b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java new file mode 100644 index 0000000..771be19 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package opennlp.tools.util.normalizer; + +public class AggregateCharSequenceNormalizer implements CharSequenceNormalizer { + + private final CharSequenceNormalizer[] normalizers; + + public AggregateCharSequenceNormalizer(CharSequenceNormalizer ... 
normalizers) { + this.normalizers = normalizers; + } + + public CharSequence normalize (CharSequence text) { + + for (CharSequenceNormalizer normalizers : + normalizers) { + text = normalizers.normalize(text); + } + + return text; + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java new file mode 100644 index 0000000..b5c1f3f --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Transforms a text into a normalized form.
 * <p>
 * The interface has a single abstract method and can therefore be implemented with a
 * lambda expression or a method reference.
 */
@FunctionalInterface
public interface CharSequenceNormalizer {

  /**
   * Normalizes the given text.
   *
   * @param text the text to normalize
   * @return the normalized text
   */
  CharSequence normalize(CharSequence text);
}
+ */ + + +package opennlp.tools.util.normalizer; + +import java.util.regex.Pattern; + +public class EmojiCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final EmojiCharSequenceNormalizer INSTANCE = new EmojiCharSequenceNormalizer(); + + public static EmojiCharSequenceNormalizer getInstance() { + return INSTANCE; + } + + private static final Pattern EMOJI_REGEX = + Pattern.compile("[\\uD83C-\\uDBFF\\uDC00-\\uDFFF]+"); + + public CharSequence normalize (CharSequence text) { + String modified = EMOJI_REGEX.matcher(text).replaceAll(" "); + return modified; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java new file mode 100644 index 0000000..6b0452d --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +package opennlp.tools.util.normalizer; + +import java.util.regex.Pattern; + +public class NumberCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final Pattern NUMBER_REGEX = Pattern.compile("\\d+"); + + private static final NumberCharSequenceNormalizer INSTANCE = new NumberCharSequenceNormalizer(); + + public static NumberCharSequenceNormalizer getInstance() { + return INSTANCE; + } + + public CharSequence normalize (CharSequence text) { + return NUMBER_REGEX.matcher(text).replaceAll(" "); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java new file mode 100644 index 0000000..6183367 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +package opennlp.tools.util.normalizer; + +import java.util.regex.Pattern; + +public class ShrinkCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final Pattern REPEATED_CHAR_REGEX = Pattern.compile("(.)\\1{2,}", + Pattern.CASE_INSENSITIVE); + private static final Pattern SPACE_REGEX = Pattern.compile("\\s{2,}", + Pattern.CASE_INSENSITIVE); + + private static final ShrinkCharSequenceNormalizer INSTANCE = new ShrinkCharSequenceNormalizer(); + + public static ShrinkCharSequenceNormalizer getInstance() { + return INSTANCE; + } + + public CharSequence normalize (CharSequence text) { + text = SPACE_REGEX.matcher(text).replaceAll(" "); + return REPEATED_CHAR_REGEX.matcher(text).replaceAll("$1$1").trim(); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a189d4ec/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java new file mode 100644 index 0000000..b5a8625 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package opennlp.tools.util.normalizer; + +import java.util.regex.Pattern; + +public class TwitterCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final Pattern HASH_USER_REGEX = + Pattern.compile("[#@]\\S+"); + + private static final Pattern RT_REGEX = + Pattern.compile("\\b(rt[ :])+", Pattern.CASE_INSENSITIVE); + + private static final Pattern FACE_REGEX = + Pattern.compile("[:;x]-?[()dop]", Pattern.CASE_INSENSITIVE); + + private static final Pattern LAUGH_REGEX = + Pattern.compile("([hj])+([aieou])+(\\1+\\2+)+", Pattern.CASE_INSENSITIVE); + + private static final TwitterCharSequenceNormalizer INSTANCE = new TwitterCharSequenceNormalizer(); + + public static TwitterCharSequenceNormalizer getInstance() { + return INSTANCE; + } + + public CharSequence normalize (CharSequence text) { + String modified = HASH_USER_REGEX.matcher(text).replaceAll(" "); + modified = RT_REGEX.matcher(modified).replaceAll(" "); + modified = FACE_REGEX.matcher(modified).replaceAll(" "); + modified = LAUGH_REGEX.matcher(modified).replaceAll("$1$2$1$2"); + return modified; + } +}
