Repository: opennlp Updated Branches: refs/heads/sentiment [created] a33eb1ee2
OPENNLP-855: New SentimentAnalysisParser This closes #101 Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/a33eb1ee Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/a33eb1ee Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/a33eb1ee Branch: refs/heads/sentiment Commit: a33eb1ee27269056fbd7f7f958f63c12d1fdc308 Parents: 73936c0 Author: amensiko <[email protected]> Authored: Tue Jan 10 22:09:51 2017 +0200 Committer: Jörn Kottmann <[email protected]> Committed: Wed Feb 8 18:18:57 2017 +0100 ---------------------------------------------------------------------- .../main/java/opennlp/tools/cmdline/CLI.java | 8 + .../tools/cmdline/StreamFactoryRegistry.java | 3 + .../sentiment/SentimentCrossValidatorTool.java | 126 +++++++++ .../SentimentDetailedFMeasureListener.java | 43 +++ .../SentimentEvaluationErrorListener.java | 63 +++++ .../sentiment/SentimentEvaluatorTool.java | 154 ++++++++++ .../cmdline/sentiment/SentimentModelLoader.java | 51 ++++ .../cmdline/sentiment/SentimentTrainerTool.java | 113 ++++++++ .../formats/SentimentSampleStreamFactory.java | 83 ++++++ .../sentiment/SentimentContextGenerator.java | 83 ++++++ .../sentiment/SentimentCrossValidator.java | 243 ++++++++++++++++ .../sentiment/SentimentEvaluationMonitor.java | 28 ++ .../tools/sentiment/SentimentEvaluator.java | 67 +++++ .../tools/sentiment/SentimentEventStream.java | 80 ++++++ .../tools/sentiment/SentimentFactory.java | 73 +++++ .../opennlp/tools/sentiment/SentimentME.java | 281 +++++++++++++++++++ .../opennlp/tools/sentiment/SentimentModel.java | 125 +++++++++ .../tools/sentiment/SentimentSample.java | 98 +++++++ .../tools/sentiment/SentimentSampleStream.java | 76 +++++ .../sentiment/SentimentSampleTypeFilter.java | 68 +++++ 20 files changed, 1866 insertions(+) ---------------------------------------------------------------------- 
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a33eb1ee/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java index 9385a18..3b2854c 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java @@ -63,6 +63,9 @@ import opennlp.tools.cmdline.sentdetect.SentenceDetectorCrossValidatorTool; import opennlp.tools.cmdline.sentdetect.SentenceDetectorEvaluatorTool; import opennlp.tools.cmdline.sentdetect.SentenceDetectorTool; import opennlp.tools.cmdline.sentdetect.SentenceDetectorTrainerTool; +import opennlp.tools.cmdline.sentiment.SentimentCrossValidatorTool; +import opennlp.tools.cmdline.sentiment.SentimentEvaluatorTool; +import opennlp.tools.cmdline.sentiment.SentimentTrainerTool; import opennlp.tools.cmdline.tokenizer.DictionaryDetokenizerTool; import opennlp.tools.cmdline.tokenizer.SimpleTokenizerTool; import opennlp.tools.cmdline.tokenizer.TokenizerConverterTool; @@ -148,6 +151,11 @@ public final class CLI { // Entity Linker tools.add(new EntityLinkerTool()); + + // Sentiment Analysis Parser + tools.add(new SentimentTrainerTool()); + tools.add(new SentimentEvaluatorTool()); + tools.add(new SentimentCrossValidatorTool()); // Language Model tools.add(new NGramLanguageModelTool()); http://git-wip-us.apache.org/repos/asf/opennlp/blob/a33eb1ee/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java index 9977519..6026000 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java +++ 
b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java @@ -34,6 +34,7 @@ import opennlp.tools.formats.LemmatizerSampleStreamFactory; import opennlp.tools.formats.NameSampleDataStreamFactory; import opennlp.tools.formats.ParseSampleStreamFactory; import opennlp.tools.formats.SentenceSampleStreamFactory; +import opennlp.tools.formats.SentimentSampleStreamFactory; import opennlp.tools.formats.TokenSampleStreamFactory; import opennlp.tools.formats.WordTagSampleStreamFactory; import opennlp.tools.formats.ad.ADChunkSampleStreamFactory; @@ -115,6 +116,8 @@ public final class StreamFactoryRegistry { ConlluPOSSampleStreamFactory.registerFactory(); ConlluLemmaSampleStreamFactory.registerFactory(); + + SentimentSampleStreamFactory.registerFactory(); } public static final String DEFAULT_FORMAT = "opennlp"; http://git-wip-us.apache.org/repos/asf/opennlp/blob/a33eb1ee/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentCrossValidatorTool.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentCrossValidatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentCrossValidatorTool.java new file mode 100644 index 0000000..05035a4 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentCrossValidatorTool.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.cmdline.sentiment; + +import java.io.IOException; +import java.util.LinkedList; +import java.util.List; + +import opennlp.tools.cmdline.AbstractCrossValidatorTool; +import opennlp.tools.cmdline.CmdLineUtil; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.cmdline.params.BasicTrainingParams; +import opennlp.tools.cmdline.params.CVParams; +import opennlp.tools.cmdline.params.DetailedFMeasureEvaluatorParams; +import opennlp.tools.cmdline.sentiment.SentimentCrossValidatorTool.CVToolParams; +import opennlp.tools.sentiment.SentimentCrossValidator; +import opennlp.tools.sentiment.SentimentEvaluationMonitor; +import opennlp.tools.sentiment.SentimentFactory; +import opennlp.tools.sentiment.SentimentSample; +import opennlp.tools.util.eval.EvaluationMonitor; +import opennlp.tools.util.model.ModelUtil; + +/** + * Class for helping perform cross validation on the Sentiment Analysis Parser. 
+ */ +public class SentimentCrossValidatorTool + extends AbstractCrossValidatorTool<SentimentSample, CVToolParams> { + + /** + * Interface for parameters + */ + interface CVToolParams + extends BasicTrainingParams, CVParams, DetailedFMeasureEvaluatorParams { + + } + + /** + * Constructor + */ + public SentimentCrossValidatorTool() { + super(SentimentSample.class, CVToolParams.class); + } + + /** + * Returns the short description of the tool + * + * @return short description + */ + public String getShortDescription() { + return "K-fold cross validator for the learnable Sentiment Analysis Parser"; + } + + /** + * Runs the tool + * + * @param format + * the format to be used + * @param args + * the arguments + */ + public void run(String format, String[] args) { + super.run(format, args); + + mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), true); + if (mlParams == null) { + mlParams = ModelUtil.createDefaultTrainingParameters(); + } + + List<EvaluationMonitor<SentimentSample>> listeners = new LinkedList<EvaluationMonitor<SentimentSample>>(); + if (params.getMisclassified()) { + listeners.add(new SentimentEvaluationErrorListener()); + } + SentimentDetailedFMeasureListener detailedFListener = null; + if (params.getDetailedF()) { + detailedFListener = new SentimentDetailedFMeasureListener(); + listeners.add(detailedFListener); + } + + SentimentFactory sentimentFactory = new SentimentFactory(); + + SentimentCrossValidator validator; + try { + validator = new SentimentCrossValidator(params.getLang(), mlParams, + sentimentFactory, + listeners.toArray(new SentimentEvaluationMonitor[listeners.size()])); + validator.evaluate(sampleStream, params.getFolds()); + } catch (IOException e) { + throw new TerminateToolException(-1, + "IO error while reading training data or indexing data: " + + e.getMessage(), + e); + } finally { + try { + sampleStream.close(); + } catch (IOException e) { + // sorry that this can fail + } + } + + System.out.println("done"); + + 
System.out.println(); + + if (detailedFListener == null) { + System.out.println(validator.getFMeasure()); + } else { + System.out.println(detailedFListener.toString()); + } + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a33eb1ee/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentDetailedFMeasureListener.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentDetailedFMeasureListener.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentDetailedFMeasureListener.java new file mode 100644 index 0000000..c99fcfc --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentDetailedFMeasureListener.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.cmdline.sentiment; + +import opennlp.tools.cmdline.DetailedFMeasureListener; +import opennlp.tools.sentiment.SentimentEvaluationMonitor; +import opennlp.tools.sentiment.SentimentSample; +import opennlp.tools.util.Span; + +/** + * Class for creating a detailed F-Measure listener + */ +public class SentimentDetailedFMeasureListener + extends DetailedFMeasureListener<SentimentSample> + implements SentimentEvaluationMonitor { + + /** + * Returns the sentiment sample as a span array + * + * @param sample + * the sentiment sample to be returned + * @return span array of the sample + */ + @Override + protected Span[] asSpanArray(SentimentSample sample) { + return null; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a33eb1ee/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentEvaluationErrorListener.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentEvaluationErrorListener.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentEvaluationErrorListener.java new file mode 100644 index 0000000..317a67a --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentEvaluationErrorListener.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.cmdline.sentiment; + +import java.io.OutputStream; + +import opennlp.tools.cmdline.EvaluationErrorPrinter; +import opennlp.tools.sentiment.SentimentSample; +import opennlp.tools.util.eval.EvaluationMonitor; + +/** + * Class for creating an evaluation error listener. + */ +public class SentimentEvaluationErrorListener + extends EvaluationErrorPrinter<SentimentSample> + implements EvaluationMonitor<SentimentSample> { + + /** + * Constructor + */ + public SentimentEvaluationErrorListener() { + super(System.err); + } + + /** + * Constructor + */ + protected SentimentEvaluationErrorListener(OutputStream outputStream) { + super(outputStream); + } + + /** + * Prints the error in case of a misclassification in the evaluator + * + * @param reference + * the sentiment sample reference to be used + * @param prediction + * the sentiment sample prediction + */ + @Override + public void missclassified(SentimentSample reference, + SentimentSample prediction) { + printError(new String[] { reference.getSentiment() }, + new String[] { prediction.getSentiment() }, reference, prediction, + reference.getSentence()); + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a33eb1ee/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentEvaluatorTool.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentEvaluatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentEvaluatorTool.java new file mode 
100644 index 0000000..0572863 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentEvaluatorTool.java @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.cmdline.sentiment; + +import java.io.IOException; +import java.util.LinkedList; +import java.util.List; + +import opennlp.tools.cmdline.AbstractEvaluatorTool; +import opennlp.tools.cmdline.ArgumentParser.OptionalParameter; +import opennlp.tools.cmdline.ArgumentParser.ParameterDescription; +import opennlp.tools.cmdline.PerformanceMonitor; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.cmdline.params.DetailedFMeasureEvaluatorParams; +import opennlp.tools.cmdline.params.EvaluatorParams; +import opennlp.tools.cmdline.sentiment.SentimentEvaluatorTool.EvalToolParams; +import opennlp.tools.sentiment.SentimentEvaluationMonitor; +import opennlp.tools.sentiment.SentimentEvaluator; +import opennlp.tools.sentiment.SentimentME; +import opennlp.tools.sentiment.SentimentModel; +import opennlp.tools.sentiment.SentimentSample; +import opennlp.tools.sentiment.SentimentSampleTypeFilter; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.eval.EvaluationMonitor; + 
+/** + * Class for creating an evaluation tool for sentiment analysis. + */ +public class SentimentEvaluatorTool + extends AbstractEvaluatorTool<SentimentSample, EvalToolParams> { + + /** + * Interface for parameters to be used in evaluation + */ + interface EvalToolParams + extends EvaluatorParams, DetailedFMeasureEvaluatorParams { + @OptionalParameter + @ParameterDescription(valueName = "types", description = "name types to use for evaluation") + String getNameTypes(); + } + + /** + * Constructor + */ + public SentimentEvaluatorTool() { + super(SentimentSample.class, EvalToolParams.class); + } + + /** + * Returns the short description of the tool + * + * @return short description + */ + public String getShortDescription() { + return "Measures the performance of the Sentiment model with the reference data"; + } + + /** + * Runs the tool + * + * @param format + * the format to be used + * @param args + * the arguments + */ + public void run(String format, String[] args) { + super.run(format, args); + + SentimentModel model = new SentimentModelLoader().load(params.getModel()); + // TODO: check EvalToolParams --> getNameTypes() + + List<EvaluationMonitor<SentimentSample>> listeners = new LinkedList<EvaluationMonitor<SentimentSample>>(); + if (params.getMisclassified()) { + listeners.add(new SentimentEvaluationErrorListener()); + } + SentimentDetailedFMeasureListener detailedFListener = null; + if (params.getDetailedF()) { + detailedFListener = new SentimentDetailedFMeasureListener(); + listeners.add(detailedFListener); + } + + if (params.getNameTypes() != null) { + String nameTypes[] = params.getNameTypes().split(","); + sampleStream = new SentimentSampleTypeFilter(nameTypes, sampleStream); + } + + SentimentEvaluator evaluator = new SentimentEvaluator( + new SentimentME(model), + listeners.toArray(new SentimentEvaluationMonitor[listeners.size()])); + + final PerformanceMonitor monitor = new PerformanceMonitor("sent"); + + ObjectStream<SentimentSample> 
measuredSampleStream = new ObjectStream<SentimentSample>() { + + public SentimentSample read() throws IOException { + SentimentSample sample = sampleStream.read(); + if (sample != null) { + monitor.incrementCounter(); + } + return sample; + } + + public void reset() throws IOException { + sampleStream.reset(); + } + + public void close() throws IOException { + sampleStream.close(); + } + }; + + monitor.startAndPrintThroughput(); + + try { + evaluator.evaluate(measuredSampleStream); + } catch (IOException e) { + System.err.println("failed"); + throw new TerminateToolException(-1, + "IO error while reading test data: " + e.getMessage(), e); + } finally { + try { + measuredSampleStream.close(); + } catch (IOException e) { + // sorry that this can fail + } + } + + monitor.stopAndPrintFinalResult(); + + System.out.println(); + + if (detailedFListener == null) { + System.out.println(evaluator.getFMeasure()); + } else { + System.out.println(detailedFListener.toString()); + } + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a33eb1ee/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentModelLoader.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentModelLoader.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentModelLoader.java new file mode 100644 index 0000000..8cf2874 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentModelLoader.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.cmdline.sentiment; + +import java.io.IOException; +import java.io.InputStream; + +import opennlp.tools.cmdline.ModelLoader; +import opennlp.tools.sentiment.SentimentModel; +import opennlp.tools.util.InvalidFormatException; + +/** + * Class for loading a sentiment model. + */ +public class SentimentModelLoader extends ModelLoader<SentimentModel> { + + /** + * Constructor + */ + public SentimentModelLoader() { + super("Sentiment"); + } + + /** + * Loads the sentiment model + * + * @param modelIn + * the input stream model + * @return the model + */ + @Override + protected SentimentModel loadModel(InputStream modelIn) + throws IOException, InvalidFormatException { + return new SentimentModel(modelIn); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a33eb1ee/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentTrainerTool.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentTrainerTool.java new file mode 100644 index 0000000..582ffee --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentiment/SentimentTrainerTool.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.cmdline.sentiment; + +import java.io.File; +import java.io.IOException; + +import opennlp.tools.cmdline.AbstractTrainerTool; +import opennlp.tools.cmdline.CLI; +import opennlp.tools.cmdline.CmdLineUtil; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.cmdline.params.TrainingToolParams; +import opennlp.tools.sentiment.SentimentFactory; +import opennlp.tools.sentiment.SentimentME; +import opennlp.tools.sentiment.SentimentModel; +import opennlp.tools.sentiment.SentimentSample; +import opennlp.tools.util.model.ModelUtil; + +/** + * Class for helping train a sentiment analysis model. 
+ */ +public class SentimentTrainerTool + extends AbstractTrainerTool<SentimentSample, TrainingToolParams> { + + /** + * Constructor + */ + public SentimentTrainerTool() { + super(SentimentSample.class, TrainingToolParams.class); + } + + /** + * Runs the trainer + * + * @param format + * the format to be used + * @param args + * the arguments + */ + @Override + public void run(String format, String[] args) { + super.run(format, args); + if (0 == args.length) { + System.out.println(getHelp()); + } else { + + mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false); + if (mlParams == null) { + mlParams = ModelUtil.createDefaultTrainingParameters(); + } + + File modelOutFile = params.getModel(); + + CmdLineUtil.checkOutputFile("sentiment analysis model", modelOutFile); + + SentimentModel model; + try { + SentimentFactory factory = new SentimentFactory(); + model = SentimentME.train(params.getLang(), sampleStream, mlParams, + factory); + } catch (IOException e) { + throw new TerminateToolException(-1, + "IO error while reading training data or indexing data: " + + e.getMessage(), + e); + } finally { + try { + sampleStream.close(); + } catch (IOException e) { + // sorry that this can fail + } + } + + CmdLineUtil.writeModel("sentiment analysis", modelOutFile, model); + } + } + + /** + * Returns the help message + * + * @return the message + */ + @Override + public String getHelp() { + return "Usage: " + CLI.CMD + " " + getName() + " model < documents"; + } + + /** + * Returns the short description of the programme + * + * @return the description + */ + @Override + public String getShortDescription() { + return "learnable sentiment analysis"; + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a33eb1ee/opennlp-tools/src/main/java/opennlp/tools/formats/SentimentSampleStreamFactory.java ---------------------------------------------------------------------- diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/formats/SentimentSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/SentimentSampleStreamFactory.java new file mode 100644 index 0000000..3396740 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/SentimentSampleStreamFactory.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats; + +import java.io.IOException; + +import opennlp.tools.cmdline.ArgumentParser; +import opennlp.tools.cmdline.CmdLineUtil; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.params.BasicFormatParams; +import opennlp.tools.sentiment.SentimentSample; +import opennlp.tools.sentiment.SentimentSampleStream; +import opennlp.tools.util.InputStreamFactory; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.PlainTextByLineStream; + +/** + * Class for creating a sample stream factory for sentiment analysis. 
+ */ +public class SentimentSampleStreamFactory + extends AbstractSampleStreamFactory<SentimentSample> { + + /** + * The constructor of the class; initialises the factory + * + * @param params + * any given parameters + */ + protected <P> SentimentSampleStreamFactory(Class<P> params) { + super(params); + } + + /** + * Creates a sentiment sample stream from the given arguments + * + * @param args + * the necessary arguments + * @return the SentimentSample stream + */ + @Override + public ObjectStream<SentimentSample> create(String[] args) { + BasicFormatParams params = ArgumentParser.parse(args, + BasicFormatParams.class); + + CmdLineUtil.checkInputFile("Data", params.getData()); + InputStreamFactory sampleDataIn = CmdLineUtil + .createInputStreamFactory(params.getData()); + ObjectStream<String> lineStream = null; + try { + lineStream = new PlainTextByLineStream(sampleDataIn, + params.getEncoding()); + } catch (IOException ex) { + CmdLineUtil.handleCreateObjectStreamError(ex); + } + + return new SentimentSampleStream(lineStream); + } + + /** + * Registers a SentimentSample stream factory + */ + public static void registerFactory() { + StreamFactoryRegistry.registerFactory(SentimentSample.class, + StreamFactoryRegistry.DEFAULT_FORMAT, + new SentimentSampleStreamFactory(BasicFormatParams.class)); + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a33eb1ee/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentContextGenerator.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentContextGenerator.java new file mode 100644 index 0000000..903bb2a --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentContextGenerator.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.sentiment; + +import opennlp.tools.util.BeamSearchContextGenerator; +import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator; + +/** + * Class for using a Context Generator for Sentiment Analysis. + */ +public class SentimentContextGenerator + implements BeamSearchContextGenerator<String> { + + private AdaptiveFeatureGenerator[] featureGenerators; + + public SentimentContextGenerator() { + this(new AdaptiveFeatureGenerator[0]); + } + + public SentimentContextGenerator( + AdaptiveFeatureGenerator[] featureGenerators) { + this.featureGenerators = featureGenerators; + } + + /** + * Returns the context + * + * @param text + * the given text to be returned as context + * @return the text (the context) + */ + public String[] getContext(String text[]) { + return text; + } + + /** + * Returns the context + * + * @param index + * the index of the context + * @param sequence + * String sequence given + * @param priorDecisions + * decisions given earlier + * @param additionalContext + * any additional context + * @return the context + */ + @Override + public String[] getContext(int index, String[] sequence, + String[] priorDecisions, Object[] additionalContext) { + return new String[] {}; + } + + public void updateAdaptiveData(String[] tokens, String[] 
outcomes) { + + if (tokens != null && outcomes != null + && tokens.length != outcomes.length) { + throw new IllegalArgumentException( + "The tokens and outcome arrays MUST have the same size!"); + } + + for (AdaptiveFeatureGenerator featureGenerator : featureGenerators) { + featureGenerator.updateAdaptiveData(tokens, outcomes); + } + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a33eb1ee/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentCrossValidator.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentCrossValidator.java b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentCrossValidator.java new file mode 100644 index 0000000..03b7038 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentCrossValidator.java @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.sentiment; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; + +import opennlp.tools.util.FilterObjectStream; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.TrainingParameters; +import opennlp.tools.util.eval.CrossValidationPartitioner; +import opennlp.tools.util.eval.FMeasure; + +/** + * Class for performing cross validation on the Sentiment Analysis Parser. + */ +public class SentimentCrossValidator { + + /** + * Class for creating a document sample + */ + private class DocumentSample { + + private SentimentSample samples[]; + + /** + * Constructor + */ + DocumentSample(SentimentSample samples[]) { + this.samples = samples; + } + + /** + * Returns the short description of the tool + * + * @return the samples + */ + private SentimentSample[] getSamples() { + return samples; + } + } + + /** + * Reads Sentiment Samples to group them as a document based on the clear + * adaptive data flag. + */ + private class SentimentToDocumentSampleStream + extends FilterObjectStream<SentimentSample, DocumentSample> { + + private SentimentSample beginSample; + + /** + * Constructor + */ + protected SentimentToDocumentSampleStream( + ObjectStream<SentimentSample> samples) { + super(samples); + } + + /** + * Reads Sentiment Samples to group them as a document + * + * @return the resulting DocumentSample + */ + public DocumentSample read() throws IOException { + + List<SentimentSample> document = new ArrayList<SentimentSample>(); + + if (beginSample == null) { + // Assume that the clear flag is set + beginSample = samples.read(); + } + + // Underlying stream is exhausted! 
+ if (beginSample == null) { + return null; + } + + document.add(beginSample); + + SentimentSample sample; + while ((sample = samples.read()) != null) { + + if (sample.isClearAdaptiveDataSet()) { + beginSample = sample; + break; + } + + document.add(sample); + } + + // Underlying stream is exhausted, + // next call must return null + if (sample == null) { + beginSample = null; + } + + return new DocumentSample( + document.toArray(new SentimentSample[document.size()])); + } + + /** + * Performs a reset + * + * @return the resulting DocumentSample + */ + @Override + public void reset() throws IOException, UnsupportedOperationException { + super.reset(); + beginSample = null; + } + } + + /** + * Splits DocumentSample into SentimentSamples. + */ + private class DocumentToSentimentSampleStream + extends FilterObjectStream<DocumentSample, SentimentSample> { + + /** + * Constructor + */ + protected DocumentToSentimentSampleStream( + ObjectStream<DocumentSample> samples) { + super(samples); + } + + private Iterator<SentimentSample> documentSamples = Collections + .<SentimentSample>emptyList().iterator(); + + /** + * Reads Document Sample into SentimentSample + * + * @return the resulting DocumentSample + */ + public SentimentSample read() throws IOException { + + // Note: Empty document samples should be skipped + + if (documentSamples.hasNext()) { + return documentSamples.next(); + } else { + DocumentSample docSample = samples.read(); + + if (docSample != null) { + documentSamples = Arrays.asList(docSample.getSamples()).iterator(); + + return read(); + } else { + return null; + } + } + } + } + + private final String languageCode; + private final TrainingParameters params; + private SentimentEvaluationMonitor[] listeners; + + private SentimentFactory factory; + private FMeasure fmeasure = new FMeasure(); + + /** + * Constructor + */ + public SentimentCrossValidator(String lang, TrainingParameters params, + SentimentFactory factory, SentimentEvaluationMonitor[] monitors) { 
+ + this.languageCode = lang; + this.factory = factory; + this.params = params; + this.listeners = monitors; + } + + /** + * Performs evaluation + * + * @param samples + * stream of SentimentSamples + * @param nFolds + * the number of folds to be used in cross validation + */ + public void evaluate(ObjectStream<SentimentSample> samples, int nFolds) + throws IOException { + + // Note: The sentiment samples need to be grouped on a document basis. + + CrossValidationPartitioner<DocumentSample> partitioner = new CrossValidationPartitioner<DocumentSample>( + new SentimentToDocumentSampleStream(samples), nFolds); + + SentimentModel model = null; + + while (partitioner.hasNext()) { + + CrossValidationPartitioner.TrainingSampleStream<DocumentSample> trainingSampleStream = partitioner + .next(); + + if (factory != null) { + model = SentimentME.train(languageCode, + new DocumentToSentimentSampleStream(trainingSampleStream), params, + factory); + } + + // do testing + SentimentEvaluator evaluator = new SentimentEvaluator( + new SentimentME(model), listeners); + + evaluator.evaluate(new DocumentToSentimentSampleStream( + trainingSampleStream.getTestSampleStream())); + + fmeasure.mergeInto(evaluator.getFMeasure()); + } + } + + /** + * Returns the F-Measure + * + * @return the F-Measure + */ + public FMeasure getFMeasure() { + return fmeasure; + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a33eb1ee/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEvaluationMonitor.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEvaluationMonitor.java b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEvaluationMonitor.java new file mode 100644 index 0000000..ab503f6 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEvaluationMonitor.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more 
+ * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.sentiment; + +import opennlp.tools.util.eval.EvaluationMonitor; + +/** + * Evaluation Monitor to be used by the evaluator + */ +public interface SentimentEvaluationMonitor + extends EvaluationMonitor<SentimentSample> { + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a33eb1ee/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEvaluator.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEvaluator.java b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEvaluator.java new file mode 100644 index 0000000..1eaaaa1 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEvaluator.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.sentiment; + +import opennlp.tools.util.eval.Evaluator; +import opennlp.tools.util.eval.FMeasure; + +/** + * Class for performing evaluation on the Sentiment Analysis Parser. + */ +public class SentimentEvaluator extends Evaluator<SentimentSample> { + + private FMeasure fmeasure = new FMeasure(); + + private SentimentME sentiment; + + /** + * Constructor + */ + public SentimentEvaluator(SentimentME sentiment, + SentimentEvaluationMonitor... listeners) { + super(listeners); + this.sentiment = sentiment; + } + + /** + * Returns the short description of the tool + * + * @param reference + * the reference to the SentimentSample to be processed + * @return the processed samples + */ + @Override + protected SentimentSample processSample(SentimentSample reference) { + String prediction = sentiment.predict(reference.getSentence()); + String label = reference.getSentiment(); + + fmeasure.updateScores(new String[] { label }, new String[] { prediction }); + + return new SentimentSample(prediction, reference.getSentence()); + } + + /** + * Returns the F-Measure + * + * @return the F-Measure + */ + public FMeasure getFMeasure() { + return fmeasure; + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a33eb1ee/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEventStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEventStream.java b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEventStream.java new 
file mode 100644 index 0000000..8043460 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentEventStream.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.sentiment; + +import java.util.Iterator; + +import opennlp.tools.ml.model.Event; +import opennlp.tools.util.AbstractEventStream; +import opennlp.tools.util.ObjectStream; + +/** + * Class for creating events for Sentiment Analysis that is later sent to + * MaxEnt. + */ +public class SentimentEventStream extends AbstractEventStream<SentimentSample> { + + private SentimentContextGenerator contextGenerator; + + /** + * Initializes the event stream. + * + * @param samples + * the sentiment samples to be used + * @param createContextGenerator + * the context generator to be used + */ + public SentimentEventStream(ObjectStream<SentimentSample> samples, + SentimentContextGenerator createContextGenerator) { + super(samples); + contextGenerator = createContextGenerator; + } + + /** + * Creates events. 
+ * + * @param sample + * the sentiment sample to be used + * @return event iterator + */ + @Override + protected Iterator<Event> createEvents(final SentimentSample sample) { + + return new Iterator<Event>() { + + private boolean isVirgin = true; + + public boolean hasNext() { + return isVirgin; + } + + public Event next() { + + isVirgin = false; + + return new Event(sample.getSentiment(), + contextGenerator.getContext(sample.getSentence())); + } + + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a33eb1ee/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentFactory.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentFactory.java b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentFactory.java new file mode 100644 index 0000000..9c284e4 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentFactory.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.sentiment; + +import opennlp.tools.tokenize.Tokenizer; +import opennlp.tools.tokenize.WhitespaceTokenizer; +import opennlp.tools.util.BaseToolFactory; +import opennlp.tools.util.InvalidFormatException; +import opennlp.tools.util.ext.ExtensionLoader; + +/** + * Class for creating sentiment factories for training. + */ +public class SentimentFactory extends BaseToolFactory { + + private static final String TOKENIZER_NAME = "sentiment.tokenizer"; + + private Tokenizer tokenizer; + + /** + * Validates the artifact map --> nothing to validate. + */ + @Override + public void validateArtifactMap() throws InvalidFormatException { + // nothing to validate + } + + /** + * Creates a new context generator. + * + * @return a context generator for Sentiment Analysis + */ + public SentimentContextGenerator createContextGenerator() { + return new SentimentContextGenerator(); + } + + /** + * Returns the tokenizer + * + * @return the tokenizer + */ + public Tokenizer getTokenizer() { + if (this.tokenizer == null) { + if (artifactProvider != null) { + String className = artifactProvider.getManifestProperty(TOKENIZER_NAME); + if (className != null) { + this.tokenizer = ExtensionLoader.instantiateExtension(Tokenizer.class, + className); + } + } + if (this.tokenizer == null) { // could not load using artifact provider + this.tokenizer = WhitespaceTokenizer.INSTANCE; + } + } + return tokenizer; + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a33eb1ee/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentME.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentME.java b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentME.java new file mode 100644 index 0000000..e722fe9 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentME.java @@ -0,0 +1,281 @@ +/* + * Licensed to the Apache Software Foundation (ASF) 
under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.sentiment; + +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import opennlp.tools.ml.EventTrainer; +import opennlp.tools.ml.TrainerFactory; +import opennlp.tools.ml.model.Event; +import opennlp.tools.ml.model.MaxentModel; +import opennlp.tools.ml.model.SequenceClassificationModel; +import opennlp.tools.namefind.BioCodec; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.Sequence; +import opennlp.tools.util.SequenceCodec; +import opennlp.tools.util.SequenceValidator; +import opennlp.tools.util.Span; +import opennlp.tools.util.TrainingParameters; +import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator; +import opennlp.tools.util.featuregen.AdditionalContextFeatureGenerator; + +/** + * Class for creating a maximum-entropy-based Sentiment Analysis model. 
+ */ +public class SentimentME { + + public static final String OTHER = "other"; + public static final String START = "start"; + public static final String CONTINUE = "cont"; + public static final int DEFAULT_BEAM_SIZE = 3; + + private static String[][] EMPTY = new String[0][0]; + + protected SentimentContextGenerator contextGenerator; + + private AdditionalContextFeatureGenerator additionalContextFeatureGenerator = + new AdditionalContextFeatureGenerator(); + + private Sequence bestSequence; + protected SequenceClassificationModel<String> model; + private SequenceValidator<String> sequenceValidator; + private SentimentFactory factory; + private MaxentModel maxentModel; + private SequenceCodec<String> seqCodec = new BioCodec(); + private AdaptiveFeatureGenerator featureGenerators[]; + + /** + * Constructor, initialises + * + * @param sentModel + * sentiment analysis model + */ + public SentimentME(SentimentModel sentModel) { + + this.model = sentModel.getSentimentModel(); + maxentModel = sentModel.getMaxentModel(); + + factory = sentModel.getFactory(); + + contextGenerator = factory.createContextGenerator(); + } + + /** + * Trains a Sentiment Analysis model. + * + * @param languageCode + * the code for the language of the text, e.g. 
"en" + * @param samples + * the sentiment samples to be used + * @param trainParams + * parameters for training + * @param factory + * a Sentiment Analysis factory + * @return a Sentiment Analysis model + */ + public static SentimentModel train(String languageCode, + ObjectStream<SentimentSample> samples, TrainingParameters trainParams, + SentimentFactory factory) throws IOException { + + Map<String, String> entries = new HashMap<String, String>(); + + MaxentModel sentimentModel = null; + + ObjectStream<Event> eventStream = new SentimentEventStream(samples, + factory.createContextGenerator()); + + EventTrainer trainer = TrainerFactory.getEventTrainer(trainParams, entries); + sentimentModel = trainer.train(eventStream); + + Map<String, String> manifestInfoEntries = new HashMap<String, String>(); + + return new SentimentModel(languageCode, sentimentModel, manifestInfoEntries, + factory); + + } + + /** + * Makes a sentiment prediction + * + * @param sentence + * the text to be analysed for its sentiment + * @return the predicted sentiment + */ + public String predict(String sentence) { + String[] tokens = factory.getTokenizer().tokenize(sentence); + + return predict(tokens); + } + + public String predict(String[] tokens) { + + double prob[] = probabilities(tokens); + String sentiment = getBestSentiment(prob); + + return sentiment; + } + + /** + * Returns the best chosen sentiment for the text predicted on + * + * @param outcome + * the outcome + * @return the best sentiment + */ + public String getBestSentiment(double[] outcome) { + return maxentModel.getBestOutcome(outcome); + } + + /** + * Returns the analysis probabilities + * + * @param text + * the text to categorize + */ + public double[] probabilities(String text[]) { + return maxentModel.eval(contextGenerator.getContext(text)); + } + + /** + * Returns an array of probabilities for each of the specified spans which is + * the arithmetic mean of the probabilities for each of the outcomes which + * make up the 
span. + * + * @param spans + * The spans of the sentiments for which probabilities are desired. + * @return an array of probabilities for each of the specified spans. + */ + public double[] probs(Span[] spans) { + + double[] sprobs = new double[spans.length]; + double[] probs = bestSequence.getProbs(); + + for (int si = 0; si < spans.length; si++) { + + double p = 0; + + for (int oi = spans[si].getStart(); oi < spans[si].getEnd(); oi++) { + p += probs[oi]; + } + + p /= spans[si].length(); + + sprobs[si] = p; + } + + return sprobs; + } + + /** + * Sets the probs for the spans + * + * @param spans + * the spans to be analysed + * @return the span of probs + */ + private Span[] setProbs(Span[] spans) { + double[] probs = probs(spans); + if (probs != null) { + + for (int i = 0; i < probs.length; i++) { + double prob = probs[i]; + spans[i] = new Span(spans[i], prob); + } + } + return spans; + } + + /** + * Generates sentiment tags for the given sequence, typically a sentence, + * returning token spans for any identified sentiments. + * + * @param tokens + * an array of the tokens or words of the sequence, typically a + * sentence + * @return an array of spans for each of the names identified. + */ + public Span[] find(String[] tokens) { + return find(tokens, EMPTY); + } + + /** + * Generates sentiment tags for the given sequence, typically a sentence, + * returning token spans for any identified sentiments. + * + * @param tokens + * an array of the tokens or words of the sequence, typically a + * sentence. + * @param additionalContext + * features which are based on context outside of the sentence but + * which should also be used. + * + * @return an array of spans for each of the names identified. 
+ */ + public Span[] find(String[] tokens, String[][] additionalContext) { + + additionalContextFeatureGenerator.setCurrentContext(additionalContext); + + bestSequence = model.bestSequence(tokens, additionalContext, + contextGenerator, sequenceValidator); + + List<String> c = bestSequence.getOutcomes(); + + contextGenerator.updateAdaptiveData(tokens, + c.toArray(new String[c.size()])); + Span[] spans = seqCodec.decode(c); + spans = setProbs(spans); + return spans; + } + + /** + * Makes a sentiment prediction by calling the helper method + * + * @param tokens + * the text to be analysed for its sentiment + * @return the prediction made by the helper method + */ + public Span[] predict2(String[] tokens) { + return predict2(tokens, EMPTY); + } + + /** + * Makes a sentiment prediction + * + * @param tokens + * the text to be analysed for its sentiment + * @param additionalContext + * any required additional context + * @return the predictions + */ + public Span[] predict2(String[] tokens, String[][] additionalContext) { + + additionalContextFeatureGenerator.setCurrentContext(additionalContext); + + bestSequence = model.bestSequence(tokens, additionalContext, + contextGenerator, sequenceValidator); + + List<String> c = bestSequence.getOutcomes(); + + Span[] spans = seqCodec.decode(c); + return spans; + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a33eb1ee/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentModel.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentModel.java b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentModel.java new file mode 100644 index 0000000..b2fce71 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentModel.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.sentiment; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.util.Map; +import java.util.Properties; + +import opennlp.tools.ml.BeamSearch; +import opennlp.tools.ml.model.MaxentModel; +import opennlp.tools.ml.model.SequenceClassificationModel; +import opennlp.tools.util.InvalidFormatException; +import opennlp.tools.util.model.BaseModel; + +/** + * Class for the basis of the Sentiment Analysis model. + */ +public class SentimentModel extends BaseModel { + + private static final String COMPONENT_NAME = "SentimentME"; + private static final String SENTIMENT_MODEL_ENTRY_NAME = "sentiment.model"; + + /** + * Initializes the Sentiment Analysis model. + * + * @param languageCode + * the code for the language of the text, e.g. 
"en" + * @param sentimentModel + * a MaxEnt sentiment model + * @param manifestInfoEntries + * additional information in the manifest + * @param factory + * a Sentiment Analysis factory + */ + public SentimentModel(String languageCode, MaxentModel sentimentModel, + Map<String, String> manifestInfoEntries, SentimentFactory factory) { + super(COMPONENT_NAME, languageCode, manifestInfoEntries, factory); + artifactMap.put(SENTIMENT_MODEL_ENTRY_NAME, sentimentModel); + checkArtifactMap(); + } + + /** + * Initializes the Sentiment Analysis model. + * + * @param modelURL + * the URL to a file required for the model + */ + public SentimentModel(URL modelURL) + throws IOException, InvalidFormatException { + super(COMPONENT_NAME, modelURL); + } + + /** + * Initializes the Sentiment Analysis model. + * + * @param file + * the file required for the model + */ + public SentimentModel(File file) throws InvalidFormatException, IOException { + super(COMPONENT_NAME, file); + } + + public SentimentModel(InputStream modelIn) + throws InvalidFormatException, IOException { + super(COMPONENT_NAME, modelIn); + } + + /** + * Return the model + * + * @return the model + */ + @Deprecated + public SequenceClassificationModel<String> getSentimentModel() { + Properties manifest = (Properties) artifactMap.get(MANIFEST_ENTRY); + + String beamSizeString = manifest + .getProperty(BeamSearch.BEAM_SIZE_PARAMETER); + + int beamSize = SentimentME.DEFAULT_BEAM_SIZE; + if (beamSizeString != null) { + beamSize = Integer.parseInt(beamSizeString); + } + + return new BeamSearch<>(beamSize, + (MaxentModel) artifactMap.get(SENTIMENT_MODEL_ENTRY_NAME)); + } + + /** + * Returns the sentiment factory + * + * @return the sentiment factory for the model + */ + public SentimentFactory getFactory() { + return (SentimentFactory) this.toolFactory; + } + + /** + * Returns the MaxEntropy model + * + * @return the MaxEnt model + */ + public MaxentModel getMaxentModel() { + return (MaxentModel) 
artifactMap.get(SENTIMENT_MODEL_ENTRY_NAME); + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a33eb1ee/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSample.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSample.java b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSample.java new file mode 100644 index 0000000..a35096d --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSample.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.sentiment; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +/** + * Class for holding text used for sentiment analysis. + */ +public class SentimentSample { + + private final String sentiment; + private final List<String> sentence; + private final boolean isClearAdaptiveData; + private final String id = null; + + /** + * Initializes the current instance. 
+ * + * @param sentiment + * training sentiment + * @param sentence + * training sentence + */ + public SentimentSample(String sentiment, String[] sentence) { + this(sentiment, sentence, true); + } + + public SentimentSample(String sentiment, String[] sentence, + boolean clearAdaptiveData) { + if (sentiment == null) { + throw new IllegalArgumentException("sentiment must not be null"); + } + if (sentence == null) { + throw new IllegalArgumentException("sentence must not be null"); + } + + this.sentiment = sentiment; + this.sentence = Collections + .unmodifiableList(new ArrayList<String>(Arrays.asList(sentence))); + this.isClearAdaptiveData = clearAdaptiveData; + } + + /** + * Returns the sentiment + * + * @return the sentiment + */ + public String getSentiment() { + return sentiment; + } + + /** + * Returns the sentence used + * + * @return the sentence + */ + public String[] getSentence() { + return sentence.toArray(new String[0]); + } + + /** + * Returns the id + * + * @return the id + */ + public String getId() { + return id; + } + + /** + * Returns the value of isClearAdaptiveData + * + * @return true or false + */ + public boolean isClearAdaptiveDataSet() { + return isClearAdaptiveData; + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a33eb1ee/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSampleStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSampleStream.java new file mode 100644 index 0000000..839cb3e --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSampleStream.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.sentiment; + +import java.io.IOException; + +import opennlp.tools.tokenize.WhitespaceTokenizer; +import opennlp.tools.util.FilterObjectStream; +import opennlp.tools.util.ObjectStream; + +/** + * Class for converting Strings through Data Stream to SentimentSample using + * tokenised text. + */ +public class SentimentSampleStream + extends FilterObjectStream<String, SentimentSample> { + + /** + * Initializes the sample stream. 
+ * + * @param samples + * the sentiment samples to be used + */ + public SentimentSampleStream(ObjectStream<String> samples) { + super(samples); + } + + /** + * Reads the text + * + * @return a ready-to-be-trained SentimentSample object + */ + @Override + public SentimentSample read() throws IOException { + String sentence = samples.read(); + + if (sentence != null) { + + // Whitespace tokenize entire string + String tokens[] = WhitespaceTokenizer.INSTANCE.tokenize(sentence); + + SentimentSample sample; + + if (tokens.length > 1) { + String sentiment = tokens[0]; + String sentTokens[] = new String[tokens.length - 1]; + System.arraycopy(tokens, 1, sentTokens, 0, tokens.length - 1); + + sample = new SentimentSample(sentiment, sentTokens); + } else { + throw new IOException( + "Empty lines, or lines with only a category string are not allowed!"); + } + + return sample; + } + + return null; + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a33eb1ee/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSampleTypeFilter.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSampleTypeFilter.java b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSampleTypeFilter.java new file mode 100644 index 0000000..68e7ecc --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentiment/SentimentSampleTypeFilter.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.sentiment; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import opennlp.tools.util.FilterObjectStream; +import opennlp.tools.util.ObjectStream; + +/** + * Class for creating a type filter + */ +public class SentimentSampleTypeFilter + extends FilterObjectStream<SentimentSample, SentimentSample> { + + private final Set<String> types; + + /** + * Constructor + */ + public SentimentSampleTypeFilter(String[] types, + ObjectStream<SentimentSample> samples) { + super(samples); + this.types = Collections + .unmodifiableSet(new HashSet<String>(Arrays.asList(types))); + } + + /** + * Constructor + */ + public SentimentSampleTypeFilter(Set<String> types, + ObjectStream<SentimentSample> samples) { + super(samples); + this.types = Collections.unmodifiableSet(new HashSet<String>(types)); + } + + /** + * Reads and returns sentiment samples. + * + * @return the sentiment sample read + */ + @Override + public SentimentSample read() throws IOException { + SentimentSample sample = samples.read(); + return sample; + + } + +}
