Repository: opennlp Updated Branches: refs/heads/master 9a9366c78 -> 53e5e3fa8
OPENNLP-972 - add LM#predictNextTokens, rename to NGramLMTool, this closes apache/opennlp#100 Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/53e5e3fa Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/53e5e3fa Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/53e5e3fa Branch: refs/heads/master Commit: 53e5e3fa8531bbb4d70523197215fd19b89c1a76 Parents: 9a9366c Author: Tommaso Teofili <[email protected]> Authored: Sun Jan 29 00:25:33 2017 +0100 Committer: Tommaso Teofili <[email protected]> Committed: Sun Jan 29 00:25:33 2017 +0100 ---------------------------------------------------------------------- .../main/java/opennlp/tools/cmdline/CLI.java | 4 +- .../languagemodel/LanguageModelTool.java | 103 ------------------- .../languagemodel/NGramLanguageModelTool.java | 102 ++++++++++++++++++ 3 files changed, 104 insertions(+), 105 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/53e5e3fa/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java index ca9b12f..9385a18 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java @@ -37,7 +37,7 @@ import opennlp.tools.cmdline.doccat.DoccatEvaluatorTool; import opennlp.tools.cmdline.doccat.DoccatTool; import opennlp.tools.cmdline.doccat.DoccatTrainerTool; import opennlp.tools.cmdline.entitylinker.EntityLinkerTool; -import opennlp.tools.cmdline.languagemodel.LanguageModelTool; +import opennlp.tools.cmdline.languagemodel.NGramLanguageModelTool; import opennlp.tools.cmdline.lemmatizer.LemmatizerEvaluatorTool; import opennlp.tools.cmdline.lemmatizer.LemmatizerMETool; import opennlp.tools.cmdline.lemmatizer.LemmatizerTrainerTool; @@ -150,7 +150,7 @@ public final class CLI { tools.add(new EntityLinkerTool()); // Language Model - tools.add(new LanguageModelTool()); + tools.add(new NGramLanguageModelTool()); for (CmdLineTool tool : tools) { toolLookupMap.put(tool.getName(), tool); http://git-wip-us.apache.org/repos/asf/opennlp/blob/53e5e3fa/opennlp-tools/src/main/java/opennlp/tools/cmdline/languagemodel/LanguageModelTool.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/languagemodel/LanguageModelTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/languagemodel/LanguageModelTool.java deleted file mode 100644 index aa46355..0000000 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/languagemodel/LanguageModelTool.java +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package opennlp.tools.cmdline.languagemodel; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.util.Arrays; - -import opennlp.tools.cmdline.BasicCmdLineTool; -import opennlp.tools.cmdline.CLI; -import opennlp.tools.cmdline.CmdLineUtil; -import opennlp.tools.cmdline.PerformanceMonitor; -import opennlp.tools.cmdline.SystemInputStreamFactory; -import opennlp.tools.languagemodel.NGramLanguageModel; -import opennlp.tools.util.ObjectStream; -import opennlp.tools.util.PlainTextByLineStream; -import opennlp.tools.util.StringList; - -/** - * Command line tool for {@link opennlp.tools.languagemodel.LanguageModel#calculateProbability(StringList)}. - */ -public class LanguageModelTool extends BasicCmdLineTool { - - @Override - public String getShortDescription() { - return "gives the probability of a sequence of tokens in a language model"; - } - - @Override - public void run(String[] args) { - File lmFile = new File(args[0]); - FileInputStream stream = null; - try { - stream = new FileInputStream(lmFile); - NGramLanguageModel nGramLanguageModel = new NGramLanguageModel( - stream); - - ObjectStream<String> lineStream; - PerformanceMonitor perfMon = null; - - try { - lineStream = new PlainTextByLineStream( - new SystemInputStreamFactory(), - SystemInputStreamFactory.encoding()); - perfMon = new PerformanceMonitor(System.err, "lm"); - perfMon.start(); - String line; - while ((line = lineStream.read()) != null) { - double probability; - String[] tokens = line.split(" "); - try { - probability = nGramLanguageModel - .calculateProbability(new StringList(tokens)); - } catch (Exception e) { - System.err.println("Error:" + e.getLocalizedMessage()); - System.err.println(line); - continue; - } - - System.out.println("sequence '" + Arrays.toString(tokens) - + "' has a probability of " + probability); - - perfMon.incrementCounter(); - } - } catch (IOException e) { - CmdLineUtil.handleStdinIoError(e); - } - - perfMon.stopAndPrintFinalResult(); - - } catch (java.io.IOException e) { - System.err.println(e.getLocalizedMessage()); - } finally { - if (stream != null) { - try { - stream.close(); - } catch (IOException e) { - // do nothing - } - } - } - } - - @Override - public String getHelp() { - return "Usage: " + CLI.CMD + " " + getName() + " model"; - } -} http://git-wip-us.apache.org/repos/asf/opennlp/blob/53e5e3fa/opennlp-tools/src/main/java/opennlp/tools/cmdline/languagemodel/NGramLanguageModelTool.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/languagemodel/NGramLanguageModelTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/languagemodel/NGramLanguageModelTool.java new file mode 100644 index 0000000..1c599c5 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/languagemodel/NGramLanguageModelTool.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.cmdline.languagemodel; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; + +import opennlp.tools.cmdline.BasicCmdLineTool; +import opennlp.tools.cmdline.CLI; +import opennlp.tools.cmdline.CmdLineUtil; +import opennlp.tools.cmdline.PerformanceMonitor; +import opennlp.tools.cmdline.SystemInputStreamFactory; +import opennlp.tools.languagemodel.NGramLanguageModel; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.PlainTextByLineStream; +import opennlp.tools.util.StringList; + +/** + * Command line tool for {@link opennlp.tools.languagemodel.NGramLanguageModel}. + */ +public class NGramLanguageModelTool extends BasicCmdLineTool { + + @Override + public String getShortDescription() { + return "gives the probability and most probable next token(s) of a sequence of tokens in a " + + "language model"; + } + + @Override + public void run(String[] args) { + File lmFile = new File(args[0]); + FileInputStream stream = null; + try { + stream = new FileInputStream(lmFile); + NGramLanguageModel nGramLanguageModel = new NGramLanguageModel(stream); + + ObjectStream<String> lineStream; + PerformanceMonitor perfMon = null; + + try { + lineStream = new PlainTextByLineStream(new SystemInputStreamFactory(), + SystemInputStreamFactory.encoding()); + perfMon = new PerformanceMonitor(System.err, "nglm"); + perfMon.start(); + String line; + while ((line = lineStream.read()) != null) { + double probability; + StringList predicted; + String[] tokens = line.split(" "); + StringList sample = new StringList(tokens); + try { + probability = nGramLanguageModel.calculateProbability(sample); + predicted = nGramLanguageModel.predictNextTokens(sample); + } catch (Exception e) { + System.err.println("Error:" + e.getLocalizedMessage()); + System.err.println(line); + continue; + } + + System.out.println(sample + " -> prob:" + probability + ", next:" + predicted); + + perfMon.incrementCounter(); + } + } catch (IOException e) { + CmdLineUtil.handleStdinIoError(e); + } + + perfMon.stopAndPrintFinalResult(); + + } catch (java.io.IOException e) { + System.err.println(e.getLocalizedMessage()); + } finally { + if (stream != null) { + try { + stream.close(); + } catch (IOException e) { + // do nothing + } + } + } + } + + @Override + public String getHelp() { + return "Usage: " + CLI.CMD + " " + getName() + " model"; + } +}
