Repository: opennlp Updated Branches: refs/heads/master a59765cd4 -> f8fbfc9fd
OPENNLP-1022:Fix documentation to remove references to 'Save XXXModel to database, this closes apache/opennlp#158 Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/f8fbfc9f Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/f8fbfc9f Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/f8fbfc9f Branch: refs/heads/master Commit: f8fbfc9fdca4b5e9ba1a5608ca17e7b6feb18c3c Parents: a59765c Author: smarthi <[email protected]> Authored: Sun Apr 16 21:45:17 2017 -0400 Committer: smarthi <[email protected]> Committed: Sun Apr 16 21:45:17 2017 -0400 ---------------------------------------------------------------------- opennlp-docs/src/docbkx/chunker.xml | 29 ++------- opennlp-docs/src/docbkx/doccat.xml | 44 ++------------ opennlp-docs/src/docbkx/introduction.xml | 17 +----- opennlp-docs/src/docbkx/lemmatizer.xml | 38 +++--------- opennlp-docs/src/docbkx/namefinder.xml | 36 +++--------- opennlp-docs/src/docbkx/parser.xml | 2 +- opennlp-docs/src/docbkx/postagger.xml | 62 ++------------------ opennlp-docs/src/docbkx/sentdetect.xml | 33 ++--------- opennlp-docs/src/docbkx/tokenizer.xml | 15 +---- .../main/java/opennlp/tools/ml/BeamSearch.java | 23 +++----- 10 files changed, 46 insertions(+), 253 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/f8fbfc9f/opennlp-docs/src/docbkx/chunker.xml ---------------------------------------------------------------------- diff --git a/opennlp-docs/src/docbkx/chunker.xml b/opennlp-docs/src/docbkx/chunker.xml index 0c04e8a..b67a7fd 100644 --- a/opennlp-docs/src/docbkx/chunker.xml +++ b/opennlp-docs/src/docbkx/chunker.xml @@ -81,19 +81,8 @@ Rockwell_NNP said_VBD the_DT agreement_NN calls_VBZ for_IN it_PRP to_TO supply_V InputStream modelIn = null; ChunkerModel model = null; -try { - modelIn = new FileInputStream("en-chunker.bin"); +try (modelIn = new 
FileInputStream("en-chunker.bin")){ model = new ChunkerModel(modelIn); -} catch (IOException e) { - // Model loading failed, handle the error - e.printStackTrace(); -} finally { - if (modelIn != null) { - try { - modelIn.close(); - } catch (IOException e) { - } - } }]]> </programlisting> After the model is loaded a Chunker can be instantiated. @@ -242,28 +231,18 @@ $ opennlp ChunkerTrainerME -model en-chunker.bin -lang en -data en-chunker.train illustrates how to do it: <programlisting language="java"> <![CDATA[ -Charset charset = Charset.forName("UTF-8"); ObjectStream<String> lineStream = - new PlainTextByLineStream(new FileInputStream("en-chunker.train"),charset); -ObjectStream<ChunkSample> sampleStream = new ChunkSampleStream(lineStream); + new PlainTextByLineStream(new FileInputStream("en-chunker.train"), StandardCharsets.UTF_8); ChunkerModel model; -try { +try(ObjectStream<ChunkSample> sampleStream = new ChunkSampleStream(lineStream)) { model = ChunkerME.train("en", sampleStream, new DefaultChunkerContextGenerator(), TrainingParameters.defaultParams()); } -finally { - sampleStream.close(); -} -OutputStream modelOut = null; -try { - modelOut = new BufferedOutputStream(new FileOutputStream(modelFile)); +try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelFile))) { model.serialize(modelOut); -} finally { - if (modelOut != null) - modelOut.close(); }]]> </programlisting> </para> http://git-wip-us.apache.org/repos/asf/opennlp/blob/f8fbfc9f/opennlp-docs/src/docbkx/doccat.xml ---------------------------------------------------------------------- diff --git a/opennlp-docs/src/docbkx/doccat.xml b/opennlp-docs/src/docbkx/doccat.xml index 7fe3f1f..c056732 100644 --- a/opennlp-docs/src/docbkx/doccat.xml +++ b/opennlp-docs/src/docbkx/doccat.xml @@ -127,33 +127,16 @@ $ opennlp DoccatTrainer -model en-doccat.bin -lang en -data en-doccat.train -enc <programlisting language="java"> <![CDATA[ DoccatModel model = null; - InputStream dataIn = null; 
-try { - dataIn = new FileInputStream("en-sentiment.train"); + +try (dataIn = new FileInputStream("en-sentiment.train")) { ObjectStream<String> lineStream = new PlainTextByLineStream(dataIn, "UTF-8"); ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream); model = DocumentCategorizerME.train("en", sampleStream); } -catch (IOException e) { - // Failed to read or parse training data, training failed - e.printStackTrace(); -} -finally { - if (dataIn != null) { - try { - dataIn.close(); - } - catch (IOException e) { - // Not an issue, training already finished. - // The exception should be logged and investigated - // if part of a production system. - e.printStackTrace(); - } - } -}]]> +]]> </programlisting> Now might be a good time to cruise over to Hulu or something, because this could take a while if you've got a large training set. You may see a lot of output as well. Once you're done, you can pretty quickly step to classification directly, @@ -162,27 +145,10 @@ finally { <para> <programlisting language="java"> <![CDATA[ -OutputStream modelOut = null; -try { - modelOut = new BufferedOutputStream(new FileOutputStream(modelFile)); +try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelFile))) { model.serialize(modelOut); } -catch (IOException e) { - // Failed to save model - e.printStackTrace(); -} -finally { - if (modelOut != null) { - try { - modelOut.close(); - } - catch (IOException e) { - // Failed to correctly save model. - // Written model might be invalid. 
- e.printStackTrace(); - } - } -}]]> +]]> </programlisting> </para> </section> http://git-wip-us.apache.org/repos/asf/opennlp/blob/f8fbfc9f/opennlp-docs/src/docbkx/introduction.xml ---------------------------------------------------------------------- diff --git a/opennlp-docs/src/docbkx/introduction.xml b/opennlp-docs/src/docbkx/introduction.xml index a3bd482..65fcd9d 100644 --- a/opennlp-docs/src/docbkx/introduction.xml +++ b/opennlp-docs/src/docbkx/introduction.xml @@ -65,23 +65,10 @@ under the License. constructor of the model class: <programlisting language="java"> <![CDATA[ -InputStream modelIn = new FileInputStream("lang-model-name.bin"); - -try { +try (InputStream modelIn = new FileInputStream("lang-model-name.bin")) { SomeModel model = new SomeModel(modelIn); } -catch (IOException e) { - //handle the exception -} -finally { - if (null != modelIn) { - try { - modelIn.close(); - } - catch (IOException e) { - } - } -}]]> +]]> </programlisting> </para> <para> http://git-wip-us.apache.org/repos/asf/opennlp/blob/f8fbfc9f/opennlp-docs/src/docbkx/lemmatizer.xml ---------------------------------------------------------------------- diff --git a/opennlp-docs/src/docbkx/lemmatizer.xml b/opennlp-docs/src/docbkx/lemmatizer.xml index 34668d0..1fa5540 100644 --- a/opennlp-docs/src/docbkx/lemmatizer.xml +++ b/opennlp-docs/src/docbkx/lemmatizer.xml @@ -88,22 +88,11 @@ signed VBD sign In the example below it is loaded from disk: <programlisting language="java"> <![CDATA[ -InputStream modelIn = null; LemmatizerModel model = null; -try { - modelIn = new FileInputStream("en-lemmatizer.bin"); +try (InputStream modelIn = new FileInputStream("en-lemmatizer.bin")) { model = new LemmatizerModel(modelIn); -} catch (IOException e) { - // Model loading failed, handle the error - e.printStackTrace(); -} finally { - if (modelIn != null) { - try { - modelIn.close(); - } catch (IOException e) { - } - } -}]]> +} +]]> </programlisting> After the model is loaded a LemmatizerME can be 
instantiated. <programlisting language="java"> @@ -174,22 +163,10 @@ shrapnel NN shrapnel <![CDATA[ InputStream dictLemmatizer = null; -try { - dictLemmatizer = new FileInputStream("english-lemmatizer.txt"); -} -catch (IOException e) { - // dictionary loading failed, handle the error - e.printStackTrace(); +try (dictLemmatizer = new FileInputStream("english-lemmatizer.txt")) { + } -finally { - if (dictLemmatizer != null) { - try { - dictLemmatizer.close(); - } - catch (IOException e) { - } - } -}]]> +]]> </programlisting> After the dictionary is loaded the DictionaryLemmatizer can be instantiated. @@ -303,8 +280,7 @@ $ opennlp LemmatizerTrainerME -model en-lemmatizer.bin -params PerceptronTrainer TrainingParameters mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false); if (mlParams == null) { mlParams = ModelUtil.createDefaultTrainingParameters(); - } -]]> + }]]> </programlisting> Then we read the training data: <programlisting language="java"> http://git-wip-us.apache.org/repos/asf/opennlp/blob/f8fbfc9f/opennlp-docs/src/docbkx/namefinder.xml ---------------------------------------------------------------------- diff --git a/opennlp-docs/src/docbkx/namefinder.xml b/opennlp-docs/src/docbkx/namefinder.xml index 1e72a82..2f68c47 100644 --- a/opennlp-docs/src/docbkx/namefinder.xml +++ b/opennlp-docs/src/docbkx/namefinder.xml @@ -80,23 +80,10 @@ Mr . <START:person> Vinken <END> is chairman of Elsevier N.V. , the Dutch publis In the sample below it is loaded from disk. 
<programlisting language="java"> <![CDATA[ -InputStream modelIn = new FileInputStream("en-ner-person.bin"); - -try { +try (InputStream modelIn = new FileInputStream("en-ner-person.bin")){ TokenNameFinderModel model = new TokenNameFinderModel(modelIn); } -catch (IOException e) { - e.printStackTrace(); -} -finally { - if (modelIn != null) { - try { - modelIn.close(); - } - catch (IOException e) { - } - } -}]]> +]]> </programlisting> There is a number of reasons why the model loading can fail: <itemizedlist> @@ -274,33 +261,24 @@ $ opennlp TokenNameFinderTrainer -featuregen brown.xml -sequenceCodec BILOU -res <para>Call the NameFinderME.train method</para> </listitem> <listitem> - <para>Save the TokenNameFinderModel to a file or database</para> + <para>Save the TokenNameFinderModel to a file</para> </listitem> </itemizedlist> The three steps are illustrated by the following sample code: <programlisting language="java"> <![CDATA[ -Charset charset = Charset.forName("UTF-8"); ObjectStream<String> lineStream = - new PlainTextByLineStream(new FileInputStream("en-ner-person.train"), charset); -ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream); + new PlainTextByLineStream(new FileInputStream("en-ner-person.train"), StandardCharsets.UTF_8); TokenNameFinderModel model; -try { +try (ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream)) { model = NameFinderME.train("en", "person", sampleStream, TrainingParameters.defaultParams(), TokenNameFinderFactory nameFinderFactory); } -finally { - sampleStream.close(); -} -try { - modelOut = new BufferedOutputStream(new FileOutputStream(modelFile)); +try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelFile))) { model.serialize(modelOut); -} finally { - if (modelOut != null) - modelOut.close(); }]]> </programlisting> </para> @@ -542,7 +520,7 @@ System.out.println(result.toString());]]> <programlisting language="java"> <![CDATA[ FileInputStream sampleDataIn = new 
FileInputStream("en-ner-person.train"); -ObjectStream<NameSample> sampleStream = new PlainTextByLineStream(sampleDataIn.getChannel(), "UTF-8"); +ObjectStream<NameSample> sampleStream = new PlainTextByLineStream(sampleDataIn.getChannel(), StandardCharsets.UTF_8); TokenNameFinderCrossValidator evaluator = new TokenNameFinderCrossValidator("en", 100, 5); evaluator.evaluate(sampleStream, 10); http://git-wip-us.apache.org/repos/asf/opennlp/blob/f8fbfc9f/opennlp-docs/src/docbkx/parser.xml ---------------------------------------------------------------------- diff --git a/opennlp-docs/src/docbkx/parser.xml b/opennlp-docs/src/docbkx/parser.xml index a81c078..614293b 100644 --- a/opennlp-docs/src/docbkx/parser.xml +++ b/opennlp-docs/src/docbkx/parser.xml @@ -218,7 +218,7 @@ $ opennlp TaggerModelReplacer en-parser-chunking.bin en-pos-maxent.bin]]> <para>Call a Parser train method: This can be either the CHUNKING or the TREEINSERT parser.</para> </listitem> <listitem> - <para>Save the ParseModel to a file or database.</para> + <para>Save the ParseModel to a file</para> </listitem> </itemizedlist> The following code snippet shows how to instantiate the HeadRules: http://git-wip-us.apache.org/repos/asf/opennlp/blob/f8fbfc9f/opennlp-docs/src/docbkx/postagger.xml ---------------------------------------------------------------------- diff --git a/opennlp-docs/src/docbkx/postagger.xml b/opennlp-docs/src/docbkx/postagger.xml index e981c3a..b623d2e 100644 --- a/opennlp-docs/src/docbkx/postagger.xml +++ b/opennlp-docs/src/docbkx/postagger.xml @@ -69,24 +69,8 @@ Mr._NNP Vinken_NNP is_VBZ chairman_NN of_IN Elsevier_NNP N.V._NNP ,_, the_DT Dut In the sample below its loaded from disk. 
<programlisting language="java"> <![CDATA[ -InputStream modelIn = null; - -try { - modelIn = new FileInputStream("en-pos-maxent.bin"); +try (InputStream modelIn = new FileInputStream("en-pos-maxent.bin")) { POSModel model = new POSModel(modelIn); -} -catch (IOException e) { - // Model loading failed, handle the error - e.printStackTrace(); -} -finally { - if (modelIn != null) { - try { - modelIn.close(); - } - catch (IOException e) { - } - } }]]> </programlisting> After the model is loaded the POSTaggerME can be instantiated. @@ -214,7 +198,7 @@ $ opennlp POSTaggerTrainer -type maxent -model en-pos-maxent.bin \ <para>Call the POSTagger.train method</para> </listitem> <listitem> - <para>Save the POSModel to a file or database</para> + <para>Save the POSModel to a file</para> </listitem> </itemizedlist> The following code illustrates that: @@ -222,30 +206,11 @@ $ opennlp POSTaggerTrainer -type maxent -model en-pos-maxent.bin \ <![CDATA[ POSModel model = null; -InputStream dataIn = null; -try { - dataIn = new FileInputStream("en-pos.train"); - ObjectStream<String> lineStream = new PlainTextByLineStream(dataIn, "UTF-8"); +try (InputStream dataIn = new FileInputStream("en-pos.train")){ + ObjectStream<String> lineStream = new PlainTextByLineStream(dataIn, StandardCharsets.UTF_8); ObjectStream<POSSample> sampleStream = new WordTagSampleStream(lineStream); model = POSTaggerME.train("en", sampleStream, TrainingParameters.defaultParams(), null, null); -} -catch (IOException e) { - // Failed to read or parse training data, training failed - e.printStackTrace(); -} -finally { - if (dataIn != null) { - try { - dataIn.close(); - } - catch (IOException e) { - // Not an issue, training already finished. - // The exception should be logged and investigated - // if part of a production system. 
- e.printStackTrace(); - } - } }]]> </programlisting> The above code performs the first two steps, opening the data and training @@ -253,25 +218,8 @@ finally { the sample below it is written into a file. <programlisting language="java"> <![CDATA[ -OutputStream modelOut = null; -try { - modelOut = new BufferedOutputStream(new FileOutputStream(modelFile)); +try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelFile))){ model.serialize(modelOut); -} -catch (IOException e) { - // Failed to save model - e.printStackTrace(); -} -finally { - if (modelOut != null) { - try { - modelOut.close(); - } - catch (IOException e) { - // Failed to correctly save model. - // Written model might be invalid. - e.printStackTrace(); - } }]]> </programlisting> </para> http://git-wip-us.apache.org/repos/asf/opennlp/blob/f8fbfc9f/opennlp-docs/src/docbkx/sentdetect.xml ---------------------------------------------------------------------- diff --git a/opennlp-docs/src/docbkx/sentdetect.xml b/opennlp-docs/src/docbkx/sentdetect.xml index 0c67b51..aacd4d3 100644 --- a/opennlp-docs/src/docbkx/sentdetect.xml +++ b/opennlp-docs/src/docbkx/sentdetect.xml @@ -81,22 +81,9 @@ $ opennlp SentenceDetector en-sent.bin < input.txt > output.txt]]> To instantiate the Sentence Detector the sentence model must be loaded first. <programlisting language="java"> <![CDATA[ -InputStream modelIn = new FileInputStream("en-sent.bin"); -try { +try (InputStream modelIn = new FileInputStream("en-sent.bin")) { SentenceModel model = new SentenceModel(modelIn); -} -catch (IOException e) { - e.printStackTrace(); -} -finally { - if (modelIn != null) { - try { - modelIn.close(); - } - catch (IOException e) { - } - } }]]> </programlisting> After the model is loaded the SentenceDetectorME can be instantiated. @@ -123,7 +110,7 @@ Span sentences[] = sentenceDetector.sentPosDetect(" First sentence. 
Second sent </section> <section id="tools.sentdetect.training"> <title>Sentence Detector Training</title> - <para></para> + <para/> <section id="tools.sentdetect.training.tool"> <title>Training Tool</title> <para> @@ -220,27 +207,17 @@ Path: en-sent.bin The following sample code illustrates these steps: <programlisting language="java"> <![CDATA[ -Charset charset = Charset.forName("UTF-8"); ObjectStream<String> lineStream = - new PlainTextByLineStream(new FileInputStream("en-sent.train"), charset); -ObjectStream<SentenceSample> sampleStream = new SentenceSampleStream(lineStream); + new PlainTextByLineStream(new FileInputStream("en-sent.train"), StandardCharsets.UTF_8); SentenceModel model; -try { +try (ObjectStream<SentenceSample> sampleStream = new SentenceSampleStream(lineStream)) { model = SentenceDetectorME.train("en", sampleStream, true, null, TrainingParameters.defaultParams()); } -finally { - sampleStream.close(); -} -OutputStream modelOut = null; -try { - modelOut = new BufferedOutputStream(new FileOutputStream(modelFile)); +try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelFile))) { model.serialize(modelOut); -} finally { - if (modelOut != null) - modelOut.close(); }]]> </programlisting> </para> http://git-wip-us.apache.org/repos/asf/opennlp/blob/f8fbfc9f/opennlp-docs/src/docbkx/tokenizer.xml ---------------------------------------------------------------------- diff --git a/opennlp-docs/src/docbkx/tokenizer.xml b/opennlp-docs/src/docbkx/tokenizer.xml index d8df477..6d54c3c 100644 --- a/opennlp-docs/src/docbkx/tokenizer.xml +++ b/opennlp-docs/src/docbkx/tokenizer.xml @@ -154,22 +154,9 @@ London share prices were bolstered largely by continued gains on Wall Street and can be loaded. 
<programlisting language="java"> <![CDATA[ -InputStream modelIn = new FileInputStream("en-token.bin"); -try { +try (InputStream modelIn = new FileInputStream("en-token.bin")) { TokenizerModel model = new TokenizerModel(modelIn); -} -catch (IOException e) { - e.printStackTrace(); -} -finally { - if (modelIn != null) { - try { - modelIn.close(); - } - catch (IOException e) { - } - } }]]> </programlisting> After the model is loaded the TokenizerME can be instantiated. http://git-wip-us.apache.org/repos/asf/opennlp/blob/f8fbfc9f/opennlp-tools/src/main/java/opennlp/tools/ml/BeamSearch.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/ml/BeamSearch.java b/opennlp-tools/src/main/java/opennlp/tools/ml/BeamSearch.java index 949a408..7987b9f 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/ml/BeamSearch.java +++ b/opennlp-tools/src/main/java/opennlp/tools/ml/BeamSearch.java @@ -105,13 +105,8 @@ public class BeamSearch<T> implements SequenceClassificationModel<T> { String[] contexts = cg.getContext(i, sequence, outcomes, additionalContext); double[] scores; if (contextsCache != null) { - scores = contextsCache.get(contexts); - if (scores == null) { - scores = model.eval(contexts, probs); - contextsCache.put(contexts,scores); - } - } - else { + scores = contextsCache.computeIfAbsent(contexts, c -> model.eval(c, probs)); + } else { scores = model.eval(contexts, probs); } @@ -123,13 +118,13 @@ public class BeamSearch<T> implements SequenceClassificationModel<T> { double min = temp_scores[Math.max(0,scores.length - size)]; for (int p = 0; p < scores.length; p++) { - if (scores[p] < min) - continue; //only advance first "size" outcomes - String out = model.getOutcome(p); - if (validator.validSequence(i, sequence, outcomes, out)) { - Sequence ns = new Sequence(top, out, scores[p]); - if (ns.getScore() > minSequenceScore) { - next.add(ns); + if (scores[p] >= min) { + String out = 
model.getOutcome(p); + if (validator.validSequence(i, sequence, outcomes, out)) { + Sequence ns = new Sequence(top, out, scores[p]); + if (ns.getScore() > minSequenceScore) { + next.add(ns); + } } } }
