This is an automated email from the ASF dual-hosted git repository.
jzemerick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new 2cedfb2a OPENNLP-1404 Enhance JavaDoc in opennlp.tools.postag package
(#446)
2cedfb2a is described below
commit 2cedfb2a43c57b0cc7b55416c940b867106507b8
Author: Martin Wiesner <[email protected]>
AuthorDate: Mon Dec 12 14:04:08 2022 +0100
OPENNLP-1404 Enhance JavaDoc in opennlp.tools.postag package (#446)
- adds missing JavaDoc
- improves existing documentation for clarity
- removes superfluous text
- adds 'final' modifier where useful and applicable
- adds 'Override' annotation where useful and applicable
- sanitizes some unused, deprecated code fragments, mostly in
`POSTaggerFactory`
- reduces visibility of some methods in `POSTaggerFactory` as those were
only used 'internally'
- fixes some typos
- cures open comment from OPENNLP-1403 by reviewer 'kinow' in
`DefaultLanguageDetectorContextGenerator`
- fixes wrong import and JavaDoc flaw
- fixes minor typos found in review by kinow.
---
.../tagdict/MorfologikPOSTaggerFactory.java | 8 --
.../tagdict/MorfologikTagDictionary.java | 22 ++--
.../DefaultLanguageDetectorContextGenerator.java | 2 +-
.../postag/ConfigurablePOSContextGenerator.java | 43 ++++---
.../tools/postag/DefaultPOSContextGenerator.java | 48 +++++---
.../tools/postag/DefaultPOSSequenceValidator.java | 9 +-
.../opennlp/tools/postag/MutableTagDictionary.java | 9 --
.../opennlp/tools/postag/POSContextGenerator.java | 19 ++-
.../java/opennlp/tools/postag/POSDictionary.java | 41 ++++---
.../java/opennlp/tools/postag/POSEvaluator.java | 35 +++---
.../main/java/opennlp/tools/postag/POSModel.java | 81 ++++++++++---
.../main/java/opennlp/tools/postag/POSSample.java | 50 +++++++-
.../opennlp/tools/postag/POSSampleEventStream.java | 24 ++--
.../tools/postag/POSSampleSequenceStream.java | 28 ++++-
.../main/java/opennlp/tools/postag/POSTagger.java | 26 ++++-
.../tools/postag/POSTaggerCrossValidator.java | 53 ++++-----
.../tools/postag/POSTaggerEvaluationMonitor.java | 3 +
.../opennlp/tools/postag/POSTaggerFactory.java | 127 +++++++++++++--------
.../java/opennlp/tools/postag/POSTaggerME.java | 76 +++++++-----
.../java/opennlp/tools/postag/TagDictionary.java | 13 ++-
.../opennlp/tools/postag/WordTagSampleStream.java | 22 ++--
.../opennlp/tools/util/model/ArtifactProvider.java | 8 +-
.../java/opennlp/tools/postag/POSSampleTest.java | 4 +-
23 files changed, 489 insertions(+), 262 deletions(-)
diff --git
a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
index 592ef7d9..a49a7823 100644
---
a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
+++
b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
@@ -28,7 +28,6 @@ import java.util.Map;
import morfologik.stemming.DictionaryMetadata;
-import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.postag.POSTaggerFactory;
import opennlp.tools.postag.TagDictionary;
import opennlp.tools.util.model.ArtifactSerializer;
@@ -70,13 +69,6 @@ public class MorfologikPOSTaggerFactory extends
POSTaggerFactory {
return createMorfologikDictionary(dictData, dictInfo);
}
-
- @Override
- protected void init(Dictionary ngramDictionary, TagDictionary posDictionary)
{
- super.init(ngramDictionary, null);
- this.dict = posDictionary;
- }
-
@Override
public TagDictionary getTagDictionary() {
if (this.dict == null) {
diff --git
a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
index 1c89b7e9..0d03eb8d 100644
---
a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
+++
b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
@@ -17,7 +17,6 @@
package opennlp.morfologik.tagdict;
-import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
@@ -29,13 +28,13 @@ import morfologik.stemming.WordData;
import opennlp.tools.postag.TagDictionary;
/**
- * A POS Tagger dictionary implementation based on Morfologik binary
+ * A {@link TagDictionary} implementation based on Morfologik binary
* dictionaries
*/
public class MorfologikTagDictionary implements TagDictionary {
- private IStemmer dictLookup;
- private boolean isCaseSensitive;
+ private final IStemmer dictLookup;
+ private final boolean isCaseSensitive;
/**
* Creates a case sensitive {@link MorfologikTagDictionary}
@@ -44,11 +43,9 @@ public class MorfologikTagDictionary implements
TagDictionary {
* a Morfologik FSA dictionary
* @throws IllegalArgumentException
* if FSA's root node cannot be acquired (dictionary is empty).
- * @throws IOException
- * could not read dictionary from dictURL
*/
public MorfologikTagDictionary(Dictionary dict)
- throws IllegalArgumentException, IOException {
+ throws IllegalArgumentException {
this(dict, true);
}
@@ -58,14 +55,12 @@ public class MorfologikTagDictionary implements
TagDictionary {
* @param dict
* a Morfologik FSA dictionary
* @param caseSensitive
- * if true it performs case sensitive lookup
+ * if true it performs case-sensitive lookup
* @throws IllegalArgumentException
* if FSA's root node cannot be acquired (dictionary is empty).
- * @throws IOException
- * could not read dictionary from dictURL
*/
public MorfologikTagDictionary(Dictionary dict, boolean caseSensitive)
- throws IllegalArgumentException, IOException {
+ throws IllegalArgumentException {
this.dictLookup = new DictionaryLookup(dict);
this.isCaseSensitive = caseSensitive;
}
@@ -88,4 +83,9 @@ public class MorfologikTagDictionary implements TagDictionary
{
}
return null;
}
+
+ @Override
+ public boolean isCaseSensitive() {
+ return isCaseSensitive;
+ }
}
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/langdetect/DefaultLanguageDetectorContextGenerator.java
b/opennlp-tools/src/main/java/opennlp/tools/langdetect/DefaultLanguageDetectorContextGenerator.java
index 26fffa8b..bbece483 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/langdetect/DefaultLanguageDetectorContextGenerator.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/langdetect/DefaultLanguageDetectorContextGenerator.java
@@ -39,7 +39,7 @@ public class DefaultLanguageDetectorContextGenerator
implements LanguageDetector
* @param minLength The min number of ngrams characters. Must be greater
than {@code 0}.
* @param maxLength The max number of ngrams characters. Must be greater
than {@code 0}
* and must be greater than {@code minLength}.
- * @param normalizers zero or more normalizers to be applied in to the text
before extracting ngrams.
+ * @param normalizers Zero or more normalizers to be applied to the text
before extracting ngrams.
*/
public DefaultLanguageDetectorContextGenerator(int minLength, int maxLength,
CharSequenceNormalizer...
normalizers) {
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/postag/ConfigurablePOSContextGenerator.java
b/opennlp-tools/src/main/java/opennlp/tools/postag/ConfigurablePOSContextGenerator.java
index e6b65df7..1ae7c9fe 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/postag/ConfigurablePOSContextGenerator.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/postag/ConfigurablePOSContextGenerator.java
@@ -25,7 +25,12 @@ import opennlp.tools.util.Cache;
import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
/**
- * A context generator for the POS Tagger.
+ * A configurable {@link POSContextGenerator context generator} for a {@link
POSTagger}.
+ * This implementation makes use of {@link AdaptiveFeatureGenerator}.
+ *
+ * @see POSTagger
+ * @see POSTaggerME
+ * @see DefaultPOSContextGenerator
*/
public class ConfigurablePOSContextGenerator implements POSContextGenerator {
@@ -35,9 +40,21 @@ public class ConfigurablePOSContextGenerator implements
POSContextGenerator {
private final AdaptiveFeatureGenerator featureGenerator;
/**
- * Initializes the current instance.
+ * Initializes a {@link ConfigurablePOSContextGenerator} instance.
+ * A cache size of {@code 0} will be used as default.
+ *
+ * @param featureGenerator The {@link AdaptiveFeatureGenerator} to be used.
+ */
+ public ConfigurablePOSContextGenerator(AdaptiveFeatureGenerator
featureGenerator) {
+ this(0, featureGenerator);
+ }
+
+ /**
+ * Initializes a {@link ConfigurablePOSContextGenerator} instance.
*
- * @param cacheSize
+ * @param cacheSize The size of the {@link Cache} to set.
+ * Must be greater than {@code 0} to have an effect.
+ * @param featureGenerator The {@link AdaptiveFeatureGenerator} to be used.
*/
public ConfigurablePOSContextGenerator(int cacheSize,
AdaptiveFeatureGenerator featureGenerator) {
this.featureGenerator = Objects.requireNonNull(featureGenerator,
"featureGenerator must not be null");
@@ -48,22 +65,18 @@ public class ConfigurablePOSContextGenerator implements
POSContextGenerator {
}
/**
- * Initializes the current instance.
+ * Returns the context for making a postag decision at the specified token
{@code index}
+ * given the specified {@code tokens} and previous {@code tags}.
*
- */
- public ConfigurablePOSContextGenerator(AdaptiveFeatureGenerator
featureGenerator) {
- this(0, featureGenerator);
- }
-
- /**
- * Returns the context for making a pos tag decision at the specified token
index
- * given the specified tokens and previous tags.
* @param index The index of the token for which the context is provided.
- * @param tokens The tokens in the sentence.
+ * @param tokens The tokens representing a sentence.
* @param tags The tags assigned to the previous words in the sentence.
- * @return The context for making a pos tag decision at the specified token
index
- * given the specified tokens and previous tags.
+ * @param additionalContext The context for additional information.
+ *
+ * @return The context for making a postag decision at the specified token
{@code index}
+ * given the specified {@code tokens} and previous {@code tags}.
*/
+ @Override
public String[] getContext(int index, String[] tokens, String[] tags,
Object[] additionalContext) {
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/postag/DefaultPOSContextGenerator.java
b/opennlp-tools/src/main/java/opennlp/tools/postag/DefaultPOSContextGenerator.java
index d9881978..3c36dc5e 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/postag/DefaultPOSContextGenerator.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/postag/DefaultPOSContextGenerator.java
@@ -27,7 +27,10 @@ import opennlp.tools.util.Cache;
import opennlp.tools.util.StringList;
/**
- * A context generator for the POS Tagger.
+ * A default {@link POSContextGenerator context generator} for a {@link
POSTagger}.
+ *
+ * @see POSTagger
+ * @see POSTaggerME
*/
public class DefaultPOSContextGenerator implements POSContextGenerator {
@@ -36,28 +39,30 @@ public class DefaultPOSContextGenerator implements
POSContextGenerator {
private static final int PREFIX_LENGTH = 4;
private static final int SUFFIX_LENGTH = 4;
- private static Pattern hasCap = Pattern.compile("[A-Z]");
- private static Pattern hasNum = Pattern.compile("[0-9]");
+ private static final Pattern hasCap = Pattern.compile("[A-Z]");
+ private static final Pattern hasNum = Pattern.compile("[0-9]");
private Cache<String, String[]> contextsCache;
private Object wordsKey;
- private Dictionary dict;
+ private final Dictionary dict;
/**
- * Initializes the current instance.
+ * Initializes a {@link DefaultPOSContextGenerator} instance.
+ * A cache size of {@code 0} will be used as default.
*
- * @param dict
+ * @param dict The {@link Dictionary} to be used.
*/
public DefaultPOSContextGenerator(Dictionary dict) {
this(0,dict);
}
/**
- * Initializes the current instance.
+ * Initializes a {@link DefaultPOSContextGenerator} instance.
*
- * @param cacheSize
- * @param dict
+ * @param cacheSize The size of the {@link Cache} to set.
+ * Must be greater than {@code 0} to have an effect.
+ * @param dict The {@link Dictionary} to be used.
*/
public DefaultPOSContextGenerator(int cacheSize, Dictionary dict) {
this.dict = dict;
@@ -83,19 +88,34 @@ public class DefaultPOSContextGenerator implements
POSContextGenerator {
return suffs;
}
+ /**
+ * Returns the context for making a postag decision at the specified token
{@code index}
+ * given the specified token {@code sequence} and {@code priorDecisions}.
+ *
+ * @param index The index of the token for which the context is provided.
+ * @param sequence The token sequence representing a sentence.
+ * @param priorDecisions The tags assigned to the previous words in the
sentence.
+ * @param additionalContext The context for additional information.
+ *
+ * @return The context for making a postag decision at the specified token
{@code index}
+ * given the specified token {@code sequence} and {@code priorDecisions}.
+ */
+ @Override
public String[] getContext(int index, String[] sequence, String[]
priorDecisions,
Object[] additionalContext) {
return getContext(index,sequence,priorDecisions);
}
/**
- * Returns the context for making a pos tag decision at the specified token
index
- * given the specified tokens and previous tags.
+ * Returns the context for making a postag decision at the specified token
{@code index}
+ * given the specified {@code tokens} and previous {@code tags}.
+ *
* @param index The index of the token for which the context is provided.
- * @param tokens The tokens in the sentence.
+ * @param tokens The tokens representing a sentence.
* @param tags The tags assigned to the previous words in the sentence.
- * @return The context for making a pos tag decision at the specified token
index
- * given the specified tokens and previous tags.
+ *
+ * @return The context for making a postag decision at the specified token
{@code index}
+ * given the specified {@code tokens} and previous {@code tags}.
*/
public String[] getContext(int index, Object[] tokens, String[] tags) {
String next, nextnext = null, lex, prev, prevprev = null;
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/postag/DefaultPOSSequenceValidator.java
b/opennlp-tools/src/main/java/opennlp/tools/postag/DefaultPOSSequenceValidator.java
index 3fdeb441..358deab3 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/postag/DefaultPOSSequenceValidator.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/postag/DefaultPOSSequenceValidator.java
@@ -21,14 +21,21 @@ import java.util.Arrays;
import opennlp.tools.util.SequenceValidator;
+/**
+ * The default POS tagger {@link SequenceValidator} implementation.
+ */
public class DefaultPOSSequenceValidator implements SequenceValidator<String> {
- private TagDictionary tagDictionary;
+ private final TagDictionary tagDictionary;
+ /**
+ * @param tagDictionary A {@link TagDictionary} used for the new {@link
SequenceValidator}.
+ */
public DefaultPOSSequenceValidator(TagDictionary tagDictionary) {
this.tagDictionary = tagDictionary;
}
+ @Override
public boolean validSequence(int i, String[] inputSequence,
String[] outcomesSequence, String outcome) {
if (tagDictionary == null) {
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/postag/MutableTagDictionary.java
b/opennlp-tools/src/main/java/opennlp/tools/postag/MutableTagDictionary.java
index 52df16dc..8f72a891 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/postag/MutableTagDictionary.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/postag/MutableTagDictionary.java
@@ -38,13 +38,4 @@ public interface MutableTagDictionary extends TagDictionary {
*/
String[] put(String word, String... tags);
- /**
- * Whether if the dictionary is case sensitive or not
- *
- * @return true if the dictionary is case sensitive
- */
- // TODO: move to TagDictionary, can't do it now because of backward
- // compatibility.
- boolean isCaseSensitive();
-
}
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/postag/POSContextGenerator.java
b/opennlp-tools/src/main/java/opennlp/tools/postag/POSContextGenerator.java
index 29c90d55..dfcab161 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/postag/POSContextGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/postag/POSContextGenerator.java
@@ -22,8 +22,21 @@ import opennlp.tools.util.BeamSearchContextGenerator;
/**
- * The interface for a context generator for the POS Tagger.
+ * Interface for a {@link BeamSearchContextGenerator} used in POS tagging.
*/
-public interface POSContextGenerator extends
BeamSearchContextGenerator<String> {
- String[] getContext(int pos, String[] tokens, String[] prevTags, Object[]
ac);
+public interface POSContextGenerator extends
BeamSearchContextGenerator<String> {
+
+ /**
+ * Returns the context for making a postag decision at the specified token
{@code index}
+ * given the specified {@code tokens} and previous {@code tags}.
+ *
+ * @param index The index of the token for which the context is provided.
+ * @param tokens The token sequence representing a sentence.
+ * @param prevTags The tags assigned to the previous words in the sentence.
+ * @param additionalContext The context for additional information.
+ *
+ * @return The context for making a postag decision at the specified token
{@code index}
+ * given the specified {@code tokens} and previous {@code tags}.
+ */
+ String[] getContext(int index, String[] tokens, String[] prevTags, Object[]
additionalContext);
}
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/postag/POSDictionary.java
b/opennlp-tools/src/main/java/opennlp/tools/postag/POSDictionary.java
index cc358dd1..842c56b4 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/postag/POSDictionary.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/postag/POSDictionary.java
@@ -36,7 +36,7 @@ import opennlp.tools.util.model.SerializableArtifact;
/**
* Provides a means of determining which tags are valid for a particular word
- * based on a tag dictionary read from a file.
+ * based on a {@link TagDictionary} read from a file.
*/
public class POSDictionary implements Iterable<String>, MutableTagDictionary,
SerializableArtifact {
@@ -53,7 +53,9 @@ public class POSDictionary implements Iterable<String>,
MutableTagDictionary, Se
/**
* Initializes an empty {@link POSDictionary}.
- * @param caseSensitive the {@link POSDictionary} case sensitivity
+ *
+ * @param caseSensitive {@code true} if the {@link POSDictionary} is case
sensitive,
+ * {@code false} otherwise.
*/
public POSDictionary(boolean caseSensitive) {
dictionary = new HashMap<>();
@@ -61,12 +63,12 @@ public class POSDictionary implements Iterable<String>,
MutableTagDictionary, Se
}
/**
- * Returns a list of valid tags for the specified word.
+ * Returns a list of valid tags for the specified {@code word}.
*
* @param word The word.
*
- * @return A list of valid tags for the specified word or
- * null if no information is available for that word.
+ * @return An array of valid tags for the specified word or
+ * {@code null} if no information is available for that word.
*/
public String[] getTags(String word) {
if (caseSensitive) {
@@ -78,7 +80,7 @@ public class POSDictionary implements Iterable<String>,
MutableTagDictionary, Se
}
/**
- * Retrieves an iterator over all words in the dictionary.
+ * Retrieves an {@link Iterator} over all words in the dictionary.
*/
public Iterator<String> iterator() {
return dictionary.keySet().iterator();
@@ -103,7 +105,7 @@ public class POSDictionary implements Iterable<String>,
MutableTagDictionary, Se
/**
* Writes the {@link POSDictionary} to the given {@link OutputStream};
- *
+ * <p>
* After the serialization is finished the provided
* {@link OutputStream} remains open.
*
@@ -111,12 +113,12 @@ public class POSDictionary implements Iterable<String>,
MutableTagDictionary, Se
* the {@link OutputStream} to write the dictionary into.
*
* @throws IOException
- * if writing to the {@link OutputStream} fails
+ * Thrown if writing to the {@link OutputStream} fails.
*/
public void serialize(OutputStream out) throws IOException {
- Iterator<Entry> entries = new Iterator<Entry>() {
+ Iterator<Entry> entries = new Iterator<>() {
- Iterator<String> iterator = dictionary.keySet().iterator();
+ final Iterator<String> iterator = dictionary.keySet().iterator();
public boolean hasNext() {
return iterator.hasNext();
@@ -185,7 +187,7 @@ public class POSDictionary implements Iterable<String>,
MutableTagDictionary, Se
@Override
public String toString() {
- // it is time consuming to output the dictionary entries.
+ // it is time-consuming to output the dictionary entries.
// will output something meaningful for debugging, like
// POSDictionary{size=100, caseSensitive=true}
@@ -194,16 +196,17 @@ public class POSDictionary implements Iterable<String>,
MutableTagDictionary, Se
}
/**
- * Creates a new {@link POSDictionary} from a provided {@link InputStream}.
- *
+ * Creates a new {@link POSDictionary} from an {@link InputStream}.
+ * <p>
* After creation is finished the provided {@link InputStream} is closed.
*
- * @param in
+ * @param in The {@link InputStream} used for creating the {@link
POSDictionary}.
+ * The stream must be open and have bytes available to read from.
*
- * @return the pos dictionary
+ * @return A valid {@link POSDictionary} instance.
*
- * @throws IOException
- * @throws InvalidFormatException
+ * @throws IOException Thrown if IO errors occurred during creation.
+ * @throws InvalidFormatException Thrown if the entries don't have exactly
one token.
*/
public static POSDictionary create(InputStream in) throws IOException {
@@ -212,9 +215,7 @@ public class POSDictionary implements Iterable<String>,
MutableTagDictionary, Se
boolean isCaseSensitive = DictionaryEntryPersistor.create(in, entry -> {
String tagString = entry.getAttributes().getValue("tags");
-
String[] tags = tagString.split(" ");
-
StringList word = entry.getTokens();
if (word.size() != 1)
@@ -239,6 +240,7 @@ public class POSDictionary implements Iterable<String>,
MutableTagDictionary, Se
return newPosDict;
}
+ @Override
public String[] put(String word, String... tags) {
if (this.caseSensitive) {
return dictionary.put(word, tags);
@@ -247,6 +249,7 @@ public class POSDictionary implements Iterable<String>,
MutableTagDictionary, Se
}
}
+ @Override
public boolean isCaseSensitive() {
return this.caseSensitive;
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/postag/POSEvaluator.java
b/opennlp-tools/src/main/java/opennlp/tools/postag/POSEvaluator.java
index eaf6bafc..0d7db4e3 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/postag/POSEvaluator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/postag/POSEvaluator.java
@@ -22,21 +22,20 @@ import opennlp.tools.util.eval.Evaluator;
import opennlp.tools.util.eval.Mean;
/**
- * The {@link POSEvaluator} measures the performance of
- * the given {@link POSTagger} with the provided reference
- * {@link POSSample}s.
+ * The {@link POSEvaluator} measures the performance of the given {@link
POSTagger}
+ * with the provided reference {@link POSSample samples}.
*/
public class POSEvaluator extends Evaluator<POSSample> {
- private POSTagger tagger;
+ private final POSTagger tagger;
- private Mean wordAccuracy = new Mean();
+ private final Mean wordAccuracy = new Mean();
/**
* Initializes the current instance.
*
- * @param tagger
- * @param listeners an array of evaluation listeners
+ * @param tagger The {@link POSTagger} to evaluate.
+ * @param listeners the {@link POSTaggerEvaluationMonitor evaluation
listeners}.
*/
public POSEvaluator(POSTagger tagger, POSTaggerEvaluationMonitor ...
listeners) {
super(listeners);
@@ -45,19 +44,18 @@ public class POSEvaluator extends Evaluator<POSSample> {
/**
* Evaluates the given reference {@link POSSample} object.
- *
* This is done by tagging the sentence from the reference
* {@link POSSample} with the {@link POSTagger}. The
* tags are then used to update the word accuracy score.
*
- * @param reference the reference {@link POSSample}.
+ * @param reference The {@link POSSample} to process.
*
- * @return the predicted {@link POSSample}.
+ * @return The predicted {@link POSSample}.
*/
@Override
protected POSSample processSample(POSSample reference) {
- String[] predictedTags = tagger.tag(reference.getSentence(),
reference.getAddictionalContext());
+ String[] predictedTags = tagger.tag(reference.getSentence(),
reference.getAdditionalContext());
String[] referenceTags = reference.getTags();
for (int i = 0; i < referenceTags.length; i++) {
@@ -73,29 +71,24 @@ public class POSEvaluator extends Evaluator<POSSample> {
}
/**
- * Retrieves the word accuracy.
- *
- * This is defined as:
- * word accuracy = correctly detected tags / total words
+ * Accuracy defined as:
+ * {@code word accuracy = correctly detected tags / total words}
*
- * @return the word accuracy
+ * @return Retrieves the mean word accuracy.
*/
public double getWordAccuracy() {
return wordAccuracy.mean();
}
/**
- * Retrieves the total number of words considered
- * in the evaluation.
- *
- * @return the word count
+ * @return Retrieves the total number of words considered in the evaluation.
*/
public long getWordCount() {
return wordAccuracy.count();
}
/**
- * Represents this objects as human readable {@link String}.
+ * Represents this object as human-readable {@link String}.
*/
@Override
public String toString() {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/postag/POSModel.java
b/opennlp-tools/src/main/java/opennlp/tools/postag/POSModel.java
index 030e1d9f..422cac56 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/postag/POSModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/postag/POSModel.java
@@ -26,7 +26,6 @@ import java.util.Map;
import java.util.Objects;
import java.util.Properties;
-import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.ml.BeamSearch;
import opennlp.tools.ml.model.MaxentModel;
import opennlp.tools.ml.model.SequenceClassificationModel;
@@ -39,8 +38,7 @@ import opennlp.tools.util.model.POSModelSerializer;
import opennlp.tools.util.model.SerializableArtifact;
/**
- * The {@link POSModel} is the model used
- * by a learnable {@link POSTagger}.
+ * The {@link POSModel} is the model used by a learnable {@link POSTagger}.
*
* @see POSTaggerME
*/
@@ -51,6 +49,14 @@ public final class POSModel extends BaseModel implements
SerializableArtifact {
static final String POS_MODEL_ENTRY_NAME = "pos.model";
static final String GENERATOR_DESCRIPTOR_ENTRY_NAME = "generator.featuregen";
+ /**
+ * Initializes a {@link POSModel} instance via given parameters.
+ *
+ * @param languageCode An ISO conform language code.
+ * @param posModel A valid {@link SequenceClassificationModel}.
+ * @param manifestInfoEntries Additional information kept in the manifest.
+ * @param posFactory The {@link POSTaggerFactory} for creating related
objects.
+ */
public POSModel(String languageCode, SequenceClassificationModel<String>
posModel,
Map<String, String> manifestInfoEntries, POSTaggerFactory posFactory) {
@@ -69,11 +75,28 @@ public final class POSModel extends BaseModel implements
SerializableArtifact {
// checkArtifactMap();
}
+ /**
+ * Initializes a {@link POSModel} instance via given parameters.
+ *
+ * @param languageCode An ISO conform language code.
+ * @param posModel A valid {@link MaxentModel}.
+ * @param manifestInfoEntries Additional information kept in the manifest.
+ * @param posFactory The {@link POSTaggerFactory} for creating related
objects.
+ */
public POSModel(String languageCode, MaxentModel posModel,
Map<String, String> manifestInfoEntries, POSTaggerFactory posFactory) {
this(languageCode, posModel, POSTaggerME.DEFAULT_BEAM_SIZE,
manifestInfoEntries, posFactory);
}
+ /**
+ * Initializes a {@link POSModel} instance via given parameters.
+ *
+ * @param languageCode An ISO conform language code.
+ * @param posModel A valid {@link MaxentModel}.
+ * @param beamSize The size of the beam that should be used when decoding
sequences.
+ * @param manifestInfoEntries Additional information kept in the manifest.
+ * @param posFactory The {@link POSTaggerFactory} for creating related
objects.
+ */
public POSModel(String languageCode, MaxentModel posModel, int beamSize,
Map<String, String> manifestInfoEntries, POSTaggerFactory posFactory) {
@@ -86,26 +109,51 @@ public final class POSModel extends BaseModel implements
SerializableArtifact {
artifactMap.put(POS_MODEL_ENTRY_NAME, posModel);
artifactMap.put(GENERATOR_DESCRIPTOR_ENTRY_NAME,
posFactory.getFeatureGenerator());
-
- for (Map.Entry<String, Object> resource :
posFactory.getResources().entrySet()) {
- artifactMap.put(resource.getKey(), resource.getValue());
- }
+ artifactMap.putAll(posFactory.getResources());
checkArtifactMap();
}
+ /**
+ * Initializes a {@link POSModel} instance via a valid {@link InputStream}.
+ *
+ * @param in The {@link InputStream} used for loading the model.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ */
public POSModel(InputStream in) throws IOException {
super(COMPONENT_NAME, in);
}
+ /**
+ * Initializes a {@link POSModel} instance via a valid {@link File}.
+ *
+ * @param modelFile The {@link File} used for loading the model.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ */
public POSModel(File modelFile) throws IOException {
super(COMPONENT_NAME, modelFile);
}
+ /**
+ * Initializes a {@link POSModel} instance via a valid {@link Path}.
+ *
+ * @param modelPath The {@link Path} used for loading the model.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ */
public POSModel(Path modelPath) throws IOException {
this(modelPath.toFile());
}
+ /**
+ * Initializes a {@link POSModel} instance via a valid {@link URL}.
+ *
+ * @param modelURL The {@link URL} used for loading the model.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ */
public POSModel(URL modelURL) throws IOException {
super(COMPONENT_NAME, modelURL);
}
@@ -125,7 +173,7 @@ public final class POSModel extends BaseModel implements
SerializableArtifact {
}
/**
- * @deprecated use getPosSequenceModel instead. This method will be removed
soon.
+ * @deprecated use {@link POSModel#getPosSequenceModel} instead. This method
will be removed soon.
* Only required for Parser 1.5.x backward compatibility. Newer models don't
need this anymore.
*/
@Deprecated
@@ -138,6 +186,9 @@ public final class POSModel extends BaseModel implements
SerializableArtifact {
}
}
+ /**
+ * @return Retrieves a {@link SequenceClassificationModel}.
+ */
public SequenceClassificationModel<String> getPosSequenceModel() {
Properties manifest = (Properties) artifactMap.get(MANIFEST_ENTRY);
@@ -160,6 +211,9 @@ public final class POSModel extends BaseModel implements
SerializableArtifact {
}
}
+ /**
+ * @return Retrieves the active {@link POSTaggerFactory}.
+ */
public POSTaggerFactory getFactory() {
return (POSTaggerFactory) this.toolFactory;
}
@@ -171,17 +225,6 @@ public final class POSModel extends BaseModel implements
SerializableArtifact {
serializers.put("featuregen", new ByteArraySerializer());
}
- /**
- * Retrieves the ngram dictionary.
- *
- * @return ngram dictionary or null if not used
- */
- public Dictionary getNgramDictionary() {
- if (getFactory() != null)
- return getFactory().getDictionary();
- return null;
- }
-
@Override
public Class<POSModelSerializer> getArtifactSerializerClass() {
return POSModelSerializer.class;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/postag/POSSample.java
b/opennlp-tools/src/main/java/opennlp/tools/postag/POSSample.java
index d6c40fdf..1367344a 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/postag/POSSample.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/postag/POSSample.java
@@ -27,7 +27,7 @@ import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.InvalidFormatException;
/**
- * Represents an pos-tagged sentence.
+ * Represents a pos-tagged {@link Sample sentence}.
*/
public class POSSample implements Sample {
@@ -37,14 +37,33 @@ public class POSSample implements Sample {
private final String[][] additionalContext;
+ /**
+ * Initializes a {@link POSSample} instance.
+ *
+ * @param sentence The sentence of tokens to be tagged.
+ * @param tags An array of pos tags for each token provided in {@code
sentence}.
+ */
public POSSample(String[] sentence, String[] tags) {
this(sentence, tags, null);
}
+ /**
+ * Initializes a {@link POSSample} instance.
+ *
+ * @param sentence The sentence to be tagged.
+ * @param tags A {@link List} of pos tags for each token provided in {@code
sentence}.
+ */
public POSSample(List<String> sentence, List<String> tags) {
this(sentence, tags, null);
}
+ /**
+ * Initializes a {@link POSSample} instance.
+ *
+ * @param sentence The sentence to be tagged.
+ * @param tags A {@link List} of pos tags for each token provided in {@code
sentence}.
+ * @param additionalContext A 2D array which holds additional information
for the context.
+ */
public POSSample(List<String> sentence, List<String> tags,
String[][] additionalContext) {
this.sentence = Collections.unmodifiableList(sentence);
@@ -66,8 +85,14 @@ public class POSSample implements Sample {
this.additionalContext = ac;
}
- public POSSample(String[] sentence, String[] tags,
- String[][] additionalContext) {
+ /**
+ * Initializes a {@link POSSample} instance.
+ *
+ * @param sentence The sentence to be tagged.
+ * @param tags An array of pos tags for each token provided in {@code
sentence}.
+ * @param additionalContext A 2D array which holds additional information
for the context.
+ */
+ public POSSample(String[] sentence, String[] tags, String[][]
additionalContext) {
this(Arrays.asList(sentence), Arrays.asList(tags), additionalContext);
}
@@ -86,15 +111,24 @@ public class POSSample implements Sample {
}
}
+ /**
+ * @return Retrieves the sentence as array.
+ */
public String[] getSentence() {
return sentence.toArray(new String[sentence.size()]);
}
+ /**
+ * @return Retrieves the tags as array.
+ */
public String[] getTags() {
return tags.toArray(new String[tags.size()]);
}
- public String[][] getAddictionalContext() {
+ /**
+ * @return Retrieves additional information for the context.
+ */
+ public String[][] getAdditionalContext() {
return this.additionalContext;
}
@@ -118,6 +152,14 @@ public class POSSample implements Sample {
return result.toString();
}
+ /**
+ * Parses a {@code sentenceString}.
+ *
+ * @param sentenceString The sentence to be parsed.
+ * @return A valid {@link POSSample} result.
+ *
+ * @throws InvalidFormatException Thrown if errors occurred during parsing.
+ */
public static POSSample parse(String sentenceString) throws
InvalidFormatException {
String[] tokenTags = WhitespaceTokenizer.INSTANCE.tokenize(sentenceString);
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/postag/POSSampleEventStream.java
b/opennlp-tools/src/main/java/opennlp/tools/postag/POSSampleEventStream.java
index aa3c99dd..e7cbff05 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/postag/POSSampleEventStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/postag/POSSampleEventStream.java
@@ -27,35 +27,33 @@ import opennlp.tools.util.AbstractEventStream;
import opennlp.tools.util.ObjectStream;
/**
- * This class reads the {@link POSSample}s from the given {@link Iterator}
- * and converts the {@link POSSample}s into {@link Event}s which
+ * Reads the {@link POSSample samples} from an {@link Iterator}
+ * and converts those samples into {@link Event events} which
* can be used by the maxent library for training.
*/
public class POSSampleEventStream extends AbstractEventStream<POSSample> {
/**
- * The {@link POSContextGenerator} used
- * to create the training {@link Event}s.
+ * The {@link POSContextGenerator} used to create the training {@link Event
events}.
*/
- private POSContextGenerator cg;
+ private final POSContextGenerator cg;
/**
- * Initializes the current instance with the given samples and the
- * given {@link POSContextGenerator}.
+ * Initializes the current instance with the given samples and a {@link
POSContextGenerator}.
*
- * @param samples
- * @param cg
+ * @param samples The data stream for this event stream.
+ * @param cg A {@link POSContextGenerator} to process the event stream
{@code samples}.
*/
public POSSampleEventStream(ObjectStream<POSSample> samples,
POSContextGenerator cg) {
super(samples);
-
this.cg = cg;
}
/**
* Initializes the current instance with given samples
* and a {@link DefaultPOSContextGenerator}.
- * @param samples
+ *
+ * @param samples The data stream for this event stream.
*/
public POSSampleEventStream(ObjectStream<POSSample> samples) {
this(samples, new DefaultPOSContextGenerator(null));
@@ -65,14 +63,14 @@ public class POSSampleEventStream extends
AbstractEventStream<POSSample> {
protected Iterator<Event> createEvents(POSSample sample) {
String[] sentence = sample.getSentence();
String[] tags = sample.getTags();
- Object[] ac = sample.getAddictionalContext();
+ Object[] ac = sample.getAdditionalContext();
List<Event> events = generateEvents(sentence, tags, ac, cg);
return events.iterator();
}
public static List<Event> generateEvents(String[] sentence, String[] tags,
Object[] additionalContext, POSContextGenerator cg) {
- List<Event> events = new ArrayList<Event>(sentence.length);
+ List<Event> events = new ArrayList<>(sentence.length);
for (int i = 0; i < sentence.length; i++) {
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/postag/POSSampleSequenceStream.java
b/opennlp-tools/src/main/java/opennlp/tools/postag/POSSampleSequenceStream.java
index c69cf6e9..7b36eacb 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/postag/POSSampleSequenceStream.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/postag/POSSampleSequenceStream.java
@@ -25,25 +25,41 @@ import opennlp.tools.ml.model.Sequence;
import opennlp.tools.ml.model.SequenceStream;
import opennlp.tools.util.ObjectStream;
+/**
+ * A {@link SequenceStream} implementation encapsulating {@link POSSample
samples}.
+ */
public class POSSampleSequenceStream implements SequenceStream<POSSample> {
- private POSContextGenerator pcg;
- private ObjectStream<POSSample> psi;
+ private final POSContextGenerator pcg;
+ private final ObjectStream<POSSample> psi;
- public POSSampleSequenceStream(ObjectStream<POSSample> psi) throws
IOException {
+ /**
+ * Creates a {@link POSSampleSequenceStream} with given {@code samples} using
+ * a {@link DefaultPOSContextGenerator}.
+ *
+ * @param psi The data stream of {@link POSSample samples}.
+ */
+ public POSSampleSequenceStream(ObjectStream<POSSample> psi) {
this(psi, new DefaultPOSContextGenerator(null));
}
- public POSSampleSequenceStream(ObjectStream<POSSample> psi,
POSContextGenerator pcg)
- throws IOException {
+ /**
+ * Creates a {@link POSSampleSequenceStream} with given {@code samples} using
+ * a {@link POSContextGenerator}.
+ *
+ * @param psi The data stream of {@link POSSample samples}.
+ * @param pcg A {@link POSContextGenerator} which shall be used.
+ */
+ public POSSampleSequenceStream(ObjectStream<POSSample> psi,
POSContextGenerator pcg) {
this.psi = psi;
this.pcg = pcg;
}
+ @Override
public Event[] updateContext(Sequence<POSSample> pss, AbstractModel model) {
POSTagger tagger = new POSTaggerME(new POSModel("x-unspecified", model,
null, new POSTaggerFactory()));
String[] sentence = pss.getSource().getSentence();
- Object[] ac = pss.getSource().getAddictionalContext();
+ Object[] ac = pss.getSource().getAdditionalContext();
String[] tags = tagger.tag(pss.getSource().getSentence());
Event[] events = new Event[sentence.length];
POSSampleEventStream.generateEvents(sentence, tags, ac,
pcg).toArray(events);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/postag/POSTagger.java
b/opennlp-tools/src/main/java/opennlp/tools/postag/POSTagger.java
index e7599047..6f1661a7 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/postag/POSTagger.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/postag/POSTagger.java
@@ -26,14 +26,38 @@ public interface POSTagger {
/**
* Assigns the sentence of tokens pos tags.
+ *
* @param sentence The sentence of tokens to be tagged.
- * @return an array of pos tags for each token provided in sentence.
+ * @return An array of pos tags for each token provided in {@code sentence}.
*/
String[] tag(String[] sentence);
+ /**
+ * Assigns the sentence of tokens pos tags.
+ *
+ * @param sentence The sentence of tokens to be tagged.
+ * @param additionalContext The context to provide additional information
with.
+ *
+ * @return An array of pos tags for each token provided in {@code sentence}.
+ */
String[] tag(String[] sentence, Object[] additionalContext);
+ /**
+ * Assigns the sentence the top-k {@link Sequence sequences}.
+ *
+ * @param sentence The sentence of tokens to be tagged.
+ *
+ * @return An array of {@link Sequence sequences} for each token provided
in {@code sentence}.
+ */
Sequence[] topKSequences(String[] sentence);
+ /**
+ * Assigns the sentence the top-k {@link Sequence sequences}.
+ *
+ * @param sentence The sentence of tokens to be tagged.
+ * @param additionalContext The context to provide additional information
with.
+ *
+ * @return An array of {@link Sequence sequences} for each token provided in
{@code sentence}.
+ */
Sequence[] topKSequences(String[] sentence, Object[] additionalContext);
}
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerCrossValidator.java
b/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerCrossValidator.java
index a35bbb6c..96ffa10b 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerCrossValidator.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerCrossValidator.java
@@ -21,7 +21,6 @@ import java.io.File;
import java.io.IOException;
import java.util.Map;
-import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.eval.CrossValidationPartitioner;
@@ -36,21 +35,29 @@ public class POSTaggerCrossValidator {
private byte[] featureGeneratorBytes;
private Map<String, Object> resources;
- private Mean wordAccuracy = new Mean();
- private POSTaggerEvaluationMonitor[] listeners;
+ private final Mean wordAccuracy = new Mean();
+ private final POSTaggerEvaluationMonitor[] listeners;
/* this will be used to load the factory after the ngram dictionary was
created */
private String factoryClassName;
/* user can also send a ready to use factory */
private POSTaggerFactory factory;
- private Integer tagdicCutoff = null;
+ private final Integer tagdicCutoff;
private File tagDictionaryFile;
/**
- * Creates a {@link POSTaggerCrossValidator} that builds a ngram dictionary
- * dynamically. It instantiates a sub-class of {@link POSTaggerFactory} using
+ * Initializes a {@link POSTaggerCrossValidator} that builds a ngram
dictionary
+ * dynamically. It instantiates a subclass of {@link POSTaggerFactory} using
* the tag and the ngram dictionaries.
+ *
+ * @param languageCode An ISO conform language code.
+ * @param trainParam The {@link TrainingParameters} for the context of cross
validation.
+ * @param tagDictionary The {@link File} that references a {@link
TagDictionary}.
+ * @param featureGeneratorBytes The bytes for feature generation.
+ * @param resources Additional resources as key-value map.
+ * @param factoryClass The class name used for factory instantiation.
+ * @param listeners The {@link POSTaggerEvaluationMonitor evaluation
listeners}.
*/
public POSTaggerCrossValidator(String languageCode,
TrainingParameters trainParam, File
tagDictionary,
@@ -69,8 +76,12 @@ public class POSTaggerCrossValidator {
/**
- * Creates a {@link POSTaggerCrossValidator} using the given
- * {@link POSTaggerFactory}.
+ * Creates a {@link POSTaggerCrossValidator} using the given {@link
POSTaggerFactory}.
+ *
+ * @param languageCode An ISO conform language code.
+ * @param trainParam The {@link TrainingParameters} for the context of cross
validation.
+ * @param factory The {@link POSTaggerFactory} to be used.
+ * @param listeners The {@link POSTaggerEvaluationMonitor evaluation
listeners}.
*/
public POSTaggerCrossValidator(String languageCode,
TrainingParameters trainParam, POSTaggerFactory factory,
@@ -85,12 +96,10 @@ public class POSTaggerCrossValidator {
/**
* Starts the evaluation.
*
- * @param samples
- * the data to train and test
- * @param nFolds
- * number of folds
+ * @param samples The {@link ObjectStream} of {@link POSSample samples} to
train and test with.
+ * @param nFolds Number of folds. It must be greater than zero.
*
- * @throws IOException
+ * @throws IOException Thrown if IO errors occurred.
*/
public void evaluate(ObjectStream<POSSample> samples, int nFolds) throws
IOException {
@@ -102,7 +111,6 @@ public class POSTaggerCrossValidator {
CrossValidationPartitioner.TrainingSampleStream<POSSample>
trainingSampleStream = partitioner
.next();
-
if (this.tagDictionaryFile != null
&& this.factory.getTagDictionary() == null) {
this.factory.setTagDictionary(this.factory
@@ -147,26 +155,19 @@ public class POSTaggerCrossValidator {
}
/**
- * Retrieves the accuracy for all iterations.
- *
- * @return the word accuracy
+ * @return Retrieves the accuracy for all iterations.
*/
public double getWordAccuracy() {
return wordAccuracy.mean();
}
/**
- * Retrieves the number of words which where validated
- * over all iterations. The result is the amount of folds
- * multiplied by the total number of words.
- *
- * @return the word count
+ * @return Retrieves the number of words which were validated
+ * over all iterations. The result is the amount of folds
+ * multiplied by the total number of words.
*/
public long getWordCount() {
return wordAccuracy.count();
}
-
- private static POSTaggerFactory create(Dictionary ngram, TagDictionary pos) {
- return new POSTaggerFactory(ngram, pos);
- }
+
}
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerEvaluationMonitor.java
b/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerEvaluationMonitor.java
index d3ece7a9..d8e031a2 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerEvaluationMonitor.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerEvaluationMonitor.java
@@ -19,6 +19,9 @@ package opennlp.tools.postag;
import opennlp.tools.util.eval.EvaluationMonitor;
+/**
+ * A marker interface for evaluating {@link POSTagger pos taggers}.
+ */
public interface POSTaggerEvaluationMonitor extends
EvaluationMonitor<POSSample> {
}
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerFactory.java
b/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerFactory.java
index 501f9f6c..94fcd500 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerFactory.java
@@ -45,44 +45,32 @@ import opennlp.tools.util.model.ArtifactSerializer;
import opennlp.tools.util.model.UncloseableInputStream;
/**
- * The factory that provides POS Tagger default implementations and resources
+ * The factory that provides {@link POSTagger} default implementations and
resources.
*/
public class POSTaggerFactory extends BaseToolFactory {
private static final String TAG_DICTIONARY_ENTRY_NAME = "tags.tagdict";
private static final String NGRAM_DICTIONARY_ENTRY_NAME = "ngram.dictionary";
-
protected Dictionary ngramDictionary;
private byte[] featureGeneratorBytes;
private Map<String, Object> resources;
protected TagDictionary posDictionary;
/**
- * Creates a {@link POSTaggerFactory} that provides the default
implementation
+ * Initializes a {@link POSTaggerFactory} that provides the default
implementation
* of the resources.
*/
public POSTaggerFactory() {
}
/**
- * Creates a {@link POSTaggerFactory}. Use this constructor to
- * programmatically create a factory.
- *
- * @param ngramDictionary
- * @param posDictionary
+ * Initializes a {@link POSTaggerFactory} from a given set of the resources.
*
- * @deprecated this constructor is here for backward compatibility and
- * is not functional anymore in the training of 1.8.x series
models
+ * @param featureGeneratorBytes The bytes for feature generation.
+ * @param resources Additional resources as key-value map.
+ * @param posDictionary A {@link TagDictionary} used for the new instance.
*/
- @Deprecated
- public POSTaggerFactory(Dictionary ngramDictionary, TagDictionary
posDictionary) {
- this.init(ngramDictionary, posDictionary);
-
- // TODO: This could be made functional by creating some default feature
generation
- // which uses the dictionary ...
- }
-
public POSTaggerFactory(byte[] featureGeneratorBytes, final Map<String,
Object> resources,
TagDictionary posDictionary) {
this.featureGeneratorBytes = featureGeneratorBytes;
@@ -95,8 +83,19 @@ public class POSTaggerFactory extends BaseToolFactory {
this.posDictionary = posDictionary;
}
+
+ // reduced visibility to ensure deprecation is respected in future versions
+ @Deprecated
+ POSTaggerFactory(Dictionary ngramDictionary, TagDictionary posDictionary) {
+ this.init(ngramDictionary, posDictionary);
+
+ // TODO: This could be made functional by creating some default feature
generation
+ // which uses the dictionary ...
+ }
+
+ // reduced visibility to ensure deprecation is respected in future versions
@Deprecated // will be removed when only 8 series models are supported
- protected void init(Dictionary ngramDictionary, TagDictionary posDictionary)
{
+ void init(Dictionary ngramDictionary, TagDictionary posDictionary) {
this.ngramDictionary = ngramDictionary;
this.posDictionary = posDictionary;
}
@@ -133,11 +132,11 @@ public class POSTaggerFactory extends BaseToolFactory {
/**
* Creates the {@link AdaptiveFeatureGenerator}. Usually this
* is a set of generators contained in the {@link
AggregatedFeatureGenerator}.
- *
+ * <p>
* Note:
* The generators are created on every call to this method.
*
- * @return the feature generator or null if there is no descriptor in the
model
+ * @return the feature generator or {@code null} if there is no descriptor
in the model
*/
public AdaptiveFeatureGenerator createFeatureGenerators() {
@@ -207,11 +206,27 @@ public class POSTaggerFactory extends BaseToolFactory {
return artifactMap;
}
+ /**
+ * Initializes a {@link TagDictionary} from a {@link File dictionary file}.
+ *
+ * @param dictionary The {@link File} used for creating the dictionary.
+ * @return A valid {@link TagDictionary} ready for use.
+ *
+ * @throws IOException Thrown if IO errors occurred during creation.
+ */
public TagDictionary createTagDictionary(File dictionary)
throws IOException {
return createTagDictionary(new FileInputStream(dictionary));
}
+ /**
+ * Initializes a {@link TagDictionary} from a {@link InputStream dictionary
stream}.
+ *
+ * @param in The {@link InputStream} used for creating the dictionary.
+ * @return A valid {@link TagDictionary} ready for use.
+ *
+ * @throws IOException Thrown if IO errors occurred during creation.
+ */
public TagDictionary createTagDictionary(InputStream in)
throws IOException {
return POSDictionary.create(in);
@@ -225,9 +240,10 @@ public class POSTaggerFactory extends BaseToolFactory {
this.posDictionary = dictionary;
}
+ /**
+ * @return The key-value based resources map, or an empty map.
+ */
protected Map<String, Object> getResources() {
-
-
if (resources != null) {
return resources;
}
@@ -235,39 +251,40 @@ public class POSTaggerFactory extends BaseToolFactory {
return Collections.emptyMap();
}
+ /**
+ * @return The feature generator bytes used.
+ */
protected byte[] getFeatureGenerator() {
return featureGeneratorBytes;
}
+ /**
+ * @return The {@link TagDictionary} used.
+ */
public TagDictionary getTagDictionary() {
if (this.posDictionary == null && artifactProvider != null)
this.posDictionary =
artifactProvider.getArtifact(TAG_DICTIONARY_ENTRY_NAME);
return this.posDictionary;
}
- /**
- * @deprecated this will be reduced in visibility and later removed
- */
- @Deprecated
- public Dictionary getDictionary() {
+ @Deprecated // will be removed when only 8 series models are supported
+ private Dictionary getDictionary() {
if (this.ngramDictionary == null && artifactProvider != null)
this.ngramDictionary =
artifactProvider.getArtifact(NGRAM_DICTIONARY_ENTRY_NAME);
return this.ngramDictionary;
}
- @Deprecated
- public void setDictionary(Dictionary ngramDict) {
- if (artifactProvider != null) {
- throw new IllegalStateException(
- "Can not set ngram dictionary while using artifact provider.");
- }
- this.ngramDictionary = ngramDict;
- }
-
+ /**
+ * @return The {@link POSContextGenerator} with a default cache size of
{@code 0}.
+ */
public POSContextGenerator getPOSContextGenerator() {
return getPOSContextGenerator(0);
}
+ /**
+ * @param cacheSize Must be greater than or equal to {@code 0}.
+ * @return The {@link POSContextGenerator} configured with the given {@code
cacheSize}.
+ */
public POSContextGenerator getPOSContextGenerator(int cacheSize) {
if (artifactProvider != null) {
@@ -282,9 +299,11 @@ public class POSTaggerFactory extends BaseToolFactory {
}
return new ConfigurablePOSContextGenerator(cacheSize,
createFeatureGenerators());
-
}
+ /**
+ * @return The {@link SequenceValidator} used.
+ */
public SequenceValidator<String> getSequenceValidator() {
return new DefaultPOSSequenceValidator(getTagDictionary());
}
@@ -308,8 +327,8 @@ public class POSTaggerFactory extends BaseToolFactory {
}
}
- protected void validatePOSDictionary(POSDictionary posDict,
- AbstractModel posModel) throws InvalidFormatException {
+ protected void validatePOSDictionary(POSDictionary posDict, AbstractModel
posModel)
+ throws InvalidFormatException {
Set<String> dictTags = new HashSet<>();
for (String word : posDict) {
@@ -363,11 +382,11 @@ public class POSTaggerFactory extends BaseToolFactory {
if (ngramDictEntry != null && !(ngramDictEntry instanceof Dictionary)) {
throw new InvalidFormatException("NGram dictionary has wrong type!");
}
-
}
+ // reduced visibility to ensure deprecation is respected in future versions
@Deprecated
- public static POSTaggerFactory create(String subclassName,
+ static POSTaggerFactory create(String subclassName,
Dictionary ngramDictionary, TagDictionary posDictionary)
throws InvalidFormatException {
if (subclassName == null) {
@@ -386,6 +405,21 @@ public class POSTaggerFactory extends BaseToolFactory {
}
}
+ /**
+ * Instantiates a {@link POSTaggerFactory} via a given {@code subclassName}.
+ *
+ * @param subclassName The class name used for instantiation. If {@code
null}, an
+ * instance of {@link POSTaggerFactory} will be returned
+ * per default. Otherwise, the {@link ExtensionLoader}
mechanism
+ * is applied to load the requested {@code subclassName}.
+ * @param featureGeneratorBytes The bytes for feature generation.
+ * @param resources Additional resources as key-value map.
+ * @param posDictionary A {@link TagDictionary} used for the new instance.
+ * @return A valid {@link POSTaggerFactory} instance.
+ *
+ * @throws InvalidFormatException Thrown if the {@link ExtensionLoader}
mechanism failed to
+ * load the factory via the {@code
subclassName}.
+ */
public static POSTaggerFactory create(String subclassName, byte[]
featureGeneratorBytes,
Map<String, Object> resources,
TagDictionary posDictionary)
throws InvalidFormatException {
@@ -394,12 +428,12 @@ public class POSTaggerFactory extends BaseToolFactory {
if (subclassName == null) {
// will create the default factory
- theFactory = new POSTaggerFactory(null, posDictionary);
+ theFactory = new POSTaggerFactory();
+ theFactory.init(featureGeneratorBytes, resources, posDictionary);
}
else {
try {
- theFactory = ExtensionLoader.instantiateExtension(
- POSTaggerFactory.class, subclassName);
+ theFactory =
ExtensionLoader.instantiateExtension(POSTaggerFactory.class, subclassName);
} catch (Exception e) {
String msg = "Could not instantiate the " + subclassName
+ ". The initialization throw an exception.";
@@ -412,6 +446,9 @@ public class POSTaggerFactory extends BaseToolFactory {
return theFactory;
}
+ /**
+ * @return An empty, case-sensitive {@link TagDictionary}.
+ */
public TagDictionary createEmptyTagDictionary() {
this.posDictionary = new POSDictionary(true);
return this.posDictionary;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
b/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
index 655c2fde..f2ecc32f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
@@ -46,16 +46,16 @@ import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.featuregen.StringPattern;
/**
- * A part-of-speech tagger that uses maximum entropy. Tries to predict whether
- * words are nouns, verbs, or any of 70 other POS tags depending on their
- * surrounding context.
- *
+ * A {@link POSTagger part-of-speech tagger} that uses maximum entropy.
+ * <p>
+ * Tries to predict whether words are nouns, verbs, or any of 70 other POS tags
+ * depending on their surrounding context.
*/
public class POSTaggerME implements POSTagger {
public static final int DEFAULT_BEAM_SIZE = 3;
- private POSModel modelPackage;
+ private final POSModel modelPackage;
/**
* The feature context generator.
@@ -63,18 +63,17 @@ public class POSTaggerME implements POSTagger {
protected POSContextGenerator contextGen;
/**
- * Tag dictionary used for restricting words to a fixed set of tags.
+ * {@link TagDictionary} used for restricting words to a fixed set of tags.
*/
protected TagDictionary tagDictionary;
- protected Dictionary ngramDictionary;
+ protected Dictionary ngramDictionary; // TODO unused - Could this be removed?
/**
* Says whether a filter should be used to check whether a tag assignment
- * is to a word outside of a closed class.
+ * is to a word outside a closed class.
*/
- protected boolean useClosedClassTagsFilter = false;
-
+ protected boolean useClosedClassTagsFilter = false; // TODO unused - Could
this be removed?
/**
* The size of the beam to be used in determining the best sequence of pos
tags.
@@ -83,23 +82,26 @@ public class POSTaggerME implements POSTagger {
private Sequence bestSequence;
- private SequenceClassificationModel<String> model;
+ private final SequenceClassificationModel<String> model;
- private SequenceValidator<String> sequenceValidator;
+ private final SequenceValidator<String> sequenceValidator;
/**
- * Initializes the sentence detector by downloading a default model.
- * @param language The language of the POS tagger
- * @throws IOException Thrown if the model cannot be downloaded or saved.
+ * Initializes a {@link POSTaggerME} by downloading a default model for a
given
+ * {@code language}.
+ *
+ * @param language An ISO conform language code.
+ *
+ * @throws IOException Thrown if the model could not be downloaded or saved.
*/
public POSTaggerME(String language) throws IOException {
this((POSModel) DownloadUtil.downloadModel(language,
DownloadUtil.ModelType.POS, POSModel.class));
}
/**
- * Initializes the current instance with the provided model.
+ * Initializes a {@link POSTaggerME} with the provided {@link POSModel
model}.
*
- * @param model
+ * @param model A valid {@link POSModel}.
*/
public POSTaggerME(POSModel model) {
POSTaggerFactory factory = model.getFactory();
@@ -131,19 +133,18 @@ public class POSTaggerME implements POSTagger {
}
/**
- * Retrieves an array of all possible part-of-speech tags from the
- * tagger.
- *
- * @return String[]
+ * @return Retrieves an array of all possible part-of-speech tags from the
tagger.
*/
public String[] getAllPosTags() {
return model.getOutcomes();
}
+ @Override
public String[] tag(String[] sentence) {
return this.tag(sentence, null);
}
+ @Override
public String[] tag(String[] sentence, Object[] additionalContext) {
bestSequence = model.bestSequence(sentence, additionalContext, contextGen,
sequenceValidator);
List<String> t = bestSequence.getOutcomes();
@@ -151,12 +152,12 @@ public class POSTaggerME implements POSTagger {
}
/**
- * Returns at most the specified number of taggings for the specified
sentence.
+ * Returns at most the specified {@code numTaggings} for the specified
{@code sentence}.
*
* @param numTaggings The number of tagging to be returned.
* @param sentence An array of tokens which make up a sentence.
*
- * @return At most the specified number of taggings for the specified
sentence.
+ * @return At most the specified number of taggings for the specified {@code
sentence}.
*/
public String[][] tag(int numTaggings, String[] sentence) {
Sequence[] bestSequences = model.bestSequences(numTaggings, sentence, null,
@@ -169,10 +170,12 @@ public class POSTaggerME implements POSTagger {
return tags;
}
+ @Override
public Sequence[] topKSequences(String[] sentence) {
return this.topKSequences(sentence, null);
}
+ @Override
public Sequence[] topKSequences(String[] sentence, Object[]
additionalContext) {
return model.bestSequences(size, sentence, additionalContext, contextGen,
sequenceValidator);
}
@@ -187,9 +190,7 @@ public class POSTaggerME implements POSTagger {
}
/**
- * Returns an array with the probabilities for each tag of the last tagged
sentence.
- *
- * @return an array with the probabilities for each tag of the last tagged
sentence.
+ * @return An array with the probabilities for each tag of the last tagged
sentence.
*/
public double[] probs() {
return bestSequence.getProbs();
@@ -227,7 +228,7 @@ public class POSTaggerME implements POSTagger {
}
else {
throw new UnsupportedOperationException("This method can only be called
if the "
- + "classifcation model is an event model!");
+ + "classification model is an event model!");
}
}
@@ -279,11 +280,20 @@ public class POSTaggerME implements POSTagger {
}
}
+ /**
+ * Constructs a {@link Dictionary nGram dictionary} from an {@link
ObjectStream} of samples.
+ *
+ * @param samples The {@link ObjectStream} to process.
+ * @param cutoff A non-negative cut-off value.
+ *
+ * @return A valid {@link Dictionary} instance holding nGrams.
+ *
+ * @throws IOException Thrown if IO errors occurred during dictionary
construction.
+ */
public static Dictionary buildNGramDictionary(ObjectStream<POSSample>
samples, int cutoff)
throws IOException {
NGramModel ngramModel = new NGramModel();
-
POSSample sample;
while ((sample = samples.read()) != null) {
String[] words = sample.getSentence();
@@ -297,8 +307,18 @@ public class POSTaggerME implements POSTagger {
return ngramModel.toDictionary(true);
}
+ /**
+ * Populates a {@link POSDictionary} from an {@link ObjectStream} of samples.
+ *
+ * @param samples The {@link ObjectStream} to process.
+ * @param dict The {@link MutableTagDictionary} to use during population.
+ * @param cutoff A non-negative cut-off value.
+ *
+ * @throws IOException Thrown if IO errors occurred during dictionary
construction.
+ */
public static void populatePOSDictionary(ObjectStream<POSSample> samples,
MutableTagDictionary dict, int cutoff) throws IOException {
+
System.out.println("Expanding POS Dictionary ...");
long start = System.nanoTime();
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/postag/TagDictionary.java
b/opennlp-tools/src/main/java/opennlp/tools/postag/TagDictionary.java
index 1e31d8ae..2b39b4f9 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/postag/TagDictionary.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/postag/TagDictionary.java
@@ -25,11 +25,18 @@ package opennlp.tools.postag;
public interface TagDictionary {
/**
- * Returns a list of valid tags for the specified word.
+ * Retrieves a list of valid tags for the specified {@code word}.
*
* @param word The word.
- * @return A list of valid tags for the specified word or null if no
information
- * is available for that word.
+ * @return An array of valid tags for the specified {@code word} or {@code
null} if
+ * no information is available for that word.
*/
String[] getTags(String word);
+
+ /**
+ * Whether the dictionary is case-sensitive or not.
+ *
+ * @return {@code true} if the dictionary is case-sensitive, else {@code
false}.
+ */
+ boolean isCaseSensitive();
}
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/postag/WordTagSampleStream.java
b/opennlp-tools/src/main/java/opennlp/tools/postag/WordTagSampleStream.java
index 86f1a6e3..7e4c04d7 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/postag/WordTagSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/postag/WordTagSampleStream.java
@@ -26,29 +26,32 @@ import opennlp.tools.util.ObjectStream;
/**
* A stream filter which reads a sentence per line which contains
- * words and tags in word_tag format and outputs a {@link POSSample} objects.
+ * words and tags in {@code word_tag} format and outputs {@link POSSample}
objects.
*/
public class WordTagSampleStream extends FilterObjectStream<String, POSSample>
{
/**
- * Initializes the current instance.
+ * Initializes a {@link WordTagSampleStream} instance.
*
- * @param sentences the sentences
+ * @param sentences The {@link ObjectStream sentences} to wrap.
*/
public WordTagSampleStream(ObjectStream<String> sentences) {
super(sentences);
}
/**
- * Parses the next sentence and return the next
- * {@link POSSample} object.
- *
+ * Parses the next sentence and returns the next {@link POSSample} object.
+ * <p>
* If an error occurs an empty {@link POSSample} object is returned
- * and an warning message is logged. Usually it does not matter if one
- * of many sentences is ignored.
+ * and a warning message is logged. Usually it does not matter if one
+ * or many sentences are ignored.
+ *
+ * @return A valid {@link POSSample} or {@code null} if the
+ * {@link ObjectStream sentence stream} is exhausted.
*
- * TODO: An exception in error case should be thrown.
+ * @throws IOException Thrown if IO errors occurred during read.
*/
+ @Override
public POSSample read() throws IOException {
String sentence = samples.read();
@@ -58,6 +61,7 @@ public class WordTagSampleStream extends
FilterObjectStream<String, POSSample> {
try {
sample = POSSample.parse(sentence);
} catch (InvalidFormatException e) {
+ // TODO: An exception in error case should be thrown.
System.out.println("Error during parsing, ignoring sentence: " +
sentence);
sample = new POSSample(new String[]{}, new String[]{});
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/util/model/ArtifactProvider.java
b/opennlp-tools/src/main/java/opennlp/tools/util/model/ArtifactProvider.java
index 99de0950..de347153 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/model/ArtifactProvider.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/model/ArtifactProvider.java
@@ -25,7 +25,7 @@ public interface ArtifactProvider {
/**
* Gets an artifact by name
*/
- public <T> T getArtifact(String key);
+ <T> T getArtifact(String key);
/**
* Retrieves the value to the given key from the manifest.properties
@@ -35,7 +35,7 @@ public interface ArtifactProvider {
*
* @return the value
*/
- public String getManifestProperty(String key);
+ String getManifestProperty(String key);
/**
* Retrieves the language code of the material which was used to train the
@@ -43,7 +43,7 @@ public interface ArtifactProvider {
*
* @return the language code of this model
*/
- public String getLanguage();
+ String getLanguage();
/**
* Indicates if this provider was loaded from serialized. It is useful, for
@@ -52,5 +52,5 @@ public interface ArtifactProvider {
*
* @return true if this model was loaded from serialized
*/
- public boolean isLoadedFromSerialized();
+ boolean isLoadedFromSerialized();
}
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/postag/POSSampleTest.java
b/opennlp-tools/src/test/java/opennlp/tools/postag/POSSampleTest.java
index 966a44be..d4c34bb1 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/postag/POSSampleTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/postag/POSSampleTest.java
@@ -76,8 +76,8 @@ public class POSSampleTest {
}
Assertions.assertNotNull(deSerializedPOSSample);
- Assertions.assertArrayEquals(posSample.getAddictionalContext(),
- deSerializedPOSSample.getAddictionalContext());
+ Assertions.assertArrayEquals(posSample.getAdditionalContext(),
+ deSerializedPOSSample.getAdditionalContext());
Assertions.assertArrayEquals(posSample.getSentence(),
deSerializedPOSSample.getSentence());
Assertions.assertArrayEquals(posSample.getTags(),
deSerializedPOSSample.getTags());
}