This is an automated email from the ASF dual-hosted git repository.
jzemerick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new 74ece59f OPENNLP-1405 Enhance JavaDoc in opennlp.tools.tokenize
package (#448)
74ece59f is described below
commit 74ece59fac2bb70f63a294a8bcce047e7eae9cb3
Author: Martin Wiesner <[email protected]>
AuthorDate: Sat Dec 10 15:21:51 2022 +0100
OPENNLP-1405 Enhance JavaDoc in opennlp.tools.tokenize package (#448)
- adds missing JavaDoc
- improves existing documentation for clarity
- removes superfluous text
- adds 'final' modifier where useful and applicable
- adds 'Override' annotation where useful and applicable
- fixes some typos
---
.../lemmatizer/LemmatizerEvaluationMonitor.java | 4 +-
.../tools/lemmatizer/LemmatizerFactory.java | 16 +++-
.../opennlp/tools/tokenize/AbstractTokenizer.java | 6 ++
.../tokenize/DefaultTokenContextGenerator.java | 34 ++++----
.../tools/tokenize/DetokenizationDictionary.java | 92 ++++++++++++++++------
.../java/opennlp/tools/tokenize/Detokenizer.java | 26 +++---
.../tools/tokenize/DetokenizerEvaluator.java | 14 ++--
.../tools/tokenize/DictionaryDetokenizer.java | 9 ++-
.../opennlp/tools/tokenize/SimpleTokenizer.java | 25 +++---
.../opennlp/tools/tokenize/TokSpanEventStream.java | 52 ++++++------
.../tools/tokenize/TokenContextGenerator.java | 11 +--
.../java/opennlp/tools/tokenize/TokenSample.java | 32 ++++++--
.../opennlp/tools/tokenize/TokenSampleStream.java | 34 +++++---
.../java/opennlp/tools/tokenize/Tokenizer.java | 20 +++--
.../tools/tokenize/TokenizerCrossValidator.java | 22 ++++--
.../tools/tokenize/TokenizerEvaluationMonitor.java | 6 +-
.../opennlp/tools/tokenize/TokenizerEvaluator.java | 16 ++--
.../opennlp/tools/tokenize/TokenizerFactory.java | 82 +++++++++----------
.../java/opennlp/tools/tokenize/TokenizerME.java | 76 +++++++++---------
.../opennlp/tools/tokenize/TokenizerModel.java | 53 +++++++++----
.../opennlp/tools/tokenize/TokenizerStream.java | 17 +++-
.../tools/tokenize/WhitespaceTokenStream.java | 8 +-
.../tools/tokenize/WhitespaceTokenizer.java | 12 +--
.../opennlp/tools/tokenize/WordpieceTokenizer.java | 42 ++++++++--
.../java/opennlp/tools/tokenize/package-info.java | 6 +-
.../java/opennlp/tools/util/eval/Evaluator.java | 4 +-
26 files changed, 446 insertions(+), 273 deletions(-)
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java
b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java
index 55993406..78c3fcba 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java
@@ -20,9 +20,7 @@ package opennlp.tools.lemmatizer;
import opennlp.tools.util.eval.EvaluationMonitor;
/**
- * Interface for the lemmatizer evaluator.
- * @version 2016-02-18
- *
+ * A marker interface for evaluating {@link Lemmatizer lemmatizers}.
*/
public interface LemmatizerEvaluationMonitor extends
EvaluationMonitor<LemmaSample> {
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java
b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java
index 0effba27..1d804a85 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java
@@ -22,15 +22,29 @@ import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.SequenceValidator;
import opennlp.tools.util.ext.ExtensionLoader;
+/**
+ * The factory that provides {@link Lemmatizer} default implementation and
+ * resources.
+ */
public class LemmatizerFactory extends BaseToolFactory {
/**
- * Creates a {@link LemmatizerFactory} that provides the default
implementation
+ * Instantiates a {@link LemmatizerFactory} that provides the default
implementation
* of the resources.
*/
public LemmatizerFactory() {
}
+ /**
+ * Instantiates a {@link LemmatizerFactory} via a given {@code subclassName}.
+ *
+ * @param subclassName The class name used for instantiation. If {@code
null}, an
+ * instance of {@link LemmatizerFactory} will be returned
+ * per default. Otherwise, the {@link ExtensionLoader}
mechanism
+ * is applied to load the requested {@code subclassName}.
+ *
+ * @return A valid {@link LemmatizerFactory} instance.
+ */
public static LemmatizerFactory create(String subclassName)
throws InvalidFormatException {
if (subclassName == null) {
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/AbstractTokenizer.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/AbstractTokenizer.java
index 2dc3754c..6cd17f49 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/AbstractTokenizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/AbstractTokenizer.java
@@ -23,10 +23,16 @@ abstract class AbstractTokenizer implements Tokenizer {
protected boolean keepNewLines = false;
+ @Override
public String[] tokenize(String s) {
return Span.spansToStrings(tokenizePos(s), s);
}
+ /**
+ * Switches whether to keep new lines or not.
+ *
+ * @param keepNewLines {@code true} if new lines are kept, {@code false}
otherwise.
+ */
public void setKeepNewLines(boolean keepNewLines) {
this.keepNewLines = keepNewLines;
}
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/DefaultTokenContextGenerator.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/DefaultTokenContextGenerator.java
index 552cc691..f334e287 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/DefaultTokenContextGenerator.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/DefaultTokenContextGenerator.java
@@ -25,31 +25,31 @@ import java.util.Set;
import opennlp.tools.util.StringUtil;
/**
- * Generate events for maxent decisions for tokenization.
+ * A default {@link TokenContextGenerator} which produces events for maxent
decisions
+ * for tokenization.
*/
public class DefaultTokenContextGenerator implements TokenContextGenerator {
protected final Set<String> inducedAbbreviations;
/**
- * Creates a default context generator for tokenizer.
+ * Initializes a plain {@link DefaultTokenContextGenerator} instance.
*/
public DefaultTokenContextGenerator() {
this(Collections.emptySet());
}
/**
- * Creates a default context generator for tokenizer.
+ * Initializes a customized {@link DefaultTokenContextGenerator} instance via
+ * a set of {@code inducedAbbreviations}.
*
- * @param inducedAbbreviations the induced abbreviations
+ * @param inducedAbbreviations The induced abbreviations to be used for this
instance.
*/
public DefaultTokenContextGenerator(Set<String> inducedAbbreviations) {
this.inducedAbbreviations = inducedAbbreviations;
}
- /* (non-Javadoc)
- * @see
opennlp.tools.tokenize.TokenContextGenerator#getContext(java.lang.String, int)
- */
+ @Override
public String[] getContext(String sentence, int index) {
List<String> preds = createContext(sentence, index);
String[] context = new String[preds.size()];
@@ -58,15 +58,15 @@ public class DefaultTokenContextGenerator implements
TokenContextGenerator {
}
/**
- * Returns an {@link ArrayList} of features for the specified sentence string
- * at the specified index. Extensions of this class can override this method
- * to create a customized {@link TokenContextGenerator}
+ * Computes a {@link List} of features for the specified {@code sentence}
+ * at the specified {@code index}. Extensions of {@link
DefaultTokenContextGenerator}
+ * can override this method to create a customized behaviour.
*
* @param sentence
- * the token been analyzed
+ * The sentence to create features for.
* @param index
- * the index of the character been analyzed
- * @return an {@link ArrayList} of features for the specified sentence string
+ * The positional index. Must be a non-negative number.
+ * @return A {@link List} of features for the specified {@code sentence}
string
* at the specified index.
*/
protected List<String> createContext(String sentence, int index) {
@@ -110,7 +110,13 @@ public class DefaultTokenContextGenerator implements
TokenContextGenerator {
/**
- * Helper function for getContext.
+ * Helper function for {@link #createContext} that appends to a given {@code
key}
+ * a fixed text sequence depending on {@code c}. The resulting combination
is added
+ * to the given list {@code preds}.
+ *
+ * @param key The input string to process.
+ * @param c A character used to discriminate which fixed text shall be
appended.
+ * @param preds The list into which the resulting combinations will be added.
*/
protected void addCharPreds(String key, char c, List<String> preds) {
preds.add(key + "=" + c);
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/DetokenizationDictionary.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/DetokenizationDictionary.java
index 55eca1c8..d3b0a495 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/DetokenizationDictionary.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/DetokenizationDictionary.java
@@ -23,6 +23,7 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
+import java.nio.file.Path;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
@@ -53,27 +54,36 @@ public class DetokenizationDictionary {
MOVE_BOTH,
/**
- * Attaches the token token to the right token on first occurrence, and
+ * Attaches the token to the right token on first occurrence, and
* to the token on the left side on the second occurrence.
*/
RIGHT_LEFT_MATCHING;
+ /**
+ * @param operation The string representation for which an {@link
Operation}
+ * instance is to be found.
+ * @return The {@link Operation enum} instance that matches the given
{@code operation},
+ * or {@code null} if the input has no equivalent.
+ */
public static Operation parse(String operation) {
-
- if (MOVE_RIGHT.toString().equals(operation)) {
- return MOVE_RIGHT;
- }
- else if (MOVE_LEFT.toString().equals(operation)) {
- return MOVE_LEFT;
- }
- else if (MOVE_BOTH.toString().equals(operation)) {
- return MOVE_BOTH;
- }
- else if (RIGHT_LEFT_MATCHING.toString().equals(operation)) {
- return RIGHT_LEFT_MATCHING;
- }
- else {
+ if (operation == null) {
return null;
+ } else {
+ if (MOVE_RIGHT.toString().equals(operation)) {
+ return MOVE_RIGHT;
+ }
+ else if (MOVE_LEFT.toString().equals(operation)) {
+ return MOVE_LEFT;
+ }
+ else if (MOVE_BOTH.toString().equals(operation)) {
+ return MOVE_BOTH;
+ }
+ else if (RIGHT_LEFT_MATCHING.toString().equals(operation)) {
+ return RIGHT_LEFT_MATCHING;
+ }
+ else {
+ return null;
+ }
}
}
}
@@ -81,11 +91,11 @@ public class DetokenizationDictionary {
private final Map<String, DetokenizationDictionary.Operation> operationTable
= new HashMap<>();
/**
- * Initializes the current instance.
+ * Initializes a {@link DetokenizationDictionary} instance.
*
- * @param tokens an array of tokens that should be detokenized according to
an operation
- * @param operations an array of operations which specifies which operation
- * should be used for the provided tokens
+ * @param tokens An array of tokens that should be de-tokenized according to
{@code operations}.
+ * @param operations An array of operations which specifies which operation
+ * should be used for the provided {@code tokens}.
*/
public DetokenizationDictionary(String[] tokens,
DetokenizationDictionary.Operation[] operations) {
@@ -107,16 +117,44 @@ public class DetokenizationDictionary {
}
}
+ /**
+ * Initializes a {@link DetokenizationDictionary} instance via a valid
{@link InputStream}.
+ *
+ * @param in The {@link InputStream} used for loading the dictionary.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ */
public DetokenizationDictionary(InputStream in) throws IOException {
init(in);
}
+ /**
+ * Initializes a {@link DetokenizationDictionary} instance via a valid
{@link File}.
+ *
+ * @param file The {@link File} used for loading the dictionary.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ */
public DetokenizationDictionary(File file) throws IOException {
try (InputStream in = new BufferedInputStream(new FileInputStream(file))) {
init(in);
}
}
+ /**
+ * Initializes a {@link DetokenizationDictionary} instance via a valid
{@link Path}.
+ *
+ * @param path The {@link Path} used for loading the dictionary.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ */
+ public DetokenizationDictionary(Path path) throws IOException {
+ this(path.toFile());
+ }
+
+ /*
+ * Builds up the dictionary from an InputStream.
+ */
private void init(InputStream in) throws IOException {
DictionaryEntryPersistor.create(in, entry -> {
@@ -137,15 +175,25 @@ public class DetokenizationDictionary {
});
}
+ /**
+ * @param token The input string for which a valid {@link Operation} is to
be found.
+ * @return The {@link Operation} that fits the given {@code token}.
+ */
DetokenizationDictionary.Operation getOperation(String token) {
return operationTable.get(token);
}
- // serialize method
+ /**
+ * Serializes the current state of a {@link DetokenizationDictionary} via an
+ * {@link OutputStream output stream}.
+ *
+ * @param out A valid, open {@link OutputStream} ready to be used for
serialization.
+ * @throws IOException Thrown if IO errors occurred during serialization.
+ */
public void serialize(OutputStream out) throws IOException {
- Iterator<Entry> entries = new Iterator<Entry>() {
+ Iterator<Entry> entries = new Iterator<>() {
- Iterator<String> iterator = operationTable.keySet().iterator();
+ final Iterator<String> iterator = operationTable.keySet().iterator();
public boolean hasNext() {
return iterator.hasNext();
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/Detokenizer.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/Detokenizer.java
index acb9f45f..49d2a17b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/Detokenizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/Detokenizer.java
@@ -18,8 +18,7 @@
package opennlp.tools.tokenize;
/**
- * A Detokenizer merges tokens back to their untokenized representation.
- *
+ * A {@link Detokenizer} merges tokens back to their detokenized
representation.
*/
public interface Detokenizer {
@@ -29,7 +28,7 @@ public interface Detokenizer {
*/
enum DetokenizationOperation {
/**
- * The current token should be attached to the begin token on the right
side.
+ * The current token should be attached to the start token on the right
side.
*/
MERGE_TO_RIGHT,
@@ -40,7 +39,7 @@ public interface Detokenizer {
/**
* The current token should be attached to the string on the left side, as
- * well as to the begin token on the right side.
+ * well as to the start token on the right side.
*/
MERGE_BOTH,
@@ -52,22 +51,23 @@ public interface Detokenizer {
}
/**
- * Detokenize the input tokens.
+ * Detokenizes the collection of tokens.
*
- * @param tokens the tokens to detokenize.
- * @return the merge operations to detokenize the input tokens.
+ * @param tokens The elements which should be detokenized.
+ * @return The {@link DetokenizationOperation merge operations} to handle
+ * given {@code tokens}.
*/
DetokenizationOperation[] detokenize(String[] tokens);
/**
- * Detokenize the input tokens into a String. Tokens which
- * are connected without a space inbetween can be separated by
- * a split marker.
+ * Detokenizes the input {@code tokens} into a String. Tokens which
+ * are connected without a {@code whitespace} character in
+ * between can be separated by a given {@code splitMarker}.
*
- * @param tokens the token which should be concatenated
- * @param splitMarker the split marker or null
+ * @param tokens The elements which should be concatenated.
+ * @param splitMarker The split marker or {@code null}.
*
- * @return the concatenated tokens
+ * @return The concatenated tokens as a single string.
*/
String detokenize(String[] tokens, String splitMarker);
}
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/DetokenizerEvaluator.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/DetokenizerEvaluator.java
index 7d9df4fc..1b98fdb1 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/DetokenizerEvaluator.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/DetokenizerEvaluator.java
@@ -27,28 +27,26 @@ import opennlp.tools.util.eval.FMeasure;
/**
* The {@link DetokenizerEvaluator} measures the performance of
* the given {@link Detokenizer} with the provided reference
- * {@link TokenSample}s.
+ * {@link TokenSample samples}.
*
- * @see DetokenizerEvaluator
* @see Detokenizer
* @see TokenSample
*/
public class DetokenizerEvaluator extends Evaluator<TokenSample> {
- private FMeasure fmeasure = new FMeasure();
+ private final FMeasure fmeasure = new FMeasure();
/**
- * The {@link Detokenizer} used to create the
- * predicted tokens.
+ * The {@link Detokenizer} used to create the predicted tokens.
*/
- private Detokenizer detokenizer;
+ private final Detokenizer detokenizer;
/**
* Initializes the current instance with the
* given {@link Detokenizer}.
*
- * @param detokenizer the {@link Detokenizer} to evaluate.
- * @param listeners evaluation sample listeners
+ * @param detokenizer The {@link Detokenizer} to evaluate.
+ * @param listeners The {@link DetokenEvaluationErrorListener evaluation
sample listeners}.
*/
public DetokenizerEvaluator(Detokenizer detokenizer,
DetokenEvaluationErrorListener... listeners) {
super(listeners);
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/DictionaryDetokenizer.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/DictionaryDetokenizer.java
index d53eefa1..e6cf3f6a 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/DictionaryDetokenizer.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/DictionaryDetokenizer.java
@@ -22,7 +22,7 @@ import java.util.Set;
/**
* A rule based detokenizer. Simple rules which indicate in which direction a
token should be
- * moved are looked up in a {@link DetokenizationDictionary} object.
+ * moved are looked up in a {@link DetokenizationDictionary dictionary}.
*
* @see Detokenizer
* @see DetokenizationDictionary
@@ -31,10 +31,16 @@ public class DictionaryDetokenizer implements Detokenizer {
private final DetokenizationDictionary dict;
+ /**
+ * Initializes a {@link DictionaryDetokenizer} instance.
+ *
+ * @param dict The {@link DetokenizationDictionary} to be used.
+ */
public DictionaryDetokenizer(DetokenizationDictionary dict) {
this.dict = dict;
}
+ @Override
public DetokenizationOperation[] detokenize(String[] tokens) {
DetokenizationOperation[] operations = new
DetokenizationOperation[tokens.length];
@@ -79,6 +85,7 @@ public class DictionaryDetokenizer implements Detokenizer {
return operations;
}
+ @Override
public String detokenize(String[] tokens, String splitMarker) {
DetokenizationOperation[] operations = detokenize(tokens);
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java
index b9b86c85..b2b1c173 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java
@@ -24,7 +24,11 @@ import opennlp.tools.util.Span;
import opennlp.tools.util.StringUtil;
/**
- * Performs tokenization using character classes.
+ * A basic {@link Tokenizer} implementation which performs tokenization
+ * using character classes.
+ * <p>
+ * To obtain an instance of this tokenizer use the static final
+ * {@link #INSTANCE} field.
*/
public class SimpleTokenizer extends AbstractTokenizer {
@@ -34,7 +38,7 @@ public class SimpleTokenizer extends AbstractTokenizer {
static final CharacterEnum NUMERIC = new CharacterEnum("numeric");
static final CharacterEnum OTHER = new CharacterEnum("other");
- private String name;
+ private final String name;
private CharacterEnum(String name) {
this.name = name;
@@ -45,21 +49,22 @@ public class SimpleTokenizer extends AbstractTokenizer {
return name;
}
}
-
- public static final SimpleTokenizer INSTANCE;
- static {
- INSTANCE = new SimpleTokenizer();
- }
+ /**
+ * Use this static reference to retrieve an instance of the
+ * {@link SimpleTokenizer}.
+ */
+ public static final SimpleTokenizer INSTANCE = new SimpleTokenizer();
/**
- * @deprecated Use INSTANCE field instead to obtain an instance, constructor
- * will be made private in the future.
+ * @deprecated Use {@link SimpleTokenizer#INSTANCE} field instead to obtain
an instance.
+ * This constructor will be made private in the future.
*/
- @Deprecated
+ @Deprecated // TODO Decide when this will be private (see deprecation note!)
public SimpleTokenizer() {
}
+ @Override
public Span[] tokenizePos(String s) {
CharacterEnum charType = CharacterEnum.WHITESPACE;
CharacterEnum state = charType;
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokSpanEventStream.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokSpanEventStream.java
index 75eb2414..3c04e3f2 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokSpanEventStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokSpanEventStream.java
@@ -30,27 +30,30 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;
/**
- * This class reads the {@link TokenSample}s from the given {@link Iterator}
- * and converts the {@link TokenSample}s into {@link Event}s which
+ * This class reads the {@link TokenSample samples} via an {@link Iterator}
+ * and converts the samples into {@link Event events} which
* can be used by the maxent library for training.
*/
public class TokSpanEventStream extends AbstractEventStream<TokenSample> {
- private TokenContextGenerator cg;
+ private final TokenContextGenerator cg;
- private boolean skipAlphaNumerics;
+ private final boolean skipAlphaNumerics;
private final Pattern alphaNumeric;
/**
- * Initializes the current instance.
+ * Initializes a new event stream based on the data stream using a {@link
TokenContextGenerator}.
*
- * @param tokenSamples
- * @param skipAlphaNumerics
- * @param cg
+ * @param tokenSamples The {@link ObjectStream data stream} for this event
stream.
+ * @param skipAlphaNumerics Whether alphanumerics are skipped, or not.
+ * @param alphaNumeric A custom alphanumeric {@link Pattern} or {@code null}.
+ * Default is: {@code "^[A-Za-z0-9]+$"}, provided by
+ * {@link Factory#DEFAULT_ALPHANUMERIC}.
+ * @param cg A {@link TokenContextGenerator} which should be used for the
event stream.
*/
- public TokSpanEventStream(ObjectStream<TokenSample> tokenSamples,
- boolean skipAlphaNumerics, Pattern alphaNumeric, TokenContextGenerator
cg) {
+ public TokSpanEventStream(ObjectStream<TokenSample> tokenSamples, boolean
skipAlphaNumerics,
+ Pattern alphaNumeric, TokenContextGenerator cg) {
super(tokenSamples);
this.alphaNumeric = alphaNumeric;
this.skipAlphaNumerics = skipAlphaNumerics;
@@ -58,26 +61,23 @@ public class TokSpanEventStream extends
AbstractEventStream<TokenSample> {
}
/**
- * Initializes the current instance.
+ * Initializes a new event stream based on the data stream using a {@link
TokenContextGenerator}.
*
- * @param tokenSamples
- * @param skipAlphaNumerics
- * @param cg
+ * @param tokenSamples The {@link ObjectStream data stream} for this event
stream.
+ * @param skipAlphaNumerics Whether alphanumerics are skipped, or not.
+ * @param cg A {@link TokenContextGenerator} which should be used for the
event stream.
*/
- public TokSpanEventStream(ObjectStream<TokenSample> tokenSamples,
- boolean skipAlphaNumerics, TokenContextGenerator cg) {
- super(tokenSamples);
- Factory factory = new Factory();
- this.alphaNumeric = factory.getAlphanumeric(null);
- this.skipAlphaNumerics = skipAlphaNumerics;
- this.cg = cg;
+ public TokSpanEventStream(ObjectStream<TokenSample> tokenSamples, boolean
skipAlphaNumerics,
+ TokenContextGenerator cg) {
+ this(tokenSamples, skipAlphaNumerics, new Factory().getAlphanumeric(null),
cg );
}
/**
- * Initializes the current instance.
+ * Initializes a new event stream based on the data stream using a {@link
TokenContextGenerator}
+ * that relies on a {@link DefaultTokenContextGenerator}.
*
- * @param tokenSamples
- * @param skipAlphaNumerics
+ * @param tokenSamples The {@link ObjectStream data stream} for this event
stream.
+ * @param skipAlphaNumerics Whether alphanumerics are skipped, or not.
*/
public TokSpanEventStream(ObjectStream<TokenSample> tokenSamples,
boolean skipAlphaNumerics) {
@@ -85,10 +85,10 @@ public class TokSpanEventStream extends
AbstractEventStream<TokenSample> {
}
/**
- * Adds training events to the event stream for each of the specified tokens.
+ * Adds training events to the event stream for the specified {@link
TokenSample sample}.
*
* @param tokenSample character offsets into the specified text.
- * @return The text of the tokens.
+ * @return An {@link Iterator} for text {@link Event events} representing
the {@code tokenSample}.
*/
@Override
protected Iterator<Event> createEvents(TokenSample tokenSample) {
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenContextGenerator.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenContextGenerator.java
index b15fd91c..475146bd 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenContextGenerator.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenContextGenerator.java
@@ -18,18 +18,15 @@
package opennlp.tools.tokenize;
/**
- * Interface for {@link TokenizerME} context generators.
+ * Interface for context generators required for {@link TokenizerME}.
*/
public interface TokenContextGenerator {
/**
- * Returns an array of features for the specified sentence string at the
specified index.
+ * @param sentence The string that represents a sentence.
+ * @param index The index to consider splitting tokens.
*
- * @param sentence The string for a sentence.
- * @param index The index to consider splitting as a token.
- *
- * @return an array of features for the specified sentence string at the
- * specified index.
+ * @return An array of features for a {@code sentence} at the specified
{@code index}.
*/
String[] getContext(String sentence, int index);
}
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenSample.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenSample.java
index b4e374ce..03f29470 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenSample.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenSample.java
@@ -42,10 +42,12 @@ public class TokenSample implements Serializable {
private final List<Span> tokenSpans;
/**
- * Initializes the current instance.
+ * Initializes a {@link TokenSample instance}.
*
- * @param text the text which contains the tokens.
- * @param tokenSpans the spans which mark the begin and end of the tokens.
+ * @param text The text which contains the tokens.
+ * Must not be {@code null}.
+ * @param tokenSpans The spans which mark the start and end of the tokens.
+ * Must not be {@code null}.
*/
public TokenSample(String text, Span[] tokenSpans) {
Objects.requireNonNull(tokenSpans, "tokenSpans must not be null");
@@ -62,13 +64,21 @@ public class TokenSample implements Serializable {
}
}
+ /**
+ * Initializes a {@link TokenSample instance} via a {@link Detokenizer}.
+ *
+ * @param detokenizer The {@link Detokenizer} used to merge the tokens. Must not be {@code
null}.
+ * @param tokens The tokens to be processed. Must not be {@code null}.
+ */
public TokenSample(Detokenizer detokenizer, String[] tokens) {
- StringBuilder sentence = new StringBuilder();
-
+ Objects.requireNonNull(detokenizer, "detokenizer must not be null");
+ Objects.requireNonNull(tokens, "tokens must not be null");
+
DetokenizationOperation[] operations = detokenizer.detokenize(tokens);
List<Span> mergedTokenSpans = new ArrayList<>();
+ StringBuilder sentence = new StringBuilder();
for (int i = 0; i < operations.length; i++) {
@@ -100,14 +110,14 @@ public class TokenSample implements Serializable {
}
/**
- * Retrieves the text.
+ * @return The text.
*/
public String getText() {
return text;
}
/**
- * Retrieves the token spans.
+ * @return The token {@link Span spans}.
*/
public Span[] getTokenSpans() {
return tokenSpans.toArray(new Span[tokenSpans.size()]);
@@ -157,6 +167,14 @@ public class TokenSample implements Serializable {
sample.append(" ");
}
+ /**
+ * Parses a string sample.
+ *
+ * @param sampleString The sample to be parsed. Must not be {@code null}.
+ * @param separatorChars The characters to be considered separators.
+ * See {@link #DEFAULT_SEPARATOR_CHARS}. Must not be
{@code null}.
+ * @return A valid {@link TokenSample} instance.
+ */
public static TokenSample parse(String sampleString, String separatorChars) {
Objects.requireNonNull(sampleString, "sampleString must not be null");
Objects.requireNonNull(separatorChars, "separatorChars must not be null");
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenSampleStream.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenSampleStream.java
index 0beddc3a..84a0a63a 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenSampleStream.java
@@ -24,31 +24,47 @@ import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.ObjectStream;
/**
- * This class is a stream filter which reads in string encoded samples and
creates
- * {@link TokenSample}s out of them. The input string sample is tokenized if a
- * whitespace or the special separator chars occur.
+ * This class is a {@link FilterObjectStream stream filter} which reads in
string encoded
+ * samples and creates {@link TokenSample samples} out of them.
+ * The input string sample is tokenized if a whitespace or the special
separator chars occur.
* <p>
* Sample:<br>
* "token1 token2 token3<SPLIT>token4"<br>
- * The tokens token1 and token2 are separated by a whitespace, token3 and
token3
- * are separated by the special character sequence, in this case the default
- * split sequence.
+ * The tokens {@code token1} and {@code token2} are separated by a whitespace,
+ * {@code token3} and {@code token4} are separated by the special character
sequence.
+ * In this case, the default split sequence applies.
* <p>
- * The sequence must be unique in the input string and is not escaped.
+ * Note: The sequence must be unique in the input string and is not escaped.
*/
public class TokenSampleStream extends FilterObjectStream<String, TokenSample>
{
private final String separatorChars;
- public TokenSampleStream(ObjectStream<String> sampleStrings, String
separatorChars) {
- super(Objects.requireNonNull(sampleStrings, "sampleStrings must not be
null"));
+ /**
+ * Initializes a {@link TokenSampleStream instance}.
+ *
+ * @param samples A plain text {@link ObjectStream line stream}.
+ * Must not be {@code null}.
+ * @param separatorChars The characters to be considered separators.
+ * See {@link TokenSample#DEFAULT_SEPARATOR_CHARS}.
+ * Must not be {@code null}.
+ */
+ public TokenSampleStream(ObjectStream<String> samples, String
separatorChars) {
+ super(Objects.requireNonNull(samples, "sampleStrings must not be null"));
this.separatorChars =
Objects.requireNonNull(separatorChars,"separatorChars must not be null");
}
+ /**
+ * Initializes a {@link TokenSampleStream instance}.
+ *
+ * @param sentences A plain text {@link ObjectStream line stream}.
+ * Must not be {@code null}.
+ */
public TokenSampleStream(ObjectStream<String> sentences) {
this(sentences, TokenSample.DEFAULT_SEPARATOR_CHARS);
}
+ @Override
public TokenSample read() throws IOException {
String sampleString = samples.read();
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/Tokenizer.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/Tokenizer.java
index 92b5e9b8..8a6bc37f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/Tokenizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/Tokenizer.java
@@ -23,17 +23,17 @@ import opennlp.tools.util.Span;
/**
* The interface for tokenizers, which segment a string into its tokens.
* <p>
- * Tokenization is a necessary step before more complex NLP tasks can be
applied,
- * these usually process text on a token level. The quality of tokenization is
+ * Tokenization is a necessary step before more complex NLP tasks can be
applied.
+ * These usually process text on a token level. The quality of tokenization is
* important because it influences the performance of high-level tasks applied
to it.
* <p>
- * In segmented languages like English most words are segmented by white spaces
+ * In segmented languages like English most words are segmented by whitespaces
* except for punctuations, etc. which is directly attached to the word
without a white space
* in between, it is not possible to just split at all punctuations because in
abbreviations dots
- * are a part of the token itself. A tokenizer is now responsible to split
these tokens
+ * are a part of the token itself. A {@link Tokenizer} is now responsible to
split those tokens
* correctly.
* <p>
- * In non-segmented languages like Chinese tokenization is more difficult
since words
+ * In non-segmented languages like Chinese, tokenization is more difficult
since words
* are not segmented by a whitespace.
* <p>
* Tokenizers can also be used to segment already identified tokens further
into more
@@ -41,16 +41,15 @@ import opennlp.tools.util.Span;
* to gain insight into tokens which do not represent words like numbers,
units or tokens
* which are part of a special notation.
* <p>
- * For most further task it is desirable to over tokenize rather than under
tokenize.
+ * For most subsequent NLP tasks, it is desirable to over-tokenize rather than
to under-tokenize.
*/
public interface Tokenizer {
/**
- * Splits a string into its atomic parts
+ * Splits a string into its atomic parts.
*
* @param s The string to be tokenized.
- * @return The String[] with the individual tokens as the array
- * elements.
+ * @return The String[] with the individual tokens as the array elements.
*/
String[] tokenize(String s);
@@ -58,8 +57,7 @@ public interface Tokenizer {
* Finds the boundaries of atomic parts in a string.
*
* @param s The string to be tokenized.
- * @return The Span[] with the spans (offsets into s) for each
- * token as the individuals array elements.
+ * @return The {@link Span spans (offsets into {@code s})} for each token as
the individual array elements.
*/
Span[] tokenizePos(String s);
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java
index 96d8d354..38351136 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java
@@ -24,14 +24,24 @@ import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.eval.CrossValidationPartitioner;
import opennlp.tools.util.eval.FMeasure;
+/**
+ * A cross validator for {@link Tokenizer tokenizers}.
+ */
public class TokenizerCrossValidator {
private final TrainingParameters params;
- private FMeasure fmeasure = new FMeasure();
- private TokenizerEvaluationMonitor[] listeners;
+ private final FMeasure fmeasure = new FMeasure();
+ private final TokenizerEvaluationMonitor[] listeners;
private final TokenizerFactory factory;
+ /**
+ * Creates a {@link TokenizerCrossValidator} using the given {@link
TokenizerFactory}.
+ *
+ * @param params The {@link TrainingParameters} for the context of cross
validation.
+ * @param factory The {@link TokenizerFactory} to be used.
+ * @param listeners The {@link TokenizerEvaluationMonitor evaluation
listeners}.
+ */
public TokenizerCrossValidator(TrainingParameters params,
TokenizerFactory factory, TokenizerEvaluationMonitor... listeners) {
this.params = params;
@@ -42,12 +52,10 @@ public class TokenizerCrossValidator {
/**
* Starts the evaluation.
*
- * @param samples
- * the data to train and test
- * @param nFolds
- * number of folds
+ * @param samples The {@link ObjectStream} of {@link TokenSample samples} to
train and test with.
+ * @param nFolds Number of folds. It must be greater than zero.
*
- * @throws IOException
+ * @throws IOException Thrown if IO errors occurred during evaluation.
*/
public void evaluate(ObjectStream<TokenSample> samples, int nFolds) throws
IOException {
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerEvaluationMonitor.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerEvaluationMonitor.java
index 905a139b..6c3872fb 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerEvaluationMonitor.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerEvaluationMonitor.java
@@ -19,7 +19,9 @@ package opennlp.tools.tokenize;
import opennlp.tools.util.eval.EvaluationMonitor;
-public interface TokenizerEvaluationMonitor extends
- EvaluationMonitor<TokenSample> {
+/**
+ * A marker interface for evaluating {@link Tokenizer tokenizers}.
+ */
+public interface TokenizerEvaluationMonitor extends
EvaluationMonitor<TokenSample> {
}
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerEvaluator.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerEvaluator.java
index fa4d35bd..65e722d5 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerEvaluator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerEvaluator.java
@@ -25,7 +25,7 @@ import opennlp.tools.util.eval.FMeasure;
/**
* The {@link TokenizerEvaluator} measures the performance of
* the given {@link Tokenizer} with the provided reference
- * {@link TokenSample}s.
+ * {@link TokenSample samples}.
*
* @see Evaluator
* @see Tokenizer
@@ -33,20 +33,18 @@ import opennlp.tools.util.eval.FMeasure;
*/
public class TokenizerEvaluator extends Evaluator<TokenSample> {
- private FMeasure fmeasure = new FMeasure();
+ private final FMeasure fmeasure = new FMeasure();
/**
- * The {@link Tokenizer} used to create the
- * predicted tokens.
+ * The {@link Tokenizer} used to create the predicted tokens.
*/
- private Tokenizer tokenizer;
+ private final Tokenizer tokenizer;
/**
- * Initializes the current instance with the
- * given {@link Tokenizer}.
+ * Initializes an instance to evaluate a {@link Tokenizer}.
*
- * @param tokenizer the {@link Tokenizer} to evaluate.
- * @param listeners evaluation sample listeners
+ * @param tokenizer The {@link Tokenizer} to evaluate.
+ * @param listeners The {@link TokenizerEvaluationMonitor evaluation
listeners}.
*/
public TokenizerEvaluator(Tokenizer tokenizer, TokenizerEvaluationMonitor
... listeners) {
super(listeners);
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java
index ba3d285f..ca75071c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java
@@ -29,7 +29,7 @@ import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.ext.ExtensionLoader;
/**
- * The factory that provides {@link Tokenizer} default implementations and
+ * The factory that provides {@link Tokenizer} default implementation and
* resources. Users can extend this class if their application requires
* overriding the {@link TokenContextGenerator}, {@link Dictionary} etc.
*/
@@ -45,33 +45,37 @@ public class TokenizerFactory extends BaseToolFactory {
private static final String ALPHA_NUMERIC_PATTERN = "alphaNumericPattern";
/**
- * Creates a {@link TokenizerFactory} that provides the default
implementation
+ * Instantiates a {@link TokenizerFactory} that provides the default
implementation
* of the resources.
*/
public TokenizerFactory() {
}
-
+
/**
- * Creates a {@link TokenizerFactory}. Use this constructor to
+ * Instantiates a {@link TokenizerFactory}. Use this constructor to
* programmatically create a factory.
*
- * @param languageCode
- * the language of the natural text
- * @param abbreviationDictionary
- * an abbreviations dictionary
- * @param useAlphaNumericOptimization
- * if true alpha numerics are skipped
- * @param alphaNumericPattern
- * null or a custom alphanumeric pattern (default is:
- * "^[A-Za-z0-9]+$", provided by {@link
Factory#DEFAULT_ALPHANUMERIC}
+ * @param languageCode The ISO language code to be used for this factory.
+ * @param abbreviationDictionary The {@link Dictionary} which holds
abbreviations.
+ * @param useAlphaNumericOptimization Whether alphanumerics are skipped, or
not.
+ * @param alphaNumericPattern {@code null} or a custom alphanumeric {@link
Pattern}
+ * (default is: {@code "^[A-Za-z0-9]+$"},
provided by
+ * {@link Factory#DEFAULT_ALPHANUMERIC}.
*/
- public TokenizerFactory(String languageCode,
- Dictionary abbreviationDictionary, boolean useAlphaNumericOptimization,
- Pattern alphaNumericPattern) {
+ public TokenizerFactory(String languageCode, Dictionary
abbreviationDictionary,
+ boolean useAlphaNumericOptimization, Pattern
alphaNumericPattern) {
this.init(languageCode, abbreviationDictionary,
useAlphaNumericOptimization, alphaNumericPattern);
}
+ /**
+ * @param languageCode The ISO language code to be used for this factory.
+ * @param abbreviationDictionary The {@link Dictionary} which holds
abbreviations.
+ * @param useAlphaNumericOptimization Whether alphanumerics are skipped, or
not.
+ * @param alphaNumericPattern {@code null} or a custom alphanumeric {@link
Pattern}
+ * (default is: {@code "^[A-Za-z0-9]+$"},
provided by
+ * {@link Factory#DEFAULT_ALPHANUMERIC}.
+ */
protected void init(String languageCode, Dictionary abbreviationDictionary,
boolean useAlphaNumericOptimization, Pattern alphaNumericPattern) {
this.languageCode = languageCode;
@@ -122,22 +126,24 @@ public class TokenizerFactory extends BaseToolFactory {
}
/**
- * Factory method the framework uses create a new {@link TokenizerFactory}.
+ * Factory method the framework uses to instantiate a new {@link
TokenizerFactory}.
*
- * @param subclassName the name of the class implementing the {@link
TokenizerFactory}
- * @param languageCode the language code the tokenizer should use
- * @param abbreviationDictionary an optional dictionary containing
abbreviations, or null if not present
- * @param useAlphaNumericOptimization indicate if the alpha numeric
optimization
- * should be enabled or disabled
- * @param alphaNumericPattern the pattern the alpha numeric optimization
should use
+ * @param subclassName The name of the class implementing the {@link
TokenizerFactory}.
+ * @param languageCode The ISO language code the {@link Tokenizer} should
use.
+ * @param abbreviationDictionary An optional {@link Dictionary} containing
abbreviations,
+ * or {@code null} if not present.
+ * @param useAlphaNumericOptimization Whether the alphanumeric optimization
is to be enabled or not.
+ * @param alphaNumericPattern The {@link Pattern} the alphanumeric
optimization should use,
+ * if enabled.
*
- * @return the instance of the Tokenizer Factory
+ * @return A valid {@link TokenizerFactory} instance.
*
- * @throws InvalidFormatException if once of the input parameters doesn't
comply if the expected format
+ * @throws InvalidFormatException Thrown if one of the input parameters
doesn't comply with the expected format.
*/
- public static TokenizerFactory create(String subclassName,
- String languageCode, Dictionary abbreviationDictionary,
- boolean useAlphaNumericOptimization, Pattern alphaNumericPattern)
+ public static TokenizerFactory create(String subclassName, String
languageCode,
+ Dictionary abbreviationDictionary,
+ boolean useAlphaNumericOptimization,
+ Pattern alphaNumericPattern)
throws InvalidFormatException {
if (subclassName == null) {
// will create the default factory
@@ -160,9 +166,7 @@ public class TokenizerFactory extends BaseToolFactory {
}
/**
- * Gets the alpha numeric pattern.
- *
- * @return the user specified alpha numeric pattern or a default.
+ * @return Retrieves the (user-)specified alphanumeric {@link Pattern} or a
default.
*/
public Pattern getAlphaNumericPattern() {
if (this.alphaNumericPattern == null) {
@@ -182,9 +186,7 @@ public class TokenizerFactory extends BaseToolFactory {
}
/**
- * Gets whether to use alphanumeric optimization.
- *
- * @return true if the alpha numeric optimization is enabled, otherwise false
+ * @return {@code true} if the alphanumeric optimization is enabled,
otherwise {@code false}.
*/
public boolean isUseAlphaNumericOptmization() {
if (artifactProvider != null) {
@@ -195,9 +197,7 @@ public class TokenizerFactory extends BaseToolFactory {
}
/**
- * Gets the abbreviation dictionary
- *
- * @return null or the abbreviation dictionary
+ * @return The abbreviation {@link Dictionary} or {@code null} if none is
active.
*/
public Dictionary getAbbreviationDictionary() {
if (this.abbreviationDictionary == null && artifactProvider != null) {
@@ -207,9 +207,7 @@ public class TokenizerFactory extends BaseToolFactory {
}
/**
- * Retrieves the language code.
- *
- * @return the language code
+ * @return Retrieves the ISO language code in use.
*/
public String getLanguageCode() {
if (this.languageCode == null && this.artifactProvider != null) {
@@ -219,9 +217,7 @@ public class TokenizerFactory extends BaseToolFactory {
}
/**
- * Gets the context generator
- *
- * @return a new instance of the context generator
+ * @return Retrieves a {@link TokenContextGenerator} instance.
*/
public TokenContextGenerator getContextGenerator() {
Factory f = new Factory();
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
index c64c2355..10086e9b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
@@ -38,22 +38,22 @@ import opennlp.tools.util.Span;
import opennlp.tools.util.TrainingParameters;
/**
- * A Tokenizer for converting raw text into separated tokens. It uses
- * Maximum Entropy to make its decisions. The features are loosely
+ * A {@link Tokenizer} for converting raw text into separated tokens. It uses
+ * Maximum Entropy to make its decisions. The features are loosely
* based off of Jeff Reynar's UPenn thesis "Topic Segmentation:
* Algorithms and Applications.", which is available from his
* homepage: <a
href="http://www.cis.upenn.edu/~jcreynar">http://www.cis.upenn.edu/~jcreynar</a>.
* <p>
- * This tokenizer needs a statistical model to tokenize a text which reproduces
+ * This implementation needs a statistical model to tokenize a text which
reproduces
* the tokenization observed in the training data used to create the model.
- * The {@link TokenizerModel} class encapsulates the model and provides
+ * The {@link TokenizerModel} class encapsulates that model and provides
* methods to create it from the binary representation.
* <p>
- * A tokenizer instance is not thread safe. For each thread one tokenizer
- * must be instantiated which can share one <code>TokenizerModel</code>
instance
+ * A tokenizer instance is not thread-safe. For each thread, one tokenizer
+ * must be instantiated which can share one {@link TokenizerModel} instance
* to save memory.
* <p>
- * To train a new model {{@link #train(ObjectStream, TokenizerFactory,
TrainingParameters)} method
+ * To train a new model, the {@link #train(ObjectStream, TokenizerFactory,
TrainingParameters)} method
* can be used.
* <p>
* Sample usage:
@@ -69,7 +69,8 @@ import opennlp.tools.util.TrainingParameters;
* <br>
* String tokens[] = tokenizer.tokenize("A sentence to be tokenized.");
* </code>
- *
+ * <p>
+ *
* @see Tokenizer
* @see TokenizerModel
* @see TokenSample
@@ -95,32 +96,31 @@ public class TokenizerME extends AbstractTokenizer {
private final Pattern alphanumeric;
- /**
+ /*
* The maximum entropy model to use to evaluate contexts.
*/
- private MaxentModel model;
+ private final MaxentModel model;
- /**
+ /*
* The context generator.
*/
private final TokenContextGenerator cg;
- /**
- * Optimization flag to skip alpha numeric tokens for further
- * tokenization
+ /*
+ * Optimization flag to skip alphanumeric tokens for further tokenization
*/
- private boolean useAlphaNumericOptimization;
+ private final boolean useAlphaNumericOptimization;
- /**
+ /*
* List of probabilities for each token returned from a call to
* <code>tokenize</code> or <code>tokenizePos</code>.
*/
- private List<Double> tokProbs;
+ private final List<Double> tokProbs;
- private List<Span> newTokens;
+ private final List<Span> newTokens;
/**
- * Initializes the tokenizer by downloading a default model.
+ * Initializes a {@link TokenizerME} by downloading a default model.
* @param language The language of the tokenizer.
* @throws IOException Thrown if the model cannot be downloaded or saved.
*/
@@ -129,6 +129,11 @@ public class TokenizerME extends AbstractTokenizer {
TokenizerModel.class));
}
+ /**
+ * Instantiates a {@link TokenizerME} with an existing {@link
TokenizerModel}.
+ *
+ * @param model The {@link TokenizerModel} to be used.
+ */
public TokenizerME(TokenizerModel model) {
TokenizerFactory factory = model.getFactory();
this.alphanumeric = factory.getAlphaNumericPattern();
@@ -144,6 +149,7 @@ public class TokenizerME extends AbstractTokenizer {
* @deprecated use {@link TokenizerFactory} to extend the Tokenizer
* functionality
*/
+ @Deprecated
public TokenizerME(TokenizerModel model, Factory factory) {
String languageCode = model.getLanguage();
@@ -166,11 +172,9 @@ public class TokenizerME extends AbstractTokenizer {
}
/**
- * Returns the probabilities associated with the most recent
- * calls to {@link TokenizerME#tokenize(String)} or {@link
TokenizerME#tokenizePos(String)}.
- *
- * @return probability for each token returned for the most recent
- * call to tokenize. If not applicable an empty array is returned.
+ * @return the probabilities associated with the most recent calls to
+ * {@link TokenizerME#tokenize(String)} or {@link
TokenizerME#tokenizePos(String)}.
+ * If not applicable an empty array is returned.
*/
public double[] getTokenProbabilities() {
double[] tokProbArray = new double[tokProbs.size()];
@@ -185,7 +189,7 @@ public class TokenizerME extends AbstractTokenizer {
*
* @param d The string to be tokenized.
*
- * @return A span array containing individual tokens as elements.
+ * @return A {@link Span} array containing individual tokens as elements.
*/
public Span[] tokenizePos(String d) {
WhitespaceTokenizer whitespaceTokenizer = WhitespaceTokenizer.INSTANCE;
@@ -232,18 +236,12 @@ public class TokenizerME extends AbstractTokenizer {
/**
* Trains a model for the {@link TokenizerME}.
*
- * @param samples
- * the samples used for the training.
- * @param factory
- * a {@link TokenizerFactory} to get resources from
- * @param mlParams
- * the machine learning train parameters
- * @return the trained {@link TokenizerModel}
- * @throws IOException
- * it throws an {@link IOException} if an {@link IOException} is
- * thrown during IO operations on a temp file which is created
- * during training. Or if reading from the {@link ObjectStream}
- * fails.
+ * @param samples The samples used for the training.
+ * @param factory A {@link TokenizerFactory} to get resources from.
+ * @param mlParams The machine learning {@link TrainingParameters train
parameters}.
+ * @return A trained {@link TokenizerModel}.
+ * @throws IOException Thrown during IO operations on a temp file which is
created
+ * during training. Or if reading from the {@link ObjectStream}
fails.
*/
public static TokenizerModel train(ObjectStream<TokenSample> samples,
TokenizerFactory factory,
TrainingParameters mlParams) throws IOException {
@@ -263,9 +261,7 @@ public class TokenizerME extends AbstractTokenizer {
}
/**
- * Returns the value of the alpha-numeric optimization flag.
- *
- * @return true if the tokenizer should use alpha-numeric optimization,
false otherwise.
+ * @return {@code true} if the tokenizer uses alphanumeric optimization,
{@code false} otherwise.
*/
public boolean useAlphaNumericOptimization() {
return useAlphaNumericOptimization;
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
index 8e732991..b2d5003f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
@@ -38,6 +38,7 @@ import opennlp.tools.util.model.ModelUtil;
* by a learnable {@link Tokenizer}.
*
* @see TokenizerME
+ * @see TokenizerFactory
*/
public final class TokenizerModel extends BaseModel {
@@ -46,11 +47,11 @@ public final class TokenizerModel extends BaseModel {
private static final String TOKENIZER_MODEL_ENTRY = "token.model";
/**
- * Initializes the current instance.
+ * Initializes a {@link TokenizerModel} instance via a {@link MaxentModel}
and related resources.
*
- * @param tokenizerModel the model
- * @param manifestInfoEntries the manifest
- * @param tokenizerFactory the factory
+ * @param tokenizerModel The {@link MaxentModel model} to be used.
+ * @param manifestInfoEntries Additional information kept in the manifest.
+ * @param tokenizerFactory The {@link TokenizerFactory} to be used
internally.
*/
public TokenizerModel(MaxentModel tokenizerModel,
Map<String, String> manifestInfoEntries, TokenizerFactory
tokenizerFactory) {
@@ -60,48 +61,54 @@ public final class TokenizerModel extends BaseModel {
}
/**
- * Initializes the current instance.
+ * Initializes a {@link TokenizerModel} instance via a valid {@link
InputStream}.
*
- * @param in the Input Stream to load the model from
+ * @param in The {@link InputStream} used for loading the model.
*
- * @throws IOException if reading from the stream fails in anyway
- * @throws InvalidFormatException if the stream doesn't have the expected
format
+ * @throws IOException Thrown if IO errors occurred during initialization.
*/
public TokenizerModel(InputStream in) throws IOException {
super(COMPONENT_NAME, in);
}
/**
- * Initializes the current instance.
+ * Initializes a {@link TokenizerModel} instance via a valid {@link File}.
*
- * @param modelFile the file containing the tokenizer model
+ * @param modelFile The {@link File} used for loading the model.
*
- * @throws IOException if reading from the stream fails in anyway
+ * @throws IOException Thrown if IO errors occurred during initialization.
*/
public TokenizerModel(File modelFile) throws IOException {
super(COMPONENT_NAME, modelFile);
}
+ /**
+ * Initializes a {@link TokenizerModel} instance via a valid {@link Path}.
+ *
+ * @param modelPath The {@link Path} used for loading the model.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ */
public TokenizerModel(Path modelPath) throws IOException {
this(modelPath.toFile());
}
/**
- * Initializes the current instance.
+ * Initializes a {@link TokenizerModel} instance via a valid {@link URL}.
*
- * @param modelURL the URL pointing to the tokenizer model
+ * @param modelURL The {@link URL} used for loading the model.
*
- * @throws IOException if reading from the stream fails in anyway
+ * @throws IOException Thrown if IO errors occurred during initialization.
*/
public TokenizerModel(URL modelURL) throws IOException {
super(COMPONENT_NAME, modelURL);
}
/**
- * Checks if the tokenizer model has the right outcomes.
+ * Checks if the {@link TokenizerModel} has the right outcomes.
*
- * @param model
- * @return
+ * @param model The {@link MaxentModel} to be checked.
+ * @return {@code true} if the model could be validated, {@code false}
otherwise.
*/
private static boolean isModelCompatible(MaxentModel model) {
return ModelUtil.validateOutcomes(model, TokenizerME.SPLIT,
TokenizerME.NO_SPLIT);
@@ -120,6 +127,9 @@ public final class TokenizerModel extends BaseModel {
}
}
+ /**
+ * @return Retrieves the active {@link TokenizerFactory}.
+ */
public TokenizerFactory getFactory() {
return (TokenizerFactory) this.toolFactory;
}
@@ -129,10 +139,16 @@ public final class TokenizerModel extends BaseModel {
return TokenizerFactory.class;
}
+ /**
+ * @return Retrieves the model as {@link MaxentModel} instance.
+ */
public MaxentModel getMaxentModel() {
return (MaxentModel) artifactMap.get(TOKENIZER_MODEL_ENTRY);
}
+ /**
+ * @return Retrieves the active abbreviation {@link Dictionary}.
+ */
public Dictionary getAbbreviations() {
if (getFactory() != null) {
return getFactory().getAbbreviationDictionary();
@@ -140,6 +156,9 @@ public final class TokenizerModel extends BaseModel {
return null;
}
+ /**
+ * @return {@code true} if alphanumeric optimization is active, {@code
false} otherwise.
+ */
public boolean useAlphaNumericOptimization() {
return getFactory() != null && getFactory().isUseAlphaNumericOptmization();
}
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerStream.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerStream.java
index bfb87c56..c8296773 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerStream.java
@@ -24,19 +24,26 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;
/**
- * The {@link TokenizerStream} uses a tokenizer to tokenize the
- * input string and output {@link TokenSample}s.
+ * The {@link TokenizerStream} uses a {@link Tokenizer} to tokenize the
+ * input string and output {@link TokenSample samples}.
*/
public class TokenizerStream implements ObjectStream<TokenSample> {
- private Tokenizer tokenizer;
- private ObjectStream<String> input;
+ private final Tokenizer tokenizer;
+ private final ObjectStream<String> input;
+ /**
+ * Initializes a {@link TokenizerStream instance}.
+ *
+ * @param tokenizer A working {@link Tokenizer} instance.
+ * @param input A plain text {@link ObjectStream line stream}.
+ */
public TokenizerStream(Tokenizer tokenizer, ObjectStream<String> input) {
this.tokenizer = tokenizer;
this.input = input;
}
+ @Override
public TokenSample read() throws IOException {
String inputString = input.read();
@@ -49,10 +56,12 @@ public class TokenizerStream implements
ObjectStream<TokenSample> {
return null;
}
+ @Override
public void close() throws IOException {
input.close();
}
+ @Override
public void reset() throws IOException,
UnsupportedOperationException {
input.reset();
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenStream.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenStream.java
index 4f6694f9..24ddef48 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenStream.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenStream.java
@@ -24,15 +24,21 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;
/**
- * This stream formats a {@link TokenSample}s into whitespace
+ * This stream formats {@link ObjectStream} of {@link TokenSample samples}
into whitespace
* separated token strings.
*/
public class WhitespaceTokenStream extends FilterObjectStream<TokenSample,
String> {
+ /**
+ * Initializes a {@link WhitespaceTokenStream}.
+ *
+ * @param tokens The {@link ObjectStream stream} of tokens to be separated.
+ */
public WhitespaceTokenStream(ObjectStream<TokenSample> tokens) {
super(tokens);
}
+ @Override
public String read() throws IOException {
TokenSample tokenSample = samples.read();
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenizer.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenizer.java
index f918bceb..7209a754 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenizer.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenizer.java
@@ -24,10 +24,11 @@ import opennlp.tools.util.Span;
import opennlp.tools.util.StringUtil;
/**
- * This tokenizer uses white spaces to tokenize the input text.
- *
+ * A basic {@link Tokenizer} implementation which performs tokenization
+ * using white spaces.
+ * <p>
* To obtain an instance of this tokenizer use the static final
- * <code>INSTANCE</code> field.
+ * {@link #INSTANCE} field.
*/
public class WhitespaceTokenizer extends AbstractTokenizer {
@@ -37,18 +38,19 @@ public class WhitespaceTokenizer extends AbstractTokenizer {
*/
public static final WhitespaceTokenizer INSTANCE = new WhitespaceTokenizer();
- /**
+ /*
* Use the {@link WhitespaceTokenizer#INSTANCE} field to retrieve an
instance.
*/
private WhitespaceTokenizer() {
}
+ @Override
public Span[] tokenizePos(String d) {
int tokStart = -1;
List<Span> tokens = new ArrayList<>();
boolean inTok = false;
- //gather up potential tokens
+ // gather potential tokens
int end = d.length();
for (int i = 0; i < end; i++) {
if (StringUtil.isWhitespace(d.charAt(i))) {
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/WordpieceTokenizer.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/WordpieceTokenizer.java
index bbd31458..79752a48 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/WordpieceTokenizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/WordpieceTokenizer.java
@@ -24,13 +24,23 @@ import java.util.Set;
import opennlp.tools.util.Span;
/**
- * A WordPiece tokenizer.
- *
- * Adapted from https://github.com/robrua/easy-bert under the MIT license.
- *
+ * A {@link Tokenizer} implementation which performs tokenization
+ * using word pieces.
+ * <p>
+ * Adapted under MIT license from
+ * <a
href="https://github.com/robrua/easy-bert">https://github.com/robrua/easy-bert</a>.
+ * <p>
* For reference see:
- * -
https://www.tensorflow.org/text/guide/subwords_tokenizer#applying_wordpiece
- * -
https://cran.r-project.org/web/packages/wordpiece/vignettes/basic_usage.html
+ * <ul>
+ * <li>
+ * <a
href="https://www.tensorflow.org/text/guide/subwords_tokenizer#applying_wordpiece">
+ *
https://www.tensorflow.org/text/guide/subwords_tokenizer#applying_wordpiece</a>
+ * </li>
+ * <li>
+ * <a
href="https://cran.r-project.org/web/packages/wordpiece/vignettes/basic_usage.html">
+ *
https://cran.r-project.org/web/packages/wordpiece/vignettes/basic_usage.html</a>
+ * </li>
+ * </ul>
*/
public class WordpieceTokenizer implements Tokenizer {
@@ -38,15 +48,28 @@ public class WordpieceTokenizer implements Tokenizer {
private static final String SEPARATOR_TOKEN = "[SEP]";
private static final String UNKNOWN_TOKEN = "[UNK]";
- private Set<String> vocabulary;
+ private final Set<String> vocabulary;
private int maxTokenLength = 50;
+ /**
+ * Initializes a {@link WordpieceTokenizer} with a {@code vocabulary} and a
default
+ * {@code maxTokenLength} of {@code 50}.
+ *
+ * @param vocabulary A set of tokens considered the vocabulary.
+ */
public WordpieceTokenizer(Set<String> vocabulary) {
this.vocabulary = vocabulary;
}
+ /**
+ * Initializes a {@link WordpieceTokenizer} with a {@code vocabulary} and a
custom
+ * {@code maxTokenLength}.
+ *
+ * @param vocabulary A set of tokens considered the vocabulary.
+ * @param maxTokenLength A non-negative number that is used as maximum token
length.
+ */
public WordpieceTokenizer(Set<String> vocabulary, int maxTokenLength) {
- this.vocabulary = vocabulary;
+ this(vocabulary);
this.maxTokenLength = maxTokenLength;
}
@@ -145,6 +168,9 @@ public class WordpieceTokenizer implements Tokenizer {
}
+ /**
+ * @return The maximum token length.
+ */
public int getMaxTokenLength() {
return maxTokenLength;
}
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/package-info.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/package-info.java
index 43650605..0b45b1fc 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/package-info.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/package-info.java
@@ -17,8 +17,8 @@
/**
* Contains classes related to finding tokens or words in a string. All
- * tokenizer implement the Tokenizer interface. Currently there is the
- * learnable <code>TokenizerME</code>, the <code>WhitespaceTokenizer</code> and
- * the <code>SimpleTokenizer</code> which is a character class tokenizer.
+ * tokenizers implement the Tokenizer interface. Currently, there is the
+ * learnable {@code TokenizerME}, the {@code WhitespaceTokenizer} and
+ * the {@code SimpleTokenizer} which is a character class tokenizer.
*/
package opennlp.tools.tokenize;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/eval/Evaluator.java
b/opennlp-tools/src/main/java/opennlp/tools/util/eval/Evaluator.java
index ee8d49ab..d00cf7dc 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/eval/Evaluator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/eval/Evaluator.java
@@ -51,13 +51,13 @@ public abstract class Evaluator<T> {
}
/**
- * Evaluates the given reference sample object.
+ * Evaluates the given reference {@code T} sample object.
*
* The implementation has to update the score after every invocation.
*
* @param reference the reference sample.
*
- * @return the predicted sample
+ * @return the predicted {@code T} sample
*/
protected abstract T processSample(T reference);