[opennlp] branch master updated: OPENNLP-1405 Enhance JavaDoc in opennlp.tools.tokenize package (#448)

jzemerick Sat, 10 Dec 2022 06:22:03 -0800

This is an automated email from the ASF dual-hosted git repository.

jzemerick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git



The following commit(s) were added to refs/heads/master by this push:
     new 74ece59f OPENNLP-1405 Enhance JavaDoc in opennlp.tools.tokenize 
package (#448)
74ece59f is described below

commit 74ece59fac2bb70f63a294a8bcce047e7eae9cb3
Author: Martin Wiesner <[email protected]>
AuthorDate: Sat Dec 10 15:21:51 2022 +0100

    OPENNLP-1405 Enhance JavaDoc in opennlp.tools.tokenize package (#448)
    
    - adds missing JavaDoc
    - improves existing documentation for clarity
    - removes superfluous text
    - adds 'final' modifier where useful and applicable
    - adds 'Override' annotation where useful and applicable
    - fixes some typos
---
 .../lemmatizer/LemmatizerEvaluationMonitor.java    |  4 +-
 .../tools/lemmatizer/LemmatizerFactory.java        | 16 +++-
 .../opennlp/tools/tokenize/AbstractTokenizer.java  |  6 ++
 .../tokenize/DefaultTokenContextGenerator.java     | 34 ++++----
 .../tools/tokenize/DetokenizationDictionary.java   | 92 ++++++++++++++++------
 .../java/opennlp/tools/tokenize/Detokenizer.java   | 26 +++---
 .../tools/tokenize/DetokenizerEvaluator.java       | 14 ++--
 .../tools/tokenize/DictionaryDetokenizer.java      |  9 ++-
 .../opennlp/tools/tokenize/SimpleTokenizer.java    | 25 +++---
 .../opennlp/tools/tokenize/TokSpanEventStream.java | 52 ++++++------
 .../tools/tokenize/TokenContextGenerator.java      | 11 +--
 .../java/opennlp/tools/tokenize/TokenSample.java   | 32 ++++++--
 .../opennlp/tools/tokenize/TokenSampleStream.java  | 34 +++++---
 .../java/opennlp/tools/tokenize/Tokenizer.java     | 20 +++--
 .../tools/tokenize/TokenizerCrossValidator.java    | 22 ++++--
 .../tools/tokenize/TokenizerEvaluationMonitor.java |  6 +-
 .../opennlp/tools/tokenize/TokenizerEvaluator.java | 16 ++--
 .../opennlp/tools/tokenize/TokenizerFactory.java   | 82 +++++++++----------
 .../java/opennlp/tools/tokenize/TokenizerME.java   | 76 +++++++++---------
 .../opennlp/tools/tokenize/TokenizerModel.java     | 53 +++++++++----
 .../opennlp/tools/tokenize/TokenizerStream.java    | 17 +++-
 .../tools/tokenize/WhitespaceTokenStream.java      |  8 +-
 .../tools/tokenize/WhitespaceTokenizer.java        | 12 +--
 .../opennlp/tools/tokenize/WordpieceTokenizer.java | 42 ++++++++--
 .../java/opennlp/tools/tokenize/package-info.java  |  6 +-
 .../java/opennlp/tools/util/eval/Evaluator.java    |  4 +-
 26 files changed, 446 insertions(+), 273 deletions(-)

diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java
 
b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java
index 55993406..78c3fcba 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java
@@ -20,9 +20,7 @@ package opennlp.tools.lemmatizer;
 import opennlp.tools.util.eval.EvaluationMonitor;
 
 /**
- * Interface for the lemmatizer evaluator.
- * @version 2016-02-18
- *
+ * A marker interface for evaluating {@link Lemmatizer lemmatizers}.
  */
 public interface LemmatizerEvaluationMonitor extends 
EvaluationMonitor<LemmaSample> {
 
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java 
b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java
index 0effba27..1d804a85 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java
@@ -22,15 +22,29 @@ import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.SequenceValidator;
 import opennlp.tools.util.ext.ExtensionLoader;
 
+/**
+ * The factory that provides the default {@link Lemmatizer} implementation and
+ * resources.
+ */
 public class LemmatizerFactory extends BaseToolFactory {
 
   /**
-   * Creates a {@link LemmatizerFactory} that provides the default 
implementation
+   * Instantiates a {@link LemmatizerFactory} that provides the default 
implementation
    * of the resources.
    */
   public LemmatizerFactory() {
   }
 
+  /**
+   * Instantiates a {@link LemmatizerFactory} via a given {@code subclassName}.
+   *
+   * @param subclassName The class name used for instantiation. If {@code 
null}, an
+   *                     instance of {@link LemmatizerFactory} will be returned
+   *                     per default. Otherwise, the {@link ExtensionLoader} 
mechanism
+   *                     is applied to load the requested {@code subclassName}.
+   *
+   * @return A valid {@link LemmatizerFactory} instance.
+   */
   public static LemmatizerFactory create(String subclassName)
       throws InvalidFormatException {
     if (subclassName == null) {
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/AbstractTokenizer.java 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/AbstractTokenizer.java
index 2dc3754c..6cd17f49 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/AbstractTokenizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/AbstractTokenizer.java
@@ -23,10 +23,16 @@ abstract class AbstractTokenizer implements Tokenizer {
 
   protected boolean keepNewLines = false;
 
+  @Override
   public String[] tokenize(String s) {
     return Span.spansToStrings(tokenizePos(s), s);
   }
 
+  /**
+   * Switches whether to keep new lines or not.
+   *
+   * @param keepNewLines {@code true} if new lines are kept, {@code false} 
otherwise.
+   */
   public void setKeepNewLines(boolean keepNewLines) {
     this.keepNewLines = keepNewLines;
   }
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/DefaultTokenContextGenerator.java
 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/DefaultTokenContextGenerator.java
index 552cc691..f334e287 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/DefaultTokenContextGenerator.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/DefaultTokenContextGenerator.java
@@ -25,31 +25,31 @@ import java.util.Set;
 import opennlp.tools.util.StringUtil;
 
 /**
- * Generate events for maxent decisions for tokenization.
+ * A default {@link TokenContextGenerator} which produces events for maxent 
decisions
+ * for tokenization.
  */
 public class DefaultTokenContextGenerator implements TokenContextGenerator {
 
   protected final Set<String> inducedAbbreviations;
 
   /**
-   * Creates a default context generator for tokenizer.
+   * Initializes a plain {@link DefaultTokenContextGenerator} instance.
    */
   public DefaultTokenContextGenerator() {
     this(Collections.emptySet());
   }
 
   /**
-   * Creates a default context generator for tokenizer.
+   * Initializes a customized {@link DefaultTokenContextGenerator} instance via
+   * a set of {@code inducedAbbreviations}.
    *
-   * @param inducedAbbreviations the induced abbreviations
+   * @param inducedAbbreviations The induced abbreviations to be used for this 
instance.
    */
   public DefaultTokenContextGenerator(Set<String> inducedAbbreviations) {
     this.inducedAbbreviations = inducedAbbreviations;
   }
 
-  /* (non-Javadoc)
-   * @see 
opennlp.tools.tokenize.TokenContextGenerator#getContext(java.lang.String, int)
-   */
+  @Override
   public String[] getContext(String sentence, int index) {
     List<String> preds = createContext(sentence, index);
     String[] context = new String[preds.size()];
@@ -58,15 +58,15 @@ public class DefaultTokenContextGenerator implements 
TokenContextGenerator {
   }
 
   /**
-   * Returns an {@link ArrayList} of features for the specified sentence string
-   * at the specified index. Extensions of this class can override this method
-   * to create a customized {@link TokenContextGenerator}
+   * Computes a {@link List} of features for the specified {@code sentence}
+   * at the specified {@code index}. Extensions of {@link 
DefaultTokenContextGenerator}
+   * can override this method to create a customized behaviour.
    *
    * @param sentence
-   *          the token been analyzed
+   *          The sentence to create features for.
    * @param index
-   *          the index of the character been analyzed
-   * @return an {@link ArrayList} of features for the specified sentence string
+   *          The positional index. Must be a non-negative number.
+   * @return A {@link List} of features for the specified {@code sentence} 
string
    *         at the specified index.
    */
   protected List<String> createContext(String sentence, int index) {
@@ -110,7 +110,13 @@ public class DefaultTokenContextGenerator implements 
TokenContextGenerator {
 
 
   /**
-   * Helper function for getContext.
+   * Helper function for {@link #createContext} that appends to a given {@code 
key}
+   * a fixed text sequence depending on {@code c}. The resulting combination 
is added
+   * to the given list {@code preds}.
+   *
+   * @param key The input string to process.
+   * @param c   A character used to discriminate which fixed text shall be 
appended.
+   * @param preds The list into which the resulting combinations will be added.
    */
   protected void addCharPreds(String key, char c, List<String> preds) {
     preds.add(key + "=" + c);
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/DetokenizationDictionary.java
 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/DetokenizationDictionary.java
index 55eca1c8..d3b0a495 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/DetokenizationDictionary.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/DetokenizationDictionary.java
@@ -23,6 +23,7 @@ import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
+import java.nio.file.Path;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
@@ -53,27 +54,36 @@ public class DetokenizationDictionary {
     MOVE_BOTH,
 
     /**
-     * Attaches the token token to the right token on first occurrence, and
+     * Attaches the token to the right token on first occurrence, and
      * to the token on the left side on the second occurrence.
      */
     RIGHT_LEFT_MATCHING;
 
+    /**
+     * @param operation The string representation for which an {@link 
Operation}
+     *                  instance is to be found.
+     * @return The {@link Operation enum} instance that matches the given 
{@code operation},
+     *         or {@code null} if the input has no equivalent.
+     */
     public static Operation parse(String operation) {
-
-      if (MOVE_RIGHT.toString().equals(operation)) {
-        return MOVE_RIGHT;
-      }
-      else if (MOVE_LEFT.toString().equals(operation)) {
-        return MOVE_LEFT;
-      }
-      else if (MOVE_BOTH.toString().equals(operation)) {
-        return MOVE_BOTH;
-      }
-      else if (RIGHT_LEFT_MATCHING.toString().equals(operation)) {
-        return RIGHT_LEFT_MATCHING;
-      }
-      else {
+      if (operation == null) {
         return null;
+      } else {
+        if (MOVE_RIGHT.toString().equals(operation)) {
+          return MOVE_RIGHT;
+        }
+        else if (MOVE_LEFT.toString().equals(operation)) {
+          return MOVE_LEFT;
+        }
+        else if (MOVE_BOTH.toString().equals(operation)) {
+          return MOVE_BOTH;
+        }
+        else if (RIGHT_LEFT_MATCHING.toString().equals(operation)) {
+          return RIGHT_LEFT_MATCHING;
+        }
+        else {
+          return null;
+        }
       }
     }
   }
@@ -81,11 +91,11 @@ public class DetokenizationDictionary {
   private final Map<String, DetokenizationDictionary.Operation> operationTable 
= new HashMap<>();
 
   /**
-   * Initializes the current instance.
+   * Initializes a {@link DetokenizationDictionary} instance.
    *
-   * @param tokens an array of tokens that should be detokenized according to 
an operation
-   * @param operations an array of operations which specifies which operation
-   *        should be used for the provided tokens
+   * @param tokens An array of tokens that should be de-tokenized according to 
{@code operations}.
+   * @param operations An array of operations which specifies which operation
+   *        should be used for the provided {@code tokens}.
    */
   public DetokenizationDictionary(String[] tokens,
       DetokenizationDictionary.Operation[] operations) {
@@ -107,16 +117,44 @@ public class DetokenizationDictionary {
     }
   }
 
+  /**
+   * Initializes a {@link DetokenizationDictionary} instance via a valid 
{@link InputStream}.
+   *
+   * @param in The {@link InputStream} used for loading the dictionary.
+   *
+   * @throws IOException Thrown if IO errors occurred during initialization.
+   */
   public DetokenizationDictionary(InputStream in) throws IOException {
     init(in);
   }
 
+  /**
+   * Initializes a {@link DetokenizationDictionary} instance via a valid 
{@link File}.
+   *
+   * @param file The {@link File} used for loading the dictionary.
+   *
+   * @throws IOException Thrown if IO errors occurred during initialization.
+   */
   public DetokenizationDictionary(File file) throws IOException {
     try (InputStream in = new BufferedInputStream(new FileInputStream(file))) {
       init(in);
     }
   }
 
+  /**
+   * Initializes a {@link DetokenizationDictionary} instance via a valid 
{@link Path}.
+   *
+   * @param path The {@link Path} used for loading the dictionary.
+   *
+   * @throws IOException Thrown if IO errors occurred during initialization.
+   */
+  public DetokenizationDictionary(Path path) throws IOException {
+    this(path.toFile());
+  }
+
+  /*
+   * Builds up the dictionary from an InputStream.
+   */
   private void init(InputStream in) throws IOException {
     DictionaryEntryPersistor.create(in, entry -> {
 
@@ -137,15 +175,25 @@ public class DetokenizationDictionary {
     });
   }
 
+  /**
+   * @param token The input string for which a valid {@link Operation} is to 
be found.
+   * @return The {@link Operation} that fits the given {@code token}.
+   */
   DetokenizationDictionary.Operation getOperation(String token) {
     return operationTable.get(token);
   }
 
-  // serialize method
+  /**
+   * Serializes the current state of a {@link DetokenizationDictionary} via an
+   * {@link OutputStream output stream}.
+   *
+   * @param out A valid, open {@link OutputStream} ready to be used for 
serialization.
+   * @throws IOException  Thrown if IO errors occurred during serialization.
+   */
   public void serialize(OutputStream out) throws IOException {
-    Iterator<Entry> entries = new Iterator<Entry>() {
+    Iterator<Entry> entries = new Iterator<>() {
 
-      Iterator<String> iterator = operationTable.keySet().iterator();
+      final Iterator<String> iterator = operationTable.keySet().iterator();
 
       public boolean hasNext() {
         return iterator.hasNext();
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/Detokenizer.java 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/Detokenizer.java
index acb9f45f..49d2a17b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/Detokenizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/Detokenizer.java
@@ -18,8 +18,7 @@
 package opennlp.tools.tokenize;
 
 /**
- * A Detokenizer merges tokens back to their untokenized representation.
- *
+ * A {@link Detokenizer} merges tokens back to their detokenized 
representation.
  */
 public interface Detokenizer {
 
@@ -29,7 +28,7 @@ public interface Detokenizer {
    */
   enum DetokenizationOperation {
     /**
-     * The current token should be attached to the begin token on the right 
side.
+     * The current token should be attached to the start token on the right 
side.
      */
     MERGE_TO_RIGHT,
 
@@ -40,7 +39,7 @@ public interface Detokenizer {
 
     /**
      * The current token should be attached to the string on the left side, as
-     * well as to the begin token on the right side.
+     * well as to the start token on the right side.
      */
     MERGE_BOTH,
 
@@ -52,22 +51,23 @@ public interface Detokenizer {
   }
 
   /**
-   * Detokenize the input tokens.
+   * Detokenizes the collection of tokens.
    *
-   * @param tokens the tokens to detokenize.
-   * @return the merge operations to detokenize the input tokens.
+   * @param tokens The elements which should be detokenized.
+   * @return The {@link DetokenizationOperation merge operations} to handle
+   *         given {@code tokens}.
    */
   DetokenizationOperation[] detokenize(String[] tokens);
 
   /**
-   * Detokenize the input tokens into a String. Tokens which
-   * are connected without a space inbetween can be separated by
-   * a split marker.
+   * Detokenizes the input {@code tokens} into a String. Tokens which
+   * are connected without a {@code whitespace} character in
+   * between can be separated by a given {@code splitMarker}.
    *
-   * @param tokens the token which should be concatenated
-   * @param splitMarker the split marker or null
+   * @param tokens The elements which should be concatenated.
+   * @param splitMarker The split marker or {@code null}.
    *
-   * @return the concatenated tokens
+   * @return The concatenated tokens as a single string.
    */
   String detokenize(String[] tokens, String splitMarker);
 }
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/DetokenizerEvaluator.java 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/DetokenizerEvaluator.java
index 7d9df4fc..1b98fdb1 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/DetokenizerEvaluator.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/DetokenizerEvaluator.java
@@ -27,28 +27,26 @@ import opennlp.tools.util.eval.FMeasure;
 /**
  * The {@link DetokenizerEvaluator} measures the performance of
  * the given {@link Detokenizer} with the provided reference
- * {@link TokenSample}s.
+ * {@link TokenSample samples}.
  *
- * @see DetokenizerEvaluator
  * @see Detokenizer
  * @see TokenSample
  */
 
 public class DetokenizerEvaluator extends Evaluator<TokenSample> {
-  private FMeasure fmeasure = new FMeasure();
+  private final FMeasure fmeasure = new FMeasure();
 
   /**
-   * The {@link Detokenizer} used to create the
-   * predicted tokens.
+   * The {@link Detokenizer} used to create the predicted tokens.
    */
-  private Detokenizer detokenizer;
+  private final Detokenizer detokenizer;
 
   /**
    * Initializes the current instance with the
    * given {@link Detokenizer}.
    *
-   * @param detokenizer the {@link Detokenizer} to evaluate.
-   * @param listeners   evaluation sample listeners
+   * @param detokenizer The {@link Detokenizer} to evaluate.
+   * @param listeners   The {@link DetokenEvaluationErrorListener evaluation 
sample listeners}.
    */
   public DetokenizerEvaluator(Detokenizer detokenizer, 
DetokenEvaluationErrorListener... listeners) {
     super(listeners);
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/DictionaryDetokenizer.java 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/DictionaryDetokenizer.java
index d53eefa1..e6cf3f6a 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/DictionaryDetokenizer.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/DictionaryDetokenizer.java
@@ -22,7 +22,7 @@ import java.util.Set;
 
 /**
  * A rule based detokenizer. Simple rules which indicate in which direction a 
token should be
- * moved are looked up in a {@link DetokenizationDictionary} object.
+ * moved are looked up in a {@link DetokenizationDictionary dictionary}.
  *
  * @see Detokenizer
  * @see DetokenizationDictionary
@@ -31,10 +31,16 @@ public class DictionaryDetokenizer implements Detokenizer {
 
   private final DetokenizationDictionary dict;
 
+  /**
+   * Initializes a {@link DictionaryDetokenizer} instance.
+   * 
+   * @param dict The {@link DetokenizationDictionary} to be used.
+   */
   public DictionaryDetokenizer(DetokenizationDictionary dict) {
     this.dict = dict;
   }
 
+  @Override
   public DetokenizationOperation[] detokenize(String[] tokens) {
 
     DetokenizationOperation[] operations = new 
DetokenizationOperation[tokens.length];
@@ -79,6 +85,7 @@ public class DictionaryDetokenizer implements Detokenizer {
     return operations;
   }
 
+  @Override
   public String detokenize(String[] tokens, String splitMarker) {
 
     DetokenizationOperation[] operations = detokenize(tokens);
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java
index b9b86c85..b2b1c173 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java
@@ -24,7 +24,11 @@ import opennlp.tools.util.Span;
 import opennlp.tools.util.StringUtil;
 
 /**
- * Performs tokenization using character classes.
+ * A basic {@link Tokenizer} implementation which performs tokenization
+ * using character classes.
+ * <p>
+ * To obtain an instance of this tokenizer use the static final
+ * {@link #INSTANCE} field.
  */
 public class SimpleTokenizer extends AbstractTokenizer {
 
@@ -34,7 +38,7 @@ public class SimpleTokenizer extends AbstractTokenizer {
     static final CharacterEnum NUMERIC = new CharacterEnum("numeric");
     static final CharacterEnum OTHER = new CharacterEnum("other");
 
-    private String name;
+    private final String name;
 
     private CharacterEnum(String name) {
       this.name = name;
@@ -45,21 +49,22 @@ public class SimpleTokenizer extends AbstractTokenizer {
       return name;
     }
   }
-  
-  public static final SimpleTokenizer INSTANCE;
 
-  static {
-    INSTANCE = new SimpleTokenizer();
-  }
+  /**
+   * Use this static reference to retrieve an instance of the
+   * {@link SimpleTokenizer}.
+   */
+  public static final SimpleTokenizer INSTANCE = new SimpleTokenizer();
 
   /**
-   * @deprecated Use INSTANCE field instead to obtain an instance, constructor
-   *     will be made private in the future.
+   * @deprecated Use {@link SimpleTokenizer#INSTANCE} field instead to obtain 
an instance.
+   *     This constructor will be made private in the future.
    */
-  @Deprecated
+  @Deprecated // TODO Decide when this will be private (see deprecation note!)
   public SimpleTokenizer() {
   }
 
+  @Override
   public Span[] tokenizePos(String s) {
     CharacterEnum charType = CharacterEnum.WHITESPACE;
     CharacterEnum state = charType;
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokSpanEventStream.java 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokSpanEventStream.java
index 75eb2414..3c04e3f2 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokSpanEventStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokSpanEventStream.java
@@ -30,27 +30,30 @@ import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.Span;
 
 /**
- * This class reads the {@link TokenSample}s from the given {@link Iterator}
- * and converts the {@link TokenSample}s into {@link Event}s which
+ * This class reads the {@link TokenSample samples} via an {@link Iterator}
+ * and converts the samples into {@link Event events} which
  * can be used by the maxent library for training.
  */
 public class TokSpanEventStream extends AbstractEventStream<TokenSample> {
 
-  private TokenContextGenerator cg;
+  private final TokenContextGenerator cg;
 
-  private boolean skipAlphaNumerics;
+  private final boolean skipAlphaNumerics;
 
   private final Pattern alphaNumeric;
 
   /**
-   * Initializes the current instance.
+   * Initializes a new event stream based on the data stream using a {@link 
TokenContextGenerator}.
    *
-   * @param tokenSamples
-   * @param skipAlphaNumerics
-   * @param cg
+   * @param tokenSamples The {@link ObjectStream data stream} for this event 
stream.
+   * @param skipAlphaNumerics Whether alphanumerics are skipped, or not.
+   * @param alphaNumeric A custom alphanumeric {@link Pattern} or {@code null}.
+   *                     Default is: {@code "^[A-Za-z0-9]+$"}, provided by
+   *                     {@link Factory#DEFAULT_ALPHANUMERIC}.
+   * @param cg A {@link TokenContextGenerator} which should be used for the 
event stream.
    */
-  public TokSpanEventStream(ObjectStream<TokenSample> tokenSamples,
-        boolean skipAlphaNumerics, Pattern alphaNumeric, TokenContextGenerator 
cg) {
+  public TokSpanEventStream(ObjectStream<TokenSample> tokenSamples, boolean 
skipAlphaNumerics,
+                            Pattern alphaNumeric, TokenContextGenerator cg) {
     super(tokenSamples);
     this.alphaNumeric = alphaNumeric;
     this.skipAlphaNumerics = skipAlphaNumerics;
@@ -58,26 +61,23 @@ public class TokSpanEventStream extends 
AbstractEventStream<TokenSample> {
   }
 
   /**
-   * Initializes the current instance.
+   * Initializes a new event stream based on the data stream using a {@link 
TokenContextGenerator}.
    *
-   * @param tokenSamples
-   * @param skipAlphaNumerics
-   * @param cg
+   * @param tokenSamples The {@link ObjectStream data stream} for this event 
stream.
+   * @param skipAlphaNumerics Whether alphanumerics are skipped, or not.
+   * @param cg A {@link TokenContextGenerator} which should be used for the 
event stream.
    */
-  public TokSpanEventStream(ObjectStream<TokenSample> tokenSamples,
-        boolean skipAlphaNumerics, TokenContextGenerator cg) {
-    super(tokenSamples);
-    Factory factory = new Factory();
-    this.alphaNumeric = factory.getAlphanumeric(null);
-    this.skipAlphaNumerics = skipAlphaNumerics;
-    this.cg = cg;
+  public TokSpanEventStream(ObjectStream<TokenSample> tokenSamples, boolean 
skipAlphaNumerics,
+                            TokenContextGenerator cg) {
+    this(tokenSamples, skipAlphaNumerics, new Factory().getAlphanumeric(null), 
cg );
   }
 
   /**
-   * Initializes the current instance.
+   * Initializes a new event stream based on the data stream using a {@link 
TokenContextGenerator}
+   * that relies on a {@link DefaultTokenContextGenerator}.
    *
-   * @param tokenSamples
-   * @param skipAlphaNumerics
+   * @param tokenSamples The {@link ObjectStream data stream} for this event 
stream.
+   * @param skipAlphaNumerics Whether alphanumerics are skipped, or not.
    */
   public TokSpanEventStream(ObjectStream<TokenSample> tokenSamples,
       boolean skipAlphaNumerics) {
@@ -85,10 +85,10 @@ public class TokSpanEventStream extends 
AbstractEventStream<TokenSample> {
   }
 
   /**
-   * Adds training events to the event stream for each of the specified tokens.
+   * Adds training events to the event stream for the specified {@link 
TokenSample sample}.
    *
    * @param tokenSample character offsets into the specified text.
-   * @return The text of the tokens.
+   * @return An {@link Iterator} for text {@link Event events} representing 
the {@code tokenSample}.
    */
   @Override
   protected Iterator<Event> createEvents(TokenSample tokenSample) {
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenContextGenerator.java 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenContextGenerator.java
index b15fd91c..475146bd 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenContextGenerator.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenContextGenerator.java
@@ -18,18 +18,15 @@
 package opennlp.tools.tokenize;
 
 /**
- * Interface for {@link TokenizerME} context generators.
+ * Interface for context generators required for {@link TokenizerME}.
  */
 public interface TokenContextGenerator {
 
   /**
-   * Returns an array of features for the specified sentence string at the 
specified index.
+   * @param sentence The string that represents a sentence.
+   * @param index The index to consider splitting tokens.
    *
-   * @param sentence The string for a sentence.
-   * @param index The index to consider splitting as a token.
-   *
-   * @return an array of features for the specified sentence string at the
-   *   specified index.
+   * @return An array of features for a {@code sentence} at the specified 
{@code index}.
    */
   String[] getContext(String sentence, int index);
 }
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenSample.java 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenSample.java
index b4e374ce..03f29470 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenSample.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenSample.java
@@ -42,10 +42,12 @@ public class TokenSample implements Serializable {
   private final List<Span> tokenSpans;
 
   /**
-   * Initializes the current instance.
+   * Initializes a {@link TokenSample instance}.
    *
-   * @param text the text which contains the tokens.
-   * @param tokenSpans the spans which mark the begin and end of the tokens.
+   * @param text The text which contains the tokens.
+   *             Must not be {@code null}.
+   * @param tokenSpans The spans which mark the start and end of the tokens.
+   *                   Must not be {@code null}.
    */
   public TokenSample(String text, Span[] tokenSpans) {
     Objects.requireNonNull(tokenSpans, "tokenSpans must not be null");
@@ -62,13 +64,21 @@ public class TokenSample implements Serializable {
     }
   }
 
+  /**
+   * Initializes a {@link TokenSample instance} via a {@link Detokenizer}.
+   *
+   * @param detokenizer The {@link Detokenizer} used to merge the tokens. Must not be {@code 
null}.
+   * @param tokens The tokens to be processed. Must not be {@code null}.
+   */
   public TokenSample(Detokenizer detokenizer, String[] tokens) {
 
-    StringBuilder sentence = new StringBuilder();
-
+    Objects.requireNonNull(detokenizer, "detokenizer must not be null");
+    Objects.requireNonNull(tokens, "tokens must not be null");
+    
     DetokenizationOperation[] operations = detokenizer.detokenize(tokens);
 
     List<Span> mergedTokenSpans = new ArrayList<>();
+    StringBuilder sentence = new StringBuilder();
 
     for (int i = 0; i < operations.length; i++) {
 
@@ -100,14 +110,14 @@ public class TokenSample implements Serializable {
   }
 
   /**
-   * Retrieves the text.
+   * @return The text.
    */
   public String getText() {
     return text;
   }
 
   /**
-   * Retrieves the token spans.
+   * @return Retrieves the token {@link Span spans}.
    */
   public Span[] getTokenSpans() {
     return tokenSpans.toArray(new Span[tokenSpans.size()]);
@@ -157,6 +167,14 @@ public class TokenSample implements Serializable {
         sample.append(" ");
   }
 
+  /**
+   * Parses a string sample.
+   *
+   * @param sampleString The sample to be parsed. Must not be {@code null}.
+   * @param separatorChars The characters to be considered separators.
+   *                       See {@link #DEFAULT_SEPARATOR_CHARS}. Must not be 
{@code null}.
+   * @return A valid {@link TokenSample} instance.
+   */
   public static TokenSample parse(String sampleString, String separatorChars) {
     Objects.requireNonNull(sampleString, "sampleString must not be null");
     Objects.requireNonNull(separatorChars, "separatorChars must not be null");
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenSampleStream.java 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenSampleStream.java
index 0beddc3a..84a0a63a 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenSampleStream.java
@@ -24,31 +24,47 @@ import opennlp.tools.util.FilterObjectStream;
 import opennlp.tools.util.ObjectStream;
 
 /**
- * This class is a stream filter which reads in string encoded samples and 
creates
- * {@link TokenSample}s out of them. The input string sample is tokenized if a
- * whitespace or the special separator chars occur.
+ * This class is a {@link FilterObjectStream stream filter} which reads in 
string encoded
+ * samples and creates {@link TokenSample samples} out of them.
+ * The input string sample is tokenized if a whitespace or the special 
separator chars occur.
  * <p>
  * Sample:<br>
  * "token1 token2 token3&lt;SPLIT&gt;token4"<br>
- * The tokens token1 and token2 are separated by a whitespace, token3 and 
token3
- * are separated by the special character sequence, in this case the default
- * split sequence.
+ * The tokens {@code token1} and {@code token2} are separated by a whitespace,
+ * {@code token3} and {@code token4} are separated by the special character 
sequence.
+ * In this case, the default split sequence applies.
  * <p>
- * The sequence must be unique in the input string and is not escaped.
+ * Note: The sequence must be unique in the input string and is not escaped.
  */
 public class TokenSampleStream extends FilterObjectStream<String, TokenSample> 
{
 
   private final String separatorChars;
 
-  public TokenSampleStream(ObjectStream<String> sampleStrings, String 
separatorChars) {
-    super(Objects.requireNonNull(sampleStrings, "sampleStrings must not be 
null"));
+  /**
+   * Initializes a {@link TokenSampleStream instance}.
+   *
+   * @param samples A plain text {@link ObjectStream line stream}.
+   *                Must not be {@code null}.
+   * @param separatorChars The characters to be considered separators.
+   *                       See {@link TokenSample#DEFAULT_SEPARATOR_CHARS}.
+   *                       Must not be {@code null}.
+   */
+  public TokenSampleStream(ObjectStream<String> samples, String 
separatorChars) {
+    super(Objects.requireNonNull(samples, "sampleStrings must not be null"));
     this.separatorChars = 
Objects.requireNonNull(separatorChars,"separatorChars must not be null");
   }
 
+  /**
+   * Initializes a {@link TokenSampleStream instance}.
+   *
+   * @param sentences A plain text {@link ObjectStream line stream}.
+   *                  Must not be {@code null}.
+   */
   public TokenSampleStream(ObjectStream<String> sentences) {
     this(sentences, TokenSample.DEFAULT_SEPARATOR_CHARS);
   }
 
+  @Override
   public TokenSample read() throws IOException {
     String sampleString = samples.read();
 
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/Tokenizer.java 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/Tokenizer.java
index 92b5e9b8..8a6bc37f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/Tokenizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/Tokenizer.java
@@ -23,17 +23,17 @@ import opennlp.tools.util.Span;
 /**
  * The interface for tokenizers, which segment a string into its tokens.
  * <p>
- * Tokenization is a necessary step before more complex NLP tasks can be 
applied,
- * these usually process text on a token level. The quality of tokenization is
+ * Tokenization is a necessary step before more complex NLP tasks can be 
applied.
+ * These usually process text on a token level. The quality of tokenization is
  * important because it influences the performance of high-level tasks applied 
to it.
  * <p>
- * In segmented languages like English most words are segmented by white spaces
+ * In segmented languages like English most words are segmented by whitespaces
  * except for punctuations, etc. which is directly attached to the word 
without a white space
  * in between, it is not possible to just split at all punctuations because in 
abbreviations dots
- * are a part of the token itself. A tokenizer is now responsible to split 
these tokens
+ * are a part of the token itself. A {@link Tokenizer} is now responsible to 
split those tokens
  * correctly.
  * <p>
- * In non-segmented languages like Chinese tokenization is more difficult 
since words
+ * In non-segmented languages like Chinese, tokenization is more difficult 
since words
  * are not segmented by a whitespace.
  * <p>
  * Tokenizers can also be used to segment already identified tokens further 
into more
@@ -41,16 +41,15 @@ import opennlp.tools.util.Span;
  * to gain insight into tokens which do not represent words like numbers, 
units or tokens
  * which are part of a special notation.
  * <p>
- * For most further task it is desirable to over tokenize rather than under 
tokenize.
+ * For most subsequent NLP tasks, it is desirable to over-tokenize rather than 
to under-tokenize.
  */
 public interface Tokenizer {
 
   /**
-   * Splits a string into its atomic parts
+   * Splits a string into its atomic parts.
    *
    * @param s The string to be tokenized.
-   * @return  The String[] with the individual tokens as the array
-   *          elements.
+   * @return  The String[] with the individual tokens as the array elements.
    */
   String[] tokenize(String s);
 
@@ -58,8 +57,7 @@ public interface Tokenizer {
    * Finds the boundaries of atomic parts in a string.
    *
    * @param s The string to be tokenized.
-   * @return The Span[] with the spans (offsets into s) for each
-   * token as the individuals array elements.
+   * @return The {@link Span spans (offsets into {@code s})} for each token as 
the individual array elements.
    */
   Span[] tokenizePos(String s);
 
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java
 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java
index 96d8d354..38351136 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java
@@ -24,14 +24,24 @@ import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.eval.CrossValidationPartitioner;
 import opennlp.tools.util.eval.FMeasure;
 
+/**
+ * A cross validator for {@link Tokenizer tokenizers}.
+ */
 public class TokenizerCrossValidator {
 
   private final TrainingParameters params;
 
-  private FMeasure fmeasure = new FMeasure();
-  private TokenizerEvaluationMonitor[] listeners;
+  private final FMeasure fmeasure = new FMeasure();
+  private final TokenizerEvaluationMonitor[] listeners;
   private final TokenizerFactory factory;
 
+  /**
+   * Creates a {@link TokenizerCrossValidator} using the given {@link 
TokenizerFactory}.
+   *
+   * @param params The {@link TrainingParameters} for the context of cross 
validation.
+   * @param factory The {@link TokenizerFactory} to be used.
+   * @param listeners The {@link TokenizerEvaluationMonitor evaluation 
listeners}.
+   */
   public TokenizerCrossValidator(TrainingParameters params,
       TokenizerFactory factory, TokenizerEvaluationMonitor... listeners) {
     this.params = params;
@@ -42,12 +52,10 @@ public class TokenizerCrossValidator {
   /**
    * Starts the evaluation.
    *
-   * @param samples
-   *          the data to train and test
-   * @param nFolds
-   *          number of folds
+   * @param samples The {@link ObjectStream} of {@link TokenSample samples} to 
train and test with.
+   * @param nFolds Number of folds. It must be greater than zero.
    *
-   * @throws IOException
+   * @throws IOException Thrown if IO errors occurred during evaluation.
    */
   public void evaluate(ObjectStream<TokenSample> samples, int nFolds) throws 
IOException {
 
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerEvaluationMonitor.java
 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerEvaluationMonitor.java
index 905a139b..6c3872fb 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerEvaluationMonitor.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerEvaluationMonitor.java
@@ -19,7 +19,9 @@ package opennlp.tools.tokenize;
 
 import opennlp.tools.util.eval.EvaluationMonitor;
 
-public interface TokenizerEvaluationMonitor extends
-    EvaluationMonitor<TokenSample> {
+/**
+ * A marker interface for evaluating {@link Tokenizer tokenizers}.
+ */
+public interface TokenizerEvaluationMonitor extends 
EvaluationMonitor<TokenSample> {
 
 }
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerEvaluator.java 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerEvaluator.java
index fa4d35bd..65e722d5 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerEvaluator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerEvaluator.java
@@ -25,7 +25,7 @@ import opennlp.tools.util.eval.FMeasure;
 /**
  * The {@link TokenizerEvaluator} measures the performance of
  * the given {@link Tokenizer} with the provided reference
- * {@link TokenSample}s.
+ * {@link TokenSample samples}.
  *
  * @see Evaluator
  * @see Tokenizer
@@ -33,20 +33,18 @@ import opennlp.tools.util.eval.FMeasure;
  */
 public class TokenizerEvaluator extends Evaluator<TokenSample> {
 
-  private FMeasure fmeasure = new FMeasure();
+  private final FMeasure fmeasure = new FMeasure();
 
   /**
-   * The {@link Tokenizer} used to create the
-   * predicted tokens.
+   * The {@link Tokenizer} used to create the predicted tokens.
    */
-  private Tokenizer tokenizer;
+  private final Tokenizer tokenizer;
 
   /**
-   * Initializes the current instance with the
-   * given {@link Tokenizer}.
+   * Initializes an instance to evaluate a {@link Tokenizer}.
    *
-   * @param tokenizer the {@link Tokenizer} to evaluate.
-   * @param listeners evaluation sample listeners
+   * @param tokenizer The {@link Tokenizer} to evaluate.
+   * @param listeners The {@link TokenizerEvaluationMonitor evaluation 
listeners}.
    */
   public TokenizerEvaluator(Tokenizer tokenizer, TokenizerEvaluationMonitor 
... listeners) {
     super(listeners);
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java
index ba3d285f..ca75071c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java
@@ -29,7 +29,7 @@ import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.ext.ExtensionLoader;
 
 /**
- * The factory that provides {@link Tokenizer} default implementations and
+ * The factory that provides {@link Tokenizer} default implementation and
  * resources. Users can extend this class if their application requires
  * overriding the {@link TokenContextGenerator}, {@link Dictionary} etc.
  */
@@ -45,33 +45,37 @@ public class TokenizerFactory extends BaseToolFactory {
   private static final String ALPHA_NUMERIC_PATTERN = "alphaNumericPattern";
 
   /**
-   * Creates a {@link TokenizerFactory} that provides the default 
implementation
+   * Instantiates a {@link TokenizerFactory} that provides the default 
implementation
    * of the resources.
    */
   public TokenizerFactory() {
   }
-
+  
   /**
-   * Creates a {@link TokenizerFactory}. Use this constructor to
+   * Instantiates a {@link TokenizerFactory}. Use this constructor to
    * programmatically create a factory.
    *
-   * @param languageCode
-   *          the language of the natural text
-   * @param abbreviationDictionary
-   *          an abbreviations dictionary
-   * @param useAlphaNumericOptimization
-   *          if true alpha numerics are skipped
-   * @param alphaNumericPattern
-   *          null or a custom alphanumeric pattern (default is:
-   *          "^[A-Za-z0-9]+$", provided by {@link 
Factory#DEFAULT_ALPHANUMERIC}
+   * @param languageCode The ISO language code to be used for this factory.
+   * @param abbreviationDictionary The {@link Dictionary} which holds 
abbreviations.
+   * @param useAlphaNumericOptimization Whether alphanumerics are skipped, or 
not.
+   * @param alphaNumericPattern {@code null} or a custom alphanumeric {@link 
Pattern}
+   *                            (default is: {@code "^[A-Za-z0-9]+$"}, 
provided by
+   *                            {@link Factory#DEFAULT_ALPHANUMERIC}).
    */
-  public TokenizerFactory(String languageCode,
-      Dictionary abbreviationDictionary, boolean useAlphaNumericOptimization,
-      Pattern alphaNumericPattern) {
+  public TokenizerFactory(String languageCode, Dictionary 
abbreviationDictionary,
+                          boolean useAlphaNumericOptimization, Pattern 
alphaNumericPattern) {
     this.init(languageCode, abbreviationDictionary,
         useAlphaNumericOptimization, alphaNumericPattern);
   }
 
+  /**
+   * @param languageCode The ISO language code to be used for this factory.
+   * @param abbreviationDictionary The {@link Dictionary} which holds 
abbreviations.
+   * @param useAlphaNumericOptimization Whether alphanumerics are skipped, or 
not.
+   * @param alphaNumericPattern {@code null} or a custom alphanumeric {@link 
Pattern}
+   *                            (default is: {@code "^[A-Za-z0-9]+$"}, 
provided by
+   *                            {@link Factory#DEFAULT_ALPHANUMERIC}).
+   */
   protected void init(String languageCode, Dictionary abbreviationDictionary,
       boolean useAlphaNumericOptimization, Pattern alphaNumericPattern) {
     this.languageCode = languageCode;
@@ -122,22 +126,24 @@ public class TokenizerFactory extends BaseToolFactory {
   }
 
   /**
-   * Factory method the framework uses create a new {@link TokenizerFactory}.
+   * Factory method the framework uses to instantiate a new {@link 
TokenizerFactory}.
    *
-   * @param subclassName the name of the class implementing the {@link 
TokenizerFactory}
-   * @param languageCode the language code the tokenizer should use
-   * @param abbreviationDictionary an optional dictionary containing 
abbreviations, or null if not present
-   * @param useAlphaNumericOptimization indicate if the alpha numeric 
optimization
-   *     should be enabled or disabled
-   * @param alphaNumericPattern the pattern the alpha numeric optimization 
should use
+   * @param subclassName The name of the class implementing the {@link 
TokenizerFactory}.
+   * @param languageCode The ISO language code the {@link Tokenizer} should 
use.
+   * @param abbreviationDictionary An optional {@link Dictionary} containing 
abbreviations,
+   *                               or {@code null} if not present.
+   * @param useAlphaNumericOptimization Whether the alphanumeric optimization 
is to be enabled or not.
+   * @param alphaNumericPattern The {@link Pattern} the alphanumeric 
optimization should use,
+   *                            if enabled.
    *
-   * @return the instance of the Tokenizer Factory
+   * @return A valid {@link TokenizerFactory} instance.
    *
-   * @throws InvalidFormatException if once of the input parameters doesn't 
comply if the expected format
+   * @throws InvalidFormatException Thrown if one of the input parameters 
doesn't comply with the expected format.
    */
-  public static TokenizerFactory create(String subclassName,
-      String languageCode, Dictionary abbreviationDictionary,
-      boolean useAlphaNumericOptimization, Pattern alphaNumericPattern)
+  public static TokenizerFactory create(String subclassName, String 
languageCode,
+                                        Dictionary abbreviationDictionary,
+                                        boolean useAlphaNumericOptimization,
+                                        Pattern alphaNumericPattern)
       throws InvalidFormatException {
     if (subclassName == null) {
       // will create the default factory
@@ -160,9 +166,7 @@ public class TokenizerFactory extends BaseToolFactory {
   }
 
   /**
-   * Gets the alpha numeric pattern.
-   *
-   * @return the user specified alpha numeric pattern or a default.
+   * @return Retrieves the (user-)specified alphanumeric {@link Pattern} or a 
default.
    */
   public Pattern getAlphaNumericPattern() {
     if (this.alphaNumericPattern == null) {
@@ -182,9 +186,7 @@ public class TokenizerFactory extends BaseToolFactory {
   }
 
   /**
-   * Gets whether to use alphanumeric optimization.
-   *
-   * @return true if the alpha numeric optimization is enabled, otherwise false
+   * @return {@code true} if the alphanumeric optimization is enabled, 
otherwise {@code false}.
    */
   public boolean isUseAlphaNumericOptmization() {
     if (artifactProvider != null) {
@@ -195,9 +197,7 @@ public class TokenizerFactory extends BaseToolFactory {
   }
 
   /**
-   * Gets the abbreviation dictionary
-   *
-   * @return null or the abbreviation dictionary
+   * @return The abbreviation {@link Dictionary} or {@code null} if none is 
active.
    */
   public Dictionary getAbbreviationDictionary() {
     if (this.abbreviationDictionary == null && artifactProvider != null) {
@@ -207,9 +207,7 @@ public class TokenizerFactory extends BaseToolFactory {
   }
 
   /**
-   * Retrieves the language code.
-   *
-   * @return the language code
+   * @return Retrieves the ISO language code in use.
    */
   public String getLanguageCode() {
     if (this.languageCode == null && this.artifactProvider != null) {
@@ -219,9 +217,7 @@ public class TokenizerFactory extends BaseToolFactory {
   }
 
   /**
-   * Gets the context generator
-   *
-   * @return a new instance of the context generator
+   * @return Retrieves a {@link TokenContextGenerator} instance.
    */
   public TokenContextGenerator getContextGenerator() {
     Factory f = new Factory();
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
index c64c2355..10086e9b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
@@ -38,22 +38,22 @@ import opennlp.tools.util.Span;
 import opennlp.tools.util.TrainingParameters;
 
 /**
- * A Tokenizer for converting raw text into separated tokens.  It uses
- * Maximum Entropy to make its decisions.  The features are loosely
+ * A {@link Tokenizer} for converting raw text into separated tokens. It uses
+ * Maximum Entropy to make its decisions. The features are loosely
  * based off of Jeff Reynar's UPenn thesis "Topic Segmentation:
  * Algorithms and Applications.", which is available from his
  * homepage: <a 
href="http://www.cis.upenn.edu/~jcreynar">http://www.cis.upenn.edu/~jcreynar</a>.
  * <p>
- * This tokenizer needs a statistical model to tokenize a text which reproduces
+ * This implementation needs a statistical model to tokenize a text which 
reproduces
  * the tokenization observed in the training data used to create the model.
- * The {@link TokenizerModel} class encapsulates the model and provides
+ * The {@link TokenizerModel} class encapsulates that model and provides
  * methods to create it from the binary representation.
  * <p>
- * A tokenizer instance is not thread safe. For each thread one tokenizer
- * must be instantiated which can share one <code>TokenizerModel</code> 
instance
+ * A tokenizer instance is not thread-safe. For each thread, one tokenizer
+ * must be instantiated which can share one {@link TokenizerModel} instance
  * to save memory.
  * <p>
- * To train a new model {{@link #train(ObjectStream, TokenizerFactory, 
TrainingParameters)} method
+ * To train a new model, the {@link #train(ObjectStream, TokenizerFactory, 
TrainingParameters)} method
  * can be used.
  * <p>
  * Sample usage:
@@ -69,7 +69,8 @@ import opennlp.tools.util.TrainingParameters;
  * <br>
  * String tokens[] = tokenizer.tokenize("A sentence to be tokenized.");
  * </code>
- *
+ * <p>
+ *   
  * @see Tokenizer
  * @see TokenizerModel
  * @see TokenSample
@@ -95,32 +96,31 @@ public class TokenizerME extends AbstractTokenizer {
 
   private final Pattern alphanumeric;
 
-  /**
+  /*
    * The maximum entropy model to use to evaluate contexts.
    */
-  private MaxentModel model;
+  private final MaxentModel model;
 
-  /**
+  /*
    * The context generator.
    */
   private final TokenContextGenerator cg;
 
-  /**
-   * Optimization flag to skip alpha numeric tokens for further
-   * tokenization
+  /*
+   * Optimization flag to skip alphanumeric tokens for further tokenization
    */
-  private boolean useAlphaNumericOptimization;
+  private final boolean useAlphaNumericOptimization;
 
-  /**
+  /*
    * List of probabilities for each token returned from a call to
    * <code>tokenize</code> or <code>tokenizePos</code>.
    */
-  private List<Double> tokProbs;
+  private final List<Double> tokProbs;
 
-  private List<Span> newTokens;
+  private final List<Span> newTokens;
 
   /**
-   * Initializes the tokenizer by downloading a default model.
+   * Initializes a {@link TokenizerME} by downloading a default model.
    * @param language The language of the tokenizer.
    * @throws IOException Thrown if the model cannot be downloaded or saved.
    */
@@ -129,6 +129,11 @@ public class TokenizerME extends AbstractTokenizer {
             TokenizerModel.class));
   }
 
+  /**
+   * Instantiates a {@link TokenizerME} with an existing {@link 
TokenizerModel}.
+   *
+   * @param model The {@link TokenizerModel} to be used.
+   */
   public TokenizerME(TokenizerModel model) {
     TokenizerFactory factory = model.getFactory();
     this.alphanumeric = factory.getAlphaNumericPattern();
@@ -144,6 +149,7 @@ public class TokenizerME extends AbstractTokenizer {
    * @deprecated use {@link TokenizerFactory} to extend the Tokenizer
    *             functionality
    */
+  @Deprecated
   public TokenizerME(TokenizerModel model, Factory factory) {
     String languageCode = model.getLanguage();
 
@@ -166,11 +172,9 @@ public class TokenizerME extends AbstractTokenizer {
   }
 
   /**
-   * Returns the probabilities associated with the most recent
-   * calls to {@link TokenizerME#tokenize(String)} or {@link 
TokenizerME#tokenizePos(String)}.
-   *
-   * @return probability for each token returned for the most recent
-   *     call to tokenize.  If not applicable an empty array is returned.
+   * @return the probabilities associated with the most recent calls to
+   *         {@link TokenizerME#tokenize(String)} or {@link 
TokenizerME#tokenizePos(String)}.
+   *         If not applicable an empty array is returned.
    */
   public double[] getTokenProbabilities() {
     double[] tokProbArray = new double[tokProbs.size()];
@@ -185,7 +189,7 @@ public class TokenizerME extends AbstractTokenizer {
    *
    * @param d  The string to be tokenized.
    *
-   * @return   A span array containing individual tokens as elements.
+   * @return   A {@link Span} array containing individual tokens as elements.
    */
   public Span[] tokenizePos(String d) {
     WhitespaceTokenizer whitespaceTokenizer = WhitespaceTokenizer.INSTANCE;
@@ -232,18 +236,12 @@ public class TokenizerME extends AbstractTokenizer {
   /**
    * Trains a model for the {@link TokenizerME}.
    *
-   * @param samples
-   *          the samples used for the training.
-   * @param factory
-   *          a {@link TokenizerFactory} to get resources from
-   * @param mlParams
-   *          the machine learning train parameters
-   * @return the trained {@link TokenizerModel}
-   * @throws IOException
-   *           it throws an {@link IOException} if an {@link IOException} is
-   *           thrown during IO operations on a temp file which is created
-   *           during training. Or if reading from the {@link ObjectStream}
-   *           fails.
+   * @param samples The samples used for the training.
+   * @param factory A {@link TokenizerFactory} to get resources from.
+   * @param mlParams The machine learning {@link TrainingParameters train 
parameters}.
+   * @return A trained {@link TokenizerModel}.
+   * @throws IOException Thrown during IO operations on a temp file which is 
created
+   *           during training. Or if reading from the {@link ObjectStream} 
fails.
    */
   public static TokenizerModel train(ObjectStream<TokenSample> samples, 
TokenizerFactory factory,
       TrainingParameters mlParams) throws IOException {
@@ -263,9 +261,7 @@ public class TokenizerME extends AbstractTokenizer {
   }
 
   /**
-   * Returns the value of the alpha-numeric optimization flag.
-   *
-   * @return true if the tokenizer should use alpha-numeric optimization, 
false otherwise.
+   * @return {@code true} if the tokenizer uses alphanumeric optimization, 
{@code false} otherwise.
    */
   public boolean useAlphaNumericOptimization() {
     return useAlphaNumericOptimization;
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
index 8e732991..b2d5003f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
@@ -38,6 +38,7 @@ import opennlp.tools.util.model.ModelUtil;
  * by a learnable {@link Tokenizer}.
  *
  * @see TokenizerME
+ * @see TokenizerFactory
  */
 public final class TokenizerModel extends BaseModel {
 
@@ -46,11 +47,11 @@ public final class TokenizerModel extends BaseModel {
   private static final String TOKENIZER_MODEL_ENTRY = "token.model";
 
   /**
-   * Initializes the current instance.
+   * Initializes a {@link TokenizerModel} instance via a {@link MaxentModel} 
and related resources.
    *
-   * @param tokenizerModel the model
-   * @param manifestInfoEntries the manifest
-   * @param tokenizerFactory the factory
+   * @param tokenizerModel The {@link MaxentModel model} to be used.
+   * @param manifestInfoEntries Additional information kept in the manifest.
+   * @param tokenizerFactory The {@link TokenizerFactory} to be used 
internally.
    */
   public TokenizerModel(MaxentModel tokenizerModel,
       Map<String, String> manifestInfoEntries, TokenizerFactory 
tokenizerFactory) {
@@ -60,48 +61,54 @@ public final class TokenizerModel extends BaseModel {
   }
 
   /**
-   * Initializes the current instance.
+   * Initializes a {@link TokenizerModel} instance via a valid {@link 
InputStream}.
    *
-   * @param in the Input Stream to load the model from
+   * @param in The {@link InputStream} used for loading the model.
    *
-   * @throws IOException if reading from the stream fails in anyway
-   * @throws InvalidFormatException if the stream doesn't have the expected 
format
+   * @throws IOException Thrown if IO errors occurred during initialization.
    */
   public TokenizerModel(InputStream in) throws IOException {
     super(COMPONENT_NAME, in);
   }
 
   /**
-   * Initializes the current instance.
+   * Initializes a {@link TokenizerModel} instance via a valid {@link File}.
    *
-   * @param modelFile the file containing the tokenizer model
+   * @param modelFile The {@link File} used for loading the model.
    *
-   * @throws IOException if reading from the stream fails in anyway
+   * @throws IOException Thrown if IO errors occurred during initialization.
    */
   public TokenizerModel(File modelFile) throws IOException {
     super(COMPONENT_NAME, modelFile);
   }
 
+  /**
+   * Initializes a {@link TokenizerModel} instance via a valid {@link Path}.
+   *
+   * @param modelPath The {@link Path} used for loading the model.
+   *
+   * @throws IOException Thrown if IO errors occurred during initialization.
+   */
   public TokenizerModel(Path modelPath) throws IOException {
     this(modelPath.toFile());
   }
 
   /**
-   * Initializes the current instance.
+   * Initializes a {@link TokenizerModel} instance via a valid {@link URL}.
    *
-   * @param modelURL the URL pointing to the tokenizer model
+   * @param modelURL The {@link URL} used for loading the model.
    *
-   * @throws IOException if reading from the stream fails in anyway
+   * @throws IOException Thrown if IO errors occurred during initialization.
    */
   public TokenizerModel(URL modelURL) throws IOException {
     super(COMPONENT_NAME, modelURL);
   }
 
   /**
-   * Checks if the tokenizer model has the right outcomes.
+   * Checks if the {@link TokenizerModel} has the right outcomes.
    *
-   * @param model
-   * @return
+   * @param model The {@link MaxentModel} to be checked.
+   * @return {@code true} if the model could be validated, {@code false} 
otherwise.
    */
   private static boolean isModelCompatible(MaxentModel model) {
     return ModelUtil.validateOutcomes(model, TokenizerME.SPLIT, 
TokenizerME.NO_SPLIT);
@@ -120,6 +127,9 @@ public final class TokenizerModel extends BaseModel {
     }
   }
 
+  /**
+   * @return Retrieves the active {@link TokenizerFactory}.
+   */
   public TokenizerFactory getFactory() {
     return (TokenizerFactory) this.toolFactory;
   }
@@ -129,10 +139,16 @@ public final class TokenizerModel extends BaseModel {
     return TokenizerFactory.class;
   }
 
+  /**
+   * @return Retrieves the model as {@link MaxentModel} instance.
+   */
   public MaxentModel getMaxentModel() {
     return (MaxentModel) artifactMap.get(TOKENIZER_MODEL_ENTRY);
   }
 
+  /**
+   * @return Retrieves the active abbreviation {@link Dictionary}.
+   */
   public Dictionary getAbbreviations() {
     if (getFactory() != null) {
       return getFactory().getAbbreviationDictionary();
@@ -140,6 +156,9 @@ public final class TokenizerModel extends BaseModel {
     return null;
   }
 
+  /**
+   * @return {@code true} if alphanumeric optimization is active, {@code 
false} otherwise.
+   */
   public boolean useAlphaNumericOptimization() {
     return getFactory() != null && getFactory().isUseAlphaNumericOptmization();
   }
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerStream.java 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerStream.java
index bfb87c56..c8296773 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerStream.java
@@ -24,19 +24,26 @@ import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.Span;
 
 /**
- * The {@link TokenizerStream} uses a tokenizer to tokenize the
- * input string and output {@link TokenSample}s.
+ * The {@link TokenizerStream} uses a {@link Tokenizer} to tokenize the
+ * input string and output {@link TokenSample samples}.
  */
 public class TokenizerStream implements ObjectStream<TokenSample> {
 
-  private Tokenizer tokenizer;
-  private ObjectStream<String> input;
+  private final Tokenizer tokenizer;
+  private final ObjectStream<String> input;
 
+  /**
+   * Initializes a {@link TokenizerStream instance}.
+   *
+   * @param tokenizer A working {@link Tokenizer} instance.
+   * @param input A plain text {@link ObjectStream line stream}.
+   */
   public TokenizerStream(Tokenizer tokenizer, ObjectStream<String> input) {
     this.tokenizer = tokenizer;
     this.input = input;
   }
 
+  @Override
   public TokenSample read() throws IOException {
     String inputString = input.read();
 
@@ -49,10 +56,12 @@ public class TokenizerStream implements 
ObjectStream<TokenSample> {
     return null;
   }
 
+  @Override
   public void close() throws IOException {
     input.close();
   }
 
+  @Override
   public void reset() throws IOException,
       UnsupportedOperationException {
     input.reset();
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenStream.java 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenStream.java
index 4f6694f9..24ddef48 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenStream.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenStream.java
@@ -24,15 +24,21 @@ import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.Span;
 
 /**
- * This stream formats a {@link TokenSample}s into whitespace
+ * This stream formats {@link ObjectStream} of {@link TokenSample samples} into whitespace
  * separated token strings.
  */
 public class WhitespaceTokenStream extends FilterObjectStream<TokenSample, String> {
 
+  /**
+   * Initializes a {@link WhitespaceTokenStream}.
+   *
+   * @param tokens The {@link ObjectStream stream} of tokens to be separated.
+   */
   public WhitespaceTokenStream(ObjectStream<TokenSample> tokens) {
     super(tokens);
   }
 
+  @Override
   public String read() throws IOException {
     TokenSample tokenSample = samples.read();
 
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenizer.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenizer.java
index f918bceb..7209a754 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenizer.java
@@ -24,10 +24,11 @@ import opennlp.tools.util.Span;
 import opennlp.tools.util.StringUtil;
 
 /**
- * This tokenizer uses white spaces to tokenize the input text.
- *
+ * A basic {@link Tokenizer} implementation which performs tokenization
+ * using white spaces.
+ * <p>
  * To obtain an instance of this tokenizer use the static final
- * <code>INSTANCE</code> field.
+ * {@link #INSTANCE} field.
  */
 public class WhitespaceTokenizer extends AbstractTokenizer {
 
@@ -37,18 +38,19 @@ public class WhitespaceTokenizer extends AbstractTokenizer {
    */
   public static final WhitespaceTokenizer INSTANCE = new WhitespaceTokenizer();
 
-  /**
+  /*
    * Use the {@link WhitespaceTokenizer#INSTANCE} field to retrieve an 
instance.
    */
   private WhitespaceTokenizer() {
   }
 
+  @Override
   public Span[] tokenizePos(String d) {
     int tokStart = -1;
     List<Span> tokens = new ArrayList<>();
     boolean inTok = false;
 
-    //gather up potential tokens
+    // gather potential tokens
     int end = d.length();
     for (int i = 0; i < end; i++) {
       if (StringUtil.isWhitespace(d.charAt(i))) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/WordpieceTokenizer.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/WordpieceTokenizer.java
index bbd31458..79752a48 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/WordpieceTokenizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/WordpieceTokenizer.java
@@ -24,13 +24,23 @@ import java.util.Set;
 import opennlp.tools.util.Span;
 
 /**
- * A WordPiece tokenizer.
- *
- * Adapted from https://github.com/robrua/easy-bert under the MIT license.
- *
+ * A {@link Tokenizer} implementation which performs tokenization
+ * using word pieces.
+ * <p>
+ * Adapted under MIT license from
+ * <a href="https://github.com/robrua/easy-bert">https://github.com/robrua/easy-bert</a>.
+ * <p>
  * For reference see:
- *  - https://www.tensorflow.org/text/guide/subwords_tokenizer#applying_wordpiece
- *  - https://cran.r-project.org/web/packages/wordpiece/vignettes/basic_usage.html
+ * <ul>
+ *  <li>
+ *  <a href="https://www.tensorflow.org/text/guide/subwords_tokenizer#applying_wordpiece">
+ *    https://www.tensorflow.org/text/guide/subwords_tokenizer#applying_wordpiece</a>
+ *  </li>
+ *  <li>
+ *  <a href="https://cran.r-project.org/web/packages/wordpiece/vignettes/basic_usage.html">
+ *    https://cran.r-project.org/web/packages/wordpiece/vignettes/basic_usage.html</a>
+ *  </li>
+ * </ul>
  */
 public class WordpieceTokenizer implements Tokenizer {
 
@@ -38,15 +48,28 @@ public class WordpieceTokenizer implements Tokenizer {
   private static final String SEPARATOR_TOKEN = "[SEP]";
   private static final String UNKNOWN_TOKEN = "[UNK]";
 
-  private Set<String> vocabulary;
+  private final Set<String> vocabulary;
   private int maxTokenLength = 50;
 
+  /**
+   * Initializes a {@link WordpieceTokenizer} with a {@code vocabulary} and a default
+   * {@code maxTokenLength} of {@code 50}.
+   *
+   * @param vocabulary  A set of tokens considered the vocabulary.
+   */
   public WordpieceTokenizer(Set<String> vocabulary) {
     this.vocabulary = vocabulary;
   }
 
+  /**
+   * Initializes a {@link WordpieceTokenizer} with a {@code vocabulary} and a custom
+   * {@code maxTokenLength}.
+   *
+   * @param vocabulary  A set of tokens considered the vocabulary.
+   * @param maxTokenLength A non-negative number that is used as maximum token length.
+   */
   public WordpieceTokenizer(Set<String> vocabulary, int maxTokenLength) {
-    this.vocabulary = vocabulary;
+    this(vocabulary);
     this.maxTokenLength = maxTokenLength;
   }
 
@@ -145,6 +168,9 @@ public class WordpieceTokenizer implements Tokenizer {
 
   }
 
+  /**
+   * @return The maximum token length.
+   */
   public int getMaxTokenLength() {
     return maxTokenLength;
   }
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/package-info.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/package-info.java
index 43650605..0b45b1fc 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/package-info.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/package-info.java
@@ -17,8 +17,8 @@
 
 /**
  * Contains classes related to finding token or words in a string. All
- * tokenizer implement the Tokenizer interface. Currently there is the
- * learnable <code>TokenizerME</code>, the <code>WhitespaceTokenizer</code> and
- * the <code>SimpleTokenizer</code> which is a character class tokenizer.
+ * tokenizer implement the Tokenizer interface. Currently, there is the
+ * learnable {@code TokenizerME}, the {@code WhitespaceTokenizer} and
+ * the {@code SimpleTokenizer} which is a character class tokenizer.
  */
 package opennlp.tools.tokenize;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/eval/Evaluator.java b/opennlp-tools/src/main/java/opennlp/tools/util/eval/Evaluator.java
index ee8d49ab..d00cf7dc 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/eval/Evaluator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/eval/Evaluator.java
@@ -51,13 +51,13 @@ public abstract class Evaluator<T> {
   }
 
   /**
-   * Evaluates the given reference sample object.
+   * Evaluates the given reference {@link T} sample object.
    *
    * The implementation has to update the score after every invocation.
    *
    * @param reference the reference sample.
    *
-   * @return the predicted sample
+   * @return the predicted {@link T} sample
    */
   protected abstract T processSample(T reference);

[opennlp] branch master updated: OPENNLP-1405 Enhance JavaDoc in opennlp.tools.tokenize package (#448)

Reply via email to