This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/main by this push:
new 49678c37 OPENNLP-1677: Extend JavaDoc of POSTaggerME (#717)
49678c37 is described below
commit 49678c3743962d27ef47c0b86c75a7a52c026151
Author: Martin Wiesner <[email protected]>
AuthorDate: Sat Dec 21 17:32:18 2024 +0100
OPENNLP-1677: Extend JavaDoc of POSTaggerME (#717)
---
.../java/opennlp/tools/postag/POSTaggerME.java | 74 +++++++++++++++-------
1 file changed, 52 insertions(+), 22 deletions(-)
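For context, a minimal usage sketch of the API whose JavaDoc this commit extends; this is an illustrative snippet, assuming a pre-trained model is available (the file name en-pos-maxent.bin below is a placeholder, not part of this change):

    import java.io.File;
    import java.io.IOException;

    import opennlp.tools.postag.POSModel;
    import opennlp.tools.postag.POSTaggerME;

    public class POSTagSketch {
      public static void main(String[] args) throws IOException {
        // Load a pre-trained POS model (path is illustrative).
        POSModel model = new POSModel(new File("en-pos-maxent.bin"));
        POSTaggerME tagger = new POSTaggerME(model);

        // Tag an already tokenized sentence; one tag is returned per token.
        String[] tokens = {"The", "driver", "stopped", "the", "car"};
        String[] tags = tagger.tag(tokens);

        // Probabilities for each tag of the last tagged sentence.
        double[] probs = tagger.probs();
      }
    }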
diff --git a/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java b/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
index cd419e67..a85ceb55 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
@@ -50,15 +50,22 @@ import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.featuregen.StringPattern;
/**
- * A {@link POSTagger part-of-speech tagger} that uses maximum entropy.
+ * A {@link POSTagger part-of-speech tagger} implementation that uses maximum entropy.
* <p>
- * Tries to predict whether words are nouns, verbs, or any of 70 other POS tags
+ * Tries to predict whether words are nouns, verbs, or any other {@link POSTagFormat POS tags}
* depending on their surrounding context.
+ *
+ * @see POSModel
+ * @see POSTagFormat
+ * @see POSTagger
*/
public class POSTaggerME implements POSTagger {
private static final Logger logger = LoggerFactory.getLogger(POSTaggerME.class);
+ /**
+ * The default beam size value is 3.
+ */
public static final int DEFAULT_BEAM_SIZE = 3;
private final POSModel modelPackage;
@@ -66,7 +73,7 @@ public class POSTaggerME implements POSTagger {
/**
* The {@link POSContextGenerator feature context generator}.
*/
- protected final POSContextGenerator contextGen;
+ protected final POSContextGenerator cg;
/**
* {@link TagDictionary} used for restricting words to a fixed set of tags.
@@ -140,7 +147,7 @@ public class POSTaggerME implements POSTagger {
modelPackage = model;
- contextGen = factory.getPOSContextGenerator(beamSize);
+ cg = factory.getPOSContextGenerator(beamSize);
tagDictionary = factory.getTagDictionary();
size = beamSize;
@@ -165,14 +172,20 @@ public class POSTaggerME implements POSTagger {
return model.getOutcomes();
}
+ /**
+ * {@inheritDoc}
+ */
@Override
public String[] tag(String[] sentence) {
return this.tag(sentence, null);
}
+ /**
+ * {@inheritDoc}
+ */
@Override
public String[] tag(String[] sentence, Object[] additionalContext) {
- bestSequence = model.bestSequence(sentence, additionalContext, contextGen, sequenceValidator);
+ bestSequence = model.bestSequence(sentence, additionalContext, cg, sequenceValidator);
final List<String> t = bestSequence.getOutcomes();
return convertTags(t);
}
@@ -186,7 +199,7 @@ public class POSTaggerME implements POSTagger {
*/
public String[][] tag(int numTaggings, String[] sentence) {
Sequence[] bestSequences = model.bestSequences(numTaggings, sentence, null,
- contextGen, sequenceValidator);
+ cg, sequenceValidator);
String[][] tags = new String[bestSequences.length][];
for (int si = 0; si < tags.length; si++) {
List<String> t = bestSequences[si].getOutcomes();
@@ -204,18 +217,25 @@ public class POSTaggerME implements POSTagger {
}
}
+ /**
+ * {@inheritDoc}
+ */
@Override
public Sequence[] topKSequences(String[] sentence) {
return this.topKSequences(sentence, null);
}
+ /**
+ * {@inheritDoc}
+ */
@Override
public Sequence[] topKSequences(String[] sentence, Object[] additionalContext) {
- return model.bestSequences(size, sentence, additionalContext, contextGen, sequenceValidator);
+ return model.bestSequences(size, sentence, additionalContext, cg, sequenceValidator);
}
/**
- * Populates the specified array with the probabilities for each tag of the last tagged sentence.
+ * Populates the specified {@code probs} array with the probabilities
+ * for each tag of the last tagged sentence.
*
* @param probs An array to put the probabilities into.
*/
@@ -239,7 +259,7 @@ public class POSTaggerME implements POSTagger {
MaxentModel posModel = modelPackage.getArtifact(POSModel.POS_MODEL_ENTRY_NAME);
if (posModel != null) {
- double[] probs = posModel.eval(contextGen.getContext(index, words.toArray(new String[0]),
+ double[] probs = posModel.eval(cg.getContext(index, words.toArray(new String[0]),
tags.toArray(new String[0]), null));
String[] orderedTags = new String[probs.length];
@@ -263,34 +283,44 @@ public class POSTaggerME implements POSTagger {
}
}
- public static POSModel train(String languageCode,
- ObjectStream<POSSample> samples, TrainingParameters trainParams,
- POSTaggerFactory posFactory) throws IOException {
-
- int beamSize = trainParams.getIntParameter(BeamSearch.BEAM_SIZE_PARAMETER, POSTaggerME.DEFAULT_BEAM_SIZE);
-
- POSContextGenerator contextGenerator = posFactory.getPOSContextGenerator();
+ /**
+ * Starts a training of a {@link POSModel} with the given parameters.
+ *
+ * @param languageCode The ISO language code to train the model. Must not be {@code null}.
+ * @param samples The {@link ObjectStream} of {@link POSSample} used as input for training.
+ * @param mlParams The {@link TrainingParameters} for the context of the training process.
+ * @param posFactory The {@link POSTaggerFactory} for creating related objects as defined
+ * via {@code mlParams}.
+ *
+ * @return A valid, trained {@link POSModel} instance.
+ * @throws IOException Thrown if IO errors occurred.
+ */
+ public static POSModel train(String languageCode, ObjectStream<POSSample> samples,
+ TrainingParameters mlParams, POSTaggerFactory posFactory)
+ throws IOException {
- Map<String, String> manifestInfoEntries = new HashMap<>();
+ final int beamSize = mlParams.getIntParameter(
+ BeamSearch.BEAM_SIZE_PARAMETER, POSTaggerME.DEFAULT_BEAM_SIZE);
- TrainerType trainerType = TrainerFactory.getTrainerType(trainParams);
+ final POSContextGenerator contextGenerator = posFactory.getPOSContextGenerator();
+ final TrainerType trainerType = TrainerFactory.getTrainerType(mlParams);
+ final Map<String, String> manifestInfoEntries = new HashMap<>();
MaxentModel posModel = null;
SequenceClassificationModel seqPosModel = null;
if (TrainerType.EVENT_MODEL_TRAINER.equals(trainerType)) {
ObjectStream<Event> es = new POSSampleEventStream(samples, contextGenerator);
- EventTrainer trainer = TrainerFactory.getEventTrainer(trainParams,
- manifestInfoEntries);
+ EventTrainer trainer = TrainerFactory.getEventTrainer(mlParams, manifestInfoEntries);
posModel = trainer.train(es);
} else if (TrainerType.EVENT_MODEL_SEQUENCE_TRAINER.equals(trainerType)) {
POSSampleSequenceStream ss = new POSSampleSequenceStream(samples, contextGenerator);
EventModelSequenceTrainer<POSSample> trainer =
- TrainerFactory.getEventModelSequenceTrainer(trainParams, manifestInfoEntries);
+ TrainerFactory.getEventModelSequenceTrainer(mlParams, manifestInfoEntries);
posModel = trainer.train(ss);
} else if (TrainerType.SEQUENCE_TRAINER.equals(trainerType)) {
SequenceTrainer trainer = TrainerFactory.getSequenceModelTrainer(
- trainParams, manifestInfoEntries);
+ mlParams, manifestInfoEntries);
// TODO: This will probably cause issue, since the feature generator uses the outcomes array