This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch 
OPENNLP-1585-Reduce-creation-of-String-objects-for-prefixes-in-several-FeatureGenerator-classes
in repository https://gitbox.apache.org/repos/asf/opennlp.git

commit 107935b25a414749864143e0cd2eafb4529ee17b
Author: Martin Wiesner <[email protected]>
AuthorDate: Mon Jul 1 11:18:53 2024 +0200

    OPENNLP-1585 Reduce creation of String objects for prefixes in several FeatureGenerator classes
    - enhances the Pattern in FeatureGeneratorUtil
    - extract constants from prefixes in several FeatureGenerator classes
    - improves JavaDoc along the path
---
 .../main/java/opennlp/tools/ngram/NGramModel.java  |  9 +++---
 .../AdditionalContextFeatureGenerator.java         |  4 ++-
 .../tools/util/featuregen/BrownCluster.java        |  1 +
 .../featuregen/CharacterNgramFeatureGenerator.java |  6 ++--
 .../tools/util/featuregen/GeneratorFactory.java    |  2 +-
 .../util/featuregen/PrefixFeatureGenerator.java    |  4 ++-
 .../SentenceFeatureGeneratorFactory.java           |  1 -
 .../tools/util/featuregen/StringPattern.java       | 32 +++++++++++++++++-----
 .../util/featuregen/SuffixFeatureGenerator.java    |  4 ++-
 .../featuregen/TokenClassFeatureGenerator.java     | 10 +++----
 .../util/featuregen/TokenFeatureGenerator.java     |  6 ++--
 .../featuregen/TokenPatternFeatureGenerator.java   | 12 +++++---
 12 files changed, 60 insertions(+), 31 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java 
b/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java
index db1beee0..87572d7f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java
@@ -46,16 +46,16 @@ public class NGramModel implements Iterable<StringList> {
   private final Map<StringList, Integer> mNGrams = new LinkedHashMap<>();
 
   /**
-   * Initializes an empty instance.
+   * Instantiates an empty {@link NGramModel} instance.
    */
   public NGramModel() {
   }
 
   /**
-   * Initializes the current instance.
+   * Instantiates a {@link NGramModel} via an {@link InputStream} reference.
    *
    * @param in the serialized model stream
-   * @throws IOException
+   * @throws IOException Thrown if errors occurred reading from {@code in}.
    */
   public NGramModel(InputStream in) throws IOException {
     DictionaryEntryPersistor.create(in, entry -> {
@@ -67,8 +67,7 @@ public class NGramModel implements Iterable<StringList> {
         countValueString = entry.attributes().getValue(COUNT);
 
         if (countValueString == null) {
-          throw new InvalidFormatException(
-              "The count attribute must be set!");
+          throw new InvalidFormatException("The count attribute must be set!");
         }
 
         count = Integer.parseInt(countValueString);
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/AdditionalContextFeatureGenerator.java
 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/AdditionalContextFeatureGenerator.java
index 24fdf5c9..c35a1c1a 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/AdditionalContextFeatureGenerator.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/AdditionalContextFeatureGenerator.java
@@ -26,6 +26,8 @@ import java.util.List;
  */
 public class AdditionalContextFeatureGenerator implements 
AdaptiveFeatureGenerator {
 
+  private static final String PREFIX = "ne=";
+
   private String[][] additionalContext;
 
   @Override
@@ -36,7 +38,7 @@ public class AdditionalContextFeatureGenerator implements 
AdaptiveFeatureGenerat
       String[] context = additionalContext[index];
 
       for (String s : context) {
-        features.add("ne=" + s);
+        features.add(PREFIX + s);
       }
     }
   }
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownCluster.java 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownCluster.java
index b4ddd2b7..37641a2a 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownCluster.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownCluster.java
@@ -39,6 +39,7 @@ import opennlp.tools.util.model.SerializableArtifact;
  * <p>
  * Originally available at: <a 
href="http://metaoptimize.com/projects/wordreprs/">
  * http://metaoptimize.com/projects/wordreprs/</a>.
+ * <p>
  * Further details can be found in the
 * <a href="https://dl.acm.org/doi/10.5555/1858681.1858721">related research 
paper</a>.
  * <p>
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CharacterNgramFeatureGenerator.java
 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CharacterNgramFeatureGenerator.java
index 50cb2522..bcad5840 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CharacterNgramFeatureGenerator.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CharacterNgramFeatureGenerator.java
@@ -29,6 +29,8 @@ import opennlp.tools.util.StringUtil;
  */
 public class CharacterNgramFeatureGenerator implements 
AdaptiveFeatureGenerator {
 
+  private static final String PREFIX = "ng=";
+
   private final int minLength;
   private final int maxLength;
 
@@ -45,7 +47,7 @@ public class CharacterNgramFeatureGenerator implements 
AdaptiveFeatureGenerator
 
   /**
    * Initializes a {@link CharacterNgramFeatureGenerator} with
-   * min 2 length and max 5 length of ngrams.
+   * min length of {@code 2} and max length of {@code 5} for ngrams.
    */
   public CharacterNgramFeatureGenerator() {
     this(2, 5);
@@ -58,7 +60,7 @@ public class CharacterNgramFeatureGenerator implements 
AdaptiveFeatureGenerator
 
     for (StringList tokenList : model) {
       if (tokenList.size() > 0) {
-        features.add("ng=" + StringUtil.toLowerCase(tokenList.getToken(0)));
+        features.add(PREFIX + StringUtil.toLowerCase(tokenList.getToken(0)));
       }
     }
   }
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
index efbae0ec..7dccaed6 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
@@ -516,7 +516,7 @@ public class GeneratorFactory {
     }
 
     /**
-     * @return null if the subclass uses {@link #resourceManager} to 
instantiate
+     * @return {@code null} if the subclass uses {@link #resourceManager} to 
instantiate
      * @throws InvalidFormatException
      */
     public abstract AdaptiveFeatureGenerator create() throws 
InvalidFormatException;
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/PrefixFeatureGenerator.java
 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/PrefixFeatureGenerator.java
index 2e10195f..6466feef 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/PrefixFeatureGenerator.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/PrefixFeatureGenerator.java
@@ -21,6 +21,8 @@ import java.util.List;
 
 public class PrefixFeatureGenerator implements AdaptiveFeatureGenerator {
 
+  private static final String PREFIX = "pre=";
+
   static final int DEFAULT_MAX_LENGTH = 4;
   
   private final int prefixLength;
@@ -38,7 +40,7 @@ public class PrefixFeatureGenerator implements 
AdaptiveFeatureGenerator {
       String[] previousOutcomes) {
     String[] prefs = getPrefixes(tokens[index]);
     for (String pref : prefs) {
-      features.add("pre=" + pref);
+      features.add(PREFIX + pref);
     }
   }
   
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SentenceFeatureGeneratorFactory.java
 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SentenceFeatureGeneratorFactory.java
index 3ffefaa6..d452fe7e 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SentenceFeatureGeneratorFactory.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SentenceFeatureGeneratorFactory.java
@@ -31,7 +31,6 @@ public class SentenceFeatureGeneratorFactory
 
   @Override
   public AdaptiveFeatureGenerator create() throws InvalidFormatException {
-    String beginFeatureString = generatorElement.getAttribute("begin");
     return new SentenceFeatureGenerator(getBool("begin", true), getBool("end", 
true));
   }
 }
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/StringPattern.java 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/StringPattern.java
index af4cb96b..500b5a24 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/StringPattern.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/StringPattern.java
@@ -141,49 +141,49 @@ public class StringPattern {
   }
 
   /**
-   * @return true if all characters are letters.
+   * @return {@code true} if all characters are letters.
    */
   public boolean isAllLetter() {
     return (pattern & ALL_LETTERS) > 0;
   }
 
   /**
-   * @return true if first letter is capital.
+   * @return {@code true} if first letter is capital.
    */
   public boolean isInitialCapitalLetter() {
     return (pattern & INITAL_CAPITAL_LETTER) > 0;
   }
 
   /**
-   * @return true if all letters are capital.
+   * @return {@code true} if all letters are capital.
    */
   public boolean isAllCapitalLetter() {
     return (pattern & ALL_CAPITAL_LETTER) > 0;
   }
 
   /**
-   * @return true if all letters are lower case.
+   * @return {@code true} if all letters are lower case.
    */
   public boolean isAllLowerCaseLetter() {
     return (pattern & ALL_LOWERCASE_LETTER) > 0;
   }
 
   /**
-   * @return true if all chars are digits.
+   * @return {@code true} if all chars are digits.
    */
   public boolean isAllDigit() {
     return (pattern & ALL_DIGIT) > 0;
   }
 
   /**
-   * @return true if all chars are hiragana.
+   * @return {@code true} if all chars are hiragana.
    */
   public boolean isAllHiragana() {
     return (pattern & ALL_HIRAGANA) > 0;
   }
 
   /**
-   * @return true if all chars are katakana.
+   * @return {@code true} if all chars are katakana.
    */
   public boolean isAllKatakana() {
     return (pattern & ALL_KATAKANA) > 0;
@@ -196,26 +196,44 @@ public class StringPattern {
     return digits;
   }
 
+  /**
+   * @return {@code true} if a period is contained.
+   */
   public boolean containsPeriod() {
     return (pattern & CONTAINS_PERIOD) > 0;
   }
 
+  /**
+   * @return {@code true} if a comma is contained.
+   */
   public boolean containsComma() {
     return (pattern & CONTAINS_COMMA) > 0;
   }
 
+  /**
+   * @return {@code true} if a slash is contained.
+   */
   public boolean containsSlash() {
     return (pattern & CONTAINS_SLASH) > 0;
   }
 
+  /**
+   * @return {@code true} if a digit is contained.
+   */
   public boolean containsDigit() {
     return (pattern & CONTAINS_DIGIT) > 0;
   }
 
+  /**
+   * @return {@code true} if a hyphen is contained.
+   */
   public boolean containsHyphen() {
     return (pattern & CONTAINS_HYPHEN) > 0;
   }
 
+  /**
+   * @return {@code true} if letters are contained.
+   */
   public boolean containsLetters() {
     return (pattern & CONTAINS_LETTERS) > 0;
   }
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SuffixFeatureGenerator.java
 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SuffixFeatureGenerator.java
index f1a18d83..e890cf08 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SuffixFeatureGenerator.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SuffixFeatureGenerator.java
@@ -21,6 +21,8 @@ import java.util.List;
 
 public class SuffixFeatureGenerator implements AdaptiveFeatureGenerator {
 
+  private static final String PREFIX = "suf=";
+
   static final int DEFAULT_MAX_LENGTH = 4;
     
   private final int suffixLength;
@@ -38,7 +40,7 @@ public class SuffixFeatureGenerator implements 
AdaptiveFeatureGenerator {
       String[] previousOutcomes) {
     String[] suffs = getSuffixes(tokens[index]);
     for (String suff : suffs) {
-      features.add("suf=" + suff);
+      features.add(PREFIX + suff);
     }
   }
   
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenClassFeatureGenerator.java
 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenClassFeatureGenerator.java
index 56546c37..07a8d40f 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenClassFeatureGenerator.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenClassFeatureGenerator.java
@@ -23,12 +23,12 @@ import opennlp.tools.util.StringUtil;
 
 
 /**
- * Generates features for different for the class of the token.
+ * Generates features for the class of the token.
  */
 public class TokenClassFeatureGenerator implements AdaptiveFeatureGenerator {
 
-  private static final String TOKEN_CLASS_PREFIX = "wc";
-  private static final String TOKEN_AND_CLASS_PREFIX = "w&c";
+  private static final String TOKEN_CLASS_PREFIX = "wc=";
+  private static final String TOKEN_AND_CLASS_PREFIX = "w&c=";
 
   private final boolean generateWordAndClassFeature;
 
@@ -43,10 +43,10 @@ public class TokenClassFeatureGenerator implements 
AdaptiveFeatureGenerator {
   @Override
   public void createFeatures(List<String> features, String[] tokens, int 
index, String[] preds) {
     String wordClass = FeatureGeneratorUtil.tokenFeature(tokens[index]);
-    features.add(TOKEN_CLASS_PREFIX + "=" + wordClass);
+    features.add(TOKEN_CLASS_PREFIX + wordClass);
 
     if (generateWordAndClassFeature) {
-      features.add(TOKEN_AND_CLASS_PREFIX + "=" + 
StringUtil.toLowerCase(tokens[index]) +
+      features.add(TOKEN_AND_CLASS_PREFIX + 
StringUtil.toLowerCase(tokens[index]) +
           "," + wordClass);
     }
   }
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenFeatureGenerator.java
 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenFeatureGenerator.java
index ced4a55c..53119e06 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenFeatureGenerator.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenFeatureGenerator.java
@@ -27,7 +27,7 @@ import opennlp.tools.util.StringUtil;
  */
 public class TokenFeatureGenerator implements AdaptiveFeatureGenerator {
 
-  private static final String WORD_PREFIX = "w";
+  private static final String WORD_PREFIX = "w=";
   private final boolean lowercase;
 
   /**
@@ -49,10 +49,10 @@ public class TokenFeatureGenerator implements 
AdaptiveFeatureGenerator {
   @Override
   public void createFeatures(List<String> features, String[] tokens, int 
index, String[] preds) {
     if (lowercase) {
-      features.add(WORD_PREFIX + "=" + StringUtil.toLowerCase(tokens[index]));
+      features.add(WORD_PREFIX + StringUtil.toLowerCase(tokens[index]));
     }
     else {
-      features.add(WORD_PREFIX + "=" + tokens[index]);
+      features.add(WORD_PREFIX + tokens[index]);
     }
   }
 }
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenPatternFeatureGenerator.java
 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenPatternFeatureGenerator.java
index 99adfc98..512230a5 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenPatternFeatureGenerator.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenPatternFeatureGenerator.java
@@ -31,6 +31,10 @@ import opennlp.tools.util.StringUtil;
  */
 public class TokenPatternFeatureGenerator implements AdaptiveFeatureGenerator {
 
+  private static final String SUB_TOKEN_PREFIX = "st=" ;
+  private static final String SUB_TOKEN_PART2_PREFIX = "pt2=" ;
+  private static final String SUB_TOKEN_PART3_PREFIX = "pt3=" ;
+
   private final Pattern noLetters = Pattern.compile("[^a-zA-Z]");
   private final Tokenizer tokenizer;
 
@@ -57,7 +61,7 @@ public class TokenPatternFeatureGenerator implements 
AdaptiveFeatureGenerator {
     String[] tokenized = tokenizer.tokenize(toks[index]);
 
     if (tokenized.length == 1) {
-      feats.add("st=" + StringUtil.toLowerCase(toks[index]));
+      feats.add(SUB_TOKEN_PREFIX + StringUtil.toLowerCase(toks[index]));
       return;
     }
 
@@ -68,12 +72,12 @@ public class TokenPatternFeatureGenerator implements 
AdaptiveFeatureGenerator {
     for (int i = 0; i < tokenized.length; i++) {
 
       if (i < tokenized.length - 1) {
-        feats.add("pt2=" + FeatureGeneratorUtil.tokenFeature(tokenized[i]) +
+        feats.add(SUB_TOKEN_PART2_PREFIX + 
FeatureGeneratorUtil.tokenFeature(tokenized[i]) +
             FeatureGeneratorUtil.tokenFeature(tokenized[i + 1]));
       }
 
       if (i < tokenized.length - 2) {
-        feats.add("pt3=" + FeatureGeneratorUtil.tokenFeature(tokenized[i]) +
+        feats.add(SUB_TOKEN_PART3_PREFIX + 
FeatureGeneratorUtil.tokenFeature(tokenized[i]) +
             FeatureGeneratorUtil.tokenFeature(tokenized[i + 1]) +
             FeatureGeneratorUtil.tokenFeature(tokenized[i + 2]));
       }
@@ -81,7 +85,7 @@ public class TokenPatternFeatureGenerator implements 
AdaptiveFeatureGenerator {
       pattern.append(FeatureGeneratorUtil.tokenFeature(tokenized[i]));
 
       if (!noLetters.matcher(tokenized[i]).find()) {
-        feats.add("st=" + StringUtil.toLowerCase(tokenized[i]));
+        feats.add(SUB_TOKEN_PREFIX + StringUtil.toLowerCase(tokenized[i]));
       }
     }
 

Reply via email to