(opennlp) 01/01: OPENNLP-1826 : Prevent OOM during Array Allocation (#1035)

mawiesne Fri, 12 Jun 2026 06:22:40 -0700

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch opennlp-1.x
in repository https://gitbox.apache.org/repos/asf/opennlp.git


commit db84ed59684ad7b555affc9ed5e17d30d9d14e58
Author: subbudvk <[email protected]>
AuthorDate: Sat May 9 00:24:41 2026 +0530

    OPENNLP-1826 : Prevent OOM during Array Allocation (#1035)
    
    * Prevent OOM due to Array Allocation
    
    * Prevent OOM due to Array Allocation
    
    * Prevent OOM due to Array Allocation
    
    * Prevent OOM due to Array Allocation
    
    * Prevent OOM due to Array Allocation
    
    * Prevent OOM due to Array Allocation
---
 .../opennlp/tools/parser/lang/en/HeadRules.java    | 18 +++++++-
 .../parser/lang/es/AncoraSpanishHeadRules.java     | 18 +++++++-
 .../tools/parser/lang/en/HeadRulesTest.java        | 53 ++++++++++++++++++++++
 3 files changed, 87 insertions(+), 2 deletions(-)

diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/parser/lang/en/HeadRules.java 
b/opennlp-tools/src/main/java/opennlp/tools/parser/lang/en/HeadRules.java
index 6784204e3..8e7dc162c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/parser/lang/en/HeadRules.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/parser/lang/en/HeadRules.java
@@ -49,6 +49,12 @@ import opennlp.tools.util.model.SerializableArtifact;
  */
 public class HeadRules implements opennlp.tools.parser.HeadRules, GapLabeler, 
SerializableArtifact {
 
+  // POS tagsets are fixed by linguistic convention (Penn Treebank: ~45 tags).
+  // No single head rule will ever list more than a small fraction of the tagset.
+  // 1000 gives 20x headroom over the real-world maximum and is not configurable
+  // because tag counts are a linguistics constraint, not a deployment parameter.
+  private static final int MAX_TAGS_PER_RULE = 1_000;
+
   public static class HeadRulesSerializer implements 
ArtifactSerializer<HeadRules> {
 
     public HeadRules create(InputStream in) throws IOException {
@@ -208,7 +214,17 @@ public class HeadRules implements 
opennlp.tools.parser.HeadRules, GapLabeler, Se
       String num = st.nextToken();
       String type = st.nextToken();
       String dir = st.nextToken();
-      String[] tags = new String[Integer.parseInt(num) - 2];
+      int rawCount;
+      try {
+        rawCount = Integer.parseInt(num);
+      } catch (NumberFormatException e) {
+        throw new IOException("Invalid tag count in head rules: " + num, e);
+      }
+      int numTags = rawCount - 2;
+      if (numTags < 0 || numTags > MAX_TAGS_PER_RULE) {
+        throw new IOException("Invalid tag count in head rules: " + num);
+      }
+      String[] tags = new String[numTags];
       int ti = 0;
       while (st.hasMoreTokens()) {
         tags[ti] = st.nextToken();
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java
 
b/opennlp-tools/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java
index 6083f3517..b655f8f83 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java
@@ -62,6 +62,12 @@ import opennlp.tools.util.model.SerializableArtifact;
  */
 public class AncoraSpanishHeadRules implements HeadRules, GapLabeler, 
SerializableArtifact {
 
+  // POS tagsets are fixed by linguistic convention (Penn Treebank: ~45 tags).
+  // No single head rule will ever list more than a small fraction of the tagset.
+  // 1000 gives 20x headroom over the real-world maximum and is not configurable
+  // because tag counts are a linguistics constraint, not a deployment parameter.
+  private static final int MAX_TAGS_PER_RULE = 1_000;
+
   public static class HeadRulesSerializer implements 
ArtifactSerializer<AncoraSpanishHeadRules> {
 
     public AncoraSpanishHeadRules create(InputStream in) throws IOException {
@@ -213,7 +219,17 @@ public class AncoraSpanishHeadRules implements HeadRules, 
GapLabeler, Serializab
       String num = st.nextToken();
       String type = st.nextToken();
       String dir = st.nextToken();
-      String[] tags = new String[Integer.parseInt(num) - 2];
+      int rawCount;
+      try {
+        rawCount = Integer.parseInt(num);
+      } catch (NumberFormatException e) {
+        throw new IOException("Invalid tag count in head rules: " + num, e);
+      }
+      int numTags = rawCount - 2;
+      if (numTags < 0 || numTags > MAX_TAGS_PER_RULE) {
+        throw new IOException("Invalid tag count in head rules: " + num);
+      }
+      String[] tags = new String[numTags];
       int ti = 0;
       while (st.hasMoreTokens()) {
         tags[ti] = st.nextToken();
diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/parser/lang/en/HeadRulesTest.java 
b/opennlp-tools/src/test/java/opennlp/tools/parser/lang/en/HeadRulesTest.java
index 037ded46f..d57db8541 100644
--- 
a/opennlp-tools/src/test/java/opennlp/tools/parser/lang/en/HeadRulesTest.java
+++ 
b/opennlp-tools/src/test/java/opennlp/tools/parser/lang/en/HeadRulesTest.java
@@ -23,6 +23,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.OutputStreamWriter;
+import java.io.StringReader;
 import java.nio.charset.StandardCharsets;
 
 import org.junit.Assert;
@@ -30,6 +31,58 @@ import org.junit.Test;
 
 public class HeadRulesTest {
 
+  /**
+   * Positive: a well-formed head rules line with a small tag count loads 
without error.
+   */
+  @Test
+  void testValidTagCountLoads() throws IOException {
+    // "5 NP 1 NN NNS NNP" — num=5, tags=3 (5-2=3)
+    String rules = "5 NP 1 NN NNS NNP\n";
+    Assertions.assertDoesNotThrow(() -> new HeadRules(new 
StringReader(rules)));
+  }
+
+  /**
+   * Negative: a head rules line with a huge tag count must throw IOException,
+   * not attempt to allocate an array of Integer.MAX_VALUE - 2 String references.
+   */
+  @Test
+  void testOversizedTagCountThrows() {
+    String rules = "2147483647 NP 1\n";
+    Assertions.assertThrows(IOException.class,
+        () -> new HeadRules(new StringReader(rules)));
+  }
+
+  /**
+   * Negative: a tag count that would produce a negative array size must throw 
IOException.
+   */
+  @Test
+  void testNegativeTagCountThrows() {
+    String rules = "1 NP 1\n";  // 1 - 2 = -1
+    Assertions.assertThrows(IOException.class,
+        () -> new HeadRules(new StringReader(rules)));
+  }
+
+  /**
+   * Boundary: value just above MAX_TAGS_PER_RULE (1003 → numTags = 1001) must 
throw IOException.
+   */
+  @Test
+  void testJustAboveLimitThrows() {
+    // 1003 declared; 1003 - 2 = 1001 tags, which exceeds MAX_TAGS_PER_RULE 
(1000)
+    String rules = "1003 NP 1\n";
+    Assertions.assertThrows(IOException.class,
+        () -> new HeadRules(new StringReader(rules)));
+  }
+
+  /**
+   * Negative: non-numeric tag count must throw IOException, not NumberFormatException.
+   */
+  @Test
+  void testNonNumericTagCountThrows() {
+    String rules = "NaN NP 1\n";
+    Assertions.assertThrows(IOException.class,
+        () -> new HeadRules(new StringReader(rules)));
+  }
+
   @Test
   public void testSerialization() throws IOException {
     InputStream headRulesIn =

(opennlp) 01/01: OPENNLP-1826 : Prevent OOM during Array Allocation (#1035)

Reply via email to