This is an automated email from the ASF dual-hosted git repository.

rzo1 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/main by this push:
     new 77393307f OPENNLP-1826 : Prevent OOM during Array Allocation (#1035)
77393307f is described below

commit 77393307fcdc36d519e3377d63354243e5dbfb71
Author: subbudvk <[email protected]>
AuthorDate: Sat May 9 00:24:41 2026 +0530

    OPENNLP-1826 : Prevent OOM during Array Allocation (#1035)
    
    * Prevent OOM due to Array Allocation
    
    * Prevent OOM due to Array Allocation
    
    * Prevent OOM due to Array Allocation
    
    * Prevent OOM due to Array Allocation
    
    * Prevent OOM due to Array Allocation
    
    * Prevent OOM due to Array Allocation
---
 .../opennlp/tools/parser/lang/en/HeadRules.java    | 18 +++++++-
 .../parser/lang/es/AncoraSpanishHeadRules.java     | 18 +++++++-
 .../tools/parser/lang/en/HeadRulesTest.java        | 53 ++++++++++++++++++++++
 3 files changed, 87 insertions(+), 2 deletions(-)

diff --git 
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/parser/lang/en/HeadRules.java
 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/parser/lang/en/HeadRules.java
index 677d37bff..1cf8ce136 100644
--- 
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/parser/lang/en/HeadRules.java
+++ 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/parser/lang/en/HeadRules.java
@@ -48,6 +48,12 @@ import opennlp.tools.util.model.SerializableArtifact;
  */
 public class HeadRules implements opennlp.tools.parser.HeadRules, GapLabeler, 
SerializableArtifact {
 
+  // POS tagsets are fixed by linguistic convention (Penn Treebank: ~45 tags).
+  // No single head rule will ever list more than a small fraction of the 
tagset.
+  // 1000 gives 20x headroom over the real-world maximum and is not 
configurable
+  // because tag counts are a linguistics constraint, not a deployment 
parameter.
+  private static final int MAX_TAGS_PER_RULE = 1_000;
+
   public static class HeadRulesSerializer implements 
ArtifactSerializer<HeadRules> {
 
     public HeadRules create(InputStream in) throws IOException {
@@ -196,7 +202,17 @@ public class HeadRules implements 
opennlp.tools.parser.HeadRules, GapLabeler, Se
       String num = st.nextToken();
       String type = st.nextToken();
       String dir = st.nextToken();
-      String[] tags = new String[Integer.parseInt(num) - 2];
+      int rawCount;
+      try {
+        rawCount = Integer.parseInt(num);
+      } catch (NumberFormatException e) {
+        throw new IOException("Invalid tag count in head rules: " + num, e);
+      }
+      int numTags = rawCount - 2;
+      if (numTags < 0 || numTags > MAX_TAGS_PER_RULE) {
+        throw new IOException("Invalid tag count in head rules: " + num);
+      }
+      String[] tags = new String[numTags];
       int ti = 0;
       while (st.hasMoreTokens()) {
         tags[ti] = st.nextToken();
diff --git 
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java
 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java
index ced56a087..82b0d6d96 100644
--- 
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java
+++ 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java
@@ -62,6 +62,12 @@ import opennlp.tools.util.model.SerializableArtifact;
  */
 public class AncoraSpanishHeadRules implements HeadRules, GapLabeler, 
SerializableArtifact {
 
+  // POS tagsets are fixed by linguistic convention (e.g. Penn Treebank: ~45 tags).
+  // No single head rule will ever list more than a small fraction of the tagset.
+  // 1000 leaves ample headroom over any realistic rule and is not configurable
+  // because tag counts are a linguistics constraint, not a deployment parameter.
+  private static final int MAX_TAGS_PER_RULE = 1_000;
+
   public static class HeadRulesSerializer implements 
ArtifactSerializer<AncoraSpanishHeadRules> {
 
     public AncoraSpanishHeadRules create(InputStream in) throws IOException {
@@ -212,7 +218,17 @@ public class AncoraSpanishHeadRules implements HeadRules, 
GapLabeler, Serializab
       String num = st.nextToken();
       String type = st.nextToken();
       String dir = st.nextToken();
-      String[] tags = new String[Integer.parseInt(num) - 2];
+      int rawCount;
+      try {
+        rawCount = Integer.parseInt(num);
+      } catch (NumberFormatException e) {
+        throw new IOException("Invalid tag count in head rules: " + num, e);
+      }
+      int numTags = rawCount - 2;
+      if (numTags < 0 || numTags > MAX_TAGS_PER_RULE) {
+        throw new IOException("Invalid tag count in head rules: " + num);
+      }
+      String[] tags = new String[numTags];
       int ti = 0;
       while (st.hasMoreTokens()) {
         tags[ti] = st.nextToken();
diff --git 
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/parser/lang/en/HeadRulesTest.java
 
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/parser/lang/en/HeadRulesTest.java
index 7b60d502f..eb099bf1d 100644
--- 
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/parser/lang/en/HeadRulesTest.java
+++ 
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/parser/lang/en/HeadRulesTest.java
@@ -23,6 +23,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.OutputStreamWriter;
+import java.io.StringReader;
 import java.nio.charset.StandardCharsets;
 
 import org.junit.jupiter.api.Assertions;
@@ -30,6 +31,58 @@ import org.junit.jupiter.api.Test;
 
 public class HeadRulesTest {
 
+  /**
+   * Positive: a well-formed head rules line with a small tag count loads 
without error.
+   */
+  @Test
+  void testValidTagCountLoads() throws IOException {
+    // "5 NP 1 NN NNS NNP" — num=5, tags=3 (5-2=3)
+    String rules = "5 NP 1 NN NNS NNP\n";
+    Assertions.assertDoesNotThrow(() -> new HeadRules(new 
StringReader(rules)));
+  }
+
+  /**
+   * Negative: a head rules line with a huge tag count must throw IOException,
+   * not attempt to allocate an array of nearly Integer.MAX_VALUE elements.
+   */
+  @Test
+  void testOversizedTagCountThrows() {
+    String rules = "2147483647 NP 1\n";
+    Assertions.assertThrows(IOException.class,
+        () -> new HeadRules(new StringReader(rules)));
+  }
+
+  /**
+   * Negative: a tag count that would produce a negative array size must throw 
IOException.
+   */
+  @Test
+  void testNegativeTagCountThrows() {
+    String rules = "1 NP 1\n";  // 1 - 2 = -1
+    Assertions.assertThrows(IOException.class,
+        () -> new HeadRules(new StringReader(rules)));
+  }
+
+  /**
+   * Boundary: value just above MAX_TAGS_PER_RULE (1003 → numTags = 1001) must 
throw IOException.
+   */
+  @Test
+  void testJustAboveLimitThrows() {
+    // 1003 declared; 1003 - 2 = 1001 tags, which exceeds MAX_TAGS_PER_RULE 
(1000)
+    String rules = "1003 NP 1\n";
+    Assertions.assertThrows(IOException.class,
+        () -> new HeadRules(new StringReader(rules)));
+  }
+
+  /**
+   * Negative: a non-numeric tag count must throw IOException, not NumberFormatException.
+   */
+  @Test
+  void testNonNumericTagCountThrows() {
+    String rules = "NaN NP 1\n";
+    Assertions.assertThrows(IOException.class,
+        () -> new HeadRules(new StringReader(rules)));
+  }
+
   @Test
   void testSerialization() throws IOException {
     try (InputStream headRulesIn =

Reply via email to