This is an automated email from the ASF dual-hosted git repository.
rzo1 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/main by this push:
new 77393307f OPENNLP-1826 : Prevent OOM during Array Allocation (#1035)
77393307f is described below
commit 77393307fcdc36d519e3377d63354243e5dbfb71
Author: subbudvk <[email protected]>
AuthorDate: Sat May 9 00:24:41 2026 +0530
OPENNLP-1826 : Prevent OOM during Array Allocation (#1035)
* Prevent OOM due to Array Allocation
* Prevent OOM due to Array Allocation
* Prevent OOM due to Array Allocation
* Prevent OOM due to Array Allocation
* Prevent OOM due to Array Allocation
* Prevent OOM due to Array Allocation
---
.../opennlp/tools/parser/lang/en/HeadRules.java | 18 +++++++-
.../parser/lang/es/AncoraSpanishHeadRules.java | 18 +++++++-
.../tools/parser/lang/en/HeadRulesTest.java | 53 ++++++++++++++++++++++
3 files changed, 87 insertions(+), 2 deletions(-)
diff --git
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/parser/lang/en/HeadRules.java
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/parser/lang/en/HeadRules.java
index 677d37bff..1cf8ce136 100644
---
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/parser/lang/en/HeadRules.java
+++
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/parser/lang/en/HeadRules.java
@@ -48,6 +48,12 @@ import opennlp.tools.util.model.SerializableArtifact;
*/
public class HeadRules implements opennlp.tools.parser.HeadRules, GapLabeler,
SerializableArtifact {
+ // POS tagsets are fixed by linguistic convention (Penn Treebank: ~45 tags).
+ // No single head rule will ever list more than a small fraction of the tagset.
+ // 1000 gives 20x headroom over the real-world maximum and is not configurable
+ // because tag counts are a linguistics constraint, not a deployment parameter.
+ private static final int MAX_TAGS_PER_RULE = 1_000;
+
public static class HeadRulesSerializer implements
ArtifactSerializer<HeadRules> {
public HeadRules create(InputStream in) throws IOException {
@@ -196,7 +202,17 @@ public class HeadRules implements
opennlp.tools.parser.HeadRules, GapLabeler, Se
String num = st.nextToken();
String type = st.nextToken();
String dir = st.nextToken();
- String[] tags = new String[Integer.parseInt(num) - 2];
+ int rawCount;
+ try {
+ rawCount = Integer.parseInt(num);
+ } catch (NumberFormatException e) {
+ throw new IOException("Invalid tag count in head rules: " + num, e);
+ }
+ int numTags = rawCount - 2;
+ if (numTags < 0 || numTags > MAX_TAGS_PER_RULE) {
+ throw new IOException("Invalid tag count in head rules: " + num);
+ }
+ String[] tags = new String[numTags];
int ti = 0;
while (st.hasMoreTokens()) {
tags[ti] = st.nextToken();
diff --git
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java
index ced56a087..82b0d6d96 100644
---
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java
+++
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java
@@ -62,6 +62,12 @@ import opennlp.tools.util.model.SerializableArtifact;
*/
public class AncoraSpanishHeadRules implements HeadRules, GapLabeler,
SerializableArtifact {
+ // POS tagsets are fixed by linguistic convention (Penn Treebank: ~45 tags).
+ // No single head rule will ever list more than a small fraction of the tagset.
+ // 1000 gives 20x headroom over the real-world maximum and is not configurable
+ // because tag counts are a linguistics constraint, not a deployment parameter.
+ private static final int MAX_TAGS_PER_RULE = 1_000;
+
public static class HeadRulesSerializer implements
ArtifactSerializer<AncoraSpanishHeadRules> {
public AncoraSpanishHeadRules create(InputStream in) throws IOException {
@@ -212,7 +218,17 @@ public class AncoraSpanishHeadRules implements HeadRules,
GapLabeler, Serializab
String num = st.nextToken();
String type = st.nextToken();
String dir = st.nextToken();
- String[] tags = new String[Integer.parseInt(num) - 2];
+ int rawCount;
+ try {
+ rawCount = Integer.parseInt(num);
+ } catch (NumberFormatException e) {
+ throw new IOException("Invalid tag count in head rules: " + num, e);
+ }
+ int numTags = rawCount - 2;
+ if (numTags < 0 || numTags > MAX_TAGS_PER_RULE) {
+ throw new IOException("Invalid tag count in head rules: " + num);
+ }
+ String[] tags = new String[numTags];
int ti = 0;
while (st.hasMoreTokens()) {
tags[ti] = st.nextToken();
diff --git
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/parser/lang/en/HeadRulesTest.java
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/parser/lang/en/HeadRulesTest.java
index 7b60d502f..eb099bf1d 100644
---
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/parser/lang/en/HeadRulesTest.java
+++
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/parser/lang/en/HeadRulesTest.java
@@ -23,6 +23,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
+import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import org.junit.jupiter.api.Assertions;
@@ -30,6 +31,58 @@ import org.junit.jupiter.api.Test;
public class HeadRulesTest {
+ /**
+ * Positive: a well-formed head rules line with a small tag count loads without error.
+ */
+ @Test
+ void testValidTagCountLoads() throws IOException {
+ // "5 NP 1 NN NNS NNP" — num=5, tags=3 (5-2=3)
+ String rules = "5 NP 1 NN NNS NNP\n";
+ Assertions.assertDoesNotThrow(() -> new HeadRules(new
StringReader(rules)));
+ }
+
+ /**
+ * Negative: a head rules line with a huge tag count must throw IOException,
+ * not attempt to allocate an array of Integer.MAX_VALUE - 2 elements.
+ */
+ @Test
+ void testOversizedTagCountThrows() {
+ String rules = "2147483647 NP 1\n";
+ Assertions.assertThrows(IOException.class,
+ () -> new HeadRules(new StringReader(rules)));
+ }
+
+ /**
+ * Negative: a tag count that would produce a negative array size must throw IOException.
+ */
+ @Test
+ void testNegativeTagCountThrows() {
+ String rules = "1 NP 1\n"; // 1 - 2 = -1
+ Assertions.assertThrows(IOException.class,
+ () -> new HeadRules(new StringReader(rules)));
+ }
+
+ /**
+ * Boundary: value just above MAX_TAGS_PER_RULE (1003 → numTags = 1001) must throw IOException.
+ */
+ @Test
+ void testJustAboveLimitThrows() {
+ // 1003 declared; 1003 - 2 = 1001 tags, which exceeds MAX_TAGS_PER_RULE (1000)
+ String rules = "1003 NP 1\n";
+ Assertions.assertThrows(IOException.class,
+ () -> new HeadRules(new StringReader(rules)));
+ }
+
+ /**
+ * Negative: a non-numeric tag count must throw IOException, not NumberFormatException.
+ */
+ @Test
+ void testNonNumericTagCountThrows() {
+ String rules = "NaN NP 1\n";
+ Assertions.assertThrows(IOException.class,
+ () -> new HeadRules(new StringReader(rules)));
+ }
+
@Test
void testSerialization() throws IOException {
try (InputStream headRulesIn =