This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch opennlp-1.x in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit db84ed59684ad7b555affc9ed5e17d30d9d14e58 Author: subbudvk <[email protected]> AuthorDate: Sat May 9 00:24:41 2026 +0530 OPENNLP-1826 : Prevent OOM during Array Allocation (#1035) * Prevent OOM due to Array Allocation * Prevent OOM due to Array Allocation * Prevent OOM due to Array Allocation * Prevent OOM due to Array Allocation * Prevent OOM due to Array Allocation * Prevent OOM due to Array Allocation --- .../opennlp/tools/parser/lang/en/HeadRules.java | 18 +++++++- .../parser/lang/es/AncoraSpanishHeadRules.java | 18 +++++++- .../tools/parser/lang/en/HeadRulesTest.java | 53 ++++++++++++++++++++++ 3 files changed, 87 insertions(+), 2 deletions(-) diff --git a/opennlp-tools/src/main/java/opennlp/tools/parser/lang/en/HeadRules.java b/opennlp-tools/src/main/java/opennlp/tools/parser/lang/en/HeadRules.java index 6784204e3..8e7dc162c 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/parser/lang/en/HeadRules.java +++ b/opennlp-tools/src/main/java/opennlp/tools/parser/lang/en/HeadRules.java @@ -49,6 +49,12 @@ import opennlp.tools.util.model.SerializableArtifact; */ public class HeadRules implements opennlp.tools.parser.HeadRules, GapLabeler, SerializableArtifact { + // POS tagsets are fixed by linguistic convention (Penn Treebank: ~45 tags). + // No single head rule will ever list more than a small fraction of the tagset. + // 1000 gives 20x headroom over the real-world maximum and is not configurable + // because tag counts are a linguistics constraint, not a deployment parameter. 
+ private static final int MAX_TAGS_PER_RULE = 1_000; + public static class HeadRulesSerializer implements ArtifactSerializer<HeadRules> { public HeadRules create(InputStream in) throws IOException { @@ -208,7 +214,17 @@ public class HeadRules implements opennlp.tools.parser.HeadRules, GapLabeler, Se String num = st.nextToken(); String type = st.nextToken(); String dir = st.nextToken(); - String[] tags = new String[Integer.parseInt(num) - 2]; + int rawCount; + try { + rawCount = Integer.parseInt(num); + } catch (NumberFormatException e) { + throw new IOException("Invalid tag count in head rules: " + num, e); + } + int numTags = rawCount - 2; + if (numTags < 0 || numTags > MAX_TAGS_PER_RULE) { + throw new IOException("Invalid tag count in head rules: " + num); + } + String[] tags = new String[numTags]; int ti = 0; while (st.hasMoreTokens()) { tags[ti] = st.nextToken(); diff --git a/opennlp-tools/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java b/opennlp-tools/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java index 6083f3517..b655f8f83 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java +++ b/opennlp-tools/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java @@ -62,6 +62,12 @@ import opennlp.tools.util.model.SerializableArtifact; */ public class AncoraSpanishHeadRules implements HeadRules, GapLabeler, SerializableArtifact { + // POS tagsets are fixed by linguistic convention (Penn Treebank: ~45 tags). + // No single head rule will ever list more than a small fraction of the tagset. + // 1000 gives 20x headroom over the real-world maximum and is not configurable + // because tag counts are a linguistics constraint, not a deployment parameter. 
+ private static final int MAX_TAGS_PER_RULE = 1_000; + public static class HeadRulesSerializer implements ArtifactSerializer<AncoraSpanishHeadRules> { public AncoraSpanishHeadRules create(InputStream in) throws IOException { @@ -213,7 +219,17 @@ public class AncoraSpanishHeadRules implements HeadRules, GapLabeler, Serializab String num = st.nextToken(); String type = st.nextToken(); String dir = st.nextToken(); - String[] tags = new String[Integer.parseInt(num) - 2]; + int rawCount; + try { + rawCount = Integer.parseInt(num); + } catch (NumberFormatException e) { + throw new IOException("Invalid tag count in head rules: " + num, e); + } + int numTags = rawCount - 2; + if (numTags < 0 || numTags > MAX_TAGS_PER_RULE) { + throw new IOException("Invalid tag count in head rules: " + num); + } + String[] tags = new String[numTags]; int ti = 0; while (st.hasMoreTokens()) { tags[ti] = st.nextToken(); diff --git a/opennlp-tools/src/test/java/opennlp/tools/parser/lang/en/HeadRulesTest.java b/opennlp-tools/src/test/java/opennlp/tools/parser/lang/en/HeadRulesTest.java index 037ded46f..d57db8541 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/parser/lang/en/HeadRulesTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/parser/lang/en/HeadRulesTest.java @@ -23,6 +23,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStreamWriter; +import java.io.StringReader; import java.nio.charset.StandardCharsets; import org.junit.Assert; @@ -30,6 +31,58 @@ import org.junit.Test; public class HeadRulesTest { + /** + * Positive: a well-formed head rules line with a small tag count loads without error. 
+ */ + @Test + void testValidTagCountLoads() throws IOException { + // "5 NP 1 NN NNS NNP" — num=5, tags=3 (5-2=3) + String rules = "5 NP 1 NN NNS NNP\n"; + Assertions.assertDoesNotThrow(() -> new HeadRules(new StringReader(rules))); + } + + /** + * Negative: a head rules line with a huge tag count must throw IOException, + * not attempt to allocate a String array of roughly Integer.MAX_VALUE elements. + */ + @Test + void testOversizedTagCountThrows() { + String rules = "2147483647 NP 1\n"; + Assertions.assertThrows(IOException.class, + () -> new HeadRules(new StringReader(rules))); + } + + /** + * Negative: a tag count that would produce a negative array size must throw IOException. + */ + @Test + void testNegativeTagCountThrows() { + String rules = "1 NP 1\n"; // 1 - 2 = -1 + Assertions.assertThrows(IOException.class, + () -> new HeadRules(new StringReader(rules))); + } + + /** + * Boundary: value just above MAX_TAGS_PER_RULE (1003 → numTags = 1001) must throw IOException. + */ + @Test + void testJustAboveLimitThrows() { + // 1003 declared; 1003 - 2 = 1001 tags, which exceeds MAX_TAGS_PER_RULE (1000) + String rules = "1003 NP 1\n"; + Assertions.assertThrows(IOException.class, + () -> new HeadRules(new StringReader(rules))); + } + + /** + * Negative: non-numeric token count must throw IOException, not NumberFormatException. + */ + @Test + void testNonNumericTagCountThrows() { + String rules = "NaN NP 1\n"; + Assertions.assertThrows(IOException.class, + () -> new HeadRules(new StringReader(rules))); + } + @Test public void testSerialization() throws IOException { InputStream headRulesIn =
