This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/main by this push:
new b6af87589 OPENNLP-1839 : Fix native memory leak and vocabulary NPE in
DocumentCategorizerDL (#1074)
b6af87589 is described below
commit b6af87589770ca8c39ef94d2726a21cad6d9f32f
Author: Kristian Rickert <[email protected]>
AuthorDate: Fri Jun 12 08:04:24 2026 -0400
OPENNLP-1839 : Fix native memory leak and vocabulary NPE in
DocumentCategorizerDL (#1074)
Every categorize() call leaked the OnnxTensor inputs and the
OrtSession.Result for each document chunk. The tensors are now closed in a
finally block, and the result is closed via try-with-resources.
Tokens absent from the vocabulary caused an opaque NullPointerException
through auto-unboxing, which the broad catch in categorize() swallowed.
The token-to-id mapping now throws IllegalArgumentException naming the
missing token, indicating a vocabulary/model mismatch.
---
.../opennlp/dl/doccat/DocumentCategorizerDL.java | 75 +++++++++++++++-------
.../dl/doccat/DocumentCategorizerDLTest.java | 60 +++++++++++++++++
2 files changed, 112 insertions(+), 23 deletions(-)
diff --git a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java
index a0c9ede77..f02dd875b 100644
--- a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java
+++ b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java
@@ -145,23 +145,31 @@ public class DocumentCategorizerDL extends AbstractDL implements DocumentCategor
final Map<String, OnnxTensor> inputs = new HashMap<>();
- inputs.put(INPUT_IDS, OnnxTensor.createTensor(env,
- LongBuffer.wrap(t.ids()), new long[] {1, t.ids().length}));
-
- if (inferenceOptions.isIncludeAttentionMask()) {
- inputs.put(ATTENTION_MASK, OnnxTensor.createTensor(env,
- LongBuffer.wrap(t.mask()), new long[] {1, t.mask().length}));
- }
-
- if (inferenceOptions.isIncludeTokenTypeIds()) {
- inputs.put(TOKEN_TYPE_IDS, OnnxTensor.createTensor(env,
- LongBuffer.wrap(t.types()), new long[] {1, t.types().length}));
+ final Object output;
+ try {
+ inputs.put(INPUT_IDS, OnnxTensor.createTensor(env,
+ LongBuffer.wrap(t.ids()), new long[] {1, t.ids().length}));
+
+ if (inferenceOptions.isIncludeAttentionMask()) {
+ inputs.put(ATTENTION_MASK, OnnxTensor.createTensor(env,
+ LongBuffer.wrap(t.mask()), new long[] {1, t.mask().length}));
+ }
+
+ if (inferenceOptions.isIncludeTokenTypeIds()) {
+ inputs.put(TOKEN_TYPE_IDS, OnnxTensor.createTensor(env,
+ LongBuffer.wrap(t.types()), new long[] {1, t.types().length}));
+ }
+
+ // The outputs from the model. Some models return a 2D array (e.g. BERT),
+ // while others return a 1D array (e.g. RoBERTa).
+ try (OrtSession.Result result = session.run(inputs)) {
+ // getValue() copies the tensor into Java arrays, so the result can be closed safely.
+ output = result.get(0).getValue();
+ }
+ } finally {
+ inputs.values().forEach(OnnxTensor::close);
}
- // The outputs from the model. Some models return a 2D array (e.g. BERT),
- // while others return a 1D array (e.g. RoBERTa).
- final Object output = session.run(inputs).get(0).getValue();
-
final float[] rawScores;
if (output instanceof float[][] v) {
rawScores = v[0];
@@ -300,13 +308,7 @@ public class DocumentCategorizerDL extends AbstractDL implements DocumentCategor
// Now we can tokenize the group and continue.
final String[] tokens = tokenizer.tokenize(group);
- final int[] ids = new int[tokens.length];
-
- for (int x = 0; x < tokens.length; x++) {
- ids[x] = vocab.get(tokens[x]);
- }
-
- final long[] lids = Arrays.stream(ids).mapToLong(i -> i).toArray();
+ final long[] ids = tokenIds(tokens, vocab);
final long[] mask = new long[ids.length];
Arrays.fill(mask, 1);
@@ -314,7 +316,7 @@ public class DocumentCategorizerDL extends AbstractDL implements DocumentCategor
final long[] types = new long[ids.length];
Arrays.fill(types, 0);
- t.add(new Tokens(tokens, lids, mask, types));
+ t.add(new Tokens(tokens, ids, mask, types));
}
@@ -322,6 +324,33 @@ public class DocumentCategorizerDL extends AbstractDL implements DocumentCategor
}
+ /**
+ * Maps tokens to their vocabulary ids.
+ *
+ * @param tokens The tokens to map.
+ * @param vocab The vocabulary map.
+ * @return The token ids.
+ *
+ * @throws IllegalArgumentException Thrown if a token is not present in the
+ * vocabulary.
+ */
+ static long[] tokenIds(final String[] tokens, final Map<String, Integer> vocab) {
+
+ final long[] ids = new long[tokens.length];
+
+ for (int x = 0; x < tokens.length; x++) {
+ final Integer id = vocab.get(tokens[x]);
+ if (id == null) {
+ throw new IllegalArgumentException("Token '" + tokens[x]
+ + "' is not present in the vocabulary; the vocabulary file does not match the model.");
+ }
+ ids[x] = id;
+ }
+
+ return ids;
+
+ }
+
/**
* Applies softmax to an array of values.
*
diff --git
a/opennlp-core/opennlp-ml/opennlp-dl/src/test/java/opennlp/dl/doccat/DocumentCategorizerDLTest.java
b/opennlp-core/opennlp-ml/opennlp-dl/src/test/java/opennlp/dl/doccat/DocumentCategorizerDLTest.java
new file mode 100644
index 000000000..a6bab39f6
--- /dev/null
+++
b/opennlp-core/opennlp-ml/opennlp-dl/src/test/java/opennlp/dl/doccat/DocumentCategorizerDLTest.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.dl.doccat;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.tokenize.WordpieceTokenizer;
+
+import static org.junit.jupiter.api.Assertions.assertArrayEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class DocumentCategorizerDLTest {
+
+ private static Map<String, Integer> vocab() {
+ final Map<String, Integer> vocab = new HashMap<>();
+ vocab.put(WordpieceTokenizer.BERT_CLS_TOKEN, 0);
+ vocab.put(WordpieceTokenizer.BERT_SEP_TOKEN, 1);
+ vocab.put(WordpieceTokenizer.BERT_UNK_TOKEN, 2);
+ vocab.put("hello", 3);
+ vocab.put("world", 4);
+ return vocab;
+ }
+
+ @Test
+ void testTokenIdsMapsTokensToVocabularyIds() {
+ final long[] ids = DocumentCategorizerDL.tokenIds(
+ new String[] {WordpieceTokenizer.BERT_CLS_TOKEN, "hello", "world",
+ WordpieceTokenizer.BERT_SEP_TOKEN}, vocab());
+
+ assertArrayEquals(new long[] {0, 3, 4, 1}, ids);
+ }
+
+ @Test
+ void testTokenIdsRejectsTokensMissingFromVocabulary() {
+ final IllegalArgumentException e = assertThrows(IllegalArgumentException.class, () ->
+ DocumentCategorizerDL.tokenIds(new String[] {"hello", "missing"}, vocab()));
+
+ assertTrue(e.getMessage().contains("missing"),
+ "the error message should name the missing token: " + e.getMessage());
+ }
+}