This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch opennlp-2.x in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit d1e48c89b8b36df81bed485efec8daab33677dac Author: Kristian Rickert <[email protected]> AuthorDate: Fri Jun 12 14:19:48 2026 +0200 OPENNLP-1840: Fix native memory leak and vocabulary NPE in NameFinderDL (#1076) Every find() call leaked the OnnxTensor inputs and the OrtSession.Result for each sentence chunk. Tensors are now closed in a finally block and the result with try-with-resources. Tokens absent from the vocabulary caused an opaque NullPointerException through auto-unboxing. The token-to-id mapping now throws IllegalArgumentException naming the missing token, indicating a vocabulary/model mismatch. (cherry picked from commit aaf1b1fc43d1ededa9ab469befcaa457bc31978c) --- .../java/opennlp/dl/namefinder/NameFinderDL.java | 70 +++++++++++++++------- .../opennlp/dl/namefinder/NameFinderDLTest.java | 60 +++++++++++++++++++ 2 files changed, 110 insertions(+), 20 deletions(-) diff --git a/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java b/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java index 74e5a1aac..f7373700e 100644 --- a/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java +++ b/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java @@ -130,21 +130,30 @@ public class NameFinderDL extends AbstractDL implements TokenNameFinder { // The inputs to the ONNX model. 
final Map<String, OnnxTensor> inputs = new HashMap<>(); - inputs.put(INPUT_IDS, OnnxTensor.createTensor(env, LongBuffer.wrap(tokens.ids()), - new long[] {1, tokens.ids().length})); - if (inferenceOptions.isIncludeAttentionMask()) { - inputs.put(ATTENTION_MASK, OnnxTensor.createTensor(env, - LongBuffer.wrap(tokens.mask()), new long[] {1, tokens.mask().length})); - } + final float[][][] v; + try { + inputs.put(INPUT_IDS, OnnxTensor.createTensor(env, LongBuffer.wrap(tokens.ids()), + new long[] {1, tokens.ids().length})); - if (inferenceOptions.isIncludeTokenTypeIds()) { - inputs.put(TOKEN_TYPE_IDS, OnnxTensor.createTensor(env, - LongBuffer.wrap(tokens.types()), new long[] {1, tokens.types().length})); - } + if (inferenceOptions.isIncludeAttentionMask()) { + inputs.put(ATTENTION_MASK, OnnxTensor.createTensor(env, + LongBuffer.wrap(tokens.mask()), new long[] {1, tokens.mask().length})); + } - // The outputs from the model. - final float[][][] v = (float[][][]) session.run(inputs).get(0).getValue(); + if (inferenceOptions.isIncludeTokenTypeIds()) { + inputs.put(TOKEN_TYPE_IDS, OnnxTensor.createTensor(env, + LongBuffer.wrap(tokens.types()), new long[] {1, tokens.types().length})); + } + + // The outputs from the model. + try (OrtSession.Result result = session.run(inputs)) { + // getValue() copies the tensor into Java arrays, so the result can be closed safely. + v = (float[][][]) result.get(0).getValue(); + } + } finally { + inputs.values().forEach(OnnxTensor::close); + } // Find consecutive B-PER and I-PER labels and combine the spans where necessary. // There are also B-LOC and I-LOC tags for locations that might be useful at some point. @@ -376,13 +385,7 @@ public class NameFinderDL extends AbstractDL implements TokenNameFinder { // Now we can tokenize the group and continue. 
final String[] tokens = tokenizer.tokenize(group); - final int[] ids = new int[tokens.length]; - - for (int x = 0; x < tokens.length; x++) { - ids[x] = vocab.get(tokens[x]); - } - - final long[] lids = Arrays.stream(ids).mapToLong(i -> i).toArray(); + final long[] ids = tokenIds(tokens, vocab); final long[] mask = new long[ids.length]; Arrays.fill(mask, 1); @@ -390,7 +393,7 @@ public class NameFinderDL extends AbstractDL implements TokenNameFinder { final long[] types = new long[ids.length]; Arrays.fill(types, 0); - t.add(new Tokens(tokens, lids, mask, types)); + t.add(new Tokens(tokens, ids, mask, types)); } @@ -398,4 +401,31 @@ public class NameFinderDL extends AbstractDL implements TokenNameFinder { } + /** + * Maps tokens to their vocabulary ids. + * + * @param tokens The tokens to map. + * @param vocab The vocabulary map. + * @return The token ids. + * + * @throws IllegalArgumentException Thrown if a token is not present in the + * vocabulary. + */ + static long[] tokenIds(final String[] tokens, final Map<String, Integer> vocab) { + + final long[] ids = new long[tokens.length]; + + for (int x = 0; x < tokens.length; x++) { + final Integer id = vocab.get(tokens[x]); + if (id == null) { + throw new IllegalArgumentException("Token '" + tokens[x] + + "' is not present in the vocabulary; the vocabulary file does not match the model."); + } + ids[x] = id; + } + + return ids; + + } + } diff --git a/opennlp-dl/src/test/java/opennlp/dl/namefinder/NameFinderDLTest.java b/opennlp-dl/src/test/java/opennlp/dl/namefinder/NameFinderDLTest.java new file mode 100644 index 000000000..87fe18c9b --- /dev/null +++ b/opennlp-dl/src/test/java/opennlp/dl/namefinder/NameFinderDLTest.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.dl.namefinder; + +import java.util.HashMap; +import java.util.Map; + +import org.junit.jupiter.api.Test; + +import opennlp.tools.tokenize.WordpieceTokenizer; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class NameFinderDLTest { + + private static Map<String, Integer> vocab() { + final Map<String, Integer> vocab = new HashMap<>(); + vocab.put(WordpieceTokenizer.BERT_CLS_TOKEN, 0); + vocab.put(WordpieceTokenizer.BERT_SEP_TOKEN, 1); + vocab.put(WordpieceTokenizer.BERT_UNK_TOKEN, 2); + vocab.put("hello", 3); + vocab.put("world", 4); + return vocab; + } + + @Test + void testTokenIdsMapsTokensToVocabularyIds() { + final long[] ids = NameFinderDL.tokenIds( + new String[] {WordpieceTokenizer.BERT_CLS_TOKEN, "hello", "world", + WordpieceTokenizer.BERT_SEP_TOKEN}, vocab()); + + assertArrayEquals(new long[] {0, 3, 4, 1}, ids); + } + + @Test + void testTokenIdsRejectsTokensMissingFromVocabulary() { + final IllegalArgumentException e = assertThrows(IllegalArgumentException.class, () -> + NameFinderDL.tokenIds(new String[] {"hello", "missing"}, vocab())); + + assertTrue(e.getMessage().contains("missing"), + "the error message should name the missing token: " + e.getMessage()); + } +}
