(opennlp) branch main updated: OPENNLP-1837: Add BertTokenizer with BERT basic tokenization (#1073)

mawiesne Fri, 12 Jun 2026 04:52:52 -0700

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git



The following commit(s) were added to refs/heads/main by this push:
     new e7e1189b4 OPENNLP-1837: Add BertTokenizer with BERT basic tokenization (#1073)
e7e1189b4 is described below

commit e7e1189b4350b18e850557b5034209d6058a9124
Author: Kristian Rickert <[email protected]>
AuthorDate: Fri Jun 12 07:52:31 2026 -0400

    OPENNLP-1837: Add BertTokenizer with BERT basic tokenization (#1073)
    
    * OPENNLP-1837 - Add BertTokenizer with BERT basic tokenization
    
    WordpieceTokenizer performs only the wordpiece stage, so uncased models
    map every capitalized or accented word to the unknown token. The new
    BertTokenizer adds the missing normalization stage: control character
    cleanup, whitespace normalization, CJK isolation, optional lower casing
    with accent stripping, and per-character punctuation splitting.
    
    Also fixes three WordpieceTokenizer defects: punctuation runs were
    split as one token, partially matched words emitted prefix pieces
    instead of a single unknown token, and tokenizePos returned null.
    
    * Potential fix for pull request finding
    
    LGTM
    
    Co-authored-by: Copilot Autofix powered by AI <[email protected]>
    
    * OPENNLP-1837 - Address review comments
    
    Treat all C* categories as control characters matching the reference
    implementation, hoist shared character predicates into a package-private
    BertNormalization helper, validate constructor arguments, and document
    the WordpieceTokenizer behavior changes.
    
    ---------
    
    Co-authored-by: Copilot Autofix powered by AI <[email protected]>
---
 .../opennlp/tools/tokenize/BertNormalization.java  | 107 +++++++++++
 .../java/opennlp/tools/tokenize/BertTokenizer.java | 211 +++++++++++++++++++++
 .../opennlp/tools/tokenize/WordpieceTokenizer.java |  91 +++++++--
 .../opennlp/tools/tokenize/BertTokenizerTest.java  | 162 ++++++++++++++++
 .../tools/tokenize/WordpieceTokenizerTest.java     |  50 +++++
 5 files changed, 605 insertions(+), 16 deletions(-)

diff --git a/opennlp-api/src/main/java/opennlp/tools/tokenize/BertNormalization.java b/opennlp-api/src/main/java/opennlp/tools/tokenize/BertNormalization.java
new file mode 100644
index 000000000..455f67f11
--- /dev/null
+++ b/opennlp-api/src/main/java/opennlp/tools/tokenize/BertNormalization.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+/**
+ * Character classifications and text transforms of the reference BERT
+ * {@code BasicTokenizer}, shared by {@link BertTokenizer} and
+ * {@link WordpieceTokenizer}.
+ */
+final class BertNormalization {
+
+  private BertNormalization() {
+  }
+
+  /**
+   * Surrounds every punctuation character with spaces, so each punctuation
+   * character becomes its own token.
+   */
+  static String isolatePunctuation(String text) {
+    final StringBuilder spaced = new StringBuilder(text.length());
+    text.codePoints().forEach(codePoint -> {
+      if (isPunctuation(codePoint)) {
+        spaced.append(' ').appendCodePoint(codePoint).append(' ');
+      } else {
+        spaced.appendCodePoint(codePoint);
+      }
+    });
+    return spaced.toString();
+  }
+
+  /**
+   * A control character in the BERT sense: any {@code C*} category
+   * (control, format, surrogate, private use, unassigned), except the
+   * characters treated as whitespace by {@link #isWhitespace(int)}.
+   */
+  static boolean isControl(int codePoint) {
+    if (codePoint == '\t' || codePoint == '\n' || codePoint == '\r') {
+      return false;
+    }
+    return switch (Character.getType(codePoint)) {
+      case Character.CONTROL, Character.FORMAT, Character.SURROGATE,
+           Character.PRIVATE_USE, Character.UNASSIGNED -> true;
+      default -> false;
+    };
+  }
+
+  /**
+   * A whitespace character in the BERT sense: space, tab, newline, carriage
+   * return, or Unicode space separators ({@code Zs}).
+   */
+  static boolean isWhitespace(int codePoint) {
+    if (codePoint == ' ' || codePoint == '\t' || codePoint == '\n' || codePoint == '\r') {
+      return true;
+    }
+    return Character.getType(codePoint) == Character.SPACE_SEPARATOR;
+  }
+
+  /**
+   * A punctuation character in the BERT sense: any non-alphanumeric ASCII
+   * character that is not whitespace, or any Unicode punctuation category.
+   */
+  static boolean isPunctuation(int codePoint) {
+    if ((codePoint >= 33 && codePoint <= 47) || (codePoint >= 58 && codePoint <= 64)
+        || (codePoint >= 91 && codePoint <= 96) || (codePoint >= 123 && codePoint <= 126)) {
+      return true;
+    }
+    return switch (Character.getType(codePoint)) {
+      case Character.CONNECTOR_PUNCTUATION, Character.DASH_PUNCTUATION,
+           Character.START_PUNCTUATION, Character.END_PUNCTUATION,
+           Character.INITIAL_QUOTE_PUNCTUATION, Character.FINAL_QUOTE_PUNCTUATION,
+           Character.OTHER_PUNCTUATION -> true;
+      default -> false;
+    };
+  }
+
+  /**
+   * A CJK ideograph as defined by the reference BERT implementation: the CJK
+   * Unified Ideographs blocks and their extensions. This intentionally does
+   * not cover Japanese kana or Korean hangul, matching the reference.
+   */
+  static boolean isCjk(int codePoint) {
+    return (codePoint >= 0x4E00 && codePoint <= 0x9FFF)
+        || (codePoint >= 0x3400 && codePoint <= 0x4DBF)
+        || (codePoint >= 0x20000 && codePoint <= 0x2A6DF)
+        || (codePoint >= 0x2A700 && codePoint <= 0x2B73F)
+        || (codePoint >= 0x2B740 && codePoint <= 0x2B81F)
+        || (codePoint >= 0x2B820 && codePoint <= 0x2CEAF)
+        || (codePoint >= 0xF900 && codePoint <= 0xFAFF)
+        || (codePoint >= 0x2F800 && codePoint <= 0x2FA1F);
+  }
+
+}
diff --git a/opennlp-api/src/main/java/opennlp/tools/tokenize/BertTokenizer.java b/opennlp-api/src/main/java/opennlp/tools/tokenize/BertTokenizer.java
new file mode 100644
index 000000000..5517548a4
--- /dev/null
+++ b/opennlp-api/src/main/java/opennlp/tools/tokenize/BertTokenizer.java
@@ -0,0 +1,211 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.text.Normalizer;
+import java.util.Locale;
+import java.util.Objects;
+import java.util.Set;
+
+import opennlp.tools.util.Span;
+
+/**
+ * A {@link Tokenizer} implementation of the full BERT tokenization pipeline:
+ * basic tokenization (text normalization) followed by wordpiece tokenization.
+ * <p>
+ * The basic tokenization stage reproduces the reference BERT
+ * {@code BasicTokenizer}:
+ * <ol>
+ *  <li>Removal of control characters and normalization of all whitespace
+ *      to single spaces.</li>
+ *  <li>Whitespace isolation of CJK ideographs.</li>
+ *  <li>For <i>uncased</i> models: lower casing and accent stripping
+ *      (Unicode NFD decomposition with removal of combining marks).</li>
+ *  <li>Isolation of every punctuation character as its own token.</li>
+ * </ol>
+ * The normalized text is then split into subwords by a
+ * {@link WordpieceTokenizer} sharing the same vocabulary and special tokens.
+ * <p>
+ * This pipeline is required for correct results with BERT-style models:
+ * feeding raw text directly to {@link WordpieceTokenizer} maps every token
+ * that does not literally appear in the vocabulary - for uncased models that
+ * includes every capitalized word - to the unknown token.
+ * <p>
+ * Whether to use the lower casing variant is a property of the model: uncased
+ * models (for example {@code bert-base-uncased} and the
+ * {@code sentence-transformers} models derived from it) require it, cased
+ * models must not use it. Accent stripping is coupled to lower casing, as in
+ * the reference implementation's default ({@code strip_accents} follows
+ * {@code do_lower_case} unless overridden).
+ * <p>
+ * For reference see:
+ * <ul>
+ *  <li><a href="https://github.com/google-research/bert">
+ *    https://github.com/google-research/bert</a> ({@code tokenization.py})</li>
+ * </ul>
+ *
+ * @see WordpieceTokenizer
+ */
+public class BertTokenizer implements Tokenizer {
+
+  /**
+   * Maximum characters per word before the word is replaced with the unknown
+   * token, matching the reference BERT implementation.
+   */
+  private static final int MAX_WORD_CHARACTERS = 100;
+
+  private final WordpieceTokenizer wordpieceTokenizer;
+  private final boolean lowerCase;
+
+  /**
+   * Initializes a {@link BertTokenizer} for an <i>uncased</i> BERT model,
+   * with lower casing and accent stripping enabled.
+   *
+   * @param vocabulary The wordpiece vocabulary. Must not be {@code null}.
+   */
+  public BertTokenizer(Set<String> vocabulary) {
+    this(vocabulary, true);
+  }
+
+  /**
+   * Initializes a {@link BertTokenizer} with BERT special tokens.
+   *
+   * @param vocabulary The wordpiece vocabulary. Must not be {@code null}.
+   * @param lowerCase  {@code true} for uncased models (lower casing and accent
+   *                   stripping), {@code false} for cased models.
+   */
+  public BertTokenizer(Set<String> vocabulary, boolean lowerCase) {
+    this(vocabulary, lowerCase, WordpieceTokenizer.BERT_CLS_TOKEN,
+        WordpieceTokenizer.BERT_SEP_TOKEN, WordpieceTokenizer.BERT_UNK_TOKEN);
+  }
+
+  /**
+   * Initializes a {@link BertTokenizer} with custom special tokens, for models
+   * like RoBERTa that do not use the BERT defaults.
+   *
+   * @param vocabulary          The wordpiece vocabulary. Must not be {@code null}.
+   * @param lowerCase           {@code true} for uncased models (lower casing and
+   *                            accent stripping), {@code false} for cased models.
+   * @param classificationToken The CLS token.
+   * @param separatorToken      The SEP token.
+   * @param unknownToken        The UNK token.
+   */
+  public BertTokenizer(Set<String> vocabulary, boolean lowerCase,
+      String classificationToken, String separatorToken, String unknownToken) {
+    Objects.requireNonNull(vocabulary, "vocabulary must not be null");
+    Objects.requireNonNull(classificationToken, "classificationToken must not be null");
+    Objects.requireNonNull(separatorToken, "separatorToken must not be null");
+    Objects.requireNonNull(unknownToken, "unknownToken must not be null");
+    this.wordpieceTokenizer = new WordpieceTokenizer(vocabulary,
+        classificationToken, separatorToken, unknownToken, MAX_WORD_CHARACTERS);
+    this.lowerCase = lowerCase;
+  }
+
+  /**
+   * Tokenizes the given text into wordpieces, surrounded by the classification
+   * and separator tokens.
+   *
+   * @param text The text to tokenize. Must not be {@code null}.
+   *
+   * @return The wordpiece tokens.
+   */
+  @Override
+  public String[] tokenize(String text) {
+    return wordpieceTokenizer.tokenize(normalize(text));
+  }
+
+  /**
+   * Not supported: wordpiece tokens (subwords, {@code ##} continuations and
+   * special tokens) have no faithful character spans in the original text.
+   *
+   * @throws UnsupportedOperationException Always.
+   */
+  @Override
+  public Span[] tokenizePos(String text) {
+    throw new UnsupportedOperationException(
+        "Wordpiece tokens cannot be mapped to character spans of the original text");
+  }
+
+  /**
+   * Applies the BERT basic tokenization (normalization) stage.
+   *
+   * @param text The text to normalize. Must not be {@code null}.
+   *
+   * @return The normalized text, ready for wordpiece tokenization.
+   */
+  String normalize(String text) {
+    Objects.requireNonNull(text, "text must not be null");
+    String normalized = cleanText(text);
+    normalized = isolateCjkCharacters(normalized);
+    if (lowerCase) {
+      normalized = stripAccents(normalized.toLowerCase(Locale.ROOT));
+    }
+    return BertNormalization.isolatePunctuation(normalized);
+  }
+
+  /**
+   * Removes invalid and control characters and normalizes all whitespace
+   * characters to plain spaces.
+   */
+  private static String cleanText(String text) {
+    final StringBuilder cleaned = new StringBuilder(text.length());
+    text.codePoints().forEach(codePoint -> {
+      if (codePoint == 0 || codePoint == 0xFFFD || BertNormalization.isControl(codePoint)) {
+        return;
+      }
+      if (BertNormalization.isWhitespace(codePoint)) {
+        cleaned.append(' ');
+      } else {
+        cleaned.appendCodePoint(codePoint);
+      }
+    });
+    return cleaned.toString();
+  }
+
+  /**
+   * Surrounds every CJK ideograph with spaces, so each ideograph becomes its
+   * own token, matching the reference BERT treatment of Chinese text.
+   */
+  private static String isolateCjkCharacters(String text) {
+    final StringBuilder spaced = new StringBuilder(text.length());
+    text.codePoints().forEach(codePoint -> {
+      if (BertNormalization.isCjk(codePoint)) {
+        spaced.append(' ').appendCodePoint(codePoint).append(' ');
+      } else {
+        spaced.appendCodePoint(codePoint);
+      }
+    });
+    return spaced.toString();
+  }
+
+  /**
+   * Removes accents by Unicode NFD decomposition followed by removal of
+   * combining marks ({@code Mn}).
+   */
+  private static String stripAccents(String text) {
+    final String decomposed = Normalizer.normalize(text, Normalizer.Form.NFD);
+    final StringBuilder stripped = new StringBuilder(decomposed.length());
+    decomposed.codePoints().forEach(codePoint -> {
+      if (Character.getType(codePoint) != Character.NON_SPACING_MARK) {
+        stripped.appendCodePoint(codePoint);
+      }
+    });
+    return stripped.toString();
+  }
+
+}
diff --git a/opennlp-api/src/main/java/opennlp/tools/tokenize/WordpieceTokenizer.java b/opennlp-api/src/main/java/opennlp/tools/tokenize/WordpieceTokenizer.java
index 4d92c86bd..97e240e87 100644
--- a/opennlp-api/src/main/java/opennlp/tools/tokenize/WordpieceTokenizer.java
+++ b/opennlp-api/src/main/java/opennlp/tools/tokenize/WordpieceTokenizer.java
@@ -20,7 +20,6 @@ package opennlp.tools.tokenize;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Set;
-import java.util.regex.Pattern;
 
 import opennlp.tools.util.Span;
 
@@ -31,6 +30,22 @@ import opennlp.tools.util.Span;
  * Adapted under MIT license from
  * <a href="https://github.com/robrua/easy-bert">https://github.com/robrua/easy-bert</a>.
  * <p>
+ * Note that this tokenizer performs <i>only</i> the wordpiece (subword) stage
+ * of BERT tokenization. It does not normalize the input text: no lower casing,
+ * no accent stripping, no control character removal. Text that does not match
+ * the vocabulary's casing - for uncased models that includes every capitalized
+ * word - is mapped to the unknown token. Use {@link BertTokenizer} for the
+ * full BERT tokenization pipeline.
+ * <p>
+ * As of OpenNLP 3.0.0 the behavior matches the reference BERT wordpiece
+ * implementation in three respects that differ from earlier releases:
+ * runs of punctuation (and non-ASCII punctuation) are split into individual
+ * single-character tokens, words that cannot be fully represented by
+ * vocabulary pieces become a single unknown token instead of the matched
+ * prefix pieces followed by the unknown token, and {@link #tokenizePos(String)}
+ * throws {@link UnsupportedOperationException} instead of returning
+ * {@code null}.
+ * <p>
  * For reference see:
  * <ul>
  *  <li>
@@ -42,6 +57,8 @@ import opennlp.tools.util.Span;
  *    https://cran.r-project.org/web/packages/wordpiece/vignettes/basic_usage.html</a>
  *  </li>
  * </ul>
+ *
+ * @see BertTokenizer
  */
 public class WordpieceTokenizer implements Tokenizer {
 
@@ -59,9 +76,6 @@ public class WordpieceTokenizer implements Tokenizer {
   /** RoBERTa unknown token. */
   public static final String ROBERTA_UNK_TOKEN = "<unk>";
 
-  private static final Pattern PUNCTUATION_PATTERN =
-      Pattern.compile("\\p{Punct}+");
-
   private final Set<String> vocabulary;
   private final String classificationToken;
   private final String separatorToken;
@@ -87,7 +101,7 @@ public class WordpieceTokenizer implements Tokenizer {
    */
   public WordpieceTokenizer(Set<String> vocabulary, int maxTokenLength) {
     this(vocabulary);
-    this.maxTokenLength = maxTokenLength;
+    this.maxTokenLength = requireNonNegative(maxTokenLength);
   }
 
   /**
@@ -113,10 +127,44 @@ public class WordpieceTokenizer implements Tokenizer {
     this.unknownToken = unknownToken;
   }
 
+  /**
+   * Initializes a {@link WordpieceTokenizer} with a {@code vocabulary},
+   * custom special tokens and a custom {@code maxTokenLength}.
+   *
+   * @param vocabulary          The vocabulary.
+   * @param classificationToken The CLS token.
+   * @param separatorToken      The SEP token.
+   * @param unknownToken        The UNK token.
+   * @param maxTokenLength      A non-negative number that is used as maximum token length.
+   */
+  public WordpieceTokenizer(
+      final Set<String> vocabulary,
+      final String classificationToken,
+      final String separatorToken,
+      final String unknownToken,
+      final int maxTokenLength) {
+    this(vocabulary, classificationToken, separatorToken, unknownToken);
+    this.maxTokenLength = requireNonNegative(maxTokenLength);
+  }
+
+  private static int requireNonNegative(final int maxTokenLength) {
+    if (maxTokenLength < 0) {
+      throw new IllegalArgumentException(
+          "maxTokenLength must be non-negative: " + maxTokenLength);
+    }
+    return maxTokenLength;
+  }
+
+  /**
+   * Not supported: wordpiece tokens (subwords, {@code ##} continuations and
+   * special tokens) have no faithful character spans in the original text.
+   *
+   * @throws UnsupportedOperationException Always.
+   */
   @Override
   public Span[] tokenizePos(final String text) {
-    // TODO: Implement this.
-    return null;
+    throw new UnsupportedOperationException(
+        "Wordpiece tokens cannot be mapped to character spans of the original text");
   }
 
   @Override
@@ -125,8 +173,9 @@ public class WordpieceTokenizer implements Tokenizer {
     final List<String> tokens = new LinkedList<>();
     tokens.add(classificationToken);
 
-    // Put spaces around punctuation.
-    final String spacedPunctuation = PUNCTUATION_PATTERN.matcher(text).replaceAll(" $0 ");
+    // Isolate each punctuation character as its own token, as the reference
+    // BERT tokenization does. Runs of punctuation become individual tokens.
+    final String spacedPunctuation = BertNormalization.isolatePunctuation(text);
 
     // Split based on whitespace.
     final String[] split = WhitespaceTokenizer.INSTANCE.tokenize(spacedPunctuation);
@@ -134,22 +183,27 @@ public class WordpieceTokenizer implements Tokenizer {
     // For each resulting word, if the word is found in the WordPiece vocabulary, keep it as-is.
     // If not, starting from the beginning, pull off the biggest piece that is in the vocabulary,
     // and prefix "##" to the remaining piece. Repeat until the entire word is represented by
-    // pieces from the vocabulary, if possible.
+    // pieces from the vocabulary. If the word cannot be fully represented, the whole word
+    // becomes a single unknown token, as in the reference BERT implementation.
     for (final String token : split) {
 
       final char[] characters = token.toCharArray();
 
       if (characters.length <= maxTokenLength) {
 
+        // The pieces of this word. Only added to the result if the whole word matches.
+        final List<String> wordPieces = new LinkedList<>();
+
         // To start, the substring is the whole token.
         int start = 0;
         int end;
+        boolean found = true;
 
         // Look at the token from the start.
         while (start < characters.length) {
 
           end = characters.length;
-          boolean found = false;
+          found = false;
 
           // Look at the token from the end until the end is equal to the start.
           while (start < end) {
@@ -165,8 +219,8 @@ public class WordpieceTokenizer implements Tokenizer {
             // See if the substring is in the vocabulary.
             if (vocabulary.contains(substring)) {
 
-              // It is in the vocabulary so add it to the list of tokens.
-              tokens.add(substring);
+              // It is in the vocabulary so add it to the pieces of this word.
+              wordPieces.add(substring);
 
               // Next time we can pick up where we left off.
               start = end;
@@ -181,10 +235,9 @@ public class WordpieceTokenizer implements Tokenizer {
 
           }
 
-          // If the word can't be represented by vocabulary pieces replace
-          // it with a specified "unknown" token.
+          // A part of the word is not representable by vocabulary pieces, so the
+          // whole word is replaced with the unknown token.
           if (!found) {
-            tokens.add(unknownToken);
             break;
           }
 
@@ -193,6 +246,12 @@ public class WordpieceTokenizer implements Tokenizer {
 
         }
 
+        if (found) {
+          tokens.addAll(wordPieces);
+        } else {
+          tokens.add(unknownToken);
+        }
+
       } else {
 
         // If the token's length is greater than the max length just add unknown token instead.
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BertTokenizerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BertTokenizerTest.java
new file mode 100644
index 000000000..d8f706f4e
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/BertTokenizerTest.java
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.util.Set;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests {@link BertTokenizer}.
+ * <p>
+ * All expected token sequences in this test were generated with the HuggingFace
+ * {@code tokenizers} reference implementation ({@code BertWordPieceTokenizer})
+ * using the same vocabulary, so they are verified to be identical to the
+ * reference BERT tokenization.
+ */
+public class BertTokenizerTest {
+
+  private static final Set<String> VOCABULARY = Set.of(
+      "the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog",
+      "em", "##bed", "##ding", "##s",
+      "wurttemberg", "strasse", "grosse",
+      "don", "t", "wait", "what", ".", ",", "?", "!", "'",
+      "\u6211", "\u7231",  // CJK: 我 爱
+      "natural", "language", "processing");
+
+  @Test
+  void testLowerCasesCapitalizedWords() {
+    final Tokenizer tokenizer = new BertTokenizer(VOCABULARY);
+    final String[] tokens = tokenizer.tokenize("The quick brown fox jumps over the lazy dog.");
+
+    final String[] expected = {"[CLS]", "the", "quick", "brown", "fox", "jumps", "over",
+        "the", "lazy", "dog", ".", "[SEP]"};
+    Assertions.assertArrayEquals(expected, tokens);
+  }
+
+  @Test
+  void testLowerCasesBeforeWordpieceSplitting() {
+    final Tokenizer tokenizer = new BertTokenizer(VOCABULARY);
+    final String[] tokens = tokenizer.tokenize("Embeddings");
+
+    final String[] expected = {"[CLS]", "em", "##bed", "##ding", "##s", "[SEP]"};
+    Assertions.assertArrayEquals(expected, tokens);
+  }
+
+  @Test
+  void testStripsAccentsButKeepsNonCombiningCharacters() {
+    final Tokenizer tokenizer = new BertTokenizer(VOCABULARY);
+    // ü decomposes to u + combining diaeresis and the mark is stripped;
+    // ß is not a combining mark and must survive, leaving an OOV token.
+    final String[] tokens = tokenizer.tokenize("W\u00fcrttemberg Stra\u00dfe");
+
+    final String[] expected = {"[CLS]", "wurttemberg", "[UNK]", "[SEP]"};
+    Assertions.assertArrayEquals(expected, tokens);
+  }
+
+  @Test
+  void testSplitsPunctuationRunsIntoSingleCharacters() {
+    final Tokenizer tokenizer = new BertTokenizer(VOCABULARY);
+    final String[] tokens = tokenizer.tokenize("Wait... what?!");
+
+    final String[] expected = {"[CLS]", "wait", ".", ".", ".", "what", "?", "!", "[SEP]"};
+    Assertions.assertArrayEquals(expected, tokens);
+  }
+
+  @Test
+  void testSplitsApostrophesAsPunctuation() {
+    final Tokenizer tokenizer = new BertTokenizer(VOCABULARY);
+    final String[] tokens = tokenizer.tokenize("don't");
+
+    final String[] expected = {"[CLS]", "don", "'", "t", "[SEP]"};
+    Assertions.assertArrayEquals(expected, tokens);
+  }
+
+  @Test
+  void testIsolatesCjkIdeographs() {
+    final Tokenizer tokenizer = new BertTokenizer(VOCABULARY);
+    final String[] tokens = tokenizer.tokenize("\u6211\u7231natural language processing");
+
+    final String[] expected = {"[CLS]", "\u6211", "\u7231", "natural", "language",
+        "processing", "[SEP]"};
+    Assertions.assertArrayEquals(expected, tokens);
+  }
+
+  @Test
+  void testCleansControlCharactersAndNormalizesWhitespace() {
+    final Tokenizer tokenizer = new BertTokenizer(VOCABULARY);
+    // Tab and no-break space are whitespace; the NUL character is removed,
+    // joining "brown" and "fox" into one out-of-vocabulary token.
+    final String[] tokens = tokenizer.tokenize("the\tquick\u00a0brown\u0000fox");
+
+    final String[] expected = {"[CLS]", "the", "quick", "[UNK]", "[SEP]"};
+    Assertions.assertArrayEquals(expected, tokens);
+  }
+
+  @Test
+  void testRemovesPrivateUseAndUnassignedCharacters() {
+    final Tokenizer tokenizer = new BertTokenizer(VOCABULARY);
+    // The reference implementation treats all C* categories as control
+    // characters: private use (U+E000, Co) and noncharacters (U+FDD0, Cn)
+    // are removed, joining the surrounding text into one OOV token.
+    final String[] tokens = tokenizer.tokenize("fox\ue000jumps and fox\ufdd0jumps");
+
+    final String[] expected = {"[CLS]", "[UNK]", "[UNK]", "[UNK]", "[SEP]"};
+    Assertions.assertArrayEquals(expected, tokens);
+  }
+
+  @Test
+  void testRejectsNullSpecialTokens() {
+    Assertions.assertThrows(NullPointerException.class,
+        () -> new BertTokenizer(VOCABULARY, true, null, "[SEP]", "[UNK]"));
+    Assertions.assertThrows(NullPointerException.class,
+        () -> new BertTokenizer(VOCABULARY, true, "[CLS]", null, "[UNK]"));
+    Assertions.assertThrows(NullPointerException.class,
+        () -> new BertTokenizer(VOCABULARY, true, "[CLS]", "[SEP]", null));
+  }
+
+  @Test
+  void testCasedModeKeepsCaseAndAccents() {
+    final Tokenizer tokenizer = new BertTokenizer(
+        Set.of("The", "W\u00fcrttemberg", "fox"), false);
+    final String[] tokens = tokenizer.tokenize("The W\u00fcrttemberg fox");
+
+    final String[] expected = {"[CLS]", "The", "W\u00fcrttemberg", "fox", "[SEP]"};
+    Assertions.assertArrayEquals(expected, tokens);
+  }
+
+  @Test
+  void testCustomSpecialTokens() {
+    final Tokenizer tokenizer = new BertTokenizer(Set.of("the", "fox"), true,
+        WordpieceTokenizer.ROBERTA_CLS_TOKEN, WordpieceTokenizer.ROBERTA_SEP_TOKEN,
+        WordpieceTokenizer.ROBERTA_UNK_TOKEN);
+    final String[] tokens = tokenizer.tokenize("The unknown fox");
+
+    final String[] expected = {"<s>", "the", "<unk>", "fox", "</s>"};
+    Assertions.assertArrayEquals(expected, tokens);
+  }
+
+  @Test
+  void testTokenizePosIsUnsupported() {
+    final Tokenizer tokenizer = new BertTokenizer(VOCABULARY);
+    Assertions.assertThrows(UnsupportedOperationException.class,
+        () -> tokenizer.tokenizePos("the fox"));
+  }
+
+}
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/WordpieceTokenizerTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/WordpieceTokenizerTest.java
index 68ea52d4b..87362bde1 100644
--- a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/WordpieceTokenizerTest.java
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/WordpieceTokenizerTest.java
@@ -51,6 +51,56 @@ public class WordpieceTokenizerTest {
 
   }
 
+  @Test
+  void testPunctuationRunsAreSplitIntoSingleCharacters() {
+
+    final Set<String> vocabulary = getVocabulary();
+    vocabulary.add(".");
+
+    final Tokenizer tokenizer = new WordpieceTokenizer(vocabulary);
+    final String[] tokens = tokenizer.tokenize("the lazy dog...");
+
+    final String[] expected = {"[CLS]", "the", "lazy", "dog", ".", ".", ".", "[SEP]"};
+
+    Assertions.assertArrayEquals(expected, tokens);
+
+  }
+
+  @Test
+  void testPartiallyMatchedWordBecomesSingleUnknownToken() {
+
+    // "brownfox" starts with the vocabulary piece "brown", but the remainder has no
+    // matching piece. The reference BERT implementation replaces the whole word with
+    // the unknown token instead of emitting the matched prefix pieces.
+    final Tokenizer tokenizer = new WordpieceTokenizer(getVocabulary());
+    final String[] tokens = tokenizer.tokenize("the brownfox jumps");
+
+    final String[] expected = {"[CLS]", "the", "[UNK]", "jumps", "[SEP]"};
+
+    Assertions.assertArrayEquals(expected, tokens);
+
+  }
+
+  @Test
+  void testRejectsNegativeMaxTokenLength() {
+
+    Assertions.assertThrows(IllegalArgumentException.class,
+        () -> new WordpieceTokenizer(getVocabulary(), -1));
+    Assertions.assertThrows(IllegalArgumentException.class,
+        () -> new WordpieceTokenizer(getVocabulary(), "[CLS]", "[SEP]", "[UNK]", -1));
+
+  }
+
+  @Test
+  void testTokenizePosIsUnsupported() {
+
+    final Tokenizer tokenizer = new WordpieceTokenizer(getVocabulary());
+
+    Assertions.assertThrows(UnsupportedOperationException.class,
+        () -> tokenizer.tokenizePos("the lazy dog"));
+
+  }
+
   private Set<String> getVocabulary() {
 
     final Set<String> vocabulary = new HashSet<>();

(opennlp) branch main updated: OPENNLP-1837: Add BertTokenizer with BERT basic tokenization (#1073)

Reply via email to