rzo1 commented on code in PR #1108: URL: https://github.com/apache/opennlp/pull/1108#discussion_r3482957588
########## opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Confusables.java: ########## @@ -0,0 +1,179 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.UncheckedIOException; +import java.nio.charset.StandardCharsets; +import java.text.Normalizer; +import java.util.HashMap; +import java.util.Map; + +/** + * Computes the Unicode confusable <em>skeleton</em> of text, following the skeleton algorithm + * defined in <a href="https://www.unicode.org/reports/tr39/">UTS #39</a> (Unicode Security + * Mechanisms). Two + * strings are confusable, for example Latin {@code "paypal"} and a version using Cyrillic + * lookalikes, exactly when their skeletons are equal. + * + * <p>The mapping is loaded once from the {@code confusables.txt} resource of the Unicode security + * data (parsed with simple cursor scanning, no regular expression). The skeleton of a string is + * {@code NFD(map(NFD(s)))}: decompose, replace each code point with its prototype, and decompose + * again. This changes length and offsets, so it belongs to the derived, matching-only form rather + * than to any offset-preserving transform.</p> + * + * <p>This implements only the skeleton transform and the confusable-detection test built on + * skeleton equality. The other mechanisms defined in UTS #39, such as identifier + * restriction levels, mixed-script and whole-script confusable detection, and the bidirectional + * skeleton, are out of scope; the skeleton here is a comparison form, not a security-grade + * conformance claim for the full report.</p> + */ +public final class Confusables { + + private static final String RESOURCE = "confusables.txt"; + + // Maps a single confusable code point to its prototype sequence (one or more code points). + // Loaded lazily on first use (see prototypes()) so a missing or unreadable resource surfaces as a + // catchable exception at call time rather than an ExceptionInInitializerError that permanently + // poisons the class -- a real risk in container, OSGi, shaded, or modular setups where the bundled + // resource may not be visible to the classloader that loaded this class. + private static volatile Map<Integer, String> prototypes; + + private Confusables() { + } + + // Double-checked lazy initialization: load() runs once on first use, and a failure leaves the + // field null so a later call retries instead of the class being permanently unusable. + private static Map<Integer, String> prototypes() { + Map<Integer, String> map = prototypes; + if (map == null) { + synchronized (Confusables.class) { + map = prototypes; + if (map == null) { + map = load(); + prototypes = map; + } + } + } + return map; + } + + private static Map<Integer, String> load() { + try (InputStream in = Confusables.class.getResourceAsStream(RESOURCE)) { + if (in == null) { + throw new IllegalStateException("Missing confusables data resource: " + RESOURCE); + } + return parse(in); + } catch (IOException e) { + throw new UncheckedIOException("Unable to read confusables data resource " + RESOURCE, e); + } + } + + // Package-private so the malformed-data handling can be exercised without the bundled resource. + static Map<Integer, String> parse(InputStream in) throws IOException { + final Map<Integer, String> map = new HashMap<>(12000); Review Comment: why 12.000? ########## opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/CaseFoldCharSequenceNormalizer.java: ########## @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.util.Locale; +import java.util.Objects; + +/** + * A {@link CharSequenceNormalizer} that lower cases text for case-insensitive matching. It uses + * {@link Locale#ROOT} by default, so the result does not depend on the JVM's default locale. + * + * <p>This is the case-folding step of a search / BM25 analysis chain (the counterpart to Lucene's Review Comment: I would not reference Lucene in our docs. Also the search / BM25 ref seems oddly placed here. ########## opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Confusables.java: ########## @@ -0,0 +1,179 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.UncheckedIOException; +import java.nio.charset.StandardCharsets; +import java.text.Normalizer; +import java.util.HashMap; +import java.util.Map; + +/** + * Computes the Unicode confusable <em>skeleton</em> of text, following the skeleton algorithm + * defined in <a href="https://www.unicode.org/reports/tr39/">UTS #39</a> (Unicode Security + * Mechanisms). Two + * strings are confusable, for example Latin {@code "paypal"} and a version using Cyrillic + * lookalikes, exactly when their skeletons are equal. + * + * <p>The mapping is loaded once from the {@code confusables.txt} resource of the Unicode security + * data (parsed with simple cursor scanning, no regular expression). The skeleton of a string is + * {@code NFD(map(NFD(s)))}: decompose, replace each code point with its prototype, and decompose + * again. This changes length and offsets, so it belongs to the derived, matching-only form rather + * than to any offset-preserving transform.</p> + * + * <p>This implements only the skeleton transform and the confusable-detection test built on + * skeleton equality. The other mechanisms defined in UTS #39, such as identifier + * restriction levels, mixed-script and whole-script confusable detection, and the bidirectional + * skeleton, are out of scope; the skeleton here is a comparison form, not a security-grade + * conformance claim for the full report.</p> + */ +public final class Confusables { + + private static final String RESOURCE = "confusables.txt"; + + // Maps a single confusable code point to its prototype sequence (one or more code points). + // Loaded lazily on first use (see prototypes()) so a missing or unreadable resource surfaces as a + // catchable exception at call time rather than an ExceptionInInitializerError that permanently + // poisons the class -- a real risk in container, OSGi, shaded, or modular setups where the bundled + // resource may not be visible to the classloader that loaded this class. + private static volatile Map<Integer, String> prototypes; + + private Confusables() { + } + + // Double-checked lazy initialization: load() runs once on first use, and a failure leaves the + // field null so a later call retries instead of the class being permanently unusable. + private static Map<Integer, String> prototypes() { + Map<Integer, String> map = prototypes; + if (map == null) { + synchronized (Confusables.class) { + map = prototypes; + if (map == null) { + map = load(); + prototypes = map; + } + } + } + return map; + } + + private static Map<Integer, String> load() { + try (InputStream in = Confusables.class.getResourceAsStream(RESOURCE)) { + if (in == null) { + throw new IllegalStateException("Missing confusables data resource: " + RESOURCE); + } + return parse(in); + } catch (IOException e) { + throw new UncheckedIOException("Unable to read confusables data resource " + RESOURCE, e); + } + } + + // Package-private so the malformed-data handling can be exercised without the bundled resource. + static Map<Integer, String> parse(InputStream in) throws IOException { + final Map<Integer, String> map = new HashMap<>(12000); + try (BufferedReader reader = + new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) { + String line; + int lineNumber = 0; + while ((line = reader.readLine()) != null) { + lineNumber++; + final int hash = line.indexOf('#'); + final String content = (hash < 0 ? line : line.substring(0, hash)).strip(); + if (content.isEmpty()) { + continue; + } + final int firstSemicolon = content.indexOf(';'); + final int secondSemicolon = content.indexOf(';', firstSemicolon + 1); + if (firstSemicolon < 0 || secondSemicolon < 0) { + // A present-but-structurally-wrong line (fewer than two ';') is a hard error, like the + // malformed-hex path below and the sibling loaders -- never silently dropped. + throw new IllegalArgumentException("Malformed confusables data in " + RESOURCE + " at line " + + lineNumber + ": " + content); + } + try { + final int source = Integer.parseInt(content.substring(0, firstSemicolon).strip(), 16); + final String target = content.substring(firstSemicolon + 1, secondSemicolon).strip(); + final StringBuilder prototype = new StringBuilder(); + // Scan the whitespace-delimited hex tokens by hand to honor the no-regex contract and + // avoid compiling a Pattern for every one of the ~10k lines during static init. + final int targetLength = target.length(); + int pos = 0; + while (pos < targetLength) { + while (pos < targetLength && target.charAt(pos) <= ' ') { + pos++; + } + int end = pos; + while (end < targetLength && target.charAt(end) > ' ') { + end++; + } + if (end > pos) { + prototype.appendCodePoint(Integer.parseInt(target.substring(pos, end), 16)); + } + pos = end; + } + map.put(source, prototype.toString()); + } catch (NumberFormatException e) { + // Report the offending line, mirroring CodePointSet.fromFile, instead of letting a Review Comment: Would drop that comment because it can diverge if we change fromFile in CodePointSet. ########## opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/AccentFoldCharSequenceNormalizer.java: ########## @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.text.Normalizer; +import java.util.Set; + +/** + * A {@link CharSequenceNormalizer} that folds diacritics for search and matching, the + * multilingual-safe counterpart to a Latin-only ASCII folding filter. + * + * <p>Folding decomposes the text (NFD) and drops nonspacing combining marks, but only for base + * characters whose script is in {@code foldScripts} (Latin, Greek, and Cyrillic by default). Marks + * on other scripts are left untouched, because there they are essential orthography rather than + * decoration: stripping an Indic vowel sign or a virama, an Arabic harakat, a Hebrew point, or a + * Thai vowel changes the word. This script gating is the key correctness rule; never strip all + * nonspacing marks globally.</p> + * + * <p>Many "accented" Latin letters are atomic and do not decompose ({@code o} with stroke, the + * {@code ae}/{@code oe} ligatures, eszett, thorn, and so on). When {@code foldStrokeLetters} is + * enabled (the default) these are mapped to an ASCII approximation. Folding is a recall + * optimization, not a linguistically correct transform, so it is intended for a search/matching + * token rather than for display or language-specific analysis.</p> + * + * <p>Scanning is a single cursor pass over the decomposed text; no regular expression is used, and + * no global {@code \p{Mn}} strip is performed.</p> + */ +public class AccentFoldCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final long serialVersionUID = 7843116209554120071L; + + private static final Set<Character.UnicodeScript> DEFAULT_SCRIPTS = Set.of( + Character.UnicodeScript.LATIN, + Character.UnicodeScript.GREEK, + Character.UnicodeScript.CYRILLIC); + + private static final AccentFoldCharSequenceNormalizer INSTANCE = + new AccentFoldCharSequenceNormalizer(DEFAULT_SCRIPTS, true); + + private final Set<Character.UnicodeScript> foldScripts; + private final boolean foldStrokeLetters; + + /** + * Creates a folder. Review Comment: This comment seems odd to me. ########## opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TextNormalizer.java: ########## @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +/** + * Entry point for composing the normalization rungs into a single {@link CharSequenceNormalizer}. + * + * <p>Use {@link #builder()} to assemble a chain, or {@link #searchDefault()} for a conservative, + * search-oriented chain. The rungs are applied in the order they are added, so the caller controls + * the chain. Each rung is a shared, stateless normalizer; the built normalizer is an + * {@link AggregateCharSequenceNormalizer} that applies them in sequence.</p> + * + * <pre>{@code + * CharSequenceNormalizer n = TextNormalizer.builder() + * .nfc().caseFold().accentFold() + * .build(); + * }</pre> + */ +public final class TextNormalizer { + + private TextNormalizer() { + } + + /** {@return a new, empty {@link Builder}} */ + public static Builder builder() { + return new Builder(); + } + + /** + * {@return a conservative search/matching chain} + * + * <p>The chain strips invisible controls, applies NFC, collapses whitespace, folds quotes and + * dashes, case folds, and finally applies script-gated diacritic folding.</p> + */ + public static CharSequenceNormalizer searchDefault() { Review Comment: By default OpenNLP has nothing to do with `search`. Perhaps rename this method. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
