This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch charset-detection-improvements in repository https://gitbox.apache.org/repos/asf/tika.git
commit c6105af044953abff6e7329257940df765c77c54 Author: tallison <[email protected]> AuthorDate: Mon Apr 13 11:56:38 2026 -0400 narrow per-probe decode-equivalence for win-1252 --- .../tika/ml/chardetect/CharsetConfusables.java | 21 +++ .../tika/ml/chardetect/DecodeEquivalence.java | 143 ++++++++++++++++++ .../ml/chardetect/MojibusterEncodingDetector.java | 62 +++++++- .../tika/ml/chardetect/LatinFallbackTest.java | 91 ++++++++++++ .../ConfigurableByteNgramFeatureExtractor.java | 127 ++++++++++++++-- .../ml/chardetect/tools/TrainCharsetModel.java | 18 ++- .../chardetect/ConfigurableGlobalFeatureTest.java | 160 +++++++++++++++++++++ 7 files changed, 606 insertions(+), 16 deletions(-) diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CharsetConfusables.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CharsetConfusables.java index 6bff118a45..e8c3d02183 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CharsetConfusables.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CharsetConfusables.java @@ -139,6 +139,21 @@ public final class CharsetConfusables { private static final Map<String, Set<String>> SYMMETRIC_PEER_MAP; + /** + * Single-byte Latin-family charsets that may decode byte-identically to + * windows-1252 on sparse probes (where the only high bytes present fall + * in positions the family agrees on — e.g. 0xE4='ä' in every member). + * + * <p>Used by the Latin-windows-1252 fallback rule in + * {@link MojibusterEncodingDetector}: if the top candidate is a member + * of this set AND the probe decodes byte-identically under windows-1252, + * swap to windows-1252 as the unmarked Latin default. 
This is a + * narrower replacement for an earlier general "decode-equivalence + * expansion" design — see {@code charset-detection.md} for the full + * design-options discussion.</p> + */ + public static final Set<String> SBCS_LATIN_FAMILY; + static { // ---------------------------------------------------------------- // Symmetric groups @@ -277,6 +292,12 @@ public final class CharsetConfusables { } } SYMMETRIC_PEER_MAP = Collections.unmodifiableMap(peerMap); + + SBCS_LATIN_FAMILY = Collections.unmodifiableSet(new HashSet<>(Arrays.asList( + "windows-1250", "windows-1252", "windows-1254", "windows-1257", + "ISO-8859-1", "ISO-8859-2", "ISO-8859-3", "ISO-8859-4", + "ISO-8859-9", "ISO-8859-13", "ISO-8859-15", "ISO-8859-16", + "x-MacRoman"))); } private CharsetConfusables() { diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/DecodeEquivalence.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/DecodeEquivalence.java new file mode 100644 index 0000000000..f194c216ec --- /dev/null +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/DecodeEquivalence.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.chardetect; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CoderResult; +import java.nio.charset.CodingErrorAction; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** + * Cheap byte-wise decode-equivalence check for single-byte charsets. + * + * <p>For single-byte codepages, the mapping from byte value (0x00..0xFF) to + * Unicode codepoint is a fixed table. Two charsets decode a probe + * byte-for-byte identically iff their byte-to-char tables agree on every + * byte value that appears in the probe. ASCII bytes (below {@code 0x80}) + * map identically in every Latin-family codepage and are skipped; the check + * reduces to "do these charsets agree on every high byte present in this + * probe?"</p> + * + * <p>Cost: {@code O(probe.length)} per call in the worst case, typically + * short-circuits on the first disagreement. Byte-to-char tables are + * computed lazily on first use and cached for process lifetime.</p> + * + * <p>This is the inference-time counterpart to the broader + * {@link CharsetConfusables#POTENTIAL_DECODE_EQUIV_FAMILIES} declaration — + * families enumerate which pairs are <em>potentially</em> byte-identical; + * this class decides whether they are <em>actually</em> byte-identical on a + * specific probe.</p> + */ +public final class DecodeEquivalence { + + /** Per-charset byte-to-char tables, lazily populated. */ + private static final Map<String, char[]> TABLE_CACHE = new ConcurrentHashMap<>(); + + private DecodeEquivalence() { + } + + /** + * Returns {@code true} if decoding {@code probe} under charsets {@code a} + * and {@code b} produces bit-identical character sequences. 
Only the + * high-byte positions (bytes {@code >= 0x80}) are compared; all Latin-family + * charsets agree on ASCII. + * + * <p>Returns {@code false} (and caches nothing) if either charset's byte + * table cannot be resolved (e.g. stateful, multi-byte, or JVM-unsupported). + * Callers should restrict invocation to single-byte charsets, typically + * via {@link CharsetConfusables#potentialDecodeEquivPeersOf(String)}.</p> + */ + public static boolean byteIdenticalOnProbe(byte[] probe, Charset a, Charset b) { + if (a.equals(b)) { + return true; + } + char[] tableA = tableFor(a); + char[] tableB = tableFor(b); + if (tableA == null || tableB == null) { + return false; + } + for (int i = 0; i < probe.length; i++) { + int v = probe[i] & 0xFF; + if (v < 0x80) { + continue; // ASCII agrees in every Latin-family SBCS + } + if (tableA[v] != tableB[v]) { + return false; + } + } + return true; + } + + /** + * Returns a 256-element byte-to-char table for a single-byte charset, or + * {@code null} if the charset is not single-byte or is unresolvable on + * this JVM. The table is cached across calls. + * + * <p>"Single-byte" is verified by decoding all 256 possible byte values + * and requiring exactly one char of output per input byte (or the + * replacement char on unmapped positions — still one char). 
Multi-byte + * charsets (Shift_JIS, UTF-8, …) produce variable-length output and are + * excluded.</p> + */ + static char[] tableFor(Charset cs) { + char[] cached = TABLE_CACHE.get(cs.name()); + if (cached != null) { + return cached; + } + char[] built = buildTable(cs); + if (built != null) { + TABLE_CACHE.put(cs.name(), built); + } + return built; + } + + private static char[] buildTable(Charset cs) { + try { + CharsetDecoder dec = cs.newDecoder() + .onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE) + .replaceWith("\uFFFD"); + char[] table = new char[256]; + byte[] one = new byte[1]; + for (int v = 0; v < 256; v++) { + one[0] = (byte) v; + CharBuffer out = CharBuffer.allocate(4); + ByteBuffer in = ByteBuffer.wrap(one); + dec.reset(); + CoderResult cr = dec.decode(in, out, true); + if (cr.isError()) { + return null; + } + dec.decode(ByteBuffer.allocate(0), out, true); + dec.flush(out); + out.flip(); + if (out.remaining() != 1) { + // Multi-byte / stateful charset — not a single-byte table. + return null; + } + table[v] = out.get(); + } + return table; + } catch (Exception e) { + return null; + } + } +} diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java index 6f6590e38d..69cbdc9163 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java @@ -106,7 +106,23 @@ public class MojibusterEncodingDetector implements EncodingDetector { * line ending) does <em>not</em> trigger this rule. 
Mirrors the legacy * {@code UniversalEncodingListener.report()} heuristic.</p> */ - CRLF_TO_WINDOWS + CRLF_TO_WINDOWS, + /** + * When the top candidate is a single-byte Latin-family charset + * (see {@link CharsetConfusables#SBCS_LATIN_FAMILY}) other than + * windows-1252, and the probe decodes byte-identically under + * windows-1252, swap the result to windows-1252 as the unmarked + * Latin default. Cheap per-probe byte walk via + * {@link DecodeEquivalence#byteIdenticalOnProbe}; short-circuits on + * the first disagreeing high byte. Zero cost for probes whose top + * candidate isn't Latin-family (CJK, UTF-*, EBCDIC, Cyrillic, + * Arabic, Greek, Hebrew). + * + * <p>Narrow by design — see {@code charset-detection.md} for the + * full options discussion (generalized candidate expansion and + * per-family canonicals were considered and rejected for now).</p> + */ + LATIN_FALLBACK_WIN1252 } private static final long serialVersionUID = 1L; @@ -452,6 +468,10 @@ public class MojibusterEncodingDetector implements EncodingDetector { results = selectAtLeast(model, logits, MIN_CANDIDATES, probe, grammar); } + if (enabledRules.contains(Rule.LATIN_FALLBACK_WIN1252)) { + results = applyLatinFallback(probe, results); + } + if (enabledRules.contains(Rule.ISO_TO_WINDOWS) && StructuralEncodingRules.hasC1Bytes(probe)) { results = upgradeIsoToWindows(results); } @@ -659,6 +679,46 @@ public class MojibusterEncodingDetector implements EncodingDetector { return upgraded; } + private static final String WIN1252 = "windows-1252"; + + /** + * Latin→windows-1252 fallback. See {@link Rule#LATIN_FALLBACK_WIN1252}. + * + * <p>For each candidate whose label is in {@link CharsetConfusables#SBCS_LATIN_FAMILY} + * but is not already windows-1252, if the probe decodes byte-identically + * under windows-1252 (cheap per-probe byte walk via + * {@link DecodeEquivalence#byteIdenticalOnProbe}), swap the result to + * windows-1252 at the same confidence. 
Only the first qualifying candidate + * is relabelled; a candidate that is already windows-1252 is passed through + * unchanged and does not end the scan, so windows-1252 may appear twice.</p> + */ + private static List<EncodingResult> applyLatinFallback(byte[] probe, + List<EncodingResult> results) { + if (results.isEmpty()) { + return results; + } + Charset win1252 = labelToCharset(WIN1252); + if (win1252 == null) { + return results; + } + List<EncodingResult> out = new ArrayList<>(results.size()); + boolean replaced = false; + for (EncodingResult er : results) { + String label = er.getLabel() != null ? er.getLabel() : er.getCharset().name(); + if (!replaced + && CharsetConfusables.SBCS_LATIN_FAMILY.contains(label) + && !WIN1252.equals(label) + && DecodeEquivalence.byteIdenticalOnProbe(probe, er.getCharset(), win1252)) { + out.add(new EncodingResult(win1252, er.getConfidence(), WIN1252, + er.getResultType())); + replaced = true; + } else { + out.add(er); + } + } + return out; + } + private static List<EncodingResult> upgradeIsoToWindows(List<EncodingResult> results) { List<EncodingResult> upgraded = new ArrayList<>(results.size()); for (EncodingResult er : results) { diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/LatinFallbackTest.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/LatinFallbackTest.java new file mode 100644 index 0000000000..f1878b2087 --- /dev/null +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/LatinFallbackTest.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.chardetect; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; + +import org.junit.jupiter.api.Test; + +/** + * Tests for the byte-walk decode-equivalence helper and the narrow + * Latin→windows-1252 fallback semantics. Integration with the detector + * pipeline is exercised in the broader regression tests. 
+ */ +public class LatinFallbackTest { + + private static final Charset WIN1252 = Charset.forName("windows-1252"); + private static final Charset WIN1257 = Charset.forName("windows-1257"); + private static final Charset WIN1250 = Charset.forName("windows-1250"); + private static final Charset MACROMAN = Charset.forName("x-MacRoman"); + private static final Charset ISO8859_1 = Charset.forName("ISO-8859-1"); + private static final Charset IBM852 = Charset.forName("IBM852"); + + @Test + public void vcardSingleUmlautIsByteIdenticalUnderLatin1252And1257() { + byte[] probe = "BEGIN:VCARD\r\nN:M\u00FCller\r\nFN:Hans M\u00FCller\r\nEND:VCARD\r\n" + .getBytes(ISO8859_1); + assertTrue(DecodeEquivalence.byteIdenticalOnProbe(probe, WIN1257, WIN1252), + "German vCard bytes should decode identically under 1257 and 1252"); + } + + @Test + public void ibm852DiffersFrom1252OnUmlaut() { + // 0xFC in windows-1252 is 'ü'; in IBM852 it's 'Ř'. The fallback + // must NOT relabel IBM852 to windows-1252 when the probe contains + // bytes where the two genuinely differ. + byte[] probe = "stra\u00DFe".getBytes(ISO8859_1); // 'ß' = 0xDF + // 0xDF in IBM852 is different from 0xDF in 1252 — check byte 0xFC too + byte[] probeWithUmlaut = new byte[]{'M', (byte) 0xFC, 'l', 'l', 'e', 'r'}; + assertFalse(DecodeEquivalence.byteIdenticalOnProbe(probeWithUmlaut, IBM852, WIN1252), + "IBM852 'Ř' must not be byte-identical to 1252 'ü'"); + } + + @Test + public void pureAsciiIsByteIdenticalAcrossAllLatinFamily() { + byte[] probe = "Hello, world! 
No accents here at all.\r\n" + .getBytes(StandardCharsets.US_ASCII); + assertTrue(DecodeEquivalence.byteIdenticalOnProbe(probe, WIN1257, WIN1252)); + assertTrue(DecodeEquivalence.byteIdenticalOnProbe(probe, WIN1250, WIN1252)); + assertTrue(DecodeEquivalence.byteIdenticalOnProbe(probe, MACROMAN, WIN1252)); + } + + @Test + public void win1257EuroSignDiffersFrom1252() { + // 0xA4 in windows-1257 is the generic currency sign '¤'; + // in windows-1252 it is also '¤' — they AGREE here. + // But 0xB8 differs: 1257='ø', 1252='¸'. + byte[] probe = new byte[]{'t', 'e', 's', 't', (byte) 0xB8}; + assertFalse(DecodeEquivalence.byteIdenticalOnProbe(probe, WIN1257, WIN1252), + "0xB8 differs between 1257 and 1252 — must not byte-match"); + } + + @Test + public void sameCharsetIsAlwaysEquivalent() { + byte[] probe = "anything at all \u00E4\u00F6\u00FC".getBytes(ISO8859_1); + assertTrue(DecodeEquivalence.byteIdenticalOnProbe(probe, WIN1252, WIN1252)); + } + + @Test + public void emptyProbeIsEquivalentEverywhere() { + assertTrue(DecodeEquivalence.byteIdenticalOnProbe(new byte[0], WIN1257, WIN1252)); + assertTrue(DecodeEquivalence.byteIdenticalOnProbe(new byte[0], IBM852, WIN1252)); + } +} diff --git a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java index 1308733148..c2659396d2 100644 --- a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java +++ b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java @@ -56,36 +56,126 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b /** Distinct salt for stride-2 bigrams — prevents collision with stride-1 hashes. 
*/ private static final int FNV_STRIDE2_SALT = 0x9e3779b9; + /** + * Number of reserved slots at the high end of the feature vector used for + * global (whole-probe) features when {@link #useGlobalFeatures} is enabled. + * Currently 6 slots hold ASCII-low-byte density bins (see + * {@link #asciiDensityBin(byte[])}). + */ + public static final int GLOBAL_FEATURE_COUNT = 6; + private final int numBuckets; + private final int hashBuckets; private final boolean useUnigrams; private final boolean useBigrams; private final boolean useTrigrams; private final boolean useAnchoredBigrams; private final boolean useStride2Bigrams; + private final boolean useGlobalFeatures; + + /** + * Backwards-compatible constructor (no global features). + */ + public ConfigurableByteNgramFeatureExtractor(int numBuckets, + boolean useUnigrams, + boolean useBigrams, + boolean useTrigrams, + boolean useAnchoredBigrams, + boolean useStride2Bigrams) { + this(numBuckets, useUnigrams, useBigrams, useTrigrams, + useAnchoredBigrams, useStride2Bigrams, false); + } /** - * @param numBuckets number of hash buckets (feature-vector dimension) + * @param numBuckets total feature-vector dimension. When + * {@code useGlobalFeatures} is {@code true}, the + * last {@link #GLOBAL_FEATURE_COUNT} slots are + * reserved for global features and hashed n-gram + * features mod into the first + * {@code numBuckets - GLOBAL_FEATURE_COUNT} slots. 
* @param useUnigrams emit unigram for each high byte * @param useBigrams emit bigram anchored on each high byte * @param useTrigrams emit trigram anchored on each high byte * @param useAnchoredBigrams emit bigram anchored on each low trail byte * @param useStride2Bigrams emit stride-2 bigrams at even positions (all bytes) + * @param useGlobalFeatures emit whole-probe global features into the + * reserved tail slots (ASCII-density bins) */ public ConfigurableByteNgramFeatureExtractor(int numBuckets, boolean useUnigrams, boolean useBigrams, boolean useTrigrams, boolean useAnchoredBigrams, - boolean useStride2Bigrams) { + boolean useStride2Bigrams, + boolean useGlobalFeatures) { if (numBuckets <= 0) { throw new IllegalArgumentException("numBuckets must be positive: " + numBuckets); } + if (useGlobalFeatures && numBuckets <= GLOBAL_FEATURE_COUNT) { + throw new IllegalArgumentException( + "numBuckets must exceed GLOBAL_FEATURE_COUNT (" + GLOBAL_FEATURE_COUNT + + ") when useGlobalFeatures=true: " + numBuckets); + } this.numBuckets = numBuckets; + this.hashBuckets = useGlobalFeatures ? numBuckets - GLOBAL_FEATURE_COUNT : numBuckets; this.useUnigrams = useUnigrams; this.useBigrams = useBigrams; this.useTrigrams = useTrigrams; this.useAnchoredBigrams = useAnchoredBigrams; this.useStride2Bigrams = useStride2Bigrams; + this.useGlobalFeatures = useGlobalFeatures; + } + + /** + * Returns which ASCII-text-density bin this probe falls into, in [0, 6). + * + * <p>Counts only <em>ASCII text bytes</em> — printable (0x20..0x7E) plus + * common whitespace (0x09 tab, 0x0A LF, 0x0D CR). NUL and other control + * bytes do <em>not</em> count. This matters because UTF-16LE/BE probes + * contain ~50% 0x00 bytes; if we counted those as "low", UTF-16 English + * would look like sparse Latin to the model, defeating the point of the + * feature. 
With the current definition, real UTF-16 English lands around + * bin 2 (half ASCII-letter bytes, half nulls: a fraction of about 0.50, the bottom edge of bin 2), distinguishable from + * plain-ASCII probes (bin 5) and from real EBCDIC (bin 0-1).</p> + * + * <p>Bin layout (fraction of bytes that are ASCII-text):</p> + * <ul> + * <li>0: [0.00, 0.10) — effectively no ASCII text (real EBCDIC letters)</li> + * <li>1: [0.10, 0.50) — heavy non-ASCII content (CJK text, UTF-16 mixed)</li> + * <li>2: [0.50, 0.80) — text with dense foreign script, UTF-16 Latin</li> + * <li>3: [0.80, 0.95) — normal foreign-script text with ASCII markup</li> + * <li>4: [0.95, 0.99) — sparse-diacritic Western text</li> + * <li>5: [0.99, 1.00] — near-pure ASCII (vCards, config, scripts)</li> + * </ul> + */ + public static int asciiDensityBin(byte[] input) { + if (input == null || input.length == 0) { + return 5; + } + int asciiText = 0; + for (byte b : input) { + int v = b & 0xFF; + if ((v >= 0x20 && v <= 0x7E) || v == 0x09 || v == 0x0A || v == 0x0D) { + asciiText++; + } + } + double p = (double) asciiText / input.length; + if (p < 0.10) { + return 0; + } + if (p < 0.50) { + return 1; + } + if (p < 0.80) { + return 2; + } + if (p < 0.95) { + return 3; + } + if (p < 0.99) { + return 4; + } + return 5; } @Override @@ -121,7 +211,7 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b if (useUnigrams) { int h = (FNV_OFFSET ^ bi) * FNV_PRIME; - int bkt = (h & 0x7fffffff) % numBuckets; + int bkt = (h & 0x7fffffff) % hashBuckets; if (dense[bkt] == 0) { touched[n++] = bkt; } @@ -134,7 +224,7 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b if (useBigrams) { int h = (FNV_OFFSET ^ bi) * FNV_PRIME; h = (h ^ bi1) * FNV_PRIME; - int bkt = (h & 0x7fffffff) % numBuckets; + int bkt = (h & 0x7fffffff) % hashBuckets; if (dense[bkt] == 0) { touched[n++] = bkt; } @@ -146,7 +236,7 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b if (i + 2 < input.length) { h = (h ^
(input[i + 2] & 0xFF)) * FNV_PRIME; } - int bkt = (h & 0x7fffffff) % numBuckets; + int bkt = (h & 0x7fffffff) % hashBuckets; if (dense[bkt] == 0) { touched[n++] = bkt; } @@ -158,7 +248,7 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b int h = (FNV_OFFSET ^ bi) * FNV_PRIME; h = (h ^ bi1) * FNV_PRIME; h = (h ^ bi2) * FNV_PRIME; - int bkt = (h & 0x7fffffff) % numBuckets; + int bkt = (h & 0x7fffffff) % hashBuckets; if (dense[bkt] == 0) { touched[n++] = bkt; } @@ -174,7 +264,7 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b int b1 = input[i + 1] & 0xFF; int h = (FNV_STRIDE2_SALT ^ b0) * FNV_PRIME; h = (h ^ b1) * FNV_PRIME; - int bkt = (h & 0x7fffffff) % numBuckets; + int bkt = (h & 0x7fffffff) % hashBuckets; if (dense[bkt] == 0) { touched[n++] = bkt; } @@ -182,6 +272,15 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b } } + // Global features at reserved tail slots: fire exactly one ASCII-density bin. + if (useGlobalFeatures) { + int bkt = hashBuckets + asciiDensityBin(input); + if (dense[bkt] == 0) { + touched[n++] = bkt; + } + dense[bkt]++; + } + return n; } @@ -234,10 +333,17 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b counts[bucket(h)]++; } } + + // Global features at reserved tail slots: fire exactly one ASCII-density bin. + if (useGlobalFeatures) { + byte[] slice = (from == 0 && to == b.length) + ? 
b : java.util.Arrays.copyOfRange(b, from, to); + counts[hashBuckets + asciiDensityBin(slice)]++; + } } private int bucket(int hash) { - return (hash & 0x7fffffff) % numBuckets; + return (hash & 0x7fffffff) % hashBuckets; } @Override @@ -248,7 +354,8 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b @Override public String toString() { return String.format(java.util.Locale.ROOT, - "ConfigurableByteNgramFeatureExtractor{buckets=%d, uni=%b, bi=%b, tri=%b, anchored=%b, stride2=%b}", - numBuckets, useUnigrams, useBigrams, useTrigrams, useAnchoredBigrams, useStride2Bigrams); + "ConfigurableByteNgramFeatureExtractor{buckets=%d, hash=%d, uni=%b, bi=%b, tri=%b, anchored=%b, stride2=%b, globals=%b}", + numBuckets, hashBuckets, useUnigrams, useBigrams, useTrigrams, + useAnchoredBigrams, useStride2Bigrams, useGlobalFeatures); } } diff --git a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java index 9fd35ab7df..7a38d3bce9 100644 --- a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java +++ b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java @@ -81,6 +81,7 @@ public class TrainCharsetModel { boolean useTrigrams = true; boolean useAnchoredBigrams = false; boolean useStride2Bigrams = true; + boolean useGlobalFeatures = false; // --label-remap src1:dst1,src2:dst2 — merges multiple source labels into // one target label at training time (e.g. merge script variants into one class). 
Map<String, String> labelRemap = new HashMap<>(); @@ -140,6 +141,12 @@ public class TrainCharsetModel { case "--no-stride2": useStride2Bigrams = false; break; + case "--globals": + useGlobalFeatures = true; + break; + case "--no-globals": + useGlobalFeatures = false; + break; case "--exclude": for (String label : args[++i].split(",")) { excludeLabels.add(label.trim()); @@ -164,6 +171,7 @@ public class TrainCharsetModel { System.err.println(" --tri / --no-tri enable/disable trigram features (default: on)"); System.err.println(" --anchored / --no-anchored anchored bigrams (default: off)"); System.err.println(" --stride2 / --no-stride2 stride-2 bigrams at even positions (default: on)"); + System.err.println(" --globals / --no-globals emit global ASCII-density bin features (default: off)"); System.err.println(" --exclude cs1,cs2 skip these charset labels (e.g. UTF-32-BE,UTF-32-LE)"); System.exit(1); } @@ -211,12 +219,14 @@ public class TrainCharsetModel { "Buckets: %d epochs: %d lr: %.4f max-samples/class: %d%n", numBuckets, epochs, lr, maxSamplesPerClass); System.out.printf(java.util.Locale.ROOT, - "Features: uni=%b bi=%b tri=%b anchored=%b stride2=%b%n", - useUnigrams, useBigrams, useTrigrams, useAnchoredBigrams, useStride2Bigrams); + "Features: uni=%b bi=%b tri=%b anchored=%b stride2=%b globals=%b%n", + useUnigrams, useBigrams, useTrigrams, useAnchoredBigrams, useStride2Bigrams, + useGlobalFeatures); ConfigurableByteNgramFeatureExtractor extractor = new ConfigurableByteNgramFeatureExtractor(numBuckets, - useUnigrams, useBigrams, useTrigrams, useAnchoredBigrams, useStride2Bigrams); + useUnigrams, useBigrams, useTrigrams, useAnchoredBigrams, + useStride2Bigrams, useGlobalFeatures); // Build class index map Map<String, Integer> labelIndex = new HashMap<>(); @@ -281,8 +291,6 @@ public class TrainCharsetModel { // Sparse extraction: O(probeLength), not O(numBuckets) int nActive = extractor.extractSparseInto(sample, denseScratch, touched); - // L1 normalization: compute 
sum of feature counts so each sample - // contributes equal total mass regardless of encoding density. // Forward pass: only iterate active buckets float[] logits = new float[numClasses]; for (int c = 0; c < numClasses; c++) { diff --git a/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ConfigurableGlobalFeatureTest.java b/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ConfigurableGlobalFeatureTest.java new file mode 100644 index 0000000000..c40ef78075 --- /dev/null +++ b/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ConfigurableGlobalFeatureTest.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.ml.chardetect; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; + +import org.junit.jupiter.api.Test; + +import org.apache.tika.ml.chardetect.tools.ConfigurableByteNgramFeatureExtractor; + +public class ConfigurableGlobalFeatureTest { + + private static final int NUM_BUCKETS = 16384; + private static final int HASH_BUCKETS = NUM_BUCKETS + - ConfigurableByteNgramFeatureExtractor.GLOBAL_FEATURE_COUNT; + + private static ConfigurableByteNgramFeatureExtractor withGlobals() { + return new ConfigurableByteNgramFeatureExtractor( + NUM_BUCKETS, true, true, false, false, true, true); + } + + private static ConfigurableByteNgramFeatureExtractor withoutGlobals() { + return new ConfigurableByteNgramFeatureExtractor( + NUM_BUCKETS, true, true, false, false, true, false); + } + + @Test + public void pureAsciiLandsInTopBin() { + assertEquals(5, ConfigurableByteNgramFeatureExtractor.asciiDensityBin( + "BEGIN:VCARD\r\nVERSION:3.0\r\nEND:VCARD\r\n".getBytes(StandardCharsets.US_ASCII))); + } + + @Test + public void sparseLatinVcardLandsInTopBin() { + // ~95% ASCII: 3 high bytes in this 65-byte vCard probe (62/65 = 0.954, bin 4) + byte[] probe = "BEGIN:VCARD\r\nN:M\u00FCller;Hans\r\nFN:Hans M\u00FCller\r\nADR:K\u00F6ln\r\nEND:VCARD\r\n" + .getBytes(StandardCharsets.ISO_8859_1); + int bin = ConfigurableByteNgramFeatureExtractor.asciiDensityBin(probe); + assertTrue(bin >= 4, "sparse-Latin vCard should land in bin 4 or 5, got: " + bin); + } + + @Test + public void ebcdicTextLandsInLowBin() { + // Real EBCDIC: letters 0x81..0xE9 (~80%), 0x40 space (~20%) + // Under the ASCII-text bin definition, 0x40 IS printable ASCII ('@'), + // so EBCDIC lands in bin 1, not bin 0. What matters is that it's + // cleanly separated from the plain-ASCII bin 5.
+ byte[] ebcdic = new byte[100]; + int p = 0; + for (int i = 0; i < 20; i++) { + ebcdic[p++] = 0x40; // space + } + for (int i = 0; i < 80; i++) { + ebcdic[p++] = (byte) (0x81 + (i % 9)); // letters + } + int bin = ConfigurableByteNgramFeatureExtractor.asciiDensityBin(ebcdic); + assertTrue(bin <= 2, "EBCDIC should land in bin 0-2, got: " + bin); + assertNotEquals(5, bin, "EBCDIC must not collide with the ASCII bin"); + } + + @Test + public void utf16LeEnglishLandsInMiddleBin() { + // UTF-16LE "Hello, world" — every other byte is 0x00 + byte[] utf16 = "Hello, world! This is English text in UTF-16LE." + .getBytes(Charset.forName("UTF-16LE")); + int bin = ConfigurableByteNgramFeatureExtractor.asciiDensityBin(utf16); + assertTrue(bin == 2, "UTF-16LE English should land in bin 2 (~50%), got: " + bin); + } + + @Test + public void utf16LeBmpTextLandsInMidHighBin() { + // UTF-16LE of BMP text (Hiragana U+3040..U+309F etc.) — note that the + // "high byte of the codepoint" (0x30 here) is printable ASCII '0', and + // the "low byte" of most Hiragana falls in 0x40..0x9F — half printable. + // So UTF-16LE BMP text has a HIGH printable-ASCII-byte fraction despite + // not being ASCII text. The global feature does not try to distinguish + // UTF-16 from ASCII — that's stride-2's job. This test documents the + // observed behaviour so it isn't mistaken for a bug later. 
+ byte[] utf16 = "\u6587\u7AE0\u3042\u3044\u3046\u3048\u304A\u304B\u304D\u304F" + .getBytes(Charset.forName("UTF-16LE")); + int bin = ConfigurableByteNgramFeatureExtractor.asciiDensityBin(utf16); + assertTrue(bin >= 2, "UTF-16LE BMP text has many printable bytes, got bin: " + bin); + } + + @Test + public void globalFeatureFiresExactlyOneTailSlot() { + ConfigurableByteNgramFeatureExtractor ext = withGlobals(); + int[] dense = new int[NUM_BUCKETS]; + int[] touched = new int[NUM_BUCKETS]; + + int n = ext.extractSparseInto( + "Plain ASCII text with no accents at all.".getBytes(StandardCharsets.US_ASCII), + dense, touched); + + int tailFirings = 0; + int tailSlot = -1; + for (int i = 0; i < n; i++) { + if (touched[i] >= HASH_BUCKETS) { + tailFirings++; + tailSlot = touched[i]; + } + } + assertEquals(1, tailFirings, "exactly one global tail slot must fire"); + assertEquals(HASH_BUCKETS + 5, tailSlot, "pure ASCII should fire bin 5"); + assertEquals(1, dense[tailSlot], "count for global bin must be 1"); + } + + @Test + public void disablingGlobalsLeavesTailEmpty() { + ConfigurableByteNgramFeatureExtractor ext = withoutGlobals(); + int[] dense = new int[NUM_BUCKETS]; + int[] touched = new int[NUM_BUCKETS]; + + int n = ext.extractSparseInto( + "Plain ASCII text".getBytes(StandardCharsets.US_ASCII), + dense, touched); + + for (int i = 0; i < n; i++) { + assertTrue(touched[i] < NUM_BUCKETS, + "all firings must be in hash range when globals are off"); + } + } + + @Test + public void sparseAndDenseExtractionAgreeWithGlobals() { + ConfigurableByteNgramFeatureExtractor ext = withGlobals(); + byte[] probe = "r\u00E9sum\u00E9 caf\u00E9 cr\u00E8me br\u00FBl\u00E9e" + .getBytes(StandardCharsets.ISO_8859_1); + + int[] dense = ext.extract(probe); + + int[] sparseDense = new int[NUM_BUCKETS]; + int[] touched = new int[NUM_BUCKETS]; + ext.extractSparseInto(probe, sparseDense, touched); + + for (int i = 0; i < NUM_BUCKETS; i++) { + assertEquals(dense[i], sparseDense[i], + "bucket " + i + " 
differs between dense and sparse paths"); + } + } +}
