(tika) 01/03: narrow per-probe decode-equivalence for win-1252

tallison Mon, 13 Apr 2026 10:40:23 -0700

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch charset-detection-improvements
in repository https://gitbox.apache.org/repos/asf/tika.git


commit c6105af044953abff6e7329257940df765c77c54
Author: tallison <[email protected]>
AuthorDate: Mon Apr 13 11:56:38 2026 -0400

    narrow per-probe decode-equivalence for win-1252
---
 .../tika/ml/chardetect/CharsetConfusables.java     |  21 +++
 .../tika/ml/chardetect/DecodeEquivalence.java      | 143 ++++++++++++++++++
 .../ml/chardetect/MojibusterEncodingDetector.java  |  62 +++++++-
 .../tika/ml/chardetect/LatinFallbackTest.java      |  91 ++++++++++++
 .../ConfigurableByteNgramFeatureExtractor.java     | 127 ++++++++++++++--
 .../ml/chardetect/tools/TrainCharsetModel.java     |  18 ++-
 .../chardetect/ConfigurableGlobalFeatureTest.java  | 160 +++++++++++++++++++++
 7 files changed, 606 insertions(+), 16 deletions(-)

diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CharsetConfusables.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CharsetConfusables.java
index 6bff118a45..e8c3d02183 100644
--- 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CharsetConfusables.java
+++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CharsetConfusables.java
@@ -139,6 +139,21 @@ public final class CharsetConfusables {
 
     private static final Map<String, Set<String>> SYMMETRIC_PEER_MAP;
 
+    /**
+     * Single-byte Latin-family charsets that may decode byte-identically to
+     * windows-1252 on sparse probes (where the only high bytes present fall
+     * in positions the family agrees on — e.g. 0xE4='ä' in every member).
+     *
+     * <p>Used by the Latin-windows-1252 fallback rule in
+     * {@link MojibusterEncodingDetector}: if the top candidate is a member
+     * of this set AND the probe decodes byte-identically under windows-1252,
+     * swap to windows-1252 as the unmarked Latin default.  This is a
+     * narrower replacement for an earlier general "decode-equivalence
+     * expansion" design — see {@code charset-detection.md} for the full
+     * design-options discussion.</p>
+     */
+    public static final Set<String> SBCS_LATIN_FAMILY;
+
     static {
         // ----------------------------------------------------------------
         // Symmetric groups
@@ -277,6 +292,12 @@ public final class CharsetConfusables {
             }
         }
         SYMMETRIC_PEER_MAP = Collections.unmodifiableMap(peerMap);
+
+        SBCS_LATIN_FAMILY = Collections.unmodifiableSet(new 
HashSet<>(Arrays.asList(
+                "windows-1250", "windows-1252", "windows-1254", "windows-1257",
+                "ISO-8859-1", "ISO-8859-2", "ISO-8859-3", "ISO-8859-4",
+                "ISO-8859-9", "ISO-8859-13", "ISO-8859-15", "ISO-8859-16",
+                "x-MacRoman")));
     }
 
     private CharsetConfusables() {
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/DecodeEquivalence.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/DecodeEquivalence.java
new file mode 100644
index 0000000000..f194c216ec
--- /dev/null
+++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/DecodeEquivalence.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.chardetect;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.CodingErrorAction;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * Cheap byte-wise decode-equivalence check for single-byte charsets.
+ *
+ * <p>For single-byte codepages, the mapping from byte value (0x00..0xFF) to
+ * Unicode codepoint is a fixed table.  Two charsets decode a probe
+ * byte-for-byte identically iff their byte-to-char tables agree on every
+ * byte value that appears in the probe.  ASCII bytes (below {@code 0x80})
+ * map identically in every Latin-family codepage and are skipped; the check
+ * reduces to "do these charsets agree on every high byte present in this
+ * probe?"</p>
+ *
+ * <p>Cost: {@code O(probe.length)} per call in the worst case, typically
+ * short-circuits on the first disagreement.  Byte-to-char tables are
+ * computed lazily on first use and cached for process lifetime.</p>
+ *
+ * <p>This is the inference-time counterpart to the broader
+ * {@link CharsetConfusables#POTENTIAL_DECODE_EQUIV_FAMILIES} declaration —
+ * families enumerate which pairs are <em>potentially</em> byte-identical;
+ * this class decides whether they are <em>actually</em> byte-identical on a
+ * specific probe.</p>
+ */
+public final class DecodeEquivalence {
+
+    /** Per-charset byte-to-char tables, lazily populated. */
+    private static final Map<String, char[]> TABLE_CACHE = new 
ConcurrentHashMap<>();
+
+    private DecodeEquivalence() {
+    }
+
+    /**
+     * Returns {@code true} if decoding {@code probe} under charsets {@code a}
+     * and {@code b} produces bit-identical character sequences.  Only the
+     * high-byte positions (bytes {@code >= 0x80}) are compared; all 
Latin-family
+     * charsets agree on ASCII.
+     *
+     * <p>Returns {@code false} if either charset's byte table cannot be resolved
+     * (e.g. stateful, multi-byte, or JVM-unsupported); an unresolvable charset is
+     * never cached.  Callers should restrict invocation to single-byte charsets,
+     * typically via {@link CharsetConfusables#potentialDecodeEquivPeersOf(String)}.</p>
+     */
+    public static boolean byteIdenticalOnProbe(byte[] probe, Charset a, 
Charset b) {
+        if (a.equals(b)) {
+            return true;
+        }
+        char[] tableA = tableFor(a);
+        char[] tableB = tableFor(b);
+        if (tableA == null || tableB == null) {
+            return false;
+        }
+        for (int i = 0; i < probe.length; i++) {
+            int v = probe[i] & 0xFF;
+            if (v < 0x80) {
+                continue;  // ASCII agrees in every Latin-family SBCS
+            }
+            if (tableA[v] != tableB[v]) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Returns a 256-element byte-to-char table for a single-byte charset, or
+     * {@code null} if the charset is not single-byte or is unresolvable on
+     * this JVM.  The table is cached across calls.
+     *
+     * <p>"Single-byte" is checked by decoding each of the 256 byte values on
+     * its own and requiring exactly one char of output (a mapped char or the
+     * replacement char).  NOTE(review): because malformed input is REPLACEd, a
+     * lone lead byte of a multi-byte charset (Shift_JIS, UTF-8, …) presumably
+     * also yields one char, so this may not exclude such charsets — confirm.</p>
+     */
+    static char[] tableFor(Charset cs) {
+        char[] cached = TABLE_CACHE.get(cs.name());
+        if (cached != null) {
+            return cached;
+        }
+        char[] built = buildTable(cs);
+        if (built != null) {
+            TABLE_CACHE.put(cs.name(), built);
+        }
+        return built;
+    }
+
+    private static char[] buildTable(Charset cs) {
+        try {
+            CharsetDecoder dec = cs.newDecoder()
+                    .onMalformedInput(CodingErrorAction.REPLACE)
+                    .onUnmappableCharacter(CodingErrorAction.REPLACE)
+                    .replaceWith("\uFFFD");
+            char[] table = new char[256];
+            byte[] one = new byte[1];
+            for (int v = 0; v < 256; v++) {
+                one[0] = (byte) v;
+                CharBuffer out = CharBuffer.allocate(4);
+                ByteBuffer in = ByteBuffer.wrap(one);
+                dec.reset();
+                CoderResult cr = dec.decode(in, out, true);
+                if (cr.isError()) {
+                    return null;
+                }
+                dec.decode(ByteBuffer.allocate(0), out, true);
+                dec.flush(out);
+                out.flip();
+                if (out.remaining() != 1) {
+                    // Multi-byte / stateful charset — not a single-byte table.
+                    return null;
+                }
+                table[v] = out.get();
+            }
+            return table;
+        } catch (Exception e) {
+            return null;
+        }
+    }
+}
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
index 6f6590e38d..69cbdc9163 100644
--- 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
+++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
@@ -106,7 +106,23 @@ public class MojibusterEncodingDetector implements 
EncodingDetector {
          * line ending) does <em>not</em> trigger this rule.  Mirrors the 
legacy
          * {@code UniversalEncodingListener.report()} heuristic.</p>
          */
-        CRLF_TO_WINDOWS
+        CRLF_TO_WINDOWS,
+        /**
+         * Relabels the first candidate, in list order, that is a single-byte
+         * Latin-family charset (see {@link CharsetConfusables#SBCS_LATIN_FAMILY})
+         * other than windows-1252 and under which the probe decodes
+         * byte-identically to windows-1252; the result becomes windows-1252,
+         * the unmarked Latin default.  Cheap per-probe byte walk via
+         * {@link DecodeEquivalence#byteIdenticalOnProbe}; short-circuits on
+         * the first disagreeing high byte.  Candidates outside the family
+         * (CJK, UTF-*, EBCDIC, Cyrillic, Arabic, Greek, Hebrew) cost only a
+         * set lookup — no byte walk.
+         *
+         * <p>Narrow by design — see {@code charset-detection.md} for the
+         * full options discussion (generalized candidate expansion and
+         * per-family canonicals were considered and rejected for now).</p>
+         */
+        LATIN_FALLBACK_WIN1252
     }
 
     private static final long serialVersionUID = 1L;
@@ -452,6 +468,10 @@ public class MojibusterEncodingDetector implements 
EncodingDetector {
             results = selectAtLeast(model, logits, MIN_CANDIDATES, probe, 
grammar);
         }
 
+        if (enabledRules.contains(Rule.LATIN_FALLBACK_WIN1252)) {
+            results = applyLatinFallback(probe, results);
+        }
+
         if (enabledRules.contains(Rule.ISO_TO_WINDOWS) && 
StructuralEncodingRules.hasC1Bytes(probe)) {
             results = upgradeIsoToWindows(results);
         }
@@ -659,6 +679,46 @@ public class MojibusterEncodingDetector implements 
EncodingDetector {
         return upgraded;
     }
 
+    private static final String WIN1252 = "windows-1252";
+
+    /**
+     * Latin→windows-1252 fallback.  See {@link Rule#LATIN_FALLBACK_WIN1252}.
+     *
+     * <p>Relabels the <em>first</em> candidate, in list order, whose label is in
+     * {@link CharsetConfusables#SBCS_LATIN_FAMILY}, is not already windows-1252,
+     * and whose charset decodes the probe byte-identically to windows-1252
+     * (cheap per-probe byte walk via {@link DecodeEquivalence#byteIdenticalOnProbe}).
+     * The replacement keeps the original confidence and result type.  At most
+     * one candidate is relabeled; all others, including any existing
+     * windows-1252 entry, are copied through unchanged.</p>
+     */
+    private static List<EncodingResult> applyLatinFallback(byte[] probe,
+                                                           
List<EncodingResult> results) {
+        if (results.isEmpty()) {
+            return results;
+        }
+        Charset win1252 = labelToCharset(WIN1252);
+        if (win1252 == null) {
+            return results;
+        }
+        List<EncodingResult> out = new ArrayList<>(results.size());
+        boolean replaced = false;
+        for (EncodingResult er : results) {
+            String label = er.getLabel() != null ? er.getLabel() : 
er.getCharset().name();
+            if (!replaced
+                    && CharsetConfusables.SBCS_LATIN_FAMILY.contains(label)
+                    && !WIN1252.equals(label)
+                    && DecodeEquivalence.byteIdenticalOnProbe(probe, 
er.getCharset(), win1252)) {
+                out.add(new EncodingResult(win1252, er.getConfidence(), 
WIN1252,
+                        er.getResultType()));
+                replaced = true;
+            } else {
+                out.add(er);
+            }
+        }
+        return out;
+    }
+
     private static List<EncodingResult> 
upgradeIsoToWindows(List<EncodingResult> results) {
         List<EncodingResult> upgraded = new ArrayList<>(results.size());
         for (EncodingResult er : results) {
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/LatinFallbackTest.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/LatinFallbackTest.java
new file mode 100644
index 0000000000..f1878b2087
--- /dev/null
+++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/LatinFallbackTest.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.chardetect;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests for the byte-walk decode-equivalence helper and the narrow
+ * Latin→windows-1252 fallback semantics.  Integration with the detector
+ * pipeline is exercised in the broader regression tests.
+ */
+public class LatinFallbackTest {
+
+    private static final Charset WIN1252 = Charset.forName("windows-1252");
+    private static final Charset WIN1257 = Charset.forName("windows-1257");
+    private static final Charset WIN1250 = Charset.forName("windows-1250");
+    private static final Charset MACROMAN = Charset.forName("x-MacRoman");
+    private static final Charset ISO8859_1 = Charset.forName("ISO-8859-1");
+    private static final Charset IBM852 = Charset.forName("IBM852");
+
+    @Test
+    public void vcardSingleUmlautIsByteIdenticalUnderLatin1252And1257() {
+        byte[] probe = "BEGIN:VCARD\r\nN:M\u00FCller\r\nFN:Hans 
M\u00FCller\r\nEND:VCARD\r\n"
+                .getBytes(ISO8859_1);
+        assertTrue(DecodeEquivalence.byteIdenticalOnProbe(probe, WIN1257, 
WIN1252),
+                "German vCard bytes should decode identically under 1257 and 
1252");
+    }
+
+    @Test
+    public void ibm852DiffersFrom1252OnUmlaut() {
+        // 0xFC in windows-1252 is 'ü'; in IBM852 it's 'Ř'.  The fallback
+        // must NOT relabel IBM852 to windows-1252 when the probe contains
+        // bytes where the two genuinely differ.
+        byte[] probe = "stra\u00DFe".getBytes(ISO8859_1);  // 'ß' = 0xDF
+        // 0xDF in IBM852 is different from 0xDF in 1252 — check byte 0xFC too
+        byte[] probeWithUmlaut = new byte[]{'M', (byte) 0xFC, 'l', 'l', 'e', 
'r'};
+        assertFalse(DecodeEquivalence.byteIdenticalOnProbe(probeWithUmlaut, 
IBM852, WIN1252),
+                "IBM852 'Ř' must not be byte-identical to 1252 'ü'");
+    }
+
+    @Test
+    public void pureAsciiIsByteIdenticalAcrossAllLatinFamily() {
+        byte[] probe = "Hello, world!  No accents here at all.\r\n"
+                .getBytes(StandardCharsets.US_ASCII);
+        assertTrue(DecodeEquivalence.byteIdenticalOnProbe(probe, WIN1257, 
WIN1252));
+        assertTrue(DecodeEquivalence.byteIdenticalOnProbe(probe, WIN1250, 
WIN1252));
+        assertTrue(DecodeEquivalence.byteIdenticalOnProbe(probe, MACROMAN, 
WIN1252));
+    }
+
+    @Test
+    public void win1257EuroSignDiffersFrom1252() {
+        // 0xA4 in windows-1257 is the generic currency sign '¤';
+        // in windows-1252 it is also '¤' — they AGREE here.
+        // But 0xB8 differs: 1257='ø', 1252='¸'.
+        byte[] probe = new byte[]{'t', 'e', 's', 't', (byte) 0xB8};
+        assertFalse(DecodeEquivalence.byteIdenticalOnProbe(probe, WIN1257, 
WIN1252),
+                "0xB8 differs between 1257 and 1252 — must not byte-match");
+    }
+
+    @Test
+    public void sameCharsetIsAlwaysEquivalent() {
+        byte[] probe = "anything at all 
\u00E4\u00F6\u00FC".getBytes(ISO8859_1);
+        assertTrue(DecodeEquivalence.byteIdenticalOnProbe(probe, WIN1252, 
WIN1252));
+    }
+
+    @Test
+    public void emptyProbeIsEquivalentEverywhere() {
+        assertTrue(DecodeEquivalence.byteIdenticalOnProbe(new byte[0], 
WIN1257, WIN1252));
+        assertTrue(DecodeEquivalence.byteIdenticalOnProbe(new byte[0], IBM852, 
WIN1252));
+    }
+}
diff --git 
a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java
 
b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java
index 1308733148..c2659396d2 100644
--- 
a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java
+++ 
b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java
@@ -56,36 +56,126 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
     /** Distinct salt for stride-2 bigrams — prevents collision with stride-1 
hashes. */
     private static final int FNV_STRIDE2_SALT = 0x9e3779b9;
 
+    /**
+     * Number of reserved slots at the high end of the feature vector used for
+     * global (whole-probe) features when {@link #useGlobalFeatures} is 
enabled.
+     * Currently 6 slots hold ASCII-low-byte density bins (see
+     * {@link #asciiDensityBin(byte[])}).
+     */
+    public static final int GLOBAL_FEATURE_COUNT = 6;
+
     private final int numBuckets;
+    private final int hashBuckets;
     private final boolean useUnigrams;
     private final boolean useBigrams;
     private final boolean useTrigrams;
     private final boolean useAnchoredBigrams;
     private final boolean useStride2Bigrams;
+    private final boolean useGlobalFeatures;
+
+    /**
+     * Backwards-compatible constructor (no global features).
+     */
+    public ConfigurableByteNgramFeatureExtractor(int numBuckets,
+                                                 boolean useUnigrams,
+                                                 boolean useBigrams,
+                                                 boolean useTrigrams,
+                                                 boolean useAnchoredBigrams,
+                                                 boolean useStride2Bigrams) {
+        this(numBuckets, useUnigrams, useBigrams, useTrigrams,
+                useAnchoredBigrams, useStride2Bigrams, false);
+    }
 
     /**
-     * @param numBuckets         number of hash buckets (feature-vector 
dimension)
+     * @param numBuckets         total feature-vector dimension.  When
+     *                           {@code useGlobalFeatures} is {@code true}, the
+     *                           last {@link #GLOBAL_FEATURE_COUNT} slots are
+     *                           reserved for global features and hashed n-gram
+     *                           features mod into the first
+     *                           {@code numBuckets - GLOBAL_FEATURE_COUNT} 
slots.
      * @param useUnigrams        emit unigram for each high byte
      * @param useBigrams         emit bigram anchored on each high byte
      * @param useTrigrams        emit trigram anchored on each high byte
      * @param useAnchoredBigrams emit bigram anchored on each low trail byte
      * @param useStride2Bigrams  emit stride-2 bigrams at even positions (all 
bytes)
+     * @param useGlobalFeatures  emit whole-probe global features into the
+     *                           reserved tail slots (ASCII-density bins)
      */
     public ConfigurableByteNgramFeatureExtractor(int numBuckets,
                                                  boolean useUnigrams,
                                                  boolean useBigrams,
                                                  boolean useTrigrams,
                                                  boolean useAnchoredBigrams,
-                                                 boolean useStride2Bigrams) {
+                                                 boolean useStride2Bigrams,
+                                                 boolean useGlobalFeatures) {
         if (numBuckets <= 0) {
             throw new IllegalArgumentException("numBuckets must be positive: " 
+ numBuckets);
         }
+        if (useGlobalFeatures && numBuckets <= GLOBAL_FEATURE_COUNT) {
+            throw new IllegalArgumentException(
+                    "numBuckets must exceed GLOBAL_FEATURE_COUNT (" + 
GLOBAL_FEATURE_COUNT
+                            + ") when useGlobalFeatures=true: " + numBuckets);
+        }
         this.numBuckets = numBuckets;
+        this.hashBuckets = useGlobalFeatures ? numBuckets - 
GLOBAL_FEATURE_COUNT : numBuckets;
         this.useUnigrams = useUnigrams;
         this.useBigrams = useBigrams;
         this.useTrigrams = useTrigrams;
         this.useAnchoredBigrams = useAnchoredBigrams;
         this.useStride2Bigrams = useStride2Bigrams;
+        this.useGlobalFeatures = useGlobalFeatures;
+    }
+
+    /**
+     * Returns which ASCII-text-density bin this probe falls into, in [0, 6).
+     *
+     * <p>Counts only <em>ASCII text bytes</em> — printable (0x20..0x7E) plus
+     * common whitespace (0x09 tab, 0x0A LF, 0x0D CR).  NUL and other control
+     * bytes do <em>not</em> count.  This matters because UTF-16LE/BE probes
+     * contain ~50% 0x00 bytes; if we counted those as "low", UTF-16 English
+     * would look like sparse Latin to the model, defeating the point of the
+     * feature.  With the current definition, real UTF-16 English lands around
+     * the bin 1/2 boundary (~0.50: half ASCII letters, half nulls), well below
+     * plain-ASCII probes (bin 5) and at or above real EBCDIC (bin 0-1).</p>
+     *
+     * <p>Bin layout (fraction of bytes that are ASCII-text):</p>
+     * <ul>
+     *   <li>0: [0.00, 0.10) — effectively no ASCII text (real EBCDIC 
letters)</li>
+     *   <li>1: [0.10, 0.50) — heavy non-ASCII content (CJK text, UTF-16 
mixed)</li>
+     *   <li>2: [0.50, 0.80) — text with dense foreign script, UTF-16 
Latin</li>
+     *   <li>3: [0.80, 0.95) — normal foreign-script text with ASCII 
markup</li>
+     *   <li>4: [0.95, 0.99) — sparse-diacritic Western text</li>
+     *   <li>5: [0.99, 1.00] — near-pure ASCII (vCards, config, scripts)</li>
+     * </ul>
+     */
+    public static int asciiDensityBin(byte[] input) {
+        if (input == null || input.length == 0) {
+            return 5;
+        }
+        int asciiText = 0;
+        for (byte b : input) {
+            int v = b & 0xFF;
+            if ((v >= 0x20 && v <= 0x7E) || v == 0x09 || v == 0x0A || v == 
0x0D) {
+                asciiText++;
+            }
+        }
+        double p = (double) asciiText / input.length;
+        if (p < 0.10) {
+            return 0;
+        }
+        if (p < 0.50) {
+            return 1;
+        }
+        if (p < 0.80) {
+            return 2;
+        }
+        if (p < 0.95) {
+            return 3;
+        }
+        if (p < 0.99) {
+            return 4;
+        }
+        return 5;
     }
 
     @Override
@@ -121,7 +211,7 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
 
             if (useUnigrams) {
                 int h = (FNV_OFFSET ^ bi) * FNV_PRIME;
-                int bkt = (h & 0x7fffffff) % numBuckets;
+                int bkt = (h & 0x7fffffff) % hashBuckets;
                 if (dense[bkt] == 0) {
                     touched[n++] = bkt;
                 }
@@ -134,7 +224,7 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
                 if (useBigrams) {
                     int h = (FNV_OFFSET ^ bi) * FNV_PRIME;
                     h = (h ^ bi1) * FNV_PRIME;
-                    int bkt = (h & 0x7fffffff) % numBuckets;
+                    int bkt = (h & 0x7fffffff) % hashBuckets;
                     if (dense[bkt] == 0) {
                         touched[n++] = bkt;
                     }
@@ -146,7 +236,7 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
                     if (i + 2 < input.length) {
                         h = (h ^ (input[i + 2] & 0xFF)) * FNV_PRIME;
                     }
-                    int bkt = (h & 0x7fffffff) % numBuckets;
+                    int bkt = (h & 0x7fffffff) % hashBuckets;
                     if (dense[bkt] == 0) {
                         touched[n++] = bkt;
                     }
@@ -158,7 +248,7 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
                     int h = (FNV_OFFSET ^ bi) * FNV_PRIME;
                     h = (h ^ bi1) * FNV_PRIME;
                     h = (h ^ bi2) * FNV_PRIME;
-                    int bkt = (h & 0x7fffffff) % numBuckets;
+                    int bkt = (h & 0x7fffffff) % hashBuckets;
                     if (dense[bkt] == 0) {
                         touched[n++] = bkt;
                     }
@@ -174,7 +264,7 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
                 int b1 = input[i + 1] & 0xFF;
                 int h = (FNV_STRIDE2_SALT ^ b0) * FNV_PRIME;
                 h = (h ^ b1) * FNV_PRIME;
-                int bkt = (h & 0x7fffffff) % numBuckets;
+                int bkt = (h & 0x7fffffff) % hashBuckets;
                 if (dense[bkt] == 0) {
                     touched[n++] = bkt;
                 }
@@ -182,6 +272,15 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
             }
         }
 
+        // Global features at reserved tail slots: fire exactly one 
ASCII-density bin.
+        if (useGlobalFeatures) {
+            int bkt = hashBuckets + asciiDensityBin(input);
+            if (dense[bkt] == 0) {
+                touched[n++] = bkt;
+            }
+            dense[bkt]++;
+        }
+
         return n;
     }
 
@@ -234,10 +333,17 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
                 counts[bucket(h)]++;
             }
         }
+
+        // Global features at reserved tail slots: fire exactly one 
ASCII-density bin.
+        if (useGlobalFeatures) {
+            byte[] slice = (from == 0 && to == b.length)
+                    ? b : java.util.Arrays.copyOfRange(b, from, to);
+            counts[hashBuckets + asciiDensityBin(slice)]++;
+        }
     }
 
     private int bucket(int hash) {
-        return (hash & 0x7fffffff) % numBuckets;
+        return (hash & 0x7fffffff) % hashBuckets;
     }
 
     @Override
@@ -248,7 +354,8 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
     @Override
     public String toString() {
         return String.format(java.util.Locale.ROOT,
-                "ConfigurableByteNgramFeatureExtractor{buckets=%d, uni=%b, 
bi=%b, tri=%b, anchored=%b, stride2=%b}",
-                numBuckets, useUnigrams, useBigrams, useTrigrams, 
useAnchoredBigrams, useStride2Bigrams);
+                "ConfigurableByteNgramFeatureExtractor{buckets=%d, hash=%d, 
uni=%b, bi=%b, tri=%b, anchored=%b, stride2=%b, globals=%b}",
+                numBuckets, hashBuckets, useUnigrams, useBigrams, useTrigrams,
+                useAnchoredBigrams, useStride2Bigrams, useGlobalFeatures);
     }
 }
diff --git 
a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java
 
b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java
index 9fd35ab7df..7a38d3bce9 100644
--- 
a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java
+++ 
b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java
@@ -81,6 +81,7 @@ public class TrainCharsetModel {
         boolean useTrigrams = true;
         boolean useAnchoredBigrams = false;
         boolean useStride2Bigrams = true;
+        boolean useGlobalFeatures = false;
         // --label-remap src1:dst1,src2:dst2 — merges multiple source labels 
into
         // one target label at training time (e.g. merge script variants into 
one class).
         Map<String, String> labelRemap = new HashMap<>();
@@ -140,6 +141,12 @@ public class TrainCharsetModel {
                 case "--no-stride2":
                     useStride2Bigrams = false;
                     break;
+                case "--globals":
+                    useGlobalFeatures = true;
+                    break;
+                case "--no-globals":
+                    useGlobalFeatures = false;
+                    break;
                 case "--exclude":
                     for (String label : args[++i].split(",")) {
                         excludeLabels.add(label.trim());
@@ -164,6 +171,7 @@ public class TrainCharsetModel {
             System.err.println("  --tri / --no-tri         enable/disable 
trigram features (default: on)");
             System.err.println("  --anchored / --no-anchored  anchored bigrams 
(default: off)");
             System.err.println("  --stride2 / --no-stride2    stride-2 bigrams 
at even positions (default: on)");
+            System.err.println("  --globals / --no-globals    emit global 
ASCII-density bin features (default: off)");
             System.err.println("  --exclude cs1,cs2          skip these 
charset labels (e.g. UTF-32-BE,UTF-32-LE)");
             System.exit(1);
         }
@@ -211,12 +219,14 @@ public class TrainCharsetModel {
                 "Buckets: %d  epochs: %d  lr: %.4f  max-samples/class: %d%n",
                 numBuckets, epochs, lr, maxSamplesPerClass);
         System.out.printf(java.util.Locale.ROOT,
-                "Features: uni=%b  bi=%b  tri=%b  anchored=%b  stride2=%b%n",
-                useUnigrams, useBigrams, useTrigrams, useAnchoredBigrams, 
useStride2Bigrams);
+                "Features: uni=%b  bi=%b  tri=%b  anchored=%b  stride2=%b  
globals=%b%n",
+                useUnigrams, useBigrams, useTrigrams, useAnchoredBigrams, 
useStride2Bigrams,
+                useGlobalFeatures);
 
         ConfigurableByteNgramFeatureExtractor extractor =
                 new ConfigurableByteNgramFeatureExtractor(numBuckets,
-                        useUnigrams, useBigrams, useTrigrams, 
useAnchoredBigrams, useStride2Bigrams);
+                        useUnigrams, useBigrams, useTrigrams, 
useAnchoredBigrams,
+                        useStride2Bigrams, useGlobalFeatures);
 
         // Build class index map
         Map<String, Integer> labelIndex = new HashMap<>();
@@ -281,8 +291,6 @@ public class TrainCharsetModel {
                 // Sparse extraction: O(probeLength), not O(numBuckets)
                 int nActive = extractor.extractSparseInto(sample, 
denseScratch, touched);
 
-                // L1 normalization: compute sum of feature counts so each sample
-                // contributes equal total mass regardless of encoding density.
                 // Forward pass: only iterate active buckets
                 float[] logits = new float[numClasses];
                 for (int c = 0; c < numClasses; c++) {
diff --git a/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ConfigurableGlobalFeatureTest.java b/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ConfigurableGlobalFeatureTest.java
new file mode 100644
index 0000000000..c40ef78075
--- /dev/null
+++ b/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ConfigurableGlobalFeatureTest.java
@@ -0,0 +1,160 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.chardetect;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.ml.chardetect.tools.ConfigurableByteNgramFeatureExtractor;
+
+public class ConfigurableGlobalFeatureTest {
+
+    // Total feature-vector width handed to every extractor built by this test.
+    private static final int NUM_BUCKETS = 16384;
+    // Index of the first tail slot: the total width minus the extractor's
+    // GLOBAL_FEATURE_COUNT.  The tests below treat any touched index at or
+    // above this value as a global-feature firing.
+    private static final int HASH_BUCKETS = NUM_BUCKETS
+            - ConfigurableByteNgramFeatureExtractor.GLOBAL_FEATURE_COUNT;
+
+    /**
+     * Builds an extractor over {@code NUM_BUCKETS} buckets with the final
+     * constructor flag (global features) switched on.
+     *
+     * <p>Going by the argument order TrainCharsetModel uses for this
+     * constructor, the flags read: unigrams on, bigrams on, trigrams off,
+     * anchored bigrams off, stride-2 bigrams on, globals on.
+     */
+    private static ConfigurableByteNgramFeatureExtractor withGlobals() {
+        return new ConfigurableByteNgramFeatureExtractor(
+                NUM_BUCKETS, true, true, false, false, true, true);
+    }
+
+    /**
+     * Builds an extractor over {@code NUM_BUCKETS} buckets with the final
+     * constructor flag (global features) switched off; every other flag is
+     * the same as in {@code withGlobals()}.
+     */
+    private static ConfigurableByteNgramFeatureExtractor withoutGlobals() {
+        return new ConfigurableByteNgramFeatureExtractor(
+                NUM_BUCKETS, true, true, false, false, true, false);
+    }
+
+    /**
+     * A probe made only of ASCII bytes (a minimal vCard, CR/LF included) must
+     * be assigned ASCII-density bin 5.
+     */
+    @Test
+    public void pureAsciiLandsInTopBin() {
+        assertEquals(5, ConfigurableByteNgramFeatureExtractor.asciiDensityBin(
+                "BEGIN:VCARD\r\nVERSION:3.0\r\nEND:VCARD\r\n".getBytes(StandardCharsets.US_ASCII)));
+    }
+
+    /**
+     * Mostly-ASCII Latin-1 text with a handful of accented letters must still
+     * be assigned one of the two highest density bins (4 or 5).
+     */
+    @Test
+    public void sparseLatinVcardLandsInTopBin() {
+        // 65-byte vCard with 3 high bytes (0xFC twice, 0xF6 once): roughly 95%
+        // of the bytes are below 0x80.
+        byte[] probe = "BEGIN:VCARD\r\nN:M\u00FCller;Hans\r\nFN:Hans M\u00FCller\r\nADR:K\u00F6ln\r\nEND:VCARD\r\n"
+                .getBytes(StandardCharsets.ISO_8859_1);
+        int bin = ConfigurableByteNgramFeatureExtractor.asciiDensityBin(probe);
+        assertTrue(bin >= 4, "sparse-Latin vCard should land in bin 4 or 5, got: " + bin);
+    }
+
+    /**
+     * A synthetic EBCDIC-like probe (20 bytes of 0x40 followed by 80 bytes
+     * cycling through 0x81..0x89) must be assigned a low bin (0-2), well away
+     * from bin 5.
+     */
+    @Test
+    public void ebcdicTextLandsInLowBin() {
+        // Real EBCDIC: letters 0x81..0xE9 (~80%), 0x40 space (~20%).  The probe
+        // below approximates that mix using only the letters 0x81..0x89.
+        // Under the ASCII-text bin definition, 0x40 IS printable ASCII ('@'),
+        // so EBCDIC lands in bin 1, not bin 0.  What matters is that it's
+        // cleanly separated from the plain-ASCII bin 5.
+        byte[] ebcdic = new byte[100];
+        int p = 0;
+        for (int i = 0; i < 20; i++) {
+            ebcdic[p++] = 0x40;  // space
+        }
+        for (int i = 0; i < 80; i++) {
+            ebcdic[p++] = (byte) (0x81 + (i % 9));  // letters
+        }
+        int bin = ConfigurableByteNgramFeatureExtractor.asciiDensityBin(ebcdic);
+        assertTrue(bin <= 2, "EBCDIC should land in bin 0-2, got: " + bin);
+        // Already implied by the bound above; kept as an explicit statement of intent.
+        assertNotEquals(5, bin, "EBCDIC must not collide with the ASCII bin");
+    }
+
+    /**
+     * English text encoded as UTF-16LE must be assigned density bin 2.
+     */
+    @Test
+    public void utf16LeEnglishLandsInMiddleBin() {
+        // UTF-16LE encoding of ASCII-only English text — every other byte is 0x00
+        byte[] utf16 = "Hello, world! This is English text in UTF-16LE."
+                .getBytes(Charset.forName("UTF-16LE"));
+        int bin = ConfigurableByteNgramFeatureExtractor.asciiDensityBin(utf16);
+        // assertEquals reports expected vs. actual on failure, unlike assertTrue(bin == 2).
+        assertEquals(2, bin, "UTF-16LE English should land in bin 2 (~50%), got: " + bin);
+    }
+
+    /**
+     * Non-Latin BMP text (two CJK ideographs followed by eight Hiragana)
+     * encoded as UTF-16LE must be assigned bin 2 or higher.
+     */
+    @Test
+    public void utf16LeBmpTextLandsInMidHighBin() {
+        // UTF-16LE of BMP text (Hiragana U+3040..U+309F etc.) — note that the
+        // "high byte of the codepoint" (0x30 here) is printable ASCII '0', and
+        // the "low byte" of most Hiragana falls in 0x40..0x9F — half printable.
+        // So UTF-16LE BMP text has a HIGH printable-ASCII-byte fraction despite
+        // not being ASCII text.  The global feature does not try to distinguish
+        // UTF-16 from ASCII — that's stride-2's job.  This test documents the
+        // observed behaviour so it isn't mistaken for a bug later.
+        byte[] utf16 = "\u6587\u7AE0\u3042\u3044\u3046\u3048\u304A\u304B\u304D\u304F"
+                .getBytes(Charset.forName("UTF-16LE"));
+        int bin = ConfigurableByteNgramFeatureExtractor.asciiDensityBin(utf16);
+        assertTrue(bin >= 2, "UTF-16LE BMP text has many printable bytes, got bin: " + bin);
+    }
+
+    /**
+     * With globals enabled, sparse extraction over a pure-ASCII probe must
+     * touch exactly one index at or above {@code HASH_BUCKETS}: the slot for
+     * density bin 5, with a count of 1.
+     */
+    @Test
+    public void globalFeatureFiresExactlyOneTailSlot() {
+        ConfigurableByteNgramFeatureExtractor ext = withGlobals();
+        int[] dense = new int[NUM_BUCKETS];
+        int[] touched = new int[NUM_BUCKETS];
+
+        int n = ext.extractSparseInto(
+                "Plain ASCII text with no accents at all.".getBytes(StandardCharsets.US_ASCII),
+                dense, touched);
+
+        // Only the first n entries of touched are inspected.
+        int tailFirings = 0;
+        int tailSlot = -1;
+        for (int i = 0; i < n; i++) {
+            if (touched[i] >= HASH_BUCKETS) {
+                tailFirings++;
+                tailSlot = touched[i];
+            }
+        }
+        assertEquals(1, tailFirings, "exactly one global tail slot must fire");
+        assertEquals(HASH_BUCKETS + 5, tailSlot, "pure ASCII should fire bin 5");
+        assertEquals(1, dense[tailSlot], "count for global bin must be 1");
+    }
+
+    /**
+     * With globals disabled, checks the indices reported by sparse extraction
+     * over a short ASCII probe.
+     *
+     * <p>NOTE(review): the bound asserted below is {@code NUM_BUCKETS}, which is
+     * also the length of the {@code touched} and {@code dense} arrays, so it
+     * looks like it cannot distinguish tail slots from hash slots.  Whether
+     * {@code HASH_BUCKETS} is the right bound depends on how the extractor
+     * sizes its hash range when globals are off — confirm against
+     * ConfigurableByteNgramFeatureExtractor.
+     */
+    @Test
+    public void disablingGlobalsLeavesTailEmpty() {
+        ConfigurableByteNgramFeatureExtractor ext = withoutGlobals();
+        int[] dense = new int[NUM_BUCKETS];
+        int[] touched = new int[NUM_BUCKETS];
+
+        int n = ext.extractSparseInto(
+                "Plain ASCII text".getBytes(StandardCharsets.US_ASCII),
+                dense, touched);
+
+        for (int i = 0; i < n; i++) {
+            assertTrue(touched[i] < NUM_BUCKETS,
+                    "all firings must be in hash range when globals are off");
+        }
+    }
+
+    /**
+     * The dense {@code extract} path and the sparse {@code extractSparseInto}
+     * path must produce the same count in every bucket for a Latin-1 probe
+     * when globals are enabled.
+     */
+    @Test
+    public void sparseAndDenseExtractionAgreeWithGlobals() {
+        ConfigurableByteNgramFeatureExtractor ext = withGlobals();
+        byte[] probe = "r\u00E9sum\u00E9 caf\u00E9 cr\u00E8me br\u00FBl\u00E9e"
+                .getBytes(StandardCharsets.ISO_8859_1);
+
+        int[] dense = ext.extract(probe);
+
+        int[] sparseDense = new int[NUM_BUCKETS];
+        int[] touched = new int[NUM_BUCKETS];
+        ext.extractSparseInto(probe, sparseDense, touched);
+
+        // Compare the full width, hash slots and tail slots alike.
+        for (int i = 0; i < NUM_BUCKETS; i++) {
+            assertEquals(dense[i], sparseDense[i],
+                    "bucket " + i + " differs between dense and sparse paths");
+        }
+    }
+}

(tika) 01/03: narrow per-probe decode-equivalence for win-1252

Reply via email to