(tika) branch main updated: TIKA-4745-follow-on-junk-improvements (#2872)

tallison Fri, 05 Jun 2026 09:46:11 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git



The following commit(s) were added to refs/heads/main by this push:
     new d1e81e39b4 TIKA-4745-follow-on-junk-improvements (#2872)
d1e81e39b4 is described below

commit d1e81e39b4ce210ee3f982462e2af18816950101
Author: Tim Allison <[email protected]>
AuthorDate: Fri Jun 5 12:45:57 2026 -0400

    TIKA-4745-follow-on-junk-improvements (#2872)
---
 .../tika/ml/chardetect/CjkDecodeValidator.java     |   5 +-
 .../tika/ml/chardetect/CosineFamilyArbiter.java    | 241 ------------------
 .../apache/tika/ml/chardetect/ScoredCandidate.java |  56 -----
 .../apache/tika/ml/chardetect/cosine-profiles.bin  | Bin 1080313 -> 0 bytes
 .../apache/tika/ml/junkdetect/JunkDetector.java    | 280 +++++++++++++++------
 .../ml/junkdetect/JunkFilterEncodingDetector.java  |   5 +-
 .../tika/ml/junkdetect/TextQualityFeatures.java    |  31 ++-
 .../ml/junkdetect/tools/BuildJunkTrainingData.java |  17 +-
 .../tika/ml/junkdetect/tools/TrainJunkModel.java   | 118 +++++++--
 .../org/apache/tika/ml/junkdetect/junkdetect.bin   | Bin 2321862 -> 2316809 
bytes
 .../DecodeCorruptionDiscriminationTest.java        | 172 +++++++++++++
 .../ml/junkdetect/JunkDetectorRoundTripTest.java   | 132 ++++++++--
 .../org/apache/tika/parser/html/JSoupParser.java   |   4 +
 .../apache/tika/parser/html/HtmlParserTest.java    |  36 +++
 14 files changed, 672 insertions(+), 425 deletions(-)

diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CjkDecodeValidator.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CjkDecodeValidator.java
index 4c7254c6ee..cf4e4e6554 100644
--- 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CjkDecodeValidator.java
+++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CjkDecodeValidator.java
@@ -68,10 +68,7 @@ public final class CjkDecodeValidator {
      *         little legacy evidence (legacy high bytes &lt; {@link 
#MIN_HIGH_BYTES})
      */
     public static double strippedFailureRate(byte[] bytes, Charset cjkCharset) 
{
-        Charset decodeAs = CharsetSupersets.supersetOf(cjkCharset);
-        if (decodeAs == null) {
-            decodeAs = cjkCharset;
-        }
+        Charset decodeAs = CharsetSupersets.decodeAs(cjkCharset);
         CharsetDecoder dec = decodeAs.newDecoder()
                 .onMalformedInput(CodingErrorAction.REPORT)
                 .onUnmappableCharacter(CodingErrorAction.REPORT);
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CosineFamilyArbiter.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CosineFamilyArbiter.java
deleted file mode 100644
index 057bba4a92..0000000000
--- 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CosineFamilyArbiter.java
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.ml.chardetect;
-
-import java.io.DataInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.Charset;
-import java.nio.charset.IllegalCharsetNameException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-
-import org.apache.tika.detect.EncodingResult;
-
-/**
- * Family-level guard over the NB statistical pick, defending against the
- * single-byte-&rarr;CJK collision (Cyrillic / Greek / accented-Latin content
- * whose high bytes coincide with legal GBK lead/trail pairs and accumulate
- * spurious GB18030 / Big5 likelihood under the multinomial NB).
- *
- * <p>Two complementary, model-light signals, both blind to NB:</p>
- * <ul>
- *   <li><b>high-byte cosine</b> &mdash; cosine between the probe's high-byte
- *       (&ge; 0x80) byte-bigram occupancy and each class's control-stripped
- *       high-byte profile.  Direction-based, so length/density-invariant; the
- *       ASCII quadrant is dropped so shared English text can't dominate.  When
- *       NB picks a CJK class but the cosine argmax is non-CJK (with enough
- *       high-byte evidence), the CJK pick is vetoed.</li>
- *   <li><b>GBK illegality</b> &mdash; fraction of high-byte lead bytes that do
- *       not begin a valid GBK 2-byte or GB18030 4-byte sequence.  A genuine
- *       GB18030 document is ~0% illegal; Cyrillic/Greek forced through GBK
- *       throws illegal trails.  Scoped to GB18030 only (it says nothing about
- *       Shift_JIS/EUC).</li>
- * </ul>
- *
- * <p>On veto the CJK pick is replaced by the best non-CJK candidate (by cosine
- * when evidence is sufficient, else the highest-ranked non-CJK NB candidate);
- * real-CJK picks are left untouched (cosine argmax stays CJK, illegality ~0),
- * so the guard is regression-safe for genuine CJK.</p>
- */
-public final class CosineFamilyArbiter {
-
-    /** Minimum high-byte bigram count before the cosine veto is trusted. */
-    public static final int MIN_HIGH_BYTE_SUPPORT = 15;
-
-    /** GBK-illegality fraction above which a GB18030 pick is refuted. */
-    public static final double GBK_ILLEGAL_THRESHOLD = 0.02;
-
-    private static final String GB18030 = "GB18030";
-
-    private final String[] names;
-    private final boolean[] cjk;
-    private final Charset[] charsets;   // resolved JVM charset, null if 
unsupported
-    private final int[][] bigramIds;
-    private final float[][] weights;   // L2-normalized per class
-
-    public CosineFamilyArbiter(InputStream in) throws IOException {
-        try (DataInputStream dis = new DataInputStream(in)) {
-            int nc = dis.readInt();
-            names = new String[nc];
-            cjk = new boolean[nc];
-            charsets = new Charset[nc];
-            bigramIds = new int[nc][];
-            weights = new float[nc][];
-            for (int c = 0; c < nc; c++) {
-                names[c] = dis.readUTF();
-                cjk[c] = isCjkName(names[c]);
-                charsets[c] = resolve(names[c]);
-                int nnz = dis.readInt();
-                int[] ids = new int[nnz];
-                float[] w = new float[nnz];
-                for (int k = 0; k < nnz; k++) {
-                    ids[k] = dis.readUnsignedShort();
-                    w[k] = dis.readFloat();
-                }
-                bigramIds[c] = ids;
-                weights[c] = w;
-            }
-        }
-    }
-
-    private static Charset resolve(String name) {
-        try {
-            return Charset.isSupported(name) ? Charset.forName(name) : null;
-        } catch (IllegalCharsetNameException e) {
-            return null;
-        }
-    }
-
-    static boolean isCjkName(String name) {
-        String n = name.toLowerCase(Locale.ROOT);
-        return n.contains("gb") || n.contains("big5") || n.contains("euc")
-                || n.contains("shift") || n.contains("jis") || 
n.contains("2022")
-                || n.contains("949");
-    }
-
-    /**
-     * Apply the family guard to NB's ranked candidates.  Returns {@code
-     * nbResults} unchanged unless NB's top pick is CJK and a veto fires, in
-     * which case a non-CJK replacement is promoted to the front.
-     */
-    public List<EncodingResult> arbitrate(byte[] probe, List<EncodingResult> 
nbResults) {
-        if (nbResults == null || nbResults.isEmpty()) {
-            return nbResults;
-        }
-        if (!isCjkName(nbResults.get(0).getCharset().name())) {
-            return nbResults;
-        }
-        // Build high-byte bigram occupancy.
-        Map<Integer, Integer> docMap = new HashMap<>();
-        long support = 0;
-        for (int i = 0; i + 1 < probe.length; i++) {
-            int b0 = probe[i] & 0xFF;
-            int b1 = probe[i + 1] & 0xFF;
-            if (b0 >= 0x80 || b1 >= 0x80) {
-                int bg = (b0 << 8) | b1;
-                docMap.merge(bg, 1, Integer::sum);
-                support++;
-            }
-        }
-        double docNorm = 0;
-        for (int v : docMap.values()) {
-            docNorm += (double) v * v;
-        }
-        docNorm = Math.sqrt(docNorm);
-
-        boolean gbkTop = GB18030.equals(nbResults.get(0).getCharset().name());
-        double illegal = gbkIllegalRate(probe);
-
-        int cosArg = -1;
-        double bestCos = -1;
-        double[] cos = new double[names.length];
-        if (docNorm > 0) {
-            for (int c = 0; c < names.length; c++) {
-                double dot = 0;
-                int[] ids = bigramIds[c];
-                float[] w = weights[c];
-                for (int k = 0; k < ids.length; k++) {
-                    Integer dc = docMap.get(ids[k]);
-                    if (dc != null) {
-                        dot += w[k] * dc;
-                    }
-                }
-                cos[c] = dot / docNorm;
-                if (cos[c] > bestCos) {
-                    bestCos = cos[c];
-                    cosArg = c;
-                }
-            }
-        }
-
-        boolean veto = (gbkTop && illegal > GBK_ILLEGAL_THRESHOLD)
-                || (support >= MIN_HIGH_BYTE_SUPPORT && cosArg >= 0 && 
!cjk[cosArg]);
-        if (!veto) {
-            return nbResults;
-        }
-
-        // Choose replacement: best non-CJK by cosine when evidence is
-        // sufficient, else the highest-ranked non-CJK NB candidate.
-        Charset replacement = null;
-        if (support >= MIN_HIGH_BYTE_SUPPORT && docNorm > 0) {
-            double bv = -1;
-            for (int c = 0; c < names.length; c++) {
-                if (!cjk[c] && charsets[c] != null && cos[c] > bv) {
-                    bv = cos[c];
-                    replacement = charsets[c];
-                }
-            }
-        }
-        float conf = nbResults.get(0).getConfidence();
-        List<EncodingResult> out = new ArrayList<>(nbResults.size() + 1);
-        if (replacement != null) {
-            out.add(new EncodingResult(replacement, conf, replacement.name(),
-                    EncodingResult.ResultType.STATISTICAL));
-        }
-        for (EncodingResult r : nbResults) {
-            if (isCjkName(r.getCharset().name())) {
-                continue;
-            }
-            if (replacement != null && 
r.getCharset().name().equals(replacement.name())) {
-                continue;
-            }
-            out.add(r);
-        }
-        // If we couldn't form any non-CJK candidate, don't strand the caller
-        // with an empty list — leave NB's result untouched.
-        return out.isEmpty() ? nbResults : out;
-    }
-
-    /**
-     * Fraction of high-byte lead bytes that fail to begin a valid GBK 2-byte
-     * or GB18030 4-byte sequence.  0 for genuine GB18030.
-     */
-    static double gbkIllegalRate(byte[] b) {
-        int n = b.length;
-        int i = 0;
-        int illegal = 0;
-        int lead = 0;
-        while (i < n) {
-            int c = b[i] & 0xFF;
-            if (c < 0x80) {
-                i++;
-                continue;
-            }
-            lead++;
-            if (c >= 0x81 && c <= 0xFE && i + 1 < n) {
-                int t = b[i + 1] & 0xFF;
-                if (((t >= 0x40 && t <= 0x7E) || (t >= 0x80 && t <= 0xFE)) && 
t != 0x7F) {
-                    i += 2;
-                    continue;
-                }
-                if (t >= 0x30 && t <= 0x39 && i + 3 < n
-                        && (b[i + 2] & 0xFF) >= 0x81 && (b[i + 2] & 0xFF) <= 
0xFE
-                        && (b[i + 3] & 0xFF) >= 0x30 && (b[i + 3] & 0xFF) <= 
0x39) {
-                    i += 4;
-                    continue;
-                }
-            }
-            illegal++;
-            i++;
-        }
-        return lead == 0 ? 0 : (double) illegal / lead;
-    }
-}
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ScoredCandidate.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ScoredCandidate.java
deleted file mode 100644
index 60564bf884..0000000000
--- 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ScoredCandidate.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.ml.chardetect;
-
-import java.util.Collections;
-import java.util.LinkedHashSet;
-import java.util.Set;
-
-/**
- * Pooled candidate from {@link LogLinearCombiner}: label, raw summed score
- * (larger is better, not normalized), and the specialists that contributed.
- */
-public final class ScoredCandidate {
-
-    private final String label;
-    private final float score;
-    private final Set<String> contributingSpecialists;
-
-    public ScoredCandidate(String label, float score, Set<String> 
contributingSpecialists) {
-        this.label = label;
-        this.score = score;
-        this.contributingSpecialists =
-                Collections.unmodifiableSet(new 
LinkedHashSet<>(contributingSpecialists));
-    }
-
-    public String getLabel() {
-        return label;
-    }
-
-    public float getScore() {
-        return score;
-    }
-
-    public Set<String> getContributingSpecialists() {
-        return contributingSpecialists;
-    }
-
-    @Override
-    public String toString() {
-        return "ScoredCandidate{" + label + "=" + score + " from " + 
contributingSpecialists + "}";
-    }
-}
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/cosine-profiles.bin
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/cosine-profiles.bin
deleted file mode 100644
index 646a7e7923..0000000000
Binary files 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/cosine-profiles.bin
 and /dev/null differ
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
index c231494d60..2f117479d0 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
@@ -353,6 +353,16 @@ public final class JunkDetector implements 
TextQualityDetector {
                 f1TablesByScript.put(script, BigramTables.readFrom(dis));
             }
 
+            requireUsableSigma("scriptTransition", 
scriptTransitionCalibration);
+            requireUsableSigma("block", blockCalibration);
+            requireUsableSigma("control", controlCalibration);
+            requireUsableSigma("z5", z5Calibration);
+            requireUsableSigma("z6", z6Calibration);
+            requireUsableSigma("z9", z9Calibration);
+            for (Map.Entry<String, float[]> e : calibrations.entrySet()) {
+                requireUsableSigma("z1[" + e.getKey() + "]", e.getValue());
+            }
+
             return new JunkDetector(calibrations,
                     blockTable, blockTableQuant, blockCalibration,
                     controlCalibration, combinerWeights,
@@ -363,6 +373,22 @@ public final class JunkDetector implements 
TextQualityDetector {
         }
     }
 
+    /**
+     * Validates a calibration {@code {mu, sigma}} from the model file: sigma 
is the
+     * divisor in every z-score, so it must be finite and &gt; 0.  Single 
enforcement
+     * point for that invariant -- inference divides without re-checking.
+     */
+    static void requireUsableSigma(String name, float[] calibration) throws 
IOException {
+        boolean ok = calibration != null && calibration.length >= 2
+                && Float.isFinite(calibration[1]) && calibration[1] > 0f;
+        if (!ok) {
+            String sigma = (calibration == null || calibration.length < 2)
+                    ? "absent" : Float.toString(calibration[1]);
+            throw new IOException("Invalid model: " + name
+                    + " calibration sigma must be finite and > 0 but was " + 
sigma);
+        }
+    }
+
     /** Read {@code size} big-endian int16 values as a short[]. */
     private static short[] readShortTable(DataInputStream dis, int size) 
throws IOException {
         byte[] raw = dis.readNBytes(size * 2);
@@ -463,8 +489,12 @@ public final class JunkDetector implements 
TextQualityDetector {
      * produces the logit.  No runs, no sentinels, no per-script z2/z3.
      */
     private Agg aggregate(String text) {
-        // NFC-normalize so inference matches the trainer's tally.
-        text = java.text.Normalizer.normalize(text, 
java.text.Normalizer.Form.NFC);
+        // NFKC-normalize so inference matches the trainer's tally AND legacy
+        // compatibility forms fold to canonical — critically half-width 
katakana
+        // (U+FF66-FF9F) -> full-width, which the HAN bigram table is trained 
on.
+        // Without this, half-width-katakana pages (Shift_JIS-era Japanese
+        // e-commerce) floor every bigram as "unseen" (z1 ~-9, false-junk).
+        text = java.text.Normalizer.normalize(text, 
java.text.Normalizer.Form.NFKC);
         int[] cps = text.codePoints().toArray();
 
         Map<String, double[]> buckets = new HashMap<>(); // script -> 
{sumLogP, count}
@@ -522,12 +552,15 @@ public final class JunkDetector implements 
TextQualityDetector {
         agg.totalBigrams = (int) z1Count;
 
         if (z1Count == 0 || agg.dominantScript == null) {
-            // No scoreable script — doc-level fallback (same anchors as 
before):
+            // No scoreable LETTER at all (zero runs) — doc-level fallback:
             // density=0 -> very negative; density=1 coherence=1 -> positive
             // (unmodeled coherent script); density=1 coherence=0 -> very 
negative.
+            // z6 is included so a letter-free but FFFD/anomaly-heavy doc 
(which can
+            // no longer flood z1) is still penalized here — no path ignores 
the
+            // replacement ratio.  For unmodeled-but-clean script z6~0, so 
it's inert.
             agg.dominantScript = null;
             agg.z1 = Float.NaN;
-            agg.logit = -7f + 4f * agg.z7 + 6f * agg.z8;
+            agg.logit = -7f + 4f * agg.z7 + 6f * agg.z8 + 4f * agg.z6;
             return agg;
         }
         agg.z1 = (float) (weightedZ1 / z1Count);
@@ -635,7 +668,7 @@ public final class JunkDetector implements 
TextQualityDetector {
      */
     public float computeZ5LetterAdjacentToMarkRatio(String text) {
         double raw = TextQualityFeatures.letterAdjacentToMarkRatio(text);
-        if (Double.isNaN(raw) || z5Calibration == null || z5Calibration[1] <= 
0) {
+        if (Double.isNaN(raw) || z5Calibration == null) {
             return 0f;
         }
         return ((float) raw - z5Calibration[0]) / z5Calibration[1];
@@ -651,7 +684,7 @@ public final class JunkDetector implements 
TextQualityDetector {
      */
     public float computeZ6ReplacementRatio(String text) {
         double raw = TextQualityFeatures.replacementRatio(text);
-        if (Double.isNaN(raw) || z6Calibration == null || z6Calibration[1] <= 
0) {
+        if (Double.isNaN(raw) || z6Calibration == null) {
             return 0f;
         }
         // Flip sign: higher replacement = lower quality, so feature is
@@ -669,7 +702,7 @@ public final class JunkDetector implements 
TextQualityDetector {
      */
     public float computeZ9AlternationRatio(String text) {
         double raw = TextQualityFeatures.scriptAlternationRatio(text);
-        if (Double.isNaN(raw) || z9Calibration == null || z9Calibration[1] <= 
0) {
+        if (Double.isNaN(raw) || z9Calibration == null) {
             return 0f;
         }
         // Higher alternation = junkier; (mu - raw) / sigma so clean text → 
positive z9.
@@ -973,15 +1006,73 @@ public final class JunkDetector implements 
TextQualityDetector {
      * once when scanning the text (avoiding a redundant binary search per
      * codepoint).
      */
+    /** Small per-bigram log-prob penalty subtracted from the case-folded
+     *  (lowercase) value when scoring an uppercase pair.  All-caps is a 
genuinely
+     *  weaker/rarer signal than lowercase, so it should score a hair BELOW its
+     *  lowercase form, not equal to it — and the margin guards the edge case 
where
+     *  an all-caps *mojibake* decode whose lowercase twin happens to be a seen
+     *  bigram would otherwise score like real lowercase text.  Kept small 
(0.25):
+     *  the lowercase/junk margin is ~0.8 logit, and δ=0.5 thinned it to ~0.1, 
so
+     *  0.25 leaves all-caps clearly clean (~0.5 above junk) while honoring the
+     *  "somewhat less languagey" principle. */
+    private static final double CASE_FOLD_PENALTY = 0.25;
+
     private static double scorePairF1(int cpA, int idxA, int cpB, int idxB,
                                          BigramTables tables) {
+        double direct = Double.NaN;
         if (idxA >= 0 && idxB >= 0) {
             int slot = lookupBigramSlot(tables, idxA, idxB);
             if (slot >= 0) {
-                return dequantize(tables.bigramValues[slot],
+                direct = dequantize(tables.bigramValues[slot],
                         tables.bigramQuantMin, tables.bigramQuantMax);
             }
         }
+        // Case-folded backoff: an ALL-UPPERCASE pair that is the case variant 
of
+        // a SEEN lowercase pair is real text wearing a different case 
(all-caps
+        // headings / emphasis, e.g. Greek "ΚΑΤΑΛΟΓΟΣ", Russian "МУЗЕЙ"), NOT 
junk.
+        // Score it as the BETTER of its own log-prob and its lowercase twin's 
—
+        // i.e. max(direct, fold).  max (not fold-only-on-miss) is essential: 
real
+        // all-caps bigrams ARE present in training (from headings) but rare, 
so the
+        // direct lookup hits a low value (МУ −12.4 vs lowercase му −6.7) and 
would
+        // otherwise bypass the fold and floor.  This is the discriminator raw
+        // probability cannot be: all-caps real text and all-caps mojibake are 
both
+        // improbable, but only real text has a SEEN lowercase twin.  The fold
+        // applies only to case-CONSISTENT pairs (exact gate below), so
+        // alternating-case junk ("tHiS") stays unfolded and floors; and the
+        // lowercase twin's value is borrowed only when that pair is actually
+        // seen, so all-caps mojibake (lowercase form also unseen) floors.
+        // Gate = "at least one uppercase letter AND no LOWERCASE letter" — so 
it
+        // folds both an interior all-caps pair (МУ) AND an edge pair where 
the other
+        // side is a sentinel or glue (^М, Й$, "М."), but NOT a mixed-case 
pair (the
+        // lowercase letter in "aB"/"tHiS" trips the gate, so 
case-inconsistent junk
+        // still floors).  Each uppercase letter is folded; 
sentinels/digits/glue
+        // pass through unchanged.  Folding the edges too is what fully 
rescues short
+        // all-caps headings, whose ^X/X$ bigrams would otherwise floor on the 
rare
+        // uppercase-letter unigram backoff.
+        boolean upperA = Character.isValidCodePoint(cpA) && 
Character.isUpperCase(cpA);
+        boolean upperB = Character.isValidCodePoint(cpB) && 
Character.isUpperCase(cpB);
+        boolean lowerA = Character.isValidCodePoint(cpA) && 
Character.isLowerCase(cpA);
+        boolean lowerB = Character.isValidCodePoint(cpB) && 
Character.isLowerCase(cpB);
+        if ((upperA || upperB) && !(lowerA || lowerB)) {
+            int lcA = upperA ? Character.toLowerCase(cpA) : cpA;
+            int lcB = upperB ? Character.toLowerCase(cpB) : cpB;
+            if (lcA != cpA || lcB != cpB) {
+                int lcIdxA = codepointToIndex(tables, lcA);
+                int lcIdxB = codepointToIndex(tables, lcB);
+                if (lcIdxA >= 0 && lcIdxB >= 0) {
+                    int slot = lookupBigramSlot(tables, lcIdxA, lcIdxB);
+                    if (slot >= 0) {
+                        double fold = dequantize(tables.bigramValues[slot],
+                                tables.bigramQuantMin, tables.bigramQuantMax)
+                                - CASE_FOLD_PENALTY;
+                        return Double.isNaN(direct) ? fold : Math.max(direct, 
fold);
+                    }
+                }
+            }
+        }
+        if (!Double.isNaN(direct)) {
+            return direct;
+        }
         // Unigram backoff for unseen pair or for codepoints absent from the
         // per-script index.  α=1.0 = plain independence.
         double ua = unigramLogProb(tables, idxA);
@@ -1068,6 +1159,15 @@ public final class JunkDetector implements 
TextQualityDetector {
     /** Model script key for the pooled COMMON (digits/punctuation/symbols) 
table. */
     public static final String COMMON_SCRIPT = "COMMON";
 
+    /** Sentinel "codepoints" (above the Unicode maximum U+10FFFF, so they 
cannot
+     *  collide with real text) that wrap each letter token: TOKEN_START (^) 
before
+     *  the first letter of a run and TOKEN_END ($) after the last.  Emitted by
+     *  {@link #forEachScriptBigram} so the bigram LM learns word-initial / 
word-
+     *  final letter typicality, and so z1 never empties for text containing 
even a
+     *  single letter. */
+    public static final int TOKEN_START = 0x110000;
+    public static final int TOKEN_END = 0x110001;
+
     /** COMMON-class predicate: COMMON, INHERITED, UNKNOWN all pool into 
COMMON. */
     static String classKey(int cp) {
         Character.UnicodeScript s = Character.UnicodeScript.of(cp);
@@ -1095,21 +1195,29 @@ public final class JunkDetector implements 
TextQualityDetector {
     }
 
     // -----------------------------------------------------------------------
-    // Bucket-by-script bigram enumeration (the keystone).
+    // Word-level bigram enumeration (the keystone).
     // Single source of truth for BOTH inference z1 scoring and training tally.
-    // A bigram (a,b) is:
-    //   - skipped if either codepoint is charset-invariant (digit) — digits
-    //     never enter any bucket;
-    //   - assigned to the adjacent real script when one side is COMMON glue
-    //     (space/punct fold into the neighbouring word);
-    //   - assigned to the COMMON bucket when BOTH sides are COMMON — so that
-    //     symbol/punctuation salad ("*^&(...") scores against the COMMON table
-    //     (unseen → junky), while a formatted number (digits skipped, its
-    //     punctuation digit-adjacent) lands almost nothing → neutral;
-    //   - skipped if a and b are two DIFFERENT real scripts (cross-script
-    //     boundary — not comparable, would mix tables).
-    // COMMON is a normal bucket (its own table + calibration + combiner 
weight),
-    // scored through this same path.
+    // The codepoint stream is tokenized into maximal same-script runs:
+    //   - LETTERs (Lu/Ll/Lt/Lm/Lo) are the scoreable content, bucketed by 
script;
+    //   - combining MARKs (Mn/Me/Mc) attach to the current run (so NFD 
accents,
+    //     Arabic harakat, Indic matras, Thai vowel signs stay inside their 
word);
+    //   - GLUE — every other non-letter that is NOT whitespace/NUL and NOT a 
decode
+    //     anomaly (punctuation, symbols, numbers) — ALSO attaches to the open 
run
+    //     and IS scored, at codepoint resolution.  This is what lets z1 catch 
a
+    //     wrong-charset symbol wedged mid-word: the LM learns letter->'.' is 
common
+    //     but letter->U+2030 is ~0, so 'Hausj‰rven' (Latin-sibling misdecode) 
floors
+    //     while 'Hausjärven' (a real accented letter) does not.  Resolution 
is the
+    //     codepoint, never the Unicode category — '%', U+2030, U+2020 are all 
Po
+    //     like '.', so a typed/binned boundary would hide them behind the 
period's
+    //     frequency (measured: letter->'.' = 235k vs letter->U+2030 = 0 in 
LATIN);
+    //   - BOUNDARIES that split a run and emit NOTHING: whitespace and NUL 
(word /
+    //     structure separators) and the decode-anomaly set (U+FFFD / C1 / 
anomalous
+    //     Cc / PUA), whose penalty is carried solely by z6 (anomaly ratio) 
and z3,
+    //     NEVER z1 — keeping anomalies out of z1 is what stops z1 
cannibalizing the
+    //     FFFD signal z6 owns;
+    //   - a letter-script change is also a boundary (cross-script structure 
is z4/z9).
+    // Each run is wrapped TOKEN_START (^) ... TOKEN_END ($) so the LM learns
+    // word-initial/final typicality and never empties for text with even one 
letter.
     // -----------------------------------------------------------------------
 
     /** Sink for {@link #forEachScriptBigram}: (modelScript, cpA, cpB). */
@@ -1118,9 +1226,22 @@ public final class JunkDetector implements 
TextQualityDetector {
         void accept(String script, int a, int b);
     }
 
-    /** Charset-invariant content excluded from per-script bigram scoring. */
-    static boolean isSkipCodepoint(int cp) {
-        return Character.isDigit(cp);
+    /** True for letter codepoints (Lu/Ll/Lt/Lm/Lo) — the scoreable token 
content
+     *  that forms per-script runs.  {@code type} is {@link 
Character#getType}. */
+    static boolean isLetterCp(int type) {
+        return type == Character.UPPERCASE_LETTER
+                || type == Character.LOWERCASE_LETTER
+                || type == Character.TITLECASE_LETTER
+                || type == Character.MODIFIER_LETTER
+                || type == Character.OTHER_LETTER;
+    }
+
+    /** True for combining marks (Mn/Me/Mc) — they attach to the current run
+     *  rather than splitting it.  {@code type} is {@link Character#getType}. 
*/
+    static boolean isMarkCp(int type) {
+        return type == Character.NON_SPACING_MARK
+                || type == Character.ENCLOSING_MARK
+                || type == Character.COMBINING_SPACING_MARK;
     }
 
     /**
@@ -1154,64 +1275,67 @@ public final class JunkDetector implements 
TextQualityDetector {
         return buckets;
     }
 
-    /** "Real" structural whitespace collapses to canonical U+0020 before 
bigram
-     *  emission.  Matches {@link #computeZ3ControlByte}'s definition of
-     *  non-anomalous whitespace: HT (0x09), LF (0x0A), CR (0x0D), regular
-     *  space (0x20), plus the Zs/Zl/Zp Unicode categories (NBSP, ideographic
-     *  space, line/paragraph separators).
-     *
-     *  <p><strong>Anomalous Cc (0x01-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F, U+0085
-     *  NEL, U+0080-0x009F C1 controls) and Cf (format chars) are DELIBERATELY
-     *  NOT normalized.</strong>  Their OOV-floor signal is carrying real
-     *  evidence that the decode is wrong — e.g., windows-1252 bytes 0x80-0x9F
-     *  decode to printable curly quotes / em-dashes; ISO-8859-16 misdecodes
-     *  them as C1 control codepoints; the bigram-table OOV-floor on those
-     *  Cc-touching bigrams is what correctly penalizes the wrong decode.
-     *  z3 has had this distinction since v15; this brings z1 in line. */
-    static int normalizeWhitespace(int cp) {
-        if (cp == 0x20) {
-            return cp;
-        }
-        if (cp == 0x09 || cp == 0x0A || cp == 0x0D) {
-            return 0x20;
-        }
-        int t = Character.getType(cp);
-        if (t == Character.SPACE_SEPARATOR
-                || t == Character.LINE_SEPARATOR
-                || t == Character.PARAGRAPH_SEPARATOR) {
-            return 0x20;
-        }
-        return cp;
-    }
-
     public static void forEachScriptBigram(int[] cps, BigramSink sink) {
-        if (cps == null || cps.length < 2) {
+        if (cps == null || cps.length == 0) {
             return;
         }
-        for (int i = 0; i + 1 < cps.length; i++) {
-            int a = normalizeWhitespace(cps[i]);
-            int b = normalizeWhitespace(cps[i + 1]);
-            if (isSkipCodepoint(a) || isSkipCodepoint(b)) {
-                continue;
-            }
-            String ka = classKey(a);
-            String kb = classKey(b);
-            boolean aCommon = COMMON_SCRIPT.equals(ka);
-            boolean bCommon = COMMON_SCRIPT.equals(kb);
-            String script;
-            if (aCommon && bCommon) {
-                script = COMMON_SCRIPT;        // symbol/punct salad → COMMON 
bucket
-            } else if (aCommon) {
-                script = kb;                   // glue folds into the real side
-            } else if (bCommon) {
-                script = ka;
-            } else if (ka.equals(kb)) {
-                script = ka;                   // same real script
-            } else {
-                continue;                      // cross-script boundary
+        String curScript = null;   // script of the run in progress; null = no 
open run
+        int prev = -1;             // previous codepoint in the open run (left 
side of next bigram)
+        for (int cp : cps) {
+            int type = Character.getType(cp);
+            if (isLetterCp(type)) {
+                String sc = classKey(cp);
+                if (curScript != null && sc.equals(curScript)) {
+                    sink.accept(curScript, prev, cp);          // within-run 
letter bigram
+                } else {
+                    if (curScript != null) {
+                        sink.accept(curScript, prev, TOKEN_END);   // close 
the prior run
+                    }
+                    curScript = sc;
+                    sink.accept(curScript, TOKEN_START, cp);        // open a 
new run
+                }
+                prev = cp;
+            } else if (isBoundaryCp(cp)) {
+                // WORD/STRUCTURE boundary (whitespace, NUL) or a decode 
anomaly
+                // (U+FFFD / C1 / anomalous Cc / PUA — scored by z6/z3, never 
z1):
+                // close the run, emit nothing.
+                if (curScript != null) {
+                    sink.accept(curScript, prev, TOKEN_END);
+                    curScript = null;
+                    prev = -1;
+                }
+            } else if (curScript != null) {
+                // GLUE (punctuation / symbol / number) or a combining MARK 
inside an
+                // open run: attach and SCORE it at codepoint resolution.  
This is the
+                // intrusion signal: the LM learns letter->'.' is common but
+                // letter->U+2030 (per-mille) is ~0, so a wrong-charset symbol 
wedged
+                // mid-word (the Latin-sibling misdecode, e.g. 'Hausj‰rven') 
floors
+                // z1, while a clean accented letter (a real letter, scored as 
a letter
+                // bigram) does not.  Resolution is the codepoint, never the 
Unicode
+                // category: '%', U+2030, U+2020 are all Po like '.', so 
binning by
+                // category would hide them behind the period's huge frequency.
+                sink.accept(curScript, prev, cp);
+                prev = cp;
             }
-            sink.accept(script, a, b);
+            // else: orphan glue/mark with no open run -> nothing to attach 
to, skip.
         }
+        if (curScript != null) {
+            sink.accept(curScript, prev, TOKEN_END);            // close the 
final run
+        }
+    }
+
+    /** True for codepoints that BREAK a run without being scored in z1: 
whitespace
+     *  and NUL (word/structure boundaries) plus the z6/z3 decode-anomaly set
+     *  ({@link TextQualityFeatures#isAnomalyCodepoint} — U+FFFD, C1, 
anomalous Cc,
+     *  private-use).  Every other non-letter (punctuation, symbol, number) is 
GLUE:
+     *  it attaches to the open run and is scored, so the LM can floor a symbol
+     *  wedged mid-word while keeping the anomaly penalty solely in z6 (so z1 
never
+     *  cannibalizes the FFFD signal). */
+    static boolean isBoundaryCp(int cp) {
+        return cp == 0x00
+                || Character.isWhitespace(cp)
+                || Character.isSpaceChar(cp)
+                || TextQualityFeatures.isAnomalyCodepoint(cp);
     }
 
     /**
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
index a7706e2118..b8cb75de01 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
@@ -594,10 +594,7 @@ public class JunkFilterEncodingDetector implements 
MetaEncodingDetector {
         // Score CJK candidates on their vendor superset, not the strict base
         // (which U+FFFDs vendor-extension chars and unfairly penalizes real
         // CJK). AutoDetectReader re-applies the same superset for content.
-        Charset decodeAs = CharsetSupersets.supersetOf(charset);
-        if (decodeAs == null) {
-            decodeAs = charset;
-        }
+        Charset decodeAs = CharsetSupersets.decodeAs(charset);
         try {
             return new String(bytes, decodeAs);
         } catch (Exception e) {
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java
index 25f07b765f..2ae926a927 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java
@@ -229,17 +229,29 @@ public final class TextQualityFeatures {
         if (text == null || text.isEmpty()) {
             return Double.NaN;
         }
-        int total = 0;
+        int denom = 0;     // high bytes + sub-0x80 anomalies = codepoints 
that CAN be decode failures
         int anomaly = 0;
         for (int i = 0; i < text.length(); ) {
             int cp = text.codePointAt(i);
             i += Character.charCount(cp);
-            total++;
-            if (isAnomalyCodepoint(cp)) {
+            boolean anom = isAnomalyCodepoint(cp);
+            if (anom) {
                 anomaly++;
             }
+            if (cp >= 0x80 || anom) {
+                denom++;
+            }
         }
-        return total == 0 ? Double.NaN : (double) anomaly / total;
+        // Denominator is non-ASCII (+ sub-0x80 control anomalies), NOT total
+        // codepoints: anomalies only arise from undecodable HIGH bytes, so 
this is
+        // the fraction "of the bytes that COULD fail, how many did" — 
undiluted by
+        // ASCII.  PURE ratio (no smoothing, no min-denom cliff): a page whose 
few
+        // non-ASCII chars all failed (1 FFFD / 1 non-ASCII = 1.0) is a strong 
wrong-
+        // charset signal and must register fully, while a content-rich page 
with a
+        // few stray FFFD has a naturally-tiny ratio (1/500 ≈ 0).  This is the 
signal
+        // that distinguishes Latin wrong-charset (ratio→1) from CJK 
mixed-encoding
+        // (ratio→0.06).  All-ASCII (denom 0) → 0 (clean).
+        return denom == 0 ? 0.0 : (double) anomaly / denom;
     }
 
     /** True if {@code cp} is in the z6 anomaly set: U+FFFD, anomalous Cc
@@ -364,6 +376,14 @@ public final class TextQualityFeatures {
      * signal) from "all-whitespace / digit-only content" (zero density
      * → strong negative signal in JunkDetector's bigram-based judgment,
      * mild signal for general-purpose junk filtering).
+     *
+     * <p><strong>U+FFFD is excluded</strong> from both numerator and
+     * denominator: it is a decode-failure marker scored by the dedicated
+     * replacement-char feature (z6), so counting it here too would (a) double-
+     * count FFFD and (b) re-create the FFFD-drag when this feature dominates —
+     * a permissive wrong decode (few FFFD) would out-score a correct mixed-
+     * encoding decode (many FFFD from undecodable widget bytes).  This 
measures
+     * the composition of the *decodable* content only.
      */
     public static double scriptDensity(String text) {
         if (text == null || text.isEmpty()) {
@@ -374,6 +394,9 @@ public final class TextQualityFeatures {
         for (int i = 0; i < text.length(); ) {
             int cp = text.codePointAt(i);
             i += Character.charCount(cp);
+            if (cp == 0xFFFD) {
+                continue;
+            }
             total++;
             Character.UnicodeScript s = Character.UnicodeScript.of(cp);
             if (s != Character.UnicodeScript.COMMON
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
index a45f74a82c..b4460501b4 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
@@ -658,14 +658,15 @@ public class BuildJunkTrainingData {
         if (text.indexOf('\uFFFD') >= 0) {
             return null;
         }
-        // NFC so the training tally matches JunkDetector.aggregate at
-        // inference time (which also NFC-normalises — see comment at
-        // JunkDetector#aggregate).  Precomposed characters (Latin
-        // diacritics, Vietnamese, Indic combining-mark sequences) are
-        // stored as single codepoints, so bigram counts collapse mark
-        // + letter into one unit instead of splitting them — matching
-        // the natural NFC form of most source text.
-        text = Normalizer.normalize(text, Normalizer.Form.NFC);
+        // NFKC so the training tally matches JunkDetector.aggregate at
+        // inference time (which also NFKC-normalises — see comment at
+        // JunkDetector#aggregate).  Like NFC it stores precomposed characters
+        // (Latin diacritics, Vietnamese, Indic combining-mark sequences) as
+        // single codepoints so bigram counts collapse mark + letter into one
+        // unit; additionally it folds legacy compatibility forms to canonical
+        // — critically half-width katakana (U+FF66-FF9F) -> full-width — so
+        // those bigrams match the trained HAN tables instead of flooring.
+        text = Normalizer.normalize(text, Normalizer.Form.NFKC);
         if (text.getBytes(StandardCharsets.UTF_8).length < minBytes) {
             return null;
         }
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
index a764444e59..63bce5317f 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
@@ -83,6 +83,19 @@ public class TrainJunkModel {
      */
     static final float CONTROL_BYTE_MIN_SIGMA = 0.005f;
 
+    /**
+     * Minimum sigma for the per-script z1 calibration.  Well-trained scripts
+     * (&gt;1000 dev windows) reach sigma in [0.38, 1.29]; below ~0.38 the 
value is
+     * an artifact of too few windows (TIFINAGH at 2 windows gave 
sigma=0.0006).  A
+     * garbage doc that scatters a few codepoints into such a degenerate 
bucket then
+     * gets z1 = numerator/sigma in the thousands, dragging the count-weighted 
doc z1
+     * to −700+ (languageness ≈ −458) — correct sign, absurd magnitude.  
Flooring at
+     * the reliable-training floor caps under-trained scripts to the genuine 
z1 scale
+     * (worst ≈ −60) and leaves well-trained scripts untouched (their sigma 
already
+     * exceeds the floor, so no real garbage is clipped).
+     */
+    static final float Z1_MIN_SIGMA = 0.4f;
+
     /**
      * Full-text byte-level mojibake pairs used by {@link #byteLevelMojibake}.
      * Each entry is {sourceCodec, wrongCodec}: training text gets encoded in
@@ -259,11 +272,14 @@ public class TrainJunkModel {
         }
 
         // 
-----------------------------------------------------------------------
-        // Phase 1 — bucket-by-script bigram tables (per real script + COMMON).
-        // ONE tally pass over every train file via forEachScriptBigram: digits
-        // skipped, COMMON glue folded into the adjacent script, both-COMMON
-        // bigrams charged to the COMMON bucket.  No runs, no sentinels.  
COMMON
-        // is a normal bucket (its own table + calibration + combiner weight).
+        // Phase 1 — per-script bigram tables.  ONE tally pass over every train
+        // file via forEachScriptBigram (the SAME tokenizer inference uses): 
the
+        // text is split into maximal same-script LETTER runs (marks attach), each
+        // wrapped TOKEN_START..TOKEN_END; within-run bigrams (incl. the ^/$ edges) are
+        // tallied to the run's script.  Whitespace, NUL and decode anomalies close a
+        // run and emit nothing; punctuation/symbols/numbers inside a run are tallied
+        // too.  The tables thus hold exactly the bigrams inference scores.  COMMON is
+        // now effectively vestigial (only the rare COMMON-scripted letter lands there).
         // 
-----------------------------------------------------------------------
         System.out.println("\n--- Phase 1: bucket-by-script F1 tables ---");
         Map<String, HashMap<Long, long[]>> pairsByScript = new HashMap<>();
@@ -312,6 +328,7 @@ public class TrainJunkModel {
         for (String script : f1TablesByScript.keySet()) {
             List<Double> scores = z1ScoresByScript.getOrDefault(script, new 
ArrayList<>());
             float[] cal = scores.isEmpty() ? new float[]{0f, 1f} : 
muSigma(scores);
+            cal[1] = Math.max(cal[1], Z1_MIN_SIGMA);   // floor degenerate 
(under-trained) sigma
             f1Calibrations.put(script, cal);
             System.out.printf("  [%s] mu=%.4f sigma=%.4f (%,d windows)%n",
                     script, cal[0], cal[1], scores.size());
@@ -446,11 +463,9 @@ public class TrainJunkModel {
                 end++;
             }
             String s = new String(bytes, start, end - start, 
StandardCharsets.UTF_8);
-            // NFD-normalize on read so calibration/training feature math
-            // matches JunkDetector.scoreText's NFD path.  On-disk corpus
-            // may be NFC (older builds of BuildJunkTrainingData); NFD is
-            // idempotent on already-NFD text.
-            s = java.text.Normalizer.normalize(s, 
java.text.Normalizer.Form.NFC);
+            // NFKC-normalize so training matches JunkDetector inference (folds
+            // compatibility forms incl. half-width katakana -> full-width).
+            s = java.text.Normalizer.normalize(s, 
java.text.Normalizer.Form.NFKC);
             result.add(s);
         }
         return result;
@@ -479,7 +494,7 @@ public class TrainJunkModel {
                     continue;
                 }
                 String norm = java.text.Normalizer.normalize(
-                        line, java.text.Normalizer.Form.NFC);
+                        line, java.text.Normalizer.Form.NFKC);
                 int[] cps = norm.codePoints().toArray();
                 JunkDetector.forEachScriptBigram(cps, (script, a, b) -> {
                     HashMap<Long, long[]> pairs = 
pairsByScript.computeIfAbsent(
@@ -579,7 +594,9 @@ public class TrainJunkModel {
                 scores.add(bk[0] / bk[1]);
             }
         }
-        return scores.isEmpty() ? new float[]{0f, 1f} : muSigma(scores);
+        float[] cal = scores.isEmpty() ? new float[]{0f, 1f} : muSigma(scores);
+        cal[1] = Math.max(cal[1], Z1_MIN_SIGMA);       // floor degenerate 
(under-trained) sigma
+        return cal;
     }
 
     /** Single GLOBAL z3 (control-byte) calibration, pooled over all files. */
@@ -698,6 +715,29 @@ public class TrainJunkModel {
         return new String(codepoints, 0, codepoints.length);
     }
 
+    /**
+     * Replaces a fraction of HIGH (non-ASCII) codepoints with U+FFFD, 
simulating
+     * bytes the chosen charset could not decode.  ASCII is never touched (a
+     * decode failure is a high byte, never clean ASCII).  Used two ways: a LOW
+     * rate over coherent text for mixed-encoding CLEAN positives (real text 
with
+     * a few undecodable widget bytes), and a HIGH rate over already-incoherent
+     * text for FFFD-heavy NEGATIVES.  Pairing the two teaches the combiner 
that
+     * FFFD (z6) is junk evidence only when z1/coherence is ALSO low — so 
removing
+     * FFFD from z1 does not make z6/z7 over-penalize mixed-encoding pages.
+     */
+    static String injectReplacementChars(String text, double rate, Random rng) 
{
+        if (text.isEmpty()) {
+            return text;
+        }
+        int[] codepoints = text.codePoints().toArray();
+        for (int i = 0; i < codepoints.length; i++) {
+            if (codepoints[i] >= 0x80 && rng.nextDouble() < rate) {
+                codepoints[i] = 0xFFFD;
+            }
+        }
+        return new String(codepoints, 0, codepoints.length);
+    }
+
     /**
      * Strips combining marks (Mn / Mc / Me categories) after NFD
      * normalization.  Models the PDF/OCR pipeline that drops marks
@@ -1118,6 +1158,38 @@ public class TrainJunkModel {
                     addContrastivePair(fx, w, byteLevelMojibake(w, pr[0], 
pr[1]),
                             fc, pairCorrect, pairWrong);
                 }
+
+                // Mixed-encoding (korA-class) pair: coherent text with heavy
+                // undecodable FFFD widget bytes (~15-20% of high bytes) must 
BEAT
+                // coherent-looking mojibake of the same source text (the 
clean w,
+                // without the FFFD widgets).  As a contrastive
+                // PAIR (not a pointwise positive) it adds no class imbalance, 
and
+                // it forces z1/coherence to stay the discriminator: the 
correct
+                // side has MORE FFFD (z6 favors the wrong side) and equal z7
+                // (FFFD excluded), so only z1 can rank it correctly.  This is 
what
+                // keeps removing FFFD from z1 from letting z6/z7 sink korA.
+                String mixed = injectReplacementChars(w, 0.15 + 
rng.nextDouble() * 0.05, rng);
+                float[] fMixed = featureVector(fx, mixed);
+                if (fMixed != null) {
+                    String[] pr = BYTE_LEVEL_MOJIBAKE_PAIRS[
+                            rng.nextInt(BYTE_LEVEL_MOJIBAKE_PAIRS.length)];
+                    addContrastivePair(fx, mixed, byteLevelMojibake(w, pr[0], 
pr[1]),
+                            fMixed, pairCorrect, pairWrong);
+                }
+
+                // All-caps pair (cased scripts only): all-caps CLEAN (z1 
recovered
+                // by the case-folded backoff) must beat all-caps mojibake of 
the
+                // same bytes (no seen lowercase twin -> still floored).
+                String upper = w.toUpperCase(java.util.Locale.ROOT);
+                if (!upper.equals(w)) {
+                    float[] fUpper = featureVector(fx, upper);
+                    if (fUpper != null) {
+                        String[] pr = BYTE_LEVEL_MOJIBAKE_PAIRS[
+                                rng.nextInt(BYTE_LEVEL_MOJIBAKE_PAIRS.length)];
+                        addContrastivePair(fx, upper, byteLevelMojibake(upper, 
pr[0], pr[1]),
+                                fUpper, pairCorrect, pairWrong);
+                    }
+                }
                 if ("LATIN".equals(script)) {
                     String[] pr = LATIN_TO_CJK_PAIRS[
                             rng.nextInt(LATIN_TO_CJK_PAIRS.length)];
@@ -1129,15 +1201,33 @@ public class TrainJunkModel {
                             pairCorrect, pairWrong);
                 }
 
+                // Monotonicity anchor: the SAME clean text with its non-ASCII 
chars
+                // replaced by U+FFFD must rank BELOW the clean text.  This 
leaves z1
+                // HIGH (often higher: FFFD removes the hard accented bigram 
and the
+                // surviving fragments are common -- de_correct z1=0.27 vs 
de_fffd
+                // 0.55) and z7 high, so z6 (replacement ratio) is the ONLY 
feature
+                // that separates it from clean -- forcing the combiner to 
weight z6
+                // enough to overrule z1's perverse FFFD gain (the deu/gsw, 
deu/frr
+                // cases).  (A generalized real-charset-mojibake variant was 
tried for
+                // the within-Latin SIBLING case but reverted -- it cost 
korA/z6
+                // without fixing the siblings; see 
LANGUAGENESS_RESIDUAL_FAILURES.md.)
+                String fffdSame = injectReplacementChars(w, 0.5 + 
rng.nextDouble() * 0.5, rng);
+                addContrastivePair(fx, w, fffdSame, fc, pairCorrect, 
pairWrong);
+
                 // Pointwise garbage anchor (generic junk, no correct 
counterpart).
                 String junk;
-                int mode = rng.nextInt(3);
+                int mode = rng.nextInt(4);
                 if (mode == 0) {
                     junk = injectControlChars(w, 0.15, rng);
                 } else if (mode == 1) {
                     junk = shuffleChars(w, rng);
-                } else {
+                } else if (mode == 2) {
                     junk = injectPrivateUseAreaChars(w, 0.12, rng);
+                } else {
+                    // FFFD-heavy junk: incoherent (shuffled) AND 
mostly-undecodable.
+                    // Pairs with the mixed-encoding clean positive above so 
the
+                    // combiner learns FFFD is junk evidence only when z1 is 
low.
+                    junk = injectReplacementChars(shuffleChars(w, rng), 0.4, 
rng);
                 }
                 float[] fb = featureVector(fx, junk);
                 if (fb != null) {
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
 
b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
index a83dd647e6..ead028cbb3 100644
Binary files 
a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
 and 
b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
 differ
diff --git 
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/DecodeCorruptionDiscriminationTest.java
 
b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/DecodeCorruptionDiscriminationTest.java
new file mode 100644
index 0000000000..5f2ceb298f
--- /dev/null
+++ 
b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/DecodeCorruptionDiscriminationTest.java
@@ -0,0 +1,172 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.util.Locale;
+
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Languageness must be MONOTONIC under decode corruption: a clean German 
phrase
+ * must score strictly higher than the same phrase broken two distinct ways, 
each
+ * exercising a different feature.
+ *
+ * <ol>
+ *   <li>U+FFFD -- a decode failure (accented chars replaced by the replacement
+ *       char).  Caught by z6 (replacement ratio).  Subtle: FFFD is a token
+ *       boundary, so it DROPS the hard accented bigram and the surviving 
common-
+ *       letter fragments LIFT z1 -- coherence is fooled into preferring the 
broken
+ *       decode, which used to score HIGHER than clean on short pages in the 
150k
+ *       CommonCrawl eval (deu/gsw, deu/frr).  z6 must overrule that.</li>
+ *   <li>Wrong accented letter -- real Latin letters that do not belong in 
German
+ *       (Nordic a-ring / y-diaeresis / thorn), no FFFD.  Caught by z1 (letter
+ *       coherence); the margin is smaller (pan-Latin pools many languages) 
but the
+ *       clean decode must still win.</li>
+ * </ol>
+ *
+ * <p>Non-ASCII is written with Unicode escapes so the source stays ASCII-only.
+ */
+public class DecodeCorruptionDiscriminationTest {
+
+    private static JunkDetector jd;
+
+    @BeforeAll
+    static void load() throws Exception {
+        jd = JunkDetector.loadFromClasspath();
+    }
+
+    private static float languageness(String s) {
+        return jd.scoreWithFeatureComponents(s).logit;
+    }
+
+    /** Clean German prose: "Die naechste Stunde beginnt am Montag um neun Uhr 
im
+     *  grossen Saal fuer alle Anfaenger" (with real umlauts/eszett). */
+    private static final String CLEAN =
+            "Die n\u00E4chste Stunde beginnt am Montag um neun Uhr im "
+            + "gro\u00DFen Saal f\u00FCr alle Anf\u00E4nger";
+
+    @Test
+    void cleanOutscoresFffdBrokenWord() {
+        // Every accented char -> U+FFFD: a decode failure that leaves the
+        // surrounding letters real and in order, so z1 is unharmed (even 
helped).
+        // z6 (replacement ratio) must overrule z1 and rank clean higher.
+        String fffd =
+                "Die n\uFFFDchste Stunde beginnt am Montag um neun Uhr im "
+                + "gro\uFFFDen Saal f\uFFFDr alle Anf\uFFFDnger";
+        float clean = languageness(CLEAN);
+        float broken = languageness(fffd);
+        assertTrue(clean > broken,
+                "clean German must outscore the U+FFFD-broken decode (a decode 
"
+                + "failure must never raise languageness); clean=" + clean
+                + " fffd=" + broken);
+    }
+
+    @Test
+    void cleanOutscoresWrongAccentedLetter() {
+        // Accented Latin letters that do not belong in German: ae->a-ring 
(U+00E5),
+        // ss->thorn (U+00FE), ue->y-diaeresis (U+00FF).  No FFFD; z1 
(coherence)
+        // must rank the clean decode higher even though the substitutes are 
valid
+        // Latin in some language.
+        String wrong =
+                "Die n\u00E5chste Stunde beginnt am Montag um neun Uhr im "
+                + "gro\u00FEen Saal f\u00FFr alle Anf\u00E5nger";
+        float clean = languageness(CLEAN);
+        float broken = languageness(wrong);
+        assertTrue(clean > broken,
+                "clean German must outscore the wrong-accented-letter decode; "
+                + "clean=" + clean + " wrong=" + broken);
+    }
+
+    @Test
+    void allCapsAndTitleOutscoreAlternatingCase() {
+        // Case-CONSISTENT real text (ALL-CAPS headings, Title Case) scores 
clean;
+        // case-INCONSISTENT alternating case ("aLtErNaTiNg") is the junk 
pattern and
+        // must floor.  The all-caps fix borrows the lowercase score for 
consistent
+        // uppercase; the case-consistency gate keeps alternating case from 
rescue.
+        String lower = "international organization standards committee 
meeting";
+        String allCaps = lower.toUpperCase(Locale.ROOT);
+        String title = toTitleCase(lower);
+        String alt = toAlternatingCase(lower);
+        float lLower = languageness(lower);
+        float lAll = languageness(allCaps);
+        float lTitle = languageness(title);
+        float lAlt = languageness(alt);
+        assertTrue(lAll > lAlt,
+                "ALL-CAPS must outscore aLtErNaTiNg junk; allCaps=" + lAll + " 
alt=" + lAlt);
+        assertTrue(lTitle > lAlt,
+                "Title-case must outscore aLtErNaTiNg junk; title=" + lTitle + 
" alt=" + lAlt);
+        assertTrue(lAll > lLower - 1.0f,
+                "ALL-CAPS must score ~= lowercase (case-fold rescue); 
allCaps=" + lAll
+                + " lower=" + lLower);
+    }
+
+    @Test
+    void allCapsCyrillicOutscoresGibberishDecode() {
+        // The real 150k regression (corpus file 1A68D...): all-caps Russian
+        // "MUZEJ BUDUSchEGO" was scored BELOW a KOI8-R gibberish decode, so 
the
+        // detector chose the gibberish.  The case-fold must rank the all-caps 
real
+        // Russian like its lowercase form, above the gibberish.  ASCII 
source: the
+        // Russian is written with Unicode escapes.
+        String allCaps = "\u041C\u0423\u0417\u0415\u0419 
\u0411\u0423\u0414\u0423\u0429\u0415\u0413\u041E";
+        String lower = "\u043C\u0443\u0437\u0435\u0439 
\u0431\u0443\u0434\u0443\u0449\u0435\u0433\u043E";
+        String gibberish = "\u043B\u0441\u0433\u0435\u0438 
\u0430\u0441\u0434\u0441\u044B\u0435\u0446\u043D";
+        float lAll = languageness(allCaps);
+        float lLower = languageness(lower);
+        float lJunk = languageness(gibberish);
+        assertTrue(lAll > lJunk,
+                "all-caps real Russian must outscore KOI8-R gibberish; 
allCaps=" + lAll
+                + " gibberish=" + lJunk);
+        assertTrue(Math.abs(lAll - lLower) < 0.6f,
+                "all-caps must score ~= lowercase; allCaps=" + lAll + " 
lower=" + lLower);
+    }
+
+    private static String toTitleCase(String s) {
+        StringBuilder sb = new StringBuilder(s.length());
+        boolean start = true;
+        for (int i = 0; i < s.length(); i++) {
+            char c = s.charAt(i);
+            if (Character.isWhitespace(c)) {
+                start = true;
+                sb.append(c);
+            } else if (start) {
+                sb.append(Character.toUpperCase(c));
+                start = false;
+            } else {
+                sb.append(c);
+            }
+        }
+        return sb.toString();
+    }
+
+    private static String toAlternatingCase(String s) {
+        StringBuilder sb = new StringBuilder(s.length());
+        boolean up = false;
+        for (int i = 0; i < s.length(); i++) {
+            char c = s.charAt(i);
+            if (Character.isLetter(c)) {
+                sb.append(up ? Character.toUpperCase(c) : 
Character.toLowerCase(c));
+                up = !up;
+            } else {
+                sb.append(c);
+            }
+        }
+        return sb.toString();
+    }
+}
diff --git 
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
 
b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
index 73133cf9c9..07efc64dd8 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
@@ -16,7 +16,9 @@
  */
 package org.apache.tika.ml.junkdetect;
 
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
 import java.io.BufferedWriter;
@@ -25,6 +27,7 @@ import java.io.OutputStreamWriter;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import java.util.TreeMap;
@@ -51,6 +54,21 @@ import org.apache.tika.quality.TextQualityScore;
  */
 public class JunkDetectorRoundTripTest {
 
+    @Test
+    void requireUsableSigmaRejectsNonPositiveOrNonFinite() {
+        assertDoesNotThrow(() -> JunkDetector.requireUsableSigma("ok", new float[]{-3f, 0.5f}));
+        for (float badSigma : new float[]{0f, -0.1f, Float.NaN,
+                Float.POSITIVE_INFINITY, Float.NEGATIVE_INFINITY}) {
+            assertThrows(IOException.class,
+                    () -> JunkDetector.requireUsableSigma("bad", new float[]{0f, badSigma}),
+                    "sigma=" + badSigma + " must be rejected");
+        }
+        assertThrows(IOException.class,
+                () -> JunkDetector.requireUsableSigma("null", null));
+        assertThrows(IOException.class,
+                () -> JunkDetector.requireUsableSigma("short", new float[]{1f}));
+    }
+
     @Test
     void roundTripSeenPairAndUnigramBackoff(@TempDir Path tmp) throws IOException {
         // -----------------------------------------------------------------
@@ -273,25 +291,107 @@ public class JunkDetectorRoundTripTest {
                 -10.0f, 1.0f);
     }
 
-    /** Expected z1: mean log-prob over every non-digit adjacent bigram scored
-     *  against the single-script {@code tables}, calibrated.  Mirrors
-     *  {@link JunkDetector}'s aggregate for single-script text. */
+    @Test
+    void tokenizationScoresGlueButKeepsAnomaliesAndWhitespaceAsBoundaries() {
+        // A letter run is wrapped ^...$.  GLUE (punctuation, symbols, numbers)
+        // attaches to the open run and IS scored at codepoint resolution, so a
+        // symbol wedged mid-word becomes a real (rare) bigram the LM can floor.
+        // But DECODE ANOMALIES (here U+FFFD; also C1 / PUA) and WHITESPACE stay
+        // boundaries that split the run and emit nothing — the anomaly penalty
+        // lives solely in z6, never z1, so z1 cannot cannibalize the FFFD signal.
+        String fffd = String.valueOf((char) 0xFFFD);
+        // all-letter run
+        assertEquals(List.of("^-a", "a-b", "b-c", "c-d", "d-$"), bigrams("abcd"));
+        // glue (period, U+2030 per-mille) is SCORED inside the run, not dropped
+        assertEquals(List.of("^-a", "a-b", "b-.", ".-c", "c-d", "d-$"), bigrams("ab.cd"));
+        assertEquals(List.of("^-a", "a-b", "b-\u2030", "\u2030-c", "c-d", "d-$"),
+                bigrams("ab\u2030cd"));
+        // U+FFFD (decode anomaly) is still a BOUNDARY: splits, emits nothing
+        assertEquals(List.of("^-a", "a-b", "b-$", "^-c", "c-d", "d-$"),
+                bigrams("ab" + fffd + "cd"));
+        assertEquals(List.of("^-a", "a-b", "b-$"), bigrams("ab" + fffd + fffd));
+        // whitespace is a boundary too
+        assertEquals(List.of("^-a", "a-b", "b-$", "^-c", "c-d", "d-$"),
+                bigrams("ab cd"));
+    }
+
+    /** Collects {@link JunkDetector#forEachScriptBigram} output as "a-b" strings,
+     *  rendering the run-boundary sentinels as {@code ^} (start) / {@code $} (end). */
+    private static List<String> bigrams(String s) {
+        List<String> out = new ArrayList<>();
+        JunkDetector.forEachScriptBigram(s.codePoints().toArray(), (script, a, b) ->
+                out.add(fmtCp(a) + "-" + fmtCp(b)));
+        return out;
+    }
+
+    private static String fmtCp(int cp) {
+        if (cp == JunkDetector.TOKEN_START) return "^";
+        if (cp == JunkDetector.TOKEN_END) return "$";
+        return new String(Character.toChars(cp));
+    }
+
+    @Test
+    void caseFoldedBackoffRescuesAllCapsButNotMixedOrMojibake() {
+        // Synthetic LATIN table: index ['a','b'], the lowercase pair (a,b) seen
+        // at a high log-prob (-1.0).  Uppercase 'A'/'B' are absent from the index.
+        BigramTables t = buildLatinTablesLowerAB();
+        double seenLower = JunkDetector.computeF1MeanLogP(new int[]{'a', 'b'}, t);
+        double allCaps = JunkDetector.computeF1MeanLogP(new int[]{'A', 'B'}, t);
+        double mixed = JunkDetector.computeF1MeanLogP(new int[]{'a', 'B'}, t);
+        double noTwin = JunkDetector.computeF1MeanLogP(new int[]{'B', 'A'}, t);
+        // All-caps "AB" folds to the SEEN lowercase "ab", landing a small case-fold
+        // penalty BELOW it (all-caps is a slightly weaker signal) -- but nowhere near
+        // the independence floor the mixed/mojibake cases hit below.
+        assertTrue(allCaps < seenLower && allCaps > seenLower - 0.5,
+                "all-caps AB must fold to ~ the seen lowercase ab (minus a small penalty); "
+                + "seenLower=" + seenLower + " allCaps=" + allCaps);
+        // Mixed-case "aB" is case-INCONSISTENT -> not folded -> independence floor.
+        assertTrue(mixed < allCaps - 1.0,
+                "mixed-case aB must not fold (consistency gate)");
+        // All-caps "BA" whose lowercase twin (b,a) is UNSEEN -> floors (mojibake case).
+        assertTrue(noTwin < allCaps - 1.0,
+                "all-caps with no seen lowercase twin must floor");
+    }
+
+    /** Like {@link #buildLatinTablesAB} but indexed on LOWERCASE ['a','b'] with
+     *  the lowercase pair (a,b) seen at -1.0 — exercises the case-folded backoff
+     *  (uppercase 'A'/'B' are absent from the index, so they must fold). */
+    private static BigramTables buildLatinTablesLowerAB() {
+        int[] cpIndex = new int[]{'a', 'b'};
+        int[] keys = new int[4];
+        Arrays.fill(keys, BigramTables.EMPTY_KEY);
+        byte[] values = new byte[4];
+        float bMin = -10.0f;
+        float bMax = -1.0f;
+        insertOA(keys, values, JunkDetector.packBigramKey(0, 1),
+                quantizeOne(-1.0f, bMin, bMax));
+        float uMin = -5.0f;
+        float uMax = -2.0f;
+        byte[] unigramBytes = new byte[]{
+                quantizeOne(-2.0f, uMin, uMax),
+                quantizeOne(-2.0f, uMin, uMax),
+        };
+        return new BigramTables(cpIndex, keys, values, unigramBytes,
+                bMin, bMax, uMin, uMax, -10.0f, 1.0f);
+    }
+
+    /** Expected z1: mean log-prob over the bigrams {@link
+     *  JunkDetector#forEachScriptBigram} emits (word-run tokenization with ^/$
+     *  wrapping), scored against the single-script {@code tables}, calibrated.
+     *  Delegates to the production tokenizer so it cannot drift from inference. */
     private static float expectedRunZ(BigramTables tables, String text, float mu, float sigma) {
-        int[] cps = text.codePoints().toArray();
-        double sum = 0;
-        long n = 0;
-        for (int i = 0; i + 1 < cps.length; i++) {
-            if (Character.isDigit(cps[i]) || Character.isDigit(cps[i + 1])) {
-                continue;
-            }
-            double f1 = JunkDetector.computeF1MeanLogP(new int[]{cps[i], cps[i + 1]}, tables);
-            if (Double.isNaN(f1)) {
-                continue;
+        double[] acc = new double[2]; // {sum, count}
+        JunkDetector.forEachScriptBigram(text.codePoints().toArray(), (script, a, b) -> {
+            double f1 = JunkDetector.computeF1MeanLogP(new int[]{a, b}, tables);
+            if (!Double.isNaN(f1)) {
+                acc[0] += f1;
+                acc[1] += 1;
             }
-            sum += f1;
-            n++;
+        });
+        if (acc[1] == 0) {
+            throw new IllegalArgumentException("no scorable bigrams for: " + text);
         }
-        return (float) ((sum / n - mu) / sigma);
+        return (float) ((acc[0] / acc[1] - mu) / sigma);
     }
 
     /** Quantize a single float to 8-bit unsigned using the explicit range. */
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
index 05d8f836e6..c38cc2fd7f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
@@ -53,6 +53,7 @@ import org.apache.tika.detect.EncodingResult;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractEncodingDetectorParser;
 import org.apache.tika.parser.ParseContext;
@@ -165,6 +166,9 @@ public class JSoupParser extends AbstractEncodingDetectorParser {
                 : encResults.get(0).getCharset();
         Charset decodeAs = encResults.isEmpty() ? DEFAULT_CHARSET
                 : encResults.get(0).getDecodeAs();
+        if (!decodeAs.equals(charset)) {
+            metadata.set(TikaCoreProperties.DECODED_CHARSET, decodeAs.name());
+        }
         String previous = metadata.get(Metadata.CONTENT_TYPE);
         MediaType contentType = null;
         if (previous == null || previous.startsWith("text/html")) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index f10f1be80d..0b3776066e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -286,6 +286,42 @@ public class HtmlParserTest extends TikaTest {
         assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
     }
 
+    /**
+     * A page that declares {@code charset=euc-kr} but actually uses UHC (MS949)
+     * extension Hangul must be decoded with the superset {@code x-windows-949},
+     * not strict EUC-KR (which U+FFFDs the extension syllables).  Mirrors the
+     * promotion {@code AutoDetectReader} already applies for non-HTML.
+     * {@code CONTENT_ENCODING} still reports the detected charset; the superset
+     * actually used is recorded in {@code DECODED_CHARSET}.
+     *
+     * @see org.apache.tika.detect.CharsetSupersets
+     */
+    @Test
+    public void testEucKrPromotedToMs949Superset() throws Exception {
+        // U+AC02 is outside EUC-KR (KS X 1001) but inside x-windows-949 (MS949);
+        // its MS949 bytes 0x81 0x41 decode to U+FFFD followed by 'A' under strict
+        // EUC-KR, so a correct decode proves the superset was used.  U+D55C U+AD6D
+        // ("Korea") is a normal EUC-KR syllable pair.
+        String test = "<html><head><meta charset=\"euc-kr\" />" +
+                "<title>title</title></head>" +
+                "<body><p>\uAC02 \uD55C\uAD6D</p></body></html>";
+        Metadata metadata = new Metadata();
+        BodyContentHandler handler = new BodyContentHandler();
+        try (TikaInputStream tis = TikaInputStream.get(test.getBytes("x-windows-949"))) {
+            new JSoupParser().parse(tis, handler, metadata, new ParseContext());
+        }
+        // Metadata reports the *detected* charset ...
+        assertEquals("EUC-KR", metadata.get(Metadata.CONTENT_ENCODING));
+        // ... but decoding used the superset, recorded in DECODED_CHARSET.
+        assertEquals(java.nio.charset.Charset.forName("x-windows-949").name(),
+                metadata.get(TikaCoreProperties.DECODED_CHARSET));
+        // The UHC-only syllable round-trips; under strict EUC-KR it would be U+FFFD.
+        String content = handler.toString();
+        assertContains("\uAC02", content);
+        assertFalse(content.contains("\uFFFD"),
+                "no replacement chars expected under the MS949 superset decode");
+    }
+
     /**
      * Test case for TIKA-334
      *

(tika) branch main updated: TIKA-4745-follow-on-junk-improvements (#2872)

Reply via email to