This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new d1e81e39b4 TIKA-4745-follow-on-junk-improvements (#2872)
d1e81e39b4 is described below
commit d1e81e39b4ce210ee3f982462e2af18816950101
Author: Tim Allison <[email protected]>
AuthorDate: Fri Jun 5 12:45:57 2026 -0400
TIKA-4745-follow-on-junk-improvements (#2872)
---
.../tika/ml/chardetect/CjkDecodeValidator.java | 5 +-
.../tika/ml/chardetect/CosineFamilyArbiter.java | 241 ------------------
.../apache/tika/ml/chardetect/ScoredCandidate.java | 56 -----
.../apache/tika/ml/chardetect/cosine-profiles.bin | Bin 1080313 -> 0 bytes
.../apache/tika/ml/junkdetect/JunkDetector.java | 280 +++++++++++++++------
.../ml/junkdetect/JunkFilterEncodingDetector.java | 5 +-
.../tika/ml/junkdetect/TextQualityFeatures.java | 31 ++-
.../ml/junkdetect/tools/BuildJunkTrainingData.java | 17 +-
.../tika/ml/junkdetect/tools/TrainJunkModel.java | 118 +++++++--
.../org/apache/tika/ml/junkdetect/junkdetect.bin | Bin 2321862 -> 2316809
bytes
.../DecodeCorruptionDiscriminationTest.java | 172 +++++++++++++
.../ml/junkdetect/JunkDetectorRoundTripTest.java | 132 ++++++++--
.../org/apache/tika/parser/html/JSoupParser.java | 4 +
.../apache/tika/parser/html/HtmlParserTest.java | 36 +++
14 files changed, 672 insertions(+), 425 deletions(-)
diff --git
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CjkDecodeValidator.java
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CjkDecodeValidator.java
index 4c7254c6ee..cf4e4e6554 100644
---
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CjkDecodeValidator.java
+++
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CjkDecodeValidator.java
@@ -68,10 +68,7 @@ public final class CjkDecodeValidator {
* little legacy evidence (legacy high bytes < {@link
#MIN_HIGH_BYTES})
*/
public static double strippedFailureRate(byte[] bytes, Charset cjkCharset)
{
- Charset decodeAs = CharsetSupersets.supersetOf(cjkCharset);
- if (decodeAs == null) {
- decodeAs = cjkCharset;
- }
+ Charset decodeAs = CharsetSupersets.decodeAs(cjkCharset);
CharsetDecoder dec = decodeAs.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
diff --git
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CosineFamilyArbiter.java
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CosineFamilyArbiter.java
deleted file mode 100644
index 057bba4a92..0000000000
---
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CosineFamilyArbiter.java
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.ml.chardetect;
-
-import java.io.DataInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.Charset;
-import java.nio.charset.IllegalCharsetNameException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-
-import org.apache.tika.detect.EncodingResult;
-
-/**
- * Family-level guard over the NB statistical pick, defending against the
- * single-byte-→CJK collision (Cyrillic / Greek / accented-Latin content
- * whose high bytes coincide with legal GBK lead/trail pairs and accumulate
- * spurious GB18030 / Big5 likelihood under the multinomial NB).
- *
- * <p>Two complementary, model-light signals, both blind to NB:</p>
- * <ul>
- * <li><b>high-byte cosine</b> — cosine between the probe's high-byte
- * (≥ 0x80) byte-bigram occupancy and each class's control-stripped
- * high-byte profile. Direction-based, so length/density-invariant; the
- * ASCII quadrant is dropped so shared English text can't dominate. When
- * NB picks a CJK class but the cosine argmax is non-CJK (with enough
- * high-byte evidence), the CJK pick is vetoed.</li>
- * <li><b>GBK illegality</b> — fraction of high-byte lead bytes that do
- * not begin a valid GBK 2-byte or GB18030 4-byte sequence. A genuine
- * GB18030 document is ~0% illegal; Cyrillic/Greek forced through GBK
- * throws illegal trails. Scoped to GB18030 only (it says nothing about
- * Shift_JIS/EUC).</li>
- * </ul>
- *
- * <p>On veto the CJK pick is replaced by the best non-CJK candidate (by cosine
- * when evidence is sufficient, else the highest-ranked non-CJK NB candidate);
- * real-CJK picks are left untouched (cosine argmax stays CJK, illegality ~0),
- * so the guard is regression-safe for genuine CJK.</p>
- */
-public final class CosineFamilyArbiter {
-
- /** Minimum high-byte bigram count before the cosine veto is trusted. */
- public static final int MIN_HIGH_BYTE_SUPPORT = 15;
-
- /** GBK-illegality fraction above which a GB18030 pick is refuted. */
- public static final double GBK_ILLEGAL_THRESHOLD = 0.02;
-
- private static final String GB18030 = "GB18030";
-
- private final String[] names;
- private final boolean[] cjk;
- private final Charset[] charsets; // resolved JVM charset, null if
unsupported
- private final int[][] bigramIds;
- private final float[][] weights; // L2-normalized per class
-
- public CosineFamilyArbiter(InputStream in) throws IOException {
- try (DataInputStream dis = new DataInputStream(in)) {
- int nc = dis.readInt();
- names = new String[nc];
- cjk = new boolean[nc];
- charsets = new Charset[nc];
- bigramIds = new int[nc][];
- weights = new float[nc][];
- for (int c = 0; c < nc; c++) {
- names[c] = dis.readUTF();
- cjk[c] = isCjkName(names[c]);
- charsets[c] = resolve(names[c]);
- int nnz = dis.readInt();
- int[] ids = new int[nnz];
- float[] w = new float[nnz];
- for (int k = 0; k < nnz; k++) {
- ids[k] = dis.readUnsignedShort();
- w[k] = dis.readFloat();
- }
- bigramIds[c] = ids;
- weights[c] = w;
- }
- }
- }
-
- private static Charset resolve(String name) {
- try {
- return Charset.isSupported(name) ? Charset.forName(name) : null;
- } catch (IllegalCharsetNameException e) {
- return null;
- }
- }
-
- static boolean isCjkName(String name) {
- String n = name.toLowerCase(Locale.ROOT);
- return n.contains("gb") || n.contains("big5") || n.contains("euc")
- || n.contains("shift") || n.contains("jis") ||
n.contains("2022")
- || n.contains("949");
- }
-
- /**
- * Apply the family guard to NB's ranked candidates. Returns {@code
- * nbResults} unchanged unless NB's top pick is CJK and a veto fires, in
- * which case a non-CJK replacement is promoted to the front.
- */
- public List<EncodingResult> arbitrate(byte[] probe, List<EncodingResult>
nbResults) {
- if (nbResults == null || nbResults.isEmpty()) {
- return nbResults;
- }
- if (!isCjkName(nbResults.get(0).getCharset().name())) {
- return nbResults;
- }
- // Build high-byte bigram occupancy.
- Map<Integer, Integer> docMap = new HashMap<>();
- long support = 0;
- for (int i = 0; i + 1 < probe.length; i++) {
- int b0 = probe[i] & 0xFF;
- int b1 = probe[i + 1] & 0xFF;
- if (b0 >= 0x80 || b1 >= 0x80) {
- int bg = (b0 << 8) | b1;
- docMap.merge(bg, 1, Integer::sum);
- support++;
- }
- }
- double docNorm = 0;
- for (int v : docMap.values()) {
- docNorm += (double) v * v;
- }
- docNorm = Math.sqrt(docNorm);
-
- boolean gbkTop = GB18030.equals(nbResults.get(0).getCharset().name());
- double illegal = gbkIllegalRate(probe);
-
- int cosArg = -1;
- double bestCos = -1;
- double[] cos = new double[names.length];
- if (docNorm > 0) {
- for (int c = 0; c < names.length; c++) {
- double dot = 0;
- int[] ids = bigramIds[c];
- float[] w = weights[c];
- for (int k = 0; k < ids.length; k++) {
- Integer dc = docMap.get(ids[k]);
- if (dc != null) {
- dot += w[k] * dc;
- }
- }
- cos[c] = dot / docNorm;
- if (cos[c] > bestCos) {
- bestCos = cos[c];
- cosArg = c;
- }
- }
- }
-
- boolean veto = (gbkTop && illegal > GBK_ILLEGAL_THRESHOLD)
- || (support >= MIN_HIGH_BYTE_SUPPORT && cosArg >= 0 &&
!cjk[cosArg]);
- if (!veto) {
- return nbResults;
- }
-
- // Choose replacement: best non-CJK by cosine when evidence is
- // sufficient, else the highest-ranked non-CJK NB candidate.
- Charset replacement = null;
- if (support >= MIN_HIGH_BYTE_SUPPORT && docNorm > 0) {
- double bv = -1;
- for (int c = 0; c < names.length; c++) {
- if (!cjk[c] && charsets[c] != null && cos[c] > bv) {
- bv = cos[c];
- replacement = charsets[c];
- }
- }
- }
- float conf = nbResults.get(0).getConfidence();
- List<EncodingResult> out = new ArrayList<>(nbResults.size() + 1);
- if (replacement != null) {
- out.add(new EncodingResult(replacement, conf, replacement.name(),
- EncodingResult.ResultType.STATISTICAL));
- }
- for (EncodingResult r : nbResults) {
- if (isCjkName(r.getCharset().name())) {
- continue;
- }
- if (replacement != null &&
r.getCharset().name().equals(replacement.name())) {
- continue;
- }
- out.add(r);
- }
- // If we couldn't form any non-CJK candidate, don't strand the caller
- // with an empty list — leave NB's result untouched.
- return out.isEmpty() ? nbResults : out;
- }
-
- /**
- * Fraction of high-byte lead bytes that fail to begin a valid GBK 2-byte
- * or GB18030 4-byte sequence. 0 for genuine GB18030.
- */
- static double gbkIllegalRate(byte[] b) {
- int n = b.length;
- int i = 0;
- int illegal = 0;
- int lead = 0;
- while (i < n) {
- int c = b[i] & 0xFF;
- if (c < 0x80) {
- i++;
- continue;
- }
- lead++;
- if (c >= 0x81 && c <= 0xFE && i + 1 < n) {
- int t = b[i + 1] & 0xFF;
- if (((t >= 0x40 && t <= 0x7E) || (t >= 0x80 && t <= 0xFE)) &&
t != 0x7F) {
- i += 2;
- continue;
- }
- if (t >= 0x30 && t <= 0x39 && i + 3 < n
- && (b[i + 2] & 0xFF) >= 0x81 && (b[i + 2] & 0xFF) <=
0xFE
- && (b[i + 3] & 0xFF) >= 0x30 && (b[i + 3] & 0xFF) <=
0x39) {
- i += 4;
- continue;
- }
- }
- illegal++;
- i++;
- }
- return lead == 0 ? 0 : (double) illegal / lead;
- }
-}
diff --git
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ScoredCandidate.java
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ScoredCandidate.java
deleted file mode 100644
index 60564bf884..0000000000
---
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ScoredCandidate.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.ml.chardetect;
-
-import java.util.Collections;
-import java.util.LinkedHashSet;
-import java.util.Set;
-
-/**
- * Pooled candidate from {@link LogLinearCombiner}: label, raw summed score
- * (larger is better, not normalized), and the specialists that contributed.
- */
-public final class ScoredCandidate {
-
- private final String label;
- private final float score;
- private final Set<String> contributingSpecialists;
-
- public ScoredCandidate(String label, float score, Set<String>
contributingSpecialists) {
- this.label = label;
- this.score = score;
- this.contributingSpecialists =
- Collections.unmodifiableSet(new
LinkedHashSet<>(contributingSpecialists));
- }
-
- public String getLabel() {
- return label;
- }
-
- public float getScore() {
- return score;
- }
-
- public Set<String> getContributingSpecialists() {
- return contributingSpecialists;
- }
-
- @Override
- public String toString() {
- return "ScoredCandidate{" + label + "=" + score + " from " +
contributingSpecialists + "}";
- }
-}
diff --git
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/cosine-profiles.bin
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/cosine-profiles.bin
deleted file mode 100644
index 646a7e7923..0000000000
Binary files
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/cosine-profiles.bin
and /dev/null differ
diff --git
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
index c231494d60..2f117479d0 100644
---
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
+++
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
@@ -353,6 +353,16 @@ public final class JunkDetector implements
TextQualityDetector {
f1TablesByScript.put(script, BigramTables.readFrom(dis));
}
+ requireUsableSigma("scriptTransition",
scriptTransitionCalibration);
+ requireUsableSigma("block", blockCalibration);
+ requireUsableSigma("control", controlCalibration);
+ requireUsableSigma("z5", z5Calibration);
+ requireUsableSigma("z6", z6Calibration);
+ requireUsableSigma("z9", z9Calibration);
+ for (Map.Entry<String, float[]> e : calibrations.entrySet()) {
+ requireUsableSigma("z1[" + e.getKey() + "]", e.getValue());
+ }
+
return new JunkDetector(calibrations,
blockTable, blockTableQuant, blockCalibration,
controlCalibration, combinerWeights,
@@ -363,6 +373,22 @@ public final class JunkDetector implements
TextQualityDetector {
}
}
+ /**
+ * Validates a calibration {@code {mu, sigma}} from the model file: sigma
is the
+ * divisor in every z-score, so it must be finite and > 0. Single
enforcement
+ * point for that invariant -- inference divides without re-checking.
+ */
+ static void requireUsableSigma(String name, float[] calibration) throws
IOException {
+ boolean ok = calibration != null && calibration.length >= 2
+ && Float.isFinite(calibration[1]) && calibration[1] > 0f;
+ if (!ok) {
+ String sigma = (calibration == null || calibration.length < 2)
+ ? "absent" : Float.toString(calibration[1]);
+ throw new IOException("Invalid model: " + name
+ + " calibration sigma must be finite and > 0 but was " +
sigma);
+ }
+ }
+
/** Read {@code size} big-endian int16 values as a short[]. */
private static short[] readShortTable(DataInputStream dis, int size)
throws IOException {
byte[] raw = dis.readNBytes(size * 2);
@@ -463,8 +489,12 @@ public final class JunkDetector implements
TextQualityDetector {
* produces the logit. No runs, no sentinels, no per-script z2/z3.
*/
private Agg aggregate(String text) {
- // NFC-normalize so inference matches the trainer's tally.
- text = java.text.Normalizer.normalize(text,
java.text.Normalizer.Form.NFC);
+ // NFKC-normalize so inference matches the trainer's tally AND legacy
+ // compatibility forms fold to canonical — critically half-width
katakana
+ // (U+FF66-FF9F) -> full-width, which the HAN bigram table is trained
on.
+ // Without this, half-width-katakana pages (Shift_JIS-era Japanese
+ // e-commerce) floor every bigram as "unseen" (z1 ~-9, false-junk).
+ text = java.text.Normalizer.normalize(text,
java.text.Normalizer.Form.NFKC);
int[] cps = text.codePoints().toArray();
Map<String, double[]> buckets = new HashMap<>(); // script ->
{sumLogP, count}
@@ -522,12 +552,15 @@ public final class JunkDetector implements
TextQualityDetector {
agg.totalBigrams = (int) z1Count;
if (z1Count == 0 || agg.dominantScript == null) {
- // No scoreable script — doc-level fallback (same anchors as
before):
+ // No scoreable LETTER at all (zero runs) — doc-level fallback:
// density=0 -> very negative; density=1 coherence=1 -> positive
// (unmodeled coherent script); density=1 coherence=0 -> very
negative.
+ // z6 is included so a letter-free but FFFD/anomaly-heavy doc
(which can
+ // no longer flood z1) is still penalized here — no path ignores
the
+ // replacement ratio. For unmodeled-but-clean script z6~0, so
it's inert.
agg.dominantScript = null;
agg.z1 = Float.NaN;
- agg.logit = -7f + 4f * agg.z7 + 6f * agg.z8;
+ agg.logit = -7f + 4f * agg.z7 + 6f * agg.z8 + 4f * agg.z6;
return agg;
}
agg.z1 = (float) (weightedZ1 / z1Count);
@@ -635,7 +668,7 @@ public final class JunkDetector implements
TextQualityDetector {
*/
public float computeZ5LetterAdjacentToMarkRatio(String text) {
double raw = TextQualityFeatures.letterAdjacentToMarkRatio(text);
- if (Double.isNaN(raw) || z5Calibration == null || z5Calibration[1] <=
0) {
+ if (Double.isNaN(raw) || z5Calibration == null) {
return 0f;
}
return ((float) raw - z5Calibration[0]) / z5Calibration[1];
@@ -651,7 +684,7 @@ public final class JunkDetector implements
TextQualityDetector {
*/
public float computeZ6ReplacementRatio(String text) {
double raw = TextQualityFeatures.replacementRatio(text);
- if (Double.isNaN(raw) || z6Calibration == null || z6Calibration[1] <=
0) {
+ if (Double.isNaN(raw) || z6Calibration == null) {
return 0f;
}
// Flip sign: higher replacement = lower quality, so feature is
@@ -669,7 +702,7 @@ public final class JunkDetector implements
TextQualityDetector {
*/
public float computeZ9AlternationRatio(String text) {
double raw = TextQualityFeatures.scriptAlternationRatio(text);
- if (Double.isNaN(raw) || z9Calibration == null || z9Calibration[1] <=
0) {
+ if (Double.isNaN(raw) || z9Calibration == null) {
return 0f;
}
// Higher alternation = junkier; (mu - raw) / sigma so clean text →
positive z9.
@@ -973,15 +1006,73 @@ public final class JunkDetector implements
TextQualityDetector {
* once when scanning the text (avoiding a redundant binary search per
* codepoint).
*/
+ /** Small per-bigram log-prob penalty subtracted from the case-folded
+ * (lowercase) value when scoring an uppercase pair. All-caps is a
genuinely
+ * weaker/rarer signal than lowercase, so it should score a hair BELOW its
+ * lowercase form, not equal to it — and the margin guards the edge case
where
+ * an all-caps *mojibake* decode whose lowercase twin happens to be a seen
+ * bigram would otherwise score like real lowercase text. Kept small
(0.25):
+ * the lowercase/junk margin is ~0.8 logit, and δ=0.5 thinned it to ~0.1,
so
+ * 0.25 leaves all-caps clearly clean (~0.5 above junk) while honoring the
+ * "somewhat less languagey" principle. */
+ private static final double CASE_FOLD_PENALTY = 0.25;
+
private static double scorePairF1(int cpA, int idxA, int cpB, int idxB,
BigramTables tables) {
+ double direct = Double.NaN;
if (idxA >= 0 && idxB >= 0) {
int slot = lookupBigramSlot(tables, idxA, idxB);
if (slot >= 0) {
- return dequantize(tables.bigramValues[slot],
+ direct = dequantize(tables.bigramValues[slot],
tables.bigramQuantMin, tables.bigramQuantMax);
}
}
+ // Case-folded backoff: an ALL-UPPERCASE pair that is the case variant
of
+ // a SEEN lowercase pair is real text wearing a different case
(all-caps
+ // headings / emphasis, e.g. Greek "ΚΑΤΑΛΟΓΟΣ", Russian "МУЗЕЙ"), NOT
junk.
+ // Score it as the BETTER of its own log-prob and its lowercase twin's
—
+ // i.e. max(direct, fold). max (not fold-only-on-miss) is essential:
real
+ // all-caps bigrams ARE present in training (from headings) but rare,
so the
+ // direct lookup hits a low value (МУ −12.4 vs lowercase му −6.7) and
would
+ // otherwise bypass the fold and floor. This is the discriminator raw
+ // probability cannot be: all-caps real text and all-caps mojibake are
both
+ // improbable, but only real text has a SEEN lowercase twin. Gated on case
+ // CONSISTENCY (no lowercase letter in the pair; exact gate below) so
+ // junk like "tHiS" stays unfolded and floors; only the lowercase twin's value
+ // is borrowed when that pair is actually seen, so all-caps mojibake
+ // (lowercase form also unseen) floors.
+ // Gate = "at least one uppercase letter AND no LOWERCASE letter" — so
it
+ // folds both an interior all-caps pair (МУ) AND an edge pair where
the other
+ // side is a sentinel or glue (^М, Й$, "М."), but NOT a mixed-case
pair (the
+ // lowercase letter in "aB"/"tHiS" trips the gate, so
case-inconsistent junk
+ // still floors). Each uppercase letter is folded;
sentinels/digits/glue
+ // pass through unchanged. Folding the edges too is what fully
rescues short
+ // all-caps headings, whose ^X/X$ bigrams would otherwise floor on the
rare
+ // uppercase-letter unigram backoff.
+ boolean upperA = Character.isValidCodePoint(cpA) &&
Character.isUpperCase(cpA);
+ boolean upperB = Character.isValidCodePoint(cpB) &&
Character.isUpperCase(cpB);
+ boolean lowerA = Character.isValidCodePoint(cpA) &&
Character.isLowerCase(cpA);
+ boolean lowerB = Character.isValidCodePoint(cpB) &&
Character.isLowerCase(cpB);
+ if ((upperA || upperB) && !(lowerA || lowerB)) {
+ int lcA = upperA ? Character.toLowerCase(cpA) : cpA;
+ int lcB = upperB ? Character.toLowerCase(cpB) : cpB;
+ if (lcA != cpA || lcB != cpB) {
+ int lcIdxA = codepointToIndex(tables, lcA);
+ int lcIdxB = codepointToIndex(tables, lcB);
+ if (lcIdxA >= 0 && lcIdxB >= 0) {
+ int slot = lookupBigramSlot(tables, lcIdxA, lcIdxB);
+ if (slot >= 0) {
+ double fold = dequantize(tables.bigramValues[slot],
+ tables.bigramQuantMin, tables.bigramQuantMax)
+ - CASE_FOLD_PENALTY;
+ return Double.isNaN(direct) ? fold : Math.max(direct,
fold);
+ }
+ }
+ }
+ }
+ if (!Double.isNaN(direct)) {
+ return direct;
+ }
// Unigram backoff for unseen pair or for codepoints absent from the
// per-script index. α=1.0 = plain independence.
double ua = unigramLogProb(tables, idxA);
@@ -1068,6 +1159,15 @@ public final class JunkDetector implements
TextQualityDetector {
/** Model script key for the pooled COMMON (digits/punctuation/symbols)
table. */
public static final String COMMON_SCRIPT = "COMMON";
+ /** Sentinel "codepoints" (above the Unicode maximum U+10FFFF, so they
cannot
+ * collide with real text) that wrap each letter token: TOKEN_START (^)
before
+ * the first letter of a run and TOKEN_END ($) after the last. Emitted by
+ * {@link #forEachScriptBigram} so the bigram LM learns word-initial /
word-
+ * final letter typicality, and so z1 never empties for text containing
even a
+ * single letter. */
+ public static final int TOKEN_START = 0x110000;
+ public static final int TOKEN_END = 0x110001;
+
/** COMMON-class predicate: COMMON, INHERITED, UNKNOWN all pool into
COMMON. */
static String classKey(int cp) {
Character.UnicodeScript s = Character.UnicodeScript.of(cp);
@@ -1095,21 +1195,29 @@ public final class JunkDetector implements
TextQualityDetector {
}
// -----------------------------------------------------------------------
- // Bucket-by-script bigram enumeration (the keystone).
+ // Word-level bigram enumeration (the keystone).
// Single source of truth for BOTH inference z1 scoring and training tally.
- // A bigram (a,b) is:
- // - skipped if either codepoint is charset-invariant (digit) — digits
- // never enter any bucket;
- // - assigned to the adjacent real script when one side is COMMON glue
- // (space/punct fold into the neighbouring word);
- // - assigned to the COMMON bucket when BOTH sides are COMMON — so that
- // symbol/punctuation salad ("*^&(...") scores against the COMMON table
- // (unseen → junky), while a formatted number (digits skipped, its
- // punctuation digit-adjacent) lands almost nothing → neutral;
- // - skipped if a and b are two DIFFERENT real scripts (cross-script
- // boundary — not comparable, would mix tables).
- // COMMON is a normal bucket (its own table + calibration + combiner
weight),
- // scored through this same path.
+ // The codepoint stream is tokenized into maximal same-script runs:
+ // - LETTERs (Lu/Ll/Lt/Lm/Lo) are the scoreable content, bucketed by
script;
+ // - combining MARKs (Mn/Me/Mc) attach to the current run (so NFD
accents,
+ // Arabic harakat, Indic matras, Thai vowel signs stay inside their
word);
+ // - GLUE — every other non-letter that is NOT whitespace/NUL and NOT a
decode
+ // anomaly (punctuation, symbols, numbers) — ALSO attaches to the open
run
+ // and IS scored, at codepoint resolution. This is what lets z1 catch
a
+ // wrong-charset symbol wedged mid-word: the LM learns letter->'.' is
common
+ // but letter->U+2030 is ~0, so 'Hausj‰rven' (Latin-sibling misdecode)
floors
+ // while 'Hausjärven' (a real accented letter) does not. Resolution
is the
+ // codepoint, never the Unicode category — '%', U+2030, U+2020 are all
Po
+ // like '.', so a typed/binned boundary would hide them behind the
period's
+ // frequency (measured: letter->'.' = 235k vs letter->U+2030 = 0 in
LATIN);
+ // - BOUNDARIES that split a run and emit NOTHING: whitespace and NUL
(word /
+ // structure separators) and the decode-anomaly set (U+FFFD / C1 /
anomalous
+ // Cc / PUA), whose penalty is carried solely by z6 (anomaly ratio)
and z3,
+ // NEVER z1 — keeping anomalies out of z1 is what stops z1
cannibalizing the
+ // FFFD signal z6 owns;
+ // - a letter-script change is also a boundary (cross-script structure
is z4/z9).
+ // Each run is wrapped TOKEN_START (^) ... TOKEN_END ($) so the LM learns
+ // word-initial/final typicality and never empties for text with even one
letter.
// -----------------------------------------------------------------------
/** Sink for {@link #forEachScriptBigram}: (modelScript, cpA, cpB). */
@@ -1118,9 +1226,22 @@ public final class JunkDetector implements
TextQualityDetector {
void accept(String script, int a, int b);
}
- /** Charset-invariant content excluded from per-script bigram scoring. */
- static boolean isSkipCodepoint(int cp) {
- return Character.isDigit(cp);
+ /** True for letter codepoints (Lu/Ll/Lt/Lm/Lo) — the scoreable token
content
+ * that forms per-script runs. {@code type} is {@link
Character#getType}. */
+ static boolean isLetterCp(int type) {
+ return type == Character.UPPERCASE_LETTER
+ || type == Character.LOWERCASE_LETTER
+ || type == Character.TITLECASE_LETTER
+ || type == Character.MODIFIER_LETTER
+ || type == Character.OTHER_LETTER;
+ }
+
+ /** True for combining marks (Mn/Me/Mc) — they attach to the current run
+ * rather than splitting it. {@code type} is {@link Character#getType}.
*/
+ static boolean isMarkCp(int type) {
+ return type == Character.NON_SPACING_MARK
+ || type == Character.ENCLOSING_MARK
+ || type == Character.COMBINING_SPACING_MARK;
}
/**
@@ -1154,64 +1275,67 @@ public final class JunkDetector implements
TextQualityDetector {
return buckets;
}
- /** "Real" structural whitespace collapses to canonical U+0020 before
bigram
- * emission. Matches {@link #computeZ3ControlByte}'s definition of
- * non-anomalous whitespace: HT (0x09), LF (0x0A), CR (0x0D), regular
- * space (0x20), plus the Zs/Zl/Zp Unicode categories (NBSP, ideographic
- * space, line/paragraph separators).
- *
- * <p><strong>Anomalous Cc (0x01-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F, U+0085
- * NEL, U+0080-0x009F C1 controls) and Cf (format chars) are DELIBERATELY
- * NOT normalized.</strong> Their OOV-floor signal is carrying real
- * evidence that the decode is wrong — e.g., windows-1252 bytes 0x80-0x9F
- * decode to printable curly quotes / em-dashes; ISO-8859-16 misdecodes
- * them as C1 control codepoints; the bigram-table OOV-floor on those
- * Cc-touching bigrams is what correctly penalizes the wrong decode.
- * z3 has had this distinction since v15; this brings z1 in line. */
- static int normalizeWhitespace(int cp) {
- if (cp == 0x20) {
- return cp;
- }
- if (cp == 0x09 || cp == 0x0A || cp == 0x0D) {
- return 0x20;
- }
- int t = Character.getType(cp);
- if (t == Character.SPACE_SEPARATOR
- || t == Character.LINE_SEPARATOR
- || t == Character.PARAGRAPH_SEPARATOR) {
- return 0x20;
- }
- return cp;
- }
-
public static void forEachScriptBigram(int[] cps, BigramSink sink) {
- if (cps == null || cps.length < 2) {
+ if (cps == null || cps.length == 0) {
return;
}
- for (int i = 0; i + 1 < cps.length; i++) {
- int a = normalizeWhitespace(cps[i]);
- int b = normalizeWhitespace(cps[i + 1]);
- if (isSkipCodepoint(a) || isSkipCodepoint(b)) {
- continue;
- }
- String ka = classKey(a);
- String kb = classKey(b);
- boolean aCommon = COMMON_SCRIPT.equals(ka);
- boolean bCommon = COMMON_SCRIPT.equals(kb);
- String script;
- if (aCommon && bCommon) {
- script = COMMON_SCRIPT; // symbol/punct salad → COMMON
bucket
- } else if (aCommon) {
- script = kb; // glue folds into the real side
- } else if (bCommon) {
- script = ka;
- } else if (ka.equals(kb)) {
- script = ka; // same real script
- } else {
- continue; // cross-script boundary
+ String curScript = null; // script of the run in progress; null = no
open run
+ int prev = -1; // previous codepoint in the open run (left
side of next bigram)
+ for (int cp : cps) {
+ int type = Character.getType(cp);
+ if (isLetterCp(type)) {
+ String sc = classKey(cp);
+ if (curScript != null && sc.equals(curScript)) {
+ sink.accept(curScript, prev, cp); // within-run
letter bigram
+ } else {
+ if (curScript != null) {
+ sink.accept(curScript, prev, TOKEN_END); // close
the prior run
+ }
+ curScript = sc;
+ sink.accept(curScript, TOKEN_START, cp); // open a
new run
+ }
+ prev = cp;
+ } else if (isBoundaryCp(cp)) {
+ // WORD/STRUCTURE boundary (whitespace, NUL) or a decode
anomaly
+ // (U+FFFD / C1 / anomalous Cc / PUA — scored by z6/z3, never
z1):
+ // close the run, emit nothing.
+ if (curScript != null) {
+ sink.accept(curScript, prev, TOKEN_END);
+ curScript = null;
+ prev = -1;
+ }
+ } else if (curScript != null) {
+ // GLUE (punctuation / symbol / number) or a combining MARK
inside an
+ // open run: attach and SCORE it at codepoint resolution.
This is the
+ // intrusion signal: the LM learns letter->'.' is common but
+ // letter->U+2030 (per-mille) is ~0, so a wrong-charset symbol
wedged
+ // mid-word (the Latin-sibling misdecode, e.g. 'Hausj‰rven')
floors
+ // z1, while a clean accented letter (a real letter, scored as
a letter
+ // bigram) does not. Resolution is the codepoint, never the
Unicode
+ // category: '%', U+2030, U+2020 are all Po like '.', so
binning by
+ // category would hide them behind the period's huge frequency.
+ sink.accept(curScript, prev, cp);
+ prev = cp;
}
- sink.accept(script, a, b);
+ // else: orphan glue/mark with no open run -> nothing to attach
to, skip.
}
+ if (curScript != null) {
+ sink.accept(curScript, prev, TOKEN_END); // close the
final run
+ }
+ }
+
+ /** True for codepoints that BREAK a run without being scored in z1:
whitespace
+ * and NUL (word/structure boundaries) plus the z6/z3 decode-anomaly set
+ * ({@link TextQualityFeatures#isAnomalyCodepoint} — U+FFFD, C1,
anomalous Cc,
+ * private-use). Every other non-letter (punctuation, symbol, number) is
GLUE:
+ * it attaches to the open run and is scored, so the LM can floor a symbol
+ * wedged mid-word while keeping the anomaly penalty solely in z6 (so z1
never
+ * cannibalizes the FFFD signal). */
+ static boolean isBoundaryCp(int cp) {
+ return cp == 0x00
+ || Character.isWhitespace(cp)
+ || Character.isSpaceChar(cp)
+ || TextQualityFeatures.isAnomalyCodepoint(cp);
}
/**
diff --git
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
index a7706e2118..b8cb75de01 100644
---
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
+++
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
@@ -594,10 +594,7 @@ public class JunkFilterEncodingDetector implements
MetaEncodingDetector {
// Score CJK candidates on their vendor superset, not the strict base
// (which U+FFFDs vendor-extension chars and unfairly penalizes real
// CJK). AutoDetectReader re-applies the same superset for content.
- Charset decodeAs = CharsetSupersets.supersetOf(charset);
- if (decodeAs == null) {
- decodeAs = charset;
- }
+ Charset decodeAs = CharsetSupersets.decodeAs(charset);
try {
return new String(bytes, decodeAs);
} catch (Exception e) {
diff --git
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java
index 25f07b765f..2ae926a927 100644
---
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java
+++
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java
@@ -229,17 +229,29 @@ public final class TextQualityFeatures {
if (text == null || text.isEmpty()) {
return Double.NaN;
}
- int total = 0;
+ int denom = 0; // high bytes + sub-0x80 anomalies = codepoints
that CAN be decode failures
int anomaly = 0;
for (int i = 0; i < text.length(); ) {
int cp = text.codePointAt(i);
i += Character.charCount(cp);
- total++;
- if (isAnomalyCodepoint(cp)) {
+ boolean anom = isAnomalyCodepoint(cp);
+ if (anom) {
anomaly++;
}
+ if (cp >= 0x80 || anom) {
+ denom++;
+ }
}
- return total == 0 ? Double.NaN : (double) anomaly / total;
+ // Denominator is non-ASCII (+ sub-0x80 control anomalies), NOT total
+ // codepoints: anomalies only arise from undecodable HIGH bytes, so
this is
+ // the fraction "of the bytes that COULD fail, how many did" —
undiluted by
+ // ASCII. PURE ratio (no smoothing, no min-denom cliff): a page whose
few
+ // non-ASCII chars all failed (1 FFFD / 1 non-ASCII = 1.0) is a strong
wrong-
+ // charset signal and must register fully, while a content-rich page
with a
+ // few stray FFFD has a naturally-tiny ratio (1/500 ≈ 0). This is the
signal
+ // that distinguishes Latin wrong-charset (ratio→1) from CJK
mixed-encoding
+ // (ratio→0.06). All-ASCII (denom 0) → 0 (clean).
+ return denom == 0 ? 0.0 : (double) anomaly / denom;
}
/** True if {@code cp} is in the z6 anomaly set: U+FFFD, anomalous Cc
@@ -364,6 +376,14 @@ public final class TextQualityFeatures {
* signal) from "all-whitespace / digit-only content" (zero density
* → strong negative signal in JunkDetector's bigram-based judgment,
* mild signal for general-purpose junk filtering).
+ *
+ * <p><strong>U+FFFD is excluded</strong> from both numerator and
+ * denominator: it is a decode-failure marker scored by the dedicated
+ * replacement-char feature (z6), so counting it here too would (a) double-
+ * count FFFD and (b) re-create the FFFD-drag when this feature dominates —
+ * a permissive wrong decode (few FFFD) would out-score a correct mixed-
+ * encoding decode (many FFFD from undecodable widget bytes). This
measures
+ * the composition of the *decodable* content only.
*/
public static double scriptDensity(String text) {
if (text == null || text.isEmpty()) {
@@ -374,6 +394,9 @@ public final class TextQualityFeatures {
for (int i = 0; i < text.length(); ) {
int cp = text.codePointAt(i);
i += Character.charCount(cp);
+ if (cp == 0xFFFD) {
+ continue;
+ }
total++;
Character.UnicodeScript s = Character.UnicodeScript.of(cp);
if (s != Character.UnicodeScript.COMMON
diff --git
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
index a45f74a82c..b4460501b4 100644
---
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
+++
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
@@ -658,14 +658,15 @@ public class BuildJunkTrainingData {
if (text.indexOf('\uFFFD') >= 0) {
return null;
}
- // NFC so the training tally matches JunkDetector.aggregate at
- // inference time (which also NFC-normalises — see comment at
- // JunkDetector#aggregate). Precomposed characters (Latin
- // diacritics, Vietnamese, Indic combining-mark sequences) are
- // stored as single codepoints, so bigram counts collapse mark
- // + letter into one unit instead of splitting them — matching
- // the natural NFC form of most source text.
- text = Normalizer.normalize(text, Normalizer.Form.NFC);
+ // NFKC so the training tally matches JunkDetector.aggregate at
+ // inference time (which also NFKC-normalises — see comment at
+ // JunkDetector#aggregate). Like NFC it stores precomposed characters
+ // (Latin diacritics, Vietnamese, Indic combining-mark sequences) as
+ // single codepoints so bigram counts collapse mark + letter into one
+ // unit; additionally it folds legacy compatibility forms to canonical
+ // — critically half-width katakana (U+FF66-FF9F) -> full-width — so
+ // those bigrams match the trained HAN tables instead of flooring.
+ text = Normalizer.normalize(text, Normalizer.Form.NFKC);
if (text.getBytes(StandardCharsets.UTF_8).length < minBytes) {
return null;
}
diff --git
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
index a764444e59..63bce5317f 100644
---
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
+++
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
@@ -83,6 +83,19 @@ public class TrainJunkModel {
*/
static final float CONTROL_BYTE_MIN_SIGMA = 0.005f;
+ /**
+ * Minimum sigma for the per-script z1 calibration. Well-trained scripts
+ * (>1000 dev windows) reach sigma in [0.38, 1.29]; below ~0.38 the
value is
+ * an artifact of too few windows (TIFINAGH at 2 windows gave
sigma=0.0006). A
+ * garbage doc that scatters a few codepoints into such a degenerate
bucket then
+ * gets z1 = numerator/sigma in the thousands, dragging the count-weighted
doc z1
+ * to −700+ (languageness ≈ −458) — correct sign, absurd magnitude.
Flooring at
+ * the reliable-training floor caps under-trained scripts to the genuine
z1 scale
+ * (worst ≈ −60) and barely touches well-trained scripts (only a sigma in
+ * [0.38, 0.4) is raised, by at most 0.02), so no real garbage is clipped.
+ */
+ static final float Z1_MIN_SIGMA = 0.4f;
+
/**
* Full-text byte-level mojibake pairs used by {@link #byteLevelMojibake}.
* Each entry is {sourceCodec, wrongCodec}: training text gets encoded in
@@ -259,11 +272,14 @@ public class TrainJunkModel {
}
//
-----------------------------------------------------------------------
- // Phase 1 — bucket-by-script bigram tables (per real script + COMMON).
- // ONE tally pass over every train file via forEachScriptBigram: digits
- // skipped, COMMON glue folded into the adjacent script, both-COMMON
- // bigrams charged to the COMMON bucket. No runs, no sentinels.
COMMON
- // is a normal bucket (its own table + calibration + combiner weight).
+ // Phase 1 — per-script bigram tables. ONE tally pass over every train
+ // file via forEachScriptBigram (the SAME tokenizer inference uses):
the
+ // text is split into maximal same-script LETTER runs (marks attach),
each
+ // run wrapped TOKEN_START..TOKEN_END, and the within-run bigrams
(incl. the
+ // ^/$ edges) tallied to the run's script. Whitespace, NUL and decode anomalies
+ // are boundaries (emit nothing); other non-letters attach to the run and are
+ // scored, so the tables hold exactly the bigrams inference scores. COMMON is
+ // now effectively vestigial (only the rare COMMON-scripted letter lands there).
//
-----------------------------------------------------------------------
System.out.println("\n--- Phase 1: bucket-by-script F1 tables ---");
Map<String, HashMap<Long, long[]>> pairsByScript = new HashMap<>();
@@ -312,6 +328,7 @@ public class TrainJunkModel {
for (String script : f1TablesByScript.keySet()) {
List<Double> scores = z1ScoresByScript.getOrDefault(script, new
ArrayList<>());
float[] cal = scores.isEmpty() ? new float[]{0f, 1f} :
muSigma(scores);
+ cal[1] = Math.max(cal[1], Z1_MIN_SIGMA); // floor degenerate
(under-trained) sigma
f1Calibrations.put(script, cal);
System.out.printf(" [%s] mu=%.4f sigma=%.4f (%,d windows)%n",
script, cal[0], cal[1], scores.size());
@@ -446,11 +463,9 @@ public class TrainJunkModel {
end++;
}
String s = new String(bytes, start, end - start,
StandardCharsets.UTF_8);
- // NFD-normalize on read so calibration/training feature math
- // matches JunkDetector.scoreText's NFD path. On-disk corpus
- // may be NFC (older builds of BuildJunkTrainingData); NFD is
- // idempotent on already-NFD text.
- s = java.text.Normalizer.normalize(s,
java.text.Normalizer.Form.NFC);
+ // NFKC-normalize so training matches JunkDetector inference (folds
+ // compatibility forms incl. half-width katakana -> full-width).
+ s = java.text.Normalizer.normalize(s,
java.text.Normalizer.Form.NFKC);
result.add(s);
}
return result;
@@ -479,7 +494,7 @@ public class TrainJunkModel {
continue;
}
String norm = java.text.Normalizer.normalize(
- line, java.text.Normalizer.Form.NFC);
+ line, java.text.Normalizer.Form.NFKC);
int[] cps = norm.codePoints().toArray();
JunkDetector.forEachScriptBigram(cps, (script, a, b) -> {
HashMap<Long, long[]> pairs =
pairsByScript.computeIfAbsent(
@@ -579,7 +594,9 @@ public class TrainJunkModel {
scores.add(bk[0] / bk[1]);
}
}
- return scores.isEmpty() ? new float[]{0f, 1f} : muSigma(scores);
+ float[] cal = scores.isEmpty() ? new float[]{0f, 1f} : muSigma(scores);
+ cal[1] = Math.max(cal[1], Z1_MIN_SIGMA); // floor degenerate
(under-trained) sigma
+ return cal;
}
/** Single GLOBAL z3 (control-byte) calibration, pooled over all files. */
@@ -698,6 +715,29 @@ public class TrainJunkModel {
return new String(codepoints, 0, codepoints.length);
}
+ /**
+ * Replaces a fraction of HIGH (non-ASCII) codepoints with U+FFFD,
simulating
+ * bytes the chosen charset could not decode. ASCII is never touched (a
+ * decode failure is a high byte, never clean ASCII). Used two ways: a LOW
+ * rate over coherent text for mixed-encoding CLEAN positives (real text
with
+ * a few undecodable widget bytes), and a HIGH rate over already-incoherent
+ * text for FFFD-heavy NEGATIVES. Pairing the two teaches the combiner
that
+ * FFFD (z6) is junk evidence only when z1/coherence is ALSO low — so
removing
+ * FFFD from z1 does not make z6/z7 over-penalize mixed-encoding pages.
+ */
+ static String injectReplacementChars(String text, double rate, Random rng)
{
+ if (text.isEmpty()) {
+ return text;
+ }
+ int[] codepoints = text.codePoints().toArray();
+ for (int i = 0; i < codepoints.length; i++) {
+ if (codepoints[i] >= 0x80 && rng.nextDouble() < rate) {
+ codepoints[i] = 0xFFFD;
+ }
+ }
+ return new String(codepoints, 0, codepoints.length);
+ }
+
/**
* Strips combining marks (Mn / Mc / Me categories) after NFD
* normalization. Models the PDF/OCR pipeline that drops marks
@@ -1118,6 +1158,38 @@ public class TrainJunkModel {
addContrastivePair(fx, w, byteLevelMojibake(w, pr[0],
pr[1]),
fc, pairCorrect, pairWrong);
}
+
+ // Mixed-encoding (korA-class) pair: coherent text with heavy
+ // undecodable FFFD widget bytes (~15-20% of high bytes) must
BEAT
+ // coherent-looking mojibake of the same source text (the
clean w,
+ // without the FFFD widgets). As a contrastive
+ // PAIR (not a pointwise positive) it adds no class imbalance,
and
+ // it forces z1/coherence to stay the discriminator: the
correct
+ // side has MORE FFFD (z6 favors the wrong side) and equal z7
+ // (FFFD excluded), so only z1 can rank it correctly. This is
what
+ // keeps removing FFFD from z1 from letting z6/z7 sink korA.
+ String mixed = injectReplacementChars(w, 0.15 +
rng.nextDouble() * 0.05, rng);
+ float[] fMixed = featureVector(fx, mixed);
+ if (fMixed != null) {
+ String[] pr = BYTE_LEVEL_MOJIBAKE_PAIRS[
+ rng.nextInt(BYTE_LEVEL_MOJIBAKE_PAIRS.length)];
+ addContrastivePair(fx, mixed, byteLevelMojibake(w, pr[0],
pr[1]),
+ fMixed, pairCorrect, pairWrong);
+ }
+
+ // All-caps pair (cased scripts only): all-caps CLEAN (z1
recovered
+ // by the case-folded backoff) must beat all-caps mojibake of
the
+ // same bytes (no seen lowercase twin -> still floored).
+ String upper = w.toUpperCase(java.util.Locale.ROOT);
+ if (!upper.equals(w)) {
+ float[] fUpper = featureVector(fx, upper);
+ if (fUpper != null) {
+ String[] pr = BYTE_LEVEL_MOJIBAKE_PAIRS[
+ rng.nextInt(BYTE_LEVEL_MOJIBAKE_PAIRS.length)];
+ addContrastivePair(fx, upper, byteLevelMojibake(upper,
pr[0], pr[1]),
+ fUpper, pairCorrect, pairWrong);
+ }
+ }
if ("LATIN".equals(script)) {
String[] pr = LATIN_TO_CJK_PAIRS[
rng.nextInt(LATIN_TO_CJK_PAIRS.length)];
@@ -1129,15 +1201,33 @@ public class TrainJunkModel {
pairCorrect, pairWrong);
}
+ // Monotonicity anchor: the SAME clean text with its non-ASCII
chars
+ // replaced by U+FFFD must rank BELOW the clean text. This
leaves z1
+ // HIGH (often higher: FFFD removes the hard accented bigram
and the
+ // surviving fragments are common -- de_correct z1=0.27 vs
de_fffd
+ // 0.55) and z7 high, so z6 (replacement ratio) is the ONLY
feature
+ // that separates it from clean -- forcing the combiner to
weight z6
+ // enough to overrule z1's perverse FFFD gain (the deu/gsw,
deu/frr
+ // cases). (A generalized real-charset-mojibake variant was
tried for
+ // the within-Latin SIBLING case but reverted -- it cost
korA/z6
+ // without fixing the siblings; see
LANGUAGENESS_RESIDUAL_FAILURES.md.)
+ String fffdSame = injectReplacementChars(w, 0.5 +
rng.nextDouble() * 0.5, rng);
+ addContrastivePair(fx, w, fffdSame, fc, pairCorrect,
pairWrong);
+
// Pointwise garbage anchor (generic junk, no correct
counterpart).
String junk;
- int mode = rng.nextInt(3);
+ int mode = rng.nextInt(4);
if (mode == 0) {
junk = injectControlChars(w, 0.15, rng);
} else if (mode == 1) {
junk = shuffleChars(w, rng);
- } else {
+ } else if (mode == 2) {
junk = injectPrivateUseAreaChars(w, 0.12, rng);
+ } else {
+ // FFFD-heavy junk: incoherent (shuffled) AND
mostly-undecodable.
+ // Pairs with the mixed-encoding clean positive above so
the
+ // combiner learns FFFD is junk evidence only when z1 is
low.
+ junk = injectReplacementChars(shuffleChars(w, rng), 0.4,
rng);
}
float[] fb = featureVector(fx, junk);
if (fb != null) {
diff --git
a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
index a83dd647e6..ead028cbb3 100644
Binary files
a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
and
b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
differ
diff --git
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/DecodeCorruptionDiscriminationTest.java
b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/DecodeCorruptionDiscriminationTest.java
new file mode 100644
index 0000000000..5f2ceb298f
--- /dev/null
+++
b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/DecodeCorruptionDiscriminationTest.java
@@ -0,0 +1,172 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.util.Locale;
+
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Languageness must be MONOTONIC under decode corruption: a clean German
phrase
+ * must score strictly higher than the same phrase broken two distinct ways,
each
+ * exercising a different feature.
+ *
+ * <ol>
+ * <li>U+FFFD -- a decode failure (accented chars replaced by the replacement
+ * char). Caught by z6 (replacement ratio). Subtle: FFFD is a token
+ * boundary, so it DROPS the hard accented bigram and the surviving
common-
+ * letter fragments LIFT z1 -- coherence is fooled into preferring the
broken
+ * decode, which used to score HIGHER than clean on short pages in the
150k
+ * CommonCrawl eval (deu/gsw, deu/frr). z6 must overrule that.</li>
+ * <li>Wrong accented letter -- real Latin letters that do not belong in
German
+ * (Nordic a-ring / y-diaeresis / thorn), no FFFD. Caught by z1 (letter
+ * coherence); the margin is smaller (pan-Latin pools many languages)
but the
+ * clean decode must still win.</li>
+ * </ol>
+ *
+ * <p>Non-ASCII is written with Unicode escapes so the source stays ASCII-only.
+ */
+public class DecodeCorruptionDiscriminationTest {
+
+ private static JunkDetector jd;
+
+ @BeforeAll
+ static void load() throws Exception {
+ jd = JunkDetector.loadFromClasspath();
+ }
+
+ private static float languageness(String s) {
+ return jd.scoreWithFeatureComponents(s).logit;
+ }
+
+ /** Clean German prose: "Die naechste Stunde beginnt am Montag um neun Uhr
im
+ * grossen Saal fuer alle Anfaenger" (with real umlauts/eszett). */
+ private static final String CLEAN =
+ "Die n\u00E4chste Stunde beginnt am Montag um neun Uhr im "
+ + "gro\u00DFen Saal f\u00FCr alle Anf\u00E4nger";
+
+ @Test
+ void cleanOutscoresFffdBrokenWord() {
+ // Every accented char -> U+FFFD: a decode failure that leaves the
+ // surrounding letters real and in order, so z1 is unharmed (even
helped).
+ // z6 (replacement ratio) must overrule z1 and rank clean higher.
+ String fffd =
+ "Die n\uFFFDchste Stunde beginnt am Montag um neun Uhr im "
+ + "gro\uFFFDen Saal f\uFFFDr alle Anf\uFFFDnger";
+ float clean = languageness(CLEAN);
+ float broken = languageness(fffd);
+ assertTrue(clean > broken,
+ "clean German must outscore the U+FFFD-broken decode (a decode
"
+ + "failure must never raise languageness); clean=" + clean
+ + " fffd=" + broken);
+ }
+
+ @Test
+ void cleanOutscoresWrongAccentedLetter() {
+ // Accented Latin letters that do not belong in German: ae->a-ring
(U+00E5),
+ // ss->thorn (U+00FE), ue->y-diaeresis (U+00FF). No FFFD; z1
(coherence)
+ // must rank the clean decode higher even though the substitutes are
valid
+ // Latin in some language.
+ String wrong =
+ "Die n\u00E5chste Stunde beginnt am Montag um neun Uhr im "
+ + "gro\u00FEen Saal f\u00FFr alle Anf\u00E5nger";
+ float clean = languageness(CLEAN);
+ float broken = languageness(wrong);
+ assertTrue(clean > broken,
+ "clean German must outscore the wrong-accented-letter decode; "
+ + "clean=" + clean + " wrong=" + broken);
+ }
+
+ @Test
+ void allCapsAndTitleOutscoreAlternatingCase() {
+ // Case-CONSISTENT real text (ALL-CAPS headings, Title Case) scores
clean;
+ // case-INCONSISTENT alternating case ("aLtErNaTiNg") is the junk
pattern and
+ // must floor. The all-caps fix borrows the lowercase score for
consistent
+ // uppercase; the case-consistency gate keeps alternating case from
rescue.
+ String lower = "international organization standards committee
meeting";
+ String allCaps = lower.toUpperCase(Locale.ROOT);
+ String title = toTitleCase(lower);
+ String alt = toAlternatingCase(lower);
+ float lLower = languageness(lower);
+ float lAll = languageness(allCaps);
+ float lTitle = languageness(title);
+ float lAlt = languageness(alt);
+ assertTrue(lAll > lAlt,
+ "ALL-CAPS must outscore aLtErNaTiNg junk; allCaps=" + lAll + "
alt=" + lAlt);
+ assertTrue(lTitle > lAlt,
+ "Title-case must outscore aLtErNaTiNg junk; title=" + lTitle +
" alt=" + lAlt);
+ assertTrue(lAll > lLower - 1.0f,
+ "ALL-CAPS must score ~= lowercase (case-fold rescue);
allCaps=" + lAll
+ + " lower=" + lLower);
+ }
+
+ @Test
+ void allCapsCyrillicOutscoresGibberishDecode() {
+ // The real 150k regression (corpus file 1A68D...): all-caps Russian
+ // "MUZEJ BUDUSchEGO" was scored BELOW a KOI8-R gibberish decode, so
the
+ // detector chose the gibberish. The case-fold must rank the all-caps
real
+ // Russian like its lowercase form, above the gibberish. ASCII
source: the
+ // Russian is written with Unicode escapes.
+ String allCaps = "\u041C\u0423\u0417\u0415\u0419
\u0411\u0423\u0414\u0423\u0429\u0415\u0413\u041E";
+ String lower = "\u043C\u0443\u0437\u0435\u0439
\u0431\u0443\u0434\u0443\u0449\u0435\u0433\u043E";
+ String gibberish = "\u043B\u0441\u0433\u0435\u0438
\u0430\u0441\u0434\u0441\u044B\u0435\u0446\u043D";
+ float lAll = languageness(allCaps);
+ float lLower = languageness(lower);
+ float lJunk = languageness(gibberish);
+ assertTrue(lAll > lJunk,
+ "all-caps real Russian must outscore KOI8-R gibberish;
allCaps=" + lAll
+ + " gibberish=" + lJunk);
+ assertTrue(Math.abs(lAll - lLower) < 0.6f,
+ "all-caps must score ~= lowercase; allCaps=" + lAll + "
lower=" + lLower);
+ }
+
+ private static String toTitleCase(String s) {
+ StringBuilder sb = new StringBuilder(s.length());
+ boolean start = true;
+ for (int i = 0; i < s.length(); i++) {
+ char c = s.charAt(i);
+ if (Character.isWhitespace(c)) {
+ start = true;
+ sb.append(c);
+ } else if (start) {
+ sb.append(Character.toUpperCase(c));
+ start = false;
+ } else {
+ sb.append(c);
+ }
+ }
+ return sb.toString();
+ }
+
+ private static String toAlternatingCase(String s) {
+ StringBuilder sb = new StringBuilder(s.length());
+ boolean up = false;
+ for (int i = 0; i < s.length(); i++) {
+ char c = s.charAt(i);
+ if (Character.isLetter(c)) {
+ sb.append(up ? Character.toUpperCase(c) :
Character.toLowerCase(c));
+ up = !up;
+ } else {
+ sb.append(c);
+ }
+ }
+ return sb.toString();
+ }
+}
diff --git
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
index 73133cf9c9..07efc64dd8 100644
---
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
+++
b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
@@ -16,7 +16,9 @@
*/
package org.apache.tika.ml.junkdetect;
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.BufferedWriter;
@@ -25,6 +27,7 @@ import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.TreeMap;
@@ -51,6 +54,21 @@ import org.apache.tika.quality.TextQualityScore;
*/
public class JunkDetectorRoundTripTest {
+ @Test
+ void requireUsableSigmaRejectsNonPositiveOrNonFinite() {
+ assertDoesNotThrow(() -> JunkDetector.requireUsableSigma("ok", new
float[]{-3f, 0.5f}));
+ for (float badSigma : new float[]{0f, -0.1f, Float.NaN,
+ Float.POSITIVE_INFINITY, Float.NEGATIVE_INFINITY}) {
+ assertThrows(IOException.class,
+ () -> JunkDetector.requireUsableSigma("bad", new
float[]{0f, badSigma}),
+ "sigma=" + badSigma + " must be rejected");
+ }
+ assertThrows(IOException.class,
+ () -> JunkDetector.requireUsableSigma("null", null));
+ assertThrows(IOException.class,
+ () -> JunkDetector.requireUsableSigma("short", new
float[]{1f}));
+ }
+
@Test
void roundTripSeenPairAndUnigramBackoff(@TempDir Path tmp) throws
IOException {
// -----------------------------------------------------------------
@@ -273,25 +291,107 @@ public class JunkDetectorRoundTripTest {
-10.0f, 1.0f);
}
- /** Expected z1: mean log-prob over every non-digit adjacent bigram scored
- * against the single-script {@code tables}, calibrated. Mirrors
- * {@link JunkDetector}'s aggregate for single-script text. */
+ @Test
+ void tokenizationScoresGlueButKeepsAnomaliesAndWhitespaceAsBoundaries() {
+ // A letter run is wrapped ^...$. GLUE (punctuation, symbols, numbers)
+ // attaches to the open run and IS scored at codepoint resolution, so a
+ // symbol wedged mid-word becomes a real (rare) bigram the LM can
floor.
+ // But DECODE ANOMALIES (here U+FFFD; also C1 / PUA) and WHITESPACE
stay
+ // boundaries that split the run and emit nothing — the anomaly penalty
+ // lives solely in z6, never z1, so z1 cannot cannibalize the FFFD
signal.
+ String fffd = String.valueOf((char) 0xFFFD);
+ // all-letter run
+ assertEquals(List.of("^-a", "a-b", "b-c", "c-d", "d-$"),
bigrams("abcd"));
+ // glue (period, U+2030 per-mille) is SCORED inside the run, not
dropped
+ assertEquals(List.of("^-a", "a-b", "b-.", ".-c", "c-d", "d-$"),
bigrams("ab.cd"));
+ assertEquals(List.of("^-a", "a-b", "b-\u2030", "\u2030-c", "c-d",
"d-$"),
+ bigrams("ab\u2030cd"));
+ // U+FFFD (decode anomaly) is still a BOUNDARY: splits, emits nothing
+ assertEquals(List.of("^-a", "a-b", "b-$", "^-c", "c-d", "d-$"),
+ bigrams("ab" + fffd + "cd"));
+ assertEquals(List.of("^-a", "a-b", "b-$"), bigrams("ab" + fffd +
fffd));
+ // whitespace is a boundary too
+ assertEquals(List.of("^-a", "a-b", "b-$", "^-c", "c-d", "d-$"),
+ bigrams("ab cd"));
+ }
+
+ /** Collects {@link JunkDetector#forEachScriptBigram} output as "a-b"
strings,
+ * rendering the run-boundary sentinels as {@code ^} (start) / {@code $}
(end). */
+ private static List<String> bigrams(String s) {
+ List<String> out = new ArrayList<>();
+ JunkDetector.forEachScriptBigram(s.codePoints().toArray(), (script, a,
b) ->
+ out.add(fmtCp(a) + "-" + fmtCp(b)));
+ return out;
+ }
+
+ private static String fmtCp(int cp) {
+ if (cp == JunkDetector.TOKEN_START) return "^";
+ if (cp == JunkDetector.TOKEN_END) return "$";
+ return new String(Character.toChars(cp));
+ }
+
+ @Test
+ void caseFoldedBackoffRescuesAllCapsButNotMixedOrMojibake() {
+ // Synthetic LATIN table: index ['a','b'], the lowercase pair (a,b)
seen
+ // at a high log-prob (-1.0). Uppercase 'A'/'B' are absent from the
index.
+ BigramTables t = buildLatinTablesLowerAB();
+ double seenLower = JunkDetector.computeF1MeanLogP(new int[]{'a', 'b'},
t);
+ double allCaps = JunkDetector.computeF1MeanLogP(new int[]{'A', 'B'},
t);
+ double mixed = JunkDetector.computeF1MeanLogP(new int[]{'a', 'B'}, t);
+ double noTwin = JunkDetector.computeF1MeanLogP(new int[]{'B', 'A'}, t);
+ // All-caps "AB" folds to the SEEN lowercase "ab", landing a small
case-fold
+ // penalty BELOW it (all-caps is a slightly weaker signal) -- but
nowhere near
+ // the independence floor the mixed/mojibake cases hit below.
+ assertTrue(allCaps < seenLower && allCaps > seenLower - 0.5,
+ "all-caps AB must fold to ~ the seen lowercase ab (minus a
small penalty); "
+ + "seenLower=" + seenLower + " allCaps=" + allCaps);
+ // Mixed-case "aB" is case-INCONSISTENT -> not folded -> independence
floor.
+ assertTrue(mixed < allCaps - 1.0,
+ "mixed-case aB must not fold (consistency gate)");
+ // All-caps "BA" whose lowercase twin (b,a) is UNSEEN -> floors
(mojibake case).
+ assertTrue(noTwin < allCaps - 1.0,
+ "all-caps with no seen lowercase twin must floor");
+ }
+
+ /** Like {@link #buildLatinTablesAB} but indexed on LOWERCASE ['a','b']
with
+ * the lowercase pair (a,b) seen at -1.0 — exercises the case-folded
backoff
+ * (uppercase 'A'/'B' are absent from the index, so they must fold). */
+ private static BigramTables buildLatinTablesLowerAB() {
+ int[] cpIndex = new int[]{'a', 'b'};
+ int[] keys = new int[4];
+ Arrays.fill(keys, BigramTables.EMPTY_KEY);
+ byte[] values = new byte[4];
+ float bMin = -10.0f;
+ float bMax = -1.0f;
+ insertOA(keys, values, JunkDetector.packBigramKey(0, 1),
+ quantizeOne(-1.0f, bMin, bMax));
+ float uMin = -5.0f;
+ float uMax = -2.0f;
+ byte[] unigramBytes = new byte[]{
+ quantizeOne(-2.0f, uMin, uMax),
+ quantizeOne(-2.0f, uMin, uMax),
+ };
+ return new BigramTables(cpIndex, keys, values, unigramBytes,
+ bMin, bMax, uMin, uMax, -10.0f, 1.0f);
+ }
+
+ /** Expected z1: mean log-prob over the bigrams {@link
+ * JunkDetector#forEachScriptBigram} emits (word-run tokenization with ^/$
+ * wrapping), scored against the single-script {@code tables}, calibrated.
+ * Delegates to the production tokenizer so it cannot drift from
inference. */
private static float expectedRunZ(BigramTables tables, String text, float
mu, float sigma) {
- int[] cps = text.codePoints().toArray();
- double sum = 0;
- long n = 0;
- for (int i = 0; i + 1 < cps.length; i++) {
- if (Character.isDigit(cps[i]) || Character.isDigit(cps[i + 1])) {
- continue;
- }
- double f1 = JunkDetector.computeF1MeanLogP(new int[]{cps[i], cps[i
+ 1]}, tables);
- if (Double.isNaN(f1)) {
- continue;
+ double[] acc = new double[2]; // {sum, count}
+ JunkDetector.forEachScriptBigram(text.codePoints().toArray(), (script,
a, b) -> {
+ double f1 = JunkDetector.computeF1MeanLogP(new int[]{a, b},
tables);
+ if (!Double.isNaN(f1)) {
+ acc[0] += f1;
+ acc[1] += 1;
}
- sum += f1;
- n++;
+ });
+ if (acc[1] == 0) {
+ throw new IllegalArgumentException("no scorable bigrams for: " +
text);
}
- return (float) ((sum / n - mu) / sigma);
+ return (float) ((acc[0] / acc[1] - mu) / sigma);
}
/** Quantize a single float to 8-bit unsigned using the explicit range. */
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
index 05d8f836e6..c38cc2fd7f 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
@@ -53,6 +53,7 @@ import org.apache.tika.detect.EncodingResult;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractEncodingDetectorParser;
import org.apache.tika.parser.ParseContext;
@@ -165,6 +166,9 @@ public class JSoupParser extends
AbstractEncodingDetectorParser {
: encResults.get(0).getCharset();
Charset decodeAs = encResults.isEmpty() ? DEFAULT_CHARSET
: encResults.get(0).getDecodeAs();
+ if (!decodeAs.equals(charset)) {
+ metadata.set(TikaCoreProperties.DECODED_CHARSET, decodeAs.name());
+ }
String previous = metadata.get(Metadata.CONTENT_TYPE);
MediaType contentType = null;
if (previous == null || previous.startsWith("text/html")) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index f10f1be80d..0b3776066e 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -286,6 +286,42 @@ public class HtmlParserTest extends TikaTest {
assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
}
+ /**
+ * A page that declares {@code charset=euc-kr} but actually uses UHC (MS949)
+ * extension Hangul must be decoded with the superset {@code x-windows-949},
+ * not strict EUC-KR (which U+FFFDs the extension syllables). Mirrors the
+ * promotion {@code AutoDetectReader} already applies for non-HTML.
+ * {@code CONTENT_ENCODING} still reports the detected charset; the superset
+ * actually used is recorded in {@code DECODED_CHARSET}.
+ *
+ * @see org.apache.tika.detect.CharsetSupersets
+ */
+ @Test
+ public void testEucKrPromotedToMs949Superset() throws Exception {
+ // U+AC02 is outside EUC-KR (KS X 1001) but inside x-windows-949 (MS949);
+ // its MS949 bytes 0x81 0x41 decode to U+FFFD followed by 'A' under strict
+ // EUC-KR, so a correct decode proves the superset was used. U+D55C U+AD6D
+ // ("Korea") is a normal EUC-KR syllable pair.
+ String test = "<html><head><meta charset=\"euc-kr\" />" +
+ "<title>title</title></head>" +
+ "<body><p>\uAC02 \uD55C\uAD6D</p></body></html>";
+ Metadata metadata = new Metadata();
+ BodyContentHandler handler = new BodyContentHandler();
+ try (TikaInputStream tis = TikaInputStream.get(test.getBytes("x-windows-949"))) {
+ new JSoupParser().parse(tis, handler, metadata, new ParseContext());
+ }
+ // Metadata reports the *detected* charset ...
+ assertEquals("EUC-KR", metadata.get(Metadata.CONTENT_ENCODING));
+ // ... but decoding used the superset, recorded in DECODED_CHARSET.
+ assertEquals(java.nio.charset.Charset.forName("x-windows-949").name(),
+ metadata.get(TikaCoreProperties.DECODED_CHARSET));
+ // The UHC-only syllable round-trips; under strict EUC-KR it would be U+FFFD.
+ String content = handler.toString();
+ assertContains("\uAC02", content);
+ assertFalse(content.contains("\uFFFD"),
+ "no replacement chars expected under the MS949 superset decode");
+ }
+
/**
* Test case for TIKA-334
*