This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 9eff319635 TIKA-4745 - small twiddle on charset detection (#2886)
9eff319635 is described below
commit 9eff3196354b3338e9298b502d564e12a508e367
Author: Tim Allison <[email protected]>
AuthorDate: Tue Jun 9 21:47:58 2026 +0200
TIKA-4745 - small twiddle on charset detection (#2886)
---
.../tika/ml/chardetect/CjkDecodeValidator.java | 21 +++++++++++++--
.../ml/chardetect/MojibusterEncodingDetector.java | 31 +++++++++++++++-------
.../NaiveBayesBigramEncodingDetector.java | 28 +++++++++++++++++++
.../tika/ml/chardetect/CjkDecodeValidatorTest.java | 21 +++++++++++++++
4 files changed, 90 insertions(+), 11 deletions(-)
diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CjkDecodeValidator.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CjkDecodeValidator.java
index cf4e4e6554..00f44dcee5 100644
--- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CjkDecodeValidator.java
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CjkDecodeValidator.java
@@ -64,8 +64,18 @@ public final class CjkDecodeValidator {
* Failure rate of {@code bytes} under {@code cjkCharset}'s vendor superset,
* counting only legacy high bytes (embedded UTF-8 is skipped, not counted).
*
- * @return failures / legacy-high-bytes, or {@code -1.0} when there is too
- * little legacy evidence (legacy high bytes < {@link #MIN_HIGH_BYTES})
+ * <p>Special case: if every high byte is a valid UTF-8 sequence (i.e.,
+ * {@code nHigh == 0}) and there are at least {@link #MIN_HIGH_BYTES} UTF-8
+ * multi-byte sequences, the probe is pure UTF-8 — no legacy CJK content at
+ * all. In that case {@code 1.0} is returned to trigger the CJK veto.
+ * Real legacy CJK encodings (Shift_JIS, Big5, EUC-JP, GB18030 …) always
+ * have lead bytes in 0x81–0x9F or 0xF5–0xFF that are not valid UTF-8 starts,
+ * so {@code nHigh > 0} for any genuine CJK document.
+ *
+ * @return failures / legacy-high-bytes, {@code 1.0} when the probe is pure
+ * UTF-8 (nHigh==0, nUTF8seqs≥{@link #MIN_HIGH_BYTES}), or
+ * {@code -1.0} when there is too little evidence either way
+ * (legacy high bytes < {@link #MIN_HIGH_BYTES} and not pure UTF-8)
*/
public static double strippedFailureRate(byte[] bytes, Charset cjkCharset)
{
Charset decodeAs = CharsetSupersets.decodeAs(cjkCharset);
@@ -77,6 +87,7 @@ public final class CjkDecodeValidator {
int n = bytes.length;
int fail = 0;
int nHigh = 0;
+ int nUtf8Seqs = 0;
while (i < n) {
int x = bytes[i] & 0xFF;
if (x < 0x80) {
@@ -85,6 +96,7 @@ public final class CjkDecodeValidator {
}
int ulen = utf8SequenceLength(bytes, i);
if (ulen > 0) {
+ nUtf8Seqs++;
i += ulen; // embedded UTF-8 — not legacy content, skip
continue;
}
@@ -102,6 +114,11 @@ public final class CjkDecodeValidator {
}
}
if (nHigh < MIN_HIGH_BYTES) {
+ // Pure UTF-8: no legacy high bytes at all but enough UTF-8 sequences
+ // to be confident. Return 1.0 so the CJK veto fires.
+ if (nHigh == 0 && nUtf8Seqs >= MIN_HIGH_BYTES) {
+ return 1.0;
+ }
return -1.0;
}
return (double) fail / nHigh;
diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
index e225366091..3d48b595d0 100644
--- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
@@ -427,17 +427,30 @@ public class MojibusterEncodingDetector implements EncodingDetector {
LOG.trace("mojibuster pool empty -> windows-1252 fallback");
return windows1252Fallback();
}
+ // When the top result is STRUCTURAL (clean UTF-8/UTF-32/ISO-2022 grammar),
+ // return only that one result. JunkFilter must not re-open Mojibuster's
+ // internal ordering and pick a lower-ranked STATISTICAL CJK candidate
+ // over the STRUCTURAL winner on non-languagey content — that was the 11k
+ // regression root cause. With a single STRUCTURAL result, JunkFilter
+ // still arbitrates when *another* detector disagrees (lying HTML headers),
+ // which is the intended use case.
+ //
+ // When the top result is STATISTICAL, keep the full ranked list so that
+ // JunkFilter can arbitrate within-family ambiguities (e.g. GB18030 vs
+ // x-windows-949: NB scores Chinese higher than Korean on JS-heavy files
+ // because ASCII bigram distributions differ between training corpora, but
+ // JunkFilter's language-quality scoring correctly prefers Korean text).
+ EncodingResult top = finalResults.get(0);
+ List<EncodingResult> toReturn = (top.getResultType() ==
EncodingResult.ResultType.STRUCTURAL)
+ ? List.of(top) : finalResults;
if (LOG.isTraceEnabled()) {
- StringBuilder sb = new StringBuilder();
- for (EncodingResult r : finalResults) {
- if (sb.length() > 0) sb.append(", ");
- sb.append(r.getCharset().name())
- .append("[").append(r.getResultType()).append("]")
- .append("@").append(String.format(Locale.ROOT, "%.2f",
r.getConfidence()));
- }
- LOG.trace("mojibuster exit ({} results) [{}]",
finalResults.size(), sb);
+ LOG.trace("mojibuster exit ({}) {}[{}]@{}",
+ top.getResultType() ==
EncodingResult.ResultType.STRUCTURAL ? "top1" : "full",
+ top.getCharset().name(),
+ top.getResultType(),
+ String.format(Locale.ROOT, "%.2f", top.getConfidence()));
}
- return finalResults;
+ return toReturn;
}
/**
diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
index da87d46b74..7db1c34796 100644
--- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
@@ -239,6 +239,8 @@ public class NaiveBayesBigramEncodingDetector implements EncodingDetector {
*/
private final double[] perClassDequant;
private final int numClasses;
+ // BETA-1 WORKAROUND: GB18030 class index for markup-bigram suppression — see isOffendingAscii.
+ private final int gb18030ClassIdx;
public NaiveBayesBigramEncodingDetector(Path modelPath) throws IOException
{
this(Files.newInputStream(modelPath));
@@ -364,6 +366,16 @@ public class NaiveBayesBigramEncodingDetector implements EncodingDetector {
for (int c = 0; c < numClasses; c++) {
perClassDequant[c] = (double) scale[c] * idfScale;
}
+
+ // Locate GB18030's class index for the markup-bigram suppression.
+ int gb18030Idx = -1;
+ for (int c = 0; c < numClasses; c++) {
+ if ("GB18030".equalsIgnoreCase(labels[c])) {
+ gb18030Idx = c;
+ break;
+ }
+ }
+ this.gb18030ClassIdx = gb18030Idx;
}
}
@@ -379,6 +391,13 @@ public class NaiveBayesBigramEncodingDetector implements EncodingDetector {
|| b == 0x0d || b == 0x20;
}
+ // BETA-1 WORKAROUND: bigrams containing these HTML/JS markup chars are
+ // over-represented in GB18030 training data and cause misclassification.
+ // Suppressed only for GB18030 in scoreClassesAndCount.
+ static boolean isOffendingAscii(int b) {
+ return b == '{' || b == '"' || b == '&' || b == '<' || b == '>';
+ }
+
public List<EncodingResult> detect(byte[] probe) {
ScoreResult sr = scoreClassesAndCount(probe);
if (sr == null) {
@@ -554,9 +573,17 @@ public class NaiveBayesBigramEncodingDetector implements EncodingDetector {
double countTimesIdf = tf * w;
int base = bigram * numClasses;
+ // BETA-1 WORKAROUND: skip this bigram for GB18030 if either byte is
+ // an HTML/JS markup char that inflates GB18030 scores on Latin pages.
+ int bg0 = (bigram >> 8) & 0xFF;
+ int bg1 = bigram & 0xFF;
+ boolean skipGb18030 = gb18030ClassIdx >= 0
+ && (isOffendingAscii(bg0) || isOffendingAscii(bg1));
+
if (!applyCap) {
// Fast path: no cap, just accumulate.
for (int c = 0; c < numClasses; c++) {
+ if (skipGb18030 && c == gb18030ClassIdx) continue;
score[c] += logP8[base + c] * countTimesIdf *
perClassDequant[c];
}
continue;
@@ -602,6 +629,7 @@ public class NaiveBayesBigramEncodingDetector implements EncodingDetector {
double capValue = bestCrossCohort + CAP_PER_BIGRAM_NATS;
boolean clip = max > capValue;
for (int c = 0; c < numClasses; c++) {
+ if (skipGb18030 && c == gb18030ClassIdx) continue;
double v = contributions[c];
if (clip && v > capValue) {
v = capValue;
diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/CjkDecodeValidatorTest.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/CjkDecodeValidatorTest.java
index 14e074212a..7629129ca9 100644
--- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/CjkDecodeValidatorTest.java
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/CjkDecodeValidatorTest.java
@@ -69,6 +69,27 @@ public class CjkDecodeValidatorTest {
assertEquals(-1.0, CjkDecodeValidator.strippedFailureRate(b,
Charset.forName("GB18030")));
}
+ /**
+ * Pure UTF-8 file (zero legacy CJK bytes, many UTF-8 multi-byte sequences):
+ * strippedFailureRate must return 1.0 so the CJK veto fires for all CJK charsets.
+ * This covers the regression where Shift_JIS / Big5-HKSCS / GB18030 were wrongly
+ * chosen over UTF-8 STRUCTURAL for pure-UTF-8 Latin/Cyrillic/etc. files.
+ */
+ @Test
+ public void pureUtf8ReturnsCjkVeto() throws Exception {
+ // Croatian text encoded as UTF-8 — all high bytes are valid UTF-8 sequences,
+ // none are legacy CJK lead bytes.
+ byte[] b = ("Ovo je čist UTF-8 tekst s hrvatskim slovima: "
+ + "čćžšđ ČĆŽŠĐ. Ponavljamo dovoljno puta da premašimo prag od
"
+ + "trideset UTF-8 sekvenci: šššššššššš čččččččččč đđđđđđđđđđ.")
+ .getBytes("UTF-8");
+ for (String cs : new String[]{"Shift_JIS", "Big5-HKSCS", "GB18030",
"EUC-JP"}) {
+ double rate = CjkDecodeValidator.strippedFailureRate(b,
Charset.forName(cs));
+ assertEquals(1.0, rate, 0.0,
+ "pure UTF-8 must return 1.0 (veto) for " + cs + ", got " +
rate);
+ }
+ }
+
@Test
public void appliesToLegacyCjkButNotIso2022OrLatin() {
assertTrue(CjkDecodeValidator.appliesTo("GB18030"));