(tika) branch main updated: TIKA-4745 - small twiddle on charset detection (#2886)

tallison Tue, 09 Jun 2026 12:48:22 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git



The following commit(s) were added to refs/heads/main by this push:
     new 9eff319635 TIKA-4745 - small twiddle on charset detection (#2886)
9eff319635 is described below

commit 9eff3196354b3338e9298b502d564e12a508e367
Author: Tim Allison <[email protected]>
AuthorDate: Tue Jun 9 21:47:58 2026 +0200

    TIKA-4745 - small twiddle on charset detection (#2886)
---
 .../tika/ml/chardetect/CjkDecodeValidator.java     | 21 +++++++++++++--
 .../ml/chardetect/MojibusterEncodingDetector.java  | 31 +++++++++++++++-------
 .../NaiveBayesBigramEncodingDetector.java          | 28 +++++++++++++++++++
 .../tika/ml/chardetect/CjkDecodeValidatorTest.java | 21 +++++++++++++++
 4 files changed, 90 insertions(+), 11 deletions(-)

diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CjkDecodeValidator.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CjkDecodeValidator.java
index cf4e4e6554..00f44dcee5 100644
--- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CjkDecodeValidator.java
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CjkDecodeValidator.java
@@ -64,8 +64,18 @@ public final class CjkDecodeValidator {
      * Failure rate of {@code bytes} under {@code cjkCharset}'s vendor 
superset,
      * counting only legacy high bytes (embedded UTF-8 is skipped, not 
counted).
      *
-     * @return failures / legacy-high-bytes, or {@code -1.0} when there is too
-     *         little legacy evidence (legacy high bytes &lt; {@link #MIN_HIGH_BYTES})
+     * <p>Special case: if every high byte is a valid UTF-8 sequence (i.e.,
+     * {@code nHigh == 0}) and there are at least {@link #MIN_HIGH_BYTES} UTF-8
+     * multi-byte sequences, the probe is pure UTF-8 — no legacy CJK content at
+     * all.  In that case {@code 1.0} is returned to trigger the CJK veto.
+     * Real legacy CJK encodings (Shift_JIS, Big5, EUC-JP, GB18030 …) always
+     * have lead bytes in 0x81–0x9F or 0xF5–0xFF that are not valid UTF-8 starts,
+     * so {@code nHigh > 0} for any genuine CJK document.
+     *
+     * @return failures / legacy-high-bytes, {@code 1.0} when the probe is pure
+     *         UTF-8 (nHigh==0, nUTF8seqs&ge;{@link #MIN_HIGH_BYTES}), or
+     *         {@code -1.0} when there is too little evidence either way
+     *         (legacy high bytes &lt; {@link #MIN_HIGH_BYTES} and not pure UTF-8)
      */
     public static double strippedFailureRate(byte[] bytes, Charset cjkCharset) 
{
         Charset decodeAs = CharsetSupersets.decodeAs(cjkCharset);
@@ -77,6 +87,7 @@ public final class CjkDecodeValidator {
         int n = bytes.length;
         int fail = 0;
         int nHigh = 0;
+        int nUtf8Seqs = 0;
         while (i < n) {
             int x = bytes[i] & 0xFF;
             if (x < 0x80) {
@@ -85,6 +96,7 @@ public final class CjkDecodeValidator {
             }
             int ulen = utf8SequenceLength(bytes, i);
             if (ulen > 0) {
+                nUtf8Seqs++;
                 i += ulen; // embedded UTF-8 — not legacy content, skip
                 continue;
             }
@@ -102,6 +114,11 @@ public final class CjkDecodeValidator {
             }
         }
         if (nHigh < MIN_HIGH_BYTES) {
+            // Pure UTF-8: no legacy high bytes at all but enough UTF-8 sequences
+            // to be confident.  Return 1.0 so the CJK veto fires.
+            if (nHigh == 0 && nUtf8Seqs >= MIN_HIGH_BYTES) {
+                return 1.0;
+            }
             return -1.0;
         }
         return (double) fail / nHigh;
diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
index e225366091..3d48b595d0 100644
--- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
@@ -427,17 +427,30 @@ public class MojibusterEncodingDetector implements 
EncodingDetector {
             LOG.trace("mojibuster pool empty -> windows-1252 fallback");
             return windows1252Fallback();
         }
+        // When the top result is STRUCTURAL (clean UTF-8/UTF-32/ISO-2022 grammar),
+        // return only that one result.  JunkFilter must not re-open Mojibuster's
+        // internal ordering and pick a lower-ranked STATISTICAL CJK candidate
+        // over the STRUCTURAL winner on non-languagey content — that was the 11k
+        // regression root cause.  With a single STRUCTURAL result, JunkFilter
+        // still arbitrates when *another* detector disagrees (lying HTML headers),
+        // which is the intended use case.
+        //
+        // When the top result is STATISTICAL, keep the full ranked list so that
+        // JunkFilter can arbitrate within-family ambiguities (e.g. GB18030 vs
+        // x-windows-949: NB scores Chinese higher than Korean on JS-heavy files
+        // because ASCII bigram distributions differ between training corpora, but
+        // JunkFilter's language-quality scoring correctly prefers Korean text).
+        EncodingResult top = finalResults.get(0);
+        List<EncodingResult> toReturn = (top.getResultType() == 
EncodingResult.ResultType.STRUCTURAL)
+                ? List.of(top) : finalResults;
         if (LOG.isTraceEnabled()) {
-            StringBuilder sb = new StringBuilder();
-            for (EncodingResult r : finalResults) {
-                if (sb.length() > 0) sb.append(", ");
-                sb.append(r.getCharset().name())
-                  .append("[").append(r.getResultType()).append("]")
-                  .append("@").append(String.format(Locale.ROOT, "%.2f", 
r.getConfidence()));
-            }
-            LOG.trace("mojibuster exit ({} results) [{}]", 
finalResults.size(), sb);
+            LOG.trace("mojibuster exit ({}) {}[{}]@{}",
+                    top.getResultType() == 
EncodingResult.ResultType.STRUCTURAL ? "top1" : "full",
+                    top.getCharset().name(),
+                    top.getResultType(),
+                    String.format(Locale.ROOT, "%.2f", top.getConfidence()));
         }
-        return finalResults;
+        return toReturn;
     }
 
     /**
diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
index da87d46b74..7db1c34796 100644
--- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
@@ -239,6 +239,8 @@ public class NaiveBayesBigramEncodingDetector implements 
EncodingDetector {
      */
     private final double[] perClassDequant;
     private final int numClasses;
+    // BETA-1 WORKAROUND: GB18030 class index for markup-bigram suppression — see isOffendingAscii.
+    private final int gb18030ClassIdx;
 
     public NaiveBayesBigramEncodingDetector(Path modelPath) throws IOException 
{
         this(Files.newInputStream(modelPath));
@@ -364,6 +366,16 @@ public class NaiveBayesBigramEncodingDetector implements 
EncodingDetector {
             for (int c = 0; c < numClasses; c++) {
                 perClassDequant[c] = (double) scale[c] * idfScale;
             }
+
+            // Locate GB18030's class index for the markup-bigram suppression.
+            int gb18030Idx = -1;
+            for (int c = 0; c < numClasses; c++) {
+                if ("GB18030".equalsIgnoreCase(labels[c])) {
+                    gb18030Idx = c;
+                    break;
+                }
+            }
+            this.gb18030ClassIdx = gb18030Idx;
         }
     }
 
@@ -379,6 +391,13 @@ public class NaiveBayesBigramEncodingDetector implements 
EncodingDetector {
                 || b == 0x0d || b == 0x20;
     }
 
+    // BETA-1 WORKAROUND: bigrams containing these HTML/JS markup chars are
+    // over-represented in GB18030 training data and cause misclassification.
+    // Suppressed only for GB18030 in scoreClassesAndCount.
+    static boolean isOffendingAscii(int b) {
+        return b == '{' || b == '"' || b == '&' || b == '<' || b == '>';
+    }
+
     public List<EncodingResult> detect(byte[] probe) {
         ScoreResult sr = scoreClassesAndCount(probe);
         if (sr == null) {
@@ -554,9 +573,17 @@ public class NaiveBayesBigramEncodingDetector implements 
EncodingDetector {
             double countTimesIdf = tf * w;
             int base = bigram * numClasses;
 
+            // BETA-1 WORKAROUND: skip this bigram for GB18030 if either byte is
+            // an HTML/JS markup char that inflates GB18030 scores on Latin pages.
+            int bg0 = (bigram >> 8) & 0xFF;
+            int bg1 = bigram & 0xFF;
+            boolean skipGb18030 = gb18030ClassIdx >= 0
+                    && (isOffendingAscii(bg0) || isOffendingAscii(bg1));
+
             if (!applyCap) {
                 // Fast path: no cap, just accumulate.
                 for (int c = 0; c < numClasses; c++) {
+                    if (skipGb18030 && c == gb18030ClassIdx) continue;
                     score[c] += logP8[base + c] * countTimesIdf * 
perClassDequant[c];
                 }
                 continue;
@@ -602,6 +629,7 @@ public class NaiveBayesBigramEncodingDetector implements 
EncodingDetector {
             double capValue = bestCrossCohort + CAP_PER_BIGRAM_NATS;
             boolean clip = max > capValue;
             for (int c = 0; c < numClasses; c++) {
+                if (skipGb18030 && c == gb18030ClassIdx) continue;
                 double v = contributions[c];
                 if (clip && v > capValue) {
                     v = capValue;
diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/CjkDecodeValidatorTest.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/CjkDecodeValidatorTest.java
index 14e074212a..7629129ca9 100644
--- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/CjkDecodeValidatorTest.java
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/CjkDecodeValidatorTest.java
@@ -69,6 +69,27 @@ public class CjkDecodeValidatorTest {
         assertEquals(-1.0, CjkDecodeValidator.strippedFailureRate(b, 
Charset.forName("GB18030")));
     }
 
+    /**
+     * Pure UTF-8 file (zero legacy CJK bytes, many UTF-8 multi-byte sequences):
+     * strippedFailureRate must return 1.0 so the CJK veto fires for all CJK charsets.
+     * This covers the regression where Shift_JIS / Big5-HKSCS / GB18030 were wrongly
+     * chosen over UTF-8 STRUCTURAL for pure-UTF-8 Latin/Cyrillic/etc. files.
+     */
+    @Test
+    public void pureUtf8ReturnsCjkVeto() throws Exception {
+        // Croatian text encoded as UTF-8 — all high bytes are valid UTF-8 sequences,
+        // none are legacy CJK lead bytes.
+        byte[] b = ("Ovo je čist UTF-8 tekst s hrvatskim slovima: "
+                + "čćžšđ ČĆŽŠĐ.  Ponavljamo dovoljno puta da premašimo prag od 
"
+                + "trideset UTF-8 sekvenci: šššššššššš čččččččččč đđđđđđđđđđ.")
+                .getBytes("UTF-8");
+        for (String cs : new String[]{"Shift_JIS", "Big5-HKSCS", "GB18030", 
"EUC-JP"}) {
+            double rate = CjkDecodeValidator.strippedFailureRate(b, 
Charset.forName(cs));
+            assertEquals(1.0, rate, 0.0,
+                    "pure UTF-8 must return 1.0 (veto) for " + cs + ", got " + 
rate);
+        }
+    }
+
     @Test
     public void appliesToLegacyCjkButNotIso2022OrLatin() {
         assertTrue(CjkDecodeValidator.appliesTo("GB18030"));

(tika) branch main updated: TIKA-4745 - small twiddle on charset detection (#2886)

Reply via email to