Re: [PR] TIKA-4731 - improve charset detection and junk detection [tika]

via GitHub Wed, 27 May 2026 05:40:02 -0700


tballison commented on code in PR #2839:
URL: https://github.com/apache/tika/pull/2839#discussion_r3310816762



##########
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java:
##########
@@ -211,62 +268,262 @@ public List<EncodingResult> detect(TikaInputStream tis, 
Metadata metadata,
         return detect(readProbe(tis));
     }
 
+    /** ASCII whitespace: TAB, LF, VT, FF, CR, SPACE. */
+    private static boolean isWhitespace(int b) {
+        return b == 0x09 || b == 0x0a || b == 0x0b || b == 0x0c
+                || b == 0x0d || b == 0x20;
+    }
+
     public List<EncodingResult> detect(byte[] probe) {
-        if (probe == null || probe.length < 2) {
+        ScoreResult sr = scoreClassesAndCount(probe);
+        if (sr == null) {
             return Collections.emptyList();
         }
-        int len = Math.min(probe.length, MAX_PROBE_BYTES);
+        return emitCandidates(sr.scores, sr.scoredBigrams);
+    }
+
+    /**
+     * Score result returned by {@link #scoreClassesAndCount(byte[])}.
+     * Exposes the raw per-class score vector together with the number
+     * of bigrams that actually contributed to the dot product (i.e.,
+     * bigrams with non-zero IDF and not skipped by the whitespace-pair
+     * rule) and the total bigrams in the scored region of the probe.
+     * {@code scoredBigrams} is the unit of "evidence available to NB"
+     * — robust to HTML / whitespace noise in the input because those
+     * bigrams have IDF == 0 and don't contribute.
+     */
+    public static final class ScoreResult {
+        public final double[] scores;
+        public final int scoredBigrams;
+        public final int totalBigrams;
+        public ScoreResult(double[] scores, int scoredBigrams, int 
totalBigrams) {
+            this.scores = scores;
+            this.scoredBigrams = scoredBigrams;
+            this.totalBigrams = totalBigrams;
+        }
+    }
+
+    /**
+     * Compute the raw per-class score vector for a probe, without
+     * top-K extraction or softmax.  Returns {@code null} for null /
+     * tiny probes that can't be scored.
+     */
+    public double[] scoreClasses(byte[] probe) {
+        ScoreResult sr = scoreClassesAndCount(probe);
+        return sr == null ? null : sr.scores;
+    }
+
+    /**
+     * Per-bigram contribution to the per-class score, used for
+     * diagnostic tools that want to understand why a probe scores
+     * one class over another.  Returned by
+     * {@link #analyzeBigrams(byte[], int, int)}.
+     */
+    public static final class BigramContrib {
+        public final int bigram;       // (b0 << 8) | b1
+        public final double contribA;  // logP_A * idf in nats
+        public final double contribB;
+        public BigramContrib(int bigram, double a, double b) {
+            this.bigram = bigram;
+            this.contribA = a;
+            this.contribB = b;
+        }
+        public double diff() {
+            return contribA - contribB;
+        }
+    }
 
-        // Integer hot loop — CharSoup-style.  int8 logP × int8 IDF →
-        // int16 product, accumulated into int32 per class.  Overflow
-        // safety: at MAX_PROBE_BYTES=1024, max 1023 bigrams × 127 × 127
-        // ≈ 16.5M per class, well inside int32's 2.1B headroom.
-        int[] dots = new int[numClasses];
+    /**
+     * For each scored bigram in the probe (same skip rules as
+     * {@link #scoreClasses(byte[])}), compute and return its
+     * dequantized contribution to two specified classes' scores.
+     * The list is in probe order, with duplicates allowed (a bigram
+     * that appears N times in the probe yields N entries).
+     */
+    public List<BigramContrib> analyzeBigrams(byte[] probe, int classA, int 
classB) {
+        List<BigramContrib> out = new java.util.ArrayList<>();
+        if (probe == null || probe.length < 2) {
+            return out;
+        }
+        int len = Math.min(probe.length, MAX_PROBE_BYTES);
+        // perClassDequant[c] folds scale[c] × idfScale already, so
+        // contribution(bigram, c) = logP8[..c] * idf8[bigram] * 
perClassDequant[c]
+        double dqA = perClassDequant[classA];
+        double dqB = perClassDequant[classB];
         for (int i = 0; i + 1 < len; i++) {
-            int bigram = ((probe[i] & 0xFF) << 8) | (probe[i + 1] & 0xFF);
-            int w = idf8[bigram];  // non-negative, 0..127
+            int b0 = probe[i] & 0xFF;
+            int b1 = probe[i + 1] & 0xFF;
+            if (isWhitespace(b0) && isWhitespace(b1)) {
+                continue;
+            }
+            int bigram = (b0 << 8) | b1;
+            int w = idf8[bigram];
             if (w == 0) {
-                continue; // bigram appears in every class; no signal
+                continue;
             }
             int base = bigram * numClasses;
-            for (int c = 0; c < numClasses; c++) {
-                dots[c] += logP8[base + c] * w;
+            double contribA = logP8[base + classA] * w * dqA;
+            double contribB = logP8[base + classB] * w * dqB;
+            out.add(new BigramContrib(bigram, contribA, contribB));
+        }
+        return out;
+    }
+
+    /**
+     * Like {@link #scoreClasses(byte[])} but also reports the number
+     * of bigrams that contributed to the dot product vs the total
+     * scored region.  Used by offline calibration to bucket samples
+     * by "evidence available" rather than raw byte length.
+     */
+    public ScoreResult scoreClassesAndCount(byte[] probe) {
+        if (probe == null || probe.length < 2) {
+            return null;
+        }
+        int len = Math.min(probe.length, MAX_PROBE_BYTES);
+
+        // Pass 1: count distinct bigrams.  Whitespace and zero-IDF
+        // bigrams are skipped as in the original hot loop.  short[] is
+        // enough since count fits in 16383 (max possible).  Track the
+        // ids of distinct bigrams in a parallel array so pass 2 doesn't
+        // need to scan the full 65k space.
+        short[] count = new short[BIGRAM_SPACE];
+        int[] distinctBigrams = new int[len];
+        int distinctIdx = 0;
+        int scored = 0;
+        int total = 0;
+        for (int i = 0; i + 1 < len; i++) {
+            int b0 = probe[i] & 0xFF;
+            int b1 = probe[i + 1] & 0xFF;
+            total++;
+            if (isWhitespace(b0) && isWhitespace(b1)) {
+                continue;
+            }
+            int bigram = (b0 << 8) | b1;
+            int w = idf8[bigram];
+            if (w == 0) {
+                continue;
             }
+            scored++;
+            if (count[bigram] == 0) {
+                distinctBigrams[distinctIdx++] = bigram;
+            }
+            count[bigram]++;
+        }

Review Comment:
   Yes, this needs optimization: use a sparse table. It needs to be 
thread-safe, and I do not like thread-locals.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] TIKA-4731 - improve charset detection and junk detection [tika]

Reply via email to