[
https://issues.apache.org/jira/browse/TIKA-4731?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=18083830#comment-18083830
]
ASF GitHub Bot commented on TIKA-4731:
--------------------------------------
tballison commented on code in PR #2839:
URL: https://github.com/apache/tika/pull/2839#discussion_r3310816762
##########
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java:
##########
@@ -211,62 +268,262 @@ public List<EncodingResult> detect(TikaInputStream tis,
Metadata metadata,
return detect(readProbe(tis));
}
+ /** ASCII whitespace: TAB, LF, VT, FF, CR, SPACE. */
+ private static boolean isWhitespace(int b) {
+ return b == 0x09 || b == 0x0a || b == 0x0b || b == 0x0c
+ || b == 0x0d || b == 0x20;
+ }
+
public List<EncodingResult> detect(byte[] probe) {
- if (probe == null || probe.length < 2) {
+ ScoreResult sr = scoreClassesAndCount(probe);
+ if (sr == null) {
return Collections.emptyList();
}
- int len = Math.min(probe.length, MAX_PROBE_BYTES);
+ return emitCandidates(sr.scores, sr.scoredBigrams);
+ }
+
+ /**
+ * Score result returned by {@link #scoreClassesAndCount(byte[])}.
+ * Exposes the raw per-class score vector together with the number
+ * of bigrams that actually contributed to the dot product (i.e.,
+ * bigrams with non-zero IDF and not skipped by the whitespace-pair
+ * rule) and the total bigrams in the scored region of the probe.
+ * {@code scoredBigrams} is the unit of "evidence available to NB"
+ * — robust to HTML / whitespace noise in the input because those
+ * bigrams have IDF == 0 and don't contribute.
+ */
+ public static final class ScoreResult {
+ public final double[] scores;
+ public final int scoredBigrams;
+ public final int totalBigrams;
+ public ScoreResult(double[] scores, int scoredBigrams, int
totalBigrams) {
+ this.scores = scores;
+ this.scoredBigrams = scoredBigrams;
+ this.totalBigrams = totalBigrams;
+ }
+ }
+
+ /**
+ * Compute the raw per-class score vector for a probe, without
+ * top-K extraction or softmax. Returns {@code null} for null /
+ * tiny probes that can't be scored.
+ */
+ public double[] scoreClasses(byte[] probe) {
+ ScoreResult sr = scoreClassesAndCount(probe);
+ return sr == null ? null : sr.scores;
+ }
+
+ /**
+ * Per-bigram contribution to the per-class score, used for
+ * diagnostic tools that want to understand why a probe scores
+ * one class over another. Returned by
+ * {@link #analyzeBigrams(byte[], int, int)}.
+ */
+ public static final class BigramContrib {
+ public final int bigram; // (b0 << 8) | b1
+ public final double contribA; // logP_A * idf in nats
+ public final double contribB;
+ public BigramContrib(int bigram, double a, double b) {
+ this.bigram = bigram;
+ this.contribA = a;
+ this.contribB = b;
+ }
+ public double diff() {
+ return contribA - contribB;
+ }
+ }
- // Integer hot loop — CharSoup-style. int8 logP × int8 IDF →
- // int16 product, accumulated into int32 per class. Overflow
- // safety: at MAX_PROBE_BYTES=1024, max 1023 bigrams × 127 × 127
- // ≈ 16.5M per class, well inside int32's 2.1B headroom.
- int[] dots = new int[numClasses];
+ /**
+ * For each scored bigram in the probe (same skip rules as
+ * {@link #scoreClasses(byte[])}), compute and return its
+ * dequantized contribution to two specified classes' scores.
+ * The list is in probe order, with duplicates allowed (a bigram
+ * that appears N times in the probe yields N entries).
+ */
+ public List<BigramContrib> analyzeBigrams(byte[] probe, int classA, int
classB) {
+ List<BigramContrib> out = new java.util.ArrayList<>();
+ if (probe == null || probe.length < 2) {
+ return out;
+ }
+ int len = Math.min(probe.length, MAX_PROBE_BYTES);
+ // perClassDequant[c] folds scale[c] × idfScale already, so
+ // contribution(bigram, c) = logP8[..c] * idf8[bigram] *
perClassDequant[c]
+ double dqA = perClassDequant[classA];
+ double dqB = perClassDequant[classB];
for (int i = 0; i + 1 < len; i++) {
- int bigram = ((probe[i] & 0xFF) << 8) | (probe[i + 1] & 0xFF);
- int w = idf8[bigram]; // non-negative, 0..127
+ int b0 = probe[i] & 0xFF;
+ int b1 = probe[i + 1] & 0xFF;
+ if (isWhitespace(b0) && isWhitespace(b1)) {
+ continue;
+ }
+ int bigram = (b0 << 8) | b1;
+ int w = idf8[bigram];
if (w == 0) {
- continue; // bigram appears in every class; no signal
+ continue;
}
int base = bigram * numClasses;
- for (int c = 0; c < numClasses; c++) {
- dots[c] += logP8[base + c] * w;
+ double contribA = logP8[base + classA] * w * dqA;
+ double contribB = logP8[base + classB] * w * dqB;
+ out.add(new BigramContrib(bigram, contribA, contribB));
+ }
+ return out;
+ }
+
+ /**
+ * Like {@link #scoreClasses(byte[])} but also reports the number
+ * of bigrams that contributed to the dot product vs the total
+ * scored region. Used by offline calibration to bucket samples
+ * by "evidence available" rather than raw byte length.
+ */
+ public ScoreResult scoreClassesAndCount(byte[] probe) {
+ if (probe == null || probe.length < 2) {
+ return null;
+ }
+ int len = Math.min(probe.length, MAX_PROBE_BYTES);
+
+ // Pass 1: count distinct bigrams. Whitespace and zero-IDF
+ // bigrams are skipped as in the original hot loop. short[] is
+ // enough since count fits in 16383 (max possible). Track the
+ // ids of distinct bigrams in a parallel array so pass 2 doesn't
+ // need to scan the full 65k space.
+ short[] count = new short[BIGRAM_SPACE];
+ int[] distinctBigrams = new int[len];
+ int distinctIdx = 0;
+ int scored = 0;
+ int total = 0;
+ for (int i = 0; i + 1 < len; i++) {
+ int b0 = probe[i] & 0xFF;
+ int b1 = probe[i + 1] & 0xFF;
+ total++;
+ if (isWhitespace(b0) && isWhitespace(b1)) {
+ continue;
+ }
+ int bigram = (b0 << 8) | b1;
+ int w = idf8[bigram];
+ if (w == 0) {
+ continue;
}
+ scored++;
+ if (count[bigram] == 0) {
+ distinctBigrams[distinctIdx++] = bigram;
+ }
+ count[bigram]++;
+ }
Review Comment:
Yes, we need optimization here: use a sparse table. It needs to be thread-safe,
and I do not like ThreadLocal.
> Ongoing improvements to the junk detector
> -----------------------------------------
>
> Key: TIKA-4731
> URL: https://issues.apache.org/jira/browse/TIKA-4731
> Project: Tika
> Issue Type: Task
> Reporter: Tim Allison
> Priority: Minor
>
> With [https://github.com/apache/tika/pull/2818], I think we have a decent
> shape for the junk detector.
> There are several areas for improvement, but I think it is ready to go.
> This ticket tracks follow-on work, including:
> * Smaller model
> * Handling pathological code block changes
> * Handling candidates with different character counts
> * Other items to be discovered in our commoncrawl/govdocs1 corpus?
> We have some coverage for the middle two items, but need to address those more
> directly.
> This work is not a blocker on the 4.0.0-beta-1 release.
--
This message was sent by Atlassian Jira
(v8.20.10#820010)