This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4744-nb-cohort-cap in repository https://gitbox.apache.org/repos/asf/tika.git
commit 4969062d15b508dc69e7a910a23d0d142caa5bfe Author: tallison <[email protected]> AuthorDate: Thu May 28 14:34:40 2026 -0400 add cohort-specific caps --- .skills/tika-eval-encoding-regression.md | 171 +++++++++++++++++++++ .../NaiveBayesBigramEncodingDetector.java | 108 ++++++++++--- 2 files changed, 256 insertions(+), 23 deletions(-) diff --git a/.skills/tika-eval-encoding-regression.md b/.skills/tika-eval-encoding-regression.md new file mode 100644 index 0000000000..148acd05df --- /dev/null +++ b/.skills/tika-eval-encoding-regression.md @@ -0,0 +1,171 @@ +# tika-eval for encoding-detector regression hunts + +A condensed pattern for finding SBCS→CJK style charset-detector regressions +(or any "A picks encoding X, B picks encoding Y" question) without +building two tika-app distributions. + +## Two configs, one build + +Encoding-detector experiments don't need a "before" and "after" tika-app — +the chain composition is per-config. Run the SAME tika-app twice against +two configs, treat the outputs as `-a` and `-b`. Much faster than +`tika-eval-compare`'s two-build flow. 
+ +```bash +# build once +./mvnw clean install -pl tika-app -am -Pfast -DskipTests \ + -Dmaven.repo.local=$(pwd)/.local_m2_repo +unzip -q tika-app/target/tika-app-*.zip -d /tmp/tika-app-current + +# two configs (any combination of detectors) +java -jar /tmp/tika-app-current/tika-app-*.jar \ + --config=tika-config-3x-default.json \ + -i <corpus> -o ~/data/extracts/A -n 6 +java -jar /tmp/tika-app-current/tika-app-*.jar \ + --config=tika-config-junkfilter-combiner.json \ + -i <corpus> -o ~/data/extracts/B -n 6 + +# normal Compare (assumes tika-eval-app is already unpacked at /tmp/tika-eval-current; the steps above only unpack tika-app) +java -jar /tmp/tika-eval-current/tika-eval-app-*.jar Compare \ + -a ~/data/extracts/A -b ~/data/extracts/B -d ~/data/extracts/A-vs-B -r -rd ~/data/extracts/A-vs-B-reports +``` + +### Canonical 3.x-default encoding chain config + +```json +{ + "encoding-detectors": [ + {"html-encoding-detector": {}}, + {"universal-encoding-detector": {}}, + {"icu4j-encoding-detector": {}} + ] +} +``` + +Existing copy: `~/data/claude-work/tika-config-3x-default.json`. + +### Canonical 4.x junkfilter chain config + +```json +{ + "encoding-detectors": [ + {"bom-detector": {}}, + {"html-encoding-detector": {}}, + {"mojibuster-encoding-detector": {}}, + {"junk-filter-encoding-detector": {}} + ] +} +``` + +Existing copy: `~/data/smoke/eval-runtime/tika-config-junkfilter-combiner.json`. + +### Per-detector isolation configs + +Each detector wired alone lives in `~/data/commoncrawl/cc-html-eval/configs/`: +`tika-config-bom.json`, `tika-config-html.json`, `tika-config-htmlstandard.json`, +`tika-config-universal.json`, `tika-config-icu4j.json`, +`tika-config-mojibuster.json`, `tika-config-junkfilter-chain.json`. +Use these for chain-attribution work (which detector did the detection). + +## Encoding-pair flip query + +`MIMES.MIME_STRING` for text-y mimes takes the form `text/html; charset=X`. Extract +the charset with `REGEXP_REPLACE`, group by `(enc_a, enc_b)`, and filter pairs. +A=before/`-a`, B=after/`-b`; join on `pa.ID = pb.ID` (paired by id). 
+ +```sql +SELECT + REGEXP_REPLACE(ma.MIME_STRING, '^.*charset=', '') AS enc_a, + REGEXP_REPLACE(mb.MIME_STRING, '^.*charset=', '') AS enc_b, + COUNT(*) n, + SUM(cb.NUM_COMMON_TOKENS - ca.NUM_COMMON_TOKENS) AS delta_common +FROM PROFILES_A pa +JOIN PROFILES_B pb ON pa.ID = pb.ID +JOIN MIMES ma ON pa.MIME_ID = ma.MIME_ID +JOIN MIMES mb ON pb.MIME_ID = mb.MIME_ID +JOIN CONTENTS_A ca ON ca.ID = pa.ID +JOIN CONTENTS_B cb ON cb.ID = pb.ID +WHERE ma.MIME_STRING LIKE '%charset=%' AND mb.MIME_STRING LIKE '%charset=%' + AND REGEXP_REPLACE(ma.MIME_STRING, '^.*charset=', '') <> + REGEXP_REPLACE(mb.MIME_STRING, '^.*charset=', '') +GROUP BY enc_a, enc_b +ORDER BY n DESC, delta_common ASC LIMIT 50; +``` + +Add an `IN (...)` filter on either side to constrain to a family +(e.g. SBCS-Western → CJK): + +```sql + AND REGEXP_REPLACE(ma.MIME_STRING,'^.*charset=','') + IN ('windows-1252','ISO-8859-1','ISO-8859-15','ISO-8859-2','ISO-8859-3', + 'windows-1250','windows-1254','windows-1257','ISO-8859-13', + 'windows-1258','x-MacRoman','IBM850','IBM852') + AND REGEXP_REPLACE(mb.MIME_STRING,'^.*charset=','') + IN ('GB18030','GBK','GB2312','Big5','Big5-HKSCS','Shift_JIS','EUC-JP', + 'EUC-KR','x-EUC-TW','x-windows-874','x-windows-949', + 'ISO-2022-JP','ISO-2022-KR','ISO-2022-CN') +``` + +### Per-file drilldown + +Join `CONTAINERS` to get the source path; pull `LANG_ID_1` from both sides +to see whether language detection agrees the content is Western while the +charset has flipped to CJK (the regression's defining shape): + +```sql +SELECT ct.FILE_PATH, + REGEXP_REPLACE(ma.MIME_STRING,'^.*charset=','') AS enc_a, + REGEXP_REPLACE(mb.MIME_STRING,'^.*charset=','') AS enc_b, + ca.NUM_COMMON_TOKENS AS ca_tok, cb.NUM_COMMON_TOKENS AS cb_tok, + cb.NUM_COMMON_TOKENS - ca.NUM_COMMON_TOKENS AS delta, + ca.LANG_ID_1 AS lang_a, cb.LANG_ID_1 AS lang_b +FROM PROFILES_A pa JOIN PROFILES_B pb ON pa.ID = pb.ID +JOIN MIMES ma ON pa.MIME_ID = ma.MIME_ID JOIN MIMES mb ON pb.MIME_ID = mb.MIME_ID +JOIN CONTENTS_A 
ca ON ca.ID = pa.ID JOIN CONTENTS_B cb ON cb.ID = pb.ID +JOIN CONTAINERS ct ON ct.CONTAINER_ID = pa.CONTAINER_ID +WHERE <enc_a/enc_b filter as above> +ORDER BY delta ASC LIMIT 15; +``` + +## Per-file detector attribution (`X-TIKA:encodingDetectionTrace`) + +Every JSON extract from a chain with multiple detectors carries +`X-TIKA:encodingDetectionTrace` in metadata. It's a per-detector emission +log with the META detector's arbitration tag at the end: + +``` +MojibusterEncodingDetector->Shift_JIS[STATISTICAL](1.00) [junk-filter-selected] +``` + +When investigating "why did B pick X for this file?", read this trace first +— it tells you which base detector(s) emitted candidates and which one the +meta detector chose. If the trace shows ONLY Mojibuster firing with a CJK +pick, the bug is in Mojibuster's emission (pool too narrow), not in +JunkFilter's arbitration. + +`X-TIKA:encodingDetector` is the simple-name credit string; +`X-TIKA:detectedEncoding` is the final answer (also in `Content-Encoding`). + +## Reproducing a single-file detection without a full chain + +```bash +./mvnw -q -pl tika-ml/tika-ml-junkdetect -Dmaven.repo.local=$(pwd)/.local_m2_repo \ + -Dexec.classpathScope=test \ + -Dexec.mainClass=org.apache.tika.ml.junkdetect.TraceJunkFilter \ + -Dexec.args="--file <path> --auto-candidates --content-cleaner --head-bytes 524288 --sample 120" \ + exec:java +``` + +Key flags: +- `--auto-candidates` — use Mojibuster's per-file pool as the candidate set +- `--content-cleaner` — decode each candidate then run text through + `HtmlContentCleaner` to match the live chain +- `--head-bytes 524288` — read up to 512 KB raw to match + `AdaptiveProbe.DEFAULT_RAW_CAP`. The default `READ_LIMIT` of 16 KB will + give a *different* probe than the live chain on long markup-heavy pages + and lead you to disagree with the live chain's pick. Always pass this + when reconciling a TraceJunkFilter run with a live extract. 
+ +Without `--head-bytes`, you are looking at a different probe than the +chain saw — this is the most common source of "trace says X, chain +says Y" confusion. diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java index 2460656f0c..5e87150fb6 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java @@ -26,7 +26,9 @@ import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.List; +import java.util.Map; import org.apache.commons.io.IOUtils; @@ -104,23 +106,15 @@ public class NaiveBayesBigramEncodingDetector implements EncodingDetector { public static final double MARGIN_THRESHOLD_NATS_PER_BIGRAM = 0.20; /** - * Per-bigram cross-class total-contribution cap (Type C clipping). - * For each distinct bigram in the probe, the top-scoring class's - * total contribution (count × logP × idf, after dequantization) is - * capped at the runner-up class's contribution + this many nats. - * - * <p>Defends against corpus-skew pathologies where one class - * accumulates extreme bigram mass that swings classification on - * one or two byte-pairs alone (e.g., Czech "ČR" digraph in - * ISO-8859-2 contributing +186 nats over win-1252 on Italian text). 
- * Length-invariant by construction: the cap is on per-bigram - * advantage, regardless of how many times the bigram appears.</p> - * - * <p>20 nats = e^20 ≈ 5×10^8 probability-ratio advantage per - * bigram — preserves legitimate CJK-vs-Latin and other cross-script - * signal while bounding the diffuse-corpus-skew tail.</p> + * Per-distinct-bigram cap: top-scoring class's contribution is + * clipped to the best <em>cross-cohort</em> class's contribution + + * this many nats. Bounds both single-bigram corpus skew and the + * diffuse coverage asymmetry where broad-vocab cohorts (CJK, + * EBCDIC) collectively swamp narrow-vocab cohorts (LATIN) on + * rare-ASCII bigrams that fall to the unseen floor in the narrow + * cohort. See {@link Cohort}. */ - public static final double CAP_PER_BIGRAM_NATS = 20.0; + public static final double CAP_PER_BIGRAM_NATS = 10.0; /** * Minimum distinct bigrams required before the per-bigram cap @@ -149,9 +143,60 @@ public class NaiveBayesBigramEncodingDetector implements EncodingDetector { */ public static final int MIN_BIGRAMS_FOR_DIVERSITY_GATE = 100; + /** + * Script / writing-system family used by {@link #CAP_PER_BIGRAM_NATS}. + * UTF-8 stands alone so the cap engages on UTF-vs-anything pairs + * (UTF-8 misread as win-1252 or as GBK). + */ + public enum Cohort { + LATIN, CJK, CYRILLIC, GREEK, HEBREW, ARABIC, THAI, EBCDIC, UTF + } + + /** + * Class label → cohort. Must cover every NB-model label; load + * fails fast on an unmapped label (model and code travel together + * in git, no BWC layer). 
+ */ + private static final Map<String, Cohort> COHORT_TABLE = buildCohortTable(); + + private static Map<String, Cohort> buildCohortTable() { + Map<String, Cohort> m = new HashMap<>(); + for (String label : new String[]{ + "windows-1252", "windows-1250", "windows-1254", "windows-1257", + "windows-1258", "ISO-8859-2", "ISO-8859-3", "ISO-8859-16", + "x-MacRoman", "IBM850", "IBM852"}) { + m.put(label, Cohort.LATIN); + } + for (String label : new String[]{ + "Big5-HKSCS", "EUC-JP", "GB18030", "Shift_JIS", + "x-EUC-TW", "x-windows-949"}) { + m.put(label, Cohort.CJK); + } + for (String label : new String[]{ + "windows-1251", "KOI8-R", "KOI8-U", "IBM855", "IBM866", + "x-mac-cyrillic"}) { + m.put(label, Cohort.CYRILLIC); + } + m.put("windows-1253", Cohort.GREEK); + m.put("windows-1255", Cohort.HEBREW); + m.put("windows-1256", Cohort.ARABIC); + m.put("windows-874", Cohort.THAI); + // Bidi-suffix variants (-ltr/-rtl) share a cohort; toJavaCharsetName + // collapses them at Charset lookup, but their bigram tables differ. + for (String label : new String[]{ + "IBM1047", "IBM500", "IBM420-ltr", "IBM420-rtl", + "IBM424-ltr", "IBM424-rtl"}) { + m.put(label, Cohort.EBCDIC); + } + m.put("UTF-8", Cohort.UTF); + return Collections.unmodifiableMap(m); + } + private final String[] labels; /** Charset objects cached at load — one {@code Charset.forName} per class, ever. */ private final Charset[] charsets; + /** Per-class cohort, parallel to {@link #labels}. */ + private final Cohort[] cohorts; /** * Bigram-major int8 logP layout. Quantized at load time via * per-class scale {@code scale[c] = maxAbs(class c's logP column) / 127}. @@ -198,6 +243,7 @@ public class NaiveBayesBigramEncodingDetector implements EncodingDetector { this.numClasses = dis.readInt(); this.labels = new String[numClasses]; this.charsets = new Charset[numClasses]; + this.cohorts = new Cohort[numClasses]; // Read quantized IDF table + scale. 
float idfScale = dis.readFloat(); @@ -228,6 +274,14 @@ public class NaiveBayesBigramEncodingDetector implements EncodingDetector { cs = null; } charsets[c] = cs; + Cohort cohort = COHORT_TABLE.get(labels[c]); + if (cohort == null) { + throw new IOException( + "NB model class label \"" + labels[c] + + "\" has no cohort assignment; " + + "update NaiveBayesBigramEncodingDetector.COHORT_TABLE."); + } + cohorts[c] = cohort; scale[c] = dis.readFloat(); unseenQ[c] = dis.readByte(); @@ -454,21 +508,29 @@ public class NaiveBayesBigramEncodingDetector implements EncodingDetector { } // logPs are negative; "best" class for the bigram = highest - // (least negative) contribution after dequant. + // (least negative) contribution after dequant. Cap reference + // is the best contribution from a class outside top-1's + // cohort, so the cap engages on cross-cohort gaps that a + // max-vs-overall-runner-up cap missed when multiple classes + // in top-1's cohort sat close together. + int topClass = -1; double max = Double.NEGATIVE_INFINITY; - double secondMax = Double.NEGATIVE_INFINITY; for (int c = 0; c < numClasses; c++) { double contrib = logP8[base + c] * countTimesIdf * perClassDequant[c]; contributions[c] = contrib; if (contrib > max) { - secondMax = max; max = contrib; - } else if (contrib > secondMax) { - secondMax = contrib; + topClass = c; + } + } + Cohort topCohort = cohorts[topClass]; + double bestCrossCohort = Double.NEGATIVE_INFINITY; + for (int c = 0; c < numClasses; c++) { + if (cohorts[c] != topCohort && contributions[c] > bestCrossCohort) { + bestCrossCohort = contributions[c]; } } - // Cap any class whose contribution exceeds runner-up + cap. - double capValue = secondMax + CAP_PER_BIGRAM_NATS; + double capValue = bestCrossCohort + CAP_PER_BIGRAM_NATS; if (max > capValue) { for (int c = 0; c < numClasses; c++) { if (contributions[c] > capValue) {
