This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch junk-detector-v6 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 9bae0246a80ba55bbb06df161ac5a92349ea9cb8 Author: tallison <[email protected]> AuthorDate: Thu May 14 16:11:01 2026 -0400 checkpoint v7 --- tika-ml/tika-ml-junkdetect/pom.xml | 22 + .../apache/tika/ml/junkdetect/JunkDetector.java | 12 +- .../tika/ml/junkdetect/tools/DebugScriptRuns.java | 282 +++++++++++ .../junkdetect/tools/PrototypeCodepointHash.java | 531 +++++++++++++++++++++ .../tika/ml/junkdetect/JunkDetectorSmokeTest.java | 41 ++ 5 files changed, 884 insertions(+), 4 deletions(-) diff --git a/tika-ml/tika-ml-junkdetect/pom.xml b/tika-ml/tika-ml-junkdetect/pom.xml index a10d73ad64..7701ec6ff0 100644 --- a/tika-ml/tika-ml-junkdetect/pom.xml +++ b/tika-ml/tika-ml-junkdetect/pom.xml @@ -59,6 +59,28 @@ <artifactId>tika-encoding-detector-mojibuster</artifactId> <version>${revision}</version> </dependency> + <!-- Used by EvalFixtures-mode tooling to invoke the three production base + detectors (BOM + HTML header + universal statistical) against fixture + bytes. Compile-scope so the diagnostic tool under tools/ links; + the production junk-filter detector discovers these via ServiceLoader. --> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-encoding-detector-html</artifactId> + <version>${revision}</version> + </dependency> + <!-- Bundles the StandardCharsets_unsupported_by_IANA.txt resource that + HtmlEncodingDetector loads from its static initializer. Without + this dep on the eval tool's classpath, HtmlEncodingDetector NPEs. 
--> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parser-html-module</artifactId> + <version>${revision}</version> + </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-encoding-detector-universal</artifactId> + <version>${revision}</version> + </dependency> <!-- Test dependencies --> <dependency> diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java index d932da97cc..a60e66e93c 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java @@ -414,8 +414,12 @@ public final class JunkDetector implements TextQualityDetector { continue; // skip scripts not in model; treat as neutral, not junk } byte[] runUtf8 = run.text.getBytes(StandardCharsets.UTF_8); - if (runUtf8.length < 2) { - continue; // too short to score + // Skip if too short to form a bigram by either metric. A single + // CJK char is 3 UTF-8 bytes (passes the byte filter) but 1 UTF-16 + // unit, and computeF1MeanLogP filters by text.length() < 2 and + // returns NaN — which would poison the weighted sum here. 
+ if (runUtf8.length < 2 || run.text.length() < 2) { + continue; } float logit = scoreChunk(runUtf8, run.text, run.script, z4); int n = runUtf8.length; @@ -477,8 +481,8 @@ public final class JunkDetector implements TextQualityDetector { continue; } byte[] runUtf8 = run.text.getBytes(StandardCharsets.UTF_8); - if (runUtf8.length < 2) { - continue; + if (runUtf8.length < 2 || run.text.length() < 2) { + continue; // see scoreText: paired filter avoids NaN poisoning } float[] zs = computeChunkZs(runUtf8, run.text, run.script); float chunkLogit = combineLogit(zs[0], zs[1], zs[2], z4, run.script); diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/DebugScriptRuns.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/DebugScriptRuns.java new file mode 100644 index 0000000000..36f3a897a0 --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/DebugScriptRuns.java @@ -0,0 +1,282 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.ml.junkdetect.tools; + +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.tika.ml.chardetect.HtmlByteStripper; +import org.apache.tika.ml.junkdetect.JunkDetector; +import org.apache.tika.quality.TextQualityScore; + +/** + * Diagnostic: replicate JunkDetector.buildScriptRuns exactly on a fixture + * and print every run. Helps explain why score() returns UNKNOWN. + * + * <p>Usage: + * <pre> + * ./mvnw exec:java -pl tika-ml/tika-ml-junkdetect \ + * -Dexec.mainClass=org.apache.tika.ml.junkdetect.tools.DebugScriptRuns \ + * -Dexec.args="--file ~/data/regression/.../AIT5... --charset GB18030 --bytes 1024" + * </pre> + */ +public class DebugScriptRuns { + + // Mirror of JunkDetector.SCRIPT_MODEL_FALLBACK — keep in sync if production changes. 
+ private static final Map<String, String> SCRIPT_MODEL_FALLBACK = Map.of( + "HIRAGANA", "HAN", + "KATAKANA", "HAN"); + + public static void main(String[] args) throws IOException { + Path file = null; + String charset = "GB18030"; + int probeBytes = 1024; + boolean strip = true; + boolean expand = true; + + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "--file": + file = Paths.get(expandHome(args[++i])); + break; + case "--charset": + charset = args[++i]; + break; + case "--bytes": + probeBytes = Integer.parseInt(args[++i]); + break; + case "--no-strip": + strip = false; + break; + case "--no-expand": + expand = false; + break; + default: + System.err.println("unknown: " + args[i]); + System.exit(1); + } + } + if (file == null) { + System.err.println("Required: --file <path>"); + System.exit(1); + } + byte[] raw = Files.readAllBytes(file); + byte[] forDecode = raw; + if (strip) { + byte[] dst = new byte[raw.length]; + HtmlByteStripper.Result r = HtmlByteStripper.strip(raw, 0, raw.length, dst, 0); + if (r.tagCount > 0 && r.length > 0) { + forDecode = Arrays.copyOf(dst, r.length); + } + System.err.println("After strip: " + forDecode.length + " bytes (was " + raw.length + ")"); + } + if (forDecode.length > probeBytes) { + forDecode = Arrays.copyOf(forDecode, probeBytes); + } + System.err.println("Probe: " + forDecode.length + " bytes decoded as " + charset); + + String decoded = new String(forDecode, Charset.forName(charset)); + if (expand) { + decoded = expandEntities(decoded); + } + System.err.println("Decoded codepoints: " + decoded.codePointCount(0, decoded.length())); + + List<Run> runs = buildScriptRuns(decoded); + System.err.println("Built " + runs.size() + " script runs."); + + // Mirror JunkDetector.scoreText filter and report what would be scored. 
+ JunkDetector detector = JunkDetector.loadFromClasspath(); + java.util.Set<String> modeled = detector.knownScripts(); + + TreeMap<String, int[]> totals = new TreeMap<>(); // script -> {chars, bytes, runs, modeled?} + int totalScored = 0; + int totalSkippedShort = 0; + int totalSkippedUnmodeled = 0; + long totalBytesScored = 0; + + for (Run r : runs) { + byte[] runUtf8 = r.text.getBytes(StandardCharsets.UTF_8); + boolean isModeled = modeled.contains(r.script); + boolean longEnough = runUtf8.length >= 2; + totals.merge(r.script, new int[]{r.text.codePointCount(0, r.text.length()), + runUtf8.length, 1, isModeled ? 1 : 0}, + (a, b) -> new int[]{a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3]}); + if (!isModeled) { + totalSkippedUnmodeled++; + } else if (!longEnough) { + totalSkippedShort++; + } else { + totalScored++; + totalBytesScored += runUtf8.length; + } + } + + System.out.println("Script roll-up (script: cps, utf8_bytes, runs, modeled):"); + for (Map.Entry<String, int[]> e : totals.entrySet()) { + int[] v = e.getValue(); + System.out.printf(" %-15s cps=%-5d bytes=%-6d runs=%-4d modeled=%s%n", + e.getKey(), v[0], v[1], v[2], v[3] == 1 ? "Y" : "N"); + } + System.out.println(); + System.out.println("Scoring filter outcome:"); + System.out.println(" runs scored: " + totalScored); + System.out.println(" runs skipped (short): " + totalSkippedShort); + System.out.println(" runs skipped (unmod): " + totalSkippedUnmodeled); + System.out.println(" total bytes scored: " + totalBytesScored); + + // The bug: computeF1MeanLogP returns NaN when String.length() < 2. + // String.length() counts UTF-16 code units, but the outer filter uses + // UTF-8 bytes. A single CJK char = 1 UTF-16 unit but 3 UTF-8 bytes, + // so it passes the outer filter and produces NaN inside. 
+ int nanCausing = 0; + for (Run r : runs) { + byte[] u = r.text.getBytes(StandardCharsets.UTF_8); + if (u.length >= 2 && r.text.length() < 2 && modeled.contains(r.script)) { + nanCausing++; + } + } + System.out.println(); + System.out.println("NaN-causing runs (utf8≥2 but utf16<2, modeled): " + nanCausing); + + TextQualityScore score = detector.score(decoded); + System.out.println(" detector.score() z: " + + (score.isUnknown() ? "UNKNOWN(" + score.getDominantScript() + ")" + : String.format("%.3f (script=%s)", score.getZScore(), score.getDominantScript()))); + + // Print the longest 10 runs so we can see what's actually in there. + System.out.println(); + System.out.println("Longest 10 runs:"); + runs.sort((a, b) -> Integer.compare(b.text.length(), a.text.length())); + for (int i = 0; i < Math.min(10, runs.size()); i++) { + Run r = runs.get(i); + byte[] u = r.text.getBytes(StandardCharsets.UTF_8); + String preview = r.text.length() > 30 + ? r.text.substring(0, 30) + "…" : r.text; + preview = preview.replace("\n", "\\n").replace("\r", "\\r"); + System.out.printf(" %-15s cps=%-4d bytes=%-4d preview=%s%n", + r.script, r.text.codePointCount(0, r.text.length()), u.length, preview); + } + } + + // Exact mirror of JunkDetector.buildScriptRuns (private, copied here for diagnosis). 
+ private static List<Run> buildScriptRuns(String text) { + List<Run> runs = new ArrayList<>(); + String currentScript = null; + StringBuilder currentText = new StringBuilder(); + StringBuilder leadingCommon = new StringBuilder(); + for (int i = 0; i < text.length(); ) { + int cp = text.codePointAt(i); + i += Character.charCount(cp); + Character.UnicodeScript s = Character.UnicodeScript.of(cp); + if (s == Character.UnicodeScript.COMMON + || s == Character.UnicodeScript.INHERITED + || s == Character.UnicodeScript.UNKNOWN) { + if (currentScript != null) { + currentText.appendCodePoint(cp); + } else { + leadingCommon.appendCodePoint(cp); + } + continue; + } + String scriptName = SCRIPT_MODEL_FALLBACK.getOrDefault(s.name(), s.name()); + if (!scriptName.equals(currentScript)) { + if (currentScript != null && currentText.length() > 0) { + runs.add(new Run(currentScript, currentText.toString())); + } + currentScript = scriptName; + currentText = new StringBuilder(); + if (leadingCommon.length() > 0) { + currentText.append(leadingCommon); + leadingCommon.setLength(0); + } + } + currentText.appendCodePoint(cp); + } + if (currentScript != null && currentText.length() > 0) { + runs.add(new Run(currentScript, currentText.toString())); + } + return runs; + } + + private static final class Run { + final String script; + final String text; + Run(String s, String t) { + this.script = s; + this.text = t; + } + } + + private static final Pattern NUM_DEC = Pattern.compile("&#(\\d{1,7});"); + private static final Pattern NUM_HEX = Pattern.compile("&#[xX]([0-9a-fA-F]{1,6});"); + private static final Pattern NAMED = + Pattern.compile("&(amp|lt|gt|quot|apos|nbsp|copy|reg);"); + + private static String expandEntities(String in) { + String s = NUM_DEC.matcher(in).replaceAll(mr -> { + try { + int cp = Integer.parseInt(mr.group(1)); + if (cp >= 0 && cp <= 0x10FFFF) { + return Matcher.quoteReplacement(new String(Character.toChars(cp))); + } + } catch (NumberFormatException ignored) { + // 
leave unchanged + } + return Matcher.quoteReplacement(mr.group()); + }); + s = NUM_HEX.matcher(s).replaceAll(mr -> { + try { + int cp = Integer.parseInt(mr.group(1), 16); + if (cp >= 0 && cp <= 0x10FFFF) { + return Matcher.quoteReplacement(new String(Character.toChars(cp))); + } + } catch (NumberFormatException ignored) { + // leave unchanged + } + return Matcher.quoteReplacement(mr.group()); + }); + s = NAMED.matcher(s).replaceAll(mr -> { + switch (mr.group(1)) { + case "amp": return "&"; + case "lt": return "<"; + case "gt": return ">"; + case "quot": return "\""; + case "apos": return "'"; + case "nbsp": return " "; + case "copy": return "©"; + case "reg": return "®"; + default: return Matcher.quoteReplacement(mr.group()); + } + }); + return s; + } + + private static String expandHome(String s) { + return s.startsWith("~/") ? System.getProperty("user.home") + s.substring(1) : s; + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/PrototypeCodepointHash.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/PrototypeCodepointHash.java index 2e044f7f43..0d5f04bdee 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/PrototypeCodepointHash.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/PrototypeCodepointHash.java @@ -38,10 +38,18 @@ import java.util.List; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.stream.Stream; import java.util.zip.GZIPInputStream; +import org.apache.tika.detect.BOMDetector; +import org.apache.tika.detect.EncodingResult; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; import org.apache.tika.ml.chardetect.HtmlByteStripper; import org.apache.tika.ml.junkdetect.JunkDetector; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.html.HtmlEncodingDetector; +import 
org.apache.tika.parser.txt.UniversalEncodingDetector; import org.apache.tika.quality.TextQualityScore; /** @@ -110,6 +118,13 @@ public class PrototypeCodepointHash { int maxRecords = MAX_RECORDS_PER_FILE; List<Path> fixturesDirs = new ArrayList<>(); String wrongCharsetName = "GB18030"; + boolean singleModel = false; + List<String> candidates = List.of( + "UTF-8", "GB18030", "windows-1252", "windows-1251", "windows-1257", + "Shift_JIS", "EUC-JP", "ISO-2022-JP", "UTF-16LE", "UTF-16BE"); + List<String> forceCandidates = null; // when set, skip base detectors + String expected = "UTF-8"; + int[] probeSizes = null; // when set, sweep these probe sizes per fixture for (int i = 0; i < args.length; i++) { switch (args[i]) { @@ -128,6 +143,30 @@ public class PrototypeCodepointHash { case "--wrong-charset": wrongCharsetName = args[++i]; break; + case "--single-model": + // Skip prototype training; run N-way fixture eval on bundled JunkDetector only. + singleModel = true; + break; + case "--candidates": + candidates = Arrays.asList(args[++i].split(",")); + break; + case "--force-candidates": + // Bypass base detectors; pairwise tournament directly on these. + forceCandidates = Arrays.asList(args[++i].split(",")); + break; + case "--expected": + expected = args[++i]; + break; + case "--probe-sizes": + // Comma-separated probe sizes (bytes). Each fixture + // gets one row per size, so you can see how length + // affects UNKNOWN vs scored. + String[] sizes = args[++i].split(","); + probeSizes = new int[sizes.length]; + for (int k = 0; k < sizes.length; k++) { + probeSizes[k] = Integer.parseInt(sizes[k].trim()); + } + break; default: System.err.println("Unknown arg: " + args[i]); System.exit(1); @@ -135,6 +174,18 @@ public class PrototypeCodepointHash { } Files.createDirectories(outputDir); + // --single-model bypasses the v5/v6-prototype comparison apparatus. + // For evaluating the currently-bundled JunkDetector against real fixtures. 
+ if (singleModel) { + if (fixturesDirs.isEmpty()) { + System.err.println("--single-model requires --fixtures-dir"); + System.exit(1); + } + evalFixturesSingleModel(fixturesDirs, candidates, forceCandidates, expected, + probeSizes, outputDir); + return; + } + System.err.println("=== PrototypeCodepointHash ==="); System.err.println(" devtest-dir: " + devtestDir); System.err.println(" output-dir: " + outputDir); @@ -249,6 +300,486 @@ public class PrototypeCodepointHash { System.err.println("Done."); } + // ----------------------------------------------------------------------- + // Real-life fixture eval: runs the production base detectors (BOM + + // HtmlEncodingDetector + UniversalEncodingDetector) and asks the + // JunkDetector to pick among their candidates via pairwise compare. + // Mirrors the production charset-detection arbitration. + // ----------------------------------------------------------------------- + + private static void evalFixturesSingleModel(List<Path> fixturesDirs, + List<String> candidates, // ignored + List<String> forceCandidates, + String expected, + int[] probeSizes, + Path outputDir) throws IOException { + boolean forceMode = forceCandidates != null && !forceCandidates.isEmpty(); + if (forceMode) { + System.err.println("\n--- Forced-candidates fixture eval ---"); + System.err.println(" candidates: " + forceCandidates); + } else { + System.err.println("\n--- Real-life fixture eval (BOM + HTML + Universal) ---"); + } + JunkDetector detector = JunkDetector.loadFromClasspath(); + System.err.println(" model version: " + detector.getModelVersion()); + System.err.println(" expected: " + expected); + + // Pre-resolve forced charsets; skip unsupported ones up front. 
+ List<Charset> forced = new ArrayList<>(); + if (forceMode) { + for (String n : forceCandidates) { + try { + forced.add(Charset.forName(n)); + } catch (Exception e) { + System.err.println(" skip unsupported charset: " + n); + } + } + } + + BOMDetector bom = new BOMDetector(); + HtmlEncodingDetector html = new HtmlEncodingDetector(); + UniversalEncodingDetector universal = new UniversalEncodingDetector(); + ParseContext pctx = new ParseContext(); + + Path out = outputDir.resolve("fixtures-real-life.tsv"); + try (PrintWriter pw = new PrintWriter( + Files.newBufferedWriter(out, StandardCharsets.UTF_8))) { + pw.println("dir\tfile\tn_bytes\tprobe_size\texpected\tbom_cs\thtml_cs\tuniversal_cs" + + "\tcandidates\twinner\tmargin\tstatus\tnotes"); + int pass = 0, fail = 0, skip = 0, agree = 0; + double passMarginSum = 0.0; + List<String> failingLines = new ArrayList<>(); + + for (Path dir : fixturesDirs) { + if (!Files.isDirectory(dir)) { + System.err.println(" WARN: not a directory: " + dir); + continue; + } + try (Stream<Path> stream = Files.walk(dir)) { + List<Path> files = new ArrayList<>(); + stream.filter(Files::isRegularFile).forEach(files::add); + Collections.sort(files); + int[] sizes = probeSizes != null ? probeSizes : new int[]{16_384}; + for (Path f : files) { + for (int sz : sizes) { + FixtureResult r = forceMode + ? 
evalOneForced(f, expected, detector, forced, sz) + : evalOneRealLife(f, expected, detector, bom, html, + universal, pctx, sz); + pw.println(r.toTsvLine()); + switch (r.status) { + case "PASS": + pass++; + passMarginSum += r.margin; + break; + case "FAIL": + fail++; + failingLines.add(r.dir + "/" + r.shortName + + "@" + sz + " -> " + r.winner + + " (expected " + r.expected + ")"); + break; + case "AGREE": + agree++; + break; + default: + skip++; + } + } + } + } + } + int n = pass + fail; + System.err.println(); + System.err.println("=== Summary ==="); + System.err.printf("Pass: %d / %d (%.1f%%) — JunkDetector picked the expected charset%n", + pass, n, n == 0 ? 0.0 : 100.0 * pass / n); + System.err.printf("Fail: %d%n", fail); + System.err.printf("Agree: %d (all detectors agreed; no arbitration needed)%n", agree); + System.err.printf("Skip: %d%n", skip); + if (pass > 0) { + System.err.printf("Mean margin on pass: %.3f%n", passMarginSum / pass); + } + if (!failingLines.isEmpty()) { + System.err.println("Failing:"); + Collections.sort(failingLines); + for (String line : failingLines) { + System.err.println(" " + line); + } + } + } + System.err.println("Wrote " + out); + } + + private static FixtureResult evalOneForced(Path file, String expected, + JunkDetector detector, + List<Charset> forced, + int probeBytes) throws IOException { + byte[] raw = Files.readAllBytes(file); + FixtureResult r = new FixtureResult(); + r.dir = file.getParent().getFileName().toString(); + String fname = file.getFileName().toString(); + r.shortName = fname.length() > 24 ? fname.substring(0, 24) : fname; + r.bytes = raw.length; + r.probeSize = probeBytes; + r.expected = expected; + + if (isBinaryMagic(raw)) { + r.status = "SKIP_BIN"; + return r; + } + // Strip HTML on the WHOLE raw buffer first, then slice to probeBytes + // from the stripped content. Otherwise a small probe slice can land + // entirely inside <!DOCTYPE>/<html>/<head> boilerplate and leave + // nothing to score after strip. 
+ byte[] strippedFull = stripHtmlBytes(raw); + byte[] forDecode = strippedFull.length > probeBytes + ? Arrays.copyOf(strippedFull, probeBytes) : strippedFull; + r.candidatesStr = forced.stream().map(Charset::name) + .reduce((a, b) -> a + "," + b).orElse("-"); + + // Always log every candidate in notes — even those JunkDetector + // rejects as unknown — so the failure mode is visible. An + // "unknown" score itself is meaningful information when the other + // candidate scored fine. + String winner = null; + String runner = null; + float winnerZ = Float.NEGATIVE_INFINITY; + float runnerZ = Float.NEGATIVE_INFINITY; + StringBuilder notes = new StringBuilder(); + int decoded_scored = 0; + for (Charset cs : forced) { + String decoded = applyEntityVariant(new String(forDecode, cs), "expanded"); + int cps = toCodepoints(decoded).length; + if (cps < 3) { + notes.append(cs.name()).append("=TOO_SHORT(").append(cps).append(") "); + continue; + } + TextQualityScore s = detector.score(decoded); + if (s.isUnknown()) { + // Diagnose: is this script-not-in-model (neutral case) or + // all-runs-fragmented-too-short (a real mojibake signal)? + String why = diagnoseUnknown(decoded, detector); + notes.append(cs.name()).append("=UNK[").append(why).append("] "); + continue; + } + float z = s.getZScore(); + notes.append(cs.name()).append("=").append(String.format("%.2f", z)).append(" "); + decoded_scored++; + if (z > winnerZ) { + runner = winner; + runnerZ = winnerZ; + winner = cs.name(); + winnerZ = z; + } else if (z > runnerZ) { + runner = cs.name(); + runnerZ = z; + } + } + if (winner == null) { + r.status = "NO_DECODE"; + r.notes = notes.toString().trim(); + return r; + } + r.winner = winner; + if (decoded_scored < 2) { + // Only one candidate scored; no real arbitration happened. + r.margin = Float.NaN; + r.status = safeCanonical(winner).equals(safeCanonical(expected)) + ? 
"ONLY_EXPECTED_SCORED" : "ONLY_WRONG_SCORED"; + } else { + r.margin = winnerZ - runnerZ; + r.status = safeCanonical(winner).equals(safeCanonical(expected)) ? "PASS" : "FAIL"; + } + r.notes = notes.toString().trim(); + return r; + } + + private static FixtureResult evalOneRealLife(Path file, String expected, + JunkDetector detector, + BOMDetector bom, + HtmlEncodingDetector html, + UniversalEncodingDetector universal, + ParseContext pctx, + int probeBytes) throws IOException { + byte[] raw = Files.readAllBytes(file); + int origLen = raw.length; + FixtureResult r = new FixtureResult(); + r.dir = file.getParent().getFileName().toString(); + String fname = file.getFileName().toString(); + r.shortName = fname.length() > 24 ? fname.substring(0, 24) : fname; + r.bytes = origLen; + r.probeSize = probeBytes; + r.expected = expected; + + if (isBinaryMagic(raw)) { + r.status = "SKIP_BIN"; + return r; + } + + // Probe bytes for the base detectors (16 KB matches production read limit). + // For the base detectors we keep the raw bytes (the BOM detector and + // HTML-header sniff both want the original prefix). + byte[] probe = raw.length > probeBytes ? Arrays.copyOf(raw, probeBytes) : raw; + + r.bomCs = firstCharset(bom, probe, pctx); + r.htmlCs = firstCharset(html, probe, pctx); + r.universalCs = firstCharset(universal, probe, pctx); + + // Collect distinct candidates in order of priority: BOM > HTML > universal. + List<Charset> candList = new ArrayList<>(); + addUnique(candList, r.bomCs); + addUnique(candList, r.htmlCs); + addUnique(candList, r.universalCs); + r.candidatesStr = candList.stream().map(Charset::name) + .reduce((a, b) -> a + "," + b).orElse("-"); + + if (candList.isEmpty()) { + r.status = "NO_CANDIDATES"; + return r; + } + if (candList.size() == 1) { + // All detectors agreed (or only one fired): no arbitration to do. + r.winner = candList.get(0).name(); + r.status = safeCanonical(r.winner).equals(safeCanonical(expected)) ? 
"AGREE" : "AGREE_WRONG"; + return r; + } + + // Strip HTML from the FULL raw bytes, then slice to probeBytes from + // the stripped content — so a small probe-size doesn't land inside + // the DOCTYPE/head boilerplate with nothing left to score. + byte[] strippedFull = stripHtmlBytes(raw); + byte[] forDecode = strippedFull.length > probeBytes + ? Arrays.copyOf(strippedFull, probeBytes) : strippedFull; + // Pairwise tournament — pick the candidate that beats all others. + Charset winnerCs = candList.get(0); + float bestMargin = Float.POSITIVE_INFINITY; + for (int i = 1; i < candList.size(); i++) { + Charset challenger = candList.get(i); + String aDecoded = applyEntityVariant(new String(forDecode, winnerCs), "expanded"); + String bDecoded = applyEntityVariant(new String(forDecode, challenger), "expanded"); + TextQualityScore aScore = detector.score(aDecoded); + TextQualityScore bScore = detector.score(bDecoded); + if (aScore.isUnknown() || bScore.isUnknown()) { + continue; + } + float margin = aScore.getZScore() - bScore.getZScore(); + if (margin < 0) { + winnerCs = challenger; + margin = -margin; + } + bestMargin = Math.min(bestMargin, Math.abs(margin)); + } + r.winner = winnerCs.name(); + r.margin = Float.isInfinite(bestMargin) ? Float.NaN : bestMargin; + r.status = safeCanonical(r.winner).equals(safeCanonical(expected)) ? "PASS" : "FAIL"; + return r; + } + + private static String firstCharset(org.apache.tika.detect.EncodingDetector d, + byte[] bytes, ParseContext pctx) { + try (TikaInputStream tis = + TikaInputStream.get(new java.io.ByteArrayInputStream(bytes))) { + List<EncodingResult> results = d.detect(tis, new Metadata(), pctx); + if (results == null || results.isEmpty()) { + return null; + } + Charset cs = results.get(0).getCharset(); + return cs == null ? 
null : cs.name(); + } catch (Exception e) { + return null; + } + } + + private static void addUnique(List<Charset> list, String name) { + if (name == null) { + return; + } + Charset cs; + try { + cs = Charset.forName(name); + } catch (Exception e) { + return; + } + for (Charset c : list) { + if (c.equals(cs)) { + return; + } + } + list.add(cs); + } + + /** + * Diagnose why JunkDetector returned UNKNOWN for {@code text}. Walks + * the same script-run logic, then classifies the failure mode: + * <ul> + * <li>{@code EMPTY} — input had no characters.</li> + * <li>{@code NO_MODELED_SCRIPT} — all runs are in scripts the model + * doesn't know (legit reason to be neutral).</li> + * <li>{@code ALL_RUNS_TOO_SHORT(N)} — runs exist in modeled scripts + * but every one is <2 UTF-8 bytes. Strong mojibake signal — + * text is a salad of single codepoints from many scripts.</li> + * <li>{@code MIXED} — some runs were modeled-but-too-short and + * some were unmodeled.</li> + * </ul> + */ + private static String diagnoseUnknown(String text, JunkDetector detector) { + if (text == null || text.isEmpty()) { + return "EMPTY"; + } + Set<String> modeled = detector.knownScripts(); + // Walk codepoints, splitting on script boundaries — same as + // JunkDetector.buildScriptRuns conceptually. Track per-script: + // longest UTF-8-byte run length, plus a separate "unmodeled" tally. + java.util.Map<String, Integer> longestModeled = new java.util.HashMap<>(); + int unmodeledRuns = 0; + int modeledTooShortRuns = 0; + int currentBytes = 0; + String currentScript = null; + for (int i = 0; i < text.length(); ) { + int cp = text.codePointAt(i); + int charCount = Character.charCount(cp); + String script = Character.UnicodeScript.of(cp).name(); + // COMMON / INHERITED / UNKNOWN attach to preceding run, but for + // diagnosis we don't need to be that precise — treat them as a + // continuation. 
+ if ("COMMON".equals(script) || "INHERITED".equals(script) + || "UNKNOWN".equals(script)) { + if (currentScript != null) { + currentBytes += new String(new int[]{cp}, 0, 1) + .getBytes(StandardCharsets.UTF_8).length; + } + } else if (script.equals(currentScript)) { + currentBytes += new String(new int[]{cp}, 0, 1) + .getBytes(StandardCharsets.UTF_8).length; + } else { + // close out previous run + tallyRun(currentScript, currentBytes, modeled, longestModeled); + if (currentScript != null) { + if (!modeled.contains(currentScript)) { + unmodeledRuns++; + } else if (currentBytes < 2) { + modeledTooShortRuns++; + } + } + currentScript = script; + currentBytes = new String(new int[]{cp}, 0, 1) + .getBytes(StandardCharsets.UTF_8).length; + } + i += charCount; + } + // close final run + if (currentScript != null) { + if (!modeled.contains(currentScript)) { + unmodeledRuns++; + } else if (currentBytes < 2) { + modeledTooShortRuns++; + } else { + longestModeled.merge(currentScript, currentBytes, Math::max); + } + } + boolean anyModeledLong = !longestModeled.isEmpty(); + if (anyModeledLong) { + // Some modeled run is ≥2 bytes — shouldn't have hit UNKNOWN. + // (Possible discrepancy with the production logic; reported as MIXED.) 
+ return "MIXED(modeled_long=" + longestModeled.size() + ")"; + } + if (modeledTooShortRuns > 0 && unmodeledRuns > 0) { + return "MIXED(short=" + modeledTooShortRuns + + ",unmodeled=" + unmodeledRuns + ")"; + } + if (modeledTooShortRuns > 0) { + return "ALL_RUNS_TOO_SHORT(" + modeledTooShortRuns + ")"; + } + if (unmodeledRuns > 0) { + return "NO_MODELED_SCRIPT(" + unmodeledRuns + ")"; + } + return "OTHER"; + } +
+    /**
+     * Records a closed script run in {@code longestModeled} if it is long
+     * enough to count.  A run counts only when its script is in
+     * {@code modeled} and it spans at least 2 bytes; the map keeps the
+     * largest byte count seen per script.
+     *
+     * @param script         script name of the run, or {@code null} when no
+     *                       run has been opened yet (nothing is recorded)
+     * @param bytes          byte length accumulated for the run
+     * @param modeled        script names treated as modeled
+     * @param longestModeled per-script maximum run length, updated in place
+     */
+    private static void tallyRun(String script, int bytes, Set<String> modeled,
+                                 java.util.Map<String, Integer> longestModeled) {
+        if (script == null) {
+            return;
+        }
+        if (modeled.contains(script) && bytes >= 2) {
+            longestModeled.merge(script, bytes, Math::max);
+        }
+    }
+
+    /**
+     * Run HtmlByteStripper over the entire input; return the stripped
+     * content bytes.  The input is returned verbatim when the stripper
+     * reports no tags, or when it reports tags but no content is left.
+     *
+     * @param raw bytes to strip
+     * @return a copy holding only the stripped content, or {@code raw} itself
+     */
+    private static byte[] stripHtmlBytes(byte[] raw) {
+        byte[] dst = new byte[raw.length];
+        HtmlByteStripper.Result r =
+                HtmlByteStripper.strip(raw, 0, raw.length, dst, 0);
+        if (r.tagCount > 0 && r.length > 0) {
+            return Arrays.copyOf(dst, r.length);
+        }
+        return raw;
+    }
+
+    /**
+     * Cheap leading-byte check for a few well-known binary containers.
+     * Inputs shorter than 4 bytes are never reported as binary.
+     *
+     * @param b bytes to inspect; only the first four are examined
+     * @return {@code true} when the prefix matches one of the signatures below
+     */
+    private static boolean isBinaryMagic(byte[] b) {
+        if (b.length < 4) {
+            return false;
+        }
+        if (b[0] == 0x50 && b[1] == 0x4B
+                && (b[2] == 0x03 || b[2] == 0x05 || b[2] == 0x07)) {
+            return true; // ZIP / JAR / APK / docx
+        }
+        if ((b[0] & 0xFF) == 0x1F && (b[1] & 0xFF) == 0x8B) {
+            return true; // gzip
+        }
+        if (b[0] == '%' && b[1] == 'P' && b[2] == 'D' && b[3] == 'F') {
+            return true; // PDF
+        }
+        if ((b[0] & 0xFF) == 0xD0 && (b[1] & 0xFF) == 0xCF) {
+            return true; // OLE2
+        }
+        return false;
+    }
+
+    /**
+     * Best-effort canonical form of a charset label.
+     *
+     * @param charset charset label, may be {@code null}
+     * @return the empty string for {@code null}; the JVM's canonical name
+     *         when {@link Charset#forName(String)} resolves the label;
+     *         otherwise the label upper-cased
+     */
+    private static String safeCanonical(String charset) {
+        if (charset == null) {
+            return "";
+        }
+        try {
+            return Charset.forName(charset).name();
+        } catch (Exception e) {
+            // Label is illegal or unsupported on this JVM: fall back to a
+            // case-normalised copy.  Locale.ROOT keeps the result the same on
+            // every machine; the default locale would turn "windows-1254"
+            // into "WİNDOWS-1254" under a Turkish locale.
+            return charset.toUpperCase(java.util.Locale.ROOT);
+        }
+    }
+
+    /**
+     * Mutable holder for the outcome of one fixture, rendered as a single
+     * 13-column tab-separated row by {@link #toTsvLine()}.
+     */
+    private static final class FixtureResult {
+        String dir;
+        String shortName;
+        int bytes;
+        int probeSize;
+        String expected;
+        String bomCs;
+        String htmlCs;
+        String universalCs;
+        String candidatesStr = "-";
+        String winner = "-";
+        float margin = Float.NaN;
+        String status = "";
+        String notes = "";
+
+        /**
+         * Formats this result as one TSV row.  A {@code null} detector
+         * charset or winner, a NaN margin and empty notes are all written
+         * as {@code "-"}.  Numbers are formatted with {@code Locale.ROOT}
+         * so the row has ASCII digits and a '.' decimal separator whatever
+         * the default locale is.
+         *
+         * @return the tab-separated row, without a trailing newline
+         */
+        String toTsvLine() {
+            return String.format(java.util.Locale.ROOT,
+                    "%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s",
+                    dir, shortName, bytes, probeSize, expected,
+                    str(bomCs), str(htmlCs), str(universalCs),
+                    candidatesStr, str(winner),
+                    Float.isNaN(margin)
+                            ? "-" : String.format(java.util.Locale.ROOT, "%.3f", margin),
+                    status, notes.isEmpty() ? "-" : notes);
+        }
+
+        /** Maps {@code null} to the {@code "-"} placeholder used in the TSV. */
+        private static String str(String s) {
+            return s == null ? "-" : s;
+        }
+    }
+ // ----------------------------------------------------------------------- // Fixture eval: score real-world AIT5-class HTML files under v5 and v6 // prototype, with byte-level HTML stripping and entity-variant comparison. diff --git a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java index a277d2d79f..d8f267ecc2 100644 --- a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java +++ b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java @@ -17,6 +17,7 @@ package org.apache.tika.ml.junkdetect; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; import java.nio.charset.StandardCharsets; @@ -200,6 +201,46 @@ public class JunkDetectorSmokeTest { "Shift-JIS decode should beat garbled UTF-8 for short Japanese filename"); } + /** + * Regression: a single CJK codepoint sandwiched between modeled-script + * runs used to NaN-poison the entire score, because the byte-length + * filter ({@code runUtf8.length >= 2}) and the UTF-16 char-length + * filter inside {@code computeF1MeanLogP} ({@code text.length() >= 2}) + * disagreed.
+     * A single CJK char is 3 UTF-8 bytes (1 UTF-16 unit), so
+     * it passed the outer filter, computed NaN inside, and poisoned the
+     * weighted aggregate — surfacing as UNKNOWN to callers.  This was the
+     * root cause of the AIT5-class regressions (UTF-8 Malayalam decoded as
+     * GB18030 returns lots of single-Han-char runs).
+     */
+    @Test
+    void singleCjkCharDoesNotNaNPoisonScore() {
+        // English prose with one isolated CJK character per sentence.  Each
+        // one forms a single-codepoint HAN run, the same run-boundary shape
+        // a GB18030-mojibake-of-UTF-8 decode produces.
+        String sample = "The quick brown 中 fox jumps over the lazy dog. "
+                + "Pack 中 my box with five dozen liquor jugs.";
+        TextQualityScore result = detector.score(sample);
+        boolean unknown = result.isUnknown();
+        assertFalse(unknown,
+                "score should not be UNKNOWN — single-CJK run should be skipped, "
+                        + "not poison the aggregate. Got: " + result);
+    }
+
+    /**
+     * Companion check for a lone supplementary-plane codepoint (4 UTF-8
+     * bytes, 2 UTF-16 units).  Such a char has {@code text.length() == 2},
+     * so it already clears the inner length filter and matters less than
+     * the BMP-CJK case; the test pins that scoring still yields a result.
+     */
+    @Test
+    void supplementaryPlaneCharSurvivesScoring() {
+        // U+1F600 (😀) has script COMMON and takes two UTF-16 units, so it
+        // joins the preceding modeled run instead of opening one of its own.
+        String sample = "Hello world 😀 this is some plain English text.";
+        TextQualityScore result = detector.score(sample);
+        assertFalse(result.isUnknown(), "supplementary char should not break scoring: " + result);
+    }
+ // ----------------------------------------------------------------------- /**
