(tika) 01/02: checkpoint v7

tallison Thu, 14 May 2026 13:41:36 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch junk-detector-v6
in repository https://gitbox.apache.org/repos/asf/tika.git


commit 9bae0246a80ba55bbb06df161ac5a92349ea9cb8
Author: tallison <[email protected]>
AuthorDate: Thu May 14 16:11:01 2026 -0400

    checkpoint v7
---
 tika-ml/tika-ml-junkdetect/pom.xml                 |  22 +
 .../apache/tika/ml/junkdetect/JunkDetector.java    |  12 +-
 .../tika/ml/junkdetect/tools/DebugScriptRuns.java  | 282 +++++++++++
 .../junkdetect/tools/PrototypeCodepointHash.java   | 531 +++++++++++++++++++++
 .../tika/ml/junkdetect/JunkDetectorSmokeTest.java  |  41 ++
 5 files changed, 884 insertions(+), 4 deletions(-)

diff --git a/tika-ml/tika-ml-junkdetect/pom.xml 
b/tika-ml/tika-ml-junkdetect/pom.xml
index a10d73ad64..7701ec6ff0 100644
--- a/tika-ml/tika-ml-junkdetect/pom.xml
+++ b/tika-ml/tika-ml-junkdetect/pom.xml
@@ -59,6 +59,28 @@
       <artifactId>tika-encoding-detector-mojibuster</artifactId>
       <version>${revision}</version>
     </dependency>
+    <!-- Used by EvalFixtures-mode tooling to invoke the three production base
+         detectors (BOM + HTML header + universal statistical) against fixture
+         bytes.  Compile-scope so the diagnostic tool under tools/ links;
+         the production junk-filter detector discovers these via
+         ServiceLoader. -->
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-encoding-detector-html</artifactId>
+      <version>${revision}</version>
+    </dependency>
+    <!-- Bundles the StandardCharsets_unsupported_by_IANA.txt resource that
+         HtmlEncodingDetector loads from its static initializer.  Without
+         this dep on the eval tool's classpath, HtmlEncodingDetector NPEs. -->
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-parser-html-module</artifactId>
+      <version>${revision}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-encoding-detector-universal</artifactId>
+      <version>${revision}</version>
+    </dependency>
 
     <!-- Test dependencies -->
     <dependency>
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
index d932da97cc..a60e66e93c 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
@@ -414,8 +414,12 @@ public final class JunkDetector implements 
TextQualityDetector {
                 continue; // skip scripts not in model; treat as neutral, not 
junk
             }
             byte[] runUtf8 = run.text.getBytes(StandardCharsets.UTF_8);
-            if (runUtf8.length < 2) {
-                continue; // too short to score
+            // Skip if too short to form a bigram by either metric.  A single
+            // CJK char is 3 UTF-8 bytes (passes the byte filter) but 1 UTF-16
+            // unit, and computeF1MeanLogP filters by text.length() < 2 and
+            // returns NaN — which would poison the weighted sum here.
+            if (runUtf8.length < 2 || run.text.length() < 2) {
+                continue;
             }
             float logit = scoreChunk(runUtf8, run.text, run.script, z4);
             int n = runUtf8.length;
@@ -477,8 +481,8 @@ public final class JunkDetector implements 
TextQualityDetector {
                 continue;
             }
             byte[] runUtf8 = run.text.getBytes(StandardCharsets.UTF_8);
-            if (runUtf8.length < 2) {
-                continue;
+            if (runUtf8.length < 2 || run.text.length() < 2) {
+                continue; // see scoreText: paired filter avoids NaN poisoning
             }
             float[] zs = computeChunkZs(runUtf8, run.text, run.script);
             float chunkLogit = combineLogit(zs[0], zs[1], zs[2], z4, 
run.script);
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/DebugScriptRuns.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/DebugScriptRuns.java
new file mode 100644
index 0000000000..36f3a897a0
--- /dev/null
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/DebugScriptRuns.java
@@ -0,0 +1,282 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect.tools;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.ml.chardetect.HtmlByteStripper;
+import org.apache.tika.ml.junkdetect.JunkDetector;
+import org.apache.tika.quality.TextQualityScore;
+
+/**
+ * Diagnostic: replicate JunkDetector.buildScriptRuns exactly on a fixture
+ * and print every run.  Helps explain why score() returns UNKNOWN.
+ *
+ * <p>Usage:
+ * <pre>
+ *   ./mvnw exec:java -pl tika-ml/tika-ml-junkdetect \
+ *     -Dexec.mainClass=org.apache.tika.ml.junkdetect.tools.DebugScriptRuns \
+ *     -Dexec.args="--file ~/data/regression/.../AIT5... --charset GB18030 
--bytes 1024"
+ * </pre>
+ */
+public class DebugScriptRuns {
+
+    // Mirror of JunkDetector.SCRIPT_MODEL_FALLBACK — keep in sync if
+    // production changes.
+    private static final Map<String, String> SCRIPT_MODEL_FALLBACK = Map.of(
+            "HIRAGANA", "HAN",
+            "KATAKANA", "HAN");
+
+    /**
+     * Decodes a probe of the given file, splits it into script runs and
+     * prints: a per-script roll-up, the outcome of the scoring filter, the
+     * detector's overall score, and the ten longest runs.
+     *
+     * <p>Options: {@code --file <path>} (required), {@code --charset <name>}
+     * (default GB18030), {@code --bytes <n>} (default 1024),
+     * {@code --no-strip} (skip HTML byte stripping) and {@code --no-expand}
+     * (skip entity expansion).  Usage errors print a message and exit with
+     * status 1.
+     *
+     * @param args command-line arguments
+     * @throws IOException if the file cannot be read
+     */
+    public static void main(String[] args) throws IOException {
+        Path file = null;
+        String charset = "GB18030";
+        int probeBytes = 1024;
+        boolean strip = true;
+        boolean expand = true;
+
+        for (int i = 0; i < args.length; i++) {
+            switch (args[i]) {
+                case "--file":
+                    file = Paths.get(expandHome(requireValue(args, ++i)));
+                    break;
+                case "--charset":
+                    charset = requireValue(args, ++i);
+                    break;
+                case "--bytes":
+                    probeBytes = Integer.parseInt(requireValue(args, ++i));
+                    break;
+                case "--no-strip":
+                    strip = false;
+                    break;
+                case "--no-expand":
+                    expand = false;
+                    break;
+                default:
+                    System.err.println("unknown: " + args[i]);
+                    System.exit(1);
+            }
+        }
+        if (file == null) {
+            System.err.println("Required: --file <path>");
+            System.exit(1);
+        }
+        byte[] raw = Files.readAllBytes(file);
+        byte[] forDecode = raw;
+        if (strip) {
+            byte[] dst = new byte[raw.length];
+            HtmlByteStripper.Result r =
+                    HtmlByteStripper.strip(raw, 0, raw.length, dst, 0);
+            // Only adopt the stripped bytes when tags were actually removed
+            // and something is left; otherwise keep decoding the raw bytes.
+            if (r.tagCount > 0 && r.length > 0) {
+                forDecode = Arrays.copyOf(dst, r.length);
+            }
+            System.err.println("After strip: " + forDecode.length
+                    + " bytes (was " + raw.length + ")");
+        }
+        if (forDecode.length > probeBytes) {
+            forDecode = Arrays.copyOf(forDecode, probeBytes);
+        }
+        System.err.println("Probe: " + forDecode.length
+                + " bytes decoded as " + charset);
+
+        String decoded = new String(forDecode, Charset.forName(charset));
+        if (expand) {
+            decoded = expandEntities(decoded);
+        }
+        System.err.println("Decoded codepoints: "
+                + decoded.codePointCount(0, decoded.length()));
+
+        List<Run> runs = buildScriptRuns(decoded);
+        System.err.println("Built " + runs.size() + " script runs.");
+
+        // Mirror JunkDetector.scoreText filter and report what would be scored.
+        JunkDetector detector = JunkDetector.loadFromClasspath();
+        java.util.Set<String> modeled = detector.knownScripts();
+
+        // script -> {chars, bytes, runs, modeled?}
+        TreeMap<String, int[]> totals = new TreeMap<>();
+        int totalScored = 0;
+        int totalSkippedShort = 0;
+        int totalSkippedUnmodeled = 0;
+        long totalBytesScored = 0;
+        // Runs that pass a byte-only length test (>= 2 UTF-8 bytes) but are a
+        // single UTF-16 unit, e.g. one CJK char.  computeF1MeanLogP returns
+        // NaN for these, which is why scoreText applies the paired filter.
+        int nanCausing = 0;
+
+        for (Run r : runs) {
+            byte[] runUtf8 = r.text.getBytes(StandardCharsets.UTF_8);
+            boolean isModeled = modeled.contains(r.script);
+            boolean enoughBytes = runUtf8.length >= 2;
+            // Paired filter, same as scoreText: both UTF-8 bytes and UTF-16
+            // units must be able to form a bigram.
+            boolean longEnough = enoughBytes && r.text.length() >= 2;
+            totals.merge(r.script,
+                    new int[]{r.text.codePointCount(0, r.text.length()),
+                            runUtf8.length, 1, isModeled ? 1 : 0},
+                    (a, b) -> new int[]{a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3]});
+            if (!isModeled) {
+                totalSkippedUnmodeled++;
+            } else if (!longEnough) {
+                totalSkippedShort++;
+                if (enoughBytes) {
+                    nanCausing++;
+                }
+            } else {
+                totalScored++;
+                totalBytesScored += runUtf8.length;
+            }
+        }
+
+        System.out.println("Script roll-up (script: cps, utf8_bytes, runs, modeled):");
+        for (Map.Entry<String, int[]> e : totals.entrySet()) {
+            int[] v = e.getValue();
+            System.out.printf("  %-15s cps=%-5d bytes=%-6d runs=%-4d modeled=%s%n",
+                    e.getKey(), v[0], v[1], v[2], v[3] == 1 ? "Y" : "N");
+        }
+        System.out.println();
+        System.out.println("Scoring filter outcome:");
+        System.out.println("  runs scored:           " + totalScored);
+        System.out.println("  runs skipped (short):  " + totalSkippedShort);
+        System.out.println("  runs skipped (unmod):  " + totalSkippedUnmodeled);
+        System.out.println("  total bytes scored:    " + totalBytesScored);
+
+        System.out.println();
+        System.out.println("NaN-causing runs (utf8≥2 but utf16<2, modeled): "
+                + nanCausing);
+
+        TextQualityScore score = detector.score(decoded);
+        System.out.println("  detector.score() z:    "
+                + (score.isUnknown()
+                ? "UNKNOWN(" + score.getDominantScript() + ")"
+                : String.format("%.3f (script=%s)", score.getZScore(),
+                        score.getDominantScript())));
+
+        // Print the longest 10 runs so we can see what's actually in there.
+        System.out.println();
+        System.out.println("Longest 10 runs:");
+        runs.sort((a, b) -> Integer.compare(b.text.length(), a.text.length()));
+        for (int i = 0; i < Math.min(10, runs.size()); i++) {
+            Run r = runs.get(i);
+            byte[] u = r.text.getBytes(StandardCharsets.UTF_8);
+            int cps = r.text.codePointCount(0, r.text.length());
+            // Truncate on a code-point boundary so a supplementary character
+            // is never cut in half in the preview.
+            String preview = cps > 30
+                    ? r.text.substring(0, r.text.offsetByCodePoints(0, 30)) + "…"
+                    : r.text;
+            preview = preview.replace("\n", "\\n").replace("\r", "\\r");
+            System.out.printf("  %-15s cps=%-4d bytes=%-4d preview=%s%n",
+                    r.script, cps, u.length, preview);
+        }
+    }
+
+    /**
+     * Returns the option value at {@code valueIndex}, or prints a usage error
+     * and exits with status 1 when the option was the last argument.
+     */
+    private static String requireValue(String[] args, int valueIndex) {
+        if (valueIndex >= args.length) {
+            System.err.println("Missing value for " + args[valueIndex - 1]);
+            System.exit(1);
+        }
+        return args[valueIndex];
+    }
+
+    // Exact mirror of JunkDetector.buildScriptRuns (private, copied here for
+    // diagnosis).
+    /**
+     * Splits {@code text} into consecutive runs of a single script.
+     *
+     * <p>The text is walked one code point at a time.  COMMON, INHERITED and
+     * UNKNOWN code points never start or end a run: they are appended to the
+     * run in progress, or buffered and prepended to the first run if none has
+     * started yet.  Script names are mapped through
+     * {@code SCRIPT_MODEL_FALLBACK} before comparison, so scripts that share
+     * a fallback name are merged into one run.
+     *
+     * @param text the decoded text to segment
+     * @return the runs in order of appearance; empty when {@code text}
+     *         contains no code point outside COMMON/INHERITED/UNKNOWN
+     */
+    private static List<Run> buildScriptRuns(String text) {
+        List<Run> runs = new ArrayList<>();
+        String currentScript = null;
+        StringBuilder currentText = new StringBuilder();
+        // Neutral code points seen before the first scripted code point.
+        StringBuilder leadingCommon = new StringBuilder();
+        for (int i = 0; i < text.length(); ) {
+            int cp = text.codePointAt(i);
+            // Advance by 1 or 2 chars so surrogate pairs stay together.
+            i += Character.charCount(cp);
+            Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+            if (s == Character.UnicodeScript.COMMON
+                    || s == Character.UnicodeScript.INHERITED
+                    || s == Character.UnicodeScript.UNKNOWN) {
+                if (currentScript != null) {
+                    currentText.appendCodePoint(cp);
+                } else {
+                    leadingCommon.appendCodePoint(cp);
+                }
+                continue;
+            }
+            String scriptName = SCRIPT_MODEL_FALLBACK.getOrDefault(s.name(), 
s.name());
+            if (!scriptName.equals(currentScript)) {
+                // Script changed: close the run in progress and open a new one.
+                if (currentScript != null && currentText.length() > 0) {
+                    runs.add(new Run(currentScript, currentText.toString()));
+                }
+                currentScript = scriptName;
+                currentText = new StringBuilder();
+                if (leadingCommon.length() > 0) {
+                    currentText.append(leadingCommon);
+                    leadingCommon.setLength(0);
+                }
+            }
+            currentText.appendCodePoint(cp);
+        }
+        // Flush the final run.
+        if (currentScript != null && currentText.length() > 0) {
+            runs.add(new Run(currentScript, currentText.toString()));
+        }
+        return runs;
+    }
+
+    /** Immutable pairing of a script name with the text of one script run. */
+    private static final class Run {
+        final String script;
+        final String text;
+
+        Run(String scriptName, String runText) {
+            script = scriptName;
+            text = runText;
+        }
+    }
+
+    // One alternation so a single left-to-right pass handles every entity
+    // form: group 1 = decimal digits, group 2 = hex digits, group 3 = name.
+    private static final Pattern ENTITY = Pattern.compile(
+            "&(?:#(\\d{1,7})|#[xX]([0-9a-fA-F]{1,6})"
+                    + "|(amp|lt|gt|quot|apos|nbsp|copy|reg));");
+
+    /**
+     * Expands decimal ({@code &#65;}), hexadecimal ({@code &#x41;}) and a
+     * small set of named character references in {@code in}.
+     *
+     * <p>All forms are expanded in one pass, so text produced by one
+     * expansion is never re-scanned: {@code &#38;lt;} becomes the literal
+     * text {@code &lt;}, not {@code <}.  References that are out of the
+     * Unicode range or not recognised are left unchanged.
+     *
+     * @param in text that may contain character references
+     * @return the text with recognised references replaced
+     */
+    private static String expandEntities(String in) {
+        return ENTITY.matcher(in).replaceAll(mr -> {
+            String expansion;
+            if (mr.group(1) != null) {
+                expansion = decodeNumericEntity(mr.group(1), 10, mr.group());
+            } else if (mr.group(2) != null) {
+                expansion = decodeNumericEntity(mr.group(2), 16, mr.group());
+            } else {
+                expansion = decodeNamedEntity(mr.group(3), mr.group());
+            }
+            // Quote so '$' and '\' in the expansion are taken literally.
+            return Matcher.quoteReplacement(expansion);
+        });
+    }
+
+    /**
+     * Converts the digits of a numeric reference to its character, or returns
+     * {@code fallback} when the value is not a Unicode code point.
+     */
+    private static String decodeNumericEntity(String digits, int radix, String fallback) {
+        // The pattern caps the digit count (7 decimal / 6 hex), so the value
+        // always fits in an int and parseInt cannot overflow.
+        int cp = Integer.parseInt(digits, radix);
+        if (cp >= 0 && cp <= 0x10FFFF) {
+            return new String(Character.toChars(cp));
+        }
+        return fallback;
+    }
+
+    /** Maps a supported entity name to its character, else {@code fallback}. */
+    private static String decodeNamedEntity(String name, String fallback) {
+        switch (name) {
+            case "amp":  return "&";
+            case "lt":   return "<";
+            case "gt":   return ">";
+            case "quot": return "\"";
+            case "apos": return "'";
+            case "nbsp": return " ";
+            case "copy": return "©";
+            case "reg":  return "®";
+            default:     return fallback;
+        }
+    }
+
+    /**
+     * Replaces a leading {@code ~/} with the value of the {@code user.home}
+     * system property; any other path is returned unchanged.
+     */
+    private static String expandHome(String s) {
+        if (!s.startsWith("~/")) {
+            return s;
+        }
+        String home = System.getProperty("user.home");
+        // Keep the '/' that followed the tilde.
+        return home + s.substring(1);
+    }
+}
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/PrototypeCodepointHash.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/PrototypeCodepointHash.java
index 2e044f7f43..0d5f04bdee 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/PrototypeCodepointHash.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/PrototypeCodepointHash.java
@@ -38,10 +38,18 @@ import java.util.List;
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+import java.util.stream.Stream;
 import java.util.zip.GZIPInputStream;
 
+import org.apache.tika.detect.BOMDetector;
+import org.apache.tika.detect.EncodingResult;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
 import org.apache.tika.ml.chardetect.HtmlByteStripper;
 import org.apache.tika.ml.junkdetect.JunkDetector;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.html.HtmlEncodingDetector;
+import org.apache.tika.parser.txt.UniversalEncodingDetector;
 import org.apache.tika.quality.TextQualityScore;
 
 /**
@@ -110,6 +118,13 @@ public class PrototypeCodepointHash {
         int maxRecords = MAX_RECORDS_PER_FILE;
         List<Path> fixturesDirs = new ArrayList<>();
         String wrongCharsetName = "GB18030";
+        boolean singleModel = false;
+        List<String> candidates = List.of(
+                "UTF-8", "GB18030", "windows-1252", "windows-1251", 
"windows-1257",
+                "Shift_JIS", "EUC-JP", "ISO-2022-JP", "UTF-16LE", "UTF-16BE");
+        List<String> forceCandidates = null; // when set, skip base detectors
+        String expected = "UTF-8";
+        int[] probeSizes = null; // when set, sweep these probe sizes per 
fixture
 
         for (int i = 0; i < args.length; i++) {
             switch (args[i]) {
@@ -128,6 +143,30 @@ public class PrototypeCodepointHash {
                 case "--wrong-charset":
                     wrongCharsetName = args[++i];
                     break;
+                case "--single-model":
+                    // Skip prototype training; run N-way fixture eval on 
bundled JunkDetector only.
+                    singleModel = true;
+                    break;
+                case "--candidates":
+                    candidates = Arrays.asList(args[++i].split(","));
+                    break;
+                case "--force-candidates":
+                    // Bypass base detectors; pairwise tournament directly on 
these.
+                    forceCandidates = Arrays.asList(args[++i].split(","));
+                    break;
+                case "--expected":
+                    expected = args[++i];
+                    break;
+                case "--probe-sizes":
+                    // Comma-separated probe sizes (bytes).  Each fixture
+                    // gets one row per size, so you can see how length
+                    // affects UNKNOWN vs scored.
+                    String[] sizes = args[++i].split(",");
+                    probeSizes = new int[sizes.length];
+                    for (int k = 0; k < sizes.length; k++) {
+                        probeSizes[k] = Integer.parseInt(sizes[k].trim());
+                    }
+                    break;
                 default:
                     System.err.println("Unknown arg: " + args[i]);
                     System.exit(1);
@@ -135,6 +174,18 @@ public class PrototypeCodepointHash {
         }
         Files.createDirectories(outputDir);
 
+        // --single-model bypasses the v5/v6-prototype comparison apparatus.
+        // For evaluating the currently-bundled JunkDetector against real 
fixtures.
+        if (singleModel) {
+            if (fixturesDirs.isEmpty()) {
+                System.err.println("--single-model requires --fixtures-dir");
+                System.exit(1);
+            }
+            evalFixturesSingleModel(fixturesDirs, candidates, forceCandidates, 
expected,
+                    probeSizes, outputDir);
+            return;
+        }
+
         System.err.println("=== PrototypeCodepointHash ===");
         System.err.println("  devtest-dir:  " + devtestDir);
         System.err.println("  output-dir:   " + outputDir);
@@ -249,6 +300,486 @@ public class PrototypeCodepointHash {
         System.err.println("Done.");
     }
 
+    // -----------------------------------------------------------------------
+    // Real-life fixture eval: runs the production base detectors (BOM +
+    // HtmlEncodingDetector + UniversalEncodingDetector) and asks the
+    // JunkDetector to pick among their candidates via pairwise compare.
+    // Mirrors the production charset-detection arbitration.
+    // -----------------------------------------------------------------------
+
+    /**
+     * Evaluates the bundled {@code JunkDetector} on every regular file under
+     * the fixture directories, writes one TSV row per (file, probe size) to
+     * {@code fixtures-real-life.tsv} in {@code outputDir}, and prints a
+     * summary to stderr.
+     *
+     * <p>When {@code forceCandidates} is non-empty each file is scored via
+     * {@code evalOneForced} on those charsets; otherwise via
+     * {@code evalOneRealLife} with the BOM, HTML and universal detectors.
+     *
+     * @param fixturesDirs    directories to walk; non-directories are skipped
+     *                        with a warning
+     * @param candidates      not used by this method
+     * @param forceCandidates charset names to force, or null/empty
+     * @param expected        name of the charset counted as correct
+     * @param probeSizes      probe sizes in bytes, or null for a single
+     *                        16384-byte probe
+     * @param outputDir       directory that receives the TSV file
+     * @throws IOException if a fixture cannot be read or the TSV cannot be
+     *                     written
+     */
+    private static void evalFixturesSingleModel(List<Path> fixturesDirs,
+                                                List<String> candidates, // ignored
+                                                List<String> forceCandidates,
+                                                String expected,
+                                                int[] probeSizes,
+                                                Path outputDir) throws IOException {
+        boolean forceMode = forceCandidates != null && !forceCandidates.isEmpty();
+        if (forceMode) {
+            System.err.println("\n--- Forced-candidates fixture eval ---");
+            System.err.println("  candidates: " + forceCandidates);
+        } else {
+            System.err.println("\n--- Real-life fixture eval (BOM + HTML + Universal) ---");
+        }
+        JunkDetector detector = JunkDetector.loadFromClasspath();
+        System.err.println("  model version: " + detector.getModelVersion());
+        System.err.println("  expected:      " + expected);
+
+        // Pre-resolve forced charsets; skip unsupported ones up front.
+        List<Charset> forced = new ArrayList<>();
+        if (forceMode) {
+            for (String n : forceCandidates) {
+                try {
+                    forced.add(Charset.forName(n));
+                } catch (IllegalArgumentException e) {
+                    // Covers both illegal and unsupported charset names.
+                    System.err.println("  skip unsupported charset: " + n);
+                }
+            }
+        }
+
+        BOMDetector bom = new BOMDetector();
+        HtmlEncodingDetector html = new HtmlEncodingDetector();
+        UniversalEncodingDetector universal = new UniversalEncodingDetector();
+        ParseContext pctx = new ParseContext();
+        int[] sizes = probeSizes != null ? probeSizes : new int[]{16_384};
+
+        Path out = outputDir.resolve("fixtures-real-life.tsv");
+        try (PrintWriter pw = new PrintWriter(
+                Files.newBufferedWriter(out, StandardCharsets.UTF_8))) {
+            pw.println("dir\tfile\tn_bytes\tprobe_size\texpected\tbom_cs\thtml_cs\tuniversal_cs"
+                    + "\tcandidates\twinner\tmargin\tstatus\tnotes");
+            int pass = 0, fail = 0, skip = 0, agree = 0;
+            double passMarginSum = 0.0;
+            List<String> failingLines = new ArrayList<>();
+
+            for (Path dir : fixturesDirs) {
+                if (!Files.isDirectory(dir)) {
+                    System.err.println("  WARN: not a directory: " + dir);
+                    continue;
+                }
+                try (Stream<Path> stream = Files.walk(dir)) {
+                    List<Path> files = new ArrayList<>();
+                    stream.filter(Files::isRegularFile).forEach(files::add);
+                    // Sorted so row order is stable from run to run.
+                    Collections.sort(files);
+                    for (Path f : files) {
+                        for (int sz : sizes) {
+                            FixtureResult r = forceMode
+                                    ? evalOneForced(f, expected, detector, forced, sz)
+                                    : evalOneRealLife(f, expected, detector, bom, html,
+                                            universal, pctx, sz);
+                            pw.println(r.toTsvLine());
+                            switch (r.status) {
+                                case "PASS":
+                                    pass++;
+                                    passMarginSum += r.margin;
+                                    break;
+                                case "FAIL":
+                                    fail++;
+                                    failingLines.add(r.dir + "/" + r.shortName
+                                            + "@" + sz + " -> " + r.winner
+                                            + " (expected " + r.expected + ")");
+                                    break;
+                                case "AGREE":
+                                    agree++;
+                                    break;
+                                default:
+                                    // Every other status is reported as a skip.
+                                    skip++;
+                            }
+                        }
+                    }
+                }
+            }
+            int n = pass + fail;
+            System.err.println();
+            System.err.println("=== Summary ===");
+            System.err.printf("Pass:    %d / %d (%.1f%%) — JunkDetector picked the expected charset%n",
+                    pass, n, n == 0 ? 0.0 : 100.0 * pass / n);
+            System.err.printf("Fail:    %d%n", fail);
+            System.err.printf("Agree:   %d  (all detectors agreed; no arbitration needed)%n", agree);
+            System.err.printf("Skip:    %d%n", skip);
+            if (pass > 0) {
+                System.err.printf("Mean margin on pass: %.3f%n", passMarginSum / pass);
+            }
+            if (!failingLines.isEmpty()) {
+                System.err.println("Failing:");
+                Collections.sort(failingLines);
+                for (String line : failingLines) {
+                    System.err.println("  " + line);
+                }
+            }
+            // PrintWriter never throws on write; surface a failed write
+            // instead of reporting success below.
+            if (pw.checkError()) {
+                throw new IOException("Failed writing " + out);
+            }
+        }
+        System.err.println("Wrote " + out);
+    }
+
+    /**
+     * Scores one fixture under each forced charset and reports which decoding
+     * the detector rates highest.
+     *
+     * <p>Status values set here: {@code SKIP_BIN} (binary magic detected),
+     * {@code NO_DECODE} (no candidate produced a usable score),
+     * {@code ONLY_EXPECTED_SCORED} / {@code ONLY_WRONG_SCORED} (exactly one
+     * candidate scored, margin is NaN), and {@code PASS} / {@code FAIL}
+     * (two or more scored; margin is winner minus runner-up).
+     *
+     * @param file       fixture to read
+     * @param expected   name of the charset counted as correct
+     * @param detector   detector used to score each decoding
+     * @param forced     charsets to try, in order
+     * @param probeBytes maximum number of stripped bytes to decode
+     * @return the populated result row
+     * @throws IOException if the file cannot be read
+     */
+    private static FixtureResult evalOneForced(Path file, String expected,
+                                               JunkDetector detector,
+                                               List<Charset> forced,
+                                               int probeBytes) throws IOException {
+        byte[] raw = Files.readAllBytes(file);
+        FixtureResult r = new FixtureResult();
+        r.dir = file.getParent().getFileName().toString();
+        String fname = file.getFileName().toString();
+        r.shortName = fname.length() > 24 ? fname.substring(0, 24) : fname;
+        r.bytes = raw.length;
+        r.probeSize = probeBytes;
+        r.expected = expected;
+
+        if (isBinaryMagic(raw)) {
+            r.status = "SKIP_BIN";
+            return r;
+        }
+        // Strip HTML on the WHOLE raw buffer first, then slice to probeBytes
+        // from the stripped content.  Otherwise a small probe slice can land
+        // entirely inside <!DOCTYPE>/<html>/<head> boilerplate and leave
+        // nothing to score after strip.
+        byte[] strippedFull = stripHtmlBytes(raw);
+        byte[] forDecode = strippedFull.length > probeBytes
+                ? Arrays.copyOf(strippedFull, probeBytes) : strippedFull;
+        r.candidatesStr = forced.stream().map(Charset::name)
+                .reduce((a, b) -> a + "," + b).orElse("-");
+
+        // Always log every candidate in notes — even those JunkDetector
+        // rejects as unknown — so the failure mode is visible.  An
+        // "unknown" score itself is meaningful information when the other
+        // candidate scored fine.
+        String winner = null;
+        String runner = null;
+        float winnerZ = Float.NEGATIVE_INFINITY;
+        float runnerZ = Float.NEGATIVE_INFINITY;
+        StringBuilder notes = new StringBuilder();
+        int scoredCount = 0;
+        for (Charset cs : forced) {
+            String decoded = applyEntityVariant(new String(forDecode, cs), "expanded");
+            int cps = toCodepoints(decoded).length;
+            if (cps < 3) {
+                notes.append(cs.name()).append("=TOO_SHORT(").append(cps).append(") ");
+                continue;
+            }
+            TextQualityScore s = detector.score(decoded);
+            if (s.isUnknown()) {
+                // Diagnose: is this script-not-in-model (neutral case) or
+                // all-runs-fragmented-too-short (a real mojibake signal)?
+                String why = diagnoseUnknown(decoded, detector);
+                notes.append(cs.name()).append("=UNK[").append(why).append("] ");
+                continue;
+            }
+            float z = s.getZScore();
+            if (Float.isNaN(z)) {
+                // NaN never wins a comparison; counting it as scored would
+                // leave the runner-up at -infinity and yield an infinite
+                // margin.  Log it and treat the candidate as unscored.
+                notes.append(cs.name()).append("=NaN ");
+                continue;
+            }
+            notes.append(cs.name()).append("=").append(String.format("%.2f", z)).append(" ");
+            scoredCount++;
+            if (z > winnerZ) {
+                runner = winner;
+                runnerZ = winnerZ;
+                winner = cs.name();
+                winnerZ = z;
+            } else if (z > runnerZ) {
+                runner = cs.name();
+                runnerZ = z;
+            }
+        }
+        r.notes = notes.toString().trim();
+        if (winner == null) {
+            r.status = "NO_DECODE";
+            return r;
+        }
+        r.winner = winner;
+        boolean winnerIsExpected =
+                safeCanonical(winner).equals(safeCanonical(expected));
+        if (scoredCount < 2) {
+            // Only one candidate scored; no real arbitration happened.
+            r.margin = Float.NaN;
+            r.status = winnerIsExpected ? "ONLY_EXPECTED_SCORED" : "ONLY_WRONG_SCORED";
+        } else {
+            r.margin = winnerZ - runnerZ;
+            r.status = winnerIsExpected ? "PASS" : "FAIL";
+        }
+        return r;
+    }
+
+    /**
+     * Evaluates one real-world fixture file: runs the BOM, HTML-header and
+     * universal encoding detectors on a raw-byte probe and, when they propose
+     * more than one distinct charset, lets {@code detector} arbitrate by
+     * scoring each candidate's decode of the HTML-stripped content.
+     *
+     * <p>The returned row's {@code status} is one of {@code SKIP_BIN} (binary
+     * magic bytes), {@code NO_CANDIDATES}, {@code AGREE} / {@code AGREE_WRONG}
+     * (exactly one distinct candidate, so nothing to arbitrate), or
+     * {@code PASS} / {@code FAIL} (arbitrated winner compared with
+     * {@code expected}).  {@code margin} is the smallest absolute z-score gap
+     * across the pairings that could be scored, or NaN when none could.
+     *
+     * @param file       fixture to read in full
+     * @param expected   charset name the fixture is expected to resolve to
+     * @param detector   quality scorer used to arbitrate between candidates
+     * @param bom        BOM-based detector
+     * @param html       HTML-header detector
+     * @param universal  universal statistical detector
+     * @param pctx       parse context handed to each base detector
+     * @param probeBytes maximum number of bytes given to the base detectors and,
+     *                   separately, taken from the stripped content for scoring
+     * @return the populated result row
+     * @throws IOException if the fixture cannot be read
+     */
+    private static FixtureResult evalOneRealLife(Path file, String expected,
+                                                 JunkDetector detector,
+                                                 BOMDetector bom,
+                                                 HtmlEncodingDetector html,
+                                                 UniversalEncodingDetector universal,
+                                                 ParseContext pctx,
+                                                 int probeBytes) throws IOException {
+        byte[] raw = Files.readAllBytes(file);
+        FixtureResult r = new FixtureResult();
+        // Path.getParent()/getFileName() may return null (bare file name, or a
+        // file directly under the root); fall back to the "-" placeholder
+        // instead of throwing a NullPointerException.
+        Path parent = file.getParent();
+        Path parentName = parent == null ? null : parent.getFileName();
+        r.dir = parentName == null ? "-" : parentName.toString();
+        String fname = file.getFileName().toString();
+        r.shortName = fname.length() > 24 ? fname.substring(0, 24) : fname;
+        r.bytes = raw.length;
+        r.probeSize = probeBytes;
+        r.expected = expected;
+
+        if (isBinaryMagic(raw)) {
+            r.status = "SKIP_BIN";
+            return r;
+        }
+
+        // Probe bytes for the base detectors (16 KB matches production read limit).
+        // For the base detectors we keep the raw bytes (the BOM detector and
+        // HTML-header sniff both want the original prefix).
+        byte[] probe = raw.length > probeBytes ? Arrays.copyOf(raw, probeBytes) : raw;
+
+        r.bomCs = firstCharset(bom, probe, pctx);
+        r.htmlCs = firstCharset(html, probe, pctx);
+        r.universalCs = firstCharset(universal, probe, pctx);
+
+        // Collect distinct candidates in order of priority: BOM > HTML > universal.
+        List<Charset> candList = new ArrayList<>();
+        addUnique(candList, r.bomCs);
+        addUnique(candList, r.htmlCs);
+        addUnique(candList, r.universalCs);
+        r.candidatesStr = candList.stream().map(Charset::name)
+                .reduce((a, b) -> a + "," + b).orElse("-");
+
+        if (candList.isEmpty()) {
+            r.status = "NO_CANDIDATES";
+            return r;
+        }
+        if (candList.size() == 1) {
+            // All detectors agreed (or only one fired): no arbitration to do.
+            r.winner = candList.get(0).name();
+            r.status = safeCanonical(r.winner).equals(safeCanonical(expected))
+                    ? "AGREE" : "AGREE_WRONG";
+            return r;
+        }
+
+        // Strip HTML from the FULL raw bytes, then slice to probeBytes from
+        // the stripped content — so a small probe-size doesn't land inside
+        // the DOCTYPE/head boilerplate with nothing left to score.
+        byte[] strippedFull = stripHtmlBytes(raw);
+        byte[] forDecode = strippedFull.length > probeBytes
+                ? Arrays.copyOf(strippedFull, probeBytes) : strippedFull;
+
+        // Pairwise tournament: the incumbent meets each remaining candidate in
+        // priority order and is replaced whenever the challenger scores higher.
+        // The incumbent's score is carried forward rather than re-decoded and
+        // re-scored for every challenger.
+        Charset winnerCs = candList.get(0);
+        TextQualityScore winnerScore = detector.score(
+                applyEntityVariant(new String(forDecode, winnerCs), "expanded"));
+        float bestMargin = Float.POSITIVE_INFINITY;
+        for (int i = 1; i < candList.size(); i++) {
+            Charset challenger = candList.get(i);
+            TextQualityScore challengerScore = detector.score(
+                    applyEntityVariant(new String(forDecode, challenger), "expanded"));
+            if (winnerScore.isUnknown() || challengerScore.isUnknown()) {
+                // Pairing cannot be judged; the incumbent stays.
+                continue;
+            }
+            float margin = winnerScore.getZScore() - challengerScore.getZScore();
+            if (margin < 0) {
+                winnerCs = challenger;
+                winnerScore = challengerScore;
+                margin = -margin;
+            }
+            bestMargin = Math.min(bestMargin, Math.abs(margin));
+        }
+        r.winner = winnerCs.name();
+        // Still infinite means no pairing was scorable: report "no margin".
+        r.margin = Float.isInfinite(bestMargin) ? Float.NaN : bestMargin;
+        r.status = safeCanonical(r.winner).equals(safeCanonical(expected)) ? "PASS" : "FAIL";
+        return r;
+    }
+
+    /**
+     * Runs a single encoding detector over {@code bytes} and reports its
+     * first answer.
+     *
+     * @param d     detector to invoke
+     * @param bytes bytes exposed to the detector through a {@link TikaInputStream}
+     * @param pctx  parse context passed through to the detector
+     * @return the name of the first result's charset, or {@code null} when the
+     *         detector returns no results, the first result has no charset,
+     *         or anything throws along the way
+     */
+    private static String firstCharset(org.apache.tika.detect.EncodingDetector d,
+                                       byte[] bytes, ParseContext pctx) {
+        try (TikaInputStream tis =
+                     TikaInputStream.get(new java.io.ByteArrayInputStream(bytes))) {
+            List<EncodingResult> detected = d.detect(tis, new Metadata(), pctx);
+            if (detected != null && !detected.isEmpty()) {
+                Charset top = detected.get(0).getCharset();
+                if (top != null) {
+                    return top.name();
+                }
+            }
+            return null;
+        } catch (Exception ignored) {
+            // Best effort: a failing detector simply contributes no candidate.
+            return null;
+        }
+    }
+
+    /**
+     * Resolves {@code name} to a {@link Charset} and appends it to {@code list}
+     * unless an equal charset is already present.  A {@code null} name, or a
+     * name that cannot be resolved, is silently ignored.
+     */
+    private static void addUnique(List<Charset> list, String name) {
+        if (name == null) {
+            return;
+        }
+        Charset resolved;
+        try {
+            resolved = Charset.forName(name);
+        } catch (Exception ignored) {
+            // Illegal or unsupported charset name: not a usable candidate.
+            return;
+        }
+        if (!list.contains(resolved)) {
+            list.add(resolved);
+        }
+    }
+
+    /**
+     * Explains why JunkDetector reported UNKNOWN for {@code text}.  Splits
+     * the text into script runs (COMMON / INHERITED / UNKNOWN code points
+     * extend the current run, and are dropped when no run is open yet), then
+     * classifies what it found:
+     * <ul>
+     *   <li>{@code EMPTY} — input was {@code null} or had no characters.</li>
+     *   <li>{@code MIXED(modeled_long=K)} — K modeled scripts have a run of
+     *       at least 2 UTF-8 bytes, so UNKNOWN was not expected (possible
+     *       discrepancy with the production logic).</li>
+     *   <li>{@code MIXED(short=N,unmodeled=M)} — some runs were
+     *       modeled-but-too-short and some were unmodeled.</li>
+     *   <li>{@code ALL_RUNS_TOO_SHORT(N)} — runs exist in modeled scripts
+     *       but every one is &lt;2 UTF-8 bytes.  Strong mojibake signal —
+     *       text is a salad of single codepoints from many scripts.</li>
+     *   <li>{@code NO_MODELED_SCRIPT(M)} — all runs are in scripts the model
+     *       doesn't know (legit reason to be neutral).</li>
+     *   <li>{@code OTHER} — no script run was found at all.</li>
+     * </ul>
+     */
+    private static String diagnoseUnknown(String text, JunkDetector detector) {
+        if (text == null || text.isEmpty()) {
+            return "EMPTY";
+        }
+        Set<String> modeled = detector.knownScripts();
+        // Per modeled script: longest run (in UTF-8 bytes) that reached 2 bytes.
+        java.util.Map<String, Integer> longestModeled = new java.util.HashMap<>();
+        int unmodeledRuns = 0;
+        int modeledTooShortRuns = 0;
+        String runScript = null;
+        int runBytes = 0;
+
+        int pos = 0;
+        while (pos < text.length()) {
+            int cp = text.codePointAt(pos);
+            pos += Character.charCount(cp);
+            String script = Character.UnicodeScript.of(cp).name();
+            int cpBytes = new String(new int[]{cp}, 0, 1)
+                    .getBytes(StandardCharsets.UTF_8).length;
+
+            boolean neutral = "COMMON".equals(script) || "INHERITED".equals(script)
+                    || "UNKNOWN".equals(script);
+            if (neutral) {
+                // Not precise about attachment: just extend an open run, if any.
+                if (runScript != null) {
+                    runBytes += cpBytes;
+                }
+                continue;
+            }
+            if (script.equals(runScript)) {
+                runBytes += cpBytes;
+                continue;
+            }
+
+            // Script boundary: account for the run that just ended, open a new one.
+            tallyRun(runScript, runBytes, modeled, longestModeled);
+            if (runScript != null) {
+                if (!modeled.contains(runScript)) {
+                    unmodeledRuns++;
+                } else if (runBytes < 2) {
+                    modeledTooShortRuns++;
+                }
+            }
+            runScript = script;
+            runBytes = cpBytes;
+        }
+
+        // Account for the run still open at end of text.
+        tallyRun(runScript, runBytes, modeled, longestModeled);
+        if (runScript != null) {
+            if (!modeled.contains(runScript)) {
+                unmodeledRuns++;
+            } else if (runBytes < 2) {
+                modeledTooShortRuns++;
+            }
+        }
+
+        if (!longestModeled.isEmpty()) {
+            // Some modeled run is ≥2 bytes — shouldn't have hit UNKNOWN.
+            // (Possible discrepancy with the production logic; reported as MIXED.)
+            return "MIXED(modeled_long=" + longestModeled.size() + ")";
+        }
+        boolean anyShort = modeledTooShortRuns > 0;
+        boolean anyUnmodeled = unmodeledRuns > 0;
+        if (anyShort && anyUnmodeled) {
+            return "MIXED(short=" + modeledTooShortRuns
+                    + ",unmodeled=" + unmodeledRuns + ")";
+        }
+        if (anyShort) {
+            return "ALL_RUNS_TOO_SHORT(" + modeledTooShortRuns + ")";
+        }
+        if (anyUnmodeled) {
+            return "NO_MODELED_SCRIPT(" + unmodeledRuns + ")";
+        }
+        return "OTHER";
+    }
+
+    /**
+     * Records a finished script run in {@code longestModeled}, keeping the
+     * largest byte count seen per script.  Runs with a {@code null} script,
+     * a script not in {@code modeled}, or fewer than 2 bytes are ignored.
+     */
+    private static void tallyRun(String script, int bytes, Set<String> modeled,
+                                 java.util.Map<String, Integer> longestModeled) {
+        boolean countable = script != null && modeled.contains(script) && bytes >= 2;
+        if (countable) {
+            longestModeled.merge(script, bytes, Math::max);
+        }
+    }
+
+    /**
+     * Passes the whole input through {@code HtmlByteStripper} and returns the
+     * stripped bytes.  The input array itself is returned when the stripper
+     * reports no tags or produces zero bytes of output.
+     */
+    private static byte[] stripHtmlBytes(byte[] raw) {
+        byte[] scratch = new byte[raw.length];
+        HtmlByteStripper.Result stripped =
+                HtmlByteStripper.strip(raw, 0, raw.length, scratch, 0);
+        boolean usable = stripped.tagCount > 0 && stripped.length > 0;
+        return usable ? Arrays.copyOf(scratch, stripped.length) : raw;
+    }
+
+    /**
+     * Cheap magic-byte sniff for container/binary formats that should not be
+     * scored as text: ZIP-family, gzip, PDF and OLE2.
+     *
+     * @param b leading bytes of the file (the whole file is fine)
+     * @return {@code true} if {@code b} starts with one of the known
+     *         signatures; {@code false} otherwise, including when fewer
+     *         than 4 bytes are available
+     */
+    private static boolean isBinaryMagic(byte[] b) {
+        if (b.length < 4) {
+            return false;
+        }
+        int b0 = b[0] & 0xFF;
+        int b1 = b[1] & 0xFF;
+        int b2 = b[2] & 0xFF;
+        int b3 = b[3] & 0xFF;
+        if (b0 == 0x50 && b1 == 0x4B
+                && (b2 == 0x03 || b2 == 0x05 || b2 == 0x07)) {
+            return true; // ZIP / JAR / APK / docx
+        }
+        if (b0 == 0x1F && b1 == 0x8B) {
+            return true; // gzip
+        }
+        if (b0 == '%' && b1 == 'P' && b2 == 'D' && b3 == 'F') {
+            return true; // PDF
+        }
+        // OLE2: require D0 CF 11 E0.  The first two bytes alone are not
+        // distinctive: in KOI8-R, D0 CF is the ordinary text "по", so a
+        // two-byte check would skip legitimate legacy-encoded text fixtures.
+        if (b0 == 0xD0 && b1 == 0xCF && b2 == 0x11 && b3 == 0xE0) {
+            return true; // OLE2
+        }
+        return false;
+    }
+
+    /**
+     * Normalises a charset label so that two labels can be compared.
+     *
+     * @param charset charset name or alias; may be {@code null}
+     * @return the canonical name of the resolved charset; the label
+     *         upper-cased when it cannot be resolved; or the empty string
+     *         for {@code null}
+     */
+    private static String safeCanonical(String charset) {
+        if (charset == null) {
+            return "";
+        }
+        try {
+            return Charset.forName(charset).name();
+        } catch (Exception ignored) {
+            // Unresolvable label: fall back to a case-folded form.  Locale.ROOT
+            // keeps the result independent of the default locale (under a
+            // Turkish locale, "i" would otherwise upper-case to dotted "İ").
+            return charset.toUpperCase(java.util.Locale.ROOT);
+        }
+    }
+
+    /**
+     * One row of the fixture-eval report.  Fields are filled in progressively
+     * by the evaluation methods; the defaults below are what an early return
+     * leaves in the row.
+     */
+    private static final class FixtureResult {
+        String dir;
+        String shortName;
+        int bytes;
+        int probeSize;
+        String expected;
+        String bomCs;
+        String htmlCs;
+        String universalCs;
+        String candidatesStr = "-";
+        String winner = "-";
+        float margin = Float.NaN;
+        String status = "";
+        String notes = "";
+
+        /**
+         * Renders the row as 13 tab-separated columns.  {@code null}
+         * detector/winner values, a NaN margin and empty notes are written
+         * as {@code -}.  Formatting uses {@code Locale.ROOT} so the numeric
+         * columns are the same on every machine (no decimal comma or
+         * localized digits).
+         */
+        String toTsvLine() {
+            String marginStr = Float.isNaN(margin)
+                    ? "-" : String.format(java.util.Locale.ROOT, "%.3f", margin);
+            return String.format(java.util.Locale.ROOT,
+                    "%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s",
+                    dir, shortName, bytes, probeSize, expected,
+                    str(bomCs), str(htmlCs), str(universalCs),
+                    candidatesStr, str(winner),
+                    marginStr,
+                    status, notes.isEmpty() ? "-" : notes);
+        }
+
+        /** Maps {@code null} to the {@code -} placeholder. */
+        private static String str(String s) {
+            return s == null ? "-" : s;
+        }
+    }
+
     // -----------------------------------------------------------------------
     // Fixture eval: score real-world AIT5-class HTML files under v5 and v6
     // prototype, with byte-level HTML stripping and entity-variant comparison.
diff --git a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java
index a277d2d79f..d8f267ecc2 100644
--- a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java
+++ b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java
@@ -17,6 +17,7 @@
 package org.apache.tika.ml.junkdetect;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
 import java.nio.charset.StandardCharsets;
@@ -200,6 +201,46 @@ public class JunkDetectorSmokeTest {
                 "Shift-JIS decode should beat garbled UTF-8 for short Japanese 
filename");
     }
 
+    /**
+     * Regression guard for NaN-poisoning by a lone CJK codepoint between
+     * modeled-script runs.  The outer byte-length filter
+     * ({@code runUtf8.length >= 2}) and the UTF-16 char-length filter inside
+     * {@code computeF1MeanLogP} ({@code text.length() >= 2}) used to
+     * disagree: one CJK char is 3 UTF-8 bytes but a single UTF-16 unit, so
+     * it passed the outer filter, produced NaN inside, and poisoned the
+     * weighted aggregate — which callers saw as UNKNOWN.  This was the root
+     * cause of the AIT5-class regressions (UTF-8 Malayalam decoded as
+     * GB18030 yields many single-Han-char runs).
+     */
+    @Test
+    void singleCjkCharDoesNotNaNPoisonScore() {
+        // A Latin sentence with stray CJK chars dropped in mirrors, at the
+        // run-boundary level, a GB18030-mojibake-of-UTF-8 decode: each CJK
+        // char forms its own single-codepoint HAN run.
+        String sample = "The quick brown 中 fox jumps over the lazy dog. "
+                + "Pack 中 my box with five dozen liquor jugs.";
+        TextQualityScore result = detector.score(sample);
+        String failureMessage =
+                "score should not be UNKNOWN — single-CJK run should be skipped, "
+                        + "not poison the aggregate.  Got: " + result;
+        assertFalse(result.isUnknown(), failureMessage);
+    }
+
+    /**
+     * Companion to the single-CJK regression: pins scoring behaviour when the
+     * text contains a supplementary-plane codepoint (4 UTF-8 bytes, 2 UTF-16
+     * units).  Less load-bearing than the BMP-CJK case — a supplementary
+     * char has {@code text.length() == 2}, so it passes the inner filter —
+     * but worth keeping as a guard.
+     */
+    @Test
+    void supplementaryPlaneCharSurvivesScoring() {
+        // U+1F600 (😀) takes two UTF-16 units and has script COMMON, so it
+        // joins the preceding modeled run instead of starting a run of its own.
+        String sample = "Hello world 😀 this is some plain English text.";
+        TextQualityScore result = detector.score(sample);
+        String failureMessage = "supplementary char should not break scoring: " + result;
+        assertFalse(result.isUnknown(), failureMessage);
+    }
+
     // -----------------------------------------------------------------------
 
     /**

(tika) 01/02: checkpoint v7

Reply via email to