(tika) 02/02: checkpoint v7

tallison Thu, 14 May 2026 13:42:38 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch junk-detector-v6
in repository https://gitbox.apache.org/repos/asf/tika.git


commit f5c61f31af7100dd334b8c14ed34045682afffc8
Author: tallison <[email protected]>
AuthorDate: Thu May 14 16:41:15 2026 -0400

    checkpoint v7
---
 .../apache/tika/quality/TextQualityComparison.java |   9 +-
 .../apache/tika/quality/TextQualityDetector.java   |   2 +-
 tika-ml/tika-ml-junkdetect/pom.xml                 |  22 ---
 .../apache/tika/ml/junkdetect/JunkDetector.java    |   4 +-
 .../ml/junkdetect/JunkFilterEncodingDetector.java  | 162 +-------------------
 .../tika/ml/junkdetect/tools/EvalJunkDetector.java |   2 +-
 .../junkdetect/tools/PrototypeCodepointHash.java   | 166 +++------------------
 .../tika/ml/junkdetect/JunkDetectorSmokeTest.java  |   6 +-
 8 files changed, 30 insertions(+), 343 deletions(-)

diff --git 
a/tika-core/src/main/java/org/apache/tika/quality/TextQualityComparison.java 
b/tika-core/src/main/java/org/apache/tika/quality/TextQualityComparison.java
index 8c054b0ef7..c1f78cebb6 100644
--- a/tika-core/src/main/java/org/apache/tika/quality/TextQualityComparison.java
+++ b/tika-core/src/main/java/org/apache/tika/quality/TextQualityComparison.java
@@ -50,8 +50,8 @@ public final class TextQualityComparison {
     }
 
     /**
-     * Returns {@code "A"} if candidate A is cleaner, {@code "B"} otherwise.
-     * Check {@link #delta()} to gauge confidence.
+     * Returns the label of the cleaner candidate ({@link #labelA()} or
+     * {@link #labelB()}).  Check {@link #delta()} to gauge confidence.
      */
     public String winner() {
         return winner;
@@ -88,8 +88,7 @@ public final class TextQualityComparison {
     @Override
     public String toString() {
         return String.format(java.util.Locale.ROOT,
-                "TextQualityComparison[winner=%s(%s) delta=%.3f A=%s B=%s]",
-                winner, winner.equals("A") ? labelA : labelB,
-                delta, scoreA, scoreB);
+                "TextQualityComparison[winner=%s delta=%.3f A=%s(%s) B=%s(%s)]",
+                winner, delta, labelA, scoreA, labelB, scoreB);
     }
 }
diff --git 
a/tika-core/src/main/java/org/apache/tika/quality/TextQualityDetector.java 
b/tika-core/src/main/java/org/apache/tika/quality/TextQualityDetector.java
index d832b5a169..b91315e727 100644
--- a/tika-core/src/main/java/org/apache/tika/quality/TextQualityDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/quality/TextQualityDetector.java
@@ -37,7 +37,7 @@ package org.apache.tika.quality;
  * // Arbitrate between two charset decodings
  * TextQualityComparison cmp = detector.compare("cp1252", decodedAsCp1252,
  *                                               "cp1251", decodedAsCp1251);
- * String winner = cmp.winner();  // "A" or "B"
+ * String winner = cmp.winner();  // returns the chosen label, e.g. "cp1251"
  * }</pre>
  */
 public interface TextQualityDetector {
diff --git a/tika-ml/tika-ml-junkdetect/pom.xml 
b/tika-ml/tika-ml-junkdetect/pom.xml
index 7701ec6ff0..a10d73ad64 100644
--- a/tika-ml/tika-ml-junkdetect/pom.xml
+++ b/tika-ml/tika-ml-junkdetect/pom.xml
@@ -59,28 +59,6 @@
       <artifactId>tika-encoding-detector-mojibuster</artifactId>
       <version>${revision}</version>
     </dependency>
-    <!-- Used by EvalFixtures-mode tooling to invoke the three production base
-         detectors (BOM + HTML header + universal statistical) against fixture
-         bytes.  Compile-scope so the diagnostic tool under tools/ links;
-         the production junk-filter detector discovers these via 
ServiceLoader. -->
-    <dependency>
-      <groupId>org.apache.tika</groupId>
-      <artifactId>tika-encoding-detector-html</artifactId>
-      <version>${revision}</version>
-    </dependency>
-    <!-- Bundles the StandardCharsets_unsupported_by_IANA.txt resource that
-         HtmlEncodingDetector loads from its static initializer.  Without
-         this dep on the eval tool's classpath, HtmlEncodingDetector NPEs. -->
-    <dependency>
-      <groupId>org.apache.tika</groupId>
-      <artifactId>tika-parser-html-module</artifactId>
-      <version>${revision}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.tika</groupId>
-      <artifactId>tika-encoding-detector-universal</artifactId>
-      <version>${revision}</version>
-    </dependency>
 
     <!-- Test dependencies -->
     <dependency>
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
index a60e66e93c..5635f6f168 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
@@ -81,7 +81,7 @@ import org.apache.tika.quality.TextQualityScore;
  *
  * // Arbitrate between two charset decodings
  * TextQualityComparison result = detector.compare("cp1252", ascp1252, 
"cp1251", ascp1251);
- * String winner = result.winner();  // "A" or "B"
+ * String winner = result.winner();  // returns "cp1252" or "cp1251"
  * }</pre>
  */
 public final class JunkDetector implements TextQualityDetector {
@@ -374,7 +374,7 @@ public final class JunkDetector implements 
TextQualityDetector {
         float zA = scoreA.isUnknown() ? 0f : scoreA.getZScore();
         float zB = scoreB.isUnknown() ? 0f : scoreB.getZScore();
 
-        String winner = zA >= zB ? "A" : "B";
+        String winner = zA >= zB ? labelA : labelB;
         float delta = Math.abs(zA - zB);
 
         return new TextQualityComparison(winner, delta, scoreA, scoreB, 
labelA, labelB);
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
index 9f7df16aad..72e51e8094 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
@@ -41,7 +41,6 @@ import org.apache.tika.ml.chardetect.HtmlByteStripper;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.quality.TextQualityComparison;
 import org.apache.tika.quality.TextQualityDetector;
-import org.apache.tika.quality.TextQualityScore;
 
 /**
  * A {@link MetaEncodingDetector} that arbitrates charset candidates by
@@ -78,34 +77,6 @@ public class JunkFilterEncodingDetector implements 
MetaEncodingDetector {
      * default read limit used by the charset base detectors. */
     private static final int DEFAULT_READ_LIMIT = 16384;
 
-    // ---------------------------------------------------------------------
-    // TACTICAL: declarative-override gate constants.
-    //
-    // These exist to compensate for known per-script calibration unevenness
-    // in the quality scorer (HAN noise floor too generous; MALAYALAM/TAMIL/
-    // BENGALI floors too strict).  They produce wrong tournaments when an
-    // honest in-document declaration (`<meta charset>` / XML decl) decodes
-    // to sparse non-Latin content that scores junky-but-correct, while a
-    // statistical pick decodes to dense mojibake-Han that scores decent-
-    // but-wrong.  See `analyses/2026-04-26-tika-eval-charset-and-other.md`
-    // and the indic-collapse + Korean+Hanja fixtures.
-    //
-    // REMOVE when the quality scorer is recalibrated per-script — the
-    // tournament should then be reliable on its own.
-    // ---------------------------------------------------------------------
-
-    /** Maximum delta in z-score units we tolerate before honoring the
-     *  in-document declaration over the tournament winner.  Tuned so that
-     *  small same-script-different-codepage deltas (windows-1252 vs
-     *  windows-1257 ≈ 1-2 units) don't trigger override when scripts
-     *  match, while indic-vs-mojibake-Han deltas (~3-5 units) do. */
-    private static final float DECLARATIVE_OVERRIDE_MAX_DELTA = 6.0f;
-
-    /** Maximum fraction of REPLACEMENT CHARACTER (U+FFFD) in the declared
-     *  decoder's output.  Above this, the declared charset clearly cannot
-     *  decode the bytes and we should not honor the declaration. */
-    private static final double DECLARATIVE_MAX_FFFD_RATE = 0.01;
-
     /** Cached quality detector.  {@code null} if none is on the classpath. */
     private final TextQualityDetector qualityDetector;
 
@@ -259,148 +230,17 @@ public class JunkFilterEncodingDetector implements 
MetaEncodingDetector {
                     champion.getKey().name(), challenger.getKey().name(),
                     cmp.winner(), String.format(java.util.Locale.ROOT, "%.3f", 
cmp.delta()),
                     cmp.scoreA(), cmp.scoreB());
-            if ("B".equals(cmp.winner())) {
+            if (challenger.getKey().name().equals(cmp.winner())) {
                 champion = challenger;
             }
         }
         LOG.trace("junk-filter -> {} (tournament champion)", 
champion.getKey().name());
 
-        // TACTICAL: declarative override.  See class-level comment block.
-        // REMOVE when quality scorer is recalibrated per-script.
-        Charset declarativeOverride = applyInDocumentDeclarativeOverride(
-                context, candidates, champion.getKey());
-        if (declarativeOverride != null) {
-            float conf = context.getTopConfidenceFor(declarativeOverride);
-            context.setArbitrationInfo("junk-filter-declarative-override");
-            LOG.trace("junk-filter -> {} (declarative override of tournament 
winner {})",
-                    declarativeOverride.name(), champion.getKey().name());
-            return List.of(new EncodingResult(declarativeOverride, conf));
-        }
-
         float confidence = context.getTopConfidenceFor(champion.getKey());
         context.setArbitrationInfo("junk-filter-selected");
         return List.of(new EncodingResult(champion.getKey(), confidence));
     }
 
-    /**
-     * Tactical fix: honor an in-document {@code <meta charset>} or XML
-     * declaration when the quality scorer's per-script calibration unevenness
-     * would otherwise mis-rank candidates of <em>different scripts</em>.
-     *
-     * <p>Returns the in-document declared charset to use, or {@code null} to
-     * leave the tournament winner intact.</p>
-     *
-     * <p>Gates (all must hold to override):</p>
-     * <ol>
-     *   <li><strong>(a) Decode is mostly clean</strong>: declared decoder 
produces
-     *       fewer than {@link #DECLARATIVE_MAX_FFFD_RATE} U+FFFD per 
char.</li>
-     *   <li><strong>(b) Both decoded</strong>: declared and tournament winner 
are
-     *       both in the candidate map (already guaranteed by upstream 
code).</li>
-     *   <li><strong>(c) Quality gap small</strong>: tournament winner's 
z-score
-     *       is not vastly higher than the declared's; specifically
-     *       {@code winner.z - declared.z &lt;= 
DECLARATIVE_OVERRIDE_MAX_DELTA}.</li>
-     *   <li><strong>(d) Different scripts</strong>: declared and winner 
classify
-     *       as different scripts.  Same-script Latin-cousin lies (e.g. 
windows-1252
-     *       declared on a windows-1257 file) fall through to the tournament,
-     *       which correctly handles them via byte-distribution scoring.</li>
-     * </ol>
-     *
-     * <p>"In-document" means {@code HtmlEncodingDetector} or any future 
XML-decl
-     * source — explicitly NOT {@code MetadataCharsetDetector} (outer 
Content-Type
-     * header), which is more often wrong.</p>
-     */
-    private Charset applyInDocumentDeclarativeOverride(
-            EncodingDetectorContext context,
-            Map<Charset, String> candidates,
-            Charset champion) {
-        Charset declared = findInDocumentDeclarative(context);
-        if (declared == null) {
-            return null;
-        }
-        if (declared.equals(champion)) {
-            return null; // already winning
-        }
-        // Per HTML5 spec, <meta charset> cannot validly declare UTF-16 / 
UTF-32:
-        // the meta tag itself is bytes that have to be parsed before its
-        // declaration is known, and UTF-16/32 require a BOM.  If the
-        // declaration claims UTF-16/32 and no BOM was found (BOMDetector runs
-        // first in the chain), we treat the declaration as invalid and let
-        // the tournament winner stand.  This catches govdocs1-style "utf-16
-        // declared on a Latin file" lies that would otherwise look like a
-        // legitimate script-mismatch override.
-        String declaredName = declared.name();
-        if (declaredName.startsWith("UTF-16") || 
declaredName.startsWith("UTF-32")) {
-            LOG.trace("junk-filter declarative-override skipped: UTF-16/32 in 
<meta> (HTML5 invalid)");
-            return null;
-        }
-        String championText = candidates.get(champion);
-        String declaredText = candidates.get(declared);
-        if (declaredText == null || championText == null) {
-            return null; // failed to decode
-        }
-        // (a) decode mostly clean
-        double fffdRate = replacementCharRate(declaredText);
-        if (fffdRate > DECLARATIVE_MAX_FFFD_RATE) {
-            LOG.trace("junk-filter declarative-override skipped: U+FFFD rate 
{} > {}",
-                    fffdRate, DECLARATIVE_MAX_FFFD_RATE);
-            return null;
-        }
-        TextQualityScore declaredScore = qualityDetector.score(declaredText);
-        TextQualityScore championScore = qualityDetector.score(championText);
-        // (c) winner not vastly higher
-        float delta = championScore.getZScore() - declaredScore.getZScore();
-        if (delta > DECLARATIVE_OVERRIDE_MAX_DELTA) {
-            LOG.trace("junk-filter declarative-override skipped: delta {} > 
{}",
-                    delta, DECLARATIVE_OVERRIDE_MAX_DELTA);
-            return null;
-        }
-        // (d) different scripts
-        String declaredScript = declaredScore.getDominantScript();
-        String championScript = championScore.getDominantScript();
-        if (declaredScript == null || declaredScript.equals(championScript)) {
-            LOG.trace("junk-filter declarative-override skipped: same script 
{}",
-                    declaredScript);
-            return null;
-        }
-        LOG.trace("junk-filter declarative-override fires: declared={} 
(script={}, z={}) vs winner={} (script={}, z={}) delta={}",
-                declared.name(), declaredScript, declaredScore.getZScore(),
-                champion.name(), championScript, championScore.getZScore(), 
delta);
-        return declared;
-    }
-
-    /**
-     * Find the first in-document DECLARATIVE candidate (from
-     * {@code HtmlEncodingDetector} / XML declaration), or {@code null}.
-     * Outer Content-Type metadata ({@code MetadataCharsetDetector}) is
-     * intentionally excluded — those headers lie too often.
-     */
-    private static Charset findInDocumentDeclarative(EncodingDetectorContext 
context) {
-        for (EncodingDetectorContext.Result r : context.getResults()) {
-            String name = r.getDetectorName();
-            if (("HtmlEncodingDetector".equals(name)
-                    || "StandardHtmlEncodingDetector".equals(name))
-                    && r.getResultType() == 
EncodingResult.ResultType.DECLARATIVE) {
-                return r.getCharset();
-            }
-        }
-        return null;
-    }
-
-    /** Fraction of {@code U+FFFD} (REPLACEMENT CHARACTER) in the decoded 
String —
-     * a proxy for "this charset cannot decode these bytes". */
-    private static double replacementCharRate(String s) {
-        if (s.isEmpty()) {
-            return 0.0;
-        }
-        long count = 0;
-        for (int i = 0; i < s.length(); i++) {
-            if (s.charAt(i) == '�') {
-                count++;
-            }
-        }
-        return (double) count / s.length();
-    }
-
     /**
      * Return the first DECLARATIVE charset whose decoded output equals at
      * least one other candidate's, or {@code null}.
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkDetector.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkDetector.java
index 6b6057fc34..e0b4bc0ae1 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkDetector.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkDetector.java
@@ -470,7 +470,7 @@ public class EvalJunkDetector {
                                 sourceCodec, asSource, wrongCodec, asWrong);
 
                         deltas.add(result.delta());
-                        if ("A".equals(result.winner())) nCorrect++;
+                        if (sourceCodec.equals(result.winner())) nCorrect++;
                     }
 
                     if (deltas.isEmpty()) continue;
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/PrototypeCodepointHash.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/PrototypeCodepointHash.java
index 0d5f04bdee..47ee346d37 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/PrototypeCodepointHash.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/PrototypeCodepointHash.java
@@ -41,15 +41,8 @@ import java.util.regex.Pattern;
 import java.util.stream.Stream;
 import java.util.zip.GZIPInputStream;
 
-import org.apache.tika.detect.BOMDetector;
-import org.apache.tika.detect.EncodingResult;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
 import org.apache.tika.ml.chardetect.HtmlByteStripper;
 import org.apache.tika.ml.junkdetect.JunkDetector;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.html.HtmlEncodingDetector;
-import org.apache.tika.parser.txt.UniversalEncodingDetector;
 import org.apache.tika.quality.TextQualityScore;
 
 /**
@@ -175,13 +168,20 @@ public class PrototypeCodepointHash {
         Files.createDirectories(outputDir);
 
         // --single-model bypasses the v5/v6-prototype comparison apparatus.
-        // For evaluating the currently-bundled JunkDetector against real 
fixtures.
+        // Requires --force-candidates to specify the charsets to compare;
+        // the base-detector-driven path was removed to keep tika-ml-junkdetect
+        // free of heavy encoding-detector deps.
         if (singleModel) {
             if (fixturesDirs.isEmpty()) {
                 System.err.println("--single-model requires --fixtures-dir");
                 System.exit(1);
             }
-            evalFixturesSingleModel(fixturesDirs, candidates, forceCandidates, 
expected,
+            if (forceCandidates == null || forceCandidates.isEmpty()) {
+                System.err.println("--single-model requires --force-candidates "
+                        + "(e.g. --force-candidates UTF-8,GB18030)");
+                System.exit(1);
+            }
+            evalFixturesSingleModel(fixturesDirs, forceCandidates, expected,
                     probeSizes, outputDir);
             return;
         }
@@ -308,39 +308,25 @@ public class PrototypeCodepointHash {
     // -----------------------------------------------------------------------
 
     private static void evalFixturesSingleModel(List<Path> fixturesDirs,
-                                                List<String> candidates, // 
ignored
                                                 List<String> forceCandidates,
                                                 String expected,
                                                 int[] probeSizes,
                                                 Path outputDir) throws 
IOException {
-        boolean forceMode = forceCandidates != null && 
!forceCandidates.isEmpty();
-        if (forceMode) {
-            System.err.println("\n--- Forced-candidates fixture eval ---");
-            System.err.println("  candidates: " + forceCandidates);
-        } else {
-            System.err.println("\n--- Real-life fixture eval (BOM + HTML + 
Universal) ---");
-        }
+        System.err.println("\n--- Forced-candidates fixture eval ---");
+        System.err.println("  candidates: " + forceCandidates);
         JunkDetector detector = JunkDetector.loadFromClasspath();
         System.err.println("  model version: " + detector.getModelVersion());
         System.err.println("  expected:      " + expected);
 
-        // Pre-resolve forced charsets; skip unsupported ones up front.
         List<Charset> forced = new ArrayList<>();
-        if (forceMode) {
-            for (String n : forceCandidates) {
-                try {
-                    forced.add(Charset.forName(n));
-                } catch (Exception e) {
-                    System.err.println("  skip unsupported charset: " + n);
-                }
+        for (String n : forceCandidates) {
+            try {
+                forced.add(Charset.forName(n));
+            } catch (Exception e) {
+                System.err.println("  skip unsupported charset: " + n);
             }
         }
 
-        BOMDetector bom = new BOMDetector();
-        HtmlEncodingDetector html = new HtmlEncodingDetector();
-        UniversalEncodingDetector universal = new UniversalEncodingDetector();
-        ParseContext pctx = new ParseContext();
-
         Path out = outputDir.resolve("fixtures-real-life.tsv");
         try (PrintWriter pw = new PrintWriter(
                 Files.newBufferedWriter(out, StandardCharsets.UTF_8))) {
@@ -362,10 +348,8 @@ public class PrototypeCodepointHash {
                     int[] sizes = probeSizes != null ? probeSizes : new 
int[]{16_384};
                     for (Path f : files) {
                         for (int sz : sizes) {
-                            FixtureResult r = forceMode
-                                    ? evalOneForced(f, expected, detector, 
forced, sz)
-                                    : evalOneRealLife(f, expected, detector, 
bom, html,
-                                            universal, pctx, sz);
+                            FixtureResult r =
+                                    evalOneForced(f, expected, detector, forced, sz);
                             pw.println(r.toTsvLine());
                             switch (r.status) {
                                 case "PASS":
@@ -494,120 +478,6 @@ public class PrototypeCodepointHash {
         return r;
     }
 
-    private static FixtureResult evalOneRealLife(Path file, String expected,
-                                                 JunkDetector detector,
-                                                 BOMDetector bom,
-                                                 HtmlEncodingDetector html,
-                                                 UniversalEncodingDetector 
universal,
-                                                 ParseContext pctx,
-                                                 int probeBytes) throws 
IOException {
-        byte[] raw = Files.readAllBytes(file);
-        int origLen = raw.length;
-        FixtureResult r = new FixtureResult();
-        r.dir = file.getParent().getFileName().toString();
-        String fname = file.getFileName().toString();
-        r.shortName = fname.length() > 24 ? fname.substring(0, 24) : fname;
-        r.bytes = origLen;
-        r.probeSize = probeBytes;
-        r.expected = expected;
-
-        if (isBinaryMagic(raw)) {
-            r.status = "SKIP_BIN";
-            return r;
-        }
-
-        // Probe bytes for the base detectors (16 KB matches production read 
limit).
-        // For the base detectors we keep the raw bytes (the BOM detector and
-        // HTML-header sniff both want the original prefix).
-        byte[] probe = raw.length > probeBytes ? Arrays.copyOf(raw, 
probeBytes) : raw;
-
-        r.bomCs    = firstCharset(bom,       probe, pctx);
-        r.htmlCs   = firstCharset(html,      probe, pctx);
-        r.universalCs = firstCharset(universal, probe, pctx);
-
-        // Collect distinct candidates in order of priority: BOM > HTML > 
universal.
-        List<Charset> candList = new ArrayList<>();
-        addUnique(candList, r.bomCs);
-        addUnique(candList, r.htmlCs);
-        addUnique(candList, r.universalCs);
-        r.candidatesStr = candList.stream().map(Charset::name)
-                .reduce((a, b) -> a + "," + b).orElse("-");
-
-        if (candList.isEmpty()) {
-            r.status = "NO_CANDIDATES";
-            return r;
-        }
-        if (candList.size() == 1) {
-            // All detectors agreed (or only one fired): no arbitration to do.
-            r.winner = candList.get(0).name();
-            r.status = safeCanonical(r.winner).equals(safeCanonical(expected)) 
? "AGREE" : "AGREE_WRONG";
-            return r;
-        }
-
-        // Strip HTML from the FULL raw bytes, then slice to probeBytes from
-        // the stripped content — so a small probe-size doesn't land inside
-        // the DOCTYPE/head boilerplate with nothing left to score.
-        byte[] strippedFull = stripHtmlBytes(raw);
-        byte[] forDecode = strippedFull.length > probeBytes
-                ? Arrays.copyOf(strippedFull, probeBytes) : strippedFull;
-        // Pairwise tournament — pick the candidate that beats all others.
-        Charset winnerCs = candList.get(0);
-        float bestMargin = Float.POSITIVE_INFINITY;
-        for (int i = 1; i < candList.size(); i++) {
-            Charset challenger = candList.get(i);
-            String aDecoded = applyEntityVariant(new String(forDecode, 
winnerCs), "expanded");
-            String bDecoded = applyEntityVariant(new String(forDecode, 
challenger), "expanded");
-            TextQualityScore aScore = detector.score(aDecoded);
-            TextQualityScore bScore = detector.score(bDecoded);
-            if (aScore.isUnknown() || bScore.isUnknown()) {
-                continue;
-            }
-            float margin = aScore.getZScore() - bScore.getZScore();
-            if (margin < 0) {
-                winnerCs = challenger;
-                margin = -margin;
-            }
-            bestMargin = Math.min(bestMargin, Math.abs(margin));
-        }
-        r.winner = winnerCs.name();
-        r.margin = Float.isInfinite(bestMargin) ? Float.NaN : bestMargin;
-        r.status = safeCanonical(r.winner).equals(safeCanonical(expected)) ? 
"PASS" : "FAIL";
-        return r;
-    }
-
-    private static String firstCharset(org.apache.tika.detect.EncodingDetector 
d,
-                                       byte[] bytes, ParseContext pctx) {
-        try (TikaInputStream tis =
-                     TikaInputStream.get(new 
java.io.ByteArrayInputStream(bytes))) {
-            List<EncodingResult> results = d.detect(tis, new Metadata(), pctx);
-            if (results == null || results.isEmpty()) {
-                return null;
-            }
-            Charset cs = results.get(0).getCharset();
-            return cs == null ? null : cs.name();
-        } catch (Exception e) {
-            return null;
-        }
-    }
-
-    private static void addUnique(List<Charset> list, String name) {
-        if (name == null) {
-            return;
-        }
-        Charset cs;
-        try {
-            cs = Charset.forName(name);
-        } catch (Exception e) {
-            return;
-        }
-        for (Charset c : list) {
-            if (c.equals(cs)) {
-                return;
-            }
-        }
-        list.add(cs);
-    }
-
     /**
      * Diagnose why JunkDetector returned UNKNOWN for {@code text}.  Walks
      * the same script-run logic, then classifies the failure mode:
diff --git 
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java
 
b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java
index d8f267ecc2..e670f9e163 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java
@@ -112,7 +112,7 @@ public class JunkDetectorSmokeTest {
 
         System.out.println("Baltic comparison: " + result);
 
-        assertEquals("B", result.winner(),
+        assertEquals("cp1257", result.winner(),
                 "cp1257 should be identified as the correct encoding for 
Lithuanian text");
         // Delta is weak (pooled LATIN model dilutes Baltic-specific bigrams).
         // Production threshold is delta > 1.0; PoC floor is 0.1.
@@ -144,7 +144,7 @@ public class JunkDetectorSmokeTest {
 
         System.out.println("Russian Cyrillic comparison: " + result);
 
-        assertEquals("B", result.winner(),
+        assertEquals("cp1251", result.winner(),
                 "cp1251 should be identified as the correct encoding for 
Russian text");
         assertTrue(result.delta() > 1.0,
                 "Cyrillic codec separation should be strong: delta=" + 
result.delta());
@@ -197,7 +197,7 @@ public class JunkDetectorSmokeTest {
 
         System.out.println("Shift-JIS zip entry: " + result);
 
-        assertEquals("A", result.winner(),
+        assertEquals("Shift-JIS", result.winner(),
                 "Shift-JIS decode should beat garbled UTF-8 for short Japanese 
filename");
     }

(tika) 02/02: checkpoint v7

Reply via email to