This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch junk-detector-v6 in repository https://gitbox.apache.org/repos/asf/tika.git
commit f5c61f31af7100dd334b8c14ed34045682afffc8 Author: tallison <[email protected]> AuthorDate: Thu May 14 16:41:15 2026 -0400 checkpoint v7 --- .../apache/tika/quality/TextQualityComparison.java | 9 +- .../apache/tika/quality/TextQualityDetector.java | 2 +- tika-ml/tika-ml-junkdetect/pom.xml | 22 --- .../apache/tika/ml/junkdetect/JunkDetector.java | 4 +- .../ml/junkdetect/JunkFilterEncodingDetector.java | 162 +------------------- .../tika/ml/junkdetect/tools/EvalJunkDetector.java | 2 +- .../junkdetect/tools/PrototypeCodepointHash.java | 166 +++------------------ .../tika/ml/junkdetect/JunkDetectorSmokeTest.java | 6 +- 8 files changed, 30 insertions(+), 343 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/quality/TextQualityComparison.java b/tika-core/src/main/java/org/apache/tika/quality/TextQualityComparison.java index 8c054b0ef7..c1f78cebb6 100644 --- a/tika-core/src/main/java/org/apache/tika/quality/TextQualityComparison.java +++ b/tika-core/src/main/java/org/apache/tika/quality/TextQualityComparison.java @@ -50,8 +50,8 @@ public final class TextQualityComparison { } /** - * Returns {@code "A"} if candidate A is cleaner, {@code "B"} otherwise. - * Check {@link #delta()} to gauge confidence. + * Returns the label of the cleaner candidate ({@link #labelA()} or + * {@link #labelB()}). Check {@link #delta()} to gauge confidence. */ public String winner() { return winner; @@ -88,8 +88,7 @@ public final class TextQualityComparison { @Override public String toString() { return String.format(java.util.Locale.ROOT, - "TextQualityComparison[winner=%s(%s) delta=%.3f A=%s B=%s]", - winner, winner.equals("A") ? labelA : labelB, - delta, scoreA, scoreB); + "TextQualityComparison[winner=%s delta=%.3f A=%s(%s) B=%s(%s)]", + winner, delta, labelA, scoreA, labelB, scoreB); } } diff --git a/tika-core/src/main/java/org/apache/tika/quality/TextQualityDetector.java b/tika-core/src/main/java/org/apache/tika/quality/TextQualityDetector.java index d832b5a169..b91315e727 100644 --- a/tika-core/src/main/java/org/apache/tika/quality/TextQualityDetector.java +++ b/tika-core/src/main/java/org/apache/tika/quality/TextQualityDetector.java @@ -37,7 +37,7 @@ package org.apache.tika.quality; * // Arbitrate between two charset decodings * TextQualityComparison cmp = detector.compare("cp1252", decodedAsCp1252, * "cp1251", decodedAsCp1251); - * String winner = cmp.winner(); // "A" or "B" + * String winner = cmp.winner(); // returns the chosen label, e.g. "cp1251" * }</pre> */ public interface TextQualityDetector { diff --git a/tika-ml/tika-ml-junkdetect/pom.xml b/tika-ml/tika-ml-junkdetect/pom.xml index 7701ec6ff0..a10d73ad64 100644 --- a/tika-ml/tika-ml-junkdetect/pom.xml +++ b/tika-ml/tika-ml-junkdetect/pom.xml @@ -59,28 +59,6 @@ <artifactId>tika-encoding-detector-mojibuster</artifactId> <version>${revision}</version> </dependency> - <!-- Used by EvalFixtures-mode tooling to invoke the three production base - detectors (BOM + HTML header + universal statistical) against fixture - bytes. Compile-scope so the diagnostic tool under tools/ links; - the production junk-filter detector discovers these via ServiceLoader. --> - <dependency> - <groupId>org.apache.tika</groupId> - <artifactId>tika-encoding-detector-html</artifactId> - <version>${revision}</version> - </dependency> - <!-- Bundles the StandardCharsets_unsupported_by_IANA.txt resource that - HtmlEncodingDetector loads from its static initializer. Without - this dep on the eval tool's classpath, HtmlEncodingDetector NPEs. --> - <dependency> - <groupId>org.apache.tika</groupId> - <artifactId>tika-parser-html-module</artifactId> - <version>${revision}</version> - </dependency> - <dependency> - <groupId>org.apache.tika</groupId> - <artifactId>tika-encoding-detector-universal</artifactId> - <version>${revision}</version> - </dependency> <!-- Test dependencies --> <dependency> diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java index a60e66e93c..5635f6f168 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java @@ -81,7 +81,7 @@ import org.apache.tika.quality.TextQualityScore; * * // Arbitrate between two charset decodings * TextQualityComparison result = detector.compare("cp1252", ascp1252, "cp1251", ascp1251); - * String winner = result.winner(); // "A" or "B" + * String winner = result.winner(); // returns "cp1252" or "cp1251" * }</pre> */ public final class JunkDetector implements TextQualityDetector { @@ -374,7 +374,7 @@ public final class JunkDetector implements TextQualityDetector { float zA = scoreA.isUnknown() ? 0f : scoreA.getZScore(); float zB = scoreB.isUnknown() ? 0f : scoreB.getZScore(); - String winner = zA >= zB ? "A" : "B"; + String winner = zA >= zB ? labelA : labelB; float delta = Math.abs(zA - zB); return new TextQualityComparison(winner, delta, scoreA, scoreB, labelA, labelB); diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java index 9f7df16aad..72e51e8094 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java @@ -41,7 +41,6 @@ import org.apache.tika.ml.chardetect.HtmlByteStripper; import org.apache.tika.parser.ParseContext; import org.apache.tika.quality.TextQualityComparison; import org.apache.tika.quality.TextQualityDetector; -import org.apache.tika.quality.TextQualityScore; /** * A {@link MetaEncodingDetector} that arbitrates charset candidates by @@ -78,34 +77,6 @@ public class JunkFilterEncodingDetector implements MetaEncodingDetector { * default read limit used by the charset base detectors. */ private static final int DEFAULT_READ_LIMIT = 16384; - // --------------------------------------------------------------------- - // TACTICAL: declarative-override gate constants. - // - // These exist to compensate for known per-script calibration unevenness - // in the quality scorer (HAN noise floor too generous; MALAYALAM/TAMIL/ - // BENGALI floors too strict). They produce wrong tournaments when an - // honest in-document declaration (`<meta charset>` / XML decl) decodes - // to sparse non-Latin content that scores junky-but-correct, while a - // statistical pick decodes to dense mojibake-Han that scores decent- - // but-wrong. See `analyses/2026-04-26-tika-eval-charset-and-other.md` - // and the indic-collapse + Korean+Hanja fixtures. - // - // REMOVE when the quality scorer is recalibrated per-script — the - // tournament should then be reliable on its own. - // --------------------------------------------------------------------- - - /** Maximum delta in z-score units we tolerate before honoring the - * in-document declaration over the tournament winner. Tuned so that - * small same-script-different-codepage deltas (windows-1252 vs - * windows-1257 ≈ 1-2 units) don't trigger override when scripts - * match, while indic-vs-mojibake-Han deltas (~3-5 units) do. */ - private static final float DECLARATIVE_OVERRIDE_MAX_DELTA = 6.0f; - - /** Maximum fraction of REPLACEMENT CHARACTER (U+FFFD) in the declared - * decoder's output. Above this, the declared charset clearly cannot - * decode the bytes and we should not honor the declaration. */ - private static final double DECLARATIVE_MAX_FFFD_RATE = 0.01; - /** Cached quality detector. {@code null} if none is on the classpath. */ private final TextQualityDetector qualityDetector; @@ -259,148 +230,17 @@ public class JunkFilterEncodingDetector implements MetaEncodingDetector { champion.getKey().name(), challenger.getKey().name(), cmp.winner(), String.format(java.util.Locale.ROOT, "%.3f", cmp.delta()), cmp.scoreA(), cmp.scoreB()); - if ("B".equals(cmp.winner())) { + if (challenger.getKey().name().equals(cmp.winner())) { champion = challenger; } } LOG.trace("junk-filter -> {} (tournament champion)", champion.getKey().name()); - // TACTICAL: declarative override. See class-level comment block. - // REMOVE when quality scorer is recalibrated per-script. - Charset declarativeOverride = applyInDocumentDeclarativeOverride( - context, candidates, champion.getKey()); - if (declarativeOverride != null) { - float conf = context.getTopConfidenceFor(declarativeOverride); - context.setArbitrationInfo("junk-filter-declarative-override"); - LOG.trace("junk-filter -> {} (declarative override of tournament winner {})", - declarativeOverride.name(), champion.getKey().name()); - return List.of(new EncodingResult(declarativeOverride, conf)); - } - float confidence = context.getTopConfidenceFor(champion.getKey()); context.setArbitrationInfo("junk-filter-selected"); return List.of(new EncodingResult(champion.getKey(), confidence)); } - /** - * Tactical fix: honor an in-document {@code <meta charset>} or XML - * declaration when the quality scorer's per-script calibration unevenness - * would otherwise mis-rank candidates of <em>different scripts</em>. - * - * <p>Returns the in-document declared charset to use, or {@code null} to - * leave the tournament winner intact.</p> - * - * <p>Gates (all must hold to override):</p> - * <ol> - * <li><strong>(a) Decode is mostly clean</strong>: declared decoder produces - * fewer than {@link #DECLARATIVE_MAX_FFFD_RATE} U+FFFD per char.</li> - * <li><strong>(b) Both decoded</strong>: declared and tournament winner are - * both in the candidate map (already guaranteed by upstream code).</li> - * <li><strong>(c) Quality gap small</strong>: tournament winner's z-score - * is not vastly higher than the declared's; specifically - * {@code winner.z - declared.z <= DECLARATIVE_OVERRIDE_MAX_DELTA}.</li> - * <li><strong>(d) Different scripts</strong>: declared and winner classify - * as different scripts. Same-script Latin-cousin lies (e.g. windows-1252 - * declared on a windows-1257 file) fall through to the tournament, - * which correctly handles them via byte-distribution scoring.</li> - * </ol> - * - * <p>"In-document" means {@code HtmlEncodingDetector} or any future XML-decl - * source — explicitly NOT {@code MetadataCharsetDetector} (outer Content-Type - * header), which is more often wrong.</p> - */ - private Charset applyInDocumentDeclarativeOverride( - EncodingDetectorContext context, - Map<Charset, String> candidates, - Charset champion) { - Charset declared = findInDocumentDeclarative(context); - if (declared == null) { - return null; - } - if (declared.equals(champion)) { - return null; // already winning - } - // Per HTML5 spec, <meta charset> cannot validly declare UTF-16 / UTF-32: - // the meta tag itself is bytes that have to be parsed before its - // declaration is known, and UTF-16/32 require a BOM. If the - // declaration claims UTF-16/32 and no BOM was found (BOMDetector runs - // first in the chain), we treat the declaration as invalid and let - // the tournament winner stand. This catches govdocs1-style "utf-16 - // declared on a Latin file" lies that would otherwise look like a - // legitimate script-mismatch override. - String declaredName = declared.name(); - if (declaredName.startsWith("UTF-16") || declaredName.startsWith("UTF-32")) { - LOG.trace("junk-filter declarative-override skipped: UTF-16/32 in <meta> (HTML5 invalid)"); - return null; - } - String championText = candidates.get(champion); - String declaredText = candidates.get(declared); - if (declaredText == null || championText == null) { - return null; // failed to decode - } - // (a) decode mostly clean - double fffdRate = replacementCharRate(declaredText); - if (fffdRate > DECLARATIVE_MAX_FFFD_RATE) { - LOG.trace("junk-filter declarative-override skipped: U+FFFD rate {} > {}", - fffdRate, DECLARATIVE_MAX_FFFD_RATE); - return null; - } - TextQualityScore declaredScore = qualityDetector.score(declaredText); - TextQualityScore championScore = qualityDetector.score(championText); - // (c) winner not vastly higher - float delta = championScore.getZScore() - declaredScore.getZScore(); - if (delta > DECLARATIVE_OVERRIDE_MAX_DELTA) { - LOG.trace("junk-filter declarative-override skipped: delta {} > {}", - delta, DECLARATIVE_OVERRIDE_MAX_DELTA); - return null; - } - // (d) different scripts - String declaredScript = declaredScore.getDominantScript(); - String championScript = championScore.getDominantScript(); - if (declaredScript == null || declaredScript.equals(championScript)) { - LOG.trace("junk-filter declarative-override skipped: same script {}", - declaredScript); - return null; - } - LOG.trace("junk-filter declarative-override fires: declared={} (script={}, z={}) vs winner={} (script={}, z={}) delta={}", - declared.name(), declaredScript, declaredScore.getZScore(), - champion.name(), championScript, championScore.getZScore(), delta); - return declared; - } - - /** - * Find the first in-document DECLARATIVE candidate (from - * {@code HtmlEncodingDetector} / XML declaration), or {@code null}. - * Outer Content-Type metadata ({@code MetadataCharsetDetector}) is - * intentionally excluded — those headers lie too often. - */ - private static Charset findInDocumentDeclarative(EncodingDetectorContext context) { - for (EncodingDetectorContext.Result r : context.getResults()) { - String name = r.getDetectorName(); - if (("HtmlEncodingDetector".equals(name) - || "StandardHtmlEncodingDetector".equals(name)) - && r.getResultType() == EncodingResult.ResultType.DECLARATIVE) { - return r.getCharset(); - } - } - return null; - } - - /** Fraction of {@code U+FFFD} (REPLACEMENT CHARACTER) in the decoded String — - * a proxy for "this charset cannot decode these bytes". */ - private static double replacementCharRate(String s) { - if (s.isEmpty()) { - return 0.0; - } - long count = 0; - for (int i = 0; i < s.length(); i++) { - if (s.charAt(i) == '�') { - count++; - } - } - return (double) count / s.length(); - } - /** * Return the first DECLARATIVE charset whose decoded output equals at * least one other candidate's, or {@code null}. diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkDetector.java index 6b6057fc34..e0b4bc0ae1 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkDetector.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkDetector.java @@ -470,7 +470,7 @@ public class EvalJunkDetector { sourceCodec, asSource, wrongCodec, asWrong); deltas.add(result.delta()); - if ("A".equals(result.winner())) nCorrect++; + if (sourceCodec.equals(result.winner())) nCorrect++; } if (deltas.isEmpty()) continue; diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/PrototypeCodepointHash.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/PrototypeCodepointHash.java index 0d5f04bdee..47ee346d37 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/PrototypeCodepointHash.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/PrototypeCodepointHash.java @@ -41,15 +41,8 @@ import java.util.regex.Pattern; import java.util.stream.Stream; import java.util.zip.GZIPInputStream; -import org.apache.tika.detect.BOMDetector; -import org.apache.tika.detect.EncodingResult; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; import org.apache.tika.ml.chardetect.HtmlByteStripper; import org.apache.tika.ml.junkdetect.JunkDetector; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.html.HtmlEncodingDetector; -import org.apache.tika.parser.txt.UniversalEncodingDetector; import org.apache.tika.quality.TextQualityScore; /** @@ -175,13 +168,20 @@ public class PrototypeCodepointHash { Files.createDirectories(outputDir); // --single-model bypasses the v5/v6-prototype comparison apparatus. - // For evaluating the currently-bundled JunkDetector against real fixtures. + // Requires --force-candidates to specify the charsets to compare; + // the base-detector-driven path was removed to keep tika-ml-junkdetect + // free of heavy encoding-detector deps. if (singleModel) { if (fixturesDirs.isEmpty()) { System.err.println("--single-model requires --fixtures-dir"); System.exit(1); } - evalFixturesSingleModel(fixturesDirs, candidates, forceCandidates, expected, + if (forceCandidates == null || forceCandidates.isEmpty()) { + System.err.println("--single-model requires --force-candidates " + + "(e.g. --force-candidates UTF-8,GB18030)"); + System.exit(1); + } + evalFixturesSingleModel(fixturesDirs, forceCandidates, expected, probeSizes, outputDir); return; } @@ -308,39 +308,25 @@ public class PrototypeCodepointHash { // ----------------------------------------------------------------------- private static void evalFixturesSingleModel(List<Path> fixturesDirs, - List<String> candidates, // ignored List<String> forceCandidates, String expected, int[] probeSizes, Path outputDir) throws IOException { - boolean forceMode = forceCandidates != null && !forceCandidates.isEmpty(); - if (forceMode) { - System.err.println("\n--- Forced-candidates fixture eval ---"); - System.err.println(" candidates: " + forceCandidates); - } else { - System.err.println("\n--- Real-life fixture eval (BOM + HTML + Universal) ---"); - } + System.err.println("\n--- Forced-candidates fixture eval ---"); + System.err.println(" candidates: " + forceCandidates); JunkDetector detector = JunkDetector.loadFromClasspath(); System.err.println(" model version: " + detector.getModelVersion()); System.err.println(" expected: " + expected); - // Pre-resolve forced charsets; skip unsupported ones up front. List<Charset> forced = new ArrayList<>(); - if (forceMode) { - for (String n : forceCandidates) { - try { - forced.add(Charset.forName(n)); - } catch (Exception e) { - System.err.println(" skip unsupported charset: " + n); - } + for (String n : forceCandidates) { + try { + forced.add(Charset.forName(n)); + } catch (Exception e) { + System.err.println(" skip unsupported charset: " + n); } } - BOMDetector bom = new BOMDetector(); - HtmlEncodingDetector html = new HtmlEncodingDetector(); - UniversalEncodingDetector universal = new UniversalEncodingDetector(); - ParseContext pctx = new ParseContext(); - Path out = outputDir.resolve("fixtures-real-life.tsv"); try (PrintWriter pw = new PrintWriter( Files.newBufferedWriter(out, StandardCharsets.UTF_8))) { @@ -362,10 +348,8 @@ public class PrototypeCodepointHash { int[] sizes = probeSizes != null ? probeSizes : new int[]{16_384}; for (Path f : files) { for (int sz : sizes) { - FixtureResult r = forceMode - ? evalOneForced(f, expected, detector, forced, sz) - : evalOneRealLife(f, expected, detector, bom, html, - universal, pctx, sz); + FixtureResult r = + evalOneForced(f, expected, detector, forced, sz); pw.println(r.toTsvLine()); switch (r.status) { case "PASS": @@ -494,120 +478,6 @@ public class PrototypeCodepointHash { return r; } - private static FixtureResult evalOneRealLife(Path file, String expected, - JunkDetector detector, - BOMDetector bom, - HtmlEncodingDetector html, - UniversalEncodingDetector universal, - ParseContext pctx, - int probeBytes) throws IOException { - byte[] raw = Files.readAllBytes(file); - int origLen = raw.length; - FixtureResult r = new FixtureResult(); - r.dir = file.getParent().getFileName().toString(); - String fname = file.getFileName().toString(); - r.shortName = fname.length() > 24 ? fname.substring(0, 24) : fname; - r.bytes = origLen; - r.probeSize = probeBytes; - r.expected = expected; - - if (isBinaryMagic(raw)) { - r.status = "SKIP_BIN"; - return r; - } - - // Probe bytes for the base detectors (16 KB matches production read limit). - // For the base detectors we keep the raw bytes (the BOM detector and - // HTML-header sniff both want the original prefix). - byte[] probe = raw.length > probeBytes ? Arrays.copyOf(raw, probeBytes) : raw; - - r.bomCs = firstCharset(bom, probe, pctx); - r.htmlCs = firstCharset(html, probe, pctx); - r.universalCs = firstCharset(universal, probe, pctx); - - // Collect distinct candidates in order of priority: BOM > HTML > universal. - List<Charset> candList = new ArrayList<>(); - addUnique(candList, r.bomCs); - addUnique(candList, r.htmlCs); - addUnique(candList, r.universalCs); - r.candidatesStr = candList.stream().map(Charset::name) - .reduce((a, b) -> a + "," + b).orElse("-"); - - if (candList.isEmpty()) { - r.status = "NO_CANDIDATES"; - return r; - } - if (candList.size() == 1) { - // All detectors agreed (or only one fired): no arbitration to do. - r.winner = candList.get(0).name(); - r.status = safeCanonical(r.winner).equals(safeCanonical(expected)) ? "AGREE" : "AGREE_WRONG"; - return r; - } - - // Strip HTML from the FULL raw bytes, then slice to probeBytes from - // the stripped content — so a small probe-size doesn't land inside - // the DOCTYPE/head boilerplate with nothing left to score. - byte[] strippedFull = stripHtmlBytes(raw); - byte[] forDecode = strippedFull.length > probeBytes - ? Arrays.copyOf(strippedFull, probeBytes) : strippedFull; - // Pairwise tournament — pick the candidate that beats all others. - Charset winnerCs = candList.get(0); - float bestMargin = Float.POSITIVE_INFINITY; - for (int i = 1; i < candList.size(); i++) { - Charset challenger = candList.get(i); - String aDecoded = applyEntityVariant(new String(forDecode, winnerCs), "expanded"); - String bDecoded = applyEntityVariant(new String(forDecode, challenger), "expanded"); - TextQualityScore aScore = detector.score(aDecoded); - TextQualityScore bScore = detector.score(bDecoded); - if (aScore.isUnknown() || bScore.isUnknown()) { - continue; - } - float margin = aScore.getZScore() - bScore.getZScore(); - if (margin < 0) { - winnerCs = challenger; - margin = -margin; - } - bestMargin = Math.min(bestMargin, Math.abs(margin)); - } - r.winner = winnerCs.name(); - r.margin = Float.isInfinite(bestMargin) ? Float.NaN : bestMargin; - r.status = safeCanonical(r.winner).equals(safeCanonical(expected)) ? "PASS" : "FAIL"; - return r; - } - - private static String firstCharset(org.apache.tika.detect.EncodingDetector d, - byte[] bytes, ParseContext pctx) { - try (TikaInputStream tis = - TikaInputStream.get(new java.io.ByteArrayInputStream(bytes))) { - List<EncodingResult> results = d.detect(tis, new Metadata(), pctx); - if (results == null || results.isEmpty()) { - return null; - } - Charset cs = results.get(0).getCharset(); - return cs == null ? null : cs.name(); - } catch (Exception e) { - return null; - } - } - - private static void addUnique(List<Charset> list, String name) { - if (name == null) { - return; - } - Charset cs; - try { - cs = Charset.forName(name); - } catch (Exception e) { - return; - } - for (Charset c : list) { - if (c.equals(cs)) { - return; - } - } - list.add(cs); - } - /** * Diagnose why JunkDetector returned UNKNOWN for {@code text}. Walks * the same script-run logic, then classifies the failure mode: diff --git a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java index d8f267ecc2..e670f9e163 100644 --- a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java +++ b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java @@ -112,7 +112,7 @@ public class JunkDetectorSmokeTest { System.out.println("Baltic comparison: " + result); - assertEquals("B", result.winner(), + assertEquals("cp1257", result.winner(), "cp1257 should be identified as the correct encoding for Lithuanian text"); // Delta is weak (pooled LATIN model dilutes Baltic-specific bigrams). // Production threshold is delta > 1.0; PoC floor is 0.1. @@ -144,7 +144,7 @@ public class JunkDetectorSmokeTest { System.out.println("Russian Cyrillic comparison: " + result); - assertEquals("B", result.winner(), + assertEquals("cp1251", result.winner(), "cp1251 should be identified as the correct encoding for Russian text"); assertTrue(result.delta() > 1.0, "Cyrillic codec separation should be strong: delta=" + result.delta()); @@ -197,7 +197,7 @@ public class JunkDetectorSmokeTest { System.out.println("Shift-JIS zip entry: " + result); - assertEquals("A", result.winner(), + assertEquals("Shift-JIS", result.winner(), "Shift-JIS decode should beat garbled UTF-8 for short Japanese filename"); }
