This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch charset-cjk-cc-mirror
in repository https://gitbox.apache.org/repos/asf/tika.git

commit c6b5d7f6d5720c4f21106e3ab00ec665a1faad10
Author: tallison <[email protected]>
AuthorDate: Fri May 29 15:31:57 2026 -0400

    dynamic top-n selector
---
 .../ml/junkdetect/JunkFilterEncodingDetector.java  | 104 ++++++++++++++++-----
 1 file changed, 82 insertions(+), 22 deletions(-)

diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
index 056c768a65..b7810158ea 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
@@ -18,9 +18,13 @@ package org.apache.tika.ml.junkdetect;
 
 import java.io.IOException;
 import java.nio.charset.Charset;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
 import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -80,6 +84,16 @@ public class JunkFilterEncodingDetector implements 
MetaEncodingDetector {
      *  anchor instead of arbitrating near-identical decodes by quality. */
     private static final float NO_INFO_CONFIDENCE = 0.1f;
 
+    // Adaptive candidate band (TIKA speed lever).  The tournament only needs
+    // NB's top-2 statistical candidates plus any lower-ranked candidate still
+    // within MIN_TAIL_CONFIDENCE of the top; deeper, low-confidence candidates
+    // are clearly dominated and almost never win (measured: δ=0.5 retains
+    // ~98-99% of selected winners, ~20% smaller pool).  Anchors (DECLARATIVE,
+    // STRUCTURAL) are always kept regardless of confidence.  Quality impact is
+    // validated by a full common-token/OOV eval, NOT assumed.
+    private static final int ALWAYS_KEEP_TOP_N = 2;
+    private static final float MIN_TAIL_CONFIDENCE = 0.5f;
+
     /** Cached quality detector.  {@code null} if none is on the classpath. */
     private final TextQualityDetector qualityDetector;
 
@@ -152,24 +166,25 @@ public class JunkFilterEncodingDetector implements 
MetaEncodingDetector {
         // become codepoints whose cross-script transitions expose mojibake
         // under a wrong decoding (AIT5 case).
         Map<Charset, String> candidates = new LinkedHashMap<>();
-        for (Charset cs : uniqueCharsets) {
-            String decoded = safeDecode(bytes, cs);
-            if (decoded != null && !decoded.isEmpty()) {
-                decoded = HtmlContentCleaner.clean(decoded);
+        // Dedup: charsets that decode the raw probe to the identical string
+        // (e.g. GB18030/GBK, x-windows-949/EUC-KR on non-extension content)
+        // share one clean() call — the cleaned result is identical by
+        // construction, so this is quality-neutral, purely a work saving.
+        Map<String, String> cleanedByRaw = new HashMap<>();
+        Set<Charset> candidateCharsets = bandFilter(context, uniqueCharsets);
+        for (Charset cs : candidateCharsets) {
+            String raw = safeDecode(bytes, cs);
+            if (raw == null || raw.isEmpty()) {
+                LOG.trace("junk-filter decode {} -> null/empty", cs.name());
+                continue;
+            }
+            String decoded = cleanedByRaw.get(raw);
+            if (decoded == null) {
+                decoded = HtmlContentCleaner.clean(raw);
+                cleanedByRaw.put(raw, decoded);
             }
             if (decoded != null && !decoded.isEmpty()) {
                 candidates.put(cs, decoded);
-                if (LOG.isTraceEnabled()) {
-                    int sampleLen = Math.min(400, decoded.length());
-                    String sample = decoded.substring(0, sampleLen)
-                            .replace('\n', ' ').replace('\r', ' ');
-                    LOG.trace("junk-filter decoded {}: '{}{}' (len={})",
-                            cs.name(), sample,
-                            decoded.length() > sampleLen ? "…" : "",
-                            decoded.length());
-                }
-            } else {
-                LOG.trace("junk-filter decode {} -> null/empty", cs.name());
             }
         }
         if (candidates.size() <= 1) {
@@ -228,15 +243,20 @@ public class JunkFilterEncodingDetector implements 
MetaEncodingDetector {
         Charset champion = null;
         double championZ = Double.NEGATIVE_INFINITY;
         Map<Charset, Double> scoreByCharset = new LinkedHashMap<>();
+        // Dedup: identical decoded text scores identically; score it once.
+        Map<String, Float> zByText = new HashMap<>();
         for (Map.Entry<Charset, String> entry : candidates.entrySet()) {
-            org.apache.tika.quality.TextQualityScore sc =
-                    qualityDetector.score(entry.getValue());
-            float rawZ = sc.isUnknown() ? Float.NEGATIVE_INFINITY : 
sc.getZScore();
+            String text = entry.getValue();
+            Float cached = zByText.get(text);
+            float rawZ;
+            if (cached != null) {
+                rawZ = cached;
+            } else {
+                org.apache.tika.quality.TextQualityScore sc = 
qualityDetector.score(text);
+                rawZ = sc.isUnknown() ? Float.NEGATIVE_INFINITY : 
sc.getZScore();
+                zByText.put(text, rawZ);
+            }
             scoreByCharset.put(entry.getKey(), (double) rawZ);
-            LOG.trace("junk-filter score {} z={} script={}",
-                    entry.getKey().name(),
-                    String.format(java.util.Locale.ROOT, "%.3f", rawZ),
-                    sc.isUnknown() ? "UNKNOWN" : sc.getDominantScript());
             if (rawZ > championZ) {
                 championZ = rawZ;
                 champion = entry.getKey();
@@ -274,6 +294,46 @@ public class JunkFilterEncodingDetector implements 
MetaEncodingDetector {
         return List.of(new EncodingResult(champion, confidence));
     }
 
+    /**
+     * Restrict the candidate set the tournament will decode+clean+score: keep
+     * every DECLARATIVE/STRUCTURAL anchor (author intent / byte-grammar 
proof),
+     * plus the top {@link #ALWAYS_KEEP_TOP_N} STATISTICAL candidates by
+     * confidence, plus any deeper STATISTICAL candidate still within
+     * {@link #MIN_TAIL_CONFIDENCE}.  Drops the dominated low-confidence tail —
+     * the speed lever — without removing any anchor or NB's real contenders.
+     * Returns a subset of {@code all}, preserving its iteration order.
+     */
+    private static Set<Charset> bandFilter(EncodingDetectorContext context, 
Set<Charset> all) {
+        Set<Charset> anchors = new HashSet<>();
+        List<EncodingResult> stats = new ArrayList<>();
+        for (EncodingDetectorContext.Result r : context.getResults()) {
+            for (EncodingResult er : r.getEncodingResults()) {
+                EncodingResult.ResultType t = er.getResultType();
+                if (t == EncodingResult.ResultType.DECLARATIVE
+                        || t == EncodingResult.ResultType.STRUCTURAL) {
+                    anchors.add(er.getCharset());
+                } else if (t == EncodingResult.ResultType.STATISTICAL) {
+                    stats.add(er);
+                }
+            }
+        }
+        stats.sort((a, b) -> Float.compare(b.getConfidence(), 
a.getConfidence()));
+        Set<Charset> keepStat = new HashSet<>();
+        for (int i = 0; i < stats.size(); i++) {
+            if (i < ALWAYS_KEEP_TOP_N
+                    || stats.get(i).getConfidence() >= MIN_TAIL_CONFIDENCE) {
+                keepStat.add(stats.get(i).getCharset());
+            }
+        }
+        Set<Charset> kept = new LinkedHashSet<>();
+        for (Charset cs : all) {
+            if (anchors.contains(cs) || keepStat.contains(cs)) {
+                kept.add(cs);
+            }
+        }
+        return kept;
+    }
+
     /**
      * True if some detector produced a confident non-declarative signal: any
      * STRUCTURAL result (byte-grammar proof), or any STATISTICAL result above

Reply via email to