This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new d49b6d1d39 TIKA-4671-lang-aware-charset-detection (#2621)
d49b6d1d39 is described below
commit d49b6d1d3992b12b7b216a4f56e758d8eac31344
Author: Tim Allison <[email protected]>
AuthorDate: Fri Feb 20 11:34:36 2026 -0500
TIKA-4671-lang-aware-charset-detection (#2621)
---
.../tika/detect/CompositeEncodingDetector.java | 155 ++++++++++++++--
.../tika/detect/DefaultEncodingDetector.java | 27 +--
.../tika/detect/EncodingDetectorContext.java | 105 +++++++++++
.../apache/tika/detect/MetaEncodingDetector.java | 39 ++++
.../tika/language/detect/LanguageResult.java | 28 +++
.../apache/tika/metadata/TikaCoreProperties.java | 8 +
.../tika/langdetect/charsoup/CharSoupModel.java | 18 +-
.../charsoup/CharSoupEncodingDetector.java | 186 +++++++++++++++++++
.../charsoup/CharSoupLanguageDetector.java | 155 +++++++++++++++-
.../charsoup/CharSoupEncodingDetectorTest.java | 206 +++++++++++++++++++++
.../langdetect/charsoup/TextQualityDiagTest.java | 141 ++++++++++++++
.../tika-parsers-standard-package/pom.xml | 6 +
.../tika/config/TikaEncodingDetectorTest.java | 49 ++++-
...KA-4671-exclude-charsoup-encoding-detector.json | 11 ++
.../testArabicMisleadingCharset.html | 11 ++
15 files changed, 1110 insertions(+), 35 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
index e5c6152e6a..5fbf44dace 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
@@ -19,56 +19,105 @@ package org.apache.tika.detect;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.Charset;
+import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
-import java.util.LinkedList;
import java.util.List;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
+/**
+ * A composite encoding detector that runs child detectors.
+ *
+ * <p>If a {@link MetaEncodingDetector} is among the children, this
+ * composite switches from first-match-wins to collect-all mode:
+ * all base detectors run first and their results are collected in an
+ * {@link EncodingDetectorContext}, then the meta detector runs last
+ * to arbitrate. Only one meta detector is supported.</p>
+ *
+ * <p>If no meta detector is present, the first non-null result wins
+ * (traditional behavior).</p>
+ */
public class CompositeEncodingDetector implements EncodingDetector, Serializable {
- /**
- * Serial version UID
- */
private static final long serialVersionUID = 5980683158436430252L;
+ private static final Logger LOG =
+ LoggerFactory.getLogger(CompositeEncodingDetector.class);
+
private final List<EncodingDetector> detectors;
+ private final List<EncodingDetector> baseDetectors;
+ private final MetaEncodingDetector metaDetector;
public CompositeEncodingDetector(List<EncodingDetector> detectors,
Collection<Class<? extends EncodingDetector>> excludeEncodingDetectors) {
- this.detectors = new LinkedList<>();
+ this.detectors = new ArrayList<>();
for (EncodingDetector encodingDetector : detectors) {
if (!isExcluded(excludeEncodingDetectors, encodingDetector.getClass())) {
this.detectors.add(encodingDetector);
}
}
-
+ this.baseDetectors = new ArrayList<>();
+ this.metaDetector = partition(this.detectors, baseDetectors);
}
public CompositeEncodingDetector(List<EncodingDetector> detectors) {
- this.detectors = new LinkedList<>();
- this.detectors.addAll(detectors);
+ this.detectors = new ArrayList<>(detectors);
+ this.baseDetectors = new ArrayList<>();
+ this.metaDetector = partition(this.detectors, baseDetectors);
}
/**
- * @param tis text document input stream, or <code>null</code>
- * @param metadata input metadata for the document
- * @return the detected Charset or null if no charset could be detected
- * @throws IOException
+ * Partition detectors into base detectors and at most one meta detector.
*/
+ private static MetaEncodingDetector partition(
+ List<EncodingDetector> all, List<EncodingDetector> base) {
+ MetaEncodingDetector meta = null;
+ for (EncodingDetector d : all) {
+ if (d instanceof MetaEncodingDetector) {
+ if (meta == null) {
+ meta = (MetaEncodingDetector) d;
+ } else {
+ LOG.warn("Multiple MetaEncodingDetectors found; " +
+ "ignoring {}",
+ d.getClass().getName());
+ }
+ } else {
+ base.add(d);
+ }
+ }
+ return meta;
+ }
+
@Override
- public Charset detect(TikaInputStream tis, Metadata metadata, ParseContext parseContext) throws IOException {
+ public Charset detect(TikaInputStream tis, Metadata metadata,
+ ParseContext parseContext) throws IOException {
+ if (metaDetector != null) {
+ return detectWithMeta(tis, metadata, parseContext);
+ }
+ return detectFirstMatch(tis, metadata, parseContext);
+ }
+
+ /**
+ * Traditional first-match-wins behavior.
+ */
+ private Charset detectFirstMatch(TikaInputStream tis, Metadata metadata,
+ ParseContext parseContext)
+ throws IOException {
for (EncodingDetector detector : getDetectors()) {
Charset detected = detector.detect(tis, metadata, parseContext);
if (detected != null) {
- metadata.set(TikaCoreProperties.DETECTED_ENCODING, detected.name());
- //if this has been set by a leaf detector, do not overwrite
- if (!detector.getClass().getSimpleName().equals("CompositeEncodingDetector")) {
+ metadata.set(TikaCoreProperties.DETECTED_ENCODING,
+ detected.name());
+ if (!detector.getClass().getSimpleName()
+ .equals("CompositeEncodingDetector")) {
metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
detector.getClass().getSimpleName());
}
@@ -78,6 +127,80 @@ public class CompositeEncodingDetector implements EncodingDetector, Serializable
return null;
}
+ /**
+ * Collect-all mode: run every base detector, populate context,
+ * then let the meta detector arbitrate.
+ */
+ private Charset detectWithMeta(TikaInputStream tis, Metadata metadata,
+ ParseContext parseContext)
+ throws IOException {
+ EncodingDetectorContext context = new EncodingDetectorContext();
+ parseContext.set(EncodingDetectorContext.class, context);
+ try {
+ for (EncodingDetector detector : baseDetectors) {
+ Charset detected =
+ detector.detect(tis, metadata, parseContext);
+ if (detected != null) {
+ context.addResult(detected,
+ detector.getClass().getSimpleName());
+ }
+ }
+
+ Charset result =
+ metaDetector.detect(tis, metadata, parseContext);
+
+ // If meta detector returned null (disabled or no candidates),
+ // fall back to first base detector's result
+ if (result == null && !context.getResults().isEmpty()) {
+ EncodingDetectorContext.Result first =
+ context.getResults().get(0);
+ result = first.getCharset();
+ metadata.set(TikaCoreProperties.DETECTED_ENCODING,
+ result.name());
+ metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
+ first.getDetectorName());
+ } else if (result != null) {
+ metadata.set(TikaCoreProperties.DETECTED_ENCODING,
+ result.name());
+ String detectorName =
+ metaDetector.getClass().getSimpleName();
+ for (EncodingDetectorContext.Result r :
+ context.getResults()) {
+ if (r.getCharset().equals(result)) {
+ detectorName = r.getDetectorName();
+ break;
+ }
+ }
+ metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
+ detectorName);
+ }
+
+ // Build and set the detection trace
+ metadata.set(TikaCoreProperties.ENCODING_DETECTION_TRACE,
+ buildTrace(context));
+
+ return result;
+ } finally {
+ parseContext.set(EncodingDetectorContext.class, null);
+ }
+ }
+
+ private static String buildTrace(EncodingDetectorContext context) {
+ StringBuilder sb = new StringBuilder();
+ for (EncodingDetectorContext.Result r : context.getResults()) {
+ if (sb.length() > 0) {
+ sb.append(", ");
+ }
+ sb.append(r.getDetectorName()).append("->")
+ .append(r.getCharset().name());
+ }
+ String info = context.getArbitrationInfo();
+ if (info != null) {
+ sb.append(" (").append(info).append(")");
+ }
+ return sb.toString();
+ }
+
public List<EncodingDetector> getDetectors() {
return Collections.unmodifiableList(detectors);
}
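For illustration, a minimal usage sketch (not part of the commit) showing that the mode switch is driven purely by the list passed to the constructor; the detector names are those used elsewhere in this commit, and CharSoupEncodingDetector is the MetaEncodingDetector added below:

    List<EncodingDetector> children = List.of(
            new HtmlEncodingDetector(),
            new Icu4jEncodingDetector(),
            new CharSoupEncodingDetector());   // meta detector present
    // -> collect-all-then-arbitrate mode
    CompositeEncodingDetector composite = new CompositeEncodingDetector(children);

    // With only the first two children, the same constructor keeps the
    // traditional first-match-wins behavior.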
diff --git a/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java
index 67cf26e27d..72dd3ba4c0 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java
@@ -22,15 +22,20 @@ import javax.imageio.spi.ServiceRegistry;
import org.apache.tika.config.ServiceLoader;
/**
- * A composite encoding detector based on all the {@link EncodingDetector} implementations
- * available through the {@link ServiceRegistry service provider mechanism}. Those
- * loaded via the service provider mechanism are ordered by how they appear in the
- * file, if there is a single service file. If multiple, there is no guarantee of order.
- * <p>
- * <p>
- * If you need to control the order of the Detectors, you should instead
- * construct your own {@link CompositeDetector} and pass in the list
- * of Detectors in the required order.
+ * A composite encoding detector based on all the {@link EncodingDetector}
+ * implementations available through the
+ * {@link ServiceRegistry service provider mechanism}.
+ *
+ * <p>Those loaded via the service provider mechanism are ordered by how
+ * they appear in the file, if there is a single service file. If
+ * multiple, there is no guarantee of order.</p>
+ *
+ * <p>If you need to control the order of the Detectors, you should
+ * instead construct your own {@link CompositeEncodingDetector} and pass
+ * in the list of Detectors in the required order.</p>
+ *
+ * <p>{@link MetaEncodingDetector} handling (collect-all-then-arbitrate)
+ * is provided by {@link CompositeEncodingDetector}.</p>
*
* @since Apache Tika 1.15
*/
@@ -47,7 +52,7 @@ public class DefaultEncodingDetector extends CompositeEncodingDetector {
public DefaultEncodingDetector(ServiceLoader loader,
Collection<Class<? extends EncodingDetector>> excludeEncodingDetectors) {
- super(loader.loadServiceProviders(EncodingDetector.class), excludeEncodingDetectors);
+ super(loader.loadServiceProviders(EncodingDetector.class),
+ excludeEncodingDetectors);
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java
new file mode 100644
index 0000000000..6ac55f87da
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Context object that collects encoding detection results from base
+ * detectors. Stored in {@link org.apache.tika.parser.ParseContext} by
+ * {@link CompositeEncodingDetector} so that the {@link MetaEncodingDetector}
+ * can see all candidates and arbitrate. Removed after detection to
+ * prevent contamination during recursive parsing.
+ *
+ * @since Apache Tika 3.2
+ */
+public class EncodingDetectorContext {
+
+ private final List<Result> results = new ArrayList<>();
+ private String arbitrationInfo;
+
+ /**
+ * Record a detection result from a child detector.
+ *
+ * @param charset the detected charset (must not be null)
+ * @param detectorName the simple class name of the detector
+ */
+ public void addResult(Charset charset, String detectorName) {
+ results.add(new Result(charset, detectorName));
+ }
+
+ /**
+ * @return unmodifiable list of all results in detection order
+ */
+ public List<Result> getResults() {
+ return Collections.unmodifiableList(results);
+ }
+
+ /**
+ * @return unique charsets in detection order
+ */
+ public Set<Charset> getUniqueCharsets() {
+ Set<Charset> charsets = new LinkedHashSet<>();
+ for (Result r : results) {
+ charsets.add(r.getCharset());
+ }
+ return charsets;
+ }
+
+ /**
+ * Set by the meta detector to describe how it reached its decision.
+ * Values: "unanimous", "compatible", "scored", "too-few-bigrams",
"disabled".
+ */
+ public void setArbitrationInfo(String info) {
+ this.arbitrationInfo = info;
+ }
+
+ public String getArbitrationInfo() {
+ return arbitrationInfo;
+ }
+
+ /**
+ * A single detection result pairing a charset with the detector that found it.
+ */
+ public static class Result {
+ private final Charset charset;
+ private final String detectorName;
+
+ public Result(Charset charset, String detectorName) {
+ this.charset = charset;
+ this.detectorName = detectorName;
+ }
+
+ public Charset getCharset() {
+ return charset;
+ }
+
+ public String getDetectorName() {
+ return detectorName;
+ }
+
+ @Override
+ public String toString() {
+ return detectorName + "=" + charset.name();
+ }
+ }
+}
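A minimal sketch of how a detector consumes this context inside the collect-all flow (the fallback choice shown is illustrative, not prescribed by this class):

    EncodingDetectorContext ctx = parseContext.get(EncodingDetectorContext.class);
    if (ctx != null && !ctx.getResults().isEmpty()) {
        for (EncodingDetectorContext.Result r : ctx.getResults()) {
            System.out.println(r);   // e.g. "Icu4jEncodingDetector=windows-1256"
        }
        ctx.setArbitrationInfo("unanimous");
        Charset fallback = ctx.getResults().get(0).getCharset();
    }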
diff --git a/tika-core/src/main/java/org/apache/tika/detect/MetaEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/MetaEncodingDetector.java
new file mode 100644
index 0000000000..e8a46f647b
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/detect/MetaEncodingDetector.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+/**
+ * Marker interface for encoding detectors that arbitrate among
+ * candidates collected by base detectors rather than detecting
+ * encoding directly from the stream.
+ *
+ * <p>When a {@code MetaEncodingDetector} is present in a
+ * {@link CompositeEncodingDetector}, the composite switches from
+ * first-match-wins to collect-all mode: all base detectors run
+ * first and their results are collected in an
+ * {@link EncodingDetectorContext}, then the meta detector's
+ * {@link #detect} method is called to pick the winner.</p>
+ *
+ * <p>The {@link EncodingDetectorContext} is placed in the
+ * {@link org.apache.tika.parser.ParseContext} before the meta
+ * detector is invoked, so implementations can retrieve it via
+ * {@code parseContext.get(EncodingDetectorContext.class)}.</p>
+ *
+ * @since Apache Tika 3.2
+ */
+public interface MetaEncodingDetector extends EncodingDetector {
+}
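To make the contract concrete, a hypothetical implementation (not shipped in this commit) that arbitrates by simple majority vote among the collected candidates:

    import java.nio.charset.Charset;
    import java.util.Collections;
    import java.util.LinkedHashMap;
    import java.util.Map;

    import org.apache.tika.io.TikaInputStream;
    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.ParseContext;

    public class MajorityVoteEncodingDetector implements MetaEncodingDetector {
        @Override
        public Charset detect(TikaInputStream tis, Metadata metadata,
                              ParseContext parseContext) {
            EncodingDetectorContext ctx =
                    parseContext.get(EncodingDetectorContext.class);
            if (ctx == null || ctx.getResults().isEmpty()) {
                return null;   // composite falls back to the first base result
            }
            // Count one vote per base-detector result
            Map<Charset, Integer> votes = new LinkedHashMap<>();
            for (EncodingDetectorContext.Result r : ctx.getResults()) {
                votes.merge(r.getCharset(), 1, Integer::sum);
            }
            ctx.setArbitrationInfo("majority-vote");
            return Collections.max(votes.entrySet(),
                    Map.Entry.comparingByValue()).getKey();
        }
    }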
diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java
index dada5fda17..348232587e 100644
--- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java
+++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java
@@ -31,14 +31,32 @@ public class LanguageResult {
// greater confidence.
private final float rawScore;
+ // Detector-agnostic confidence score (0.0 to 1.0, higher = more confident).
+ // Detectors can populate this however makes sense for their internals
+ // (e.g., entropy-derived for CharSoup, probability-based for OpenNLP).
+ // Defaults to rawScore for backwards compatibility.
+ private final float confidenceScore;
+
/**
* @param language ISO 639-1 language code (plus optional country code)
* @param rawScore confidence of detector in the result.
*/
public LanguageResult(String language, LanguageConfidence confidence, float rawScore) {
+ this(language, confidence, rawScore, rawScore);
+ }
+
+ /**
+ * @param language ISO 639-1 language code (plus optional country code)
+ * @param rawScore detector-specific score (e.g., softmax probability)
+ * @param confidenceScore detector-agnostic confidence (0.0 to 1.0, higher = more confident).
+ * For comparing results across different decodings or detectors.
+ */
+ public LanguageResult(String language, LanguageConfidence confidence,
+ float rawScore, float confidenceScore) {
this.language = language;
this.confidence = confidence;
this.rawScore = rawScore;
+ this.confidenceScore = confidenceScore;
}
/**
@@ -54,6 +72,16 @@ public class LanguageResult {
return rawScore;
}
+ /**
+ * Detector-agnostic confidence score (0.0 to 1.0). Higher values indicate
+ * the detector is more confident in the result. This can be used to compare
+ * results across different text decodings (e.g., for encoding detection)
+ * without knowing the detector implementation.
+ */
+ public float getConfidenceScore() {
+ return confidenceScore;
+ }
+
public LanguageConfidence getConfidence() {
return confidence;
}
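A short sketch of the intended use, assuming LanguageDetector's detect(CharSequence) convenience method and two candidate decodings of the same bytes (variable names are placeholders):

    LanguageDetector detector = new CharSoupLanguageDetector();
    LanguageResult a = detector.detect(utf8Decoded);     // garbled text
    LanguageResult b = detector.detect(cp1256Decoded);   // readable Arabic
    // rawScore is detector-specific; confidenceScore is comparable across runs
    if (b.getConfidenceScore() > a.getConfidenceScore()) {
        // prefer the windows-1256 decoding
    }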
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index fdd52259e3..0d57f8cd03 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -414,6 +414,14 @@ public interface TikaCoreProperties {
*/
Property ENCODING_DETECTOR = Property.externalText(TIKA_META_PREFIX +
"encodingDetector");
+ /**
+ * Diagnostic trace showing which encoding detectors ran and what each returned,
+ * plus the arbitration method used when detectors disagreed.
+ * Example: {@code "HtmlEncodingDetector->UTF-8, Icu4jEncodingDetector->windows-1256 (scored)"}
+ */
+ Property ENCODING_DETECTION_TRACE =
+ Property.externalText(TIKA_META_PREFIX + "encodingDetectionTrace");
+
/**
* General metadata key for the count of non-final versions available within a file. This
* was added initially to support generalizing incremental updates in PDF.
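After parsing, the trace reads back like any other Tika property; a minimal sketch:

    String trace = metadata.get(TikaCoreProperties.ENCODING_DETECTION_TRACE);
    // e.g. "HtmlEncodingDetector->UTF-8, Icu4jEncodingDetector->windows-1256 (scored)"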
diff --git a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupModel.java b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupModel.java
index 2cb1adc64f..809d8b00c3 100644
--- a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupModel.java
+++ b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupModel.java
@@ -197,6 +197,22 @@ public class CharSoupModel {
* (softmax probabilities, sum ≈ 1.0)
*/
public float[] predict(int[] features) {
+ float[] logits = predictLogits(features);
+ return softmax(logits);
+ }
+
+ /**
+ * Compute raw logits (pre-softmax scores) for the given
+ * feature vector. Higher logits indicate stronger match.
+ * Unlike {@link #predict}, this preserves the full dynamic
+ * range of the model's output, which is useful when
+ * comparing confidence across different input texts.
+ *
+ * @param features int array of size {@code numBuckets}
+ * @return float array of size {@code numClasses}
+ * (raw logits, not normalized)
+ */
+ public float[] predictLogits(int[] features) {
int nnz = 0;
for (int b = 0; b < numBuckets; b++) {
if (features[b] != 0) {
@@ -225,7 +241,7 @@ public class CharSoupModel {
for (int c = 0; c < numClasses; c++) {
logits[c] = biases[c] + scales[c] * dots[c];
}
- return softmax(logits);
+ return logits;
}
/**
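Why expose logits: softmax is shift-invariant, so adding a constant to every logit leaves the probabilities unchanged, and probabilities alone cannot say how strong the best match is in absolute terms. A sketch of recovering an absolute confidence (model and features are placeholders; the sigmoid of the max logit mirrors what CharSoupLanguageDetector does later in this commit):

    float[] logits = model.predictLogits(features);
    float[] probs = model.predict(features);   // softmax(logits), sum ~= 1.0
    float maxLogit = Float.NEGATIVE_INFINITY;
    for (float v : logits) {
        maxLogit = Math.max(maxLogit, v);
    }
    // inverse logit preserves the absolute strength of the match
    float confidence = 1.0f / (1.0f + (float) Math.exp(-maxLogit));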
diff --git a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
new file mode 100644
index 0000000000..75176f69fc
--- /dev/null
+++ b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.langdetect.charsoup;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.tika.config.TikaComponent;
+import org.apache.tika.detect.EncodingDetectorContext;
+import org.apache.tika.detect.MetaEncodingDetector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * A {@link MetaEncodingDetector} that uses the CharSoup language detector
+ * to arbitrate when base encoding detectors disagree.
+ *
+ * <p>When base detectors all agree, the unanimous charset is returned
+ * without any language detection. When they disagree, raw bytes are
+ * read from the stream, decoded with each candidate charset, and each
+ * decoded text is scored by {@link CharSoupLanguageDetector}. The
+ * charset that produces the highest-confidence language detection wins.</p>
+ *
+ * <p>To enable, add this detector to your encoding detector chain in
+ * tika-config:</p>
+ * <pre>{@code
+ * "encoding-detectors": [
+ * { "default-encoding-detector": {} },
+ * { "charsoup-encoding-detector": {} }
+ * ]
+ * }</pre>
+ *
+ * @since Apache Tika 3.2
+ */
+@TikaComponent(name = "charsoup-encoding-detector")
+public class CharSoupEncodingDetector implements MetaEncodingDetector {
+
+ private static final long serialVersionUID = 1L;
+
+ private static final int DEFAULT_READ_LIMIT = 16384;
+
+ private int readLimit = DEFAULT_READ_LIMIT;
+
+ @Override
+ public Charset detect(TikaInputStream tis, Metadata metadata,
+ ParseContext parseContext) throws IOException {
+ EncodingDetectorContext context =
+ parseContext.get(EncodingDetectorContext.class);
+ if (context == null || context.getResults().isEmpty()) {
+ return null;
+ }
+
+ Set<Charset> uniqueCharsets = context.getUniqueCharsets();
+
+ if (uniqueCharsets.size() <= 1) {
+ // Unanimous or single detector — no arbitration needed
+ EncodingDetectorContext.Result first = context.getResults().get(0);
+ context.setArbitrationInfo("unanimous");
+ return first.getCharset();
+ }
+
+ // Disagreement — arbitrate via language detection scoring
+ return arbitrate(tis, context, uniqueCharsets);
+ }
+
+ private Charset arbitrate(TikaInputStream tis,
+ EncodingDetectorContext context,
+ Set<Charset> uniqueCharsets) throws IOException {
+ EncodingDetectorContext.Result firstResult = context.getResults().get(0);
+
+ if (tis == null) {
+ context.setArbitrationInfo("no-stream");
+ return firstResult.getCharset();
+ }
+
+ byte[] bytes = readBytes(tis);
+ if (bytes == null || bytes.length == 0) {
+ context.setArbitrationInfo("empty-stream");
+ return firstResult.getCharset();
+ }
+
+ Map<Charset, String> candidates = new LinkedHashMap<>();
+ for (Charset candidate : uniqueCharsets) {
+ candidates.put(candidate, stripTags(decode(bytes, candidate)));
+ }
+
+ CharSoupLanguageDetector langDetector = new CharSoupLanguageDetector();
+ Charset bestCharset = langDetector.compareLanguageSignal(candidates);
+ if (bestCharset == null) {
+ bestCharset = firstResult.getCharset();
+ }
+
+ context.setArbitrationInfo("scored");
+ return bestCharset;
+ }
+
+ private byte[] readBytes(TikaInputStream tis) throws IOException {
+ try {
+ tis.mark(readLimit);
+ byte[] buf = new byte[readLimit];
+ int totalRead = 0;
+ int bytesRead;
+ while (totalRead < readLimit &&
+ (bytesRead = tis.read(buf, totalRead,
+ readLimit - totalRead)) != -1) {
+ totalRead += bytesRead;
+ }
+ if (totalRead == 0) {
+ return null;
+ }
+ if (totalRead < readLimit) {
+ byte[] trimmed = new byte[totalRead];
+ System.arraycopy(buf, 0, trimmed, 0, totalRead);
+ return trimmed;
+ }
+ return buf;
+ } finally {
+ tis.reset();
+ }
+ }
+
+ /**
+ * Decode bytes using the given charset, replacing malformed/unmappable
+ * characters rather than throwing.
+ */
+ static String decode(byte[] bytes, Charset charset) {
+ CharsetDecoder decoder = charset.newDecoder()
+ .onMalformedInput(CodingErrorAction.REPLACE)
+ .onUnmappableCharacter(CodingErrorAction.REPLACE);
+ CharBuffer cb = CharBuffer.allocate(bytes.length * 2);
+ decoder.decode(ByteBuffer.wrap(bytes), cb, true);
+ decoder.flush(cb);
+ cb.flip();
+ return cb.toString();
+ }
+
+ /**
+ * Simple tag stripping: removes {@code <...>} sequences so that
+ * HTML/XML tag names and attributes don't pollute language scoring.
+ */
+ static String stripTags(String text) {
+ StringBuilder sb = new StringBuilder(text.length());
+ boolean inTag = false;
+ for (int i = 0; i < text.length(); i++) {
+ char c = text.charAt(i);
+ if (c == '<') {
+ inTag = true;
+ } else if (c == '>') {
+ inTag = false;
+ } else if (!inTag) {
+ sb.append(c);
+ }
+ }
+ return sb.toString();
+ }
+
+ public int getReadLimit() {
+ return readLimit;
+ }
+
+ public void setReadLimit(int readLimit) {
+ this.readLimit = readLimit;
+ }
+}
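A standalone usage sketch mirroring the tests below (inputs illustrative; in normal operation CompositeEncodingDetector does this wiring):

    EncodingDetectorContext ctx = new EncodingDetectorContext();
    ctx.addResult(StandardCharsets.UTF_8, "HtmlEncodingDetector");
    ctx.addResult(Charset.forName("windows-1256"), "Icu4jEncodingDetector");

    ParseContext parseContext = new ParseContext();
    parseContext.set(EncodingDetectorContext.class, ctx);

    CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
    try (TikaInputStream tis = TikaInputStream.get(
            new ByteArrayInputStream(windows1256Bytes))) {
        Charset winner = detector.detect(tis, new Metadata(), parseContext);
        // winner == windows-1256; ctx.getArbitrationInfo() == "scored"
    }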
diff --git a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java
index 1fa6e2cce3..abd0f8e0d6 100644
--- a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java
+++ b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java
@@ -26,6 +26,9 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import org.apache.tika.config.TikaComponent;
import org.apache.tika.language.detect.LanguageConfidence;
import org.apache.tika.language.detect.LanguageDetector;
@@ -50,9 +53,12 @@ import org.apache.tika.language.detect.LanguageResult;
* keeping the implementation simple and predictable.
* </p>
*/
-@TikaComponent
+@TikaComponent(name = "charsoup-language-detector")
public class CharSoupLanguageDetector extends LanguageDetector {
+ private static final Logger LOG =
+ LoggerFactory.getLogger(CharSoupLanguageDetector.class);
+
private static final String MODEL_RESOURCE =
"/org/apache/tika/langdetect/charsoup/langdetect.bin";
@@ -267,6 +273,126 @@ public class CharSoupLanguageDetector extends LanguageDetector {
return lastEntropy;
}
+ /**
+ * Minimum confidence (inverse logit of the max logit) for a candidate to
+ * be considered a genuine language match. If no candidate exceeds this
+ * threshold, the comparison is inconclusive and {@code null} is returned.
+ * <p>
+ * 0.88 corresponds to a raw logit of ~2.0. Typical values:
+ * <ul>
+ * <li>Arabic (windows-1256): 0.9999994 (logit +14.3)</li>
+ * <li>UTF-8 garbled: 0.97 (logit +3.5)</li>
+ * <li>EBCDIC garbage: 0.79 (logit +1.3) — below threshold</li>
+ * <li>Short English: 0.025 (logit -3.7) — well below threshold</li>
+ * </ul>
+ */
+ private static final float MIN_CONFIDENCE_THRESHOLD = 0.88f;
+
+ /**
+ * Maximum ratio of junk characters (U+FFFD replacement chars + C0/C1
+ * control chars) allowed in a candidate text. Candidates exceeding
+ * this ratio are discarded before language scoring — they are almost
+ * certainly decoded with the wrong charset.
+ * <p>
+ * Typical values:
+ * <ul>
+ * <li>Correct decoding: 0.00</li>
+ * <li>UTF-8 decoding of windows-1256 bytes: 0.80</li>
+ * <li>IBM500 decoding of ASCII bytes: 0.23</li>
+ * </ul>
+ */
+ private static final float MAX_JUNK_RATIO = 0.10f;
+
+ /**
+ * Compare multiple candidate texts and return the key of the one with
+ * the strongest language signal. Candidates with a high ratio of
+ * replacement or control characters are discarded first. Remaining
+ * candidates are scored using the inverse logit (sigmoid) of the
+ * model's maximum pre-softmax logit.
+ * <p>
+ * Returns {@code null} if no candidate exceeds the minimum confidence
+ * threshold, indicating the comparison is inconclusive.
+ *
+ * @param candidates map of arbitrary keys to candidate text strings
+ * @param <K> key type (e.g., {@link java.nio.charset.Charset})
+ * @return the key whose text has the strongest language signal,
+ * or {@code null} if the map is empty or no candidate is
+ * confident enough
+ */
+ public <K> K compareLanguageSignal(Map<K, String> candidates) {
+ if (candidates.isEmpty()) {
+ return null;
+ }
+
+ float bestConfidence = Float.NEGATIVE_INFINITY;
+ K bestKey = null;
+
+ for (Map.Entry<K, String> entry : candidates.entrySet()) {
+ float junkRatio = junkRatio(entry.getValue());
+ if (junkRatio > MAX_JUNK_RATIO) {
+ LOG.debug("compareLanguageSignal: {} -> skipped
(junkRatio={})",
+ entry.getKey(), junkRatio);
+ continue;
+ }
+
+ int[] features = EXTRACTOR.extract(entry.getValue());
+ float[] logits = MODEL.predictLogits(features);
+ float confidence = sigmoid(max(logits));
+
+ LOG.debug("compareLanguageSignal: {} -> confidence={}",
+ entry.getKey(), confidence);
+
+ if (confidence > bestConfidence) {
+ bestConfidence = confidence;
+ bestKey = entry.getKey();
+ }
+ }
+
+ if (bestConfidence < MIN_CONFIDENCE_THRESHOLD) {
+ LOG.debug("compareLanguageSignal: inconclusive (bestConfidence={}
< {})",
+ bestConfidence, MIN_CONFIDENCE_THRESHOLD);
+ return null;
+ }
+
+ return bestKey;
+ }
+
+ /**
+ * Ratio of junk characters (U+FFFD replacement + ISO control + C1
+ * control range U+0080-U+009F) to total characters. High values
+ * indicate a wrong-charset decoding.
+ */
+ static float junkRatio(String text) {
+ if (text == null || text.isEmpty()) {
+ return 0f;
+ }
+ int junk = 0;
+ int total = 0;
+ for (int i = 0; i < text.length(); ) {
+ int cp = text.codePointAt(i);
+ i += Character.charCount(cp);
+ total++;
+ if (cp == 0xFFFD || Character.isISOControl(cp)) {
+ junk++;
+ }
+ }
+ return total == 0 ? 0f : (float) junk / total;
+ }
+
+ private static float sigmoid(float x) {
+ return 1.0f / (1.0f + (float) Math.exp(-x));
+ }
+
+ private static float max(float[] arr) {
+ float m = Float.NEGATIVE_INFINITY;
+ for (float v : arr) {
+ if (v > m) {
+ m = v;
+ }
+ }
+ return m;
+ }
+
@Override
public LanguageDetector loadModels() throws IOException {
// Models are loaded statically; nothing to do.
@@ -367,6 +493,22 @@ public class CharSoupLanguageDetector extends LanguageDetector {
return buildResults(bestProbs);
}
+ /**
+ * Maximum meaningful entropy (bits) for normalizing confidenceScore.
+ * log2(numClasses) for ~165 classes is ~7.4. We cap at 7.0 so that
+ * even moderately uncertain text gets a near-zero confidenceScore.
+ */
+ private static final float MAX_ENTROPY = 7.0f;
+
+ /**
+ * Convert entropy to a 0-1 confidence score. Lower entropy = higher confidence.
+ * Uses 1/(1+entropy) to preserve discrimination even at very low entropies,
+ * unlike a linear mapping which saturates at 1.0 too quickly.
+ */
+ private static float entropyToConfidenceScore(float entropy) {
+ return 1.0f / (1.0f + entropy);
+ }
+
/**
* Build sorted LanguageResult list from raw probabilities.
*/
@@ -374,18 +516,23 @@ public class CharSoupLanguageDetector extends LanguageDetector {
// Compute entropy on collapsed distribution
float[] collapsed = collapseGroups(probs, GROUP_INDICES);
lastEntropy = CharSoupModel.entropy(collapsed);
+ float confScore = entropyToConfidenceScore(lastEntropy);
// Build results from raw probabilities sorted by probability descending
List<LanguageResult> results = new ArrayList<>(MODEL.getNumClasses());
for (int c = 0; c < MODEL.getNumClasses(); c++) {
results.add(new LanguageResult(
- MODEL.getLabel(c), toConfidence(probs[c], lastEntropy), probs[c]));
+ MODEL.getLabel(c), toConfidence(probs[c], lastEntropy),
+ probs[c], confScore));
}
results.sort((a, b) -> Float.compare(b.getRawScore(), a.getRawScore()));
- // If top score is below NONE threshold, return NULL
+ // If top score is below NONE threshold, return a NULL-like result
+ // but preserve the confidenceScore so encoding arbitration can
+ // still compare across candidate decodings.
if (results.get(0).getConfidence() == LanguageConfidence.NONE) {
- return Collections.singletonList(LanguageResult.NULL);
+ return Collections.singletonList(
+ new LanguageResult("", LanguageConfidence.NONE, 0.0f,
confScore));
}
return results;
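Putting the pieces together, a sketch of how encoding arbitration calls into this detector (bytes and charset list are illustrative):

    Map<Charset, String> candidates = new LinkedHashMap<>();
    for (Charset cs : List.of(StandardCharsets.UTF_8,
            Charset.forName("windows-1256"))) {
        candidates.put(cs, CharSoupEncodingDetector.decode(rawBytes, cs));
    }
    // null means inconclusive: every candidate was junk-heavy or fell below
    // MIN_CONFIDENCE_THRESHOLD
    Charset best = new CharSoupLanguageDetector().compareLanguageSignal(candidates);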
diff --git a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java
new file mode 100644
index 0000000000..5ff028d442
--- /dev/null
+++ b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.langdetect.charsoup;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.Charset;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.detect.EncodingDetectorContext;
+import org.apache.tika.detect.MetaEncodingDetector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+
+public class CharSoupEncodingDetectorTest {
+
+ @Test
+ public void testIsMetaEncodingDetector() {
+ assertTrue(new CharSoupEncodingDetector() instanceof MetaEncodingDetector);
+ }
+
+ @Test
+ public void testUnanimous() throws Exception {
+ CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
+ EncodingDetectorContext context = new EncodingDetectorContext();
+ context.addResult(UTF_8, "DetectorA");
+ context.addResult(UTF_8, "DetectorB");
+
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(EncodingDetectorContext.class, context);
+
+ byte[] data = "Hello, world!".getBytes(UTF_8);
+ try (TikaInputStream tis = TikaInputStream.get(
+ new ByteArrayInputStream(data))) {
+ Charset result = detector.detect(tis, new Metadata(), parseContext);
+ assertEquals(UTF_8, result);
+ assertEquals("unanimous", context.getArbitrationInfo());
+ }
+ }
+
+ @Test
+ public void testNoContext() throws Exception {
+ CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
+ ParseContext parseContext = new ParseContext();
+
+ byte[] data = "Test".getBytes(UTF_8);
+ try (TikaInputStream tis = TikaInputStream.get(
+ new ByteArrayInputStream(data))) {
+ Charset result = detector.detect(tis, new Metadata(), parseContext);
+ assertNull(result);
+ }
+ }
+
+ @Test
+ public void testEmptyResults() throws Exception {
+ CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
+ EncodingDetectorContext context = new EncodingDetectorContext();
+
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(EncodingDetectorContext.class, context);
+
+ byte[] data = "Test".getBytes(UTF_8);
+ try (TikaInputStream tis = TikaInputStream.get(
+ new ByteArrayInputStream(data))) {
+ Charset result = detector.detect(tis, new Metadata(), parseContext);
+ assertNull(result);
+ }
+ }
+
+ @Test
+ public void testArabicEncodingArbitration() throws Exception {
+ // Arabic text encoded in windows-1256.
+ // When decoded as UTF-8 it produces replacement chars / garbage.
+ // When decoded as windows-1256 it produces valid Arabic.
+ // The language detector should pick windows-1256.
+ Charset windows1256 = Charset.forName("windows-1256");
+
+ String arabicText =
+ "\u0641\u064a \u0642\u0631\u064a\u0629 \u0645\u0646 " +
+ "\u0627\u0644\u0642\u0631\u0649 \u0643\u0627\u0646 " +
+ "\u0647\u0646\u0627\u0643 \u0631\u062c\u0644 " +
+ "\u062d\u0643\u064a\u0645 \u064a\u0639\u0631\u0641 " +
+ "\u0643\u0644 \u0634\u064a\u0621 \u0639\u0646 " +
+ "\u0627\u0644\u062d\u064a\u0627\u0629 \u0648\u0643\u0627\u0646
" +
+ "\u064a\u0639\u0644\u0645 \u0627\u0644\u0646\u0627\u0633 " +
+ "\u0643\u064a\u0641 \u064a\u0639\u064a\u0634\u0648\u0646 " +
+ "\u0628\u0633\u0644\u0627\u0645
\u0648\u0627\u0646\u0633\u062c\u0627\u0645. " +
+ "\u0627\u0644\u0644\u063a\u0629
\u0627\u0644\u0639\u0631\u0628\u064a\u0629 " +
+ "\u0647\u064a \u0648\u0627\u062d\u062f\u0629 \u0645\u0646 " +
+ "\u0623\u0643\u062b\u0631 \u0627\u0644\u0644\u063a\u0627\u062a
" +
+ "\u0627\u0646\u062a\u0634\u0627\u0631\u0627 \u0641\u064a " +
+ "\u0627\u0644\u0639\u0627\u0644\u0645
\u0648\u064a\u062a\u062d\u062b\u0647\u0627 " +
+ "\u0623\u0643\u062b\u0631 \u0645\u0646
\u062b\u0644\u0627\u062b\u0645\u0627\u0626\u0629 " +
+ "\u0645\u0644\u064a\u0648\u0646
\u0625\u0646\u0633\u0627\u0646.";
+ byte[] arabicBytes = arabicText.getBytes(windows1256);
+
+ EncodingDetectorContext context = new EncodingDetectorContext();
+ context.addResult(UTF_8, "HtmlEncodingDetector");
+ context.addResult(windows1256, "Icu4jEncodingDetector");
+
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(EncodingDetectorContext.class, context);
+
+ CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
+ try (TikaInputStream tis = TikaInputStream.get(
+ new ByteArrayInputStream(arabicBytes))) {
+ Charset result = detector.detect(tis, new Metadata(), parseContext);
+ assertEquals(windows1256, result);
+ assertEquals("scored", context.getArbitrationInfo());
+ }
+ }
+
+ @Test
+ public void testStreamResetAfterDetection() throws Exception {
+ EncodingDetectorContext context = new EncodingDetectorContext();
+ context.addResult(UTF_8, "DetectorA");
+ context.addResult(ISO_8859_1, "DetectorB");
+
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(EncodingDetectorContext.class, context);
+
+ byte[] data = "Hello, world! This is a test of encoding
detection.".getBytes(UTF_8);
+ CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
+ try (TikaInputStream tis = TikaInputStream.get(
+ new ByteArrayInputStream(data))) {
+ detector.detect(tis, new Metadata(), parseContext);
+
+ // Verify stream is back at the start
+ byte[] readBack = new byte[data.length];
+ int bytesRead = tis.read(readBack);
+ assertEquals(data.length, bytesRead);
+ assertEquals("Hello, world! This is a test of encoding detection.",
+ new String(readBack, UTF_8));
+ }
+ }
+
+ @Test
+ public void testStripTags() {
+ assertEquals("Hello world",
+ CharSoupEncodingDetector.stripTags(
+ "<html><body>Hello world</body></html>"));
+ assertEquals("no tags here",
+ CharSoupEncodingDetector.stripTags("no tags here"));
+ assertEquals("",
+ CharSoupEncodingDetector.stripTags("<empty/>"));
+ }
+
+ @Test
+ public void testDecode() {
+ byte[] utf8Bytes = "caf\u00e9".getBytes(UTF_8);
+ assertEquals("caf\u00e9",
+ CharSoupEncodingDetector.decode(utf8Bytes, UTF_8));
+ }
+
+ @Test
+ public void testReadLimitGetterSetter() {
+ CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
+ assertEquals(16384, detector.getReadLimit());
+ detector.setReadLimit(4096);
+ assertEquals(4096, detector.getReadLimit());
+ }
+
+ @Test
+ public void testJunkRatio() {
+ // Clean text — no junk
+ assertEquals(0f,
+ CharSoupLanguageDetector.junkRatio("Hello, world!"), 0.001f);
+
+ // U+FFFD replacement chars
+ assertEquals(0.5f,
+ CharSoupLanguageDetector.junkRatio("ab\uFFFD\uFFFD"), 0.001f);
+
+ // C1 control chars (U+0080-U+009F are isISOControl)
+ assertEquals(0.25f,
+ CharSoupLanguageDetector.junkRatio("abc\u0080"), 0.001f);
+
+ // Mixed: \r\n are control chars too
+ assertEquals(2f / 13f,
+ CharSoupLanguageDetector.junkRatio("hello world\r\n"), 0.001f);
+
+ // Empty/null
+ assertEquals(0f, CharSoupLanguageDetector.junkRatio(""), 0.001f);
+ assertEquals(0f, CharSoupLanguageDetector.junkRatio(null), 0.001f);
+ }
+}
diff --git a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/TextQualityDiagTest.java b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/TextQualityDiagTest.java
new file mode 100644
index 0000000000..7b56089c26
--- /dev/null
+++ b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/TextQualityDiagTest.java
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.langdetect.charsoup;
+
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.Locale;
+
+import org.junit.jupiter.api.Test;
+
+/**
+ * Diagnostic test to explore text quality scores for encoding arbitration.
+ * Not a regression test — just prints scores for analysis.
+ */
+public class TextQualityDiagTest {
+
+ @Test
+ public void dumpScores() {
+ // Arabic text in windows-1256
+ Charset windows1256 = Charset.forName("windows-1256");
+ String arabicText =
+ "\u0641\u064a \u0642\u0631\u064a\u0629 \u0645\u0646 " +
+ "\u0627\u0644\u0642\u0631\u0649 \u0643\u0627\u0646 " +
+ "\u0647\u0646\u0627\u0643 \u0631\u062c\u0644 " +
+ "\u062d\u0643\u064a\u0645 \u064a\u0639\u0631\u0641 " +
+ "\u0643\u0644 \u0634\u064a\u0621 \u0639\u0646 " +
+ "\u0627\u0644\u062d\u064a\u0627\u0629 \u0648\u0643\u0627\u0646
" +
+ "\u064a\u0639\u0644\u0645 \u0627\u0644\u0646\u0627\u0633 " +
+ "\u0643\u064a\u0641 \u064a\u0639\u064a\u0634\u0648\u0646 " +
+ "\u0628\u0633\u0644\u0627\u0645
\u0648\u0627\u0646\u0633\u062c\u0627\u0645.";
+ byte[] arabicBytes = arabicText.getBytes(windows1256);
+
+ // "hello world\r\n" as windows-1252
+ byte[] helloBytes = "hello
world\r\n".getBytes(StandardCharsets.US_ASCII);
+
+ System.out.println("=== Arabic bytes decoded with different charsets
===");
+ for (String csName : new String[]{"windows-1256", "x-MacCyrillic",
"UTF-8"}) {
+ Charset cs = Charset.forName(csName);
+ String decoded = CharSoupEncodingDetector.decode(arabicBytes, cs);
+ printScores(csName, decoded);
+ }
+
+ System.out.println("\n=== 'hello world\\r\\n' decoded with different
charsets ===");
+ for (String csName : new String[]{"windows-1252", "IBM500"}) {
+ Charset cs = Charset.forName(csName);
+ String decoded = CharSoupEncodingDetector.decode(helloBytes, cs);
+ printScores(csName, decoded);
+ }
+
+ // Also try some real-world short text
+ System.out.println("\n=== Short real text ===");
+ printScores("English sentence", "The quick brown fox jumps over the
lazy dog.");
+ printScores("French sentence", "Le renard brun rapide saute par-dessus
le chien paresseux.");
+ printScores("German sentence", "Der schnelle braune Fuchs springt
\u00fcber den faulen Hund.");
+ }
+
+ private void printScores(String label, String text) {
+ int totalChars = text.length();
+ int letterCount = 0;
+ int replacementCount = 0;
+ int controlCount = 0;
+ int spaceCount = 0;
+ int digitCount = 0;
+ int punctCount = 0;
+ int otherCount = 0;
+
+ for (int i = 0; i < text.length(); ) {
+ int cp = text.codePointAt(i);
+ i += Character.charCount(cp);
+
+ if (cp == 0xFFFD) {
+ replacementCount++;
+ } else if (Character.isISOControl(cp) || (cp >= 0x80 && cp <= 0x9F)) {
+ controlCount++;
+ } else if (Character.isLetter(cp)) {
+ letterCount++;
+ } else if (Character.isWhitespace(cp)) {
+ spaceCount++;
+ } else if (Character.isDigit(cp)) {
+ digitCount++;
+ } else if (isPunctuation(cp)) {
+ punctCount++;
+ } else {
+ otherCount++;
+ }
+ }
+
+ float letterRatio = totalChars > 0 ? (float) letterCount / totalChars : 0;
+ float junkRatio = totalChars > 0 ?
+ (float) (replacementCount + controlCount) / totalChars : 0;
+ float nonLetterNonSpaceRatio = totalChars > 0 ?
+ (float) (totalChars - letterCount - spaceCount) / totalChars : 0;
+
+ System.out.printf(Locale.ROOT,
+ " %-20s len=%3d letters=%.2f junk(repl+ctrl)=%.2f " +
+ "nonLetterNonSpace=%.2f [L=%d S=%d P=%d D=%d R=%d C=%d O=%d]%n",
+ label, totalChars, letterRatio, junkRatio, nonLetterNonSpaceRatio,
+ letterCount, spaceCount, punctCount, digitCount,
+ replacementCount, controlCount, otherCount);
+
+ // Show first 60 chars with hex for non-printable
+ StringBuilder preview = new StringBuilder();
+ for (int i = 0; i < Math.min(text.length(), 60); ) {
+ int cp = text.codePointAt(i);
+ i += Character.charCount(cp);
+ if (cp >= 0x20 && cp < 0x7F) {
+ preview.appendCodePoint(cp);
+ } else if (Character.isLetter(cp)) {
+ preview.appendCodePoint(cp);
+ } else {
+ preview.append(String.format(Locale.ROOT, "\\u%04X", cp));
+ }
+ }
+ System.out.printf(Locale.ROOT, " %-20s text: %s%n", "", preview);
+ }
+
+ private boolean isPunctuation(int cp) {
+ int type = Character.getType(cp);
+ return type == Character.CONNECTOR_PUNCTUATION ||
+ type == Character.DASH_PUNCTUATION ||
+ type == Character.END_PUNCTUATION ||
+ type == Character.FINAL_QUOTE_PUNCTUATION ||
+ type == Character.INITIAL_QUOTE_PUNCTUATION ||
+ type == Character.OTHER_PUNCTUATION ||
+ type == Character.START_PUNCTUATION;
+ }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
index 9824379de8..6363afc8a2 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
@@ -173,6 +173,12 @@
<version>${project.version}</version>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-langdetect-charsoup</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>tika-serialization</artifactId>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
index 2cd4af4913..2524ef404d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
@@ -36,11 +36,13 @@ import org.apache.tika.TikaTest;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.detect.CompositeEncodingDetector;
import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.detect.MetaEncodingDetector;
import org.apache.tika.detect.OverrideEncodingDetector;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AbstractEncodingDetectorParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.Parser;
@@ -57,10 +59,12 @@ public class TikaEncodingDetectorTest extends TikaTest {
EncodingDetector detector =
TikaLoader.loadDefault().loadEncodingDetectors();
assertTrue(detector instanceof CompositeEncodingDetector);
List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors();
- assertEquals(3, detectors.size());
+ // 3 base detectors + CharSoupEncodingDetector (MetaEncodingDetector)
+ assertEquals(4, detectors.size());
assertTrue(detectors.get(0) instanceof HtmlEncodingDetector);
assertTrue(detectors.get(1) instanceof UniversalEncodingDetector);
assertTrue(detectors.get(2) instanceof Icu4jEncodingDetector);
+ assertTrue(detectors.get(3) instanceof MetaEncodingDetector);
}
@Test
@@ -69,15 +73,18 @@ public class TikaEncodingDetectorTest extends TikaTest {
EncodingDetector detector = tikaLoader.loadEncodingDetectors();
assertTrue(detector instanceof CompositeEncodingDetector);
List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors();
+ // default-encoding-detector (inner composite) + override-encoding-detector
+ // The inner composite now includes CharSoupEncodingDetector from SPI
assertEquals(2, detectors.size());
EncodingDetector detector1 = detectors.get(0);
assertTrue(detector1 instanceof CompositeEncodingDetector);
List<EncodingDetector> detectors1Children =
((CompositeEncodingDetector) detector1).getDetectors();
- assertEquals(2, detectors1Children.size());
+ assertEquals(3, detectors1Children.size());
assertTrue(detectors1Children.get(0) instanceof UniversalEncodingDetector);
assertTrue(detectors1Children.get(1) instanceof Icu4jEncodingDetector);
+ assertTrue(detectors1Children.get(2) instanceof MetaEncodingDetector);
assertTrue(detectors.get(1) instanceof OverrideEncodingDetector);
@@ -175,7 +182,8 @@ public class TikaEncodingDetectorTest extends TikaTest {
((AbstractEncodingDetectorParser) encodingDetectingParser)
.getEncodingDetector();
assertTrue(encodingDetector instanceof CompositeEncodingDetector);
- assertEquals(2, ((CompositeEncodingDetector) encodingDetector).getDetectors().size());
+ // HtmlEncodingDetector, UniversalEncodingDetector, CharSoupEncodingDetector
+ assertEquals(3, ((CompositeEncodingDetector) encodingDetector).getDetectors().size());
for (EncodingDetector child : ((CompositeEncodingDetector) encodingDetector).getDetectors()) {
assertNotContained("cu4j",
child.getClass().getCanonicalName());
@@ -263,6 +271,41 @@ public class TikaEncodingDetectorTest extends TikaTest {
}
+ @Test
+ public void testExcludeCharSoupEncodingDetector() throws Exception {
+ TikaLoader tikaLoader = TikaLoaderHelper.getLoader(
+ "TIKA-4671-exclude-charsoup-encoding-detector.json");
+ EncodingDetector detector = tikaLoader.loadEncodingDetectors();
+ assertTrue(detector instanceof CompositeEncodingDetector);
+ List<EncodingDetector> detectors =
+ ((CompositeEncodingDetector) detector).getDetectors();
+ // 3 base detectors, no MetaEncodingDetector
+ assertEquals(3, detectors.size());
+ assertTrue(detectors.get(0) instanceof HtmlEncodingDetector);
+ assertTrue(detectors.get(1) instanceof UniversalEncodingDetector);
+ assertTrue(detectors.get(2) instanceof Icu4jEncodingDetector);
+ for (EncodingDetector d : detectors) {
+ assertNotContained("CharSoup", d.getClass().getSimpleName());
+ }
+ }
+
+ @Test
+ public void testArabicMisleadingCharsetHtml() throws Exception {
+ // This HTML file is encoded in windows-1256 but declares charset=UTF-8
+ // in the meta tag. The CharSoupEncodingDetector should override the
+ // misleading HTML meta and detect that the actual content is Arabic
+ // (windows-1256) because windows-1256 decoded text produces a higher
+ // language detection score.
+ Metadata metadata = new Metadata();
+ XMLResult result = getXML("testArabicMisleadingCharset.html",
metadata);
+ // Verify encoding was detected as windows-1256, not the misleading
UTF-8
+ assertEquals("windows-1256",
+ metadata.get(TikaCoreProperties.DETECTED_ENCODING));
+ // Verify extracted text contains readable Arabic, not mojibake
+ // \u0627\u0644\u0639\u0631\u0628\u064a\u0629 = "العربية" (Arabic)
+ assertContains("\u0627\u0644\u0639\u0631\u0628\u064a\u0629",
result.xml);
+ }
+
private void findEncodingDetectionParsers(Parser p, List<Parser> encodingDetectionParsers) {
if (p instanceof CompositeParser) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-4671-exclude-charsoup-encoding-detector.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-4671-exclude-charsoup-encoding-detector.json
new file mode 100644
index 0000000000..74ef9f5bac
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-4671-exclude-charsoup-encoding-detector.json
@@ -0,0 +1,11 @@
+{
+ "encoding-detectors": [
+ {
+ "default-encoding-detector": {
+ "exclude": [
+ "charsoup-encoding-detector"
+ ]
+ }
+ }
+ ]
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testArabicMisleadingCharset.html b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testArabicMisleadingCharset.html
new file mode 100644
index 0000000000..e9884177f2
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testArabicMisleadingCharset.html
@@ -0,0 +1,11 @@
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<title>Test Arabic</title>
+</head>
+<body>
+<p>�� ���� �� ����� ��� ���� ��� ���� ���� �� ��� �� ������ ���� ���� ����� ��� ������ ����� �������. ����� ������� �� ����� �� ���� ������ ������� �� ������ ������� ���� �� �������� ����� �����.</p>
+<p>�� ���� �� ����� ��� ���� ��� ���� ���� �� ��� �� ������ ���� ���� ����� ��� ������ ����� �������. ����� ������� �� ����� �� ���� ������ ������� �� ������ ������� ���� �� �������� ����� �����.</p>
+<p>�� ���� �� ����� ��� ���� ��� ���� ���� �� ��� �� ������ ���� ���� ����� ��� ������ ����� �������. ����� ������� �� ����� �� ���� ������ ������� �� ������ ������� ���� �� �������� ����� �����.</p>
+</body>
+</html>
\ No newline at end of file