This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new d49b6d1d39 TIKA-4671-lang-aware-charset-detection (#2621)
d49b6d1d39 is described below
commit d49b6d1d3992b12b7b216a4f56e758d8eac31344
Author: Tim Allison <[email protected]>
AuthorDate: Fri Feb 20 11:34:36 2026 -0500
TIKA-4671-lang-aware-charset-detection (#2621)
---
.../tika/detect/CompositeEncodingDetector.java | 155 ++++++++++++++--
.../tika/detect/DefaultEncodingDetector.java | 27 +--
.../tika/detect/EncodingDetectorContext.java | 105 +++++++++++
.../apache/tika/detect/MetaEncodingDetector.java | 39 ++++
.../tika/language/detect/LanguageResult.java | 28 +++
.../apache/tika/metadata/TikaCoreProperties.java | 8 +
.../tika/langdetect/charsoup/CharSoupModel.java | 18 +-
.../charsoup/CharSoupEncodingDetector.java | 186 +++++++++++++++++++
.../charsoup/CharSoupLanguageDetector.java | 155 +++++++++++++++-
.../charsoup/CharSoupEncodingDetectorTest.java | 206 +++++++++++++++++++++
.../langdetect/charsoup/TextQualityDiagTest.java | 141 ++++++++++++++
.../tika-parsers-standard-package/pom.xml | 6 +
.../tika/config/TikaEncodingDetectorTest.java | 49 ++++-
...KA-4671-exclude-charsoup-encoding-detector.json | 11 ++
.../testArabicMisleadingCharset.html | 11 ++
15 files changed, 1110 insertions(+), 35 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
index e5c6152e6a..5fbf44dace 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
@@ -19,56 +19,105 @@ package org.apache.tika.detect;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.Charset;
+import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
-import java.util.LinkedList;
import java.util.List;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
+/**
+ * A composite encoding detector that runs child detectors.
+ *
+ * <p>If a {@link MetaEncodingDetector} is among the children, this
+ * composite switches from first-match-wins to collect-all mode:
+ * all base detectors run first and their results are collected in an
+ * {@link EncodingDetectorContext}, then the meta detector runs last
+ * to arbitrate. Only one meta detector is supported.</p>
+ *
+ * <p>If no meta detector is present, the first non-null result wins
+ * (traditional behavior).</p>
+ */
public class CompositeEncodingDetector implements EncodingDetector, Serializable {
- /**
- * Serial version UID
- */
private static final long serialVersionUID = 5980683158436430252L;
+ private static final Logger LOG =
+ LoggerFactory.getLogger(CompositeEncodingDetector.class);
+
private final List<EncodingDetector> detectors;
+ private final List<EncodingDetector> baseDetectors;
+ private final MetaEncodingDetector metaDetector;
public CompositeEncodingDetector(List<EncodingDetector> detectors,
Collection<Class<? extends EncodingDetector>> excludeEncodingDetectors) {
- this.detectors = new LinkedList<>();
+ this.detectors = new ArrayList<>();
for (EncodingDetector encodingDetector : detectors) {
if (!isExcluded(excludeEncodingDetectors, encodingDetector.getClass())) {
this.detectors.add(encodingDetector);
}
}
-
+ this.baseDetectors = new ArrayList<>();
+ this.metaDetector = partition(this.detectors, baseDetectors);
}
public CompositeEncodingDetector(List<EncodingDetector> detectors) {
- this.detectors = new LinkedList<>();
- this.detectors.addAll(detectors);
+ this.detectors = new ArrayList<>(detectors);
+ this.baseDetectors = new ArrayList<>();
+ this.metaDetector = partition(this.detectors, baseDetectors);
}
/**
- * @param tis text document input stream, or <code>null</code>
- * @param metadata input metadata for the document
- * @return the detected Charset or null if no charset could be detected
- * @throws IOException
+ * Partition detectors into base detectors and at most one meta detector.
*/
+ private static MetaEncodingDetector partition(
+ List<EncodingDetector> all, List<EncodingDetector> base) {
+ MetaEncodingDetector meta = null;
+ for (EncodingDetector d : all) {
+ if (d instanceof MetaEncodingDetector) {
+ if (meta == null) {
+ meta = (MetaEncodingDetector) d;
+ } else {
+ LOG.warn("Multiple MetaEncodingDetectors found; " +
+ "ignoring {}",
+ d.getClass().getName());
+ }
+ } else {
+ base.add(d);
+ }
+ }
+ return meta;
+ }
+
@Override
- public Charset detect(TikaInputStream tis, Metadata metadata, ParseContext parseContext) throws IOException {
+ public Charset detect(TikaInputStream tis, Metadata metadata,
+ ParseContext parseContext) throws IOException {
+ if (metaDetector != null) {
+ return detectWithMeta(tis, metadata, parseContext);
+ }
+ return detectFirstMatch(tis, metadata, parseContext);
+ }
+
+ /**
+ * Traditional first-match-wins behavior.
+ */
+ private Charset detectFirstMatch(TikaInputStream tis, Metadata metadata,
+ ParseContext parseContext)
+ throws IOException {
for (EncodingDetector detector : getDetectors()) {
Charset detected = detector.detect(tis, metadata, parseContext);
if (detected != null) {
- metadata.set(TikaCoreProperties.DETECTED_ENCODING, detected.name());
- //if this has been set by a leaf detector, do not overwrite
- if (!detector.getClass().getSimpleName().equals("CompositeEncodingDetector")) {
+ metadata.set(TikaCoreProperties.DETECTED_ENCODING,
+ detected.name());
+ if (!detector.getClass().getSimpleName()
+ .equals("CompositeEncodingDetector")) {
metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
detector.getClass().getSimpleName());
}
@@ -78,6 +127,80 @@ public class CompositeEncodingDetector implements EncodingDetector, Serializable
return null;
}
+ /**
+ * Collect-all mode: run every base detector, populate context,
+ * then let the meta detector arbitrate.
+ */
+ private Charset detectWithMeta(TikaInputStream tis, Metadata metadata,
+ ParseContext parseContext)
+ throws IOException {
+ EncodingDetectorContext context = new EncodingDetectorContext();
+ parseContext.set(EncodingDetectorContext.class, context);
+ try {
+ for (EncodingDetector detector : baseDetectors) {
+ Charset detected =
+ detector.detect(tis, metadata, parseContext);
+ if (detected != null) {
+ context.addResult(detected,
+ detector.getClass().getSimpleName());
+ }
+ }
+
+ Charset result =
+ metaDetector.detect(tis, metadata, parseContext);
+
+ // If meta detector returned null (disabled or no candidates),
+ // fall back to first base detector's result
+ if (result == null && !context.getResults().isEmpty()) {
+ EncodingDetectorContext.Result first =
+ context.getResults().get(0);
+ result = first.getCharset();
+ metadata.set(TikaCoreProperties.DETECTED_ENCODING,
+ result.name());
+ metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
+ first.getDetectorName());
+ } else if (result != null) {
+ metadata.set(TikaCoreProperties.DETECTED_ENCODING,
+ result.name());
+ String detectorName =
+ metaDetector.getClass().getSimpleName();
+ for (EncodingDetectorContext.Result r :
+ context.getResults()) {
+ if (r.getCharset().equals(result)) {
+ detectorName = r.getDetectorName();
+ break;
+ }
+ }
+ metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
+ detectorName);
+ }
+
+ // Build and set the detection trace
+ metadata.set(TikaCoreProperties.ENCODING_DETECTION_TRACE,
+ buildTrace(context));
+
+ return result;
+ } finally {
+ parseContext.set(EncodingDetectorContext.class, null);
+ }
+ }
+
+ private static String buildTrace(EncodingDetectorContext context) {
+ StringBuilder sb = new StringBuilder();
+ for (EncodingDetectorContext.Result r : context.getResults()) {
+ if (sb.length() > 0) {
+ sb.append(", ");
+ }
+ sb.append(r.getDetectorName()).append("->")
+ .append(r.getCharset().name());
+ }
+ String info = context.getArbitrationInfo();
+ if (info != null) {
+ sb.append(" (").append(info).append(")");
+ }
+ return sb.toString();
+ }
+
public List<EncodingDetector> getDetectors() {
return Collections.unmodifiableList(detectors);
}
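For illustration, a minimal usage sketch (not part of the commit) showing that the mode switch is driven purely by the list passed to the constructor; the detector names are those used elsewhere in this commit, and CharSoupEncodingDetector is the MetaEncodingDetector added below:

    List<EncodingDetector> children = List.of(
            new HtmlEncodingDetector(),
            new Icu4jEncodingDetector(),
            new CharSoupEncodingDetector());   // meta detector present
    // -> collect-all-then-arbitrate mode
    CompositeEncodingDetector composite = new CompositeEncodingDetector(children);

    // With only the first two children, the same constructor keeps the
    // traditional first-match-wins behavior.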
diff --git a/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java
index 67cf26e27d..72dd3ba4c0 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java
@@ -22,15 +22,20 @@ import javax.imageio.spi.ServiceRegistry;
import org.apache.tika.config.ServiceLoader;
/**
- * A composite encoding detector based on all the {@link EncodingDetector} implementations
- * available through the {@link ServiceRegistry service provider mechanism}. Those
- * loaded via the service provider mechanism are ordered by how they appear in the
- * file, if there is a single service file. If multiple, there is no guarantee of order.
- * <p>
- * <p>
- * If you need to control the order of the Detectors, you should instead
- * construct your own {@link CompositeDetector} and pass in the list
- * of Detectors in the required order.
+ * A composite encoding detector based on all the {@link EncodingDetector}
+ * implementations available through the
+ * {@link ServiceRegistry service provider mechanism}.
+ *
+ * <p>Those loaded via the service provider mechanism are ordered by how
+ * they appear in the file, if there is a single service file. If
+ * multiple, there is no guarantee of order.</p>
+ *
+ * <p>If you need to control the order of the Detectors, you should
+ * instead construct your own {@link CompositeEncodingDetector} and pass
+ * in the list of Detectors in the required order.</p>
+ *
+ * <p>{@link MetaEncodingDetector} handling (collect-all-then-arbitrate)
+ * is provided by {@link CompositeEncodingDetector}.</p>
*
* @since Apache Tika 1.15
*/
@@ -47,7 +52,7 @@ public class DefaultEncodingDetector extends CompositeEncodingDetector {
public DefaultEncodingDetector(ServiceLoader loader,
Collection<Class<? extends EncodingDetector>> excludeEncodingDetectors) {
- super(loader.loadServiceProviders(EncodingDetector.class), excludeEncodingDetectors);
+ super(loader.loadServiceProviders(EncodingDetector.class),
+ excludeEncodingDetectors);
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java
new file mode 100644
index 0000000000..6ac55f87da
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Context object that collects encoding detection results from base
+ * detectors. Stored in {@link org.apache.tika.parser.ParseContext} by
+ * {@link CompositeEncodingDetector} so that the {@link MetaEncodingDetector}
+ * can see all candidates and arbitrate. Removed after detection to
+ * prevent contamination during recursive parsing.
+ *
+ * @since Apache Tika 3.2
+ */
+public class EncodingDetectorContext {
+
+ private final List<Result> results = new ArrayList<>();
+ private String arbitrationInfo;
+
+ /**
+ * Record a detection result from a child detector.
+ *
+ * @param charset the detected charset (must not be null)
+ * @param detectorName the simple class name of the detector
+ */
+ public void addResult(Charset charset, String detectorName) {
+ results.add(new Result(charset, detectorName));
+ }
+
+ /**
+ * @return unmodifiable list of all results in detection order
+ */
+ public List<Result> getResults() {
+ return Collections.unmodifiableList(results);
+ }
+
+ /**
+ * @return unique charsets in detection order
+ */
+ public Set<Charset> getUniqueCharsets() {
+ Set<Charset> charsets = new LinkedHashSet<>();
+ for (Result r : results) {
+ charsets.add(r.getCharset());
+ }
+ return charsets;
+ }
+
+ /**
+ * Set by the meta detector to describe how it reached its decision.
+ * Values: "unanimous", "compatible", "scored", "too-few-bigrams",
"disabled".
+ */
+ public void setArbitrationInfo(String info) {
+ this.arbitrationInfo = info;
+ }
+
+ public String getArbitrationInfo() {
+ return arbitrationInfo;
+ }
+
+ /**
+ * A single detection result pairing a charset with the detector that found it.
+ */
+ public static class Result {
+ private final Charset charset;
+ private final String detectorName;
+
+ public Result(Charset charset, String detectorName) {
+ this.charset = charset;
+ this.detectorName = detectorName;
+ }
+
+ public Charset getCharset() {
+ return charset;
+ }
+
+ public String getDetectorName() {
+ return detectorName;
+ }
+
+ @Override
+ public String toString() {
+ return detectorName + "=" + charset.name();
+ }
+ }
+}
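A minimal sketch of how a detector consumes this context inside the collect-all flow (the fallback choice shown is illustrative, not prescribed by this class):

    EncodingDetectorContext ctx = parseContext.get(EncodingDetectorContext.class);
    if (ctx != null && !ctx.getResults().isEmpty()) {
        for (EncodingDetectorContext.Result r : ctx.getResults()) {
            System.out.println(r);   // e.g. "Icu4jEncodingDetector=windows-1256"
        }
        ctx.setArbitrationInfo("unanimous");
        Charset fallback = ctx.getResults().get(0).getCharset();
    }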
diff --git a/tika-core/src/main/java/org/apache/tika/detect/MetaEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/MetaEncodingDetector.java
new file mode 100644
index 0000000000..e8a46f647b
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/detect/MetaEncodingDetector.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+/**
+ * Marker interface for encoding detectors that arbitrate among
+ * candidates collected by base detectors rather than detecting
+ * encoding directly from the stream.
+ *
+ * <p>When a {@code MetaEncodingDetector} is present in a
+ * {@link CompositeEncodingDetector}, the composite switches from
+ * first-match-wins to collect-all mode: all base detectors run
+ * first and their results are collected in an
+ * {@link EncodingDetectorContext}, then the meta detector's
+ * {@link #detect} method is called to pick the winner.</p>
+ *
+ * <p>The {@link EncodingDetectorContext} is placed in the
+ * {@link org.apache.tika.parser.ParseContext} before the meta
+ * detector is invoked, so implementations can retrieve it via
+ * {@code parseContext.get(EncodingDetectorContext.class)}.</p>
+ *
+ * @since Apache Tika 3.2
+ */
+public interface MetaEncodingDetector extends EncodingDetector {
+}
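To make the contract concrete, a hypothetical implementation (not shipped in this commit) that arbitrates by simple majority vote among the collected candidates:

    import java.nio.charset.Charset;
    import java.util.Collections;
    import java.util.LinkedHashMap;
    import java.util.Map;

    import org.apache.tika.io.TikaInputStream;
    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.ParseContext;

    public class MajorityVoteEncodingDetector implements MetaEncodingDetector {
        @Override
        public Charset detect(TikaInputStream tis, Metadata metadata,
                              ParseContext parseContext) {
            EncodingDetectorContext ctx =
                    parseContext.get(EncodingDetectorContext.class);
            if (ctx == null || ctx.getResults().isEmpty()) {
                return null;   // composite falls back to the first base result
            }
            // Count one vote per base-detector result
            Map<Charset, Integer> votes = new LinkedHashMap<>();
            for (EncodingDetectorContext.Result r : ctx.getResults()) {
                votes.merge(r.getCharset(), 1, Integer::sum);
            }
            ctx.setArbitrationInfo("majority-vote");
            return Collections.max(votes.entrySet(),
                    Map.Entry.comparingByValue()).getKey();
        }
    }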
diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java
index dada5fda17..348232587e 100644
--- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java
+++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java
@@ -31,14 +31,32 @@ public class LanguageResult {
// greater confidence.
private final float rawScore;
+ // Detector-agnostic confidence score (0.0 to 1.0, higher = more confident).
+ // Detectors can populate this however makes sense for their internals
+ // (e.g., entropy-derived for CharSoup, probability-based for OpenNLP).
+ // Defaults to rawScore for backwards compatibility.
+ private final float confidenceScore;
+
/**
* @param language ISO 639-1 language code (plus optional country code)
* @param rawScore confidence of detector in the result.
*/
public LanguageResult(String language, LanguageConfidence confidence, float rawScore) {
+ this(language, confidence, rawScore, rawScore);
+ }
+
+ /**
+ * @param language ISO 639-1 language code (plus optional country code)
+ * @param rawScore detector-specific score (e.g., softmax probability)
+ * @param confidenceScore detector-agnostic confidence (0.0 to 1.0, higher = more confident).
+ * For comparing results across different decodings or detectors.
+ */
+ public LanguageResult(String language, LanguageConfidence confidence,
+ float rawScore, float confidenceScore) {
this.language = language;
this.confidence = confidence;
this.rawScore = rawScore;
+ this.confidenceScore = confidenceScore;
}
/**
@@ -54,6 +72,16 @@ public class LanguageResult {
return rawScore;
}
+ /**
+ * Detector-agnostic confidence score (0.0 to 1.0). Higher values indicate
+ * the detector is more confident in the result. This can be used to compare
+ * results across different text decodings (e.g., for encoding detection)
+ * without knowing the detector implementation.
+ */
+ public float getConfidenceScore() {
+ return confidenceScore;
+ }
+
public LanguageConfidence getConfidence() {
return confidence;
}
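A short sketch of the intended use, assuming LanguageDetector's detect(CharSequence) convenience method and two candidate decodings of the same bytes (variable names are placeholders):

    LanguageDetector detector = new CharSoupLanguageDetector();
    LanguageResult a = detector.detect(utf8Decoded);     // garbled text
    LanguageResult b = detector.detect(cp1256Decoded);   // readable Arabic
    // rawScore is detector-specific; confidenceScore is comparable across runs
    if (b.getConfidenceScore() > a.getConfidenceScore()) {
        // prefer the windows-1256 decoding
    }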
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index fdd52259e3..0d57f8cd03 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -414,6 +414,14 @@ public interface TikaCoreProperties {
*/
Property ENCODING_DETECTOR = Property.externalText(TIKA_META_PREFIX +
"encodingDetector");
+ /**
+ * Diagnostic trace showing which encoding detectors ran and what each returned,
+ * plus the arbitration method used when detectors disagreed.
+ * Example: {@code "HtmlEncodingDetector->UTF-8, Icu4jEncodingDetector->windows-1256 (scored)"}
+ */
+ Property ENCODING_DETECTION_TRACE =
+ Property.externalText(TIKA_META_PREFIX + "encodingDetectionTrace");
+
/**
* General metadata key for the count of non-final versions available within a file. This
* was added initially to support generalizing incremental updates in PDF.
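After parsing, the trace reads back like any other Tika property; a minimal sketch:

    String trace = metadata.get(TikaCoreProperties.ENCODING_DETECTION_TRACE);
    // e.g. "HtmlEncodingDetector->UTF-8, Icu4jEncodingDetector->windows-1256 (scored)"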
diff --git a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupModel.java b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupModel.java
index 2cb1adc64f..809d8b00c3 100644
--- a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupModel.java
+++ b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupModel.java
@@ -197,6 +197,22 @@ public class CharSoupModel {
* (softmax probabilities, sum ≈ 1.0)
*/
public float[] predict(int[] features) {
+ float[] logits = predictLogits(features);
+ return softmax(logits);
+ }
+
+ /**
+ * Compute raw logits (pre-softmax scores) for the given
+ * feature vector. Higher logits indicate stronger match.
+ * Unlike {@link #predict}, this preserves the full dynamic
+ * range of the model's output, which is useful when
+ * comparing confidence across different input texts.
+ *
+ * @param features int array of size {@code numBuckets}
+ * @return float array of size {@code numClasses}
+ * (raw logits, not normalized)
+ */
+ public float[] predictLogits(int[] features) {
int nnz = 0;
for (int b = 0; b < numBuckets; b++) {
if (features[b] != 0) {
@@ -225,7 +241,7 @@ public class CharSoupModel {
for (int c = 0; c < numClasses; c++) {
logits[c] = biases[c] + scales[c] * dots[c];
}
- return softmax(logits);
+ return logits;
}
/**
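Why expose logits: softmax is shift-invariant, so adding a constant to every logit leaves the probabilities unchanged, and probabilities alone cannot say how strong the best match is in absolute terms. A sketch of recovering an absolute confidence (model and features are placeholders; the sigmoid of the max logit mirrors what CharSoupLanguageDetector does later in this commit):

    float[] logits = model.predictLogits(features);
    float[] probs = model.predict(features);   // softmax(logits), sum ~= 1.0
    float maxLogit = Float.NEGATIVE_INFINITY;
    for (float v : logits) {
        maxLogit = Math.max(maxLogit, v);
    }
    // inverse logit preserves the absolute strength of the match
    float confidence = 1.0f / (1.0f + (float) Math.exp(-maxLogit));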
diff --git a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
new file mode 100644
index 0000000000..75176f69fc
--- /dev/null
+++ b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.langdetect.charsoup;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.tika.config.TikaComponent;
+import org.apache.tika.detect.EncodingDetectorContext;
+import org.apache.tika.detect.MetaEncodingDetector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * A {@link MetaEncodingDetector} that uses the CharSoup language detector
+ * to arbitrate when base encoding detectors disagree.
+ *
+ * <p>When base detectors all agree, the unanimous charset is returned
+ * without any language detection. When they disagree, raw bytes are
+ * read from the stream, decoded with each candidate charset, and each
+ * decoded text is scored by {@link CharSoupLanguageDetector}. The
+ * charset that produces the highest-confidence language detection wins.</p>
+ *
+ * <p>To enable, add this detector to your encoding detector chain in
+ * tika-config:</p>
+ * <pre>{@code
+ * "encoding-detectors": [
+ * { "default-encoding-detector": {} },
+ * { "charsoup-encoding-detector": {} }
+ * ]
+ * }</pre>
+ *
+ * @since Apache Tika 3.2
+ */
+@TikaComponent(name = "charsoup-encoding-detector")
+public class CharSoupEncodingDetector implements MetaEncodingDetector {
+
+ private static final long serialVersionUID = 1L;
+
+ private static final int DEFAULT_READ_LIMIT = 16384;
+
+ private int readLimit = DEFAULT_READ_LIMIT;
+
+ @Override
+ public Charset detect(TikaInputStream tis, Metadata metadata,
+ ParseContext parseContext) throws IOException {
+ EncodingDetectorContext context =
+ parseContext.get(EncodingDetectorContext.class);
+ if (context == null || context.getResults().isEmpty()) {
+ return null;
+ }
+
+ Set<Charset> uniqueCharsets = context.getUniqueCharsets();
+
+ if (uniqueCharsets.size() <= 1) {
+ // Unanimous or single detector — no arbitration needed
+ EncodingDetectorContext.Result first = context.getResults().get(0);
+ context.setArbitrationInfo("unanimous");
+ return first.getCharset();
+ }
+
+ // Disagreement — arbitrate via language detection scoring
+ return arbitrate(tis, context, uniqueCharsets);
+ }
+
+ private Charset arbitrate(TikaInputStream tis,
+ EncodingDetectorContext context,
+ Set<Charset> uniqueCharsets) throws IOException {
+ EncodingDetectorContext.Result firstResult = context.getResults().get(0);
+
+ if (tis == null) {
+ context.setArbitrationInfo("no-stream");
+ return firstResult.getCharset();
+ }
+
+ byte[] bytes = readBytes(tis);
+ if (bytes == null || bytes.length == 0) {
+ context.setArbitrationInfo("empty-stream");
+ return firstResult.getCharset();
+ }
+
+ Map<Charset, String> candidates = new LinkedHashMap<>();
+ for (Charset candidate : uniqueCharsets) {
+ candidates.put(candidate, stripTags(decode(bytes, candidate)));
+ }
+
+ CharSoupLanguageDetector langDetector = new CharSoupLanguageDetector();
+ Charset bestCharset = langDetector.compareLanguageSignal(candidates);
+ if (bestCharset == null) {
+ bestCharset = firstResult.getCharset();
+ }
+
+ context.setArbitrationInfo("scored");
+ return bestCharset;
+ }
+
+ private byte[] readBytes(TikaInputStream tis) throws IOException {
+ try {
+ tis.mark(readLimit);
+ byte[] buf = new byte[readLimit];
+ int totalRead = 0;
+ int bytesRead;
+ while (totalRead < readLimit &&
+ (bytesRead = tis.read(buf, totalRead,
+ readLimit - totalRead)) != -1) {
+ totalRead += bytesRead;
+ }
+ if (totalRead == 0) {
+ return null;
+ }
+ if (totalRead < readLimit) {
+ byte[] trimmed = new byte[totalRead];
+ System.arraycopy(buf, 0, trimmed, 0, totalRead);
+ return trimmed;
+ }
+ return buf;
+ } finally {
+ tis.reset();
+ }
+ }
+
+ /**
+ * Decode bytes using the given charset, replacing malformed/unmappable
+ * characters rather than throwing.
+ */
+ static String decode(byte[] bytes, Charset charset) {
+ CharsetDecoder decoder = charset.newDecoder()
+ .onMalformedInput(CodingErrorAction.REPLACE)
+ .onUnmappableCharacter(CodingErrorAction.REPLACE);
+ CharBuffer cb = CharBuffer.allocate(bytes.length * 2);
+ decoder.decode(ByteBuffer.wrap(bytes), cb, true);
+ decoder.flush(cb);
+ cb.flip();
+ return cb.toString();
+ }
+
+ /**
+ * Simple tag stripping: removes {@code <...>} sequences so that
+ * HTML/XML tag names and attributes don't pollute language scoring.
+ */
+ static String stripTags(String text) {
+ StringBuilder sb = new StringBuilder(text.length());
+ boolean inTag = false;
+ for (int i = 0; i < text.length(); i++) {
+ char c = text.charAt(i);
+ if (c == '<') {
+ inTag = true;
+ } else if (c == '>') {
+ inTag = false;
+ } else if (!inTag) {
+ sb.append(c);
+ }
+ }
+ return sb.toString();
+ }
+
+ public int getReadLimit() {
+ return readLimit;
+ }
+
+ public void setReadLimit(int readLimit) {
+ this.readLimit = readLimit;
+ }
+}
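A standalone usage sketch mirroring the tests below (inputs illustrative; in normal operation CompositeEncodingDetector does this wiring):

    EncodingDetectorContext ctx = new EncodingDetectorContext();
    ctx.addResult(StandardCharsets.UTF_8, "HtmlEncodingDetector");
    ctx.addResult(Charset.forName("windows-1256"), "Icu4jEncodingDetector");

    ParseContext parseContext = new ParseContext();
    parseContext.set(EncodingDetectorContext.class, ctx);

    CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
    try (TikaInputStream tis = TikaInputStream.get(
            new ByteArrayInputStream(windows1256Bytes))) {
        Charset winner = detector.detect(tis, new Metadata(), parseContext);
        // winner == windows-1256; ctx.getArbitrationInfo() == "scored"
    }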
diff --git a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java
index 1fa6e2cce3..abd0f8e0d6 100644
--- a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java
+++ b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java
@@ -26,6 +26,9 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import org.apache.tika.config.TikaComponent;
import org.apache.tika.language.detect.LanguageConfidence;
import org.apache.tika.language.detect.LanguageDetector;
@@ -50,9 +53,12 @@ import org.apache.tika.language.detect.LanguageResult;
* keeping the implementation simple and predictable.
* </p>
*/
-@TikaComponent
+@TikaComponent(name = "charsoup-language-detector")
public class CharSoupLanguageDetector extends LanguageDetector {
+ private static final Logger LOG =
+ LoggerFactory.getLogger(CharSoupLanguageDetector.class);
+
private static final String MODEL_RESOURCE =
"/org/apache/tika/langdetect/charsoup/langdetect.bin";
@@ -267,6 +273,126 @@ public class CharSoupLanguageDetector extends LanguageDetector {
return lastEntropy;
}
+ /**
+ * Minimum confidence (inverse logit of the max logit) for a candidate to
+ * be considered a genuine language match. If no candidate exceeds this
+ * threshold, the comparison is inconclusive and {@code null} is returned.
+ * <p>
+ * 0.88 corresponds to a raw logit of ~2.0. Typical values:
+ * <ul>
+ * <li>Arabic (windows-1256): 0.9999994 (logit +14.3)</li>
+ * <li>UTF-8 garbled: 0.97 (logit +3.5)</li>
+ * <li>EBCDIC garbage: 0.79 (logit +1.3) — below threshold</li>
+ * <li>Short English: 0.025 (logit -3.7) — well below threshold</li>
+ * </ul>
+ */
+ private static final float MIN_CONFIDENCE_THRESHOLD = 0.88f;
+
+ /**
+ * Maximum ratio of junk characters (U+FFFD replacement chars + C0/C1
+ * control chars) allowed in a candidate text. Candidates exceeding
+ * this ratio are discarded before language scoring — they are almost
+ * certainly decoded with the wrong charset.
+ * <p>
+ * Typical values:
+ * <ul>
+ * <li>Correct decoding: 0.00</li>
+ * <li>UTF-8 decoding of windows-1256 bytes: 0.80</li>
+ * <li>IBM500 decoding of ASCII bytes: 0.23</li>
+ * </ul>
+ */
+ private static final float MAX_JUNK_RATIO = 0.10f;
+
+ /**
+ * Compare multiple candidate texts and return the key of the one with
+ * the strongest language signal. Candidates with a high ratio of
+ * replacement or control characters are discarded first. Remaining
+ * candidates are scored using the inverse logit (sigmoid) of the
+ * model's maximum pre-softmax logit.
+ * <p>
+ * Returns {@code null} if no candidate exceeds the minimum confidence
+ * threshold, indicating the comparison is inconclusive.
+ *
+ * @param candidates map of arbitrary keys to candidate text strings
+ * @param <K> key type (e.g., {@link java.nio.charset.Charset})
+ * @return the key whose text has the strongest language signal,
+ * or {@code null} if the map is empty or no candidate is
+ * confident enough
+ */
+ public <K> K compareLanguageSignal(Map<K, String> candidates) {
+ if (candidates.isEmpty()) {
+ return null;
+ }
+
+ float bestConfidence = Float.NEGATIVE_INFINITY;
+ K bestKey = null;
+
+ for (Map.Entry<K, String> entry : candidates.entrySet()) {
+ float junkRatio = junkRatio(entry.getValue());
+ if (junkRatio > MAX_JUNK_RATIO) {
+ LOG.debug("compareLanguageSignal: {} -> skipped
(junkRatio={})",
+ entry.getKey(), junkRatio);
+ continue;
+ }
+
+ int[] features = EXTRACTOR.extract(entry.getValue());
+ float[] logits = MODEL.predictLogits(features);
+ float confidence = sigmoid(max(logits));
+
+ LOG.debug("compareLanguageSignal: {} -> confidence={}",
+ entry.getKey(), confidence);
+
+ if (confidence > bestConfidence) {
+ bestConfidence = confidence;
+ bestKey = entry.getKey();
+ }
+ }
+
+ if (bestConfidence < MIN_CONFIDENCE_THRESHOLD) {
+ LOG.debug("compareLanguageSignal: inconclusive (bestConfidence={}
< {})",
+ bestConfidence, MIN_CONFIDENCE_THRESHOLD);
+ return null;
+ }
+
+ return bestKey;
+ }
+
+ /**
+ * Ratio of junk characters (U+FFFD replacement + ISO control + C1
+ * control range U+0080-U+009F) to total characters. High values
+ * indicate a wrong-charset decoding.
+ */
+ static float junkRatio(String text) {
+ if (text == null || text.isEmpty()) {
+ return 0f;
+ }
+ int junk = 0;
+ int total = 0;
+ for (int i = 0; i < text.length(); ) {
+ int cp = text.codePointAt(i);
+ i += Character.charCount(cp);
+ total++;
+ if (cp == 0xFFFD || Character.isISOControl(cp)) {
+ junk++;
+ }
+ }
+ return total == 0 ? 0f : (float) junk / total;
+ }
+
+ private static float sigmoid(float x) {
+ return 1.0f / (1.0f + (float) Math.exp(-x));
+ }
+
+ private static float max(float[] arr) {
+ float m = Float.NEGATIVE_INFINITY;
+ for (float v : arr) {
+ if (v > m) {
+ m = v;
+ }
+ }
+ return m;
+ }
+
@Override
public LanguageDetector loadModels() throws IOException {
// Models are loaded statically; nothing to do.
@@ -367,6 +493,22 @@ public class CharSoupLanguageDetector extends LanguageDetector {
return buildResults(bestProbs);
}
+ /**
+ * Maximum meaningful entropy (bits) for normalizing confidenceScore.
+ * log2(numClasses) for ~165 classes is ~7.4. We cap at 7.0 so that
+ * even moderately uncertain text gets a near-zero confidenceScore.
+ */
+ private static final float MAX_ENTROPY = 7.0f;
+
+ /**
+ * Convert entropy to a 0-1 confidence score. Lower entropy = higher confidence.
+ * Uses 1/(1+entropy) to preserve discrimination even at very low entropies,
+ * unlike a linear mapping which saturates at 1.0 too quickly.
+ */
+ private static float entropyToConfidenceScore(float entropy) {
+ return 1.0f / (1.0f + entropy);
+ }
+
/**
* Build sorted LanguageResult list from raw probabilities.
*/
@@ -374,18 +516,23 @@ public class CharSoupLanguageDetector extends LanguageDetector {
// Compute entropy on collapsed distribution
float[] collapsed = collapseGroups(probs, GROUP_INDICES);
lastEntropy = CharSoupModel.entropy(collapsed);
+ float confScore = entropyToConfidenceScore(lastEntropy);
// Build results from raw probabilities sorted by probability descending
List<LanguageResult> results = new ArrayList<>(MODEL.getNumClasses());
for (int c = 0; c < MODEL.getNumClasses(); c++) {
results.add(new LanguageResult(
- MODEL.getLabel(c), toConfidence(probs[c], lastEntropy), probs[c]));
+ MODEL.getLabel(c), toConfidence(probs[c], lastEntropy),
+ probs[c], confScore));
}
results.sort((a, b) -> Float.compare(b.getRawScore(), a.getRawScore()));
- // If top score is below NONE threshold, return NULL
+ // If top score is below NONE threshold, return a NULL-like result
+ // but preserve the confidenceScore so encoding arbitration can
+ // still compare across candidate decodings.
if (results.get(0).getConfidence() == LanguageConfidence.NONE) {
- return Collections.singletonList(LanguageResult.NULL);
+ return Collections.singletonList(
+ new LanguageResult("", LanguageConfidence.NONE, 0.0f,
confScore));
}
return results;
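Putting the pieces together, a sketch of how encoding arbitration calls into this detector (bytes and charset list are illustrative):

    Map<Charset, String> candidates = new LinkedHashMap<>();
    for (Charset cs : List.of(StandardCharsets.UTF_8,
            Charset.forName("windows-1256"))) {
        candidates.put(cs, CharSoupEncodingDetector.decode(rawBytes, cs));
    }
    // null means inconclusive: every candidate was junk-heavy or fell below
    // MIN_CONFIDENCE_THRESHOLD
    Charset best = new CharSoupLanguageDetector().compareLanguageSignal(candidates);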
diff --git a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java
new file mode 100644
index 0000000000..5ff028d442
--- /dev/null
+++ b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.langdetect.charsoup;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.Charset;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.detect.EncodingDetectorContext;
+import org.apache.tika.detect.MetaEncodingDetector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+
+public class CharSoupEncodingDetectorTest {
+
+ @Test
+ public void testIsMetaEncodingDetector() {
+ assertTrue(new CharSoupEncodingDetector() instanceof MetaEncodingDetector);
+ }
+
+ @Test
+ public void testUnanimous() throws Exception {
+ CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
+ EncodingDetectorContext context = new EncodingDetectorContext();
+ context.addResult(UTF_8, "DetectorA");
+ context.addResult(UTF_8, "DetectorB");
+
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(EncodingDetectorContext.class, context);
+
+ byte[] data = "Hello, world!".getBytes(UTF_8);
+ try (TikaInputStream tis = TikaInputStream.get(
+ new ByteArrayInputStream(data))) {
+ Charset result = detector.detect(tis, new Metadata(), parseContext);
+ assertEquals(UTF_8, result);
+ assertEquals("unanimous", context.getArbitrationInfo());
+ }
+ }
+
+ @Test
+ public void testNoContext() throws Exception {
+ CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
+ ParseContext parseContext = new ParseContext();
+
+ byte[] data = "Test".getBytes(UTF_8);
+ try (TikaInputStream tis = TikaInputStream.get(
+ new ByteArrayInputStream(data))) {
+ Charset result = detector.detect(tis, new Metadata(), parseContext);
+ assertNull(result);
+ }
+ }
+
+ @Test
+ public void testEmptyResults() throws Exception {
+ CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
+ EncodingDetectorContext context = new EncodingDetectorContext();
+
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(EncodingDetectorContext.class, context);
+
+ byte[] data = "Test".getBytes(UTF_8);
+ try (TikaInputStream tis = TikaInputStream.get(
+ new ByteArrayInputStream(data))) {
+ Charset result = detector.detect(tis, new Metadata(), parseContext);
+ assertNull(result);
+ }
+ }
+
+ @Test
+ public void testArabicEncodingArbitration() throws Exception {
+ // Arabic text encoded in windows-1256.
+ // When decoded as UTF-8 it produces replacement chars / garbage.
+ // When decoded as windows-1256 it produces valid Arabic.
+ // The language detector should pick windows-1256.
+ Charset windows1256 = Charset.forName("windows-1256");
+
+ String arabicText =
+ "\u0641\u064a \u0642\u0631\u064a\u0629 \u0645\u0646 " +
+ "\u0627\u0644\u0642\u0631\u0649 \u0643\u0627\u0646 " +
+ "\u0647\u0646\u0627\u0643 \u0631\u062c\u0644 " +
+ "\u062d\u0643\u064a\u0645 \u064a\u0639\u0631\u0641 " +
+ "\u0643\u0644 \u0634\u064a\u0621 \u0639\u0646 " +
+ "\u0627\u0644\u062d\u064a\u0627\u0629 \u0648\u0643\u0627\u0646
" +
+ "\u064a\u0639\u0644\u0645 \u0627\u0644\u0646\u0627\u0633 " +
+ "\u0643\u064a\u0641 \u064a\u0639\u064a\u0634\u0648\u0646 " +
+ "\u0628\u0633\u0644\u0627\u0645
\u0648\u0627\u0646\u0633\u062c\u0627\u0645. " +
+ "\u0627\u0644\u0644\u063a\u0629
\u0627\u0644\u0639\u0631\u0628\u064a\u0629 " +
+ "\u0647\u064a \u0648\u0627\u062d\u062f\u0629 \u0645\u0646 " +
+ "\u0623\u0643\u062b\u0631 \u0627\u0644\u0644\u063a\u0627\u062a
" +
+ "\u0627\u0646\u062a\u0634\u0627\u0631\u0627 \u0641\u064a " +
+ "\u0627\u0644\u0639\u0627\u0644\u0645
\u0648\u064a\u062a\u062d\u062b\u0647\u0627 " +
+ "\u0623\u0643\u062b\u0631 \u0645\u0646
\u062b\u0644\u0627\u062b\u0645\u0627\u0626\u0629 " +
+ "\u0645\u0644\u064a\u0648\u0646
\u0625\u0646\u0633\u0627\u0646.";
+ byte[] arabicBytes = arabicText.getBytes(windows1256);
+
+ EncodingDetectorContext context = new EncodingDetectorContext();
+ context.addResult(UTF_8, "HtmlEncodingDetector");
+ context.addResult(windows1256, "Icu4jEncodingDetector");
+
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(EncodingDetectorContext.class, context);
+
+ CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
+ try (TikaInputStream tis = TikaInputStream.get(
+ new ByteArrayInputStream(arabicBytes))) {
+ Charset result = detector.detect(tis, new Metadata(), parseContext);
+ assertEquals(windows1256, result);
+ assertEquals("scored", context.getArbitrationInfo());
+ }
+ }
+
+ @Test
+ public void testStreamResetAfterDetection() throws Exception {
+ EncodingDetectorContext context = new EncodingDetectorContext();
+ context.addResult(UTF_8, "DetectorA");
+ context.addResult(ISO_8859_1, "DetectorB");
+
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(EncodingDetectorContext.class, context);
+
+ byte[] data = "Hello, world! This is a test of encoding
detection.".getBytes(UTF_8);
+ CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
+ try (TikaInputStream tis = TikaInputStream.get(
+ new ByteArrayInputStream(data))) {
+ detector.detect(tis, new Metadata(), parseContext);
+
+ // Verify stream is back at the start
+ byte[] readBack = new byte[data.length];
+ int bytesRead = tis.read(readBack);
+ assertEquals(data.length, bytesRead);
+ assertEquals("Hello, world! This is a test of encoding detection.",
+ new String(readBack, UTF_8));
+ }
+ }
+
+ @Test
+ public void testStripTags() {
+ assertEquals("Hello world",
+ CharSoupEncodingDetector.stripTags(
+ "<html><body>Hello world</body></html>"));
+ assertEquals("no tags here",
+ CharSoupEncodingDetector.stripTags("no tags here"));
+ assertEquals("",
+ CharSoupEncodingDetector.stripTags("<empty/>"));
+ }
+
+ @Test
+ public void testDecode() {
+ byte[] utf8Bytes = "caf\u00e9".getBytes(UTF_8);
+ assertEquals("caf\u00e9",
+ CharSoupEncodingDetector.decode(utf8Bytes, UTF_8));
+ }
+
+ @Test
+ public void testReadLimitGetterSetter() {
+ CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
+ assertEquals(16384, detector.getReadLimit());
+ detector.setReadLimit(4096);
+ assertEquals(4096, detector.getReadLimit());
+ }
+
+ @Test
+ public void testJunkRatio() {
+ // Clean text — no junk
+ assertEquals(0f,
+ CharSoupLanguageDetector.junkRatio("Hello, world!"), 0.001f);
+
+ // U+FFFD replacement chars
+ assertEquals(0.5f,
+ CharSoupLanguageDetector.junkRatio("ab\uFFFD\uFFFD"), 0.001f);
+
+ // C1 control chars (U+0080-U+009F are isISOControl)
+ assertEquals(0.25f,
+ CharSoupLanguageDetector.junkRatio("abc\u0080"), 0.001f);
+
+ // Mixed: \r\n are control chars too
+ assertEquals(2f / 13f,
+ CharSoupLanguageDetector.junkRatio("hello world\r\n"), 0.001f);
+
+ // Empty/null
+ assertEquals(0f, CharSoupLanguageDetector.junkRatio(""), 0.001f);
+ assertEquals(0f, CharSoupLanguageDetector.junkRatio(null), 0.001f);
+ }
+}
diff --git a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/TextQualityDiagTest.java b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/TextQualityDiagTest.java
new file mode 100644
index 0000000000..7b56089c26
--- /dev/null
+++ b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/TextQualityDiagTest.java
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.langdetect.charsoup;
+
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.Locale;
+
+import org.junit.jupiter.api.Test;
+
+/**
+ * Diagnostic test to explore text quality scores for encoding arbitration.
+ * Not a regression test — just prints scores for analysis.
+ */
+public class TextQualityDiagTest {
+
+ @Test
+ public void dumpScores() {
+ // Arabic text in windows-1256
+ Charset windows1256 = Charset.forName("windows-1256");
+ String arabicText =
+ "\u0641\u064a \u0642\u0631\u064a\u0629 \u0645\u0646 " +
+ "\u0627\u0644\u0642\u0631\u0649 \u0643\u0627\u0646 " +
+ "\u0647\u0646\u0627\u0643 \u0631\u062c\u0644 " +
+ "\u062d\u0643\u064a\u0645 \u064a\u0639\u0631\u0641 " +
+ "\u0643\u0644 \u0634\u064a\u0621 \u0639\u0646 " +
+ "\u0627\u0644\u062d\u064a\u0627\u0629 \u0648\u0643\u0627\u0646
" +
+ "\u064a\u0639\u0644\u0645 \u0627\u0644\u0646\u0627\u0633 " +
+ "\u0643\u064a\u0641 \u064a\u0639\u064a\u0634\u0648\u0646 " +
+ "\u0628\u0633\u0644\u0627\u0645
\u0648\u0627\u0646\u0633\u062c\u0627\u0645.";
+ byte[] arabicBytes = arabicText.getBytes(windows1256);
+
+ // "hello world\r\n" as windows-1252
+ byte[] helloBytes = "hello
world\r\n".getBytes(StandardCharsets.US_ASCII);
+
+ System.out.println("=== Arabic bytes decoded with different charsets
===");
+ for (String csName : new String[]{"windows-1256", "x-MacCyrillic",
"UTF-8"}) {
+ Charset cs = Charset.forName(csName);
+ String decoded = CharSoupEncodingDetector.decode(arabicBytes, cs);
+ printScores(csName, decoded);
+ }
+
+ System.out.println("\n=== 'hello world\\r\\n' decoded with different
charsets ===");
+ for (String csName : new String[]{"windows-1252", "IBM500"}) {
+ Charset cs = Charset.forName(csName);
+ String decoded = CharSoupEncodingDetector.decode(helloBytes, cs);
+ printScores(csName, decoded);
+ }
+
+ // Also try some real-world short text
+ System.out.println("\n=== Short real text ===");
+ printScores("English sentence", "The quick brown fox jumps over the
lazy dog.");
+ printScores("French sentence", "Le renard brun rapide saute par-dessus
le chien paresseux.");
+ printScores("German sentence", "Der schnelle braune Fuchs springt
\u00fcber den faulen Hund.");
+ }
+
+ private void printScores(String label, String text) {
+ int totalChars = text.length();
+ int letterCount = 0;
+ int replacementCount = 0;
+ int controlCount = 0;
+ int spaceCount = 0;
+ int digitCount = 0;
+ int punctCount = 0;
+ int otherCount = 0;
+
+ for (int i = 0; i < text.length(); ) {
+ int cp = text.codePointAt(i);
+ i += Character.charCount(cp);
+
+ if (cp == 0xFFFD) {
+ replacementCount++;
+ } else if (Character.isISOControl(cp) || (cp >= 0x80 && cp <= 0x9F)) {
+ controlCount++;
+ } else if (Character.isLetter(cp)) {
+ letterCount++;
+ } else if (Character.isWhitespace(cp)) {
+ spaceCount++;
+ } else if (Character.isDigit(cp)) {
+ digitCount++;
+ } else if (isPunctuation(cp)) {
+ punctCount++;
+ } else {
+ otherCount++;
+ }
+ }
+
+ float letterRatio = totalChars > 0 ? (float) letterCount / totalChars : 0;
+ float junkRatio = totalChars > 0 ?
+ (float) (replacementCount + controlCount) / totalChars : 0;
+ float nonLetterNonSpaceRatio = totalChars > 0 ?
+ (float) (totalChars - letterCount - spaceCount) / totalChars : 0;
+
+ System.out.printf(Locale.ROOT,
+ " %-20s len=%3d letters=%.2f junk(repl+ctrl)=%.2f " +
+ "nonLetterNonSpace=%.2f [L=%d S=%d P=%d D=%d R=%d C=%d O=%d]%n",
+ label, totalChars, letterRatio, junkRatio, nonLetterNonSpaceRatio,
+ letterCount, spaceCount, punctCount, digitCount,
+ replacementCount, controlCount, otherCount);
+
+ // Show first 60 chars with hex for non-printable
+ StringBuilder preview = new StringBuilder();
+ for (int i = 0; i < Math.min(text.length(), 60); ) {
+ int cp = text.codePointAt(i);
+ i += Character.charCount(cp);
+ if (cp >= 0x20 && cp < 0x7F) {
+ preview.appendCodePoint(cp);
+ } else if (Character.isLetter(cp)) {
+ preview.appendCodePoint(cp);
+ } else {
+ preview.append(String.format(Locale.ROOT, "\\u%04X", cp));
+ }
+ }
+ System.out.printf(Locale.ROOT, " %-20s text: %s%n", "", preview);
+ }
+
+ private boolean isPunctuation(int cp) {
+ int type = Character.getType(cp);
+ return type == Character.CONNECTOR_PUNCTUATION ||
+ type == Character.DASH_PUNCTUATION ||
+ type == Character.END_PUNCTUATION ||
+ type == Character.FINAL_QUOTE_PUNCTUATION ||
+ type == Character.INITIAL_QUOTE_PUNCTUATION ||
+ type == Character.OTHER_PUNCTUATION ||
+ type == Character.START_PUNCTUATION;
+ }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
index 9824379de8..6363afc8a2 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
@@ -173,6 +173,12 @@
<version>${project.version}</version>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-langdetect-charsoup</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>tika-serialization</artifactId>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
index 2cd4af4913..2524ef404d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
@@ -36,11 +36,13 @@ import org.apache.tika.TikaTest;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.detect.CompositeEncodingDetector;
import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.detect.MetaEncodingDetector;
import org.apache.tika.detect.OverrideEncodingDetector;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AbstractEncodingDetectorParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.Parser;
@@ -57,10 +59,12 @@ public class TikaEncodingDetectorTest extends TikaTest {
EncodingDetector detector =
TikaLoader.loadDefault().loadEncodingDetectors();
assertTrue(detector instanceof CompositeEncodingDetector);
List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors();
- assertEquals(3, detectors.size());
+ // 3 base detectors + CharSoupEncodingDetector (MetaEncodingDetector)
+ assertEquals(4, detectors.size());
assertTrue(detectors.get(0) instanceof HtmlEncodingDetector);
assertTrue(detectors.get(1) instanceof UniversalEncodingDetector);
assertTrue(detectors.get(2) instanceof Icu4jEncodingDetector);
+ assertTrue(detectors.get(3) instanceof MetaEncodingDetector);
}
@Test
@@ -69,15 +73,18 @@ public class TikaEncodingDetectorTest extends TikaTest {
EncodingDetector detector = tikaLoader.loadEncodingDetectors();
assertTrue(detector instanceof CompositeEncodingDetector);
List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors();
+ // default-encoding-detector (inner composite) + override-encoding-detector
+ // The inner composite now includes CharSoupEncodingDetector from SPI
assertEquals(2, detectors.size());
EncodingDetector detector1 = detectors.get(0);
assertTrue(detector1 instanceof CompositeEncodingDetector);
List<EncodingDetector> detectors1Children =
((CompositeEncodingDetector) detector1).getDetectors();
- assertEquals(2, detectors1Children.size());
+ assertEquals(3, detectors1Children.size());
assertTrue(detectors1Children.get(0) instanceof UniversalEncodingDetector);
assertTrue(detectors1Children.get(1) instanceof Icu4jEncodingDetector);
+ assertTrue(detectors1Children.get(2) instanceof MetaEncodingDetector);
assertTrue(detectors.get(1) instanceof OverrideEncodingDetector);
@@ -175,7 +182,8 @@ public class TikaEncodingDetectorTest extends TikaTest {
((AbstractEncodingDetectorParser) encodingDetectingParser)
.getEncodingDetector();
assertTrue(encodingDetector instanceof CompositeEncodingDetector);
- assertEquals(2, ((CompositeEncodingDetector) encodingDetector).getDetectors().size());
+ // HtmlEncodingDetector, UniversalEncodingDetector, CharSoupEncodingDetector
+ assertEquals(3, ((CompositeEncodingDetector) encodingDetector).getDetectors().size());
for (EncodingDetector child : ((CompositeEncodingDetector) encodingDetector).getDetectors()) {
assertNotContained("cu4j",
child.getClass().getCanonicalName());
@@ -263,6 +271,41 @@ public class TikaEncodingDetectorTest extends TikaTest {
}
+ @Test
+ public void testExcludeCharSoupEncodingDetector() throws Exception {
+ TikaLoader tikaLoader = TikaLoaderHelper.getLoader(
+ "TIKA-4671-exclude-charsoup-encoding-detector.json");
+ EncodingDetector detector = tikaLoader.loadEncodingDetectors();
+ assertTrue(detector instanceof CompositeEncodingDetector);
+ List<EncodingDetector> detectors =
+ ((CompositeEncodingDetector) detector).getDetectors();
+ // 3 base detectors, no MetaEncodingDetector
+ assertEquals(3, detectors.size());
+ assertTrue(detectors.get(0) instanceof HtmlEncodingDetector);
+ assertTrue(detectors.get(1) instanceof UniversalEncodingDetector);
+ assertTrue(detectors.get(2) instanceof Icu4jEncodingDetector);
+ for (EncodingDetector d : detectors) {
+ assertNotContained("CharSoup", d.getClass().getSimpleName());
+ }
+ }
+
+ @Test
+ public void testArabicMisleadingCharsetHtml() throws Exception {
+ // This HTML file is encoded in windows-1256 but declares charset=UTF-8
+ // in the meta tag. The CharSoupEncodingDetector should override the
+ // misleading HTML meta and detect that the actual content is Arabic
+ // (windows-1256) because windows-1256 decoded text produces a higher
+ // language detection score.
+ Metadata metadata = new Metadata();
+ XMLResult result = getXML("testArabicMisleadingCharset.html",
metadata);
+ // Verify encoding was detected as windows-1256, not the misleading
UTF-8
+ assertEquals("windows-1256",
+ metadata.get(TikaCoreProperties.DETECTED_ENCODING));
+ // Verify extracted text contains readable Arabic, not mojibake
+ // \u0627\u0644\u0639\u0631\u0628\u064a\u0629 = "العربية" (Arabic)
+ assertContains("\u0627\u0644\u0639\u0631\u0628\u064a\u0629",
result.xml);
+ }
+
private void findEncodingDetectionParsers(Parser p, List<Parser> encodingDetectionParsers) {
if (p instanceof CompositeParser) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-4671-exclude-charsoup-encoding-detector.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-4671-exclude-charsoup-encoding-detector.json
new file mode 100644
index 0000000000..74ef9f5bac
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-4671-exclude-charsoup-encoding-detector.json
@@ -0,0 +1,11 @@
+{
+ "encoding-detectors": [
+ {
+ "default-encoding-detector": {
+ "exclude": [
+ "charsoup-encoding-detector"
+ ]
+ }
+ }
+ ]
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testArabicMisleadingCharset.html b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testArabicMisleadingCharset.html
new file mode 100644
index 0000000000..e9884177f2
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testArabicMisleadingCharset.html
@@ -0,0 +1,11 @@
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<title>Test Arabic</title>
+</head>
+<body>
+<p>�� ���� �� ����� ��� ���� ��� ���� ���� �� ��� �� ������ ���� ���� ����� ��� ������ ����� �������. ����� ������� �� ����� �� ���� ������ ������� �� ������ ������� ���� �� �������� ����� �����.</p>
+<p>�� ���� �� ����� ��� ���� ��� ���� ���� �� ��� �� ������ ���� ���� ����� ��� ������ ����� �������. ����� ������� �� ����� �� ���� ������ ������� �� ������ ������� ���� �� �������� ����� �����.</p>
+<p>�� ���� �� ����� ��� ���� ��� ���� ���� �� ��� �� ������ ���� ���� ����� ��� ������ ����� �������. ����� ������� �� ����� �� ���� ������ ������� �� ������ ������� ���� �� �������� ����� �����.</p>
+</body>
+</html>
\ No newline at end of file