This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4671-lang-aware-charset-detection in repository https://gitbox.apache.org/repos/asf/tika.git
commit c4a0f147fb21a52dbcde4582f43dbf16283c7a22 Author: tallison <[email protected]> AuthorDate: Wed Feb 18 18:01:03 2026 -0500 TIKA-4671 - language aware charset detection --- docs/modules/ROOT/nav.adoc | 1 + docs/modules/ROOT/pages/advanced/index.adoc | 1 + docs/modules/ROOT/pages/advanced/tika-eval.adoc | 294 +++++++++++++++++++++ .../tika/detect/CompositeEncodingDetector.java | 155 +++++++++-- .../tika/detect/DefaultEncodingDetector.java | 27 +- .../tika/detect/EncodingDetectorContext.java | 105 ++++++++ .../tika/language/detect/LanguageResult.java | 28 ++ .../apache/tika/metadata/TikaCoreProperties.java | 8 + .../charsoup/CharSoupLanguageDetector.java | 70 ++++- .../tika-parsers-standard-package/pom.xml | 6 + .../tika/config/TikaEncodingDetectorTest.java | 31 ++- 11 files changed, 693 insertions(+), 33 deletions(-) diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index d4bf3cb857..dc35714b63 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ -44,6 +44,7 @@ ** xref:advanced/spooling.adoc[Spooling] ** xref:advanced/embedded-documents.adoc[Embedded Document Metadata] ** xref:advanced/local-vlm-server.adoc[Running a Local VLM Server] +** xref:advanced/tika-eval.adoc[Text Quality Scoring] * xref:developers/index.adoc[Developers] ** xref:developers/serialization.adoc[Serialization and Configuration] * xref:faq.adoc[FAQ] diff --git a/docs/modules/ROOT/pages/advanced/index.adoc b/docs/modules/ROOT/pages/advanced/index.adoc index 72d1252269..26321e7527 100644 --- a/docs/modules/ROOT/pages/advanced/index.adoc +++ b/docs/modules/ROOT/pages/advanced/index.adoc @@ -28,6 +28,7 @@ This section covers advanced usage and internals of Apache Tika. * xref:advanced/spooling.adoc[TikaInputStream and Spooling] - Understanding how TikaInputStream handles buffering, caching, and spooling to disk * xref:advanced/embedded-documents.adoc[Embedded Document Metadata] - Understanding how Tika tracks embedded documents and their paths * xref:advanced/zip-detection.adoc[ZIP Detection and Salvaging] - How Tika detects and recovers truncated ZIP-based files +* xref:advanced/tika-eval.adoc[Text Quality Scoring] - Measuring extracted text quality using character bigram profiles * xref:advanced/local-vlm-server.adoc[Running a Local VLM Server] - Run an open-source VLM locally as an OpenAI-compatible endpoint for air-gapped OCR diff --git a/docs/modules/ROOT/pages/advanced/tika-eval.adoc b/docs/modules/ROOT/pages/advanced/tika-eval.adoc new file mode 100644 index 0000000000..00a086ec59 --- /dev/null +++ b/docs/modules/ROOT/pages/advanced/tika-eval.adoc @@ -0,0 +1,294 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + += Text Quality Scoring (tika-eval-lite) + +The `tika-eval-lite` module provides a lightweight text quality scorer +that measures how well extracted text matches known language patterns. +It uses character bigram frequency profiles derived from language corpora +and requires no external dependencies beyond `tika-core`. + +== Overview + +The scorer computes the average log~2~-likelihood per character bigram +against language-specific profiles. Higher (less negative) scores indicate +text that better matches known language patterns. The score is naturally +normalized by text length, so short and long texts produce comparable +values. + +Scores are designed for *comparison*, not absolute thresholds. Compare +two text variants (e.g., forward vs reversed, charset A vs charset B) +against the same language -- the higher score wins. + +== Use Cases + +* **RTL text direction detection** -- Score both the original and + reversed text against the same language profile. The higher score + indicates the correct reading order. +* **Charset detection** -- Score text decoded under candidate charsets. + The highest-scoring charset is most likely correct. +* **Mojibake / junk detection** -- Compare the extracted text's score + against known-good text in the same language. A significantly lower + score suggests garbled or wrong-charset text. + +== Maven Dependency + +[source,xml] +---- +<dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-eval-lite</artifactId> + <version>${tika.version}</version> +</dependency> +---- + +The module depends only on `tika-core` (provided scope). When +`tika-eval-lite` is on the classpath, `TextQualityScorer.getDefault()` +returns the `BigramTextQualityScorer` via the ServiceLoader/SPI +mechanism. When it is absent, a no-op scorer is returned. + +== Basic Usage + +[source,java] +---- +import org.apache.tika.textquality.TextQualityScorer; +import org.apache.tika.textquality.TextQualityResult; + +TextQualityScorer scorer = TextQualityScorer.getDefault(); + +TextQualityResult result = scorer.score("The quick brown fox..."); + +double score = result.getScore(); // e.g. -8.03 +String language = result.getLanguage(); // e.g. "eng" +double confidence = result.getConfidence(); // gap to 2nd-best +int bigramCount = result.getBigramCount(); // bigrams analyzed +---- + +=== Scoring Against a Specific Language + +When comparing two variants of the same text (e.g., for RTL detection +or charset selection), score both against the same language profile: + +[source,java] +---- +TextQualityResult forward = scorer.score(text, "ara"); +TextQualityResult reversed = scorer.score(reversedText, "ara"); + +// Higher score wins +if (forward.getScore() > reversed.getScore()) { + // forward is the correct reading order +} +---- + +=== Configuring Maximum Text Length + +By default, only the first 10,000 characters are analyzed. Beyond this +length, additional text adds negligible precision. To change: + +[source,java] +---- +BigramTextQualityScorer scorer = new BigramTextQualityScorer(); +scorer.setMaxTextLength(20_000); +---- + +== How It Works + +=== Character Bigram Extraction + +Input text is first normalized with NFKD decomposition and combining +mark (diacritic) removal. This matches the ICU folding applied to the +Wikipedia and Leipzig corpus data used to build the profiles. It also +prevents Arabic tashkeel (fatha, kasra, shadda, etc.) from breaking +the bigram chain. + +The scorer then extracts consecutive pairs of lowercase letters. 
Non-letter +characters (digits, punctuation, whitespace) act as word boundaries. For +each word, three types of bigrams are emitted: + +* **Internal bigrams**: consecutive letter pairs (`he`, `el`, `ll`, `lo`) +* **Word-start bigram**: `_h` (boundary marker + first letter) +* **Word-end bigram**: `o_` (last letter + boundary marker) + +For example, `"Hello, World!"` produces: `_h`, `he`, `el`, `ll`, `lo`, +`o_`, `_w`, `wo`, `or`, `rl`, `ld`, `d_`. + +Word-boundary bigrams are critical for RTL detection because word-initial +and word-final character patterns are highly directional. For example, +Hebrew final forms (sofit letters like ך ,ם ,ן) appear at word ends in +forward text but at word starts when reversed. + +=== Language Profiles + +Each language profile contains the top 500 character bigrams (including +word-boundary bigrams) and their log~2~-probabilities. Profiles are +derived from the `common_tokens` data in `tika-eval-core`, which was +built from Wikipedia and Leipzig corpus data processed through ICU +folding (NFKC normalization, case folding, diacritic removal). The +profiles cover 148 languages. Profiles from corpora with fewer than +500,000 total bigram occurrences are excluded to avoid unreliable +probability estimates. + +Each profile file includes precomputed statistics in its header: + +[cols="1,3"] +|=== +| Header | Description + +| `TOTAL_BIGRAMS` +| Total bigram occurrences in the source corpus. + +| `UNIQUE_BIGRAMS` +| Distinct bigram types observed in the corpus. + +| `UNSEEN_LOG_PROB` +| Estimated log~2~-probability for bigrams not in the profile + (see <<unseen-estimation>>). + +| `EXPECTED_SCORE` +| Expected average log~2~-likelihood for perfect text drawn from + this language's distribution (negative entropy). Stored for + reference; not used by the scorer at runtime. +|=== + +=== Scoring Algorithm + +For each language profile, the scorer computes: + +[stem] +++++ +\text{score} = \frac{1}{N} \sum_{i=1}^{N} \log_2 P(b_i) +++++ + +where _N_ is the total bigram count and _P(b~i~)_ is the probability +of bigram _b~i~_ under the profile. Bigrams not in the profile receive +the profile's unseen log-probability. + +The language with the highest score is selected as the best match. The +*confidence* is the score difference between the best and second-best +language. + +[[unseen-estimation]] +=== Unseen Bigram Estimation + +Rather than using an arbitrary fixed penalty for bigrams not in the +profile, the scorer uses held-out estimation from the corpus statistics: + +[stem] +++++ +P_{\text{unseen}} = \frac{1 - \sum_{j=1}^{K} P(b_j)}{U - K} +++++ + +where _K_ is the number of bigrams in the profile (500), _U_ is the +total number of unique bigrams in the corpus, and the numerator is the +remaining probability mass not covered by the profile. + +This produces per-language calibrated penalties: + +* *English* (606 unique bigrams, top-500 covers ~99%): unseen + log~2~-prob = -19.4 (harsh -- almost all bigrams are known) +* *Arabic* (835 unique bigrams, top-500 covers ~99%): unseen + log~2~-prob = -15.2 +* *Chinese* (29,673 unique bigrams, top-500 covers ~14.5%): unseen + log~2~-prob = -15.4 + +Note that Arabic and Chinese have similar per-bigram unseen penalties +despite very different coverage. This is because the ratio of remaining +mass to unseen count converges. The practical difference is captured by +how *often* unseen bigrams are hit, which is reflected in the expected +score. 
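+
+Putting the pieces above together, the following is a minimal sketch of
+the scoring pipeline, not the `BigramTextQualityScorer` implementation
+itself. It assumes a hypothetical `profile` map of bigram to
+log~2~-probability plus the profile's `UNSEEN_LOG_PROB` floor, and it
+omits the NFKD normalization and the maximum-text-length cap for
+brevity.
+
+[source,java]
+----
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+public class BigramScoringSketch {
+
+    /**
+     * Average log2-likelihood per bigram: known bigrams use the profile
+     * probability, unknown bigrams are charged the unseen floor.
+     */
+    static double score(String text, Map<String, Double> profile,
+                        double unseenLogProb) {
+        List<String> bigrams = extractBigrams(text);
+        if (bigrams.isEmpty()) {
+            return unseenLogProb;
+        }
+        double sum = 0.0;
+        for (String bigram : bigrams) {
+            sum += profile.getOrDefault(bigram, unseenLogProb);
+        }
+        return sum / bigrams.size();
+    }
+
+    /**
+     * Emit internal bigrams plus word-start ("_x") and word-end ("x_")
+     * bigrams; non-letter characters act as word boundaries.
+     */
+    static List<String> extractBigrams(String text) {
+        List<String> bigrams = new ArrayList<>();
+        StringBuilder word = new StringBuilder();
+        for (int i = 0; i <= text.length(); i++) {
+            char c = i < text.length()
+                    ? Character.toLowerCase(text.charAt(i)) : ' ';
+            if (Character.isLetter(c)) {
+                word.append(c);
+            } else if (word.length() > 0) {
+                bigrams.add("_" + word.charAt(0));
+                for (int j = 0; j + 1 < word.length(); j++) {
+                    bigrams.add(word.substring(j, j + 2));
+                }
+                bigrams.add(word.charAt(word.length() - 1) + "_");
+                word.setLength(0);
+            }
+        }
+        return bigrams;
+    }
+}
+----
+
+Applied to `"Hello, World!"`, `extractBigrams` returns the twelve
+bigrams listed earlier, and `score` averages their log~2~-probabilities,
+charging any bigram missing from the profile at the unseen floor.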
+ +== Known Limitations + +=== CJK Coverage + +The top-500 bigram profiles cover only ~14.5% of Chinese character +bigrams (compared to ~99% for English). This means most CJK bigrams +in the input text will hit the unseen floor penalty, compressing the +score range and reducing discrimination between good and garbled CJK +text. + +For CJK mojibake detection, complement bigram scoring with +script-level checks: replacement characters (U+FFFD), unexpected +script mixing, and CID/GID fallback patterns are more reliable +signals than bigram scores for CJK text. + +=== Arabic Alphabet Symmetry + +Arabic has a small alphabet (28 letters). Approximately 82% of bigrams +in the Arabic profile have their reverse also present in the profile. +This means the forward/reverse score difference for Arabic text is +modest (~0.6 bits/bigram), compared to Hebrew (~1.2 bits/bigram) +which benefits from distinctive sofit (final-form) letters at word +boundaries. + +Arabic RTL detection still works -- the signal is real, just smaller. +Word-boundary bigrams help significantly (Arabic word-start and +word-end patterns are more asymmetric than internal bigrams). + +=== Not a Language Detector + +While the scorer identifies the best-matching language profile, it +is not designed as a general-purpose language detector. It lacks +the sophistication of dedicated tools (e.g., language priors, +n-gram interpolation, script-based shortcuts). Use it for quality +scoring and comparison, not language identification. + +=== Raw Scores Are Language-Dependent + +Raw scores vary by language (e.g., English ~-8.0 vs Chinese ~-13.7 +for good text) because languages differ in character inventory size +and bigram entropy. Each profile's header includes an `EXPECTED_SCORE` +(the negative entropy of the language model) for reference, but +the scorer does not use it at runtime. All three intended use cases +-- RTL detection, charset detection, and mojibake detection -- work +by comparing two variants, so absolute score normalization is +unnecessary. + +== Regenerating Profiles + +Profiles are generated from `tika-eval-core`'s `common_tokens` data. +The generation logic is documented and reproducible via +`BigramProfileGenerator`: + +[source,bash] +---- +java -cp tika-eval-lite.jar \ + org.apache.tika.eval.textquality.BigramProfileGenerator \ + path/to/common_tokens \ + path/to/output/bigram_profiles \ + 500 +---- + +The generator reads each language's token file, decomposes words into +character bigrams weighted by term frequency, selects the top-N, and +writes profile files with all precomputed statistics. 
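+
+Once profiles are in place (bundled or regenerated), the comparison-based
+use cases from the top of this page reduce to a simple loop. The sketch
+below shows charset selection; it uses only the documented
+`TextQualityScorer.getDefault()`, `score(text, language)` and
+`getScore()` calls, while the candidate charsets, the target language,
+and the class and method names are illustrative assumptions.
+
+[source,java]
+----
+import java.nio.charset.Charset;
+import java.util.List;
+
+import org.apache.tika.textquality.TextQualityResult;
+import org.apache.tika.textquality.TextQualityScorer;
+
+public class CharsetComparisonSketch {
+
+    /**
+     * Decode the same bytes under each candidate charset and return the
+     * charset whose decoding scores highest against the given language
+     * profile (null if there are no candidates).
+     */
+    static Charset pickCharset(byte[] rawBytes, List<Charset> candidates,
+                               String language) {
+        TextQualityScorer scorer = TextQualityScorer.getDefault();
+        Charset best = null;
+        double bestScore = Double.NEGATIVE_INFINITY;
+        for (Charset candidate : candidates) {
+            String decoded = new String(rawBytes, candidate);
+            TextQualityResult result = scorer.score(decoded, language);
+            if (result.getScore() > bestScore) {
+                bestScore = result.getScore();
+                best = candidate;
+            }
+        }
+        return best;
+    }
+}
+----
+
+The same pattern covers RTL detection (forward vs reversed text) and
+mojibake checks (extracted text vs known-good text in the same
+language): both variants are scored against the same profile, so no
+absolute threshold is required.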
+ +== Architecture + +The interface and result class live in `tika-core` to allow scoring +without pulling in additional dependencies: + +* `org.apache.tika.textquality.TextQualityScorer` -- abstract class + with SPI discovery +* `org.apache.tika.textquality.TextQualityResult` -- immutable result + +The implementation lives in `tika-eval-lite`: + +* `org.apache.tika.eval.textquality.BigramTextQualityScorer` -- + bigram-based scorer (discovered via `META-INF/services`) +* `org.apache.tika.eval.textquality.BigramProfileGenerator` -- + profile generation and documentation of formulas diff --git a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java index e5c6152e6a..5fbf44dace 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java @@ -19,56 +19,105 @@ package org.apache.tika.detect; import java.io.IOException; import java.io.Serializable; import java.nio.charset.Charset; +import java.util.ArrayList; import java.util.Collection; import java.util.Collections; -import java.util.LinkedList; import java.util.List; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; +/** + * A composite encoding detector that runs child detectors. + * + * <p>If a {@link MetaEncodingDetector} is among the children, this + * composite switches from first-match-wins to collect-all mode: + * all base detectors run first and their results are collected in an + * {@link EncodingDetectorContext}, then the meta detector runs last + * to arbitrate. Only one meta detector is supported.</p> + * + * <p>If no meta detector is present, the first non-null result wins + * (traditional behavior).</p> + */ public class CompositeEncodingDetector implements EncodingDetector, Serializable { - /** - * Serial version UID - */ private static final long serialVersionUID = 5980683158436430252L; + private static final Logger LOG = + LoggerFactory.getLogger(CompositeEncodingDetector.class); + private final List<EncodingDetector> detectors; + private final List<EncodingDetector> baseDetectors; + private final MetaEncodingDetector metaDetector; public CompositeEncodingDetector(List<EncodingDetector> detectors, Collection<Class<? extends EncodingDetector>> excludeEncodingDetectors) { - this.detectors = new LinkedList<>(); + this.detectors = new ArrayList<>(); for (EncodingDetector encodingDetector : detectors) { if (!isExcluded(excludeEncodingDetectors, encodingDetector.getClass())) { this.detectors.add(encodingDetector); } } - + this.baseDetectors = new ArrayList<>(); + this.metaDetector = partition(this.detectors, baseDetectors); } public CompositeEncodingDetector(List<EncodingDetector> detectors) { - this.detectors = new LinkedList<>(); - this.detectors.addAll(detectors); + this.detectors = new ArrayList<>(detectors); + this.baseDetectors = new ArrayList<>(); + this.metaDetector = partition(this.detectors, baseDetectors); } /** - * @param tis text document input stream, or <code>null</code> - * @param metadata input metadata for the document - * @return the detected Charset or null if no charset could be detected - * @throws IOException + * Partition detectors into base detectors and at most one meta detector. 
*/ + private static MetaEncodingDetector partition( + List<EncodingDetector> all, List<EncodingDetector> base) { + MetaEncodingDetector meta = null; + for (EncodingDetector d : all) { + if (d instanceof MetaEncodingDetector) { + if (meta == null) { + meta = (MetaEncodingDetector) d; + } else { + LOG.warn("Multiple MetaEncodingDetectors found; " + + "ignoring {}", + d.getClass().getName()); + } + } else { + base.add(d); + } + } + return meta; + } + @Override - public Charset detect(TikaInputStream tis, Metadata metadata, ParseContext parseContext) throws IOException { + public Charset detect(TikaInputStream tis, Metadata metadata, + ParseContext parseContext) throws IOException { + if (metaDetector != null) { + return detectWithMeta(tis, metadata, parseContext); + } + return detectFirstMatch(tis, metadata, parseContext); + } + + /** + * Traditional first-match-wins behavior. + */ + private Charset detectFirstMatch(TikaInputStream tis, Metadata metadata, + ParseContext parseContext) + throws IOException { for (EncodingDetector detector : getDetectors()) { Charset detected = detector.detect(tis, metadata, parseContext); if (detected != null) { - metadata.set(TikaCoreProperties.DETECTED_ENCODING, detected.name()); - //if this has been set by a leaf detector, do not overwrite - if (! detector.getClass().getSimpleName().equals("CompositeEncodingDetector")) { + metadata.set(TikaCoreProperties.DETECTED_ENCODING, + detected.name()); + if (!detector.getClass().getSimpleName() + .equals("CompositeEncodingDetector")) { metadata.set(TikaCoreProperties.ENCODING_DETECTOR, detector.getClass().getSimpleName()); } @@ -78,6 +127,80 @@ public class CompositeEncodingDetector implements EncodingDetector, Serializable return null; } + /** + * Collect-all mode: run every base detector, populate context, + * then let the meta detector arbitrate. 
+ */ + private Charset detectWithMeta(TikaInputStream tis, Metadata metadata, + ParseContext parseContext) + throws IOException { + EncodingDetectorContext context = new EncodingDetectorContext(); + parseContext.set(EncodingDetectorContext.class, context); + try { + for (EncodingDetector detector : baseDetectors) { + Charset detected = + detector.detect(tis, metadata, parseContext); + if (detected != null) { + context.addResult(detected, + detector.getClass().getSimpleName()); + } + } + + Charset result = + metaDetector.detect(tis, metadata, parseContext); + + // If meta detector returned null (disabled or no candidates), + // fall back to first base detector's result + if (result == null && !context.getResults().isEmpty()) { + EncodingDetectorContext.Result first = + context.getResults().get(0); + result = first.getCharset(); + metadata.set(TikaCoreProperties.DETECTED_ENCODING, + result.name()); + metadata.set(TikaCoreProperties.ENCODING_DETECTOR, + first.getDetectorName()); + } else if (result != null) { + metadata.set(TikaCoreProperties.DETECTED_ENCODING, + result.name()); + String detectorName = + metaDetector.getClass().getSimpleName(); + for (EncodingDetectorContext.Result r : + context.getResults()) { + if (r.getCharset().equals(result)) { + detectorName = r.getDetectorName(); + break; + } + } + metadata.set(TikaCoreProperties.ENCODING_DETECTOR, + detectorName); + } + + // Build and set the detection trace + metadata.set(TikaCoreProperties.ENCODING_DETECTION_TRACE, + buildTrace(context)); + + return result; + } finally { + parseContext.set(EncodingDetectorContext.class, null); + } + } + + private static String buildTrace(EncodingDetectorContext context) { + StringBuilder sb = new StringBuilder(); + for (EncodingDetectorContext.Result r : context.getResults()) { + if (sb.length() > 0) { + sb.append(", "); + } + sb.append(r.getDetectorName()).append("->") + .append(r.getCharset().name()); + } + String info = context.getArbitrationInfo(); + if (info != null) { + sb.append(" (").append(info).append(")"); + } + return sb.toString(); + } + public List<EncodingDetector> getDetectors() { return Collections.unmodifiableList(detectors); } diff --git a/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java index 67cf26e27d..72dd3ba4c0 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java @@ -22,15 +22,20 @@ import javax.imageio.spi.ServiceRegistry; import org.apache.tika.config.ServiceLoader; /** - * A composite encoding detector based on all the {@link EncodingDetector} implementations - * available through the {@link ServiceRegistry service provider mechanism}. Those - * loaded via the service provider mechanism are ordered by how they appear in the - * file, if there is a single service file. If multiple, there is no guarantee of order. - * <p> - * <p> - * If you need to control the order of the Detectors, you should instead - * construct your own {@link CompositeDetector} and pass in the list - * of Detectors in the required order. + * A composite encoding detector based on all the {@link EncodingDetector} + * implementations available through the + * {@link ServiceRegistry service provider mechanism}. + * + * <p>Those loaded via the service provider mechanism are ordered by how + * they appear in the file, if there is a single service file. 
If + * multiple, there is no guarantee of order.</p> + * + * <p>If you need to control the order of the Detectors, you should + * instead construct your own {@link CompositeEncodingDetector} and pass + * in the list of Detectors in the required order.</p> + * + * <p>{@link MetaEncodingDetector} handling (collect-all-then-arbitrate) + * is provided by {@link CompositeEncodingDetector}.</p> * * @since Apache Tika 1.15 */ @@ -47,7 +52,7 @@ public class DefaultEncodingDetector extends CompositeEncodingDetector { public DefaultEncodingDetector(ServiceLoader loader, Collection<Class<? extends EncodingDetector>> excludeEncodingDetectors) { - super(loader.loadServiceProviders(EncodingDetector.class), excludeEncodingDetectors); + super(loader.loadServiceProviders(EncodingDetector.class), + excludeEncodingDetectors); } - } diff --git a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java new file mode 100644 index 0000000000..6ac55f87da --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.detect; + +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; + +/** + * Context object that collects encoding detection results from base + * detectors. Stored in {@link org.apache.tika.parser.ParseContext} by + * {@link DefaultEncodingDetector} so that the {@link MetaEncodingDetector} + * can see all candidates and arbitrate. Removed after detection to + * prevent contamination during recursive parsing. + * + * @since Apache Tika 3.2 + */ +public class EncodingDetectorContext { + + private final List<Result> results = new ArrayList<>(); + private String arbitrationInfo; + + /** + * Record a detection result from a child detector. + * + * @param charset the detected charset (must not be null) + * @param detectorName the simple class name of the detector + */ + public void addResult(Charset charset, String detectorName) { + results.add(new Result(charset, detectorName)); + } + + /** + * @return unmodifiable list of all results in detection order + */ + public List<Result> getResults() { + return Collections.unmodifiableList(results); + } + + /** + * @return unique charsets in detection order + */ + public Set<Charset> getUniqueCharsets() { + Set<Charset> charsets = new LinkedHashSet<>(); + for (Result r : results) { + charsets.add(r.getCharset()); + } + return charsets; + } + + /** + * Set by the meta detector to describe how it reached its decision. 
+ * Values: "unanimous", "compatible", "scored", "too-few-bigrams", "disabled". + */ + public void setArbitrationInfo(String info) { + this.arbitrationInfo = info; + } + + public String getArbitrationInfo() { + return arbitrationInfo; + } + + /** + * A single detection result pairing a charset with the detector that found it. + */ + public static class Result { + private final Charset charset; + private final String detectorName; + + public Result(Charset charset, String detectorName) { + this.charset = charset; + this.detectorName = detectorName; + } + + public Charset getCharset() { + return charset; + } + + public String getDetectorName() { + return detectorName; + } + + @Override + public String toString() { + return detectorName + "=" + charset.name(); + } + } +} diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java index dada5fda17..348232587e 100644 --- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java +++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java @@ -31,14 +31,32 @@ public class LanguageResult { // greater confidence. private final float rawScore; + // Detector-agnostic confidence score (0.0 to 1.0, higher = more confident). + // Detectors can populate this however makes sense for their internals + // (e.g., entropy-derived for CharSoup, probability-based for OpenNLP). + // Defaults to rawScore for backwards compatibility. + private final float confidenceScore; + /** * @param language ISO 639-1 language code (plus optional country code) * @param rawScore confidence of detector in the result. */ public LanguageResult(String language, LanguageConfidence confidence, float rawScore) { + this(language, confidence, rawScore, rawScore); + } + + /** + * @param language ISO 639-1 language code (plus optional country code) + * @param rawScore detector-specific score (e.g., softmax probability) + * @param confidenceScore detector-agnostic confidence (0.0 to 1.0, higher = more confident). + * For comparing results across different decodings or detectors. + */ + public LanguageResult(String language, LanguageConfidence confidence, + float rawScore, float confidenceScore) { this.language = language; this.confidence = confidence; this.rawScore = rawScore; + this.confidenceScore = confidenceScore; } /** @@ -54,6 +72,16 @@ public class LanguageResult { return rawScore; } + /** + * Detector-agnostic confidence score (0.0 to 1.0). Higher values indicate + * the detector is more confident in the result. This can be used to compare + * results across different text decodings (e.g., for encoding detection) + * without knowing the detector implementation. 
+ */ + public float getConfidenceScore() { + return confidenceScore; + } + public LanguageConfidence getConfidence() { return confidence; } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java index fdd52259e3..0d57f8cd03 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java @@ -414,6 +414,14 @@ public interface TikaCoreProperties { */ Property ENCODING_DETECTOR = Property.externalText(TIKA_META_PREFIX + "encodingDetector"); + /** + * Diagnostic trace showing which encoding detectors ran and what each returned, + * plus the arbitration method used when detectors disagreed. + * Example: {@code "HtmlEncodingDetector->UTF-8, Icu4jEncodingDetector->windows-1256 (scored)"} + */ + Property ENCODING_DETECTION_TRACE = + Property.externalText(TIKA_META_PREFIX + "encodingDetectionTrace"); + /** * General metadata key for the count of non-final versions available within a file. This * was added initially to support generalizing incremental updates in PDF. diff --git a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java index 1fa6e2cce3..6e60e88447 100644 --- a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java +++ b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java @@ -26,6 +26,9 @@ import java.util.List; import java.util.Map; import java.util.Set; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import org.apache.tika.config.TikaComponent; import org.apache.tika.language.detect.LanguageConfidence; import org.apache.tika.language.detect.LanguageDetector; @@ -53,6 +56,9 @@ import org.apache.tika.language.detect.LanguageResult; @TikaComponent public class CharSoupLanguageDetector extends LanguageDetector { + private static final Logger LOG = + LoggerFactory.getLogger(CharSoupLanguageDetector.class); + private static final String MODEL_RESOURCE = "/org/apache/tika/langdetect/charsoup/langdetect.bin"; @@ -267,6 +273,43 @@ public class CharSoupLanguageDetector extends LanguageDetector { return lastEntropy; } + /** + * Compare multiple candidate texts and return the key of the one with + * the strongest language signal (lowest entropy). This is useful for + * encoding detection: decode raw bytes with each candidate charset, + * pass the decoded texts here, and the winner is the best charset. 
+ * + * @param candidates map of arbitrary keys to candidate text strings + * @param <K> key type (e.g., {@link java.nio.charset.Charset}) + * @return the key whose text has the strongest language signal, + * or {@code null} if the map is empty + */ + public <K> K compareLanguageSignal(Map<K, String> candidates) { + if (candidates.isEmpty()) { + return null; + } + + float bestEntropy = Float.MAX_VALUE; + K bestKey = null; + + for (Map.Entry<K, String> entry : candidates.entrySet()) { + reset(); + addText(entry.getValue()); + detectAll(); + float entropy = getDistributionEntropy(); + + LOG.debug("compareLanguageSignal: {} -> entropy={}", + entry.getKey(), entropy); + + if (entropy < bestEntropy) { + bestEntropy = entropy; + bestKey = entry.getKey(); + } + } + + return bestKey; + } + @Override public LanguageDetector loadModels() throws IOException { // Models are loaded statically; nothing to do. @@ -367,6 +410,22 @@ public class CharSoupLanguageDetector extends LanguageDetector { return buildResults(bestProbs); } + /** + * Maximum meaningful entropy (bits) for normalizing confidenceScore. + * log2(numClasses) for ~165 classes is ~7.4. We cap at 7.0 so that + * even moderately uncertain text gets a near-zero confidenceScore. + */ + private static final float MAX_ENTROPY = 7.0f; + + /** + * Convert entropy to a 0-1 confidence score. Lower entropy = higher confidence. + * Uses 1/(1+entropy) to preserve discrimination even at very low entropies, + * unlike a linear mapping which saturates at 1.0 too quickly. + */ + private static float entropyToConfidenceScore(float entropy) { + return 1.0f / (1.0f + entropy); + } + /** * Build sorted LanguageResult list from raw probabilities. */ @@ -374,18 +433,23 @@ public class CharSoupLanguageDetector extends LanguageDetector { // Compute entropy on collapsed distribution float[] collapsed = collapseGroups(probs, GROUP_INDICES); lastEntropy = CharSoupModel.entropy(collapsed); + float confScore = entropyToConfidenceScore(lastEntropy); // Build results from raw probabilities sorted by probability descending List<LanguageResult> results = new ArrayList<>(MODEL.getNumClasses()); for (int c = 0; c < MODEL.getNumClasses(); c++) { results.add(new LanguageResult( - MODEL.getLabel(c), toConfidence(probs[c], lastEntropy), probs[c])); + MODEL.getLabel(c), toConfidence(probs[c], lastEntropy), + probs[c], confScore)); } results.sort((a, b) -> Float.compare(b.getRawScore(), a.getRawScore())); - // If top score is below NONE threshold, return NULL + // If top score is below NONE threshold, return a NULL-like result + // but preserve the confidenceScore so encoding arbitration can + // still compare across candidate decodings. 
if (results.get(0).getConfidence() == LanguageConfidence.NONE) { - return Collections.singletonList(LanguageResult.NULL); + return Collections.singletonList( + new LanguageResult("", LanguageConfidence.NONE, 0.0f, confScore)); } return results; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml index 9824379de8..6363afc8a2 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml @@ -173,6 +173,12 @@ <version>${project.version}</version> <scope>test</scope> </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-langdetect-charsoup</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> <dependency> <groupId>${project.groupId}</groupId> <artifactId>tika-serialization</artifactId> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java index 2cd4af4913..bbd3caf272 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java @@ -36,11 +36,13 @@ import org.apache.tika.TikaTest; import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.detect.CompositeEncodingDetector; import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.detect.MetaEncodingDetector; import org.apache.tika.detect.OverrideEncodingDetector; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.AbstractEncodingDetectorParser; import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.Parser; @@ -57,10 +59,12 @@ public class TikaEncodingDetectorTest extends TikaTest { EncodingDetector detector = TikaLoader.loadDefault().loadEncodingDetectors(); assertTrue(detector instanceof CompositeEncodingDetector); List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors(); - assertEquals(3, detectors.size()); + // 3 base detectors + CharSoupEncodingDetector (MetaEncodingDetector) + assertEquals(4, detectors.size()); assertTrue(detectors.get(0) instanceof HtmlEncodingDetector); assertTrue(detectors.get(1) instanceof UniversalEncodingDetector); assertTrue(detectors.get(2) instanceof Icu4jEncodingDetector); + assertTrue(detectors.get(3) instanceof MetaEncodingDetector); } @Test @@ -69,15 +73,18 @@ public class TikaEncodingDetectorTest extends TikaTest { EncodingDetector detector = tikaLoader.loadEncodingDetectors(); assertTrue(detector instanceof CompositeEncodingDetector); List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors(); + // default-encoding-detector (inner composite) + override-encoding-detector + // The inner composite now includes CharSoupEncodingDetector from SPI assertEquals(2, detectors.size()); EncodingDetector detector1 = detectors.get(0); assertTrue(detector1 instanceof 
CompositeEncodingDetector); List<EncodingDetector> detectors1Children = ((CompositeEncodingDetector) detector1).getDetectors(); - assertEquals(2, detectors1Children.size()); + assertEquals(3, detectors1Children.size()); assertTrue(detectors1Children.get(0) instanceof UniversalEncodingDetector); assertTrue(detectors1Children.get(1) instanceof Icu4jEncodingDetector); + assertTrue(detectors1Children.get(2) instanceof MetaEncodingDetector); assertTrue(detectors.get(1) instanceof OverrideEncodingDetector); @@ -175,7 +182,8 @@ public class TikaEncodingDetectorTest extends TikaTest { ((AbstractEncodingDetectorParser) encodingDetectingParser) .getEncodingDetector(); assertTrue(encodingDetector instanceof CompositeEncodingDetector); - assertEquals(2, ((CompositeEncodingDetector) encodingDetector).getDetectors().size()); + // HtmlEncodingDetector, UniversalEncodingDetector, CharSoupEncodingDetector + assertEquals(3, ((CompositeEncodingDetector) encodingDetector).getDetectors().size()); for (EncodingDetector child : ((CompositeEncodingDetector) encodingDetector) .getDetectors()) { assertNotContained("cu4j", child.getClass().getCanonicalName()); @@ -263,6 +271,23 @@ public class TikaEncodingDetectorTest extends TikaTest { } + @Test + public void testArabicMisleadingCharsetHtml() throws Exception { + // This HTML file is encoded in windows-1256 but declares charset=UTF-8 + // in the meta tag. The CharSoupEncodingDetector should override the + // misleading HTML meta and detect that the actual content is Arabic + // (windows-1256) because windows-1256 decoded text produces a higher + // language detection score. + Metadata metadata = new Metadata(); + XMLResult result = getXML("testArabicMisleadingCharset.html", metadata); + // Verify encoding was detected as windows-1256, not the misleading UTF-8 + assertEquals("windows-1256", + metadata.get(TikaCoreProperties.DETECTED_ENCODING)); + // Verify extracted text contains readable Arabic, not mojibake + // \u0627\u0644\u0639\u0631\u0628\u064a\u0629 = "العربية" (Arabic) + assertContains("\u0627\u0644\u0639\u0631\u0628\u064a\u0629", result.xml); + } + private void findEncodingDetectionParsers(Parser p, List<Parser> encodingDetectionParsers) { if (p instanceof CompositeParser) {
