This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
commit c4d67657a0b6eaaaf3ddb33448ebc0321e2df87d
Author: tallison <[email protected]>
AuthorDate: Wed Feb 18 18:03:33 2026 -0500

    Revert "TIKA-4671 - language aware charset detection"

    This reverts commit 52f7d4c82e0ce0a3747a5db181fd4e55641f968b.
---
 docs/modules/ROOT/nav.adoc                         |   1 -
 docs/modules/ROOT/pages/advanced/index.adoc        |   1 -
 docs/modules/ROOT/pages/advanced/tika-eval.adoc    | 294 ---------------------
 .../tika/detect/CompositeEncodingDetector.java     | 155 ++----------
 .../tika/detect/DefaultEncodingDetector.java       |  27 +-
 .../tika/detect/EncodingDetectorContext.java       | 105 --------
 .../tika/language/detect/LanguageResult.java       |  28 --
 .../apache/tika/metadata/TikaCoreProperties.java   |   8 -
 .../charsoup/CharSoupLanguageDetector.java         |  70 +----
 .../tika-parsers-standard-package/pom.xml          |   6 -
 .../tika/config/TikaEncodingDetectorTest.java      |  31 +--
 11 files changed, 33 insertions(+), 693 deletions(-)

diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc
index dc35714b63..d4bf3cb857 100644
--- a/docs/modules/ROOT/nav.adoc
+++ b/docs/modules/ROOT/nav.adoc
@@ -44,7 +44,6 @@
 ** xref:advanced/spooling.adoc[Spooling]
 ** xref:advanced/embedded-documents.adoc[Embedded Document Metadata]
 ** xref:advanced/local-vlm-server.adoc[Running a Local VLM Server]
-** xref:advanced/tika-eval.adoc[Text Quality Scoring]
 * xref:developers/index.adoc[Developers]
 ** xref:developers/serialization.adoc[Serialization and Configuration]
 * xref:faq.adoc[FAQ]

diff --git a/docs/modules/ROOT/pages/advanced/index.adoc b/docs/modules/ROOT/pages/advanced/index.adoc
index 26321e7527..72d1252269 100644
--- a/docs/modules/ROOT/pages/advanced/index.adoc
+++ b/docs/modules/ROOT/pages/advanced/index.adoc
@@ -28,7 +28,6 @@ This section covers advanced usage and internals of Apache Tika.
 * xref:advanced/spooling.adoc[TikaInputStream and Spooling] - Understanding how TikaInputStream handles buffering, caching, and spooling to disk
 * xref:advanced/embedded-documents.adoc[Embedded Document Metadata] - Understanding how Tika tracks embedded documents and their paths
 * xref:advanced/zip-detection.adoc[ZIP Detection and Salvaging] - How Tika detects and recovers truncated ZIP-based files
-* xref:advanced/tika-eval.adoc[Text Quality Scoring] - Measuring extracted text quality using character bigram profiles
 * xref:advanced/local-vlm-server.adoc[Running a Local VLM Server] - Run an open-source VLM locally as an OpenAI-compatible endpoint for air-gapped OCR

diff --git a/docs/modules/ROOT/pages/advanced/tika-eval.adoc b/docs/modules/ROOT/pages/advanced/tika-eval.adoc
deleted file mode 100644
index 00a086ec59..0000000000
--- a/docs/modules/ROOT/pages/advanced/tika-eval.adoc
+++ /dev/null
@@ -1,294 +0,0 @@
-//
-// Licensed to the Apache Software Foundation (ASF) under one or more
-// contributor license agreements.  See the NOTICE file distributed with
-// this work for additional information regarding copyright ownership.
-// The ASF licenses this file to You under the Apache License, Version 2.0
-// (the "License"); you may not use this file except in compliance with
-// the License.  You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-= Text Quality Scoring (tika-eval-lite)
-
-The `tika-eval-lite` module provides a lightweight text quality scorer
-that measures how well extracted text matches known language patterns.
-It uses character bigram frequency profiles derived from language corpora
-and requires no external dependencies beyond `tika-core`.
-
-== Overview
-
-The scorer computes the average log~2~-likelihood per character bigram
-against language-specific profiles. Higher (less negative) scores indicate
-text that better matches known language patterns. The score is naturally
-normalized by text length, so short and long texts produce comparable
-values.
-
-Scores are designed for *comparison*, not absolute thresholds. Compare
-two text variants (e.g., forward vs reversed, charset A vs charset B)
-against the same language -- the higher score wins.
-
-== Use Cases
-
-* **RTL text direction detection** -- Score both the original and
-  reversed text against the same language profile. The higher score
-  indicates the correct reading order.
-* **Charset detection** -- Score text decoded under candidate charsets.
-  The highest-scoring charset is most likely correct.
-* **Mojibake / junk detection** -- Compare the extracted text's score
-  against known-good text in the same language. A significantly lower
-  score suggests garbled or wrong-charset text.
-
-== Maven Dependency
-
-[source,xml]
-----
-<dependency>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-eval-lite</artifactId>
-    <version>${tika.version}</version>
-</dependency>
-----
-
-The module depends only on `tika-core` (provided scope). When
-`tika-eval-lite` is on the classpath, `TextQualityScorer.getDefault()`
-returns the `BigramTextQualityScorer` via the ServiceLoader/SPI
-mechanism. When it is absent, a no-op scorer is returned.
-
-== Basic Usage
-
-[source,java]
-----
-import org.apache.tika.textquality.TextQualityScorer;
-import org.apache.tika.textquality.TextQualityResult;
-
-TextQualityScorer scorer = TextQualityScorer.getDefault();
-
-TextQualityResult result = scorer.score("The quick brown fox...");
-
-double score = result.getScore();           // e.g. -8.03
-String language = result.getLanguage();     // e.g. "eng"
-double confidence = result.getConfidence(); // gap to 2nd-best
-int bigramCount = result.getBigramCount();  // bigrams analyzed
-----
-
-=== Scoring Against a Specific Language
-
-When comparing two variants of the same text (e.g., for RTL detection
-or charset selection), score both against the same language profile:
-
-[source,java]
-----
-TextQualityResult forward = scorer.score(text, "ara");
-TextQualityResult reversed = scorer.score(reversedText, "ara");
-
-// Higher score wins
-if (forward.getScore() > reversed.getScore()) {
-    // forward is the correct reading order
-}
-----
-
-=== Configuring Maximum Text Length
-
-By default, only the first 10,000 characters are analyzed. Beyond this
-length, additional text adds negligible precision. To change:
-
-[source,java]
-----
-BigramTextQualityScorer scorer = new BigramTextQualityScorer();
-scorer.setMaxTextLength(20_000);
-----
-
-== How It Works
-
-=== Character Bigram Extraction
-
-Input text is first normalized with NFKD decomposition and combining
-mark (diacritic) removal. This matches the ICU folding applied to the
-Wikipedia and Leipzig corpus data used to build the profiles. It also
-prevents Arabic tashkeel (fatha, kasra, shadda, etc.) from breaking
-the bigram chain.
-
-The scorer then extracts consecutive pairs of lowercase letters. Non-letter
-characters (digits, punctuation, whitespace) act as word boundaries. For
-each word, three types of bigrams are emitted:
-
-* **Internal bigrams**: consecutive letter pairs (`he`, `el`, `ll`, `lo`)
-* **Word-start bigram**: `_h` (boundary marker + first letter)
-* **Word-end bigram**: `o_` (last letter + boundary marker)
-
-For example, `"Hello, World!"` produces: `_h`, `he`, `el`, `ll`, `lo`,
-`o_`, `_w`, `wo`, `or`, `rl`, `ld`, `d_`.
-
-Word-boundary bigrams are critical for RTL detection because word-initial
-and word-final character patterns are highly directional. For example,
-Hebrew final forms (sofit letters like ך ,ם ,ן) appear at word ends in
-forward text but at word starts when reversed.
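[Editorial note: for orientation, the extraction rule the deleted page describes
above fits in a few lines of Java. This is a minimal sketch with a hypothetical
helper name, not the actual BigramTextQualityScorer internals, and it skips the
NFKD/diacritic-stripping step described in the section:]

    import java.util.ArrayList;
    import java.util.List;
    import java.util.Locale;

    public class BigramSketch {
        // Hypothetical helper: emits word-start, internal, and word-end bigrams.
        static List<String> bigrams(String text) {
            List<String> out = new ArrayList<>();
            // Non-letters (digits, punctuation, whitespace) act as word boundaries.
            for (String word : text.toLowerCase(Locale.ROOT).split("[^\\p{L}]+")) {
                if (word.isEmpty()) {
                    continue;
                }
                out.add("_" + word.charAt(0));                 // word-start bigram
                for (int i = 0; i + 1 < word.length(); i++) {
                    out.add(word.substring(i, i + 2));         // internal bigram
                }
                out.add(word.charAt(word.length() - 1) + "_"); // word-end bigram
            }
            return out;
        }

        public static void main(String[] args) {
            // Matches the example in the page above:
            // [_h, he, el, ll, lo, o_, _w, wo, or, rl, ld, d_]
            System.out.println(bigrams("Hello, World!"));
        }
    }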
-
-=== Language Profiles
-
-Each language profile contains the top 500 character bigrams (including
-word-boundary bigrams) and their log~2~-probabilities. Profiles are
-derived from the `common_tokens` data in `tika-eval-core`, which was
-built from Wikipedia and Leipzig corpus data processed through ICU
-folding (NFKC normalization, case folding, diacritic removal). The
-profiles cover 148 languages. Profiles from corpora with fewer than
-500,000 total bigram occurrences are excluded to avoid unreliable
-probability estimates.
-
-Each profile file includes precomputed statistics in its header:
-
-[cols="1,3"]
-|===
-| Header | Description
-
-| `TOTAL_BIGRAMS`
-| Total bigram occurrences in the source corpus.
-
-| `UNIQUE_BIGRAMS`
-| Distinct bigram types observed in the corpus.
-
-| `UNSEEN_LOG_PROB`
-| Estimated log~2~-probability for bigrams not in the profile
-  (see <<unseen-estimation>>).
-
-| `EXPECTED_SCORE`
-| Expected average log~2~-likelihood for perfect text drawn from
-  this language's distribution (negative entropy). Stored for
-  reference; not used by the scorer at runtime.
-|===
-
-=== Scoring Algorithm
-
-For each language profile, the scorer computes:
-
-[stem]
-++++
-\text{score} = \frac{1}{N} \sum_{i=1}^{N} \log_2 P(b_i)
-++++
-
-where _N_ is the total bigram count and _P(b~i~)_ is the probability
-of bigram _b~i~_ under the profile. Bigrams not in the profile receive
-the profile's unseen log-probability.
-
-The language with the highest score is selected as the best match. The
-*confidence* is the score difference between the best and second-best
-language.
-
-[[unseen-estimation]]
-=== Unseen Bigram Estimation
-
-Rather than using an arbitrary fixed penalty for bigrams not in the
-profile, the scorer uses held-out estimation from the corpus statistics:
-
-[stem]
-++++
-P_{\text{unseen}} = \frac{1 - \sum_{j=1}^{K} P(b_j)}{U - K}
-++++
-
-where _K_ is the number of bigrams in the profile (500), _U_ is the
-total number of unique bigrams in the corpus, and the numerator is the
-remaining probability mass not covered by the profile.
-
-This produces per-language calibrated penalties:
-
-* *English* (606 unique bigrams, top-500 covers ~99%): unseen
-  log~2~-prob = -19.4 (harsh -- almost all bigrams are known)
-* *Arabic* (835 unique bigrams, top-500 covers ~99%): unseen
-  log~2~-prob = -15.2
-* *Chinese* (29,673 unique bigrams, top-500 covers ~14.5%): unseen
-  log~2~-prob = -15.4
-
-Note that Arabic and Chinese have similar per-bigram unseen penalties
-despite very different coverage. This is because the ratio of remaining
-mass to unseen count converges. The practical difference is captured by
-how *often* unseen bigrams are hit, which is reflected in the expected
-score.
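[Editorial note: a sketch of the scoring rule above, under the same caveat --
the profile map, its loading, and the unseen floor value are stand-ins, not
the real profile format:]

    import java.util.List;
    import java.util.Map;

    public class ScoreSketch {
        /**
         * Average log2-likelihood per bigram: (1/N) * sum of log2 P(b_i),
         * with bigrams missing from the profile charged the unseen floor.
         */
        static double score(List<String> bigrams,
                            Map<String, Double> log2Probs,  // top-500 profile (stand-in)
                            double unseenLogProb) {         // UNSEEN_LOG_PROB header value
            if (bigrams.isEmpty()) {
                return Double.NEGATIVE_INFINITY;            // nothing to score
            }
            double sum = 0.0;
            for (String b : bigrams) {
                sum += log2Probs.getOrDefault(b, unseenLogProb);
            }
            return sum / bigrams.size();
        }
    }

The best language is then the arg-max of this average across profiles, and the
confidence is the gap to the runner-up, exactly as the section above describes.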
-
-== Known Limitations
-
-=== CJK Coverage
-
-The top-500 bigram profiles cover only ~14.5% of Chinese character
-bigrams (compared to ~99% for English). This means most CJK bigrams
-in the input text will hit the unseen floor penalty, compressing the
-score range and reducing discrimination between good and garbled CJK
-text.
-
-For CJK mojibake detection, complement bigram scoring with
-script-level checks: replacement characters (U+FFFD), unexpected
-script mixing, and CID/GID fallback patterns are more reliable
-signals than bigram scores for CJK text.
-
-=== Arabic Alphabet Symmetry
-
-Arabic has a small alphabet (28 letters). Approximately 82% of bigrams
-in the Arabic profile have their reverse also present in the profile.
-This means the forward/reverse score difference for Arabic text is
-modest (~0.6 bits/bigram), compared to Hebrew (~1.2 bits/bigram)
-which benefits from distinctive sofit (final-form) letters at word
-boundaries.
-
-Arabic RTL detection still works -- the signal is real, just smaller.
-Word-boundary bigrams help significantly (Arabic word-start and
-word-end patterns are more asymmetric than internal bigrams).
-
-=== Not a Language Detector
-
-While the scorer identifies the best-matching language profile, it
-is not designed as a general-purpose language detector. It lacks
-the sophistication of dedicated tools (e.g., language priors,
-n-gram interpolation, script-based shortcuts). Use it for quality
-scoring and comparison, not language identification.
-
-=== Raw Scores Are Language-Dependent
-
-Raw scores vary by language (e.g., English ~-8.0 vs Chinese ~-13.7
-for good text) because languages differ in character inventory size
-and bigram entropy. Each profile's header includes an `EXPECTED_SCORE`
-(the negative entropy of the language model) for reference, but
-the scorer does not use it at runtime. All three intended use cases
--- RTL detection, charset detection, and mojibake detection -- work
-by comparing two variants, so absolute score normalization is
-unnecessary.
-
-== Regenerating Profiles
-
-Profiles are generated from `tika-eval-core`'s `common_tokens` data.
-The generation logic is documented and reproducible via
-`BigramProfileGenerator`:
-
-[source,bash]
-----
-java -cp tika-eval-lite.jar \
-    org.apache.tika.eval.textquality.BigramProfileGenerator \
-    path/to/common_tokens \
-    path/to/output/bigram_profiles \
-    500
-----
-
-The generator reads each language's token file, decomposes words into
-character bigrams weighted by term frequency, selects the top-N, and
-writes profile files with all precomputed statistics.
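[Editorial note: the CJK caveat above recommends script-level checks as a
complement to bigram scoring. A rough sketch of two such signals, using only
java.lang.Character; what thresholds a caller applies is left open, and this
is not code from the deleted module:]

    import java.util.HashSet;
    import java.util.Set;

    public class ScriptCheckSketch {
        // Fraction of code points that are U+FFFD REPLACEMENT CHARACTER.
        static double replacementCharRate(String text) {
            if (text.isEmpty()) {
                return 0.0;
            }
            long fffd = text.codePoints().filter(cp -> cp == 0xFFFD).count();
            return (double) fffd / text.codePoints().count();
        }

        // Distinct letter scripts seen; unexpected mixing can signal mojibake.
        static Set<Character.UnicodeScript> letterScripts(String text) {
            Set<Character.UnicodeScript> scripts = new HashSet<>();
            text.codePoints()
                .filter(Character::isLetter)
                .forEach(cp -> scripts.add(Character.UnicodeScript.of(cp)));
            return scripts;
        }
    }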
-
-== Architecture
-
-The interface and result class live in `tika-core` to allow scoring
-without pulling in additional dependencies:
-
-* `org.apache.tika.textquality.TextQualityScorer` -- abstract class
-  with SPI discovery
-* `org.apache.tika.textquality.TextQualityResult` -- immutable result
-
-The implementation lives in `tika-eval-lite`:
-
-* `org.apache.tika.eval.textquality.BigramTextQualityScorer` --
-  bigram-based scorer (discovered via `META-INF/services`)
-* `org.apache.tika.eval.textquality.BigramProfileGenerator` --
-  profile generation and documentation of formulas
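[Editorial note: the SPI discovery mentioned above would follow the standard
ServiceLoader layout. The registration file would presumably look like the
following; the path and contents are inferred from the class names above,
not copied from the deleted module:]

    src/main/resources/META-INF/services/org.apache.tika.textquality.TextQualityScorer:
        org.apache.tika.eval.textquality.BigramTextQualityScorer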
diff --git a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
index 5fbf44dace..e5c6152e6a 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
@@ -19,105 +19,56 @@ package org.apache.tika.detect;
 
 import java.io.IOException;
 import java.io.Serializable;
 import java.nio.charset.Charset;
-import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.LinkedList;
 import java.util.List;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 
-/**
- * A composite encoding detector that runs child detectors.
- *
- * <p>If a {@link MetaEncodingDetector} is among the children, this
- * composite switches from first-match-wins to collect-all mode:
- * all base detectors run first and their results are collected in an
- * {@link EncodingDetectorContext}, then the meta detector runs last
- * to arbitrate. Only one meta detector is supported.</p>
- *
- * <p>If no meta detector is present, the first non-null result wins
- * (traditional behavior).</p>
- */
 public class CompositeEncodingDetector implements EncodingDetector, Serializable {
 
+    /**
+     * Serial version UID
+     */
     private static final long serialVersionUID = 5980683158436430252L;
 
-    private static final Logger LOG =
-            LoggerFactory.getLogger(CompositeEncodingDetector.class);
-
     private final List<EncodingDetector> detectors;
-    private final List<EncodingDetector> baseDetectors;
-    private final MetaEncodingDetector metaDetector;
 
     public CompositeEncodingDetector(List<EncodingDetector> detectors,
                                      Collection<Class<? extends EncodingDetector>> excludeEncodingDetectors) {
-        this.detectors = new ArrayList<>();
+        this.detectors = new LinkedList<>();
         for (EncodingDetector encodingDetector : detectors) {
             if (!isExcluded(excludeEncodingDetectors, encodingDetector.getClass())) {
                 this.detectors.add(encodingDetector);
             }
         }
-        this.baseDetectors = new ArrayList<>();
-        this.metaDetector = partition(this.detectors, baseDetectors);
+
    }
 
    public CompositeEncodingDetector(List<EncodingDetector> detectors) {
-        this.detectors = new ArrayList<>(detectors);
-        this.baseDetectors = new ArrayList<>();
-        this.metaDetector = partition(this.detectors, baseDetectors);
+        this.detectors = new LinkedList<>();
+        this.detectors.addAll(detectors);
    }
 
    /**
-     * Partition detectors into base detectors and at most one meta detector.
+     * @param tis text document input stream, or <code>null</code>
+     * @param metadata input metadata for the document
+     * @return the detected Charset or null if no charset could be detected
+     * @throws IOException
     */
-    private static MetaEncodingDetector partition(
-            List<EncodingDetector> all, List<EncodingDetector> base) {
-        MetaEncodingDetector meta = null;
-        for (EncodingDetector d : all) {
-            if (d instanceof MetaEncodingDetector) {
-                if (meta == null) {
-                    meta = (MetaEncodingDetector) d;
-                } else {
-                    LOG.warn("Multiple MetaEncodingDetectors found; " +
-                            "ignoring {}",
-                            d.getClass().getName());
-                }
-            } else {
-                base.add(d);
-            }
-        }
-        return meta;
-    }
-
-    @Override
-    public Charset detect(TikaInputStream tis, Metadata metadata,
-                          ParseContext parseContext) throws IOException {
-        if (metaDetector != null) {
-            return detectWithMeta(tis, metadata, parseContext);
-        }
-        return detectFirstMatch(tis, metadata, parseContext);
-    }
-
-    /**
-     * Traditional first-match-wins behavior.
-     */
-    private Charset detectFirstMatch(TikaInputStream tis, Metadata metadata,
-                                     ParseContext parseContext)
-            throws IOException {
+    public Charset detect(TikaInputStream tis, Metadata metadata, ParseContext parseContext) throws IOException {
        for (EncodingDetector detector : getDetectors()) {
            Charset detected = detector.detect(tis, metadata, parseContext);
            if (detected != null) {
-                metadata.set(TikaCoreProperties.DETECTED_ENCODING,
-                        detected.name());
-                if (!detector.getClass().getSimpleName()
-                        .equals("CompositeEncodingDetector")) {
+                metadata.set(TikaCoreProperties.DETECTED_ENCODING, detected.name());
+                //if this has been set by a leaf detector, do not overwrite
+                if (! detector.getClass().getSimpleName().equals("CompositeEncodingDetector")) {
                    metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
                            detector.getClass().getSimpleName());
                }
@@ -127,80 +78,6 @@ public class CompositeEncodingDetector implements EncodingDetector, Serializable
        return null;
    }
 
-    /**
-     * Collect-all mode: run every base detector, populate context,
-     * then let the meta detector arbitrate.
-     */
-    private Charset detectWithMeta(TikaInputStream tis, Metadata metadata,
-                                   ParseContext parseContext)
-            throws IOException {
-        EncodingDetectorContext context = new EncodingDetectorContext();
-        parseContext.set(EncodingDetectorContext.class, context);
-        try {
-            for (EncodingDetector detector : baseDetectors) {
-                Charset detected =
-                        detector.detect(tis, metadata, parseContext);
-                if (detected != null) {
-                    context.addResult(detected,
-                            detector.getClass().getSimpleName());
-                }
-            }
-
-            Charset result =
-                    metaDetector.detect(tis, metadata, parseContext);
-
-            // If meta detector returned null (disabled or no candidates),
-            // fall back to first base detector's result
-            if (result == null && !context.getResults().isEmpty()) {
-                EncodingDetectorContext.Result first =
-                        context.getResults().get(0);
-                result = first.getCharset();
-                metadata.set(TikaCoreProperties.DETECTED_ENCODING,
-                        result.name());
-                metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
-                        first.getDetectorName());
-            } else if (result != null) {
-                metadata.set(TikaCoreProperties.DETECTED_ENCODING,
-                        result.name());
-                String detectorName =
-                        metaDetector.getClass().getSimpleName();
-                for (EncodingDetectorContext.Result r :
-                        context.getResults()) {
-                    if (r.getCharset().equals(result)) {
-                        detectorName = r.getDetectorName();
-                        break;
-                    }
-                }
-                metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
-                        detectorName);
-            }
-
-            // Build and set the detection trace
-            metadata.set(TikaCoreProperties.ENCODING_DETECTION_TRACE,
-                    buildTrace(context));
-
-            return result;
-        } finally {
-            parseContext.set(EncodingDetectorContext.class, null);
-        }
-    }
-
-    private static String buildTrace(EncodingDetectorContext context) {
-        StringBuilder sb = new StringBuilder();
-        for (EncodingDetectorContext.Result r : context.getResults()) {
-            if (sb.length() > 0) {
-                sb.append(", ");
-            }
-            sb.append(r.getDetectorName()).append("->")
-                    .append(r.getCharset().name());
-        }
-        String info = context.getArbitrationInfo();
-        if (info != null) {
-            sb.append(" (").append(info).append(")");
-        }
-        return sb.toString();
-    }
-
     public List<EncodingDetector> getDetectors() {
         return Collections.unmodifiableList(detectors);
     }
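[Editorial note: with the meta-detector machinery removed, the class above is
back to plain first-match-wins. A minimal sketch of that contract, assuming
EncodingDetector is the single-method interface shown in this diff, so a
lambda suffices; the null stream is allowed per the javadoc added above:]

    import java.nio.charset.Charset;
    import java.nio.charset.StandardCharsets;
    import java.util.Arrays;

    import org.apache.tika.detect.CompositeEncodingDetector;
    import org.apache.tika.detect.EncodingDetector;
    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.ParseContext;

    public class FirstMatchWinsSketch {
        public static void main(String[] args) throws Exception {
            EncodingDetector abstains = (tis, metadata, context) -> null; // no opinion
            EncodingDetector answers = (tis, metadata, context) -> StandardCharsets.UTF_8;

            CompositeEncodingDetector composite =
                    new CompositeEncodingDetector(Arrays.asList(abstains, answers));

            // The first non-null child result wins; detect() records it in
            // tika:detectedEncoding / tika:encodingDetector as shown above.
            Charset detected = composite.detect(null, new Metadata(), new ParseContext());
            System.out.println(detected); // UTF-8
        }
    }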
Those + * loaded via the service provider mechanism are ordered by how they appear in the + * file, if there is a single service file. If multiple, there is no guarantee of order. + * <p> + * <p> + * If you need to control the order of the Detectors, you should instead + * construct your own {@link CompositeDetector} and pass in the list + * of Detectors in the required order. * * @since Apache Tika 1.15 */ @@ -52,7 +47,7 @@ public class DefaultEncodingDetector extends CompositeEncodingDetector { public DefaultEncodingDetector(ServiceLoader loader, Collection<Class<? extends EncodingDetector>> excludeEncodingDetectors) { - super(loader.loadServiceProviders(EncodingDetector.class), - excludeEncodingDetectors); + super(loader.loadServiceProviders(EncodingDetector.class), excludeEncodingDetectors); } + } diff --git a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java deleted file mode 100644 index 6ac55f87da..0000000000 --- a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.detect; - -import java.nio.charset.Charset; -import java.util.ArrayList; -import java.util.Collections; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Set; - -/** - * Context object that collects encoding detection results from base - * detectors. Stored in {@link org.apache.tika.parser.ParseContext} by - * {@link DefaultEncodingDetector} so that the {@link MetaEncodingDetector} - * can see all candidates and arbitrate. Removed after detection to - * prevent contamination during recursive parsing. - * - * @since Apache Tika 3.2 - */ -public class EncodingDetectorContext { - - private final List<Result> results = new ArrayList<>(); - private String arbitrationInfo; - - /** - * Record a detection result from a child detector. - * - * @param charset the detected charset (must not be null) - * @param detectorName the simple class name of the detector - */ - public void addResult(Charset charset, String detectorName) { - results.add(new Result(charset, detectorName)); - } - - /** - * @return unmodifiable list of all results in detection order - */ - public List<Result> getResults() { - return Collections.unmodifiableList(results); - } - - /** - * @return unique charsets in detection order - */ - public Set<Charset> getUniqueCharsets() { - Set<Charset> charsets = new LinkedHashSet<>(); - for (Result r : results) { - charsets.add(r.getCharset()); - } - return charsets; - } - - /** - * Set by the meta detector to describe how it reached its decision. 
- * Values: "unanimous", "compatible", "scored", "too-few-bigrams", "disabled". - */ - public void setArbitrationInfo(String info) { - this.arbitrationInfo = info; - } - - public String getArbitrationInfo() { - return arbitrationInfo; - } - - /** - * A single detection result pairing a charset with the detector that found it. - */ - public static class Result { - private final Charset charset; - private final String detectorName; - - public Result(Charset charset, String detectorName) { - this.charset = charset; - this.detectorName = detectorName; - } - - public Charset getCharset() { - return charset; - } - - public String getDetectorName() { - return detectorName; - } - - @Override - public String toString() { - return detectorName + "=" + charset.name(); - } - } -} diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java index 348232587e..dada5fda17 100644 --- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java +++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java @@ -31,32 +31,14 @@ public class LanguageResult { // greater confidence. private final float rawScore; - // Detector-agnostic confidence score (0.0 to 1.0, higher = more confident). - // Detectors can populate this however makes sense for their internals - // (e.g., entropy-derived for CharSoup, probability-based for OpenNLP). - // Defaults to rawScore for backwards compatibility. - private final float confidenceScore; - /** * @param language ISO 639-1 language code (plus optional country code) * @param rawScore confidence of detector in the result. */ public LanguageResult(String language, LanguageConfidence confidence, float rawScore) { - this(language, confidence, rawScore, rawScore); - } - - /** - * @param language ISO 639-1 language code (plus optional country code) - * @param rawScore detector-specific score (e.g., softmax probability) - * @param confidenceScore detector-agnostic confidence (0.0 to 1.0, higher = more confident). - * For comparing results across different decodings or detectors. - */ - public LanguageResult(String language, LanguageConfidence confidence, - float rawScore, float confidenceScore) { this.language = language; this.confidence = confidence; this.rawScore = rawScore; - this.confidenceScore = confidenceScore; } /** @@ -72,16 +54,6 @@ public class LanguageResult { return rawScore; } - /** - * Detector-agnostic confidence score (0.0 to 1.0). Higher values indicate - * the detector is more confident in the result. This can be used to compare - * results across different text decodings (e.g., for encoding detection) - * without knowing the detector implementation. 
diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java
index 348232587e..dada5fda17 100644
--- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java
+++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java
@@ -31,32 +31,14 @@ public class LanguageResult {
     // greater confidence.
     private final float rawScore;
 
-    // Detector-agnostic confidence score (0.0 to 1.0, higher = more confident).
-    // Detectors can populate this however makes sense for their internals
-    // (e.g., entropy-derived for CharSoup, probability-based for OpenNLP).
-    // Defaults to rawScore for backwards compatibility.
-    private final float confidenceScore;
-
     /**
      * @param language ISO 639-1 language code (plus optional country code)
      * @param rawScore confidence of detector in the result.
      */
     public LanguageResult(String language, LanguageConfidence confidence, float rawScore) {
-        this(language, confidence, rawScore, rawScore);
-    }
-
-    /**
-     * @param language ISO 639-1 language code (plus optional country code)
-     * @param rawScore detector-specific score (e.g., softmax probability)
-     * @param confidenceScore detector-agnostic confidence (0.0 to 1.0, higher = more confident).
-     *                        For comparing results across different decodings or detectors.
-     */
-    public LanguageResult(String language, LanguageConfidence confidence,
-                          float rawScore, float confidenceScore) {
         this.language = language;
         this.confidence = confidence;
         this.rawScore = rawScore;
-        this.confidenceScore = confidenceScore;
     }
 
     /**
@@ -72,16 +54,6 @@ public class LanguageResult {
         return rawScore;
     }
 
-    /**
-     * Detector-agnostic confidence score (0.0 to 1.0). Higher values indicate
-     * the detector is more confident in the result. This can be used to compare
-     * results across different text decodings (e.g., for encoding detection)
-     * without knowing the detector implementation.
-     */
-    public float getConfidenceScore() {
-        return confidenceScore;
-    }
-
     public LanguageConfidence getConfidence() {
         return confidence;
     }
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 0d57f8cd03..fdd52259e3 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -414,14 +414,6 @@ public interface TikaCoreProperties {
      */
     Property ENCODING_DETECTOR = Property.externalText(TIKA_META_PREFIX + "encodingDetector");
 
-    /**
-     * Diagnostic trace showing which encoding detectors ran and what each returned,
-     * plus the arbitration method used when detectors disagreed.
-     * Example: {@code "HtmlEncodingDetector->UTF-8, Icu4jEncodingDetector->windows-1256 (scored)"}
-     */
-    Property ENCODING_DETECTION_TRACE =
-            Property.externalText(TIKA_META_PREFIX + "encodingDetectionTrace");
-
     /**
      * General metadata key for the count of non-final versions available within a file. This
      * was added initially to support generalizing incremental updates in PDF.
diff --git a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java
index 6e60e88447..1fa6e2cce3 100644
--- a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java
+++ b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java
@@ -26,9 +26,6 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
 import org.apache.tika.config.TikaComponent;
 import org.apache.tika.language.detect.LanguageConfidence;
 import org.apache.tika.language.detect.LanguageDetector;
@@ -56,9 +53,6 @@ import org.apache.tika.language.detect.LanguageResult;
 @TikaComponent
 public class CharSoupLanguageDetector extends LanguageDetector {
 
-    private static final Logger LOG =
-            LoggerFactory.getLogger(CharSoupLanguageDetector.class);
-
     private static final String MODEL_RESOURCE =
             "/org/apache/tika/langdetect/charsoup/langdetect.bin";
 
@@ -273,43 +267,6 @@ public class CharSoupLanguageDetector extends LanguageDetector {
         return lastEntropy;
     }
 
-    /**
-     * Compare multiple candidate texts and return the key of the one with
-     * the strongest language signal (lowest entropy). This is useful for
-     * encoding detection: decode raw bytes with each candidate charset,
-     * pass the decoded texts here, and the winner is the best charset.
-     *
-     * @param candidates map of arbitrary keys to candidate text strings
-     * @param <K> key type (e.g., {@link java.nio.charset.Charset})
-     * @return the key whose text has the strongest language signal,
-     *         or {@code null} if the map is empty
-     */
-    public <K> K compareLanguageSignal(Map<K, String> candidates) {
-        if (candidates.isEmpty()) {
-            return null;
-        }
-
-        float bestEntropy = Float.MAX_VALUE;
-        K bestKey = null;
-
-        for (Map.Entry<K, String> entry : candidates.entrySet()) {
-            reset();
-            addText(entry.getValue());
-            detectAll();
-            float entropy = getDistributionEntropy();
-
-            LOG.debug("compareLanguageSignal: {} -> entropy={}",
-                    entry.getKey(), entropy);
-
-            if (entropy < bestEntropy) {
-                bestEntropy = entropy;
-                bestKey = entry.getKey();
-            }
-        }
-
-        return bestKey;
-    }
-
     @Override
     public LanguageDetector loadModels() throws IOException {
         // Models are loaded statically; nothing to do.
@@ -410,22 +367,6 @@ public class CharSoupLanguageDetector extends LanguageDetector {
         return buildResults(bestProbs);
     }
 
-    /**
-     * Maximum meaningful entropy (bits) for normalizing confidenceScore.
-     * log2(numClasses) for ~165 classes is ~7.4. We cap at 7.0 so that
-     * even moderately uncertain text gets a near-zero confidenceScore.
-     */
-    private static final float MAX_ENTROPY = 7.0f;
-
-    /**
-     * Convert entropy to a 0-1 confidence score. Lower entropy = higher confidence.
-     * Uses 1/(1+entropy) to preserve discrimination even at very low entropies,
-     * unlike a linear mapping which saturates at 1.0 too quickly.
-     */
-    private static float entropyToConfidenceScore(float entropy) {
-        return 1.0f / (1.0f + entropy);
-    }
-
     /**
      * Build sorted LanguageResult list from raw probabilities.
      */
@@ -433,23 +374,18 @@ public class CharSoupLanguageDetector extends LanguageDetector {
         // Compute entropy on collapsed distribution
         float[] collapsed = collapseGroups(probs, GROUP_INDICES);
         lastEntropy = CharSoupModel.entropy(collapsed);
-        float confScore = entropyToConfidenceScore(lastEntropy);
 
         // Build results from raw probabilities sorted by probability descending
         List<LanguageResult> results = new ArrayList<>(MODEL.getNumClasses());
         for (int c = 0; c < MODEL.getNumClasses(); c++) {
             results.add(new LanguageResult(
-                    MODEL.getLabel(c), toConfidence(probs[c], lastEntropy),
-                    probs[c], confScore));
+                    MODEL.getLabel(c), toConfidence(probs[c], lastEntropy), probs[c]));
         }
         results.sort((a, b) -> Float.compare(b.getRawScore(), a.getRawScore()));
 
-        // If top score is below NONE threshold, return a NULL-like result
-        // but preserve the confidenceScore so encoding arbitration can
-        // still compare across candidate decodings.
+        // If top score is below NONE threshold, return NULL
         if (results.get(0).getConfidence() == LanguageConfidence.NONE) {
-            return Collections.singletonList(
-                    new LanguageResult("", LanguageConfidence.NONE, 0.0f, confScore));
+            return Collections.singletonList(LanguageResult.NULL);
         }
 
         return results;
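[Editorial note: the entropy-to-confidence mapping removed above is small
enough to restate for reference. The formula is verbatim from the deleted
code; the sample values are added for orientation:]

    // Removed in this revert: lower entropy -> higher confidence, and
    // 1/(1+entropy) keeps discrimination at low entropies instead of
    // saturating the way a linear map would.
    static float entropyToConfidenceScore(float entropy) {
        return 1.0f / (1.0f + entropy);
    }
    // entropy 0.0 -> 1.0, entropy 1.0 -> 0.5, entropy 7.0 -> 0.125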
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
index 6363afc8a2..9824379de8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
@@ -173,12 +173,6 @@
       <version>${project.version}</version>
       <scope>test</scope>
     </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-langdetect-charsoup</artifactId>
-      <version>${project.version}</version>
-      <scope>test</scope>
-    </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
       <artifactId>tika-serialization</artifactId>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
index bbd3caf272..2cd4af4913 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
@@ -36,13 +36,11 @@ import org.apache.tika.TikaTest;
 import org.apache.tika.config.loader.TikaLoader;
 import org.apache.tika.detect.CompositeEncodingDetector;
 import org.apache.tika.detect.EncodingDetector;
-import org.apache.tika.detect.MetaEncodingDetector;
 import org.apache.tika.detect.OverrideEncodingDetector;
 import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AbstractEncodingDetectorParser;
 import org.apache.tika.parser.CompositeParser;
 import org.apache.tika.parser.Parser;
@@ -59,12 +57,10 @@ public class TikaEncodingDetectorTest extends TikaTest {
         EncodingDetector detector = TikaLoader.loadDefault().loadEncodingDetectors();
         assertTrue(detector instanceof CompositeEncodingDetector);
         List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors();
-        // 3 base detectors + CharSoupEncodingDetector (MetaEncodingDetector)
-        assertEquals(4, detectors.size());
+        assertEquals(3, detectors.size());
         assertTrue(detectors.get(0) instanceof HtmlEncodingDetector);
         assertTrue(detectors.get(1) instanceof UniversalEncodingDetector);
         assertTrue(detectors.get(2) instanceof Icu4jEncodingDetector);
-        assertTrue(detectors.get(3) instanceof MetaEncodingDetector);
     }
 
     @Test
@@ -73,18 +69,15 @@ public class TikaEncodingDetectorTest extends TikaTest {
         EncodingDetector detector = tikaLoader.loadEncodingDetectors();
         assertTrue(detector instanceof CompositeEncodingDetector);
         List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors();
-        // default-encoding-detector (inner composite) + override-encoding-detector
-        // The inner composite now includes CharSoupEncodingDetector from SPI
         assertEquals(2, detectors.size());
         EncodingDetector detector1 = detectors.get(0);
         assertTrue(detector1 instanceof CompositeEncodingDetector);
         List<EncodingDetector> detectors1Children =
                 ((CompositeEncodingDetector) detector1).getDetectors();
-        assertEquals(3, detectors1Children.size());
+        assertEquals(2, detectors1Children.size());
         assertTrue(detectors1Children.get(0) instanceof UniversalEncodingDetector);
         assertTrue(detectors1Children.get(1) instanceof Icu4jEncodingDetector);
-        assertTrue(detectors1Children.get(2) instanceof MetaEncodingDetector);
 
         assertTrue(detectors.get(1) instanceof OverrideEncodingDetector);
 
@@ -182,8 +175,7 @@ public class TikaEncodingDetectorTest extends TikaTest {
                 ((AbstractEncodingDetectorParser) encodingDetectingParser)
                         .getEncodingDetector();
         assertTrue(encodingDetector instanceof CompositeEncodingDetector);
-        // HtmlEncodingDetector, UniversalEncodingDetector, CharSoupEncodingDetector
-        assertEquals(3, ((CompositeEncodingDetector) encodingDetector).getDetectors().size());
+        assertEquals(2, ((CompositeEncodingDetector) encodingDetector).getDetectors().size());
         for (EncodingDetector child : ((CompositeEncodingDetector) encodingDetector)
                 .getDetectors()) {
             assertNotContained("cu4j", child.getClass().getCanonicalName());
@@ -271,23 +263,6 @@ public class TikaEncodingDetectorTest extends TikaTest {
 
     }
 
-    @Test
-    public void testArabicMisleadingCharsetHtml() throws Exception {
-        // This HTML file is encoded in windows-1256 but declares charset=UTF-8
-        // in the meta tag. The CharSoupEncodingDetector should override the
-        // misleading HTML meta and detect that the actual content is Arabic
-        // (windows-1256) because windows-1256 decoded text produces a higher
-        // language detection score.
-        Metadata metadata = new Metadata();
-        XMLResult result = getXML("testArabicMisleadingCharset.html", metadata);
-        // Verify encoding was detected as windows-1256, not the misleading UTF-8
-        assertEquals("windows-1256",
-                metadata.get(TikaCoreProperties.DETECTED_ENCODING));
-        // Verify extracted text contains readable Arabic, not mojibake
-        // \u0627\u0644\u0639\u0631\u0628\u064a\u0629 = "العربية" (Arabic)
-        assertContains("\u0627\u0644\u0639\u0631\u0628\u064a\u0629", result.xml);
-    }
-
     private void findEncodingDetectionParsers(Parser p, List<Parser> encodingDetectionParsers) {
 
         if (p instanceof CompositeParser) {
