This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4671-lang-aware-charset-detection
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 006739ce4b9d7eb534a6b06b1fd10be5029139cf
Author: tallison <[email protected]>
AuthorDate: Thu Feb 19 08:05:30 2026 -0500

    TIKA-4671 - tweaks
---
 .../langdetect/charsoup/CharSoupEncodingDetector.java  |  2 +-
 .../langdetect/charsoup/CharSoupLanguageDetector.java  |  2 +-
 .../apache/tika/config/TikaEncodingDetectorTest.java   | 18 ++++++++++++++++++
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
index cb393cadf7..75176f69fc 100644
--- a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
+++ b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
@@ -54,7 +54,7 @@ import org.apache.tika.parser.ParseContext;
  *
  * @since Apache Tika 3.2
  */
-@TikaComponent
+@TikaComponent(name = "charsoup-encoding-detector")
 public class CharSoupEncodingDetector implements MetaEncodingDetector {
 
     private static final long serialVersionUID = 1L;
diff --git a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java
index 6e60e88447..31534f2e38 100644
--- a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java
+++ b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java
@@ -53,7 +53,7 @@ import org.apache.tika.language.detect.LanguageResult;
  * keeping the implementation simple and predictable.
  * </p>
  */
-@TikaComponent
+@TikaComponent(name = "charsoup-language-detector")
 public class CharSoupLanguageDetector extends LanguageDetector {
 
     private static final Logger LOG =
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
index bbd3caf272..2524ef404d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
@@ -271,6 +271,24 @@ public class TikaEncodingDetectorTest extends TikaTest {
     }
 
 
+    @Test
+    public void testExcludeCharSoupEncodingDetector() throws Exception {
+        TikaLoader tikaLoader = TikaLoaderHelper.getLoader(
+                "TIKA-4671-exclude-charsoup-encoding-detector.json");
+        EncodingDetector detector = tikaLoader.loadEncodingDetectors();
+        assertTrue(detector instanceof CompositeEncodingDetector);
+        List<EncodingDetector> detectors =
+                ((CompositeEncodingDetector) detector).getDetectors();
+        // only the 3 base detectors remain, in this order; no CharSoup MetaEncodingDetector
+        assertEquals(3, detectors.size());
+        assertTrue(detectors.get(0) instanceof HtmlEncodingDetector);
+        assertTrue(detectors.get(1) instanceof UniversalEncodingDetector);
+        assertTrue(detectors.get(2) instanceof Icu4jEncodingDetector);
+        for (EncodingDetector d : detectors) {
+            assertNotContained("CharSoup", d.getClass().getSimpleName());
+        }
+    }
+
     @Test
     public void testArabicMisleadingCharsetHtml() throws Exception {
         // This HTML file is encoded in windows-1256 but declares charset=UTF-8

Reply via email to