(tika) 02/03: TIKA-4671 - git add

tallison Thu, 19 Feb 2026 05:07:20 -0800

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4671-lang-aware-charset-detection
in repository https://gitbox.apache.org/repos/asf/tika.git


commit a9aeb8f448e18c6b195ed1d7260f3388264a06c5
Author: tallison <[email protected]>
AuthorDate: Wed Feb 18 18:01:33 2026 -0500

    TIKA-4671 - git add
---
 .../apache/tika/detect/MetaEncodingDetector.java   |  39 +++++
 .../charsoup/CharSoupEncodingDetector.java         | 186 +++++++++++++++++++++
 .../charsoup/CharSoupEncodingDetectorTest.java     | 183 ++++++++++++++++++++
 .../testArabicMisleadingCharset.html               |  11 ++
 4 files changed, 419 insertions(+)

diff --git a/tika-core/src/main/java/org/apache/tika/detect/MetaEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/MetaEncodingDetector.java
new file mode 100644
index 0000000000..e8a46f647b
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/detect/MetaEncodingDetector.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+/**
+ * Marker interface for encoding detectors that arbitrate among
+ * candidates collected by base detectors rather than detecting
+ * encoding directly from the stream.
+ *
+ * <p>When a {@code MetaEncodingDetector} is present in a
+ * {@link CompositeEncodingDetector}, the composite switches from
+ * first-match-wins to collect-all mode: all base detectors run
+ * first and their results are collected in an
+ * {@link EncodingDetectorContext}, then the meta detector's
+ * {@link #detect} method is called to pick the winner.</p>
+ *
+ * <p>The {@link EncodingDetectorContext} is placed in the
+ * {@link org.apache.tika.parser.ParseContext} before the meta
+ * detector is invoked, so implementations can retrieve it via
+ * {@code parseContext.get(EncodingDetectorContext.class)}.</p>
+ *
+ * <p>This interface declares no members of its own; the
+ * {@code detect} method referenced above is inherited from
+ * {@link EncodingDetector}.</p>
+ *
+ * @since Apache Tika 3.2
+ */
+public interface MetaEncodingDetector extends EncodingDetector {
+}
diff --git a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
new file mode 100644
index 0000000000..cb393cadf7
--- /dev/null
+++ b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.langdetect.charsoup;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.tika.config.TikaComponent;
+import org.apache.tika.detect.EncodingDetectorContext;
+import org.apache.tika.detect.MetaEncodingDetector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * A {@link MetaEncodingDetector} that uses the CharSoup language detector
+ * to arbitrate when base encoding detectors disagree.
+ *
+ * <p>When base detectors all agree, the unanimous charset is returned
+ * without any language detection. When they disagree, raw bytes are
+ * read from the stream, decoded with each candidate charset, and each
+ * decoded text is scored by {@link CharSoupLanguageDetector}. The
+ * charset that produces the highest-confidence language detection wins.</p>
+ *
+ * <p>To enable, add this detector to your encoding detector chain in
+ * tika-config:</p>
+ * <pre>{@code
+ * "encoding-detectors": [
+ *   { "default-encoding-detector": {} },
+ *   { "charsoup-encoding-detector": {} }
+ * ]
+ * }</pre>
+ *
+ * @since Apache Tika 3.2
+ */
+@TikaComponent
+public class CharSoupEncodingDetector implements MetaEncodingDetector {
+
+    private static final long serialVersionUID = 1L;
+
+    /** Default value of {@link #getReadLimit()}, in bytes. */
+    private static final int DEFAULT_READ_LIMIT = 16384;
+
+    /** Maximum number of bytes sampled from the stream for arbitration. */
+    private int readLimit = DEFAULT_READ_LIMIT;
+
+    /**
+     * Picks a charset from the candidates that the base detectors stored
+     * in the {@link EncodingDetectorContext} found in {@code parseContext}.
+     *
+     * @param tis          stream to sample when arbitration is needed; it is
+     *                     reset to its starting position after sampling
+     * @param metadata     not consulted by this detector
+     * @param parseContext expected to carry an {@link EncodingDetectorContext}
+     * @return {@code null} if there is no context or it holds no results;
+     *         the first result's charset if at most one unique charset was
+     *         reported; otherwise the arbitration winner
+     * @throws IOException if sampling the stream fails
+     */
+    @Override
+    public Charset detect(TikaInputStream tis, Metadata metadata,
+                          ParseContext parseContext) throws IOException {
+        EncodingDetectorContext context =
+                parseContext.get(EncodingDetectorContext.class);
+        if (context == null || context.getResults().isEmpty()) {
+            return null;
+        }
+
+        Set<Charset> uniqueCharsets = context.getUniqueCharsets();
+
+        if (uniqueCharsets.size() <= 1) {
+            // Unanimous or single detector — no arbitration needed
+            EncodingDetectorContext.Result first = context.getResults().get(0);
+            context.setArbitrationInfo("unanimous");
+            return first.getCharset();
+        }
+
+        // Disagreement — arbitrate via language detection scoring
+        return arbitrate(tis, context, uniqueCharsets);
+    }
+
+    /**
+     * Decodes a sample of the stream with every candidate charset and lets
+     * {@link CharSoupLanguageDetector#compareLanguageSignal} choose among
+     * the tag-stripped decodings.
+     *
+     * <p>The first collected result is the fallback whenever no decision
+     * can be made: no stream ("no-stream"), nothing readable
+     * ("empty-stream"), or the language detector returning {@code null}
+     * (still recorded as "scored").</p>
+     */
+    private Charset arbitrate(TikaInputStream tis,
+                              EncodingDetectorContext context,
+                              Set<Charset> uniqueCharsets) throws IOException {
+        EncodingDetectorContext.Result firstResult =
+                context.getResults().get(0);
+
+        if (tis == null) {
+            context.setArbitrationInfo("no-stream");
+            return firstResult.getCharset();
+        }
+
+        byte[] bytes = readBytes(tis);
+        if (bytes == null || bytes.length == 0) {
+            context.setArbitrationInfo("empty-stream");
+            return firstResult.getCharset();
+        }
+
+        // LinkedHashMap keeps the candidates in the iteration order of
+        // uniqueCharsets when they are handed to the language detector.
+        Map<Charset, String> candidates = new LinkedHashMap<>();
+        for (Charset candidate : uniqueCharsets) {
+            candidates.put(candidate, stripTags(decode(bytes, candidate)));
+        }
+
+        CharSoupLanguageDetector langDetector = new CharSoupLanguageDetector();
+        Charset bestCharset = langDetector.compareLanguageSignal(candidates);
+        if (bestCharset == null) {
+            bestCharset = firstResult.getCharset();
+        }
+
+        context.setArbitrationInfo("scored");
+        return bestCharset;
+    }
+
+    /**
+     * Reads up to {@link #readLimit} bytes from the stream and then resets
+     * it to the position it had on entry.
+     *
+     * @return the bytes read (trimmed to the number actually read), or
+     *         {@code null} if nothing could be read
+     */
+    private byte[] readBytes(TikaInputStream tis) throws IOException {
+        try {
+            tis.mark(readLimit);
+            byte[] buf = new byte[readLimit];
+            int totalRead = 0;
+            int bytesRead;
+            // A single read() may return fewer bytes than requested, so
+            // keep reading until the buffer is full or the stream ends.
+            while (totalRead < readLimit &&
+                    (bytesRead = tis.read(buf, totalRead,
+                            readLimit - totalRead)) != -1) {
+                totalRead += bytesRead;
+            }
+            if (totalRead == 0) {
+                return null;
+            }
+            if (totalRead < readLimit) {
+                byte[] trimmed = new byte[totalRead];
+                System.arraycopy(buf, 0, trimmed, 0, totalRead);
+                return trimmed;
+            }
+            return buf;
+        } finally {
+            // Always rewind, even on failure, so later consumers see the
+            // stream from the position it had before sampling.
+            tis.reset();
+        }
+    }
+
+    /**
+     * Decode bytes using the given charset, replacing malformed/unmappable
+     * characters rather than throwing.
+     *
+     * @param bytes   raw bytes to decode
+     * @param charset charset to decode with
+     * @return the decoded text
+     */
+    static String decode(byte[] bytes, Charset charset) {
+        CharsetDecoder decoder = charset.newDecoder()
+                .onMalformedInput(CodingErrorAction.REPLACE)
+                .onUnmappableCharacter(CodingErrorAction.REPLACE);
+        CharBuffer cb = CharBuffer.allocate(bytes.length * 2);
+        decoder.decode(ByteBuffer.wrap(bytes), cb, true);
+        decoder.flush(cb);
+        cb.flip();
+        return cb.toString();
+    }
+
+    /**
+     * Simple tag stripping: removes &lt;...&gt; sequences so that
+     * HTML/XML tag names and attributes don't pollute language scoring.
+     *
+     * <p>Everything from a {@code '<'} up to and including the next
+     * {@code '>'} is dropped; an unterminated {@code '<'} drops the rest
+     * of the text. A {@code '>'} that is not closing a tag is ordinary
+     * text and is kept.</p>
+     *
+     * @param text decoded text, possibly containing markup
+     * @return the text with tag sequences removed
+     */
+    static String stripTags(String text) {
+        StringBuilder sb = new StringBuilder(text.length());
+        boolean inTag = false;
+        for (int i = 0; i < text.length(); i++) {
+            char c = text.charAt(i);
+            if (c == '<') {
+                inTag = true;
+            } else if (inTag) {
+                // Inside a tag: discard everything, leaving on '>'.
+                if (c == '>') {
+                    inTag = false;
+                }
+            } else {
+                sb.append(c);
+            }
+        }
+        return sb.toString();
+    }
+
+    /**
+     * @return the maximum number of bytes sampled from the stream when
+     *         arbitrating
+     */
+    public int getReadLimit() {
+        return readLimit;
+    }
+
+    /**
+     * Sets the maximum number of bytes sampled from the stream when
+     * arbitrating.
+     *
+     * @param readLimit sample size in bytes; must not be negative
+     * @throws IllegalArgumentException if {@code readLimit} is negative
+     */
+    public void setReadLimit(int readLimit) {
+        if (readLimit < 0) {
+            // Fail here rather than with a NegativeArraySizeException
+            // when the sample buffer is allocated during detection.
+            throw new IllegalArgumentException(
+                    "readLimit must not be negative: " + readLimit);
+        }
+        this.readLimit = readLimit;
+    }
+}
diff --git a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java
new file mode 100644
index 0000000000..e9b6001dcb
--- /dev/null
+++ b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java
@@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.langdetect.charsoup;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.Charset;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.detect.EncodingDetectorContext;
+import org.apache.tika.detect.MetaEncodingDetector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Unit tests for {@link CharSoupEncodingDetector}: the short-circuit
+ * paths (no context, no results, unanimous results), language-based
+ * arbitration, stream rewinding, and the static helpers.
+ */
+public class CharSoupEncodingDetectorTest {
+
+    @Test
+    public void testIsMetaEncodingDetector() {
+        assertTrue(new CharSoupEncodingDetector()
+                instanceof MetaEncodingDetector);
+    }
+
+    /** Agreeing detectors are passed through without sampling the stream. */
+    @Test
+    public void testUnanimous() throws Exception {
+        CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
+        EncodingDetectorContext context = new EncodingDetectorContext();
+        context.addResult(UTF_8, "DetectorA");
+        context.addResult(UTF_8, "DetectorB");
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(EncodingDetectorContext.class, context);
+
+        byte[] data = "Hello, world!".getBytes(UTF_8);
+        try (TikaInputStream tis = TikaInputStream.get(
+                new ByteArrayInputStream(data))) {
+            Charset result =
+                    detector.detect(tis, new Metadata(), parseContext);
+            assertEquals(UTF_8, result);
+            assertEquals("unanimous", context.getArbitrationInfo());
+        }
+    }
+
+    /** Without an EncodingDetectorContext the detector has no opinion. */
+    @Test
+    public void testNoContext() throws Exception {
+        CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
+        ParseContext parseContext = new ParseContext();
+
+        byte[] data = "Test".getBytes(UTF_8);
+        try (TikaInputStream tis = TikaInputStream.get(
+                new ByteArrayInputStream(data))) {
+            Charset result =
+                    detector.detect(tis, new Metadata(), parseContext);
+            assertNull(result);
+        }
+    }
+
+    /** A context with no collected results also yields no opinion. */
+    @Test
+    public void testEmptyResults() throws Exception {
+        CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
+        EncodingDetectorContext context = new EncodingDetectorContext();
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(EncodingDetectorContext.class, context);
+
+        byte[] data = "Test".getBytes(UTF_8);
+        try (TikaInputStream tis = TikaInputStream.get(
+                new ByteArrayInputStream(data))) {
+            Charset result =
+                    detector.detect(tis, new Metadata(), parseContext);
+            assertNull(result);
+        }
+    }
+
+    @Test
+    public void testArabicEncodingArbitration() throws Exception {
+        // Arabic text encoded in windows-1256.
+        // When decoded as UTF-8 it produces replacement chars / garbage.
+        // When decoded as windows-1256 it produces valid Arabic.
+        // The language detector should pick windows-1256.
+        Charset windows1256 = Charset.forName("windows-1256");
+
+        String arabicText =
+                "\u0641\u064a \u0642\u0631\u064a\u0629 \u0645\u0646 " +
+                "\u0627\u0644\u0642\u0631\u0649 \u0643\u0627\u0646 " +
+                "\u0647\u0646\u0627\u0643 \u0631\u062c\u0644 " +
+                "\u062d\u0643\u064a\u0645 \u064a\u0639\u0631\u0641 " +
+                "\u0643\u0644 \u0634\u064a\u0621 \u0639\u0646 " +
+                "\u0627\u0644\u062d\u064a\u0627\u0629 \u0648\u0643\u0627\u0646 " +
+                "\u064a\u0639\u0644\u0645 \u0627\u0644\u0646\u0627\u0633 " +
+                "\u0643\u064a\u0641 \u064a\u0639\u064a\u0634\u0648\u0646 " +
+                "\u0628\u0633\u0644\u0627\u0645 \u0648\u0627\u0646\u0633\u062c\u0627\u0645. " +
+                "\u0627\u0644\u0644\u063a\u0629 \u0627\u0644\u0639\u0631\u0628\u064a\u0629 " +
+                "\u0647\u064a \u0648\u0627\u062d\u062f\u0629 \u0645\u0646 " +
+                "\u0623\u0643\u062b\u0631 \u0627\u0644\u0644\u063a\u0627\u062a " +
+                "\u0627\u0646\u062a\u0634\u0627\u0631\u0627 \u0641\u064a " +
+                "\u0627\u0644\u0639\u0627\u0644\u0645 \u0648\u064a\u062a\u062d\u062b\u0647\u0627 " +
+                "\u0623\u0643\u062b\u0631 \u0645\u0646 \u062b\u0644\u0627\u062b\u0645\u0627\u0626\u0629 " +
+                "\u0645\u0644\u064a\u0648\u0646 \u0625\u0646\u0633\u0627\u0646.";
+        byte[] arabicBytes = arabicText.getBytes(windows1256);
+
+        // Simulate a misleading declared charset (UTF-8) that disagrees
+        // with a statistical detector (windows-1256).
+        EncodingDetectorContext context = new EncodingDetectorContext();
+        context.addResult(UTF_8, "HtmlEncodingDetector");
+        context.addResult(windows1256, "Icu4jEncodingDetector");
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(EncodingDetectorContext.class, context);
+
+        CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
+        try (TikaInputStream tis = TikaInputStream.get(
+                new ByteArrayInputStream(arabicBytes))) {
+            Charset result =
+                    detector.detect(tis, new Metadata(), parseContext);
+            assertEquals(windows1256, result);
+            assertEquals("scored", context.getArbitrationInfo());
+        }
+    }
+
+    /** Arbitration samples the stream; it must be rewound afterwards. */
+    @Test
+    public void testStreamResetAfterDetection() throws Exception {
+        EncodingDetectorContext context = new EncodingDetectorContext();
+        context.addResult(UTF_8, "DetectorA");
+        context.addResult(ISO_8859_1, "DetectorB");
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(EncodingDetectorContext.class, context);
+
+        byte[] data =
+                "Hello, world! This is a test of encoding detection."
+                        .getBytes(UTF_8);
+        CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
+        try (TikaInputStream tis = TikaInputStream.get(
+                new ByteArrayInputStream(data))) {
+            detector.detect(tis, new Metadata(), parseContext);
+
+            // Verify stream is back at the start
+            byte[] readBack = new byte[data.length];
+            int bytesRead = tis.read(readBack);
+            assertEquals(data.length, bytesRead);
+            assertEquals("Hello, world! This is a test of encoding detection.",
+                    new String(readBack, UTF_8));
+        }
+    }
+
+    @Test
+    public void testStripTags() {
+        assertEquals("Hello world",
+                CharSoupEncodingDetector.stripTags(
+                        "<html><body>Hello world</body></html>"));
+        assertEquals("no tags here",
+                CharSoupEncodingDetector.stripTags("no tags here"));
+        assertEquals("",
+                CharSoupEncodingDetector.stripTags("<empty/>"));
+    }
+
+    @Test
+    public void testDecode() {
+        byte[] utf8Bytes = "caf\u00e9".getBytes(UTF_8);
+        assertEquals("caf\u00e9",
+                CharSoupEncodingDetector.decode(utf8Bytes, UTF_8));
+    }
+
+    @Test
+    public void testReadLimitGetterSetter() {
+        CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
+        assertEquals(16384, detector.getReadLimit());
+        detector.setReadLimit(4096);
+        assertEquals(4096, detector.getReadLimit());
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testArabicMisleadingCharset.html b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testArabicMisleadingCharset.html
new file mode 100644
index 0000000000..e9884177f2
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testArabicMisleadingCharset.html
@@ -0,0 +1,11 @@
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<title>Test Arabic</title>
+</head>
+<body>
+<p>�� ���� �� ����� ��� ���� ��� ���� ���� �� ��� �� ������ ���� ���� ����� 
��� ������ ����� �������. ����� ������� �� ����� �� ���� ������ ������� �� 
������ ������� ���� �� �������� ����� �����.</p>
+<p>�� ���� �� ����� ��� ���� ��� ���� ���� �� ��� �� ������ ���� ���� ����� 
��� ������ ����� �������. ����� ������� �� ����� �� ���� ������ ������� �� 
������ ������� ���� �� �������� ����� �����.</p>
+<p>�� ���� �� ����� ��� ���� ��� ���� ���� �� ��� �� ������ ���� ���� ����� 
��� ������ ����� �������. ����� ������� �� ����� �� ���� ������ ������� �� 
������ ������� ���� �� �������� ����� �����.</p>
+</body>
+</html>
\ No newline at end of file

(tika) 02/03: TIKA-4671 - git add

Reply via email to