This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4671-lang-aware-charset-detection in repository https://gitbox.apache.org/repos/asf/tika.git
commit a9aeb8f448e18c6b195ed1d7260f3388264a06c5 Author: tallison <[email protected]> AuthorDate: Wed Feb 18 18:01:33 2026 -0500 TIKA-4671 - git add --- .../apache/tika/detect/MetaEncodingDetector.java | 39 +++++ .../charsoup/CharSoupEncodingDetector.java | 186 +++++++++++++++++++++ .../charsoup/CharSoupEncodingDetectorTest.java | 183 ++++++++++++++++++++ .../testArabicMisleadingCharset.html | 11 ++ 4 files changed, 419 insertions(+) diff --git a/tika-core/src/main/java/org/apache/tika/detect/MetaEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/MetaEncodingDetector.java new file mode 100644 index 0000000000..e8a46f647b --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/detect/MetaEncodingDetector.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.detect; + +/** + * Marker interface for encoding detectors that arbitrate among + * candidates collected by base detectors rather than detecting + * encoding directly from the stream. + * + * <p>When a {@code MetaEncodingDetector} is present in a + * {@link CompositeEncodingDetector}, the composite switches from + * first-match-wins to collect-all mode: all base detectors run + * first and their results are collected in an + * {@link EncodingDetectorContext}, then the meta detector's + * {@link #detect} method is called to pick the winner.</p> + * + * <p>The {@link EncodingDetectorContext} is placed in the + * {@link org.apache.tika.parser.ParseContext} before the meta + * detector is invoked, so implementations can retrieve it via + * {@code parseContext.get(EncodingDetectorContext.class)}.</p> + * + * @since Apache Tika 3.2 + */ +public interface MetaEncodingDetector extends EncodingDetector { +} diff --git a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java new file mode 100644 index 0000000000..cb393cadf7 --- /dev/null +++ b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.langdetect.charsoup; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Set; + +import org.apache.tika.config.TikaComponent; +import org.apache.tika.detect.EncodingDetectorContext; +import org.apache.tika.detect.MetaEncodingDetector; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; + +/** + * A {@link MetaEncodingDetector} that uses the CharSoup language detector + * to arbitrate when base encoding detectors disagree. + * + * <p>When base detectors all agree, the unanimous charset is returned + * without any language detection. When they disagree, raw bytes are + * read from the stream, decoded with each candidate charset, and each + * decoded text is scored by {@link CharSoupLanguageDetector}. The + * charset that produces the highest-confidence language detection wins.</p> + * + * <p>To enable, add this detector to your encoding detector chain in + * tika-config:</p> + * <pre>{@code + * "encoding-detectors": [ + * { "default-encoding-detector": {} }, + * { "charsoup-encoding-detector": {} } + * ] + * }</pre> + * + * @since Apache Tika 3.2 + */ +@TikaComponent +public class CharSoupEncodingDetector implements MetaEncodingDetector { + + private static final long serialVersionUID = 1L; + + private static final int DEFAULT_READ_LIMIT = 16384; + + private int readLimit = DEFAULT_READ_LIMIT; + + @Override + public Charset detect(TikaInputStream tis, Metadata metadata, + ParseContext parseContext) throws IOException { + EncodingDetectorContext context = + parseContext.get(EncodingDetectorContext.class); + if (context == null || context.getResults().isEmpty()) { + return null; + } + + Set<Charset> uniqueCharsets = context.getUniqueCharsets(); + + if (uniqueCharsets.size() <= 1) { + // Unanimous or single detector — no arbitration needed + EncodingDetectorContext.Result first = context.getResults().get(0); + context.setArbitrationInfo("unanimous"); + return first.getCharset(); + } + + // Disagreement — arbitrate via language detection scoring + return arbitrate(tis, context, uniqueCharsets); + } + + private Charset arbitrate(TikaInputStream tis, + EncodingDetectorContext context, + Set<Charset> uniqueCharsets) throws IOException { + EncodingDetectorContext.Result firstResult = context.getResults().get(0); + + if (tis == null) { + context.setArbitrationInfo("no-stream"); + return firstResult.getCharset(); + } + + byte[] bytes = readBytes(tis); + if (bytes == null || bytes.length == 0) { + context.setArbitrationInfo("empty-stream"); + return firstResult.getCharset(); + } + + Map<Charset, String> candidates = new LinkedHashMap<>(); + for (Charset candidate : uniqueCharsets) { + candidates.put(candidate, stripTags(decode(bytes, candidate))); + } + + CharSoupLanguageDetector langDetector = new CharSoupLanguageDetector(); + Charset bestCharset = langDetector.compareLanguageSignal(candidates); + if (bestCharset == null) { + bestCharset = firstResult.getCharset(); + } + + context.setArbitrationInfo("scored"); + return bestCharset; + } + + private byte[] readBytes(TikaInputStream tis) throws IOException { + try { + tis.mark(readLimit); + byte[] buf = new byte[readLimit]; + int totalRead = 0; + int bytesRead; + while (totalRead < readLimit && + (bytesRead = tis.read(buf, totalRead, + readLimit - totalRead)) != -1) { + totalRead += bytesRead; + } + if (totalRead == 0) { + return null; + } + if (totalRead < readLimit) { + byte[] trimmed = new byte[totalRead]; + System.arraycopy(buf, 0, trimmed, 0, totalRead); + return trimmed; + } + return buf; + } finally { + tis.reset(); + } + } + + /** + * Decode bytes using the given charset, replacing malformed/unmappable + * characters rather than throwing. + */ + static String decode(byte[] bytes, Charset charset) { + CharsetDecoder decoder = charset.newDecoder() + .onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE); + CharBuffer cb = CharBuffer.allocate(bytes.length * 2); + decoder.decode(ByteBuffer.wrap(bytes), cb, true); + decoder.flush(cb); + cb.flip(); + return cb.toString(); + } + + /** + * Simple tag stripping: removes <...> sequences so that + * HTML/XML tag names and attributes don't pollute language scoring. + */ + static String stripTags(String text) { + StringBuilder sb = new StringBuilder(text.length()); + boolean inTag = false; + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + if (c == '<') { + inTag = true; + } else if (c == '>') { + inTag = false; + } else if (!inTag) { + sb.append(c); + } + } + return sb.toString(); + } + + public int getReadLimit() { + return readLimit; + } + + public void setReadLimit(int readLimit) { + this.readLimit = readLimit; + } +} diff --git a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java new file mode 100644 index 0000000000..e9b6001dcb --- /dev/null +++ b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.langdetect.charsoup; + +import static java.nio.charset.StandardCharsets.ISO_8859_1; +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.ByteArrayInputStream; +import java.nio.charset.Charset; + +import org.junit.jupiter.api.Test; + +import org.apache.tika.detect.EncodingDetectorContext; +import org.apache.tika.detect.MetaEncodingDetector; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; + +public class CharSoupEncodingDetectorTest { + + @Test + public void testIsMetaEncodingDetector() { + assertTrue(new CharSoupEncodingDetector() instanceof MetaEncodingDetector); + } + + @Test + public void testUnanimous() throws Exception { + CharSoupEncodingDetector detector = new CharSoupEncodingDetector(); + EncodingDetectorContext context = new EncodingDetectorContext(); + context.addResult(UTF_8, "DetectorA"); + context.addResult(UTF_8, "DetectorB"); + + ParseContext parseContext = new ParseContext(); + parseContext.set(EncodingDetectorContext.class, context); + + byte[] data = "Hello, world!".getBytes(UTF_8); + try (TikaInputStream tis = TikaInputStream.get( + new ByteArrayInputStream(data))) { + Charset result = detector.detect(tis, new Metadata(), parseContext); + assertEquals(UTF_8, result); + assertEquals("unanimous", context.getArbitrationInfo()); + } + } + + @Test + public void testNoContext() throws Exception { + CharSoupEncodingDetector detector = new CharSoupEncodingDetector(); + ParseContext parseContext = new ParseContext(); + + byte[] data = "Test".getBytes(UTF_8); + try (TikaInputStream tis = TikaInputStream.get( + new ByteArrayInputStream(data))) { + Charset result = detector.detect(tis, new Metadata(), parseContext); + assertNull(result); + } + } + + @Test + public void testEmptyResults() throws Exception { + CharSoupEncodingDetector detector = new CharSoupEncodingDetector(); + EncodingDetectorContext context = new EncodingDetectorContext(); + + ParseContext parseContext = new ParseContext(); + parseContext.set(EncodingDetectorContext.class, context); + + byte[] data = "Test".getBytes(UTF_8); + try (TikaInputStream tis = TikaInputStream.get( + new ByteArrayInputStream(data))) { + Charset result = detector.detect(tis, new Metadata(), parseContext); + assertNull(result); + } + } + + @Test + public void testArabicEncodingArbitration() throws Exception { + // Arabic text encoded in windows-1256. + // When decoded as UTF-8 it produces replacement chars / garbage. + // When decoded as windows-1256 it produces valid Arabic. + // The language detector should pick windows-1256. + Charset windows1256 = Charset.forName("windows-1256"); + + String arabicText = + "\u0641\u064a \u0642\u0631\u064a\u0629 \u0645\u0646 " + + "\u0627\u0644\u0642\u0631\u0649 \u0643\u0627\u0646 " + + "\u0647\u0646\u0627\u0643 \u0631\u062c\u0644 " + + "\u062d\u0643\u064a\u0645 \u064a\u0639\u0631\u0641 " + + "\u0643\u0644 \u0634\u064a\u0621 \u0639\u0646 " + + "\u0627\u0644\u062d\u064a\u0627\u0629 \u0648\u0643\u0627\u0646 " + + "\u064a\u0639\u0644\u0645 \u0627\u0644\u0646\u0627\u0633 " + + "\u0643\u064a\u0641 \u064a\u0639\u064a\u0634\u0648\u0646 " + + "\u0628\u0633\u0644\u0627\u0645 \u0648\u0627\u0646\u0633\u062c\u0627\u0645. " + + "\u0627\u0644\u0644\u063a\u0629 \u0627\u0644\u0639\u0631\u0628\u064a\u0629 " + + "\u0647\u064a \u0648\u0627\u062d\u062f\u0629 \u0645\u0646 " + + "\u0623\u0643\u062b\u0631 \u0627\u0644\u0644\u063a\u0627\u062a " + + "\u0627\u0646\u062a\u0634\u0627\u0631\u0627 \u0641\u064a " + + "\u0627\u0644\u0639\u0627\u0644\u0645 \u0648\u064a\u062a\u062d\u062b\u0647\u0627 " + + "\u0623\u0643\u062b\u0631 \u0645\u0646 \u062b\u0644\u0627\u062b\u0645\u0627\u0626\u0629 " + + "\u0645\u0644\u064a\u0648\u0646 \u0625\u0646\u0633\u0627\u0646."; + byte[] arabicBytes = arabicText.getBytes(windows1256); + + EncodingDetectorContext context = new EncodingDetectorContext(); + context.addResult(UTF_8, "HtmlEncodingDetector"); + context.addResult(windows1256, "Icu4jEncodingDetector"); + + ParseContext parseContext = new ParseContext(); + parseContext.set(EncodingDetectorContext.class, context); + + CharSoupEncodingDetector detector = new CharSoupEncodingDetector(); + try (TikaInputStream tis = TikaInputStream.get( + new ByteArrayInputStream(arabicBytes))) { + Charset result = detector.detect(tis, new Metadata(), parseContext); + assertEquals(windows1256, result); + assertEquals("scored", context.getArbitrationInfo()); + } + } + + @Test + public void testStreamResetAfterDetection() throws Exception { + EncodingDetectorContext context = new EncodingDetectorContext(); + context.addResult(UTF_8, "DetectorA"); + context.addResult(ISO_8859_1, "DetectorB"); + + ParseContext parseContext = new ParseContext(); + parseContext.set(EncodingDetectorContext.class, context); + + byte[] data = "Hello, world! This is a test of encoding detection.".getBytes(UTF_8); + CharSoupEncodingDetector detector = new CharSoupEncodingDetector(); + try (TikaInputStream tis = TikaInputStream.get( + new ByteArrayInputStream(data))) { + detector.detect(tis, new Metadata(), parseContext); + + // Verify stream is back at the start + byte[] readBack = new byte[data.length]; + int bytesRead = tis.read(readBack); + assertEquals(data.length, bytesRead); + assertEquals("Hello, world! This is a test of encoding detection.", + new String(readBack, UTF_8)); + } + } + + @Test + public void testStripTags() { + assertEquals("Hello world", + CharSoupEncodingDetector.stripTags( + "<html><body>Hello world</body></html>")); + assertEquals("no tags here", + CharSoupEncodingDetector.stripTags("no tags here")); + assertEquals("", + CharSoupEncodingDetector.stripTags("<empty/>")); + } + + @Test + public void testDecode() { + byte[] utf8Bytes = "caf\u00e9".getBytes(UTF_8); + assertEquals("caf\u00e9", + CharSoupEncodingDetector.decode(utf8Bytes, UTF_8)); + } + + @Test + public void testReadLimitGetterSetter() { + CharSoupEncodingDetector detector = new CharSoupEncodingDetector(); + assertEquals(16384, detector.getReadLimit()); + detector.setReadLimit(4096); + assertEquals(4096, detector.getReadLimit()); + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testArabicMisleadingCharset.html b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testArabicMisleadingCharset.html new file mode 100644 index 0000000000..e9884177f2 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testArabicMisleadingCharset.html @@ -0,0 +1,11 @@ +<html> +<head> +<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> +<title>Test Arabic</title> +</head> +<body> +<p>�� ���� �� ����� ��� ���� ��� ���� ���� �� ��� �� ������ ���� ���� ����� ��� ������ ����� �������. ����� ������� �� ����� �� ���� ������ ������� �� ������ ������� ���� �� �������� ����� �����.</p> +<p>�� ���� �� ����� ��� ���� ��� ���� ���� �� ��� �� ������ ���� ���� ����� ��� ������ ����� �������. ����� ������� �� ����� �� ���� ������ ������� �� ������ ������� ���� �� �������� ����� �����.</p> +<p>�� ���� �� ����� ��� ���� ��� ���� ���� �� ��� �� ������ ���� ���� ����� ��� ������ ����� �������. ����� ������� �� ����� �� ���� ������ ������� �� ������ ������� ���� �� �������� ����� �����.</p> +</body> +</html> \ No newline at end of file
