This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit 1a236d2a4e071f4525eaa50e10fe1c6fc5a66e63 Author: tallison <[email protected]> AuthorDate: Wed Feb 18 18:03:32 2026 -0500 Revert "TIKA-4671 - git add" This reverts commit 1bf02554296fa38f236870dc9feef7f687e23abb. --- .../apache/tika/detect/MetaEncodingDetector.java | 39 ----- .../charsoup/CharSoupEncodingDetector.java | 186 --------------------- .../charsoup/CharSoupEncodingDetectorTest.java | 183 -------------------- .../testArabicMisleadingCharset.html | 11 -- 4 files changed, 419 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/detect/MetaEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/MetaEncodingDetector.java deleted file mode 100644 index e8a46f647b..0000000000 --- a/tika-core/src/main/java/org/apache/tika/detect/MetaEncodingDetector.java +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.detect; - -/** - * Marker interface for encoding detectors that arbitrate among - * candidates collected by base detectors rather than detecting - * encoding directly from the stream. - * - * <p>When a {@code MetaEncodingDetector} is present in a - * {@link CompositeEncodingDetector}, the composite switches from - * first-match-wins to collect-all mode: all base detectors run - * first and their results are collected in an - * {@link EncodingDetectorContext}, then the meta detector's - * {@link #detect} method is called to pick the winner.</p> - * - * <p>The {@link EncodingDetectorContext} is placed in the - * {@link org.apache.tika.parser.ParseContext} before the meta - * detector is invoked, so implementations can retrieve it via - * {@code parseContext.get(EncodingDetectorContext.class)}.</p> - * - * @since Apache Tika 3.2 - */ -public interface MetaEncodingDetector extends EncodingDetector { -} diff --git a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java deleted file mode 100644 index cb393cadf7..0000000000 --- a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.langdetect.charsoup; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.CharBuffer; -import java.nio.charset.Charset; -import java.nio.charset.CharsetDecoder; -import java.nio.charset.CodingErrorAction; -import java.util.LinkedHashMap; -import java.util.Map; -import java.util.Set; - -import org.apache.tika.config.TikaComponent; -import org.apache.tika.detect.EncodingDetectorContext; -import org.apache.tika.detect.MetaEncodingDetector; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ParseContext; - -/** - * A {@link MetaEncodingDetector} that uses the CharSoup language detector - * to arbitrate when base encoding detectors disagree. - * - * <p>When base detectors all agree, the unanimous charset is returned - * without any language detection. When they disagree, raw bytes are - * read from the stream, decoded with each candidate charset, and each - * decoded text is scored by {@link CharSoupLanguageDetector}. The - * charset that produces the highest-confidence language detection wins.</p> - * - * <p>To enable, add this detector to your encoding detector chain in - * tika-config:</p> - * <pre>{@code - * "encoding-detectors": [ - * { "default-encoding-detector": {} }, - * { "charsoup-encoding-detector": {} } - * ] - * }</pre> - * - * @since Apache Tika 3.2 - */ -@TikaComponent -public class CharSoupEncodingDetector implements MetaEncodingDetector { - - private static final long serialVersionUID = 1L; - - private static final int DEFAULT_READ_LIMIT = 16384; - - private int readLimit = DEFAULT_READ_LIMIT; - - @Override - public Charset detect(TikaInputStream tis, Metadata metadata, - ParseContext parseContext) throws IOException { - EncodingDetectorContext context = - parseContext.get(EncodingDetectorContext.class); - if (context == null || context.getResults().isEmpty()) { - return null; - } - - Set<Charset> uniqueCharsets = context.getUniqueCharsets(); - - if (uniqueCharsets.size() <= 1) { - // Unanimous or single detector — no arbitration needed - EncodingDetectorContext.Result first = context.getResults().get(0); - context.setArbitrationInfo("unanimous"); - return first.getCharset(); - } - - // Disagreement — arbitrate via language detection scoring - return arbitrate(tis, context, uniqueCharsets); - } - - private Charset arbitrate(TikaInputStream tis, - EncodingDetectorContext context, - Set<Charset> uniqueCharsets) throws IOException { - EncodingDetectorContext.Result firstResult = context.getResults().get(0); - - if (tis == null) { - context.setArbitrationInfo("no-stream"); - return firstResult.getCharset(); - } - - byte[] bytes = readBytes(tis); - if (bytes == null || bytes.length == 0) { - context.setArbitrationInfo("empty-stream"); - return firstResult.getCharset(); - } - - Map<Charset, String> candidates = new LinkedHashMap<>(); - for (Charset candidate : uniqueCharsets) { - candidates.put(candidate, stripTags(decode(bytes, candidate))); - } - - CharSoupLanguageDetector langDetector = new CharSoupLanguageDetector(); - Charset bestCharset = langDetector.compareLanguageSignal(candidates); - if (bestCharset == null) { - bestCharset = firstResult.getCharset(); - } - - context.setArbitrationInfo("scored"); - return bestCharset; - } - - private byte[] readBytes(TikaInputStream tis) throws IOException { - try { - tis.mark(readLimit); - byte[] buf = new byte[readLimit]; - int totalRead = 0; - int bytesRead; - while (totalRead < readLimit && - (bytesRead = tis.read(buf, totalRead, - readLimit - totalRead)) != -1) { - totalRead += bytesRead; - } - if (totalRead == 0) { - return null; - } - if (totalRead < readLimit) { - byte[] trimmed = new byte[totalRead]; - System.arraycopy(buf, 0, trimmed, 0, totalRead); - return trimmed; - } - return buf; - } finally { - tis.reset(); - } - } - - /** - * Decode bytes using the given charset, replacing malformed/unmappable - * characters rather than throwing. - */ - static String decode(byte[] bytes, Charset charset) { - CharsetDecoder decoder = charset.newDecoder() - .onMalformedInput(CodingErrorAction.REPLACE) - .onUnmappableCharacter(CodingErrorAction.REPLACE); - CharBuffer cb = CharBuffer.allocate(bytes.length * 2); - decoder.decode(ByteBuffer.wrap(bytes), cb, true); - decoder.flush(cb); - cb.flip(); - return cb.toString(); - } - - /** - * Simple tag stripping: removes <...> sequences so that - * HTML/XML tag names and attributes don't pollute language scoring. - */ - static String stripTags(String text) { - StringBuilder sb = new StringBuilder(text.length()); - boolean inTag = false; - for (int i = 0; i < text.length(); i++) { - char c = text.charAt(i); - if (c == '<') { - inTag = true; - } else if (c == '>') { - inTag = false; - } else if (!inTag) { - sb.append(c); - } - } - return sb.toString(); - } - - public int getReadLimit() { - return readLimit; - } - - public void setReadLimit(int readLimit) { - this.readLimit = readLimit; - } -} diff --git a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java deleted file mode 100644 index e9b6001dcb..0000000000 --- a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.langdetect.charsoup; - -import static java.nio.charset.StandardCharsets.ISO_8859_1; -import static java.nio.charset.StandardCharsets.UTF_8; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNull; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import java.io.ByteArrayInputStream; -import java.nio.charset.Charset; - -import org.junit.jupiter.api.Test; - -import org.apache.tika.detect.EncodingDetectorContext; -import org.apache.tika.detect.MetaEncodingDetector; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ParseContext; - -public class CharSoupEncodingDetectorTest { - - @Test - public void testIsMetaEncodingDetector() { - assertTrue(new CharSoupEncodingDetector() instanceof MetaEncodingDetector); - } - - @Test - public void testUnanimous() throws Exception { - CharSoupEncodingDetector detector = new CharSoupEncodingDetector(); - EncodingDetectorContext context = new EncodingDetectorContext(); - context.addResult(UTF_8, "DetectorA"); - context.addResult(UTF_8, "DetectorB"); - - ParseContext parseContext = new ParseContext(); - parseContext.set(EncodingDetectorContext.class, context); - - byte[] data = "Hello, world!".getBytes(UTF_8); - try (TikaInputStream tis = TikaInputStream.get( - new ByteArrayInputStream(data))) { - Charset result = detector.detect(tis, new Metadata(), parseContext); - assertEquals(UTF_8, result); - assertEquals("unanimous", context.getArbitrationInfo()); - } - } - - @Test - public void testNoContext() throws Exception { - CharSoupEncodingDetector detector = new CharSoupEncodingDetector(); - ParseContext parseContext = new ParseContext(); - - byte[] data = "Test".getBytes(UTF_8); - try (TikaInputStream tis = TikaInputStream.get( - new ByteArrayInputStream(data))) { - Charset result = detector.detect(tis, new Metadata(), parseContext); - assertNull(result); - } - } - - @Test - public void testEmptyResults() throws Exception { - CharSoupEncodingDetector detector = new CharSoupEncodingDetector(); - EncodingDetectorContext context = new EncodingDetectorContext(); - - ParseContext parseContext = new ParseContext(); - parseContext.set(EncodingDetectorContext.class, context); - - byte[] data = "Test".getBytes(UTF_8); - try (TikaInputStream tis = TikaInputStream.get( - new ByteArrayInputStream(data))) { - Charset result = detector.detect(tis, new Metadata(), parseContext); - assertNull(result); - } - } - - @Test - public void testArabicEncodingArbitration() throws Exception { - // Arabic text encoded in windows-1256. - // When decoded as UTF-8 it produces replacement chars / garbage. - // When decoded as windows-1256 it produces valid Arabic. - // The language detector should pick windows-1256. - Charset windows1256 = Charset.forName("windows-1256"); - - String arabicText = - "\u0641\u064a \u0642\u0631\u064a\u0629 \u0645\u0646 " + - "\u0627\u0644\u0642\u0631\u0649 \u0643\u0627\u0646 " + - "\u0647\u0646\u0627\u0643 \u0631\u062c\u0644 " + - "\u062d\u0643\u064a\u0645 \u064a\u0639\u0631\u0641 " + - "\u0643\u0644 \u0634\u064a\u0621 \u0639\u0646 " + - "\u0627\u0644\u062d\u064a\u0627\u0629 \u0648\u0643\u0627\u0646 " + - "\u064a\u0639\u0644\u0645 \u0627\u0644\u0646\u0627\u0633 " + - "\u0643\u064a\u0641 \u064a\u0639\u064a\u0634\u0648\u0646 " + - "\u0628\u0633\u0644\u0627\u0645 \u0648\u0627\u0646\u0633\u062c\u0627\u0645. " + - "\u0627\u0644\u0644\u063a\u0629 \u0627\u0644\u0639\u0631\u0628\u064a\u0629 " + - "\u0647\u064a \u0648\u0627\u062d\u062f\u0629 \u0645\u0646 " + - "\u0623\u0643\u062b\u0631 \u0627\u0644\u0644\u063a\u0627\u062a " + - "\u0627\u0646\u062a\u0634\u0627\u0631\u0627 \u0641\u064a " + - "\u0627\u0644\u0639\u0627\u0644\u0645 \u0648\u064a\u062a\u062d\u062b\u0647\u0627 " + - "\u0623\u0643\u062b\u0631 \u0645\u0646 \u062b\u0644\u0627\u062b\u0645\u0627\u0626\u0629 " + - "\u0645\u0644\u064a\u0648\u0646 \u0625\u0646\u0633\u0627\u0646."; - byte[] arabicBytes = arabicText.getBytes(windows1256); - - EncodingDetectorContext context = new EncodingDetectorContext(); - context.addResult(UTF_8, "HtmlEncodingDetector"); - context.addResult(windows1256, "Icu4jEncodingDetector"); - - ParseContext parseContext = new ParseContext(); - parseContext.set(EncodingDetectorContext.class, context); - - CharSoupEncodingDetector detector = new CharSoupEncodingDetector(); - try (TikaInputStream tis = TikaInputStream.get( - new ByteArrayInputStream(arabicBytes))) { - Charset result = detector.detect(tis, new Metadata(), parseContext); - assertEquals(windows1256, result); - assertEquals("scored", context.getArbitrationInfo()); - } - } - - @Test - public void testStreamResetAfterDetection() throws Exception { - EncodingDetectorContext context = new EncodingDetectorContext(); - context.addResult(UTF_8, "DetectorA"); - context.addResult(ISO_8859_1, "DetectorB"); - - ParseContext parseContext = new ParseContext(); - parseContext.set(EncodingDetectorContext.class, context); - - byte[] data = "Hello, world! This is a test of encoding detection.".getBytes(UTF_8); - CharSoupEncodingDetector detector = new CharSoupEncodingDetector(); - try (TikaInputStream tis = TikaInputStream.get( - new ByteArrayInputStream(data))) { - detector.detect(tis, new Metadata(), parseContext); - - // Verify stream is back at the start - byte[] readBack = new byte[data.length]; - int bytesRead = tis.read(readBack); - assertEquals(data.length, bytesRead); - assertEquals("Hello, world! This is a test of encoding detection.", - new String(readBack, UTF_8)); - } - } - - @Test - public void testStripTags() { - assertEquals("Hello world", - CharSoupEncodingDetector.stripTags( - "<html><body>Hello world</body></html>")); - assertEquals("no tags here", - CharSoupEncodingDetector.stripTags("no tags here")); - assertEquals("", - CharSoupEncodingDetector.stripTags("<empty/>")); - } - - @Test - public void testDecode() { - byte[] utf8Bytes = "caf\u00e9".getBytes(UTF_8); - assertEquals("caf\u00e9", - CharSoupEncodingDetector.decode(utf8Bytes, UTF_8)); - } - - @Test - public void testReadLimitGetterSetter() { - CharSoupEncodingDetector detector = new CharSoupEncodingDetector(); - assertEquals(16384, detector.getReadLimit()); - detector.setReadLimit(4096); - assertEquals(4096, detector.getReadLimit()); - } -} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testArabicMisleadingCharset.html b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testArabicMisleadingCharset.html deleted file mode 100644 index e9884177f2..0000000000 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testArabicMisleadingCharset.html +++ /dev/null @@ -1,11 +0,0 @@ -<html> -<head> -<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> -<title>Test Arabic</title> -</head> -<body> -<p>�� ���� �� ����� ��� ���� ��� ���� ���� �� ��� �� ������ ���� ���� ����� ��� ������ ����� �������. ����� ������� �� ����� �� ���� ������ ������� �� ������ ������� ���� �� �������� ����� �����.</p> -<p>�� ���� �� ����� ��� ���� ��� ���� ���� �� ��� �� ������ ���� ���� ����� ��� ������ ����� �������. ����� ������� �� ����� �� ���� ������ ������� �� ������ ������� ���� �� �������� ����� �����.</p> -<p>�� ���� �� ����� ��� ���� ��� ���� ���� �� ��� �� ������ ���� ���� ����� ��� ������ ����� �������. ����� ������� �� ����� �� ���� ������ ������� �� ������ ������� ���� �� �������� ����� �����.</p> -</body> -</html> \ No newline at end of file
