(tika) 01/02: Revert "TIKA-4671 - git add"

tallison Wed, 18 Feb 2026 15:04:18 -0800

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


commit 1a236d2a4e071f4525eaa50e10fe1c6fc5a66e63
Author: tallison <[email protected]>
AuthorDate: Wed Feb 18 18:03:32 2026 -0500

    Revert "TIKA-4671 - git add"
    
    This reverts commit 1bf02554296fa38f236870dc9feef7f687e23abb.
---
 .../apache/tika/detect/MetaEncodingDetector.java   |  39 -----
 .../charsoup/CharSoupEncodingDetector.java         | 186 ---------------------
 .../charsoup/CharSoupEncodingDetectorTest.java     | 183 --------------------
 .../testArabicMisleadingCharset.html               |  11 --
 4 files changed, 419 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/detect/MetaEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/MetaEncodingDetector.java
deleted file mode 100644
index e8a46f647b..0000000000
--- a/tika-core/src/main/java/org/apache/tika/detect/MetaEncodingDetector.java
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect;
-
-/**
- * Marker interface for encoding detectors that arbitrate among
- * candidates collected by base detectors rather than detecting
- * encoding directly from the stream.
- *
- * <p>When a {@code MetaEncodingDetector} is present in a
- * {@link CompositeEncodingDetector}, the composite switches from
- * first-match-wins to collect-all mode: all base detectors run
- * first and their results are collected in an
- * {@link EncodingDetectorContext}, then the meta detector's
- * {@link #detect} method is called to pick the winner.</p>
- *
- * <p>The {@link EncodingDetectorContext} is placed in the
- * {@link org.apache.tika.parser.ParseContext} before the meta
- * detector is invoked, so implementations can retrieve it via
- * {@code parseContext.get(EncodingDetectorContext.class)}.</p>
- *
- * @since Apache Tika 3.2
- */
-public interface MetaEncodingDetector extends EncodingDetector {
-}
diff --git a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
deleted file mode 100644
index cb393cadf7..0000000000
--- a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.langdetect.charsoup;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
-import java.nio.charset.CodingErrorAction;
-import java.util.LinkedHashMap;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.tika.config.TikaComponent;
-import org.apache.tika.detect.EncodingDetectorContext;
-import org.apache.tika.detect.MetaEncodingDetector;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-
-/**
- * A {@link MetaEncodingDetector} that uses the CharSoup language detector
- * to arbitrate when base encoding detectors disagree.
- *
- * <p>When base detectors all agree, the unanimous charset is returned
- * without any language detection. When they disagree, raw bytes are
- * read from the stream, decoded with each candidate charset, and each
- * decoded text is scored by {@link CharSoupLanguageDetector}. The
- * charset that produces the highest-confidence language detection wins.</p>
- *
- * <p>To enable, add this detector to your encoding detector chain in
- * tika-config:</p>
- * <pre>{@code
- * "encoding-detectors": [
- *   { "default-encoding-detector": {} },
- *   { "charsoup-encoding-detector": {} }
- * ]
- * }</pre>
- *
- * @since Apache Tika 3.2
- */
-@TikaComponent
-public class CharSoupEncodingDetector implements MetaEncodingDetector {
-
-    private static final long serialVersionUID = 1L;
-
-    private static final int DEFAULT_READ_LIMIT = 16384;
-
-    private int readLimit = DEFAULT_READ_LIMIT;
-
-    @Override
-    public Charset detect(TikaInputStream tis, Metadata metadata,
-                          ParseContext parseContext) throws IOException {
-        EncodingDetectorContext context =
-                parseContext.get(EncodingDetectorContext.class);
-        if (context == null || context.getResults().isEmpty()) {
-            return null;
-        }
-
-        Set<Charset> uniqueCharsets = context.getUniqueCharsets();
-
-        if (uniqueCharsets.size() <= 1) {
-            // Unanimous or single detector — no arbitration needed
-            EncodingDetectorContext.Result first = context.getResults().get(0);
-            context.setArbitrationInfo("unanimous");
-            return first.getCharset();
-        }
-
-        // Disagreement — arbitrate via language detection scoring
-        return arbitrate(tis, context, uniqueCharsets);
-    }
-
-    private Charset arbitrate(TikaInputStream tis,
-                              EncodingDetectorContext context,
-                              Set<Charset> uniqueCharsets) throws IOException {
-        EncodingDetectorContext.Result firstResult = context.getResults().get(0);
-
-        if (tis == null) {
-            context.setArbitrationInfo("no-stream");
-            return firstResult.getCharset();
-        }
-
-        byte[] bytes = readBytes(tis);
-        if (bytes == null || bytes.length == 0) {
-            context.setArbitrationInfo("empty-stream");
-            return firstResult.getCharset();
-        }
-
-        Map<Charset, String> candidates = new LinkedHashMap<>();
-        for (Charset candidate : uniqueCharsets) {
-            candidates.put(candidate, stripTags(decode(bytes, candidate)));
-        }
-
-        CharSoupLanguageDetector langDetector = new CharSoupLanguageDetector();
-        Charset bestCharset = langDetector.compareLanguageSignal(candidates);
-        if (bestCharset == null) {
-            bestCharset = firstResult.getCharset();
-        }
-
-        context.setArbitrationInfo("scored");
-        return bestCharset;
-    }
-
-    private byte[] readBytes(TikaInputStream tis) throws IOException {
-        try {
-            tis.mark(readLimit);
-            byte[] buf = new byte[readLimit];
-            int totalRead = 0;
-            int bytesRead;
-            while (totalRead < readLimit &&
-                    (bytesRead = tis.read(buf, totalRead,
-                            readLimit - totalRead)) != -1) {
-                totalRead += bytesRead;
-            }
-            if (totalRead == 0) {
-                return null;
-            }
-            if (totalRead < readLimit) {
-                byte[] trimmed = new byte[totalRead];
-                System.arraycopy(buf, 0, trimmed, 0, totalRead);
-                return trimmed;
-            }
-            return buf;
-        } finally {
-            tis.reset();
-        }
-    }
-
-    /**
-     * Decode bytes using the given charset, replacing malformed/unmappable
-     * characters rather than throwing.
-     */
-    static String decode(byte[] bytes, Charset charset) {
-        CharsetDecoder decoder = charset.newDecoder()
-                .onMalformedInput(CodingErrorAction.REPLACE)
-                .onUnmappableCharacter(CodingErrorAction.REPLACE);
-        CharBuffer cb = CharBuffer.allocate(bytes.length * 2);
-        decoder.decode(ByteBuffer.wrap(bytes), cb, true);
-        decoder.flush(cb);
-        cb.flip();
-        return cb.toString();
-    }
-
-    /**
-     * Simple tag stripping: removes &lt;...&gt; sequences so that
-     * HTML/XML tag names and attributes don't pollute language scoring.
-     */
-    static String stripTags(String text) {
-        StringBuilder sb = new StringBuilder(text.length());
-        boolean inTag = false;
-        for (int i = 0; i < text.length(); i++) {
-            char c = text.charAt(i);
-            if (c == '<') {
-                inTag = true;
-            } else if (c == '>') {
-                inTag = false;
-            } else if (!inTag) {
-                sb.append(c);
-            }
-        }
-        return sb.toString();
-    }
-
-    public int getReadLimit() {
-        return readLimit;
-    }
-
-    public void setReadLimit(int readLimit) {
-        this.readLimit = readLimit;
-    }
-}
diff --git a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java
deleted file mode 100644
index e9b6001dcb..0000000000
--- a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.langdetect.charsoup;
-
-import static java.nio.charset.StandardCharsets.ISO_8859_1;
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertNull;
-import static org.junit.jupiter.api.Assertions.assertTrue;
-
-import java.io.ByteArrayInputStream;
-import java.nio.charset.Charset;
-
-import org.junit.jupiter.api.Test;
-
-import org.apache.tika.detect.EncodingDetectorContext;
-import org.apache.tika.detect.MetaEncodingDetector;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-
-public class CharSoupEncodingDetectorTest {
-
-    @Test
-    public void testIsMetaEncodingDetector() {
-        assertTrue(new CharSoupEncodingDetector() instanceof MetaEncodingDetector);
-    }
-
-    @Test
-    public void testUnanimous() throws Exception {
-        CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
-        EncodingDetectorContext context = new EncodingDetectorContext();
-        context.addResult(UTF_8, "DetectorA");
-        context.addResult(UTF_8, "DetectorB");
-
-        ParseContext parseContext = new ParseContext();
-        parseContext.set(EncodingDetectorContext.class, context);
-
-        byte[] data = "Hello, world!".getBytes(UTF_8);
-        try (TikaInputStream tis = TikaInputStream.get(
-                new ByteArrayInputStream(data))) {
-            Charset result = detector.detect(tis, new Metadata(), parseContext);
-            assertEquals(UTF_8, result);
-            assertEquals("unanimous", context.getArbitrationInfo());
-        }
-    }
-
-    @Test
-    public void testNoContext() throws Exception {
-        CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
-        ParseContext parseContext = new ParseContext();
-
-        byte[] data = "Test".getBytes(UTF_8);
-        try (TikaInputStream tis = TikaInputStream.get(
-                new ByteArrayInputStream(data))) {
-            Charset result = detector.detect(tis, new Metadata(), parseContext);
-            assertNull(result);
-        }
-    }
-
-    @Test
-    public void testEmptyResults() throws Exception {
-        CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
-        EncodingDetectorContext context = new EncodingDetectorContext();
-
-        ParseContext parseContext = new ParseContext();
-        parseContext.set(EncodingDetectorContext.class, context);
-
-        byte[] data = "Test".getBytes(UTF_8);
-        try (TikaInputStream tis = TikaInputStream.get(
-                new ByteArrayInputStream(data))) {
-            Charset result = detector.detect(tis, new Metadata(), parseContext);
-            assertNull(result);
-        }
-    }
-
-    @Test
-    public void testArabicEncodingArbitration() throws Exception {
-        // Arabic text encoded in windows-1256.
-        // When decoded as UTF-8 it produces replacement chars / garbage.
-        // When decoded as windows-1256 it produces valid Arabic.
-        // The language detector should pick windows-1256.
-        Charset windows1256 = Charset.forName("windows-1256");
-
-        String arabicText =
-                "\u0641\u064a \u0642\u0631\u064a\u0629 \u0645\u0646 " +
-                "\u0627\u0644\u0642\u0631\u0649 \u0643\u0627\u0646 " +
-                "\u0647\u0646\u0627\u0643 \u0631\u062c\u0644 " +
-                "\u062d\u0643\u064a\u0645 \u064a\u0639\u0631\u0641 " +
-                "\u0643\u0644 \u0634\u064a\u0621 \u0639\u0646 " +
-                "\u0627\u0644\u062d\u064a\u0627\u0629 \u0648\u0643\u0627\u0646 " +
-                "\u064a\u0639\u0644\u0645 \u0627\u0644\u0646\u0627\u0633 " +
-                "\u0643\u064a\u0641 \u064a\u0639\u064a\u0634\u0648\u0646 " +
-                "\u0628\u0633\u0644\u0627\u0645 \u0648\u0627\u0646\u0633\u062c\u0627\u0645. " +
-                "\u0627\u0644\u0644\u063a\u0629 \u0627\u0644\u0639\u0631\u0628\u064a\u0629 " +
-                "\u0647\u064a \u0648\u0627\u062d\u062f\u0629 \u0645\u0646 " +
-                "\u0623\u0643\u062b\u0631 \u0627\u0644\u0644\u063a\u0627\u062a " +
-                "\u0627\u0646\u062a\u0634\u0627\u0631\u0627 \u0641\u064a " +
-                "\u0627\u0644\u0639\u0627\u0644\u0645 \u0648\u064a\u062a\u062d\u062b\u0647\u0627 " +
-                "\u0623\u0643\u062b\u0631 \u0645\u0646 \u062b\u0644\u0627\u062b\u0645\u0627\u0626\u0629 " +
-                "\u0645\u0644\u064a\u0648\u0646 \u0625\u0646\u0633\u0627\u0646.";
-        byte[] arabicBytes = arabicText.getBytes(windows1256);
-
-        EncodingDetectorContext context = new EncodingDetectorContext();
-        context.addResult(UTF_8, "HtmlEncodingDetector");
-        context.addResult(windows1256, "Icu4jEncodingDetector");
-
-        ParseContext parseContext = new ParseContext();
-        parseContext.set(EncodingDetectorContext.class, context);
-
-        CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
-        try (TikaInputStream tis = TikaInputStream.get(
-                new ByteArrayInputStream(arabicBytes))) {
-            Charset result = detector.detect(tis, new Metadata(), parseContext);
-            assertEquals(windows1256, result);
-            assertEquals("scored", context.getArbitrationInfo());
-        }
-    }
-
-    @Test
-    public void testStreamResetAfterDetection() throws Exception {
-        EncodingDetectorContext context = new EncodingDetectorContext();
-        context.addResult(UTF_8, "DetectorA");
-        context.addResult(ISO_8859_1, "DetectorB");
-
-        ParseContext parseContext = new ParseContext();
-        parseContext.set(EncodingDetectorContext.class, context);
-
-        byte[] data = "Hello, world! This is a test of encoding detection.".getBytes(UTF_8);
-        CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
-        try (TikaInputStream tis = TikaInputStream.get(
-                new ByteArrayInputStream(data))) {
-            detector.detect(tis, new Metadata(), parseContext);
-
-            // Verify stream is back at the start
-            byte[] readBack = new byte[data.length];
-            int bytesRead = tis.read(readBack);
-            assertEquals(data.length, bytesRead);
-            assertEquals("Hello, world! This is a test of encoding detection.",
-                    new String(readBack, UTF_8));
-        }
-    }
-
-    @Test
-    public void testStripTags() {
-        assertEquals("Hello world",
-                CharSoupEncodingDetector.stripTags(
-                        "<html><body>Hello world</body></html>"));
-        assertEquals("no tags here",
-                CharSoupEncodingDetector.stripTags("no tags here"));
-        assertEquals("",
-                CharSoupEncodingDetector.stripTags("<empty/>"));
-    }
-
-    @Test
-    public void testDecode() {
-        byte[] utf8Bytes = "caf\u00e9".getBytes(UTF_8);
-        assertEquals("caf\u00e9",
-                CharSoupEncodingDetector.decode(utf8Bytes, UTF_8));
-    }
-
-    @Test
-    public void testReadLimitGetterSetter() {
-        CharSoupEncodingDetector detector = new CharSoupEncodingDetector();
-        assertEquals(16384, detector.getReadLimit());
-        detector.setReadLimit(4096);
-        assertEquals(4096, detector.getReadLimit());
-    }
-}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testArabicMisleadingCharset.html b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testArabicMisleadingCharset.html
deleted file mode 100644
index e9884177f2..0000000000
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testArabicMisleadingCharset.html
+++ /dev/null
@@ -1,11 +0,0 @@
-<html>
-<head>
-<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
-<title>Test Arabic</title>
-</head>
-<body>
-<p>�� ���� �� ����� ��� ���� ��� ���� ���� �� ��� �� ������ ���� ���� ����� 
��� ������ ����� �������. ����� ������� �� ����� �� ���� ������ ������� �� 
������ ������� ���� �� �������� ����� �����.</p>
-<p>�� ���� �� ����� ��� ���� ��� ���� ���� �� ��� �� ������ ���� ���� ����� 
��� ������ ����� �������. ����� ������� �� ����� �� ���� ������ ������� �� 
������ ������� ���� �� �������� ����� �����.</p>
-<p>�� ���� �� ����� ��� ���� ��� ���� ���� �� ��� �� ������ ���� ���� ����� 
��� ������ ����� �������. ����� ������� �� ����� �� ���� ������ ������� �� 
������ ������� ���� �� �������� ����� �����.</p>
-</body>
-</html>
\ No newline at end of file

(tika) 01/02: Revert "TIKA-4671 - git add"

Reply via email to