This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new e7238901df TIKA-4675 -- improve wide unicode detection (#2647)
e7238901df is described below
commit e7238901df2eadfc53a0a9341779e3c85d928d5c
Author: Tim Allison <[email protected]>
AuthorDate: Wed Feb 25 20:19:18 2026 -0500
TIKA-4675 -- improve wide unicode detection (#2647)
---
.../apache/tika/detect/WideUnicodeDetector.java | 490 +++++++++++++++++++++
.../tika/detect/WideUnicodeDetectorTest.java | 452 +++++++++++++++++++
2 files changed, 942 insertions(+)
diff --git a/tika-core/src/main/java/org/apache/tika/detect/WideUnicodeDetector.java b/tika-core/src/main/java/org/apache/tika/detect/WideUnicodeDetector.java
new file mode 100644
index 0000000000..89f904d91a
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/detect/WideUnicodeDetector.java
@@ -0,0 +1,490 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * An {@link EncodingDetector} that identifies UTF-16 LE/BE and UTF-32 LE/BE
+ * purely from structural byte-position patterns — no BOM reliance.
+ *
+ * <h3>Detection strategy</h3>
+ *
+ * <p><strong>UTF-32 (null-position check):</strong> For BMP codepoints
+ * (U+0000–U+FFFF, covering all major scripts), UTF-32BE always produces
+ * {@code 0x00} at byte positions 0 and 1 within each 4-byte group;
+ * UTF-32LE always produces {@code 0x00} at positions 2 and 3. UTF-32 is
+ * tested before UTF-16 because Latin UTF-32 also triggers the UTF-16
+ * null-column check.</p>
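+ *
+ * <p>For example, {@code 'A'} (U+0041) is {@code 00 00 00 41} in UTF-32BE and
+ * {@code 41 00 00 00} in UTF-32LE: the two structural positions are {@code 0x00}
+ * for any BMP code point, regardless of script.</p>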
+ *
+ * <p><strong>UTF-16 (three-phase):</strong>
+ * <ol>
+ * <li><em>Null-column</em> — Latin/ASCII content: one byte column
+ * (even or odd positions) has a high {@code 0x00} rate.</li>
+ * <li><em>Variety-ratio</em> — scripts with a narrow block-prefix byte
+ * (Arabic 0x06, Hebrew 0x05, Greek 0x03, Devanagari 0x09, …): the
+ * glyph-index column has at least 2× as many distinct values as the
+ * block-prefix column.</li>
+ * <li><em>Block-prefix range</em> — CJK (0x4E–0x9F, 82 values) and
+ * Hangul (0xAC–0xD7, 44 values) where the variety ratio alone may not
+ * be decisive with limited samples: if all values in one column fall
+ * below the surrogate boundary (0xD8) and the other column has more
+ * distinct values, the constrained column is carrying Unicode
+ * block-prefix bytes.</li>
+ * </ol>
+ * </p>
+ *
+ * <h3>BOM handling</h3>
+ * <p>Any BOM at the start of the stream is stripped <em>before</em> analysis
+ * to preserve fixed-width group alignment. A 3-byte UTF-8 BOM would otherwise
+ * shift every subsequent byte position by 3, breaking both UTF-16 pair
+ * alignment and UTF-32 4-byte group alignment. The BOM bytes are not used to
+ * infer the encoding — only the content after them is examined.</p>
+ *
+ * <h3>What this class does NOT do</h3>
+ * <ul>
+ * <li>UTF-8 detection — UTF-8 is variable-width and self-describing; use
+ * grammar validation instead.</li>
+ * <li>Single-byte or multi-byte Asian encoding detection — left to
+ * statistical detectors (Universal, ICU4J).</li>
+ * </ul>
+ *
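+ * <p>A minimal usage sketch (illustrative only; the file name and surrounding
+ * I/O are placeholders, not part of this class):</p>
+ * <pre>{@code
+ * byte[] raw = Files.readAllBytes(Path.of("unknown.txt"));
+ * Charset cs = WideUnicodeDetector.detectEncoding(raw); // null if not UTF-16/32
+ * if (cs != null) {
+ *     String text = new String(WideUnicodeDetector.skipBom(raw), cs);
+ * }
+ * }</pre>
+ *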
+ * @since Apache Tika 3.2
+ */
+public class WideUnicodeDetector implements EncodingDetector {
+
+ /**
+ * Maximum bytes read from the stream per call to {@link #detect}.
+ * Must be at least as large as {@link #SAMPLE_LIMIT} plus the longest
+ * possible BOM (4 bytes for UTF-32).
+ */
+ private static final int STREAM_READ_LIMIT = 516;
+
+ /**
+ * Maximum content bytes analysed (after BOM stripping). Must be a
+ * multiple of 4 so that UTF-32 group alignment is preserved.
+ */
+ private static final int SAMPLE_LIMIT = 512;
+
+ /**
+ * Null-column threshold for UTF-16 Latin detection: the null rate in the
+ * constrained column must exceed {@code 1 / NULL_THRESHOLD_DENOM}.
+ *
+ * <p>Set to 4 (threshold {@literal >} 25%) rather than 10 (10%) for two
+ * reasons discovered during corpus analysis:
+ * <ul>
+ * <li>OLE2 compound documents (.doc, .xls, .msg) have ~12–15% null at
+ * the odd-byte column from 2-byte little-endian header integers; at
+ * the 10% threshold they were wrongly detected as UTF-16LE.</li>
+ * <li>Small bzip2 frames have ~20–25% null at the even-byte column from
+ * block-header zeros; same false-positive risk.</li>
+ * </ul>
+ * Real Latin UTF-16 text has {@literal >} 90% null in the null column,
+ * so raising the threshold to 25% has no effect on legitimate detections.
+ * </p>
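+     *
+     * <p>Concretely, with a full 512-byte sample (256 byte pairs) a column must
+     * contain more than 64 null bytes (25%) before the Phase 1 check fires.</p>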
+ */
+ private static final int NULL_THRESHOLD_DENOM = 4;
+
+ /**
+ * UTF-32 BMP null threshold: at least 90% of 4-byte groups must have both
+ * structural positions equal to {@code 0x00}, allowing up to 10% non-BMP
+ * codepoints (emoji, historic scripts, etc.).
+ */
+ private static final double UTF32_NULL_THRESHOLD = 0.90;
+
+ /**
+ * Minimum fraction of 4-byte groups whose content byte (position 3 for
+ * UTF-32BE, position 0 for UTF-32LE) must be non-zero. Guards against
+ * false-positive detection on nearly-null binary data. Set to 0.80 to
+ * tolerate CJK characters whose low byte happens to be 0x00 (e.g.
+ * U+4E00, U+5000, …), which represent roughly 0.4% of common CJK chars.
+ */
+ private static final double UTF32_CONTENT_NONZERO_MIN = 0.80;
+
+ /**
+ * Minimum ratio of distinct values between the glyph-index column and the
+ * block-prefix column for the variety-ratio check to fire.
+ */
+ private static final double UTF16_VARIETY_RATIO = 2.0;
+
+ /**
+ * The constrained column must have fewer than this fraction of pairs as
+ * distinct values. Guards against firing on uniformly random data.
+ */
+ private static final double UTF16_CONSTRAINED_MAX_RATIO = 0.40;
+
+ /**
+ * Upper bound (exclusive) for the UTF-16 block-prefix range check.
+ * Set to 0xD8 — the start of the UTF-16 surrogate range — which covers
+ * every assigned BMP script: Latin, Greek, Cyrillic, Arabic, Hebrew,
+ * Devanagari, CJK (0x4E–0x9F), Yi (0xA0–0xA4), Hangul (0xAC–0xD7).
+ */
+ private static final int BMP_BLOCK_PREFIX_MAX = 0xD8;
+
+ /**
+ * Known BOM sequences, longest first so that the 4-byte UTF-32 BOMs are
+ * matched before the 2-byte UTF-16 BOMs that share their prefix.
+ */
+ private static final byte[][] BOMS = {
+ {(byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF}, // UTF-32BE
+ {(byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00}, // UTF-32LE
+ {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF}, // UTF-8
+ {(byte) 0xFE, (byte) 0xFF}, // UTF-16BE
+ {(byte) 0xFF, (byte) 0xFE}, // UTF-16LE
+ };
+
+ @Override
+ public Charset detect(TikaInputStream tis, Metadata metadata,
+ ParseContext context) throws IOException {
+ if (tis == null || !tis.markSupported()) {
+ return null;
+ }
+
+ tis.mark(STREAM_READ_LIMIT);
+ byte[] buf;
+ try {
+ buf = tis.readNBytes(STREAM_READ_LIMIT);
+ } finally {
+ tis.reset();
+ }
+
+ // 12 = longest BOM (4 bytes) + minimum analysable content (8 bytes)
+ if (buf.length < 12) {
+ return null;
+ }
+
+ return detectEncoding(buf);
+ }
+
+ /**
+ * Detect the wide Unicode encoding of a raw byte array.
+ * Any leading BOM is stripped before analysis so that the fixed-width
+ * group alignment is preserved. Callers do not need to strip the BOM
+ * themselves.
+ *
+ * @param bytes raw content bytes, with or without a leading BOM
+ * @return detected charset, or {@code null} if no wide Unicode structure
+ * is found
+ */
+ public static Charset detectEncoding(byte[] bytes) {
+ if (bytes == null || bytes.length < 8) {
+ return null;
+ }
+
+ bytes = skipBom(bytes);
+
+ int sampleLen = (Math.min(bytes.length, SAMPLE_LIMIT) / 4) * 4;
+ if (sampleLen < 8) {
+ return null;
+ }
+
+ Charset utf32 = tryUtf32(bytes, sampleLen);
+ if (utf32 != null) {
+ return utf32;
+ }
+ return tryUtf16(bytes, sampleLen);
+ }
+
+ /**
+ * Strips any leading BOM from {@code bytes}. If no BOM is found the
+ * original array is returned unchanged (no copy).
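+     *
+     * <p>For example (illustrative): {@code skipBom(new byte[]{(byte) 0xEF,
+     * (byte) 0xBB, (byte) 0xBF, 0x41})} returns {@code {0x41}}.</p>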
+ */
+ public static byte[] skipBom(byte[] bytes) {
+ for (byte[] bom : BOMS) {
+ if (startsWith(bytes, bom)) {
+ return Arrays.copyOfRange(bytes, bom.length, bytes.length);
+ }
+ }
+ return bytes;
+ }
+
+ /**
+ * UTF-32 detection via null-position signature, structural validity, and
+ * content-position non-zero check.
+ *
+ * <p>Structural validity: each 4-byte group is checked against the Unicode
+ * codepoint range (U+0000–U+10FFFF) and the surrogate exclusion zone
+ * (U+D800–U+DFFF). A single invalid group rules out that byte order.</p>
+ *
+ * <p>Content-position check: the byte carrying the actual character value
+ * (position 3 for BE, position 0 for LE) must be non-zero in at least
+ * {@value #UTF32_CONTENT_NONZERO_MIN} of groups. This rejects nearly-null
+ * binary data that satisfies the structural-zero check by accident.</p>
+ */
+ private static Charset tryUtf32(byte[] bytes, int sampleLen) {
+ int groups = sampleLen / 4;
+ int bothZeroAt01 = 0;
+ int bothZeroAt23 = 0;
+ int[] countsPos0 = new int[256];
+ int[] countsPos3 = new int[256];
+ boolean invalidBe = false;
+ boolean invalidLe = false;
+
+ for (int g = 0; g < groups; g++) {
+ int b0 = bytes[g * 4] & 0xFF;
+ int b1 = bytes[g * 4 + 1] & 0xFF;
+ int b2 = bytes[g * 4 + 2] & 0xFF;
+ int b3 = bytes[g * 4 + 3] & 0xFF;
+
+ if (b0 == 0 && b1 == 0) bothZeroAt01++;
+ if (b2 == 0 && b3 == 0) bothZeroAt23++;
+ countsPos0[b0]++;
+ countsPos3[b3]++;
+
+ // UTF-32BE validity: codepoint = (b0<<24)|(b1<<16)|(b2<<8)|b3
+            // Valid range: 0x000000–0x10FFFF, excluding surrogates 0xD800–0xDFFF
+            if (!invalidBe) {
+                if (b0 != 0 || b1 > 0x10 || (b1 == 0 && 0xD8 <= b2 && b2 <= 0xDF)) {
+ invalidBe = true;
+ }
+ }
+
+ // UTF-32LE validity: codepoint = (b3<<24)|(b2<<16)|(b1<<8)|b0
+ if (!invalidLe) {
+                if (b3 != 0 || b2 > 0x10 || (b2 == 0 && 0xD8 <= b1 && b1 <= 0xDF)) {
+ invalidLe = true;
+ }
+ }
+ }
+
+        // UTF-32BE: structural zeros at positions 0,1 + validity + content non-zero
+        if (!invalidBe
+                && (double) bothZeroAt01 / groups >= UTF32_NULL_THRESHOLD
+                && countUnique(countsPos3) >= 2
+                && (double) (groups - countsPos3[0]) / groups >= UTF32_CONTENT_NONZERO_MIN) {
+ return Charset.forName("UTF-32BE");
+ }
+
+        // UTF-32LE: structural zeros at positions 2,3 + validity + content non-zero
+        if (!invalidLe
+                && (double) bothZeroAt23 / groups >= UTF32_NULL_THRESHOLD
+                && countUnique(countsPos0) >= 2
+                && (double) (groups - countsPos0[0]) / groups >= UTF32_CONTENT_NONZERO_MIN) {
+ return Charset.forName("UTF-32LE");
+ }
+
+ return null;
+ }
+
+ /**
+     * UTF-16 detection via three phases, with surrogate-pair sequence validation
+     * running in parallel to rule out structurally impossible byte sequences.
+     *
+     * <p>Surrogate validation: in UTF-16BE the even byte is the high byte of
+     * each code unit; in UTF-16LE the odd byte is the high byte. A high surrogate
+     * (0xD8–0xDB) must be immediately followed by a low surrogate (0xDC–0xDF).
+     * A lone low surrogate, or a high surrogate not followed by a low surrogate,
+     * marks that byte order as invalid.</p>
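+     *
+     * <p>Illustration (informative only): U+1F600 encodes as the surrogate pair
+     * {@code D8 3D DE 00} in UTF-16BE and {@code 3D D8 00 DE} in UTF-16LE, so
+     * the high byte of each code unit lands in the even column for BE and in
+     * the odd column for LE.</p>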
+ */
+ private static Charset tryUtf16(byte[] bytes, int sampleLen) {
+ int pairs = sampleLen / 2;
+ int nullsAtEven = 0;
+ int nullsAtOdd = 0;
+ int[] countsEven = new int[256];
+ int[] countsOdd = new int[256];
+
+        // Surrogate-pair state: true = we just saw a high surrogate and expect a low
+ boolean awaitLowBe = false; // UTF-16BE: high byte is even
+ boolean awaitLowLe = false; // UTF-16LE: high byte is odd
+ boolean invalidBe = false;
+ boolean invalidLe = false;
+
+ for (int p = 0; p < pairs; p++) {
+ int even = bytes[p * 2] & 0xFF;
+ int odd = bytes[p * 2 + 1] & 0xFF;
+
+ if (even == 0) nullsAtEven++;
+ if (odd == 0) nullsAtOdd++;
+ countsEven[even]++;
+ countsOdd[odd]++;
+
+ // UTF-16BE surrogate validation (high byte = even)
+ if (!invalidBe) {
+ if (awaitLowBe) {
+ if (0xDC <= even && even <= 0xDF) {
+                        awaitLowBe = false; // valid low surrogate completes pair
+                    } else {
+                        invalidBe = true; // expected low surrogate, got something else
+ }
+ } else {
+ if (0xD8 <= even && even <= 0xDB) {
+ awaitLowBe = true; // high surrogate — expect low next
+ } else if (0xDC <= even && even <= 0xDF) {
+ invalidBe = true; // lone low surrogate
+ }
+ }
+ }
+
+ // UTF-16LE surrogate validation (high byte = odd)
+ if (!invalidLe) {
+ if (awaitLowLe) {
+ if (0xDC <= odd && odd <= 0xDF) {
+ awaitLowLe = false;
+ } else {
+ invalidLe = true;
+ }
+ } else {
+ if (0xD8 <= odd && odd <= 0xDB) {
+ awaitLowLe = true;
+ } else if (0xDC <= odd && odd <= 0xDF) {
+ invalidLe = true;
+ }
+ }
+ }
+ }
+
+ // An unmatched high surrogate at end of sample is also invalid
+ if (awaitLowBe) invalidBe = true;
+ if (awaitLowLe) invalidLe = true;
+
+ // Phase 1: null-column (Latin/ASCII)
+ boolean highEven = nullsAtEven * NULL_THRESHOLD_DENOM > pairs;
+ boolean highOdd = nullsAtOdd * NULL_THRESHOLD_DENOM > pairs;
+        if (highOdd && !highEven && !invalidLe) return StandardCharsets.UTF_16LE;
+        if (highEven && !highOdd && !invalidBe) return StandardCharsets.UTF_16BE;
+
+ // Phase 2: variety-ratio (Arabic, Hebrew, Greek, Devanagari, …)
+ //
+ // Extra guard: the constrained (block-prefix) column must NOT be
+ // dominated by 0x00. Real script prefixes are non-null constants
+ // (Arabic 0x06, Hebrew 0x05, Greek 0x03, Devanagari 0x09, …).
+ // When 0x00 is the dominant byte in the constrained column we are
+ // likely looking at binary data with sparse high bytes (e.g. MIPS
+ // big-endian code where R-type opcodes place 0x00 at byte 0 of
+ // every instruction), not a Unicode block prefix.
+ int uniqueEven = countUnique(countsEven);
+ int uniqueOdd = countUnique(countsOdd);
+ double constrainedMax = pairs * UTF16_CONSTRAINED_MAX_RATIO;
+
+        if (!invalidLe && (double) uniqueEven / uniqueOdd >= UTF16_VARIETY_RATIO
+ && uniqueOdd <= constrainedMax
+ && mostCommon(countsOdd) != 0) {
+ return StandardCharsets.UTF_16LE;
+ }
+        if (!invalidBe && (double) uniqueOdd / uniqueEven >= UTF16_VARIETY_RATIO
+ && uniqueEven <= constrainedMax
+ && mostCommon(countsEven) != 0) {
+ return StandardCharsets.UTF_16BE;
+ }
+
+ // Phase 3: block-prefix range (CJK, Hangul — wide block-prefix ranges)
+ //
+ // We require:
+ // oddInRange — all odd-position bytes < 0xD8 (below the surrogate
+ // boundary). This is the high-byte (block-prefix)
+ // column for UTF-16LE.
+ // !evenInRange — at least one even-position byte ≥ 0xD8 (the glyph-
+ // index column overflows the surrogate boundary, which
+        //                  is expected for CJK low bytes like 0xE5, 0xEF, 0xF4).
+ // nonNullAllAbove — no non-null byte below 0x20 in the constrained
+ // column. Binary formats inject control tokens here
+ // (ISOBMFF 0x01/0x02, LZ4 0x04/0x15); CJK and Hangul
+ // block-prefix bytes are always ≥ 0x4E/0xAC.
+ //
+        // We also require uniqueEven > uniqueOdd for LE (uniqueOdd > uniqueEven
+ // for BE). This is not about discriminating from random binary — the
+ // scenario where "12 vs 11 unique values, all below 0xD8" could cause
+        // a false positive is already prevented by !evenInRange (which requires
+ // at least one byte ≥ 0xD8 in the even column, contradicting "all
+ // below 0xD8"). The real purpose is orientation: it prevents Latin
+ // UTF-16BE data (whose odd column is full of diverse ASCII content
+ // bytes and whose even column is dominated by 0x00) from being
+ // misidentified as UTF-16LE when a stray byte ≥ 0xD8 lands in the
+ // even column (e.g. an injected or corrupted surrogate byte). In that
+ // pathological case uniqueEven ≈ 2 << uniqueOdd, so the check fails
+ // cleanly. For legitimate CJK and Hangul the glyph-index column
+ // always has at least as many distinct values as the block-prefix
+ // column, so a gap of 1 is enough.
+ boolean oddInRange = allInRange(countsOdd, BMP_BLOCK_PREFIX_MAX);
+ boolean evenInRange = allInRange(countsEven, BMP_BLOCK_PREFIX_MAX);
+ if (!invalidLe && oddInRange && !evenInRange && uniqueEven > uniqueOdd
+ && nonNullAllAbove(countsOdd, 0x20)) {
+ return StandardCharsets.UTF_16LE;
+ }
+        if (!invalidBe && evenInRange && !oddInRange && uniqueOdd > uniqueEven
+ && nonNullAllAbove(countsEven, 0x20)) {
+ return StandardCharsets.UTF_16BE;
+ }
+
+ return null;
+ }
+
+ private static int countUnique(int[] counts) {
+ int n = 0;
+ for (int c : counts) if (c > 0) n++;
+ return n;
+ }
+
+ private static boolean allInRange(int[] counts, int maxExclusive) {
+ for (int v = maxExclusive; v < counts.length; v++) {
+ if (counts[v] > 0) return false;
+ }
+ return true;
+ }
+
+ /**
+ * Returns the byte value (0–255) that appears most frequently in
+ * {@code counts}. Ties are broken in favour of the lower value.
+ */
+ private static int mostCommon(int[] counts) {
+ int best = 0;
+ for (int v = 1; v < counts.length; v++) {
+ if (counts[v] > counts[best]) {
+ best = v;
+ }
+ }
+ return best;
+ }
+
+ /**
+ * Returns {@code true} if every <em>non-null</em> byte value in
+ * {@code counts} is {@literal >=} {@code minInclusive}.
+ *
+ * <p>Null bytes ({@code 0x00}) are explicitly allowed: in a CJK or Hangul
+     * UTF-16 document the block-prefix (high-byte) column contains {@code 0x00}
+ * for any ASCII character in the text (spaces, punctuation, Latin product
+ * names, …). Those null values are harmless — the {@code allInRange} check
+ * already bounds the column below {@code 0xD8}.</p>
+ *
+ * <p>What we actually reject are <em>non-zero</em> control bytes such as
+ * {@code 0x01}–{@code 0x1F}: these appear in binary format tokens (ISOBMFF
+ * version/flags fields at {@code 0x01}–{@code 0x02}, LZ4 frame tokens at
+ * {@code 0x04} and {@code 0x15}, …) but never as Unicode block prefixes
+ * for scripts above Basic Latin.</p>
+ */
+ private static boolean nonNullAllAbove(int[] counts, int minInclusive) {
+ for (int v = 1; v < minInclusive; v++) { // start at 1, skip null
+ if (counts[v] > 0) return false;
+ }
+ return true;
+ }
+
+ private static boolean startsWith(byte[] bytes, byte[] prefix) {
+ if (bytes.length < prefix.length) return false;
+ for (int i = 0; i < prefix.length; i++) {
+ if (bytes[i] != prefix[i]) return false;
+ }
+ return true;
+ }
+}
diff --git a/tika-core/src/test/java/org/apache/tika/detect/WideUnicodeDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/WideUnicodeDetectorTest.java
new file mode 100644
index 0000000000..66d8c65271
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/detect/WideUnicodeDetectorTest.java
@@ -0,0 +1,452 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import static org.junit.jupiter.api.Assertions.assertArrayEquals;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+
+public class WideUnicodeDetectorTest {
+
+ private static final Charset UTF32LE = Charset.forName("UTF-32LE");
+ private static final Charset UTF32BE = Charset.forName("UTF-32BE");
+
+ private final WideUnicodeDetector detector = new WideUnicodeDetector();
+
+ private static byte[] encode(String text, Charset cs) {
+ return text.getBytes(cs);
+ }
+
+ private static byte[] prepend(byte[] prefix, byte[] body) {
+ byte[] out = new byte[prefix.length + body.length];
+ System.arraycopy(prefix, 0, out, 0, prefix.length);
+ System.arraycopy(body, 0, out, prefix.length, body.length);
+ return out;
+ }
+
+ /** Build a diverse CJK string spanning 4 high-byte groups. */
+ private static String diverseCjk(int countPerBlock) {
+ StringBuilder sb = new StringBuilder();
+ for (int start : new int[]{0x4E00, 0x5000, 0x6000, 0x7000}) {
+            for (int i = 0; i < countPerBlock; i++) sb.appendCodePoint(start + i);
+ }
+ return sb.toString();
+ }
+
+ private Charset detectViaStream(byte[] bytes) throws Exception {
+        try (TikaInputStream tis = TikaInputStream.get(new ByteArrayInputStream(bytes))) {
+ return detector.detect(tis, new Metadata(), new ParseContext());
+ }
+ }
+
+    // ── skipBom ──────────────────────────────────────────────────────────────
+
+ @Test
+ void skipBomStripsUtf8Bom() {
+ byte[] in = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 0x41, 0x42, 0x43};
+        assertArrayEquals(new byte[]{0x41, 0x42, 0x43}, WideUnicodeDetector.skipBom(in));
+ }
+
+ @Test
+ void skipBomStripsUtf16LeBom() {
+ byte[] in = {(byte) 0xFF, (byte) 0xFE, 0x41, 0x00};
+        assertArrayEquals(new byte[]{0x41, 0x00}, WideUnicodeDetector.skipBom(in));
+ }
+
+ @Test
+ void skipBomStripsUtf16BeBom() {
+ byte[] in = {(byte) 0xFE, (byte) 0xFF, 0x00, 0x41};
+        assertArrayEquals(new byte[]{0x00, 0x41}, WideUnicodeDetector.skipBom(in));
+ }
+
+ @Test
+ void skipBomStripsUtf32LeBom() {
+ // FF FE 00 00 must match as UTF-32LE before FF FE matches as UTF-16LE
+        byte[] in = {(byte) 0xFF, (byte) 0xFE, 0x00, 0x00, 0x41, 0x00, 0x00, 0x00};
+        assertArrayEquals(new byte[]{0x41, 0x00, 0x00, 0x00}, WideUnicodeDetector.skipBom(in));
+ }
+
+ @Test
+ void skipBomStripsUtf32BeBom() {
+        byte[] in = {0x00, 0x00, (byte) 0xFE, (byte) 0xFF, 0x00, 0x00, 0x00, 0x41};
+        assertArrayEquals(new byte[]{0x00, 0x00, 0x00, 0x41}, WideUnicodeDetector.skipBom(in));
+ }
+
+ @Test
+ void skipBomLeavesNoBomContentUnchanged() {
+ byte[] in = {0x41, 0x00, 0x42, 0x00};
+ assertArrayEquals(in, WideUnicodeDetector.skipBom(in));
+ }
+
+    // ── detectEncoding BOM handling ───────────────────────────────────────────
+
+ @Test
+ void detectEncodingStripsUtf8BomBeforeAnalysis() {
+ // A UTF-8 BOM (EF BB BF) before UTF-16LE content would shift every
+ // subsequent byte position by 3, misaligning the fixed-width pairs.
+ // detectEncoding() must strip it internally so callers don't have to.
+ String latin = "Hello world from a test. ".repeat(6);
+ byte[] raw = encode(latin, StandardCharsets.UTF_16LE);
+ byte[] withBom = new byte[3 + raw.length];
+ withBom[0] = (byte) 0xEF;
+ withBom[1] = (byte) 0xBB;
+ withBom[2] = (byte) 0xBF;
+ System.arraycopy(raw, 0, withBom, 3, raw.length);
+        assertEquals(StandardCharsets.UTF_16LE, WideUnicodeDetector.detectEncoding(withBom));
+ }
+
+ @Test
+ void detectEncodingStripsUtf16LeBomBeforeAnalysis() {
+ String latin = "Hello world from a test. ".repeat(6);
+ byte[] raw = encode(latin, StandardCharsets.UTF_16LE);
+ byte[] withBom = new byte[2 + raw.length];
+ withBom[0] = (byte) 0xFF;
+ withBom[1] = (byte) 0xFE;
+ System.arraycopy(raw, 0, withBom, 2, raw.length);
+        assertEquals(StandardCharsets.UTF_16LE, WideUnicodeDetector.detectEncoding(withBom));
+ }
+
+    // ── UTF-16 LE ────────────────────────────────────────────────────────────
+
+ @Test
+ void utf16LeLatinText() {
+ String text = "The quick brown fox jumps over the lazy dog.
".repeat(6);
+ assertEquals(StandardCharsets.UTF_16LE,
+ WideUnicodeDetector.detectEncoding(encode(text,
StandardCharsets.UTF_16LE)));
+ }
+
+ @Test
+ void utf16LeCjkDiverse() {
+ assertEquals(StandardCharsets.UTF_16LE,
+                WideUnicodeDetector.detectEncoding(encode(diverseCjk(32), StandardCharsets.UTF_16LE)));
+ }
+
+ @Test
+ void utf16LeCjkSmallSample() {
+ // 10 unique chars — variety ratio ~1.1×, below threshold.
+        // Block-prefix range check (odd bytes all in [0x4E,0x8B] ⊂ [0,0xD8)) saves it.
+        String cjk = "\u4E2D\u6587\u6D4B\u8BD5\u5185\u5BB9\u53EF\u4EE5\u68C0\u6D4B".repeat(6);
+        assertEquals(StandardCharsets.UTF_16LE,
+                WideUnicodeDetector.detectEncoding(encode(cjk, StandardCharsets.UTF_16LE)));
+ }
+
+ @Test
+ void utf16LeHangul() {
+        // Hangul Syllables U+AC00–U+D7A3: high bytes 0xAC–0xD7, above old 0xA0 threshold.
+        String hangul = "\uAC00\uAC01\uB098\uB2E4\uB77C\uB9C8\uBC14\uC0AC\uC544\uC790".repeat(6);
+        assertEquals(StandardCharsets.UTF_16LE,
+                WideUnicodeDetector.detectEncoding(encode(hangul, StandardCharsets.UTF_16LE)));
+ }
+
+ @Test
+ void utf16LeMixed() {
+ String mixed = ("Hello \u4E16\u754C! ").repeat(15);
+ assertEquals(StandardCharsets.UTF_16LE,
+                WideUnicodeDetector.detectEncoding(encode(mixed, StandardCharsets.UTF_16LE)));
+ }
+
+    // ── UTF-16 BE ────────────────────────────────────────────────────────────
+
+ @Test
+ void utf16BeLatinText() {
+ String text = "The quick brown fox jumps over the lazy dog.
".repeat(6);
+ assertEquals(StandardCharsets.UTF_16BE,
+ WideUnicodeDetector.detectEncoding(encode(text,
StandardCharsets.UTF_16BE)));
+ }
+
+ @Test
+ void utf16BeCjkDiverse() {
+ assertEquals(StandardCharsets.UTF_16BE,
+                WideUnicodeDetector.detectEncoding(encode(diverseCjk(32), StandardCharsets.UTF_16BE)));
+ }
+
+ @Test
+ void utf16BeCjkSmallSample() {
+        String cjk = "\u4E2D\u6587\u6D4B\u8BD5\u5185\u5BB9\u53EF\u4EE5\u68C0\u6D4B".repeat(6);
+        assertEquals(StandardCharsets.UTF_16BE,
+                WideUnicodeDetector.detectEncoding(encode(cjk, StandardCharsets.UTF_16BE)));
+ }
+
+ @Test
+ void utf16BeHangul() {
+        String hangul = "\uAC00\uAC01\uB098\uB2E4\uB77C\uB9C8\uBC14\uC0AC\uC544\uC790".repeat(6);
+        assertEquals(StandardCharsets.UTF_16BE,
+                WideUnicodeDetector.detectEncoding(encode(hangul, StandardCharsets.UTF_16BE)));
+ }
+
+ @Test
+ void utf16LeCjkWithAsciiSpaces() {
+        // Realistic CJK document: ~16% ASCII spaces between Chinese characters.
+        // null_odd ≈ 16% — below the Phase 1 threshold (25%) — so Phase 1 does
+        // not fire. The characters are chosen so that some low bytes are ≥ 0xD8
+ // (以=E5, 们=EC, 说=F4), making !evenInRange true and routing detection
+ // to Phase 3. The 0x00 high bytes from the spaces appear in the odd
+ // (block-prefix) column; nonNullAllAbove allows them because they are
+ // null — only non-null bytes below 0x20 (binary control tokens) are
+ // rejected. The old allAbove guard would have rejected this text.
+ String text = ("\u4ee5\u4eec\u8bf4\u4e2d\u6587 ").repeat(12);
+ assertEquals(StandardCharsets.UTF_16LE,
+                WideUnicodeDetector.detectEncoding(encode(text, StandardCharsets.UTF_16LE)));
+ }
+
+ @Test
+ void utf16BeCjkWithAsciiSpaces() {
+ String text = ("\u4ee5\u4eec\u8bf4\u4e2d\u6587 ").repeat(12);
+ assertEquals(StandardCharsets.UTF_16BE,
+                WideUnicodeDetector.detectEncoding(encode(text, StandardCharsets.UTF_16BE)));
+ }
+
+    // ── UTF-32 LE ────────────────────────────────────────────────────────────
+
+ @Test
+ void utf32LeLatinText() {
+ String text = "The quick brown fox jumps over the lazy dog.
".repeat(4);
+ assertEquals(UTF32LE, WideUnicodeDetector.detectEncoding(encode(text,
UTF32LE)));
+ }
+
+ @Test
+ void utf32LeCjkSmallSample() {
+        // Null-position check fires regardless of sample diversity: bytes 2,3 always 0x00 for BMP.
+        String cjk = "\u4E2D\u6587\u6D4B\u8BD5\u5185\u5BB9\u53EF\u4EE5\u68C0\u6D4B".repeat(4);
+        assertEquals(UTF32LE, WideUnicodeDetector.detectEncoding(encode(cjk, UTF32LE)));
+ }
+
+ @Test
+ void utf32LeMixed() {
+ assertEquals(UTF32LE,
+ WideUnicodeDetector.detectEncoding(encode(("Hello
\u4E16\u754C! ").repeat(8), UTF32LE)));
+ }
+
+    // ── UTF-32 BE ────────────────────────────────────────────────────────────
+
+ @Test
+ void utf32BeLatinText() {
+ String text = "The quick brown fox jumps over the lazy dog.
".repeat(4);
+ assertEquals(UTF32BE, WideUnicodeDetector.detectEncoding(encode(text,
UTF32BE)));
+ }
+
+ @Test
+ void utf32BeCjkSmallSample() {
+        // Bytes 0,1 always 0x00 for BMP — null-position check works with any sample size.
+        String cjk = "\u4E2D\u6587\u6D4B\u8BD5\u5185\u5BB9\u53EF\u4EE5\u68C0\u6D4B".repeat(4);
+        assertEquals(UTF32BE, WideUnicodeDetector.detectEncoding(encode(cjk, UTF32BE)));
+ }
+
+ @Test
+ void utf32BeMixed() {
+ assertEquals(UTF32BE,
+ WideUnicodeDetector.detectEncoding(encode(("Hello
\u4E16\u754C! ").repeat(8), UTF32BE)));
+ }
+
+    // ── Non-wide encodings return null ────────────────────────────────────────
+
+ @Test
+ void utf8LatinReturnsNull() {
+ String text = "The quick brown fox jumps over the lazy dog.
".repeat(6);
+ assertNull(WideUnicodeDetector.detectEncoding(encode(text,
StandardCharsets.UTF_8)));
+ }
+
+ @Test
+ void utf8CjkReturnsNull() {
+        assertNull(WideUnicodeDetector.detectEncoding(encode(diverseCjk(32), StandardCharsets.UTF_8)));
+ }
+
+ @Test
+ void iso88591ReturnsNull() {
+ byte[] bytes = new byte[512];
+ for (int i = 0; i < bytes.length; i++) bytes[i] = (byte) (i & 0xFF);
+ assertNull(WideUnicodeDetector.detectEncoding(bytes));
+ }
+
+ @Test
+ void tooShortReturnsNull() {
+ assertNull(WideUnicodeDetector.detectEncoding(new byte[]{0x41, 0x00}));
+ }
+
+ @Test
+ void nullInputReturnsNull() {
+ assertNull(WideUnicodeDetector.detectEncoding(null));
+ }
+
+    // ── Misleading BOM ───────────────────────────────────────────────────────
+
+ @Test
+ void misleadingUtf16LeBomWithUtf8Content() {
+        // UTF-16LE BOM prepended to UTF-8 body — structural check on body returns null.
+        byte[] combined = prepend(
+                new byte[]{(byte) 0xFF, (byte) 0xFE},
+                encode("The quick brown fox jumps over the lazy dog. ".repeat(6), StandardCharsets.UTF_8));
+        assertNull(WideUnicodeDetector.detectEncoding(WideUnicodeDetector.skipBom(combined)));
+ }
+
+    // ── Structural validity — UTF-32 ─────────────────────────────────────────
+
+ @Test
+ void utf32BeInvalidCodepointRuledOut() {
+        // Build bytes that look like UTF-32BE structurally (positions 0,1 zero)
+        // but contain codepoints above U+10FFFF (b1 = 0x11 > max plane 0x10).
+        // The data falls back to UTF-16BE because b0=0x00 at every group creates
+        // a 50% null rate in the even-byte column; UTF-32BE is still ruled out.
+ byte[] bytes = new byte[128];
+ for (int g = 0; g < 32; g++) {
+ bytes[g * 4] = 0x00;
+ bytes[g * 4 + 1] = 0x11; // > 0x10 → invalid codepoint
+ bytes[g * 4 + 2] = (byte) (0x20 + g);
+ bytes[g * 4 + 3] = (byte) (0x41 + g % 26);
+ }
+ assertNotEquals(Charset.forName("UTF-32BE"),
+ WideUnicodeDetector.detectEncoding(bytes),
+ "UTF-32BE with out-of-range codepoint should be ruled out");
+ }
+
+ @Test
+ void utf32BeSurrogateRuledOut() {
+ // UTF-32BE bytes with a surrogate codepoint (U+D800 = 00 00 D8 00).
+ byte[] bytes = new byte[128];
+ for (int g = 0; g < 32; g++) {
+ bytes[g * 4] = 0x00;
+ bytes[g * 4 + 1] = 0x00;
+ bytes[g * 4 + 2] = (byte) 0xD8; // surrogate range
+ bytes[g * 4 + 3] = (byte) (0x41 + g % 26);
+ }
+ assertNull(WideUnicodeDetector.detectEncoding(bytes),
+ "UTF-32BE with surrogate codepoint should be ruled out");
+ }
+
+ @Test
+ void utf32BeContentByteAllZeroRuledOut() {
+ // Positions 0,1 are zero (passes the 90% null check) and position 3
+ // is always 0x00 — only one distinct value, so countUnique < 2 rules
+ // out UTF-32BE before the content-nonzero threshold is even reached.
+ // The data falls back to UTF-16LE via the variety-ratio check.
+ byte[] bytes = new byte[128];
+ for (int g = 0; g < 32; g++) {
+ bytes[g * 4] = 0x00;
+ bytes[g * 4 + 1] = 0x00;
+ bytes[g * 4 + 2] = (byte) (g % 5); // some variety at pos 2
+ bytes[g * 4 + 3] = 0x00; // content byte always zero
+ }
+ assertNotEquals(Charset.forName("UTF-32BE"),
+ WideUnicodeDetector.detectEncoding(bytes),
+ "UTF-32BE with all-zero content bytes should be ruled out");
+ }
+
+ @Test
+ void utf32BeContentNonzeroThresholdRuledOut() {
+ // Positions 0,1 are zero (passes 90% null check) and position 3 has
+ // 2 distinct values (passes countUnique check), but only 6/32 = 18.75%
+ // non-zero — well below UTF32_CONTENT_NONZERO_MIN (0.80).
+ // The new content-nonzero check is what rules this out as UTF-32BE.
+ byte[] bytes = new byte[128];
+ for (int g = 0; g < 32; g++) {
+ bytes[g * 4] = 0x00;
+ bytes[g * 4 + 1] = 0x00;
+ bytes[g * 4 + 2] = 0x00;
+            bytes[g * 4 + 3] = (g < 6) ? (byte) 0x41 : 0x00; // only 6 non-zero → 18.75%
+ }
+ assertNotEquals(Charset.forName("UTF-32BE"),
+ WideUnicodeDetector.detectEncoding(bytes),
+ "UTF-32BE with content byte mostly zero should be ruled out by
nonzero threshold");
+ }
+
+    // ── Structural validity — UTF-16 ─────────────────────────────────────────
+
+ @Test
+ void utf16BeLoneLowSurrogateRuledOut() {
+        // UTF-16BE bytes where even positions contain lone low surrogates (0xDC–0xDF).
+        // These must appear only after a high surrogate — lone ones are invalid.
+        byte[] base = encode("The quick brown fox jumps over the lazy dog. ".repeat(3),
+ StandardCharsets.UTF_16BE);
+ // Corrupt: inject a lone low surrogate at even position 0
+ base[0] = (byte) 0xDC;
+ base[1] = 0x00;
+ assertNull(WideUnicodeDetector.detectEncoding(base),
+ "UTF-16BE with lone low surrogate should be ruled out");
+ }
+
+ @Test
+ void utf16LeLoneLowSurrogateRuledOut() {
+ byte[] base = encode("The quick brown fox jumps over the lazy dog.
".repeat(3),
+ StandardCharsets.UTF_16LE);
+ // Corrupt: inject a lone low surrogate at odd position 1
+ base[0] = 0x00;
+ base[1] = (byte) 0xDC;
+ assertNull(WideUnicodeDetector.detectEncoding(base),
+ "UTF-16LE with lone low surrogate should be ruled out");
+ }
+
+ @Test
+ void utf16ValidSurrogatePairAccepted() {
+ // U+1F600 (emoji 😀) encodes as surrogate pair in UTF-16LE:
+ // high surrogate U+D83D → 3D D8, low surrogate U+DE00 → 00 DE
+ // Embed it in otherwise Latin text so null-column check fires.
+ byte[] latin = encode("Hello world ".repeat(10),
StandardCharsets.UTF_16LE);
+ // Overwrite first 4 bytes with a valid surrogate pair
+ latin[0] = 0x3D;
+ latin[1] = (byte) 0xD8; // high surrogate
+ latin[2] = 0x00;
+ latin[3] = (byte) 0xDE; // low surrogate
+ assertEquals(StandardCharsets.UTF_16LE,
+ WideUnicodeDetector.detectEncoding(latin),
+ "Valid surrogate pair in UTF-16LE should still be detected");
+ }
+
+    // ── Stream-based detection (via EncodingDetector interface) ──────────────
+
+ @Test
+ void streamDetectsUtf16Le() throws Exception {
+ String text = "The quick brown fox jumps over the lazy dog.
".repeat(6);
+ assertEquals(StandardCharsets.UTF_16LE, detectViaStream(encode(text,
StandardCharsets.UTF_16LE)));
+ }
+
+ @Test
+ void streamDetectsUtf32Be() throws Exception {
+ String text = "The quick brown fox jumps. ".repeat(6);
+ assertEquals(UTF32BE, detectViaStream(encode(text, UTF32BE)));
+ }
+
+ @Test
+ void streamDetectsUtf16LeWithUtf8Bom() throws Exception {
+        // UTF-8 BOM prepended to UTF-16LE content — BOM stripped, then UTF-16LE detected.
+        byte[] combined = prepend(
+                new byte[]{(byte) 0xEF, (byte) 0xBB, (byte) 0xBF},
+                encode("The quick brown fox jumps over the lazy dog. ".repeat(6), StandardCharsets.UTF_16LE));
+ assertEquals(StandardCharsets.UTF_16LE, detectViaStream(combined));
+ }
+
+ @Test
+ void streamReturnsNullForUtf8() throws Exception {
+ String text = "The quick brown fox jumps over the lazy dog.
".repeat(6);
+ assertNull(detectViaStream(encode(text, StandardCharsets.UTF_8)));
+ }
+
+ @Test
+ void streamReturnsNullForTooShort() throws Exception {
+ assertNull(detectViaStream(new byte[]{0x41, 0x00, 0x42, 0x00}));
+ }
+}