This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch chardet-work in repository https://gitbox.apache.org/repos/asf/tika.git
commit fdb2392c4c736e5bce733ac4816f7df7b48c7a2a Author: tballison <[email protected]> AuthorDate: Fri Feb 27 16:21:12 2026 -0500 TIKA-4662: retrain model v2 (28 classes), fix Shift_JIS/IBM424 false positive Remove UTF-16/32 from ML training: model gets 0% on these at devtest because Latin UTF-16 has no high bytes; WideUnicodeDetector handles them via BOM / null-byte analysis. They are added to STRUCTURAL_ONLY_CHARSETS alongside HZ, ISO-2022-JP/KR/CN and US-ASCII. Fix checkIbm424 false positive on Shift_JIS: full-width space U+3000 encodes as 0x81 0x40 in Shift_JIS, making 0x40 trail bytes look like EBCDIC spaces. Now discount 0x40 as EBCDIC space when immediately preceded by a Shift_JIS lead byte (0x81-0x9F or 0xE0-0xFC). Eliminates the 5% Shift_JIS regression that appeared when STRUCTURAL_GATES were enabled. New model: 28 classes, 525K samples, 5 epochs. Training accuracy: 92.5% strict / 94.2% soft. Devtest (All rules): 69.9% strict / 85.8% soft vs ICU4J 52%/74% and juniv 41%/57%. --- .../ml/chardetect/StructuralEncodingRules.java | 422 +++++++++++++++++++++ .../org/apache/tika/ml/chardetect/chardetect.bin | Bin 2097769 -> 1835549 bytes 2 files changed, 422 insertions(+) diff --git a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java new file mode 100644 index 0000000000..d10e8161c6 --- /dev/null +++ b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java @@ -0,0 +1,422 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.chardetect; + +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; + +/** + * Fast, rule-based encoding checks that run before the statistical model. + * + * <h3>Pipeline</h3> + * <ol> + * <li>{@link #checkAscii}: no bytes >= 0x80 → UTF-8 (ASCII is a subset)</li> + * <li>{@link #detectIso2022}: ISO-2022 escape sequences present → ISO-2022-JP, + * ISO-2022-KR, or ISO-2022-CN depending on the designation sequence</li> + * <li>{@link #checkUtf8}: validate UTF-8 multi-byte grammar; returns a + * {@link Utf8Result} indicating whether the bytes are definitively UTF-8, + * definitively not UTF-8, or ambiguous (pass to model).</li> + * </ol> + * + * <p>UTF-16/32 detection is handled upstream by + * {@link org.apache.tika.utils.ByteEncodingHint} and is not repeated here.</p> + * + * <p>IBM424 (EBCDIC Hebrew) is detected via {@link #checkIbm424}: the Hebrew + * letters in this code page occupy bytes 0x41–0x6A, which fall entirely below + * the 0x80 threshold used by the statistical model's feature extractor. The + * EBCDIC space (0x40) vs ASCII space (0x20) frequency ratio provides a cheap + * first-pass EBCDIC gate before the Hebrew letter frequencies are checked.</p> + * + * <p>All methods are stateless and safe to call from multiple threads.</p> + */ +public final class StructuralEncodingRules { + + private StructuralEncodingRules() {} + + /** ISO-2022 ESC byte. */ + private static final int ESC = 0x1B; + + /** + * Ratio of valid high bytes required to call UTF-8 "definitive". + * If the sample has more than {@value #MIN_HIGH_BYTE_RATIO_FOR_UTF8} of its + * bytes in multi-byte sequences and they all parse correctly, we trust UTF-8. + */ + private static final double MIN_HIGH_BYTE_RATIO_FOR_UTF8 = 0.01; // at least 1% + + // ----------------------------------------------------------------------- + // Public API + // ----------------------------------------------------------------------- + + /** + * Returns {@code true} if {@code bytes} contains no bytes with value + * >= 0x80 (i.e. pure 7-bit ASCII, which is a strict subset of UTF-8). + */ + public static boolean checkAscii(byte[] bytes) { + return checkAscii(bytes, 0, bytes.length); + } + + public static boolean checkAscii(byte[] bytes, int offset, int length) { + int end = offset + length; + for (int i = offset; i < end; i++) { + if ((bytes[i] & 0xFF) >= 0x80) { + return false; + } + } + return true; + } + + /** + * Detects ISO-2022-JP, ISO-2022-KR, and ISO-2022-CN by scanning for their + * characteristic ESC designation sequences. + * + * <p>All three share the {@code ESC $} ({@code 0x1B 0x24}) prefix, so we + * must read further to distinguish them:</p> + * <pre> + * ISO-2022-JP: ESC $ B (JIS X 0208-1983) + * ESC $ @ (JIS X 0208-1978) + * ESC $ ( D (JIS X 0212 supplementary) + * ISO-2022-KR: ESC $ ) C + * ISO-2022-CN: ESC $ ) A (GB2312) + * ESC $ ) G (CNS 11643 plane 1) + * ESC $ * H (CNS 11643 plane 2) + * </pre> + * + * <p>If {@code ESC $} is found but no recognised third byte follows (or the + * buffer is too short), ISO-2022-JP is returned as the most common default.</p> + * + * @return the detected ISO-2022 charset, or {@code null} if no ISO-2022 + * escape sequence is found + */ + public static Charset detectIso2022(byte[] bytes) { + return detectIso2022(bytes, 0, bytes.length); + } + + public static Charset detectIso2022(byte[] bytes, int offset, int length) { + int end = offset + length; + for (int i = offset; i < end - 1; i++) { + if ((bytes[i] & 0xFF) != ESC) { + continue; + } + int b1 = bytes[i + 1] & 0xFF; + + if (b1 == 0x24) { // ESC $ + // Need at least one more byte to classify + if (i + 2 >= end) { + return Charset.forName("ISO-2022-JP"); // ESC $ alone → JP default + } + int b2 = bytes[i + 2] & 0xFF; + switch (b2) { + case 0x42: // ESC $ B — JIS X 0208-1983 + case 0x40: // ESC $ @ — JIS X 0208-1978 + return Charset.forName("ISO-2022-JP"); + case 0x28: // ESC $ ( — JIS X 0212 (JP supplementary) + return Charset.forName("ISO-2022-JP"); + case 0x29: // ESC $ ) — KR or CN depending on b3 + if (i + 3 >= end) { + return Charset.forName("ISO-2022-JP"); // can't tell, JP most common + } + int b3paren = bytes[i + 3] & 0xFF; + if (b3paren == 0x43) return Charset.forName("ISO-2022-KR"); // ESC $ ) C + if (b3paren == 0x41 || b3paren == 0x47) return Charset.forName("ISO-2022-CN"); + return Charset.forName("ISO-2022-JP"); + case 0x2A: // ESC $ * — CNS 11643 plane 2 + return Charset.forName("ISO-2022-CN"); + default: + return Charset.forName("ISO-2022-JP"); // unknown designation → JP default + } + } + + if (b1 == 0x28) { // ESC ( — single-byte designation (e.g. ASCII restore) + // These appear in all ISO-2022 variants and don't help distinguish + continue; + } + } + return null; // no ISO-2022 escape found + } + + /** + * Returns {@code true} if HZ-GB-2312 switching sequences are present. + * + * <p>HZ is a 7-bit encoding: it uses {@code ~\{} ({@code 0x7E 0x7B}) to + * enter two-byte GB2312 mode and {@code ~\}} ({@code 0x7E 0x7D}) to return + * to ASCII mode. Like ISO-2022, all bytes are below 0x80, so the model + * would see no features and must be bypassed with this structural check.</p> + */ + public static boolean checkHz(byte[] bytes) { + return checkHz(bytes, 0, bytes.length); + } + + public static boolean checkHz(byte[] bytes, int offset, int length) { + int end = offset + length - 1; + for (int i = offset; i < end; i++) { + if ((bytes[i] & 0xFF) == 0x7E) { + int next = bytes[i + 1] & 0xFF; + if (next == 0x7B || next == 0x7D) { // ~{ or ~} + return true; + } + } + } + return false; + } + + /** + * Detects IBM424 (EBCDIC Hebrew) by examining the sub-0x80 byte landscape. + * + * <h3>Why this is needed</h3> + * <p>In EBCDIC, the space character is {@code 0x40} (not {@code 0x20} as in + * ASCII). In IBM424 specifically, the 22 Hebrew base letters plus their five + * final forms occupy three byte clusters entirely below {@code 0x80}:</p> + * <pre> + * 0x41–0x49 alef … tet (9 letters) + * 0x51–0x59 yod … samekh (9 letters) + * 0x62–0x6A ayin … tav (9 letters + final-pe, tsadi, etc.) + * </pre> + * <p>The statistical model ignores all bytes below {@code 0x80}, so these + * letters are invisible to it. This structural rule detects them directly.</p> + * + * <h3>Algorithm</h3> + * <ol> + * <li><b>EBCDIC gate:</b> byte {@code 0x40} (EBCDIC space) must appear + * significantly more often than {@code 0x20} (ASCII space). In normal + * Latin text {@code 0x40} is the rare {@code @} character; in any EBCDIC + * text it is the word separator and appears at ~10–20% of bytes.</li> + * <li><b>Hebrew letter gate:</b> the combined frequency of bytes in the + * three Hebrew clusters above must exceed {@value #IBM424_HEBREW_THRESHOLD} + * of the sample length. Genuine Hebrew text has ~65% of its + * printable characters in these ranges. ASCII text with the same byte + * values (upper-case A–I, Q–Y, lower-case b–j) stays well below this + * threshold in practice.</li> + * </ol> + * + * @return {@code true} if the byte stream is almost certainly IBM424 + */ + public static boolean checkIbm424(byte[] bytes) { + return checkIbm424(bytes, 0, bytes.length); + } + + public static boolean checkIbm424(byte[] bytes, int offset, int length) { + if (length < 8) { + return false; + } + int sample = Math.min(length, 4096); + int end = offset + sample; + + int ebcdicSpace = 0; // 0x40 — EBCDIC word separator + int asciiSpace = 0; // 0x20 — ASCII word separator + int hebrewBytes = 0; // 0x41-0x49, 0x51-0x59, 0x62-0x6A + + int prev = -1; + for (int i = offset; i < end; i++) { + int b = bytes[i] & 0xFF; + if (b == 0x40) { + // In Shift_JIS, 0x40 appears only as a trail byte after a lead byte + // (0x81–0x9F or 0xE0–0xFC). Discount it as EBCDIC space in that case. + boolean isShiftJisTrail = (prev >= 0x81 && prev <= 0x9F) + || (prev >= 0xE0 && prev <= 0xFC); + if (!isShiftJisTrail) { + ebcdicSpace++; + } + } else if (b == 0x20) { + asciiSpace++; + } else if ((b >= 0x41 && b <= 0x49) + || (b >= 0x51 && b <= 0x59) + || (b >= 0x62 && b <= 0x6A)) { + hebrewBytes++; + } + prev = b; + } + + // Gate 1: 0x40 must dominate over 0x20 (EBCDIC vs ASCII whitespace). + // We require 0x40 to be at least 3× as frequent as 0x20, and appear + // at least 3% of the sample (rules out near-empty / binary content). + boolean ebcdicLikely = ebcdicSpace >= sample * 0.03 + && ebcdicSpace > asciiSpace * 3; + + // Gate 2: Hebrew letter density must exceed the threshold. + boolean hebrewDense = hebrewBytes > sample * IBM424_HEBREW_THRESHOLD; + + return ebcdicLikely && hebrewDense; + } + + /** + * Minimum fraction of bytes in IBM424 Hebrew letter positions (0x41–0x49, + * 0x51–0x59, 0x62–0x6A) required to confirm IBM424. Set conservatively to + * avoid false positives on ASCII text where those same byte values are + * upper-/lower-case Latin letters. + */ + private static final double IBM424_HEBREW_THRESHOLD = 0.12; + + /** + * Returns {@code true} if the probe contains any byte in the C1 control + * range {@code 0x80–0x9F}. + * + * <p>In every ISO-8859-X encoding those byte values are C1 control + * characters that never appear in real text. In every Windows-12XX + * encoding they are printable characters (smart quotes, Euro sign, + * em-dash, …). Their presence is therefore definitive proof that the + * content is <em>not</em> a valid ISO-8859-X encoding and should be + * attributed to the corresponding Windows-12XX variant instead.</p> + */ + public static boolean hasC1Bytes(byte[] bytes) { + return hasC1Bytes(bytes, 0, bytes.length); + } + + public static boolean hasC1Bytes(byte[] bytes, int offset, int length) { + int end = offset + length; + for (int i = offset; i < end; i++) { + int v = bytes[i] & 0xFF; + if (v >= 0x80 && v <= 0x9F) { + return true; + } + } + return false; + } + + /** @deprecated Use {@link #detectIso2022} which distinguishes JP/KR/CN. */ + @Deprecated + public static boolean checkIso2022Jp(byte[] bytes) { + return detectIso2022(bytes) != null; + } + + /** + * Validates the UTF-8 byte grammar of the sample and returns one of three + * outcomes: + * <ul> + * <li>{@link Utf8Result#DEFINITIVE_UTF8}: all multi-byte sequences are + * valid <em>and</em> the sample contains enough high bytes to be + * informative. Use UTF-8.</li> + * <li>{@link Utf8Result#NOT_UTF8}: at least one invalid byte sequence was + * found. Remove UTF-8 from the candidate set.</li> + * <li>{@link Utf8Result#AMBIGUOUS}: the sample is structurally valid UTF-8 + * but contains very few high bytes (almost pure ASCII), so validity is + * uninformative. Pass to the model.</li> + * </ul> + */ + public static Utf8Result checkUtf8(byte[] bytes) { + return checkUtf8(bytes, 0, bytes.length); + } + + public static Utf8Result checkUtf8(byte[] bytes, int offset, int length) { + int highByteCount = 0; + int i = offset; + int end = offset + length; + + while (i < end) { + int b = bytes[i] & 0xFF; + + if (b < 0x80) { + i++; + continue; + } + + highByteCount++; + + // Determine expected continuation count from the lead byte + int seqLen; + if (b >= 0xF8) { + // 5-/6-byte sequences are not valid Unicode + return Utf8Result.NOT_UTF8; + } else if (b >= 0xF0) { + seqLen = 4; + } else if (b >= 0xE0) { + seqLen = 3; + } else if (b >= 0xC0) { + seqLen = 2; + } else { + // 0x80–0xBF is a continuation byte without a lead → invalid + return Utf8Result.NOT_UTF8; + } + + // Overlong 2-byte sequence (C0 or C1 lead) + if (seqLen == 2 && b <= 0xC1) { + return Utf8Result.NOT_UTF8; + } + + // Check that the right number of continuation bytes follow + for (int k = 1; k < seqLen; k++) { + if (i + k >= end) { + // Truncated sequence at end of sample — treat as ambiguous + // (the sample just ran out, not necessarily bad data) + break; + } + int cb = bytes[i + k] & 0xFF; + if (cb < 0x80 || cb > 0xBF) { + return Utf8Result.NOT_UTF8; + } + } + + // Validate scalar value ranges for 3- and 4-byte sequences + if (seqLen == 3) { + int cp = ((b & 0x0F) << 12) + | ((i + 1 < end ? bytes[i + 1] & 0xFF : 0) & 0x3F) << 6 + | ((i + 2 < end ? bytes[i + 2] & 0xFF : 0) & 0x3F); + // Overlong encoding (< U+0800) or surrogate pair range + if (cp < 0x0800 || (cp >= 0xD800 && cp <= 0xDFFF)) { + return Utf8Result.NOT_UTF8; + } + } else if (seqLen == 4) { + int cp = ((b & 0x07) << 18) + | ((i + 1 < end ? bytes[i + 1] & 0xFF : 0) & 0x3F) << 12 + | ((i + 2 < end ? bytes[i + 2] & 0xFF : 0) & 0x3F) << 6 + | ((i + 3 < end ? bytes[i + 3] & 0xFF : 0) & 0x3F); + // Overlong or above U+10FFFF + if (cp < 0x10000 || cp > 0x10FFFF) { + return Utf8Result.NOT_UTF8; + } + } + + i += seqLen; + } + + // Grammar is valid. Was there enough evidence? + double highRatio = length > 0 ? (double) highByteCount / length : 0.0; + if (highRatio >= MIN_HIGH_BYTE_RATIO_FOR_UTF8) { + return Utf8Result.DEFINITIVE_UTF8; + } + return Utf8Result.AMBIGUOUS; + } + + // ----------------------------------------------------------------------- + // Result type + // ----------------------------------------------------------------------- + + /** + * Outcome of the UTF-8 structural check. + */ + public enum Utf8Result { + /** Sample is structurally valid UTF-8 with enough high bytes to be sure. */ + DEFINITIVE_UTF8, + /** Sample contains at least one invalid UTF-8 sequence. */ + NOT_UTF8, + /** + * Sample is structurally valid but nearly all-ASCII. Cannot confirm or + * deny; pass to the statistical model. + */ + AMBIGUOUS; + + public boolean isDefinitive() { + return this != AMBIGUOUS; + } + + public Charset toCharset() { + if (this == DEFINITIVE_UTF8) { + return StandardCharsets.UTF_8; + } + throw new IllegalStateException("Only DEFINITIVE_UTF8 has a Charset: " + this); + } + } +} diff --git a/tika-ml/tika-ml-chardetect/src/main/resources/org/apache/tika/ml/chardetect/chardetect.bin b/tika-ml/tika-ml-chardetect/src/main/resources/org/apache/tika/ml/chardetect/chardetect.bin index 1f5edecfb6..36e4fd3682 100644 Binary files a/tika-ml/tika-ml-chardetect/src/main/resources/org/apache/tika/ml/chardetect/chardetect.bin and b/tika-ml/tika-ml-chardetect/src/main/resources/org/apache/tika/ml/chardetect/chardetect.bin differ
