This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4731-common-script in repository https://gitbox.apache.org/repos/asf/tika.git
commit 23cc3ab5274d4a21892f0e00b71b5771835ef34d Author: tallison <[email protected]> AuthorDate: Wed May 20 14:28:22 2026 -0400 TIKA-4731 - checkpoint before "common" refactoring --- .../tika/ml/chardetect/HtmlByteStripper.java | 29 ++++- .../ml/chardetect/MojibusterEncodingDetector.java | 25 ++--- .../tika/ml/chardetect/HtmlByteStripperTest.java | 54 ++++++++++ .../ml/junkdetect/JunkFilterEncodingDetector.java | 117 +++------------------ .../tika/ml/junkdetect/tools/TrainJunkModel.java | 14 ++- .../org/apache/tika/ml/junkdetect/junkdetect.bin | Bin 2901358 -> 2898974 bytes 6 files changed, 119 insertions(+), 120 deletions(-) diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java index 09768e0977..37ae5ad26b 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java @@ -159,6 +159,33 @@ public final class HtmlByteStripper { */ public static Result strip(byte[] src, int srcOffset, int srcLen, byte[] dst, int dstOffset) { + // Back-compat alias for charset-detection callers (Mojibuster, + // training tools): strips tags AND well-formed entities. + return stripTagsAndEntities(src, srcOffset, srcLen, dst, dstOffset); + } + + /** + * Strip tags <em>and</em> well-formed HTML entities. For <b>charset + * detection</b> (Mojibuster): entities are ASCII, charset-neutral, and + * bias the byte-bigram statistics, so they're removed. 
+ */ + public static Result stripTagsAndEntities(byte[] src, int srcOffset, int srcLen, + byte[] dst, int dstOffset) { + return strip(src, srcOffset, srcLen, dst, dstOffset, true); + } + + /** + * Strip tags only; entities pass through unchanged. For <b>junk + * detection</b>, which expands them in string space afterward (dropping + * them here would make that expansion a no-op). + */ + public static Result stripTags(byte[] src, int srcOffset, int srcLen, + byte[] dst, int dstOffset) { + return strip(src, srcOffset, srcLen, dst, dstOffset, false); + } + + private static Result strip(byte[] src, int srcOffset, int srcLen, + byte[] dst, int dstOffset, boolean dropEntities) { int w = dstOffset; int state = TEXT; int nameStart = 0; @@ -183,7 +210,7 @@ public final class HtmlByteStripper { case TEXT: if (b == '<') { state = LT; - } else if (b == '&') { + } else if (b == '&' && dropEntities) { state = ENTITY; entityStart = i; } else { diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java index 78dc9400ae..550b4fa946 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java @@ -24,7 +24,6 @@ import java.util.Collections; import java.util.List; import java.util.Locale; -import org.apache.commons.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -76,10 +75,12 @@ public class MojibusterEncodingDetector implements EncodingDetector { public static final String DEFAULT_MODEL_RESOURCE = "/org/apache/tika/ml/chardetect/nb-bigram.bin"; - // 16 KB matches the production read limit used by 
UniversalEncodingDetector - // and JunkFilterEncodingDetector; uniform probe size across the chain - // makes downstream candidate-pool arbitration consistent. - private static final int MAX_PROBE_BYTES = 16384; + // Probe sized by tag-stripped content (16 KB target), capped at 512 KB raw. + // Markup-heavy pages whose distinguishing bytes (esp. UTF-8 multi-byte + // sequences) sit past a fixed 16 KB raw window would otherwise starve the + // structural UTF-8 check and NB scoring. See AdaptiveProbe. + private static final int PROBE_CONTENT_TARGET = AdaptiveProbe.DEFAULT_CONTENT_TARGET; + private static final int PROBE_RAW_CAP = AdaptiveProbe.DEFAULT_RAW_CAP; /** * Minimum number of successfully-parsed well-formed tags required @@ -709,18 +710,6 @@ public class MojibusterEncodingDetector implements EncodingDetector { } private static byte[] readProbe(TikaInputStream tis) throws IOException { - tis.mark(MAX_PROBE_BYTES); - byte[] buf = new byte[MAX_PROBE_BYTES]; - try { - int n = IOUtils.read(tis, buf); - if (n < buf.length) { - byte[] trimmed = new byte[n]; - System.arraycopy(buf, 0, trimmed, 0, n); - return trimmed; - } - return buf; - } finally { - tis.reset(); - } + return AdaptiveProbe.read(tis, PROBE_CONTENT_TARGET, PROBE_RAW_CAP); } } diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/HtmlByteStripperTest.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/HtmlByteStripperTest.java index 28b027f47c..74aa691728 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/HtmlByteStripperTest.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/HtmlByteStripperTest.java @@ -17,7 +17,9 @@ package org.apache.tika.ml.chardetect; import static org.junit.jupiter.api.Assertions.assertEquals; +import static 
org.junit.jupiter.api.Assertions.assertTrue; +import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import org.junit.jupiter.api.Test; @@ -44,6 +46,58 @@ public class HtmlByteStripperTest { } } + /** Helper: tagCount when stripping the given bytes (tags+entities). */ + private static int tagCount(byte[] src) { + byte[] dst = new byte[src.length]; + return HtmlByteStripper.strip(src, 0, src.length, dst, 0).tagCount; + } + + @Test + public void multiByteUnicodeIsNotTagStripped() { + // The byte-level stripper must not mangle UTF-16/UTF-32: those bytes + // don't form single-byte ASCII tags, so tagCount stays 0 and callers + // (Mojibuster / JunkFilter) fall back to the raw bytes via their + // `tagCount > 0` gate. Regression guard for the "does the byte + // stripper botch wide Unicode?" question. + String html = "<html><head><title>商品</title></head>" + + "<body><p>这是中文测试 with markup</p></body></html>"; + // ASCII-compatible encodings: tags ARE recognized (and safely stripped). + assertTrue(tagCount(html.getBytes(StandardCharsets.UTF_8)) > 0, + "UTF-8 tags should be recognized"); + assertTrue(tagCount(html.getBytes(Charset.forName("GBK"))) > 0, + "GBK (ASCII-compatible) tags should be recognized"); + // Wide Unicode: no single-byte ASCII tags → tagCount 0 → strip not used. + assertEquals(0, tagCount(html.getBytes(Charset.forName("UTF-16LE"))), + "UTF-16LE must not register tags"); + assertEquals(0, tagCount(html.getBytes(Charset.forName("UTF-16BE"))), + "UTF-16BE must not register tags"); + assertEquals(0, tagCount(html.getBytes(Charset.forName("UTF-32LE"))), + "UTF-32LE must not register tags"); + assertEquals(0, tagCount(html.getBytes(Charset.forName("UTF-32BE"))), + "UTF-32BE must not register tags"); + } + + @Test + public void stripTagsPreservesEntitiesForJunkDetection() { + // JunkFilter path: tags removed, entities KEPT (expanded later in + // string space). Charset path (default strip) removes both. 
+ String in = "<p>Copyright &copy; 2024 caf&eacute;</p>"; + byte[] src = in.getBytes(StandardCharsets.US_ASCII); + byte[] dstA = new byte[src.length]; + byte[] dstB = new byte[src.length]; + HtmlByteStripper.Result tagsOnly = + HtmlByteStripper.stripTags(src, 0, src.length, dstA, 0); + HtmlByteStripper.Result both = + HtmlByteStripper.stripTagsAndEntities(src, 0, src.length, dstB, 0); + assertEquals("Copyright &copy; 2024 caf&eacute;", + new String(dstA, 0, tagsOnly.length, StandardCharsets.US_ASCII)); + assertEquals("Copyright 2024 caf", + new String(dstB, 0, both.length, StandardCharsets.US_ASCII)); + // tagsOnly does not count entities (it doesn't enter the entity path) + assertEquals(0, tagsOnly.entityCount); + assertEquals(2, both.entityCount); + } + @Test public void namedEntityIsStripped() { StripOutcome r = strip("hello &amp; world"); diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java index e4fbcfb4bb..6966b292ae 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java @@ -24,8 +24,6 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -36,7 +34,7 @@ import org.apache.tika.detect.EncodingResult; import org.apache.tika.detect.MetaEncodingDetector; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; -import org.apache.tika.ml.chardetect.HtmlByteStripper; +import org.apache.tika.ml.chardetect.AdaptiveProbe; import org.apache.tika.parser.ParseContext; import org.apache.tika.quality.TextQualityDetector; @@ -177,38 +175,18 @@ public class
JunkFilterEncodingDetector implements MetaEncodingDetector { } bytes = stripBomBytes(bytes); - // Strip HTML/XML markup before decoding so the quality score reflects - // body text, not whitespace and tags. Falls back to the raw probe - // when no well-formed tags are detected. - byte[] forDecode = bytes; - byte[] stripDst = new byte[bytes.length]; - HtmlByteStripper.Result stripped = - HtmlByteStripper.strip(bytes, 0, bytes.length, stripDst, 0); - boolean stripUsed = stripped.tagCount > 0 && stripped.length > 0; - LOG.trace("junk-filter strip: input={}B tagCount={} stripped={}B used={}", - bytes.length, stripped.tagCount, stripped.length, stripUsed); - if (stripUsed) { - forDecode = new byte[stripped.length]; - System.arraycopy(stripDst, 0, forDecode, 0, stripped.length); - } - - // Decode probe under each candidate, preserving insertion order so - // tournament seeding is deterministic. - // - // Each decoded string is then run through HTML entity expansion. - // For entity-encoded HTML (numeric refs like &#3405;), this is - // load-bearing: entity refs are ASCII bytes that decode identically - // under every candidate charset, so they don't differentiate. - // After expansion they become real codepoints — and crucially, in - // the *wrong* decoding (e.g. mojibake-as-HAN), they introduce - // cross-script transitions (HAN ↔ MALAYALAM mid-document) that the - // quality detector's script-transition feature correctly penalises. - // See `20260512-junkdetector-codepoint-hash-plan.md` (AIT5 case). + // Decode each candidate, then HtmlContentCleaner.clean — the same + // tag-strip + entity-expand TrainJunkModel applies, so train and + // inference match. Entity expansion is load-bearing: numeric refs + // become codepoints whose cross-script transitions expose mojibake + // under a wrong decoding (AIT5 case).
Map<Charset, String> candidates = new LinkedHashMap<>(); for (Charset cs : uniqueCharsets) { - String decoded = safeDecode(forDecode, cs); + String decoded = safeDecode(bytes, cs); + if (decoded != null && !decoded.isEmpty()) { + decoded = HtmlContentCleaner.clean(decoded); + } if (decoded != null && !decoded.isEmpty()) { - decoded = expandHtmlEntities(decoded); candidates.put(cs, decoded); if (LOG.isTraceEnabled()) { int sampleLen = Math.min(400, decoded.length()); @@ -330,27 +308,9 @@ public class JunkFilterEncodingDetector implements MetaEncodingDetector { } private byte[] readProbe(TikaInputStream tis) throws IOException { - try { - tis.mark(readLimit); - byte[] buf = new byte[readLimit]; - int total = 0; - int read; - while (total < readLimit - && (read = tis.read(buf, total, readLimit - total)) != -1) { - total += read; - } - if (total == 0) { - return null; - } - if (total < readLimit) { - byte[] trimmed = new byte[total]; - System.arraycopy(buf, 0, trimmed, 0, total); - return trimmed; - } - return buf; - } finally { - tis.reset(); - } + // readLimit is the tag-stripped content target; cap raw reads at 512 KB. + byte[] probe = AdaptiveProbe.read(tis, readLimit, AdaptiveProbe.DEFAULT_RAW_CAP); + return probe.length == 0 ? null : probe; } private static String safeDecode(byte[] bytes, Charset charset) { @@ -373,56 +333,13 @@ public class JunkFilterEncodingDetector implements MetaEncodingDetector { // intermixed with raw UTF-8 codepoints. // ----------------------------------------------------------------------- - private static final Pattern ENTITY_DEC = Pattern.compile("&#(\\d{1,7});"); - private static final Pattern ENTITY_HEX = Pattern.compile("&#[xX]([0-9a-fA-F]{1,6});"); - private static final Pattern ENTITY_NAMED = - Pattern.compile("&(amp|lt|gt|quot|apos|nbsp|copy|reg);"); - /** - * Expands HTML numeric and a small set of named entity references in - * {@code s}. Malformed or out-of-range entities pass through unchanged. 
- * The named-entity set is intentionally small — only the universally- - * declared HTML5 entities that don't depend on a DOCTYPE. Anything more - * exotic stays as a literal entity reference (which scores as ASCII noise, - * the same as it would have before). + * Delegates to {@link HtmlContentCleaner#expandHtmlEntities} — the single + * implementation shared with training. Retained here as the historical + * entry point used by tests and diagnostics. */ static String expandHtmlEntities(String s) { - s = ENTITY_DEC.matcher(s).replaceAll(mr -> { - try { - int cp = Integer.parseInt(mr.group(1)); - if (cp >= 0 && cp <= 0x10FFFF) { - return Matcher.quoteReplacement(new String(Character.toChars(cp))); - } - } catch (NumberFormatException ignored) { - // overflow — fall through, leave entity literal - } - return Matcher.quoteReplacement(mr.group()); - }); - s = ENTITY_HEX.matcher(s).replaceAll(mr -> { - try { - int cp = Integer.parseInt(mr.group(1), 16); - if (cp >= 0 && cp <= 0x10FFFF) { - return Matcher.quoteReplacement(new String(Character.toChars(cp))); - } - } catch (NumberFormatException ignored) { - // overflow — fall through, leave entity literal - } - return Matcher.quoteReplacement(mr.group()); - }); - s = ENTITY_NAMED.matcher(s).replaceAll(mr -> { - switch (mr.group(1)) { - case "amp": return "&"; - case "lt": return "<"; - case "gt": return ">"; - case "quot": return "\""; - case "apos": return "'"; - case "nbsp": return " "; - case "copy": return "©"; - case "reg": return "®"; - default: return Matcher.quoteReplacement(mr.group()); - } - }); - return s; + return HtmlContentCleaner.expandHtmlEntities(s); } /** diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java index b52e185eff..77517a4e3b 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java +++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java @@ -38,6 +38,7 @@ import java.util.TreeMap; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; +import org.apache.tika.ml.junkdetect.HtmlContentCleaner; import org.apache.tika.ml.junkdetect.JunkDetector; import org.apache.tika.ml.junkdetect.V7Tables; @@ -1820,11 +1821,22 @@ public class TrainJunkModel { return new float[]{(float) mu, (float) sigma}; } + /** + * Opens a gzipped train/dev file, applying {@link HtmlContentCleaner#clean} + * to every line — the same cleaning {@code JunkFilterEncodingDetector} does + * at inference, so train and inference match. No-op on clean corpus lines. + */ static BufferedReader openGzipped(Path path) throws IOException { return new BufferedReader( new InputStreamReader( new GZIPInputStream(Files.newInputStream(path)), - StandardCharsets.UTF_8)); + StandardCharsets.UTF_8)) { + @Override + public String readLine() throws IOException { + String l = super.readLine(); + return l == null ? null : HtmlContentCleaner.clean(l); + } + }; } /** diff --git a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin index af491ba162..c09c38cdb4 100644 Binary files a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin and b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin differ
