(tika) 03/09: TIKA-4731 - checkpoint before "common" refactoring

tallison Tue, 26 May 2026 12:18:51 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch TIKA-4731-common-script
in repository https://gitbox.apache.org/repos/asf/tika.git


commit 23cc3ab5274d4a21892f0e00b71b5771835ef34d
Author: tallison <[email protected]>
AuthorDate: Wed May 20 14:28:22 2026 -0400

    TIKA-4731 - checkpoint before "common" refactoring
---
 .../tika/ml/chardetect/HtmlByteStripper.java       |  29 ++++-
 .../ml/chardetect/MojibusterEncodingDetector.java  |  25 ++---
 .../tika/ml/chardetect/HtmlByteStripperTest.java   |  54 ++++++++++
 .../ml/junkdetect/JunkFilterEncodingDetector.java  | 117 +++------------------
 .../tika/ml/junkdetect/tools/TrainJunkModel.java   |  14 ++-
 .../org/apache/tika/ml/junkdetect/junkdetect.bin   | Bin 2901358 -> 2898974 bytes
 6 files changed, 119 insertions(+), 120 deletions(-)

diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java
index 09768e0977..37ae5ad26b 100644
--- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java
@@ -159,6 +159,33 @@ public final class HtmlByteStripper {
      */
     public static Result strip(byte[] src, int srcOffset, int srcLen,
                      byte[] dst, int dstOffset) {
+        // Back-compat alias for charset-detection callers (Mojibuster,
+        // training tools): strips tags AND well-formed entities.
+        return stripTagsAndEntities(src, srcOffset, srcLen, dst, dstOffset);
+    }
+
+    /**
+     * Strip tags <em>and</em> well-formed HTML entities.  For <b>charset
+     * detection</b> (Mojibuster): entities are ASCII, charset-neutral, and
+     * bias the byte-bigram statistics, so they're removed.
+     */
+    public static Result stripTagsAndEntities(byte[] src, int srcOffset, int 
srcLen,
+                     byte[] dst, int dstOffset) {
+        return strip(src, srcOffset, srcLen, dst, dstOffset, true);
+    }
+
+    /**
+     * Strip tags only; entities pass through unchanged.  For <b>junk
+     * detection</b>, which expands them in string space afterward (dropping
+     * them here would make that expansion a no-op).
+     */
+    public static Result stripTags(byte[] src, int srcOffset, int srcLen,
+                     byte[] dst, int dstOffset) {
+        return strip(src, srcOffset, srcLen, dst, dstOffset, false);
+    }
+
+    private static Result strip(byte[] src, int srcOffset, int srcLen,
+                     byte[] dst, int dstOffset, boolean dropEntities) {
         int w = dstOffset;
         int state = TEXT;
         int nameStart = 0;
@@ -183,7 +210,7 @@ public final class HtmlByteStripper {
                 case TEXT:
                     if (b == '<') {
                         state = LT;
-                    } else if (b == '&') {
+                    } else if (b == '&' && dropEntities) {
                         state = ENTITY;
                         entityStart = i;
                     } else {
diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
index 78dc9400ae..550b4fa946 100644
--- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
@@ -24,7 +24,6 @@ import java.util.Collections;
 import java.util.List;
 import java.util.Locale;
 
-import org.apache.commons.io.IOUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -76,10 +75,12 @@ public class MojibusterEncodingDetector implements EncodingDetector {
     public static final String DEFAULT_MODEL_RESOURCE =
             "/org/apache/tika/ml/chardetect/nb-bigram.bin";
 
-    // 16 KB matches the production read limit used by UniversalEncodingDetector
-    // and JunkFilterEncodingDetector; uniform probe size across the chain
-    // makes downstream candidate-pool arbitration consistent.
-    private static final int MAX_PROBE_BYTES = 16384;
+    // Probe sized by tag-stripped content (16 KB target), capped at 512 KB raw.
+    // Markup-heavy pages whose distinguishing bytes (esp. UTF-8 multi-byte
+    // sequences) sit past a fixed 16 KB raw window would otherwise starve the
+    // structural UTF-8 check and NB scoring. See AdaptiveProbe.
+    private static final int PROBE_CONTENT_TARGET = 
AdaptiveProbe.DEFAULT_CONTENT_TARGET;
+    private static final int PROBE_RAW_CAP = AdaptiveProbe.DEFAULT_RAW_CAP;
 
     /**
      * Minimum number of successfully-parsed well-formed tags required
@@ -709,18 +710,6 @@ public class MojibusterEncodingDetector implements EncodingDetector {
     }
 
     private static byte[] readProbe(TikaInputStream tis) throws IOException {
-        tis.mark(MAX_PROBE_BYTES);
-        byte[] buf = new byte[MAX_PROBE_BYTES];
-        try {
-            int n = IOUtils.read(tis, buf);
-            if (n < buf.length) {
-                byte[] trimmed = new byte[n];
-                System.arraycopy(buf, 0, trimmed, 0, n);
-                return trimmed;
-            }
-            return buf;
-        } finally {
-            tis.reset();
-        }
+        return AdaptiveProbe.read(tis, PROBE_CONTENT_TARGET, PROBE_RAW_CAP);
     }
 }
diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/HtmlByteStripperTest.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/HtmlByteStripperTest.java
index 28b027f47c..74aa691728 100644
--- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/HtmlByteStripperTest.java
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/HtmlByteStripperTest.java
@@ -17,7 +17,9 @@
 package org.apache.tika.ml.chardetect;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
 
+import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 
 import org.junit.jupiter.api.Test;
@@ -44,6 +46,58 @@ public class HtmlByteStripperTest {
         }
     }
 
+    /** Helper: tagCount when stripping the given bytes (tags+entities). */
+    private static int tagCount(byte[] src) {
+        byte[] dst = new byte[src.length];
+        return HtmlByteStripper.strip(src, 0, src.length, dst, 0).tagCount;
+    }
+
+    @Test
+    public void multiByteUnicodeIsNotTagStripped() {
+        // The byte-level stripper must not mangle UTF-16/UTF-32: those bytes
+        // don't form single-byte ASCII tags, so tagCount stays 0 and callers
+        // (Mojibuster / JunkFilter) fall back to the raw bytes via their
+        // `tagCount > 0` gate.  Regression guard for the "does the byte
+        // stripper botch wide Unicode?" question.
+        String html = "<html><head><title>商品</title></head>"
+                + "<body><p>这是中文测试 with markup</p></body></html>";
+        // ASCII-compatible encodings: tags ARE recognized (and safely stripped).
+        assertTrue(tagCount(html.getBytes(StandardCharsets.UTF_8)) > 0,
+                "UTF-8 tags should be recognized");
+        assertTrue(tagCount(html.getBytes(Charset.forName("GBK"))) > 0,
+                "GBK (ASCII-compatible) tags should be recognized");
+        // Wide Unicode: no single-byte ASCII tags → tagCount 0 → strip not used.
+        assertEquals(0, tagCount(html.getBytes(Charset.forName("UTF-16LE"))),
+                "UTF-16LE must not register tags");
+        assertEquals(0, tagCount(html.getBytes(Charset.forName("UTF-16BE"))),
+                "UTF-16BE must not register tags");
+        assertEquals(0, tagCount(html.getBytes(Charset.forName("UTF-32LE"))),
+                "UTF-32LE must not register tags");
+        assertEquals(0, tagCount(html.getBytes(Charset.forName("UTF-32BE"))),
+                "UTF-32BE must not register tags");
+    }
+
+    @Test
+    public void stripTagsPreservesEntitiesForJunkDetection() {
+        // JunkFilter path: tags removed, entities KEPT (expanded later in
+        // string space).  Charset path (default strip) removes both.
+        String in = "<p>Copyright &#169; 2024 caf&eacute;</p>";
+        byte[] src = in.getBytes(StandardCharsets.US_ASCII);
+        byte[] dstA = new byte[src.length];
+        byte[] dstB = new byte[src.length];
+        HtmlByteStripper.Result tagsOnly =
+                HtmlByteStripper.stripTags(src, 0, src.length, dstA, 0);
+        HtmlByteStripper.Result both =
+                HtmlByteStripper.stripTagsAndEntities(src, 0, src.length, 
dstB, 0);
+        assertEquals("Copyright &#169; 2024 caf&eacute;",
+                new String(dstA, 0, tagsOnly.length, 
StandardCharsets.US_ASCII));
+        assertEquals("Copyright  2024 caf",
+                new String(dstB, 0, both.length, StandardCharsets.US_ASCII));
+        // tagsOnly does not count entities (it doesn't enter the entity path)
+        assertEquals(0, tagsOnly.entityCount);
+        assertEquals(2, both.entityCount);
+    }
+
     @Test
     public void namedEntityIsStripped() {
         StripOutcome r = strip("hello &amp; world");
diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
index e4fbcfb4bb..6966b292ae 100644
--- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
+++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
@@ -24,8 +24,6 @@ import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -36,7 +34,7 @@ import org.apache.tika.detect.EncodingResult;
 import org.apache.tika.detect.MetaEncodingDetector;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.ml.chardetect.HtmlByteStripper;
+import org.apache.tika.ml.chardetect.AdaptiveProbe;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.quality.TextQualityDetector;
 
@@ -177,38 +175,18 @@ public class JunkFilterEncodingDetector implements MetaEncodingDetector {
         }
         bytes = stripBomBytes(bytes);
 
-        // Strip HTML/XML markup before decoding so the quality score reflects
-        // body text, not whitespace and tags.  Falls back to the raw probe
-        // when no well-formed tags are detected.
-        byte[] forDecode = bytes;
-        byte[] stripDst = new byte[bytes.length];
-        HtmlByteStripper.Result stripped =
-                HtmlByteStripper.strip(bytes, 0, bytes.length, stripDst, 0);
-        boolean stripUsed = stripped.tagCount > 0 && stripped.length > 0;
-        LOG.trace("junk-filter strip: input={}B tagCount={} stripped={}B 
used={}",
-                bytes.length, stripped.tagCount, stripped.length, stripUsed);
-        if (stripUsed) {
-            forDecode = new byte[stripped.length];
-            System.arraycopy(stripDst, 0, forDecode, 0, stripped.length);
-        }
-
-        // Decode probe under each candidate, preserving insertion order so
-        // tournament seeding is deterministic.
-        //
-        // Each decoded string is then run through HTML entity expansion.
-        // For entity-encoded HTML (numeric refs like &#3405;), this is
-        // load-bearing: entity refs are ASCII bytes that decode identically
-        // under every candidate charset, so they don't differentiate.
-        // After expansion they become real codepoints — and crucially, in
-        // the *wrong* decoding (e.g. mojibake-as-HAN), they introduce
-        // cross-script transitions (HAN ↔ MALAYALAM mid-document) that the
-        // quality detector's script-transition feature correctly penalises.
-        // See `20260512-junkdetector-codepoint-hash-plan.md` (AIT5 case).
+        // Decode each candidate, then HtmlContentCleaner.clean — the same
+        // tag-strip + entity-expand TrainJunkModel applies, so train and
+        // inference match.  Entity expansion is load-bearing: numeric refs
+        // become codepoints whose cross-script transitions expose mojibake
+        // under a wrong decoding (AIT5 case).
         Map<Charset, String> candidates = new LinkedHashMap<>();
         for (Charset cs : uniqueCharsets) {
-            String decoded = safeDecode(forDecode, cs);
+            String decoded = safeDecode(bytes, cs);
+            if (decoded != null && !decoded.isEmpty()) {
+                decoded = HtmlContentCleaner.clean(decoded);
+            }
             if (decoded != null && !decoded.isEmpty()) {
-                decoded = expandHtmlEntities(decoded);
                 candidates.put(cs, decoded);
                 if (LOG.isTraceEnabled()) {
                     int sampleLen = Math.min(400, decoded.length());
@@ -330,27 +308,9 @@ public class JunkFilterEncodingDetector implements MetaEncodingDetector {
     }
 
     private byte[] readProbe(TikaInputStream tis) throws IOException {
-        try {
-            tis.mark(readLimit);
-            byte[] buf = new byte[readLimit];
-            int total = 0;
-            int read;
-            while (total < readLimit
-                    && (read = tis.read(buf, total, readLimit - total)) != -1) 
{
-                total += read;
-            }
-            if (total == 0) {
-                return null;
-            }
-            if (total < readLimit) {
-                byte[] trimmed = new byte[total];
-                System.arraycopy(buf, 0, trimmed, 0, total);
-                return trimmed;
-            }
-            return buf;
-        } finally {
-            tis.reset();
-        }
+        // readLimit is the tag-stripped content target; cap raw reads at 512 KB.
+        byte[] probe = AdaptiveProbe.read(tis, readLimit, 
AdaptiveProbe.DEFAULT_RAW_CAP);
+        return probe.length == 0 ? null : probe;
     }
 
     private static String safeDecode(byte[] bytes, Charset charset) {
@@ -373,56 +333,13 @@ public class JunkFilterEncodingDetector implements MetaEncodingDetector {
     // intermixed with raw UTF-8 codepoints.
     // -----------------------------------------------------------------------
 
-    private static final Pattern ENTITY_DEC = Pattern.compile("&#(\\d{1,7});");
-    private static final Pattern ENTITY_HEX = 
Pattern.compile("&#[xX]([0-9a-fA-F]{1,6});");
-    private static final Pattern ENTITY_NAMED =
-            Pattern.compile("&(amp|lt|gt|quot|apos|nbsp|copy|reg);");
-
     /**
-     * Expands HTML numeric and a small set of named entity references in
-     * {@code s}.  Malformed or out-of-range entities pass through unchanged.
-     * The named-entity set is intentionally small — only the universally-
-     * declared HTML5 entities that don't depend on a DOCTYPE.  Anything more
-     * exotic stays as a literal entity reference (which scores as ASCII noise,
-     * the same as it would have before).
+     * Delegates to {@link HtmlContentCleaner#expandHtmlEntities} — the single
+     * implementation shared with training.  Retained here as the historical
+     * entry point used by tests and diagnostics.
      */
     static String expandHtmlEntities(String s) {
-        s = ENTITY_DEC.matcher(s).replaceAll(mr -> {
-            try {
-                int cp = Integer.parseInt(mr.group(1));
-                if (cp >= 0 && cp <= 0x10FFFF) {
-                    return Matcher.quoteReplacement(new 
String(Character.toChars(cp)));
-                }
-            } catch (NumberFormatException ignored) {
-                // overflow — fall through, leave entity literal
-            }
-            return Matcher.quoteReplacement(mr.group());
-        });
-        s = ENTITY_HEX.matcher(s).replaceAll(mr -> {
-            try {
-                int cp = Integer.parseInt(mr.group(1), 16);
-                if (cp >= 0 && cp <= 0x10FFFF) {
-                    return Matcher.quoteReplacement(new 
String(Character.toChars(cp)));
-                }
-            } catch (NumberFormatException ignored) {
-                // overflow — fall through, leave entity literal
-            }
-            return Matcher.quoteReplacement(mr.group());
-        });
-        s = ENTITY_NAMED.matcher(s).replaceAll(mr -> {
-            switch (mr.group(1)) {
-                case "amp":  return "&";
-                case "lt":   return "<";
-                case "gt":   return ">";
-                case "quot": return "\"";
-                case "apos": return "'";
-                case "nbsp": return " ";
-                case "copy": return "©";
-                case "reg":  return "®";
-                default:     return Matcher.quoteReplacement(mr.group());
-            }
-        });
-        return s;
+        return HtmlContentCleaner.expandHtmlEntities(s);
     }
 
     /**
diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
index b52e185eff..77517a4e3b 100644
--- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
+++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
@@ -38,6 +38,7 @@ import java.util.TreeMap;
 import java.util.zip.GZIPInputStream;
 import java.util.zip.GZIPOutputStream;
 
+import org.apache.tika.ml.junkdetect.HtmlContentCleaner;
 import org.apache.tika.ml.junkdetect.JunkDetector;
 import org.apache.tika.ml.junkdetect.V7Tables;
 
@@ -1820,11 +1821,22 @@ public class TrainJunkModel {
         return new float[]{(float) mu, (float) sigma};
     }
 
+    /**
+     * Opens a gzipped train/dev file, applying {@link HtmlContentCleaner#clean}
+     * to every line — the same cleaning {@code JunkFilterEncodingDetector} does
+     * at inference, so train and inference match.  No-op on clean corpus lines.
+     */
     static BufferedReader openGzipped(Path path) throws IOException {
         return new BufferedReader(
                 new InputStreamReader(
                         new GZIPInputStream(Files.newInputStream(path)),
-                        StandardCharsets.UTF_8));
+                        StandardCharsets.UTF_8)) {
+            @Override
+            public String readLine() throws IOException {
+                String l = super.readLine();
+                return l == null ? null : HtmlContentCleaner.clean(l);
+            }
+        };
     }
 
     /**
diff --git a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
index af491ba162..c09c38cdb4 100644
Binary files a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin and b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin differ

(tika) 03/09: TIKA-4731 - checkpoint before "common" refactoring

Reply via email to