(tika) branch main updated: TIKA-4745 -- efficiency improvements (#2878)

tallison Sat, 06 Jun 2026 10:47:40 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git



The following commit(s) were added to refs/heads/main by this push:
     new 8d9900e211 TIKA-4745 -- efficiency improvements (#2878)
8d9900e211 is described below

commit 8d9900e21127de0a20334ebc59ca0529395575bf
Author: Tim Allison <[email protected]>
AuthorDate: Sat Jun 6 13:47:25 2026 -0400

    TIKA-4745 -- efficiency improvements (#2878)
---
 .../tika/detect/EncodingDetectorContext.java       |  11 ++
 .../org/apache/tika/detect/EncodingProbeCache.java |  65 +++++++++++
 .../tika/parser/html/HtmlEncodingDetector.java     |  51 ++++++++-
 .../html/StandardCharsets_unsupported_by_IANA.txt  |   0
 .../tika/parser/html/HtmlEncodingDetectorTest.java |   0
 .../html/StandardHtmlEncodingDetectorTest.java     |   0
 .../apache/tika/ml/chardetect/AdaptiveProbe.java   |  15 ++-
 .../ml/chardetect/MojibusterEncodingDetector.java  |  22 +++-
 .../NaiveBayesBigramEncodingDetector.java          |  70 ++++++++----
 .../org/apache/tika/ml/chardetect/nb-bigram.bin    | Bin 1008871 -> 696579 bytes
 .../charsoup/CharSoupFeatureExtractor.java         |  14 ++-
 tika-ml/pom.xml                                    |   1 +
 .../ml/chardetect/tools/TrainNaiveBayesBigram.java |  21 +++-
 .../pom.xml                                        |  56 +++-------
 .../ml/junkdetect/tools/BoundaryBigramAudit.java   |   0
 .../ml/junkdetect/tools/BuildJunkTrainingData.java |   0
 .../tika/ml/junkdetect/tools/DebugScriptRuns.java  |   0
 .../tools/JunkDetectorTrainingConfig.java          |   0
 .../ml/junkdetect/tools/LineScriptFractions.java   |   0
 .../tika/ml/junkdetect/tools/ScriptCensus.java     |   0
 .../tika/ml/junkdetect/tools/TrainJunkModel.java   |  62 ++++-------
 .../ml/junkdetect/JunkDetectorRoundTripTest.java   |  43 ++-----
 .../tools/BuildJunkAugmentationData.java           |   0
 .../tools/BuildJunkAugmentationDataTest.java       |   0
 .../tools/JunkDetectorTrainingConfigTest.java      |   0
 tika-ml/tika-ml-junkdetect/pom.xml                 |  65 +----------
 .../apache/tika/ml/junkdetect/BigramTables.java    |  82 ++++++++++----
 .../apache/tika/ml/junkdetect/JunkDetector.java    | 110 +++++++++---------
 .../ml/junkdetect/JunkFilterEncodingDetector.java  | 123 ++++++++++++---------
 .../tika/ml/junkdetect/TextQualityFeatures.java    |  52 ++++++++-
 .../org/apache/tika/ml/junkdetect/junkdetect.bin   | Bin 2316809 -> 727979 bytes
 .../tika-parser-html-module/pom.xml                |   3 +
 32 files changed, 509 insertions(+), 357 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java
index 6957601e2c..426f508c14 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java
@@ -40,8 +40,19 @@ import java.util.Set;
 public class EncodingDetectorContext {
 
     private final List<Result> results = new ArrayList<>();
+    private final EncodingProbeCache probeCache = new EncodingProbeCache();
     private String arbitrationInfo;
 
+    /**
+     * Per-detection cache of the raw detection probe, shared across the detectors in
+     * this chain so they don't each re-read the same leading bytes. It lives and dies
+     * with this context (which is removed after detection), so it never leaks into
+     * recursive/attachment parsing.
+     */
+    public EncodingProbeCache getProbeCache() {
+        return probeCache;
+    }
+
     /**
      * Record the ranked results from a child detector.
      *
diff --git a/tika-core/src/main/java/org/apache/tika/detect/EncodingProbeCache.java b/tika-core/src/main/java/org/apache/tika/detect/EncodingProbeCache.java
new file mode 100644
index 0000000000..a4d783fb02
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/detect/EncodingProbeCache.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+/**
+ * Caches the raw encoding-detection probe (the leading bytes read for detection)
+ * so that multiple detectors in a chain do not each re-read and re-tag-strip the
+ * same bytes. For example a statistical detector and a downstream meta detector
+ * that re-reads the bytes for arbitration can share one probe.
+ * <p>
+ * An instance is held by {@link EncodingDetectorContext}, so it inherits that
+ * context's per-detection lifecycle: created fresh per detection and discarded
+ * with the context immediately afterwards. That matters because a
+ * {@link org.apache.tika.parser.ParseContext} flows on into recursive
+ * (attachment/embedded) parsing — a probe must never outlive the single detection
+ * it was read for.
+ * <p>
+ * Not thread-safe: a single detection runs its detectors sequentially on one
+ * thread. The cache is keyed by the probe parameters — {@link #get} returns the
+ * cached probe only when both {@code contentTarget} and {@code rawCap} match what
+ * it was stored with, so a detector that wants a differently-sized probe
+ * transparently reads (and caches) its own.
+ * <p>
+ * The cached array is shared read-only state; callers must not mutate it in place.
+ */
+public class EncodingProbeCache {
+
+    private byte[] probe;
+    private int contentTarget = -1;
+    private int rawCap = -1;
+
+    /**
+     * @return the cached probe if one was stored with the same {@code contentTarget} and
+     * {@code rawCap}; otherwise {@code null}
+     */
+    public byte[] get(int contentTarget, int rawCap) {
+        if (probe != null && this.contentTarget == contentTarget && this.rawCap == rawCap) {
+            return probe;
+        }
+        return null;
+    }
+
+    /**
+     * Stores the probe bytes read with the given parameters.
+     */
+    public void put(byte[] probe, int contentTarget, int rawCap) {
+        this.probe = probe;
+        this.contentTarget = contentTarget;
+        this.rawCap = rawCap;
+    }
+}
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
 
b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
index c052b062b6..1d9398ce5e 100644
--- 
a/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
+++ 
b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
@@ -20,7 +20,6 @@ import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.Serializable;
-import java.nio.ByteBuffer;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.Collections;
@@ -85,6 +84,7 @@ public class HtmlEncodingDetector implements EncodingDetector 
{
     private static final Pattern FLEXIBLE_CHARSET_ATTR_PATTERN =
             
Pattern.compile(("(?is)\\bcharset\\s*=\\s*(?:['\\\"]\\s*)?([-_:\\.a-z0-9]+)"));
     private static final Charset ASCII = Charset.forName("US-ASCII");
+    private static final Pattern HTML_COMMENT_PATTERN = Pattern.compile("<!--.*?(-->|$)");
     /**
      * HTML can include non-iana supported charsets that Java
      * recognizes, e.g. "unicode".  This can lead to incorrect 
detection/mojibake.
@@ -162,10 +162,20 @@ public class HtmlEncodingDetector implements 
EncodingDetector {
         }
         tis.reset();
 
-        String head = ASCII.decode(ByteBuffer.wrap(buffer, 0, n)).toString();
-        String headNoComments = head.replaceAll("<!--.*?(-->|$)", " ");
+        // findCharset only ever matches a meta tag (HTTP_META_PATTERN = "<\s*meta...").
+        // If the probe has no such tag, the full ASCII decode + comment-stripping
+        // regex below can only produce null — skip them. Byte-level, no allocation;
+        // a strict necessary condition for any non-empty result.
+        if (!containsMetaTag(buffer, n)) {
+            return Collections.emptyList();
+        }
+
+        String head = new String(buffer, 0, n, ASCII);
+        boolean hasComment = head.indexOf("<!--") >= 0;
+        String headNoComments =
+                hasComment ? HTML_COMMENT_PATTERN.matcher(head).replaceAll(" ") : head;
         Charset charset = findCharset(headNoComments);
-        if (charset == null) {
+        if (charset == null && hasComment) {
             charset = findCharset(head);
         }
         if (charset == null) {
@@ -175,6 +185,39 @@ public class HtmlEncodingDetector implements 
EncodingDetector {
                 EncodingResult.ResultType.DECLARATIVE));
     }
 
+    /**
+     * Byte-level scan for an opening meta tag, mirroring the {@code <\s*meta} prefix of
+     * {@link #HTTP_META_PATTERN} (ASCII, case-insensitive). Lets {@link #detect} skip the
+     * full ASCII decode + comment-stripping regex on probes that cannot contain a meta
+     * charset declaration. {@code <}, ASCII whitespace and {@code meta} are all ASCII, so a
+     * raw-byte scan is equivalent to scanning the decoded head.
+     */
+    private static boolean containsMetaTag(byte[] buf, int len) {
+        for (int i = 0; i < len; i++) {
+            if (buf[i] != '<') {
+                continue;
+            }
+            int j = i + 1;
+            while (j < len) {
+                int c = buf[j] & 0xFF;
+                if (c == ' ' || c == '\t' || c == '\n' || c == 0x0B || c == '\f'
+                        || c == '\r') {
+                    j++;
+                } else {
+                    break;
+                }
+            }
+            if (j + 4 <= len
+                    && (((buf[j] & 0xFF) | 0x20) == 'm')
+                    && (((buf[j + 1] & 0xFF) | 0x20) == 'e')
+                    && (((buf[j + 2] & 0xFF) | 0x20) == 't')
+                    && (((buf[j + 3] & 0xFF) | 0x20) == 'a')) {
+                return true;
+            }
+        }
+        return false;
+    }
+
     //returns null if no charset was found
     private Charset findCharset(String s) {
 
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/resources/org/apache/tika/parser/html/StandardCharsets_unsupported_by_IANA.txt
 
b/tika-encoding-detectors/tika-encoding-detector-html/src/main/resources/org/apache/tika/parser/html/StandardCharsets_unsupported_by_IANA.txt
similarity index 100%
rename from 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/resources/org/apache/tika/parser/html/StandardCharsets_unsupported_by_IANA.txt
rename to 
tika-encoding-detectors/tika-encoding-detector-html/src/main/resources/org/apache/tika/parser/html/StandardCharsets_unsupported_by_IANA.txt
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java
 
b/tika-encoding-detectors/tika-encoding-detector-html/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java
similarity index 100%
rename from 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java
rename to 
tika-encoding-detectors/tika-encoding-detector-html/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java
 
b/tika-encoding-detectors/tika-encoding-detector-html/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java
similarity index 100%
rename from 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java
rename to 
tika-encoding-detectors/tika-encoding-detector-html/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/AdaptiveProbe.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/AdaptiveProbe.java
index 34a081ec32..e2100b0533 100644
--- 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/AdaptiveProbe.java
+++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/AdaptiveProbe.java
@@ -55,11 +55,20 @@ public final class AdaptiveProbe {
             throws IOException {
         tis.mark(rawCap);
         try {
-            byte[] buf = new byte[rawCap];
-            byte[] stripDst = new byte[rawCap];
+            // Grow on demand rather than allocating (and zeroing) the full rawCap
+            // (e.g. 512 KB) twice up front: the vast majority of probes are far
+            // smaller. Bytes returned are identical to the eager-allocation version.
+            int cap = Math.min(rawCap, contentTarget);
+            byte[] buf = new byte[cap];
+            byte[] stripDst = new byte[cap];
             int total = 0;
             while (total < rawCap) {
                 int want = Math.min(rawCap - total, contentTarget);
+                if (total + want > buf.length) {
+                    int newCap = Math.min(rawCap, Math.max(buf.length * 2, total + want));
+                    buf = Arrays.copyOf(buf, newCap);
+                    stripDst = Arrays.copyOf(stripDst, newCap);
+                }
                 int n = IOUtils.read(tis, buf, total, want);
                 total += n;
                 HtmlByteStripper.Result r =
@@ -72,7 +81,7 @@ public final class AdaptiveProbe {
             if (total == 0) {
                 return new byte[0];
             }
-            return total == rawCap ? buf : Arrays.copyOf(buf, total);
+            return total == buf.length ? buf : Arrays.copyOf(buf, total);
         } finally {
             tis.reset();
         }
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
index 45a919274a..e225366091 100644
--- 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
+++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
@@ -29,6 +29,8 @@ import org.slf4j.LoggerFactory;
 
 import org.apache.tika.config.TikaComponent;
 import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.detect.EncodingDetectorContext;
+import org.apache.tika.detect.EncodingProbeCache;
 import org.apache.tika.detect.EncodingResult;
 import org.apache.tika.detect.HighByteLetterStats;
 import org.apache.tika.io.TikaInputStream;
@@ -208,7 +210,7 @@ public class MojibusterEncodingDetector implements 
EncodingDetector {
     @Override
     public List<EncodingResult> detect(TikaInputStream tis, Metadata metadata,
                                        ParseContext parseContext) throws 
IOException {
-        byte[] probe = readProbe(tis);
+        byte[] probe = readProbe(tis, parseContext);
         return detect(probe, metadata);
     }
 
@@ -749,7 +751,21 @@ public class MojibusterEncodingDetector implements 
EncodingDetector {
         return lower.contains("html") || lower.contains("xml");
     }
 
-    private static byte[] readProbe(TikaInputStream tis) throws IOException {
-        return AdaptiveProbe.read(tis, PROBE_CONTENT_TARGET, PROBE_RAW_CAP);
+    private static byte[] readProbe(TikaInputStream tis, ParseContext parseContext)
+            throws IOException {
+        EncodingDetectorContext context =
+                parseContext == null ? null : parseContext.get(EncodingDetectorContext.class);
+        EncodingProbeCache cache = context == null ? null : context.getProbeCache();
+        if (cache != null) {
+            byte[] cached = cache.get(PROBE_CONTENT_TARGET, PROBE_RAW_CAP);
+            if (cached != null) {
+                return cached;
+            }
+        }
+        byte[] probe = AdaptiveProbe.read(tis, PROBE_CONTENT_TARGET, PROBE_RAW_CAP);
+        if (cache != null) {
+            cache.put(probe, PROBE_CONTENT_TARGET, PROBE_RAW_CAP);
+        }
+        return probe;
     }
 }
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
index 5becf20ce6..da87d46b74 100644
--- 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
+++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
@@ -62,12 +62,11 @@ public class NaiveBayesBigramEncodingDetector implements 
EncodingDetector {
     private static final int BIGRAM_SPACE = 65_536;
 
     /**
-     * Cap probe scanning at 10 KB.  Bigram-based identification
+     * Cap probe scanning at 16 KB.  Bigram-based identification
      * saturates quickly — beyond the first 500-1000 bytes every
      * additional bigram nudges scores by &lt; 0.1 log-likelihood and
-     * doesn't change the argmax.  Reducing the cap from 4 KB to 1 KB
-     * quartes the inner-loop work on long probes at no measurable
-     * accuracy cost.
+     * doesn't change the argmax, so capping the scan bounds the
+     * inner-loop work on long probes at no measurable accuracy cost.
      */
     private static final int MAX_PROBE_BYTES = 16 * 1024;
 
@@ -216,8 +215,8 @@ public class NaiveBayesBigramEncodingDetector implements 
EncodingDetector {
     /**
      * Bigram-major int8 logP layout.  Quantized at load time via
      * per-class scale {@code scale[c] = maxAbs(class c's logP column) / 127}.
-     * In-memory footprint: {@code 65_536 × numClasses} bytes ≈ 2 MB for
-     * 32 classes, 4× smaller than float32.  The hot-loop accumulates
+     * In-memory footprint: {@code 65_536 × numClasses} bytes ≈ 2.1 MB for
+     * 34 classes, 4× smaller than float32.  The hot-loop accumulates
      * raw int8 products and applies dequantization once at the end of
      * the probe, CharSoup-style.
      */
@@ -309,9 +308,28 @@ public class NaiveBayesBigramEncodingDetector implements 
EncodingDetector {
                 for (int bg = 0; bg < BIGRAM_SPACE; bg++) {
                     logP8[bg * numClasses + c] = u;
                 }
-                // Overwrite with trained pairs.
+                // Overwrite with trained pairs. Bigram ids are sorted ascending and
+                // stored as varint deltas (LEB128) from the previous id.
+                int bigram = 0;
                 for (int i = 0; i < vocabSize; i++) {
-                    int bigram = dis.readUnsignedShort();
+                    long delta = 0;
+                    int shift = 0;
+                    int b;
+                    do {
+                        if (shift >= 35) {
+                            throw new IOException(
+                                    "Malformed varint in bigram-id deltas (too long)");
+                        }
+                        b = dis.readUnsignedByte();
+                        delta |= (long) (b & 0x7F) << shift;
+                        shift += 7;
+                    } while ((b & 0x80) != 0);
+                    long next = bigram + delta;
+                    if (next < 0 || next >= BIGRAM_SPACE) {
+                        throw new IOException("Bigram id out of range: " + next
+                                + " (expected [0, " + BIGRAM_SPACE + "))");
+                    }
+                    bigram = (int) next;
                     byte q = dis.readByte();
                     logP8[bigram * numClasses + c] = q;
                 }
@@ -521,6 +539,7 @@ public class NaiveBayesBigramEncodingDetector implements 
EncodingDetector {
         // diagnostic path.
         double[] score = new double[numClasses];
         double[] contributions = new double[numClasses];
+        double[] bestPerCohort = new double[Cohort.values().length];
         int hashCap = counts.capacity();
         for (int slot = 0; slot < hashCap; slot++) {
             int bigram = counts.keyAt(slot);
@@ -549,6 +568,15 @@ public class NaiveBayesBigramEncodingDetector implements 
EncodingDetector {
             // cohort, so the cap engages on cross-cohort gaps that a
             // max-vs-overall-runner-up cap missed when multiple classes
             // in top-1's cohort sat close together.
+            //
+            // Single per-class pass computes the contributions, the running
+            // max/topClass, AND the best contribution per cohort; bestCrossCohort
+            // then reduces over the (few) cohorts instead of a second full
+            // per-class pass, and the clip is fused into the accumulate.
+            // Bit-identical to the prior four-pass form: max/cross-cohort are exact
+            // (comparisons over the same value set), and the contribution formula
+            // and score[] accumulation order are unchanged.
+            java.util.Arrays.fill(bestPerCohort, Double.NEGATIVE_INFINITY);
             int topClass = -1;
             double max = Double.NEGATIVE_INFINITY;
             for (int c = 0; c < numClasses; c++) {
@@ -558,25 +586,27 @@ public class NaiveBayesBigramEncodingDetector implements 
EncodingDetector {
                     max = contrib;
                     topClass = c;
                 }
+                int co = cohorts[c].ordinal();
+                if (contrib > bestPerCohort[co]) {
+                    bestPerCohort[co] = contrib;
+                }
             }
-            Cohort topCohort = cohorts[topClass];
+            int topCohort = cohorts[topClass].ordinal();
             double bestCrossCohort = Double.NEGATIVE_INFINITY;
-            for (int c = 0; c < numClasses; c++) {
-                if (cohorts[c] != topCohort && contributions[c] > 
bestCrossCohort) {
-                    bestCrossCohort = contributions[c];
+            for (int k = 0; k < bestPerCohort.length; k++) {
+                if (k != topCohort && bestPerCohort[k] > bestCrossCohort) {
+                    bestCrossCohort = bestPerCohort[k];
                 }
             }
             // bestCrossCohort is always finite here: load requires >=2 
cohorts.
             double capValue = bestCrossCohort + CAP_PER_BIGRAM_NATS;
-            if (max > capValue) {
-                for (int c = 0; c < numClasses; c++) {
-                    if (contributions[c] > capValue) {
-                        contributions[c] = capValue;
-                    }
-                }
-            }
+            boolean clip = max > capValue;
             for (int c = 0; c < numClasses; c++) {
-                score[c] += contributions[c];
+                double v = contributions[c];
+                if (clip && v > capValue) {
+                    v = capValue;
+                }
+                score[c] += v;
             }
         }
         return new ScoreResult(score, scored, total);
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin
index b89188bb32..0cebc858bb 100644
Binary files 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin
 and 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin
 differ
diff --git 
a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java
 
b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java
index 7bbfc032e5..6b7e39bc68 100644
--- 
a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java
+++ 
b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java
@@ -244,9 +244,17 @@ public class CharSoupFeatureExtractor {
      * @return cleaned, NFC-normalized text
      */
     public static String preprocessNoTruncate(String rawText) {
-        // Strip URLs and emails
-        String text = URL_REGEX.matcher(rawText).replaceAll(" ");
-        text = MAIL_REGEX.matcher(text).replaceAll(" ");
+        // Strip URLs and emails. Both regexes scan the entire input on every call;
+        // skip each unless its required marker is present ("://" for URL_REGEX, "@"
+        // for MAIL_REGEX). This is a no-op for the common (markerless) case — the
+        // output is identical — but avoids a full-buffer regex scan + Matcher alloc.
+        String text = rawText;
+        if (text.indexOf("://") >= 0) {
+            text = URL_REGEX.matcher(text).replaceAll(" ");
+        }
+        if (text.indexOf('@') >= 0) {
+            text = MAIL_REGEX.matcher(text).replaceAll(" ");
+        }
 
         // NFC normalize
         if (!Normalizer.isNormalized(text, Normalizer.Form.NFC)) {
diff --git a/tika-ml/pom.xml b/tika-ml/pom.xml
index 11ddcb221c..46de69dc69 100644
--- a/tika-ml/pom.xml
+++ b/tika-ml/pom.xml
@@ -35,6 +35,7 @@
     <module>tika-ml-core</module>
     <module>tika-ml-chardetect</module>
     <module>tika-ml-junkdetect</module>
+    <module>tika-ml-junkdetect-tools</module>
   </modules>
 
   <build>
diff --git 
a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java
 
b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java
index a082f4c5be..6eaee95c08 100644
--- 
a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java
+++ 
b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java
@@ -427,8 +427,8 @@ public class TrainNaiveBayesBigram {
      *     float32 scale                      (per-class dequant)
      *     byte    unseenQ                    (int8 quantized unseen floor)
      *     int32 vocabSize                    (number of trained pairs)
-     *     for each kept bigram:
-     *       uint16 bigramKey
+     *     bigram keys sorted ascending, each pair stored as:
+     *       varint deltaFromPrevKey         (LEB128; first delta is the key itself)
      *       byte   logP8                     (int8 quantized)
      *
      * <p>Sparse representation: only trained bigram pairs are stored;
@@ -497,11 +497,22 @@ public class TrainNaiveBayesBigram {
                 dos.writeByte(unseenQ[c]);
                 float scale = perClassScale[c];
                 dos.writeInt(logProbsPerClass[c].size());
-                for (Map.Entry<Integer, Float> e : 
logProbsPerClass[c].entrySet()) {
-                    int q = Math.round(e.getValue() / scale);
+                // Bigram ids sorted ascending, stored as varint (LEB128) deltas from
+                // the previous id — most deltas fit in a single byte.
+                int[] keys = logProbsPerClass[c].keySet().stream()
+                        .mapToInt(Integer::intValue).sorted().toArray();
+                int prev = 0;
+                for (int key : keys) {
+                    int q = Math.round(logProbsPerClass[c].get(key) / scale);
                     if (q > 127) q = 127;
                     if (q < -127) q = -127;
-                    dos.writeShort(e.getKey());
+                    int delta = key - prev;
+                    prev = key;
+                    while ((delta & ~0x7F) != 0) {
+                        dos.writeByte((delta & 0x7F) | 0x80);
+                        delta >>>= 7;
+                    }
+                    dos.writeByte(delta);
                     dos.writeByte(q);
                 }
             }
diff --git a/tika-ml/tika-ml-junkdetect/pom.xml 
b/tika-ml/tika-ml-junkdetect-tools/pom.xml
similarity index 72%
copy from tika-ml/tika-ml-junkdetect/pom.xml
copy to tika-ml/tika-ml-junkdetect-tools/pom.xml
index fe717998cf..fe684093cd 100644
--- a/tika-ml/tika-ml-junkdetect/pom.xml
+++ b/tika-ml/tika-ml-junkdetect-tools/pom.xml
@@ -25,33 +25,19 @@
   </parent>
   <modelVersion>4.0.0</modelVersion>
 
-  <artifactId>tika-ml-junkdetect</artifactId>
-  <name>Apache Tika ML junk detector — runtime and training tools</name>
+  <artifactId>tika-ml-junkdetect-tools</artifactId>
+  <name>Apache Tika ML junk detector — training and evaluation tools</name>
   <description>
-    Language-agnostic text quality scorer that discriminates between clean 
UTF-8 text and
-    mojibake, reversed text, wrong-codec decodings, and other corruption forms.
-    Provides a standalone "languageyness" score suitable for re-OCR triggering 
and
-    charset-decoding arbitration.
-
-    Runtime classes (JunkDetector, ScriptDetector, feature extractors) and 
bundled model
-    resources live here. Training and evaluation CLI tools live in the tools 
subpackage.
+    Build-time training, evaluation, and diagnostic CLIs for the junk detector
+    (TrainJunkModel, BuildJunkTrainingData, and diagnostics).  These are not 
part
+    of the runtime detector — they are kept out of the tika-ml-junkdetect 
runtime
+    jar and built into a self-contained tools jar via the 'train' profile.
   </description>
 
   <dependencies>
     <dependency>
       <groupId>org.apache.tika</groupId>
-      <artifactId>tika-core</artifactId>
-      <version>${revision}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.tika</groupId>
-      <artifactId>tika-annotation-processor</artifactId>
-      <version>${revision}</version>
-      <scope>provided</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.tika</groupId>
-      <artifactId>tika-ml-core</artifactId>
+      <artifactId>tika-ml-junkdetect</artifactId>
       <version>${revision}</version>
     </dependency>
     <dependency>
@@ -63,9 +49,7 @@
     <!-- Test dependencies -->
     <!--
       tika-serialization is test-scope only because the one consumer
-      (BuildJunkAugmentationData) lives in src/test/java — it's a corpus-prep
-      tool, not part of the runtime detector. Keeps the production classpath of
-      tika-ml-junkdetect free of the serialization dep.
+      (BuildJunkAugmentationData) is a corpus-prep tool in src/test/java.
     -->
     <dependency>
       <groupId>org.apache.tika</groupId>
@@ -93,21 +77,11 @@
         <configuration>
           <archive>
             <manifestEntries>
-              
<Automatic-Module-Name>org.apache.tika.ml.junkdetect</Automatic-Module-Name>
+              
<Automatic-Module-Name>org.apache.tika.ml.junkdetect.tools</Automatic-Module-Name>
             </manifestEntries>
           </archive>
         </configuration>
       </plugin>
-      <plugin>
-        <groupId>org.apache.rat</groupId>
-        <artifactId>apache-rat-plugin</artifactId>
-        <configuration>
-          <inputExcludes>
-            <inputExclude>**/*.bin</inputExclude>
-            <inputExclude>**/*.txt</inputExclude>
-          </inputExcludes>
-        </configuration>
-      </plugin>
       <!-- Tools package uses System.out/printf freely -->
       <plugin>
         <groupId>de.thetaphi</groupId>
@@ -123,12 +97,11 @@
     <profile>
       <!--
         Build a self-contained fat JAR for model training and evaluation.
-        Usage:
-          ./mvnw package -pl tika-ml/tika-ml-junkdetect -am -Ptrain -DskipTests \
+        Usage (run from the repo root so the model resource path resolves):
+          ./mvnw package -pl tika-ml/tika-ml-junkdetect-tools -am -Ptrain -DskipTests \
               -Dmaven.repo.local=.local_m2_repo
-          java -jar 
tika-ml/tika-ml-junkdetect/target/tika-ml-junkdetect-*-tools.jar \
-              [BuildJunkTrainingData|TrainJunkModel|EvalJunkDetector] \
-              [args...]
+          java -jar tika-ml/tika-ml-junkdetect-tools/target/tika-ml-junkdetect-tools-*-tools.jar \
+              [BuildJunkTrainingData|TrainJunkModel|EvalJunkDetector] [args...]
       -->
       <id>train</id>
       <build>
@@ -170,7 +143,4 @@
     </profile>
   </profiles>
 
-  <scm>
-    <tag>3.0.0-rc1</tag>
-  </scm>
 </project>
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BoundaryBigramAudit.java
 
b/tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/BoundaryBigramAudit.java
similarity index 100%
rename from 
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BoundaryBigramAudit.java
rename to 
tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/BoundaryBigramAudit.java
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
 
b/tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
similarity index 100%
rename from 
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
rename to 
tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/DebugScriptRuns.java
 
b/tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/DebugScriptRuns.java
similarity index 100%
rename from 
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/DebugScriptRuns.java
rename to 
tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/DebugScriptRuns.java
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java
 
b/tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java
similarity index 100%
rename from 
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java
rename to 
tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/LineScriptFractions.java
 
b/tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/LineScriptFractions.java
similarity index 100%
rename from 
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/LineScriptFractions.java
rename to 
tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/LineScriptFractions.java
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/ScriptCensus.java
 
b/tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/ScriptCensus.java
similarity index 100%
rename from 
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/ScriptCensus.java
rename to 
tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/ScriptCensus.java
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
 
b/tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
similarity index 97%
rename from 
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
rename to 
tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
index 63bce5317f..13cbc20381 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
+++ 
b/tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
@@ -46,7 +46,7 @@ import org.apache.tika.ml.junkdetect.JunkDetector;
  *
  * <p>z1 (codepoint-bigram log-probability) is trained per script by bucketing
  * every bigram to its script ({@link JunkDetector#forEachScriptBigram}) and
- * building a per-script open-addressing bigram table with unigram backoff.
+ * building a per-script sorted-occupied bigram table with unigram backoff.
  * z2 (Unicode block-transition), z3 (control-byte fraction), and z4
  * (script-transition) are single global document-level features.  All features
  * are calibrated (mu/sigma) and combined by a single global contrastive
@@ -932,7 +932,7 @@ public class TrainJunkModel {
      *
      * @param trainFile         the per-script {@code *.train.gz}
      * @param minBigramCount    drop pairs whose count is below this
-     * @param loadFactor        target OA table load factor (e.g. 0.5)
+     * @param loadFactor        unused (retained for signature compatibility)
      * @param keyIndexBits      bit-width per index in the packed key
      *                          (each side of the pair must fit)
      */
@@ -970,9 +970,12 @@ public class TrainJunkModel {
     /**
      * Builds the {@link BigramTables} carrier from pre-tallied pair/unigram
      * counts.  Drops pairs below
-     * {@code minBigramCount}, assigns dense codepoint indices, and packs an
-     * open-addressing bigram table; unigram log-probs use {@code unigramTotal}
+     * {@code minBigramCount}, assigns dense codepoint indices, and packs the
+     * sorted-occupied bigram table; unigram log-probs use {@code unigramTotal}
      * as the denominator.
+     *
+     * <p>{@code loadFactor} is retained for signature compatibility but unused:
+     * the sorted-occupied table (binary-search lookup) has no load factor.
      */
     public static BigramTables buildBigramTablesFromCounts(
             HashMap<Long, long[]> pairCounts,
@@ -1037,12 +1040,8 @@ public class TrainJunkModel {
         // Quantize unigram log-probs.
         QuantizedFloats qUnigram = quantizeFloats(unigramLogP);
 
-        // --- Build the open-addressing bigram table. ---
-        int slots = nextPowerOfTwo((int) Math.max(2, Math.ceil(keptPairs / 
loadFactor)));
-        int[] keys = new int[slots];
-        java.util.Arrays.fill(keys, BigramTables.EMPTY_KEY);
-        // Compute log-probs first, quantize once, then write into the table
-        // alongside its key.
+        // --- Build the sorted-occupied bigram table (binary-search lookup). ---
+        // Compute log-probs first, quantize once, then sort by key.
         float[] keptLogP = new float[keptPairs];
         int[] keptKeys = new int[keptPairs];
         int writeIdx = 0;
@@ -1067,16 +1066,25 @@ public class TrainJunkModel {
         }
         // Quantize all kept log-probs together so they share min/max.
         QuantizedFloats qBigram = quantizeFloats(keptLogP);
-        byte[] values = new byte[slots];
+        // Sort (key, value) ascending by signed key so the loader can binary-search.
+        // Pack into a long (key in high 32 bits, value byte in low 8) for one sort.
+        long[] sortable = new long[keptPairs];
+        for (int i = 0; i < keptPairs; i++) {
+            sortable[i] = (((long) keptKeys[i]) << 32) | (qBigram.bytes[i] & 0xFFL);
+        }
+        java.util.Arrays.sort(sortable);
+        int[] keys = new int[keptPairs];
+        byte[] values = new byte[keptPairs];
         for (int i = 0; i < keptPairs; i++) {
-            insertOA(keys, values, keptKeys[i], qBigram.bytes[i]);
+            keys[i] = (int) (sortable[i] >> 32);
+            values[i] = (byte) (sortable[i] & 0xFF);
         }
 
         System.out.printf(
                 "    pair_counts: distinct=%,d, kept=%,d (>=%d), dropped=%,d  "
-                + "cp_index=%,d  slots=%,d (load=%.2f)%n",
+                + "cp_index=%,d  bigram_entries=%,d%n",
                 totalDistinct, keptPairs, minBigramCount, dropped,
-                cpIndex.length, slots, keptPairs / (double) slots);
+                cpIndex.length, keptPairs);
 
         return new BigramTables(cpIndex, keys, values, qUnigram.bytes,
                 qBigram.min, qBigram.max,
@@ -1084,32 +1092,6 @@ public class TrainJunkModel {
                 unigramFallbackLogP, BACKOFF_ALPHA);
     }
 
-    /**
-     * Inserts a {@code (packedKey, value)} pair into the open-addressing
-     * table.  The caller is responsible for sizing the table large enough
-     * to avoid an infinite probe (any load &lt; 1.0 is safe).
-     */
-    private static void insertOA(int[] keys, byte[] values, int packedKey, 
byte value) {
-        int mask = keys.length - 1;
-        int h = JunkDetector.mixIndexKey(packedKey) & mask;
-        while (keys[h] != BigramTables.EMPTY_KEY) {
-            if (keys[h] == packedKey) {
-                // Same key twice — shouldn't happen with our dedup, but be
-                // defensive and overwrite rather than corrupt.
-                values[h] = value;
-                return;
-            }
-            h = (h + 1) & mask;
-        }
-        keys[h] = packedKey;
-        values[h] = value;
-    }
-
-    private static int nextPowerOfTwo(int n) {
-        if (n < 1) return 1;
-        int p = Integer.highestOneBit(n - 1) << 1;
-        return Math.max(1, p);
-    }
 
     // -----------------------------------------------------------------------
     // Global contrastive combiner training
diff --git 
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
 
b/tika-ml/tika-ml-junkdetect-tools/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
similarity index 93%
rename from 
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
rename to 
tika-ml/tika-ml-junkdetect-tools/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
index 07efc64dd8..7433820efe 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
+++ 
b/tika-ml/tika-ml-junkdetect-tools/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
@@ -111,14 +111,12 @@ public class JunkDetectorRoundTripTest {
         // Same shape as the first test but with BOTH (A,B) and (B,A) in the
         // bigram table.  mean log-prob = -1.0, z1 = +4.0, logit = +4.0.
         int[] cpIndex = new int[]{'A', 'B'};
-        int[] keys = new int[4];
-        Arrays.fill(keys, BigramTables.EMPTY_KEY);
-        byte[] values = new byte[4];
         float bMin = -10.0f;
         float bMax = -1.0f;
         byte b = quantizeOne(-1.0f, bMin, bMax);
-        insertOA(keys, values, JunkDetector.packBigramKey(0, 1), b);
-        insertOA(keys, values, JunkDetector.packBigramKey(1, 0), b);
+        // sorted-occupied: packBigramKey(0,1)=1 < packBigramKey(1,0)=65536
+        int[] keys = {JunkDetector.packBigramKey(0, 1), JunkDetector.packBigramKey(1, 0)};
+        byte[] values = {b, b};
 
         float uMin = -5.0f;
         float uMax = -2.0f;
@@ -266,17 +264,14 @@ public class JunkDetectorRoundTripTest {
     private static BigramTables buildLatinTablesAB() {
         int[] cpIndex = new int[]{'A', 'B'};
 
-        // 4 slots ≈ 25% load for 1 pair.  Open-addressing with linear probe.
-        int[] keys = new int[4];
-        Arrays.fill(keys, BigramTables.EMPTY_KEY);
-        byte[] values = new byte[4];
-
         // Manual quantization with a chosen range so we don't hit the
         // degenerate single-element case.  range=[-10, -1] → -1.0 → byte 255.
         float bMin = -10.0f;
         float bMax = -1.0f;
         byte b = quantizeOne(-1.0f, bMin, bMax);
-        insertOA(keys, values, JunkDetector.packBigramKey(0, 1), b);
+        // sorted-occupied table with a single trained pair.
+        int[] keys = {JunkDetector.packBigramKey(0, 1)};
+        byte[] values = {b};
 
         float uMin = -5.0f;
         float uMax = -2.0f;
@@ -358,13 +353,10 @@ public class JunkDetectorRoundTripTest {
      *  (uppercase 'A'/'B' are absent from the index, so they must fold). */
     private static BigramTables buildLatinTablesLowerAB() {
         int[] cpIndex = new int[]{'a', 'b'};
-        int[] keys = new int[4];
-        Arrays.fill(keys, BigramTables.EMPTY_KEY);
-        byte[] values = new byte[4];
         float bMin = -10.0f;
         float bMax = -1.0f;
-        insertOA(keys, values, JunkDetector.packBigramKey(0, 1),
-                quantizeOne(-1.0f, bMin, bMax));
+        int[] keys = {JunkDetector.packBigramKey(0, 1)};
+        byte[] values = {quantizeOne(-1.0f, bMin, bMax)};
         float uMin = -5.0f;
         float uMax = -2.0f;
         byte[] unigramBytes = new byte[]{
@@ -403,25 +395,6 @@ public class JunkDetectorRoundTripTest {
         return (byte) q;
     }
 
-    /**
-     * Replica of {@code TrainJunkModel.insertOA} (package-private) for the
-     * test's hand-constructed tables.  Uses the same mix-hash as the
-     * production code path.
-     */
-    private static void insertOA(int[] keys, byte[] values, int packedKey, 
byte value) {
-        int mask = keys.length - 1;
-        int h = JunkDetector.mixIndexKey(packedKey) & mask;
-        while (keys[h] != BigramTables.EMPTY_KEY) {
-            if (keys[h] == packedKey) {
-                values[h] = value;
-                return;
-            }
-            h = (h + 1) & mask;
-        }
-        keys[h] = packedKey;
-        values[h] = value;
-    }
-
     /**
      * Saves a minimal model containing only LATIN, with the block / control /
      * script-transition features zeroed out and pure-z1 combiner weights
diff --git 
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationData.java
 
b/tika-ml/tika-ml-junkdetect-tools/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationData.java
similarity index 100%
rename from 
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationData.java
rename to 
tika-ml/tika-ml-junkdetect-tools/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationData.java
diff --git 
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationDataTest.java
 
b/tika-ml/tika-ml-junkdetect-tools/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationDataTest.java
similarity index 100%
rename from 
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationDataTest.java
rename to 
tika-ml/tika-ml-junkdetect-tools/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationDataTest.java
diff --git 
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfigTest.java
 
b/tika-ml/tika-ml-junkdetect-tools/src/test/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfigTest.java
similarity index 100%
rename from 
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfigTest.java
rename to 
tika-ml/tika-ml-junkdetect-tools/src/test/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfigTest.java
diff --git a/tika-ml/tika-ml-junkdetect/pom.xml 
b/tika-ml/tika-ml-junkdetect/pom.xml
index fe717998cf..5027cbe743 100644
--- a/tika-ml/tika-ml-junkdetect/pom.xml
+++ b/tika-ml/tika-ml-junkdetect/pom.xml
@@ -61,18 +61,6 @@
     </dependency>
 
     <!-- Test dependencies -->
-    <!--
-      tika-serialization is test-scope only because the one consumer
-      (BuildJunkAugmentationData) lives in src/test/java — it's a corpus-prep
-      tool, not part of the runtime detector. Keeps the production classpath of
-      tika-ml-junkdetect free of the serialization dep.
-    -->
-    <dependency>
-      <groupId>org.apache.tika</groupId>
-      <artifactId>tika-serialization</artifactId>
-      <version>${revision}</version>
-      <scope>test</scope>
-    </dependency>
     <dependency>
       <groupId>org.junit.jupiter</groupId>
       <artifactId>junit-jupiter-api</artifactId>
@@ -108,7 +96,7 @@
           </inputExcludes>
         </configuration>
       </plugin>
-      <!-- Tools package uses System.out/printf freely -->
+      <!-- Diagnostic tests print to stdout / use default-locale formatting freely. -->
       <plugin>
         <groupId>de.thetaphi</groupId>
         <artifactId>forbiddenapis</artifactId>
@@ -119,57 +107,6 @@
     </plugins>
   </build>
 
-  <profiles>
-    <profile>
-      <!--
-        Build a self-contained fat JAR for model training and evaluation.
-        Usage:
-          ./mvnw package -pl tika-ml/tika-ml-junkdetect -am -Ptrain 
-DskipTests \
-              -Dmaven.repo.local=.local_m2_repo
-          java -jar 
tika-ml/tika-ml-junkdetect/target/tika-ml-junkdetect-*-tools.jar \
-              [BuildJunkTrainingData|TrainJunkModel|EvalJunkDetector] \
-              [args...]
-      -->
-      <id>train</id>
-      <build>
-        <plugins>
-          <plugin>
-            <groupId>org.apache.maven.plugins</groupId>
-            <artifactId>maven-shade-plugin</artifactId>
-            <executions>
-              <execution>
-                <phase>package</phase>
-                <goals><goal>shade</goal></goals>
-                <configuration>
-                  <shadedArtifactAttached>true</shadedArtifactAttached>
-                  <shadedClassifierName>tools</shadedClassifierName>
-                  <transformers>
-                    <transformer 
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
-                      
<mainClass>org.apache.tika.ml.junkdetect.tools.TrainJunkModel</mainClass>
-                    </transformer>
-                    <transformer 
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"
 />
-                    <transformer 
implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer"
 />
-                    <transformer 
implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer"
 />
-                  </transformers>
-                  <filters>
-                    <filter>
-                      <artifact>*:*</artifact>
-                      <excludes>
-                        <exclude>META-INF/*.SF</exclude>
-                        <exclude>META-INF/*.DSA</exclude>
-                        <exclude>META-INF/*.RSA</exclude>
-                      </excludes>
-                    </filter>
-                  </filters>
-                </configuration>
-              </execution>
-            </executions>
-          </plugin>
-        </plugins>
-      </build>
-    </profile>
-  </profiles>
-
   <scm>
     <tag>3.0.0-rc1</tag>
   </scm>
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/BigramTables.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/BigramTables.java
index 5c7e738290..05974bd927 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/BigramTables.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/BigramTables.java
@@ -36,12 +36,11 @@ import java.nio.ByteOrder;
  *       script.  Codepoint → dense index is a binary search; index →
  *       codepoint is direct array access.  Typical sizes: ~7K-15K for HAN,
  *       ~200-500 for most other scripts.
- *   <li>{@code bigramKeys} / {@code bigramValues} — parallel arrays
- *       implementing an open-addressed hash table with linear probing.
- *       Each key is a 32-bit value {@code (idxA << 16) | idxB}; key {@code
- *       -1} means "empty slot."  Indices are bounded at 16 bits (65535),
- *       which is comfortably above the largest per-script codepoint count
- *       we observe.
+ *   <li>{@code bigramKeys} / {@code bigramValues} — parallel arrays of the
+ *       occupied entries only, sorted ascending by key for binary-search
+ *       lookup.  Each key is a 32-bit value {@code (idxA << 16) | idxB}.
+ *       Indices are bounded at 16 bits (65535), comfortably above the
+ *       largest per-script codepoint count we observe.
  *   <li>{@code unigramTable} — {@code byte[numCodepoints]}, quantized
  *       unigram log-probabilities indexed by the same codepoint→index map.
  *   <li>{@code bigramQuantMin/Max}, {@code unigramQuantMin/Max} —
@@ -56,14 +55,12 @@ import java.nio.ByteOrder;
  *       independence sum.
  * </ul>
  *
- * <p>Membership semantics: no Bloom filter.  The empty-slot sentinel is
- * the membership oracle — a pair is "seen" iff binary-search finds both
- * codepoints in the index AND a probe sequence hits a matching key before
- * an empty slot.  Lookups are therefore exact.
+ * <p>Membership semantics: no Bloom filter.  A pair is "seen" iff
+ * binary-search finds both codepoints in the index AND finds the packed
+ * key in {@code bigramKeys}.  Lookups are therefore exact.
  *
- * <p>Fields are package-private so the
- * {@link org.apache.tika.ml.junkdetect.tools.TrainJunkModel} trainer can
- * construct instances directly without going through accessors.
+ * <p>Instances are built by the trainer ({@code TrainJunkModel}, in the
+ * tika-ml-junkdetect-tools module) and read back via {@link #readFrom}.
  */
 public final class BigramTables {
 
@@ -124,14 +121,23 @@ public final class BigramTables {
         cpBuf.asIntBuffer().put(codepointIndex);
         dos.write(cpBuf.array());
 
-        // Bigram open-addressing table (keys + values).
+        // Bigram table: sorted-occupied keys (ascending) + parallel values.
+        // Store key[0] raw, then varint (LEB128) deltas from the previous key;
+        // deltas are small because the keys are sorted and dense.
         dos.writeInt(bigramKeys.length);
         dos.writeFloat(bigramQuantMin);
         dos.writeFloat(bigramQuantMax);
-        ByteBuffer keyBuf = ByteBuffer.allocate(bigramKeys.length * 4)
-                .order(ByteOrder.BIG_ENDIAN);
-        keyBuf.asIntBuffer().put(bigramKeys);
-        dos.write(keyBuf.array());
+        if (bigramKeys.length > 0) {
+            dos.writeInt(bigramKeys[0]);
+            for (int i = 1; i < bigramKeys.length; i++) {
+                long delta = (long) bigramKeys[i] - (long) bigramKeys[i - 1];
+                if (delta <= 0) {
+                    throw new IOException("bigramKeys must be strictly ascending "
+                            + "(no duplicates); non-increasing at index " + i);
+                }
+                writeVarLong(dos, delta);
+            }
+        }
         dos.write(bigramValues);
 
         // Unigram table.
@@ -153,9 +159,18 @@ public final class BigramTables {
         int slots = dis.readInt();
         float bMin = dis.readFloat();
         float bMax = dis.readFloat();
-        byte[] keyBytes = dis.readNBytes(slots * 4);
         int[] keys = new int[slots];
-        
ByteBuffer.wrap(keyBytes).order(ByteOrder.BIG_ENDIAN).asIntBuffer().get(keys);
+        if (slots > 0) {
+            keys[0] = dis.readInt();
+            for (int i = 1; i < slots; i++) {
+                long next = (long) keys[i - 1] + readVarLong(dis);
+                if (next <= keys[i - 1] || next > Integer.MAX_VALUE) {
+                    throw new IOException("Corrupt bigram keys: not strictly "
+                            + "ascending / out of range at index " + i);
+                }
+                keys[i] = (int) next;
+            }
+        }
         byte[] values = dis.readNBytes(slots);
 
         float uMin = dis.readFloat();
@@ -167,11 +182,36 @@ public final class BigramTables {
                 bMin, bMax, uMin, uMax, uFallback, backoffAlpha);
     }
 
+    /** Writes a non-negative long as an unsigned LEB128 varint. */
+    private static void writeVarLong(DataOutputStream dos, long v) throws IOException {
+        while ((v & ~0x7FL) != 0) {
+            dos.writeByte((int) ((v & 0x7F) | 0x80));
+            v >>>= 7;
+        }
+        dos.writeByte((int) v);
+    }
+
+    /** Reads an unsigned LEB128 varint written by {@link #writeVarLong}. */
+    private static long readVarLong(DataInputStream dis) throws IOException {
+        long v = 0;
+        int shift = 0;
+        int b;
+        do {
+            if (shift >= 64) {
+                throw new IOException("Malformed varint in bigram key deltas (too long)");
+            }
+            b = dis.readUnsignedByte();
+            v |= (long) (b & 0x7F) << shift;
+            shift += 7;
+        } while ((b & 0x80) != 0);
+        return v;
+    }
+
     /**
      * Returns a one-line summary for trainer progress output.
      */
     public String statsString() {
-        return String.format(
+        return String.format(java.util.Locale.ROOT,
                 "  cp_index=%d, bigram_slots=%d (load≈%.2f), "
                 + "bigram_range=[%.3f, %.3f], unigram_range=[%.3f, %.3f]",
                 codepointIndex.length, bigramKeys.length,
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
index 2f117479d0..b63f8787e8 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
@@ -235,41 +235,41 @@ public final class JunkDetector implements 
TextQualityDetector {
      *   [4 bytes]    num_scripts (int BE)
      *   [1 byte]     block_scheme_version  (must equal
      *                {@link UnicodeBlockRanges#SCHEME_VERSION})
+     *   // z4 — global script-transition section
      *   [1 byte]     num_script_buckets
      *   for each bucket:
      *     [2 bytes]      name length (ushort BE)
      *     [name bytes]   bucket name (UTF-8)
-     *   [num_script_buckets² × 4 bytes]  script-transition log-prob table (F4)
-     *   [4 bytes]    mu4 (float32 BE)
-     *   [4 bytes]    sigma4 (float32 BE)
+     *   [4 bytes]    scriptTrans_quant_min (float32 BE)
+     *   [4 bytes]    scriptTrans_quant_max (float32 BE)
+     *   [num_script_buckets² × 2 bytes]  script-transition table (z4, int16-quantized)
+     *   [4 bytes]    mu4 (z4 calibration, float32 BE)
+     *   [4 bytes]    sigma4
+     *   // z2 — global block-transition section
+     *   [4 bytes]    block_quant_min (float32 BE)
+     *   [4 bytes]    block_quant_max (float32 BE)
+     *   [block_N² × 2 bytes]  block-transition table (z2, int16-quantized)
+     *   [4 bytes]    mu2 (z2 calibration)
+     *   [4 bytes]    sigma2
+     *   // global per-feature calibrations, {mu, sigma} float32 pairs
+     *   [8 bytes]    z3 calibration (control-byte ratio)
+     *   [8 bytes]    z5 calibration (letter-adjacent-to-mark)
+     *   [8 bytes]    z6 calibration (replacement-char ratio)
+     *   [8 bytes]    z9 calibration (script-alternation)
+     *   // global combiner
+     *   [1 byte]     num_features
+     *   [(num_features+1) × 4 bytes]  combiner weights w1..wN and bias
+     *   // per-script section
      *   for each script (sorted by name):
      *     [2 bytes]      name length
      *     [name bytes]   script name (UTF-8)
-     *     [4 bytes]      mu1 (F1 calibration, codepoint-bigram mean log-prob)
+     *     [4 bytes]      mu1 (z1 calibration, codepoint-bigram mean log-prob)
      *     [4 bytes]      sigma1
-     *     // bigram tables for this script — see {@link BigramTables#writeTo}
-     *     [4 bytes]      backoff_alpha (float32 BE)
-     *     [4 bytes]      codepoint_count
-     *     [codepoint_count × 4 bytes]  codepoint index (sorted, ascending)
-     *     [4 bytes]      bigram_slots (power of 2)
-     *     [4 bytes]      bigram_quant_min (float32 BE)
-     *     [4 bytes]      bigram_quant_max (float32 BE)
-     *     [bigram_slots × 4 bytes]  bigram open-addressing keys
-     *                                ((idxA<<16)|idxB, or {@link 
BigramTables#EMPTY_KEY})
-     *     [bigram_slots bytes]      bigram values (8-bit quantized log-probs)
-     *     [4 bytes]      unigram_quant_min (float32 BE)
-     *     [4 bytes]      unigram_quant_max (float32 BE)
-     *     [4 bytes]      unigram_fallback_log_prob (float32 BE; used for
-     *                                                codepoints not in index)
-     *     [codepoint_count bytes]   unigram values (8-bit quantized log-probs)
-     *     // F2/F3/classifier
-     *     [4 bytes]      mu2 (F2 calibration)
-     *     [4 bytes]      sigma2
-     *     [block_N² × 4 bytes]  block-transition log-prob table (F2)
-     *     [4 bytes]      mu3 (F3 calibration)
-     *     [4 bytes]      sigma3
-     *     [1 byte]       num_features
-     *     [(num_features+1) × 4 bytes]  classifier weights w1..wN and bias
+     *     [variable]     bigram + unigram tables — exact layout in
+     *                    {@link BigramTables#writeTo}: codepoint index, then the
+     *                    sorted-occupied bigram keys (key[0] as int32 BE followed
+     *                    by LEB128 varint deltas) and 8-bit quantized bigram and
+     *                    unigram log-prob values
      * </pre>
      */
     public static JunkDetector load(InputStream rawIs) throws IOException {
@@ -498,6 +498,16 @@ public final class JunkDetector implements 
TextQualityDetector {
         int[] cps = text.codePoints().toArray();
 
         Map<String, double[]> buckets = new HashMap<>(); // script -> 
{sumLogP, count}
+        // Left-index memo.  forEachScriptBigram emits (^,x),(x,y),(y,$)... so within
+        // a run each pair's right codepoint b is the next pair's left codepoint a.
+        // Reuse the previous pair's right-index as this pair's left-index when they
+        // match (same codepoint AND same script => same table), so each codepoint is
+        // binary-searched in the script's index once instead of twice.  Bit-identical
+        // to scoring each pair independently; the guard falls back to a fresh search
+        // whenever the overlap doesn't hold (run boundary, sentinel, script change).
+        String[] lastScript = {null};
+        int[] lastB = {Integer.MIN_VALUE};
+        int[] lastBIdx = {-1};
         forEachScriptBigram(cps, (script, a, b) -> {
             if (!calibrations.containsKey(script)) {
                 return;
@@ -506,7 +516,13 @@ public final class JunkDetector implements 
TextQualityDetector {
             if (t == null) {
                 return;
             }
-            double lp = computeF1MeanLogP(new int[]{a, b}, t);
+            int idxA = (a == lastB[0] && script.equals(lastScript[0]))
+                    ? lastBIdx[0] : codepointToIndex(t, a);
+            int idxB = codepointToIndex(t, b);
+            lastScript[0] = script;
+            lastB[0] = b;
+            lastBIdx[0] = idxB;
+            double lp = scorePairF1(a, idxA, b, idxB, t);
             if (Double.isNaN(lp)) {
                 return;
             }
@@ -839,7 +855,7 @@ public final class JunkDetector implements 
TextQualityDetector {
         for (int i = 0; i < text.length(); ) {
             int cp = text.codePointAt(i);
             i += Character.charCount(cp);
-            Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+            Character.UnicodeScript s = TextQualityFeatures.scriptOf(cp);
             if (s == Character.UnicodeScript.COMMON
                     || s == Character.UnicodeScript.INHERITED
                     || s == Character.UnicodeScript.UNKNOWN) {
@@ -970,22 +986,6 @@ public final class JunkDetector implements 
TextQualityDetector {
         return java.util.Arrays.binarySearch(tables.codepointIndex, cp);
     }
 
-    /**
-     * Mixing function used to scatter packed (idxA, idxB) keys across
-     * the open-addressing table.  A simple integer finalizer (splitmix32
-     * style) gives good distribution for sequential index values.
-     *
-     * <p>Public so the trainer's open-addressing insertion routine uses
-     * the same probe order as inference — drift here would silently
-     * corrupt every lookup.
-     */
-    public static int mixIndexKey(int packedKey) {
-        int x = packedKey;
-        x = (x ^ (x >>> 16)) * 0x7feb352d;
-        x = (x ^ (x >>> 15)) * 0x846ca68b;
-        x = x ^ (x >>> 16);
-        return x;
-    }
 
     /**
      * Packed bigram key for indices {@code (a, b)} where each index fits in
@@ -1085,20 +1085,12 @@ public final class JunkDetector implements 
TextQualityDetector {
      * for {@code (idxA, idxB)}, or {@code -1} if not present (probe hit an
      * empty slot first).
      *
-     * <p>Linear probing with the same mix-hash used at training time —
-     * required for the table to be readable, not just writable.
+     * <p>{@code bigramKeys} is sorted ascending (signed), so this is a binary search.
      */
     static int lookupBigramSlot(BigramTables tables, int idxA, int idxB) {
         int packedKey = packBigramKey(idxA, idxB);
-        int[] keys = tables.bigramKeys;
-        int mask = keys.length - 1;
-        int h = mixIndexKey(packedKey) & mask;
-        while (true) {
-            int k = keys[h];
-            if (k == BigramTables.EMPTY_KEY) return -1;
-            if (k == packedKey) return h;
-            h = (h + 1) & mask;
-        }
+        int slot = java.util.Arrays.binarySearch(tables.bigramKeys, packedKey);
+        return slot >= 0 ? slot : -1;
     }
 
     private static double unigramLogProb(BigramTables tables, int idx) {
@@ -1134,7 +1126,7 @@ public final class JunkDetector implements 
TextQualityDetector {
         for (int i = 0; i < text.length(); ) {
             int cp = text.codePointAt(i);
             i += Character.charCount(cp);
-            Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+            Character.UnicodeScript s = TextQualityFeatures.scriptOf(cp);
             if (s == Character.UnicodeScript.COMMON
                     || s == Character.UnicodeScript.INHERITED
                     || s == Character.UnicodeScript.UNKNOWN) {
@@ -1170,7 +1162,7 @@ public final class JunkDetector implements 
TextQualityDetector {
 
     /** COMMON-class predicate: COMMON, INHERITED, UNKNOWN all pool into COMMON. */
     static String classKey(int cp) {
-        Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+        Character.UnicodeScript s = TextQualityFeatures.scriptOf(cp);
         if (s == Character.UnicodeScript.COMMON
                 || s == Character.UnicodeScript.INHERITED
                 || s == Character.UnicodeScript.UNKNOWN) {
@@ -1381,7 +1373,7 @@ public final class JunkDetector implements 
TextQualityDetector {
         Map<Character.UnicodeScript, Integer> counts = new HashMap<>();
         for (int i = 0; i < text.length(); ) {
             int cp = text.codePointAt(i);
-            Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+            Character.UnicodeScript s = TextQualityFeatures.scriptOf(cp);
             if (s != Character.UnicodeScript.COMMON
                     && s != Character.UnicodeScript.INHERITED
                     && s != Character.UnicodeScript.UNKNOWN) {
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
index b8cb75de01..0f627744bb 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
@@ -35,6 +35,7 @@ import org.slf4j.LoggerFactory;
 import org.apache.tika.config.TikaComponent;
 import org.apache.tika.detect.CharsetSupersets;
 import org.apache.tika.detect.EncodingDetectorContext;
+import org.apache.tika.detect.EncodingProbeCache;
 import org.apache.tika.detect.EncodingResult;
 import org.apache.tika.detect.HighByteLetterStats;
 import org.apache.tika.detect.MetaEncodingDetector;
@@ -156,7 +157,7 @@ public class JunkFilterEncodingDetector implements 
MetaEncodingDetector {
             return Collections.emptyList();
         }
 
-        byte[] bytes = readProbe(tis);
+        byte[] bytes = readProbe(tis, context);
         if (bytes == null || bytes.length == 0) {
             context.setArbitrationInfo("junk-filter-empty-stream");
             return Collections.emptyList();
@@ -246,31 +247,20 @@ public class JunkFilterEncodingDetector implements 
MetaEncodingDetector {
         Charset champion = null;
         double championZ = Double.NEGATIVE_INFINITY;
         Map<Charset, Double> scoreByCharset = new LinkedHashMap<>();
-        Map<Charset, Double> diffByCharset = new LinkedHashMap<>();
-        // Dedup by text: [0] = whole-text z (the champion + anchor metric, kept
-        // exactly as before); [1] = script-letter "diff" z (codepoints >= 0x80
-        // that are letters/ideographs — the high bytes where the candidate
-        // decodes actually differ), used ONLY for the family gate below.
-        Map<String, float[]> zByText = new HashMap<>();
+        // Whole-text z (the champion + anchor metric), deduped by decoded text.
+        Map<String, Float> wholeZByText = new HashMap<>();
         for (Map.Entry<Charset, String> entry : candidates.entrySet()) {
             String text = entry.getValue();
-            float[] zs = zByText.get(text);
-            if (zs == null) {
+            Float wholeZ = wholeZByText.get(text);
+            if (wholeZ == null) {
                 org.apache.tika.quality.TextQualityScore sc = 
qualityDetector.score(text);
-                float wholeZ = sc.isUnknown() ? Float.NEGATIVE_INFINITY : 
sc.getZScore();
-                String diff = scriptLetters(text);
-                float diffZ = Float.NEGATIVE_INFINITY;
-                if (!diff.isEmpty()) {
-                    org.apache.tika.quality.TextQualityScore d = 
qualityDetector.score(diff);
-                    diffZ = d.isUnknown() ? Float.NEGATIVE_INFINITY : 
d.getZScore();
-                }
-                zs = new float[]{wholeZ, diffZ};
-                zByText.put(text, zs);
+                wholeZ = sc.isUnknown() ? Float.NEGATIVE_INFINITY : 
sc.getZScore();
+                wholeZByText.put(text, wholeZ);
             }
-            scoreByCharset.put(entry.getKey(), (double) zs[0]);
-            diffByCharset.put(entry.getKey(), (double) zs[1]);
-            if (zs[0] > championZ) {
-                championZ = zs[0];
+            double z = wholeZ;
+            scoreByCharset.put(entry.getKey(), z);
+            if (z > championZ) {
+                championZ = z;
                 champion = entry.getKey();
             }
         }
@@ -284,34 +274,53 @@ public class JunkFilterEncodingDetector implements 
MetaEncodingDetector {
         // CJK/non-CJK BOUNDARY for COMMON-dominated docs (markup/digits/punct
         // decode identically and swamp the few discriminating high bytes),
         // producing false-CJK and real-CJK demotion.  The script-letter "diff" z
-        // reads that boundary cleanly (coherent CJK vs garbage), so use it to
-        // decide ONLY the family; within a family the whole-text champion stands
-        // (Latin-vs-Latin etc. untouched — a blanket diff-score regressed there).
-        // Override only on a clear diff margin.
-        double bestCjkDiff = Double.NEGATIVE_INFINITY;
-        double bestNonCjkDiff = Double.NEGATIVE_INFINITY;
-        for (Map.Entry<Charset, Double> e : diffByCharset.entrySet()) {
-            if (isCjkCharset(e.getKey().name())) {
-                bestCjkDiff = Math.max(bestCjkDiff, e.getValue());
-            } else {
-                bestNonCjkDiff = Math.max(bestNonCjkDiff, e.getValue());
+        // (codepoints >= 0x80 that are letters/ideographs — the high bytes where
+        // candidate decodes actually differ) reads that boundary cleanly, so use
+        // it to decide ONLY the family; within a family the whole-text champion
+        // stands (Latin-vs-Latin etc. untouched — a blanket diff-score regressed).
+        //
+        // DEMOTE-ONLY and CJK-champion-only: the gate fires only to demote a CJK
+        // champion to non-CJK (the false-CJK fix).  The reverse (promote non-CJK
+        // -> CJK) is NOT done: measured at 29k, the diff z reliably says "this CJK
+        // pick is really non-CJK" (OOV improves on every such flip) but UNreliably
+        // the reverse (the junk model over-rates ideograph mojibake vs sparse
+        // Latin letters); the promote direction is also unnecessary — genuine CJK
+        // is html-meta-declared upstream.  Because the gate can only act when the
+        // champion is CJK, the second "diff" score per candidate is needed ONLY
+        // then — compute it lazily and skip it entirely for the common non-CJK
+        // champion (halving the score() calls there).
+        if (isCjkCharset(champion.name())) {
+            double bestCjkDiff = Double.NEGATIVE_INFINITY;
+            double bestNonCjkDiff = Double.NEGATIVE_INFINITY;
+            Map<String, Float> diffZByText = new HashMap<>();
+            for (Map.Entry<Charset, String> entry : candidates.entrySet()) {
+                String text = entry.getValue();
+                Float diffZ = diffZByText.get(text);
+                if (diffZ == null) {
+                    String diff = scriptLetters(text);
+                    float dz = Float.NEGATIVE_INFINITY;
+                    if (!diff.isEmpty()) {
+                        org.apache.tika.quality.TextQualityScore d = 
qualityDetector.score(diff);
+                        dz = d.isUnknown() ? Float.NEGATIVE_INFINITY : 
d.getZScore();
+                    }
+                    diffZ = dz;
+                    diffZByText.put(text, diffZ);
+                }
+                double dz = diffZ;
+                if (isCjkCharset(entry.getKey().name())) {
+                    bestCjkDiff = Math.max(bestCjkDiff, dz);
+                } else {
+                    bestNonCjkDiff = Math.max(bestNonCjkDiff, dz);
+                }
             }
-        }
-        // DEMOTE-ONLY: fire only to demote a CJK champion to non-CJK when the
-        // diff z clearly prefers non-CJK (the false-CJK fix).  The reverse
-        // (promote non-CJK -> CJK) is NOT done: measured at 29k, the diff z
-        // reliably says "this CJK pick is really non-CJK" (OOV improves on every
-        // such flip) but UNreliably says "this non-CJK pick is really CJK" (the
-        // junk model over-rates ideograph mojibake vs sparse Latin letters — OOV
-        // worsened on every promote flip).  The promote direction is also
-        // unnecessary: genuine CJK is html-meta-declared upstream.
-        if (isCjkCharset(champion.name())
-                && bestNonCjkDiff > bestCjkDiff + FAMILY_DIFF_MARGIN) {
-            Charset reFam = bestInFamily(scoreByCharset, false);
-            if (reFam != null) {
-                LOG.trace("junk-filter family gate: {} (CJK) -> {} (non-CJK by 
diff z)",
-                        champion.name(), reFam.name());
-                champion = reFam;
+            // Override only on a clear diff margin.
+            if (bestNonCjkDiff > bestCjkDiff + FAMILY_DIFF_MARGIN) {
+                Charset reFam = bestInFamily(scoreByCharset, false);
+                if (reFam != null) {
+                    LOG.trace("junk-filter family gate: {} (CJK) -> {} 
(non-CJK by diff z)",
+                            champion.name(), reFam.name());
+                    champion = reFam;
+                }
             }
         }
 
@@ -584,9 +593,21 @@ public class JunkFilterEncodingDetector implements 
MetaEncodingDetector {
         return true;
     }
 
-    private byte[] readProbe(TikaInputStream tis) throws IOException {
+    private byte[] readProbe(TikaInputStream tis, EncodingDetectorContext 
context)
+            throws IOException {
         // readLimit is the tag-stripped content target; cap raw reads at 512 KB.
-        byte[] probe = AdaptiveProbe.read(tis, readLimit, 
AdaptiveProbe.DEFAULT_RAW_CAP);
+        int rawCap = AdaptiveProbe.DEFAULT_RAW_CAP;
+        EncodingProbeCache cache = context == null ? null : 
context.getProbeCache();
+        if (cache != null) {
+            byte[] cached = cache.get(readLimit, rawCap);
+            if (cached != null) {
+                return cached.length == 0 ? null : cached;
+            }
+        }
+        byte[] probe = AdaptiveProbe.read(tis, readLimit, rawCap);
+        if (cache != null) {
+            cache.put(probe, readLimit, rawCap);
+        }
         return probe.length == 0 ? null : probe;
     }
 
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java
index 2ae926a927..b5b7c5aeb2 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java
@@ -43,6 +43,46 @@ public final class TextQualityFeatures {
     private TextQualityFeatures() {
     }
 
+    // -----------------------------------------------------------------------
+    // Memoized Unicode-script lookup
+    // -----------------------------------------------------------------------
+
+    /** Cached {@code UnicodeScript.values()} so an ordinal-&gt;enum lookup never
+     *  re-allocates the values array. */
+    private static final Character.UnicodeScript[] SCRIPT_VALUES =
+            Character.UnicodeScript.values();
+
+    /**
+     * Memoized {@link Character.UnicodeScript#of(int)} for the BMP.  Scoring a
+     * document classifies every codepoint's script ~5 times (z4/z7/z8/z9 plus the
+     * z1 bigram bucketing), and {@code UnicodeScript.of} is a binary search over
+     * the script-range table (measured ~12 ns/cp, 10-20x {@code Character.getType}).
+     * The result is a pure function of the codepoint for a given JVM, so cache it:
+     * BMP codepoints (&gt;99% of text) become an O(1) array lookup after first
+     * sight, shared across every call site and every document.  Slot 0 means "not
+     * yet computed"; otherwise {@code ordinal + 1}.  The fill is a benign data race
+     * — every writer stores the same deterministic value and {@code short} writes
+     * do not tear.
+     */
+    private static final short[] BMP_SCRIPT_CACHE = new short[0x10000];
+
+    /**
+     * Script of {@code codePoint}, memoized for the BMP — identical result to
+     * {@link Character.UnicodeScript#of(int)} (the same singleton enum constant).
+     */
+    static Character.UnicodeScript scriptOf(int codePoint) {
+        if (codePoint >= 0 && codePoint < 0x10000) {
+            short v = BMP_SCRIPT_CACHE[codePoint];
+            if (v != 0) {
+                return SCRIPT_VALUES[v - 1];
+            }
+            Character.UnicodeScript s = Character.UnicodeScript.of(codePoint);
+            BMP_SCRIPT_CACHE[codePoint] = (short) (s.ordinal() + 1);
+            return s;
+        }
+        return Character.UnicodeScript.of(codePoint);
+    }
+
     // -----------------------------------------------------------------------
     // Strip modes
     // -----------------------------------------------------------------------
@@ -99,7 +139,7 @@ public final class TextQualityFeatures {
                 return type == Character.CONTROL || type == Character.FORMAT;
             }
             case ALL_COMMON: {
-                Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+                Character.UnicodeScript s = scriptOf(cp);
                 return s == Character.UnicodeScript.COMMON
                         || s == Character.UnicodeScript.INHERITED
                         || s == Character.UnicodeScript.UNKNOWN;
@@ -398,7 +438,7 @@ public final class TextQualityFeatures {
                 continue;
             }
             total++;
-            Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+            Character.UnicodeScript s = scriptOf(cp);
             if (s != Character.UnicodeScript.COMMON
                     && s != Character.UnicodeScript.INHERITED
                     && s != Character.UnicodeScript.UNKNOWN) {
@@ -442,7 +482,7 @@ public final class TextQualityFeatures {
         for (int i = 0; i < text.length(); ) {
             int cp = text.codePointAt(i);
             i += Character.charCount(cp);
-            Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+            Character.UnicodeScript s = scriptOf(cp);
             if (s == Character.UnicodeScript.COMMON
                     || s == Character.UnicodeScript.INHERITED
                     || s == Character.UnicodeScript.UNKNOWN) {
@@ -508,7 +548,7 @@ public final class TextQualityFeatures {
         for (int i = 0; i < text.length(); ) {
             int cp = text.codePointAt(i);
             i += Character.charCount(cp);
-            Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+            Character.UnicodeScript s = scriptOf(cp);
             if (s == Character.UnicodeScript.COMMON
                     || s == Character.UnicodeScript.INHERITED
                     || s == Character.UnicodeScript.UNKNOWN) {
@@ -538,7 +578,7 @@ public final class TextQualityFeatures {
         for (int i = 0; i < text.length(); ) {
             int cp = text.codePointAt(i);
             i += Character.charCount(cp);
-            Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+            Character.UnicodeScript s = scriptOf(cp);
             if (s == Character.UnicodeScript.COMMON
                     || s == Character.UnicodeScript.INHERITED
                     || s == Character.UnicodeScript.UNKNOWN) {
@@ -616,7 +656,7 @@ public final class TextQualityFeatures {
     }
 
     private static String scriptClusterOf(int cp) {
-        Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+        Character.UnicodeScript s = scriptOf(cp);
         switch (s) {
             case HAN:
             case HIRAGANA:
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
 
b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
index ead028cbb3..06179e75e6 100644
Binary files 
a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
 and 
b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
 differ
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/pom.xml
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/pom.xml
index b558f836a2..ead93cf083 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/pom.xml
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/pom.xml
@@ -30,10 +30,13 @@
   <name>Apache Tika html parser module</name>
 
   <dependencies>
+    <!-- Test scope: shipped parser stays decoupled from any specific
+         EncodingDetector; tests still exercise real <meta charset> handling. -->
     <dependency>
       <groupId>org.apache.tika</groupId>
       <artifactId>tika-encoding-detector-html</artifactId>
       <version>${project.version}</version>
+      <scope>test</scope>
     </dependency>
 
     <dependency>

(tika) branch main updated: TIKA-4745 -- efficiency improvements (#2878)

Reply via email to