(tika) 01/01: TIKA-4745 -- efficiency improvements

tallison Fri, 05 Jun 2026 15:11:47 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch TIKA-4745-more-junk-charset
in repository https://gitbox.apache.org/repos/asf/tika.git


commit d5ef09b8a8c9914ef898c4ae4c9c770e85005e4c
Author: tallison <[email protected]>
AuthorDate: Fri Jun 5 18:11:24 2026 -0400

    TIKA-4745 -- efficiency improvements
---
 .../tika/detect/EncodingDetectorContext.java       |  11 ++++
 .../org/apache/tika/detect/EncodingProbeCache.java |  65 +++++++++++++++++++++
 .../tika/parser/html/HtmlEncodingDetector.java     |  41 +++++++++++++
 .../apache/tika/ml/chardetect/AdaptiveProbe.java   |  15 ++++-
 .../ml/chardetect/MojibusterEncodingDetector.java  |  21 ++++++-
 .../NaiveBayesBigramEncodingDetector.java          |  14 ++++-
 .../org/apache/tika/ml/chardetect/nb-bigram.bin    | Bin 1008871 -> 696579 bytes
 .../charsoup/CharSoupFeatureExtractor.java         |  14 ++++-
 .../ml/chardetect/tools/TrainNaiveBayesBigram.java |  21 +++++--
 .../apache/tika/ml/junkdetect/BigramTables.java    |  18 +++---
 .../apache/tika/ml/junkdetect/JunkDetector.java    |  30 +---------
 .../ml/junkdetect/JunkFilterEncodingDetector.java  |  19 +++++-
 .../tika/ml/junkdetect/tools/TrainJunkModel.java   |  51 +++++-----------
 .../org/apache/tika/ml/junkdetect/junkdetect.bin   | Bin 2316809 -> 1395108 bytes
 .../ml/junkdetect/JunkDetectorRoundTripTest.java   |  43 +++-----------
 15 files changed, 236 insertions(+), 127 deletions(-)

diff --git 
a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java 
b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java
index 6957601e2c..426f508c14 100644
--- 
a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java
+++ 
b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java
@@ -40,8 +40,19 @@ import java.util.Set;
 public class EncodingDetectorContext {
 
     private final List<Result> results = new ArrayList<>();
+    private final EncodingProbeCache probeCache = new EncodingProbeCache();
     private String arbitrationInfo;
 
+    /**
+     * Per-detection cache of the raw detection probe, shared across the detectors in
+     * this chain so they don't each re-read the same leading bytes. It lives and dies
+     * with this context (which is removed after detection), so it never leaks into
+     * recursive/attachment parsing.
+     */
+    public EncodingProbeCache getProbeCache() {
+        return probeCache;
+    }
+
     /**
      * Record the ranked results from a child detector.
      *
diff --git 
a/tika-core/src/main/java/org/apache/tika/detect/EncodingProbeCache.java 
b/tika-core/src/main/java/org/apache/tika/detect/EncodingProbeCache.java
new file mode 100644
index 0000000000..a4d783fb02
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/detect/EncodingProbeCache.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+/**
+ * Caches the raw encoding-detection probe (the leading bytes read for detection)
+ * so that multiple detectors in a chain do not each re-read and re-tag-strip the
+ * same bytes. For example a statistical detector and a downstream meta detector
+ * that re-reads the bytes for arbitration can share one probe.
+ * <p>
+ * An instance is held by {@link EncodingDetectorContext}, so it inherits that
+ * context's per-detection lifecycle: created fresh per detection and discarded
+ * with the context immediately afterwards. That matters because a
+ * {@link org.apache.tika.parser.ParseContext} flows on into recursive
+ * (attachment/embedded) parsing — a probe must never outlive the single detection
+ * it was read for.
+ * <p>
+ * Not thread-safe: a single detection runs its detectors sequentially on one
+ * thread. The cache holds a single entry keyed by the probe parameters — {@link #get}
+ * returns it only when both {@code contentTarget} and {@code rawCap} match what
+ * it was stored with, so a detector that wants a differently-sized probe
+ * reads its own, and storing that probe replaces the previously cached one.
+ * <p>
+ * The cached array is shared read-only state; callers must not mutate it in place.
+ */
+public class EncodingProbeCache {
+
+    private byte[] probe;
+    private int contentTarget = -1;
+    private int rawCap = -1;
+
+    /**
+     * @return the cached probe if one was stored with the same {@code contentTarget} and
+     * {@code rawCap}; otherwise {@code null}
+     */
+    public byte[] get(int contentTarget, int rawCap) {
+        if (probe != null && this.contentTarget == contentTarget && this.rawCap == rawCap) {
+            return probe;
+        }
+        return null;
+    }
+
+    /**
+     * Stores the probe bytes read with the given parameters, replacing any earlier entry.
+     */
+    public void put(byte[] probe, int contentTarget, int rawCap) {
+        this.probe = probe;
+        this.contentTarget = contentTarget;
+        this.rawCap = rawCap;
+    }
+}
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
 
b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
index c052b062b6..5cce8c87e9 100644
--- 
a/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
+++ 
b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
@@ -162,6 +162,14 @@ public class HtmlEncodingDetector implements 
EncodingDetector {
         }
         tis.reset();
 
+        // findCharset only ever matches a meta tag (HTTP_META_PATTERN = "<\s*meta...").
+        // If the probe has no such tag, the ASCII decode + comment-stripping regex below
+        // find nothing, so skip them. Byte-level, no allocation. Caveat: the raw bytes are
+        // scanned, so a tag formed only by comment removal ("<<!--x-->meta") is missed.
+        if (!containsMetaTag(buffer, n)) {
+            return Collections.emptyList();
+        }
+
         String head = ASCII.decode(ByteBuffer.wrap(buffer, 0, n)).toString();
         String headNoComments = head.replaceAll("<!--.*?(-->|$)", " ");
         Charset charset = findCharset(headNoComments);
@@ -175,6 +183,39 @@ public class HtmlEncodingDetector implements 
EncodingDetector {
                 EncodingResult.ResultType.DECLARATIVE));
     }
 
+    /**
+     * Byte-level scan for an opening meta tag, mirroring the {@code <\s*meta} prefix of
+     * {@link #HTTP_META_PATTERN} (ASCII, case-insensitive). Lets {@link #detect} skip the
+     * full ASCII decode + comment-stripping regex on probes that cannot contain a meta
+     * charset declaration. {@code <}, ASCII whitespace and {@code meta} are all ASCII, so a
+     * raw-byte scan is equivalent to scanning the decoded head.
+     */
+    private static boolean containsMetaTag(byte[] buf, int len) {
+        for (int i = 0; i < len; i++) {
+            if (buf[i] != '<') {
+                continue;
+            }
+            int j = i + 1; // skip the whitespace the regex \s class would accept after '<'
+            while (j < len) {
+                int c = buf[j] & 0xFF;
+                if (c == ' ' || c == '\t' || c == '\n' || c == 0x0B || c == '\f'
+                        || c == '\r') {
+                    j++;
+                } else {
+                    break;
+                }
+            }
+            if (j + 4 <= len
+                    && (((buf[j] & 0xFF) | 0x20) == 'm') // | 0x20 folds ASCII upper to lower case
+                    && (((buf[j + 1] & 0xFF) | 0x20) == 'e')
+                    && (((buf[j + 2] & 0xFF) | 0x20) == 't')
+                    && (((buf[j + 3] & 0xFF) | 0x20) == 'a')) {
+                return true;
+            }
+        }
+        return false;
+    }
+
     //returns null if no charset was found
     private Charset findCharset(String s) {
 
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/AdaptiveProbe.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/AdaptiveProbe.java
index 34a081ec32..e2100b0533 100644
--- 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/AdaptiveProbe.java
+++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/AdaptiveProbe.java
@@ -55,11 +55,20 @@ public final class AdaptiveProbe {
             throws IOException {
         tis.mark(rawCap);
         try {
-            byte[] buf = new byte[rawCap];
-            byte[] stripDst = new byte[rawCap];
+            // Grow on demand rather than allocating (and zeroing) the full rawCap
+            // (e.g. 512 KB) twice up front: the vast majority of probes are far
+            // smaller. Bytes returned are identical to the eager-allocation version.
+            int cap = Math.min(rawCap, contentTarget);
+            byte[] buf = new byte[cap];
+            byte[] stripDst = new byte[cap];
             int total = 0;
             while (total < rawCap) {
                 int want = Math.min(rawCap - total, contentTarget);
+                if (total + want > buf.length) {
+                    int newCap = Math.min(rawCap, Math.max(buf.length * 2, 
total + want));
+                    buf = Arrays.copyOf(buf, newCap);
+                    stripDst = Arrays.copyOf(stripDst, newCap);
+                }
                 int n = IOUtils.read(tis, buf, total, want);
                 total += n;
                 HtmlByteStripper.Result r =
@@ -72,7 +81,7 @@ public final class AdaptiveProbe {
             if (total == 0) {
                 return new byte[0];
             }
-            return total == rawCap ? buf : Arrays.copyOf(buf, total);
+            return total == buf.length ? buf : Arrays.copyOf(buf, total);
         } finally {
             tis.reset();
         }
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
index 45a919274a..0c67cc5bd2 100644
--- 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
+++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
@@ -29,6 +29,8 @@ import org.slf4j.LoggerFactory;
 
 import org.apache.tika.config.TikaComponent;
 import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.detect.EncodingDetectorContext;
+import org.apache.tika.detect.EncodingProbeCache;
 import org.apache.tika.detect.EncodingResult;
 import org.apache.tika.detect.HighByteLetterStats;
 import org.apache.tika.io.TikaInputStream;
@@ -208,7 +210,7 @@ public class MojibusterEncodingDetector implements 
EncodingDetector {
     @Override
     public List<EncodingResult> detect(TikaInputStream tis, Metadata metadata,
                                        ParseContext parseContext) throws 
IOException {
-        byte[] probe = readProbe(tis);
+        byte[] probe = readProbe(tis, parseContext);
         return detect(probe, metadata);
     }
 
@@ -749,7 +751,35 @@ public class MojibusterEncodingDetector implements EncodingDetector {
         return lower.contains("html") || lower.contains("xml");
     }
 
-    private static byte[] readProbe(TikaInputStream tis) throws IOException {
-        return AdaptiveProbe.read(tis, PROBE_CONTENT_TARGET, PROBE_RAW_CAP);
+    /**
+     * Returns the detection probe for {@code tis}, reusing the one cached on the
+     * {@link EncodingDetectorContext} in {@code parseContext} when it was stored with the
+     * same {@code PROBE_CONTENT_TARGET} / {@code PROBE_RAW_CAP}; otherwise reads a fresh
+     * probe and caches it.
+     *
+     * @param tis          stream to probe
+     * @param parseContext may be {@code null} or carry no {@link EncodingDetectorContext};
+     *                     the probe is then read directly and not cached
+     * @return the probe bytes; possibly shared through the cache, so do not modify them
+     * @throws IOException if reading the probe fails
+     */
+    private static byte[] readProbe(TikaInputStream tis, ParseContext parseContext)
+            throws IOException {
+        // parseContext was ignored here before the cache existed, so callers may still pass
+        // null; fall back to an uncached read instead of throwing NullPointerException.
+        EncodingDetectorContext context = parseContext == null
+                ? null : parseContext.get(EncodingDetectorContext.class);
+        EncodingProbeCache cache = context == null ? null : context.getProbeCache();
+        if (cache != null) {
+            byte[] cached = cache.get(PROBE_CONTENT_TARGET, PROBE_RAW_CAP);
+            if (cached != null) {
+                return cached;
+            }
+        }
+        byte[] probe = AdaptiveProbe.read(tis, PROBE_CONTENT_TARGET, PROBE_RAW_CAP);
+        if (cache != null) {
+            cache.put(probe, PROBE_CONTENT_TARGET, PROBE_RAW_CAP);
+        }
+        return probe;
     }
 }
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
index 5becf20ce6..d4b35625bd 100644
--- 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
+++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
@@ -309,9 +309,30 @@ public class NaiveBayesBigramEncodingDetector implements EncodingDetector {
                 for (int bg = 0; bg < BIGRAM_SPACE; bg++) {
                     logP8[bg * numClasses + c] = u;
                 }
-                // Overwrite with trained pairs.
+                // Overwrite with trained pairs. Bigram ids are sorted ascending and
+                // stored as varint deltas (LEB128) from the previous id.
+                int bigram = 0;
                 for (int i = 0; i < vocabSize; i++) {
-                    int bigram = dis.readUnsignedShort();
+                    int delta = 0;
+                    int shift = 0;
+                    int b;
+                    do {
+                        // A 32-bit delta spans at most five 7-bit groups; a longer run means
+                        // the stream is corrupt or still in the old fixed-width layout.
+                        if (shift > 28) {
+                            throw new java.io.IOException(
+                                    "Malformed varint bigram delta for class " + c);
+                        }
+                        b = dis.readUnsignedByte();
+                        delta |= (b & 0x7F) << shift;
+                        shift += 7;
+                    } while ((b & 0x80) != 0);
+                    bigram += delta;
+                    // Fail with a clear error instead of indexing outside logP8.
+                    if (delta < 0 || bigram < 0 || bigram >= BIGRAM_SPACE) {
+                        throw new java.io.IOException(
+                                "Bigram id out of range for class " + c + ": " + bigram);
+                    }
                     byte q = dis.readByte();
                     logP8[bigram * numClasses + c] = q;
                 }
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin
index b89188bb32..0cebc858bb 100644
Binary files 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin
 and 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin
 differ
diff --git 
a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java
 
b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java
index 7bbfc032e5..6b7e39bc68 100644
--- 
a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java
+++ 
b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java
@@ -244,9 +244,17 @@ public class CharSoupFeatureExtractor {
      * @return cleaned, NFC-normalized text
      */
     public static String preprocessNoTruncate(String rawText) {
-        // Strip URLs and emails
-        String text = URL_REGEX.matcher(rawText).replaceAll(" ");
-        text = MAIL_REGEX.matcher(text).replaceAll(" ");
+        // Strip URLs and emails. Both regexes scan the entire input on every call;
+        // skip each unless its required marker is present ("://" for URL_REGEX, "@"
+        // for MAIL_REGEX). This is a no-op for the common (markerless) case — the
+        // output is identical — but avoids a full-buffer regex scan + Matcher alloc.
+        String text = rawText;
+        if (text.indexOf("://") >= 0) {
+            text = URL_REGEX.matcher(text).replaceAll(" ");
+        }
+        if (text.indexOf('@') >= 0) {
+            text = MAIL_REGEX.matcher(text).replaceAll(" ");
+        }
 
         // NFC normalize
         if (!Normalizer.isNormalized(text, Normalizer.Form.NFC)) {
diff --git 
a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java
 
b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java
index a082f4c5be..6eaee95c08 100644
--- 
a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java
+++ 
b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java
@@ -427,8 +427,8 @@ public class TrainNaiveBayesBigram {
      *     float32 scale                      (per-class dequant)
      *     byte    unseenQ                    (int8 quantized unseen floor)
      *     int32 vocabSize                    (number of trained pairs)
-     *     for each kept bigram:
-     *       uint16 bigramKey
+     *     bigram keys sorted ascending, each pair stored as:
+     *       varint deltaFromPrevKey         (LEB128; first delta is the key itself)
      *       byte   logP8                     (int8 quantized)
      *
      * <p>Sparse representation: only trained bigram pairs are stored;
@@ -497,11 +497,22 @@ public class TrainNaiveBayesBigram {
                 dos.writeByte(unseenQ[c]);
                 float scale = perClassScale[c];
                 dos.writeInt(logProbsPerClass[c].size());
-                for (Map.Entry<Integer, Float> e : 
logProbsPerClass[c].entrySet()) {
-                    int q = Math.round(e.getValue() / scale);
+                // Bigram ids sorted ascending, stored as varint (LEB128) deltas from
+                // the previous id — most deltas fit in a single byte.
+                int[] keys = logProbsPerClass[c].keySet().stream()
+                        .mapToInt(Integer::intValue).sorted().toArray();
+                int prev = 0;
+                for (int key : keys) {
+                    int q = Math.round(logProbsPerClass[c].get(key) / scale);
                     if (q > 127) q = 127;
                     if (q < -127) q = -127;
-                    dos.writeShort(e.getKey());
+                    int delta = key - prev;
+                    prev = key;
+                    while ((delta & ~0x7F) != 0) {
+                        dos.writeByte((delta & 0x7F) | 0x80);
+                        delta >>>= 7;
+                    }
+                    dos.writeByte(delta);
                     dos.writeByte(q);
                 }
             }
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/BigramTables.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/BigramTables.java
index 5c7e738290..f210860cee 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/BigramTables.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/BigramTables.java
@@ -36,12 +36,11 @@ import java.nio.ByteOrder;
  *       script.  Codepoint → dense index is a binary search; index →
  *       codepoint is direct array access.  Typical sizes: ~7K-15K for HAN,
  *       ~200-500 for most other scripts.
- *   <li>{@code bigramKeys} / {@code bigramValues} — parallel arrays
- *       implementing an open-addressed hash table with linear probing.
- *       Each key is a 32-bit value {@code (idxA << 16) | idxB}; key {@code
- *       -1} means "empty slot."  Indices are bounded at 16 bits (65535),
- *       which is comfortably above the largest per-script codepoint count
- *       we observe.
+ *   <li>{@code bigramKeys} / {@code bigramValues} — parallel arrays of the
+ *       occupied entries only, sorted ascending by key for binary-search
+ *       lookup.  Each key is a 32-bit value {@code (idxA << 16) | idxB}.
+ *       Indices are bounded at 16 bits (65535), comfortably above the
+ *       largest per-script codepoint count we observe.
  *   <li>{@code unigramTable} — {@code byte[numCodepoints]}, quantized
  *       unigram log-probabilities indexed by the same codepoint→index map.
  *   <li>{@code bigramQuantMin/Max}, {@code unigramQuantMin/Max} —
@@ -56,10 +55,9 @@ import java.nio.ByteOrder;
  *       independence sum.
  * </ul>
  *
- * <p>Membership semantics: no Bloom filter.  The empty-slot sentinel is
- * the membership oracle — a pair is "seen" iff binary-search finds both
- * codepoints in the index AND a probe sequence hits a matching key before
- * an empty slot.  Lookups are therefore exact.
+ * <p>Membership semantics: no Bloom filter.  A pair is "seen" iff
+ * binary-search finds both codepoints in the index AND finds the packed
+ * key in {@code bigramKeys}.  Lookups are therefore exact.
  *
  * <p>Fields are package-private so the
  * {@link org.apache.tika.ml.junkdetect.tools.TrainJunkModel} trainer can
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
index 2f117479d0..a4ec277bea 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
@@ -970,22 +970,6 @@ public final class JunkDetector implements 
TextQualityDetector {
         return java.util.Arrays.binarySearch(tables.codepointIndex, cp);
     }
 
-    /**
-     * Mixing function used to scatter packed (idxA, idxB) keys across
-     * the open-addressing table.  A simple integer finalizer (splitmix32
-     * style) gives good distribution for sequential index values.
-     *
-     * <p>Public so the trainer's open-addressing insertion routine uses
-     * the same probe order as inference — drift here would silently
-     * corrupt every lookup.
-     */
-    public static int mixIndexKey(int packedKey) {
-        int x = packedKey;
-        x = (x ^ (x >>> 16)) * 0x7feb352d;
-        x = (x ^ (x >>> 15)) * 0x846ca68b;
-        x = x ^ (x >>> 16);
-        return x;
-    }
 
     /**
      * Packed bigram key for indices {@code (a, b)} where each index fits in
@@ -1085,20 +1069,12 @@ public final class JunkDetector implements 
TextQualityDetector {
      * for {@code (idxA, idxB)}, or {@code -1} if not present (probe hit an
      * empty slot first).
      *
-     * <p>Linear probing with the same mix-hash used at training time —
-     * required for the table to be readable, not just writable.
+     * <p>{@code bigramKeys} is sorted ascending (signed) with no empty slots: a binary search.
      */
     static int lookupBigramSlot(BigramTables tables, int idxA, int idxB) {
         int packedKey = packBigramKey(idxA, idxB);
-        int[] keys = tables.bigramKeys;
-        int mask = keys.length - 1;
-        int h = mixIndexKey(packedKey) & mask;
-        while (true) {
-            int k = keys[h];
-            if (k == BigramTables.EMPTY_KEY) return -1;
-            if (k == packedKey) return h;
-            h = (h + 1) & mask;
-        }
+        int slot = java.util.Arrays.binarySearch(tables.bigramKeys, packedKey);
+        return slot >= 0 ? slot : -1;
     }
 
     private static double unigramLogProb(BigramTables tables, int idx) {
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
index b8cb75de01..7571f97f20 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
@@ -35,6 +35,7 @@ import org.slf4j.LoggerFactory;
 import org.apache.tika.config.TikaComponent;
 import org.apache.tika.detect.CharsetSupersets;
 import org.apache.tika.detect.EncodingDetectorContext;
+import org.apache.tika.detect.EncodingProbeCache;
 import org.apache.tika.detect.EncodingResult;
 import org.apache.tika.detect.HighByteLetterStats;
 import org.apache.tika.detect.MetaEncodingDetector;
@@ -156,7 +157,7 @@ public class JunkFilterEncodingDetector implements 
MetaEncodingDetector {
             return Collections.emptyList();
         }
 
-        byte[] bytes = readProbe(tis);
+        byte[] bytes = readProbe(tis, context);
         if (bytes == null || bytes.length == 0) {
             context.setArbitrationInfo("junk-filter-empty-stream");
             return Collections.emptyList();
@@ -584,9 +585,21 @@ public class JunkFilterEncodingDetector implements 
MetaEncodingDetector {
         return true;
     }
 
-    private byte[] readProbe(TikaInputStream tis) throws IOException {
+    private byte[] readProbe(TikaInputStream tis, EncodingDetectorContext 
context)
+            throws IOException {
         // readLimit is the tag-stripped content target; cap raw reads at 512 KB.
-        byte[] probe = AdaptiveProbe.read(tis, readLimit, 
AdaptiveProbe.DEFAULT_RAW_CAP);
+        int rawCap = AdaptiveProbe.DEFAULT_RAW_CAP;
+        EncodingProbeCache cache = context == null ? null : 
context.getProbeCache();
+        if (cache != null) {
+            byte[] cached = cache.get(readLimit, rawCap);
+            if (cached != null) {
+                return cached.length == 0 ? null : cached;
+            }
+        }
+        byte[] probe = AdaptiveProbe.read(tis, readLimit, rawCap);
+        if (cache != null) {
+            cache.put(probe, readLimit, rawCap);
+        }
         return probe.length == 0 ? null : probe;
     }
 
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
index 63bce5317f..4f5dfc3587 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
@@ -1037,12 +1037,8 @@ public class TrainJunkModel {
         // Quantize unigram log-probs.
         QuantizedFloats qUnigram = quantizeFloats(unigramLogP);
 
-        // --- Build the open-addressing bigram table. ---
-        int slots = nextPowerOfTwo((int) Math.max(2, Math.ceil(keptPairs / 
loadFactor)));
-        int[] keys = new int[slots];
-        java.util.Arrays.fill(keys, BigramTables.EMPTY_KEY);
-        // Compute log-probs first, quantize once, then write into the table
-        // alongside its key.
+        // --- Build the sorted-occupied bigram table (binary-search lookup). ---
+        // Compute log-probs first, quantize once, then sort by key.
         float[] keptLogP = new float[keptPairs];
         int[] keptKeys = new int[keptPairs];
         int writeIdx = 0;
@@ -1067,16 +1063,25 @@ public class TrainJunkModel {
         }
         // Quantize all kept log-probs together so they share min/max.
         QuantizedFloats qBigram = quantizeFloats(keptLogP);
-        byte[] values = new byte[slots];
+        // Sort (key, value) ascending by signed key so the loader can binary-search.
+        // Pack into a long (key in high 32 bits, value byte in low 8) for one sort.
+        long[] sortable = new long[keptPairs];
         for (int i = 0; i < keptPairs; i++) {
-            insertOA(keys, values, keptKeys[i], qBigram.bytes[i]);
+            sortable[i] = (((long) keptKeys[i]) << 32) | (qBigram.bytes[i] & 
0xFFL);
+        }
+        java.util.Arrays.sort(sortable);
+        int[] keys = new int[keptPairs];
+        byte[] values = new byte[keptPairs];
+        for (int i = 0; i < keptPairs; i++) {
+            keys[i] = (int) (sortable[i] >> 32);
+            values[i] = (byte) (sortable[i] & 0xFF);
         }
 
         System.out.printf(
                 "    pair_counts: distinct=%,d, kept=%,d (>=%d), dropped=%,d  "
-                + "cp_index=%,d  slots=%,d (load=%.2f)%n",
+                + "cp_index=%,d  bigram_entries=%,d%n",
                 totalDistinct, keptPairs, minBigramCount, dropped,
-                cpIndex.length, slots, keptPairs / (double) slots);
+                cpIndex.length, keptPairs);
 
         return new BigramTables(cpIndex, keys, values, qUnigram.bytes,
                 qBigram.min, qBigram.max,
@@ -1084,32 +1089,6 @@ public class TrainJunkModel {
                 unigramFallbackLogP, BACKOFF_ALPHA);
     }
 
-    /**
-     * Inserts a {@code (packedKey, value)} pair into the open-addressing
-     * table.  The caller is responsible for sizing the table large enough
-     * to avoid an infinite probe (any load &lt; 1.0 is safe).
-     */
-    private static void insertOA(int[] keys, byte[] values, int packedKey, 
byte value) {
-        int mask = keys.length - 1;
-        int h = JunkDetector.mixIndexKey(packedKey) & mask;
-        while (keys[h] != BigramTables.EMPTY_KEY) {
-            if (keys[h] == packedKey) {
-                // Same key twice — shouldn't happen with our dedup, but be
-                // defensive and overwrite rather than corrupt.
-                values[h] = value;
-                return;
-            }
-            h = (h + 1) & mask;
-        }
-        keys[h] = packedKey;
-        values[h] = value;
-    }
-
-    private static int nextPowerOfTwo(int n) {
-        if (n < 1) return 1;
-        int p = Integer.highestOneBit(n - 1) << 1;
-        return Math.max(1, p);
-    }
 
     // -----------------------------------------------------------------------
     // Global contrastive combiner training
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
 
b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
index ead028cbb3..42c9216a5c 100644
Binary files 
a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
 and 
b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
 differ
diff --git 
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
 
b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
index 07efc64dd8..7433820efe 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
@@ -111,14 +111,12 @@ public class JunkDetectorRoundTripTest {
         // Same shape as the first test but with BOTH (A,B) and (B,A) in the
         // bigram table.  mean log-prob = -1.0, z1 = +4.0, logit = +4.0.
         int[] cpIndex = new int[]{'A', 'B'};
-        int[] keys = new int[4];
-        Arrays.fill(keys, BigramTables.EMPTY_KEY);
-        byte[] values = new byte[4];
         float bMin = -10.0f;
         float bMax = -1.0f;
         byte b = quantizeOne(-1.0f, bMin, bMax);
-        insertOA(keys, values, JunkDetector.packBigramKey(0, 1), b);
-        insertOA(keys, values, JunkDetector.packBigramKey(1, 0), b);
+        // sorted-occupied: packBigramKey(0,1)=1 < packBigramKey(1,0)=65536
+        int[] keys = {JunkDetector.packBigramKey(0, 1), 
JunkDetector.packBigramKey(1, 0)};
+        byte[] values = {b, b};
 
         float uMin = -5.0f;
         float uMax = -2.0f;
@@ -266,17 +264,14 @@ public class JunkDetectorRoundTripTest {
     private static BigramTables buildLatinTablesAB() {
         int[] cpIndex = new int[]{'A', 'B'};
 
-        // 4 slots ≈ 25% load for 1 pair.  Open-addressing with linear probe.
-        int[] keys = new int[4];
-        Arrays.fill(keys, BigramTables.EMPTY_KEY);
-        byte[] values = new byte[4];
-
         // Manual quantization with a chosen range so we don't hit the
         // degenerate single-element case.  range=[-10, -1] → -1.0 → byte 255.
         float bMin = -10.0f;
         float bMax = -1.0f;
         byte b = quantizeOne(-1.0f, bMin, bMax);
-        insertOA(keys, values, JunkDetector.packBigramKey(0, 1), b);
+        // sorted-occupied table with a single trained pair.
+        int[] keys = {JunkDetector.packBigramKey(0, 1)};
+        byte[] values = {b};
 
         float uMin = -5.0f;
         float uMax = -2.0f;
@@ -358,13 +353,10 @@ public class JunkDetectorRoundTripTest {
      *  (uppercase 'A'/'B' are absent from the index, so they must fold). */
     private static BigramTables buildLatinTablesLowerAB() {
         int[] cpIndex = new int[]{'a', 'b'};
-        int[] keys = new int[4];
-        Arrays.fill(keys, BigramTables.EMPTY_KEY);
-        byte[] values = new byte[4];
         float bMin = -10.0f;
         float bMax = -1.0f;
-        insertOA(keys, values, JunkDetector.packBigramKey(0, 1),
-                quantizeOne(-1.0f, bMin, bMax));
+        int[] keys = {JunkDetector.packBigramKey(0, 1)};
+        byte[] values = {quantizeOne(-1.0f, bMin, bMax)};
         float uMin = -5.0f;
         float uMax = -2.0f;
         byte[] unigramBytes = new byte[]{
@@ -403,25 +395,6 @@ public class JunkDetectorRoundTripTest {
         return (byte) q;
     }
 
-    /**
-     * Replica of {@code TrainJunkModel.insertOA} (package-private) for the
-     * test's hand-constructed tables.  Uses the same mix-hash as the
-     * production code path.
-     */
-    private static void insertOA(int[] keys, byte[] values, int packedKey, 
byte value) {
-        int mask = keys.length - 1;
-        int h = JunkDetector.mixIndexKey(packedKey) & mask;
-        while (keys[h] != BigramTables.EMPTY_KEY) {
-            if (keys[h] == packedKey) {
-                values[h] = value;
-                return;
-            }
-            h = (h + 1) & mask;
-        }
-        keys[h] = packedKey;
-        values[h] = value;
-    }
-
     /**
      * Saves a minimal model containing only LATIN, with the block / control /
      * script-transition features zeroed out and pure-z1 combiner weights

(tika) 01/01: TIKA-4745 -- efficiency improvements

Reply via email to