This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4745-more-junk-charset in repository https://gitbox.apache.org/repos/asf/tika.git
commit d5ef09b8a8c9914ef898c4ae4c9c770e85005e4c Author: tallison <[email protected]> AuthorDate: Fri Jun 5 18:11:24 2026 -0400 TIKA-4745 -- efficiency improvements --- .../tika/detect/EncodingDetectorContext.java | 11 ++++ .../org/apache/tika/detect/EncodingProbeCache.java | 65 +++++++++++++++++++++ .../tika/parser/html/HtmlEncodingDetector.java | 41 +++++++++++++ .../apache/tika/ml/chardetect/AdaptiveProbe.java | 15 ++++- .../ml/chardetect/MojibusterEncodingDetector.java | 21 ++++++- .../NaiveBayesBigramEncodingDetector.java | 14 ++++- .../org/apache/tika/ml/chardetect/nb-bigram.bin | Bin 1008871 -> 696579 bytes .../charsoup/CharSoupFeatureExtractor.java | 14 ++++- .../ml/chardetect/tools/TrainNaiveBayesBigram.java | 21 +++++-- .../apache/tika/ml/junkdetect/BigramTables.java | 18 +++--- .../apache/tika/ml/junkdetect/JunkDetector.java | 30 +--------- .../ml/junkdetect/JunkFilterEncodingDetector.java | 19 +++++- .../tika/ml/junkdetect/tools/TrainJunkModel.java | 51 +++++----------- .../org/apache/tika/ml/junkdetect/junkdetect.bin | Bin 2316809 -> 1395108 bytes .../ml/junkdetect/JunkDetectorRoundTripTest.java | 43 +++----------- 15 files changed, 236 insertions(+), 127 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java index 6957601e2c..426f508c14 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java +++ b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java @@ -40,8 +40,19 @@ import java.util.Set; public class EncodingDetectorContext { private final List<Result> results = new ArrayList<>(); + private final EncodingProbeCache probeCache = new EncodingProbeCache(); private String arbitrationInfo; + /** + * Per-detection cache of the raw detection probe, shared across the detectors in + * this chain so they don't each re-read the same leading bytes. It lives and dies + * with this context (which is removed after detection), so it never leaks into + * recursive/attachment parsing. + */ + public EncodingProbeCache getProbeCache() { + return probeCache; + } + /** * Record the ranked results from a child detector. * diff --git a/tika-core/src/main/java/org/apache/tika/detect/EncodingProbeCache.java b/tika-core/src/main/java/org/apache/tika/detect/EncodingProbeCache.java new file mode 100644 index 0000000000..a4d783fb02 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/detect/EncodingProbeCache.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.detect; + +/** + * Caches the raw encoding-detection probe (the leading bytes read for detection) + * so that multiple detectors in a chain do not each re-read and re-tag-strip the + * same bytes. For example a statistical detector and a downstream meta detector + * that re-reads the bytes for arbitration can share one probe. + * <p> + * An instance is held by {@link EncodingDetectorContext}, so it inherits that + * context's per-detection lifecycle: created fresh per detection and discarded + * with the context immediately afterwards. That matters because a + * {@link org.apache.tika.parser.ParseContext} flows on into recursive + * (attachment/embedded) parsing — a probe must never outlive the single detection + * it was read for. + * <p> + * Not thread-safe: a single detection runs its detectors sequentially on one + * thread. The cache is keyed by the probe parameters — {@link #get} returns the + * cached probe only when both {@code contentTarget} and {@code rawCap} match what + * it was stored with, so a detector that wants a differently-sized probe + * transparently reads (and caches) its own. + * <p> + * The cached array is shared read-only state; callers must not mutate it in place. + */ +public class EncodingProbeCache { + + private byte[] probe; + private int contentTarget = -1; + private int rawCap = -1; + + /** + * @return the cached probe if one was stored with the same {@code contentTarget} and + * {@code rawCap}; otherwise {@code null} + */ + public byte[] get(int contentTarget, int rawCap) { + if (probe != null && this.contentTarget == contentTarget && this.rawCap == rawCap) { + return probe; + } + return null; + } + + /** + * Stores the probe bytes read with the given parameters. + */ + public void put(byte[] probe, int contentTarget, int rawCap) { + this.probe = probe; + this.contentTarget = contentTarget; + this.rawCap = rawCap; + } +} diff --git a/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java index c052b062b6..5cce8c87e9 100644 --- a/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java @@ -162,6 +162,14 @@ public class HtmlEncodingDetector implements EncodingDetector { } tis.reset(); + // findCharset only ever matches a meta tag (HTTP_META_PATTERN = "<\s*meta..."). + // If the probe has no such tag, the full ASCII decode + comment-stripping + // regex below can only produce null — skip them. Byte-level, no allocation; + // a strict necessary condition for any non-empty result. + if (!containsMetaTag(buffer, n)) { + return Collections.emptyList(); + } + String head = ASCII.decode(ByteBuffer.wrap(buffer, 0, n)).toString(); String headNoComments = head.replaceAll("<!--.*?(-->|$)", " "); Charset charset = findCharset(headNoComments); @@ -175,6 +183,39 @@ public class HtmlEncodingDetector implements EncodingDetector { EncodingResult.ResultType.DECLARATIVE)); } + /** + * Byte-level scan for an opening meta tag, mirroring the {@code <\s*meta} prefix of + * {@link #HTTP_META_PATTERN} (ASCII, case-insensitive). Lets {@link #detect} skip the + * full ASCII decode + comment-stripping regex on probes that cannot contain a meta + * charset declaration. {@code <}, ASCII whitespace and {@code meta} are all ASCII, so a + * raw-byte scan is equivalent to scanning the decoded head. + */ + private static boolean containsMetaTag(byte[] buf, int len) { + for (int i = 0; i < len; i++) { + if (buf[i] != '<') { + continue; + } + int j = i + 1; + while (j < len) { + int c = buf[j] & 0xFF; + if (c == ' ' || c == '\t' || c == '\n' || c == 0x0B || c == '\f' + || c == '\r') { + j++; + } else { + break; + } + } + if (j + 4 <= len + && (((buf[j] & 0xFF) | 0x20) == 'm') + && (((buf[j + 1] & 0xFF) | 0x20) == 'e') + && (((buf[j + 2] & 0xFF) | 0x20) == 't') + && (((buf[j + 3] & 0xFF) | 0x20) == 'a')) { + return true; + } + } + return false; + } + //returns null if no charset was found private Charset findCharset(String s) { diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/AdaptiveProbe.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/AdaptiveProbe.java index 34a081ec32..e2100b0533 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/AdaptiveProbe.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/AdaptiveProbe.java @@ -55,11 +55,20 @@ public final class AdaptiveProbe { throws IOException { tis.mark(rawCap); try { - byte[] buf = new byte[rawCap]; - byte[] stripDst = new byte[rawCap]; + // Grow on demand rather than allocating (and zeroing) the full rawCap + // (e.g. 512 KB) twice up front: the vast majority of probes are far + // smaller. Bytes returned are identical to the eager-allocation version. + int cap = Math.min(rawCap, contentTarget); + byte[] buf = new byte[cap]; + byte[] stripDst = new byte[cap]; int total = 0; while (total < rawCap) { int want = Math.min(rawCap - total, contentTarget); + if (total + want > buf.length) { + int newCap = Math.min(rawCap, Math.max(buf.length * 2, total + want)); + buf = Arrays.copyOf(buf, newCap); + stripDst = Arrays.copyOf(stripDst, newCap); + } int n = IOUtils.read(tis, buf, total, want); total += n; HtmlByteStripper.Result r = @@ -72,7 +81,7 @@ public final class AdaptiveProbe { if (total == 0) { return new byte[0]; } - return total == rawCap ? buf : Arrays.copyOf(buf, total); + return total == buf.length ? buf : Arrays.copyOf(buf, total); } finally { tis.reset(); } diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java index 45a919274a..0c67cc5bd2 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java @@ -29,6 +29,8 @@ import org.slf4j.LoggerFactory; import org.apache.tika.config.TikaComponent; import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.detect.EncodingDetectorContext; +import org.apache.tika.detect.EncodingProbeCache; import org.apache.tika.detect.EncodingResult; import org.apache.tika.detect.HighByteLetterStats; import org.apache.tika.io.TikaInputStream; @@ -208,7 +210,7 @@ public class MojibusterEncodingDetector implements EncodingDetector { @Override public List<EncodingResult> detect(TikaInputStream tis, Metadata metadata, ParseContext parseContext) throws IOException { - byte[] probe = readProbe(tis); + byte[] probe = readProbe(tis, parseContext); return detect(probe, metadata); } @@ -749,7 +751,20 @@ public class MojibusterEncodingDetector implements EncodingDetector { return lower.contains("html") || lower.contains("xml"); } - private static byte[] readProbe(TikaInputStream tis) throws IOException { - return AdaptiveProbe.read(tis, PROBE_CONTENT_TARGET, PROBE_RAW_CAP); + private static byte[] readProbe(TikaInputStream tis, ParseContext parseContext) + throws IOException { + EncodingDetectorContext context = parseContext.get(EncodingDetectorContext.class); + EncodingProbeCache cache = context == null ? null : context.getProbeCache(); + if (cache != null) { + byte[] cached = cache.get(PROBE_CONTENT_TARGET, PROBE_RAW_CAP); + if (cached != null) { + return cached; + } + } + byte[] probe = AdaptiveProbe.read(tis, PROBE_CONTENT_TARGET, PROBE_RAW_CAP); + if (cache != null) { + cache.put(probe, PROBE_CONTENT_TARGET, PROBE_RAW_CAP); + } + return probe; } } diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java index 5becf20ce6..d4b35625bd 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java @@ -309,9 +309,19 @@ public class NaiveBayesBigramEncodingDetector implements EncodingDetector { for (int bg = 0; bg < BIGRAM_SPACE; bg++) { logP8[bg * numClasses + c] = u; } - // Overwrite with trained pairs. + // Overwrite with trained pairs. Bigram ids are sorted ascending and + // stored as varint deltas (LEB128) from the previous id. + int bigram = 0; for (int i = 0; i < vocabSize; i++) { - int bigram = dis.readUnsignedShort(); + int delta = 0; + int shift = 0; + int b; + do { + b = dis.readUnsignedByte(); + delta |= (b & 0x7F) << shift; + shift += 7; + } while ((b & 0x80) != 0); + bigram += delta; byte q = dis.readByte(); logP8[bigram * numClasses + c] = q; } diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin index b89188bb32..0cebc858bb 100644 Binary files a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin and b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin differ diff --git a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java index 7bbfc032e5..6b7e39bc68 100644 --- a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java +++ b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java @@ -244,9 +244,17 @@ public class CharSoupFeatureExtractor { * @return cleaned, NFC-normalized text */ public static String preprocessNoTruncate(String rawText) { - // Strip URLs and emails - String text = URL_REGEX.matcher(rawText).replaceAll(" "); - text = MAIL_REGEX.matcher(text).replaceAll(" "); + // Strip URLs and emails. Both regexes scan the entire input on every call; + // skip each unless its required marker is present ("://" for URL_REGEX, "@" + // for MAIL_REGEX). This is a no-op for the common (markerless) case — the + // output is identical — but avoids a full-buffer regex scan + Matcher alloc. + String text = rawText; + if (text.indexOf("://") >= 0) { + text = URL_REGEX.matcher(text).replaceAll(" "); + } + if (text.indexOf('@') >= 0) { + text = MAIL_REGEX.matcher(text).replaceAll(" "); + } // NFC normalize if (!Normalizer.isNormalized(text, Normalizer.Form.NFC)) { diff --git a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java index a082f4c5be..6eaee95c08 100644 --- a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java +++ b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java @@ -427,8 +427,8 @@ public class TrainNaiveBayesBigram { * float32 scale (per-class dequant) * byte unseenQ (int8 quantized unseen floor) * int32 vocabSize (number of trained pairs) - * for each kept bigram: - * uint16 bigramKey + * bigram keys sorted ascending, each pair stored as: + * varint deltaFromPrevKey (LEB128; first delta is the key itself) * byte logP8 (int8 quantized) * * <p>Sparse representation: only trained bigram pairs are stored; @@ -497,11 +497,22 @@ public class TrainNaiveBayesBigram { dos.writeByte(unseenQ[c]); float scale = perClassScale[c]; dos.writeInt(logProbsPerClass[c].size()); - for (Map.Entry<Integer, Float> e : logProbsPerClass[c].entrySet()) { - int q = Math.round(e.getValue() / scale); + // Bigram ids sorted ascending, stored as varint (LEB128) deltas from + // the previous id — most deltas fit in a single byte. + int[] keys = logProbsPerClass[c].keySet().stream() + .mapToInt(Integer::intValue).sorted().toArray(); + int prev = 0; + for (int key : keys) { + int q = Math.round(logProbsPerClass[c].get(key) / scale); if (q > 127) q = 127; if (q < -127) q = -127; - dos.writeShort(e.getKey()); + int delta = key - prev; + prev = key; + while ((delta & ~0x7F) != 0) { + dos.writeByte((delta & 0x7F) | 0x80); + delta >>>= 7; + } + dos.writeByte(delta); dos.writeByte(q); } } diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/BigramTables.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/BigramTables.java index 5c7e738290..f210860cee 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/BigramTables.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/BigramTables.java @@ -36,12 +36,11 @@ import java.nio.ByteOrder; * script. Codepoint → dense index is a binary search; index → * codepoint is direct array access. Typical sizes: ~7K-15K for HAN, * ~200-500 for most other scripts. - * <li>{@code bigramKeys} / {@code bigramValues} — parallel arrays - * implementing an open-addressed hash table with linear probing. - * Each key is a 32-bit value {@code (idxA << 16) | idxB}; key {@code - * -1} means "empty slot." Indices are bounded at 16 bits (65535), - * which is comfortably above the largest per-script codepoint count - * we observe. + * <li>{@code bigramKeys} / {@code bigramValues} — parallel arrays of the + * occupied entries only, sorted ascending by key for binary-search + * lookup. Each key is a 32-bit value {@code (idxA << 16) | idxB}. + * Indices are bounded at 16 bits (65535), comfortably above the + * largest per-script codepoint count we observe. * <li>{@code unigramTable} — {@code byte[numCodepoints]}, quantized * unigram log-probabilities indexed by the same codepoint→index map. * <li>{@code bigramQuantMin/Max}, {@code unigramQuantMin/Max} — @@ -56,10 +55,9 @@ import java.nio.ByteOrder; * independence sum. * </ul> * - * <p>Membership semantics: no Bloom filter. The empty-slot sentinel is - * the membership oracle — a pair is "seen" iff binary-search finds both - * codepoints in the index AND a probe sequence hits a matching key before - * an empty slot. Lookups are therefore exact. + * <p>Membership semantics: no Bloom filter. A pair is "seen" iff + * binary-search finds both codepoints in the index AND finds the packed + * key in {@code bigramKeys}. Lookups are therefore exact. * * <p>Fields are package-private so the * {@link org.apache.tika.ml.junkdetect.tools.TrainJunkModel} trainer can diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java index 2f117479d0..a4ec277bea 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java @@ -970,22 +970,6 @@ public final class JunkDetector implements TextQualityDetector { return java.util.Arrays.binarySearch(tables.codepointIndex, cp); } - /** - * Mixing function used to scatter packed (idxA, idxB) keys across - * the open-addressing table. A simple integer finalizer (splitmix32 - * style) gives good distribution for sequential index values. - * - * <p>Public so the trainer's open-addressing insertion routine uses - * the same probe order as inference — drift here would silently - * corrupt every lookup. - */ - public static int mixIndexKey(int packedKey) { - int x = packedKey; - x = (x ^ (x >>> 16)) * 0x7feb352d; - x = (x ^ (x >>> 15)) * 0x846ca68b; - x = x ^ (x >>> 16); - return x; - } /** * Packed bigram key for indices {@code (a, b)} where each index fits in @@ -1085,20 +1069,12 @@ public final class JunkDetector implements TextQualityDetector { * for {@code (idxA, idxB)}, or {@code -1} if not present (probe hit an * empty slot first). * - * <p>Linear probing with the same mix-hash used at training time — - * required for the table to be readable, not just writable. + * <p>{@code bigramKeys} is sorted ascending (signed), so this is a binary search. */ static int lookupBigramSlot(BigramTables tables, int idxA, int idxB) { int packedKey = packBigramKey(idxA, idxB); - int[] keys = tables.bigramKeys; - int mask = keys.length - 1; - int h = mixIndexKey(packedKey) & mask; - while (true) { - int k = keys[h]; - if (k == BigramTables.EMPTY_KEY) return -1; - if (k == packedKey) return h; - h = (h + 1) & mask; - } + int slot = java.util.Arrays.binarySearch(tables.bigramKeys, packedKey); + return slot >= 0 ? slot : -1; } private static double unigramLogProb(BigramTables tables, int idx) { diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java index b8cb75de01..7571f97f20 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java @@ -35,6 +35,7 @@ import org.slf4j.LoggerFactory; import org.apache.tika.config.TikaComponent; import org.apache.tika.detect.CharsetSupersets; import org.apache.tika.detect.EncodingDetectorContext; +import org.apache.tika.detect.EncodingProbeCache; import org.apache.tika.detect.EncodingResult; import org.apache.tika.detect.HighByteLetterStats; import org.apache.tika.detect.MetaEncodingDetector; @@ -156,7 +157,7 @@ public class JunkFilterEncodingDetector implements MetaEncodingDetector { return Collections.emptyList(); } - byte[] bytes = readProbe(tis); + byte[] bytes = readProbe(tis, context); if (bytes == null || bytes.length == 0) { context.setArbitrationInfo("junk-filter-empty-stream"); return Collections.emptyList(); @@ -584,9 +585,21 @@ public class JunkFilterEncodingDetector implements MetaEncodingDetector { return true; } - private byte[] readProbe(TikaInputStream tis) throws IOException { + private byte[] readProbe(TikaInputStream tis, EncodingDetectorContext context) + throws IOException { // readLimit is the tag-stripped content target; cap raw reads at 512 KB. - byte[] probe = AdaptiveProbe.read(tis, readLimit, AdaptiveProbe.DEFAULT_RAW_CAP); + int rawCap = AdaptiveProbe.DEFAULT_RAW_CAP; + EncodingProbeCache cache = context == null ? null : context.getProbeCache(); + if (cache != null) { + byte[] cached = cache.get(readLimit, rawCap); + if (cached != null) { + return cached.length == 0 ? null : cached; + } + } + byte[] probe = AdaptiveProbe.read(tis, readLimit, rawCap); + if (cache != null) { + cache.put(probe, readLimit, rawCap); + } return probe.length == 0 ? null : probe; } diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java index 63bce5317f..4f5dfc3587 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java @@ -1037,12 +1037,8 @@ public class TrainJunkModel { // Quantize unigram log-probs. QuantizedFloats qUnigram = quantizeFloats(unigramLogP); - // --- Build the open-addressing bigram table. --- - int slots = nextPowerOfTwo((int) Math.max(2, Math.ceil(keptPairs / loadFactor))); - int[] keys = new int[slots]; - java.util.Arrays.fill(keys, BigramTables.EMPTY_KEY); - // Compute log-probs first, quantize once, then write into the table - // alongside its key. + // --- Build the sorted-occupied bigram table (binary-search lookup). --- + // Compute log-probs first, quantize once, then sort by key. float[] keptLogP = new float[keptPairs]; int[] keptKeys = new int[keptPairs]; int writeIdx = 0; @@ -1067,16 +1063,25 @@ public class TrainJunkModel { } // Quantize all kept log-probs together so they share min/max. QuantizedFloats qBigram = quantizeFloats(keptLogP); - byte[] values = new byte[slots]; + // Sort (key, value) ascending by signed key so the loader can binary-search. + // Pack into a long (key in high 32 bits, value byte in low 8) for one sort. + long[] sortable = new long[keptPairs]; for (int i = 0; i < keptPairs; i++) { - insertOA(keys, values, keptKeys[i], qBigram.bytes[i]); + sortable[i] = (((long) keptKeys[i]) << 32) | (qBigram.bytes[i] & 0xFFL); + } + java.util.Arrays.sort(sortable); + int[] keys = new int[keptPairs]; + byte[] values = new byte[keptPairs]; + for (int i = 0; i < keptPairs; i++) { + keys[i] = (int) (sortable[i] >> 32); + values[i] = (byte) (sortable[i] & 0xFF); } System.out.printf( " pair_counts: distinct=%,d, kept=%,d (>=%d), dropped=%,d " - + "cp_index=%,d slots=%,d (load=%.2f)%n", + + "cp_index=%,d bigram_entries=%,d%n", totalDistinct, keptPairs, minBigramCount, dropped, - cpIndex.length, slots, keptPairs / (double) slots); + cpIndex.length, keptPairs); return new BigramTables(cpIndex, keys, values, qUnigram.bytes, qBigram.min, qBigram.max, @@ -1084,32 +1089,6 @@ public class TrainJunkModel { unigramFallbackLogP, BACKOFF_ALPHA); } - /** - * Inserts a {@code (packedKey, value)} pair into the open-addressing - * table. The caller is responsible for sizing the table large enough - * to avoid an infinite probe (any load < 1.0 is safe). - */ - private static void insertOA(int[] keys, byte[] values, int packedKey, byte value) { - int mask = keys.length - 1; - int h = JunkDetector.mixIndexKey(packedKey) & mask; - while (keys[h] != BigramTables.EMPTY_KEY) { - if (keys[h] == packedKey) { - // Same key twice — shouldn't happen with our dedup, but be - // defensive and overwrite rather than corrupt. - values[h] = value; - return; - } - h = (h + 1) & mask; - } - keys[h] = packedKey; - values[h] = value; - } - - private static int nextPowerOfTwo(int n) { - if (n < 1) return 1; - int p = Integer.highestOneBit(n - 1) << 1; - return Math.max(1, p); - } // ----------------------------------------------------------------------- // Global contrastive combiner training diff --git a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin index ead028cbb3..42c9216a5c 100644 Binary files a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin and b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin differ diff --git a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java index 07efc64dd8..7433820efe 100644 --- a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java +++ b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java @@ -111,14 +111,12 @@ public class JunkDetectorRoundTripTest { // Same shape as the first test but with BOTH (A,B) and (B,A) in the // bigram table. mean log-prob = -1.0, z1 = +4.0, logit = +4.0. int[] cpIndex = new int[]{'A', 'B'}; - int[] keys = new int[4]; - Arrays.fill(keys, BigramTables.EMPTY_KEY); - byte[] values = new byte[4]; float bMin = -10.0f; float bMax = -1.0f; byte b = quantizeOne(-1.0f, bMin, bMax); - insertOA(keys, values, JunkDetector.packBigramKey(0, 1), b); - insertOA(keys, values, JunkDetector.packBigramKey(1, 0), b); + // sorted-occupied: packBigramKey(0,1)=1 < packBigramKey(1,0)=65536 + int[] keys = {JunkDetector.packBigramKey(0, 1), JunkDetector.packBigramKey(1, 0)}; + byte[] values = {b, b}; float uMin = -5.0f; float uMax = -2.0f; @@ -266,17 +264,14 @@ public class JunkDetectorRoundTripTest { private static BigramTables buildLatinTablesAB() { int[] cpIndex = new int[]{'A', 'B'}; - // 4 slots ≈ 25% load for 1 pair. Open-addressing with linear probe. - int[] keys = new int[4]; - Arrays.fill(keys, BigramTables.EMPTY_KEY); - byte[] values = new byte[4]; - // Manual quantization with a chosen range so we don't hit the // degenerate single-element case. range=[-10, -1] → -1.0 → byte 255. float bMin = -10.0f; float bMax = -1.0f; byte b = quantizeOne(-1.0f, bMin, bMax); - insertOA(keys, values, JunkDetector.packBigramKey(0, 1), b); + // sorted-occupied table with a single trained pair. + int[] keys = {JunkDetector.packBigramKey(0, 1)}; + byte[] values = {b}; float uMin = -5.0f; float uMax = -2.0f; @@ -358,13 +353,10 @@ public class JunkDetectorRoundTripTest { * (uppercase 'A'/'B' are absent from the index, so they must fold). */ private static BigramTables buildLatinTablesLowerAB() { int[] cpIndex = new int[]{'a', 'b'}; - int[] keys = new int[4]; - Arrays.fill(keys, BigramTables.EMPTY_KEY); - byte[] values = new byte[4]; float bMin = -10.0f; float bMax = -1.0f; - insertOA(keys, values, JunkDetector.packBigramKey(0, 1), - quantizeOne(-1.0f, bMin, bMax)); + int[] keys = {JunkDetector.packBigramKey(0, 1)}; + byte[] values = {quantizeOne(-1.0f, bMin, bMax)}; float uMin = -5.0f; float uMax = -2.0f; byte[] unigramBytes = new byte[]{ @@ -403,25 +395,6 @@ public class JunkDetectorRoundTripTest { return (byte) q; } - /** - * Replica of {@code TrainJunkModel.insertOA} (package-private) for the - * test's hand-constructed tables. Uses the same mix-hash as the - * production code path. - */ - private static void insertOA(int[] keys, byte[] values, int packedKey, byte value) { - int mask = keys.length - 1; - int h = JunkDetector.mixIndexKey(packedKey) & mask; - while (keys[h] != BigramTables.EMPTY_KEY) { - if (keys[h] == packedKey) { - values[h] = value; - return; - } - h = (h + 1) & mask; - } - keys[h] = packedKey; - values[h] = value; - } - /** * Saves a minimal model containing only LATIN, with the block / control / * script-transition features zeroed out and pure-z1 combiner weights
