This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 8d9900e211 TIKA-4745 -- efficiency improvements (#2878)
8d9900e211 is described below
commit 8d9900e21127de0a20334ebc59ca0529395575bf
Author: Tim Allison <[email protected]>
AuthorDate: Sat Jun 6 13:47:25 2026 -0400
TIKA-4745 -- efficiency improvements (#2878)
---
.../tika/detect/EncodingDetectorContext.java | 11 ++
.../org/apache/tika/detect/EncodingProbeCache.java | 65 +++++++++++
.../tika/parser/html/HtmlEncodingDetector.java | 51 ++++++++-
.../html/StandardCharsets_unsupported_by_IANA.txt | 0
.../tika/parser/html/HtmlEncodingDetectorTest.java | 0
.../html/StandardHtmlEncodingDetectorTest.java | 0
.../apache/tika/ml/chardetect/AdaptiveProbe.java | 15 ++-
.../ml/chardetect/MojibusterEncodingDetector.java | 22 +++-
.../NaiveBayesBigramEncodingDetector.java | 70 ++++++++----
.../org/apache/tika/ml/chardetect/nb-bigram.bin | Bin 1008871 -> 696579 bytes
.../charsoup/CharSoupFeatureExtractor.java | 14 ++-
tika-ml/pom.xml | 1 +
.../ml/chardetect/tools/TrainNaiveBayesBigram.java | 21 +++-
.../pom.xml | 56 +++-------
.../ml/junkdetect/tools/BoundaryBigramAudit.java | 0
.../ml/junkdetect/tools/BuildJunkTrainingData.java | 0
.../tika/ml/junkdetect/tools/DebugScriptRuns.java | 0
.../tools/JunkDetectorTrainingConfig.java | 0
.../ml/junkdetect/tools/LineScriptFractions.java | 0
.../tika/ml/junkdetect/tools/ScriptCensus.java | 0
.../tika/ml/junkdetect/tools/TrainJunkModel.java | 62 ++++-------
.../ml/junkdetect/JunkDetectorRoundTripTest.java | 43 ++-----
.../tools/BuildJunkAugmentationData.java | 0
.../tools/BuildJunkAugmentationDataTest.java | 0
.../tools/JunkDetectorTrainingConfigTest.java | 0
tika-ml/tika-ml-junkdetect/pom.xml | 65 +----------
.../apache/tika/ml/junkdetect/BigramTables.java | 82 ++++++++++----
.../apache/tika/ml/junkdetect/JunkDetector.java | 110 +++++++++---------
.../ml/junkdetect/JunkFilterEncodingDetector.java | 123 ++++++++++++---------
.../tika/ml/junkdetect/TextQualityFeatures.java | 52 ++++++++-
.../org/apache/tika/ml/junkdetect/junkdetect.bin | Bin 2316809 -> 727979 bytes
.../tika-parser-html-module/pom.xml | 3 +
32 files changed, 509 insertions(+), 357 deletions(-)
diff --git
a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java
b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java
index 6957601e2c..426f508c14 100644
---
a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java
+++
b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java
@@ -40,8 +40,19 @@ import java.util.Set;
public class EncodingDetectorContext {
private final List<Result> results = new ArrayList<>();
+ private final EncodingProbeCache probeCache = new EncodingProbeCache();
private String arbitrationInfo;
+ /**
+ * Per-detection cache of the raw detection probe, shared across the detectors in
+ * this chain so they don't each re-read the same leading bytes. It lives and dies
+ * with this context (which is removed after detection), so it never leaks into
+ * recursive/attachment parsing.
+ */
+ public EncodingProbeCache getProbeCache() {
+ return probeCache;
+ }
+
/**
* Record the ranked results from a child detector.
*
diff --git
a/tika-core/src/main/java/org/apache/tika/detect/EncodingProbeCache.java
b/tika-core/src/main/java/org/apache/tika/detect/EncodingProbeCache.java
new file mode 100644
index 0000000000..a4d783fb02
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/detect/EncodingProbeCache.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+/**
+ * Caches the raw encoding-detection probe (the leading bytes read for detection)
+ * so that multiple detectors in a chain do not each re-read and re-tag-strip the
+ * same bytes. For example a statistical detector and a downstream meta detector
+ * that re-reads the bytes for arbitration can share one probe.
+ * <p>
+ * An instance is held by {@link EncodingDetectorContext}, so it inherits that
+ * context's per-detection lifecycle: created fresh per detection and discarded
+ * with the context immediately afterwards. That matters because a
+ * {@link org.apache.tika.parser.ParseContext} flows on into recursive
+ * (attachment/embedded) parsing — a probe must never outlive the single detection
+ * it was read for.
+ * <p>
+ * Not thread-safe: a single detection runs its detectors sequentially on one
+ * thread. The cache is keyed by the probe parameters — {@link #get} returns the
+ * cached probe only when both {@code contentTarget} and {@code rawCap} match what
+ * it was stored with, so a detector that wants a differently-sized probe
+ * transparently reads (and caches) its own.
+ * <p>
+ * The cached array is shared read-only state; callers must not mutate it in place.
+ */
+public class EncodingProbeCache {
+
+ private byte[] probe;
+ private int contentTarget = -1;
+ private int rawCap = -1;
+
+ /**
+ * @return the cached probe if one was stored with the same {@code
contentTarget} and
+ * {@code rawCap}; otherwise {@code null}
+ */
+ public byte[] get(int contentTarget, int rawCap) {
+ if (probe != null && this.contentTarget == contentTarget &&
this.rawCap == rawCap) {
+ return probe;
+ }
+ return null;
+ }
+
+ /**
+ * Stores the probe bytes read with the given parameters.
+ */
+ public void put(byte[] probe, int contentTarget, int rawCap) {
+ this.probe = probe;
+ this.contentTarget = contentTarget;
+ this.rawCap = rawCap;
+ }
+}
diff --git
a/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
index c052b062b6..1d9398ce5e 100644
---
a/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
+++
b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
@@ -20,7 +20,6 @@ import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
-import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
@@ -85,6 +84,7 @@ public class HtmlEncodingDetector implements EncodingDetector
{
private static final Pattern FLEXIBLE_CHARSET_ATTR_PATTERN =
Pattern.compile(("(?is)\\bcharset\\s*=\\s*(?:['\\\"]\\s*)?([-_:\\.a-z0-9]+)"));
private static final Charset ASCII = Charset.forName("US-ASCII");
+ private static final Pattern HTML_COMMENT_PATTERN =
Pattern.compile("<!--.*?(-->|$)");
/**
* HTML can include non-iana supported charsets that Java
* recognizes, e.g. "unicode". This can lead to incorrect
detection/mojibake.
@@ -162,10 +162,20 @@ public class HtmlEncodingDetector implements
EncodingDetector {
}
tis.reset();
- String head = ASCII.decode(ByteBuffer.wrap(buffer, 0, n)).toString();
- String headNoComments = head.replaceAll("<!--.*?(-->|$)", " ");
+ // findCharset only ever matches a meta tag (HTTP_META_PATTERN = "<\s*meta...").
+ // If the probe has no such tag, the full ASCII decode + comment-stripping
+ // regex below can only produce null — skip them. Byte-level, no allocation;
+ // a strict necessary condition for any non-empty result.
+ if (!containsMetaTag(buffer, n)) {
+ return Collections.emptyList();
+ }
+
+ String head = new String(buffer, 0, n, ASCII);
+ boolean hasComment = head.indexOf("<!--") >= 0;
+ String headNoComments =
+ hasComment ? HTML_COMMENT_PATTERN.matcher(head).replaceAll("
") : head;
Charset charset = findCharset(headNoComments);
- if (charset == null) {
+ if (charset == null && hasComment) {
charset = findCharset(head);
}
if (charset == null) {
@@ -175,6 +185,39 @@ public class HtmlEncodingDetector implements
EncodingDetector {
EncodingResult.ResultType.DECLARATIVE));
}
+ /**
+ * Byte-level scan for an opening meta tag, mirroring the {@code <\s*meta} prefix of
+ * {@link #HTTP_META_PATTERN} (ASCII, case-insensitive). Lets {@link #detect} skip the
+ * full ASCII decode + comment-stripping regex on probes that cannot contain a meta
+ * charset declaration. {@code <}, ASCII whitespace and {@code meta} are all ASCII, so a
+ * raw-byte scan is equivalent to scanning the decoded head.
+ */
+ private static boolean containsMetaTag(byte[] buf, int len) {
+ for (int i = 0; i < len; i++) {
+ if (buf[i] != '<') {
+ continue;
+ }
+ int j = i + 1;
+ while (j < len) {
+ int c = buf[j] & 0xFF;
+ if (c == ' ' || c == '\t' || c == '\n' || c == 0x0B || c ==
'\f'
+ || c == '\r') {
+ j++;
+ } else {
+ break;
+ }
+ }
+ if (j + 4 <= len
+ && (((buf[j] & 0xFF) | 0x20) == 'm')
+ && (((buf[j + 1] & 0xFF) | 0x20) == 'e')
+ && (((buf[j + 2] & 0xFF) | 0x20) == 't')
+ && (((buf[j + 3] & 0xFF) | 0x20) == 'a')) {
+ return true;
+ }
+ }
+ return false;
+ }
+
//returns null if no charset was found
private Charset findCharset(String s) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/resources/org/apache/tika/parser/html/StandardCharsets_unsupported_by_IANA.txt
b/tika-encoding-detectors/tika-encoding-detector-html/src/main/resources/org/apache/tika/parser/html/StandardCharsets_unsupported_by_IANA.txt
similarity index 100%
rename from
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/resources/org/apache/tika/parser/html/StandardCharsets_unsupported_by_IANA.txt
rename to
tika-encoding-detectors/tika-encoding-detector-html/src/main/resources/org/apache/tika/parser/html/StandardCharsets_unsupported_by_IANA.txt
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java
b/tika-encoding-detectors/tika-encoding-detector-html/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java
similarity index 100%
rename from
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java
rename to
tika-encoding-detectors/tika-encoding-detector-html/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java
b/tika-encoding-detectors/tika-encoding-detector-html/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java
similarity index 100%
rename from
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java
rename to
tika-encoding-detectors/tika-encoding-detector-html/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java
diff --git
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/AdaptiveProbe.java
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/AdaptiveProbe.java
index 34a081ec32..e2100b0533 100644
---
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/AdaptiveProbe.java
+++
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/AdaptiveProbe.java
@@ -55,11 +55,20 @@ public final class AdaptiveProbe {
throws IOException {
tis.mark(rawCap);
try {
- byte[] buf = new byte[rawCap];
- byte[] stripDst = new byte[rawCap];
+ // Grow on demand rather than allocating (and zeroing) the full rawCap
+ // (e.g. 512 KB) twice up front: the vast majority of probes are far
+ // smaller. Bytes returned are identical to the eager-allocation version.
+ int cap = Math.min(rawCap, contentTarget);
+ byte[] buf = new byte[cap];
+ byte[] stripDst = new byte[cap];
int total = 0;
while (total < rawCap) {
int want = Math.min(rawCap - total, contentTarget);
+ if (total + want > buf.length) {
+ int newCap = Math.min(rawCap, Math.max(buf.length * 2,
total + want));
+ buf = Arrays.copyOf(buf, newCap);
+ stripDst = Arrays.copyOf(stripDst, newCap);
+ }
int n = IOUtils.read(tis, buf, total, want);
total += n;
HtmlByteStripper.Result r =
@@ -72,7 +81,7 @@ public final class AdaptiveProbe {
if (total == 0) {
return new byte[0];
}
- return total == rawCap ? buf : Arrays.copyOf(buf, total);
+ return total == buf.length ? buf : Arrays.copyOf(buf, total);
} finally {
tis.reset();
}
diff --git
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
index 45a919274a..e225366091 100644
---
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
+++
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
@@ -29,6 +29,8 @@ import org.slf4j.LoggerFactory;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.detect.EncodingDetectorContext;
+import org.apache.tika.detect.EncodingProbeCache;
import org.apache.tika.detect.EncodingResult;
import org.apache.tika.detect.HighByteLetterStats;
import org.apache.tika.io.TikaInputStream;
@@ -208,7 +210,7 @@ public class MojibusterEncodingDetector implements
EncodingDetector {
@Override
public List<EncodingResult> detect(TikaInputStream tis, Metadata metadata,
ParseContext parseContext) throws
IOException {
- byte[] probe = readProbe(tis);
+ byte[] probe = readProbe(tis, parseContext);
return detect(probe, metadata);
}
@@ -749,7 +751,21 @@ public class MojibusterEncodingDetector implements
EncodingDetector {
return lower.contains("html") || lower.contains("xml");
}
- private static byte[] readProbe(TikaInputStream tis) throws IOException {
- return AdaptiveProbe.read(tis, PROBE_CONTENT_TARGET, PROBE_RAW_CAP);
+ private static byte[] readProbe(TikaInputStream tis, ParseContext
parseContext)
+ throws IOException {
+ EncodingDetectorContext context =
+ parseContext == null ? null :
parseContext.get(EncodingDetectorContext.class);
+ EncodingProbeCache cache = context == null ? null :
context.getProbeCache();
+ if (cache != null) {
+ byte[] cached = cache.get(PROBE_CONTENT_TARGET, PROBE_RAW_CAP);
+ if (cached != null) {
+ return cached;
+ }
+ }
+ byte[] probe = AdaptiveProbe.read(tis, PROBE_CONTENT_TARGET,
PROBE_RAW_CAP);
+ if (cache != null) {
+ cache.put(probe, PROBE_CONTENT_TARGET, PROBE_RAW_CAP);
+ }
+ return probe;
}
}
diff --git
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
index 5becf20ce6..da87d46b74 100644
---
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
+++
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
@@ -62,12 +62,11 @@ public class NaiveBayesBigramEncodingDetector implements
EncodingDetector {
private static final int BIGRAM_SPACE = 65_536;
/**
- * Cap probe scanning at 10 KB. Bigram-based identification
+ * Cap probe scanning at 16 KB. Bigram-based identification
* saturates quickly — beyond the first 500-1000 bytes every
* additional bigram nudges scores by < 0.1 log-likelihood and
- * doesn't change the argmax. Reducing the cap from 4 KB to 1 KB
- * quartes the inner-loop work on long probes at no measurable
- * accuracy cost.
+ * doesn't change the argmax, so capping the scan bounds the
+ * inner-loop work on long probes at no measurable accuracy cost.
*/
private static final int MAX_PROBE_BYTES = 16 * 1024;
@@ -216,8 +215,8 @@ public class NaiveBayesBigramEncodingDetector implements
EncodingDetector {
/**
* Bigram-major int8 logP layout. Quantized at load time via
* per-class scale {@code scale[c] = maxAbs(class c's logP column) / 127}.
- * In-memory footprint: {@code 65_536 × numClasses} bytes ≈ 2 MB for
- * 32 classes, 4× smaller than float32. The hot-loop accumulates
+ * In-memory footprint: {@code 65_536 × numClasses} bytes ≈ 2.1 MB for
+ * 34 classes, 4× smaller than float32. The hot-loop accumulates
* raw int8 products and applies dequantization once at the end of
* the probe, CharSoup-style.
*/
@@ -309,9 +308,28 @@ public class NaiveBayesBigramEncodingDetector implements
EncodingDetector {
for (int bg = 0; bg < BIGRAM_SPACE; bg++) {
logP8[bg * numClasses + c] = u;
}
- // Overwrite with trained pairs.
+ // Overwrite with trained pairs. Bigram ids are sorted
ascending and
+ // stored as varint deltas (LEB128) from the previous id.
+ int bigram = 0;
for (int i = 0; i < vocabSize; i++) {
- int bigram = dis.readUnsignedShort();
+ long delta = 0;
+ int shift = 0;
+ int b;
+ do {
+ if (shift >= 35) {
+ throw new IOException(
+ "Malformed varint in bigram-id deltas (too
long)");
+ }
+ b = dis.readUnsignedByte();
+ delta |= (long) (b & 0x7F) << shift;
+ shift += 7;
+ } while ((b & 0x80) != 0);
+ long next = bigram + delta;
+ if (next < 0 || next >= BIGRAM_SPACE) {
+ throw new IOException("Bigram id out of range: " + next
+ + " (expected [0, " + BIGRAM_SPACE + "))");
+ }
+ bigram = (int) next;
byte q = dis.readByte();
logP8[bigram * numClasses + c] = q;
}
@@ -521,6 +539,7 @@ public class NaiveBayesBigramEncodingDetector implements
EncodingDetector {
// diagnostic path.
double[] score = new double[numClasses];
double[] contributions = new double[numClasses];
+ double[] bestPerCohort = new double[Cohort.values().length];
int hashCap = counts.capacity();
for (int slot = 0; slot < hashCap; slot++) {
int bigram = counts.keyAt(slot);
@@ -549,6 +568,15 @@ public class NaiveBayesBigramEncodingDetector implements
EncodingDetector {
// cohort, so the cap engages on cross-cohort gaps that a
// max-vs-overall-runner-up cap missed when multiple classes
// in top-1's cohort sat close together.
+ //
+ // Single per-class pass computes the contributions, the running
+ // max/topClass, AND the best contribution per cohort;
bestCrossCohort
+ // then reduces over the (few) cohorts instead of a second full
+ // per-class pass, and the clip is fused into the accumulate.
+ // Bit-identical to the prior four-pass form: max/cross-cohort are
exact
+ // (comparisons over the same value set), and the contribution
formula
+ // and score[] accumulation order are unchanged.
+ java.util.Arrays.fill(bestPerCohort, Double.NEGATIVE_INFINITY);
int topClass = -1;
double max = Double.NEGATIVE_INFINITY;
for (int c = 0; c < numClasses; c++) {
@@ -558,25 +586,27 @@ public class NaiveBayesBigramEncodingDetector implements
EncodingDetector {
max = contrib;
topClass = c;
}
+ int co = cohorts[c].ordinal();
+ if (contrib > bestPerCohort[co]) {
+ bestPerCohort[co] = contrib;
+ }
}
- Cohort topCohort = cohorts[topClass];
+ int topCohort = cohorts[topClass].ordinal();
double bestCrossCohort = Double.NEGATIVE_INFINITY;
- for (int c = 0; c < numClasses; c++) {
- if (cohorts[c] != topCohort && contributions[c] >
bestCrossCohort) {
- bestCrossCohort = contributions[c];
+ for (int k = 0; k < bestPerCohort.length; k++) {
+ if (k != topCohort && bestPerCohort[k] > bestCrossCohort) {
+ bestCrossCohort = bestPerCohort[k];
}
}
// bestCrossCohort is always finite here: load requires >=2
cohorts.
double capValue = bestCrossCohort + CAP_PER_BIGRAM_NATS;
- if (max > capValue) {
- for (int c = 0; c < numClasses; c++) {
- if (contributions[c] > capValue) {
- contributions[c] = capValue;
- }
- }
- }
+ boolean clip = max > capValue;
for (int c = 0; c < numClasses; c++) {
- score[c] += contributions[c];
+ double v = contributions[c];
+ if (clip && v > capValue) {
+ v = capValue;
+ }
+ score[c] += v;
}
}
return new ScoreResult(score, scored, total);
diff --git
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin
index b89188bb32..0cebc858bb 100644
Binary files
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin
and
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin
differ
diff --git
a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java
b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java
index 7bbfc032e5..6b7e39bc68 100644
---
a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java
+++
b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java
@@ -244,9 +244,17 @@ public class CharSoupFeatureExtractor {
* @return cleaned, NFC-normalized text
*/
public static String preprocessNoTruncate(String rawText) {
- // Strip URLs and emails
- String text = URL_REGEX.matcher(rawText).replaceAll(" ");
- text = MAIL_REGEX.matcher(text).replaceAll(" ");
+ // Strip URLs and emails. Both regexes scan the entire input on every call;
+ // skip each unless its required marker is present ("://" for URL_REGEX, "@"
+ // for MAIL_REGEX). This is a no-op for the common (markerless) case — the
+ // output is identical — but avoids a full-buffer regex scan + Matcher alloc.
+ String text = rawText;
+ if (text.indexOf("://") >= 0) {
+ text = URL_REGEX.matcher(text).replaceAll(" ");
+ }
+ if (text.indexOf('@') >= 0) {
+ text = MAIL_REGEX.matcher(text).replaceAll(" ");
+ }
// NFC normalize
if (!Normalizer.isNormalized(text, Normalizer.Form.NFC)) {
diff --git a/tika-ml/pom.xml b/tika-ml/pom.xml
index 11ddcb221c..46de69dc69 100644
--- a/tika-ml/pom.xml
+++ b/tika-ml/pom.xml
@@ -35,6 +35,7 @@
<module>tika-ml-core</module>
<module>tika-ml-chardetect</module>
<module>tika-ml-junkdetect</module>
+ <module>tika-ml-junkdetect-tools</module>
</modules>
<build>
diff --git
a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java
b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java
index a082f4c5be..6eaee95c08 100644
---
a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java
+++
b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java
@@ -427,8 +427,8 @@ public class TrainNaiveBayesBigram {
* float32 scale (per-class dequant)
* byte unseenQ (int8 quantized unseen floor)
* int32 vocabSize (number of trained pairs)
- * for each kept bigram:
- * uint16 bigramKey
+ * bigram keys sorted ascending, each pair stored as:
+ * varint deltaFromPrevKey (LEB128; first delta is the key
itself)
* byte logP8 (int8 quantized)
*
* <p>Sparse representation: only trained bigram pairs are stored;
@@ -497,11 +497,22 @@ public class TrainNaiveBayesBigram {
dos.writeByte(unseenQ[c]);
float scale = perClassScale[c];
dos.writeInt(logProbsPerClass[c].size());
- for (Map.Entry<Integer, Float> e :
logProbsPerClass[c].entrySet()) {
- int q = Math.round(e.getValue() / scale);
+ // Bigram ids sorted ascending, stored as varint (LEB128)
deltas from
+ // the previous id — most deltas fit in a single byte.
+ int[] keys = logProbsPerClass[c].keySet().stream()
+ .mapToInt(Integer::intValue).sorted().toArray();
+ int prev = 0;
+ for (int key : keys) {
+ int q = Math.round(logProbsPerClass[c].get(key) / scale);
if (q > 127) q = 127;
if (q < -127) q = -127;
- dos.writeShort(e.getKey());
+ int delta = key - prev;
+ prev = key;
+ while ((delta & ~0x7F) != 0) {
+ dos.writeByte((delta & 0x7F) | 0x80);
+ delta >>>= 7;
+ }
+ dos.writeByte(delta);
dos.writeByte(q);
}
}
diff --git a/tika-ml/tika-ml-junkdetect/pom.xml
b/tika-ml/tika-ml-junkdetect-tools/pom.xml
similarity index 72%
copy from tika-ml/tika-ml-junkdetect/pom.xml
copy to tika-ml/tika-ml-junkdetect-tools/pom.xml
index fe717998cf..fe684093cd 100644
--- a/tika-ml/tika-ml-junkdetect/pom.xml
+++ b/tika-ml/tika-ml-junkdetect-tools/pom.xml
@@ -25,33 +25,19 @@
</parent>
<modelVersion>4.0.0</modelVersion>
- <artifactId>tika-ml-junkdetect</artifactId>
- <name>Apache Tika ML junk detector — runtime and training tools</name>
+ <artifactId>tika-ml-junkdetect-tools</artifactId>
+ <name>Apache Tika ML junk detector — training and evaluation tools</name>
<description>
- Language-agnostic text quality scorer that discriminates between clean
UTF-8 text and
- mojibake, reversed text, wrong-codec decodings, and other corruption forms.
- Provides a standalone "languageyness" score suitable for re-OCR triggering
and
- charset-decoding arbitration.
-
- Runtime classes (JunkDetector, ScriptDetector, feature extractors) and
bundled model
- resources live here. Training and evaluation CLI tools live in the tools
subpackage.
+ Build-time training, evaluation, and diagnostic CLIs for the junk detector
+ (TrainJunkModel, BuildJunkTrainingData, and diagnostics). These are not
part
+ of the runtime detector — they are kept out of the tika-ml-junkdetect
runtime
+ jar and built into a self-contained tools jar via the 'train' profile.
</description>
<dependencies>
<dependency>
<groupId>org.apache.tika</groupId>
- <artifactId>tika-core</artifactId>
- <version>${revision}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-annotation-processor</artifactId>
- <version>${revision}</version>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-ml-core</artifactId>
+ <artifactId>tika-ml-junkdetect</artifactId>
<version>${revision}</version>
</dependency>
<dependency>
@@ -63,9 +49,7 @@
<!-- Test dependencies -->
<!--
tika-serialization is test-scope only because the one consumer
- (BuildJunkAugmentationData) lives in src/test/java — it's a corpus-prep
- tool, not part of the runtime detector. Keeps the production classpath of
- tika-ml-junkdetect free of the serialization dep.
+ (BuildJunkAugmentationData) is a corpus-prep tool in src/test/java.
-->
<dependency>
<groupId>org.apache.tika</groupId>
@@ -93,21 +77,11 @@
<configuration>
<archive>
<manifestEntries>
-
<Automatic-Module-Name>org.apache.tika.ml.junkdetect</Automatic-Module-Name>
+
<Automatic-Module-Name>org.apache.tika.ml.junkdetect.tools</Automatic-Module-Name>
</manifestEntries>
</archive>
</configuration>
</plugin>
- <plugin>
- <groupId>org.apache.rat</groupId>
- <artifactId>apache-rat-plugin</artifactId>
- <configuration>
- <inputExcludes>
- <inputExclude>**/*.bin</inputExclude>
- <inputExclude>**/*.txt</inputExclude>
- </inputExcludes>
- </configuration>
- </plugin>
<!-- Tools package uses System.out/printf freely -->
<plugin>
<groupId>de.thetaphi</groupId>
@@ -123,12 +97,11 @@
<profile>
<!--
Build a self-contained fat JAR for model training and evaluation.
- Usage:
- ./mvnw package -pl tika-ml/tika-ml-junkdetect -am -Ptrain
-DskipTests \
+ Usage (run from the repo root so the model resource path resolves):
+ ./mvnw package -pl tika-ml/tika-ml-junkdetect-tools -am -Ptrain
-DskipTests \
-Dmaven.repo.local=.local_m2_repo
- java -jar
tika-ml/tika-ml-junkdetect/target/tika-ml-junkdetect-*-tools.jar \
- [BuildJunkTrainingData|TrainJunkModel|EvalJunkDetector] \
- [args...]
+ java -jar
tika-ml/tika-ml-junkdetect-tools/target/tika-ml-junkdetect-tools-*-tools.jar \
+ [BuildJunkTrainingData|TrainJunkModel|EvalJunkDetector] [args...]
-->
<id>train</id>
<build>
@@ -170,7 +143,4 @@
</profile>
</profiles>
- <scm>
- <tag>3.0.0-rc1</tag>
- </scm>
</project>
diff --git
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BoundaryBigramAudit.java
b/tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/BoundaryBigramAudit.java
similarity index 100%
rename from
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BoundaryBigramAudit.java
rename to
tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/BoundaryBigramAudit.java
diff --git
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
b/tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
similarity index 100%
rename from
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
rename to
tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
diff --git
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/DebugScriptRuns.java
b/tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/DebugScriptRuns.java
similarity index 100%
rename from
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/DebugScriptRuns.java
rename to
tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/DebugScriptRuns.java
diff --git
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java
b/tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java
similarity index 100%
rename from
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java
rename to
tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java
diff --git
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/LineScriptFractions.java
b/tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/LineScriptFractions.java
similarity index 100%
rename from
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/LineScriptFractions.java
rename to
tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/LineScriptFractions.java
diff --git
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/ScriptCensus.java
b/tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/ScriptCensus.java
similarity index 100%
rename from
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/ScriptCensus.java
rename to
tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/ScriptCensus.java
diff --git
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
b/tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
similarity index 97%
rename from
tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
rename to
tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
index 63bce5317f..13cbc20381 100644
---
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
+++
b/tika-ml/tika-ml-junkdetect-tools/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
@@ -46,7 +46,7 @@ import org.apache.tika.ml.junkdetect.JunkDetector;
*
* <p>z1 (codepoint-bigram log-probability) is trained per script by bucketing
* every bigram to its script ({@link JunkDetector#forEachScriptBigram}) and
- * building a per-script open-addressing bigram table with unigram backoff.
+ * building a per-script sorted-occupied bigram table with unigram backoff.
* z2 (Unicode block-transition), z3 (control-byte fraction), and z4
* (script-transition) are single global document-level features. All features
* are calibrated (mu/sigma) and combined by a single global contrastive
@@ -932,7 +932,7 @@ public class TrainJunkModel {
*
* @param trainFile the per-script {@code *.train.gz}
* @param minBigramCount drop pairs whose count is below this
- * @param loadFactor target OA table load factor (e.g. 0.5)
+ * @param loadFactor unused (retained for signature compatibility)
* @param keyIndexBits bit-width per index in the packed key
* (each side of the pair must fit)
*/
@@ -970,9 +970,12 @@ public class TrainJunkModel {
/**
* Builds the {@link BigramTables} carrier from pre-tallied pair/unigram
* counts. Drops pairs below
- * {@code minBigramCount}, assigns dense codepoint indices, and packs an
- * open-addressing bigram table; unigram log-probs use {@code unigramTotal}
+ * {@code minBigramCount}, assigns dense codepoint indices, and packs the
+ * sorted-occupied bigram table; unigram log-probs use {@code unigramTotal}
* as the denominator.
+ *
+ * <p>{@code loadFactor} is retained for signature compatibility but unused:
+ * the sorted-occupied table (binary-search lookup) has no load factor.
*/
public static BigramTables buildBigramTablesFromCounts(
HashMap<Long, long[]> pairCounts,
@@ -1037,12 +1040,8 @@ public class TrainJunkModel {
// Quantize unigram log-probs.
QuantizedFloats qUnigram = quantizeFloats(unigramLogP);
- // --- Build the open-addressing bigram table. ---
- int slots = nextPowerOfTwo((int) Math.max(2, Math.ceil(keptPairs /
loadFactor)));
- int[] keys = new int[slots];
- java.util.Arrays.fill(keys, BigramTables.EMPTY_KEY);
- // Compute log-probs first, quantize once, then write into the table
- // alongside its key.
+ // --- Build the sorted-occupied bigram table (binary-search lookup). ---
+ // Compute log-probs first, quantize once, then sort by key.
float[] keptLogP = new float[keptPairs];
int[] keptKeys = new int[keptPairs];
int writeIdx = 0;
@@ -1067,16 +1066,25 @@ public class TrainJunkModel {
}
// Quantize all kept log-probs together so they share min/max.
QuantizedFloats qBigram = quantizeFloats(keptLogP);
- byte[] values = new byte[slots];
+ // Sort (key, value) ascending by signed key so the loader can binary-search.
+ // Pack into a long (key in high 32 bits, value byte in low 8) for one sort.
+ long[] sortable = new long[keptPairs];
+ for (int i = 0; i < keptPairs; i++) {
+ sortable[i] = (((long) keptKeys[i]) << 32) | (qBigram.bytes[i] & 0xFFL);
+ }
+ java.util.Arrays.sort(sortable);
+ int[] keys = new int[keptPairs];
+ byte[] values = new byte[keptPairs];
for (int i = 0; i < keptPairs; i++) {
- insertOA(keys, values, keptKeys[i], qBigram.bytes[i]);
+ keys[i] = (int) (sortable[i] >> 32);
+ values[i] = (byte) (sortable[i] & 0xFF);
}
System.out.printf(
" pair_counts: distinct=%,d, kept=%,d (>=%d), dropped=%,d "
- + "cp_index=%,d slots=%,d (load=%.2f)%n",
+ + "cp_index=%,d bigram_entries=%,d%n",
totalDistinct, keptPairs, minBigramCount, dropped,
- cpIndex.length, slots, keptPairs / (double) slots);
+ cpIndex.length, keptPairs);
return new BigramTables(cpIndex, keys, values, qUnigram.bytes,
qBigram.min, qBigram.max,
@@ -1084,32 +1092,6 @@ public class TrainJunkModel {
unigramFallbackLogP, BACKOFF_ALPHA);
}
- /**
- * Inserts a {@code (packedKey, value)} pair into the open-addressing
- * table. The caller is responsible for sizing the table large enough
- * to avoid an infinite probe (any load < 1.0 is safe).
- */
- private static void insertOA(int[] keys, byte[] values, int packedKey,
byte value) {
- int mask = keys.length - 1;
- int h = JunkDetector.mixIndexKey(packedKey) & mask;
- while (keys[h] != BigramTables.EMPTY_KEY) {
- if (keys[h] == packedKey) {
- // Same key twice — shouldn't happen with our dedup, but be
- // defensive and overwrite rather than corrupt.
- values[h] = value;
- return;
- }
- h = (h + 1) & mask;
- }
- keys[h] = packedKey;
- values[h] = value;
- }
-
- private static int nextPowerOfTwo(int n) {
- if (n < 1) return 1;
- int p = Integer.highestOneBit(n - 1) << 1;
- return Math.max(1, p);
- }
// -----------------------------------------------------------------------
// Global contrastive combiner training
diff --git
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
b/tika-ml/tika-ml-junkdetect-tools/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
similarity index 93%
rename from
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
rename to
tika-ml/tika-ml-junkdetect-tools/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
index 07efc64dd8..7433820efe 100644
---
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
+++
b/tika-ml/tika-ml-junkdetect-tools/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorRoundTripTest.java
@@ -111,14 +111,12 @@ public class JunkDetectorRoundTripTest {
// Same shape as the first test but with BOTH (A,B) and (B,A) in the
// bigram table. mean log-prob = -1.0, z1 = +4.0, logit = +4.0.
int[] cpIndex = new int[]{'A', 'B'};
- int[] keys = new int[4];
- Arrays.fill(keys, BigramTables.EMPTY_KEY);
- byte[] values = new byte[4];
float bMin = -10.0f;
float bMax = -1.0f;
byte b = quantizeOne(-1.0f, bMin, bMax);
- insertOA(keys, values, JunkDetector.packBigramKey(0, 1), b);
- insertOA(keys, values, JunkDetector.packBigramKey(1, 0), b);
+ // sorted-occupied: packBigramKey(0,1)=1 < packBigramKey(1,0)=65536
+ int[] keys = {JunkDetector.packBigramKey(0, 1), JunkDetector.packBigramKey(1, 0)};
+ byte[] values = {b, b};
float uMin = -5.0f;
float uMax = -2.0f;
@@ -266,17 +264,14 @@ public class JunkDetectorRoundTripTest {
private static BigramTables buildLatinTablesAB() {
int[] cpIndex = new int[]{'A', 'B'};
- // 4 slots ≈ 25% load for 1 pair. Open-addressing with linear probe.
- int[] keys = new int[4];
- Arrays.fill(keys, BigramTables.EMPTY_KEY);
- byte[] values = new byte[4];
-
// Manual quantization with a chosen range so we don't hit the
// degenerate single-element case. range=[-10, -1] → -1.0 → byte 255.
float bMin = -10.0f;
float bMax = -1.0f;
byte b = quantizeOne(-1.0f, bMin, bMax);
- insertOA(keys, values, JunkDetector.packBigramKey(0, 1), b);
+ // sorted-occupied table with a single trained pair.
+ int[] keys = {JunkDetector.packBigramKey(0, 1)};
+ byte[] values = {b};
float uMin = -5.0f;
float uMax = -2.0f;
@@ -358,13 +353,10 @@ public class JunkDetectorRoundTripTest {
* (uppercase 'A'/'B' are absent from the index, so they must fold). */
private static BigramTables buildLatinTablesLowerAB() {
int[] cpIndex = new int[]{'a', 'b'};
- int[] keys = new int[4];
- Arrays.fill(keys, BigramTables.EMPTY_KEY);
- byte[] values = new byte[4];
float bMin = -10.0f;
float bMax = -1.0f;
- insertOA(keys, values, JunkDetector.packBigramKey(0, 1),
- quantizeOne(-1.0f, bMin, bMax));
+ int[] keys = {JunkDetector.packBigramKey(0, 1)};
+ byte[] values = {quantizeOne(-1.0f, bMin, bMax)};
float uMin = -5.0f;
float uMax = -2.0f;
byte[] unigramBytes = new byte[]{
@@ -403,25 +395,6 @@ public class JunkDetectorRoundTripTest {
return (byte) q;
}
- /**
- * Replica of {@code TrainJunkModel.insertOA} (package-private) for the
- * test's hand-constructed tables. Uses the same mix-hash as the
- * production code path.
- */
- private static void insertOA(int[] keys, byte[] values, int packedKey,
byte value) {
- int mask = keys.length - 1;
- int h = JunkDetector.mixIndexKey(packedKey) & mask;
- while (keys[h] != BigramTables.EMPTY_KEY) {
- if (keys[h] == packedKey) {
- values[h] = value;
- return;
- }
- h = (h + 1) & mask;
- }
- keys[h] = packedKey;
- values[h] = value;
- }
-
/**
* Saves a minimal model containing only LATIN, with the block / control /
* script-transition features zeroed out and pure-z1 combiner weights
diff --git
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationData.java
b/tika-ml/tika-ml-junkdetect-tools/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationData.java
similarity index 100%
rename from
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationData.java
rename to
tika-ml/tika-ml-junkdetect-tools/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationData.java
diff --git
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationDataTest.java
b/tika-ml/tika-ml-junkdetect-tools/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationDataTest.java
similarity index 100%
rename from
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationDataTest.java
rename to
tika-ml/tika-ml-junkdetect-tools/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationDataTest.java
diff --git
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfigTest.java
b/tika-ml/tika-ml-junkdetect-tools/src/test/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfigTest.java
similarity index 100%
rename from
tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfigTest.java
rename to
tika-ml/tika-ml-junkdetect-tools/src/test/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfigTest.java
diff --git a/tika-ml/tika-ml-junkdetect/pom.xml
b/tika-ml/tika-ml-junkdetect/pom.xml
index fe717998cf..5027cbe743 100644
--- a/tika-ml/tika-ml-junkdetect/pom.xml
+++ b/tika-ml/tika-ml-junkdetect/pom.xml
@@ -61,18 +61,6 @@
</dependency>
<!-- Test dependencies -->
- <!--
- tika-serialization is test-scope only because the one consumer
- (BuildJunkAugmentationData) lives in src/test/java — it's a corpus-prep
- tool, not part of the runtime detector. Keeps the production classpath of
- tika-ml-junkdetect free of the serialization dep.
- -->
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-serialization</artifactId>
- <version>${revision}</version>
- <scope>test</scope>
- </dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
@@ -108,7 +96,7 @@
</inputExcludes>
</configuration>
</plugin>
- <!-- Tools package uses System.out/printf freely -->
+ <!-- Diagnostic tests print to stdout / use default-locale formatting freely. -->
<plugin>
<groupId>de.thetaphi</groupId>
<artifactId>forbiddenapis</artifactId>
@@ -119,57 +107,6 @@
</plugins>
</build>
- <profiles>
- <profile>
- <!--
- Build a self-contained fat JAR for model training and evaluation.
- Usage:
- ./mvnw package -pl tika-ml/tika-ml-junkdetect -am -Ptrain
-DskipTests \
- -Dmaven.repo.local=.local_m2_repo
- java -jar
tika-ml/tika-ml-junkdetect/target/tika-ml-junkdetect-*-tools.jar \
- [BuildJunkTrainingData|TrainJunkModel|EvalJunkDetector] \
- [args...]
- -->
- <id>train</id>
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-shade-plugin</artifactId>
- <executions>
- <execution>
- <phase>package</phase>
- <goals><goal>shade</goal></goals>
- <configuration>
- <shadedArtifactAttached>true</shadedArtifactAttached>
- <shadedClassifierName>tools</shadedClassifierName>
- <transformers>
- <transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
-
<mainClass>org.apache.tika.ml.junkdetect.tools.TrainJunkModel</mainClass>
- </transformer>
- <transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"
/>
- <transformer
implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer"
/>
- <transformer
implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer"
/>
- </transformers>
- <filters>
- <filter>
- <artifact>*:*</artifact>
- <excludes>
- <exclude>META-INF/*.SF</exclude>
- <exclude>META-INF/*.DSA</exclude>
- <exclude>META-INF/*.RSA</exclude>
- </excludes>
- </filter>
- </filters>
- </configuration>
- </execution>
- </executions>
- </plugin>
- </plugins>
- </build>
- </profile>
- </profiles>
-
<scm>
<tag>3.0.0-rc1</tag>
</scm>
diff --git
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/BigramTables.java
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/BigramTables.java
index 5c7e738290..05974bd927 100644
---
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/BigramTables.java
+++
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/BigramTables.java
@@ -36,12 +36,11 @@ import java.nio.ByteOrder;
* script. Codepoint → dense index is a binary search; index →
* codepoint is direct array access. Typical sizes: ~7K-15K for HAN,
* ~200-500 for most other scripts.
- * <li>{@code bigramKeys} / {@code bigramValues} — parallel arrays
- * implementing an open-addressed hash table with linear probing.
- * Each key is a 32-bit value {@code (idxA << 16) | idxB}; key {@code
- * -1} means "empty slot." Indices are bounded at 16 bits (65535),
- * which is comfortably above the largest per-script codepoint count
- * we observe.
+ * <li>{@code bigramKeys} / {@code bigramValues} — parallel arrays of the
+ * occupied entries only, sorted ascending by key for binary-search
+ * lookup. Each key is a 32-bit value {@code (idxA << 16) | idxB}.
+ * Indices are bounded at 16 bits (65535), comfortably above the
+ * largest per-script codepoint count we observe.
* <li>{@code unigramTable} — {@code byte[numCodepoints]}, quantized
* unigram log-probabilities indexed by the same codepoint→index map.
* <li>{@code bigramQuantMin/Max}, {@code unigramQuantMin/Max} —
@@ -56,14 +55,12 @@ import java.nio.ByteOrder;
* independence sum.
* </ul>
*
- * <p>Membership semantics: no Bloom filter. The empty-slot sentinel is
- * the membership oracle — a pair is "seen" iff binary-search finds both
- * codepoints in the index AND a probe sequence hits a matching key before
- * an empty slot. Lookups are therefore exact.
+ * <p>Membership semantics: no Bloom filter. A pair is "seen" iff
+ * binary-search finds both codepoints in the index AND finds the packed
+ * key in {@code bigramKeys}. Lookups are therefore exact.
*
- * <p>Fields are package-private so the
- * {@link org.apache.tika.ml.junkdetect.tools.TrainJunkModel} trainer can
- * construct instances directly without going through accessors.
+ * <p>Instances are built by the trainer ({@code TrainJunkModel}, in the
+ * tika-ml-junkdetect-tools module) and read back via {@link #readFrom}.
*/
public final class BigramTables {
@@ -124,14 +121,23 @@ public final class BigramTables {
cpBuf.asIntBuffer().put(codepointIndex);
dos.write(cpBuf.array());
- // Bigram open-addressing table (keys + values).
+ // Bigram table: sorted-occupied keys (ascending) + parallel values.
+ // Store key[0] raw, then varint (LEB128) deltas from the previous key;
+ // deltas are small because the keys are sorted and dense.
dos.writeInt(bigramKeys.length);
dos.writeFloat(bigramQuantMin);
dos.writeFloat(bigramQuantMax);
- ByteBuffer keyBuf = ByteBuffer.allocate(bigramKeys.length * 4)
- .order(ByteOrder.BIG_ENDIAN);
- keyBuf.asIntBuffer().put(bigramKeys);
- dos.write(keyBuf.array());
+ if (bigramKeys.length > 0) {
+ dos.writeInt(bigramKeys[0]);
+ for (int i = 1; i < bigramKeys.length; i++) {
+ long delta = (long) bigramKeys[i] - (long) bigramKeys[i - 1];
+ if (delta <= 0) {
+ throw new IOException("bigramKeys must be strictly ascending "
+ + "(no duplicates); non-increasing at index " + i);
+ }
+ writeVarLong(dos, delta);
+ }
+ }
dos.write(bigramValues);
// Unigram table.
@@ -153,9 +159,18 @@ public final class BigramTables {
int slots = dis.readInt();
float bMin = dis.readFloat();
float bMax = dis.readFloat();
- byte[] keyBytes = dis.readNBytes(slots * 4);
int[] keys = new int[slots];
-
ByteBuffer.wrap(keyBytes).order(ByteOrder.BIG_ENDIAN).asIntBuffer().get(keys);
+ if (slots > 0) {
+ keys[0] = dis.readInt();
+ for (int i = 1; i < slots; i++) {
+ long next = (long) keys[i - 1] + readVarLong(dis);
+ if (next <= keys[i - 1] || next > Integer.MAX_VALUE) {
+ throw new IOException("Corrupt bigram keys: not strictly "
+ + "ascending / out of range at index " + i);
+ }
+ keys[i] = (int) next;
+ }
+ }
byte[] values = dis.readNBytes(slots);
float uMin = dis.readFloat();
@@ -167,11 +182,36 @@ public final class BigramTables {
bMin, bMax, uMin, uMax, uFallback, backoffAlpha);
}
+ /** Writes a non-negative long as an unsigned LEB128 varint. */
+ private static void writeVarLong(DataOutputStream dos, long v) throws IOException {
+ while ((v & ~0x7FL) != 0) {
+ dos.writeByte((int) ((v & 0x7F) | 0x80));
+ v >>>= 7;
+ }
+ dos.writeByte((int) v);
+ }
+
+ /** Reads an unsigned LEB128 varint written by {@link #writeVarLong}. */
+ private static long readVarLong(DataInputStream dis) throws IOException {
+ long v = 0;
+ int shift = 0;
+ int b;
+ do {
+ if (shift >= 64) {
+ throw new IOException("Malformed varint in bigram key deltas (too long)");
+ }
+ b = dis.readUnsignedByte();
+ v |= (long) (b & 0x7F) << shift;
+ shift += 7;
+ } while ((b & 0x80) != 0);
+ return v;
+ }
+
/**
* Returns a one-line summary for trainer progress output.
*/
public String statsString() {
- return String.format(
+ return String.format(java.util.Locale.ROOT,
" cp_index=%d, bigram_slots=%d (load≈%.2f), "
+ "bigram_range=[%.3f, %.3f], unigram_range=[%.3f, %.3f]",
codepointIndex.length, bigramKeys.length,
diff --git
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
index 2f117479d0..b63f8787e8 100644
---
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
+++
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
@@ -235,41 +235,41 @@ public final class JunkDetector implements
TextQualityDetector {
* [4 bytes] num_scripts (int BE)
* [1 byte] block_scheme_version (must equal
* {@link UnicodeBlockRanges#SCHEME_VERSION})
+ * // z4 — global script-transition section
* [1 byte] num_script_buckets
* for each bucket:
* [2 bytes] name length (ushort BE)
* [name bytes] bucket name (UTF-8)
- * [num_script_buckets² × 4 bytes] script-transition log-prob table (F4)
- * [4 bytes] mu4 (float32 BE)
- * [4 bytes] sigma4 (float32 BE)
+ * [4 bytes] scriptTrans_quant_min (float32 BE)
+ * [4 bytes] scriptTrans_quant_max (float32 BE)
+ * [num_script_buckets² × 2 bytes] script-transition table (z4, int16-quantized)
+ * [4 bytes] mu4 (z4 calibration, float32 BE)
+ * [4 bytes] sigma4
+ * // z2 — global block-transition section
+ * [4 bytes] block_quant_min (float32 BE)
+ * [4 bytes] block_quant_max (float32 BE)
+ * [block_N² × 2 bytes] block-transition table (z2, int16-quantized)
+ * [4 bytes] mu2 (z2 calibration)
+ * [4 bytes] sigma2
+ * // global per-feature calibrations, {mu, sigma} float32 pairs
+ * [8 bytes] z3 calibration (control-byte ratio)
+ * [8 bytes] z5 calibration (letter-adjacent-to-mark)
+ * [8 bytes] z6 calibration (replacement-char ratio)
+ * [8 bytes] z9 calibration (script-alternation)
+ * // global combiner
+ * [1 byte] num_features
+ * [(num_features+1) × 4 bytes] combiner weights w1..wN and bias
+ * // per-script section
* for each script (sorted by name):
* [2 bytes] name length
* [name bytes] script name (UTF-8)
- * [4 bytes] mu1 (F1 calibration, codepoint-bigram mean log-prob)
+ * [4 bytes] mu1 (z1 calibration, codepoint-bigram mean log-prob)
* [4 bytes] sigma1
- * // bigram tables for this script — see {@link BigramTables#writeTo}
- * [4 bytes] backoff_alpha (float32 BE)
- * [4 bytes] codepoint_count
- * [codepoint_count × 4 bytes] codepoint index (sorted, ascending)
- * [4 bytes] bigram_slots (power of 2)
- * [4 bytes] bigram_quant_min (float32 BE)
- * [4 bytes] bigram_quant_max (float32 BE)
- * [bigram_slots × 4 bytes] bigram open-addressing keys
- * ((idxA<<16)|idxB, or {@link
BigramTables#EMPTY_KEY})
- * [bigram_slots bytes] bigram values (8-bit quantized log-probs)
- * [4 bytes] unigram_quant_min (float32 BE)
- * [4 bytes] unigram_quant_max (float32 BE)
- * [4 bytes] unigram_fallback_log_prob (float32 BE; used for
- * codepoints not in index)
- * [codepoint_count bytes] unigram values (8-bit quantized log-probs)
- * // F2/F3/classifier
- * [4 bytes] mu2 (F2 calibration)
- * [4 bytes] sigma2
- * [block_N² × 4 bytes] block-transition log-prob table (F2)
- * [4 bytes] mu3 (F3 calibration)
- * [4 bytes] sigma3
- * [1 byte] num_features
- * [(num_features+1) × 4 bytes] classifier weights w1..wN and bias
+ * [variable] bigram + unigram tables — exact layout in
+ * {@link BigramTables#writeTo}: codepoint index, then the
+ * sorted-occupied bigram keys (key[0] as int32 BE followed
+ * by LEB128 varint deltas) and 8-bit quantized bigram and
+ * unigram log-prob values
* </pre>
*/
public static JunkDetector load(InputStream rawIs) throws IOException {
@@ -498,6 +498,16 @@ public final class JunkDetector implements
TextQualityDetector {
int[] cps = text.codePoints().toArray();
Map<String, double[]> buckets = new HashMap<>(); // script ->
{sumLogP, count}
+ // Left-index memo. forEachScriptBigram emits (^,x),(x,y),(y,$)... so within
+ // a run each pair's right codepoint b is the next pair's left codepoint a.
+ // Reuse the previous pair's right-index as this pair's left-index when they
+ // match (same codepoint AND same script => same table), so each codepoint is
+ // binary-searched in the script's index once instead of twice. Bit-identical
+ // to scoring each pair independently; the guard falls back to a fresh search
+ // whenever the overlap doesn't hold (run boundary, sentinel, script change).
+ String[] lastScript = {null};
+ int[] lastB = {Integer.MIN_VALUE};
+ int[] lastBIdx = {-1};
forEachScriptBigram(cps, (script, a, b) -> {
if (!calibrations.containsKey(script)) {
return;
@@ -506,7 +516,13 @@ public final class JunkDetector implements
TextQualityDetector {
if (t == null) {
return;
}
- double lp = computeF1MeanLogP(new int[]{a, b}, t);
+ int idxA = (a == lastB[0] && script.equals(lastScript[0]))
+ ? lastBIdx[0] : codepointToIndex(t, a);
+ int idxB = codepointToIndex(t, b);
+ lastScript[0] = script;
+ lastB[0] = b;
+ lastBIdx[0] = idxB;
+ double lp = scorePairF1(a, idxA, b, idxB, t);
if (Double.isNaN(lp)) {
return;
}
@@ -839,7 +855,7 @@ public final class JunkDetector implements
TextQualityDetector {
for (int i = 0; i < text.length(); ) {
int cp = text.codePointAt(i);
i += Character.charCount(cp);
- Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+ Character.UnicodeScript s = TextQualityFeatures.scriptOf(cp);
if (s == Character.UnicodeScript.COMMON
|| s == Character.UnicodeScript.INHERITED
|| s == Character.UnicodeScript.UNKNOWN) {
@@ -970,22 +986,6 @@ public final class JunkDetector implements
TextQualityDetector {
return java.util.Arrays.binarySearch(tables.codepointIndex, cp);
}
- /**
- * Mixing function used to scatter packed (idxA, idxB) keys across
- * the open-addressing table. A simple integer finalizer (splitmix32
- * style) gives good distribution for sequential index values.
- *
- * <p>Public so the trainer's open-addressing insertion routine uses
- * the same probe order as inference — drift here would silently
- * corrupt every lookup.
- */
- public static int mixIndexKey(int packedKey) {
- int x = packedKey;
- x = (x ^ (x >>> 16)) * 0x7feb352d;
- x = (x ^ (x >>> 15)) * 0x846ca68b;
- x = x ^ (x >>> 16);
- return x;
- }
/**
* Packed bigram key for indices {@code (a, b)} where each index fits in
@@ -1085,20 +1085,12 @@ public final class JunkDetector implements
TextQualityDetector {
* for {@code (idxA, idxB)}, or {@code -1} if not present (probe hit an
* empty slot first).
*
- * <p>Linear probing with the same mix-hash used at training time —
- * required for the table to be readable, not just writable.
+ * <p>{@code bigramKeys} is sorted ascending (signed), so this is a binary search.
*/
static int lookupBigramSlot(BigramTables tables, int idxA, int idxB) {
int packedKey = packBigramKey(idxA, idxB);
- int[] keys = tables.bigramKeys;
- int mask = keys.length - 1;
- int h = mixIndexKey(packedKey) & mask;
- while (true) {
- int k = keys[h];
- if (k == BigramTables.EMPTY_KEY) return -1;
- if (k == packedKey) return h;
- h = (h + 1) & mask;
- }
+ int slot = java.util.Arrays.binarySearch(tables.bigramKeys, packedKey);
+ return slot >= 0 ? slot : -1;
}
private static double unigramLogProb(BigramTables tables, int idx) {
@@ -1134,7 +1126,7 @@ public final class JunkDetector implements
TextQualityDetector {
for (int i = 0; i < text.length(); ) {
int cp = text.codePointAt(i);
i += Character.charCount(cp);
- Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+ Character.UnicodeScript s = TextQualityFeatures.scriptOf(cp);
if (s == Character.UnicodeScript.COMMON
|| s == Character.UnicodeScript.INHERITED
|| s == Character.UnicodeScript.UNKNOWN) {
@@ -1170,7 +1162,7 @@ public final class JunkDetector implements
TextQualityDetector {
/** COMMON-class predicate: COMMON, INHERITED, UNKNOWN all pool into
COMMON. */
static String classKey(int cp) {
- Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+ Character.UnicodeScript s = TextQualityFeatures.scriptOf(cp);
if (s == Character.UnicodeScript.COMMON
|| s == Character.UnicodeScript.INHERITED
|| s == Character.UnicodeScript.UNKNOWN) {
@@ -1381,7 +1373,7 @@ public final class JunkDetector implements
TextQualityDetector {
Map<Character.UnicodeScript, Integer> counts = new HashMap<>();
for (int i = 0; i < text.length(); ) {
int cp = text.codePointAt(i);
- Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+ Character.UnicodeScript s = TextQualityFeatures.scriptOf(cp);
if (s != Character.UnicodeScript.COMMON
&& s != Character.UnicodeScript.INHERITED
&& s != Character.UnicodeScript.UNKNOWN) {
diff --git
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
index b8cb75de01..0f627744bb 100644
---
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
+++
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
@@ -35,6 +35,7 @@ import org.slf4j.LoggerFactory;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.detect.CharsetSupersets;
import org.apache.tika.detect.EncodingDetectorContext;
+import org.apache.tika.detect.EncodingProbeCache;
import org.apache.tika.detect.EncodingResult;
import org.apache.tika.detect.HighByteLetterStats;
import org.apache.tika.detect.MetaEncodingDetector;
@@ -156,7 +157,7 @@ public class JunkFilterEncodingDetector implements
MetaEncodingDetector {
return Collections.emptyList();
}
- byte[] bytes = readProbe(tis);
+ byte[] bytes = readProbe(tis, context);
if (bytes == null || bytes.length == 0) {
context.setArbitrationInfo("junk-filter-empty-stream");
return Collections.emptyList();
@@ -246,31 +247,20 @@ public class JunkFilterEncodingDetector implements
MetaEncodingDetector {
Charset champion = null;
double championZ = Double.NEGATIVE_INFINITY;
Map<Charset, Double> scoreByCharset = new LinkedHashMap<>();
- Map<Charset, Double> diffByCharset = new LinkedHashMap<>();
- // Dedup by text: [0] = whole-text z (the champion + anchor metric,
kept
- // exactly as before); [1] = script-letter "diff" z (codepoints >= 0x80
- // that are letters/ideographs — the high bytes where the candidate
- // decodes actually differ), used ONLY for the family gate below.
- Map<String, float[]> zByText = new HashMap<>();
+ // Whole-text z (the champion + anchor metric), deduped by decoded
text.
+ Map<String, Float> wholeZByText = new HashMap<>();
for (Map.Entry<Charset, String> entry : candidates.entrySet()) {
String text = entry.getValue();
- float[] zs = zByText.get(text);
- if (zs == null) {
+ Float wholeZ = wholeZByText.get(text);
+ if (wholeZ == null) {
org.apache.tika.quality.TextQualityScore sc =
qualityDetector.score(text);
- float wholeZ = sc.isUnknown() ? Float.NEGATIVE_INFINITY :
sc.getZScore();
- String diff = scriptLetters(text);
- float diffZ = Float.NEGATIVE_INFINITY;
- if (!diff.isEmpty()) {
- org.apache.tika.quality.TextQualityScore d =
qualityDetector.score(diff);
- diffZ = d.isUnknown() ? Float.NEGATIVE_INFINITY :
d.getZScore();
- }
- zs = new float[]{wholeZ, diffZ};
- zByText.put(text, zs);
+ wholeZ = sc.isUnknown() ? Float.NEGATIVE_INFINITY :
sc.getZScore();
+ wholeZByText.put(text, wholeZ);
}
- scoreByCharset.put(entry.getKey(), (double) zs[0]);
- diffByCharset.put(entry.getKey(), (double) zs[1]);
- if (zs[0] > championZ) {
- championZ = zs[0];
+ double z = wholeZ;
+ scoreByCharset.put(entry.getKey(), z);
+ if (z > championZ) {
+ championZ = z;
champion = entry.getKey();
}
}
@@ -284,34 +274,53 @@ public class JunkFilterEncodingDetector implements
MetaEncodingDetector {
// CJK/non-CJK BOUNDARY for COMMON-dominated docs (markup/digits/punct
// decode identically and swamp the few discriminating high bytes),
// producing false-CJK and real-CJK demotion. The script-letter
"diff" z
- // reads that boundary cleanly (coherent CJK vs garbage), so use it to
- // decide ONLY the family; within a family the whole-text champion
stands
- // (Latin-vs-Latin etc. untouched — a blanket diff-score regressed
there).
- // Override only on a clear diff margin.
- double bestCjkDiff = Double.NEGATIVE_INFINITY;
- double bestNonCjkDiff = Double.NEGATIVE_INFINITY;
- for (Map.Entry<Charset, Double> e : diffByCharset.entrySet()) {
- if (isCjkCharset(e.getKey().name())) {
- bestCjkDiff = Math.max(bestCjkDiff, e.getValue());
- } else {
- bestNonCjkDiff = Math.max(bestNonCjkDiff, e.getValue());
+ // (codepoints >= 0x80 that are letters/ideographs — the high bytes where
+ // candidate decodes actually differ) reads that boundary cleanly, so use
+ // it to decide ONLY the family; within a family the whole-text champion
+ // stands (Latin-vs-Latin etc. untouched — a blanket diff-score regressed).
+ //
+ // DEMOTE-ONLY and CJK-champion-only: the gate fires only to demote a CJK
+ // champion to non-CJK (the false-CJK fix). The reverse (promote non-CJK
+ // -> CJK) is NOT done: measured at 29k, the diff z reliably says "this CJK
+ // pick is really non-CJK" (OOV improves on every such flip) but UNreliably
+ // the reverse (the junk model over-rates ideograph mojibake vs sparse
+ // Latin letters); the promote direction is also unnecessary — genuine CJK
+ // is html-meta-declared upstream. Because the gate can only act when the
+ // champion is CJK, the second "diff" score per candidate is needed ONLY
+ // then — compute it lazily and skip it entirely for the common non-CJK
+ // champion (halving the score() calls there).
+ if (isCjkCharset(champion.name())) {
+ double bestCjkDiff = Double.NEGATIVE_INFINITY;
+ double bestNonCjkDiff = Double.NEGATIVE_INFINITY;
+ Map<String, Float> diffZByText = new HashMap<>();
+ for (Map.Entry<Charset, String> entry : candidates.entrySet()) {
+ String text = entry.getValue();
+ Float diffZ = diffZByText.get(text);
+ if (diffZ == null) {
+ String diff = scriptLetters(text);
+ float dz = Float.NEGATIVE_INFINITY;
+ if (!diff.isEmpty()) {
+ org.apache.tika.quality.TextQualityScore d = qualityDetector.score(diff);
+ dz = d.isUnknown() ? Float.NEGATIVE_INFINITY : d.getZScore();
+ }
+ diffZ = dz;
+ diffZByText.put(text, diffZ);
+ }
+ double dz = diffZ;
+ if (isCjkCharset(entry.getKey().name())) {
+ bestCjkDiff = Math.max(bestCjkDiff, dz);
+ } else {
+ bestNonCjkDiff = Math.max(bestNonCjkDiff, dz);
+ }
}
- }
- // DEMOTE-ONLY: fire only to demote a CJK champion to non-CJK when the
- // diff z clearly prefers non-CJK (the false-CJK fix). The reverse
- // (promote non-CJK -> CJK) is NOT done: measured at 29k, the diff z
- // reliably says "this CJK pick is really non-CJK" (OOV improves on
every
- // such flip) but UNreliably says "this non-CJK pick is really CJK"
(the
- // junk model over-rates ideograph mojibake vs sparse Latin letters —
OOV
- // worsened on every promote flip). The promote direction is also
- // unnecessary: genuine CJK is html-meta-declared upstream.
- if (isCjkCharset(champion.name())
- && bestNonCjkDiff > bestCjkDiff + FAMILY_DIFF_MARGIN) {
- Charset reFam = bestInFamily(scoreByCharset, false);
- if (reFam != null) {
- LOG.trace("junk-filter family gate: {} (CJK) -> {} (non-CJK by
diff z)",
- champion.name(), reFam.name());
- champion = reFam;
+ // Override only on a clear diff margin.
+ if (bestNonCjkDiff > bestCjkDiff + FAMILY_DIFF_MARGIN) {
+ Charset reFam = bestInFamily(scoreByCharset, false);
+ if (reFam != null) {
+ LOG.trace("junk-filter family gate: {} (CJK) -> {} (non-CJK by diff z)",
+ champion.name(), reFam.name());
+ champion = reFam;
+ }
}
}
@@ -584,9 +593,21 @@ public class JunkFilterEncodingDetector implements
MetaEncodingDetector {
return true;
}
- private byte[] readProbe(TikaInputStream tis) throws IOException {
+ private byte[] readProbe(TikaInputStream tis, EncodingDetectorContext
context)
+ throws IOException { // context may be null; in that case no probe cache is consulted
// readLimit is the tag-stripped content target; cap raw reads at 512
KB.
- byte[] probe = AdaptiveProbe.read(tis, readLimit,
AdaptiveProbe.DEFAULT_RAW_CAP);
+ int rawCap = AdaptiveProbe.DEFAULT_RAW_CAP;
+ EncodingProbeCache cache = context == null ? null :
context.getProbeCache();
+ if (cache != null) {
+ byte[] cached = cache.get(readLimit, rawCap); // looked up with the same (readLimit, rawCap) pair that put() is given below
+ if (cached != null) {
+ return cached.length == 0 ? null : cached; // cached empty probe maps to null, same as the uncached path
+ }
+ }
+ byte[] probe = AdaptiveProbe.read(tis, readLimit, rawCap);
+ if (cache != null) {
+ cache.put(probe, readLimit, rawCap); // stored before the empty check, so an empty probe is cached as well
+ }
return probe.length == 0 ? null : probe;
}
diff --git
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java
index 2ae926a927..b5b7c5aeb2 100644
---
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java
+++
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java
@@ -43,6 +43,46 @@ public final class TextQualityFeatures {
private TextQualityFeatures() {
}
+ // -----------------------------------------------------------------------
+ // Memoized Unicode-script lookup
+ // -----------------------------------------------------------------------
+
+ /** Cached {@code UnicodeScript.values()} so an ordinal->enum lookup
never
+ * re-allocates the values array. */
+ private static final Character.UnicodeScript[] SCRIPT_VALUES =
+ Character.UnicodeScript.values();
+
+ /**
+ * Memoized {@link Character.UnicodeScript#of(int)} for the BMP. Scoring a
+ * document classifies every codepoint's script ~5 times (z4/z7/z8/z9 plus
the
+ * z1 bigram bucketing), and {@code UnicodeScript.of} is a binary search
over
+ * the script-range table (measured ~12 ns/cp, 10-20x {@code
Character.getType}).
+ * The result is a pure function of the codepoint for a given JVM, so
cache it:
+ * BMP codepoints (>99% of text) become an O(1) array lookup after first
+ * sight, shared across every call site and every document. An entry of 0 means
"not
+ * yet computed"; otherwise it holds {@code ordinal + 1}. The fill is a benign
data race
+ * — every writer stores the same deterministic value and {@code short}
writes
+ * do not tear.
+ */
+ private static final short[] BMP_SCRIPT_CACHE = new short[0x10000];
+
+ /**
+ * Returns the Unicode script of {@code codePoint}, memoized for the BMP.
+ * The result is the same singleton enum constant that
+ * {@link Character.UnicodeScript#of(int)} returns.
+ *
+ * @param codePoint the code point to classify; values outside
+ * {@code [0, 0xFFFF]} bypass the cache and are passed straight to
+ * {@code Character.UnicodeScript.of}
+ * @return the script of {@code codePoint}
+ * @throws IllegalArgumentException if {@code codePoint} is not a valid
+ * Unicode code point (raised by {@code Character.UnicodeScript.of})
+ */
+ static Character.UnicodeScript scriptOf(int codePoint) {
+ if (codePoint >= 0 && codePoint < 0x10000) {
+ short v = BMP_SCRIPT_CACHE[codePoint]; // 0 = not cached yet, otherwise script ordinal + 1
+ if (v != 0) {
+ return SCRIPT_VALUES[v - 1];
+ }
+ Character.UnicodeScript s = Character.UnicodeScript.of(codePoint);
+ BMP_SCRIPT_CACHE[codePoint] = (short) (s.ordinal() + 1); // +1 keeps 0 free as the "empty" marker
+ return s;
+ }
+ return Character.UnicodeScript.of(codePoint); // supplementary or out-of-range input: no caching
+ }
+
// -----------------------------------------------------------------------
// Strip modes
// -----------------------------------------------------------------------
@@ -99,7 +139,7 @@ public final class TextQualityFeatures {
return type == Character.CONTROL || type == Character.FORMAT;
}
case ALL_COMMON: {
- Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+ Character.UnicodeScript s = scriptOf(cp);
return s == Character.UnicodeScript.COMMON
|| s == Character.UnicodeScript.INHERITED
|| s == Character.UnicodeScript.UNKNOWN;
@@ -398,7 +438,7 @@ public final class TextQualityFeatures {
continue;
}
total++;
- Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+ Character.UnicodeScript s = scriptOf(cp);
if (s != Character.UnicodeScript.COMMON
&& s != Character.UnicodeScript.INHERITED
&& s != Character.UnicodeScript.UNKNOWN) {
@@ -442,7 +482,7 @@ public final class TextQualityFeatures {
for (int i = 0; i < text.length(); ) {
int cp = text.codePointAt(i);
i += Character.charCount(cp);
- Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+ Character.UnicodeScript s = scriptOf(cp);
if (s == Character.UnicodeScript.COMMON
|| s == Character.UnicodeScript.INHERITED
|| s == Character.UnicodeScript.UNKNOWN) {
@@ -508,7 +548,7 @@ public final class TextQualityFeatures {
for (int i = 0; i < text.length(); ) {
int cp = text.codePointAt(i);
i += Character.charCount(cp);
- Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+ Character.UnicodeScript s = scriptOf(cp);
if (s == Character.UnicodeScript.COMMON
|| s == Character.UnicodeScript.INHERITED
|| s == Character.UnicodeScript.UNKNOWN) {
@@ -538,7 +578,7 @@ public final class TextQualityFeatures {
for (int i = 0; i < text.length(); ) {
int cp = text.codePointAt(i);
i += Character.charCount(cp);
- Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+ Character.UnicodeScript s = scriptOf(cp);
if (s == Character.UnicodeScript.COMMON
|| s == Character.UnicodeScript.INHERITED
|| s == Character.UnicodeScript.UNKNOWN) {
@@ -616,7 +656,7 @@ public final class TextQualityFeatures {
}
private static String scriptClusterOf(int cp) {
- Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+ Character.UnicodeScript s = scriptOf(cp);
switch (s) {
case HAN:
case HIRAGANA:
diff --git
a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
index ead028cbb3..06179e75e6 100644
Binary files
a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
and
b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
differ
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/pom.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/pom.xml
index b558f836a2..ead93cf083 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/pom.xml
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/pom.xml
@@ -30,10 +30,13 @@
<name>Apache Tika html parser module</name>
<dependencies>
+ <!-- Test scope: shipped parser stays decoupled from any specific
+ EncodingDetector; tests still exercise real <meta charset> handling.
-->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-encoding-detector-html</artifactId>
<version>${project.version}</version>
+ <scope>test</scope>
</dependency>
<dependency>