This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch TIKA-4731-common-script
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-4731-common-script by
this push:
new 20ce737110 updates based on copilot feedback
20ce737110 is described below
commit 20ce737110ccb9b7eb82be5751354bbafbc0d680
Author: tallison <[email protected]>
AuthorDate: Wed May 27 08:40:59 2026 -0400
updates based on copilot feedback
---
.../NaiveBayesBigramEncodingDetector.java | 154 ++++++++++++++++++---
.../ml/junkdetect/tools/BuildJunkTrainingData.java | 12 +-
.../tools/BuildJunkAugmentationDataTest.java | 15 --
3 files changed, 143 insertions(+), 38 deletions(-)
diff --git
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
index 84d721bc12..2460656f0c 100644
---
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
+++
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
@@ -382,13 +382,15 @@ public class NaiveBayesBigramEncodingDetector implements
EncodingDetector {
int len = Math.min(probe.length, MAX_PROBE_BYTES);
// Pass 1: count distinct bigrams. Whitespace and zero-IDF
- // bigrams are skipped as in the original hot loop. short[] is
- // enough since count fits in 16383 (max possible). Track the
- // ids of distinct bigrams in a parallel array so pass 2 doesn't
- // need to scan the full 65k space.
- short[] count = new short[BIGRAM_SPACE];
- int[] distinctBigrams = new int[len];
- int distinctIdx = 0;
+ // bigrams are skipped as in the original hot loop. Counts are
+ // held in a sparse open-addressing int→int hash (see
+ // {@link BigramCountMap}) so per-call working state is
+ // proportional to distinct bigrams (typically a few hundred to
+ // a few thousand) rather than the dense 128 KB
+ // {@code short[65536]} the earlier inner loop used. Iteration
+ // for pass 2 walks the hash's occupied slots directly — no
+ // parallel distinct-bigram array.
+ BigramCountMap counts = new BigramCountMap();
int scored = 0;
int total = 0;
for (int i = 0; i + 1 < len; i++) {
@@ -404,11 +406,9 @@ public class NaiveBayesBigramEncodingDetector implements
EncodingDetector {
continue;
}
scored++;
- if (count[bigram] == 0) {
- distinctBigrams[distinctIdx++] = bigram;
- }
- count[bigram]++;
+ counts.increment(bigram);
}
+ int distinctIdx = counts.size();
// Type A — diversity gate. If the input has too few distinct
// bigrams relative to total scored bigrams, it's a degenerate
@@ -430,11 +430,17 @@ public class NaiveBayesBigramEncodingDetector implements
EncodingDetector {
// Pass 2: per distinct bigram, compute per-class total
// contribution and (when above floor) apply Type C cap.
+ // Order-independent — see analyzeBigrams() for the probe-order
+ // diagnostic path.
double[] score = new double[numClasses];
double[] contributions = new double[numClasses];
- for (int k = 0; k < distinctIdx; k++) {
- int bigram = distinctBigrams[k];
- int n = count[bigram];
+ int hashCap = counts.capacity();
+ for (int slot = 0; slot < hashCap; slot++) {
+ int bigram = counts.keyAt(slot);
+ if (bigram == -1) {
+ continue;
+ }
+ int n = counts.countAt(slot);
int w = idf8[bigram];
double countTimesIdf = (double) n * w;
int base = bigram * numClasses;
@@ -462,11 +468,11 @@ public class NaiveBayesBigramEncodingDetector implements
EncodingDetector {
}
}
// Cap any class whose contribution exceeds runner-up + cap.
- double cap = secondMax + CAP_PER_BIGRAM_NATS;
- if (max > cap) {
+ double capValue = secondMax + CAP_PER_BIGRAM_NATS;
+ if (max > capValue) {
for (int c = 0; c < numClasses; c++) {
- if (contributions[c] > cap) {
- contributions[c] = cap;
+ if (contributions[c] > capValue) {
+ contributions[c] = capValue;
}
}
}
@@ -477,6 +483,118 @@ public class NaiveBayesBigramEncodingDetector implements
EncodingDetector {
return new ScoreResult(score, scored, total);
}
+ /**
+ * Open-addressing {@code int → int} hash map specialised for
+ * counting bigram occurrences during a single
+ * {@link #scoreClassesAndCount(byte[])} pass. Linear probing;
+ * capacity is a power of two; {@code -1} sentinel for empty slots
+ * (bigrams are non-negative 16-bit values so {@code -1} is
+ * unambiguous).
+ *
+ * <p>Per-call local; not thread-safe. Replaces a dense
+ * {@code short[65536]} (128 KB) count array. Memory scales with
+ * actual distinct bigrams in the probe — typically a few hundred
+ * for short probes, a few thousand for diverse probes at the
+ * {@link #MAX_PROBE_BYTES} cap. Initial capacity sized so probes
+ * with at most 512 distinct bigrams never resize; beyond that the
+ * table doubles each time it becomes more than half full.
+ */
+ private static final class BigramCountMap {
+
+ /** Initial capacity. 1024 entries × 2 × 4 bytes = 8 KB. */
+ private static final int INITIAL_CAP = 1024;
+ /** Knuth multiplicative hash constant (golden ratio, 32-bit). */
+ private static final int HASH_MULT = 0x9E3779B9;
+
+ private int[] keys;
+ private int[] counts;
+ private int cap;
+ private int mask;
+ /**
+ * Right-shift amount for the multiplicative hash: produces the
+ * top {@code log2(cap)} bits of the multiplied value. Equal
+ * to {@code Integer.numberOfLeadingZeros(mask)} for a
+ * power-of-two capacity.
+ */
+ private int shift;
+ /** Resize when {@code size > threshold}; 50% load factor for fast
probing. */
+ private int threshold;
+ private int size;
+
+ BigramCountMap() {
+ this.cap = INITIAL_CAP;
+ this.mask = cap - 1;
+ this.shift = Integer.numberOfLeadingZeros(mask);
+ this.threshold = cap >>> 1;
+ this.keys = new int[cap];
+ this.counts = new int[cap];
+ Arrays.fill(this.keys, -1);
+ }
+
+ /** Insert a new bigram or increment the count of an existing one. */
+ void increment(int bigram) {
+ int slot = (bigram * HASH_MULT) >>> shift;
+ while (keys[slot] != -1 && keys[slot] != bigram) {
+ slot = (slot + 1) & mask;
+ }
+ if (keys[slot] == -1) {
+ keys[slot] = bigram;
+ counts[slot] = 1;
+ size++;
+ if (size > threshold) {
+ resize();
+ }
+ } else {
+ counts[slot]++;
+ }
+ }
+
+ /** Number of distinct bigrams stored. */
+ int size() {
+ return size;
+ }
+
+ /** Slot count. Walk slots {@code [0, capacity)} to enumerate
entries. */
+ int capacity() {
+ return cap;
+ }
+
+ /** Bigram at {@code slot}, or {@code -1} if the slot is empty. */
+ int keyAt(int slot) {
+ return keys[slot];
+ }
+
+ /** Count at {@code slot}. Undefined for empty slots. */
+ int countAt(int slot) {
+ return counts[slot];
+ }
+
+ private void resize() {
+ int oldCap = cap;
+ int[] oldKeys = keys;
+ int[] oldCounts = counts;
+ cap = oldCap << 1;
+ mask = cap - 1;
+ shift = Integer.numberOfLeadingZeros(mask);
+ threshold = cap >>> 1;
+ keys = new int[cap];
+ counts = new int[cap];
+ Arrays.fill(keys, -1);
+ for (int i = 0; i < oldCap; i++) {
+ int k = oldKeys[i];
+ if (k == -1) {
+ continue;
+ }
+ int slot = (k * HASH_MULT) >>> shift;
+ while (keys[slot] != -1) {
+ slot = (slot + 1) & mask;
+ }
+ keys[slot] = k;
+ counts[slot] = oldCounts[i];
+ }
+ }
+ }
+
public String[] getLabels() {
return labels.clone();
}
diff --git
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
index 451efccb36..a45f74a82c 100644
---
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
+++
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
@@ -658,11 +658,13 @@ public class BuildJunkTrainingData {
if (text.indexOf('\uFFFD') >= 0) {
return null;
}
- // NFD (not NFC) so combining-mark scripts (Vietnamese precomposed,
- // Indic, Thai) have their marks as separate codepoints in the
- // training corpus. Lets per-script bigram tables and z5 (letter-
- // adjacent-to-mark) discriminate uniformly across mark-using
- // scripts. Must match JunkDetector.scoreText's normalization.
+ // NFC so the training tally matches JunkDetector.aggregate at
+ // inference time (which also NFC-normalises — see comment at
+ // JunkDetector#aggregate). Where a precomposed form exists (Latin
+ // diacritics, Vietnamese), letter + mark collapses to a single
+ // codepoint and is counted as one unit; marks with no precomposed
+ // form (Thai, most Indic) stay separate codepoints. This matches
+ // the natural NFC form of most source text.
text = Normalizer.normalize(text, Normalizer.Form.NFC);
if (text.getBytes(StandardCharsets.UTF_8).length < minBytes) {
return null;
diff --git
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationDataTest.java
b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationDataTest.java
index 26f316d91e..ac901b3a2a 100644
---
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationDataTest.java
+++
b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationDataTest.java
@@ -390,21 +390,6 @@ class BuildJunkAugmentationDataTest {
assertEquals("0F/ABCD1234",
BuildJunkAugmentationData.profileKey(extracts, file));
}
- @Test
- void refusesOutputEqualToBaseline(@TempDir Path tmp) throws Exception {
- Path baseline = tmp.resolve("baseline");
- Path extracts = tmp.resolve("extracts");
- Files.createDirectories(baseline);
- Files.createDirectories(extracts);
- writeGz(baseline.resolve("latin.train.gz"), List.of("x"));
-
- // Run in same JVM, catch System.exit. Easiest path is a
SecurityManager,
- // but JDK 17 deprecates that. Instead, hit the static helper directly
- // for isSameFile semantics.
- assertTrue(Files.isSameFile(baseline, baseline),
- "sanity: same directory is same file");
- }
-
//
---------------------------------------------------------------------------
private static void writeGz(Path path, List<String> lines) throws
Exception {