(tika) branch TIKA-4731-common-script updated: updates based on copilot feedback

tallison Wed, 27 May 2026 05:41:19 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch TIKA-4731-common-script
in repository https://gitbox.apache.org/repos/asf/tika.git



The following commit(s) were added to refs/heads/TIKA-4731-common-script by 
this push:
     new 20ce737110 updates based on copilot feedback
20ce737110 is described below

commit 20ce737110ccb9b7eb82be5751354bbafbc0d680
Author: tallison <[email protected]>
AuthorDate: Wed May 27 08:40:59 2026 -0400

    updates based on copilot feedback
---
 .../NaiveBayesBigramEncodingDetector.java          | 154 ++++++++++++++++++---
 .../ml/junkdetect/tools/BuildJunkTrainingData.java |  12 +-
 .../tools/BuildJunkAugmentationDataTest.java       |  15 --
 3 files changed, 143 insertions(+), 38 deletions(-)

diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
index 84d721bc12..2460656f0c 100644
--- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
@@ -382,13 +382,15 @@ public class NaiveBayesBigramEncodingDetector implements EncodingDetector {
         int len = Math.min(probe.length, MAX_PROBE_BYTES);
 
         // Pass 1: count distinct bigrams.  Whitespace and zero-IDF
-        // bigrams are skipped as in the original hot loop.  short[] is
-        // enough since count fits in 16383 (max possible).  Track the
-        // ids of distinct bigrams in a parallel array so pass 2 doesn't
-        // need to scan the full 65k space.
-        short[] count = new short[BIGRAM_SPACE];
-        int[] distinctBigrams = new int[len];
-        int distinctIdx = 0;
+        // bigrams are skipped as in the original hot loop.  Counts are
+        // held in a sparse open-addressing int→int hash (see
+        // {@link BigramCountMap}) so per-call working state is
+        // proportional to distinct bigrams (typically a few hundred to
+        // a few thousand) rather than the dense 128 KB
+        // {@code short[65536]} the earlier inner loop used.  Iteration
+        // for pass 2 walks the hash's occupied slots directly — no
+        // parallel distinct-bigram array.
+        BigramCountMap counts = new BigramCountMap();
         int scored = 0;
         int total = 0;
         for (int i = 0; i + 1 < len; i++) {
@@ -404,11 +406,9 @@ public class NaiveBayesBigramEncodingDetector implements EncodingDetector {
                 continue;
             }
             scored++;
-            if (count[bigram] == 0) {
-                distinctBigrams[distinctIdx++] = bigram;
-            }
-            count[bigram]++;
+            counts.increment(bigram);
         }
+        int distinctIdx = counts.size();
 
         // Type A — diversity gate.  If the input has too few distinct
         // bigrams relative to total scored bigrams, it's a degenerate
@@ -430,11 +430,17 @@ public class NaiveBayesBigramEncodingDetector implements EncodingDetector {
 
         // Pass 2: per distinct bigram, compute per-class total
         // contribution and (when above floor) apply Type C cap.
+        // Order-independent — see analyzeBigrams() for the probe-order
+        // diagnostic path.
         double[] score = new double[numClasses];
         double[] contributions = new double[numClasses];
-        for (int k = 0; k < distinctIdx; k++) {
-            int bigram = distinctBigrams[k];
-            int n = count[bigram];
+        int hashCap = counts.capacity();
+        for (int slot = 0; slot < hashCap; slot++) {
+            int bigram = counts.keyAt(slot);
+            if (bigram == -1) {
+                continue;
+            }
+            int n = counts.countAt(slot);
             int w = idf8[bigram];
             double countTimesIdf = (double) n * w;
             int base = bigram * numClasses;
@@ -462,11 +468,11 @@ public class NaiveBayesBigramEncodingDetector implements EncodingDetector {
                 }
             }
             // Cap any class whose contribution exceeds runner-up + cap.
-            double cap = secondMax + CAP_PER_BIGRAM_NATS;
-            if (max > cap) {
+            double capValue = secondMax + CAP_PER_BIGRAM_NATS;
+            if (max > capValue) {
                 for (int c = 0; c < numClasses; c++) {
-                    if (contributions[c] > cap) {
-                        contributions[c] = cap;
+                    if (contributions[c] > capValue) {
+                        contributions[c] = capValue;
                     }
                 }
             }
@@ -477,6 +483,118 @@ public class NaiveBayesBigramEncodingDetector implements EncodingDetector {
         return new ScoreResult(score, scored, total);
     }
 
+    /**
+     * Open-addressing {@code int → int} hash map specialised for
+     * counting bigram occurrences during a single
+     * {@link #scoreClassesAndCount(byte[])} pass.  Linear probing;
+     * capacity is a power of two; {@code -1} sentinel for empty slots
+     * (bigrams are non-negative 16-bit values so {@code -1} is
+     * unambiguous).
+     *
+     * <p>Per-call local; not thread-safe.  Replaces a dense
+     * {@code short[65536]} (128 KB) count array.  Memory scales with
+     * actual distinct bigrams in the probe — typically a few hundred
+     * for short probes, a few thousand for diverse probes at the
+     * {@link #MAX_PROBE_BYTES} cap.  The table starts at 1024 slots
+     * and doubles once more than half of them are occupied, so probes
+     * with at most 512 distinct bigrams never resize.
+     */
+    private static final class BigramCountMap {
+
+        /** Initial slot count.  Two int arrays × 1024 × 4 bytes = 8 KB. */
+        private static final int INITIAL_CAP = 1024;
+        /** Knuth multiplicative hash constant (golden ratio, 32-bit). */
+        private static final int HASH_MULT = 0x9E3779B9;
+
+        private int[] keys; // bigram stored in each slot; -1 marks an empty slot
+        private int[] counts; // occurrence count for the key held in the same slot
+        private int cap; // current number of slots; always a power of two
+        private int mask; // cap - 1; wraps probe indices with a bitwise AND
+        /**
+         * Right-shift amount for the multiplicative hash: produces the
+         * top {@code log2(cap)} bits of the multiplied value.  Equal
+         * to {@code Integer.numberOfLeadingZeros(mask)} for a
+         * power-of-two capacity.
+         */
+        private int shift;
+        /** Resize when {@code size > threshold}; 50% load factor for fast probing. */
+        private int threshold;
+        private int size; // number of occupied slots, i.e. distinct bigrams stored
+
+        BigramCountMap() {
+            this.cap = INITIAL_CAP;
+            this.mask = cap - 1;
+            this.shift = Integer.numberOfLeadingZeros(mask);
+            this.threshold = cap >>> 1;
+            this.keys = new int[cap];
+            this.counts = new int[cap];
+            Arrays.fill(this.keys, -1); // every slot starts out empty
+        }
+
+        /** Insert a new bigram or increment the count of an existing one. */
+        void increment(int bigram) {
+            int slot = (bigram * HASH_MULT) >>> shift; // home slot: top bits of the product
+            while (keys[slot] != -1 && keys[slot] != bigram) {
+                slot = (slot + 1) & mask; // linear probe, wrapping at the end of the table
+            }
+            if (keys[slot] == -1) {
+                keys[slot] = bigram;
+                counts[slot] = 1;
+                size++;
+                if (size > threshold) { // grow so the table is at most half full on return
+                    resize();
+                }
+            } else {
+                counts[slot]++;
+            }
+        }
+
+        /** Number of distinct bigrams stored. */
+        int size() {
+            return size;
+        }
+
+        /** Slot count.  Walk slots {@code [0, capacity)} to enumerate entries. */
+        int capacity() {
+            return cap;
+        }
+
+        /** Bigram at {@code slot}, or {@code -1} if the slot is empty. */
+        int keyAt(int slot) {
+            return keys[slot];
+        }
+
+        /** Count at {@code slot}.  Undefined for empty slots. */
+        int countAt(int slot) {
+            return counts[slot];
+        }
+
+        private void resize() { // double the table and re-insert every stored entry
+            int oldCap = cap;
+            int[] oldKeys = keys;
+            int[] oldCounts = counts;
+            cap = oldCap << 1;
+            mask = cap - 1;
+            shift = Integer.numberOfLeadingZeros(mask);
+            threshold = cap >>> 1;
+            keys = new int[cap];
+            counts = new int[cap];
+            Arrays.fill(keys, -1);
+            for (int i = 0; i < oldCap; i++) {
+                int k = oldKeys[i];
+                if (k == -1) {
+                    continue;
+                }
+                int slot = (k * HASH_MULT) >>> shift; // rehash with the new, smaller shift
+                while (keys[slot] != -1) { // old keys are distinct: only a free slot is needed
+                    slot = (slot + 1) & mask;
+                }
+                keys[slot] = k;
+                counts[slot] = oldCounts[i]; // carry the accumulated count over unchanged
+            }
+        }
+    }
+
     public String[] getLabels() {
         return labels.clone();
     }
diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
index 451efccb36..a45f74a82c 100644
--- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
+++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
@@ -658,11 +658,13 @@ public class BuildJunkTrainingData {
         if (text.indexOf('\uFFFD') >= 0) {
             return null;
         }
-        // NFD (not NFC) so combining-mark scripts (Vietnamese precomposed,
-        // Indic, Thai) have their marks as separate codepoints in the
-        // training corpus.  Lets per-script bigram tables and z5 (letter-
-        // adjacent-to-mark) discriminate uniformly across mark-using
-        // scripts.  Must match JunkDetector.scoreText's normalization.
+        // NFC so the training tally matches JunkDetector.aggregate at
+        // inference time (which also NFC-normalises — see comment at
+        // JunkDetector#aggregate).  Where Unicode has a precomposed
+        // form (Latin diacritics, Vietnamese), letter + combining mark
+        // becomes one codepoint and counts as one bigram unit; marks
+        // with no precomposed form (most Indic ones) stay separate.
+        // This matches the natural NFC form of most source text.
         text = Normalizer.normalize(text, Normalizer.Form.NFC);
         if (text.getBytes(StandardCharsets.UTF_8).length < minBytes) {
             return null;
diff --git a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationDataTest.java b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationDataTest.java
index 26f316d91e..ac901b3a2a 100644
--- a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationDataTest.java
+++ b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/BuildJunkAugmentationDataTest.java
@@ -390,21 +390,6 @@ class BuildJunkAugmentationDataTest {
         assertEquals("0F/ABCD1234", 
BuildJunkAugmentationData.profileKey(extracts, file));
     }
 
-    @Test
-    void refusesOutputEqualToBaseline(@TempDir Path tmp) throws Exception {
-        Path baseline = tmp.resolve("baseline");
-        Path extracts = tmp.resolve("extracts");
-        Files.createDirectories(baseline);
-        Files.createDirectories(extracts);
-        writeGz(baseline.resolve("latin.train.gz"), List.of("x"));
-
-        // Run in same JVM, catch System.exit. Easiest path is a SecurityManager,
-        // but JDK 17 deprecates that. Instead, hit the static helper directly
-        // for isSameFile semantics.
-        assertTrue(Files.isSameFile(baseline, baseline),
-                "sanity: same directory is same file");
-    }
-
     // ---------------------------------------------------------------------------
 
     private static void writeGz(Path path, List<String> lines) throws 
Exception {

(tika) branch TIKA-4731-common-script updated: updates based on copilot feedback

Reply via email to