This is an automated email from the ASF dual-hosted git repository. thomasm pushed a commit to branch OAK-11781 in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
commit ff07c3b10d19ba80418f5220fe203bac4d4a5729
Author: Thomas Mueller <[email protected]>
AuthorDate: Mon Jun 30 16:22:45 2025 +0200

    OAK-11781 Binary reference statistics are inaccurate for very large repositories
---
 .../oak/index/indexer/document/flatfile/analysis/StatsBuilder.java | 2 +-
 .../indexer/document/flatfile/analysis/modules/DistinctBinarySize.java | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/StatsBuilder.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/StatsBuilder.java
index 018670e97d..2bac1d025f 100644
--- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/StatsBuilder.java
+++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/StatsBuilder.java
@@ -90,7 +90,7 @@ public class StatsBuilder {
         collectors.add(new BinarySizeHistogram(1));
         collectors.add(new TopLargestBinaries(10));
         collectors.add(new DistinctBinarySizeHistogram(1));
-        collectors.add(new DistinctBinarySize(16, 16));
+        collectors.add(new DistinctBinarySize(32, 128));
 
         Profiler prof = null;
         if (profiler) {
diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java
index d1583007c7..272c7f357b 100644
--- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java
+++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java
@@ -174,6 +174,7 @@ public class DistinctBinarySize implements StatsCollector {
         double fpp = BloomFilter.calculateFpp(smallBinariesEstimatedCount, bloomFilter.getBitCount(), bloomFilter.getK());
         long bloomFilterEstimatedSize = bloomFilterMinSize;
         bloomFilterEstimatedSize += fpp * bloomFilterIgnoredSize;
+        storage.add("small Bloom filter ffp*1000", (long) (fpp * 1000));
         storage.add("small binaries count", smallBinariesEstimatedCount);
         storage.add("small binaries HLL count", smallBinariesEstimatedCountHLL);
         storage.add("small binaries size", bloomFilterEstimatedSize);
