Re: [PR] Introduce DocValuesStatsCollectorManager [lucene]

via GitHub Sun, 10 May 2026 02:45:02 -0700


gaobinlong commented on code in PR #15653:
URL: https://github.com/apache/lucene/pull/15653#discussion_r3214651413



##########
lucene/misc/src/test/org/apache/lucene/misc/search/TestDocValuesStatsCollector.java:
##########
@@ -99,228 +107,257 @@ public void testOneDoc() throws IOException {
   }
 
   public void testDocsWithLongValues() throws IOException {
-    try (Directory dir = newDirectory();
-        IndexWriter indexWriter = new IndexWriter(dir, 
newIndexWriterConfig())) {
-      String field = "numeric";
-      int numDocs = TestUtil.nextInt(random(), 1, 100);
-      long[] docValues = new long[numDocs];
-      int nextVal = 1;
-      for (int i = 0; i < numDocs; i++) {
-        Document doc = new Document();
-        if (random().nextBoolean()) { // not all documents have a value
-          doc.add(new NumericDocValuesField(field, nextVal));
-          doc.add(new StringField("id", "doc" + i, Store.NO));
-          docValues[i] = nextVal;
-          ++nextVal;
-        }
-        indexWriter.addDocument(doc);
-      }
-
-      // 20% of cases delete some docs
-      if (random().nextDouble() < 0.2) {

Review Comment:
   Added it back.



##########
lucene/misc/src/java/org/apache/lucene/misc/search/DocValuesStats.java:
##########
@@ -73,6 +73,32 @@ final void addMissing() {
     ++missing;
   }
 
+  void merge(DocValuesStats<?> other) {

Review Comment:
   Changed that, thanks!



##########
lucene/misc/src/test/org/apache/lucene/misc/search/TestDocValuesStatsCollector.java:
##########
@@ -99,228 +107,257 @@ public void testOneDoc() throws IOException {
   }
 
   public void testDocsWithLongValues() throws IOException {
-    try (Directory dir = newDirectory();
-        IndexWriter indexWriter = new IndexWriter(dir, 
newIndexWriterConfig())) {
-      String field = "numeric";
-      int numDocs = TestUtil.nextInt(random(), 1, 100);
-      long[] docValues = new long[numDocs];
-      int nextVal = 1;
-      for (int i = 0; i < numDocs; i++) {
-        Document doc = new Document();
-        if (random().nextBoolean()) { // not all documents have a value
-          doc.add(new NumericDocValuesField(field, nextVal));
-          doc.add(new StringField("id", "doc" + i, Store.NO));
-          docValues[i] = nextVal;
-          ++nextVal;
-        }
-        indexWriter.addDocument(doc);
-      }
-
-      // 20% of cases delete some docs
-      if (random().nextDouble() < 0.2) {
+    try (Directory dir = newDirectory()) {
+      IndexWriterConfig config = newIndexWriterConfig();
+      config.setMaxBufferedDocs(10);
+      try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), 
dir, config)) {
+        String field = "numeric";
+        int numDocs = TestUtil.nextInt(random(), 50, 100);
+        long[] docValues = new long[numDocs];
+        int nextVal = 1;
         for (int i = 0; i < numDocs; i++) {
+          Document doc = new Document();
           if (random().nextBoolean()) {
-            indexWriter.deleteDocuments(new Term("id", "doc" + i));
-            docValues[i] = 0;
+            doc.add(new NumericDocValuesField(field, nextVal));
+            doc.add(new StringField("id", "doc" + i, Store.NO));
+            docValues[i] = nextVal;
+            ++nextVal;
+          }
+          if (i != 0 && i % 10 == 0) {
+            indexWriter.commit();
           }
+          indexWriter.addDocument(doc);
         }
-      }
 
-      try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
-        IndexSearcher searcher = new IndexSearcher(reader);
-        LongDocValuesStats stats = new LongDocValuesStats(field);
-        searcher.search(MatchAllDocsQuery.INSTANCE, new 
DocValuesStatsCollector(stats));
+        if (random().nextDouble() < 0.2) {
+          for (int i = 0; i < numDocs; i++) {
+            if (random().nextBoolean()) {
+              indexWriter.deleteDocuments(new Term("id", "doc" + i));
+              docValues[i] = 0;
+            }
+          }
+        }
 
-        int expCount = (int) Arrays.stream(docValues).filter(v -> v > 
0).count();
-        assertEquals(expCount, stats.count());
-        int numDocsWithoutField = (int) getZeroValues(docValues).count();
-        assertEquals(computeExpMissing(numDocsWithoutField, numDocs, reader), 
stats.missing());
-        if (stats.count() > 0) {
-          LongSummaryStatistics sumStats = 
getPositiveValues(docValues).summaryStatistics();
-          assertEquals(sumStats.getMax(), stats.max().longValue());
-          assertEquals(sumStats.getMin(), stats.min().longValue());
-          assertEquals(sumStats.getAverage(), stats.mean(), 0.00001);
-          assertEquals(sumStats.getSum(), stats.sum().longValue());
-          double variance = computeVariance(docValues, stats.mean, 
stats.count());
-          assertEquals(variance, stats.variance(), 0.00001);
-          assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001);
+        try (DirectoryReader reader = indexWriter.getReader()) {
+          IndexSearcher searcher = newSearcher(reader);
+          LongDocValuesStats stats =
+              searcher.search(
+                  MatchAllDocsQuery.INSTANCE,
+                  new DocValuesStatsCollectorManager<>(() -> new 
LongDocValuesStats(field)));
+
+          int expCount = (int) Arrays.stream(docValues).filter(v -> v > 
0).count();
+          assertEquals(expCount, stats.count());
+          int numDocsWithoutField = (int) getZeroValues(docValues).count();
+          assertEquals(computeExpMissing(numDocsWithoutField, numDocs, 
reader), stats.missing());
+          if (stats.count() > 0) {
+            LongSummaryStatistics sumStats = 
getPositiveValues(docValues).summaryStatistics();
+            assertEquals(sumStats.getMax(), stats.max().longValue());
+            assertEquals(sumStats.getMin(), stats.min().longValue());
+            assertEquals(sumStats.getAverage(), stats.mean(), 0.00001);
+            assertEquals(sumStats.getSum(), stats.sum().longValue());
+            double variance = computeVariance(docValues, stats.mean, 
stats.count());
+            assertEquals(variance, stats.variance(), 0.00001);
+            assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001);
+          }
         }
       }
     }
   }
 
   public void testDocsWithDoubleValues() throws IOException {
-    try (Directory dir = newDirectory();
-        IndexWriter indexWriter = new IndexWriter(dir, 
newIndexWriterConfig())) {
-      String field = "numeric";
-      int numDocs = TestUtil.nextInt(random(), 1, 100);
-      double[] docValues = new double[numDocs];
-      double nextVal = 1.0;
-      for (int i = 0; i < numDocs; i++) {
-        Document doc = new Document();
-        if (random().nextBoolean()) { // not all documents have a value
-          doc.add(new DoubleDocValuesField(field, nextVal));
-          doc.add(new StringField("id", "doc" + i, Store.NO));
-          docValues[i] = nextVal;
-          ++nextVal;
-        }
-        indexWriter.addDocument(doc);
-      }
-
-      // 20% of cases delete some docs
-      if (random().nextDouble() < 0.2) {
+    try (Directory dir = newDirectory()) {
+      IndexWriterConfig config = newIndexWriterConfig();
+      config.setMaxBufferedDocs(10);
+      try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), 
dir, config)) {
+        String field = "numeric";
+        int numDocs = TestUtil.nextInt(random(), 50, 100);
+        double[] docValues = new double[numDocs];
+        double nextVal = 1.0;
         for (int i = 0; i < numDocs; i++) {
+          Document doc = new Document();
           if (random().nextBoolean()) {
-            indexWriter.deleteDocuments(new Term("id", "doc" + i));
-            docValues[i] = 0;
+            doc.add(new DoubleDocValuesField(field, nextVal));
+            doc.add(new StringField("id", "doc" + i, Store.NO));
+            docValues[i] = nextVal;
+            ++nextVal;
+          }
+          if (i != 0 && i % 10 == 0) {
+            indexWriter.commit();
           }
+          indexWriter.addDocument(doc);
         }
-      }
 
-      try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
-        IndexSearcher searcher = new IndexSearcher(reader);
-        DoubleDocValuesStats stats = new DoubleDocValuesStats(field);
-        searcher.search(MatchAllDocsQuery.INSTANCE, new 
DocValuesStatsCollector(stats));
+        if (random().nextDouble() < 0.2) {
+          for (int i = 0; i < numDocs; i++) {
+            if (random().nextBoolean()) {
+              indexWriter.deleteDocuments(new Term("id", "doc" + i));
+              docValues[i] = 0;
+            }
+          }
+        }
 
-        int expCount = (int) Arrays.stream(docValues).filter(v -> v > 
0).count();
-        assertEquals(expCount, stats.count());
-        int numDocsWithoutField = (int) getZeroValues(docValues).count();
-        assertEquals(computeExpMissing(numDocsWithoutField, numDocs, reader), 
stats.missing());
-        if (stats.count() > 0) {
-          DoubleSummaryStatistics sumStats = 
getPositiveValues(docValues).summaryStatistics();
-          assertEquals(sumStats.getMax(), stats.max().doubleValue(), 0.00001);
-          assertEquals(sumStats.getMin(), stats.min().doubleValue(), 0.00001);
-          assertEquals(sumStats.getAverage(), stats.mean(), 0.00001);
-          assertEquals(sumStats.getSum(), stats.sum(), 0.00001);
-          double variance = computeVariance(docValues, stats.mean, 
stats.count());
-          assertEquals(variance, stats.variance(), 0.00001);
-          assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001);
+        try (DirectoryReader reader = indexWriter.getReader()) {
+          IndexSearcher searcher = newSearcher(reader);
+          DoubleDocValuesStats stats =
+              searcher.search(
+                  MatchAllDocsQuery.INSTANCE,
+                  new DocValuesStatsCollectorManager<>(() -> new 
DoubleDocValuesStats(field)));
+
+          int expCount = (int) Arrays.stream(docValues).filter(v -> v > 
0).count();
+          assertEquals(expCount, stats.count());
+          int numDocsWithoutField = (int) getZeroValues(docValues).count();
+          assertEquals(computeExpMissing(numDocsWithoutField, numDocs, 
reader), stats.missing());
+          if (stats.count() > 0) {
+            DoubleSummaryStatistics sumStats = 
getPositiveValues(docValues).summaryStatistics();
+            assertEquals(sumStats.getMax(), stats.max().doubleValue(), 
0.00001);
+            assertEquals(sumStats.getMin(), stats.min().doubleValue(), 
0.00001);
+            assertEquals(sumStats.getAverage(), stats.mean(), 0.00001);
+            assertEquals(sumStats.getSum(), stats.sum(), 0.00001);
+            double variance = computeVariance(docValues, stats.mean, 
stats.count());
+            assertEquals(variance, stats.variance(), 0.00001);
+            assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001);
+          }
         }
       }
     }
   }
 
   public void testDocsWithMultipleLongValues() throws IOException {
-    try (Directory dir = newDirectory();
-        IndexWriter indexWriter = new IndexWriter(dir, 
newIndexWriterConfig())) {
-      String field = "numeric";
-      int numDocs = TestUtil.nextInt(random(), 1, 100);
-      long[][] docValues = new long[numDocs][];
-      long nextVal = 1;
-      for (int i = 0; i < numDocs; i++) {
-        Document doc = new Document();
-        if (random().nextBoolean()) { // not all documents have a value
-          int numValues = TestUtil.nextInt(random(), 1, 5);
-          docValues[i] = new long[numValues];
-          for (int j = 0; j < numValues; j++) {
-            doc.add(new SortedNumericDocValuesField(field, nextVal));
-            docValues[i][j] = nextVal;
-            ++nextVal;
-          }
-          doc.add(new StringField("id", "doc" + i, Store.NO));
-        }
-        indexWriter.addDocument(doc);
-      }
-
-      // 20% of cases delete some docs
-      if (random().nextDouble() < 0.2) {
+    try (Directory dir = newDirectory()) {
+      IndexWriterConfig config = newIndexWriterConfig();
+      config.setMaxBufferedDocs(10);
+      try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), 
dir, config)) {
+        String field = "numeric";
+        int numDocs = TestUtil.nextInt(random(), 50, 100);
+        long[][] docValues = new long[numDocs][];
+        long nextVal = 1;
         for (int i = 0; i < numDocs; i++) {
+          Document doc = new Document();
           if (random().nextBoolean()) {
-            indexWriter.deleteDocuments(new Term("id", "doc" + i));
-            docValues[i] = null;
+            int numValues = TestUtil.nextInt(random(), 1, 5);
+            docValues[i] = new long[numValues];
+            for (int j = 0; j < numValues; j++) {
+              doc.add(new SortedNumericDocValuesField(field, nextVal));
+              docValues[i][j] = nextVal;
+              ++nextVal;
+            }
+            doc.add(new StringField("id", "doc" + i, Store.NO));
           }
+          if (i != 0 && i % 10 == 0) {
+            indexWriter.commit();
+          }
+          indexWriter.addDocument(doc);
         }
-      }
 
-      try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
-        IndexSearcher searcher = new IndexSearcher(reader);
-        SortedLongDocValuesStats stats = new SortedLongDocValuesStats(field);
-        searcher.search(MatchAllDocsQuery.INSTANCE, new 
DocValuesStatsCollector(stats));
+        if (random().nextDouble() < 0.2) {
+          for (int i = 0; i < numDocs; i++) {
+            if (random().nextBoolean()) {
+              indexWriter.deleteDocuments(new Term("id", "doc" + i));
+              docValues[i] = null;
+            }
+          }
+        }
 
-        assertEquals(nonNull(docValues).count(), stats.count());
-        int numDocsWithoutField = (int) isNull(docValues).count();
-        assertEquals(computeExpMissing(numDocsWithoutField, numDocs, reader), 
stats.missing());
-        if (stats.count() > 0) {
-          LongSummaryStatistics sumStats =
-              filterAndFlatValues(docValues, (v) -> v != 
null).summaryStatistics();
-          assertEquals(sumStats.getMax(), stats.max().longValue());
-          assertEquals(sumStats.getMin(), stats.min().longValue());
-          assertEquals(sumStats.getAverage(), stats.mean(), 0.00001);
-          assertEquals(sumStats.getSum(), stats.sum().longValue());
-          assertEquals(sumStats.getCount(), stats.valuesCount());
-          double variance =
-              computeVariance(
-                  filterAndFlatValues(docValues, (v) -> v != null), 
stats.mean, stats.count());
-          assertEquals(variance, stats.variance(), 0.00001);
-          assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001);
+        try (DirectoryReader reader = indexWriter.getReader()) {
+          IndexSearcher searcher = newSearcher(reader);
+          SortedLongDocValuesStats stats =
+              searcher.search(
+                  MatchAllDocsQuery.INSTANCE,
+                  new DocValuesStatsCollectorManager<>(() -> new 
SortedLongDocValuesStats(field)));
+
+          assertEquals(nonNull(docValues).count(), stats.count());
+          int numDocsWithoutField = (int) isNull(docValues).count();
+          assertEquals(computeExpMissing(numDocsWithoutField, numDocs, 
reader), stats.missing());
+          if (stats.count() > 0) {
+            LongSummaryStatistics sumStats =
+                filterAndFlatValues(docValues, (v) -> v != 
null).summaryStatistics();
+            assertEquals(sumStats.getMax(), stats.max().longValue());
+            assertEquals(sumStats.getMin(), stats.min().longValue());
+            assertEquals(sumStats.getAverage(), stats.mean(), 0.00001);
+            assertEquals(sumStats.getSum(), stats.sum().longValue());
+            assertEquals(sumStats.getCount(), stats.valuesCount());
+            double variance =
+                computeVariance(
+                    filterAndFlatValues(docValues, (v) -> v != null), 
stats.mean, stats.count());
+            assertEquals(variance, stats.variance(), 0.00001);
+            assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001);
+          }
         }
       }
     }
   }
 
   public void testDocsWithMultipleDoubleValues() throws IOException {
-    try (Directory dir = newDirectory();
-        IndexWriter indexWriter = new IndexWriter(dir, 
newIndexWriterConfig())) {
-      String field = "numeric";
-      int numDocs = TestUtil.nextInt(random(), 1, 100);
-      double[][] docValues = new double[numDocs][];
-      double nextVal = 1;
-      for (int i = 0; i < numDocs; i++) {
-        Document doc = new Document();
-        if (random().nextBoolean()) { // not all documents have a value
-          int numValues = TestUtil.nextInt(random(), 1, 5);
-          docValues[i] = new double[numValues];
-          for (int j = 0; j < numValues; j++) {
-            doc.add(new SortedNumericDocValuesField(field, 
Double.doubleToRawLongBits(nextVal)));
-            docValues[i][j] = nextVal;
-            ++nextVal;
-          }
-          doc.add(new StringField("id", "doc" + i, Store.NO));
-        }
-        indexWriter.addDocument(doc);
-      }
-
-      // 20% of cases delete some docs
-      if (random().nextDouble() < 0.2) {
+    try (Directory dir = newDirectory()) {
+      IndexWriterConfig config = newIndexWriterConfig();
+      config.setMaxBufferedDocs(10);

Review Comment:
   This is also unnecessary, removed it.



##########
lucene/misc/src/test/org/apache/lucene/misc/search/TestDocValuesStatsCollector.java:
##########
@@ -99,228 +107,257 @@ public void testOneDoc() throws IOException {
   }
 
   public void testDocsWithLongValues() throws IOException {
-    try (Directory dir = newDirectory();
-        IndexWriter indexWriter = new IndexWriter(dir, 
newIndexWriterConfig())) {
-      String field = "numeric";
-      int numDocs = TestUtil.nextInt(random(), 1, 100);
-      long[] docValues = new long[numDocs];
-      int nextVal = 1;
-      for (int i = 0; i < numDocs; i++) {
-        Document doc = new Document();
-        if (random().nextBoolean()) { // not all documents have a value
-          doc.add(new NumericDocValuesField(field, nextVal));
-          doc.add(new StringField("id", "doc" + i, Store.NO));
-          docValues[i] = nextVal;
-          ++nextVal;
-        }
-        indexWriter.addDocument(doc);
-      }
-
-      // 20% of cases delete some docs
-      if (random().nextDouble() < 0.2) {
+    try (Directory dir = newDirectory()) {
+      IndexWriterConfig config = newIndexWriterConfig();
+      config.setMaxBufferedDocs(10);
+      try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), 
dir, config)) {
+        String field = "numeric";
+        int numDocs = TestUtil.nextInt(random(), 50, 100);
+        long[] docValues = new long[numDocs];
+        int nextVal = 1;
         for (int i = 0; i < numDocs; i++) {
+          Document doc = new Document();
           if (random().nextBoolean()) {
-            indexWriter.deleteDocuments(new Term("id", "doc" + i));
-            docValues[i] = 0;
+            doc.add(new NumericDocValuesField(field, nextVal));
+            doc.add(new StringField("id", "doc" + i, Store.NO));
+            docValues[i] = nextVal;
+            ++nextVal;
+          }
+          if (i != 0 && i % 10 == 0) {
+            indexWriter.commit();

Review Comment:
   Yeah, you're right, randomIndexWriter flush documents randomly, this is not 
necessary.



##########
lucene/misc/src/java/org/apache/lucene/misc/search/DocValuesStats.java:
##########
@@ -142,6 +168,45 @@ public final double stdev() {
      * might overflow.
      */
     public abstract T sum();
+
+    @SuppressWarnings("unchecked")

Review Comment:
   That's better, changed it to `DocValuesStats<T>`.



##########
lucene/misc/src/java/org/apache/lucene/misc/search/DocValuesStats.java:
##########
@@ -73,6 +73,32 @@ final void addMissing() {
     ++missing;
   }
 
+  void merge(DocValuesStats<?> other) {
+    count += other.count;
+    missing += other.missing;
+    if (other.min != null && (min == null || compareValues(other.min, min) < 
0)) {
+      copyMin(other);
+    }
+    if (other.max != null && (max == null || compareValues(other.max, max) > 
0)) {
+      copyMax(other);
+    }
+  }
+
+  @SuppressWarnings("unchecked")
+  void copyMin(DocValuesStats<?> other) {
+    min = (T) other.min;
+  }
+
+  @SuppressWarnings("unchecked")
+  void copyMax(DocValuesStats<?> other) {

Review Comment:
   Both copyMin and copyMax are folded into the merge method.



##########
lucene/misc/src/java/org/apache/lucene/misc/search/DocValuesStats.java:
##########
@@ -73,6 +73,32 @@ final void addMissing() {
     ++missing;
   }
 
+  void merge(DocValuesStats<?> other) {
+    count += other.count;
+    missing += other.missing;
+    if (other.min != null && (min == null || compareValues(other.min, min) < 
0)) {
+      copyMin(other);
+    }
+    if (other.max != null && (max == null || compareValues(other.max, max) > 
0)) {
+      copyMax(other);
+    }
+  }
+
+  @SuppressWarnings("unchecked")
+  void copyMin(DocValuesStats<?> other) {
+    min = (T) other.min;
+  }
+
+  @SuppressWarnings("unchecked")
+  void copyMax(DocValuesStats<?> other) {
+    max = (T) other.max;
+  }
+
+  @SuppressWarnings({"unchecked", "rawtypes"})
+  int compareValues(Object a, Object b) {

Review Comment:
   Simplify this by folding the compareValues() method into the caller method 
merge().



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Introduce DocValuesStatsCollectorManager [lucene]

Reply via email to