clintropolis commented on a change in pull request #8495: Check 
targetCompactionSizeBytes to search for candidate segments in auto compaction
URL: https://github.com/apache/incubator-druid/pull/8495#discussion_r322510975
 
 

 ##########
 File path: 
server/src/main/java/org/apache/druid/server/coordinator/helper/SegmentCompactorUtil.java
 ##########
 @@ -20,21 +20,36 @@
 package org.apache.druid.server.coordinator.helper;
 
 import com.google.common.base.Preconditions;
+import org.apache.druid.timeline.DataSegment;
 import org.joda.time.Interval;
 
+import javax.annotation.Nullable;
+import java.util.List;
+
 /**
  * Util class used by {@link DruidCoordinatorSegmentCompactor} and {@link 
CompactionSegmentSearchPolicy}.
  */
 class SegmentCompactorUtil
 {
-  static boolean isCompactibleSize(long targetBytes, long currentTotalBytes, 
long additionalBytes)
-  {
-    return currentTotalBytes + additionalBytes <= targetBytes;
-  }
+  /**
+   * The allowed error rate of the segment size after compaction.
+   * Its value is determined experimentally.
+   */
+  private static final double ALLOWED_ERROR_OF_SEGMENT_SIZE = .2;
 
-  static boolean isCompactibleNum(int numTargetSegments, int 
numCurrentSegments, int numAdditionalSegments)
+  static boolean needsCompaction(@Nullable Long targetCompactionSizeBytes, 
List<DataSegment> candidates)
   {
-    return numCurrentSegments + numAdditionalSegments <= numTargetSegments;
+    if (targetCompactionSizeBytes == null) {
+      // If targetCompactionSizeBytes is null, we have no way to check that 
the given segments need compaction or not.
+      return true;
+    }
+    final double minAllowedSize = targetCompactionSizeBytes * (1 - 
ALLOWED_ERROR_OF_SEGMENT_SIZE);
+    final double maxAllowedSize = targetCompactionSizeBytes * (1 + 
ALLOWED_ERROR_OF_SEGMENT_SIZE);
+
+    return candidates
+        .stream()
+        .filter(segment -> segment.getSize() < minAllowedSize || 
segment.getSize() > maxAllowedSize)
+        .count() > 1;
 
 Review comment:
   ~nit: I think this should use `anyMatch` instead of `filter` + `count` since 
the former might not have to materialize the entire set upon reaching first 
match~

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@druid.apache.org
For additional commands, e-mail: commits-h...@druid.apache.org

Reply via email to