This is an automated email from the ASF dual-hosted git repository.

chenhang pushed a commit to branch branch-4.14
in repository https://gitbox.apache.org/repos/asf/bookkeeper.git

commit 9f89a395fb4e9d74f6e7f5660b0125c5170f0ed7
Author: Hang Chen <[email protected]>
AuthorDate: Thu Mar 16 11:28:48 2023 +0800

    Add small files check in garbage collection (#3631)
    
    When we use `TransactionalEntryLogCompactor` to compact the entry log 
files, it generates a lot of small entry log files. The usage of those files 
is usually greater than 90%, so they cannot be compacted until their usage 
decreases.
    
    
![image](https://user-images.githubusercontent.com/5436568/201135615-4d6072f5-e353-483d-9afb-48fad8134044.png)
    
    We introduce an entry log file size check during compaction, controlled 
by `useTargetEntryLogSizeForGc`.
    When it is enabled, the usage of an entry log file is calculated against 
`max(totalSize, logSizeLimit)` instead of the actual file size, so a small 
entry log file can be compacted even though its own file usage is greater 
than 90%. This feature is disabled by default; the default value of 
`useTargetEntryLogSizeForGc` is `false`.
    
    (cherry picked from commit 2fad33bfcf24a72f7fdf103969ed4b0aa26778a2)
---
 .../org/apache/bookkeeper/bookie/GarbageCollectorThread.java | 12 +++++++++---
 .../java/org/apache/bookkeeper/conf/ServerConfiguration.java | 10 ++++++++++
 conf/bk_server.conf                                          |  8 ++++++++
 3 files changed, 27 insertions(+), 3 deletions(-)

diff --git 
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/GarbageCollectorThread.java
 
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/GarbageCollectorThread.java
index 60a6cbdceb..6afc7e4bfa 100644
--- 
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/GarbageCollectorThread.java
+++ 
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/GarbageCollectorThread.java
@@ -466,15 +466,21 @@ public class GarbageCollectorThread extends SafeRunnable {
         long timeDiff = 0;
 
         for (EntryLogMetadata meta : logsToCompact) {
-            int bucketIndex = calculateUsageIndex(numBuckets, meta.getUsage());
+            double usage = meta.getUsage();
+            if (conf.isUseTargetEntryLogSizeForGc() && usage < 1.0d) {
+                usage = (double) meta.getRemainingSize() / 
Math.max(meta.getTotalSize(), conf.getEntryLogSizeLimit());
+            }
+            int bucketIndex = calculateUsageIndex(numBuckets, usage);
             entryLogUsageBuckets[bucketIndex]++;
 
             if (timeDiff < maxTimeMillis) {
                 end = System.currentTimeMillis();
                 timeDiff = end - start;
             }
-            if (meta.getUsage() >= threshold || (maxTimeMillis > 0 && timeDiff 
> maxTimeMillis) || !running) {
-                // We allow the usage limit calculation to continue so that we 
get a accurate
+            if ((usage >= threshold
+                || (maxTimeMillis > 0 && timeDiff >= maxTimeMillis)
+                || !running)) {
+                // We allow the usage limit calculation to continue so that we 
get an accurate
                 // report of where the usage was prior to running compaction.
                 continue;
             }
diff --git 
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ServerConfiguration.java
 
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ServerConfiguration.java
index 18c7e32112..427812aeec 100644
--- 
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ServerConfiguration.java
+++ 
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ServerConfiguration.java
@@ -113,6 +113,7 @@ public class ServerConfiguration extends 
AbstractConfiguration<ServerConfigurati
     protected static final String GC_OVERREPLICATED_LEDGER_WAIT_TIME = 
"gcOverreplicatedLedgerWaitTime";
     protected static final String USE_TRANSACTIONAL_COMPACTION = 
"useTransactionalCompaction";
     protected static final String VERIFY_METADATA_ON_GC = "verifyMetadataOnGC";
+    protected static final String USE_TARGET_ENTRYLOG_SIZE_FOR_GC = 
"useTargetEntryLogSizeForGc";
     // Scrub Parameters
     protected static final String LOCAL_SCRUB_PERIOD = "localScrubInterval";
     protected static final String LOCAL_SCRUB_RATE_LIMIT = 
"localScrubRateLimit";
@@ -459,6 +460,15 @@ public class ServerConfiguration extends 
AbstractConfiguration<ServerConfigurati
         return this;
     }
 
+    public boolean isUseTargetEntryLogSizeForGc() {
+        return getBoolean(USE_TARGET_ENTRYLOG_SIZE_FOR_GC, false);
+    }
+
+    public ServerConfiguration setUseTargetEntryLogSizeForGc(boolean 
useTargetEntryLogSizeForGc) {
+        this.setProperty(USE_TARGET_ENTRYLOG_SIZE_FOR_GC, 
useTargetEntryLogSizeForGc);
+        return this;
+    }
+
     /**
      * Get whether local scrub is enabled.
      *
diff --git a/conf/bk_server.conf b/conf/bk_server.conf
index 1d275586fa..f786a175c0 100755
--- a/conf/bk_server.conf
+++ b/conf/bk_server.conf
@@ -582,6 +582,14 @@ ledgerDirectories=/tmp/bk-data
 # True if the bookie should double check readMetadata prior to gc
 # verifyMetadataOnGC=false
 
+# When judging whether an entry log file needs to be compacted, we calculate 
the usage rate of the entry log file based
+# on the actual size of the entry log file. However, if an entry log file is 
1MB in size and 0.9MB of data is
+# being used, this entry log file won't be compacted by the garbage collector due 
to the high usage ratio,
+# which will result in many small entry log files.
+# We introduced the parameter `useTargetEntryLogSizeForGc` to determine 
whether to calculate entry log file usage
+# based on the configured target entry log file size, which is configured by 
`logSizeLimit`.
+# Default: useTargetEntryLogSizeForGc is false.
+# useTargetEntryLogSizeForGc=false
 #############################################################################
 ## Disk utilization
 #############################################################################

Reply via email to