rakeshadr commented on a change in pull request #2093:
URL: https://github.com/apache/ozone/pull/2093#discussion_r604677396



##########
File path: 
hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/DirectoryDeletingService.java
##########
@@ -0,0 +1,285 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership.  The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package org.apache.hadoop.ozone.om;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.protobuf.ServiceException;
+import org.apache.hadoop.hdds.utils.BackgroundService;
+import org.apache.hadoop.hdds.utils.BackgroundTask;
+import org.apache.hadoop.hdds.utils.BackgroundTaskQueue;
+import org.apache.hadoop.hdds.utils.BackgroundTaskResult;
+import org.apache.hadoop.ozone.om.helpers.OMRatisHelper;
+import org.apache.hadoop.ozone.om.helpers.OmKeyInfo;
+import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos;
+import org.apache.hadoop.util.Time;
+import org.apache.ratis.protocol.ClientId;
+import org.apache.ratis.protocol.Message;
+import org.apache.ratis.protocol.RaftClientRequest;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicLong;
+
+import static org.apache.hadoop.ozone.ClientVersions.CURRENT_VERSION;
+import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_PATH_DELETING_LIMIT_PER_TASK;
+import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_PATH_DELETING_LIMIT_PER_TASK_DEFAULT;
+
+/**
+ * This is a background service to delete orphan directories and their
+ * sub-paths (sub-dirs and sub-files).
+ *
+ * <p>
+ * This will scan the metadata of OM periodically to get the orphan dirs from
+ * DeletedDirectoryTable and find its sub paths. It will fetch all sub-files
+ * from KeyTable and move those to DeletedTable so that OM's
+ * KeyDeletingService will cleanup those files later. It will fetch all
+ * sub-directories from the DirectoryTable and move those to
+ * DeletedDirectoryTable so that these will be visited in next iterations.
+ *
+ * <p>
+ * After moving all sub-files and sub-dirs the parent orphan directory will be
+ * deleted by this service. It will continue traversing until all the leaf path
+ * components of an orphan directory are visited.
+ */
+public class DirectoryDeletingService extends BackgroundService {
+
+  private final KeyManager keyManager;
+  private final OzoneManager ozoneManager;
+  private AtomicLong deletedDirsCount;
+  private AtomicLong deletedFilesCount;
+  private final AtomicLong runCount;
+
+  private static ClientId clientId = ClientId.randomId();
+
+  // Use only a single thread for DirDeletion. Multiple threads would read
+  // or write to the same tables and could send deletion requests for the same key
+  // multiple times.
+  private static final int DIR_DELETING_CORE_POOL_SIZE = 1;
+
+  // Number of items(dirs/files) to be batched in an iteration.
+  private final long pathLimitPerTask;
+
+  public DirectoryDeletingService(long interval, TimeUnit unit,
+      long serviceTimeout, OzoneManager ozoneManager) {
+    super("DirectoryDeletingService", interval, unit,
+        DIR_DELETING_CORE_POOL_SIZE, serviceTimeout);
+    this.keyManager = ozoneManager.getKeyManager();
+    this.ozoneManager = ozoneManager;
+    this.deletedDirsCount = new AtomicLong(0);
+    this.deletedFilesCount = new AtomicLong(0);
+    this.runCount = new AtomicLong(0);
+    this.pathLimitPerTask = ozoneManager.getConfiguration()
+        .getInt(OZONE_PATH_DELETING_LIMIT_PER_TASK,
+            OZONE_PATH_DELETING_LIMIT_PER_TASK_DEFAULT);
+  }
+
+  private boolean shouldRun() {
+    if (ozoneManager == null) {
+      // OzoneManager can be null for testing
+      return true;
+    }
+    return ozoneManager.isLeaderReady();
+  }
+
+  private boolean isRatisEnabled() {
+    if (ozoneManager == null) {
+      return false;
+    }
+    return ozoneManager.isRatisEnabled();
+  }
+
+  @Override
+  public BackgroundTaskQueue getTasks() {
+    BackgroundTaskQueue queue = new BackgroundTaskQueue();
+    queue.add(new DirectoryDeletingService.DirDeletingTask());
+    return queue;
+  }
+
+  private class DirDeletingTask implements BackgroundTask {
+
+    @Override
+    public int getPriority() {
+      return 0;
+    }
+
+    @Override
+    public BackgroundTaskResult call() throws Exception {
+      if (shouldRun()) {
+        runCount.incrementAndGet();
+        long count = pathLimitPerTask;
+        try {
+          long startTime = Time.monotonicNow();
+          // step-1) Get one pending deleted directory
+          OmKeyInfo pendingDeletedDirInfo = keyManager.getPendingDeletionDir();
+          if (pendingDeletedDirInfo != null) {
+            if (LOG.isDebugEnabled()) {
+              LOG.debug("Pending deleted dir name: {}",
+                  pendingDeletedDirInfo.getKeyName());
+            }
+            // step-1: get all sub directories under the deletedDir
+            List<OmKeyInfo> dirs =
+                keyManager.getPendingDeletionSubDirs(pendingDeletedDirInfo,
+                    --count);
+            count = count - dirs.size();
+            List<OmKeyInfo> deletedSubDirList = new ArrayList<>();
+            for (OmKeyInfo dirInfo : dirs) {
+              deletedSubDirList.add(dirInfo);
+              if (LOG.isDebugEnabled()) {
+                LOG.debug("deleted sub dir name: {}",
+                    dirInfo.getKeyName());
+              }
+            }
+
+            // step-2: get all sub files under the deletedDir
+            List<OmKeyInfo> purgeDeletedFiles =
+                keyManager.getPendingDeletionSubFiles(pendingDeletedDirInfo,
+                    --count);
+
+            if (LOG.isDebugEnabled()) {
+              for (OmKeyInfo fileInfo : purgeDeletedFiles) {
+                LOG.debug("deleted sub file name: {}", fileInfo.getKeyName());
+              }
+            }
+
+            // step-3: Since there is a boundary condition of 'numEntries' in
+            // each batch, check whether the sub paths count reached batch size
+            // limit. If count reached limit then there can be some more child
+            // paths to be visited and will keep the parent deleted directory
+            // for one more pass.
+            List<String> purgeDeletedDirs = new ArrayList<>();
+            if (count > 0) {

Review comment:
       @linyiqun Thanks for adding more details.
   
   >A better way I am thinking we could use recursive way to collect all dirs and then stored in a stack structure.
   
   Say a user has deleted a very large parent dir with millions of sub-children at greater depth (say 10 levels), and each level has more than batchSize dirs and files. I think maintaining a stack would add complexity and increase heap memory consumption, since every directory entry of the subtree would need to be held in heap before it is processed.
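   
   For contrast, a recursive collector along the lines sketched below (purely illustrative, not code from this PR or from Ozone; `DirTable` and `immediateSubDirs` are hypothetical) would have to keep the whole subtree in heap before any purge can start:
   
   ```java
   // Illustrative only: a recursive, collect-everything-first approach.
   // With millions of sub-dirs, this Deque holds the entire subtree in heap.
   import java.util.ArrayDeque;
   import java.util.Deque;
   import java.util.List;
   
   class CollectAllDirsSketch {
   
     interface DirTable {
       List<String> immediateSubDirs(String dir);   // hypothetical lookup
     }
   
     Deque<String> collectAll(DirTable table, String root) {
       Deque<String> stack = new ArrayDeque<>();
       collect(table, root, stack);
       return stack;                                // size == total dirs in the subtree
     }
   
     private void collect(DirTable table, String dir, Deque<String> stack) {
       stack.push(dir);                             // stays in memory until processed
       for (String sub : table.immediateSubDirs(dir)) {
         collect(table, sub, stack);                // recursion depth == tree depth
       }
     }
   }
   ```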
   
   > Current problem is that we delete dir from parent dir to child, it's not efficient because parent dir relied on subdir to be purged firstly
   
   I am deleting the parent dir once all of its immediate sub-dirs have been added to `DeletedDirectoryTable` and its sub-files to `DeletedTable`. This ensures that the parent dir can be safely removed, and the sub-dirs are promoted to parent dirs for the next DirectoryDeletingService iteration.
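   
   To make that flow concrete, here is a minimal sketch of one pass, assuming simplified table abstractions; the helper names (`pollPendingDeletedDir`, `purgeDir`, etc.) are hypothetical and not the actual OM API:
   
   ```java
   // Hypothetical sketch of a single DirectoryDeletingService pass.
   // Table operations are simplified placeholders, not the real OM code.
   import java.util.List;
   
   class DirDeletionPassSketch {
   
     static final class OmDir { }
     static final class OmFile { }
   
     interface Tables {
       OmDir pollPendingDeletedDir();                        // from DeletedDirectoryTable
       List<OmDir> immediateSubDirs(OmDir dir, long limit);
       List<OmFile> immediateSubFiles(OmDir dir, long limit);
       void addToDeletedDirectoryTable(List<OmDir> dirs);    // promoted for the next pass
       void addToDeletedTable(List<OmFile> files);           // KeyDeletingService purges these
       void purgeDir(OmDir dir);                             // remove the now-empty parent
     }
   
     void runOnePass(Tables tables, long pathLimitPerTask) {
       OmDir parent = tables.pollPendingDeletedDir();
       if (parent == null) {
         return;                                             // nothing pending this round
       }
       long remaining = pathLimitPerTask;
       List<OmDir> subDirs = tables.immediateSubDirs(parent, remaining);
       remaining -= subDirs.size();
       List<OmFile> subFiles = tables.immediateSubFiles(parent, remaining);
       remaining -= subFiles.size();
   
       tables.addToDeletedDirectoryTable(subDirs);           // each sub-dir becomes a parent later
       tables.addToDeletedTable(subFiles);
   
       if (remaining > 0) {
         // All immediate children fit in this batch, so the parent is empty and safe to purge.
         tables.purgeDir(parent);
       }
       // Otherwise the parent stays in DeletedDirectoryTable and the remaining
       // children are picked up in a later iteration.
     }
   }
   ```
   
   With per-level batching like this, the heap only ever holds one batch of immediate children at a time instead of the whole subtree.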
   
   Appreciate your feedback. Thanks again!




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
