steveloughran commented on a change in pull request #951: HADOOP-15183. S3Guard 
store becomes inconsistent after partial failure of rename
URL: https://github.com/apache/hadoop/pull/951#discussion_r293514525
 
 

 ##########
 File path: 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DynamoDBMetadataStore.java
 ##########
 @@ -699,39 +769,168 @@ DirListingMetadata 
getDirListingMetadataFromDirMetaAndList(Path path,
   }
 
   /**
-   * build the list of all parent entries.
+   * Build the list of all parent entries.
+   * <p>
+   * <b>Thread safety:</b> none. Callers must synchronize access.
+   * <p>
+   * Callers are required to synchronize on ancestorState.
    * @param pathsToCreate paths to create
+   * @param ancestorState ongoing ancestor state.
    * @return the full ancestry paths
    */
-  Collection<DDBPathMetadata> completeAncestry(
-      Collection<DDBPathMetadata> pathsToCreate) {
-    // Key on path to allow fast lookup
-    Map<Path, DDBPathMetadata> ancestry = new HashMap<>();
-
-    for (DDBPathMetadata meta : pathsToCreate) {
+  private Collection<DDBPathMetadata> completeAncestry(
+      final Collection<DDBPathMetadata> pathsToCreate,
+      final AncestorState ancestorState) throws PathIOException {
+    List<DDBPathMetadata> ancestorsToAdd = new ArrayList<>(0);
+    LOG.debug("Completing ancestry for {} paths", pathsToCreate.size());
+    // we sort the inputs to guarantee that the topmost entries come first.
+    // that way if the put request contains both parents and children
+    // then the existing parents will not be re-created -they will just
+    // be added to the ancestor list first.
+    List<DDBPathMetadata> sortedPaths = new ArrayList<>(pathsToCreate);
+    sortedPaths.sort(PathOrderComparators.TOPMOST_PM_FIRST);
+    for (DDBPathMetadata meta : sortedPaths) {
       Preconditions.checkArgument(meta != null);
       Path path = meta.getFileStatus().getPath();
+      LOG.debug("Adding entry {}", path);
       if (path.isRoot()) {
         break;
       }
-      ancestry.put(path, new DDBPathMetadata(meta));
+      // add the new entry
+      DDBPathMetadata entry = new DDBPathMetadata(meta);
+      DDBPathMetadata oldEntry = ancestorState.put(path, entry);
+      if (oldEntry != null) {
+        if (!oldEntry.getFileStatus().isDirectory()
+            || !entry.getFileStatus().isDirectory()) {
+          // check for and warn if the existing bulk operation overwrote it.
+          // this should never occur outside tests explicitly crating it
+          LOG.warn("Overwriting a S3Guard entry created in the operation: {}",
+              oldEntry);
+          LOG.warn("With new entry: {}", entry);
+          // restore the old state
+          ancestorState.put(path, oldEntry);
+          // then raise an exception
+          throw new PathIOException(path.toString(), E_INCONSISTENT_UPDATE);
+        } else {
+          // directory is already present, so skip adding it and any parents.
+          continue;
+        }
+      }
+      ancestorsToAdd.add(entry);
       Path parent = path.getParent();
-      while (!parent.isRoot() && !ancestry.containsKey(parent)) {
+      while (!parent.isRoot()) {
+        if (ancestorState.findEntry(parent, true)) {
+          break;
+        }
         LOG.debug("auto-create ancestor path {} for child path {}",
             parent, path);
         final S3AFileStatus status = makeDirStatus(parent, username);
-        ancestry.put(parent, new DDBPathMetadata(status, Tristate.FALSE,
-            false));
+        DDBPathMetadata md = new DDBPathMetadata(status, Tristate.FALSE,
+            false);
+        ancestorState.put(parent, md);
+        ancestorsToAdd.add(md);
         parent = parent.getParent();
       }
     }
-    return ancestry.values();
+    return ancestorsToAdd;
+  }
+
+  /**
+   * {@inheritDoc}
+   * <p>
+   * if {@code operationState} is not null, when this method returns the
+   * operation state will be updated with all new entries created.
+   * This ensures that subsequent operations with the same store will not
+   * trigger new updates.
+   * The scan on
+   * @param qualifiedPath path to update
+   * @param operationState (nullable) operational state for a bulk update
+   * @throws IOException on failure.
+   */
+  @SuppressWarnings("SynchronizationOnLocalVariableOrMethodParameter")
+  @Override
+  @Retries.RetryTranslated
+  public void addAncestors(
+      final Path qualifiedPath,
+      @Nullable final BulkOperationState operationState) throws IOException {
+
+    Collection<DDBPathMetadata> newDirs = new ArrayList<>();
+    final AncestorState ancestorState = extractOrCreate(operationState,
+        BulkOperationState.OperationType.Rename);
+    Path parent = qualifiedPath.getParent();
+
+    // Iterate up the parents.
+    // note that only ancestorState get/set operations are synchronized;
+    // the DDB read between them is not. As a result, more than one
+    // thread may probe the state, find the entry missing, do the database
+    // query and add the entry.
+    // This is done to avoid making the remote dynamo query part of the
+    // synchronized block.
+    // If a race does occur, the cost is simply one extra GET and potentially
+    // one extra PUT.
+    while (!parent.isRoot()) {
+      synchronized (ancestorState) {
+        if (ancestorState.contains(parent)) {
+          // the ancestry map contains the key, so no need to even look for it.
+          break;
+        }
+      }
+      PathMetadata directory = get(parent);
+      if (directory == null || directory.isDeleted()) {
+        S3AFileStatus status = makeDirStatus(username, parent);
+        LOG.debug("Adding new ancestor entry {}", status);
+        DDBPathMetadata meta = new DDBPathMetadata(status, Tristate.FALSE,
+            false);
+        newDirs.add(meta);
+        // Do not update ancestor state here, as it
+        // will happen in the innerPut() call. Were we to add it
+        // here that put operation would actually (mistakenly) skip
+        // creating the entry.
+      } else {
+        if (directory.getFileStatus().isFile()) {
+          throw new PathIOException(parent.toString(),
+              "Cannot overwrite parent file: metadatstore is"
+                  + " in an inconsistent state");
+        }
+        // the directory exists. Add it to the ancestor state for next time.
+        synchronized (ancestorState) {
+          ancestorState.put(parent, new DDBPathMetadata(directory));
+        }
+        break;
+      }
+      parent = parent.getParent();
+    }
+    // the listing of directories to put is all those parents which we know
+    // are not in the store or BulkOperationState.
+    if (!newDirs.isEmpty()) {
+      innerPut(newDirs, operationState);
+    }
   }
 
+  /**
+   * {@inheritDoc}.
+   *
+   * The DDB implementation sorts all the paths such that new items
+   * are ordered highest level entry first; deleted items are ordered
+   * lowest entry first.
+   *
+   * This is to ensure that if a client failed partway through the update,
+   * there will no entries in the table which lack parent entries.
 
 Review comment:
   thanks. We do need it, just for the extra resilience

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org

Reply via email to