This is an automated email from the ASF dual-hosted git repository.

josephwu pushed a commit to branch 1.6.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 12aab9efe82e945ccb3f5b6cbe273c8c8611cc7a
Author: Andrei Budnik <abud...@mesosphere.com>
AuthorDate: Wed Nov 14 11:51:24 2018 -0800

    Added validation of cache files to the URI Fetcher.
    
    Previously, missing cache files could lead to task launch failures.
    This patch introduces validation of cache files which happens each
    time a downloaded cache entry is requested via the `get()` method.
    If validation fails, `get()` returns `None()`, so that the fetcher
    retries downloading URI. Currently, we only check the existence of
    the cache file during validation.
    
    Review: https://reviews.apache.org/r/69171/
---
 src/slave/containerizer/fetcher.cpp         | 40 +++++++++++++++++++++++++++++
 src/slave/containerizer/fetcher_process.hpp |  3 +++
 2 files changed, 43 insertions(+)

diff --git a/src/slave/containerizer/fetcher.cpp 
b/src/slave/containerizer/fetcher.cpp
index 7de57c2..314a75d 100644
--- a/src/slave/containerizer/fetcher.cpp
+++ b/src/slave/containerizer/fetcher.cpp
@@ -1013,6 +1013,22 @@ FetcherProcess::Cache::get(
 
   Option<shared_ptr<Entry>> entry = table.get(key);
   if (entry.isSome()) {
+    // The FetcherProcess will always remove a failed download
+    // synchronously after marking this future as failed.
+    CHECK(!entry.get()->completion().isFailed());
+
+    // Validate the cache file, if it has been downloaded.
+    if (entry.get()->completion().isReady()) {
+      Try<Nothing> validation = validate(entry.get());
+      if (validation.isError()) {
+        LOG(WARNING) << "Validation failed: '" + validation.error() +
+                        "'. Removing cache entry...";
+
+        remove(entry.get());
+        return None();
+      }
+    }
+
     // Refresh the cache entry by moving it to the back of lruSortedEntries.
     lruSortedEntries.remove(entry.get());
     lruSortedEntries.push_back(entry.get());
@@ -1048,6 +1064,7 @@ bool FetcherProcess::Cache::contains(
 //   (1) We failed to determine its prospective cache file size.
 //   (2) We failed to download it when invoking the mesos-fetcher.
 //   (3) We're evicting it to make room for another entry.
+//   (4) We failed to validate the cache file.
 //
 // In (1) and (2) the contract is that we'll have failed the entry's
 // future before we call remove, so the entry's future should no
@@ -1059,6 +1076,9 @@ bool FetcherProcess::Cache::contains(
 // reference count and therefore the future must either be ready or
 // failed in which case this is just case (1) above.
 //
+// In (4) we explicitly only validate a cache file if the future is
+// ready (i.e., the file has been downloaded).
+//
 // NOTE: It is not necessarily the case that this cache entry has
 // zero references because there might be some waiters on the
 // downloading of this entry which haven't been able to run and find
@@ -1161,6 +1181,26 @@ Try<Nothing> FetcherProcess::Cache::reserve(
 }
 
 
+Try<Nothing> FetcherProcess::Cache::validate(
+    const std::shared_ptr<Cache::Entry>& entry)
+{
+    VLOG(1) << "Validating cache entry '" << entry->key
+            << "' with filename: " << entry->filename;
+
+    if (!os::exists(entry->path().string())) {
+      return Error("Cache file does not exist: " + entry->filename);
+    }
+
+    // TODO(abudnik): Consider adding validation of the cache file by either:
+    //   1. Comparing a known file checksum with the actual checksum of the 
file
+    //      stored on disk.
+    //   2. Reading the whole file by chunks. Many filesystems detect data
+    //      corruptions when reading file's data. As a positive side effect,
+    //      the file's data will be loaded into the page cache.
+    return Nothing();
+}
+
+
 Try<Nothing> FetcherProcess::Cache::adjust(
     const shared_ptr<FetcherProcess::Cache::Entry>& entry)
 {
diff --git a/src/slave/containerizer/fetcher_process.hpp 
b/src/slave/containerizer/fetcher_process.hpp
index 56fecf2..faa5ac8 100644
--- a/src/slave/containerizer/fetcher_process.hpp
+++ b/src/slave/containerizer/fetcher_process.hpp
@@ -190,6 +190,9 @@ public:
     size_t size() const;
 
   private:
+    // Returns whether the cache file exists and not corrupted.
+    Try<Nothing> validate(const std::shared_ptr<Cache::Entry>& entry);
+
     // Maximum storable number of bytes in the cache directory.
     const Bytes space;
 

Reply via email to