Repository: mesos Updated Branches: refs/heads/master 34d4a2f5a -> 7b3522b00
Adjusted the order in which filters are added/removed to tolerate slave crashes while adding/removing filters. Review: https://reviews.apache.org/r/24020 Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/7b3522b0 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/7b3522b0 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/7b3522b0 Branch: refs/heads/master Commit: 7b3522b00959a34ff33f2b998db867312b163210 Parents: 34d4a2f Author: Jie Yu <[email protected]> Authored: Mon Jul 28 16:51:19 2014 -0700 Committer: Jie Yu <[email protected]> Committed: Wed Jul 30 15:57:15 2014 -0700 ---------------------------------------------------------------------- .../isolators/network/port_mapping.cpp | 261 ++++++++++--------- .../isolators/network/port_mapping.hpp | 5 +- 2 files changed, 147 insertions(+), 119 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/7b3522b0/src/slave/containerizer/isolators/network/port_mapping.cpp ---------------------------------------------------------------------- diff --git a/src/slave/containerizer/isolators/network/port_mapping.cpp b/src/slave/containerizer/isolators/network/port_mapping.cpp index 4dfaeef..8445874 100644 --- a/src/slave/containerizer/isolators/network/port_mapping.cpp +++ b/src/slave/containerizer/isolators/network/port_mapping.cpp @@ -1060,6 +1060,7 @@ Future<Nothing> PortMappingIsolatorProcess::recover( return Failure("Failed to get all the links: " + links.error()); } + // The 'pids' here are extracted from veth devices. hashset<pid_t> pids; foreach (const string& name, links.get()) { Option<pid_t> pid = getPid(name); @@ -1096,17 +1097,29 @@ Future<Nothing> PortMappingIsolatorProcess::recover( << containerId << " with pid " << pid; if (!pids.contains(pid)) { - // This is possible because the container was launched by the - // slave with network isolation disabled, so the pid could not - // be found in the device names in the system. + // There are two possible cases here: + // + // 1) The container was launched by the slave with network + // isolation disabled, so the pid could not be found in the + // device names in the system. + // + // 2) The container was launched by the slave with network + // isolation enabled, but veth is removed (because the + // corresponding container is destroyed), but the slave + // restarts before it is able to write the sentinel file. + // + // In both cases, we treat the container as unmanaged. For case + // (2), it's safe to do so because the container has already + // been destroyed. VLOG(1) << "Skipped recovery for container " << containerId - << "with pid " << pid << " as it was not managed by " - << "the network isolator"; + << "with pid " << pid << " as either it was not managed by " + << "the network isolator or it has already been destroyed"; + unmanaged.insert(containerId); continue; } - Result<Info*> recover = _recover(pid); + Try<Info*> recover = _recover(pid); if (recover.isError()) { foreachvalue (Info* info, infos) { delete info; @@ -1117,15 +1130,6 @@ Future<Nothing> PortMappingIsolatorProcess::recover( return Failure( "Failed to recover container " + stringify(containerId) + " with pid " + stringify(pid) + ": " + recover.error()); - } else if (recover.isNone()) { - LOG(WARNING) << "Cannot recover container " << containerId - << " with pid " << pid - << ". It may have already been destroyed"; - - // This may occur if the executor has exited and the isolator - // has destroyed the container but the slave dies before - // noticing this. - continue; } infos[containerId] = recover.get(); @@ -1136,7 +1140,7 @@ Future<Nothing> PortMappingIsolatorProcess::recover( // If there are orphaned containers left, clean them up. foreach (pid_t pid, pids) { - Result<Info*> recover = _recover(pid); + Try<Info*> recover = _recover(pid); if (recover.isError()) { foreachvalue (Info* info, infos) { delete info; @@ -1147,11 +1151,6 @@ Future<Nothing> PortMappingIsolatorProcess::recover( return Failure( "Failed to recover orphaned container with pid " + stringify(pid) + ": " + recover.error()); - } else if (recover.isNone()) { - // If the control reaches here, a serious problem has occurred - // because our link (veth) has been unexpectedly deleted. - LOG(FATAL) << "The veth for orphaned container with pid " - << pid << " has been unexpectedly deleted"; } // The recovery should fail if we cannot cleanup an orphan. @@ -1175,10 +1174,15 @@ Future<Nothing> PortMappingIsolatorProcess::recover( } -Result<PortMappingIsolatorProcess::Info*> +Try<PortMappingIsolatorProcess::Info*> PortMappingIsolatorProcess::_recover(pid_t pid) { // Get all the IP filters on veth. + // NOTE: We only look at veth devices to recover port ranges + // assigned to each container. That's the reason why we need to make + // sure that we add filters to veth before adding filters to host + // eth0 and host lo. Also, we need to make sure we remove filters + // from host eth0 and host lo before removing filters from veth. Result<vector<ip::Classifier> > classifiers = ip::classifiers(veth(pid), ingress::HANDLE); @@ -1187,15 +1191,9 @@ PortMappingIsolatorProcess::_recover(pid_t pid) "Failed to get all the IP filters on " + veth(pid) + ": " + classifiers.error()); } else if (classifiers.isNone()) { - // Since we bind mount the network namespace handle (which causes - // an extra reference), the veth should be present even if the - // executor has exited. However, we may encounter a case where the - // veth is removed (because the corresponding container is - // destroyed), but the slave restarts before it is able to write - // the sentinel file. In that case, when the slave restarts, it - // will try to recover a container that has already been - // destroyed. To distinguish this case, we return None here. - return None(); + return Error( + "Failed to get all the IP filters on " + veth(pid) + + ": link does not exist"); } IntervalSet<uint16_t> nonEphemeralPorts; @@ -1230,25 +1228,35 @@ PortMappingIsolatorProcess::_recover(pid_t pid) } } - if (ephemeralPorts.empty()) { - return Error("No ephemeral ports found"); - } + Info* info = NULL; - if (ephemeralPorts.intervalCount() != 1) { - return Error("Each container should have only one ephemeral port range"); - } + if (ephemeralPorts.empty()) { + // NOTE: This is possible because the slave may crash while + // calling 'isolate()', leaving a partially isolated container. To + // clean up this partially isolated container, we still create an + // Info struct here and let the 'cleanup' function clean it up + // later. + LOG(WARNING) << "No ephemeral ports found for container with pid " + << stringify(pid) << ". This could happen if slave crashes " + << "while isolating a container"; + + info = new Info(nonEphemeralPorts, Interval<uint16_t>(), pid); + } else { + if (ephemeralPorts.intervalCount() != 1) { + return Error("Each container should have only one ephemeral port range"); + } - // Tell the allocator that this ephemeral port range is used. - ephemeralPortsAllocator->allocate(*ephemeralPorts.begin()); + // Tell the allocator that this ephemeral port range is used. + ephemeralPortsAllocator->allocate(*ephemeralPorts.begin()); - Info* info = new Info(nonEphemeralPorts, *ephemeralPorts.begin(), pid); - CHECK_NOTNULL(info); + info = new Info(nonEphemeralPorts, *ephemeralPorts.begin(), pid); - VLOG(1) << "Recovered network isolator for container with pid " << pid - << " non-ephemeral port ranges " << nonEphemeralPorts - << " and ephemeral port range " << *ephemeralPorts.begin(); + VLOG(1) << "Recovered network isolator for container with pid " << pid + << " non-ephemeral port ranges " << nonEphemeralPorts + << " and ephemeral port range " << *ephemeralPorts.begin(); + } - return info; + return CHECK_NOTNULL(info); } @@ -1857,13 +1865,6 @@ Try<Nothing> PortMappingIsolatorProcess::_cleanup(Info* _info) pid_t pid = info->pid.get(); - // If the bind mount does not exist, there is no need to proceed as - // there will be nothing to cleanup. - const string target = path::join(BIND_MOUNT_ROOT, stringify(pid)); - if (!os::exists(target)) { - return Error("The bind mount at '" + target + "' does not exist"); - } - // NOTE: The 'isolate()' function above may fail at any point if the // child process with 'pid' is gone (e.g., killed by a user, failed // to load shared libraries, etc.). Therefore, this cleanup function @@ -1881,7 +1882,9 @@ Try<Nothing> PortMappingIsolatorProcess::_cleanup(Info* _info) LOG(INFO) << "Removing IP packet filters with ports " << range << " for container with pid " << pid; - Try<Nothing> removing = removeHostIPFilters(range, veth(pid)); + // No need to remove filters on veth as they will be automatically + // removed by the kernel when we remove the link below. + Try<Nothing> removing = removeHostIPFilters(range, veth(pid), false); if (removing.isError()) { errors.push_back( "Failed to remove IP packet filter with ports " + @@ -1891,7 +1894,9 @@ Try<Nothing> PortMappingIsolatorProcess::_cleanup(Info* _info) } // Free the ephemeral ports used by this container. - ephemeralPortsAllocator->deallocate(info->ephemeralPorts); + if (info->ephemeralPorts != Interval<uint16_t>()) { + ephemeralPortsAllocator->deallocate(info->ephemeralPorts); + } LOG(INFO) << "Freed ephemeral ports " << info->ephemeralPorts << " for container with pid " << pid; @@ -1988,6 +1993,8 @@ Try<Nothing> PortMappingIsolatorProcess::_cleanup(Info* _info) } // Release the bind mount for this container. + const string target = path::join(BIND_MOUNT_ROOT, stringify(pid)); + Try<Nothing> unmount = fs::unmount(target, MNT_DETACH); if (unmount.isError()) { errors.push_back("Failed to umount: " + unmount.error()); @@ -1999,7 +2006,8 @@ Try<Nothing> PortMappingIsolatorProcess::_cleanup(Info* _info) // someone entered into the container for debugging purpose. In that // case remove will fail, which is okay, because we only leaked an // empty file, which could also be reused later if the pid (the name - // of the file) is used later. + // of the file) is used again. However, we still return error to + // indicate that the cleanup hasn't been successful. Try<Nothing> rm = os::rm(target); if (rm.isError()) { errors.push_back("Failed to remove " + target + ": " + rm.error()); @@ -2029,53 +2037,39 @@ Try<Nothing> PortMappingIsolatorProcess::addHostIPFilters( const PortRange& range, const string& veth) { - // Add an IP packet filter from host eth0 to veth of the container - // such that any incoming IP packet will be properly redirected to - // the corresponding container based on its destination port. - Try<bool> hostEth0ToVeth = filter::ip::create( - eth0, - ingress::HANDLE, - ip::Classifier(hostMAC, net::IP(hostIP.address()), None(), range), - Priority(IP_FILTER_PRIORITY, NORMAL), - action::Redirect(veth)); - - if (hostEth0ToVeth.isError()) { - ++metrics.adding_eth0_ip_filters_errors; + // NOTE: The order in which these filters are added is important! + // For each port range, we need to make sure that we don't try to + // add filters on host eth0 and host lo until we have successfully + // added filters on veth. This is because the slave could crash + // while we are adding filters, we want to make sure we don't leak + // any filters on host eth0 and host lo. - return Error( - "Failed to create an IP packet filter from host " + - eth0 + " to " + veth + ": " + hostEth0ToVeth.error()); - } else if (!hostEth0ToVeth.get()) { - ++metrics.adding_eth0_ip_filters_already_exist; - - return Error( - "The IP packet filter from host " + eth0 + " to " + - veth + " already exists"); - } - - // Add an IP packet filter from host lo to veth of the container - // such that any internally generated IP packet will be properly - // redirected to the corresponding container based on its - // destination port. - Try<bool> hostLoToVeth = filter::ip::create( - lo, + // Add an IP packet filter from veth of the container to host eth0 + // to properly redirect IP packets sent from one container to + // external hosts. This filter has a lower priority compared to the + // 'vethToHostLo' filter because it does not check the destination + // IP. Notice that here we also check the source port of a packet. + // If the source port is not within the port ranges allocated for + // the container, the packet will get dropped. + Try<bool> vethToHostEth0 = filter::ip::create( + veth, ingress::HANDLE, - ip::Classifier(None(), None(), None(), range), - Priority(IP_FILTER_PRIORITY, NORMAL), - action::Redirect(veth)); + ip::Classifier(None(), None(), range, None()), + Priority(IP_FILTER_PRIORITY, LOW), + action::Redirect(eth0)); - if (hostLoToVeth.isError()) { - ++metrics.adding_lo_ip_filters_errors; + if (vethToHostEth0.isError()) { + ++metrics.adding_veth_ip_filters_errors; return Error( - "Failed to create an IP packet filter from host " + - lo + " to " + veth + ": " + hostLoToVeth.error()); - } else if (!hostLoToVeth.get()) { - ++metrics.adding_lo_ip_filters_already_exist; + "Failed to create an IP packet filter from " + veth + + " to host " + eth0 + ": " + vethToHostEth0.error()); + } else if (!vethToHostEth0.get()) { + ++metrics.adding_veth_ip_filters_already_exist; return Error( - "The IP packet filter from host " + lo + " to " + - veth + " already exists"); + "The IP packet filter from " + veth + + " to host " + eth0 + " already exists"); } // Add two IP packet filters (one for public IP and one for loopback @@ -2126,32 +2120,53 @@ Try<Nothing> PortMappingIsolatorProcess::addHostIPFilters( veth + " to host " + lo + " already exists"); } - // Add an IP packet filter from veth of the container to host eth0 - // to properly redirect IP packets sent from one container to - // external hosts. This filter has a lower priority compared to the - // 'vethToHostLo' filter because it does not check the destination - // IP. Notice that here we also check the source port of a packet. - // If the source port is not within the port ranges allocated for - // the container, the packet will get dropped. - Try<bool> vethToHostEth0 = filter::ip::create( - veth, + // Add an IP packet filter from host eth0 to veth of the container + // such that any incoming IP packet will be properly redirected to + // the corresponding container based on its destination port. + Try<bool> hostEth0ToVeth = filter::ip::create( + eth0, ingress::HANDLE, - ip::Classifier(None(), None(), range, None()), - Priority(IP_FILTER_PRIORITY, LOW), - action::Redirect(eth0)); + ip::Classifier(hostMAC, net::IP(hostIP.address()), None(), range), + Priority(IP_FILTER_PRIORITY, NORMAL), + action::Redirect(veth)); - if (vethToHostEth0.isError()) { - ++metrics.adding_veth_ip_filters_errors; + if (hostEth0ToVeth.isError()) { + ++metrics.adding_eth0_ip_filters_errors; return Error( - "Failed to create an IP packet filter from " + veth + - " to host " + eth0 + ": " + vethToHostEth0.error()); - } else if (!vethToHostEth0.get()) { - ++metrics.adding_veth_ip_filters_already_exist; + "Failed to create an IP packet filter from host " + + eth0 + " to " + veth + ": " + hostEth0ToVeth.error()); + } else if (!hostEth0ToVeth.get()) { + ++metrics.adding_eth0_ip_filters_already_exist; return Error( - "The IP packet filter from " + veth + - " to host " + eth0 + " already exists"); + "The IP packet filter from host " + eth0 + " to " + + veth + " already exists"); + } + + // Add an IP packet filter from host lo to veth of the container + // such that any internally generated IP packet will be properly + // redirected to the corresponding container based on its + // destination port. + Try<bool> hostLoToVeth = filter::ip::create( + lo, + ingress::HANDLE, + ip::Classifier(None(), None(), None(), range), + Priority(IP_FILTER_PRIORITY, NORMAL), + action::Redirect(veth)); + + if (hostLoToVeth.isError()) { + ++metrics.adding_lo_ip_filters_errors; + + return Error( + "Failed to create an IP packet filter from host " + + lo + " to " + veth + ": " + hostLoToVeth.error()); + } else if (!hostLoToVeth.get()) { + ++metrics.adding_lo_ip_filters_already_exist; + + return Error( + "The IP packet filter from host " + lo + " to " + + veth + " already exists"); } return Nothing(); @@ -2159,11 +2174,17 @@ Try<Nothing> PortMappingIsolatorProcess::addHostIPFilters( // Helper function to remove IP filters from the host side for a given -// port range. +// port range. The boolean flag 'removeFiltersOnVeth' indicates if we +// need to remove filters on veth. Try<Nothing> PortMappingIsolatorProcess::removeHostIPFilters( const PortRange& range, - const string& veth) + const string& veth, + bool removeFiltersOnVeth) { + // NOTE: Similar to above. The order in which these filters are + // removed is important. We need to remove filters on host eth0 and + // host lo first before we remove filters on veth. + // Remove the IP packet filter from host eth0 to veth of the container Try<bool> hostEth0ToVeth = filter::ip::remove( eth0, @@ -2202,6 +2223,12 @@ Try<Nothing> PortMappingIsolatorProcess::removeHostIPFilters( << " to " << veth << " does not exist"; } + // Now, we try to remove filters on veth. No need to proceed if the + // user does not ask us to do so. + if (!removeFiltersOnVeth) { + return Nothing(); + } + // Remove the IP packet filter from veth of the container to // host lo for the public IP. Try<bool> vethToHostLoPublic = filter::ip::remove( http://git-wip-us.apache.org/repos/asf/mesos/blob/7b3522b0/src/slave/containerizer/isolators/network/port_mapping.hpp ---------------------------------------------------------------------- diff --git a/src/slave/containerizer/isolators/network/port_mapping.hpp b/src/slave/containerizer/isolators/network/port_mapping.hpp index dd1db7f..f75429c 100644 --- a/src/slave/containerizer/isolators/network/port_mapping.hpp +++ b/src/slave/containerizer/isolators/network/port_mapping.hpp @@ -239,7 +239,7 @@ private: // Continuations. Try<Nothing> _cleanup(Info* info); - Result<Info*> _recover(pid_t pid); + Try<Info*> _recover(pid_t pid); void _update( const process::Future<Option<int> >& status, @@ -252,7 +252,8 @@ private: Try<Nothing> removeHostIPFilters( const routing::filter::ip::PortRange& range, - const std::string& veth); + const std::string& veth, + bool removeFiltersOnVeth = true); // Return the scripts that will be executed in the child context. std::string scripts(Info* info);
