Repository: mesos Updated Branches: refs/heads/master 6a8738f89 -> 39acb0a6b
Made `shutdown_grace_period` configurable in `ExecutorInfo`. If `ExecutorInfo.shutdown_grace_period` is set, the executor driver uses it, otherwise it falls back to the environment variable `MESOS_EXECUTOR_SHUTDOWN_GRACE_PERIOD`. Review: https://reviews.apache.org/r/44655/ Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/92266d05 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/92266d05 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/92266d05 Branch: refs/heads/master Commit: 92266d05a883c2898a41f29a47766a1679fa827c Parents: 6a8738f Author: Alexander Rukletsov <ruklet...@gmail.com> Authored: Thu Mar 24 17:29:10 2016 +0100 Committer: Alexander Rukletsov <al...@apache.org> Committed: Thu Mar 24 18:20:59 2016 +0100 ---------------------------------------------------------------------- CHANGELOG | 9 +++++++++ docs/configuration.md | 9 +++++++-- include/mesos/executor/executor.proto | 21 +++++++++++++++------ include/mesos/mesos.proto | 8 ++++++++ include/mesos/v1/executor/executor.proto | 21 +++++++++++++++------ include/mesos/v1/mesos.proto | 8 ++++++++ src/slave/containerizer/containerizer.cpp | 11 ++++++++++- src/slave/flags.cpp | 8 ++++++-- src/slave/slave.cpp | 10 +++++++++- 9 files changed, 87 insertions(+), 18 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/92266d05/CHANGELOG ---------------------------------------------------------------------- diff --git a/CHANGELOG b/CHANGELOG index 1fadaa1..5aeb479 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,14 @@ Release Notes - Mesos - Version 0.29.0 (WIP) -------------------------------------------- +This release contains the following new features: + * [MESOS-4949] - The executor shutdown grace period can now be configured in + `ExecutorInfo`, which overrides the agent flag. When shutting down an + executor the agent will wait in a best-effort manner for the grace period + specified here before forcibly destroying the container. The executor must + not assume that it will always be allotted the full grace period, as the + agent may decide to allot a shorter period and failures / forcible + terminations may occur. Together with kill policies this gives frameworks + flexibility around how to clean up tasks and executors. Deprecations: * [MESOS-5001] - The 'allocator/event_queue_dispatches' metric is now http://git-wip-us.apache.org/repos/asf/mesos/blob/92266d05/docs/configuration.md ---------------------------------------------------------------------- diff --git a/docs/configuration.md b/docs/configuration.md index 70686ad..b0f9cb5 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1181,8 +1181,13 @@ shutting it down (e.g., 60secs, 3mins, etc) (default: 1mins) --executor_shutdown_grace_period=VALUE </td> <td> -Amount of time to wait for an executor -to shut down (e.g., 60secs, 3mins, etc) (default: 5secs) +Default amount of time to wait for an executor to shut down +(e.g. 60secs, 3mins, etc). ExecutorInfo.shutdown_grace_period +overrides this default. Note that the executor must not assume +that it will always be allotted the full grace period, as the +agent may decide to allot a shorter period, and failures / forcible +terminations may occur. +(default: 5secs) </td> </tr> <tr> http://git-wip-us.apache.org/repos/asf/mesos/blob/92266d05/include/mesos/executor/executor.proto ---------------------------------------------------------------------- diff --git a/include/mesos/executor/executor.proto b/include/mesos/executor/executor.proto index ae21119..6b4141f 100644 --- a/include/mesos/executor/executor.proto +++ b/include/mesos/executor/executor.proto @@ -41,12 +41,21 @@ message Event { // Received when the agent asks the executor to shutdown/kill itself. // The executor is then required to kill all its active tasks, send - // `TASK_KILLED` status updates and gracefully exit. If an executor - // does not terminate within a `MESOS_EXECUTOR_SHUTDOWN_GRACE_PERIOD` - // period (an environment variable set by the agent upon executor - // startup), the agent will forcefully destroy the container where the - // executor is running. The agent would then send `TASK_LOST` updates - // for any remaining active tasks of this executor. + // `TASK_KILLED` status updates and gracefully exit. The executor + // should terminate within a `MESOS_EXECUTOR_SHUTDOWN_GRACE_PERIOD` + // (an environment variable set by the agent upon executor startup); + // it can be configured via `ExecutorInfo.shutdown_grace_period`. If + // the executor fails to do so, the agent will forcefully destroy the + // container where the executor is running. The agent would then send + // `TASK_LOST` updates for any remaining active tasks of this executor. + // + // NOTE: The executor must not assume that it will always be allotted + // the full grace period, as the agent may decide to allot a shorter + // period and failures / forcible terminations may occur. + // + // TODO(alexr): Consider adding a duration field into the `Shutdown` + // message so that the agent can communicate when a shorter period + // has been allotted. SHUTDOWN = 7; } http://git-wip-us.apache.org/repos/asf/mesos/blob/92266d05/include/mesos/mesos.proto ---------------------------------------------------------------------- diff --git a/include/mesos/mesos.proto b/include/mesos/mesos.proto index 59f5d3a..ade6f5a 100644 --- a/include/mesos/mesos.proto +++ b/include/mesos/mesos.proto @@ -450,6 +450,14 @@ message ExecutorInfo { // discovery system to use this information as needed and to handle // executors without service discovery information. optional DiscoveryInfo discovery = 12; + + // When shutting down an executor the agent will wait in a + // best-effort manner for the grace period specified here + // before forcibly destroying the container. The executor + // must not assume that it will always be allotted the full + // grace period, as the agent may decide to allot a shorter + // period and failures / forcible terminations may occur. + optional DurationInfo shutdown_grace_period = 13; } http://git-wip-us.apache.org/repos/asf/mesos/blob/92266d05/include/mesos/v1/executor/executor.proto ---------------------------------------------------------------------- diff --git a/include/mesos/v1/executor/executor.proto b/include/mesos/v1/executor/executor.proto index 36a2b3f..d7b1dd5 100644 --- a/include/mesos/v1/executor/executor.proto +++ b/include/mesos/v1/executor/executor.proto @@ -41,12 +41,21 @@ message Event { // Received when the agent asks the executor to shutdown/kill itself. // The executor is then required to kill all its active tasks, send - // `TASK_KILLED` status updates and gracefully exit. If an executor - // does not terminate within a `MESOS_EXECUTOR_SHUTDOWN_GRACE_PERIOD` - // period (an environment variable set by the agent upon executor - // startup), the agent will forcefully destroy the container where the - // executor is running. The agent would then send `TASK_LOST` updates - // for any remaining active tasks of this executor. + // `TASK_KILLED` status updates and gracefully exit. The executor + // should terminate within a `MESOS_EXECUTOR_SHUTDOWN_GRACE_PERIOD` + // (an environment variable set by the agent upon executor startup); + // it can be configured via `ExecutorInfo.shutdown_grace_period`. If + // the executor fails to do so, the agent will forcefully destroy the + // container where the executor is running. The agent would then send + // `TASK_LOST` updates for any remaining active tasks of this executor. + // + // NOTE: The executor must not assume that it will always be allotted + // the full grace period, as the agent may decide to allot a shorter + // period and failures / forcible terminations may occur. + // + // TODO(alexr): Consider adding a duration field into the `Shutdown` + // message so that the agent can communicate when a shorter period + // has been allotted. SHUTDOWN = 7; } http://git-wip-us.apache.org/repos/asf/mesos/blob/92266d05/include/mesos/v1/mesos.proto ---------------------------------------------------------------------- diff --git a/include/mesos/v1/mesos.proto b/include/mesos/v1/mesos.proto index 6556a1c..7a5b680 100644 --- a/include/mesos/v1/mesos.proto +++ b/include/mesos/v1/mesos.proto @@ -450,6 +450,14 @@ message ExecutorInfo { // discovery system to use this information as needed and to handle // executors without service discovery information. optional DiscoveryInfo discovery = 12; + + // When shutting down an executor the agent will wait in a + // best-effort manner for the grace period specified here + // before forcibly destroying the container. The executor + // must not assume that it will always be allotted the full + // grace period, as the agent may decide to allot a shorter + // period and failures / forcible terminations may occur. + optional DurationInfo shutdown_grace_period = 13; } http://git-wip-us.apache.org/repos/asf/mesos/blob/92266d05/src/slave/containerizer/containerizer.cpp ---------------------------------------------------------------------- diff --git a/src/slave/containerizer/containerizer.cpp b/src/slave/containerizer/containerizer.cpp index f6fc786..3556040 100644 --- a/src/slave/containerizer/containerizer.cpp +++ b/src/slave/containerizer/containerizer.cpp @@ -329,8 +329,17 @@ map<string, string> executorEnvironment( environment["MESOS_SLAVE_PID"] = stringify(slavePid); environment["MESOS_AGENT_ENDPOINT"] = stringify(slavePid.address); environment["MESOS_CHECKPOINT"] = checkpoint ? "1" : "0"; + + // Set executor's shutdown grace period. If set, the customized value + // from `ExecutorInfo` overrides the default from agent flags. + Duration executorShutdownGracePeriod = flags.executor_shutdown_grace_period; + if (executorInfo.has_shutdown_grace_period()) { + executorShutdownGracePeriod = + Nanoseconds(executorInfo.shutdown_grace_period().nanoseconds()); + } + environment["MESOS_EXECUTOR_SHUTDOWN_GRACE_PERIOD"] = - stringify(flags.executor_shutdown_grace_period); + stringify(executorShutdownGracePeriod); if (checkpoint) { environment["MESOS_RECOVERY_TIMEOUT"] = stringify(flags.recovery_timeout); http://git-wip-us.apache.org/repos/asf/mesos/blob/92266d05/src/slave/flags.cpp ---------------------------------------------------------------------- diff --git a/src/slave/flags.cpp b/src/slave/flags.cpp index 71685ce..e831ce7 100644 --- a/src/slave/flags.cpp +++ b/src/slave/flags.cpp @@ -251,8 +251,12 @@ mesos::internal::slave::Flags::Flags() add(&Flags::executor_shutdown_grace_period, "executor_shutdown_grace_period", - "Amount of time to wait for an executor\n" - "to shut down (e.g., 60secs, 3mins, etc)", + "Default amount of time to wait for an executor to shut down\n" + "(e.g. 60secs, 3mins, etc). ExecutorInfo.shutdown_grace_period\n" + "overrides this default. Note that the executor must not assume\n" + "that it will always be allotted the full grace period, as the\n" + "agent may decide to allot a shorter period, and failures / forcible\n" + "terminations may occur.", DEFAULT_EXECUTOR_SHUTDOWN_GRACE_PERIOD); add(&Flags::gc_delay, http://git-wip-us.apache.org/repos/asf/mesos/blob/92266d05/src/slave/slave.cpp ---------------------------------------------------------------------- diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp index 08ff1ed..c6fd810 100644 --- a/src/slave/slave.cpp +++ b/src/slave/slave.cpp @@ -4346,8 +4346,16 @@ void Slave::_shutdownExecutor(Framework* framework, Executor* executor) // will be dropped to the floor! executor->send(ShutdownExecutorMessage()); + // If the executor specifies shutdown grace period, + // pass it instead of the default. + Duration shutdownTimeout = flags.executor_shutdown_grace_period; + if (executor->info.has_shutdown_grace_period()) { + shutdownTimeout = Nanoseconds( + executor->info.shutdown_grace_period().nanoseconds()); + } + // Prepare for sending a kill if the executor doesn't comply. - delay(flags.executor_shutdown_grace_period, + delay(shutdownTimeout, self(), &Slave::shutdownExecutorTimeout, framework->id(),