Moved StatusUpdate.uuid from required to optional. Review: https://reviews.apache.org/r/35911
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/fda49c04 Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/fda49c04 Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/fda49c04 Branch: refs/heads/master Commit: fda49c04c88ce3c2c28866553669e64c8b65b956 Parents: a964bec Author: Benjamin Mahler <[email protected]> Authored: Thu Jun 25 22:44:53 2015 -0700 Committer: Benjamin Mahler <[email protected]> Committed: Mon Jun 29 10:01:27 2015 -0700 ---------------------------------------------------------------------- include/mesos/mesos.proto | 3 +- src/common/type_utils.cpp | 17 ++++----- src/master/master.cpp | 10 +++-- src/messages/messages.proto | 6 ++- src/sched/sched.cpp | 63 ++++++++++++++++---------------- src/scheduler/scheduler.cpp | 18 ++++++--- src/slave/slave.cpp | 15 ++++++++ src/slave/status_update_manager.cpp | 4 ++ 8 files changed, 86 insertions(+), 50 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/fda49c04/include/mesos/mesos.proto ---------------------------------------------------------------------- diff --git a/include/mesos/mesos.proto b/include/mesos/mesos.proto index 81008ed..0ebe5d3 100644 --- a/include/mesos/mesos.proto +++ b/include/mesos/mesos.proto @@ -920,7 +920,8 @@ message TaskStatus { // driver implicitly acknowledge (default). // // TODO(bmahler): This is currently overwritten in the scheduler - // driver, even if executors set this. + // driver and executor driver, but executors will need to set this + // to a valid RFC-4122 UUID if using the HTTP API. optional bytes uuid = 11; // Describes whether the task has been determined to be healthy http://git-wip-us.apache.org/repos/asf/mesos/blob/fda49c04/src/common/type_utils.cpp ---------------------------------------------------------------------- diff --git a/src/common/type_utils.cpp b/src/common/type_utils.cpp index f88dff7..f7b21c6 100644 --- a/src/common/type_utils.cpp +++ b/src/common/type_utils.cpp @@ -388,19 +388,18 @@ std::ostream& operator << ( std::ostream& stream, const StatusUpdate& update) { - stream - << update.status().state() - << " (UUID: " << UUID::fromBytes(update.uuid()) - << ") for task " << update.status().task_id(); + stream << update.status().state() + << (update.has_uuid() + ? " (UUID: " + stringify(UUID::fromBytes(update.uuid())) + : "") + << ") for task " << update.status().task_id(); if (update.status().has_healthy()) { - stream - << " in health state " - << (update.status().healthy() ? "healthy" : "unhealthy"); + stream << " in health state " + << (update.status().healthy() ? "healthy" : "unhealthy"); } - return stream - << " of framework " << update.framework_id(); + return stream << " of framework " << update.framework_id(); } } // namespace internal { http://git-wip-us.apache.org/repos/asf/mesos/blob/fda49c04/src/master/master.cpp ---------------------------------------------------------------------- diff --git a/src/master/master.cpp b/src/master/master.cpp index cbc6618..b8ed699 100644 --- a/src/master/master.cpp +++ b/src/master/master.cpp @@ -4990,9 +4990,13 @@ void Master::updateTask(Task* task, const StatusUpdate& update) task->set_state(status.state()); } - // Set the status update state and uuid for the task. - task->set_status_update_state(status.state()); - task->set_status_update_uuid(update.uuid()); + // Set the status update state and uuid for the task. Note that + // master-generated updates are terminal and do not have a uuid + // (in which case the master also calls removeTask()). + if (update.has_uuid()) { + task->set_status_update_state(status.state()); + task->set_status_update_uuid(update.uuid()); + } // TODO(brenden) Consider wiping the `message` field? if (task->statuses_size() > 0 && http://git-wip-us.apache.org/repos/asf/mesos/blob/fda49c04/src/messages/messages.proto ---------------------------------------------------------------------- diff --git a/src/messages/messages.proto b/src/messages/messages.proto index a1e71d8..165a16d 100644 --- a/src/messages/messages.proto +++ b/src/messages/messages.proto @@ -71,7 +71,11 @@ message StatusUpdate { optional SlaveID slave_id = 3; required TaskStatus status = 4; required double timestamp = 5; - required bytes uuid = 6; + + // This is being deprecated in favor of TaskStatus.uuid. In 0.23.0, + // we set the TaskStatus 'uuid' in the executor driver for all + // retryable status updates. + optional bytes uuid = 6; // This corresponds to the latest state of the task according to the // slave. Note that this state might be different than the state in http://git-wip-us.apache.org/repos/asf/mesos/blob/fda49c04/src/sched/sched.cpp ---------------------------------------------------------------------- diff --git a/src/sched/sched.cpp b/src/sched/sched.cpp index a4e35aa..d37b256 100644 --- a/src/sched/sched.cpp +++ b/src/sched/sched.cpp @@ -71,6 +71,8 @@ #include "authentication/cram_md5/authenticatee.hpp" +#include "common/protobuf_utils.hpp" + #include "local/flags.hpp" #include "local/local.hpp" @@ -690,16 +692,19 @@ protected: TaskStatus status = update.status(); - // If the update is driver-generated or master-generated, it - // does not require acknowledgement and so we unset the 'uuid' - // field of TaskStatus. Otherwise, we overwrite the field to - // ensure that a 0.22.0 scheduler driver supports explicit - // acknowledgements, even if running against a 0.21.0 cluster. + // If the update does not have a 'uuid', it does not need + // acknowledging. However, prior to 0.23.0, the update uuid + // was required and always set. In 0.24.0, we can rely on the + // update uuid check here, until then we must still check for + // this being sent from the driver (from == UPID()) or from + // the master (pid == UPID()). // - // TODO(bmahler): Update master and slave to ensure that 'uuid' is - // set accurately by the time it reaches the scheduler driver. - // This will be required for pure bindings. - if (from == UPID() || pid == UPID()) { + // TODO(bmahler): For the HTTP API, we will have to update the + // master and slave to ensure the 'uuid' in TaskStatus is set + // correctly. + if (!update.has_uuid()) { + status.clear_uuid(); + } else if (from == UPID() || pid == UPID()) { status.clear_uuid(); } else { status.set_uuid(update.uuid()); @@ -724,8 +729,8 @@ protected: return; } - // Don't acknowledge updates created by the driver or master. - if (from != UPID() && pid != UPID()) { + // See above for when we don't need to acknowledge. + if (update.has_uuid() && from != UPID() && pid != UPID()) { // We drop updates while we're disconnected. CHECK(connected); CHECK_SOME(master); @@ -915,16 +920,14 @@ protected: // master failover etc). The correct way for schedulers to deal // with this situation is to use 'reconcileTasks()'. foreach (const TaskInfo& task, tasks) { - StatusUpdate update; - update.mutable_framework_id()->MergeFrom(framework.id()); - TaskStatus* status = update.mutable_status(); - status->mutable_task_id()->MergeFrom(task.task_id()); - status->set_state(TASK_LOST); - status->set_source(TaskStatus::SOURCE_MASTER); - status->set_reason(TaskStatus::REASON_MASTER_DISCONNECTED); - status->set_message("Master Disconnected"); - update.set_timestamp(Clock::now().secs()); - update.set_uuid(UUID::random().toBytes()); + StatusUpdate update = protobuf::createStatusUpdate( + framework.id(), + None(), + task.task_id(), + TASK_LOST, + TaskStatus::SOURCE_MASTER, + "Master disconnected", + TaskStatus::REASON_MASTER_DISCONNECTED); statusUpdate(UPID(), update, UPID()); } @@ -996,16 +999,14 @@ protected: } foreach (const TaskInfo& task, operation.launch().task_infos()) { - StatusUpdate update; - update.mutable_framework_id()->MergeFrom(framework.id()); - TaskStatus* status = update.mutable_status(); - status->mutable_task_id()->MergeFrom(task.task_id()); - status->set_state(TASK_LOST); - status->set_source(TaskStatus::SOURCE_MASTER); - status->set_reason(TaskStatus::REASON_MASTER_DISCONNECTED); - status->set_message("Master Disconnected"); - update.set_timestamp(Clock::now().secs()); - update.set_uuid(UUID::random().toBytes()); + StatusUpdate update = protobuf::createStatusUpdate( + framework.id(), + None(), + task.task_id(), + TASK_LOST, + TaskStatus::SOURCE_MASTER, + "Master disconnected", + TaskStatus::REASON_MASTER_DISCONNECTED); statusUpdate(UPID(), update, UPID()); } http://git-wip-us.apache.org/repos/asf/mesos/blob/fda49c04/src/scheduler/scheduler.cpp ---------------------------------------------------------------------- diff --git a/src/scheduler/scheduler.cpp b/src/scheduler/scheduler.cpp index 1efc6fb..f360e4d 100644 --- a/src/scheduler/scheduler.cpp +++ b/src/scheduler/scheduler.cpp @@ -624,11 +624,19 @@ protected: update->mutable_status()->set_timestamp(message.update().timestamp()); - // If the update is generated by the master it doesn't need to be - // acknowledged; so we unset the UUID inside TaskStatus. - // TODO(vinod): Update master and slave to ensure that 'uuid' is - // set accurately by the time it reaches the scheduler. - if (UPID(message.pid()) == UPID()) { + // If the update does not have a 'uuid', it does not need + // acknowledging. However, prior to 0.23.0, the update uuid + // was required and always set. In 0.24.0, we can rely on the + // update uuid check here, until then we must still check for + // this being sent from the driver (from == UPID()) or from + // the master (pid == UPID()). + // + // TODO(bmahler): For the HTTP API, we will have to update the + // master and slave to ensure the 'uuid' in TaskStatus is set + // correctly. + if (!message.update().has_uuid()) { + update->mutable_status()->clear_uuid(); + } else if (UPID(message.pid()) == UPID()) { update->mutable_status()->clear_uuid(); } else { update->mutable_status()->set_uuid(message.update().uuid()); http://git-wip-us.apache.org/repos/asf/mesos/blob/fda49c04/src/slave/slave.cpp ---------------------------------------------------------------------- diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp index 9b72fad..1105a66 100644 --- a/src/slave/slave.cpp +++ b/src/slave/slave.cpp @@ -2663,6 +2663,15 @@ void Slave::statusUpdate(StatusUpdate update, const UPID& pid) state == RUNNING || state == TERMINATING) << state; + // TODO(bmahler): With the HTTP API, we must validate the UUID + // inside the TaskStatus. For now, we only care about the UUID + // inside the StatusUpdate, as the scheduler driver overwrites it. + if (!update.has_uuid()) { + LOG(WARNING) << "Ignoring status update " << update << " without 'uuid'"; + metrics.invalid_status_updates++; + return; + } + // Set the source before forwarding the status update. update.mutable_status()->set_source( pid == UPID() ? TaskStatus::SOURCE_SLAVE : TaskStatus::SOURCE_EXECUTOR); @@ -2880,6 +2889,9 @@ void Slave::forward(StatusUpdate update) } if (task != NULL) { + CHECK(update.has_uuid()) + << "Expecting updates without 'uuid' to have been rejected"; + // We set the status update state of the task here because in // steady state master updates the status update state of the // task when it receives this update. If the master fails over, @@ -5120,6 +5132,9 @@ void Executor::recoverTask(const TaskState& state) launchedTasks.contains(state.id)) { terminateTask(state.id, update.status()); + CHECK(update.has_uuid()) + << "Expecting updates without 'uuid' to have been rejected"; + // If the terminal update has been acknowledged, remove it. if (state.acks.contains(UUID::fromBytes(update.uuid()))) { completeTask(state.id); http://git-wip-us.apache.org/repos/asf/mesos/blob/fda49c04/src/slave/status_update_manager.cpp ---------------------------------------------------------------------- diff --git a/src/slave/status_update_manager.cpp b/src/slave/status_update_manager.cpp index 0ad2450..1978ac8 100644 --- a/src/slave/status_update_manager.cpp +++ b/src/slave/status_update_manager.cpp @@ -711,6 +711,10 @@ Try<bool> StatusUpdateStream::update(const StatusUpdate& update) return Error(error.get()); } + if (!update.has_uuid()) { + return Error("Status update is missing 'uuid'"); + } + // Check that this status update has not already been acknowledged. // This could happen in the rare case when the slave received the ACK // from the framework, died, but slave's ACK to the executor never made it!
