This is an automated email from the ASF dual-hosted git repository. bennoe pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/mesos.git
commit ede2a94ebaf9710516816bae7d012d926c533a59 Author: Benno Evers <[email protected]> AuthorDate: Thu Feb 28 18:02:56 2019 +0100 Added unit tests for offer operation feedback metrics. This adds a set of checks to verify the metrics introduced in the previous commit are working as intended. Review: https://reviews.apache.org/r/70117 --- src/tests/agent_operation_feedback_tests.cpp | 10 ++++ src/tests/api_tests.cpp | 10 ++++ src/tests/master_slave_reconciliation_tests.cpp | 1 + src/tests/master_tests.cpp | 11 ++++ src/tests/operation_reconciliation_tests.cpp | 8 +++ src/tests/persistent_volume_endpoints_tests.cpp | 39 +++++++++++-- src/tests/reservation_endpoints_tests.cpp | 1 + src/tests/scheduler_tests.cpp | 2 + src/tests/slave_tests.cpp | 5 ++ .../storage_local_resource_provider_tests.cpp | 64 +++++++++++----------- 10 files changed, 114 insertions(+), 37 deletions(-) diff --git a/src/tests/agent_operation_feedback_tests.cpp b/src/tests/agent_operation_feedback_tests.cpp index 5a8f54c..e427441 100644 --- a/src/tests/agent_operation_feedback_tests.cpp +++ b/src/tests/agent_operation_feedback_tests.cpp @@ -146,6 +146,10 @@ TEST_P(AgentOperationFeedbackTest, RetryOperationStatusUpdate) EXPECT_EQ(operationId, update->status().operation_id()); EXPECT_EQ(OPERATION_FINISHED, update->status().state()); + // The master has already seen the update, so the operation is counted + // in the metrics. + EXPECT_TRUE(metricEquals("master/operations/finished", 1)); + // The agent should resend the unacknowledged operation status update once the // status update retry interval lapses. Future<scheduler::Event::UpdateOperationStatus> retriedUpdate; @@ -172,6 +176,9 @@ TEST_P(AgentOperationFeedbackTest, RetryOperationStatusUpdate) AWAIT_READY(acknowledgeOperationStatusMessage); + // Verify that the update has not been double-counted. + EXPECT_TRUE(metricEquals("master/operations/finished", 1)); + // The operation status update has been acknowledged, so the agent shouldn't // send further status updates. EXPECT_NO_FUTURE_PROTOBUFS(UpdateOperationStatusMessage(), _, _); @@ -263,6 +270,7 @@ TEST_P(AgentOperationFeedbackTest, CleanupAcknowledgedTerminalOperation) EXPECT_EQ(operationId, update->status().operation_id()); EXPECT_EQ(OPERATION_FINISHED, update->status().state()); + EXPECT_TRUE(metricEquals("master/operations/finished", 1)); { master::Call call; @@ -417,6 +425,7 @@ TEST_P(AgentOperationFeedbackTest, OperationUpdateContainsAgentID) EXPECT_EQ(operationId, update->status().operation_id()); EXPECT_EQ(OPERATION_FINISHED, update->status().state()); + EXPECT_TRUE(metricEquals("master/operations/finished", 1)); ASSERT_TRUE(update->status().has_agent_id()); EXPECT_EQ(agentId, update->status().agent_id()); @@ -562,6 +571,7 @@ TEST_P(AgentOperationFeedbackTest, DroppedOperationStatusUpdate) EXPECT_EQ(OPERATION_DROPPED, update->status().state()); EXPECT_FALSE(update->status().has_uuid()); EXPECT_FALSE(update->status().has_resource_provider_id()); + EXPECT_TRUE(metricEquals("master/operations/dropped", 1)); const AgentID& agentId(offer.agent_id()); ASSERT_TRUE(update->status().has_agent_id()); diff --git a/src/tests/api_tests.cpp b/src/tests/api_tests.cpp index f241064..e76417a 100644 --- a/src/tests/api_tests.cpp +++ b/src/tests/api_tests.cpp @@ -5114,6 +5114,11 @@ TEST_P(MasterAPITest, OperationUpdatesUponAgentGone) EXPECT_EQ( v1::OPERATION_GONE_BY_OPERATOR, operationGoneUpdate->status().state()); + + // TODO(bevers): We have to reset the agent before we can access the + // metrics to work around MESOS-9644. + slave->reset(); + EXPECT_TRUE(metricEquals("master/operations/gone_by_operator", 1)); } @@ -5253,6 +5258,11 @@ TEST_P(MasterAPITest, OperationUpdatesUponUnreachable) v1::OPERATION_UNREACHABLE, operationUnreachableUpdate->status().state()); + // TODO(bevers): We have to reset the agent before we can access the + // metrics to work around MESOS-9644. + slave->reset(); + EXPECT_TRUE(metricEquals("master/operations/unreachable", 1)); + Clock::resume(); } diff --git a/src/tests/master_slave_reconciliation_tests.cpp b/src/tests/master_slave_reconciliation_tests.cpp index 002be27..7b6ac50 100644 --- a/src/tests/master_slave_reconciliation_tests.cpp +++ b/src/tests/master_slave_reconciliation_tests.cpp @@ -605,6 +605,7 @@ TEST_F( EXPECT_EQ(operationId, operationDroppedUpdate->status().operation_id()); EXPECT_EQ(v1::OPERATION_DROPPED, operationDroppedUpdate->status().state()); + EXPECT_TRUE(metricEquals("master/operations/dropped", 1)); } // This test verifies that the master reconciles tasks that are diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp index 5a92683..964d935 100644 --- a/src/tests/master_tests.cpp +++ b/src/tests/master_tests.cpp @@ -9247,6 +9247,9 @@ TEST_F(MasterTest, OperationUpdateDuringFailover) AWAIT_READY(updateOperationStatusMessage1); AWAIT_READY(updateOperationStatusMessage2); + EXPECT_TRUE(metricEquals("master/operations/pending", 2)); + EXPECT_TRUE(metricEquals("master/operations/reserve/pending", 1)); + EXPECT_TRUE(metricEquals("master/operations/create_disk/pending", 1)); // Fail over the master. master->reset(); @@ -9314,6 +9317,14 @@ TEST_F(MasterTest, OperationUpdateDuringFailover) EXPECT_EQ(receivedOperationUUIDs, expectedOperationUUIDs); } + EXPECT_TRUE(metricEquals("master/operations/pending", 0)); + EXPECT_TRUE(metricEquals("master/operations/reserve/pending", 0)); + EXPECT_TRUE(metricEquals("master/operations/create_disk/pending", 0)); + + EXPECT_TRUE(metricEquals("master/operations/finished", 2)); + EXPECT_TRUE(metricEquals("master/operations/reserve/finished", 1)); + EXPECT_TRUE(metricEquals("master/operations/create_disk/finished", 1)); + driver.stop(); driver.join(); } diff --git a/src/tests/operation_reconciliation_tests.cpp b/src/tests/operation_reconciliation_tests.cpp index 9e79f9e..eae318d 100644 --- a/src/tests/operation_reconciliation_tests.cpp +++ b/src/tests/operation_reconciliation_tests.cpp @@ -1108,6 +1108,7 @@ TEST_P(OperationReconciliationTest, ReconcileUnackedAgentOperation) EXPECT_EQ(operationId, update->status().operation_id()); EXPECT_EQ(OPERATION_FINISHED, update->status().state()); + EXPECT_TRUE(metricEquals("master/operations/finished", 1)); ASSERT_TRUE(update->status().has_agent_id()); EXPECT_EQ(agentId, update->status().agent_id()); @@ -1253,6 +1254,7 @@ TEST_P(OperationReconciliationTest, UnackedAgentOperationAfterMasterFailover) EXPECT_EQ(operationId, update->status().operation_id()); EXPECT_EQ(OPERATION_FINISHED, update->status().state()); + EXPECT_TRUE(metricEquals("master/operations/finished", 1)); // Simulate master failover. EXPECT_CALL(*scheduler, disconnected(_)); @@ -1440,6 +1442,7 @@ TEST_P(OperationReconciliationTest, ReconcileAgentOperationOnGoneAgent) EXPECT_EQ(operationId, update->status().operation_id()); EXPECT_EQ(OPERATION_FINISHED, update->status().state()); + EXPECT_TRUE(metricEquals("master/operations/finished", 1)); ASSERT_TRUE(update->status().has_agent_id()); EXPECT_EQ(agentId, update->status().agent_id()); @@ -1473,6 +1476,11 @@ TEST_P(OperationReconciliationTest, ReconcileAgentOperationOnGoneAgent) EXPECT_EQ(operationId, goneUpdate->status().operation_id()); EXPECT_EQ(OPERATION_GONE_BY_OPERATOR, goneUpdate->status().state()); + // TODO(bevers): We have to reset the agent before we can access the + // metrics to work around MESOS-9644. + slave->reset(); + EXPECT_TRUE(metricEquals("master/operations/gone_by_operator", 1)); + ASSERT_TRUE(goneUpdate->status().has_agent_id()); EXPECT_EQ(agentId, goneUpdate->status().agent_id()); diff --git a/src/tests/persistent_volume_endpoints_tests.cpp b/src/tests/persistent_volume_endpoints_tests.cpp index 40d7e6a..bddc946 100644 --- a/src/tests/persistent_volume_endpoints_tests.cpp +++ b/src/tests/persistent_volume_endpoints_tests.cpp @@ -1533,6 +1533,7 @@ TEST_F(PersistentVolumeEndpointsTest, OfferCreateThenEndpointRemove) // Make sure the allocator processes the `RESERVE` operation before summoning // an offer. AWAIT_READY(updateOperationStatusMessage); + EXPECT_TRUE(metricEquals("master/operations/finished", 1)); // Summon an offer. EXPECT_CALL(sched, resourceOffers(&driver, _)) @@ -1567,6 +1568,7 @@ TEST_F(PersistentVolumeEndpointsTest, OfferCreateThenEndpointRemove) // Make sure the master processes the accept call before summoning an offer. AWAIT_READY(updateOperationStatusMessage); + EXPECT_TRUE(metricEquals("master/operations/finished", 2)); // Summon an offer. EXPECT_CALL(sched, resourceOffers(&driver, _)) @@ -1679,6 +1681,9 @@ TEST_F(PersistentVolumeEndpointsTest, EndpointCreateThenOfferRemove) unreserved.pushReservation(createDynamicReservationInfo( frameworkInfo.roles(0), DEFAULT_CREDENTIAL.principal())); + Future<AcknowledgeOperationStatusMessage> acknowledgeOperationStatus1 = + FUTURE_PROTOBUF(AcknowledgeOperationStatusMessage(), _, _); + Future<Response> response = process::http::post( master.get()->pid, "reserve", @@ -1686,6 +1691,7 @@ TEST_F(PersistentVolumeEndpointsTest, EndpointCreateThenOfferRemove) createRequestBody(slaveId, "resources", dynamicallyReserved)); AWAIT_EXPECT_RESPONSE_STATUS_EQ(Accepted().status, response); + AWAIT_READY(acknowledgeOperationStatus1); // Create a 1MB persistent volume. Resources volume = createPersistentVolume( @@ -1697,6 +1703,9 @@ TEST_F(PersistentVolumeEndpointsTest, EndpointCreateThenOfferRemove) None(), frameworkInfo.principal()); + Future<AcknowledgeOperationStatusMessage> acknowledgeOperationStatus2 = + FUTURE_PROTOBUF(AcknowledgeOperationStatusMessage(), _, _); + response = process::http::post( master.get()->pid, "create-volumes", @@ -1704,6 +1713,7 @@ TEST_F(PersistentVolumeEndpointsTest, EndpointCreateThenOfferRemove) createRequestBody(slaveId, "volumes", volume)); AWAIT_EXPECT_RESPONSE_STATUS_EQ(Accepted().status, response); + AWAIT_READY(acknowledgeOperationStatus2); MockScheduler sched; MesosSchedulerDriver driver( @@ -1727,13 +1737,18 @@ TEST_F(PersistentVolumeEndpointsTest, EndpointCreateThenOfferRemove) EXPECT_TRUE(Resources(offer.resources()).contains( allocatedResources(volume, frameworkInfo.roles(0)))); - Future<UpdateOperationStatusMessage> updateOperationStatusMessage = - FUTURE_PROTOBUF(UpdateOperationStatusMessage(), _, _); + Future<AcknowledgeOperationStatusMessage> acknowledgeOperationStatus3 = + FUTURE_PROTOBUF(AcknowledgeOperationStatusMessage(), _, _); driver.acceptOffers({offer.id()}, {DESTROY(volume)}); // Make sure the master processes the accept call before summoning an offer. - AWAIT_READY(updateOperationStatusMessage); + AWAIT_READY(acknowledgeOperationStatus3); + + // We expect 3 finished operations, `Reserve`, `CreateVolume` and finally + // `DestroyVolume`. The first two are not checked individually because it + // would require additional heavy machinery to do it in a non-flaky way. + EXPECT_TRUE(metricEquals("master/operations/finished", 3)); // Summon an offer. EXPECT_CALL(sched, resourceOffers(&driver, _)) @@ -1753,14 +1768,15 @@ TEST_F(PersistentVolumeEndpointsTest, EndpointCreateThenOfferRemove) << Resources(offer.resources()) << " vs " << allocatedResources(dynamicallyReserved, frameworkInfo.roles(0)); - updateOperationStatusMessage = - FUTURE_PROTOBUF(UpdateOperationStatusMessage(), _, _); + Future<AcknowledgeOperationStatusMessage> acknowledgeOperationStatus4 = + FUTURE_PROTOBUF(AcknowledgeOperationStatusMessage(), _, _); // Unreserve the resources. driver.acceptOffers({offer.id()}, {UNRESERVE(dynamicallyReserved)}); // Make sure the master processes the accept call before summoning an offer. - AWAIT_READY(updateOperationStatusMessage); + AWAIT_READY(acknowledgeOperationStatus4); + EXPECT_TRUE(metricEquals("master/operations/finished", 4)); // Summon an offer. EXPECT_CALL(sched, resourceOffers(&driver, _)) @@ -1829,6 +1845,9 @@ TEST_F(PersistentVolumeEndpointsTest, ReserveAndSlaveRemoval) slave1Unreserved.pushReservation(createDynamicReservationInfo( frameworkInfo.roles(0), DEFAULT_CREDENTIAL.principal())); + Future<AcknowledgeOperationStatusMessage> acknowledgeOperationStatus1 = + FUTURE_PROTOBUF(AcknowledgeOperationStatusMessage(), _, _); + Future<Response> response = process::http::post( master.get()->pid, "reserve", @@ -1836,6 +1855,7 @@ TEST_F(PersistentVolumeEndpointsTest, ReserveAndSlaveRemoval) createRequestBody(slaveId1, "resources", slave1Reserved)); AWAIT_EXPECT_RESPONSE_STATUS_EQ(Accepted().status, response); + AWAIT_READY(acknowledgeOperationStatus1); MockScheduler sched; MesosSchedulerDriver driver( @@ -1861,6 +1881,9 @@ TEST_F(PersistentVolumeEndpointsTest, ReserveAndSlaveRemoval) Future<UpdateOperationStatusMessage> updateOperationStatusMessage = FUTURE_PROTOBUF(UpdateOperationStatusMessage(), _, _); + Future<AcknowledgeOperationStatusMessage> acknowledgeOperationStatus2 = + FUTURE_PROTOBUF(AcknowledgeOperationStatusMessage(), _, _); + // Use the offers API to reserve all CPUs on `slave2`. Resources slave2Unreserved = Resources::parse("cpus:3").get(); Resources slave2Reserved = @@ -1889,6 +1912,10 @@ TEST_F(PersistentVolumeEndpointsTest, ReserveAndSlaveRemoval) EXPECT_EQ(sentResources, slave2Reserved); AWAIT_READY(updateOperationStatusMessage); + AWAIT_READY(acknowledgeOperationStatus2); + + // Expect both `Reserve` operations to be finished. + EXPECT_TRUE(metricEquals("master/operations/finished", 2)); // Shutdown `slave2` with an explicit shutdown message. EXPECT_CALL(sched, offerRescinded(_, _)); diff --git a/src/tests/reservation_endpoints_tests.cpp b/src/tests/reservation_endpoints_tests.cpp index b189759..c8e5ece 100644 --- a/src/tests/reservation_endpoints_tests.cpp +++ b/src/tests/reservation_endpoints_tests.cpp @@ -1559,6 +1559,7 @@ TEST_F(ReservationEndpointsTest, AgentStateEndpointResources) // the agent to receive and process the `ApplyOperationMessage` and respond // with an initial operation status update. AWAIT_READY(updateOperationStatusMessage); + EXPECT_TRUE(metricEquals("master/operations/finished", 1)); // Make sure CheckpointResourcesMessage handling is completed // before proceeding. diff --git a/src/tests/scheduler_tests.cpp b/src/tests/scheduler_tests.cpp index 5fb6960..e0ed029 100644 --- a/src/tests/scheduler_tests.cpp +++ b/src/tests/scheduler_tests.cpp @@ -1246,6 +1246,8 @@ TEST_P(SchedulerTest, OperationFeedbackValidationNoResourceProviderCapability) EXPECT_EQ( mesos::v1::OPERATION_ERROR, updateOperationStatus->status().state()); + + EXPECT_TRUE(metricEquals("master/operations/error", 1)); } diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp index 5ee5609..528a25a 100644 --- a/src/tests/slave_tests.cpp +++ b/src/tests/slave_tests.cpp @@ -6549,6 +6549,7 @@ TEST_F(SlaveTest, UpdateOperationStatusRetry) AWAIT_READY(updateOperationStatusMessage); ASSERT_EQ(updateOperationStatusMessage->operation_uuid(), operationUuid); + EXPECT_TRUE(metricEquals("master/operations/finished", 1)); Clock::resume(); } @@ -10935,6 +10936,9 @@ TEST_F(SlaveTest, RemoveResourceProvider) EXPECT_TRUE(updateOperationStatus->status().has_agent_id()); EXPECT_FALSE(updateOperationStatus->status().has_resource_provider_id()); + // The associated metrics should have also been increased. + EXPECT_TRUE(metricEquals("master/operations/gone_by_operator", 1)); + // The agent should also report a change to its resources. AWAIT_READY(updateSlaveMessage); @@ -11521,6 +11525,7 @@ TEST_F(SlaveTest, RetryOperationStatusUpdateAfterRecovery) EXPECT_EQ(operationId, update->status().operation_id()); EXPECT_EQ(v1::OPERATION_FINISHED, update->status().state()); + EXPECT_TRUE(metricEquals("master/operations/finished", 1)); // Restart the agent. slave.get()->terminate(); diff --git a/src/tests/storage_local_resource_provider_tests.cpp b/src/tests/storage_local_resource_provider_tests.cpp index 55c9389..2ec1628 100644 --- a/src/tests/storage_local_resource_provider_tests.cpp +++ b/src/tests/storage_local_resource_provider_tests.cpp @@ -3608,6 +3608,7 @@ TEST_F(StorageLocalResourceProviderTest, OperationUpdate) ASSERT_EQ( mesos::v1::OperationState::OPERATION_FINISHED, update->status().state()); ASSERT_TRUE(update->status().has_uuid()); + EXPECT_TRUE(metricEquals("master/operations/finished", 1)); ASSERT_TRUE(update->status().has_agent_id()); EXPECT_EQ(agentId, update->status().agent_id()); @@ -3829,20 +3830,17 @@ TEST_F(StorageLocalResourceProviderTest, OperationStateMetrics) AWAIT_READY(operationDroppedStatus); EXPECT_EQ(OPERATION_DROPPED, operationDroppedStatus->status().state()); - snapshot = Metrics(); - - ASSERT_NE(0u, snapshot.values.count(metricName( - "operations/create_disk/finished"))); - EXPECT_EQ(1, snapshot.values.at(metricName( - "operations/create_disk/finished"))); - ASSERT_NE(0u, snapshot.values.count(metricName( - "operations/destroy_disk/failed"))); - EXPECT_EQ(1, snapshot.values.at(metricName( - "operations/destroy_disk/failed"))); - ASSERT_NE(0u, snapshot.values.count(metricName( - "operations/destroy_disk/dropped"))); - EXPECT_EQ(1, snapshot.values.at(metricName( - "operations/destroy_disk/dropped"))); + EXPECT_TRUE(metricEquals("master/operations/dropped", 1)); + EXPECT_TRUE(metricEquals("master/operations/create_disk/finished", 1)); + EXPECT_TRUE(metricEquals("master/operations/destroy_disk/failed", 1)); + EXPECT_TRUE(metricEquals("master/operations/destroy_disk/dropped", 1)); + + EXPECT_TRUE(metricEquals(metricName( + "operations/create_disk/finished"), 1)); + EXPECT_TRUE(metricEquals(metricName( + "operations/destroy_disk/failed"), 1)); + EXPECT_TRUE(metricEquals(metricName( + "operations/destroy_disk/dropped"), 1)); } @@ -4483,6 +4481,9 @@ TEST_F(StorageLocalResourceProviderTest, RetryOperationStatusUpdateToScheduler) AWAIT_READY(acknowledgeOperationStatusMessage); + // Verify that the retry was only counted as one operation. + EXPECT_TRUE(metricEquals("master/operations/finished", 1)); + // Now that the SLRP has received the acknowledgement, the SLRP shouldn't // send further operation status updates. EXPECT_NO_FUTURE_PROTOBUFS(UpdateOperationStatusMessage(), _, _); @@ -4638,6 +4639,7 @@ TEST_F( ASSERT_EQ(operationId, update->status().operation_id()); ASSERT_EQ(v1::OperationState::OPERATION_FINISHED, update->status().state()); ASSERT_TRUE(update->status().has_uuid()); + EXPECT_TRUE(metricEquals("master/operations/finished", 1)); v1::scheduler::Call::ReconcileOperations::Operation operation; operation.mutable_operation_id()->CopyFrom(operationId); @@ -5000,24 +5002,24 @@ TEST_F(StorageLocalResourceProviderTest, RetryRpcWithExponentialBackoff) // Verify that the RPC metrics count the successes and errors correctly. // // TODO(chhsiao): verify the retry metrics instead once they are in place. - JSON::Object snapshot = Metrics(); + EXPECT_TRUE(metricEquals("master/operations/failed", 1)); + EXPECT_TRUE(metricEquals("master/operations/finished", 1)); - ASSERT_NE(0u, snapshot.values.count(metricName( - "csi_plugin/rpcs/csi.v0.Controller.CreateVolume/successes"))); - EXPECT_EQ(1, snapshot.values.at(metricName( - "csi_plugin/rpcs/csi.v0.Controller.CreateVolume/successes"))); - ASSERT_NE(0u, snapshot.values.count(metricName( - "csi_plugin/rpcs/csi.v0.Controller.CreateVolume/errors"))); - EXPECT_EQ(numRetryableErrors, snapshot.values.at(metricName( - "csi_plugin/rpcs/csi.v0.Controller.CreateVolume/errors"))); - ASSERT_NE(0u, snapshot.values.count(metricName( - "csi_plugin/rpcs/csi.v0.Controller.DeleteVolume/successes"))); - EXPECT_EQ(0, snapshot.values.at(metricName( - "csi_plugin/rpcs/csi.v0.Controller.DeleteVolume/successes"))); - ASSERT_NE(0u, snapshot.values.count(metricName( - "csi_plugin/rpcs/csi.v0.Controller.DeleteVolume/errors"))); - EXPECT_EQ(numRetryableErrors + 1, snapshot.values.at(metricName( - "csi_plugin/rpcs/csi.v0.Controller.DeleteVolume/errors"))); + EXPECT_TRUE(metricEquals(metricName( + "csi_plugin/rpcs/csi.v0.Controller.CreateVolume/successes"), + 1)); + + EXPECT_TRUE(metricEquals(metricName( + "csi_plugin/rpcs/csi.v0.Controller.CreateVolume/errors"), + numRetryableErrors)); + + EXPECT_TRUE(metricEquals(metricName( + "csi_plugin/rpcs/csi.v0.Controller.DeleteVolume/successes"), + 0)); + + EXPECT_TRUE(metricEquals(metricName( + "csi_plugin/rpcs/csi.v0.Controller.DeleteVolume/errors"), + numRetryableErrors + 1)); }
