kingamarton commented on a change in pull request #330:
URL: 
https://github.com/apache/incubator-yunikorn-core/pull/330#discussion_r734364198



##########
File path: pkg/scheduler/health_checker.go
##########
@@ -151,5 +161,40 @@ func checkSchedulingContext(schedulerContext 
*ClusterContext) []dao.HealthCheckI
        infos[6] = CreateCheckInfo(true, "Reservation check",
                "Check the reservation nr compared to the number of nodes",
                fmt.Sprintf("Reservation/node nr ratio: %f", 
partitionReservationRatio))
+       infos[7] = CreateCheckInfo(len(orphanAllocationsOnNode) == 0, "Orphan 
allocation on node check",
+               "Check if there are orphan allocations on the nodes",
+               fmt.Sprintf("Orphan allocations: %v", orphanAllocationsOnNode))
+       infos[8] = CreateCheckInfo(len(orphanAllocationsOnApp) == 0, "Orphan 
allocation on app check",
+               "Check if there are orphan allocations on the applications",
+               fmt.Sprintf("OrphanAllocations: %v", orphanAllocationsOnApp))
        return infos
 }
+
+func checkAppAllocations(app *objects.Application, nodes 
objects.NodeCollection) []*objects.Allocation {
+       orphanAllocationsOnApp := make([]*objects.Allocation, 0)
+       for _, alloc := range app.GetAllAllocations() {
+               if node := nodes.GetNode(alloc.NodeID); node != nil {
+                       node.GetAllocation(alloc.UUID)

Review comment:
       Good catch. It is not needed.

##########
File path: pkg/scheduler/health_checker_test.go
##########
@@ -84,6 +84,28 @@ func TestGetSchedulerHealthStatusContext(t *testing.T) {
        assert.NilError(t, err, "Unexpected error while adding a new node")
        healthInfo = GetSchedulerHealthStatus(schedulerMetrics, 
schedulerContext)
        assert.Assert(t, !healthInfo.Healthy, "Scheduler should not be healthy")
+
+       // add orphan allocation to a node
+       node := schedulerContext.partitions[partName].nodes.GetNode("node")
+       alloc := objects.NewAllocation(allocID, "node", newAllocationAsk("key", 
"appID", resources.NewResource()))
+       node.AddAllocation(alloc)
+       healthInfo = GetSchedulerHealthStatus(schedulerMetrics, 
schedulerContext)
+       assert.Assert(t, !healthInfo.Healthy, "Scheduler should not be healthy")
+       assert.Assert(t, !healthInfo.HealthChecks[9].Succeeded, "The orphan 
allocation check on the node should not be successful")
+
+       // add the allocation to the app as well
+       part := schedulerContext.partitions[partName]
+       app := newApplication("appID", partName, "queue")
+       app.AddAllocation(alloc)
+       part.applications["appID"] = app
+       healthInfo = GetSchedulerHealthStatus(schedulerMetrics, 
schedulerContext)
+       assert.Assert(t, healthInfo.HealthChecks[9].Succeeded, "The orphan 
allocation check on the node should be successful")
+
+       // remove the allocation from the node, so we will have an orphan 
allocation assigned to the app
+       node.RemoveAllocation(allocID)
+       healthInfo = GetSchedulerHealthStatus(schedulerMetrics, 
schedulerContext)
+       assert.Assert(t, healthInfo.HealthChecks[9].Succeeded, "The orphan 
allocation check on the node should be successful")
+       assert.Assert(t, !healthInfo.HealthChecks[10].Succeeded, "The orphan 
allocation check on the app should not be successful")

Review comment:
       In normal cases, no, but we have seen this kind of behaviour in some 
escalations, when a node was unexpectedly removed while some placeholders 
were still being processed.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to