This is an automated email from the ASF dual-hosted git repository.
ccondit pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/yunikorn-core.git
The following commit(s) were added to refs/heads/master by this push:
new 7c5b3259 [YUNIKORN-2805] Remove invalid health checks regarding negative capacity (#947)
7c5b3259 is described below
commit 7c5b32590f8a5b5c9a89d2d2de1767ba2c39fbdb
Author: Craig Condit <[email protected]>
AuthorDate: Thu Aug 15 14:17:22 2024 -0500
[YUNIKORN-2805] Remove invalid health checks regarding negative capacity (#947)
Now that we properly handle cases where node resources can shrink while
existing allocations remain, remove the assumption from the health
checker that allocated resources are always <= capacity.
Closes: #947
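The scenario behind this change, as a minimal Go sketch (toy types only, not YuniKorn's resources package; all names below are illustrative): once a node's reported capacity shrinks below what is already allocated to it, "allocated <= capacity" is no longer an invariant, so treating a violation as a health failure produced false positives.

package main

import "fmt"

// Resource is a toy stand-in for a per-type resource quantity (illustrative only).
type Resource map[string]int64

// fitsIn reports whether every allocated quantity is <= the node capacity,
// i.e. the invariant the removed health check assumed.
func fitsIn(capacity, allocated Resource) bool {
	for name, qty := range allocated {
		if qty > capacity[name] {
			return false
		}
	}
	return true
}

func main() {
	capacity := Resource{"vcore": 8000, "memory": 16384}
	allocated := Resource{"vcore": 6000, "memory": 12288}
	fmt.Println(fitsIn(capacity, allocated)) // true: the allocations fit when they were placed

	// The node later reports a smaller capacity while its allocations stay bound to it.
	capacity = Resource{"vcore": 4000, "memory": 8192}
	fmt.Println(fitsIn(capacity, allocated)) // false: allocated > capacity, yet the state is valid
}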
---
pkg/scheduler/health_checker.go | 57 +++++++++++++-----------------------
pkg/scheduler/health_checker_test.go | 8 ++---
2 files changed, 24 insertions(+), 41 deletions(-)
diff --git a/pkg/scheduler/health_checker.go b/pkg/scheduler/health_checker.go
index 84c7947a..177d9b8d 100644
--- a/pkg/scheduler/health_checker.go
+++ b/pkg/scheduler/health_checker.go
@@ -233,21 +233,16 @@ func checkFailedNodes(metrics *metrics.SchedulerMetrics) dao.HealthCheckInfo {
}
func checkSchedulingContext(schedulerContext *ClusterContext) []dao.HealthCheckInfo {
- // 1. check resources
- // 1.1 check for negative resources
+ // check for negative resources
var partitionsWithNegResources []string
var nodesWithNegResources []string
- // 1.3 node allocated resource <= total resource of the node
- var allocationMismatch []string
- // 1.2 total partition resource = sum of node resources
+ // total partition resource = sum of node resources
var totalResourceMismatch []string
- // 1.4 node total resource = allocated resource + occupied resource + available resource
+ // node total resource = allocated resource + occupied resource + available resource
var nodeTotalMismatch []string
- // 1.5 node capacity >= allocated resources on the node
- var nodeCapacityMismatch []string
- // 2. check reservation/node ration
+ // check reservation/node ration
var partitionReservationRatio []float32
- // 3. check for orphan allocations
+ // check for orphan allocations
orphanAllocationsOnNode := make([]*objects.Allocation, 0)
orphanAllocationsOnApp := make([]*objects.Allocation, 0)
@@ -282,9 +277,6 @@ func checkSchedulingContext(schedulerContext *ClusterContext) []dao.HealthCheckI
if node.GetOccupiedResource().HasNegativeValue() {
nodesWithNegResources = append(nodesWithNegResources, node.NodeID)
}
- if !resources.StrictlyGreaterThanOrEquals(node.GetCapacity(), node.GetAllocatedResource()) {
- nodeCapacityMismatch = append(nodeCapacityMismatch, node.NodeID)
- }
orphanAllocationsOnNode = append(orphanAllocationsOnNode, checkNodeAllocations(node, part)...)
}
// check if there are allocations assigned to an app but there are missing from the nodes
@@ -292,42 +284,33 @@ func checkSchedulingContext(schedulerContext *ClusterContext) []dao.HealthCheckI
orphanAllocationsOnApp = append(orphanAllocationsOnApp, checkAppAllocations(app, part.nodes)...)
}
partitionReservationRatio = append(partitionReservationRatio, float32(sumReservation)/(float32(part.GetTotalNodeCount())))
- if !resources.Equals(sumNodeAllocatedResources, part.GetAllocatedResource()) {
- allocationMismatch = append(allocationMismatch, part.Name)
- }
if !resources.EqualsOrEmpty(sumNodeResources, part.GetTotalPartitionResource()) {
totalResourceMismatch = append(totalResourceMismatch, part.Name)
}
}
- var info = make([]dao.HealthCheckInfo, 9)
- info[0] = CreateCheckInfo(len(partitionsWithNegResources) == 0, "Negative resources",
+ var info = make([]dao.HealthCheckInfo, 0)
+ info = append(info, CreateCheckInfo(len(partitionsWithNegResources) == 0, "Negative resources",
"Check for negative resources in the partitions",
- fmt.Sprintf("Partitions with negative resources: %q", partitionsWithNegResources))
- info[1] = CreateCheckInfo(len(nodesWithNegResources) == 0, "Negative resources",
+ fmt.Sprintf("Partitions with negative resources: %q", partitionsWithNegResources)))
+ info = append(info, CreateCheckInfo(len(nodesWithNegResources) == 0, "Negative resources",
"Check for negative resources in the nodes",
- fmt.Sprintf("Nodes with negative resources: %q", nodesWithNegResources))
- info[2] = CreateCheckInfo(len(allocationMismatch) == 0, "Consistency of data",
- "Check if a partition's allocated resource <= total resource of the partition",
- fmt.Sprintf("Partitions with inconsistent data: %q", allocationMismatch))
- info[3] = CreateCheckInfo(len(totalResourceMismatch) == 0, "Consistency of data",
+ fmt.Sprintf("Nodes with negative resources: %q", nodesWithNegResources)))
+ info = append(info, CreateCheckInfo(len(totalResourceMismatch) == 0, "Consistency of data",
"Check if total partition resource == sum of the node resources from the partition",
- fmt.Sprintf("Partitions with inconsistent data: %q", totalResourceMismatch))
- info[4] = CreateCheckInfo(len(nodeTotalMismatch) == 0, "Consistency of data",
+ fmt.Sprintf("Partitions with inconsistent data: %q", totalResourceMismatch)))
+ info = append(info, CreateCheckInfo(len(nodeTotalMismatch) == 0, "Consistency of data",
"Check if node total resource = allocated resource + occupied resource + available resource",
- fmt.Sprintf("Nodes with inconsistent data: %q", nodeTotalMismatch))
- info[5] = CreateCheckInfo(len(nodeCapacityMismatch) == 0, "Consistency of data",
- "Check if node capacity >= allocated resources on the node",
- fmt.Sprintf("Nodes with inconsistent data: %q", nodeCapacityMismatch))
+ fmt.Sprintf("Nodes with inconsistent data: %q", nodeTotalMismatch)))
// mark it as succeeded for a while until we will know what is not considered a normal value anymore
- info[6] = CreateCheckInfo(true, "Reservation check",
+ info = append(info, CreateCheckInfo(true, "Reservation check",
"Check the reservation nr compared to the number of nodes",
- fmt.Sprintf("Reservation/node nr ratio: %f",
partitionReservationRatio))
- info[7] = CreateCheckInfo(len(orphanAllocationsOnNode) == 0, "Orphan
allocation on node check",
+ fmt.Sprintf("Reservation/node nr ratio: %f",
partitionReservationRatio)))
+ info = append(info, CreateCheckInfo(len(orphanAllocationsOnNode) == 0,
"Orphan allocation on node check",
"Check if there are orphan allocations on the nodes",
- fmt.Sprintf("Orphan allocations: %v", orphanAllocationsOnNode))
- info[8] = CreateCheckInfo(len(orphanAllocationsOnApp) == 0, "Orphan allocation on app check",
+ fmt.Sprintf("Orphan allocations: %v", orphanAllocationsOnNode)))
+ info = append(info, CreateCheckInfo(len(orphanAllocationsOnApp) == 0, "Orphan allocation on app check",
"Check if there are orphan allocations on the applications",
- fmt.Sprintf("OrphanAllocations: %v", orphanAllocationsOnApp))
+ fmt.Sprintf("OrphanAllocations: %v", orphanAllocationsOnApp)))
return info
}
diff --git a/pkg/scheduler/health_checker_test.go b/pkg/scheduler/health_checker_test.go
index 6d3e50ad..407f3f22 100644
--- a/pkg/scheduler/health_checker_test.go
+++ b/pkg/scheduler/health_checker_test.go
@@ -209,7 +209,7 @@ func TestGetSchedulerHealthStatusContext(t *testing.T) {
node.AddAllocation(alloc)
healthInfo = GetSchedulerHealthStatus(schedulerMetrics, schedulerContext)
assert.Assert(t, !healthInfo.Healthy, "Scheduler should not be healthy")
- assert.Assert(t, !healthInfo.HealthChecks[9].Succeeded, "The orphan allocation check on the node should not be successful")
+ assert.Assert(t, !healthInfo.HealthChecks[7].Succeeded, "The orphan allocation check on the node should not be successful")
// add the allocation to the app as well
part := schedulerContext.partitions[partName]
@@ -218,13 +218,13 @@ func TestGetSchedulerHealthStatusContext(t *testing.T) {
err = part.AddApplication(app)
assert.NilError(t, err, "Could not add application")
healthInfo = GetSchedulerHealthStatus(schedulerMetrics, schedulerContext)
- assert.Assert(t, healthInfo.HealthChecks[9].Succeeded, "The orphan allocation check on the node should be successful")
+ assert.Assert(t, healthInfo.HealthChecks[7].Succeeded, "The orphan allocation check on the node should be successful")
// remove the allocation from the node, so we will have an orphan allocation assigned to the app
node.RemoveAllocation("key")
healthInfo = GetSchedulerHealthStatus(schedulerMetrics, schedulerContext)
- assert.Assert(t, healthInfo.HealthChecks[9].Succeeded, "The orphan allocation check on the node should be successful")
- assert.Assert(t, !healthInfo.HealthChecks[10].Succeeded, "The orphan allocation check on the app should not be successful")
+ assert.Assert(t, healthInfo.HealthChecks[7].Succeeded, "The orphan allocation check on the node should be successful")
+ assert.Assert(t, !healthInfo.HealthChecks[8].Succeeded, "The orphan allocation check on the app should not be successful")
}
func TestGetSchedulerHealthStatusMetrics(t *testing.T) {
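The test updates above follow from the removal: with two checks dropped, the orphan-allocation entries shift up two positions in the report (HealthChecks[9]/[10] become [7]/[8]). The diff also replaces positional writes into a fixed-size slice with append, which keeps the reported list in step with the checks actually performed; a minimal sketch of that pattern using a toy type (not the project's dao package):

package main

import "fmt"

// CheckInfo is a toy stand-in for dao.HealthCheckInfo (illustrative only).
type CheckInfo struct {
	Name      string
	Succeeded bool
}

func main() {
	// Before: a fixed-size slice with positional writes; removing a check
	// leaves a zero-valued hole unless every later index is renumbered.
	fixed := make([]CheckInfo, 2)
	fixed[0] = CheckInfo{Name: "Negative resources", Succeeded: true}
	fixed[1] = CheckInfo{Name: "Consistency of data", Succeeded: true}

	// After: append-based construction; the slice length always matches the
	// number of checks actually performed, so dropping one cannot leave gaps.
	checks := make([]CheckInfo, 0)
	checks = append(checks, CheckInfo{Name: "Negative resources", Succeeded: true})
	checks = append(checks, CheckInfo{Name: "Consistency of data", Succeeded: true})

	fmt.Println(len(fixed), len(checks)) // 2 2
}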
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]