This is an automated email from the ASF dual-hosted git repository.

ccondit pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/yunikorn-core.git


The following commit(s) were added to refs/heads/master by this push:
     new 7c5b3259 [YUNIKORN-2805] Remove invalid health checks regarding 
negative capacity (#947)
7c5b3259 is described below

commit 7c5b32590f8a5b5c9a89d2d2de1767ba2c39fbdb
Author: Craig Condit <[email protected]>
AuthorDate: Thu Aug 15 14:17:22 2024 -0500

    [YUNIKORN-2805] Remove invalid health checks regarding negative capacity 
(#947)
    
    Now that we properly handle cases where node resources can shrink while
    existing allocations remain, remove the assumption from the health
    checker that allocated resources are always <= capacity.
    
    Closes: #947
---
 pkg/scheduler/health_checker.go      | 57 +++++++++++++-----------------------
 pkg/scheduler/health_checker_test.go |  8 ++---
 2 files changed, 24 insertions(+), 41 deletions(-)

diff --git a/pkg/scheduler/health_checker.go b/pkg/scheduler/health_checker.go
index 84c7947a..177d9b8d 100644
--- a/pkg/scheduler/health_checker.go
+++ b/pkg/scheduler/health_checker.go
@@ -233,21 +233,16 @@ func checkFailedNodes(metrics *metrics.SchedulerMetrics) 
dao.HealthCheckInfo {
 }
 
 func checkSchedulingContext(schedulerContext *ClusterContext) 
[]dao.HealthCheckInfo {
-       // 1. check resources
-       // 1.1 check for negative resources
+       // check for negative resources
        var partitionsWithNegResources []string
        var nodesWithNegResources []string
-       // 1.3 node allocated resource <= total resource of the node
-       var allocationMismatch []string
-       // 1.2 total partition resource = sum of node resources
+       // total partition resource = sum of node resources
        var totalResourceMismatch []string
-       // 1.4 node total resource = allocated resource + occupied resource + 
available resource
+       // node total resource = allocated resource + occupied resource + 
available resource
        var nodeTotalMismatch []string
-       // 1.5 node capacity >= allocated resources on the node
-       var nodeCapacityMismatch []string
-       // 2. check reservation/node ration
+       // check reservation/node ratio
        var partitionReservationRatio []float32
-       // 3. check for orphan allocations
+       // check for orphan allocations
        orphanAllocationsOnNode := make([]*objects.Allocation, 0)
        orphanAllocationsOnApp := make([]*objects.Allocation, 0)
 
@@ -282,9 +277,6 @@ func checkSchedulingContext(schedulerContext 
*ClusterContext) []dao.HealthCheckI
                        if node.GetOccupiedResource().HasNegativeValue() {
                                nodesWithNegResources = 
append(nodesWithNegResources, node.NodeID)
                        }
-                       if 
!resources.StrictlyGreaterThanOrEquals(node.GetCapacity(), 
node.GetAllocatedResource()) {
-                               nodeCapacityMismatch = 
append(nodeCapacityMismatch, node.NodeID)
-                       }
                        orphanAllocationsOnNode = 
append(orphanAllocationsOnNode, checkNodeAllocations(node, part)...)
                }
                // check if there are allocations assigned to an app but they
are missing from the nodes
@@ -292,42 +284,33 @@ func checkSchedulingContext(schedulerContext 
*ClusterContext) []dao.HealthCheckI
                        orphanAllocationsOnApp = append(orphanAllocationsOnApp, 
checkAppAllocations(app, part.nodes)...)
                }
                partitionReservationRatio = append(partitionReservationRatio, 
float32(sumReservation)/(float32(part.GetTotalNodeCount())))
-               if !resources.Equals(sumNodeAllocatedResources, 
part.GetAllocatedResource()) {
-                       allocationMismatch = append(allocationMismatch, 
part.Name)
-               }
                if !resources.EqualsOrEmpty(sumNodeResources, 
part.GetTotalPartitionResource()) {
                        totalResourceMismatch = append(totalResourceMismatch, 
part.Name)
                }
        }
-       var info = make([]dao.HealthCheckInfo, 9)
-       info[0] = CreateCheckInfo(len(partitionsWithNegResources) == 0, 
"Negative resources",
+       var info = make([]dao.HealthCheckInfo, 0)
+       info = append(info, CreateCheckInfo(len(partitionsWithNegResources) == 
0, "Negative resources",
                "Check for negative resources in the partitions",
-               fmt.Sprintf("Partitions with negative resources: %q", 
partitionsWithNegResources))
-       info[1] = CreateCheckInfo(len(nodesWithNegResources) == 0, "Negative 
resources",
+               fmt.Sprintf("Partitions with negative resources: %q", 
partitionsWithNegResources)))
+       info = append(info, CreateCheckInfo(len(nodesWithNegResources) == 0, 
"Negative resources",
                "Check for negative resources in the nodes",
-               fmt.Sprintf("Nodes with negative resources: %q", 
nodesWithNegResources))
-       info[2] = CreateCheckInfo(len(allocationMismatch) == 0, "Consistency of 
data",
-               "Check if a partition's allocated resource <= total resource of 
the partition",
-               fmt.Sprintf("Partitions with inconsistent data: %q", 
allocationMismatch))
-       info[3] = CreateCheckInfo(len(totalResourceMismatch) == 0, "Consistency 
of data",
+               fmt.Sprintf("Nodes with negative resources: %q", 
nodesWithNegResources)))
+       info = append(info, CreateCheckInfo(len(totalResourceMismatch) == 0, 
"Consistency of data",
                "Check if total partition resource == sum of the node resources 
from the partition",
-               fmt.Sprintf("Partitions with inconsistent data: %q", 
totalResourceMismatch))
-       info[4] = CreateCheckInfo(len(nodeTotalMismatch) == 0, "Consistency of 
data",
+               fmt.Sprintf("Partitions with inconsistent data: %q", 
totalResourceMismatch)))
+       info = append(info, CreateCheckInfo(len(nodeTotalMismatch) == 0, 
"Consistency of data",
                "Check if node total resource = allocated resource + occupied 
resource + available resource",
-               fmt.Sprintf("Nodes with inconsistent data: %q", 
nodeTotalMismatch))
-       info[5] = CreateCheckInfo(len(nodeCapacityMismatch) == 0, "Consistency 
of data",
-               "Check if node capacity >= allocated resources on the node",
-               fmt.Sprintf("Nodes with inconsistent data: %q", 
nodeCapacityMismatch))
+               fmt.Sprintf("Nodes with inconsistent data: %q", 
nodeTotalMismatch)))
        // mark it as succeeded for a while until we will know what is not 
considered a normal value anymore
-       info[6] = CreateCheckInfo(true, "Reservation check",
+       info = append(info, CreateCheckInfo(true, "Reservation check",
                "Check the reservation nr compared to the number of nodes",
-               fmt.Sprintf("Reservation/node nr ratio: %f", 
partitionReservationRatio))
-       info[7] = CreateCheckInfo(len(orphanAllocationsOnNode) == 0, "Orphan 
allocation on node check",
+               fmt.Sprintf("Reservation/node nr ratio: %f", 
partitionReservationRatio)))
+       info = append(info, CreateCheckInfo(len(orphanAllocationsOnNode) == 0, 
"Orphan allocation on node check",
                "Check if there are orphan allocations on the nodes",
-               fmt.Sprintf("Orphan allocations: %v", orphanAllocationsOnNode))
-       info[8] = CreateCheckInfo(len(orphanAllocationsOnApp) == 0, "Orphan 
allocation on app check",
+               fmt.Sprintf("Orphan allocations: %v", orphanAllocationsOnNode)))
+       info = append(info, CreateCheckInfo(len(orphanAllocationsOnApp) == 0, 
"Orphan allocation on app check",
                "Check if there are orphan allocations on the applications",
-               fmt.Sprintf("OrphanAllocations: %v", orphanAllocationsOnApp))
+               fmt.Sprintf("OrphanAllocations: %v", orphanAllocationsOnApp)))
        return info
 }
 
diff --git a/pkg/scheduler/health_checker_test.go 
b/pkg/scheduler/health_checker_test.go
index 6d3e50ad..407f3f22 100644
--- a/pkg/scheduler/health_checker_test.go
+++ b/pkg/scheduler/health_checker_test.go
@@ -209,7 +209,7 @@ func TestGetSchedulerHealthStatusContext(t *testing.T) {
        node.AddAllocation(alloc)
        healthInfo = GetSchedulerHealthStatus(schedulerMetrics, 
schedulerContext)
        assert.Assert(t, !healthInfo.Healthy, "Scheduler should not be healthy")
-       assert.Assert(t, !healthInfo.HealthChecks[9].Succeeded, "The orphan 
allocation check on the node should not be successful")
+       assert.Assert(t, !healthInfo.HealthChecks[7].Succeeded, "The orphan 
allocation check on the node should not be successful")
 
        // add the allocation to the app as well
        part := schedulerContext.partitions[partName]
@@ -218,13 +218,13 @@ func TestGetSchedulerHealthStatusContext(t *testing.T) {
        err = part.AddApplication(app)
        assert.NilError(t, err, "Could not add application")
        healthInfo = GetSchedulerHealthStatus(schedulerMetrics, 
schedulerContext)
-       assert.Assert(t, healthInfo.HealthChecks[9].Succeeded, "The orphan 
allocation check on the node should be successful")
+       assert.Assert(t, healthInfo.HealthChecks[7].Succeeded, "The orphan 
allocation check on the node should be successful")
 
        // remove the allocation from the node, so we will have an orphan 
allocation assigned to the app
        node.RemoveAllocation("key")
        healthInfo = GetSchedulerHealthStatus(schedulerMetrics, 
schedulerContext)
-       assert.Assert(t, healthInfo.HealthChecks[9].Succeeded, "The orphan 
allocation check on the node should be successful")
-       assert.Assert(t, !healthInfo.HealthChecks[10].Succeeded, "The orphan 
allocation check on the app should not be successful")
+       assert.Assert(t, healthInfo.HealthChecks[7].Succeeded, "The orphan 
allocation check on the node should be successful")
+       assert.Assert(t, !healthInfo.HealthChecks[8].Succeeded, "The orphan 
allocation check on the app should not be successful")
 }
 
 func TestGetSchedulerHealthStatusMetrics(t *testing.T) {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to