This is an automated email from the ASF dual-hosted git repository.

pbacsko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/yunikorn-core.git


The following commit(s) were added to refs/heads/master by this push:
     new 4ea22516 [YUNIKORN-2897] Health checker reports foreign allocation as 
orphan (#977)
4ea22516 is described below

commit 4ea225160acfe8197e9da0dd1042361065025d40
Author: Peter Bacsko <[email protected]>
AuthorDate: Thu Oct 3 17:21:25 2024 +0200

    [YUNIKORN-2897] Health checker reports foreign allocation as orphan (#977)
    
    Closes: #977
    
    Signed-off-by: Peter Bacsko <[email protected]>
---
 pkg/scheduler/health_checker.go                  |  2 +-
 pkg/scheduler/health_checker_test.go             |  9 +++++++++
 pkg/scheduler/objects/node.go                    | 13 -------------
 pkg/scheduler/objects/node_test.go               | 23 -----------------------
 pkg/scheduler/objects/required_node_preemptor.go |  2 +-
 pkg/scheduler/partition.go                       |  2 +-
 pkg/scheduler/partition_test.go                  | 24 ++++++++++++------------
 pkg/scheduler/tests/recovery_test.go             |  4 ++--
 8 files changed, 26 insertions(+), 53 deletions(-)

diff --git a/pkg/scheduler/health_checker.go b/pkg/scheduler/health_checker.go
index 177d9b8d..eaf1ea69 100644
--- a/pkg/scheduler/health_checker.go
+++ b/pkg/scheduler/health_checker.go
@@ -330,7 +330,7 @@ func checkAppAllocations(app *objects.Application, nodes 
objects.NodeCollection)
 
 func checkNodeAllocations(node *objects.Node, partitionContext 
*PartitionContext) []*objects.Allocation {
        orphanAllocationsOnNode := make([]*objects.Allocation, 0)
-       for _, alloc := range node.GetAllAllocations() {
+       for _, alloc := range node.GetYunikornAllocations() {
                if app := 
partitionContext.getApplication(alloc.GetApplicationID()); app != nil {
                        if !app.IsAllocationAssignedToApp(alloc) {
                                orphanAllocationsOnNode = 
append(orphanAllocationsOnNode, alloc)
diff --git a/pkg/scheduler/health_checker_test.go 
b/pkg/scheduler/health_checker_test.go
index 407f3f22..02402fc9 100644
--- a/pkg/scheduler/health_checker_test.go
+++ b/pkg/scheduler/health_checker_test.go
@@ -225,6 +225,15 @@ func TestGetSchedulerHealthStatusContext(t *testing.T) {
        healthInfo = GetSchedulerHealthStatus(schedulerMetrics, 
schedulerContext)
        assert.Assert(t, healthInfo.HealthChecks[7].Succeeded, "The orphan 
allocation check on the node should be successful")
        assert.Assert(t, !healthInfo.HealthChecks[8].Succeeded, "The orphan 
allocation check on the app should not be successful")
+       part.removeApplication("appID")
+       healthInfo = GetSchedulerHealthStatus(schedulerMetrics, 
schedulerContext)
+       assert.Assert(t, healthInfo.HealthChecks[8].Succeeded, "The orphan 
allocation check on the app still fails after removing the app")
+
+       // check that foreign allocation does not interfere with health check
+       falloc := newForeignAllocation("foreign-1", "node")
+       node.AddAllocation(falloc)
+       healthInfo = GetSchedulerHealthStatus(schedulerMetrics, 
schedulerContext)
+       assert.Assert(t, healthInfo.HealthChecks[7].Succeeded, "Foreign 
allocation was detected as orphan")
 }
 
 func TestGetSchedulerHealthStatusMetrics(t *testing.T) {
diff --git a/pkg/scheduler/objects/node.go b/pkg/scheduler/objects/node.go
index a4907111..db51fa74 100644
--- a/pkg/scheduler/objects/node.go
+++ b/pkg/scheduler/objects/node.go
@@ -230,19 +230,6 @@ func (sn *Node) GetAllocation(allocationKey string) 
*Allocation {
        return sn.allocations[allocationKey]
 }
 
-// Get a copy of the allocations on this node
-func (sn *Node) GetAllAllocations() []*Allocation {
-       sn.RLock()
-       defer sn.RUnlock()
-
-       arr := make([]*Allocation, 0)
-       for _, v := range sn.allocations {
-               arr = append(arr, v)
-       }
-
-       return arr
-}
-
 // GetYunikornAllocations returns a copy of Yunikorn allocations on this node
 func (sn *Node) GetYunikornAllocations() []*Allocation {
        sn.RLock()
diff --git a/pkg/scheduler/objects/node_test.go 
b/pkg/scheduler/objects/node_test.go
index 90a30e62..4b38e436 100644
--- a/pkg/scheduler/objects/node_test.go
+++ b/pkg/scheduler/objects/node_test.go
@@ -688,29 +688,6 @@ func TestGetAllocation(t *testing.T) {
        }
 }
 
-func TestGetAllAllocations(t *testing.T) {
-       node := newNode("node-123", map[string]resources.Quantity{"first": 100, 
"second": 200})
-       if !resources.IsZero(node.GetAllocatedResource()) {
-               t.Fatal("Failed to initialize resource")
-       }
-
-       // nothing allocated get an empty list
-       allocs := node.GetAllAllocations()
-       if allocs == nil || len(allocs) != 0 {
-               t.Fatalf("allocation length should be 0 on new node")
-       }
-       alloc1 := newAllocation(appID1, nodeID1, nil)
-       alloc2 := newAllocation(appID1, nodeID1, nil)
-
-       // allocate
-       node.AddAllocation(alloc1)
-       node.AddAllocation(alloc2)
-       assert.Equal(t, 2, len(node.GetAllAllocations()), "allocation length 
mismatch")
-       // This should not happen in real code just making sure the code does 
do what is expected
-       node.AddAllocation(alloc2)
-       assert.Equal(t, 2, len(node.GetAllAllocations()), "allocation length 
mismatch")
-}
-
 func TestSchedulingState(t *testing.T) {
        node := newNode("node-123", nil)
        if !node.IsSchedulable() {
diff --git a/pkg/scheduler/objects/required_node_preemptor.go 
b/pkg/scheduler/objects/required_node_preemptor.go
index 39ac81e5..507eb868 100644
--- a/pkg/scheduler/objects/required_node_preemptor.go
+++ b/pkg/scheduler/objects/required_node_preemptor.go
@@ -45,7 +45,7 @@ func NewRequiredNodePreemptor(node *Node, requiredAsk 
*Allocation) *PreemptionCo
 func (p *PreemptionContext) filterAllocations() {
        p.Lock()
        defer p.Unlock()
-       for _, allocation := range p.node.GetAllAllocations() {
+       for _, allocation := range p.node.GetYunikornAllocations() {
                // skip daemon set pods and higher priority allocation
                if allocation.GetRequiredNode() != "" || 
allocation.GetPriority() > p.requiredAsk.GetPriority() {
                        continue
diff --git a/pkg/scheduler/partition.go b/pkg/scheduler/partition.go
index 0db2dbda..b3d72ba0 100644
--- a/pkg/scheduler/partition.go
+++ b/pkg/scheduler/partition.go
@@ -672,7 +672,7 @@ func (pc *PartitionContext) removeNodeAllocations(node 
*objects.Node) ([]*object
        released := make([]*objects.Allocation, 0)
        confirmed := make([]*objects.Allocation, 0)
        // walk over all allocations still registered for this node
-       for _, alloc := range node.GetAllAllocations() {
+       for _, alloc := range node.GetYunikornAllocations() {
                allocationKey := alloc.GetAllocationKey()
                // since we are not locking the node and or application we 
could have had an update while processing
                // note that we do not return the allocation if the app or 
allocation is not found and assume that it
diff --git a/pkg/scheduler/partition_test.go b/pkg/scheduler/partition_test.go
index a086d326..8a16d34d 100644
--- a/pkg/scheduler/partition_test.go
+++ b/pkg/scheduler/partition_test.go
@@ -267,7 +267,7 @@ func TestRemoveNodeWithAllocations(t *testing.T) {
        assert.NilError(t, err)
        assert.Check(t, allocCreated)
        // get what was allocated
-       allocated := node.GetAllAllocations()
+       allocated := node.GetYunikornAllocations()
        assert.Equal(t, 1, len(allocated), "allocation not added correctly")
        assertLimits(t, getTestUserGroup(), appRes)
 
@@ -316,7 +316,7 @@ func TestRemoveNodeWithPlaceholders(t *testing.T) {
        assert.NilError(t, err)
        assert.Check(t, allocCreated)
        // get what was allocated
-       allocated := node1.GetAllAllocations()
+       allocated := node1.GetYunikornAllocations()
        assert.Equal(t, 1, len(allocated), "allocation not added correctly to 
node1 expected 1 got: %v", allocated)
        assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), 
appRes), "allocation not added correctly to node1")
        assertLimits(t, getTestUserGroup(), appRes)
@@ -432,7 +432,7 @@ func TestPlaceholderDataWithPlaceholderPreemption(t 
*testing.T) {
        assert.Check(t, allocCreated)
 
        // get what was allocated
-       allocated := node1.GetAllAllocations()
+       allocated := node1.GetYunikornAllocations()
        assert.Equal(t, 1, len(allocated), "allocation not added correctly to 
node1")
        assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), 
appRes), "allocation not added correctly to node1")
 
@@ -561,7 +561,7 @@ func TestPlaceholderDataWithNodeRemoval(t *testing.T) {
        assert.Check(t, allocCreated)
 
        // get what was allocated
-       allocated := node1.GetAllAllocations()
+       allocated := node1.GetYunikornAllocations()
        assert.Equal(t, 1, len(allocated), "allocation not added correctly to 
node1")
        assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), 
appRes), "allocation not added correctly to node1")
 
@@ -648,7 +648,7 @@ func TestPlaceholderDataWithRemoval(t *testing.T) {
        assert.Check(t, allocCreated)
 
        // get what was allocated
-       allocated := node1.GetAllAllocations()
+       allocated := node1.GetYunikornAllocations()
        assert.Equal(t, 1, len(allocated), "allocation not added correctly to 
node1")
        assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), 
appRes), "allocation not added correctly to node1")
 
@@ -733,7 +733,7 @@ func TestRemoveNodeWithReplacement(t *testing.T) {
        assert.Check(t, allocCreated)
 
        // get what was allocated
-       allocated := node1.GetAllAllocations()
+       allocated := node1.GetYunikornAllocations()
        assert.Equal(t, 1, len(allocated), "allocation not added correctly to 
node1")
        assertLimits(t, getTestUserGroup(), appRes)
        assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), 
appRes), "allocation not added correctly to node1")
@@ -753,7 +753,7 @@ func TestRemoveNodeWithReplacement(t *testing.T) {
        alloc := newAllocationAll(allocKey, appID1, nodeID2, taskGroup, appRes, 
1, false)
        alloc.SetRelease(ph)
        node2.AddAllocation(alloc)
-       allocated = node2.GetAllAllocations()
+       allocated = node2.GetYunikornAllocations()
        assert.Equal(t, 1, len(allocated), "allocation not added correctly to 
node2")
        assert.Assert(t, resources.Equals(node2.GetAllocatedResource(), 
appRes), "allocation not added correctly to node2 (resource count)")
        assertLimits(t, getTestUserGroup(), appRes)
@@ -768,7 +768,7 @@ func TestRemoveNodeWithReplacement(t *testing.T) {
        // remove the node with the placeholder
        released, confirmed := partition.removeNode(nodeID1)
        assert.Equal(t, 1, partition.GetTotalNodeCount(), "node list was not 
updated, node was not removed")
-       assert.Equal(t, 1, len(node2.GetAllAllocations()), "remaining node 
should have allocation")
+       assert.Equal(t, 1, len(node2.GetYunikornAllocations()), "remaining node 
should have allocation")
        assert.Equal(t, 1, len(released), "node removal did not release correct 
allocation")
        assert.Equal(t, 1, len(confirmed), "node removal did not confirm 
correct allocation")
        assert.Equal(t, ph.GetAllocationKey(), released[0].GetAllocationKey(), 
"allocationKey returned by release not the same as the placeholder")
@@ -807,7 +807,7 @@ func TestRemoveNodeWithReal(t *testing.T) {
        assert.NilError(t, err)
        assert.Check(t, allocCreated)
        // get what was allocated
-       allocated := node1.GetAllAllocations()
+       allocated := node1.GetYunikornAllocations()
        assert.Equal(t, 1, len(allocated), "allocation not added correctly to 
node1")
        assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), 
appRes), "allocation not added correctly to node1")
        assertLimits(t, getTestUserGroup(), appRes)
@@ -827,7 +827,7 @@ func TestRemoveNodeWithReal(t *testing.T) {
        alloc := newAllocationAll(allocKey, appID1, nodeID2, taskGroup, appRes, 
1, false)
        alloc.SetRelease(ph)
        node2.AddAllocation(alloc)
-       allocated = node2.GetAllAllocations()
+       allocated = node2.GetYunikornAllocations()
        assert.Equal(t, 1, len(allocated), "allocation not added correctly to 
node2")
        assert.Assert(t, resources.Equals(node2.GetAllocatedResource(), 
appRes), "allocation not added correctly to node2 (resource count)")
        assertLimits(t, getTestUserGroup(), appRes)
@@ -4729,7 +4729,7 @@ func TestForeignAllocation(t *testing.T) {
        assert.NilError(t, err)
        assert.Equal(t, 1, len(partition.foreignAllocs))
        assert.Equal(t, nodeID1, partition.foreignAllocs[foreignAlloc1])
-       assert.Equal(t, 1, len(node1.GetAllAllocations()))
+       assert.Equal(t, 0, len(node1.GetYunikornAllocations()))
        assert.Assert(t, node1.GetAllocation(foreignAlloc1) != nil)
 
        // remove allocation
@@ -4739,6 +4739,6 @@ func TestForeignAllocation(t *testing.T) {
        assert.Assert(t, released == nil)
        assert.Assert(t, confirmed == nil)
        assert.Equal(t, 0, len(partition.foreignAllocs))
-       assert.Equal(t, 0, len(node1.GetAllAllocations()))
+       assert.Equal(t, 0, len(node1.GetYunikornAllocations()))
        assert.Assert(t, node1.GetAllocation(foreignAlloc1) == nil)
 }
diff --git a/pkg/scheduler/tests/recovery_test.go 
b/pkg/scheduler/tests/recovery_test.go
index eeb9b8f4..a7102077 100644
--- a/pkg/scheduler/tests/recovery_test.go
+++ b/pkg/scheduler/tests/recovery_test.go
@@ -298,8 +298,8 @@ func TestSchedulerRecovery(t *testing.T) {
        // verify nodes
        assert.Equal(t, 2, part.GetTotalNodeCount(), "incorrect recovered node 
count")
 
-       assert.Equal(t, len(node1Allocations), 
len(part.GetNode("node-1:1234").GetAllAllocations()), "allocations on node-1 
not as expected")
-       assert.Equal(t, len(node2Allocations), 
len(part.GetNode("node-2:1234").GetAllAllocations()), "allocations on node-1 
not as expected")
+       assert.Equal(t, len(node1Allocations), 
len(part.GetNode("node-1:1234").GetYunikornAllocations()), "allocations on 
node-1 not as expected")
+       assert.Equal(t, len(node2Allocations), 
len(part.GetNode("node-2:1234").GetYunikornAllocations()), "allocations on 
node-1 not as expected")
 
        node1AllocatedMemory := 
part.GetNode("node-1:1234").GetAllocatedResource().Resources[common.Memory]
        node2AllocatedMemory := 
part.GetNode("node-2:1234").GetAllocatedResource().Resources[common.Memory]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to