This is an automated email from the ASF dual-hosted git repository.
pbacsko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/yunikorn-core.git
The following commit(s) were added to refs/heads/master by this push:
new 4ea22516 [YUNIKORN-2897] Health checker reports foreign allocation as
orphan (#977)
4ea22516 is described below
commit 4ea225160acfe8197e9da0dd1042361065025d40
Author: Peter Bacsko <[email protected]>
AuthorDate: Thu Oct 3 17:21:25 2024 +0200
[YUNIKORN-2897] Health checker reports foreign allocation as orphan (#977)
Closes: #977
Signed-off-by: Peter Bacsko <[email protected]>
---
pkg/scheduler/health_checker.go | 2 +-
pkg/scheduler/health_checker_test.go | 9 +++++++++
pkg/scheduler/objects/node.go | 13 -------------
pkg/scheduler/objects/node_test.go | 23 -----------------------
pkg/scheduler/objects/required_node_preemptor.go | 2 +-
pkg/scheduler/partition.go | 2 +-
pkg/scheduler/partition_test.go | 24 ++++++++++++------------
pkg/scheduler/tests/recovery_test.go | 4 ++--
8 files changed, 26 insertions(+), 53 deletions(-)
diff --git a/pkg/scheduler/health_checker.go b/pkg/scheduler/health_checker.go
index 177d9b8d..eaf1ea69 100644
--- a/pkg/scheduler/health_checker.go
+++ b/pkg/scheduler/health_checker.go
@@ -330,7 +330,7 @@ func checkAppAllocations(app *objects.Application, nodes
objects.NodeCollection)
func checkNodeAllocations(node *objects.Node, partitionContext
*PartitionContext) []*objects.Allocation {
orphanAllocationsOnNode := make([]*objects.Allocation, 0)
- for _, alloc := range node.GetAllAllocations() {
+ for _, alloc := range node.GetYunikornAllocations() {
if app :=
partitionContext.getApplication(alloc.GetApplicationID()); app != nil {
if !app.IsAllocationAssignedToApp(alloc) {
orphanAllocationsOnNode =
append(orphanAllocationsOnNode, alloc)
diff --git a/pkg/scheduler/health_checker_test.go
b/pkg/scheduler/health_checker_test.go
index 407f3f22..02402fc9 100644
--- a/pkg/scheduler/health_checker_test.go
+++ b/pkg/scheduler/health_checker_test.go
@@ -225,6 +225,15 @@ func TestGetSchedulerHealthStatusContext(t *testing.T) {
healthInfo = GetSchedulerHealthStatus(schedulerMetrics,
schedulerContext)
assert.Assert(t, healthInfo.HealthChecks[7].Succeeded, "The orphan
allocation check on the node should be successful")
assert.Assert(t, !healthInfo.HealthChecks[8].Succeeded, "The orphan
allocation check on the app should not be successful")
+ part.removeApplication("appID")
+ healthInfo = GetSchedulerHealthStatus(schedulerMetrics,
schedulerContext)
+ assert.Assert(t, healthInfo.HealthChecks[8].Succeeded, "The orphan
allocation check on the app still fails after removing the app")
+
+ // check that foreign allocation does not interfere with health check
+ falloc := newForeignAllocation("foreign-1", "node")
+ node.AddAllocation(falloc)
+ healthInfo = GetSchedulerHealthStatus(schedulerMetrics,
schedulerContext)
+ assert.Assert(t, healthInfo.HealthChecks[7].Succeeded, "Foreign
allocation was detected as orphan")
}
func TestGetSchedulerHealthStatusMetrics(t *testing.T) {
diff --git a/pkg/scheduler/objects/node.go b/pkg/scheduler/objects/node.go
index a4907111..db51fa74 100644
--- a/pkg/scheduler/objects/node.go
+++ b/pkg/scheduler/objects/node.go
@@ -230,19 +230,6 @@ func (sn *Node) GetAllocation(allocationKey string)
*Allocation {
return sn.allocations[allocationKey]
}
-// Get a copy of the allocations on this node
-func (sn *Node) GetAllAllocations() []*Allocation {
- sn.RLock()
- defer sn.RUnlock()
-
- arr := make([]*Allocation, 0)
- for _, v := range sn.allocations {
- arr = append(arr, v)
- }
-
- return arr
-}
-
// GetYunikornAllocations returns a copy of Yunikorn allocations on this node
func (sn *Node) GetYunikornAllocations() []*Allocation {
sn.RLock()
diff --git a/pkg/scheduler/objects/node_test.go
b/pkg/scheduler/objects/node_test.go
index 90a30e62..4b38e436 100644
--- a/pkg/scheduler/objects/node_test.go
+++ b/pkg/scheduler/objects/node_test.go
@@ -688,29 +688,6 @@ func TestGetAllocation(t *testing.T) {
}
}
-func TestGetAllAllocations(t *testing.T) {
- node := newNode("node-123", map[string]resources.Quantity{"first": 100,
"second": 200})
- if !resources.IsZero(node.GetAllocatedResource()) {
- t.Fatal("Failed to initialize resource")
- }
-
- // nothing allocated get an empty list
- allocs := node.GetAllAllocations()
- if allocs == nil || len(allocs) != 0 {
- t.Fatalf("allocation length should be 0 on new node")
- }
- alloc1 := newAllocation(appID1, nodeID1, nil)
- alloc2 := newAllocation(appID1, nodeID1, nil)
-
- // allocate
- node.AddAllocation(alloc1)
- node.AddAllocation(alloc2)
- assert.Equal(t, 2, len(node.GetAllAllocations()), "allocation length
mismatch")
- // This should not happen in real code just making sure the code does
do what is expected
- node.AddAllocation(alloc2)
- assert.Equal(t, 2, len(node.GetAllAllocations()), "allocation length
mismatch")
-}
-
func TestSchedulingState(t *testing.T) {
node := newNode("node-123", nil)
if !node.IsSchedulable() {
diff --git a/pkg/scheduler/objects/required_node_preemptor.go
b/pkg/scheduler/objects/required_node_preemptor.go
index 39ac81e5..507eb868 100644
--- a/pkg/scheduler/objects/required_node_preemptor.go
+++ b/pkg/scheduler/objects/required_node_preemptor.go
@@ -45,7 +45,7 @@ func NewRequiredNodePreemptor(node *Node, requiredAsk
*Allocation) *PreemptionCo
func (p *PreemptionContext) filterAllocations() {
p.Lock()
defer p.Unlock()
- for _, allocation := range p.node.GetAllAllocations() {
+ for _, allocation := range p.node.GetYunikornAllocations() {
// skip daemon set pods and higher priority allocation
if allocation.GetRequiredNode() != "" ||
allocation.GetPriority() > p.requiredAsk.GetPriority() {
continue
diff --git a/pkg/scheduler/partition.go b/pkg/scheduler/partition.go
index 0db2dbda..b3d72ba0 100644
--- a/pkg/scheduler/partition.go
+++ b/pkg/scheduler/partition.go
@@ -672,7 +672,7 @@ func (pc *PartitionContext) removeNodeAllocations(node
*objects.Node) ([]*object
released := make([]*objects.Allocation, 0)
confirmed := make([]*objects.Allocation, 0)
// walk over all allocations still registered for this node
- for _, alloc := range node.GetAllAllocations() {
+ for _, alloc := range node.GetYunikornAllocations() {
allocationKey := alloc.GetAllocationKey()
// since we are not locking the node and or application we
could have had an update while processing
// note that we do not return the allocation if the app or
allocation is not found and assume that it
diff --git a/pkg/scheduler/partition_test.go b/pkg/scheduler/partition_test.go
index a086d326..8a16d34d 100644
--- a/pkg/scheduler/partition_test.go
+++ b/pkg/scheduler/partition_test.go
@@ -267,7 +267,7 @@ func TestRemoveNodeWithAllocations(t *testing.T) {
assert.NilError(t, err)
assert.Check(t, allocCreated)
// get what was allocated
- allocated := node.GetAllAllocations()
+ allocated := node.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly")
assertLimits(t, getTestUserGroup(), appRes)
@@ -316,7 +316,7 @@ func TestRemoveNodeWithPlaceholders(t *testing.T) {
assert.NilError(t, err)
assert.Check(t, allocCreated)
// get what was allocated
- allocated := node1.GetAllAllocations()
+ allocated := node1.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to
node1 expected 1 got: %v", allocated)
assert.Assert(t, resources.Equals(node1.GetAllocatedResource(),
appRes), "allocation not added correctly to node1")
assertLimits(t, getTestUserGroup(), appRes)
@@ -432,7 +432,7 @@ func TestPlaceholderDataWithPlaceholderPreemption(t
*testing.T) {
assert.Check(t, allocCreated)
// get what was allocated
- allocated := node1.GetAllAllocations()
+ allocated := node1.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to
node1")
assert.Assert(t, resources.Equals(node1.GetAllocatedResource(),
appRes), "allocation not added correctly to node1")
@@ -561,7 +561,7 @@ func TestPlaceholderDataWithNodeRemoval(t *testing.T) {
assert.Check(t, allocCreated)
// get what was allocated
- allocated := node1.GetAllAllocations()
+ allocated := node1.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to
node1")
assert.Assert(t, resources.Equals(node1.GetAllocatedResource(),
appRes), "allocation not added correctly to node1")
@@ -648,7 +648,7 @@ func TestPlaceholderDataWithRemoval(t *testing.T) {
assert.Check(t, allocCreated)
// get what was allocated
- allocated := node1.GetAllAllocations()
+ allocated := node1.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to
node1")
assert.Assert(t, resources.Equals(node1.GetAllocatedResource(),
appRes), "allocation not added correctly to node1")
@@ -733,7 +733,7 @@ func TestRemoveNodeWithReplacement(t *testing.T) {
assert.Check(t, allocCreated)
// get what was allocated
- allocated := node1.GetAllAllocations()
+ allocated := node1.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to
node1")
assertLimits(t, getTestUserGroup(), appRes)
assert.Assert(t, resources.Equals(node1.GetAllocatedResource(),
appRes), "allocation not added correctly to node1")
@@ -753,7 +753,7 @@ func TestRemoveNodeWithReplacement(t *testing.T) {
alloc := newAllocationAll(allocKey, appID1, nodeID2, taskGroup, appRes,
1, false)
alloc.SetRelease(ph)
node2.AddAllocation(alloc)
- allocated = node2.GetAllAllocations()
+ allocated = node2.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to
node2")
assert.Assert(t, resources.Equals(node2.GetAllocatedResource(),
appRes), "allocation not added correctly to node2 (resource count)")
assertLimits(t, getTestUserGroup(), appRes)
@@ -768,7 +768,7 @@ func TestRemoveNodeWithReplacement(t *testing.T) {
// remove the node with the placeholder
released, confirmed := partition.removeNode(nodeID1)
assert.Equal(t, 1, partition.GetTotalNodeCount(), "node list was not
updated, node was not removed")
- assert.Equal(t, 1, len(node2.GetAllAllocations()), "remaining node
should have allocation")
+ assert.Equal(t, 1, len(node2.GetYunikornAllocations()), "remaining node
should have allocation")
assert.Equal(t, 1, len(released), "node removal did not release correct
allocation")
assert.Equal(t, 1, len(confirmed), "node removal did not confirm
correct allocation")
assert.Equal(t, ph.GetAllocationKey(), released[0].GetAllocationKey(),
"allocationKey returned by release not the same as the placeholder")
@@ -807,7 +807,7 @@ func TestRemoveNodeWithReal(t *testing.T) {
assert.NilError(t, err)
assert.Check(t, allocCreated)
// get what was allocated
- allocated := node1.GetAllAllocations()
+ allocated := node1.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to
node1")
assert.Assert(t, resources.Equals(node1.GetAllocatedResource(),
appRes), "allocation not added correctly to node1")
assertLimits(t, getTestUserGroup(), appRes)
@@ -827,7 +827,7 @@ func TestRemoveNodeWithReal(t *testing.T) {
alloc := newAllocationAll(allocKey, appID1, nodeID2, taskGroup, appRes,
1, false)
alloc.SetRelease(ph)
node2.AddAllocation(alloc)
- allocated = node2.GetAllAllocations()
+ allocated = node2.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to
node2")
assert.Assert(t, resources.Equals(node2.GetAllocatedResource(),
appRes), "allocation not added correctly to node2 (resource count)")
assertLimits(t, getTestUserGroup(), appRes)
@@ -4729,7 +4729,7 @@ func TestForeignAllocation(t *testing.T) {
assert.NilError(t, err)
assert.Equal(t, 1, len(partition.foreignAllocs))
assert.Equal(t, nodeID1, partition.foreignAllocs[foreignAlloc1])
- assert.Equal(t, 1, len(node1.GetAllAllocations()))
+ assert.Equal(t, 0, len(node1.GetYunikornAllocations()))
assert.Assert(t, node1.GetAllocation(foreignAlloc1) != nil)
// remove allocation
@@ -4739,6 +4739,6 @@ func TestForeignAllocation(t *testing.T) {
assert.Assert(t, released == nil)
assert.Assert(t, confirmed == nil)
assert.Equal(t, 0, len(partition.foreignAllocs))
- assert.Equal(t, 0, len(node1.GetAllAllocations()))
+ assert.Equal(t, 0, len(node1.GetYunikornAllocations()))
assert.Assert(t, node1.GetAllocation(foreignAlloc1) == nil)
}
diff --git a/pkg/scheduler/tests/recovery_test.go
b/pkg/scheduler/tests/recovery_test.go
index eeb9b8f4..a7102077 100644
--- a/pkg/scheduler/tests/recovery_test.go
+++ b/pkg/scheduler/tests/recovery_test.go
@@ -298,8 +298,8 @@ func TestSchedulerRecovery(t *testing.T) {
// verify nodes
assert.Equal(t, 2, part.GetTotalNodeCount(), "incorrect recovered node
count")
- assert.Equal(t, len(node1Allocations),
len(part.GetNode("node-1:1234").GetAllAllocations()), "allocations on node-1
not as expected")
- assert.Equal(t, len(node2Allocations),
len(part.GetNode("node-2:1234").GetAllAllocations()), "allocations on node-1
not as expected")
+ assert.Equal(t, len(node1Allocations),
len(part.GetNode("node-1:1234").GetYunikornAllocations()), "allocations on
node-1 not as expected")
+ assert.Equal(t, len(node2Allocations),
len(part.GetNode("node-2:1234").GetYunikornAllocations()), "allocations on
node-1 not as expected")
node1AllocatedMemory :=
part.GetNode("node-1:1234").GetAllocatedResource().Resources[common.Memory]
node2AllocatedMemory :=
part.GetNode("node-2:1234").GetAllocatedResource().Resources[common.Memory]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]