This is an automated email from the ASF dual-hosted git repository.
mani pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/yunikorn-core.git
The following commit(s) were added to refs/heads/master by this push:
new 956ade54 [YUNIKORN-3241] Fix fence root searching when queue is
over/at max capacity (#1076)
956ade54 is described below
commit 956ade546a8914516a29cff90d0d3e5cc773ec1f
Author: Victor Zhou <[email protected]>
AuthorDate: Tue Mar 24 12:15:12 2026 +0530
[YUNIKORN-3241] Fix fence root searching when queue is over/at max capacity
(#1076)
Closes: #1076
Signed-off-by: mani <[email protected]>
---
pkg/scheduler/objects/queue.go | 19 ++++++++++++++-----
pkg/scheduler/objects/queue_test.go | 32 ++++++++++++++++++++++++++------
2 files changed, 40 insertions(+), 11 deletions(-)
diff --git a/pkg/scheduler/objects/queue.go b/pkg/scheduler/objects/queue.go
index db7a1f18..45ce26db 100644
--- a/pkg/scheduler/objects/queue.go
+++ b/pkg/scheduler/objects/queue.go
@@ -2005,7 +2005,7 @@ func (sq *Queue) FindEligiblePreemptionVictims(queuePath
string, ask *Allocation
priorityMap := make(map[string]int64)
// get the queue which acts as the fence boundary
- fence := sq.findPreemptionFenceRoot(priorityMap, int64(ask.priority))
+ fence := sq.findPreemptionFenceRoot(priorityMap, int64(ask.priority),
ask.GetAllocatedResource())
if fence == nil {
return nil
}
@@ -2184,7 +2184,7 @@ func prunePreemptionSnapshots(results
map[string]*QueuePreemptionSnapshot, askQu
}
}
-func (sq *Queue) findPreemptionFenceRoot(priorityMap map[string]int64,
currentPriority int64) *Queue {
+func (sq *Queue) findPreemptionFenceRoot(priorityMap map[string]int64,
currentPriority int64, askResource *resources.Resource) *Queue {
if sq == nil {
return nil
}
@@ -2197,11 +2197,20 @@ func (sq *Queue) findPreemptionFenceRoot(priorityMap
map[string]int64, currentPr
}
priorityMap[sq.QueuePath] = currentPriority
- // Return this queue as fence root if: 1. FencePreemptionPolicy is set
2. root queue 3. allocations in the queue reached maximum resources
- if sq.parent == nil || sq.GetPreemptionPolicy() ==
policies.FencePreemptionPolicy || resources.Equals(sq.maxResource,
sq.allocatedResource) {
+ shouldFenceByMax := false
+ maxResource := sq.GetMaxResource()
+ if maxResource != nil && len(maxResource.Resources) > 0 {
+ projected := resources.Add(sq.allocatedResource, askResource)
+ shouldFenceByMax =
!maxResource.StrictlyGreaterThanOrEqualsOnlyExisting(projected)
+ }
+ // Return this queue as fence root if:
+ // 1. FencePreemptionPolicy is set
+ // 2. root queue
+ // 3. projected allocations (current usage + ask) would exceed
any configured max resource (reaching max exactly does not fence)
+ if sq.parent == nil || sq.GetPreemptionPolicy() ==
policies.FencePreemptionPolicy || shouldFenceByMax {
return sq
}
- return sq.parent.findPreemptionFenceRoot(priorityMap, currentPriority)
+ return sq.parent.findPreemptionFenceRoot(priorityMap, currentPriority,
askResource)
}
func (sq *Queue) GetCurrentPriority() int32 {
diff --git a/pkg/scheduler/objects/queue_test.go
b/pkg/scheduler/objects/queue_test.go
index d78d6334..dd00d46e 100644
--- a/pkg/scheduler/objects/queue_test.go
+++ b/pkg/scheduler/objects/queue_test.go
@@ -2019,14 +2019,14 @@ func TestFindEligiblePreemptionVictims(t *testing.T) {
// disabling preemption on victim queue should remove victims from
consideration
leaf2.preemptionPolicy = policies.DisabledPreemptionPolicy
- assert.Equal(t, leaf1.findPreemptionFenceRoot(make(map[string]int64),
int64(ask.priority)).QueuePath, "root")
+ assert.Equal(t, leaf1.findPreemptionFenceRoot(make(map[string]int64),
int64(ask.priority), ask.GetAllocatedResource()).QueuePath, "root")
snapshot = leaf1.FindEligiblePreemptionVictims(leaf1.QueuePath, ask)
assert.Equal(t, 0, len(victims(snapshot)), "found victims")
leaf2.preemptionPolicy = policies.DefaultPreemptionPolicy
// fencing parent1 queue should limit scope
parent1.preemptionPolicy = policies.FencePreemptionPolicy
- assert.Equal(t, leaf1.findPreemptionFenceRoot(make(map[string]int64),
int64(ask.priority)).QueuePath, "root.parent1")
+ assert.Equal(t, leaf1.findPreemptionFenceRoot(make(map[string]int64),
int64(ask.priority), ask.GetAllocatedResource()).QueuePath, "root.parent1")
snapshot = leaf1.FindEligiblePreemptionVictims(leaf1.QueuePath, ask)
assert.Equal(t, 3, len(snapshot), "wrong snapshot count")
assert.Equal(t, 0, len(victims(snapshot)), "found victims")
@@ -2034,7 +2034,7 @@ func TestFindEligiblePreemptionVictims(t *testing.T) {
// fencing leaf1 queue should limit scope
leaf1.preemptionPolicy = policies.FencePreemptionPolicy
- assert.Equal(t, leaf1.findPreemptionFenceRoot(make(map[string]int64),
int64(ask.priority)).QueuePath, "root.parent1.leaf1")
+ assert.Equal(t, leaf1.findPreemptionFenceRoot(make(map[string]int64),
int64(ask.priority), ask.GetAllocatedResource()).QueuePath,
"root.parent1.leaf1")
snapshot = leaf1.FindEligiblePreemptionVictims(leaf1.QueuePath, ask)
assert.Equal(t, 3, len(snapshot), "wrong snapshot count")
assert.Equal(t, 0, len(victims(snapshot)), "found victims")
@@ -2042,14 +2042,14 @@ func TestFindEligiblePreemptionVictims(t *testing.T) {
// fencing parent2 queue should not limit scope
parent2.preemptionPolicy = policies.FencePreemptionPolicy
- assert.Equal(t, leaf1.findPreemptionFenceRoot(make(map[string]int64),
int64(ask.priority)).QueuePath, "root")
+ assert.Equal(t, leaf1.findPreemptionFenceRoot(make(map[string]int64),
int64(ask.priority), ask.GetAllocatedResource()).QueuePath, "root")
snapshot = leaf1.FindEligiblePreemptionVictims(leaf1.QueuePath, ask)
assert.Equal(t, 2, len(victims(snapshot)), "wrong victim count")
parent2.preemptionPolicy = policies.DefaultPreemptionPolicy
// fencing leaf2 queue should not limit scope
leaf2.preemptionPolicy = policies.FencePreemptionPolicy
- assert.Equal(t, leaf1.findPreemptionFenceRoot(make(map[string]int64),
int64(ask.priority)).QueuePath, "root")
+ assert.Equal(t, leaf1.findPreemptionFenceRoot(make(map[string]int64),
int64(ask.priority), ask.GetAllocatedResource()).QueuePath, "root")
snapshot = leaf1.FindEligiblePreemptionVictims(leaf1.QueuePath, ask)
assert.Equal(t, 5, len(snapshot), "wrong victim count")
assert.Equal(t, 2, len(victims(snapshot)), "wrong victim count")
@@ -2062,12 +2062,32 @@ func TestFindEligiblePreemptionVictims(t *testing.T) {
parent1.allocatedResource = parent1.maxResource
assert.Equal(t, parent1.preemptionPolicy,
policies.DefaultPreemptionPolicy)
assert.Equal(t, leaf1.preemptionPolicy,
policies.DefaultPreemptionPolicy)
- assert.Equal(t, leaf1.findPreemptionFenceRoot(make(map[string]int64),
int64(ask.priority)).QueuePath, parent1.QueuePath)
+ assert.Equal(t, leaf1.findPreemptionFenceRoot(make(map[string]int64),
int64(ask.priority), ask.GetAllocatedResource()).QueuePath, parent1.QueuePath)
snapshot = leaf1.FindEligiblePreemptionVictims(leaf1.QueuePath, ask)
assert.Equal(t, 3, len(snapshot), "wrong snapshot count")
assert.Equal(t, 0, len(victims(snapshot)), "wrong victim count")
parent1.allocatedResource = used
+ // parent1 queue is not full yet, and this ask would bring it exactly
to max resources.
+ // Reaching max exactly should not fence by max check.
+ parent1.allocatedResource =
resources.NewResourceFromMap(map[string]resources.Quantity{siCommon.Memory:
100})
+ assert.Equal(t, leaf1.findPreemptionFenceRoot(make(map[string]int64),
int64(ask.priority), ask.GetAllocatedResource()).QueuePath, "root")
+ snapshot = leaf1.FindEligiblePreemptionVictims(leaf1.QueuePath, ask)
+ assert.Equal(t, 5, len(snapshot), "wrong snapshot count")
+ assert.Equal(t, 2, len(victims(snapshot)), "wrong victim count")
+ parent1.allocatedResource = used
+
+ // empty max resources should not fence by max check
+ usedMax := parent1.maxResource
+ parent1.maxResource = resources.NewResource()
+ assert.Equal(t, leaf1.findPreemptionFenceRoot(make(map[string]int64),
int64(ask.priority), ask.GetAllocatedResource()).QueuePath, "root")
+ parent1.maxResource = usedMax
+
+ // disjoint max and projected resources should not fence by max check
+ parent1.maxResource =
resources.NewResourceFromMap(map[string]resources.Quantity{"gpu": 1})
+ assert.Equal(t, leaf1.findPreemptionFenceRoot(make(map[string]int64),
int64(ask.priority), ask.GetAllocatedResource()).QueuePath, "root")
+ parent1.maxResource = usedMax
+
// requiring a specific node take alloc out of consideration
alloc2.SetRequiredNode(nodeID1)
snapshot = leaf1.FindEligiblePreemptionVictims(leaf1.QueuePath, ask)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]