This is an automated email from the ASF dual-hosted git repository.
mani pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/yunikorn-core.git
The following commit(s) were added to refs/heads/master by this push:
new e3bcd343 [YUNIKORN-2270] GPU Preemption is not triggered as expected
when all available GPUs are used. (#759)
e3bcd343 is described below
commit e3bcd343669d65450b88b20cdad0abd0e5b61789
Author: Weiwei Yang <[email protected]>
AuthorDate: Thu Dec 21 11:46:35 2023 +0530
[YUNIKORN-2270] GPU Preemption is not triggered as expected when all
available GPUs are used. (#759)
Closes: #759
Signed-off-by: Manikandan R <[email protected]>
---
pkg/scheduler/objects/application_test.go | 2 +-
pkg/scheduler/objects/preemption.go | 15 +++------------
pkg/scheduler/partition_test.go | 3 ++-
3 files changed, 6 insertions(+), 14 deletions(-)
diff --git a/pkg/scheduler/objects/application_test.go b/pkg/scheduler/objects/application_test.go
index 55cebf30..f91d5216 100644
--- a/pkg/scheduler/objects/application_test.go
+++ b/pkg/scheduler/objects/application_test.go
@@ -1871,7 +1871,7 @@ func TestTryAllocatePreemptQueue(t *testing.T) {
// pass the time and try again
ask3.createTime = ask3.createTime.Add(-30 * time.Second)
alloc3 = app2.tryAllocate(resources.NewResourceFromMap(map[string]resources.Quantity{"first": 0}), true, 30*time.Second, &preemptionAttemptsRemaining, iterator, iterator, getNode)
- assert.Assert(t, alloc3 == nil, "alloc3 not expected")
+ assert.Assert(t, alloc3 != nil && alloc3.result == Reserved, "alloc3 should be a reservation")
assert.Assert(t, alloc2.IsPreempted(), "alloc2 should have been preempted")
}
diff --git a/pkg/scheduler/objects/preemption.go b/pkg/scheduler/objects/preemption.go
index ebf82da8..0e37845b 100644
--- a/pkg/scheduler/objects/preemption.go
+++ b/pkg/scheduler/objects/preemption.go
@@ -573,20 +573,11 @@ func (p *Preemptor) TryPreemption() (*Allocation, bool) {
"preempting allocations to free up resources to run ask: "+p.ask.GetAllocationKey())
// reserve the selected node for the new allocation if it will fit
- if p.headRoom.FitInMaxUndef(p.ask.GetAllocatedResource()) {
- log.Log(log.SchedPreemption).Info("Reserving node for ask after preemption",
- zap.String("allocationKey", p.ask.GetAllocationKey()),
- zap.String("nodeID", nodeID),
- zap.Int("victimCount", len(victims)))
- return newReservedAllocation(nodeID, p.ask), true
- }
-
- // can't reserve as queue is still too full, but scheduling should succeed after preemption occurs
- log.Log(log.SchedPreemption).Info("Preempting allocations for ask, but not reserving yet as queue is still above capacity",
+ log.Log(log.SchedPreemption).Info("Reserving node for ask after preemption",
zap.String("allocationKey", p.ask.GetAllocationKey()),
+ zap.String("nodeID", nodeID),
zap.Int("victimCount", len(victims)))
-
- return nil, true
+ return newReservedAllocation(nodeID, p.ask), true
}
type predicateCheckResult struct {
diff --git a/pkg/scheduler/partition_test.go b/pkg/scheduler/partition_test.go
index 3ac19868..1565f5fb 100644
--- a/pkg/scheduler/partition_test.go
+++ b/pkg/scheduler/partition_test.go
@@ -1908,6 +1908,7 @@ func TestPreemption(t *testing.T) {
assert.Assert(t, alloc2.IsPreempted(), "alloc-2 is not preempted")
// allocation should still not do anything as we have not yet released the preempted allocation
+ // but the ask should have a reservation
alloc = partition.tryAllocate()
if alloc != nil {
t.Fatal("unexpected allocation")
@@ -1937,7 +1938,7 @@ func TestPreemption(t *testing.T) {
t.Fatal("missing allocation")
}
assert.Equal(t, 0, len(app2.GetReservations()), "ask should not be reserved")
- assert.Equal(t, alloc.GetResult(), objects.Allocated, "result should be allocated")
+ assert.Equal(t, alloc.GetResult(), objects.AllocatedReserved, "result should be allocated from reservation")
assert.Equal(t, alloc.GetAllocationKey(), allocID3, "expected ask alloc-3 to be allocated")
assertUserGroupResourceMaxLimits(t, getTestUserGroup(), resources.NewResourceFromMap(map[string]resources.Quantity{"vcore": 10000}), getExpectedQueuesLimitsForPreemption())
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]