This is an automated email from the ASF dual-hosted git repository.
mani pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/yunikorn-core.git
The following commit(s) were added to refs/heads/master by this push:
new 79e66aa7 [YUNIKORN-1900] Orphan allocations with gang scheduling (#611)
79e66aa7 is described below
commit 79e66aa7ec60bf4b050d3b1b3c38ecd1bc822a7c
Author: Wilfred Spiegelenburg <[email protected]>
AuthorDate: Thu Aug 17 16:31:23 2023 +0530
[YUNIKORN-1900] Orphan allocations with gang scheduling (#611)
Transition back to RUNNING from COMPLETING when an allocation is
confirmed as part of a placeholder replacement.
Closes: #611
Signed-off-by: Manikandan R <[email protected]>
---
pkg/scheduler/objects/application.go | 4 +++-
pkg/scheduler/partition_test.go | 28 ++++++++++------------------
2 files changed, 13 insertions(+), 19 deletions(-)
diff --git a/pkg/scheduler/objects/application.go
b/pkg/scheduler/objects/application.go
index 00d3650c..bdbf72fd 100644
--- a/pkg/scheduler/objects/application.go
+++ b/pkg/scheduler/objects/application.go
@@ -1642,7 +1642,9 @@ func (sa *Application) addAllocationInternal(info
*Allocation) {
} else {
// skip the state change if this is the first replacement
allocation as we have done that change
// already when the last placeholder was allocated
- if info.GetResult() != Replaced ||
!resources.IsZero(sa.allocatedResource) {
+ // special case COMPLETING: gang with only one placeholder
moves to COMPLETING and causes orphaned
+ // allocations
+ if info.GetResult() != Replaced ||
!resources.IsZero(sa.allocatedResource) || sa.IsCompleting() {
// progress the state based on where we are, we should
never fail in this case
// keep track of a failure in log.
if err := sa.HandleApplicationEvent(RunApplication);
err != nil {
diff --git a/pkg/scheduler/partition_test.go b/pkg/scheduler/partition_test.go
index ed1e69e6..d1f55669 100644
--- a/pkg/scheduler/partition_test.go
+++ b/pkg/scheduler/partition_test.go
@@ -818,6 +818,7 @@ func TestRemoveNodeWithReplacement(t *testing.T) {
assert.Equal(t, phID, released[0].GetUUID(), "uuid returned by release
not the same as the placeholder")
assert.Equal(t, allocID, confirmed[0].GetUUID(), "uuid returned by
confirmed not the same as the real allocation")
assert.Assert(t, resources.IsZero(app.GetPendingResource()), "app
should not have pending resources")
+ assert.Assert(t, !app.IsCompleting(), "app should not be COMPLETING
after confirming allocation")
allocs = app.GetAllAllocations()
assert.Equal(t, 1, len(allocs), "expected one allocation for the app
(real)")
assert.Equal(t, allocID, allocs[0].GetUUID(), "uuid for the app is not
the same as the real allocation")
@@ -2801,6 +2802,7 @@ func TestPlaceholderBiggerThanReal(t *testing.T) {
assert.Equal(t, 1, partition.GetTotalAllocationCount(), "real
allocation should be registered on the partition")
assert.Assert(t, resources.Equals(smallRes,
app.GetQueue().GetAllocatedResource()), "real size should be allocated on
queue")
assert.Assert(t, resources.Equals(smallRes,
node.GetAllocatedResource()), "real size should be allocated on node")
+ assert.Assert(t, !app.IsCompleting(), "application with allocation
should not be in COMPLETING state")
assertLimits(t, getTestUserGroup(), smallRes)
}
@@ -3055,9 +3057,7 @@ func TestFailReplacePlaceholder(t *testing.T) {
assert.Assert(t, alloc.IsPlaceholder(), "placeholder alloc should
return a placeholder allocation")
assert.Equal(t, alloc.GetResult(), objects.Allocated, "placeholder
alloc should return an allocated result")
assert.Equal(t, alloc.GetNodeID(), nodeID1, "should be allocated on
node-1")
- if !resources.Equals(app.GetPlaceholderResource(), res) {
- t.Fatalf("placeholder allocation not updated as expected: got
%s, expected %s", app.GetPlaceholderResource(), res)
- }
+ assert.Assert(t, resources.Equals(app.GetPlaceholderResource(), res),
"placeholder allocation not updated as expected: got %s, expected %s",
app.GetPlaceholderResource(), res)
assertLimits(t, getTestUserGroup(), res)
// add 2nd node to allow allocation
@@ -3077,17 +3077,12 @@ func TestFailReplacePlaceholder(t *testing.T) {
// allocation must be added as it is on a different node
assert.Equal(t, alloc.GetNodeID(), nodeID2, "should be allocated on
node-2")
assert.Assert(t, resources.IsZero(app.GetAllocatedResource()),
"allocated resources should be zero")
- if !resources.Equals(node.GetAllocatedResource(), res) {
- t.Fatalf("node-1 allocation not updated as expected: got %s,
expected %s", node.GetAllocatedResource(), res)
- }
- if !resources.Equals(node2.GetAllocatedResource(), res) {
- t.Fatalf("node-2 allocation not updated as expected: got %s,
expected %s", node2.GetAllocatedResource(), res)
- }
+ assert.Assert(t, resources.Equals(node.GetAllocatedResource(), res),
"node-1 allocation not updated as expected: got %s, expected %s",
node.GetAllocatedResource(), res)
+ assert.Assert(t, resources.Equals(node2.GetAllocatedResource(), res),
"node-2 allocation not updated as expected: got %s, expected %s",
node2.GetAllocatedResource(), res)
+
phUUID := alloc.GetFirstRelease().GetUUID()
// placeholder is not released until confirmed by the shim
- if !resources.Equals(app.GetPlaceholderResource(), res) {
- t.Fatalf("placeholder allocation not updated as expected: got
%s, expected %s", app.GetPlaceholderResource(), resources.Multiply(res, 2))
- }
+ assert.Assert(t, resources.Equals(app.GetPlaceholderResource(), res),
"placeholder allocation not updated as expected: got %s, expected %s",
app.GetPlaceholderResource(), resources.Multiply(res, 2))
assertLimits(t, getTestUserGroup(), res)
// release placeholder: do what the context would do after the shim
processing
@@ -3105,13 +3100,10 @@ func TestFailReplacePlaceholder(t *testing.T) {
}
assert.Equal(t, confirmed.GetUUID(), alloc.GetUUID(), "confirmed
allocation has unexpected uuid")
assert.Assert(t, resources.IsZero(app.GetPlaceholderResource()),
"placeholder resources should be zero")
- if !resources.Equals(app.GetAllocatedResource(), res) {
- t.Fatalf("allocations not updated as expected: got %s, expected
%s", app.GetAllocatedResource(), res)
- }
+ assert.Assert(t, resources.Equals(app.GetAllocatedResource(), res),
"allocations not updated as expected: got %s, expected %s",
app.GetAllocatedResource(), res)
assert.Assert(t, resources.IsZero(node.GetAllocatedResource()), "node-1
allocated resources should be zero")
- if !resources.Equals(node2.GetAllocatedResource(), res) {
- t.Fatalf("node-2 allocations not set as expected: got %s,
expected %s", node2.GetAllocatedResource(), res)
- }
+ assert.Assert(t, resources.Equals(node2.GetAllocatedResource(), res),
"node-2 allocations not set as expected: got %s, expected %s",
node2.GetAllocatedResource(), res)
+ assert.Assert(t, !app.IsCompleting(), "application with allocation
should not be in COMPLETING state")
assertLimits(t, getTestUserGroup(), res)
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]