This is an automated email from the ASF dual-hosted git repository.

mani pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/yunikorn-core.git


The following commit(s) were added to refs/heads/master by this push:
     new 79e66aa7 [YUNIKORN-1900] Orphan allocations with gang scheduling (#611)
79e66aa7 is described below

commit 79e66aa7ec60bf4b050d3b1b3c38ecd1bc822a7c
Author: Wilfred Spiegelenburg <[email protected]>
AuthorDate: Thu Aug 17 16:31:23 2023 +0530

    [YUNIKORN-1900] Orphan allocations with gang scheduling (#611)
    
    Transition back to RUNNING from COMPLETING when an allocation is
    confirmed as part of a placeholder replacement.
    
    Closes: #611
    
    Signed-off-by: Manikandan R <[email protected]>
---
 pkg/scheduler/objects/application.go |  4 +++-
 pkg/scheduler/partition_test.go      | 28 ++++++++++------------------
 2 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/pkg/scheduler/objects/application.go 
b/pkg/scheduler/objects/application.go
index 00d3650c..bdbf72fd 100644
--- a/pkg/scheduler/objects/application.go
+++ b/pkg/scheduler/objects/application.go
@@ -1642,7 +1642,9 @@ func (sa *Application) addAllocationInternal(info 
*Allocation) {
        } else {
                // skip the state change if this is the first replacement 
allocation as we have done that change
                // already when the last placeholder was allocated
-               if info.GetResult() != Replaced || 
!resources.IsZero(sa.allocatedResource) {
+               // special case COMPLETING: gang with only one placeholder 
moves to COMPLETING and causes orphaned
+               // allocations
+               if info.GetResult() != Replaced || 
!resources.IsZero(sa.allocatedResource) || sa.IsCompleting() {
                        // progress the state based on where we are, we should 
never fail in this case
                        // keep track of a failure in log.
                        if err := sa.HandleApplicationEvent(RunApplication); 
err != nil {
diff --git a/pkg/scheduler/partition_test.go b/pkg/scheduler/partition_test.go
index ed1e69e6..d1f55669 100644
--- a/pkg/scheduler/partition_test.go
+++ b/pkg/scheduler/partition_test.go
@@ -818,6 +818,7 @@ func TestRemoveNodeWithReplacement(t *testing.T) {
        assert.Equal(t, phID, released[0].GetUUID(), "uuid returned by release 
not the same as the placeholder")
        assert.Equal(t, allocID, confirmed[0].GetUUID(), "uuid returned by 
confirmed not the same as the real allocation")
        assert.Assert(t, resources.IsZero(app.GetPendingResource()), "app 
should not have pending resources")
+       assert.Assert(t, !app.IsCompleting(), "app should not be COMPLETING 
after confirming allocation")
        allocs = app.GetAllAllocations()
        assert.Equal(t, 1, len(allocs), "expected one allocation for the app 
(real)")
        assert.Equal(t, allocID, allocs[0].GetUUID(), "uuid for the app is not 
the same as the real allocation")
@@ -2801,6 +2802,7 @@ func TestPlaceholderBiggerThanReal(t *testing.T) {
        assert.Equal(t, 1, partition.GetTotalAllocationCount(), "real 
allocation should be registered on the partition")
        assert.Assert(t, resources.Equals(smallRes, 
app.GetQueue().GetAllocatedResource()), "real size should be allocated on 
queue")
        assert.Assert(t, resources.Equals(smallRes, 
node.GetAllocatedResource()), "real size should be allocated on node")
+       assert.Assert(t, !app.IsCompleting(), "application with allocation 
should not be in COMPLETING state")
        assertLimits(t, getTestUserGroup(), smallRes)
 }
 
@@ -3055,9 +3057,7 @@ func TestFailReplacePlaceholder(t *testing.T) {
        assert.Assert(t, alloc.IsPlaceholder(), "placeholder alloc should 
return a placeholder allocation")
        assert.Equal(t, alloc.GetResult(), objects.Allocated, "placeholder 
alloc should return an allocated result")
        assert.Equal(t, alloc.GetNodeID(), nodeID1, "should be allocated on 
node-1")
-       if !resources.Equals(app.GetPlaceholderResource(), res) {
-               t.Fatalf("placeholder allocation not updated as expected: got 
%s, expected %s", app.GetPlaceholderResource(), res)
-       }
+       assert.Assert(t, resources.Equals(app.GetPlaceholderResource(), res), 
"placeholder allocation not updated as expected: got %s, expected %s", 
app.GetPlaceholderResource(), res)
        assertLimits(t, getTestUserGroup(), res)
 
        // add 2nd node to allow allocation
@@ -3077,17 +3077,12 @@ func TestFailReplacePlaceholder(t *testing.T) {
        // allocation must be added as it is on a different node
        assert.Equal(t, alloc.GetNodeID(), nodeID2, "should be allocated on 
node-2")
        assert.Assert(t, resources.IsZero(app.GetAllocatedResource()), 
"allocated resources should be zero")
-       if !resources.Equals(node.GetAllocatedResource(), res) {
-               t.Fatalf("node-1 allocation not updated as expected: got %s, 
expected %s", node.GetAllocatedResource(), res)
-       }
-       if !resources.Equals(node2.GetAllocatedResource(), res) {
-               t.Fatalf("node-2 allocation not updated as expected: got %s, 
expected %s", node2.GetAllocatedResource(), res)
-       }
+       assert.Assert(t, resources.Equals(node.GetAllocatedResource(), res), 
"node-1 allocation not updated as expected: got %s, expected %s", 
node.GetAllocatedResource(), res)
+       assert.Assert(t, resources.Equals(node2.GetAllocatedResource(), res), 
"node-2 allocation not updated as expected: got %s, expected %s", 
node2.GetAllocatedResource(), res)
+
        phUUID := alloc.GetFirstRelease().GetUUID()
        // placeholder is not released until confirmed by the shim
-       if !resources.Equals(app.GetPlaceholderResource(), res) {
-               t.Fatalf("placeholder allocation not updated as expected: got 
%s, expected %s", app.GetPlaceholderResource(), resources.Multiply(res, 2))
-       }
+       assert.Assert(t, resources.Equals(app.GetPlaceholderResource(), res), 
"placeholder allocation not updated as expected: got %s, expected %s", 
app.GetPlaceholderResource(), resources.Multiply(res, 2))
        assertLimits(t, getTestUserGroup(), res)
 
        // release placeholder: do what the context would do after the shim 
processing
@@ -3105,13 +3100,10 @@ func TestFailReplacePlaceholder(t *testing.T) {
        }
        assert.Equal(t, confirmed.GetUUID(), alloc.GetUUID(), "confirmed 
allocation has unexpected uuid")
        assert.Assert(t, resources.IsZero(app.GetPlaceholderResource()), 
"placeholder resources should be zero")
-       if !resources.Equals(app.GetAllocatedResource(), res) {
-               t.Fatalf("allocations not updated as expected: got %s, expected 
%s", app.GetAllocatedResource(), res)
-       }
+       assert.Assert(t, resources.Equals(app.GetAllocatedResource(), res), 
"allocations not updated as expected: got %s, expected %s", 
app.GetAllocatedResource(), res)
        assert.Assert(t, resources.IsZero(node.GetAllocatedResource()), "node-1 
allocated resources should be zero")
-       if !resources.Equals(node2.GetAllocatedResource(), res) {
-               t.Fatalf("node-2 allocations not set as expected: got %s, 
expected %s", node2.GetAllocatedResource(), res)
-       }
+       assert.Assert(t, resources.Equals(node2.GetAllocatedResource(), res), 
"node-2 allocations not set as expected: got %s, expected %s", 
node2.GetAllocatedResource(), res)
+       assert.Assert(t, !app.IsCompleting(), "application with allocation 
should not be in COMPLETING state")
        assertLimits(t, getTestUserGroup(), res)
 }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to