This is an automated email from the ASF dual-hosted git repository.
ccondit pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/yunikorn-core.git
The following commit(s) were added to refs/heads/master by this push:
new 09e5d741 [YUNIKORN-2495] Remove Starting application state (#824)
09e5d741 is described below
commit 09e5d741b67c397330a6c0d630131a25ae75ef4d
Author: Craig Condit <[email protected]>
AuthorDate: Wed Mar 20 09:28:32 2024 -0500
[YUNIKORN-2495] Remove Starting application state (#824)
Closes: #824
---
go.mod | 4 +-
go.sum | 4 +-
pkg/scheduler/objects/application.go | 20 ++----
pkg/scheduler/objects/application_state.go | 46 ++++----------
pkg/scheduler/objects/application_state_test.go | 80 ++++++++----------------
pkg/scheduler/objects/application_test.go | 59 ++++-------------
pkg/scheduler/objects/queue_test.go | 2 +-
pkg/scheduler/partition_test.go | 2 +-
pkg/scheduler/tests/application_tracking_test.go | 4 +-
pkg/webservice/handlers.go | 1 -
pkg/webservice/handlers_test.go | 33 +++++-----
11 files changed, 77 insertions(+), 178 deletions(-)
diff --git a/go.mod b/go.mod
index 0c527eeb..23a22da2 100644
--- a/go.mod
+++ b/go.mod
@@ -22,7 +22,7 @@ module github.com/apache/yunikorn-core
go 1.21
require (
- github.com/apache/yunikorn-scheduler-interface
v0.0.0-20240222205935-94c25b6d2579
+ github.com/apache/yunikorn-scheduler-interface
v0.0.0-20240320010951-d392f3c7272d
github.com/google/btree v1.1.2
github.com/google/go-cmp v0.6.0
github.com/google/uuid v1.6.0
@@ -50,7 +50,7 @@ require (
golang.org/x/sys v0.17.0 // indirect
golang.org/x/text v0.14.0 // indirect
google.golang.org/genproto/googleapis/rpc
v0.0.0-20230711160842-782d3b101e98 // indirect
- google.golang.org/protobuf v1.31.0 // indirect
+ google.golang.org/protobuf v1.33.0 // indirect
)
replace (
diff --git a/go.sum b/go.sum
index cc5fd398..7831f636 100644
--- a/go.sum
+++ b/go.sum
@@ -1,5 +1,5 @@
-github.com/apache/yunikorn-scheduler-interface
v0.0.0-20240222205935-94c25b6d2579
h1:uXGmiGX3sfrUtPM8sb4Nco5pgzPKrDJ0CtiBizhgbRM=
-github.com/apache/yunikorn-scheduler-interface
v0.0.0-20240222205935-94c25b6d2579/go.mod
h1:3jCo/Ash4yEmw05ozK3BihJDEEAMOZEN7rmxNfb0gO0=
+github.com/apache/yunikorn-scheduler-interface
v0.0.0-20240320010951-d392f3c7272d
h1:ywKv8csdOkrUcQqHit60/PuHXuELpHcZ4ftm7/b3c6Q=
+github.com/apache/yunikorn-scheduler-interface
v0.0.0-20240320010951-d392f3c7272d/go.mod
h1:0f4l3ManMROX60xU7GbhejCEYYyMksH275oY2xIVkbM=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod
h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/cespare/xxhash/v2 v2.2.0
h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
diff --git a/pkg/scheduler/objects/application.go
b/pkg/scheduler/objects/application.go
index 1461a497..7606840f 100644
--- a/pkg/scheduler/objects/application.go
+++ b/pkg/scheduler/objects/application.go
@@ -44,7 +44,6 @@ import (
var (
reservationDelay = 2 * time.Second
- startingTimeout = 5 * time.Minute
completingTimeout = 30 * time.Second
terminatedTimeout = 3 * 24 * time.Hour
defaultPlaceholderTimeout = 15 * time.Minute
@@ -100,7 +99,6 @@ type Application struct {
placeholderAsk *resources.Resource // total placeholder
request for the app (all task groups)
stateMachine *fsm.FSM // application state
machine
stateTimer *time.Timer // timer for state time
- startTimeout time.Duration // timeout for the
application starting state
execTimeout time.Duration // execTimeout for the
application run
placeholderTimer *time.Timer // placeholder replace
timer
gangSchedulingStyle string // gang scheduling
style can be hard (after timeout we fail the application), or soft (after
timeout we schedule it as a normal application)
@@ -184,7 +182,6 @@ func NewApplication(siApp *si.AddApplicationRequest, ugi
security.UserGroup, eve
}
app.gangSchedulingStyle = gangSchedStyle
app.execTimeout = placeholderTimeout
- app.startTimeout = startingTimeout
app.user = ugi
app.rmEventHandler = eventHandler
app.rmID = rmID
@@ -233,10 +230,6 @@ func (sa *Application) CurrentState() string {
return sa.stateMachine.Current()
}
-func (sa *Application) IsStarting() bool {
- return sa.stateMachine.Is(Starting.String())
-}
-
func (sa *Application) IsAccepted() bool {
return sa.stateMachine.Is(Accepted.String())
}
@@ -327,9 +320,9 @@ func (sa *Application) OnStateChange(event *fsm.Event,
eventInfo string) {
})
}
-// Set the starting timer to make sure the application will not get stuck in a
starting state too long.
-// This prevents an app from not progressing to Running when it only has 1
allocation.
-// Called when entering the Starting state by the state machine.
+// Set the state timer to make sure the application will not get stuck in a
time-sensitive state too long.
+// This prevents an app from not progressing to the next state if a timeout is
required.
+// Used for placeholder timeout and completion handling.
func (sa *Application) setStateTimer(timeout time.Duration, currentState
string, event applicationEvent) {
log.Log(log.SchedApplication).Debug("Application state timer initiated",
zap.String("appID", sa.ApplicationID),
@@ -371,9 +364,8 @@ func (sa *Application) timeoutStateTimer(expectedState
string, event application
}
}
-// Clear the starting timer. If the application has progressed out of the
starting state we need to stop the
-// timer and clean up.
-// Called when leaving the Starting state by the state machine.
+// Clear the state timer. If the application has progressed out of a
time-sensitive state we need to stop the timer and
+// clean up. Called when transitioning from Completing to Completed or when
expiring an application.
func (sa *Application) clearStateTimer() {
if sa == nil || sa.stateTimer == nil {
return
@@ -416,7 +408,7 @@ func (sa *Application) timeoutPlaceholderProcessing() {
defer sa.Unlock()
switch {
// Case 1: if all app's placeholders are allocated, only part of them
gets replaced, just delete the remaining placeholders
- case (sa.IsRunning() || sa.IsStarting() || sa.IsCompleting()) &&
!resources.IsZero(sa.allocatedPlaceholder):
+ case (sa.IsRunning() || sa.IsCompleting()) &&
!resources.IsZero(sa.allocatedPlaceholder):
var toRelease []*Allocation
replacing := 0
for _, alloc := range sa.getPlaceholderAllocations() {
diff --git a/pkg/scheduler/objects/application_state.go
b/pkg/scheduler/objects/application_state.go
index d07a72cc..a23f61da 100644
--- a/pkg/scheduler/objects/application_state.go
+++ b/pkg/scheduler/objects/application_state.go
@@ -64,7 +64,6 @@ type applicationState int
const (
New applicationState = iota
Accepted
- Starting
Running
Rejected
Completing
@@ -77,7 +76,6 @@ const (
var stateEvents = map[string]si.EventRecord_ChangeDetail{
Accepted.String(): si.EventRecord_APP_ACCEPTED,
- Starting.String(): si.EventRecord_APP_STARTING,
Running.String(): si.EventRecord_APP_RUNNING,
Rejected.String(): si.EventRecord_APP_REJECT,
Completing.String(): si.EventRecord_APP_COMPLETING,
@@ -89,7 +87,7 @@ var stateEvents = map[string]si.EventRecord_ChangeDetail{
}
func (as applicationState) String() string {
- return [...]string{"New", "Accepted", "Starting", "Running",
"Rejected", "Completing", "Completed", "Failing", "Failed", "Expired",
"Resuming"}[as]
+ return [...]string{"New", "Accepted", "Running", "Rejected",
"Completing", "Completed", "Failing", "Failed", "Expired", "Resuming"}[as]
}
func NewAppState() *fsm.FSM {
@@ -105,15 +103,11 @@ func NewAppState() *fsm.FSM {
Dst: Accepted.String(),
}, {
Name: RunApplication.String(),
- Src: []string{Accepted.String()},
- Dst: Starting.String(),
- }, {
- Name: RunApplication.String(),
- Src: []string{Running.String(),
Starting.String(), Completing.String()},
+ Src: []string{Accepted.String(),
Running.String(), Completing.String()},
Dst: Running.String(),
}, {
Name: CompleteApplication.String(),
- Src: []string{Accepted.String(),
Running.String(), Starting.String()},
+ Src: []string{Accepted.String(),
Running.String()},
Dst: Completing.String(),
}, {
Name: CompleteApplication.String(),
@@ -121,7 +115,7 @@ func NewAppState() *fsm.FSM {
Dst: Completed.String(),
}, {
Name: FailApplication.String(),
- Src: []string{New.String(), Accepted.String(),
Starting.String(), Running.String()},
+ Src: []string{New.String(), Accepted.String(),
Running.String()},
Dst: Failing.String(),
}, {
Name: FailApplication.String(),
@@ -169,22 +163,6 @@ func NewAppState() *fsm.FSM {
"leave_state": func(_ context.Context, event
*fsm.Event) {
event.Args[0].(*Application).clearStateTimer()
//nolint:errcheck
},
- fmt.Sprintf("enter_%s", Starting.String()): func(_
context.Context, event *fsm.Event) {
- app := event.Args[0].(*Application)
//nolint:errcheck
- app.startTime = time.Now()
- app.setStateTimer(app.startTimeout,
app.stateMachine.Current(), RunApplication)
- app.queue.incRunningApps(app.ApplicationID)
-
metrics.GetQueueMetrics(app.queuePath).IncQueueApplicationsRunning()
-
metrics.GetSchedulerMetrics().IncTotalApplicationsRunning()
- },
- fmt.Sprintf("leave_%s", Starting.String()): func(_
context.Context, event *fsm.Event) {
- if event.Dst != Running.String() {
- app := event.Args[0].(*Application)
//nolint:errcheck
- app.queue.decRunningApps()
-
metrics.GetQueueMetrics(app.queuePath).DecQueueApplicationsRunning()
-
metrics.GetSchedulerMetrics().DecTotalApplicationsRunning()
- }
- },
fmt.Sprintf("enter_%s", Completing.String()): func(_
context.Context, event *fsm.Event) {
app := event.Args[0].(*Application)
//nolint:errcheck
app.setStateTimer(completingTimeout,
app.stateMachine.Current(), CompleteApplication)
@@ -209,19 +187,21 @@ func NewAppState() *fsm.FSM {
}
},
fmt.Sprintf("enter_%s", Running.String()): func(_
context.Context, event *fsm.Event) {
- app := event.Args[0].(*Application)
//nolint:errcheck
- // account for going back into running state
- if event.Src == Completing.String() {
+ if event.Src != Running.String() {
+ app := event.Args[0].(*Application)
//nolint:errcheck
+ app.startTime = time.Now()
app.queue.incRunningApps(app.ApplicationID)
metrics.GetQueueMetrics(app.queuePath).IncQueueApplicationsRunning()
metrics.GetSchedulerMetrics().IncTotalApplicationsRunning()
}
},
fmt.Sprintf("leave_%s", Running.String()): func(_
context.Context, event *fsm.Event) {
- app := event.Args[0].(*Application)
//nolint:errcheck
- app.queue.decRunningApps()
-
metrics.GetQueueMetrics(app.queuePath).DecQueueApplicationsRunning()
-
metrics.GetSchedulerMetrics().DecTotalApplicationsRunning()
+ if event.Dst != Running.String() {
+ app := event.Args[0].(*Application)
//nolint:errcheck
+ app.queue.decRunningApps()
+
metrics.GetQueueMetrics(app.queuePath).DecQueueApplicationsRunning()
+
metrics.GetSchedulerMetrics().DecTotalApplicationsRunning()
+ }
},
fmt.Sprintf("enter_%s", Completed.String()): func(_
context.Context, event *fsm.Event) {
app := event.Args[0].(*Application)
//nolint:errcheck
diff --git a/pkg/scheduler/objects/application_state_test.go
b/pkg/scheduler/objects/application_state_test.go
index 7a07f808..7a41fb36 100644
--- a/pkg/scheduler/objects/application_state_test.go
+++ b/pkg/scheduler/objects/application_state_test.go
@@ -74,44 +74,14 @@ func TestRejectStateTransition(t *testing.T) {
assert.Equal(t, app.CurrentState(), Rejected.String())
}
-func TestStartStateTransition(t *testing.T) {
- // starting only from accepted
- appInfo := newApplication("app-00001", "default", "root.a")
- assert.Equal(t, appInfo.CurrentState(), New.String())
- err := appInfo.HandleApplicationEvent(RunApplication)
- assert.NilError(t, err, "no error expected new to accepted (start
test)")
- assert.Equal(t, appInfo.CurrentState(), Accepted.String())
-
- // start app
- err = appInfo.HandleApplicationEvent(RunApplication)
- assert.Assert(t, err, "no error expected new to starting")
- assert.Equal(t, appInfo.CurrentState(), Starting.String())
-
- // starting to rejected: error expected
- err = appInfo.HandleApplicationEvent(RejectApplication)
- assert.Assert(t, err != nil, "error expected starting to rejected")
- assert.Equal(t, appInfo.CurrentState(), Starting.String())
-
- // start to failing
- err = appInfo.HandleApplicationEvent(FailApplication)
- assert.NilError(t, err, "no error expected starting to failing")
- err = common.WaitFor(10*time.Microsecond, time.Millisecond*100,
appInfo.IsFailing)
- assert.NilError(t, err, "App should be in Failing state")
-}
-
func TestRunStateTransition(t *testing.T) {
- // run only from starting
+ // run from new
appInfo := newApplication("app-00001", "default", "root.a")
assert.Equal(t, appInfo.CurrentState(), New.String())
err := appInfo.HandleApplicationEvent(RunApplication)
assert.NilError(t, err, "no error expected new to accepted (run test)")
err = appInfo.HandleApplicationEvent(RunApplication)
- assert.NilError(t, err, "no error expected accepted to starting (run
test)")
- assert.Equal(t, appInfo.CurrentState(), Starting.String())
-
- // run app
- err = appInfo.HandleApplicationEvent(RunApplication)
- assert.NilError(t, err, "no error expected starting to running")
+ assert.NilError(t, err, "no error expected accepted to running (run
test)")
assert.Equal(t, appInfo.CurrentState(), Running.String())
// run app: same state is allowed for running
@@ -143,9 +113,9 @@ func TestCompletedStateTransition(t *testing.T) {
err := appInfo1.HandleApplicationEvent(RunApplication)
assert.NilError(t, err, "no error expected new to accepted (completed
test)")
err = appInfo1.HandleApplicationEvent(RunApplication)
- assert.NilError(t, err, "no error expected accepted to starting
(completed test)")
+ assert.NilError(t, err, "no error expected accepted to running
(completed test)")
err = appInfo1.HandleApplicationEvent(RunApplication)
- assert.NilError(t, err, "no error expected starting to running
(completed test)")
+ assert.NilError(t, err, "no error expected running to running
(completed test)")
assert.Equal(t, appInfo1.CurrentState(), Running.String())
// completed from run through completing
err = appInfo1.HandleApplicationEvent(CompleteApplication)
@@ -197,21 +167,21 @@ func TestCompletingStateTransition(t *testing.T) {
assert.NilError(t, err, "no error expected accepted to completing")
assert.Equal(t, appInfo1.CurrentState(), Completing.String())
- // starting to completing
+ // running to completing
appInfo2 := newApplication("app-00002", "default", "root.a")
assert.Equal(t, appInfo2.CurrentState(), New.String())
err = appInfo2.HandleApplicationEvent(RunApplication)
assert.NilError(t, err, "no error expected new to accepted (completing
test)")
err = appInfo2.HandleApplicationEvent(RunApplication)
- assert.NilError(t, err, "no error expected accepted to starting")
- assert.Equal(t, appInfo2.CurrentState(), Starting.String())
+ assert.NilError(t, err, "no error expected accepted to running")
+ assert.Equal(t, appInfo2.CurrentState(), Running.String())
err = appInfo2.HandleApplicationEvent(CompleteApplication)
- assert.NilError(t, err, "no error expected starting to completing")
+ assert.NilError(t, err, "no error expected running to completing")
assert.Equal(t, appInfo2.CurrentState(), Completing.String())
// completing to run and back again
err = appInfo2.HandleApplicationEvent(RunApplication)
- assert.NilError(t, err, "no error expected starting to running
(completing test)")
+ assert.NilError(t, err, "no error expected completing to running
(completing test)")
err = appInfo2.HandleApplicationEvent(CompleteApplication)
assert.NilError(t, err, "no error expected running to completing")
assert.Equal(t, appInfo2.CurrentState(), Completing.String())
@@ -260,7 +230,7 @@ func TestAppStateTransitionEvents(t *testing.T) {
// completing to run
err = appInfo.HandleApplicationEvent(RunApplication)
- assert.NilError(t, err, "no error expected starting to running
(completing test)")
+ assert.NilError(t, err, "no error expected completing to running
(completing test)")
// run to failing
err = appInfo.HandleApplicationEvent(FailApplication)
@@ -306,15 +276,15 @@ func TestAppStateTransitionEvents(t *testing.T) {
}
// Test to verify metrics after applications state transition
-// app-00001: New -> Resuming -> Accepted -> Starting -> Running ->
Completing-> Completed
-// app-00002: New -> Accepted -> Starting -> Completing -> Running ->
Failing-> Failed
-// app-00003: New -> Accepted -> Starting -> Failing -> Failed
+// app-00001: New -> Resuming -> Accepted -> Running -> Completing-> Completed
+// app-00002: New -> Accepted -> Running -> Completing -> Running -> Failing->
Failed
+// app-00003: New -> Accepted -> Running -> Failing -> Failed
// app-00004: New -> Rejected
// Final metrics will be: 0 running, 3 accepted, 1 completed, 2 failed and 1
rejected applications
func TestAppStateTransitionMetrics(t *testing.T) { //nolint:funlen
queue := createQueue(t, "root.metrics")
metrics.GetSchedulerMetrics().Reset()
- // app-00001: New -> Resuming -> Accepted --> Starting -> Running ->
Completing-> Completed
+ // app-00001: New -> Resuming -> Accepted --> Running -> Completing->
Completed
app := newApplication("app-00001", "default", "root.metrics")
app.SetQueue(queue)
assertState(t, app, nil, New.String())
@@ -345,9 +315,9 @@ func TestAppStateTransitionMetrics(t *testing.T) {
//nolint:funlen
assertQueueApplicationsRejectedMetrics(t, app, 0)
assertQueueApplicationsFailedMetrics(t, app, 0)
assertQueueApplicationsCompletedMetrics(t, app, 0)
- // Accepted -> Starting
+ // Accepted -> Running
err = app.HandleApplicationEvent(RunApplication)
- assertState(t, app, err, Starting.String())
+ assertState(t, app, err, Running.String())
assertTotalAppsRunningMetrics(t, 1)
assertTotalAppsCompletedMetrics(t, 0)
assertTotalAppsRejectedMetrics(t, 0)
@@ -357,7 +327,7 @@ func TestAppStateTransitionMetrics(t *testing.T) {
//nolint:funlen
assertQueueApplicationsRejectedMetrics(t, app, 0)
assertQueueApplicationsFailedMetrics(t, app, 0)
assertQueueApplicationsCompletedMetrics(t, app, 0)
- // Starting -> Running
+ // Running -> Running
err = app.HandleApplicationEvent(RunApplication)
assertState(t, app, err, Running.String())
assertTotalAppsRunningMetrics(t, 1)
@@ -394,17 +364,17 @@ func TestAppStateTransitionMetrics(t *testing.T) {
//nolint:funlen
assertQueueApplicationsFailedMetrics(t, app, 0)
assertQueueApplicationsCompletedMetrics(t, app, 1)
- // app-00002: New -> Accepted -> Starting -> Completing -> Running ->
Failing-> Failed
+ // app-00002: New -> Accepted -> Running -> Completing -> Running ->
Failing-> Failed
app = newApplication("app-00002", "default", "root.metrics")
app.SetQueue(queue)
assertState(t, app, nil, New.String())
// New -> Accepted
err = app.HandleApplicationEvent(RunApplication)
assertState(t, app, err, Accepted.String())
- // Accepted -> Starting
+ // Accepted -> Running
err = app.HandleApplicationEvent(RunApplication)
- assertState(t, app, err, Starting.String())
- // Starting -> Completing
+ assertState(t, app, err, Running.String())
+ // Running -> Completing
err = app.HandleApplicationEvent(CompleteApplication)
assertState(t, app, err, Completing.String())
// Completing -> Running
@@ -426,17 +396,17 @@ func TestAppStateTransitionMetrics(t *testing.T) {
//nolint:funlen
assertQueueApplicationsFailedMetrics(t, app, 1)
assertQueueApplicationsCompletedMetrics(t, app, 1)
- // app-00003: New -> Accepted -> Starting -> Failing -> Failed
+ // app-00003: New -> Accepted -> Running -> Failing -> Failed
app = newApplication("app-00003", "default", "root.metrics")
app.SetQueue(queue)
assertState(t, app, nil, New.String())
// New -> Accepted
err = app.HandleApplicationEvent(RunApplication)
assertState(t, app, err, Accepted.String())
- // Accepted -> Starting
+ // Accepted -> Running
err = app.HandleApplicationEvent(RunApplication)
- assertState(t, app, err, Starting.String())
- // Starting -> Failing
+ assertState(t, app, err, Running.String())
+ // Running -> Failing
err = app.HandleApplicationEvent(FailApplication)
assertState(t, app, err, Failing.String())
// Failing -> Failed
diff --git a/pkg/scheduler/objects/application_test.go
b/pkg/scheduler/objects/application_test.go
index 4ddafa0b..034da766 100644
--- a/pkg/scheduler/objects/application_test.go
+++ b/pkg/scheduler/objects/application_test.go
@@ -750,14 +750,14 @@ func TestStateChangeOnUpdate(t *testing.T) {
// add an alloc
allocInfo := NewAllocation(nodeID1, ask)
app.AddAllocation(allocInfo)
- // app should be starting
- assert.Assert(t, app.IsStarting(), "Application did not return starting
state after alloc: %s", app.CurrentState())
+ // app should be running
+ assert.Assert(t, app.IsRunning(), "Application did not return running
state after alloc: %s", app.CurrentState())
assertUserGroupResource(t, getTestUserGroup(), res)
// removing the ask should not move anywhere as there is an allocation
released = app.RemoveAllocationAsk(askID)
assert.Equal(t, released, 0, "allocation ask should not have been
reserved")
- assert.Assert(t, app.IsStarting(), "Application should have stayed
same, changed unexpectedly: %s", app.CurrentState())
+ assert.Assert(t, app.IsRunning(), "Application should have stayed same,
changed unexpectedly: %s", app.CurrentState())
// remove the allocation, ask has been removed so nothing left
app.RemoveAllocation(askID+"-0",
si.TerminationType_UNKNOWN_TERMINATION_TYPE)
@@ -767,7 +767,7 @@ func TestStateChangeOnUpdate(t *testing.T) {
log := app.GetStateLog()
assert.Equal(t, len(log), 3, "wrong number of app events")
assert.Equal(t, log[0].ApplicationState, Accepted.String())
- assert.Equal(t, log[1].ApplicationState, Starting.String())
+ assert.Equal(t, log[1].ApplicationState, Running.String())
assert.Equal(t, log[2].ApplicationState, Completing.String())
}
@@ -918,7 +918,7 @@ func TestGangAllocChange(t *testing.T) {
app.AddAllocation(alloc)
assert.Assert(t, resources.Equals(app.allocatedPlaceholder, totalPH),
"allocated placeholders resources is not updated correctly: %s",
app.allocatedPlaceholder.String())
assert.Equal(t, len(app.GetAllAllocations()), 2)
- assert.Assert(t, app.IsStarting(), "app should have changed to starting
state")
+ assert.Assert(t, app.IsRunning(), "app should have changed to running
state")
assertUserGroupResource(t, getTestUserGroup(), resources.Multiply(res,
2))
// add a real alloc this should NOT trigger state update
@@ -926,15 +926,15 @@ func TestGangAllocChange(t *testing.T) {
alloc.SetResult(Replaced)
app.AddAllocation(alloc)
assert.Equal(t, len(app.GetAllAllocations()), 3)
- assert.Assert(t, app.IsStarting(), "app should still be in starting
state")
+ assert.Assert(t, app.IsRunning(), "app should still be in running
state")
assertUserGroupResource(t, getTestUserGroup(), resources.Multiply(res,
3))
- // add a second real alloc this should trigger state update
+ // add a second real alloc this should NOT trigger state update
alloc = newAllocation(appID1, nodeID1, res)
alloc.SetResult(Replaced)
app.AddAllocation(alloc)
assert.Equal(t, len(app.GetAllAllocations()), 4)
- assert.Assert(t, app.IsRunning(), "app should be in running state")
+ assert.Assert(t, app.IsRunning(), "app should still be in running
state")
assertUserGroupResource(t, getTestUserGroup(), resources.Multiply(res,
4))
}
@@ -956,7 +956,7 @@ func TestAllocChange(t *testing.T) {
app.AddAllocation(alloc)
assert.Assert(t, resources.Equals(app.allocatedResource, res),
"allocated resources is not updated correctly: %s",
app.allocatedResource.String())
assert.Equal(t, len(app.GetAllAllocations()), 1)
- assert.Assert(t, app.IsStarting(), "app should be in starting state")
+ assert.Assert(t, app.IsRunning(), "app should be in running state")
assertUserGroupResource(t, getTestUserGroup(), res)
// add a second real alloc this should trigger state update
@@ -978,43 +978,6 @@ func TestQueueUpdate(t *testing.T) {
assert.Equal(t, app.GetQueuePath(), "root.test")
}
-func TestStateTimeOut(t *testing.T) {
- startingTimeout = time.Microsecond * 100
- defer func() { startingTimeout = time.Minute * 5 }()
- app := newApplication(appID1, "default", "root.a")
- err := app.handleApplicationEventWithLocking(RunApplication)
- assert.NilError(t, err, "no error expected new to accepted (timeout
test)")
- err = app.handleApplicationEventWithLocking(RunApplication)
- assert.NilError(t, err, "no error expected accepted to starting
(timeout test)")
- // give it some time to run and progress
- time.Sleep(time.Millisecond * 100)
- if app.IsStarting() {
- t.Fatal("Starting state should have timed out")
- }
- if app.stateTimer != nil {
- t.Fatalf("Startup timer has not be cleared on time out as
expected, %v", app.stateTimer)
- }
-
- startingTimeout = time.Millisecond * 100
- app = newApplication(appID1, "default", "root.a")
- err = app.handleApplicationEventWithLocking(RunApplication)
- assert.NilError(t, err, "no error expected new to accepted (timeout
test2)")
- err = app.handleApplicationEventWithLocking(RunApplication)
- assert.NilError(t, err, "no error expected accepted to starting
(timeout test2)")
- // give it some time to run and progress
- time.Sleep(time.Microsecond * 100)
- if !app.IsStarting() || app.stateTimer == nil {
- t.Fatalf("Starting state and timer should not have timed out
yet, state: %s", app.stateMachine.Current())
- }
- err = app.handleApplicationEventWithLocking(RunApplication)
- assert.NilError(t, err, "no error expected starting to run (timeout
test2)")
- // give it some time to run and progress
- time.Sleep(time.Microsecond * 100)
- if !app.stateMachine.Is(Running.String()) || app.stateTimer != nil {
- t.Fatalf("State is not running or timer was not cleared, state:
%s, timer %v", app.stateMachine.Current(), app.stateTimer)
- }
-}
-
func TestCompleted(t *testing.T) {
completingTimeout = time.Millisecond * 100
terminatedTimeout = time.Millisecond * 100
@@ -1498,12 +1461,12 @@ func TestTimeoutPlaceholderAllocReleased(t *testing.T) {
alloc := newAllocation(appID1, nodeID1, res)
app.AddAllocation(alloc)
- assert.Assert(t, app.IsStarting(), "App should be in starting state
after the first allocation")
+ assert.Assert(t, app.IsRunning(), "App should be in running state after
the first allocation")
err = common.WaitFor(10*time.Millisecond, 1*time.Second, func() bool {
return app.getPlaceholderTimer() == nil
})
assert.NilError(t, err, "Placeholder timeout cleanup did not trigger
unexpectedly")
- assert.Assert(t, app.IsStarting(), "App should be in starting state
after the first allocation")
+ assert.Assert(t, app.IsRunning(), "App should be in running state after
the first allocation")
assertUserGroupResource(t, getTestUserGroup(), resources.Multiply(res,
3))
// two state updates and 1 release event
events := testHandler.GetEvents()
diff --git a/pkg/scheduler/objects/queue_test.go
b/pkg/scheduler/objects/queue_test.go
index 6947b358..aacb809c 100644
--- a/pkg/scheduler/objects/queue_test.go
+++ b/pkg/scheduler/objects/queue_test.go
@@ -2590,7 +2590,7 @@ func TestQueueRunningAppsForSingleAllocationApp(t
*testing.T) {
alloc := NewAllocation(nodeID1, ask)
app.AddAllocation(alloc)
- assert.Equal(t, app.CurrentState(), Starting.String(), "app state
should be starting")
+ assert.Equal(t, app.CurrentState(), Running.String(), "app state should
be running")
assert.Equal(t, leaf.runningApps, uint64(1), "leaf should have 1 app
running")
_, err = app.updateAskRepeatInternal(ask, -1)
diff --git a/pkg/scheduler/partition_test.go b/pkg/scheduler/partition_test.go
index f1f0de89..90159e79 100644
--- a/pkg/scheduler/partition_test.go
+++ b/pkg/scheduler/partition_test.go
@@ -3689,7 +3689,7 @@ func TestTryAllocateMaxRunning(t *testing.T) {
assert.Equal(t, alloc.GetReleaseCount(), 0, "released allocations
should have been 0")
assert.Equal(t, alloc.GetApplicationID(), appID1, "expected application
app-1 to be allocated")
assert.Equal(t, alloc.GetAllocationKey(), "alloc-2", "expected ask
alloc-2 to be allocated")
- assert.Equal(t, app.CurrentState(), objects.Starting.String(),
"application should have moved to starting state")
+ assert.Equal(t, app.CurrentState(), objects.Running.String(),
"application should have moved to running state")
// allocation should still fail: max running apps on parent reached
alloc = partition.tryAllocate()
diff --git a/pkg/scheduler/tests/application_tracking_test.go
b/pkg/scheduler/tests/application_tracking_test.go
index 6bfb76bf..d14df297 100644
--- a/pkg/scheduler/tests/application_tracking_test.go
+++ b/pkg/scheduler/tests/application_tracking_test.go
@@ -327,13 +327,13 @@ func verifyAllocationAskAddedEvents(t *testing.T, events
[]*si.EventRecord) {
assert.Equal(t, si.EventRecord_ADD, events[2].EventChangeType)
assert.Equal(t, si.EventRecord_NODE_ALLOC, events[2].EventChangeDetail)
- // state change to Starting
+ // state change to Running
assert.Equal(t, "app-1", events[3].ObjectID)
assert.Equal(t, "", events[3].Message)
assert.Equal(t, "", events[3].ReferenceID)
assert.Equal(t, si.EventRecord_APP, events[3].Type)
assert.Equal(t, si.EventRecord_SET, events[3].EventChangeType)
- assert.Equal(t, si.EventRecord_APP_STARTING,
events[3].EventChangeDetail)
+ assert.Equal(t, si.EventRecord_APP_RUNNING, events[3].EventChangeDetail)
// Track resource usage for the user - increment
assert.Equal(t, "testuser", events[4].ObjectID)
diff --git a/pkg/webservice/handlers.go b/pkg/webservice/handlers.go
index ffea7844..e8d644c3 100644
--- a/pkg/webservice/handlers.go
+++ b/pkg/webservice/handlers.go
@@ -70,7 +70,6 @@ func init() {
allowedAppActiveStatuses["new"] = true
allowedAppActiveStatuses["accepted"] = true
- allowedAppActiveStatuses["starting"] = true
allowedAppActiveStatuses["running"] = true
allowedAppActiveStatuses["completing"] = true
allowedAppActiveStatuses["failing"] = true
diff --git a/pkg/webservice/handlers_test.go b/pkg/webservice/handlers_test.go
index 6b0f8a90..16ac0cc8 100644
--- a/pkg/webservice/handlers_test.go
+++ b/pkg/webservice/handlers_test.go
@@ -948,29 +948,25 @@ func TestPartitions(t *testing.T) {
app1 := addAndConfirmApplicationExists(t, partitionName,
defaultPartition, "app-1")
app1.SetState(objects.Accepted.String())
- // add a new app2 - starting
+ // add a new app2 - running
app2 := addAndConfirmApplicationExists(t, partitionName,
defaultPartition, "app-2")
- app2.SetState(objects.Starting.String())
+ app2.SetState(objects.Running.String())
- // add a new app3 - running
+ // add a new app3 - completing
app3 := addAndConfirmApplicationExists(t, partitionName,
defaultPartition, "app-3")
- app3.SetState(objects.Running.String())
+ app3.SetState(objects.Completing.String())
- // add a new app4 - completing
+ // add a new app4 - rejected
app4 := addAndConfirmApplicationExists(t, partitionName,
defaultPartition, "app-4")
- app4.SetState(objects.Completing.String())
+ app4.SetState(objects.Rejected.String())
- // add a new app5 - rejected
+ // add a new app5 - completed
app5 := addAndConfirmApplicationExists(t, partitionName,
defaultPartition, "app-5")
- app5.SetState(objects.Rejected.String())
-
- // add a new app6 - completed
- app6 := addAndConfirmApplicationExists(t, partitionName,
defaultPartition, "app-6")
- app6.SetState(objects.Completed.String())
+ app5.SetState(objects.Completed.String())
// add a new app6 - failed
- app7 := addAndConfirmApplicationExists(t, partitionName,
defaultPartition, "app-7")
- app7.SetState(objects.Failed.String())
+ app6 := addAndConfirmApplicationExists(t, partitionName,
defaultPartition, "app-6")
+ app6.SetState(objects.Failed.String())
NewWebApp(schedulerContext, nil)
@@ -984,8 +980,8 @@ func TestPartitions(t *testing.T) {
// create test allocations
resAlloc1 :=
resources.NewResourceFromMap(map[string]resources.Quantity{siCommon.Memory:
100, siCommon.CPU: 400})
resAlloc2 :=
resources.NewResourceFromMap(map[string]resources.Quantity{siCommon.Memory:
200, siCommon.CPU: 300})
- ask1 := objects.NewAllocationAsk("alloc-1", app6.ApplicationID,
resAlloc1)
- ask2 := objects.NewAllocationAsk("alloc-2", app3.ApplicationID,
resAlloc2)
+ ask1 := objects.NewAllocationAsk("alloc-1", app5.ApplicationID,
resAlloc1)
+ ask2 := objects.NewAllocationAsk("alloc-2", app2.ApplicationID,
resAlloc2)
allocs := []*objects.Allocation{objects.NewAllocation(node1ID, ask1)}
err = defaultPartition.AddNode(node1, allocs)
assert.NilError(t, err, "add node to partition should not have failed")
@@ -1011,10 +1007,9 @@ func TestPartitions(t *testing.T) {
assert.Equal(t, cs["default"].NodeSortingPolicy.Type, "fair")
assert.Equal(t,
cs["default"].NodeSortingPolicy.ResourceWeights["vcore"], 1.0)
assert.Equal(t,
cs["default"].NodeSortingPolicy.ResourceWeights["memory"], 1.0)
- assert.Equal(t, cs["default"].Applications["total"], 8)
+ assert.Equal(t, cs["default"].Applications["total"], 7)
assert.Equal(t, cs["default"].Applications[objects.New.String()], 1)
assert.Equal(t, cs["default"].Applications[objects.Accepted.String()],
1)
- assert.Equal(t, cs["default"].Applications[objects.Starting.String()],
1)
assert.Equal(t, cs["default"].Applications[objects.Running.String()], 1)
assert.Equal(t,
cs["default"].Applications[objects.Completing.String()], 1)
assert.Equal(t, cs["default"].Applications[objects.Rejected.String()],
1)
@@ -2481,7 +2476,7 @@ func prepareUserAndGroupContext(t *testing.T, config
string) {
// add an alloc
allocInfo := objects.NewAllocation("node-1", ask)
app.AddAllocation(allocInfo)
- assert.Assert(t, app.IsStarting(), "Application did not return starting
state after alloc: %s", app.CurrentState())
+ assert.Assert(t, app.IsRunning(), "Application did not return running
state after alloc: %s", app.CurrentState())
NewWebApp(schedulerContext, nil)
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]