yangwwei commented on a change in pull request #145: URL: https://github.com/apache/incubator-yunikorn-core/pull/145#discussion_r427720947
########## File path: pkg/cache/application_info.go ########## @@ -73,45 +81,117 @@ func (ai *ApplicationInfo) GetAllAllocations() []*AllocationInfo { return allocations } -// Return the current state for the application. +// Return the current state or a checked specific state for the application. // The state machine handles the locking. func (ai *ApplicationInfo) GetApplicationState() string { return ai.stateMachine.Current() } +func (ai *ApplicationInfo) IsStarting() bool { + return ai.stateMachine.Is(Starting.String()) +} + +func (ai *ApplicationInfo) IsAccepted() bool { + return ai.stateMachine.Is(Accepted.String()) +} + +func (ai *ApplicationInfo) IsNew() bool { + return ai.stateMachine.Is(New.String()) +} + +func (ai *ApplicationInfo) IsRunning() bool { + return ai.stateMachine.Is(Running.String()) +} + +func (ai *ApplicationInfo) IsWaiting() bool { + return ai.stateMachine.Is(Waiting.String()) +} + // Handle the state event for the application. // The state machine handles the locking. -func (ai *ApplicationInfo) HandleApplicationEvent(event ApplicationEvent) error { - err := ai.stateMachine.Event(event.String(), ai.ApplicationID) +func (ai *ApplicationInfo) HandleApplicationEvent(event applicationEvent) error { + err := ai.stateMachine.Event(event.String(), ai) // handle the same state transition not nil error (limit of fsm). if err != nil && err.Error() == "no transition" { return nil } return err } +// Set the starting timer to make sure the application will not get stuck in a starting state too long. +// This prevents an app from not progressing to Running when it only has 1 allocation. +// Called when entering the Starting state by the state machine. +func (ai *ApplicationInfo) setStartingTimer() { + ai.Lock() + defer ai.Unlock() + + log.Logger().Debug("Application Starting state timer initiated", + zap.String("appID", ai.ApplicationID), + zap.Duration("timeout", startingTimeout)) + ai.stateTimer = time.AfterFunc(startingTimeout, ai.timeOutStarting) +} + +// Clear the starting timer. If the application has progressed out of the starting state we need to stop the +// timer and clean up. +// Called when leaving the Starting state by the state machine. +func (ai *ApplicationInfo) clearStartingTimer() { + ai.Lock() + defer ai.Unlock() + + ai.stateTimer.Stop() + ai.stateTimer = nil +} + +// In case of state aware scheduling we do not want to get stuck in starting as we might have an application that only +// requires one allocation. +// This will progress the state of the application from Starting to Running +func (ai *ApplicationInfo) timeOutStarting() { + // make sure we are still in the right state + // we could have been killed or something might have happened while waiting for a lock + if ai.IsStarting() { + log.Logger().Warn("Application in starting state timed out: auto progress", + zap.String("applicationID", ai.ApplicationID), + zap.String("state", ai.stateMachine.Current())) + + //nolint: errcheck + _ = ai.HandleApplicationEvent(RunApplication) + } +} + // Return the total allocated resources for the application. func (ai *ApplicationInfo) GetAllocatedResource() *resources.Resource { - ai.lock.RLock() - defer ai.lock.RUnlock() + ai.RLock() + defer ai.RUnlock() return ai.allocatedResource.Clone() } // Set the leaf queue the application runs in. Update the queue name also to match as this might be different from the // queue that was given when submitting the application. func (ai *ApplicationInfo) SetQueue(leaf *QueueInfo) { - ai.lock.Lock() - defer ai.lock.Unlock() + ai.Lock() + defer ai.Unlock() ai.leafQueue = leaf ai.QueueName = leaf.GetQueuePath() } // Add a new allocation to the application func (ai *ApplicationInfo) addAllocation(info *AllocationInfo) { - ai.lock.Lock() - defer ai.lock.Unlock() + // progress the state based on where we are, we cannot fail in this case + // do NOT change the order as that will break the state + // ignore the errors explicitly and marked as nolint + if ai.IsStarting() { + //nolint: errcheck + _ = ai.HandleApplicationEvent(RunApplication) + } + if ai.IsAccepted() { + //nolint: errcheck + _ = ai.HandleApplicationEvent(StartApplication) + } + // add the allocation + ai.Lock() + defer ai.Unlock() Review comment: Do we need these state checks? I think we should leverage the state machine. I think we can define an `AddAllocationEvent`. Then in the state machine, we can have something like the following: ``` Event: AddAllocationEvent src: Accepted dst: Starting Event: AddAllocationEvent src: Starting dst: Runing ``` ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org