Qi Zhu created YUNIKORN-2884:
--------------------------------

             Summary: Task fail with post allocated but the pod will keep 
pending
                 Key: YUNIKORN-2884
                 URL: https://issues.apache.org/jira/browse/YUNIKORN-2884
             Project: Apache YuniKorn
          Issue Type: Bug
          Components: shim - kubernetes
            Reporter: Qi Zhu


We will fail task post allocated, but we don't update the pod to terminal state.


For example we bind pod volume failed post allocated, the pod will not go to 
terminal state, it will fail:


{code:java}
2024-09-20T11:22:27.601Z    INFO    shim.fsm    cache/task_state.go:381    Task 
state transition    {"app": "yunikorn-dev-03c96-autogen", "task": 
"6f3dd7fa-72b4-40cf-a700-43e51394a06b", "taskAlias": "dev-03c96/pod-bgg9h", 
"source": "Scheduling", "destination": "Allocated", "event": "TaskAllocated"}
2024-09-20T11:22:37.606Z    DEBUG    shim.cache.task    cache/task.go:499    
prepare to send release request    {"applicationID": 
"yunikorn-dev-03c96-autogen", "taskID": "6f3dd7fa-72b4-40cf-a700-43e51394a06b", 
"taskAlias": "dev-03c96/pod-bgg9h", "allocationKey": 
"6f3dd7fa-72b4-40cf-a700-43e51394a06b", "task": "Allocated", "terminationType": 
""}
2024-09-20T11:22:37.606Z    DEBUG    core.scheduler    
scheduler/scheduler.go:117    enqueued event    {"eventType": 
"*rmevent.RMUpdateAllocationEvent", "event": 
{"Request":{"releases":{"allocationsToRelease":[{"partitionName":"[mycluster]default","applicationID":"yunikorn-dev-03c96-autogen","terminationType":1,"message":"task
 
completed","allocationKey":"6f3dd7fa-72b4-40cf-a700-43e51394a06b"}]},"rmID":"mycluster"}},
 "currentQueueSize": 0}
2024-09-20T11:22:37.606Z    ERROR    shim.cache.task    cache/task.go:475    
task failed    {"appID": "yunikorn-dev-03c96-autogen", "taskID": 
"6f3dd7fa-72b4-40cf-a700-43e51394a06b", "reason": "bind volumes to pod failed, 
name: dev-03c96/pod-bgg9h, binding volumes: context deadline exceeded"}
2024-09-20T11:22:37.606Z    INFO    shim.fsm    cache/task_state.go:381    Task 
state transition    {"app": "yunikorn-dev-03c96-autogen", "task": 
"6f3dd7fa-72b4-40cf-a700-43e51394a06b", "taskAlias": "dev-03c96/pod-bgg9h", 
"source": "Allocated", "destination": "Failed", "event": "TaskFail"}
2024-09-20T11:22:37.606Z    INFO    core.scheduler.partition    
scheduler/partition.go:1359    removing allocation from application    
{"appID": "yunikorn-dev-03c96-autogen", "allocationKey": 
"6f3dd7fa-72b4-40cf-a700-43e51394a06b", "terminationType": "STOPPED_BY_RM"}
2024-09-20T11:22:37.606Z    DEBUG    core.scheduler.ugm    ugm/manager.go:132   
 Decreasing resource usage    {"user": "kubernetes-admin", "queue path": 
"root.dev-03c96", "application": "yunikorn-dev-03c96-autogen", "resource": 
"map[pods:1]", "removeApp": true}
2024-09-20T11:22:37.606Z    DEBUG    core.scheduler.ugm    ugm/manager.go:152   
 Decreasing resource usage for user    {"user": "kubernetes-admin", "queue 
path": "root.dev-03c96", "application": "yunikorn-dev-03c96-autogen", "group": 
"", "resource": "map[pods:1]", "removeApp": true}
2024-09-20T11:22:37.606Z    DEBUG    core.scheduler.ugm    
ugm/queue_tracker.go:132    Decreasing resource usage    {"queue path": "root", 
"hierarchy": ["root", "dev-03c96"], "application": 
"yunikorn-dev-03c96-autogen", "resource": "map[pods:1]", "removeApp": true}
2024-09-20T11:22:37.607Z    DEBUG    core.scheduler.ugm    
ugm/queue_tracker.go:132    Decreasing resource usage    {"queue path": 
"root.dev-03c96", "hierarchy": ["dev-03c96"], "application": 
"yunikorn-dev-03c96-autogen", "resource": "map[pods:1]", "removeApp": true}
2024-09-20T11:22:37.607Z    DEBUG    core.scheduler.ugm    
ugm/queue_tracker.go:159    Removed application from running applications    
{"application": "yunikorn-dev-03c96-autogen", "queue path": "root.dev-03c96", 
"queue name": "dev-03c96"}
2024-09-20T11:22:37.608Z    DEBUG    core.scheduler.ugm    
ugm/queue_tracker.go:165    Successfully decreased resource usage    {"queue 
path": "root.dev-03c96", "application": "yunikorn-dev-03c96-autogen", 
"resource": "map[pods:1]", "total resource after decreasing": "map[]", "total 
applications after decreasing": 0}
2024-09-20T11:22:37.608Z    DEBUG    core.scheduler.ugm    
ugm/queue_tracker.go:159    Removed application from running applications    
{"application": "yunikorn-dev-03c96-autogen", "queue path": "root", "queue 
name": "root"}
2024-09-20T11:22:37.608Z    DEBUG    core.scheduler.ugm    
ugm/queue_tracker.go:165    Successfully decreased resource usage    {"queue 
path": "root", "application": "yunikorn-dev-03c96-autogen", "resource": 
"map[pods:1]", "total resource after decreasing": "map[]", "total applications 
after decreasing": 0}
2024-09-20T11:22:37.608Z    DEBUG    core.scheduler.application    
objects/application.go:336    Application state timer initiated    {"appID": 
"yunikorn-dev-03c96-autogen", "state": "Completing", "timeout": "30s"}
2024-09-20T11:22:37.608Z    INFO    core.scheduler.fsm    
objects/application_state.go:147    Application state transition    {"appID": 
"yunikorn-dev-03c96-autogen", "source": "Running", "destination": "Completing", 
"event": "completeApplication"}
2024-09-20T11:22:37.608Z    DEBUG    core.rmproxy    rmproxy/rmproxy.go:60    
enqueue event    {"eventType": "*rmevent.RMApplicationUpdateEvent", "event": 
{"RmID":"mycluster","AcceptedApplications":[],"RejectedApplications":[],"UpdatedApplications":[{"applicationID":"yunikorn-dev-03c96-autogen","state":"Completing","stateTransitionTimestamp":1726831357608331511,"message":"completeApplication"}]},
 "currentQueueSize": 0}
2024-09-20T11:22:37.608Z    INFO    core.scheduler.application    
objects/application.go:615    ask removed successfully from application    
{"appID": "yunikorn-dev-03c96-autogen", "ask": 
"6f3dd7fa-72b4-40cf-a700-43e51394a06b", "pendingDelta": "nil resource"}
2024-09-20T11:22:37.608Z    DEBUG    core.rmproxy    rmproxy/rmproxy.go:60    
enqueue event    {"eventType": "*rmevent.RMReleaseAllocationEvent", "event": 
{"RmID":"mycluster","ReleasedAllocations":[{"partitionName":"[mycluster]default","applicationID":"yunikorn-dev-03c96-autogen","terminationType":1,"message":"allocation
 remove as per RM 
request","allocationKey":"6f3dd7fa-72b4-40cf-a700-43e51394a06b"}]}, 
"currentQueueSize": 1}
2024-09-20T11:22:37.608Z    DEBUG    shim.rmcallback    
cache/scheduler_callback.go:108    UpdateApplication callback received    
{"UpdateApplicationResponse": 
"updated:{applicationID:\"yunikorn-dev-03c96-autogen\" state:\"Completing\" 
stateTransitionTimestamp:1726831357608331511 message:\"completeApplication\"}"}
2024-09-20T11:22:37.608Z    DEBUG    shim.rmcallback    
cache/scheduler_callback.go:137    status update callback received    {"appId": 
"yunikorn-dev-03c96-autogen", "new status": "Completing"}
2024-09-20T11:22:37.608Z    DEBUG    shim.rmcallback    
cache/scheduler_callback.go:47    UpdateAllocation callback received    
{"UpdateAllocationResponse": "released:{partitionName:\"[mycluster]default\" 
applicationID:\"yunikorn-dev-03c96-autogen\" terminationType:STOPPED_BY_RM 
message:\"allocation remove as per RM request\" 
allocationKey:\"6f3dd7fa-72b4-40cf-a700-43e51394a06b\"}"}
2024-09-20T11:22:38.605Z    INFO    shim.cache.application    
cache/application.go:239    task removed    {"appID": 
"yunikorn-dev-03c96-autogen", "taskID": "6f3dd7fa-72b4-40cf-a700-43e51394a06b"}
2024-09-20T11:23:07.607Z    DEBUG    core.scheduler.application    
objects/application.go:352    Application state: auto progress    
{"applicationID": "yunikorn-dev-03c96-autogen", "state": "Completing"}
2024-09-20T11:23:07.607Z    DEBUG    core.scheduler.application    
objects/application.go:384    Application state timer cleared    {"appID": 
"yunikorn-dev-03c96-autogen", "state": "Completing"}
2024-09-20T11:23:07.607Z    DEBUG    core.scheduler.application    
objects/application.go:336    Application state timer initiated    {"appID": 
"yunikorn-dev-03c96-autogen", "state": "Completed", "timeout": "72h0m0s"}
2024-09-20T11:23:07.607Z    INFO    core.scheduler.fsm    
objects/application_state.go:147    Application state transition    {"appID": 
"yunikorn-dev-03c96-autogen", "source": "Completing", "destination": 
"Completed", "event": "completeApplication"}
2024-09-20T11:23:07.607Z    DEBUG    core.rmproxy    rmproxy/rmproxy.go:60    
enqueue event    {"eventType": "*rmevent.RMApplicationUpdateEvent", "event": 
{"RmID":"mycluster","AcceptedApplications":[],"RejectedApplications":[],"UpdatedApplications":[{"applicationID":"yunikorn-dev-03c96-autogen","state":"Completed","stateTransitionTimestamp":1726831387607633721,"message":"completeApplication"}]},
 "currentQueueSize": 0}
2024-09-20T11:23:07.607Z    DEBUG    shim.rmcallback    
cache/scheduler_callback.go:108    UpdateApplication callback received    
{"UpdateApplicationResponse": 
"updated:{applicationID:\"yunikorn-dev-03c96-autogen\" state:\"Completed\" 
stateTransitionTimestamp:1726831387607633721 message:\"completeApplication\"}"}
2024-09-20T11:23:07.607Z    DEBUG    shim.rmcallback    
cache/scheduler_callback.go:137    status update callback received    {"appId": 
"yunikorn-dev-03c96-autogen", "new status": "Completed"}
2024-09-20T11:23:07.607Z    INFO    core.scheduler.queue    
objects/queue.go:830    Application completed and removed from queue    
{"queueName": "root.dev-03c96", "applicationID": "yunikorn-dev-03c96-autogen"}
2024-09-20T11:23:07.607Z    INFO    core.scheduler.partition    
scheduler/partition.go:1539    Removing terminated application from the 
application list    {"appID": "yunikorn-dev-03c96-autogen", "app status": 
"Completed"}
2024-09-20T11:23:07.607Z    INFO    core.scheduler.application.usage    
objects/application_summary.go:60    YK_APP_SUMMARY: {ApplicationID: 
yunikorn-dev-03c96-autogen, SubmissionTime: 1726831345581, StartTime: 
1726831347597, FinishTime: 1726831387607, User: kubernetes-admin, Queue: 
root.dev-03c96, State: Completed, RmID: mycluster, ResourceUsage: 
TrackedResource{UNKNOWN:pods=10}, PreemptedResource: TrackedResource{}, 
PlaceholderResource: TrackedResource{}} {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to