[ 
https://issues.apache.org/jira/browse/YUNIKORN-2884?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Qi Zhu updated YUNIKORN-2884:
-----------------------------
    Description: 
We will fail task post allocated, but we don't update the pod to terminal state.

For example we bind pod volume failed post allocated, the pod will not go to 
terminal state, it will fail:


Pod event:
{code:java}
Events:
  Type     Reason                 Age   From      Message
  ----     ------                 ----  ----      -------
  Normal   Scheduling             30s   yunikorn  dev-nnjxy/pod-btv0y is queued 
and waiting for allocation
  Normal   Scheduled              30s   yunikorn  Successfully assigned 
dev-nnjxy/pod-btv0y to node yktest-worker
  Warning  PodVolumesBindFailure  20s   yunikorn  bind volumes to pod failed, 
name: dev-nnjxy/pod-btv0y, binding volumes: context deadline exceeded
  Normal   TaskFailed             20s   yunikorn  Task dev-nnjxy/pod-btv0y is 
failed{code}

Pod pending not going to terminal state






{code:java}
2024-09-20T11:22:27.601Z    INFO    shim.fsm    cache/task_state.go:381    Task 
state transition    {"app": "yunikorn-dev-03c96-autogen", "task": 
"6f3dd7fa-72b4-40cf-a700-43e51394a06b", "taskAlias": "dev-03c96/pod-bgg9h", 
"source": "Scheduling", "destination": "Allocated", "event": "TaskAllocated"}
2024-09-20T11:22:37.606Z    DEBUG    shim.cache.task    cache/task.go:499    
prepare to send release request    {"applicationID": 
"yunikorn-dev-03c96-autogen", "taskID": "6f3dd7fa-72b4-40cf-a700-43e51394a06b", 
"taskAlias": "dev-03c96/pod-bgg9h", "allocationKey": 
"6f3dd7fa-72b4-40cf-a700-43e51394a06b", "task": "Allocated", "terminationType": 
""}
2024-09-20T11:22:37.606Z    DEBUG    core.scheduler    
scheduler/scheduler.go:117    enqueued event    {"eventType": 
"*rmevent.RMUpdateAllocationEvent", "event": 
{"Request":{"releases":{"allocationsToRelease":[{"partitionName":"[mycluster]default","applicationID":"yunikorn-dev-03c96-autogen","terminationType":1,"message":"task
 
completed","allocationKey":"6f3dd7fa-72b4-40cf-a700-43e51394a06b"}]},"rmID":"mycluster"}},
 "currentQueueSize": 0}
2024-09-20T11:22:37.606Z    ERROR    shim.cache.task    cache/task.go:475    
task failed    {"appID": "yunikorn-dev-03c96-autogen", "taskID": 
"6f3dd7fa-72b4-40cf-a700-43e51394a06b", "reason": "bind volumes to pod failed, 
name: dev-03c96/pod-bgg9h, binding volumes: context deadline exceeded"}
2024-09-20T11:22:37.606Z    INFO    shim.fsm    cache/task_state.go:381    Task 
state transition    {"app": "yunikorn-dev-03c96-autogen", "task": 
"6f3dd7fa-72b4-40cf-a700-43e51394a06b", "taskAlias": "dev-03c96/pod-bgg9h", 
"source": "Allocated", "destination": "Failed", "event": "TaskFail"}
2024-09-20T11:22:37.606Z    INFO    core.scheduler.partition    
scheduler/partition.go:1359    removing allocation from application    
{"appID": "yunikorn-dev-03c96-autogen", "allocationKey": 
"6f3dd7fa-72b4-40cf-a700-43e51394a06b", "terminationType": "STOPPED_BY_RM"}
2024-09-20T11:22:37.606Z    DEBUG    core.scheduler.ugm    ugm/manager.go:132   
 Decreasing resource usage    {"user": "kubernetes-admin", "queue path": 
"root.dev-03c96", "application": "yunikorn-dev-03c96-autogen", "resource": 
"map[pods:1]", "removeApp": true}
2024-09-20T11:22:37.606Z    DEBUG    core.scheduler.ugm    ugm/manager.go:152   
 Decreasing resource usage for user    {"user": "kubernetes-admin", "queue 
path": "root.dev-03c96", "application": "yunikorn-dev-03c96-autogen", "group": 
"", "resource": "map[pods:1]", "removeApp": true}
2024-09-20T11:22:37.606Z    DEBUG    core.scheduler.ugm    
ugm/queue_tracker.go:132    Decreasing resource usage    {"queue path": "root", 
"hierarchy": ["root", "dev-03c96"], "application": 
"yunikorn-dev-03c96-autogen", "resource": "map[pods:1]", "removeApp": true}
2024-09-20T11:22:37.607Z    DEBUG    core.scheduler.ugm    
ugm/queue_tracker.go:132    Decreasing resource usage    {"queue path": 
"root.dev-03c96", "hierarchy": ["dev-03c96"], "application": 
"yunikorn-dev-03c96-autogen", "resource": "map[pods:1]", "removeApp": true}
2024-09-20T11:22:37.607Z    DEBUG    core.scheduler.ugm    
ugm/queue_tracker.go:159    Removed application from running applications    
{"application": "yunikorn-dev-03c96-autogen", "queue path": "root.dev-03c96", 
"queue name": "dev-03c96"}
2024-09-20T11:22:37.608Z    DEBUG    core.scheduler.ugm    
ugm/queue_tracker.go:165    Successfully decreased resource usage    {"queue 
path": "root.dev-03c96", "application": "yunikorn-dev-03c96-autogen", 
"resource": "map[pods:1]", "total resource after decreasing": "map[]", "total 
applications after decreasing": 0}
2024-09-20T11:22:37.608Z    DEBUG    core.scheduler.ugm    
ugm/queue_tracker.go:159    Removed application from running applications    
{"application": "yunikorn-dev-03c96-autogen", "queue path": "root", "queue 
name": "root"}
2024-09-20T11:22:37.608Z    DEBUG    core.scheduler.ugm    
ugm/queue_tracker.go:165    Successfully decreased resource usage    {"queue 
path": "root", "application": "yunikorn-dev-03c96-autogen", "resource": 
"map[pods:1]", "total resource after decreasing": "map[]", "total applications 
after decreasing": 0}
2024-09-20T11:22:37.608Z    DEBUG    core.scheduler.application    
objects/application.go:336    Application state timer initiated    {"appID": 
"yunikorn-dev-03c96-autogen", "state": "Completing", "timeout": "30s"}
2024-09-20T11:22:37.608Z    INFO    core.scheduler.fsm    
objects/application_state.go:147    Application state transition    {"appID": 
"yunikorn-dev-03c96-autogen", "source": "Running", "destination": "Completing", 
"event": "completeApplication"}
2024-09-20T11:22:37.608Z    DEBUG    core.rmproxy    rmproxy/rmproxy.go:60    
enqueue event    {"eventType": "*rmevent.RMApplicationUpdateEvent", "event": 
{"RmID":"mycluster","AcceptedApplications":[],"RejectedApplications":[],"UpdatedApplications":[{"applicationID":"yunikorn-dev-03c96-autogen","state":"Completing","stateTransitionTimestamp":1726831357608331511,"message":"completeApplication"}]},
 "currentQueueSize": 0}
2024-09-20T11:22:37.608Z    INFO    core.scheduler.application    
objects/application.go:615    ask removed successfully from application    
{"appID": "yunikorn-dev-03c96-autogen", "ask": 
"6f3dd7fa-72b4-40cf-a700-43e51394a06b", "pendingDelta": "nil resource"}
2024-09-20T11:22:37.608Z    DEBUG    core.rmproxy    rmproxy/rmproxy.go:60    
enqueue event    {"eventType": "*rmevent.RMReleaseAllocationEvent", "event": 
{"RmID":"mycluster","ReleasedAllocations":[{"partitionName":"[mycluster]default","applicationID":"yunikorn-dev-03c96-autogen","terminationType":1,"message":"allocation
 remove as per RM 
request","allocationKey":"6f3dd7fa-72b4-40cf-a700-43e51394a06b"}]}, 
"currentQueueSize": 1}
2024-09-20T11:22:37.608Z    DEBUG    shim.rmcallback    
cache/scheduler_callback.go:108    UpdateApplication callback received    
{"UpdateApplicationResponse": 
"updated:{applicationID:\"yunikorn-dev-03c96-autogen\" state:\"Completing\" 
stateTransitionTimestamp:1726831357608331511 message:\"completeApplication\"}"}
2024-09-20T11:22:37.608Z    DEBUG    shim.rmcallback    
cache/scheduler_callback.go:137    status update callback received    {"appId": 
"yunikorn-dev-03c96-autogen", "new status": "Completing"}
2024-09-20T11:22:37.608Z    DEBUG    shim.rmcallback    
cache/scheduler_callback.go:47    UpdateAllocation callback received    
{"UpdateAllocationResponse": "released:{partitionName:\"[mycluster]default\" 
applicationID:\"yunikorn-dev-03c96-autogen\" terminationType:STOPPED_BY_RM 
message:\"allocation remove as per RM request\" 
allocationKey:\"6f3dd7fa-72b4-40cf-a700-43e51394a06b\"}"}
2024-09-20T11:22:38.605Z    INFO    shim.cache.application    
cache/application.go:239    task removed    {"appID": 
"yunikorn-dev-03c96-autogen", "taskID": "6f3dd7fa-72b4-40cf-a700-43e51394a06b"}
2024-09-20T11:23:07.607Z    DEBUG    core.scheduler.application    
objects/application.go:352    Application state: auto progress    
{"applicationID": "yunikorn-dev-03c96-autogen", "state": "Completing"}
2024-09-20T11:23:07.607Z    DEBUG    core.scheduler.application    
objects/application.go:384    Application state timer cleared    {"appID": 
"yunikorn-dev-03c96-autogen", "state": "Completing"}
2024-09-20T11:23:07.607Z    DEBUG    core.scheduler.application    
objects/application.go:336    Application state timer initiated    {"appID": 
"yunikorn-dev-03c96-autogen", "state": "Completed", "timeout": "72h0m0s"}
2024-09-20T11:23:07.607Z    INFO    core.scheduler.fsm    
objects/application_state.go:147    Application state transition    {"appID": 
"yunikorn-dev-03c96-autogen", "source": "Completing", "destination": 
"Completed", "event": "completeApplication"}
2024-09-20T11:23:07.607Z    DEBUG    core.rmproxy    rmproxy/rmproxy.go:60    
enqueue event    {"eventType": "*rmevent.RMApplicationUpdateEvent", "event": 
{"RmID":"mycluster","AcceptedApplications":[],"RejectedApplications":[],"UpdatedApplications":[{"applicationID":"yunikorn-dev-03c96-autogen","state":"Completed","stateTransitionTimestamp":1726831387607633721,"message":"completeApplication"}]},
 "currentQueueSize": 0}
2024-09-20T11:23:07.607Z    DEBUG    shim.rmcallback    
cache/scheduler_callback.go:108    UpdateApplication callback received    
{"UpdateApplicationResponse": 
"updated:{applicationID:\"yunikorn-dev-03c96-autogen\" state:\"Completed\" 
stateTransitionTimestamp:1726831387607633721 message:\"completeApplication\"}"}
2024-09-20T11:23:07.607Z    DEBUG    shim.rmcallback    
cache/scheduler_callback.go:137    status update callback received    {"appId": 
"yunikorn-dev-03c96-autogen", "new status": "Completed"}
2024-09-20T11:23:07.607Z    INFO    core.scheduler.queue    
objects/queue.go:830    Application completed and removed from queue    
{"queueName": "root.dev-03c96", "applicationID": "yunikorn-dev-03c96-autogen"}
2024-09-20T11:23:07.607Z    INFO    core.scheduler.partition    
scheduler/partition.go:1539    Removing terminated application from the 
application list    {"appID": "yunikorn-dev-03c96-autogen", "app status": 
"Completed"}
2024-09-20T11:23:07.607Z    INFO    core.scheduler.application.usage    
objects/application_summary.go:60    YK_APP_SUMMARY: {ApplicationID: 
yunikorn-dev-03c96-autogen, SubmissionTime: 1726831345581, StartTime: 
1726831347597, FinishTime: 1726831387607, User: kubernetes-admin, Queue: 
root.dev-03c96, State: Completed, RmID: mycluster, ResourceUsage: 
TrackedResource{UNKNOWN:pods=10}, PreemptedResource: TrackedResource{}, 
PlaceholderResource: TrackedResource{}} {code}

  was:
We will fail task post allocated, but we don't update the pod to terminal state.


For example we bind pod volume failed post allocated, the pod will not go to 
terminal state, it will fail:


{code:java}
2024-09-20T11:22:27.601Z    INFO    shim.fsm    cache/task_state.go:381    Task 
state transition    {"app": "yunikorn-dev-03c96-autogen", "task": 
"6f3dd7fa-72b4-40cf-a700-43e51394a06b", "taskAlias": "dev-03c96/pod-bgg9h", 
"source": "Scheduling", "destination": "Allocated", "event": "TaskAllocated"}
2024-09-20T11:22:37.606Z    DEBUG    shim.cache.task    cache/task.go:499    
prepare to send release request    {"applicationID": 
"yunikorn-dev-03c96-autogen", "taskID": "6f3dd7fa-72b4-40cf-a700-43e51394a06b", 
"taskAlias": "dev-03c96/pod-bgg9h", "allocationKey": 
"6f3dd7fa-72b4-40cf-a700-43e51394a06b", "task": "Allocated", "terminationType": 
""}
2024-09-20T11:22:37.606Z    DEBUG    core.scheduler    
scheduler/scheduler.go:117    enqueued event    {"eventType": 
"*rmevent.RMUpdateAllocationEvent", "event": 
{"Request":{"releases":{"allocationsToRelease":[{"partitionName":"[mycluster]default","applicationID":"yunikorn-dev-03c96-autogen","terminationType":1,"message":"task
 
completed","allocationKey":"6f3dd7fa-72b4-40cf-a700-43e51394a06b"}]},"rmID":"mycluster"}},
 "currentQueueSize": 0}
2024-09-20T11:22:37.606Z    ERROR    shim.cache.task    cache/task.go:475    
task failed    {"appID": "yunikorn-dev-03c96-autogen", "taskID": 
"6f3dd7fa-72b4-40cf-a700-43e51394a06b", "reason": "bind volumes to pod failed, 
name: dev-03c96/pod-bgg9h, binding volumes: context deadline exceeded"}
2024-09-20T11:22:37.606Z    INFO    shim.fsm    cache/task_state.go:381    Task 
state transition    {"app": "yunikorn-dev-03c96-autogen", "task": 
"6f3dd7fa-72b4-40cf-a700-43e51394a06b", "taskAlias": "dev-03c96/pod-bgg9h", 
"source": "Allocated", "destination": "Failed", "event": "TaskFail"}
2024-09-20T11:22:37.606Z    INFO    core.scheduler.partition    
scheduler/partition.go:1359    removing allocation from application    
{"appID": "yunikorn-dev-03c96-autogen", "allocationKey": 
"6f3dd7fa-72b4-40cf-a700-43e51394a06b", "terminationType": "STOPPED_BY_RM"}
2024-09-20T11:22:37.606Z    DEBUG    core.scheduler.ugm    ugm/manager.go:132   
 Decreasing resource usage    {"user": "kubernetes-admin", "queue path": 
"root.dev-03c96", "application": "yunikorn-dev-03c96-autogen", "resource": 
"map[pods:1]", "removeApp": true}
2024-09-20T11:22:37.606Z    DEBUG    core.scheduler.ugm    ugm/manager.go:152   
 Decreasing resource usage for user    {"user": "kubernetes-admin", "queue 
path": "root.dev-03c96", "application": "yunikorn-dev-03c96-autogen", "group": 
"", "resource": "map[pods:1]", "removeApp": true}
2024-09-20T11:22:37.606Z    DEBUG    core.scheduler.ugm    
ugm/queue_tracker.go:132    Decreasing resource usage    {"queue path": "root", 
"hierarchy": ["root", "dev-03c96"], "application": 
"yunikorn-dev-03c96-autogen", "resource": "map[pods:1]", "removeApp": true}
2024-09-20T11:22:37.607Z    DEBUG    core.scheduler.ugm    
ugm/queue_tracker.go:132    Decreasing resource usage    {"queue path": 
"root.dev-03c96", "hierarchy": ["dev-03c96"], "application": 
"yunikorn-dev-03c96-autogen", "resource": "map[pods:1]", "removeApp": true}
2024-09-20T11:22:37.607Z    DEBUG    core.scheduler.ugm    
ugm/queue_tracker.go:159    Removed application from running applications    
{"application": "yunikorn-dev-03c96-autogen", "queue path": "root.dev-03c96", 
"queue name": "dev-03c96"}
2024-09-20T11:22:37.608Z    DEBUG    core.scheduler.ugm    
ugm/queue_tracker.go:165    Successfully decreased resource usage    {"queue 
path": "root.dev-03c96", "application": "yunikorn-dev-03c96-autogen", 
"resource": "map[pods:1]", "total resource after decreasing": "map[]", "total 
applications after decreasing": 0}
2024-09-20T11:22:37.608Z    DEBUG    core.scheduler.ugm    
ugm/queue_tracker.go:159    Removed application from running applications    
{"application": "yunikorn-dev-03c96-autogen", "queue path": "root", "queue 
name": "root"}
2024-09-20T11:22:37.608Z    DEBUG    core.scheduler.ugm    
ugm/queue_tracker.go:165    Successfully decreased resource usage    {"queue 
path": "root", "application": "yunikorn-dev-03c96-autogen", "resource": 
"map[pods:1]", "total resource after decreasing": "map[]", "total applications 
after decreasing": 0}
2024-09-20T11:22:37.608Z    DEBUG    core.scheduler.application    
objects/application.go:336    Application state timer initiated    {"appID": 
"yunikorn-dev-03c96-autogen", "state": "Completing", "timeout": "30s"}
2024-09-20T11:22:37.608Z    INFO    core.scheduler.fsm    
objects/application_state.go:147    Application state transition    {"appID": 
"yunikorn-dev-03c96-autogen", "source": "Running", "destination": "Completing", 
"event": "completeApplication"}
2024-09-20T11:22:37.608Z    DEBUG    core.rmproxy    rmproxy/rmproxy.go:60    
enqueue event    {"eventType": "*rmevent.RMApplicationUpdateEvent", "event": 
{"RmID":"mycluster","AcceptedApplications":[],"RejectedApplications":[],"UpdatedApplications":[{"applicationID":"yunikorn-dev-03c96-autogen","state":"Completing","stateTransitionTimestamp":1726831357608331511,"message":"completeApplication"}]},
 "currentQueueSize": 0}
2024-09-20T11:22:37.608Z    INFO    core.scheduler.application    
objects/application.go:615    ask removed successfully from application    
{"appID": "yunikorn-dev-03c96-autogen", "ask": 
"6f3dd7fa-72b4-40cf-a700-43e51394a06b", "pendingDelta": "nil resource"}
2024-09-20T11:22:37.608Z    DEBUG    core.rmproxy    rmproxy/rmproxy.go:60    
enqueue event    {"eventType": "*rmevent.RMReleaseAllocationEvent", "event": 
{"RmID":"mycluster","ReleasedAllocations":[{"partitionName":"[mycluster]default","applicationID":"yunikorn-dev-03c96-autogen","terminationType":1,"message":"allocation
 remove as per RM 
request","allocationKey":"6f3dd7fa-72b4-40cf-a700-43e51394a06b"}]}, 
"currentQueueSize": 1}
2024-09-20T11:22:37.608Z    DEBUG    shim.rmcallback    
cache/scheduler_callback.go:108    UpdateApplication callback received    
{"UpdateApplicationResponse": 
"updated:{applicationID:\"yunikorn-dev-03c96-autogen\" state:\"Completing\" 
stateTransitionTimestamp:1726831357608331511 message:\"completeApplication\"}"}
2024-09-20T11:22:37.608Z    DEBUG    shim.rmcallback    
cache/scheduler_callback.go:137    status update callback received    {"appId": 
"yunikorn-dev-03c96-autogen", "new status": "Completing"}
2024-09-20T11:22:37.608Z    DEBUG    shim.rmcallback    
cache/scheduler_callback.go:47    UpdateAllocation callback received    
{"UpdateAllocationResponse": "released:{partitionName:\"[mycluster]default\" 
applicationID:\"yunikorn-dev-03c96-autogen\" terminationType:STOPPED_BY_RM 
message:\"allocation remove as per RM request\" 
allocationKey:\"6f3dd7fa-72b4-40cf-a700-43e51394a06b\"}"}
2024-09-20T11:22:38.605Z    INFO    shim.cache.application    
cache/application.go:239    task removed    {"appID": 
"yunikorn-dev-03c96-autogen", "taskID": "6f3dd7fa-72b4-40cf-a700-43e51394a06b"}
2024-09-20T11:23:07.607Z    DEBUG    core.scheduler.application    
objects/application.go:352    Application state: auto progress    
{"applicationID": "yunikorn-dev-03c96-autogen", "state": "Completing"}
2024-09-20T11:23:07.607Z    DEBUG    core.scheduler.application    
objects/application.go:384    Application state timer cleared    {"appID": 
"yunikorn-dev-03c96-autogen", "state": "Completing"}
2024-09-20T11:23:07.607Z    DEBUG    core.scheduler.application    
objects/application.go:336    Application state timer initiated    {"appID": 
"yunikorn-dev-03c96-autogen", "state": "Completed", "timeout": "72h0m0s"}
2024-09-20T11:23:07.607Z    INFO    core.scheduler.fsm    
objects/application_state.go:147    Application state transition    {"appID": 
"yunikorn-dev-03c96-autogen", "source": "Completing", "destination": 
"Completed", "event": "completeApplication"}
2024-09-20T11:23:07.607Z    DEBUG    core.rmproxy    rmproxy/rmproxy.go:60    
enqueue event    {"eventType": "*rmevent.RMApplicationUpdateEvent", "event": 
{"RmID":"mycluster","AcceptedApplications":[],"RejectedApplications":[],"UpdatedApplications":[{"applicationID":"yunikorn-dev-03c96-autogen","state":"Completed","stateTransitionTimestamp":1726831387607633721,"message":"completeApplication"}]},
 "currentQueueSize": 0}
2024-09-20T11:23:07.607Z    DEBUG    shim.rmcallback    
cache/scheduler_callback.go:108    UpdateApplication callback received    
{"UpdateApplicationResponse": 
"updated:{applicationID:\"yunikorn-dev-03c96-autogen\" state:\"Completed\" 
stateTransitionTimestamp:1726831387607633721 message:\"completeApplication\"}"}
2024-09-20T11:23:07.607Z    DEBUG    shim.rmcallback    
cache/scheduler_callback.go:137    status update callback received    {"appId": 
"yunikorn-dev-03c96-autogen", "new status": "Completed"}
2024-09-20T11:23:07.607Z    INFO    core.scheduler.queue    
objects/queue.go:830    Application completed and removed from queue    
{"queueName": "root.dev-03c96", "applicationID": "yunikorn-dev-03c96-autogen"}
2024-09-20T11:23:07.607Z    INFO    core.scheduler.partition    
scheduler/partition.go:1539    Removing terminated application from the 
application list    {"appID": "yunikorn-dev-03c96-autogen", "app status": 
"Completed"}
2024-09-20T11:23:07.607Z    INFO    core.scheduler.application.usage    
objects/application_summary.go:60    YK_APP_SUMMARY: {ApplicationID: 
yunikorn-dev-03c96-autogen, SubmissionTime: 1726831345581, StartTime: 
1726831347597, FinishTime: 1726831387607, User: kubernetes-admin, Queue: 
root.dev-03c96, State: Completed, RmID: mycluster, ResourceUsage: 
TrackedResource{UNKNOWN:pods=10}, PreemptedResource: TrackedResource{}, 
PlaceholderResource: TrackedResource{}} {code}


> Task fail with post allocated but the pod will keep pending
> -----------------------------------------------------------
>
>                 Key: YUNIKORN-2884
>                 URL: https://issues.apache.org/jira/browse/YUNIKORN-2884
>             Project: Apache YuniKorn
>          Issue Type: Bug
>          Components: shim - kubernetes
>            Reporter: Qi Zhu
>            Assignee: Qi Zhu
>            Priority: Major
>
> We will fail task post allocated, but we don't update the pod to terminal 
> state.
> For example we bind pod volume failed post allocated, the pod will not go to 
> terminal state, it will fail:
> Pod event:
> {code:java}
> Events:
>   Type     Reason                 Age   From      Message
>   ----     ------                 ----  ----      -------
>   Normal   Scheduling             30s   yunikorn  dev-nnjxy/pod-btv0y is 
> queued and waiting for allocation
>   Normal   Scheduled              30s   yunikorn  Successfully assigned 
> dev-nnjxy/pod-btv0y to node yktest-worker
>   Warning  PodVolumesBindFailure  20s   yunikorn  bind volumes to pod failed, 
> name: dev-nnjxy/pod-btv0y, binding volumes: context deadline exceeded
>   Normal   TaskFailed             20s   yunikorn  Task dev-nnjxy/pod-btv0y is 
> failed{code}
> Pod pending not going to terminal state
> {code:java}
> 2024-09-20T11:22:27.601Z    INFO    shim.fsm    cache/task_state.go:381    
> Task state transition    {"app": "yunikorn-dev-03c96-autogen", "task": 
> "6f3dd7fa-72b4-40cf-a700-43e51394a06b", "taskAlias": "dev-03c96/pod-bgg9h", 
> "source": "Scheduling", "destination": "Allocated", "event": "TaskAllocated"}
> 2024-09-20T11:22:37.606Z    DEBUG    shim.cache.task    cache/task.go:499    
> prepare to send release request    {"applicationID": 
> "yunikorn-dev-03c96-autogen", "taskID": 
> "6f3dd7fa-72b4-40cf-a700-43e51394a06b", "taskAlias": "dev-03c96/pod-bgg9h", 
> "allocationKey": "6f3dd7fa-72b4-40cf-a700-43e51394a06b", "task": "Allocated", 
> "terminationType": ""}
> 2024-09-20T11:22:37.606Z    DEBUG    core.scheduler    
> scheduler/scheduler.go:117    enqueued event    {"eventType": 
> "*rmevent.RMUpdateAllocationEvent", "event": 
> {"Request":{"releases":{"allocationsToRelease":[{"partitionName":"[mycluster]default","applicationID":"yunikorn-dev-03c96-autogen","terminationType":1,"message":"task
>  
> completed","allocationKey":"6f3dd7fa-72b4-40cf-a700-43e51394a06b"}]},"rmID":"mycluster"}},
>  "currentQueueSize": 0}
> 2024-09-20T11:22:37.606Z    ERROR    shim.cache.task    cache/task.go:475    
> task failed    {"appID": "yunikorn-dev-03c96-autogen", "taskID": 
> "6f3dd7fa-72b4-40cf-a700-43e51394a06b", "reason": "bind volumes to pod 
> failed, name: dev-03c96/pod-bgg9h, binding volumes: context deadline 
> exceeded"}
> 2024-09-20T11:22:37.606Z    INFO    shim.fsm    cache/task_state.go:381    
> Task state transition    {"app": "yunikorn-dev-03c96-autogen", "task": 
> "6f3dd7fa-72b4-40cf-a700-43e51394a06b", "taskAlias": "dev-03c96/pod-bgg9h", 
> "source": "Allocated", "destination": "Failed", "event": "TaskFail"}
> 2024-09-20T11:22:37.606Z    INFO    core.scheduler.partition    
> scheduler/partition.go:1359    removing allocation from application    
> {"appID": "yunikorn-dev-03c96-autogen", "allocationKey": 
> "6f3dd7fa-72b4-40cf-a700-43e51394a06b", "terminationType": "STOPPED_BY_RM"}
> 2024-09-20T11:22:37.606Z    DEBUG    core.scheduler.ugm    ugm/manager.go:132 
>    Decreasing resource usage    {"user": "kubernetes-admin", "queue path": 
> "root.dev-03c96", "application": "yunikorn-dev-03c96-autogen", "resource": 
> "map[pods:1]", "removeApp": true}
> 2024-09-20T11:22:37.606Z    DEBUG    core.scheduler.ugm    ugm/manager.go:152 
>    Decreasing resource usage for user    {"user": "kubernetes-admin", "queue 
> path": "root.dev-03c96", "application": "yunikorn-dev-03c96-autogen", 
> "group": "", "resource": "map[pods:1]", "removeApp": true}
> 2024-09-20T11:22:37.606Z    DEBUG    core.scheduler.ugm    
> ugm/queue_tracker.go:132    Decreasing resource usage    {"queue path": 
> "root", "hierarchy": ["root", "dev-03c96"], "application": 
> "yunikorn-dev-03c96-autogen", "resource": "map[pods:1]", "removeApp": true}
> 2024-09-20T11:22:37.607Z    DEBUG    core.scheduler.ugm    
> ugm/queue_tracker.go:132    Decreasing resource usage    {"queue path": 
> "root.dev-03c96", "hierarchy": ["dev-03c96"], "application": 
> "yunikorn-dev-03c96-autogen", "resource": "map[pods:1]", "removeApp": true}
> 2024-09-20T11:22:37.607Z    DEBUG    core.scheduler.ugm    
> ugm/queue_tracker.go:159    Removed application from running applications    
> {"application": "yunikorn-dev-03c96-autogen", "queue path": "root.dev-03c96", 
> "queue name": "dev-03c96"}
> 2024-09-20T11:22:37.608Z    DEBUG    core.scheduler.ugm    
> ugm/queue_tracker.go:165    Successfully decreased resource usage    {"queue 
> path": "root.dev-03c96", "application": "yunikorn-dev-03c96-autogen", 
> "resource": "map[pods:1]", "total resource after decreasing": "map[]", "total 
> applications after decreasing": 0}
> 2024-09-20T11:22:37.608Z    DEBUG    core.scheduler.ugm    
> ugm/queue_tracker.go:159    Removed application from running applications    
> {"application": "yunikorn-dev-03c96-autogen", "queue path": "root", "queue 
> name": "root"}
> 2024-09-20T11:22:37.608Z    DEBUG    core.scheduler.ugm    
> ugm/queue_tracker.go:165    Successfully decreased resource usage    {"queue 
> path": "root", "application": "yunikorn-dev-03c96-autogen", "resource": 
> "map[pods:1]", "total resource after decreasing": "map[]", "total 
> applications after decreasing": 0}
> 2024-09-20T11:22:37.608Z    DEBUG    core.scheduler.application    
> objects/application.go:336    Application state timer initiated    {"appID": 
> "yunikorn-dev-03c96-autogen", "state": "Completing", "timeout": "30s"}
> 2024-09-20T11:22:37.608Z    INFO    core.scheduler.fsm    
> objects/application_state.go:147    Application state transition    {"appID": 
> "yunikorn-dev-03c96-autogen", "source": "Running", "destination": 
> "Completing", "event": "completeApplication"}
> 2024-09-20T11:22:37.608Z    DEBUG    core.rmproxy    rmproxy/rmproxy.go:60    
> enqueue event    {"eventType": "*rmevent.RMApplicationUpdateEvent", "event": 
> {"RmID":"mycluster","AcceptedApplications":[],"RejectedApplications":[],"UpdatedApplications":[{"applicationID":"yunikorn-dev-03c96-autogen","state":"Completing","stateTransitionTimestamp":1726831357608331511,"message":"completeApplication"}]},
>  "currentQueueSize": 0}
> 2024-09-20T11:22:37.608Z    INFO    core.scheduler.application    
> objects/application.go:615    ask removed successfully from application    
> {"appID": "yunikorn-dev-03c96-autogen", "ask": 
> "6f3dd7fa-72b4-40cf-a700-43e51394a06b", "pendingDelta": "nil resource"}
> 2024-09-20T11:22:37.608Z    DEBUG    core.rmproxy    rmproxy/rmproxy.go:60    
> enqueue event    {"eventType": "*rmevent.RMReleaseAllocationEvent", "event": 
> {"RmID":"mycluster","ReleasedAllocations":[{"partitionName":"[mycluster]default","applicationID":"yunikorn-dev-03c96-autogen","terminationType":1,"message":"allocation
>  remove as per RM 
> request","allocationKey":"6f3dd7fa-72b4-40cf-a700-43e51394a06b"}]}, 
> "currentQueueSize": 1}
> 2024-09-20T11:22:37.608Z    DEBUG    shim.rmcallback    
> cache/scheduler_callback.go:108    UpdateApplication callback received    
> {"UpdateApplicationResponse": 
> "updated:{applicationID:\"yunikorn-dev-03c96-autogen\" state:\"Completing\" 
> stateTransitionTimestamp:1726831357608331511 
> message:\"completeApplication\"}"}
> 2024-09-20T11:22:37.608Z    DEBUG    shim.rmcallback    
> cache/scheduler_callback.go:137    status update callback received    
> {"appId": "yunikorn-dev-03c96-autogen", "new status": "Completing"}
> 2024-09-20T11:22:37.608Z    DEBUG    shim.rmcallback    
> cache/scheduler_callback.go:47    UpdateAllocation callback received    
> {"UpdateAllocationResponse": "released:{partitionName:\"[mycluster]default\" 
> applicationID:\"yunikorn-dev-03c96-autogen\" terminationType:STOPPED_BY_RM 
> message:\"allocation remove as per RM request\" 
> allocationKey:\"6f3dd7fa-72b4-40cf-a700-43e51394a06b\"}"}
> 2024-09-20T11:22:38.605Z    INFO    shim.cache.application    
> cache/application.go:239    task removed    {"appID": 
> "yunikorn-dev-03c96-autogen", "taskID": 
> "6f3dd7fa-72b4-40cf-a700-43e51394a06b"}
> 2024-09-20T11:23:07.607Z    DEBUG    core.scheduler.application    
> objects/application.go:352    Application state: auto progress    
> {"applicationID": "yunikorn-dev-03c96-autogen", "state": "Completing"}
> 2024-09-20T11:23:07.607Z    DEBUG    core.scheduler.application    
> objects/application.go:384    Application state timer cleared    {"appID": 
> "yunikorn-dev-03c96-autogen", "state": "Completing"}
> 2024-09-20T11:23:07.607Z    DEBUG    core.scheduler.application    
> objects/application.go:336    Application state timer initiated    {"appID": 
> "yunikorn-dev-03c96-autogen", "state": "Completed", "timeout": "72h0m0s"}
> 2024-09-20T11:23:07.607Z    INFO    core.scheduler.fsm    
> objects/application_state.go:147    Application state transition    {"appID": 
> "yunikorn-dev-03c96-autogen", "source": "Completing", "destination": 
> "Completed", "event": "completeApplication"}
> 2024-09-20T11:23:07.607Z    DEBUG    core.rmproxy    rmproxy/rmproxy.go:60    
> enqueue event    {"eventType": "*rmevent.RMApplicationUpdateEvent", "event": 
> {"RmID":"mycluster","AcceptedApplications":[],"RejectedApplications":[],"UpdatedApplications":[{"applicationID":"yunikorn-dev-03c96-autogen","state":"Completed","stateTransitionTimestamp":1726831387607633721,"message":"completeApplication"}]},
>  "currentQueueSize": 0}
> 2024-09-20T11:23:07.607Z    DEBUG    shim.rmcallback    
> cache/scheduler_callback.go:108    UpdateApplication callback received    
> {"UpdateApplicationResponse": 
> "updated:{applicationID:\"yunikorn-dev-03c96-autogen\" state:\"Completed\" 
> stateTransitionTimestamp:1726831387607633721 
> message:\"completeApplication\"}"}
> 2024-09-20T11:23:07.607Z    DEBUG    shim.rmcallback    
> cache/scheduler_callback.go:137    status update callback received    
> {"appId": "yunikorn-dev-03c96-autogen", "new status": "Completed"}
> 2024-09-20T11:23:07.607Z    INFO    core.scheduler.queue    
> objects/queue.go:830    Application completed and removed from queue    
> {"queueName": "root.dev-03c96", "applicationID": "yunikorn-dev-03c96-autogen"}
> 2024-09-20T11:23:07.607Z    INFO    core.scheduler.partition    
> scheduler/partition.go:1539    Removing terminated application from the 
> application list    {"appID": "yunikorn-dev-03c96-autogen", "app status": 
> "Completed"}
> 2024-09-20T11:23:07.607Z    INFO    core.scheduler.application.usage    
> objects/application_summary.go:60    YK_APP_SUMMARY: {ApplicationID: 
> yunikorn-dev-03c96-autogen, SubmissionTime: 1726831345581, StartTime: 
> 1726831347597, FinishTime: 1726831387607, User: kubernetes-admin, Queue: 
> root.dev-03c96, State: Completed, RmID: mycluster, ResourceUsage: 
> TrackedResource{UNKNOWN:pods=10}, PreemptedResource: TrackedResource{}, 
> PlaceholderResource: TrackedResource{}} {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to