[ 
https://issues.apache.org/jira/browse/YUNIKORN-3239?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Wilfred Spiegelenburg updated YUNIKORN-3239:
--------------------------------------------
    Description: 
We are running yunikorn-core + yunikorn k8s shim. We observe long delays in 
yunikorn processing node update events. 
As shown in this stack trace, the shim has been waiting for more than 53 minutes 
to get an ack from the core for a node update event. We have also attached the 
collected stack trace.
{code:java}
goroutine 2976 [sync.WaitGroup.Wait, 53 minutes]:
sync.runtime_SemacquireWaitGroup(0xc000056a20?)
        runtime/sema.go:110 +0x25
sync.(*WaitGroup).Wait(0xc000056a20?)
        sync/waitgroup.go:118 +0x48
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).registerNodesInternal(0xc00086f830,
 {0xc00039ce48, 0x1, 0x1}, 0xc09be3bd40)
        github.com/apache/yunikorn-k8shim/pkg/cache/context.go:1605 +0x58c
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).registerNodes(0xc00086f830,
 {0xc082d6bad0, 0x1, 0xc000280690?})
        github.com/apache/yunikorn-k8shim/pkg/cache/context.go:1538 +0x4c5
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).registerNode(0xc000280690?,
 0xc006e99b08)
        github.com/apache/yunikorn-k8shim/pkg/cache/context.go:1499 +0x2e
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).updateNodeInternal(0xc00086f830,
 0xc006e99b08, 0x1)
        github.com/apache/yunikorn-k8shim/pkg/cache/context.go:187 +0x79
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).updateNode(0xc00086f830, 
{0xc007aea000?, 0xc016770c08?}, {0x2509680, 0xc006e99b08})
        github.com/apache/yunikorn-k8shim/pkg/cache/context.go:176 +0x291
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).addNode(...)
        github.com/apache/yunikorn-k8shim/pkg/cache/context.go:165
k8s.io/client-go/tools/cache.ResourceEventHandlerFuncs.OnAdd(...)
        k8s.io/[email protected]/tools/cache/controller.go:246
k8s.io/client-go/tools/cache.(*processorListener).run.func1()
        k8s.io/[email protected]/tools/cache/shared_informer.go:978 +0xa9
k8s.io/apimachinery/pkg/util/wait.BackoffUntil.func1(0xc000a80008?)
        k8s.io/[email protected]/pkg/util/wait/backoff.go:226 +0x33
k8s.io/apimachinery/pkg/util/wait.BackoffUntil(0xc082d6bf70, {0x28a6800, 
0xc0409f8120}, 0x1, 0xc01434b340)
        k8s.io/[email protected]/pkg/util/wait/backoff.go:227 +0xaf
k8s.io/apimachinery/pkg/util/wait.JitterUntil(0xc010b79770, 0x3b9aca00, 0x0, 
0x1, 0xc01434b340)
        k8s.io/[email protected]/pkg/util/wait/backoff.go:204 +0x7f
k8s.io/apimachinery/pkg/util/wait.Until(...)
        k8s.io/[email protected]/pkg/util/wait/backoff.go:161
k8s.io/client-go/tools/cache.(*processorListener).run(0xc01b59afc0)
        k8s.io/[email protected]/tools/cache/shared_informer.go:972 +0x5a
k8s.io/apimachinery/pkg/util/wait.(*Group).Start.func1()
        k8s.io/[email protected]/pkg/util/wait/wait.go:72 +0x4c
created by k8s.io/apimachinery/pkg/util/wait.(*Group).Start in goroutine 1
        k8s.io/[email protected]/pkg/util/wait/wait.go:70 +0x73 {code}

  was:
We are running yunikorn-core + yunikorn k8s shim. We observe long delays in 
yunikorn processing node update events. 
As shown in this stack trace, shim is waiting for more than 53 min to get an 
ack from core for a node update event. Also have attached the stack trace 
collected

goroutine 2976 [sync.WaitGroup.Wait, 53 minutes]: 
sync.runtime_SemacquireWaitGroup(0xc000056a20?) runtime/sema.go:110 +0x25 
sync.(*WaitGroup).Wait(0xc000056a20?) sync/waitgroup.go:118 +0x48 
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).registerNodesInternal(0xc00086f830,
 \{0xc00039ce48, 0x1, 0x1}, 0xc09be3bd40) 
github.com/apache/yunikorn-k8shim/pkg/cache/context.go:1605 +0x58c 
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).registerNodes(0xc00086f830,
 \{0xc082d6bad0, 0x1, 0xc000280690?}) 
github.com/apache/yunikorn-k8shim/pkg/cache/context.go:1538 +0x4c5 
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).registerNode(0xc000280690?,
 0xc006e99b08) github.com/apache/yunikorn-k8shim/pkg/cache/context.go:1499 
+0x2e 
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).updateNodeInternal(0xc00086f830,
 0xc006e99b08, 0x1) github.com/apache/yunikorn-k8shim/pkg/cache/context.go:187 
+0x79 
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).updateNode(0xc00086f830, 
\{0xc007aea000?, 0xc016770c08?}, \{0x2509680, 0xc006e99b08}) 
github.com/apache/yunikorn-k8shim/pkg/cache/context.go:176 +0x291 
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).addNode(...) 
github.com/apache/yunikorn-k8shim/pkg/cache/context.go:165 
k8s.io/client-go/tools/cache.ResourceEventHandlerFuncs.OnAdd(...) 
k8s.io/[email protected]/tools/cache/controller.go:246 
k8s.io/client-go/tools/cache.(*processorListener).run.func1() 
k8s.io/[email protected]/tools/cache/shared_informer.go:978 +0xa9 
k8s.io/apimachinery/pkg/util/wait.BackoffUntil.func1(0xc000a80008?) 
k8s.io/[email protected]/pkg/util/wait/backoff.go:226 +0x33 
k8s.io/apimachinery/pkg/util/wait.BackoffUntil(0xc082d6bf70, \{0x28a6800, 
0xc0409f8120}, 0x1, 0xc01434b340) 
k8s.io/[email protected]/pkg/util/wait/backoff.go:227 +0xaf 
k8s.io/apimachinery/pkg/util/wait.JitterUntil(0xc010b79770, 0x3b9aca00, 0x0, 
0x1, 0xc01434b340) k8s.io/[email protected]/pkg/util/wait/backoff.go:204 
+0x7f k8s.io/apimachinery/pkg/util/wait.Until(...) 
k8s.io/[email protected]/pkg/util/wait/backoff.go:161 
k8s.io/client-go/tools/cache.(*processorListener).run(0xc01b59afc0) 
k8s.io/[email protected]/tools/cache/shared_informer.go:972 +0x5a 
k8s.io/apimachinery/pkg/util/wait.(*Group).Start.func1() 
k8s.io/[email protected]/pkg/util/wait/wait.go:72 +0x4c created by 
k8s.io/apimachinery/pkg/util/wait.(*Group).Start in goroutine 1 
k8s.io/[email protected]/pkg/util/wait/wait.go:70 +0x73


> Lag in processing register node event in yunikorn core
> ------------------------------------------------------
>
>                 Key: YUNIKORN-3239
>                 URL: https://issues.apache.org/jira/browse/YUNIKORN-3239
>             Project: Apache YuniKorn
>          Issue Type: Bug
>          Components: core - scheduler
>            Reporter: Ashwin Shroff
>            Assignee: Ashwin Shroff
>            Priority: Major
>         Attachments: stack-trace.json
>
>
> We are running yunikorn-core + yunikorn k8s shim. We observe long delays in 
> yunikorn processing node update events. 
> As shown in this stack trace, the shim has been waiting for more than 53 minutes 
> to get an ack from the core for a node update event. We have also attached the 
> collected stack trace.
> {code:java}
> goroutine 2976 [sync.WaitGroup.Wait, 53 minutes]:
> sync.runtime_SemacquireWaitGroup(0xc000056a20?)
>         runtime/sema.go:110 +0x25
> sync.(*WaitGroup).Wait(0xc000056a20?)
>         sync/waitgroup.go:118 +0x48
> github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).registerNodesInternal(0xc00086f830,
>  {0xc00039ce48, 0x1, 0x1}, 0xc09be3bd40)
>         github.com/apache/yunikorn-k8shim/pkg/cache/context.go:1605 +0x58c
> github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).registerNodes(0xc00086f830,
>  {0xc082d6bad0, 0x1, 0xc000280690?})
>         github.com/apache/yunikorn-k8shim/pkg/cache/context.go:1538 +0x4c5
> github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).registerNode(0xc000280690?,
>  0xc006e99b08)
>         github.com/apache/yunikorn-k8shim/pkg/cache/context.go:1499 +0x2e
> github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).updateNodeInternal(0xc00086f830,
>  0xc006e99b08, 0x1)
>         github.com/apache/yunikorn-k8shim/pkg/cache/context.go:187 +0x79
> github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).updateNode(0xc00086f830,
>  {0xc007aea000?, 0xc016770c08?}, {0x2509680, 0xc006e99b08})
>         github.com/apache/yunikorn-k8shim/pkg/cache/context.go:176 +0x291
> github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).addNode(...)
>         github.com/apache/yunikorn-k8shim/pkg/cache/context.go:165
> k8s.io/client-go/tools/cache.ResourceEventHandlerFuncs.OnAdd(...)
>         k8s.io/[email protected]/tools/cache/controller.go:246
> k8s.io/client-go/tools/cache.(*processorListener).run.func1()
>         k8s.io/[email protected]/tools/cache/shared_informer.go:978 +0xa9
> k8s.io/apimachinery/pkg/util/wait.BackoffUntil.func1(0xc000a80008?)
>         k8s.io/[email protected]/pkg/util/wait/backoff.go:226 +0x33
> k8s.io/apimachinery/pkg/util/wait.BackoffUntil(0xc082d6bf70, {0x28a6800, 
> 0xc0409f8120}, 0x1, 0xc01434b340)
>         k8s.io/[email protected]/pkg/util/wait/backoff.go:227 +0xaf
> k8s.io/apimachinery/pkg/util/wait.JitterUntil(0xc010b79770, 0x3b9aca00, 0x0, 
> 0x1, 0xc01434b340)
>         k8s.io/[email protected]/pkg/util/wait/backoff.go:204 +0x7f
> k8s.io/apimachinery/pkg/util/wait.Until(...)
>         k8s.io/[email protected]/pkg/util/wait/backoff.go:161
> k8s.io/client-go/tools/cache.(*processorListener).run(0xc01b59afc0)
>         k8s.io/[email protected]/tools/cache/shared_informer.go:972 +0x5a
> k8s.io/apimachinery/pkg/util/wait.(*Group).Start.func1()
>         k8s.io/[email protected]/pkg/util/wait/wait.go:72 +0x4c
> created by k8s.io/apimachinery/pkg/util/wait.(*Group).Start in goroutine 1
>         k8s.io/[email protected]/pkg/util/wait/wait.go:70 +0x73 {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to