[
https://issues.apache.org/jira/browse/YUNIKORN-3239?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Wilfred Spiegelenburg updated YUNIKORN-3239:
--------------------------------------------
Description:
We are running yunikorn-core + yunikorn k8s shim. We observe long delays in
yunikorn processing node update events.
As shown in this stack trace, the shim is waiting for more than 53 minutes to get an
ack from the core for a node update event. We have also attached the collected
stack trace.
{code:java}
goroutine 2976 [sync.WaitGroup.Wait, 53 minutes]:
sync.runtime_SemacquireWaitGroup(0xc000056a20?)
runtime/sema.go:110 +0x25
sync.(*WaitGroup).Wait(0xc000056a20?)
sync/waitgroup.go:118 +0x48
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).registerNodesInternal(0xc00086f830,
{0xc00039ce48, 0x1, 0x1}, 0xc09be3bd40)
github.com/apache/yunikorn-k8shim/pkg/cache/context.go:1605 +0x58c
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).registerNodes(0xc00086f830,
{0xc082d6bad0, 0x1, 0xc000280690?})
github.com/apache/yunikorn-k8shim/pkg/cache/context.go:1538 +0x4c5
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).registerNode(0xc000280690?,
0xc006e99b08)
github.com/apache/yunikorn-k8shim/pkg/cache/context.go:1499 +0x2e
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).updateNodeInternal(0xc00086f830,
0xc006e99b08, 0x1)
github.com/apache/yunikorn-k8shim/pkg/cache/context.go:187 +0x79
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).updateNode(0xc00086f830,
{0xc007aea000?, 0xc016770c08?}, {0x2509680, 0xc006e99b08})
github.com/apache/yunikorn-k8shim/pkg/cache/context.go:176 +0x291
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).addNode(...)
github.com/apache/yunikorn-k8shim/pkg/cache/context.go:165
k8s.io/client-go/tools/cache.ResourceEventHandlerFuncs.OnAdd(...)
k8s.io/[email protected]/tools/cache/controller.go:246
k8s.io/client-go/tools/cache.(*processorListener).run.func1()
k8s.io/[email protected]/tools/cache/shared_informer.go:978 +0xa9
k8s.io/apimachinery/pkg/util/wait.BackoffUntil.func1(0xc000a80008?)
k8s.io/[email protected]/pkg/util/wait/backoff.go:226 +0x33
k8s.io/apimachinery/pkg/util/wait.BackoffUntil(0xc082d6bf70, {0x28a6800,
0xc0409f8120}, 0x1, 0xc01434b340)
k8s.io/[email protected]/pkg/util/wait/backoff.go:227 +0xaf
k8s.io/apimachinery/pkg/util/wait.JitterUntil(0xc010b79770, 0x3b9aca00, 0x0,
0x1, 0xc01434b340)
k8s.io/[email protected]/pkg/util/wait/backoff.go:204 +0x7f
k8s.io/apimachinery/pkg/util/wait.Until(...)
k8s.io/[email protected]/pkg/util/wait/backoff.go:161
k8s.io/client-go/tools/cache.(*processorListener).run(0xc01b59afc0)
k8s.io/[email protected]/tools/cache/shared_informer.go:972 +0x5a
k8s.io/apimachinery/pkg/util/wait.(*Group).Start.func1()
k8s.io/[email protected]/pkg/util/wait/wait.go:72 +0x4c
created by k8s.io/apimachinery/pkg/util/wait.(*Group).Start in goroutine 1
k8s.io/[email protected]/pkg/util/wait/wait.go:70 +0x73 {code}
was:
We are running yunikorn-core + yunikorn k8s shim. We observe long delays in
yunikorn processing node update events.
As shown in this stack trace, shim is waiting for more than 53 min to get an
ack from core for a node update event. Also have attached the stack trace
collected
goroutine 2976 [sync.WaitGroup.Wait, 53 minutes]:
sync.runtime_SemacquireWaitGroup(0xc000056a20?) runtime/sema.go:110 +0x25
sync.(*WaitGroup).Wait(0xc000056a20?) sync/waitgroup.go:118 +0x48
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).registerNodesInternal(0xc00086f830,
\{0xc00039ce48, 0x1, 0x1}, 0xc09be3bd40)
github.com/apache/yunikorn-k8shim/pkg/cache/context.go:1605 +0x58c
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).registerNodes(0xc00086f830,
\{0xc082d6bad0, 0x1, 0xc000280690?})
github.com/apache/yunikorn-k8shim/pkg/cache/context.go:1538 +0x4c5
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).registerNode(0xc000280690?,
0xc006e99b08) github.com/apache/yunikorn-k8shim/pkg/cache/context.go:1499
+0x2e
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).updateNodeInternal(0xc00086f830,
0xc006e99b08, 0x1) github.com/apache/yunikorn-k8shim/pkg/cache/context.go:187
+0x79
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).updateNode(0xc00086f830,
\{0xc007aea000?, 0xc016770c08?}, \{0x2509680, 0xc006e99b08})
github.com/apache/yunikorn-k8shim/pkg/cache/context.go:176 +0x291
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).addNode(...)
github.com/apache/yunikorn-k8shim/pkg/cache/context.go:165
k8s.io/client-go/tools/cache.ResourceEventHandlerFuncs.OnAdd(...)
k8s.io/[email protected]/tools/cache/controller.go:246
k8s.io/client-go/tools/cache.(*processorListener).run.func1()
k8s.io/[email protected]/tools/cache/shared_informer.go:978 +0xa9
k8s.io/apimachinery/pkg/util/wait.BackoffUntil.func1(0xc000a80008?)
k8s.io/[email protected]/pkg/util/wait/backoff.go:226 +0x33
k8s.io/apimachinery/pkg/util/wait.BackoffUntil(0xc082d6bf70, \{0x28a6800,
0xc0409f8120}, 0x1, 0xc01434b340)
k8s.io/[email protected]/pkg/util/wait/backoff.go:227 +0xaf
k8s.io/apimachinery/pkg/util/wait.JitterUntil(0xc010b79770, 0x3b9aca00, 0x0,
0x1, 0xc01434b340) k8s.io/[email protected]/pkg/util/wait/backoff.go:204
+0x7f k8s.io/apimachinery/pkg/util/wait.Until(...)
k8s.io/[email protected]/pkg/util/wait/backoff.go:161
k8s.io/client-go/tools/cache.(*processorListener).run(0xc01b59afc0)
k8s.io/[email protected]/tools/cache/shared_informer.go:972 +0x5a
k8s.io/apimachinery/pkg/util/wait.(*Group).Start.func1()
k8s.io/[email protected]/pkg/util/wait/wait.go:72 +0x4c created by
k8s.io/apimachinery/pkg/util/wait.(*Group).Start in goroutine 1
k8s.io/[email protected]/pkg/util/wait/wait.go:70 +0x73
> Lag in processing register node event in yunikorn core
> ------------------------------------------------------
>
> Key: YUNIKORN-3239
> URL: https://issues.apache.org/jira/browse/YUNIKORN-3239
> Project: Apache YuniKorn
> Issue Type: Bug
> Components: core - scheduler
> Reporter: Ashwin Shroff
> Assignee: Ashwin Shroff
> Priority: Major
> Attachments: stack-trace.json
>
>
> We are running yunikorn-core + yunikorn k8s shim. We observe long delays in
> yunikorn processing node update events.
> As shown in this stack trace, the shim is waiting for more than 53 minutes to get an
> ack from the core for a node update event. We have also attached the collected
> stack trace.
> {code:java}
> goroutine 2976 [sync.WaitGroup.Wait, 53 minutes]:
> sync.runtime_SemacquireWaitGroup(0xc000056a20?)
> runtime/sema.go:110 +0x25
> sync.(*WaitGroup).Wait(0xc000056a20?)
> sync/waitgroup.go:118 +0x48
> github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).registerNodesInternal(0xc00086f830,
> {0xc00039ce48, 0x1, 0x1}, 0xc09be3bd40)
> github.com/apache/yunikorn-k8shim/pkg/cache/context.go:1605 +0x58c
> github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).registerNodes(0xc00086f830,
> {0xc082d6bad0, 0x1, 0xc000280690?})
> github.com/apache/yunikorn-k8shim/pkg/cache/context.go:1538 +0x4c5
> github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).registerNode(0xc000280690?,
> 0xc006e99b08)
> github.com/apache/yunikorn-k8shim/pkg/cache/context.go:1499 +0x2e
> github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).updateNodeInternal(0xc00086f830,
> 0xc006e99b08, 0x1)
> github.com/apache/yunikorn-k8shim/pkg/cache/context.go:187 +0x79
> github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).updateNode(0xc00086f830,
> {0xc007aea000?, 0xc016770c08?}, {0x2509680, 0xc006e99b08})
> github.com/apache/yunikorn-k8shim/pkg/cache/context.go:176 +0x291
> github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).addNode(...)
> github.com/apache/yunikorn-k8shim/pkg/cache/context.go:165
> k8s.io/client-go/tools/cache.ResourceEventHandlerFuncs.OnAdd(...)
> k8s.io/[email protected]/tools/cache/controller.go:246
> k8s.io/client-go/tools/cache.(*processorListener).run.func1()
> k8s.io/[email protected]/tools/cache/shared_informer.go:978 +0xa9
> k8s.io/apimachinery/pkg/util/wait.BackoffUntil.func1(0xc000a80008?)
> k8s.io/[email protected]/pkg/util/wait/backoff.go:226 +0x33
> k8s.io/apimachinery/pkg/util/wait.BackoffUntil(0xc082d6bf70, {0x28a6800,
> 0xc0409f8120}, 0x1, 0xc01434b340)
> k8s.io/[email protected]/pkg/util/wait/backoff.go:227 +0xaf
> k8s.io/apimachinery/pkg/util/wait.JitterUntil(0xc010b79770, 0x3b9aca00, 0x0,
> 0x1, 0xc01434b340)
> k8s.io/[email protected]/pkg/util/wait/backoff.go:204 +0x7f
> k8s.io/apimachinery/pkg/util/wait.Until(...)
> k8s.io/[email protected]/pkg/util/wait/backoff.go:161
> k8s.io/client-go/tools/cache.(*processorListener).run(0xc01b59afc0)
> k8s.io/[email protected]/tools/cache/shared_informer.go:972 +0x5a
> k8s.io/apimachinery/pkg/util/wait.(*Group).Start.func1()
> k8s.io/[email protected]/pkg/util/wait/wait.go:72 +0x4c
> created by k8s.io/apimachinery/pkg/util/wait.(*Group).Start in goroutine 1
> k8s.io/[email protected]/pkg/util/wait/wait.go:70 +0x73 {code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]