Craig Condit created YUNIKORN-1227:
--------------------------------------

             Summary: Race condition in Predicates code on Node / Pod
                 Key: YUNIKORN-1227
                 URL: https://issues.apache.org/jira/browse/YUNIKORN-1227
             Project: Apache YuniKorn
          Issue Type: Improvement
          Components: shim - kubernetes
            Reporter: Craig Condit
            Assignee: Craig Condit


A data race was recently uncovered during testing:
{noformat}
==================
WARNING: DATA RACE
Write at 0x00c000171688 by goroutine 168:
  k8s.io/kubernetes/pkg/scheduler/framework.(*NodeInfo).RemovePod()
      
/home/testuser/go/pkg/mod/k8s.io/[email protected]/pkg/scheduler/framework/types.go:560
 +0x7ec
  
github.com/apache/yunikorn-k8shim/pkg/cache/external.(*SchedulerCache).updatePod()
      
/home/testuser/repos/incubator-yunikorn-k8shim/pkg/cache/external/scheduler_cache.go:258
 +0x1fe
  
github.com/apache/yunikorn-k8shim/pkg/cache/external.(*SchedulerCache).UpdatePod()
      
/home/testuser/repos/incubator-yunikorn-k8shim/pkg/cache/external/scheduler_cache.go:244
 +0x11d
  github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).updatePodInCache()
      /home/testuser/repos/incubator-yunikorn-k8shim/pkg/cache/context.go:263 
+0x94a
  github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).updatePodInCache-fm()
      <autogenerated>:1 +0x6d
  k8s.io/client-go/tools/cache.ResourceEventHandlerFuncs.OnUpdate()
      
/home/testuser/go/pkg/mod/k8s.io/[email protected]/tools/cache/controller.go:238
 +0x8b
  k8s.io/client-go/tools/cache.(*ResourceEventHandlerFuncs).OnUpdate()
      <autogenerated>:1 +0x29
  k8s.io/client-go/tools/cache.FilteringResourceEventHandler.OnUpdate()
      
/home/testuser/go/pkg/mod/k8s.io/[email protected]/tools/cache/controller.go:273
 +0xf5
  k8s.io/client-go/tools/cache.(*FilteringResourceEventHandler).OnUpdate()
      <autogenerated>:1 +0x8d
  k8s.io/client-go/tools/cache.(*processorListener).run.func1()
      
/home/testuser/go/pkg/mod/k8s.io/[email protected]/tools/cache/shared_informer.go:775
 +0x2b7
  k8s.io/apimachinery/pkg/util/wait.BackoffUntil.func1()
      
/home/testuser/go/pkg/mod/k8s.io/[email protected]/pkg/util/wait/wait.go:155
 +0x48
  k8s.io/apimachinery/pkg/util/wait.BackoffUntil()
      
/home/testuser/go/pkg/mod/k8s.io/[email protected]/pkg/util/wait/wait.go:156
 +0xce
  k8s.io/apimachinery/pkg/util/wait.JitterUntil()
      
/home/testuser/go/pkg/mod/k8s.io/[email protected]/pkg/util/wait/wait.go:133
 +0x104
  k8s.io/apimachinery/pkg/util/wait.Until()
      
/home/testuser/go/pkg/mod/k8s.io/[email protected]/pkg/util/wait/wait.go:90 
+0x78
  k8s.io/client-go/tools/cache.(*processorListener).run()
      
/home/testuser/go/pkg/mod/k8s.io/[email protected]/tools/cache/shared_informer.go:771
 +0x18
  k8s.io/client-go/tools/cache.(*processorListener).run-fm()
      <autogenerated>:1 +0x39
  k8s.io/apimachinery/pkg/util/wait.(*Group).Start.func1()
      
/home/testuser/go/pkg/mod/k8s.io/[email protected]/pkg/util/wait/wait.go:73 
+0x73

Previous read at 0x00c000171688 by goroutine 46:
  k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources.fitsRequest()
      
/home/testuser/go/pkg/mod/k8s.io/[email protected]/pkg/scheduler/framework/plugins/noderesources/fit.go:234
 +0xd7
  
k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources.(*Fit).Filter()
      
/home/testuser/go/pkg/mod/k8s.io/[email protected]/pkg/scheduler/framework/plugins/noderesources/fit.go:201
 +0xbe
  
github.com/apache/yunikorn-k8shim/pkg/plugin/predicates.(*predicateManagerImpl).runFilterPlugin()
      
/home/testuser/repos/incubator-yunikorn-k8shim/pkg/plugin/predicates/predicate_manager.go:151
 +0x158
  
github.com/apache/yunikorn-k8shim/pkg/plugin/predicates.(*predicateManagerImpl).runFilterPlugins()
      
/home/testuser/repos/incubator-yunikorn-k8shim/pkg/plugin/predicates/predicate_manager.go:129
 +0x126
  
github.com/apache/yunikorn-k8shim/pkg/plugin/predicates.(*predicateManagerImpl).podFitsNode()
      
/home/testuser/repos/incubator-yunikorn-k8shim/pkg/plugin/predicates/predicate_manager.go:94
 +0x137
  
github.com/apache/yunikorn-k8shim/pkg/plugin/predicates.(*predicateManagerImpl).predicatesAllocate()
      
/home/testuser/repos/incubator-yunikorn-k8shim/pkg/plugin/predicates/predicate_manager.go:78
 +0x211
  
github.com/apache/yunikorn-k8shim/pkg/plugin/predicates.(*predicateManagerImpl).Predicates()
      
/home/testuser/repos/incubator-yunikorn-k8shim/pkg/plugin/predicates/predicate_manager.go:64
 +0x52
  github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).IsPodFitNode()
      /home/testuser/repos/incubator-yunikorn-k8shim/pkg/cache/context.go:341 
+0x241
  github.com/apache/yunikorn-k8shim/pkg/callback.(*AsyncRMCallback).Predicates()
      
/home/testuser/repos/incubator-yunikorn-k8shim/pkg/callback/scheduler_callback.go:187
 +0xb7
  github.com/apache/yunikorn-core/pkg/scheduler/objects.(*Node).preConditions()
      
/home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/objects/node.go:386 
+0x1c7
  
github.com/apache/yunikorn-core/pkg/scheduler/objects.(*Node).preAllocateConditions()
      
/home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/objects/node.go:368 
+0xe4
  github.com/apache/yunikorn-core/pkg/scheduler/objects.(*Application).tryNode()
      
/home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/objects/application.go:1190
 +0xe7
  
github.com/apache/yunikorn-core/pkg/scheduler/objects.(*Application).tryNodes()
      
/home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/objects/application.go:1112
 +0x7c4
  
github.com/apache/yunikorn-core/pkg/scheduler/objects.(*Application).tryAllocate()
      
/home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/objects/application.go:849
 +0x7a4
  github.com/apache/yunikorn-core/pkg/scheduler/objects.(*Queue).TryAllocate()
      
/home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/objects/queue.go:1070
 +0x18c
  github.com/apache/yunikorn-core/pkg/scheduler/objects.(*Queue).TryAllocate()
      
/home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/objects/queue.go:1082
 +0xf7
  
github.com/apache/yunikorn-core/pkg/scheduler.(*PartitionContext).tryAllocate()
      
/home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/partition.go:831 
+0x15c
  github.com/apache/yunikorn-core/pkg/scheduler.(*ClusterContext).schedule()
      /home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/context.go:137 
+0x1b6
  github.com/apache/yunikorn-core/pkg/scheduler.(*Scheduler).internalSchedule()
      
/home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/scheduler.go:77 +0x47
  
github.com/apache/yunikorn-core/pkg/scheduler.(*Scheduler).StartService.func2()
      
/home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/scheduler.go:67 
+0x39

Goroutine 168 (running) created at:
  k8s.io/apimachinery/pkg/util/wait.(*Group).Start()
      
/home/testuser/go/pkg/mod/k8s.io/[email protected]/pkg/util/wait/wait.go:71 
+0xdc
  k8s.io/client-go/tools/cache.(*sharedProcessor).addListener()
      
/home/testuser/go/pkg/mod/k8s.io/[email protected]/tools/cache/shared_informer.go:593
 +0x379
  
k8s.io/client-go/tools/cache.(*sharedIndexInformer).AddEventHandlerWithResyncPeriod()
      
/home/testuser/go/pkg/mod/k8s.io/[email protected]/tools/cache/shared_informer.go:521
 +0x644
  github.com/apache/yunikorn-k8shim/pkg/client.(*APIFactory).addEventHandlers()
      
/home/testuser/repos/incubator-yunikorn-k8shim/pkg/client/apifactory.go:183 
+0x182
  github.com/apache/yunikorn-k8shim/pkg/client.(*APIFactory).AddEventHandler()
      
/home/testuser/repos/incubator-yunikorn-k8shim/pkg/client/apifactory.go:175 
+0x293
  
github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).AddSchedulingEventHandlers()
      /home/testuser/repos/incubator-yunikorn-k8shim/pkg/cache/context.go:101 
+0x673
  github.com/apache/yunikorn-k8shim/pkg/shim.(*KubernetesShim).doScheduling()
      /home/testuser/repos/incubator-yunikorn-k8shim/pkg/shim/scheduler.go:235 
+0x55
  github.com/apache/yunikorn-k8shim/pkg/shim.(*KubernetesShim).doScheduling-fm()
      <autogenerated>:1 +0x44
  github.com/looplab/fsm.(*FSM).enterStateCallbacks()
      /home/testuser/go/pkg/mod/github.com/looplab/[email protected]/fsm.go:403 +0xb6
  github.com/looplab/fsm.(*FSM).Event.func1()
      /home/testuser/go/pkg/mod/github.com/looplab/[email protected]/fsm.go:308 +0xa8
  github.com/looplab/fsm.transitionerStruct.transition()
      /home/testuser/go/pkg/mod/github.com/looplab/[email protected]/fsm.go:354 +0x99
  github.com/looplab/fsm.(*transitionerStruct).transition()
      <autogenerated>:1 +0x29
  github.com/looplab/fsm.(*FSM).doTransition()
      /home/testuser/go/pkg/mod/github.com/looplab/[email protected]/fsm.go:339 +0x701
  github.com/looplab/fsm.(*FSM).Event()
      /home/testuser/go/pkg/mod/github.com/looplab/[email protected]/fsm.go:321 +0x6da
  github.com/apache/yunikorn-k8shim/pkg/shim.(*KubernetesShim).handle()
      /home/testuser/repos/incubator-yunikorn-k8shim/pkg/shim/scheduler.go:279 
+0x1e4
  
github.com/apache/yunikorn-k8shim/pkg/shim.(*KubernetesShim).SchedulerEventHandler.func1()
      /home/testuser/repos/incubator-yunikorn-k8shim/pkg/shim/scheduler.go:152 
+0xa9
  github.com/apache/yunikorn-k8shim/pkg/dispatcher.Start.func1()
      
/home/testuser/repos/incubator-yunikorn-k8shim/pkg/dispatcher/dispatcher.go:199 
+0x36b

Goroutine 46 (running) created at:
  github.com/apache/yunikorn-core/pkg/scheduler.(*Scheduler).StartService()
      
/home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/scheduler.go:67 
+0x384
  
github.com/apache/yunikorn-core/pkg/entrypoint.startAllServicesWithParameters()
      
/home/testuser/repos/incubator-yunikorn-core/pkg/entrypoint/entrypoint.go:90 
+0x624
  github.com/apache/yunikorn-core/pkg/entrypoint.StartAllServices()
      
/home/testuser/repos/incubator-yunikorn-core/pkg/entrypoint/entrypoint.go:44 
+0x4f
  github.com/apache/yunikorn-core/pkg/entrypoint.StartAllServicesWithLogger()
      
/home/testuser/repos/incubator-yunikorn-core/pkg/entrypoint/entrypoint.go:55 
+0x3b
  main.main()
      /home/testuser/repos/incubator-yunikorn-k8shim/pkg/cmd/shim/main.go:50 
+0x4c4
{noformat}
Based on analysis of this race, it appears that we need to make defensive 
copies of Node / Pod information when calling the K8s predicates. The default 
scheduler creates a snapshot per scheduler run; it's likely we need to do 
something similar.



--
This message was sent by Atlassian Jira
(v8.20.7#820007)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to