[
https://issues.apache.org/jira/browse/YUNIKORN-2574?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17839710#comment-17839710
]
Peter Bacsko commented on YUNIKORN-2574:
----------------------------------------
The test case {{TestUpdateNodeCapacity()}} actually can fail with a data race
because of this:
{noformat}
2024-04-18T12:01:02.1227519Z WARNING: DATA RACE
2024-04-18T12:01:02.1227810Z Read at 0x00c0004e88a0 by goroutine 1762:
2024-04-18T12:01:02.1228164Z reflect.maplen()
2024-04-18T12:01:02.1228643Z
/opt/hostedtoolcache/go/1.22.2/x64/src/runtime/map.go:1406 +0x0
2024-04-18T12:01:02.1229122Z reflect.Value.lenNonSlice()
2024-04-18T12:01:02.1229669Z
/opt/hostedtoolcache/go/1.22.2/x64/src/reflect/value.go:1785 +0x1e9
2024-04-18T12:01:02.1230137Z reflect.Value.Len()
2024-04-18T12:01:02.1230646Z
/opt/hostedtoolcache/go/1.22.2/x64/src/reflect/value.go:1774 +0x137
2024-04-18T12:01:02.1231110Z internal/fmtsort.Sort()
2024-04-18T12:01:02.1231657Z
/opt/hostedtoolcache/go/1.22.2/x64/src/internal/fmtsort/sort.go:58 +0x121
2024-04-18T12:01:02.1232148Z fmt.(*pp).printValue()
2024-04-18T12:01:02.1232637Z
/opt/hostedtoolcache/go/1.22.2/x64/src/fmt/print.go:816 +0x1144
2024-04-18T12:01:02.1233241Z fmt.(*pp).printArg()
2024-04-18T12:01:02.1233725Z
/opt/hostedtoolcache/go/1.22.2/x64/src/fmt/print.go:759 +0xb84
2024-04-18T12:01:02.1234166Z fmt.(*pp).doPrintf()
2024-04-18T12:01:02.1234643Z
/opt/hostedtoolcache/go/1.22.2/x64/src/fmt/print.go:1075 +0x592
2024-04-18T12:01:02.1235078Z fmt.Sprintf()
2024-04-18T12:01:02.1235529Z
/opt/hostedtoolcache/go/1.22.2/x64/src/fmt/print.go:239 +0x5c
2024-04-18T12:01:02.1236289Z
github.com/apache/yunikorn-core/pkg/common/resources.(*Resource).String()
2024-04-18T12:01:02.1237218Z
/home/runner/work/yunikorn-core/yunikorn-core/pkg/common/resources/resources.go:110
+0x84
2024-04-18T12:01:02.1237830Z fmt.(*pp).handleMethods()
2024-04-18T12:01:02.1238331Z
/opt/hostedtoolcache/go/1.22.2/x64/src/fmt/print.go:673 +0x6ea
2024-04-18T12:01:02.1238772Z fmt.(*pp).printArg()
2024-04-18T12:01:02.1239244Z
/opt/hostedtoolcache/go/1.22.2/x64/src/fmt/print.go:756 +0xb4b
2024-04-18T12:01:02.1239677Z fmt.(*pp).doPrintf()
2024-04-18T12:01:02.1240154Z
/opt/hostedtoolcache/go/1.22.2/x64/src/fmt/print.go:1075 +0x592
2024-04-18T12:01:02.1240586Z fmt.Sprintf()
2024-04-18T12:01:02.1241029Z
/opt/hostedtoolcache/go/1.22.2/x64/src/fmt/print.go:239 +0x5c
2024-04-18T12:01:02.1241473Z testing.(*common).Errorf()
2024-04-18T12:01:02.1241995Z
/opt/hostedtoolcache/go/1.22.2/x64/src/testing/testing.go:1074 +0x8d
2024-04-18T12:01:02.1242742Z
github.com/apache/yunikorn-core/pkg/scheduler/tests.TestUpdateNodeCapacity()
2024-04-18T12:01:02.1243746Z
/home/runner/work/yunikorn-core/yunikorn-core/pkg/scheduler/tests/operation_test.go:428
+0x21b1
2024-04-18T12:01:02.1244362Z testing.tRunner()
2024-04-18T12:01:02.1244863Z
/opt/hostedtoolcache/go/1.22.2/x64/src/testing/testing.go:1689 +0x21e
2024-04-18T12:01:02.1245346Z testing.(*T).Run.gowrap1()
2024-04-18T12:01:02.1245905Z
/opt/hostedtoolcache/go/1.22.2/x64/src/testing/testing.go:1742 +0x44
2024-04-18T12:01:02.1246267Z
2024-04-18T12:01:02.1246444Z Previous write at 0x00c0004e88a0 by goroutine 1772:
2024-04-18T12:01:02.1246849Z runtime.mapassign_faststr()
2024-04-18T12:01:02.1247403Z
/opt/hostedtoolcache/go/1.22.2/x64/src/runtime/map_faststr.go:203 +0x0
2024-04-18T12:01:02.1248120Z
github.com/apache/yunikorn-core/pkg/common/resources.(*Resource).AddTo()
2024-04-18T12:01:02.1249048Z
/home/runner/work/yunikorn-core/yunikorn-core/pkg/common/resources/resources.go:160
+0x1ae
2024-04-18T12:01:02.1249986Z
github.com/apache/yunikorn-core/pkg/scheduler.(*PartitionContext).updatePartitionResource()
2024-04-18T12:01:02.1250958Z
/home/runner/work/yunikorn-core/yunikorn-core/pkg/scheduler/partition.go:583
+0x237
2024-04-18T12:01:02.1251926Z
github.com/apache/yunikorn-core/pkg/scheduler.(*ClusterContext).updateNode()
2024-04-18T12:01:02.1252821Z
/home/runner/work/yunikorn-core/yunikorn-core/pkg/scheduler/context.go:665
+0x1434
2024-04-18T12:01:02.1253636Z
github.com/apache/yunikorn-core/pkg/scheduler.(*ClusterContext).processNodes()
2024-04-18T12:01:02.1254755Z
/home/runner/work/yunikorn-core/yunikorn-core/pkg/scheduler/context.go:289
+0x57b
2024-04-18T12:01:02.1255655Z
github.com/apache/yunikorn-core/pkg/scheduler.(*ClusterContext).handleRMUpdateNodeEvent()
2024-04-18T12:01:02.1256616Z
/home/runner/work/yunikorn-core/yunikorn-core/pkg/scheduler/context.go:253
+0x24f
2024-04-18T12:01:02.1257395Z
github.com/apache/yunikorn-core/pkg/scheduler.(*Scheduler).handleRMEvent()
2024-04-18T12:01:02.1258275Z
/home/runner/work/yunikorn-core/yunikorn-core/pkg/scheduler/scheduler.go:137
+0x209
2024-04-18T12:01:02.1259117Z
github.com/apache/yunikorn-core/pkg/scheduler.(*Scheduler).StartService.gowrap1()
2024-04-18T12:01:02.1260029Z
/home/runner/work/yunikorn-core/yunikorn-core/pkg/scheduler/scheduler.go:60
+0x33
2024-04-18T12:01:02.1260462Z
{noformat}
> totalPartitionResource should not be mutated with AddTo/SubFrom
> ---------------------------------------------------------------
>
> Key: YUNIKORN-2574
> URL: https://issues.apache.org/jira/browse/YUNIKORN-2574
> Project: Apache YuniKorn
> Issue Type: Bug
> Components: core - scheduler
> Affects Versions: 1.4.0, 1.5.0
> Reporter: Peter Bacsko
> Assignee: Peter Bacsko
> Priority: Major
>
> There is a potential data race in PartitionContext: the field
> "totalPartitionResource" is mutated in place. The problem is that the method
> {{GetTotalPartitionResource()}} does not clone it.
> {noformat}
> func (pc *PartitionContext) GetTotalPartitionResource() *resources.Resource {
> pc.RLock()
> defer pc.RUnlock()
> return pc.totalPartitionResource
> }
> {noformat}
> In general, we should prefer the immutable approach for variables like this,
> just like in {{objects.Queue}}:
> {noformat}
> func (sq *Queue) IncAllocatedResource(alloc *resources.Resource, nodeReported
> bool) error {
> // check this queue: failure stops checks if the allocation is not part
> of a node addition
> newAllocated := resources.Add(sq.allocatedResource, alloc)
> [ ... removed ... ]
> sq.Lock()
> defer sq.Unlock()
> // all OK update this queue
> sq.allocatedResource = newAllocated
> sq.updateAllocatedResourceMetrics()
> return nil
> }
> // incPendingResource increments pending resource of this queue and its
> parents.
> func (sq *Queue) incPendingResource(delta *resources.Resource) {
> // update the parent
> if sq.parent != nil {
> sq.parent.incPendingResource(delta)
> }
> // update this queue
> sq.Lock()
> defer sq.Unlock()
> sq.pending = resources.Add(sq.pending, delta)
> sq.updatePendingResourceMetrics()
> }
> {noformat}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]