This is an automated email from the ASF dual-hosted git repository. wilfreds pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/yunikorn-core.git
The following commit(s) were added to refs/heads/master by this push: new 29e95b60 [YUNIKORN-2930] Add time consumption metrics for scheduling/tryNode cycle (#1017) 29e95b60 is described below commit 29e95b60e635a310a4f8d063577f765ffb2ddc30 Author: kaichiachen <kaichia...@gmail.com> AuthorDate: Fri Jul 4 10:21:41 2025 +1000 [YUNIKORN-2930] Add time consumption metrics for scheduling/tryNode cycle (#1017) Add two new metrics that observe: * each scheduling cycle, regardless of whether a pod was scheduled. * total time taken to find a node for a given pod. Add metrics to test Closes: #1017 Signed-off-by: Wilfred Spiegelenburg <wilfr...@apache.org> --- pkg/metrics/scheduler.go | 33 +++++++++++++++++++++++++++++++++ pkg/metrics/scheduler_test.go | 18 ++++++++++++++++++ pkg/scheduler/context.go | 2 ++ pkg/scheduler/objects/application.go | 2 ++ 4 files changed, 55 insertions(+) diff --git a/pkg/metrics/scheduler.go b/pkg/metrics/scheduler.go index 15342905..abf48644 100644 --- a/pkg/metrics/scheduler.go +++ b/pkg/metrics/scheduler.go @@ -62,9 +62,11 @@ type SchedulerMetrics struct { node *prometheus.GaugeVec nodeResourceUsage map[string]*prometheus.GaugeVec schedulingLatency prometheus.Histogram + schedulingCycle prometheus.Histogram sortingLatency *prometheus.HistogramVec tryNodeLatency prometheus.Histogram tryPreemptionLatency prometheus.Histogram + tryNodeEvaluation prometheus.Histogram lock locking.RWMutex } @@ -117,6 +119,17 @@ func InitSchedulerMetrics() *SchedulerMetrics { Buckets: prometheus.ExponentialBuckets(0.0001, 10, 8), // start from 0.1ms }, ) + + s.schedulingCycle = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Namespace: Namespace, + Subsystem: SchedulerSubsystem, + Name: "scheduling_cycle_milliseconds", + Help: "Time taken for a scheduling cycle, in seconds.", + Buckets: prometheus.ExponentialBuckets(0.0001, 10, 8), + }, + ) + s.sortingLatency = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Namespace: Namespace, @@ -136,6 +149,16 @@ func InitSchedulerMetrics() *SchedulerMetrics { }, ) + s.tryNodeEvaluation = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Namespace: Namespace, + Subsystem: SchedulerSubsystem, + Name: "trynode_evaluation_milliseconds", + Help: "Time taken to evaluate nodes for a pod, in seconds.", + Buckets: prometheus.ExponentialBuckets(0.0001, 10, 8), + }, + ) + s.tryPreemptionLatency = prometheus.NewHistogram( prometheus.HistogramOpts{ Namespace: Namespace, @@ -155,6 +178,8 @@ func InitSchedulerMetrics() *SchedulerMetrics { s.schedulingLatency, s.sortingLatency, s.tryNodeLatency, + s.schedulingCycle, + s.tryNodeEvaluation, s.tryPreemptionLatency, } for _, metric := range metricsList { @@ -182,6 +207,10 @@ func (m *SchedulerMetrics) ObserveSchedulingLatency(start time.Time) { m.schedulingLatency.Observe(SinceInSeconds(start)) } +func (m *SchedulerMetrics) ObserveSchedulingCycle(start time.Time) { + m.schedulingCycle.Observe(SinceInSeconds(start)) +} + func (m *SchedulerMetrics) ObserveAppSortingLatency(start time.Time) { m.sortingLatency.WithLabelValues(SortingApp).Observe(SinceInSeconds(start)) } @@ -194,6 +223,10 @@ func (m *SchedulerMetrics) ObserveTryNodeLatency(start time.Time) { m.tryNodeLatency.Observe(SinceInSeconds(start)) } +func (m *SchedulerMetrics) ObserveTryNodeEvaluation(start time.Time) { + m.tryNodeEvaluation.Observe(SinceInSeconds(start)) +} + func (m *SchedulerMetrics) ObserveTryPreemptionLatency(start time.Time) { m.tryPreemptionLatency.Observe(SinceInSeconds(start)) } diff --git a/pkg/metrics/scheduler_test.go b/pkg/metrics/scheduler_test.go index 42879b67..c290a78b 100644 --- a/pkg/metrics/scheduler_test.go +++ b/pkg/metrics/scheduler_test.go @@ -168,6 +168,22 @@ func TestSchedulerApplicationsFailed(t *testing.T) { verifyMetric(t, 1, "failed", "yunikorn_scheduler_application_total", dto.MetricType_GAUGE, "state") } +func TestSchedulingCycle(t *testing.T) { + sm = getSchedulerMetrics(t) + defer unregisterMetrics() + + sm.ObserveSchedulingCycle(time.Now().Add(-1 * time.Minute)) + verifyHistogram(t, "scheduling_cycle_milliseconds", 60, 1) +} + +func TestTryNodeEvaluation(t *testing.T) { + sm = getSchedulerMetrics(t) + defer unregisterMetrics() + + sm.ObserveTryNodeEvaluation(time.Now().Add(-1 * time.Minute)) + verifyHistogram(t, "trynode_evaluation_milliseconds", 60, 1) +} + func getSchedulerMetrics(t *testing.T) *SchedulerMetrics { unregisterMetrics() return InitSchedulerMetrics() @@ -223,7 +239,9 @@ func unregisterMetrics() { prometheus.Unregister(sm.application) prometheus.Unregister(sm.node) prometheus.Unregister(sm.schedulingLatency) + prometheus.Unregister(sm.schedulingCycle) prometheus.Unregister(sm.sortingLatency) prometheus.Unregister(sm.tryNodeLatency) + prometheus.Unregister(sm.tryNodeEvaluation) prometheus.Unregister(sm.tryPreemptionLatency) } diff --git a/pkg/scheduler/context.go b/pkg/scheduler/context.go index b5b91c2a..402a2c5f 100644 --- a/pkg/scheduler/context.go +++ b/pkg/scheduler/context.go @@ -120,6 +120,7 @@ func (cc *ClusterContext) setEventHandler(rmHandler handler.EventHandler) { func (cc *ClusterContext) schedule() bool { // schedule each partition defined in the cluster activity := false + scheduleCycleStart := time.Now() for _, psc := range cc.GetPartitionMapClone() { // if there are no resources in the partition just skip if psc.root.GetMaxResource() == nil { @@ -151,6 +152,7 @@ func (cc *ClusterContext) schedule() bool { activity = true } } + metrics.GetSchedulerMetrics().ObserveSchedulingCycle(scheduleCycleStart) return activity } diff --git a/pkg/scheduler/objects/application.go b/pkg/scheduler/objects/application.go index c9e15a41..a4ad75e6 100644 --- a/pkg/scheduler/objects/application.go +++ b/pkg/scheduler/objects/application.go @@ -1446,6 +1446,7 @@ func (sa *Application) tryNodes(ask *Allocation, iterator NodeIterator) *Allocat reserved := sa.reservations[allocKey] var allocResult *AllocationResult var predicateErrors map[string]int + tryNodeCycleStart := time.Now() iterator.ForEachNode(func(node *Node) bool { // skip the node if the node is not schedulable if !node.IsSchedulable() { @@ -1510,6 +1511,7 @@ func (sa *Application) tryNodes(ask *Allocation, iterator NodeIterator) *Allocat } return true }) + metrics.GetSchedulerMetrics().ObserveTryNodeEvaluation(tryNodeCycleStart) if allocResult != nil { return allocResult --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@yunikorn.apache.org For additional commands, e-mail: issues-h...@yunikorn.apache.org