(yunikorn-core) branch master updated: [YUNIKORN-2930] Add time consumption metrics for scheduling/tryNode cycle (#1017)

wilfreds Thu, 03 Jul 2025 17:27:05 -0700

This is an automated email from the ASF dual-hosted git repository.

wilfreds pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/yunikorn-core.git



The following commit(s) were added to refs/heads/master by this push:
     new 29e95b60 [YUNIKORN-2930] Add time consumption metrics for scheduling/tryNode cycle (#1017)
29e95b60 is described below

commit 29e95b60e635a310a4f8d063577f765ffb2ddc30
Author: kaichiachen <kaichia...@gmail.com>
AuthorDate: Fri Jul 4 10:21:41 2025 +1000

    [YUNIKORN-2930] Add time consumption metrics for scheduling/tryNode cycle (#1017)
    
    Add two new histogram metrics that observe:
    * the duration of each scheduling cycle, regardless of whether a pod was scheduled.
    * the total time taken to find a node for a given pod.
    
    Add unit tests for both new metrics.
    
    Closes: #1017
    
    Signed-off-by: Wilfred Spiegelenburg <wilfr...@apache.org>
---
 pkg/metrics/scheduler.go             | 33 +++++++++++++++++++++++++++++++++
 pkg/metrics/scheduler_test.go        | 18 ++++++++++++++++++
 pkg/scheduler/context.go             |  2 ++
 pkg/scheduler/objects/application.go |  2 ++
 4 files changed, 55 insertions(+)

diff --git a/pkg/metrics/scheduler.go b/pkg/metrics/scheduler.go
index 15342905..abf48644 100644
--- a/pkg/metrics/scheduler.go
+++ b/pkg/metrics/scheduler.go
@@ -62,9 +62,11 @@ type SchedulerMetrics struct {
        node                  *prometheus.GaugeVec
        nodeResourceUsage     map[string]*prometheus.GaugeVec
        schedulingLatency     prometheus.Histogram
+       schedulingCycle       prometheus.Histogram
        sortingLatency        *prometheus.HistogramVec
        tryNodeLatency        prometheus.Histogram
        tryPreemptionLatency  prometheus.Histogram
+       tryNodeEvaluation     prometheus.Histogram
        lock                  locking.RWMutex
 }
 
@@ -117,6 +119,17 @@ func InitSchedulerMetrics() *SchedulerMetrics {
                        Buckets:   prometheus.ExponentialBuckets(0.0001, 10, 
8), // start from 0.1ms
                },
        )
+
+       s.schedulingCycle = prometheus.NewHistogram(
+               prometheus.HistogramOpts{
+                       Namespace: Namespace,
+                       Subsystem: SchedulerSubsystem,
+                       Name:      "scheduling_cycle_milliseconds",
+                       Help:      "Time taken for a scheduling cycle, in 
seconds.",
+                       Buckets:   prometheus.ExponentialBuckets(0.0001, 10, 8),
+               },
+       )
+
        s.sortingLatency = prometheus.NewHistogramVec(
                prometheus.HistogramOpts{
                        Namespace: Namespace,
@@ -136,6 +149,16 @@ func InitSchedulerMetrics() *SchedulerMetrics {
                },
        )
 
+       s.tryNodeEvaluation = prometheus.NewHistogram(
+               prometheus.HistogramOpts{
+                       Namespace: Namespace,
+                       Subsystem: SchedulerSubsystem,
+                       Name:      "trynode_evaluation_milliseconds",
+                       Help:      "Time taken to evaluate nodes for a pod, in 
seconds.",
+                       Buckets:   prometheus.ExponentialBuckets(0.0001, 10, 8),
+               },
+       )
+
        s.tryPreemptionLatency = prometheus.NewHistogram(
                prometheus.HistogramOpts{
                        Namespace: Namespace,
@@ -155,6 +178,8 @@ func InitSchedulerMetrics() *SchedulerMetrics {
                s.schedulingLatency,
                s.sortingLatency,
                s.tryNodeLatency,
+               s.schedulingCycle,
+               s.tryNodeEvaluation,
                s.tryPreemptionLatency,
        }
        for _, metric := range metricsList {
@@ -182,6 +207,10 @@ func (m *SchedulerMetrics) ObserveSchedulingLatency(start 
time.Time) {
        m.schedulingLatency.Observe(SinceInSeconds(start))
 }
 
+func (m *SchedulerMetrics) ObserveSchedulingCycle(start time.Time) {
+       m.schedulingCycle.Observe(SinceInSeconds(start))
+}
+
 func (m *SchedulerMetrics) ObserveAppSortingLatency(start time.Time) {
        
m.sortingLatency.WithLabelValues(SortingApp).Observe(SinceInSeconds(start))
 }
@@ -194,6 +223,10 @@ func (m *SchedulerMetrics) ObserveTryNodeLatency(start 
time.Time) {
        m.tryNodeLatency.Observe(SinceInSeconds(start))
 }
 
+func (m *SchedulerMetrics) ObserveTryNodeEvaluation(start time.Time) {
+       m.tryNodeEvaluation.Observe(SinceInSeconds(start))
+}
+
 func (m *SchedulerMetrics) ObserveTryPreemptionLatency(start time.Time) {
        m.tryPreemptionLatency.Observe(SinceInSeconds(start))
 }
diff --git a/pkg/metrics/scheduler_test.go b/pkg/metrics/scheduler_test.go
index 42879b67..c290a78b 100644
--- a/pkg/metrics/scheduler_test.go
+++ b/pkg/metrics/scheduler_test.go
@@ -168,6 +168,22 @@ func TestSchedulerApplicationsFailed(t *testing.T) {
        verifyMetric(t, 1, "failed", "yunikorn_scheduler_application_total", 
dto.MetricType_GAUGE, "state")
 }
 
+func TestSchedulingCycle(t *testing.T) {
+       sm = getSchedulerMetrics(t)
+       defer unregisterMetrics()
+
+       sm.ObserveSchedulingCycle(time.Now().Add(-1 * time.Minute))
+       verifyHistogram(t, "scheduling_cycle_milliseconds", 60, 1)
+}
+
+func TestTryNodeEvaluation(t *testing.T) {
+       sm = getSchedulerMetrics(t)
+       defer unregisterMetrics()
+
+       sm.ObserveTryNodeEvaluation(time.Now().Add(-1 * time.Minute))
+       verifyHistogram(t, "trynode_evaluation_milliseconds", 60, 1)
+}
+
 func getSchedulerMetrics(t *testing.T) *SchedulerMetrics {
        unregisterMetrics()
        return InitSchedulerMetrics()
@@ -223,7 +239,9 @@ func unregisterMetrics() {
        prometheus.Unregister(sm.application)
        prometheus.Unregister(sm.node)
        prometheus.Unregister(sm.schedulingLatency)
+       prometheus.Unregister(sm.schedulingCycle)
        prometheus.Unregister(sm.sortingLatency)
        prometheus.Unregister(sm.tryNodeLatency)
+       prometheus.Unregister(sm.tryNodeEvaluation)
        prometheus.Unregister(sm.tryPreemptionLatency)
 }
diff --git a/pkg/scheduler/context.go b/pkg/scheduler/context.go
index b5b91c2a..402a2c5f 100644
--- a/pkg/scheduler/context.go
+++ b/pkg/scheduler/context.go
@@ -120,6 +120,7 @@ func (cc *ClusterContext) setEventHandler(rmHandler 
handler.EventHandler) {
 func (cc *ClusterContext) schedule() bool {
        // schedule each partition defined in the cluster
        activity := false
+       scheduleCycleStart := time.Now()
        for _, psc := range cc.GetPartitionMapClone() {
                // if there are no resources in the partition just skip
                if psc.root.GetMaxResource() == nil {
@@ -151,6 +152,7 @@ func (cc *ClusterContext) schedule() bool {
                        activity = true
                }
        }
+       metrics.GetSchedulerMetrics().ObserveSchedulingCycle(scheduleCycleStart)
        return activity
 }
 
diff --git a/pkg/scheduler/objects/application.go 
b/pkg/scheduler/objects/application.go
index c9e15a41..a4ad75e6 100644
--- a/pkg/scheduler/objects/application.go
+++ b/pkg/scheduler/objects/application.go
@@ -1446,6 +1446,7 @@ func (sa *Application) tryNodes(ask *Allocation, iterator 
NodeIterator) *Allocat
        reserved := sa.reservations[allocKey]
        var allocResult *AllocationResult
        var predicateErrors map[string]int
+       tryNodeCycleStart := time.Now()
        iterator.ForEachNode(func(node *Node) bool {
                // skip the node if the node is not schedulable
                if !node.IsSchedulable() {
@@ -1510,6 +1511,7 @@ func (sa *Application) tryNodes(ask *Allocation, iterator 
NodeIterator) *Allocat
                }
                return true
        })
+       
metrics.GetSchedulerMetrics().ObserveTryNodeEvaluation(tryNodeCycleStart)
 
        if allocResult != nil {
                return allocResult


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@yunikorn.apache.org
For additional commands, e-mail: issues-h...@yunikorn.apache.org

(yunikorn-core) branch master updated: [YUNIKORN-2930] Add time consumption metrics for scheduling/tryNode cycle (#1017)

Reply via email to