This is an automated email from the ASF dual-hosted git repository.

ethanfeng pushed a commit to branch branch-0.3
in repository https://gitbox.apache.org/repos/asf/incubator-celeborn.git


The following commit(s) were added to refs/heads/branch-0.3 by this push:
     new d9ded4c65 [CELEBORN-1035] Expose RunningApplicationCount, 
PartitionWritten and PartitionFileCount metric by Celeborn master
d9ded4c65 is described below

commit d9ded4c652643b6ba8de10a80164a628d5603cd9
Author: SteNicholas <[email protected]>
AuthorDate: Thu Oct 19 22:07:17 2023 +0800

    [CELEBORN-1035] Expose RunningApplicationCount, PartitionWritten and 
PartitionFileCount metric by Celeborn master
    
    ### What changes were proposed in this pull request?
    
    Meta manager records `appHeartbeatTime`, `partitionTotalWritten` and 
`partitionTotalFileCount`, which are useful to monitor the application 
heartbeat and shuffle partition. `RunningApplicationCount`, `PartitionWritten` 
and `PartitionFileCount` metrics are exposed by Celeborn master to monitor the 
application and shuffle partition.
    
    ### Why are the changes needed?
    
    `Master` exposes `RunningApplicationCount`, `PartitionWritten` and 
`PartitionFileCount` metrics.
    
    ### Does this PR introduce _any_ user-facing change?
    
    None.
    
    ### How was this patch tested?
    
    Internal tests.
    
    Closes #1976 from SteNicholas/CELEBORN-1035.
    
    Authored-by: SteNicholas <[email protected]>
    Signed-off-by: mingji <[email protected]>
---
 METRICS.md                                         |   3 +
 assets/grafana/celeborn-dashboard.json             | 274 +++++++++++++++++++++
 docs/monitoring.md                                 |   5 +
 .../celeborn/service/deploy/master/Master.scala    |  26 +-
 .../service/deploy/master/MasterSource.scala       |   6 +
 5 files changed, 312 insertions(+), 2 deletions(-)

diff --git a/METRICS.md b/METRICS.md
index b9b76e3c8..7b5081a9d 100644
--- a/METRICS.md
+++ b/METRICS.md
@@ -65,8 +65,11 @@ Here is an example of grafana dashboard importing.
 
|:--------------------------------------:|:-----------------:|:---------------------------------------------------------------------------------------------------------------:|
 |              WorkerCount               |      master       |                 
                         The count of active workers.                           
                |
 |          ExcludedWorkerCount           |      master       |                 
                    The count of workers in excluded list.                      
                |
+|        RunningApplicationCount         |      master       |                 
               The count of running applications in the cluster.                
                |
 |             OfferSlotsTime             |      master       |                 
                           The time of offer slots.                             
                |
 |             PartitionSize              |      master       |          The 
estimated partition size of last 20 flush window whose length is 15 seconds by 
default.            |
+|            PartitionWritten            |      master       |                 
                           The active shuffle size.                             
                |
+|           PartitionFileCount           |      master       |                 
                      The active shuffle partition count.                       
                |
 |         RegisteredShuffleCount         | master and worker |                 
                 The value means count of registered shuffle.                   
                |
 |            CommitFilesTime             |      worker       |                 
          CommitFiles means flush and close a shuffle partition file.           
                |
 |            ReserveSlotsTime            |      worker       |                 
    ReserveSlots means acquire a disk buffer and record partition location.     
                |
diff --git a/assets/grafana/celeborn-dashboard.json 
b/assets/grafana/celeborn-dashboard.json
index 146b33cc5..cb676a140 100644
--- a/assets/grafana/celeborn-dashboard.json
+++ b/assets/grafana/celeborn-dashboard.json
@@ -164,6 +164,7 @@
             "type": "prometheus",
             "uid": "${DS_PROMETHEUS}"
           },
+          "description": "The count of registered shuffle.",
           "fieldConfig": {
             "defaults": {
               "color": {
@@ -247,6 +248,96 @@
           ],
           "title": "metrics_RegisteredShuffleCount_Value",
           "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "description": "The count of running applications.",
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "custom": {
+                "axisCenteredZero": false,
+                "axisColorMode": "text",
+                "axisLabel": "",
+                "axisPlacement": "auto",
+                "barAlignment": 0,
+                "drawStyle": "line",
+                "fillOpacity": 0,
+                "gradientMode": "none",
+                "hideFrom": {
+                  "legend": false,
+                  "tooltip": false,
+                  "viz": false
+                },
+                "lineInterpolation": "linear",
+                "lineWidth": 1,
+                "pointSize": 5,
+                "scaleDistribution": {
+                  "type": "linear"
+                },
+                "showPoints": "auto",
+                "spanNulls": false,
+                "stacking": {
+                  "group": "A",
+                  "mode": "none"
+                },
+                "thresholdsStyle": {
+                  "mode": "off"
+                }
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "red",
+                    "value": 80
+                  }
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 12,
+            "y": 1
+          },
+          "id": 95,
+          "options": {
+            "legend": {
+              "calcs": [],
+              "displayMode": "list",
+              "placement": "bottom",
+              "showLegend": true
+            },
+            "tooltip": {
+              "mode": "single",
+              "sort": "none"
+            }
+          },
+          "targets": [
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "${DS_PROMETHEUS}"
+              },
+              "expr": "metrics_RunningApplicationCount_Value",
+              "refId": "A"
+            }
+          ],
+          "title": "metrics_RunningApplicationCount_Value",
+          "type": "timeseries"
         }
       ],
       "title": "Overall",
@@ -446,6 +537,189 @@
           "title": "metrics_PartitionSize_Value",
           "type": "timeseries"
         },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "description": "The active shuffle size.",
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "custom": {
+                "axisCenteredZero": false,
+                "axisColorMode": "text",
+                "axisLabel": "",
+                "axisPlacement": "auto",
+                "barAlignment": 0,
+                "drawStyle": "line",
+                "fillOpacity": 0,
+                "gradientMode": "none",
+                "hideFrom": {
+                  "legend": false,
+                  "tooltip": false,
+                  "viz": false
+                },
+                "lineInterpolation": "linear",
+                "lineWidth": 1,
+                "pointSize": 5,
+                "scaleDistribution": {
+                  "type": "linear"
+                },
+                "showPoints": "auto",
+                "spanNulls": false,
+                "stacking": {
+                  "group": "A",
+                  "mode": "none"
+                },
+                "thresholdsStyle": {
+                  "mode": "off"
+                }
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green"
+                  },
+                  {
+                    "color": "red",
+                    "value": 80
+                  }
+                ]
+              },
+              "unit": "bytes"
+            },
+            "overrides": []
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 27
+          },
+          "id": 122,
+          "options": {
+            "legend": {
+              "calcs": [],
+              "displayMode": "list",
+              "placement": "bottom",
+              "showLegend": true
+            },
+            "tooltip": {
+              "mode": "single",
+              "sort": "none"
+            }
+          },
+          "targets": [
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "${DS_PROMETHEUS}"
+              },
+              "editorMode": "code",
+              "expr": "metrics_PartitionWritten_Value",
+              "range": true,
+              "refId": "A"
+            }
+          ],
+          "title": "metrics_PartitionWritten_Value",
+          "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "description": "The active shuffle partition count.",
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "custom": {
+                "axisCenteredZero": false,
+                "axisColorMode": "text",
+                "axisLabel": "",
+                "axisPlacement": "auto",
+                "barAlignment": 0,
+                "drawStyle": "line",
+                "fillOpacity": 0,
+                "gradientMode": "none",
+                "hideFrom": {
+                  "legend": false,
+                  "tooltip": false,
+                  "viz": false
+                },
+                "lineInterpolation": "linear",
+                "lineWidth": 1,
+                "pointSize": 5,
+                "scaleDistribution": {
+                  "type": "linear"
+                },
+                "showPoints": "auto",
+                "spanNulls": false,
+                "stacking": {
+                  "group": "A",
+                  "mode": "none"
+                },
+                "thresholdsStyle": {
+                  "mode": "off"
+                }
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green"
+                  },
+                  {
+                    "color": "red",
+                    "value": 80
+                  }
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 27
+          },
+          "id": 124,
+          "options": {
+            "legend": {
+              "calcs": [],
+              "displayMode": "list",
+              "placement": "bottom",
+              "showLegend": true
+            },
+            "tooltip": {
+              "mode": "single",
+              "sort": "none"
+            }
+          },
+          "targets": [
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "${DS_PROMETHEUS}"
+              },
+              "editorMode": "code",
+              "expr": "metrics_PartitionFileCount_Value",
+              "range": true,
+              "refId": "A"
+            }
+          ],
+          "title": "metrics_PartitionFileCount_Value",
+          "type": "timeseries"
+        },
         {
           "datasource": {
             "type": "prometheus",
diff --git a/docs/monitoring.md b/docs/monitoring.md
index dd991877d..5999d7f83 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -97,9 +97,14 @@ These metrics are exposed by Celeborn master.
     - LostWorkers
     - ExcludedWorkerCount
     - RegisteredShuffleCount
+    - RunningApplicationCount
     - IsActiveMaster
     - PartitionSize
         - The size of estimated shuffle partition.
+    - PartitionWritten
+        - The active shuffle size.
+    - PartitionFileCount
+        - The active shuffle partition count.
     - OfferSlotsTime
         - The time for masters to handle `RequestSlots` request when 
registering shuffle.
 
diff --git 
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala 
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
index b134781b8..73957e6e1 100644
--- 
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
+++ 
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
@@ -20,11 +20,10 @@ package org.apache.celeborn.service.deploy.master
 import java.io.IOException
 import java.net.BindException
 import java.util
-import java.util.Collections
 import java.util.concurrent.{ConcurrentHashMap, ScheduledFuture, TimeUnit}
+import java.util.function.ToLongFunction
 
 import scala.collection.JavaConverters._
-import scala.collection.mutable
 import scala.util.Random
 
 import org.apache.hadoop.fs.{FileSystem, Path}
@@ -161,7 +160,30 @@ private[celeborn] class Master(
   }
   masterSource.addGauge(MasterSource.WORKER_COUNT) { () => 
statusSystem.workers.size }
   masterSource.addGauge(MasterSource.LOST_WORKER_COUNT) { () => 
statusSystem.lostWorkers.size }
+  masterSource.addGauge(MasterSource.RUNNING_APPLICATION_COUNT) { () =>
+    statusSystem.appHeartbeatTime.size
+  }
   masterSource.addGauge(MasterSource.PARTITION_SIZE) { () => 
statusSystem.estimatedPartitionSize }
+  masterSource.addGauge(MasterSource.PARTITION_WRITTEN) { () =>
+    statusSystem.workers.parallelStream()
+      .mapToLong(new ToLongFunction[WorkerInfo]() {
+        override def applyAsLong(value: WorkerInfo): Long =
+          value.userResourceConsumption.values().parallelStream()
+            .mapToLong(new ToLongFunction[ResourceConsumption]() {
+              override def applyAsLong(value: ResourceConsumption): Long = 
value.diskBytesWritten
+            }).sum()
+      }).sum()
+  }
+  masterSource.addGauge(MasterSource.PARTITION_FILE_COUNT) { () =>
+    statusSystem.workers.parallelStream()
+      .mapToLong(new ToLongFunction[WorkerInfo]() {
+        override def applyAsLong(value: WorkerInfo): Long =
+          value.userResourceConsumption.values().parallelStream()
+            .mapToLong(new ToLongFunction[ResourceConsumption]() {
+              override def applyAsLong(value: ResourceConsumption): Long = 
value.diskFileCount
+            }).sum()
+      }).sum()
+  }
   masterSource.addGauge(MasterSource.IS_ACTIVE_MASTER) { () => isMasterActive }
 
   metricsSystem.registerSource(resourceConsumptionSource)
diff --git 
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/MasterSource.scala
 
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/MasterSource.scala
index cd3f1f0ea..05c14b7df 100644
--- 
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/MasterSource.scala
+++ 
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/MasterSource.scala
@@ -41,9 +41,15 @@ object MasterSource {
 
   val REGISTERED_SHUFFLE_COUNT = "RegisteredShuffleCount"
 
+  val RUNNING_APPLICATION_COUNT = "RunningApplicationCount"
+
   val IS_ACTIVE_MASTER = "IsActiveMaster"
 
   val PARTITION_SIZE = "PartitionSize"
 
+  val PARTITION_WRITTEN = "PartitionWritten"
+
+  val PARTITION_FILE_COUNT = "PartitionFileCount"
+
   val OFFER_SLOTS_TIME = "OfferSlotsTime"
 }

Reply via email to