This is an automated email from the ASF dual-hosted git repository.

ethanfeng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-celeborn.git


The following commit(s) were added to refs/heads/main by this push:
     new 277f7ced5 [CELEBORN-1187] Unify the size and file count of active 
shuffle metrics for master and worker
277f7ced5 is described below

commit 277f7ced5787e904579e4159ebcb9bc6cb9d4367
Author: SteNicholas <[email protected]>
AuthorDate: Fri Dec 22 17:07:39 2023 +0800

    [CELEBORN-1187] Unify the size and file count of active shuffle metrics for 
master and worker
    
    ### What changes were proposed in this pull request?
    
    Unify the size and file count of active shuffle metrics for `MasterSource` 
and `WorkerSource`.
    
    ### Why are the changes needed?
    
    `MasterSource` uses `PartitionWritten` and `PartitionFileCount` metrics as 
the size and file count of active shuffle for all workers. Meanwhile, 
`WorkerSource` uses `ActiveShuffleSize` and `ActiveShuffleFileCount` metrics as 
the size and file count of active shuffle for a worker including master replica 
and slave replica. It's recommended to unify the size and file count of active 
shuffle metrics between `MasterSource` and `WorkerSource`.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Internal tests.
    
    Closes #2171 from SteNicholas/CELEBORN-1187.
    
    Lead-authored-by: SteNicholas <[email protected]>
    Co-authored-by: 蒋晓峰 <[email protected]>
    Signed-off-by: mingji <[email protected]>
---
 METRICS.md                                                   |  8 +++-----
 assets/grafana/celeborn-dashboard.json                       | 12 ++++++------
 docs/migration.md                                            |  4 ++++
 docs/monitoring.md                                           |  4 ++--
 .../org/apache/celeborn/service/deploy/master/Master.scala   |  4 ++--
 .../apache/celeborn/service/deploy/master/MasterSource.scala |  4 ++--
 6 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/METRICS.md b/METRICS.md
index c4e0862d4..ec0ffad10 100644
--- a/METRICS.md
+++ b/METRICS.md
@@ -81,13 +81,13 @@ Here is an example of Grafana dashboard importing.
 |        RunningApplicationCount         |      master       |                 
               The count of running applications in the cluster.                
                |
 |             OfferSlotsTime             |      master       |                 
                           The time of offer slots.                             
                |
 |             PartitionSize              |      master       |          The 
estimated partition size of last 20 flush window whose length is 15 seconds by 
defaults.           |
-|            PartitionWritten            |      master       |                 
                           The active shuffle size.                             
                |
-|           PartitionFileCount           |      master       |                 
                      The active shuffle partition count.                       
                |
+|         RegisteredShuffleCount         | master and worker |                 
                 The value means count of registered shuffle.                   
                |
+|           ActiveShuffleSize            | master and worker |   The value 
means the active shuffle size for workers or a worker including master replica 
and slave replica.   |
+|         ActiveShuffleFileCount         | master and worker |   The value 
means the active shuffle file count for workers or a worker including master replica 
and slave replica.   |
 |             diskFileCount              | master and worker |                 
               The count of disk files consumption by each user.                
                |
 |            diskBytesWritten            | master and worker |                 
              The amount of disk files consumption by each user.                
                |
 |             hdfsFileCount              | master and worker |                 
               The count of hdfs files consumption by each user.                
                |
 |            hdfsBytesWritten            | master and worker |                 
              The amount of hdfs files consumption by each user.                
                |
-|         RegisteredShuffleCount         | master and worker |                 
                 The value means count of registered shuffle.                   
                |
 |            CommitFilesTime             |      worker       |                 
          CommitFiles means flush and close a shuffle partition file.           
                |
 |            ReserveSlotsTime            |      worker       |                 
    ReserveSlots means acquire a disk buffer and record partition location.     
                |
 |             FlushDataTime              |      worker       |                 
                 FlushData means flush a disk buffer to disk.                   
                |
@@ -113,8 +113,6 @@ Here is an example of Grafana dashboard importing.
 |               DiskBuffer               |      worker       | Disk buffers 
are part of netty used memory, means data need to write to disk but haven't 
been written to disk.  |
 |             PausePushData              |      worker       |                 
  PausePushData means the count of worker stopped receiving data from client.   
                |
 |       PausePushDataAndReplicate        |      worker       |    
PausePushDataAndReplicate means the count of worker stopped receiving data from 
client and other workers.    |
-|           ActiveShuffleSize            |      worker       |                 
The active shuffle size of a worker including master replica and slave replica. 
                |
-|         ActiveShuffleFileCount         |      worker       |              
The active shuffle file count of a worker including master replica and slave 
replica.              |
 |              jvm_gc_count              |        JVM        |                 
                    The GC count of each garbage collector.                     
                |
 |              jvm_gc_time               |        JVM        |                 
                  The GC cost time of each garbage collector.                   
                |
 |          jvm_memory_heap_init          |        JVM        |                 
                        The amount of heap init memory.                         
                |
diff --git a/assets/grafana/celeborn-dashboard.json 
b/assets/grafana/celeborn-dashboard.json
index e2a3dc30c..653d5863e 100644
--- a/assets/grafana/celeborn-dashboard.json
+++ b/assets/grafana/celeborn-dashboard.json
@@ -626,13 +626,13 @@
                 "uid": "${DS_PROMETHEUS}"
               },
               "editorMode": "code",
-              "expr": "metrics_PartitionWritten_Value",
+              "expr": "metrics_ActiveShuffleSize_Value{role=\"Master\"}",
               "legendFormat": "${baseLegend}",
               "range": true,
               "refId": "A"
             }
           ],
-          "title": "metrics_PartitionWritten_Value",
+          "title": "metrics_ActiveShuffleSize_Value",
           "type": "timeseries"
         },
         {
@@ -718,13 +718,13 @@
                 "uid": "${DS_PROMETHEUS}"
               },
               "editorMode": "code",
-              "expr": "metrics_PartitionFileCount_Value",
+              "expr": "metrics_ActiveShuffleFileCount_Value{role=\"Master\"}",
               "legendFormat": "${baseLegend}",
               "range": true,
               "refId": "A"
             }
           ],
-          "title": "metrics_PartitionFileCount_Value",
+          "title": "metrics_ActiveShuffleFileCount_Value",
           "type": "timeseries"
         },
         {
@@ -1845,7 +1845,7 @@
                 "uid": "${DS_PROMETHEUS}"
               },
               "editorMode": "builder",
-              "expr": "metrics_ActiveShuffleSize_Value",
+              "expr": "metrics_ActiveShuffleSize_Value{role=\"Worker\"}",
               "legendFormat": "${baseLegend}",
               "instant": false,
               "range": true,
@@ -1964,7 +1964,7 @@
                 "uid": "${DS_PROMETHEUS}"
               },
               "editorMode": "builder",
-              "expr": "metrics_ActiveShuffleFileCount_Value",
+              "expr": "metrics_ActiveShuffleFileCount_Value{role=\"Worker\"}",
               "legendFormat": "${baseLegend}",
               "instant": false,
               "range": true,
diff --git a/docs/migration.md b/docs/migration.md
index 15937d3e9..538b7ada0 100644
--- a/docs/migration.md
+++ b/docs/migration.md
@@ -44,6 +44,10 @@ license: |
 
 - Since 0.4.0, Celeborn deprecate `celeborn.storage.activeTypes`. Please use 
`celeborn.storage.availableTypes` instead.
 
+- Since 0.4.0, the Celeborn master metric `PartitionWritten` is renamed to 
`ActiveShuffleSize`.
+
+- Since 0.4.0, the Celeborn master metric `PartitionFileCount` is renamed to 
`ActiveShuffleFileCount`.
+
 ## Upgrading from 0.3.1 to 0.3.2
 
 - Since 0.3.1, Celeborn changed the default value of 
`raft.client.rpc.request.timeout` from `3s` to `10s`.
diff --git a/docs/monitoring.md b/docs/monitoring.md
index 3aa210907..64592932f 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -99,9 +99,9 @@ These metrics are exposed by Celeborn master.
     - IsActiveMaster
     - PartitionSize
         - The size of estimated shuffle partition.
-    - PartitionWritten
+    - ActiveShuffleSize
         - The active shuffle size of workers.
-    - PartitionFileCount
+    - ActiveShuffleFileCount
         - The active shuffle file count of workers.
     - OfferSlotsTime
         - The time for masters to handle `RequestSlots` request when 
registering shuffle.
diff --git 
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala 
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
index 2235d8cfc..3fcb94b01 100644
--- 
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
+++ 
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
@@ -168,7 +168,7 @@ private[celeborn] class Master(
     statusSystem.appHeartbeatTime.size
   }
   masterSource.addGauge(MasterSource.PARTITION_SIZE) { () => 
statusSystem.estimatedPartitionSize }
-  masterSource.addGauge(MasterSource.PARTITION_WRITTEN) { () =>
+  masterSource.addGauge(MasterSource.ACTIVE_SHUFFLE_SIZE) { () =>
     statusSystem.workers.parallelStream()
       .mapToLong(new ToLongFunction[WorkerInfo]() {
         override def applyAsLong(value: WorkerInfo): Long =
@@ -178,7 +178,7 @@ private[celeborn] class Master(
             }).sum()
       }).sum()
   }
-  masterSource.addGauge(MasterSource.PARTITION_FILE_COUNT) { () =>
+  masterSource.addGauge(MasterSource.ACTIVE_SHUFFLE_FILE_COUNT) { () =>
     statusSystem.workers.parallelStream()
       .mapToLong(new ToLongFunction[WorkerInfo]() {
         override def applyAsLong(value: WorkerInfo): Long =
diff --git 
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/MasterSource.scala
 
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/MasterSource.scala
index 8010f0a74..0fbb35121 100644
--- 
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/MasterSource.scala
+++ 
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/MasterSource.scala
@@ -46,9 +46,9 @@ object MasterSource {
 
   val PARTITION_SIZE = "PartitionSize"
 
-  val PARTITION_WRITTEN = "PartitionWritten"
+  val ACTIVE_SHUFFLE_SIZE = "ActiveShuffleSize"
 
-  val PARTITION_FILE_COUNT = "PartitionFileCount"
+  val ACTIVE_SHUFFLE_FILE_COUNT = "ActiveShuffleFileCount"
 
   val OFFER_SLOTS_TIME = "OfferSlotsTime"
 }

Reply via email to