This is an automated email from the ASF dual-hosted git repository.

nicholasjiang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/celeborn.git


The following commit(s) were added to refs/heads/main by this push:
     new 308eed28c [CELEBORN-1427] Add Capacity metrics for Celeborn
308eed28c is described below

commit 308eed28c92263d7f96cce073c0c9916da5cb1bf
Author: Shuang <[email protected]>
AuthorDate: Thu May 23 16:06:11 2024 +0800

    [CELEBORN-1427] Add Capacity metrics for Celeborn
    
    ### What changes were proposed in this pull request?
    As title
    
    ### Why are the changes needed?
    The Celeborn cluster does not currently provide metrics for 'TotalCapacity' 
and 'TotalFreeCapacity
    
    ### Does this PR introduce _any_ user-facing change?
    No.
    
    ### How was this patch tested?
    Pass GA
    
    Closes #2521 from RexXiong/CELEBORN-1427.
    
    Authored-by: Shuang <[email protected]>
    Signed-off-by: SteNicholas <[email protected]>
---
 METRICS.md                                         |   4 +-
 assets/grafana/celeborn-dashboard.json             | 206 +++++++++++++++++++--
 common/src/main/proto/TransportMessages.proto      |   1 +
 .../apache/celeborn/common/meta/DeviceInfo.scala   |   8 +
 .../apache/celeborn/common/meta/WorkerInfo.scala   |   9 +
 .../apache/celeborn/common/util/PbSerDeUtils.scala |   2 +
 .../celeborn/common/meta/WorkerInfoSuite.scala     |  19 +-
 .../celeborn/common/util/PbSerDeUtilsTest.scala    |   5 +
 docs/monitoring.md                                 |   2 +
 .../celeborn/service/deploy/master/Master.scala    |   9 +
 .../service/deploy/master/MasterSource.scala       |   4 +
 .../deploy/worker/storage/DeviceMonitor.scala      |   2 +-
 .../deploy/worker/storage/StorageManager.scala     |   1 +
 13 files changed, 250 insertions(+), 22 deletions(-)

diff --git a/METRICS.md b/METRICS.md
index ec40d638b..09532d088 100644
--- a/METRICS.md
+++ b/METRICS.md
@@ -84,6 +84,8 @@ Here is an example of Grafana dashboard importing.
 |            diskBytesWritten            | master and worker |                 
                  The amount of disk files consumption by each user.            
                       |
 |             hdfsFileCount              | master and worker |                 
                  The count of hdfs files consumption by each user.             
                       |
 |            hdfsBytesWritten            | master and worker |                 
                  The amount of hdfs files consumption by each user.            
                       |
+|        DeviceCelebornFreeBytes         | master and worker |                 
             This value means actual usable space of Celeborn for device.       
                       |
+|        DeviceCelebornTotalBytes        | master and worker |                 
                 This value means total space of Celeborn for device.           
                       |
 |              WorkerCount               |      master       |                 
                             The count of active workers.                       
                       |
 |            LostWorkerCount             |      master       |                 
                          The count of workers in lost list.                    
                       |
 |          ExcludedWorkerCount           |      master       |                 
                        The count of workers in excluded list.                  
                       |
@@ -142,8 +144,6 @@ Here is an example of Grafana dashboard importing.
 |        ActiveMapPartitionCount         |      worker       |                 
           This value means count of active map partition reading streams.      
                       |
 |           DeviceOSFreeBytes            |      worker       |                 
            This value means actual usable space of OS for device monitor.      
                       |
 |           DeviceOSTotalBytes           |      worker       |                 
            This value means total usable space of OS for device monitor.       
                       |
-|        DeviceCelebornFreeBytes         |      worker       |                 
         This value means actual usable space of Celeborn for device monitor.   
                       |
-|        DeviceCelebornTotalBytes        |      worker       |                 
       This value means configured usable space of Celeborn for device monitor. 
                       |
 |         PotentialConsumeSpeed          |      worker       |                 
       This value means speed of potential consumption for congestion control.  
                       |
 |            UserProduceSpeed            |      worker       |                 
          This value means speed of user production for congestion control.     
                       |
 |           WorkerConsumeSpeed           |      worker       |                 
         This value means speed of worker consumption for congestion control.   
                       |
diff --git a/assets/grafana/celeborn-dashboard.json 
b/assets/grafana/celeborn-dashboard.json
index a8fe453dd..f7b4de9cf 100644
--- a/assets/grafana/celeborn-dashboard.json
+++ b/assets/grafana/celeborn-dashboard.json
@@ -249,6 +249,186 @@
           "title": "metrics_RegisteredShuffleCount_Value",
           "type": "timeseries"
         },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "description": "Celeborn total device capacity.",
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "custom": {
+                "axisLabel": "",
+                "axisPlacement": "auto",
+                "barAlignment": 0,
+                "drawStyle": "line",
+                "fillOpacity": 0,
+                "gradientMode": "none",
+                "hideFrom": {
+                  "legend": false,
+                  "tooltip": false,
+                  "viz": false
+                },
+                "lineInterpolation": "linear",
+                "lineWidth": 1,
+                "pointSize": 5,
+                "scaleDistribution": {
+                  "type": "linear"
+                },
+                "showPoints": "auto",
+                "spanNulls": false,
+                "stacking": {
+                  "group": "A",
+                  "mode": "none"
+                },
+                "thresholdsStyle": {
+                  "mode": "off"
+                }
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "red",
+                    "value": 80
+                  }
+                ]
+              },
+              "unit": "bytes"
+            },
+            "overrides": []
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 0,
+            "y": 10
+          },
+          "id": 185,
+          "options": {
+            "legend": {
+              "calcs": [],
+              "displayMode": "list",
+              "placement": "bottom",
+              "showLegend": true
+            },
+            "tooltip": {
+              "mode": "single",
+              "sort": "none"
+            }
+          },
+          "targets": [
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "${DS_PROMETHEUS}"
+              },
+              "expr": 
"metrics_DeviceCelebornTotalBytes_Value{role=\"Master\"}",
+              "legendFormat": "${baseLegend}",
+              "refId": "A"
+            }
+          ],
+          "title": "metrics_DeviceCelebornTotalBytes_Value",
+          "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "description": "Celeborn total device free capacity.",
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "custom": {
+                "axisLabel": "",
+                "axisPlacement": "auto",
+                "barAlignment": 0,
+                "drawStyle": "line",
+                "fillOpacity": 0,
+                "gradientMode": "none",
+                "hideFrom": {
+                  "legend": false,
+                  "tooltip": false,
+                  "viz": false
+                },
+                "lineInterpolation": "linear",
+                "lineWidth": 1,
+                "pointSize": 5,
+                "scaleDistribution": {
+                  "type": "linear"
+                },
+                "showPoints": "auto",
+                "spanNulls": false,
+                "stacking": {
+                  "group": "A",
+                  "mode": "none"
+                },
+                "thresholdsStyle": {
+                  "mode": "off"
+                }
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "red",
+                    "value": 80
+                  }
+                ]
+              },
+              "unit": "bytes"
+            },
+            "overrides": []
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 12,
+            "y": 10
+          },
+          "id": 186,
+          "options": {
+            "legend": {
+              "calcs": [],
+              "displayMode": "list",
+              "placement": "bottom",
+              "showLegend": true
+            },
+            "tooltip": {
+              "mode": "single",
+              "sort": "none"
+            }
+          },
+          "targets": [
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "${DS_PROMETHEUS}"
+              },
+              "expr": "metrics_DeviceCelebornFreeBytes_Value{role=\"Master\"}",
+              "legendFormat": "${baseLegend}",
+              "refId": "A"
+            }
+          ],
+          "title": "metrics_DeviceCelebornFreeBytes_Value",
+          "type": "timeseries"
+        },
         {
           "datasource": {
             "type": "prometheus",
@@ -310,7 +490,7 @@
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 10
+            "y": 19
           },
           "id": 95,
           "options": {
@@ -349,7 +529,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 19
+        "y": 1
       },
       "id": 119,
       "panels": [
@@ -1188,7 +1368,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 60
+        "y": 2
       },
       "id": 28,
       "panels": [
@@ -2200,7 +2380,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 109
+        "y": 3
       },
       "id": 134,
       "panels": [
@@ -3393,7 +3573,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 166
+        "y": 4
       },
       "id": 12,
       "panels": [
@@ -4222,7 +4402,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 207
+        "y": 5
       },
       "id": 10,
       "panels": [
@@ -4776,7 +4956,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 232
+        "y": 6
       },
       "id": 8,
       "panels": [
@@ -5798,7 +5978,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 281
+        "y": 7
       },
       "id": 50,
       "panels": [
@@ -6354,7 +6534,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 306
+        "y": 8
       },
       "id": 157,
       "panels": [
@@ -6647,7 +6827,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 323
+        "y": 9
       },
       "id": 137,
       "panels": [
@@ -8038,7 +8218,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 388
+        "y": 10
       },
       "id": 110,
       "panels": [
@@ -8234,7 +8414,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 397
+        "y": 11
       },
       "id": 123,
       "panels": [
@@ -8712,7 +8892,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 422
+        "y": 12
       },
       "id": 172,
       "panels": [
diff --git a/common/src/main/proto/TransportMessages.proto 
b/common/src/main/proto/TransportMessages.proto
index 7c457bdf7..95b1769d1 100644
--- a/common/src/main/proto/TransportMessages.proto
+++ b/common/src/main/proto/TransportMessages.proto
@@ -158,6 +158,7 @@ message PbDiskInfo {
   int32 status = 5;
   int64 avgFetchTime = 6;
   int32 storageType = 7;
+  int64 totalSpace = 8;
 }
 
 message PbWorkerInfo {
diff --git 
a/common/src/main/scala/org/apache/celeborn/common/meta/DeviceInfo.scala 
b/common/src/main/scala/org/apache/celeborn/common/meta/DeviceInfo.scala
index b0438bd1b..0fc2f4984 100644
--- a/common/src/main/scala/org/apache/celeborn/common/meta/DeviceInfo.scala
+++ b/common/src/main/scala/org/apache/celeborn/common/meta/DeviceInfo.scala
@@ -82,6 +82,7 @@ class DiskInfo(
   var status: DiskStatus = DiskStatus.HEALTHY
   var threadCount = 1
   var configuredUsableSpace = 0L
+  var totalSpace = 0L
   var storageType: StorageInfo.Type = StorageInfo.Type.SSD
   var maxSlots: Long = 0
   lazy val shuffleAllocations = new util.HashMap[String, Integer]()
@@ -101,6 +102,11 @@ class DiskInfo(
     this
   }
 
+  def setTotalSpace(totalSpace: Long): this.type = this.synchronized {
+    this.totalSpace = totalSpace
+    this
+  }
+
   def updateFlushTime(): Unit = {
     avgFlushTime = flushTimeMetrics.getAverage()
   }
@@ -175,6 +181,7 @@ class DiskInfo(
       s" shuffleAllocations: ${nonEmptyShuffles.toMap}," +
       s" mountPoint: $mountPoint," +
       s" usableSpace: ${Utils.bytesToString(actualUsableSpace)}," +
+      s" totalSpace: ${Utils.bytesToString(totalSpace)}," +
       s" avgFlushTime: ${Utils.nanoDurationToString(avgFlushTime)}," +
       s" avgFetchTime: ${Utils.nanoDurationToString(avgFetchTime)}," +
       s" activeSlots: $activeSlots," +
@@ -286,6 +293,7 @@ object DeviceInfo {
             conf)
           val (_, maxUsableSpace, threadCount, storageType) = dirs(0)
           diskInfo.configuredUsableSpace = maxUsableSpace
+          diskInfo.totalSpace = maxUsableSpace
           diskInfo.threadCount = threadCount
           diskInfo.storageType = storageType
           deviceInfo.addDiskInfo(diskInfo)
diff --git 
a/common/src/main/scala/org/apache/celeborn/common/meta/WorkerInfo.scala 
b/common/src/main/scala/org/apache/celeborn/common/meta/WorkerInfo.scala
index 3a1446be3..a0eedc3f9 100644
--- a/common/src/main/scala/org/apache/celeborn/common/meta/WorkerInfo.scala
+++ b/common/src/main/scala/org/apache/celeborn/common/meta/WorkerInfo.scala
@@ -188,6 +188,14 @@ class WorkerInfo(
     diskInfos.asScala.map(_._2.availableSlots()).sum
   }
 
+  def totalSpace(): Long = this.synchronized {
+    diskInfos.asScala.map(_._2.totalSpace).sum
+  }
+
+  def totalActualUsableSpace(): Long = this.synchronized {
+    diskInfos.asScala.map(_._2.actualUsableSpace).sum
+  }
+
   def updateThenGetDiskInfos(
       newDiskInfos: java.util.Map[String, DiskInfo],
       estimatedPartitionSize: Option[Long] = None): util.Map[String, DiskInfo] 
= this.synchronized {
@@ -197,6 +205,7 @@ class WorkerInfo(
       val curDisk = diskInfos.get(mountPoint)
       if (curDisk != null) {
         curDisk.actualUsableSpace = newDisk.actualUsableSpace
+        curDisk.totalSpace = newDisk.totalSpace
         // Update master's diskinfo activeslots to worker's value
         curDisk.activeSlots = newDisk.activeSlots
         curDisk.avgFlushTime = newDisk.avgFlushTime
diff --git 
a/common/src/main/scala/org/apache/celeborn/common/util/PbSerDeUtils.scala 
b/common/src/main/scala/org/apache/celeborn/common/util/PbSerDeUtils.scala
index f2ef85c48..cffe76857 100644
--- a/common/src/main/scala/org/apache/celeborn/common/util/PbSerDeUtils.scala
+++ b/common/src/main/scala/org/apache/celeborn/common/util/PbSerDeUtils.scala
@@ -72,6 +72,7 @@ object PbSerDeUtils {
       pbDiskInfo.getAvgFetchTime,
       pbDiskInfo.getUsedSlots)
       .setStatus(Utils.toDiskStatus(pbDiskInfo.getStatus))
+      .setTotalSpace(pbDiskInfo.getTotalSpace)
     
diskInfo.setStorageType(StorageInfo.typesMap.get(pbDiskInfo.getStorageType))
     diskInfo
   }
@@ -85,6 +86,7 @@ object PbSerDeUtils {
       .setUsedSlots(diskInfo.activeSlots)
       .setStatus(diskInfo.status.getValue)
       .setStorageType(diskInfo.storageType.getValue)
+      .setTotalSpace(diskInfo.totalSpace)
       .build
 
   def fromPbFileInfo(pbFileInfo: PbFileInfo): DiskFileInfo =
diff --git 
a/common/src/test/scala/org/apache/celeborn/common/meta/WorkerInfoSuite.scala 
b/common/src/test/scala/org/apache/celeborn/common/meta/WorkerInfoSuite.scala
index 2dc817c9d..401177e63 100644
--- 
a/common/src/test/scala/org/apache/celeborn/common/meta/WorkerInfoSuite.scala
+++ 
b/common/src/test/scala/org/apache/celeborn/common/meta/WorkerInfoSuite.scala
@@ -246,9 +246,16 @@ class WorkerInfoSuite extends CelebornFunSuite {
       null)
 
     val disks = new util.HashMap[String, DiskInfo]()
-    disks.put("disk1", new DiskInfo("disk1", Int.MaxValue, 1, 1, 10))
-    disks.put("disk2", new DiskInfo("disk2", Int.MaxValue, 2, 2, 20))
-    disks.put("disk3", new DiskInfo("disk3", Int.MaxValue, 3, 3, 30))
+    val diskInfo1 = new DiskInfo("disk1", Int.MaxValue, 1, 1, 10)
+    val diskInfo2 = new DiskInfo("disk2", Int.MaxValue, 2, 2, 20)
+    val diskInfo3 = new DiskInfo("disk3", Int.MaxValue, 3, 3, 30)
+    diskInfo1.setTotalSpace(Int.MaxValue)
+    diskInfo2.setTotalSpace(Int.MaxValue)
+    diskInfo3.setTotalSpace(Int.MaxValue)
+
+    disks.put("disk1", diskInfo1)
+    disks.put("disk2", diskInfo2)
+    disks.put("disk3", diskInfo3)
     val userResourceConsumption =
       JavaUtils.newConcurrentHashMap[UserIdentifier, ResourceConsumption]()
     userResourceConsumption.put(
@@ -341,9 +348,9 @@ class WorkerInfoSuite extends CelebornFunSuite {
            |SlotsUsed: 60
            |LastHeartbeat: 0
            |Disks: $placeholder
-           |  DiskInfo0: DiskInfo(maxSlots: 0, committed shuffles 0, running 
applications 0, shuffleAllocations: Map(), mountPoint: disk3, usableSpace: 
2048.0 MiB, avgFlushTime: 3 ns, avgFetchTime: 3 ns, activeSlots: 30, 
storageType: SSD) status: HEALTHY dirs $placeholder
-           |  DiskInfo1: DiskInfo(maxSlots: 0, committed shuffles 0, running 
applications 0, shuffleAllocations: Map(), mountPoint: disk1, usableSpace: 
2048.0 MiB, avgFlushTime: 1 ns, avgFetchTime: 1 ns, activeSlots: 10, 
storageType: SSD) status: HEALTHY dirs $placeholder
-           |  DiskInfo2: DiskInfo(maxSlots: 0, committed shuffles 0, running 
applications 0, shuffleAllocations: Map(), mountPoint: disk2, usableSpace: 
2048.0 MiB, avgFlushTime: 2 ns, avgFetchTime: 2 ns, activeSlots: 20, 
storageType: SSD) status: HEALTHY dirs $placeholder
+           |  DiskInfo0: DiskInfo(maxSlots: 0, committed shuffles 0, running 
applications 0, shuffleAllocations: Map(), mountPoint: disk3, usableSpace: 
2048.0 MiB, totalSpace: 2048.0 MiB, avgFlushTime: 3 ns, avgFetchTime: 3 ns, 
activeSlots: 30, storageType: SSD) status: HEALTHY dirs $placeholder
+           |  DiskInfo1: DiskInfo(maxSlots: 0, committed shuffles 0, running 
applications 0, shuffleAllocations: Map(), mountPoint: disk1, usableSpace: 
2048.0 MiB, totalSpace: 2048.0 MiB, avgFlushTime: 1 ns, avgFetchTime: 1 ns, 
activeSlots: 10, storageType: SSD) status: HEALTHY dirs $placeholder
+           |  DiskInfo2: DiskInfo(maxSlots: 0, committed shuffles 0, running 
applications 0, shuffleAllocations: Map(), mountPoint: disk2, usableSpace: 
2048.0 MiB, totalSpace: 2048.0 MiB, avgFlushTime: 2 ns, avgFetchTime: 2 ns, 
activeSlots: 20, storageType: SSD) status: HEALTHY dirs $placeholder
            |UserResourceConsumption: $placeholder
            |  UserIdentifier: `tenant1`.`name1`, ResourceConsumption: 
ResourceConsumption(diskBytesWritten: 20.0 MiB, diskFileCount: 1, 
hdfsBytesWritten: 50.0 MiB, hdfsFileCount: 1, subResourceConsumptions: 
(application_1697697127390_2171854 -> ResourceConsumption(diskBytesWritten: 
20.0 MiB, diskFileCount: 1, hdfsBytesWritten: 50.0 MiB, hdfsFileCount: 1, 
subResourceConsumptions: empty)))
            |WorkerRef: null
diff --git 
a/common/src/test/scala/org/apache/celeborn/common/util/PbSerDeUtilsTest.scala 
b/common/src/test/scala/org/apache/celeborn/common/util/PbSerDeUtilsTest.scala
index 5166c7965..840b9896d 100644
--- 
a/common/src/test/scala/org/apache/celeborn/common/util/PbSerDeUtilsTest.scala
+++ 
b/common/src/test/scala/org/apache/celeborn/common/util/PbSerDeUtilsTest.scala
@@ -50,6 +50,9 @@ class PbSerDeUtilsTest extends CelebornFunSuite {
   val device = new DeviceInfo("device-a")
   val diskInfo1 = new DiskInfo("/mnt/disk/0", 1000, 1000, 1000, 1000, files, 
device)
   val diskInfo2 = new DiskInfo("/mnt/disk/1", 2000, 2000, 2000, 2000, files, 
device)
+  diskInfo1.setTotalSpace(100000000)
+  diskInfo1.setTotalSpace(200000000)
+
   val diskInfos = new util.HashMap[String, DiskInfo]()
   diskInfos.put("disk1", diskInfo1)
   diskInfos.put("disk2", diskInfo2)
@@ -170,6 +173,8 @@ class PbSerDeUtilsTest extends CelebornFunSuite {
     assert(restoredDiskInfo.avgFlushTime.equals(diskInfo1.avgFlushTime))
     assert(restoredDiskInfo.avgFetchTime.equals(diskInfo1.avgFetchTime))
     assert(restoredDiskInfo.activeSlots.equals(diskInfo1.activeSlots))
+    assert(restoredDiskInfo.totalSpace.equals(diskInfo1.totalSpace))
+
     assert(restoredDiskInfo.dirs.equals(List.empty))
     assert(restoredDiskInfo.deviceInfo == null)
   }
diff --git a/docs/monitoring.md b/docs/monitoring.md
index 098a8f910..e9a1f3f56 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -92,6 +92,8 @@ These metrics are exposed by Celeborn master.
 
   - namespace=master 
     - RegisteredShuffleCount
+    - DeviceCelebornFreeBytes
+    - DeviceCelebornTotalBytes
     - RunningApplicationCount
     - ActiveShuffleSize
         - The active shuffle size of workers.
diff --git 
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala 
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
index b7f51b00b..9fb5d594b 100644
--- 
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
+++ 
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
@@ -247,6 +247,15 @@ private[celeborn] class Master(
             }).sum()
       }).sum()
   }
+
+  masterSource.addGauge(MasterSource.DEVICE_CELEBORN_TOTAL_CAPACITY) { () =>
+    statusSystem.workers.asScala.map(_.totalSpace()).sum
+  }
+
+  masterSource.addGauge(MasterSource.DEVICE_CELEBORN_FREE_CAPACITY) { () =>
+    statusSystem.workers.asScala.map(_.totalActualUsableSpace()).sum
+  }
+
   masterSource.addGauge(MasterSource.IS_ACTIVE_MASTER) { () => isMasterActive }
 
   private val threadsStarted: AtomicBoolean = new AtomicBoolean(false)
diff --git 
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/MasterSource.scala
 
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/MasterSource.scala
index b0b3e008e..94ef037be 100644
--- 
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/MasterSource.scala
+++ 
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/MasterSource.scala
@@ -53,4 +53,8 @@ object MasterSource {
   val ACTIVE_SHUFFLE_FILE_COUNT = "ActiveShuffleFileCount"
 
   val OFFER_SLOTS_TIME = "OfferSlotsTime"
+
+  // Capacity
+  val DEVICE_CELEBORN_FREE_CAPACITY = "DeviceCelebornFreeBytes"
+  val DEVICE_CELEBORN_TOTAL_CAPACITY = "DeviceCelebornTotalBytes"
 }
diff --git 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/storage/DeviceMonitor.scala
 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/storage/DeviceMonitor.scala
index 7efa568b1..ae3304308 100644
--- 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/storage/DeviceMonitor.scala
+++ 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/storage/DeviceMonitor.scala
@@ -94,7 +94,7 @@ class LocalDeviceMonitor(
           usage.freeSpace
         }
         workerSource.addGauge(WorkerSource.DEVICE_CELEBORN_TOTAL_CAPACITY, 
deviceLabel) { () =>
-          diskInfos.map(_.configuredUsableSpace).sum
+          diskInfos.map(_.totalSpace).sum
         }
         workerSource.addGauge(WorkerSource.DEVICE_CELEBORN_FREE_CAPACITY, 
deviceLabel) { () =>
           diskInfos.map(_.actualUsableSpace).sum
diff --git 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/storage/StorageManager.scala
 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/storage/StorageManager.scala
index 194e13378..82e0f2ef1 100644
--- 
a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/storage/StorageManager.scala
+++ 
b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/storage/StorageManager.scala
@@ -726,6 +726,7 @@ final private[worker] class StorageManager(conf: 
CelebornConf, workerSource: Abs
         Math.min(diskInfo.configuredUsableSpace - totalUsage, 
fileSystemReportedUsableSpace)
       logDebug(s"updateDiskInfos  workingDirUsableSpace:$workingDirUsableSpace 
filemeta:$fileSystemReportedUsableSpace conf:${diskInfo.configuredUsableSpace} 
totalUsage:$totalUsage")
       diskInfo.setUsableSpace(workingDirUsableSpace)
+      diskInfo.setTotalSpace(totalUsage + workingDirUsableSpace)
       diskInfo.updateFlushTime()
       diskInfo.updateFetchTime()
     }

Reply via email to