This is an automated email from the ASF dual-hosted git repository.

chengpan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-celeborn.git


The following commit(s) were added to refs/heads/main by this push:
     new 5310bcaf6 [CELEBORN-313] Add rest endpoint to show master group info
5310bcaf6 is described below

commit 5310bcaf6bf4221734632432cfac9b0a59337255
Author: sychen <[email protected]>
AuthorDate: Thu Sep 28 20:08:31 2023 +0800

    [CELEBORN-313] Add rest endpoint to show master group info
    
    ### What changes were proposed in this pull request?
    
    <img width="1347" alt="image" 
src="https://github.com/apache/incubator-celeborn/assets/3898450/43d10bff-6878-4591-9461-889494d797f9">
    
    ### Why are the changes needed?
    
    ### Does this PR introduce _any_ user-facing change?
    No.
    
    ### How was this patch tested?
    
    ```bash
    ./bin/celeborn-ratis sh -Draft.rpc.type=NETTY group info -peers clb-1:9872,clb-2:9873,clb-3:9874
    ```
    
    ```
    group id: c5196f6d-2c34-3ed3-8b8a-47bede733167
    leader info: 1(clb-1:9872)
    
    [server {
      id: "1"
      address: "clb-1:9872"
      clientAddress: "clb-1:9097"
      startupRole: FOLLOWER
    }
    commitIndex: 316
    , server {
      id: "2"
      address: "clb-2:9873"
      clientAddress: "clb-2:9098"
      startupRole: FOLLOWER
    }
    commitIndex: 316
    , server {
      id: "3"
      address: "clb-3:9874"
      clientAddress: "clb-3:9099"
      startupRole: FOLLOWER
    }
    commitIndex: 316
    ]
    ```
    
    ```bash
    curl http://clb-3:9983/masterGroupInfo
    ```
    
    ```
    ====================== Master Group INFO ==============================
    group id: c5196f6d-2c34-3ed3-8b8a-47bede733167
    leader info: 1(clb-1:9872)
    
    [server {
      id: "3"
      address: "clb-3:9874"
      clientAddress: "clb-3:9099"
      startupRole: FOLLOWER
    }
    commitIndex: 316
    , server {
      id: "1"
      address: "clb-1:9872"
      clientAddress: "clb-1:9097"
      startupRole: FOLLOWER
    }
    commitIndex: 316
    , server {
      id: "2"
      address: "clb-2:9873"
      clientAddress: "clb-2:9098"
      startupRole: FOLLOWER
    }
    commitIndex: 316
    ]
    ```
    
    Closes #1946 from cxzl25/CELEBORN-313.
    
    Authored-by: sychen <[email protected]>
    Signed-off-by: Cheng Pan <[email protected]>
---
 docs/monitoring.md                                 |  1 +
 .../deploy/master/clustermeta/ha/HARaftServer.java |  2 +-
 .../celeborn/service/deploy/master/Master.scala    | 43 ++++++++++++++++++++++
 .../celeborn/server/common/HttpService.scala       |  2 +
 .../server/common/http/HttpRequestHandler.scala    |  2 +
 5 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/docs/monitoring.md b/docs/monitoring.md
index 8180e8a66..d6a7cf17e 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -291,6 +291,7 @@ API path listed as below:
 
|-----------------------|-------------------------------------------------------------------------------------------------------------|
 | /metrics/prometheus   | List the metrics data in prometheus format of the 
master.                                                   |
 | /conf                 | List the conf setting of the master.                 
                                                       |
+| /masterGroupInfo      | List master group information of the service. It 
will list all master's LEADER, FOLLOWER information.       |
 | /workerInfo           | List worker information of the service. It will list 
all registered workers 's information.                 |
 | /lostWorkers          | List all lost workers of the master.                 
                                                       |
 | /excludedWorkers      | List all excluded workers of the master.             
                                                       |
diff --git 
a/master/src/main/java/org/apache/celeborn/service/deploy/master/clustermeta/ha/HARaftServer.java
 
b/master/src/main/java/org/apache/celeborn/service/deploy/master/clustermeta/ha/HARaftServer.java
index 73149bc01..31e04a0ba 100644
--- 
a/master/src/main/java/org/apache/celeborn/service/deploy/master/clustermeta/ha/HARaftServer.java
+++ 
b/master/src/main/java/org/apache/celeborn/service/deploy/master/clustermeta/ha/HARaftServer.java
@@ -483,7 +483,7 @@ public class HARaftServer {
     }
   }
 
-  private GroupInfoReply getGroupInfo() throws IOException {
+  public GroupInfoReply getGroupInfo() throws IOException {
     GroupInfoRequest groupInfoRequest =
         new GroupInfoRequest(clientId, raftPeerId, RAFT_GROUP_ID, 
nextCallId());
     return server.getGroupInfo(groupInfoRequest);
diff --git 
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala 
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
index 1bf92b049..32bb22781 100644
--- 
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
+++ 
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
@@ -28,6 +28,8 @@ import scala.collection.mutable
 import scala.util.Random
 
 import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.ratis.proto.RaftProtos
+import org.apache.ratis.proto.RaftProtos.RaftPeerRole
 
 import org.apache.celeborn.common.CelebornConf
 import org.apache.celeborn.common.client.MasterClient
@@ -840,6 +842,13 @@ private[celeborn] class Master(
     }.asJava
   }
 
+  override def getMasterGroupInfo: String = {
+    val sb = new StringBuilder
+    sb.append("====================== Master Group INFO 
==============================\n")
+    sb.append(getMasterGroupInfoInternal())
+    sb.toString()
+  }
+
   override def getWorkerInfo: String = {
     val sb = new StringBuilder
     sb.append("====================== Workers Info in Master 
=========================\n")
@@ -940,6 +949,40 @@ private[celeborn] class Master(
     isActive
   }
 
+  private def getMasterGroupInfoInternal(): String = {
+    if (conf.haEnabled) {
+      val sb = new StringBuilder
+      val groupInfo = 
statusSystem.asInstanceOf[HAMasterMetaManager].getRatisServer.getGroupInfo
+      sb.append(s"group id: ${groupInfo.getGroup.getGroupId.getUuid}\n")
+
+      def getLeader(roleInfo: RaftProtos.RoleInfoProto): 
RaftProtos.RaftPeerProto = {
+        if (roleInfo == null) {
+          return null
+        }
+        if (roleInfo.getRole == RaftPeerRole.LEADER) {
+          return roleInfo.getSelf
+        }
+        val followerInfo = roleInfo.getFollowerInfo
+        if (followerInfo == null) {
+          return null
+        }
+        followerInfo.getLeaderInfo.getId
+      }
+
+      val leader = getLeader(groupInfo.getRoleInfoProto)
+      if (leader == null) {
+        sb.append("leader not found\n")
+      } else {
+        sb.append(s"leader info: 
${leader.getId.toStringUtf8}(${leader.getAddress})\n\n")
+      }
+      sb.append(groupInfo.getCommitInfos)
+      sb.append("\n")
+      sb.toString()
+    } else {
+      "HA is not enabled"
+    }
+  }
+
   override def initialize(): Unit = {
     super.initialize()
     logInfo("Master started.")
diff --git 
a/service/src/main/scala/org/apache/celeborn/server/common/HttpService.scala 
b/service/src/main/scala/org/apache/celeborn/server/common/HttpService.scala
index 6b5ec4196..868c0e1fc 100644
--- a/service/src/main/scala/org/apache/celeborn/server/common/HttpService.scala
+++ b/service/src/main/scala/org/apache/celeborn/server/common/HttpService.scala
@@ -41,6 +41,8 @@ abstract class HttpService extends Service with Logging {
     sb.toString()
   }
 
+  def getMasterGroupInfo: String = throw new UnsupportedOperationException()
+
   def getWorkerInfo: String
 
   def getLostWorkers: String = throw new UnsupportedOperationException()
diff --git 
a/service/src/main/scala/org/apache/celeborn/server/common/http/HttpRequestHandler.scala
 
b/service/src/main/scala/org/apache/celeborn/server/common/http/HttpRequestHandler.scala
index 779b8e476..2115aef14 100644
--- 
a/service/src/main/scala/org/apache/celeborn/server/common/http/HttpRequestHandler.scala
+++ 
b/service/src/main/scala/org/apache/celeborn/server/common/http/HttpRequestHandler.scala
@@ -65,6 +65,8 @@ class HttpRequestHandler(
     path match {
       case "/conf" =>
         service.getConf
+      case "/masterGroupInfo" if service.serviceName == Service.MASTER =>
+        service.getMasterGroupInfo
       case "/workerInfo" =>
         service.getWorkerInfo
       case "/lostWorkers" if service.serviceName == Service.MASTER =>

Reply via email to