This is an automated email from the ASF dual-hosted git repository.
chengpan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-celeborn.git
The following commit(s) were added to refs/heads/main by this push:
new 5310bcaf6 [CELEBORN-313] Add rest endpoint to show master group info
5310bcaf6 is described below
commit 5310bcaf6bf4221734632432cfac9b0a59337255
Author: sychen <[email protected]>
AuthorDate: Thu Sep 28 20:08:31 2023 +0800
[CELEBORN-313] Add rest endpoint to show master group info
### What changes were proposed in this pull request?
<img width="1347" alt="image"
src="https://github.com/apache/incubator-celeborn/assets/3898450/43d10bff-6878-4591-9461-889494d797f9">
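### Why are the changes needed?
So that the Ratis master group state (group id, current leader, and each peer's commit index) can be queried over the master's HTTP API instead of only through the `celeborn-ratis` shell.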
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
```bash
./bin/celeborn-ratis sh -Draft.rpc.type=NETTY group info -peers
clb-1:9872,clb-2:9873,clb-3:9874
```
```
group id: c5196f6d-2c34-3ed3-8b8a-47bede733167
leader info: 1(clb-1:9872)
[server {
id: "1"
address: "clb-1:9872"
clientAddress: "clb-1:9097"
startupRole: FOLLOWER
}
commitIndex: 316
, server {
id: "2"
address: "clb-2:9873"
clientAddress: "clb-2:9098"
startupRole: FOLLOWER
}
commitIndex: 316
, server {
id: "3"
address: "clb-3:9874"
clientAddress: "clb-3:9099"
startupRole: FOLLOWER
}
commitIndex: 316
]
```
```bash
curl http://clb-3:9983/masterGroupInfo
```
```
====================== Master Group INFO ==============================
group id: c5196f6d-2c34-3ed3-8b8a-47bede733167
leader info: 1(clb-1:9872)
[server {
id: "3"
address: "clb-3:9874"
clientAddress: "clb-3:9099"
startupRole: FOLLOWER
}
commitIndex: 316
, server {
id: "1"
address: "clb-1:9872"
clientAddress: "clb-1:9097"
startupRole: FOLLOWER
}
commitIndex: 316
, server {
id: "2"
address: "clb-2:9873"
clientAddress: "clb-2:9098"
startupRole: FOLLOWER
}
commitIndex: 316
]
```
Closes #1946 from cxzl25/CELEBORN-313.
Authored-by: sychen <[email protected]>
Signed-off-by: Cheng Pan <[email protected]>
---
docs/monitoring.md | 1 +
.../deploy/master/clustermeta/ha/HARaftServer.java | 2 +-
.../celeborn/service/deploy/master/Master.scala | 43 ++++++++++++++++++++++
.../celeborn/server/common/HttpService.scala | 2 +
.../server/common/http/HttpRequestHandler.scala | 2 +
5 files changed, 49 insertions(+), 1 deletion(-)
diff --git a/docs/monitoring.md b/docs/monitoring.md
index 8180e8a66..d6a7cf17e 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -291,6 +291,7 @@ API path listed as below:
|-----------------------|-------------------------------------------------------------------------------------------------------------|
| /metrics/prometheus | List the metrics data in prometheus format of the
master. |
| /conf | List the conf setting of the master.
|
+| /masterGroupInfo | List master group information of the service. It
will list the LEADER and FOLLOWER information of all masters. |
| /workerInfo | List worker information of the service. It will list
all registered workers 's information. |
| /lostWorkers | List all lost workers of the master.
|
| /excludedWorkers | List all excluded workers of the master.
|
diff --git
a/master/src/main/java/org/apache/celeborn/service/deploy/master/clustermeta/ha/HARaftServer.java
b/master/src/main/java/org/apache/celeborn/service/deploy/master/clustermeta/ha/HARaftServer.java
index 73149bc01..31e04a0ba 100644
---
a/master/src/main/java/org/apache/celeborn/service/deploy/master/clustermeta/ha/HARaftServer.java
+++
b/master/src/main/java/org/apache/celeborn/service/deploy/master/clustermeta/ha/HARaftServer.java
@@ -483,7 +483,7 @@ public class HARaftServer {
}
}
- private GroupInfoReply getGroupInfo() throws IOException {
+ public GroupInfoReply getGroupInfo() throws IOException {
GroupInfoRequest groupInfoRequest =
new GroupInfoRequest(clientId, raftPeerId, RAFT_GROUP_ID,
nextCallId());
return server.getGroupInfo(groupInfoRequest);
diff --git
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
index 1bf92b049..32bb22781 100644
---
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
+++
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
@@ -28,6 +28,8 @@ import scala.collection.mutable
import scala.util.Random
import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.ratis.proto.RaftProtos
+import org.apache.ratis.proto.RaftProtos.RaftPeerRole
import org.apache.celeborn.common.CelebornConf
import org.apache.celeborn.common.client.MasterClient
@@ -840,6 +842,13 @@ private[celeborn] class Master(
}.asJava
}
+ /** Returns a banner line followed by the text produced by `getMasterGroupInfoInternal()`. */
+ override def getMasterGroupInfo: String = {
+   val banner = "====================== Master Group INFO ==============================\n"
+   banner + getMasterGroupInfoInternal()
+ }
+
override def getWorkerInfo: String = {
val sb = new StringBuilder
sb.append("====================== Workers Info in Master
=========================\n")
@@ -940,6 +949,40 @@ private[celeborn] class Master(
isActive
}
+ /**
+  * Builds the plain-text description of the Ratis master group used by
+  * `getMasterGroupInfo`: group id, current leader (if known) and the commit infos.
+  *
+  * @return the rendered group information, or "HA is not enabled" when `conf.haEnabled`
+  *         is false
+  */
+ private def getMasterGroupInfoInternal(): String = {
+   if (conf.haEnabled) {
+     val groupInfo =
+       statusSystem.asInstanceOf[HAMasterMetaManager].getRatisServer.getGroupInfo
+
+     // Resolves the leader peer from this server's role info. Generated protobuf getters
+     // return a default instance rather than null for unset message fields, so presence is
+     // checked with has*() and an empty peer id is treated as "leader unknown".
+     def getLeader(roleInfo: RaftProtos.RoleInfoProto): Option[RaftProtos.RaftPeerProto] = {
+       Option(roleInfo).flatMap { info =>
+         if (info.getRole == RaftPeerRole.LEADER) {
+           Some(info.getSelf)
+         } else if (info.hasFollowerInfo && info.getFollowerInfo.hasLeaderInfo) {
+           Some(info.getFollowerInfo.getLeaderInfo.getId).filterNot(_.getId.isEmpty)
+         } else {
+           None
+         }
+       }
+     }
+
+     val sb = new StringBuilder
+     sb.append(s"group id: ${groupInfo.getGroup.getGroupId.getUuid}\n")
+     getLeader(groupInfo.getRoleInfoProto) match {
+       case Some(leader) =>
+         sb.append(s"leader info: ${leader.getId.toStringUtf8}(${leader.getAddress})\n\n")
+       case None =>
+         sb.append("leader not found\n")
+     }
+     sb.append(groupInfo.getCommitInfos)
+     sb.append("\n")
+     sb.toString()
+   } else {
+     "HA is not enabled"
+   }
+ }
+
override def initialize(): Unit = {
super.initialize()
logInfo("Master started.")
diff --git
a/service/src/main/scala/org/apache/celeborn/server/common/HttpService.scala
b/service/src/main/scala/org/apache/celeborn/server/common/HttpService.scala
index 6b5ec4196..868c0e1fc 100644
--- a/service/src/main/scala/org/apache/celeborn/server/common/HttpService.scala
+++ b/service/src/main/scala/org/apache/celeborn/server/common/HttpService.scala
@@ -41,6 +41,8 @@ abstract class HttpService extends Service with Logging {
sb.toString()
}
+ /** Default implementation: throws `UnsupportedOperationException` unless overridden. */
+ def getMasterGroupInfo: String = {
+   throw new UnsupportedOperationException()
+ }
+
def getWorkerInfo: String
def getLostWorkers: String = throw new UnsupportedOperationException()
diff --git
a/service/src/main/scala/org/apache/celeborn/server/common/http/HttpRequestHandler.scala
b/service/src/main/scala/org/apache/celeborn/server/common/http/HttpRequestHandler.scala
index 779b8e476..2115aef14 100644
---
a/service/src/main/scala/org/apache/celeborn/server/common/http/HttpRequestHandler.scala
+++
b/service/src/main/scala/org/apache/celeborn/server/common/http/HttpRequestHandler.scala
@@ -65,6 +65,8 @@ class HttpRequestHandler(
path match {
case "/conf" =>
service.getConf
+ case "/masterGroupInfo" if service.serviceName == Service.MASTER =>
+ service.getMasterGroupInfo
case "/workerInfo" =>
service.getWorkerInfo
case "/lostWorkers" if service.serviceName == Service.MASTER =>