This is an automated email from the ASF dual-hosted git repository.
adoroszlai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 34f9d9e870 HDDS-11742. Update metrics with leaderId if known when
starting SCM (#7471)
34f9d9e870 is described below
commit 34f9d9e870cbeb7aa0982db2a10d6f01c4db5a9b
Author: Slava Tutrinov <[email protected]>
AuthorDate: Fri Nov 29 18:25:18 2024 +0300
HDDS-11742. Update metrics with leaderId if known when starting SCM (#7471)
---
.../hadoop/hdds/scm/ha/SCMHAManagerStub.java | 8 ++
.../apache/hadoop/hdds/scm/ha/SCMRatisServer.java | 3 +
.../hadoop/hdds/scm/ha/SCMRatisServerImpl.java | 11 +++
.../hdds/scm/server/StorageContainerManager.java | 6 +-
.../hdds/scm/ha/TestReplicationAnnotation.java | 7 ++
.../hadoop/hdds/scm/ha/TestSCMRatisServerImpl.java | 108 +++++++++++++++++++++
.../hdds/scm/TestStorageContainerManagerHA.java | 83 +++++++++++++++-
7 files changed, 221 insertions(+), 5 deletions(-)
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAManagerStub.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAManagerStub.java
index b9539684ed..85664dd232 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAManagerStub.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAManagerStub.java
@@ -23,6 +23,7 @@ import java.util.Arrays;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;
+import java.util.UUID;
import com.google.common.base.Preconditions;
import com.google.protobuf.InvalidProtocolBufferException;
@@ -170,6 +171,8 @@ public final class SCMHAManagerStub implements SCMHAManager
{
private Map<RequestType, Object> handlers =
new EnumMap<>(RequestType.class);
+ private RaftPeerId leaderId =
RaftPeerId.valueOf(UUID.randomUUID().toString());
+
@Override
public void start() {
}
@@ -283,5 +286,10 @@ public final class SCMHAManagerStub implements
SCMHAManager {
public GrpcTlsConfig getGrpcTlsConfig() {
return null;
}
+
+ @Override
+ public RaftPeerId getLeaderId() {
+ return leaderId;
+ }
}
}
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServer.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServer.java
index a786bd2944..4e883b27a7 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServer.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServer.java
@@ -21,6 +21,7 @@ import
org.apache.hadoop.hdds.protocol.proto.SCMRatisProtocol.RequestType;
import org.apache.hadoop.hdds.scm.AddSCMRequest;
import org.apache.hadoop.hdds.scm.RemoveSCMRequest;
import org.apache.ratis.grpc.GrpcTlsConfig;
+import org.apache.ratis.protocol.RaftPeerId;
import org.apache.ratis.protocol.exceptions.NotLeaderException;
import org.apache.ratis.server.RaftServer;
@@ -68,4 +69,6 @@ public interface SCMRatisServer {
GrpcTlsConfig getGrpcTlsConfig();
+ RaftPeerId getLeaderId();
+
}
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServerImpl.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServerImpl.java
index 70dffba27e..0383bf1809 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServerImpl.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServerImpl.java
@@ -29,6 +29,7 @@ import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicLong;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
+import jakarta.annotation.Nullable;
import org.apache.hadoop.hdds.HddsUtils;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
@@ -147,6 +148,16 @@ public class SCMRatisServerImpl implements SCMRatisServer {
return grpcTlsConfig;
}
+ @Override
+ @Nullable
+ public RaftPeerId getLeaderId() {
+ RaftPeer raftLeaderPeer = getLeader();
+ if (raftLeaderPeer != null) {
+ return raftLeaderPeer.getId();
+ }
+ return null;
+ }
+
private static void waitForLeaderToBeReady(RaftServer server,
OzoneConfiguration conf, RaftGroup group) throws IOException {
boolean ready;
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java
index 9390318a29..5895ecc12b 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java
@@ -1595,8 +1595,9 @@ public final class StorageContainerManager extends
ServiceRuntimeInfoImpl
setStartTime();
- // At this point leader is not known
- scmHAMetricsUpdate(null);
+ RaftPeerId leaderId = SCMHAUtils.isSCMHAEnabled(configuration)
+ ? getScmHAManager().getRatisServer().getLeaderId() : null;
+ scmHAMetricsUpdate(Objects.toString(leaderId, null));
if (scmCertificateClient != null) {
// In case root CA certificate is rotated during this SCM is offline
@@ -2298,7 +2299,6 @@ public final class StorageContainerManager extends
ServiceRuntimeInfoImpl
// unregister, in case metrics already exist
// so that the metric tags will get updated.
SCMHAMetrics.unRegister();
-
scmHAMetrics = SCMHAMetrics.create(getScmId(), leaderId);
}
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/ha/TestReplicationAnnotation.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/ha/TestReplicationAnnotation.java
index a5a2054a8a..049f38480d 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/ha/TestReplicationAnnotation.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/ha/TestReplicationAnnotation.java
@@ -23,6 +23,7 @@ import org.apache.hadoop.hdds.scm.AddSCMRequest;
import org.apache.hadoop.hdds.scm.RemoveSCMRequest;
import org.apache.hadoop.hdds.scm.container.ContainerStateManager;
import org.apache.ratis.grpc.GrpcTlsConfig;
+import org.apache.ratis.protocol.RaftPeerId;
import org.apache.ratis.protocol.exceptions.NotLeaderException;
import org.apache.ratis.server.RaftServer;
import org.junit.jupiter.api.BeforeEach;
@@ -31,6 +32,7 @@ import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.lang.reflect.Proxy;
import java.util.List;
+import java.util.UUID;
import java.util.concurrent.ExecutionException;
import static org.assertj.core.api.Assertions.assertThat;
@@ -111,6 +113,11 @@ public class TestReplicationAnnotation {
public GrpcTlsConfig getGrpcTlsConfig() {
return null;
}
+
+ @Override
+ public RaftPeerId getLeaderId() {
+ return RaftPeerId.valueOf(UUID.randomUUID().toString());
+ }
};
}
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/ha/TestSCMRatisServerImpl.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/ha/TestSCMRatisServerImpl.java
new file mode 100644
index 0000000000..6919ce41ed
--- /dev/null
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/ha/TestSCMRatisServerImpl.java
@@ -0,0 +1,108 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdds.scm.ha;
+
+import org.apache.hadoop.hdds.conf.ConfigurationSource;
+import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
+import org.apache.hadoop.hdds.security.SecurityConfig;
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.protocol.RaftPeer;
+import org.apache.ratis.protocol.RaftPeerId;
+import org.apache.ratis.server.RaftServer;
+import org.junit.jupiter.api.Test;
+import org.mockito.MockedConstruction;
+import org.mockito.MockedStatic;
+
+import java.util.UUID;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.Mockito.doReturn;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.mockConstruction;
+import static org.mockito.Mockito.mockStatic;
+import static org.mockito.Mockito.spy;
+import static org.mockito.Mockito.when;
+
+/**
+ * Test for SCM Ratis Server Implementation.
+ */
+public class TestSCMRatisServerImpl {
+
+ @Test
+ public void testGetLeaderId() throws Exception {
+
+ try (
+ MockedConstruction<SecurityConfig> mockedSecurityConfigConstruction =
mockConstruction(SecurityConfig.class);
+ MockedStatic<RaftServer> staticMockedRaftServer =
mockStatic(RaftServer.class);
+ MockedStatic<RatisUtil> staticMockedRatisUtil =
mockStatic(RatisUtil.class);
+ ) {
+ // given
+ ConfigurationSource conf = mock(ConfigurationSource.class);
+ StorageContainerManager scm = mock(StorageContainerManager.class);
+ String clusterId = "CID-" + UUID.randomUUID();
+ when(scm.getClusterId()).thenReturn(clusterId);
+ SCMHADBTransactionBuffer dbTransactionBuffer =
mock(SCMHADBTransactionBuffer.class);
+
+ RaftServer.Builder raftServerBuilder = mock(RaftServer.Builder.class);
+ when(raftServerBuilder.setServerId(any())).thenReturn(raftServerBuilder);
+
when(raftServerBuilder.setProperties(any())).thenReturn(raftServerBuilder);
+
when(raftServerBuilder.setStateMachineRegistry(any())).thenReturn(raftServerBuilder);
+ when(raftServerBuilder.setOption(any())).thenReturn(raftServerBuilder);
+ when(raftServerBuilder.setGroup(any())).thenReturn(raftServerBuilder);
+
when(raftServerBuilder.setParameters(any())).thenReturn(raftServerBuilder);
+
+ RaftServer raftServer = mock(RaftServer.class);
+
+ RaftServer.Division division = mock(RaftServer.Division.class);
+ when(raftServer.getDivision(any())).thenReturn(division);
+
+ SCMStateMachine scmStateMachine = mock(SCMStateMachine.class);
+ when(division.getStateMachine()).thenReturn(scmStateMachine);
+
+ when(raftServerBuilder.build()).thenReturn(raftServer);
+
+
staticMockedRaftServer.when(RaftServer::newBuilder).thenReturn(raftServerBuilder);
+
+ RaftProperties raftProperties = mock(RaftProperties.class);
+ staticMockedRatisUtil.when(() ->
RatisUtil.newRaftProperties(conf)).thenReturn(raftProperties);
+
+ SecurityConfig sc = new SecurityConfig(conf);
+ when(sc.isSecurityEnabled()).thenReturn(false);
+
+ SCMRatisServerImpl scmRatisServer = spy(new SCMRatisServerImpl(conf,
scm, dbTransactionBuffer));
+
doReturn(RaftPeer.newBuilder().setId(RaftPeerId.valueOf("peer1")).build()).when(scmRatisServer).getLeader();
+
+ // when
+ RaftPeerId leaderId = scmRatisServer.getLeaderId();
+
+ // then
+ assertEquals(RaftPeerId.valueOf("peer1"), leaderId);
+
+ // but when
+ doReturn(null).when(scmRatisServer).getLeader();
+ leaderId = scmRatisServer.getLeaderId();
+
+ // then
+ assertNull(leaderId);
+ }
+ }
+
+}
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/TestStorageContainerManagerHA.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/TestStorageContainerManagerHA.java
index 2986484d2a..2f9c8c938a 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/TestStorageContainerManagerHA.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/TestStorageContainerManagerHA.java
@@ -24,6 +24,7 @@ import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import org.apache.hadoop.hdds.scm.container.ContainerID;
import org.apache.hadoop.hdds.scm.ha.SCMHAMetrics;
+import org.apache.hadoop.hdds.scm.ha.SCMHAUtils;
import org.apache.hadoop.hdds.scm.ha.SCMRatisServerImpl;
import org.apache.hadoop.hdds.scm.server.SCMStorageConfig;
import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
@@ -43,9 +44,10 @@ import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfo;
import org.apache.ozone.test.GenericTestUtils;
import org.apache.ratis.statemachine.SnapshotInfo;
import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.Timeout;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.time.Instant;
@@ -54,7 +56,9 @@ import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.UUID;
+import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicInteger;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.hadoop.hdds.client.ReplicationFactor.ONE;
@@ -72,6 +76,8 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
@Timeout(300)
public class TestStorageContainerManagerHA {
+ private static final Logger LOG =
LoggerFactory.getLogger(TestStorageContainerManagerHA.class);
+
private MiniOzoneHAClusterImpl cluster = null;
private OzoneConfiguration conf;
private String omServiceId;
@@ -86,7 +92,6 @@ public class TestStorageContainerManagerHA {
*
* @throws IOException
*/
- @BeforeEach
public void init() throws Exception {
conf = new OzoneConfiguration();
conf.set(ScmConfigKeys.OZONE_SCM_PIPELINE_CREATION_INTERVAL, "10s");
@@ -118,6 +123,7 @@ public class TestStorageContainerManagerHA {
@Test
void testAllSCMAreRunning() throws Exception {
+ init();
int count = 0;
List<StorageContainerManager> scms = cluster.getStorageContainerManagers();
assertEquals(numOfSCMs, scms.size());
@@ -129,6 +135,9 @@ public class TestStorageContainerManagerHA {
count++;
leaderScm = scm;
}
+ if (SCMHAUtils.isSCMHAEnabled(conf)) {
+ assertNotNull(scm.getScmHAManager().getRatisServer().getLeaderId());
+ }
assertEquals(peerSize, numOfSCMs);
}
assertEquals(1, count);
@@ -246,6 +255,7 @@ public class TestStorageContainerManagerHA {
@Test
public void testPrimordialSCM() throws Exception {
+ init();
StorageContainerManager scm1 =
cluster.getStorageContainerManagers().get(0);
StorageContainerManager scm2 =
cluster.getStorageContainerManagers().get(1);
OzoneConfiguration conf1 = scm1.getConfiguration();
@@ -264,6 +274,7 @@ public class TestStorageContainerManagerHA {
@Test
public void testBootStrapSCM() throws Exception {
+ init();
StorageContainerManager scm2 =
cluster.getStorageContainerManagers().get(1);
OzoneConfiguration conf2 = scm2.getConfiguration();
boolean isDeleted = scm2.getScmStorageConfig().getVersionFile().delete();
@@ -323,4 +334,72 @@ public class TestStorageContainerManagerHA {
}, 1000, (int) ScmConfigKeys
.OZONE_SCM_HA_RATIS_LEADER_READY_WAIT_TIMEOUT_DEFAULT);
}
+
+ @Test
+ public void testSCMLeadershipMetric() throws IOException,
InterruptedException {
+ // GIVEN
+ int scmInstancesCount = 3;
+ conf = new OzoneConfiguration();
+ MiniOzoneHAClusterImpl.Builder haMiniClusterBuilder =
MiniOzoneCluster.newHABuilder(conf)
+ .setSCMServiceId("scm-service-id")
+ .setOMServiceId("om-service-id")
+ .setNumOfActiveOMs(0)
+ .setNumOfStorageContainerManagers(scmInstancesCount)
+ .setNumOfActiveSCMs(1);
+ haMiniClusterBuilder.setNumDatanodes(0);
+
+ // start single SCM instance without other Ozone services
+ // in order to initialize and bootstrap SCM instances only
+ cluster = haMiniClusterBuilder.build();
+
+ List<StorageContainerManager> storageContainerManagersList =
cluster.getStorageContainerManagersList();
+
+ // stop the single SCM instance in order to imitate further simultaneous
start of SCMs
+ storageContainerManagersList.get(0).stop();
+ storageContainerManagersList.get(0).join();
+
+ // WHEN (imitate simultaneous start of the SCMs)
+ int retryCount = 0;
+ while (true) {
+ CountDownLatch scmInstancesCounter = new
CountDownLatch(scmInstancesCount);
+ AtomicInteger failedSCMs = new AtomicInteger();
+ for (StorageContainerManager scm : storageContainerManagersList) {
+ new Thread(() -> {
+ try {
+ scm.start();
+ } catch (IOException e) {
+ failedSCMs.incrementAndGet();
+ } finally {
+ scmInstancesCounter.countDown();
+ }
+ }).start();
+ }
+ scmInstancesCounter.await();
+ if (failedSCMs.get() == 0) {
+ break;
+ } else {
+ for (StorageContainerManager scm : storageContainerManagersList) {
+ scm.stop();
+ scm.join();
+ LOG.info("Stopping StorageContainerManager server at {}",
+ scm.getClientRpcAddress());
+ }
+ ++retryCount;
+ LOG.info("SCMs port conflicts, retried {} times",
+ retryCount);
+ failedSCMs.set(0);
+ }
+ }
+
+ // THEN expect only one SCM node (leader) will have
'scmha_metrics_scmha_leader_state' metric set to 1
+ int leaderCount = 0;
+ for (StorageContainerManager scm : storageContainerManagersList) {
+ if (scm.getScmHAMetrics() != null &&
scm.getScmHAMetrics().getSCMHAMetricsInfoLeaderState() == 1) {
+ leaderCount++;
+ break;
+ }
+ }
+ assertEquals(1, leaderCount);
+ }
+
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]