This is an automated email from the ASF dual-hosted git repository.
sammichen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 8e701bf40e HDDS-10749. Shutdown datanode when RatisServer is down (#6587)
8e701bf40e is described below
commit 8e701bf40e44d236ca50736216b0cdc39970b5ec
Author: Sammi Chen <[email protected]>
AuthorDate: Tue Aug 13 16:45:30 2024 +0800
HDDS-10749. Shutdown datanode when RatisServer is down (#6587)
---
.../apache/hadoop/ozone/HddsDatanodeService.java | 6 ++-
.../common/statemachine/DatanodeStateMachine.java | 9 ++--
.../server/ratis/ContainerStateMachine.java | 53 +++++++++++++++++++++-
.../transport/server/ratis/XceiverServerRatis.java | 12 +++--
.../ozone/container/ozoneimpl/OzoneContainer.java | 7 +--
.../ozone/container/common/ContainerTestUtils.java | 2 +-
.../transport/server/ratis/TestCSMMetrics.java | 2 +-
.../ozoneimpl/TestOzoneContainerWithTLS.java | 2 +-
.../ozoneimpl/TestSecureOzoneContainer.java | 2 +-
.../container/server/TestContainerServer.java | 2 +-
.../server/TestSecureContainerServer.java | 2 +-
11 files changed, 81 insertions(+), 18 deletions(-)
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/HddsDatanodeService.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/HddsDatanodeService.java
index b2ed8691a6..6b32b74dc7 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/HddsDatanodeService.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/HddsDatanodeService.java
@@ -288,7 +288,7 @@ public class HddsDatanodeService extends GenericCli
implements ServicePlugin {
.register(REPLICATION_STREAMS_LIMIT_KEY,
this::reconfigReplicationStreamsLimit);
- datanodeStateMachine = new DatanodeStateMachine(datanodeDetails, conf,
+ datanodeStateMachine = new DatanodeStateMachine(this, datanodeDetails,
conf,
dnCertClient, secretKeyClient, this::terminateDatanode,
reconfigurationHandler);
try {
@@ -620,6 +620,10 @@ public class HddsDatanodeService extends GenericCli
implements ServicePlugin {
}
}
+ public boolean isStopped() {
+ return isStopped.get();
+ }
+
/**
* Check ozone admin privilege, throws exception if not admin.
*/
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java
index 7b3202b4a4..a460e30ede 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java
@@ -44,6 +44,7 @@ import
org.apache.hadoop.hdds.security.x509.certificate.client.CertificateClient
import org.apache.hadoop.hdds.upgrade.HDDSLayoutVersionManager;
import org.apache.hadoop.hdds.utils.IOUtils;
import org.apache.hadoop.hdds.utils.NettyMetrics;
+import org.apache.hadoop.ozone.HddsDatanodeService;
import org.apache.hadoop.ozone.HddsDatanodeStopService;
import org.apache.hadoop.ozone.container.common.DatanodeLayoutStorage;
import org.apache.hadoop.ozone.container.common.report.ReportManager;
@@ -137,7 +138,9 @@ public class DatanodeStateMachine implements Closeable {
* @param certClient - Datanode Certificate client, required if security is
* enabled
*/
- public DatanodeStateMachine(DatanodeDetails datanodeDetails,
+ @SuppressWarnings("checkstyle:ParameterNumber")
+ public DatanodeStateMachine(HddsDatanodeService hddsDatanodeService,
+ DatanodeDetails datanodeDetails,
ConfigurationSource conf,
CertificateClient certClient,
SecretKeyClient secretKeyClient,
@@ -177,7 +180,7 @@ public class DatanodeStateMachine implements Closeable {
// HDDS-3116 for more details.
constructionLock.writeLock().lock();
try {
- container = new OzoneContainer(this.datanodeDetails,
+ container = new OzoneContainer(hddsDatanodeService, this.datanodeDetails,
conf, context, certClient, secretKeyClient);
} finally {
constructionLock.writeLock().unlock();
@@ -273,7 +276,7 @@ public class DatanodeStateMachine implements Closeable {
@VisibleForTesting
public DatanodeStateMachine(DatanodeDetails datanodeDetails,
ConfigurationSource conf) throws IOException {
- this(datanodeDetails, conf, null, null, null,
+ this(null, datanodeDetails, conf, null, null, null,
new ReconfigurationHandler("DN", (OzoneConfiguration) conf, op -> {
}));
}
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java
index 90fca79550..28b9e151ff 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java
@@ -26,6 +26,7 @@ import java.io.OutputStream;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
+import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
@@ -60,6 +61,7 @@ import
org.apache.hadoop.hdds.scm.container.common.helpers.ContainerNotOpenExcep
import
org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException;
import org.apache.hadoop.hdds.utils.Cache;
import org.apache.hadoop.hdds.utils.ResourceCache;
+import org.apache.hadoop.ozone.HddsDatanodeService;
import org.apache.hadoop.ozone.OzoneConfigKeys;
import org.apache.hadoop.ozone.common.utils.BufferUtils;
import org.apache.hadoop.ozone.container.common.interfaces.ContainerDispatcher;
@@ -70,6 +72,7 @@ import org.apache.hadoop.util.Time;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
+import org.apache.ratis.proto.RaftProtos;
import org.apache.ratis.proto.RaftProtos.StateMachineEntryProto;
import org.apache.ratis.proto.RaftProtos.LogEntryProto;
import org.apache.ratis.proto.RaftProtos.RaftPeerRole;
@@ -94,6 +97,7 @@ import
org.apache.ratis.statemachine.impl.SingleFileSnapshotInfo;
import org.apache.ratis.thirdparty.com.google.protobuf.ByteString;
import
org.apache.ratis.thirdparty.com.google.protobuf.InvalidProtocolBufferException;
import org.apache.ratis.thirdparty.com.google.protobuf.TextFormat;
+import org.apache.ratis.util.LifeCycle;
import org.apache.ratis.util.TaskQueue;
import org.apache.ratis.util.function.CheckedSupplier;
import org.apache.ratis.util.JavaUtils;
@@ -198,19 +202,23 @@ public class ContainerStateMachine extends
BaseStateMachine {
private final Semaphore applyTransactionSemaphore;
private final boolean waitOnBothFollowers;
+ private final HddsDatanodeService datanodeService;
+ private static Semaphore semaphore = new Semaphore(1);
+
/**
* CSM metrics.
*/
private final CSMMetrics metrics;
@SuppressWarnings("parameternumber")
- public ContainerStateMachine(RaftGroupId gid,
+ public ContainerStateMachine(HddsDatanodeService hddsDatanodeService,
RaftGroupId gid,
ContainerDispatcher dispatcher,
ContainerController containerController,
List<ThreadPoolExecutor> chunkExecutors,
XceiverServerRatis ratisServer,
ConfigurationSource conf,
String threadNamePrefix) {
+ this.datanodeService = hddsDatanodeService;
this.gid = gid;
this.dispatcher = dispatcher;
this.containerController = containerController;
@@ -904,6 +912,49 @@ public class ContainerStateMachine extends
BaseStateMachine {
removeStateMachineDataIfNeeded(index);
}
+ @Override
+ public void notifyServerShutdown(RaftProtos.RoleInfoProto roleInfo, boolean
allServer) {
+ // if datanodeService is stopped , it indicates this `close` originates
+ // from `HddsDatanodeService.stop()`, otherwise, it indicates this `close`
originates from ratis.
+ if (allServer) {
+ if (datanodeService != null && !datanodeService.isStopped()) {
+ LOG.info("{} is closed by ratis", gid);
+ if (semaphore.tryAcquire()) {
+ // run with a different thread, so this raft group can be closed
+ Runnable runnable = () -> {
+ try {
+ int closed = 0, total = 0;
+ try {
+ Thread.sleep(5000); // sleep 5s
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt();
+ }
+ Iterator<RaftGroupId> iterator =
ratisServer.getServer().getGroupIds().iterator();
+ while (iterator.hasNext()) {
+ RaftGroupId id = iterator.next();
+ RaftServer.Division division =
ratisServer.getServer().getDivision(id);
+ if (division.getRaftServer().getLifeCycleState() ==
LifeCycle.State.CLOSED) {
+ closed++;
+ }
+ total++;
+ }
+ LOG.error("Container statemachine is closed by ratis,
terminating HddsDatanodeService. " +
+ "closed({})/total({})", closed, total);
+ datanodeService.terminateDatanode();
+ } catch (IOException e) {
+ LOG.warn("Failed to get division for raft groups", e);
+ LOG.error("Container statemachine is closed by ratis,
terminating HddsDatanodeService");
+ datanodeService.terminateDatanode();
+ }
+ };
+ CompletableFuture.runAsync(runnable);
+ }
+ } else {
+ LOG.info("{} is closed by HddsDatanodeService", gid);
+ }
+ }
+ }
+
private CompletableFuture<ContainerCommandResponseProto> applyTransaction(
ContainerCommandRequestProto request, DispatcherContext context,
Consumer<Throwable> exceptionHandler) {
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java
index dc0c4b0676..2ae372320e 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java
@@ -59,6 +59,7 @@ import org.apache.hadoop.hdds.security.SecurityConfig;
import
org.apache.hadoop.hdds.security.x509.certificate.client.CertificateClient;
import org.apache.hadoop.hdds.tracing.TracingUtil;
import org.apache.hadoop.hdds.utils.HddsServerUtil;
+import org.apache.hadoop.ozone.HddsDatanodeService;
import org.apache.hadoop.ozone.OzoneConfigKeys;
import org.apache.hadoop.ozone.OzoneConsts;
import org.apache.hadoop.ozone.container.common.impl.ContainerData;
@@ -177,13 +178,15 @@ public final class XceiverServerRatis implements
XceiverServerSpi {
private final boolean shouldDeleteRatisLogDirectory;
private final boolean streamEnable;
private final DatanodeRatisServerConfig ratisServerConfig;
+ private final HddsDatanodeService datanodeService;
- private XceiverServerRatis(DatanodeDetails dd,
+ private XceiverServerRatis(HddsDatanodeService hddsDatanodeService,
DatanodeDetails dd,
ContainerDispatcher dispatcher, ContainerController containerController,
StateContext context, ConfigurationSource conf, Parameters parameters)
throws IOException {
this.conf = conf;
Objects.requireNonNull(dd, "DatanodeDetails == null");
+ datanodeService = hddsDatanodeService;
datanodeDetails = dd;
ratisServerConfig = conf.getObject(DatanodeRatisServerConfig.class);
assignPorts();
@@ -241,7 +244,7 @@ public final class XceiverServerRatis implements
XceiverServerSpi {
}
private ContainerStateMachine getStateMachine(RaftGroupId gid) {
- return new ContainerStateMachine(gid, dispatcher, containerController,
+ return new ContainerStateMachine(datanodeService, gid, dispatcher,
containerController,
chunkExecutors, this, conf, datanodeDetails.threadNamePrefix());
}
@@ -521,14 +524,14 @@ public final class XceiverServerRatis implements
XceiverServerSpi {
.valueOf(pendingRequestsMegaBytesLimit, TraditionalBinaryPrefix.MEGA));
}
- public static XceiverServerRatis newXceiverServerRatis(
+ public static XceiverServerRatis newXceiverServerRatis(HddsDatanodeService
hddsDatanodeService,
DatanodeDetails datanodeDetails, ConfigurationSource ozoneConf,
ContainerDispatcher dispatcher, ContainerController containerController,
CertificateClient caClient, StateContext context) throws IOException {
Parameters parameters = createTlsParameters(
new SecurityConfig(ozoneConf), caClient);
- return new XceiverServerRatis(datanodeDetails, dispatcher,
+ return new XceiverServerRatis(hddsDatanodeService, datanodeDetails,
dispatcher,
containerController, context, ozoneConf, parameters);
}
@@ -591,6 +594,7 @@ public final class XceiverServerRatis implements
XceiverServerSpi {
public void stop() {
if (isStarted) {
try {
+ LOG.info("Stopping {} {}", getClass().getSimpleName(), server.getId());
// shutdown server before the executors as while shutting down,
// some of the tasks would be executed using the executors.
server.close();
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
index 4fa211a92c..b3809fff4b 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
@@ -34,6 +34,7 @@ import
org.apache.hadoop.hdds.security.symmetric.SecretKeyVerifierClient;
import org.apache.hadoop.hdds.security.token.TokenVerifier;
import
org.apache.hadoop.hdds.security.x509.certificate.client.CertificateClient;
import org.apache.hadoop.hdds.utils.HddsServerUtil;
+import org.apache.hadoop.ozone.HddsDatanodeService;
import org.apache.hadoop.ozone.container.common.helpers.ContainerMetrics;
import org.apache.hadoop.ozone.container.common.impl.BlockDeletingService;
import org.apache.hadoop.ozone.container.common.impl.ContainerSet;
@@ -138,7 +139,7 @@ public class OzoneContainer {
* @throws DiskOutOfSpaceException
* @throws IOException
*/
- public OzoneContainer(
+ public OzoneContainer(HddsDatanodeService hddsDatanodeService,
DatanodeDetails datanodeDetails, ConfigurationSource conf,
StateContext context, CertificateClient certClient,
SecretKeyVerifierClient secretKeyClient) throws IOException {
@@ -205,7 +206,7 @@ public class OzoneContainer {
*/
controller = new ContainerController(containerSet, handlers);
- writeChannel = XceiverServerRatis.newXceiverServerRatis(
+ writeChannel =
XceiverServerRatis.newXceiverServerRatis(hddsDatanodeService,
datanodeDetails, config, hddsDispatcher, controller, certClient,
context);
@@ -277,7 +278,7 @@ public class OzoneContainer {
public OzoneContainer(
DatanodeDetails datanodeDetails, ConfigurationSource conf,
StateContext context) throws IOException {
- this(datanodeDetails, conf, context, null, null);
+ this(null, datanodeDetails, conf, context, null, null);
}
public GrpcTlsConfig getTlsClientConfig() {
diff --git
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/ContainerTestUtils.java
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/ContainerTestUtils.java
index c63f82025e..53ba8b6857 100644
---
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/ContainerTestUtils.java
+++
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/ContainerTestUtils.java
@@ -347,7 +347,7 @@ public final class ContainerTestUtils {
conf.setInt(OzoneConfigKeys.HDDS_CONTAINER_RATIS_IPC_PORT,
dn.getPort(DatanodeDetails.Port.Name.RATIS).getValue());
- return XceiverServerRatis.newXceiverServerRatis(dn, conf,
+ return XceiverServerRatis.newXceiverServerRatis(null, dn, conf,
getNoopContainerDispatcher(), getEmptyContainerController(),
null, null);
}
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/TestCSMMetrics.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/TestCSMMetrics.java
index 0fd31bb4b7..e68831b494 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/TestCSMMetrics.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/TestCSMMetrics.java
@@ -189,7 +189,7 @@ public class TestCSMMetrics {
conf.set(OzoneConfigKeys.HDDS_CONTAINER_RATIS_DATANODE_STORAGE_DIR, dir);
final ContainerDispatcher dispatcher = new TestContainerDispatcher();
- return XceiverServerRatis.newXceiverServerRatis(dn, conf, dispatcher,
+ return XceiverServerRatis.newXceiverServerRatis(null, dn, conf, dispatcher,
new ContainerController(new ContainerSet(1000), Maps.newHashMap()),
null, null);
}
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestOzoneContainerWithTLS.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestOzoneContainerWithTLS.java
index a1e8e1781f..50cb8f6b9b 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestOzoneContainerWithTLS.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestOzoneContainerWithTLS.java
@@ -311,7 +311,7 @@ public class TestOzoneContainerWithTLS {
try {
StateContext stateContext = ContainerTestUtils.getMockContext(dn, conf);
container = new OzoneContainer(
- dn, conf, stateContext, caClient, keyClient);
+ null, dn, conf, stateContext, caClient, keyClient);
MutableVolumeSet volumeSet = container.getVolumeSet();
StorageVolumeUtil.getHddsVolumesList(volumeSet.getVolumesList())
.forEach(hddsVolume ->
hddsVolume.setDbParentDir(tempFolder.toFile()));
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestSecureOzoneContainer.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestSecureOzoneContainer.java
index 5585696dfc..92d716f7a4 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestSecureOzoneContainer.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestSecureOzoneContainer.java
@@ -137,7 +137,7 @@ class TestSecureOzoneContainer {
conf.setBoolean(OzoneConfigKeys.HDDS_CONTAINER_IPC_RANDOM_PORT, false);
DatanodeDetails dn = MockDatanodeDetails.randomDatanodeDetails();
- container = new OzoneContainer(dn, conf, ContainerTestUtils
+ container = new OzoneContainer(null, dn, conf, ContainerTestUtils
.getMockContext(dn, conf), caClient, secretKeyClient);
MutableVolumeSet volumeSet = container.getVolumeSet();
StorageVolumeUtil.getHddsVolumesList(volumeSet.getVolumesList())
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/server/TestContainerServer.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/server/TestContainerServer.java
index 630c4d3149..8db7b13747 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/server/TestContainerServer.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/server/TestContainerServer.java
@@ -132,7 +132,7 @@ public class TestContainerServer {
conf.set(OzoneConfigKeys.HDDS_CONTAINER_RATIS_DATANODE_STORAGE_DIR, dir);
final ContainerDispatcher dispatcher = new TestContainerDispatcher();
- return XceiverServerRatis.newXceiverServerRatis(dn, conf, dispatcher,
+ return XceiverServerRatis.newXceiverServerRatis(null, dn, conf, dispatcher,
new ContainerController(new ContainerSet(1000), Maps.newHashMap()),
caClient, null);
}
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/server/TestSecureContainerServer.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/server/TestSecureContainerServer.java
index 8044685bb7..0bdf61b3bd 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/server/TestSecureContainerServer.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/server/TestSecureContainerServer.java
@@ -218,7 +218,7 @@ public class TestSecureContainerServer {
conf.set(OzoneConfigKeys.HDDS_CONTAINER_RATIS_DATANODE_STORAGE_DIR, dir);
final ContainerDispatcher dispatcher = createDispatcher(dn,
UUID.randomUUID(), conf);
- return XceiverServerRatis.newXceiverServerRatis(dn, conf, dispatcher,
+ return XceiverServerRatis.newXceiverServerRatis(null, dn, conf, dispatcher,
new ContainerController(new ContainerSet(1000), Maps.newHashMap()),
caClient, null);
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]