fapifta commented on code in PR #4943:
URL: https://github.com/apache/ozone/pull/4943#discussion_r1247509051
##########
hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/security/x509/certificate/client/DefaultCertificateClient.java:
##########
@@ -1220,6 +1245,14 @@ public SCMSecurityProtocolClientSideTranslatorPB
getScmSecureClient()
return scmSecurityClient;
}
+ public static void acquirePermit() throws InterruptedException {
+ semaphore.acquire();
Review Comment:
It might happen together when the grace period is 0 seconds, so the two
monitors are kicking in almost at the same time, besides that an other case,
when there is a failure, and the wait ack is equal to the grace period, as then
the first failure will push the second try to the time when the certificate
monitor also kicks in.
These are rare cases, and should not cause any trouble due to the proper
locking now, it might result in unnecessary renewals. I am not worried about
those, I think it is just more explicit to have the certificate lifetime
monitor explicitly disabled showing that it is not used in SCM. But let's leave
ti for now as it is, I fine with it this way as well.
##########
hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/security/RootCARotationManager.java:
##########
@@ -255,11 +489,210 @@ public Duration
timeBefore2ExpiryGracePeriod(X509Certificate certificate) {
}
}
+ /**
+ * Task to generate sub-ca key and certificate.
+ */
+ public class SubCARotationPrepareTask implements Runnable {
+ private String rootCACertId;
+
+ public SubCARotationPrepareTask(String newRootCertId) {
+ this.rootCACertId = newRootCertId;
+ }
+
+ @Override
+ public void run() {
+ // Lock to protect the sub CA certificate rotation preparation process,
+ // to make sure there is only one task is ongoing at one time.
+ // Sub CA rotation preparation steps:
+ // 1. generate new sub CA keys
+ // 2. send CSR to leader SCM
+ // 3. wait CSR response and persist the certificate to disk
+ try {
+ acquireLock();
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt();
+ return;
+ }
+ try {
+ LOG.info("SubCARotationPrepareTask[rootCertId = {}] - started.",
+ rootCACertId);
+
+ SecurityConfig securityConfig =
+ scmCertClient.getSecurityConfig();
+ String progressComponent = SCMCertificateClient.COMPONENT_NAME +
+ HDDS_NEW_KEY_CERT_DIR_NAME_SUFFIX +
+ HDDS_NEW_KEY_CERT_DIR_NAME_PROGRESS_SUFFIX;
+ final String newSubCAProgressPath =
+ securityConfig.getLocation(progressComponent).toString();
+ final String newSubCAPath = securityConfig.getLocation(
+ SCMCertificateClient.COMPONENT_NAME +
+ HDDS_NEW_KEY_CERT_DIR_NAME_SUFFIX).toString();
+
+ File newProgressDir = new File(newSubCAProgressPath);
+ File newDir = new File(newSubCAPath);
+ try {
+ FileUtils.deleteDirectory(newProgressDir);
+ FileUtils.deleteDirectory(newDir);
+ Files.createDirectories(newProgressDir.toPath());
+ } catch (IOException e) {
+ LOG.error("Failed to delete and create {}, or delete {}",
+ newProgressDir, newDir, e);
+ String message = "Terminate SCM, encounter IO exception(" +
+ e.getMessage() + ") when deleting and create directory";
+ scm.shutDown(message);
+ }
+
+ // Generate key
+ Path keyDir = securityConfig.getKeyLocation(progressComponent);
+ KeyCodec keyCodec = new KeyCodec(securityConfig, keyDir);
+ KeyPair newKeyPair = null;
+ try {
+ HDDSKeyGenerator keyGenerator = new HDDSKeyGenerator(securityConfig);
+ newKeyPair = keyGenerator.generateKey();
+ keyCodec.writePublicKey(newKeyPair.getPublic());
+ keyCodec.writePrivateKey(newKeyPair.getPrivate());
+ LOG.info("SubCARotationPrepareTask[rootCertId = {}] - " +
+ "scm key generated.", rootCACertId);
+ } catch (Exception e) {
+ LOG.error("Failed to generate key under {}", newProgressDir, e);
+ String message = "Terminate SCM, encounter exception(" +
+ e.getMessage() + ") when generating new key under " +
+ newProgressDir;
+ scm.shutDown(message);
+ }
+
+ // Get certificate signed
+ String newCertSerialId = "";
+ try {
+ CertificateSignRequest.Builder csrBuilder =
+ scmCertClient.getCSRBuilder();
+ csrBuilder.setKey(newKeyPair);
+ newCertSerialId = scmCertClient.signAndStoreCertificate(
+ csrBuilder.build(),
+ Paths.get(newSubCAProgressPath, HDDS_X509_DIR_NAME_DEFAULT),
+ true);
+ LOG.info("SubCARotationPrepareTask[rootCertId = {}] - " +
+ "scm certificate {} signed.", rootCACertId, newCertSerialId);
+ } catch (Exception e) {
+ LOG.error("Failed to generate certificate under {}",
+ newProgressDir, e);
+ String message = "Terminate SCM, encounter exception(" +
+ e.getMessage() + ") when generating new certificate " +
+ newProgressDir;
+ scm.shutDown(message);
+ }
+
+ // move dir from *-next-progress to *-next
+ try {
+ Files.move(newProgressDir.toPath(), newDir.toPath(),
+ StandardCopyOption.ATOMIC_MOVE,
+ StandardCopyOption.REPLACE_EXISTING);
+ } catch (IOException e) {
+ LOG.error("Failed to move {} to {}",
+ newSubCAProgressPath, newSubCAPath, e);
+ String message = "Terminate SCM, encounter exception(" +
+ e.getMessage() + ") when moving " + newSubCAProgressPath +
+ " to " + newSubCAPath;
+ scm.shutDown(message);
+ }
+
+ // Send ack to rotationPrepare request
+ try {
+ handler.rotationPrepareAck(rootCACertId, newCertSerialId,
+ scm.getScmId());
+ LOG.info("SubCARotationPrepareTask[rootCertId = {}] - " +
+ "rotation prepare ack sent out, new scm certificate {}",
+ rootCACertId, newCertSerialId);
+ } catch (Exception e) {
+ LOG.error("Failed to send ack to rotationPrepare request", e);
+ String message = "Terminate SCM, encounter exception(" +
+ e.getMessage() + ") when sending out rotationPrepare ack";
+ scm.shutDown(message);
+ }
+
+ handler.setSubCACertId(newCertSerialId);
+
+ releaseLockOnTimeoutTask = executorService.schedule(() -> {
+ // If no rotation commit request received after rotation prepare
+ LOG.warn("Failed to have enough rotation acks from SCM. This " +
+ " time root rotation {} is failed. Release the lock.",
+ rootCACertId);
+ releaseLock();
+ }, ackTimeout.toMillis(), TimeUnit.MILLISECONDS);
+
+ } catch (Throwable e) {
+ LOG.error("Unexpected error happen", e);
+ scm.shutDown("Unexpected error happen, " + e.getMessage());
+ }
+ }
+ }
+
+ /**
+ * Task to wait the all acks of prepare request.
+ */
+ public class WaitSubCARotationPrepareAckTask implements Runnable {
+ private String rootCACertId;
+
+ public WaitSubCARotationPrepareAckTask(
+ X509CertificateHolder rootCertHolder) {
+ this.rootCACertId = rootCertHolder.getSerialNumber().toString();
+ }
+
+ @Override
+ public void run() {
+ if (!isRunning()) {
+ LOG.info("SCM is not leader anymore. Delete the in-progress " +
+ "root CA directory");
+ cleanupAndStop("SCM is not leader anymore");
+ return;
+ }
+ synchronized (RootCARotationManager.class) {
+ int numFromHADetails =
+ scm.getSCMHANodeDetails().getPeerNodeDetails().size() + 1;
+ int numFromRatisServer = scm.getScmHAManager().getRatisServer()
+ .getDivision().getRaftConf().getCurrentPeers().size();
+ LOG.info("numFromHADetails {}, numFromRatisServer {}",
+ numFromHADetails, numFromRatisServer);
+ if (handler.rotationPrepareAcks() == numFromRatisServer) {
+ // all acks are received.
+ try {
+ waitAckTimeoutTask.cancel(false);
+ handler.rotationCommit(rootCACertId);
+ handler.rotationCommitted(rootCACertId);
+
+ metrics.incrSuccessRotationNum();
+ long timeTaken = System.nanoTime() - processStartTime.get();
+ metrics.setSuccessTimeInNs(timeTaken);
+ processStartTime.set(null);
+
+ // reset state
+ handler.resetRotationPrepareAcks();
+ String msg = "Root certificate " + rootCACertId +
+ " rotation is finished successfully after " + timeTaken + "
ns";
+ cleanupAndStop(msg);
+ } catch (Throwable e) {
+ LOG.error("Execution error", e);
+ handler.resetRotationPrepareAcks();
+ cleanupAndStop("Execution error, " + e.getMessage());
+ } finally {
+ // stop this task to re-execute again in next cycle
+ throw new RuntimeException("Exit the this " +
+ "WaitSubCARotationPrepareAckTask for root certificate " +
+ rootCACertId + " since the rotation is finished execution");
+ }
+ }
+ }
+ }
+ }
+
/**
* Stops scheduled monitor task.
*/
@Override
public void stop() {
+ if (metrics != null) {
+ metrics.unRegister();
+ }
try {
executorService.shutdown();
if (!executorService.awaitTermination(3, TimeUnit.SECONDS)) {
Review Comment:
Right, let's use scm.shutdown in combination with just shutdownNow in the
scm.stop() method for the rotation manager. Thank you for looking into it!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]