errose28 commented on code in PR #9376:
URL: https://github.com/apache/ozone/pull/9376#discussion_r2665710171
##########
hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/reconfig/TestScmReconfiguration.java:
##########
@@ -49,6 +50,7 @@ void reconfigurableProperties() {
Set<String> expected = ImmutableSet.<String>builder()
.add(OZONE_ADMINISTRATORS)
.add(OZONE_READONLY_ADMINISTRATORS)
+ .add(HddsConfigKeys.HDDS_SCM_SAFEMODE_LOG_INTERVAL)
Review Comment:
The other config values here have dedicated tests in this suite; let's add
one for this new property too.
##########
hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java:
##########
@@ -225,6 +244,97 @@ public double getCurrentContainerThreshold() {
.getCurrentContainerThreshold();
}
+ private synchronized void startSafeModePeriodicLogger() {
+ if (!getInSafeMode()) {
+ return;
+ }
+ if (safeModeLogExecutor == null) {
+ safeModeLogExecutor = Executors.newScheduledThreadPool(1,
+ new ThreadFactoryBuilder()
+ .setNameFormat(scmContext.threadNamePrefix() +
"SCM-SafeMode-Log-%d")
+ .setDaemon(true)
+ .build());
+ }
+
+ if (safeModeLogTask != null && !safeModeLogTask.isDone()) {
+ safeModeLogTask.cancel(false);
+ }
+ safeModeLogTask = safeModeLogExecutor.scheduleAtFixedRate(() -> {
+ try {
+ logSafeModeStatus();
+ } catch (Throwable t) {
+ LOG.warn("Safe mode periodic logger encountered an error", t);
+ }
+ }, 0L, safeModeLogIntervalMs, TimeUnit.MILLISECONDS);
+ LOG.info("Started periodic Safe Mode logging with interval {} ms",
safeModeLogIntervalMs);
+ }
+
+ private synchronized void logSafeModeStatus() {
+ SafeModeStatus safeModeStatus = status.get();
+ int validatedCount = validatedRules.size();
+ int preCheckValidatedCount = validatedPreCheckRules.size();
+ StringBuilder statusLog = new StringBuilder();
+ statusLog.append(String.format(
+ "%nSCM SafeMode Status | state=%s preCheckComplete=%s
validatedPreCheckRules=%d/%d validatedRules=%d/%d",
+ safeModeStatus.isInSafeMode() ?
+ (safeModeStatus.isPreCheckComplete() ? "PRE_CHECKS_PASSED" :
"INITIAL") : "OUT_OF_SAFE_MODE",
+ safeModeStatus.isPreCheckComplete(), preCheckValidatedCount,
preCheckRules.size(), validatedCount,
+ exitRules.size()));
+
+ for (SafeModeExitRule<?> rule : exitRules.values()) {
+ String name = rule.getRuleName();
+ boolean isValidated = validatedRules.contains(name);
+ String statusText = rule.getStatusText();
+
+ if (statusText.endsWith(";")) {
+ statusText = statusText.substring(0, statusText.length() - 1);
+ }
+
+ statusLog.append(String.format("%nSCM SafeMode Status | %s (%s) %s",
+ name,
+ isValidated ? "validated" : "waiting",
+ statusText));
+ }
+
+ LOG.info(statusLog.toString());
+ if (!getInSafeMode()) {
+ stopSafeModePeriodicLogger();
+ }
+ }
+
+ private synchronized void stopSafeModePeriodicLogger() {
Review Comment:
After stopping the periodic logger I think we should print one last summary
message immediately when SCM exits safemode. Otherwise if we are grepping for
the prefix while tailing the logs, we won't have an indication that it finished.
##########
hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java:
##########
@@ -225,6 +244,97 @@ public double getCurrentContainerThreshold() {
.getCurrentContainerThreshold();
}
+ private synchronized void startSafeModePeriodicLogger() {
+ if (!getInSafeMode()) {
+ return;
+ }
+ if (safeModeLogExecutor == null) {
+ safeModeLogExecutor = Executors.newScheduledThreadPool(1,
+ new ThreadFactoryBuilder()
+ .setNameFormat(scmContext.threadNamePrefix() +
"SCM-SafeMode-Log-%d")
+ .setDaemon(true)
+ .build());
+ }
+
+ if (safeModeLogTask != null && !safeModeLogTask.isDone()) {
+ safeModeLogTask.cancel(false);
+ }
+ safeModeLogTask = safeModeLogExecutor.scheduleAtFixedRate(() -> {
+ try {
+ logSafeModeStatus();
+ } catch (Throwable t) {
+ LOG.warn("Safe mode periodic logger encountered an error", t);
+ }
+ }, 0L, safeModeLogIntervalMs, TimeUnit.MILLISECONDS);
+ LOG.info("Started periodic Safe Mode logging with interval {} ms",
safeModeLogIntervalMs);
+ }
+
+ private synchronized void logSafeModeStatus() {
+ SafeModeStatus safeModeStatus = status.get();
+ int validatedCount = validatedRules.size();
+ int preCheckValidatedCount = validatedPreCheckRules.size();
+ StringBuilder statusLog = new StringBuilder();
+ statusLog.append(String.format(
+ "%nSCM SafeMode Status | state=%s preCheckComplete=%s
validatedPreCheckRules=%d/%d validatedRules=%d/%d",
+ safeModeStatus.isInSafeMode() ?
+ (safeModeStatus.isPreCheckComplete() ? "PRE_CHECKS_PASSED" :
"INITIAL") : "OUT_OF_SAFE_MODE",
+ safeModeStatus.isPreCheckComplete(), preCheckValidatedCount,
preCheckRules.size(), validatedCount,
+ exitRules.size()));
Review Comment:
This is hard to read. Can we split this out to just use the `StringBuilder`
instead of nesting `String.format` and ternary operators?
##########
hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java:
##########
@@ -225,6 +244,97 @@ public double getCurrentContainerThreshold() {
.getCurrentContainerThreshold();
}
+ private synchronized void startSafeModePeriodicLogger() {
+ if (!getInSafeMode()) {
+ return;
+ }
+ if (safeModeLogExecutor == null) {
+ safeModeLogExecutor = Executors.newScheduledThreadPool(1,
+ new ThreadFactoryBuilder()
+ .setNameFormat(scmContext.threadNamePrefix() +
"SCM-SafeMode-Log-%d")
+ .setDaemon(true)
+ .build());
+ }
+
+ if (safeModeLogTask != null && !safeModeLogTask.isDone()) {
+ safeModeLogTask.cancel(false);
+ }
+ safeModeLogTask = safeModeLogExecutor.scheduleAtFixedRate(() -> {
+ try {
+ logSafeModeStatus();
+ } catch (Throwable t) {
+ LOG.warn("Safe mode periodic logger encountered an error", t);
+ }
+ }, 0L, safeModeLogIntervalMs, TimeUnit.MILLISECONDS);
+ LOG.info("Started periodic Safe Mode logging with interval {} ms",
safeModeLogIntervalMs);
+ }
+
+ private synchronized void logSafeModeStatus() {
+ SafeModeStatus safeModeStatus = status.get();
+ int validatedCount = validatedRules.size();
+ int preCheckValidatedCount = validatedPreCheckRules.size();
+ StringBuilder statusLog = new StringBuilder();
+ statusLog.append(String.format(
+ "%nSCM SafeMode Status | state=%s preCheckComplete=%s
validatedPreCheckRules=%d/%d validatedRules=%d/%d",
+ safeModeStatus.isInSafeMode() ?
+ (safeModeStatus.isPreCheckComplete() ? "PRE_CHECKS_PASSED" :
"INITIAL") : "OUT_OF_SAFE_MODE",
+ safeModeStatus.isPreCheckComplete(), preCheckValidatedCount,
preCheckRules.size(), validatedCount,
+ exitRules.size()));
+
+ for (SafeModeExitRule<?> rule : exitRules.values()) {
+ String name = rule.getRuleName();
+ boolean isValidated = validatedRules.contains(name);
+ String statusText = rule.getStatusText();
+
+ if (statusText.endsWith(";")) {
+ statusText = statusText.substring(0, statusText.length() - 1);
+ }
Review Comment:
Looks like we can just remove the semicolon from the end of
`AbstractContainerSafeModeRule#getStatusText` instead of stripping it here.
Neither I nor Cursor can find a case that depends on them.
##########
hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java:
##########
@@ -782,4 +782,53 @@ public void testPipelinesNotCreatedUntilPreCheckPasses()
throws Exception {
assertTrue(scmSafeModeManager.getPreCheckComplete());
assertFalse(scmSafeModeManager.getInSafeMode());
}
+
+ /**
+ * Test that each safemode rule's getStatusText is being logged periodically
+ * while SCM is in safe mode.
+ */
+ @Test
+ public void testSafeModePeriodicLogging() throws Exception {
Review Comment:
This test is good for when SCM is in safemode, but we should also test that
the logger is stopped and prints one final message when safemode exits normally
or is force exited.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]