devmadhuu commented on code in PR #10074:
URL: https://github.com/apache/ozone/pull/10074#discussion_r3258605231


##########
hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/metrics/ReconScmContainerSyncMetrics.java:
##########
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.recon.metrics;
+
+import org.apache.hadoop.hdds.annotation.InterfaceAudience;
+import org.apache.hadoop.metrics2.MetricsSystem;
+import org.apache.hadoop.metrics2.annotation.Metric;
+import org.apache.hadoop.metrics2.annotation.Metrics;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.metrics2.lib.MutableCounterLong;
+import org.apache.hadoop.metrics2.lib.MutableGaugeInt;
+import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
+import org.apache.hadoop.ozone.OzoneConsts;
+import org.apache.hadoop.util.Time;
+
+/**
+ * Metrics for Recon SCM container sync decisions and targeted sync execution.
+ */
+@InterfaceAudience.Private
+@Metrics(about = "Recon SCM Container Sync Metrics", context = OzoneConsts.OZONE)
+public final class ReconScmContainerSyncMetrics {
+
+  private static final String SOURCE_NAME =
+      ReconScmContainerSyncMetrics.class.getSimpleName();
+
+  /**
+   * No targeted sync has run yet, or the latest scheduler cycle did not run one.
+   */
+  public static final int TARGETED_SYNC_STATUS_IDLE = 0;
+  /**
+   * Targeted sync is currently running.
+   */
+  public static final int TARGETED_SYNC_STATUS_IN_PROGRESS = 1;
+  /**
+   * The last targeted sync completed successfully.
+   */
+  public static final int TARGETED_SYNC_STATUS_SUCCESS = 2;
+  /**
+   * The last targeted sync completed with one or more failed passes.
+   */
+  public static final int TARGETED_SYNC_STATUS_FAILURE = 3;
+
+  @Metric(about = "Count of events where non-OPEN container drift exceeded "
+      + "the full SCM DB snapshot threshold")
+  private MutableCounterLong fullScmDbSnapshotThresholdExceededCount;
+
+  @Metric(about = "Last non-OPEN container drift observed when the full SCM "
+      + "DB snapshot threshold was exceeded")
+  private MutableGaugeLong lastFullScmDbSnapshotThresholdExceededNonOpenDrift;
+
+  @Metric(about = "Time between the last two full SCM DB snapshot threshold "
+      + "exceeded events in milliseconds")
+  private MutableGaugeLong intervalSinceLastFullScmDbSnapshotThresholdExceededMs;

Review Comment:
   We want to record these metrics for observability whenever the large-gap
threshold is exceeded, so that we can check them and later tune the current
1M threshold and 6-hour interval based on the metrics observed for a cluster.



##########
hadoop-hdds/common/src/main/resources/ozone-default.xml:
##########
@@ -4608,20 +4604,32 @@
   </property>
 
   <property>
-    <name>ozone.recon.scm.snapshot.task.initial.delay</name>
-    <value>1m</value>
-    <tag>OZONE, MANAGEMENT, RECON</tag>
+    <name>ozone.recon.scm.container.sync.task.initial.delay</name>
+    <value>2m</value>
+    <tag>OZONE, MANAGEMENT, RECON, SCM</tag>
     <description>
-      Initial delay in MINUTES by Recon to request SCM DB Snapshot.
+      Initial delay before Recon starts the incremental SCM container sync task.
+      This gives Recon startup enough time to initialize the SCM DB before the
+      first incremental sync runs.
     </description>
   </property>
-
   <property>
-    <name>ozone.recon.scm.snapshot.task.interval.delay</name>
-    <value>24h</value>
-    <tag>OZONE, MANAGEMENT, RECON</tag>
+    <name>ozone.recon.scm.container.sync.task.interval.delay</name>
+    <value>6h</value>
+    <tag>OZONE, MANAGEMENT, RECON, SCM</tag>
+    <description>
+      Interval between incremental SCM container sync runs in Recon. Each cycle
+      evaluates drift between SCM and Recon and either runs the targeted
+      multi-pass sync or takes no action.
+    </description>
+  </property>
+  <property>
+    <name>ozone.recon.scm.deleted.container.check.batch.size</name>
+    <value>500</value>

Review Comment:
   done



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to