kevinrr888 commented on code in PR #4850: URL: https://github.com/apache/accumulo/pull/4850#discussion_r1745799050
########## core/src/main/java/org/apache/accumulo/core/metrics/MetricsDocGen.java: ########## @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.accumulo.core.metrics; + +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.io.IOException; +import java.io.PrintStream; +import java.util.Arrays; +import java.util.Comparator; +import java.util.TreeSet; + +/** + * This class generates documentation to inform users of the available metrics in a presentable + * form. + */ +public class MetricsDocGen { + private final PrintStream doc; + private final TreeSet<Metric> sortedMetrics = + new TreeSet<>(Comparator.comparing(Metric::getName)); + + void generate() { + pageHeader(); + + // Generate sections for each category of metrics + generateCategorySection(Metric.MetricCategory.GENERAL_SERVER, "General Server Metrics"); + generateCategorySection(Metric.MetricCategory.COMPACTOR, "Compactor Metrics"); + generateCategorySection(Metric.MetricCategory.SCAN_SERVER, "Scan Server Metrics"); + generateCategorySection(Metric.MetricCategory.FATE, "Fate Metrics"); + + doc.close(); + } + + void pageHeader() { + doc.println("---"); + doc.println("title: Metrics Documentation (3.x)"); + doc.println("category: metrics"); + doc.println("order: 6"); + doc.println("---\n"); Review Comment: What is the purpose of this? Is this info needed for how this should appear on the website? What is "order: 6"? ########## core/src/main/java/org/apache/accumulo/core/metrics/MetricsDocGen.java: ########## @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.accumulo.core.metrics; + +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.io.IOException; +import java.io.PrintStream; +import java.util.Arrays; +import java.util.Comparator; +import java.util.TreeSet; + +/** + * This class generates documentation to inform users of the available metrics in a presentable + * form. + */ +public class MetricsDocGen { + private final PrintStream doc; + private final TreeSet<Metric> sortedMetrics = + new TreeSet<>(Comparator.comparing(Metric::getName)); + + void generate() { + pageHeader(); + + // Generate sections for each category of metrics + generateCategorySection(Metric.MetricCategory.GENERAL_SERVER, "General Server Metrics"); + generateCategorySection(Metric.MetricCategory.COMPACTOR, "Compactor Metrics"); + generateCategorySection(Metric.MetricCategory.SCAN_SERVER, "Scan Server Metrics"); + generateCategorySection(Metric.MetricCategory.FATE, "Fate Metrics"); + + doc.close(); Review Comment: ```suggestion ``` I don't think this is needed ########## core/src/main/java/org/apache/accumulo/core/metrics/MetricsDocGen.java: ########## @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.accumulo.core.metrics; + +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.io.IOException; +import java.io.PrintStream; +import java.util.Arrays; +import java.util.Comparator; +import java.util.TreeSet; + +/** + * This class generates documentation to inform users of the available metrics in a presentable + * form. + */ +public class MetricsDocGen { Review Comment: The way the metrics are generated and the format of the output look really good. Also verified all the metrics prior to these changes still exist (82 metrics before and after) ########## core/src/main/java/org/apache/accumulo/core/metrics/Metric.java: ########## @@ -0,0 +1,266 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.accumulo.core.metrics; + +import java.util.HashMap; +import java.util.Map; + +public enum Metric { + // General Server Metrics + SERVER_IDLE("accumulo.server.idle", MetricType.GAUGE, + "Indicates if the server is idle or not. The value will be 1 when idle and 0 when not idle.", + MetricCategory.GENERAL_SERVER), + LOW_MEMORY("accumulo.detected.low.memory", MetricType.GAUGE, + "Reports 1 when process memory usage is above threshold, 0 when memory is okay.", + MetricCategory.GENERAL_SERVER), + + // Compactor Metrics + COMPACTOR_MAJC_STUCK("accumulo.compactor.majc.stuck", MetricType.LONG_TASK_TIMER, "", + MetricCategory.COMPACTOR), + COMPACTOR_ENTRIES_READ("accumulo.compactor.entries.read", MetricType.FUNCTION_COUNTER, + "Number of entries read by all threads performing compactions.", MetricCategory.COMPACTOR), + COMPACTOR_ENTRIES_WRITTEN("accumulo.compactor.entries.written", MetricType.FUNCTION_COUNTER, + "Number of entries written by all threads performing compactions.", MetricCategory.COMPACTOR), + + // Fate Metrics + FATE_TYPE_IN_PROGRESS("accumulo.fate.ops.in.progress.by.type", MetricType.GAUGE, + "Number of FATE operations in progress. The type is designated by the `op.type` tag.", + MetricCategory.FATE), + FATE_OPS("accumulo.fate.ops", MetricType.GAUGE, "Tracks all the current FATE ops in any state.", + MetricCategory.FATE), + FATE_OPS_ACTIVITY("accumulo.fate.ops.activity", MetricType.GAUGE, "", MetricCategory.FATE), + FATE_ERRORS("accumulo.fate.errors", MetricType.GAUGE, "", MetricCategory.FATE), + FATE_TX("accumulo.fate.tx", MetricType.GAUGE, + "The state is now in a tag (e.g., state=new, state=in.progress, state=failed, etc.).", + MetricCategory.FATE), + + // Garbage Collection Metrics + GC_STARTED("accumulo.gc.started", MetricType.GAUGE, "", MetricCategory.GARBAGE_COLLECTION), + GC_FINISHED("accumulo.gc.finished", MetricType.GAUGE, "", MetricCategory.GARBAGE_COLLECTION), + GC_CANDIDATES("accumulo.gc.candidates", MetricType.GAUGE, "", MetricCategory.GARBAGE_COLLECTION), + GC_IN_USE("accumulo.gc.in.use", MetricType.GAUGE, "", MetricCategory.GARBAGE_COLLECTION), + GC_DELETED("accumulo.gc.deleted", MetricType.GAUGE, "", MetricCategory.GARBAGE_COLLECTION), + GC_ERRORS("accumulo.gc.errors", MetricType.GAUGE, "", MetricCategory.GARBAGE_COLLECTION), + GC_WAL_STARTED("accumulo.gc.wal.started", MetricType.GAUGE, "", + MetricCategory.GARBAGE_COLLECTION), + GC_WAL_FINISHED("accumulo.gc.wal.finished", MetricType.GAUGE, "", + MetricCategory.GARBAGE_COLLECTION), + GC_WAL_CANDIDATES("accumulo.gc.wal.candidates", MetricType.GAUGE, "", + MetricCategory.GARBAGE_COLLECTION), + GC_WAL_IN_USE("accumulo.gc.wal.in.use", MetricType.GAUGE, "", MetricCategory.GARBAGE_COLLECTION), + GC_WAL_DELETED("accumulo.gc.wal.deleted", MetricType.GAUGE, "", + MetricCategory.GARBAGE_COLLECTION), + GC_WAL_ERRORS("accumulo.gc.wal.errors", MetricType.GAUGE, "", MetricCategory.GARBAGE_COLLECTION), + GC_POST_OP_DURATION("accumulo.gc.post.op.duration", MetricType.GAUGE, "", + MetricCategory.GARBAGE_COLLECTION), + GC_RUN_CYCLE("accumulo.gc.run.cycle", MetricType.GAUGE, "", MetricCategory.GARBAGE_COLLECTION), + + // Tablet Server Metrics + TSERVER_ENTRIES("accumulo.tserver.entries", MetricType.GAUGE, "", MetricCategory.TABLET_SERVER), + TSERVER_MEM_ENTRIES("accumulo.tserver.entries.mem", MetricType.GAUGE, "", + MetricCategory.TABLET_SERVER), + TSERVER_MAJC_RUNNING("accumulo.tserver.majc.running", MetricType.GAUGE, "", + MetricCategory.TABLET_SERVER), + TSERVER_MAJC_STUCK("accumulo.tserver.majc.stuck", MetricType.GAUGE, "", + MetricCategory.TABLET_SERVER), + TSERVER_MAJC_QUEUED("accumulo.tserver.majc.queued", MetricType.GAUGE, "", + MetricCategory.TABLET_SERVER), + TSERVER_MINC_QUEUED("accumulo.tserver.minc.queued", MetricType.GAUGE, "", + MetricCategory.TABLET_SERVER), + TSERVER_MINC_RUNNING("accumulo.tserver.minc.running", MetricType.GAUGE, "", + MetricCategory.TABLET_SERVER), + TSERVER_MINC_TOTAL("accumulo.tserver.minc.total", MetricType.GAUGE, "", + MetricCategory.TABLET_SERVER), + TSERVER_TABLETS_ONLINE("accumulo.tserver.tablets.online", MetricType.GAUGE, "", + MetricCategory.TABLET_SERVER), + TSERVER_TABLETS_ASSIGNMENTS_WARNING("accumulo.tserver.tablets.assignments.warning", + MetricType.GAUGE, "", MetricCategory.TABLET_SERVER), + TSERVER_TABLETS_OPENING("accumulo.tserver.tablets.opening", MetricType.GAUGE, "", + MetricCategory.TABLET_SERVER), + TSERVER_TABLETS_UNOPENED("accumulo.tserver.tablets.unopened", MetricType.GAUGE, "", + MetricCategory.TABLET_SERVER), + TSERVER_TABLETS_FILES("accumulo.tserver.tablets.files", MetricType.GAUGE, "", + MetricCategory.TABLET_SERVER), + TSERVER_INGEST_MUTATIONS("accumulo.tserver.ingest.mutations", MetricType.GAUGE, + "Prior to 2.1.0 this metric was reported as a rate, it is now the count and the rate can be derived.", + MetricCategory.TABLET_SERVER), + TSERVER_INGEST_BYTES("accumulo.tserver.ingest.bytes", MetricType.GAUGE, + "Prior to 2.1.0 this metric was reported as a rate, it is now the count and the rate can be derived.", + MetricCategory.TABLET_SERVER), + TSERVER_HOLD("accumulo.tserver.hold", MetricType.GAUGE, "", MetricCategory.TABLET_SERVER), + + // Scan Metrics + SCAN_RESERVATION_TOTAL_TIMER("accumulo.scan.reservation.total.timer", MetricType.TIMER, + "Time to reserve a tablet's files for scan.", MetricCategory.SCAN_SERVER), + SCAN_RESERVATION_WRITEOUT_TIMER("accumulo.scan.reservation.writeout.timer", MetricType.TIMER, + "Time to write out a tablets file reservations for scan", MetricCategory.SCAN_SERVER), + SCAN_RESERVATION_CONFLICT_TIMER("accumulo.scan.reservation.conflict.timer", MetricType.TIMER, "", + MetricCategory.SCAN_SERVER), + SCAN_BUSY_TIMEOUT_COUNT("accumulo.scan.busy.timeout.count", MetricType.COUNTER, + "Count of the scans where a busy timeout happened.", MetricCategory.SCAN_SERVER), + SCAN_TABLET_METADATA_CACHE("accumulo.scan.tablet.metadata.cache", MetricType.CACHE, + "Scan server tablet cache metrics.", MetricCategory.SCAN_SERVER), + SCAN_TIMES("accumulo.scan.times", MetricType.TIMER, "", MetricCategory.SCAN_SERVER), + SCAN_OPEN_FILES("accumulo.scan.files.open", MetricType.GAUGE, "", MetricCategory.SCAN_SERVER), + SCAN_RESULTS("accumulo.scan.result", MetricType.GAUGE, "", MetricCategory.SCAN_SERVER), + SCAN_YIELDS("accumulo.scan.yields", MetricType.GAUGE, "", MetricCategory.SCAN_SERVER), + SCAN_START("accumulo.scan.start", MetricType.COUNTER, "", MetricCategory.SCAN_SERVER), + SCAN_CONTINUE("accumulo.scan.continue", MetricType.COUNTER, "", MetricCategory.SCAN_SERVER), + SCAN_CLOSE("accumulo.scan.close", MetricType.COUNTER, "", MetricCategory.SCAN_SERVER), + SCAN_QUERIES("accumulo.scan.queries", MetricType.GAUGE, "", MetricCategory.SCAN_SERVER), Review Comment: Noticed there's a lot of metrics without a description. Every metric should probably have a description. Not something for this PR, but maybe should be done in a follow on ########## core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java: ########## @@ -34,710 +29,17 @@ * Metrics2</a> framework. In 2.1.0 Accumulo migrated away from the Metrics2 framework to * <a href="https://micrometer.io/">Micrometer</a>. Micrometer suggests using a particular * <a href="https://micrometer.io/docs/concepts#_naming_meters">naming convention</a> for the - * metrics. The table below contains a mapping of the old to new metric names. - * <table border="1"> - * <caption>Summary of Metric Changes</caption> Review Comment: Do we no longer need this table of old to new names? Would make sense since these are changes for 3.1, but just want to be sure ########## test/src/main/java/org/apache/accumulo/test/compaction/ExternalCompactionMetricsIT.java: ########## @@ -127,7 +127,7 @@ public void testMetrics() throws Exception { if (shutdownTailer.get()) { break; } - if (s.startsWith(MetricsProducer.METRICS_MAJC_QUEUED)) { + if (s.startsWith(TSERVER_MAJC_QUEUED.getName())) { Review Comment: Is this equivalent to what it was previously? ########## test/src/main/java/org/apache/accumulo/test/metrics/MetricsIT.java: ########## @@ -100,50 +110,55 @@ public void confirmMetricsPublished() throws Exception { cluster.stop(); // meter names sorted and formatting disabled to make it easier to diff changes // @formatter:off - Set<String> unexpectedMetrics = - Set.of(METRICS_COMPACTOR_MAJC_STUCK, - METRICS_SCAN_YIELDS); + Set<Metric> unexpectedMetrics = Set.of( + COMPACTOR_MAJC_STUCK, + SCAN_YIELDS + ); // add sserver as flaky until scan server included in mini tests. - Set<String> flakyMetrics = Set.of(METRICS_FATE_TYPE_IN_PROGRESS, - METRICS_MANAGER_BALANCER_MIGRATIONS_NEEDED, - METRICS_SCAN_BUSY_TIMEOUT_COUNTER, - METRICS_SCAN_RESERVATION_CONFLICT_COUNTER, - METRICS_SCAN_RESERVATION_TOTAL_TIMER, - METRICS_SCAN_RESERVATION_WRITEOUT_TIMER, - METRICS_SCAN_TABLET_METADATA_CACHE, - METRICS_SERVER_IDLE); + Set<Metric> flakyMetrics = Set.of( + FATE_TYPE_IN_PROGRESS, + MANAGER_BALANCER_MIGRATIONS_NEEDED, + SCAN_BUSY_TIMEOUT_COUNT, + SCAN_RESERVATION_CONFLICT_TIMER, Review Comment: Is this equivalent to what it was previously? ########## test/src/main/java/org/apache/accumulo/test/compaction/ExternalCompactionProgressIT.java: ########## @@ -282,18 +283,14 @@ private static Thread getMetricsCheckerThread(AtomicLong totalEntriesRead, break out; } TestStatsDSink.Metric metric = TestStatsDSink.parseStatsDMetric(s); - if (!metric.getName().startsWith(MetricsProducer.METRICS_COMPACTOR_PREFIX)) { - continue; - } + final String metricName = metric.getName(); int value = Integer.parseInt(metric.getValue()); - log.debug("Found metric: {} with value: {}", metric.getName(), value); - switch (metric.getName()) { - case MetricsProducer.METRICS_COMPACTOR_ENTRIES_READ: - totalEntriesRead.addAndGet(value); - break; - case MetricsProducer.METRICS_COMPACTOR_ENTRIES_WRITTEN: - totalEntriesWritten.addAndGet(value); - break; + if (metricName.equals(COMPACTOR_ENTRIES_READ.getName())) { + totalEntriesRead.set(value); + log.debug("Found metric: {} with value: {}", metricName, value); + } else if (metricName.equals(COMPACTOR_ENTRIES_WRITTEN.getName())) { + totalEntriesWritten.set(value); + log.debug("Found metric: {} with value: {}", metricName, value); Review Comment: Is there a reason this was changed from `addAndGet` to `set`? I'm not sure this is equivalent. ########## server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServerMetrics.java: ########## @@ -49,15 +55,14 @@ public ScanServerMetrics(final LoadingCache<KeyExtent,TabletMetadata> tabletMeta @Override public void registerMetrics(MeterRegistry registry) { - totalReservationTimer = Timer.builder(MetricsProducer.METRICS_SCAN_RESERVATION_TOTAL_TIMER) + totalReservationTimer = Timer.builder(SCAN_RESERVATION_TOTAL_TIMER.getName()) .description("Time to reserve a tablets files for scan").register(registry); - writeOutReservationTimer = Timer - .builder(MetricsProducer.METRICS_SCAN_RESERVATION_WRITEOUT_TIMER) + writeOutReservationTimer = Timer.builder(SCAN_RESERVATION_WRITEOUT_TIMER.getName()) .description("Time to write out a tablets file reservations for scan").register(registry); - FunctionCounter.builder(METRICS_SCAN_BUSY_TIMEOUT_COUNTER, busyTimeoutCount, AtomicLong::get) + FunctionCounter.builder(SCAN_BUSY_TIMEOUT_COUNT.getName(), busyTimeoutCount, AtomicLong::get) .description("The number of scans where a busy timeout happened").register(registry); FunctionCounter - .builder(METRICS_SCAN_RESERVATION_CONFLICT_COUNTER, reservationConflictCount, + .builder(SCAN_RESERVATION_CONFLICT_TIMER.getName(), reservationConflictCount, Review Comment: Is this equivalent to what it was previously? ########## test/src/main/java/org/apache/accumulo/test/functional/MemoryStarvedMajCIT.java: ########## @@ -89,7 +89,7 @@ public static void start() throws Exception { } if (line.startsWith("accumulo")) { Metric metric = TestStatsDSink.parseStatsDMetric(line); - if (MetricsProducer.METRICS_MAJC_PAUSED.equals(metric.getName())) { + if (MINC_PAUSED.getName().equals(metric.getName())) { Review Comment: Is this equivalent to what it was previously? ########## server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerMetrics.java: ########## @@ -47,58 +66,62 @@ private long getTotalEntriesWritten() { @Override public void registerMetrics(MeterRegistry registry) { FunctionCounter - .builder(METRICS_COMPACTOR_ENTRIES_READ, this, TabletServerMetrics::getTotalEntriesRead) + .builder(COMPACTOR_ENTRIES_READ.getName(), this, TabletServerMetrics::getTotalEntriesRead) .description("Number of entries read by all compactions that have run on this tserver") .register(registry); FunctionCounter - .builder(METRICS_COMPACTOR_ENTRIES_WRITTEN, this, + .builder(COMPACTOR_ENTRIES_WRITTEN.getName(), this, TabletServerMetrics::getTotalEntriesWritten) .description("Number of entries written by all compactions that have run on this tserver") .register(registry); - LongTaskTimer timer = LongTaskTimer.builder(METRICS_TSERVER_MAJC_STUCK) + LongTaskTimer timer = LongTaskTimer.builder(TSERVER_MAJC_STUCK.getName()) .description("Number and duration of stuck major compactions").register(registry); CompactionWatcher.setTimer(timer); Gauge - .builder(METRICS_TSERVER_TABLETS_LONG_ASSIGNMENTS, util, + .builder(TSERVER_TABLETS_ASSIGNMENTS_WARNING.getName(), util, Review Comment: Is this equivalent to what it was previously? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
