mlbiscoc commented on code in PR #4063:
URL: https://github.com/apache/solr/pull/4063#discussion_r2708894442


##########
solr/cross-dc-manager/src/java/org/apache/solr/crossdc/manager/consumer/PrometheusMetrics.java:
##########
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.crossdc.manager.consumer;
+
+import io.prometheus.metrics.core.datapoints.Timer;
+import io.prometheus.metrics.core.metrics.Counter;
+import io.prometheus.metrics.core.metrics.Histogram;
+import io.prometheus.metrics.model.registry.PrometheusRegistry;
+import org.apache.solr.client.solrj.SolrRequest;
+import org.apache.solr.client.solrj.request.UpdateRequest;
+import org.apache.solr.crossdc.common.MirroredSolrRequest;
+
+public class PrometheusMetrics implements ConsumerMetrics {
+
+  protected PrometheusRegistry registry;
+  protected Counter input;
+  protected Counter collapsed;
+  protected Counter output;
+  protected Histogram outputBatchSizeHistogram;
+  protected Histogram outputTimeHistogram;
+  protected Histogram outputBackoffHistogram;
+  protected Histogram outputFirstAttemptHistogram;
+
+  public PrometheusMetrics() {
+    register(PrometheusRegistry.defaultRegistry);
+  }
+
+  protected void register(PrometheusRegistry registry) {
+    this.registry = registry;
+    input =
+        Counter.builder()
+            .name("consumer_input_total")
+            .help("Total number of input messages")
+            .labelNames("type", "subtype")
+            .register(registry);
+
+    collapsed =
+        Counter.builder()
+            .name("consumer_collapsed_total")
+            .help("Total number of collapsed messages")
+            .register(registry);
+
+    output =
+        Counter.builder()
+            .name("consumer_output_total")
+            .help("Total number of output requests")
+            .labelNames("type", "result")
+            .register(registry);
+
+    outputBatchSizeHistogram =
+        Histogram.builder()
+            .name("consumer_output_batch_size_histogram")
+            .help("Histogram of output batch sizes")
+            .labelNames("type", "subtype")
+            .register(registry);
+
+    outputBackoffHistogram =
+        Histogram.builder()
+            .name("consumer_output_backoff_histogram")
+            .help("Histogram of output backoff sleep times")
+            .labelNames("type")
+            .register(registry);
+
+    outputTimeHistogram =
+        Histogram.builder()
+            .name("consumer_output_time_histogram")

Review Comment:
   What are these times? Milliseconds? Instead of histogram suffix, append the 
unit of time. Also I would confirm if the buckets of these histograms are 
useful for whatever the time unit is.



##########
solr/cross-dc-manager/src/java/org/apache/solr/crossdc/manager/consumer/PrometheusMetrics.java:
##########
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.crossdc.manager.consumer;
+
+import io.prometheus.metrics.core.datapoints.Timer;
+import io.prometheus.metrics.core.metrics.Counter;
+import io.prometheus.metrics.core.metrics.Histogram;
+import io.prometheus.metrics.model.registry.PrometheusRegistry;
+import org.apache.solr.client.solrj.SolrRequest;
+import org.apache.solr.client.solrj.request.UpdateRequest;
+import org.apache.solr.crossdc.common.MirroredSolrRequest;
+
+public class PrometheusMetrics implements ConsumerMetrics {
+
+  protected PrometheusRegistry registry;
+  protected Counter input;
+  protected Counter collapsed;
+  protected Counter output;
+  protected Histogram outputBatchSizeHistogram;
+  protected Histogram outputTimeHistogram;
+  protected Histogram outputBackoffHistogram;
+  protected Histogram outputFirstAttemptHistogram;
+
+  public PrometheusMetrics() {
+    register(PrometheusRegistry.defaultRegistry);
+  }
+
+  protected void register(PrometheusRegistry registry) {
+    this.registry = registry;
+    input =
+        Counter.builder()
+            .name("consumer_input_total")
+            .help("Total number of input messages")
+            .labelNames("type", "subtype")
+            .register(registry);
+
+    collapsed =
+        Counter.builder()
+            .name("consumer_collapsed_total")
+            .help("Total number of collapsed messages")
+            .register(registry);
+
+    output =
+        Counter.builder()
+            .name("consumer_output_total")
+            .help("Total number of output requests")
+            .labelNames("type", "result")
+            .register(registry);
+
+    outputBatchSizeHistogram =
+        Histogram.builder()
+            .name("consumer_output_batch_size_histogram")

Review Comment:
   You can drop the histogram suffix. `# TYPE` will have the type that it is a 
histogram and it gets appended with bucket which is the only suffix this will 
need. Same with ones below.



##########
solr/cross-dc-manager/src/java/org/apache/solr/crossdc/manager/consumer/KafkaCrossDcConsumer.java:
##########
@@ -408,19 +408,19 @@ boolean pollAndProcessRequests() {
               List<SolrInputDocument> docs = update.getDocuments();
               if (docs != null) {
                 updateReqBatch.add(docs);
-                metrics.counter(MetricRegistry.name(type.name(), 
"add")).inc(docs.size());
+                metrics.incrementInputCounter(type.name(), "add");
               }
               List<String> deletes = update.getDeleteById();
               if (deletes != null) {
                 updateReqBatch.deleteById(deletes);
-                metrics.counter(MetricRegistry.name(type.name(), 
"dbi")).inc(deletes.size());
+                metrics.incrementInputCounter(type.name(), "dbi");

Review Comment:
   I think it is better to actually have this say `delete_by_id` instead of 
dbi. Same with below. Feel free to disagree.



##########
solr/test-framework/src/java/org/apache/solr/util/SolrKafkaTestsIgnoredThreadsFilter.java:
##########
@@ -45,6 +45,11 @@ public boolean reject(Thread t) {
       return true;
     }
 
+    // Prometheus Scheduler doesn't provide any method to shut down its worker 
threads
+    if (t.isDaemon()) {

Review Comment:
   Won't this filter _every_ daemon thread? Does the thread have something like 
`prometheus` in its name?



##########
solr/cross-dc-manager/src/test/org/apache/solr/crossdc/manager/SolrAndKafkaIntegrationTest.java:
##########
@@ -332,6 +338,39 @@ public void testParallelUpdatesToCluster2() throws 
Exception {
     assertCluster2EventuallyHasDocs(COLLECTION, "*:*", 5000);
   }
 
+  @Test
+  @SuppressWarnings({"unchecked"})
+  public void testMetrics() throws Exception {
+    CloudSolrClient client = solrCluster1.getSolrClient();
+    SolrInputDocument doc = new SolrInputDocument();
+    doc.addField("id", String.valueOf(new Date().getTime()));
+    doc.addField("text", "some test");
+
+    client.add(COLLECTION, doc);
+
+    client.commit(COLLECTION);
+
+    System.out.println("Sent producer record");

Review Comment:
   Use `log.info` instead or just don't print this. Doesn't seem like we need 
it.



##########
gradle/libs.versions.toml:
##########
@@ -498,6 +498,8 @@ ow2-asm-commons = { module = "org.ow2.asm:asm-commons", 
version.ref = "ow2-asm"
 ow2-asm-tree = { module = "org.ow2.asm:asm-tree", version.ref = "ow2-asm" }
 # @keep transitive dependency for version alignment
 perfmark-api = { module = "io.perfmark:perfmark-api", version.ref = "perfmark" 
}
+prometheus-metrics-core = { module = "io.prometheus:prometheus-metrics-core", 
version.ref = "prometheus-metrics" }

Review Comment:
   You should be able to do this with OTEL sdk/api which also offers prometheus 
right out the box. Any reason you just went with re-introducing this prometheus 
dependency? This is way better than the dropwizard -> prometheus bridge so not 
going to say this is a blocker. Just having alignment would be nice.



##########
solr/cross-dc-manager/src/java/org/apache/solr/crossdc/manager/consumer/PrometheusMetrics.java:
##########
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.crossdc.manager.consumer;
+
+import io.prometheus.metrics.core.datapoints.Timer;
+import io.prometheus.metrics.core.metrics.Counter;
+import io.prometheus.metrics.core.metrics.Histogram;
+import io.prometheus.metrics.model.registry.PrometheusRegistry;
+import org.apache.solr.client.solrj.SolrRequest;
+import org.apache.solr.client.solrj.request.UpdateRequest;
+import org.apache.solr.crossdc.common.MirroredSolrRequest;
+
+public class PrometheusMetrics implements ConsumerMetrics {
+
+  protected PrometheusRegistry registry;
+  protected Counter input;
+  protected Counter collapsed;
+  protected Counter output;
+  protected Histogram outputBatchSizeHistogram;
+  protected Histogram outputTimeHistogram;
+  protected Histogram outputBackoffHistogram;
+  protected Histogram outputFirstAttemptHistogram;
+
+  public PrometheusMetrics() {
+    register(PrometheusRegistry.defaultRegistry);
+  }
+
+  protected void register(PrometheusRegistry registry) {
+    this.registry = registry;
+    input =
+        Counter.builder()
+            .name("consumer_input_total")
+            .help("Total number of input messages")
+            .labelNames("type", "subtype")
+            .register(registry);
+
+    collapsed =
+        Counter.builder()
+            .name("consumer_collapsed_total")
+            .help("Total number of collapsed messages")
+            .register(registry);
+
+    output =
+        Counter.builder()
+            .name("consumer_output_total")
+            .help("Total number of output requests")
+            .labelNames("type", "result")
+            .register(registry);
+
+    outputBatchSizeHistogram =
+        Histogram.builder()
+            .name("consumer_output_batch_size_histogram")
+            .help("Histogram of output batch sizes")
+            .labelNames("type", "subtype")
+            .register(registry);
+
+    outputBackoffHistogram =
+        Histogram.builder()
+            .name("consumer_output_backoff_histogram")
+            .help("Histogram of output backoff sleep times")
+            .labelNames("type")
+            .register(registry);
+
+    outputTimeHistogram =
+        Histogram.builder()
+            .name("consumer_output_time_histogram")
+            .help("Histogram of output request times")
+            .labelNames("type")
+            .register(registry);
+
+    outputFirstAttemptHistogram =
+        Histogram.builder()
+            .name("consumer_output_first_attempt_histogram")
+            .help("Histogram of first attempt request times")
+            .labelNames("type")
+            .register(registry);
+  }
+
+  public PrometheusRegistry getRegistry() {
+    return registry;
+  }
+
+  @Override
+  public void incrementCollapsedCounter() {
+    collapsed.inc();
+  }
+
+  @Override
+  public void incrementInputCounter(String type, String subType) {
+    incrementInputCounter(type, subType, 1);
+  }
+
+  @Override
+  public void incrementInputCounter(String type, String subType, int delta) {
+    input.labelValues(type, subType).inc(delta);
+  }
+
+  @Override
+  public void incrementOutputCounter(String type, String result) {
+    incrementOutputCounter(type, result, 1);
+  }
+
+  @Override
+  public void incrementOutputCounter(String type, String result, int delta) {
+    output.labelValues(type, result).inc(delta);
+  }
+
+  @Override
+  public void recordOutputBatchSize(MirroredSolrRequest.Type type, 
SolrRequest<?> solrRequest) {
+    if (type != MirroredSolrRequest.Type.UPDATE) {
+      outputBatchSizeHistogram.labelValues(type.name(), 
solrRequest.getPath()).observe(1);
+      return;
+    }
+    UpdateRequest req = (UpdateRequest) solrRequest;
+    int addCount = req.getDocuments() == null ? 0 : req.getDocuments().size();
+    int dbiCount = req.getDeleteById() == null ? 0 : 
req.getDeleteById().size();
+    int dbqCount = req.getDeleteQuery() == null ? 0 : 
req.getDeleteQuery().size();
+    if (addCount > 0) {
+      outputBatchSizeHistogram.labelValues(type.name(), 
"add").observe(addCount);
+    }
+    if (dbiCount > 0) {
+      outputBatchSizeHistogram.labelValues(type.name(), 
"dbi").observe(dbiCount);
+    }
+    if (dbqCount > 0) {
+      outputBatchSizeHistogram.labelValues(type.name(), 
"dbq").observe(dbqCount);
+    }
+  }
+
+  @Override
+  public void recordOutputBackoffSize(MirroredSolrRequest.Type type, long 
backoffTimeMs) {
+    outputBackoffHistogram.labelValues(type.name()).observe(backoffTimeMs);
+  }
+
+  @Override
+  public void recordOutputFirstAttemptSize(MirroredSolrRequest.Type type, long 
firstAttemptTimeNs) {
+    
outputFirstAttemptHistogram.labelValues(type.name()).observe(firstAttemptTimeNs);
+  }

Review Comment:
   This says "size" but it recording a time? But also this is just recording a 
first attempt time based on the name. Does that mean this records literally one 
time for this metric and never observed again? If so, I debate if these metrics 
are worth having. Metrics are for aggregation across a time series and if this 
records just once, then I think just logging has more value.



##########
solr/cross-dc-manager/src/java/org/apache/solr/crossdc/manager/consumer/KafkaCrossDcConsumer.java:
##########
@@ -408,19 +408,19 @@ boolean pollAndProcessRequests() {
               List<SolrInputDocument> docs = update.getDocuments();
               if (docs != null) {
                 updateReqBatch.add(docs);
-                metrics.counter(MetricRegistry.name(type.name(), 
"add")).inc(docs.size());
+                metrics.incrementInputCounter(type.name(), "add");

Review Comment:
   This and the metric below are only incrementing by 1 instead of `doc.size` 
now. Is that right?



##########
solr/cross-dc-manager/src/java/org/apache/solr/crossdc/manager/consumer/PrometheusMetrics.java:
##########
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.crossdc.manager.consumer;
+
+import io.prometheus.metrics.core.datapoints.Timer;
+import io.prometheus.metrics.core.metrics.Counter;
+import io.prometheus.metrics.core.metrics.Histogram;
+import io.prometheus.metrics.model.registry.PrometheusRegistry;
+import org.apache.solr.client.solrj.SolrRequest;
+import org.apache.solr.client.solrj.request.UpdateRequest;
+import org.apache.solr.crossdc.common.MirroredSolrRequest;
+
+public class PrometheusMetrics implements ConsumerMetrics {
+
+  protected PrometheusRegistry registry;
+  protected Counter input;
+  protected Counter collapsed;
+  protected Counter output;
+  protected Histogram outputBatchSizeHistogram;
+  protected Histogram outputTimeHistogram;
+  protected Histogram outputBackoffHistogram;
+  protected Histogram outputFirstAttemptHistogram;
+
+  public PrometheusMetrics() {
+    register(PrometheusRegistry.defaultRegistry);
+  }
+
+  protected void register(PrometheusRegistry registry) {
+    this.registry = registry;
+    input =
+        Counter.builder()
+            .name("consumer_input_total")
+            .help("Total number of input messages")
+            .labelNames("type", "subtype")

Review Comment:
   I question most of these metrics really need `type` label. What is the 
cardinality of it and possible different combinations? I see in the test 
`UPDATE` is one. Is there also `QUERY` or something along those lines?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to