AlexStocks commented on code in PR #962:
URL: https://github.com/apache/dubbo-go-pixiu/pull/962#discussion_r3401226326


##########
pkg/cluster/metrics.go:
##########
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package cluster
+
+import (
+       "context"
+       "sync"
+)
+
+import (
+       "go.opentelemetry.io/otel"
+       "go.opentelemetry.io/otel/attribute"
+       "go.opentelemetry.io/otel/metric"
+)
+
+import (
+       "github.com/apache/dubbo-go-pixiu/pkg/logger"
+)
+
+// Endpoint snapshot publication metrics. Instruments are bound lazily to the
+// global MeterProvider on first publish; when metrics are disabled the 
provider
+// is a no-op, so recording stays a cheap no-op and never blocks publication.
+var (
+       snapshotMetricsOnce sync.Once
+
+       snapshotPublishTotal  metric.Int64Counter
+       snapshotEndpointCount metric.Int64Gauge
+       snapshotHealthyCount  metric.Int64Gauge
+)
+
+func initSnapshotMetrics() {
+       snapshotMetricsOnce.Do(func() {
+               meter := otel.GetMeterProvider().Meter("pixiu")
+
+               var err error
+               if snapshotPublishTotal, err = 
meter.Int64Counter("pixiu_cluster_snapshot_publish_total",
+                       metric.WithDescription("Total number of cluster 
endpoint snapshots successfully published.")); err != nil {
+                       logger.Errorf("[dubbo-go-pixiu] register 
pixiu_cluster_snapshot_publish_total failed: %v", err)
+               }
+               if snapshotEndpointCount, err = 
meter.Int64Gauge("pixiu_cluster_snapshot_endpoint_count",
+                       metric.WithDescription("Total endpoints in the latest 
published cluster snapshot.")); err != nil {
+                       logger.Errorf("[dubbo-go-pixiu] register 
pixiu_cluster_snapshot_endpoint_count failed: %v", err)
+               }
+               if snapshotHealthyCount, err = 
meter.Int64Gauge("pixiu_cluster_snapshot_healthy_endpoint_count",
+                       metric.WithDescription("Healthy endpoints in the latest 
published cluster snapshot.")); err != nil {
+                       logger.Errorf("[dubbo-go-pixiu] register 
pixiu_cluster_snapshot_healthy_endpoint_count failed: %v", err)
+               }
+       })
+}
+
+// recordSnapshotPublish reports a successful snapshot publication for a 
cluster.
+// Call exactly once per successful CompareAndSwap of the published snapshot. 
The
+// only label is the cluster name, which is bounded by configuration rather 
than
+// request traffic, so cardinality stays low. The health-update callers invoke
+// this while holding the cluster's healthMu, so the body must stay
+// allocation-light and must not block.
+func recordSnapshotPublish(clusterName string, snapshot *EndpointSnapshot) {
+       initSnapshotMetrics()
+
+       attrs := metric.WithAttributes(attribute.String("cluster", clusterName))
+       ctx := context.Background()
+
+       if snapshotPublishTotal != nil {
+               snapshotPublishTotal.Add(ctx, 1, attrs)
+       }

Review Comment:
   [P1] initSnapshotMetrics 第一次调用会在 healthMu 
持有期间执行:UpdateEndpointHealth/UpdateEndpointAddressHealth 都在 healthMu.Lock() 
之后直接走到 recordSnapshotPublish → initSnapshotMetrics → 
meter.Int64Counter/Int64Gauge × 3,注册失败时还要走 logger.Errorf,整个路径的分配和潜在 IO 
全部发生在锁内。这与本函数注释“health-update callers invoke this while holding the cluster s 
healthMu, so the body must stay allocation-light and must not 
block”直接冲突,会让首个健康事件、或者每次 registerOtelMeterProvider 之后第一次 publish 抢占 
healthMu,阻塞所有并发健康事件。建议在 cluster 包初始化(或 registerOtelMetricMeter 完成后)显式调用一次 
initSnapshotMetrics 把仪表预绑定好,让 recordSnapshotPublish 只剩 Add/Record。



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to