This is an automated email from the ASF dual-hosted git repository.

michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new fbc4d701b IMPALA-12492: Add metrics for total pending events and lag 
time of the event-processor
fbc4d701b is described below

commit fbc4d701b1caa2c376c73a11e919d4ca38d64d48
Author: stiga-huang <[email protected]>
AuthorDate: Wed Oct 11 16:36:03 2023 +0800

    IMPALA-12492: Add metrics for total pending events and lag time of the 
event-processor
    
    We have last-synced-event-time and latest-event-time on the catalogd
    WebUI. It's tedious for users to calculate their difference to get the
    lag time. This patch adds the lag to the WebUI directly. The unit is
    set to TIME_S so it's shown in a human readable format. Also adds the
    total pending events in the same page.
    
    Tests
     - Manually verified the metrics by generating lots of HMS events,
       hard-coding EVENTS_BATCH_SIZE_PER_RPC to 5, and adding some sleeps
       in event processing so that the lag is easier to observe.
    
    Change-Id: Id329879bfacba9c8415f920b57939dc571d1aad9
    Reviewed-on: http://gerrit.cloudera.org:8080/20560
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 be/src/util/event-metrics.cc | 29 +++++++++++++++++++++++++++++
 be/src/util/event-metrics.h  | 12 ++++++++++++
 common/thrift/metrics.json   | 20 ++++++++++++++++++++
 3 files changed, 61 insertions(+)

diff --git a/be/src/util/event-metrics.cc b/be/src/util/event-metrics.cc
index 061025c6d..d06ddd91d 100644
--- a/be/src/util/event-metrics.cc
+++ b/be/src/util/event-metrics.cc
@@ -70,6 +70,9 @@ string MetastoreEventMetrics::LATEST_EVENT_ID_METRIC_NAME =
     "events-processor.latest-event-id";
 string MetastoreEventMetrics::LATEST_EVENT_TIME_METRIC_NAME =
     "events-processor.latest-event-time";
+string MetastoreEventMetrics::PENDING_EVENTS_METRIC_NAME =
+    "events-processor.pending-events";
+string MetastoreEventMetrics::LAG_TIME_METRIC_NAME = 
"events-processor.lag-time";
 
 IntCounter* MetastoreEventMetrics::NUM_EVENTS_RECEIVED_COUNTER = nullptr;
 IntCounter* MetastoreEventMetrics::NUM_EVENTS_SKIPPED_COUNTER = nullptr;
@@ -95,6 +98,8 @@ IntCounter* MetastoreEventMetrics::LAST_SYNCED_EVENT_ID = 
nullptr;
 IntCounter* MetastoreEventMetrics::LAST_SYNCED_EVENT_TIME = nullptr;
 IntCounter* MetastoreEventMetrics::LATEST_EVENT_ID = nullptr;
 IntCounter* MetastoreEventMetrics::LATEST_EVENT_TIME = nullptr;
+IntCounter* MetastoreEventMetrics::PENDING_EVENTS = nullptr;
+IntCounter* MetastoreEventMetrics::LAG_TIME = nullptr;
 
 // Initialize all the metrics for the events metric group
 void MetastoreEventMetrics::InitMetastoreEventMetrics(MetricGroup* 
metric_group) {
@@ -146,6 +151,8 @@ void 
MetastoreEventMetrics::InitMetastoreEventMetrics(MetricGroup* metric_group)
       event_metrics->AddCounter(LATEST_EVENT_ID_METRIC_NAME, 0);
   LATEST_EVENT_TIME =
       event_metrics->AddCounter(LATEST_EVENT_TIME_METRIC_NAME, 0);
+  PENDING_EVENTS = event_metrics->AddCounter(PENDING_EVENTS_METRIC_NAME, 0);
+  LAG_TIME = event_metrics->AddCounter(LAG_TIME_METRIC_NAME, 0);
 }
 
 void MetastoreEventMetrics::refresh(TEventProcessorMetrics* response) {
@@ -213,5 +220,27 @@ void 
MetastoreEventMetrics::refresh(TEventProcessorMetrics* response) {
   if (response->__isset.latest_event_time) {
     LATEST_EVENT_TIME->SetValue(response->latest_event_time);
   }
+  // last_synced_event_time is 0 at the startup until we have synced any 
events.
+  if (response->__isset.latest_event_time && 
response->__isset.last_synced_event_time
+      && response->last_synced_event_time > 0) {
+    // latest_event_time and last_synced_event_time are updated by different 
threads.
+    // It's possible that latest_event_time is stale and smaller than
+    // last_synced_event_time. Set the lag to 0 in this case.
+    if (response->latest_event_time <= response->last_synced_event_time) {
+      LAG_TIME->SetValue(0);
+    } else {
+      LAG_TIME->SetValue(response->latest_event_time - 
response->last_synced_event_time);
+    }
+  }
+  if (response->__isset.latest_event_id && 
response->__isset.last_synced_event_id) {
+    // Same as above, latest_event_id and last_synced_event_id are updated by 
different
+    // threads. Set the value to 0 if latest_event_id is stale.
+    if (response->latest_event_id <= response->last_synced_event_id) {
+      PENDING_EVENTS->SetValue(0);
+    } else {
+      PENDING_EVENTS->SetValue(
+          response->latest_event_id - response->last_synced_event_id);
+    }
+  }
 }
 } // namespace impala
diff --git a/be/src/util/event-metrics.h b/be/src/util/event-metrics.h
index e314c9bcf..fd6dc1f9f 100644
--- a/be/src/util/event-metrics.h
+++ b/be/src/util/event-metrics.h
@@ -85,6 +85,12 @@ class MetastoreEventMetrics {
   /// Latest metastore event time
   static IntCounter* LATEST_EVENT_TIME;
 
+  /// Number of events pending to be synced
+  static IntCounter* PENDING_EVENTS;
+
+  /// Lag time of the event processing
+  static IntCounter* LAG_TIME;
+
  private:
   /// Following metric names must match with the key in metrics.json
 
@@ -135,6 +141,12 @@ class MetastoreEventMetrics {
 
   /// Metric name for the event time of the latest metastore event
   static std::string LATEST_EVENT_TIME_METRIC_NAME;
+
+  /// Metric name for the number of pending events to be synced
+  static std::string PENDING_EVENTS_METRIC_NAME;
+
+  /// Metric name for the lag time of the event processing
+  static std::string LAG_TIME_METRIC_NAME;
 };
 
 } // namespace impala
diff --git a/common/thrift/metrics.json b/common/thrift/metrics.json
index d9391818c..ae53d06d9 100644
--- a/common/thrift/metrics.json
+++ b/common/thrift/metrics.json
@@ -3157,6 +3157,26 @@
     "kind" : "COUNTER",
     "key" : "events-processor.latest-event-time"
   },
+  {
+    "description": "Number of pending events to be synced, i.e. the difference 
between latest-event-id and last-synced-event-id",
+    "contexts": [
+      "CATALOGSERVER"
+    ],
+    "label": "Pending Events",
+    "units": "NONE",
+    "kind" : "COUNTER",
+    "key" : "events-processor.pending-events"
+  },
+  {
+    "description": "Lag time of the event processing, i.e. the difference 
between latest-event-time and last-synced-event-time",
+    "contexts": [
+      "CATALOGSERVER"
+    ],
+    "label": "Lag Time",
+    "units": "TIME_S",
+    "kind" : "COUNTER",
+    "key" : "events-processor.lag-time"
+  },
   {
     "description": "Total number of executor groups that have at least one 
executor",
     "contexts": [

Reply via email to