This is an automated email from the ASF dual-hosted git repository.
cmcfarlen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/trafficserver.git
The following commit(s) were added to refs/heads/master by this push:
new 53adece861 Event loop sub-stats (#11761)
53adece861 is described below
commit 53adece861b975b85b53e6197d1d6b297225b593
Author: Nathan Wang <[email protected]>
AuthorDate: Tue Oct 1 11:40:53 2024 -0700
Event loop sub-stats (#11761)
* introduce sub-event loop stats
* add documentation
---------
Co-authored-by: Nathan Wang <[email protected]>
---
.../monitoring/statistics/core/eventloop.en.rst | 45 +++++++++++++
include/iocore/eventsystem/EThread.h | 78 ++++++++++++++++++----
src/iocore/eventsystem/UnixEThread.cc | 31 ++++++---
src/iocore/eventsystem/UnixEventProcessor.cc | 3 +
src/iocore/net/NetHandler.cc | 9 ++-
5 files changed, 142 insertions(+), 24 deletions(-)
diff --git a/doc/admin-guide/monitoring/statistics/core/eventloop.en.rst
b/doc/admin-guide/monitoring/statistics/core/eventloop.en.rst
index e8946ee189..9d2acb7e73 100644
--- a/doc/admin-guide/monitoring/statistics/core/eventloop.en.rst
+++ b/doc/admin-guide/monitoring/statistics/core/eventloop.en.rst
@@ -80,6 +80,21 @@ high then it is likely transactions are experiencing
significant latency.
The maximum amount of time spent in a single loop in the last 10 seconds.
+.. ts:stat:: global proxy.process.eventloop.drain.queue.max.10s integer
+ :units: nanoseconds
+
+ The maximum amount of time spent draining the event queue in a single loop
in the last 10 seconds.
+
+.. ts:stat:: global proxy.process.eventloop.io.wait.max.10s integer
+ :units: nanoseconds
+
+ The maximum amount of time spent waiting for network IO in a single loop
in the last 10 seconds.
+
+.. ts:stat:: global proxy.process.eventloop.io.work.max.10s integer
+ :units: nanoseconds
+
+ The maximum amount of time spent processing network IO in a single loop in
the last 10 seconds.
+
.. rubric:: 100 Second Metrics
.. ts:stat:: global proxy.process.eventloop.count.100s integer
@@ -113,6 +128,21 @@ high then it is likely transactions are experiencing
significant latency.
The maximum amount of time spent in a single loop in the last 100 seconds.
+.. ts:stat:: global proxy.process.eventloop.drain.queue.max.100s integer
+ :units: nanoseconds
+
+ The maximum amount of time spent draining the event queue in a single loop
in the last 100 seconds.
+
+.. ts:stat:: global proxy.process.eventloop.io.wait.max.100s integer
+ :units: nanoseconds
+
+ The maximum amount of time spent waiting for network IO in a single loop
in the last 100 seconds.
+
+.. ts:stat:: global proxy.process.eventloop.io.work.max.100s integer
+ :units: nanoseconds
+
+ The maximum amount of time spent processing network IO in a single loop in
the last 100 seconds.
+
.. rubric:: 1000 Second Metrics
.. ts:stat:: global proxy.process.eventloop.count.1000s integer
@@ -146,6 +176,21 @@ high then it is likely transactions are experiencing
significant latency.
The maximum amount of time spent in a single loop in the last 1000 seconds.
+.. ts:stat:: global proxy.process.eventloop.drain.queue.max.1000s integer
+ :units: nanoseconds
+
+ The maximum amount of time spent draining the event queue in a single loop
in the last 1000 seconds.
+
+.. ts:stat:: global proxy.process.eventloop.io.wait.max.1000s integer
+ :units: nanoseconds
+
+ The maximum amount of time spent waiting for network IO in a single loop
in the last 1000 seconds.
+
+.. ts:stat:: global proxy.process.eventloop.io.work.max.1000s integer
+ :units: nanoseconds
+
+ The maximum amount of time spent processing network IO in a single loop in
the last 1000 seconds.
+
.. rubric:: Histogram Metrics
.. ts:stat:: global proxy.process.eventloop.time.*ms integer
diff --git a/include/iocore/eventsystem/EThread.h
b/include/iocore/eventsystem/EThread.h
index 8e404e7540..9b6f64bfed 100644
--- a/include/iocore/eventsystem/EThread.h
+++ b/include/iocore/eventsystem/EThread.h
@@ -418,10 +418,15 @@ public:
/// Data for timing of the loop.
struct Duration {
- ink_hrtime _start = 0; ///< The time of the first loop for
this sample. Used to mark valid entries.
- ink_hrtime _min = INT64_MAX; ///< Shortest loop time.
- ink_hrtime _max = 0; ///< Longest loop time.
- Duration() = default;
+ ink_hrtime _start = 0; ///< The time of the first
loop for this sample. Used to mark valid entries.
+ ink_hrtime _min = INT64_MAX; ///< Shortest loop time.
+ ink_hrtime _max = 0; ///< Longest loop time.
+ ink_hrtime _max_io_wait = 0; ///< Longest time spent
waiting for IO activity
+ ink_hrtime _max_io_work = 0; ///< Longest time spent
processing IO ready list
+ ink_hrtime _max_drain_queue = 0; ///< Longest time spent
draining the event queue
+ ink_hrtime _last_io_work = 0; ///< time spent processing IO
this loop, needed for ratio
+ ink_hrtime _io_work_at_max = 0; ///< the time spent
processing IO within the longest loop time
+ Duration() = default;
} _duration;
/// Events in the slice.
@@ -456,6 +461,21 @@ public:
*/
self_type &record_event_count(int count);
+ /** Record duration of loop spent processing event queue.
+ *
+ * @param time Time spent processing the queue
+ * @return @a this.
+ */
+ self_type &record_drain_queue(ink_hrtime time);
+
+ /** Record time spent doing IO, both poll and processing time.
+ *
+ * @param io_wait Time spent polling for IO activity
+ * @param io_work Time spent processing the ready list
+ * @return @a this.
+ */
+ self_type &record_io_stats(ink_hrtime io_wait, ink_hrtime io_work);
+
/// Add @a that to @a this data.
/// This embodies the custom logic per member concerning whether each is
a sum, min, or max.
Slice &operator+=(Slice const &that);
@@ -465,16 +485,19 @@ public:
More than one part of the code depends on this exact order. Be
careful and thorough when changing.
*/
enum class STAT_ID {
- LOOP_COUNT, ///< # of event loops executed.
- LOOP_EVENTS, ///< # of events
- LOOP_EVENTS_MIN, ///< min # of events dispatched in a loop
- LOOP_EVENTS_MAX, ///< max # of events dispatched in a loop
- LOOP_WAIT, ///< # of loops that did a conditional wait.
- LOOP_TIME_MIN, ///< Shortest time spent in loop.
- LOOP_TIME_MAX, ///< Longest time spent in loop.
+ LOOP_COUNT, ///< # of event loops executed.
+ LOOP_EVENTS, ///< # of events
+ LOOP_EVENTS_MIN, ///< min # of events dispatched in a loop
+ LOOP_EVENTS_MAX, ///< max # of events dispatched in a loop
+ LOOP_WAIT, ///< # of loops that did a conditional wait.
+ LOOP_TIME_MIN, ///< Shortest time spent in loop.
+ LOOP_TIME_MAX, ///< Longest time spent in loop.
+    LOOP_DRAIN_QUEUE_MAX, ///< Max time draining the event queue
+    LOOP_IO_WAIT_MAX,     ///< Max time spent IO waiting
+    LOOP_IO_WORK_MAX,     ///< Max time spent IO processing
};
/// Number of statistics for a slice.
- static constexpr unsigned N_STAT_ID = unsigned(STAT_ID::LOOP_TIME_MAX) +
1;
+ static constexpr unsigned N_STAT_ID =
unsigned(STAT_ID::LOOP_IO_WORK_MAX) + 1;
/// Statistic name stems.
/// These will be qualified by time scale.
@@ -578,7 +601,8 @@ inline auto
EThread::Metrics::Slice::record_loop_duration(ink_hrtime delta) -> self_type &
{
if (delta > _duration._max) {
- _duration._max = delta;
+ _duration._max = delta;
+ _duration._io_work_at_max = _duration._last_io_work;
}
if (delta < _duration._min) {
_duration._min = delta;
@@ -599,6 +623,34 @@ EThread::Metrics::Slice::record_event_count(int count) ->
self_type &
return *this;
}
+inline auto
+EThread::Metrics::Slice::record_drain_queue(ink_hrtime drain_queue) ->
self_type &
+{
+ if (drain_queue > 0) {
+ if (drain_queue > _duration._max_drain_queue) {
+ _duration._max_drain_queue = drain_queue;
+ }
+ }
+ return *this;
+}
+
+inline auto
+EThread::Metrics::Slice::record_io_stats(ink_hrtime io_wait, ink_hrtime
io_work) -> self_type &
+{
+ if (io_wait > 0) {
+ if (io_wait > _duration._max_io_wait) {
+ _duration._max_io_wait = io_wait;
+ }
+ }
+ if (io_work > 0) {
+ _duration._last_io_work = io_work;
+ if (io_work > _duration._max_io_work) {
+ _duration._max_io_work = io_work;
+ }
+ }
+ return *this;
+}
+
inline EThread::Metrics::Slice *
EThread::Metrics::prev_slice(EThread::Metrics::Slice *current)
{
diff --git a/src/iocore/eventsystem/UnixEThread.cc
b/src/iocore/eventsystem/UnixEThread.cc
index b2e1f37560..cfcd889fbe 100644
--- a/src/iocore/eventsystem/UnixEThread.cc
+++ b/src/iocore/eventsystem/UnixEThread.cc
@@ -45,9 +45,12 @@ struct AIOCallback;
// !! THIS MUST BE IN THE ENUM ORDER !!
char const *const EThread::Metrics::Slice::STAT_NAME[] = {
- "proxy.process.eventloop.count", "proxy.process.eventloop.events",
"proxy.process.eventloop.events.min",
- "proxy.process.eventloop.events.max", "proxy.process.eventloop.wait",
"proxy.process.eventloop.time.min",
- "proxy.process.eventloop.time.max"};
+ "proxy.process.eventloop.count", "proxy.process.eventloop.events",
+ "proxy.process.eventloop.events.min", "proxy.process.eventloop.events.max",
+ "proxy.process.eventloop.wait", "proxy.process.eventloop.time.min",
+ "proxy.process.eventloop.time.max",
"proxy.process.eventloop.drain.queue.max",
+ "proxy.process.eventloop.io.wait.max", "proxy.process.eventloop.io.work.max",
+};
int thread_max_heartbeat_mseconds = THREAD_MAX_HEARTBEAT_MSECONDS;
@@ -297,6 +300,10 @@ EThread::execute_regular()
sleep_time = 0;
}
+ // drained the queue by this point
+ ink_hrtime post_drain = ink_get_hrtime();
+ ink_hrtime drain_queue = post_drain - loop_start_time;
+
tail_cb->waitForActivity(sleep_time);
// loop cleanup
@@ -309,6 +316,7 @@ EThread::execute_regular()
metrics.decay();
metrics.record_loop_time(delta);
current_slice->record_event_count(ev_count);
+ current_slice->record_drain_queue(drain_queue);
}
}
@@ -364,13 +372,16 @@ EThread::execute()
EThread::Metrics::Slice &
EThread::Metrics::Slice::operator+=(Slice const &that)
{
- this->_events._max = std::max(this->_events._max, that._events._max);
- this->_events._min = std::min(this->_events._min, that._events._min);
- this->_events._total += that._events._total;
- this->_duration._min = std::min(this->_duration._min, that._duration._min);
- this->_duration._max = std::max(this->_duration._max, that._duration._max);
- this->_count += that._count;
- this->_wait += that._wait;
+ this->_events._max = std::max(this->_events._max,
that._events._max);
+ this->_events._min = std::min(this->_events._min,
that._events._min);
+ this->_events._total += that._events._total;
+ this->_duration._min = std::min(this->_duration._min,
that._duration._min);
+ this->_duration._max = std::max(this->_duration._max,
that._duration._max);
+ this->_count += that._count;
+ this->_wait += that._wait;
+ this->_duration._max_drain_queue =
std::max(this->_duration._max_drain_queue, that._duration._max_drain_queue);
+ this->_duration._max_io_wait = std::max(this->_duration._max_io_wait,
that._duration._max_io_wait);
+ this->_duration._max_io_work = std::max(this->_duration._max_io_work,
that._duration._max_io_work);
return *this;
}
diff --git a/src/iocore/eventsystem/UnixEventProcessor.cc
b/src/iocore/eventsystem/UnixEventProcessor.cc
index aae8e76f4c..27f620d69c 100644
--- a/src/iocore/eventsystem/UnixEventProcessor.cc
+++ b/src/iocore/eventsystem/UnixEventProcessor.cc
@@ -118,6 +118,9 @@ EventMetricStatSync(const char *, RecDataT, RecData *,
RecRawStatBlock *rsb, int
slice_stat_update(ID::LOOP_EVENTS, id, slice->_events._total);
slice_stat_update(ID::LOOP_EVENTS_MIN, id, slice->_events._min);
slice_stat_update(ID::LOOP_EVENTS_MAX, id, slice->_events._max);
+ slice_stat_update(ID::LOOP_DRAIN_QUEUE_MAX, id,
slice->_duration._max_drain_queue);
+ slice_stat_update(ID::LOOP_IO_WAIT_MAX, id, slice->_duration._max_io_wait);
+ slice_stat_update(ID::LOOP_IO_WORK_MAX, id, slice->_duration._max_io_work);
}
// Next are the event loop histogram buckets.
diff --git a/src/iocore/net/NetHandler.cc b/src/iocore/net/NetHandler.cc
index 1b36780051..208728636a 100644
--- a/src/iocore/net/NetHandler.cc
+++ b/src/iocore/net/NetHandler.cc
@@ -357,8 +357,11 @@ NetHandler::waitForActivity(ink_hrtime timeout)
#endif
// Polling event by PollCont
- PollCont *p = get_PollCont(this->thread);
+ PollCont *p = get_PollCont(this->thread);
+ ink_hrtime pre_poll = ink_get_hrtime();
p->do_poll(timeout);
+ ink_hrtime post_poll = ink_get_hrtime();
+ ink_hrtime poll_time = post_poll - pre_poll;
// Get & Process polling result
PollDescriptor *pd = get_PollDescriptor(this->thread);
@@ -372,6 +375,10 @@ NetHandler::waitForActivity(ink_hrtime timeout)
pd->result = 0;
process_ready_list();
+ ink_hrtime post_process = ink_get_hrtime();
+ ink_hrtime process_time = post_process - post_poll;
+
this->thread->metrics.current_slice.load(std::memory_order_acquire)->record_io_stats(poll_time,
process_time);
+
#if TS_USE_LINUX_IO_URING
ur->service();
#endif