On Thu, Aug 18, 2016 at 3:17 PM, Yann Ylavic <[email protected]> wrote:
>
> I was thinking of something like the attached patch, where we compute
> the exact/minimal timeout needed for the queues.
>
> It requires walking/locking the queues (looking at the first item
> only), but besides getting the exact poll() timeout, which helps avoid
> spurious wakeups, it also allows us to avoid walking these queues after
> the poll() (previously every 0.1s, for maintenance) when not necessary.
> So overall I think it's a gain.
Finally I came up with another way which requires no walking (hence no
locking) in the listener outside of queues/timers processing (patch
attached).
It looks good to me, though it's not heavily tested/debugged yet...
The approach is, for both timeout queues and timers, to maintain the
next (lowest) expiry time of all entries (namely queues_next_expiry
and timers_next_expiry).
These global values are kept up to date in TO_QUEUE_APPEND (for
timeout queues) and event_get_timer_event (for timers, where the
skiplist is filled) by comparing them against the expiry of each newly
added entry (updates are already protected by the respective
timeout/skiplist mutex).
This happens in the workers and is cheap (the locking was already
there to protect against concurrent workers); it just requires an
apr_pollset_wakeup() to notify the listener whenever one of these
expiries is lowered.
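Schematically, the worker-side update is the following (a sketch
extracted from TO_QUEUE_APPEND in the attached patch;
event_get_timer_event does the same with te->when and
timers_next_expiry under g_timer_skiplist_mtx):

/* Worker side, with timeout_mutex held, right after appending an
 * entry to a timeout queue; 'expiry' is that entry's expiry time.
 */
if (!queues_next_expiry || queues_next_expiry > expiry) {
    queues_next_expiry = expiry;
    /* Unblock the listener if it's waiting on a longer timeout. */
    if (listener_is_wakeable) {
        apr_pollset_wakeup(event_pollset);
    }
}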
So now, in the listener, we can read these values directly to compute
the minimal poll() timeout to use, and also, for the timeout queues,
to determine whether or not it's time to process them.
If an expiry is updated while the listener is poll()ing, it is woken
up and the timeout is recomputed.
No locking is needed here; volatile reads are enough (no
atomic/ordered read is required, and there is no apr_atomic_read64 to
handle an apr_time_t anyway) since the listener is guaranteed to be
woken up when needed.
This also benefits the !listener_is_wakeable case (which keeps a
maximum poll() timeout of 100ms), since we avoid the lock when no
timer is armed (we don't use timers in 2.4.x AFAICT), and also avoid
spurious timeout-queue processing.
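On the listener side it then boils down to something like the below (a
simplified sketch: next_poll_timeout is a made-up name for
illustration, the real code is inline in listener_thread and also pops
the already-expired timers under g_timer_skiplist_mtx first):

/* Compute the poll() timeout from the two lock-free globals;
 * -1 means block until a wakeup.
 */
static apr_interval_time_t next_poll_timeout(void)
{
    apr_interval_time_t timeout_interval = -1;
    apr_time_t now = apr_time_now(), expiry;

    expiry = VOLATILE_READ(apr_time_t, timers_next_expiry);
    if (expiry) {
        timeout_interval = (expiry > now) ? expiry - now : 1;
    }

    expiry = VOLATILE_READ(apr_time_t, queues_next_expiry);
    if (expiry
        && (timeout_interval < 0 || expiry - now < timeout_interval)) {
        timeout_interval = (expiry > now) ? expiry - now : 1;
    }

    /* Without APR_POLLSET_WAKEABLE we must still wake up periodically
     * (100ms) for maintenance and shutdown checks.
     */
    if (!listener_is_wakeable
        && (timeout_interval < 0
            || timeout_interval > apr_time_from_msec(100))) {
        timeout_interval = apr_time_from_msec(100);
    }
    return timeout_interval;
}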
The patch includes Luca's http://apaste.info/mke (the base of the
APR_POLLSET_WAKEABLE handling) and also tries to make the existing
critical sections as short as possible (barely related, but while at
it :)
Thoughts?
Regards,
Yann.
Index: server/mpm/event/event.c
===================================================================
--- server/mpm/event/event.c (revision 1757019)
+++ server/mpm/event/event.c (working copy)
@@ -107,6 +107,8 @@
#include "serf.h"
#endif
+#define VOLATILE_READ(T, x) (*(volatile T *)&(x))
+
/* Limit on the total --- clients will be locked out if more servers than
* this are needed. It is intended solely to keep the server from crashing
* when things get out of hand.
@@ -182,6 +184,7 @@ static int dying = 0;
static int workers_may_exit = 0;
static int start_thread_may_exit = 0;
static int listener_may_exit = 0;
+static int listener_is_wakeable = 0; /* Pollset supports APR_POLLSET_WAKEABLE */
static int num_listensocks = 0;
static apr_int32_t conns_this_child; /* MaxConnectionsPerChild, only access
in listener thread */
@@ -204,6 +207,17 @@ module AP_MODULE_DECLARE_DATA mpm_event_module;
struct event_srv_cfg_s;
typedef struct event_srv_cfg_s event_srv_cfg;
+/*
+ * The pollset for sockets that are in any of the timeout queues. Currently
+ * we use the timeout_mutex to make sure that connections are added/removed
+ * atomically to/from both event_pollset and a timeout queue. Otherwise
+ * some confusion can happen under high load if timeout queues and pollset
+ * get out of sync.
+ * XXX: It should be possible to make the lock unnecessary in many or even all
+ * XXX: cases.
+ */
+static apr_pollset_t *event_pollset;
+
struct event_conn_state_t {
/** APR_RING of expiration timeouts */
APR_RING_ENTRY(event_conn_state_t) timeout_list;
@@ -249,6 +263,7 @@ static struct timeout_queue *write_completion_q,
*keepalive_q,
*linger_q,
*short_linger_q;
+static apr_time_t queues_next_expiry;
static apr_pollfd_t *listener_pollfd;
@@ -256,14 +271,26 @@ static apr_pollfd_t *listener_pollfd;
* Macros for accessing struct timeout_queue.
* For TO_QUEUE_APPEND and TO_QUEUE_REMOVE, timeout_mutex must be held.
*/
-#define TO_QUEUE_APPEND(q, el) \
- do { \
- APR_RING_INSERT_TAIL(&(q)->head, el, event_conn_state_t, \
- timeout_list); \
- ++*(q)->total; \
- ++(q)->count; \
- } while (0)
+static void TO_QUEUE_APPEND(struct timeout_queue *q, event_conn_state_t *el)
+{
+ APR_RING_INSERT_TAIL(&q->head, el, event_conn_state_t, timeout_list);
+ ++*q->total;
+ ++q->count;
+ /* Cheaply update the overall queues' next expiry according to the
+ * first entry of this queue (oldest), if necessary.
+ */
+ el = APR_RING_FIRST(&q->head);
+ if (!queues_next_expiry
+ || queues_next_expiry > el->queue_timestamp + q->timeout) {
+ queues_next_expiry = el->queue_timestamp + q->timeout;
+ /* Unblock the listener if it's waiting on a longer timeout. */
+ if (listener_is_wakeable) {
+ apr_pollset_wakeup(event_pollset);
+ }
+ }
+}
+
#define TO_QUEUE_REMOVE(q, el) \
do { \
APR_RING_REMOVE(el, timeout_list); \
@@ -284,17 +311,6 @@ static apr_pollfd_t *listener_pollfd;
#define TO_QUEUE_ELEM_INIT(el) APR_RING_ELEM_INIT(el, timeout_list)
-/*
- * The pollset for sockets that are in any of the timeout queues. Currently
- * we use the timeout_mutex to make sure that connections are added/removed
- * atomically to/from both event_pollset and a timeout queue. Otherwise
- * some confusion can happen under high load if timeout queues and pollset
- * get out of sync.
- * XXX: It should be possible to make the lock unnecessary in many or even all
- * XXX: cases.
- */
-static apr_pollset_t *event_pollset;
-
#if HAVE_SERF
typedef struct {
apr_pollset_t *pollset;
@@ -493,6 +509,11 @@ static void wakeup_listener(void)
return;
}
+ /* unblock the listener if it's poll()ing */
+ if (listener_is_wakeable) {
+ apr_pollset_wakeup(event_pollset);
+ }
+
/* unblock the listener if it's waiting for a worker */
ap_queue_info_term(worker_queue_info);
@@ -696,7 +717,11 @@ static apr_status_t decrement_connection_count(voi
default:
break;
}
- apr_atomic_dec32(&connection_count);
+ /* Unblock the listener if it's waiting for connection_count = 0 */
+ if (!apr_atomic_dec32(&connection_count)
+ && listener_is_wakeable && listener_may_exit) {
+ apr_pollset_wakeup(event_pollset);
+ }
return APR_SUCCESS;
}
@@ -859,6 +884,7 @@ static void notify_resume(event_conn_state_t *cs,
static int start_lingering_close_common(event_conn_state_t *cs, int in_worker)
{
+ int done = 0;
apr_status_t rv;
struct timeout_queue *q;
apr_socket_t *csd = cs->pfd.desc.s;
@@ -870,7 +896,6 @@ static int start_lingering_close_common(event_conn
#else
apr_socket_timeout_set(csd, 0);
#endif
- cs->queue_timestamp = apr_time_now();
/*
* If some module requested a shortened waiting period, only wait for
* 2s (SECONDS_TO_LINGER). This is useful for mitigating certain
@@ -891,25 +916,25 @@ static int start_lingering_close_common(event_conn
else {
cs->c->sbh = NULL;
}
- apr_thread_mutex_lock(timeout_mutex);
- TO_QUEUE_APPEND(q, cs);
cs->pfd.reqevents = (
cs->pub.sense == CONN_SENSE_WANT_WRITE ? APR_POLLOUT :
APR_POLLIN) | APR_POLLHUP | APR_POLLERR;
cs->pub.sense = CONN_SENSE_DEFAULT;
+ cs->queue_timestamp = apr_time_now();
+ apr_thread_mutex_lock(timeout_mutex);
rv = apr_pollset_add(event_pollset, &cs->pfd);
+ if (rv == APR_SUCCESS || APR_STATUS_IS_EEXIST(rv)) {
+ TO_QUEUE_APPEND(q, cs);
+ done = 1;
+ }
apr_thread_mutex_unlock(timeout_mutex);
- if (rv != APR_SUCCESS && !APR_STATUS_IS_EEXIST(rv)) {
+ if (!done) {
ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(03092)
"start_lingering_close: apr_pollset_add failure");
- apr_thread_mutex_lock(timeout_mutex);
- TO_QUEUE_REMOVE(q, cs);
- apr_thread_mutex_unlock(timeout_mutex);
apr_socket_close(cs->pfd.desc.s);
ap_push_pool(worker_queue_info, cs->p);
- return 0;
}
- return 1;
+ return done;
}
/*
@@ -1170,15 +1195,15 @@ read_request:
* Set a write timeout for this connection, and let the
* event thread poll for writeability.
*/
- cs->queue_timestamp = apr_time_now();
notify_suspend(cs);
- apr_thread_mutex_lock(timeout_mutex);
- TO_QUEUE_APPEND(cs->sc->wc_q, cs);
cs->pfd.reqevents = (
cs->pub.sense == CONN_SENSE_WANT_READ ? APR_POLLIN :
APR_POLLOUT) | APR_POLLHUP | APR_POLLERR;
cs->pub.sense = CONN_SENSE_DEFAULT;
- rc = apr_pollset_add(event_pollset, &cs->pfd);
+ cs->queue_timestamp = apr_time_now();
+ apr_thread_mutex_lock(timeout_mutex);
+ apr_pollset_add(event_pollset, &cs->pfd);
+ TO_QUEUE_APPEND(cs->sc->wc_q, cs);
apr_thread_mutex_unlock(timeout_mutex);
return;
}
@@ -1209,14 +1234,13 @@ read_request:
* timeout today. With a normal client, the socket will be readable in
* a few milliseconds anyway.
*/
- cs->queue_timestamp = apr_time_now();
notify_suspend(cs);
- apr_thread_mutex_lock(timeout_mutex);
- TO_QUEUE_APPEND(cs->sc->ka_q, cs);
-
/* Add work to pollset. */
cs->pfd.reqevents = APR_POLLIN;
+ cs->queue_timestamp = apr_time_now();
+ apr_thread_mutex_lock(timeout_mutex);
rc = apr_pollset_add(event_pollset, &cs->pfd);
+ TO_QUEUE_APPEND(cs->sc->ka_q, cs);
apr_thread_mutex_unlock(timeout_mutex);
if (rc != APR_SUCCESS) {
@@ -1247,13 +1271,14 @@ static apr_status_t event_resume_suspended (conn_r
apr_atomic_dec32(&suspended_count);
c->suspended_baton = NULL;
- apr_thread_mutex_lock(timeout_mutex);
- TO_QUEUE_APPEND(cs->sc->wc_q, cs);
cs->pfd.reqevents = (
cs->pub.sense == CONN_SENSE_WANT_READ ? APR_POLLIN :
APR_POLLOUT) | APR_POLLHUP | APR_POLLERR;
cs->pub.sense = CONN_SENSE_DEFAULT;
+ cs->queue_timestamp = apr_time_now();
+ apr_thread_mutex_lock(timeout_mutex);
apr_pollset_add(event_pollset, &cs->pfd);
+ TO_QUEUE_APPEND(cs->sc->wc_q, cs);
apr_thread_mutex_unlock(timeout_mutex);
return OK;
@@ -1470,6 +1495,7 @@ static void get_worker(int *have_idle_worker_p, in
static APR_RING_HEAD(timer_free_ring_t, timer_event_t) timer_free_ring;
static apr_skiplist *timer_skiplist;
+static apr_time_t timers_next_expiry;
/* The following compare function is used by apr_skiplist_insert() to keep the
* elements (timers) sorted and provide O(log n) complexity (this is also true
@@ -1501,8 +1527,9 @@ static timer_event_t * event_get_timer_event(apr_t
apr_array_header_t *remove)
{
timer_event_t *te;
+ apr_time_t now = (t < 0) ? 0 : apr_time_now();
+
/* oh yeah, and make locking smarter/fine grained. */
-
apr_thread_mutex_lock(g_timer_skiplist_mtx);
if (!APR_RING_EMPTY(&timer_free_ring, timer_event_t, link)) {
@@ -1517,12 +1544,24 @@ static timer_event_t * event_get_timer_event(apr_t
te->cbfunc = cbfn;
te->baton = baton;
te->canceled = 0;
- te->when = t;
+ te->when = now + t;
te->remove = remove;
if (insert) {
/* Okay, add sorted by when.. */
apr_skiplist_insert(timer_skiplist, te);
+
+ /* Cheaply update the overall timers' next expiry according to
+ * this event, if necessary.
+ */
+ if (!timers_next_expiry
+ || timers_next_expiry > te->when) {
+ timers_next_expiry = te->when;
+ /* Unblock the listener if it's waiting on a longer timer. */
+ if (listener_is_wakeable) {
+ apr_pollset_wakeup(event_pollset);
+ }
+ }
}
apr_thread_mutex_unlock(g_timer_skiplist_mtx);
@@ -1534,7 +1573,7 @@ static apr_status_t event_register_timed_callback_
void *baton,
apr_array_header_t *remove)
{
- event_get_timer_event(t + apr_time_now(), cbfn, baton, 1, remove);
+ event_get_timer_event(t, cbfn, baton, 1, remove);
return APR_SUCCESS;
}
@@ -1594,7 +1633,7 @@ static apr_status_t event_register_poll_callback_e
if (timeout > 0) {
/* XXX: This cancel timer event can fire before the pollset is updated */
- scb->cancel_event = event_get_timer_event(timeout + apr_time_now(), tofn, baton, 1, pfds);
+ scb->cancel_event = event_get_timer_event(timeout, tofn, baton, 1, pfds);
}
for (i = 0; i < pfds->nelts; i++) {
apr_pollfd_t *pfd = (apr_pollfd_t *)pfds->elts + i;
@@ -1685,20 +1724,32 @@ static void process_timeout_queue(struct timeout_q
count = 0;
cs = first = last = APR_RING_FIRST(&qp->head);
while (cs != APR_RING_SENTINEL(&qp->head, event_conn_state_t,
- timeout_list)
- /* Trash the entry if:
- * - no timeout_time was given (asked for all), or
- * - it expired (according to the queue timeout), or
- * - the system clock skewed in the past: no entry should be
- * registered above the given timeout_time (~now) + the queue
- * timeout, we won't keep any here (eg. for centuries).
- * Stop otherwise, no following entry will match thanks to the
- * single timeout per queue (entries are added to the end!).
- * This allows maintenance in O(1).
- */
- && (!timeout_time
- || cs->queue_timestamp + qp->timeout < timeout_time
- || cs->queue_timestamp > timeout_time + qp->timeout)) {
+ timeout_list)) {
+ /* Trash the entry if:
+ * - no timeout_time was given (asked for all), or
+ * - it expired (according to the queue timeout), or
+ * - the system clock skewed in the past: no entry should be
+ * registered above the given timeout_time (~now) + the queue
+ * timeout, we won't keep any here (eg. for centuries).
+ *
+ * Otherwise stop, no following entry will match thanks to the
+ * single timeout per queue (entries are added to the end!).
+ * This allows maintenance in O(1).
+ */
+ if (timeout_time
+ && cs->queue_timestamp + qp->timeout > timeout_time
+ && cs->queue_timestamp < timeout_time + qp->timeout) {
+ /* Since this is the next expiring entry of this queue, update the
+ * overall queues' next expiry if it's later than this one.
+ */
+ apr_time_t cs_expiry = cs->queue_timestamp + qp->timeout;
+ if (!queues_next_expiry
+ || queues_next_expiry > cs_expiry) {
+ queues_next_expiry = cs_expiry;
+ }
+ break;
+ }
+
last = cs;
rv = apr_pollset_remove(event_pollset, &cs->pfd);
if (rv != APR_SUCCESS && !APR_STATUS_IS_NOTFOUND(rv)) {
@@ -1738,10 +1789,11 @@ static void * APR_THREAD_FUNC listener_thread(apr_
apr_status_t rc;
proc_info *ti = dummy;
int process_slot = ti->pslot;
+ struct process_score *ps = ap_get_scoreboard_process(process_slot);
apr_pool_t *tpool = apr_thread_pool_get(thd);
- apr_time_t timeout_time = 0, last_log;
int closed = 0, listeners_disabled = 0;
int have_idle_worker = 0;
+ apr_time_t last_log;
last_log = apr_time_now();
free(ti);
@@ -1776,8 +1828,9 @@ static void * APR_THREAD_FUNC listener_thread(apr_
apr_int32_t num = 0;
apr_uint32_t c_count, l_count, i_count;
apr_interval_time_t timeout_interval;
- apr_time_t now;
+ apr_time_t now, timeout_time;
int workers_were_busy = 0;
+
if (listener_may_exit) {
close_listeners(process_slot, &closed);
if (terminate_mode == ST_UNGRACEFUL
@@ -1820,49 +1873,81 @@ static void * APR_THREAD_FUNC listener_thread(apr_
}
#endif
+ /* Update poll() timeout below according to the next expiring
+ * timer or queue entry, if any.
+ */
+ timeout_interval = -1;
now = apr_time_now();
- apr_thread_mutex_lock(g_timer_skiplist_mtx);
- te = apr_skiplist_peek(timer_skiplist);
- if (te) {
- if (te->when > now) {
- timeout_interval = te->when - now;
- }
- else {
- timeout_interval = 1;
- }
- }
- else {
- timeout_interval = apr_time_from_msec(100);
- }
- while (te) {
- if (te->when < now + EVENT_FUDGE_FACTOR) {
- apr_skiplist_pop(timer_skiplist, NULL);
- if (!te->canceled) {
- if (te->remove) {
- int i;
- for (i = 0; i < te->remove->nelts; i++) {
- apr_pollfd_t *pfd = (apr_pollfd_t *)te->remove->elts + i;
- apr_pollset_remove(event_pollset, pfd);
+
+ /* Avoid locking if there's no expiring timer in the list;
+ * poll() will be woken up anyway if a new timer comes in.
+ */
+ timeout_time = VOLATILE_READ(apr_time_t, timers_next_expiry);
+ if (timeout_time && timeout_time < now + EVENT_FUDGE_FACTOR) {
+ /* Push expired timers to a worker; the first one remaining
+ * determines the maximum time to poll() below.
+ */
+ apr_thread_mutex_lock(g_timer_skiplist_mtx);
+ while ((te = apr_skiplist_peek(timer_skiplist))) {
+ if (te->when < now + EVENT_FUDGE_FACTOR) {
+ apr_skiplist_pop(timer_skiplist, NULL);
+ if (!te->canceled) {
+ if (te->remove) {
+ int i;
+ for (i = 0; i < te->remove->nelts; i++) {
+ apr_pollfd_t *pfd;
+ pfd = (apr_pollfd_t *)te->remove->elts + i;
+ apr_pollset_remove(event_pollset, pfd);
+ }
}
+ push_timer2worker(te);
}
- push_timer2worker(te);
+ else {
+ APR_RING_INSERT_TAIL(&timer_free_ring, te,
+ timer_event_t, link);
+ }
}
else {
- APR_RING_INSERT_TAIL(&timer_free_ring, te, timer_event_t,
- link);
+ timeout_interval = te->when - now;
+ break;
}
}
- else {
- break;
+ /* If there are no timers in the list, either the listener is
+ * wakeable and it can poll() indefinitely until a wake up occurs,
+ * or periodic checks must be performed.
+ */
+ if (!te) {
+ if (!listener_is_wakeable) {
+ timeout_interval = apr_time_from_msec(100);
+ }
+ timers_next_expiry = 0;
}
- te = apr_skiplist_peek(timer_skiplist);
+ apr_thread_mutex_unlock(g_timer_skiplist_mtx);
}
- apr_thread_mutex_unlock(g_timer_skiplist_mtx);
+ /* Same for the queues: if the listener is wakeable, use the current
+ * expiry time and expect to be woken up for an earlier one; otherwise
+ * use the maintenance timeout (max).
+ */
+ timeout_time = VOLATILE_READ(apr_time_t, queues_next_expiry);
+ if (timeout_time
+ && (timeout_interval < 0
+ || timeout_time <= now
+ || timeout_interval > timeout_time - now)) {
+ timeout_interval = timeout_time > now ? timeout_time - now : 1;
+ }
+ if (!listener_is_wakeable
+ && timeout_interval > apr_time_from_msec(100)) {
+ timeout_interval = apr_time_from_msec(100);
+ }
+
rc = apr_pollset_poll(event_pollset, timeout_interval, &num, &out_pfd);
if (rc != APR_SUCCESS) {
if (APR_STATUS_IS_EINTR(rc)) {
- continue;
+ /* Woken up, either to update the timeouts or to shut down;
+ * both cases are handled above (top of the loop).
+ */
+ continue;
}
if (!APR_STATUS_IS_TIMEUP(rc)) {
ap_log_error(APLOG_MARK, APLOG_CRIT, rc, ap_server_conf,
@@ -1871,6 +1956,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_
"shutdown process gracefully");
signal_threads(ST_GRACEFUL);
}
+ num = 0;
}
if (listener_may_exit) {
@@ -2083,35 +2169,25 @@ static void * APR_THREAD_FUNC listener_thread(apr_
/* XXX possible optimization: stash the current time for use as
* r->request_time for new requests
*/
- now = apr_time_now();
- /* We only do this once per 0.1s (TIMEOUT_FUDGE_FACTOR), or on a clock
- * skew (if the system time is set back in the meantime, timeout_time
- * will exceed now + TIMEOUT_FUDGE_FACTOR, can't happen otherwise).
+ /* We process the timeout queues here only when their overall next
+ * expiry (read once above) has passed. This is accurate since
+ * adding to the queues (in workers) can only decrease this expiry,
+ * while later ones are only taken into account here (in the
+ * listener) during the queues' processing, with the lock held.
+ * This works both with and without wake-ability.
+ */
- if (now > timeout_time || now + TIMEOUT_FUDGE_FACTOR < timeout_time ) {
- struct process_score *ps;
+ if (timeout_time && timeout_time < (now = apr_time_now())) {
timeout_time = now + TIMEOUT_FUDGE_FACTOR;
/* handle timed out sockets */
apr_thread_mutex_lock(timeout_mutex);
+ /* Processing all the queues below will recompute this. */
+ queues_next_expiry = 0;
+
/* Step 1: keepalive timeouts */
- /* If all workers are busy, we kill older keep-alive connections so that they
- * may connect to another process.
- */
- if ((workers_were_busy || dying) && *keepalive_q->total) {
- if (!dying)
- ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf,
- "All workers are busy, will close %d keep-alive "
- "connections",
- *keepalive_q->total);
- process_timeout_queue(keepalive_q, 0,
- start_lingering_close_nonblocking);
- }
- else {
- process_timeout_queue(keepalive_q, timeout_time,
- start_lingering_close_nonblocking);
- }
+ process_timeout_queue(keepalive_q, timeout_time,
+ start_lingering_close_nonblocking);
/* Step 2: write completion timeouts */
process_timeout_queue(write_completion_q, timeout_time,
start_lingering_close_nonblocking);
@@ -2120,7 +2196,6 @@ static void * APR_THREAD_FUNC listener_thread(apr_
/* Step 4: (short) lingering close completion timeouts */
process_timeout_queue(short_linger_q, timeout_time, stop_lingering_close);
- ps = ap_get_scoreboard_process(process_slot);
ps->write_completion = *write_completion_q->total;
ps->keep_alive = *keepalive_q->total;
apr_thread_mutex_unlock(timeout_mutex);
@@ -2129,6 +2204,19 @@ static void * APR_THREAD_FUNC listener_thread(apr_
ps->suspended = apr_atomic_read32(&suspended_count);
ps->lingering_close = apr_atomic_read32(&lingering_count);
}
+ else if ((workers_were_busy || dying) && *keepalive_q->total) {
+ if (!dying) {
+ ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf,
+ "All workers are busy, will close %d keep-alive "
+ "connections", *keepalive_q->total);
+ }
+ apr_thread_mutex_lock(timeout_mutex);
+ process_timeout_queue(keepalive_q, 0,
+ start_lingering_close_nonblocking);
+ ps->keep_alive = 0;
+ apr_thread_mutex_unlock(timeout_mutex);
+ }
+
if (listeners_disabled && !workers_were_busy
&& ((c_count = apr_atomic_read32(&connection_count))
>= (l_count = apr_atomic_read32(&lingering_count))
@@ -2341,6 +2429,8 @@ static void *APR_THREAD_FUNC start_threads(apr_thr
int prev_threads_created;
int max_recycled_pools = -1;
int good_methods[] = {APR_POLLSET_KQUEUE, APR_POLLSET_PORT, APR_POLLSET_EPOLL};
+ /* XXX don't we need more to handle K-A or lingering close? */
+ const apr_uint32_t pollset_size = threads_per_child * 2;
/* We must create the fd queues before we start up the listener
* and worker threads. */
@@ -2380,24 +2470,24 @@ static void *APR_THREAD_FUNC start_threads(apr_thr
/* Create the main pollset */
for (i = 0; i < sizeof(good_methods) / sizeof(good_methods[0]); i++) {
- rv = apr_pollset_create_ex(&event_pollset,
- threads_per_child*2, /* XXX don't we need more, to handle
- * connections in K-A or lingering
- * close?
- */
- pchild, APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY | APR_POLLSET_NODEFAULT,
- good_methods[i]);
+ apr_uint32_t flags = APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY |
+ APR_POLLSET_NODEFAULT | APR_POLLSET_WAKEABLE;
+ rv = apr_pollset_create_ex(&event_pollset, pollset_size, pchild, flags,
+ good_methods[i]);
if (rv == APR_SUCCESS) {
+ listener_is_wakeable = 1;
break;
}
+ flags &= ~APR_POLLSET_WAKEABLE;
+ rv = apr_pollset_create_ex(&event_pollset, pollset_size, pchild, flags,
+ good_methods[i]);
+ if (rv == APR_SUCCESS) {
+ break;
+ }
}
if (rv != APR_SUCCESS) {
- rv = apr_pollset_create(&event_pollset,
- threads_per_child*2, /* XXX don't we need more, to handle
- * connections in K-A or lingering
- * close?
- */
- pchild, APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY);
+ rv = apr_pollset_create(&event_pollset, pollset_size, pchild,
+ APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY);
}
if (rv != APR_SUCCESS) {
ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(03103)
@@ -2406,7 +2496,9 @@ static void *APR_THREAD_FUNC start_threads(apr_thr
}
ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, APLOGNO(02471)
- "start_threads: Using %s", apr_pollset_method_name(event_pollset));
+ "start_threads: Using %s (%swakeable)",
+ apr_pollset_method_name(event_pollset),
+ listener_is_wakeable ? "" : "not ");
worker_sockets = apr_pcalloc(pchild, threads_per_child
* sizeof(apr_socket_t *));