Hi Stefan,

Thanks again for the tests and the very useful feedback!

On Fri, Sep 23, 2016 at 9:26 PM, Stefan Priebe - Profihost AG
<s.pri...@profihost.ag> wrote:
>
> It just seems a lot of threads are hanging / deadlocked.

Clearly...

>
> gdb says:
[...]
>
> Thread 27 (Thread 0x7f92255d4700 (LWP 13783)):
> #0  0x00007f9235b6608f in pthread_cond_wait@@GLIBC_2.3.2 () from
> /lib/x86_64-linux-gnu/libpthread.so.0
> #1  0x000000000055fa26 in ap_queue_pop_something ()
> #2  0x000000000055a633 in worker_thread ()
> #3  0x00007f9235b620a4 in start_thread () from
> /lib/x86_64-linux-gnu/libpthread.so.0
> #4  0x00007f92358975dd in clone () from /lib/x86_64-linux-gnu/libc.so.6
[...]
>
> Thread 2 (Thread 0x7f92147e8700 (LWP 13808)):
> #0  0x00007f9235897bb3 in epoll_wait () from /lib/x86_64-linux-gnu/libc.so.6
> #1  0x00007f9235d99fc3 in ?? () from /usr/lib/x86_64-linux-gnu/libapr-1.so.0
> #2  0x00000000005595bd in listener_thread ()
> #3  0x00007f9235b620a4 in start_thread () from
> /lib/x86_64-linux-gnu/libpthread.so.0
> #4  0x00007f92358975dd in clone () from /lib/x86_64-linux-gnu/libc.so.6
>
> Thread 1 (Thread 0x7f9237159780 (LWP 13703)):
> #0  0x00007f9235b634db in pthread_join () from
> /lib/x86_64-linux-gnu/libpthread.so.0
> #1  0x00007f9235d9e9bb in apr_thread_join () from
> /usr/lib/x86_64-linux-gnu/libapr-1.so.0
> #2  0x000000000055b283 in join_workers ()
> #3  0x000000000055b8cc in child_main ()
> #4  0x000000000055ba3f in make_child ()
> #5  0x000000000055c21e in perform_idle_server_maintenance ()
> #6  0x000000000055c619 in server_main_loop ()
> #7  0x000000000055c959 in event_run ()
> #8  0x0000000000445f91 in ap_run_mpm ()
> #9  0x000000000043dd19 in main ()

So the listener is not woken up while the child process is being
gracefully stopped, and no worker seems to be working.

It seems that 2.4.x is missing an important fix for my patch to work
correctly (namely http://svn.apache.org/r1732228, which was never
backported to 2.4).
Without it, the wake-up mechanism for the poll()ing listener is not
enabled on Linux...

This means that my patch is broken when no "good" polling method is
available on the system (I can probably fix that later), but this
shouldn't be the case for Linux's epoll.

Could you please give this new v4 patch (attached) a(nother) try?
This is simply v3 (from PR 57399) plus trunk's r1732228 mentioned above.

Regards,
Yann.
Index: server/mpm/event/event.c
===================================================================
--- server/mpm/event/event.c	(revision 1761356)
+++ server/mpm/event/event.c	(working copy)
@@ -100,6 +100,8 @@
 #include <limits.h>             /* for INT_MAX */
 
 
+#define VOLATILE_READ(T, x) (*(volatile T *)&(x))
+
 /* Limit on the total --- clients will be locked out if more servers than
  * this are needed.  It is intended solely to keep the server from crashing
  * when things get out of hand.
@@ -174,6 +176,7 @@ static int dying = 0;
 static int workers_may_exit = 0;
 static int start_thread_may_exit = 0;
 static int listener_may_exit = 0;
+static int listener_is_wakeable = 0;        /* Pollset supports APR_POLLSET_WAKEABLE */
 static int num_listensocks = 0;
 static apr_int32_t conns_this_child;        /* MaxConnectionsPerChild, only access
                                                in listener thread */
@@ -194,6 +197,17 @@ module AP_MODULE_DECLARE_DATA mpm_event_module;
 struct event_srv_cfg_s;
 typedef struct event_srv_cfg_s event_srv_cfg;
 
+/*
+ * The pollset for sockets that are in any of the timeout queues. Currently
+ * we use the timeout_mutex to make sure that connections are added/removed
+ * atomically to/from both event_pollset and a timeout queue. Otherwise
+ * some confusion can happen under high load if timeout queues and pollset
+ * get out of sync.
+ * XXX: It should be possible to make the lock unnecessary in many or even all
+ * XXX: cases.
+ */
+static apr_pollset_t *event_pollset;
+
 struct event_conn_state_t {
     /** APR_RING of expiration timeouts */
     APR_RING_ENTRY(event_conn_state_t) timeout_list;
@@ -239,24 +253,52 @@ static struct timeout_queue *write_completion_q,
                             *keepalive_q,
                             *linger_q,
                             *short_linger_q;
+static apr_time_t queues_next_expiry;
 
 static apr_pollfd_t *listener_pollfd;
 
+/* Prevent extra poll/wakeup calls for timeouts close in the future (queues
+ * have the granularity of a second anyway).
+ * XXX: Wouldn't 0.5s (instead of 0.1s) be "enough"?
+ */
+#define TIMEOUT_FUDGE_FACTOR APR_TIME_C(100000) /* 100 ms */
+
+/* Same goal as for TIMEOUT_FUDGE_FACTOR (avoid extra poll calls), but applied
+ * to timers. Since their timeouts are custom (user defined), we can't be too
+ * approximative here (hence using 0.01s).
+ */
+#define EVENT_FUDGE_FACTOR APR_TIME_C(10000) /* 10 ms */
+
 /*
  * Macros for accessing struct timeout_queue.
  * For TO_QUEUE_APPEND and TO_QUEUE_REMOVE, timeout_mutex must be held.
  */
-#define TO_QUEUE_APPEND(q, el)                                                \
-    do {                                                                      \
-        APR_RING_INSERT_TAIL(&(q)->head, el, event_conn_state_t,              \
-                             timeout_list);                                   \
-        ++*(q)->total;                                                        \
-        ++(q)->count;                                                         \
-    } while (0)
+static void TO_QUEUE_APPEND(struct timeout_queue *q, event_conn_state_t *el)
+{
+    apr_time_t q_expiry;
 
+    APR_RING_INSERT_TAIL(&q->head, el, event_conn_state_t, timeout_list);
+    ++*q->total;
+    ++q->count;
+
+    /* Cheaply update the overall queues' next expiry according to the
+     * first entry of this queue (oldest), if necessary.
+     */
+    el = APR_RING_FIRST(&q->head);
+    q_expiry = el->queue_timestamp + q->timeout;
+    if (!queues_next_expiry
+            || queues_next_expiry > q_expiry + TIMEOUT_FUDGE_FACTOR) {
+        VOLATILE_READ(apr_time_t, queues_next_expiry) = q_expiry;
+        /* Unblock the poll()ing listener for it to update its timeout. */
+        if (listener_is_wakeable) {
+            apr_pollset_wakeup(event_pollset);
+        }
+    }
+}
+
 #define TO_QUEUE_REMOVE(q, el)                                                \
     do {                                                                      \
-        APR_RING_REMOVE(el, timeout_list);                                    \
+        APR_RING_REMOVE((el), timeout_list);                                  \
         --*(q)->total;                                                        \
         --(q)->count;                                                         \
     } while (0)
@@ -272,19 +314,9 @@ static apr_pollfd_t *listener_pollfd;
         (q)->next = NULL;                                                     \
     } while (0)
 
-#define TO_QUEUE_ELEM_INIT(el) APR_RING_ELEM_INIT(el, timeout_list)
+#define TO_QUEUE_ELEM_INIT(el) \
+    APR_RING_ELEM_INIT(el, timeout_list)
 
-/*
- * The pollset for sockets that are in any of the timeout queues. Currently
- * we use the timeout_mutex to make sure that connections are added/removed
- * atomically to/from both event_pollset and a timeout queue. Otherwise
- * some confusion can happen under high load if timeout queues and pollset
- * get out of sync.
- * XXX: It should be possible to make the lock unnecessary in many or even all
- * XXX: cases.
- */
-static apr_pollset_t *event_pollset;
-
 /* The structure used to pass unique initialization info to each thread */
 typedef struct
 {
@@ -462,6 +494,11 @@ static void wakeup_listener(void)
         return;
     }
 
+    /* Unblock the listener if it's poll()ing */
+    if (listener_is_wakeable) {
+        apr_pollset_wakeup(event_pollset);
+    }
+
     /* unblock the listener if it's waiting for a worker */
     ap_queue_info_term(worker_queue_info);
 
@@ -656,7 +693,11 @@ static apr_status_t decrement_connection_count(voi
         default:
             break;
     }
-    apr_atomic_dec32(&connection_count);
+    /* Unblock the listener if it's waiting for connection_count = 0 */
+    if (!apr_atomic_dec32(&connection_count)
+             && listener_is_wakeable && listener_may_exit) {
+        apr_pollset_wakeup(event_pollset);
+    }
     return APR_SUCCESS;
 }
 
@@ -819,6 +860,7 @@ static void notify_resume(event_conn_state_t *cs,
 
 static int start_lingering_close_common(event_conn_state_t *cs, int in_worker)
 {
+    int done = 0;
     apr_status_t rv;
     struct timeout_queue *q;
     apr_socket_t *csd = cs->pfd.desc.s;
@@ -830,7 +872,6 @@ static int start_lingering_close_common(event_conn
 #else
     apr_socket_timeout_set(csd, 0);
 #endif
-    cs->queue_timestamp = apr_time_now();
     /*
      * If some module requested a shortened waiting period, only wait for
      * 2s (SECONDS_TO_LINGER). This is useful for mitigating certain
@@ -851,25 +892,25 @@ static int start_lingering_close_common(event_conn
     else {
         cs->c->sbh = NULL;
     }
-    apr_thread_mutex_lock(timeout_mutex);
-    TO_QUEUE_APPEND(q, cs);
     cs->pfd.reqevents = (
             cs->pub.sense == CONN_SENSE_WANT_WRITE ? APR_POLLOUT :
                     APR_POLLIN) | APR_POLLHUP | APR_POLLERR;
     cs->pub.sense = CONN_SENSE_DEFAULT;
+    cs->queue_timestamp = apr_time_now();
+    apr_thread_mutex_lock(timeout_mutex);
     rv = apr_pollset_add(event_pollset, &cs->pfd);
+    if (rv == APR_SUCCESS || APR_STATUS_IS_EEXIST(rv)) {
+        TO_QUEUE_APPEND(q, cs);
+        done = 1;
+    }
     apr_thread_mutex_unlock(timeout_mutex);
-    if (rv != APR_SUCCESS && !APR_STATUS_IS_EEXIST(rv)) {
+    if (!done) {
         ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(03092)
                      "start_lingering_close: apr_pollset_add failure");
-        apr_thread_mutex_lock(timeout_mutex);
-        TO_QUEUE_REMOVE(q, cs);
-        apr_thread_mutex_unlock(timeout_mutex);
         apr_socket_close(cs->pfd.desc.s);
         ap_push_pool(worker_queue_info, cs->p);
-        return 0;
     }
-    return 1;
+    return done;
 }
 
 /*
@@ -1129,15 +1170,15 @@ read_request:
              * Set a write timeout for this connection, and let the
              * event thread poll for writeability.
              */
-            cs->queue_timestamp = apr_time_now();
             notify_suspend(cs);
-            apr_thread_mutex_lock(timeout_mutex);
-            TO_QUEUE_APPEND(cs->sc->wc_q, cs);
             cs->pfd.reqevents = (
                     cs->pub.sense == CONN_SENSE_WANT_READ ? APR_POLLIN :
                             APR_POLLOUT) | APR_POLLHUP | APR_POLLERR;
             cs->pub.sense = CONN_SENSE_DEFAULT;
-            rc = apr_pollset_add(event_pollset, &cs->pfd);
+            cs->queue_timestamp = apr_time_now();
+            apr_thread_mutex_lock(timeout_mutex);
+            apr_pollset_add(event_pollset, &cs->pfd);
+            TO_QUEUE_APPEND(cs->sc->wc_q, cs);
             apr_thread_mutex_unlock(timeout_mutex);
             return;
         }
@@ -1166,14 +1207,13 @@ read_request:
          * timeout today.  With a normal client, the socket will be readable in
          * a few milliseconds anyway.
          */
-        cs->queue_timestamp = apr_time_now();
         notify_suspend(cs);
-        apr_thread_mutex_lock(timeout_mutex);
-        TO_QUEUE_APPEND(cs->sc->ka_q, cs);
-
         /* Add work to pollset. */
         cs->pfd.reqevents = APR_POLLIN;
+        cs->queue_timestamp = apr_time_now();
+        apr_thread_mutex_lock(timeout_mutex);
         rc = apr_pollset_add(event_pollset, &cs->pfd);
+        TO_QUEUE_APPEND(cs->sc->ka_q, cs);
         apr_thread_mutex_unlock(timeout_mutex);
 
         if (rc != APR_SUCCESS) {
@@ -1347,6 +1387,7 @@ static void get_worker(int *have_idle_worker_p, in
 static APR_RING_HEAD(timer_free_ring_t, timer_event_t) timer_free_ring;
 
 static apr_skiplist *timer_skiplist;
+static apr_time_t timers_next_expiry;
 
 /* The following compare function is used by apr_skiplist_insert() to keep the
  * elements (timers) sorted and provide O(log n) complexity (this is also true
@@ -1396,6 +1437,18 @@ static apr_status_t event_register_timed_callback(
     /* Okay, add sorted by when.. */
     apr_skiplist_insert(timer_skiplist, te);
 
+    /* Cheaply update the overall timers' next expiry according to
+     * this event, if necessary.
+     */
+    if (!timers_next_expiry
+            || timers_next_expiry > te->when + EVENT_FUDGE_FACTOR) {
+        VOLATILE_READ(apr_time_t, timers_next_expiry) = te->when;
+        /* Unblock the poll()ing listener for it to update its timeout. */
+        if (listener_is_wakeable) {
+            apr_pollset_wakeup(event_pollset);
+        }
+    }
+
     apr_thread_mutex_unlock(g_timer_skiplist_mtx);
 
     return APR_SUCCESS;
@@ -1464,20 +1517,31 @@ static void process_timeout_queue(struct timeout_q
         count = 0;
         cs = first = last = APR_RING_FIRST(&qp->head);
         while (cs != APR_RING_SENTINEL(&qp->head, event_conn_state_t,
-                                       timeout_list)
-               /* Trash the entry if:
-                * - no timeout_time was given (asked for all), or
-                * - it expired (according to the queue timeout), or
-                * - the system clock skewed in the past: no entry should be
-                *   registered above the given timeout_time (~now) + the queue
-                *   timeout, we won't keep any here (eg. for centuries).
-                * Stop otherwise, no following entry will match thanks to the
-                * single timeout per queue (entries are added to the end!).
-                * This allows maintenance in O(1).
-                */
-               && (!timeout_time
-                   || cs->queue_timestamp + qp->timeout < timeout_time
-                   || cs->queue_timestamp > timeout_time + qp->timeout)) {
+                                       timeout_list)) {
+            /* Trash the entry if:
+             * - no timeout_time was given (asked for all), or
+             * - it expired (according to the queue timeout), or
+             * - the system clock skewed in the past: no entry should be
+             *   registered above the given timeout_time (~now) + the queue
+             *   timeout, we won't keep any here (eg. for centuries).
+             *
+             * Otherwise stop, no following entry will match thanks to the
+             * single timeout per queue (entries are added to the end!).
+             * This allows maintenance in O(1).
+             */
+            if (timeout_time
+                    && cs->queue_timestamp + qp->timeout > timeout_time
+                    && cs->queue_timestamp < timeout_time + qp->timeout) {
+                /* Since this is the next expiring of this queue, update the
+                 * overall queues' next expiry if it's later than this one.
+                 */
+                apr_time_t q_expiry = cs->queue_timestamp + qp->timeout;
+                if (!queues_next_expiry || queues_next_expiry > q_expiry) {
+                    VOLATILE_READ(apr_time_t, queues_next_expiry) = q_expiry;
+                }
+                break;
+            }
+
             last = cs;
             rv = apr_pollset_remove(event_pollset, &cs->pfd);
             if (rv != APR_SUCCESS && !APR_STATUS_IS_NOTFOUND(rv)) {
@@ -1514,11 +1578,11 @@ static void process_timeout_queue(struct timeout_q
 
 static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
 {
-    timer_event_t *ep;
     timer_event_t *te;
     apr_status_t rc;
     proc_info *ti = dummy;
     int process_slot = ti->pid;
+    struct process_score *ps = ap_get_scoreboard_process(process_slot);
     apr_pool_t *tpool = apr_thread_pool_get(thd);
     void *csd = NULL;
     apr_pool_t *ptrans;         /* Pool for per-transaction stuff */
@@ -1527,21 +1591,13 @@ static void * APR_THREAD_FUNC listener_thread(apr_
     const apr_pollfd_t *out_pfd;
     apr_int32_t num = 0;
     apr_interval_time_t timeout_interval;
-    apr_time_t timeout_time = 0, now, last_log;
     listener_poll_type *pt;
     int closed = 0, listeners_disabled = 0;
+    apr_time_t last_log;
 
     last_log = apr_time_now();
     free(ti);
 
-    /* the following times out events that are really close in the future
-     *   to prevent extra poll calls
-     *
-     * current value is .1 second
-     */
-#define TIMEOUT_FUDGE_FACTOR 100000
-#define EVENT_FUDGE_FACTOR 10000
-
     rc = init_pollset(tpool);
     if (rc != APR_SUCCESS) {
         ap_log_error(APLOG_MARK, APLOG_ERR, rc, ap_server_conf,
@@ -1559,6 +1615,9 @@ static void * APR_THREAD_FUNC listener_thread(apr_
 
     for (;;) {
         int workers_were_busy = 0;
+        apr_time_t now, timeout_time;
+        int keepalives;
+
         if (listener_may_exit) {
             close_listeners(process_slot, &closed);
             if (terminate_mode == ST_UNGRACEFUL
@@ -1569,7 +1628,12 @@ static void * APR_THREAD_FUNC listener_thread(apr_
         if (conns_this_child <= 0)
             check_infinite_requests();
 
+        /* Update poll() timeout below according to the next expiring
+         * timer or queue entry, if any.
+         */
+        timeout_interval = -1;
         now = apr_time_now();
+
         if (APLOGtrace6(ap_server_conf)) {
             /* trace log status every second */
             if (now - last_log > apr_time_from_msec(1000)) {
@@ -1588,24 +1652,61 @@ static void * APR_THREAD_FUNC listener_thread(apr_
             }
         }
 
-        apr_thread_mutex_lock(g_timer_skiplist_mtx);
-        te = apr_skiplist_peek(timer_skiplist);
-        if (te) {
-            if (te->when > now) {
-                timeout_interval = te->when - now;
+        /* Avoid locking if there's no expiring timer in the list,
+         * poll() will be woken up anyway if a new timer comes in.
+         */
+        timeout_time = VOLATILE_READ(apr_time_t, timers_next_expiry);
+        if (timeout_time && timeout_time < now + EVENT_FUDGE_FACTOR) {
+            /* Push expired timers to a worker, the first one remaining
+             * determines the maximum time to poll() below.
+             */
+            apr_thread_mutex_lock(g_timer_skiplist_mtx);
+            while ((te = apr_skiplist_peek(timer_skiplist))) {
+                if (te->when < now + EVENT_FUDGE_FACTOR) {
+                    apr_skiplist_pop(timer_skiplist, NULL);
+                    push_timer2worker(te);
+                 }
+                 else {
+                    timeout_interval = te->when - now;
+                    timers_next_expiry = te->when;
+                    break;
+                }
             }
-            else {
-                timeout_interval = 1;
+            /* If there are no timers in the list, either the listener is
+             * wakeable and it can poll() indefinitely until a wake up occurs,
+             * or periodic checks must be performed.
+             */
+            if (!te) {
+                if (!listener_is_wakeable) {
+                    timeout_interval = apr_time_from_msec(100);
+                }
+                timers_next_expiry = 0;
             }
+            apr_thread_mutex_unlock(g_timer_skiplist_mtx);
         }
-        else {
+
+        /* Same for queues, if the listener is wakeable use the current expiry
+         * time and expect to be woken up for an earlier one, otherwise use the
+         * maintenance timeout (max).
+         */
+        timeout_time = VOLATILE_READ(apr_time_t, queues_next_expiry);
+        if (timeout_time
+                && (timeout_interval < 0
+                    || timeout_time <= now
+                    || timeout_interval > timeout_time - now)) {
+            timeout_interval = timeout_time > now ? timeout_time - now : 1;
+        }
+        if (!listener_is_wakeable
+                && timeout_interval > apr_time_from_msec(100)) {
             timeout_interval = apr_time_from_msec(100);
         }
-        apr_thread_mutex_unlock(g_timer_skiplist_mtx);
 
         rc = apr_pollset_poll(event_pollset, timeout_interval, &num, &out_pfd);
         if (rc != APR_SUCCESS) {
             if (APR_STATUS_IS_EINTR(rc)) {
+                /* Woken up, either update timeouts or shutdown,
+                 * both logics are above.
+                 */
                 continue;
             }
             if (!APR_STATUS_IS_TIMEUP(rc)) {
@@ -1614,6 +1715,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_
                              "shutdown process gracefully");
                 signal_threads(ST_GRACEFUL);
             }
+            num = 0;
         }
 
         if (listener_may_exit) {
@@ -1623,21 +1725,6 @@ static void * APR_THREAD_FUNC listener_thread(apr_
                 break;
         }
 
-        now = apr_time_now();
-        apr_thread_mutex_lock(g_timer_skiplist_mtx);
-        ep = apr_skiplist_peek(timer_skiplist);
-        while (ep) {
-            if (ep->when < now + EVENT_FUDGE_FACTOR) {
-                apr_skiplist_pop(timer_skiplist, NULL);
-                push_timer2worker(ep);
-            }
-            else {
-                break;
-            }
-            ep = apr_skiplist_peek(timer_skiplist);
-        }
-        apr_thread_mutex_unlock(g_timer_skiplist_mtx);
-
         while (num) {
             pt = (listener_poll_type *) out_pfd->client_data;
             if (pt->type == PT_CSD) {
@@ -1803,42 +1890,35 @@ static void * APR_THREAD_FUNC listener_thread(apr_
          * r->request_time for new requests
          */
         now = apr_time_now();
-        /* We only do this once per 0.1s (TIMEOUT_FUDGE_FACTOR), or on a clock
-         * skew (if the system time is set back in the meantime, timeout_time
-         * will exceed now + TIMEOUT_FUDGE_FACTOR, can't happen otherwise).
+        /* We process the timeout queues here only when their overall next
+         * expiry (read once above) is over. This happens accurately since
+         * adding to the queues (in workers) can only decrease this expiry,
+         * while latest ones are only taken into account here (in listener)
+         * during queues' processing, with the lock held. This works both
+         * with and without wake-ability.
          */
-        if (now > timeout_time || now + TIMEOUT_FUDGE_FACTOR < timeout_time ) {
-            struct process_score *ps;
+        if (timeout_time && timeout_time < (now = apr_time_now())) {
             timeout_time = now + TIMEOUT_FUDGE_FACTOR;
 
             /* handle timed out sockets */
             apr_thread_mutex_lock(timeout_mutex);
 
+            /* Processing all the queues below will recompute this. */
+            queues_next_expiry = 0;
+
             /* Step 1: keepalive timeouts */
-            /* If all workers are busy, we kill older keep-alive connections so that they
-             * may connect to another process.
-             */
-            if (workers_were_busy && *keepalive_q->total) {
-                ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf,
-                             "All workers are busy, will close %d keep-alive "
-                             "connections",
-                             *keepalive_q->total);
-                process_timeout_queue(keepalive_q, 0,
-                                      start_lingering_close_nonblocking);
-            }
-            else {
-                process_timeout_queue(keepalive_q, timeout_time,
-                                      start_lingering_close_nonblocking);
-            }
+            process_timeout_queue(keepalive_q, timeout_time,
+                                  start_lingering_close_nonblocking);
             /* Step 2: write completion timeouts */
             process_timeout_queue(write_completion_q, timeout_time,
                                   start_lingering_close_nonblocking);
             /* Step 3: (normal) lingering close completion timeouts */
-            process_timeout_queue(linger_q, timeout_time, stop_lingering_close);
+            process_timeout_queue(linger_q, timeout_time,
+                                  stop_lingering_close);
             /* Step 4: (short) lingering close completion timeouts */
-            process_timeout_queue(short_linger_q, timeout_time, stop_lingering_close);
+            process_timeout_queue(short_linger_q, timeout_time,
+                                  stop_lingering_close);
 
-            ps = ap_get_scoreboard_process(process_slot);
             ps->write_completion = *write_completion_q->total;
             ps->keep_alive = *keepalive_q->total;
             apr_thread_mutex_unlock(timeout_mutex);
@@ -1847,6 +1927,22 @@ static void * APR_THREAD_FUNC listener_thread(apr_
             ps->suspended = apr_atomic_read32(&suspended_count);
             ps->lingering_close = apr_atomic_read32(&lingering_count);
         }
+        else if (workers_were_busy
+                 && (keepalives = VOLATILE_READ(int, *keepalive_q->total))) {
+            /* If all workers are busy, we kill older keep-alive connections so
+             * that they may connect to another process.
+             */
+            ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf,
+                         "All workers are busy, will close %d keep-alive "
+                         "connections",
+                         keepalives);
+            apr_thread_mutex_lock(timeout_mutex);
+            process_timeout_queue(keepalive_q, 0,
+                                  start_lingering_close_nonblocking);
+            ps->keep_alive = 0;
+            apr_thread_mutex_unlock(timeout_mutex);
+        }
+
         if (listeners_disabled && !workers_were_busy
             && (int)apr_atomic_read32(&connection_count)
                - (int)apr_atomic_read32(&lingering_count)
@@ -2027,6 +2123,8 @@ static void *APR_THREAD_FUNC start_threads(apr_thr
     int prev_threads_created;
     int max_recycled_pools = -1;
     int good_methods[] = {APR_POLLSET_KQUEUE, APR_POLLSET_PORT, APR_POLLSET_EPOLL};
+    /* XXX don't we need more to handle K-A or lingering close? */
+    const apr_uint32_t pollset_size = threads_per_child * 2;
 
     /* We must create the fd queues before we start up the listener
      * and worker threads. */
@@ -2065,25 +2163,25 @@ static void *APR_THREAD_FUNC start_threads(apr_thr
     }
 
     /* Create the main pollset */
-    for (i = 0; i < sizeof(good_methods) / sizeof(void*); i++) {
-        rv = apr_pollset_create_ex(&event_pollset,
-                            threads_per_child*2, /* XXX don't we need more, to handle
-                                                * connections in K-A or lingering
-                                                * close?
-                                                */
-                            pchild, APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY | APR_POLLSET_NODEFAULT,
-                            good_methods[i]);
+    for (i = 0; i < sizeof(good_methods) / sizeof(good_methods[0]); i++) {
+        apr_uint32_t flags = APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY |
+                             APR_POLLSET_NODEFAULT | APR_POLLSET_WAKEABLE;
+        rv = apr_pollset_create_ex(&event_pollset, pollset_size, pchild, flags,
+                                   good_methods[i]);
         if (rv == APR_SUCCESS) {
+            listener_is_wakeable = 1;
             break;
         }
+        flags &= ~APR_POLLSET_WAKEABLE;
+        rv = apr_pollset_create_ex(&event_pollset, pollset_size, pchild, flags,
+                                   good_methods[i]);
+        if (rv == APR_SUCCESS) {
+            break;
+        }
     }
     if (rv != APR_SUCCESS) {
-        rv = apr_pollset_create(&event_pollset,
-                               threads_per_child*2, /* XXX don't we need more, to handle
-                                                     * connections in K-A or lingering
-                                                     * close?
-                                                     */
-                               pchild, APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY);
+        rv = apr_pollset_create(&event_pollset, pollset_size, pchild,
+                                APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY);
     }
     if (rv != APR_SUCCESS) {
         ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(03103)
@@ -2092,7 +2190,9 @@ static void *APR_THREAD_FUNC start_threads(apr_thr
     }
 
     ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, APLOGNO(02471)
-                 "start_threads: Using %s", apr_pollset_method_name(event_pollset));
+                 "start_threads: Using %s (%swakeable)",
+                 apr_pollset_method_name(event_pollset),
+                 listener_is_wakeable ? "" : "not ");
     worker_sockets = apr_pcalloc(pchild, threads_per_child
                                  * sizeof(apr_socket_t *));
 

Reply via email to