On Wed, Jan 18, 2017 at 10:49 PM, Stefan Priebe - Profihost AG <s.pri...@profihost.ag> wrote: > > v5 does not apply to 2.4.25. If you can send me a v5 version that > applies to 2.4.25, I'll test it.
Here it is (slightly modified to fix possible deadlocks, per r1762742 and r1774538). Thanks!
Index: server/mpm/event/event.c =================================================================== --- server/mpm/event/event.c (revision 1778313) +++ server/mpm/event/event.c (working copy) @@ -100,6 +100,8 @@ #include <limits.h> /* for INT_MAX */ +#define VOLATILE_READ(T, x) (*(volatile T *)&(x)) + /* Limit on the total --- clients will be locked out if more servers than * this are needed. It is intended solely to keep the server from crashing * when things get out of hand. @@ -177,6 +179,7 @@ static int dying = 0; static int workers_may_exit = 0; static int start_thread_may_exit = 0; static int listener_may_exit = 0; +static int listener_is_wakeable = 0; /* Pollset supports APR_POLLSET_WAKEABLE */ static int num_listensocks = 0; static apr_int32_t conns_this_child; /* MaxConnectionsPerChild, only access in listener thread */ @@ -199,6 +202,17 @@ module AP_MODULE_DECLARE_DATA mpm_event_module; struct event_srv_cfg_s; typedef struct event_srv_cfg_s event_srv_cfg; +/* + * The pollset for sockets that are in any of the timeout queues. Currently + * we use the timeout_mutex to make sure that connections are added/removed + * atomically to/from both event_pollset and a timeout queue. Otherwise + * some confusion can happen under high load if timeout queues and pollset + * get out of sync. + * XXX: It should be possible to make the lock unnecessary in many or even all + * XXX: cases. + */ +static apr_pollset_t *event_pollset; + struct event_conn_state_t { /** APR_RING of expiration timeouts */ APR_RING_ENTRY(event_conn_state_t) timeout_list; @@ -244,24 +258,52 @@ static struct timeout_queue *write_completion_q, *keepalive_q, *linger_q, *short_linger_q; +static apr_time_t queues_next_expiry; static apr_pollfd_t *listener_pollfd; +/* Prevent extra poll/wakeup calls for timeouts close in the future (queues + * have the granularity of a second anyway). + * XXX: Wouldn't 0.5s (instead of 0.1s) be "enough"? 
+ */ +#define TIMEOUT_FUDGE_FACTOR APR_TIME_C(100000) /* 100 ms */ + +/* Same goal as for TIMEOUT_FUDGE_FACTOR (avoid extra poll calls), but applied + * to timers. Since their timeouts are custom (user defined), we can't be too + * approximative here (hence using 0.01s). + */ +#define EVENT_FUDGE_FACTOR APR_TIME_C(10000) /* 10 ms */ + /* * Macros for accessing struct timeout_queue. * For TO_QUEUE_APPEND and TO_QUEUE_REMOVE, timeout_mutex must be held. */ -#define TO_QUEUE_APPEND(q, el) \ - do { \ - APR_RING_INSERT_TAIL(&(q)->head, el, event_conn_state_t, \ - timeout_list); \ - ++*(q)->total; \ - ++(q)->count; \ - } while (0) +static void TO_QUEUE_APPEND(struct timeout_queue *q, event_conn_state_t *el) +{ + apr_time_t q_expiry; + APR_RING_INSERT_TAIL(&q->head, el, event_conn_state_t, timeout_list); + ++*q->total; + ++q->count; + + /* Cheaply update the overall queues' next expiry according to the + * first entry of this queue (oldest), if necessary. + */ + el = APR_RING_FIRST(&q->head); + q_expiry = el->queue_timestamp + q->timeout; + if (!queues_next_expiry + || queues_next_expiry > q_expiry + TIMEOUT_FUDGE_FACTOR) { + VOLATILE_READ(apr_time_t, queues_next_expiry) = q_expiry; + /* Unblock the poll()ing listener for it to update its timeout. */ + if (listener_is_wakeable) { + apr_pollset_wakeup(event_pollset); + } + } +} + #define TO_QUEUE_REMOVE(q, el) \ do { \ - APR_RING_REMOVE(el, timeout_list); \ + APR_RING_REMOVE((el), timeout_list); \ --*(q)->total; \ --(q)->count; \ } while (0) @@ -277,19 +319,9 @@ static apr_pollfd_t *listener_pollfd; (q)->next = NULL; \ } while (0) -#define TO_QUEUE_ELEM_INIT(el) APR_RING_ELEM_INIT(el, timeout_list) +#define TO_QUEUE_ELEM_INIT(el) \ + APR_RING_ELEM_INIT(el, timeout_list) -/* - * The pollset for sockets that are in any of the timeout queues. Currently - * we use the timeout_mutex to make sure that connections are added/removed - * atomically to/from both event_pollset and a timeout queue. 
Otherwise - * some confusion can happen under high load if timeout queues and pollset - * get out of sync. - * XXX: It should be possible to make the lock unnecessary in many or even all - * XXX: cases. - */ -static apr_pollset_t *event_pollset; - /* The structure used to pass unique initialization info to each thread */ typedef struct { @@ -474,6 +506,11 @@ static void wakeup_listener(void) return; } + /* Unblock the listener if it's poll()ing */ + if (listener_is_wakeable) { + apr_pollset_wakeup(event_pollset); + } + /* unblock the listener if it's waiting for a worker */ ap_queue_info_term(worker_queue_info); @@ -647,7 +684,11 @@ static apr_status_t decrement_connection_count(voi default: break; } - apr_atomic_dec32(&connection_count); + /* Unblock the listener if it's waiting for connection_count = 0 */ + if (!apr_atomic_dec32(&connection_count) + && listener_is_wakeable && listener_may_exit) { + apr_pollset_wakeup(event_pollset); + } return APR_SUCCESS; } @@ -810,6 +851,7 @@ static void notify_resume(event_conn_state_t *cs, static int start_lingering_close_common(event_conn_state_t *cs, int in_worker) { + int done = 0; apr_status_t rv; struct timeout_queue *q; apr_socket_t *csd = cs->pfd.desc.s; @@ -821,7 +863,6 @@ static int start_lingering_close_common(event_conn #else apr_socket_timeout_set(csd, 0); #endif - cs->queue_timestamp = apr_time_now(); /* * If some module requested a shortened waiting period, only wait for * 2s (SECONDS_TO_LINGER). This is useful for mitigating certain @@ -842,25 +883,25 @@ static int start_lingering_close_common(event_conn else { cs->c->sbh = NULL; } - apr_thread_mutex_lock(timeout_mutex); - TO_QUEUE_APPEND(q, cs); cs->pfd.reqevents = ( cs->pub.sense == CONN_SENSE_WANT_WRITE ? 
APR_POLLOUT : APR_POLLIN) | APR_POLLHUP | APR_POLLERR; cs->pub.sense = CONN_SENSE_DEFAULT; + cs->queue_timestamp = apr_time_now(); + apr_thread_mutex_lock(timeout_mutex); rv = apr_pollset_add(event_pollset, &cs->pfd); + if (rv == APR_SUCCESS || APR_STATUS_IS_EEXIST(rv)) { + TO_QUEUE_APPEND(q, cs); + done = 1; + } apr_thread_mutex_unlock(timeout_mutex); - if (rv != APR_SUCCESS && !APR_STATUS_IS_EEXIST(rv)) { + if (!done) { ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(03092) "start_lingering_close: apr_pollset_add failure"); - apr_thread_mutex_lock(timeout_mutex); - TO_QUEUE_REMOVE(q, cs); - apr_thread_mutex_unlock(timeout_mutex); apr_socket_close(cs->pfd.desc.s); ap_push_pool(worker_queue_info, cs->p); - return 0; } - return 1; + return done; } /* @@ -1124,15 +1165,15 @@ read_request: * Set a write timeout for this connection, and let the * event thread poll for writeability. */ - cs->queue_timestamp = apr_time_now(); notify_suspend(cs); - apr_thread_mutex_lock(timeout_mutex); - TO_QUEUE_APPEND(cs->sc->wc_q, cs); cs->pfd.reqevents = ( cs->pub.sense == CONN_SENSE_WANT_READ ? APR_POLLIN : APR_POLLOUT) | APR_POLLHUP | APR_POLLERR; cs->pub.sense = CONN_SENSE_DEFAULT; - rc = apr_pollset_add(event_pollset, &cs->pfd); + cs->queue_timestamp = apr_time_now(); + apr_thread_mutex_lock(timeout_mutex); + apr_pollset_add(event_pollset, &cs->pfd); + TO_QUEUE_APPEND(cs->sc->wc_q, cs); apr_thread_mutex_unlock(timeout_mutex); return; } @@ -1161,14 +1202,13 @@ read_request: * timeout today. With a normal client, the socket will be readable in * a few milliseconds anyway. */ - cs->queue_timestamp = apr_time_now(); notify_suspend(cs); - apr_thread_mutex_lock(timeout_mutex); - TO_QUEUE_APPEND(cs->sc->ka_q, cs); - /* Add work to pollset. 
*/ cs->pfd.reqevents = APR_POLLIN; + cs->queue_timestamp = apr_time_now(); + apr_thread_mutex_lock(timeout_mutex); rc = apr_pollset_add(event_pollset, &cs->pfd); + TO_QUEUE_APPEND(cs->sc->ka_q, cs); apr_thread_mutex_unlock(timeout_mutex); if (rc != APR_SUCCESS) { @@ -1345,6 +1385,7 @@ static void get_worker(int *have_idle_worker_p, in static APR_RING_HEAD(timer_free_ring_t, timer_event_t) timer_free_ring; static apr_skiplist *timer_skiplist; +static apr_time_t timers_next_expiry; /* The following compare function is used by apr_skiplist_insert() to keep the * elements (timers) sorted and provide O(log n) complexity (this is also true @@ -1394,6 +1435,18 @@ static apr_status_t event_register_timed_callback( /* Okay, add sorted by when.. */ apr_skiplist_insert(timer_skiplist, te); + /* Cheaply update the overall timers' next expiry according to + * this event, if necessary. + */ + if (!timers_next_expiry + || timers_next_expiry > te->when + EVENT_FUDGE_FACTOR) { + VOLATILE_READ(apr_time_t, timers_next_expiry) = te->when; + /* Unblock the poll()ing listener for it to update its timeout. */ + if (listener_is_wakeable) { + apr_pollset_wakeup(event_pollset); + } + } + apr_thread_mutex_unlock(g_timer_skiplist_mtx); return APR_SUCCESS; @@ -1464,20 +1517,31 @@ static void process_timeout_queue(struct timeout_q count = 0; cs = first = last = APR_RING_FIRST(&qp->head); while (cs != APR_RING_SENTINEL(&qp->head, event_conn_state_t, - timeout_list) - /* Trash the entry if: - * - no timeout_time was given (asked for all), or - * - it expired (according to the queue timeout), or - * - the system clock skewed in the past: no entry should be - * registered above the given timeout_time (~now) + the queue - * timeout, we won't keep any here (eg. for centuries). - * Stop otherwise, no following entry will match thanks to the - * single timeout per queue (entries are added to the end!). - * This allows maintenance in O(1). 
- */ - && (!timeout_time - || cs->queue_timestamp + qp->timeout < timeout_time - || cs->queue_timestamp > timeout_time + qp->timeout)) { + timeout_list)) { + /* Trash the entry if: + * - no timeout_time was given (asked for all), or + * - it expired (according to the queue timeout), or + * - the system clock skewed in the past: no entry should be + * registered above the given timeout_time (~now) + the queue + * timeout, we won't keep any here (eg. for centuries). + * + * Otherwise stop, no following entry will match thanks to the + * single timeout per queue (entries are added to the end!). + * This allows maintenance in O(1). + */ + if (timeout_time + && cs->queue_timestamp + qp->timeout > timeout_time + && cs->queue_timestamp < timeout_time + qp->timeout) { + /* Since this is the next expiring of this queue, update the + * overall queues' next expiry if it's later than this one. + */ + apr_time_t q_expiry = cs->queue_timestamp + qp->timeout; + if (!queues_next_expiry || queues_next_expiry > q_expiry) { + VOLATILE_READ(apr_time_t, queues_next_expiry) = q_expiry; + } + break; + } + last = cs; rv = apr_pollset_remove(event_pollset, &cs->pfd); if (rv != APR_SUCCESS && !APR_STATUS_IS_NOTFOUND(rv)) { @@ -1514,11 +1578,11 @@ static void process_timeout_queue(struct timeout_q static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) { - timer_event_t *ep; timer_event_t *te; apr_status_t rc; proc_info *ti = dummy; int process_slot = ti->pslot; + struct process_score *ps = ap_get_scoreboard_process(process_slot); apr_pool_t *tpool = apr_thread_pool_get(thd); void *csd = NULL; apr_pool_t *ptrans; /* Pool for per-transaction stuff */ @@ -1527,21 +1591,13 @@ static void * APR_THREAD_FUNC listener_thread(apr_ const apr_pollfd_t *out_pfd; apr_int32_t num = 0; apr_interval_time_t timeout_interval; - apr_time_t timeout_time = 0, now, last_log; listener_poll_type *pt; int closed = 0, listeners_disabled = 0; + apr_time_t last_log; last_log = apr_time_now(); 
free(ti); - /* the following times out events that are really close in the future - * to prevent extra poll calls - * - * current value is .1 second - */ -#define TIMEOUT_FUDGE_FACTOR 100000 -#define EVENT_FUDGE_FACTOR 10000 - rc = init_pollset(tpool); if (rc != APR_SUCCESS) { ap_log_error(APLOG_MARK, APLOG_ERR, rc, ap_server_conf, @@ -1559,6 +1615,9 @@ static void * APR_THREAD_FUNC listener_thread(apr_ for (;;) { int workers_were_busy = 0; + apr_time_t now, timeout_time; + int keepalives; + if (listener_may_exit) { close_listeners(process_slot, &closed); if (terminate_mode == ST_UNGRACEFUL @@ -1569,7 +1628,12 @@ static void * APR_THREAD_FUNC listener_thread(apr_ if (conns_this_child <= 0) check_infinite_requests(); + /* Update poll() timeout below according to the next expiring + * timer or queue entry, if any. + */ + timeout_interval = -1; now = apr_time_now(); + if (APLOGtrace6(ap_server_conf)) { /* trace log status every second */ if (now - last_log > apr_time_from_msec(1000)) { @@ -1594,32 +1658,75 @@ static void * APR_THREAD_FUNC listener_thread(apr_ } } - apr_thread_mutex_lock(g_timer_skiplist_mtx); - te = apr_skiplist_peek(timer_skiplist); - if (te) { - if (te->when > now) { - timeout_interval = te->when - now; + /* Avoid locking if there's no expiring timer in the list, + * poll() will be woken up anyway if a new timer comes in. + */ + timeout_time = VOLATILE_READ(apr_time_t, timers_next_expiry); + if (timeout_time && timeout_time < now + EVENT_FUDGE_FACTOR) { + /* Push expired timers to a worker, the first one remaining + * determines the maximum time to poll() below. 
+ */ + apr_thread_mutex_lock(g_timer_skiplist_mtx); + while ((te = apr_skiplist_peek(timer_skiplist))) { + if (te->when < now + EVENT_FUDGE_FACTOR) { + apr_skiplist_pop(timer_skiplist, NULL); + push_timer2worker(te); + } + else { + timeout_interval = te->when - now; + timers_next_expiry = te->when; + break; + } } - else { - timeout_interval = 1; + /* If there are no timers in the list, either the listener is + * wakeable and it can poll() indefinitely until a wake up occurs, + * or periodic checks must be performed. + */ + if (!te) { + if (!listener_is_wakeable) { + timeout_interval = apr_time_from_msec(100); + } + timers_next_expiry = 0; } + apr_thread_mutex_unlock(g_timer_skiplist_mtx); } - else { + + /* Same for queues, if the listener is wakeable use the current expiry + * time and expect to be woken up for an earlier one, otherwise use the + * maintenance timeout (max). + */ + timeout_time = VOLATILE_READ(apr_time_t, queues_next_expiry); + if (timeout_time + && (timeout_interval < 0 + || timeout_time <= now + || timeout_interval > timeout_time - now)) { + timeout_interval = timeout_time > now ? timeout_time - now : 1; + } + if (!listener_is_wakeable + && (timeout_interval < 0 + || timeout_interval > apr_time_from_msec(100))) { timeout_interval = apr_time_from_msec(100); } - apr_thread_mutex_unlock(g_timer_skiplist_mtx); rc = apr_pollset_poll(event_pollset, timeout_interval, &num, &out_pfd); if (rc != APR_SUCCESS) { if (APR_STATUS_IS_EINTR(rc)) { - continue; + /* Woken up, if we are exiting we must fall through to kill + * kept-alive connections, otherwise we only need to update + * timeouts (logic is above, so restart the loop). + */ + if (!listener_may_exit) { + continue; + } + timeout_time = 0; } - if (!APR_STATUS_IS_TIMEUP(rc)) { + else if (!APR_STATUS_IS_TIMEUP(rc)) { ap_log_error(APLOG_MARK, APLOG_CRIT, rc, ap_server_conf, "apr_pollset_poll failed. 
Attempting to " "shutdown process gracefully"); signal_threads(ST_GRACEFUL); } + num = 0; } if (listener_may_exit) { @@ -1629,21 +1736,6 @@ static void * APR_THREAD_FUNC listener_thread(apr_ break; } - now = apr_time_now(); - apr_thread_mutex_lock(g_timer_skiplist_mtx); - ep = apr_skiplist_peek(timer_skiplist); - while (ep) { - if (ep->when < now + EVENT_FUDGE_FACTOR) { - apr_skiplist_pop(timer_skiplist, NULL); - push_timer2worker(ep); - } - else { - break; - } - ep = apr_skiplist_peek(timer_skiplist); - } - apr_thread_mutex_unlock(g_timer_skiplist_mtx); - while (num) { pt = (listener_poll_type *) out_pfd->client_data; if (pt->type == PT_CSD) { @@ -1808,28 +1900,27 @@ static void * APR_THREAD_FUNC listener_thread(apr_ /* XXX possible optimization: stash the current time for use as * r->request_time for new requests */ - now = apr_time_now(); - /* We only do this once per 0.1s (TIMEOUT_FUDGE_FACTOR), or on a clock - * skew (if the system time is set back in the meantime, timeout_time - * will exceed now + TIMEOUT_FUDGE_FACTOR, can't happen otherwise). + /* We process the timeout queues here only when their overall next + * expiry (read once above) is over. This happens accurately since + * adding to the queues (in workers) can only decrease this expiry, + * while latest ones are only taken into account here (in listener) + * during queues' processing, with the lock held. This works both + * with and without wake-ability. */ - if (now > timeout_time || now + TIMEOUT_FUDGE_FACTOR < timeout_time ) { - struct process_score *ps; + if (timeout_time && timeout_time < (now = apr_time_now())) { timeout_time = now + TIMEOUT_FUDGE_FACTOR; /* handle timed out sockets */ apr_thread_mutex_lock(timeout_mutex); + /* Processing all the queues below will recompute this. */ + queues_next_expiry = 0; + /* Step 1: keepalive timeouts */ /* If all workers are busy, we kill older keep-alive connections so that they * may connect to another process. 
*/ - if ((workers_were_busy || dying) && *keepalive_q->total) { - if (!dying) - ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, - "All workers are busy, will close %d keep-alive " - "connections", - *keepalive_q->total); + if (workers_were_busy || dying) { process_timeout_queue(keepalive_q, 0, start_lingering_close_nonblocking); } @@ -1854,6 +1945,22 @@ static void * APR_THREAD_FUNC listener_thread(apr_ ps->suspended = apr_atomic_read32(&suspended_count); ps->lingering_close = apr_atomic_read32(&lingering_count); } + else if ((workers_were_busy || dying) + && (keepalives = VOLATILE_READ(int, *keepalive_q->total))) { + /* If all workers are busy, we kill older keep-alive connections so + * that they may connect to another process. + */ + ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, + "All workers are busy or dying, will close %d " + "keep-alive connections", + keepalives); + apr_thread_mutex_lock(timeout_mutex); + process_timeout_queue(keepalive_q, 0, + start_lingering_close_nonblocking); + ps->keep_alive = 0; + apr_thread_mutex_unlock(timeout_mutex); + } + if (listeners_disabled && !workers_were_busy && (int)apr_atomic_read32(&connection_count) - (int)apr_atomic_read32(&lingering_count) @@ -2064,6 +2171,8 @@ static void *APR_THREAD_FUNC start_threads(apr_thr int prev_threads_created; int max_recycled_pools = -1; int good_methods[] = {APR_POLLSET_KQUEUE, APR_POLLSET_PORT, APR_POLLSET_EPOLL}; + /* XXX don't we need more to handle K-A or lingering close? */ + const apr_uint32_t pollset_size = threads_per_child * 2; /* We must create the fd queues before we start up the listener * and worker threads. */ @@ -2103,24 +2212,24 @@ static void *APR_THREAD_FUNC start_threads(apr_thr /* Create the main pollset */ for (i = 0; i < sizeof(good_methods) / sizeof(good_methods[0]); i++) { - rv = apr_pollset_create_ex(&event_pollset, - threads_per_child*2, /* XXX don't we need more, to handle - * connections in K-A or lingering - * close? 
- */ - pchild, APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY | APR_POLLSET_NODEFAULT, - good_methods[i]); + apr_uint32_t flags = APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY | + APR_POLLSET_NODEFAULT | APR_POLLSET_WAKEABLE; + rv = apr_pollset_create_ex(&event_pollset, pollset_size, pchild, flags, + good_methods[i]); if (rv == APR_SUCCESS) { + listener_is_wakeable = 1; break; } + flags &= ~APR_POLLSET_WAKEABLE; + rv = apr_pollset_create_ex(&event_pollset, pollset_size, pchild, flags, + good_methods[i]); + if (rv == APR_SUCCESS) { + break; + } } if (rv != APR_SUCCESS) { - rv = apr_pollset_create(&event_pollset, - threads_per_child*2, /* XXX don't we need more, to handle - * connections in K-A or lingering - * close? - */ - pchild, APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY); + rv = apr_pollset_create(&event_pollset, pollset_size, pchild, + APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY); } if (rv != APR_SUCCESS) { ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(03103) @@ -2129,7 +2238,9 @@ static void *APR_THREAD_FUNC start_threads(apr_thr } ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, APLOGNO(02471) - "start_threads: Using %s", apr_pollset_method_name(event_pollset)); + "start_threads: Using %s (%swakeable)", + apr_pollset_method_name(event_pollset), + listener_is_wakeable ? "" : "not "); worker_sockets = apr_pcalloc(pchild, threads_per_child * sizeof(apr_socket_t *));