On Thu, Jul 8, 2021 at 4:21 PM Stefan Eissing <stefan.eiss...@greenbytes.de> wrote: > > I see many of those: > [Thu Jul 08 14:16:55.301670 2021] [mpm_event:error] [pid 81101:tid > 123145411510272] (9)Bad file descriptor: AH00468: error closing socket > -1/7ff9cf0086b0 from process_socket > > which come from event.c#1263: rc = start_lingering_close_blocking(cs); > calling event.c#864: kill_connection_ex(cs, from); > and event.c#837: close_socket_nonblocking_ex(cs->pfd.desc.s, from);
OK, so I think this could be addressed by the attached patch. This is the same as v0 plus some changes in ap_start_lingering_close() (from server/connection.c) to not close the socket on error. ap_start_lingering_close() did not close the socket consistently, so the caller had to call apr_socket_close() anyway and sometimes fail.. The close on failure is moved to ap_lingering_close() which is the one that should care. Still the AP_DEBUG_ASSERT(0) triggering with this?
Index: server/mpm/event/event.c =================================================================== --- server/mpm/event/event.c (revision 1891217) +++ server/mpm/event/event.c (working copy) @@ -554,9 +554,10 @@ static APR_INLINE int connections_above_limit(int return 1; } -static void abort_socket_nonblocking(apr_socket_t *csd) +static void close_socket_nonblocking(apr_socket_t *csd) { apr_status_t rv; + apr_socket_timeout_set(csd, 0); rv = apr_socket_close(csd); if (rv != APR_SUCCESS) { @@ -573,7 +574,7 @@ static void close_worker_sockets(void) apr_socket_t *csd = worker_sockets[i]; if (csd) { worker_sockets[i] = NULL; - abort_socket_nonblocking(csd); + close_socket_nonblocking(csd); } } for (;;) { @@ -587,12 +588,16 @@ static void close_worker_sockets(void) continue; } cs->chain = NULL; - abort_socket_nonblocking(cs->pfd.desc.s); + close_socket_nonblocking(cs->pfd.desc.s); } } -static void wakeup_listener(void) +static void shutdown_listener(void) { + ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, + "shutting down listener (%s)", + listener_may_exit ? "again" : "first"); + listener_may_exit = 1; disable_listensocks(); @@ -643,7 +648,7 @@ static void signal_threads(int mode) /* in case we weren't called from the listener thread, wake up the * listener thread */ - wakeup_listener(); + shutdown_listener(); /* for ungraceful termination, let the workers exit now; * for graceful termination, the listener thread will notify the @@ -779,6 +784,8 @@ static apr_status_t decrement_connection_count(voi { int is_last_connection; event_conn_state_t *cs = cs_; + ap_log_cerror(APLOG_MARK, APLOG_TRACE4, 0, cs->c, + "cleanup connection in state %i", (int)cs->pub.state); switch (cs->pub.state) { case CONN_STATE_LINGER_NORMAL: case CONN_STATE_LINGER_SHORT: @@ -817,6 +824,22 @@ static void notify_resume(event_conn_state_t *cs, ap_run_resume_connection(cs->c, cs->r); } +static void kill_connection(event_conn_state_t *cs) +{ + ap_log_cerror(APLOG_MARK, APLOG_TRACE4, 0, cs->c, + "kill connection in state %i", (int)cs->pub.state); + close_socket_nonblocking(cs->pfd.desc.s); + ap_queue_info_push_pool(worker_queue_info, cs->p); +} + +static int kill_connection_nonblocking(event_conn_state_t *cs) +{ + kill_connection(cs); + if (dying) + ap_queue_interrupt_one(worker_queue); + return 1; +} + /* * Close our side of the connection, flushing data to the client first. * Pre-condition: cs is not in any timeout queue and not in the pollset, @@ -827,12 +850,11 @@ static void notify_resume(event_conn_state_t *cs, */ static int start_lingering_close_blocking(event_conn_state_t *cs) { - apr_socket_t *csd = cs->pfd.desc.s; - + ap_log_cerror(APLOG_MARK, APLOG_TRACE4, 0, cs->c, + "lingering close in state %i", (int)cs->pub.state); if (ap_start_lingering_close(cs->c)) { notify_suspend(cs); - apr_socket_close(csd); - ap_queue_info_push_pool(worker_queue_info, cs->p); + kill_connection(cs); return DONE; } @@ -839,11 +861,11 @@ static int start_lingering_close_blocking(event_co #ifdef AP_DEBUG { apr_status_t rv; - rv = apr_socket_timeout_set(csd, 0); + rv = apr_socket_timeout_set(cs->pfd.desc.s, 0); AP_DEBUG_ASSERT(rv == APR_SUCCESS); } #else - apr_socket_timeout_set(csd, 0); + apr_socket_timeout_set(cs->pfd.desc.s, 0); #endif cs->queue_timestamp = apr_time_now(); @@ -874,9 +896,11 @@ static int start_lingering_close_blocking(event_co */ static int start_lingering_close_nonblocking(event_conn_state_t *cs) { - event_conn_state_t *chain; + ap_log_cerror(APLOG_MARK, APLOG_TRACE4, 0, cs->c, + "lingering close nonblocking in state %i", + (int)cs->pub.state); for (;;) { - cs->chain = chain = defer_linger_chain; + event_conn_state_t *chain = cs->chain = defer_linger_chain; if (apr_atomic_casptr((void *)&defer_linger_chain, cs, chain) != chain) { /* Race lost, try again */ @@ -887,24 +911,6 @@ static int start_lingering_close_nonblocking(event } /* - * forcibly close a lingering connection after the lingering period has - * expired - * Pre-condition: cs is not in any timeout queue and not in the pollset - * return: irrelevant (need same prototype as start_lingering_close) - */ -static int stop_lingering_close(event_conn_state_t *cs) -{ - apr_socket_t *csd = ap_get_conn_socket(cs->c); - ap_log_error(APLOG_MARK, APLOG_TRACE4, 0, ap_server_conf, - "socket abort in state %i", (int)cs->pub.state); - abort_socket_nonblocking(csd); - ap_queue_info_push_pool(worker_queue_info, cs->p); - if (dying) - ap_queue_interrupt_one(worker_queue); - return 0; -} - -/* * This runs before any non-MPM cleanup code on the connection; * if the connection is currently suspended as far as modules * know, provide notification of resumption. @@ -973,9 +979,12 @@ static int event_post_read_request(request_rec *r) /* Forward declare */ static void process_lingering_close(event_conn_state_t *cs); -static void update_reqevents_from_sense(event_conn_state_t *cs) +static void update_reqevents_from_sense(event_conn_state_t *cs, int sense) { - if (cs->pub.sense == CONN_SENSE_WANT_READ) { + if (sense < 0) { + sense = cs->pub.sense; + } + if (sense == CONN_SENSE_WANT_READ) { cs->pfd.reqevents = APR_POLLIN | APR_POLLHUP; } else { @@ -1020,14 +1029,14 @@ static void process_socket(apr_thread_t *thd, apr_ apr_pool_cleanup_null); ap_set_module_config(c->conn_config, &mpm_event_module, cs); c->current_thread = thd; + c->cs = &(cs->pub); cs->c = c; - c->cs = &(cs->pub); cs->p = p; cs->sc = ap_get_module_config(ap_server_conf->module_config, &mpm_event_module); cs->pfd.desc_type = APR_POLL_SOCKET; - cs->pfd.reqevents = APR_POLLIN; cs->pfd.desc.s = sock; + update_reqevents_from_sense(cs, CONN_SENSE_WANT_READ); pt->type = PT_CSD; pt->baton = cs; cs->pfd.client_data = pt; @@ -1168,7 +1177,7 @@ read_request: cs->queue_timestamp = apr_time_now(); notify_suspend(cs); - update_reqevents_from_sense(cs); + update_reqevents_from_sense(cs, -1); apr_thread_mutex_lock(timeout_mutex); TO_QUEUE_APPEND(cs->sc->wc_q, cs); rv = apr_pollset_add(event_pollset, &cs->pfd); @@ -1179,8 +1188,7 @@ read_request: ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(03465) "process_socket: apr_pollset_add failure for " "write completion"); - apr_socket_close(cs->pfd.desc.s); - ap_queue_info_push_pool(worker_queue_info, cs->p); + kill_connection(cs); } else { apr_thread_mutex_unlock(timeout_mutex); @@ -1217,7 +1225,7 @@ read_request: notify_suspend(cs); /* Add work to pollset. */ - cs->pfd.reqevents = APR_POLLIN; + update_reqevents_from_sense(cs, CONN_SENSE_WANT_READ); apr_thread_mutex_lock(timeout_mutex); TO_QUEUE_APPEND(cs->sc->ka_q, cs); rv = apr_pollset_add(event_pollset, &cs->pfd); @@ -1228,8 +1236,7 @@ read_request: ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(03093) "process_socket: apr_pollset_add failure for " "keep alive"); - apr_socket_close(cs->pfd.desc.s); - ap_queue_info_push_pool(worker_queue_info, cs->p); + kill_connection(cs); } else { apr_thread_mutex_unlock(timeout_mutex); @@ -1281,7 +1288,7 @@ static apr_status_t event_resume_suspended (conn_r cs->pub.state = CONN_STATE_WRITE_COMPLETION; notify_suspend(cs); - update_reqevents_from_sense(cs); + update_reqevents_from_sense(cs, -1); apr_thread_mutex_lock(timeout_mutex); TO_QUEUE_APPEND(cs->sc->wc_q, cs); apr_pollset_add(event_pollset, &cs->pfd); @@ -1309,10 +1316,13 @@ static void check_infinite_requests(void) static void close_listeners(int *closed) { + ap_log_error(APLOG_MARK, APLOG_TRACE4, 0, ap_server_conf, + "clos%s listeners (connection_count=%u)", + *closed ? "ed" : "ing", apr_atomic_read32(&connection_count)); if (!*closed) { int i; + *closed = 1; ap_close_listeners_ex(my_bucket->listeners); - *closed = 1; dying = 1; ap_scoreboard_image->parent[ap_child_slot].quiescing = 1; for (i = 0; i < threads_per_child; ++i) { @@ -1417,11 +1427,16 @@ static apr_status_t push2worker(event_conn_state_t /* trash the connection; we couldn't queue the connected * socket to a worker */ - if (csd) { - abort_socket_nonblocking(csd); + if (cs) { + kill_connection_nonblocking(cs); } - if (ptrans) { - ap_queue_info_push_pool(worker_queue_info, ptrans); + else { + if (csd) { + close_socket_nonblocking(csd); + } + if (ptrans) { + ap_queue_info_push_pool(worker_queue_info, ptrans); + } } signal_threads(ST_GRACEFUL); } @@ -1678,15 +1693,12 @@ static void process_lingering_close(event_conn_sta } while (rv == APR_SUCCESS); if (!APR_STATUS_IS_EAGAIN(rv)) { - rv = apr_socket_close(csd); - AP_DEBUG_ASSERT(rv == APR_SUCCESS); - ap_queue_info_push_pool(worker_queue_info, cs->p); + kill_connection(cs); return; } /* Re-queue the connection to come back when readable */ - cs->pfd.reqevents = APR_POLLIN; - cs->pub.sense = CONN_SENSE_DEFAULT; + update_reqevents_from_sense(cs, CONN_SENSE_WANT_READ); q = (cs->pub.state == CONN_STATE_LINGER_SHORT) ? short_linger_q : linger_q; apr_thread_mutex_lock(timeout_mutex); TO_QUEUE_APPEND(q, cs); @@ -1697,20 +1709,17 @@ static void process_lingering_close(event_conn_sta apr_thread_mutex_unlock(timeout_mutex); ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(03092) "process_lingering_close: apr_pollset_add failure"); - rv = apr_socket_close(cs->pfd.desc.s); - AP_DEBUG_ASSERT(rv == APR_SUCCESS); - ap_queue_info_push_pool(worker_queue_info, cs->p); + kill_connection(cs); return; } apr_thread_mutex_unlock(timeout_mutex); } -/* call 'func' for all elements of 'q' with timeout less than 'timeout_time'. +/* call 'func' for all elements of 'q' with expiry before 'now'. * Pre-condition: timeout_mutex must already be locked * Post-condition: timeout_mutex will be locked again */ -static void process_timeout_queue(struct timeout_queue *q, - apr_time_t timeout_time, +static void process_timeout_queue(struct timeout_queue *q, apr_time_t now, int (*func)(event_conn_state_t *)) { apr_uint32_t total = 0, count; @@ -1730,19 +1739,18 @@ static void process_lingering_close(event_conn_sta while (cs != APR_RING_SENTINEL(&qp->head, event_conn_state_t, timeout_list)) { /* Trash the entry if: - * - no timeout_time was given (asked for all), or + * - no 'now' was given (asked for all), or * - it expired (according to the queue timeout), or * - the system clock skewed in the past: no entry should be - * registered above the given timeout_time (~now) + the queue - * timeout, we won't keep any here (eg. for centuries). + * registered above the given 'now' + the queue timeout, + * we won't keep any here (eg. for centuries). * * Otherwise stop, no following entry will match thanks to the * single timeout per queue (entries are added to the end!). * This allows maintenance in O(1). */ - if (timeout_time - && cs->queue_timestamp + qp->timeout > timeout_time - && cs->queue_timestamp < timeout_time + qp->timeout) { + if (now && cs->queue_timestamp + qp->timeout > now + && cs->queue_timestamp < now + qp->timeout) { /* Since this is the next expiring of this queue, update the * overall queues' next expiry if it's later than this one. */ @@ -1790,18 +1798,17 @@ static void process_lingering_close(event_conn_sta apr_thread_mutex_lock(timeout_mutex); } -static void process_keepalive_queue(apr_time_t timeout_time) +static void process_keepalive_queue(apr_time_t now) { /* If all workers are busy, we kill older keep-alive connections so * that they may connect to another process. */ - if (!timeout_time) { + if (!now) { ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, - "All workers are busy or dying, will close %u " + "All workers are busy or dying, will kill %u " "keep-alive connections", *keepalive_q->total); } - process_timeout_queue(keepalive_q, timeout_time, - start_lingering_close_nonblocking); + process_timeout_queue(keepalive_q, now, kill_connection_nonblocking); } static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) @@ -1831,9 +1838,9 @@ static void * APR_THREAD_FUNC listener_thread(apr_ timer_event_t *te; const apr_pollfd_t *out_pfd; apr_int32_t num = 0; - apr_interval_time_t timeout_interval; + apr_interval_time_t timeout; socket_callback_baton_t *user_chain; - apr_time_t now, timeout_time; + apr_time_t now, next_expiry; int workers_were_busy = 0; if (conns_this_child <= 0) @@ -1886,18 +1893,18 @@ static void * APR_THREAD_FUNC listener_thread(apr_ * up occurs, otherwise periodic checks (maintenance, shutdown, ...) * must be performed. */ - timeout_interval = -1; + timeout = -1; /* Push expired timers to a worker, the first remaining one determines * the maximum time to poll() below, if any. */ - timeout_time = timers_next_expiry; - if (timeout_time && timeout_time < now + EVENT_FUDGE_FACTOR) { + next_expiry = timers_next_expiry; + if (next_expiry && next_expiry < now + EVENT_FUDGE_FACTOR) { apr_thread_mutex_lock(g_timer_skiplist_mtx); while ((te = apr_skiplist_peek(timer_skiplist))) { if (te->when > now + EVENT_FUDGE_FACTOR) { timers_next_expiry = te->when; - timeout_interval = te->when - now; + timeout = te->when - now; break; } apr_skiplist_pop(timer_skiplist, NULL); @@ -1921,37 +1928,27 @@ static void * APR_THREAD_FUNC listener_thread(apr_ } /* Same for queues, use their next expiry, if any. */ - timeout_time = queues_next_expiry; - if (timeout_time - && (timeout_interval < 0 - || timeout_time <= now - || timeout_interval > timeout_time - now)) { - timeout_interval = timeout_time > now ? timeout_time - now : 1; + next_expiry = queues_next_expiry; + if (next_expiry + && (timeout < 0 + || next_expiry <= now + || timeout > next_expiry - now)) { + timeout = next_expiry > now ? next_expiry - now : 1; } /* When non-wakeable, don't wait more than 100 ms, in any case. */ #define NON_WAKEABLE_POLL_TIMEOUT apr_time_from_msec(100) if (!listener_is_wakeable - && (timeout_interval < 0 - || timeout_interval > NON_WAKEABLE_POLL_TIMEOUT)) { - timeout_interval = NON_WAKEABLE_POLL_TIMEOUT; + && (timeout < 0 + || timeout > NON_WAKEABLE_POLL_TIMEOUT)) { + timeout = NON_WAKEABLE_POLL_TIMEOUT; } - rc = apr_pollset_poll(event_pollset, timeout_interval, &num, &out_pfd); + ap_log_error(APLOG_MARK, APLOG_TRACE8, 0, ap_server_conf, + "polling timeout=%" APR_TIME_T_FMT, timeout); + rc = apr_pollset_poll(event_pollset, timeout, &num, &out_pfd); if (rc != APR_SUCCESS) { - if (APR_STATUS_IS_EINTR(rc)) { - /* Woken up, if we are exiting or listeners are disabled we - * must fall through to kill kept-alive connections or test - * whether listeners should be re-enabled. Otherwise we only - * need to update timeouts (logic is above, so simply restart - * the loop). - */ - if (!listener_may_exit && !listeners_disabled()) { - continue; - } - timeout_time = 0; - } - else if (!APR_STATUS_IS_TIMEUP(rc)) { + if (!APR_STATUS_IS_EINTR(rc) && !APR_STATUS_IS_TIMEUP(rc)) { ap_log_error(APLOG_MARK, APLOG_CRIT, rc, ap_server_conf, APLOGNO(03267) "apr_pollset_poll failed. Attempting to " @@ -1960,12 +1957,12 @@ static void * APR_THREAD_FUNC listener_thread(apr_ } num = 0; } - - if (listener_may_exit) { - close_listeners(&closed); - if (terminate_mode == ST_UNGRACEFUL - || apr_atomic_read32(&connection_count) == 0) - break; + if (APLOGtrace8(ap_server_conf)) { + apr_time_t expiry = queues_next_expiry; + ap_log_error(APLOG_MARK, APLOG_TRACE8, rc, ap_server_conf, + "polled num=%u exit=%d count=%d expiry=%" APR_TIME_T_FMT, + num, listener_may_exit, apr_atomic_read32(&connection_count), + expiry > now ? expiry - now : expiry); } for (user_chain = NULL; num; --num, ++out_pfd) { @@ -2020,7 +2017,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_ AP_DEBUG_ASSERT(0); ap_log_error(APLOG_MARK, APLOG_ERR, rc, ap_server_conf, APLOGNO(03094) "pollset remove failed"); - start_lingering_close_nonblocking(cs); + kill_connection_nonblocking(cs); break; } @@ -2033,12 +2030,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_ get_worker(&have_idle_worker, blocking, &workers_were_busy); if (!have_idle_worker) { - if (remove_from_q == cs->sc->ka_q) { - start_lingering_close_nonblocking(cs); - } - else { - stop_lingering_close(cs); - } + kill_connection_nonblocking(cs); } else if (push2worker(cs, NULL, NULL) == APR_SUCCESS) { have_idle_worker = 0; @@ -2184,7 +2176,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_ * during queues' processing, with the lock held. This works both * with and without wake-ability. */ - if (timeout_time && timeout_time < (now = apr_time_now())) { + if (next_expiry && next_expiry < (now = apr_time_now())) { /* handle timed out sockets */ apr_thread_mutex_lock(timeout_mutex); @@ -2198,15 +2190,27 @@ static void * APR_THREAD_FUNC listener_thread(apr_ else { process_keepalive_queue(now); } + ap_log_error(APLOG_MARK, APLOG_TRACE8, 0, ap_server_conf, + "process_ka_q() expiry=%" APR_TIME_T_FMT, + queues_next_expiry > now ? queues_next_expiry - now : 0); /* Step 2: write completion timeouts */ process_timeout_queue(write_completion_q, now, start_lingering_close_nonblocking); + ap_log_error(APLOG_MARK, APLOG_TRACE8, 0, ap_server_conf, + "process_wc_q() expiry=%" APR_TIME_T_FMT, + queues_next_expiry > now ? queues_next_expiry - now : 0); /* Step 3: (normal) lingering close completion timeouts */ process_timeout_queue(linger_q, now, - stop_lingering_close); + kill_connection_nonblocking); + ap_log_error(APLOG_MARK, APLOG_TRACE8, 0, ap_server_conf, + "process_linger_q() expiry=%" APR_TIME_T_FMT, + queues_next_expiry > now ? queues_next_expiry - now : 0); /* Step 4: (short) lingering close completion timeouts */ process_timeout_queue(short_linger_q, now, - stop_lingering_close); + kill_connection_nonblocking); + ap_log_error(APLOG_MARK, APLOG_TRACE8, 0, ap_server_conf, + "process_short_linger_q() expiry=%" APR_TIME_T_FMT, + queues_next_expiry > now ? queues_next_expiry - now : 0); apr_thread_mutex_unlock(timeout_mutex); @@ -2571,7 +2575,7 @@ static void setup_threads_runtime(void) AP_DEBUG_ASSERT(i < num_listensocks); pfd = &listener_pollfd[i]; - pfd->reqevents = APR_POLLIN; + pfd->reqevents = APR_POLLIN | APR_POLLHUP; pfd->desc_type = APR_POLL_SOCKET; pfd->desc.s = lr->sd; @@ -2703,11 +2707,16 @@ static void join_workers(apr_thread_t * listener, */ iter = 0; - while (iter < 10 && !dying) { - /* listener has not stopped accepting yet */ + while (iter < 10) { apr_sleep(apr_time_make(0, 500000)); - wakeup_listener(); - ++iter; + /* listener has stopped accepting yet? */ + if (dying) { + break; + } + iter++; + ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, + "listener has not stopped accepting yet (%d iter)", iter); + shutdown_listener(); } if (iter >= 10) { ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, APLOGNO(00475) @@ -2918,7 +2927,13 @@ static void child_main(int child_num_arg, int chil * If the worker hasn't exited, then this blocks until * they have (then cleans up). */ + ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, + "%s termination received, joining workers", + rv == AP_MPM_PODX_GRACEFUL ? "graceful" : "ungraceful"); join_workers(ts->listener, threads); + ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf, + "%s termination, workers joined, exiting", + rv == AP_MPM_PODX_GRACEFUL ? "graceful" : "ungraceful"); } free(threads); Index: server/connection.c =================================================================== --- server/connection.c (revision 1891217) +++ server/connection.c (working copy) @@ -139,18 +139,12 @@ AP_DECLARE(int) ap_start_lingering_close(conn_rec ap_flush_conn(c); #ifdef NO_LINGCLOSE - apr_socket_close(csd); return 1; #else /* Shut down the socket for write, which will send a FIN * to the peer. */ - if (c->aborted - || apr_socket_shutdown(csd, APR_SHUTDOWN_WRITE) != APR_SUCCESS) { - apr_socket_close(csd); - return 1; - } - return 0; + return (c->aborted || apr_socket_shutdown(csd, APR_SHUTDOWN_WRITE)); #endif } @@ -162,6 +156,7 @@ AP_DECLARE(void) ap_lingering_close(conn_rec *c) apr_socket_t *csd = ap_get_conn_socket(c); if (ap_start_lingering_close(c)) { + apr_socket_close(csd); return; }