On Mon, 2009-10-19 at 09:22 +1100, Bojan Smojver wrote:
> and I haven't touched worker at all
Here is another take on the whole thing, this time with worker included. In
my tests, prefork performed a lot better with this approach: worker tended to
stay under the DoS for longer and would also disconnect far more "good"
connections than prefork. Probably something to do with me closing the wrong
sockets - I didn't have time to check in detail.

The patches combine two different approaches for when we get maxed out:

1. If over 95% of workers are stuck in read, we close the reader sockets.

2. If the scoreboard hasn't changed during one maintenance interval, we close
all readers and at least 10% of all sockets.

--
Bojan
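To make the two triggers concrete, here is a small standalone sketch of the
arithmetic (not part of the patch - the helper names and counts below are
made up for illustration; in the patches the counts come straight off the
scoreboard):

#include <stdio.h>

/* Rule 1: over 95% of workers stuck in read (or keep alive) means
 * fewer than 5% of all slots are doing anything else.
 */
static int mostly_readers(int slots, int rdrs)
{
    return slots - rdrs < slots / 20;
}

/* Rule 2: when the scoreboard freezes for a full maintenance
 * interval, at least 10% of all client sockets get closed.
 */
static int cull_target(int slots)
{
    return slots / 10;
}

int main(void)
{
    int slots = 256, rdrs = 250;

    printf("95%% rule triggers: %s\n",
           mostly_readers(slots, rdrs) ? "yes" : "no");
    printf("10%% cull target: %d sockets\n", cull_target(slots));
    return 0;
}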
--- httpd-2.2.14-v/server/mpm/prefork/prefork.c	2009-02-01 07:54:55.000000000 +1100
+++ httpd-2.2.14/server/mpm/prefork/prefork.c	2009-10-21 09:34:54.508085337 +1100
@@ -48,6 +48,7 @@
 #include "ap_listen.h"
 #include "ap_mmn.h"
 #include "apr_poll.h"
+#include "apr_md5.h"
 
 #ifdef HAVE_BSTRING_H
 #include <bstring.h>    /* for IRIX, FD_SET calls bzero() */
@@ -336,6 +337,29 @@
     die_now = 1;
 }
 
+static int volatile client_socket = -1;
+
+#ifndef NO_USE_SIGACTION
+static void close_client_socket(int sig, siginfo_t *info, void *context)
+#else
+static void close_client_socket(int sig)
+#endif
+{
+#ifndef NO_USE_SIGACTION
+    if (info->si_pid == getppid()) {
+#endif
+        if (client_socket != -1) {
+            close(client_socket);
+            client_socket = -1;
+        }
+#ifndef NO_USE_SIGACTION
+    }
+    else {
+        clean_child_exit(0);
+    }
+#endif
+}
+
 /* volatile just in case */
 static int volatile shutdown_pending;
 static int volatile restart_pending;
@@ -659,8 +683,12 @@
         current_conn = ap_run_create_connection(ptrans, ap_server_conf, csd,
                                                 my_child_num, sbh,
                                                 bucket_alloc);
         if (current_conn) {
+            apr_os_sock_get((apr_os_sock_t *)&client_socket, csd);
+
             ap_process_connection(current_conn, csd);
             ap_lingering_close(current_conn);
+
+            client_socket = -1;
         }
 
         /* Check the pod and the generation number after processing a
@@ -733,6 +761,10 @@
     }
 
     if (!pid) {
+#ifndef NO_USE_SIGACTION
+        struct sigaction act;
+#endif
+
 #ifdef HAVE_BINDPROCESSOR
         /* by default AIX binds to a single processor
          * this bit unbinds children which will then bind to another cpu
@@ -755,6 +787,19 @@
          * The pod is used for signalling the graceful restart.
          */
         apr_signal(AP_SIG_GRACEFUL, stop_listening);
+
+        /* If the parent sends SIGINT to the child, we close the client
+         * socket, as we suspect that we are under DoS attack.
+         */
+#ifndef NO_USE_SIGACTION
+        memset(&act, 0, sizeof(act));
+        act.sa_flags = SA_SIGINFO;
+        act.sa_sigaction = close_client_socket;
+        sigaction(SIGINT, &act, NULL);
+#else
+        apr_signal(SIGINT, close_client_socket);
+#endif
+
         child_main(slot);
     }
 
@@ -803,6 +848,8 @@
     int free_slots[MAX_SPAWN_RATE];
     int last_non_dead;
     int total_non_dead;
+    int status;
+    static apr_time_t maxed_out = 0;
 
     /* initialize the free_list */
     free_length = 0;
@@ -813,8 +860,6 @@
     total_non_dead = 0;
 
     for (i = 0; i < ap_daemons_limit; ++i) {
-        int status;
-
         if (i >= ap_max_daemons_limit && free_length == idle_spawn_rate)
             break;
         ws = &ap_scoreboard_image->servers[i][0];
@@ -856,12 +901,17 @@
          */
         ap_mpm_pod_signal(pod);
         idle_spawn_rate = 1;
+        maxed_out = 0;
     }
     else if (idle_count < ap_daemons_min_free) {
         /* terminate the free list */
         if (free_length == 0) {
             /* only report this condition once */
             static int reported = 0;
+            static unsigned char sb_digest[APR_MD5_DIGESTSIZE];
+            apr_time_t now = apr_time_now();
+            apr_md5_ctx_t ctx;
+            pid_t pid;
 
             if (!reported) {
                 ap_log_error(APLOG_MARK, APLOG_ERR, 0, ap_server_conf,
@@ -870,6 +920,117 @@
                 reported = 1;
             }
             idle_spawn_rate = 1;
+
+            /* If after one maintenance interval we still see the same
+             * situation on the scoreboard, close all client sockets in
+             * read state and at least 10% of all client sockets.
+             * Crude, but seems to clear things out.
+             */
+            if (maxed_out) {
+                apr_time_t diff = now - maxed_out;
+
+                if (diff >= SCOREBOARD_MAINTENANCE_INTERVAL) {
+                    unsigned char cur_digest[APR_MD5_DIGESTSIZE];
+
+                    /* Current digest of the scoreboard.
+                     */
+                    apr_md5_init(&ctx);
+                    for (i = 0; i < ap_daemons_limit; ++i) {
+                        status = ap_scoreboard_image->servers[i][0].status;
+                        apr_md5_update(&ctx, &status, sizeof(status));
+
+                        pid = ap_scoreboard_image->parent[i].pid;
+                        apr_md5_update(&ctx, &pid, sizeof(pid));
+                    }
+                    apr_md5_final(cur_digest, &ctx);
+
+                    /* If we haven't had a change for one maintenance
+                     * interval, we need to make room.
+                     */
+                    if (memcmp(sb_digest, cur_digest, APR_MD5_DIGESTSIZE)) {
+                        maxed_out = 0;
+                    }
+                    else {
+                        int rdrs = 0, cull = ap_daemons_limit / 10;
+
+                        /* Disconnect all readers (includes keep alive).
+                         */
+                        for (i = 0; i < ap_daemons_limit; ++i) {
+                            pid = ap_scoreboard_image->parent[i].pid;
+                            status = ap_scoreboard_image->servers[i][0].status;
+
+                            if (status == SERVER_BUSY_READ ||
+                                status == SERVER_BUSY_KEEPALIVE) {
+                                ap_mpm_safe_kill(pid, SIGINT);
+                                rdrs++;
+                            }
+                        }
+
+                        /* Close non-readers too, up to 10% of all sockets.
+                         */
+                        for (i = 0; i < ap_daemons_limit && cull > rdrs; ++i) {
+                            pid = ap_scoreboard_image->parent[i].pid;
+                            status = ap_scoreboard_image->servers[i][0].status;
+
+                            if (status != SERVER_BUSY_READ &&
+                                status != SERVER_BUSY_KEEPALIVE) {
+                                ap_mpm_safe_kill(pid, SIGINT);
+                                cull--;
+                            }
+                        }
+                    }
+                }
+            }
+            else {
+                int rdrs = 0;
+
+                /* Create digest of the scoreboard, see if things
+                 * change next time around.
+                 */
+                apr_md5_init(&ctx);
+                for (i = 0; i < ap_daemons_limit; ++i) {
+                    status = ap_scoreboard_image->servers[i][0].status;
+
+                    /* These are the conditions we are concerned with.
+                     */
+                    switch (status) {
+                    case SERVER_BUSY_READ:
+                    case SERVER_BUSY_KEEPALIVE:
+                        rdrs++; /* fall through */
+                    case SERVER_BUSY_WRITE:
+                    case SERVER_DEAD:
+                        break;
+                    default:
+                        return;
+                    }
+
+                    apr_md5_update(&ctx, &status, sizeof(status));
+
+                    pid = ap_scoreboard_image->parent[i].pid;
+                    apr_md5_update(&ctx, &pid, sizeof(pid));
+                }
+                apr_md5_final(sb_digest, &ctx);
+
+                /* Over 95% in read state (includes keep alive), clear now.
+                 */
+                if (ap_daemons_limit - rdrs < ap_daemons_limit / 20) {
+                    /* Disconnect all readers (includes keep alive).
+                     */
+                    for (i = 0; i < ap_daemons_limit; ++i) {
+                        pid = ap_scoreboard_image->parent[i].pid;
+                        status = ap_scoreboard_image->servers[i][0].status;
+
+                        if (status == SERVER_BUSY_READ ||
+                            status == SERVER_BUSY_KEEPALIVE) {
+                            ap_mpm_safe_kill(pid, SIGINT);
+                            rdrs++;
+                        }
+                    }
+                }
+                else {
+                    maxed_out = now;
+                }
+            }
         }
         else {
             if (idle_spawn_rate >= 8) {
@@ -902,10 +1063,13 @@
             else if (idle_spawn_rate < MAX_SPAWN_RATE) {
                 idle_spawn_rate *= 2;
             }
+
+            maxed_out = 0;
         }
     }
     else {
         idle_spawn_rate = 1;
+        maxed_out = 0;
     }
 }
--- httpd-2.2.14-v/server/mpm/worker/worker.c	2007-07-18 00:48:25.000000000 +1000
+++ httpd-2.2.14/server/mpm/worker/worker.c	2009-10-21 07:54:37.861072656 +1100
@@ -32,6 +32,7 @@
 #include "apr_poll.h"
 #define APR_WANT_STRFUNC
 #include "apr_want.h"
+#include "apr_md5.h"
 
 #if APR_HAVE_UNISTD_H
 #include <unistd.h>
@@ -357,6 +358,83 @@
     clean_child_exit(0);
 }
 
+#ifndef NO_USE_SIGACTION
+static void close_client_socket(int sig, siginfo_t *info, void *context)
+#else
+static void close_client_socket(int sig)
+#endif
+{
+#ifndef NO_USE_SIGACTION
+    if (info->si_pid == getppid())
+#endif
+    {
+        int csd, i, j, slot = 0, status, total_rdrs = 0, rdrs = 0, cull;
+
+        /* Determine total number of readers (includes keep alive), our
+         * slot and the number of our own readers.
+         */
+        for (i = 0; i < ap_daemons_limit; ++i) {
+            if (ap_scoreboard_image->parent[i].pid == ap_my_pid) {
+                slot = i;
+            }
+
+            for (j = 0; j < ap_threads_per_child; j++) {
+                status = ap_scoreboard_image->servers[i][j].status;
+
+                if (status == SERVER_BUSY_READ ||
+                    status == SERVER_BUSY_KEEPALIVE) {
+
+                    total_rdrs++;
+
+                    if (slot == i) {
+                        rdrs++;
+                    }
+                }
+            }
+        }
+
+        /* Our share of non-readers to close.
+         */
+        cull = ((ap_threads_per_child - rdrs) *
+                (ap_daemons_limit * ap_threads_per_child / 10 - total_rdrs)) /
+               (ap_daemons_limit * ap_threads_per_child);
+
+        /* Disconnect all readers (includes keep alive).
+         */
+        for (j = 0; j < ap_threads_per_child; j++) {
+            status = ap_scoreboard_image->servers[slot][j].status;
+
+            if (worker_sockets[j] &&
+                (status == SERVER_BUSY_READ ||
+                 status == SERVER_BUSY_KEEPALIVE)) {
+
+                apr_os_sock_get((apr_os_sock_t *)&csd, worker_sockets[j]);
+                close(csd);
+            }
+        }
+
+        /* Close non-readers too, up to 10% of all sockets.
+         */
+        for (j = 0; j < ap_threads_per_child && cull > 0; j++) {
+            status = ap_scoreboard_image->servers[slot][j].status;
+
+            if (worker_sockets[j] &&
+                status != SERVER_BUSY_READ &&
+                status != SERVER_BUSY_KEEPALIVE) {
+
+                apr_os_sock_get((apr_os_sock_t *)&csd, worker_sockets[j]);
+                close(csd);
+                cull--;
+            }
+        }
+    }
+#ifndef NO_USE_SIGACTION
+    else {
+        clean_child_exit(0);
+    }
+#endif
+}
+
 /*****************************************************************
  * Connection structures and accounting...
  */
@@ -1247,12 +1325,31 @@
         join_workers(ts->listener, threads);
     }
     else { /* !one_process */
+#ifndef NO_USE_SIGACTION
+        struct sigaction act;
+#endif
+
         /* remove SIGTERM from the set of blocked signals... if one of
          * the other threads in the process needs to take us down
          * (e.g., for MaxRequestsPerChild) it will send us SIGTERM
          */
         unblock_signal(SIGTERM);
         apr_signal(SIGTERM, dummy_signal_handler);
+
+
+        /* If the parent sends SIGINT to the child, we close the client
+         * socket, as we suspect that we are under DoS attack.
+         */
+        unblock_signal(SIGINT);
+#ifndef NO_USE_SIGACTION
+        memset(&act, 0, sizeof(act));
+        act.sa_flags = SA_SIGINFO;
+        act.sa_sigaction = close_client_socket;
+        sigaction(SIGINT, &act, NULL);
+#else
+        apr_signal(SIGINT, close_client_socket);
+#endif
+
         /* Watch for any messages from the parent over the POD */
         while (1) {
             rv = ap_mpm_pod_check(pod);
@@ -1404,6 +1501,8 @@
     int last_non_dead;
     int total_non_dead;
     int active_thread_count = 0;
+    int status = SERVER_DEAD;
+    static apr_time_t maxed_out = 0;
 
     /* initialize the free_list */
     free_length = 0;
@@ -1415,7 +1514,6 @@
     for (i = 0; i < ap_daemons_limit; ++i) {
         /* Initialization to satisfy the compiler. It doesn't know
          * that ap_threads_per_child is always > 0 */
-        int status = SERVER_DEAD;
         int any_dying_threads = 0;
         int any_dead_threads = 0;
         int all_dead_threads = 1;
@@ -1509,12 +1607,17 @@
         /* Kill off one child */
         ap_mpm_pod_signal(pod, TRUE);
         idle_spawn_rate = 1;
+        maxed_out = 0;
     }
     else if (idle_thread_count < min_spare_threads) {
         /* terminate the free list */
         if (free_length == 0) {
             /* only report this condition once */
             static int reported = 0;
+            static unsigned char sb_digest[APR_MD5_DIGESTSIZE];
+            apr_time_t now = apr_time_now();
+            apr_md5_ctx_t ctx;
+            pid_t pid;
 
             if (!reported) {
                 ap_log_error(APLOG_MARK, APLOG_ERR, 0,
@@ -1524,6 +1627,95 @@
                 reported = 1;
             }
             idle_spawn_rate = 1;
+
+            /* If after one maintenance interval we still see the same
+             * situation on the scoreboard, close all client sockets in
+             * read state and at least 10% of all client sockets.
+             * Crude, but seems to clear things out.
+             */
+            if (maxed_out) {
+                apr_time_t diff = now - maxed_out;
+
+                if (diff >= SCOREBOARD_MAINTENANCE_INTERVAL) {
+                    unsigned char cur_digest[APR_MD5_DIGESTSIZE];
+
+                    /* Current digest of the scoreboard.
+                     */
+                    apr_md5_init(&ctx);
+                    for (i = 0; i < ap_daemons_limit; ++i) {
+                        for (j = 0; j < ap_threads_per_child; j++) {
+                            status = ap_scoreboard_image->servers[i][j].status;
+                            apr_md5_update(&ctx, &status, sizeof(status));
+                        }
+
+                        pid = ap_scoreboard_image->parent[i].pid;
+                        apr_md5_update(&ctx, &pid, sizeof(pid));
+                    }
+                    apr_md5_final(cur_digest, &ctx);
+
+                    /* If we haven't had a change for one maintenance
+                     * interval, we need to make room.
+                     */
+                    if (memcmp(sb_digest, cur_digest, APR_MD5_DIGESTSIZE)) {
+                        maxed_out = 0;
+                    }
+                    else {
+                        /* Signal child processes to close client sockets.
+                         */
+                        for (i = 0; i < ap_daemons_limit; ++i) {
+                            pid = ap_scoreboard_image->parent[i].pid;
+                            ap_mpm_safe_kill(pid, SIGINT);
+                        }
+                    }
+                }
+            }
+            else {
+                int rdrs = 0;
+
+                /* Create digest of the scoreboard, see if things
+                 * change next time around.
+                 */
+                apr_md5_init(&ctx);
+                for (i = 0; i < ap_daemons_limit; ++i) {
+                    for (j = 0; j < ap_threads_per_child; j++) {
+                        status = ap_scoreboard_image->servers[i][j].status;
+
+                        /* These are the conditions we are concerned with.
+                         */
+                        switch (status) {
+                        case SERVER_BUSY_READ:
+                        case SERVER_BUSY_KEEPALIVE:
+                            rdrs++; /* fall through */
+                        case SERVER_DEAD:
+                        case SERVER_BUSY_WRITE:
+                            break;
+                        default:
+                            return;
+                        }
+
+                        apr_md5_update(&ctx, &status, sizeof(status));
+                    }
+
+                    pid = ap_scoreboard_image->parent[i].pid;
+                    apr_md5_update(&ctx, &pid, sizeof(pid));
+                }
+                apr_md5_final(sb_digest, &ctx);
+
+                /* Over 95% of threads in read (includes keep alive), clear now.
+                 */
+                if (ap_daemons_limit * ap_threads_per_child - rdrs <
+                    ap_daemons_limit * ap_threads_per_child / 20) {
+                    /* Signal child processes to close client sockets.
+                     */
+                    for (i = 0; i < ap_daemons_limit; ++i) {
+                        pid = ap_scoreboard_image->parent[i].pid;
+                        ap_mpm_safe_kill(pid, SIGINT);
+                    }
+                }
+                else {
+                    maxed_out = now;
+                }
+            }
         }
         else {
             if (free_length > idle_spawn_rate) {
@@ -1551,10 +1743,13 @@
             else if (idle_spawn_rate < MAX_SPAWN_RATE) {
                 idle_spawn_rate *= 2;
             }
+
+            maxed_out = 0;
         }
     }
     else {
         idle_spawn_rate = 1;
+        maxed_out = 0;
     }
 }
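For what it's worth, here is the per-child arithmetic from the worker
close_client_socket() handler as a standalone sketch, with made-up numbers
(4 children, 64 threads each - not from the patch), showing how each child
works out its share of non-readers to close on top of its own readers:

#include <stdio.h>

int main(void)
{
    int daemons_limit = 4;       /* child processes (made up) */
    int threads_per_child = 64;  /* threads per child (made up) */
    int total_rdrs = 5;          /* readers across all children */
    int rdrs = 1;                /* readers in this child */

    /* Same expression as in the worker patch: scale the global
     * "10% of all sockets" target by this child's share of
     * non-reader threads.
     */
    int cull = ((threads_per_child - rdrs) *
                (daemons_limit * threads_per_child / 10 - total_rdrs)) /
               (daemons_limit * threads_per_child);

    /* (64 - 1) * (256 / 10 - 5) / 256 = 63 * 20 / 256 = 4 */
    printf("non-readers this child should close: %d\n", cull);
    return 0;
}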