The branch, master has been updated
       via  f73a4b1495830bcdd094a93732a89dd53b3c2f78 (commit)
      from  90e8c4dc7d89c460ad2da5a9af1cef006db3b1c0 (commit)

http://gitweb.samba.org/?p=ctdb.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit f73a4b1495830bcdd094a93732a89dd53b3c2f78
Author: Ronnie Sahlberg <[email protected]>
Date:   Thu May 3 11:42:41 2012 +1000

    Track all child process so we never send a signal to an unrelated process 
(our child died  and kernel wrapped the pid-space and reused the pid for a 
different process
    
    Wrap all creation of child processes inside ctdb_fork() which is used to 
track all processes we have spawned.
    Capture SIGCHLD to track also which child processes have terminated.
    
    Wrap kill() inside ctdb_kill() and make sure that we never send a !0 signal 
to a child process pid that has already terminated (and might have been 
replaced with a

-----------------------------------------------------------------------

Summary of changes:
 Makefile.in                 |    2 +-
 common/ctdb_fork.c          |  124 +++++++++++++++++++++++++++++++++++++++++++
 common/ctdb_util.c          |   18 ------
 include/ctdb_private.h      |    5 ++
 server/ctdb_call.c          |    4 +-
 server/ctdb_daemon.c        |   39 ++-----------
 server/ctdb_freeze.c        |    6 +-
 server/ctdb_lockwait.c      |    4 +-
 server/ctdb_recover.c       |    6 +-
 server/ctdb_recoverd.c      |   12 ++--
 server/ctdb_takeover.c      |    6 +-
 server/ctdb_traverse.c      |    2 +-
 server/ctdb_update_record.c |    6 +-
 server/ctdb_vacuum.c        |    2 +-
 server/eventscript.c        |    6 +-
 tests/src/ctdb_test.c       |    1 +
 tests/src/ctdbd_test.c      |    1 +
 17 files changed, 165 insertions(+), 79 deletions(-)
 create mode 100644 common/ctdb_fork.c


Changeset truncated at 500 lines:

diff --git a/Makefile.in b/Makefile.in
index 843a948..ecf03f2 100755
--- a/Makefile.in
+++ b/Makefile.in
@@ -68,7 +68,7 @@ UTIL_OBJ = lib/util/idtree.o lib/util/db_wrap.o 
lib/util/strlist.o lib/util/util
 CTDB_COMMON_OBJ =  common/ctdb_io.o common/ctdb_util.o \
        common/ctdb_ltdb.o common/ctdb_message.o common/cmdline.o  \
        lib/util/debug.o common/rb_tree.o @CTDB_SYSTEM_OBJ@ 
common/system_common.o \
-       common/ctdb_logging.c
+       common/ctdb_logging.c common/ctdb_fork.o
 
 CTDB_LIB_OBJ = libctdb/ctdb.o libctdb/io_elem.o libctdb/local_tdb.o \
        libctdb/messages.o libctdb/sync.o libctdb/control.o \
diff --git a/common/ctdb_fork.c b/common/ctdb_fork.c
new file mode 100644
index 0000000..81055c5
--- /dev/null
+++ b/common/ctdb_fork.c
@@ -0,0 +1,124 @@
+/* 
+   functions to track and manage processes
+
+   Copyright (C) Ronnie Sahlberg 2012
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/wait.h"
+#include "../include/ctdb_client.h"
+#include "../include/ctdb_private.h"
+#include "../common/rb_tree.h"
+
+/*
+ * This function forks a child process and drops the realtime 
+ * scheduler for the child process.
+ */
+pid_t ctdb_fork(struct ctdb_context *ctdb)
+{
+       pid_t pid;
+       char *process;
+
+       pid = fork();
+       if (pid == -1) {
+               return -1;
+       }
+       if (pid == 0) {
+               if (ctdb->do_setsched) {
+                       ctdb_restore_scheduler(ctdb);
+               }
+               ctdb->can_send_controls = false;
+               return 0;
+       }
+
+       if (getpid() != ctdb->ctdbd_pid) {
+               return pid;
+       }
+
+       process = talloc_asprintf(ctdb->child_processes, "process:%d", 
(int)pid);
+       trbt_insert32(ctdb->child_processes, pid, process);
+
+       return pid;
+}
+
+
+
+static void ctdb_sigchld_handler(struct tevent_context *ev,
+       struct tevent_signal *te, int signum, int count,
+       void *dont_care, 
+       void *private_data)
+{
+       struct ctdb_context *ctdb = talloc_get_type(private_data, struct 
ctdb_context);
+       int status;
+       pid_t pid = -1;
+
+       while (pid != 0) {
+               pid = waitpid(-1, &status, WNOHANG);
+               if (pid == -1) {
+                       DEBUG(DEBUG_ERR, (__location__ " waitpid() returned 
error. errno:%d\n", errno));
+                       return;
+               }
+               if (pid > 0) {
+                       char *process;
+
+                       if (getpid() != ctdb->ctdbd_pid) {
+                               continue;
+                       }
+
+                       process = trbt_lookup32(ctdb->child_processes, pid);
+                       if (process == NULL) {
+                               DEBUG(DEBUG_ERR,("Got SIGCHLD from pid:%d we 
didn not spawn with ctdb_fork\n", pid));
+                       }
+
+                       DEBUG(DEBUG_DEBUG, ("SIGCHLD from %d %s\n", (int)pid, 
process));
+                       talloc_free(process);
+               }
+       }
+}
+
+
+struct tevent_signal *
+ctdb_init_sigchld(struct ctdb_context *ctdb)
+{
+       struct tevent_signal *se;
+
+       ctdb->child_processes = trbt_create(ctdb, 0);
+
+       se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0, 
ctdb_sigchld_handler, ctdb);
+       return se;
+}
+
+int
+ctdb_kill(struct ctdb_context *ctdb, pid_t pid, int signum)
+{
+       char *process;
+
+       if (signum == 0) {
+               return kill(pid, signum);
+       }
+
+       if (getpid() != ctdb->ctdbd_pid) {
+               return kill(pid, signum);
+       }
+
+       process = trbt_lookup32(ctdb->child_processes, pid);
+       if (process == NULL) {
+               DEBUG(DEBUG_ERR,("ctdb_kill: trying to kill(%d, %d) a process 
that does not exist\n", pid, signum));
+               return 0;
+       }
+
+       return kill(pid, signum);
+}
diff --git a/common/ctdb_util.c b/common/ctdb_util.c
index 5584c17..b9af95b 100644
--- a/common/ctdb_util.c
+++ b/common/ctdb_util.c
@@ -332,24 +332,6 @@ void ctdb_restore_scheduler(struct ctdb_context *ctdb)
 #endif
 }
 
-/*
- * This function forks a child process and drops the realtime 
- * scheduler for the child process.
- */
-pid_t ctdb_fork(struct ctdb_context *ctdb)
-{
-       pid_t pid;
-
-       pid = fork();
-       if (pid == 0) {
-               if (ctdb->do_setsched) {
-                       ctdb_restore_scheduler(ctdb);
-               }
-               ctdb->can_send_controls = false;
-       }
-       return pid;
-}
-
 void set_nonblocking(int fd)
 {
        unsigned v;
diff --git a/include/ctdb_private.h b/include/ctdb_private.h
index d973fcc..086e427 100644
--- a/include/ctdb_private.h
+++ b/include/ctdb_private.h
@@ -501,6 +501,7 @@ struct ctdb_context {
        struct ctdb_reloadips_handle *reload_ips;
 
        const char *public_addresses_file;
+       struct trbt_tree *child_processes; 
 };
 
 struct ctdb_db_context {
@@ -1031,7 +1032,11 @@ void ctdb_node_connected(struct ctdb_node *node);
 bool ctdb_blocking_freeze(struct ctdb_context *ctdb);
 void ctdb_set_scheduler(struct ctdb_context *ctdb);
 void ctdb_restore_scheduler(struct ctdb_context *ctdb);
+
+struct tevent_signal *ctdb_init_sigchld(struct ctdb_context *ctdb);
 pid_t ctdb_fork(struct ctdb_context *ctdb);
+int ctdb_kill(struct ctdb_context *ctdb, pid_t pid, int signum);
+
 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb, 
                                 struct ctdb_req_control *c,
                                 TDB_DATA indata, 
diff --git a/server/ctdb_call.c b/server/ctdb_call.c
index 08b701c..fe7e947 100644
--- a/server/ctdb_call.c
+++ b/server/ctdb_call.c
@@ -1388,7 +1388,7 @@ static int revokechild_destructor(struct 
revokechild_handle *rc)
        if (rc->fd[1] != -1) {
                close(rc->fd[1]);
        }
-       kill(rc->child, SIGKILL);
+       ctdb_kill(rc->ctdb, rc->child, SIGKILL);
 
        DLIST_REMOVE(rc->ctdb_db->revokechild_active, rc);
        return 0;
@@ -1617,7 +1617,7 @@ int ctdb_start_revoke_ro_record(struct ctdb_context 
*ctdb, struct ctdb_db_contex
 child_finished:
                write(rc->fd[1], &c, 1);
                /* make sure we die when our parent dies */
-               while (kill(parent, 0) == 0 || errno != ESRCH) {
+               while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
                        sleep(5);
                }
                _exit(0);
diff --git a/server/ctdb_daemon.c b/server/ctdb_daemon.c
index f946ac6..899b3bd 100644
--- a/server/ctdb_daemon.c
+++ b/server/ctdb_daemon.c
@@ -1029,27 +1029,6 @@ failed:
        return -1;      
 }
 
-static void sig_child_handler(struct event_context *ev,
-       struct signal_event *se, int signum, int count,
-       void *dont_care, 
-       void *private_data)
-{
-//     struct ctdb_context *ctdb = talloc_get_type(private_data, struct 
ctdb_context);
-       int status;
-       pid_t pid = -1;
-
-       while (pid != 0) {
-               pid = waitpid(-1, &status, WNOHANG);
-               if (pid == -1) {
-                       DEBUG(DEBUG_ERR, (__location__ " waitpid() returned 
error. errno:%d\n", errno));
-                       return;
-               }
-               if (pid > 0) {
-                       DEBUG(DEBUG_DEBUG, ("SIGCHLD from %d\n", (int)pid));
-               }
-       }
-}
-
 static void ctdb_setup_event_callback(struct ctdb_context *ctdb, int status,
                                      void *private_data)
 {
@@ -1074,7 +1053,6 @@ int ctdb_start_daemon(struct ctdb_context *ctdb, bool 
do_fork, bool use_syslog,
        int res, ret = -1;
        struct fd_event *fde;
        const char *domain_socket_name;
-       struct signal_event *se;
 
        /* get rid of any old sockets */
        unlink(ctdb->daemon.name);
@@ -1105,7 +1083,6 @@ int ctdb_start_daemon(struct ctdb_context *ctdb, bool 
do_fork, bool use_syslog,
        ctdbd_pid = getpid();
        ctdb->ctdbd_pid = ctdbd_pid;
 
-
        DEBUG(DEBUG_ERR, ("Starting CTDBD as pid : %u\n", ctdbd_pid));
 
        if (ctdb->do_setsched) {
@@ -1128,6 +1105,12 @@ int ctdb_start_daemon(struct ctdb_context *ctdb, bool 
do_fork, bool use_syslog,
                exit(1);
        }
 
+       /* set up a handler to pick up sigchld */
+       if (ctdb_init_sigchld(ctdb) == NULL) {
+               DEBUG(DEBUG_CRIT,("Failed to set up signal handler for 
SIGCHLD\n"));
+               exit(1);
+       }
+
        ctdb_set_child_logging(ctdb);
 
        /* initialize statistics collection */
@@ -1203,16 +1186,6 @@ int ctdb_start_daemon(struct ctdb_context *ctdb, bool 
do_fork, bool use_syslog,
        /* start the transport going */
        ctdb_start_transport(ctdb);
 
-       /* set up a handler to pick up sigchld */
-       se = event_add_signal(ctdb->ev, ctdb,
-                                    SIGCHLD, 0,
-                                    sig_child_handler,
-                                    ctdb);
-       if (se == NULL) {
-               DEBUG(DEBUG_CRIT,("Failed to set up signal handler for 
SIGCHLD\n"));
-               exit(1);
-       }
-
        ret = ctdb_event_script_callback(ctdb,
                                         ctdb,
                                         ctdb_setup_event_callback,
diff --git a/server/ctdb_freeze.c b/server/ctdb_freeze.c
index 7233317..56dc19c 100644
--- a/server/ctdb_freeze.c
+++ b/server/ctdb_freeze.c
@@ -122,7 +122,7 @@ static int ctdb_freeze_handle_destructor(struct 
ctdb_freeze_handle *h)
        ctdb->freeze_mode[h->priority]    = CTDB_FREEZE_NONE;
        ctdb->freeze_handles[h->priority] = NULL;
 
-       kill(h->child, SIGKILL);
+       ctdb_kill(h->ctdb, h->child, SIGKILL);
        return 0;
 }
 
@@ -189,7 +189,7 @@ static struct ctdb_freeze_handle *ctdb_freeze_lock(struct 
ctdb_context *ctdb, ui
                return NULL;
        }
        
-       h->child = fork();
+       h->child = ctdb_fork(ctdb);
        if (h->child == -1) {
                DEBUG(DEBUG_ERR,("Failed to fork child for 
ctdb_freeze_lock\n"));
                talloc_free(h);
@@ -216,7 +216,7 @@ static struct ctdb_freeze_handle *ctdb_freeze_lock(struct 
ctdb_context *ctdb, ui
 
                while (1) {
                        sleep(1);
-                       if (kill(ctdb->ctdbd_pid, 0) != 0) {
+                       if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
                                DEBUG(DEBUG_ERR,("Parent died. Exiting lock 
wait child\n"));
 
                                _exit(0);
diff --git a/server/ctdb_lockwait.c b/server/ctdb_lockwait.c
index 35f5902..2339dc8 100644
--- a/server/ctdb_lockwait.c
+++ b/server/ctdb_lockwait.c
@@ -72,7 +72,7 @@ static void do_overflow(struct ctdb_db_context *ctdb_db,
 static int lockwait_destructor(struct lockwait_handle *h)
 {
        CTDB_DECREMENT_STAT(h->ctdb, pending_lockwait_calls);
-       kill(h->child, SIGKILL);
+       ctdb_kill(h->ctdb, h->child, SIGKILL);
        h->ctdb_db->pending_requests--;
        DLIST_REMOVE(h->ctdb_db->lockwait_active, h);
        return 0;
@@ -202,7 +202,7 @@ struct lockwait_handle *ctdb_lockwait(struct 
ctdb_db_context *ctdb_db,
                tdb_chainlock(ctdb_db->ltdb->tdb, key);
                write(result->fd[1], &c, 1);
                /* make sure we die when our parent dies */
-               while (kill(parent, 0) == 0 || errno != ESRCH) {
+               while (ctdb_kill(ctdb_db->ctdb, parent, 0) == 0 || errno != 
ESRCH) {
                        sleep(5);
                }
                _exit(0);
diff --git a/server/ctdb_recover.c b/server/ctdb_recover.c
index 937a2b8..15e9f03 100644
--- a/server/ctdb_recover.c
+++ b/server/ctdb_recover.c
@@ -626,7 +626,7 @@ static int set_recmode_destructor(struct 
ctdb_set_recmode_state *state)
        if (state->fd[1] != -1) {
                state->fd[1] = -1;
        }
-       kill(state->child, SIGKILL);
+       ctdb_kill(state->ctdb, state->child, SIGKILL);
        return 0;
 }
 
@@ -778,7 +778,7 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
                return -1;
        }
 
-       state->child = fork();
+       state->child = ctdb_fork(ctdb);
        if (state->child == (pid_t)-1) {
                close(state->fd[0]);
                close(state->fd[1]);
@@ -801,7 +801,7 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
 
                write(state->fd[1], &cc, 1);
                /* make sure we die when our parent dies */
-               while (kill(parent, 0) == 0 || errno != ESRCH) {
+               while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
                        sleep(5);
                        write(state->fd[1], &cc, 1);
                }
diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c
index d56fdb5..f739900 100644
--- a/server/ctdb_recoverd.c
+++ b/server/ctdb_recoverd.c
@@ -2949,7 +2949,7 @@ static int check_reclock_destructor(struct 
ctdb_check_reclock_state *state)
                close(state->fd[1]);
                state->fd[1] = -1;
        }
-       kill(state->child, SIGKILL);
+       ctdb_kill(ctdb, state->child, SIGKILL);
        return 0;
 }
 
@@ -3047,7 +3047,7 @@ static int check_recovery_lock(struct ctdb_context *ctdb)
 
                write(state->fd[1], &cc, 1);
                /* make sure we die when our parent dies */
-               while (kill(parent, 0) == 0 || errno != ESRCH) {
+               while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
                        sleep(5);
                        write(state->fd[1], &cc, 1);
                }
@@ -3166,7 +3166,7 @@ static void main_loop(struct ctdb_context *ctdb, struct 
ctdb_recoverd *rec,
 
 
        /* verify that the main daemon is still running */
-       if (kill(ctdb->ctdbd_pid, 0) != 0) {
+       if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
                DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting 
down recovery daemon\n"));
                exit(-1);
        }
@@ -3809,7 +3809,7 @@ static void ctdb_check_recd(struct event_context *ev, 
struct timed_event *te,
 {
        struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
 
-       if (kill(ctdb->recoverd_pid, 0) != 0) {
+       if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
                DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer 
running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
 
                event_add_timed(ctdb->ev, ctdb, timeval_zero(), 
@@ -3861,7 +3861,7 @@ int ctdb_start_recoverd(struct ctdb_context *ctdb)
 
        ctdb->ctdbd_pid = getpid();
 
-       ctdb->recoverd_pid = fork();
+       ctdb->recoverd_pid = ctdb_fork(ctdb);
        if (ctdb->recoverd_pid == -1) {
                return -1;
        }
@@ -3915,7 +3915,7 @@ void ctdb_stop_recoverd(struct ctdb_context *ctdb)
        }
 
        DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
-       kill(ctdb->recoverd_pid, SIGTERM);
+       ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
 }
 
 static void ctdb_restart_recd(struct event_context *ev, struct timed_event 
*te, 
diff --git a/server/ctdb_takeover.c b/server/ctdb_takeover.c
index f147446..ee6c723 100644
--- a/server/ctdb_takeover.c
+++ b/server/ctdb_takeover.c
@@ -741,7 +741,7 @@ static void release_kill_clients(struct ctdb_context *ctdb, 
ctdb_sock_addr *addr
                                        (unsigned)client->pid,
                                        ctdb_addr_to_str(addr),
                                        ip->client_id));
-                               kill(client->pid, SIGKILL);
+                               ctdb_kill(ctdb, client->pid, SIGKILL);
                        }
                }
        }
@@ -3675,7 +3675,7 @@ static int ctdb_reloadips_destructor(struct 
ctdb_reloadips_handle *h)
                ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, 
NULL);
                h->c = NULL;
        }
-       kill(h->child, SIGKILL);
+       ctdb_kill(h->ctdb, h->child, SIGKILL);
        return 0;
 }
 
@@ -3850,7 +3850,7 @@ int32_t ctdb_control_reload_public_ips(struct 
ctdb_context *ctdb, struct ctdb_re
 
                write(h->fd[1], &res, 1);
                /* make sure we die when our parent dies */
-               while (kill(parent, 0) == 0 || errno != ESRCH) {
+               while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
                        sleep(5);
                }
                _exit(0);
diff --git a/server/ctdb_traverse.c b/server/ctdb_traverse.c
index d739934..54ee70f 100644
--- a/server/ctdb_traverse.c
+++ b/server/ctdb_traverse.c
@@ -78,7 +78,7 @@ static void ctdb_traverse_local_handler(uint8_t *rawdata, 
size_t length, void *p
 static int traverse_local_destructor(struct ctdb_traverse_local_handle *h)
 {
        DLIST_REMOVE(h->ctdb_db->traverse, h);
-       kill(h->child, SIGKILL);
+       ctdb_kill(h->ctdb_db->ctdb, h->child, SIGKILL);
        return 0;
 }
 
diff --git a/server/ctdb_update_record.c b/server/ctdb_update_record.c
index 92b304c..1543b46 100644
--- a/server/ctdb_update_record.c
+++ b/server/ctdb_update_record.c
@@ -162,7 +162,7 @@ struct childwrite_handle {
 static int childwrite_destructor(struct childwrite_handle *h)
 {
        CTDB_DECREMENT_STAT(h->ctdb, pending_childwrite_calls);
-       kill(h->child, SIGKILL);
+       ctdb_kill(h->ctdb, h->child, SIGKILL);
        return 0;
 }
 
@@ -199,7 +199,7 @@ static void childwrite_handler(struct event_context *ev, 
struct fd_event *fde,
 
        callback(c, p);
 
-       kill(child, SIGKILL);


-- 
CTDB repository

Reply via email to