Have you upgraded the slurmdbd daemon first, as described here: http://schedmd.com/slurmdocs/quickstart_admin.html
*/David* On Wed, Mar 6, 2013 at 5:58 PM, Lennart Karlsson <[email protected]>wrote: > > Hi, > > Today I upgraded SLURM from v 2.4.3 to v 2.5.3. > > It seems like a mistake, because slurmctld crashes. Any ideas > about what to do, except downgrading back to 2.4.3? > > I think that I run a normal slurmctld -> (munge) -> slurmdbd -> MySQL > setup and it worked in the start: slurmdbd built some new tables > and jobs survived, but slurmctld went down after a while and now > refuses to keep living. > > An "strace -f" ends with: > [pid 18169] close(11) = 0 > [pid 18169] rt_sigaction(SIGALRM, {SIG_DFL, [ALRM], SA_RESTORER, > 0x35a060f500}, {SIG_DFL, [ALRM], SA_RESTORER, 0x35a060f500}, 8) = 0 > [pid 18169] rt_sigaction(SIGPIPE, {SIG_IGN, [PIPE], SA_RESTORER, > 0x35a060f500}, {SIG_DFL, [], 0}, 8) = 0 > [pid 18169] fcntl(8, F_GETFL) = 0x2 (flags O_RDWR) > [pid 18169] fcntl(8, F_GETFL) = 0x2 (flags O_RDWR) > [pid 18169] fcntl(8, F_SETFL, O_RDWR|O_NONBLOCK) = 0 > [pid 18169] poll([{fd=8, events=POLLOUT}], 1, 60000) = 1 ([{fd=8, > revents=POLLOUT}]) > [pid 18169] recvfrom(8, 0x2b0bdc3016a0, 1, 0, 0, 0) = -1 EAGAIN (Resource > temporarily unavailable) > [pid 18169] sendto(8, "\0\0\0\257", 4, 0, NULL, 0) = 4 > [pid 18169] fcntl(8, F_SETFL, O_RDWR) = 0 > [pid 18169] fcntl(8, F_GETFL) = 0x2 (flags O_RDWR) > [pid 18169] fcntl(8, F_GETFL) = 0x2 (flags O_RDWR) > [pid 18169] fcntl(8, F_SETFL, O_RDWR|O_NONBLOCK) = 0 > [pid 18169] poll([{fd=8, events=POLLOUT}], 1, 60000) = 1 ([{fd=8, > revents=POLLOUT}]) > [pid 18169] recvfrom(8, 0x2b0bdc3016a0, 1, 0, 0, 0) = -1 EAGAIN (Resource > temporarily unavailable) > [pid 18169] sendto(8, > "\31\0\0\0\37A\0\0\0\4\0\0\0\0\0\0\0\0\0\0\0\0\0\vauth/mun"..., 175, 0, > NULL, 0) = 175 > [pid 18169] fcntl(8, F_SETFL, O_RDWR) = 0 > [pid 18169] rt_sigaction(SIGPIPE, {SIG_DFL, [PIPE], SA_RESTORER, > 0x35a060f500}, {SIG_IGN, [PIPE], SA_RESTORER, 0x35a060f500}, 8) = 0 > [pid 18169] close(8) = 0 > [pid 18169] madvise(0x2b0bdc202000, 1028096, MADV_DONTNEED) = 0 > 
[pid 18169] _exit(0) = ? > Process 18169 detached > [pid 18164] <... select resumed> ) = 1 (in [4]) > [pid 18164] accept(4, {sa_family=AF_INET, sin_port=htons(51303), > sin_addr=inet_addr("130.238.136.157")}, [16]) = 8 > [pid 18164] clone(Process 18172 attached (waiting for parent) > Process 18172 resumed (parent 18164 ready) > child_stack=0x2b0bdc301ff0, > flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, > parent_tidptr=0x2b0bdc3029d0, tls=0x2b0bdc302700, > child_tidptr=0x2b0bdc3029d0) = 18172 > [pid 18172] set_robust_list(0x2b0bdc3029e0, 0x18) = 0 > [pid 18172] fcntl(8, F_GETFL <unfinished ...> > [pid 18164] select(5, [4], NULL, NULL, NULL <unfinished ...> > [pid 18172] <... fcntl resumed> ) = 0x2 (flags O_RDWR) > [pid 18172] fcntl(8, F_GETFL) = 0x2 (flags O_RDWR) > [pid 18172] fcntl(8, F_SETFL, O_RDWR|O_NONBLOCK) = 0 > [pid 18172] poll([{fd=8, events=POLLIN}], 1, 60000 <unfinished ...> > [pid 18168] <... 
poll resumed> ) = 1 ([{fd=7, revents=POLLIN}]) > [pid 18168] recvfrom(7, "\0\0\1\6", 4, 0, NULL, NULL) = 4 > [pid 18168] fcntl(7, F_SETFL, O_RDWR) = 0 > [pid 18168] fcntl(7, F_GETFL) = 0x2 (flags O_RDWR) > [pid 18168] fcntl(7, F_GETFL) = 0x2 (flags O_RDWR) > [pid 18168] fcntl(7, F_SETFL, O_RDWR|O_NONBLOCK) = 0 > [pid 18168] poll([{fd=7, events=POLLIN}], 1, 60000) = 1 ([{fd=7, > revents=POLLIN}]) > [pid 18168] recvfrom(7, > "\30\0\0\0\3\352\0\0\0[\0\0\0\0\0\0\0\0\0\0\0\0\0\vauth/mun"..., 262, 0, > NULL, NULL) = 262 > [pid 18168] fcntl(7, F_SETFL, O_RDWR) = 0 > [pid 18168] stat("/var/run/munge/munge.socket.2", {st_mode=S_IFSOCK|0777, > st_size=0, ...}) = 0 > [pid 18168] socket(PF_FILE, SOCK_STREAM, 0) = 11 > [pid 18168] fcntl(11, F_GETFL) = 0x2 (flags O_RDWR) > [pid 18168] fcntl(11, F_SETFL, O_RDWR|O_NONBLOCK) = 0 > [pid 18168] connect(11, {sa_family=AF_FILE, > path="/var/run/munge/munge.socket.2"}, 110) = 0 > [pid 18168] writev(11, [{"\0`mK\4\4\0\0\0\0\204", 11}, > {"\0\0\0\200MUNGE:AwQDAAA89DxchL99mxvooO"..., 132}], 2) = 143 > [pid 18168] read(11, 0x2b0bdc200bf0, 11) = -1 EAGAIN (Resource temporarily > unavailable) > [pid 18168] poll([{fd=11, events=POLLIN}], 1, 3000) = 1 ([{fd=11, > revents=POLLIN|POLLHUP}]) > [pid 18170] <... poll resumed> ) = 1 ([{fd=9, revents=POLLIN}]) > [pid 18168] read(11, "\0`mK\4\5\0\0\0\0+", 11) = 11 > [pid 18168] read(11, <unfinished ...> > [pid 18170] recvfrom(9, <unfinished ...> > [pid 18168] <... 
read resumed> > "\0\0\4\3\0\0\0\0\1,\4\202\356\210xQ7PEQ7PE\0\0\0\0\0\0\0\0\377"..., 43) = > 43 > [pid 18168] close(11) = 0 > [pid 18168] --- SIGSEGV (Segmentation fault) @ 0 (0) --- > Process 18168 detached > [pid 18172] +++ killed by SIGSEGV (core dumped) +++ > [pid 18171] +++ killed by SIGSEGV (core dumped) +++ > [pid 18170] +++ killed by SIGSEGV (core dumped) +++ > [pid 18166] +++ killed by SIGSEGV (core dumped) +++ > [pid 18165] +++ killed by SIGSEGV (core dumped) +++ > [pid 18163] +++ killed by SIGSEGV (core dumped) +++ > [pid 18162] +++ killed by SIGSEGV (core dumped) +++ > [pid 18161] +++ killed by SIGSEGV (core dumped) +++ > [pid 18160] +++ killed by SIGSEGV (core dumped) +++ > [pid 18159] +++ killed by SIGSEGV (core dumped) +++ > [pid 18156] +++ killed by SIGSEGV (core dumped) +++ > [pid 18155] +++ killed by SIGSEGV (core dumped) +++ > [pid 18164] +++ killed by SIGSEGV (core dumped) +++ > +++ killed by SIGSEGV (core dumped) +++ > > > Log file /var/log/messages says: > Mar 6 15:18:35 kalkyl2 kernel: slurmctld[18150]: segfault at 0 ip > 000000000043c2eb sp 00002ab4d8200820 error 4 in slurmctld[400 > > > Best regards, > -- Lennart Karlsson, UPPMAX, Uppsala University, Sweden >
