Do you happen to have a backtrace or anything from the logs on the matter?
On 03/06/2013 08:57 AM, Lennart Karlsson wrote:
> Hi,
>
> Today I upgraded SLURM from v 2.4.3 to v 2.5.3.
>
> That seems to have been a mistake, because slurmctld now crashes. Any
> ideas about what to do, other than downgrading back to 2.4.3?
>
> I believe I run a normal slurmctld -> (munge) -> slurmdbd -> MySQL
> setup, and it worked at the start: slurmdbd built some new tables
> and jobs survived, but slurmctld went down after a while and now
> refuses to stay up.
>
> An "strace -f" ends with:
> [pid 18169] close(11) = 0
> [pid 18169] rt_sigaction(SIGALRM, {SIG_DFL, [ALRM], SA_RESTORER,
> 0x35a060f500}, {SIG_DFL, [ALRM], SA_RESTORER, 0x35a060f500}, 8) = 0
> [pid 18169] rt_sigaction(SIGPIPE, {SIG_IGN, [PIPE], SA_RESTORER,
> 0x35a060f500}, {SIG_DFL, [], 0}, 8) = 0
> [pid 18169] fcntl(8, F_GETFL) = 0x2 (flags O_RDWR)
> [pid 18169] fcntl(8, F_GETFL) = 0x2 (flags O_RDWR)
> [pid 18169] fcntl(8, F_SETFL, O_RDWR|O_NONBLOCK) = 0
> [pid 18169] poll([{fd=8, events=POLLOUT}], 1, 60000) = 1 ([{fd=8,
> revents=POLLOUT}])
> [pid 18169] recvfrom(8, 0x2b0bdc3016a0, 1, 0, 0, 0) = -1 EAGAIN (Resource
> temporarily unavailable)
> [pid 18169] sendto(8, "\0\0\0\257", 4, 0, NULL, 0) = 4
> [pid 18169] fcntl(8, F_SETFL, O_RDWR) = 0
> [pid 18169] fcntl(8, F_GETFL) = 0x2 (flags O_RDWR)
> [pid 18169] fcntl(8, F_GETFL) = 0x2 (flags O_RDWR)
> [pid 18169] fcntl(8, F_SETFL, O_RDWR|O_NONBLOCK) = 0
> [pid 18169] poll([{fd=8, events=POLLOUT}], 1, 60000) = 1 ([{fd=8,
> revents=POLLOUT}])
> [pid 18169] recvfrom(8, 0x2b0bdc3016a0, 1, 0, 0, 0) = -1 EAGAIN (Resource
> temporarily unavailable)
> [pid 18169] sendto(8,
> "\31\0\0\0\37A\0\0\0\4\0\0\0\0\0\0\0\0\0\0\0\0\0\vauth/mun"..., 175, 0, NULL,
> 0) = 175
> [pid 18169] fcntl(8, F_SETFL, O_RDWR) = 0
> [pid 18169] rt_sigaction(SIGPIPE, {SIG_DFL, [PIPE], SA_RESTORER,
> 0x35a060f500}, {SIG_IGN, [PIPE], SA_RESTORER, 0x35a060f500}, 8) = 0
> [pid 18169] close(8) = 0
> [pid 18169] madvise(0x2b0bdc202000, 1028096, MADV_DONTNEED) = 0
> [pid 18169] _exit(0) = ?
> Process 18169 detached
> [pid 18164] <... select resumed> ) = 1 (in [4])
> [pid 18164] accept(4, {sa_family=AF_INET, sin_port=htons(51303),
> sin_addr=inet_addr("130.238.136.157")}, [16]) = 8
> [pid 18164] clone(Process 18172 attached (waiting for parent)
> Process 18172 resumed (parent 18164 ready)
> child_stack=0x2b0bdc301ff0,
> flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID,
> parent_tidptr=0x2b0bdc3029d0, tls=0x2b0bdc302700,
> child_tidptr=0x2b0bdc3029d0) = 18172
> [pid 18172] set_robust_list(0x2b0bdc3029e0, 0x18) = 0
> [pid 18172] fcntl(8, F_GETFL <unfinished ...>
> [pid 18164] select(5, [4], NULL, NULL, NULL <unfinished ...>
> [pid 18172] <... fcntl resumed> ) = 0x2 (flags O_RDWR)
> [pid 18172] fcntl(8, F_GETFL) = 0x2 (flags O_RDWR)
> [pid 18172] fcntl(8, F_SETFL, O_RDWR|O_NONBLOCK) = 0
> [pid 18172] poll([{fd=8, events=POLLIN}], 1, 60000 <unfinished ...>
> [pid 18168] <... poll resumed> ) = 1 ([{fd=7, revents=POLLIN}])
> [pid 18168] recvfrom(7, "\0\0\1\6", 4, 0, NULL, NULL) = 4
> [pid 18168] fcntl(7, F_SETFL, O_RDWR) = 0
> [pid 18168] fcntl(7, F_GETFL) = 0x2 (flags O_RDWR)
> [pid 18168] fcntl(7, F_GETFL) = 0x2 (flags O_RDWR)
> [pid 18168] fcntl(7, F_SETFL, O_RDWR|O_NONBLOCK) = 0
> [pid 18168] poll([{fd=7, events=POLLIN}], 1, 60000) = 1 ([{fd=7,
> revents=POLLIN}])
> [pid 18168] recvfrom(7,
> "\30\0\0\0\3\352\0\0\0[\0\0\0\0\0\0\0\0\0\0\0\0\0\vauth/mun"..., 262, 0,
> NULL, NULL) = 262
> [pid 18168] fcntl(7, F_SETFL, O_RDWR) = 0
> [pid 18168] stat("/var/run/munge/munge.socket.2", {st_mode=S_IFSOCK|0777,
> st_size=0, ...}) = 0
> [pid 18168] socket(PF_FILE, SOCK_STREAM, 0) = 11
> [pid 18168] fcntl(11, F_GETFL) = 0x2 (flags O_RDWR)
> [pid 18168] fcntl(11, F_SETFL, O_RDWR|O_NONBLOCK) = 0
> [pid 18168] connect(11, {sa_family=AF_FILE,
> path="/var/run/munge/munge.socket.2"}, 110) = 0
> [pid 18168] writev(11, [{"\0`mK\4\4\0\0\0\0\204", 11},
> {"\0\0\0\200MUNGE:AwQDAAA89DxchL99mxvooO"..., 132}], 2) = 143
> [pid 18168] read(11, 0x2b0bdc200bf0, 11) = -1 EAGAIN (Resource temporarily
> unavailable)
> [pid 18168] poll([{fd=11, events=POLLIN}], 1, 3000) = 1 ([{fd=11,
> revents=POLLIN|POLLHUP}])
> [pid 18170] <... poll resumed> ) = 1 ([{fd=9, revents=POLLIN}])
> [pid 18168] read(11, "\0`mK\4\5\0\0\0\0+", 11) = 11
> [pid 18168] read(11, <unfinished ...>
> [pid 18170] recvfrom(9, <unfinished ...>
> [pid 18168] <... read resumed>
> "\0\0\4\3\0\0\0\0\1,\4\202\356\210xQ7PEQ7PE\0\0\0\0\0\0\0\0\377"..., 43) = 43
> [pid 18168] close(11) = 0
> [pid 18168] --- SIGSEGV (Segmentation fault) @ 0 (0) ---
> Process 18168 detached
> [pid 18172] +++ killed by SIGSEGV (core dumped) +++
> [pid 18171] +++ killed by SIGSEGV (core dumped) +++
> [pid 18170] +++ killed by SIGSEGV (core dumped) +++
> [pid 18166] +++ killed by SIGSEGV (core dumped) +++
> [pid 18165] +++ killed by SIGSEGV (core dumped) +++
> [pid 18163] +++ killed by SIGSEGV (core dumped) +++
> [pid 18162] +++ killed by SIGSEGV (core dumped) +++
> [pid 18161] +++ killed by SIGSEGV (core dumped) +++
> [pid 18160] +++ killed by SIGSEGV (core dumped) +++
> [pid 18159] +++ killed by SIGSEGV (core dumped) +++
> [pid 18156] +++ killed by SIGSEGV (core dumped) +++
> [pid 18155] +++ killed by SIGSEGV (core dumped) +++
> [pid 18164] +++ killed by SIGSEGV (core dumped) +++
> +++ killed by SIGSEGV (core dumped) +++
>
>
> Log file /var/log/messages says:
> Mar 6 15:18:35 kalkyl2 kernel: slurmctld[18150]: segfault at 0 ip
> 000000000043c2eb sp 00002ab4d8200820 error 4 in slurmctld[400
>
>
> Best regards,
> -- Lennart Karlsson, UPPMAX, Uppsala University, Sweden