Dear gridengine users,
we have been using Open Grid Engine for some time now on our Linux machines,
which were running SUSE Linux Enterprise Server 11.1. I recently updated some
of them to SLES 11.3 (some were just patched, and some had a fresh install) and
since then, gridengine has some faulty behavior:
The first job submitted to an execution host runs and finishes correctly, but
if one submits a second one, the host switches into an error state instantly,
leaving the second job as 'qw'. It seems as if there is a very small time-window
(< ~1 s), where a second job can be submitted after the first one but anything
later, independent of whether the first one is still running or not, results in
the error.
To be able to run the next job, one has to stop and start /etc/init.d/sgeexecd.
The messages file in the spool directory says:
___________________________________________________________________________________________________
04/17/2014 11:40:15| main|host-4|I|controlled shutdown 2011.11
04/17/2014 11:40:22| main|host-4|I|starting up OGS/GE 2011.11 (linux-x64)
04/17/2014 11:41:28| main|host-4|E|shepherd of job 4758.1 died through signal
= 11
04/17/2014 11:41:28| main|host-4|E|abnormal termination of shepherd for job
4758.1: no "exit_status" file
04/17/2014 11:41:28| main|host-4|E|can't open file active_jobs/4758.1/error:
Datei oder Verzeichnis nicht gefunden
04/17/2014 11:41:28| main|host-4|E|can't open pid file
"active_jobs/4758.1/pid" for job 4758.1
___________________________________________________________________________________________________
Here, 4758 is the second job. The signal is mostly 11, sometimes 6; I don't
know how to influence this. I used strace on the execd to maybe get a clue.
The output for the newly started process, which is invoked for the second job,
contained this:
___________________________________________________________________________________________________
set_robust_list(0x7f0c9366c9e0, 0x18) = 0
getsockname(3, {sa_family=AF_INET, sin_port=htons(60960),
sin_addr=inet_addr("1.2.3.4")}, [16]) = 0
getpeername(3, {sa_family=AF_INET, sin_port=htons(389),
sin_addr=inet_addr("5.6.7.8")}, [16]) = 0
fcntl(3, F_GETFD) = 0x1 (flags FD_CLOEXEC)
dup(3) = 7
fcntl(7, F_SETFD, FD_CLOEXEC) = 0
socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 8
fcntl(8, F_GETFD) = 0
dup2(8, 3) = 3
fcntl(3, F_SETFD, 0) = 0
close(8) = 0
--- SIGSEGV (Segmentation fault) @ 0 (0) ---
___________________________________________________________________________________________________
Since there is a segmentation fault, I thought that maybe some libraries had
changed in the new SUSE version, so I compiled gridengine on one of the new
machines. Since the machines are only used as execution hosts and I probably
don't need everything, I used ./aimk -only-core -no-jni -no-java. With some
tinkering it finally worked, up to and including the creation of the local
distribution.
But the install_execd-script complained that qmake, qtcsh, rlogin, rsh and rshd
are missing. So I just copied all the other binaries, libraries and files to a
host with an old gridengine version installed. Unfortunately this didn't solve
the problem.
The output of strace of the new process now looks a bit different:
___________________________________________________________________________________________________
set_robust_list(0x7ffe451639e0, 0x18) = 0
getsockname(3, {sa_family=AF_INET, sin_port=htons(51974),
sin_addr=inet_addr("1.2.3.5")}, [16]) = 0
getpeername(3, {sa_family=AF_INET, sin_port=htons(389),
sin_addr=inet_addr("5.6.7.8")}, [16]) = 0
fcntl(3, F_GETFD) = 0x1 (flags FD_CLOEXEC)
dup(3) = 7
fcntl(7, F_SETFD, FD_CLOEXEC) = 0
socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 8
fcntl(8, F_GETFD) = 0
dup2(8, 3) = 3
fcntl(3, F_SETFD, 0) = 0
close(8) = 0
open("/dev/tty", O_RDWR|O_NOCTTY|O_NONBLOCK) =
-1 ENXIO (No such device or address)
writev(2,
[{"*** glibc detected *** ", 23}, {"/opt/sge/bin/linux-x64/sge_execd",
32}, {": ", 2}, {"free(): invalid pointer", 23}, {": 0x", 4},
{"00007ffe44425188", 16}, {" ***\n", 5}],
7) = 105
open("/opt/sge/bin/linux-x64/../../lib/linux-x64/libgcc_s.so.1", O_RDONLY) = -1
ENOENT (No such file or directory)
open("/opt/sge/lib/linux-x64/libgcc_s.so.1", O_RDONLY) = -1 ENOENT (No such
file or directory)
open("/etc/ld.so.cache", O_RDONLY) = 8
fstat(8, {st_mode=S_IFREG|0644, st_size=50062, ...}) = 0
mmap(NULL, 50062, PROT_READ, MAP_PRIVATE, 8, 0) = 0x7ffe45136000
close(8) = 0
open("/lib64/libgcc_s.so.1", O_RDONLY) = 8
read(8, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\200.\0\0\0\0\0\0"...,
832) = 832
fstat(8, {st_mode=S_IFREG|0755, st_size=88552, ...}) = 0
mmap(NULL, 2184216, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 8, 0) =
0x7ffe3fe06000
fadvise64(8, 0, 2184216, POSIX_FADV_WILLNEED) = 0
mprotect(0x7ffe3fe1b000, 2093056, PROT_NONE) = 0
mmap(0x7ffe4001a000,
8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 8,
0x14000) = 0x7ffe4001a000
close(8) = 0
mprotect(0x7ffe4001a000, 4096, PROT_READ) = 0
munmap(0x7ffe45136000, 50062) = 0
futex(0x7ffe448ba610, FUTEX_WAKE_PRIVATE, 2147483647) = 0
futex(0x7ffe4001b1a4, FUTEX_WAKE_PRIVATE, 2147483647) = 0
write(2, "======= Backtrace: =========\n", 29) =
29
writev(2,
[{"/lib64/libc.so.6", 16}, {"(", 1}, {"+0x", 3}, {"76618", 5}, {")",
1}, {"[0x", 3}, {"7ffe445ba618", 12}, {"]\n", 2}], 8) = 43
writev(2,
[{"/usr/lib64/libldap-2.4.so.2", 27}, {"(", 1}, {"ldap_free_urldesc",
17}, {"+0x", 3}, {"19", 2}, {")", 1}, {"[0x", 3}, {"7ffe435ae449", 12},
{"]\n", 2}], 9) = 68
writev(2,
[{"/usr/lib64/libldap-2.4.so.2", 27}, {"(", 1}, {"ldap_free_urllist",
17}, {"+0x", 3}, {"18", 2}, {")", 1}, {"[0x", 3}, {"7ffe435ae4c8", 12},
{"]\n", 2}], 9) = 68
writev(2, [{"/usr/lib64/libldap-2.4.so.2", 27},
{"(", 1}, {"ldap_free_connection", 20}, {"+0x", 3}, {"132", 3}, {")",
1}, {"[0x", 3}, {"7ffe435aaed2", 12}, {"]\n", 2}],
9) = 72
writev(2,
[{"/usr/lib64/libldap-2.4.so.2", 27}, {"(", 1}, {"ldap_ld_free", 12},
{"+0x", 3}, {"b7", 2}, {")", 1}, {"[0x", 3}, {"7ffe435a1d77", 12},
{"]\n", 2}], 9) = 63
writev(2, [{"/lib64/libnss_ldap.so.2", 23},
{"(", 1}, {"+0x", 3}, {"4047", 4}, {")", 1}, {"[0x", 3},
{"7ffe437d5047", 12}, {"]\n", 2}], 8) =
49
writev(2, [{"/lib64/libnss_ldap.so.2", 23}, {"(", 1}, {"+0x", 3},
{"7ad5", 4}, {")", 1}, {"[0x", 3}, {"7ffe437d8ad5", 12}, {"]\n", 2}],
8) = 49
writev(2, [{"/lib64/libc.so.6", 16}, {"(", 1},
{"__libc_fork", 11}, {"+0x", 3}, {"1df", 3}, {")", 1}, {"[0x", 3},
{"7ffe445ec87f", 12}, {"]\n", 2}], 9) = 52
writev(2,
[{"/opt/sge/bin/linux-x64/sge_execd", 32}, {"(", 1}, {"sge_exec_job",
12}, {"+0x", 3}, {"5a05", 4}, {")", 1}, {"[0x", 3}, {"4332f5", 6},
{"]\n", 2}], 9) = 64
writev(2, [{"/opt/sge/bin/linux-x64/sge_execd", 32}, {"[0x", 3}, {"435293", 6},
{"]\n", 2}], 4) = 43
writev(2,
[{"/opt/sge/bin/linux-x64/sge_execd", 32}, {"(", 1}, {"do_ck_to_do",
11}, {"+0x", 3}, {"286", 3}, {")", 1}, {"[0x", 3}, {"435906", 6},
{"]\n", 2}], 9) = 62
writev(2, [{"/opt/sge/bin/linux-x64/sge_execd",
32}, {"(", 1}, {"sge_execd_process_messages", 26}, {"+0x", 3}, {"43c",
3}, {")", 1}, {"[0x", 3}, {"42cd7c", 6}, {"]\n", 2}], 9) = 77
writev(2,
[{"/opt/sge/bin/linux-x64/sge_execd", 32}, {"(", 1}, {"main", 4},
{"+0x", 3}, {"b14", 3}, {")", 1}, {"[0x", 3}, {"429ed4", 6}, {"]\n",
2}], 9) = 55
writev(2, [{"/lib64/libc.so.6", 16}, {"(", 1},
{"__libc_start_main", 17}, {"+0x", 3}, {"e6", 2}, {")", 1}, {"[0x", 3},
{"7ffe44562c36", 12}, {"]\n", 2}], 9) = 57
writev(2,
[{"/opt/sge/bin/linux-x64/sge_execd", 32}, {"(", 1}, {"setlocale", 9},
{"+0x", 3}, {"1f9", 3}, {")", 1}, {"[0x", 3}, {"428cd9", 6}, {"]\n",
2}], 9) = 60
write(2, "======= Memory map: ========\n", 29) =
29
open("/proc/self/maps", O_RDONLY) = 8
read(8, "00400000-005b1000 r-xp 00000000 "..., 1024) = 1024
write(2, "00400000-005b1000 r-xp 00000000 "..., 1024) = 1024
read(8, " /lib64/libz.so.1.2.7\n"..., 1024) = 1024
write(2, " /lib64/libz.so.1.2.7\n"..., 1024)
= 1024
read(8, "0.1\n7ffe41e40000-7ffe41e41000 rw"..., 1024) = 1024
write(2, "0.1\n7ffe41e40000-7ffe41e41000 rw"..., 1024) = 1024
read(8, "03:01 2611268 "..., 1024) = 1024
write(2, "03:01 2611268 "..., 1024) = 1024
read(8, "000 r--p 00014000 103:01 1733331"..., 1024) = 1024
write(2, "000 r--p 00014000 103:01 1733331"..., 1024) = 1024
read(8, "m_err.so.2.1\n7ffe42ebc000-7ffe42"..., 1024) = 1024
write(2, "m_err.so.2.1\n7ffe42ebc000-7ffe42"..., 1024) = 1024
read(8, ".2.7.1\n7ffe43586000-7ffe43587000"..., 1024) = 1024
write(2, ".2.7.1\n7ffe43586000-7ffe43587000"..., 1024) = 1024
read(8, "\n7ffe439e6000-7ffe439f2000 rw-p "..., 1024) = 1024
write(2, "\n7ffe439e6000-7ffe439f2000 rw-p "..., 1024) = 1024
read(8, "0000 00:00 0 \n7ffe448bc000-7ffe4"..., 1024) = 1024
write(2, "0000 00:00 0 \n7ffe448bc000-7ffe4"..., 1024) = 1024
read(8, "ibdl-2.11.3.so\n7ffe44f54000-7ffe"..., 1024) = 1024
write(2, "ibdl-2.11.3.so\n7ffe44f54000-7ffe"..., 1024)
= 1024
read(8, " 00:00 0 "..., 1024) = 206
write(2, " 00:00 0 "..., 206) = 206
read(8, "", 1024) = 0
close(8) = 0
rt_sigprocmask(SIG_UNBLOCK, [ABRT], NULL, 8) = 0
tgkill(13089, 13089, SIGABRT) = 0
--- SIGABRT (Aborted) @ 0 (0) ---
___________________________________________________________________________________________________
I can't interpret this well enough to know what went wrong (if the cause is
visible in it in the first place). The strace output of the five execd processes
running constantly in the background is too long to include here. The one
probably managing the job didn't look very different when comparing a working
and a non-working submission (first and second job); the first real difference
is the last two lines of this excerpt, the rest is only slightly differing
numbers etc.:
___________________________________________________________________________________________________
[...]
stat("/proc/6/status", {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
open("/proc/6/status", O_RDONLY) = 8
fstat(8, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) =
0x7ffe45169000
read(8, "Name:\tmigration/0\nState:\tS (slee"..., 1024) = 799
close(8) = 0
munmap(0x7ffe45169000, 4096) = 0
close(8) = -1 EBADF (Bad file descriptor)
stat("/proc/7/stat", {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
open("/proc/7/stat", O_RDONLY) = 8
read(8, "7 (watchdog/0) S 2 0 0 0 -1 2216"..., 1023) = 164
close(8) = 0
stat("/proc/7/status", {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
open("/proc/7/status", O_RDONLY) = 8
fstat(8, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) =
0x7ffe45169000
read(8, "Name:\twatchdog/0\nState:\tS (sleep"..., 1024) = 800
close(8) = 0
munmap(0x7ffe45169000, 4096) = 0
close(8) = -1 EBADF (Bad file descriptor)
stat("/proc/8/stat", {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
open("/proc/8/stat", O_RDONLY) = 8
read(8, "8 (migration/1) S 2 0 0 0 -1 221"..., 1023) = 164
close(8) = 0
--- SIGCHLD (Child exited) @ 0 (0) ---
rt_sigreturn(0x11) = 0
___________________________________________________________________________________________________
I'd be happy about any suggestion on how to solve this, or just where I could
continue searching for the root of the problem.
Thanks, Sven
_______________________________________________
users mailing list
[email protected]
https://gridengine.org/mailman/listinfo/users