Dear gridengine users,
we have been using Open Grid Engine for some time now on our Linux machines, 
which were running SUSE Linux Enterprise Server 11.1. I recently updated some 
of them to SLES 11.3 (some were just patched, and some had a fresh install) and 
since then, gridengine has shown some faulty behavior:

The first job submitted to an execution host runs and finishes correctly, but 
if one submits a second one, the host switches into an error state instantly, 
leaving the second job as 'qw'. It seems as if there is a very small time-window 
(< ~1 s), where a second job can be submitted after the first one but anything 
later, independent of whether the first one is still running or not, results in 
the error.
To be able to run the next job, one has to stop and start /etc/init.d/sgeexecd.

The messages file in the spool directory says:
___________________________________________________________________________________________________
04/17/2014 11:40:15|  main|host-4|I|controlled shutdown 2011.11
04/17/2014 11:40:22|  main|host-4|I|starting up OGS/GE 2011.11 (linux-x64)
04/17/2014 11:41:28|  main|host-4|E|shepherd of job 4758.1 died through signal 
= 11
04/17/2014 11:41:28|  main|host-4|E|abnormal termination of shepherd for job 
4758.1: no "exit_status" file
04/17/2014 11:41:28|  main|host-4|E|can't open file active_jobs/4758.1/error: 
Datei oder Verzeichnis nicht gefunden
04/17/2014 11:41:28|  main|host-4|E|can't open pid file 
"active_jobs/4758.1/pid" for job 4758.1
___________________________________________________________________________________________________

Here, 4758 is the second job. The signal is mostly 11, sometimes 6; I don't 
know how to influence this. I used strace on the execd to maybe get a clue. 
The output for the newly started process, which is invoked for the second job, 
contained this:
___________________________________________________________________________________________________
set_robust_list(0x7f0c9366c9e0, 0x18) = 0
getsockname(3, {sa_family=AF_INET, sin_port=htons(60960), 
sin_addr=inet_addr("1.2.3.4")}, [16]) = 0
getpeername(3, {sa_family=AF_INET, sin_port=htons(389), 
sin_addr=inet_addr("5.6.7.8")}, [16]) = 0
fcntl(3, F_GETFD) = 0x1 (flags FD_CLOEXEC)
dup(3)                                  = 7
fcntl(7, F_SETFD, FD_CLOEXEC) = 0
socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 8
fcntl(8, F_GETFD) = 0
dup2(8, 3) = 3
fcntl(3, F_SETFD, 0) = 0
close(8)                                = 0
--- SIGSEGV (Segmentation fault) @ 0 (0) ---
___________________________________________________________________________________________________

Since there is a segmentation fault, I thought that maybe some libraries had 
changed in the new SUSE version, so I compiled gridengine on one of the new 
machines. Since the machines are only used as execution hosts and I probably 
don't need everything, I used ./aimk -only-core -no-jni -no-java. With some 
tinkering the build finally worked up to and including the creation of the 
local distribution. But the install_execd script complained that qmake, qtcsh, 
rlogin, rsh and rshd were missing. So I just copied all the other binaries, 
libraries and files to a host with an old gridengine version installed. 
Unfortunately this didn't solve the problem.
The output of strace of the new process now looks a bit different:
___________________________________________________________________________________________________
set_robust_list(0x7ffe451639e0, 0x18) = 0
getsockname(3, {sa_family=AF_INET, sin_port=htons(51974), 
sin_addr=inet_addr("1.2.3.5")}, [16]) = 0
getpeername(3, {sa_family=AF_INET, sin_port=htons(389), 
sin_addr=inet_addr("5.6.7.8")}, [16]) = 0
fcntl(3, F_GETFD)                  = 0x1 (flags FD_CLOEXEC)
dup(3) = 7
fcntl(7, F_SETFD, FD_CLOEXEC) = 0
socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 8
fcntl(8, F_GETFD)                                = 0
dup2(8, 3) = 3
fcntl(3, F_SETFD, 0)                    = 0
close(8) = 0
open("/dev/tty", O_RDWR|O_NOCTTY|O_NONBLOCK)                                = 
-1 ENXIO (No such device or address)
writev(2,
 [{"*** glibc detected *** ", 23}, {"/opt/sge/bin/linux-x64/sge_execd", 
32}, {": ", 2}, {"free(): invalid pointer", 23}, {": 0x", 4}, 
{"00007ffe44425188", 16}, {" ***\n", 5}], 
7)                                = 105
open("/opt/sge/bin/linux-x64/../../lib/linux-x64/libgcc_s.so.1", O_RDONLY) = -1 
ENOENT (No such file or directory)
open("/opt/sge/lib/linux-x64/libgcc_s.so.1", O_RDONLY) = -1 ENOENT (No such 
file or directory)
open("/etc/ld.so.cache", O_RDONLY) = 8
fstat(8, {st_mode=S_IFREG|0644, st_size=50062, ...}) = 0
mmap(NULL, 50062, PROT_READ, MAP_PRIVATE, 8, 0) = 0x7ffe45136000
close(8) = 0
open("/lib64/libgcc_s.so.1", O_RDONLY)          = 8
read(8, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\200.\0\0\0\0\0\0"..., 
832) = 832
fstat(8, {st_mode=S_IFREG|0755, st_size=88552, ...}) = 0
mmap(NULL, 2184216, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 8, 0) = 
0x7ffe3fe06000
fadvise64(8, 0, 2184216, POSIX_FADV_WILLNEED) = 0
mprotect(0x7ffe3fe1b000, 2093056, PROT_NONE) = 0
mmap(0x7ffe4001a000,
 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 8, 
0x14000)                         = 0x7ffe4001a000
close(8)                                = 0
mprotect(0x7ffe4001a000, 4096, PROT_READ) = 0
munmap(0x7ffe45136000, 50062)           = 0
futex(0x7ffe448ba610, FUTEX_WAKE_PRIVATE, 2147483647) = 0
futex(0x7ffe4001b1a4, FUTEX_WAKE_PRIVATE, 2147483647)          = 0
write(2, "======= Backtrace: =========\n", 29)                                = 
29
writev(2,
 [{"/lib64/libc.so.6", 16}, {"(", 1}, {"+0x", 3}, {"76618", 5}, {")", 
1}, {"[0x", 3}, {"7ffe445ba618", 12}, {"]\n", 2}], 8)        = 43
writev(2,
 [{"/usr/lib64/libldap-2.4.so.2", 27}, {"(", 1}, {"ldap_free_urldesc", 
17}, {"+0x", 3}, {"19", 2}, {")", 1}, {"[0x", 3}, {"7ffe435ae449", 12}, 
{"]\n", 2}], 9)                              = 68
writev(2, 
[{"/usr/lib64/libldap-2.4.so.2", 27}, {"(", 1}, {"ldap_free_urllist", 
17}, {"+0x", 3}, {"18", 2}, {")", 1}, {"[0x", 3}, {"7ffe435ae4c8", 12}, 
{"]\n", 2}], 9) = 68
writev(2, [{"/usr/lib64/libldap-2.4.so.2", 27}, 
{"(", 1}, {"ldap_free_connection", 20}, {"+0x", 3}, {"132", 3}, {")", 
1}, {"[0x", 3}, {"7ffe435aaed2", 12}, {"]\n", 2}], 
9)                               = 72
writev(2, 
[{"/usr/lib64/libldap-2.4.so.2", 27}, {"(", 1}, {"ldap_ld_free", 12}, 
{"+0x", 3}, {"b7", 2}, {")", 1}, {"[0x", 3}, {"7ffe435a1d77", 12}, 
{"]\n", 2}], 9) = 63
writev(2, [{"/lib64/libnss_ldap.so.2", 23}, 
{"(", 1}, {"+0x", 3}, {"4047", 4}, {")", 1}, {"[0x", 3}, 
{"7ffe437d5047", 12}, {"]\n", 2}], 8)                                = 
49
writev(2, [{"/lib64/libnss_ldap.so.2", 23}, {"(", 1}, {"+0x", 3}, 
{"7ad5", 4}, {")", 1}, {"[0x", 3}, {"7ffe437d8ad5", 12}, {"]\n", 2}], 
8)            = 49
writev(2, [{"/lib64/libc.so.6", 16}, {"(", 1}, 
{"__libc_fork", 11}, {"+0x", 3}, {"1df", 3}, {")", 1}, {"[0x", 3}, 
{"7ffe445ec87f", 12}, {"]\n", 2}], 9)                  = 52
writev(2,
 [{"/opt/sge/bin/linux-x64/sge_execd", 32}, {"(", 1}, {"sge_exec_job", 
12}, {"+0x", 3}, {"5a05", 4}, {")", 1}, {"[0x", 3}, {"4332f5", 6}, 
{"]\n", 2}], 9) = 64
writev(2, [{"/opt/sge/bin/linux-x64/sge_execd", 32}, {"[0x", 3}, {"435293", 6}, 
{"]\n", 2}], 4) = 43
writev(2,
 [{"/opt/sge/bin/linux-x64/sge_execd", 32}, {"(", 1}, {"do_ck_to_do", 
11}, {"+0x", 3}, {"286", 3}, {")", 1}, {"[0x", 3}, {"435906", 6}, 
{"]\n", 2}], 9) = 62
writev(2, [{"/opt/sge/bin/linux-x64/sge_execd", 
32}, {"(", 1}, {"sge_execd_process_messages", 26}, {"+0x", 3}, {"43c", 
3}, {")", 1}, {"[0x", 3}, {"42cd7c", 6}, {"]\n", 2}], 9) = 77
writev(2,
 [{"/opt/sge/bin/linux-x64/sge_execd", 32}, {"(", 1}, {"main", 4}, 
{"+0x", 3}, {"b14", 3}, {")", 1}, {"[0x", 3}, {"429ed4", 6}, {"]\n", 
2}], 9) = 55
writev(2, [{"/lib64/libc.so.6", 16}, {"(", 1}, 
{"__libc_start_main", 17}, {"+0x", 3}, {"e6", 2}, {")", 1}, {"[0x", 3}, 
{"7ffe44562c36", 12}, {"]\n", 2}], 9) = 57
writev(2, 
[{"/opt/sge/bin/linux-x64/sge_execd", 32}, {"(", 1}, {"setlocale", 9}, 
{"+0x", 3}, {"1f9", 3}, {")", 1}, {"[0x", 3}, {"428cd9", 6}, {"]\n", 
2}], 9) = 60
write(2, "======= Memory map: ========\n", 29)                                = 
29
open("/proc/self/maps", O_RDONLY)                = 8
read(8, "00400000-005b1000 r-xp 00000000 "..., 1024) = 1024
write(2, "00400000-005b1000 r-xp 00000000 "..., 1024) = 1024
read(8, "           /lib64/libz.so.1.2.7\n"..., 1024) = 1024
write(2, "           /lib64/libz.so.1.2.7\n"..., 1024)                          
      = 1024
read(8, "0.1\n7ffe41e40000-7ffe41e41000 rw"..., 1024) = 1024
write(2, "0.1\n7ffe41e40000-7ffe41e41000 rw"..., 1024) = 1024
read(8, "03:01 2611268                   "..., 1024) = 1024
write(2, "03:01 2611268                   "..., 1024) = 1024
read(8, "000 r--p 00014000 103:01 1733331"..., 1024) = 1024
write(2, "000 r--p 00014000 103:01 1733331"..., 1024) = 1024
read(8, "m_err.so.2.1\n7ffe42ebc000-7ffe42"..., 1024) = 1024
write(2, "m_err.so.2.1\n7ffe42ebc000-7ffe42"..., 1024)        = 1024
read(8, ".2.7.1\n7ffe43586000-7ffe43587000"..., 1024) = 1024
write(2, ".2.7.1\n7ffe43586000-7ffe43587000"..., 1024) = 1024
read(8, "\n7ffe439e6000-7ffe439f2000 rw-p "..., 1024) = 1024
write(2, "\n7ffe439e6000-7ffe439f2000 rw-p "..., 1024) = 1024
read(8, "0000 00:00 0 \n7ffe448bc000-7ffe4"..., 1024) = 1024
write(2, "0000 00:00 0 \n7ffe448bc000-7ffe4"..., 1024)                  = 1024
read(8, "ibdl-2.11.3.so\n7ffe44f54000-7ffe"..., 1024) = 1024
write(2, "ibdl-2.11.3.so\n7ffe44f54000-7ffe"..., 1024)                          
      = 1024
read(8, " 00:00 0                        "..., 1024) = 206
write(2, " 00:00 0                        "..., 206)        = 206
read(8, "", 1024)                      = 0
close(8) = 0
rt_sigprocmask(SIG_UNBLOCK, [ABRT], NULL, 8)                        = 0
tgkill(13089, 13089, SIGABRT) = 0
--- SIGABRT (Aborted) @ 0 (0) ---
___________________________________________________________________________________________________

I can't interpret this well enough to know what went wrong (or whether the 
cause is visible in it in the first place). The strace output of the five 
execd processes running constantly in the background is too long to post here. 
The one probably managing the job didn't look very different when comparing a 
working and a non-working submission (first and second job); the first real 
difference is the last two lines of this excerpt, the rest is only slightly 
differing numbers etc.:
___________________________________________________________________________________________________
[...]
stat("/proc/6/status", {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
open("/proc/6/status", O_RDONLY) = 8
fstat(8, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x7ffe45169000
read(8, "Name:\tmigration/0\nState:\tS (slee"..., 1024) = 799
close(8) = 0
munmap(0x7ffe45169000, 4096)                               = 0
close(8) = -1 EBADF (Bad file descriptor)
stat("/proc/7/stat", {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
open("/proc/7/stat", O_RDONLY)          = 8
read(8, "7 (watchdog/0) S 2 0 0 0 -1 2216"..., 1023) = 164
close(8) = 0
stat("/proc/7/status", {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
open("/proc/7/status", O_RDONLY) = 8
fstat(8, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x7ffe45169000
read(8, "Name:\twatchdog/0\nState:\tS (sleep"..., 1024) = 800
close(8)                                = 0
munmap(0x7ffe45169000, 4096)            = 0
close(8)                                = -1 EBADF (Bad file descriptor)
stat("/proc/8/stat", {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
open("/proc/8/stat", O_RDONLY)          = 8
read(8, "8 (migration/1) S 2 0 0 0 -1 221"..., 1023) = 164
close(8)                                = 0
--- SIGCHLD (Child exited) @ 0 (0) ---
rt_sigreturn(0x11)                      = 0
___________________________________________________________________________________________________

I'd be happy about any suggestion on how to solve this, or just where I could 
continue searching for the root of the problem.
Thanks, Sven
                                                                                
  
_______________________________________________
users mailing list
[email protected]
https://gridengine.org/mailman/listinfo/users

Reply via email to