On Jul 8, 2011, at 5:52 AM, Reuti wrote:
>
> can you please check on the node what is generating this high load?
>
> $ ps -e f
PID TTY STAT TIME COMMAND
1 ? Ss 0:01 init [3]
2 ? S< 0:00 [migration/0]
3 ? SN 0:00 [ksoftirqd/0]
4 ? S< 0:00 [watchdog/0]
5 ? S< 0:00 [migration/1]
6 ? SN 0:00 [ksoftirqd/1]
7 ? S< 0:00 [watchdog/1]
8 ? S< 0:00 [migration/2]
9 ? SN 0:00 [ksoftirqd/2]
10 ? S< 0:00 [watchdog/2]
11 ? S< 0:00 [migration/3]
12 ? SN 0:00 [ksoftirqd/3]
13 ? S< 0:00 [watchdog/3]
14 ? S< 0:00 [migration/4]
15 ? SN 0:00 [ksoftirqd/4]
16 ? S< 0:00 [watchdog/4]
17 ? S< 0:00 [migration/5]
18 ? SN 0:00 [ksoftirqd/5]
19 ? S< 0:00 [watchdog/5]
20 ? S< 0:00 [migration/6]
21 ? SN 0:00 [ksoftirqd/6]
22 ? S< 0:00 [watchdog/6]
23 ? S< 0:00 [migration/7]
24 ? SN 0:00 [ksoftirqd/7]
25 ? S< 0:00 [watchdog/7]
26 ? S< 0:00 [migration/8]
27 ? SN 0:00 [ksoftirqd/8]
28 ? S< 0:00 [watchdog/8]
29 ? S< 0:00 [migration/9]
30 ? SN 0:00 [ksoftirqd/9]
31 ? S< 0:00 [watchdog/9]
32 ? S< 0:00 [migration/10]
33 ? SN 0:00 [ksoftirqd/10]
34 ? S< 0:00 [watchdog/10]
35 ? S< 0:00 [migration/11]
36 ? SN 0:00 [ksoftirqd/11]
37 ? S< 0:00 [watchdog/11]
38 ? S< 0:00 [events/0]
39 ? S< 0:00 [events/1]
40 ? S< 0:00 [events/2]
41 ? S< 0:00 [events/3]
42 ? S< 0:00 [events/4]
43 ? S< 0:00 [events/5]
44 ? S< 0:00 [events/6]
45 ? S< 0:00 [events/7]
46 ? S< 0:00 [events/8]
47 ? S< 0:00 [events/9]
48 ? S< 0:00 [events/10]
49 ? S< 0:00 [events/11]
50 ? S< 0:00 [khelper]
443 ? S< 0:00 [kthread]
459 ? S< 0:00 \_ [kblockd/0]
460 ? S< 0:00 \_ [kblockd/1]
461 ? S< 0:00 \_ [kblockd/2]
462 ? S< 0:00 \_ [kblockd/3]
463 ? S< 0:00 \_ [kblockd/4]
464 ? S< 0:00 \_ [kblockd/5]
465 ? S< 0:00 \_ [kblockd/6]
466 ? S< 0:00 \_ [kblockd/7]
467 ? S< 0:00 \_ [kblockd/8]
468 ? S< 0:00 \_ [kblockd/9]
469 ? S< 0:00 \_ [kblockd/10]
470 ? S< 0:00 \_ [kblockd/11]
471 ? S< 0:00 \_ [kacpid]
633 ? S< 0:00 \_ [cqueue/0]
634 ? S< 0:00 \_ [cqueue/1]
635 ? S< 0:00 \_ [cqueue/2]
636 ? S< 0:00 \_ [cqueue/3]
637 ? S< 0:00 \_ [cqueue/4]
638 ? S< 0:00 \_ [cqueue/5]
639 ? S< 0:00 \_ [cqueue/6]
640 ? S< 0:00 \_ [cqueue/7]
641 ? S< 0:00 \_ [cqueue/8]
642 ? S< 0:00 \_ [cqueue/9]
643 ? S< 0:00 \_ [cqueue/10]
644 ? S< 0:00 \_ [cqueue/11]
647 ? S< 0:00 \_ [khubd]
649 ? S< 0:00 \_ [kseriod]
804 ? S 0:00 \_ [pdflush]
805 ? S 0:19 \_ [pdflush]
806 ? S< 0:00 \_ [kswapd0]
807 ? S< 0:00 \_ [kswapd1]
808 ? S< 0:00 \_ [aio/0]
809 ? S< 0:00 \_ [aio/1]
810 ? S< 0:00 \_ [aio/2]
811 ? S< 0:00 \_ [aio/3]
812 ? S< 0:00 \_ [aio/4]
813 ? S< 0:00 \_ [aio/5]
814 ? S< 0:00 \_ [aio/6]
815 ? S< 0:00 \_ [aio/7]
816 ? S< 0:00 \_ [aio/8]
817 ? S< 0:00 \_ [aio/9]
818 ? S< 0:00 \_ [aio/10]
819 ? S< 0:00 \_ [aio/11]
971 ? S< 0:00 \_ [kpsmoused]
1083 ? S< 0:00 \_ [ata/0]
1084 ? S< 0:00 \_ [ata/1]
1085 ? S< 0:00 \_ [ata/2]
1086 ? S< 0:00 \_ [ata/3]
1087 ? S< 0:00 \_ [ata/4]
1088 ? S< 0:00 \_ [ata/5]
1089 ? S< 0:00 \_ [ata/6]
1090 ? S< 0:00 \_ [ata/7]
1091 ? S< 0:00 \_ [ata/8]
1092 ? S< 0:00 \_ [ata/9]
1093 ? S< 0:00 \_ [ata/10]
1094 ? S< 0:00 \_ [ata/11]
1095 ? S< 0:00 \_ [ata_aux]
1109 ? S< 0:00 \_ [scsi_eh_0]
1110 ? S< 0:00 \_ [scsi_eh_1]
1111 ? S< 0:00 \_ [scsi_eh_2]
1112 ? S< 0:00 \_ [scsi_eh_3]
1113 ? S< 0:00 \_ [scsi_eh_4]
1114 ? S< 0:00 \_ [scsi_eh_5]
1141 ? S< 0:00 \_ [kstriped]
1194 ? S< 0:10 \_ [kjournald]
1219 ? S< 0:01 \_ [kauditd]
2703 ? S< 0:00 \_ [kmpathd/0]
2704 ? S< 0:00 \_ [kmpathd/1]
2705 ? S< 0:00 \_ [kmpathd/2]
2706 ? S< 0:00 \_ [kmpathd/3]
2707 ? S< 0:00 \_ [kmpathd/4]
2708 ? S< 0:00 \_ [kmpathd/5]
2709 ? S< 0:00 \_ [kmpathd/6]
2710 ? S< 0:00 \_ [kmpathd/7]
2711 ? S< 0:00 \_ [kmpathd/8]
2712 ? S< 0:00 \_ [kmpathd/9]
2713 ? S< 0:00 \_ [kmpathd/10]
2714 ? S< 0:00 \_ [kmpathd/11]
2715 ? S< 0:00 \_ [kmpath_handlerd]
2754 ? S< 0:05 \_ [kjournald]
2756 ? S< 0:04 \_ [kjournald]
3807 ? S< 0:00 \_ [rpciod/0]
3808 ? S< 0:08 \_ [rpciod/1]
3809 ? S< 0:02 \_ [rpciod/2]
3810 ? S< 0:03 \_ [rpciod/3]
3811 ? S< 0:01 \_ [rpciod/4]
3812 ? S< 0:11 \_ [rpciod/5]
3813 ? S< 0:00 \_ [rpciod/6]
3814 ? S< 0:00 \_ [rpciod/7]
3815 ? S< 0:00 \_ [rpciod/8]
3816 ? S< 0:03 \_ [rpciod/9]
3817 ? S< 0:00 \_ [rpciod/10]
3818 ? S< 0:58 \_ [rpciod/11]
4047 ? SN 348:07 \_ [kipmi0]
1250 ? S<s 0:00 /sbin/udevd -d
3598 ? S<sl 0:05 auditd
3600 ? S<sl 0:02 \_ /sbin/audispd
3722 ? Sl 207:37 /opt/rocks/bin/python /opt/rocks/bin/greceptor
3734 ? Ss 0:00 syslogd -m 0
3737 ? Ss 0:00 klogd -x
3750 ? Ss 0:12 irqbalance
3767 ? Ss 0:00 portmap
3835 ? Ss 0:00 rpc.statd
3866 ? Ss 0:00 rpc.idmapd
3887 ? Ss 0:00 dbus-daemon --system
3951 ? S 0:00 [lockd]
3965 ? Ss 0:00 /usr/sbin/acpid
3977 ? Ss 0:01 hald
3978 ? S 0:00 \_ hald-runner
3987 ? S 0:00 \_ hald-addon-acpi: listening on acpid socket /var/run/acpid.socket
3995 ? S 0:00 \_ hald-addon-keyboard: listening on /dev/input/event0
4125 ? Ssl 0:00 automount
4189 ? Rl 13:30 /opt/gridengine/bin/lx26-amd64/sge_execd
19334 ? Z 0:00 \_ [sge_shepherd] <defunct>
20252 ? S 0:00 \_ sge_shepherd-127467 -bg
20253 ? Ss 0:00 | \_ -bash
/opt/gridengine/default/spool/compute-0-0/job_scripts/127467 ohscal_4
UAF_X_squared1 svm_poly_optimize_c
20385 ? Sl 31:47 | \_
/usr/local/MATLAB/R2011a/bin/glnxa64/MATLAB -nodisplay -r control_script_cl
ohscal_4 UAF_X_squared1 svm_poly_optimize_c; quit; -nojvm
20588 ? S 0:00 \_ sge_shepherd-127468 -bg
20589 ? Ss 0:00 | \_ -bash
/opt/gridengine/default/spool/compute-0-0/job_scripts/127468 ohscal_4
UAF_X_squared1 krr_poly_optimize_c
20721 ? Sl 81:14 | \_
/usr/local/MATLAB/R2011a/bin/glnxa64/MATLAB -nodisplay -r control_script_cl
ohscal_4 UAF_X_squared1 krr_poly_optimize_c; quit; -nojvm
20829 ? S 0:00 \_ sge_shepherd-127468 -bg
20830 ? Ss 0:00 | \_ -bash
/opt/gridengine/default/spool/compute-0-0/job_scripts/127468 ohscal_4
UAF_X_squared1 krr_poly_optimize_c
20962 ? Sl 84:24 | \_
/usr/local/MATLAB/R2011a/bin/glnxa64/MATLAB -nodisplay -r control_script_cl
ohscal_4 UAF_X_squared1 krr_poly_optimize_c; quit; -nojvm
21135 ? S 0:00 \_ sge_shepherd-127468 -bg
21136 ? Ss 0:00 | \_ -bash
/opt/gridengine/default/spool/compute-0-0/job_scripts/127468 ohscal_4
UAF_X_squared1 krr_poly_optimize_c
21268 ? Sl 71:47 | \_
/usr/local/MATLAB/R2011a/bin/glnxa64/MATLAB -nodisplay -r control_script_cl
ohscal_4 UAF_X_squared1 krr_poly_optimize_c; quit; -nojvm
21371 ? S 0:00 \_ sge_shepherd-127468 -bg
21372 ? Ss 0:00 \_ -bash
/opt/gridengine/default/spool/compute-0-0/job_scripts/127468 ohscal_4
UAF_X_squared1 krr_poly_optimize_c
21504 ? Sl 74:30 \_
/usr/local/MATLAB/R2011a/bin/glnxa64/MATLAB -nodisplay -r control_script_cl
ohscal_4 UAF_X_squared1 krr_poly_optimize_c; quit; -nojvm
4211 ? Sl 0:06 /usr/sbin/snmpd -Lsd -Lf /dev/null -p /var/run/snmpd.pid -a
4227 ? Ss 0:00 /usr/sbin/sshd
23976 ? Ss 0:00 \_ sshd: root@notty
23978 ? Rs 0:00 \_ ps -e f
4244 ? Ss 0:00 xinetd -stayalive -pidfile /var/run/xinetd.pid
4332 ? Ss 0:00 /usr/libexec/postfix/master
4352 ? S 0:00 \_ qmgr -l -t fifo -u
23354 ? S 0:00 \_ pickup -l -t fifo -u
4344 ? Ss 0:01 crond
4378 ? Ss 0:00 xfs -droppriv -daemon
4401 ? Ss 0:00 /usr/sbin/atd
4457 ? S 0:00 /usr/sbin/smartd -q never
4461 tty1 Ss+ 0:00 /sbin/mingetty tty1
4463 tty2 Ss+ 0:00 /sbin/mingetty tty2
4465 tty3 Ss+ 0:00 /sbin/mingetty tty3
4466 tty4 Ss+ 0:00 /sbin/mingetty tty4
4467 tty5 Ss+ 0:00 /sbin/mingetty tty5
4469 tty6 Ss+ 0:00 /sbin/mingetty tty6
28118 ? SLs 0:00 ntpd -A -u ntp:ntp -p /var/run/ntpd.pid
6784 ? Ss 2:53 /usr/sbin/gmond
> (f without a leading -) will generate readable output. Are all jobs bound to the
> sge_execd and the sge_shepherds?
All the heavy processes are.
> Are there kernel tasks in state D?
No, nothing seems to be in state D.
For what it is worth, here is the output of top -n 1 on the same node:
top - 11:41:59 up 32 days, 1:22, 1 user, load average: 37.85, 39.53, 38.43
Tasks: 203 total, 2 running, 201 sleeping, 0 stopped, 0 zombie
Cpu(s): 32.2%us, 11.3%sy, 0.0%ni, 56.4%id, 0.1%wa, 0.0%hi, 0.0%si, 0.0%st
Mem: 49449280k total, 24527668k used, 24921612k free, 43444k buffers
Swap: 1020116k total, 0k used, 1020116k free, 519884k cached
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
21268 liz06 25 0 5350m 4.2g 34m S 425.1 8.8 69:36.86 MATLAB
20721 liz06 25 0 5300m 4.2g 34m S 311.9 8.8 77:36.80 MATLAB
21504 liz06 25 0 5410m 4.2g 34m S 224.5 8.9 71:42.39 MATLAB
20962 liz06 25 0 5343m 4.2g 34m S 139.1 8.8 81:30.63 MATLAB
19600 liz06 25 0 3307m 2.7g 33m S 63.6 5.8 35:21.36 MATLAB
20385 liz06 21 0 3666m 2.6g 33m S 31.8 5.6 31:07.20 MATLAB
1 root 15 0 10348 696 584 S 0.0 0.0 0:01.93 init
2 root RT -5 0 0 0 S 0.0 0.0 0:00.52 migration/0
3 root 34 19 0 0 0 S 0.0 0.0 0:00.03 ksoftirqd/0
4 root RT -5 0 0 0 S 0.0 0.0 0:00.00 watchdog/0
5 root RT -5 0 0 0 S 0.0 0.0 0:00.59 migration/1
6 root 34 19 0 0 0 S 0.0 0.0 0:00.10 ksoftirqd/1
7 root RT -5 0 0 0 S 0.0 0.0 0:00.00 watchdog/1
8 root RT -5 0 0 0 S 0.0 0.0 0:00.44 migration/2
9 root 34 19 0 0 0 S 0.0 0.0 0:00.02 ksoftirqd/2
10 root RT -5 0 0 0 S 0.0 0.0 0:00.00 watchdog/2
11 root RT -5 0 0 0 S 0.0 0.0 0:00.32 migration/3
12 root 34 19 0 0 0 S 0.0 0.0 0:00.01 ksoftirqd/3
13 root RT -5 0 0 0 S 0.0 0.0 0:00.00 watchdog/3
14 root RT -5 0 0 0 S 0.0 0.0 0:00.30 migration/4
15 root 34 19 0 0 0 S 0.0 0.0 0:00.01 ksoftirqd/4
16 root RT -5 0 0 0 S 0.0 0.0 0:00.00 watchdog/4
17 root RT -5 0 0 0 S 0.0 0.0 0:00.24 migration/5
18 root 34 19 0 0 0 S 0.0 0.0 0:00.01 ksoftirqd/5
19 root RT -5 0 0 0 S 0.0 0.0 0:00.00 watchdog/5
20 root RT -5 0 0 0 S 0.0 0.0 0:00.60 migration/6
21 root 34 19 0 0 0 S 0.0 0.0 0:00.02 ksoftirqd/6
22 root RT -5 0 0 0 S 0.0 0.0 0:00.00 watchdog/6
23 root RT -5 0 0 0 S 0.0 0.0 0:00.85 migration/7
24 root 34 19 0 0 0 S 0.0 0.0 0:00.22 ksoftirqd/7
25 root RT -5 0 0 0 S 0.0 0.0 0:00.00 watchdog/7
26 root RT -5 0 0 0 S 0.0 0.0 0:00.35 migration/8
27 root 34 19 0 0 0 S 0.0 0.0 0:00.03 ksoftirqd/8
28 root RT -5 0 0 0 S 0.0 0.0 0:00.00 watchdog/8
29 root RT -5 0 0 0 S 0.0 0.0 0:00.20 migration/9
30 root 34 19 0 0 0 S 0.0 0.0 0:00.02 ksoftirqd/9
31 root RT -5 0 0 0 S 0.0 0.0 0:00.00 watchdog/9
32 root RT -5 0 0 0 S 0.0 0.0 0:00.18 migration/10
33 root 34 19 0 0 0 S 0.0 0.0 0:00.01 ksoftirqd/10
34 root RT -5 0 0 0 S 0.0 0.0 0:00.00 watchdog/10
35 root RT -5 0 0 0 S 0.0 0.0 0:00.17 migration/11
36 root 34 19 0 0 0 S 0.0 0.0 0:00.01 ksoftirqd/11
37 root RT -5 0 0 0 S 0.0 0.0 0:00.00 watchdog/11
38 root 10 -5 0 0 0 S 0.0 0.0 0:00.02 events/0
39 root 10 -5 0 0 0 S 0.0 0.0 0:00.00 events/1
40 root 10 -5 0 0 0 S 0.0 0.0 0:00.01 events/2
41 root 10 -5 0 0 0 S 0.0 0.0 0:00.00 events/3
42 root 10 -5 0 0 0 S 0.0 0.0 0:00.01 events/4
------------------------------------------------------------
This email message, including any attachments, is for the sole use of the
intended recipient(s) and may contain information that is proprietary,
confidential, and exempt from disclosure under applicable law. Any unauthorized
review, use, disclosure, or distribution is prohibited. If you have received
this email in error please notify the sender by return email and delete the
original message. Please note, the recipient should check this email and any
attachments for the presence of viruses. The organization accepts no liability
for any damage caused by any virus transmitted by this email.
=================================
_______________________________________________
users mailing list
[email protected]
https://gridengine.org/mailman/listinfo/users