Hello,
We are using the jobAcctGatherLinux plugin for accounting. We can't use
the others since our cluster doesn't have Lustre nor infiniband. I am on
branch slurm 14.11.7 and I did a "git pull" last friday.
We noticed that the information given by sacct was sometimes erroneous
for memory and IO. So, I decided to test it.
I use the program in attachment for that.
Basically, this program takes 2 argument:
1- the number of seconds that you want to keep a processor busy,
2- a number of Bytes to allocate, use and write to disk.
I tested with different polling interval : 1s, 15s and 30s (30s is what
is recommended in the slurm documentation for production environments).
I ran several times the command : srun test_cpu_mem_io 150 300000000
and got different results for accounting depending on the polling
interval (whereas the job always do exactly the same things) :
With a polling interval of 1s (all results are close to what was really
allocated/used/written),
sacct -a -o
JobId,JobName,State,User,cpuTimeRaw,SystemCPU,UserCPU,Start,End,Elapsed,MaxVMSize,MaxRSS,MaxPages,MaxDiskRead,MaxDiskWrite,NCPUS,NTASKS,Account
14717 test_cpu_+ COMPLETED vlegrand 156 00:00.303
02:31.321 2015-06-25T14:31:50 2015-06-25T14:34:26 00:02:36 297032K
293048K 0 0.00M 286M 1 1 test
14718 test_cpu_+ COMPLETED vlegrand 153 00:00.303
02:31.382 2015-06-25T14:34:26 2015-06-25T14:36:59 00:02:33 297032K
293052K 0 0.00M 286M 1 1 test
14719 test_cpu_+ COMPLETED vlegrand 153 00:00.310
02:31.449 2015-06-25T14:36:59 2015-06-25T14:39:32 00:02:33 297032K
293052K 0 0.00M 286M 1 1 test
14720 test_cpu_+ COMPLETED vlegrand 153 00:00.363
02:31.189 2015-06-25T14:39:32 2015-06-25T14:42:05 00:02:33 297032K
293052K 0 0.00M 286M 1 1 test
14721 test_cpu_+ COMPLETED vlegrand 154 00:00.487
02:30.937 2015-06-25T14:42:05 2015-06-25T14:44:39 00:02:34 297032K
293048K 0 0.00M 286M 1 1 test
14722 test_cpu_+ COMPLETED vlegrand 155 00:00.291
02:31.321 2015-06-25T14:44:41 2015-06-25T14:47:16 00:02:35 297032K
293052K 0 0.00M 286M 1 1 test
14723 test_cpu_+ COMPLETED vlegrand 154 00:00.312
02:31.285 2015-06-25T14:47:16 2015-06-25T14:49:50 00:02:34 297032K
293052K 0 0.00M 286M 1 1 test
14724 test_cpu_+ COMPLETED vlegrand 157 00:00.290
02:31.326 2015-06-25T14:49:50 2015-06-25T14:52:27 00:02:37 297032K
293052K 0 0.00M 286M 1 1 test
14725 test_cpu_+ COMPLETED vlegrand 156 00:00.282
02:31.321 2015-06-25T14:52:29 2015-06-25T14:55:05 00:02:36 297032K
293052K 0 0.00M 286M 1 1 test
14726 test_cpu_+ COMPLETED vlegrand 154 00:00.554
02:31.125 2015-06-25T14:55:05 2015-06-25T14:57:39 00:02:34 297032K
293052K 0 0.00M 286M 1 1 test
With a polling interval of 15s (all results for memory and io are
erroneous),
sacct -a -o
JobId,JobName,State,User,cpuTimeRaw,SystemCPU,UserCPU,Start,End,Elapsed,MaxVMSize,MaxRSS,MaxPages,MaxDiskRead,MaxDiskWrite,NCPUS,NTASKS,Account
14695 test_cpu_+ COMPLETED vlegrand 153 00:00.442
02:31.444 2015-06-25T12:57:21 2015-06-25T12:59:54 00:02:33
267604K 536K 0 0.00M 0.00M 1 1 test
14696 test_cpu_+ COMPLETED vlegrand 153 00:00.341
02:31.314 2015-06-25T12:59:54 2015-06-25T13:02:27 00:02:33
296896K 6812K 0 0.00M 0.00M 1 1 test
14697 test_cpu_+ COMPLETED vlegrand 153 00:00.319
02:31.333 2015-06-25T13:02:27 2015-06-25T13:05:00 00:02:33
296896K 7292K 0 0.00M 0.00M 1 1 test
14698 test_cpu_+ COMPLETED vlegrand 153 00:00.274
02:31.091 2015-06-25T13:05:00 2015-06-25T13:07:33 00:02:33
296896K 8556K 0 0.00M 0.00M 1 1 test
14699 test_cpu_+ COMPLETED vlegrand 153 00:00.317
02:31.350 2015-06-25T13:07:33 2015-06-25T13:10:06 00:02:33 296896K
11044K 0 0.00M 0.00M 1 1 test
14700 test_cpu_+ COMPLETED vlegrand 153 00:00.348
02:31.034 2015-06-25T13:10:07 2015-06-25T13:12:40 00:02:33
296896K 6232K 0 0.00M 0.00M 1 1 test
14701 test_cpu_+ COMPLETED vlegrand 153 00:00.362
02:31.244 2015-06-25T13:12:40 2015-06-25T13:15:13 00:02:33
296896K 6664K 0 0.00M 0.00M 1 1 test
14702 test_cpu_+ COMPLETED vlegrand 153 00:00.290
02:31.317 2015-06-25T13:15:13 2015-06-25T13:17:46 00:02:33
296896K 8800K 0 0.00M 0.00M 1 1 test
14703 test_cpu_+ COMPLETED vlegrand 153 00:00.314
02:31.352 2015-06-25T13:17:46 2015-06-25T13:20:19 00:02:33
296896K 5704K 0 0.00M 0.00M 1 1 test
14704 test_cpu_+ COMPLETED vlegrand 153 00:00.309
02:31.268 2015-06-25T13:20:19 2015-06-25T13:22:52 00:02:33
296896K 7652K 0 0.00M 0.00M 1 1 test
With a polling interval of 30s (only 1 result is close to what was
really allocated/used/written to disk)
sacct -a -o
JobId,JobName,State,User,cpuTimeRaw,SystemCPU,UserCPU,Start,End,Elapsed,MaxVMSize,MaxRSS,MaxPages,MaxDiskRead,MaxDiskWrite,NCPUS,NTASKS,Account
14707 test_cpu_+ COMPLETED vlegrand 153 00:00.390
02:31.144 2015-06-25T14:03:45 2015-06-25T14:06:18 00:02:33
296896K 8312K 0 0.00M 0.00M 1 1 test
14708 test_cpu_+ COMPLETED vlegrand 154 00:00.428
02:31.236 2015-06-25T14:06:18 2015-06-25T14:08:52 00:02:34 296896K
14956K 0 0.00M 0.00M 1 1 test
14709 test_cpu_+ COMPLETED vlegrand 153 00:00.417
02:30.998 2015-06-25T14:08:52 2015-06-25T14:11:25 00:02:33
296896K 5524K 0 0.00M 0.00M 1 1 test
14710 test_cpu_+ COMPLETED vlegrand 153 00:00.369
02:31.064 2015-06-25T14:11:25 2015-06-25T14:13:58 00:02:33
296896K 7684K 0 0.00M 0.00M 1 1 test
14711 test_cpu_+ COMPLETED vlegrand 153 00:00.325
02:31.323 2015-06-25T14:13:58 2015-06-25T14:16:31 00:02:33
296896K 8176K 0 0.00M 0.00M 1 1 test
14712 test_cpu_+ COMPLETED vlegrand 153 00:00.328
02:31.308 2015-06-25T14:16:31 2015-06-25T14:19:04 00:02:33 297032K
293052K 0 0.00M 286M 1 1 test
14713 test_cpu_+ COMPLETED vlegrand 153 00:00.291
02:31.366 2015-06-25T14:19:04 2015-06-25T14:21:37 00:02:33
296896K 6988K 0 0.00M 0.00M 1 1 test
14714 test_cpu_+ COMPLETED vlegrand 153 00:00.289
02:31.319 2015-06-25T14:21:37 2015-06-25T14:24:10 00:02:33
296896K 8032K 0 0.00M 0.00M 1 1 test
14715 test_cpu_+ COMPLETED vlegrand 153 00:00.289
02:31.344 2015-06-25T14:24:10 2015-06-25T14:26:43 00:02:33
296896K 7872K 0 0.00M 0.00M 1 1 test
I investigated in the code of slurm. I found that for getting accounting
data, slurm reads the /proc/pid/stat and /proc/pid/io file for each
process (and descendants) launched for the job. The file is read at each
polling interval and when the job finishes. The problem is : at least
once for every job, the info that is read is not correct (around 0 for
write_bytes for example). I mean that the file exists and the calls to
"open" and "read" don't return an error (I checked the return code in
traces and debugger) but we don't get the values that we expect. With my
colleagues, we suppose that this is because slurm reads the
/proc/pid/XXX files just after the process is over.
I can't see a workaround for that:
- setting the polling interval to a very low value would give us more
precise results but could slow the cluster in production,
- keeping it to 30s would lead to possibly missing a lot of important
information (a process can write gigabytes in 30 seconds or much less!)
Any idea is welcome,
Best regards,
Véronique
--
Véronique Legrand
IT engineer - Scientific computing
IT center for biology - C3BI
Institut Pasteur, Paris
mail: [email protected]
Tel: 01 44 38 95 03
#include <unistd.h>
#include <signal.h>
#include <stdio.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <stdlib.h>
/*
This test aims at using the processor during argv[1] seconds,
then allocate argv[2] Bytes, write things in them (to use them)
them write the in-memory content to a temporary file.
*/
volatile sig_atomic_t cond=1;
void catch_alarm(int sig) {
cond=0;
signal(sig,catch_alarm);
}
int main(int argc,char ** argv) {
printf("number of args : %d \n",argc);
if (argc!=3) {
printf("usage : %s nb_sec nb_Bytes \n ",argv[0]);
return -1;
}
signal(SIGALRM,catch_alarm);
int dur=atoi(argv[1]);
int m_siz=atoi(argv[2]);
alarm(dur);
while(cond) {
// do nothing this is just to keep the processor busy.
}
char * one_alloc;
one_alloc=malloc(m_siz*sizeof(char));
int i;
for (i=0;i<m_siz;i++) {
one_alloc[i]='T';
if (i % 2 ==0 ) one_alloc[i]='A';
if (i % 3 ==0 ) one_alloc[i]='C';
if (i % 5 ==0 ) one_alloc[i]='G';
}
FILE * temp = fopen("/tmp/toto.txt","w");
fprintf(temp,"%s \n",one_alloc);
fclose(temp);
free(one_alloc);
struct rusage my_usage;
int ret=getrusage(RUSAGE_CHILDREN,&my_usage);
if (ret!=0) {
printf("error in getrusage\n");
}
printf("end of: %s; getrusage result for children: \n",__func__);
printf("ru_maxrss %ld \n",my_usage.ru_maxrss);
printf("ru_inblock %ld \n",my_usage.ru_inblock);
printf("ru_outblock %ld \n",my_usage.ru_oublock);
printf("ru_utime %ld \n",my_usage.ru_utime.tv_usec); // use %ld for my test
for the moment. Will see later to do something more portable.
printf("ru_stime %ld \n",my_usage.ru_stime.tv_usec); // use %ld for my test
for the moment. Will see later to do something more portable.
ret=getrusage(RUSAGE_SELF,&my_usage);
if (ret!=0) {
printf("error in getrusage\n");
}
printf("end of: %s; getrusage result for myself: \n",__func__);
printf("ru_maxrss %ld \n",my_usage.ru_maxrss);
printf("ru_inblock %ld \n",my_usage.ru_inblock);
printf("ru_outblock %ld \n",my_usage.ru_oublock);
printf("ru_utime %ld \n",my_usage.ru_utime.tv_usec); // use %ld for my test
for the moment. Will see later to do something more portable.
printf("ru_stime %ld \n",my_usage.ru_stime.tv_usec); // use %ld for my test
for the moment. Will see later to do something more portable.
return 0;
}