Hello,

We are using the jobAcctGatherLinux plugin for accounting. We can't use the others since our cluster doesn't have Lustre nor infiniband. I am on branch slurm 14.11.7 and I did a "git pull" last friday. We noticed that the information given by sacct was sometimes erroneous for memory and IO. So, I decided to test it.
I use the program in attachment for that.
Basically, this program takes 2 argument:
1- the number of seconds that you want to keep a processor busy,
2- a number of Bytes to allocate, use and write to disk.

I tested with different polling interval : 1s, 15s and 30s (30s is what is recommended in the slurm documentation for production environments).

I ran several times the command : srun test_cpu_mem_io 150 300000000
and got different results for accounting depending on the polling interval (whereas the job always do exactly the same things) :

With a polling interval of 1s (all results are close to what was really allocated/used/written),

sacct -a -o JobId,JobName,State,User,cpuTimeRaw,SystemCPU,UserCPU,Start,End,Elapsed,MaxVMSize,MaxRSS,MaxPages,MaxDiskRead,MaxDiskWrite,NCPUS,NTASKS,Account

14717 test_cpu_+ COMPLETED vlegrand 156 00:00.303 02:31.321 2015-06-25T14:31:50 2015-06-25T14:34:26 00:02:36 297032K 293048K 0 0.00M 286M 1 1 test 14718 test_cpu_+ COMPLETED vlegrand 153 00:00.303 02:31.382 2015-06-25T14:34:26 2015-06-25T14:36:59 00:02:33 297032K 293052K 0 0.00M 286M 1 1 test 14719 test_cpu_+ COMPLETED vlegrand 153 00:00.310 02:31.449 2015-06-25T14:36:59 2015-06-25T14:39:32 00:02:33 297032K 293052K 0 0.00M 286M 1 1 test 14720 test_cpu_+ COMPLETED vlegrand 153 00:00.363 02:31.189 2015-06-25T14:39:32 2015-06-25T14:42:05 00:02:33 297032K 293052K 0 0.00M 286M 1 1 test 14721 test_cpu_+ COMPLETED vlegrand 154 00:00.487 02:30.937 2015-06-25T14:42:05 2015-06-25T14:44:39 00:02:34 297032K 293048K 0 0.00M 286M 1 1 test 14722 test_cpu_+ COMPLETED vlegrand 155 00:00.291 02:31.321 2015-06-25T14:44:41 2015-06-25T14:47:16 00:02:35 297032K 293052K 0 0.00M 286M 1 1 test 14723 test_cpu_+ COMPLETED vlegrand 154 00:00.312 02:31.285 2015-06-25T14:47:16 2015-06-25T14:49:50 00:02:34 297032K 293052K 0 0.00M 286M 1 1 test 14724 test_cpu_+ COMPLETED vlegrand 157 00:00.290 02:31.326 2015-06-25T14:49:50 2015-06-25T14:52:27 00:02:37 297032K 293052K 0 0.00M 286M 1 1 test 14725 test_cpu_+ COMPLETED vlegrand 156 00:00.282 02:31.321 2015-06-25T14:52:29 2015-06-25T14:55:05 00:02:36 297032K 293052K 0 0.00M 286M 1 1 test 14726 test_cpu_+ COMPLETED vlegrand 154 00:00.554 02:31.125 2015-06-25T14:55:05 2015-06-25T14:57:39 00:02:34 297032K 293052K 0 0.00M 286M 1 1 test


With a polling interval of 15s (all results for memory and io are erroneous),

sacct -a -o JobId,JobName,State,User,cpuTimeRaw,SystemCPU,UserCPU,Start,End,Elapsed,MaxVMSize,MaxRSS,MaxPages,MaxDiskRead,MaxDiskWrite,NCPUS,NTASKS,Account 14695 test_cpu_+ COMPLETED vlegrand 153 00:00.442 02:31.444 2015-06-25T12:57:21 2015-06-25T12:59:54 00:02:33 267604K 536K 0 0.00M 0.00M 1 1 test 14696 test_cpu_+ COMPLETED vlegrand 153 00:00.341 02:31.314 2015-06-25T12:59:54 2015-06-25T13:02:27 00:02:33 296896K 6812K 0 0.00M 0.00M 1 1 test 14697 test_cpu_+ COMPLETED vlegrand 153 00:00.319 02:31.333 2015-06-25T13:02:27 2015-06-25T13:05:00 00:02:33 296896K 7292K 0 0.00M 0.00M 1 1 test 14698 test_cpu_+ COMPLETED vlegrand 153 00:00.274 02:31.091 2015-06-25T13:05:00 2015-06-25T13:07:33 00:02:33 296896K 8556K 0 0.00M 0.00M 1 1 test 14699 test_cpu_+ COMPLETED vlegrand 153 00:00.317 02:31.350 2015-06-25T13:07:33 2015-06-25T13:10:06 00:02:33 296896K 11044K 0 0.00M 0.00M 1 1 test 14700 test_cpu_+ COMPLETED vlegrand 153 00:00.348 02:31.034 2015-06-25T13:10:07 2015-06-25T13:12:40 00:02:33 296896K 6232K 0 0.00M 0.00M 1 1 test 14701 test_cpu_+ COMPLETED vlegrand 153 00:00.362 02:31.244 2015-06-25T13:12:40 2015-06-25T13:15:13 00:02:33 296896K 6664K 0 0.00M 0.00M 1 1 test 14702 test_cpu_+ COMPLETED vlegrand 153 00:00.290 02:31.317 2015-06-25T13:15:13 2015-06-25T13:17:46 00:02:33 296896K 8800K 0 0.00M 0.00M 1 1 test 14703 test_cpu_+ COMPLETED vlegrand 153 00:00.314 02:31.352 2015-06-25T13:17:46 2015-06-25T13:20:19 00:02:33 296896K 5704K 0 0.00M 0.00M 1 1 test 14704 test_cpu_+ COMPLETED vlegrand 153 00:00.309 02:31.268 2015-06-25T13:20:19 2015-06-25T13:22:52 00:02:33 296896K 7652K 0 0.00M 0.00M 1 1 test



With a polling interval of 30s (only 1 result is close to what was really allocated/used/written to disk)

sacct -a -o JobId,JobName,State,User,cpuTimeRaw,SystemCPU,UserCPU,Start,End,Elapsed,MaxVMSize,MaxRSS,MaxPages,MaxDiskRead,MaxDiskWrite,NCPUS,NTASKS,Account 14707 test_cpu_+ COMPLETED vlegrand 153 00:00.390 02:31.144 2015-06-25T14:03:45 2015-06-25T14:06:18 00:02:33 296896K 8312K 0 0.00M 0.00M 1 1 test 14708 test_cpu_+ COMPLETED vlegrand 154 00:00.428 02:31.236 2015-06-25T14:06:18 2015-06-25T14:08:52 00:02:34 296896K 14956K 0 0.00M 0.00M 1 1 test 14709 test_cpu_+ COMPLETED vlegrand 153 00:00.417 02:30.998 2015-06-25T14:08:52 2015-06-25T14:11:25 00:02:33 296896K 5524K 0 0.00M 0.00M 1 1 test 14710 test_cpu_+ COMPLETED vlegrand 153 00:00.369 02:31.064 2015-06-25T14:11:25 2015-06-25T14:13:58 00:02:33 296896K 7684K 0 0.00M 0.00M 1 1 test 14711 test_cpu_+ COMPLETED vlegrand 153 00:00.325 02:31.323 2015-06-25T14:13:58 2015-06-25T14:16:31 00:02:33 296896K 8176K 0 0.00M 0.00M 1 1 test 14712 test_cpu_+ COMPLETED vlegrand 153 00:00.328 02:31.308 2015-06-25T14:16:31 2015-06-25T14:19:04 00:02:33 297032K 293052K 0 0.00M 286M 1 1 test 14713 test_cpu_+ COMPLETED vlegrand 153 00:00.291 02:31.366 2015-06-25T14:19:04 2015-06-25T14:21:37 00:02:33 296896K 6988K 0 0.00M 0.00M 1 1 test 14714 test_cpu_+ COMPLETED vlegrand 153 00:00.289 02:31.319 2015-06-25T14:21:37 2015-06-25T14:24:10 00:02:33 296896K 8032K 0 0.00M 0.00M 1 1 test 14715 test_cpu_+ COMPLETED vlegrand 153 00:00.289 02:31.344 2015-06-25T14:24:10 2015-06-25T14:26:43 00:02:33 296896K 7872K 0 0.00M 0.00M 1 1 test



I investigated in the code of slurm. I found that for getting accounting data, slurm reads the /proc/pid/stat and /proc/pid/io file for each process (and descendants) launched for the job. The file is read at each polling interval and when the job finishes. The problem is : at least once for every job, the info that is read is not correct (around 0 for write_bytes for example). I mean that the file exists and the calls to "open" and "read" don't return an error (I checked the return code in traces and debugger) but we don't get the values that we expect. With my colleagues, we suppose that this is because slurm reads the /proc/pid/XXX files just after the process is over.

I can't see a workaround for that:
- setting the polling interval to a very low value would give us more precise results but could slow the cluster in production, - keeping it to 30s would lead to possibly missing a lot of important information (a process can write gigabytes in 30 seconds or much less!)

Any idea is welcome,

Best regards,

Véronique

--
Véronique Legrand
IT engineer - Scientific computing
IT center for biology - C3BI
Institut Pasteur, Paris

mail: [email protected]
Tel: 01 44 38 95 03

#include <unistd.h>
#include <signal.h>
#include <stdio.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <stdlib.h>


/*
This test aims at using the processor during argv[1] seconds,
then allocate argv[2] Bytes, write things in them (to use them)
them write the in-memory content to a temporary file.
*/


volatile sig_atomic_t cond=1;

void catch_alarm(int sig) {
   cond=0;
   signal(sig,catch_alarm);
}



int main(int argc,char ** argv) {
   printf("number of args : %d \n",argc);
    if (argc!=3) {
       printf("usage : %s nb_sec nb_Bytes \n ",argv[0]);
       return -1;
    }
    
    signal(SIGALRM,catch_alarm);
    int dur=atoi(argv[1]);
    int m_siz=atoi(argv[2]);
    alarm(dur);

    while(cond) {
       // do nothing this is just to keep the processor busy.
    }

    char * one_alloc;
    one_alloc=malloc(m_siz*sizeof(char));
    int i;
    for (i=0;i<m_siz;i++) {
        one_alloc[i]='T';
        if (i % 2 ==0 ) one_alloc[i]='A';
        if (i % 3 ==0 ) one_alloc[i]='C';
        if (i % 5 ==0 ) one_alloc[i]='G';
    }

    FILE * temp = fopen("/tmp/toto.txt","w");
    fprintf(temp,"%s \n",one_alloc);
    fclose(temp);

    free(one_alloc);

    struct rusage my_usage; 
    int ret=getrusage(RUSAGE_CHILDREN,&my_usage);
    if (ret!=0) {
        printf("error in getrusage\n");
    }
    printf("end of: %s; getrusage result for children: \n",__func__);
    printf("ru_maxrss %ld \n",my_usage.ru_maxrss);
    printf("ru_inblock %ld \n",my_usage.ru_inblock);
    printf("ru_outblock %ld \n",my_usage.ru_oublock);
    printf("ru_utime %ld \n",my_usage.ru_utime.tv_usec); // use %ld for my test 
for the moment. Will see later to do something more portable.
    printf("ru_stime %ld \n",my_usage.ru_stime.tv_usec); // use %ld for my test 
for the moment. Will see later to do something more portable.
    
    ret=getrusage(RUSAGE_SELF,&my_usage);
    if (ret!=0) {
        printf("error in getrusage\n");
    }
    printf("end of: %s; getrusage result for myself: \n",__func__);
    printf("ru_maxrss %ld \n",my_usage.ru_maxrss);
    printf("ru_inblock %ld \n",my_usage.ru_inblock);
    printf("ru_outblock %ld \n",my_usage.ru_oublock);
    printf("ru_utime %ld \n",my_usage.ru_utime.tv_usec); // use %ld for my test 
for the moment. Will see later to do something more portable.
    printf("ru_stime %ld \n",my_usage.ru_stime.tv_usec); // use %ld for my test 
for the moment. Will see later to do something more portable.


    return 0;

}

Reply via email to