Dear Linux Perf Users Community,

I noticed some inconsistencies with the perf tool. I would like to determine whether I am doing something wrong, or whether there is a problem in the perf tool. Here is the problem:

I would like to obtain the flop count of a simple matrix-to-matrix multiplication algorithm. The code is available in the attachment as mmmtest.c. To obtain the flops, I run the perf tool using raw counters. When I measure matrices with sizes below 150x150, I obtain accurate results. Example (anticipated flops: 100 * 100 * 100 * 2 = 2'000'000):

perf stat -e r538010 ./mmmtest 100

 Performance counter stats for './mmmtest 100':

         2,078,775 r538010

       0.003889544 seconds time elapsed


However, whenever I try to run matrices of bigger size, the reported flops are not even close to the flops that I am supposed to obtain (anticipated results: 600 * 600 * 600 * 2 = 432'000'000):

perf stat -e r538010 ./mmmtest 600

 Performance counter stats for './mmmtest 600':

     2,348,148,851 r538010

       0.955511968 seconds time elapsed


To give you more info to replicate the problem, I provide you with the following:

CPU: Intel(R) Xeon(R) CPU E5-2643 0 @ 3.30GHz, 8 cores
Linux Kernel: 3.11.0-12-generic
GCC Version: gcc version 4.8.1 (Ubuntu/Linaro 4.8.1-10ubuntu8)
Monitored events: FP_COMP_OPS_EXE:SSE_SCALAR_DOUBLE - Raw event: 0x538010 (converted using libpfm4)

I have compiled the mmmtest.c using gcc -O3 -march=corei7-avx -o mmmtest mmmtest.c. You can also find mmmtest.s asm version in the attachment.

Do you know why this happens? How can I instruct perf to obtain accurate results?

Greetings,
Alen
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Matrix dimensions: A is m x k, B is k x n, C is m x n (all row-major). */
int m, n, k;
/* Matrix storage; must point to buffers of the sizes above before compute(). */
double *A, *B, *C;

/*
 * Accumulate the matrix product into C:  C += A * B.
 *
 * Uses the plain triple loop, performing one multiply and one add per
 * innermost iteration (2 * m * n * k floating-point operations in total).
 * C is updated in place, so it must hold meaningful values (for example
 * zeros) on entry.  Non-positive dimensions make the corresponding loop a
 * no-op.
 *
 * Element offsets are computed in size_t: the products i*n, i*k and h*n
 * overflow a signed int once a dimension exceeds 46340, which is undefined
 * behaviour.  The loop counters are non-negative whenever an offset is
 * evaluated, so the conversions are value-preserving.
 */
void compute(void) {
        int i, j, h;

        for (i = 0; i < m; ++i) {
                for (j = 0; j < n; ++j) {
                        /* Offset of C[i][j]; invariant across the h loop. */
                        const size_t c_idx = (size_t) i * (size_t) n + (size_t) j;

                        for (h = 0; h < k; ++h) {
                                /* Stored every iteration, exactly as the
                                 * original did, so the result is unchanged
                                 * even if C overlaps A or B. */
                                C[c_idx] += A[(size_t) i * (size_t) k + (size_t) h]
                                          * B[(size_t) h * (size_t) n + (size_t) j];
                        }
                }
        }
}

/*
 * Usage: mmmtest <size>
 *
 * Allocates three <size> x <size> matrices of doubles, runs compute() once
 * on them and releases them again.
 *
 * Returns EXIT_SUCCESS on success and EXIT_FAILURE if the argument is
 * missing or not a positive int, or if the matrices cannot be allocated.
 */
int main(int argc, char **argv)
{
        char *end = NULL;
        long dim;
        size_t count;

        /* argv[1] is only valid when at least one argument was supplied. */
        if (argc < 2) {
                fprintf(stderr, "usage: %s <matrix-size>\n",
                        argc > 0 ? argv[0] : "mmmtest");
                return EXIT_FAILURE;
        }

        /* strtol instead of atoi so that garbage and out-of-range input
         * are detected rather than silently turned into some number. */
        errno = 0;
        dim = strtol(argv[1], &end, 10);
        if (errno != 0 || end == argv[1] || *end != '\0' ||
            dim < 1 || dim > INT_MAX) {
                fprintf(stderr, "invalid matrix size: '%s'\n", argv[1]);
                return EXIT_FAILURE;
        }

        m = (int) dim; n = m; k = m;

        /* Element count per matrix, computed in size_t (the original
         * multiplied two ints before widening) and checked so that
         * count * count * sizeof(double) cannot wrap. */
        count = (size_t) dim;
        if (count > (size_t) -1 / count / sizeof(double)) {
                fprintf(stderr, "matrix size too large: %ld\n", dim);
                return EXIT_FAILURE;
        }
        count *= count;

        /* calloc: compute() reads A and B and accumulates into C, so all
         * three must start from defined values instead of the
         * indeterminate contents malloc returns. */
        A = calloc(count, sizeof(double));
        B = calloc(count, sizeof(double));
        C = calloc(count, sizeof(double));
        if (A == NULL || B == NULL || C == NULL) {
                fprintf(stderr, "out of memory allocating %ld x %ld matrices\n",
                        dim, dim);
                free(A);
                free(B);
                free(C);
                A = B = C = NULL;
                return EXIT_FAILURE;
        }

        compute ();

        free(A);
        free(B);
        free(C);
        A = B = C = NULL;

        /* Explicit status: falling off the end of main leaves the exit
         * code indeterminate before C99. */
        return EXIT_SUCCESS;
}
        # x86-64, AT&T (GAS) syntax: compiler output for mmmtest.c.
        .file   "mmmtest.c"
        .text
        .p2align 4,,15
        .globl  compute
        .type   compute, @function
        # ------------------------------------------------------------------
        # void compute()  --  C[i*n+j] += A[i*k+h] * B[h*n+j] over i, j, h
        #
        # Takes no register arguments; reads the globals m, n, k, A, B, C.
        # Saves and restores rbx, rbp, r12-r15 around the body.
        #
        # Register roles inside the loops:
        #   r12d = m            ebp  = n             esi = k
        #   r8   = n * 8        r14  = k * 8         (row strides in bytes)
        #   r13  = byte length of one row of C
        #   ebx  = i            rdi  = &A[i*k]       r11 = &C[i*n]
        #   rcx  = &C[i*n+j]    r9   = &B[j]         r10 = end of C row i
        #   rax  = h            rdx  = &B[h*n+j]
        #   xmm0 = running value of C[i*n+j]         xmm1 = product
        # ------------------------------------------------------------------
compute:
.LFB14:
        .cfi_startproc
        pushq   %r15
        .cfi_def_cfa_offset 16
        .cfi_offset 15, -16
        pushq   %r14
        .cfi_def_cfa_offset 24
        .cfi_offset 14, -24
        pushq   %r13
        .cfi_def_cfa_offset 32
        .cfi_offset 13, -32
        pushq   %r12
        .cfi_def_cfa_offset 40
        .cfi_offset 12, -40
        movl    m(%rip), %r12d          # r12d = m
        pushq   %rbp
        .cfi_def_cfa_offset 48
        .cfi_offset 6, -48
        pushq   %rbx
        .cfi_def_cfa_offset 56
        .cfi_offset 3, -56
        testl   %r12d, %r12d
        jle     .L9                     # m <= 0: nothing to do
        movl    n(%rip), %ebp           # ebp = n
        xorl    %ebx, %ebx              # i = 0
        movl    k(%rip), %esi           # esi = k
        movq    B(%rip), %r15           # r15 = B
        movq    A(%rip), %rdi           # rdi = &A[0]   (row i of A)
        movq    C(%rip), %r11           # r11 = &C[0]   (row i of C)
        leal    -1(%rbp), %eax          # eax = n - 1 (upper half of rax zeroed)
        movslq  %ebp, %r8               # r8  = (long) n
        leaq    8(,%rax,8), %r13        # r13 = 8 + 8*(n-1): bytes in one C row
        movslq  %esi, %r14              # r14 = (long) k
        salq    $3, %r8                 # r8  = n * 8: byte stride of a B/C row
        salq    $3, %r14                # r14 = k * 8: byte stride of an A row
.L3:                                    # ---- for each row i ----
        testl   %ebp, %ebp
        jle     .L5                     # n <= 0: skip the j loop
        leaq    0(%r13,%r11), %r10      # r10 = one past the end of C row i
        movq    %r15, %r9               # r9  = &B[0]   (top of column j)
        movq    %r11, %rcx              # rcx = &C[i*n + 0]
        .p2align 4,,10
        .p2align 3
.L8:                                    # ---- for each column j ----
        testl   %esi, %esi
        jle     .L6                     # k <= 0: skip the h loop
        vmovsd  (%rcx), %xmm0           # xmm0 = C[i*n+j]
        movq    %r9, %rdx               # rdx = &B[0*n+j]
        xorl    %eax, %eax              # h = 0
        .p2align 4,,10
        .p2align 3
.L7:                                    # ---- inner loop over h ----
        vmovsd  (%rdi,%rax,8), %xmm1    # xmm1 = A[i*k+h]
        addq    $1, %rax                # ++h
        vmulsd  (%rdx), %xmm1, %xmm1    # xmm1 *= B[h*n+j]
        addq    %r8, %rdx               # advance to the next row of B
        cmpl    %eax, %esi              # flags for k vs h, consumed by jg below;
                                        # the vector ops in between do not touch them
        vaddsd  %xmm1, %xmm0, %xmm0     # xmm0 += product
        vmovsd  %xmm0, (%rcx)           # C[i*n+j] written back every iteration
        jg      .L7                     # loop while k > h (signed)
.L6:
        addq    $8, %rcx                # next element of C row i
        addq    $8, %r9                 # next column of B
        cmpq    %r10, %rcx
        jne     .L8                     # until the end of the C row
.L5:
        addl    $1, %ebx                # ++i
        addq    %r14, %rdi              # next row of A
        addq    %r8, %r11               # next row of C
        cmpl    %r12d, %ebx
        jne     .L3                     # until i == m
.L9:                                    # epilogue: restore saved registers
        popq    %rbx
        .cfi_def_cfa_offset 48
        popq    %rbp
        .cfi_def_cfa_offset 40
        popq    %r12
        .cfi_def_cfa_offset 32
        popq    %r13
        .cfi_def_cfa_offset 24
        popq    %r14
        .cfi_def_cfa_offset 16
        popq    %r15
        .cfi_def_cfa_offset 8
        ret
        .cfi_endproc
.LFE14:
        .size   compute, .-compute
        # ------------------------------------------------------------------
        # int main(int argc, char **argv)
        #
        # In:  rsi = argv (edi/argc is never examined).
        # Parses argv[1] with strtol, stores the value in m, n and k,
        # allocates A, B and C with malloc, calls compute, then frees the
        # three buffers.  rbx holds the allocation size across the calls.
        # ------------------------------------------------------------------
        .section        .text.startup,"ax",@progbits
        .p2align 4,,15
        .globl  main
        .type   main, @function
main:
.LFB15:
        .cfi_startproc
        pushq   %rbx
        .cfi_def_cfa_offset 16
        .cfi_offset 3, -16
        movl    $10, %edx               # base = 10
        movq    8(%rsi), %rdi           # rdi = argv[1] (loaded without any argc check)
        xorl    %esi, %esi              # endptr = NULL
        call    strtol                  # the source's atoi() appears here as strtol
        movl    %eax, m(%rip)           # m = n = k = (int) result
        movl    %eax, n(%rip)
        movl    %eax, k(%rip)
        imull   %eax, %eax              # 32-bit product m * m
        movslq  %eax, %rbx              # sign-extend the 32-bit product
        salq    $3, %rbx                # rbx = bytes per matrix (* 8)
        movq    %rbx, %rdi
        call    malloc
        movq    %rbx, %rdi
        movq    %rax, A(%rip)           # A = malloc(...), result not checked
        call    malloc
        movq    %rbx, %rdi
        movq    %rax, B(%rip)           # B = malloc(...), result not checked
        call    malloc
        movq    %rax, C(%rip)           # C = malloc(...), result not checked
        xorl    %eax, %eax              # NOTE(review): presumably al = 0 because compute()
                                        # is declared without a prototype - confirm
        call    compute
        movq    A(%rip), %rdi
        call    free
        movq    B(%rip), %rdi
        call    free
        movq    C(%rip), %rdi
        call    free
        popq    %rbx
        .cfi_def_cfa_offset 8
        ret                             # eax is not set after the last call
        .cfi_endproc
.LFE15:
        .size   main, .-main
        # Common (uninitialised) symbols for the C globals: name, size, alignment.
        .comm   C,8,8                   # double *C
        .comm   B,8,8                   # double *B
        .comm   A,8,8                   # double *A
        .comm   k,4,4                   # int k
        .comm   n,4,4                   # int n
        .comm   m,4,4                   # int m
        # Compiler identification string.
        .ident  "GCC: (Ubuntu/Linaro 4.8.1-10ubuntu8) 4.8.1"
        # Empty note section without the "x" flag: no executable stack requested.
        .section        .note.GNU-stack,"",@progbits

Reply via email to