Dear Linux Perf Users Community,
I noticed some inconsistencies with the perf tool. I would like to
determine whether I am doing something wrong, or whether there is a
problem in the perf tool. Here is the problem:
I would like to obtain flops on a simple matrix-to-matrix multiplication
algorithm. The code is available in the attachment as mmmtest.c. To
obtain flops, I run the perf tool using raw counters. When I try to
obtain flops for matrices with sizes below 150x150, I obtain accurate
results. Example (anticipated flops: 100 * 100 * 100 * 2 = 2'000'000):
perf stat -e r538010 ./mmmtest 100
Performance counter stats for './mmmtest 100':
2,078,775 r538010
0.003889544 seconds time elapsed
However, whenever I try to run matrices of bigger size, the reported
flops are not even close to the flops that I am supposed to obtain
(anticipated results: 600 * 600 * 600 * 2 = 432'000'000):
perf stat -e r538010 ./mmmtest 600
Performance counter stats for './mmmtest 600':
2,348,148,851 r538010
0.955511968 seconds time elapsed
To give you more info to replicate the problem, I provide you with the
following:
CPU: Intel(R) Xeon(R) CPU E5-2643 0 @ 3.30GHz, 8 cores
Linux Kernel: 3.11.0-12-generic
GCC Version: gcc version 4.8.1 (Ubuntu/Linaro 4.8.1-10ubuntu8)
Monitored events: FP_COMP_OPS_EXE:SSE_SCALAR_DOUBLE - Raw event:
0x538010 (converted using libpfm4)
I have compiled the mmmtest.c using gcc -O3 -march=corei7-avx -o mmmtest
mmmtest.c. You can also find mmmtest.s asm version in the attachment.
Do you know why this happens? How can I instruct perf to obtain
accurate results?
Greetings,
Alen
#include <stdio.h>
#include <stdlib.h>
/* Matrix dimensions: A is m x k, B is k x n, C is m x n (all row-major). */
int m, n, k;
/* Matrix storage; the caller must point these at valid buffers before compute(). */
double *A, *B, *C;

/*
 * Accumulate the product A * B into C:
 *     C[row][col] += A[row][inner] * B[inner][col]
 * for every row in [0, m), col in [0, n) and inner in [0, k).
 *
 * C is updated in memory on every inner iteration, so each inner step is
 * one multiply and one add on doubles (2 * m * n * k operations in total).
 */
void compute() {
    int row, col, inner;

    for (row = 0; row < m; ++row) {
        const int a_base = row * k;   /* index of A[row][0] */
        const int c_base = row * n;   /* index of C[row][0] */

        for (col = 0; col < n; ++col) {
            const int c_idx = c_base + col;

            for (inner = 0; inner < k; ++inner) {
                C[c_idx] += A[a_base + inner] * B[inner * n + col];
            }
        }
    }
}
/*
 * Benchmark driver: mmmtest <size>
 *
 * Parses the square matrix dimension from argv[1], allocates and initialises
 * three size x size matrices of doubles, runs compute() once and releases
 * the memory.
 *
 * Returns EXIT_SUCCESS on success, EXIT_FAILURE on a missing/invalid
 * argument or an allocation failure.
 */
int main(int argc, char **argv)
{
    /* Largest dimension for which compute()'s int index (i*n + j) cannot
     * overflow, assuming a 32-bit int: floor(sqrt(INT_MAX)). */
    enum { MAX_DIM = 46340 };

    char *end = NULL;
    long dim;
    size_t count, bytes, idx;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <matrix-size>\n",
                argc > 0 ? argv[0] : "mmmtest");
        return EXIT_FAILURE;
    }

    /* strtol (unlike atoi) lets us reject empty input and trailing junk. */
    dim = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || dim <= 0 || dim > MAX_DIM) {
        fprintf(stderr, "matrix size must be an integer in [1, %d]\n",
                (int) MAX_DIM);
        return EXIT_FAILURE;
    }

    m = (int) dim; n = m; k = m;

    /* Do the size arithmetic in size_t; the original multiplied two ints
     * before widening, which can overflow. */
    count = (size_t) dim * (size_t) dim;
    if (count > (size_t) -1 / sizeof(double)) {
        fprintf(stderr, "matrix size too large for this platform\n");
        return EXIT_FAILURE;
    }
    bytes = count * sizeof(double);

    A = (double *) malloc (bytes);
    B = (double *) malloc (bytes);
    C = (double *) malloc (bytes);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "out of memory allocating %d x %d matrices\n", m, m);
        free(A);   /* free(NULL) is a no-op */
        free(B);
        free(C);
        return EXIT_FAILURE;
    }

    /* malloc leaves the contents indeterminate, and compute() reads every
     * element of A, B and C.  Store well-defined finite values so the
     * measured arithmetic is reproducible.  Plain constant stores are used
     * so the initialisation itself performs no floating-point arithmetic. */
    for (idx = 0; idx < count; ++idx) {
        A[idx] = 1.0;
        B[idx] = 2.0;
        C[idx] = 0.0;
    }

    compute ();

    free(A);
    free(B);
    free(C);
    return EXIT_SUCCESS;
}
#-----------------------------------------------------------------------
# mmmtest.s -- compiler output for mmmtest.c (x86-64, GAS / AT&T syntax).
#
# void compute()
#   for i in [0,m), j in [0,n), h in [0,k):
#       C[i*n+j] += A[i*k+h] * B[h*n+j]
#
# ABI:   SysV AMD64
# In:    no register arguments; reads globals m, n, k, A, B, C
# Out:   none (updates the array C points to)
# Saved: rbx, rbp, r12-r15 (pushed on entry, popped on exit)
# Clobb: rax, rcx, rdx, rsi, rdi, r8-r11, xmm0, xmm1, flags
#
# Register roles inside the loops:
#   r12d = m            ebp = n             esi = k
#   ebx  = i            rax = h (inner loop only)
#   rdi  = &A[i*k]      r11 = &C[i*n]       r15 = B
#   rcx  = &C[i*n+j]    r9  = &B[j]         rdx = &B[h*n+j]
#   r8   = n*8 (byte stride of a row of B / C)
#   r14  = k*8 (byte stride of a row of A)
#   r13  = byte length of one row of C      r10 = end of current C row
#   xmm0 = running value of C[i*n+j]        xmm1 = current product
#-----------------------------------------------------------------------
.file "mmmtest.c"
.text
.p2align 4,,15
.globl compute
.type compute, @function
compute:
.LFB14:
.cfi_startproc
# Prologue: save the callee-saved registers used below. The .cfi_* lines
# record the CFA offset and save slot after each push for the unwinder.
pushq %r15
.cfi_def_cfa_offset 16
.cfi_offset 15, -16
pushq %r14
.cfi_def_cfa_offset 24
.cfi_offset 14, -24
pushq %r13
.cfi_def_cfa_offset 32
.cfi_offset 13, -32
pushq %r12
.cfi_def_cfa_offset 40
.cfi_offset 12, -40
movl m(%rip), %r12d    # r12d = m (load scheduled between the pushes)
pushq %rbp
.cfi_def_cfa_offset 48
.cfi_offset 6, -48
pushq %rbx
.cfi_def_cfa_offset 56
.cfi_offset 3, -56
testl %r12d, %r12d
jle .L9    # m <= 0: nothing to do, go straight to the epilogue
# Loop-invariant setup: load the globals once and precompute byte strides.
movl n(%rip), %ebp    # ebp = n
xorl %ebx, %ebx    # i = 0
movl k(%rip), %esi    # esi = k
movq B(%rip), %r15    # r15 = B
movq A(%rip), %rdi    # rdi = &A[0]; advanced by one row of A per i
movq C(%rip), %r11    # r11 = &C[0]; advanced by one row of C per i
leal -1(%rbp), %eax    # eax = n - 1 (upper half of rax zeroed)
movslq %ebp, %r8    # r8 = (long) n
leaq 8(,%rax,8), %r13    # r13 = 8*(n-1) + 8 = bytes per row of C (only used when n > 0)
movslq %esi, %r14    # r14 = (long) k
salq $3, %r8    # r8 = n * sizeof(double)
salq $3, %r14    # r14 = k * sizeof(double)
# ---- outer loop over i (rows of A and C) ----
.L3:
testl %ebp, %ebp
jle .L5    # n <= 0: skip the j loop for this row
leaq 0(%r13,%r11), %r10    # r10 = one past the last element of C row i
movq %r15, %r9    # r9 = &B[0] (column pointer, j = 0)
movq %r11, %rcx    # rcx = &C[i*n + 0]
.p2align 4,,10
.p2align 3
# ---- middle loop over j (columns of B and C) ----
.L8:
testl %esi, %esi
jle .L6    # k <= 0: skip the h loop, C[i*n+j] is left untouched
vmovsd (%rcx), %xmm0    # xmm0 = C[i*n+j]
movq %r9, %rdx    # rdx = &B[0*n + j]
xorl %eax, %eax    # h = 0
.p2align 4,,10
.p2align 3
# ---- inner loop over h: one vmulsd and one vaddsd per iteration ----
.L7:
vmovsd (%rdi,%rax,8), %xmm1    # xmm1 = A[i*k + h]
addq $1, %rax    # ++h
vmulsd (%rdx), %xmm1, %xmm1    # xmm1 *= B[h*n + j]
addq %r8, %rdx    # rdx -> next row of B, same column
cmpl %eax, %esi    # compare k with h; flags are consumed by jg below
vaddsd %xmm1, %xmm0, %xmm0    # xmm0 += product (does not modify flags)
vmovsd %xmm0, (%rcx)    # C[i*n+j] is written back on every iteration
jg .L7    # loop while k > h
.L6:
addq $8, %rcx    # next element of C row i (++j)
addq $8, %r9    # next column of B
cmpq %r10, %rcx
jne .L8    # until the end of the C row is reached
.L5:
addl $1, %ebx    # ++i
addq %r14, %rdi    # next row of A
addq %r8, %r11    # next row of C
cmpl %r12d, %ebx
jne .L3    # until i == m
# Epilogue: restore callee-saved registers in reverse push order.
.L9:
popq %rbx
.cfi_def_cfa_offset 48
popq %rbp
.cfi_def_cfa_offset 40
popq %r12
.cfi_def_cfa_offset 32
popq %r13
.cfi_def_cfa_offset 24
popq %r14
.cfi_def_cfa_offset 16
popq %r15
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE14:
.size compute, .-compute
#-----------------------------------------------------------------------
# int main(int argc, char **argv)
#
# Parses argv[1] as the matrix dimension, stores it in m, n and k,
# allocates A, B and C, calls compute(), then frees the three buffers.
#
# ABI:   SysV AMD64
# In:    rsi = argv (edi = argc is never read)
# Saved: rbx (holds the allocation size in bytes across the malloc calls)
# Note:  argv[1] is loaded without any argc check, and the pointers
#        returned by malloc are stored without a NULL check.
#-----------------------------------------------------------------------
.section .text.startup,"ax",@progbits
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB15:
.cfi_startproc
pushq %rbx    # save rbx; the single push also leaves rsp 16-byte aligned for the calls
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
# atoi(argv[1]) appears here as strtol(argv[1], NULL, 10).
movl $10, %edx    # base = 10
movq 8(%rsi), %rdi    # nptr = argv[1]
xorl %esi, %esi    # endptr = NULL
call strtol
movl %eax, m(%rip)    # m = n = k = (int) result
movl %eax, n(%rip)
movl %eax, k(%rip)
imull %eax, %eax    # 32-bit m * m (wraps for m > 46340)
movslq %eax, %rbx    # sign-extend the 32-bit product to 64 bits
salq $3, %rbx    # rbx = m * m * sizeof(double), the size of each matrix in bytes
movq %rbx, %rdi
call malloc
movq %rbx, %rdi    # size argument for the next malloc
movq %rax, A(%rip)    # A = first allocation
call malloc
movq %rbx, %rdi
movq %rax, B(%rip)    # B = second allocation
call malloc
movq %rax, C(%rip)    # C = third allocation
xorl %eax, %eax    # al = 0; presumably because compute() is declared without a prototype
call compute
movq A(%rip), %rdi
call free
movq B(%rip), %rdi
call free
movq C(%rip), %rdi
call free
popq %rbx
.cfi_def_cfa_offset 8
# No instruction sets eax after the last call to free, so the value
# returned from main is whatever that call left in the register.
ret
.cfi_endproc
.LFE15:
.size main, .-main
# Common (uninitialised) storage for the globals of mmmtest.c.
# Operands are: symbol, size in bytes, alignment in bytes.
# The three matrix pointers are 8 bytes each, the three dimensions 4 bytes each.
.comm C,8,8
.comm B,8,8
.comm A,8,8
.comm k,4,4
.comm n,4,4
.comm m,4,4
# Compiler identification string recorded in the object file.
.ident "GCC: (Ubuntu/Linaro 4.8.1-10ubuntu8) 4.8.1"
# Empty .note.GNU-stack section: the conventional marker that this object
# does not require an executable stack.
.section .note.GNU-stack,"",@progbits