On Thu, 25 Mar 2010, stephane eranian wrote:
> >
> I strongly believe this is related to interrupts. That's the only explanation
> for the fluctuations. Furthermore, it's not the PMU interrupts,
> because those, I assume,
> are constant for this test program.

I've run some tests with a program that exercises memory more (it mmaps 
64MB of memory, writes to all of it, then reads it all back, one byte at a 
time).

Expected values:
   retired_instructions:   402,653,208
   loads:                   67,108,866
   stores:                  67,108,866
   branches:               134,217,729

On core2, 
  retired instructions is almost exactly expected+HW_INT_RECV+page_faults
  stores is an exactly correct value (as reported before)

On Nehalem:
  retired instructions is Expected+page_faults +  a small value
     that's probably hw interrupts, but no counter for that :(
  retired stores is Expected + roughly same value extra as retired 
      instructions.

For some reason "perf" uses L1D_CACHE_ST:MESI as the retired stores 
counter rather than "MEM_INST_RETIRED.STORES".  It's the latter that
I use in my experiment, the former is off by more, possibly due to
cache coherence issues.

Most of the other machines I have access to don't have 
retired_loads/retired_stores counts so I can't test on those.  I do have 
an atom machine I can test on too.

Vince

# load/store microbenchmark
# by Vince Weaver   vweaver1 _at_ eecs.utk.edu

# Build with
#   as --32 -o load_store.o load_store.s
#   ld -melf_i386 -o load_store load_store.o

# On a deterministic system you'd expect to see:
#     402,653,208 retired_instructions
#      67,108,866 retired_loads
#      67,108,866 retired_stores
#     134,217,729 retired_branches

        
# ----------------------------------------------------------------------
# Assemble-time constants.  None of these emit any code or data.
# ----------------------------------------------------------------------

# Size in bytes of the region that is mapped, written and read back.
.equ TEST_SIZE, 64 << 20                # 64M

# System call numbers passed in %eax to "int $0x80".
.equ SYSCALL_MUNMAP, 91
.equ SYSCALL_MMAP2, 192

# Memory protection bits for mmap (values as listed in
# include/asm-generic/mman.h of linux-2.6.29).
.equ PROT_NONE, 0x0
.equ PROT_READ, 0x1
.equ PROT_WRITE, 0x2
.equ PROT_EXEC, 0x4

# Mapping flags for mmap (same header).
.equ MAP_SHARED, 0x01
.equ MAP_PRIVATE, 0x02
.equ MAP_TYPE, 0x0f
.equ MAP_FIXED, 0x10
.equ MAP_ANONYMOUS, 0x20
        
        
        # ------------------------------------------------------------
        # _start: program entry point.
        #
        # Maps TEST_SIZE bytes of private anonymous read/write memory
        # with the mmap2 system call (int $0x80 interface) and leaves
        # the returned address both in %eax and on top of the stack
        # for the code that follows.
        #
        # Every instruction here is part of the expected retired
        # instruction total quoted at the top of the file, so adding,
        # removing or replacing instructions changes the expected
        # counter values.
        # ------------------------------------------------------------
        .globl _start   
_start: 

        
        # run mmap() of 64MB value
        # Arguments are passed in ebx, ecx, edx, esi, edi, ebp in that
        # order; the syscall number goes in eax.

        mov     $SYSCALL_MMAP2,%eax     # syscall number
        mov     $0,%ebx                 # address
        mov     $TEST_SIZE,%ecx         # size = 64M
        mov     $(PROT_READ | PROT_WRITE), %edx
                                        # prot
        mov     $(MAP_PRIVATE | MAP_ANONYMOUS), %esi
                                        # flags
        mov     $-1,%edi                                
                                        # fd
        mov     $0,%ebp                         
                                        # pgoffset
        int     $0x80
        
        # NOTE(review): the value returned in eax is used as the buffer
        # address without any error check; if mmap2 fails, eax holds a
        # negative errno and the store loop below would fault.
        push    %eax                    # save the address

        # store 0x5a 64M times
        #
        # Register roles inside the loop:
        #   edi = destination pointer (stosb stores al at (%edi) and
        #         steps edi; assumes the direction flag is clear so
        #         the pointer moves upward -- TODO confirm)
        #   ecx = bytes remaining, starts at TEST_SIZE, loop ends at 0
        #   al  = fill byte 0x5a
        # Each iteration retires three instructions, one store and one
        # branch.
        
        mov     %eax,%edi               # point dest to mmap'd region
        mov     $TEST_SIZE,%ecx         # load in size
        mov     $0x5a,%al               # overwrites the low byte of the
                                        # address in eax; the saved copy
                                        # on the stack is used later
write_loop:
        stosb                           # store the byte to mem
        dec     %ecx
        jnz     write_loop              # could use "rep stosb"
                                        # but that would only count
                                        # as one instruction
        
        # Read the whole region back, one byte at a time.
        #
        # The pop/push pair below reloads the buffer address that the
        # write phase clobbered (al was used as the fill byte) and keeps
        # a copy on the stack for the munmap call that follows.
        #
        # Register roles inside the loop:
        #   esi = source pointer (lodsb loads (%esi) into al and steps
        #         esi; assumes the direction flag is clear -- TODO confirm)
        #   ecx = bytes remaining, starts at TEST_SIZE, loop ends at 0
        #   al  = byte just loaded; the value is never used
        # Each iteration retires three instructions, one load and one
        # branch.

        pop     %eax                    # restore address
        
        push    %eax                    # and save it again
        mov     %eax,%esi               # point source to address
        
        mov     $TEST_SIZE,%ecx         # size to go through
        
        xor     %eax,%eax               # clear eax
read_loop:
        lodsb                           # load a byte
        dec     %ecx
        jnz     read_loop       
        

munmap:

        # run munmap()
        #
        # Releases the TEST_SIZE-byte region mapped at program start.
        # This label is reached by falling through from read_loop; no
        # jump in the visible code targets it.
        # ebx = start address (popped from the copy saved on the stack),
        # ecx = length.  The value the syscall returns in eax is not
        # examined.

        mov     $SYSCALL_MUNMAP,%eax
        pop     %ebx                    # restore address
        mov     $TEST_SIZE,%ecx         # size
        int     $0x80


        # fall through to exit

        #================================
        # Exit
        #================================
        #
        # Terminates the process through the exit system call with a
        # status of 0.  Reached by falling through from the munmap
        # code above; the result of munmap does not affect the status.
        #
        # The syscall number is named here for consistency with the
        # SYSCALL_MMAP2 / SYSCALL_MUNMAP constants used by the other
        # system calls.  A .equ emits no code, so the three retired
        # instructions of this block are unchanged.

        .equ SYSCALL_EXIT,        1

exit:

        xor     %ebx,%ebx               # we return 0
        mov     $SYSCALL_EXIT,%eax      # syscall number
        int     $0x80                   # does not return
        

------------------------------------------------------------------------------
Download Intel® Parallel Studio Eval
Try the new software tools for yourself. Speed compiling, find bugs
proactively, and fine-tune applications for parallel performance.
See why Intel Parallel Studio got high marks during beta.
http://p.sf.net/sfu/intel-sw-dev
_______________________________________________
perfmon2-devel mailing list
perfmon2-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/perfmon2-devel

Reply via email to