Playing with CPU caches

yglukhov Tue, 03 Aug 2021 02:30:13 -0700

While migrating our [game engine](https://github.com/yglukhov/rod/) to ECS 
architecture we went out to validate some simple ideas about CPU caches, and 
were a bit surprised. I'm hoping someone can explain what's happening here. 
    
    
    import random, std/monotimes
    
    const MAX = 9000000
    
    type Elem = int
    
    var x = newSeq[Elem](MAX)
    var y = newSeq[Elem](MAX)
    var z = newSeq[Elem](MAX)
    var m = newSeq[Elem](MAX)
    
    type S = object
      x, y, z, m: Elem
    
    var d = newSeq[S](MAX)
    
    template bench(name: string, code: untyped) =
      let s = getMonotime().ticks
      code
      let e = getMonotime().ticks
      echo "test ", name, ": ", e - s
    
    randomize()
    
    proc testSOA(i: int) =
      m[i] = x[i] + y[i] + z[i]
    
    proc testAOS(i: int) =
      d[i].m = d[i].x + d[i].y + d[i].z
    
    proc test() =
        for i in 0 .. MAX - 1:
            x[i] = rand(Elem.high)
            y[i] = rand(Elem.high)
            z[i] = rand(Elem.high)
            m[i] = rand(Elem.high)
            d[i] = S(x: x[i], y: y[i], z: z[i], m: m[i])
        
        bench "A1":
            for i in 0 .. MAX - 1:
                testSOA(i)
                # m[i] = x[i] + y[i] + z[i]
        
        bench "S1":
            for i in 0 .. MAX - 1:
               testAOS(i)
    
    test()
    
    
    Run
    
    
    nim c -d:release -d:danger test.nim
    
    
    Run
    
    
    test A1: 9957102
    test S1: 15503720
    
    
    Run


Which suggests that "array of structs" is ~1.5 times slower than "struct of 
arrays". Also disassembly shows that `testSOA` has more instructions than 
`testAOS`.

  * `testSOA`:


    
    
    Dump of assembler code for function testSOA_t_92:
       0x000000000000e260 <+0>: mov    0x1a0b1(%rip),%rdx        # 0x28318 
<x_t_26>
       0x000000000000e267 <+7>: mov    0x1a0a2(%rip),%rax        # 0x28310 
<y_t_34>
       0x000000000000e26e <+14>:        add    $0x2,%rdi
       0x000000000000e272 <+18>:        mov    (%rax,%rdi,8),%rax
       0x000000000000e276 <+22>:        add    (%rdx,%rdi,8),%rax
       0x000000000000e27a <+26>:        mov    0x1a087(%rip),%rdx        # 
0x28308 <z_t_42>
       0x000000000000e281 <+33>:        add    (%rdx,%rdi,8),%rax
       0x000000000000e285 <+37>:        mov    0x1a074(%rip),%rdx        # 
0x28300 <m_t_50>
       0x000000000000e28c <+44>:        mov    %rax,(%rdx,%rdi,8)
       0x000000000000e290 <+48>:        ret
    
    
    Run

  * `testAOS`:


    
    
    Dump of assembler code for function testAOS_t_94:
       0x000000000000e2a0 <+0>: shl    $0x5,%rdi
       0x000000000000e2a4 <+4>: add    0x1a04d(%rip),%rdi        # 0x282f8 
<d_t_86>
       0x000000000000e2ab <+11>:        mov    0x18(%rdi),%rax
       0x000000000000e2af <+15>:        add    0x10(%rdi),%rax
       0x000000000000e2b3 <+19>:        add    0x20(%rdi),%rax
       0x000000000000e2b7 <+23>:        mov    %rax,0x28(%rdi)
       0x000000000000e2bb <+27>:        ret
    
    
    Run

So my intuition tells me that AOS should be just as fast or faster than SOA, 
but apparently I'm wrong. Why?

Playing with CPU caches

Reply via email to