While migrating our [game engine](https://github.com/yglukhov/rod/) to ECS 
architecture we went out to validate some simple ideas about CPU caches, and 
were a bit surprised. I'm hoping someone can explain what's happening here. 
    
    
    import random, std/monotimes
    
    const MAX = 9000000
    
    type Elem = int
    
    var x = newSeq[Elem](MAX)
    var y = newSeq[Elem](MAX)
    var z = newSeq[Elem](MAX)
    var m = newSeq[Elem](MAX)
    
    type S = object
      x, y, z, m: Elem
    
    var d = newSeq[S](MAX)
    
    template bench(name: string, code: untyped) =
      let s = getMonotime().ticks
      code
      let e = getMonotime().ticks
      echo "test ", name, ": ", e - s
    
    randomize()
    
    proc testSOA(i: int) =
      m[i] = x[i] + y[i] + z[i]
    
    proc testAOS(i: int) =
      d[i].m = d[i].x + d[i].y + d[i].z
    
    proc test() =
        for i in 0 .. MAX - 1:
            x[i] = rand(Elem.high)
            y[i] = rand(Elem.high)
            z[i] = rand(Elem.high)
            m[i] = rand(Elem.high)
            d[i] = S(x: x[i], y: y[i], z: z[i], m: m[i])
        
        bench "A1":
            for i in 0 .. MAX - 1:
                testSOA(i)
                # m[i] = x[i] + y[i] + z[i]
        
        bench "S1":
            for i in 0 .. MAX - 1:
               testAOS(i)
    
    test()
    
    
    Run
    
    
    nim c -d:release -d:danger test.nim
    
    
    Run
    
    
    test A1: 9957102
    test S1: 15503720
    
    
    Run

Which suggests that "array of structs" is ~1.5 times slower than "struct of 
arrays". Also disassembly shows that `testSOA` has more instructions than 
`testAOS`.

  * `testSOA`:


    
    
    Dump of assembler code for function testSOA_t_92:
       0x000000000000e260 <+0>: mov    0x1a0b1(%rip),%rdx        # 0x28318 
<x_t_26>
       0x000000000000e267 <+7>: mov    0x1a0a2(%rip),%rax        # 0x28310 
<y_t_34>
       0x000000000000e26e <+14>:        add    $0x2,%rdi
       0x000000000000e272 <+18>:        mov    (%rax,%rdi,8),%rax
       0x000000000000e276 <+22>:        add    (%rdx,%rdi,8),%rax
       0x000000000000e27a <+26>:        mov    0x1a087(%rip),%rdx        # 
0x28308 <z_t_42>
       0x000000000000e281 <+33>:        add    (%rdx,%rdi,8),%rax
       0x000000000000e285 <+37>:        mov    0x1a074(%rip),%rdx        # 
0x28300 <m_t_50>
       0x000000000000e28c <+44>:        mov    %rax,(%rdx,%rdi,8)
       0x000000000000e290 <+48>:        ret
    
    
    Run

  * `testAOS`:


    
    
    Dump of assembler code for function testAOS_t_94:
       0x000000000000e2a0 <+0>: shl    $0x5,%rdi
       0x000000000000e2a4 <+4>: add    0x1a04d(%rip),%rdi        # 0x282f8 
<d_t_86>
       0x000000000000e2ab <+11>:        mov    0x18(%rdi),%rax
       0x000000000000e2af <+15>:        add    0x10(%rdi),%rax
       0x000000000000e2b3 <+19>:        add    0x20(%rdi),%rax
       0x000000000000e2b7 <+23>:        mov    %rax,0x28(%rdi)
       0x000000000000e2bb <+27>:        ret
    
    
    Run

So my intuition tells me that AOS should be just as fast or faster than SOA, 
but apparently I'm wrong. Why?

Reply via email to