On Mon, Sep 07, 2020 at 06:29:13PM +0200, Ahmed S. Darwish wrote:

> I've been unsuccessful in reproducing this huge, 200+ bytes, difference.
> Can I please get the defconfig and GCC version?

I think I lost the config and it's either gcc-9.3 or gcc-10, I can't
remember.

I just tried with:

  make defconfig
  ./scripts/config --enable PREEMPT --enable DEBUG_ATOMIC_SLEEP
  make oldconfig

And that reproduces things a little, but nowhere near as horrible as I
reported. Clearly I had something mad enabled by accident.

> Here are the two competing implementations:
> 
> noinline void cyc2ns_read_begin_v1(struct cyc2ns_data *data)
> {
>       seqcount_latch_t *seqcount;
>       int seq, idx;
> 
>       preempt_disable_notrace();
> 
>       seqcount = &this_cpu_ptr(&cyc2ns)->seq;
>       do {
>               seq = raw_read_seqcount_latch(seqcount);
>               idx = seq & 1;
> 
>               data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
>               data->cyc2ns_mul    = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
>               data->cyc2ns_shift  = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
> 
>       } while (read_seqcount_latch_retry(seqcount, seq));
> }
> 
> noinline void cyc2ns_read_begin_v2(struct cyc2ns_data *data)
> {
>       int seq, idx;
> 
>       preempt_disable_notrace();
> 
>       do {
>               seq = this_cpu_read(cyc2ns.seq.seqcount.sequence);
>               idx = seq & 1;
> 
>               data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
>               data->cyc2ns_mul    = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
>               data->cyc2ns_shift  = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
> 
>       } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence)));
> }

Don't look at this function in isolation, look at native_sched_clock()
where it's used as a whole.

What happened (afaict) is that the change caused it to use more
registers and ended up spilling crap on the stack.

GCC-9.3 gives me:

(this_cpu variant)


0000 0000000000000c00 <native_sched_clock>:
0000  c00:      e9 65 00 00 00          jmpq   c6a <native_sched_clock+0x6a>
0005  c05:      0f 31                   rdtsc  
0007  c07:      48 c1 e2 20             shl    $0x20,%rdx
000b  c0b:      48 09 c2                or     %rax,%rdx
000e  c0e:      65 ff 05 00 00 00 00    incl   %gs:0x0(%rip)        # c15 
<native_sched_clock+0x15>
0011                    c11: R_X86_64_PC32      __preempt_count-0x4
0015  c15:      65 44 8b 05 00 00 00    mov    %gs:0x0(%rip),%r8d        # c1d 
<native_sched_clock+0x1d>
001c  c1c:      00 
0019                    c19: R_X86_64_PC32      
.data..percpu..shared_aligned+0x1c
001d  c1d:      44 89 c0                mov    %r8d,%eax
0020  c20:      83 e0 01                and    $0x1,%eax
0023  c23:      48 c1 e0 04             shl    $0x4,%rax
0027  c27:      48 8d 88 00 00 00 00    lea    0x0(%rax),%rcx
002a                    c2a: R_X86_64_32S       .data..percpu..shared_aligned
002e  c2e:      65 48 8b 79 08          mov    %gs:0x8(%rcx),%rdi
0033  c33:      65 8b b0 00 00 00 00    mov    %gs:0x0(%rax),%esi
0036                    c36: R_X86_64_32S       .data..percpu..shared_aligned
003a  c3a:      65 8b 49 04             mov    %gs:0x4(%rcx),%ecx
003e  c3e:      65 8b 05 00 00 00 00    mov    %gs:0x0(%rip),%eax        # c45 
<native_sched_clock+0x45>
0041                    c41: R_X86_64_PC32      
.data..percpu..shared_aligned+0x1c
0045  c45:      41 39 c0                cmp    %eax,%r8d
0048  c48:      75 cb                   jne    c15 <native_sched_clock+0x15>
004a  c4a:      89 f0                   mov    %esi,%eax
004c  c4c:      48 f7 e2                mul    %rdx
004f  c4f:      48 0f ad d0             shrd   %cl,%rdx,%rax
0053  c53:      48 d3 ea                shr    %cl,%rdx
0056  c56:      f6 c1 40                test   $0x40,%cl
0059  c59:      48 0f 45 c2             cmovne %rdx,%rax
005d  c5d:      48 01 f8                add    %rdi,%rax
0060  c60:      65 ff 0d 00 00 00 00    decl   %gs:0x0(%rip)        # c67 
<native_sched_clock+0x67>
0063                    c63: R_X86_64_PC32      __preempt_count-0x4
0067  c67:      74 1a                   je     c83 <native_sched_clock+0x83>
0069  c69:      c3                      retq   
006a  c6a:      48 69 05 00 00 00 00    imul   $0xf4240,0x0(%rip),%rax        # 
c75 <native_sched_clock+0x75>
0071  c71:      40 42 0f 00 
006d                    c6d: R_X86_64_PC32      jiffies_64-0x8
0075  c75:      48 ba 00 b8 64 d9 05    movabs $0xfff0be05d964b800,%rdx
007c  c7c:      be f0 ff 
007f  c7f:      48 01 d0                add    %rdx,%rax
0082  c82:      c3                      retq   
0083  c83:      55                      push   %rbp
0084  c84:      48 89 e5                mov    %rsp,%rbp
0087  c87:      e8 00 00 00 00          callq  c8c <native_sched_clock+0x8c>
0088                    c88: R_X86_64_PLT32     
preempt_schedule_notrace_thunk-0x4
008c  c8c:      5d                      pop    %rbp
008d  c8d:      c3                      retq   



(seqcount_latch variant)


0000 0000000000000c20 <native_sched_clock>:
0000  c20:      e9 89 00 00 00          jmpq   cae <native_sched_clock+0x8e>
0005  c25:      55                      push   %rbp
0006  c26:      48 89 e5                mov    %rsp,%rbp
0009  c29:      41 54                   push   %r12
000b  c2b:      53                      push   %rbx
000c  c2c:      48 83 e4 f0             and    $0xfffffffffffffff0,%rsp
0010  c30:      0f 31                   rdtsc  
0012  c32:      48 c1 e2 20             shl    $0x20,%rdx
0016  c36:      48 89 d3                mov    %rdx,%rbx
0019  c39:      48 09 c3                or     %rax,%rbx
001c  c3c:      65 ff 05 00 00 00 00    incl   %gs:0x0(%rip)        # c43 
<native_sched_clock+0x23>
001f                    c3f: R_X86_64_PC32      __preempt_count-0x4
0023  c43:      e8 00 00 00 00          callq  c48 <native_sched_clock+0x28>
0024                    c44: R_X86_64_PLT32     debug_smp_processor_id-0x4
0028  c48:      49 c7 c4 00 00 00 00    mov    $0x0,%r12
002b                    c4b: R_X86_64_32S       
.data..percpu..shared_aligned+0x20
002f  c4f:      89 c0                   mov    %eax,%eax
0031  c51:      4c 03 24 c5 00 00 00    add    0x0(,%rax,8),%r12
0038  c58:      00 
0035                    c55: R_X86_64_32S       __per_cpu_offset
0039  c59:      4c 89 e0                mov    %r12,%rax
003c  c5c:      8b 30                   mov    (%rax),%esi
003e  c5e:      89 f1                   mov    %esi,%ecx
0040  c60:      83 e1 01                and    $0x1,%ecx
0043  c63:      48 c1 e1 04             shl    $0x4,%rcx
0047  c67:      48 8d b9 00 00 00 00    lea    0x0(%rcx),%rdi
004a                    c6a: R_X86_64_32S       .data..percpu..shared_aligned
004e  c6e:      65 4c 8b 47 08          mov    %gs:0x8(%rdi),%r8
0053  c73:      65 44 8b 89 00 00 00    mov    %gs:0x0(%rcx),%r9d
005a  c7a:      00 
0057                    c77: R_X86_64_32S       .data..percpu..shared_aligned
005b  c7b:      65 8b 4f 04             mov    %gs:0x4(%rdi),%ecx
005f  c7f:      8b 10                   mov    (%rax),%edx
0061  c81:      39 d6                   cmp    %edx,%esi
0063  c83:      75 d7                   jne    c5c <native_sched_clock+0x3c>
0065  c85:      44 89 c8                mov    %r9d,%eax
0068  c88:      48 f7 e3                mul    %rbx
006b  c8b:      48 0f ad d0             shrd   %cl,%rdx,%rax
006f  c8f:      48 d3 ea                shr    %cl,%rdx
0072  c92:      f6 c1 40                test   $0x40,%cl
0075  c95:      48 0f 45 c2             cmovne %rdx,%rax
0079  c99:      4c 01 c0                add    %r8,%rax
007c  c9c:      65 ff 0d 00 00 00 00    decl   %gs:0x0(%rip)        # ca3 
<native_sched_clock+0x83>
007f                    c9f: R_X86_64_PC32      __preempt_count-0x4
0083  ca3:      74 22                   je     cc7 <native_sched_clock+0xa7>
0085  ca5:      48 8d 65 f0             lea    -0x10(%rbp),%rsp
0089  ca9:      5b                      pop    %rbx
008a  caa:      41 5c                   pop    %r12
008c  cac:      5d                      pop    %rbp
008d  cad:      c3                      retq   
008e  cae:      48 69 05 00 00 00 00    imul   $0xf4240,0x0(%rip),%rax        # 
cb9 <native_sched_clock+0x99>
0095  cb5:      40 42 0f 00 
0091                    cb1: R_X86_64_PC32      jiffies_64-0x8
0099  cb9:      49 b8 00 b8 64 d9 05    movabs $0xfff0be05d964b800,%r8
00a0  cc0:      be f0 ff 
00a3  cc3:      4c 01 c0                add    %r8,%rax
00a6  cc6:      c3                      retq   
00a7  cc7:      e8 00 00 00 00          callq  ccc <native_sched_clock+0xac>
00a8                    cc8: R_X86_64_PLT32     
preempt_schedule_notrace_thunk-0x4
00ac  ccc:      eb d7                   jmp    ca5 <native_sched_clock+0x85>


And you see it starting to spill stuff.

When we disable DEBUG_ATOMIC_SLEEP it becomes much saner again:


0000 0000000000000b40 <native_sched_clock>:
0000  b40:      e9 6b 00 00 00          jmpq   bb0 <native_sched_clock+0x70>
0005  b45:      0f 31                   rdtsc  
0007  b47:      48 c1 e2 20             shl    $0x20,%rdx
000b  b4b:      48 09 c2                or     %rax,%rdx
000e  b4e:      65 ff 05 00 00 00 00    incl   %gs:0x0(%rip)        # b55 
<native_sched_clock+0x15>
0011                    b51: R_X86_64_PC32      __preempt_count-0x4
0015  b55:      49 c7 c0 00 00 00 00    mov    $0x0,%r8
0018                    b58: R_X86_64_32S       
.data..percpu..shared_aligned+0x20
001c  b5c:      65 4c 03 05 00 00 00    add    %gs:0x0(%rip),%r8        # b64 
<native_sched_clock+0x24>
0023  b63:      00 
0020                    b60: R_X86_64_PC32      this_cpu_off-0x4
0024  b64:      41 8b 30                mov    (%r8),%esi
0027  b67:      89 f1                   mov    %esi,%ecx
0029  b69:      83 e1 01                and    $0x1,%ecx
002c  b6c:      48 c1 e1 04             shl    $0x4,%rcx
0030  b70:      48 8d b9 00 00 00 00    lea    0x0(%rcx),%rdi
0033                    b73: R_X86_64_32S       .data..percpu..shared_aligned
0037  b77:      65 4c 8b 57 08          mov    %gs:0x8(%rdi),%r10
003c  b7c:      65 44 8b 89 00 00 00    mov    %gs:0x0(%rcx),%r9d
0043  b83:      00 
0040                    b80: R_X86_64_32S       .data..percpu..shared_aligned
0044  b84:      65 8b 4f 04             mov    %gs:0x4(%rdi),%ecx
0048  b88:      41 8b 38                mov    (%r8),%edi
004b  b8b:      39 fe                   cmp    %edi,%esi
004d  b8d:      75 d5                   jne    b64 <native_sched_clock+0x24>
004f  b8f:      44 89 c8                mov    %r9d,%eax
0052  b92:      48 f7 e2                mul    %rdx
0055  b95:      48 0f ad d0             shrd   %cl,%rdx,%rax
0059  b99:      48 d3 ea                shr    %cl,%rdx
005c  b9c:      f6 c1 40                test   $0x40,%cl
005f  b9f:      48 0f 45 c2             cmovne %rdx,%rax
0063  ba3:      4c 01 d0                add    %r10,%rax
0066  ba6:      65 ff 0d 00 00 00 00    decl   %gs:0x0(%rip)        # bad 
<native_sched_clock+0x6d>
0069                    ba9: R_X86_64_PC32      __preempt_count-0x4
006d  bad:      74 1a                   je     bc9 <native_sched_clock+0x89>
006f  baf:      c3                      retq   
0070  bb0:      48 69 05 00 00 00 00    imul   $0xf4240,0x0(%rip),%rax        # 
bbb <native_sched_clock+0x7b>
0077  bb7:      40 42 0f 00 
0073                    bb3: R_X86_64_PC32      jiffies_64-0x8
007b  bbb:      49 ba 00 b8 64 d9 05    movabs $0xfff0be05d964b800,%r10
0082  bc2:      be f0 ff 
0085  bc5:      4c 01 d0                add    %r10,%rax
0088  bc8:      c3                      retq   
0089  bc9:      55                      push   %rbp
008a  bca:      48 89 e5                mov    %rsp,%rbp
008d  bcd:      e8 00 00 00 00          callq  bd2 <native_sched_clock+0x92>
008e                    bce: R_X86_64_PLT32     
preempt_schedule_notrace_thunk-0x4
0092  bd2:      5d                      pop    %rbp
0093  bd3:      c3                      retq   

But that's still slightly larger.



Anyway, I frobbed the patch to use the this_cpu variant, and I've queued
the lot.

Reply via email to