On Mon, May 25, 2020 at 12:40:38PM +0200, Peter Zijlstra wrote: > On Mon, May 25, 2020 at 12:02:48PM +0200, Rasmus Villemoes wrote: > > > Naive question: did you check disassembly to see whether gcc threw your > > native_get_debugreg() away, given that the asm isn't volatile and the > > result is not used for anything? Testing here only shows a "mov > > %r9,%db7", but the read did seem to get thrown away. > > Argh.. no I did not. Writing it all in asm gets me: > > [ 1.627405] XXX: 3900 8304 22632 > > which is a lot worse...
+ u64 empty = 0, read = 0, write = 0, cpu = 0, cpu1 = 0; + unsigned long dr7; + + for (i=0; i<100; i++) { + u64 s; + + s = rdtsc(); + asm volatile ("lfence; lfence;"); + empty += rdtsc() - s; + + s = rdtsc(); + asm volatile ("lfence; mov %%db7, %0; lfence;" : "=r" (dr7)); + read += rdtsc() - s; + + s = rdtsc(); + asm volatile ("lfence; mov %0, %%db7; lfence;" :: "r" (dr7)); + write += rdtsc() - s; + + s = rdtsc(); + asm volatile ("lfence; mov %0, %%db7; lfence;" :: "r" (dr7)); + write += rdtsc() - s; + + clflush(this_cpu_ptr(&cpu_dr7)); + + s = rdtsc(); + asm volatile ("lfence;"); + dr7 = this_cpu_read(cpu_dr7); + asm volatile ("lfence;"); + cpu += rdtsc() - s; + + s = rdtsc(); + asm volatile ("lfence;"); + dr7 = this_cpu_read(cpu_dr7); + asm volatile ("lfence;"); + cpu1 += rdtsc() - s; + } + + printk("XXX: %ld %ld %ld %ld %ld\n", empty, read, write, cpu, cpu1); [ 1.628252] XXX: 3820 8224 45516 35560 4800 Which still seems to suggest using DR7 directly is probably a good thing. It's slower than an L1 hit, but massively faster than a full miss. 
--- 11f: 0f 31 rdtsc 121: 48 89 d1 mov %rdx,%rcx 124: 48 89 c6 mov %rax,%rsi 127: 0f ae e8 lfence 12a: 0f ae e8 lfence 12d: 0f 31 rdtsc 12f: 48 c1 e2 20 shl $0x20,%rdx 133: 48 c1 e1 20 shl $0x20,%rcx 137: 48 09 c2 or %rax,%rdx 13a: 48 09 f1 or %rsi,%rcx 13d: 48 29 ca sub %rcx,%rdx 140: 48 01 d3 add %rdx,%rbx 143: 0f 31 rdtsc 145: 48 89 d1 mov %rdx,%rcx 148: 48 89 c6 mov %rax,%rsi 14b: 0f ae e8 lfence 14e: 41 0f 21 fb mov %db7,%r11 152: 0f ae e8 lfence 155: 0f 31 rdtsc 157: 48 c1 e2 20 shl $0x20,%rdx 15b: 48 c1 e1 20 shl $0x20,%rcx 15f: 48 09 c2 or %rax,%rdx 162: 48 09 f1 or %rsi,%rcx 165: 48 29 ca sub %rcx,%rdx 168: 48 01 d5 add %rdx,%rbp 16b: 0f 31 rdtsc 16d: 48 89 d6 mov %rdx,%rsi 170: 49 89 c1 mov %rax,%r9 173: 0f ae e8 lfence 176: 41 0f 23 fb mov %r11,%db7 17a: 0f ae e8 lfence 17d: 0f 31 rdtsc 17f: 48 89 d7 mov %rdx,%rdi 182: 49 89 c2 mov %rax,%r10 185: 0f 31 rdtsc 187: 48 89 d1 mov %rdx,%rcx 18a: 49 89 c0 mov %rax,%r8 18d: 0f ae e8 lfence 190: 41 0f 23 fb mov %r11,%db7 194: 0f ae e8 lfence 197: 0f 31 rdtsc 199: 48 c1 e2 20 shl $0x20,%rdx 19d: 48 c1 e6 20 shl $0x20,%rsi 1a1: 48 09 c2 or %rax,%rdx 1a4: 48 89 f8 mov %rdi,%rax 1a7: 48 c1 e1 20 shl $0x20,%rcx 1ab: 48 c1 e0 20 shl $0x20,%rax 1af: 49 09 f1 or %rsi,%r9 1b2: 49 09 c8 or %rcx,%r8 1b5: 49 09 c2 or %rax,%r10 1b8: 4a 8d 04 12 lea (%rdx,%r10,1),%rax 1bc: 48 c7 c2 00 00 00 00 mov $0x0,%rdx 1bf: R_X86_64_32S cpu_dr7 1c3: 4c 29 c8 sub %r9,%rax 1c6: 4c 29 c0 sub %r8,%rax 1c9: 49 01 c4 add %rax,%r12 1cc: 48 89 14 24 mov %rdx,(%rsp) 1d0: 48 89 54 24 08 mov %rdx,0x8(%rsp) 1d5: e8 00 00 00 00 callq 1da <sched_init+0xe1> 1d6: R_X86_64_PLT32 debug_smp_processor_id-0x4 1da: 48 c7 c1 00 00 00 00 mov $0x0,%rcx 1dd: R_X86_64_32S __per_cpu_offset 1e1: 48 8b 14 24 mov (%rsp),%rdx 1e5: 89 c0 mov %eax,%eax 1e7: 48 03 14 c1 add (%rcx,%rax,8),%rdx 1eb: 0f ae 3a clflush (%rdx) 1ee: 0f 31 rdtsc 1f0: 48 89 d1 mov %rdx,%rcx 1f3: 48 89 c6 mov %rax,%rsi 1f6: 0f ae e8 lfence 1f9: 65 48 8b 05 00 00 00 mov %gs:0x0(%rip),%rax # 201 
<sched_init+0x108> 200: 00 1fd: R_X86_64_PC32 cpu_dr7-0x4 201: 0f ae e8 lfence 204: 0f 31 rdtsc 206: 48 c1 e2 20 shl $0x20,%rdx 20a: 48 c1 e1 20 shl $0x20,%rcx 20e: 48 09 c2 or %rax,%rdx 211: 48 09 f1 or %rsi,%rcx 214: 48 29 ca sub %rcx,%rdx 217: 49 01 d5 add %rdx,%r13 21a: 0f 31 rdtsc 21c: 48 89 d1 mov %rdx,%rcx 21f: 48 89 c6 mov %rax,%rsi 222: 0f ae e8 lfence 225: 65 48 8b 05 00 00 00 mov %gs:0x0(%rip),%rax # 22d <sched_init+0x134> 22c: 00 229: R_X86_64_PC32 cpu_dr7-0x4 22d: 0f ae e8 lfence 230: 0f 31 rdtsc 232: 48 c1 e2 20 shl $0x20,%rdx 236: 48 c1 e1 20 shl $0x20,%rcx 23a: 48 09 c2 or %rax,%rdx 23d: 48 09 f1 or %rsi,%rcx 240: 48 29 ca sub %rcx,%rdx 243: 49 01 d6 add %rdx,%r14 246: 41 ff cf dec %r15d 249: 0f 85 d0 fe ff ff jne 11f <sched_init+0x26>