Re: [PATCH v4 3/3] locking/rwsem: Optimize down_read_trylock()
On 02/21/2019 09:14 AM, Will Deacon wrote:
> On Wed, Feb 13, 2019 at 05:00:17PM -0500, Waiman Long wrote:
>> Modify __down_read_trylock() to optimize for an unlocked rwsem and make
>> it generate slightly better code.
>>
>> Before this patch, down_read_trylock:
>>
>>    0x0000 <+0>:   callq  0x5
>>    0x0005 <+5>:   jmp    0x18
>>    0x0007 <+7>:   lea    0x1(%rdx),%rcx
>>    0x000b <+11>:  mov    %rdx,%rax
>>    0x000e <+14>:  lock cmpxchg %rcx,(%rdi)
>>    0x0013 <+19>:  cmp    %rax,%rdx
>>    0x0016 <+22>:  je     0x23
>>    0x0018 <+24>:  mov    (%rdi),%rdx
>>    0x001b <+27>:  test   %rdx,%rdx
>>    0x001e <+30>:  jns    0x7
>>    0x0020 <+32>:  xor    %eax,%eax
>>    0x0022 <+34>:  retq
>>    0x0023 <+35>:  mov    %gs:0x0,%rax
>>    0x002c <+44>:  or     $0x3,%rax
>>    0x0030 <+48>:  mov    %rax,0x20(%rdi)
>>    0x0034 <+52>:  mov    $0x1,%eax
>>    0x0039 <+57>:  retq
>>
>> After patch, down_read_trylock:
>>
>>    0x0000 <+0>:   callq  0x5
>>    0x0005 <+5>:   xor    %eax,%eax
>>    0x0007 <+7>:   lea    0x1(%rax),%rdx
>>    0x000b <+11>:  lock cmpxchg %rdx,(%rdi)
>>    0x0010 <+16>:  jne    0x29
>>    0x0012 <+18>:  mov    %gs:0x0,%rax
>>    0x001b <+27>:  or     $0x3,%rax
>>    0x001f <+31>:  mov    %rax,0x20(%rdi)
>>    0x0023 <+35>:  mov    $0x1,%eax
>>    0x0028 <+40>:  retq
>>    0x0029 <+41>:  test   %rax,%rax
>>    0x002c <+44>:  jns    0x7
>>    0x002e <+46>:  xor    %eax,%eax
>>    0x0030 <+48>:  retq
>>
>> By using a rwsem microbenchmark, the down_read_trylock() rate (with a
>> load of 10 to lengthen the lock critical section) on a x86-64 system
>> before and after the patch were:
>>
>>                  Before Patch    After Patch
>>    # of Threads     rlock           rlock
>>    ------------     -----           -----
>>         1           14,496          14,716
>>         2            8,644           8,453
>>         4            6,799           6,983
>>         8            5,664           7,190
>>
>> On a ARM64 system, the performance results were:
>>
>>                  Before Patch    After Patch
>>    # of Threads     rlock           rlock
>>    ------------     -----           -----
>>         1           23,676          24,488
>>         2            7,697           9,502
>>         4            4,945           3,440
>>         8            2,641           1,603
>>
>> For the uncontended case (1 thread), the new down_read_trylock() is a
>> little bit faster. For the contended cases, the new down_read_trylock()
>> performs pretty well in x86-64, but performance degrades at high
>> contention level on ARM64.
>>
>> Suggested-by: Linus Torvalds
>> Signed-off-by: Waiman Long
>> ---
>>  kernel/locking/rwsem.h | 13 +++++++++-----
>>  1 file changed, 8 insertions(+), 5 deletions(-)
>>
>> diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
>> index 45ee002..1f5775a 100644
>> --- a/kernel/locking/rwsem.h
>> +++ b/kernel/locking/rwsem.h
>> @@ -174,14 +174,17 @@ static inline int __down_read_killable(struct rw_semaphore *sem)
>>
>>  static inline int __down_read_trylock(struct rw_semaphore *sem)
>>  {
>> -	long tmp;
>> +	/*
>> +	 * Optimize for the case when the rwsem is not locked at all.
>> +	 */
>> +	long tmp = RWSEM_UNLOCKED_VALUE;
>>
>> -	while ((tmp = atomic_long_read(&sem->count)) >= 0) {
>> -		if (tmp == atomic_long_cmpxchg_acquire(&sem->count, tmp,
>> -				tmp + RWSEM_ACTIVE_READ_BIAS)) {
>> +	do {
>> +		if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
>> +					tmp + RWSEM_ACTIVE_READ_BIAS)) {
>>  			return 1;
>>  		}
>> -	}
>> +	} while (tmp >= 0);
> Nit: but I guess that should be RWSEM_UNLOCKED_VALUE instead of 0.
>
> Will

Using RWSEM_UNLOCKED_VALUE may be better. Anyway, it is not a big deal as
I am going to change this again in a later patch.

Thanks,
Longman
Re: [PATCH v4 3/3] locking/rwsem: Optimize down_read_trylock()
On Wed, Feb 13, 2019 at 05:00:17PM -0500, Waiman Long wrote:
> Modify __down_read_trylock() to optimize for an unlocked rwsem and make
> it generate slightly better code.
>
> Before this patch, down_read_trylock:
>
>    0x0000 <+0>:   callq  0x5
>    0x0005 <+5>:   jmp    0x18
>    0x0007 <+7>:   lea    0x1(%rdx),%rcx
>    0x000b <+11>:  mov    %rdx,%rax
>    0x000e <+14>:  lock cmpxchg %rcx,(%rdi)
>    0x0013 <+19>:  cmp    %rax,%rdx
>    0x0016 <+22>:  je     0x23
>    0x0018 <+24>:  mov    (%rdi),%rdx
>    0x001b <+27>:  test   %rdx,%rdx
>    0x001e <+30>:  jns    0x7
>    0x0020 <+32>:  xor    %eax,%eax
>    0x0022 <+34>:  retq
>    0x0023 <+35>:  mov    %gs:0x0,%rax
>    0x002c <+44>:  or     $0x3,%rax
>    0x0030 <+48>:  mov    %rax,0x20(%rdi)
>    0x0034 <+52>:  mov    $0x1,%eax
>    0x0039 <+57>:  retq
>
> After patch, down_read_trylock:
>
>    0x0000 <+0>:   callq  0x5
>    0x0005 <+5>:   xor    %eax,%eax
>    0x0007 <+7>:   lea    0x1(%rax),%rdx
>    0x000b <+11>:  lock cmpxchg %rdx,(%rdi)
>    0x0010 <+16>:  jne    0x29
>    0x0012 <+18>:  mov    %gs:0x0,%rax
>    0x001b <+27>:  or     $0x3,%rax
>    0x001f <+31>:  mov    %rax,0x20(%rdi)
>    0x0023 <+35>:  mov    $0x1,%eax
>    0x0028 <+40>:  retq
>    0x0029 <+41>:  test   %rax,%rax
>    0x002c <+44>:  jns    0x7
>    0x002e <+46>:  xor    %eax,%eax
>    0x0030 <+48>:  retq
>
> By using a rwsem microbenchmark, the down_read_trylock() rate (with a
> load of 10 to lengthen the lock critical section) on a x86-64 system
> before and after the patch were:
>
>                 Before Patch    After Patch
>   # of Threads     rlock           rlock
>   ------------     -----           -----
>        1           14,496          14,716
>        2            8,644           8,453
>        4            6,799           6,983
>        8            5,664           7,190
>
> On a ARM64 system, the performance results were:
>
>                 Before Patch    After Patch
>   # of Threads     rlock           rlock
>   ------------     -----           -----
>        1           23,676          24,488
>        2            7,697           9,502
>        4            4,945           3,440
>        8            2,641           1,603
>
> For the uncontended case (1 thread), the new down_read_trylock() is a
> little bit faster. For the contended cases, the new down_read_trylock()
> performs pretty well in x86-64, but performance degrades at high
> contention level on ARM64.
>
> Suggested-by: Linus Torvalds
> Signed-off-by: Waiman Long
> ---
>  kernel/locking/rwsem.h | 13 +++++++++-----
>  1 file changed, 8 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
> index 45ee002..1f5775a 100644
> --- a/kernel/locking/rwsem.h
> +++ b/kernel/locking/rwsem.h
> @@ -174,14 +174,17 @@ static inline int __down_read_killable(struct rw_semaphore *sem)
>
>  static inline int __down_read_trylock(struct rw_semaphore *sem)
>  {
> -	long tmp;
> +	/*
> +	 * Optimize for the case when the rwsem is not locked at all.
> +	 */
> +	long tmp = RWSEM_UNLOCKED_VALUE;
>
> -	while ((tmp = atomic_long_read(&sem->count)) >= 0) {
> -		if (tmp == atomic_long_cmpxchg_acquire(&sem->count, tmp,
> -				tmp + RWSEM_ACTIVE_READ_BIAS)) {
> +	do {
> +		if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
> +					tmp + RWSEM_ACTIVE_READ_BIAS)) {
>  			return 1;
>  		}
> -	}
> +	} while (tmp >= 0);

Nit: but I guess that should be RWSEM_UNLOCKED_VALUE instead of 0.

Will
[PATCH v4 3/3] locking/rwsem: Optimize down_read_trylock()
Modify __down_read_trylock() to optimize for an unlocked rwsem and make
it generate slightly better code.

Before this patch, down_read_trylock:

   0x0000 <+0>:   callq  0x5
   0x0005 <+5>:   jmp    0x18
   0x0007 <+7>:   lea    0x1(%rdx),%rcx
   0x000b <+11>:  mov    %rdx,%rax
   0x000e <+14>:  lock cmpxchg %rcx,(%rdi)
   0x0013 <+19>:  cmp    %rax,%rdx
   0x0016 <+22>:  je     0x23
   0x0018 <+24>:  mov    (%rdi),%rdx
   0x001b <+27>:  test   %rdx,%rdx
   0x001e <+30>:  jns    0x7
   0x0020 <+32>:  xor    %eax,%eax
   0x0022 <+34>:  retq
   0x0023 <+35>:  mov    %gs:0x0,%rax
   0x002c <+44>:  or     $0x3,%rax
   0x0030 <+48>:  mov    %rax,0x20(%rdi)
   0x0034 <+52>:  mov    $0x1,%eax
   0x0039 <+57>:  retq

After patch, down_read_trylock:

   0x0000 <+0>:   callq  0x5
   0x0005 <+5>:   xor    %eax,%eax
   0x0007 <+7>:   lea    0x1(%rax),%rdx
   0x000b <+11>:  lock cmpxchg %rdx,(%rdi)
   0x0010 <+16>:  jne    0x29
   0x0012 <+18>:  mov    %gs:0x0,%rax
   0x001b <+27>:  or     $0x3,%rax
   0x001f <+31>:  mov    %rax,0x20(%rdi)
   0x0023 <+35>:  mov    $0x1,%eax
   0x0028 <+40>:  retq
   0x0029 <+41>:  test   %rax,%rax
   0x002c <+44>:  jns    0x7
   0x002e <+46>:  xor    %eax,%eax
   0x0030 <+48>:  retq

By using a rwsem microbenchmark, the down_read_trylock() rate (with a
load of 10 to lengthen the lock critical section) on a x86-64 system
before and after the patch were:

                Before Patch    After Patch
  # of Threads     rlock           rlock
  ------------     -----           -----
       1           14,496          14,716
       2            8,644           8,453
       4            6,799           6,983
       8            5,664           7,190

On a ARM64 system, the performance results were:

                Before Patch    After Patch
  # of Threads     rlock           rlock
  ------------     -----           -----
       1           23,676          24,488
       2            7,697           9,502
       4            4,945           3,440
       8            2,641           1,603

For the uncontended case (1 thread), the new down_read_trylock() is a
little bit faster. For the contended cases, the new down_read_trylock()
performs pretty well in x86-64, but performance degrades at high
contention level on ARM64.
Suggested-by: Linus Torvalds
Signed-off-by: Waiman Long
---
 kernel/locking/rwsem.h | 13 +++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index 45ee002..1f5775a 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -174,14 +174,17 @@ static inline int __down_read_killable(struct rw_semaphore *sem)
 
 static inline int __down_read_trylock(struct rw_semaphore *sem)
 {
-	long tmp;
+	/*
+	 * Optimize for the case when the rwsem is not locked at all.
+	 */
+	long tmp = RWSEM_UNLOCKED_VALUE;
 
-	while ((tmp = atomic_long_read(&sem->count)) >= 0) {
-		if (tmp == atomic_long_cmpxchg_acquire(&sem->count, tmp,
-				tmp + RWSEM_ACTIVE_READ_BIAS)) {
+	do {
+		if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
+					tmp + RWSEM_ACTIVE_READ_BIAS)) {
 			return 1;
 		}
-	}
+	} while (tmp >= 0);
 	return 0;
 }
-- 
1.8.3.1