Re: [PATCH v2 4/5] locking/lockdep: Make class->ops a percpu counter
On 10/04/2018 06:14 AM, Ingo Molnar wrote: > * Waiman Long wrote: > >> diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h >> index b0d0b51c4d85..1fd82ff99c65 100644 >> --- a/include/linux/lockdep.h >> +++ b/include/linux/lockdep.h >> @@ -99,13 +99,8 @@ struct lock_class { >> */ >> unsigned int version; >> >> -/* >> - * Statistics counter: >> - */ >> -unsigned long ops; >> - >> -const char *name; >> int name_version; >> +const char *name; > 'name' gets moved by the patch - better structure packing on 64-bit kernels? > > Looks good to me otherwise. > > Thanks, > > Ingo Yes, that is done on purpose to pack the 2 integers together to remove two 4-byte holes on 64-bit archs. Cheers, Longman
Re: [PATCH v2 4/5] locking/lockdep: Make class->ops a percpu counter
On 10/04/2018 06:14 AM, Ingo Molnar wrote: > * Waiman Long wrote: > >> diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h >> index b0d0b51c4d85..1fd82ff99c65 100644 >> --- a/include/linux/lockdep.h >> +++ b/include/linux/lockdep.h >> @@ -99,13 +99,8 @@ struct lock_class { >> */ >> unsigned int version; >> >> -/* >> - * Statistics counter: >> - */ >> -unsigned long ops; >> - >> -const char *name; >> int name_version; >> +const char *name; > 'name' gets moved by the patch - better structure packing on 64-bit kernels? > > Looks good to me otherwise. > > Thanks, > > Ingo Yes, that is done on purpose to pack the 2 integers together to remove two 4-byte holes on 64-bit archs. Cheers, Longman
Re: [PATCH v2 4/5] locking/lockdep: Make class->ops a percpu counter
* Waiman Long wrote: > diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h > index b0d0b51c4d85..1fd82ff99c65 100644 > --- a/include/linux/lockdep.h > +++ b/include/linux/lockdep.h > @@ -99,13 +99,8 @@ struct lock_class { > */ > unsigned int version; > > - /* > - * Statistics counter: > - */ > - unsigned long ops; > - > - const char *name; > int name_version; > + const char *name; 'name' gets moved by the patch - better structure packing on 64-bit kernels? Looks good to me otherwise. Thanks, Ingo
Re: [PATCH v2 4/5] locking/lockdep: Make class->ops a percpu counter
* Waiman Long wrote: > diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h > index b0d0b51c4d85..1fd82ff99c65 100644 > --- a/include/linux/lockdep.h > +++ b/include/linux/lockdep.h > @@ -99,13 +99,8 @@ struct lock_class { > */ > unsigned int version; > > - /* > - * Statistics counter: > - */ > - unsigned long ops; > - > - const char *name; > int name_version; > + const char *name; 'name' gets moved by the patch - better structure packing on 64-bit kernels? Looks good to me otherwise. Thanks, Ingo
Re: [PATCH v2 4/5] locking/lockdep: Make class->ops a percpu counter
On 10/03/2018 09:57 AM, Waiman Long wrote: > On 10/03/2018 03:48 AM, Peter Zijlstra wrote: >> On Tue, Oct 02, 2018 at 04:19:19PM -0400, Waiman Long wrote: >>> +DEFINE_PER_CPU(unsigned long [MAX_LOCKDEP_KEYS], lock_class_ops); >>> @@ -179,9 +181,30 @@ DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); >>> } \ >>> __total;\ >>> }) >>> + >>> +static inline void debug_class_ops_inc(struct lock_class *class) >>> +{ >>> + int idx; >>> + >>> + idx = class - lock_classes; >>> + __this_cpu_inc(lock_class_ops[idx]); >>> +} >>> + >>> +static inline unsigned long debug_class_ops_read(struct lock_class *class) >>> +{ >>> + int idx, cpu; >>> + unsigned long ops = 0; >>> + >>> + idx = class - lock_classes; >>> + for_each_possible_cpu(cpu) >>> + ops += per_cpu(lock_class_ops[idx], cpu); >>> + return ops; >>> +} >> Would it make sense to stick that in struct lockdep_stats ? >> >> A little something like: >> >> struct lockdep_stats { >> /* ... */ >> int lock_class_ops[MAX_LOCKDEP_KEYS]; >> }; >> >> and then use: >> >> debug_atomic_inc(lock_class_ops[idx]); >> > Yes, I can certainly do that. Thanks for pointing this out. Will post an > updated patch just for this one. > > Cheers, > Longman > Attached is the updated patch 4. Please let me know if you guys are OK with that. Cheers, Longman >From e615f62dfa155cbf9c04318f146570047f2c0c0c Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 3 Oct 2018 11:36:24 -0400 Subject: [PATCH v3 4/5] locking/lockdep: Make class->ops a percpu counter A sizable portion of the CPU cycles spent on the __lock_acquire() is used up by the atomic increment of class->ops stat counter. By taking it out from the lock_class structure and changing it to a per-cpu per-lock-class counter, we can reduce the amount of cacheline contention on the class structure when multiple CPUs are trying to acquire locks of the same class simultaneously. 
To limit the increase in memory consumption because of the percpu nature of that counter, it is now put back under the CONFIG_DEBUG_LOCKDEP config option. So the memory consumption increase will only occur if CONFIG_DEBUG_LOCKDEP is defined. The lock_class structure, however, is reduced in size by 16 bytes on 64-bit archs after ops removal and a minor restructuring of the fields. This patch also fixes a bug in the increment code as the counter is of the unsigned long type, but atomic_inc() was used to increment it. v3: Move lock_class_ops[] into lockdep_stats structure. Signed-off-by: Waiman Long --- include/linux/lockdep.h| 7 +-- kernel/locking/lockdep.c | 11 --- kernel/locking/lockdep_internals.h | 27 +++ kernel/locking/lockdep_proc.c | 2 +- 4 files changed, 37 insertions(+), 10 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index b0d0b51c4d85..1fd82ff99c65 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -99,13 +99,8 @@ struct lock_class { */ unsigned int version; - /* - * Statistics counter: - */ - unsigned long ops; - - const char *name; intname_version; + const char *name; #ifdef CONFIG_LOCK_STAT unsigned long contention_point[LOCKSTAT_POINTS]; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b7774ff2516f..57e3f153474f 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -138,7 +138,7 @@ static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; * get freed - this significantly simplifies the debugging code. 
*/ unsigned long nr_lock_classes; -static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; static inline struct lock_class *hlock_class(struct held_lock *hlock) { @@ -435,6 +435,7 @@ unsigned int max_lockdep_depth; * Various lockdep statistics: */ DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); +DEFINE_PER_CPU(unsigned long [MAX_LOCKDEP_KEYS], lock_class_ops); #endif /* @@ -1391,7 +1392,9 @@ static void print_lock_class_header(struct lock_class *class, int depth) printk("%*s->", depth, ""); print_lock_name(class); - printk(KERN_CONT " ops: %lu", class->ops); +#ifdef CONFIG_DEBUG_LOCKDEP + printk(KERN_CONT " ops: %lu", debug_class_ops_read(class)); +#endif printk(KERN_CONT " {\n"); for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { @@ -3226,7 +3229,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!class) return 0; } - atomic_inc((atomic_t *)>ops); + + debug_class_ops_inc(class); + if (very_verbose(class)) { printk("\nacquire class [%px] %s", class->key, class->name); if (class->name_version > 1) diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index d459d624ba2a..88c847a41c8a 100644 ---
Re: [PATCH v2 4/5] locking/lockdep: Make class->ops a percpu counter
On 10/03/2018 09:57 AM, Waiman Long wrote: > On 10/03/2018 03:48 AM, Peter Zijlstra wrote: >> On Tue, Oct 02, 2018 at 04:19:19PM -0400, Waiman Long wrote: >>> +DEFINE_PER_CPU(unsigned long [MAX_LOCKDEP_KEYS], lock_class_ops); >>> @@ -179,9 +181,30 @@ DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); >>> } \ >>> __total;\ >>> }) >>> + >>> +static inline void debug_class_ops_inc(struct lock_class *class) >>> +{ >>> + int idx; >>> + >>> + idx = class - lock_classes; >>> + __this_cpu_inc(lock_class_ops[idx]); >>> +} >>> + >>> +static inline unsigned long debug_class_ops_read(struct lock_class *class) >>> +{ >>> + int idx, cpu; >>> + unsigned long ops = 0; >>> + >>> + idx = class - lock_classes; >>> + for_each_possible_cpu(cpu) >>> + ops += per_cpu(lock_class_ops[idx], cpu); >>> + return ops; >>> +} >> Would it make sense to stick that in struct lockdep_stats ? >> >> A little something like: >> >> struct lockdep_stats { >> /* ... */ >> int lock_class_ops[MAX_LOCKDEP_KEYS]; >> }; >> >> and then use: >> >> debug_atomic_inc(lock_class_ops[idx]); >> > Yes, I can certainly do that. Thanks for pointing this out. Will post an > updated patch just for this one. > > Cheers, > Longman > Attached is the updated patch 4. Please let me know if you guys are OK with that. Cheers, Longman >From e615f62dfa155cbf9c04318f146570047f2c0c0c Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 3 Oct 2018 11:36:24 -0400 Subject: [PATCH v3 4/5] locking/lockdep: Make class->ops a percpu counter A sizable portion of the CPU cycles spent on the __lock_acquire() is used up by the atomic increment of class->ops stat counter. By taking it out from the lock_class structure and changing it to a per-cpu per-lock-class counter, we can reduce the amount of cacheline contention on the class structure when multiple CPUs are trying to acquire locks of the same class simultaneously. 
To limit the increase in memory consumption because of the percpu nature of that counter, it is now put back under the CONFIG_DEBUG_LOCKDEP config option. So the memory consumption increase will only occur if CONFIG_DEBUG_LOCKDEP is defined. The lock_class structure, however, is reduced in size by 16 bytes on 64-bit archs after ops removal and a minor restructuring of the fields. This patch also fixes a bug in the increment code as the counter is of the unsigned long type, but atomic_inc() was used to increment it. v3: Move lock_class_ops[] into lockdep_stats structure. Signed-off-by: Waiman Long --- include/linux/lockdep.h| 7 +-- kernel/locking/lockdep.c | 11 --- kernel/locking/lockdep_internals.h | 27 +++ kernel/locking/lockdep_proc.c | 2 +- 4 files changed, 37 insertions(+), 10 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index b0d0b51c4d85..1fd82ff99c65 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -99,13 +99,8 @@ struct lock_class { */ unsigned int version; - /* - * Statistics counter: - */ - unsigned long ops; - - const char *name; intname_version; + const char *name; #ifdef CONFIG_LOCK_STAT unsigned long contention_point[LOCKSTAT_POINTS]; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b7774ff2516f..57e3f153474f 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -138,7 +138,7 @@ static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; * get freed - this significantly simplifies the debugging code. 
*/ unsigned long nr_lock_classes; -static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; static inline struct lock_class *hlock_class(struct held_lock *hlock) { @@ -435,6 +435,7 @@ unsigned int max_lockdep_depth; * Various lockdep statistics: */ DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); +DEFINE_PER_CPU(unsigned long [MAX_LOCKDEP_KEYS], lock_class_ops); #endif /* @@ -1391,7 +1392,9 @@ static void print_lock_class_header(struct lock_class *class, int depth) printk("%*s->", depth, ""); print_lock_name(class); - printk(KERN_CONT " ops: %lu", class->ops); +#ifdef CONFIG_DEBUG_LOCKDEP + printk(KERN_CONT " ops: %lu", debug_class_ops_read(class)); +#endif printk(KERN_CONT " {\n"); for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { @@ -3226,7 +3229,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!class) return 0; } - atomic_inc((atomic_t *)>ops); + + debug_class_ops_inc(class); + if (very_verbose(class)) { printk("\nacquire class [%px] %s", class->key, class->name); if (class->name_version > 1) diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index d459d624ba2a..88c847a41c8a 100644 ---
Re: [PATCH v2 4/5] locking/lockdep: Make class->ops a percpu counter
On 10/03/2018 03:48 AM, Peter Zijlstra wrote: > On Tue, Oct 02, 2018 at 04:19:19PM -0400, Waiman Long wrote: >> +DEFINE_PER_CPU(unsigned long [MAX_LOCKDEP_KEYS], lock_class_ops); >> @@ -179,9 +181,30 @@ DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); >> } \ >> __total;\ >> }) >> + >> +static inline void debug_class_ops_inc(struct lock_class *class) >> +{ >> +int idx; >> + >> +idx = class - lock_classes; >> +__this_cpu_inc(lock_class_ops[idx]); >> +} >> + >> +static inline unsigned long debug_class_ops_read(struct lock_class *class) >> +{ >> +int idx, cpu; >> +unsigned long ops = 0; >> + >> +idx = class - lock_classes; >> +for_each_possible_cpu(cpu) >> +ops += per_cpu(lock_class_ops[idx], cpu); >> +return ops; >> +} > Would it make sense to stick that in struct lockdep_stats ? > > A little something like: > > struct lockdep_stats { > /* ... */ > int lock_class_ops[MAX_LOCKDEP_KEYS]; > }; > > and then use: > > debug_atomic_inc(lock_class_ops[idx]); > Yes, I can certainly do that. Thanks for pointing this out. Will post an updated patch just for this one. Cheers, Longman
Re: [PATCH v2 4/5] locking/lockdep: Make class->ops a percpu counter
On 10/03/2018 03:48 AM, Peter Zijlstra wrote: > On Tue, Oct 02, 2018 at 04:19:19PM -0400, Waiman Long wrote: >> +DEFINE_PER_CPU(unsigned long [MAX_LOCKDEP_KEYS], lock_class_ops); >> @@ -179,9 +181,30 @@ DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); >> } \ >> __total;\ >> }) >> + >> +static inline void debug_class_ops_inc(struct lock_class *class) >> +{ >> +int idx; >> + >> +idx = class - lock_classes; >> +__this_cpu_inc(lock_class_ops[idx]); >> +} >> + >> +static inline unsigned long debug_class_ops_read(struct lock_class *class) >> +{ >> +int idx, cpu; >> +unsigned long ops = 0; >> + >> +idx = class - lock_classes; >> +for_each_possible_cpu(cpu) >> +ops += per_cpu(lock_class_ops[idx], cpu); >> +return ops; >> +} > Would it make sense to stick that in struct lockdep_stats ? > > A little something like: > > struct lockdep_stats { > /* ... */ > int lock_class_ops[MAX_LOCKDEP_KEYS]; > }; > > and then use: > > debug_atomic_inc(lock_class_ops[idx]); > Yes, I can certainly do that. Thanks for pointing this out. Will post an updated patch just for this one. Cheers, Longman
Re: [PATCH v2 4/5] locking/lockdep: Make class->ops a percpu counter
On Wed, Oct 03, 2018 at 09:54:46AM +0200, Ingo Molnar wrote: > > * Peter Zijlstra wrote: > > > On Tue, Oct 02, 2018 at 04:19:19PM -0400, Waiman Long wrote: > > > +DEFINE_PER_CPU(unsigned long [MAX_LOCKDEP_KEYS], lock_class_ops); > > > > > @@ -179,9 +181,30 @@ DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); > > > } \ > > > __total;\ > > > }) > > > + > > > +static inline void debug_class_ops_inc(struct lock_class *class) > > > +{ > > > + int idx; > > > + > > > + idx = class - lock_classes; > > > + __this_cpu_inc(lock_class_ops[idx]); > > > +} > > > + > > > +static inline unsigned long debug_class_ops_read(struct lock_class > > > *class) > > > +{ > > > + int idx, cpu; > > > + unsigned long ops = 0; > > > + > > > + idx = class - lock_classes; > > > + for_each_possible_cpu(cpu) > > > + ops += per_cpu(lock_class_ops[idx], cpu); > > > + return ops; > > > +} > > > > Would it make sense to stick that in struct lockdep_stats ? > > > > A little something like: > > > > struct lockdep_stats { > > /* ... */ > > int lock_class_ops[MAX_LOCKDEP_KEYS]; > > }; > > Did you shrink the 'long' to 'int' intentionally? nah, was because all of lockdep_stats is int and I didn't pay much attention.
Re: [PATCH v2 4/5] locking/lockdep: Make class->ops a percpu counter
On Wed, Oct 03, 2018 at 09:54:46AM +0200, Ingo Molnar wrote: > > * Peter Zijlstra wrote: > > > On Tue, Oct 02, 2018 at 04:19:19PM -0400, Waiman Long wrote: > > > +DEFINE_PER_CPU(unsigned long [MAX_LOCKDEP_KEYS], lock_class_ops); > > > > > @@ -179,9 +181,30 @@ DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); > > > } \ > > > __total;\ > > > }) > > > + > > > +static inline void debug_class_ops_inc(struct lock_class *class) > > > +{ > > > + int idx; > > > + > > > + idx = class - lock_classes; > > > + __this_cpu_inc(lock_class_ops[idx]); > > > +} > > > + > > > +static inline unsigned long debug_class_ops_read(struct lock_class > > > *class) > > > +{ > > > + int idx, cpu; > > > + unsigned long ops = 0; > > > + > > > + idx = class - lock_classes; > > > + for_each_possible_cpu(cpu) > > > + ops += per_cpu(lock_class_ops[idx], cpu); > > > + return ops; > > > +} > > > > Would it make sense to stick that in struct lockdep_stats ? > > > > A little something like: > > > > struct lockdep_stats { > > /* ... */ > > int lock_class_ops[MAX_LOCKDEP_KEYS]; > > }; > > Did you shrink the 'long' to 'int' intentionally? nah, was because all of lockdep_stats is int and I didn't pay much attention.
Re: [PATCH v2 4/5] locking/lockdep: Make class->ops a percpu counter
* Peter Zijlstra wrote: > On Tue, Oct 02, 2018 at 04:19:19PM -0400, Waiman Long wrote: > > +DEFINE_PER_CPU(unsigned long [MAX_LOCKDEP_KEYS], lock_class_ops); > > > @@ -179,9 +181,30 @@ DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); > > } \ > > __total;\ > > }) > > + > > +static inline void debug_class_ops_inc(struct lock_class *class) > > +{ > > + int idx; > > + > > + idx = class - lock_classes; > > + __this_cpu_inc(lock_class_ops[idx]); > > +} > > + > > +static inline unsigned long debug_class_ops_read(struct lock_class *class) > > +{ > > + int idx, cpu; > > + unsigned long ops = 0; > > + > > + idx = class - lock_classes; > > + for_each_possible_cpu(cpu) > > + ops += per_cpu(lock_class_ops[idx], cpu); > > + return ops; > > +} > > Would it make sense to stick that in struct lockdep_stats ? > > A little something like: > > struct lockdep_stats { > /* ... */ > int lock_class_ops[MAX_LOCKDEP_KEYS]; > }; Did you shrink the 'long' to 'int' intentionally? Thanks, Ingo
Re: [PATCH v2 4/5] locking/lockdep: Make class->ops a percpu counter
* Peter Zijlstra wrote: > On Tue, Oct 02, 2018 at 04:19:19PM -0400, Waiman Long wrote: > > +DEFINE_PER_CPU(unsigned long [MAX_LOCKDEP_KEYS], lock_class_ops); > > > @@ -179,9 +181,30 @@ DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); > > } \ > > __total;\ > > }) > > + > > +static inline void debug_class_ops_inc(struct lock_class *class) > > +{ > > + int idx; > > + > > + idx = class - lock_classes; > > + __this_cpu_inc(lock_class_ops[idx]); > > +} > > + > > +static inline unsigned long debug_class_ops_read(struct lock_class *class) > > +{ > > + int idx, cpu; > > + unsigned long ops = 0; > > + > > + idx = class - lock_classes; > > + for_each_possible_cpu(cpu) > > + ops += per_cpu(lock_class_ops[idx], cpu); > > + return ops; > > +} > > Would it make sense to stick that in struct lockdep_stats ? > > A little something like: > > struct lockdep_stats { > /* ... */ > int lock_class_ops[MAX_LOCKDEP_KEYS]; > }; Did you shrink the 'long' to 'int' intentionally? Thanks, Ingo
Re: [PATCH v2 4/5] locking/lockdep: Make class->ops a percpu counter
On Tue, Oct 02, 2018 at 04:19:19PM -0400, Waiman Long wrote: > +DEFINE_PER_CPU(unsigned long [MAX_LOCKDEP_KEYS], lock_class_ops); > @@ -179,9 +181,30 @@ DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); > } \ > __total;\ > }) > + > +static inline void debug_class_ops_inc(struct lock_class *class) > +{ > + int idx; > + > + idx = class - lock_classes; > + __this_cpu_inc(lock_class_ops[idx]); > +} > + > +static inline unsigned long debug_class_ops_read(struct lock_class *class) > +{ > + int idx, cpu; > + unsigned long ops = 0; > + > + idx = class - lock_classes; > + for_each_possible_cpu(cpu) > + ops += per_cpu(lock_class_ops[idx], cpu); > + return ops; > +} Would it make sense to stick that in struct lockdep_stats ? A little something like: struct lockdep_stats { /* ... */ int lock_class_ops[MAX_LOCKDEP_KEYS]; }; and then use: debug_atomic_inc(lock_class_ops[idx]);
Re: [PATCH v2 4/5] locking/lockdep: Make class->ops a percpu counter
On Tue, Oct 02, 2018 at 04:19:19PM -0400, Waiman Long wrote: > +DEFINE_PER_CPU(unsigned long [MAX_LOCKDEP_KEYS], lock_class_ops); > @@ -179,9 +181,30 @@ DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); > } \ > __total;\ > }) > + > +static inline void debug_class_ops_inc(struct lock_class *class) > +{ > + int idx; > + > + idx = class - lock_classes; > + __this_cpu_inc(lock_class_ops[idx]); > +} > + > +static inline unsigned long debug_class_ops_read(struct lock_class *class) > +{ > + int idx, cpu; > + unsigned long ops = 0; > + > + idx = class - lock_classes; > + for_each_possible_cpu(cpu) > + ops += per_cpu(lock_class_ops[idx], cpu); > + return ops; > +} Would it make sense to stick that in struct lockdep_stats ? A little something like: struct lockdep_stats { /* ... */ int lock_class_ops[MAX_LOCKDEP_KEYS]; }; and then use: debug_atomic_inc(lock_class_ops[idx]);
[PATCH v2 4/5] locking/lockdep: Make class->ops a percpu counter
A sizable portion of the CPU cycles spent on the __lock_acquire() is used up by the atomic increment of class->ops stat counter. By taking it out from the lock_class structure and changing it to a per-cpu per-lock-class counter, we can reduce the amount of cacheline contention on the class structure when multiple CPUs are trying to acquire locks of the same class simultaneously. To limit the increase in memory consumption because of the percpu nature of that counter, it is now put back under the CONFIG_DEBUG_LOCKDEP config option. So the memory consumption increase will only occur if CONFIG_DEBUG_LOCKDEP is defined. The lock_class structure, however, is reduced in size by 16 bytes on 64-bit archs after ops removal and a minor restructuring of the fields. This patch also fixes a bug in the increment code as the counter is of the unsigned long type, but atomic_inc() was used to increment it. Signed-off-by: Waiman Long --- include/linux/lockdep.h| 7 +-- kernel/locking/lockdep.c | 11 --- kernel/locking/lockdep_internals.h | 23 +++ kernel/locking/lockdep_proc.c | 2 +- 4 files changed, 33 insertions(+), 10 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index b0d0b51c4d85..1fd82ff99c65 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -99,13 +99,8 @@ struct lock_class { */ unsigned intversion; - /* -* Statistics counter: -*/ - unsigned long ops; - - const char *name; int name_version; + const char *name; #ifdef CONFIG_LOCK_STAT unsigned long contention_point[LOCKSTAT_POINTS]; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b7774ff2516f..57e3f153474f 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -138,7 +138,7 @@ static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; * get freed - this significantly simplifies the debugging code. 
*/ unsigned long nr_lock_classes; -static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; static inline struct lock_class *hlock_class(struct held_lock *hlock) { @@ -435,6 +435,7 @@ unsigned int max_lockdep_depth; * Various lockdep statistics: */ DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); +DEFINE_PER_CPU(unsigned long [MAX_LOCKDEP_KEYS], lock_class_ops); #endif /* @@ -1391,7 +1392,9 @@ static void print_lock_class_header(struct lock_class *class, int depth) printk("%*s->", depth, ""); print_lock_name(class); - printk(KERN_CONT " ops: %lu", class->ops); +#ifdef CONFIG_DEBUG_LOCKDEP + printk(KERN_CONT " ops: %lu", debug_class_ops_read(class)); +#endif printk(KERN_CONT " {\n"); for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { @@ -3226,7 +3229,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!class) return 0; } - atomic_inc((atomic_t *)>ops); + + debug_class_ops_inc(class); + if (very_verbose(class)) { printk("\nacquire class [%px] %s", class->key, class->name); if (class->name_version > 1) diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index d459d624ba2a..536012e81d04 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -155,6 +155,8 @@ struct lockdep_stats { }; DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); +DECLARE_PER_CPU(unsigned long [MAX_LOCKDEP_KEYS], lock_class_ops); +extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; #define __debug_atomic_inc(ptr)\ this_cpu_inc(lockdep_stats.ptr); @@ -179,9 +181,30 @@ DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); } \ __total;\ }) + +static inline void debug_class_ops_inc(struct lock_class *class) +{ + int idx; + + idx = class - lock_classes; + __this_cpu_inc(lock_class_ops[idx]); +} + +static inline unsigned long debug_class_ops_read(struct lock_class *class) +{ + int idx, cpu; + unsigned long ops = 0; + + idx = class - lock_classes; 
+ for_each_possible_cpu(cpu) + ops += per_cpu(lock_class_ops[idx], cpu); + return ops; +} + #else # define __debug_atomic_inc(ptr) do { } while (0) # define debug_atomic_inc(ptr) do { } while (0) # define debug_atomic_dec(ptr) do { } while (0) # define debug_atomic_read(ptr)0 +# define debug_class_ops_inc(ptr) do { } while (0) #endif diff --git a/kernel/locking/lockdep_proc.c
[PATCH v2 4/5] locking/lockdep: Make class->ops a percpu counter
A sizable portion of the CPU cycles spent on the __lock_acquire() is used up by the atomic increment of class->ops stat counter. By taking it out from the lock_class structure and changing it to a per-cpu per-lock-class counter, we can reduce the amount of cacheline contention on the class structure when multiple CPUs are trying to acquire locks of the same class simultaneously. To limit the increase in memory consumption because of the percpu nature of that counter, it is now put back under the CONFIG_DEBUG_LOCKDEP config option. So the memory consumption increase will only occur if CONFIG_DEBUG_LOCKDEP is defined. The lock_class structure, however, is reduced in size by 16 bytes on 64-bit archs after ops removal and a minor restructuring of the fields. This patch also fixes a bug in the increment code as the counter is of the unsigned long type, but atomic_inc() was used to increment it. Signed-off-by: Waiman Long --- include/linux/lockdep.h| 7 +-- kernel/locking/lockdep.c | 11 --- kernel/locking/lockdep_internals.h | 23 +++ kernel/locking/lockdep_proc.c | 2 +- 4 files changed, 33 insertions(+), 10 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index b0d0b51c4d85..1fd82ff99c65 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -99,13 +99,8 @@ struct lock_class { */ unsigned intversion; - /* -* Statistics counter: -*/ - unsigned long ops; - - const char *name; int name_version; + const char *name; #ifdef CONFIG_LOCK_STAT unsigned long contention_point[LOCKSTAT_POINTS]; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b7774ff2516f..57e3f153474f 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -138,7 +138,7 @@ static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; * get freed - this significantly simplifies the debugging code. 
*/ unsigned long nr_lock_classes; -static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; static inline struct lock_class *hlock_class(struct held_lock *hlock) { @@ -435,6 +435,7 @@ unsigned int max_lockdep_depth; * Various lockdep statistics: */ DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); +DEFINE_PER_CPU(unsigned long [MAX_LOCKDEP_KEYS], lock_class_ops); #endif /* @@ -1391,7 +1392,9 @@ static void print_lock_class_header(struct lock_class *class, int depth) printk("%*s->", depth, ""); print_lock_name(class); - printk(KERN_CONT " ops: %lu", class->ops); +#ifdef CONFIG_DEBUG_LOCKDEP + printk(KERN_CONT " ops: %lu", debug_class_ops_read(class)); +#endif printk(KERN_CONT " {\n"); for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { @@ -3226,7 +3229,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!class) return 0; } - atomic_inc((atomic_t *)>ops); + + debug_class_ops_inc(class); + if (very_verbose(class)) { printk("\nacquire class [%px] %s", class->key, class->name); if (class->name_version > 1) diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index d459d624ba2a..536012e81d04 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -155,6 +155,8 @@ struct lockdep_stats { }; DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); +DECLARE_PER_CPU(unsigned long [MAX_LOCKDEP_KEYS], lock_class_ops); +extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; #define __debug_atomic_inc(ptr)\ this_cpu_inc(lockdep_stats.ptr); @@ -179,9 +181,30 @@ DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); } \ __total;\ }) + +static inline void debug_class_ops_inc(struct lock_class *class) +{ + int idx; + + idx = class - lock_classes; + __this_cpu_inc(lock_class_ops[idx]); +} + +static inline unsigned long debug_class_ops_read(struct lock_class *class) +{ + int idx, cpu; + unsigned long ops = 0; + + idx = class - lock_classes; 
+ for_each_possible_cpu(cpu) + ops += per_cpu(lock_class_ops[idx], cpu); + return ops; +} + #else # define __debug_atomic_inc(ptr) do { } while (0) # define debug_atomic_inc(ptr) do { } while (0) # define debug_atomic_dec(ptr) do { } while (0) # define debug_atomic_read(ptr)0 +# define debug_class_ops_inc(ptr) do { } while (0) #endif diff --git a/kernel/locking/lockdep_proc.c