On Wed, May 09, 2018 at 01:36:08PM +0200, Martin Pieuchot wrote:
> On 08/05/18(Tue) 18:03, Martin Pieuchot wrote:
> > Diff below, gypped from jmatthew@, allows us to see how much time CPUs
> > spend spinning on a mutex or on the KERNEL_LOCK().
> >
> > On my sparc64 with 16 CPUs, top(1) now reports:
> >
> > 16 CPUs: 16.8% user, 0.0% nice, 9.2% sys, 49.4% spin, 0.1% intr, 24.5% idle
> >
> > And systat(1) CPU view:
> >
> > CPU User Nice System Spin Interrupt
> > Idle
> > 0 3.9% 0.0% 3.9% 74.5% 2.0%
> > 15.7%
> > 1 5.9% 0.0% 21.6% 72.5% 0.0%
> > 0.0%
> > 2 9.8% 0.0% 5.9% 84.3% 0.0%
> > 0.0%
> > 3 5.9% 0.0% 9.8% 82.4% 0.0%
> > 2.0%
> > ...
> > 15 2.0% 0.0% 13.7% 56.9% 0.0%
> > 27.5%
> >
> > To keep it simple, systat(1) vmstat view now shows the spinning time instead
> > of the nice time.
> >
> > I'd like to put something like that in, to be able to better understand
> > profiling and performance analysis.
>
> Updated diff that addresses issues pointed by bluhm@ and visa@:
>
> - Count spinning time in interrupt handlers
>
> - Move `ci_spinning' to the per-CPU scheduler state
>
> - Increment/decrement `spc_spinning' to not corrupt the value if
> spinning recursively in an interrupt
>
> Many people support the idea so far, so ok?
OK
> Index: usr.bin/top/machine.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/top/machine.c,v
> retrieving revision 1.89
> diff -u -p -r1.89 machine.c
> --- usr.bin/top/machine.c 30 May 2017 06:01:30 -0000 1.89
> +++ usr.bin/top/machine.c 9 May 2018 10:44:50 -0000
> @@ -108,7 +108,7 @@ char *procstatenames[] = {
> /* these are for detailing the cpu states */
> int64_t *cpu_states;
> char *cpustatenames[] = {
> - "user", "nice", "system", "interrupt", "idle", NULL
> + "user", "nice", "sys", "spin", "intr", "idle", NULL
> };
>
> /* these are for detailing the memory statistics */
> Index: usr.bin/systat/vmstat.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/systat/vmstat.c,v
> retrieving revision 1.83
> diff -u -p -r1.83 vmstat.c
> --- usr.bin/systat/vmstat.c 3 May 2018 07:49:18 -0000 1.83
> +++ usr.bin/systat/vmstat.c 9 May 2018 10:44:50 -0000
> @@ -268,7 +268,7 @@ labelkre(void)
> mvprintw(GENSTATROW, GENSTATCOL, " Csw Trp Sys Int Sof Flt");
>
> mvprintw(GRAPHROW, GRAPHCOL,
> - " . %%Int . %%Sys . %%Usr . %%Nic . %%Idle");
> + " . %%Int . %%Sys . %%Usr . %%Spn . %%Idle");
> mvprintw(PROCSROW, PROCSCOL, "Proc:r d s w");
> mvprintw(GRAPHROW + 1, GRAPHCOL,
> "| | | | | | | | | | |");
> @@ -306,8 +306,8 @@ labelkre(void)
> } while (0)
> #define MAXFAIL 5
>
> -static char cpuchar[CPUSTATES] = { '|', '=', '>', '-', ' ' };
> -static char cpuorder[CPUSTATES] = { CP_INTR, CP_SYS, CP_USER, CP_NICE, CP_IDLE };
> +static char cpuchar[] = { '|', '=', '>', '-', ' ' };
> +static char cpuorder[] = { CP_INTR, CP_SYS, CP_USER, CP_SPIN, CP_IDLE };
>
> void
> showkre(void)
> @@ -367,7 +367,7 @@ showkre(void)
> psiz = 0;
> f2 = 0.0;
>
> - for (c = 0; c < CPUSTATES; c++) {
> + for (c = 0; c < nitems(cpuorder); c++) {
> i = cpuorder[c];
> f1 = cputime(i);
> f2 += f1;
> Index: usr.bin/systat/cpu.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/systat/cpu.c,v
> retrieving revision 1.5
> diff -u -p -r1.5 cpu.c
> --- usr.bin/systat/cpu.c 2 Jan 2016 20:02:40 -0000 1.5
> +++ usr.bin/systat/cpu.c 9 May 2018 10:44:50 -0000
> @@ -68,21 +68,23 @@ field_def fields_cpu[] = {
> { "User", 10, 20, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0 },
> { "Nice", 10, 20, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0 },
> { "System", 10, 20, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0 },
> + { "Spin", 10, 20, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0 },
> { "Interrupt", 10, 20, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0 },
> { "Idle", 10, 20, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0 },
> };
>
> #define FLD_CPU_CPU FIELD_ADDR(fields_cpu, 0)
> -#define FLD_CPU_INT FIELD_ADDR(fields_cpu, 1)
> -#define FLD_CPU_SYS FIELD_ADDR(fields_cpu, 2)
> -#define FLD_CPU_USR FIELD_ADDR(fields_cpu, 3)
> -#define FLD_CPU_NIC FIELD_ADDR(fields_cpu, 4)
> -#define FLD_CPU_IDLE FIELD_ADDR(fields_cpu, 5)
> +#define FLD_CPU_USR FIELD_ADDR(fields_cpu, 1)
> +#define FLD_CPU_NIC FIELD_ADDR(fields_cpu, 2)
> +#define FLD_CPU_SYS FIELD_ADDR(fields_cpu, 3)
> +#define FLD_CPU_SPIN FIELD_ADDR(fields_cpu, 4)
> +#define FLD_CPU_INT FIELD_ADDR(fields_cpu, 5)
> +#define FLD_CPU_IDLE FIELD_ADDR(fields_cpu, 6)
>
> /* Define views */
> field_def *view_cpu_0[] = {
> - FLD_CPU_CPU,
> - FLD_CPU_INT, FLD_CPU_SYS, FLD_CPU_USR, FLD_CPU_NIC, FLD_CPU_IDLE, NULL
> + FLD_CPU_CPU, FLD_CPU_USR, FLD_CPU_NIC, FLD_CPU_SYS, FLD_CPU_SPIN,
> + FLD_CPU_INT, FLD_CPU_IDLE, NULL
> };
>
> /* Define view managers */
> @@ -232,11 +234,12 @@ initcpu(void)
> do { \
> if (cur >= dispstart && cur < end) { \
> print_fld_size(FLD_CPU_CPU, (v)); \
> - print_fld_percentage(FLD_CPU_INT, (cs[0])); \
> - print_fld_percentage(FLD_CPU_SYS, (cs[1])); \
> - print_fld_percentage(FLD_CPU_USR, (cs[2])); \
> - print_fld_percentage(FLD_CPU_NIC, (cs[3])); \
> - print_fld_percentage(FLD_CPU_IDLE, (cs[4])); \
> + print_fld_percentage(FLD_CPU_USR, (cs[CP_USER]));\
> + print_fld_percentage(FLD_CPU_NIC, (cs[CP_NICE]));\
> + print_fld_percentage(FLD_CPU_SYS, (cs[CP_SYS]));\
> + print_fld_percentage(FLD_CPU_SPIN, (cs[CP_SPIN]));\
> + print_fld_percentage(FLD_CPU_INT, (cs[CP_INTR]));\
> +		print_fld_percentage(FLD_CPU_IDLE, (cs[CP_IDLE]));\
> end_line(); \
> } \
> if (++cur >= end) \
> Index: sys/sys/sched.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/sched.h,v
> retrieving revision 1.44
> diff -u -p -r1.44 sched.h
> --- sys/sys/sched.h 14 Dec 2017 23:21:04 -0000 1.44
> +++ sys/sys/sched.h 9 May 2018 10:51:22 -0000
> @@ -83,17 +83,20 @@
> #define CP_USER 0
> #define CP_NICE 1
> #define CP_SYS 2
> -#define CP_INTR 3
> -#define CP_IDLE 4
> -#define CPUSTATES 5
> +#define CP_SPIN 3
> +#define CP_INTR 4
> +#define CP_IDLE 5
> +#define CPUSTATES 6
>
> #define SCHED_NQS 32 /* 32 run queues. */
>
> /*
> * Per-CPU scheduler state.
> - * XXX - expose to userland for now.
> */
> struct schedstate_percpu {
> + struct proc *spc_idleproc; /* idle proc for this cpu */
> + TAILQ_HEAD(prochead, proc) spc_qs[SCHED_NQS];
> + LIST_HEAD(,proc) spc_deadproc;
> struct timespec spc_runtime; /* time curproc started running */
> volatile int spc_schedflags; /* flags; see below */
> u_int spc_schedticks; /* ticks for schedclock() */
> @@ -102,18 +105,12 @@ struct schedstate_percpu {
> int spc_rrticks; /* ticks until roundrobin() */
> int spc_pscnt; /* prof/stat counter */
> int spc_psdiv; /* prof/stat divisor */
> - struct proc *spc_idleproc; /* idle proc for this cpu */
>
> u_int spc_nrun; /* procs on the run queues */
> fixpt_t spc_ldavg; /* shortest load avg. for this cpu */
>
> - TAILQ_HEAD(prochead, proc) spc_qs[SCHED_NQS];
> volatile uint32_t spc_whichqs;
> -
> -#ifdef notyet
> - struct proc *spc_reaper; /* dead proc reaper */
> -#endif
> - LIST_HEAD(,proc) spc_deadproc;
> + volatile u_int spc_spinning; /* this cpu is currently spinning */
> };
>
> #ifdef _KERNEL
> Index: sys/kern/kern_lock.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_lock.c,v
> retrieving revision 1.63
> diff -u -p -r1.63 kern_lock.c
> --- sys/kern/kern_lock.c 26 Apr 2018 06:51:48 -0000 1.63
> +++ sys/kern/kern_lock.c 9 May 2018 10:49:25 -0000
> @@ -113,10 +113,12 @@ ___mp_lock_init(struct __mp_lock *mpl, s
> static __inline void
> __mp_lock_spin(struct __mp_lock *mpl, u_int me)
> {
> + struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
> #ifdef MP_LOCKDEBUG
> int nticks = __mp_lock_spinout;
> #endif
>
> + spc->spc_spinning++;
> while (mpl->mpl_ticket != me) {
> CPU_BUSY_CYCLE();
>
> @@ -128,6 +130,7 @@ __mp_lock_spin(struct __mp_lock *mpl, u_
> }
> #endif
> }
> + spc->spc_spinning--;
> }
>
> void
> @@ -257,10 +260,12 @@ __mtx_init(struct mutex *mtx, int wantip
> void
> __mtx_enter(struct mutex *mtx)
> {
> + struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
> #ifdef MP_LOCKDEBUG
> int nticks = __mp_lock_spinout;
> #endif
>
> + spc->spc_spinning++;
> while (__mtx_enter_try(mtx) == 0) {
> CPU_BUSY_CYCLE();
>
> @@ -272,6 +277,7 @@ __mtx_enter(struct mutex *mtx)
> }
> #endif
> }
> + spc->spc_spinning--;
> }
>
> int
> Index: sys/kern/kern_clock.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_clock.c,v
> retrieving revision 1.93
> diff -u -p -r1.93 kern_clock.c
> --- sys/kern/kern_clock.c 22 Jul 2017 14:33:45 -0000 1.93
> +++ sys/kern/kern_clock.c 9 May 2018 10:56:24 -0000
> @@ -381,6 +381,7 @@ statclock(struct clockframe *frame)
> return;
> /*
> * Came from kernel mode, so we were:
> + * - spinning on a lock
> * - handling an interrupt,
> * - doing syscall or trap work on behalf of the current
> * user process, or
> @@ -391,7 +392,9 @@ statclock(struct clockframe *frame)
> * so that we know how much of its real time was spent
> * in ``non-process'' (i.e., interrupt) work.
> */
> - if (CLKF_INTR(frame)) {
> + if (spc->spc_spinning)
> + spc->spc_cp_time[CP_SPIN]++;
> + else if (CLKF_INTR(frame)) {
> if (p != NULL)
> p->p_iticks++;
> spc->spc_cp_time[CP_INTR]++;