On 08/05/18(Tue) 18:03, Martin Pieuchot wrote:
> Diff below, adapted from work by jmatthew@, allows us to see how much
> time CPUs spend spinning on a mutex or on the KERNEL_LOCK().
>
> On my sparc64 with 16 CPUs, top(1) now reports:
>
> 16 CPUs: 16.8% user, 0.0% nice, 9.2% sys, 49.4% spin, 0.1% intr, 24.5%
> idle
>
> And systat(1) CPU view:
>
> CPU User Nice System Spin Interrupt
> Idle
> 0 3.9% 0.0% 3.9% 74.5% 2.0%
> 15.7%
> 1 5.9% 0.0% 21.6% 72.5% 0.0%
> 0.0%
> 2 9.8% 0.0% 5.9% 84.3% 0.0%
> 0.0%
> 3 5.9% 0.0% 9.8% 82.4% 0.0%
> 2.0%
> ...
> 15 2.0% 0.0% 13.7% 56.9% 0.0%
> 27.5%
>
> To keep it simple, systat(1) vmstat view now shows the spinning time instead
> of the nice time.
>
> I'd like to commit something like this, to make profiling and
> performance analysis easier to reason about.
Updated diff that addresses issues pointed out by bluhm@ and visa@:
- Count spinning time in interrupt handlers
- Move `ci_spinning' to the per-CPU scheduler state
- Increment/decrement `spc_spinning' to not corrupt the value if
spinning recursively in an interrupt
Many people support the idea so far, so ok?
Index: usr.bin/top/machine.c
===================================================================
RCS file: /cvs/src/usr.bin/top/machine.c,v
retrieving revision 1.89
diff -u -p -r1.89 machine.c
--- usr.bin/top/machine.c 30 May 2017 06:01:30 -0000 1.89
+++ usr.bin/top/machine.c 9 May 2018 10:44:50 -0000
@@ -108,7 +108,7 @@ char *procstatenames[] = {
/* these are for detailing the cpu states */
int64_t *cpu_states;
char *cpustatenames[] = {
- "user", "nice", "system", "interrupt", "idle", NULL
+ "user", "nice", "sys", "spin", "intr", "idle", NULL
};
/* these are for detailing the memory statistics */
Index: usr.bin/systat/vmstat.c
===================================================================
RCS file: /cvs/src/usr.bin/systat/vmstat.c,v
retrieving revision 1.83
diff -u -p -r1.83 vmstat.c
--- usr.bin/systat/vmstat.c 3 May 2018 07:49:18 -0000 1.83
+++ usr.bin/systat/vmstat.c 9 May 2018 10:44:50 -0000
@@ -268,7 +268,7 @@ labelkre(void)
mvprintw(GENSTATROW, GENSTATCOL, " Csw Trp Sys Int Sof Flt");
mvprintw(GRAPHROW, GRAPHCOL,
- " . %%Int . %%Sys . %%Usr . %%Nic . %%Idle");
+ " . %%Int . %%Sys . %%Usr . %%Spn . %%Idle");
mvprintw(PROCSROW, PROCSCOL, "Proc:r d s w");
mvprintw(GRAPHROW + 1, GRAPHCOL,
"| | | | | | | | | | |");
@@ -306,8 +306,8 @@ labelkre(void)
} while (0)
#define MAXFAIL 5
-static char cpuchar[CPUSTATES] = { '|', '=', '>', '-', ' ' };
-static char cpuorder[CPUSTATES] = { CP_INTR, CP_SYS, CP_USER, CP_NICE, CP_IDLE
};
+static char cpuchar[] = { '|', '=', '>', '-', ' ' };
+static char cpuorder[] = { CP_INTR, CP_SYS, CP_USER, CP_SPIN, CP_IDLE };
void
showkre(void)
@@ -367,7 +367,7 @@ showkre(void)
psiz = 0;
f2 = 0.0;
- for (c = 0; c < CPUSTATES; c++) {
+ for (c = 0; c < nitems(cpuorder); c++) {
i = cpuorder[c];
f1 = cputime(i);
f2 += f1;
Index: usr.bin/systat/cpu.c
===================================================================
RCS file: /cvs/src/usr.bin/systat/cpu.c,v
retrieving revision 1.5
diff -u -p -r1.5 cpu.c
--- usr.bin/systat/cpu.c 2 Jan 2016 20:02:40 -0000 1.5
+++ usr.bin/systat/cpu.c 9 May 2018 10:44:50 -0000
@@ -68,21 +68,23 @@ field_def fields_cpu[] = {
{ "User", 10, 20, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0 },
{ "Nice", 10, 20, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0 },
{ "System", 10, 20, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0 },
+ { "Spin", 10, 20, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0 },
{ "Interrupt", 10, 20, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0 },
{ "Idle", 10, 20, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0 },
};
#define FLD_CPU_CPU FIELD_ADDR(fields_cpu, 0)
-#define FLD_CPU_INT FIELD_ADDR(fields_cpu, 1)
-#define FLD_CPU_SYS FIELD_ADDR(fields_cpu, 2)
-#define FLD_CPU_USR FIELD_ADDR(fields_cpu, 3)
-#define FLD_CPU_NIC FIELD_ADDR(fields_cpu, 4)
-#define FLD_CPU_IDLE FIELD_ADDR(fields_cpu, 5)
+#define FLD_CPU_USR FIELD_ADDR(fields_cpu, 1)
+#define FLD_CPU_NIC FIELD_ADDR(fields_cpu, 2)
+#define FLD_CPU_SYS FIELD_ADDR(fields_cpu, 3)
+#define FLD_CPU_SPIN FIELD_ADDR(fields_cpu, 4)
+#define FLD_CPU_INT FIELD_ADDR(fields_cpu, 5)
+#define FLD_CPU_IDLE FIELD_ADDR(fields_cpu, 6)
/* Define views */
field_def *view_cpu_0[] = {
- FLD_CPU_CPU,
- FLD_CPU_INT, FLD_CPU_SYS, FLD_CPU_USR, FLD_CPU_NIC, FLD_CPU_IDLE, NULL
+ FLD_CPU_CPU, FLD_CPU_USR, FLD_CPU_NIC, FLD_CPU_SYS, FLD_CPU_SPIN,
+ FLD_CPU_INT, FLD_CPU_IDLE, NULL
};
/* Define view managers */
@@ -232,11 +234,12 @@ initcpu(void)
do { \
if (cur >= dispstart && cur < end) { \
print_fld_size(FLD_CPU_CPU, (v)); \
- print_fld_percentage(FLD_CPU_INT, (cs[0])); \
- print_fld_percentage(FLD_CPU_SYS, (cs[1])); \
- print_fld_percentage(FLD_CPU_USR, (cs[2])); \
- print_fld_percentage(FLD_CPU_NIC, (cs[3])); \
- print_fld_percentage(FLD_CPU_IDLE, (cs[4])); \
+ print_fld_percentage(FLD_CPU_USR, (cs[CP_USER]));\
+ print_fld_percentage(FLD_CPU_NIC, (cs[CP_NICE]));\
+ print_fld_percentage(FLD_CPU_SYS, (cs[CP_SYS]));\
+ print_fld_percentage(FLD_CPU_SPIN, (cs[CP_SPIN]));\
+ print_fld_percentage(FLD_CPU_INT, (cs[CP_INTR]));\
+ print_fld_percentage(FLD_CPU_IDLE, (cs[CP_IDLE]));
\
end_line(); \
} \
if (++cur >= end) \
Index: sys/sys/sched.h
===================================================================
RCS file: /cvs/src/sys/sys/sched.h,v
retrieving revision 1.44
diff -u -p -r1.44 sched.h
--- sys/sys/sched.h 14 Dec 2017 23:21:04 -0000 1.44
+++ sys/sys/sched.h 9 May 2018 10:51:22 -0000
@@ -83,17 +83,20 @@
#define CP_USER 0
#define CP_NICE 1
#define CP_SYS 2
-#define CP_INTR 3
-#define CP_IDLE 4
-#define CPUSTATES 5
+#define CP_SPIN 3
+#define CP_INTR 4
+#define CP_IDLE 5
+#define CPUSTATES 6
#define SCHED_NQS 32 /* 32 run queues. */
/*
* Per-CPU scheduler state.
- * XXX - expose to userland for now.
*/
struct schedstate_percpu {
+ struct proc *spc_idleproc; /* idle proc for this cpu */
+ TAILQ_HEAD(prochead, proc) spc_qs[SCHED_NQS];
+ LIST_HEAD(,proc) spc_deadproc;
struct timespec spc_runtime; /* time curproc started running */
volatile int spc_schedflags; /* flags; see below */
u_int spc_schedticks; /* ticks for schedclock() */
@@ -102,18 +105,12 @@ struct schedstate_percpu {
int spc_rrticks; /* ticks until roundrobin() */
int spc_pscnt; /* prof/stat counter */
int spc_psdiv; /* prof/stat divisor */
- struct proc *spc_idleproc; /* idle proc for this cpu */
u_int spc_nrun; /* procs on the run queues */
fixpt_t spc_ldavg; /* shortest load avg. for this cpu */
- TAILQ_HEAD(prochead, proc) spc_qs[SCHED_NQS];
volatile uint32_t spc_whichqs;
-
-#ifdef notyet
- struct proc *spc_reaper; /* dead proc reaper */
-#endif
- LIST_HEAD(,proc) spc_deadproc;
+ volatile u_int spc_spinning; /* this cpu is currently spinning */
};
#ifdef _KERNEL
Index: sys/kern/kern_lock.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_lock.c,v
retrieving revision 1.63
diff -u -p -r1.63 kern_lock.c
--- sys/kern/kern_lock.c 26 Apr 2018 06:51:48 -0000 1.63
+++ sys/kern/kern_lock.c 9 May 2018 10:49:25 -0000
@@ -113,10 +113,12 @@ ___mp_lock_init(struct __mp_lock *mpl, s
static __inline void
__mp_lock_spin(struct __mp_lock *mpl, u_int me)
{
+ struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
#ifdef MP_LOCKDEBUG
int nticks = __mp_lock_spinout;
#endif
+ spc->spc_spinning++;
while (mpl->mpl_ticket != me) {
CPU_BUSY_CYCLE();
@@ -128,6 +130,7 @@ __mp_lock_spin(struct __mp_lock *mpl, u_
}
#endif
}
+ spc->spc_spinning--;
}
void
@@ -257,10 +260,12 @@ __mtx_init(struct mutex *mtx, int wantip
void
__mtx_enter(struct mutex *mtx)
{
+ struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
#ifdef MP_LOCKDEBUG
int nticks = __mp_lock_spinout;
#endif
+ spc->spc_spinning++;
while (__mtx_enter_try(mtx) == 0) {
CPU_BUSY_CYCLE();
@@ -272,6 +277,7 @@ __mtx_enter(struct mutex *mtx)
}
#endif
}
+ spc->spc_spinning--;
}
int
Index: sys/kern/kern_clock.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.93
diff -u -p -r1.93 kern_clock.c
--- sys/kern/kern_clock.c 22 Jul 2017 14:33:45 -0000 1.93
+++ sys/kern/kern_clock.c 9 May 2018 10:56:24 -0000
@@ -381,6 +381,7 @@ statclock(struct clockframe *frame)
return;
/*
* Came from kernel mode, so we were:
+ * - spinning on a lock
* - handling an interrupt,
* - doing syscall or trap work on behalf of the current
* user process, or
@@ -391,7 +392,9 @@ statclock(struct clockframe *frame)
* so that we know how much of its real time was spent
* in ``non-process'' (i.e., interrupt) work.
*/
- if (CLKF_INTR(frame)) {
+ if (spc->spc_spinning)
+ spc->spc_cp_time[CP_SPIN]++;
+ else if (CLKF_INTR(frame)) {
if (p != NULL)
p->p_iticks++;
spc->spc_cp_time[CP_INTR]++;